Docker compose scripts for remote adapters (#241)

* tgi docker compose * path * wait for tgi server to start before starting server * update provider-id * move scripts to distribution/ folder * add readme * readme
2024-10-15 16:32:53 -07:00 · 2024-10-15 16:32:53 -07:00 · c4d5d6bb91
commit c4d5d6bb91
parent 770647dede
3 changed files with 129 additions and 0 deletions
--- a/llama_stack/distribution/docker/README.md
+++ b/llama_stack/distribution/docker/README.md
@ -0,0 +1,28 @@
+# Docker Compose Scripts
+
+This folder contains scripts to enable starting a distribution using `docker compose`.
+
+
+#### Example: TGI Inference Adapter
+```
+$ cd llama_stack/distribution/docker/tgi
+$ ls
+compose.yaml  tgi-run.yaml
+$ docker compose up
+```
+
+Running `docker compose up` first starts the TGI server, then starts the Llama Stack distribution server, which connects to the remote TGI provider for inference. You should see output similar to the following:
+```
+[text-generation-inference] | 2024-10-15T18:56:33.810397Z  INFO text_generation_router::server: router/src/server.rs:1813: Using config Some(Llama)
+[text-generation-inference] | 2024-10-15T18:56:33.810448Z  WARN text_generation_router::server: router/src/server.rs:1960: Invalid hostname, defaulting to 0.0.0.0
+[text-generation-inference] | 2024-10-15T18:56:33.864143Z  INFO text_generation_router::server: router/src/server.rs:2353: Connected
+INFO:     Started server process [1]
+INFO:     Waiting for application startup.
+INFO:     Application startup complete.
+INFO:     Uvicorn running on http://[::]:5000 (Press CTRL+C to quit)
+```
+
+To stop the servers and remove their containers:
+```
+docker compose down
+```
--- a/llama_stack/distribution/docker/tgi/compose.yaml
+++ b/llama_stack/distribution/docker/tgi/compose.yaml
@ -0,0 +1,55 @@
+services:
+  # TGI inference server; the GPU is requested via devices/deploy/runtime below.
+  text-generation-inference:
+    image: ghcr.io/huggingface/text-generation-inference:latest
+    # Host networking: TGI binds --port 5009 on the host itself, so no port mapping.
+    network_mode: "host"
+    volumes:
+      - $HOME/.cache/huggingface:/data
+    devices:
+      - nvidia.com/gpu=all
+    environment:
+      - CUDA_VISIBLE_DEVICES=0
+      - HF_HOME=/data
+      - HF_DATASETS_CACHE=/data
+      - HF_MODULES_CACHE=/data
+      - HF_HUB_CACHE=/data
+    command: ["--dtype", "bfloat16", "--usage-stats", "on", "--sharded", "false", "--model-id", "meta-llama/Llama-3.1-8B-Instruct", "--port", "5009", "--cuda-memory-fraction", "0.3"]
+    deploy:
+      resources:
+        reservations:
+          devices:
+            - driver: nvidia
+              # closest analogue to --gpus: an integer device count or 'all'
+              count: 1
+              # Devices are reserved using a list of capabilities, making
+              # capabilities the only required field. A device MUST
+              # satisfy all the requested capabilities for a successful
+              # reservation.
+              capabilities: [gpu]
+    runtime: nvidia
+    healthcheck:
+      # Host networking provides no service-name DNS, so probe via loopback.
+      test: ["CMD", "curl", "-f", "http://127.0.0.1:5009/health"]
+      interval: 5s
+      timeout: 5s
+      retries: 30
+  # Llama Stack server (host networking, no port mapping); starts once TGI is healthy.
+  llamastack-local-cpu:
+    depends_on:
+      text-generation-inference:
+        condition: service_healthy
+    image: llamastack-local-cpu
+    network_mode: "host"
+    volumes:
+      - ~/.llama:/root/.llama
+      # Link to TGI run.yaml file
+      - ./tgi-run.yaml:/root/llamastack-run-tgi.yaml
+    # depends_on already gates on TGI health; the sleep adds a further fixed 60s delay.
+    entrypoint: bash -c "sleep 60; python -m llama_stack.distribution.server.server --yaml_config /root/llamastack-run-tgi.yaml"
+    deploy:
+      restart_policy:
+        condition: on-failure
+        delay: 3s
+        max_attempts: 5
+        window: 60s
--- a/llama_stack/distribution/docker/tgi/tgi-run.yaml
+++ b/llama_stack/distribution/docker/tgi/tgi-run.yaml
@ -0,0 +1,46 @@
+# Llama Stack run configuration: inference is served by a remote TGI endpoint;
+# the remaining providers use the meta-reference implementation.
+version: '2'
+built_at: '2024-10-08T17:40:45.325529'
+image_name: local
+docker_image: null
+conda_env: local
+# APIs listed for this run configuration.
+apis: [shields, agents, models, memory, memory_banks, inference, safety]
+providers:
+  inference:
+    # Loopback port 5009 — presumably the TGI server started by compose.yaml; confirm.
+    - provider_id: tgi0
+      provider_type: remote::tgi
+      config:
+        url: 'http://127.0.0.1:5009'
+  # Safety provider configured with a Llama Guard shield and a Prompt Guard shield.
+  safety:
+    - provider_id: meta0
+      provider_type: meta-reference
+      config:
+        llama_guard_shield:
+          model: Llama-Guard-3-1B
+          excluded_categories: []
+          disable_input_check: false
+          disable_output_check: false
+        prompt_guard_shield:
+          model: Prompt-Guard-86M
+  memory:
+    - provider_id: meta0
+      provider_type: meta-reference
+      config: {}
+  agents:
+    - provider_id: meta0
+      provider_type: meta-reference
+      config:
+        # Persistence store of type sqlite, with its database file at db_path.
+        persistence_store:
+          namespace: null
+          type: sqlite
+          db_path: '~/.llama/runtime/kvstore.db'
+  telemetry:
+    # NOTE(review): telemetry has a provider here but is not listed under `apis` — confirm.
+    - provider_id: meta0
+      provider_type: meta-reference
+      config: {}