Auto-generate distro yamls + docs (#468)

# What does this PR do? Automatically generates the following for the distributions: build.yaml, run.yaml, run-with-safety.yaml, and parts of the markdown docs. ## Test Plan At this point, this only updates the YAMLs and the docs. Some testing (especially with ollama and vllm) has been performed, but much more testing is still needed.
2024-11-18 14:57:06 -08:00 · 2024-11-18 14:57:06 -08:00 · 2a31163178
commit 2a31163178
parent 0784284ab5
88 changed files with 3008 additions and 852 deletions
--- a/distributions/bedrock/run.yaml
+++ b/distributions/bedrock/run.yaml
@ -1,5 +1,4 @@
 version: '2'
-built_at: '2024-11-01T17:40:45.325529'
 image_name: local
 name: bedrock
 docker_image: null
--- a/distributions/dell-tgi/run.yaml
+++ b/distributions/dell-tgi/run.yaml
@ -1,5 +1,4 @@
 version: '2'
-built_at: '2024-10-08T17:40:45.325529'
 image_name: local
 docker_image: null
 conda_env: local
--- a/distributions/fireworks/run.yaml
+++ b/distributions/fireworks/run.yaml
@ -1,51 +0,0 @@
-version: '2'
-built_at: '2024-10-08T17:40:45.325529'
-image_name: local
-docker_image: null
-conda_env: local
-apis:
- shields
- agents
- models
- memory
- memory_banks
- inference
- safety
-providers:
-  inference:
-  - provider_id: fireworks0
-    provider_type: remote::fireworks
-    config:
-      url: https://api.fireworks.ai/inference
-      # api_key: <ENTER_YOUR_API_KEY>
-  safety:
-  safety:
-  - provider_id: meta0
-    provider_type: inline::llama-guard
-    config:
-      model: Llama-Guard-3-1B
-      excluded_categories: []
-  - provider_id: meta1
-    provider_type: inline::prompt-guard
-    config:
-      model: Prompt-Guard-86M
-  memory:
-  - provider_id: meta0
-    provider_type: inline::meta-reference
-    config: {}
-  # Uncomment to use weaviate memory provider
-  # - provider_id: weaviate0
-  #   provider_type: remote::weaviate
-  #   config: {}
-  agents:
-  - provider_id: meta0
-    provider_type: inline::meta-reference
-    config:
-      persistence_store:
-        namespace: null
-        type: sqlite
-        db_path: ~/.llama/runtime/kvstore.db
-  telemetry:
-  - provider_id: meta0
-    provider_type: inline::meta-reference
-    config: {}
--- a/distributions/fireworks/run.yaml
+++ b/distributions/fireworks/run.yaml
@ -0,0 +1 @@
+../../llama_stack/templates/fireworks/run.yaml
--- a/distributions/inline-vllm/run.yaml
+++ b/distributions/inline-vllm/run.yaml
@ -1,5 +1,4 @@
 version: '2'
-built_at: '2024-10-08T17:40:45.325529'
 image_name: local
 docker_image: null
 conda_env: local
--- a/distributions/meta-reference-gpu/run-with-safety.yaml
+++ b/distributions/meta-reference-gpu/run-with-safety.yaml
@ -0,0 +1 @@
+../../llama_stack/templates/meta-reference-gpu/run-with-safety.yaml
--- a/distributions/meta-reference-gpu/run.yaml
+++ b/distributions/meta-reference-gpu/run.yaml
@ -1,69 +0,0 @@
-version: '2'
-built_at: '2024-10-08T17:40:45.325529'
-image_name: local
-docker_image: null
-conda_env: local
-apis:
- shields
- agents
- models
- memory
- memory_banks
- inference
- safety
-providers:
-  inference:
-  - provider_id: inference0
-    provider_type: inline::meta-reference
-    config:
-      model: Llama3.2-3B-Instruct
-      quantization: null
-      torch_seed: null
-      max_seq_len: 4096
-      max_batch_size: 1
-  - provider_id: inference1
-    provider_type: inline::meta-reference
-    config:
-      model: Llama-Guard-3-1B
-      quantization: null
-      torch_seed: null
-      max_seq_len: 2048
-      max_batch_size: 1
-  safety:
-  - provider_id: meta0
-    provider_type: inline::llama-guard
-    config:
-      model: Llama-Guard-3-1B
-      excluded_categories: []
-  - provider_id: meta1
-    provider_type: inline::prompt-guard
-    config:
-      model: Prompt-Guard-86M
-# Uncomment to use prompt guard
-#      prompt_guard_shield:
-#        model: Prompt-Guard-86M
-  memory:
-  - provider_id: meta0
-    provider_type: inline::meta-reference
-    config: {}
-  # Uncomment to use pgvector
-  # - provider_id: pgvector
-  #   provider_type: remote::pgvector
-  #   config:
-  #     host: 127.0.0.1
-  #     port: 5432
-  #     db: postgres
-  #     user: postgres
-  #     password: mysecretpassword
-  agents:
-  - provider_id: meta0
-    provider_type: inline::meta-reference
-    config:
-      persistence_store:
-        namespace: null
-        type: sqlite
-        db_path: ~/.llama/runtime/agents_store.db
-  telemetry:
-  - provider_id: meta0
-    provider_type: inline::meta-reference
-    config: {}
--- a/distributions/meta-reference-gpu/run.yaml
+++ b/distributions/meta-reference-gpu/run.yaml
@ -0,0 +1 @@
+../../llama_stack/templates/meta-reference-gpu/run.yaml
--- a/distributions/meta-reference-quantized-gpu/run.yaml
+++ b/distributions/meta-reference-quantized-gpu/run.yaml
@ -1,5 +1,4 @@
 version: '2'
-built_at: '2024-10-08T17:40:45.325529'
 image_name: local
 docker_image: null
 conda_env: local
--- a/distributions/ollama-gpu/run.yaml
+++ b/distributions/ollama-gpu/run.yaml
@ -1,5 +1,4 @@
 version: '2'
-built_at: '2024-10-08T17:40:45.325529'
 image_name: local
 docker_image: null
 conda_env: local
@ -13,20 +12,15 @@ apis:
 - safety
 providers:
  inference:
-  - provider_id: ollama0
+  - provider_id: ollama
    provider_type: remote::ollama
    config:
-      url: http://127.0.0.1:14343
+      url: ${env.OLLAMA_URL:http://127.0.0.1:11434}
  safety:
  - provider_id: meta0
    provider_type: inline::llama-guard
    config:
-      model: Llama-Guard-3-1B
      excluded_categories: []
-  - provider_id: meta1
-    provider_type: inline::prompt-guard
-    config:
-      model: Prompt-Guard-86M
  memory:
  - provider_id: meta0
    provider_type: inline::meta-reference
@ -43,3 +37,10 @@ providers:
  - provider_id: meta0
    provider_type: inline::meta-reference
    config: {}
+models:
+  - model_id: ${env.INFERENCE_MODEL:Llama3.2-3B-Instruct}
+    provider_id: ollama
+  - model_id: ${env.SAFETY_MODEL:Llama-Guard-3-1B}
+    provider_id: ollama
+shields:
+  - shield_id: ${env.SAFETY_MODEL:Llama-Guard-3-1B}
--- a/distributions/ollama/compose.yaml
+++ b/distributions/ollama/compose.yaml
@ -1,30 +1,71 @@
 services:
  ollama:
    image: ollama/ollama:latest
-    network_mode: "host"
+    network_mode: ${NETWORK_MODE:-bridge}
    volumes:
-      - ollama:/root/.ollama # this solution synchronizes with the docker volume and loads the model rocket fast
+      - ~/.ollama:/root/.ollama
    ports:
      - "11434:11434"
+    environment:
+      OLLAMA_DEBUG: 1
    command: []
+    deploy:
+      resources:
+        limits:
+          memory: 8G    # Set maximum memory
+        reservations:
+          memory: 8G    # Set minimum memory reservation
+    # healthcheck:
+    #   # ugh, no CURL in ollama image
+    #   test: ["CMD", "curl", "-f", "http://ollama:11434"]
+    #   interval: 10s
+    #   timeout: 5s
+    #   retries: 5
+
+  ollama-init:
+    image: ollama/ollama:latest
+    depends_on:
+      - ollama
+        # condition: service_healthy
+    network_mode: ${NETWORK_MODE:-bridge}
+    environment:
+      - OLLAMA_HOST=ollama
+      - INFERENCE_MODEL=${INFERENCE_MODEL}
+      - SAFETY_MODEL=${SAFETY_MODEL:-}
+    volumes:
+      - ~/.ollama:/root/.ollama
+      - ./pull-models.sh:/pull-models.sh
+    entrypoint: ["/pull-models.sh"]
+
  llamastack:
    depends_on:
-    - ollama
-    image: llamastack/distribution-ollama
-    network_mode: "host"
+      ollama:
+        condition: service_started
+      ollama-init:
+        condition: service_started
+    image: ${LLAMA_STACK_IMAGE:-llamastack/distribution-ollama}
+    network_mode: ${NETWORK_MODE:-bridge}
    volumes:
      - ~/.llama:/root/.llama
      # Link to ollama run.yaml file
-      - ./run.yaml:/root/my-run.yaml
+      - ~/local/llama-stack/:/app/llama-stack-source
+      - ./run${SAFETY_MODEL:+-with-safety}.yaml:/root/my-run.yaml
    ports:
-      - "5000:5000"
-    # Hack: wait for ollama server to start before starting docker
-    entrypoint: bash -c "sleep 60; python -m llama_stack.distribution.server.server --yaml_config /root/my-run.yaml"
+      - "${LLAMA_STACK_PORT:-5001}:${LLAMA_STACK_PORT:-5001}"
+    environment:
+      - INFERENCE_MODEL=${INFERENCE_MODEL}
+      - SAFETY_MODEL=${SAFETY_MODEL:-}
+      - OLLAMA_URL=http://ollama:11434
+    entrypoint: >
+        python -m llama_stack.distribution.server.server /root/my-run.yaml \
+        --port ${LLAMA_STACK_PORT:-5001}
    deploy:
      restart_policy:
        condition: on-failure
-        delay: 3s
-        max_attempts: 5
+        delay: 10s
+        max_attempts: 3
        window: 60s
 volumes:
  ollama:
+  ollama-init:
+  llamastack:
--- a/distributions/ollama/pull-models.sh
+++ b/distributions/ollama/pull-models.sh
@ -0,0 +1,18 @@
+#!/bin/sh
+
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+echo "Preloading (${INFERENCE_MODEL}, ${SAFETY_MODEL})..."
+for model in ${INFERENCE_MODEL} ${SAFETY_MODEL}; do
+  echo "Preloading $model..."
+  if ! ollama run "$model"; then
+    echo "Failed to pull and run $model"
+    exit 1
+  fi
+done
+
+echo "All models pulled successfully"
--- a/distributions/ollama/run-with-safety.yaml
+++ b/distributions/ollama/run-with-safety.yaml
@ -0,0 +1 @@
+../../llama_stack/templates/ollama/run-with-safety.yaml
--- a/distributions/ollama/run.yaml
+++ b/distributions/ollama/run.yaml
@ -1,45 +0,0 @@
-version: '2'
-built_at: '2024-10-08T17:40:45.325529'
-image_name: local
-docker_image: null
-conda_env: local
-apis:
- shields
- agents
- models
- memory
- memory_banks
- inference
- safety
-providers:
-  inference:
-  - provider_id: ollama0
-    provider_type: remote::ollama
-    config:
-      url: http://127.0.0.1:14343
-  safety:
-  - provider_id: meta0
-    provider_type: inline::llama-guard
-    config:
-      model: Llama-Guard-3-1B
-      excluded_categories: []
-  - provider_id: meta1
-    provider_type: inline::prompt-guard
-    config:
-      model: Prompt-Guard-86M
-  memory:
-  - provider_id: meta0
-    provider_type: inline::meta-reference
-    config: {}
-  agents:
-  - provider_id: meta0
-    provider_type: inline::meta-reference
-    config:
-      persistence_store:
-        namespace: null
-        type: sqlite
-        db_path: ~/.llama/runtime/kvstore.db
-  telemetry:
-  - provider_id: meta0
-    provider_type: inline::meta-reference
-    config: {}
--- a/distributions/ollama/run.yaml
+++ b/distributions/ollama/run.yaml
@ -0,0 +1 @@
+../../llama_stack/templates/ollama/run.yaml
--- a/distributions/remote-vllm/compose.yaml
+++ b/distributions/remote-vllm/compose.yaml
@ -1,33 +1,28 @@
-# NOTES:
-#
-# This Docker Compose (and the associated run.yaml) assumes you will be
-# running in the default "bridged" network mode.
-#
-# If you need "host" network mode, please uncomment
-#  - network_mode: "host"
-#
-# Similarly change "host.docker.internal" to "localhost" in the run.yaml file
-#
 services:
-  vllm-0:
+  vllm-inference:
    image: vllm/vllm-openai:latest
    volumes:
      - $HOME/.cache/huggingface:/root/.cache/huggingface
-    # network_mode: "host"
+    network_mode: ${NETWORK_MODE:-bridge}  # Docker's default mode is named "bridge", not "bridged"
    ports:
-       - "5100:5100"
+       - "${VLLM_INFERENCE_PORT:-5100}:${VLLM_INFERENCE_PORT:-5100}"
    devices:
      - nvidia.com/gpu=all
    environment:
-      - CUDA_VISIBLE_DEVICES=0
+      - CUDA_VISIBLE_DEVICES=${VLLM_INFERENCE_GPU:-0}
      - HUGGING_FACE_HUB_TOKEN=$HF_TOKEN
    command: >
      --gpu-memory-utilization 0.75
-      --model meta-llama/Llama-3.1-8B-Instruct
+      --model ${VLLM_INFERENCE_MODEL:-meta-llama/Llama-3.2-3B-Instruct}
      --enforce-eager
      --max-model-len 8192
      --max-num-seqs 16
-      --port 5100
+      --port ${VLLM_INFERENCE_PORT:-5100}
+    healthcheck:
+      test: ["CMD", "curl", "-f", "http://localhost:${VLLM_INFERENCE_PORT:-5100}/health"]  # vLLM serves liveness at /health (not under /v1)
+      interval: 30s
+      timeout: 10s
+      retries: 5
    deploy:
      resources:
        reservations:
@ -35,25 +30,34 @@ services:
          - driver: nvidia
            capabilities: [gpu]
    runtime: nvidia
-  vllm-1:
+
+  # A little trick:
+  # if VLLM_SAFETY_MODEL is set, we will create a service for the safety model
+  # otherwise, the entry will end in a hyphen which gets ignored by docker compose
+  vllm-${VLLM_SAFETY_MODEL:+safety}:
    image: vllm/vllm-openai:latest
    volumes:
      - $HOME/.cache/huggingface:/root/.cache/huggingface
-    # network_mode: "host"
+    network_mode: ${NETWORK_MODE:-bridge}  # Docker's default mode is named "bridge", not "bridged"
    ports:
-      - "5101:5101"
+      - "${VLLM_SAFETY_PORT:-5101}:${VLLM_SAFETY_PORT:-5101}"
    devices:
      - nvidia.com/gpu=all
    environment:
-      - CUDA_VISIBLE_DEVICES=1
+      - CUDA_VISIBLE_DEVICES=${VLLM_SAFETY_GPU:-1}
      - HUGGING_FACE_HUB_TOKEN=$HF_TOKEN
    command: >
      --gpu-memory-utilization 0.75
-      --model meta-llama/Llama-Guard-3-1B
+      --model ${VLLM_SAFETY_MODEL}
      --enforce-eager
      --max-model-len 8192
      --max-num-seqs 16
-      --port 5101
+      --port ${VLLM_SAFETY_PORT:-5101}
+    healthcheck:
+      test: ["CMD", "curl", "-f", "http://localhost:${VLLM_SAFETY_PORT:-5101}/health"]  # vLLM serves liveness at /health (not under /v1)
+      interval: 30s
+      timeout: 10s
+      retries: 5
    deploy:
      resources:
        reservations:
@ -63,23 +67,25 @@ services:
    runtime: nvidia
  llamastack:
    depends_on:
-    - vllm-0
-    - vllm-1
-      # image: llamastack/distribution-remote-vllm
+      vllm-inference:
+        condition: service_healthy
+      vllm-${VLLM_SAFETY_MODEL:+safety}:
+        condition: service_healthy
+    # image: llamastack/distribution-remote-vllm
    image: llamastack/distribution-remote-vllm:test-0.0.52rc3
    volumes:
      - ~/.llama:/root/.llama
-      - ~/local/llama-stack/distributions/remote-vllm/run.yaml:/root/llamastack-run-remote-vllm.yaml
-    # network_mode: "host"
+      - ./run${VLLM_SAFETY_MODEL:+-with-safety}.yaml:/root/llamastack-run-remote-vllm.yaml
+    network_mode: ${NETWORK_MODE:-bridge}  # Docker's default mode is named "bridge", not "bridged"
    environment:
-      - LLAMA_INFERENCE_VLLM_URL=${LLAMA_INFERENCE_VLLM_URL:-http://host.docker.internal:5100/v1}
-      - LLAMA_INFERENCE_MODEL=${LLAMA_INFERENCE_MODEL:-Llama3.1-8B-Instruct}
+      - VLLM_URL=http://vllm-inference:${VLLM_INFERENCE_PORT:-5100}/v1
+      - VLLM_SAFETY_URL=http://vllm-safety:${VLLM_SAFETY_PORT:-5101}/v1
+      - INFERENCE_MODEL=${INFERENCE_MODEL:-meta-llama/Llama-3.2-3B-Instruct}
      - MAX_TOKENS=${MAX_TOKENS:-4096}
      - SQLITE_STORE_DIR=${SQLITE_STORE_DIR:-$HOME/.llama/distributions/remote-vllm}
-      - LLAMA_SAFETY_VLLM_URL=${LLAMA_SAFETY_VLLM_URL:-http://host.docker.internal:5101/v1}
-      - LLAMA_SAFETY_MODEL=${LLAMA_SAFETY_MODEL:-Llama-Guard-3-1B}
+      - SAFETY_MODEL=${SAFETY_MODEL:-meta-llama/Llama-Guard-3-1B}
    ports:
-      - "5001:5001"
+      - "${LLAMASTACK_PORT:-5001}:5001"  # container side must match the entrypoint's hard-coded --port 5001
    # Hack: wait for vLLM server to start before starting docker
    entrypoint: bash -c "sleep 60; python -m llama_stack.distribution.server.server --yaml_config /root/llamastack-run-remote-vllm.yaml --port 5001"
    deploy:
@ -89,6 +95,6 @@ services:
        max_attempts: 5
        window: 60s
 volumes:
-  vllm-0:
-  vllm-1:
+  vllm-inference:
+  vllm-safety:
  llamastack:
--- a/distributions/remote-vllm/run-with-safety.yaml
+++ b/distributions/remote-vllm/run-with-safety.yaml
@ -0,0 +1 @@
+../../llama_stack/templates/remote-vllm/run-with-safety.yaml
--- a/distributions/remote-vllm/run.yaml
+++ b/distributions/remote-vllm/run.yaml
@ -1,68 +0,0 @@
-version: '2'
-built_at: '2024-11-11T20:09:45.988375'
-image_name: remote-vllm
-docker_image: remote-vllm
-conda_env: null
-apis:
- inference
- memory
- safety
- agents
- telemetry
-providers:
-  inference:
-  # serves main inference model
-  - provider_id: vllm-0
-    provider_type: remote::vllm
-    config:
-      # NOTE: replace with "localhost" if you are running in "host" network mode
-      url: ${env.LLAMA_INFERENCE_VLLM_URL:http://host.docker.internal:5100/v1}
-      max_tokens: ${env.MAX_TOKENS:4096}
-      api_token: fake
-  # serves safety llama_guard model
-  - provider_id: vllm-1
-    provider_type: remote::vllm
-    config:
-      # NOTE: replace with "localhost" if you are running in "host" network mode
-      url: ${env.LLAMA_SAFETY_VLLM_URL:http://host.docker.internal:5101/v1}
-      max_tokens: ${env.MAX_TOKENS:4096}
-      api_token: fake
-  memory:
-  - provider_id: faiss-0
-    provider_type: inline::faiss
-    config:
-      kvstore:
-        namespace: null
-        type: sqlite
-        db_path: "${env.SQLITE_STORE_DIR:/home/ashwin/.llama/distributions/remote-vllm}/faiss_store.db"
-  safety:
-  - provider_id: llama-guard
-    provider_type: inline::llama-guard
-    config: {}
-  memory:
-  - provider_id: meta0
-    provider_type: inline::faiss
-    config: {}
-  agents:
-  - provider_id: meta0
-    provider_type: inline::meta-reference
-    config:
-      persistence_store:
-        namespace: null
-        type: sqlite
-        db_path: "${env.SQLITE_STORE_DIR:/home/ashwin/.llama/distributions/remote-vllm}/agents_store.db"
-  telemetry:
-  - provider_id: meta0
-    provider_type: inline::meta-reference
-    config: {}
-metadata_store:
-  namespace: null
-  type: sqlite
-  db_path: "${env.SQLITE_STORE_DIR:/home/ashwin/.llama/distributions/remote-vllm}/registry.db"
-models:
-  - model_id: ${env.LLAMA_INFERENCE_MODEL:Llama3.1-8B-Instruct}
-    provider_id: vllm-0
-  - model_id: ${env.LLAMA_SAFETY_MODEL:Llama-Guard-3-1B}
-    provider_id: vllm-1
-shields:
-  - shield_id: ${env.LLAMA_SAFETY_MODEL:Llama-Guard-3-1B}
--- a/distributions/remote-vllm/run.yaml
+++ b/distributions/remote-vllm/run.yaml
@ -0,0 +1 @@
+../../llama_stack/templates/remote-vllm/run.yaml
--- a/distributions/tgi/compose.yaml
+++ b/distributions/tgi/compose.yaml
@ -1,51 +1,89 @@
 services:
-  text-generation-inference:
+  tgi-inference:
    image: ghcr.io/huggingface/text-generation-inference:latest
-    network_mode: "host"
    volumes:
      - $HOME/.cache/huggingface:/data
+    network_mode: ${NETWORK_MODE:-bridge}  # Docker's default mode is named "bridge", not "bridged"
    ports:
-      - "5009:5009"
+       - "${TGI_INFERENCE_PORT:-8080}:${TGI_INFERENCE_PORT:-8080}"
    devices:
      - nvidia.com/gpu=all
    environment:
-      - CUDA_VISIBLE_DEVICES=0
+      - CUDA_VISIBLE_DEVICES=${TGI_INFERENCE_GPU:-0}
+      - HF_TOKEN=$HF_TOKEN
      - HF_HOME=/data
      - HF_DATASETS_CACHE=/data
      - HF_MODULES_CACHE=/data
      - HF_HUB_CACHE=/data
-    command: ["--dtype", "bfloat16", "--usage-stats", "on", "--sharded", "false", "--model-id", "meta-llama/Llama-3.1-8B-Instruct", "--port", "5009", "--cuda-memory-fraction", "0.3"]
+    command: >
+      --dtype bfloat16
+      --usage-stats off
+      --sharded false
+      --model-id ${TGI_INFERENCE_MODEL:-meta-llama/Llama-3.2-3B-Instruct}
+      --port ${TGI_INFERENCE_PORT:-8080}
+      --cuda-memory-fraction 0.75
+    healthcheck:
+      test: ["CMD", "curl", "-f", "http://localhost:${TGI_INFERENCE_PORT:-8080}/health"]  # runs inside the container, so probe localhost
+      interval: 5s
+      timeout: 5s
+      retries: 30
    deploy:
      resources:
        reservations:
          devices:
          - driver: nvidia
-            # that's the closest analogue to --gpus; provide
-            # an integer amount of devices or 'all'
-            count: 1
-            # Devices are reserved using a list of capabilities, making
-            # capabilities the only required field. A device MUST
-            # satisfy all the requested capabilities for a successful
-            # reservation.
            capabilities: [gpu]
    runtime: nvidia
+
+  tgi-${TGI_SAFETY_MODEL:+safety}:
+    image: ghcr.io/huggingface/text-generation-inference:latest
+    volumes:
+      - $HOME/.cache/huggingface:/data
+    network_mode: ${NETWORK_MODE:-bridge}  # Docker's default mode is named "bridge", not "bridged"
+    ports:
+       - "${TGI_SAFETY_PORT:-8081}:${TGI_SAFETY_PORT:-8081}"
+    devices:
+      - nvidia.com/gpu=all
+    environment:
+      - CUDA_VISIBLE_DEVICES=${TGI_SAFETY_GPU:-1}
+      - HF_TOKEN=$HF_TOKEN
+      - HF_HOME=/data
+      - HF_DATASETS_CACHE=/data
+      - HF_MODULES_CACHE=/data
+      - HF_HUB_CACHE=/data
+    command: >
+      --dtype bfloat16
+      --usage-stats off
+      --sharded false
+      --model-id ${TGI_SAFETY_MODEL:-meta-llama/Llama-Guard-3-1B}
+      --port ${TGI_SAFETY_PORT:-8081}
+      --cuda-memory-fraction 0.75
    healthcheck:
-      test: ["CMD", "curl", "-f", "http://text-generation-inference:5009/health"]
+      test: ["CMD", "curl", "-f", "http://localhost:${TGI_SAFETY_PORT:-8081}/health"]  # runs inside the container, so probe localhost
      interval: 5s
      timeout: 5s
      retries: 30
+    deploy:
+      resources:
+        reservations:
+          devices:
+          - driver: nvidia
+            capabilities: [gpu]
+    runtime: nvidia
+
  llamastack:
    depends_on:
-      text-generation-inference:
+      tgi-inference:
        condition: service_healthy
-    image: llamastack/distribution-tgi
-    network_mode: "host"
+      tgi-${TGI_SAFETY_MODEL:+safety}:
+        condition: service_healthy
+    image: llamastack/distribution-tgi:test-0.0.52rc3
+    network_mode: ${NETWORK_MODE:-bridge}  # Docker's default mode is named "bridge", not "bridged"
    volumes:
      - ~/.llama:/root/.llama
-      # Link to TGI run.yaml file
-      - ./run.yaml:/root/my-run.yaml
+      - ./run${TGI_SAFETY_MODEL:+-with-safety}.yaml:/root/my-run.yaml
    ports:
-      - "5000:5000"
+      - "${LLAMA_STACK_PORT:-5001}:${LLAMA_STACK_PORT:-5001}"
    # Hack: wait for TGI server to start before starting docker
    entrypoint: bash -c "sleep 60; python -m llama_stack.distribution.server.server --yaml_config /root/my-run.yaml"
    restart_policy:
@ -53,3 +91,13 @@ services:
      delay: 3s
      max_attempts: 5
      window: 60s
+    environment:
+      - TGI_URL=http://tgi-inference:${TGI_INFERENCE_PORT:-8080}
+      - SAFETY_TGI_URL=http://tgi-safety:${TGI_SAFETY_PORT:-8081}
+      - INFERENCE_MODEL=${INFERENCE_MODEL:-meta-llama/Llama-3.2-3B-Instruct}
+      - SAFETY_MODEL=${SAFETY_MODEL:-meta-llama/Llama-Guard-3-1B}
+
+volumes:
+  tgi-inference:
+  tgi-safety:
+  llamastack:
--- a/distributions/tgi/run-with-safety.yaml
+++ b/distributions/tgi/run-with-safety.yaml
@ -0,0 +1 @@
+../../llama_stack/templates/tgi/run-with-safety.yaml
--- a/distributions/tgi/run.yaml
+++ b/distributions/tgi/run.yaml
@ -1,45 +0,0 @@
-version: '2'
-built_at: '2024-10-08T17:40:45.325529'
-image_name: local
-docker_image: null
-conda_env: local
-apis:
- shields
- agents
- models
- memory
- memory_banks
- inference
- safety
-providers:
-  inference:
-  - provider_id: tgi0
-    provider_type: remote::tgi
-    config:
-      url: http://127.0.0.1:5009
-  safety:
-  - provider_id: meta0
-    provider_type: inline::llama-guard
-    config:
-      model: Llama-Guard-3-1B
-      excluded_categories: []
-  - provider_id: meta1
-    provider_type: inline::prompt-guard
-    config:
-      model: Prompt-Guard-86M
-  memory:
-  - provider_id: meta0
-    provider_type: inline::meta-reference
-    config: {}
-  agents:
-  - provider_id: meta0
-    provider_type: inline::meta-reference
-    config:
-      persistence_store:
-        namespace: null
-        type: sqlite
-        db_path: ~/.llama/runtime/kvstore.db
-  telemetry:
-  - provider_id: meta0
-    provider_type: inline::meta-reference
-    config: {}
--- a/distributions/tgi/run.yaml
+++ b/distributions/tgi/run.yaml
@ -0,0 +1 @@
+../../llama_stack/templates/tgi/run.yaml
--- a/distributions/together/run.yaml
+++ b/distributions/together/run.yaml
@ -1,46 +0,0 @@
-version: '2'
-built_at: '2024-10-08T17:40:45.325529'
-image_name: local
-docker_image: null
-conda_env: local
-apis:
- shields
- agents
- models
- memory
- memory_banks
- inference
- safety
-providers:
-  inference:
-  - provider_id: together0
-    provider_type: remote::together
-    config:
-      url: https://api.together.xyz/v1
-      # api_key: <ENTER_YOUR_API_KEY>
-  safety:
-  - provider_id: meta0
-    provider_type: inline::llama-guard
-    config:
-      model: Llama-Guard-3-1B
-      excluded_categories: []
-  - provider_id: meta1
-    provider_type: inline::prompt-guard
-    config:
-      model: Prompt-Guard-86M
-  memory:
-  - provider_id: meta0
-    provider_type: remote::weaviate
-    config: {}
-  agents:
-  - provider_id: meta0
-    provider_type: inline::meta-reference
-    config:
-      persistence_store:
-        namespace: null
-        type: sqlite
-        db_path: ~/.llama/runtime/kvstore.db
-  telemetry:
-  - provider_id: meta0
-    provider_type: inline::meta-reference
-    config: {}
--- a/distributions/together/run.yaml
+++ b/distributions/together/run.yaml
@ -0,0 +1 @@
+../../llama_stack/templates/together/run.yaml
				`@ -0,0 +1 @@`
				`../../llama_stack/templates/fireworks/run.yaml`
				`@ -0,0 +1 @@`
				`../../llama_stack/templates/meta-reference-gpu/run-with-safety.yaml`
				`@ -0,0 +1 @@`
				`../../llama_stack/templates/ollama/run-with-safety.yaml`
				`@ -0,0 +1 @@`
				`../../llama_stack/templates/remote-vllm/run-with-safety.yaml`
				`@ -0,0 +1 @@`
				`../../llama_stack/templates/tgi/run-with-safety.yaml`
				`@ -0,0 +1 @@`
				`../../llama_stack/templates/together/run.yaml`