diff --git a/scripts/docker/tgi/compose.yaml b/scripts/docker/tgi/compose.yaml
index d0a1f8c04..d5bcd50f3 100644
--- a/scripts/docker/tgi/compose.yaml
+++ b/scripts/docker/tgi/compose.yaml
@@ -15,11 +15,6 @@ services:
       - HF_MODULES_CACHE=/data
       - HF_HUB_CACHE=/data
     command: ["--dtype", "bfloat16", "--usage-stats", "on", "--sharded", "false", "--model-id", "meta-llama/Llama-3.1-8B-Instruct", "--port", "5009", "--cuda-memory-fraction", "0.3"]
-    healthcheck:
-      test: ["CMD", "curl", "-f", "http://localhost:5009/health"]
-      interval: 5s
-      timeout: 5s
-      retries: 10
     deploy:
       resources:
         reservations:
@@ -34,11 +29,15 @@ services:
               # reservation.
               capabilities: [gpu]
     runtime: nvidia
+    healthcheck:
+      test: ["CMD", "curl", "-f", "http://text-generation-inference:5009/health"]
+      interval: 5s
+      timeout: 5s
+      retries: 30
   llamastack-local-cpu:
     depends_on:
       text-generation-inference:
         condition: service_healthy
-    restart: on-failure
     image: llamastack-local-cpu
     network_mode: "host"
     volumes:
@@ -47,9 +46,10 @@ services:
       - ./tgi-run.yaml:/root/llamastack-run-tgi.yaml
     ports:
       - "5000:5000"
-    command: ["--yaml_config", "/root/llamastack-run-tgi.yaml"]
-    healthcheck:
-      test: ["CMD", "curl", "-f", "http://localhost:5009/health"]
-      interval: 5s
-      timeout: 5s
-      retries: 10
+    # Hack: wait for the TGI server to come up before starting the Llama Stack server
+    entrypoint: bash -c "sleep 60; python -m llama_stack.distribution.server.server --yaml_config /root/llamastack-run-tgi.yaml"
+    restart_policy:
+      condition: on-failure
+      delay: 3s
+      max_attempts: 5
+      window: 60s
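
Note: the fixed `sleep 60` in the entrypoint is an upper-bound guess; if TGI needs longer than 60 seconds to download and load the model, the Llama Stack server still starts too early, and if TGI is ready sooner, startup is delayed for no reason. Below is a minimal sketch of a poll-based alternative, assuming curl is available in the llamastack-local-cpu image and that TGI's port 5009 is reachable at localhost from this service (it runs with network_mode: "host"):

    # Sketch (untested): poll TGI's /health endpoint instead of sleeping a
    # fixed 60s. Assumes curl exists in the image and localhost:5009 reaches TGI.
    entrypoint: >
      bash -c 'until curl -sf http://localhost:5009/health; do sleep 2; done;
      python -m llama_stack.distribution.server.server --yaml_config /root/llamastack-run-tgi.yaml'

Either way, depends_on with condition: service_healthy remains the primary ordering mechanism; the entrypoint wait is a belt-and-braces fallback on top of the new TGI healthcheck.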