services:
  text-generation-inference:
    image: ghcr.io/huggingface/text-generation-inference:latest
    # With network_mode: "host" the server binds directly to host port 5009;
    # published ports would be ignored, so no ports mapping is listed.
    network_mode: "host"
    volumes:
      - $HOME/.cache/huggingface:/data
    command: ["--dtype", "bfloat16", "--usage-stats", "on", "--sharded", "false", "--model-id", "meta-llama/Llama-3.1-8B-Instruct", "--port", "5009", "--cuda-memory-fraction", "0.3"]
    runtime: nvidia
    healthcheck:
      # The healthcheck runs inside the container; under host networking the
      # compose service name does not resolve, so probe localhost instead.
      test: ["CMD", "curl", "-f", "http://localhost:5009/health"]
      interval: 5s
      timeout: 5s
      retries: 30
  llamastack:
    depends_on:
      text-generation-inference:
        condition: service_healthy
    image: llamastack/llamastack-tgi
    # Host networking again: the Llama Stack server listens directly on host port 5000.
    network_mode: "host"
    volumes:
      - ~/.llama:/root/.llama
      # Link to run.yaml file
      - ./run.yaml:/root/my-run.yaml
    entrypoint: bash -c "python -m llama_stack.distribution.server.server --yaml_config /root/my-run.yaml"
    deploy:
      # In the Compose spec, restart_policy is only valid under deploy:
      restart_policy:
        condition: on-failure
        delay: 3s
        max_attempts: 5
        window: 60s
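The llamastack service mounts a `./run.yaml` from the working directory into the container; the compose file does not create it. Below is a minimal sketch of what its inference section could look like, assuming the `remote::tgi` provider and the run.yaml schema of the llama-stack release matching the `llamastack/llamastack-tgi` image; the `provider_id` name and the exact set of top-level fields are illustrative, so compare against the run.yaml template shipped with your llama-stack version:

```yaml
# run.yaml (sketch): point the Llama Stack inference API at the TGI container.
version: '2'
image_name: tgi
apis:
  - inference
providers:
  inference:
    - provider_id: tgi0          # arbitrary local name for this provider instance
      provider_type: remote::tgi # delegate inference to a running TGI server
      config:
        # Both services use host networking, so TGI is reachable on localhost.
        url: http://127.0.0.1:5009
```

With both files in place, `docker compose up` starts TGI first, waits for its healthcheck to pass, then launches the Llama Stack server, which serves on host port 5000.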