services:
  text-generation-inference:
    image: registry.dell.huggingface.co/enterprise-dell-inference-meta-llama-meta-llama-3.1-8b-instruct
    network_mode: "host"
    volumes:
      - $HOME/.cache/huggingface:/data
    ports:
      - "5009:5009"
    devices:
      - nvidia.com/gpu=all
    environment:
      - CUDA_VISIBLE_DEVICES=0,1,2,3,4
      - NUM_SHARD=4
      - MAX_BATCH_PREFILL_TOKENS=32768
      - MAX_INPUT_TOKENS=8000
      - MAX_TOTAL_TOKENS=8192
    command: []
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              # that's the closest analogue to --gpus; provide
              # an integer amount of devices or 'all'
              count: all
              # Devices are reserved using a list of capabilities, making
              # capabilities the only required field. A device MUST
              # satisfy all the requested capabilities for a successful
              # reservation.
              capabilities: [gpu]
    runtime: nvidia

  llamastack:
    depends_on:
      text-generation-inference:
        condition: service_healthy
    image: llamastack/distribution-tgi
    network_mode: "host"
    volumes:
      - ~/.llama:/root/.llama
      # Link to the TGI run.yaml file
      - ./run.yaml:/root/my-run.yaml
    ports:
      - "8321:8321"
    # Hack: wait for the TGI server to start before launching the Llama Stack server
    entrypoint: bash -c "sleep 60; python -m llama_stack.distribution.server.server --yaml_config /root/my-run.yaml"
    deploy:
      restart_policy:
        condition: on-failure
        delay: 3s
        max_attempts: 5
        window: 60s
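
Once the file is in place, the services can be brought up with Docker Compose. The commands below are a minimal smoke test, not part of the distribution itself: they assume the file is saved as compose.yaml in the current directory and that TGI is listening on port 5009 as the port mapping above suggests; the prompt is purely illustrative.

# Start both services in the background
docker compose up -d

# Check that TGI reports healthy, then send a test generation request
curl http://localhost:5009/health
curl http://localhost:5009/generate \
  -X POST \
  -H "Content-Type: application/json" \
  -d '{"inputs": "What is Llama Stack?", "parameters": {"max_new_tokens": 64}}'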