forked from phoenix-oss/llama-stack-mirror
		
	
		
			
				
	
	
		
			50 lines
		
	
	
	
		
			1.6 KiB
		
	
	
	
		
			YAML
		
	
	
	
	
	
			
		
		
	
	
			50 lines
		
	
	
	
		
			1.6 KiB
		
	
	
	
		
			YAML
		
	
	
	
	
	
# Compose stack with two services on the host network:
#   * text-generation-inference - Dell-registry TGI image for Llama 3.1 8B Instruct
#   * llamastack                - Llama Stack server started from ./run.yaml
services:
  text-generation-inference:
    image: registry.dell.huggingface.co/enterprise-dell-inference-meta-llama-meta-llama-3.1-8b-instruct
    network_mode: "host"
    volumes:
      # Host Hugging Face cache mounted at /data inside the container.
      - $HOME/.cache/huggingface:/data
    # NOTE(review): this service uses host networking; the Compose spec says
    # port mappings must not be combined with `network_mode: host`. Kept as in
    # the original -- confirm whether the mapping can simply be dropped.
    ports:
      - "5009:5009"
    devices:
      - nvidia.com/gpu=all
    environment:
      # NOTE(review): five device indices are listed while NUM_SHARD is 4 --
      # confirm whether the fifth device is intentional.
      - CUDA_VISIBLE_DEVICES=0,1,2,3,4
      - NUM_SHARD=4
      - MAX_BATCH_PREFILL_TOKENS=32768
      - MAX_INPUT_TOKENS=8000
      - MAX_TOTAL_TOKENS=8192
    command: []
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              # that's the closest analogue to --gpus; provide
              # an integer amount of devices or 'all'
              count: all
              # Devices are reserved using a list of capabilities, making
              # capabilities the only required field. A device MUST
              # satisfy all the requested capabilities for a successful
              # reservation.
              capabilities: [gpu]
    runtime: nvidia

  llamastack:
    depends_on:
      text-generation-inference:
        # `service_healthy` needs a healthcheck on the dependency, and
        # text-generation-inference defines none in this file. Wait for the
        # container to start; the `sleep 60` in the entrypoint below covers
        # the model-server warm-up.
        condition: service_started
    image: llamastack/distribution-tgi
    network_mode: "host"
    volumes:
      - ~/.llama:/root/.llama
      # Link to TGI run.yaml file
      - ./run.yaml:/root/my-run.yaml
    # NOTE(review): same host-networking / port-mapping caveat as on the
    # text-generation-inference service.
    ports:
      - "5000:5000"
    # Hack: wait for the TGI server to start before launching the stack server
    entrypoint: bash -c "sleep 60; python -m llama_stack.distribution.server.server --yaml_config /root/my-run.yaml"
    deploy:
      # `restart_policy` is a `deploy` sub-key in the Compose specification;
      # it is not a valid service-level key.
      restart_policy:
        condition: on-failure
        delay: 3s
        max_attempts: 5
        window: 60s