forked from phoenix-oss/llama-stack-mirror
		
	
		
			
				
	
	
		
			99 lines
		
	
	
	
		
			3.3 KiB
		
	
	
	
		
			YAML
		
	
	
	
	
	
			
		
		
	
	
			99 lines
		
	
	
	
		
			3.3 KiB
		
	
	
	
		
			YAML
		
	
	
	
	
	
# Docker Compose stack for the remote-vLLM Llama Stack distribution.
#
# Services:
#   vllm-inference                    vLLM server for the inference model
#   vllm-${VLLM_SAFETY_MODEL:+safety} vLLM server for the safety model
#   llamastack                        Llama Stack server pointed at the two above
#
# Environment variables read by this file (default in parentheses):
#   NETWORK_MODE          (bridged)
#   VLLM_INFERENCE_PORT   (5100)   VLLM_SAFETY_PORT   (5101)
#   VLLM_INFERENCE_GPU    (0)      VLLM_SAFETY_GPU    (1)
#   VLLM_INFERENCE_MODEL  (meta-llama/Llama-3.2-3B-Instruct)
#   VLLM_SAFETY_MODEL     (unset)
#   INFERENCE_MODEL       (meta-llama/Llama-3.2-3B-Instruct)
#   SAFETY_MODEL          (meta-llama/Llama-Guard-3-1B)
#   MAX_TOKENS            (4096)
#   SQLITE_STORE_DIR      ($HOME/.llama/distributions/remote-vllm)
#   LLAMA_STACK_PORT      (8321)
#   HF_TOKEN              (no default)
#
# NOTE(review): Docker's built-in network modes are `bridge`, `host` and
# `none`; the `bridged` default used below is not one of them. Confirm that a
# network with that name exists, or export NETWORK_MODE explicitly.
# NOTE(review): the vLLM services read VLLM_INFERENCE_MODEL / VLLM_SAFETY_MODEL
# while llamastack reads INFERENCE_MODEL / SAFETY_MODEL. Presumably these must
# be set to matching values; verify.
services:
  vllm-inference:
    image: vllm/vllm-openai:latest
    volumes:
      # Bind-mount the host Hugging Face cache into the container.
      - $HOME/.cache/huggingface:/root/.cache/huggingface
    network_mode: ${NETWORK_MODE:-bridged}
    ports:
      - "${VLLM_INFERENCE_PORT:-5100}:${VLLM_INFERENCE_PORT:-5100}"
    devices:
      - nvidia.com/gpu=all
    environment:
      - CUDA_VISIBLE_DEVICES=${VLLM_INFERENCE_GPU:-0}
      - HUGGING_FACE_HUB_TOKEN=$HF_TOKEN
    # Folded into a single line of arguments; `>-` drops the trailing newline.
    command: >-
      --gpu-memory-utilization 0.75
      --model ${VLLM_INFERENCE_MODEL:-meta-llama/Llama-3.2-3B-Instruct}
      --enforce-eager
      --max-model-len 8192
      --max-num-seqs 16
      --port ${VLLM_INFERENCE_PORT:-5100}
    healthcheck:
      # vLLM serves its liveness probe at /health (not under the /v1 prefix).
      test: ["CMD", "curl", "-f", "http://localhost:${VLLM_INFERENCE_PORT:-5100}/health"]
      interval: 30s
      timeout: 10s
      retries: 5
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              capabilities: [gpu]
    runtime: nvidia

  # A little trick:
  # if VLLM_SAFETY_MODEL is set, we will create a service for the safety model;
  # otherwise, the entry will end in a hyphen, which gets ignored by docker compose.
  # NOTE(review): the Compose specification documents interpolation for values
  # only, not mapping keys; confirm this key is expanded by the Compose
  # version in use.
  vllm-${VLLM_SAFETY_MODEL:+safety}:
    image: vllm/vllm-openai:latest
    volumes:
      # Bind-mount the host Hugging Face cache into the container.
      - $HOME/.cache/huggingface:/root/.cache/huggingface
    network_mode: ${NETWORK_MODE:-bridged}
    ports:
      - "${VLLM_SAFETY_PORT:-5101}:${VLLM_SAFETY_PORT:-5101}"
    devices:
      - nvidia.com/gpu=all
    environment:
      - CUDA_VISIBLE_DEVICES=${VLLM_SAFETY_GPU:-1}
      - HUGGING_FACE_HUB_TOKEN=$HF_TOKEN
    # Folded into a single line of arguments; `>-` drops the trailing newline.
    command: >-
      --gpu-memory-utilization 0.75
      --model ${VLLM_SAFETY_MODEL}
      --enforce-eager
      --max-model-len 8192
      --max-num-seqs 16
      --port ${VLLM_SAFETY_PORT:-5101}
    healthcheck:
      # vLLM serves its liveness probe at /health (not under the /v1 prefix).
      test: ["CMD", "curl", "-f", "http://localhost:${VLLM_SAFETY_PORT:-5101}/health"]
      interval: 30s
      timeout: 10s
      retries: 5
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              capabilities: [gpu]
    runtime: nvidia

  llamastack:
    # Long-form depends_on is a mapping of service name -> options, not a list.
    depends_on:
      vllm-inference:
        condition: service_healthy
      vllm-${VLLM_SAFETY_MODEL:+safety}:
        condition: service_healthy
    image: llamastack/distribution-remote-vllm:test-0.0.52rc3
    volumes:
      - ~/.llama:/root/.llama
      # Mounts run.yaml, or run-with-safety.yaml when VLLM_SAFETY_MODEL is set.
      - ./run${VLLM_SAFETY_MODEL:+-with-safety}.yaml:/root/llamastack-run-remote-vllm.yaml
    network_mode: ${NETWORK_MODE:-bridged}
    environment:
      - VLLM_URL=http://vllm-inference:${VLLM_INFERENCE_PORT:-5100}/v1
      - VLLM_SAFETY_URL=http://vllm-safety:${VLLM_SAFETY_PORT:-5101}/v1
      - INFERENCE_MODEL=${INFERENCE_MODEL:-meta-llama/Llama-3.2-3B-Instruct}
      - MAX_TOKENS=${MAX_TOKENS:-4096}
      - SQLITE_STORE_DIR=${SQLITE_STORE_DIR:-$HOME/.llama/distributions/remote-vllm}
      - SAFETY_MODEL=${SAFETY_MODEL:-meta-llama/Llama-Guard-3-1B}
    ports:
      - "${LLAMA_STACK_PORT:-8321}:${LLAMA_STACK_PORT:-8321}"
    # Hack: sleep for 60 seconds so the vLLM servers have time to start before
    # the Llama Stack server is launched. The server listens on the same port
    # that is published above.
    entrypoint: bash -c "sleep 60; python -m llama_stack.distribution.server.server --yaml_config /root/llamastack-run-remote-vllm.yaml --port ${LLAMA_STACK_PORT:-8321}"
    deploy:
      restart_policy:
        condition: on-failure
        delay: 3s
        max_attempts: 5
        window: 60s

# Named volumes declared with default settings. The services above use bind
# mounts and do not reference these by name.
volumes:
  vllm-inference:
  vllm-safety:
  llamastack: