forked from phoenix-oss/llama-stack-mirror
		
	# What does this PR do? Automatically generates build.yaml, run.yaml, run-with-safety.yaml, and parts of the markdown docs for the distributions. ## Test Plan At this point, this only updates the YAMLs and the docs. Some testing (especially with ollama and vllm) has been performed, but much more testing is still needed.
		
			
				
	
	
		
			100 lines
		
	
	
	
		
			3.3 KiB
		
	
	
	
		
			YAML
		
	
	
	
	
	
			
		
		
	
	
			100 lines
		
	
	
	
		
			3.3 KiB
		
	
	
	
		
			YAML
		
	
	
	
	
	
# Docker Compose stack for the remote-vllm Llama Stack distribution.
#
#   vllm-inference  vLLM OpenAI-compatible server for the inference model
#   vllm-<safety>   second vLLM server for the safety model (see note below)
#   llamastack      Llama Stack server configured from run*.yaml
#
# Every ${VAR:-default} below can be overridden from the shell or an .env file.
services:
  vllm-inference:
    image: vllm/vllm-openai:latest
    volumes:
      # Bind-mount the host Hugging Face cache directory into the container.
      - $HOME/.cache/huggingface:/root/.cache/huggingface
    # `bridge` is Docker's network-mode name; `bridged` is not a valid mode.
    # NOTE(review): on the default bridge, service-name DNS (used by VLLM_URL
    # in the llamastack service) may not resolve - confirm in your setup.
    network_mode: ${NETWORK_MODE:-bridge}
    ports:
      - "${VLLM_INFERENCE_PORT:-5100}:${VLLM_INFERENCE_PORT:-5100}"
    devices:
      - nvidia.com/gpu=all
    environment:
      # GPU index visible to this server (defaults to GPU 0).
      - CUDA_VISIBLE_DEVICES=${VLLM_INFERENCE_GPU:-0}
      - HUGGING_FACE_HUB_TOKEN=$HF_TOKEN
    # Folded scalar: the lines below are joined into one argument string.
    command: >-
      --gpu-memory-utilization 0.75
      --model ${VLLM_INFERENCE_MODEL:-meta-llama/Llama-3.2-3B-Instruct}
      --enforce-eager
      --max-model-len 8192
      --max-num-seqs 16
      --port ${VLLM_INFERENCE_PORT:-5100}
    healthcheck:
      # vLLM serves its health probe at /health (there is no /v1/health).
      test: ["CMD", "curl", "-f", "http://localhost:${VLLM_INFERENCE_PORT:-5100}/health"]
      interval: 30s
      timeout: 10s
      retries: 5
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              capabilities: [gpu]
    runtime: nvidia

  # A little trick:
  # if VLLM_SAFETY_MODEL is set, this key is meant to expand to `vllm-safety`,
  # creating a service for the safety model; otherwise it is meant to end in a
  # bare hyphen, which the original author expected docker compose to ignore.
  # NOTE(review): the Compose specification says interpolation applies to
  # values, not mapping keys - confirm this works with the Compose version in
  # use; `profiles:` is the documented way to make a service optional.
  vllm-${VLLM_SAFETY_MODEL:+safety}:
    image: vllm/vllm-openai:latest
    volumes:
      # Bind-mount the host Hugging Face cache directory into the container.
      - $HOME/.cache/huggingface:/root/.cache/huggingface
    # `bridge` is Docker's network-mode name; `bridged` is not a valid mode.
    network_mode: ${NETWORK_MODE:-bridge}
    ports:
      - "${VLLM_SAFETY_PORT:-5101}:${VLLM_SAFETY_PORT:-5101}"
    devices:
      - nvidia.com/gpu=all
    environment:
      # GPU index visible to this server (defaults to GPU 1).
      - CUDA_VISIBLE_DEVICES=${VLLM_SAFETY_GPU:-1}
      - HUGGING_FACE_HUB_TOKEN=$HF_TOKEN
    # Folded scalar: the lines below are joined into one argument string.
    command: >-
      --gpu-memory-utilization 0.75
      --model ${VLLM_SAFETY_MODEL}
      --enforce-eager
      --max-model-len 8192
      --max-num-seqs 16
      --port ${VLLM_SAFETY_PORT:-5101}
    healthcheck:
      # vLLM serves its health probe at /health (there is no /v1/health).
      test: ["CMD", "curl", "-f", "http://localhost:${VLLM_SAFETY_PORT:-5101}/health"]
      interval: 30s
      timeout: 10s
      retries: 5
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              capabilities: [gpu]
    runtime: nvidia

  llamastack:
    # The long form of depends_on (with `condition:`) must be a mapping keyed
    # by service name, not a list of single-key mappings.
    depends_on:
      vllm-inference:
        condition: service_healthy
      vllm-${VLLM_SAFETY_MODEL:+safety}:
        condition: service_healthy
    # image: llamastack/distribution-remote-vllm
    image: llamastack/distribution-remote-vllm:test-0.0.52rc3
    volumes:
      - ~/.llama:/root/.llama
      # Mounts run-with-safety.yaml when VLLM_SAFETY_MODEL is set, else run.yaml.
      - ./run${VLLM_SAFETY_MODEL:+-with-safety}.yaml:/root/llamastack-run-remote-vllm.yaml
    # `bridge` is Docker's network-mode name; `bridged` is not a valid mode.
    network_mode: ${NETWORK_MODE:-bridge}
    environment:
      - VLLM_URL=http://vllm-inference:${VLLM_INFERENCE_PORT:-5100}/v1
      - VLLM_SAFETY_URL=http://vllm-safety:${VLLM_SAFETY_PORT:-5101}/v1
      # NOTE(review): the vLLM services read VLLM_INFERENCE_MODEL and
      # VLLM_SAFETY_MODEL, while these read INFERENCE_MODEL and SAFETY_MODEL;
      # keep the two pairs in sync when overriding either.
      - INFERENCE_MODEL=${INFERENCE_MODEL:-meta-llama/Llama-3.2-3B-Instruct}
      - MAX_TOKENS=${MAX_TOKENS:-4096}
      - SQLITE_STORE_DIR=${SQLITE_STORE_DIR:-$HOME/.llama/distributions/remote-vllm}
      - SAFETY_MODEL=${SAFETY_MODEL:-meta-llama/Llama-Guard-3-1B}
    ports:
      - "${LLAMASTACK_PORT:-5001}:${LLAMASTACK_PORT:-5001}"
    # Hack: sleep to give the vLLM servers time to start before launching the
    # Llama Stack server. The listen port follows LLAMASTACK_PORT so that it
    # always matches the port published above.
    entrypoint: >-
      bash -c "sleep 60;
      python -m llama_stack.distribution.server.server
      --yaml_config /root/llamastack-run-remote-vllm.yaml
      --port ${LLAMASTACK_PORT:-5001}"
    deploy:
      restart_policy:
        condition: on-failure
        delay: 3s
        max_attempts: 5
        window: 60s

# Named volumes. None of the services above mount these by name; they all use
# bind mounts.
volumes:
  vllm-inference:
  vllm-safety:
  llamastack: