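# Compose service running a vLLM OpenAI-compatible inference server.
# Service name, host port, GPU selection, and model are overridable via
# the SERVICE_NAME, VLLM_PORT, CUDA_VISIBLE_DEVICES, and VLLM_MODEL
# environment variables.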
services:
  ${SERVICE_NAME:-vllm}:
    image: vllm/vllm-openai:latest
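    # Publish the same port inside and outside the container; this must
    # match the --port flag passed to vLLM below.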
    ports:
      - ${VLLM_PORT:-5100}:${VLLM_PORT:-5100}
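    # Reuse the host's Hugging Face cache so model weights are not
    # re-downloaded on every container start.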
    volumes:
      - $HOME/.cache/huggingface:/root/.cache/huggingface
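    # GPU access: the CDI-style `devices` entry, the compose `deploy`
    # reservation, and the legacy `runtime: nvidia` setting each cover a
    # different container runtime / NVIDIA toolkit configuration.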
    devices:
      - nvidia.com/gpu=all
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              capabilities: [gpu]
    runtime: nvidia
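    # HF_TOKEN must be set in the calling environment for gated models
    # such as the Llama family; CUDA_VISIBLE_DEVICES picks the GPU
    # (defaults to device 0).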
    environment:
      - CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES:-0}
      - HUGGING_FACE_HUB_TOKEN=${HF_TOKEN}
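    # Arguments appended to the image's vLLM server entrypoint: cap GPU
    # memory use at 75%, disable CUDA graphs (--enforce-eager), and bound
    # the context length and number of concurrent sequences so the model
    # fits on smaller GPUs.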
    command: >
      --gpu-memory-utilization 0.75
      --model ${VLLM_MODEL:-meta-llama/Llama-3.2-3B-Instruct}
      --enforce-eager
      --max-model-len 8192
      --max-num-seqs 16
      --port ${VLLM_PORT:-5100}