mirror of
https://github.com/meta-llama/llama-stack.git
synced 2025-12-18 15:49:49 +00:00
Convert TGI
This commit is contained in:
parent
9bb07ce298
commit
028530546f
14 changed files with 485 additions and 160 deletions
|
|
@@ -1,26 +0,0 @@
|
|||
# Compose service that runs the vllm/vllm-openai image.
# Each ${VAR:-default} expression falls back to the default shown when VAR
# is unset or empty.
# NOTE(review): the service key below is itself a ${...} expression. Compose
# interpolates values, not mapping keys, so presumably this file is rendered
# by a templating step before use -- confirm.
services:
  ${SERVICE_NAME:-vllm}:
    image: vllm/vllm-openai:latest
    ports:
      # Quoted so the HOST:CONTAINER pair is always read as a string; the
      # same port number is used on both sides.
      - "${VLLM_PORT:-5100}:${VLLM_PORT:-5100}"
    volumes:
      # Mounts the host's Hugging Face cache directory into the container.
      - $HOME/.cache/huggingface:/root/.cache/huggingface
    # NOTE(review): GPU access is requested three ways below (`devices`,
    # the `deploy` reservation, and `runtime`). Presumably this covers
    # different container runtimes -- confirm which ones are still needed.
    devices:
      - nvidia.com/gpu=all
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              capabilities: [gpu]
    runtime: nvidia
    environment:
      - CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES:-0}
      # HF_TOKEN has no default here; it is taken from the caller's
      # environment.
      - HUGGING_FACE_HUB_TOKEN=${HF_TOKEN}
    # Folded scalar: the lines below are joined with single spaces into one
    # command string. --port reuses VLLM_PORT so it matches the mapping
    # published under `ports`.
    command: >
      --gpu-memory-utilization 0.75
      --model ${VLLM_MODEL:-meta-llama/Llama-3.2-3B-Instruct}
      --enforce-eager
      --max-model-len 8192
      --max-num-seqs 16
      --port ${VLLM_PORT:-5100}
|
||||
Loading…
Add table
Add a link
Reference in a new issue