Mirror of https://github.com/meta-llama/llama-stack.git (synced 2025-12-17 09:22:36 +00:00)
Adding docker-compose.yaml, starting to simplify

Commit f38e76ee98 (parent e4509cb568)
14 changed files with 516 additions and 386 deletions
vLLM remote inference adapter config (VLLMInferenceAdapterConfig):

@@ -9,11 +9,6 @@ from typing import Optional
 from llama_models.schema_utils import json_schema_type
 from pydantic import BaseModel, Field
 
-from llama_stack.providers.utils.docker.service_config import DockerComposeServiceConfig
-
-
-DEFAULT_VLLM_PORT = 8000
-
 
 @json_schema_type
 class VLLMInferenceAdapterConfig(BaseModel):
@@ -33,48 +28,10 @@ class VLLMInferenceAdapterConfig(BaseModel):
     @classmethod
     def sample_run_config(
         cls,
-        url: str = "${env.VLLM_URL:http://host.docker.internal:5100/v1}",
+        url: str = "${env.VLLM_URL}",
     ):
         return {
             "url": url,
             "max_tokens": "${env.VLLM_MAX_TOKENS:4096}",
             "api_token": "${env.VLLM_API_TOKEN:fake}",
         }
-
-    @classmethod
-    def sample_docker_compose_config(
-        cls,
-        port: int = DEFAULT_VLLM_PORT,
-        cuda_visible_devices: str = "0",
-        model: str = "meta-llama/Llama-3.2-3B-Instruct",
-    ) -> Optional[DockerComposeServiceConfig]:
-        return DockerComposeServiceConfig(
-            image="vllm/vllm-openai:latest",
-            volumes=["$HOME/.cache/huggingface:/root/.cache/huggingface"],
-            devices=["nvidia.com/gpu=all"],
-            deploy={
-                "resources": {
-                    "reservations": {
-                        "devices": [{"driver": "nvidia", "capabilities": ["gpu"]}]
-                    }
-                }
-            },
-            runtime="nvidia",
-            ports=[f"{port}:{port}"],
-            environment={
-                "CUDA_VISIBLE_DEVICES": cuda_visible_devices,
-                "HUGGING_FACE_HUB_TOKEN": "$HF_TOKEN",
-            },
-            command=(
-                " ".join(
-                    [
-                        "--gpu-memory-utilization 0.75",
-                        f"--model {model}",
-                        "--enforce-eager",
-                        "--max-model-len 8192",
-                        "--max-num-seqs 16",
-                        f"--port {port}",
-                    ]
-                )
-            ),
-        )
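The "${env.VAR}" / "${env.VAR:default}" strings emitted by sample_run_config are placeholders that get filled in from the environment when a run config is rendered. The sketch below only illustrates that convention: resolve_env_placeholders is a hypothetical helper written for this example, not the resolver llama-stack actually ships.

import os
import re

# Matches "${env.NAME}" or "${env.NAME:default}" (hypothetical re-implementation
# of the placeholder convention used by sample_run_config above).
_ENV_PATTERN = re.compile(r"\$\{env\.([A-Za-z0-9_]+)(?::([^}]*))?\}")

def resolve_env_placeholders(value: str) -> str:
    def _sub(match: re.Match) -> str:
        name, default = match.group(1), match.group(2)
        resolved = os.environ.get(name, default)
        if resolved is None:
            raise ValueError(f"{name} is not set and has no default")
        return resolved
    return _ENV_PATTERN.sub(_sub, value)

# Example using the simplified sample_run_config values from the diff above.
sample = {
    "url": "${env.VLLM_URL}",
    "max_tokens": "${env.VLLM_MAX_TOKENS:4096}",
    "api_token": "${env.VLLM_API_TOKEN:fake}",
}
os.environ.setdefault("VLLM_URL", "http://localhost:8000/v1")
print({k: resolve_env_placeholders(v) for k, v in sample.items()})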
docker-compose.yaml (new file):

@@ -0,0 +1,26 @@
+services:
+  ${SERVICE_NAME:-vllm}:
+    image: vllm/vllm-openai:latest
+    ports:
+      - ${VLLM_PORT:-5100}:${VLLM_PORT:-5100}
+    volumes:
+      - $HOME/.cache/huggingface:/root/.cache/huggingface
+    devices:
+      - nvidia.com/gpu=all
+    deploy:
+      resources:
+        reservations:
+          devices:
+            - driver: nvidia
+              capabilities: [gpu]
+    runtime: nvidia
+    environment:
+      - CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES:-0}
+      - HUGGING_FACE_HUB_TOKEN=${HF_TOKEN}
+    command: >
+      --gpu-memory-utilization 0.75
+      --model ${VLLM_MODEL:-meta-llama/Llama-3.2-3B-Instruct}
+      --enforce-eager
+      --max-model-len 8192
+      --max-num-seqs 16
+      --port ${VLLM_PORT:-5100}
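Once the service is up (for example, export HF_TOKEN and bring this file up with Docker Compose), vLLM's OpenAI-compatible server should answer on the published port. The snippet below is only a smoke-test sketch under the file's defaults (localhost, port 5100, no API key configured); it is not part of the repository.

import json
import os
import urllib.request

# Port falls back to the compose file's default of 5100 when VLLM_PORT is unset.
port = int(os.environ.get("VLLM_PORT", "5100"))
base_url = f"http://localhost:{port}/v1"

# vLLM's OpenAI-compatible server exposes GET /v1/models; the response lists
# the model launched via the compose `command` above.
with urllib.request.urlopen(f"{base_url}/models", timeout=10) as resp:
    payload = json.load(resp)

print([m["id"] for m in payload.get("data", [])])
# Expected with the defaults above: ['meta-llama/Llama-3.2-3B-Instruct']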