mirror of https://github.com/meta-llama/llama-stack.git
synced 2025-12-17 16:49:50 +00:00
Convert TGI
parent 9bb07ce298
commit 028530546f
14 changed files with 485 additions and 160 deletions
@@ -12,19 +12,20 @@ from pydantic import BaseModel, Field
 @json_schema_type
 class TGIImplConfig(BaseModel):
-    host: str = "localhost"
-    port: int = 8080
-    protocol: str = "http"
-
-    @property
-    def url(self) -> str:
-        return f"{self.protocol}://{self.host}:{self.port}"
+    url: str = Field(
+        description="The URL for the TGI serving endpoint",
+    )
     api_token: Optional[str] = Field(
         default=None,
         description="A bearer token if your TGI endpoint is protected.",
     )
+
+    @classmethod
+    def sample_run_config(cls, url: str = "${env.TGI_URL}"):
+        return {
+            "url": url,
+        }
 
 
 @json_schema_type
 class InferenceEndpointImplConfig(BaseModel):
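For context (not part of the commit): a minimal, self-contained sketch of the new url-based config. The class body is copied from the diff above; the json_schema_type decorator is omitted since it only annotates the schema, the example URL is an assumption, and the real ${env.TGI_URL} substitution is done by llama-stack's run-config tooling, not by this snippet.

    # Minimal sketch of the new TGIImplConfig from the diff above.
    from typing import Optional

    from pydantic import BaseModel, Field


    class TGIImplConfig(BaseModel):
        url: str = Field(
            description="The URL for the TGI serving endpoint",
        )
        api_token: Optional[str] = Field(
            default=None,
            description="A bearer token if your TGI endpoint is protected.",
        )

        @classmethod
        def sample_run_config(cls, url: str = "${env.TGI_URL}"):
            return {
                "url": url,
            }


    # The single `url` field replaces the old host/port/protocol trio:
    config = TGIImplConfig(url="http://localhost:8080")  # example URL, an assumption
    print(config.url)                         # -> http://localhost:8080
    print(TGIImplConfig.sample_run_config())  # -> {'url': '${env.TGI_URL}'}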
@@ -1,35 +0,0 @@
-services:
-  ${SERVICE_NAME:-tgi}:
-    image: ghcr.io/huggingface/text-generation-inference:2.3.1
-    network_mode: "host"
-    volumes:
-      - $HOME/.cache/huggingface:/data
-    ports:
-      - ${TGI_PORT:-8000}:${TGI_PORT:-8000}
-    devices:
-      - nvidia.com/gpu=all
-    environment:
-      - CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES:-0}
-      - HF_HOME=/data
-      - HF_DATASETS_CACHE=/data
-      - HF_MODULES_CACHE=/data
-      - HF_HUB_CACHE=/data
-    command: >
-      --dtype bfloat16
-      --usage-stats off
-      --sharded false
-      --model-id ${TGI_MODEL:-meta-llama/Llama-3.2-3B-Instruct}
-      --port ${TGI_PORT:-8000}
-      --cuda-memory-fraction ${TGI_CUDA_MEMORY_FRACTION:-0.8}
-    deploy:
-      resources:
-        reservations:
-          devices:
-            - driver: nvidia
-              capabilities: [gpu]
-    runtime: nvidia
-    healthcheck:
-      test: ["CMD", "curl", "-f", "http://${SERVICE_NAME:-tgi}:${TGI_PORT:-8000}/health"]
-      interval: 5s
-      timeout: 5s
-      retries: 30
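The deleted compose file's healthcheck polled TGI's /health endpoint with curl (5s interval, 30 retries). A rough Python equivalent for anyone replacing the compose setup; the localhost host and stdlib-only client are assumptions, while the port default and retry budget mirror the deleted file:

    # Sketch of the readiness probe the compose healthcheck performed:
    # poll TGI's /health endpoint until it returns 200 or we give up.
    import os
    import time
    import urllib.error
    import urllib.request


    def wait_for_tgi(base_url: str, retries: int = 30, interval: float = 5.0) -> bool:
        """Return True once GET {base_url}/health succeeds, False after `retries` attempts."""
        for _ in range(retries):
            try:
                with urllib.request.urlopen(f"{base_url}/health", timeout=5) as resp:
                    if resp.status == 200:
                        return True
            except (urllib.error.URLError, OSError):
                pass
            time.sleep(interval)
        return False


    if __name__ == "__main__":
        port = os.environ.get("TGI_PORT", "8000")  # same default as the compose file
        print(wait_for_tgi(f"http://localhost:{port}"))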
@@ -1,26 +0,0 @@
-services:
-  ${SERVICE_NAME:-vllm}:
-    image: vllm/vllm-openai:latest
-    ports:
-      - ${VLLM_PORT:-5100}:${VLLM_PORT:-5100}
-    volumes:
-      - $HOME/.cache/huggingface:/root/.cache/huggingface
-    devices:
-      - nvidia.com/gpu=all
-    deploy:
-      resources:
-        reservations:
-          devices:
-            - driver: nvidia
-              capabilities: [gpu]
-    runtime: nvidia
-    environment:
-      - CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES:-0}
-      - HUGGING_FACE_HUB_TOKEN=${HF_TOKEN}
-    command: >
-      --gpu-memory-utilization 0.75
-      --model ${VLLM_MODEL:-meta-llama/Llama-3.2-3B-Instruct}
-      --enforce-eager
-      --max-model-len 8192
-      --max-num-seqs 16
-      --port ${VLLM_PORT:-5100}
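The vllm/vllm-openai image exposes an OpenAI-compatible HTTP API, so a quick smoke test after removing the compose file is to list the served models. A hedged sketch: localhost and the stdlib client are assumptions; the 5100 port default comes from the deleted file.

    # Sketch: list the models served by a running vLLM OpenAI-compatible server.
    import json
    import os
    import urllib.request

    port = os.environ.get("VLLM_PORT", "5100")  # same default as the compose file
    with urllib.request.urlopen(f"http://localhost:{port}/v1/models", timeout=10) as resp:
        payload = json.load(resp)

    # Expect the model id passed via --model, e.g. meta-llama/Llama-3.2-3B-Instruct.
    for model in payload.get("data", []):
        print(model["id"])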