Convert TGI

Ashwin Bharambe 2024-11-17 14:49:41 -08:00
parent 9bb07ce298
commit 028530546f
14 changed files with 485 additions and 160 deletions


@@ -12,19 +12,20 @@ from pydantic import BaseModel, Field
 
 @json_schema_type
 class TGIImplConfig(BaseModel):
-    host: str = "localhost"
-    port: int = 8080
-    protocol: str = "http"
-
-    @property
-    def url(self) -> str:
-        return f"{self.protocol}://{self.host}:{self.port}"
-
+    url: str = Field(
+        description="The URL for the TGI serving endpoint",
+    )
     api_token: Optional[str] = Field(
         default=None,
         description="A bearer token if your TGI endpoint is protected.",
     )
 
+    @classmethod
+    def sample_run_config(cls, url: str = "${env.TGI_URL}"):
+        return {
+            "url": url,
+        }
+
 
 @json_schema_type
 class InferenceEndpointImplConfig(BaseModel):
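The hunk above replaces the host/port/protocol fields with a single url field and adds a sample_run_config template that defers the endpoint to the TGI_URL environment variable. A minimal, self-contained sketch of the new shape (the class body mirrors the diff; the json_schema_type decorator is omitted for self-containment, and the usage lines at the bottom are illustrative, not part of the commit):

from typing import Optional

from pydantic import BaseModel, Field


class TGIImplConfig(BaseModel):
    # Mirrors the post-commit config: one URL instead of host/port/protocol.
    url: str = Field(description="The URL for the TGI serving endpoint")
    api_token: Optional[str] = Field(
        default=None,
        description="A bearer token if your TGI endpoint is protected.",
    )

    @classmethod
    def sample_run_config(cls, url: str = "${env.TGI_URL}"):
        # Run configs carry an ${env.*} placeholder that is expanded at run time.
        return {"url": url}


# Illustrative usage (not part of the commit):
config = TGIImplConfig(url="http://localhost:8080")
print(config.url)                          # http://localhost:8080
print(TGIImplConfig.sample_run_config())   # {'url': '${env.TGI_URL}'}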


@@ -1,35 +0,0 @@
-services:
-  ${SERVICE_NAME:-tgi}:
-    image: ghcr.io/huggingface/text-generation-inference:2.3.1
-    network_mode: "host"
-    volumes:
-      - $HOME/.cache/huggingface:/data
-    ports:
-      - ${TGI_PORT:-8000}:${TGI_PORT:-8000}
-    devices:
-      - nvidia.com/gpu=all
-    environment:
-      - CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES:-0}
-      - HF_HOME=/data
-      - HF_DATASETS_CACHE=/data
-      - HF_MODULES_CACHE=/data
-      - HF_HUB_CACHE=/data
-    command: >
-      --dtype bfloat16
-      --usage-stats off
-      --sharded false
-      --model-id ${TGI_MODEL:-meta-llama/Llama-3.2-3B-Instruct}
-      --port ${TGI_PORT:-8000}
-      --cuda-memory-fraction ${TGI_CUDA_MEMORY_FRACTION:-0.8}
-    deploy:
-      resources:
-        reservations:
-          devices:
-            - driver: nvidia
-              capabilities: [gpu]
-    runtime: nvidia
-    healthcheck:
-      test: ["CMD", "curl", "-f", "http://${SERVICE_NAME:-tgi}:${TGI_PORT:-8000}/health"]
-      interval: 5s
-      timeout: 5s
-      retries: 30
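The compose file deleted above served TGI on ${TGI_PORT:-8000} and health-checked it via GET /health; with the file gone, the endpoint is expected to arrive through TGI_URL, the same variable the new sample_run_config references. A hedged sketch of probing that health route from Python, assuming a TGI server is already running at that URL:

import os
import urllib.request

# TGI_URL matches the placeholder in TGIImplConfig.sample_run_config(); the
# fallback reuses the port the removed compose file exposed by default.
tgi_url = os.environ.get("TGI_URL", "http://localhost:8000")

# The removed healthcheck curled /health; perform the same check here.
try:
    with urllib.request.urlopen(f"{tgi_url}/health", timeout=5) as resp:
        print("TGI healthy:", resp.status == 200)
except OSError as exc:
    print("TGI not reachable:", exc)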


@@ -1,26 +0,0 @@
-services:
-  ${SERVICE_NAME:-vllm}:
-    image: vllm/vllm-openai:latest
-    ports:
-      - ${VLLM_PORT:-5100}:${VLLM_PORT:-5100}
-    volumes:
-      - $HOME/.cache/huggingface:/root/.cache/huggingface
-    devices:
-      - nvidia.com/gpu=all
-    deploy:
-      resources:
-        reservations:
-          devices:
-            - driver: nvidia
-              capabilities: [gpu]
-    runtime: nvidia
-    environment:
-      - CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES:-0}
-      - HUGGING_FACE_HUB_TOKEN=${HF_TOKEN}
-    command: >
-      --gpu-memory-utilization 0.75
-      --model ${VLLM_MODEL:-meta-llama/Llama-3.2-3B-Instruct}
-      --enforce-eager
-      --max-model-len 8192
-      --max-num-seqs 16
-      --port ${VLLM_PORT:-5100}
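The removed vLLM service ran the OpenAI-compatible vllm/vllm-openai server on ${VLLM_PORT:-5100}. A hedged sketch of verifying such an endpoint from Python; /v1/models comes from the OpenAI-compatible API that image serves, and the VLLM_URL variable is illustrative rather than something this commit defines:

import json
import os
import urllib.request

# VLLM_URL is illustrative; the removed compose file defaulted the port to 5100.
vllm_url = os.environ.get("VLLM_URL", "http://localhost:5100")

# /v1/models lists the model passed via --model (Llama-3.2-3B-Instruct above).
with urllib.request.urlopen(f"{vllm_url}/v1/models", timeout=5) as resp:
    print(json.load(resp))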