mirror of https://github.com/meta-llama/llama-stack.git
synced 2025-12-17 16:49:50 +00:00
Convert TGI
parent 9bb07ce298
commit 028530546f
14 changed files with 485 additions and 160 deletions
@@ -12,19 +12,20 @@ from pydantic import BaseModel, Field
 @json_schema_type
 class TGIImplConfig(BaseModel):
-    host: str = "localhost"
-    port: int = 8080
-    protocol: str = "http"
-
-    @property
-    def url(self) -> str:
-        return f"{self.protocol}://{self.host}:{self.port}"
+    url: str = Field(
+        description="The URL for the TGI serving endpoint",
+    )
     api_token: Optional[str] = Field(
         default=None,
         description="A bearer token if your TGI endpoint is protected.",
     )
+
+    @classmethod
+    def sample_run_config(cls, url: str = "${env.TGI_URL}"):
+        return {
+            "url": url,
+        }
 
 
 @json_schema_type
 class InferenceEndpointImplConfig(BaseModel):
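For context (not part of the commit): a minimal, self-contained sketch of the new url-based config. The class body is copied from the diff above; the json_schema_type decorator is omitted since it only annotates the schema, the example URL is an assumption, and the real ${env.TGI_URL} substitution is done by llama-stack's run-config tooling, not by this snippet.

    # Minimal sketch of the new TGIImplConfig from the diff above.
    from typing import Optional

    from pydantic import BaseModel, Field


    class TGIImplConfig(BaseModel):
        url: str = Field(
            description="The URL for the TGI serving endpoint",
        )
        api_token: Optional[str] = Field(
            default=None,
            description="A bearer token if your TGI endpoint is protected.",
        )

        @classmethod
        def sample_run_config(cls, url: str = "${env.TGI_URL}"):
            return {
                "url": url,
            }


    # The single `url` field replaces the old host/port/protocol trio:
    config = TGIImplConfig(url="http://localhost:8080")  # example URL, an assumption
    print(config.url)                         # -> http://localhost:8080
    print(TGIImplConfig.sample_run_config())  # -> {'url': '${env.TGI_URL}'}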
@@ -1,35 +0,0 @@
-services:
-  ${SERVICE_NAME:-tgi}:
-    image: ghcr.io/huggingface/text-generation-inference:2.3.1
-    network_mode: "host"
-    volumes:
-      - $HOME/.cache/huggingface:/data
-    ports:
-      - ${TGI_PORT:-8000}:${TGI_PORT:-8000}
-    devices:
-      - nvidia.com/gpu=all
-    environment:
-      - CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES:-0}
-      - HF_HOME=/data
-      - HF_DATASETS_CACHE=/data
-      - HF_MODULES_CACHE=/data
-      - HF_HUB_CACHE=/data
-    command: >
-      --dtype bfloat16
-      --usage-stats off
-      --sharded false
-      --model-id ${TGI_MODEL:-meta-llama/Llama-3.2-3B-Instruct}
-      --port ${TGI_PORT:-8000}
-      --cuda-memory-fraction ${TGI_CUDA_MEMORY_FRACTION:-0.8}
-    deploy:
-      resources:
-        reservations:
-          devices:
-            - driver: nvidia
-              capabilities: [gpu]
-    runtime: nvidia
-    healthcheck:
-      test: ["CMD", "curl", "-f", "http://${SERVICE_NAME:-tgi}:${TGI_PORT:-8000}/health"]
-      interval: 5s
-      timeout: 5s
-      retries: 30
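The deleted compose file's healthcheck polled TGI's /health endpoint with curl (5s interval, 30 retries). A rough Python equivalent for anyone replacing the compose setup; the localhost host and stdlib-only client are assumptions, while the port default and retry budget mirror the deleted file:

    # Sketch of the readiness probe the compose healthcheck performed:
    # poll TGI's /health endpoint until it returns 200 or we give up.
    import os
    import time
    import urllib.error
    import urllib.request


    def wait_for_tgi(base_url: str, retries: int = 30, interval: float = 5.0) -> bool:
        """Return True once GET {base_url}/health succeeds, False after `retries` attempts."""
        for _ in range(retries):
            try:
                with urllib.request.urlopen(f"{base_url}/health", timeout=5) as resp:
                    if resp.status == 200:
                        return True
            except (urllib.error.URLError, OSError):
                pass
            time.sleep(interval)
        return False


    if __name__ == "__main__":
        port = os.environ.get("TGI_PORT", "8000")  # same default as the compose file
        print(wait_for_tgi(f"http://localhost:{port}"))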
@@ -1,26 +0,0 @@
-services:
-  ${SERVICE_NAME:-vllm}:
-    image: vllm/vllm-openai:latest
-    ports:
-      - ${VLLM_PORT:-5100}:${VLLM_PORT:-5100}
-    volumes:
-      - $HOME/.cache/huggingface:/root/.cache/huggingface
-    devices:
-      - nvidia.com/gpu=all
-    deploy:
-      resources:
-        reservations:
-          devices:
-            - driver: nvidia
-              capabilities: [gpu]
-    runtime: nvidia
-    environment:
-      - CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES:-0}
-      - HUGGING_FACE_HUB_TOKEN=${HF_TOKEN}
-    command: >
-      --gpu-memory-utilization 0.75
-      --model ${VLLM_MODEL:-meta-llama/Llama-3.2-3B-Instruct}
-      --enforce-eager
-      --max-model-len 8192
-      --max-num-seqs 16
-      --port ${VLLM_PORT:-5100}
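The vllm/vllm-openai image exposes an OpenAI-compatible HTTP API, so a quick smoke test after removing the compose file is to list the served models. A hedged sketch: localhost and the stdlib client are assumptions; the 5100 port default comes from the deleted file.

    # Sketch: list the models served by a running vLLM OpenAI-compatible server.
    import json
    import os
    import urllib.request

    port = os.environ.get("VLLM_PORT", "5100")  # same default as the compose file
    with urllib.request.urlopen(f"http://localhost:{port}/v1/models", timeout=10) as resp:
        payload = json.load(resp)

    # Expect the model id passed via --model, e.g. meta-llama/Llama-3.2-3B-Instruct.
    for model in payload.get("data", []):
        print(model["id"])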