more fixes, plug shutdown handlers

Still, FastAPI's SIGINT handler is not calling ours.
Ashwin Bharambe 2024-10-05 23:48:18 -07:00 committed by Ashwin Bharambe
parent 60dead6196
commit e45a417543
4 changed files with 32 additions and 12 deletions
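
The note above says FastAPI's (in practice, uvicorn's) SIGINT handler does not end up calling the stack's own shutdown hooks. As a rough, illustrative sketch of the general pattern being plugged in here, and not the code from this commit, one way to guarantee cleanup on Ctrl-C is to install a SIGINT handler on the event loop yourself; shutdown_all_providers below is a hypothetical stand-in for calling each provider's shutdown().

import asyncio
import signal


async def shutdown_all_providers() -> None:
    # Hypothetical placeholder: the real server would await each
    # provider's shutdown() here before the process exits.
    ...


async def serve() -> None:
    loop = asyncio.get_running_loop()
    stop = asyncio.Event()

    # Install our own SIGINT handler so Ctrl-C unblocks serve() instead of
    # being swallowed by the default handler installed by the web server.
    loop.add_signal_handler(signal.SIGINT, stop.set)

    await stop.wait()
    await shutdown_all_providers()


if __name__ == "__main__":
    asyncio.run(serve())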

View file

@@ -13,8 +13,6 @@ from llama_models.llama3.api.chat_format import ChatFormat
 from llama_models.llama3.api.datatypes import StopReason
 from llama_models.llama3.api.tokenizer import Tokenizer
 
-from llama_stack.distribution.datatypes import RoutableProvider
-
 from llama_stack.apis.inference import *  # noqa: F403
 from llama_stack.providers.utils.inference.augment_messages import (
     augment_messages_for_tools,
@@ -25,7 +23,7 @@ from .config import InferenceAPIImplConfig, InferenceEndpointImplConfig, TGIImpl
 logger = logging.getLogger(__name__)
 
 
-class _HfAdapter(Inference, RoutableProvider):
+class _HfAdapter(Inference):
     client: AsyncInferenceClient
     max_tokens: int
     model_id: str
@@ -34,11 +32,17 @@ class _HfAdapter(Inference, RoutableProvider):
         self.tokenizer = Tokenizer.get_instance()
         self.formatter = ChatFormat(self.tokenizer)
 
-    async def validate_routing_keys(self, routing_keys: list[str]) -> None:
-        # these are the model names the Llama Stack will use to route requests to this provider
-        # perform validation here if necessary
-        pass
+    # TODO: make this work properly by checking this against the model_id being
+    # served by the remote endpoint
+    async def register_model(self, model: ModelDef) -> None:
+        pass
+
+    async def list_models(self) -> List[ModelDef]:
+        return []
+
+    async def get_model(self, identifier: str) -> Optional[ModelDef]:
+        return None
 
     async def shutdown(self) -> None:
         pass
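
The TODO in the new register_model stub points at validating the registered model against whatever the remote endpoint is actually serving. A hedged sketch of what that check might look like, reusing the adapter's model_id attribute from the class body above (the exact ModelDef field names are assumptions, not taken from this commit):

    async def register_model(self, model: ModelDef) -> None:
        # self.model_id is resolved when the adapter connects to the TGI
        # endpoint; reject registrations that do not match it.
        if model.identifier != self.model_id:
            raise ValueError(
                f"Endpoint serves '{self.model_id}', cannot register '{model.identifier}'"
            )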

View file

@@ -42,7 +42,7 @@ from llama_stack.apis.inference.inference import (
 from llama_stack.providers.utils.inference.augment_messages import (
     augment_messages_for_tools,
 )
-from llama_stack.providers.utils.inference.routable import RoutableProviderForModels
+from llama_stack.providers.utils.inference.model_registry import ModelRegistryHelper
 
 from .config import VLLMConfig
@@ -75,7 +75,7 @@ def _vllm_sampling_params(sampling_params: Any) -> SamplingParams:
     return SamplingParams().from_optional(**kwargs)
 
 
-class VLLMInferenceImpl(Inference, RoutableProviderForModels):
+class VLLMInferenceImpl(Inference, ModelRegistryHelper):
     """Inference implementation for vLLM."""
 
     HF_MODEL_MAPPINGS = {
@@ -109,7 +109,7 @@ class VLLMInferenceImpl(Inference, RoutableProviderForModels):
     def __init__(self, config: VLLMConfig):
         Inference.__init__(self)
-        RoutableProviderForModels.__init__(
+        ModelRegistryHelper.__init__(
             self,
             stack_to_provider_models_map=self.HF_MODEL_MAPPINGS,
         )
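
Both changed files now lean on ModelRegistryHelper, constructed with a stack_to_provider_models_map. The helper's actual implementation is not shown in this diff; purely as an illustration of the mapping-based pattern, a registry that translates stack model identifiers into provider-specific names might look roughly like this (all names below are assumptions):

from typing import Dict


class ModelRegistryHelperSketch:
    def __init__(self, stack_to_provider_models_map: Dict[str, str]) -> None:
        # Map from Llama Stack model identifiers to the names the backend
        # (e.g. a Hugging Face repo id for vLLM) expects.
        self.stack_to_provider_models_map = stack_to_provider_models_map

    def map_to_provider_model(self, identifier: str) -> str:
        # Resolve a stack-level identifier to the provider's own model name.
        if identifier not in self.stack_to_provider_models_map:
            raise ValueError(f"Unknown model: {identifier}")
        return self.stack_to_provider_models_map[identifier]

    async def register_model(self, model) -> None:
        # Only accept models the provider actually knows how to serve.
        if model.identifier not in self.stack_to_provider_models_map:
            raise ValueError(f"Cannot register unsupported model {model.identifier}")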