From 74c8504f50898d953e4fd25dddba35f0abc55f8b Mon Sep 17 00:00:00 2001
From: Fred Reiss
Date: Wed, 29 Jan 2025 18:07:36 -0800
Subject: [PATCH] Implement unregister_model() and shutdown()

---
Review notes (text in this section is not part of the commit message):
 - _shutdown() now tolerates a missing or None `engine` attribute, so
   __del__() is safe on a partially-constructed instance.
 - unregister_model() delegates teardown to _shutdown() instead of
   duplicating it and dereferencing self.engine without a None check.
 - Line breaks and context-line whitespace were reconstructed from a
   whitespace-collapsed copy of this patch; the `index` blob IDs and the
   commit ID on the first line still refer to the original revision.

 .../providers/inline/inference/vllm/vllm.py   | 54 ++++++++++++++++---
 1 file changed, 47 insertions(+), 7 deletions(-)

diff --git a/llama_stack/providers/inline/inference/vllm/vllm.py b/llama_stack/providers/inline/inference/vllm/vllm.py
index dbfb91382..d5bdc11da 100644
--- a/llama_stack/providers/inline/inference/vllm/vllm.py
+++ b/llama_stack/providers/inline/inference/vllm/vllm.py
@@ -249,6 +249,10 @@ class VLLMInferenceImpl(Inference, ModelsProtocolPrivate):
         self.engine = None
         self.chat = None
 
+    def __del__(self):
+        """Release the vLLM engine if this provider is garbage-collected."""
+        self._shutdown()
+
     ###########################################################################
     # METHODS INHERITED FROM UNDOCUMENTED IMPLICIT MYSTERY BASE CLASS
 
@@ -266,6 +270,31 @@ class VLLMInferenceImpl(Inference, ModelsProtocolPrivate):
         """
         pass
 
+    async def shutdown(self) -> None:
+        """
+        Callback that apparently is invoked when shutting down the Llama
+        Stack server. Not sure how to shut down a Llama Stack server in such
+        a way as to trigger this callback.
+        """
+        _info("Shutting down inline vLLM inference provider.")
+        self._shutdown()
+
+    def _shutdown(self) -> None:
+        """
+        Internal non-async version of :func:`shutdown()`. Idempotent, and
+        safe to call on a partially-initialized instance (e.g. from
+        :func:`__del__()` after a failed constructor).
+        """
+        engine = getattr(self, "engine", None)
+        if engine is not None:
+            engine.shutdown_background_loop()
+        # Reset all per-engine state unconditionally so that repeated calls
+        # leave the provider in the same "nothing loaded" state.
+        self.engine = None
+        self.chat = None
+        self.model_ids = set()
+        self.resolved_model_id = None
+
     ###########################################################################
     # METHODS INHERITED FROM ModelsProtocolPrivate INTERFACE
 
@@ -368,14 +397,25 @@ class VLLMInferenceImpl(Inference, ModelsProtocolPrivate):
         Callback that is called when the server removes an inference endpoint
         from an inference provider.
 
-        The semantics of this callback are not clear. How should model_id
-        be interpreted? What happens to pending requests?
-
-        :param model_id: Undocumented string parameter
-
-        :returns: Nothing, at least according to the spec
+        :param model_id: The same external ID that the higher layers of the
+         stack previously passed to :func:`register_model()`
+
+        :raises ValueError: If ``model_id`` is not currently registered with
+         this provider
         """
-        raise NotImplementedError()
+        if model_id not in self.model_ids:
+            raise ValueError(
+                f"Attempted to unregister model ID '{model_id}', "
+                "but that ID is not registered to this provider."
+            )
+        self.model_ids.remove(model_id)
+
+        if not self.model_ids:
+            # Last model was just unregistered. Shut down the connection
+            # to vLLM and free up resources.
+            # Note that this operation may cause in-flight chat completion
+            # requests on the now-unregistered model to return errors.
+            self._shutdown()
 
     ###########################################################################
     # METHODS INHERITED FROM Inference INTERFACE