Implement unregister_model() and shutdown()

2025-08-12 04:50:39 +00:00 · 2025-01-29 18:07:36 -08:00 · 2025-01-29 18:07:36 -08:00 · 74c8504f50
commit 74c8504f50
parent 4302200396
1 changed files with 39 additions and 7 deletions
--- a/llama_stack/providers/inline/inference/vllm/vllm.py
+++ b/llama_stack/providers/inline/inference/vllm/vllm.py
@ -249,6 +249,9 @@ class VLLMInferenceImpl(Inference, ModelsProtocolPrivate):
        self.engine = None
        self.chat = None
    def __del__(self) -> None:
        """Finalizer: release any running engine via :func:`_shutdown()`."""
        # Same teardown path that the async shutdown() callback uses.
        self._shutdown()
    ###########################################################################
    # METHODS INHERITED FROM UNDOCUMENTED IMPLICIT MYSTERY BASE CLASS
@ -266,6 +269,24 @@ class VLLMInferenceImpl(Inference, ModelsProtocolPrivate):
        """
        pass
    async def shutdown(self) -> None:
        """
        Shut down this provider: log a message, then tear down the engine
        via :func:`_shutdown()`.

        This is apparently the callback that is invoked when the Llama Stack
        server shuts down. NOTE(review): it is not clear how to shut down a
        Llama Stack server in such a way as to trigger this callback --
        confirm against the server's lifecycle code.

        :returns: Nothing
        """
        _info("Shutting down inline vLLM inference provider.")
        # The actual teardown is synchronous and lives in _shutdown() so that
        # __del__() can run the same code.
        self._shutdown()
    def _shutdown(self) -> None:
        """Internal non-async version of self.shutdown(). Idempotent."""
        if self.engine is not None:
            self.engine.shutdown_background_loop()
            self.engine = None
            self.chat = None
            self.model_ids = set()
            self.resolved_model_id = None
    ###########################################################################
    # METHODS INHERITED FROM ModelsProtocolPrivate INTERFACE
@ -368,14 +389,25 @@ class VLLMInferenceImpl(Inference, ModelsProtocolPrivate):
        Callback that is called when the server removes an inference endpoint
        from an inference provider.
-        The semantics of this callback are not clear. How should model_id
+        :param model_id: The same external ID that the higher layers of the
-         be interpreted? What happens to pending requests?
+         stack previously passed to :func:`register_model()`
        :param model_id: Undocumented string parameter
        :returns: Nothing, at least according to the spec
        """
-        raise NotImplementedError()
+        if model_id not in self.model_ids:
            raise ValueError(
                f"Attempted to unregister model ID '{model_id}', "
                f"but that ID is not registered to this provider."
            )
        self.model_ids.remove(model_id)
        if len(self.model_ids) == 0:
            # Last model was just unregistered. Shut down the connection
            # to vLLM and free up resources.
            # Note that this operation may cause in-flight chat completion
            # requests on the now-unregistered model to return errors.
            self.resolved_model_id = None
            self.chat = None
            self.engine.shutdown_background_loop()
            self.engine = None
    ###########################################################################
    # METHODS INHERITED FROM Inference INTERFACE