feat(ollama): periodically refresh models (#2805)

For self-hosted providers like Ollama (or vLLM), the backing server is running a set of models. That server should be treated as the source of truth and the Stack registry should just be a cache for those models. Of course, in production environments, you may not want this (because you statically know which model you are running), hence there's a config boolean to control this behavior.

_This is part of a series of PRs aimed at removing the requirement of needing to set `INFERENCE_MODEL` env variables for running Llama Stack server._

## Test Plan

Copy and modify the starter.yaml template / config and enable `refresh_models: true, refresh_models_interval: 10` for the ollama provider. Then, run:

```
LLAMA_STACK_LOGGING=all=debug \
ENABLE_OLLAMA=ollama uv run llama stack run --image-type venv /tmp/starter.yaml
```

See a gargantuan amount of logs, but verify that the provider is periodically refreshing models. Stop and prune a model from ollama server, restart the server. Verify that the model goes away when I call `uv run llama-stack-client models list`.
2025-07-21 03:59:42 +00:00 · 2025-07-18 12:20:36 -07:00 · 2025-07-18 12:20:36 -07:00 · 68a2dfbad7
commit 68a2dfbad7
parent 6d55f2f137
6 changed files with 123 additions and 16 deletions
--- a/llama_stack/distribution/library_client.py
+++ b/llama_stack/distribution/library_client.py
@@ -151,6 +151,8 @@ class LlamaStackAsLibraryClient(LlamaStackClient):
        self.skip_logger_removal = skip_logger_removal
        self.provider_data = provider_data

+        self.loop = asyncio.new_event_loop()
+
    def initialize(self):
        if in_notebook():
            import nest_asyncio
@@ -159,7 +161,7 @@ class LlamaStackAsLibraryClient(LlamaStackClient):
            if not self.skip_logger_removal:
                self._remove_root_logger_handlers()

-        return asyncio.run(self.async_client.initialize())
+        return self.loop.run_until_complete(self.async_client.initialize())

    def _remove_root_logger_handlers(self):
        """
@@ -172,10 +174,7 @@ class LlamaStackAsLibraryClient(LlamaStackClient):
            logger.info(f"Removed handler {handler.__class__.__name__} from root logger")

    def request(self, *args, **kwargs):
-        # NOTE: We are using AsyncLlamaStackClient under the hood
-        # A new event loop is needed to convert the AsyncStream
-        # from async client into SyncStream return type for streaming
-        loop = asyncio.new_event_loop()
+        loop = self.loop
        asyncio.set_event_loop(loop)

        if kwargs.get("stream"):
@@ -192,7 +191,6 @@ class LlamaStackAsLibraryClient(LlamaStackClient):
                    pending = asyncio.all_tasks(loop)
                    if pending:
                        loop.run_until_complete(asyncio.gather(*pending, return_exceptions=True))
-                    loop.close()

            return sync_generator()
        else:
@@ -202,7 +200,6 @@ class LlamaStackAsLibraryClient(LlamaStackClient):
                pending = asyncio.all_tasks(loop)
                if pending:
                    loop.run_until_complete(asyncio.gather(*pending, return_exceptions=True))
-                loop.close()
            return result