feat: add refresh_models support to inference adapters (default: false) (#3719)

# What does this PR do?

Inference adapters can now set `refresh_models: bool` to control periodic model listing from their providers. The flag lives on the shared `RemoteInferenceProviderConfig`, and `OpenAIMixin.should_refresh_models()` now returns it, so per-adapter overrides are no longer needed.
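For example, a provider config that opts in might be constructed like this (a minimal sketch: the field itself comes from this PR's `RemoteInferenceProviderConfig` change, while the Ollama import path is an assumption about the repo layout, not shown in this diff):

```python
# Sketch only: refresh_models now lives on the shared RemoteInferenceProviderConfig
# base class, so any remote inference provider config accepts it.
# The import path below is assumed, not taken from this diff.
from llama_stack.providers.remote.inference.ollama.config import OllamaImplConfig

config = OllamaImplConfig(
    url="http://localhost:11434",
    refresh_models=True,  # enable periodic model listing; the new default is False
)
```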

BREAKING CHANGE: the Together inference adapter's default behavior changed. It previously always refreshed its model list; it now follows the `refresh_models` setting in its config (default: `false`).
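Deployments that relied on the old Together behavior can opt back in explicitly. Below is a sketch of the relevant run-config entry, expressed as a Python dict; the surrounding structure and `provider_type` string are assumptions, and only the `refresh_models` key is introduced by this change:

```python
# Hypothetical run.yaml provider entry, shown as a Python dict for illustration.
# Everything except refresh_models is assumed boilerplate, not part of this PR.
together_provider = {
    "provider_id": "together",
    "provider_type": "remote::together",
    "config": {
        "api_key": "${env.TOGETHER_API_KEY}",
        "refresh_models": True,  # restores the previous always-refresh behavior
    },
}
```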

Addresses the "models: refresh" item in #3517.

## Test Plan

CI with new tests.
Author: Matthew Farrellee, 2025-10-07 09:19:56 -04:00 (committed by GitHub)
parent 8b9af03a1b
commit e892a3f7f4
31 changed files with 33 additions and 67 deletions


@@ -41,9 +41,6 @@ class DatabricksInferenceAdapter(OpenAIMixin):
             ).serving_endpoints.list()  # TODO: this is not async
         ]
 
-    async def should_refresh_models(self) -> bool:
-        return False
-
     async def openai_completion(
         self,
         model: str,


@@ -6,8 +6,6 @@
 
 from typing import Any
 
-from pydantic import Field
-
 from llama_stack.providers.utils.inference.model_registry import RemoteInferenceProviderConfig
 
 DEFAULT_OLLAMA_URL = "http://localhost:11434"
@@ -15,10 +13,6 @@ DEFAULT_OLLAMA_URL = "http://localhost:11434"
 
 class OllamaImplConfig(RemoteInferenceProviderConfig):
     url: str = DEFAULT_OLLAMA_URL
-    refresh_models: bool = Field(
-        default=False,
-        description="Whether to refresh models periodically",
-    )
 
     @classmethod
     def sample_run_config(cls, url: str = "${env.OLLAMA_URL:=http://localhost:11434}", **kwargs) -> dict[str, Any]:


@@ -72,9 +72,6 @@ class OllamaInferenceAdapter(OpenAIMixin):
                 f"Ollama Server is not running (message: {r['message']}). Make sure to start it using `ollama serve` in a separate terminal"
             )
 
-    async def should_refresh_models(self) -> bool:
-        return self.config.refresh_models
-
     async def health(self) -> HealthResponse:
         """
         Performs a health check by verifying connectivity to the Ollama server.


@@ -63,9 +63,6 @@ class TogetherInferenceAdapter(OpenAIMixin, NeedsRequestProviderData):
         # Together's /v1/models is not compatible with OpenAI's /v1/models. Together support ticket #13355 -> will not fix, use Together's own client
         return [m.id for m in await self._get_client().models.list()]
 
-    async def should_refresh_models(self) -> bool:
-        return True
-
     async def openai_embeddings(
         self,
         model: str,


@@ -30,10 +30,6 @@ class VLLMInferenceAdapterConfig(RemoteInferenceProviderConfig):
         default=True,
         description="Whether to verify TLS certificates. Can be a boolean or a path to a CA certificate file.",
     )
-    refresh_models: bool = Field(
-        default=False,
-        description="Whether to refresh models periodically",
-    )
 
     @field_validator("tls_verify")
     @classmethod


@@ -53,10 +53,6 @@ class VLLMInferenceAdapter(OpenAIMixin):
                 "You must provide a URL in run.yaml (or via the VLLM_URL environment variable) to use vLLM."
             )
 
-    async def should_refresh_models(self) -> bool:
-        # Strictly respecting the refresh_models directive
-        return self.config.refresh_models
-
     async def health(self) -> HealthResponse:
         """
         Performs a health check by verifying connectivity to the remote vLLM server.


@@ -24,6 +24,10 @@ class RemoteInferenceProviderConfig(BaseModel):
         default=None,
         description="List of models that should be registered with the model registry. If None, all models are allowed.",
     )
+    refresh_models: bool = Field(
+        default=False,
+        description="Whether to refresh models periodically from the provider",
+    )
 
 
 # TODO: this class is more confusing than useful right now. We need to make it


@@ -484,7 +484,7 @@ class OpenAIMixin(NeedsRequestProviderData, ABC, BaseModel):
         return model in self._model_cache
 
     async def should_refresh_models(self) -> bool:
-        return False
+        return self.config.refresh_models
 
     #
     # The model_dump implementations are to avoid serializing the extra fields,