feat: add refresh_models support to inference adapters (default: false) (#3719)

# What does this PR do?

Inference adapters can now set `refresh_models: bool` to control periodic model listing from their providers. The flag lives on the shared `RemoteInferenceProviderConfig`, and `OpenAIMixin.should_refresh_models()` now returns it, so per-adapter overrides are no longer needed.
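For example, a provider config that opts in might be constructed like this (a minimal sketch: the field itself comes from this PR's `RemoteInferenceProviderConfig` change, while the Ollama import path is an assumption about the repo layout, not shown in this diff):

```python
# Sketch only: refresh_models now lives on the shared RemoteInferenceProviderConfig
# base class, so any remote inference provider config accepts it.
# The import path below is assumed, not taken from this diff.
from llama_stack.providers.remote.inference.ollama.config import OllamaImplConfig

config = OllamaImplConfig(
    url="http://localhost:11434",
    refresh_models=True,  # enable periodic model listing; the new default is False
)
```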

BREAKING CHANGE: the Together inference adapter's default behavior changed. It previously always refreshed its model list; it now follows the `refresh_models` setting in its config (default: `false`).
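Deployments that relied on the old Together behavior can opt back in explicitly. Below is a sketch of the relevant run-config entry, expressed as a Python dict; the surrounding structure and `provider_type` string are assumptions, and only the `refresh_models` key is introduced by this change:

```python
# Hypothetical run.yaml provider entry, shown as a Python dict for illustration.
# Everything except refresh_models is assumed boilerplate, not part of this PR.
together_provider = {
    "provider_id": "together",
    "provider_type": "remote::together",
    "config": {
        "api_key": "${env.TOGETHER_API_KEY}",
        "refresh_models": True,  # restores the previous always-refresh behavior
    },
}
```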

Addresses the "models: refresh" item in #3517.

## Test Plan

CI with new tests.
Author: Matthew Farrellee, 2025-10-07 09:19:56 -04:00 (committed by GitHub)
parent 8b9af03a1b
commit e892a3f7f4
31 changed files with 33 additions and 67 deletions


@@ -41,9 +41,6 @@ class DatabricksInferenceAdapter(OpenAIMixin):
             ).serving_endpoints.list()  # TODO: this is not async
         ]
 
-    async def should_refresh_models(self) -> bool:
-        return False
-
     async def openai_completion(
         self,
         model: str,


@@ -6,8 +6,6 @@
 
 from typing import Any
 
-from pydantic import Field
-
 from llama_stack.providers.utils.inference.model_registry import RemoteInferenceProviderConfig
 
 DEFAULT_OLLAMA_URL = "http://localhost:11434"
@@ -15,10 +13,6 @@ DEFAULT_OLLAMA_URL = "http://localhost:11434"
 
 class OllamaImplConfig(RemoteInferenceProviderConfig):
     url: str = DEFAULT_OLLAMA_URL
-    refresh_models: bool = Field(
-        default=False,
-        description="Whether to refresh models periodically",
-    )
 
     @classmethod
     def sample_run_config(cls, url: str = "${env.OLLAMA_URL:=http://localhost:11434}", **kwargs) -> dict[str, Any]:


@@ -72,9 +72,6 @@ class OllamaInferenceAdapter(OpenAIMixin):
                 f"Ollama Server is not running (message: {r['message']}). Make sure to start it using `ollama serve` in a separate terminal"
             )
 
-    async def should_refresh_models(self) -> bool:
-        return self.config.refresh_models
-
     async def health(self) -> HealthResponse:
         """
         Performs a health check by verifying connectivity to the Ollama server.


@@ -63,9 +63,6 @@ class TogetherInferenceAdapter(OpenAIMixin, NeedsRequestProviderData):
         # Together's /v1/models is not compatible with OpenAI's /v1/models. Together support ticket #13355 -> will not fix, use Together's own client
         return [m.id for m in await self._get_client().models.list()]
 
-    async def should_refresh_models(self) -> bool:
-        return True
-
     async def openai_embeddings(
         self,
         model: str,


@@ -30,10 +30,6 @@ class VLLMInferenceAdapterConfig(RemoteInferenceProviderConfig):
         default=True,
         description="Whether to verify TLS certificates. Can be a boolean or a path to a CA certificate file.",
     )
-    refresh_models: bool = Field(
-        default=False,
-        description="Whether to refresh models periodically",
-    )
 
     @field_validator("tls_verify")
     @classmethod


@@ -53,10 +53,6 @@ class VLLMInferenceAdapter(OpenAIMixin):
                 "You must provide a URL in run.yaml (or via the VLLM_URL environment variable) to use vLLM."
             )
 
-    async def should_refresh_models(self) -> bool:
-        # Strictly respecting the refresh_models directive
-        return self.config.refresh_models
-
     async def health(self) -> HealthResponse:
         """
         Performs a health check by verifying connectivity to the remote vLLM server.


@@ -24,6 +24,10 @@ class RemoteInferenceProviderConfig(BaseModel):
         default=None,
         description="List of models that should be registered with the model registry. If None, all models are allowed.",
     )
+    refresh_models: bool = Field(
+        default=False,
+        description="Whether to refresh models periodically from the provider",
+    )
 
 
 # TODO: this class is more confusing than useful right now. We need to make it


@@ -484,7 +484,7 @@ class OpenAIMixin(NeedsRequestProviderData, ABC, BaseModel):
         return model in self._model_cache
 
     async def should_refresh_models(self) -> bool:
-        return False
+        return self.config.refresh_models
 
     #
     # The model_dump implementations are to avoid serializing the extra fields,