From 2b54b57145e5bc1c7f66421bb87d8dd710d02ef3 Mon Sep 17 00:00:00 2001
From: Akram Ben Aissi
Date: Fri, 3 Oct 2025 20:41:54 +0200
Subject: [PATCH] feat: implement graceful model discovery for vLLM provider

- Attempt model discovery first for backward compatibility
- If discovery fails and refresh_models=false, continue without error
- If discovery fails and refresh_models=true, fail hard with ValueError
- Supports dynamic token authentication scenarios

Fixes OAuth authentication issues when vLLM service requires dynamic tokens
---
 .../providers/remote/inference/vllm/vllm.py | 24 ++++++++++---------
 1 file changed, 13 insertions(+), 11 deletions(-)

diff --git a/llama_stack/providers/remote/inference/vllm/vllm.py b/llama_stack/providers/remote/inference/vllm/vllm.py
index 54ac8e1dc..e1607789f 100644
--- a/llama_stack/providers/remote/inference/vllm/vllm.py
+++ b/llama_stack/providers/remote/inference/vllm/vllm.py
@@ -9,7 +9,6 @@ from typing import Any
 from urllib.parse import urljoin
 
 import httpx
-from openai import APIConnectionError
 from openai.types.chat.chat_completion_chunk import (
     ChatCompletionChunk as OpenAIChatCompletionChunk,
 )
@@ -339,16 +338,19 @@ class VLLMInferenceAdapter(OpenAIMixin, LiteLLMOpenAIMixin, Inference, ModelsPro
             pass  # Ignore statically unknown model, will check live listing
         try:
             res = self.client.models.list()
-        except APIConnectionError as e:
-            raise ValueError(
-                f"Failed to connect to vLLM at {self.config.url}. Please check if vLLM is running and accessible at that URL."
-            ) from e
-        available_models = [m.id async for m in res]
-        if model.provider_resource_id not in available_models:
-            raise ValueError(
-                f"Model {model.provider_resource_id} is not being served by vLLM. "
-                f"Available models: {', '.join(available_models)}"
-            )
+            available_models = [m.id async for m in res]
+            if model.provider_resource_id not in available_models:
+                raise ValueError(
+                    f"Model {model.provider_resource_id} is not being served by vLLM. "
+                    f"Available models: {', '.join(available_models)}"
+                )
+        except Exception as e:
+            if self.config.refresh_models:
+                raise ValueError(f"Model verification failed: {e}") from e
+            # if refresh_models is false, gracefully continue without verification
+            log.warning(f"Model verification failed for model {model.model_id} with error {e}")
+            log.warning("Continuing without live check (refresh_models=false).")
+
         return model
 
     async def _get_params(self, request: ChatCompletionRequest) -> dict:
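
The bullets in the commit message describe a verify-then-fall-back policy. As a rough standalone sketch (not the adapter code itself; `verify_model_id`, `list_served_models`, and the logger setup are hypothetical stand-ins for the adapter method, `self.client.models.list()`, and the provider's logger), the decision logic amounts to:

    import logging

    log = logging.getLogger(__name__)

    async def verify_model_id(model_id: str, refresh_models: bool, list_served_models) -> None:
        # `list_served_models` is a hypothetical async callable that returns the model
        # IDs currently served by vLLM (standing in for `self.client.models.list()`).
        try:
            available = await list_served_models()
            if model_id not in available:
                raise ValueError(f"Model {model_id} is not being served by vLLM.")
        except Exception as e:
            if refresh_models:
                # refresh_models=true: any discovery failure is fatal.
                raise ValueError(f"Model verification failed: {e}") from e
            # refresh_models=false: tolerate the failure (e.g. a dynamic OAuth token is
            # not yet available) and continue without the live check.
            log.warning("Model verification failed for %s: %s; continuing without live check.", model_id, e)

Under this policy, a discovery failure (for example, when the vLLM endpoint requires a dynamic OAuth token that is not available at registration time) no longer blocks model registration unless refresh_models is explicitly enabled.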