Mirror of https://github.com/meta-llama/llama-stack.git, synced 2025-10-03 19:57:35 +00:00
feat: implement graceful model discovery for vLLM provider
- Attempt model discovery first for backward compatibility
- If discovery fails and refresh_models=false, continue without error
- If discovery fails and refresh_models=true, fail hard with ValueError
- Supports dynamic token authentication scenarios

Fixes OAuth authentication issues when vLLM service requires dynamic tokens
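The change boils down to the control flow sketched below (the actual diff follows further down). This is a minimal, self-contained simulation, not the adapter code: list_models_unreachable, verify_model, and the model id are illustrative stand-ins, while the refresh_models branching and the log messages mirror the diff.

import asyncio
import logging

logging.basicConfig(level=logging.WARNING)
log = logging.getLogger("vllm-model-check")


async def list_models_unreachable():
    # Stand-in for client.models.list(): the endpoint wants a dynamically
    # issued OAuth token at request time, so listing models up front fails.
    raise ConnectionError("401: static token rejected by vLLM endpoint")
    yield  # unreachable; marks this as an async generator, like the OpenAI paginator


async def verify_model(model_id: str, refresh_models: bool) -> str:
    # Sketch of the commit's logic, not the adapter's real method signature.
    try:
        available = [m async for m in list_models_unreachable()]
        if model_id not in available:
            raise ValueError(f"Model {model_id} is not being served by vLLM.")
    except Exception as e:
        if refresh_models:
            # refresh_models=true: discovery is required, so fail hard.
            raise ValueError(f"Model verification failed: {e}") from e
        # refresh_models=false: log and continue without live verification.
        log.warning(f"Model verification failed for model {model_id} with error {e}")
        log.warning("Continuing without live check (refresh_models=false).")
    return model_id


async def main():
    # Graceful path: the model is accepted even though discovery failed.
    print(await verify_model("my-model", refresh_models=False))
    # Strict path: the same discovery failure surfaces as a ValueError.
    try:
        await verify_model("my-model", refresh_models=True)
    except ValueError as e:
        print(f"raised as expected: {e}")


asyncio.run(main())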
This commit is contained in:
parent 188a56af5c
commit 2b54b57145

1 changed file with 13 additions and 11 deletions
@@ -9,7 +9,6 @@ from typing import Any
 from urllib.parse import urljoin
 
 import httpx
-from openai import APIConnectionError
 from openai.types.chat.chat_completion_chunk import (
     ChatCompletionChunk as OpenAIChatCompletionChunk,
 )
@@ -339,16 +338,19 @@ class VLLMInferenceAdapter(OpenAIMixin, LiteLLMOpenAIMixin, Inference, ModelsPro
             pass  # Ignore statically unknown model, will check live listing
         try:
             res = self.client.models.list()
-        except APIConnectionError as e:
-            raise ValueError(
-                f"Failed to connect to vLLM at {self.config.url}. Please check if vLLM is running and accessible at that URL."
-            ) from e
-        available_models = [m.id async for m in res]
-        if model.provider_resource_id not in available_models:
-            raise ValueError(
-                f"Model {model.provider_resource_id} is not being served by vLLM. "
-                f"Available models: {', '.join(available_models)}"
-            )
+            available_models = [m.id async for m in res]
+            if model.provider_resource_id not in available_models:
+                raise ValueError(
+                    f"Model {model.provider_resource_id} is not being served by vLLM. "
+                    f"Available models: {', '.join(available_models)}"
+                )
+        except Exception as e:
+            if self.config.refresh_models:
+                raise ValueError(f"Model verification failed: {e}") from e
+            # if refresh_models is false, gracefully continue without verification
+            log.warning(f"Model verification failed for model {model.model_id} with error {e}")
+            log.warning("Continuing without live check (refresh_models=false).")
 
         return model
 
     async def _get_params(self, request: ChatCompletionRequest) -> dict:
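From the operator's side, the new flag is what selects between the two paths. The snippet below is only an illustration of that choice: url and refresh_models correspond to the self.config fields referenced in the diff, the endpoint value is a placeholder, and the dicts are not the provider's actual configuration schema.

# Illustration only: two hypothetical remote vLLM provider settings.
strict = {
    "url": "https://vllm.example.internal/v1",  # placeholder endpoint
    "refresh_models": True,   # discovery must succeed; failures raise ValueError
}
graceful = {
    "url": "https://vllm.example.internal/v1",  # placeholder endpoint
    "refresh_models": False,  # discovery failures are logged; the model check is skipped
}

With refresh_models left false, a vLLM endpoint that only accepts dynamically issued OAuth tokens no longer blocks the model check up front: the failure is logged and the model is accepted as configured.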