From a548169b991b73c69bdc190076d39edd793c8b7e Mon Sep 17 00:00:00 2001
From: Akram Ben Aissi
Date: Fri, 10 Oct 2025 15:23:13 +0100
Subject: [PATCH] fix: allow skipping model availability check for vLLM (#3739)

# What does this PR do?

Allows the model availability check to fail gracefully instead of crashing the stack on startup.

## Test Plan

Set `VLLM_URL` to your vLLM server and run:

```
(base) akram@Mac llama-stack % LLAMA_STACK_LOGGING="all=debug" VLLM_ENABLE_MODEL_DISCOVERY=false MILVUS_DB_PATH=./milvus.db INFERENCE_MODEL=vllm uv run --with llama-stack llama stack build --distro starter --image-type venv --run
```

```
INFO 2025-10-08 20:11:24,637 llama_stack.providers.utils.inference.inference_store:74 inference: Write queue disabled for SQLite to avoid concurrency issues
INFO 2025-10-08 20:11:24,866 llama_stack.providers.utils.responses.responses_store:96 openai_responses: Write queue disabled for SQLite to avoid concurrency issues
ERROR 2025-10-08 20:11:26,160 llama_stack.providers.utils.inference.openai_mixin:439 providers::utils: VLLMInferenceAdapter.list_provider_model_ids() failed with: Found.
[...]
INFO 2025-10-08 20:11:26,295 uvicorn.error:84 uncategorized: Started server process [83144]
INFO 2025-10-08 20:11:26,296 uvicorn.error:48 uncategorized: Waiting for application startup.
INFO 2025-10-08 20:11:26,297 llama_stack.core.server.server:170 core::server: Starting up
INFO 2025-10-08 20:11:26,297 llama_stack.core.stack:399 core: starting registry refresh task
INFO 2025-10-08 20:11:26,311 uvicorn.error:62 uncategorized: Application startup complete.
INFO 2025-10-08 20:11:26,312 uvicorn.error:216 uncategorized: Uvicorn running on http://['::', '0.0.0.0']:8321 (Press CTRL+C to quit)
ERROR 2025-10-08 20:11:26,791 llama_stack.providers.utils.inference.openai_mixin:439 providers::utils: VLLMInferenceAdapter.list_provider_model_ids() failed with: Found.
```
---
 .../providers/remote/inference/vllm/vllm.py | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/llama_stack/providers/remote/inference/vllm/vllm.py b/llama_stack/providers/remote/inference/vllm/vllm.py
index 310eaf7b6..5974ca176 100644
--- a/llama_stack/providers/remote/inference/vllm/vllm.py
+++ b/llama_stack/providers/remote/inference/vllm/vllm.py
@@ -77,6 +77,20 @@ class VLLMInferenceAdapter(OpenAIMixin):
     def get_extra_client_params(self):
         return {"http_client": httpx.AsyncClient(verify=self.config.tls_verify)}
 
+    async def check_model_availability(self, model: str) -> bool:
+        """
+        Skip the availability check when an API token is configured, since listing models may trigger an OAuth workflow.
+        """
+        if not self.config.api_token:
+            model_ids = []
+            async for m in self.client.models.list():
+                if m.id == model:  # Found exact match
+                    return True
+                model_ids.append(m.id)
+            raise ValueError(f"Model '{model}' not found. Available models: {model_ids}")
+        log.warning(f"Not checking model availability for {model} as API token may trigger OAuth workflow")
+        return True
+
     async def openai_chat_completion(
         self,
         model: str,
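
For reviewers, below is a minimal, self-contained sketch of the behavior the new method is meant to have: the model list is only consulted when no API token is configured, and the check is skipped otherwise. The `SimpleNamespace` stand-ins, the model name, and the free-function form of `check_model_availability` are illustrative assumptions, not code from this repository.

```python
import asyncio
from types import SimpleNamespace


async def check_model_availability(config, client, model: str) -> bool:
    """Sketch of the patched logic: only query the model list when no API token is set."""
    if not config.api_token:
        model_ids = []
        async for m in client.models.list():
            if m.id == model:  # exact match found
                return True
            model_ids.append(m.id)
        raise ValueError(f"Model '{model}' not found. Available models: {model_ids}")
    # An API token is configured: listing models could trigger an OAuth flow, so skip the check.
    return True


async def fake_model_list():
    # Stand-in for the async iterator returned by client.models.list().
    for model_id in ("meta-llama/Llama-3.1-8B-Instruct",):
        yield SimpleNamespace(id=model_id)


async def main():
    client = SimpleNamespace(models=SimpleNamespace(list=fake_model_list))

    # Without a token the server's model list is consulted.
    no_auth = SimpleNamespace(api_token=None)
    assert await check_model_availability(no_auth, client, "meta-llama/Llama-3.1-8B-Instruct")

    # With a token the check is skipped, regardless of what the server would report.
    with_auth = SimpleNamespace(api_token="secret")
    assert await check_model_availability(with_auth, client, "some-unregistered-model")


asyncio.run(main())
```

In the adapter itself the token and client come from `self.config.api_token` and `self.client`, so with a token set the provider trusts the configured model and startup no longer aborts when `list_provider_model_ids()` fails, as in the log above.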