llama-stack-mirror/llama_stack/providers/remote/inference/vllm/vllm.py
Akram Ben Aissi a548169b99
fix: allow skipping model availability check for vLLM (#3739)
# What does this PR do?
<!-- Provide a short summary of what this PR does and why. Link to
relevant issues if applicable. -->
Allows model check to fail gracefully instead of crashing on startup.


<!-- If resolving an issue, uncomment and update the line below -->
<!-- Closes #[issue-number] -->

## Test Plan
<!-- Describe the tests you ran to verify your changes with result
summaries. *Provide clear instructions so the plan can be easily
re-executed.* -->

set VLLM_URL to your VLLM server 

```
(base) akram@Mac llama-stack % LAMA_STACK_LOGGING="all=debug" VLLM_ENABLE_MODEL_DISCOVERY=false  MILVUS_DB_PATH=./milvus.db INFERENCE_MODEL=vllm uv run --with llama-stack llama stack build --distro starter  --image-type venv --run
```



```

INFO     2025-10-08 20:11:24,637 llama_stack.providers.utils.inference.inference_store:74 inference: Write queue disabled for SQLite to avoid concurrency issues
INFO     2025-10-08 20:11:24,866 llama_stack.providers.utils.responses.responses_store:96 openai_responses: Write queue disabled for SQLite to avoid concurrency issues
ERROR    2025-10-08 20:11:26,160 llama_stack.providers.utils.inference.openai_mixin:439 providers::utils: VLLMInferenceAdapter.list_provider_model_ids() failed with: <a
         href="https://oauth.akram.a1ey.p3.openshiftapps.com:443/oauth/authorize?approval_prompt=force&amp;client_id=system%3Aserviceaccount%3Arhoai-30-genai%3Adefault&amp;redirect_uri=ht
         tps%3A%2F%2Fvllm-rhoai-30-genai.apps.rosa.akram.a1ey.p3.openshiftapps.com%2Foauth%2Fcallback&amp;response_type=code&amp;scope=user%3Ainfo+user%3Acheck-access&amp;state=9fba207425
         5851c718aca717a5887d76%3A%2Fmodels">Found</a>.
         
[...]
INFO     2025-10-08 20:11:26,295 uvicorn.error:84 uncategorized: Started server process [83144]
INFO     2025-10-08 20:11:26,296 uvicorn.error:48 uncategorized: Waiting for application startup.
INFO     2025-10-08 20:11:26,297 llama_stack.core.server.server:170 core::server: Starting up
INFO     2025-10-08 20:11:26,297 llama_stack.core.stack:399 core: starting registry refresh task
INFO     2025-10-08 20:11:26,311 uvicorn.error:62 uncategorized: Application startup complete.
INFO     2025-10-08 20:11:26,312 uvicorn.error:216 uncategorized: Uvicorn running on http://['::', '0.0.0.0']:8321 (Press CTRL+C to quit)
ERROR    2025-10-08 20:11:26,791 llama_stack.providers.utils.inference.openai_mixin:439 providers::utils: VLLMInferenceAdapter.list_provider_model_ids() failed with: <a
         href="https://oauth.akram.a1ey.p3.openshiftapps.com:443/oauth/authorize?approval_prompt=force&amp;client_id=system%3Aserviceaccount%3Arhoai-30-genai%3Adefault&amp;redirect_uri=ht
         tps%3A%2F%2Fvllm-rhoai-30-genai.apps.rosa.akram.a1ey.p3.openshiftapps.com%2Foauth%2Fcallback&amp;response_type=code&amp;scope=user%3Ainfo+user%3Acheck-access&amp;state=8ef0cba3e1
         71a4f8b04cb445cfb91a4c%3A%2Fmodels">Found</a>.

```
2025-10-10 07:23:13 -07:00

153 lines
5.5 KiB
Python

# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from collections.abc import AsyncIterator
from typing import Any
from urllib.parse import urljoin
import httpx
from openai.types.chat.chat_completion_chunk import (
ChatCompletionChunk as OpenAIChatCompletionChunk,
)
from pydantic import ConfigDict
from llama_stack.apis.inference import (
OpenAIChatCompletion,
OpenAIMessageParam,
OpenAIResponseFormatParam,
ToolChoice,
)
from llama_stack.log import get_logger
from llama_stack.providers.datatypes import (
HealthResponse,
HealthStatus,
)
from llama_stack.providers.utils.inference.openai_mixin import OpenAIMixin
from .config import VLLMInferenceAdapterConfig
log = get_logger(name=__name__, category="inference::vllm")
class VLLMInferenceAdapter(OpenAIMixin):
config: VLLMInferenceAdapterConfig
model_config = ConfigDict(arbitrary_types_allowed=True)
provider_data_api_key_field: str = "vllm_api_token"
def get_api_key(self) -> str:
return self.config.api_token or ""
def get_base_url(self) -> str:
"""Get the base URL from config."""
if not self.config.url:
raise ValueError("No base URL configured")
return self.config.url
async def initialize(self) -> None:
if not self.config.url:
raise ValueError(
"You must provide a URL in run.yaml (or via the VLLM_URL environment variable) to use vLLM."
)
async def health(self) -> HealthResponse:
"""
Performs a health check by verifying connectivity to the remote vLLM server.
This method is used by the Provider API to verify
that the service is running correctly.
Uses the unauthenticated /health endpoint.
Returns:
HealthResponse: A dictionary containing the health status.
"""
try:
base_url = self.get_base_url()
health_url = urljoin(base_url, "health")
async with httpx.AsyncClient() as client:
response = await client.get(health_url)
response.raise_for_status()
return HealthResponse(status=HealthStatus.OK)
except Exception as e:
return HealthResponse(status=HealthStatus.ERROR, message=f"Health check failed: {str(e)}")
def get_extra_client_params(self):
return {"http_client": httpx.AsyncClient(verify=self.config.tls_verify)}
async def check_model_availability(self, model: str) -> bool:
"""
Skip the check when running without authentication.
"""
if not self.config.api_token:
model_ids = []
async for m in self.client.models.list():
if m.id == model: # Found exact match
return True
model_ids.append(m.id)
raise ValueError(f"Model '{model}' not found. Available models: {model_ids}")
log.warning(f"Not checking model availability for {model} as API token may trigger OAuth workflow")
return True
async def openai_chat_completion(
self,
model: str,
messages: list[OpenAIMessageParam],
frequency_penalty: float | None = None,
function_call: str | dict[str, Any] | None = None,
functions: list[dict[str, Any]] | None = None,
logit_bias: dict[str, float] | None = None,
logprobs: bool | None = None,
max_completion_tokens: int | None = None,
max_tokens: int | None = None,
n: int | None = None,
parallel_tool_calls: bool | None = None,
presence_penalty: float | None = None,
response_format: OpenAIResponseFormatParam | None = None,
seed: int | None = None,
stop: str | list[str] | None = None,
stream: bool | None = None,
stream_options: dict[str, Any] | None = None,
temperature: float | None = None,
tool_choice: str | dict[str, Any] | None = None,
tools: list[dict[str, Any]] | None = None,
top_logprobs: int | None = None,
top_p: float | None = None,
user: str | None = None,
) -> OpenAIChatCompletion | AsyncIterator[OpenAIChatCompletionChunk]:
max_tokens = max_tokens or self.config.max_tokens
# This is to be consistent with OpenAI API and support vLLM <= v0.6.3
# References:
# * https://platform.openai.com/docs/api-reference/chat/create#chat-create-tool_choice
# * https://github.com/vllm-project/vllm/pull/10000
if not tools and tool_choice is not None:
tool_choice = ToolChoice.none.value
return await super().openai_chat_completion(
model=model,
messages=messages,
frequency_penalty=frequency_penalty,
function_call=function_call,
functions=functions,
logit_bias=logit_bias,
logprobs=logprobs,
max_completion_tokens=max_completion_tokens,
max_tokens=max_tokens,
n=n,
parallel_tool_calls=parallel_tool_calls,
presence_penalty=presence_penalty,
response_format=response_format,
seed=seed,
stop=stop,
stream=stream,
stream_options=stream_options,
temperature=temperature,
tool_choice=tool_choice,
tools=tools,
top_logprobs=top_logprobs,
top_p=top_p,
user=user,
)