# What does this PR do?


## Test Plan
# What does this PR do?


## Test Plan
# What does this PR do?


## Test Plan
Completes the refactoring started in previous commit by:

1. **Fix library client** (critical): Add logic to detect Pydantic model parameters
   and construct them properly from request bodies. The key fix is to NOT exclude
   any params when converting the body for Pydantic models - we need all fields
   to pass to the Pydantic constructor.

   Before: _convert_body excluded all params, leaving body empty for Pydantic construction
   After: Check for Pydantic params first, skip exclusion, construct model with full body

2. **Update remaining providers** to use new Pydantic-based signatures:
   - litellm_openai_mixin: Extract extra fields via __pydantic_extra__
   - databricks: Use TYPE_CHECKING import for params type
   - llama_openai_compat: Use TYPE_CHECKING import for params type
   - sentence_transformers: Update method signatures to use params

3. **Update unit tests** to use new Pydantic signature:
   - test_openai_mixin.py: Use OpenAIChatCompletionRequestParams

This fixes test failures where the library client was trying to construct
Pydantic models with empty dictionaries.
The previous fix had a bug: it called _convert_body() which only keeps fields
that match function parameter names. For Pydantic methods with signature:
  openai_chat_completion(params: OpenAIChatCompletionRequestParams)

The signature only has 'params', but the body has 'model', 'messages', etc.
So _convert_body() returned an empty dict.

Fix: Skip _convert_body() entirely for Pydantic params. Use the raw body
directly to construct the Pydantic model (after stripping NOT_GIVENs).

This properly fixes the ValidationError where required fields were missing.
The streaming code path (_call_streaming) had the same issue as non-streaming:
it called _convert_body() which returned empty dict for Pydantic params.

Applied the same fix as commit 7476c0ae:
- Detect Pydantic model parameters before body conversion
- Skip _convert_body() for Pydantic params
- Construct Pydantic model directly from raw body (after stripping NOT_GIVENs)

This fixes streaming endpoints like openai_chat_completion with stream=True.
The streaming code path (_call_streaming) had the same issue as non-streaming:
it called _convert_body() which returned empty dict for Pydantic params.

Applied the same fix as commit 7476c0ae:
- Detect Pydantic model parameters before body conversion
- Skip _convert_body() for Pydantic params
- Construct Pydantic model directly from raw body (after stripping NOT_GIVENs)

This fixes streaming endpoints like openai_chat_completion with stream=True.
This commit is contained in:
Eric Huang 2025-10-09 13:53:31 -07:00
parent 26fd5dbd34
commit a93130e323
295 changed files with 51966 additions and 3051 deletions

View file

@ -8,7 +8,7 @@ import base64
import uuid
from abc import ABC, abstractmethod
from collections.abc import AsyncIterator, Iterable
from typing import Any
from typing import TYPE_CHECKING, Any
from openai import NOT_GIVEN, AsyncOpenAI
from pydantic import BaseModel, ConfigDict
@ -22,8 +22,13 @@ from llama_stack.apis.inference import (
OpenAIEmbeddingsResponse,
OpenAIEmbeddingUsage,
OpenAIMessageParam,
OpenAIResponseFormatParam,
)
if TYPE_CHECKING:
from llama_stack.apis.inference import (
OpenAIChatCompletionRequestParams,
OpenAICompletionRequestParams,
)
from llama_stack.apis.models import ModelType
from llama_stack.core.request_headers import NeedsRequestProviderData
from llama_stack.log import get_logger
@ -227,96 +232,57 @@ class OpenAIMixin(NeedsRequestProviderData, ABC, BaseModel):
async def openai_completion(
self,
model: str,
prompt: str | list[str] | list[int] | list[list[int]],
best_of: int | None = None,
echo: bool | None = None,
frequency_penalty: float | None = None,
logit_bias: dict[str, float] | None = None,
logprobs: bool | None = None,
max_tokens: int | None = None,
n: int | None = None,
presence_penalty: float | None = None,
seed: int | None = None,
stop: str | list[str] | None = None,
stream: bool | None = None,
stream_options: dict[str, Any] | None = None,
temperature: float | None = None,
top_p: float | None = None,
user: str | None = None,
guided_choice: list[str] | None = None,
prompt_logprobs: int | None = None,
suffix: str | None = None,
params: "OpenAICompletionRequestParams",
) -> OpenAICompletion:
"""
Direct OpenAI completion API call.
"""
# Handle parameters that are not supported by OpenAI API, but may be by the provider
# prompt_logprobs is supported by vLLM
# guided_choice is supported by vLLM
# TODO: test coverage
extra_body: dict[str, Any] = {}
if prompt_logprobs is not None and prompt_logprobs >= 0:
extra_body["prompt_logprobs"] = prompt_logprobs
if guided_choice:
extra_body["guided_choice"] = guided_choice
# Extract extra fields using Pydantic's built-in __pydantic_extra__
extra_body = dict(params.__pydantic_extra__ or {})
# Add vLLM-specific parameters to extra_body if they are set
# (these are explicitly defined in the model but still go to extra_body)
if params.prompt_logprobs is not None and params.prompt_logprobs >= 0:
extra_body["prompt_logprobs"] = params.prompt_logprobs
if params.guided_choice:
extra_body["guided_choice"] = params.guided_choice
# TODO: fix openai_completion to return type compatible with OpenAI's API response
resp = await self.client.completions.create(
**await prepare_openai_completion_params(
model=await self._get_provider_model_id(model),
prompt=prompt,
best_of=best_of,
echo=echo,
frequency_penalty=frequency_penalty,
logit_bias=logit_bias,
logprobs=logprobs,
max_tokens=max_tokens,
n=n,
presence_penalty=presence_penalty,
seed=seed,
stop=stop,
stream=stream,
stream_options=stream_options,
temperature=temperature,
top_p=top_p,
user=user,
suffix=suffix,
model=await self._get_provider_model_id(params.model),
prompt=params.prompt,
best_of=params.best_of,
echo=params.echo,
frequency_penalty=params.frequency_penalty,
logit_bias=params.logit_bias,
logprobs=params.logprobs,
max_tokens=params.max_tokens,
n=params.n,
presence_penalty=params.presence_penalty,
seed=params.seed,
stop=params.stop,
stream=params.stream,
stream_options=params.stream_options,
temperature=params.temperature,
top_p=params.top_p,
user=params.user,
suffix=params.suffix,
),
extra_body=extra_body,
extra_body=extra_body if extra_body else None,
)
return await self._maybe_overwrite_id(resp, stream) # type: ignore[no-any-return]
return await self._maybe_overwrite_id(resp, params.stream) # type: ignore[no-any-return]
async def openai_chat_completion(
self,
model: str,
messages: list[OpenAIMessageParam],
frequency_penalty: float | None = None,
function_call: str | dict[str, Any] | None = None,
functions: list[dict[str, Any]] | None = None,
logit_bias: dict[str, float] | None = None,
logprobs: bool | None = None,
max_completion_tokens: int | None = None,
max_tokens: int | None = None,
n: int | None = None,
parallel_tool_calls: bool | None = None,
presence_penalty: float | None = None,
response_format: OpenAIResponseFormatParam | None = None,
seed: int | None = None,
stop: str | list[str] | None = None,
stream: bool | None = None,
stream_options: dict[str, Any] | None = None,
temperature: float | None = None,
tool_choice: str | dict[str, Any] | None = None,
tools: list[dict[str, Any]] | None = None,
top_logprobs: int | None = None,
top_p: float | None = None,
user: str | None = None,
params: "OpenAIChatCompletionRequestParams",
) -> OpenAIChatCompletion | AsyncIterator[OpenAIChatCompletionChunk]:
"""
Direct OpenAI chat completion API call.
"""
messages = params.messages
if self.download_images:
async def _localize_image_url(m: OpenAIMessageParam) -> OpenAIMessageParam:
@ -335,35 +301,40 @@ class OpenAIMixin(NeedsRequestProviderData, ABC, BaseModel):
messages = [await _localize_image_url(m) for m in messages]
params = await prepare_openai_completion_params(
model=await self._get_provider_model_id(model),
request_params = await prepare_openai_completion_params(
model=await self._get_provider_model_id(params.model),
messages=messages,
frequency_penalty=frequency_penalty,
function_call=function_call,
functions=functions,
logit_bias=logit_bias,
logprobs=logprobs,
max_completion_tokens=max_completion_tokens,
max_tokens=max_tokens,
n=n,
parallel_tool_calls=parallel_tool_calls,
presence_penalty=presence_penalty,
response_format=response_format,
seed=seed,
stop=stop,
stream=stream,
stream_options=stream_options,
temperature=temperature,
tool_choice=tool_choice,
tools=tools,
top_logprobs=top_logprobs,
top_p=top_p,
user=user,
frequency_penalty=params.frequency_penalty,
function_call=params.function_call,
functions=params.functions,
logit_bias=params.logit_bias,
logprobs=params.logprobs,
max_completion_tokens=params.max_completion_tokens,
max_tokens=params.max_tokens,
n=params.n,
parallel_tool_calls=params.parallel_tool_calls,
presence_penalty=params.presence_penalty,
response_format=params.response_format,
seed=params.seed,
stop=params.stop,
stream=params.stream,
stream_options=params.stream_options,
temperature=params.temperature,
tool_choice=params.tool_choice,
tools=params.tools,
top_logprobs=params.top_logprobs,
top_p=params.top_p,
user=params.user,
)
resp = await self.client.chat.completions.create(**params)
# Extract any additional provider-specific parameters using Pydantic's __pydantic_extra__
extra_body = dict(params.__pydantic_extra__ or {})
return await self._maybe_overwrite_id(resp, stream) # type: ignore[no-any-return]
resp = await self.client.chat.completions.create(
**request_params, extra_body=extra_body if extra_body else None
)
return await self._maybe_overwrite_id(resp, params.stream) # type: ignore[no-any-return]
async def openai_embeddings(
self,