From 80d58ab51992eac8479460211fde20216727874f Mon Sep 17 00:00:00 2001 From: ehhuang Date: Fri, 10 Oct 2025 15:46:34 -0700 Subject: [PATCH 1/2] chore: refactor (chat)completions endpoints to use shared params struct (#3761) # What does this PR do? Converts openai(_chat)_completions params to pydantic BaseModel to reduce code duplication across all providers. ## Test Plan CI --- [//]: # (BEGIN SAPLING FOOTER) Stack created with [Sapling](https://sapling-scm.com). Best reviewed with [ReviewStack](https://reviewstack.dev/llamastack/llama-stack/pull/3761). * #3777 * __->__ #3761 --- docs/openapi_generator/pyopenapi/generator.py | 41 ++-- docs/openapi_generator/pyopenapi/utility.py | 13 +- docs/static/deprecated-llama-stack-spec.html | 20 +- docs/static/deprecated-llama-stack-spec.yaml | 22 +- docs/static/llama-stack-spec.html | 20 +- docs/static/llama-stack-spec.yaml | 22 +- docs/static/stainless-llama-stack-spec.html | 20 +- docs/static/stainless-llama-stack-spec.yaml | 22 +- llama_stack/apis/inference/inference.py | 206 ++++++++++-------- llama_stack/core/library_client.py | 21 +- llama_stack/core/routers/inference.py | 150 +++---------- llama_stack/core/server/server.py | 12 +- .../agents/meta_reference/agent_instance.py | 4 +- .../meta_reference/responses/streaming.py | 4 +- .../inline/batches/reference/batches.py | 8 +- .../inline/eval/meta_reference/eval.py | 15 +- .../inference/meta_reference/inference.py | 35 +-- .../sentence_transformers.py | 53 +---- .../inline/safety/llama_guard/llama_guard.py | 22 +- .../scoring_fn/llm_as_judge_scoring_fn.py | 5 +- .../tool_runtime/rag/context_retriever.py | 5 +- .../remote/inference/bedrock/bedrock.py | 53 +---- .../remote/inference/databricks/databricks.py | 24 +- .../inference/llama_openai_compat/llama.py | 25 +-- .../inference/passthrough/passthrough.py | 112 ++-------- .../remote/inference/runpod/runpod.py | 65 +----- .../providers/remote/inference/vllm/vllm.py | 64 +----- .../utils/inference/litellm_openai_mixin.py | 147 +++++-------- .../providers/utils/inference/openai_mixin.py | 152 +++++-------- llama_stack/strong_typing/inspection.py | 36 ++- .../meta_reference/test_openai_responses.py | 68 +++--- .../providers/inference/test_remote_vllm.py | 15 +- .../utils/inference/test_openai_mixin.py | 8 +- 33 files changed, 599 insertions(+), 890 deletions(-) diff --git a/docs/openapi_generator/pyopenapi/generator.py b/docs/openapi_generator/pyopenapi/generator.py index a8d6aaee9..30fc9038d 100644 --- a/docs/openapi_generator/pyopenapi/generator.py +++ b/docs/openapi_generator/pyopenapi/generator.py @@ -23,6 +23,7 @@ from llama_stack.strong_typing.inspection import ( is_generic_list, is_type_optional, is_type_union, + is_unwrapped_body_param, unwrap_generic_list, unwrap_optional_type, unwrap_union_types, @@ -769,24 +770,30 @@ class Generator: first = next(iter(op.request_params)) request_name, request_type = first - op_name = "".join(word.capitalize() for word in op.name.split("_")) - request_name = f"{op_name}Request" - fields = [ - ( - name, - type_, - ) - for name, type_ in op.request_params - ] - request_type = make_dataclass( - request_name, - fields, - namespace={ - "__doc__": create_docstring_for_request( - request_name, fields, doc_params + # Special case: if there's a single parameter with Body(embed=False) that's a BaseModel, + # unwrap it to show the flat structure in the OpenAPI spec + # Example: openai_chat_completion() + if (len(op.request_params) == 1 and is_unwrapped_body_param(request_type)): + pass + else: + op_name = 
"".join(word.capitalize() for word in op.name.split("_")) + request_name = f"{op_name}Request" + fields = [ + ( + name, + type_, ) - }, - ) + for name, type_ in op.request_params + ] + request_type = make_dataclass( + request_name, + fields, + namespace={ + "__doc__": create_docstring_for_request( + request_name, fields, doc_params + ) + }, + ) requestBody = RequestBody( content={ diff --git a/docs/openapi_generator/pyopenapi/utility.py b/docs/openapi_generator/pyopenapi/utility.py index 26ef22112..c1425b250 100644 --- a/docs/openapi_generator/pyopenapi/utility.py +++ b/docs/openapi_generator/pyopenapi/utility.py @@ -8,10 +8,11 @@ import json import typing import inspect from pathlib import Path -from typing import TextIO -from typing import Any, List, Optional, Union, get_type_hints, get_origin, get_args +from typing import Any, List, Optional, TextIO, Union, get_type_hints, get_origin, get_args +from pydantic import BaseModel from llama_stack.strong_typing.schema import object_to_json, StrictJsonType +from llama_stack.strong_typing.inspection import is_unwrapped_body_param from llama_stack.core.resolver import api_protocol_map from .generator import Generator @@ -205,6 +206,14 @@ def _validate_has_return_in_docstring(method) -> str | None: def _validate_has_params_in_docstring(method) -> str | None: source = inspect.getsource(method) sig = inspect.signature(method) + + params_list = [p for p in sig.parameters.values() if p.name != "self"] + if len(params_list) == 1: + param = params_list[0] + param_type = param.annotation + if is_unwrapped_body_param(param_type): + return + # Only check if the method has more than one parameter if len(sig.parameters) > 1 and ":param" not in source: return "does not have a ':param' in its docstring" diff --git a/docs/static/deprecated-llama-stack-spec.html b/docs/static/deprecated-llama-stack-spec.html index 0ea2e8c43..f9bcb48f7 100644 --- a/docs/static/deprecated-llama-stack-spec.html +++ b/docs/static/deprecated-llama-stack-spec.html @@ -1527,7 +1527,7 @@ "content": { "application/json": { "schema": { - "$ref": "#/components/schemas/OpenaiChatCompletionRequest" + "$ref": "#/components/schemas/OpenAIChatCompletionRequest" } } }, @@ -1617,7 +1617,7 @@ "content": { "application/json": { "schema": { - "$ref": "#/components/schemas/OpenaiCompletionRequest" + "$ref": "#/components/schemas/OpenAICompletionRequest" } } }, @@ -7522,7 +7522,7 @@ "title": "OpenAIResponseFormatText", "description": "Text response format for OpenAI-compatible chat completion requests." }, - "OpenaiChatCompletionRequest": { + "OpenAIChatCompletionRequest": { "type": "object", "properties": { "model": { @@ -7769,7 +7769,8 @@ "model", "messages" ], - "title": "OpenaiChatCompletionRequest" + "title": "OpenAIChatCompletionRequest", + "description": "Request parameters for OpenAI-compatible chat completion endpoint." }, "OpenAIChatCompletion": { "type": "object", @@ -7965,7 +7966,7 @@ ], "title": "OpenAICompletionWithInputMessages" }, - "OpenaiCompletionRequest": { + "OpenAICompletionRequest": { "type": "object", "properties": { "model": { @@ -8100,10 +8101,12 @@ "type": "array", "items": { "type": "string" - } + }, + "description": "(Optional) vLLM-specific parameter for guided generation with a list of choices." }, "prompt_logprobs": { - "type": "integer" + "type": "integer", + "description": "(Optional) vLLM-specific parameter for number of log probabilities to return for prompt tokens." 
}, "suffix": { "type": "string", @@ -8115,7 +8118,8 @@ "model", "prompt" ], - "title": "OpenaiCompletionRequest" + "title": "OpenAICompletionRequest", + "description": "Request parameters for OpenAI-compatible completion endpoint." }, "OpenAICompletion": { "type": "object", diff --git a/docs/static/deprecated-llama-stack-spec.yaml b/docs/static/deprecated-llama-stack-spec.yaml index 008cd8673..552555f7a 100644 --- a/docs/static/deprecated-llama-stack-spec.yaml +++ b/docs/static/deprecated-llama-stack-spec.yaml @@ -1098,7 +1098,7 @@ paths: content: application/json: schema: - $ref: '#/components/schemas/OpenaiChatCompletionRequest' + $ref: '#/components/schemas/OpenAIChatCompletionRequest' required: true deprecated: true /v1/openai/v1/chat/completions/{completion_id}: @@ -1167,7 +1167,7 @@ paths: content: application/json: schema: - $ref: '#/components/schemas/OpenaiCompletionRequest' + $ref: '#/components/schemas/OpenAICompletionRequest' required: true deprecated: true /v1/openai/v1/embeddings: @@ -5575,7 +5575,7 @@ components: title: OpenAIResponseFormatText description: >- Text response format for OpenAI-compatible chat completion requests. - OpenaiChatCompletionRequest: + OpenAIChatCompletionRequest: type: object properties: model: @@ -5717,7 +5717,9 @@ components: required: - model - messages - title: OpenaiChatCompletionRequest + title: OpenAIChatCompletionRequest + description: >- + Request parameters for OpenAI-compatible chat completion endpoint. OpenAIChatCompletion: type: object properties: @@ -5883,7 +5885,7 @@ components: - model - input_messages title: OpenAICompletionWithInputMessages - OpenaiCompletionRequest: + OpenAICompletionRequest: type: object properties: model: @@ -5975,8 +5977,14 @@ components: type: array items: type: string + description: >- + (Optional) vLLM-specific parameter for guided generation with a list of + choices. prompt_logprobs: type: integer + description: >- + (Optional) vLLM-specific parameter for number of log probabilities to + return for prompt tokens. suffix: type: string description: >- @@ -5985,7 +5993,9 @@ components: required: - model - prompt - title: OpenaiCompletionRequest + title: OpenAICompletionRequest + description: >- + Request parameters for OpenAI-compatible completion endpoint. OpenAICompletion: type: object properties: diff --git a/docs/static/llama-stack-spec.html b/docs/static/llama-stack-spec.html index 7e534f995..8f8ff66c9 100644 --- a/docs/static/llama-stack-spec.html +++ b/docs/static/llama-stack-spec.html @@ -153,7 +153,7 @@ "content": { "application/json": { "schema": { - "$ref": "#/components/schemas/OpenaiChatCompletionRequest" + "$ref": "#/components/schemas/OpenAIChatCompletionRequest" } } }, @@ -243,7 +243,7 @@ "content": { "application/json": { "schema": { - "$ref": "#/components/schemas/OpenaiCompletionRequest" + "$ref": "#/components/schemas/OpenAICompletionRequest" } } }, @@ -5018,7 +5018,7 @@ "title": "OpenAIResponseFormatText", "description": "Text response format for OpenAI-compatible chat completion requests." }, - "OpenaiChatCompletionRequest": { + "OpenAIChatCompletionRequest": { "type": "object", "properties": { "model": { @@ -5265,7 +5265,8 @@ "model", "messages" ], - "title": "OpenaiChatCompletionRequest" + "title": "OpenAIChatCompletionRequest", + "description": "Request parameters for OpenAI-compatible chat completion endpoint." 
}, "OpenAIChatCompletion": { "type": "object", @@ -5461,7 +5462,7 @@ ], "title": "OpenAICompletionWithInputMessages" }, - "OpenaiCompletionRequest": { + "OpenAICompletionRequest": { "type": "object", "properties": { "model": { @@ -5596,10 +5597,12 @@ "type": "array", "items": { "type": "string" - } + }, + "description": "(Optional) vLLM-specific parameter for guided generation with a list of choices." }, "prompt_logprobs": { - "type": "integer" + "type": "integer", + "description": "(Optional) vLLM-specific parameter for number of log probabilities to return for prompt tokens." }, "suffix": { "type": "string", @@ -5611,7 +5614,8 @@ "model", "prompt" ], - "title": "OpenaiCompletionRequest" + "title": "OpenAICompletionRequest", + "description": "Request parameters for OpenAI-compatible completion endpoint." }, "OpenAICompletion": { "type": "object", diff --git a/docs/static/llama-stack-spec.yaml b/docs/static/llama-stack-spec.yaml index bad40c87d..97742f19a 100644 --- a/docs/static/llama-stack-spec.yaml +++ b/docs/static/llama-stack-spec.yaml @@ -98,7 +98,7 @@ paths: content: application/json: schema: - $ref: '#/components/schemas/OpenaiChatCompletionRequest' + $ref: '#/components/schemas/OpenAIChatCompletionRequest' required: true deprecated: false /v1/chat/completions/{completion_id}: @@ -167,7 +167,7 @@ paths: content: application/json: schema: - $ref: '#/components/schemas/OpenaiCompletionRequest' + $ref: '#/components/schemas/OpenAICompletionRequest' required: true deprecated: false /v1/conversations: @@ -3824,7 +3824,7 @@ components: title: OpenAIResponseFormatText description: >- Text response format for OpenAI-compatible chat completion requests. - OpenaiChatCompletionRequest: + OpenAIChatCompletionRequest: type: object properties: model: @@ -3966,7 +3966,9 @@ components: required: - model - messages - title: OpenaiChatCompletionRequest + title: OpenAIChatCompletionRequest + description: >- + Request parameters for OpenAI-compatible chat completion endpoint. OpenAIChatCompletion: type: object properties: @@ -4132,7 +4134,7 @@ components: - model - input_messages title: OpenAICompletionWithInputMessages - OpenaiCompletionRequest: + OpenAICompletionRequest: type: object properties: model: @@ -4224,8 +4226,14 @@ components: type: array items: type: string + description: >- + (Optional) vLLM-specific parameter for guided generation with a list of + choices. prompt_logprobs: type: integer + description: >- + (Optional) vLLM-specific parameter for number of log probabilities to + return for prompt tokens. suffix: type: string description: >- @@ -4234,7 +4242,9 @@ components: required: - model - prompt - title: OpenaiCompletionRequest + title: OpenAICompletionRequest + description: >- + Request parameters for OpenAI-compatible completion endpoint. 
OpenAICompletion: type: object properties: diff --git a/docs/static/stainless-llama-stack-spec.html b/docs/static/stainless-llama-stack-spec.html index 36c63367c..fcdcd76c5 100644 --- a/docs/static/stainless-llama-stack-spec.html +++ b/docs/static/stainless-llama-stack-spec.html @@ -153,7 +153,7 @@ "content": { "application/json": { "schema": { - "$ref": "#/components/schemas/OpenaiChatCompletionRequest" + "$ref": "#/components/schemas/OpenAIChatCompletionRequest" } } }, @@ -243,7 +243,7 @@ "content": { "application/json": { "schema": { - "$ref": "#/components/schemas/OpenaiCompletionRequest" + "$ref": "#/components/schemas/OpenAICompletionRequest" } } }, @@ -7027,7 +7027,7 @@ "title": "OpenAIResponseFormatText", "description": "Text response format for OpenAI-compatible chat completion requests." }, - "OpenaiChatCompletionRequest": { + "OpenAIChatCompletionRequest": { "type": "object", "properties": { "model": { @@ -7274,7 +7274,8 @@ "model", "messages" ], - "title": "OpenaiChatCompletionRequest" + "title": "OpenAIChatCompletionRequest", + "description": "Request parameters for OpenAI-compatible chat completion endpoint." }, "OpenAIChatCompletion": { "type": "object", @@ -7470,7 +7471,7 @@ ], "title": "OpenAICompletionWithInputMessages" }, - "OpenaiCompletionRequest": { + "OpenAICompletionRequest": { "type": "object", "properties": { "model": { @@ -7605,10 +7606,12 @@ "type": "array", "items": { "type": "string" - } + }, + "description": "(Optional) vLLM-specific parameter for guided generation with a list of choices." }, "prompt_logprobs": { - "type": "integer" + "type": "integer", + "description": "(Optional) vLLM-specific parameter for number of log probabilities to return for prompt tokens." }, "suffix": { "type": "string", @@ -7620,7 +7623,8 @@ "model", "prompt" ], - "title": "OpenaiCompletionRequest" + "title": "OpenAICompletionRequest", + "description": "Request parameters for OpenAI-compatible completion endpoint." }, "OpenAICompletion": { "type": "object", diff --git a/docs/static/stainless-llama-stack-spec.yaml b/docs/static/stainless-llama-stack-spec.yaml index 4475cc8f0..09fc3ded4 100644 --- a/docs/static/stainless-llama-stack-spec.yaml +++ b/docs/static/stainless-llama-stack-spec.yaml @@ -101,7 +101,7 @@ paths: content: application/json: schema: - $ref: '#/components/schemas/OpenaiChatCompletionRequest' + $ref: '#/components/schemas/OpenAIChatCompletionRequest' required: true deprecated: false /v1/chat/completions/{completion_id}: @@ -170,7 +170,7 @@ paths: content: application/json: schema: - $ref: '#/components/schemas/OpenaiCompletionRequest' + $ref: '#/components/schemas/OpenAICompletionRequest' required: true deprecated: false /v1/conversations: @@ -5269,7 +5269,7 @@ components: title: OpenAIResponseFormatText description: >- Text response format for OpenAI-compatible chat completion requests. - OpenaiChatCompletionRequest: + OpenAIChatCompletionRequest: type: object properties: model: @@ -5411,7 +5411,9 @@ components: required: - model - messages - title: OpenaiChatCompletionRequest + title: OpenAIChatCompletionRequest + description: >- + Request parameters for OpenAI-compatible chat completion endpoint. 
OpenAIChatCompletion: type: object properties: @@ -5577,7 +5579,7 @@ components: - model - input_messages title: OpenAICompletionWithInputMessages - OpenaiCompletionRequest: + OpenAICompletionRequest: type: object properties: model: @@ -5669,8 +5671,14 @@ components: type: array items: type: string + description: >- + (Optional) vLLM-specific parameter for guided generation with a list of + choices. prompt_logprobs: type: integer + description: >- + (Optional) vLLM-specific parameter for number of log probabilities to + return for prompt tokens. suffix: type: string description: >- @@ -5679,7 +5687,9 @@ components: required: - model - prompt - title: OpenaiCompletionRequest + title: OpenAICompletionRequest + description: >- + Request parameters for OpenAI-compatible completion endpoint. OpenAICompletion: type: object properties: diff --git a/llama_stack/apis/inference/inference.py b/llama_stack/apis/inference/inference.py index 375ddb231..fb3e78afc 100644 --- a/llama_stack/apis/inference/inference.py +++ b/llama_stack/apis/inference/inference.py @@ -14,7 +14,8 @@ from typing import ( runtime_checkable, ) -from pydantic import BaseModel, Field, field_validator +from fastapi import Body +from pydantic import BaseModel, ConfigDict, Field, field_validator from typing_extensions import TypedDict from llama_stack.apis.common.content_types import ContentDelta, InterleavedContent @@ -1035,6 +1036,118 @@ class ListOpenAIChatCompletionResponse(BaseModel): object: Literal["list"] = "list" +@json_schema_type +class OpenAICompletionRequest(BaseModel): + """Request parameters for OpenAI-compatible completion endpoint. + + :param model: The identifier of the model to use. The model must be registered with Llama Stack and available via the /models endpoint. + :param prompt: The prompt to generate a completion for. + :param best_of: (Optional) The number of completions to generate. + :param echo: (Optional) Whether to echo the prompt. + :param frequency_penalty: (Optional) The penalty for repeated tokens. + :param logit_bias: (Optional) The logit bias to use. + :param logprobs: (Optional) The log probabilities to use. + :param max_tokens: (Optional) The maximum number of tokens to generate. + :param n: (Optional) The number of completions to generate. + :param presence_penalty: (Optional) The penalty for repeated tokens. + :param seed: (Optional) The seed to use. + :param stop: (Optional) The stop tokens to use. + :param stream: (Optional) Whether to stream the response. + :param stream_options: (Optional) The stream options to use. + :param temperature: (Optional) The temperature to use. + :param top_p: (Optional) The top p to use. + :param user: (Optional) The user to use. + :param suffix: (Optional) The suffix that should be appended to the completion. + :param guided_choice: (Optional) vLLM-specific parameter for guided generation with a list of choices. + :param prompt_logprobs: (Optional) vLLM-specific parameter for number of log probabilities to return for prompt tokens. 
+ """ + + model_config = ConfigDict(extra="allow") + + # Standard OpenAI completion parameters + model: str + prompt: str | list[str] | list[int] | list[list[int]] + best_of: int | None = None + echo: bool | None = None + frequency_penalty: float | None = None + logit_bias: dict[str, float] | None = None + logprobs: bool | None = None + max_tokens: int | None = None + n: int | None = None + presence_penalty: float | None = None + seed: int | None = None + stop: str | list[str] | None = None + stream: bool | None = None + stream_options: dict[str, Any] | None = None + temperature: float | None = None + top_p: float | None = None + user: str | None = None + + # vLLM-specific parameters (documented here but also allowed via extra fields) + guided_choice: list[str] | None = None + prompt_logprobs: int | None = None + + # for fill-in-the-middle type completion + suffix: str | None = None + + +@json_schema_type +class OpenAIChatCompletionRequest(BaseModel): + """Request parameters for OpenAI-compatible chat completion endpoint. + + :param model: The identifier of the model to use. The model must be registered with Llama Stack and available via the /models endpoint. + :param messages: List of messages in the conversation. + :param frequency_penalty: (Optional) The penalty for repeated tokens. + :param function_call: (Optional) The function call to use. + :param functions: (Optional) List of functions to use. + :param logit_bias: (Optional) The logit bias to use. + :param logprobs: (Optional) The log probabilities to use. + :param max_completion_tokens: (Optional) The maximum number of tokens to generate. + :param max_tokens: (Optional) The maximum number of tokens to generate. + :param n: (Optional) The number of completions to generate. + :param parallel_tool_calls: (Optional) Whether to parallelize tool calls. + :param presence_penalty: (Optional) The penalty for repeated tokens. + :param response_format: (Optional) The response format to use. + :param seed: (Optional) The seed to use. + :param stop: (Optional) The stop tokens to use. + :param stream: (Optional) Whether to stream the response. + :param stream_options: (Optional) The stream options to use. + :param temperature: (Optional) The temperature to use. + :param tool_choice: (Optional) The tool choice to use. + :param tools: (Optional) The tools to use. + :param top_logprobs: (Optional) The top log probabilities to use. + :param top_p: (Optional) The top p to use. + :param user: (Optional) The user to use. 
+ """ + + model_config = ConfigDict(extra="allow") + + # Standard OpenAI chat completion parameters + model: str + messages: Annotated[list[OpenAIMessageParam], Field(..., min_length=1)] + frequency_penalty: float | None = None + function_call: str | dict[str, Any] | None = None + functions: list[dict[str, Any]] | None = None + logit_bias: dict[str, float] | None = None + logprobs: bool | None = None + max_completion_tokens: int | None = None + max_tokens: int | None = None + n: int | None = None + parallel_tool_calls: bool | None = None + presence_penalty: float | None = None + response_format: OpenAIResponseFormatParam | None = None + seed: int | None = None + stop: str | list[str] | None = None + stream: bool | None = None + stream_options: dict[str, Any] | None = None + temperature: float | None = None + tool_choice: str | dict[str, Any] | None = None + tools: list[dict[str, Any]] | None = None + top_logprobs: int | None = None + top_p: float | None = None + user: str | None = None + + @runtime_checkable @trace_protocol class InferenceProvider(Protocol): @@ -1069,52 +1182,11 @@ class InferenceProvider(Protocol): @webmethod(route="/completions", method="POST", level=LLAMA_STACK_API_V1) async def openai_completion( self, - # Standard OpenAI completion parameters - model: str, - prompt: str | list[str] | list[int] | list[list[int]], - best_of: int | None = None, - echo: bool | None = None, - frequency_penalty: float | None = None, - logit_bias: dict[str, float] | None = None, - logprobs: bool | None = None, - max_tokens: int | None = None, - n: int | None = None, - presence_penalty: float | None = None, - seed: int | None = None, - stop: str | list[str] | None = None, - stream: bool | None = None, - stream_options: dict[str, Any] | None = None, - temperature: float | None = None, - top_p: float | None = None, - user: str | None = None, - # vLLM-specific parameters - guided_choice: list[str] | None = None, - prompt_logprobs: int | None = None, - # for fill-in-the-middle type completion - suffix: str | None = None, + params: Annotated[OpenAICompletionRequest, Body(...)], ) -> OpenAICompletion: """Create completion. Generate an OpenAI-compatible completion for the given prompt using the specified model. - - :param model: The identifier of the model to use. The model must be registered with Llama Stack and available via the /models endpoint. - :param prompt: The prompt to generate a completion for. - :param best_of: (Optional) The number of completions to generate. - :param echo: (Optional) Whether to echo the prompt. - :param frequency_penalty: (Optional) The penalty for repeated tokens. - :param logit_bias: (Optional) The logit bias to use. - :param logprobs: (Optional) The log probabilities to use. - :param max_tokens: (Optional) The maximum number of tokens to generate. - :param n: (Optional) The number of completions to generate. - :param presence_penalty: (Optional) The penalty for repeated tokens. - :param seed: (Optional) The seed to use. - :param stop: (Optional) The stop tokens to use. - :param stream: (Optional) Whether to stream the response. - :param stream_options: (Optional) The stream options to use. - :param temperature: (Optional) The temperature to use. - :param top_p: (Optional) The top p to use. - :param user: (Optional) The user to use. - :param suffix: (Optional) The suffix that should be appended to the completion. :returns: An OpenAICompletion. """ ... 
@@ -1123,57 +1195,11 @@ class InferenceProvider(Protocol): @webmethod(route="/chat/completions", method="POST", level=LLAMA_STACK_API_V1) async def openai_chat_completion( self, - model: str, - messages: list[OpenAIMessageParam], - frequency_penalty: float | None = None, - function_call: str | dict[str, Any] | None = None, - functions: list[dict[str, Any]] | None = None, - logit_bias: dict[str, float] | None = None, - logprobs: bool | None = None, - max_completion_tokens: int | None = None, - max_tokens: int | None = None, - n: int | None = None, - parallel_tool_calls: bool | None = None, - presence_penalty: float | None = None, - response_format: OpenAIResponseFormatParam | None = None, - seed: int | None = None, - stop: str | list[str] | None = None, - stream: bool | None = None, - stream_options: dict[str, Any] | None = None, - temperature: float | None = None, - tool_choice: str | dict[str, Any] | None = None, - tools: list[dict[str, Any]] | None = None, - top_logprobs: int | None = None, - top_p: float | None = None, - user: str | None = None, + params: Annotated[OpenAIChatCompletionRequest, Body(...)], ) -> OpenAIChatCompletion | AsyncIterator[OpenAIChatCompletionChunk]: """Create chat completions. Generate an OpenAI-compatible chat completion for the given messages using the specified model. - - :param model: The identifier of the model to use. The model must be registered with Llama Stack and available via the /models endpoint. - :param messages: List of messages in the conversation. - :param frequency_penalty: (Optional) The penalty for repeated tokens. - :param function_call: (Optional) The function call to use. - :param functions: (Optional) List of functions to use. - :param logit_bias: (Optional) The logit bias to use. - :param logprobs: (Optional) The log probabilities to use. - :param max_completion_tokens: (Optional) The maximum number of tokens to generate. - :param max_tokens: (Optional) The maximum number of tokens to generate. - :param n: (Optional) The number of completions to generate. - :param parallel_tool_calls: (Optional) Whether to parallelize tool calls. - :param presence_penalty: (Optional) The penalty for repeated tokens. - :param response_format: (Optional) The response format to use. - :param seed: (Optional) The seed to use. - :param stop: (Optional) The stop tokens to use. - :param stream: (Optional) Whether to stream the response. - :param stream_options: (Optional) The stream options to use. - :param temperature: (Optional) The temperature to use. - :param tool_choice: (Optional) The tool choice to use. - :param tools: (Optional) The tools to use. - :param top_logprobs: (Optional) The top log probabilities to use. - :param top_p: (Optional) The top p to use. - :param user: (Optional) The user to use. :returns: An OpenAIChatCompletion. """ ... 
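A second hedged sketch, for chat completions, illustrating why the struct declares `model_config = ConfigDict(extra="allow")`: fields not declared on the model (for example provider-specific knobs) still ride along with the request, and the struct can be built directly from a flat body dict as the library client and batches changes below do. The extra field name used here is hypothetical.

```python
# Hedged sketch: constructing the new chat-completion params struct.
# `chat_template_kwargs` is a hypothetical provider-specific extra; it is
# retained only because the model declares ConfigDict(extra="allow").
from llama_stack.apis.inference import (
    OpenAIChatCompletionRequest,
    OpenAIUserMessageParam,
)

params = OpenAIChatCompletionRequest(
    model="my-registered-model",                      # placeholder identifier
    messages=[OpenAIUserMessageParam(content="Hello!")],
    stream=False,
    chat_template_kwargs={"enable_thinking": False},  # hypothetical extra field
)

# Declared fields are regular attributes; undeclared extras survive serialization.
assert params.stream is False
assert "chat_template_kwargs" in params.model_dump()
```

Because the endpoint takes this single BaseModel as its only body parameter, the OpenAPI generator unwraps it, so the wire format stays flat (top-level `model`, `messages`, ...) rather than being nested under a `params` key.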
diff --git a/llama_stack/core/library_client.py b/llama_stack/core/library_client.py index 0d9f9f134..5d45bd8ad 100644 --- a/llama_stack/core/library_client.py +++ b/llama_stack/core/library_client.py @@ -54,6 +54,7 @@ from llama_stack.providers.utils.telemetry.tracing import ( setup_logger, start_trace, ) +from llama_stack.strong_typing.inspection import is_unwrapped_body_param logger = get_logger(name=__name__, category="core") @@ -383,7 +384,7 @@ class AsyncLlamaStackAsLibraryClient(AsyncLlamaStackClient): body, field_names = self._handle_file_uploads(options, body) - body = self._convert_body(path, options.method, body, exclude_params=set(field_names)) + body = self._convert_body(matched_func, body, exclude_params=set(field_names)) trace_path = webmethod.descriptive_name or route_path await start_trace(trace_path, {"__location__": "library_client"}) @@ -446,7 +447,8 @@ class AsyncLlamaStackAsLibraryClient(AsyncLlamaStackClient): func, path_params, route_path, webmethod = find_matching_route(options.method, path, self.route_impls) body |= path_params - body = self._convert_body(path, options.method, body) + # Prepare body for the function call (handles both Pydantic and traditional params) + body = self._convert_body(func, body) trace_path = webmethod.descriptive_name or route_path await start_trace(trace_path, {"__location__": "library_client"}) @@ -493,17 +495,20 @@ class AsyncLlamaStackAsLibraryClient(AsyncLlamaStackClient): ) return await response.parse() - def _convert_body( - self, path: str, method: str, body: dict | None = None, exclude_params: set[str] | None = None - ) -> dict: + def _convert_body(self, func: Any, body: dict | None = None, exclude_params: set[str] | None = None) -> dict: if not body: return {} - assert self.route_impls is not None # Should be guaranteed by request() method, assertion for mypy exclude_params = exclude_params or set() - - func, _, _, _ = find_matching_route(method, path, self.route_impls) sig = inspect.signature(func) + params_list = [p for p in sig.parameters.values() if p.name != "self"] + # Flatten if there's a single unwrapped body parameter (BaseModel or Annotated[BaseModel, Body(embed=False)]) + if len(params_list) == 1: + param = params_list[0] + param_type = param.annotation + if is_unwrapped_body_param(param_type): + base_type = get_args(param_type)[0] + return {param.name: base_type(**body)} # Strip NOT_GIVENs to use the defaults in signature body = {k: v for k, v in body.items() if v is not NOT_GIVEN} diff --git a/llama_stack/core/routers/inference.py b/llama_stack/core/routers/inference.py index 847f6a2d2..5c7532e70 100644 --- a/llama_stack/core/routers/inference.py +++ b/llama_stack/core/routers/inference.py @@ -10,9 +10,10 @@ from collections.abc import AsyncGenerator, AsyncIterator from datetime import UTC, datetime from typing import Annotated, Any +from fastapi import Body from openai.types.chat import ChatCompletionToolChoiceOptionParam as OpenAIChatCompletionToolChoiceOptionParam from openai.types.chat import ChatCompletionToolParam as OpenAIChatCompletionToolParam -from pydantic import Field, TypeAdapter +from pydantic import TypeAdapter from llama_stack.apis.common.content_types import ( InterleavedContent, @@ -31,15 +32,16 @@ from llama_stack.apis.inference import ( OpenAIAssistantMessageParam, OpenAIChatCompletion, OpenAIChatCompletionChunk, + OpenAIChatCompletionRequest, OpenAIChatCompletionToolCall, OpenAIChatCompletionToolCallFunction, OpenAIChoice, OpenAIChoiceLogprobs, OpenAICompletion, + OpenAICompletionRequest, 
OpenAICompletionWithInputMessages, OpenAIEmbeddingsResponse, OpenAIMessageParam, - OpenAIResponseFormatParam, Order, StopReason, ToolPromptFormat, @@ -181,61 +183,23 @@ class InferenceRouter(Inference): async def openai_completion( self, - model: str, - prompt: str | list[str] | list[int] | list[list[int]], - best_of: int | None = None, - echo: bool | None = None, - frequency_penalty: float | None = None, - logit_bias: dict[str, float] | None = None, - logprobs: bool | None = None, - max_tokens: int | None = None, - n: int | None = None, - presence_penalty: float | None = None, - seed: int | None = None, - stop: str | list[str] | None = None, - stream: bool | None = None, - stream_options: dict[str, Any] | None = None, - temperature: float | None = None, - top_p: float | None = None, - user: str | None = None, - guided_choice: list[str] | None = None, - prompt_logprobs: int | None = None, - suffix: str | None = None, + params: Annotated[OpenAICompletionRequest, Body(...)], ) -> OpenAICompletion: logger.debug( - f"InferenceRouter.openai_completion: {model=}, {stream=}, {prompt=}", - ) - model_obj = await self._get_model(model, ModelType.llm) - params = dict( - model=model_obj.identifier, - prompt=prompt, - best_of=best_of, - echo=echo, - frequency_penalty=frequency_penalty, - logit_bias=logit_bias, - logprobs=logprobs, - max_tokens=max_tokens, - n=n, - presence_penalty=presence_penalty, - seed=seed, - stop=stop, - stream=stream, - stream_options=stream_options, - temperature=temperature, - top_p=top_p, - user=user, - guided_choice=guided_choice, - prompt_logprobs=prompt_logprobs, - suffix=suffix, + f"InferenceRouter.openai_completion: model={params.model}, stream={params.stream}, prompt={params.prompt}", ) + model_obj = await self._get_model(params.model, ModelType.llm) + + # Update params with the resolved model identifier + params.model = model_obj.identifier + provider = await self.routing_table.get_provider_impl(model_obj.identifier) - if stream: - return await provider.openai_completion(**params) + if params.stream: + return await provider.openai_completion(params) # TODO: Metrics do NOT work with openai_completion stream=True due to the fact # that we do not return an AsyncIterator, our tests expect a stream of chunks we cannot intercept currently. 
- # response_stream = await provider.openai_completion(**params) - response = await provider.openai_completion(**params) + response = await provider.openai_completion(params) if self.telemetry: metrics = self._construct_metrics( prompt_tokens=response.usage.prompt_tokens, @@ -254,93 +218,49 @@ class InferenceRouter(Inference): async def openai_chat_completion( self, - model: str, - messages: Annotated[list[OpenAIMessageParam], Field(..., min_length=1)], - frequency_penalty: float | None = None, - function_call: str | dict[str, Any] | None = None, - functions: list[dict[str, Any]] | None = None, - logit_bias: dict[str, float] | None = None, - logprobs: bool | None = None, - max_completion_tokens: int | None = None, - max_tokens: int | None = None, - n: int | None = None, - parallel_tool_calls: bool | None = None, - presence_penalty: float | None = None, - response_format: OpenAIResponseFormatParam | None = None, - seed: int | None = None, - stop: str | list[str] | None = None, - stream: bool | None = None, - stream_options: dict[str, Any] | None = None, - temperature: float | None = None, - tool_choice: str | dict[str, Any] | None = None, - tools: list[dict[str, Any]] | None = None, - top_logprobs: int | None = None, - top_p: float | None = None, - user: str | None = None, + params: Annotated[OpenAIChatCompletionRequest, Body(...)], ) -> OpenAIChatCompletion | AsyncIterator[OpenAIChatCompletionChunk]: logger.debug( - f"InferenceRouter.openai_chat_completion: {model=}, {stream=}, {messages=}", + f"InferenceRouter.openai_chat_completion: model={params.model}, stream={params.stream}, messages={params.messages}", ) - model_obj = await self._get_model(model, ModelType.llm) + model_obj = await self._get_model(params.model, ModelType.llm) # Use the OpenAI client for a bit of extra input validation without # exposing the OpenAI client itself as part of our API surface - if tool_choice: - TypeAdapter(OpenAIChatCompletionToolChoiceOptionParam).validate_python(tool_choice) - if tools is None: + if params.tool_choice: + TypeAdapter(OpenAIChatCompletionToolChoiceOptionParam).validate_python(params.tool_choice) + if params.tools is None: raise ValueError("'tool_choice' is only allowed when 'tools' is also provided") - if tools: - for tool in tools: + if params.tools: + for tool in params.tools: TypeAdapter(OpenAIChatCompletionToolParam).validate_python(tool) # Some providers make tool calls even when tool_choice is "none" # so just clear them both out to avoid unexpected tool calls - if tool_choice == "none" and tools is not None: - tool_choice = None - tools = None + if params.tool_choice == "none" and params.tools is not None: + params.tool_choice = None + params.tools = None + + # Update params with the resolved model identifier + params.model = model_obj.identifier - params = dict( - model=model_obj.identifier, - messages=messages, - frequency_penalty=frequency_penalty, - function_call=function_call, - functions=functions, - logit_bias=logit_bias, - logprobs=logprobs, - max_completion_tokens=max_completion_tokens, - max_tokens=max_tokens, - n=n, - parallel_tool_calls=parallel_tool_calls, - presence_penalty=presence_penalty, - response_format=response_format, - seed=seed, - stop=stop, - stream=stream, - stream_options=stream_options, - temperature=temperature, - tool_choice=tool_choice, - tools=tools, - top_logprobs=top_logprobs, - top_p=top_p, - user=user, - ) provider = await self.routing_table.get_provider_impl(model_obj.identifier) - if stream: - response_stream = await 
provider.openai_chat_completion(**params) + if params.stream: + response_stream = await provider.openai_chat_completion(params) # For streaming, the provider returns AsyncIterator[OpenAIChatCompletionChunk] # We need to add metrics to each chunk and store the final completion return self.stream_tokens_and_compute_metrics_openai_chat( response=response_stream, model=model_obj, - messages=messages, + messages=params.messages, ) response = await self._nonstream_openai_chat_completion(provider, params) # Store the response with the ID that will be returned to the client if self.store: - asyncio.create_task(self.store.store_chat_completion(response, messages)) + asyncio.create_task(self.store.store_chat_completion(response, params.messages)) if self.telemetry: metrics = self._construct_metrics( @@ -396,8 +316,10 @@ class InferenceRouter(Inference): return await self.store.get_chat_completion(completion_id) raise NotImplementedError("Get chat completion is not supported: inference store is not configured.") - async def _nonstream_openai_chat_completion(self, provider: Inference, params: dict) -> OpenAIChatCompletion: - response = await provider.openai_chat_completion(**params) + async def _nonstream_openai_chat_completion( + self, provider: Inference, params: OpenAIChatCompletionRequest + ) -> OpenAIChatCompletion: + response = await provider.openai_chat_completion(params) for choice in response.choices: # some providers return an empty list for no tool calls in non-streaming responses # but the OpenAI API returns None. So, set tool_calls to None if it's empty diff --git a/llama_stack/core/server/server.py b/llama_stack/core/server/server.py index edc114381..a8b855f4d 100644 --- a/llama_stack/core/server/server.py +++ b/llama_stack/core/server/server.py @@ -184,7 +184,17 @@ async def lifespan(app: StackApp): def is_streaming_request(func_name: str, request: Request, **kwargs): # TODO: pass the api method and punt it to the Protocol definition directly - return kwargs.get("stream", False) + # If there's a stream parameter at top level, use it + if "stream" in kwargs: + return kwargs["stream"] + + # If there's a stream parameter inside a "params" parameter, e.g. 
openai_chat_completion() use it + if "params" in kwargs: + params = kwargs["params"] + if hasattr(params, "stream"): + return params.stream + + return False async def maybe_await(value): diff --git a/llama_stack/providers/inline/agents/meta_reference/agent_instance.py b/llama_stack/providers/inline/agents/meta_reference/agent_instance.py index b17c720e9..696fa9c97 100644 --- a/llama_stack/providers/inline/agents/meta_reference/agent_instance.py +++ b/llama_stack/providers/inline/agents/meta_reference/agent_instance.py @@ -49,6 +49,7 @@ from llama_stack.apis.inference import ( Inference, Message, OpenAIAssistantMessageParam, + OpenAIChatCompletionRequest, OpenAIDeveloperMessageParam, OpenAIMessageParam, OpenAISystemMessageParam, @@ -582,7 +583,7 @@ class ChatAgent(ShieldRunnerMixin): max_tokens = getattr(sampling_params, "max_tokens", None) # Use OpenAI chat completion - openai_stream = await self.inference_api.openai_chat_completion( + params = OpenAIChatCompletionRequest( model=self.agent_config.model, messages=openai_messages, tools=openai_tools if openai_tools else None, @@ -593,6 +594,7 @@ class ChatAgent(ShieldRunnerMixin): max_tokens=max_tokens, stream=True, ) + openai_stream = await self.inference_api.openai_chat_completion(params) # Convert OpenAI stream back to Llama Stack format response_stream = convert_openai_chat_completion_stream( diff --git a/llama_stack/providers/inline/agents/meta_reference/responses/streaming.py b/llama_stack/providers/inline/agents/meta_reference/responses/streaming.py index 99fac0530..6c1204fd4 100644 --- a/llama_stack/providers/inline/agents/meta_reference/responses/streaming.py +++ b/llama_stack/providers/inline/agents/meta_reference/responses/streaming.py @@ -49,6 +49,7 @@ from llama_stack.apis.inference import ( OpenAIAssistantMessageParam, OpenAIChatCompletion, OpenAIChatCompletionChunk, + OpenAIChatCompletionRequest, OpenAIChatCompletionToolCall, OpenAIChoice, OpenAIMessageParam, @@ -168,7 +169,7 @@ class StreamingResponseOrchestrator: # (some providers don't support non-empty response_format when tools are present) response_format = None if self.ctx.response_format.type == "text" else self.ctx.response_format logger.debug(f"calling openai_chat_completion with tools: {self.ctx.chat_tools}") - completion_result = await self.inference_api.openai_chat_completion( + params = OpenAIChatCompletionRequest( model=self.ctx.model, messages=messages, tools=self.ctx.chat_tools, @@ -179,6 +180,7 @@ class StreamingResponseOrchestrator: "include_usage": True, }, ) + completion_result = await self.inference_api.openai_chat_completion(params) # Process streaming chunks and build complete response completion_result_data = None diff --git a/llama_stack/providers/inline/batches/reference/batches.py b/llama_stack/providers/inline/batches/reference/batches.py index 39f45d7d1..48690f177 100644 --- a/llama_stack/providers/inline/batches/reference/batches.py +++ b/llama_stack/providers/inline/batches/reference/batches.py @@ -22,6 +22,8 @@ from llama_stack.apis.files import Files, OpenAIFilePurpose from llama_stack.apis.inference import ( Inference, OpenAIAssistantMessageParam, + OpenAIChatCompletionRequest, + OpenAICompletionRequest, OpenAIDeveloperMessageParam, OpenAIMessageParam, OpenAISystemMessageParam, @@ -606,7 +608,8 @@ class ReferenceBatchesImpl(Batches): # TODO(SECURITY): review body for security issues if request.url == "/v1/chat/completions": request.body["messages"] = [convert_to_openai_message_param(msg) for msg in request.body["messages"]] - chat_response = 
await self.inference_api.openai_chat_completion(**request.body) + chat_params = OpenAIChatCompletionRequest(**request.body) + chat_response = await self.inference_api.openai_chat_completion(chat_params) # this is for mypy, we don't allow streaming so we'll get the right type assert hasattr(chat_response, "model_dump_json"), "Chat response must have model_dump_json method" @@ -620,7 +623,8 @@ class ReferenceBatchesImpl(Batches): }, } elif request.url == "/v1/completions": - completion_response = await self.inference_api.openai_completion(**request.body) + completion_params = OpenAICompletionRequest(**request.body) + completion_response = await self.inference_api.openai_completion(completion_params) # this is for mypy, we don't allow streaming so we'll get the right type assert hasattr(completion_response, "model_dump_json"), ( diff --git a/llama_stack/providers/inline/eval/meta_reference/eval.py b/llama_stack/providers/inline/eval/meta_reference/eval.py index 0dfe23dca..1318f3104 100644 --- a/llama_stack/providers/inline/eval/meta_reference/eval.py +++ b/llama_stack/providers/inline/eval/meta_reference/eval.py @@ -12,7 +12,14 @@ from llama_stack.apis.agents import Agents, StepType from llama_stack.apis.benchmarks import Benchmark from llama_stack.apis.datasetio import DatasetIO from llama_stack.apis.datasets import Datasets -from llama_stack.apis.inference import Inference, OpenAISystemMessageParam, OpenAIUserMessageParam, UserMessage +from llama_stack.apis.inference import ( + Inference, + OpenAIChatCompletionRequest, + OpenAICompletionRequest, + OpenAISystemMessageParam, + OpenAIUserMessageParam, + UserMessage, +) from llama_stack.apis.scoring import Scoring from llama_stack.providers.datatypes import BenchmarksProtocolPrivate from llama_stack.providers.inline.agents.meta_reference.agent_instance import ( @@ -168,11 +175,12 @@ class MetaReferenceEvalImpl( sampling_params["stop"] = candidate.sampling_params.stop input_content = json.loads(x[ColumnName.completion_input.value]) - response = await self.inference_api.openai_completion( + params = OpenAICompletionRequest( model=candidate.model, prompt=input_content, **sampling_params, ) + response = await self.inference_api.openai_completion(params) generations.append({ColumnName.generated_answer.value: response.choices[0].text}) elif ColumnName.chat_completion_input.value in x: chat_completion_input_json = json.loads(x[ColumnName.chat_completion_input.value]) @@ -187,11 +195,12 @@ class MetaReferenceEvalImpl( messages += [OpenAISystemMessageParam(**x) for x in chat_completion_input_json if x["role"] == "system"] messages += input_messages - response = await self.inference_api.openai_chat_completion( + params = OpenAIChatCompletionRequest( model=candidate.model, messages=messages, **sampling_params, ) + response = await self.inference_api.openai_chat_completion(params) generations.append({ColumnName.generated_answer.value: response.choices[0].message.content}) else: raise ValueError("Invalid input row") diff --git a/llama_stack/providers/inline/inference/meta_reference/inference.py b/llama_stack/providers/inline/inference/meta_reference/inference.py index fd65fa10d..72813b4fd 100644 --- a/llama_stack/providers/inline/inference/meta_reference/inference.py +++ b/llama_stack/providers/inline/inference/meta_reference/inference.py @@ -6,16 +6,16 @@ import asyncio from collections.abc import AsyncIterator -from typing import Any from llama_stack.apis.inference import ( InferenceProvider, + OpenAIChatCompletionRequest, + OpenAICompletionRequest, ) 
from llama_stack.apis.inference.inference import ( OpenAIChatCompletion, OpenAIChatCompletionChunk, - OpenAIMessageParam, - OpenAIResponseFormatParam, + OpenAICompletion, ) from llama_stack.apis.models import Model, ModelType from llama_stack.log import get_logger @@ -65,7 +65,10 @@ class MetaReferenceInferenceImpl( if self.config.create_distributed_process_group: self.generator.stop() - async def openai_completion(self, *args, **kwargs): + async def openai_completion( + self, + params: OpenAICompletionRequest, + ) -> OpenAICompletion: raise NotImplementedError("OpenAI completion not supported by meta reference provider") async def should_refresh_models(self) -> bool: @@ -150,28 +153,6 @@ class MetaReferenceInferenceImpl( async def openai_chat_completion( self, - model: str, - messages: list[OpenAIMessageParam], - frequency_penalty: float | None = None, - function_call: str | dict[str, Any] | None = None, - functions: list[dict[str, Any]] | None = None, - logit_bias: dict[str, float] | None = None, - logprobs: bool | None = None, - max_completion_tokens: int | None = None, - max_tokens: int | None = None, - n: int | None = None, - parallel_tool_calls: bool | None = None, - presence_penalty: float | None = None, - response_format: OpenAIResponseFormatParam | None = None, - seed: int | None = None, - stop: str | list[str] | None = None, - stream: bool | None = None, - stream_options: dict[str, Any] | None = None, - temperature: float | None = None, - tool_choice: str | dict[str, Any] | None = None, - tools: list[dict[str, Any]] | None = None, - top_logprobs: int | None = None, - top_p: float | None = None, - user: str | None = None, + params: OpenAIChatCompletionRequest, ) -> OpenAIChatCompletion | AsyncIterator[OpenAIChatCompletionChunk]: raise NotImplementedError("OpenAI chat completion not supported by meta-reference inference provider") diff --git a/llama_stack/providers/inline/inference/sentence_transformers/sentence_transformers.py b/llama_stack/providers/inline/inference/sentence_transformers/sentence_transformers.py index b984d97bf..4aac2c3d8 100644 --- a/llama_stack/providers/inline/inference/sentence_transformers/sentence_transformers.py +++ b/llama_stack/providers/inline/inference/sentence_transformers/sentence_transformers.py @@ -5,17 +5,16 @@ # the root directory of this source tree. 
from collections.abc import AsyncIterator -from typing import Any from llama_stack.apis.inference import ( InferenceProvider, + OpenAIChatCompletionRequest, + OpenAICompletionRequest, ) from llama_stack.apis.inference.inference import ( OpenAIChatCompletion, OpenAIChatCompletionChunk, OpenAICompletion, - OpenAIMessageParam, - OpenAIResponseFormatParam, ) from llama_stack.apis.models import ModelType from llama_stack.log import get_logger @@ -73,56 +72,12 @@ class SentenceTransformersInferenceImpl( async def openai_completion( self, - # Standard OpenAI completion parameters - model: str, - prompt: str | list[str] | list[int] | list[list[int]], - best_of: int | None = None, - echo: bool | None = None, - frequency_penalty: float | None = None, - logit_bias: dict[str, float] | None = None, - logprobs: bool | None = None, - max_tokens: int | None = None, - n: int | None = None, - presence_penalty: float | None = None, - seed: int | None = None, - stop: str | list[str] | None = None, - stream: bool | None = None, - stream_options: dict[str, Any] | None = None, - temperature: float | None = None, - top_p: float | None = None, - user: str | None = None, - # vLLM-specific parameters - guided_choice: list[str] | None = None, - prompt_logprobs: int | None = None, - # for fill-in-the-middle type completion - suffix: str | None = None, + params: OpenAICompletionRequest, ) -> OpenAICompletion: raise NotImplementedError("OpenAI completion not supported by sentence transformers provider") async def openai_chat_completion( self, - model: str, - messages: list[OpenAIMessageParam], - frequency_penalty: float | None = None, - function_call: str | dict[str, Any] | None = None, - functions: list[dict[str, Any]] | None = None, - logit_bias: dict[str, float] | None = None, - logprobs: bool | None = None, - max_completion_tokens: int | None = None, - max_tokens: int | None = None, - n: int | None = None, - parallel_tool_calls: bool | None = None, - presence_penalty: float | None = None, - response_format: OpenAIResponseFormatParam | None = None, - seed: int | None = None, - stop: str | list[str] | None = None, - stream: bool | None = None, - stream_options: dict[str, Any] | None = None, - temperature: float | None = None, - tool_choice: str | dict[str, Any] | None = None, - tools: list[dict[str, Any]] | None = None, - top_logprobs: int | None = None, - top_p: float | None = None, - user: str | None = None, + params: OpenAIChatCompletionRequest, ) -> OpenAIChatCompletion | AsyncIterator[OpenAIChatCompletionChunk]: raise NotImplementedError("OpenAI chat completion not supported by sentence transformers provider") diff --git a/llama_stack/providers/inline/safety/llama_guard/llama_guard.py b/llama_stack/providers/inline/safety/llama_guard/llama_guard.py index 206182343..c661de59c 100644 --- a/llama_stack/providers/inline/safety/llama_guard/llama_guard.py +++ b/llama_stack/providers/inline/safety/llama_guard/llama_guard.py @@ -10,7 +10,13 @@ from string import Template from typing import Any from llama_stack.apis.common.content_types import ImageContentItem, TextContentItem -from llama_stack.apis.inference import Inference, Message, UserMessage +from llama_stack.apis.inference import ( + Inference, + Message, + OpenAIChatCompletionRequest, + OpenAIUserMessageParam, + UserMessage, +) from llama_stack.apis.safety import ( RunShieldResponse, Safety, @@ -290,20 +296,21 @@ class LlamaGuardShield: else: shield_input_message = self.build_text_shield_input(messages) - response = await self.inference_api.openai_chat_completion( 
+ params = OpenAIChatCompletionRequest( model=self.model, messages=[shield_input_message], stream=False, temperature=0.0, # default is 1, which is too high for safety ) + response = await self.inference_api.openai_chat_completion(params) content = response.choices[0].message.content content = content.strip() return self.get_shield_response(content) - def build_text_shield_input(self, messages: list[Message]) -> UserMessage: - return UserMessage(content=self.build_prompt(messages)) + def build_text_shield_input(self, messages: list[Message]) -> OpenAIUserMessageParam: + return OpenAIUserMessageParam(role="user", content=self.build_prompt(messages)) - def build_vision_shield_input(self, messages: list[Message]) -> UserMessage: + def build_vision_shield_input(self, messages: list[Message]) -> OpenAIUserMessageParam: conversation = [] most_recent_img = None @@ -335,7 +342,7 @@ class LlamaGuardShield: prompt.append(most_recent_img) prompt.append(self.build_prompt(conversation[::-1])) - return UserMessage(content=prompt) + return OpenAIUserMessageParam(role="user", content=prompt) def build_prompt(self, messages: list[Message]) -> str: categories = self.get_safety_categories() @@ -377,11 +384,12 @@ class LlamaGuardShield: # TODO: Add Image based support for OpenAI Moderations shield_input_message = self.build_text_shield_input(messages) - response = await self.inference_api.openai_chat_completion( + params = OpenAIChatCompletionRequest( model=self.model, messages=[shield_input_message], stream=False, ) + response = await self.inference_api.openai_chat_completion(params) content = response.choices[0].message.content content = content.strip() return self.get_moderation_object(content) diff --git a/llama_stack/providers/inline/scoring/llm_as_judge/scoring_fn/llm_as_judge_scoring_fn.py b/llama_stack/providers/inline/scoring/llm_as_judge/scoring_fn/llm_as_judge_scoring_fn.py index d60efe828..f5e55d1d5 100644 --- a/llama_stack/providers/inline/scoring/llm_as_judge/scoring_fn/llm_as_judge_scoring_fn.py +++ b/llama_stack/providers/inline/scoring/llm_as_judge/scoring_fn/llm_as_judge_scoring_fn.py @@ -6,7 +6,7 @@ import re from typing import Any -from llama_stack.apis.inference import Inference +from llama_stack.apis.inference import Inference, OpenAIChatCompletionRequest from llama_stack.apis.scoring import ScoringResultRow from llama_stack.apis.scoring_functions import ScoringFnParams from llama_stack.providers.utils.scoring.base_scoring_fn import RegisteredBaseScoringFn @@ -55,7 +55,7 @@ class LlmAsJudgeScoringFn(RegisteredBaseScoringFn): generated_answer=generated_answer, ) - judge_response = await self.inference_api.openai_chat_completion( + params = OpenAIChatCompletionRequest( model=fn_def.params.judge_model, messages=[ { @@ -64,6 +64,7 @@ class LlmAsJudgeScoringFn(RegisteredBaseScoringFn): } ], ) + judge_response = await self.inference_api.openai_chat_completion(params) content = judge_response.choices[0].message.content rating_regexes = fn_def.params.judge_score_regexes diff --git a/llama_stack/providers/inline/tool_runtime/rag/context_retriever.py b/llama_stack/providers/inline/tool_runtime/rag/context_retriever.py index 9bc22f979..98098e2d2 100644 --- a/llama_stack/providers/inline/tool_runtime/rag/context_retriever.py +++ b/llama_stack/providers/inline/tool_runtime/rag/context_retriever.py @@ -8,7 +8,7 @@ from jinja2 import Template from llama_stack.apis.common.content_types import InterleavedContent -from llama_stack.apis.inference import OpenAIUserMessageParam +from 
llama_stack.apis.inference import OpenAIChatCompletionRequest, OpenAIUserMessageParam from llama_stack.apis.tools.rag_tool import ( DefaultRAGQueryGeneratorConfig, LLMRAGQueryGeneratorConfig, @@ -65,11 +65,12 @@ async def llm_rag_query_generator( model = config.model message = OpenAIUserMessageParam(content=rendered_content) - response = await inference_api.openai_chat_completion( + params = OpenAIChatCompletionRequest( model=model, messages=[message], stream=False, ) + response = await inference_api.openai_chat_completion(params) query = response.choices[0].message.content diff --git a/llama_stack/providers/remote/inference/bedrock/bedrock.py b/llama_stack/providers/remote/inference/bedrock/bedrock.py index 9c8a74b47..788c274f1 100644 --- a/llama_stack/providers/remote/inference/bedrock/bedrock.py +++ b/llama_stack/providers/remote/inference/bedrock/bedrock.py @@ -6,21 +6,20 @@ import json from collections.abc import AsyncIterator -from typing import Any from botocore.client import BaseClient from llama_stack.apis.inference import ( ChatCompletionRequest, Inference, + OpenAIChatCompletionRequest, + OpenAICompletionRequest, OpenAIEmbeddingsResponse, ) from llama_stack.apis.inference.inference import ( OpenAIChatCompletion, OpenAIChatCompletionChunk, OpenAICompletion, - OpenAIMessageParam, - OpenAIResponseFormatParam, ) from llama_stack.providers.remote.inference.bedrock.config import BedrockConfig from llama_stack.providers.utils.bedrock.client import create_bedrock_client @@ -135,56 +134,12 @@ class BedrockInferenceAdapter( async def openai_completion( self, - # Standard OpenAI completion parameters - model: str, - prompt: str | list[str] | list[int] | list[list[int]], - best_of: int | None = None, - echo: bool | None = None, - frequency_penalty: float | None = None, - logit_bias: dict[str, float] | None = None, - logprobs: bool | None = None, - max_tokens: int | None = None, - n: int | None = None, - presence_penalty: float | None = None, - seed: int | None = None, - stop: str | list[str] | None = None, - stream: bool | None = None, - stream_options: dict[str, Any] | None = None, - temperature: float | None = None, - top_p: float | None = None, - user: str | None = None, - # vLLM-specific parameters - guided_choice: list[str] | None = None, - prompt_logprobs: int | None = None, - # for fill-in-the-middle type completion - suffix: str | None = None, + params: OpenAICompletionRequest, ) -> OpenAICompletion: raise NotImplementedError("OpenAI completion not supported by the Bedrock provider") async def openai_chat_completion( self, - model: str, - messages: list[OpenAIMessageParam], - frequency_penalty: float | None = None, - function_call: str | dict[str, Any] | None = None, - functions: list[dict[str, Any]] | None = None, - logit_bias: dict[str, float] | None = None, - logprobs: bool | None = None, - max_completion_tokens: int | None = None, - max_tokens: int | None = None, - n: int | None = None, - parallel_tool_calls: bool | None = None, - presence_penalty: float | None = None, - response_format: OpenAIResponseFormatParam | None = None, - seed: int | None = None, - stop: str | list[str] | None = None, - stream: bool | None = None, - stream_options: dict[str, Any] | None = None, - temperature: float | None = None, - tool_choice: str | dict[str, Any] | None = None, - tools: list[dict[str, Any]] | None = None, - top_logprobs: int | None = None, - top_p: float | None = None, - user: str | None = None, + params: OpenAIChatCompletionRequest, ) -> OpenAIChatCompletion | 
AsyncIterator[OpenAIChatCompletionChunk]: raise NotImplementedError("OpenAI chat completion not supported by the Bedrock provider") diff --git a/llama_stack/providers/remote/inference/databricks/databricks.py b/llama_stack/providers/remote/inference/databricks/databricks.py index 705f4bddd..512913226 100644 --- a/llama_stack/providers/remote/inference/databricks/databricks.py +++ b/llama_stack/providers/remote/inference/databricks/databricks.py @@ -5,11 +5,10 @@ # the root directory of this source tree. from collections.abc import Iterable -from typing import Any from databricks.sdk import WorkspaceClient -from llama_stack.apis.inference import OpenAICompletion +from llama_stack.apis.inference import OpenAICompletion, OpenAICompletionRequest from llama_stack.log import get_logger from llama_stack.providers.utils.inference.openai_mixin import OpenAIMixin @@ -40,25 +39,6 @@ class DatabricksInferenceAdapter(OpenAIMixin): async def openai_completion( self, - model: str, - prompt: str | list[str] | list[int] | list[list[int]], - best_of: int | None = None, - echo: bool | None = None, - frequency_penalty: float | None = None, - logit_bias: dict[str, float] | None = None, - logprobs: bool | None = None, - max_tokens: int | None = None, - n: int | None = None, - presence_penalty: float | None = None, - seed: int | None = None, - stop: str | list[str] | None = None, - stream: bool | None = None, - stream_options: dict[str, Any] | None = None, - temperature: float | None = None, - top_p: float | None = None, - user: str | None = None, - guided_choice: list[str] | None = None, - prompt_logprobs: int | None = None, - suffix: str | None = None, + params: OpenAICompletionRequest, ) -> OpenAICompletion: raise NotImplementedError() diff --git a/llama_stack/providers/remote/inference/llama_openai_compat/llama.py b/llama_stack/providers/remote/inference/llama_openai_compat/llama.py index 6995665f7..5a8bdd55e 100644 --- a/llama_stack/providers/remote/inference/llama_openai_compat/llama.py +++ b/llama_stack/providers/remote/inference/llama_openai_compat/llama.py @@ -3,9 +3,7 @@ # # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. 
-from typing import Any - -from llama_stack.apis.inference.inference import OpenAICompletion, OpenAIEmbeddingsResponse +from llama_stack.apis.inference.inference import OpenAICompletion, OpenAICompletionRequest, OpenAIEmbeddingsResponse from llama_stack.log import get_logger from llama_stack.providers.remote.inference.llama_openai_compat.config import LlamaCompatConfig from llama_stack.providers.utils.inference.openai_mixin import OpenAIMixin @@ -31,26 +29,7 @@ class LlamaCompatInferenceAdapter(OpenAIMixin): async def openai_completion( self, - model: str, - prompt: str | list[str] | list[int] | list[list[int]], - best_of: int | None = None, - echo: bool | None = None, - frequency_penalty: float | None = None, - logit_bias: dict[str, float] | None = None, - logprobs: bool | None = None, - max_tokens: int | None = None, - n: int | None = None, - presence_penalty: float | None = None, - seed: int | None = None, - stop: str | list[str] | None = None, - stream: bool | None = None, - stream_options: dict[str, Any] | None = None, - temperature: float | None = None, - top_p: float | None = None, - user: str | None = None, - guided_choice: list[str] | None = None, - prompt_logprobs: int | None = None, - suffix: str | None = None, + params: OpenAICompletionRequest, ) -> OpenAICompletion: raise NotImplementedError() diff --git a/llama_stack/providers/remote/inference/passthrough/passthrough.py b/llama_stack/providers/remote/inference/passthrough/passthrough.py index 01078760a..8813ae529 100644 --- a/llama_stack/providers/remote/inference/passthrough/passthrough.py +++ b/llama_stack/providers/remote/inference/passthrough/passthrough.py @@ -13,15 +13,14 @@ from llama_stack.apis.inference import ( Inference, OpenAIChatCompletion, OpenAIChatCompletionChunk, + OpenAIChatCompletionRequest, OpenAICompletion, + OpenAICompletionRequest, OpenAIEmbeddingsResponse, - OpenAIMessageParam, - OpenAIResponseFormatParam, ) from llama_stack.apis.models import Model from llama_stack.core.library_client import convert_pydantic_to_json_value from llama_stack.providers.utils.inference.model_registry import ModelRegistryHelper -from llama_stack.providers.utils.inference.openai_compat import prepare_openai_completion_params from .config import PassthroughImplConfig @@ -80,110 +79,31 @@ class PassthroughInferenceAdapter(Inference): async def openai_completion( self, - model: str, - prompt: str | list[str] | list[int] | list[list[int]], - best_of: int | None = None, - echo: bool | None = None, - frequency_penalty: float | None = None, - logit_bias: dict[str, float] | None = None, - logprobs: bool | None = None, - max_tokens: int | None = None, - n: int | None = None, - presence_penalty: float | None = None, - seed: int | None = None, - stop: str | list[str] | None = None, - stream: bool | None = None, - stream_options: dict[str, Any] | None = None, - temperature: float | None = None, - top_p: float | None = None, - user: str | None = None, - guided_choice: list[str] | None = None, - prompt_logprobs: int | None = None, - suffix: str | None = None, + params: OpenAICompletionRequest, ) -> OpenAICompletion: client = self._get_client() - model_obj = await self.model_store.get_model(model) + model_obj = await self.model_store.get_model(params.model) - params = await prepare_openai_completion_params( - model=model_obj.provider_resource_id, - prompt=prompt, - best_of=best_of, - echo=echo, - frequency_penalty=frequency_penalty, - logit_bias=logit_bias, - logprobs=logprobs, - max_tokens=max_tokens, - n=n, - 
presence_penalty=presence_penalty, - seed=seed, - stop=stop, - stream=stream, - stream_options=stream_options, - temperature=temperature, - top_p=top_p, - user=user, - guided_choice=guided_choice, - prompt_logprobs=prompt_logprobs, - ) + params = params.model_copy() + params.model = model_obj.provider_resource_id - return await client.inference.openai_completion(**params) + request_params = params.model_dump(exclude_none=True) + + return await client.inference.openai_completion(**request_params) async def openai_chat_completion( self, - model: str, - messages: list[OpenAIMessageParam], - frequency_penalty: float | None = None, - function_call: str | dict[str, Any] | None = None, - functions: list[dict[str, Any]] | None = None, - logit_bias: dict[str, float] | None = None, - logprobs: bool | None = None, - max_completion_tokens: int | None = None, - max_tokens: int | None = None, - n: int | None = None, - parallel_tool_calls: bool | None = None, - presence_penalty: float | None = None, - response_format: OpenAIResponseFormatParam | None = None, - seed: int | None = None, - stop: str | list[str] | None = None, - stream: bool | None = None, - stream_options: dict[str, Any] | None = None, - temperature: float | None = None, - tool_choice: str | dict[str, Any] | None = None, - tools: list[dict[str, Any]] | None = None, - top_logprobs: int | None = None, - top_p: float | None = None, - user: str | None = None, + params: OpenAIChatCompletionRequest, ) -> OpenAIChatCompletion | AsyncIterator[OpenAIChatCompletionChunk]: client = self._get_client() - model_obj = await self.model_store.get_model(model) + model_obj = await self.model_store.get_model(params.model) - params = await prepare_openai_completion_params( - model=model_obj.provider_resource_id, - messages=messages, - frequency_penalty=frequency_penalty, - function_call=function_call, - functions=functions, - logit_bias=logit_bias, - logprobs=logprobs, - max_completion_tokens=max_completion_tokens, - max_tokens=max_tokens, - n=n, - parallel_tool_calls=parallel_tool_calls, - presence_penalty=presence_penalty, - response_format=response_format, - seed=seed, - stop=stop, - stream=stream, - stream_options=stream_options, - temperature=temperature, - tool_choice=tool_choice, - tools=tools, - top_logprobs=top_logprobs, - top_p=top_p, - user=user, - ) + params = params.model_copy() + params.model = model_obj.provider_resource_id - return await client.inference.openai_chat_completion(**params) + request_params = params.model_dump(exclude_none=True) + + return await client.inference.openai_chat_completion(**request_params) def cast_value_to_json_dict(self, request_params: dict[str, Any]) -> dict[str, Any]: json_params = {} diff --git a/llama_stack/providers/remote/inference/runpod/runpod.py b/llama_stack/providers/remote/inference/runpod/runpod.py index 67e430ac5..c08136f9f 100644 --- a/llama_stack/providers/remote/inference/runpod/runpod.py +++ b/llama_stack/providers/remote/inference/runpod/runpod.py @@ -4,11 +4,12 @@ # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. 
-from typing import Any +from collections.abc import AsyncIterator from llama_stack.apis.inference import ( - OpenAIMessageParam, - OpenAIResponseFormatParam, + OpenAIChatCompletion, + OpenAIChatCompletionChunk, + OpenAIChatCompletionRequest, ) from llama_stack.providers.utils.inference.openai_mixin import OpenAIMixin @@ -30,56 +31,12 @@ class RunpodInferenceAdapter(OpenAIMixin): async def openai_chat_completion( self, - model: str, - messages: list[OpenAIMessageParam], - frequency_penalty: float | None = None, - function_call: str | dict[str, Any] | None = None, - functions: list[dict[str, Any]] | None = None, - logit_bias: dict[str, float] | None = None, - logprobs: bool | None = None, - max_completion_tokens: int | None = None, - max_tokens: int | None = None, - n: int | None = None, - parallel_tool_calls: bool | None = None, - presence_penalty: float | None = None, - response_format: OpenAIResponseFormatParam | None = None, - seed: int | None = None, - stop: str | list[str] | None = None, - stream: bool | None = None, - stream_options: dict[str, Any] | None = None, - temperature: float | None = None, - tool_choice: str | dict[str, Any] | None = None, - tools: list[dict[str, Any]] | None = None, - top_logprobs: int | None = None, - top_p: float | None = None, - user: str | None = None, - ): + params: OpenAIChatCompletionRequest, + ) -> OpenAIChatCompletion | AsyncIterator[OpenAIChatCompletionChunk]: """Override to add RunPod-specific stream_options requirement.""" - if stream and not stream_options: - stream_options = {"include_usage": True} + params = params.model_copy() - return await super().openai_chat_completion( - model=model, - messages=messages, - frequency_penalty=frequency_penalty, - function_call=function_call, - functions=functions, - logit_bias=logit_bias, - logprobs=logprobs, - max_completion_tokens=max_completion_tokens, - max_tokens=max_tokens, - n=n, - parallel_tool_calls=parallel_tool_calls, - presence_penalty=presence_penalty, - response_format=response_format, - seed=seed, - stop=stop, - stream=stream, - stream_options=stream_options, - temperature=temperature, - tool_choice=tool_choice, - tools=tools, - top_logprobs=top_logprobs, - top_p=top_p, - user=user, - ) + if params.stream and not params.stream_options: + params.stream_options = {"include_usage": True} + + return await super().openai_chat_completion(params) diff --git a/llama_stack/providers/remote/inference/vllm/vllm.py b/llama_stack/providers/remote/inference/vllm/vllm.py index 72a84c44f..b09326271 100644 --- a/llama_stack/providers/remote/inference/vllm/vllm.py +++ b/llama_stack/providers/remote/inference/vllm/vllm.py @@ -4,7 +4,6 @@ # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. 
from collections.abc import AsyncIterator -from typing import Any from urllib.parse import urljoin import httpx @@ -15,8 +14,7 @@ from pydantic import ConfigDict from llama_stack.apis.inference import ( OpenAIChatCompletion, - OpenAIMessageParam, - OpenAIResponseFormatParam, + OpenAIChatCompletionRequest, ToolChoice, ) from llama_stack.log import get_logger @@ -95,61 +93,19 @@ class VLLMInferenceAdapter(OpenAIMixin): async def openai_chat_completion( self, - model: str, - messages: list[OpenAIMessageParam], - frequency_penalty: float | None = None, - function_call: str | dict[str, Any] | None = None, - functions: list[dict[str, Any]] | None = None, - logit_bias: dict[str, float] | None = None, - logprobs: bool | None = None, - max_completion_tokens: int | None = None, - max_tokens: int | None = None, - n: int | None = None, - parallel_tool_calls: bool | None = None, - presence_penalty: float | None = None, - response_format: OpenAIResponseFormatParam | None = None, - seed: int | None = None, - stop: str | list[str] | None = None, - stream: bool | None = None, - stream_options: dict[str, Any] | None = None, - temperature: float | None = None, - tool_choice: str | dict[str, Any] | None = None, - tools: list[dict[str, Any]] | None = None, - top_logprobs: int | None = None, - top_p: float | None = None, - user: str | None = None, + params: OpenAIChatCompletionRequest, ) -> OpenAIChatCompletion | AsyncIterator[OpenAIChatCompletionChunk]: - max_tokens = max_tokens or self.config.max_tokens + params = params.model_copy() + + # Apply vLLM-specific defaults + if params.max_tokens is None and self.config.max_tokens: + params.max_tokens = self.config.max_tokens # This is to be consistent with OpenAI API and support vLLM <= v0.6.3 # References: # * https://platform.openai.com/docs/api-reference/chat/create#chat-create-tool_choice # * https://github.com/vllm-project/vllm/pull/10000 - if not tools and tool_choice is not None: - tool_choice = ToolChoice.none.value + if not params.tools and params.tool_choice is not None: + params.tool_choice = ToolChoice.none.value - return await super().openai_chat_completion( - model=model, - messages=messages, - frequency_penalty=frequency_penalty, - function_call=function_call, - functions=functions, - logit_bias=logit_bias, - logprobs=logprobs, - max_completion_tokens=max_completion_tokens, - max_tokens=max_tokens, - n=n, - parallel_tool_calls=parallel_tool_calls, - presence_penalty=presence_penalty, - response_format=response_format, - seed=seed, - stop=stop, - stream=stream, - stream_options=stream_options, - temperature=temperature, - tool_choice=tool_choice, - tools=tools, - top_logprobs=top_logprobs, - top_p=top_p, - user=user, - ) + return await super().openai_chat_completion(params) diff --git a/llama_stack/providers/utils/inference/litellm_openai_mixin.py b/llama_stack/providers/utils/inference/litellm_openai_mixin.py index 6bef97dd5..eed078a0e 100644 --- a/llama_stack/providers/utils/inference/litellm_openai_mixin.py +++ b/llama_stack/providers/utils/inference/litellm_openai_mixin.py @@ -7,7 +7,6 @@ import base64 import struct from collections.abc import AsyncIterator -from typing import Any import litellm @@ -17,12 +16,12 @@ from llama_stack.apis.inference import ( JsonSchemaResponseFormat, OpenAIChatCompletion, OpenAIChatCompletionChunk, + OpenAIChatCompletionRequest, OpenAICompletion, + OpenAICompletionRequest, OpenAIEmbeddingData, OpenAIEmbeddingsResponse, OpenAIEmbeddingUsage, - OpenAIMessageParam, - OpenAIResponseFormatParam, ToolChoice, ) from 
llama_stack.core.request_headers import NeedsRequestProviderData @@ -227,116 +226,80 @@ class LiteLLMOpenAIMixin( async def openai_completion( self, - model: str, - prompt: str | list[str] | list[int] | list[list[int]], - best_of: int | None = None, - echo: bool | None = None, - frequency_penalty: float | None = None, - logit_bias: dict[str, float] | None = None, - logprobs: bool | None = None, - max_tokens: int | None = None, - n: int | None = None, - presence_penalty: float | None = None, - seed: int | None = None, - stop: str | list[str] | None = None, - stream: bool | None = None, - stream_options: dict[str, Any] | None = None, - temperature: float | None = None, - top_p: float | None = None, - user: str | None = None, - guided_choice: list[str] | None = None, - prompt_logprobs: int | None = None, - suffix: str | None = None, + params: OpenAICompletionRequest, ) -> OpenAICompletion: - model_obj = await self.model_store.get_model(model) - params = await prepare_openai_completion_params( + model_obj = await self.model_store.get_model(params.model) + + request_params = await prepare_openai_completion_params( model=self.get_litellm_model_name(model_obj.provider_resource_id), - prompt=prompt, - best_of=best_of, - echo=echo, - frequency_penalty=frequency_penalty, - logit_bias=logit_bias, - logprobs=logprobs, - max_tokens=max_tokens, - n=n, - presence_penalty=presence_penalty, - seed=seed, - stop=stop, - stream=stream, - stream_options=stream_options, - temperature=temperature, - top_p=top_p, - user=user, - guided_choice=guided_choice, - prompt_logprobs=prompt_logprobs, + prompt=params.prompt, + best_of=params.best_of, + echo=params.echo, + frequency_penalty=params.frequency_penalty, + logit_bias=params.logit_bias, + logprobs=params.logprobs, + max_tokens=params.max_tokens, + n=params.n, + presence_penalty=params.presence_penalty, + seed=params.seed, + stop=params.stop, + stream=params.stream, + stream_options=params.stream_options, + temperature=params.temperature, + top_p=params.top_p, + user=params.user, + guided_choice=params.guided_choice, + prompt_logprobs=params.prompt_logprobs, + suffix=params.suffix, api_key=self.get_api_key(), api_base=self.api_base, ) - return await litellm.atext_completion(**params) + return await litellm.atext_completion(**request_params) async def openai_chat_completion( self, - model: str, - messages: list[OpenAIMessageParam], - frequency_penalty: float | None = None, - function_call: str | dict[str, Any] | None = None, - functions: list[dict[str, Any]] | None = None, - logit_bias: dict[str, float] | None = None, - logprobs: bool | None = None, - max_completion_tokens: int | None = None, - max_tokens: int | None = None, - n: int | None = None, - parallel_tool_calls: bool | None = None, - presence_penalty: float | None = None, - response_format: OpenAIResponseFormatParam | None = None, - seed: int | None = None, - stop: str | list[str] | None = None, - stream: bool | None = None, - stream_options: dict[str, Any] | None = None, - temperature: float | None = None, - tool_choice: str | dict[str, Any] | None = None, - tools: list[dict[str, Any]] | None = None, - top_logprobs: int | None = None, - top_p: float | None = None, - user: str | None = None, + params: OpenAIChatCompletionRequest, ) -> OpenAIChatCompletion | AsyncIterator[OpenAIChatCompletionChunk]: # Add usage tracking for streaming when telemetry is active from llama_stack.providers.utils.telemetry.tracing import get_current_span - if stream and get_current_span() is not None: + stream_options = 
params.stream_options + if params.stream and get_current_span() is not None: if stream_options is None: stream_options = {"include_usage": True} elif "include_usage" not in stream_options: stream_options = {**stream_options, "include_usage": True} - model_obj = await self.model_store.get_model(model) - params = await prepare_openai_completion_params( + + model_obj = await self.model_store.get_model(params.model) + + request_params = await prepare_openai_completion_params( model=self.get_litellm_model_name(model_obj.provider_resource_id), - messages=messages, - frequency_penalty=frequency_penalty, - function_call=function_call, - functions=functions, - logit_bias=logit_bias, - logprobs=logprobs, - max_completion_tokens=max_completion_tokens, - max_tokens=max_tokens, - n=n, - parallel_tool_calls=parallel_tool_calls, - presence_penalty=presence_penalty, - response_format=response_format, - seed=seed, - stop=stop, - stream=stream, + messages=params.messages, + frequency_penalty=params.frequency_penalty, + function_call=params.function_call, + functions=params.functions, + logit_bias=params.logit_bias, + logprobs=params.logprobs, + max_completion_tokens=params.max_completion_tokens, + max_tokens=params.max_tokens, + n=params.n, + parallel_tool_calls=params.parallel_tool_calls, + presence_penalty=params.presence_penalty, + response_format=params.response_format, + seed=params.seed, + stop=params.stop, + stream=params.stream, stream_options=stream_options, - temperature=temperature, - tool_choice=tool_choice, - tools=tools, - top_logprobs=top_logprobs, - top_p=top_p, - user=user, + temperature=params.temperature, + tool_choice=params.tool_choice, + tools=params.tools, + top_logprobs=params.top_logprobs, + top_p=params.top_p, + user=params.user, api_key=self.get_api_key(), api_base=self.api_base, ) - return await litellm.acompletion(**params) + return await litellm.acompletion(**request_params) async def check_model_availability(self, model: str) -> bool: """ diff --git a/llama_stack/providers/utils/inference/openai_mixin.py b/llama_stack/providers/utils/inference/openai_mixin.py index 33a8b81b5..502bc207b 100644 --- a/llama_stack/providers/utils/inference/openai_mixin.py +++ b/llama_stack/providers/utils/inference/openai_mixin.py @@ -17,12 +17,13 @@ from llama_stack.apis.inference import ( Model, OpenAIChatCompletion, OpenAIChatCompletionChunk, + OpenAIChatCompletionRequest, OpenAICompletion, + OpenAICompletionRequest, OpenAIEmbeddingData, OpenAIEmbeddingsResponse, OpenAIEmbeddingUsage, OpenAIMessageParam, - OpenAIResponseFormatParam, ) from llama_stack.apis.models import ModelType from llama_stack.core.request_headers import NeedsRequestProviderData @@ -222,26 +223,7 @@ class OpenAIMixin(NeedsRequestProviderData, ABC, BaseModel): async def openai_completion( self, - model: str, - prompt: str | list[str] | list[int] | list[list[int]], - best_of: int | None = None, - echo: bool | None = None, - frequency_penalty: float | None = None, - logit_bias: dict[str, float] | None = None, - logprobs: bool | None = None, - max_tokens: int | None = None, - n: int | None = None, - presence_penalty: float | None = None, - seed: int | None = None, - stop: str | list[str] | None = None, - stream: bool | None = None, - stream_options: dict[str, Any] | None = None, - temperature: float | None = None, - top_p: float | None = None, - user: str | None = None, - guided_choice: list[str] | None = None, - prompt_logprobs: int | None = None, - suffix: str | None = None, + params: OpenAICompletionRequest, ) -> 
OpenAICompletion: """ Direct OpenAI completion API call. @@ -251,67 +233,45 @@ class OpenAIMixin(NeedsRequestProviderData, ABC, BaseModel): # guided_choice is supported by vLLM # TODO: test coverage extra_body: dict[str, Any] = {} - if prompt_logprobs is not None and prompt_logprobs >= 0: - extra_body["prompt_logprobs"] = prompt_logprobs - if guided_choice: - extra_body["guided_choice"] = guided_choice + if params.prompt_logprobs is not None and params.prompt_logprobs >= 0: + extra_body["prompt_logprobs"] = params.prompt_logprobs + if params.guided_choice: + extra_body["guided_choice"] = params.guided_choice # TODO: fix openai_completion to return type compatible with OpenAI's API response - resp = await self.client.completions.create( - **await prepare_openai_completion_params( - model=await self._get_provider_model_id(model), - prompt=prompt, - best_of=best_of, - echo=echo, - frequency_penalty=frequency_penalty, - logit_bias=logit_bias, - logprobs=logprobs, - max_tokens=max_tokens, - n=n, - presence_penalty=presence_penalty, - seed=seed, - stop=stop, - stream=stream, - stream_options=stream_options, - temperature=temperature, - top_p=top_p, - user=user, - suffix=suffix, - ), - extra_body=extra_body, + completion_kwargs = await prepare_openai_completion_params( + model=await self._get_provider_model_id(params.model), + prompt=params.prompt, + best_of=params.best_of, + echo=params.echo, + frequency_penalty=params.frequency_penalty, + logit_bias=params.logit_bias, + logprobs=params.logprobs, + max_tokens=params.max_tokens, + n=params.n, + presence_penalty=params.presence_penalty, + seed=params.seed, + stop=params.stop, + stream=params.stream, + stream_options=params.stream_options, + temperature=params.temperature, + top_p=params.top_p, + user=params.user, + suffix=params.suffix, ) + resp = await self.client.completions.create(**completion_kwargs, extra_body=extra_body) - return await self._maybe_overwrite_id(resp, stream) # type: ignore[no-any-return] + return await self._maybe_overwrite_id(resp, params.stream) # type: ignore[no-any-return] async def openai_chat_completion( self, - model: str, - messages: list[OpenAIMessageParam], - frequency_penalty: float | None = None, - function_call: str | dict[str, Any] | None = None, - functions: list[dict[str, Any]] | None = None, - logit_bias: dict[str, float] | None = None, - logprobs: bool | None = None, - max_completion_tokens: int | None = None, - max_tokens: int | None = None, - n: int | None = None, - parallel_tool_calls: bool | None = None, - presence_penalty: float | None = None, - response_format: OpenAIResponseFormatParam | None = None, - seed: int | None = None, - stop: str | list[str] | None = None, - stream: bool | None = None, - stream_options: dict[str, Any] | None = None, - temperature: float | None = None, - tool_choice: str | dict[str, Any] | None = None, - tools: list[dict[str, Any]] | None = None, - top_logprobs: int | None = None, - top_p: float | None = None, - user: str | None = None, + params: OpenAIChatCompletionRequest, ) -> OpenAIChatCompletion | AsyncIterator[OpenAIChatCompletionChunk]: """ Direct OpenAI chat completion API call. 
""" + messages = params.messages + if self.download_images: async def _localize_image_url(m: OpenAIMessageParam) -> OpenAIMessageParam: @@ -330,35 +290,35 @@ class OpenAIMixin(NeedsRequestProviderData, ABC, BaseModel): messages = [await _localize_image_url(m) for m in messages] - params = await prepare_openai_completion_params( - model=await self._get_provider_model_id(model), + request_params = await prepare_openai_completion_params( + model=await self._get_provider_model_id(params.model), messages=messages, - frequency_penalty=frequency_penalty, - function_call=function_call, - functions=functions, - logit_bias=logit_bias, - logprobs=logprobs, - max_completion_tokens=max_completion_tokens, - max_tokens=max_tokens, - n=n, - parallel_tool_calls=parallel_tool_calls, - presence_penalty=presence_penalty, - response_format=response_format, - seed=seed, - stop=stop, - stream=stream, - stream_options=stream_options, - temperature=temperature, - tool_choice=tool_choice, - tools=tools, - top_logprobs=top_logprobs, - top_p=top_p, - user=user, + frequency_penalty=params.frequency_penalty, + function_call=params.function_call, + functions=params.functions, + logit_bias=params.logit_bias, + logprobs=params.logprobs, + max_completion_tokens=params.max_completion_tokens, + max_tokens=params.max_tokens, + n=params.n, + parallel_tool_calls=params.parallel_tool_calls, + presence_penalty=params.presence_penalty, + response_format=params.response_format, + seed=params.seed, + stop=params.stop, + stream=params.stream, + stream_options=params.stream_options, + temperature=params.temperature, + tool_choice=params.tool_choice, + tools=params.tools, + top_logprobs=params.top_logprobs, + top_p=params.top_p, + user=params.user, ) - resp = await self.client.chat.completions.create(**params) + resp = await self.client.chat.completions.create(**request_params) - return await self._maybe_overwrite_id(resp, stream) # type: ignore[no-any-return] + return await self._maybe_overwrite_id(resp, params.stream) # type: ignore[no-any-return] async def openai_embeddings( self, diff --git a/llama_stack/strong_typing/inspection.py b/llama_stack/strong_typing/inspection.py index 42713e371..f3a4bef90 100644 --- a/llama_stack/strong_typing/inspection.py +++ b/llama_stack/strong_typing/inspection.py @@ -50,6 +50,10 @@ if sys.version_info >= (3, 10): else: from typing_extensions import TypeGuard + +from pydantic import BaseModel +from pydantic.fields import FieldInfo + S = TypeVar("S") T = TypeVar("T") K = TypeVar("K") @@ -570,7 +574,8 @@ def get_class_properties(typ: type) -> Iterable[Tuple[str, type | str]]: elif hasattr(typ, "model_fields"): # Pydantic BaseModel - use model_fields to exclude ClassVar and other non-field attributes # Reconstruct Annotated type if discriminator exists to preserve metadata - from typing import Annotated, Any, cast + from typing import Annotated, Any + from pydantic.fields import FieldInfo def get_field_type(name: str, field: Any) -> type | str: @@ -1049,3 +1054,32 @@ def check_recursive( pred = lambda typ, obj: True # noqa: E731 return RecursiveChecker(pred).check(type(obj), obj) + + +def is_unwrapped_body_param(param_type: Any) -> bool: + """ + Check if a parameter type represents an unwrapped body parameter. + An unwrapped body parameter is an Annotated type with Body(embed=False) + + This is used to determine whether request parameters should be flattened + in OpenAPI specs and client libraries (matching FastAPI's embed=False behavior). 
+ + Args: + param_type: The parameter type annotation to check + + Returns: + True if the parameter should be treated as an unwrapped body parameter + """ + # Check if it's Annotated with Body(embed=False) + if typing.get_origin(param_type) is Annotated: + args = typing.get_args(param_type) + base_type = args[0] + metadata = args[1:] + + # Look for Body annotation with embed=False + # Body() returns a FieldInfo object, so we check for that type and the embed attribute + for item in metadata: + if isinstance(item, FieldInfo) and hasattr(item, "embed") and not item.embed: + return inspect.isclass(base_type) and issubclass(base_type, BaseModel) + + return False diff --git a/tests/unit/providers/agents/meta_reference/test_openai_responses.py b/tests/unit/providers/agents/meta_reference/test_openai_responses.py index 3f4868ff5..8025ea5ae 100644 --- a/tests/unit/providers/agents/meta_reference/test_openai_responses.py +++ b/tests/unit/providers/agents/meta_reference/test_openai_responses.py @@ -33,6 +33,7 @@ from llama_stack.apis.agents.openai_responses import ( from llama_stack.apis.inference import ( OpenAIAssistantMessageParam, OpenAIChatCompletionContentPartTextParam, + OpenAIChatCompletionRequest, OpenAIDeveloperMessageParam, OpenAIJSONSchema, OpenAIResponseFormatJSONObject, @@ -161,15 +162,17 @@ async def test_create_openai_response_with_string_input(openai_responses_impl, m chunks = [chunk async for chunk in result] mock_inference_api.openai_chat_completion.assert_called_once_with( - model=model, - messages=[OpenAIUserMessageParam(role="user", content="What is the capital of Ireland?", name=None)], - response_format=None, - tools=None, - stream=True, - temperature=0.1, - stream_options={ - "include_usage": True, - }, + OpenAIChatCompletionRequest( + model=model, + messages=[OpenAIUserMessageParam(role="user", content="What is the capital of Ireland?", name=None)], + response_format=None, + tools=None, + stream=True, + temperature=0.1, + stream_options={ + "include_usage": True, + }, + ) ) # Should have content part events for text streaming @@ -256,13 +259,15 @@ async def test_create_openai_response_with_string_input_with_tools(openai_respon # Verify first_call = mock_inference_api.openai_chat_completion.call_args_list[0] - assert first_call.kwargs["messages"][0].content == "What is the capital of Ireland?" - assert first_call.kwargs["tools"] is not None - assert first_call.kwargs["temperature"] == 0.1 + first_params = first_call.args[0] + assert first_params.messages[0].content == "What is the capital of Ireland?" 
+ assert first_params.tools is not None + assert first_params.temperature == 0.1 second_call = mock_inference_api.openai_chat_completion.call_args_list[1] - assert second_call.kwargs["messages"][-1].content == "Dublin" - assert second_call.kwargs["temperature"] == 0.1 + second_params = second_call.args[0] + assert second_params.messages[-1].content == "Dublin" + assert second_params.temperature == 0.1 openai_responses_impl.tool_groups_api.get_tool.assert_called_once_with("web_search") openai_responses_impl.tool_runtime_api.invoke_tool.assert_called_once_with( @@ -348,9 +353,10 @@ async def test_create_openai_response_with_tool_call_type_none(openai_responses_ # Verify inference API was called correctly (after iterating over result) first_call = mock_inference_api.openai_chat_completion.call_args_list[0] - assert first_call.kwargs["messages"][0].content == input_text - assert first_call.kwargs["tools"] is not None - assert first_call.kwargs["temperature"] == 0.1 + first_params = first_call.args[0] + assert first_params.messages[0].content == input_text + assert first_params.tools is not None + assert first_params.temperature == 0.1 # Check response.created event (should have empty output) assert len(chunks[0].response.output) == 0 @@ -394,9 +400,10 @@ async def test_create_openai_response_with_tool_call_function_arguments_none(ope def assert_common_expectations(chunks) -> None: first_call = mock_inference_api.openai_chat_completion.call_args_list[0] - assert first_call.kwargs["messages"][0].content == input_text - assert first_call.kwargs["tools"] is not None - assert first_call.kwargs["temperature"] == 0.1 + first_params = first_call.args[0] + assert first_params.messages[0].content == input_text + assert first_params.tools is not None + assert first_params.temperature == 0.1 assert len(chunks[0].response.output) == 0 completed_chunk = chunks[-1] assert completed_chunk.type == "response.completed" @@ -512,7 +519,9 @@ async def test_create_openai_response_with_multiple_messages(openai_responses_im # Verify the the correct messages were sent to the inference API i.e. 
# All of the responses message were convered to the chat completion message objects - inference_messages = mock_inference_api.openai_chat_completion.call_args_list[0].kwargs["messages"] + call_args = mock_inference_api.openai_chat_completion.call_args_list[0] + params = call_args.args[0] + inference_messages = params.messages for i, m in enumerate(input_messages): if isinstance(m.content, str): assert inference_messages[i].content == m.content @@ -680,7 +689,8 @@ async def test_create_openai_response_with_instructions(openai_responses_impl, m # Verify mock_inference_api.openai_chat_completion.assert_called_once() call_args = mock_inference_api.openai_chat_completion.call_args - sent_messages = call_args.kwargs["messages"] + params = call_args.args[0] + sent_messages = params.messages # Check that instructions were prepended as a system message assert len(sent_messages) == 2 @@ -718,7 +728,8 @@ async def test_create_openai_response_with_instructions_and_multiple_messages( # Verify mock_inference_api.openai_chat_completion.assert_called_once() call_args = mock_inference_api.openai_chat_completion.call_args - sent_messages = call_args.kwargs["messages"] + params = call_args.args[0] + sent_messages = params.messages # Check that instructions were prepended as a system message assert len(sent_messages) == 4 # 1 system + 3 input messages @@ -778,7 +789,8 @@ async def test_create_openai_response_with_instructions_and_previous_response( # Verify mock_inference_api.openai_chat_completion.assert_called_once() call_args = mock_inference_api.openai_chat_completion.call_args - sent_messages = call_args.kwargs["messages"] + params = call_args.args[0] + sent_messages = params.messages # Check that instructions were prepended as a system message assert len(sent_messages) == 4, sent_messages @@ -1018,7 +1030,8 @@ async def test_reuse_mcp_tool_list( ) assert len(mock_inference_api.openai_chat_completion.call_args_list) == 2 second_call = mock_inference_api.openai_chat_completion.call_args_list[1] - tools_seen = second_call.kwargs["tools"] + second_params = second_call.args[0] + tools_seen = second_params.tools assert len(tools_seen) == 1 assert tools_seen[0]["function"]["name"] == "test_tool" assert tools_seen[0]["function"]["description"] == "a test tool" @@ -1065,8 +1078,9 @@ async def test_create_openai_response_with_text_format( # Verify first_call = mock_inference_api.openai_chat_completion.call_args_list[0] - assert first_call.kwargs["messages"][0].content == input_text - assert first_call.kwargs["response_format"] == response_format + first_params = first_call.args[0] + assert first_params.messages[0].content == input_text + assert first_params.response_format == response_format async def test_create_openai_response_with_invalid_text_format(openai_responses_impl, mock_inference_api): diff --git a/tests/unit/providers/inference/test_remote_vllm.py b/tests/unit/providers/inference/test_remote_vllm.py index 6d6bb20d5..569fb5031 100644 --- a/tests/unit/providers/inference/test_remote_vllm.py +++ b/tests/unit/providers/inference/test_remote_vllm.py @@ -13,6 +13,7 @@ import pytest from llama_stack.apis.inference import ( OpenAIAssistantMessageParam, OpenAIChatCompletion, + OpenAIChatCompletionRequest, OpenAIChoice, ToolChoice, ) @@ -56,13 +57,14 @@ async def test_old_vllm_tool_choice(vllm_inference_adapter): mock_client_property.return_value = mock_client # No tools but auto tool choice - await vllm_inference_adapter.openai_chat_completion( - "mock-model", - [], + params = OpenAIChatCompletionRequest( + 
model="mock-model", + messages=[{"role": "user", "content": "test"}], stream=False, tools=None, tool_choice=ToolChoice.auto.value, ) + await vllm_inference_adapter.openai_chat_completion(params) mock_client.chat.completions.create.assert_called() call_args = mock_client.chat.completions.create.call_args # Ensure tool_choice gets converted to none for older vLLM versions @@ -171,9 +173,12 @@ async def test_openai_chat_completion_is_async(vllm_inference_adapter): ) async def do_inference(): - await vllm_inference_adapter.openai_chat_completion( - "mock-model", messages=["one fish", "two fish"], stream=False + params = OpenAIChatCompletionRequest( + model="mock-model", + messages=[{"role": "user", "content": "one fish two fish"}], + stream=False, ) + await vllm_inference_adapter.openai_chat_completion(params) with patch.object(VLLMInferenceAdapter, "client", new_callable=PropertyMock) as mock_create_client: mock_client = MagicMock() diff --git a/tests/unit/providers/utils/inference/test_openai_mixin.py b/tests/unit/providers/utils/inference/test_openai_mixin.py index 8ce4925e1..4a24d72ed 100644 --- a/tests/unit/providers/utils/inference/test_openai_mixin.py +++ b/tests/unit/providers/utils/inference/test_openai_mixin.py @@ -12,7 +12,7 @@ from unittest.mock import AsyncMock, MagicMock, Mock, PropertyMock, patch import pytest from pydantic import BaseModel, Field -from llama_stack.apis.inference import Model, OpenAIUserMessageParam +from llama_stack.apis.inference import Model, OpenAIChatCompletionRequest, OpenAIUserMessageParam from llama_stack.apis.models import ModelType from llama_stack.core.request_headers import request_provider_data_context from llama_stack.providers.utils.inference.model_registry import RemoteInferenceProviderConfig @@ -271,7 +271,8 @@ class TestOpenAIMixinImagePreprocessing: with patch("llama_stack.providers.utils.inference.openai_mixin.localize_image_content") as mock_localize: mock_localize.return_value = (b"fake_image_data", "jpeg") - await mixin.openai_chat_completion(model="test-model", messages=[message]) + params = OpenAIChatCompletionRequest(model="test-model", messages=[message]) + await mixin.openai_chat_completion(params) mock_localize.assert_called_once_with("http://example.com/image.jpg") @@ -303,7 +304,8 @@ class TestOpenAIMixinImagePreprocessing: with patch.object(type(mixin), "client", new_callable=PropertyMock, return_value=mock_client): with patch("llama_stack.providers.utils.inference.openai_mixin.localize_image_content") as mock_localize: - await mixin.openai_chat_completion(model="test-model", messages=[message]) + params = OpenAIChatCompletionRequest(model="test-model", messages=[message]) + await mixin.openai_chat_completion(params) mock_localize.assert_not_called() From 10c7e67fca7c855bba596b133d834c129d37fb6b Mon Sep 17 00:00:00 2001 From: Eric Huang Date: Fri, 10 Oct 2025 15:46:56 -0700 Subject: [PATCH 2/2] featu: support passing "extra body" throught to providers # What does this PR do? Allows passing through extra_body parameters to inference providers. 
closes #2720 ## Test Plan CI and added new test --- docs/static/deprecated-llama-stack-spec.html | 23 +- docs/static/deprecated-llama-stack-spec.yaml | 24 +- docs/static/llama-stack-spec.html | 23 +- docs/static/llama-stack-spec.yaml | 24 +- docs/static/stainless-llama-stack-spec.html | 23 +- docs/static/stainless-llama-stack-spec.yaml | 24 +- llama_stack/apis/inference/inference.py | 24 +- llama_stack/core/routers/inference.py | 10 +- .../agents/meta_reference/agent_instance.py | 4 +- .../meta_reference/responses/streaming.py | 4 +- .../inline/batches/reference/batches.py | 8 +- .../inline/eval/meta_reference/eval.py | 8 +- .../inference/meta_reference/inference.py | 8 +- .../sentence_transformers.py | 8 +- .../inline/safety/llama_guard/llama_guard.py | 6 +- .../scoring_fn/llm_as_judge_scoring_fn.py | 4 +- .../tool_runtime/rag/context_retriever.py | 4 +- .../remote/inference/bedrock/bedrock.py | 8 +- .../remote/inference/databricks/databricks.py | 4 +- .../inference/llama_openai_compat/llama.py | 9 +- .../inference/passthrough/passthrough.py | 8 +- .../remote/inference/runpod/runpod.py | 4 +- .../providers/remote/inference/vllm/vllm.py | 4 +- .../utils/inference/litellm_openai_mixin.py | 10 +- .../providers/utils/inference/openai_mixin.py | 24 +- ...34a95f56931b792d5939f4cebc57-9ecd9600.json | 881 ++++++++++++++++++ ...34a95f56931b792d5939f4cebc57-fb68f5a6.json | 45 + ...f93f2ea6ed882f1186cf4fdda5bb-f15cee9a.json | 543 +++++++++++ ...b890b4e65a5f9917a2d75c5795782ab7cbfff.json | 48 + ...92002485023b937d72b7aa8d4c15c9204fc5c.json | 54 ++ ...30b99015b5ed0e2bbf24418a31146ffcbca9b.json | 54 ++ .../inference/test_openai_completion.py | 2 +- .../meta_reference/test_openai_responses.py | 4 +- .../providers/inference/test_remote_vllm.py | 156 +++- .../utils/inference/test_openai_mixin.py | 6 +- 35 files changed, 1893 insertions(+), 200 deletions(-) create mode 100644 tests/integration/common/recordings/models-64a2277c90f0f42576f60c1030e3a020403d34a95f56931b792d5939f4cebc57-9ecd9600.json create mode 100644 tests/integration/common/recordings/models-64a2277c90f0f42576f60c1030e3a020403d34a95f56931b792d5939f4cebc57-fb68f5a6.json create mode 100644 tests/integration/common/recordings/models-bd3df37825f32706c88677a327960bfa47dcf93f2ea6ed882f1186cf4fdda5bb-f15cee9a.json create mode 100644 tests/integration/inference/recordings/d2ba309413e85d6166f7543a879b890b4e65a5f9917a2d75c5795782ab7cbfff.json create mode 100644 tests/integration/inference/recordings/e3727f6c749ab8bdee2f581300092002485023b937d72b7aa8d4c15c9204fc5c.json create mode 100644 tests/integration/inference/recordings/f02f1bfd75adaea87b91dedc59430b99015b5ed0e2bbf24418a31146ffcbca9b.json diff --git a/docs/static/deprecated-llama-stack-spec.html b/docs/static/deprecated-llama-stack-spec.html index f9bcb48f7..570b0b750 100644 --- a/docs/static/deprecated-llama-stack-spec.html +++ b/docs/static/deprecated-llama-stack-spec.html @@ -1527,7 +1527,7 @@ "content": { "application/json": { "schema": { - "$ref": "#/components/schemas/OpenAIChatCompletionRequest" + "$ref": "#/components/schemas/OpenAIChatCompletionRequestWithExtraBody" } } }, @@ -1617,7 +1617,7 @@ "content": { "application/json": { "schema": { - "$ref": "#/components/schemas/OpenAICompletionRequest" + "$ref": "#/components/schemas/OpenAICompletionRequestWithExtraBody" } } }, @@ -7522,7 +7522,7 @@ "title": "OpenAIResponseFormatText", "description": "Text response format for OpenAI-compatible chat completion requests." 
}, - "OpenAIChatCompletionRequest": { + "OpenAIChatCompletionRequestWithExtraBody": { "type": "object", "properties": { "model": { @@ -7769,7 +7769,7 @@ "model", "messages" ], - "title": "OpenAIChatCompletionRequest", + "title": "OpenAIChatCompletionRequestWithExtraBody", "description": "Request parameters for OpenAI-compatible chat completion endpoint." }, "OpenAIChatCompletion": { @@ -7966,7 +7966,7 @@ ], "title": "OpenAICompletionWithInputMessages" }, - "OpenAICompletionRequest": { + "OpenAICompletionRequestWithExtraBody": { "type": "object", "properties": { "model": { @@ -8097,17 +8097,6 @@ "type": "string", "description": "(Optional) The user to use." }, - "guided_choice": { - "type": "array", - "items": { - "type": "string" - }, - "description": "(Optional) vLLM-specific parameter for guided generation with a list of choices." - }, - "prompt_logprobs": { - "type": "integer", - "description": "(Optional) vLLM-specific parameter for number of log probabilities to return for prompt tokens." - }, "suffix": { "type": "string", "description": "(Optional) The suffix that should be appended to the completion." @@ -8118,7 +8107,7 @@ "model", "prompt" ], - "title": "OpenAICompletionRequest", + "title": "OpenAICompletionRequestWithExtraBody", "description": "Request parameters for OpenAI-compatible completion endpoint." }, "OpenAICompletion": { diff --git a/docs/static/deprecated-llama-stack-spec.yaml b/docs/static/deprecated-llama-stack-spec.yaml index 552555f7a..845e51f8c 100644 --- a/docs/static/deprecated-llama-stack-spec.yaml +++ b/docs/static/deprecated-llama-stack-spec.yaml @@ -1098,7 +1098,7 @@ paths: content: application/json: schema: - $ref: '#/components/schemas/OpenAIChatCompletionRequest' + $ref: '#/components/schemas/OpenAIChatCompletionRequestWithExtraBody' required: true deprecated: true /v1/openai/v1/chat/completions/{completion_id}: @@ -1167,7 +1167,7 @@ paths: content: application/json: schema: - $ref: '#/components/schemas/OpenAICompletionRequest' + $ref: '#/components/schemas/OpenAICompletionRequestWithExtraBody' required: true deprecated: true /v1/openai/v1/embeddings: @@ -5575,7 +5575,7 @@ components: title: OpenAIResponseFormatText description: >- Text response format for OpenAI-compatible chat completion requests. - OpenAIChatCompletionRequest: + OpenAIChatCompletionRequestWithExtraBody: type: object properties: model: @@ -5717,7 +5717,7 @@ components: required: - model - messages - title: OpenAIChatCompletionRequest + title: OpenAIChatCompletionRequestWithExtraBody description: >- Request parameters for OpenAI-compatible chat completion endpoint. OpenAIChatCompletion: @@ -5885,7 +5885,7 @@ components: - model - input_messages title: OpenAICompletionWithInputMessages - OpenAICompletionRequest: + OpenAICompletionRequestWithExtraBody: type: object properties: model: @@ -5973,18 +5973,6 @@ components: user: type: string description: (Optional) The user to use. - guided_choice: - type: array - items: - type: string - description: >- - (Optional) vLLM-specific parameter for guided generation with a list of - choices. - prompt_logprobs: - type: integer - description: >- - (Optional) vLLM-specific parameter for number of log probabilities to - return for prompt tokens. suffix: type: string description: >- @@ -5993,7 +5981,7 @@ components: required: - model - prompt - title: OpenAICompletionRequest + title: OpenAICompletionRequestWithExtraBody description: >- Request parameters for OpenAI-compatible completion endpoint. 
OpenAICompletion: diff --git a/docs/static/llama-stack-spec.html b/docs/static/llama-stack-spec.html index 8f8ff66c9..cc656063d 100644 --- a/docs/static/llama-stack-spec.html +++ b/docs/static/llama-stack-spec.html @@ -153,7 +153,7 @@ "content": { "application/json": { "schema": { - "$ref": "#/components/schemas/OpenAIChatCompletionRequest" + "$ref": "#/components/schemas/OpenAIChatCompletionRequestWithExtraBody" } } }, @@ -243,7 +243,7 @@ "content": { "application/json": { "schema": { - "$ref": "#/components/schemas/OpenAICompletionRequest" + "$ref": "#/components/schemas/OpenAICompletionRequestWithExtraBody" } } }, @@ -5018,7 +5018,7 @@ "title": "OpenAIResponseFormatText", "description": "Text response format for OpenAI-compatible chat completion requests." }, - "OpenAIChatCompletionRequest": { + "OpenAIChatCompletionRequestWithExtraBody": { "type": "object", "properties": { "model": { @@ -5265,7 +5265,7 @@ "model", "messages" ], - "title": "OpenAIChatCompletionRequest", + "title": "OpenAIChatCompletionRequestWithExtraBody", "description": "Request parameters for OpenAI-compatible chat completion endpoint." }, "OpenAIChatCompletion": { @@ -5462,7 +5462,7 @@ ], "title": "OpenAICompletionWithInputMessages" }, - "OpenAICompletionRequest": { + "OpenAICompletionRequestWithExtraBody": { "type": "object", "properties": { "model": { @@ -5593,17 +5593,6 @@ "type": "string", "description": "(Optional) The user to use." }, - "guided_choice": { - "type": "array", - "items": { - "type": "string" - }, - "description": "(Optional) vLLM-specific parameter for guided generation with a list of choices." - }, - "prompt_logprobs": { - "type": "integer", - "description": "(Optional) vLLM-specific parameter for number of log probabilities to return for prompt tokens." - }, "suffix": { "type": "string", "description": "(Optional) The suffix that should be appended to the completion." @@ -5614,7 +5603,7 @@ "model", "prompt" ], - "title": "OpenAICompletionRequest", + "title": "OpenAICompletionRequestWithExtraBody", "description": "Request parameters for OpenAI-compatible completion endpoint." }, "OpenAICompletion": { diff --git a/docs/static/llama-stack-spec.yaml b/docs/static/llama-stack-spec.yaml index 97742f19a..66e84b4f2 100644 --- a/docs/static/llama-stack-spec.yaml +++ b/docs/static/llama-stack-spec.yaml @@ -98,7 +98,7 @@ paths: content: application/json: schema: - $ref: '#/components/schemas/OpenAIChatCompletionRequest' + $ref: '#/components/schemas/OpenAIChatCompletionRequestWithExtraBody' required: true deprecated: false /v1/chat/completions/{completion_id}: @@ -167,7 +167,7 @@ paths: content: application/json: schema: - $ref: '#/components/schemas/OpenAICompletionRequest' + $ref: '#/components/schemas/OpenAICompletionRequestWithExtraBody' required: true deprecated: false /v1/conversations: @@ -3824,7 +3824,7 @@ components: title: OpenAIResponseFormatText description: >- Text response format for OpenAI-compatible chat completion requests. - OpenAIChatCompletionRequest: + OpenAIChatCompletionRequestWithExtraBody: type: object properties: model: @@ -3966,7 +3966,7 @@ components: required: - model - messages - title: OpenAIChatCompletionRequest + title: OpenAIChatCompletionRequestWithExtraBody description: >- Request parameters for OpenAI-compatible chat completion endpoint. 
OpenAIChatCompletion: @@ -4134,7 +4134,7 @@ components: - model - input_messages title: OpenAICompletionWithInputMessages - OpenAICompletionRequest: + OpenAICompletionRequestWithExtraBody: type: object properties: model: @@ -4222,18 +4222,6 @@ components: user: type: string description: (Optional) The user to use. - guided_choice: - type: array - items: - type: string - description: >- - (Optional) vLLM-specific parameter for guided generation with a list of - choices. - prompt_logprobs: - type: integer - description: >- - (Optional) vLLM-specific parameter for number of log probabilities to - return for prompt tokens. suffix: type: string description: >- @@ -4242,7 +4230,7 @@ components: required: - model - prompt - title: OpenAICompletionRequest + title: OpenAICompletionRequestWithExtraBody description: >- Request parameters for OpenAI-compatible completion endpoint. OpenAICompletion: diff --git a/docs/static/stainless-llama-stack-spec.html b/docs/static/stainless-llama-stack-spec.html index fcdcd76c5..10305b239 100644 --- a/docs/static/stainless-llama-stack-spec.html +++ b/docs/static/stainless-llama-stack-spec.html @@ -153,7 +153,7 @@ "content": { "application/json": { "schema": { - "$ref": "#/components/schemas/OpenAIChatCompletionRequest" + "$ref": "#/components/schemas/OpenAIChatCompletionRequestWithExtraBody" } } }, @@ -243,7 +243,7 @@ "content": { "application/json": { "schema": { - "$ref": "#/components/schemas/OpenAICompletionRequest" + "$ref": "#/components/schemas/OpenAICompletionRequestWithExtraBody" } } }, @@ -7027,7 +7027,7 @@ "title": "OpenAIResponseFormatText", "description": "Text response format for OpenAI-compatible chat completion requests." }, - "OpenAIChatCompletionRequest": { + "OpenAIChatCompletionRequestWithExtraBody": { "type": "object", "properties": { "model": { @@ -7274,7 +7274,7 @@ "model", "messages" ], - "title": "OpenAIChatCompletionRequest", + "title": "OpenAIChatCompletionRequestWithExtraBody", "description": "Request parameters for OpenAI-compatible chat completion endpoint." }, "OpenAIChatCompletion": { @@ -7471,7 +7471,7 @@ ], "title": "OpenAICompletionWithInputMessages" }, - "OpenAICompletionRequest": { + "OpenAICompletionRequestWithExtraBody": { "type": "object", "properties": { "model": { @@ -7602,17 +7602,6 @@ "type": "string", "description": "(Optional) The user to use." }, - "guided_choice": { - "type": "array", - "items": { - "type": "string" - }, - "description": "(Optional) vLLM-specific parameter for guided generation with a list of choices." - }, - "prompt_logprobs": { - "type": "integer", - "description": "(Optional) vLLM-specific parameter for number of log probabilities to return for prompt tokens." - }, "suffix": { "type": "string", "description": "(Optional) The suffix that should be appended to the completion." @@ -7623,7 +7612,7 @@ "model", "prompt" ], - "title": "OpenAICompletionRequest", + "title": "OpenAICompletionRequestWithExtraBody", "description": "Request parameters for OpenAI-compatible completion endpoint." 
}, "OpenAICompletion": { diff --git a/docs/static/stainless-llama-stack-spec.yaml b/docs/static/stainless-llama-stack-spec.yaml index 09fc3ded4..afeeabc62 100644 --- a/docs/static/stainless-llama-stack-spec.yaml +++ b/docs/static/stainless-llama-stack-spec.yaml @@ -101,7 +101,7 @@ paths: content: application/json: schema: - $ref: '#/components/schemas/OpenAIChatCompletionRequest' + $ref: '#/components/schemas/OpenAIChatCompletionRequestWithExtraBody' required: true deprecated: false /v1/chat/completions/{completion_id}: @@ -170,7 +170,7 @@ paths: content: application/json: schema: - $ref: '#/components/schemas/OpenAICompletionRequest' + $ref: '#/components/schemas/OpenAICompletionRequestWithExtraBody' required: true deprecated: false /v1/conversations: @@ -5269,7 +5269,7 @@ components: title: OpenAIResponseFormatText description: >- Text response format for OpenAI-compatible chat completion requests. - OpenAIChatCompletionRequest: + OpenAIChatCompletionRequestWithExtraBody: type: object properties: model: @@ -5411,7 +5411,7 @@ components: required: - model - messages - title: OpenAIChatCompletionRequest + title: OpenAIChatCompletionRequestWithExtraBody description: >- Request parameters for OpenAI-compatible chat completion endpoint. OpenAIChatCompletion: @@ -5579,7 +5579,7 @@ components: - model - input_messages title: OpenAICompletionWithInputMessages - OpenAICompletionRequest: + OpenAICompletionRequestWithExtraBody: type: object properties: model: @@ -5667,18 +5667,6 @@ components: user: type: string description: (Optional) The user to use. - guided_choice: - type: array - items: - type: string - description: >- - (Optional) vLLM-specific parameter for guided generation with a list of - choices. - prompt_logprobs: - type: integer - description: >- - (Optional) vLLM-specific parameter for number of log probabilities to - return for prompt tokens. suffix: type: string description: >- @@ -5687,7 +5675,7 @@ components: required: - model - prompt - title: OpenAICompletionRequest + title: OpenAICompletionRequestWithExtraBody description: >- Request parameters for OpenAI-compatible completion endpoint. OpenAICompletion: diff --git a/llama_stack/apis/inference/inference.py b/llama_stack/apis/inference/inference.py index fb3e78afc..85339e2e0 100644 --- a/llama_stack/apis/inference/inference.py +++ b/llama_stack/apis/inference/inference.py @@ -15,7 +15,7 @@ from typing import ( ) from fastapi import Body -from pydantic import BaseModel, ConfigDict, Field, field_validator +from pydantic import BaseModel, Field, field_validator from typing_extensions import TypedDict from llama_stack.apis.common.content_types import ContentDelta, InterleavedContent @@ -1036,8 +1036,9 @@ class ListOpenAIChatCompletionResponse(BaseModel): object: Literal["list"] = "list" +# extra_body can be accessed via .model_extra @json_schema_type -class OpenAICompletionRequest(BaseModel): +class OpenAICompletionRequestWithExtraBody(BaseModel, extra="allow"): """Request parameters for OpenAI-compatible completion endpoint. :param model: The identifier of the model to use. The model must be registered with Llama Stack and available via the /models endpoint. @@ -1058,12 +1059,8 @@ class OpenAICompletionRequest(BaseModel): :param top_p: (Optional) The top p to use. :param user: (Optional) The user to use. :param suffix: (Optional) The suffix that should be appended to the completion. - :param guided_choice: (Optional) vLLM-specific parameter for guided generation with a list of choices. 
- :param prompt_logprobs: (Optional) vLLM-specific parameter for number of log probabilities to return for prompt tokens. """ - model_config = ConfigDict(extra="allow") - # Standard OpenAI completion parameters model: str prompt: str | list[str] | list[int] | list[list[int]] @@ -1082,17 +1079,12 @@ class OpenAICompletionRequest(BaseModel): temperature: float | None = None top_p: float | None = None user: str | None = None - - # vLLM-specific parameters (documented here but also allowed via extra fields) - guided_choice: list[str] | None = None - prompt_logprobs: int | None = None - - # for fill-in-the-middle type completion suffix: str | None = None +# extra_body can be accessed via .model_extra @json_schema_type -class OpenAIChatCompletionRequest(BaseModel): +class OpenAIChatCompletionRequestWithExtraBody(BaseModel, extra="allow"): """Request parameters for OpenAI-compatible chat completion endpoint. :param model: The identifier of the model to use. The model must be registered with Llama Stack and available via the /models endpoint. @@ -1120,8 +1112,6 @@ class OpenAIChatCompletionRequest(BaseModel): :param user: (Optional) The user to use. """ - model_config = ConfigDict(extra="allow") - # Standard OpenAI chat completion parameters model: str messages: Annotated[list[OpenAIMessageParam], Field(..., min_length=1)] @@ -1182,7 +1172,7 @@ class InferenceProvider(Protocol): @webmethod(route="/completions", method="POST", level=LLAMA_STACK_API_V1) async def openai_completion( self, - params: Annotated[OpenAICompletionRequest, Body(...)], + params: Annotated[OpenAICompletionRequestWithExtraBody, Body(...)], ) -> OpenAICompletion: """Create completion. @@ -1195,7 +1185,7 @@ class InferenceProvider(Protocol): @webmethod(route="/chat/completions", method="POST", level=LLAMA_STACK_API_V1) async def openai_chat_completion( self, - params: Annotated[OpenAIChatCompletionRequest, Body(...)], + params: Annotated[OpenAIChatCompletionRequestWithExtraBody, Body(...)], ) -> OpenAIChatCompletion | AsyncIterator[OpenAIChatCompletionChunk]: """Create chat completions. 
diff --git a/llama_stack/core/routers/inference.py b/llama_stack/core/routers/inference.py index 5c7532e70..e16d08371 100644 --- a/llama_stack/core/routers/inference.py +++ b/llama_stack/core/routers/inference.py @@ -32,13 +32,13 @@ from llama_stack.apis.inference import ( OpenAIAssistantMessageParam, OpenAIChatCompletion, OpenAIChatCompletionChunk, - OpenAIChatCompletionRequest, + OpenAIChatCompletionRequestWithExtraBody, OpenAIChatCompletionToolCall, OpenAIChatCompletionToolCallFunction, OpenAIChoice, OpenAIChoiceLogprobs, OpenAICompletion, - OpenAICompletionRequest, + OpenAICompletionRequestWithExtraBody, OpenAICompletionWithInputMessages, OpenAIEmbeddingsResponse, OpenAIMessageParam, @@ -183,7 +183,7 @@ class InferenceRouter(Inference): async def openai_completion( self, - params: Annotated[OpenAICompletionRequest, Body(...)], + params: Annotated[OpenAICompletionRequestWithExtraBody, Body(...)], ) -> OpenAICompletion: logger.debug( f"InferenceRouter.openai_completion: model={params.model}, stream={params.stream}, prompt={params.prompt}", @@ -218,7 +218,7 @@ class InferenceRouter(Inference): async def openai_chat_completion( self, - params: Annotated[OpenAIChatCompletionRequest, Body(...)], + params: Annotated[OpenAIChatCompletionRequestWithExtraBody, Body(...)], ) -> OpenAIChatCompletion | AsyncIterator[OpenAIChatCompletionChunk]: logger.debug( f"InferenceRouter.openai_chat_completion: model={params.model}, stream={params.stream}, messages={params.messages}", @@ -317,7 +317,7 @@ class InferenceRouter(Inference): raise NotImplementedError("Get chat completion is not supported: inference store is not configured.") async def _nonstream_openai_chat_completion( - self, provider: Inference, params: OpenAIChatCompletionRequest + self, provider: Inference, params: OpenAIChatCompletionRequestWithExtraBody ) -> OpenAIChatCompletion: response = await provider.openai_chat_completion(params) for choice in response.choices: diff --git a/llama_stack/providers/inline/agents/meta_reference/agent_instance.py b/llama_stack/providers/inline/agents/meta_reference/agent_instance.py index 696fa9c97..96f271669 100644 --- a/llama_stack/providers/inline/agents/meta_reference/agent_instance.py +++ b/llama_stack/providers/inline/agents/meta_reference/agent_instance.py @@ -49,7 +49,7 @@ from llama_stack.apis.inference import ( Inference, Message, OpenAIAssistantMessageParam, - OpenAIChatCompletionRequest, + OpenAIChatCompletionRequestWithExtraBody, OpenAIDeveloperMessageParam, OpenAIMessageParam, OpenAISystemMessageParam, @@ -583,7 +583,7 @@ class ChatAgent(ShieldRunnerMixin): max_tokens = getattr(sampling_params, "max_tokens", None) # Use OpenAI chat completion - params = OpenAIChatCompletionRequest( + params = OpenAIChatCompletionRequestWithExtraBody( model=self.agent_config.model, messages=openai_messages, tools=openai_tools if openai_tools else None, diff --git a/llama_stack/providers/inline/agents/meta_reference/responses/streaming.py b/llama_stack/providers/inline/agents/meta_reference/responses/streaming.py index 6c1204fd4..cfd69cdeb 100644 --- a/llama_stack/providers/inline/agents/meta_reference/responses/streaming.py +++ b/llama_stack/providers/inline/agents/meta_reference/responses/streaming.py @@ -49,7 +49,7 @@ from llama_stack.apis.inference import ( OpenAIAssistantMessageParam, OpenAIChatCompletion, OpenAIChatCompletionChunk, - OpenAIChatCompletionRequest, + OpenAIChatCompletionRequestWithExtraBody, OpenAIChatCompletionToolCall, OpenAIChoice, OpenAIMessageParam, @@ -169,7 +169,7 @@ class 
StreamingResponseOrchestrator: # (some providers don't support non-empty response_format when tools are present) response_format = None if self.ctx.response_format.type == "text" else self.ctx.response_format logger.debug(f"calling openai_chat_completion with tools: {self.ctx.chat_tools}") - params = OpenAIChatCompletionRequest( + params = OpenAIChatCompletionRequestWithExtraBody( model=self.ctx.model, messages=messages, tools=self.ctx.chat_tools, diff --git a/llama_stack/providers/inline/batches/reference/batches.py b/llama_stack/providers/inline/batches/reference/batches.py index 48690f177..102537dd7 100644 --- a/llama_stack/providers/inline/batches/reference/batches.py +++ b/llama_stack/providers/inline/batches/reference/batches.py @@ -22,8 +22,8 @@ from llama_stack.apis.files import Files, OpenAIFilePurpose from llama_stack.apis.inference import ( Inference, OpenAIAssistantMessageParam, - OpenAIChatCompletionRequest, - OpenAICompletionRequest, + OpenAIChatCompletionRequestWithExtraBody, + OpenAICompletionRequestWithExtraBody, OpenAIDeveloperMessageParam, OpenAIMessageParam, OpenAISystemMessageParam, @@ -608,7 +608,7 @@ class ReferenceBatchesImpl(Batches): # TODO(SECURITY): review body for security issues if request.url == "/v1/chat/completions": request.body["messages"] = [convert_to_openai_message_param(msg) for msg in request.body["messages"]] - chat_params = OpenAIChatCompletionRequest(**request.body) + chat_params = OpenAIChatCompletionRequestWithExtraBody(**request.body) chat_response = await self.inference_api.openai_chat_completion(chat_params) # this is for mypy, we don't allow streaming so we'll get the right type @@ -623,7 +623,7 @@ class ReferenceBatchesImpl(Batches): }, } elif request.url == "/v1/completions": - completion_params = OpenAICompletionRequest(**request.body) + completion_params = OpenAICompletionRequestWithExtraBody(**request.body) completion_response = await self.inference_api.openai_completion(completion_params) # this is for mypy, we don't allow streaming so we'll get the right type diff --git a/llama_stack/providers/inline/eval/meta_reference/eval.py b/llama_stack/providers/inline/eval/meta_reference/eval.py index 1318f3104..3c1e2e462 100644 --- a/llama_stack/providers/inline/eval/meta_reference/eval.py +++ b/llama_stack/providers/inline/eval/meta_reference/eval.py @@ -14,8 +14,8 @@ from llama_stack.apis.datasetio import DatasetIO from llama_stack.apis.datasets import Datasets from llama_stack.apis.inference import ( Inference, - OpenAIChatCompletionRequest, - OpenAICompletionRequest, + OpenAIChatCompletionRequestWithExtraBody, + OpenAICompletionRequestWithExtraBody, OpenAISystemMessageParam, OpenAIUserMessageParam, UserMessage, @@ -175,7 +175,7 @@ class MetaReferenceEvalImpl( sampling_params["stop"] = candidate.sampling_params.stop input_content = json.loads(x[ColumnName.completion_input.value]) - params = OpenAICompletionRequest( + params = OpenAICompletionRequestWithExtraBody( model=candidate.model, prompt=input_content, **sampling_params, @@ -195,7 +195,7 @@ class MetaReferenceEvalImpl( messages += [OpenAISystemMessageParam(**x) for x in chat_completion_input_json if x["role"] == "system"] messages += input_messages - params = OpenAIChatCompletionRequest( + params = OpenAIChatCompletionRequestWithExtraBody( model=candidate.model, messages=messages, **sampling_params, diff --git a/llama_stack/providers/inline/inference/meta_reference/inference.py b/llama_stack/providers/inline/inference/meta_reference/inference.py index 72813b4fd..286335a7d 100644 --- 
a/llama_stack/providers/inline/inference/meta_reference/inference.py +++ b/llama_stack/providers/inline/inference/meta_reference/inference.py @@ -9,8 +9,8 @@ from collections.abc import AsyncIterator from llama_stack.apis.inference import ( InferenceProvider, - OpenAIChatCompletionRequest, - OpenAICompletionRequest, + OpenAIChatCompletionRequestWithExtraBody, + OpenAICompletionRequestWithExtraBody, ) from llama_stack.apis.inference.inference import ( OpenAIChatCompletion, @@ -67,7 +67,7 @@ class MetaReferenceInferenceImpl( async def openai_completion( self, - params: OpenAICompletionRequest, + params: OpenAICompletionRequestWithExtraBody, ) -> OpenAICompletion: raise NotImplementedError("OpenAI completion not supported by meta reference provider") @@ -153,6 +153,6 @@ class MetaReferenceInferenceImpl( async def openai_chat_completion( self, - params: OpenAIChatCompletionRequest, + params: OpenAIChatCompletionRequestWithExtraBody, ) -> OpenAIChatCompletion | AsyncIterator[OpenAIChatCompletionChunk]: raise NotImplementedError("OpenAI chat completion not supported by meta-reference inference provider") diff --git a/llama_stack/providers/inline/inference/sentence_transformers/sentence_transformers.py b/llama_stack/providers/inline/inference/sentence_transformers/sentence_transformers.py index 4aac2c3d8..306e1325e 100644 --- a/llama_stack/providers/inline/inference/sentence_transformers/sentence_transformers.py +++ b/llama_stack/providers/inline/inference/sentence_transformers/sentence_transformers.py @@ -8,8 +8,8 @@ from collections.abc import AsyncIterator from llama_stack.apis.inference import ( InferenceProvider, - OpenAIChatCompletionRequest, - OpenAICompletionRequest, + OpenAIChatCompletionRequestWithExtraBody, + OpenAICompletionRequestWithExtraBody, ) from llama_stack.apis.inference.inference import ( OpenAIChatCompletion, @@ -72,12 +72,12 @@ class SentenceTransformersInferenceImpl( async def openai_completion( self, - params: OpenAICompletionRequest, + params: OpenAICompletionRequestWithExtraBody, ) -> OpenAICompletion: raise NotImplementedError("OpenAI completion not supported by sentence transformers provider") async def openai_chat_completion( self, - params: OpenAIChatCompletionRequest, + params: OpenAIChatCompletionRequestWithExtraBody, ) -> OpenAIChatCompletion | AsyncIterator[OpenAIChatCompletionChunk]: raise NotImplementedError("OpenAI chat completion not supported by sentence transformers provider") diff --git a/llama_stack/providers/inline/safety/llama_guard/llama_guard.py b/llama_stack/providers/inline/safety/llama_guard/llama_guard.py index c661de59c..e73aadedc 100644 --- a/llama_stack/providers/inline/safety/llama_guard/llama_guard.py +++ b/llama_stack/providers/inline/safety/llama_guard/llama_guard.py @@ -13,7 +13,7 @@ from llama_stack.apis.common.content_types import ImageContentItem, TextContentI from llama_stack.apis.inference import ( Inference, Message, - OpenAIChatCompletionRequest, + OpenAIChatCompletionRequestWithExtraBody, OpenAIUserMessageParam, UserMessage, ) @@ -296,7 +296,7 @@ class LlamaGuardShield: else: shield_input_message = self.build_text_shield_input(messages) - params = OpenAIChatCompletionRequest( + params = OpenAIChatCompletionRequestWithExtraBody( model=self.model, messages=[shield_input_message], stream=False, @@ -384,7 +384,7 @@ class LlamaGuardShield: # TODO: Add Image based support for OpenAI Moderations shield_input_message = self.build_text_shield_input(messages) - params = OpenAIChatCompletionRequest( + params = 
OpenAIChatCompletionRequestWithExtraBody( model=self.model, messages=[shield_input_message], stream=False, diff --git a/llama_stack/providers/inline/scoring/llm_as_judge/scoring_fn/llm_as_judge_scoring_fn.py b/llama_stack/providers/inline/scoring/llm_as_judge/scoring_fn/llm_as_judge_scoring_fn.py index f5e55d1d5..fbecb6e20 100644 --- a/llama_stack/providers/inline/scoring/llm_as_judge/scoring_fn/llm_as_judge_scoring_fn.py +++ b/llama_stack/providers/inline/scoring/llm_as_judge/scoring_fn/llm_as_judge_scoring_fn.py @@ -6,7 +6,7 @@ import re from typing import Any -from llama_stack.apis.inference import Inference, OpenAIChatCompletionRequest +from llama_stack.apis.inference import Inference, OpenAIChatCompletionRequestWithExtraBody from llama_stack.apis.scoring import ScoringResultRow from llama_stack.apis.scoring_functions import ScoringFnParams from llama_stack.providers.utils.scoring.base_scoring_fn import RegisteredBaseScoringFn @@ -55,7 +55,7 @@ class LlmAsJudgeScoringFn(RegisteredBaseScoringFn): generated_answer=generated_answer, ) - params = OpenAIChatCompletionRequest( + params = OpenAIChatCompletionRequestWithExtraBody( model=fn_def.params.judge_model, messages=[ { diff --git a/llama_stack/providers/inline/tool_runtime/rag/context_retriever.py b/llama_stack/providers/inline/tool_runtime/rag/context_retriever.py index 98098e2d2..14cbec49d 100644 --- a/llama_stack/providers/inline/tool_runtime/rag/context_retriever.py +++ b/llama_stack/providers/inline/tool_runtime/rag/context_retriever.py @@ -8,7 +8,7 @@ from jinja2 import Template from llama_stack.apis.common.content_types import InterleavedContent -from llama_stack.apis.inference import OpenAIChatCompletionRequest, OpenAIUserMessageParam +from llama_stack.apis.inference import OpenAIChatCompletionRequestWithExtraBody, OpenAIUserMessageParam from llama_stack.apis.tools.rag_tool import ( DefaultRAGQueryGeneratorConfig, LLMRAGQueryGeneratorConfig, @@ -65,7 +65,7 @@ async def llm_rag_query_generator( model = config.model message = OpenAIUserMessageParam(content=rendered_content) - params = OpenAIChatCompletionRequest( + params = OpenAIChatCompletionRequestWithExtraBody( model=model, messages=[message], stream=False, diff --git a/llama_stack/providers/remote/inference/bedrock/bedrock.py b/llama_stack/providers/remote/inference/bedrock/bedrock.py index 788c274f1..057ed758b 100644 --- a/llama_stack/providers/remote/inference/bedrock/bedrock.py +++ b/llama_stack/providers/remote/inference/bedrock/bedrock.py @@ -12,8 +12,8 @@ from botocore.client import BaseClient from llama_stack.apis.inference import ( ChatCompletionRequest, Inference, - OpenAIChatCompletionRequest, - OpenAICompletionRequest, + OpenAIChatCompletionRequestWithExtraBody, + OpenAICompletionRequestWithExtraBody, OpenAIEmbeddingsResponse, ) from llama_stack.apis.inference.inference import ( @@ -134,12 +134,12 @@ class BedrockInferenceAdapter( async def openai_completion( self, - params: OpenAICompletionRequest, + params: OpenAICompletionRequestWithExtraBody, ) -> OpenAICompletion: raise NotImplementedError("OpenAI completion not supported by the Bedrock provider") async def openai_chat_completion( self, - params: OpenAIChatCompletionRequest, + params: OpenAIChatCompletionRequestWithExtraBody, ) -> OpenAIChatCompletion | AsyncIterator[OpenAIChatCompletionChunk]: raise NotImplementedError("OpenAI chat completion not supported by the Bedrock provider") diff --git a/llama_stack/providers/remote/inference/databricks/databricks.py 
b/llama_stack/providers/remote/inference/databricks/databricks.py index 512913226..44996507f 100644 --- a/llama_stack/providers/remote/inference/databricks/databricks.py +++ b/llama_stack/providers/remote/inference/databricks/databricks.py @@ -8,7 +8,7 @@ from collections.abc import Iterable from databricks.sdk import WorkspaceClient -from llama_stack.apis.inference import OpenAICompletion, OpenAICompletionRequest +from llama_stack.apis.inference import OpenAICompletion, OpenAICompletionRequestWithExtraBody from llama_stack.log import get_logger from llama_stack.providers.utils.inference.openai_mixin import OpenAIMixin @@ -39,6 +39,6 @@ class DatabricksInferenceAdapter(OpenAIMixin): async def openai_completion( self, - params: OpenAICompletionRequest, + params: OpenAICompletionRequestWithExtraBody, ) -> OpenAICompletion: raise NotImplementedError() diff --git a/llama_stack/providers/remote/inference/llama_openai_compat/llama.py b/llama_stack/providers/remote/inference/llama_openai_compat/llama.py index 5a8bdd55e..e5fb3c77f 100644 --- a/llama_stack/providers/remote/inference/llama_openai_compat/llama.py +++ b/llama_stack/providers/remote/inference/llama_openai_compat/llama.py @@ -3,7 +3,12 @@ # # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. -from llama_stack.apis.inference.inference import OpenAICompletion, OpenAICompletionRequest, OpenAIEmbeddingsResponse + +from llama_stack.apis.inference.inference import ( + OpenAICompletion, + OpenAICompletionRequestWithExtraBody, + OpenAIEmbeddingsResponse, +) from llama_stack.log import get_logger from llama_stack.providers.remote.inference.llama_openai_compat.config import LlamaCompatConfig from llama_stack.providers.utils.inference.openai_mixin import OpenAIMixin @@ -29,7 +34,7 @@ class LlamaCompatInferenceAdapter(OpenAIMixin): async def openai_completion( self, - params: OpenAICompletionRequest, + params: OpenAICompletionRequestWithExtraBody, ) -> OpenAICompletion: raise NotImplementedError() diff --git a/llama_stack/providers/remote/inference/passthrough/passthrough.py b/llama_stack/providers/remote/inference/passthrough/passthrough.py index 8813ae529..11306095b 100644 --- a/llama_stack/providers/remote/inference/passthrough/passthrough.py +++ b/llama_stack/providers/remote/inference/passthrough/passthrough.py @@ -13,9 +13,9 @@ from llama_stack.apis.inference import ( Inference, OpenAIChatCompletion, OpenAIChatCompletionChunk, - OpenAIChatCompletionRequest, + OpenAIChatCompletionRequestWithExtraBody, OpenAICompletion, - OpenAICompletionRequest, + OpenAICompletionRequestWithExtraBody, OpenAIEmbeddingsResponse, ) from llama_stack.apis.models import Model @@ -79,7 +79,7 @@ class PassthroughInferenceAdapter(Inference): async def openai_completion( self, - params: OpenAICompletionRequest, + params: OpenAICompletionRequestWithExtraBody, ) -> OpenAICompletion: client = self._get_client() model_obj = await self.model_store.get_model(params.model) @@ -93,7 +93,7 @@ class PassthroughInferenceAdapter(Inference): async def openai_chat_completion( self, - params: OpenAIChatCompletionRequest, + params: OpenAIChatCompletionRequestWithExtraBody, ) -> OpenAIChatCompletion | AsyncIterator[OpenAIChatCompletionChunk]: client = self._get_client() model_obj = await self.model_store.get_model(params.model) diff --git a/llama_stack/providers/remote/inference/runpod/runpod.py b/llama_stack/providers/remote/inference/runpod/runpod.py index c08136f9f..db60644ca 100644 --- 
a/llama_stack/providers/remote/inference/runpod/runpod.py +++ b/llama_stack/providers/remote/inference/runpod/runpod.py @@ -9,7 +9,7 @@ from collections.abc import AsyncIterator from llama_stack.apis.inference import ( OpenAIChatCompletion, OpenAIChatCompletionChunk, - OpenAIChatCompletionRequest, + OpenAIChatCompletionRequestWithExtraBody, ) from llama_stack.providers.utils.inference.openai_mixin import OpenAIMixin @@ -31,7 +31,7 @@ class RunpodInferenceAdapter(OpenAIMixin): async def openai_chat_completion( self, - params: OpenAIChatCompletionRequest, + params: OpenAIChatCompletionRequestWithExtraBody, ) -> OpenAIChatCompletion | AsyncIterator[OpenAIChatCompletionChunk]: """Override to add RunPod-specific stream_options requirement.""" params = params.model_copy() diff --git a/llama_stack/providers/remote/inference/vllm/vllm.py b/llama_stack/providers/remote/inference/vllm/vllm.py index b09326271..74a18f3de 100644 --- a/llama_stack/providers/remote/inference/vllm/vllm.py +++ b/llama_stack/providers/remote/inference/vllm/vllm.py @@ -14,7 +14,7 @@ from pydantic import ConfigDict from llama_stack.apis.inference import ( OpenAIChatCompletion, - OpenAIChatCompletionRequest, + OpenAIChatCompletionRequestWithExtraBody, ToolChoice, ) from llama_stack.log import get_logger @@ -93,7 +93,7 @@ class VLLMInferenceAdapter(OpenAIMixin): async def openai_chat_completion( self, - params: OpenAIChatCompletionRequest, + params: OpenAIChatCompletionRequestWithExtraBody, ) -> OpenAIChatCompletion | AsyncIterator[OpenAIChatCompletionChunk]: params = params.model_copy() diff --git a/llama_stack/providers/utils/inference/litellm_openai_mixin.py b/llama_stack/providers/utils/inference/litellm_openai_mixin.py index eed078a0e..d1be1789a 100644 --- a/llama_stack/providers/utils/inference/litellm_openai_mixin.py +++ b/llama_stack/providers/utils/inference/litellm_openai_mixin.py @@ -16,9 +16,9 @@ from llama_stack.apis.inference import ( JsonSchemaResponseFormat, OpenAIChatCompletion, OpenAIChatCompletionChunk, - OpenAIChatCompletionRequest, + OpenAIChatCompletionRequestWithExtraBody, OpenAICompletion, - OpenAICompletionRequest, + OpenAICompletionRequestWithExtraBody, OpenAIEmbeddingData, OpenAIEmbeddingsResponse, OpenAIEmbeddingUsage, @@ -226,7 +226,7 @@ class LiteLLMOpenAIMixin( async def openai_completion( self, - params: OpenAICompletionRequest, + params: OpenAICompletionRequestWithExtraBody, ) -> OpenAICompletion: model_obj = await self.model_store.get_model(params.model) @@ -248,8 +248,6 @@ class LiteLLMOpenAIMixin( temperature=params.temperature, top_p=params.top_p, user=params.user, - guided_choice=params.guided_choice, - prompt_logprobs=params.prompt_logprobs, suffix=params.suffix, api_key=self.get_api_key(), api_base=self.api_base, @@ -258,7 +256,7 @@ class LiteLLMOpenAIMixin( async def openai_chat_completion( self, - params: OpenAIChatCompletionRequest, + params: OpenAIChatCompletionRequestWithExtraBody, ) -> OpenAIChatCompletion | AsyncIterator[OpenAIChatCompletionChunk]: # Add usage tracking for streaming when telemetry is active from llama_stack.providers.utils.telemetry.tracing import get_current_span diff --git a/llama_stack/providers/utils/inference/openai_mixin.py b/llama_stack/providers/utils/inference/openai_mixin.py index 502bc207b..863ea161c 100644 --- a/llama_stack/providers/utils/inference/openai_mixin.py +++ b/llama_stack/providers/utils/inference/openai_mixin.py @@ -17,9 +17,9 @@ from llama_stack.apis.inference import ( Model, OpenAIChatCompletion, OpenAIChatCompletionChunk, - 
OpenAIChatCompletionRequest, + OpenAIChatCompletionRequestWithExtraBody, OpenAICompletion, - OpenAICompletionRequest, + OpenAICompletionRequestWithExtraBody, OpenAIEmbeddingData, OpenAIEmbeddingsResponse, OpenAIEmbeddingUsage, @@ -223,21 +223,11 @@ class OpenAIMixin(NeedsRequestProviderData, ABC, BaseModel): async def openai_completion( self, - params: OpenAICompletionRequest, + params: OpenAICompletionRequestWithExtraBody, ) -> OpenAICompletion: """ Direct OpenAI completion API call. """ - # Handle parameters that are not supported by OpenAI API, but may be by the provider - # prompt_logprobs is supported by vLLM - # guided_choice is supported by vLLM - # TODO: test coverage - extra_body: dict[str, Any] = {} - if params.prompt_logprobs is not None and params.prompt_logprobs >= 0: - extra_body["prompt_logprobs"] = params.prompt_logprobs - if params.guided_choice: - extra_body["guided_choice"] = params.guided_choice - # TODO: fix openai_completion to return type compatible with OpenAI's API response completion_kwargs = await prepare_openai_completion_params( model=await self._get_provider_model_id(params.model), @@ -259,13 +249,15 @@ class OpenAIMixin(NeedsRequestProviderData, ABC, BaseModel): user=params.user, suffix=params.suffix, ) - resp = await self.client.completions.create(**completion_kwargs, extra_body=extra_body) + if extra_body := params.model_extra: + completion_kwargs["extra_body"] = extra_body + resp = await self.client.completions.create(**completion_kwargs) return await self._maybe_overwrite_id(resp, params.stream) # type: ignore[no-any-return] async def openai_chat_completion( self, - params: OpenAIChatCompletionRequest, + params: OpenAIChatCompletionRequestWithExtraBody, ) -> OpenAIChatCompletion | AsyncIterator[OpenAIChatCompletionChunk]: """ Direct OpenAI chat completion API call. 
@@ -316,6 +308,8 @@ class OpenAIMixin(NeedsRequestProviderData, ABC, BaseModel): user=params.user, ) + if extra_body := params.model_extra: + request_params["extra_body"] = extra_body resp = await self.client.chat.completions.create(**request_params) return await self._maybe_overwrite_id(resp, params.stream) # type: ignore[no-any-return] diff --git a/tests/integration/common/recordings/models-64a2277c90f0f42576f60c1030e3a020403d34a95f56931b792d5939f4cebc57-9ecd9600.json b/tests/integration/common/recordings/models-64a2277c90f0f42576f60c1030e3a020403d34a95f56931b792d5939f4cebc57-9ecd9600.json new file mode 100644 index 000000000..2d89edb5a --- /dev/null +++ b/tests/integration/common/recordings/models-64a2277c90f0f42576f60c1030e3a020403d34a95f56931b792d5939f4cebc57-9ecd9600.json @@ -0,0 +1,881 @@ +{ + "test_id": null, + "request": { + "method": "POST", + "url": "https://api.openai.com/v1/v1/models", + "headers": {}, + "body": {}, + "endpoint": "/v1/models", + "model": "" + }, + "response": { + "body": [ + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "gpt-4-0613", + "created": 1686588896, + "object": "model", + "owned_by": "openai" + } + }, + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "gpt-4", + "created": 1687882411, + "object": "model", + "owned_by": "openai" + } + }, + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "gpt-3.5-turbo", + "created": 1677610602, + "object": "model", + "owned_by": "openai" + } + }, + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "sora-2-pro", + "created": 1759708663, + "object": "model", + "owned_by": "system" + } + }, + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "gpt-audio-mini-2025-10-06", + "created": 1759512137, + "object": "model", + "owned_by": "system" + } + }, + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "gpt-realtime-mini", + "created": 1759517133, + "object": "model", + "owned_by": "system" + } + }, + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "gpt-realtime-mini-2025-10-06", + "created": 1759517175, + "object": "model", + "owned_by": "system" + } + }, + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "sora-2", + "created": 1759708615, + "object": "model", + "owned_by": "system" + } + }, + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "davinci-002", + "created": 1692634301, + "object": "model", + "owned_by": "system" + } + }, + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "babbage-002", + "created": 1692634615, + "object": "model", + "owned_by": "system" + } + }, + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "gpt-3.5-turbo-instruct", + "created": 1692901427, + "object": "model", + "owned_by": "system" + } + }, + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "gpt-3.5-turbo-instruct-0914", + "created": 1694122472, + "object": "model", + "owned_by": "system" + } + }, + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "dall-e-3", + "created": 1698785189, + "object": "model", + "owned_by": "system" + } + }, + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "dall-e-2", + "created": 1698798177, + "object": "model", + "owned_by": "system" + } + }, + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "gpt-4-1106-preview", + "created": 1698957206, + "object": "model", + "owned_by": "system" + } + }, + { + "__type__": 
"openai.types.model.Model", + "__data__": { + "id": "gpt-3.5-turbo-1106", + "created": 1698959748, + "object": "model", + "owned_by": "system" + } + }, + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "tts-1-hd", + "created": 1699046015, + "object": "model", + "owned_by": "system" + } + }, + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "tts-1-1106", + "created": 1699053241, + "object": "model", + "owned_by": "system" + } + }, + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "tts-1-hd-1106", + "created": 1699053533, + "object": "model", + "owned_by": "system" + } + }, + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "text-embedding-3-small", + "created": 1705948997, + "object": "model", + "owned_by": "system" + } + }, + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "text-embedding-3-large", + "created": 1705953180, + "object": "model", + "owned_by": "system" + } + }, + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "gpt-4-0125-preview", + "created": 1706037612, + "object": "model", + "owned_by": "system" + } + }, + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "gpt-4-turbo-preview", + "created": 1706037777, + "object": "model", + "owned_by": "system" + } + }, + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "gpt-3.5-turbo-0125", + "created": 1706048358, + "object": "model", + "owned_by": "system" + } + }, + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "gpt-4-turbo", + "created": 1712361441, + "object": "model", + "owned_by": "system" + } + }, + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "gpt-4-turbo-2024-04-09", + "created": 1712601677, + "object": "model", + "owned_by": "system" + } + }, + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "gpt-4o", + "created": 1715367049, + "object": "model", + "owned_by": "system" + } + }, + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "gpt-4o-2024-05-13", + "created": 1715368132, + "object": "model", + "owned_by": "system" + } + }, + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "gpt-4o-mini-2024-07-18", + "created": 1721172717, + "object": "model", + "owned_by": "system" + } + }, + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "gpt-4o-mini", + "created": 1721172741, + "object": "model", + "owned_by": "system" + } + }, + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "gpt-4o-2024-08-06", + "created": 1722814719, + "object": "model", + "owned_by": "system" + } + }, + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "chatgpt-4o-latest", + "created": 1723515131, + "object": "model", + "owned_by": "system" + } + }, + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "o1-mini-2024-09-12", + "created": 1725648979, + "object": "model", + "owned_by": "system" + } + }, + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "o1-mini", + "created": 1725649008, + "object": "model", + "owned_by": "system" + } + }, + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "gpt-4o-realtime-preview-2024-10-01", + "created": 1727131766, + "object": "model", + "owned_by": "system" + } + }, + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "gpt-4o-audio-preview-2024-10-01", + "created": 1727389042, + "object": "model", + "owned_by": "system" + } + }, + { + "__type__": 
"openai.types.model.Model", + "__data__": { + "id": "gpt-4o-audio-preview", + "created": 1727460443, + "object": "model", + "owned_by": "system" + } + }, + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "gpt-4o-realtime-preview", + "created": 1727659998, + "object": "model", + "owned_by": "system" + } + }, + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "omni-moderation-latest", + "created": 1731689265, + "object": "model", + "owned_by": "system" + } + }, + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "omni-moderation-2024-09-26", + "created": 1732734466, + "object": "model", + "owned_by": "system" + } + }, + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "gpt-4o-realtime-preview-2024-12-17", + "created": 1733945430, + "object": "model", + "owned_by": "system" + } + }, + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "gpt-4o-audio-preview-2024-12-17", + "created": 1734034239, + "object": "model", + "owned_by": "system" + } + }, + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "gpt-4o-mini-realtime-preview-2024-12-17", + "created": 1734112601, + "object": "model", + "owned_by": "system" + } + }, + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "gpt-4o-mini-audio-preview-2024-12-17", + "created": 1734115920, + "object": "model", + "owned_by": "system" + } + }, + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "o1-2024-12-17", + "created": 1734326976, + "object": "model", + "owned_by": "system" + } + }, + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "o1", + "created": 1734375816, + "object": "model", + "owned_by": "system" + } + }, + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "gpt-4o-mini-realtime-preview", + "created": 1734387380, + "object": "model", + "owned_by": "system" + } + }, + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "gpt-4o-mini-audio-preview", + "created": 1734387424, + "object": "model", + "owned_by": "system" + } + }, + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "o3-mini", + "created": 1737146383, + "object": "model", + "owned_by": "system" + } + }, + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "o3-mini-2025-01-31", + "created": 1738010200, + "object": "model", + "owned_by": "system" + } + }, + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "gpt-4o-2024-11-20", + "created": 1739331543, + "object": "model", + "owned_by": "system" + } + }, + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "gpt-4o-search-preview-2025-03-11", + "created": 1741388170, + "object": "model", + "owned_by": "system" + } + }, + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "gpt-4o-search-preview", + "created": 1741388720, + "object": "model", + "owned_by": "system" + } + }, + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "gpt-4o-mini-search-preview-2025-03-11", + "created": 1741390858, + "object": "model", + "owned_by": "system" + } + }, + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "gpt-4o-mini-search-preview", + "created": 1741391161, + "object": "model", + "owned_by": "system" + } + }, + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "gpt-4o-transcribe", + "created": 1742068463, + "object": "model", + "owned_by": "system" + } + }, + { + "__type__": "openai.types.model.Model", + 
"__data__": { + "id": "gpt-4o-mini-transcribe", + "created": 1742068596, + "object": "model", + "owned_by": "system" + } + }, + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "o1-pro-2025-03-19", + "created": 1742251504, + "object": "model", + "owned_by": "system" + } + }, + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "o1-pro", + "created": 1742251791, + "object": "model", + "owned_by": "system" + } + }, + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "gpt-4o-mini-tts", + "created": 1742403959, + "object": "model", + "owned_by": "system" + } + }, + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "o3-2025-04-16", + "created": 1744133301, + "object": "model", + "owned_by": "system" + } + }, + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "o4-mini-2025-04-16", + "created": 1744133506, + "object": "model", + "owned_by": "system" + } + }, + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "o3", + "created": 1744225308, + "object": "model", + "owned_by": "system" + } + }, + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "o4-mini", + "created": 1744225351, + "object": "model", + "owned_by": "system" + } + }, + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "gpt-4.1-2025-04-14", + "created": 1744315746, + "object": "model", + "owned_by": "system" + } + }, + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "gpt-4.1", + "created": 1744316542, + "object": "model", + "owned_by": "system" + } + }, + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "gpt-4.1-mini-2025-04-14", + "created": 1744317547, + "object": "model", + "owned_by": "system" + } + }, + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "gpt-4.1-mini", + "created": 1744318173, + "object": "model", + "owned_by": "system" + } + }, + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "gpt-4.1-nano-2025-04-14", + "created": 1744321025, + "object": "model", + "owned_by": "system" + } + }, + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "gpt-4.1-nano", + "created": 1744321707, + "object": "model", + "owned_by": "system" + } + }, + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "gpt-image-1", + "created": 1745517030, + "object": "model", + "owned_by": "system" + } + }, + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "codex-mini-latest", + "created": 1746673257, + "object": "model", + "owned_by": "system" + } + }, + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "gpt-4o-realtime-preview-2025-06-03", + "created": 1748907838, + "object": "model", + "owned_by": "system" + } + }, + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "gpt-4o-audio-preview-2025-06-03", + "created": 1748908498, + "object": "model", + "owned_by": "system" + } + }, + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "o4-mini-deep-research", + "created": 1749685485, + "object": "model", + "owned_by": "system" + } + }, + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "o4-mini-deep-research-2025-06-26", + "created": 1750866121, + "object": "model", + "owned_by": "system" + } + }, + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "gpt-5-chat-latest", + "created": 1754073306, + "object": "model", + "owned_by": "system" + } + }, + { + "__type__": "openai.types.model.Model", + 
"__data__": { + "id": "gpt-5-2025-08-07", + "created": 1754075360, + "object": "model", + "owned_by": "system" + } + }, + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "gpt-5", + "created": 1754425777, + "object": "model", + "owned_by": "system" + } + }, + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "gpt-5-mini-2025-08-07", + "created": 1754425867, + "object": "model", + "owned_by": "system" + } + }, + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "gpt-5-mini", + "created": 1754425928, + "object": "model", + "owned_by": "system" + } + }, + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "gpt-5-nano-2025-08-07", + "created": 1754426303, + "object": "model", + "owned_by": "system" + } + }, + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "gpt-5-nano", + "created": 1754426384, + "object": "model", + "owned_by": "system" + } + }, + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "gpt-audio-2025-08-28", + "created": 1756256146, + "object": "model", + "owned_by": "system" + } + }, + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "gpt-realtime", + "created": 1756271701, + "object": "model", + "owned_by": "system" + } + }, + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "gpt-realtime-2025-08-28", + "created": 1756271773, + "object": "model", + "owned_by": "system" + } + }, + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "gpt-audio", + "created": 1756339249, + "object": "model", + "owned_by": "system" + } + }, + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "gpt-5-codex", + "created": 1757527818, + "object": "model", + "owned_by": "system" + } + }, + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "gpt-image-1-mini", + "created": 1758845821, + "object": "model", + "owned_by": "system" + } + }, + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "gpt-5-pro-2025-10-06", + "created": 1759469707, + "object": "model", + "owned_by": "system" + } + }, + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "gpt-5-pro", + "created": 1759469822, + "object": "model", + "owned_by": "system" + } + }, + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "gpt-audio-mini", + "created": 1759512027, + "object": "model", + "owned_by": "system" + } + }, + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "gpt-3.5-turbo-16k", + "created": 1683758102, + "object": "model", + "owned_by": "openai-internal" + } + }, + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "tts-1", + "created": 1681940951, + "object": "model", + "owned_by": "openai-internal" + } + }, + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "whisper-1", + "created": 1677532384, + "object": "model", + "owned_by": "openai-internal" + } + }, + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "text-embedding-ada-002", + "created": 1671217299, + "object": "model", + "owned_by": "openai-internal" + } + } + ], + "is_streaming": false + }, + "id_normalization_mapping": {} +} diff --git a/tests/integration/common/recordings/models-64a2277c90f0f42576f60c1030e3a020403d34a95f56931b792d5939f4cebc57-fb68f5a6.json b/tests/integration/common/recordings/models-64a2277c90f0f42576f60c1030e3a020403d34a95f56931b792d5939f4cebc57-fb68f5a6.json new file mode 100644 index 000000000..05812e981 --- /dev/null +++ 
b/tests/integration/common/recordings/models-64a2277c90f0f42576f60c1030e3a020403d34a95f56931b792d5939f4cebc57-fb68f5a6.json @@ -0,0 +1,45 @@ +{ + "test_id": null, + "request": { + "method": "POST", + "url": "http://localhost:8000/v1/v1/models", + "headers": {}, + "body": {}, + "endpoint": "/v1/models", + "model": "" + }, + "response": { + "body": [ + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "Qwen/Qwen3-0.6B", + "created": 1760135828, + "object": "model", + "owned_by": "vllm", + "root": "Qwen/Qwen3-0.6B", + "parent": null, + "max_model_len": 4096, + "permission": [ + { + "id": "modelperm-5119df1e8c3246148a1d43e60357e420", + "object": "model_permission", + "created": 1760135828, + "allow_create_engine": false, + "allow_sampling": true, + "allow_logprobs": true, + "allow_search_indices": false, + "allow_view": true, + "allow_fine_tuning": false, + "organization": "*", + "group": null, + "is_blocking": false + } + ] + } + } + ], + "is_streaming": false + }, + "id_normalization_mapping": {} +} diff --git a/tests/integration/common/recordings/models-bd3df37825f32706c88677a327960bfa47dcf93f2ea6ed882f1186cf4fdda5bb-f15cee9a.json b/tests/integration/common/recordings/models-bd3df37825f32706c88677a327960bfa47dcf93f2ea6ed882f1186cf4fdda5bb-f15cee9a.json new file mode 100644 index 000000000..84e8eec92 --- /dev/null +++ b/tests/integration/common/recordings/models-bd3df37825f32706c88677a327960bfa47dcf93f2ea6ed882f1186cf4fdda5bb-f15cee9a.json @@ -0,0 +1,543 @@ +{ + "test_id": null, + "request": { + "method": "POST", + "url": "https://api.fireworks.ai/inference/v1/v1/models", + "headers": {}, + "body": {}, + "endpoint": "/v1/models", + "model": "" + }, + "response": { + "body": [ + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "accounts/fireworks/models/flux-1-dev-fp8", + "created": 1729532889, + "object": "model", + "owned_by": "fireworks", + "kind": "FLUMINA_BASE_MODEL", + "supports_chat": false, + "supports_image_input": false, + "supports_tools": false + } + }, + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "accounts/fireworks/models/flux-kontext-max", + "created": 1750714611, + "object": "model", + "owned_by": "fireworks", + "kind": "FLUMINA_BASE_MODEL", + "supports_chat": true, + "supports_image_input": true, + "supports_tools": false + } + }, + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "accounts/fireworks/models/flux-kontext-pro", + "created": 1750488264, + "object": "model", + "owned_by": "fireworks", + "kind": "FLUMINA_BASE_MODEL", + "supports_chat": true, + "supports_image_input": true, + "supports_tools": false + } + }, + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "accounts/sentientfoundation-serverless/models/dobby-mini-unhinged-plus-llama-3-1-8b", + "created": 1748467427, + "object": "model", + "owned_by": "sentientfoundation-serverless", + "kind": "HF_BASE_MODEL", + "supports_chat": true, + "supports_image_input": false, + "supports_tools": false, + "context_length": 131072 + } + }, + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "accounts/sentientfoundation/models/dobby-unhinged-llama-3-3-70b-new", + "created": 1739563474, + "object": "model", + "owned_by": "sentientfoundation", + "kind": "HF_BASE_MODEL", + "supports_chat": true, + "supports_image_input": false, + "supports_tools": false, + "context_length": 131072 + } + }, + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "accounts/fireworks/models/gpt-oss-120b", + "created": 
1754345600, + "object": "model", + "owned_by": "fireworks", + "kind": "HF_BASE_MODEL", + "supports_chat": true, + "supports_image_input": false, + "supports_tools": true, + "context_length": 131072 + } + }, + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "accounts/fireworks/models/qwen3-235b-a22b-instruct-2507", + "created": 1753124424, + "object": "model", + "owned_by": "fireworks", + "kind": "HF_BASE_MODEL", + "supports_chat": true, + "supports_image_input": false, + "supports_tools": true, + "context_length": 262144 + } + }, + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "accounts/fireworks/models/qwen3-235b-a22b-thinking-2507", + "created": 1753455434, + "object": "model", + "owned_by": "fireworks", + "kind": "HF_BASE_MODEL", + "supports_chat": true, + "supports_image_input": false, + "supports_tools": false, + "context_length": 262144 + } + }, + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "accounts/fireworks/models/deepseek-v3-0324", + "created": 1742827220, + "object": "model", + "owned_by": "fireworks", + "kind": "HF_BASE_MODEL", + "supports_chat": true, + "supports_image_input": false, + "supports_tools": true, + "context_length": 163840 + } + }, + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "accounts/fireworks/models/kimi-k2-instruct", + "created": 1752259096, + "object": "model", + "owned_by": "fireworks", + "kind": "HF_BASE_MODEL", + "supports_chat": true, + "supports_image_input": false, + "supports_tools": true, + "context_length": 131072 + } + }, + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "accounts/fireworks/models/gpt-oss-20b", + "created": 1754345466, + "object": "model", + "owned_by": "fireworks", + "kind": "HF_BASE_MODEL", + "supports_chat": true, + "supports_image_input": false, + "supports_tools": false, + "context_length": 131072 + } + }, + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "accounts/fireworks/models/kimi-k2-instruct-0905", + "created": 1757018994, + "object": "model", + "owned_by": "fireworks", + "kind": "HF_BASE_MODEL", + "supports_chat": true, + "supports_image_input": false, + "supports_tools": true, + "context_length": 262144 + } + }, + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "accounts/fireworks/models/llama-v3p3-70b-instruct", + "created": 1733442103, + "object": "model", + "owned_by": "fireworks", + "kind": "HF_BASE_MODEL", + "supports_chat": true, + "supports_image_input": false, + "supports_tools": false, + "context_length": 131072 + } + }, + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "accounts/fireworks/models/qwen3-235b-a22b", + "created": 1745885249, + "object": "model", + "owned_by": "fireworks", + "kind": "HF_BASE_MODEL", + "supports_chat": true, + "supports_image_input": false, + "supports_tools": true, + "context_length": 131072 + } + }, + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "accounts/fireworks/models/glm-4p5-air", + "created": 1754089426, + "object": "model", + "owned_by": "fireworks", + "kind": "HF_BASE_MODEL", + "supports_chat": true, + "supports_image_input": false, + "supports_tools": true, + "context_length": 131072 + } + }, + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "accounts/fireworks/models/deepseek-v3p1", + "created": 1755758988, + "object": "model", + "owned_by": "fireworks", + "kind": "HF_BASE_MODEL", + "supports_chat": true, + "supports_image_input": false, + "supports_tools": 
true, + "context_length": 163840 + } + }, + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "accounts/fireworks/models/flux-1-schnell-fp8", + "created": 1729535376, + "object": "model", + "owned_by": "fireworks", + "kind": "FLUMINA_BASE_MODEL", + "supports_chat": false, + "supports_image_input": false, + "supports_tools": false + } + }, + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "accounts/fireworks/models/llama-v3p1-405b-instruct", + "created": 1721428386, + "object": "model", + "owned_by": "fireworks", + "kind": "HF_BASE_MODEL", + "supports_chat": true, + "supports_image_input": false, + "supports_tools": true, + "context_length": 131072 + } + }, + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "accounts/fireworks/models/llama4-scout-instruct-basic", + "created": 1743878279, + "object": "model", + "owned_by": "fireworks", + "kind": "HF_BASE_MODEL", + "supports_chat": true, + "supports_image_input": true, + "supports_tools": true, + "context_length": 1048576 + } + }, + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "accounts/fireworks/models/qwen3-30b-a3b", + "created": 1745878133, + "object": "model", + "owned_by": "fireworks", + "kind": "HF_BASE_MODEL", + "supports_chat": true, + "supports_image_input": false, + "supports_tools": true, + "context_length": 131072 + } + }, + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "accounts/fireworks/models/llama-v3p1-70b-instruct", + "created": 1721287357, + "object": "model", + "owned_by": "fireworks", + "kind": "HF_BASE_MODEL", + "supports_chat": true, + "supports_image_input": false, + "supports_tools": true, + "context_length": 131072 + } + }, + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "accounts/fireworks/models/deepseek-r1-0528", + "created": 1748456377, + "object": "model", + "owned_by": "fireworks", + "kind": "HF_BASE_MODEL", + "supports_chat": true, + "supports_image_input": false, + "supports_tools": true, + "context_length": 163840 + } + }, + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "accounts/fireworks/models/mixtral-8x22b-instruct", + "created": 1713375508, + "object": "model", + "owned_by": "fireworks", + "kind": "HF_BASE_MODEL", + "supports_chat": true, + "supports_image_input": false, + "supports_tools": true, + "context_length": 65536 + } + }, + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "accounts/fireworks/models/llama4-maverick-instruct-basic", + "created": 1743878495, + "object": "model", + "owned_by": "fireworks", + "kind": "HF_BASE_MODEL", + "supports_chat": true, + "supports_image_input": true, + "supports_tools": true, + "context_length": 1048576 + } + }, + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "accounts/fireworks/models/qwen2p5-vl-32b-instruct", + "created": 1743392739, + "object": "model", + "owned_by": "fireworks", + "kind": "HF_BASE_MODEL", + "supports_chat": true, + "supports_image_input": true, + "supports_tools": false, + "context_length": 128000 + } + }, + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "accounts/fireworks/models/deepseek-v3p1-terminus", + "created": 1758586241, + "object": "model", + "owned_by": "fireworks", + "kind": "HF_BASE_MODEL", + "supports_chat": true, + "supports_image_input": false, + "supports_tools": true, + "context_length": 163840 + } + }, + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": 
"accounts/fireworks/models/llama-v3p1-8b-instruct", + "created": 1721692808, + "object": "model", + "owned_by": "fireworks", + "kind": "HF_BASE_MODEL", + "supports_chat": true, + "supports_image_input": false, + "supports_tools": false, + "context_length": 131072 + } + }, + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "accounts/fireworks/models/qwen3-coder-480b-a35b-instruct", + "created": 1753211090, + "object": "model", + "owned_by": "fireworks", + "kind": "HF_BASE_MODEL", + "supports_chat": true, + "supports_image_input": false, + "supports_tools": true, + "context_length": 262144 + } + }, + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "accounts/fireworks/models/qwen3-30b-a3b-thinking-2507", + "created": 1753916446, + "object": "model", + "owned_by": "fireworks", + "kind": "HF_BASE_MODEL", + "supports_chat": true, + "supports_image_input": false, + "supports_tools": false + } + }, + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "accounts/fireworks/models/qwen3-embedding-8b", + "created": 1755707090, + "object": "model", + "owned_by": "fireworks", + "kind": "EMBEDDING_MODEL", + "supports_chat": true, + "supports_image_input": false, + "supports_tools": false, + "context_length": 40960 + } + }, + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "accounts/fireworks/models/qwen3-reranker-8b", + "created": 1759865045, + "object": "model", + "owned_by": "fireworks", + "kind": "EMBEDDING_MODEL", + "supports_chat": true, + "supports_image_input": false, + "supports_tools": false, + "context_length": 40960 + } + }, + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "accounts/fireworks/models/glm-4p5", + "created": 1753809636, + "object": "model", + "owned_by": "fireworks", + "kind": "HF_BASE_MODEL", + "supports_chat": true, + "supports_image_input": false, + "supports_tools": true, + "context_length": 131072 + } + }, + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "accounts/fireworks/models/qwen3-coder-30b-a3b-instruct", + "created": 1754063588, + "object": "model", + "owned_by": "fireworks", + "kind": "HF_BASE_MODEL", + "supports_chat": true, + "supports_image_input": false, + "supports_tools": false, + "context_length": 262144 + } + }, + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "accounts/fireworks/models/deepseek-r1", + "created": 1737397673, + "object": "model", + "owned_by": "fireworks", + "kind": "HF_BASE_MODEL", + "supports_chat": true, + "supports_image_input": false, + "supports_tools": false, + "context_length": 163840 + } + }, + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "accounts/fireworks/models/deepseek-v3", + "created": 1735576668, + "object": "model", + "owned_by": "fireworks", + "kind": "HF_BASE_MODEL", + "supports_chat": true, + "supports_image_input": false, + "supports_tools": true, + "context_length": 131072 + } + }, + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "accounts/fireworks/models/deepseek-r1-basic", + "created": 1742306746, + "object": "model", + "owned_by": "fireworks", + "kind": "HF_BASE_MODEL", + "supports_chat": true, + "supports_image_input": false, + "supports_tools": false, + "context_length": 163840 + } + }, + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "accounts/fireworks/models/qwen3-30b-a3b-instruct-2507", + "created": 1753808388, + "object": "model", + "owned_by": "fireworks", + "kind": "HF_BASE_MODEL", + "supports_chat": true, + 
"supports_image_input": false, + "supports_tools": false, + "context_length": 262144 + } + }, + { + "__type__": "openai.types.model.Model", + "__data__": { + "id": "accounts/tvergho-87e44d/models/debatecards-70b-ft-3epoch-dpo-v2", + "created": 1743381121, + "object": "model", + "owned_by": "tvergho-87e44d", + "kind": "HF_PEFT_ADDON", + "supports_chat": true, + "supports_image_input": false, + "supports_tools": false + } + } + ], + "is_streaming": false + }, + "id_normalization_mapping": {} +} diff --git a/tests/integration/inference/recordings/d2ba309413e85d6166f7543a879b890b4e65a5f9917a2d75c5795782ab7cbfff.json b/tests/integration/inference/recordings/d2ba309413e85d6166f7543a879b890b4e65a5f9917a2d75c5795782ab7cbfff.json new file mode 100644 index 000000000..6b726d9fe --- /dev/null +++ b/tests/integration/inference/recordings/d2ba309413e85d6166f7543a879b890b4e65a5f9917a2d75c5795782ab7cbfff.json @@ -0,0 +1,48 @@ +{ + "test_id": "tests/integration/inference/test_openai_completion.py::test_openai_completion_guided_choice[txt=vllm/Qwen/Qwen3-0.6B]", + "request": { + "method": "POST", + "url": "http://localhost:8000/v1/v1/completions", + "headers": {}, + "body": { + "model": "Qwen/Qwen3-0.6B", + "prompt": "I am feeling really sad today.", + "stream": false + }, + "endpoint": "/v1/completions", + "model": "Qwen/Qwen3-0.6B" + }, + "response": { + "body": { + "__type__": "openai.types.completion.Completion", + "__data__": { + "id": "rec-d2ba309413e8", + "choices": [ + { + "finish_reason": "length", + "index": 0, + "logprobs": null, + "text": " I have been working on a project that I feel like I'm not doing well", + "stop_reason": null, + "prompt_logprobs": null + } + ], + "created": 0, + "model": "Qwen/Qwen3-0.6B", + "object": "text_completion", + "system_fingerprint": null, + "usage": { + "completion_tokens": 16, + "prompt_tokens": 7, + "total_tokens": 23, + "completion_tokens_details": null, + "prompt_tokens_details": null + }, + "service_tier": null, + "kv_transfer_params": null + } + }, + "is_streaming": false + }, + "id_normalization_mapping": {} +} diff --git a/tests/integration/inference/recordings/e3727f6c749ab8bdee2f581300092002485023b937d72b7aa8d4c15c9204fc5c.json b/tests/integration/inference/recordings/e3727f6c749ab8bdee2f581300092002485023b937d72b7aa8d4c15c9204fc5c.json new file mode 100644 index 000000000..21cc0300f --- /dev/null +++ b/tests/integration/inference/recordings/e3727f6c749ab8bdee2f581300092002485023b937d72b7aa8d4c15c9204fc5c.json @@ -0,0 +1,54 @@ +{ + "test_id": "tests/integration/inference/test_openai_completion.py::test_openai_completion_guided_choice[txt=vllm/Qwen/Qwen3-0.6B]", + "request": { + "method": "POST", + "url": "http://localhost:8000/v1/v1/completions", + "headers": {}, + "body": { + "model": "Qwen/Qwen3-0.6B", + "prompt": "I am feeling really sad today.", + "stream": false, + "extra_body": { + "guided_choices": [ + "joy", + "sadness" + ] + } + }, + "endpoint": "/v1/completions", + "model": "Qwen/Qwen3-0.6B" + }, + "response": { + "body": { + "__type__": "openai.types.completion.Completion", + "__data__": { + "id": "rec-e3727f6c749a", + "choices": [ + { + "finish_reason": "length", + "index": 0, + "logprobs": null, + "text": " I feel that I am not good enough, and I feel like I have no", + "stop_reason": null, + "prompt_logprobs": null + } + ], + "created": 0, + "model": "Qwen/Qwen3-0.6B", + "object": "text_completion", + "system_fingerprint": null, + "usage": { + "completion_tokens": 16, + "prompt_tokens": 7, + "total_tokens": 23, + 
"completion_tokens_details": null, + "prompt_tokens_details": null + }, + "service_tier": null, + "kv_transfer_params": null + } + }, + "is_streaming": false + }, + "id_normalization_mapping": {} +} diff --git a/tests/integration/inference/recordings/f02f1bfd75adaea87b91dedc59430b99015b5ed0e2bbf24418a31146ffcbca9b.json b/tests/integration/inference/recordings/f02f1bfd75adaea87b91dedc59430b99015b5ed0e2bbf24418a31146ffcbca9b.json new file mode 100644 index 000000000..8a54ca1f7 --- /dev/null +++ b/tests/integration/inference/recordings/f02f1bfd75adaea87b91dedc59430b99015b5ed0e2bbf24418a31146ffcbca9b.json @@ -0,0 +1,54 @@ +{ + "test_id": "tests/integration/inference/test_openai_completion.py::test_openai_completion_guided_choice[txt=vllm/Qwen/Qwen3-0.6B]", + "request": { + "method": "POST", + "url": "http://localhost:8000/v1/v1/completions", + "headers": {}, + "body": { + "model": "Qwen/Qwen3-0.6B", + "prompt": "I am feeling really sad today.", + "stream": false, + "extra_body": { + "guided_choice": [ + "joy", + "sadness" + ] + } + }, + "endpoint": "/v1/completions", + "model": "Qwen/Qwen3-0.6B" + }, + "response": { + "body": { + "__type__": "openai.types.completion.Completion", + "__data__": { + "id": "rec-f02f1bfd75ad", + "choices": [ + { + "finish_reason": "stop", + "index": 0, + "logprobs": null, + "text": "sadness", + "stop_reason": null, + "prompt_logprobs": null + } + ], + "created": 0, + "model": "Qwen/Qwen3-0.6B", + "object": "text_completion", + "system_fingerprint": null, + "usage": { + "completion_tokens": 3, + "prompt_tokens": 7, + "total_tokens": 10, + "completion_tokens_details": null, + "prompt_tokens_details": null + }, + "service_tier": null, + "kv_transfer_params": null + } + }, + "is_streaming": false + }, + "id_normalization_mapping": {} +} diff --git a/tests/integration/inference/test_openai_completion.py b/tests/integration/inference/test_openai_completion.py index 2c065560e..3f0cffb2d 100644 --- a/tests/integration/inference/test_openai_completion.py +++ b/tests/integration/inference/test_openai_completion.py @@ -223,7 +223,7 @@ def test_openai_completion_guided_choice(llama_stack_client, client_with_models, model=text_model_id, prompt=prompt, stream=False, - guided_choice=["joy", "sadness"], + extra_body={"guided_choice": ["joy", "sadness"]}, ) assert len(response.choices) > 0 choice = response.choices[0] diff --git a/tests/unit/providers/agents/meta_reference/test_openai_responses.py b/tests/unit/providers/agents/meta_reference/test_openai_responses.py index 8025ea5ae..81978c60c 100644 --- a/tests/unit/providers/agents/meta_reference/test_openai_responses.py +++ b/tests/unit/providers/agents/meta_reference/test_openai_responses.py @@ -33,7 +33,7 @@ from llama_stack.apis.agents.openai_responses import ( from llama_stack.apis.inference import ( OpenAIAssistantMessageParam, OpenAIChatCompletionContentPartTextParam, - OpenAIChatCompletionRequest, + OpenAIChatCompletionRequestWithExtraBody, OpenAIDeveloperMessageParam, OpenAIJSONSchema, OpenAIResponseFormatJSONObject, @@ -162,7 +162,7 @@ async def test_create_openai_response_with_string_input(openai_responses_impl, m chunks = [chunk async for chunk in result] mock_inference_api.openai_chat_completion.assert_called_once_with( - OpenAIChatCompletionRequest( + OpenAIChatCompletionRequestWithExtraBody( model=model, messages=[OpenAIUserMessageParam(role="user", content="What is the capital of Ireland?", name=None)], response_format=None, diff --git a/tests/unit/providers/inference/test_remote_vllm.py 
b/tests/unit/providers/inference/test_remote_vllm.py index 569fb5031..ffd45798e 100644 --- a/tests/unit/providers/inference/test_remote_vllm.py +++ b/tests/unit/providers/inference/test_remote_vllm.py @@ -13,11 +13,16 @@ import pytest from llama_stack.apis.inference import ( OpenAIAssistantMessageParam, OpenAIChatCompletion, - OpenAIChatCompletionRequest, + OpenAIChatCompletionRequestWithExtraBody, OpenAIChoice, + OpenAICompletion, + OpenAICompletionChoice, + OpenAICompletionRequestWithExtraBody, ToolChoice, ) from llama_stack.apis.models import Model +from llama_stack.core.routers.inference import InferenceRouter +from llama_stack.core.routing_tables.models import ModelsRoutingTable from llama_stack.providers.datatypes import HealthStatus from llama_stack.providers.remote.inference.vllm.config import VLLMInferenceAdapterConfig from llama_stack.providers.remote.inference.vllm.vllm import VLLMInferenceAdapter @@ -57,7 +62,7 @@ async def test_old_vllm_tool_choice(vllm_inference_adapter): mock_client_property.return_value = mock_client # No tools but auto tool choice - params = OpenAIChatCompletionRequest( + params = OpenAIChatCompletionRequestWithExtraBody( model="mock-model", messages=[{"role": "user", "content": "test"}], stream=False, @@ -173,7 +178,7 @@ async def test_openai_chat_completion_is_async(vllm_inference_adapter): ) async def do_inference(): - params = OpenAIChatCompletionRequest( + params = OpenAIChatCompletionRequestWithExtraBody( model="mock-model", messages=[{"role": "user", "content": "one fish two fish"}], stream=False, @@ -191,3 +196,148 @@ async def test_openai_chat_completion_is_async(vllm_inference_adapter): assert mock_create_client.call_count == 4 # no cheating assert total_time < (sleep_time * 2), f"Total time taken: {total_time}s exceeded expected max" + + +async def test_vllm_completion_extra_body(): + """ + Test that vLLM-specific guided_choice and prompt_logprobs parameters are correctly forwarded + via extra_body to the underlying OpenAI client through the InferenceRouter. 
+ """ + # Set up the vLLM adapter + config = VLLMInferenceAdapterConfig(url="http://mocked.localhost:12345") + vllm_adapter = VLLMInferenceAdapter(config=config) + vllm_adapter.__provider_id__ = "vllm" + await vllm_adapter.initialize() + + # Create a mock model store + mock_model_store = AsyncMock() + mock_model = Model(identifier="mock-model", provider_resource_id="mock-model", provider_id="vllm") + mock_model_store.get_model.return_value = mock_model + mock_model_store.has_model.return_value = True + + # Create a mock dist_registry + mock_dist_registry = MagicMock() + mock_dist_registry.get = AsyncMock(return_value=mock_model) + mock_dist_registry.set = AsyncMock() + + # Set up the routing table + routing_table = ModelsRoutingTable( + impls_by_provider_id={"vllm": vllm_adapter}, + dist_registry=mock_dist_registry, + policy=[], + ) + # Inject the model store into the adapter + vllm_adapter.model_store = routing_table + + # Create the InferenceRouter + router = InferenceRouter(routing_table=routing_table) + + # Patch the OpenAI client + with patch.object(VLLMInferenceAdapter, "client", new_callable=PropertyMock) as mock_client_property: + mock_client = MagicMock() + mock_client.completions.create = AsyncMock( + return_value=OpenAICompletion( + id="cmpl-abc123", + created=1, + model="mock-model", + choices=[ + OpenAICompletionChoice( + text="joy", + finish_reason="stop", + index=0, + ) + ], + ) + ) + mock_client_property.return_value = mock_client + + # Test with guided_choice and prompt_logprobs as extra fields + params = OpenAICompletionRequestWithExtraBody( + model="mock-model", + prompt="I am feeling happy", + stream=False, + guided_choice=["joy", "sadness"], + prompt_logprobs=5, + ) + await router.openai_completion(params) + + # Verify that the client was called with extra_body containing both parameters + mock_client.completions.create.assert_called_once() + call_kwargs = mock_client.completions.create.call_args.kwargs + assert "extra_body" in call_kwargs + assert "guided_choice" in call_kwargs["extra_body"] + assert call_kwargs["extra_body"]["guided_choice"] == ["joy", "sadness"] + assert "prompt_logprobs" in call_kwargs["extra_body"] + assert call_kwargs["extra_body"]["prompt_logprobs"] == 5 + + +async def test_vllm_chat_completion_extra_body(): + """ + Test that vLLM-specific parameters (e.g., chat_template_kwargs) are correctly forwarded + via extra_body to the underlying OpenAI client through the InferenceRouter for chat completion. 
+ """ + # Set up the vLLM adapter + config = VLLMInferenceAdapterConfig(url="http://mocked.localhost:12345") + vllm_adapter = VLLMInferenceAdapter(config=config) + vllm_adapter.__provider_id__ = "vllm" + await vllm_adapter.initialize() + + # Create a mock model store + mock_model_store = AsyncMock() + mock_model = Model(identifier="mock-model", provider_resource_id="mock-model", provider_id="vllm") + mock_model_store.get_model.return_value = mock_model + mock_model_store.has_model.return_value = True + + # Create a mock dist_registry + mock_dist_registry = MagicMock() + mock_dist_registry.get = AsyncMock(return_value=mock_model) + mock_dist_registry.set = AsyncMock() + + # Set up the routing table + routing_table = ModelsRoutingTable( + impls_by_provider_id={"vllm": vllm_adapter}, + dist_registry=mock_dist_registry, + policy=[], + ) + # Inject the model store into the adapter + vllm_adapter.model_store = routing_table + + # Create the InferenceRouter + router = InferenceRouter(routing_table=routing_table) + + # Patch the OpenAI client + with patch.object(VLLMInferenceAdapter, "client", new_callable=PropertyMock) as mock_client_property: + mock_client = MagicMock() + mock_client.chat.completions.create = AsyncMock( + return_value=OpenAIChatCompletion( + id="chatcmpl-abc123", + created=1, + model="mock-model", + choices=[ + OpenAIChoice( + message=OpenAIAssistantMessageParam( + content="test response", + ), + finish_reason="stop", + index=0, + ) + ], + ) + ) + mock_client_property.return_value = mock_client + + # Test with chat_template_kwargs as extra field + params = OpenAIChatCompletionRequestWithExtraBody( + model="mock-model", + messages=[{"role": "user", "content": "test"}], + stream=False, + chat_template_kwargs={"thinking": True}, + ) + await router.openai_chat_completion(params) + + # Verify that the client was called with extra_body containing chat_template_kwargs + mock_client.chat.completions.create.assert_called_once() + call_kwargs = mock_client.chat.completions.create.call_args.kwargs + assert "extra_body" in call_kwargs + assert "chat_template_kwargs" in call_kwargs["extra_body"] + assert call_kwargs["extra_body"]["chat_template_kwargs"] == {"thinking": True} diff --git a/tests/unit/providers/utils/inference/test_openai_mixin.py b/tests/unit/providers/utils/inference/test_openai_mixin.py index 4a24d72ed..80c219055 100644 --- a/tests/unit/providers/utils/inference/test_openai_mixin.py +++ b/tests/unit/providers/utils/inference/test_openai_mixin.py @@ -12,7 +12,7 @@ from unittest.mock import AsyncMock, MagicMock, Mock, PropertyMock, patch import pytest from pydantic import BaseModel, Field -from llama_stack.apis.inference import Model, OpenAIChatCompletionRequest, OpenAIUserMessageParam +from llama_stack.apis.inference import Model, OpenAIChatCompletionRequestWithExtraBody, OpenAIUserMessageParam from llama_stack.apis.models import ModelType from llama_stack.core.request_headers import request_provider_data_context from llama_stack.providers.utils.inference.model_registry import RemoteInferenceProviderConfig @@ -271,7 +271,7 @@ class TestOpenAIMixinImagePreprocessing: with patch("llama_stack.providers.utils.inference.openai_mixin.localize_image_content") as mock_localize: mock_localize.return_value = (b"fake_image_data", "jpeg") - params = OpenAIChatCompletionRequest(model="test-model", messages=[message]) + params = OpenAIChatCompletionRequestWithExtraBody(model="test-model", messages=[message]) await mixin.openai_chat_completion(params) 
mock_localize.assert_called_once_with("http://example.com/image.jpg") @@ -304,7 +304,7 @@ class TestOpenAIMixinImagePreprocessing: with patch.object(type(mixin), "client", new_callable=PropertyMock, return_value=mock_client): with patch("llama_stack.providers.utils.inference.openai_mixin.localize_image_content") as mock_localize: - params = OpenAIChatCompletionRequest(model="test-model", messages=[message]) + params = OpenAIChatCompletionRequestWithExtraBody(model="test-model", messages=[message]) await mixin.openai_chat_completion(params) mock_localize.assert_not_called()
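A minimal client-side sketch of the pattern the new tests above exercise, assuming only what the diffs show: the *WithExtraBody request structs are importable from llama_stack.apis.inference, they accept provider-specific fields beyond the standard OpenAI parameters, and the router forwards those fields to the backend client via extra_body. The model name and the vLLM-specific keys (guided_choice, chat_template_kwargs) are taken from the tests and recordings above and are illustrative, not an exhaustive list.

    # Provider-specific parameters ride along on the request struct and are
    # forwarded to the OpenAI-compatible backend via extra_body, as asserted
    # in test_vllm_completion_extra_body / test_vllm_chat_completion_extra_body.
    from llama_stack.apis.inference import (
        OpenAIChatCompletionRequestWithExtraBody,
        OpenAICompletionRequestWithExtraBody,
    )

    completion_params = OpenAICompletionRequestWithExtraBody(
        model="Qwen/Qwen3-0.6B",
        prompt="I am feeling really sad today.",
        stream=False,
        guided_choice=["joy", "sadness"],  # vLLM-only; ends up in extra_body
    )

    chat_params = OpenAIChatCompletionRequestWithExtraBody(
        model="Qwen/Qwen3-0.6B",
        messages=[{"role": "user", "content": "test"}],
        stream=False,
        chat_template_kwargs={"thinking": True},  # vLLM-only; ends up in extra_body
    )

    # Inside an async context, with an InferenceRouter wired to a vLLM adapter:
    #     await router.openai_completion(completion_params)
    #     await router.openai_chat_completion(chat_params)

The design choice the tests pin down is that unknown fields on the request struct are not dropped: they are routed into extra_body, so backend-specific options such as vLLM's guided decoding pass through the shared params model untouched.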