diff --git a/docs/_static/llama-stack-spec.html b/docs/_static/llama-stack-spec.html
index 1927f2013..84a9bc67d 100644
--- a/docs/_static/llama-stack-spec.html
+++ b/docs/_static/llama-stack-spec.html
@@ -8965,6 +8965,50 @@
                 ],
                 "title": "OpenAIImageURL"
             },
+            "OpenAIJSONSchema": {
+                "type": "object",
+                "properties": {
+                    "name": {
+                        "type": "string"
+                    },
+                    "description": {
+                        "type": "string"
+                    },
+                    "strict": {
+                        "type": "boolean"
+                    },
+                    "schema": {
+                        "type": "object",
+                        "additionalProperties": {
+                            "oneOf": [
+                                {
+                                    "type": "null"
+                                },
+                                {
+                                    "type": "boolean"
+                                },
+                                {
+                                    "type": "number"
+                                },
+                                {
+                                    "type": "string"
+                                },
+                                {
+                                    "type": "array"
+                                },
+                                {
+                                    "type": "object"
+                                }
+                            ]
+                        }
+                    }
+                },
+                "additionalProperties": false,
+                "required": [
+                    "name"
+                ],
+                "title": "OpenAIJSONSchema"
+            },
             "OpenAIMessageParam": {
                 "oneOf": [
                     {
@@ -8994,6 +9038,76 @@
                     }
                 }
             },
+            "OpenAIResponseFormatJSONObject": {
+                "type": "object",
+                "properties": {
+                    "type": {
+                        "type": "string",
+                        "const": "json_object",
+                        "default": "json_object"
+                    }
+                },
+                "additionalProperties": false,
+                "required": [
+                    "type"
+                ],
+                "title": "OpenAIResponseFormatJSONObject"
+            },
+            "OpenAIResponseFormatJSONSchema": {
+                "type": "object",
+                "properties": {
+                    "type": {
+                        "type": "string",
+                        "const": "json_schema",
+                        "default": "json_schema"
+                    },
+                    "json_schema": {
+                        "$ref": "#/components/schemas/OpenAIJSONSchema"
+                    }
+                },
+                "additionalProperties": false,
+                "required": [
+                    "type",
+                    "json_schema"
+                ],
+                "title": "OpenAIResponseFormatJSONSchema"
+            },
+            "OpenAIResponseFormatParam": {
+                "oneOf": [
+                    {
+                        "$ref": "#/components/schemas/OpenAIResponseFormatText"
+                    },
+                    {
+                        "$ref": "#/components/schemas/OpenAIResponseFormatJSONSchema"
+                    },
+                    {
+                        "$ref": "#/components/schemas/OpenAIResponseFormatJSONObject"
+                    }
+                ],
+                "discriminator": {
+                    "propertyName": "type",
+                    "mapping": {
+                        "text": "#/components/schemas/OpenAIResponseFormatText",
+                        "json_schema": "#/components/schemas/OpenAIResponseFormatJSONSchema",
+                        "json_object": "#/components/schemas/OpenAIResponseFormatJSONObject"
+                    }
+                }
+            },
+            "OpenAIResponseFormatText": {
+                "type": "object",
+                "properties": {
+                    "type": {
+                        "type": "string",
+                        "const": "text",
+                        "default": "text"
+                    }
+                },
+                "additionalProperties": false,
+                "required": [
+                    "type"
+                ],
+                "title": "OpenAIResponseFormatText"
+            },
             "OpenAISystemMessageParam": {
                 "type": "object",
                 "properties": {
@@ -9215,10 +9329,7 @@
                         "description": "(Optional) The penalty for repeated tokens"
                     },
                     "response_format": {
-                        "type": "object",
-                        "additionalProperties": {
-                            "type": "string"
-                        },
+                        "$ref": "#/components/schemas/OpenAIResponseFormatParam",
                         "description": "(Optional) The response format to use"
                     },
                     "seed": {
diff --git a/docs/_static/llama-stack-spec.yaml b/docs/_static/llama-stack-spec.yaml
index 1070b76a4..3fcc83f15 100644
--- a/docs/_static/llama-stack-spec.yaml
+++ b/docs/_static/llama-stack-spec.yaml
@@ -6157,6 +6157,29 @@ components:
       required:
         - url
       title: OpenAIImageURL
+    OpenAIJSONSchema:
+      type: object
+      properties:
+        name:
+          type: string
+        description:
+          type: string
+        strict:
+          type: boolean
+        schema:
+          type: object
+          additionalProperties:
+            oneOf:
+              - type: 'null'
+              - type: boolean
+              - type: number
+              - type: string
+              - type: array
+              - type: object
+      additionalProperties: false
+      required:
+        - name
+      title: OpenAIJSONSchema
     OpenAIMessageParam:
       oneOf:
         - $ref: '#/components/schemas/OpenAIUserMessageParam'
@@ -6172,6 +6195,53 @@ components:
           assistant: '#/components/schemas/OpenAIAssistantMessageParam'
           tool: '#/components/schemas/OpenAIToolMessageParam'
           developer: '#/components/schemas/OpenAIDeveloperMessageParam'
+    OpenAIResponseFormatJSONObject:
+      type: object
+      properties:
+        type:
+          type: string
+          const: json_object
+          default: json_object
+      additionalProperties: false
+      required:
+        - type
+      title: OpenAIResponseFormatJSONObject
+    OpenAIResponseFormatJSONSchema:
+      type: object
+      properties:
+        type:
+          type: string
+          const: json_schema
+          default: json_schema
+        json_schema:
+          $ref: '#/components/schemas/OpenAIJSONSchema'
+      additionalProperties: false
+      required:
+        - type
+        - json_schema
+      title: OpenAIResponseFormatJSONSchema
+    OpenAIResponseFormatParam:
+      oneOf:
+        - $ref: '#/components/schemas/OpenAIResponseFormatText'
+        - $ref: '#/components/schemas/OpenAIResponseFormatJSONSchema'
+        - $ref: '#/components/schemas/OpenAIResponseFormatJSONObject'
+      discriminator:
+        propertyName: type
+        mapping:
+          text: '#/components/schemas/OpenAIResponseFormatText'
+          json_schema: '#/components/schemas/OpenAIResponseFormatJSONSchema'
+          json_object: '#/components/schemas/OpenAIResponseFormatJSONObject'
+    OpenAIResponseFormatText:
+      type: object
+      properties:
+        type:
+          type: string
+          const: text
+          default: text
+      additionalProperties: false
+      required:
+        - type
+      title: OpenAIResponseFormatText
     OpenAISystemMessageParam:
       type: object
       properties:
@@ -6331,9 +6401,7 @@ components:
           description: >-
             (Optional) The penalty for repeated tokens
         response_format:
-          type: object
-          additionalProperties:
-            type: string
+          $ref: '#/components/schemas/OpenAIResponseFormatParam'
           description: (Optional) The response format to use
         seed:
           type: integer
diff --git a/llama_stack/apis/inference/inference.py b/llama_stack/apis/inference/inference.py
index 0e70c876e..4251d37ab 100644
--- a/llama_stack/apis/inference/inference.py
+++ b/llama_stack/apis/inference/inference.py
@@ -18,7 +18,7 @@ from typing import (
 )
 
 from pydantic import BaseModel, Field, field_validator
-from typing_extensions import Annotated
+from typing_extensions import Annotated, TypedDict
 
 from llama_stack.apis.common.content_types import ContentDelta, InterleavedContent, InterleavedContentItem
 from llama_stack.apis.models import Model
@@ -558,6 +558,46 @@ OpenAIMessageParam = Annotated[
 register_schema(OpenAIMessageParam, name="OpenAIMessageParam")
 
 
+@json_schema_type
+class OpenAIResponseFormatText(BaseModel):
+    type: Literal["text"] = "text"
+
+
+@json_schema_type
+class OpenAIJSONSchema(TypedDict, total=False):
+    name: str
+    description: Optional[str] = None
+    strict: Optional[bool] = None
+
+    # Pydantic BaseModel cannot be used with a schema param, since it already
+    # has one. And, we don't want to alias here because then have to handle
+    # that alias when converting to OpenAI params. So, to support schema,
+    # we use a TypedDict.
+    schema: Optional[Dict[str, Any]] = None
+
+
+@json_schema_type
+class OpenAIResponseFormatJSONSchema(BaseModel):
+    type: Literal["json_schema"] = "json_schema"
+    json_schema: OpenAIJSONSchema
+
+
+@json_schema_type
+class OpenAIResponseFormatJSONObject(BaseModel):
+    type: Literal["json_object"] = "json_object"
+
+
+OpenAIResponseFormatParam = Annotated[
+    Union[
+        OpenAIResponseFormatText,
+        OpenAIResponseFormatJSONSchema,
+        OpenAIResponseFormatJSONObject,
+    ],
+    Field(discriminator="type"),
+]
+register_schema(OpenAIResponseFormatParam, name="OpenAIResponseFormatParam")
+
+
 @json_schema_type
 class OpenAITopLogProb(BaseModel):
     """The top log probability for a token from an OpenAI-compatible chat completion response.
@@ -903,7 +943,7 @@ class Inference(Protocol):
         n: Optional[int] = None,
         parallel_tool_calls: Optional[bool] = None,
         presence_penalty: Optional[float] = None,
-        response_format: Optional[Dict[str, str]] = None,
+        response_format: Optional[OpenAIResponseFormatParam] = None,
         seed: Optional[int] = None,
         stop: Optional[Union[str, List[str]]] = None,
         stream: Optional[bool] = None,
diff --git a/llama_stack/distribution/routers/routers.py b/llama_stack/distribution/routers/routers.py
index b9623ef3c..b9f363be0 100644
--- a/llama_stack/distribution/routers/routers.py
+++ b/llama_stack/distribution/routers/routers.py
@@ -37,7 +37,12 @@ from llama_stack.apis.inference import (
     ToolDefinition,
     ToolPromptFormat,
 )
-from llama_stack.apis.inference.inference import OpenAIChatCompletion, OpenAICompletion, OpenAIMessageParam
+from llama_stack.apis.inference.inference import (
+    OpenAIChatCompletion,
+    OpenAICompletion,
+    OpenAIMessageParam,
+    OpenAIResponseFormatParam,
+)
 from llama_stack.apis.models import Model, ModelType
 from llama_stack.apis.safety import RunShieldResponse, Safety
 from llama_stack.apis.scoring import (
@@ -530,7 +535,7 @@ class InferenceRouter(Inference):
         n: Optional[int] = None,
         parallel_tool_calls: Optional[bool] = None,
         presence_penalty: Optional[float] = None,
-        response_format: Optional[Dict[str, str]] = None,
+        response_format: Optional[OpenAIResponseFormatParam] = None,
         seed: Optional[int] = None,
         stop: Optional[Union[str, List[str]]] = None,
         stream: Optional[bool] = None,
diff --git a/llama_stack/providers/remote/inference/fireworks/fireworks.py b/llama_stack/providers/remote/inference/fireworks/fireworks.py
index b59e9f2cb..8385209f1 100644
--- a/llama_stack/providers/remote/inference/fireworks/fireworks.py
+++ b/llama_stack/providers/remote/inference/fireworks/fireworks.py
@@ -32,7 +32,12 @@ from llama_stack.apis.inference import (
     ToolDefinition,
     ToolPromptFormat,
 )
-from llama_stack.apis.inference.inference import OpenAIChatCompletion, OpenAICompletion, OpenAIMessageParam
+from llama_stack.apis.inference.inference import (
+    OpenAIChatCompletion,
+    OpenAICompletion,
+    OpenAIMessageParam,
+    OpenAIResponseFormatParam,
+)
 from llama_stack.distribution.request_headers import NeedsRequestProviderData
 from llama_stack.log import get_logger
 from llama_stack.providers.utils.inference.model_registry import (
@@ -336,7 +341,7 @@ class FireworksInferenceAdapter(ModelRegistryHelper, Inference, NeedsRequestProv
         n: Optional[int] = None,
         parallel_tool_calls: Optional[bool] = None,
         presence_penalty: Optional[float] = None,
-        response_format: Optional[Dict[str, str]] = None,
+        response_format: Optional[OpenAIResponseFormatParam] = None,
         seed: Optional[int] = None,
         stop: Optional[Union[str, List[str]]] = None,
         stream: Optional[bool] = None,
diff --git a/llama_stack/providers/remote/inference/nvidia/nvidia.py b/llama_stack/providers/remote/inference/nvidia/nvidia.py
index d6f717719..b2a244f11 100644
--- a/llama_stack/providers/remote/inference/nvidia/nvidia.py
+++ b/llama_stack/providers/remote/inference/nvidia/nvidia.py
@@ -35,7 +35,12 @@ from llama_stack.apis.inference import (
     ToolConfig,
     ToolDefinition,
 )
-from llama_stack.apis.inference.inference import OpenAIChatCompletion, OpenAICompletion, OpenAIMessageParam
+from llama_stack.apis.inference.inference import (
+    OpenAIChatCompletion,
+    OpenAICompletion,
+    OpenAIMessageParam,
+    OpenAIResponseFormatParam,
+)
 from llama_stack.models.llama.datatypes import ToolPromptFormat
 from llama_stack.providers.utils.inference.model_registry import (
     ModelRegistryHelper,
@@ -329,7 +334,7 @@ class NVIDIAInferenceAdapter(Inference, ModelRegistryHelper):
         n: Optional[int] = None,
         parallel_tool_calls: Optional[bool] = None,
         presence_penalty: Optional[float] = None,
-        response_format: Optional[Dict[str, str]] = None,
+        response_format: Optional[OpenAIResponseFormatParam] = None,
         seed: Optional[int] = None,
         stop: Optional[Union[str, List[str]]] = None,
         stream: Optional[bool] = None,
diff --git a/llama_stack/providers/remote/inference/ollama/ollama.py b/llama_stack/providers/remote/inference/ollama/ollama.py
index 33b48af46..a24d35ab2 100644
--- a/llama_stack/providers/remote/inference/ollama/ollama.py
+++ b/llama_stack/providers/remote/inference/ollama/ollama.py
@@ -39,7 +39,12 @@ from llama_stack.apis.inference import (
     ToolDefinition,
     ToolPromptFormat,
 )
-from llama_stack.apis.inference.inference import OpenAIChatCompletion, OpenAICompletion, OpenAIMessageParam
+from llama_stack.apis.inference.inference import (
+    OpenAIChatCompletion,
+    OpenAICompletion,
+    OpenAIMessageParam,
+    OpenAIResponseFormatParam,
+)
 from llama_stack.apis.models import Model, ModelType
 from llama_stack.log import get_logger
 from llama_stack.providers.datatypes import ModelsProtocolPrivate
@@ -393,7 +398,7 @@ class OllamaInferenceAdapter(
         n: Optional[int] = None,
         parallel_tool_calls: Optional[bool] = None,
         presence_penalty: Optional[float] = None,
-        response_format: Optional[Dict[str, str]] = None,
+        response_format: Optional[OpenAIResponseFormatParam] = None,
         seed: Optional[int] = None,
         stop: Optional[Union[str, List[str]]] = None,
         stream: Optional[bool] = None,
diff --git a/llama_stack/providers/remote/inference/passthrough/passthrough.py b/llama_stack/providers/remote/inference/passthrough/passthrough.py
index 0eb38c395..63054ae0a 100644
--- a/llama_stack/providers/remote/inference/passthrough/passthrough.py
+++ b/llama_stack/providers/remote/inference/passthrough/passthrough.py
@@ -26,7 +26,12 @@ from llama_stack.apis.inference import (
     ToolDefinition,
     ToolPromptFormat,
 )
-from llama_stack.apis.inference.inference import OpenAIChatCompletion, OpenAICompletion, OpenAIMessageParam
+from llama_stack.apis.inference.inference import (
+    OpenAIChatCompletion,
+    OpenAICompletion,
+    OpenAIMessageParam,
+    OpenAIResponseFormatParam,
+)
 from llama_stack.apis.models import Model
 from llama_stack.distribution.library_client import convert_pydantic_to_json_value, convert_to_pydantic
 from llama_stack.providers.utils.inference.model_registry import ModelRegistryHelper
@@ -266,7 +271,7 @@ class PassthroughInferenceAdapter(Inference):
         n: Optional[int] = None,
         parallel_tool_calls: Optional[bool] = None,
         presence_penalty: Optional[float] = None,
-        response_format: Optional[Dict[str, str]] = None,
+        response_format: Optional[OpenAIResponseFormatParam] = None,
         seed: Optional[int] = None,
         stop: Optional[Union[str, List[str]]] = None,
         stream: Optional[bool] = None,
diff --git a/llama_stack/providers/remote/inference/together/together.py b/llama_stack/providers/remote/inference/together/together.py
index 1615b8cd1..4ebf9956e 100644
--- a/llama_stack/providers/remote/inference/together/together.py
+++ b/llama_stack/providers/remote/inference/together/together.py
@@ -31,7 +31,12 @@ from llama_stack.apis.inference import (
     ToolDefinition,
     ToolPromptFormat,
 )
-from llama_stack.apis.inference.inference import OpenAIChatCompletion, OpenAICompletion, OpenAIMessageParam
+from llama_stack.apis.inference.inference import (
+    OpenAIChatCompletion,
+    OpenAICompletion,
+    OpenAIMessageParam,
+    OpenAIResponseFormatParam,
+)
 from llama_stack.distribution.request_headers import NeedsRequestProviderData
 from llama_stack.log import get_logger
 from llama_stack.providers.utils.inference.model_registry import ModelRegistryHelper
@@ -315,7 +320,7 @@ class TogetherInferenceAdapter(ModelRegistryHelper, Inference, NeedsRequestProvi
         n: Optional[int] = None,
         parallel_tool_calls: Optional[bool] = None,
         presence_penalty: Optional[float] = None,
-        response_format: Optional[Dict[str, str]] = None,
+        response_format: Optional[OpenAIResponseFormatParam] = None,
         seed: Optional[int] = None,
         stop: Optional[Union[str, List[str]]] = None,
         stream: Optional[bool] = None,
diff --git a/llama_stack/providers/remote/inference/vllm/vllm.py b/llama_stack/providers/remote/inference/vllm/vllm.py
index 0044d2e75..eca68e399 100644
--- a/llama_stack/providers/remote/inference/vllm/vllm.py
+++ b/llama_stack/providers/remote/inference/vllm/vllm.py
@@ -45,7 +45,12 @@ from llama_stack.apis.inference import (
     ToolDefinition,
     ToolPromptFormat,
 )
-from llama_stack.apis.inference.inference import OpenAIChatCompletion, OpenAICompletion, OpenAIMessageParam
+from llama_stack.apis.inference.inference import (
+    OpenAIChatCompletion,
+    OpenAICompletion,
+    OpenAIMessageParam,
+    OpenAIResponseFormatParam,
+)
 from llama_stack.apis.models import Model, ModelType
 from llama_stack.models.llama.datatypes import BuiltinTool, StopReason, ToolCall
 from llama_stack.models.llama.sku_list import all_registered_models
@@ -487,7 +492,7 @@ class VLLMInferenceAdapter(Inference, ModelsProtocolPrivate):
         n: Optional[int] = None,
         parallel_tool_calls: Optional[bool] = None,
         presence_penalty: Optional[float] = None,
-        response_format: Optional[Dict[str, str]] = None,
+        response_format: Optional[OpenAIResponseFormatParam] = None,
         seed: Optional[int] = None,
         stop: Optional[Union[str, List[str]]] = None,
         stream: Optional[bool] = None,
diff --git a/llama_stack/providers/utils/inference/litellm_openai_mixin.py b/llama_stack/providers/utils/inference/litellm_openai_mixin.py
index cd0f4ec67..6d98a0cb4 100644
--- a/llama_stack/providers/utils/inference/litellm_openai_mixin.py
+++ b/llama_stack/providers/utils/inference/litellm_openai_mixin.py
@@ -30,7 +30,12 @@ from llama_stack.apis.inference import (
     ToolDefinition,
     ToolPromptFormat,
 )
-from llama_stack.apis.inference.inference import OpenAIChatCompletion, OpenAICompletion, OpenAIMessageParam
+from llama_stack.apis.inference.inference import (
+    OpenAIChatCompletion,
+    OpenAICompletion,
+    OpenAIMessageParam,
+    OpenAIResponseFormatParam,
+)
 from llama_stack.apis.models.models import Model
 from llama_stack.distribution.request_headers import NeedsRequestProviderData
 from llama_stack.log import get_logger
@@ -270,7 +275,7 @@ class LiteLLMOpenAIMixin(
         guided_choice: Optional[List[str]] = None,
         prompt_logprobs: Optional[int] = None,
     ) -> OpenAICompletion:
-        model_obj = await self._get_model(model)
+        model_obj = await self.model_store.get_model(model)
         params = await prepare_openai_completion_params(
             model=model_obj.provider_resource_id,
             prompt=prompt,
@@ -308,7 +313,7 @@ class LiteLLMOpenAIMixin(
         n: Optional[int] = None,
         parallel_tool_calls: Optional[bool] = None,
         presence_penalty: Optional[float] = None,
-        response_format: Optional[Dict[str, str]] = None,
+        response_format: Optional[OpenAIResponseFormatParam] = None,
         seed: Optional[int] = None,
         stop: Optional[Union[str, List[str]]] = None,
         stream: Optional[bool] = None,
@@ -320,7 +325,7 @@ class LiteLLMOpenAIMixin(
         top_p: Optional[float] = None,
         user: Optional[str] = None,
     ) -> OpenAIChatCompletion:
-        model_obj = await self._get_model(model)
+        model_obj = await self.model_store.get_model(model)
         params = await prepare_openai_completion_params(
             model=model_obj.provider_resource_id,
             messages=messages,
diff --git a/llama_stack/providers/utils/inference/openai_compat.py b/llama_stack/providers/utils/inference/openai_compat.py
index f33cb4443..1fa202475 100644
--- a/llama_stack/providers/utils/inference/openai_compat.py
+++ b/llama_stack/providers/utils/inference/openai_compat.py
@@ -85,7 +85,12 @@ from llama_stack.apis.inference import (
     TopPSamplingStrategy,
     UserMessage,
 )
-from llama_stack.apis.inference.inference import OpenAIChatCompletion, OpenAICompletion, OpenAICompletionChoice
+from llama_stack.apis.inference.inference import (
+    OpenAIChatCompletion,
+    OpenAICompletion,
+    OpenAICompletionChoice,
+    OpenAIResponseFormatParam,
+)
 from llama_stack.models.llama.datatypes import (
     BuiltinTool,
     StopReason,
@@ -1080,7 +1085,20 @@ async def convert_openai_chat_completion_stream(
 
 
 async def prepare_openai_completion_params(**params):
-    completion_params = {k: v for k, v in params.items() if v is not None}
+    async def _prepare_value(value: Any) -> Any:
+        new_value = value
+        if isinstance(value, list):
+            new_value = [await _prepare_value(v) for v in value]
+        elif isinstance(value, dict):
+            new_value = {k: await _prepare_value(v) for k, v in value.items()}
+        elif isinstance(value, BaseModel):
+            new_value = value.model_dump(exclude_none=True)
+        return new_value
+
+    completion_params = {}
+    for k, v in params.items():
+        if v is not None:
+            completion_params[k] = await _prepare_value(v)
 
     return completion_params
 
@@ -1167,7 +1185,7 @@ class OpenAIChatCompletionUnsupportedMixin:
         n: Optional[int] = None,
         parallel_tool_calls: Optional[bool] = None,
         presence_penalty: Optional[float] = None,
-        response_format: Optional[Dict[str, str]] = None,
+        response_format: Optional[OpenAIResponseFormatParam] = None,
         seed: Optional[int] = None,
         stop: Optional[Union[str, List[str]]] = None,
         stream: Optional[bool] = None,
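
Illustrative usage sketch (not part of the patch): this shows how a caller might build the new discriminated `response_format` union and pass it to `openai_chat_completion` once this change lands. The `inference` handle, the model id, and the example JSON schema are assumptions made up for the example; only the imported class names come from the diff above.

# Hypothetical example, assuming an object implementing the Inference protocol
# and a registered model id; not an excerpt of the patch.
from llama_stack.apis.inference.inference import (
    OpenAIJSONSchema,
    OpenAIResponseFormatJSONSchema,
    OpenAIUserMessageParam,
)


async def extract_event(inference, model_id: str):
    # Build the json_schema variant of the response_format union.
    response_format = OpenAIResponseFormatJSONSchema(
        json_schema=OpenAIJSONSchema(
            name="calendar_event",  # example schema name
            schema={
                "type": "object",
                "properties": {"title": {"type": "string"}, "date": {"type": "string"}},
                "required": ["title", "date"],
            },
        ),
    )
    # prepare_openai_completion_params() (patched above) serializes this pydantic
    # model via model_dump(exclude_none=True) before the provider call.
    return await inference.openai_chat_completion(
        model=model_id,
        messages=[OpenAIUserMessageParam(content="Team sync on 2025-06-02. Extract the event as JSON.")],
        response_format=response_format,
    )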