diff --git a/docs/_static/llama-stack-spec.html b/docs/_static/llama-stack-spec.html index dbfe65960..6b858eecf 100644 --- a/docs/_static/llama-stack-spec.html +++ b/docs/_static/llama-stack-spec.html @@ -7241,6 +7241,79 @@ ], "title": "OpenAIResponseOutputMessageWebSearchToolCall" }, + "OpenAIResponseText": { + "type": "object", + "properties": { + "format": { + "type": "object", + "properties": { + "type": { + "oneOf": [ + { + "type": "string", + "const": "text" + }, + { + "type": "string", + "const": "json_schema" + }, + { + "type": "string", + "const": "json_object" + } + ], + "description": "Must be \"text\", \"json_schema\", or \"json_object\" to identify the format type" + }, + "name": { + "type": "string", + "description": "The name of the response format. Only used for json_schema." + }, + "schema": { + "type": "object", + "additionalProperties": { + "oneOf": [ + { + "type": "null" + }, + { + "type": "boolean" + }, + { + "type": "number" + }, + { + "type": "string" + }, + { + "type": "array" + }, + { + "type": "object" + } + ] + }, + "description": "The JSON schema the response should conform to. In a Python SDK, this is often a `pydantic` model. Only used for json_schema." + }, + "description": { + "type": "string", + "description": "(Optional) A description of the response format. Only used for json_schema." + }, + "strict": { + "type": "boolean", + "description": "(Optional) Whether to strictly enforce the JSON schema. If true, the response must match the schema exactly. Only used for json_schema." + } + }, + "additionalProperties": false, + "required": [ + "type" + ], + "title": "OpenAIResponseTextFormat", + "description": "Configuration for Responses API text format." + } + }, + "additionalProperties": false, + "title": "OpenAIResponseText" + }, "CreateOpenaiResponseRequest": { "type": "object", "properties": { @@ -7278,6 +7351,9 @@ "temperature": { "type": "number" }, + "text": { + "$ref": "#/components/schemas/OpenAIResponseText" + }, "tools": { "type": "array", "items": { @@ -7351,6 +7427,9 @@ "temperature": { "type": "number" }, + "text": { + "$ref": "#/components/schemas/OpenAIResponseText" + }, "top_p": { "type": "number" }, @@ -7369,7 +7448,8 @@ "object", "output", "parallel_tool_calls", - "status" + "status", + "text" ], "title": "OpenAIResponseObject" }, @@ -10406,6 +10486,9 @@ "temperature": { "type": "number" }, + "text": { + "$ref": "#/components/schemas/OpenAIResponseText" + }, "top_p": { "type": "number" }, @@ -10431,6 +10514,7 @@ "output", "parallel_tool_calls", "status", + "text", "input" ], "title": "OpenAIResponseObjectWithInput" diff --git a/docs/_static/llama-stack-spec.yaml b/docs/_static/llama-stack-spec.yaml index c185488b4..b5172e947 100644 --- a/docs/_static/llama-stack-spec.yaml +++ b/docs/_static/llama-stack-spec.yaml @@ -5118,6 +5118,57 @@ components: - type title: >- OpenAIResponseOutputMessageWebSearchToolCall + OpenAIResponseText: + type: object + properties: + format: + type: object + properties: + type: + oneOf: + - type: string + const: text + - type: string + const: json_schema + - type: string + const: json_object + description: >- + Must be "text", "json_schema", or "json_object" to identify the format + type + name: + type: string + description: >- + The name of the response format. Only used for json_schema. + schema: + type: object + additionalProperties: + oneOf: + - type: 'null' + - type: boolean + - type: number + - type: string + - type: array + - type: object + description: >- + The JSON schema the response should conform to. 
In a Python SDK, this + is often a `pydantic` model. Only used for json_schema. + description: + type: string + description: >- + (Optional) A description of the response format. Only used for json_schema. + strict: + type: boolean + description: >- + (Optional) Whether to strictly enforce the JSON schema. If true, the + response must match the schema exactly. Only used for json_schema. + additionalProperties: false + required: + - type + title: OpenAIResponseTextFormat + description: >- + Configuration for Responses API text format. + additionalProperties: false + title: OpenAIResponseText CreateOpenaiResponseRequest: type: object properties: @@ -5145,6 +5196,8 @@ components: type: boolean temperature: type: number + text: + $ref: '#/components/schemas/OpenAIResponseText' tools: type: array items: @@ -5196,6 +5249,8 @@ components: type: string temperature: type: number + text: + $ref: '#/components/schemas/OpenAIResponseText' top_p: type: number truncation: @@ -5211,6 +5266,7 @@ components: - output - parallel_tool_calls - status + - text title: OpenAIResponseObject OpenAIResponseOutput: oneOf: @@ -7288,6 +7344,8 @@ components: type: string temperature: type: number + text: + $ref: '#/components/schemas/OpenAIResponseText' top_p: type: number truncation: @@ -7307,6 +7365,7 @@ components: - output - parallel_tool_calls - status + - text - input title: OpenAIResponseObjectWithInput ListProvidersResponse: diff --git a/llama_stack/apis/agents/agents.py b/llama_stack/apis/agents/agents.py index 956f4a614..cc4ee0648 100644 --- a/llama_stack/apis/agents/agents.py +++ b/llama_stack/apis/agents/agents.py @@ -37,6 +37,7 @@ from .openai_responses import ( OpenAIResponseInputTool, OpenAIResponseObject, OpenAIResponseObjectStream, + OpenAIResponseText, ) # TODO: use enum.StrEnum when we drop support for python 3.10 @@ -603,6 +604,7 @@ class Agents(Protocol): store: bool | None = True, stream: bool | None = False, temperature: float | None = None, + text: OpenAIResponseText | None = None, tools: list[OpenAIResponseInputTool] | None = None, max_infer_iters: int | None = 10, # this is an extension to the OpenAI API ) -> OpenAIResponseObject | AsyncIterator[OpenAIResponseObjectStream]: diff --git a/llama_stack/apis/agents/openai_responses.py b/llama_stack/apis/agents/openai_responses.py index 6806e1d3f..6fa18b115 100644 --- a/llama_stack/apis/agents/openai_responses.py +++ b/llama_stack/apis/agents/openai_responses.py @@ -7,6 +7,7 @@ from typing import Annotated, Any, Literal from pydantic import BaseModel, Field +from typing_extensions import TypedDict from llama_stack.schema_utils import json_schema_type, register_schema @@ -126,6 +127,32 @@ OpenAIResponseOutput = Annotated[ register_schema(OpenAIResponseOutput, name="OpenAIResponseOutput") +# This has to be a TypedDict because we need a "schema" field and our strong +# typing code in the schema generator doesn't support Pydantic aliases. That also +# means we can't use a discriminator field here, because TypedDicts don't support +# default values which the strong typing code requires for discriminators. +class OpenAIResponseTextFormat(TypedDict, total=False): + """Configuration for Responses API text format. + + :param type: Must be "text", "json_schema", or "json_object" to identify the format type + :param name: The name of the response format. Only used for json_schema. + :param schema: The JSON schema the response should conform to. In a Python SDK, this is often a `pydantic` model. Only used for json_schema. 
+ :param description: (Optional) A description of the response format. Only used for json_schema. + :param strict: (Optional) Whether to strictly enforce the JSON schema. If true, the response must match the schema exactly. Only used for json_schema. + """ + + type: Literal["text"] | Literal["json_schema"] | Literal["json_object"] + name: str | None + schema: dict[str, Any] | None + description: str | None + strict: bool | None + + +@json_schema_type +class OpenAIResponseText(BaseModel): + format: OpenAIResponseTextFormat | None = None + + @json_schema_type class OpenAIResponseObject(BaseModel): created_at: int @@ -138,6 +165,9 @@ class OpenAIResponseObject(BaseModel): previous_response_id: str | None = None status: str temperature: float | None = None + # Default to text format to avoid breaking the loading of old responses + # before the field was added. New responses will have this set always. + text: OpenAIResponseText = OpenAIResponseText(format=OpenAIResponseTextFormat(type="text")) top_p: float | None = None truncation: str | None = None user: str | None = None diff --git a/llama_stack/providers/inline/agents/meta_reference/agents.py b/llama_stack/providers/inline/agents/meta_reference/agents.py index 854f8b285..4c3dcab15 100644 --- a/llama_stack/providers/inline/agents/meta_reference/agents.py +++ b/llama_stack/providers/inline/agents/meta_reference/agents.py @@ -29,6 +29,7 @@ from llama_stack.apis.agents import ( Session, Turn, ) +from llama_stack.apis.agents.openai_responses import OpenAIResponseText from llama_stack.apis.common.responses import PaginatedResponse from llama_stack.apis.inference import ( Inference, @@ -324,11 +325,12 @@ class MetaReferenceAgentsImpl(Agents): store: bool | None = True, stream: bool | None = False, temperature: float | None = None, + text: OpenAIResponseText | None = None, tools: list[OpenAIResponseInputTool] | None = None, max_infer_iters: int | None = 10, ) -> OpenAIResponseObject: return await self.openai_responses_impl.create_openai_response( - input, model, instructions, previous_response_id, store, stream, temperature, tools, max_infer_iters + input, model, instructions, previous_response_id, store, stream, temperature, text, tools, max_infer_iters ) async def list_openai_responses( diff --git a/llama_stack/providers/inline/agents/meta_reference/openai_responses.py b/llama_stack/providers/inline/agents/meta_reference/openai_responses.py index f4f1bac43..661f04ef1 100644 --- a/llama_stack/providers/inline/agents/meta_reference/openai_responses.py +++ b/llama_stack/providers/inline/agents/meta_reference/openai_responses.py @@ -37,6 +37,8 @@ from llama_stack.apis.agents.openai_responses import ( OpenAIResponseOutputMessageFunctionToolCall, OpenAIResponseOutputMessageMCPListTools, OpenAIResponseOutputMessageWebSearchToolCall, + OpenAIResponseText, + OpenAIResponseTextFormat, ) from llama_stack.apis.inference.inference import ( Inference, @@ -50,7 +52,12 @@ from llama_stack.apis.inference.inference import ( OpenAIChoice, OpenAIDeveloperMessageParam, OpenAIImageURL, + OpenAIJSONSchema, OpenAIMessageParam, + OpenAIResponseFormatJSONObject, + OpenAIResponseFormatJSONSchema, + OpenAIResponseFormatParam, + OpenAIResponseFormatText, OpenAISystemMessageParam, OpenAIToolMessageParam, OpenAIUserMessageParam, @@ -158,6 +165,21 @@ async def _convert_chat_choice_to_response_message(choice: OpenAIChoice) -> Open ) +async def _convert_response_text_to_chat_response_format(text: OpenAIResponseText) -> OpenAIResponseFormatParam: + """ + Convert an OpenAI Response 
text parameter into an OpenAI Chat Completion response format. + """ + if not text.format or text.format["type"] == "text": + return OpenAIResponseFormatText(type="text") + if text.format["type"] == "json_object": + return OpenAIResponseFormatJSONObject() + if text.format["type"] == "json_schema": + return OpenAIResponseFormatJSONSchema( + json_schema=OpenAIJSONSchema(name=text.format["name"], schema=text.format["schema"]) + ) + raise ValueError(f"Unsupported text format: {text.format}") + + async def _get_message_type_by_role(role: str): role_to_type = { "user": OpenAIUserMessageParam, @@ -180,6 +202,7 @@ class ChatCompletionContext(BaseModel): mcp_tool_to_server: dict[str, OpenAIResponseInputToolMCP] stream: bool temperature: float | None + response_format: OpenAIResponseFormatParam class OpenAIResponsesImpl: @@ -343,10 +366,12 @@ class OpenAIResponsesImpl: store: bool | None = True, stream: bool | None = False, temperature: float | None = None, + text: OpenAIResponseText | None = None, tools: list[OpenAIResponseInputTool] | None = None, max_infer_iters: int | None = 10, ): stream = False if stream is None else stream + text = OpenAIResponseText(format=OpenAIResponseTextFormat(type="text")) if text is None else text output_messages: list[OpenAIResponseOutput] = [] @@ -355,6 +380,9 @@ class OpenAIResponsesImpl: messages = await _convert_response_input_to_chat_messages(input) await self._prepend_instructions(messages, instructions) + # Structured outputs + response_format = await _convert_response_text_to_chat_response_format(text) + # Tool setup chat_tools, mcp_tool_to_server, mcp_list_message = ( await self._convert_response_tools_to_chat_tools(tools) if tools else (None, {}, None) @@ -369,6 +397,7 @@ class OpenAIResponsesImpl: mcp_tool_to_server=mcp_tool_to_server, stream=stream, temperature=temperature, + response_format=response_format, ) # Fork to streaming vs non-streaming - let each handle ALL inference rounds @@ -379,6 +408,7 @@ class OpenAIResponsesImpl: input=input, model=model, store=store, + text=text, tools=tools, max_infer_iters=max_infer_iters, ) @@ -389,6 +419,7 @@ class OpenAIResponsesImpl: input=input, model=model, store=store, + text=text, tools=tools, max_infer_iters=max_infer_iters, ) @@ -400,6 +431,7 @@ class OpenAIResponsesImpl: input: str | list[OpenAIResponseInput], model: str, store: bool | None, + text: OpenAIResponseText, tools: list[OpenAIResponseInputTool] | None, max_infer_iters: int | None, ) -> OpenAIResponseObject: @@ -416,6 +448,7 @@ class OpenAIResponsesImpl: tools=ctx.tools, stream=False, temperature=ctx.temperature, + response_format=ctx.response_format, ) current_response = OpenAIChatCompletion(**inference_result.model_dump()) @@ -470,6 +503,7 @@ class OpenAIResponsesImpl: object="response", status="completed", output=output_messages, + text=text, ) logger.debug(f"OpenAI Responses response: {response}") @@ -489,6 +523,7 @@ class OpenAIResponsesImpl: input: str | list[OpenAIResponseInput], model: str, store: bool | None, + text: OpenAIResponseText, tools: list[OpenAIResponseInputTool] | None, max_infer_iters: int | None, ) -> AsyncIterator[OpenAIResponseObjectStream]: @@ -503,6 +538,7 @@ class OpenAIResponsesImpl: object="response", status="in_progress", output=output_messages.copy(), + text=text, ) # Emit response.created immediately @@ -520,6 +556,7 @@ class OpenAIResponsesImpl: tools=ctx.tools, stream=True, temperature=ctx.temperature, + response_format=ctx.response_format, ) # Process streaming chunks and build complete response @@ -645,6 +682,7 
@@ class OpenAIResponsesImpl: model=model, object="response", status="completed", + text=text, output=output_messages, ) diff --git a/tests/unit/providers/agents/meta_reference/test_openai_responses.py b/tests/unit/providers/agents/meta_reference/test_openai_responses.py index 7a367e394..e524cc7d0 100644 --- a/tests/unit/providers/agents/meta_reference/test_openai_responses.py +++ b/tests/unit/providers/agents/meta_reference/test_openai_responses.py @@ -25,11 +25,17 @@ from llama_stack.apis.agents.openai_responses import ( OpenAIResponseObjectWithInput, OpenAIResponseOutputMessageContentOutputText, OpenAIResponseOutputMessageWebSearchToolCall, + OpenAIResponseText, + OpenAIResponseTextFormat, ) from llama_stack.apis.inference.inference import ( OpenAIAssistantMessageParam, OpenAIChatCompletionContentPartTextParam, OpenAIDeveloperMessageParam, + OpenAIJSONSchema, + OpenAIResponseFormatJSONObject, + OpenAIResponseFormatJSONSchema, + OpenAIResponseFormatText, OpenAIUserMessageParam, ) from llama_stack.apis.tools.tools import Tool, ToolGroups, ToolInvocationResult, ToolParameter, ToolRuntime @@ -96,6 +102,7 @@ async def test_create_openai_response_with_string_input(openai_responses_impl, m mock_inference_api.openai_chat_completion.assert_called_once_with( model=model, messages=[OpenAIUserMessageParam(role="user", content="What is the capital of Ireland?", name=None)], + response_format=OpenAIResponseFormatText(), tools=None, stream=False, temperature=0.1, @@ -320,6 +327,7 @@ async def test_prepend_previous_response_basic(openai_responses_impl, mock_respo model="fake_model", output=[response_output_message], status="completed", + text=OpenAIResponseText(format=OpenAIResponseTextFormat(type="text")), input=[input_item_message], ) mock_responses_store.get_response_object.return_value = previous_response @@ -362,6 +370,7 @@ async def test_prepend_previous_response_web_search(openai_responses_impl, mock_ model="fake_model", output=[output_web_search, output_message], status="completed", + text=OpenAIResponseText(format=OpenAIResponseTextFormat(type="text")), input=[input_item_message], ) mock_responses_store.get_response_object.return_value = response @@ -483,6 +492,7 @@ async def test_create_openai_response_with_instructions_and_previous_response( model="fake_model", output=[response_output_message], status="completed", + text=OpenAIResponseText(format=OpenAIResponseTextFormat(type="text")), input=[input_item_message], ) mock_responses_store.get_response_object.return_value = response @@ -576,6 +586,7 @@ async def test_responses_store_list_input_items_logic(): object="response", status="completed", output=[], + text=OpenAIResponseText(format=(OpenAIResponseTextFormat(type="text"))), input=input_items, ) @@ -644,6 +655,7 @@ async def test_store_response_uses_rehydrated_input_with_previous_response( created_at=1234567890, model="meta-llama/Llama-3.1-8B-Instruct", status="completed", + text=OpenAIResponseText(format=OpenAIResponseTextFormat(type="text")), input=[ OpenAIResponseMessage( id="msg-prev-user", role="user", content=[OpenAIResponseInputMessageContentText(text="What is 2+2?")] @@ -694,3 +706,61 @@ async def test_store_response_uses_rehydrated_input_with_previous_response( # Verify the response itself is correct assert result.model == model assert result.status == "completed" + + +@pytest.mark.asyncio +@pytest.mark.parametrize( + "text_format, response_format", + [ + (OpenAIResponseText(format=OpenAIResponseTextFormat(type="text")), OpenAIResponseFormatText()), + ( + 
OpenAIResponseText(format=OpenAIResponseTextFormat(name="Test", schema={"foo": "bar"}, type="json_schema")),
+            OpenAIResponseFormatJSONSchema(json_schema=OpenAIJSONSchema(name="Test", schema={"foo": "bar"})),
+        ),
+        (OpenAIResponseText(format=OpenAIResponseTextFormat(type="json_object")), OpenAIResponseFormatJSONObject()),
+        # ensure text param with no format specified defaults to text
+        (OpenAIResponseText(format=None), OpenAIResponseFormatText()),
+        # ensure text param of None defaults to text
+        (None, OpenAIResponseFormatText()),
+    ],
+)
+async def test_create_openai_response_with_text_format(
+    openai_responses_impl, mock_inference_api, text_format, response_format
+):
+    """Test creating Responses with text formats."""
+    # Setup
+    input_text = "How hot is it in San Francisco today?"
+    model = "meta-llama/Llama-3.1-8B-Instruct"
+
+    # Load the chat completion fixture
+    mock_chat_completion = load_chat_completion_fixture("simple_chat_completion.yaml")
+    mock_inference_api.openai_chat_completion.return_value = mock_chat_completion
+
+    # Execute
+    _result = await openai_responses_impl.create_openai_response(
+        input=input_text,
+        model=model,
+        text=text_format,
+    )
+
+    # Verify
+    first_call = mock_inference_api.openai_chat_completion.call_args_list[0]
+    assert first_call.kwargs["messages"][0].content == input_text
+    assert first_call.kwargs["response_format"] is not None
+    assert first_call.kwargs["response_format"] == response_format
+
+
+@pytest.mark.asyncio
+async def test_create_openai_response_with_invalid_text_format(openai_responses_impl, mock_inference_api):
+    """Test creating an OpenAI response with an invalid text format."""
+    # Setup
+    input_text = "How hot is it in San Francisco today?"
+    model = "meta-llama/Llama-3.1-8B-Instruct"
+
+    # Execute
+    with pytest.raises(ValueError):
+        _result = await openai_responses_impl.create_openai_response(
+            input=input_text,
+            model=model,
+            text=OpenAIResponseText(format={"type": "invalid"}),
+        )
diff --git a/tests/verifications/openai_api/test_responses.py b/tests/verifications/openai_api/test_responses.py
index c9b190e62..28020d3b1 100644
--- a/tests/verifications/openai_api/test_responses.py
+++ b/tests/verifications/openai_api/test_responses.py
@@ -546,3 +546,39 @@ async def test_response_streaming_multi_turn_tool_execution(
     assert expected_output.lower() in final_response.output_text.lower(), (
         f"Expected '{expected_output}' to appear in response: {final_response.output_text}"
     )
+
+
+@pytest.mark.parametrize(
+    "text_format",
+    # Not testing json_object because most providers don't actually support it.
+ [ + {"type": "text"}, + { + "type": "json_schema", + "name": "capitals", + "description": "A schema for the capital of each country", + "schema": {"type": "object", "properties": {"capital": {"type": "string"}}}, + "strict": True, + }, + ], +) +def test_response_text_format(request, openai_client, model, provider, verification_config, text_format): + if isinstance(openai_client, LlamaStackAsLibraryClient): + pytest.skip("Responses API text format is not yet supported in library client.") + + test_name_base = get_base_test_name(request) + if should_skip_test(verification_config, provider, model, test_name_base): + pytest.skip(f"Skipping {test_name_base} for model {model} on provider {provider} based on config.") + + stream = False + response = openai_client.responses.create( + model=model, + input="What is the capital of France?", + stream=stream, + text={"format": text_format}, + ) + # by_alias=True is needed because otherwise Pydantic renames our "schema" field + assert response.text.format.model_dump(exclude_none=True, by_alias=True) == text_format + assert "paris" in response.output_text.lower() + if text_format["type"] == "json_schema": + assert "paris" in json.loads(response.output_text)["capital"].lower()
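For reference (not part of the patch itself): a minimal sketch of how a client could exercise the new `text` parameter once this change is deployed, mirroring `test_response_text_format` above. The base URL, API key, and model name below are placeholders for whatever your Llama Stack deployment actually exposes; adjust them before running.

import json

from openai import OpenAI

# Placeholder endpoint and credentials -- point these at your own Llama Stack server.
client = OpenAI(base_url="http://localhost:8321/v1/openai/v1", api_key="none")

response = client.responses.create(
    model="meta-llama/Llama-3.1-8B-Instruct",
    input="What is the capital of France?",
    text={
        "format": {
            "type": "json_schema",
            "name": "capitals",
            "schema": {"type": "object", "properties": {"capital": {"type": "string"}}},
            "strict": True,
        }
    },
)

# With a json_schema format, output_text is expected to be JSON matching the schema above.
print(json.loads(response.output_text)["capital"])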