Mirror of https://github.com/meta-llama/llama-stack.git, synced 2025-06-28 10:54:19 +00:00
feat: Structured output for Responses API
This adds the missing `text` parameter to the Responses API, which is how users control structured outputs. All we do with that parameter is map it to the corresponding chat completion `response_format`. The unit tests exercise the various permutations allowed for this parameter, while a couple of new verification tests actually use it to verify that model outputs follow the requested format.

Signed-off-by: Ben Browning <bbrownin@redhat.com>
This commit is contained in: parent 4540c9b3e5, commit badf8594d1
8 changed files with 323 additions and 2 deletions
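For context, here is a minimal client-side sketch of the new parameter, modeled on the verification test added at the end of this commit. The `base_url`, API key, and model are illustrative assumptions, not values from this commit; any OpenAI-compatible client pointed at a Llama Stack server should work the same way.

```python
import json

from openai import OpenAI

# Illustrative endpoint and credentials; adjust for your deployment.
client = OpenAI(base_url="http://localhost:8321/v1/openai/v1", api_key="none")

response = client.responses.create(
    model="meta-llama/Llama-3.1-8B-Instruct",
    input="What is the capital of France?",
    text={
        "format": {
            "type": "json_schema",
            "name": "capitals",
            "schema": {"type": "object", "properties": {"capital": {"type": "string"}}},
            "strict": True,
        }
    },
)

# Internally the format above is mapped to the chat completion response_format,
# so output_text comes back as JSON conforming to the schema.
print(json.loads(response.output_text)["capital"])
```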
docs/_static/llama-stack-spec.html (vendored, 86 lines changed)

```diff
@@ -7241,6 +7241,79 @@
                 ],
                 "title": "OpenAIResponseOutputMessageWebSearchToolCall"
             },
+            "OpenAIResponseText": {
+                "type": "object",
+                "properties": {
+                    "format": {
+                        "type": "object",
+                        "properties": {
+                            "type": {
+                                "oneOf": [
+                                    {
+                                        "type": "string",
+                                        "const": "text"
+                                    },
+                                    {
+                                        "type": "string",
+                                        "const": "json_schema"
+                                    },
+                                    {
+                                        "type": "string",
+                                        "const": "json_object"
+                                    }
+                                ],
+                                "description": "Must be \"text\", \"json_schema\", or \"json_object\" to identify the format type"
+                            },
+                            "name": {
+                                "type": "string",
+                                "description": "The name of the response format. Only used for json_schema."
+                            },
+                            "schema": {
+                                "type": "object",
+                                "additionalProperties": {
+                                    "oneOf": [
+                                        {
+                                            "type": "null"
+                                        },
+                                        {
+                                            "type": "boolean"
+                                        },
+                                        {
+                                            "type": "number"
+                                        },
+                                        {
+                                            "type": "string"
+                                        },
+                                        {
+                                            "type": "array"
+                                        },
+                                        {
+                                            "type": "object"
+                                        }
+                                    ]
+                                },
+                                "description": "The JSON schema the response should conform to. In a Python SDK, this is often a `pydantic` model. Only used for json_schema."
+                            },
+                            "description": {
+                                "type": "string",
+                                "description": "(Optional) A description of the response format. Only used for json_schema."
+                            },
+                            "strict": {
+                                "type": "boolean",
+                                "description": "(Optional) Whether to strictly enforce the JSON schema. If true, the response must match the schema exactly. Only used for json_schema."
+                            }
+                        },
+                        "additionalProperties": false,
+                        "required": [
+                            "type"
+                        ],
+                        "title": "OpenAIResponseTextFormat",
+                        "description": "Configuration for Responses API text format."
+                    }
+                },
+                "additionalProperties": false,
+                "title": "OpenAIResponseText"
+            },
             "CreateOpenaiResponseRequest": {
                 "type": "object",
                 "properties": {
@@ -7278,6 +7351,9 @@
                 "temperature": {
                     "type": "number"
                 },
+                "text": {
+                    "$ref": "#/components/schemas/OpenAIResponseText"
+                },
                 "tools": {
                     "type": "array",
                     "items": {
@@ -7351,6 +7427,9 @@
                 "temperature": {
                     "type": "number"
                 },
+                "text": {
+                    "$ref": "#/components/schemas/OpenAIResponseText"
+                },
                 "top_p": {
                     "type": "number"
                 },
@@ -7369,7 +7448,8 @@
                 "object",
                 "output",
                 "parallel_tool_calls",
-                "status"
+                "status",
+                "text"
             ],
             "title": "OpenAIResponseObject"
         },
@@ -10406,6 +10486,9 @@
                 "temperature": {
                     "type": "number"
                 },
+                "text": {
+                    "$ref": "#/components/schemas/OpenAIResponseText"
+                },
                 "top_p": {
                     "type": "number"
                 },
@@ -10431,6 +10514,7 @@
                 "output",
                 "parallel_tool_calls",
                 "status",
+                "text",
                 "input"
             ],
             "title": "OpenAIResponseObjectWithInput"
```
docs/_static/llama-stack-spec.yaml (vendored, 59 lines changed)

```diff
@@ -5118,6 +5118,57 @@ components:
      - type
      title: >-
        OpenAIResponseOutputMessageWebSearchToolCall
+    OpenAIResponseText:
+      type: object
+      properties:
+        format:
+          type: object
+          properties:
+            type:
+              oneOf:
+                - type: string
+                  const: text
+                - type: string
+                  const: json_schema
+                - type: string
+                  const: json_object
+              description: >-
+                Must be "text", "json_schema", or "json_object" to identify the format
+                type
+            name:
+              type: string
+              description: >-
+                The name of the response format. Only used for json_schema.
+            schema:
+              type: object
+              additionalProperties:
+                oneOf:
+                  - type: 'null'
+                  - type: boolean
+                  - type: number
+                  - type: string
+                  - type: array
+                  - type: object
+              description: >-
+                The JSON schema the response should conform to. In a Python SDK, this
+                is often a `pydantic` model. Only used for json_schema.
+            description:
+              type: string
+              description: >-
+                (Optional) A description of the response format. Only used for json_schema.
+            strict:
+              type: boolean
+              description: >-
+                (Optional) Whether to strictly enforce the JSON schema. If true, the
+                response must match the schema exactly. Only used for json_schema.
+          additionalProperties: false
+          required:
+            - type
+          title: OpenAIResponseTextFormat
+          description: >-
+            Configuration for Responses API text format.
+      additionalProperties: false
+      title: OpenAIResponseText
    CreateOpenaiResponseRequest:
      type: object
      properties:
@@ -5145,6 +5196,8 @@ components:
          type: boolean
        temperature:
          type: number
+        text:
+          $ref: '#/components/schemas/OpenAIResponseText'
        tools:
          type: array
          items:
@@ -5196,6 +5249,8 @@ components:
          type: string
        temperature:
          type: number
+        text:
+          $ref: '#/components/schemas/OpenAIResponseText'
        top_p:
          type: number
        truncation:
@@ -5211,6 +5266,7 @@ components:
      - output
      - parallel_tool_calls
      - status
+      - text
      title: OpenAIResponseObject
    OpenAIResponseOutput:
      oneOf:
@@ -7288,6 +7344,8 @@ components:
          type: string
        temperature:
          type: number
+        text:
+          $ref: '#/components/schemas/OpenAIResponseText'
        top_p:
          type: number
        truncation:
@@ -7307,6 +7365,7 @@ components:
      - output
      - parallel_tool_calls
      - status
+      - text
      - input
      title: OpenAIResponseObjectWithInput
    ListProvidersResponse:
```
Agents API protocol:

```diff
@@ -37,6 +37,7 @@ from .openai_responses import (
     OpenAIResponseInputTool,
     OpenAIResponseObject,
     OpenAIResponseObjectStream,
+    OpenAIResponseText,
 )

 # TODO: use enum.StrEnum when we drop support for python 3.10
@@ -603,6 +604,7 @@ class Agents(Protocol):
         store: bool | None = True,
         stream: bool | None = False,
         temperature: float | None = None,
+        text: OpenAIResponseText | None = None,
         tools: list[OpenAIResponseInputTool] | None = None,
         max_infer_iters: int | None = 10,  # this is an extension to the OpenAI API
     ) -> OpenAIResponseObject | AsyncIterator[OpenAIResponseObjectStream]:
```
Responses API types (openai_responses):

```diff
@@ -7,6 +7,7 @@
 from typing import Annotated, Any, Literal

 from pydantic import BaseModel, Field
+from typing_extensions import TypedDict

 from llama_stack.schema_utils import json_schema_type, register_schema

@@ -126,6 +127,32 @@ OpenAIResponseOutput = Annotated[
 register_schema(OpenAIResponseOutput, name="OpenAIResponseOutput")


+# This has to be a TypedDict because we need a "schema" field and our strong
+# typing code in the schema generator doesn't support Pydantic aliases. That also
+# means we can't use a discriminator field here, because TypedDicts don't support
+# default values which the strong typing code requires for discriminators.
+class OpenAIResponseTextFormat(TypedDict, total=False):
+    """Configuration for Responses API text format.
+
+    :param type: Must be "text", "json_schema", or "json_object" to identify the format type
+    :param name: The name of the response format. Only used for json_schema.
+    :param schema: The JSON schema the response should conform to. In a Python SDK, this is often a `pydantic` model. Only used for json_schema.
+    :param description: (Optional) A description of the response format. Only used for json_schema.
+    :param strict: (Optional) Whether to strictly enforce the JSON schema. If true, the response must match the schema exactly. Only used for json_schema.
+    """
+
+    type: Literal["text"] | Literal["json_schema"] | Literal["json_object"]
+    name: str | None
+    schema: dict[str, Any] | None
+    description: str | None
+    strict: bool | None
+
+
+@json_schema_type
+class OpenAIResponseText(BaseModel):
+    format: OpenAIResponseTextFormat | None = None
+
+
 @json_schema_type
 class OpenAIResponseObject(BaseModel):
     created_at: int
@@ -138,6 +165,9 @@ class OpenAIResponseObject(BaseModel):
     previous_response_id: str | None = None
     status: str
     temperature: float | None = None
+    # Default to text format to avoid breaking the loading of old responses
+    # before the field was added. New responses will have this set always.
+    text: OpenAIResponseText = OpenAIResponseText(format=OpenAIResponseTextFormat(type="text"))
     top_p: float | None = None
     truncation: str | None = None
     user: str | None = None
```
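The docstring above notes that in a Python SDK the `schema` dict often comes from a `pydantic` model. A minimal sketch of building these types that way, assuming pydantic v2 (`model_json_schema()`) and a hypothetical `Capital` model:

```python
from pydantic import BaseModel

from llama_stack.apis.agents.openai_responses import (
    OpenAIResponseText,
    OpenAIResponseTextFormat,
)


class Capital(BaseModel):
    """Hypothetical structured-output model, for illustration only."""

    capital: str


# model_json_schema() emits the plain JSON-schema dict the "schema" field expects.
text = OpenAIResponseText(
    format=OpenAIResponseTextFormat(
        type="json_schema",
        name="capital",
        schema=Capital.model_json_schema(),
        strict=True,
    )
)
```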
Meta-reference agents implementation:

```diff
@@ -29,6 +29,7 @@ from llama_stack.apis.agents import (
     Session,
     Turn,
 )
+from llama_stack.apis.agents.openai_responses import OpenAIResponseText
 from llama_stack.apis.common.responses import PaginatedResponse
 from llama_stack.apis.inference import (
     Inference,
@@ -324,11 +325,12 @@ class MetaReferenceAgentsImpl(Agents):
         store: bool | None = True,
         stream: bool | None = False,
         temperature: float | None = None,
+        text: OpenAIResponseText | None = None,
         tools: list[OpenAIResponseInputTool] | None = None,
         max_infer_iters: int | None = 10,
     ) -> OpenAIResponseObject:
         return await self.openai_responses_impl.create_openai_response(
-            input, model, instructions, previous_response_id, store, stream, temperature, tools, max_infer_iters
+            input, model, instructions, previous_response_id, store, stream, temperature, text, tools, max_infer_iters
         )

     async def list_openai_responses(
```
OpenAI Responses implementation:

```diff
@@ -37,6 +37,8 @@ from llama_stack.apis.agents.openai_responses import (
     OpenAIResponseOutputMessageFunctionToolCall,
     OpenAIResponseOutputMessageMCPListTools,
     OpenAIResponseOutputMessageWebSearchToolCall,
+    OpenAIResponseText,
+    OpenAIResponseTextFormat,
 )
 from llama_stack.apis.inference.inference import (
     Inference,
@@ -50,7 +52,12 @@ from llama_stack.apis.inference.inference import (
     OpenAIChoice,
     OpenAIDeveloperMessageParam,
     OpenAIImageURL,
+    OpenAIJSONSchema,
     OpenAIMessageParam,
+    OpenAIResponseFormatJSONObject,
+    OpenAIResponseFormatJSONSchema,
+    OpenAIResponseFormatParam,
+    OpenAIResponseFormatText,
     OpenAISystemMessageParam,
     OpenAIToolMessageParam,
     OpenAIUserMessageParam,
@@ -158,6 +165,21 @@ async def _convert_chat_choice_to_response_message(choice: OpenAIChoice) -> Open
     )


+async def _convert_response_text_to_chat_response_format(text: OpenAIResponseText) -> OpenAIResponseFormatParam:
+    """
+    Convert an OpenAI Response text parameter into an OpenAI Chat Completion response format.
+    """
+    if not text.format or text.format["type"] == "text":
+        return OpenAIResponseFormatText(type="text")
+    if text.format["type"] == "json_object":
+        return OpenAIResponseFormatJSONObject()
+    if text.format["type"] == "json_schema":
+        return OpenAIResponseFormatJSONSchema(
+            json_schema=OpenAIJSONSchema(name=text.format["name"], schema=text.format["schema"])
+        )
+    raise ValueError(f"Unsupported text format: {text.format}")
+
+
 async def _get_message_type_by_role(role: str):
     role_to_type = {
         "user": OpenAIUserMessageParam,
```
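The shape difference worth noting in this helper: the Responses API takes the format fields flat under `text.format`, while the chat-completions `response_format` nests them under a `json_schema` key. A minimal sketch of that mapping, using plain dicts for the serialized forms with illustrative values:

```python
# Responses API input, serialized as a plain dict:
text = {
    "format": {
        "type": "json_schema",
        "name": "capitals",
        "schema": {"type": "object", "properties": {"capital": {"type": "string"}}},
    }
}

# What the helper above produces for the chat-completions call, serialized:
response_format = {
    "type": "json_schema",
    "json_schema": {
        "name": text["format"]["name"],
        "schema": text["format"]["schema"],
    },
}
```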
```diff
@@ -180,6 +202,7 @@ class ChatCompletionContext(BaseModel):
     mcp_tool_to_server: dict[str, OpenAIResponseInputToolMCP]
     stream: bool
     temperature: float | None
+    response_format: OpenAIResponseFormatParam


 class OpenAIResponsesImpl:
@@ -343,10 +366,12 @@ class OpenAIResponsesImpl:
         store: bool | None = True,
         stream: bool | None = False,
         temperature: float | None = None,
+        text: OpenAIResponseText | None = None,
         tools: list[OpenAIResponseInputTool] | None = None,
         max_infer_iters: int | None = 10,
     ):
         stream = False if stream is None else stream
+        text = OpenAIResponseText(format=OpenAIResponseTextFormat(type="text")) if text is None else text

         output_messages: list[OpenAIResponseOutput] = []

@@ -355,6 +380,9 @@ class OpenAIResponsesImpl:
         messages = await _convert_response_input_to_chat_messages(input)
         await self._prepend_instructions(messages, instructions)

+        # Structured outputs
+        response_format = await _convert_response_text_to_chat_response_format(text)
+
         # Tool setup
         chat_tools, mcp_tool_to_server, mcp_list_message = (
             await self._convert_response_tools_to_chat_tools(tools) if tools else (None, {}, None)
@@ -369,6 +397,7 @@
             mcp_tool_to_server=mcp_tool_to_server,
             stream=stream,
             temperature=temperature,
+            response_format=response_format,
         )

         # Fork to streaming vs non-streaming - let each handle ALL inference rounds
@@ -379,6 +408,7 @@
                 input=input,
                 model=model,
                 store=store,
+                text=text,
                 tools=tools,
                 max_infer_iters=max_infer_iters,
             )
@@ -389,6 +419,7 @@
                 input=input,
                 model=model,
                 store=store,
+                text=text,
                 tools=tools,
                 max_infer_iters=max_infer_iters,
             )
@@ -400,6 +431,7 @@
         input: str | list[OpenAIResponseInput],
         model: str,
         store: bool | None,
+        text: OpenAIResponseText,
         tools: list[OpenAIResponseInputTool] | None,
         max_infer_iters: int | None,
     ) -> OpenAIResponseObject:
@@ -416,6 +448,7 @@
             tools=ctx.tools,
             stream=False,
             temperature=ctx.temperature,
+            response_format=ctx.response_format,
         )
         current_response = OpenAIChatCompletion(**inference_result.model_dump())

@@ -470,6 +503,7 @@
             object="response",
             status="completed",
             output=output_messages,
+            text=text,
         )
         logger.debug(f"OpenAI Responses response: {response}")

@@ -489,6 +523,7 @@
         input: str | list[OpenAIResponseInput],
         model: str,
         store: bool | None,
+        text: OpenAIResponseText,
         tools: list[OpenAIResponseInputTool] | None,
         max_infer_iters: int | None,
     ) -> AsyncIterator[OpenAIResponseObjectStream]:
@@ -503,6 +538,7 @@
             object="response",
             status="in_progress",
             output=output_messages.copy(),
+            text=text,
         )

         # Emit response.created immediately
@@ -520,6 +556,7 @@
             tools=ctx.tools,
             stream=True,
             temperature=ctx.temperature,
+            response_format=ctx.response_format,
         )

         # Process streaming chunks and build complete response
@@ -645,6 +682,7 @@
             model=model,
             object="response",
             status="completed",
+            text=text,
             output=output_messages,
         )
```
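Since both the streaming and non-streaming branches now carry the converted response format, the same `text` parameter applies when streaming. A hedged client-side sketch, reusing the illustrative client and model from the first example (note the verification tests below skip `json_object` because most providers don't actually support it):

```python
from openai import OpenAI

# Same illustrative endpoint as in the first sketch; adjust for your deployment.
client = OpenAI(base_url="http://localhost:8321/v1/openai/v1", api_key="none")

# The text parameter flows through the streaming branch; the terminal
# response.completed event carries the final response, including its text field.
stream = client.responses.create(
    model="meta-llama/Llama-3.1-8B-Instruct",
    input="What is the capital of France? Answer in JSON.",
    stream=True,
    text={"format": {"type": "json_object"}},
)
for event in stream:
    if event.type == "response.completed":
        print(event.response.output_text)
```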
Unit tests:

```diff
@@ -25,11 +25,17 @@ from llama_stack.apis.agents.openai_responses import (
     OpenAIResponseObjectWithInput,
     OpenAIResponseOutputMessageContentOutputText,
     OpenAIResponseOutputMessageWebSearchToolCall,
+    OpenAIResponseText,
+    OpenAIResponseTextFormat,
 )
 from llama_stack.apis.inference.inference import (
     OpenAIAssistantMessageParam,
     OpenAIChatCompletionContentPartTextParam,
     OpenAIDeveloperMessageParam,
+    OpenAIJSONSchema,
+    OpenAIResponseFormatJSONObject,
+    OpenAIResponseFormatJSONSchema,
+    OpenAIResponseFormatText,
     OpenAIUserMessageParam,
 )
 from llama_stack.apis.tools.tools import Tool, ToolGroups, ToolInvocationResult, ToolParameter, ToolRuntime
@@ -96,6 +102,7 @@ async def test_create_openai_response_with_string_input(openai_responses_impl, m
     mock_inference_api.openai_chat_completion.assert_called_once_with(
         model=model,
         messages=[OpenAIUserMessageParam(role="user", content="What is the capital of Ireland?", name=None)],
+        response_format=OpenAIResponseFormatText(),
         tools=None,
         stream=False,
         temperature=0.1,
@@ -320,6 +327,7 @@ async def test_prepend_previous_response_basic(openai_responses_impl, mock_respo
         model="fake_model",
         output=[response_output_message],
         status="completed",
+        text=OpenAIResponseText(format=OpenAIResponseTextFormat(type="text")),
         input=[input_item_message],
     )
     mock_responses_store.get_response_object.return_value = previous_response
@@ -362,6 +370,7 @@ async def test_prepend_previous_response_web_search(openai_responses_impl, mock_
         model="fake_model",
         output=[output_web_search, output_message],
         status="completed",
+        text=OpenAIResponseText(format=OpenAIResponseTextFormat(type="text")),
         input=[input_item_message],
     )
     mock_responses_store.get_response_object.return_value = response
@@ -483,6 +492,7 @@ async def test_create_openai_response_with_instructions_and_previous_response(
         model="fake_model",
         output=[response_output_message],
         status="completed",
+        text=OpenAIResponseText(format=OpenAIResponseTextFormat(type="text")),
         input=[input_item_message],
     )
     mock_responses_store.get_response_object.return_value = response
@@ -576,6 +586,7 @@ async def test_responses_store_list_input_items_logic():
         object="response",
         status="completed",
         output=[],
+        text=OpenAIResponseText(format=(OpenAIResponseTextFormat(type="text"))),
         input=input_items,
     )

@@ -644,6 +655,7 @@ async def test_store_response_uses_rehydrated_input_with_previous_response(
         created_at=1234567890,
         model="meta-llama/Llama-3.1-8B-Instruct",
         status="completed",
+        text=OpenAIResponseText(format=OpenAIResponseTextFormat(type="text")),
         input=[
             OpenAIResponseMessage(
                 id="msg-prev-user", role="user", content=[OpenAIResponseInputMessageContentText(text="What is 2+2?")]
@@ -694,3 +706,61 @@ async def test_store_response_uses_rehydrated_input_with_previous_response(
     # Verify the response itself is correct
     assert result.model == model
     assert result.status == "completed"
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize(
+    "text_format, response_format",
+    [
+        (OpenAIResponseText(format=OpenAIResponseTextFormat(type="text")), OpenAIResponseFormatText()),
+        (
+            OpenAIResponseText(format=OpenAIResponseTextFormat(name="Test", schema={"foo": "bar"}, type="json_schema")),
+            OpenAIResponseFormatJSONSchema(json_schema=OpenAIJSONSchema(name="Test", schema={"foo": "bar"})),
+        ),
+        (OpenAIResponseText(format=OpenAIResponseTextFormat(type="json_object")), OpenAIResponseFormatJSONObject()),
+        # ensure text param with no format specified defaults to text
+        (OpenAIResponseText(format=None), OpenAIResponseFormatText()),
+        # ensure text param of None defaults to text
+        (None, OpenAIResponseFormatText()),
+    ],
+)
+async def test_create_openai_response_with_text_format(
+    openai_responses_impl, mock_inference_api, text_format, response_format
+):
+    """Test creating Responses with text formats."""
+    # Setup
+    input_text = "How hot it is in San Francisco today?"
+    model = "meta-llama/Llama-3.1-8B-Instruct"
+
+    # Load the chat completion fixture
+    mock_chat_completion = load_chat_completion_fixture("simple_chat_completion.yaml")
+    mock_inference_api.openai_chat_completion.return_value = mock_chat_completion
+
+    # Execute
+    _result = await openai_responses_impl.create_openai_response(
+        input=input_text,
+        model=model,
+        text=text_format,
+    )
+
+    # Verify
+    first_call = mock_inference_api.openai_chat_completion.call_args_list[0]
+    assert first_call.kwargs["messages"][0].content == input_text
+    assert first_call.kwargs["response_format"] is not None
+    assert first_call.kwargs["response_format"] == response_format
+
+
+@pytest.mark.asyncio
+async def test_create_openai_response_with_invalid_text_format(openai_responses_impl, mock_inference_api):
+    """Test creating an OpenAI response with an invalid text format."""
+    # Setup
+    input_text = "How hot it is in San Francisco today?"
+    model = "meta-llama/Llama-3.1-8B-Instruct"
+
+    # Execute
+    with pytest.raises(ValueError):
+        _result = await openai_responses_impl.create_openai_response(
+            input=input_text,
+            model=model,
+            text=OpenAIResponseText(format={"type": "invalid"}),
+        )
```
Verification tests:

```diff
@@ -546,3 +546,39 @@ async def test_response_streaming_multi_turn_tool_execution(
     assert expected_output.lower() in final_response.output_text.lower(), (
         f"Expected '{expected_output}' to appear in response: {final_response.output_text}"
     )
+
+
+@pytest.mark.parametrize(
+    "text_format",
+    # Not testing json_object because most providers don't actually support it.
+    [
+        {"type": "text"},
+        {
+            "type": "json_schema",
+            "name": "capitals",
+            "description": "A schema for the capital of each country",
+            "schema": {"type": "object", "properties": {"capital": {"type": "string"}}},
+            "strict": True,
+        },
+    ],
+)
+def test_response_text_format(request, openai_client, model, provider, verification_config, text_format):
+    if isinstance(openai_client, LlamaStackAsLibraryClient):
+        pytest.skip("Responses API text format is not yet supported in library client.")
+
+    test_name_base = get_base_test_name(request)
+    if should_skip_test(verification_config, provider, model, test_name_base):
+        pytest.skip(f"Skipping {test_name_base} for model {model} on provider {provider} based on config.")
+
+    stream = False
+    response = openai_client.responses.create(
+        model=model,
+        input="What is the capital of France?",
+        stream=stream,
+        text={"format": text_format},
+    )
+    # by_alias=True is needed because otherwise Pydantic renames our "schema" field
+    assert response.text.format.model_dump(exclude_none=True, by_alias=True) == text_format
+    assert "paris" in response.output_text.lower()
+    if text_format["type"] == "json_schema":
+        assert "paris" in json.loads(response.output_text)["capital"].lower()
```
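On the client side, once a `json_schema` format round-trips like this, the output can be validated back into the model that produced the schema. A short sketch under the same assumptions as the earlier example (hypothetical `Capital` model, pydantic v2):

```python
import json

from pydantic import BaseModel


class Capital(BaseModel):
    """Hypothetical model matching the "capitals" schema used in the test above."""

    capital: str


# Stand-in for response.output_text from a json_schema-formatted response.
output_text = '{"capital": "Paris"}'

# With strict json_schema enforcement, the output parses and validates cleanly.
capital = Capital.model_validate(json.loads(output_text))
assert capital.capital == "Paris"
```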