feat: Structured output for Responses API (#2324)

# What does this PR do? This adds the missing `text` parameter to the Responses API, which is how users control structured outputs. All we do with that parameter is map it to the corresponding chat completion `response_format`. ## Test Plan The new unit tests exercise the various permutations allowed for this property, while a couple of new verification tests actually use it for real to verify the model outputs are following the format as expected. Unit tests: `python -m pytest -s -v tests/unit/providers/agents/meta_reference/test_openai_responses.py` Verification tests: ``` llama stack run llama_stack/templates/together/run.yaml pytest -s -vv 'tests/verifications/openai_api/test_responses.py' \ --base-url=http://localhost:8321/v1/openai/v1 \ --model meta-llama/Llama-4-Scout-17B-16E-Instruct ``` Note that the verification tests can only be run with a real Llama Stack server (as opposed to using the library client via `--provider=stack:together`) because the Llama Stack Python client is not yet updated to accept this `text` field. Signed-off-by: Ben Browning <bbrownin@redhat.com>
2025-12-03 09:53:45 +00:00 · 2025-06-03 17:43:00 -04:00 · 2025-06-03 17:43:00 -04:00 · 8bee2954be
commit 8bee2954be
parent c70ca8344f
8 changed files with 323 additions and 2 deletions
--- a/llama_stack/providers/inline/agents/meta_reference/agents.py
+++ b/llama_stack/providers/inline/agents/meta_reference/agents.py
@ -29,6 +29,7 @@ from llama_stack.apis.agents import (
    Session,
    Turn,
 )
+from llama_stack.apis.agents.openai_responses import OpenAIResponseText
 from llama_stack.apis.common.responses import PaginatedResponse
 from llama_stack.apis.inference import (
    Inference,
@ -324,11 +325,12 @@ class MetaReferenceAgentsImpl(Agents):
        store: bool | None = True,
        stream: bool | None = False,
        temperature: float | None = None,
+        text: OpenAIResponseText | None = None,
        tools: list[OpenAIResponseInputTool] | None = None,
        max_infer_iters: int | None = 10,
    ) -> OpenAIResponseObject:
        return await self.openai_responses_impl.create_openai_response(
-            input, model, instructions, previous_response_id, store, stream, temperature, tools, max_infer_iters
+            input, model, instructions, previous_response_id, store, stream, temperature, text, tools, max_infer_iters
        )

    async def list_openai_responses(
--- a/llama_stack/providers/inline/agents/meta_reference/openai_responses.py
+++ b/llama_stack/providers/inline/agents/meta_reference/openai_responses.py
@ -37,6 +37,8 @@ from llama_stack.apis.agents.openai_responses import (
    OpenAIResponseOutputMessageFunctionToolCall,
    OpenAIResponseOutputMessageMCPListTools,
    OpenAIResponseOutputMessageWebSearchToolCall,
+    OpenAIResponseText,
+    OpenAIResponseTextFormat,
 )
 from llama_stack.apis.inference.inference import (
    Inference,
@ -50,7 +52,12 @@ from llama_stack.apis.inference.inference import (
    OpenAIChoice,
    OpenAIDeveloperMessageParam,
    OpenAIImageURL,
+    OpenAIJSONSchema,
    OpenAIMessageParam,
+    OpenAIResponseFormatJSONObject,
+    OpenAIResponseFormatJSONSchema,
+    OpenAIResponseFormatParam,
+    OpenAIResponseFormatText,
    OpenAISystemMessageParam,
    OpenAIToolMessageParam,
    OpenAIUserMessageParam,
@ -158,6 +165,21 @@ async def _convert_chat_choice_to_response_message(choice: OpenAIChoice) -> Open
    )


+async def _convert_response_text_to_chat_response_format(text: OpenAIResponseText) -> OpenAIResponseFormatParam:
+    """
+    Translate a Responses API ``text`` parameter into a Chat Completions response format.
+
+    :param text: Responses API text configuration; its ``format`` mapping selects the output mode.
+    :returns: A plain-text format when ``format`` is empty or has type ``"text"``, a JSON-object
+        format for type ``"json_object"``, or a JSON-schema format for type ``"json_schema"``.
+    :raises ValueError: If ``format["type"]`` is not one of the values handled above.
+    """
+    text_format = text.format
+    # An absent/empty format is treated the same as an explicit plain-text request.
+    if not text_format:
+        return OpenAIResponseFormatText(type="text")
+
+    format_type = text_format["type"]
+    if format_type == "text":
+        return OpenAIResponseFormatText(type="text")
+    if format_type == "json_object":
+        return OpenAIResponseFormatJSONObject()
+    if format_type == "json_schema":
+        # Only the schema's name and body are carried over to the chat completion format.
+        json_schema = OpenAIJSONSchema(name=text_format["name"], schema=text_format["schema"])
+        return OpenAIResponseFormatJSONSchema(json_schema=json_schema)
+    raise ValueError(f"Unsupported text format: {text.format}")
+
+
 async def _get_message_type_by_role(role: str):
    role_to_type = {
        "user": OpenAIUserMessageParam,
@ -180,6 +202,7 @@ class ChatCompletionContext(BaseModel):
    mcp_tool_to_server: dict[str, OpenAIResponseInputToolMCP]
    stream: bool
    temperature: float | None
+    response_format: OpenAIResponseFormatParam


 class OpenAIResponsesImpl:
@ -343,10 +366,12 @@ class OpenAIResponsesImpl:
        store: bool | None = True,
        stream: bool | None = False,
        temperature: float | None = None,
+        text: OpenAIResponseText | None = None,
        tools: list[OpenAIResponseInputTool] | None = None,
        max_infer_iters: int | None = 10,
    ):
        stream = False if stream is None else stream
+        text = OpenAIResponseText(format=OpenAIResponseTextFormat(type="text")) if text is None else text

        output_messages: list[OpenAIResponseOutput] = []

@ -355,6 +380,9 @@ class OpenAIResponsesImpl:
        messages = await _convert_response_input_to_chat_messages(input)
        await self._prepend_instructions(messages, instructions)

+        # Structured outputs
+        response_format = await _convert_response_text_to_chat_response_format(text)
+
        # Tool setup
        chat_tools, mcp_tool_to_server, mcp_list_message = (
            await self._convert_response_tools_to_chat_tools(tools) if tools else (None, {}, None)
@ -369,6 +397,7 @@ class OpenAIResponsesImpl:
            mcp_tool_to_server=mcp_tool_to_server,
            stream=stream,
            temperature=temperature,
+            response_format=response_format,
        )

        # Fork to streaming vs non-streaming - let each handle ALL inference rounds
@ -379,6 +408,7 @@ class OpenAIResponsesImpl:
                input=input,
                model=model,
                store=store,
+                text=text,
                tools=tools,
                max_infer_iters=max_infer_iters,
            )
@ -389,6 +419,7 @@ class OpenAIResponsesImpl:
                input=input,
                model=model,
                store=store,
+                text=text,
                tools=tools,
                max_infer_iters=max_infer_iters,
            )
@ -400,6 +431,7 @@ class OpenAIResponsesImpl:
        input: str | list[OpenAIResponseInput],
        model: str,
        store: bool | None,
+        text: OpenAIResponseText,
        tools: list[OpenAIResponseInputTool] | None,
        max_infer_iters: int | None,
    ) -> OpenAIResponseObject:
@ -416,6 +448,7 @@ class OpenAIResponsesImpl:
                tools=ctx.tools,
                stream=False,
                temperature=ctx.temperature,
+                response_format=ctx.response_format,
            )
            current_response = OpenAIChatCompletion(**inference_result.model_dump())

@ -470,6 +503,7 @@ class OpenAIResponsesImpl:
            object="response",
            status="completed",
            output=output_messages,
+            text=text,
        )
        logger.debug(f"OpenAI Responses response: {response}")

@ -489,6 +523,7 @@ class OpenAIResponsesImpl:
        input: str | list[OpenAIResponseInput],
        model: str,
        store: bool | None,
+        text: OpenAIResponseText,
        tools: list[OpenAIResponseInputTool] | None,
        max_infer_iters: int | None,
    ) -> AsyncIterator[OpenAIResponseObjectStream]:
@ -503,6 +538,7 @@ class OpenAIResponsesImpl:
            object="response",
            status="in_progress",
            output=output_messages.copy(),
+            text=text,
        )

        # Emit response.created immediately
@ -520,6 +556,7 @@ class OpenAIResponsesImpl:
                tools=ctx.tools,
                stream=True,
                temperature=ctx.temperature,
+                response_format=ctx.response_format,
            )

            # Process streaming chunks and build complete response
@ -645,6 +682,7 @@ class OpenAIResponsesImpl:
            model=model,
            object="response",
            status="completed",
+            text=text,
            output=output_messages,
        )