Mirror of https://github.com/meta-llama/llama-stack.git, synced 2025-06-27 18:50:41 +00:00
feat: Structured output for Responses API (#2324)
# What does this PR do?

This adds the missing `text` parameter to the Responses API, which is how users control structured outputs. All we do with that parameter is map it to the corresponding chat completion `response_format`.

## Test Plan

The new unit tests exercise the various permutations allowed for this property, while a couple of new verification tests actually use it against a real model to verify the outputs follow the requested format.

Unit tests: `python -m pytest -s -v tests/unit/providers/agents/meta_reference/test_openai_responses.py`

Verification tests:

```
llama stack run llama_stack/templates/together/run.yaml
pytest -s -vv 'tests/verifications/openai_api/test_responses.py' \
  --base-url=http://localhost:8321/v1/openai/v1 \
  --model meta-llama/Llama-4-Scout-17B-16E-Instruct
```

Note that the verification tests can only be run with a real Llama Stack server (as opposed to using the library client via `--provider=stack:together`) because the Llama Stack python client is not yet updated to accept this text field.

Signed-off-by: Ben Browning <bbrownin@redhat.com>
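For illustration, here is a minimal sketch of what the new parameter looks like from a client, assuming the server and model from the test plan above and the stock `openai` Python client (the Llama Stack client does not yet accept `text`); the prompt, schema, and `capital_city` name are made up for this example:

```python
# Illustrative only: structured output via the Responses API `text` parameter.
# Assumes a Llama Stack server running as in the test plan above; the schema,
# prompt, and "capital_city" name are invented for the example.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8321/v1/openai/v1", api_key="none")

response = client.responses.create(
    model="meta-llama/Llama-4-Scout-17B-16E-Instruct",
    input="What is the capital of France? Answer as JSON.",
    # This is the field added by this PR; internally it is mapped to the
    # chat completion response_format.
    text={
        "format": {
            "type": "json_schema",
            "name": "capital_city",
            "schema": {
                "type": "object",
                "properties": {"capital": {"type": "string"}},
                "required": ["capital"],
            },
            "strict": True,
        }
    },
)
print(response.output_text)  # e.g. {"capital": "Paris"}
```

The shape of `text.format` mirrors the upstream OpenAI Responses API, so the same payload should work against either backend.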
This commit is contained in:
parent c70ca8344f
commit 8bee2954be

8 changed files with 323 additions and 2 deletions
@@ -37,6 +37,7 @@ from .openai_responses import (
     OpenAIResponseInputTool,
     OpenAIResponseObject,
     OpenAIResponseObjectStream,
+    OpenAIResponseText,
 )

 # TODO: use enum.StrEnum when we drop support for python 3.10
@@ -603,6 +604,7 @@ class Agents(Protocol):
         store: bool | None = True,
         stream: bool | None = False,
         temperature: float | None = None,
+        text: OpenAIResponseText | None = None,
         tools: list[OpenAIResponseInputTool] | None = None,
         max_infer_iters: int | None = 10,  # this is an extension to the OpenAI API
     ) -> OpenAIResponseObject | AsyncIterator[OpenAIResponseObjectStream]:
@@ -7,6 +7,7 @@
 from typing import Annotated, Any, Literal

 from pydantic import BaseModel, Field
+from typing_extensions import TypedDict

 from llama_stack.schema_utils import json_schema_type, register_schema

@@ -126,6 +127,32 @@ OpenAIResponseOutput = Annotated[
 register_schema(OpenAIResponseOutput, name="OpenAIResponseOutput")


+# This has to be a TypedDict because we need a "schema" field and our strong
+# typing code in the schema generator doesn't support Pydantic aliases. That also
+# means we can't use a discriminator field here, because TypedDicts don't support
+# default values which the strong typing code requires for discriminators.
+class OpenAIResponseTextFormat(TypedDict, total=False):
+    """Configuration for Responses API text format.
+
+    :param type: Must be "text", "json_schema", or "json_object" to identify the format type
+    :param name: The name of the response format. Only used for json_schema.
+    :param schema: The JSON schema the response should conform to. In a Python SDK, this is often a `pydantic` model. Only used for json_schema.
+    :param description: (Optional) A description of the response format. Only used for json_schema.
+    :param strict: (Optional) Whether to strictly enforce the JSON schema. If true, the response must match the schema exactly. Only used for json_schema.
+    """
+
+    type: Literal["text"] | Literal["json_schema"] | Literal["json_object"]
+    name: str | None
+    schema: dict[str, Any] | None
+    description: str | None
+    strict: bool | None
+
+
+@json_schema_type
+class OpenAIResponseText(BaseModel):
+    format: OpenAIResponseTextFormat | None = None
+
+
 @json_schema_type
 class OpenAIResponseObject(BaseModel):
     created_at: int
@@ -138,6 +165,9 @@ class OpenAIResponseObject(BaseModel):
     previous_response_id: str | None = None
     status: str
     temperature: float | None = None
+    # Default to text format to avoid breaking the loading of old responses
+    # before the field was added. New responses will have this set always.
+    text: OpenAIResponseText = OpenAIResponseText(format=OpenAIResponseTextFormat(type="text"))
    top_p: float | None = None
     truncation: str | None = None
     user: str | None = None
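To make the allowed permutations concrete, a small sketch constructing the new types directly; the import path matches the one used in the meta-reference provider below, while the `weather_report` name and schema contents are illustrative:

```python
from llama_stack.apis.agents.openai_responses import (
    OpenAIResponseText,
    OpenAIResponseTextFormat,
)

# Default / plain-text output (also what old stored responses are back-filled with)
text_plain = OpenAIResponseText(format=OpenAIResponseTextFormat(type="text"))

# Any valid JSON object, no particular schema enforced
text_json = OpenAIResponseText(format=OpenAIResponseTextFormat(type="json_object"))

# JSON constrained to a schema; name and schema here are illustrative values
text_schema = OpenAIResponseText(
    format=OpenAIResponseTextFormat(
        type="json_schema",
        name="weather_report",
        schema={
            "type": "object",
            "properties": {"city": {"type": "string"}, "temp_c": {"type": "number"}},
            "required": ["city", "temp_c"],
        },
        strict=True,
    )
)
```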
@@ -29,6 +29,7 @@ from llama_stack.apis.agents import (
     Session,
     Turn,
 )
+from llama_stack.apis.agents.openai_responses import OpenAIResponseText
 from llama_stack.apis.common.responses import PaginatedResponse
 from llama_stack.apis.inference import (
     Inference,
@@ -324,11 +325,12 @@ class MetaReferenceAgentsImpl(Agents):
         store: bool | None = True,
         stream: bool | None = False,
         temperature: float | None = None,
+        text: OpenAIResponseText | None = None,
         tools: list[OpenAIResponseInputTool] | None = None,
         max_infer_iters: int | None = 10,
     ) -> OpenAIResponseObject:
         return await self.openai_responses_impl.create_openai_response(
-            input, model, instructions, previous_response_id, store, stream, temperature, tools, max_infer_iters
+            input, model, instructions, previous_response_id, store, stream, temperature, text, tools, max_infer_iters
         )

     async def list_openai_responses(
@@ -37,6 +37,8 @@ from llama_stack.apis.agents.openai_responses import (
     OpenAIResponseOutputMessageFunctionToolCall,
     OpenAIResponseOutputMessageMCPListTools,
     OpenAIResponseOutputMessageWebSearchToolCall,
+    OpenAIResponseText,
+    OpenAIResponseTextFormat,
 )
 from llama_stack.apis.inference.inference import (
     Inference,
@@ -50,7 +52,12 @@ from llama_stack.apis.inference.inference import (
     OpenAIChoice,
     OpenAIDeveloperMessageParam,
     OpenAIImageURL,
+    OpenAIJSONSchema,
     OpenAIMessageParam,
+    OpenAIResponseFormatJSONObject,
+    OpenAIResponseFormatJSONSchema,
+    OpenAIResponseFormatParam,
+    OpenAIResponseFormatText,
     OpenAISystemMessageParam,
     OpenAIToolMessageParam,
     OpenAIUserMessageParam,
@@ -158,6 +165,21 @@ async def _convert_chat_choice_to_response_message(choice: OpenAIChoice) -> Open
     )


+async def _convert_response_text_to_chat_response_format(text: OpenAIResponseText) -> OpenAIResponseFormatParam:
+    """
+    Convert an OpenAI Response text parameter into an OpenAI Chat Completion response format.
+    """
+    if not text.format or text.format["type"] == "text":
+        return OpenAIResponseFormatText(type="text")
+    if text.format["type"] == "json_object":
+        return OpenAIResponseFormatJSONObject()
+    if text.format["type"] == "json_schema":
+        return OpenAIResponseFormatJSONSchema(
+            json_schema=OpenAIJSONSchema(name=text.format["name"], schema=text.format["schema"])
+        )
+    raise ValueError(f"Unsupported text format: {text.format}")
+
+
 async def _get_message_type_by_role(role: str):
     role_to_type = {
         "user": OpenAIUserMessageParam,
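For reference, a short sketch of the chat completion `response_format` values this helper produces for each format type, built from the inference types imported above; the `weather_report` name and schema are illustrative:

```python
from llama_stack.apis.inference.inference import (
    OpenAIJSONSchema,
    OpenAIResponseFormatJSONObject,
    OpenAIResponseFormatJSONSchema,
    OpenAIResponseFormatText,
)

# text (or no format at all)  ->  plain-text chat completions
plain = OpenAIResponseFormatText(type="text")

# json_object  ->  any JSON object, no schema enforced
json_object = OpenAIResponseFormatJSONObject()

# json_schema  ->  the Responses `name` and `schema` are forwarded as-is
json_schema = OpenAIResponseFormatJSONSchema(
    json_schema=OpenAIJSONSchema(
        name="weather_report",  # illustrative
        schema={"type": "object", "properties": {"city": {"type": "string"}}},
    )
)
```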
@@ -180,6 +202,7 @@ class ChatCompletionContext(BaseModel):
     mcp_tool_to_server: dict[str, OpenAIResponseInputToolMCP]
     stream: bool
     temperature: float | None
+    response_format: OpenAIResponseFormatParam


 class OpenAIResponsesImpl:
@@ -343,10 +366,12 @@ class OpenAIResponsesImpl:
         store: bool | None = True,
         stream: bool | None = False,
         temperature: float | None = None,
+        text: OpenAIResponseText | None = None,
         tools: list[OpenAIResponseInputTool] | None = None,
         max_infer_iters: int | None = 10,
     ):
         stream = False if stream is None else stream
+        text = OpenAIResponseText(format=OpenAIResponseTextFormat(type="text")) if text is None else text

         output_messages: list[OpenAIResponseOutput] = []

@@ -355,6 +380,9 @@ class OpenAIResponsesImpl:
         messages = await _convert_response_input_to_chat_messages(input)
         await self._prepend_instructions(messages, instructions)

+        # Structured outputs
+        response_format = await _convert_response_text_to_chat_response_format(text)
+
         # Tool setup
         chat_tools, mcp_tool_to_server, mcp_list_message = (
             await self._convert_response_tools_to_chat_tools(tools) if tools else (None, {}, None)
@@ -369,6 +397,7 @@ class OpenAIResponsesImpl:
             mcp_tool_to_server=mcp_tool_to_server,
             stream=stream,
             temperature=temperature,
+            response_format=response_format,
         )

         # Fork to streaming vs non-streaming - let each handle ALL inference rounds
@@ -379,6 +408,7 @@ class OpenAIResponsesImpl:
                 input=input,
                 model=model,
                 store=store,
+                text=text,
                 tools=tools,
                 max_infer_iters=max_infer_iters,
             )
@@ -389,6 +419,7 @@ class OpenAIResponsesImpl:
                 input=input,
                 model=model,
                 store=store,
+                text=text,
                 tools=tools,
                 max_infer_iters=max_infer_iters,
             )
@@ -400,6 +431,7 @@ class OpenAIResponsesImpl:
         input: str | list[OpenAIResponseInput],
         model: str,
         store: bool | None,
+        text: OpenAIResponseText,
         tools: list[OpenAIResponseInputTool] | None,
         max_infer_iters: int | None,
     ) -> OpenAIResponseObject:
@@ -416,6 +448,7 @@ class OpenAIResponsesImpl:
                 tools=ctx.tools,
                 stream=False,
                 temperature=ctx.temperature,
+                response_format=ctx.response_format,
             )
             current_response = OpenAIChatCompletion(**inference_result.model_dump())

@@ -470,6 +503,7 @@ class OpenAIResponsesImpl:
             object="response",
             status="completed",
             output=output_messages,
+            text=text,
         )
         logger.debug(f"OpenAI Responses response: {response}")

@@ -489,6 +523,7 @@ class OpenAIResponsesImpl:
         input: str | list[OpenAIResponseInput],
         model: str,
         store: bool | None,
+        text: OpenAIResponseText,
         tools: list[OpenAIResponseInputTool] | None,
         max_infer_iters: int | None,
     ) -> AsyncIterator[OpenAIResponseObjectStream]:
@@ -503,6 +538,7 @@ class OpenAIResponsesImpl:
             object="response",
             status="in_progress",
             output=output_messages.copy(),
+            text=text,
         )

         # Emit response.created immediately
@@ -520,6 +556,7 @@ class OpenAIResponsesImpl:
                 tools=ctx.tools,
                 stream=True,
                 temperature=ctx.temperature,
+                response_format=ctx.response_format,
             )

             # Process streaming chunks and build complete response
@@ -645,6 +682,7 @@ class OpenAIResponsesImpl:
             model=model,
             object="response",
             status="completed",
+            text=text,
             output=output_messages,
         )