feat: Add max_output_tokens to Response API

The OpenAI Responses and Completions APIs expose a max_output_tokens field. It is currently missing from both the create request and the response object in the Responses API.

This PR adds the field and enforces the token budget during inference.

fixes: #3562
Signed-off-by: Abhishek Bongale <abhishekbongale@outlook.com>
Abhishek Bongale 2025-11-03 10:25:04 +00:00
parent 939a2db58f
commit 97b345b3f8
9 changed files with 59 additions and 0 deletions
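For reviewers, a minimal end-to-end sketch of the new parameter, assuming a locally running Llama Stack server and any served model (the base URL and model id are placeholders):

from openai import OpenAI

# Point the stock OpenAI client at a Llama Stack deployment (placeholder URL).
client = OpenAI(base_url="http://localhost:8321/v1", api_key="none")

response = client.responses.create(
    model="meta-llama/Llama-3.2-3B-Instruct",  # placeholder model id
    input="Summarize the Responses API in one paragraph.",
    max_output_tokens=64,  # the field added by this PR
)

# The cap is echoed back on the response object.
print(response.status, response.max_output_tokens)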

@@ -6880,6 +6880,10 @@ components:
type: string
description: >-
(Optional) System message inserted into the model's context
max_output_tokens:
type: integer
description: >-
(Optional) Upper bound for response token generation.
input:
type: array
items:
@@ -7238,6 +7242,10 @@ components:
(Optional) Additional fields to include in the response.
max_infer_iters:
type: integer
max_output_tokens:
type: integer
description: >-
(Optional) Maximum tokens generated in a response.
additionalProperties: false
required:
- input
@@ -7319,6 +7327,10 @@ components:
type: string
description: >-
(Optional) System message inserted into the model's context
max_output_tokens:
type: integer
description: >-
(Optional) Upper bound for response token generation.
additionalProperties: false
required:
- created_at
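The same fragment lands in three schema locations (create request parameters, request body, response object) across the spec files below. As a quick illustration of what the added fragment accepts, using the third-party jsonschema package (an assumption, not part of this PR):

from jsonschema import ValidationError, validate  # assumption: pip install jsonschema

# The fragment added above, lifted out as a standalone schema.
schema = {"type": "integer"}

validate(128, schema)  # integers pass
try:
    validate("128", schema)  # strings are rejected
except ValidationError:
    print("max_output_tokens must be an integer")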

@@ -6164,6 +6164,10 @@ components:
type: string
description: >-
(Optional) System message inserted into the model's context
max_output_tokens:
type: integer
description: >-
(Optional) Upper bound for response token generation.
input:
type: array
items:
@@ -6522,6 +6526,10 @@ components:
(Optional) Additional fields to include in the response.
max_infer_iters:
type: integer
max_output_tokens:
type: integer
description: >-
(Optional) Maximum tokens generated in a response.
additionalProperties: false
required:
- input
@@ -6603,6 +6611,10 @@ components:
type: string
description: >-
(Optional) System message inserted into the model's context
max_output_tokens:
type: integer
description: >-
(Optional) Upper bound for response token generation.
additionalProperties: false
required:
- created_at

@@ -6880,6 +6880,10 @@ components:
type: string
description: >-
(Optional) System message inserted into the model's context
max_output_tokens:
type: integer
description: >-
(Optional) Upper bound for response token generation.
input:
type: array
items:
@@ -7238,6 +7242,10 @@ components:
(Optional) Additional fields to include in the response.
max_infer_iters:
type: integer
max_output_tokens:
type: integer
description: >-
(Optional) Maximum tokens generated in a response.
additionalProperties: false
required:
- input
@@ -7319,6 +7327,10 @@ components:
type: string
description: >-
(Optional) System message inserted into the model's context
max_output_tokens:
type: integer
description: >-
(Optional) Upper bound for response token generation.
additionalProperties: false
required:
- created_at

@@ -87,6 +87,7 @@ class Agents(Protocol):
"List of guardrails to apply during response generation. Guardrails provide safety and content moderation."
),
] = None,
max_output_tokens: int | None = None,
) -> OpenAIResponseObject | AsyncIterator[OpenAIResponseObjectStream]:
"""Create a model response.
@@ -97,6 +98,7 @@ class Agents(Protocol):
:param conversation: (Optional) The ID of a conversation to add the response to. Must begin with 'conv_'. Input and output messages will be automatically added to the conversation.
:param include: (Optional) Additional fields to include in the response.
:param guardrails: (Optional) List of guardrails to apply during response generation. Can be guardrail IDs (strings) or guardrail specifications.
:param max_output_tokens: (Optional) Maximum tokens generated in a response.
:returns: An OpenAIResponseObject.
"""
...
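Server-side, the cap is threaded through the same call; a hypothetical invocation of an Agents implementation (the method and parameter names are taken from this diff, the model id is a placeholder):

async def demo(agents_impl) -> None:
    response = await agents_impl.create_openai_response(
        input="Draft a two-sentence summary.",
        model="meta-llama/Llama-3.2-3B-Instruct",  # placeholder model id
        max_output_tokens=64,  # newly threaded parameter
    )
    print(response.status, response.max_output_tokens)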

@@ -591,6 +591,7 @@ class OpenAIResponseObject(BaseModel):
:param truncation: (Optional) Truncation strategy applied to the response
:param usage: (Optional) Token usage information for the response
:param instructions: (Optional) System message inserted into the model's context
:param max_output_tokens: (Optional) Upper bound for response token generation.
"""
created_at: int
@@ -612,6 +613,7 @@ class OpenAIResponseObject(BaseModel):
truncation: str | None = None
usage: OpenAIResponseUsage | None = None
instructions: str | None = None
max_output_tokens: int | None = None
@json_schema_type
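Because the field defaults to None, previously stored response objects still validate. A minimal stand-in showing the semantics (the real OpenAIResponseObject has many more fields):

from pydantic import BaseModel

class ResponseSketch(BaseModel):  # stand-in, not the real class
    created_at: int
    instructions: str | None = None
    max_output_tokens: int | None = None

obj = ResponseSketch(created_at=1730628304)
assert obj.max_output_tokens is None  # omitted -> None, so old payloads still parse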

@@ -102,6 +102,7 @@ class MetaReferenceAgentsImpl(Agents):
include: list[str] | None = None,
max_infer_iters: int | None = 10,
guardrails: list[ResponseGuardrail] | None = None,
max_output_tokens: int | None = None,
) -> OpenAIResponseObject:
assert self.openai_responses_impl is not None, "OpenAI responses not initialized"
result = await self.openai_responses_impl.create_openai_response(
@@ -119,6 +120,7 @@ class MetaReferenceAgentsImpl(Agents):
include,
max_infer_iters,
guardrails,
max_output_tokens,
)
return result # type: ignore[no-any-return]

@@ -255,6 +255,7 @@ class OpenAIResponsesImpl:
include: list[str] | None = None,
max_infer_iters: int | None = 10,
guardrails: list[str | ResponseGuardrailSpec] | None = None,
max_output_tokens: int | None = None,
):
stream = bool(stream)
text = OpenAIResponseText(format=OpenAIResponseTextFormat(type="text")) if text is None else text
@@ -282,6 +283,7 @@ class OpenAIResponsesImpl:
tools=tools,
max_infer_iters=max_infer_iters,
guardrail_ids=guardrail_ids,
max_output_tokens=max_output_tokens,
)
if stream:
@@ -331,6 +333,7 @@ class OpenAIResponsesImpl:
tools: list[OpenAIResponseInputTool] | None = None,
max_infer_iters: int | None = 10,
guardrail_ids: list[str] | None = None,
max_output_tokens: int | None = None,
) -> AsyncIterator[OpenAIResponseObjectStream]:
# These should never be None when called from create_openai_response (which sets defaults)
# but we assert here to help mypy understand the types
@@ -356,6 +359,7 @@ class OpenAIResponsesImpl:
response_format=response_format,
tool_context=tool_context,
inputs=all_input,
max_output_tokens=max_output_tokens,
)
# Create orchestrator and delegate streaming logic

@@ -221,6 +221,16 @@ class StreamingResponseOrchestrator:
try:
while True:
# Check whether the max_output_tokens budget is depleted
if (
self.ctx.max_output_tokens
and self.accumulated_usage
and self.accumulated_usage.output_tokens >= self.ctx.max_output_tokens
):
logger.info("exiting inference loop: max_output_tokens budget depleted")
final_status = "incomplete"
break
# Text is the default response format for chat completion so don't need to pass it
# (some providers don't support non-empty response_format when tools are present)
response_format = (
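The exit predicate is worth reading in isolation: because it uses truthiness rather than an is-None check, a cap of 0 behaves like no cap at all. A self-contained sketch, assuming a running output-token counter like accumulated_usage.output_tokens:

def budget_depleted(max_output_tokens: int | None, output_tokens: int) -> bool:
    # Mirrors the check above: falsy caps (None or 0) never trip the exit.
    return bool(max_output_tokens) and output_tokens >= max_output_tokens

assert budget_depleted(64, 64) is True         # cap reached -> status "incomplete"
assert budget_depleted(None, 10_000) is False  # no cap -> keep looping
assert budget_depleted(0, 10_000) is False     # 0 is treated as "no cap" here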

@@ -160,6 +160,7 @@ class ChatCompletionContext(BaseModel):
tool_context: ToolContext | None
approval_requests: list[OpenAIResponseMCPApprovalRequest] = []
approval_responses: dict[str, OpenAIResponseMCPApprovalResponse] = {}
max_output_tokens: int | None
def __init__(
self,
@@ -170,6 +171,7 @@ class ChatCompletionContext(BaseModel):
response_format: OpenAIResponseFormatParam,
tool_context: ToolContext,
inputs: list[OpenAIResponseInput] | str,
max_output_tokens: int | None,
):
super().__init__(
model=model,
@@ -178,6 +180,7 @@ class ChatCompletionContext(BaseModel):
temperature=temperature,
response_format=response_format,
tool_context=tool_context,
max_output_tokens=max_output_tokens,
)
if not isinstance(inputs, str):
self.approval_requests = [input for input in inputs if input.type == "mcp_approval_request"]
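Note the annotation pattern here: max_output_tokens: int | None with no default is required-but-nullable in Pydantic v2, which is why the explicit __init__ must thread it through. A minimal stand-in demonstrating that behavior:

from pydantic import BaseModel, ValidationError

class CtxSketch(BaseModel):  # stand-in for ChatCompletionContext
    max_output_tokens: int | None  # no default: must be passed, may be None

CtxSketch(max_output_tokens=None)  # explicit None is fine
try:
    CtxSketch()  # omitting the field raises
except ValidationError:
    print("max_output_tokens must be supplied explicitly")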