From 97b345b3f89bb6186fa318f62170d79b74b237d8 Mon Sep 17 00:00:00 2001
From: Abhishek Bongale
Date: Mon, 3 Nov 2025 10:25:04 +0000
Subject: [PATCH] feat: Add max_output_tokens to Response API

The OpenAI Responses API has a max_output_tokens field. It is currently
missing from both the create request and the response object in
llama-stack's Responses API. This PR adds it.

fixes: #3562

Signed-off-by: Abhishek Bongale
---
 client-sdks/stainless/openapi.yml                     | 12 ++++++++++++
 docs/static/llama-stack-spec.yaml                     | 12 ++++++++++++
 docs/static/stainless-llama-stack-spec.yaml           | 12 ++++++++++++
 src/llama_stack/apis/agents/agents.py                 |  2 ++
 src/llama_stack/apis/agents/openai_responses.py       |  2 ++
 .../providers/inline/agents/meta_reference/agents.py  |  2 ++
 .../meta_reference/responses/openai_responses.py      |  4 ++++
 .../agents/meta_reference/responses/streaming.py      | 10 ++++++++++
 .../inline/agents/meta_reference/responses/types.py   |  3 +++
 9 files changed, 59 insertions(+)

diff --git a/client-sdks/stainless/openapi.yml b/client-sdks/stainless/openapi.yml
index 448b08908..859cf8ee7 100644
--- a/client-sdks/stainless/openapi.yml
+++ b/client-sdks/stainless/openapi.yml
@@ -6880,6 +6880,10 @@ components:
           type: string
           description: >-
             (Optional) System message inserted into the model's context
+        max_output_tokens:
+          type: integer
+          description: >-
+            (Optional) Upper bound for the number of tokens that can be generated for a response.
         input:
           type: array
           items:
@@ -7238,6 +7242,10 @@ components:
             (Optional) Additional fields to include in the response.
         max_infer_iters:
           type: integer
+        max_output_tokens:
+          type: integer
+          description: >-
+            (Optional) Upper bound for the number of tokens that can be generated for a response.
       additionalProperties: false
       required:
         - input
@@ -7319,6 +7327,10 @@ components:
           type: string
           description: >-
             (Optional) System message inserted into the model's context
+        max_output_tokens:
+          type: integer
+          description: >-
+            (Optional) Upper bound for the number of tokens that can be generated for a response.
       additionalProperties: false
       required:
         - created_at
diff --git a/docs/static/llama-stack-spec.yaml b/docs/static/llama-stack-spec.yaml
index cc720ad18..01405f998 100644
--- a/docs/static/llama-stack-spec.yaml
+++ b/docs/static/llama-stack-spec.yaml
@@ -6164,6 +6164,10 @@ components:
           type: string
           description: >-
             (Optional) System message inserted into the model's context
+        max_output_tokens:
+          type: integer
+          description: >-
+            (Optional) Upper bound for the number of tokens that can be generated for a response.
         input:
           type: array
           items:
@@ -6522,6 +6526,10 @@ components:
             (Optional) Additional fields to include in the response.
         max_infer_iters:
           type: integer
+        max_output_tokens:
+          type: integer
+          description: >-
+            (Optional) Upper bound for the number of tokens that can be generated for a response.
       additionalProperties: false
       required:
         - input
@@ -6603,6 +6611,10 @@ components:
           type: string
          description: >-
             (Optional) System message inserted into the model's context
+        max_output_tokens:
+          type: integer
+          description: >-
+            (Optional) Upper bound for the number of tokens that can be generated for a response.
       additionalProperties: false
       required:
         - created_at
diff --git a/docs/static/stainless-llama-stack-spec.yaml b/docs/static/stainless-llama-stack-spec.yaml
index 448b08908..859cf8ee7 100644
--- a/docs/static/stainless-llama-stack-spec.yaml
+++ b/docs/static/stainless-llama-stack-spec.yaml
@@ -6880,6 +6880,10 @@ components:
           type: string
           description: >-
             (Optional) System message inserted into the model's context
+        max_output_tokens:
+          type: integer
+          description: >-
+            (Optional) Upper bound for the number of tokens that can be generated for a response.
         input:
           type: array
           items:
@@ -7238,6 +7242,10 @@ components:
             (Optional) Additional fields to include in the response.
         max_infer_iters:
           type: integer
+        max_output_tokens:
+          type: integer
+          description: >-
+            (Optional) Upper bound for the number of tokens that can be generated for a response.
       additionalProperties: false
       required:
         - input
@@ -7319,6 +7327,10 @@ components:
           type: string
           description: >-
             (Optional) System message inserted into the model's context
+        max_output_tokens:
+          type: integer
+          description: >-
+            (Optional) Upper bound for the number of tokens that can be generated for a response.
       additionalProperties: false
       required:
         - created_at
diff --git a/src/llama_stack/apis/agents/agents.py b/src/llama_stack/apis/agents/agents.py
index cadef2edc..97454f905 100644
--- a/src/llama_stack/apis/agents/agents.py
+++ b/src/llama_stack/apis/agents/agents.py
@@ -87,6 +87,7 @@ class Agents(Protocol):
                 "List of guardrails to apply during response generation. Guardrails provide safety and content moderation."
             ),
         ] = None,
+        max_output_tokens: int | None = None,
     ) -> OpenAIResponseObject | AsyncIterator[OpenAIResponseObjectStream]:
         """Create a model response.
 
@@ -97,6 +98,7 @@ class Agents(Protocol):
         :param conversation: (Optional) The ID of a conversation to add the response to. Must begin with 'conv_'. Input and output messages will be automatically added to the conversation.
         :param include: (Optional) Additional fields to include in the response.
         :param guardrails: (Optional) List of guardrails to apply during response generation. Can be guardrail IDs (strings) or guardrail specifications.
+        :param max_output_tokens: (Optional) Upper bound for the number of tokens that can be generated for a response.
         :returns: An OpenAIResponseObject.
         """
         ...
diff --git a/src/llama_stack/apis/agents/openai_responses.py b/src/llama_stack/apis/agents/openai_responses.py
index 69e2b2012..206753b3d 100644
--- a/src/llama_stack/apis/agents/openai_responses.py
+++ b/src/llama_stack/apis/agents/openai_responses.py
@@ -591,6 +591,7 @@ class OpenAIResponseObject(BaseModel):
     :param truncation: (Optional) Truncation strategy applied to the response
     :param usage: (Optional) Token usage information for the response
     :param instructions: (Optional) System message inserted into the model's context
+    :param max_output_tokens: (Optional) Upper bound for the number of tokens that can be generated for a response.
     """
 
     created_at: int
@@ -612,6 +613,7 @@ class OpenAIResponseObject(BaseModel):
     truncation: str | None = None
     usage: OpenAIResponseUsage | None = None
     instructions: str | None = None
+    max_output_tokens: int | None = None
 
 
 @json_schema_type
diff --git a/src/llama_stack/providers/inline/agents/meta_reference/agents.py b/src/llama_stack/providers/inline/agents/meta_reference/agents.py
index 7141d58bc..f6cb5d861 100644
--- a/src/llama_stack/providers/inline/agents/meta_reference/agents.py
+++ b/src/llama_stack/providers/inline/agents/meta_reference/agents.py
@@ -102,6 +102,7 @@ class MetaReferenceAgentsImpl(Agents):
         include: list[str] | None = None,
         max_infer_iters: int | None = 10,
         guardrails: list[ResponseGuardrail] | None = None,
+        max_output_tokens: int | None = None,
     ) -> OpenAIResponseObject:
         assert self.openai_responses_impl is not None, "OpenAI responses not initialized"
         result = await self.openai_responses_impl.create_openai_response(
@@ -119,6 +120,7 @@ class MetaReferenceAgentsImpl(Agents):
             include,
             max_infer_iters,
             guardrails,
+            max_output_tokens,
         )
         return result  # type: ignore[no-any-return]
 
diff --git a/src/llama_stack/providers/inline/agents/meta_reference/responses/openai_responses.py b/src/llama_stack/providers/inline/agents/meta_reference/responses/openai_responses.py
index 933cfe963..59be85488 100644
--- a/src/llama_stack/providers/inline/agents/meta_reference/responses/openai_responses.py
+++ b/src/llama_stack/providers/inline/agents/meta_reference/responses/openai_responses.py
@@ -255,6 +255,7 @@ class OpenAIResponsesImpl:
         include: list[str] | None = None,
         max_infer_iters: int | None = 10,
         guardrails: list[str | ResponseGuardrailSpec] | None = None,
+        max_output_tokens: int | None = None,
     ):
         stream = bool(stream)
         text = OpenAIResponseText(format=OpenAIResponseTextFormat(type="text")) if text is None else text
@@ -282,6 +283,7 @@ class OpenAIResponsesImpl:
             tools=tools,
             max_infer_iters=max_infer_iters,
             guardrail_ids=guardrail_ids,
+            max_output_tokens=max_output_tokens,
         )
 
         if stream:
@@ -331,6 +333,7 @@ class OpenAIResponsesImpl:
         tools: list[OpenAIResponseInputTool] | None = None,
         max_infer_iters: int | None = 10,
         guardrail_ids: list[str] | None = None,
+        max_output_tokens: int | None = None,
     ) -> AsyncIterator[OpenAIResponseObjectStream]:
         # These should never be None when called from create_openai_response (which sets defaults)
         # but we assert here to help mypy understand the types
@@ -356,6 +359,7 @@ class OpenAIResponsesImpl:
             response_format=response_format,
             tool_context=tool_context,
             inputs=all_input,
+            max_output_tokens=max_output_tokens,
         )
 
         # Create orchestrator and delegate streaming logic
diff --git a/src/llama_stack/providers/inline/agents/meta_reference/responses/streaming.py b/src/llama_stack/providers/inline/agents/meta_reference/responses/streaming.py
index ef5603420..d05a3432b 100644
--- a/src/llama_stack/providers/inline/agents/meta_reference/responses/streaming.py
+++ b/src/llama_stack/providers/inline/agents/meta_reference/responses/streaming.py
@@ -221,6 +221,16 @@ class StreamingResponseOrchestrator:
 
         try:
             while True:
+                # Check whether the max_output_tokens budget has been depleted
+                if (
+                    self.ctx.max_output_tokens
+                    and self.accumulated_usage
+                    and self.accumulated_usage.output_tokens >= self.ctx.max_output_tokens
+                ):
+                    logger.info("exiting inference loop, max_output_tokens budget is depleted")
+                    final_status = "incomplete"
+                    break
+
                 # Text is the default response format for chat completion so don't need to pass it
                 # (some providers don't support non-empty response_format when tools are present)
                 response_format = (
diff --git a/src/llama_stack/providers/inline/agents/meta_reference/responses/types.py b/src/llama_stack/providers/inline/agents/meta_reference/responses/types.py
index 3b9a14b01..1a75e5dc5 100644
--- a/src/llama_stack/providers/inline/agents/meta_reference/responses/types.py
+++ b/src/llama_stack/providers/inline/agents/meta_reference/responses/types.py
@@ -160,6 +160,7 @@ class ChatCompletionContext(BaseModel):
     tool_context: ToolContext | None
     approval_requests: list[OpenAIResponseMCPApprovalRequest] = []
     approval_responses: dict[str, OpenAIResponseMCPApprovalResponse] = {}
+    max_output_tokens: int | None
 
     def __init__(
         self,
@@ -170,6 +171,7 @@ class ChatCompletionContext(BaseModel):
         response_format: OpenAIResponseFormatParam,
         tool_context: ToolContext,
         inputs: list[OpenAIResponseInput] | str,
+        max_output_tokens: int | None,
     ):
         super().__init__(
             model=model,
@@ -178,6 +180,7 @@ class ChatCompletionContext(BaseModel):
             temperature=temperature,
             response_format=response_format,
             tool_context=tool_context,
+            max_output_tokens=max_output_tokens,
         )
         if not isinstance(inputs, str):
             self.approval_requests = [input for input in inputs if input.type == "mcp_approval_request"]
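
--
Usage sketch (reviewer note, not part of the patch): a minimal client-side
example of the new field, assuming a locally running llama-stack server that
serves the OpenAI-compatible Responses route at http://localhost:8321/v1 and
a model id of "llama3.2:3b". The base URL, API key, and model id below are
placeholders, not values taken from this PR.

    # Illustrative sketch only; endpoint, key, and model id are assumptions.
    from openai import OpenAI

    client = OpenAI(base_url="http://localhost:8321/v1", api_key="none")

    resp = client.responses.create(
        model="llama3.2:3b",  # placeholder model id
        input="Write a long story about a robot.",
        max_output_tokens=64,  # field added by this PR
    )

    # The cap is echoed back on the response object; a depleted token budget
    # should surface as status="incomplete" per the streaming.py change.
    print(resp.max_output_tokens, resp.status)
    print(resp.output_text)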