From 97b345b3f89bb6186fa318f62170d79b74b237d8 Mon Sep 17 00:00:00 2001
From: Abhishek Bongale
Date: Mon, 3 Nov 2025 10:25:04 +0000
Subject: [PATCH] feat: Add max_output_tokens to Response API

The OpenAI Responses API has a max_output_tokens field. It is currently
missing from both the create request and the response object in
llama-stack's Responses API. This PR adds it.

fixes: #3562

Signed-off-by: Abhishek Bongale
---
 client-sdks/stainless/openapi.yml                     | 12 ++++++++++++
 docs/static/llama-stack-spec.yaml                     | 12 ++++++++++++
 docs/static/stainless-llama-stack-spec.yaml           | 12 ++++++++++++
 src/llama_stack/apis/agents/agents.py                 |  2 ++
 src/llama_stack/apis/agents/openai_responses.py       |  2 ++
 .../providers/inline/agents/meta_reference/agents.py  |  2 ++
 .../meta_reference/responses/openai_responses.py      |  4 ++++
 .../agents/meta_reference/responses/streaming.py      | 10 ++++++++++
 .../inline/agents/meta_reference/responses/types.py   |  3 +++
 9 files changed, 59 insertions(+)

diff --git a/client-sdks/stainless/openapi.yml b/client-sdks/stainless/openapi.yml
index 448b08908..859cf8ee7 100644
--- a/client-sdks/stainless/openapi.yml
+++ b/client-sdks/stainless/openapi.yml
@@ -6880,6 +6880,10 @@ components:
           type: string
           description: >-
             (Optional) System message inserted into the model's context
+        max_output_tokens:
+          type: integer
+          description: >-
+            (Optional) Upper bound for the number of tokens that can be generated for a response.
         input:
           type: array
           items:
@@ -7238,6 +7242,10 @@ components:
             (Optional) Additional fields to include in the response.
         max_infer_iters:
           type: integer
+        max_output_tokens:
+          type: integer
+          description: >-
+            (Optional) Upper bound for the number of tokens that can be generated for a response.
       additionalProperties: false
       required:
         - input
@@ -7319,6 +7327,10 @@ components:
           type: string
           description: >-
             (Optional) System message inserted into the model's context
+        max_output_tokens:
+          type: integer
+          description: >-
+            (Optional) Upper bound for the number of tokens that can be generated for a response.
       additionalProperties: false
       required:
         - created_at
diff --git a/docs/static/llama-stack-spec.yaml b/docs/static/llama-stack-spec.yaml
index cc720ad18..01405f998 100644
--- a/docs/static/llama-stack-spec.yaml
+++ b/docs/static/llama-stack-spec.yaml
@@ -6164,6 +6164,10 @@ components:
           type: string
           description: >-
             (Optional) System message inserted into the model's context
+        max_output_tokens:
+          type: integer
+          description: >-
+            (Optional) Upper bound for the number of tokens that can be generated for a response.
         input:
           type: array
           items:
@@ -6522,6 +6526,10 @@ components:
             (Optional) Additional fields to include in the response.
         max_infer_iters:
           type: integer
+        max_output_tokens:
+          type: integer
+          description: >-
+            (Optional) Upper bound for the number of tokens that can be generated for a response.
       additionalProperties: false
       required:
         - input
@@ -6603,6 +6611,10 @@ components:
           type: string
          description: >-
             (Optional) System message inserted into the model's context
+        max_output_tokens:
+          type: integer
+          description: >-
+            (Optional) Upper bound for the number of tokens that can be generated for a response.
       additionalProperties: false
       required:
         - created_at
diff --git a/docs/static/stainless-llama-stack-spec.yaml b/docs/static/stainless-llama-stack-spec.yaml
index 448b08908..859cf8ee7 100644
--- a/docs/static/stainless-llama-stack-spec.yaml
+++ b/docs/static/stainless-llama-stack-spec.yaml
@@ -6880,6 +6880,10 @@ components:
           type: string
           description: >-
             (Optional) System message inserted into the model's context
+        max_output_tokens:
+          type: integer
+          description: >-
+            (Optional) Upper bound for the number of tokens that can be generated for a response.
         input:
           type: array
           items:
@@ -7238,6 +7242,10 @@ components:
             (Optional) Additional fields to include in the response.
         max_infer_iters:
           type: integer
+        max_output_tokens:
+          type: integer
+          description: >-
+            (Optional) Upper bound for the number of tokens that can be generated for a response.
       additionalProperties: false
       required:
         - input
@@ -7319,6 +7327,10 @@ components:
           type: string
           description: >-
             (Optional) System message inserted into the model's context
+        max_output_tokens:
+          type: integer
+          description: >-
+            (Optional) Upper bound for the number of tokens that can be generated for a response.
       additionalProperties: false
       required:
         - created_at
diff --git a/src/llama_stack/apis/agents/agents.py b/src/llama_stack/apis/agents/agents.py
index cadef2edc..97454f905 100644
--- a/src/llama_stack/apis/agents/agents.py
+++ b/src/llama_stack/apis/agents/agents.py
@@ -87,6 +87,7 @@ class Agents(Protocol):
                 "List of guardrails to apply during response generation. Guardrails provide safety and content moderation."
             ),
         ] = None,
+        max_output_tokens: int | None = None,
     ) -> OpenAIResponseObject | AsyncIterator[OpenAIResponseObjectStream]:
         """Create a model response.
 
@@ -97,6 +98,7 @@ class Agents(Protocol):
         :param conversation: (Optional) The ID of a conversation to add the response to. Must begin with 'conv_'. Input and output messages will be automatically added to the conversation.
         :param include: (Optional) Additional fields to include in the response.
         :param guardrails: (Optional) List of guardrails to apply during response generation. Can be guardrail IDs (strings) or guardrail specifications.
+        :param max_output_tokens: (Optional) Upper bound for the number of tokens that can be generated for a response.
         :returns: An OpenAIResponseObject.
         """
         ...
diff --git a/src/llama_stack/apis/agents/openai_responses.py b/src/llama_stack/apis/agents/openai_responses.py
index 69e2b2012..206753b3d 100644
--- a/src/llama_stack/apis/agents/openai_responses.py
+++ b/src/llama_stack/apis/agents/openai_responses.py
@@ -591,6 +591,7 @@ class OpenAIResponseObject(BaseModel):
     :param truncation: (Optional) Truncation strategy applied to the response
     :param usage: (Optional) Token usage information for the response
     :param instructions: (Optional) System message inserted into the model's context
+    :param max_output_tokens: (Optional) Upper bound for the number of tokens that can be generated for a response.
     """
 
     created_at: int
@@ -612,6 +613,7 @@ class OpenAIResponseObject(BaseModel):
     truncation: str | None = None
     usage: OpenAIResponseUsage | None = None
     instructions: str | None = None
+    max_output_tokens: int | None = None
 
 
 @json_schema_type
diff --git a/src/llama_stack/providers/inline/agents/meta_reference/agents.py b/src/llama_stack/providers/inline/agents/meta_reference/agents.py
index 7141d58bc..f6cb5d861 100644
--- a/src/llama_stack/providers/inline/agents/meta_reference/agents.py
+++ b/src/llama_stack/providers/inline/agents/meta_reference/agents.py
@@ -102,6 +102,7 @@ class MetaReferenceAgentsImpl(Agents):
         include: list[str] | None = None,
         max_infer_iters: int | None = 10,
         guardrails: list[ResponseGuardrail] | None = None,
+        max_output_tokens: int | None = None,
     ) -> OpenAIResponseObject:
         assert self.openai_responses_impl is not None, "OpenAI responses not initialized"
         result = await self.openai_responses_impl.create_openai_response(
@@ -119,6 +120,7 @@ class MetaReferenceAgentsImpl(Agents):
             include,
             max_infer_iters,
             guardrails,
+            max_output_tokens,
         )
         return result  # type: ignore[no-any-return]
 
diff --git a/src/llama_stack/providers/inline/agents/meta_reference/responses/openai_responses.py b/src/llama_stack/providers/inline/agents/meta_reference/responses/openai_responses.py
index 933cfe963..59be85488 100644
--- a/src/llama_stack/providers/inline/agents/meta_reference/responses/openai_responses.py
+++ b/src/llama_stack/providers/inline/agents/meta_reference/responses/openai_responses.py
@@ -255,6 +255,7 @@ class OpenAIResponsesImpl:
         include: list[str] | None = None,
         max_infer_iters: int | None = 10,
         guardrails: list[str | ResponseGuardrailSpec] | None = None,
+        max_output_tokens: int | None = None,
     ):
         stream = bool(stream)
         text = OpenAIResponseText(format=OpenAIResponseTextFormat(type="text")) if text is None else text
@@ -282,6 +283,7 @@ class OpenAIResponsesImpl:
             tools=tools,
             max_infer_iters=max_infer_iters,
             guardrail_ids=guardrail_ids,
+            max_output_tokens=max_output_tokens,
         )
 
         if stream:
@@ -331,6 +333,7 @@ class OpenAIResponsesImpl:
         tools: list[OpenAIResponseInputTool] | None = None,
         max_infer_iters: int | None = 10,
         guardrail_ids: list[str] | None = None,
+        max_output_tokens: int | None = None,
     ) -> AsyncIterator[OpenAIResponseObjectStream]:
         # These should never be None when called from create_openai_response (which sets defaults)
         # but we assert here to help mypy understand the types
@@ -356,6 +359,7 @@ class OpenAIResponsesImpl:
             response_format=response_format,
             tool_context=tool_context,
             inputs=all_input,
+            max_output_tokens=max_output_tokens,
         )
 
         # Create orchestrator and delegate streaming logic
diff --git a/src/llama_stack/providers/inline/agents/meta_reference/responses/streaming.py b/src/llama_stack/providers/inline/agents/meta_reference/responses/streaming.py
index ef5603420..d05a3432b 100644
--- a/src/llama_stack/providers/inline/agents/meta_reference/responses/streaming.py
+++ b/src/llama_stack/providers/inline/agents/meta_reference/responses/streaming.py
@@ -221,6 +221,16 @@ class StreamingResponseOrchestrator:
 
         try:
             while True:
+                # Check whether the max_output_tokens budget has been depleted
+                if (
+                    self.ctx.max_output_tokens
+                    and self.accumulated_usage
+                    and self.accumulated_usage.output_tokens >= self.ctx.max_output_tokens
+                ):
+                    logger.info("exiting inference loop, max_output_tokens budget is depleted")
+                    final_status = "incomplete"
+                    break
+
                 # Text is the default response format for chat completion so don't need to pass it
                 # (some providers don't support non-empty response_format when tools are present)
                 response_format = (
diff --git a/src/llama_stack/providers/inline/agents/meta_reference/responses/types.py b/src/llama_stack/providers/inline/agents/meta_reference/responses/types.py
index 3b9a14b01..1a75e5dc5 100644
--- a/src/llama_stack/providers/inline/agents/meta_reference/responses/types.py
+++ b/src/llama_stack/providers/inline/agents/meta_reference/responses/types.py
@@ -160,6 +160,7 @@ class ChatCompletionContext(BaseModel):
     tool_context: ToolContext | None
     approval_requests: list[OpenAIResponseMCPApprovalRequest] = []
     approval_responses: dict[str, OpenAIResponseMCPApprovalResponse] = {}
+    max_output_tokens: int | None
 
     def __init__(
         self,
@@ -170,6 +171,7 @@ class ChatCompletionContext(BaseModel):
         response_format: OpenAIResponseFormatParam,
         tool_context: ToolContext,
         inputs: list[OpenAIResponseInput] | str,
+        max_output_tokens: int | None,
     ):
         super().__init__(
             model=model,
@@ -178,6 +180,7 @@ class ChatCompletionContext(BaseModel):
             temperature=temperature,
             response_format=response_format,
             tool_context=tool_context,
+            max_output_tokens=max_output_tokens,
         )
         if not isinstance(inputs, str):
             self.approval_requests = [input for input in inputs if input.type == "mcp_approval_request"]
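
--
Usage sketch (reviewer note, not part of the patch): a minimal client-side
example of the new field, assuming a locally running llama-stack server that
serves the OpenAI-compatible Responses route at http://localhost:8321/v1 and
a model id of "llama3.2:3b". The base URL, API key, and model id below are
placeholders, not values taken from this PR.

    # Illustrative sketch only; endpoint, key, and model id are assumptions.
    from openai import OpenAI

    client = OpenAI(base_url="http://localhost:8321/v1", api_key="none")

    resp = client.responses.create(
        model="llama3.2:3b",  # placeholder model id
        input="Write a long story about a robot.",
        max_output_tokens=64,  # field added by this PR
    )

    # The cap is echoed back on the response object; a depleted token budget
    # should surface as status="incomplete" per the streaming.py change.
    print(resp.max_output_tokens, resp.status)
    print(resp.output_text)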