Mirror of https://github.com/meta-llama/llama-stack.git
feat: Add max_output_tokens to Response API
OpenAI's Responses and Completions APIs have a max_output_tokens field. It is currently missing from both the create request and the response object in the llama-stack Responses API. This PR adds it.

Fixes: #3562

Signed-off-by: Abhishek Bongale <abhishekbongale@outlook.com>
This commit is contained in:
parent 939a2db58f
commit 97b345b3f8

9 changed files with 59 additions and 0 deletions
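For context, here is a minimal client-side sketch of the new field. It assumes a llama-stack deployment serving the OpenAI-compatible Responses endpoint and uses the openai Python client; the base URL, API key, and model name are placeholders rather than values taken from this commit.

from openai import OpenAI

# Placeholder base URL, key, and model: adjust to your deployment.
client = OpenAI(base_url="http://localhost:8321/v1", api_key="none")

response = client.responses.create(
    model="meta-llama/Llama-3.2-3B-Instruct",
    input="Summarize the Responses API in one paragraph.",
    max_output_tokens=128,  # the field added by this PR
)

print(response.max_output_tokens)  # echoed back on the response object
print(response.output_text)

Per the orchestrator change further below, a run that exhausts the cap mid-generation is marked incomplete instead of raising an error.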
@@ -6880,6 +6880,10 @@ components:
           type: string
           description: >-
             (Optional) System message inserted into the model's context
+        max_output_tokens:
+          type: integer
+          description: >-
+            (Optional) Upper bound for response tokens generation.
         input:
           type: array
           items:
@@ -7238,6 +7242,10 @@ components:
             (Optional) Additional fields to include in the response.
         max_infer_iters:
           type: integer
+        max_output_tokens:
+          type: integer
+          description: >-
+            (Optional) Maximum tokens generated in a response.
       additionalProperties: false
       required:
         - input
@@ -7319,6 +7327,10 @@ components:
           type: string
           description: >-
             (Optional) System message inserted into the model's context
+        max_output_tokens:
+          type: integer
+          description: >-
+            (Optional) Upper bound for response tokens generation.
       additionalProperties: false
       required:
         - created_at
docs/static/llama-stack-spec.yaml (vendored): 12 additions
@@ -6164,6 +6164,10 @@ components:
           type: string
           description: >-
             (Optional) System message inserted into the model's context
+        max_output_tokens:
+          type: integer
+          description: >-
+            (Optional) Upper bound for response tokens generation.
         input:
           type: array
           items:
@@ -6522,6 +6526,10 @@ components:
             (Optional) Additional fields to include in the response.
         max_infer_iters:
           type: integer
+        max_output_tokens:
+          type: integer
+          description: >-
+            (Optional) Maximum tokens generated in a response.
       additionalProperties: false
       required:
         - input
@@ -6603,6 +6611,10 @@ components:
           type: string
           description: >-
             (Optional) System message inserted into the model's context
+        max_output_tokens:
+          type: integer
+          description: >-
+            (Optional) Upper bound for response tokens generation.
       additionalProperties: false
       required:
         - created_at
docs/static/stainless-llama-stack-spec.yaml (vendored): 12 additions
@@ -6880,6 +6880,10 @@ components:
           type: string
           description: >-
             (Optional) System message inserted into the model's context
+        max_output_tokens:
+          type: integer
+          description: >-
+            (Optional) Upper bound for response tokens generation.
         input:
           type: array
           items:
@@ -7238,6 +7242,10 @@ components:
             (Optional) Additional fields to include in the response.
         max_infer_iters:
           type: integer
+        max_output_tokens:
+          type: integer
+          description: >-
+            (Optional) Maximum tokens generated in a response.
       additionalProperties: false
       required:
         - input
@@ -7319,6 +7327,10 @@ components:
           type: string
           description: >-
             (Optional) System message inserted into the model's context
+        max_output_tokens:
+          type: integer
+          description: >-
+            (Optional) Upper bound for response tokens generation.
       additionalProperties: false
       required:
         - created_at
@@ -87,6 +87,7 @@ class Agents(Protocol):
                 "List of guardrails to apply during response generation. Guardrails provide safety and content moderation."
             ),
         ] = None,
+        max_output_tokens: int | None = None,
     ) -> OpenAIResponseObject | AsyncIterator[OpenAIResponseObjectStream]:
         """Create a model response.

@@ -97,6 +98,7 @@ class Agents(Protocol):
         :param conversation: (Optional) The ID of a conversation to add the response to. Must begin with 'conv_'. Input and output messages will be automatically added to the conversation.
         :param include: (Optional) Additional fields to include in the response.
         :param guardrails: (Optional) List of guardrails to apply during response generation. Can be guardrail IDs (strings) or guardrail specifications.
+        :param max_output_tokens: (Optional) Maximum tokens generated in a response.
         :returns: An OpenAIResponseObject.
         """
         ...
@@ -591,6 +591,7 @@ class OpenAIResponseObject(BaseModel):
     :param truncation: (Optional) Truncation strategy applied to the response
     :param usage: (Optional) Token usage information for the response
     :param instructions: (Optional) System message inserted into the model's context
+    :param max_output_tokens: (Optional) Upper bound for response tokens generation.
     """

     created_at: int
@@ -612,6 +613,7 @@ class OpenAIResponseObject(BaseModel):
     truncation: str | None = None
     usage: OpenAIResponseUsage | None = None
     instructions: str | None = None
+    max_output_tokens: int | None = None


 @json_schema_type
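Because OpenAIResponseObject now carries the field, a caller can read the requested cap back from the returned object. A minimal sketch, assuming the attribute names shown in the hunk above (max_output_tokens, usage.output_tokens) and the object's status attribute; whether a capped run is reported as incomplete comes from the orchestrator change further below.

# Illustrative only: inspect the output-token budget on a returned response object.
def describe_budget(response) -> str:
    if response.max_output_tokens is None:
        return "no output-token cap requested"
    used = response.usage.output_tokens if response.usage else 0
    return f"{used}/{response.max_output_tokens} output tokens used (status={response.status})"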
@@ -102,6 +102,7 @@ class MetaReferenceAgentsImpl(Agents):
         include: list[str] | None = None,
         max_infer_iters: int | None = 10,
         guardrails: list[ResponseGuardrail] | None = None,
+        max_output_tokens: int | None = None,
     ) -> OpenAIResponseObject:
         assert self.openai_responses_impl is not None, "OpenAI responses not initialized"
         result = await self.openai_responses_impl.create_openai_response(
@@ -119,6 +120,7 @@ class MetaReferenceAgentsImpl(Agents):
             include,
             max_infer_iters,
             guardrails,
+            max_output_tokens,
         )
         return result  # type: ignore[no-any-return]
@@ -255,6 +255,7 @@ class OpenAIResponsesImpl:
         include: list[str] | None = None,
         max_infer_iters: int | None = 10,
         guardrails: list[str | ResponseGuardrailSpec] | None = None,
+        max_output_tokens: int | None = None,
     ):
         stream = bool(stream)
         text = OpenAIResponseText(format=OpenAIResponseTextFormat(type="text")) if text is None else text
@@ -282,6 +283,7 @@ class OpenAIResponsesImpl:
             tools=tools,
             max_infer_iters=max_infer_iters,
             guardrail_ids=guardrail_ids,
+            max_output_tokens=max_output_tokens,
         )

         if stream:
@@ -331,6 +333,7 @@ class OpenAIResponsesImpl:
         tools: list[OpenAIResponseInputTool] | None = None,
         max_infer_iters: int | None = 10,
         guardrail_ids: list[str] | None = None,
+        max_output_tokens: int | None = None,
     ) -> AsyncIterator[OpenAIResponseObjectStream]:
         # These should never be None when called from create_openai_response (which sets defaults)
         # but we assert here to help mypy understand the types
@@ -356,6 +359,7 @@ class OpenAIResponsesImpl:
             response_format=response_format,
             tool_context=tool_context,
             inputs=all_input,
+            max_output_tokens=max_output_tokens,
         )

         # Create orchestrator and delegate streaming logic
@@ -221,6 +221,16 @@ class StreamingResponseOrchestrator:

         try:
             while True:
+                # Check whether max_output_tokens has been depleted
+                if (
+                    self.ctx.max_output_tokens
+                    and self.accumulated_usage
+                    and self.accumulated_usage.output_tokens >= self.ctx.max_output_tokens
+                ):
+                    logger.info("exiting inference loop, remaining max_output_tokens is depleted")
+                    final_status = "incomplete"
+                    break
+
                 # Text is the default response format for chat completion so don't need to pass it
                 # (some providers don't support non-empty response_format when tools are present)
                 response_format = (
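The check above gates each new inference turn on the tokens already consumed. A minimal standalone sketch of the same condition, with a hypothetical helper name, to make the semantics explicit: the loop stops before the next turn once accumulated output tokens reach the requested cap, and the response finishes as incomplete rather than failing.

# Illustrative sketch of the gating condition used above (hypothetical helper, not from the PR).
def should_stop_for_budget(max_output_tokens: int | None, output_tokens_so_far: int) -> bool:
    """True once the requested cap is reached; no cap means never stop early."""
    return max_output_tokens is not None and output_tokens_so_far >= max_output_tokens

assert should_stop_for_budget(None, 10_000) is False
assert should_stop_for_budget(128, 64) is False
assert should_stop_for_budget(128, 128) is True  # next turn is skipped; status becomes "incomplete"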
@@ -160,6 +160,7 @@ class ChatCompletionContext(BaseModel):
     tool_context: ToolContext | None
     approval_requests: list[OpenAIResponseMCPApprovalRequest] = []
     approval_responses: dict[str, OpenAIResponseMCPApprovalResponse] = {}
+    max_output_tokens: int | None

     def __init__(
         self,
@@ -170,6 +171,7 @@ class ChatCompletionContext(BaseModel):
         response_format: OpenAIResponseFormatParam,
         tool_context: ToolContext,
         inputs: list[OpenAIResponseInput] | str,
+        max_output_tokens: int | None,
     ):
         super().__init__(
             model=model,
@@ -178,6 +180,7 @@ class ChatCompletionContext(BaseModel):
             temperature=temperature,
             response_format=response_format,
             tool_context=tool_context,
+            max_output_tokens=max_output_tokens,
         )
         if not isinstance(inputs, str):
             self.approval_requests = [input for input in inputs if input.type == "mcp_approval_request"]