feat: Add max_output_tokens to Responses API

The Responses and Completions APIs both have a field to cap generated output tokens. That field is currently
missing from the create request and the response object in Llama Stack's Responses API.

This PR adds it to both.
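A minimal usage sketch (the base URL, API key, and model id below are placeholders for a local Llama Stack deployment):

```python
from openai import OpenAI

# assumed local Llama Stack endpoint and model; adjust for your deployment
client = OpenAI(base_url="http://localhost:8321/v1", api_key="none")

response = client.responses.create(
    model="meta-llama/Llama-3.3-70B-Instruct",
    input="Summarize the Responses API in one sentence.",
    max_output_tokens=64,  # new: upper bound on generated output tokens
)

print(response.max_output_tokens)  # echoed back on the response object
print(response.output_text)
```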

fixes: #3562
Signed-off-by: Abhishek Bongale <abhishekbongale@outlook.com>
Abhishek Bongale 2025-10-06 09:46:09 +01:00
parent 92219fd8fb
commit bb58da22a1
14 changed files with 127 additions and 20 deletions


@ -1,7 +1,7 @@
---
description: "Agents
APIs for creating and interacting with agentic systems."
sidebar_label: Agents
title: Agents
---
@ -12,6 +12,6 @@ title: Agents
Agents
APIs for creating and interacting with agentic systems.
This section contains documentation for all available providers for the **agents** API.


@ -1,14 +1,14 @@
---
description: "The Batches API enables efficient processing of multiple requests in a single operation,
particularly useful for processing large datasets, batch evaluation workflows, and
cost-effective inference at scale.
particularly useful for processing large datasets, batch evaluation workflows, and
cost-effective inference at scale.
The API is designed to allow use of openai client libraries for seamless integration.
The API is designed to allow use of openai client libraries for seamless integration.
This API provides the following extensions:
- idempotent batch creation
This API provides the following extensions:
- idempotent batch creation
Note: This API is currently under active development and may undergo changes."
Note: This API is currently under active development and may undergo changes."
sidebar_label: Batches
title: Batches
---
@ -18,14 +18,14 @@ title: Batches
## Overview
The Batches API enables efficient processing of multiple requests in a single operation,
particularly useful for processing large datasets, batch evaluation workflows, and
cost-effective inference at scale.
The API is designed to allow use of openai client libraries for seamless integration.
This API provides the following extensions:
- idempotent batch creation
Note: This API is currently under active development and may undergo changes.
This section contains documentation for all available providers for the **batches** API.


@ -1,9 +1,9 @@
---
description: "Llama Stack Inference API for generating completions, chat completions, and embeddings.
This API provides the raw interface to the underlying models. Two kinds of models are supported:
- LLM models: these models generate \"raw\" and \"chat\" (conversational) completions.
- Embedding models: these models generate embeddings to be used for semantic search."
sidebar_label: Inference
title: Inference
---
@ -14,8 +14,8 @@ title: Inference
Llama Stack Inference API for generating completions, chat completions, and embeddings.
This API provides the raw interface to the underlying models. Two kinds of models are supported:
- LLM models: these models generate "raw" and "chat" (conversational) completions.
- Embedding models: these models generate embeddings to be used for semantic search.
This section contains documentation for all available providers for the **inference** API.


@ -9096,6 +9096,10 @@
"type": "string",
"description": "(Optional) Truncation strategy applied to the response"
},
"max_output_tokens": {
"type": "integer",
"description": "(Optional) Upper bound for response tokens generation"
},
"input": {
"type": "array",
"items": {
@ -9914,6 +9918,9 @@
},
"max_infer_iters": {
"type": "integer"
},
"max_output_tokens": {
"type": "integer"
}
},
"additionalProperties": false,
@ -9983,6 +9990,10 @@
"truncation": {
"type": "string",
"description": "(Optional) Truncation strategy applied to the response"
},
"max_output_tokens": {
"type": "integer",
"description": "(Optional) Upper bound for response tokens generation"
}
},
"additionalProperties": false,


@ -6740,6 +6740,10 @@ components:
type: string
description: >-
(Optional) Truncation strategy applied to the response
max_output_tokens:
type: integer
description: >-
(Optional) Upper bound for response tokens generation
input:
type: array
items:
@ -7351,6 +7355,8 @@ components:
(Optional) Additional fields to include in the response.
max_infer_iters:
type: integer
max_output_tokens:
type: integer
additionalProperties: false
required:
- input
@ -7414,6 +7420,10 @@ components:
type: string
description: >-
(Optional) Truncation strategy applied to the response
max_output_tokens:
type: integer
description: >-
(Optional) Upper bound for response tokens generation
additionalProperties: false
required:
- created_at


@ -7503,6 +7503,10 @@
"type": "string",
"description": "(Optional) Truncation strategy applied to the response"
},
"max_output_tokens": {
"type": "integer",
"description": "(Optional) Upper bound for response tokens generation"
},
"input": {
"type": "array",
"items": {
@ -8009,6 +8013,9 @@
},
"max_infer_iters": {
"type": "integer"
},
"max_output_tokens": {
"type": "integer"
}
},
"additionalProperties": false,
@ -8078,6 +8085,10 @@
"truncation": {
"type": "string",
"description": "(Optional) Truncation strategy applied to the response"
},
"max_output_tokens": {
"type": "integer",
"description": "(Optional) Upper bound for response tokens generation"
}
},
"additionalProperties": false,


@ -5660,6 +5660,10 @@ components:
type: string
description: >-
(Optional) Truncation strategy applied to the response
max_output_tokens:
type: integer
description: >-
(Optional) Upper bound for response tokens generation
input:
type: array
items:
@ -6014,6 +6018,8 @@ components:
(Optional) Additional fields to include in the response.
max_infer_iters:
type: integer
max_output_tokens:
type: integer
additionalProperties: false
required:
- input
@ -6077,6 +6083,10 @@ components:
type: string
description: >-
(Optional) Truncation strategy applied to the response
max_output_tokens:
type: integer
description: >-
(Optional) Upper bound for response tokens generation
additionalProperties: false
required:
- created_at


@ -9512,6 +9512,10 @@
"type": "string",
"description": "(Optional) Truncation strategy applied to the response"
},
"max_output_tokens": {
"type": "integer",
"description": "(Optional) Upper bound for response tokens generation"
},
"input": {
"type": "array",
"items": {
@ -10018,6 +10022,9 @@
},
"max_infer_iters": {
"type": "integer"
},
"max_output_tokens": {
"type": "integer"
}
},
"additionalProperties": false,
@ -10087,6 +10094,10 @@
"truncation": {
"type": "string",
"description": "(Optional) Truncation strategy applied to the response"
},
"max_output_tokens": {
"type": "integer",
"description": "(Optional) Upper bound for response tokens generation"
}
},
"additionalProperties": false,


@ -7105,6 +7105,10 @@ components:
type: string
description: >-
(Optional) Truncation strategy applied to the response
max_output_tokens:
type: integer
description: >-
(Optional) Upper bound for response tokens generation
input:
type: array
items:
@ -7459,6 +7463,8 @@ components:
(Optional) Additional fields to include in the response.
max_infer_iters:
type: integer
max_output_tokens:
type: integer
additionalProperties: false
required:
- input
@ -7522,6 +7528,10 @@ components:
type: string
description: >-
(Optional) Truncation strategy applied to the response
max_output_tokens:
type: integer
description: >-
(Optional) Upper bound for response tokens generation
additionalProperties: false
required:
- created_at


@ -825,6 +825,7 @@ class Agents(Protocol):
"List of shields to apply during response generation. Shields provide safety and content moderation."
),
] = None,
max_output_tokens: int | None = None,
) -> OpenAIResponseObject | AsyncIterator[OpenAIResponseObjectStream]:
"""Create a new OpenAI response.


@ -363,6 +363,7 @@ class OpenAIResponseObject(BaseModel):
:param text: Text formatting configuration for the response
:param top_p: (Optional) Nucleus sampling parameter used for generation
:param truncation: (Optional) Truncation strategy applied to the response
:param max_output_tokens: (Optional) Upper bound for response tokens generation
"""
created_at: int
@ -380,6 +381,7 @@ class OpenAIResponseObject(BaseModel):
text: OpenAIResponseText = OpenAIResponseText(format=OpenAIResponseTextFormat(type="text"))
top_p: float | None = None
truncation: str | None = None
max_output_tokens: int | None = None
@json_schema_type
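Because the field now lives on `OpenAIResponseObject` itself, a stored response echoes the cap back to callers. A small sketch, assuming the placeholder client from the description and a previously created `response_id`:

```python
retrieved = client.responses.retrieve(response_id)  # response_id from an earlier create call

# None when no cap was requested; otherwise the integer passed at create time
if retrieved.max_output_tokens is not None:
    print(f"generation was capped at {retrieved.max_output_tokens} output tokens")
```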


@ -204,6 +204,7 @@ class OpenAIResponsesImpl:
store: bool | None = True,
stream: bool | None = False,
temperature: float | None = None,
max_output_tokens: int | None = None,
text: OpenAIResponseText | None = None,
tools: list[OpenAIResponseInputTool] | None = None,
include: list[str] | None = None,
@ -224,6 +225,7 @@ class OpenAIResponsesImpl:
previous_response_id=previous_response_id,
store=store,
temperature=temperature,
max_output_tokens=max_output_tokens,
text=text,
tools=tools,
max_infer_iters=max_infer_iters,
@ -252,6 +254,7 @@ class OpenAIResponsesImpl:
previous_response_id: str | None = None,
store: bool | None = True,
temperature: float | None = None,
max_output_tokens: int | None = None,
text: OpenAIResponseText | None = None,
tools: list[OpenAIResponseInputTool] | None = None,
max_infer_iters: int | None = 10,
@ -268,6 +271,7 @@ class OpenAIResponsesImpl:
messages=messages,
response_tools=tools,
temperature=temperature,
max_tokens=max_output_tokens,
response_format=response_format,
inputs=input,
)
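Internally, the Responses-level value is handed to the chat completion context under the chat-completions name `max_tokens`. A simplified illustration of that renaming (not the actual orchestration code):

```python
def build_chat_completion_kwargs(
    model: str,
    messages: list[dict],
    max_output_tokens: int | None = None,
) -> dict:
    # Responses' "max_output_tokens" travels as "max_tokens" on the
    # downstream chat-completions-style request
    kwargs: dict = {"model": model, "messages": messages}
    if max_output_tokens is not None:
        kwargs["max_tokens"] = max_output_tokens
    return kwargs
```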


@ -63,6 +63,7 @@ class ChatCompletionContext(BaseModel):
response_format: OpenAIResponseFormatParam
approval_requests: list[OpenAIResponseMCPApprovalRequest] = []
approval_responses: dict[str, OpenAIResponseMCPApprovalResponse] = {}
max_tokens: int | None = None
def __init__(
self,
@ -72,6 +73,7 @@ class ChatCompletionContext(BaseModel):
temperature: float | None,
response_format: OpenAIResponseFormatParam,
inputs: list[OpenAIResponseInput] | str,
max_tokens: int | None = None,
):
super().__init__(
model=model,


@ -297,3 +297,38 @@ def test_function_call_output_response_with_none_arguments(openai_client, client
assert response.output[0].type == "function_call"
assert response.output[0].arguments == "{}"
_ = response.output[0].call_id
def test_response_with_max_output_tokens(compat_client, text_model_id):
"""Test that the `max_output_tokens` parameter is used."""
if not isinstance(compat_client, OpenAI):
pytest.skip("This test requires the OpenAI client.")
response = compat_client.responses.create(
model=text_model_id,
input=[
{
"role": "user",
"content": "what's the current time? You MUST call the `get_current_time` function to find out.",
}
],
max_output_tokens=15,
stream=False,
)
assert response.id is not None
assert response.model == text_model_id
assert hasattr(response, "max_output_tokens")
assert response.max_output_tokens == 15
output_text = ""
for item in response.output:
if item.type == "message" and item.role == "assistant":
if item.content and item.content.type == "text":
output_text = item.content.text
break
assert output_text, "Assistant response content should not be empty"
assert len(output_text.split()) < 30