feat: Add max_output_tokens to Response API
Responses and Completions have a `max_output_tokens` field. It was missing from the create request and the response object in the Responses API; this PR adds it.

Fixes: #3562

Signed-off-by: Abhishek Bongale <abhishekbongale@outlook.com>
parent: 92219fd8fb
commit: bb58da22a1
14 changed files with 127 additions and 20 deletions
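For reference, a minimal client-side sketch of the new parameter, written the same way the integration test added below exercises it (via the OpenAI Python client against a Llama Stack server). The base URL, API key placeholder, and model id are assumptions for illustration, not part of this change:

# Sketch only: call the OpenAI-compatible Responses API with max_output_tokens.
# The base_url and model id are assumed values; substitute your deployment's.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8321/v1", api_key="none")  # assumed local Llama Stack server

response = client.responses.create(
    model="meta-llama/Llama-3.2-3B-Instruct",  # hypothetical model id
    input="Summarize what max_output_tokens does in one sentence.",
    max_output_tokens=50,  # upper bound on generated tokens, added by this PR
)

# The response object now echoes the cap back (see the OpenAIResponseObject diff below).
print(response.max_output_tokens)
print(response.output_text)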
docs/static/deprecated-llama-stack-spec.html (vendored, 11 lines changed)

@@ -9096,6 +9096,10 @@
            "type": "string",
            "description": "(Optional) Truncation strategy applied to the response"
          },
          "max_output_tokens": {
            "type": "integer",
            "description": "(Optional) Upper bound for response tokens generation"
          },
          "input": {
            "type": "array",
            "items": {

@@ -9914,6 +9918,9 @@
          },
          "max_infer_iters": {
            "type": "integer"
          },
          "max_output_tokens": {
            "type": "integer"
          }
        },
        "additionalProperties": false,

@@ -9983,6 +9990,10 @@
          "truncation": {
            "type": "string",
            "description": "(Optional) Truncation strategy applied to the response"
          },
          "max_output_tokens": {
            "type": "integer",
            "description": "(Optional) Upper bound for response tokens generation"
          }
        },
        "additionalProperties": false,
docs/static/deprecated-llama-stack-spec.yaml (vendored, 10 lines changed)

@@ -6740,6 +6740,10 @@ components:
          type: string
          description: >-
            (Optional) Truncation strategy applied to the response
        max_output_tokens:
          type: integer
          description: >-
            (Optional) Upper bound for response tokens generation
        input:
          type: array
          items:

@@ -7351,6 +7355,8 @@ components:
            (Optional) Additional fields to include in the response.
        max_infer_iters:
          type: integer
        max_output_tokens:
          type: integer
      additionalProperties: false
      required:
        - input

@@ -7414,6 +7420,10 @@ components:
          type: string
          description: >-
            (Optional) Truncation strategy applied to the response
        max_output_tokens:
          type: integer
          description: >-
            (Optional) Upper bound for response tokens generation
      additionalProperties: false
      required:
        - created_at
docs/static/llama-stack-spec.html (vendored, 11 lines changed)

@@ -7503,6 +7503,10 @@
            "type": "string",
            "description": "(Optional) Truncation strategy applied to the response"
          },
          "max_output_tokens": {
            "type": "integer",
            "description": "(Optional) Upper bound for response tokens generation"
          },
          "input": {
            "type": "array",
            "items": {

@@ -8009,6 +8013,9 @@
          },
          "max_infer_iters": {
            "type": "integer"
          },
          "max_output_tokens": {
            "type": "integer"
          }
        },
        "additionalProperties": false,

@@ -8078,6 +8085,10 @@
          "truncation": {
            "type": "string",
            "description": "(Optional) Truncation strategy applied to the response"
          },
          "max_output_tokens": {
            "type": "integer",
            "description": "(Optional) Upper bound for response tokens generation"
          }
        },
        "additionalProperties": false,
docs/static/llama-stack-spec.yaml (vendored, 10 lines changed)

@@ -5660,6 +5660,10 @@ components:
          type: string
          description: >-
            (Optional) Truncation strategy applied to the response
        max_output_tokens:
          type: integer
          description: >-
            (Optional) Upper bound for response tokens generation
        input:
          type: array
          items:

@@ -6014,6 +6018,8 @@ components:
            (Optional) Additional fields to include in the response.
        max_infer_iters:
          type: integer
        max_output_tokens:
          type: integer
      additionalProperties: false
      required:
        - input

@@ -6077,6 +6083,10 @@ components:
          type: string
          description: >-
            (Optional) Truncation strategy applied to the response
        max_output_tokens:
          type: integer
          description: >-
            (Optional) Upper bound for response tokens generation
      additionalProperties: false
      required:
        - created_at
docs/static/stainless-llama-stack-spec.html (vendored, 11 lines changed)

@@ -9512,6 +9512,10 @@
            "type": "string",
            "description": "(Optional) Truncation strategy applied to the response"
          },
          "max_output_tokens": {
            "type": "integer",
            "description": "(Optional) Upper bound for response tokens generation"
          },
          "input": {
            "type": "array",
            "items": {

@@ -10018,6 +10022,9 @@
          },
          "max_infer_iters": {
            "type": "integer"
          },
          "max_output_tokens": {
            "type": "integer"
          }
        },
        "additionalProperties": false,

@@ -10087,6 +10094,10 @@
          "truncation": {
            "type": "string",
            "description": "(Optional) Truncation strategy applied to the response"
          },
          "max_output_tokens": {
            "type": "integer",
            "description": "(Optional) Upper bound for response tokens generation"
          }
        },
        "additionalProperties": false,
docs/static/stainless-llama-stack-spec.yaml (vendored, 10 lines changed)

@@ -7105,6 +7105,10 @@ components:
          type: string
          description: >-
            (Optional) Truncation strategy applied to the response
        max_output_tokens:
          type: integer
          description: >-
            (Optional) Upper bound for response tokens generation
        input:
          type: array
          items:

@@ -7459,6 +7463,8 @@ components:
            (Optional) Additional fields to include in the response.
        max_infer_iters:
          type: integer
        max_output_tokens:
          type: integer
      additionalProperties: false
      required:
        - input

@@ -7522,6 +7528,10 @@ components:
          type: string
          description: >-
            (Optional) Truncation strategy applied to the response
        max_output_tokens:
          type: integer
          description: >-
            (Optional) Upper bound for response tokens generation
      additionalProperties: false
      required:
        - created_at
@@ -825,6 +825,7 @@ class Agents(Protocol):
                "List of shields to apply during response generation. Shields provide safety and content moderation."
            ),
        ] = None,
        max_output_tokens: int | None = None,
    ) -> OpenAIResponseObject | AsyncIterator[OpenAIResponseObjectStream]:
        """Create a new OpenAI response.

@@ -363,6 +363,7 @@ class OpenAIResponseObject(BaseModel):
    :param text: Text formatting configuration for the response
    :param top_p: (Optional) Nucleus sampling parameter used for generation
    :param truncation: (Optional) Truncation strategy applied to the response
    :param max_output_tokens: (Optional) Upper bound for response tokens generation
    """

    created_at: int

@@ -380,6 +381,7 @@
    text: OpenAIResponseText = OpenAIResponseText(format=OpenAIResponseTextFormat(type="text"))
    top_p: float | None = None
    truncation: str | None = None
    max_output_tokens: int | None = None


@json_schema_type
@@ -204,6 +204,7 @@ class OpenAIResponsesImpl:
        store: bool | None = True,
        stream: bool | None = False,
        temperature: float | None = None,
        max_output_tokens: int | None = None,
        text: OpenAIResponseText | None = None,
        tools: list[OpenAIResponseInputTool] | None = None,
        include: list[str] | None = None,

@@ -224,6 +225,7 @@
            previous_response_id=previous_response_id,
            store=store,
            temperature=temperature,
            max_output_tokens=max_output_tokens,
            text=text,
            tools=tools,
            max_infer_iters=max_infer_iters,

@@ -252,6 +254,7 @@
        previous_response_id: str | None = None,
        store: bool | None = True,
        temperature: float | None = None,
        max_output_tokens: int | None = None,
        text: OpenAIResponseText | None = None,
        tools: list[OpenAIResponseInputTool] | None = None,
        max_infer_iters: int | None = 10,

@@ -268,6 +271,7 @@
            messages=messages,
            response_tools=tools,
            temperature=temperature,
            max_tokens=max_output_tokens,
            response_format=response_format,
            inputs=input,
        )
@@ -63,6 +63,7 @@ class ChatCompletionContext(BaseModel):
    response_format: OpenAIResponseFormatParam
    approval_requests: list[OpenAIResponseMCPApprovalRequest] = []
    approval_responses: dict[str, OpenAIResponseMCPApprovalResponse] = {}
    max_tokens: int | None = None

    def __init__(
        self,

@@ -72,6 +73,7 @@ class ChatCompletionContext(BaseModel):
        temperature: float | None,
        response_format: OpenAIResponseFormatParam,
        inputs: list[OpenAIResponseInput] | str,
        max_tokens: int | None = None,
    ):
        super().__init__(
            model=model,
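Taken together, the implementation hunks above thread the Responses-level max_output_tokens down to the underlying chat completion as max_tokens via ChatCompletionContext. A rough sketch of that mapping under the models shown above; the context class here is a simplified stand-in and the build_chat_completion_kwargs helper is hypothetical, not code from this PR:

# Sketch only: forward a stored ChatCompletionContext.max_tokens into the
# keyword arguments of a downstream chat-completion call.
from pydantic import BaseModel


class ChatCompletionContextSketch(BaseModel):
    model: str
    temperature: float | None = None
    max_tokens: int | None = None  # populated from max_output_tokens in the Responses layer


def build_chat_completion_kwargs(ctx: ChatCompletionContextSketch) -> dict:
    """Collect only the sampling knobs that were actually set."""
    kwargs: dict = {"model": ctx.model}
    if ctx.temperature is not None:
        kwargs["temperature"] = ctx.temperature
    if ctx.max_tokens is not None:
        kwargs["max_tokens"] = ctx.max_tokens  # Responses' max_output_tokens ends up here
    return kwargs


# Example: a Responses call with max_output_tokens=15 yields max_tokens=15 downstream.
ctx = ChatCompletionContextSketch(model="example-model", max_tokens=15)
assert build_chat_completion_kwargs(ctx) == {"model": "example-model", "max_tokens": 15}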
@@ -297,3 +297,38 @@ def test_function_call_output_response_with_none_arguments(openai_client, client
    assert response.output[0].type == "function_call"
    assert response.output[0].arguments == "{}"
    _ = response.output[0].call_id


def test_response_with_max_output_tokens(compat_client, text_model_id):
    """Test that the `max_output_tokens` parameter is used."""
    if not isinstance(compat_client, OpenAI):
        pytest.skip("This test requires the OpenAI client.")

    response = compat_client.responses.create(
        model=text_model_id,
        input=[
            {
                "role": "user",
                "content": "what's the current time? You MUST call the `get_current_time` function to find out.",
            }
        ],
        max_output_tokens=15,
        stream=False,
    )

    assert response.id is not None
    assert response.model == text_model_id

    assert hasattr(response, "max_output_tokens")
    assert response.max_output_tokens == 15

    output_text = ""
    for item in response.output:
        if item.type == "message" and item.role == "assistant":
            if item.content and item.content.type == "text":
                output_text = item.content.text
                break

    assert output_text, "Assistant response content should not be empty"

    assert len(output_text.split()) < 30