Mirror of https://github.com/meta-llama/llama-stack.git, synced 2025-12-13 17:12:37 +00:00
feat: Add max_output_tokens to Response API
Responses and Completions have a max_output_tokens field, but it is currently missing from both the create request and the response object in the Responses API. This PR adds it.

Fixes: #3562
Signed-off-by: Abhishek Bongale <abhishekbongale@outlook.com>
This commit is contained in:
parent
92219fd8fb
commit
bb58da22a1
14 changed files with 127 additions and 20 deletions
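For orientation, a minimal sketch of how the new parameter is exercised end to end, assuming a local Llama Stack server exposing its OpenAI-compatible endpoint (the base URL, API key, and model name below are illustrative assumptions, not part of this diff):

from openai import OpenAI

# Assumed local Llama Stack deployment; adjust base_url and model to your setup.
client = OpenAI(base_url="http://localhost:8321/v1/openai/v1", api_key="none")

response = client.responses.create(
    model="meta-llama/Llama-3.3-70B-Instruct",
    input="Summarize the Responses API in one sentence.",
    max_output_tokens=64,  # the field this PR threads through the create call and response object
)

# The response object now echoes the bound back.
print(response.max_output_tokens)
print(response.output_text)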
@@ -1,7 +1,7 @@
---
description: "Agents

APIs for creating and interacting with agentic systems."
sidebar_label: Agents
title: Agents
---
@@ -12,6 +12,6 @@ title: Agents
Agents

APIs for creating and interacting with agentic systems.

This section contains documentation for all available providers for the **agents** API.

@@ -1,14 +1,14 @@
---
description: "The Batches API enables efficient processing of multiple requests in a single operation,
particularly useful for processing large datasets, batch evaluation workflows, and
cost-effective inference at scale.

The API is designed to allow use of openai client libraries for seamless integration.

This API provides the following extensions:
- idempotent batch creation

Note: This API is currently under active development and may undergo changes."
sidebar_label: Batches
title: Batches
---
@@ -18,14 +18,14 @@ title: Batches
## Overview

The Batches API enables efficient processing of multiple requests in a single operation,
particularly useful for processing large datasets, batch evaluation workflows, and
cost-effective inference at scale.

The API is designed to allow use of openai client libraries for seamless integration.

This API provides the following extensions:
- idempotent batch creation

Note: This API is currently under active development and may undergo changes.

This section contains documentation for all available providers for the **batches** API.

@@ -1,9 +1,9 @@
---
description: "Llama Stack Inference API for generating completions, chat completions, and embeddings.

This API provides the raw interface to the underlying models. Two kinds of models are supported:
- LLM models: these models generate \"raw\" and \"chat\" (conversational) completions.
- Embedding models: these models generate embeddings to be used for semantic search."
sidebar_label: Inference
title: Inference
---
@@ -14,8 +14,8 @@ title: Inference

Llama Stack Inference API for generating completions, chat completions, and embeddings.

This API provides the raw interface to the underlying models. Two kinds of models are supported:
- LLM models: these models generate "raw" and "chat" (conversational) completions.
- Embedding models: these models generate embeddings to be used for semantic search.

This section contains documentation for all available providers for the **inference** API.

docs/static/deprecated-llama-stack-spec.html (vendored, 11 changed lines)

@@ -9096,6 +9096,10 @@
"type": "string",
"description": "(Optional) Truncation strategy applied to the response"
},
"max_output_tokens": {
"type": "integer",
"description": "(Optional) Upper bound for response tokens generation"
},
"input": {
"type": "array",
"items": {
@@ -9914,6 +9918,9 @@
},
"max_infer_iters": {
"type": "integer"
},
"max_output_tokens": {
"type": "integer"
}
},
"additionalProperties": false,
@@ -9983,6 +9990,10 @@
"truncation": {
"type": "string",
"description": "(Optional) Truncation strategy applied to the response"
},
"max_output_tokens": {
"type": "integer",
"description": "(Optional) Upper bound for response tokens generation"
}
},
"additionalProperties": false,

docs/static/deprecated-llama-stack-spec.yaml (vendored, 10 changed lines)

@@ -6740,6 +6740,10 @@ components:
type: string
description: >-
(Optional) Truncation strategy applied to the response
max_output_tokens:
type: integer
description: >-
(Optional) Upper bound for response tokens generation
input:
type: array
items:
@@ -7351,6 +7355,8 @@
(Optional) Additional fields to include in the response.
max_infer_iters:
type: integer
max_output_tokens:
type: integer
additionalProperties: false
required:
- input
@@ -7414,6 +7420,10 @@
type: string
description: >-
(Optional) Truncation strategy applied to the response
max_output_tokens:
type: integer
description: >-
(Optional) Upper bound for response tokens generation
additionalProperties: false
required:
- created_at

docs/static/llama-stack-spec.html (vendored, 11 changed lines)

@@ -7503,6 +7503,10 @@
"type": "string",
"description": "(Optional) Truncation strategy applied to the response"
},
"max_output_tokens": {
"type": "integer",
"description": "(Optional) Upper bound for response tokens generation"
},
"input": {
"type": "array",
"items": {
@@ -8009,6 +8013,9 @@
},
"max_infer_iters": {
"type": "integer"
},
"max_output_tokens": {
"type": "integer"
}
},
"additionalProperties": false,
@@ -8078,6 +8085,10 @@
"truncation": {
"type": "string",
"description": "(Optional) Truncation strategy applied to the response"
},
"max_output_tokens": {
"type": "integer",
"description": "(Optional) Upper bound for response tokens generation"
}
},
"additionalProperties": false,

docs/static/llama-stack-spec.yaml (vendored, 10 changed lines)

@@ -5660,6 +5660,10 @@ components:
type: string
description: >-
(Optional) Truncation strategy applied to the response
max_output_tokens:
type: integer
description: >-
(Optional) Upper bound for response tokens generation
input:
type: array
items:
@@ -6014,6 +6018,8 @@
(Optional) Additional fields to include in the response.
max_infer_iters:
type: integer
max_output_tokens:
type: integer
additionalProperties: false
required:
- input
@@ -6077,6 +6083,10 @@
type: string
description: >-
(Optional) Truncation strategy applied to the response
max_output_tokens:
type: integer
description: >-
(Optional) Upper bound for response tokens generation
additionalProperties: false
required:
- created_at

docs/static/stainless-llama-stack-spec.html (vendored, 11 changed lines)

@@ -9512,6 +9512,10 @@
"type": "string",
"description": "(Optional) Truncation strategy applied to the response"
},
"max_output_tokens": {
"type": "integer",
"description": "(Optional) Upper bound for response tokens generation"
},
"input": {
"type": "array",
"items": {
@@ -10018,6 +10022,9 @@
},
"max_infer_iters": {
"type": "integer"
},
"max_output_tokens": {
"type": "integer"
}
},
"additionalProperties": false,
@@ -10087,6 +10094,10 @@
"truncation": {
"type": "string",
"description": "(Optional) Truncation strategy applied to the response"
},
"max_output_tokens": {
"type": "integer",
"description": "(Optional) Upper bound for response tokens generation"
}
},
"additionalProperties": false,

docs/static/stainless-llama-stack-spec.yaml (vendored, 10 changed lines)

@@ -7105,6 +7105,10 @@ components:
type: string
description: >-
(Optional) Truncation strategy applied to the response
max_output_tokens:
type: integer
description: >-
(Optional) Upper bound for response tokens generation
input:
type: array
items:
@@ -7459,6 +7463,8 @@
(Optional) Additional fields to include in the response.
max_infer_iters:
type: integer
max_output_tokens:
type: integer
additionalProperties: false
required:
- input
@@ -7522,6 +7528,10 @@
type: string
description: >-
(Optional) Truncation strategy applied to the response
max_output_tokens:
type: integer
description: >-
(Optional) Upper bound for response tokens generation
additionalProperties: false
required:
- created_at

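The same optional integer property is added to both the request and response schemas in each generated spec variant above. As a rough illustration of the wire format (the route, port, and model id are assumptions about a typical local deployment, not taken from this diff):

import requests

# Hypothetical local endpoint for the OpenAI-compatible Responses route.
url = "http://localhost:8321/v1/openai/v1/responses"

payload = {
    "model": "meta-llama/Llama-3.3-70B-Instruct",  # illustrative model id
    "input": "Say hello in five words or fewer.",
    "max_output_tokens": 15,  # newly documented optional integer
}

resp = requests.post(url, json=payload, timeout=60)
resp.raise_for_status()
body = resp.json()

# Per the updated response schema, the field is echoed back when set.
print(body.get("max_output_tokens"))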
@@ -825,6 +825,7 @@ class Agents(Protocol):
"List of shields to apply during response generation. Shields provide safety and content moderation."
),
] = None,
max_output_tokens: int | None = None,
) -> OpenAIResponseObject | AsyncIterator[OpenAIResponseObjectStream]:
"""Create a new OpenAI response.

@@ -363,6 +363,7 @@ class OpenAIResponseObject(BaseModel):
:param text: Text formatting configuration for the response
:param top_p: (Optional) Nucleus sampling parameter used for generation
:param truncation: (Optional) Truncation strategy applied to the response
:param max_output_tokens: (Optional) Upper bound for response tokens generation
"""

created_at: int
@@ -380,6 +381,7 @@
text: OpenAIResponseText = OpenAIResponseText(format=OpenAIResponseTextFormat(type="text"))
top_p: float | None = None
truncation: str | None = None
max_output_tokens: int | None = None


@json_schema_type

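Because the new model attribute defaults to None, previously stored or serialized responses that lack the field still validate. A tiny standalone sketch of that behavior (a simplified stand-in, not the actual OpenAIResponseObject class):

from pydantic import BaseModel


class ResponseSketch(BaseModel):
    # Simplified stand-in showing only the new optional field.
    id: str
    max_output_tokens: int | None = None


# Payloads that omit the field still validate and default to None.
print(ResponseSketch(id="resp_old").max_output_tokens)                        # None
print(ResponseSketch(id="resp_new", max_output_tokens=15).max_output_tokens)  # 15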
@@ -204,6 +204,7 @@ class OpenAIResponsesImpl:
store: bool | None = True,
stream: bool | None = False,
temperature: float | None = None,
max_output_tokens: int | None = None,
text: OpenAIResponseText | None = None,
tools: list[OpenAIResponseInputTool] | None = None,
include: list[str] | None = None,
@@ -224,6 +225,7 @@
previous_response_id=previous_response_id,
store=store,
temperature=temperature,
max_output_tokens=max_output_tokens,
text=text,
tools=tools,
max_infer_iters=max_infer_iters,
@@ -252,6 +254,7 @@
previous_response_id: str | None = None,
store: bool | None = True,
temperature: float | None = None,
max_output_tokens: int | None = None,
text: OpenAIResponseText | None = None,
tools: list[OpenAIResponseInputTool] | None = None,
max_infer_iters: int | None = 10,
@@ -268,6 +271,7 @@
messages=messages,
response_tools=tools,
temperature=temperature,
max_tokens=max_output_tokens,
response_format=response_format,
inputs=input,
)

@@ -63,6 +63,7 @@ class ChatCompletionContext(BaseModel):
response_format: OpenAIResponseFormatParam
approval_requests: list[OpenAIResponseMCPApprovalRequest] = []
approval_responses: dict[str, OpenAIResponseMCPApprovalResponse] = {}
max_tokens: int | None = None

def __init__(
self,
@@ -72,6 +73,7 @@
temperature: float | None,
response_format: OpenAIResponseFormatParam,
inputs: list[OpenAIResponseInput] | str,
max_tokens: int | None = None,
):
super().__init__(
model=model,

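Taken together, the implementation changes above thread the value from the public create call down to the inference request, where it is renamed to max_tokens at the chat-completion layer. A condensed, hypothetical sketch of that flow (names simplified; not the exact implementation):

from dataclasses import dataclass


@dataclass
class ChatContextSketch:
    # Mirrors the new ChatCompletionContext.max_tokens attribute (simplified).
    max_tokens: int | None = None


def create_response_sketch(max_output_tokens: int | None = None) -> ChatContextSketch:
    # The Responses-level name is mapped onto the chat-completion-level name,
    # which is what the underlying inference call ultimately receives.
    return ChatContextSketch(max_tokens=max_output_tokens)


ctx = create_response_sketch(max_output_tokens=15)
print(ctx.max_tokens)  # 15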
@@ -297,3 +297,38 @@ def test_function_call_output_response_with_none_arguments(openai_client, client
    assert response.output[0].type == "function_call"
    assert response.output[0].arguments == "{}"
    _ = response.output[0].call_id


def test_response_with_max_output_tokens(compat_client, text_model_id):
    """Test that the `max_output_tokens` parameter is used."""
    if not isinstance(compat_client, OpenAI):
        pytest.skip("This test requires the OpenAI client.")

    response = compat_client.responses.create(
        model=text_model_id,
        input=[
            {
                "role": "user",
                "content": "what's the current time? You MUST call the `get_current_time` function to find out.",
            }
        ],
        max_output_tokens=15,
        stream=False,
    )

    assert response.id is not None
    assert response.model == text_model_id

    assert hasattr(response, "max_output_tokens")
    assert response.max_output_tokens == 15

    output_text = ""
    for item in response.output:
        if item.type == "message" and item.role == "assistant":
            if item.content and item.content.type == "text":
                output_text = item.content.text
                break

    assert output_text, "Assistant response content should not be empty"

    assert len(output_text.split()) < 30