feat: Add max_output_tokens to Response API
The OpenAI Responses and Completions APIs support capping the number of generated output tokens; the Responses API exposes this as a max_output_tokens field. That field is currently missing from both the create request and the response object in Llama Stack's Responses API. This PR adds it.

fixes: #3562

Signed-off-by: Abhishek Bongale <abhishekbongale@outlook.com>
This commit is contained in:
parent 939a2db58f
commit 97b345b3f8

9 changed files with 59 additions and 0 deletions
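As context for the diffs below, here is a rough usage sketch of the new field through an OpenAI-compatible client. The base URL, API key, and model ID are placeholders for a locally running Llama Stack server and are not part of this commit.

    # Sketch: capping generated output tokens via the Responses API (assumed local server and model ID)
    from openai import OpenAI

    client = OpenAI(base_url="http://localhost:8321/v1", api_key="none")  # assumed Llama Stack endpoint

    resp = client.responses.create(
        model="meta-llama/Llama-3.1-8B-Instruct",  # placeholder model ID
        input="Summarize the history of the Llama models.",
        max_output_tokens=64,  # field added by this PR
    )

    # The response object now echoes the cap; status can end up "incomplete" when the cap is hit.
    print(resp.max_output_tokens, resp.status)
    print(resp.output_text)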
@@ -6880,6 +6880,10 @@ components:
           type: string
           description: >-
             (Optional) System message inserted into the model's context
+        max_output_tokens:
+          type: integer
+          description: >-
+            (Optional) Upper bound for response tokens generation.
         input:
           type: array
           items:
@@ -7238,6 +7242,10 @@ components:
             (Optional) Additional fields to include in the response.
         max_infer_iters:
           type: integer
+        max_output_tokens:
+          type: integer
+          description: >-
+            (Optional) Maximum tokens generated in a response.
       additionalProperties: false
       required:
         - input
@@ -7319,6 +7327,10 @@ components:
           type: string
           description: >-
             (Optional) System message inserted into the model's context
+        max_output_tokens:
+          type: integer
+          description: >-
+            (Optional) Upper bound for response tokens generation.
       additionalProperties: false
       required:
         - created_at
docs/static/llama-stack-spec.yaml (vendored): 12 additions
@@ -6164,6 +6164,10 @@ components:
           type: string
           description: >-
             (Optional) System message inserted into the model's context
+        max_output_tokens:
+          type: integer
+          description: >-
+            (Optional) Upper bound for response tokens generation.
         input:
           type: array
           items:
@@ -6522,6 +6526,10 @@ components:
             (Optional) Additional fields to include in the response.
         max_infer_iters:
           type: integer
+        max_output_tokens:
+          type: integer
+          description: >-
+            (Optional) Maximum tokens generated in a response.
       additionalProperties: false
       required:
         - input
@@ -6603,6 +6611,10 @@ components:
           type: string
           description: >-
             (Optional) System message inserted into the model's context
+        max_output_tokens:
+          type: integer
+          description: >-
+            (Optional) Upper bound for response tokens generation.
       additionalProperties: false
       required:
         - created_at
docs/static/stainless-llama-stack-spec.yaml (vendored): 12 additions
@@ -6880,6 +6880,10 @@ components:
           type: string
           description: >-
             (Optional) System message inserted into the model's context
+        max_output_tokens:
+          type: integer
+          description: >-
+            (Optional) Upper bound for response tokens generation.
         input:
           type: array
           items:
@@ -7238,6 +7242,10 @@ components:
             (Optional) Additional fields to include in the response.
         max_infer_iters:
           type: integer
+        max_output_tokens:
+          type: integer
+          description: >-
+            (Optional) Maximum tokens generated in a response.
       additionalProperties: false
       required:
         - input
@@ -7319,6 +7327,10 @@ components:
           type: string
           description: >-
             (Optional) System message inserted into the model's context
+        max_output_tokens:
+          type: integer
+          description: >-
+            (Optional) Upper bound for response tokens generation.
       additionalProperties: false
       required:
         - created_at
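Taken together, the spec hunks add the same optional integer to the create-request schema and to the response-object schema. A rough sketch of a raw request body that exercises the new field; the endpoint path, port, and model ID are assumptions for illustration, not taken from this commit:

    # Sketch: raw POST body matching the updated OpenAPI schema (assumed endpoint and model)
    import requests

    payload = {
        "model": "meta-llama/Llama-3.1-8B-Instruct",  # placeholder model ID
        "input": "Write a haiku about token budgets.",
        "max_output_tokens": 32,  # new optional integer field
    }
    r = requests.post("http://localhost:8321/v1/responses", json=payload)  # assumed path
    body = r.json()
    print(body.get("max_output_tokens"), body.get("status"))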
@@ -87,6 +87,7 @@ class Agents(Protocol):
                 "List of guardrails to apply during response generation. Guardrails provide safety and content moderation."
             ),
         ] = None,
+        max_output_tokens: int | None = None,
     ) -> OpenAIResponseObject | AsyncIterator[OpenAIResponseObjectStream]:
         """Create a model response.
 
@@ -97,6 +98,7 @@ class Agents(Protocol):
         :param conversation: (Optional) The ID of a conversation to add the response to. Must begin with 'conv_'. Input and output messages will be automatically added to the conversation.
         :param include: (Optional) Additional fields to include in the response.
         :param guardrails: (Optional) List of guardrails to apply during response generation. Can be guardrail IDs (strings) or guardrail specifications.
+        :param max_output_tokens: (Optional) Maximum tokens generated in a response.
         :returns: An OpenAIResponseObject.
         """
         ...
@@ -591,6 +591,7 @@ class OpenAIResponseObject(BaseModel):
     :param truncation: (Optional) Truncation strategy applied to the response
     :param usage: (Optional) Token usage information for the response
     :param instructions: (Optional) System message inserted into the model's context
+    :param max_output_tokens: (Optional) Upper bound for response tokens generation.
     """
 
     created_at: int
@@ -612,6 +613,7 @@ class OpenAIResponseObject(BaseModel):
     truncation: str | None = None
     usage: OpenAIResponseUsage | None = None
     instructions: str | None = None
+    max_output_tokens: int | None = None
 
 
 @json_schema_type
@@ -102,6 +102,7 @@ class MetaReferenceAgentsImpl(Agents):
         include: list[str] | None = None,
         max_infer_iters: int | None = 10,
         guardrails: list[ResponseGuardrail] | None = None,
+        max_output_tokens: int | None = None,
     ) -> OpenAIResponseObject:
         assert self.openai_responses_impl is not None, "OpenAI responses not initialized"
         result = await self.openai_responses_impl.create_openai_response(
@@ -119,6 +120,7 @@ class MetaReferenceAgentsImpl(Agents):
             include,
             max_infer_iters,
             guardrails,
+            max_output_tokens,
         )
         return result  # type: ignore[no-any-return]
 
@@ -255,6 +255,7 @@ class OpenAIResponsesImpl:
         include: list[str] | None = None,
         max_infer_iters: int | None = 10,
         guardrails: list[str | ResponseGuardrailSpec] | None = None,
+        max_output_tokens: int | None = None,
     ):
         stream = bool(stream)
         text = OpenAIResponseText(format=OpenAIResponseTextFormat(type="text")) if text is None else text
@@ -282,6 +283,7 @@ class OpenAIResponsesImpl:
             tools=tools,
             max_infer_iters=max_infer_iters,
             guardrail_ids=guardrail_ids,
+            max_output_tokens=max_output_tokens,
         )
 
         if stream:
@@ -331,6 +333,7 @@ class OpenAIResponsesImpl:
         tools: list[OpenAIResponseInputTool] | None = None,
         max_infer_iters: int | None = 10,
         guardrail_ids: list[str] | None = None,
+        max_output_tokens: int | None = None,
     ) -> AsyncIterator[OpenAIResponseObjectStream]:
         # These should never be None when called from create_openai_response (which sets defaults)
         # but we assert here to help mypy understand the types
@@ -356,6 +359,7 @@ class OpenAIResponsesImpl:
             response_format=response_format,
             tool_context=tool_context,
             inputs=all_input,
+            max_output_tokens=max_output_tokens,
         )
 
         # Create orchestrator and delegate streaming logic
@@ -221,6 +221,16 @@ class StreamingResponseOrchestrator:
 
         try:
             while True:
+                # Check whether the max_output_tokens budget is depleted
+                if (
+                    self.ctx.max_output_tokens
+                    and self.accumulated_usage
+                    and self.accumulated_usage.output_tokens >= self.ctx.max_output_tokens
+                ):
+                    logger.info("exiting inference loop, remaining max_output_tokens is depleted")
+                    final_status = "incomplete"
+                    break
+
                 # Text is the default response format for chat completion so don't need to pass it
                 # (some providers don't support non-empty response_format when tools are present)
                 response_format = (
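This orchestrator hunk is the behavioral core of the PR: at the top of each pass of the inference loop, accumulated output-token usage is compared against the requested budget, and the loop exits with an incomplete status once the budget is spent. A simplified, self-contained sketch of that pattern follows; the names below are illustrative, not the orchestrator's actual internals.

    # Sketch: stopping an agent inference loop once an output-token budget is exhausted.
    from dataclasses import dataclass

    @dataclass
    class Usage:
        output_tokens: int = 0

    def run_loop(turn_token_counts: list[int], max_output_tokens: int | None) -> tuple[str, Usage]:
        usage = Usage()
        status = "completed"
        for tokens in turn_token_counts:  # each entry stands in for one model call
            if max_output_tokens is not None and usage.output_tokens >= max_output_tokens:
                status = "incomplete"  # budget already spent, skip further turns
                break
            usage.output_tokens += tokens  # accumulate usage reported by the provider
        return status, usage

    # Example: a three-turn run capped at 100 output tokens stops before the last turn.
    print(run_loop([60, 50, 40], max_output_tokens=100))  # ('incomplete', Usage(output_tokens=110))

As in the diff, the check runs before each iteration, so the turn that crosses the cap still completes; the status flips to incomplete on the next pass.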
@@ -160,6 +160,7 @@ class ChatCompletionContext(BaseModel):
     tool_context: ToolContext | None
     approval_requests: list[OpenAIResponseMCPApprovalRequest] = []
     approval_responses: dict[str, OpenAIResponseMCPApprovalResponse] = {}
+    max_output_tokens: int | None
 
     def __init__(
         self,
@@ -170,6 +171,7 @@ class ChatCompletionContext(BaseModel):
         response_format: OpenAIResponseFormatParam,
         tool_context: ToolContext,
         inputs: list[OpenAIResponseInput] | str,
+        max_output_tokens: int | None,
     ):
         super().__init__(
             model=model,
@@ -178,6 +180,7 @@ class ChatCompletionContext(BaseModel):
             temperature=temperature,
             response_format=response_format,
             tool_context=tool_context,
+            max_output_tokens=max_output_tokens,
         )
         if not isinstance(inputs, str):
             self.approval_requests = [input for input in inputs if input.type == "mcp_approval_request"]