Mirror of https://github.com/meta-llama/llama-stack.git
feat: Add max_output_tokens to Response API
OpenAI's Responses and Completions APIs have a max_output_tokens field. It is currently missing from both the create request and the response object in the llama-stack Responses API. This PR adds it.

Fixes: #3562

Signed-off-by: Abhishek Bongale <abhishekbongale@outlook.com>
This commit is contained in:
parent 939a2db58f
commit 97b345b3f8

9 changed files with 59 additions and 0 deletions
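For context, here is a minimal client-side sketch of the new field. It assumes a llama-stack deployment serving the OpenAI-compatible Responses endpoint and uses the openai Python client; the base URL, API key, and model name are placeholders rather than values taken from this commit.

from openai import OpenAI

# Placeholder base URL, key, and model: adjust to your deployment.
client = OpenAI(base_url="http://localhost:8321/v1", api_key="none")

response = client.responses.create(
    model="meta-llama/Llama-3.2-3B-Instruct",
    input="Summarize the Responses API in one paragraph.",
    max_output_tokens=128,  # the field added by this PR
)

print(response.max_output_tokens)  # echoed back on the response object
print(response.output_text)

Per the orchestrator change further below, a run that exhausts the cap mid-generation is marked incomplete instead of raising an error.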
@@ -6880,6 +6880,10 @@ components:
           type: string
           description: >-
             (Optional) System message inserted into the model's context
+        max_output_tokens:
+          type: integer
+          description: >-
+            (Optional) Upper bound for response tokens generation.
         input:
           type: array
           items:
@@ -7238,6 +7242,10 @@ components:
             (Optional) Additional fields to include in the response.
         max_infer_iters:
           type: integer
+        max_output_tokens:
+          type: integer
+          description: >-
+            (Optional) Maximum tokens generated in a response.
       additionalProperties: false
       required:
         - input
@@ -7319,6 +7327,10 @@ components:
           type: string
           description: >-
             (Optional) System message inserted into the model's context
+        max_output_tokens:
+          type: integer
+          description: >-
+            (Optional) Upper bound for response tokens generation.
       additionalProperties: false
       required:
         - created_at
docs/static/llama-stack-spec.yaml (vendored): 12 additions
@@ -6164,6 +6164,10 @@ components:
           type: string
           description: >-
             (Optional) System message inserted into the model's context
+        max_output_tokens:
+          type: integer
+          description: >-
+            (Optional) Upper bound for response tokens generation.
         input:
           type: array
           items:
@@ -6522,6 +6526,10 @@ components:
             (Optional) Additional fields to include in the response.
         max_infer_iters:
           type: integer
+        max_output_tokens:
+          type: integer
+          description: >-
+            (Optional) Maximum tokens generated in a response.
       additionalProperties: false
       required:
         - input
@@ -6603,6 +6611,10 @@ components:
           type: string
           description: >-
             (Optional) System message inserted into the model's context
+        max_output_tokens:
+          type: integer
+          description: >-
+            (Optional) Upper bound for response tokens generation.
       additionalProperties: false
       required:
         - created_at
docs/static/stainless-llama-stack-spec.yaml (vendored): 12 additions
@@ -6880,6 +6880,10 @@ components:
           type: string
           description: >-
             (Optional) System message inserted into the model's context
+        max_output_tokens:
+          type: integer
+          description: >-
+            (Optional) Upper bound for response tokens generation.
         input:
           type: array
           items:
@@ -7238,6 +7242,10 @@ components:
             (Optional) Additional fields to include in the response.
         max_infer_iters:
           type: integer
+        max_output_tokens:
+          type: integer
+          description: >-
+            (Optional) Maximum tokens generated in a response.
       additionalProperties: false
       required:
         - input
@@ -7319,6 +7327,10 @@ components:
           type: string
           description: >-
             (Optional) System message inserted into the model's context
+        max_output_tokens:
+          type: integer
+          description: >-
+            (Optional) Upper bound for response tokens generation.
       additionalProperties: false
       required:
         - created_at
@@ -87,6 +87,7 @@ class Agents(Protocol):
                 "List of guardrails to apply during response generation. Guardrails provide safety and content moderation."
             ),
         ] = None,
+        max_output_tokens: int | None = None,
     ) -> OpenAIResponseObject | AsyncIterator[OpenAIResponseObjectStream]:
         """Create a model response.

@@ -97,6 +98,7 @@ class Agents(Protocol):
         :param conversation: (Optional) The ID of a conversation to add the response to. Must begin with 'conv_'. Input and output messages will be automatically added to the conversation.
         :param include: (Optional) Additional fields to include in the response.
         :param guardrails: (Optional) List of guardrails to apply during response generation. Can be guardrail IDs (strings) or guardrail specifications.
+        :param max_output_tokens: (Optional) Maximum tokens generated in a response.
         :returns: An OpenAIResponseObject.
         """
         ...
@@ -591,6 +591,7 @@ class OpenAIResponseObject(BaseModel):
     :param truncation: (Optional) Truncation strategy applied to the response
     :param usage: (Optional) Token usage information for the response
     :param instructions: (Optional) System message inserted into the model's context
+    :param max_output_tokens: (Optional) Upper bound for response tokens generation.
     """

     created_at: int
@@ -612,6 +613,7 @@ class OpenAIResponseObject(BaseModel):
     truncation: str | None = None
     usage: OpenAIResponseUsage | None = None
     instructions: str | None = None
+    max_output_tokens: int | None = None


 @json_schema_type
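Because OpenAIResponseObject now carries the field, a caller can read the requested cap back from the returned object. A minimal sketch, assuming the attribute names shown in the hunk above (max_output_tokens, usage.output_tokens) and the object's status attribute; whether a capped run is reported as incomplete comes from the orchestrator change further below.

# Illustrative only: inspect the output-token budget on a returned response object.
def describe_budget(response) -> str:
    if response.max_output_tokens is None:
        return "no output-token cap requested"
    used = response.usage.output_tokens if response.usage else 0
    return f"{used}/{response.max_output_tokens} output tokens used (status={response.status})"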
@@ -102,6 +102,7 @@ class MetaReferenceAgentsImpl(Agents):
         include: list[str] | None = None,
         max_infer_iters: int | None = 10,
         guardrails: list[ResponseGuardrail] | None = None,
+        max_output_tokens: int | None = None,
     ) -> OpenAIResponseObject:
         assert self.openai_responses_impl is not None, "OpenAI responses not initialized"
         result = await self.openai_responses_impl.create_openai_response(
@@ -119,6 +120,7 @@ class MetaReferenceAgentsImpl(Agents):
             include,
             max_infer_iters,
             guardrails,
+            max_output_tokens,
         )
         return result  # type: ignore[no-any-return]
@@ -255,6 +255,7 @@ class OpenAIResponsesImpl:
         include: list[str] | None = None,
         max_infer_iters: int | None = 10,
         guardrails: list[str | ResponseGuardrailSpec] | None = None,
+        max_output_tokens: int | None = None,
     ):
         stream = bool(stream)
         text = OpenAIResponseText(format=OpenAIResponseTextFormat(type="text")) if text is None else text
@@ -282,6 +283,7 @@ class OpenAIResponsesImpl:
             tools=tools,
             max_infer_iters=max_infer_iters,
             guardrail_ids=guardrail_ids,
+            max_output_tokens=max_output_tokens,
         )

         if stream:
@@ -331,6 +333,7 @@ class OpenAIResponsesImpl:
         tools: list[OpenAIResponseInputTool] | None = None,
         max_infer_iters: int | None = 10,
         guardrail_ids: list[str] | None = None,
+        max_output_tokens: int | None = None,
     ) -> AsyncIterator[OpenAIResponseObjectStream]:
         # These should never be None when called from create_openai_response (which sets defaults)
         # but we assert here to help mypy understand the types
@@ -356,6 +359,7 @@ class OpenAIResponsesImpl:
             response_format=response_format,
             tool_context=tool_context,
             inputs=all_input,
+            max_output_tokens=max_output_tokens,
         )

         # Create orchestrator and delegate streaming logic
@@ -221,6 +221,16 @@ class StreamingResponseOrchestrator:

         try:
             while True:
+                # Check whether max_output_tokens has been depleted
+                if (
+                    self.ctx.max_output_tokens
+                    and self.accumulated_usage
+                    and self.accumulated_usage.output_tokens >= self.ctx.max_output_tokens
+                ):
+                    logger.info("exiting inference loop, remaining max_output_tokens is depleted")
+                    final_status = "incomplete"
+                    break
+
                 # Text is the default response format for chat completion so don't need to pass it
                 # (some providers don't support non-empty response_format when tools are present)
                 response_format = (
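The check above gates each new inference turn on the tokens already consumed. A minimal standalone sketch of the same condition, with a hypothetical helper name, to make the semantics explicit: the loop stops before the next turn once accumulated output tokens reach the requested cap, and the response finishes as incomplete rather than failing.

# Illustrative sketch of the gating condition used above (hypothetical helper, not from the PR).
def should_stop_for_budget(max_output_tokens: int | None, output_tokens_so_far: int) -> bool:
    """True once the requested cap is reached; no cap means never stop early."""
    return max_output_tokens is not None and output_tokens_so_far >= max_output_tokens

assert should_stop_for_budget(None, 10_000) is False
assert should_stop_for_budget(128, 64) is False
assert should_stop_for_budget(128, 128) is True  # next turn is skipped; status becomes "incomplete"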
@@ -160,6 +160,7 @@ class ChatCompletionContext(BaseModel):
     tool_context: ToolContext | None
     approval_requests: list[OpenAIResponseMCPApprovalRequest] = []
     approval_responses: dict[str, OpenAIResponseMCPApprovalResponse] = {}
+    max_output_tokens: int | None

     def __init__(
         self,
@@ -170,6 +171,7 @@ class ChatCompletionContext(BaseModel):
         response_format: OpenAIResponseFormatParam,
         tool_context: ToolContext,
         inputs: list[OpenAIResponseInput] | str,
+        max_output_tokens: int | None,
     ):
         super().__init__(
             model=model,
@@ -178,6 +180,7 @@ class ChatCompletionContext(BaseModel):
             temperature=temperature,
             response_format=response_format,
             tool_context=tool_context,
+            max_output_tokens=max_output_tokens,
         )
         if not isinstance(inputs, str):
             self.approval_requests = [input for input in inputs if input.type == "mcp_approval_request"]