Mirror of https://github.com/meta-llama/llama-stack.git (synced 2025-12-13 21:02:39 +00:00)
feat: Add max_output_tokens to Response API
Responses and Completions have a max_output_tokens field. It is currently missing from both the create request and the response object in the Responses API. This PR adds it.

Fixes: #3562

Signed-off-by: Abhishek Bongale <abhishekbongale@outlook.com>
This commit is contained in:
parent 92219fd8fb
commit bb58da22a1

14 changed files with 127 additions and 20 deletions
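For context before the diffs, here is a minimal usage sketch (not part of this commit) of the parameter the PR threads through, sent via the OpenAI-compatible client. The base URL, API key, and model id are placeholders for a local Llama Stack deployment, not values taken from this PR.

# Minimal usage sketch; base URL, API key, and model id are placeholders.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8321/v1", api_key="not-needed")

response = client.responses.create(
    model="meta-llama/Llama-3.2-3B-Instruct",
    input="Summarize the Responses API in one sentence.",
    max_output_tokens=64,  # the field this PR adds to create and response
)
# The response object now echoes the field back:
print(response.max_output_tokens)
print(response.output_text)

The per-file diffs follow.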
@@ -1,7 +1,7 @@
 ---
 description: "Agents

 APIs for creating and interacting with agentic systems."
 sidebar_label: Agents
 title: Agents
 ---
@@ -12,6 +12,6 @@ title: Agents

 Agents

 APIs for creating and interacting with agentic systems.

 This section contains documentation for all available providers for the **agents** API.
@@ -1,14 +1,14 @@
 ---
 description: "The Batches API enables efficient processing of multiple requests in a single operation,
 particularly useful for processing large datasets, batch evaluation workflows, and
 cost-effective inference at scale.

 The API is designed to allow use of openai client libraries for seamless integration.

 This API provides the following extensions:
 - idempotent batch creation

 Note: This API is currently under active development and may undergo changes."
 sidebar_label: Batches
 title: Batches
 ---
@@ -18,14 +18,14 @@ title: Batches
 ## Overview

 The Batches API enables efficient processing of multiple requests in a single operation,
 particularly useful for processing large datasets, batch evaluation workflows, and
 cost-effective inference at scale.

 The API is designed to allow use of openai client libraries for seamless integration.

 This API provides the following extensions:
 - idempotent batch creation

 Note: This API is currently under active development and may undergo changes.

 This section contains documentation for all available providers for the **batches** API.
@@ -1,9 +1,9 @@
 ---
 description: "Llama Stack Inference API for generating completions, chat completions, and embeddings.

 This API provides the raw interface to the underlying models. Two kinds of models are supported:
 - LLM models: these models generate \"raw\" and \"chat\" (conversational) completions.
 - Embedding models: these models generate embeddings to be used for semantic search."
 sidebar_label: Inference
 title: Inference
 ---
@@ -14,8 +14,8 @@ title: Inference

 Llama Stack Inference API for generating completions, chat completions, and embeddings.

 This API provides the raw interface to the underlying models. Two kinds of models are supported:
 - LLM models: these models generate "raw" and "chat" (conversational) completions.
 - Embedding models: these models generate embeddings to be used for semantic search.

 This section contains documentation for all available providers for the **inference** API.
docs/static/deprecated-llama-stack-spec.html (vendored, 11 changed lines)

@@ -9096,6 +9096,10 @@
             "type": "string",
             "description": "(Optional) Truncation strategy applied to the response"
           },
+          "max_output_tokens": {
+            "type": "integer",
+            "description": "(Optional) Upper bound for response tokens generation"
+          },
           "input": {
             "type": "array",
             "items": {
@@ -9914,6 +9918,9 @@
           },
           "max_infer_iters": {
             "type": "integer"
+          },
+          "max_output_tokens": {
+            "type": "integer"
           }
         },
         "additionalProperties": false,
@@ -9983,6 +9990,10 @@
           "truncation": {
             "type": "string",
             "description": "(Optional) Truncation strategy applied to the response"
+          },
+          "max_output_tokens": {
+            "type": "integer",
+            "description": "(Optional) Upper bound for response tokens generation"
           }
         },
         "additionalProperties": false,
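The same schema addition recurs in all six vendored spec files below (HTML and YAML variants): the create-request body and the response object each gain an optional integer property. As a sketch, a request body that the updated schema now accepts might look like this; the model id is a placeholder.

# Sketch of a create-request body under the updated schema.
# "max_output_tokens" is the new optional integer property.
request_body = {
    "model": "meta-llama/Llama-3.2-3B-Instruct",  # placeholder model id
    "input": "Hello",
    "max_output_tokens": 128,
}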
docs/static/deprecated-llama-stack-spec.yaml (vendored, 10 changed lines)

@@ -6740,6 +6740,10 @@ components:
           type: string
           description: >-
             (Optional) Truncation strategy applied to the response
+        max_output_tokens:
+          type: integer
+          description: >-
+            (Optional) Upper bound for response tokens generation
         input:
           type: array
           items:
@@ -7351,6 +7355,8 @@ components:
             (Optional) Additional fields to include in the response.
         max_infer_iters:
           type: integer
+        max_output_tokens:
+          type: integer
       additionalProperties: false
       required:
         - input
@@ -7414,6 +7420,10 @@ components:
           type: string
           description: >-
             (Optional) Truncation strategy applied to the response
+        max_output_tokens:
+          type: integer
+          description: >-
+            (Optional) Upper bound for response tokens generation
       additionalProperties: false
       required:
         - created_at
docs/static/llama-stack-spec.html (vendored, 11 changed lines)

@@ -7503,6 +7503,10 @@
             "type": "string",
             "description": "(Optional) Truncation strategy applied to the response"
           },
+          "max_output_tokens": {
+            "type": "integer",
+            "description": "(Optional) Upper bound for response tokens generation"
+          },
           "input": {
             "type": "array",
             "items": {
@@ -8009,6 +8013,9 @@
           },
           "max_infer_iters": {
             "type": "integer"
+          },
+          "max_output_tokens": {
+            "type": "integer"
           }
         },
         "additionalProperties": false,
@@ -8078,6 +8085,10 @@
           "truncation": {
             "type": "string",
             "description": "(Optional) Truncation strategy applied to the response"
+          },
+          "max_output_tokens": {
+            "type": "integer",
+            "description": "(Optional) Upper bound for response tokens generation"
           }
         },
         "additionalProperties": false,
docs/static/llama-stack-spec.yaml (vendored, 10 changed lines)

@@ -5660,6 +5660,10 @@ components:
           type: string
           description: >-
             (Optional) Truncation strategy applied to the response
+        max_output_tokens:
+          type: integer
+          description: >-
+            (Optional) Upper bound for response tokens generation
         input:
           type: array
           items:
@@ -6014,6 +6018,8 @@ components:
             (Optional) Additional fields to include in the response.
         max_infer_iters:
           type: integer
+        max_output_tokens:
+          type: integer
       additionalProperties: false
       required:
         - input
@@ -6077,6 +6083,10 @@ components:
           type: string
           description: >-
             (Optional) Truncation strategy applied to the response
+        max_output_tokens:
+          type: integer
+          description: >-
+            (Optional) Upper bound for response tokens generation
       additionalProperties: false
       required:
         - created_at
docs/static/stainless-llama-stack-spec.html (vendored, 11 changed lines)

@@ -9512,6 +9512,10 @@
             "type": "string",
             "description": "(Optional) Truncation strategy applied to the response"
           },
+          "max_output_tokens": {
+            "type": "integer",
+            "description": "(Optional) Upper bound for response tokens generation"
+          },
           "input": {
             "type": "array",
             "items": {
@@ -10018,6 +10022,9 @@
           },
           "max_infer_iters": {
             "type": "integer"
+          },
+          "max_output_tokens": {
+            "type": "integer"
           }
         },
         "additionalProperties": false,
@@ -10087,6 +10094,10 @@
           "truncation": {
             "type": "string",
             "description": "(Optional) Truncation strategy applied to the response"
+          },
+          "max_output_tokens": {
+            "type": "integer",
+            "description": "(Optional) Upper bound for response tokens generation"
           }
         },
         "additionalProperties": false,
docs/static/stainless-llama-stack-spec.yaml (vendored, 10 changed lines)

@@ -7105,6 +7105,10 @@ components:
           type: string
           description: >-
             (Optional) Truncation strategy applied to the response
+        max_output_tokens:
+          type: integer
+          description: >-
+            (Optional) Upper bound for response tokens generation
         input:
           type: array
           items:
@@ -7459,6 +7463,8 @@ components:
             (Optional) Additional fields to include in the response.
         max_infer_iters:
          type: integer
+        max_output_tokens:
+          type: integer
       additionalProperties: false
       required:
         - input
@@ -7522,6 +7528,10 @@ components:
          type: string
          description: >-
             (Optional) Truncation strategy applied to the response
+        max_output_tokens:
+          type: integer
+          description: >-
+            (Optional) Upper bound for response tokens generation
       additionalProperties: false
       required:
         - created_at
@@ -825,6 +825,7 @@ class Agents(Protocol):
                 "List of shields to apply during response generation. Shields provide safety and content moderation."
             ),
         ] = None,
+        max_output_tokens: int | None = None,
     ) -> OpenAIResponseObject | AsyncIterator[OpenAIResponseObjectStream]:
         """Create a new OpenAI response.

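Sketched below is roughly how the Protocol method reads after this hunk. Only max_output_tokens comes from the diff; the method name and the remaining parameters are assumptions abbreviated for illustration.

# Abbreviated sketch; method name and other parameters are assumptions.
from typing import Protocol


class Agents(Protocol):
    async def create_openai_response(
        self,
        input: str,  # the real signature accepts richer input types
        model: str,
        max_output_tokens: int | None = None,  # added by this PR
    ) -> object: ...  # really OpenAIResponseObject | AsyncIterator[...]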
@@ -363,6 +363,7 @@ class OpenAIResponseObject(BaseModel):
     :param text: Text formatting configuration for the response
     :param top_p: (Optional) Nucleus sampling parameter used for generation
     :param truncation: (Optional) Truncation strategy applied to the response
+    :param max_output_tokens: (Optional) Upper bound for response tokens generation
     """

     created_at: int
@@ -380,6 +381,7 @@ class OpenAIResponseObject(BaseModel):
     text: OpenAIResponseText = OpenAIResponseText(format=OpenAIResponseTextFormat(type="text"))
     top_p: float | None = None
     truncation: str | None = None
+    max_output_tokens: int | None = None


 @json_schema_type
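A reduced, runnable sketch of the model after this change; unrelated fields and the OpenAIResponseText machinery are omitted. Defaulting to None means previously stored responses still deserialize cleanly.

from pydantic import BaseModel


# Reduced sketch of OpenAIResponseObject; most real fields omitted.
class OpenAIResponseObject(BaseModel):
    created_at: int
    top_p: float | None = None
    truncation: str | None = None
    max_output_tokens: int | None = None  # new: None keeps old data valid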
@@ -204,6 +204,7 @@ class OpenAIResponsesImpl:
         store: bool | None = True,
         stream: bool | None = False,
         temperature: float | None = None,
+        max_output_tokens: int | None = None,
         text: OpenAIResponseText | None = None,
         tools: list[OpenAIResponseInputTool] | None = None,
         include: list[str] | None = None,
@@ -224,6 +225,7 @@ class OpenAIResponsesImpl:
             previous_response_id=previous_response_id,
             store=store,
             temperature=temperature,
+            max_output_tokens=max_output_tokens,
             text=text,
             tools=tools,
             max_infer_iters=max_infer_iters,
@@ -252,6 +254,7 @@ class OpenAIResponsesImpl:
         previous_response_id: str | None = None,
         store: bool | None = True,
         temperature: float | None = None,
+        max_output_tokens: int | None = None,
         text: OpenAIResponseText | None = None,
         tools: list[OpenAIResponseInputTool] | None = None,
         max_infer_iters: int | None = 10,
@@ -268,6 +271,7 @@ class OpenAIResponsesImpl:
             messages=messages,
             response_tools=tools,
             temperature=temperature,
+            max_tokens=max_output_tokens,
             response_format=response_format,
             inputs=input,
         )
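Note the rename in the last hunk: the Responses-API name max_output_tokens is handed to ChatCompletionContext as max_tokens, the name used at the chat-completions level. A self-contained, illustrative sketch of that boundary mapping (not the actual implementation):

# Illustrative sketch of the boundary rename.
def build_chat_kwargs(max_output_tokens: int | None) -> dict:
    kwargs: dict = {}
    if max_output_tokens is not None:
        kwargs["max_tokens"] = max_output_tokens  # renamed at this boundary
    return kwargs


assert build_chat_kwargs(15) == {"max_tokens": 15}
assert build_chat_kwargs(None) == {}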
@@ -63,6 +63,7 @@ class ChatCompletionContext(BaseModel):
     response_format: OpenAIResponseFormatParam
     approval_requests: list[OpenAIResponseMCPApprovalRequest] = []
     approval_responses: dict[str, OpenAIResponseMCPApprovalResponse] = {}
+    max_tokens: int | None = None

     def __init__(
         self,
@@ -72,6 +73,7 @@ class ChatCompletionContext(BaseModel):
         temperature: float | None,
         response_format: OpenAIResponseFormatParam,
         inputs: list[OpenAIResponseInput] | str,
+        max_tokens: int | None = None,
     ):
         super().__init__(
             model=model,
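Because ChatCompletionContext is a pydantic BaseModel with a custom __init__, the new value has to be both declared as a field and forwarded through super().__init__. A reduced, runnable sketch (unrelated fields omitted):

from pydantic import BaseModel


class ChatCompletionContext(BaseModel):
    # Reduced sketch: the real class also carries messages, tools, etc.
    model: str
    temperature: float | None = None
    max_tokens: int | None = None  # new field, filled from max_output_tokens

    def __init__(
        self,
        model: str,
        temperature: float | None = None,
        max_tokens: int | None = None,
    ):
        super().__init__(model=model, temperature=temperature, max_tokens=max_tokens)


ctx = ChatCompletionContext(model="example", max_tokens=15)
assert ctx.max_tokens == 15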
@@ -297,3 +297,38 @@ def test_function_call_output_response_with_none_arguments(openai_client, client
     assert response.output[0].type == "function_call"
     assert response.output[0].arguments == "{}"
     _ = response.output[0].call_id
+
+
+def test_response_with_max_output_tokens(compat_client, text_model_id):
+    """Test that the `max_output_tokens` parameter is used."""
+    if not isinstance(compat_client, OpenAI):
+        pytest.skip("This test requires the OpenAI client.")
+
+    response = compat_client.responses.create(
+        model=text_model_id,
+        input=[
+            {
+                "role": "user",
+                "content": "what's the current time? You MUST call the `get_current_time` function to find out.",
+            }
+        ],
+        max_output_tokens=15,
+        stream=False,
+    )
+
+    assert response.id is not None
+    assert response.model == text_model_id
+
+    assert hasattr(response, "max_output_tokens")
+    assert response.max_output_tokens == 15
+
+    output_text = ""
+    for item in response.output:
+        if item.type == "message" and item.role == "assistant":
+            if item.content and item.content.type == "text":
+                output_text = item.content.text
+                break
+
+    assert output_text, "Assistant response content should not be empty"
+
+    assert len(output_text.split()) < 30
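A note on the final assertion: with max_output_tokens=15 the completion is capped at 15 tokens, and since a whitespace-separated word is at least one token, fewer than 30 words is a deliberately loose bound that tolerates tokenizer differences across models rather than asserting an exact length.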