Mirror of https://github.com/meta-llama/llama-stack.git (synced 2025-12-03 09:53:45 +00:00)
Wire through parallel_tool_calls to Responses API
Signed-off-by: Anastas Stoyanovsky <astoyano@redhat.com>
parent 7093978754
commit 7a9b7ecdc2

9 changed files with 159 additions and 20 deletions
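A minimal sketch of how a client could exercise the new parameter once this change lands. This is not part of the commit; the base URL, API key, and model id are placeholder assumptions for a Llama Stack server exposing the OpenAI-compatible Responses API.

```python
# Sketch only: base_url, api_key, and model are placeholder assumptions.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8321/v1", api_key="none")

weather_tool = {
    "type": "function",
    "name": "get_weather",
    "description": "Get weather information for a specified location",
    "parameters": {
        "type": "object",
        "properties": {"location": {"type": "string"}},
    },
}

# parallel_tool_calls=False asks the model to emit at most one function
# tool call per turn; the default wired through by this commit is True.
response = client.responses.create(
    model="llama3.2:3b",
    input="Get the weather in New York and in Paris",
    tools=[weather_tool],
    parallel_tool_calls=False,
)
print(response.parallel_tool_calls, len(response.output))
```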
@@ -6723,9 +6723,12 @@ components:
          type: array
          title: Output
        parallel_tool_calls:
          type: boolean
          title: Parallel Tool Calls
          default: false
          type: boolean
          default: true
          description: >-
            (Optional) Whether to allow more than one function tool call generated
            per turn.
        previous_response_id:
          anyOf:
            - type: string

@@ -6838,8 +6841,10 @@ components:
        - created_at
        - id
        - model
        - object
        - output
        - status
        - text
        - input
      title: OpenAIResponseObjectWithInput
      description: OpenAI response object extended with input context information.

@@ -7122,9 +7127,12 @@ components:
            - type: 'null'
          title: OpenAIResponsePrompt
        instructions:
          type: string
          anyOf:
            - type: string
            - type: 'null'
        parallel_tool_calls:
          type: boolean
        previous_response_id:
          anyOf:
            - type: string

@@ -7253,7 +7261,10 @@ components:
        parallel_tool_calls:
          type: boolean
          title: Parallel Tool Calls
          default: false
          default: true
          description: >-
            (Optional) Whether to allow more than one function tool call generated
            per turn.
        previous_response_id:
          anyOf:
            - type: string
docs/static/llama-stack-spec.yaml (vendored), 13 changed lines

@@ -5746,7 +5746,10 @@ components:
        parallel_tool_calls:
          type: boolean
          title: Parallel Tool Calls
          default: false
          default: true
          description: >-
            (Optional) Whether to allow more than one function tool call generated
            per turn.
        previous_response_id:
          anyOf:
            - type: string

@@ -6143,9 +6146,12 @@ components:
            - type: 'null'
          title: OpenAIResponsePrompt
        instructions:
          type: string
          anyOf:
            - type: string
            - type: 'null'
        parallel_tool_calls:
          type: boolean
        previous_response_id:
          anyOf:
            - type: string

@@ -6274,7 +6280,10 @@ components:
        parallel_tool_calls:
          type: boolean
          title: Parallel Tool Calls
          default: false
          default: true
          description: >-
            (Optional) Whether to allow more than one function tool call generated
            per turn.
        previous_response_id:
          anyOf:
            - type: string
docs/static/stainless-llama-stack-spec.yaml (vendored), 13 changed lines

@@ -6725,7 +6725,10 @@ components:
        parallel_tool_calls:
          type: boolean
          title: Parallel Tool Calls
          default: false
          default: true
          description: >-
            (Optional) Whether to allow more than one function tool call generated
            per turn.
        previous_response_id:
          anyOf:
            - type: string

@@ -7125,6 +7128,9 @@ components:
          anyOf:
            - type: string
            - type: 'null'
          type: string
        parallel_tool_calls:
          type: boolean
        previous_response_id:
          anyOf:
            - type: string

@@ -7253,7 +7259,10 @@ components:
        parallel_tool_calls:
          type: boolean
          title: Parallel Tool Calls
          default: false
          default: true
          description: >-
            (Optional) Whether to allow more than one function tool call generated
            per turn.
        previous_response_id:
          anyOf:
            - type: string
@@ -92,6 +92,7 @@ class MetaReferenceAgentsImpl(Agents):
        model: str,
        prompt: OpenAIResponsePrompt | None = None,
        instructions: str | None = None,
        parallel_tool_calls: bool | None = True,
        previous_response_id: str | None = None,
        conversation: str | None = None,
        store: bool | None = True,

@@ -120,6 +121,7 @@ class MetaReferenceAgentsImpl(Agents):
            include,
            max_infer_iters,
            guardrails,
            parallel_tool_calls,
            max_tool_calls,
        )
        return result  # type: ignore[no-any-return]
@@ -252,6 +252,7 @@ class OpenAIResponsesImpl:
        include: list[str] | None = None,
        max_infer_iters: int | None = 10,
        guardrails: list[str | ResponseGuardrailSpec] | None = None,
        parallel_tool_calls: bool | None = None,
        max_tool_calls: int | None = None,
    ):
        stream = bool(stream)

@@ -296,6 +297,7 @@ class OpenAIResponsesImpl:
            tools=tools,
            max_infer_iters=max_infer_iters,
            guardrail_ids=guardrail_ids,
            parallel_tool_calls=parallel_tool_calls,
            max_tool_calls=max_tool_calls,
        )

@@ -346,6 +348,7 @@ class OpenAIResponsesImpl:
        tools: list[OpenAIResponseInputTool] | None = None,
        max_infer_iters: int | None = 10,
        guardrail_ids: list[str] | None = None,
        parallel_tool_calls: bool | None = True,
        max_tool_calls: int | None = None,
    ) -> AsyncIterator[OpenAIResponseObjectStream]:
        # These should never be None when called from create_openai_response (which sets defaults)

@@ -385,6 +388,7 @@ class OpenAIResponsesImpl:
            created_at=created_at,
            text=text,
            max_infer_iters=max_infer_iters,
            parallel_tool_calls=parallel_tool_calls,
            tool_executor=self.tool_executor,
            safety_api=self.safety_api,
            guardrail_ids=guardrail_ids,
@@ -114,6 +114,7 @@ class StreamingResponseOrchestrator:
        safety_api,
        guardrail_ids: list[str] | None = None,
        prompt: OpenAIResponsePrompt | None = None,
        parallel_tool_calls: bool | None = None,
        max_tool_calls: int | None = None,
    ):
        self.inference_api = inference_api

@@ -128,6 +129,8 @@ class StreamingResponseOrchestrator:
        self.prompt = prompt
        # System message that is inserted into the model's context
        self.instructions = instructions
        # Whether to allow more than one function tool call generated per turn.
        self.parallel_tool_calls = parallel_tool_calls
        # Max number of total calls to built-in tools that can be processed in a response
        self.max_tool_calls = max_tool_calls
        self.sequence_number = 0

@@ -190,6 +193,7 @@ class StreamingResponseOrchestrator:
            usage=self.accumulated_usage,
            instructions=self.instructions,
            prompt=self.prompt,
            parallel_tool_calls=self.parallel_tool_calls,
            max_tool_calls=self.max_tool_calls,
        )

@@ -301,6 +305,7 @@ class StreamingResponseOrchestrator:
            completion_result_data,
            output_messages,
            next_turn_messages,
            not self.parallel_tool_calls,
        ):
            yield stream_event

@@ -897,6 +902,7 @@ class StreamingResponseOrchestrator:
        completion_result_data: ChatCompletionResult,
        output_messages: list[OpenAIResponseOutput],
        next_turn_messages: list,
        incremental_function_calling: bool,
    ) -> AsyncIterator[OpenAIResponseObjectStream]:
        """Coordinate execution of both function and non-function tool calls."""
        # Execute non-function tool calls

@@ -1020,6 +1026,10 @@ class StreamingResponseOrchestrator:
            sequence_number=self.sequence_number,
        )

        # TODO: Make sure that multi-turn incremental execution works
        if incremental_function_calling:
            break

    async def _process_new_tools(
        self, tools: list[OpenAIResponseInputTool], output_messages: list[OpenAIResponseOutput]
    ) -> AsyncIterator[OpenAIResponseObjectStream]:
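The orchestrator forwards `not self.parallel_tool_calls` as the new `incremental_function_calling` argument, so disabling parallel tool calls makes the coordinator stop after the first function call in a turn. A simplified, hypothetical sketch of that gating idea (not the actual `_coordinate_tool_execution` code):

```python
# Hypothetical helper illustrating the gating behavior, under the assumption
# that incremental_function_calling = not parallel_tool_calls.
def select_function_calls(tool_calls: list[dict], parallel_tool_calls: bool | None) -> list[dict]:
    incremental_function_calling = not parallel_tool_calls
    selected: list[dict] = []
    for call in tool_calls:
        if call.get("type") != "function_call":
            continue
        selected.append(call)
        if incremental_function_calling:
            # Mirrors the `break` added in the coordinator: process only the
            # first function call so its result can be fed back before the
            # model emits another one.
            break
    return selected


calls = [
    {"type": "function_call", "name": "get_weather", "arguments": '{"location": "New York"}'},
    {"type": "function_call", "name": "get_weather", "arguments": '{"location": "Paris"}'},
]
assert len(select_function_calls(calls, parallel_tool_calls=False)) == 1
assert len(select_function_calls(calls, parallel_tool_calls=True)) == 2
```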
@@ -72,6 +72,7 @@ class Agents(Protocol):
        model: str,
        prompt: OpenAIResponsePrompt | None = None,
        instructions: str | None = None,
        parallel_tool_calls: bool | None = True,
        previous_response_id: str | None = None,
        conversation: str | None = None,
        store: bool | None = True,
@@ -585,7 +585,7 @@ class OpenAIResponseObject(BaseModel):
    :param model: Model identifier used for generation
    :param object: Object type identifier, always "response"
    :param output: List of generated output items (messages, tool calls, etc.)
    :param parallel_tool_calls: Whether tool calls can be executed in parallel
    :param parallel_tool_calls: (Optional) Whether to allow more than one function tool call generated per turn.
    :param previous_response_id: (Optional) ID of the previous response in a conversation
    :param prompt: (Optional) Reference to a prompt template and its variables.
    :param status: Current status of the response generation

@@ -605,7 +605,7 @@ class OpenAIResponseObject(BaseModel):
    model: str
    object: Literal["response"] = "response"
    output: Sequence[OpenAIResponseOutput]
    parallel_tool_calls: bool = False
    parallel_tool_calls: bool | None = True
    previous_response_id: str | None = None
    prompt: OpenAIResponsePrompt | None = None
    status: str
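The net effect of this model change is that `parallel_tool_calls` goes from a required-with-default `bool = False` to an optional `bool | None = True`. A stripped-down, illustrative Pydantic stand-in (only a minimal subset of fields, not the real OpenAIResponseObject):

```python
# Illustrative stand-in only; it is not the actual OpenAIResponseObject class.
from pydantic import BaseModel


class ResponseSketch(BaseModel):
    id: str
    parallel_tool_calls: bool | None = True  # was: parallel_tool_calls: bool = False


print(ResponseSketch(id="resp_123").model_dump())
# {'id': 'resp_123', 'parallel_tool_calls': True}
print(ResponseSketch(id="resp_123", parallel_tool_calls=False).model_dump())
# {'id': 'resp_123', 'parallel_tool_calls': False}
```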
@@ -682,3 +682,96 @@ def test_max_tool_calls_with_builtin_tools(openai_client, client_with_models, text_model_id):

    # Verify we have a valid max_tool_calls field
    assert response_3.max_tool_calls == max_tool_calls[1]


@pytest.mark.skip(reason="Tool calling is not reliable.")
def test_parallel_tool_calls_true(openai_client, client_with_models, text_model_id):
    """Test handling of parallel_tool_calls=True with function tools in responses."""
    if isinstance(client_with_models, LlamaStackAsLibraryClient):
        pytest.skip("OpenAI responses are not supported when testing with library client yet.")

    client = openai_client
    parallel_tool_calls = True

    tools = [
        {
            "type": "function",
            "name": "get_weather",
            "description": "Get weather information for a specified location",
            "parameters": {
                "type": "object",
                "properties": {
                    "location": {
                        "type": "string",
                        "description": "The city name (e.g., 'New York', 'London')",
                    },
                },
            },
        }
    ]

    # First create a response that triggers function tools
    response = client.responses.create(
        model=text_model_id,
        input="Get the weather in New York and in Paris",
        tools=tools,
        stream=False,
        parallel_tool_calls=parallel_tool_calls,
    )

    # Verify we got two function calls, since parallel tool calls are allowed
    assert len(response.output) == 2
    assert response.output[0].type == "function_call"
    assert response.output[0].name == "get_weather"
    assert response.output[0].status == "completed"
    assert response.output[1].type == "function_call"
    assert response.output[1].name == "get_weather"
    assert response.output[1].status == "completed"

    # Verify the parallel_tool_calls field round-trips
    assert response.parallel_tool_calls == parallel_tool_calls


@pytest.mark.skip(reason="Tool calling is not reliable.")
def test_parallel_tool_calls_false(openai_client, client_with_models, text_model_id):
    """Test handling of parallel_tool_calls=False with function tools in responses."""
    if isinstance(client_with_models, LlamaStackAsLibraryClient):
        pytest.skip("OpenAI responses are not supported when testing with library client yet.")

    client = openai_client
    parallel_tool_calls = False

    tools = [
        {
            "type": "function",
            "name": "get_weather",
            "description": "Get weather information for a specified location",
            "parameters": {
                "type": "object",
                "properties": {
                    "location": {
                        "type": "string",
                        "description": "The city name (e.g., 'New York', 'London')",
                    },
                },
            },
        }
    ]

    # First create a response that triggers function tools
    response = client.responses.create(
        model=text_model_id,
        input="Get the weather in New York and in Paris",
        tools=tools,
        stream=False,
        parallel_tool_calls=parallel_tool_calls,
    )

    # Verify we got a single function call, since parallel tool calls are disabled
    assert len(response.output) == 1
    assert response.output[0].type == "function_call"
    assert response.output[0].name == "get_weather"
    assert response.output[0].status == "completed"

    # Verify the parallel_tool_calls field round-trips
    assert response.parallel_tool_calls == parallel_tool_calls