Mirror of https://github.com/meta-llama/llama-stack.git, synced 2025-12-03 09:53:45 +00:00
Wire through parallel_tool_calls to Responses API
Signed-off-by: Anastas Stoyanovsky <astoyano@redhat.com>
This commit is contained in: parent 7093978754, commit 7a9b7ecdc2
9 changed files with 159 additions and 20 deletions
@@ -6723,9 +6723,12 @@ components:
           type: array
           title: Output
         parallel_tool_calls:
-          type: boolean
           title: Parallel Tool Calls
-          default: false
+          type: boolean
+          default: true
+          description: >-
+            (Optional) Whether to allow more than one function tool call generated
+            per turn.
         previous_response_id:
           anyOf:
             - type: string
@@ -6838,8 +6841,10 @@ components:
         - created_at
         - id
         - model
+        - object
         - output
         - status
+        - text
         - input
       title: OpenAIResponseObjectWithInput
       description: OpenAI response object extended with input context information.
@@ -7122,9 +7127,12 @@ components:
             - type: 'null'
           title: OpenAIResponsePrompt
         instructions:
+          type: string
           anyOf:
             - type: string
             - type: 'null'
+        parallel_tool_calls:
+          type: boolean
         previous_response_id:
           anyOf:
             - type: string
@@ -7253,7 +7261,10 @@ components:
         parallel_tool_calls:
           type: boolean
           title: Parallel Tool Calls
-          default: false
+          default: true
+          description: >-
+            (Optional) Whether to allow more than one function tool call generated
+            per turn.
         previous_response_id:
           anyOf:
             - type: string
docs/static/llama-stack-spec.yaml (vendored, 13 changed lines)
@@ -5746,7 +5746,10 @@ components:
         parallel_tool_calls:
           type: boolean
           title: Parallel Tool Calls
-          default: false
+          default: true
+          description: >-
+            (Optional) Whether to allow more than one function tool call generated
+            per turn.
         previous_response_id:
           anyOf:
             - type: string
@@ -6143,9 +6146,12 @@ components:
             - type: 'null'
           title: OpenAIResponsePrompt
         instructions:
+          type: string
           anyOf:
             - type: string
             - type: 'null'
+        parallel_tool_calls:
+          type: boolean
         previous_response_id:
           anyOf:
             - type: string
@@ -6274,7 +6280,10 @@ components:
         parallel_tool_calls:
           type: boolean
           title: Parallel Tool Calls
-          default: false
+          default: true
+          description: >-
+            (Optional) Whether to allow more than one function tool call generated
+            per turn.
         previous_response_id:
           anyOf:
             - type: string
docs/static/stainless-llama-stack-spec.yaml (vendored, 13 changed lines)
@@ -6725,7 +6725,10 @@ components:
         parallel_tool_calls:
           type: boolean
           title: Parallel Tool Calls
-          default: false
+          default: true
+          description: >-
+            (Optional) Whether to allow more than one function tool call generated
+            per turn.
         previous_response_id:
           anyOf:
             - type: string
@@ -7125,6 +7128,9 @@ components:
           anyOf:
             - type: string
             - type: 'null'
+          type: string
+        parallel_tool_calls:
+          type: boolean
         previous_response_id:
           anyOf:
             - type: string
@@ -7253,7 +7259,10 @@ components:
         parallel_tool_calls:
           type: boolean
           title: Parallel Tool Calls
-          default: false
+          default: true
+          description: >-
+            (Optional) Whether to allow more than one function tool call generated
+            per turn.
         previous_response_id:
           anyOf:
             - type: string
@@ -92,6 +92,7 @@ class MetaReferenceAgentsImpl(Agents):
         model: str,
         prompt: OpenAIResponsePrompt | None = None,
         instructions: str | None = None,
+        parallel_tool_calls: bool | None = True,
         previous_response_id: str | None = None,
         conversation: str | None = None,
         store: bool | None = True,
@@ -120,6 +121,7 @@ class MetaReferenceAgentsImpl(Agents):
             include,
             max_infer_iters,
             guardrails,
+            parallel_tool_calls,
             max_tool_calls,
         )
         return result  # type: ignore[no-any-return]
@@ -252,6 +252,7 @@ class OpenAIResponsesImpl:
         include: list[str] | None = None,
         max_infer_iters: int | None = 10,
         guardrails: list[str | ResponseGuardrailSpec] | None = None,
+        parallel_tool_calls: bool | None = None,
         max_tool_calls: int | None = None,
     ):
         stream = bool(stream)
@@ -296,6 +297,7 @@ class OpenAIResponsesImpl:
             tools=tools,
             max_infer_iters=max_infer_iters,
             guardrail_ids=guardrail_ids,
+            parallel_tool_calls=parallel_tool_calls,
             max_tool_calls=max_tool_calls,
         )

@@ -346,6 +348,7 @@ class OpenAIResponsesImpl:
         tools: list[OpenAIResponseInputTool] | None = None,
         max_infer_iters: int | None = 10,
         guardrail_ids: list[str] | None = None,
+        parallel_tool_calls: bool | None = True,
         max_tool_calls: int | None = None,
     ) -> AsyncIterator[OpenAIResponseObjectStream]:
         # These should never be None when called from create_openai_response (which sets defaults)
@@ -385,6 +388,7 @@ class OpenAIResponsesImpl:
             created_at=created_at,
             text=text,
             max_infer_iters=max_infer_iters,
+            parallel_tool_calls=parallel_tool_calls,
             tool_executor=self.tool_executor,
             safety_api=self.safety_api,
             guardrail_ids=guardrail_ids,
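Taken together, the hunks above thread the flag from the public create_openai_response entry point into the internal streaming path and on to the orchestrator. A condensed, runnable sketch of that plumbing, with every unrelated parameter dropped (the simplification is mine, not the repo's):

from dataclasses import dataclass


# Stand-in for StreamingResponseOrchestrator; only the field relevant here is modeled.
@dataclass
class _Orchestrator:
    parallel_tool_calls: bool | None = None


def create_openai_response(parallel_tool_calls: bool | None = None) -> _Orchestrator:
    # Public entry point: accepts the flag (None when the caller omits it) and forwards it.
    return _create_streaming_response(parallel_tool_calls=parallel_tool_calls)


def _create_streaming_response(parallel_tool_calls: bool | None = True) -> _Orchestrator:
    # Internal helper: defaults to True and hands the value to the orchestrator.
    return _Orchestrator(parallel_tool_calls=parallel_tool_calls)


print(create_openai_response(False).parallel_tool_calls)  # False
print(create_openai_response().parallel_tool_calls)       # None (the caller omitted the flag)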
@@ -114,6 +114,7 @@ class StreamingResponseOrchestrator:
         safety_api,
         guardrail_ids: list[str] | None = None,
         prompt: OpenAIResponsePrompt | None = None,
+        parallel_tool_calls: bool | None = None,
         max_tool_calls: int | None = None,
     ):
         self.inference_api = inference_api
@@ -128,6 +129,8 @@ class StreamingResponseOrchestrator:
         self.prompt = prompt
         # System message that is inserted into the model's context
         self.instructions = instructions
+        # Whether to allow more than one function tool call generated per turn.
+        self.parallel_tool_calls = parallel_tool_calls
         # Max number of total calls to built-in tools that can be processed in a response
         self.max_tool_calls = max_tool_calls
         self.sequence_number = 0
@@ -190,6 +193,7 @@ class StreamingResponseOrchestrator:
             usage=self.accumulated_usage,
             instructions=self.instructions,
             prompt=self.prompt,
+            parallel_tool_calls=self.parallel_tool_calls,
             max_tool_calls=self.max_tool_calls,
         )

@@ -301,6 +305,7 @@ class StreamingResponseOrchestrator:
                 completion_result_data,
                 output_messages,
                 next_turn_messages,
+                not self.parallel_tool_calls,
             ):
                 yield stream_event

@@ -897,6 +902,7 @@ class StreamingResponseOrchestrator:
         completion_result_data: ChatCompletionResult,
         output_messages: list[OpenAIResponseOutput],
         next_turn_messages: list,
+        incremental_function_calling: bool,
     ) -> AsyncIterator[OpenAIResponseObjectStream]:
         """Coordinate execution of both function and non-function tool calls."""
         # Execute non-function tool calls
@@ -1020,6 +1026,10 @@ class StreamingResponseOrchestrator:
                     sequence_number=self.sequence_number,
                 )

+            # TODO: Make sure that multi-turn incremental execution works
+            if incremental_function_calling:
+                break
+
     async def _process_new_tools(
         self, tools: list[OpenAIResponseInputTool], output_messages: list[OpenAIResponseOutput]
     ) -> AsyncIterator[OpenAIResponseObjectStream]:
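The orchestrator changes above are where the flag actually changes behavior: not self.parallel_tool_calls is passed down as incremental_function_calling, and when that is set the tool-execution loop stops after the first function call of the turn. A minimal sketch of that control flow (the loop itself is simplified; only the two flag names come from the diff):

def plan_function_calls(tool_calls: list[str], parallel_tool_calls: bool | None) -> list[str]:
    # Mirrors the wiring above: disabling parallel tool calls turns on "incremental" calling.
    incremental_function_calling = not parallel_tool_calls
    executed: list[str] = []
    for call in tool_calls:
        executed.append(call)
        if incremental_function_calling:
            break  # at most one function tool call is executed per turn
    return executed


print(plan_function_calls(["get_weather(NY)", "get_weather(Paris)"], parallel_tool_calls=True))
print(plan_function_calls(["get_weather(NY)", "get_weather(Paris)"], parallel_tool_calls=False))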
@@ -72,6 +72,7 @@ class Agents(Protocol):
         model: str,
         prompt: OpenAIResponsePrompt | None = None,
         instructions: str | None = None,
+        parallel_tool_calls: bool | None = True,
         previous_response_id: str | None = None,
         conversation: str | None = None,
         store: bool | None = True,
@@ -585,7 +585,7 @@ class OpenAIResponseObject(BaseModel):
     :param model: Model identifier used for generation
     :param object: Object type identifier, always "response"
     :param output: List of generated output items (messages, tool calls, etc.)
-    :param parallel_tool_calls: Whether tool calls can be executed in parallel
+    :param parallel_tool_calls: (Optional) Whether to allow more than one function tool call generated per turn.
     :param previous_response_id: (Optional) ID of the previous response in a conversation
     :param prompt: (Optional) Reference to a prompt template and its variables.
     :param status: Current status of the response generation
@@ -605,7 +605,7 @@ class OpenAIResponseObject(BaseModel):
     model: str
     object: Literal["response"] = "response"
     output: Sequence[OpenAIResponseOutput]
-    parallel_tool_calls: bool = False
+    parallel_tool_calls: bool | None = True
     previous_response_id: str | None = None
     prompt: OpenAIResponsePrompt | None = None
     status: str
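The field on OpenAIResponseObject thus changes from a required bool defaulting to False to an optional bool defaulting to True, which lines up with the upstream OpenAI Responses API default of allowing parallel tool calls while still letting internal callers pass None when the flag was never supplied.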
@@ -682,3 +682,96 @@ def test_max_tool_calls_with_builtin_tools(openai_client, client_with_models, text_model_id):

     # Verify we have a valid max_tool_calls field
     assert response_3.max_tool_calls == max_tool_calls[1]
+
+
+@pytest.mark.skip(reason="Tool calling is not reliable.")
+def test_parallel_tool_calls_true(openai_client, client_with_models, text_model_id):
+    """Test handling of parallel_tool_calls with function tools in responses."""
+    if isinstance(client_with_models, LlamaStackAsLibraryClient):
+        pytest.skip("OpenAI responses are not supported when testing with library client yet.")
+
+    client = openai_client
+    parallel_tool_calls = True
+
+    tools = [
+        {
+            "type": "function",
+            "name": "get_weather",
+            "description": "Get weather information for a specified location",
+            "parameters": {
+                "type": "object",
+                "properties": {
+                    "location": {
+                        "type": "string",
+                        "description": "The city name (e.g., 'New York', 'London')",
+                    },
+                },
+            },
+        }
+    ]
+
+    # First create a response that triggers function tools
+    response = client.responses.create(
+        model=text_model_id,
+        input="Get the weather in New York and in Paris",
+        tools=tools,
+        stream=False,
+        parallel_tool_calls=parallel_tool_calls,
+    )
+
+    # Verify we got two function calls, since parallel tool calls are allowed
+    assert len(response.output) == 2
+    assert response.output[0].type == "function_call"
+    assert response.output[0].name == "get_weather"
+    assert response.output[0].status == "completed"
+    assert response.output[1].type == "function_call"
+    assert response.output[1].name == "get_weather"
+    assert response.output[1].status == "completed"
+
+    # Verify the parallel_tool_calls field is echoed back on the response
+    assert response.parallel_tool_calls == parallel_tool_calls
+
+
+@pytest.mark.skip(reason="Tool calling is not reliable.")
+def test_parallel_tool_calls_false(openai_client, client_with_models, text_model_id):
+    """Test handling of parallel_tool_calls with function tools in responses."""
+    if isinstance(client_with_models, LlamaStackAsLibraryClient):
+        pytest.skip("OpenAI responses are not supported when testing with library client yet.")
+
+    client = openai_client
+    parallel_tool_calls = False
+
+    tools = [
+        {
+            "type": "function",
+            "name": "get_weather",
+            "description": "Get weather information for a specified location",
+            "parameters": {
+                "type": "object",
+                "properties": {
+                    "location": {
+                        "type": "string",
+                        "description": "The city name (e.g., 'New York', 'London')",
+                    },
+                },
+            },
+        }
+    ]
+
+    # First create a response that triggers function tools
+    response = client.responses.create(
+        model=text_model_id,
+        input="Get the weather in New York and in Paris",
+        tools=tools,
+        stream=False,
+        parallel_tool_calls=parallel_tool_calls,
+    )
+
+    # Verify we got a single function call, since parallel tool calls are disabled
+    assert len(response.output) == 1
+    assert response.output[0].type == "function_call"
+    assert response.output[0].name == "get_weather"
+    assert response.output[0].status == "completed"
+
+    # Verify the parallel_tool_calls field is echoed back on the response
+    assert response.parallel_tool_calls == parallel_tool_calls