From 7a9b7ecdc2b41eb91e5da4dc3c6aa61f35fbfe78 Mon Sep 17 00:00:00 2001
From: Anastas Stoyanovsky
Date: Tue, 11 Nov 2025 08:54:02 -0500
Subject: [PATCH] Wire through parallel_tool_calls to Responses API

Signed-off-by: Anastas Stoyanovsky
---
 client-sdks/stainless/openapi.yml                  | 39 +++++---
 docs/static/llama-stack-spec.yaml                  | 13 ++-
 docs/static/stainless-llama-stack-spec.yaml        | 13 ++-
 .../inline/agents/meta_reference/agents.py         |  2 +
 .../responses/openai_responses.py                  |  4 +
 .../meta_reference/responses/streaming.py          | 10 ++
 src/llama_stack_api/agents.py                      |  1 +
 src/llama_stack_api/openai_responses.py            |  4 +-
 .../agents/test_openai_responses.py                | 93 +++++++++++++++++++
 9 files changed, 159 insertions(+), 20 deletions(-)

diff --git a/client-sdks/stainless/openapi.yml b/client-sdks/stainless/openapi.yml
index 3a6735cbc..6ddae1ce3 100644
--- a/client-sdks/stainless/openapi.yml
+++ b/client-sdks/stainless/openapi.yml
@@ -6723,9 +6723,12 @@ components:
         type: array
         title: Output
       parallel_tool_calls:
-        type: boolean
         title: Parallel Tool Calls
-        default: false
+        type: boolean
+        default: true
+        description: >-
+          (Optional) Whether to allow more than one function tool call generated
+          per turn.
       previous_response_id:
         anyOf:
           - type: string
           - type: 'null'
@@ -6835,12 +6838,14 @@
         title: Input
       type: object
       required:
-        - created_at
-        - id
-        - model
-        - output
-        - status
-        - input
+      - created_at
+      - id
+      - model
+      - object
+      - output
+      - status
+      - text
+      - input
       title: OpenAIResponseObjectWithInput
       description: OpenAI response object extended with input context information.
     OpenAIResponseOutput:
@@ -7122,9 +7127,12 @@
           - type: 'null'
         title: OpenAIResponsePrompt
       instructions:
+        type: string
         anyOf:
           - type: string
           - type: 'null'
+      parallel_tool_calls:
+        type: boolean
       previous_response_id:
         anyOf:
           - type: string
@@ -7253,7 +7261,10 @@
       parallel_tool_calls:
         type: boolean
         title: Parallel Tool Calls
-        default: false
+        default: true
+        description: >-
+          (Optional) Whether to allow more than one function tool call generated
+          per turn.
       previous_response_id:
         anyOf:
           - type: string
@@ -7325,11 +7336,11 @@
           - type: 'null'
       type: object
       required:
-        - created_at
-        - id
-        - model
-        - output
-        - status
+      - created_at
+      - id
+      - model
+      - output
+      - status
       title: OpenAIResponseObject
       description: Complete OpenAI response object containing generation results and metadata.
     OpenAIResponseContentPartOutputText:
diff --git a/docs/static/llama-stack-spec.yaml b/docs/static/llama-stack-spec.yaml
index a12ac342f..f9aef99a6 100644
--- a/docs/static/llama-stack-spec.yaml
+++ b/docs/static/llama-stack-spec.yaml
@@ -5746,7 +5746,10 @@ components:
       parallel_tool_calls:
         type: boolean
         title: Parallel Tool Calls
-        default: false
+        default: true
+        description: >-
+          (Optional) Whether to allow more than one function tool call generated
+          per turn.
       previous_response_id:
         anyOf:
           - type: string
@@ -6143,9 +6146,12 @@
           - type: 'null'
         title: OpenAIResponsePrompt
       instructions:
+        type: string
         anyOf:
           - type: string
           - type: 'null'
+      parallel_tool_calls:
+        type: boolean
       previous_response_id:
         anyOf:
           - type: string
@@ -6274,7 +6280,10 @@
       parallel_tool_calls:
         type: boolean
         title: Parallel Tool Calls
-        default: false
+        default: true
+        description: >-
+          (Optional) Whether to allow more than one function tool call generated
+          per turn.
       previous_response_id:
         anyOf:
           - type: string
diff --git a/docs/static/stainless-llama-stack-spec.yaml b/docs/static/stainless-llama-stack-spec.yaml
index 3a6735cbc..a632844b2 100644
--- a/docs/static/stainless-llama-stack-spec.yaml
+++ b/docs/static/stainless-llama-stack-spec.yaml
@@ -6725,7 +6725,10 @@ components:
       parallel_tool_calls:
         type: boolean
         title: Parallel Tool Calls
-        default: false
+        default: true
+        description: >-
+          (Optional) Whether to allow more than one function tool call generated
+          per turn.
       previous_response_id:
         anyOf:
           - type: string
@@ -7125,6 +7128,9 @@
         anyOf:
           - type: string
           - type: 'null'
+        type: string
+      parallel_tool_calls:
+        type: boolean
       previous_response_id:
         anyOf:
           - type: string
@@ -7253,7 +7259,10 @@
       parallel_tool_calls:
         type: boolean
         title: Parallel Tool Calls
-        default: false
+        default: true
+        description: >-
+          (Optional) Whether to allow more than one function tool call generated
+          per turn.
       previous_response_id:
         anyOf:
           - type: string
diff --git a/src/llama_stack/providers/inline/agents/meta_reference/agents.py b/src/llama_stack/providers/inline/agents/meta_reference/agents.py
index 347f6fdb1..e47e757be 100644
--- a/src/llama_stack/providers/inline/agents/meta_reference/agents.py
+++ b/src/llama_stack/providers/inline/agents/meta_reference/agents.py
@@ -92,6 +92,7 @@ class MetaReferenceAgentsImpl(Agents):
         model: str,
         prompt: OpenAIResponsePrompt | None = None,
         instructions: str | None = None,
+        parallel_tool_calls: bool | None = True,
         previous_response_id: str | None = None,
         conversation: str | None = None,
         store: bool | None = True,
@@ -120,6 +121,7 @@ class MetaReferenceAgentsImpl(Agents):
             include,
             max_infer_iters,
             guardrails,
+            parallel_tool_calls,
             max_tool_calls,
         )
         return result  # type: ignore[no-any-return]
diff --git a/src/llama_stack/providers/inline/agents/meta_reference/responses/openai_responses.py b/src/llama_stack/providers/inline/agents/meta_reference/responses/openai_responses.py
index cb0fe284e..7e080a675 100644
--- a/src/llama_stack/providers/inline/agents/meta_reference/responses/openai_responses.py
+++ b/src/llama_stack/providers/inline/agents/meta_reference/responses/openai_responses.py
@@ -252,6 +252,7 @@ class OpenAIResponsesImpl:
         include: list[str] | None = None,
         max_infer_iters: int | None = 10,
         guardrails: list[str | ResponseGuardrailSpec] | None = None,
+        parallel_tool_calls: bool | None = None,
         max_tool_calls: int | None = None,
     ):
         stream = bool(stream)
@@ -296,6 +297,7 @@ class OpenAIResponsesImpl:
             tools=tools,
             max_infer_iters=max_infer_iters,
             guardrail_ids=guardrail_ids,
+            parallel_tool_calls=parallel_tool_calls,
             max_tool_calls=max_tool_calls,
         )
@@ -346,6 +348,7 @@ class OpenAIResponsesImpl:
         tools: list[OpenAIResponseInputTool] | None = None,
         max_infer_iters: int | None = 10,
         guardrail_ids: list[str] | None = None,
+        parallel_tool_calls: bool | None = True,
         max_tool_calls: int | None = None,
     ) -> AsyncIterator[OpenAIResponseObjectStream]:
         # These should never be None when called from create_openai_response (which sets defaults)
@@ -385,6 +388,7 @@ class OpenAIResponsesImpl:
             created_at=created_at,
             text=text,
             max_infer_iters=max_infer_iters,
+            parallel_tool_calls=parallel_tool_calls,
             tool_executor=self.tool_executor,
             safety_api=self.safety_api,
             guardrail_ids=guardrail_ids,
diff --git a/src/llama_stack/providers/inline/agents/meta_reference/responses/streaming.py b/src/llama_stack/providers/inline/agents/meta_reference/responses/streaming.py
index 95c690147..185d849bc 100644
--- a/src/llama_stack/providers/inline/agents/meta_reference/responses/streaming.py
+++ b/src/llama_stack/providers/inline/agents/meta_reference/responses/streaming.py
@@ -114,6 +114,7 @@ class StreamingResponseOrchestrator:
         safety_api,
         guardrail_ids: list[str] | None = None,
         prompt: OpenAIResponsePrompt | None = None,
+        parallel_tool_calls: bool | None = None,
         max_tool_calls: int | None = None,
     ):
         self.inference_api = inference_api
@@ -128,6 +129,8 @@ class StreamingResponseOrchestrator:
         self.prompt = prompt
         # System message that is inserted into the model's context
         self.instructions = instructions
+        # Whether to allow more than one function tool call generated per turn.
+        self.parallel_tool_calls = parallel_tool_calls
         # Max number of total calls to built-in tools that can be processed in a response
         self.max_tool_calls = max_tool_calls
         self.sequence_number = 0
@@ -190,6 +193,7 @@ class StreamingResponseOrchestrator:
             usage=self.accumulated_usage,
             instructions=self.instructions,
             prompt=self.prompt,
+            parallel_tool_calls=self.parallel_tool_calls,
             max_tool_calls=self.max_tool_calls,
         )
@@ -301,6 +305,7 @@ class StreamingResponseOrchestrator:
                 completion_result_data,
                 output_messages,
                 next_turn_messages,
+                not self.parallel_tool_calls,
             ):
                 yield stream_event
@@ -897,6 +902,7 @@ class StreamingResponseOrchestrator:
         completion_result_data: ChatCompletionResult,
         output_messages: list[OpenAIResponseOutput],
         next_turn_messages: list,
+        incremental_function_calling: bool,
     ) -> AsyncIterator[OpenAIResponseObjectStream]:
         """Coordinate execution of both function and non-function tool calls."""
         # Execute non-function tool calls
@@ -1020,6 +1026,10 @@
                 sequence_number=self.sequence_number,
             )
 
+            # TODO: Make sure that multi-turn incremental execution works
+            if incremental_function_calling:
+                break
+
     async def _process_new_tools(
         self, tools: list[OpenAIResponseInputTool], output_messages: list[OpenAIResponseOutput]
     ) -> AsyncIterator[OpenAIResponseObjectStream]:
diff --git a/src/llama_stack_api/agents.py b/src/llama_stack_api/agents.py
index ca0611746..9b767608a 100644
--- a/src/llama_stack_api/agents.py
+++ b/src/llama_stack_api/agents.py
@@ -72,6 +72,7 @@ class Agents(Protocol):
         model: str,
         prompt: OpenAIResponsePrompt | None = None,
         instructions: str | None = None,
+        parallel_tool_calls: bool | None = True,
         previous_response_id: str | None = None,
         conversation: str | None = None,
         store: bool | None = True,
diff --git a/src/llama_stack_api/openai_responses.py b/src/llama_stack_api/openai_responses.py
index 952418f1c..e20004487 100644
--- a/src/llama_stack_api/openai_responses.py
+++ b/src/llama_stack_api/openai_responses.py
@@ -585,7 +585,7 @@ class OpenAIResponseObject(BaseModel):
     :param model: Model identifier used for generation
     :param object: Object type identifier, always "response"
     :param output: List of generated output items (messages, tool calls, etc.)
-    :param parallel_tool_calls: Whether tool calls can be executed in parallel
+    :param parallel_tool_calls: (Optional) Whether to allow more than one function tool call generated per turn.
     :param previous_response_id: (Optional) ID of the previous response in a conversation
     :param prompt: (Optional) Reference to a prompt template and its variables.
    :param status: Current status of the response generation
@@ -605,7 +605,7 @@ class OpenAIResponseObject(BaseModel):
     model: str
     object: Literal["response"] = "response"
     output: Sequence[OpenAIResponseOutput]
-    parallel_tool_calls: bool = False
+    parallel_tool_calls: bool | None = True
     previous_response_id: str | None = None
     prompt: OpenAIResponsePrompt | None = None
     status: str
diff --git a/tests/integration/agents/test_openai_responses.py b/tests/integration/agents/test_openai_responses.py
index 057cee774..d98880c4a 100644
--- a/tests/integration/agents/test_openai_responses.py
+++ b/tests/integration/agents/test_openai_responses.py
@@ -682,3 +682,96 @@ def test_max_tool_calls_with_builtin_tools(openai_client, client_with_models, te
 
     # Verify we have a valid max_tool_calls field
     assert response_3.max_tool_calls == max_tool_calls[1]
+
+
+@pytest.mark.skip(reason="Tool calling is not reliable.")
+def test_parallel_tool_calls_true(openai_client, client_with_models, text_model_id):
+    """Test that parallel_tool_calls=True allows multiple function tool calls in a single turn."""
+    if isinstance(client_with_models, LlamaStackAsLibraryClient):
+        pytest.skip("OpenAI responses are not supported when testing with library client yet.")
+
+    client = openai_client
+    parallel_tool_calls = True
+
+    tools = [
+        {
+            "type": "function",
+            "name": "get_weather",
+            "description": "Get weather information for a specified location",
+            "parameters": {
+                "type": "object",
+                "properties": {
+                    "location": {
+                        "type": "string",
+                        "description": "The city name (e.g., 'New York', 'London')",
+                    },
+                },
+            },
+        }
+    ]
+
+    # Create a response that should trigger two function tool calls
+    response = client.responses.create(
+        model=text_model_id,
+        input="Get the weather in New York and in Paris",
+        tools=tools,
+        stream=False,
+        parallel_tool_calls=parallel_tool_calls,
+    )
+
+    # Verify we got two function calls in the same turn
+    assert len(response.output) == 2
+    assert response.output[0].type == "function_call"
+    assert response.output[0].name == "get_weather"
+    assert response.output[0].status == "completed"
+    assert response.output[1].type == "function_call"
+    assert response.output[1].name == "get_weather"
+    assert response.output[1].status == "completed"
+
+    # Verify the parallel_tool_calls field is echoed back on the response
+    assert response.parallel_tool_calls == parallel_tool_calls
+
+
+@pytest.mark.skip(reason="Tool calling is not reliable.")
+def test_parallel_tool_calls_false(openai_client, client_with_models, text_model_id):
+    """Test that parallel_tool_calls=False limits each turn to a single function tool call."""
+    if isinstance(client_with_models, LlamaStackAsLibraryClient):
+        pytest.skip("OpenAI responses are not supported when testing with library client yet.")
+
+    client = openai_client
+    parallel_tool_calls = False
+
+    tools = [
+        {
+            "type": "function",
+            "name": "get_weather",
+            "description": "Get weather information for a specified location",
+            "parameters": {
+                "type": "object",
+                "properties": {
+                    "location": {
+                        "type": "string",
+                        "description": "The city name (e.g., 'New York', 'London')",
+                    },
+                },
+            },
+        }
+    ]
+
+    # Create a response that would otherwise trigger two function tool calls
+    response = client.responses.create(
+        model=text_model_id,
+        input="Get the weather in New York and in Paris",
+        tools=tools,
+        stream=False,
+        parallel_tool_calls=parallel_tool_calls,
+    )
+
+    # Verify only a single function call is generated in the turn
+    assert len(response.output) == 1
+    assert response.output[0].type == "function_call"
+    assert response.output[0].name == "get_weather"
+    assert response.output[0].status == "completed"
+
+    # Verify the parallel_tool_calls field is echoed back on the response
+    assert response.parallel_tool_calls == parallel_tool_calls