From 7a9b7ecdc2b41eb91e5da4dc3c6aa61f35fbfe78 Mon Sep 17 00:00:00 2001
From: Anastas Stoyanovsky
Date: Tue, 11 Nov 2025 08:54:02 -0500
Subject: [PATCH] Wire through parallel_tool_calls to Responses API

Signed-off-by: Anastas Stoyanovsky
---
 client-sdks/stainless/openapi.yml                  | 39 +++++---
 docs/static/llama-stack-spec.yaml                  | 13 ++-
 docs/static/stainless-llama-stack-spec.yaml        | 13 ++-
 .../inline/agents/meta_reference/agents.py         |  2 +
 .../responses/openai_responses.py                  |  4 +
 .../meta_reference/responses/streaming.py          | 10 ++
 src/llama_stack_api/agents.py                      |  1 +
 src/llama_stack_api/openai_responses.py            |  4 +-
 .../agents/test_openai_responses.py                | 93 +++++++++++++++++++
 9 files changed, 159 insertions(+), 20 deletions(-)

diff --git a/client-sdks/stainless/openapi.yml b/client-sdks/stainless/openapi.yml
index 3a6735cbc..6ddae1ce3 100644
--- a/client-sdks/stainless/openapi.yml
+++ b/client-sdks/stainless/openapi.yml
@@ -6723,9 +6723,12 @@ components:
         type: array
         title: Output
       parallel_tool_calls:
-        type: boolean
         title: Parallel Tool Calls
-        default: false
+        type: boolean
+        default: true
+        description: >-
+          (Optional) Whether to allow more than one function tool call generated
+          per turn.
       previous_response_id:
         anyOf:
           - type: string
           - type: 'null'
@@ -6835,12 +6838,14 @@
         title: Input
       type: object
       required:
-        - created_at
-        - id
-        - model
-        - output
-        - status
-        - input
+      - created_at
+      - id
+      - model
+      - object
+      - output
+      - status
+      - text
+      - input
       title: OpenAIResponseObjectWithInput
       description: OpenAI response object extended with input context information.
     OpenAIResponseOutput:
@@ -7122,9 +7127,12 @@
           - type: 'null'
         title: OpenAIResponsePrompt
       instructions:
+        type: string
         anyOf:
           - type: string
           - type: 'null'
+      parallel_tool_calls:
+        type: boolean
       previous_response_id:
         anyOf:
           - type: string
@@ -7253,7 +7261,10 @@
       parallel_tool_calls:
         type: boolean
         title: Parallel Tool Calls
-        default: false
+        default: true
+        description: >-
+          (Optional) Whether to allow more than one function tool call generated
+          per turn.
       previous_response_id:
         anyOf:
           - type: string
@@ -7325,11 +7336,11 @@
           - type: 'null'
       type: object
       required:
-        - created_at
-        - id
-        - model
-        - output
-        - status
+      - created_at
+      - id
+      - model
+      - output
+      - status
       title: OpenAIResponseObject
       description: Complete OpenAI response object containing generation results and metadata.
     OpenAIResponseContentPartOutputText:
diff --git a/docs/static/llama-stack-spec.yaml b/docs/static/llama-stack-spec.yaml
index a12ac342f..f9aef99a6 100644
--- a/docs/static/llama-stack-spec.yaml
+++ b/docs/static/llama-stack-spec.yaml
@@ -5746,7 +5746,10 @@ components:
       parallel_tool_calls:
         type: boolean
         title: Parallel Tool Calls
-        default: false
+        default: true
+        description: >-
+          (Optional) Whether to allow more than one function tool call generated
+          per turn.
       previous_response_id:
         anyOf:
           - type: string
@@ -6143,9 +6146,12 @@
           - type: 'null'
         title: OpenAIResponsePrompt
       instructions:
+        type: string
         anyOf:
           - type: string
           - type: 'null'
+      parallel_tool_calls:
+        type: boolean
       previous_response_id:
         anyOf:
           - type: string
@@ -6274,7 +6280,10 @@
       parallel_tool_calls:
         type: boolean
         title: Parallel Tool Calls
-        default: false
+        default: true
+        description: >-
+          (Optional) Whether to allow more than one function tool call generated
+          per turn.
       previous_response_id:
         anyOf:
           - type: string
diff --git a/docs/static/stainless-llama-stack-spec.yaml b/docs/static/stainless-llama-stack-spec.yaml
index 3a6735cbc..a632844b2 100644
--- a/docs/static/stainless-llama-stack-spec.yaml
+++ b/docs/static/stainless-llama-stack-spec.yaml
@@ -6725,7 +6725,10 @@ components:
       parallel_tool_calls:
         type: boolean
         title: Parallel Tool Calls
-        default: false
+        default: true
+        description: >-
+          (Optional) Whether to allow more than one function tool call generated
+          per turn.
       previous_response_id:
         anyOf:
           - type: string
@@ -7125,6 +7128,9 @@
         anyOf:
           - type: string
           - type: 'null'
+        type: string
+      parallel_tool_calls:
+        type: boolean
       previous_response_id:
         anyOf:
           - type: string
@@ -7253,7 +7259,10 @@
       parallel_tool_calls:
         type: boolean
         title: Parallel Tool Calls
-        default: false
+        default: true
+        description: >-
+          (Optional) Whether to allow more than one function tool call generated
+          per turn.
       previous_response_id:
         anyOf:
           - type: string
diff --git a/src/llama_stack/providers/inline/agents/meta_reference/agents.py b/src/llama_stack/providers/inline/agents/meta_reference/agents.py
index 347f6fdb1..e47e757be 100644
--- a/src/llama_stack/providers/inline/agents/meta_reference/agents.py
+++ b/src/llama_stack/providers/inline/agents/meta_reference/agents.py
@@ -92,6 +92,7 @@ class MetaReferenceAgentsImpl(Agents):
         model: str,
         prompt: OpenAIResponsePrompt | None = None,
         instructions: str | None = None,
+        parallel_tool_calls: bool | None = True,
         previous_response_id: str | None = None,
         conversation: str | None = None,
         store: bool | None = True,
@@ -120,6 +121,7 @@ class MetaReferenceAgentsImpl(Agents):
             include,
             max_infer_iters,
             guardrails,
+            parallel_tool_calls,
             max_tool_calls,
         )
         return result  # type: ignore[no-any-return]
diff --git a/src/llama_stack/providers/inline/agents/meta_reference/responses/openai_responses.py b/src/llama_stack/providers/inline/agents/meta_reference/responses/openai_responses.py
index cb0fe284e..7e080a675 100644
--- a/src/llama_stack/providers/inline/agents/meta_reference/responses/openai_responses.py
+++ b/src/llama_stack/providers/inline/agents/meta_reference/responses/openai_responses.py
@@ -252,6 +252,7 @@ class OpenAIResponsesImpl:
         include: list[str] | None = None,
         max_infer_iters: int | None = 10,
         guardrails: list[str | ResponseGuardrailSpec] | None = None,
+        parallel_tool_calls: bool | None = None,
         max_tool_calls: int | None = None,
     ):
         stream = bool(stream)
@@ -296,6 +297,7 @@ class OpenAIResponsesImpl:
             tools=tools,
             max_infer_iters=max_infer_iters,
             guardrail_ids=guardrail_ids,
+            parallel_tool_calls=parallel_tool_calls,
             max_tool_calls=max_tool_calls,
         )
@@ -346,6 +348,7 @@ class OpenAIResponsesImpl:
         tools: list[OpenAIResponseInputTool] | None = None,
         max_infer_iters: int | None = 10,
         guardrail_ids: list[str] | None = None,
+        parallel_tool_calls: bool | None = True,
         max_tool_calls: int | None = None,
     ) -> AsyncIterator[OpenAIResponseObjectStream]:
         # These should never be None when called from create_openai_response (which sets defaults)
@@ -385,6 +388,7 @@ class OpenAIResponsesImpl:
             created_at=created_at,
             text=text,
             max_infer_iters=max_infer_iters,
+            parallel_tool_calls=parallel_tool_calls,
             tool_executor=self.tool_executor,
             safety_api=self.safety_api,
             guardrail_ids=guardrail_ids,
diff --git a/src/llama_stack/providers/inline/agents/meta_reference/responses/streaming.py b/src/llama_stack/providers/inline/agents/meta_reference/responses/streaming.py
index 95c690147..185d849bc 100644
--- a/src/llama_stack/providers/inline/agents/meta_reference/responses/streaming.py
+++ b/src/llama_stack/providers/inline/agents/meta_reference/responses/streaming.py
@@ -114,6 +114,7 @@ class StreamingResponseOrchestrator:
         safety_api,
         guardrail_ids: list[str] | None = None,
         prompt: OpenAIResponsePrompt | None = None,
+        parallel_tool_calls: bool | None = None,
         max_tool_calls: int | None = None,
     ):
         self.inference_api = inference_api
@@ -128,6 +129,8 @@ class StreamingResponseOrchestrator:
         self.prompt = prompt
         # System message that is inserted into the model's context
         self.instructions = instructions
+        # Whether to allow more than one function tool call generated per turn.
+        self.parallel_tool_calls = parallel_tool_calls
         # Max number of total calls to built-in tools that can be processed in a response
         self.max_tool_calls = max_tool_calls
         self.sequence_number = 0
@@ -190,6 +193,7 @@ class StreamingResponseOrchestrator:
             usage=self.accumulated_usage,
             instructions=self.instructions,
             prompt=self.prompt,
+            parallel_tool_calls=self.parallel_tool_calls,
             max_tool_calls=self.max_tool_calls,
         )
@@ -301,6 +305,7 @@ class StreamingResponseOrchestrator:
                 completion_result_data,
                 output_messages,
                 next_turn_messages,
+                not self.parallel_tool_calls,
             ):
                 yield stream_event
@@ -897,6 +902,7 @@ class StreamingResponseOrchestrator:
         completion_result_data: ChatCompletionResult,
         output_messages: list[OpenAIResponseOutput],
         next_turn_messages: list,
+        incremental_function_calling: bool,
     ) -> AsyncIterator[OpenAIResponseObjectStream]:
         """Coordinate execution of both function and non-function tool calls."""
         # Execute non-function tool calls
@@ -1020,6 +1026,10 @@
                 sequence_number=self.sequence_number,
             )
 
+            # TODO: Make sure that multi-turn incremental execution works
+            if incremental_function_calling:
+                break
+
     async def _process_new_tools(
         self, tools: list[OpenAIResponseInputTool], output_messages: list[OpenAIResponseOutput]
     ) -> AsyncIterator[OpenAIResponseObjectStream]:
diff --git a/src/llama_stack_api/agents.py b/src/llama_stack_api/agents.py
index ca0611746..9b767608a 100644
--- a/src/llama_stack_api/agents.py
+++ b/src/llama_stack_api/agents.py
@@ -72,6 +72,7 @@ class Agents(Protocol):
         model: str,
         prompt: OpenAIResponsePrompt | None = None,
         instructions: str | None = None,
+        parallel_tool_calls: bool | None = True,
         previous_response_id: str | None = None,
         conversation: str | None = None,
         store: bool | None = True,
diff --git a/src/llama_stack_api/openai_responses.py b/src/llama_stack_api/openai_responses.py
index 952418f1c..e20004487 100644
--- a/src/llama_stack_api/openai_responses.py
+++ b/src/llama_stack_api/openai_responses.py
@@ -585,7 +585,7 @@ class OpenAIResponseObject(BaseModel):
     :param model: Model identifier used for generation
     :param object: Object type identifier, always "response"
     :param output: List of generated output items (messages, tool calls, etc.)
-    :param parallel_tool_calls: Whether tool calls can be executed in parallel
+    :param parallel_tool_calls: (Optional) Whether to allow more than one function tool call generated per turn.
     :param previous_response_id: (Optional) ID of the previous response in a conversation
     :param prompt: (Optional) Reference to a prompt template and its variables.
    :param status: Current status of the response generation
@@ -605,7 +605,7 @@ class OpenAIResponseObject(BaseModel):
     model: str
     object: Literal["response"] = "response"
     output: Sequence[OpenAIResponseOutput]
-    parallel_tool_calls: bool = False
+    parallel_tool_calls: bool | None = True
     previous_response_id: str | None = None
     prompt: OpenAIResponsePrompt | None = None
     status: str
diff --git a/tests/integration/agents/test_openai_responses.py b/tests/integration/agents/test_openai_responses.py
index 057cee774..d98880c4a 100644
--- a/tests/integration/agents/test_openai_responses.py
+++ b/tests/integration/agents/test_openai_responses.py
@@ -682,3 +682,96 @@ def test_max_tool_calls_with_builtin_tools(openai_client, client_with_models, te
 
     # Verify we have a valid max_tool_calls field
     assert response_3.max_tool_calls == max_tool_calls[1]
+
+
+@pytest.mark.skip(reason="Tool calling is not reliable.")
+def test_parallel_tool_calls_true(openai_client, client_with_models, text_model_id):
+    """Test that parallel_tool_calls=True allows multiple function tool calls in a single turn."""
+    if isinstance(client_with_models, LlamaStackAsLibraryClient):
+        pytest.skip("OpenAI responses are not supported when testing with library client yet.")
+
+    client = openai_client
+    parallel_tool_calls = True
+
+    tools = [
+        {
+            "type": "function",
+            "name": "get_weather",
+            "description": "Get weather information for a specified location",
+            "parameters": {
+                "type": "object",
+                "properties": {
+                    "location": {
+                        "type": "string",
+                        "description": "The city name (e.g., 'New York', 'London')",
+                    },
+                },
+            },
+        }
+    ]
+
+    # Create a response that should trigger two function tool calls
+    response = client.responses.create(
+        model=text_model_id,
+        input="Get the weather in New York and in Paris",
+        tools=tools,
+        stream=False,
+        parallel_tool_calls=parallel_tool_calls,
+    )
+
+    # Verify we got two function calls in the same turn
+    assert len(response.output) == 2
+    assert response.output[0].type == "function_call"
+    assert response.output[0].name == "get_weather"
+    assert response.output[0].status == "completed"
+    assert response.output[1].type == "function_call"
+    assert response.output[1].name == "get_weather"
+    assert response.output[1].status == "completed"
+
+    # Verify the parallel_tool_calls field is echoed back on the response
+    assert response.parallel_tool_calls == parallel_tool_calls
+
+
+@pytest.mark.skip(reason="Tool calling is not reliable.")
+def test_parallel_tool_calls_false(openai_client, client_with_models, text_model_id):
+    """Test that parallel_tool_calls=False limits each turn to a single function tool call."""
+    if isinstance(client_with_models, LlamaStackAsLibraryClient):
+        pytest.skip("OpenAI responses are not supported when testing with library client yet.")
+
+    client = openai_client
+    parallel_tool_calls = False
+
+    tools = [
+        {
+            "type": "function",
+            "name": "get_weather",
+            "description": "Get weather information for a specified location",
+            "parameters": {
+                "type": "object",
+                "properties": {
+                    "location": {
+                        "type": "string",
+                        "description": "The city name (e.g., 'New York', 'London')",
+                    },
+                },
+            },
+        }
+    ]
+
+    # Create a response that would otherwise trigger two function tool calls
+    response = client.responses.create(
+        model=text_model_id,
+        input="Get the weather in New York and in Paris",
+        tools=tools,
+        stream=False,
+        parallel_tool_calls=parallel_tool_calls,
+    )
+
+    # Verify only a single function call is generated in the turn
+    assert len(response.output) == 1
+    assert response.output[0].type == "function_call"
+    assert response.output[0].name == "get_weather"
+    assert response.output[0].status == "completed"
+
+    # Verify the parallel_tool_calls field is echoed back on the response
+    assert response.parallel_tool_calls == parallel_tool_calls