From a3580e6bc012535a43e0b08bfae4f6e6563a4bbd Mon Sep 17 00:00:00 2001
From: Anastas Stoyanovsky
Date: Tue, 18 Nov 2025 14:25:08 -0500
Subject: [PATCH] feat!: Wire through parallel_tool_calls to Responses API
 (#4124)

# What does this PR do?
Initial PR against #4123

Adds the `parallel_tool_calls` parameter to the Responses API spec, plus a
basic initial implementation that generates no more than one function call
when the parameter is set to `False`.

## Test Plan
* Unit tests have been added to verify that no more than one function call
  is generated when `parallel_tool_calls` is set to `False`.
* A follow-up PR will verify that `parallel_tool_calls` is passed through
  to providers.
* A follow-up PR will address verification and/or implementation of
  incremental function calling across multiple conversational turns.
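As a usage sketch (not part of this diff), the expected behavior via the
OpenAI-compatible Responses endpoint. The base URL, model id, and tool
definition below are illustrative assumptions, not values taken from this
patch:

```python
from openai import OpenAI

# Hypothetical Llama Stack server URL; adjust to your deployment.
client = OpenAI(base_url="http://localhost:8321/v1", api_key="none")

response = client.responses.create(
    model="llama3.2:3b",  # hypothetical model id
    input="What is the weather in Paris and in Tokyo?",
    tools=[
        {
            "type": "function",
            "name": "get_weather",
            "description": "Get the current weather for a city",
            "parameters": {
                "type": "object",
                "properties": {"city": {"type": "string"}},
                "required": ["city"],
            },
        }
    ],
    # With parallel tool calls disabled, at most one function call
    # should be generated in this turn.
    parallel_tool_calls=False,
)

function_calls = [item for item in response.output if item.type == "function_call"]
assert len(function_calls) <= 1
```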
---------

Signed-off-by: Anastas Stoyanovsky
---
 client-sdks/stainless/openapi.yml             | 19 +++++++++++++------
 docs/static/deprecated-llama-stack-spec.yaml  | 19 +++++++++++++------
 .../static/experimental-llama-stack-spec.yaml | 14 ++++++++------
 docs/static/llama-stack-spec.yaml             | 19 +++++++++++++------
 docs/static/stainless-llama-stack-spec.yaml   | 19 +++++++++++++------
 .../inline/agents/meta_reference/agents.py    |  2 ++
 .../responses/openai_responses.py             |  4 ++++
 .../meta_reference/responses/streaming.py     |  4 ++++
 src/llama_stack_api/agents.py                 |  1 +
 src/llama_stack_api/openai_responses.py       |  4 ++--
 10 files changed, 73 insertions(+), 32 deletions(-)

diff --git a/client-sdks/stainless/openapi.yml b/client-sdks/stainless/openapi.yml
index 3a6735cbc..a6ebc868c 100644
--- a/client-sdks/stainless/openapi.yml
+++ b/client-sdks/stainless/openapi.yml
@@ -6723,9 +6723,10 @@ components:
       type: array
       title: Output
     parallel_tool_calls:
-      type: boolean
-      title: Parallel Tool Calls
-      default: false
+      anyOf:
+      - type: boolean
+      - type: 'null'
+      default: true
     previous_response_id:
       anyOf:
       - type: string
@@ -7125,6 +7126,11 @@ components:
       anyOf:
       - type: string
       - type: 'null'
+    parallel_tool_calls:
+      anyOf:
+      - type: boolean
+      - type: 'null'
+      default: true
     previous_response_id:
       anyOf:
       - type: string
@@ -7251,9 +7257,10 @@ components:
       type: array
       title: Output
     parallel_tool_calls:
-      type: boolean
-      title: Parallel Tool Calls
-      default: false
+      anyOf:
+      - type: boolean
+      - type: 'null'
+      default: true
     previous_response_id:
       anyOf:
       - type: string
diff --git a/docs/static/deprecated-llama-stack-spec.yaml b/docs/static/deprecated-llama-stack-spec.yaml
index 0bade1866..207af8926 100644
--- a/docs/static/deprecated-llama-stack-spec.yaml
+++ b/docs/static/deprecated-llama-stack-spec.yaml
@@ -3566,9 +3566,10 @@ components:
       type: array
       title: Output
     parallel_tool_calls:
-      type: boolean
-      title: Parallel Tool Calls
-      default: false
+      anyOf:
+      - type: boolean
+      - type: 'null'
+      default: true
     previous_response_id:
       anyOf:
       - type: string
@@ -3968,6 +3969,11 @@ components:
       anyOf:
       - type: string
       - type: 'null'
+    parallel_tool_calls:
+      anyOf:
+      - type: boolean
+      - type: 'null'
+      default: true
     previous_response_id:
       anyOf:
       - type: string
@@ -4094,9 +4100,10 @@ components:
       type: array
       title: Output
     parallel_tool_calls:
-      type: boolean
-      title: Parallel Tool Calls
-      default: false
+      anyOf:
+      - type: boolean
+      - type: 'null'
+      default: true
     previous_response_id:
       anyOf:
       - type: string
diff --git a/docs/static/experimental-llama-stack-spec.yaml b/docs/static/experimental-llama-stack-spec.yaml
index 4271989d6..f81a93d33 100644
--- a/docs/static/experimental-llama-stack-spec.yaml
+++ b/docs/static/experimental-llama-stack-spec.yaml
@@ -3263,9 +3263,10 @@ components:
       type: array
       title: Output
     parallel_tool_calls:
-      type: boolean
-      title: Parallel Tool Calls
-      default: false
+      anyOf:
+      - type: boolean
+      - type: 'null'
+      default: true
     previous_response_id:
       anyOf:
       - type: string
@@ -3662,9 +3663,10 @@ components:
       type: array
       title: Output
     parallel_tool_calls:
-      type: boolean
-      title: Parallel Tool Calls
-      default: false
+      anyOf:
+      - type: boolean
+      - type: 'null'
+      default: true
     previous_response_id:
       anyOf:
       - type: string
diff --git a/docs/static/llama-stack-spec.yaml b/docs/static/llama-stack-spec.yaml
index a12ac342f..816f3d0fb 100644
--- a/docs/static/llama-stack-spec.yaml
+++ b/docs/static/llama-stack-spec.yaml
@@ -5744,9 +5744,10 @@ components:
       type: array
       title: Output
     parallel_tool_calls:
-      type: boolean
-      title: Parallel Tool Calls
-      default: false
+      anyOf:
+      - type: boolean
+      - type: 'null'
+      default: true
     previous_response_id:
       anyOf:
       - type: string
@@ -6146,6 +6147,11 @@ components:
       anyOf:
       - type: string
      - type: 'null'
+    parallel_tool_calls:
+      anyOf:
+      - type: boolean
+      - type: 'null'
+      default: true
     previous_response_id:
       anyOf:
       - type: string
@@ -6272,9 +6278,10 @@ components:
       type: array
       title: Output
     parallel_tool_calls:
-      type: boolean
-      title: Parallel Tool Calls
-      default: false
+      anyOf:
+      - type: boolean
+      - type: 'null'
+      default: true
     previous_response_id:
       anyOf:
       - type: string
diff --git a/docs/static/stainless-llama-stack-spec.yaml b/docs/static/stainless-llama-stack-spec.yaml
index 3a6735cbc..a6ebc868c 100644
--- a/docs/static/stainless-llama-stack-spec.yaml
+++ b/docs/static/stainless-llama-stack-spec.yaml
@@ -6723,9 +6723,10 @@ components:
       type: array
       title: Output
     parallel_tool_calls:
-      type: boolean
-      title: Parallel Tool Calls
-      default: false
+      anyOf:
+      - type: boolean
+      - type: 'null'
+      default: true
     previous_response_id:
       anyOf:
       - type: string
@@ -7125,6 +7126,11 @@ components:
       anyOf:
       - type: string
       - type: 'null'
+    parallel_tool_calls:
+      anyOf:
+      - type: boolean
+      - type: 'null'
+      default: true
     previous_response_id:
       anyOf:
       - type: string
@@ -7251,9 +7257,10 @@ components:
       type: array
       title: Output
     parallel_tool_calls:
-      type: boolean
-      title: Parallel Tool Calls
-      default: false
+      anyOf:
+      - type: boolean
+      - type: 'null'
+      default: true
     previous_response_id:
       anyOf:
       - type: string
diff --git a/src/llama_stack/providers/inline/agents/meta_reference/agents.py b/src/llama_stack/providers/inline/agents/meta_reference/agents.py
index 347f6fdb1..e47e757be 100644
--- a/src/llama_stack/providers/inline/agents/meta_reference/agents.py
+++ b/src/llama_stack/providers/inline/agents/meta_reference/agents.py
@@ -92,6 +92,7 @@ class MetaReferenceAgentsImpl(Agents):
         model: str,
         prompt: OpenAIResponsePrompt | None = None,
         instructions: str | None = None,
+        parallel_tool_calls: bool | None = True,
         previous_response_id: str | None = None,
         conversation: str | None = None,
         store: bool | None = True,
@@ -120,6 +121,7 @@ class MetaReferenceAgentsImpl(Agents):
             include,
             max_infer_iters,
             guardrails,
+            parallel_tool_calls,
             max_tool_calls,
         )
         return result  # type: ignore[no-any-return]
diff --git a/src/llama_stack/providers/inline/agents/meta_reference/responses/openai_responses.py b/src/llama_stack/providers/inline/agents/meta_reference/responses/openai_responses.py
index cb0fe284e..7e080a675 100644
--- a/src/llama_stack/providers/inline/agents/meta_reference/responses/openai_responses.py
+++ b/src/llama_stack/providers/inline/agents/meta_reference/responses/openai_responses.py
@@ -252,6 +252,7 @@ class OpenAIResponsesImpl:
         include: list[str] | None = None,
         max_infer_iters: int | None = 10,
         guardrails: list[str | ResponseGuardrailSpec] | None = None,
+        parallel_tool_calls: bool | None = None,
         max_tool_calls: int | None = None,
     ):
         stream = bool(stream)
@@ -296,6 +297,7 @@ class OpenAIResponsesImpl:
             tools=tools,
             max_infer_iters=max_infer_iters,
             guardrail_ids=guardrail_ids,
+            parallel_tool_calls=parallel_tool_calls,
             max_tool_calls=max_tool_calls,
         )
 
@@ -346,6 +348,7 @@ class OpenAIResponsesImpl:
         tools: list[OpenAIResponseInputTool] | None = None,
         max_infer_iters: int | None = 10,
         guardrail_ids: list[str] | None = None,
+        parallel_tool_calls: bool | None = True,
         max_tool_calls: int | None = None,
     ) -> AsyncIterator[OpenAIResponseObjectStream]:
         # These should never be None when called from create_openai_response (which sets defaults)
@@ -385,6 +388,7 @@ class OpenAIResponsesImpl:
             created_at=created_at,
             text=text,
             max_infer_iters=max_infer_iters,
+            parallel_tool_calls=parallel_tool_calls,
             tool_executor=self.tool_executor,
             safety_api=self.safety_api,
             guardrail_ids=guardrail_ids,
diff --git a/src/llama_stack/providers/inline/agents/meta_reference/responses/streaming.py b/src/llama_stack/providers/inline/agents/meta_reference/responses/streaming.py
index 95c690147..cdbd87244 100644
--- a/src/llama_stack/providers/inline/agents/meta_reference/responses/streaming.py
+++ b/src/llama_stack/providers/inline/agents/meta_reference/responses/streaming.py
@@ -114,6 +114,7 @@ class StreamingResponseOrchestrator:
         safety_api,
         guardrail_ids: list[str] | None = None,
         prompt: OpenAIResponsePrompt | None = None,
+        parallel_tool_calls: bool | None = None,
         max_tool_calls: int | None = None,
     ):
         self.inference_api = inference_api
@@ -128,6 +129,8 @@ class StreamingResponseOrchestrator:
         self.prompt = prompt
         # System message that is inserted into the model's context
         self.instructions = instructions
+        # Whether to allow more than one function tool call to be generated per turn.
+        self.parallel_tool_calls = parallel_tool_calls
         # Max number of total calls to built-in tools that can be processed in a response
         self.max_tool_calls = max_tool_calls
         self.sequence_number = 0
@@ -190,6 +193,7 @@ class StreamingResponseOrchestrator:
             usage=self.accumulated_usage,
             instructions=self.instructions,
             prompt=self.prompt,
+            parallel_tool_calls=self.parallel_tool_calls,
             max_tool_calls=self.max_tool_calls,
         )
 
diff --git a/src/llama_stack_api/agents.py b/src/llama_stack_api/agents.py
index ca0611746..9b767608a 100644
--- a/src/llama_stack_api/agents.py
+++ b/src/llama_stack_api/agents.py
@@ -72,6 +72,7 @@ class Agents(Protocol):
         model: str,
         prompt: OpenAIResponsePrompt | None = None,
         instructions: str | None = None,
+        parallel_tool_calls: bool | None = True,
         previous_response_id: str | None = None,
         conversation: str | None = None,
         store: bool | None = True,
diff --git a/src/llama_stack_api/openai_responses.py b/src/llama_stack_api/openai_responses.py
index 952418f1c..e20004487 100644
--- a/src/llama_stack_api/openai_responses.py
+++ b/src/llama_stack_api/openai_responses.py
@@ -585,7 +585,7 @@ class OpenAIResponseObject(BaseModel):
     :param model: Model identifier used for generation
     :param object: Object type identifier, always "response"
     :param output: List of generated output items (messages, tool calls, etc.)
-    :param parallel_tool_calls: Whether tool calls can be executed in parallel
+    :param parallel_tool_calls: (Optional) Whether to allow more than one function tool call to be generated per turn.
     :param previous_response_id: (Optional) ID of the previous response in a conversation
     :param prompt: (Optional) Reference to a prompt template and its variables.
     :param status: Current status of the response generation
@@ -605,7 +605,7 @@ class OpenAIResponseObject(BaseModel):
     model: str
     object: Literal["response"] = "response"
     output: Sequence[OpenAIResponseOutput]
-    parallel_tool_calls: bool = False
+    parallel_tool_calls: bool | None = True
     previous_response_id: str | None = None
     prompt: OpenAIResponsePrompt | None = None
     status: str