Mirror of https://github.com/meta-llama/llama-stack.git, synced 2025-12-03 09:53:45 +00:00
Wire through parallel_tool_calls to Responses API
Signed-off-by: Anastas Stoyanovsky <astoyano@redhat.com>
This commit is contained in: parent 7093978754, commit 7a9b7ecdc2
9 changed files with 159 additions and 20 deletions
@@ -6723,9 +6723,12 @@ components:
           type: array
           title: Output
         parallel_tool_calls:
-          type: boolean
           title: Parallel Tool Calls
-          default: false
+          type: boolean
+          default: true
+          description: >-
+            (Optional) Whether to allow more than one function tool call generated
+            per turn.
         previous_response_id:
           anyOf:
             - type: string
@@ -6838,8 +6841,10 @@ components:
         - created_at
         - id
         - model
+        - object
         - output
         - status
+        - text
         - input
       title: OpenAIResponseObjectWithInput
       description: OpenAI response object extended with input context information.
@@ -7122,9 +7127,12 @@ components:
             - type: 'null'
           title: OpenAIResponsePrompt
         instructions:
+          type: string
           anyOf:
             - type: string
             - type: 'null'
+        parallel_tool_calls:
+          type: boolean
         previous_response_id:
           anyOf:
             - type: string
@@ -7253,7 +7261,10 @@ components:
         parallel_tool_calls:
           type: boolean
           title: Parallel Tool Calls
-          default: false
+          default: true
+          description: >-
+            (Optional) Whether to allow more than one function tool call generated
+            per turn.
         previous_response_id:
           anyOf:
             - type: string
docs/static/llama-stack-spec.yaml (vendored, 13 changed lines)
@@ -5746,7 +5746,10 @@ components:
         parallel_tool_calls:
           type: boolean
           title: Parallel Tool Calls
-          default: false
+          default: true
+          description: >-
+            (Optional) Whether to allow more than one function tool call generated
+            per turn.
         previous_response_id:
           anyOf:
             - type: string
@@ -6143,9 +6146,12 @@ components:
             - type: 'null'
           title: OpenAIResponsePrompt
         instructions:
+          type: string
           anyOf:
             - type: string
             - type: 'null'
+        parallel_tool_calls:
+          type: boolean
         previous_response_id:
           anyOf:
             - type: string
@@ -6274,7 +6280,10 @@ components:
         parallel_tool_calls:
           type: boolean
           title: Parallel Tool Calls
-          default: false
+          default: true
+          description: >-
+            (Optional) Whether to allow more than one function tool call generated
+            per turn.
         previous_response_id:
           anyOf:
             - type: string
docs/static/stainless-llama-stack-spec.yaml (vendored, 13 changed lines)
@@ -6725,7 +6725,10 @@ components:
         parallel_tool_calls:
           type: boolean
           title: Parallel Tool Calls
-          default: false
+          default: true
+          description: >-
+            (Optional) Whether to allow more than one function tool call generated
+            per turn.
         previous_response_id:
           anyOf:
             - type: string
@@ -7125,6 +7128,9 @@ components:
           anyOf:
             - type: string
             - type: 'null'
+          type: string
+        parallel_tool_calls:
+          type: boolean
         previous_response_id:
           anyOf:
             - type: string
@@ -7253,7 +7259,10 @@ components:
         parallel_tool_calls:
           type: boolean
           title: Parallel Tool Calls
-          default: false
+          default: true
+          description: >-
+            (Optional) Whether to allow more than one function tool call generated
+            per turn.
         previous_response_id:
           anyOf:
             - type: string
@@ -92,6 +92,7 @@ class MetaReferenceAgentsImpl(Agents):
         model: str,
         prompt: OpenAIResponsePrompt | None = None,
         instructions: str | None = None,
+        parallel_tool_calls: bool | None = True,
         previous_response_id: str | None = None,
         conversation: str | None = None,
         store: bool | None = True,
@@ -120,6 +121,7 @@ class MetaReferenceAgentsImpl(Agents):
             include,
             max_infer_iters,
             guardrails,
+            parallel_tool_calls,
             max_tool_calls,
         )
         return result  # type: ignore[no-any-return]
@@ -252,6 +252,7 @@ class OpenAIResponsesImpl:
         include: list[str] | None = None,
         max_infer_iters: int | None = 10,
         guardrails: list[str | ResponseGuardrailSpec] | None = None,
+        parallel_tool_calls: bool | None = None,
         max_tool_calls: int | None = None,
     ):
         stream = bool(stream)
@@ -296,6 +297,7 @@ class OpenAIResponsesImpl:
             tools=tools,
             max_infer_iters=max_infer_iters,
             guardrail_ids=guardrail_ids,
+            parallel_tool_calls=parallel_tool_calls,
             max_tool_calls=max_tool_calls,
         )

@@ -346,6 +348,7 @@ class OpenAIResponsesImpl:
         tools: list[OpenAIResponseInputTool] | None = None,
         max_infer_iters: int | None = 10,
         guardrail_ids: list[str] | None = None,
+        parallel_tool_calls: bool | None = True,
         max_tool_calls: int | None = None,
     ) -> AsyncIterator[OpenAIResponseObjectStream]:
         # These should never be None when called from create_openai_response (which sets defaults)
@@ -385,6 +388,7 @@ class OpenAIResponsesImpl:
             created_at=created_at,
             text=text,
             max_infer_iters=max_infer_iters,
+            parallel_tool_calls=parallel_tool_calls,
             tool_executor=self.tool_executor,
             safety_api=self.safety_api,
             guardrail_ids=guardrail_ids,
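Taken together, the hunks above thread the flag from the public create_openai_response entry point into the internal streaming path and on to the orchestrator. A condensed, runnable sketch of that plumbing, with every unrelated parameter dropped (the simplification is mine, not the repo's):

from dataclasses import dataclass


# Stand-in for StreamingResponseOrchestrator; only the field relevant here is modeled.
@dataclass
class _Orchestrator:
    parallel_tool_calls: bool | None = None


def create_openai_response(parallel_tool_calls: bool | None = None) -> _Orchestrator:
    # Public entry point: accepts the flag (None when the caller omits it) and forwards it.
    return _create_streaming_response(parallel_tool_calls=parallel_tool_calls)


def _create_streaming_response(parallel_tool_calls: bool | None = True) -> _Orchestrator:
    # Internal helper: defaults to True and hands the value to the orchestrator.
    return _Orchestrator(parallel_tool_calls=parallel_tool_calls)


print(create_openai_response(False).parallel_tool_calls)  # False
print(create_openai_response().parallel_tool_calls)       # None (the caller omitted the flag)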
@@ -114,6 +114,7 @@ class StreamingResponseOrchestrator:
         safety_api,
         guardrail_ids: list[str] | None = None,
         prompt: OpenAIResponsePrompt | None = None,
+        parallel_tool_calls: bool | None = None,
         max_tool_calls: int | None = None,
     ):
         self.inference_api = inference_api
@@ -128,6 +129,8 @@ class StreamingResponseOrchestrator:
         self.prompt = prompt
         # System message that is inserted into the model's context
         self.instructions = instructions
+        # Whether to allow more than one function tool call generated per turn.
+        self.parallel_tool_calls = parallel_tool_calls
         # Max number of total calls to built-in tools that can be processed in a response
         self.max_tool_calls = max_tool_calls
         self.sequence_number = 0
@@ -190,6 +193,7 @@ class StreamingResponseOrchestrator:
             usage=self.accumulated_usage,
             instructions=self.instructions,
             prompt=self.prompt,
+            parallel_tool_calls=self.parallel_tool_calls,
             max_tool_calls=self.max_tool_calls,
         )

@@ -301,6 +305,7 @@ class StreamingResponseOrchestrator:
                 completion_result_data,
                 output_messages,
                 next_turn_messages,
+                not self.parallel_tool_calls,
             ):
                 yield stream_event

@@ -897,6 +902,7 @@ class StreamingResponseOrchestrator:
         completion_result_data: ChatCompletionResult,
         output_messages: list[OpenAIResponseOutput],
         next_turn_messages: list,
+        incremental_function_calling: bool,
     ) -> AsyncIterator[OpenAIResponseObjectStream]:
         """Coordinate execution of both function and non-function tool calls."""
         # Execute non-function tool calls
@@ -1020,6 +1026,10 @@ class StreamingResponseOrchestrator:
                     sequence_number=self.sequence_number,
                 )

+            # TODO: Make sure that multi-turn incremental execution works
+            if incremental_function_calling:
+                break
+
     async def _process_new_tools(
         self, tools: list[OpenAIResponseInputTool], output_messages: list[OpenAIResponseOutput]
     ) -> AsyncIterator[OpenAIResponseObjectStream]:
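The orchestrator changes above are where the flag actually changes behavior: not self.parallel_tool_calls is passed down as incremental_function_calling, and when that is set the tool-execution loop stops after the first function call of the turn. A minimal sketch of that control flow (the loop itself is simplified; only the two flag names come from the diff):

def plan_function_calls(tool_calls: list[str], parallel_tool_calls: bool | None) -> list[str]:
    # Mirrors the wiring above: disabling parallel tool calls turns on "incremental" calling.
    incremental_function_calling = not parallel_tool_calls
    executed: list[str] = []
    for call in tool_calls:
        executed.append(call)
        if incremental_function_calling:
            break  # at most one function tool call is executed per turn
    return executed


print(plan_function_calls(["get_weather(NY)", "get_weather(Paris)"], parallel_tool_calls=True))
print(plan_function_calls(["get_weather(NY)", "get_weather(Paris)"], parallel_tool_calls=False))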
@@ -72,6 +72,7 @@ class Agents(Protocol):
         model: str,
         prompt: OpenAIResponsePrompt | None = None,
         instructions: str | None = None,
+        parallel_tool_calls: bool | None = True,
         previous_response_id: str | None = None,
         conversation: str | None = None,
         store: bool | None = True,
@@ -585,7 +585,7 @@ class OpenAIResponseObject(BaseModel):
     :param model: Model identifier used for generation
     :param object: Object type identifier, always "response"
     :param output: List of generated output items (messages, tool calls, etc.)
-    :param parallel_tool_calls: Whether tool calls can be executed in parallel
+    :param parallel_tool_calls: (Optional) Whether to allow more than one function tool call generated per turn.
     :param previous_response_id: (Optional) ID of the previous response in a conversation
     :param prompt: (Optional) Reference to a prompt template and its variables.
     :param status: Current status of the response generation
@@ -605,7 +605,7 @@ class OpenAIResponseObject(BaseModel):
     model: str
     object: Literal["response"] = "response"
     output: Sequence[OpenAIResponseOutput]
-    parallel_tool_calls: bool = False
+    parallel_tool_calls: bool | None = True
     previous_response_id: str | None = None
     prompt: OpenAIResponsePrompt | None = None
     status: str
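The field on OpenAIResponseObject thus changes from a required bool defaulting to False to an optional bool defaulting to True, which lines up with the upstream OpenAI Responses API default of allowing parallel tool calls while still letting internal callers pass None when the flag was never supplied.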
@@ -682,3 +682,96 @@ def test_max_tool_calls_with_builtin_tools(openai_client, client_with_models, text_model_id):

     # Verify we have a valid max_tool_calls field
     assert response_3.max_tool_calls == max_tool_calls[1]
+
+
+@pytest.mark.skip(reason="Tool calling is not reliable.")
+def test_parallel_tool_calls_true(openai_client, client_with_models, text_model_id):
+    """Test handling of parallel_tool_calls with function tools in responses."""
+    if isinstance(client_with_models, LlamaStackAsLibraryClient):
+        pytest.skip("OpenAI responses are not supported when testing with library client yet.")
+
+    client = openai_client
+    parallel_tool_calls = True
+
+    tools = [
+        {
+            "type": "function",
+            "name": "get_weather",
+            "description": "Get weather information for a specified location",
+            "parameters": {
+                "type": "object",
+                "properties": {
+                    "location": {
+                        "type": "string",
+                        "description": "The city name (e.g., 'New York', 'London')",
+                    },
+                },
+            },
+        }
+    ]
+
+    # First create a response that triggers function tools
+    response = client.responses.create(
+        model=text_model_id,
+        input="Get the weather in New York and in Paris",
+        tools=tools,
+        stream=False,
+        parallel_tool_calls=parallel_tool_calls,
+    )
+
+    # Verify we got two function calls, since parallel tool calls are allowed
+    assert len(response.output) == 2
+    assert response.output[0].type == "function_call"
+    assert response.output[0].name == "get_weather"
+    assert response.output[0].status == "completed"
+    assert response.output[1].type == "function_call"
+    assert response.output[1].name == "get_weather"
+    assert response.output[1].status == "completed"
+
+    # Verify the parallel_tool_calls field is echoed back on the response
+    assert response.parallel_tool_calls == parallel_tool_calls
+
+
+@pytest.mark.skip(reason="Tool calling is not reliable.")
+def test_parallel_tool_calls_false(openai_client, client_with_models, text_model_id):
+    """Test handling of parallel_tool_calls with function tools in responses."""
+    if isinstance(client_with_models, LlamaStackAsLibraryClient):
+        pytest.skip("OpenAI responses are not supported when testing with library client yet.")
+
+    client = openai_client
+    parallel_tool_calls = False
+
+    tools = [
+        {
+            "type": "function",
+            "name": "get_weather",
+            "description": "Get weather information for a specified location",
+            "parameters": {
+                "type": "object",
+                "properties": {
+                    "location": {
+                        "type": "string",
+                        "description": "The city name (e.g., 'New York', 'London')",
+                    },
+                },
+            },
+        }
+    ]
+
+    # First create a response that triggers function tools
+    response = client.responses.create(
+        model=text_model_id,
+        input="Get the weather in New York and in Paris",
+        tools=tools,
+        stream=False,
+        parallel_tool_calls=parallel_tool_calls,
+    )
+
+    # Verify we got a single function call, since parallel tool calls are disabled
+    assert len(response.output) == 1
+    assert response.output[0].type == "function_call"
+    assert response.output[0].name == "get_weather"
+    assert response.output[0].status == "completed"
+
+    # Verify the parallel_tool_calls field is echoed back on the response
+    assert response.parallel_tool_calls == parallel_tool_calls