feat(responses): add MCP argument streaming and content part events (#3136)

# What does this PR do?

Adds content part streaming events to the OpenAI-compatible Responses API to support more granular streaming of response content. This introduces:

1. New schema types for content parts: `OpenAIResponseContentPart` with variants for text output and refusals
2. New streaming event types:
   - `OpenAIResponseObjectStreamResponseContentPartAdded` for when content parts begin
   - `OpenAIResponseObjectStreamResponseContentPartDone` for when content parts complete
3. Implementation in the reference provider to emit these events during streaming responses. The provider also emits MCP call argument streaming events (`response.mcp_call.arguments.delta` / `response.mcp_call.arguments.done`), just as it does for function call arguments.

## Test Plan

Updated existing streaming tests to verify that content part events are properly emitted.
2025-12-06 10:37:22 +00:00 · 2025-08-13 16:34:26 -07:00 · 2025-08-13 16:34:26 -07:00 · e1e161553c
commit e1e161553c
parent 8638537d14
6 changed files with 480 additions and 35 deletions
--- a/tests/integration/non_ci/responses/test_responses.py
+++ b/tests/integration/non_ci/responses/test_responses.py
@@ -590,9 +590,17 @@ def test_response_streaming_multi_turn_tool_execution(compat_client, text_model_
        # Verify tool call streaming events are present
        chunk_types = [chunk.type for chunk in chunks]

-        # Should have function call arguments delta events for tool calls
-        delta_events = [chunk for chunk in chunks if chunk.type == "response.function_call_arguments.delta"]
-        done_events = [chunk for chunk in chunks if chunk.type == "response.function_call_arguments.done"]
+        # Should have function call or MCP arguments delta/done events for tool calls
+        delta_events = [
+            chunk
+            for chunk in chunks
+            if chunk.type in ["response.function_call_arguments.delta", "response.mcp_call.arguments.delta"]
+        ]
+        done_events = [
+            chunk
+            for chunk in chunks
+            if chunk.type in ["response.function_call_arguments.done", "response.mcp_call.arguments.done"]
+        ]

        # Should have output item events for tool calls
        item_added_events = [chunk for chunk in chunks if chunk.type == "response.output_item.added"]
@@ -606,8 +614,12 @@ def test_response_streaming_multi_turn_tool_execution(compat_client, text_model_
        assert len(chunks) > 10, f"Expected rich streaming with many events, got only {len(chunks)} chunks"

        # Since this test involves MCP tool calls, we should see streaming events
-        assert len(delta_events) > 0, f"Expected function_call_arguments.delta events, got chunk types: {chunk_types}"
-        assert len(done_events) > 0, f"Expected function_call_arguments.done events, got chunk types: {chunk_types}"
+        assert len(delta_events) > 0, (
+            f"Expected function_call_arguments.delta or mcp_call.arguments.delta events, got chunk types: {chunk_types}"
+        )
+        assert len(done_events) > 0, (
+            f"Expected function_call_arguments.done or mcp_call.arguments.done events, got chunk types: {chunk_types}"
+        )

        # Should have output item events for function calls
        assert len(item_added_events) > 0, f"Expected response.output_item.added events, got chunk types: {chunk_types}"
@@ -670,22 +682,32 @@ def test_response_streaming_multi_turn_tool_execution(compat_client, text_model_
            assert isinstance(done_event.output_index, int), "Output index should be integer"
            assert done_event.output_index >= 0, "Output index should be non-negative"

-        # Group function call argument events by item_id (these should have proper tracking)
-        function_call_events_by_item_id = {}
+        # Group function call and MCP argument events by item_id (these should have proper tracking)
+        argument_events_by_item_id = {}
        for chunk in chunks:
            if hasattr(chunk, "item_id") and chunk.type in [
                "response.function_call_arguments.delta",
                "response.function_call_arguments.done",
+                "response.mcp_call.arguments.delta",
+                "response.mcp_call.arguments.done",
            ]:
                item_id = chunk.item_id
-                if item_id not in function_call_events_by_item_id:
-                    function_call_events_by_item_id[item_id] = []
-                function_call_events_by_item_id[item_id].append(chunk)
+                if item_id not in argument_events_by_item_id:
+                    argument_events_by_item_id[item_id] = []
+                argument_events_by_item_id[item_id].append(chunk)

-        for item_id, related_events in function_call_events_by_item_id.items():
-            # Should have at least one delta and one done event for a complete function call
-            delta_events = [e for e in related_events if e.type == "response.function_call_arguments.delta"]
-            done_events = [e for e in related_events if e.type == "response.function_call_arguments.done"]
+        for item_id, related_events in argument_events_by_item_id.items():
+            # Should have at least one delta and one done event for a complete tool call
+            delta_events = [
+                e
+                for e in related_events
+                if e.type in ["response.function_call_arguments.delta", "response.mcp_call.arguments.delta"]
+            ]
+            done_events = [
+                e
+                for e in related_events
+                if e.type in ["response.function_call_arguments.done", "response.mcp_call.arguments.done"]
+            ]

            assert len(delta_events) > 0, f"Item {item_id} should have at least one delta event"
            assert len(done_events) == 1, f"Item {item_id} should have exactly one done event"
@@ -694,6 +716,33 @@ def test_response_streaming_multi_turn_tool_execution(compat_client, text_model_
            for event in related_events:
                assert event.item_id == item_id, f"Event should have consistent item_id {item_id}, got {event.item_id}"

+        # Verify content part events if they exist (for text streaming)
+        content_part_added_events = [chunk for chunk in chunks if chunk.type == "response.content_part.added"]
+        content_part_done_events = [chunk for chunk in chunks if chunk.type == "response.content_part.done"]
+
+        # Content part events should be paired (if any exist)
+        if len(content_part_added_events) > 0:
+            assert len(content_part_done_events) > 0, (
+                "Should have content_part.done events if content_part.added events exist"
+            )
+
+            # Verify content part event structure
+            for added_event in content_part_added_events:
+                assert hasattr(added_event, "response_id"), "Content part added event should have response_id"
+                assert hasattr(added_event, "item_id"), "Content part added event should have item_id"
+                assert hasattr(added_event, "part"), "Content part added event should have part"
+
+                # TODO: enable this after the client types are updated
+                # assert added_event.part.type == "output_text", "Content part should be an output_text"
+
+            for done_event in content_part_done_events:
+                assert hasattr(done_event, "response_id"), "Content part done event should have response_id"
+                assert hasattr(done_event, "item_id"), "Content part done event should have item_id"
+                assert hasattr(done_event, "part"), "Content part done event should have part"
+
+                # TODO: enable this after the client types are updated
+                # assert len(done_event.part.text) > 0, "Content part should have text when done"
+
        # Basic pairing check: each output_item.added should be followed by some activity
        # (but we can't enforce strict 1:1 pairing due to the complexity of multi-turn scenarios)
        assert len(item_added_events) > 0, "Should have at least one output_item.added event"
--- a/tests/unit/providers/agents/meta_reference/test_openai_responses.py
+++ b/tests/unit/providers/agents/meta_reference/test_openai_responses.py
@@ -136,9 +136,12 @@ async def test_create_openai_response_with_string_input(openai_responses_impl, m
        input=input_text,
        model=model,
        temperature=0.1,
+        stream=True,  # Enable streaming to test content part events
    )

-    # Verify
+    # For streaming response, collect all chunks
+    chunks = [chunk async for chunk in result]
+
    mock_inference_api.openai_chat_completion.assert_called_once_with(
        model=model,
        messages=[OpenAIUserMessageParam(role="user", content="What is the capital of Ireland?", name=None)],
@@ -147,11 +150,32 @@ async def test_create_openai_response_with_string_input(openai_responses_impl, m
        stream=True,
        temperature=0.1,
    )
+
+    # Should have content part events for text streaming
+    # Expected: response.created, content_part.added, output_text.delta, content_part.done, response.completed
+    assert len(chunks) >= 4
+    assert chunks[0].type == "response.created"
+
+    # Check for content part events
+    content_part_added_events = [c for c in chunks if c.type == "response.content_part.added"]
+    content_part_done_events = [c for c in chunks if c.type == "response.content_part.done"]
+    text_delta_events = [c for c in chunks if c.type == "response.output_text.delta"]
+
+    assert len(content_part_added_events) >= 1, "Should have content_part.added event for text"
+    assert len(content_part_done_events) >= 1, "Should have content_part.done event for text"
+    assert len(text_delta_events) >= 1, "Should have text delta events"
+
+    # Verify final event is completion
+    assert chunks[-1].type == "response.completed"
+
+    # When streaming, the final response is in the last chunk
+    final_response = chunks[-1].response
+    assert final_response.model == model
+    assert len(final_response.output) == 1
+    assert isinstance(final_response.output[0], OpenAIResponseMessage)
+
    openai_responses_impl.responses_store.store_response_object.assert_called_once()
-    assert result.model == model
-    assert len(result.output) == 1
-    assert isinstance(result.output[0], OpenAIResponseMessage)
-    assert result.output[0].content[0].text == "Dublin"
+    assert final_response.output[0].content[0].text == "Dublin"


 async def test_create_openai_response_with_string_input_with_tools(openai_responses_impl, mock_inference_api):
@@ -272,6 +296,8 @@ async def test_create_openai_response_with_tool_call_type_none(openai_responses_

    # Check that we got the content from our mocked tool execution result
    chunks = [chunk async for chunk in result]
+
+    # Verify event types
    # Should have: response.created, output_item.added, function_call_arguments.delta,
    # function_call_arguments.done, output_item.done, response.completed
    assert len(chunks) == 6