feat(responses): add output_text delta events to responses (#2265)

This adds initial streaming support to the Responses API. This PR makes sure that the _first_ inference call made to chat completions streams out.

There's more to be done:
- tool call output tokens need to stream out when possible
- we need to loop through multiple rounds of inference and they all need to stream out.

## Test Plan

Added a test. Executed as:

```
FIREWORKS_API_KEY=... \
pytest -s -v 'tests/verifications/openai_api/test_responses.py' \
--provider=stack:fireworks --model meta-llama/Llama-4-Scout-17B-16E-Instruct
```

Then, started a llama stack fireworks distro and tested against it like this:

```
OPENAI_API_KEY=blah \
pytest -s -v 'tests/verifications/openai_api/test_responses.py' \
--base-url http://localhost:8321/v1/openai/v1 \
--model meta-llama/Llama-4-Scout-17B-16E-Instruct
```
2025-12-03 18:00:36 +00:00 · 2025-05-27 13:07:14 -07:00 · 2025-05-27 13:07:14 -07:00 · 5cdb29758a
commit 5cdb29758a
parent 6ee319ae08
8 changed files with 493 additions and 160 deletions
--- a/tests/unit/providers/agents/meta_reference/test_openai_responses.py
+++ b/tests/unit/providers/agents/meta_reference/test_openai_responses.py
@ -232,9 +232,17 @@ async def test_create_openai_response_with_tool_call_type_none(openai_responses_

    # Check that we got the content from our mocked tool execution result
    chunks = [chunk async for chunk in result]
-    assert len(chunks) > 0
-    assert chunks[0].response.output[0].type == "function_call"
-    assert chunks[0].response.output[0].name == "get_weather"
+    assert len(chunks) == 2  # Should have response.created and response.completed
+
+    # Check response.created event (should have empty output)
+    assert chunks[0].type == "response.created"
+    assert len(chunks[0].response.output) == 0
+
+    # Check response.completed event (should have the tool call)
+    assert chunks[1].type == "response.completed"
+    assert len(chunks[1].response.output) == 1
+    assert chunks[1].response.output[0].type == "function_call"
+    assert chunks[1].response.output[0].name == "get_weather"


@pytest.mark.asyncio
--- a/tests/verifications/openai_api/conftest.py
+++ b/tests/verifications/openai_api/conftest.py
@ -10,17 +10,17 @@ from tests.verifications.openai_api.fixtures.fixtures import _load_all_verificat
 def pytest_generate_tests(metafunc):
    """Dynamically parametrize tests based on the selected provider and config."""
    if "model" in metafunc.fixturenames:
+        model = metafunc.config.getoption("model")
+        if model:
+            metafunc.parametrize("model", [model])
+            return
+
        provider = metafunc.config.getoption("provider")
        if not provider:
            print("Warning: --provider not specified. Skipping model parametrization.")
            metafunc.parametrize("model", [])
            return

-        model = metafunc.config.getoption("model")
-        if model:
-            metafunc.parametrize("model", [model])
-            return
-
        try:
            config_data = _load_all_verification_configs()
        except (OSError, FileNotFoundError) as e:
--- a/tests/verifications/openai_api/fixtures/test_cases/responses.yaml
+++ b/tests/verifications/openai_api/fixtures/test_cases/responses.yaml
@ -77,11 +77,12 @@ test_response_image:
          image_url: "https://upload.wikimedia.org/wikipedia/commons/f/f7/Llamas%2C_Vernagt-Stausee%2C_Italy.jpg"
      output: "llama"

+# the models are really poor at tool calling after seeing images :/
 test_response_multi_turn_image:
  test_name: test_response_multi_turn_image
  test_params:
    case:
-    - case_id: "llama_image_search"
+    - case_id: "llama_image_understanding"
      turns:
      - input:
        - role: user
@ -91,7 +92,5 @@ test_response_multi_turn_image:
          - type: input_image
            image_url: "https://upload.wikimedia.org/wikipedia/commons/f/f7/Llamas%2C_Vernagt-Stausee%2C_Italy.jpg"
        output: "llama"
-      - input: "Search the web using the search tool for the animal from the previous response. Your search query should be a single phrase that includes the animal's name and the words 'maverick', 'scout' and 'llm'"
-        tools:
-        - type: web_search
-        output: "model"
+      - input: "What country do you find this animal primarily in? What continent?"
+        output: "peru"
--- a/tests/verifications/openai_api/test_responses.py
+++ b/tests/verifications/openai_api/test_responses.py
@ -7,6 +7,7 @@
 import json

 import httpx
+import openai
 import pytest

 from llama_stack import LlamaStackAsLibraryClient
@ -61,23 +62,151 @@ def test_response_streaming_basic(request, openai_client, model, provider, verif
    if should_skip_test(verification_config, provider, model, test_name_base):
        pytest.skip(f"Skipping {test_name_base} for model {model} on provider {provider} based on config.")

+    import time
+
    response = openai_client.responses.create(
        model=model,
        input=case["input"],
        stream=True,
    )
-    streamed_content = []
+
+    # Track events and timing to verify proper streaming
+    events = []
+    event_times = []
    response_id = ""
+
+    start_time = time.time()
+
    for chunk in response:
-        if chunk.type == "response.completed":
+        current_time = time.time()
+        event_times.append(current_time - start_time)
+        events.append(chunk)
+
+        if chunk.type == "response.created":
+            # Verify response.created is emitted first and immediately
+            assert len(events) == 1, "response.created should be the first event"
+            assert event_times[0] < 0.1, "response.created should be emitted immediately"
+            assert chunk.response.status == "in_progress"
            response_id = chunk.response.id
-            streamed_content.append(chunk.response.output_text.strip())

-    assert len(streamed_content) > 0
-    assert case["output"].lower() in "".join(streamed_content).lower()
+        elif chunk.type == "response.completed":
+            # Verify response.completed comes after response.created
+            assert len(events) >= 2, "response.completed should come after response.created"
+            assert chunk.response.status == "completed"
+            assert chunk.response.id == response_id, "Response ID should be consistent"

+            # Verify content quality
+            output_text = chunk.response.output_text.lower().strip()
+            assert len(output_text) > 0, "Response should have content"
+            assert case["output"].lower() in output_text, f"Expected '{case['output']}' in response"
+
+    # Verify we got both required events
+    event_types = [event.type for event in events]
+    assert "response.created" in event_types, "Missing response.created event"
+    assert "response.completed" in event_types, "Missing response.completed event"
+
+    # Verify event order
+    created_index = event_types.index("response.created")
+    completed_index = event_types.index("response.completed")
+    assert created_index < completed_index, "response.created should come before response.completed"
+
+    # Verify stored response matches streamed response
    retrieved_response = openai_client.responses.retrieve(response_id=response_id)
-    assert retrieved_response.output_text == "".join(streamed_content)
+    final_event = events[-1]
+    assert retrieved_response.output_text == final_event.response.output_text
+
+
+@pytest.mark.parametrize(
+    "case",
+    responses_test_cases["test_response_basic"]["test_params"]["case"],
+    ids=case_id_generator,
+)
+def test_response_streaming_incremental_content(request, openai_client, model, provider, verification_config, case):
+    """Test that streaming actually delivers content incrementally, not just at the end."""
+    test_name_base = get_base_test_name(request)
+    if should_skip_test(verification_config, provider, model, test_name_base):
+        pytest.skip(f"Skipping {test_name_base} for model {model} on provider {provider} based on config.")
+
+    import time
+
+    response = openai_client.responses.create(
+        model=model,
+        input=case["input"],
+        stream=True,
+    )
+
+    # Track all events and their content to verify incremental streaming
+    events = []
+    content_snapshots = []
+    event_times = []
+
+    start_time = time.time()
+
+    for chunk in response:
+        current_time = time.time()
+        event_times.append(current_time - start_time)
+        events.append(chunk)
+
+        # Track content at each event based on event type
+        if chunk.type == "response.output_text.delta":
+            # For delta events, track the delta content
+            content_snapshots.append(chunk.delta)
+        elif hasattr(chunk, "response") and hasattr(chunk.response, "output_text"):
+            # For response.created/completed events, track the full output_text
+            content_snapshots.append(chunk.response.output_text)
+        else:
+            content_snapshots.append("")
+
+    # Verify we have the expected events
+    event_types = [event.type for event in events]
+    assert "response.created" in event_types, "Missing response.created event"
+    assert "response.completed" in event_types, "Missing response.completed event"
+
+    # Check if we have incremental content updates
+    created_index = event_types.index("response.created")
+    completed_index = event_types.index("response.completed")
+
+    # The key test: verify content progression
+    created_content = content_snapshots[created_index]
+    completed_content = content_snapshots[completed_index]
+
+    # Verify that response.created has empty or minimal content
+    assert len(created_content) == 0, f"response.created should have empty content, got: {repr(created_content[:100])}"
+
+    # Verify that response.completed has the full content
+    assert len(completed_content) > 0, "response.completed should have content"
+    assert case["output"].lower() in completed_content.lower(), f"Expected '{case['output']}' in final content"
+
+    # Check for true incremental streaming by looking for delta events
+    delta_events = [i for i, event_type in enumerate(event_types) if event_type == "response.output_text.delta"]
+
+    # Assert that we have delta events (true incremental streaming)
+    assert len(delta_events) > 0, "Expected delta events for true incremental streaming, but found none"
+
+    # Verify delta events have content and accumulate to final content
+    delta_content_total = ""
+    non_empty_deltas = 0
+
+    for delta_idx in delta_events:
+        delta_content = content_snapshots[delta_idx]
+        if delta_content:
+            delta_content_total += delta_content
+            non_empty_deltas += 1
+
+    # Assert that we have meaningful delta content
+    assert non_empty_deltas > 0, "Delta events found but none contain content"
+    assert len(delta_content_total) > 0, "Delta events found but total delta content is empty"
+
+    # Verify that the accumulated delta content matches the final content
+    assert delta_content_total.strip() == completed_content.strip(), (
+        f"Delta content '{delta_content_total}' should match final content '{completed_content}'"
+    )
+
+    # Verify timing: delta events should come between created and completed
+    for delta_idx in delta_events:
+        assert created_index < delta_idx < completed_index, (
+            f"Delta event at index {delta_idx} should be between created ({created_index}) and completed ({completed_index})"
+        )


@pytest.mark.parametrize(
@ -178,7 +307,7 @@ def test_response_non_streaming_mcp_tool(request, openai_client, model, provider
        exc_type = (
            AuthenticationRequiredError
            if isinstance(openai_client, LlamaStackAsLibraryClient)
-            else httpx.HTTPStatusError
+            else (httpx.HTTPStatusError, openai.AuthenticationError)
        )
        with pytest.raises(exc_type):
            openai_client.responses.create(