feat(responses): add reasoning, refusal, annotation, and file search streaming events

Implements missing streaming events from OpenAI Responses API spec: reasoning text/summary events for o1/o3 models, refusal events for safety moderation, annotation events for citations, and file search streaming events. Added optional reasoning_content field to chat completion chunks to support non-standard provider extensions. Refactored streaming orchestrator to handle new content types via helper methods.
This commit is contained in:
Ashwin Bharambe 2025-10-11 14:24:32 -07:00
parent 32fde8d9a8
commit 3f1f7c3f7f
9 changed files with 3679 additions and 0 deletions

View file

@ -953,6 +953,248 @@ class OpenAIResponseObjectStreamResponseContentPartDone(BaseModel):
type: Literal["response.content_part.done"] = "response.content_part.done"
@json_schema_type
class OpenAIResponseObjectStreamResponseReasoningTextDelta(BaseModel):
    """Streaming event carrying one incremental chunk of reasoning text.

    :param content_index: Index position of the reasoning content part
    :param delta: Incremental reasoning text being added
    :param item_id: Unique identifier of the output item being updated
    :param output_index: Index position of the item in the output list
    :param sequence_number: Sequential number for ordering streaming events
    :param type: Event type identifier, always "response.reasoning_text.delta"
    """

    content_index: int
    delta: str
    item_id: str
    output_index: int
    sequence_number: int
    type: Literal["response.reasoning_text.delta"] = "response.reasoning_text.delta"
@json_schema_type
class OpenAIResponseObjectStreamResponseReasoningTextDone(BaseModel):
    """Streaming event emitted once reasoning text is complete.

    :param content_index: Index position of the reasoning content part
    :param text: Final complete reasoning text
    :param item_id: Unique identifier of the completed output item
    :param output_index: Index position of the item in the output list
    :param sequence_number: Sequential number for ordering streaming events
    :param type: Event type identifier, always "response.reasoning_text.done"
    """

    content_index: int
    text: str
    item_id: str
    output_index: int
    sequence_number: int
    type: Literal["response.reasoning_text.done"] = "response.reasoning_text.done"
@json_schema_type
class OpenAIResponseContentPartReasoningSummary(BaseModel):
    """Reasoning summary part in a streamed response.

    :param type: Content part type identifier, always "summary_text"
    :param text: Summary text
    """

    type: Literal["summary_text"] = "summary_text"
    text: str
@json_schema_type
class OpenAIResponseObjectStreamResponseReasoningSummaryPartAdded(BaseModel):
    """Streaming event emitted when a new reasoning summary part is added.

    :param item_id: Unique identifier of the output item
    :param output_index: Index position of the output item
    :param part: The summary part that was added
    :param sequence_number: Sequential number for ordering streaming events
    :param summary_index: Index of the summary part within the reasoning summary
    :param type: Event type identifier, always "response.reasoning_summary_part.added"
    """

    item_id: str
    output_index: int
    part: OpenAIResponseContentPartReasoningSummary
    sequence_number: int
    summary_index: int
    type: Literal["response.reasoning_summary_part.added"] = "response.reasoning_summary_part.added"
@json_schema_type
class OpenAIResponseObjectStreamResponseReasoningSummaryPartDone(BaseModel):
    """Streaming event emitted when a reasoning summary part is completed.

    :param item_id: Unique identifier of the output item
    :param output_index: Index position of the output item
    :param part: The completed summary part
    :param sequence_number: Sequential number for ordering streaming events
    :param summary_index: Index of the summary part within the reasoning summary
    :param type: Event type identifier, always "response.reasoning_summary_part.done"
    """

    item_id: str
    output_index: int
    part: OpenAIResponseContentPartReasoningSummary
    sequence_number: int
    summary_index: int
    type: Literal["response.reasoning_summary_part.done"] = "response.reasoning_summary_part.done"
@json_schema_type
class OpenAIResponseObjectStreamResponseReasoningSummaryTextDelta(BaseModel):
    """Streaming event carrying one incremental chunk of reasoning summary text.

    :param delta: Incremental summary text being added
    :param item_id: Unique identifier of the output item
    :param output_index: Index position of the output item
    :param sequence_number: Sequential number for ordering streaming events
    :param summary_index: Index of the summary part within the reasoning summary
    :param type: Event type identifier, always "response.reasoning_summary_text.delta"
    """

    delta: str
    item_id: str
    output_index: int
    sequence_number: int
    summary_index: int
    type: Literal["response.reasoning_summary_text.delta"] = "response.reasoning_summary_text.delta"
@json_schema_type
class OpenAIResponseObjectStreamResponseReasoningSummaryTextDone(BaseModel):
    """Streaming event emitted once reasoning summary text is complete.

    :param text: Final complete summary text
    :param item_id: Unique identifier of the output item
    :param output_index: Index position of the output item
    :param sequence_number: Sequential number for ordering streaming events
    :param summary_index: Index of the summary part within the reasoning summary
    :param type: Event type identifier, always "response.reasoning_summary_text.done"
    """

    text: str
    item_id: str
    output_index: int
    sequence_number: int
    summary_index: int
    type: Literal["response.reasoning_summary_text.done"] = "response.reasoning_summary_text.done"
@json_schema_type
class OpenAIResponseObjectStreamResponseRefusalDelta(BaseModel):
    """Streaming event carrying one incremental chunk of refusal text.

    :param content_index: Index position of the content part
    :param delta: Incremental refusal text being added
    :param item_id: Unique identifier of the output item
    :param output_index: Index position of the item in the output list
    :param sequence_number: Sequential number for ordering streaming events
    :param type: Event type identifier, always "response.refusal.delta"
    """

    content_index: int
    delta: str
    item_id: str
    output_index: int
    sequence_number: int
    type: Literal["response.refusal.delta"] = "response.refusal.delta"
@json_schema_type
class OpenAIResponseObjectStreamResponseRefusalDone(BaseModel):
    """Streaming event emitted once refusal text is complete.

    :param content_index: Index position of the content part
    :param refusal: Final complete refusal text
    :param item_id: Unique identifier of the output item
    :param output_index: Index position of the item in the output list
    :param sequence_number: Sequential number for ordering streaming events
    :param type: Event type identifier, always "response.refusal.done"
    """

    content_index: int
    refusal: str
    item_id: str
    output_index: int
    sequence_number: int
    type: Literal["response.refusal.done"] = "response.refusal.done"
@json_schema_type
class OpenAIResponseObjectStreamResponseOutputTextAnnotationAdded(BaseModel):
    """Streaming event emitted when an annotation (e.g. a citation) is added to output text.

    :param item_id: Unique identifier of the item to which the annotation is being added
    :param output_index: Index position of the output item in the response's output array
    :param content_index: Index position of the content part within the output item
    :param annotation_index: Index of the annotation within the content part
    :param annotation: The annotation object being added
    :param sequence_number: Sequential number for ordering streaming events
    :param type: Event type identifier, always "response.output_text.annotation.added"
    """

    item_id: str
    output_index: int
    content_index: int
    annotation_index: int
    annotation: OpenAIResponseAnnotations
    sequence_number: int
    type: Literal["response.output_text.annotation.added"] = "response.output_text.annotation.added"
@json_schema_type
class OpenAIResponseObjectStreamResponseFileSearchCallInProgress(BaseModel):
    """Streaming event emitted when a file search call starts.

    :param item_id: Unique identifier of the file search call
    :param output_index: Index position of the item in the output list
    :param sequence_number: Sequential number for ordering streaming events
    :param type: Event type identifier, always "response.file_search_call.in_progress"
    """

    item_id: str
    output_index: int
    sequence_number: int
    type: Literal["response.file_search_call.in_progress"] = "response.file_search_call.in_progress"
@json_schema_type
class OpenAIResponseObjectStreamResponseFileSearchCallSearching(BaseModel):
    """Streaming event emitted while a file search call is actively searching.

    :param item_id: Unique identifier of the file search call
    :param output_index: Index position of the item in the output list
    :param sequence_number: Sequential number for ordering streaming events
    :param type: Event type identifier, always "response.file_search_call.searching"
    """

    item_id: str
    output_index: int
    sequence_number: int
    type: Literal["response.file_search_call.searching"] = "response.file_search_call.searching"
@json_schema_type
class OpenAIResponseObjectStreamResponseFileSearchCallCompleted(BaseModel):
    """Streaming event emitted when a file search call finishes.

    :param item_id: Unique identifier of the completed file search call
    :param output_index: Index position of the item in the output list
    :param sequence_number: Sequential number for ordering streaming events
    :param type: Event type identifier, always "response.file_search_call.completed"
    """

    item_id: str
    output_index: int
    sequence_number: int
    type: Literal["response.file_search_call.completed"] = "response.file_search_call.completed"
OpenAIResponseObjectStream = Annotated[
OpenAIResponseObjectStreamResponseCreated
| OpenAIResponseObjectStreamResponseInProgress
@ -975,6 +1217,18 @@ OpenAIResponseObjectStream = Annotated[
| OpenAIResponseObjectStreamResponseMcpCallCompleted
| OpenAIResponseObjectStreamResponseContentPartAdded
| OpenAIResponseObjectStreamResponseContentPartDone
| OpenAIResponseObjectStreamResponseReasoningTextDelta
| OpenAIResponseObjectStreamResponseReasoningTextDone
| OpenAIResponseObjectStreamResponseReasoningSummaryPartAdded
| OpenAIResponseObjectStreamResponseReasoningSummaryPartDone
| OpenAIResponseObjectStreamResponseReasoningSummaryTextDelta
| OpenAIResponseObjectStreamResponseReasoningSummaryTextDone
| OpenAIResponseObjectStreamResponseRefusalDelta
| OpenAIResponseObjectStreamResponseRefusalDone
| OpenAIResponseObjectStreamResponseOutputTextAnnotationAdded
| OpenAIResponseObjectStreamResponseFileSearchCallInProgress
| OpenAIResponseObjectStreamResponseFileSearchCallSearching
| OpenAIResponseObjectStreamResponseFileSearchCallCompleted
| OpenAIResponseObjectStreamResponseIncomplete
| OpenAIResponseObjectStreamResponseFailed
| OpenAIResponseObjectStreamResponseCompleted,

View file

@ -776,12 +776,14 @@ class OpenAIChoiceDelta(BaseModel):
:param refusal: (Optional) The refusal of the delta
:param role: (Optional) The role of the delta
:param tool_calls: (Optional) The tool calls of the delta
:param reasoning_content: (Optional) The reasoning content from the model (non-standard, for o1/o3 models)
"""
content: str | None = None
refusal: str | None = None
role: str | None = None
tool_calls: list[OpenAIChatCompletionToolCall] | None = None
reasoning_content: str | None = None
@json_schema_type

View file

@ -13,6 +13,8 @@ from llama_stack.apis.agents.openai_responses import (
ApprovalFilter,
MCPListToolsTool,
OpenAIResponseContentPartOutputText,
OpenAIResponseContentPartReasoningText,
OpenAIResponseContentPartRefusal,
OpenAIResponseError,
OpenAIResponseInputTool,
OpenAIResponseInputToolMCP,
@ -35,6 +37,10 @@ from llama_stack.apis.agents.openai_responses import (
OpenAIResponseObjectStreamResponseOutputItemAdded,
OpenAIResponseObjectStreamResponseOutputItemDone,
OpenAIResponseObjectStreamResponseOutputTextDelta,
OpenAIResponseObjectStreamResponseReasoningTextDelta,
OpenAIResponseObjectStreamResponseReasoningTextDone,
OpenAIResponseObjectStreamResponseRefusalDelta,
OpenAIResponseObjectStreamResponseRefusalDone,
OpenAIResponseOutput,
OpenAIResponseOutputMessageFunctionToolCall,
OpenAIResponseOutputMessageMCPListTools,
@ -353,6 +359,128 @@ class StreamingResponseOrchestrator:
),
)
async def _handle_reasoning_content_chunk(
    self,
    reasoning_content: str,
    reasoning_part_emitted: bool,
    reasoning_content_index: int,
    message_item_id: str,
    message_output_index: int,
) -> AsyncIterator[OpenAIResponseObjectStream]:
    """Stream a single reasoning-text delta.

    On the very first chunk (``reasoning_part_emitted`` is False) a
    ``content_part.added`` event is emitted before the delta so clients can
    open the reasoning part. Increments ``self.sequence_number`` once per
    emitted event.
    """
    if not reasoning_part_emitted:
        # Open the reasoning content part; its text arrives via the deltas.
        self.sequence_number += 1
        opening_part = OpenAIResponseContentPartReasoningText(text="")
        yield OpenAIResponseObjectStreamResponseContentPartAdded(
            content_index=reasoning_content_index,
            response_id=self.response_id,
            item_id=message_item_id,
            output_index=message_output_index,
            part=opening_part,
            sequence_number=self.sequence_number,
        )

    # Forward the incremental reasoning text.
    self.sequence_number += 1
    yield OpenAIResponseObjectStreamResponseReasoningTextDelta(
        content_index=reasoning_content_index,
        delta=reasoning_content,
        item_id=message_item_id,
        output_index=message_output_index,
        sequence_number=self.sequence_number,
    )
async def _handle_refusal_content_chunk(
    self,
    refusal_content: str,
    refusal_part_emitted: bool,
    refusal_content_index: int,
    message_item_id: str,
    message_output_index: int,
) -> AsyncIterator[OpenAIResponseObjectStream]:
    """Stream a single refusal-text delta.

    On the very first chunk (``refusal_part_emitted`` is False) a
    ``content_part.added`` event is emitted before the delta so clients can
    open the refusal part. Increments ``self.sequence_number`` once per
    emitted event.
    """
    if not refusal_part_emitted:
        # Open the refusal content part; its text arrives via the deltas.
        self.sequence_number += 1
        opening_part = OpenAIResponseContentPartRefusal(refusal="")
        yield OpenAIResponseObjectStreamResponseContentPartAdded(
            content_index=refusal_content_index,
            response_id=self.response_id,
            item_id=message_item_id,
            output_index=message_output_index,
            part=opening_part,
            sequence_number=self.sequence_number,
        )

    # Forward the incremental refusal text.
    self.sequence_number += 1
    yield OpenAIResponseObjectStreamResponseRefusalDelta(
        content_index=refusal_content_index,
        delta=refusal_content,
        item_id=message_item_id,
        output_index=message_output_index,
        sequence_number=self.sequence_number,
    )
async def _emit_reasoning_done_events(
    self,
    reasoning_text_accumulated: list[str],
    reasoning_content_index: int,
    message_item_id: str,
    message_output_index: int,
) -> AsyncIterator[OpenAIResponseObjectStream]:
    """Finalize a streamed reasoning part.

    Joins the accumulated delta fragments and emits, in order, a
    ``reasoning_text.done`` event followed by a ``content_part.done`` event.
    Increments ``self.sequence_number`` once per emitted event.
    """
    full_text = "".join(reasoning_text_accumulated)

    # Signal that the reasoning text itself is complete.
    self.sequence_number += 1
    yield OpenAIResponseObjectStreamResponseReasoningTextDone(
        content_index=reasoning_content_index,
        text=full_text,
        item_id=message_item_id,
        output_index=message_output_index,
        sequence_number=self.sequence_number,
    )

    # Close the enclosing reasoning content part with the final text.
    self.sequence_number += 1
    final_part = OpenAIResponseContentPartReasoningText(text=full_text)
    yield OpenAIResponseObjectStreamResponseContentPartDone(
        content_index=reasoning_content_index,
        response_id=self.response_id,
        item_id=message_item_id,
        output_index=message_output_index,
        part=final_part,
        sequence_number=self.sequence_number,
    )
async def _emit_refusal_done_events(
    self,
    refusal_text_accumulated: list[str],
    refusal_content_index: int,
    message_item_id: str,
    message_output_index: int,
) -> AsyncIterator[OpenAIResponseObjectStream]:
    """Finalize a streamed refusal part.

    Joins the accumulated delta fragments and emits, in order, a
    ``refusal.done`` event followed by a ``content_part.done`` event.
    Increments ``self.sequence_number`` once per emitted event.
    """
    full_text = "".join(refusal_text_accumulated)

    # Signal that the refusal text itself is complete.
    self.sequence_number += 1
    yield OpenAIResponseObjectStreamResponseRefusalDone(
        content_index=refusal_content_index,
        refusal=full_text,
        item_id=message_item_id,
        output_index=message_output_index,
        sequence_number=self.sequence_number,
    )

    # Close the enclosing refusal content part with the final text.
    self.sequence_number += 1
    final_part = OpenAIResponseContentPartRefusal(refusal=full_text)
    yield OpenAIResponseObjectStreamResponseContentPartDone(
        content_index=refusal_content_index,
        response_id=self.response_id,
        item_id=message_item_id,
        output_index=message_output_index,
        part=final_part,
        sequence_number=self.sequence_number,
    )
async def _process_streaming_chunks(
self, completion_result, output_messages: list[OpenAIResponseOutput]
) -> AsyncIterator[OpenAIResponseObjectStream | ChatCompletionResult]:
@ -371,8 +499,14 @@ class StreamingResponseOrchestrator:
tool_call_item_ids: dict[int, str] = {}
# Track content parts for streaming events
content_part_emitted = False
reasoning_part_emitted = False
refusal_part_emitted = False
content_index = 0
reasoning_content_index = 1 # reasoning is a separate content part
refusal_content_index = 2 # refusal is a separate content part
message_output_index = len(output_messages)
reasoning_text_accumulated = []
refusal_text_accumulated = []
async for chunk in completion_result:
chat_response_id = chunk.id
@ -413,6 +547,32 @@ class StreamingResponseOrchestrator:
if chunk_choice.finish_reason:
chunk_finish_reason = chunk_choice.finish_reason
# Handle reasoning content if present (non-standard field for o1/o3 models)
if hasattr(chunk_choice.delta, "reasoning_content") and chunk_choice.delta.reasoning_content:
async for event in self._handle_reasoning_content_chunk(
reasoning_content=chunk_choice.delta.reasoning_content,
reasoning_part_emitted=reasoning_part_emitted,
reasoning_content_index=reasoning_content_index,
message_item_id=message_item_id,
message_output_index=message_output_index,
):
yield event
reasoning_part_emitted = True
reasoning_text_accumulated.append(chunk_choice.delta.reasoning_content)
# Handle refusal content if present
if chunk_choice.delta.refusal:
async for event in self._handle_refusal_content_chunk(
refusal_content=chunk_choice.delta.refusal,
refusal_part_emitted=refusal_part_emitted,
refusal_content_index=refusal_content_index,
message_item_id=message_item_id,
message_output_index=message_output_index,
):
yield event
refusal_part_emitted = True
refusal_text_accumulated.append(chunk_choice.delta.refusal)
# Aggregate tool call arguments across chunks
if chunk_choice.delta.tool_calls:
for tool_call in chunk_choice.delta.tool_calls:
@ -514,6 +674,26 @@ class StreamingResponseOrchestrator:
sequence_number=self.sequence_number,
)
# Emit reasoning done events if reasoning content was streamed
if reasoning_part_emitted:
async for event in self._emit_reasoning_done_events(
reasoning_text_accumulated=reasoning_text_accumulated,
reasoning_content_index=reasoning_content_index,
message_item_id=message_item_id,
message_output_index=message_output_index,
):
yield event
# Emit refusal done events if refusal content was streamed
if refusal_part_emitted:
async for event in self._emit_refusal_done_events(
refusal_text_accumulated=refusal_text_accumulated,
refusal_content_index=refusal_content_index,
message_item_id=message_item_id,
message_output_index=message_output_index,
):
yield event
# Clear content when there are tool calls (OpenAI spec behavior)
if chat_response_tool_calls:
chat_response_content = []