diff --git a/docs/source/distributions/k8s-benchmark/openai-mock-server.py b/docs/source/distributions/k8s-benchmark/openai-mock-server.py old mode 100644 new mode 100755 diff --git a/llama_stack/providers/inline/agents/meta_reference/agents.py b/llama_stack/providers/inline/agents/meta_reference/agents.py index 0f12a0865..30196c429 100644 --- a/llama_stack/providers/inline/agents/meta_reference/agents.py +++ b/llama_stack/providers/inline/agents/meta_reference/agents.py @@ -48,8 +48,8 @@ from llama_stack.providers.utils.responses.responses_store import ResponsesStore from .agent_instance import ChatAgent from .config import MetaReferenceAgentsImplConfig -from .openai_responses import OpenAIResponsesImpl from .persistence import AgentInfo +from .responses.openai_responses import OpenAIResponsesImpl logger = logging.getLogger() diff --git a/llama_stack/providers/inline/agents/meta_reference/openai_responses.py b/llama_stack/providers/inline/agents/meta_reference/openai_responses.py deleted file mode 100644 index 6aca4d68e..000000000 --- a/llama_stack/providers/inline/agents/meta_reference/openai_responses.py +++ /dev/null @@ -1,1154 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the terms described in the LICENSE file in -# the root directory of this source tree. - -import asyncio -import json -import time -import uuid -from collections.abc import AsyncIterator -from typing import Any - -from openai.types.chat import ChatCompletionToolParam -from pydantic import BaseModel - -from llama_stack.apis.agents import Order -from llama_stack.apis.agents.openai_responses import ( - AllowedToolsFilter, - ListOpenAIResponseInputItem, - ListOpenAIResponseObject, - OpenAIDeleteResponseObject, - OpenAIResponseContentPartOutputText, - OpenAIResponseInput, - OpenAIResponseInputFunctionToolCallOutput, - OpenAIResponseInputMessageContent, - OpenAIResponseInputMessageContentImage, - OpenAIResponseInputMessageContentText, - OpenAIResponseInputTool, - OpenAIResponseInputToolFileSearch, - OpenAIResponseInputToolMCP, - OpenAIResponseMessage, - OpenAIResponseObject, - OpenAIResponseObjectStream, - OpenAIResponseObjectStreamResponseCompleted, - OpenAIResponseObjectStreamResponseContentPartAdded, - OpenAIResponseObjectStreamResponseContentPartDone, - OpenAIResponseObjectStreamResponseCreated, - OpenAIResponseObjectStreamResponseFunctionCallArgumentsDelta, - OpenAIResponseObjectStreamResponseFunctionCallArgumentsDone, - OpenAIResponseObjectStreamResponseMcpCallArgumentsDelta, - OpenAIResponseObjectStreamResponseMcpCallArgumentsDone, - OpenAIResponseObjectStreamResponseMcpCallCompleted, - OpenAIResponseObjectStreamResponseMcpCallFailed, - OpenAIResponseObjectStreamResponseMcpCallInProgress, - OpenAIResponseObjectStreamResponseOutputItemAdded, - OpenAIResponseObjectStreamResponseOutputItemDone, - OpenAIResponseObjectStreamResponseOutputTextDelta, - OpenAIResponseObjectStreamResponseWebSearchCallCompleted, - OpenAIResponseObjectStreamResponseWebSearchCallInProgress, - OpenAIResponseObjectStreamResponseWebSearchCallSearching, - OpenAIResponseOutput, - OpenAIResponseOutputMessageContent, - OpenAIResponseOutputMessageContentOutputText, - OpenAIResponseOutputMessageFileSearchToolCall, - OpenAIResponseOutputMessageFileSearchToolCallResults, - OpenAIResponseOutputMessageFunctionToolCall, - OpenAIResponseOutputMessageMCPListTools, - OpenAIResponseOutputMessageWebSearchToolCall, - OpenAIResponseText, - OpenAIResponseTextFormat, - WebSearchToolTypes, -) -from llama_stack.apis.common.content_types import TextContentItem -from llama_stack.apis.inference import ( - Inference, - OpenAIAssistantMessageParam, - OpenAIChatCompletion, - OpenAIChatCompletionContentPartImageParam, - OpenAIChatCompletionContentPartParam, - OpenAIChatCompletionContentPartTextParam, - OpenAIChatCompletionToolCall, - OpenAIChatCompletionToolCallFunction, - OpenAIChoice, - OpenAIDeveloperMessageParam, - OpenAIImageURL, - OpenAIJSONSchema, - OpenAIMessageParam, - OpenAIResponseFormatJSONObject, - OpenAIResponseFormatJSONSchema, - OpenAIResponseFormatParam, - OpenAIResponseFormatText, - OpenAISystemMessageParam, - OpenAIToolMessageParam, - OpenAIUserMessageParam, -) -from llama_stack.apis.tools import ToolGroups, ToolInvocationResult, ToolRuntime -from llama_stack.apis.vector_io import VectorIO -from llama_stack.log import get_logger -from llama_stack.models.llama.datatypes import ToolDefinition, ToolParamDefinition -from llama_stack.providers.utils.inference.openai_compat import ( - convert_tooldef_to_openai_tool, -) -from llama_stack.providers.utils.responses.responses_store import ResponsesStore - -logger = get_logger(name=__name__, category="openai_responses") - -OPENAI_RESPONSES_PREFIX = "openai_responses:" - - -class ToolExecutionResult(BaseModel): - """Result of streaming tool execution.""" - - stream_event: OpenAIResponseObjectStream | None = None - sequence_number: int - final_output_message: OpenAIResponseOutput | None = None - final_input_message: OpenAIMessageParam | None = None - - -async def _convert_response_content_to_chat_content( - content: (str | list[OpenAIResponseInputMessageContent] | list[OpenAIResponseOutputMessageContent]), -) -> str | list[OpenAIChatCompletionContentPartParam]: - """ - Convert the content parts from an OpenAI Response API request into OpenAI Chat Completion content parts. - - The content schemas of each API look similar, but are not exactly the same. - """ - if isinstance(content, str): - return content - - converted_parts = [] - for content_part in content: - if isinstance(content_part, OpenAIResponseInputMessageContentText): - converted_parts.append(OpenAIChatCompletionContentPartTextParam(text=content_part.text)) - elif isinstance(content_part, OpenAIResponseOutputMessageContentOutputText): - converted_parts.append(OpenAIChatCompletionContentPartTextParam(text=content_part.text)) - elif isinstance(content_part, OpenAIResponseInputMessageContentImage): - if content_part.image_url: - image_url = OpenAIImageURL(url=content_part.image_url, detail=content_part.detail) - converted_parts.append(OpenAIChatCompletionContentPartImageParam(image_url=image_url)) - elif isinstance(content_part, str): - converted_parts.append(OpenAIChatCompletionContentPartTextParam(text=content_part)) - else: - raise ValueError( - f"Llama Stack OpenAI Responses does not yet support content type '{type(content_part)}' in this context" - ) - return converted_parts - - -async def _convert_response_input_to_chat_messages( - input: str | list[OpenAIResponseInput], -) -> list[OpenAIMessageParam]: - """ - Convert the input from an OpenAI Response API request into OpenAI Chat Completion messages. - """ - messages: list[OpenAIMessageParam] = [] - if isinstance(input, list): - for input_item in input: - if isinstance(input_item, OpenAIResponseInputFunctionToolCallOutput): - messages.append( - OpenAIToolMessageParam( - content=input_item.output, - tool_call_id=input_item.call_id, - ) - ) - elif isinstance(input_item, OpenAIResponseOutputMessageFunctionToolCall): - tool_call = OpenAIChatCompletionToolCall( - index=0, - id=input_item.call_id, - function=OpenAIChatCompletionToolCallFunction( - name=input_item.name, - arguments=input_item.arguments, - ), - ) - messages.append(OpenAIAssistantMessageParam(tool_calls=[tool_call])) - else: - content = await _convert_response_content_to_chat_content(input_item.content) - message_type = await _get_message_type_by_role(input_item.role) - if message_type is None: - raise ValueError( - f"Llama Stack OpenAI Responses does not yet support message role '{input_item.role}' in this context" - ) - messages.append(message_type(content=content)) - else: - messages.append(OpenAIUserMessageParam(content=input)) - return messages - - -async def _convert_chat_choice_to_response_message( - choice: OpenAIChoice, -) -> OpenAIResponseMessage: - """ - Convert an OpenAI Chat Completion choice into an OpenAI Response output message. - """ - output_content = "" - if isinstance(choice.message.content, str): - output_content = choice.message.content - elif isinstance(choice.message.content, OpenAIChatCompletionContentPartTextParam): - output_content = choice.message.content.text - else: - raise ValueError( - f"Llama Stack OpenAI Responses does not yet support output content type: {type(choice.message.content)}" - ) - - return OpenAIResponseMessage( - id=f"msg_{uuid.uuid4()}", - content=[OpenAIResponseOutputMessageContentOutputText(text=output_content)], - status="completed", - role="assistant", - ) - - -async def _convert_response_text_to_chat_response_format( - text: OpenAIResponseText, -) -> OpenAIResponseFormatParam: - """ - Convert an OpenAI Response text parameter into an OpenAI Chat Completion response format. - """ - if not text.format or text.format["type"] == "text": - return OpenAIResponseFormatText(type="text") - if text.format["type"] == "json_object": - return OpenAIResponseFormatJSONObject() - if text.format["type"] == "json_schema": - return OpenAIResponseFormatJSONSchema( - json_schema=OpenAIJSONSchema(name=text.format["name"], schema=text.format["schema"]) - ) - raise ValueError(f"Unsupported text format: {text.format}") - - -async def _get_message_type_by_role(role: str): - role_to_type = { - "user": OpenAIUserMessageParam, - "system": OpenAISystemMessageParam, - "assistant": OpenAIAssistantMessageParam, - "developer": OpenAIDeveloperMessageParam, - } - return role_to_type.get(role) - - -class OpenAIResponsePreviousResponseWithInputItems(BaseModel): - input_items: ListOpenAIResponseInputItem - response: OpenAIResponseObject - - -class ChatCompletionContext(BaseModel): - model: str - messages: list[OpenAIMessageParam] - response_tools: list[OpenAIResponseInputTool] | None = None - chat_tools: list[ChatCompletionToolParam] | None = None - mcp_tool_to_server: dict[str, OpenAIResponseInputToolMCP] - temperature: float | None - response_format: OpenAIResponseFormatParam - - -class OpenAIResponsesImpl: - def __init__( - self, - inference_api: Inference, - tool_groups_api: ToolGroups, - tool_runtime_api: ToolRuntime, - responses_store: ResponsesStore, - vector_io_api: VectorIO, # VectorIO - ): - self.inference_api = inference_api - self.tool_groups_api = tool_groups_api - self.tool_runtime_api = tool_runtime_api - self.responses_store = responses_store - self.vector_io_api = vector_io_api - - async def _prepend_previous_response( - self, - input: str | list[OpenAIResponseInput], - previous_response_id: str | None = None, - ): - if previous_response_id: - previous_response_with_input = await self.responses_store.get_response_object(previous_response_id) - - # previous response input items - new_input_items = previous_response_with_input.input - - # previous response output items - new_input_items.extend(previous_response_with_input.output) - - # new input items from the current request - if isinstance(input, str): - new_input_items.append(OpenAIResponseMessage(content=input, role="user")) - else: - new_input_items.extend(input) - - input = new_input_items - - return input - - async def _prepend_instructions(self, messages, instructions): - if instructions: - messages.insert(0, OpenAISystemMessageParam(content=instructions)) - - async def get_openai_response( - self, - response_id: str, - ) -> OpenAIResponseObject: - response_with_input = await self.responses_store.get_response_object(response_id) - return OpenAIResponseObject(**{k: v for k, v in response_with_input.model_dump().items() if k != "input"}) - - async def list_openai_responses( - self, - after: str | None = None, - limit: int | None = 50, - model: str | None = None, - order: Order | None = Order.desc, - ) -> ListOpenAIResponseObject: - return await self.responses_store.list_responses(after, limit, model, order) - - async def list_openai_response_input_items( - self, - response_id: str, - after: str | None = None, - before: str | None = None, - include: list[str] | None = None, - limit: int | None = 20, - order: Order | None = Order.desc, - ) -> ListOpenAIResponseInputItem: - """List input items for a given OpenAI response. - - :param response_id: The ID of the response to retrieve input items for. - :param after: An item ID to list items after, used for pagination. - :param before: An item ID to list items before, used for pagination. - :param include: Additional fields to include in the response. - :param limit: A limit on the number of objects to be returned. - :param order: The order to return the input items in. - :returns: An ListOpenAIResponseInputItem. - """ - return await self.responses_store.list_response_input_items(response_id, after, before, include, limit, order) - - async def _store_response( - self, - response: OpenAIResponseObject, - input: str | list[OpenAIResponseInput], - ) -> None: - new_input_id = f"msg_{uuid.uuid4()}" - if isinstance(input, str): - # synthesize a message from the input string - input_content = OpenAIResponseInputMessageContentText(text=input) - input_content_item = OpenAIResponseMessage( - role="user", - content=[input_content], - id=new_input_id, - ) - input_items_data = [input_content_item] - else: - # we already have a list of messages - input_items_data = [] - for input_item in input: - if isinstance(input_item, OpenAIResponseMessage): - # These may or may not already have an id, so dump to dict, check for id, and add if missing - input_item_dict = input_item.model_dump() - if "id" not in input_item_dict: - input_item_dict["id"] = new_input_id - input_items_data.append(OpenAIResponseMessage(**input_item_dict)) - else: - input_items_data.append(input_item) - - await self.responses_store.store_response_object( - response_object=response, - input=input_items_data, - ) - - async def create_openai_response( - self, - input: str | list[OpenAIResponseInput], - model: str, - instructions: str | None = None, - previous_response_id: str | None = None, - store: bool | None = True, - stream: bool | None = False, - temperature: float | None = None, - text: OpenAIResponseText | None = None, - tools: list[OpenAIResponseInputTool] | None = None, - include: list[str] | None = None, - max_infer_iters: int | None = 10, - ): - stream = bool(stream) - text = OpenAIResponseText(format=OpenAIResponseTextFormat(type="text")) if text is None else text - - stream_gen = self._create_streaming_response( - input=input, - model=model, - instructions=instructions, - previous_response_id=previous_response_id, - store=store, - temperature=temperature, - text=text, - tools=tools, - max_infer_iters=max_infer_iters, - ) - - if stream: - return stream_gen - else: - response = None - async for stream_chunk in stream_gen: - if stream_chunk.type == "response.completed": - if response is not None: - raise ValueError("The response stream completed multiple times! Earlier response: {response}") - response = stream_chunk.response - # don't leave the generator half complete! - - if response is None: - raise ValueError("The response stream never completed") - return response - - async def _create_streaming_response( - self, - input: str | list[OpenAIResponseInput], - model: str, - instructions: str | None = None, - previous_response_id: str | None = None, - store: bool | None = True, - temperature: float | None = None, - text: OpenAIResponseText | None = None, - tools: list[OpenAIResponseInputTool] | None = None, - max_infer_iters: int | None = 10, - ) -> AsyncIterator[OpenAIResponseObjectStream]: - output_messages: list[OpenAIResponseOutput] = [] - - # Input preprocessing - input = await self._prepend_previous_response(input, previous_response_id) - messages = await _convert_response_input_to_chat_messages(input) - await self._prepend_instructions(messages, instructions) - - # Structured outputs - response_format = await _convert_response_text_to_chat_response_format(text) - - # Tool setup, TODO: refactor this slightly since this can also yield events - chat_tools, mcp_tool_to_server, mcp_list_message = ( - await self._convert_response_tools_to_chat_tools(tools) if tools else (None, {}, None) - ) - if mcp_list_message: - output_messages.append(mcp_list_message) - - ctx = ChatCompletionContext( - model=model, - messages=messages, - response_tools=tools, - chat_tools=chat_tools, - mcp_tool_to_server=mcp_tool_to_server, - temperature=temperature, - response_format=response_format, - ) - - # Create initial response and emit response.created immediately - response_id = f"resp-{uuid.uuid4()}" - created_at = int(time.time()) - - initial_response = OpenAIResponseObject( - created_at=created_at, - id=response_id, - model=model, - object="response", - status="in_progress", - output=output_messages.copy(), - text=text, - ) - - yield OpenAIResponseObjectStreamResponseCreated(response=initial_response) - - n_iter = 0 - messages = ctx.messages.copy() - - while True: - completion_result = await self.inference_api.openai_chat_completion( - model=ctx.model, - messages=messages, - tools=ctx.chat_tools, - stream=True, - temperature=ctx.temperature, - response_format=ctx.response_format, - ) - - # Process streaming chunks and build complete response - chat_response_id = "" - chat_response_content = [] - chat_response_tool_calls: dict[int, OpenAIChatCompletionToolCall] = {} - chunk_created = 0 - chunk_model = "" - chunk_finish_reason = "" - sequence_number = 0 - - # Create a placeholder message item for delta events - message_item_id = f"msg_{uuid.uuid4()}" - # Track tool call items for streaming events - tool_call_item_ids: dict[int, str] = {} - # Track content parts for streaming events - content_part_emitted = False - - async for chunk in completion_result: - chat_response_id = chunk.id - chunk_created = chunk.created - chunk_model = chunk.model - for chunk_choice in chunk.choices: - # Emit incremental text content as delta events - if chunk_choice.delta.content: - # Emit content_part.added event for first text chunk - if not content_part_emitted: - content_part_emitted = True - sequence_number += 1 - yield OpenAIResponseObjectStreamResponseContentPartAdded( - response_id=response_id, - item_id=message_item_id, - part=OpenAIResponseContentPartOutputText( - text="", # Will be filled incrementally via text deltas - ), - sequence_number=sequence_number, - ) - sequence_number += 1 - yield OpenAIResponseObjectStreamResponseOutputTextDelta( - content_index=0, - delta=chunk_choice.delta.content, - item_id=message_item_id, - output_index=0, - sequence_number=sequence_number, - ) - - # Collect content for final response - chat_response_content.append(chunk_choice.delta.content or "") - if chunk_choice.finish_reason: - chunk_finish_reason = chunk_choice.finish_reason - - # Aggregate tool call arguments across chunks - if chunk_choice.delta.tool_calls: - for tool_call in chunk_choice.delta.tool_calls: - response_tool_call = chat_response_tool_calls.get(tool_call.index, None) - # Create new tool call entry if this is the first chunk for this index - is_new_tool_call = response_tool_call is None - if is_new_tool_call: - tool_call_dict: dict[str, Any] = tool_call.model_dump() - tool_call_dict.pop("type", None) - response_tool_call = OpenAIChatCompletionToolCall(**tool_call_dict) - chat_response_tool_calls[tool_call.index] = response_tool_call - - # Create item ID for this tool call for streaming events - tool_call_item_id = f"fc_{uuid.uuid4()}" - tool_call_item_ids[tool_call.index] = tool_call_item_id - - # Emit output_item.added event for the new function call - sequence_number += 1 - function_call_item = OpenAIResponseOutputMessageFunctionToolCall( - arguments="", # Will be filled incrementally via delta events - call_id=tool_call.id or "", - name=tool_call.function.name if tool_call.function else "", - id=tool_call_item_id, - status="in_progress", - ) - yield OpenAIResponseObjectStreamResponseOutputItemAdded( - response_id=response_id, - item=function_call_item, - output_index=len(output_messages), - sequence_number=sequence_number, - ) - - # Stream tool call arguments as they arrive (differentiate between MCP and function calls) - if tool_call.function and tool_call.function.arguments: - tool_call_item_id = tool_call_item_ids[tool_call.index] - sequence_number += 1 - - # Check if this is an MCP tool call - is_mcp_tool = ( - ctx.mcp_tool_to_server - and tool_call.function.name - and tool_call.function.name in ctx.mcp_tool_to_server - ) - if is_mcp_tool: - # Emit MCP-specific argument delta event - yield OpenAIResponseObjectStreamResponseMcpCallArgumentsDelta( - delta=tool_call.function.arguments, - item_id=tool_call_item_id, - output_index=len(output_messages), - sequence_number=sequence_number, - ) - else: - # Emit function call argument delta event - yield OpenAIResponseObjectStreamResponseFunctionCallArgumentsDelta( - delta=tool_call.function.arguments, - item_id=tool_call_item_id, - output_index=len(output_messages), - sequence_number=sequence_number, - ) - - # Accumulate arguments for final response (only for subsequent chunks) - if not is_new_tool_call: - response_tool_call.function.arguments = ( - response_tool_call.function.arguments or "" - ) + tool_call.function.arguments - - # Emit arguments.done events for completed tool calls (differentiate between MCP and function calls) - for tool_call_index in sorted(chat_response_tool_calls.keys()): - tool_call_item_id = tool_call_item_ids[tool_call_index] - final_arguments = chat_response_tool_calls[tool_call_index].function.arguments or "" - tool_call_name = chat_response_tool_calls[tool_call_index].function.name - - # Check if this is an MCP tool call - is_mcp_tool = ctx.mcp_tool_to_server and tool_call_name and tool_call_name in ctx.mcp_tool_to_server - sequence_number += 1 - if is_mcp_tool: - # Emit MCP-specific argument done event - yield OpenAIResponseObjectStreamResponseMcpCallArgumentsDone( - arguments=final_arguments, - item_id=tool_call_item_id, - output_index=len(output_messages), - sequence_number=sequence_number, - ) - else: - # Emit function call argument done event - yield OpenAIResponseObjectStreamResponseFunctionCallArgumentsDone( - arguments=final_arguments, - item_id=tool_call_item_id, - output_index=len(output_messages), - sequence_number=sequence_number, - ) - - # Convert collected chunks to complete response - if chat_response_tool_calls: - tool_calls = [chat_response_tool_calls[i] for i in sorted(chat_response_tool_calls.keys())] - else: - tool_calls = None - - # Emit content_part.done event if text content was streamed (before content gets cleared) - if content_part_emitted: - final_text = "".join(chat_response_content) - sequence_number += 1 - yield OpenAIResponseObjectStreamResponseContentPartDone( - response_id=response_id, - item_id=message_item_id, - part=OpenAIResponseContentPartOutputText( - text=final_text, - ), - sequence_number=sequence_number, - ) - - # Clear content when there are tool calls (OpenAI spec behavior) - if chat_response_tool_calls: - chat_response_content = [] - - assistant_message = OpenAIAssistantMessageParam( - content="".join(chat_response_content), - tool_calls=tool_calls, - ) - current_response = OpenAIChatCompletion( - id=chat_response_id, - choices=[ - OpenAIChoice( - message=assistant_message, - finish_reason=chunk_finish_reason, - index=0, - ) - ], - created=chunk_created, - model=chunk_model, - ) - - function_tool_calls = [] - non_function_tool_calls = [] - - next_turn_messages = messages.copy() - for choice in current_response.choices: - next_turn_messages.append(choice.message) - - if choice.message.tool_calls and tools: - for tool_call in choice.message.tool_calls: - if _is_function_tool_call(tool_call, tools): - function_tool_calls.append(tool_call) - else: - non_function_tool_calls.append(tool_call) - else: - output_messages.append(await _convert_chat_choice_to_response_message(choice)) - - # execute non-function tool calls - for tool_call in non_function_tool_calls: - # Find the item_id for this tool call - matching_item_id = None - for index, item_id in tool_call_item_ids.items(): - response_tool_call = chat_response_tool_calls.get(index) - if response_tool_call and response_tool_call.id == tool_call.id: - matching_item_id = item_id - break - - # Use a fallback item_id if not found - if not matching_item_id: - matching_item_id = f"tc_{uuid.uuid4()}" - - # Execute tool call with streaming - tool_call_log = None - tool_response_message = None - async for result in self._execute_tool_call( - tool_call, ctx, sequence_number, response_id, len(output_messages), matching_item_id - ): - if result.stream_event: - # Forward streaming events - sequence_number = result.sequence_number - yield result.stream_event - - if result.final_output_message is not None: - tool_call_log = result.final_output_message - tool_response_message = result.final_input_message - sequence_number = result.sequence_number - - if tool_call_log: - output_messages.append(tool_call_log) - - # Emit output_item.done event for completed non-function tool call - if matching_item_id: - sequence_number += 1 - yield OpenAIResponseObjectStreamResponseOutputItemDone( - response_id=response_id, - item=tool_call_log, - output_index=len(output_messages) - 1, - sequence_number=sequence_number, - ) - - if tool_response_message: - next_turn_messages.append(tool_response_message) - - for tool_call in function_tool_calls: - # Find the item_id for this tool call from our tracking dictionary - matching_item_id = None - for index, item_id in tool_call_item_ids.items(): - response_tool_call = chat_response_tool_calls.get(index) - if response_tool_call and response_tool_call.id == tool_call.id: - matching_item_id = item_id - break - - # Use existing item_id or create new one if not found - final_item_id = matching_item_id or f"fc_{uuid.uuid4()}" - - function_call_item = OpenAIResponseOutputMessageFunctionToolCall( - arguments=tool_call.function.arguments or "", - call_id=tool_call.id, - name=tool_call.function.name or "", - id=final_item_id, - status="completed", - ) - output_messages.append(function_call_item) - - # Emit output_item.done event for completed function call - sequence_number += 1 - yield OpenAIResponseObjectStreamResponseOutputItemDone( - response_id=response_id, - item=function_call_item, - output_index=len(output_messages) - 1, - sequence_number=sequence_number, - ) - - if not function_tool_calls and not non_function_tool_calls: - break - - if function_tool_calls: - logger.info("Exiting inference loop since there is a function (client-side) tool call") - break - - n_iter += 1 - if n_iter >= max_infer_iters: - logger.info(f"Exiting inference loop since iteration count({n_iter}) exceeds {max_infer_iters=}") - break - - messages = next_turn_messages - - # Create final response - final_response = OpenAIResponseObject( - created_at=created_at, - id=response_id, - model=model, - object="response", - status="completed", - text=text, - output=output_messages, - ) - - # Emit response.completed - yield OpenAIResponseObjectStreamResponseCompleted(response=final_response) - - if store: - await self._store_response( - response=final_response, - input=input, - ) - - async def delete_openai_response(self, response_id: str) -> OpenAIDeleteResponseObject: - return await self.responses_store.delete_response_object(response_id) - - async def _convert_response_tools_to_chat_tools( - self, tools: list[OpenAIResponseInputTool] - ) -> tuple[ - list[ChatCompletionToolParam], - dict[str, OpenAIResponseInputToolMCP], - OpenAIResponseOutput | None, - ]: - from llama_stack.apis.agents.openai_responses import ( - MCPListToolsTool, - ) - from llama_stack.apis.tools import Tool - - mcp_tool_to_server = {} - - def make_openai_tool(tool_name: str, tool: Tool) -> ChatCompletionToolParam: - tool_def = ToolDefinition( - tool_name=tool_name, - description=tool.description, - parameters={ - param.name: ToolParamDefinition( - param_type=param.parameter_type, - description=param.description, - required=param.required, - default=param.default, - ) - for param in tool.parameters - }, - ) - return convert_tooldef_to_openai_tool(tool_def) - - mcp_list_message = None - chat_tools: list[ChatCompletionToolParam] = [] - for input_tool in tools: - # TODO: Handle other tool types - if input_tool.type == "function": - chat_tools.append(ChatCompletionToolParam(type="function", function=input_tool.model_dump())) - elif input_tool.type in WebSearchToolTypes: - tool_name = "web_search" - tool = await self.tool_groups_api.get_tool(tool_name) - if not tool: - raise ValueError(f"Tool {tool_name} not found") - chat_tools.append(make_openai_tool(tool_name, tool)) - elif input_tool.type == "file_search": - tool_name = "knowledge_search" - tool = await self.tool_groups_api.get_tool(tool_name) - if not tool: - raise ValueError(f"Tool {tool_name} not found") - chat_tools.append(make_openai_tool(tool_name, tool)) - elif input_tool.type == "mcp": - from llama_stack.providers.utils.tools.mcp import list_mcp_tools - - always_allowed = None - never_allowed = None - if input_tool.allowed_tools: - if isinstance(input_tool.allowed_tools, list): - always_allowed = input_tool.allowed_tools - elif isinstance(input_tool.allowed_tools, AllowedToolsFilter): - always_allowed = input_tool.allowed_tools.always - never_allowed = input_tool.allowed_tools.never - - tool_defs = await list_mcp_tools( - endpoint=input_tool.server_url, - headers=input_tool.headers or {}, - ) - - mcp_list_message = OpenAIResponseOutputMessageMCPListTools( - id=f"mcp_list_{uuid.uuid4()}", - status="completed", - server_label=input_tool.server_label, - tools=[], - ) - for t in tool_defs.data: - if never_allowed and t.name in never_allowed: - continue - if not always_allowed or t.name in always_allowed: - chat_tools.append(make_openai_tool(t.name, t)) - if t.name in mcp_tool_to_server: - raise ValueError(f"Duplicate tool name {t.name} found for server {input_tool.server_label}") - mcp_tool_to_server[t.name] = input_tool - mcp_list_message.tools.append( - MCPListToolsTool( - name=t.name, - description=t.description, - input_schema={ - "type": "object", - "properties": { - p.name: { - "type": p.parameter_type, - "description": p.description, - } - for p in t.parameters - }, - "required": [p.name for p in t.parameters if p.required], - }, - ) - ) - else: - raise ValueError(f"Llama Stack OpenAI Responses does not yet support tool type: {input_tool.type}") - return chat_tools, mcp_tool_to_server, mcp_list_message - - async def _execute_knowledge_search_via_vector_store( - self, - query: str, - response_file_search_tool: OpenAIResponseInputToolFileSearch, - ) -> ToolInvocationResult: - """Execute knowledge search using vector_stores.search API with filters support.""" - search_results = [] - - # Create search tasks for all vector stores - async def search_single_store(vector_store_id): - try: - search_response = await self.vector_io_api.openai_search_vector_store( - vector_store_id=vector_store_id, - query=query, - filters=response_file_search_tool.filters, - max_num_results=response_file_search_tool.max_num_results, - ranking_options=response_file_search_tool.ranking_options, - rewrite_query=False, - ) - return search_response.data - except Exception as e: - logger.warning(f"Failed to search vector store {vector_store_id}: {e}") - return [] - - # Run all searches in parallel using gather - search_tasks = [search_single_store(vid) for vid in response_file_search_tool.vector_store_ids] - all_results = await asyncio.gather(*search_tasks) - - # Flatten results - for results in all_results: - search_results.extend(results) - - # Convert search results to tool result format matching memory.py - # Format the results as interleaved content similar to memory.py - content_items = [] - content_items.append( - TextContentItem( - text=f"knowledge_search tool found {len(search_results)} chunks:\nBEGIN of knowledge_search tool results.\n" - ) - ) - - for i, result_item in enumerate(search_results): - chunk_text = result_item.content[0].text if result_item.content else "" - metadata_text = f"document_id: {result_item.file_id}, score: {result_item.score}" - if result_item.attributes: - metadata_text += f", attributes: {result_item.attributes}" - text_content = f"[{i + 1}] {metadata_text}\n{chunk_text}\n" - content_items.append(TextContentItem(text=text_content)) - - content_items.append(TextContentItem(text="END of knowledge_search tool results.\n")) - content_items.append( - TextContentItem( - text=f'The above results were retrieved to help answer the user\'s query: "{query}". Use them as supporting information only in answering this query.\n', - ) - ) - - return ToolInvocationResult( - content=content_items, - metadata={ - "document_ids": [r.file_id for r in search_results], - "chunks": [r.content[0].text if r.content else "" for r in search_results], - "scores": [r.score for r in search_results], - }, - ) - - async def _execute_tool_call( - self, - tool_call: OpenAIChatCompletionToolCall, - ctx: ChatCompletionContext, - sequence_number: int, - response_id: str, - output_index: int, - item_id: str, - ) -> AsyncIterator[ToolExecutionResult]: - from llama_stack.providers.utils.inference.prompt_adapter import ( - interleaved_content_as_str, - ) - - tool_call_id = tool_call.id - function = tool_call.function - tool_kwargs = json.loads(function.arguments) if function.arguments else {} - - if not function or not tool_call_id or not function.name: - yield ToolExecutionResult(sequence_number=sequence_number) - return - - # Emit in_progress event based on tool type (only for tools with specific streaming events) - progress_event = None - if ctx.mcp_tool_to_server and function.name in ctx.mcp_tool_to_server: - sequence_number += 1 - progress_event = OpenAIResponseObjectStreamResponseMcpCallInProgress( - item_id=item_id, - output_index=output_index, - sequence_number=sequence_number, - ) - elif function.name == "web_search": - sequence_number += 1 - progress_event = OpenAIResponseObjectStreamResponseWebSearchCallInProgress( - item_id=item_id, - output_index=output_index, - sequence_number=sequence_number, - ) - # Note: knowledge_search and other custom tools don't have specific streaming events in OpenAI spec - - if progress_event: - yield ToolExecutionResult(stream_event=progress_event, sequence_number=sequence_number) - - # For web search, emit searching event - if function.name == "web_search": - sequence_number += 1 - searching_event = OpenAIResponseObjectStreamResponseWebSearchCallSearching( - item_id=item_id, - output_index=output_index, - sequence_number=sequence_number, - ) - yield ToolExecutionResult(stream_event=searching_event, sequence_number=sequence_number) - - # Execute the actual tool call - error_exc = None - result = None - try: - if ctx.mcp_tool_to_server and function.name in ctx.mcp_tool_to_server: - from llama_stack.providers.utils.tools.mcp import invoke_mcp_tool - - mcp_tool = ctx.mcp_tool_to_server[function.name] - result = await invoke_mcp_tool( - endpoint=mcp_tool.server_url, - headers=mcp_tool.headers or {}, - tool_name=function.name, - kwargs=tool_kwargs, - ) - elif function.name == "knowledge_search": - response_file_search_tool = next( - (t for t in ctx.response_tools if isinstance(t, OpenAIResponseInputToolFileSearch)), - None, - ) - if response_file_search_tool: - # Use vector_stores.search API instead of knowledge_search tool - # to support filters and ranking_options - query = tool_kwargs.get("query", "") - result = await self._execute_knowledge_search_via_vector_store( - query=query, - response_file_search_tool=response_file_search_tool, - ) - else: - result = await self.tool_runtime_api.invoke_tool( - tool_name=function.name, - kwargs=tool_kwargs, - ) - except Exception as e: - error_exc = e - - # Emit completion or failure event based on result (only for tools with specific streaming events) - has_error = error_exc or (result and ((result.error_code and result.error_code > 0) or result.error_message)) - completion_event = None - - if ctx.mcp_tool_to_server and function.name in ctx.mcp_tool_to_server: - sequence_number += 1 - if has_error: - completion_event = OpenAIResponseObjectStreamResponseMcpCallFailed( - sequence_number=sequence_number, - ) - else: - completion_event = OpenAIResponseObjectStreamResponseMcpCallCompleted( - sequence_number=sequence_number, - ) - elif function.name == "web_search": - sequence_number += 1 - completion_event = OpenAIResponseObjectStreamResponseWebSearchCallCompleted( - item_id=item_id, - output_index=output_index, - sequence_number=sequence_number, - ) - # Note: knowledge_search and other custom tools don't have specific completion events in OpenAI spec - - if completion_event: - yield ToolExecutionResult(stream_event=completion_event, sequence_number=sequence_number) - - # Build the result message and input message - if function.name in ctx.mcp_tool_to_server: - from llama_stack.apis.agents.openai_responses import ( - OpenAIResponseOutputMessageMCPCall, - ) - - message = OpenAIResponseOutputMessageMCPCall( - id=tool_call_id, - arguments=function.arguments, - name=function.name, - server_label=ctx.mcp_tool_to_server[function.name].server_label, - ) - if error_exc: - message.error = str(error_exc) - elif (result and result.error_code and result.error_code > 0) or (result and result.error_message): - message.error = f"Error (code {result.error_code}): {result.error_message}" - elif result and result.content: - message.output = interleaved_content_as_str(result.content) - else: - if function.name == "web_search": - message = OpenAIResponseOutputMessageWebSearchToolCall( - id=tool_call_id, - status="completed", - ) - if has_error: - message.status = "failed" - elif function.name == "knowledge_search": - message = OpenAIResponseOutputMessageFileSearchToolCall( - id=tool_call_id, - queries=[tool_kwargs.get("query", "")], - status="completed", - ) - if result and "document_ids" in result.metadata: - message.results = [] - for i, doc_id in enumerate(result.metadata["document_ids"]): - text = result.metadata["chunks"][i] if "chunks" in result.metadata else None - score = result.metadata["scores"][i] if "scores" in result.metadata else None - message.results.append( - OpenAIResponseOutputMessageFileSearchToolCallResults( - file_id=doc_id, - filename=doc_id, - text=text, - score=score, - attributes={}, - ) - ) - if has_error: - message.status = "failed" - else: - raise ValueError(f"Unknown tool {function.name} called") - - input_message = None - if result and result.content: - if isinstance(result.content, str): - content = result.content - elif isinstance(result.content, list): - from llama_stack.apis.common.content_types import ( - ImageContentItem, - TextContentItem, - ) - - content = [] - for item in result.content: - if isinstance(item, TextContentItem): - part = OpenAIChatCompletionContentPartTextParam(text=item.text) - elif isinstance(item, ImageContentItem): - if item.image.data: - url = f"data:image;base64,{item.image.data}" - else: - url = item.image.url - part = OpenAIChatCompletionContentPartImageParam(image_url=OpenAIImageURL(url=url)) - else: - raise ValueError(f"Unknown result content type: {type(item)}") - content.append(part) - else: - raise ValueError(f"Unknown result content type: {type(result.content)}") - input_message = OpenAIToolMessageParam(content=content, tool_call_id=tool_call_id) - else: - text = str(error_exc) if error_exc else "Tool execution failed" - input_message = OpenAIToolMessageParam(content=text, tool_call_id=tool_call_id) - - # Yield the final result - yield ToolExecutionResult( - sequence_number=sequence_number, final_output_message=message, final_input_message=input_message - ) - - -def _is_function_tool_call( - tool_call: OpenAIChatCompletionToolCall, - tools: list[OpenAIResponseInputTool], -) -> bool: - if not tool_call.function: - return False - for t in tools: - if t.type == "function" and t.name == tool_call.function.name: - return True - return False diff --git a/llama_stack/providers/inline/agents/meta_reference/responses/__init__.py b/llama_stack/providers/inline/agents/meta_reference/responses/__init__.py new file mode 100644 index 000000000..756f351d8 --- /dev/null +++ b/llama_stack/providers/inline/agents/meta_reference/responses/__init__.py @@ -0,0 +1,5 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. diff --git a/llama_stack/providers/inline/agents/meta_reference/responses/openai_responses.py b/llama_stack/providers/inline/agents/meta_reference/responses/openai_responses.py new file mode 100644 index 000000000..9a87038bf --- /dev/null +++ b/llama_stack/providers/inline/agents/meta_reference/responses/openai_responses.py @@ -0,0 +1,499 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. + +import time +import uuid +from collections.abc import AsyncIterator + +from openai.types.chat import ChatCompletionToolParam +from pydantic import BaseModel + +from llama_stack.apis.agents import Order +from llama_stack.apis.agents.openai_responses import ( + AllowedToolsFilter, + ListOpenAIResponseInputItem, + ListOpenAIResponseObject, + MCPListToolsTool, + OpenAIDeleteResponseObject, + OpenAIResponseInput, + OpenAIResponseInputFunctionToolCallOutput, + OpenAIResponseInputMessageContent, + OpenAIResponseInputMessageContentImage, + OpenAIResponseInputMessageContentText, + OpenAIResponseInputTool, + OpenAIResponseInputToolMCP, + OpenAIResponseMessage, + OpenAIResponseObject, + OpenAIResponseObjectStream, + OpenAIResponseOutput, + OpenAIResponseOutputMessageContent, + OpenAIResponseOutputMessageContentOutputText, + OpenAIResponseOutputMessageFunctionToolCall, + OpenAIResponseOutputMessageMCPListTools, + OpenAIResponseText, + OpenAIResponseTextFormat, + WebSearchToolTypes, +) +from llama_stack.apis.inference import ( + Inference, + OpenAIAssistantMessageParam, + OpenAIChatCompletionContentPartImageParam, + OpenAIChatCompletionContentPartParam, + OpenAIChatCompletionContentPartTextParam, + OpenAIChatCompletionToolCall, + OpenAIChatCompletionToolCallFunction, + OpenAIDeveloperMessageParam, + OpenAIImageURL, + OpenAIJSONSchema, + OpenAIMessageParam, + OpenAIResponseFormatJSONObject, + OpenAIResponseFormatJSONSchema, + OpenAIResponseFormatParam, + OpenAIResponseFormatText, + OpenAISystemMessageParam, + OpenAIToolMessageParam, + OpenAIUserMessageParam, +) +from llama_stack.apis.tools import Tool, ToolGroups, ToolRuntime +from llama_stack.apis.vector_io import VectorIO +from llama_stack.log import get_logger +from llama_stack.models.llama.datatypes import ToolDefinition, ToolParamDefinition +from llama_stack.providers.utils.inference.openai_compat import ( + convert_tooldef_to_openai_tool, +) +from llama_stack.providers.utils.responses.responses_store import ResponsesStore + +from .streaming import StreamingResponseOrchestrator +from .tool_executor import ToolExecutor +from .types import ChatCompletionContext + +logger = get_logger(name=__name__, category="responses") + + +async def _convert_response_content_to_chat_content( + content: (str | list[OpenAIResponseInputMessageContent] | list[OpenAIResponseOutputMessageContent]), +) -> str | list[OpenAIChatCompletionContentPartParam]: + """ + Convert the content parts from an OpenAI Response API request into OpenAI Chat Completion content parts. + + The content schemas of each API look similar, but are not exactly the same. + """ + if isinstance(content, str): + return content + + converted_parts = [] + for content_part in content: + if isinstance(content_part, OpenAIResponseInputMessageContentText): + converted_parts.append(OpenAIChatCompletionContentPartTextParam(text=content_part.text)) + elif isinstance(content_part, OpenAIResponseOutputMessageContentOutputText): + converted_parts.append(OpenAIChatCompletionContentPartTextParam(text=content_part.text)) + elif isinstance(content_part, OpenAIResponseInputMessageContentImage): + if content_part.image_url: + image_url = OpenAIImageURL(url=content_part.image_url, detail=content_part.detail) + converted_parts.append(OpenAIChatCompletionContentPartImageParam(image_url=image_url)) + elif isinstance(content_part, str): + converted_parts.append(OpenAIChatCompletionContentPartTextParam(text=content_part)) + else: + raise ValueError( + f"Llama Stack OpenAI Responses does not yet support content type '{type(content_part)}' in this context" + ) + return converted_parts + + +async def _convert_response_input_to_chat_messages( + input: str | list[OpenAIResponseInput], +) -> list[OpenAIMessageParam]: + """ + Convert the input from an OpenAI Response API request into OpenAI Chat Completion messages. + """ + messages: list[OpenAIMessageParam] = [] + if isinstance(input, list): + for input_item in input: + if isinstance(input_item, OpenAIResponseInputFunctionToolCallOutput): + messages.append( + OpenAIToolMessageParam( + content=input_item.output, + tool_call_id=input_item.call_id, + ) + ) + elif isinstance(input_item, OpenAIResponseOutputMessageFunctionToolCall): + tool_call = OpenAIChatCompletionToolCall( + index=0, + id=input_item.call_id, + function=OpenAIChatCompletionToolCallFunction( + name=input_item.name, + arguments=input_item.arguments, + ), + ) + messages.append(OpenAIAssistantMessageParam(tool_calls=[tool_call])) + else: + content = await _convert_response_content_to_chat_content(input_item.content) + message_type = await _get_message_type_by_role(input_item.role) + if message_type is None: + raise ValueError( + f"Llama Stack OpenAI Responses does not yet support message role '{input_item.role}' in this context" + ) + messages.append(message_type(content=content)) + else: + messages.append(OpenAIUserMessageParam(content=input)) + return messages + + +async def _convert_response_text_to_chat_response_format( + text: OpenAIResponseText, +) -> OpenAIResponseFormatParam: + """ + Convert an OpenAI Response text parameter into an OpenAI Chat Completion response format. + """ + if not text.format or text.format["type"] == "text": + return OpenAIResponseFormatText(type="text") + if text.format["type"] == "json_object": + return OpenAIResponseFormatJSONObject() + if text.format["type"] == "json_schema": + return OpenAIResponseFormatJSONSchema( + json_schema=OpenAIJSONSchema(name=text.format["name"], schema=text.format["schema"]) + ) + raise ValueError(f"Unsupported text format: {text.format}") + + +async def _get_message_type_by_role(role: str): + role_to_type = { + "user": OpenAIUserMessageParam, + "system": OpenAISystemMessageParam, + "assistant": OpenAIAssistantMessageParam, + "developer": OpenAIDeveloperMessageParam, + } + return role_to_type.get(role) + + +class OpenAIResponsePreviousResponseWithInputItems(BaseModel): + input_items: ListOpenAIResponseInputItem + response: OpenAIResponseObject + + +class OpenAIResponsesImpl: + def __init__( + self, + inference_api: Inference, + tool_groups_api: ToolGroups, + tool_runtime_api: ToolRuntime, + responses_store: ResponsesStore, + vector_io_api: VectorIO, # VectorIO + ): + self.inference_api = inference_api + self.tool_groups_api = tool_groups_api + self.tool_runtime_api = tool_runtime_api + self.responses_store = responses_store + self.vector_io_api = vector_io_api + self.tool_executor = ToolExecutor( + tool_groups_api=tool_groups_api, + tool_runtime_api=tool_runtime_api, + vector_io_api=vector_io_api, + ) + + async def _prepend_previous_response( + self, + input: str | list[OpenAIResponseInput], + previous_response_id: str | None = None, + ): + if previous_response_id: + previous_response_with_input = await self.responses_store.get_response_object(previous_response_id) + + # previous response input items + new_input_items = previous_response_with_input.input + + # previous response output items + new_input_items.extend(previous_response_with_input.output) + + # new input items from the current request + if isinstance(input, str): + new_input_items.append(OpenAIResponseMessage(content=input, role="user")) + else: + new_input_items.extend(input) + + input = new_input_items + + return input + + async def _prepend_instructions(self, messages, instructions): + if instructions: + messages.insert(0, OpenAISystemMessageParam(content=instructions)) + + async def get_openai_response( + self, + response_id: str, + ) -> OpenAIResponseObject: + response_with_input = await self.responses_store.get_response_object(response_id) + return OpenAIResponseObject(**{k: v for k, v in response_with_input.model_dump().items() if k != "input"}) + + async def list_openai_responses( + self, + after: str | None = None, + limit: int | None = 50, + model: str | None = None, + order: Order | None = Order.desc, + ) -> ListOpenAIResponseObject: + return await self.responses_store.list_responses(after, limit, model, order) + + async def list_openai_response_input_items( + self, + response_id: str, + after: str | None = None, + before: str | None = None, + include: list[str] | None = None, + limit: int | None = 20, + order: Order | None = Order.desc, + ) -> ListOpenAIResponseInputItem: + """List input items for a given OpenAI response. + + :param response_id: The ID of the response to retrieve input items for. + :param after: An item ID to list items after, used for pagination. + :param before: An item ID to list items before, used for pagination. + :param include: Additional fields to include in the response. + :param limit: A limit on the number of objects to be returned. + :param order: The order to return the input items in. + :returns: An ListOpenAIResponseInputItem. + """ + return await self.responses_store.list_response_input_items(response_id, after, before, include, limit, order) + + async def _store_response( + self, + response: OpenAIResponseObject, + input: str | list[OpenAIResponseInput], + ) -> None: + new_input_id = f"msg_{uuid.uuid4()}" + if isinstance(input, str): + # synthesize a message from the input string + input_content = OpenAIResponseInputMessageContentText(text=input) + input_content_item = OpenAIResponseMessage( + role="user", + content=[input_content], + id=new_input_id, + ) + input_items_data = [input_content_item] + else: + # we already have a list of messages + input_items_data = [] + for input_item in input: + if isinstance(input_item, OpenAIResponseMessage): + # These may or may not already have an id, so dump to dict, check for id, and add if missing + input_item_dict = input_item.model_dump() + if "id" not in input_item_dict: + input_item_dict["id"] = new_input_id + input_items_data.append(OpenAIResponseMessage(**input_item_dict)) + else: + input_items_data.append(input_item) + + await self.responses_store.store_response_object( + response_object=response, + input=input_items_data, + ) + + async def create_openai_response( + self, + input: str | list[OpenAIResponseInput], + model: str, + instructions: str | None = None, + previous_response_id: str | None = None, + store: bool | None = True, + stream: bool | None = False, + temperature: float | None = None, + text: OpenAIResponseText | None = None, + tools: list[OpenAIResponseInputTool] | None = None, + include: list[str] | None = None, + max_infer_iters: int | None = 10, + ): + stream = bool(stream) + text = OpenAIResponseText(format=OpenAIResponseTextFormat(type="text")) if text is None else text + + stream_gen = self._create_streaming_response( + input=input, + model=model, + instructions=instructions, + previous_response_id=previous_response_id, + store=store, + temperature=temperature, + text=text, + tools=tools, + max_infer_iters=max_infer_iters, + ) + + if stream: + return stream_gen + else: + response = None + async for stream_chunk in stream_gen: + if stream_chunk.type == "response.completed": + if response is not None: + raise ValueError("The response stream completed multiple times! Earlier response: {response}") + response = stream_chunk.response + # don't leave the generator half complete! + + if response is None: + raise ValueError("The response stream never completed") + return response + + async def _create_streaming_response( + self, + input: str | list[OpenAIResponseInput], + model: str, + instructions: str | None = None, + previous_response_id: str | None = None, + store: bool | None = True, + temperature: float | None = None, + text: OpenAIResponseText | None = None, + tools: list[OpenAIResponseInputTool] | None = None, + max_infer_iters: int | None = 10, + ) -> AsyncIterator[OpenAIResponseObjectStream]: + # Input preprocessing + input = await self._prepend_previous_response(input, previous_response_id) + messages = await _convert_response_input_to_chat_messages(input) + await self._prepend_instructions(messages, instructions) + + # Structured outputs + response_format = await _convert_response_text_to_chat_response_format(text) + + # Tool setup, TODO: refactor this slightly since this can also yield events + chat_tools, mcp_tool_to_server, mcp_list_message = ( + await self._convert_response_tools_to_chat_tools(tools) if tools else (None, {}, None) + ) + + ctx = ChatCompletionContext( + model=model, + messages=messages, + response_tools=tools, + chat_tools=chat_tools, + mcp_tool_to_server=mcp_tool_to_server, + temperature=temperature, + response_format=response_format, + ) + + # Create orchestrator and delegate streaming logic + response_id = f"resp-{uuid.uuid4()}" + created_at = int(time.time()) + + orchestrator = StreamingResponseOrchestrator( + inference_api=self.inference_api, + ctx=ctx, + response_id=response_id, + created_at=created_at, + text=text, + max_infer_iters=max_infer_iters, + tool_executor=self.tool_executor, + mcp_list_message=mcp_list_message, + ) + + # Stream the response + final_response = None + async for stream_chunk in orchestrator.create_response(): + if stream_chunk.type == "response.completed": + final_response = stream_chunk.response + yield stream_chunk + + # Store the response if requested + if store and final_response: + await self._store_response( + response=final_response, + input=input, + ) + + async def delete_openai_response(self, response_id: str) -> OpenAIDeleteResponseObject: + return await self.responses_store.delete_response_object(response_id) + + async def _convert_response_tools_to_chat_tools( + self, tools: list[OpenAIResponseInputTool] + ) -> tuple[ + list[ChatCompletionToolParam], + dict[str, OpenAIResponseInputToolMCP], + OpenAIResponseOutput | None, + ]: + mcp_tool_to_server = {} + + def make_openai_tool(tool_name: str, tool: Tool) -> ChatCompletionToolParam: + tool_def = ToolDefinition( + tool_name=tool_name, + description=tool.description, + parameters={ + param.name: ToolParamDefinition( + param_type=param.parameter_type, + description=param.description, + required=param.required, + default=param.default, + ) + for param in tool.parameters + }, + ) + return convert_tooldef_to_openai_tool(tool_def) + + mcp_list_message = None + chat_tools: list[ChatCompletionToolParam] = [] + for input_tool in tools: + # TODO: Handle other tool types + if input_tool.type == "function": + chat_tools.append(ChatCompletionToolParam(type="function", function=input_tool.model_dump())) + elif input_tool.type in WebSearchToolTypes: + tool_name = "web_search" + tool = await self.tool_groups_api.get_tool(tool_name) + if not tool: + raise ValueError(f"Tool {tool_name} not found") + chat_tools.append(make_openai_tool(tool_name, tool)) + elif input_tool.type == "file_search": + tool_name = "knowledge_search" + tool = await self.tool_groups_api.get_tool(tool_name) + if not tool: + raise ValueError(f"Tool {tool_name} not found") + chat_tools.append(make_openai_tool(tool_name, tool)) + elif input_tool.type == "mcp": + from llama_stack.providers.utils.tools.mcp import list_mcp_tools + + always_allowed = None + never_allowed = None + if input_tool.allowed_tools: + if isinstance(input_tool.allowed_tools, list): + always_allowed = input_tool.allowed_tools + elif isinstance(input_tool.allowed_tools, AllowedToolsFilter): + always_allowed = input_tool.allowed_tools.always + never_allowed = input_tool.allowed_tools.never + + tool_defs = await list_mcp_tools( + endpoint=input_tool.server_url, + headers=input_tool.headers or {}, + ) + + mcp_list_message = OpenAIResponseOutputMessageMCPListTools( + id=f"mcp_list_{uuid.uuid4()}", + status="completed", + server_label=input_tool.server_label, + tools=[], + ) + for t in tool_defs.data: + if never_allowed and t.name in never_allowed: + continue + if not always_allowed or t.name in always_allowed: + chat_tools.append(make_openai_tool(t.name, t)) + if t.name in mcp_tool_to_server: + raise ValueError(f"Duplicate tool name {t.name} found for server {input_tool.server_label}") + mcp_tool_to_server[t.name] = input_tool + mcp_list_message.tools.append( + MCPListToolsTool( + name=t.name, + description=t.description, + input_schema={ + "type": "object", + "properties": { + p.name: { + "type": p.parameter_type, + "description": p.description, + } + for p in t.parameters + }, + "required": [p.name for p in t.parameters if p.required], + }, + ) + ) + else: + raise ValueError(f"Llama Stack OpenAI Responses does not yet support tool type: {input_tool.type}") + return chat_tools, mcp_tool_to_server, mcp_list_message diff --git a/llama_stack/providers/inline/agents/meta_reference/responses/streaming.py b/llama_stack/providers/inline/agents/meta_reference/responses/streaming.py new file mode 100644 index 000000000..2e4ce0c37 --- /dev/null +++ b/llama_stack/providers/inline/agents/meta_reference/responses/streaming.py @@ -0,0 +1,451 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. + +import uuid +from collections.abc import AsyncIterator +from typing import Any + +from llama_stack.apis.agents.openai_responses import ( + OpenAIResponseContentPartOutputText, + OpenAIResponseObject, + OpenAIResponseObjectStream, + OpenAIResponseObjectStreamResponseCompleted, + OpenAIResponseObjectStreamResponseContentPartAdded, + OpenAIResponseObjectStreamResponseContentPartDone, + OpenAIResponseObjectStreamResponseCreated, + OpenAIResponseObjectStreamResponseFunctionCallArgumentsDelta, + OpenAIResponseObjectStreamResponseFunctionCallArgumentsDone, + OpenAIResponseObjectStreamResponseMcpCallArgumentsDelta, + OpenAIResponseObjectStreamResponseMcpCallArgumentsDone, + OpenAIResponseObjectStreamResponseOutputItemAdded, + OpenAIResponseObjectStreamResponseOutputItemDone, + OpenAIResponseObjectStreamResponseOutputTextDelta, + OpenAIResponseOutput, + OpenAIResponseOutputMessageFunctionToolCall, + OpenAIResponseText, +) +from llama_stack.apis.inference import ( + Inference, + OpenAIAssistantMessageParam, + OpenAIChatCompletion, + OpenAIChatCompletionToolCall, + OpenAIChoice, +) +from llama_stack.log import get_logger + +from .types import ChatCompletionContext, ChatCompletionResult +from .utils import convert_chat_choice_to_response_message, is_function_tool_call + +logger = get_logger(name=__name__, category="responses") + + +class StreamingResponseOrchestrator: + def __init__( + self, + inference_api: Inference, + ctx: ChatCompletionContext, + response_id: str, + created_at: int, + text: OpenAIResponseText, + max_infer_iters: int, + tool_executor, # Will be the tool execution logic from the main class + mcp_list_message: OpenAIResponseOutput | None = None, + ): + self.inference_api = inference_api + self.ctx = ctx + self.response_id = response_id + self.created_at = created_at + self.text = text + self.max_infer_iters = max_infer_iters + self.tool_executor = tool_executor + self.sequence_number = 0 + self.mcp_list_message = mcp_list_message + + async def create_response(self) -> AsyncIterator[OpenAIResponseObjectStream]: + # Initialize output messages with MCP list message if present + output_messages: list[OpenAIResponseOutput] = [] + if self.mcp_list_message: + output_messages.append(self.mcp_list_message) + # Create initial response and emit response.created immediately + initial_response = OpenAIResponseObject( + created_at=self.created_at, + id=self.response_id, + model=self.ctx.model, + object="response", + status="in_progress", + output=output_messages.copy(), + text=self.text, + ) + + yield OpenAIResponseObjectStreamResponseCreated(response=initial_response) + + n_iter = 0 + messages = self.ctx.messages.copy() + + while True: + completion_result = await self.inference_api.openai_chat_completion( + model=self.ctx.model, + messages=messages, + tools=self.ctx.chat_tools, + stream=True, + temperature=self.ctx.temperature, + response_format=self.ctx.response_format, + ) + + # Process streaming chunks and build complete response + completion_result_data = None + async for stream_event_or_result in self._process_streaming_chunks(completion_result, output_messages): + if isinstance(stream_event_or_result, ChatCompletionResult): + completion_result_data = stream_event_or_result + else: + yield stream_event_or_result + if not completion_result_data: + raise ValueError("Streaming chunk processor failed to return completion data") + current_response = self._build_chat_completion(completion_result_data) + + function_tool_calls, non_function_tool_calls, next_turn_messages = self._separate_tool_calls( + current_response, messages + ) + + # Handle choices with no tool calls + for choice in current_response.choices: + if not (choice.message.tool_calls and self.ctx.response_tools): + output_messages.append(await convert_chat_choice_to_response_message(choice)) + + # Execute tool calls and coordinate results + async for stream_event in self._coordinate_tool_execution( + function_tool_calls, + non_function_tool_calls, + completion_result_data, + output_messages, + next_turn_messages, + ): + yield stream_event + + if not function_tool_calls and not non_function_tool_calls: + break + + if function_tool_calls: + logger.info("Exiting inference loop since there is a function (client-side) tool call") + break + + n_iter += 1 + if n_iter >= self.max_infer_iters: + logger.info(f"Exiting inference loop since iteration count({n_iter}) exceeds {self.max_infer_iters=}") + break + + messages = next_turn_messages + + # Create final response + final_response = OpenAIResponseObject( + created_at=self.created_at, + id=self.response_id, + model=self.ctx.model, + object="response", + status="completed", + text=self.text, + output=output_messages, + ) + + # Emit response.completed + yield OpenAIResponseObjectStreamResponseCompleted(response=final_response) + + def _separate_tool_calls(self, current_response, messages) -> tuple[list, list, list]: + """Separate tool calls into function and non-function categories.""" + function_tool_calls = [] + non_function_tool_calls = [] + next_turn_messages = messages.copy() + + for choice in current_response.choices: + next_turn_messages.append(choice.message) + + if choice.message.tool_calls and self.ctx.response_tools: + for tool_call in choice.message.tool_calls: + if is_function_tool_call(tool_call, self.ctx.response_tools): + function_tool_calls.append(tool_call) + else: + non_function_tool_calls.append(tool_call) + + return function_tool_calls, non_function_tool_calls, next_turn_messages + + async def _process_streaming_chunks( + self, completion_result, output_messages: list[OpenAIResponseOutput] + ) -> AsyncIterator[OpenAIResponseObjectStream | ChatCompletionResult]: + """Process streaming chunks and emit events, returning completion data.""" + # Initialize result tracking + chat_response_id = "" + chat_response_content = [] + chat_response_tool_calls: dict[int, OpenAIChatCompletionToolCall] = {} + chunk_created = 0 + chunk_model = "" + chunk_finish_reason = "" + + # Create a placeholder message item for delta events + message_item_id = f"msg_{uuid.uuid4()}" + # Track tool call items for streaming events + tool_call_item_ids: dict[int, str] = {} + # Track content parts for streaming events + content_part_emitted = False + + async for chunk in completion_result: + chat_response_id = chunk.id + chunk_created = chunk.created + chunk_model = chunk.model + for chunk_choice in chunk.choices: + # Emit incremental text content as delta events + if chunk_choice.delta.content: + # Emit content_part.added event for first text chunk + if not content_part_emitted: + content_part_emitted = True + self.sequence_number += 1 + yield OpenAIResponseObjectStreamResponseContentPartAdded( + response_id=self.response_id, + item_id=message_item_id, + part=OpenAIResponseContentPartOutputText( + text="", # Will be filled incrementally via text deltas + ), + sequence_number=self.sequence_number, + ) + self.sequence_number += 1 + yield OpenAIResponseObjectStreamResponseOutputTextDelta( + content_index=0, + delta=chunk_choice.delta.content, + item_id=message_item_id, + output_index=0, + sequence_number=self.sequence_number, + ) + + # Collect content for final response + chat_response_content.append(chunk_choice.delta.content or "") + if chunk_choice.finish_reason: + chunk_finish_reason = chunk_choice.finish_reason + + # Aggregate tool call arguments across chunks + if chunk_choice.delta.tool_calls: + for tool_call in chunk_choice.delta.tool_calls: + response_tool_call = chat_response_tool_calls.get(tool_call.index, None) + # Create new tool call entry if this is the first chunk for this index + is_new_tool_call = response_tool_call is None + if is_new_tool_call: + tool_call_dict: dict[str, Any] = tool_call.model_dump() + tool_call_dict.pop("type", None) + response_tool_call = OpenAIChatCompletionToolCall(**tool_call_dict) + chat_response_tool_calls[tool_call.index] = response_tool_call + + # Create item ID for this tool call for streaming events + tool_call_item_id = f"fc_{uuid.uuid4()}" + tool_call_item_ids[tool_call.index] = tool_call_item_id + + # Emit output_item.added event for the new function call + self.sequence_number += 1 + function_call_item = OpenAIResponseOutputMessageFunctionToolCall( + arguments="", # Will be filled incrementally via delta events + call_id=tool_call.id or "", + name=tool_call.function.name if tool_call.function else "", + id=tool_call_item_id, + status="in_progress", + ) + yield OpenAIResponseObjectStreamResponseOutputItemAdded( + response_id=self.response_id, + item=function_call_item, + output_index=len(output_messages), + sequence_number=self.sequence_number, + ) + + # Stream tool call arguments as they arrive (differentiate between MCP and function calls) + if tool_call.function and tool_call.function.arguments: + tool_call_item_id = tool_call_item_ids[tool_call.index] + self.sequence_number += 1 + + # Check if this is an MCP tool call + is_mcp_tool = ( + tool_call.function.name and tool_call.function.name in self.ctx.mcp_tool_to_server + ) + if is_mcp_tool: + # Emit MCP-specific argument delta event + yield OpenAIResponseObjectStreamResponseMcpCallArgumentsDelta( + delta=tool_call.function.arguments, + item_id=tool_call_item_id, + output_index=len(output_messages), + sequence_number=self.sequence_number, + ) + else: + # Emit function call argument delta event + yield OpenAIResponseObjectStreamResponseFunctionCallArgumentsDelta( + delta=tool_call.function.arguments, + item_id=tool_call_item_id, + output_index=len(output_messages), + sequence_number=self.sequence_number, + ) + + # Accumulate arguments for final response (only for subsequent chunks) + if not is_new_tool_call: + response_tool_call.function.arguments = ( + response_tool_call.function.arguments or "" + ) + tool_call.function.arguments + + # Emit arguments.done events for completed tool calls (differentiate between MCP and function calls) + for tool_call_index in sorted(chat_response_tool_calls.keys()): + tool_call_item_id = tool_call_item_ids[tool_call_index] + final_arguments = chat_response_tool_calls[tool_call_index].function.arguments or "" + tool_call_name = chat_response_tool_calls[tool_call_index].function.name + + # Check if this is an MCP tool call + is_mcp_tool = ( + self.ctx.mcp_tool_to_server and tool_call_name and tool_call_name in self.ctx.mcp_tool_to_server + ) + self.sequence_number += 1 + done_event_cls = ( + OpenAIResponseObjectStreamResponseMcpCallArgumentsDone + if is_mcp_tool + else OpenAIResponseObjectStreamResponseFunctionCallArgumentsDone + ) + yield done_event_cls( + arguments=final_arguments, + item_id=tool_call_item_id, + output_index=len(output_messages), + sequence_number=self.sequence_number, + ) + + # Emit content_part.done event if text content was streamed (before content gets cleared) + if content_part_emitted: + final_text = "".join(chat_response_content) + self.sequence_number += 1 + yield OpenAIResponseObjectStreamResponseContentPartDone( + response_id=self.response_id, + item_id=message_item_id, + part=OpenAIResponseContentPartOutputText( + text=final_text, + ), + sequence_number=self.sequence_number, + ) + + # Clear content when there are tool calls (OpenAI spec behavior) + if chat_response_tool_calls: + chat_response_content = [] + + yield ChatCompletionResult( + response_id=chat_response_id, + content=chat_response_content, + tool_calls=chat_response_tool_calls, + created=chunk_created, + model=chunk_model, + finish_reason=chunk_finish_reason, + message_item_id=message_item_id, + tool_call_item_ids=tool_call_item_ids, + content_part_emitted=content_part_emitted, + ) + + def _build_chat_completion(self, result: ChatCompletionResult) -> OpenAIChatCompletion: + """Build OpenAIChatCompletion from ChatCompletionResult.""" + # Convert collected chunks to complete response + if result.tool_calls: + tool_calls = [result.tool_calls[i] for i in sorted(result.tool_calls.keys())] + else: + tool_calls = None + + assistant_message = OpenAIAssistantMessageParam( + content=result.content_text, + tool_calls=tool_calls, + ) + return OpenAIChatCompletion( + id=result.response_id, + choices=[ + OpenAIChoice( + message=assistant_message, + finish_reason=result.finish_reason, + index=0, + ) + ], + created=result.created, + model=result.model, + ) + + async def _coordinate_tool_execution( + self, + function_tool_calls: list, + non_function_tool_calls: list, + completion_result_data: ChatCompletionResult, + output_messages: list[OpenAIResponseOutput], + next_turn_messages: list, + ) -> AsyncIterator[OpenAIResponseObjectStream]: + """Coordinate execution of both function and non-function tool calls.""" + # Execute non-function tool calls + for tool_call in non_function_tool_calls: + # Find the item_id for this tool call + matching_item_id = None + for index, item_id in completion_result_data.tool_call_item_ids.items(): + response_tool_call = completion_result_data.tool_calls.get(index) + if response_tool_call and response_tool_call.id == tool_call.id: + matching_item_id = item_id + break + + # Use a fallback item_id if not found + if not matching_item_id: + matching_item_id = f"tc_{uuid.uuid4()}" + + # Execute tool call with streaming + tool_call_log = None + tool_response_message = None + async for result in self.tool_executor.execute_tool_call( + tool_call, self.ctx, self.sequence_number, len(output_messages), matching_item_id + ): + if result.stream_event: + # Forward streaming events + self.sequence_number = result.sequence_number + yield result.stream_event + + if result.final_output_message is not None: + tool_call_log = result.final_output_message + tool_response_message = result.final_input_message + self.sequence_number = result.sequence_number + + if tool_call_log: + output_messages.append(tool_call_log) + + # Emit output_item.done event for completed non-function tool call + if matching_item_id: + self.sequence_number += 1 + yield OpenAIResponseObjectStreamResponseOutputItemDone( + response_id=self.response_id, + item=tool_call_log, + output_index=len(output_messages) - 1, + sequence_number=self.sequence_number, + ) + + if tool_response_message: + next_turn_messages.append(tool_response_message) + + # Execute function tool calls (client-side) + for tool_call in function_tool_calls: + # Find the item_id for this tool call from our tracking dictionary + matching_item_id = None + for index, item_id in completion_result_data.tool_call_item_ids.items(): + response_tool_call = completion_result_data.tool_calls.get(index) + if response_tool_call and response_tool_call.id == tool_call.id: + matching_item_id = item_id + break + + # Use existing item_id or create new one if not found + final_item_id = matching_item_id or f"fc_{uuid.uuid4()}" + + function_call_item = OpenAIResponseOutputMessageFunctionToolCall( + arguments=tool_call.function.arguments or "", + call_id=tool_call.id, + name=tool_call.function.name or "", + id=final_item_id, + status="completed", + ) + output_messages.append(function_call_item) + + # Emit output_item.done event for completed function call + self.sequence_number += 1 + yield OpenAIResponseObjectStreamResponseOutputItemDone( + response_id=self.response_id, + item=function_call_item, + output_index=len(output_messages) - 1, + sequence_number=self.sequence_number, + ) diff --git a/llama_stack/providers/inline/agents/meta_reference/responses/tool_executor.py b/llama_stack/providers/inline/agents/meta_reference/responses/tool_executor.py new file mode 100644 index 000000000..6b7845138 --- /dev/null +++ b/llama_stack/providers/inline/agents/meta_reference/responses/tool_executor.py @@ -0,0 +1,365 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. + +import asyncio +import json +from collections.abc import AsyncIterator + +from llama_stack.apis.agents.openai_responses import ( + OpenAIResponseInputToolFileSearch, + OpenAIResponseObjectStreamResponseMcpCallCompleted, + OpenAIResponseObjectStreamResponseMcpCallFailed, + OpenAIResponseObjectStreamResponseMcpCallInProgress, + OpenAIResponseObjectStreamResponseWebSearchCallCompleted, + OpenAIResponseObjectStreamResponseWebSearchCallInProgress, + OpenAIResponseObjectStreamResponseWebSearchCallSearching, + OpenAIResponseOutputMessageFileSearchToolCall, + OpenAIResponseOutputMessageFileSearchToolCallResults, + OpenAIResponseOutputMessageWebSearchToolCall, +) +from llama_stack.apis.common.content_types import ( + ImageContentItem, + TextContentItem, +) +from llama_stack.apis.inference import ( + OpenAIChatCompletionContentPartImageParam, + OpenAIChatCompletionContentPartTextParam, + OpenAIChatCompletionToolCall, + OpenAIImageURL, + OpenAIToolMessageParam, +) +from llama_stack.apis.tools import ToolGroups, ToolInvocationResult, ToolRuntime +from llama_stack.apis.vector_io import VectorIO +from llama_stack.log import get_logger + +from .types import ChatCompletionContext, ToolExecutionResult + +logger = get_logger(name=__name__, category="responses") + + +class ToolExecutor: + def __init__( + self, + tool_groups_api: ToolGroups, + tool_runtime_api: ToolRuntime, + vector_io_api: VectorIO, + ): + self.tool_groups_api = tool_groups_api + self.tool_runtime_api = tool_runtime_api + self.vector_io_api = vector_io_api + + async def execute_tool_call( + self, + tool_call: OpenAIChatCompletionToolCall, + ctx: ChatCompletionContext, + sequence_number: int, + output_index: int, + item_id: str, + ) -> AsyncIterator[ToolExecutionResult]: + tool_call_id = tool_call.id + function = tool_call.function + tool_kwargs = json.loads(function.arguments) if function.arguments else {} + + if not function or not tool_call_id or not function.name: + yield ToolExecutionResult(sequence_number=sequence_number) + return + + # Emit progress events for tool execution start + async for event_result in self._emit_progress_events( + function.name, ctx, sequence_number, output_index, item_id + ): + sequence_number = event_result.sequence_number + yield event_result + + # Execute the actual tool call + error_exc, result = await self._execute_tool(function.name, tool_kwargs, ctx) + + # Emit completion events for tool execution + has_error = error_exc or (result and ((result.error_code and result.error_code > 0) or result.error_message)) + async for event_result in self._emit_completion_events( + function.name, ctx, sequence_number, output_index, item_id, has_error + ): + sequence_number = event_result.sequence_number + yield event_result + + # Build result messages from tool execution + output_message, input_message = await self._build_result_messages( + function, tool_call_id, tool_kwargs, ctx, error_exc, result, has_error + ) + + # Yield the final result + yield ToolExecutionResult( + sequence_number=sequence_number, final_output_message=output_message, final_input_message=input_message + ) + + async def _execute_knowledge_search_via_vector_store( + self, + query: str, + response_file_search_tool: OpenAIResponseInputToolFileSearch, + ) -> ToolInvocationResult: + """Execute knowledge search using vector_stores.search API with filters support.""" + search_results = [] + + # Create search tasks for all vector stores + async def search_single_store(vector_store_id): + try: + search_response = await self.vector_io_api.openai_search_vector_store( + vector_store_id=vector_store_id, + query=query, + filters=response_file_search_tool.filters, + max_num_results=response_file_search_tool.max_num_results, + ranking_options=response_file_search_tool.ranking_options, + rewrite_query=False, + ) + return search_response.data + except Exception as e: + logger.warning(f"Failed to search vector store {vector_store_id}: {e}") + return [] + + # Run all searches in parallel using gather + search_tasks = [search_single_store(vid) for vid in response_file_search_tool.vector_store_ids] + all_results = await asyncio.gather(*search_tasks) + + # Flatten results + for results in all_results: + search_results.extend(results) + + # Convert search results to tool result format matching memory.py + # Format the results as interleaved content similar to memory.py + content_items = [] + content_items.append( + TextContentItem( + text=f"knowledge_search tool found {len(search_results)} chunks:\nBEGIN of knowledge_search tool results.\n" + ) + ) + + for i, result_item in enumerate(search_results): + chunk_text = result_item.content[0].text if result_item.content else "" + metadata_text = f"document_id: {result_item.file_id}, score: {result_item.score}" + if result_item.attributes: + metadata_text += f", attributes: {result_item.attributes}" + text_content = f"[{i + 1}] {metadata_text}\n{chunk_text}\n" + content_items.append(TextContentItem(text=text_content)) + + content_items.append(TextContentItem(text="END of knowledge_search tool results.\n")) + content_items.append( + TextContentItem( + text=f'The above results were retrieved to help answer the user\'s query: "{query}". Use them as supporting information only in answering this query.\n', + ) + ) + + return ToolInvocationResult( + content=content_items, + metadata={ + "document_ids": [r.file_id for r in search_results], + "chunks": [r.content[0].text if r.content else "" for r in search_results], + "scores": [r.score for r in search_results], + }, + ) + + async def _emit_progress_events( + self, function_name: str, ctx: ChatCompletionContext, sequence_number: int, output_index: int, item_id: str + ) -> AsyncIterator[ToolExecutionResult]: + """Emit progress events for tool execution start.""" + # Emit in_progress event based on tool type (only for tools with specific streaming events) + progress_event = None + if ctx.mcp_tool_to_server and function_name in ctx.mcp_tool_to_server: + sequence_number += 1 + progress_event = OpenAIResponseObjectStreamResponseMcpCallInProgress( + item_id=item_id, + output_index=output_index, + sequence_number=sequence_number, + ) + elif function_name == "web_search": + sequence_number += 1 + progress_event = OpenAIResponseObjectStreamResponseWebSearchCallInProgress( + item_id=item_id, + output_index=output_index, + sequence_number=sequence_number, + ) + # Note: knowledge_search and other custom tools don't have specific streaming events in OpenAI spec + + if progress_event: + yield ToolExecutionResult(stream_event=progress_event, sequence_number=sequence_number) + + # For web search, emit searching event + if function_name == "web_search": + sequence_number += 1 + searching_event = OpenAIResponseObjectStreamResponseWebSearchCallSearching( + item_id=item_id, + output_index=output_index, + sequence_number=sequence_number, + ) + yield ToolExecutionResult(stream_event=searching_event, sequence_number=sequence_number) + + async def _execute_tool( + self, function_name: str, tool_kwargs: dict, ctx: ChatCompletionContext + ) -> tuple[Exception | None, any]: + """Execute the tool and return error exception and result.""" + error_exc = None + result = None + + try: + if ctx.mcp_tool_to_server and function_name in ctx.mcp_tool_to_server: + from llama_stack.providers.utils.tools.mcp import invoke_mcp_tool + + mcp_tool = ctx.mcp_tool_to_server[function_name] + result = await invoke_mcp_tool( + endpoint=mcp_tool.server_url, + headers=mcp_tool.headers or {}, + tool_name=function_name, + kwargs=tool_kwargs, + ) + elif function_name == "knowledge_search": + response_file_search_tool = next( + (t for t in ctx.response_tools if isinstance(t, OpenAIResponseInputToolFileSearch)), + None, + ) + if response_file_search_tool: + # Use vector_stores.search API instead of knowledge_search tool + # to support filters and ranking_options + query = tool_kwargs.get("query", "") + result = await self._execute_knowledge_search_via_vector_store( + query=query, + response_file_search_tool=response_file_search_tool, + ) + else: + result = await self.tool_runtime_api.invoke_tool( + tool_name=function_name, + kwargs=tool_kwargs, + ) + except Exception as e: + error_exc = e + + return error_exc, result + + async def _emit_completion_events( + self, + function_name: str, + ctx: ChatCompletionContext, + sequence_number: int, + output_index: int, + item_id: str, + has_error: bool, + ) -> AsyncIterator[ToolExecutionResult]: + """Emit completion or failure events for tool execution.""" + completion_event = None + + if ctx.mcp_tool_to_server and function_name in ctx.mcp_tool_to_server: + sequence_number += 1 + if has_error: + completion_event = OpenAIResponseObjectStreamResponseMcpCallFailed( + sequence_number=sequence_number, + ) + else: + completion_event = OpenAIResponseObjectStreamResponseMcpCallCompleted( + sequence_number=sequence_number, + ) + elif function_name == "web_search": + sequence_number += 1 + completion_event = OpenAIResponseObjectStreamResponseWebSearchCallCompleted( + item_id=item_id, + output_index=output_index, + sequence_number=sequence_number, + ) + # Note: knowledge_search and other custom tools don't have specific completion events in OpenAI spec + + if completion_event: + yield ToolExecutionResult(stream_event=completion_event, sequence_number=sequence_number) + + async def _build_result_messages( + self, + function, + tool_call_id: str, + tool_kwargs: dict, + ctx: ChatCompletionContext, + error_exc: Exception | None, + result: any, + has_error: bool, + ) -> tuple[any, any]: + """Build output and input messages from tool execution results.""" + from llama_stack.providers.utils.inference.prompt_adapter import ( + interleaved_content_as_str, + ) + + # Build output message + if function.name in ctx.mcp_tool_to_server: + from llama_stack.apis.agents.openai_responses import ( + OpenAIResponseOutputMessageMCPCall, + ) + + message = OpenAIResponseOutputMessageMCPCall( + id=tool_call_id, + arguments=function.arguments, + name=function.name, + server_label=ctx.mcp_tool_to_server[function.name].server_label, + ) + if error_exc: + message.error = str(error_exc) + elif (result and result.error_code and result.error_code > 0) or (result and result.error_message): + message.error = f"Error (code {result.error_code}): {result.error_message}" + elif result and result.content: + message.output = interleaved_content_as_str(result.content) + else: + if function.name == "web_search": + message = OpenAIResponseOutputMessageWebSearchToolCall( + id=tool_call_id, + status="completed", + ) + if has_error: + message.status = "failed" + elif function.name == "knowledge_search": + message = OpenAIResponseOutputMessageFileSearchToolCall( + id=tool_call_id, + queries=[tool_kwargs.get("query", "")], + status="completed", + ) + if result and "document_ids" in result.metadata: + message.results = [] + for i, doc_id in enumerate(result.metadata["document_ids"]): + text = result.metadata["chunks"][i] if "chunks" in result.metadata else None + score = result.metadata["scores"][i] if "scores" in result.metadata else None + message.results.append( + OpenAIResponseOutputMessageFileSearchToolCallResults( + file_id=doc_id, + filename=doc_id, + text=text, + score=score, + attributes={}, + ) + ) + if has_error: + message.status = "failed" + else: + raise ValueError(f"Unknown tool {function.name} called") + + # Build input message + input_message = None + if result and result.content: + if isinstance(result.content, str): + content = result.content + elif isinstance(result.content, list): + content = [] + for item in result.content: + if isinstance(item, TextContentItem): + part = OpenAIChatCompletionContentPartTextParam(text=item.text) + elif isinstance(item, ImageContentItem): + if item.image.data: + url = f"data:image;base64,{item.image.data}" + else: + url = item.image.url + part = OpenAIChatCompletionContentPartImageParam(image_url=OpenAIImageURL(url=url)) + else: + raise ValueError(f"Unknown result content type: {type(item)}") + content.append(part) + else: + raise ValueError(f"Unknown result content type: {type(result.content)}") + input_message = OpenAIToolMessageParam(content=content, tool_call_id=tool_call_id) + else: + text = str(error_exc) if error_exc else "Tool execution failed" + input_message = OpenAIToolMessageParam(content=text, tool_call_id=tool_call_id) + + return message, input_message diff --git a/llama_stack/providers/inline/agents/meta_reference/responses/types.py b/llama_stack/providers/inline/agents/meta_reference/responses/types.py new file mode 100644 index 000000000..502d8a192 --- /dev/null +++ b/llama_stack/providers/inline/agents/meta_reference/responses/types.py @@ -0,0 +1,62 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. + +from dataclasses import dataclass + +from openai.types.chat import ChatCompletionToolParam +from pydantic import BaseModel + +from llama_stack.apis.agents.openai_responses import ( + OpenAIResponseInputTool, + OpenAIResponseInputToolMCP, + OpenAIResponseObjectStream, + OpenAIResponseOutput, +) +from llama_stack.apis.inference import OpenAIChatCompletionToolCall, OpenAIMessageParam, OpenAIResponseFormatParam + + +class ToolExecutionResult(BaseModel): + """Result of streaming tool execution.""" + + stream_event: OpenAIResponseObjectStream | None = None + sequence_number: int + final_output_message: OpenAIResponseOutput | None = None + final_input_message: OpenAIMessageParam | None = None + + +@dataclass +class ChatCompletionResult: + """Result of processing streaming chat completion chunks.""" + + response_id: str + content: list[str] + tool_calls: dict[int, OpenAIChatCompletionToolCall] + created: int + model: str + finish_reason: str + message_item_id: str # For streaming events + tool_call_item_ids: dict[int, str] # For streaming events + content_part_emitted: bool # Tracking state + + @property + def content_text(self) -> str: + """Get joined content as string.""" + return "".join(self.content) + + @property + def has_tool_calls(self) -> bool: + """Check if there are any tool calls.""" + return bool(self.tool_calls) + + +class ChatCompletionContext(BaseModel): + model: str + messages: list[OpenAIMessageParam] + response_tools: list[OpenAIResponseInputTool] | None = None + chat_tools: list[ChatCompletionToolParam] | None = None + mcp_tool_to_server: dict[str, OpenAIResponseInputToolMCP] + temperature: float | None + response_format: OpenAIResponseFormatParam diff --git a/llama_stack/providers/inline/agents/meta_reference/responses/utils.py b/llama_stack/providers/inline/agents/meta_reference/responses/utils.py new file mode 100644 index 000000000..53d408ab7 --- /dev/null +++ b/llama_stack/providers/inline/agents/meta_reference/responses/utils.py @@ -0,0 +1,50 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. + +import uuid + +from llama_stack.apis.agents.openai_responses import ( + OpenAIResponseInputTool, + OpenAIResponseMessage, + OpenAIResponseOutputMessageContentOutputText, +) +from llama_stack.apis.inference import ( + OpenAIChatCompletionContentPartTextParam, + OpenAIChatCompletionToolCall, + OpenAIChoice, +) + + +async def convert_chat_choice_to_response_message(choice: OpenAIChoice) -> OpenAIResponseMessage: + """Convert an OpenAI Chat Completion choice into an OpenAI Response output message.""" + output_content = "" + if isinstance(choice.message.content, str): + output_content = choice.message.content + elif isinstance(choice.message.content, OpenAIChatCompletionContentPartTextParam): + output_content = choice.message.content.text + else: + raise ValueError( + f"Llama Stack OpenAI Responses does not yet support output content type: {type(choice.message.content)}" + ) + + return OpenAIResponseMessage( + id=f"msg_{uuid.uuid4()}", + content=[OpenAIResponseOutputMessageContentOutputText(text=output_content)], + status="completed", + role="assistant", + ) + + +def is_function_tool_call( + tool_call: OpenAIChatCompletionToolCall, + tools: list[OpenAIResponseInputTool], +) -> bool: + if not tool_call.function: + return False + for t in tools: + if t.type == "function" and t.name == tool_call.function.name: + return True + return False diff --git a/tests/unit/providers/agents/meta_reference/test_openai_responses.py b/tests/unit/providers/agents/meta_reference/test_openai_responses.py index 4132a74a3..5ea14d7c7 100644 --- a/tests/unit/providers/agents/meta_reference/test_openai_responses.py +++ b/tests/unit/providers/agents/meta_reference/test_openai_responses.py @@ -41,7 +41,7 @@ from llama_stack.apis.inference import ( ) from llama_stack.apis.tools.tools import Tool, ToolGroups, ToolInvocationResult, ToolParameter, ToolRuntime from llama_stack.core.access_control.access_control import default_policy -from llama_stack.providers.inline.agents.meta_reference.openai_responses import ( +from llama_stack.providers.inline.agents.meta_reference.responses.openai_responses import ( OpenAIResponsesImpl, ) from llama_stack.providers.utils.responses.responses_store import ResponsesStore