feat: reuse previous mcp tool listings where possible (#3710)

# What does this PR do?
This PR checks whether, if a previous response is linked, there are
mcp_list_tools objects that can be reused instead of listing the tools
explicitly every time.

 Closes #3106 

## Test Plan
Tested manually.
Added unit tests to cover new behaviour.

---------

Signed-off-by: Gordon Sim <gsim@redhat.com>
Co-authored-by: Ashwin Bharambe <ashwin.bharambe@gmail.com>
This commit is contained in:
grs 2025-10-10 17:28:25 +01:00 committed by GitHub
parent 0066d986c5
commit 8bf07f91cb
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
12 changed files with 1835 additions and 983 deletions

View file

@ -346,6 +346,138 @@ class OpenAIResponseText(BaseModel):
format: OpenAIResponseTextFormat | None = None
# Must match type Literals of OpenAIResponseInputToolWebSearch below
WebSearchToolTypes = ["web_search", "web_search_preview", "web_search_preview_2025_03_11"]
@json_schema_type
class OpenAIResponseInputToolWebSearch(BaseModel):
"""Web search tool configuration for OpenAI response inputs.
:param type: Web search tool type variant to use
:param search_context_size: (Optional) Size of search context, must be "low", "medium", or "high"
"""
# Must match values of WebSearchToolTypes above
type: Literal["web_search"] | Literal["web_search_preview"] | Literal["web_search_preview_2025_03_11"] = (
"web_search"
)
# TODO: actually use search_context_size somewhere...
search_context_size: str | None = Field(default="medium", pattern="^low|medium|high$")
# TODO: add user_location
@json_schema_type
class OpenAIResponseInputToolFunction(BaseModel):
"""Function tool configuration for OpenAI response inputs.
:param type: Tool type identifier, always "function"
:param name: Name of the function that can be called
:param description: (Optional) Description of what the function does
:param parameters: (Optional) JSON schema defining the function's parameters
:param strict: (Optional) Whether to enforce strict parameter validation
"""
type: Literal["function"] = "function"
name: str
description: str | None = None
parameters: dict[str, Any] | None
strict: bool | None = None
@json_schema_type
class OpenAIResponseInputToolFileSearch(BaseModel):
"""File search tool configuration for OpenAI response inputs.
:param type: Tool type identifier, always "file_search"
:param vector_store_ids: List of vector store identifiers to search within
:param filters: (Optional) Additional filters to apply to the search
:param max_num_results: (Optional) Maximum number of search results to return (1-50)
:param ranking_options: (Optional) Options for ranking and scoring search results
"""
type: Literal["file_search"] = "file_search"
vector_store_ids: list[str]
filters: dict[str, Any] | None = None
max_num_results: int | None = Field(default=10, ge=1, le=50)
ranking_options: FileSearchRankingOptions | None = None
class ApprovalFilter(BaseModel):
"""Filter configuration for MCP tool approval requirements.
:param always: (Optional) List of tool names that always require approval
:param never: (Optional) List of tool names that never require approval
"""
always: list[str] | None = None
never: list[str] | None = None
class AllowedToolsFilter(BaseModel):
"""Filter configuration for restricting which MCP tools can be used.
:param tool_names: (Optional) List of specific tool names that are allowed
"""
tool_names: list[str] | None = None
@json_schema_type
class OpenAIResponseInputToolMCP(BaseModel):
"""Model Context Protocol (MCP) tool configuration for OpenAI response inputs.
:param type: Tool type identifier, always "mcp"
:param server_label: Label to identify this MCP server
:param server_url: URL endpoint of the MCP server
:param headers: (Optional) HTTP headers to include when connecting to the server
:param require_approval: Approval requirement for tool calls ("always", "never", or filter)
:param allowed_tools: (Optional) Restriction on which tools can be used from this server
"""
type: Literal["mcp"] = "mcp"
server_label: str
server_url: str
headers: dict[str, Any] | None = None
require_approval: Literal["always"] | Literal["never"] | ApprovalFilter = "never"
allowed_tools: list[str] | AllowedToolsFilter | None = None
OpenAIResponseInputTool = Annotated[
OpenAIResponseInputToolWebSearch
| OpenAIResponseInputToolFileSearch
| OpenAIResponseInputToolFunction
| OpenAIResponseInputToolMCP,
Field(discriminator="type"),
]
register_schema(OpenAIResponseInputTool, name="OpenAIResponseInputTool")
@json_schema_type
class OpenAIResponseToolMCP(BaseModel):
"""Model Context Protocol (MCP) tool configuration for OpenAI response object.
:param type: Tool type identifier, always "mcp"
:param server_label: Label to identify this MCP server
:param allowed_tools: (Optional) Restriction on which tools can be used from this server
"""
type: Literal["mcp"] = "mcp"
server_label: str
allowed_tools: list[str] | AllowedToolsFilter | None = None
OpenAIResponseTool = Annotated[
OpenAIResponseInputToolWebSearch
| OpenAIResponseInputToolFileSearch
| OpenAIResponseInputToolFunction
| OpenAIResponseToolMCP, # The only type that differes from that in the inputs is the MCP tool
Field(discriminator="type"),
]
register_schema(OpenAIResponseTool, name="OpenAIResponseTool")
class OpenAIResponseUsageOutputTokensDetails(BaseModel):
"""Token details for output tokens in OpenAI response usage.
@ -398,6 +530,7 @@ class OpenAIResponseObject(BaseModel):
:param temperature: (Optional) Sampling temperature used for generation
:param text: Text formatting configuration for the response
:param top_p: (Optional) Nucleus sampling parameter used for generation
:param tools: (Optional) An array of tools the model may call while generating a response.
:param truncation: (Optional) Truncation strategy applied to the response
:param usage: (Optional) Token usage information for the response
"""
@ -416,6 +549,7 @@ class OpenAIResponseObject(BaseModel):
# before the field was added. New responses will have this set always.
text: OpenAIResponseText = OpenAIResponseText(format=OpenAIResponseTextFormat(type="text"))
top_p: float | None = None
tools: list[OpenAIResponseTool] | None = None
truncation: str | None = None
usage: OpenAIResponseUsage | None = None
@ -878,114 +1012,6 @@ OpenAIResponseInput = Annotated[
register_schema(OpenAIResponseInput, name="OpenAIResponseInput")
# Must match type Literals of OpenAIResponseInputToolWebSearch below
WebSearchToolTypes = ["web_search", "web_search_preview", "web_search_preview_2025_03_11"]
@json_schema_type
class OpenAIResponseInputToolWebSearch(BaseModel):
"""Web search tool configuration for OpenAI response inputs.
:param type: Web search tool type variant to use
:param search_context_size: (Optional) Size of search context, must be "low", "medium", or "high"
"""
# Must match values of WebSearchToolTypes above
type: Literal["web_search"] | Literal["web_search_preview"] | Literal["web_search_preview_2025_03_11"] = (
"web_search"
)
# TODO: actually use search_context_size somewhere...
search_context_size: str | None = Field(default="medium", pattern="^low|medium|high$")
# TODO: add user_location
@json_schema_type
class OpenAIResponseInputToolFunction(BaseModel):
"""Function tool configuration for OpenAI response inputs.
:param type: Tool type identifier, always "function"
:param name: Name of the function that can be called
:param description: (Optional) Description of what the function does
:param parameters: (Optional) JSON schema defining the function's parameters
:param strict: (Optional) Whether to enforce strict parameter validation
"""
type: Literal["function"] = "function"
name: str
description: str | None = None
parameters: dict[str, Any] | None
strict: bool | None = None
@json_schema_type
class OpenAIResponseInputToolFileSearch(BaseModel):
"""File search tool configuration for OpenAI response inputs.
:param type: Tool type identifier, always "file_search"
:param vector_store_ids: List of vector store identifiers to search within
:param filters: (Optional) Additional filters to apply to the search
:param max_num_results: (Optional) Maximum number of search results to return (1-50)
:param ranking_options: (Optional) Options for ranking and scoring search results
"""
type: Literal["file_search"] = "file_search"
vector_store_ids: list[str]
filters: dict[str, Any] | None = None
max_num_results: int | None = Field(default=10, ge=1, le=50)
ranking_options: FileSearchRankingOptions | None = None
class ApprovalFilter(BaseModel):
"""Filter configuration for MCP tool approval requirements.
:param always: (Optional) List of tool names that always require approval
:param never: (Optional) List of tool names that never require approval
"""
always: list[str] | None = None
never: list[str] | None = None
class AllowedToolsFilter(BaseModel):
"""Filter configuration for restricting which MCP tools can be used.
:param tool_names: (Optional) List of specific tool names that are allowed
"""
tool_names: list[str] | None = None
@json_schema_type
class OpenAIResponseInputToolMCP(BaseModel):
"""Model Context Protocol (MCP) tool configuration for OpenAI response inputs.
:param type: Tool type identifier, always "mcp"
:param server_label: Label to identify this MCP server
:param server_url: URL endpoint of the MCP server
:param headers: (Optional) HTTP headers to include when connecting to the server
:param require_approval: Approval requirement for tool calls ("always", "never", or filter)
:param allowed_tools: (Optional) Restriction on which tools can be used from this server
"""
type: Literal["mcp"] = "mcp"
server_label: str
server_url: str
headers: dict[str, Any] | None = None
require_approval: Literal["always"] | Literal["never"] | ApprovalFilter = "never"
allowed_tools: list[str] | AllowedToolsFilter | None = None
OpenAIResponseInputTool = Annotated[
OpenAIResponseInputToolWebSearch
| OpenAIResponseInputToolFileSearch
| OpenAIResponseInputToolFunction
| OpenAIResponseInputToolMCP,
Field(discriminator="type"),
]
register_schema(OpenAIResponseInputTool, name="OpenAIResponseInputTool")
class ListOpenAIResponseInputItem(BaseModel):
"""List container for OpenAI response input items.

View file

@ -39,7 +39,7 @@ from llama_stack.providers.utils.responses.responses_store import (
from .streaming import StreamingResponseOrchestrator
from .tool_executor import ToolExecutor
from .types import ChatCompletionContext
from .types import ChatCompletionContext, ToolContext
from .utils import (
convert_response_input_to_chat_messages,
convert_response_text_to_chat_response_format,
@ -91,13 +91,15 @@ class OpenAIResponsesImpl:
async def _process_input_with_previous_response(
self,
input: str | list[OpenAIResponseInput],
tools: list[OpenAIResponseInputTool] | None,
previous_response_id: str | None,
) -> tuple[str | list[OpenAIResponseInput], list[OpenAIMessageParam]]:
"""Process input with optional previous response context.
Returns:
tuple: (all_input for storage, messages for chat completion)
tuple: (all_input for storage, messages for chat completion, tool context)
"""
tool_context = ToolContext(tools)
if previous_response_id:
previous_response: _OpenAIResponseObjectWithInputAndMessages = (
await self.responses_store.get_response_object(previous_response_id)
@ -113,11 +115,13 @@ class OpenAIResponsesImpl:
else:
# Backward compatibility: reconstruct from inputs
messages = await convert_response_input_to_chat_messages(all_input)
tool_context.recover_tools_from_previous_response(previous_response)
else:
all_input = input
messages = await convert_response_input_to_chat_messages(input)
return all_input, messages
return all_input, messages, tool_context
async def _prepend_instructions(self, messages, instructions):
if instructions:
@ -273,7 +277,9 @@ class OpenAIResponsesImpl:
max_infer_iters: int | None = 10,
) -> AsyncIterator[OpenAIResponseObjectStream]:
# Input preprocessing
all_input, messages = await self._process_input_with_previous_response(input, previous_response_id)
all_input, messages, tool_context = await self._process_input_with_previous_response(
input, tools, previous_response_id
)
await self._prepend_instructions(messages, instructions)
# Structured outputs
@ -285,6 +291,7 @@ class OpenAIResponsesImpl:
response_tools=tools,
temperature=temperature,
response_format=response_format,
tool_context=tool_context,
inputs=all_input,
)

View file

@ -99,7 +99,7 @@ class StreamingResponseOrchestrator:
self.tool_executor = tool_executor
self.sequence_number = 0
# Store MCP tool mapping that gets built during tool processing
self.mcp_tool_to_server: dict[str, OpenAIResponseInputToolMCP] = {}
self.mcp_tool_to_server: dict[str, OpenAIResponseInputToolMCP] = ctx.tool_context.previous_tools or {}
# Track final messages after all tool executions
self.final_messages: list[OpenAIMessageParam] = []
# mapping for annotations
@ -129,6 +129,7 @@ class StreamingResponseOrchestrator:
status=status,
output=self._clone_outputs(outputs),
text=self.text,
tools=self.ctx.available_tools(),
error=error,
)
@ -146,10 +147,8 @@ class StreamingResponseOrchestrator:
sequence_number=self.sequence_number,
)
# Process all tools (including MCP tools) and emit streaming events
if self.ctx.response_tools:
async for stream_event in self._process_tools(self.ctx.response_tools, output_messages):
yield stream_event
async for stream_event in self._process_tools(output_messages):
yield stream_event
n_iter = 0
messages = self.ctx.messages.copy()
@ -590,7 +589,7 @@ class StreamingResponseOrchestrator:
sequence_number=self.sequence_number,
)
async def _process_tools(
async def _process_new_tools(
self, tools: list[OpenAIResponseInputTool], output_messages: list[OpenAIResponseOutput]
) -> AsyncIterator[OpenAIResponseObjectStream]:
"""Process all tools and emit appropriate streaming events."""
@ -645,7 +644,6 @@ class StreamingResponseOrchestrator:
yield OpenAIResponseObjectStreamResponseMcpListToolsInProgress(
sequence_number=self.sequence_number,
)
try:
# Parse allowed/never allowed tools
always_allowed = None
@ -707,39 +705,26 @@ class StreamingResponseOrchestrator:
},
)
)
# Add the MCP list message to output
output_messages.append(mcp_list_message)
# Emit output_item.added for the MCP list tools message
self.sequence_number += 1
yield OpenAIResponseObjectStreamResponseOutputItemAdded(
response_id=self.response_id,
item=mcp_list_message,
output_index=len(output_messages) - 1,
sequence_number=self.sequence_number,
)
# Emit mcp_list_tools.completed
self.sequence_number += 1
yield OpenAIResponseObjectStreamResponseMcpListToolsCompleted(
sequence_number=self.sequence_number,
)
# Emit output_item.done for the MCP list tools message
self.sequence_number += 1
yield OpenAIResponseObjectStreamResponseOutputItemDone(
response_id=self.response_id,
item=mcp_list_message,
output_index=len(output_messages) - 1,
sequence_number=self.sequence_number,
)
async for stream_event in self._add_mcp_list_tools(mcp_list_message, output_messages):
yield stream_event
except Exception as e:
# TODO: Emit mcp_list_tools.failed event if needed
logger.exception(f"Failed to list MCP tools from {mcp_tool.server_url}: {e}")
raise
async def _process_tools(
self, output_messages: list[OpenAIResponseOutput]
) -> AsyncIterator[OpenAIResponseObjectStream]:
# Handle all mcp tool lists from previous response that are still valid:
for tool in self.ctx.tool_context.previous_tool_listings:
async for evt in self._reuse_mcp_list_tools(tool, output_messages):
yield evt
# Process all remaining tools (including MCP tools) and emit streaming events
if self.ctx.tool_context.tools_to_process:
async for stream_event in self._process_new_tools(self.ctx.tool_context.tools_to_process, output_messages):
yield stream_event
def _approval_required(self, tool_name: str) -> bool:
if tool_name not in self.mcp_tool_to_server:
return False
@ -774,7 +759,6 @@ class StreamingResponseOrchestrator:
output_index=len(output_messages) - 1,
sequence_number=self.sequence_number,
)
self.sequence_number += 1
yield OpenAIResponseObjectStreamResponseOutputItemDone(
response_id=self.response_id,
@ -782,3 +766,60 @@ class StreamingResponseOrchestrator:
output_index=len(output_messages) - 1,
sequence_number=self.sequence_number,
)
async def _add_mcp_list_tools(
self, mcp_list_message: OpenAIResponseOutputMessageMCPListTools, output_messages: list[OpenAIResponseOutput]
) -> AsyncIterator[OpenAIResponseObjectStream]:
# Add the MCP list message to output
output_messages.append(mcp_list_message)
# Emit output_item.added for the MCP list tools message
self.sequence_number += 1
yield OpenAIResponseObjectStreamResponseOutputItemAdded(
response_id=self.response_id,
item=mcp_list_message,
output_index=len(output_messages) - 1,
sequence_number=self.sequence_number,
)
# Emit mcp_list_tools.completed
self.sequence_number += 1
yield OpenAIResponseObjectStreamResponseMcpListToolsCompleted(
sequence_number=self.sequence_number,
)
# Emit output_item.done for the MCP list tools message
self.sequence_number += 1
yield OpenAIResponseObjectStreamResponseOutputItemDone(
response_id=self.response_id,
item=mcp_list_message,
output_index=len(output_messages) - 1,
sequence_number=self.sequence_number,
)
async def _reuse_mcp_list_tools(
self, original: OpenAIResponseOutputMessageMCPListTools, output_messages: list[OpenAIResponseOutput]
) -> AsyncIterator[OpenAIResponseObjectStream]:
for t in original.tools:
from llama_stack.models.llama.datatypes import ToolDefinition
from llama_stack.providers.utils.inference.openai_compat import convert_tooldef_to_openai_tool
# convert from input_schema to map of ToolParamDefinitions...
tool_def = ToolDefinition(
tool_name=t.name,
description=t.description,
input_schema=t.input_schema,
)
# ...then can convert that to openai completions tool
openai_tool = convert_tooldef_to_openai_tool(tool_def)
if self.ctx.chat_tools is None:
self.ctx.chat_tools = []
self.ctx.chat_tools.append(openai_tool)
mcp_list_message = OpenAIResponseOutputMessageMCPListTools(
id=f"mcp_list_{uuid.uuid4()}",
server_label=original.server_label,
tools=original.tools,
)
async for stream_event in self._add_mcp_list_tools(mcp_list_message, output_messages):
yield stream_event

View file

@ -12,10 +12,18 @@ from pydantic import BaseModel
from llama_stack.apis.agents.openai_responses import (
OpenAIResponseInput,
OpenAIResponseInputTool,
OpenAIResponseInputToolFileSearch,
OpenAIResponseInputToolFunction,
OpenAIResponseInputToolMCP,
OpenAIResponseInputToolWebSearch,
OpenAIResponseMCPApprovalRequest,
OpenAIResponseMCPApprovalResponse,
OpenAIResponseObject,
OpenAIResponseObjectStream,
OpenAIResponseOutput,
OpenAIResponseOutputMessageMCPListTools,
OpenAIResponseTool,
OpenAIResponseToolMCP,
)
from llama_stack.apis.inference import OpenAIChatCompletionToolCall, OpenAIMessageParam, OpenAIResponseFormatParam
@ -55,6 +63,86 @@ class ChatCompletionResult:
return bool(self.tool_calls)
class ToolContext(BaseModel):
"""Holds information about tools from this and (if relevant)
previous response in order to facilitate reuse of previous
listings where appropriate."""
# tools argument passed into current request:
current_tools: list[OpenAIResponseInputTool]
# reconstructed map of tool -> mcp server from previous response:
previous_tools: dict[str, OpenAIResponseInputToolMCP]
# reusable mcp-list-tools objects from previous response:
previous_tool_listings: list[OpenAIResponseOutputMessageMCPListTools]
# tool arguments from current request that still need to be processed:
tools_to_process: list[OpenAIResponseInputTool]
def __init__(
self,
current_tools: list[OpenAIResponseInputTool] | None,
):
super().__init__(
current_tools=current_tools or [],
previous_tools={},
previous_tool_listings=[],
tools_to_process=current_tools or [],
)
def recover_tools_from_previous_response(
self,
previous_response: OpenAIResponseObject,
):
"""Determine which mcp_list_tools objects from previous response we can reuse."""
if self.current_tools and previous_response.tools:
previous_tools_by_label: dict[str, OpenAIResponseToolMCP] = {}
for tool in previous_response.tools:
if isinstance(tool, OpenAIResponseToolMCP):
previous_tools_by_label[tool.server_label] = tool
# collect tool definitions which are the same in current and previous requests:
tools_to_process = []
matched: dict[str, OpenAIResponseInputToolMCP] = {}
for tool in self.current_tools:
if isinstance(tool, OpenAIResponseInputToolMCP) and tool.server_label in previous_tools_by_label:
previous_tool = previous_tools_by_label[tool.server_label]
if previous_tool.allowed_tools == tool.allowed_tools:
matched[tool.server_label] = tool
else:
tools_to_process.append(tool)
else:
tools_to_process.append(tool)
# tools that are not the same or were not previously defined need to be processed:
self.tools_to_process = tools_to_process
# for all matched definitions, get the mcp_list_tools objects from the previous output:
self.previous_tool_listings = [
obj for obj in previous_response.output if obj.type == "mcp_list_tools" and obj.server_label in matched
]
# reconstruct the tool to server mappings that can be reused:
for listing in self.previous_tool_listings:
definition = matched[listing.server_label]
for tool in listing.tools:
self.previous_tools[tool.name] = definition
def available_tools(self) -> list[OpenAIResponseTool]:
if not self.current_tools:
return []
def convert_tool(tool: OpenAIResponseInputTool) -> OpenAIResponseTool:
if isinstance(tool, OpenAIResponseInputToolWebSearch):
return tool
if isinstance(tool, OpenAIResponseInputToolFileSearch):
return tool
if isinstance(tool, OpenAIResponseInputToolFunction):
return tool
if isinstance(tool, OpenAIResponseInputToolMCP):
return OpenAIResponseToolMCP(
server_label=tool.server_label,
allowed_tools=tool.allowed_tools,
)
return [convert_tool(tool) for tool in self.current_tools]
class ChatCompletionContext(BaseModel):
model: str
messages: list[OpenAIMessageParam]
@ -62,6 +150,7 @@ class ChatCompletionContext(BaseModel):
chat_tools: list[ChatCompletionToolParam] | None = None
temperature: float | None
response_format: OpenAIResponseFormatParam
tool_context: ToolContext | None
approval_requests: list[OpenAIResponseMCPApprovalRequest] = []
approval_responses: dict[str, OpenAIResponseMCPApprovalResponse] = {}
@ -72,6 +161,7 @@ class ChatCompletionContext(BaseModel):
response_tools: list[OpenAIResponseInputTool] | None,
temperature: float | None,
response_format: OpenAIResponseFormatParam,
tool_context: ToolContext,
inputs: list[OpenAIResponseInput] | str,
):
super().__init__(
@ -80,6 +170,7 @@ class ChatCompletionContext(BaseModel):
response_tools=response_tools,
temperature=temperature,
response_format=response_format,
tool_context=tool_context,
)
if not isinstance(inputs, str):
self.approval_requests = [input for input in inputs if input.type == "mcp_approval_request"]
@ -96,3 +187,8 @@ class ChatCompletionContext(BaseModel):
if request.name == tool_name and request.arguments == arguments:
return request
return None
def available_tools(self) -> list[OpenAIResponseTool]:
if not self.tool_context:
return []
return self.tool_context.available_tools()