diff --git a/docs/docs/providers/agents/index.mdx b/docs/docs/providers/agents/index.mdx index 5cd37776d..c86805b14 100644 --- a/docs/docs/providers/agents/index.mdx +++ b/docs/docs/providers/agents/index.mdx @@ -1,12 +1,12 @@ --- description: "Agents API for creating and interacting with agentic systems. - Main functionalities provided by this API: - - Create agents with specific instructions and ability to use tools. - - Interactions with agents are grouped into sessions (\"threads\"), and each interaction is called a \"turn\". - - Agents can be provided with various tools (see the ToolGroups and ToolRuntime APIs for more details). - - Agents can be provided with various shields (see the Safety API for more details). - - Agents can also use Memory to retrieve information from knowledge bases. See the RAG Tool and Vector IO APIs for more details." +Main functionalities provided by this API: +- Create agents with specific instructions and ability to use tools. +- Interactions with agents are grouped into sessions (\"threads\"), and each interaction is called a \"turn\". +- Agents can be provided with various tools (see the ToolGroups and ToolRuntime APIs for more details). +- Agents can be provided with various shields (see the Safety API for more details). +- Agents can also use Memory to retrieve information from knowledge bases. See the RAG Tool and Vector IO APIs for more details." sidebar_label: Agents title: Agents --- @@ -17,11 +17,11 @@ title: Agents Agents API for creating and interacting with agentic systems. - Main functionalities provided by this API: - - Create agents with specific instructions and ability to use tools. - - Interactions with agents are grouped into sessions ("threads"), and each interaction is called a "turn". - - Agents can be provided with various tools (see the ToolGroups and ToolRuntime APIs for more details). - - Agents can be provided with various shields (see the Safety API for more details). - - Agents can also use Memory to retrieve information from knowledge bases. See the RAG Tool and Vector IO APIs for more details. +Main functionalities provided by this API: +- Create agents with specific instructions and ability to use tools. +- Interactions with agents are grouped into sessions ("threads"), and each interaction is called a "turn". +- Agents can be provided with various tools (see the ToolGroups and ToolRuntime APIs for more details). +- Agents can be provided with various shields (see the Safety API for more details). +- Agents can also use Memory to retrieve information from knowledge bases. See the RAG Tool and Vector IO APIs for more details. This section contains documentation for all available providers for the **agents** API. diff --git a/docs/docs/providers/batches/index.mdx b/docs/docs/providers/batches/index.mdx index 2c64b277f..18e5e314d 100644 --- a/docs/docs/providers/batches/index.mdx +++ b/docs/docs/providers/batches/index.mdx @@ -1,14 +1,14 @@ --- description: "The Batches API enables efficient processing of multiple requests in a single operation, - particularly useful for processing large datasets, batch evaluation workflows, and - cost-effective inference at scale. +particularly useful for processing large datasets, batch evaluation workflows, and +cost-effective inference at scale. - The API is designed to allow use of openai client libraries for seamless integration. +The API is designed to allow use of openai client libraries for seamless integration. 
- This API provides the following extensions: - - idempotent batch creation +This API provides the following extensions: + - idempotent batch creation - Note: This API is currently under active development and may undergo changes." +Note: This API is currently under active development and may undergo changes." sidebar_label: Batches title: Batches --- @@ -18,14 +18,14 @@ title: Batches ## Overview The Batches API enables efficient processing of multiple requests in a single operation, - particularly useful for processing large datasets, batch evaluation workflows, and - cost-effective inference at scale. +particularly useful for processing large datasets, batch evaluation workflows, and +cost-effective inference at scale. - The API is designed to allow use of openai client libraries for seamless integration. +The API is designed to allow use of openai client libraries for seamless integration. - This API provides the following extensions: - - idempotent batch creation +This API provides the following extensions: + - idempotent batch creation - Note: This API is currently under active development and may undergo changes. +Note: This API is currently under active development and may undergo changes. This section contains documentation for all available providers for the **batches** API. diff --git a/docs/docs/providers/inference/index.mdx b/docs/docs/providers/inference/index.mdx index ebbaf1be1..1dc479675 100644 --- a/docs/docs/providers/inference/index.mdx +++ b/docs/docs/providers/inference/index.mdx @@ -1,9 +1,9 @@ --- description: "Llama Stack Inference API for generating completions, chat completions, and embeddings. - This API provides the raw interface to the underlying models. Two kinds of models are supported: - - LLM models: these models generate \"raw\" and \"chat\" (conversational) completions. - - Embedding models: these models generate embeddings to be used for semantic search." +This API provides the raw interface to the underlying models. Two kinds of models are supported: +- LLM models: these models generate \"raw\" and \"chat\" (conversational) completions. +- Embedding models: these models generate embeddings to be used for semantic search." sidebar_label: Inference title: Inference --- @@ -14,8 +14,8 @@ title: Inference Llama Stack Inference API for generating completions, chat completions, and embeddings. - This API provides the raw interface to the underlying models. Two kinds of models are supported: - - LLM models: these models generate "raw" and "chat" (conversational) completions. - - Embedding models: these models generate embeddings to be used for semantic search. +This API provides the raw interface to the underlying models. Two kinds of models are supported: +- LLM models: these models generate "raw" and "chat" (conversational) completions. +- Embedding models: these models generate embeddings to be used for semantic search. This section contains documentation for all available providers for the **inference** API. 
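The batches description above is explicit that the API is designed to work with the standard openai client libraries. As a minimal sketch of what that integration looks like against a Llama Stack deployment (the base URL, API key, and input file id below are illustrative assumptions, not values taken from this patch):

from openai import OpenAI

# Assumed local Llama Stack endpoint; adjust base_url for your deployment.
client = OpenAI(base_url="http://localhost:8321/v1", api_key="none")

# Hypothetical id of a previously uploaded JSONL file of batched requests.
batch = client.batches.create(
    input_file_id="file-abc123",
    endpoint="/v1/chat/completions",
    completion_window="24h",
)
print(batch.id, batch.status)

The same client can then poll client.batches.retrieve(batch.id) until the batch completes.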
diff --git a/docs/static/llama-stack-spec.html b/docs/static/llama-stack-spec.html index 9e28e0f42..9cf75631d 100644 --- a/docs/static/llama-stack-spec.html +++ b/docs/static/llama-stack-spec.html @@ -8614,6 +8614,12 @@ { "$ref": "#/components/schemas/OpenAIResponseInputFunctionToolCallOutput" }, + { + "$ref": "#/components/schemas/OpenAIResponseMCPApprovalRequest" + }, + { + "$ref": "#/components/schemas/OpenAIResponseMCPApprovalResponse" + }, { "$ref": "#/components/schemas/OpenAIResponseMessage" } @@ -9031,6 +9037,68 @@ "title": "OpenAIResponseInputToolWebSearch", "description": "Web search tool configuration for OpenAI response inputs." }, + "OpenAIResponseMCPApprovalRequest": { + "type": "object", + "properties": { + "arguments": { + "type": "string" + }, + "id": { + "type": "string" + }, + "name": { + "type": "string" + }, + "server_label": { + "type": "string" + }, + "type": { + "type": "string", + "const": "mcp_approval_request", + "default": "mcp_approval_request" + } + }, + "additionalProperties": false, + "required": [ + "arguments", + "id", + "name", + "server_label", + "type" + ], + "title": "OpenAIResponseMCPApprovalRequest", + "description": "A request for human approval of a tool invocation." + }, + "OpenAIResponseMCPApprovalResponse": { + "type": "object", + "properties": { + "approval_request_id": { + "type": "string" + }, + "approve": { + "type": "boolean" + }, + "type": { + "type": "string", + "const": "mcp_approval_response", + "default": "mcp_approval_response" + }, + "id": { + "type": "string" + }, + "reason": { + "type": "string" + } + }, + "additionalProperties": false, + "required": [ + "approval_request_id", + "approve", + "type" + ], + "title": "OpenAIResponseMCPApprovalResponse", + "description": "A response to an MCP approval request." + }, "OpenAIResponseMessage": { "type": "object", "properties": { @@ -9539,6 +9607,9 @@ }, { "$ref": "#/components/schemas/OpenAIResponseOutputMessageMCPListTools" + }, + { + "$ref": "#/components/schemas/OpenAIResponseMCPApprovalRequest" } ], "discriminator": { @@ -9549,7 +9620,8 @@ "file_search_call": "#/components/schemas/OpenAIResponseOutputMessageFileSearchToolCall", "function_call": "#/components/schemas/OpenAIResponseOutputMessageFunctionToolCall", "mcp_call": "#/components/schemas/OpenAIResponseOutputMessageMCPCall", - "mcp_list_tools": "#/components/schemas/OpenAIResponseOutputMessageMCPListTools" + "mcp_list_tools": "#/components/schemas/OpenAIResponseOutputMessageMCPListTools", + "mcp_approval_request": "#/components/schemas/OpenAIResponseMCPApprovalRequest" } } }, diff --git a/docs/static/llama-stack-spec.yaml b/docs/static/llama-stack-spec.yaml index 1c06c74a5..f204277bd 100644 --- a/docs/static/llama-stack-spec.yaml +++ b/docs/static/llama-stack-spec.yaml @@ -6254,6 +6254,8 @@ components: - $ref: '#/components/schemas/OpenAIResponseOutputMessageFileSearchToolCall' - $ref: '#/components/schemas/OpenAIResponseOutputMessageFunctionToolCall' - $ref: '#/components/schemas/OpenAIResponseInputFunctionToolCallOutput' + - $ref: '#/components/schemas/OpenAIResponseMCPApprovalRequest' + - $ref: '#/components/schemas/OpenAIResponseMCPApprovalResponse' - $ref: '#/components/schemas/OpenAIResponseMessage' "OpenAIResponseInputFunctionToolCallOutput": type: object @@ -6548,6 +6550,53 @@ components: title: OpenAIResponseInputToolWebSearch description: >- Web search tool configuration for OpenAI response inputs. 
+ OpenAIResponseMCPApprovalRequest: + type: object + properties: + arguments: + type: string + id: + type: string + name: + type: string + server_label: + type: string + type: + type: string + const: mcp_approval_request + default: mcp_approval_request + additionalProperties: false + required: + - arguments + - id + - name + - server_label + - type + title: OpenAIResponseMCPApprovalRequest + description: >- + A request for human approval of a tool invocation. + OpenAIResponseMCPApprovalResponse: + type: object + properties: + approval_request_id: + type: string + approve: + type: boolean + type: + type: string + const: mcp_approval_response + default: mcp_approval_response + id: + type: string + reason: + type: string + additionalProperties: false + required: + - approval_request_id + - approve + - type + title: OpenAIResponseMCPApprovalResponse + description: A response to an MCP approval request. OpenAIResponseMessage: type: object properties: @@ -6944,6 +6993,7 @@ components: - $ref: '#/components/schemas/OpenAIResponseOutputMessageFunctionToolCall' - $ref: '#/components/schemas/OpenAIResponseOutputMessageMCPCall' - $ref: '#/components/schemas/OpenAIResponseOutputMessageMCPListTools' + - $ref: '#/components/schemas/OpenAIResponseMCPApprovalRequest' discriminator: propertyName: type mapping: @@ -6953,6 +7003,7 @@ components: function_call: '#/components/schemas/OpenAIResponseOutputMessageFunctionToolCall' mcp_call: '#/components/schemas/OpenAIResponseOutputMessageMCPCall' mcp_list_tools: '#/components/schemas/OpenAIResponseOutputMessageMCPListTools' + mcp_approval_request: '#/components/schemas/OpenAIResponseMCPApprovalRequest' OpenAIResponseOutputMessageMCPCall: type: object properties: diff --git a/llama_stack/apis/agents/openai_responses.py b/llama_stack/apis/agents/openai_responses.py index 591992479..b449ff8c0 100644 --- a/llama_stack/apis/agents/openai_responses.py +++ b/llama_stack/apis/agents/openai_responses.py @@ -276,13 +276,40 @@ class OpenAIResponseOutputMessageMCPListTools(BaseModel): tools: list[MCPListToolsTool] +@json_schema_type +class OpenAIResponseMCPApprovalRequest(BaseModel): + """ + A request for human approval of a tool invocation. + """ + + arguments: str + id: str + name: str + server_label: str + type: Literal["mcp_approval_request"] = "mcp_approval_request" + + +@json_schema_type +class OpenAIResponseMCPApprovalResponse(BaseModel): + """ + A response to an MCP approval request. 
+ """ + + approval_request_id: str + approve: bool + type: Literal["mcp_approval_response"] = "mcp_approval_response" + id: str | None = None + reason: str | None = None + + OpenAIResponseOutput = Annotated[ OpenAIResponseMessage | OpenAIResponseOutputMessageWebSearchToolCall | OpenAIResponseOutputMessageFileSearchToolCall | OpenAIResponseOutputMessageFunctionToolCall | OpenAIResponseOutputMessageMCPCall - | OpenAIResponseOutputMessageMCPListTools, + | OpenAIResponseOutputMessageMCPListTools + | OpenAIResponseMCPApprovalRequest, Field(discriminator="type"), ] register_schema(OpenAIResponseOutput, name="OpenAIResponseOutput") @@ -725,6 +752,8 @@ OpenAIResponseInput = Annotated[ | OpenAIResponseOutputMessageFileSearchToolCall | OpenAIResponseOutputMessageFunctionToolCall | OpenAIResponseInputFunctionToolCallOutput + | OpenAIResponseMCPApprovalRequest + | OpenAIResponseMCPApprovalResponse | # Fallback to the generic message type as a last resort OpenAIResponseMessage, diff --git a/llama_stack/providers/inline/agents/meta_reference/responses/openai_responses.py b/llama_stack/providers/inline/agents/meta_reference/responses/openai_responses.py index c632e61aa..a14d9267e 100644 --- a/llama_stack/providers/inline/agents/meta_reference/responses/openai_responses.py +++ b/llama_stack/providers/inline/agents/meta_reference/responses/openai_responses.py @@ -238,6 +238,7 @@ class OpenAIResponsesImpl: temperature=temperature, response_format=response_format, ) + ctx.extract_approvals(input) # Create orchestrator and delegate streaming logic response_id = f"resp-{uuid.uuid4()}" diff --git a/llama_stack/providers/inline/agents/meta_reference/responses/streaming.py b/llama_stack/providers/inline/agents/meta_reference/responses/streaming.py index 3e69fa5cd..c8b4adbab 100644 --- a/llama_stack/providers/inline/agents/meta_reference/responses/streaming.py +++ b/llama_stack/providers/inline/agents/meta_reference/responses/streaming.py @@ -10,10 +10,12 @@ from typing import Any from llama_stack.apis.agents.openai_responses import ( AllowedToolsFilter, + ApprovalFilter, MCPListToolsTool, OpenAIResponseContentPartOutputText, OpenAIResponseInputTool, OpenAIResponseInputToolMCP, + OpenAIResponseMCPApprovalRequest, OpenAIResponseObject, OpenAIResponseObjectStream, OpenAIResponseObjectStreamResponseCompleted, @@ -117,10 +119,17 @@ class StreamingResponseOrchestrator: raise ValueError("Streaming chunk processor failed to return completion data") current_response = self._build_chat_completion(completion_result_data) - function_tool_calls, non_function_tool_calls, next_turn_messages = self._separate_tool_calls( + function_tool_calls, non_function_tool_calls, approvals, next_turn_messages = self._separate_tool_calls( current_response, messages ) + # add any approval requests required + for tool_call in approvals: + async for evt in self._add_mcp_approval_request( + tool_call.function.name, tool_call.function.arguments, output_messages + ): + yield evt + # Handle choices with no tool calls for choice in current_response.choices: if not (choice.message.tool_calls and self.ctx.response_tools): @@ -164,10 +173,11 @@ class StreamingResponseOrchestrator: # Emit response.completed yield OpenAIResponseObjectStreamResponseCompleted(response=final_response) - def _separate_tool_calls(self, current_response, messages) -> tuple[list, list, list]: + def _separate_tool_calls(self, current_response, messages) -> tuple[list, list, list, list]: """Separate tool calls into function and non-function categories.""" function_tool_calls = [] 
         non_function_tool_calls = []
+        approvals = []
         next_turn_messages = messages.copy()
 
         for choice in current_response.choices:
@@ -178,9 +188,23 @@ class StreamingResponseOrchestrator:
                 if is_function_tool_call(tool_call, self.ctx.response_tools):
                     function_tool_calls.append(tool_call)
                 else:
-                    non_function_tool_calls.append(tool_call)
+                    if self._approval_required(tool_call.function.name):
+                        approval_response = self.ctx.approval_response(
+                            tool_call.function.name, tool_call.function.arguments
+                        )
+                        if approval_response:
+                            if approval_response.approve:
+                                logger.info(f"Approval granted for {tool_call.id} on {tool_call.function.name}")
+                                non_function_tool_calls.append(tool_call)
+                            else:
+                                logger.info(f"Approval denied for {tool_call.id} on {tool_call.function.name}")
+                        else:
+                            logger.info(f"Requesting approval for {tool_call.id} on {tool_call.function.name}")
+                            approvals.append(tool_call)
+                    else:
+                        non_function_tool_calls.append(tool_call)
 
-        return function_tool_calls, non_function_tool_calls, next_turn_messages
+        return function_tool_calls, non_function_tool_calls, approvals, next_turn_messages
 
     async def _process_streaming_chunks(
         self, completion_result, output_messages: list[OpenAIResponseOutput]
@@ -632,3 +656,46 @@ class StreamingResponseOrchestrator:
             # TODO: Emit mcp_list_tools.failed event if needed
             logger.exception(f"Failed to list MCP tools from {mcp_tool.server_url}: {e}")
             raise
+
+    def _approval_required(self, tool_name: str) -> bool:
+        if tool_name not in self.mcp_tool_to_server:
+            return False
+        mcp_server = self.mcp_tool_to_server[tool_name]
+        if mcp_server.require_approval == "always":
+            return True
+        if mcp_server.require_approval == "never":
+            return False
+        if isinstance(mcp_server.require_approval, ApprovalFilter):
+            if tool_name in mcp_server.require_approval.always:
+                return True
+            if tool_name in mcp_server.require_approval.never:
+                return False
+        return True
+
+    async def _add_mcp_approval_request(
+        self, tool_name: str, arguments: str, output_messages: list[OpenAIResponseOutput]
+    ) -> AsyncIterator[OpenAIResponseObjectStream]:
+        mcp_server = self.mcp_tool_to_server[tool_name]
+        mcp_approval_request = OpenAIResponseMCPApprovalRequest(
+            arguments=arguments,
+            id=f"approval_{uuid.uuid4()}",
+            name=tool_name,
+            server_label=mcp_server.server_label,
+        )
+        output_messages.append(mcp_approval_request)
+
+        self.sequence_number += 1
+        yield OpenAIResponseObjectStreamResponseOutputItemAdded(
+            response_id=self.response_id,
+            item=mcp_approval_request,
+            output_index=len(output_messages) - 1,
+            sequence_number=self.sequence_number,
+        )
+
+        self.sequence_number += 1
+        yield OpenAIResponseObjectStreamResponseOutputItemDone(
+            response_id=self.response_id,
+            item=mcp_approval_request,
+            output_index=len(output_messages) - 1,
+            sequence_number=self.sequence_number,
+        )
diff --git a/llama_stack/providers/inline/agents/meta_reference/responses/types.py b/llama_stack/providers/inline/agents/meta_reference/responses/types.py
index 89086c262..5dcc9099d 100644
--- a/llama_stack/providers/inline/agents/meta_reference/responses/types.py
+++ b/llama_stack/providers/inline/agents/meta_reference/responses/types.py
@@ -11,6 +11,8 @@ from pydantic import BaseModel
 from llama_stack.apis.agents.openai_responses import (
     OpenAIResponseInputTool,
+    OpenAIResponseMCPApprovalRequest,
+    OpenAIResponseMCPApprovalResponse,
     OpenAIResponseObjectStream,
     OpenAIResponseOutput,
 )
@@ -58,3 +60,24 @@ class ChatCompletionContext(BaseModel):
     chat_tools: list[ChatCompletionToolParam] | None = None
     temperature: float | None
     response_format: OpenAIResponseFormatParam
+
+    approval_requests: list[OpenAIResponseMCPApprovalRequest] = []
+    approval_responses: dict[str, OpenAIResponseMCPApprovalResponse] = {}
+
+    def extract_approvals(self, inputs: str | list) -> None:
+        if not isinstance(inputs, str):
+            self.approval_requests = [item for item in inputs if item.type == "mcp_approval_request"]
+            self.approval_responses = {
+                item.approval_request_id: item for item in inputs if item.type == "mcp_approval_response"
+            }
+
+    def approval_response(self, tool_name: str, arguments: str) -> OpenAIResponseMCPApprovalResponse | None:
+        request = self._approval_request(tool_name, arguments)
+        if request and request.id in self.approval_responses:
+            return self.approval_responses[request.id]
+        return None
+
+    def _approval_request(self, tool_name: str, arguments: str) -> OpenAIResponseMCPApprovalRequest | None:
+        for request in self.approval_requests:
+            if request.name == tool_name and request.arguments == arguments:
+                return request
+        return None
diff --git a/llama_stack/providers/inline/agents/meta_reference/responses/utils.py b/llama_stack/providers/inline/agents/meta_reference/responses/utils.py
index 7aaeb4cd5..310a88298 100644
--- a/llama_stack/providers/inline/agents/meta_reference/responses/utils.py
+++ b/llama_stack/providers/inline/agents/meta_reference/responses/utils.py
@@ -13,6 +13,8 @@ from llama_stack.apis.agents.openai_responses import (
     OpenAIResponseInputMessageContentImage,
     OpenAIResponseInputMessageContentText,
     OpenAIResponseInputTool,
+    OpenAIResponseMCPApprovalRequest,
+    OpenAIResponseMCPApprovalResponse,
     OpenAIResponseMessage,
     OpenAIResponseOutputMessageContent,
     OpenAIResponseOutputMessageContentOutputText,
@@ -149,6 +151,11 @@ async def convert_response_input_to_chat_messages(
         elif isinstance(input_item, OpenAIResponseOutputMessageMCPListTools):
             # the tool list will be handled separately
             pass
+        elif isinstance(input_item, OpenAIResponseMCPApprovalRequest) or isinstance(
+            input_item, OpenAIResponseMCPApprovalResponse
+        ):
+            # these are handled by the responses impl itself and not passed through to chat completions
+            pass
         else:
             content = await convert_response_content_to_chat_content(input_item.content)
             message_type = await get_message_type_by_role(input_item.role)
diff --git a/tests/integration/responses/test_tool_responses.py b/tests/integration/responses/test_tool_responses.py
index c5c9e6fc1..f23734892 100644
--- a/tests/integration/responses/test_tool_responses.py
+++ b/tests/integration/responses/test_tool_responses.py
@@ -246,6 +246,82 @@ def test_response_sequential_mcp_tool(compat_client, text_model_id, case):
         assert "boiling point" in text_content.lower()
 
 
+@pytest.mark.parametrize("case", mcp_tool_test_cases)
+@pytest.mark.parametrize("approve", [True, False])
+def test_response_mcp_tool_approval(compat_client, text_model_id, case, approve):
+    if not isinstance(compat_client, LlamaStackAsLibraryClient):
+        pytest.skip("in-process MCP server is only supported in library client")
+
+    with make_mcp_server() as mcp_server_info:
+        tools = setup_mcp_tools(case.tools, mcp_server_info)
+        for tool in tools:
+            tool["require_approval"] = "always"
+
+        response = compat_client.responses.create(
+            model=text_model_id,
+            input=case.input,
+            tools=tools,
+            stream=False,
+        )
+
+        assert len(response.output) >= 2
+        list_tools = response.output[0]
+        assert list_tools.type == "mcp_list_tools"
+        assert list_tools.server_label == "localmcp"
+        assert len(list_tools.tools) == 2
+        assert {t.name for t in list_tools.tools} == {
+            "get_boiling_point",
+            "greet_everyone",
+        }
+
+        
approval_request = response.output[1] + assert approval_request.type == "mcp_approval_request" + assert approval_request.name == "get_boiling_point" + assert json.loads(approval_request.arguments) == { + "liquid_name": "myawesomeliquid", + "celsius": True, + } + + # send approval response + response = compat_client.responses.create( + previous_response_id=response.id, + model=text_model_id, + input=[{"type": "mcp_approval_response", "approval_request_id": approval_request.id, "approve": approve}], + tools=tools, + stream=False, + ) + + if approve: + assert len(response.output) >= 3 + list_tools = response.output[0] + assert list_tools.type == "mcp_list_tools" + assert list_tools.server_label == "localmcp" + assert len(list_tools.tools) == 2 + assert {t.name for t in list_tools.tools} == { + "get_boiling_point", + "greet_everyone", + } + + call = response.output[1] + assert call.type == "mcp_call" + assert call.name == "get_boiling_point" + assert json.loads(call.arguments) == { + "liquid_name": "myawesomeliquid", + "celsius": True, + } + assert call.error is None + assert "-100" in call.output + + # sometimes the model will call the tool again, so we need to get the last message + message = response.output[-1] + text_content = message.content[0].text + assert "boiling point" in text_content.lower() + else: + assert len(response.output) >= 1 + for output in response.output: + assert output.type != "mcp_call" + + @pytest.mark.parametrize("case", custom_tool_test_cases) def test_response_non_streaming_custom_tool(compat_client, text_model_id, case): response = compat_client.responses.create(
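The integration test above doubles as a walkthrough of the approval flow: a tool registered with require_approval="always" yields an mcp_approval_request instead of an mcp_call, and a follow-up turn carrying an mcp_approval_response unblocks (or suppresses) the actual tool invocation. A condensed sketch of the same round-trip from a plain OpenAI-compatible client follows; the model id, base URL, and MCP server endpoint are illustrative assumptions, not values from this patch.

from openai import OpenAI

client = OpenAI(base_url="http://localhost:8321/v1", api_key="none")

tools = [
    {
        "type": "mcp",
        "server_label": "localmcp",
        "server_url": "http://localhost:8000/sse",  # assumed MCP server endpoint
        "require_approval": "always",
    }
]

# First turn: the model wants a tool, so the output carries an
# mcp_approval_request rather than an mcp_call.
resp = client.responses.create(
    model="my-model",  # assumed model id
    input="What is the boiling point of water?",
    tools=tools,
)
request = next(o for o in resp.output if o.type == "mcp_approval_request")

# Second turn: answer the approval request; with approve=True the server
# executes the tool and the final output includes the mcp_call result.
final = client.responses.create(
    model="my-model",
    previous_response_id=resp.id,
    input=[
        {
            "type": "mcp_approval_response",
            "approval_request_id": request.id,
            "approve": True,
        }
    ],
    tools=tools,
)
print([o.type for o in final.output])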