Mirror of https://github.com/meta-llama/llama-stack.git (synced 2025-12-03 09:53:45 +00:00)
feat(tests): enable MCP tests in server mode (#4146)
We would like to run all OpenAI compatibility tests using only the openai client library. This is the most contributor-friendly setup, since tests can be run without updating the client SDKs (which is getting easier, but is still a long pole). This is the first step toward that: the "library client" is no longer used for any of the Responses tests. This seems like a reasonable trade-off, since using an embeddable library client for Responses (or any OpenAI-compatible behavior) appears to be uncommon. To do this, we needed to make the MCP tests, which previously only worked in library client mode, work in server mode.
Parent: 9eb81439d2
Commit: 1e81056a22
29 changed files with 13388 additions and 127 deletions
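For context, a rough sketch of what a server-mode `responses_client` fixture could look like, using only the plain openai client pointed at a running Llama Stack server. The actual fixture is defined in the test suite's conftest and is not part of this excerpt; the fixture name comes from the diff below, while the environment variable and default URL here are illustrative assumptions.

```python
# Hypothetical sketch, not the fixture shipped in this PR.
import os

import pytest
from openai import OpenAI


@pytest.fixture
def responses_client():
    # Point the plain OpenAI client at the stack server's OpenAI-compatible
    # endpoint; no embedded "library client" is needed for Responses tests.
    base_url = os.environ.get("LLAMA_STACK_BASE_URL", "http://localhost:8321/v1")  # assumed env var / port
    return OpenAI(base_url=base_url, api_key="none")
```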
@@ -9,6 +9,7 @@ import logging  # allow-direct-logging
 import os

 import httpx
+import llama_stack_client
 import openai
 import pytest

@@ -29,8 +30,8 @@ from .streaming_assertions import StreamingValidator


 @pytest.mark.parametrize("case", web_search_test_cases)
-def test_response_non_streaming_web_search(compat_client, text_model_id, case):
-    response = compat_client.responses.create(
+def test_response_non_streaming_web_search(responses_client, text_model_id, case):
+    response = responses_client.responses.create(
         model=text_model_id,
         input=case.input,
         tools=case.tools,
@@ -48,12 +49,9 @@ def test_response_non_streaming_web_search(compat_client, text_model_id, case):

 @pytest.mark.parametrize("case", file_search_test_cases)
 def test_response_non_streaming_file_search(
-    compat_client, text_model_id, embedding_model_id, embedding_dimension, tmp_path, case
+    responses_client, text_model_id, embedding_model_id, embedding_dimension, tmp_path, case
 ):
-    if isinstance(compat_client, LlamaStackAsLibraryClient):
-        pytest.skip("Responses API file search is not yet supported in library client.")
-
-    vector_store = new_vector_store(compat_client, "test_vector_store", embedding_model_id, embedding_dimension)
+    vector_store = new_vector_store(responses_client, "test_vector_store", embedding_model_id, embedding_dimension)

     if case.file_content:
         file_name = "test_response_non_streaming_file_search.txt"
@@ -65,16 +63,16 @@ def test_response_non_streaming_file_search(
     else:
         raise ValueError("No file content or path provided for case")

-    file_response = upload_file(compat_client, file_name, file_path)
+    file_response = upload_file(responses_client, file_name, file_path)

     # Attach our file to the vector store
-    compat_client.vector_stores.files.create(
+    responses_client.vector_stores.files.create(
         vector_store_id=vector_store.id,
         file_id=file_response.id,
     )

     # Wait for the file to be attached
-    wait_for_file_attachment(compat_client, vector_store.id, file_response.id)
+    wait_for_file_attachment(responses_client, vector_store.id, file_response.id)

     # Update our tools with the right vector store id
     tools = case.tools
@@ -83,7 +81,7 @@ def test_response_non_streaming_file_search(
             tool["vector_store_ids"] = [vector_store.id]

     # Create the response request, which should query our vector store
-    response = compat_client.responses.create(
+    response = responses_client.responses.create(
         model=text_model_id,
         input=case.input,
         tools=tools,
@@ -105,15 +103,12 @@ def test_response_non_streaming_file_search(


 def test_response_non_streaming_file_search_empty_vector_store(
-    compat_client, text_model_id, embedding_model_id, embedding_dimension
+    responses_client, text_model_id, embedding_model_id, embedding_dimension
 ):
-    if isinstance(compat_client, LlamaStackAsLibraryClient):
-        pytest.skip("Responses API file search is not yet supported in library client.")
-
-    vector_store = new_vector_store(compat_client, "test_vector_store", embedding_model_id, embedding_dimension)
+    vector_store = new_vector_store(responses_client, "test_vector_store", embedding_model_id, embedding_dimension)

     # Create the response request, which should query our vector store
-    response = compat_client.responses.create(
+    response = responses_client.responses.create(
         model=text_model_id,
         input="How many experts does the Llama 4 Maverick model have?",
         tools=[{"type": "file_search", "vector_store_ids": [vector_store.id]}],
@@ -133,13 +128,10 @@ def test_response_non_streaming_file_search_empty_vector_store(


 def test_response_sequential_file_search(
-    compat_client, text_model_id, embedding_model_id, embedding_dimension, tmp_path
+    responses_client, text_model_id, embedding_model_id, embedding_dimension, tmp_path
 ):
     """Test file search with sequential responses using previous_response_id."""
-    if isinstance(compat_client, LlamaStackAsLibraryClient):
-        pytest.skip("Responses API file search is not yet supported in library client.")
-
-    vector_store = new_vector_store(compat_client, "test_vector_store", embedding_model_id, embedding_dimension)
+    vector_store = new_vector_store(responses_client, "test_vector_store", embedding_model_id, embedding_dimension)

     # Create a test file with content
     file_content = "The Llama 4 Maverick model has 128 experts in its mixture of experts architecture."
@@ -147,21 +139,21 @@ def test_response_sequential_file_search(
     file_path = tmp_path / file_name
     file_path.write_text(file_content)

-    file_response = upload_file(compat_client, file_name, file_path)
+    file_response = upload_file(responses_client, file_name, file_path)

     # Attach the file to the vector store
-    compat_client.vector_stores.files.create(
+    responses_client.vector_stores.files.create(
         vector_store_id=vector_store.id,
         file_id=file_response.id,
     )

     # Wait for the file to be attached
-    wait_for_file_attachment(compat_client, vector_store.id, file_response.id)
+    wait_for_file_attachment(responses_client, vector_store.id, file_response.id)

     tools = [{"type": "file_search", "vector_store_ids": [vector_store.id]}]

     # First response request with file search
-    response = compat_client.responses.create(
+    response = responses_client.responses.create(
         model=text_model_id,
         input="How many experts does the Llama 4 Maverick model have?",
         tools=tools,
@@ -178,7 +170,7 @@ def test_response_sequential_file_search(
     assert "128" in response.output_text or "experts" in response.output_text.lower()

     # Second response request using previous_response_id
-    response2 = compat_client.responses.create(
+    response2 = responses_client.responses.create(
         model=text_model_id,
         input="Can you tell me more about the architecture?",
         tools=tools,
@@ -199,14 +191,11 @@ def test_response_sequential_file_search(


 @pytest.mark.parametrize("case", mcp_tool_test_cases)
-def test_response_non_streaming_mcp_tool(compat_client, text_model_id, case, caplog):
-    if not isinstance(compat_client, LlamaStackAsLibraryClient):
-        pytest.skip("in-process MCP server is only supported in library client")
-
+def test_response_non_streaming_mcp_tool(responses_client, text_model_id, case, caplog):
     with make_mcp_server() as mcp_server_info:
         tools = setup_mcp_tools(case.tools, mcp_server_info)

-        response = compat_client.responses.create(
+        response = responses_client.responses.create(
             model=text_model_id,
             input=case.input,
             tools=tools,
@@ -243,15 +232,15 @@ def test_response_non_streaming_mcp_tool(compat_client, text_model_id, case, caplog):

         exc_type = (
             AuthenticationRequiredError
-            if isinstance(compat_client, LlamaStackAsLibraryClient)
-            else (httpx.HTTPStatusError, openai.AuthenticationError)
+            if isinstance(responses_client, LlamaStackAsLibraryClient)
+            else (httpx.HTTPStatusError, openai.AuthenticationError, llama_stack_client.AuthenticationError)
         )
         # Suppress expected auth error logs only for the failing auth attempt
         with caplog.at_level(
             logging.CRITICAL, logger="llama_stack.providers.inline.agents.meta_reference.responses.streaming"
         ):
             with pytest.raises(exc_type):
-                compat_client.responses.create(
+                responses_client.responses.create(
                     model=text_model_id,
                     input=case.input,
                     tools=tools,
@@ -262,7 +251,7 @@ def test_response_non_streaming_mcp_tool(compat_client, text_model_id, case, caplog):
             if tool["type"] == "mcp":
                 tool["headers"] = {"Authorization": "Bearer test-token"}

-        response = compat_client.responses.create(
+        response = responses_client.responses.create(
             model=text_model_id,
             input=case.input,
             tools=tools,
@@ -272,14 +261,11 @@ def test_response_non_streaming_mcp_tool(compat_client, text_model_id, case, caplog):


 @pytest.mark.parametrize("case", mcp_tool_test_cases)
-def test_response_sequential_mcp_tool(compat_client, text_model_id, case):
-    if not isinstance(compat_client, LlamaStackAsLibraryClient):
-        pytest.skip("in-process MCP server is only supported in library client")
-
+def test_response_sequential_mcp_tool(responses_client, text_model_id, case):
     with make_mcp_server() as mcp_server_info:
         tools = setup_mcp_tools(case.tools, mcp_server_info)

-        response = compat_client.responses.create(
+        response = responses_client.responses.create(
             model=text_model_id,
             input=case.input,
             tools=tools,
@@ -311,7 +297,7 @@ def test_response_sequential_mcp_tool(compat_client, text_model_id, case):
         text_content = message.content[0].text
         assert "boiling point" in text_content.lower()

-        response2 = compat_client.responses.create(
+        response2 = responses_client.responses.create(
             model=text_model_id, input=case.input, tools=tools, stream=False, previous_response_id=response.id
         )

@@ -323,16 +309,13 @@ def test_response_sequential_mcp_tool(compat_client, text_model_id, case):

 @pytest.mark.parametrize("case", mcp_tool_test_cases)
 @pytest.mark.parametrize("approve", [True, False])
-def test_response_mcp_tool_approval(compat_client, text_model_id, case, approve):
-    if not isinstance(compat_client, LlamaStackAsLibraryClient):
-        pytest.skip("in-process MCP server is only supported in library client")
-
+def test_response_mcp_tool_approval(responses_client, text_model_id, case, approve):
     with make_mcp_server() as mcp_server_info:
         tools = setup_mcp_tools(case.tools, mcp_server_info)
         for tool in tools:
             tool["require_approval"] = "always"

-        response = compat_client.responses.create(
+        response = responses_client.responses.create(
             model=text_model_id,
             input=case.input,
             tools=tools,
@@ -352,13 +335,13 @@ def test_response_mcp_tool_approval(compat_client, text_model_id, case, approve):
         approval_request = response.output[1]
         assert approval_request.type == "mcp_approval_request"
         assert approval_request.name == "get_boiling_point"
-        assert json.loads(approval_request.arguments) == {
-            "liquid_name": "myawesomeliquid",
-            "celsius": True,
-        }
+        args = json.loads(approval_request.arguments)
+        assert args["liquid_name"] == "myawesomeliquid"
+        # celsius has a default value of True, so it may be omitted or explicitly set
+        assert args.get("celsius", True) is True

         # send approval response
-        response = compat_client.responses.create(
+        response = responses_client.responses.create(
             previous_response_id=response.id,
             model=text_model_id,
             input=[{"type": "mcp_approval_response", "approval_request_id": approval_request.id, "approve": approve}],
@@ -398,8 +381,8 @@ def test_response_mcp_tool_approval(compat_client, text_model_id, case, approve):


 @pytest.mark.parametrize("case", custom_tool_test_cases)
-def test_response_non_streaming_custom_tool(compat_client, text_model_id, case):
-    response = compat_client.responses.create(
+def test_response_non_streaming_custom_tool(responses_client, text_model_id, case):
+    response = responses_client.responses.create(
         model=text_model_id,
         input=case.input,
         tools=case.tools,
@@ -412,8 +395,8 @@ def test_response_non_streaming_custom_tool(compat_client, text_model_id, case):


 @pytest.mark.parametrize("case", custom_tool_test_cases)
-def test_response_function_call_ordering_1(compat_client, text_model_id, case):
-    response = compat_client.responses.create(
+def test_response_function_call_ordering_1(responses_client, text_model_id, case):
+    response = responses_client.responses.create(
         model=text_model_id,
         input=case.input,
         tools=case.tools,
@@ -437,13 +420,13 @@ def test_response_function_call_ordering_1(compat_client, text_model_id, case):
             "call_id": response.output[0].call_id,
         }
     )
-    response = compat_client.responses.create(
+    response = responses_client.responses.create(
         model=text_model_id, input=inputs, tools=case.tools, stream=False, previous_response_id=response.id
     )
     assert len(response.output) == 1


-def test_response_function_call_ordering_2(compat_client, text_model_id):
+def test_response_function_call_ordering_2(responses_client, text_model_id):
     tools = [
         {
             "type": "function",
@@ -468,7 +451,7 @@ def test_response_function_call_ordering_2(compat_client, text_model_id):
             "content": "Is the weather better in San Francisco or Los Angeles?",
         }
     ]
-    response = compat_client.responses.create(
+    response = responses_client.responses.create(
         model=text_model_id,
         input=inputs,
         tools=tools,
@@ -489,7 +472,7 @@ def test_response_function_call_ordering_2(compat_client, text_model_id):
                 "call_id": output.call_id,
             }
         )
-    response = compat_client.responses.create(
+    response = responses_client.responses.create(
         model=text_model_id,
         input=inputs,
         tools=tools,
@@ -500,15 +483,12 @@ def test_response_function_call_ordering_2(compat_client, text_model_id):


 @pytest.mark.parametrize("case", multi_turn_tool_execution_test_cases)
-def test_response_non_streaming_multi_turn_tool_execution(compat_client, text_model_id, case):
+def test_response_non_streaming_multi_turn_tool_execution(responses_client, text_model_id, case):
     """Test multi-turn tool execution where multiple MCP tool calls are performed in sequence."""
-    if not isinstance(compat_client, LlamaStackAsLibraryClient):
-        pytest.skip("in-process MCP server is only supported in library client")
-
     with make_mcp_server(tools=dependency_tools()) as mcp_server_info:
         tools = setup_mcp_tools(case.tools, mcp_server_info)

-        response = compat_client.responses.create(
+        response = responses_client.responses.create(
             input=case.input,
             model=text_model_id,
             tools=tools,
@@ -550,15 +530,12 @@ def test_response_non_streaming_multi_turn_tool_execution(compat_client, text_model_id, case):


 @pytest.mark.parametrize("case", multi_turn_tool_execution_streaming_test_cases)
-def test_response_streaming_multi_turn_tool_execution(compat_client, text_model_id, case):
+def test_response_streaming_multi_turn_tool_execution(responses_client, text_model_id, case):
     """Test streaming multi-turn tool execution where multiple MCP tool calls are performed in sequence."""
-    if not isinstance(compat_client, LlamaStackAsLibraryClient):
-        pytest.skip("in-process MCP server is only supported in library client")
-
    with make_mcp_server(tools=dependency_tools()) as mcp_server_info:
         tools = setup_mcp_tools(case.tools, mcp_server_info)

-        stream = compat_client.responses.create(
+        stream = responses_client.responses.create(
             input=case.input,
             model=text_model_id,
             tools=tools,
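The reason the MCP tests no longer need the library client: `make_mcp_server()` starts an HTTP-reachable MCP server, so a separately running stack server can call it just as the in-process library client could. As a rough illustration of what a helper like `setup_mcp_tools` might do (the real helper and the exact shape of `mcp_server_info` are defined elsewhere in the test suite; the `server_url` key below is an assumption):

```python
# Illustrative sketch only; the field name on mcp_server_info is assumed.
def setup_mcp_tools(tools: list[dict], mcp_server_info: dict) -> list[dict]:
    configured = []
    for tool in tools:
        tool = dict(tool)  # avoid mutating the shared test case definition
        if tool.get("type") == "mcp":
            # Point the tool definition at the just-started MCP server so the
            # stack server (or the library client) can reach it over HTTP.
            tool["server_url"] = mcp_server_info["server_url"]
        configured.append(tool)
    return configured
```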