Merge branch 'main' into vllm_health_check

Sumit Jaiswal 2025-06-05 18:09:36 +05:30 committed by GitHub
commit c18b585d32
143 changed files with 9210 additions and 5347 deletions

View file

@@ -59,6 +59,7 @@ async def agents_impl(config, mock_apis):
mock_apis["safety_api"],
mock_apis["tool_runtime_api"],
mock_apis["tool_groups_api"],
{},
)
await impl.initialize()
yield impl

View file

@@ -25,11 +25,17 @@ from llama_stack.apis.agents.openai_responses import (
OpenAIResponseObjectWithInput,
OpenAIResponseOutputMessageContentOutputText,
OpenAIResponseOutputMessageWebSearchToolCall,
OpenAIResponseText,
OpenAIResponseTextFormat,
)
from llama_stack.apis.inference.inference import (
OpenAIAssistantMessageParam,
OpenAIChatCompletionContentPartTextParam,
OpenAIDeveloperMessageParam,
OpenAIJSONSchema,
OpenAIResponseFormatJSONObject,
OpenAIResponseFormatJSONSchema,
OpenAIResponseFormatText,
OpenAIUserMessageParam,
)
from llama_stack.apis.tools.tools import Tool, ToolGroups, ToolInvocationResult, ToolParameter, ToolRuntime
@@ -96,6 +102,7 @@ async def test_create_openai_response_with_string_input(openai_responses_impl, m
mock_inference_api.openai_chat_completion.assert_called_once_with(
model=model,
messages=[OpenAIUserMessageParam(role="user", content="What is the capital of Ireland?", name=None)],
response_format=OpenAIResponseFormatText(),
tools=None,
stream=False,
temperature=0.1,
@@ -224,16 +231,16 @@ async def test_create_openai_response_with_tool_call_type_none(openai_responses_
],
)
# Verify
# Check that we got the content from our mocked tool execution result
chunks = [chunk async for chunk in result]
assert len(chunks) == 2 # Should have response.created and response.completed
# Verify inference API was called correctly (after iterating over result)
first_call = mock_inference_api.openai_chat_completion.call_args_list[0]
assert first_call.kwargs["messages"][0].content == input_text
assert first_call.kwargs["tools"] is not None
assert first_call.kwargs["temperature"] == 0.1
# Check that we got the content from our mocked tool execution result
chunks = [chunk async for chunk in result]
assert len(chunks) == 2 # Should have response.created and response.completed
# Check response.created event (should have empty output)
assert chunks[0].type == "response.created"
assert len(chunks[0].response.output) == 0
@@ -320,6 +327,7 @@ async def test_prepend_previous_response_basic(openai_responses_impl, mock_respo
model="fake_model",
output=[response_output_message],
status="completed",
text=OpenAIResponseText(format=OpenAIResponseTextFormat(type="text")),
input=[input_item_message],
)
mock_responses_store.get_response_object.return_value = previous_response
@@ -362,6 +370,7 @@ async def test_prepend_previous_response_web_search(openai_responses_impl, mock_
model="fake_model",
output=[output_web_search, output_message],
status="completed",
text=OpenAIResponseText(format=OpenAIResponseTextFormat(type="text")),
input=[input_item_message],
)
mock_responses_store.get_response_object.return_value = response
@@ -483,6 +492,7 @@ async def test_create_openai_response_with_instructions_and_previous_response(
model="fake_model",
output=[response_output_message],
status="completed",
text=OpenAIResponseText(format=OpenAIResponseTextFormat(type="text")),
input=[input_item_message],
)
mock_responses_store.get_response_object.return_value = response
@@ -576,6 +586,7 @@ async def test_responses_store_list_input_items_logic():
object="response",
status="completed",
output=[],
text=OpenAIResponseText(format=(OpenAIResponseTextFormat(type="text"))),
input=input_items,
)
@@ -644,6 +655,7 @@ async def test_store_response_uses_rehydrated_input_with_previous_response(
created_at=1234567890,
model="meta-llama/Llama-3.1-8B-Instruct",
status="completed",
text=OpenAIResponseText(format=OpenAIResponseTextFormat(type="text")),
input=[
OpenAIResponseMessage(
id="msg-prev-user", role="user", content=[OpenAIResponseInputMessageContentText(text="What is 2+2?")]
@@ -694,3 +706,61 @@ async def test_store_response_uses_rehydrated_input_with_previous_response(
# Verify the response itself is correct
assert result.model == model
assert result.status == "completed"
@pytest.mark.asyncio
@pytest.mark.parametrize(
"text_format, response_format",
[
(OpenAIResponseText(format=OpenAIResponseTextFormat(type="text")), OpenAIResponseFormatText()),
(
OpenAIResponseText(format=OpenAIResponseTextFormat(name="Test", schema={"foo": "bar"}, type="json_schema")),
OpenAIResponseFormatJSONSchema(json_schema=OpenAIJSONSchema(name="Test", schema={"foo": "bar"})),
),
(OpenAIResponseText(format=OpenAIResponseTextFormat(type="json_object")), OpenAIResponseFormatJSONObject()),
# ensure text param with no format specified defaults to text
(OpenAIResponseText(format=None), OpenAIResponseFormatText()),
# ensure text param of None defaults to text
(None, OpenAIResponseFormatText()),
],
)
async def test_create_openai_response_with_text_format(
openai_responses_impl, mock_inference_api, text_format, response_format
):
"""Test creating Responses with text formats."""
# Setup
input_text = "How hot it is in San Francisco today?"
model = "meta-llama/Llama-3.1-8B-Instruct"
# Load the chat completion fixture
mock_chat_completion = load_chat_completion_fixture("simple_chat_completion.yaml")
mock_inference_api.openai_chat_completion.return_value = mock_chat_completion
# Execute
_result = await openai_responses_impl.create_openai_response(
input=input_text,
model=model,
text=text_format,
)
# Verify
first_call = mock_inference_api.openai_chat_completion.call_args_list[0]
assert first_call.kwargs["messages"][0].content == input_text
assert first_call.kwargs["response_format"] is not None
assert first_call.kwargs["response_format"] == response_format
@pytest.mark.asyncio
async def test_create_openai_response_with_invalid_text_format(openai_responses_impl, mock_inference_api):
"""Test creating an OpenAI response with an invalid text format."""
# Setup
input_text = "How hot it is in San Francisco today?"
model = "meta-llama/Llama-3.1-8B-Instruct"
# Execute
with pytest.raises(ValueError):
_result = await openai_responses_impl.create_openai_response(
input=input_text,
model=model,
text=OpenAIResponseText(format={"type": "invalid"}),
)
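For context (not part of the diff): the parametrized cases above imply a mapping from the Responses `text` parameter to the chat-completion `response_format`. A minimal sketch of that mapping, assuming the imports shown in this file's hunk and treating OpenAIResponseTextFormat as a mapping; the helper name is hypothetical, not necessarily what the implementation uses:

from llama_stack.apis.inference.inference import (
    OpenAIJSONSchema,
    OpenAIResponseFormatJSONObject,
    OpenAIResponseFormatJSONSchema,
    OpenAIResponseFormatText,
)

def convert_text_to_response_format(text):
    # Hypothetical helper mirroring the expectations of the parametrized test above.
    # A missing text, or a text with no format, defaults to plain text output.
    if text is None or text.format is None:
        return OpenAIResponseFormatText()
    fmt_type = text.format["type"]
    if fmt_type == "text":
        return OpenAIResponseFormatText()
    if fmt_type == "json_object":
        return OpenAIResponseFormatJSONObject()
    if fmt_type == "json_schema":
        return OpenAIResponseFormatJSONSchema(
            json_schema=OpenAIJSONSchema(name=text.format["name"], schema=text.format["schema"])
        )
    # Anything else (e.g. {"type": "invalid"}) is rejected, as the last test expects.
    raise ValueError(f"Unsupported text format: {text.format}")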

View file

@@ -12,24 +12,24 @@ import pytest
from llama_stack.apis.agents import Turn
from llama_stack.apis.inference import CompletionMessage, StopReason
from llama_stack.distribution.datatypes import AccessAttributes
from llama_stack.distribution.datatypes import User
from llama_stack.providers.inline.agents.meta_reference.persistence import AgentPersistence, AgentSessionInfo
@pytest.fixture
async def test_setup(sqlite_kvstore):
agent_persistence = AgentPersistence(agent_id="test_agent", kvstore=sqlite_kvstore)
agent_persistence = AgentPersistence(agent_id="test_agent", kvstore=sqlite_kvstore, policy={})
yield agent_persistence
@pytest.mark.asyncio
@patch("llama_stack.providers.inline.agents.meta_reference.persistence.get_auth_attributes")
async def test_session_creation_with_access_attributes(mock_get_auth_attributes, test_setup):
@patch("llama_stack.providers.inline.agents.meta_reference.persistence.get_authenticated_user")
async def test_session_creation_with_access_attributes(mock_get_authenticated_user, test_setup):
agent_persistence = test_setup
# Set creator's attributes for the session
creator_attributes = {"roles": ["researcher"], "teams": ["ai-team"]}
mock_get_auth_attributes.return_value = creator_attributes
mock_get_authenticated_user.return_value = User("test_user", creator_attributes)
# Create a session
session_id = await agent_persistence.create_session("Test Session")
@@ -37,14 +37,15 @@ async def test_session_creation_with_access_attributes(mock_get_auth_attributes,
# Get the session and verify access attributes were set
session_info = await agent_persistence.get_session_info(session_id)
assert session_info is not None
assert session_info.access_attributes is not None
assert session_info.access_attributes.roles == ["researcher"]
assert session_info.access_attributes.teams == ["ai-team"]
assert session_info.owner is not None
assert session_info.owner.attributes is not None
assert session_info.owner.attributes["roles"] == ["researcher"]
assert session_info.owner.attributes["teams"] == ["ai-team"]
@pytest.mark.asyncio
@patch("llama_stack.providers.inline.agents.meta_reference.persistence.get_auth_attributes")
async def test_session_access_control(mock_get_auth_attributes, test_setup):
@patch("llama_stack.providers.inline.agents.meta_reference.persistence.get_authenticated_user")
async def test_session_access_control(mock_get_authenticated_user, test_setup):
agent_persistence = test_setup
# Create a session with specific access attributes
@@ -53,8 +54,9 @@ async def test_session_access_control(mock_get_auth_attributes, test_setup):
session_id=session_id,
session_name="Restricted Session",
started_at=datetime.now(),
access_attributes=AccessAttributes(roles=["admin"], teams=["security-team"]),
owner=User("someone", {"roles": ["admin"], "teams": ["security-team"]}),
turns=[],
identifier="Restricted Session",
)
await agent_persistence.kvstore.set(
@@ -63,20 +65,22 @@ async def test_session_access_control(mock_get_auth_attributes, test_setup):
)
# User with matching attributes can access
mock_get_auth_attributes.return_value = {"roles": ["admin", "user"], "teams": ["security-team", "other-team"]}
mock_get_authenticated_user.return_value = User(
"testuser", {"roles": ["admin", "user"], "teams": ["security-team", "other-team"]}
)
retrieved_session = await agent_persistence.get_session_info(session_id)
assert retrieved_session is not None
assert retrieved_session.session_id == session_id
# User without matching attributes cannot access
mock_get_auth_attributes.return_value = {"roles": ["user"], "teams": ["other-team"]}
mock_get_authenticated_user.return_value = User("testuser", {"roles": ["user"], "teams": ["other-team"]})
retrieved_session = await agent_persistence.get_session_info(session_id)
assert retrieved_session is None
@pytest.mark.asyncio
@patch("llama_stack.providers.inline.agents.meta_reference.persistence.get_auth_attributes")
async def test_turn_access_control(mock_get_auth_attributes, test_setup):
@patch("llama_stack.providers.inline.agents.meta_reference.persistence.get_authenticated_user")
async def test_turn_access_control(mock_get_authenticated_user, test_setup):
agent_persistence = test_setup
# Create a session with restricted access
@@ -85,8 +89,9 @@ async def test_turn_access_control(mock_get_auth_attributes, test_setup):
session_id=session_id,
session_name="Restricted Session",
started_at=datetime.now(),
access_attributes=AccessAttributes(roles=["admin"]),
owner=User("someone", {"roles": ["admin"]}),
turns=[],
identifier="Restricted Session",
)
await agent_persistence.kvstore.set(
@@ -109,7 +114,7 @@ async def test_turn_access_control(mock_get_auth_attributes, test_setup):
)
# Admin can add turn
mock_get_auth_attributes.return_value = {"roles": ["admin"]}
mock_get_authenticated_user.return_value = User("testuser", {"roles": ["admin"]})
await agent_persistence.add_turn_to_session(session_id, turn)
# Admin can get turn
@@ -118,7 +123,7 @@ async def test_turn_access_control(mock_get_auth_attributes, test_setup):
assert retrieved_turn.turn_id == turn_id
# Regular user cannot get turn
mock_get_auth_attributes.return_value = {"roles": ["user"]}
mock_get_authenticated_user.return_value = User("testuser", {"roles": ["user"]})
with pytest.raises(ValueError):
await agent_persistence.get_session_turn(session_id, turn_id)
@@ -128,8 +133,8 @@ async def test_turn_access_control(mock_get_auth_attributes, test_setup):
@pytest.mark.asyncio
@patch("llama_stack.providers.inline.agents.meta_reference.persistence.get_auth_attributes")
async def test_tool_call_and_infer_iters_access_control(mock_get_auth_attributes, test_setup):
@patch("llama_stack.providers.inline.agents.meta_reference.persistence.get_authenticated_user")
async def test_tool_call_and_infer_iters_access_control(mock_get_authenticated_user, test_setup):
agent_persistence = test_setup
# Create a session with restricted access
@@ -138,8 +143,9 @@ async def test_tool_call_and_infer_iters_access_control(mock_get_auth_attributes
session_id=session_id,
session_name="Restricted Session",
started_at=datetime.now(),
access_attributes=AccessAttributes(roles=["admin"]),
owner=User("someone", {"roles": ["admin"]}),
turns=[],
identifier="Restricted Session",
)
await agent_persistence.kvstore.set(
@@ -150,7 +156,7 @@ async def test_tool_call_and_infer_iters_access_control(mock_get_auth_attributes
turn_id = str(uuid.uuid4())
# Admin user can set inference iterations
mock_get_auth_attributes.return_value = {"roles": ["admin"]}
mock_get_authenticated_user.return_value = User("testuser", {"roles": ["admin"]})
await agent_persistence.set_num_infer_iters_in_turn(session_id, turn_id, 5)
# Admin user can get inference iterations
@@ -158,7 +164,7 @@ async def test_tool_call_and_infer_iters_access_control(mock_get_auth_attributes
assert infer_iters == 5
# Regular user cannot get inference iterations
mock_get_auth_attributes.return_value = {"roles": ["user"]}
mock_get_authenticated_user.return_value = User("testuser", {"roles": ["user"]})
infer_iters = await agent_persistence.get_num_infer_iters_in_turn(session_id, turn_id)
assert infer_iters is None
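For context (not part of the diff): these tests now key access control off the session owner returned by get_authenticated_user rather than raw auth attributes. A rough standalone sketch of the attribute rule the assertions imply, offered as an assumption rather than the library's actual policy logic: the caller must share at least one value with the owner in every attribute category the owner has.

def can_access(owner_attrs: dict[str, list[str]], caller_attrs: dict[str, list[str]]) -> bool:
    # Assumed default rule, inferred from the test expectations above.
    for key, values in owner_attrs.items():
        if not set(values) & set(caller_attrs.get(key, [])):
            return False
    return True

# Consistent with the assertions above:
assert can_access({"roles": ["admin"], "teams": ["security-team"]},
                  {"roles": ["admin", "user"], "teams": ["security-team", "other-team"]})
assert not can_access({"roles": ["admin"]}, {"roles": ["user"]})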

View file

@@ -70,9 +70,12 @@ class MockInferenceAdapterWithSleep:
# ruff: noqa: N802
def do_POST(self):
time.sleep(sleep_time)
response_body = json.dumps(response).encode("utf-8")
self.send_response(code=200)
self.send_header("Content-Type", "application/json")
self.send_header("Content-Length", len(response_body))
self.end_headers()
self.wfile.write(json.dumps(response).encode("utf-8"))
self.wfile.write(response_body)
self.request_handler = DelayedRequestHandler
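For context (not part of the diff): the handler change buffers the JSON body before sending headers so an exact Content-Length can be included, letting clients that rely on that header (instead of reading until EOF) know where the response ends. A minimal standalone sketch of the same pattern, with an assumed payload:

import json
from http.server import BaseHTTPRequestHandler

class JSONHandler(BaseHTTPRequestHandler):
    def do_POST(self):
        # Encode the body up front so Content-Length is exact.
        body = json.dumps({"ok": True}).encode("utf-8")  # assumed payload
        self.send_response(200)
        self.send_header("Content-Type", "application/json")
        self.send_header("Content-Length", str(len(body)))
        self.end_headers()
        self.wfile.write(body)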