Pass parallel_tool_calls directly and document intended usage in integration test

Signed-off-by: Anastas Stoyanovsky <astoyano@redhat.com>
Anastas Stoyanovsky 2025-11-19 14:33:54 -05:00
parent 91f1b352b4
commit 958d0dc515
8 changed files with 31 additions and 196 deletions


@@ -13,6 +13,6 @@ title: Agents
Agents
APIs for creating and interacting with agentic systems.
This section contains documentation for all available providers for the **agents** API.


@@ -19,14 +19,14 @@ title: Batches
## Overview
The Batches API enables efficient processing of multiple requests in a single operation,
particularly useful for processing large datasets, batch evaluation workflows, and
cost-effective inference at scale.
The API is designed to allow use of OpenAI client libraries for seamless integration.
This API provides the following extensions:
- idempotent batch creation
Note: This API is currently under active development and may undergo changes.
This section contains documentation for all available providers for the **batches** API.
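
Since the overview points at OpenAI client compatibility, here is a minimal sketch of what batch creation could look like against a Llama Stack server; the base URL, API key, and input file ID are placeholders, not values from this commit:

```python
# Hedged sketch: creating a batch via the OpenAI client pointed at a
# Llama Stack server. Base URL, API key, and file ID are assumptions.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8321/v1", api_key="none")

batch = client.batches.create(
    input_file_id="file-abc123",  # hypothetical ID of an uploaded .jsonl request file
    endpoint="/v1/chat/completions",
    completion_window="24h",
)
print(batch.id, batch.status)
```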


@@ -13,6 +13,6 @@ title: Eval
Evaluations
Llama Stack Evaluation API for running evaluations on model and agent candidates.
This section contains documentation for all available providers for the **eval** API.


@@ -13,6 +13,6 @@ title: Files
Files
This API is used to upload documents that can be used with other Llama Stack APIs.
This section contains documentation for all available providers for the **files** API.
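
As a hedged illustration of the upload flow described above (the file path and purpose value are assumptions, not taken from this commit):

```python
# Hedged sketch: uploading a document through the OpenAI-compatible Files API.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8321/v1", api_key="none")

with open("report.pdf", "rb") as f:
    uploaded = client.files.create(file=f, purpose="assistants")

# The returned file ID can then be referenced by other Llama Stack APIs.
print(uploaded.id)
```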


@@ -18,11 +18,11 @@ title: Inference
Inference
Llama Stack Inference API for generating completions, chat completions, and embeddings.
This API provides the raw interface to the underlying models. Three kinds of models are supported:
- LLM models: these models generate "raw" and "chat" (conversational) completions.
- Embedding models: these models generate embeddings to be used for semantic search.
- Rerank models: these models reorder the documents based on their relevance to a query.
This section contains documentation for all available providers for the **inference** API.
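
A minimal sketch exercising two of the three model kinds through the OpenAI-compatible surface; the base URL and model identifiers are placeholders:

```python
# Hedged sketch: chat completion (LLM model) and embeddings (embedding model).
# Base URL and model names are assumptions.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8321/v1", api_key="none")

# LLM model: a "chat" (conversational) completion
chat = client.chat.completions.create(
    model="llama3.2:3b",
    messages=[{"role": "user", "content": "Say hello."}],
)
print(chat.choices[0].message.content)

# Embedding model: vectors to be used for semantic search
emb = client.embeddings.create(model="all-minilm", input=["semantic search query"])
print(len(emb.data[0].embedding))
```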


@@ -13,6 +13,6 @@ title: Safety
Safety
OpenAI-compatible Moderations API.
This section contains documentation for all available providers for the **safety** API.
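
A hedged sketch of a call against the OpenAI-compatible Moderations API; the safety model identifier is an assumption:

```python
# Hedged sketch: Moderations call against a Llama Stack server.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8321/v1", api_key="none")

result = client.moderations.create(
    model="llama-guard",  # hypothetical safety model identifier
    input="Tell me how to pick a lock.",
)
print(result.results[0].flagged)
```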


@@ -242,6 +242,7 @@ class StreamingResponseOrchestrator:
messages=messages,
# Pydantic models are dict-compatible but mypy treats them as distinct types
tools=self.ctx.chat_tools,  # type: ignore[arg-type]
parallel_tool_calls=self.parallel_tool_calls,
stream=True,
temperature=self.ctx.temperature,
response_format=response_format,
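
The hunk above forwards the request-level parallel_tool_calls flag into the underlying chat completion. A minimal sketch of the caller-side usage this enables; the base URL, model, and tool schema are placeholders, not values from this commit:

```python
# Hedged sketch: a Responses request that disables parallel tool calling.
# With this change, the flag is forwarded by StreamingResponseOrchestrator
# to the underlying chat completion call.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8321/v1", api_key="none")

response = client.responses.create(
    model="llama3.2:3b",
    input="What is the weather in Paris and in London?",
    tools=[
        {
            "type": "function",
            "name": "get_weather",
            "description": "Get weather information for a specified location",
            "parameters": {
                "type": "object",
                "properties": {"location": {"type": "string"}},
            },
        }
    ],
    parallel_tool_calls=False,  # request at most one tool call per model turn
)
```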


@@ -516,169 +516,3 @@ def test_response_with_instructions(openai_client, client_with_models, text_model_id
# Verify instructions from previous response were not carried over to the next response
assert response_with_instructions2.instructions == instructions2
@pytest.mark.skip(reason="Tool calling is not reliable.")
def test_max_tool_calls_with_function_tools(openai_client, client_with_models, text_model_id):
"""Test handling of max_tool_calls with function tools in responses."""
if isinstance(client_with_models, LlamaStackAsLibraryClient):
pytest.skip("OpenAI responses are not supported when testing with library client yet.")
client = openai_client
max_tool_calls = 1
tools = [
{
"type": "function",
"name": "get_weather",
"description": "Get weather information for a specified location",
"parameters": {
"type": "object",
"properties": {
"location": {
"type": "string",
"description": "The city name (e.g., 'New York', 'London')",
},
},
},
},
{
"type": "function",
"name": "get_time",
"description": "Get current time for a specified location",
"parameters": {
"type": "object",
"properties": {
"location": {
"type": "string",
"description": "The city name (e.g., 'New York', 'London')",
},
},
},
},
]
# First create a response that triggers function tools
response = client.responses.create(
model=text_model_id,
input="Can you tell me the weather in Paris and the current time?",
tools=tools,
stream=False,
max_tool_calls=max_tool_calls,
)
# Verify we got two function calls and that max_tool_calls does not affect function tools
assert len(response.output) == 2
assert response.output[0].type == "function_call"
assert response.output[0].name == "get_weather"
assert response.output[0].status == "completed"
assert response.output[1].type == "function_call"
assert response.output[1].name == "get_time"
assert response.output[1].status == "completed"
# Verify we have a valid max_tool_calls field
assert response.max_tool_calls == max_tool_calls
def test_max_tool_calls_invalid(openai_client, client_with_models, text_model_id):
"""Test handling of invalid max_tool_calls in responses."""
if isinstance(client_with_models, LlamaStackAsLibraryClient):
pytest.skip("OpenAI responses are not supported when testing with library client yet.")
client = openai_client
input = "Search for today's top technology news."
invalid_max_tool_calls = 0
tools = [
{"type": "web_search"},
]
# Create a response with an invalid max_tool_calls value, i.e., 0
# Handle ValueError from LLS and BadRequestError from OpenAI client
with pytest.raises((ValueError, BadRequestError)) as excinfo:
client.responses.create(
model=text_model_id,
input=input,
tools=tools,
stream=False,
max_tool_calls=invalid_max_tool_calls,
)
error_message = str(excinfo.value)
assert f"Invalid max_tool_calls={invalid_max_tool_calls}; should be >= 1" in error_message, (
f"Expected error message about invalid max_tool_calls, got: {error_message}"
)
def test_max_tool_calls_with_builtin_tools(openai_client, client_with_models, text_model_id):
"""Test handling of max_tool_calls with built-in tools in responses."""
if isinstance(client_with_models, LlamaStackAsLibraryClient):
pytest.skip("OpenAI responses are not supported when testing with library client yet.")
client = openai_client
input = "Search for today's top technology and a positive news story. You MUST make exactly two separate web search calls."
max_tool_calls = [1, 5]
tools = [
{"type": "web_search"},
]
# First create a response that triggers web_search tools without max_tool_calls
response = client.responses.create(
model=text_model_id,
input=input,
tools=tools,
stream=False,
)
# Verify we got two web search calls followed by a message
assert len(response.output) == 3
assert response.output[0].type == "web_search_call"
assert response.output[0].status == "completed"
assert response.output[1].type == "web_search_call"
assert response.output[1].status == "completed"
assert response.output[2].type == "message"
assert response.output[2].status == "completed"
assert response.output[2].role == "assistant"
# Next create a response that triggers web_search tools with max_tool_calls set to 1
response_2 = client.responses.create(
model=text_model_id,
input=input,
tools=tools,
stream=False,
max_tool_calls=max_tool_calls[0],
)
# Verify we got one web search tool call followed by a message
assert len(response_2.output) == 2
assert response_2.output[0].type == "web_search_call"
assert response_2.output[0].status == "completed"
assert response_2.output[1].type == "message"
assert response_2.output[1].status == "completed"
assert response_2.output[1].role == "assistant"
# Verify we have a valid max_tool_calls field
assert response_2.max_tool_calls == max_tool_calls[0]
# Finally create a response that triggers web_search tools with max_tool_calls set to 5
response_3 = client.responses.create(
model=text_model_id,
input=input,
tools=tools,
stream=False,
max_tool_calls=max_tool_calls[1],
)
# Verify we got two web search calls followed by a message
assert len(response_3.output) == 3
assert response_3.output[0].type == "web_search_call"
assert response_3.output[0].status == "completed"
assert response_3.output[1].type == "web_search_call"
assert response_3.output[1].status == "completed"
assert response_3.output[2].type == "message"
assert response_3.output[2].status == "completed"
assert response_3.output[2].role == "assistant"
# Verify we have a valid max_tool_calls field
assert response_3.max_tool_calls == max_tool_calls[1]