fix: multiple tool calls in remote-vllm chat_completion (#2161)

# What does this PR do? This fixes an issue in how we used the tool_call_buf from streaming tool calls in the remote-vllm provider where it would end up concatenating parameters from multiple different tool call results instead of aggregating the results from each tool call separately. It also fixes an issue found while digging into that where we were accidentally mixing the json string form of tool call parameters with the string representation of the python form, which mean we'd end up with single quotes in what should be double-quoted json strings. Closes #1120 ## Test Plan The following tests are now passing 100% for the remote-vllm provider, where some of the test_text_inference were failing before this change: ``` VLLM_URL="http://localhost:8000/v1" INFERENCE_MODEL="RedHatAI/Llama-4-Scout-17B-16E-Instruct-FP8-dynamic" LLAMA_STACK_CONFIG=remote-vllm python -m pytest -v tests/integration/inference/test_text_inference.py --text-model "RedHatAI/Llama-4-Scout-17B-16E-Instruct-FP8-dynamic" VLLM_URL="http://localhost:8000/v1" INFERENCE_MODEL="RedHatAI/Llama-4-Scout-17B-16E-Instruct-FP8-dynamic" LLAMA_STACK_CONFIG=remote-vllm python -m pytest -v tests/integration/inference/test_vision_inference.py --vision-model "RedHatAI/Llama-4-Scout-17B-16E-Instruct-FP8-dynamic" ``` All but one of the agent tests are passing (including the multi-tool one). See the PR at https://github.com/vllm-project/vllm/pull/17917 and a gist at https://gist.github.com/bbrowning/4734240ce96b4264340caa9584e47c9e for changes needed there, which will have to get made upstream in vLLM. Agent tests: ``` VLLM_URL="http://localhost:8000/v1" INFERENCE_MODEL="RedHatAI/Llama-4-Scout-17B-16E-Instruct-FP8-dynamic" LLAMA_STACK_CONFIG=remote-vllm python -m pytest -v tests/integration/agents/test_agents.py --text-model "RedHatAI/Llama-4-Scout-17B-16E-Instruct-FP8-dynamic" ```` --------- Signed-off-by: Ben Browning <bbrownin@redhat.com>
2025-05-15 14:23:29 -04:00 · 2025-05-15 14:23:29 -04:00 · 10b1056dea
commit 10b1056dea
parent bb5fca9521
4 changed files with 225 additions and 34 deletions
--- a/tests/integration/agents/test_agents.py
+++ b/tests/integration/agents/test_agents.py
@ -266,6 +266,7 @@ def test_builtin_tool_web_search(llama_stack_client, agent_config):
    assert found_tool_execution


+@pytest.mark.skip(reason="Code interpreter is currently disabled in the Stack")
 def test_builtin_tool_code_execution(llama_stack_client, agent_config):
    agent_config = {
        **agent_config,
@ -346,7 +347,7 @@ def test_custom_tool(llama_stack_client, agent_config):
        messages=[
            {
                "role": "user",
-                "content": "What is the boiling point of polyjuice?",
+                "content": "What is the boiling point of the liquid polyjuice in celsius?",
            },
        ],
        session_id=session_id,
@ -420,7 +421,7 @@ def run_agent_with_tool_choice(client, agent_config, tool_choice):
        messages=[
            {
                "role": "user",
-                "content": "What is the boiling point of polyjuice?",
+                "content": "What is the boiling point of the liquid polyjuice in celsius?",
            },
        ],
        session_id=session_id,
@ -674,8 +675,8 @@ def test_create_turn_response(llama_stack_client, agent_config, client_tools):


 def test_multi_tool_calls(llama_stack_client, agent_config):
-    if "gpt" not in agent_config["model"]:
-        pytest.xfail("Only tested on GPT models")
+    if "gpt" not in agent_config["model"] and "llama-4" not in agent_config["model"].lower():
+        pytest.xfail("Only tested on GPT and Llama 4 models")

    agent_config = {
        **agent_config,
@ -689,23 +690,34 @@ def test_multi_tool_calls(llama_stack_client, agent_config):
        messages=[
            {
                "role": "user",
-                "content": "Call get_boiling_point twice to answer: What is the boiling point of polyjuice in both celsius and fahrenheit?",
+                "content": "Call get_boiling_point twice to answer: What is the boiling point of polyjuice in both celsius and fahrenheit?.\nUse the tool responses to answer the question.",
            },
        ],
        session_id=session_id,
        stream=False,
    )
    steps = response.steps
-    assert len(steps) == 7
-    assert steps[0].step_type == "shield_call"
-    assert steps[1].step_type == "inference"
-    assert steps[2].step_type == "shield_call"
-    assert steps[3].step_type == "tool_execution"
-    assert steps[4].step_type == "shield_call"
-    assert steps[5].step_type == "inference"
-    assert steps[6].step_type == "shield_call"

-    tool_execution_step = steps[3]
+    has_input_shield = agent_config.get("input_shields")
+    has_output_shield = agent_config.get("output_shields")
+    assert len(steps) == 3 + (2 if has_input_shield else 0) + (2 if has_output_shield else 0)
+    if has_input_shield:
+        assert steps[0].step_type == "shield_call"
+        steps.pop(0)
+    assert steps[0].step_type == "inference"
+    if has_output_shield:
+        assert steps[1].step_type == "shield_call"
+        steps.pop(1)
+    assert steps[1].step_type == "tool_execution"
+    tool_execution_step = steps[1]
+    if has_input_shield:
+        assert steps[2].step_type == "shield_call"
+        steps.pop(2)
+    assert steps[2].step_type == "inference"
+    if has_output_shield:
+        assert steps[3].step_type == "shield_call"
+        steps.pop(3)
+
    assert len(tool_execution_step.tool_calls) == 2
    assert tool_execution_step.tool_calls[0].tool_name.startswith("get_boiling_point")
    assert tool_execution_step.tool_calls[1].tool_name.startswith("get_boiling_point")