fix: multiple tool calls in remote-vllm chat_completion

This fixes an issue in how we used the tool_call_buf when streaming
tool calls in the remote-vllm provider: it would end up concatenating
the parameters from multiple different tool calls into a single buffer
instead of aggregating the results from each tool call separately.
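
For a rough picture of the aggregation pattern, here is a minimal sketch with hypothetical names (`tool_call_bufs`, `accumulate`), not the actual provider code: OpenAI-style streaming deltas carry an `index` identifying which tool call a fragment belongs to, so fragments have to be accumulated per index rather than in one shared buffer.

```python
# Minimal sketch, not the actual provider code: accumulate streamed
# tool-call fragments per tool-call index instead of in a single buffer.
tool_call_bufs: dict[int, dict[str, str]] = {}

def accumulate(delta_tool_call) -> None:
    # Each OpenAI-style delta has an `index` saying which tool call it
    # extends; name and arguments arrive as string fragments.
    buf = tool_call_bufs.setdefault(delta_tool_call.index, {"name": "", "arguments": ""})
    if delta_tool_call.function.name:
        buf["name"] += delta_tool_call.function.name
    if delta_tool_call.function.arguments:
        buf["arguments"] += delta_tool_call.function.arguments
```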

It also fixes an issue found while digging into that, where we were
accidentally mixing the JSON string form of tool call parameters with
the string representation of the Python form, which meant we'd end up
with single quotes in what should be double-quoted JSON strings.
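
As a plain-Python illustration of the quoting difference (not the provider code itself): calling `str()` on a dict of parsed arguments produces the Python repr with single quotes, while `json.dumps()` produces valid double-quoted JSON.

```python
import json

args = {"number": 28, "power": 3}
print(str(args))        # {'number': 28, 'power': 3}  -- Python repr, not valid JSON
print(json.dumps(args)) # {"number": 28, "power": 3}  -- proper double-quoted JSON
```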

The following tests now pass 100% for the remote-vllm provider, where
some of the test_text_inference tests were failing before this change:

```
VLLM_URL="http://localhost:8000/v1" INFERENCE_MODEL="RedHatAI/Llama-4-Scout-17B-16E-Instruct-FP8-dynamic" LLAMA_STACK_CONFIG=remote-vllm python -m pytest -v tests/integration/inference/test_text_inference.py --text-model "RedHatAI/Llama-4-Scout-17B-16E-Instruct-FP8-dynamic"

VLLM_URL="http://localhost:8000/v1" INFERENCE_MODEL="RedHatAI/Llama-4-Scout-17B-16E-Instruct-FP8-dynamic" LLAMA_STACK_CONFIG=remote-vllm python -m pytest -v tests/integration/inference/test_vision_inference.py --vision-model "RedHatAI/Llama-4-Scout-17B-16E-Instruct-FP8-dynamic"

```

Many of the agent tests are passing, although some are failing due to
bugs in vLLM's pythonic tool parser for Llama models. See the PR at
https://github.com/vllm-project/vllm/pull/17917 and a gist at
https://gist.github.com/bbrowning/b5007709015cb2aabd85e0bd08e6d60f for
the changes needed there, which will have to land upstream in vLLM.

Agent tests:

```
VLLM_URL="http://localhost:8000/v1" INFERENCE_MODEL="RedHatAI/Llama-4-Scout-17B-16E-Instruct-FP8-dynamic" LLAMA_STACK_CONFIG=remote-vllm python -m pytest -v tests/integration/agents/test_agents.py --text-model "RedHatAI/Llama-4-Scout-17B-16E-Instruct-FP8-dynamic"
```

Signed-off-by: Ben Browning <bbrownin@redhat.com>

```diff
@@ -24,6 +24,12 @@ from openai.types.chat.chat_completion_chunk import (
 from openai.types.chat.chat_completion_chunk import (
     ChoiceDelta as OpenAIChoiceDelta,
 )
+from openai.types.chat.chat_completion_chunk import (
+    ChoiceDeltaToolCall as OpenAIChoiceDeltaToolCall,
+)
+from openai.types.chat.chat_completion_chunk import (
+    ChoiceDeltaToolCallFunction as OpenAIChoiceDeltaToolCallFunction,
+)
 from openai.types.model import Model as OpenAIModel
 
 from llama_stack.apis.inference import (
@@ -206,8 +212,164 @@ async def test_tool_call_delta_empty_tool_call_buf():
             yield chunk
 
     chunks = [chunk async for chunk in _process_vllm_chat_completion_stream_response(mock_stream())]
-    assert len(chunks) == 1
-    assert chunks[0].event.stop_reason == StopReason.end_of_turn
+    assert len(chunks) == 2
+    assert chunks[0].event.event_type.value == "start"
+    assert chunks[1].event.event_type.value == "complete"
+    assert chunks[1].event.stop_reason == StopReason.end_of_turn
+
+
+@pytest.mark.asyncio
+async def test_tool_call_delta_streaming_arguments_dict():
+    async def mock_stream():
+        mock_chunk_1 = OpenAIChatCompletionChunk(
+            id="chunk-1",
+            created=1,
+            model="foo",
+            object="chat.completion.chunk",
+            choices=[
+                OpenAIChoice(
+                    delta=OpenAIChoiceDelta(
+                        content="",
+                        tool_calls=[
+                            OpenAIChoiceDeltaToolCall(
+                                id="tc_1",
+                                index=1,
+                                function=OpenAIChoiceDeltaToolCallFunction(
+                                    name="power",
+                                    arguments="",
+                                ),
+                            )
+                        ],
+                    ),
+                    finish_reason=None,
+                    index=0,
+                )
+            ],
+        )
+        mock_chunk_2 = OpenAIChatCompletionChunk(
+            id="chunk-2",
+            created=1,
+            model="foo",
+            object="chat.completion.chunk",
+            choices=[
+                OpenAIChoice(
+                    delta=OpenAIChoiceDelta(
+                        content="",
+                        tool_calls=[
+                            OpenAIChoiceDeltaToolCall(
+                                id="tc_1",
+                                index=1,
+                                function=OpenAIChoiceDeltaToolCallFunction(
+                                    name="power",
+                                    arguments='{"number": 28, "power": 3}',
+                                ),
+                            )
+                        ],
+                    ),
+                    finish_reason=None,
+                    index=0,
+                )
+            ],
+        )
+        mock_chunk_3 = OpenAIChatCompletionChunk(
+            id="chunk-3",
+            created=1,
+            model="foo",
+            object="chat.completion.chunk",
+            choices=[
+                OpenAIChoice(delta=OpenAIChoiceDelta(content="", tool_calls=None), finish_reason="tool_calls", index=0)
+            ],
+        )
+        for chunk in [mock_chunk_1, mock_chunk_2, mock_chunk_3]:
+            yield chunk
+
+    chunks = [chunk async for chunk in _process_vllm_chat_completion_stream_response(mock_stream())]
+    assert len(chunks) == 3
+    assert chunks[0].event.event_type.value == "start"
+    assert chunks[1].event.event_type.value == "progress"
+    assert chunks[1].event.delta.type == "tool_call"
+    assert chunks[1].event.delta.parse_status.value == "succeeded"
+    assert chunks[1].event.delta.tool_call.arguments_json == '{"number": 28, "power": 3}'
+    assert chunks[2].event.event_type.value == "complete"
+
+
+@pytest.mark.asyncio
+async def test_multiple_tool_calls():
+    async def mock_stream():
+        mock_chunk_1 = OpenAIChatCompletionChunk(
+            id="chunk-1",
+            created=1,
+            model="foo",
+            object="chat.completion.chunk",
+            choices=[
+                OpenAIChoice(
+                    delta=OpenAIChoiceDelta(
+                        content="",
+                        tool_calls=[
+                            OpenAIChoiceDeltaToolCall(
+                                id="",
+                                index=1,
+                                function=OpenAIChoiceDeltaToolCallFunction(
+                                    name="power",
+                                    arguments='{"number": 28, "power": 3}',
+                                ),
+                            ),
+                        ],
+                    ),
+                    finish_reason=None,
+                    index=0,
+                )
+            ],
+        )
+        mock_chunk_2 = OpenAIChatCompletionChunk(
+            id="chunk-2",
+            created=1,
+            model="foo",
+            object="chat.completion.chunk",
+            choices=[
+                OpenAIChoice(
+                    delta=OpenAIChoiceDelta(
+                        content="",
+                        tool_calls=[
+                            OpenAIChoiceDeltaToolCall(
+                                id="",
+                                index=2,
+                                function=OpenAIChoiceDeltaToolCallFunction(
+                                    name="multiple",
+                                    arguments='{"first_number": 4, "second_number": 7}',
+                                ),
+                            ),
+                        ],
+                    ),
+                    finish_reason=None,
+                    index=0,
+                )
+            ],
+        )
+        mock_chunk_3 = OpenAIChatCompletionChunk(
+            id="chunk-3",
+            created=1,
+            model="foo",
+            object="chat.completion.chunk",
+            choices=[
+                OpenAIChoice(delta=OpenAIChoiceDelta(content="", tool_calls=None), finish_reason="tool_calls", index=0)
+            ],
+        )
+        for chunk in [mock_chunk_1, mock_chunk_2, mock_chunk_3]:
+            yield chunk
+
+    chunks = [chunk async for chunk in _process_vllm_chat_completion_stream_response(mock_stream())]
+    assert len(chunks) == 4
+    assert chunks[0].event.event_type.value == "start"
+    assert chunks[1].event.event_type.value == "progress"
+    assert chunks[1].event.delta.type == "tool_call"
+    assert chunks[1].event.delta.parse_status.value == "succeeded"
+    assert chunks[1].event.delta.tool_call.arguments_json == '{"number": 28, "power": 3}'
+    assert chunks[2].event.event_type.value == "progress"
+    assert chunks[2].event.delta.type == "tool_call"
+    assert chunks[2].event.delta.parse_status.value == "succeeded"
+    assert chunks[2].event.delta.tool_call.arguments_json == '{"first_number": 4, "second_number": 7}'
+    assert chunks[3].event.event_type.value == "complete"
 
 
 @pytest.mark.asyncio
@@ -231,7 +393,8 @@ async def test_process_vllm_chat_completion_stream_response_no_choices():
             yield chunk
 
     chunks = [chunk async for chunk in _process_vllm_chat_completion_stream_response(mock_stream())]
-    assert len(chunks) == 0
+    assert len(chunks) == 1
+    assert chunks[0].event.event_type.value == "start"
 
 
 def test_chat_completion_doesnt_block_event_loop(caplog):
```