mirror of https://github.com/meta-llama/llama-stack.git (synced 2025-06-28 19:04:19 +00:00)
Fixes; make inference tests pass with newer tool call types
parent d9d34433fc
commit 2c2969f331
5 changed files with 24 additions and 25 deletions
@@ -35,7 +35,7 @@ class DistributionRegistry(Protocol):


 REGISTER_PREFIX = "distributions:registry"
-KEY_VERSION = "v4"
+KEY_VERSION = "v5"
 KEY_FORMAT = f"{REGISTER_PREFIX}:{KEY_VERSION}::" + "{type}:{identifier}"

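For reference, a quick sketch of how the key format expands once KEY_VERSION is bumped; the type and identifier values below are illustrative, not taken from this diff. Since the version is baked into the key string, entries written under the old "v4" prefix are simply not looked up anymore.

# Worked example with illustrative values; mirrors the constants above.
REGISTER_PREFIX = "distributions:registry"
KEY_VERSION = "v5"
KEY_FORMAT = f"{REGISTER_PREFIX}:{KEY_VERSION}::" + "{type}:{identifier}"

key = KEY_FORMAT.format(type="model", identifier="meta-llama/Llama-3.1-8B-Instruct")
# key == "distributions:registry:v5::model:meta-llama/Llama-3.1-8B-Instruct"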
@@ -142,7 +142,7 @@ async def process_completion_stream_response(
             text = ""
             continue
         yield CompletionResponseStreamChunk(
-            delta=TextDelta(text=text),
+            delta=text,
             stop_reason=stop_reason,
         )
         if finish_reason:
@@ -153,7 +153,7 @@ async def process_completion_stream_response(
            break

    yield CompletionResponseStreamChunk(
-        delta=TextDelta(text=""),
+        delta="",
        stop_reason=stop_reason,
    )

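A minimal consumer sketch, not part of this commit: it assumes the completion stream yields chunks whose delta is a plain string and whose stop_reason stays None until the stream ends, as on the new side of the two hunks above.

# Illustrative consumer of a completion stream after this change.
async def collect_completion_text(stream) -> str:
    text = ""
    async for chunk in stream:
        text += chunk.delta          # plain string, no TextDelta wrapper here
        if chunk.stop_reason is not None:
            break
    return text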
@@ -265,6 +265,7 @@ def chat_completion_request_to_messages(
     For eg. for llama_3_1, add system message with the appropriate tools or
     add user messsage for custom tools, etc.
     """
+    assert llama_model is not None, "llama_model is required"
     model = resolve_model(llama_model)
     if model is None:
         log.error(f"Could not resolve model {llama_model}")
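The added assertion makes a missing model id fail loudly at the call site instead of falling through to resolve_model() and the "Could not resolve model None" error path. A self-contained sketch of the guard pattern, using a stand-in helper rather than the real function:

# Stand-in sketch only; the real check lives inside
# chat_completion_request_to_messages as shown in the hunk above.
def _require_llama_model(llama_model):
    assert llama_model is not None, "llama_model is required"
    return llama_model

_require_llama_model("meta-llama/Llama-3.1-8B-Instruct")   # passes
# _require_llama_model(None)  # raises AssertionError: llama_model is required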
@@ -12,6 +12,11 @@ from llama_stack.providers.tests.env import get_env_or_fail
 from llama_stack_client import LlamaStackClient


+def pytest_configure(config):
+    config.option.tbstyle = "short"
+    config.option.disable_warnings = True
+
+
 @pytest.fixture(scope="session")
 def provider_data():
     # check env for tavily secret, brave secret and inject all into provider data
@@ -29,6 +34,7 @@ def llama_stack_client(provider_data):
         client = LlamaStackAsLibraryClient(
             get_env_or_fail("LLAMA_STACK_CONFIG"),
             provider_data=provider_data,
+            skip_logger_removal=True,
         )
         client.initialize()
     elif os.environ.get("LLAMA_STACK_BASE_URL"):
@@ -6,9 +6,9 @@

 import pytest

-from llama_stack_client.lib.inference.event_logger import EventLogger
 from pydantic import BaseModel


 PROVIDER_TOOL_PROMPT_FORMAT = {
     "remote::ollama": "python_list",
     "remote::together": "json",
@@ -39,7 +39,7 @@ def text_model_id(llama_stack_client):
     available_models = [
         model.identifier
         for model in llama_stack_client.models.list()
-        if model.identifier.startswith("meta-llama")
+        if model.identifier.startswith("meta-llama") and "405" not in model.identifier
     ]
     assert len(available_models) > 0
     return available_models[0]
@@ -208,12 +208,9 @@ def test_text_chat_completion_streaming(
         stream=True,
     )
     streamed_content = [
-        str(log.content.lower().strip())
-        for log in EventLogger().log(response)
-        if log is not None
+        str(chunk.event.delta.text.lower().strip()) for chunk in response
     ]
     assert len(streamed_content) > 0
-    assert "assistant>" in streamed_content[0]
     assert expected.lower() in "".join(streamed_content)

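For orientation, a rough shape sketch of the delta types the updated assertions rely on. These are not the real llama-stack-client classes; they only mirror the attributes the tests read (`.type` and `.text` on text deltas, `.content` on tool-call deltas, which the parser below accepts either as a string or as a parsed tool call).

# Shape sketch only -- stand-ins for the newer event delta types.
from dataclasses import dataclass
from typing import Any, Union

@dataclass
class TextDelta:
    text: str
    type: str = "text"

@dataclass
class ToolCallDelta:
    content: Union[str, Any]  # string or parsed tool call, both handled below
    type: str = "tool_call"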
@@ -250,17 +247,16 @@ def test_text_chat_completion_with_tool_calling_and_non_streaming(
 def extract_tool_invocation_content(response):
     text_content: str = ""
     tool_invocation_content: str = ""
-    for log in EventLogger().log(response):
-        if log is None:
-            continue
-        if isinstance(log.content, str):
-            text_content += log.content
-        elif isinstance(log.content, object):
-            if isinstance(log.content.content, str):
-                continue
-            elif isinstance(log.content.content, object):
-                tool_invocation_content += f"[{log.content.content.tool_name}, {log.content.content.arguments}]"
+    for chunk in response:
+        delta = chunk.event.delta
+        if delta.type == "text":
+            text_content += delta.text
+        elif delta.type == "tool_call":
+            if isinstance(delta.content, str):
+                tool_invocation_content += delta.content
+            else:
+                call = delta.content
+                tool_invocation_content += f"[{call.tool_name}, {call.arguments}]"

     return text_content, tool_invocation_content

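A quick, self-contained way to sanity-check the rewritten parser, using SimpleNamespace stand-ins instead of real client response chunks; purely illustrative, no server or client required.

# Illustrative check of extract_tool_invocation_content with fake chunks.
from types import SimpleNamespace as NS

chunks = [
    NS(event=NS(delta=NS(type="text", text="Checking the weather... "))),
    NS(event=NS(delta=NS(
        type="tool_call",
        content=NS(tool_name="get_weather", arguments={"location": "San Francisco, CA"}),
    ))),
]

text, tool_call = extract_tool_invocation_content(chunks)
assert text == "Checking the weather... "
assert tool_call == "[get_weather, {'location': 'San Francisco, CA'}]"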
@@ -280,7 +276,6 @@ def test_text_chat_completion_with_tool_calling_and_streaming(
     )
     text_content, tool_invocation_content = extract_tool_invocation_content(response)

-    assert "Assistant>" in text_content
     assert tool_invocation_content == "[get_weather, {'location': 'San Francisco, CA'}]"

@@ -368,10 +363,7 @@ def test_image_chat_completion_streaming(llama_stack_client, vision_model_id):
         stream=True,
     )
     streamed_content = [
-        str(log.content.lower().strip())
-        for log in EventLogger().log(response)
-        if log is not None
+        str(chunk.event.delta.text.lower().strip()) for chunk in response
     ]
     assert len(streamed_content) > 0
-    assert "assistant>" in streamed_content[0]
     assert any(expected in streamed_content for expected in {"dog", "puppy", "pup"})