diff --git a/llama_stack/distribution/store/registry.py b/llama_stack/distribution/store/registry.py
index d26b4447c..010d137ec 100644
--- a/llama_stack/distribution/store/registry.py
+++ b/llama_stack/distribution/store/registry.py
@@ -35,7 +35,7 @@ class DistributionRegistry(Protocol):
 
 
 REGISTER_PREFIX = "distributions:registry"
-KEY_VERSION = "v4"
+KEY_VERSION = "v5"
 KEY_FORMAT = f"{REGISTER_PREFIX}:{KEY_VERSION}::" + "{type}:{identifier}"
 
 
diff --git a/llama_stack/providers/utils/inference/openai_compat.py b/llama_stack/providers/utils/inference/openai_compat.py
index 82e01c364..4c46954cf 100644
--- a/llama_stack/providers/utils/inference/openai_compat.py
+++ b/llama_stack/providers/utils/inference/openai_compat.py
@@ -142,7 +142,7 @@ async def process_completion_stream_response(
             text = ""
             continue
         yield CompletionResponseStreamChunk(
-            delta=TextDelta(text=text),
+            delta=text,
             stop_reason=stop_reason,
         )
         if finish_reason:
@@ -153,7 +153,7 @@
             break
 
     yield CompletionResponseStreamChunk(
-        delta=TextDelta(text=""),
+        delta="",
         stop_reason=stop_reason,
     )
 
diff --git a/llama_stack/providers/utils/inference/prompt_adapter.py b/llama_stack/providers/utils/inference/prompt_adapter.py
index 2d66dc60b..de4918f5c 100644
--- a/llama_stack/providers/utils/inference/prompt_adapter.py
+++ b/llama_stack/providers/utils/inference/prompt_adapter.py
@@ -265,6 +265,7 @@ def chat_completion_request_to_messages(
     For eg. for llama_3_1, add system message with the appropriate tools or
     add user messsage for custom tools, etc.
     """
+    assert llama_model is not None, "llama_model is required"
     model = resolve_model(llama_model)
     if model is None:
         log.error(f"Could not resolve model {llama_model}")
diff --git a/tests/client-sdk/conftest.py b/tests/client-sdk/conftest.py
index 16e6d1bbd..b40d54ee5 100644
--- a/tests/client-sdk/conftest.py
+++ b/tests/client-sdk/conftest.py
@@ -12,6 +12,11 @@ from llama_stack.providers.tests.env import get_env_or_fail
 
 from llama_stack_client import LlamaStackClient
 
+def pytest_configure(config):
+    config.option.tbstyle = "short"
+    config.option.disable_warnings = True
+
+
 @pytest.fixture(scope="session")
 def provider_data():
     # check env for tavily secret, brave secret and inject all into provider data
@@ -29,6 +34,7 @@ def llama_stack_client(provider_data):
         client = LlamaStackAsLibraryClient(
             get_env_or_fail("LLAMA_STACK_CONFIG"),
             provider_data=provider_data,
+            skip_logger_removal=True,
         )
         client.initialize()
     elif os.environ.get("LLAMA_STACK_BASE_URL"):
diff --git a/tests/client-sdk/inference/test_inference.py b/tests/client-sdk/inference/test_inference.py
index ef6219389..a50dba3a0 100644
--- a/tests/client-sdk/inference/test_inference.py
+++ b/tests/client-sdk/inference/test_inference.py
@@ -6,9 +6,9 @@
 
 import pytest
-from llama_stack_client.lib.inference.event_logger import EventLogger
 from pydantic import BaseModel
 
 
+
 PROVIDER_TOOL_PROMPT_FORMAT = {
     "remote::ollama": "python_list",
     "remote::together": "json",
@@ -39,7 +39,7 @@ def text_model_id(llama_stack_client):
     available_models = [
         model.identifier
         for model in llama_stack_client.models.list()
-        if model.identifier.startswith("meta-llama")
+        if model.identifier.startswith("meta-llama") and "405" not in model.identifier
     ]
     assert len(available_models) > 0
     return available_models[0]
@@ -208,12 +208,9 @@ def test_text_chat_completion_streaming(
         stream=True,
     )
     streamed_content = [
-        str(log.content.lower().strip())
-        for log in EventLogger().log(response)
-        if log is not None
+        str(chunk.event.delta.text.lower().strip()) for chunk in response
     ]
     assert len(streamed_content) > 0
-    assert "assistant>" in streamed_content[0]
     assert expected.lower() in "".join(streamed_content)
 
 
@@ -250,17 +247,16 @@ def test_text_chat_completion_with_tool_calling_and_non_streaming(
 def extract_tool_invocation_content(response):
     text_content: str = ""
     tool_invocation_content: str = ""
-    for log in EventLogger().log(response):
-        if log is None:
-            continue
-        if isinstance(log.content, str):
-            text_content += log.content
-        elif isinstance(log.content, object):
-            if isinstance(log.content.content, str):
-                continue
-            elif isinstance(log.content.content, object):
-                tool_invocation_content += f"[{log.content.content.tool_name}, {log.content.content.arguments}]"
-
+    for chunk in response:
+        delta = chunk.event.delta
+        if delta.type == "text":
+            text_content += delta.text
+        elif delta.type == "tool_call":
+            if isinstance(delta.content, str):
+                tool_invocation_content += delta.content
+            else:
+                call = delta.content
+                tool_invocation_content += f"[{call.tool_name}, {call.arguments}]"
     return text_content, tool_invocation_content
 
 
@@ -280,7 +276,6 @@ def test_text_chat_completion_with_tool_calling_and_streaming(
     )
 
     text_content, tool_invocation_content = extract_tool_invocation_content(response)
-    assert "Assistant>" in text_content
     assert tool_invocation_content == "[get_weather, {'location': 'San Francisco, CA'}]"
 
 
@@ -368,10 +363,7 @@ def test_image_chat_completion_streaming(llama_stack_client, vision_model_id):
         stream=True,
     )
     streamed_content = [
-        str(log.content.lower().strip())
-        for log in EventLogger().log(response)
-        if log is not None
+        str(chunk.event.delta.text.lower().strip()) for chunk in response
    ]
     assert len(streamed_content) > 0
-    assert "assistant>" in streamed_content[0]
     assert any(expected in streamed_content for expected in {"dog", "puppy", "pup"})
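
Note for reviewers (not part of the patch): a minimal sketch of the streaming consumption pattern the updated tests now use, iterating raw chunks instead of EventLogger. The chat_completion call and the llama_stack_client / text_model_id names mirror the existing test fixtures and are illustrative only; the delta handling follows extract_tool_invocation_content above.

    # Illustrative sketch, assuming the client-sdk test fixtures are in scope.
    response = llama_stack_client.inference.chat_completion(
        model_id=text_model_id,
        messages=[{"role": "user", "content": "What's the weather like in San Francisco?"}],
        stream=True,
    )
    for chunk in response:
        delta = chunk.event.delta
        if delta.type == "text":
            # Plain text pieces arrive as delta.text.
            print(delta.text, end="")
        elif delta.type == "tool_call":
            # delta.content is either a partial string or a parsed call
            # exposing .tool_name and .arguments, as the tests assert.
            print(delta.content)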