Merge branch 'main' into feat/litellm_sambanova_usage

jhpiedrahitao 2025-05-05 11:49:58 -05:00
commit b7f16ac7a6
535 changed files with 23539 additions and 8112 deletions

View file

@@ -4,7 +4,7 @@
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from typing import Any, Dict
from typing import Any
from uuid import uuid4
import pytest
@@ -37,7 +37,7 @@ def get_boiling_point(liquid_name: str, celcius: bool = True) -> int:
return -1
def get_boiling_point_with_metadata(liquid_name: str, celcius: bool = True) -> Dict[str, Any]:
def get_boiling_point_with_metadata(liquid_name: str, celcius: bool = True) -> dict[str, Any]:
"""
Returns the boiling point of a liquid in Celcius or Fahrenheit
@@ -115,6 +115,70 @@ def test_agent_simple(llama_stack_client_with_mocked_inference, agent_config):
assert "I can't" in logs_str
def test_agent_name(llama_stack_client, text_model_id):
agent_name = f"test-agent-{uuid4()}"
try:
agent = Agent(
llama_stack_client,
model=text_model_id,
instructions="You are a helpful assistant",
name=agent_name,
)
except TypeError:
agent = Agent(
llama_stack_client,
model=text_model_id,
instructions="You are a helpful assistant",
)
return
session_id = agent.create_session(f"test-session-{uuid4()}")
agent.create_turn(
messages=[
{
"role": "user",
"content": "Give me a sentence that contains the word: hello",
}
],
session_id=session_id,
stream=False,
)
all_spans = []
for span in llama_stack_client.telemetry.query_spans(
attribute_filters=[
{"key": "session_id", "op": "eq", "value": session_id},
],
attributes_to_return=["input", "output", "agent_name", "agent_id", "session_id"],
):
all_spans.append(span.attributes)
agent_name_spans = []
for span in llama_stack_client.telemetry.query_spans(
attribute_filters=[],
attributes_to_return=["agent_name"],
):
if "agent_name" in span.attributes:
agent_name_spans.append(span.attributes)
agent_logs = []
for span in llama_stack_client.telemetry.query_spans(
attribute_filters=[
{"key": "agent_name", "op": "eq", "value": agent_name},
],
attributes_to_return=["input", "output", "agent_name"],
):
if "output" in span.attributes and span.attributes["output"] != "no shields":
agent_logs.append(span.attributes)
assert len(agent_logs) == 1
assert agent_logs[0]["agent_name"] == agent_name
assert "Give me a sentence that contains the word: hello" in agent_logs[0]["input"]
assert "hello" in agent_logs[0]["output"].lower()
def test_tool_config(llama_stack_client_with_mocked_inference, agent_config):
common_params = dict(
model="meta-llama/Llama-3.2-3B-Instruct",
@@ -231,6 +295,7 @@ def test_builtin_tool_code_execution(llama_stack_client_with_mocked_inference, a
# This test must be run in an environment where `bwrap` is available. If you are running against a
# server, this means the _server_ must have `bwrap` available. If you are using library client, then
# you must have `bwrap` available in test's environment.
@pytest.mark.skip(reason="Code interpreter is currently disabled in the Stack")
def test_code_interpreter_for_attachments(llama_stack_client_with_mocked_inference, agent_config):
agent_config = {
**agent_config,
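The comment above notes that bwrap must be available wherever the code-interpreter test actually runs. A minimal sketch, not part of this change, of how that requirement could instead be expressed as an automatic skip, assuming detection via shutil.which is acceptable (the requires_bwrap marker name is hypothetical):

import shutil
import pytest

# Hypothetical marker: skip code-interpreter tests when bwrap is not on PATH
requires_bwrap = pytest.mark.skipif(
    shutil.which("bwrap") is None,
    reason="bwrap is required for code-interpreter tests",
)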
@@ -487,6 +552,7 @@ def test_rag_agent_with_attachments(llama_stack_client_with_mocked_inference, ag
assert "lora" in response.output_message.content.lower()
@pytest.mark.skip(reason="Code interpreter is currently disabled in the Stack")
def test_rag_and_code_agent(llama_stack_client_with_mocked_inference, agent_config):
if "llama-4" in agent_config["model"].lower():
pytest.xfail("Not working for llama4")

View file

@@ -10,6 +10,7 @@ import platform
import textwrap
import time
import pytest
from dotenv import load_dotenv
from llama_stack.log import get_logger
@@ -19,10 +20,29 @@ from .report import Report
logger = get_logger(__name__, category="tests")
@pytest.hookimpl(hookwrapper=True)
def pytest_runtest_makereport(item, call):
outcome = yield
report = outcome.get_result()
if report.when == "call":
item.execution_outcome = report.outcome
item.was_xfail = getattr(report, "wasxfail", False)
def pytest_runtest_teardown(item):
interval_seconds = os.getenv("LLAMA_STACK_TEST_INTERVAL_SECONDS")
if interval_seconds:
time.sleep(float(interval_seconds))
# Check if the test actually ran and passed or failed, but was not skipped or an expected failure (xfail)
outcome = getattr(item, "execution_outcome", None)
was_xfail = getattr(item, "was_xfail", False)
name = item.nodeid
if not any(x in name for x in ("inference/", "safety/", "agents/")):
return
logger.debug(f"Test '{item.nodeid}' outcome was '{outcome}' (xfail={was_xfail})")
if outcome in ("passed", "failed") and not was_xfail:
interval_seconds = os.getenv("LLAMA_STACK_TEST_INTERVAL_SECONDS")
if interval_seconds:
time.sleep(float(interval_seconds))
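This teardown now only pauses between tests that actually ran (passed or failed, not skipped or xfail) and only for the inference, safety, and agents suites. An assumed invocation that enables the pause, with an illustrative path:

# LLAMA_STACK_TEST_INTERVAL_SECONDS=2 pytest -v tests/integration/inference/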
def pytest_configure(config):

View file

@@ -31,6 +31,7 @@ def data_url_from_file(file_path: str) -> str:
return data_url
@pytest.mark.skip(reason="flaky. Couldn't find 'llamastack/simpleqa' on the Hugging Face Hub")
@pytest.mark.parametrize(
"purpose, source, provider_id, limit",
[

View file

@@ -14,6 +14,7 @@ from pathlib import Path
import pytest
import yaml
from llama_stack_client import LlamaStackClient
from openai import OpenAI
from llama_stack import LlamaStackAsLibraryClient
from llama_stack.apis.datatypes import Api
@@ -207,3 +208,9 @@ def llama_stack_client(request, provider_data, text_model_id):
raise RuntimeError("Initialization failed")
return client
@pytest.fixture(scope="session")
def openai_client(client_with_models):
base_url = f"{client_with_models.base_url}/v1/openai/v1"
return OpenAI(base_url=base_url, api_key="fake")
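A minimal usage sketch for this fixture (the test name below is hypothetical; models.list() is the standard OpenAI SDK call):

def test_openai_endpoint_reachable(openai_client):
    # Smoke check: the OpenAI SDK should reach the Stack's OpenAI-compatible
    # endpoint under /v1/openai/v1 and return the registered models
    models = openai_client.models.list()
    assert models.data is not None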

View file

@@ -24,7 +24,7 @@ class RecordableMock:
# Load existing cache if available and not recording
if self.json_path.exists():
try:
with open(self.json_path, "r") as f:
with open(self.json_path) as f:
self.cache = json.load(f)
except Exception as e:
print(f"Error loading cache from {self.json_path}: {e}")

View file

@@ -75,19 +75,24 @@ def openai_client(client_with_models):
return OpenAI(base_url=base_url, api_key="bar")
@pytest.fixture(params=["openai_client", "llama_stack_client"])
def compat_client(request):
return request.getfixturevalue(request.param)
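Each chat-completion test that accepts compat_client is collected once per fixture name: pytest substitutes request.param ("openai_client" or "llama_stack_client") and request.getfixturevalue resolves that fixture lazily at setup time, so the same test body exercises both clients.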
@pytest.mark.parametrize(
"test_case",
[
"inference:completion:sanity",
],
)
def test_openai_completion_non_streaming(openai_client, client_with_models, text_model_id, test_case):
def test_openai_completion_non_streaming(llama_stack_client, client_with_models, text_model_id, test_case):
skip_if_model_doesnt_support_openai_completion(client_with_models, text_model_id)
tc = TestCase(test_case)
# ollama needs more verbose prompting for some reason here...
prompt = "Respond to this question and explain your answer. " + tc["content"]
response = openai_client.completions.create(
response = llama_stack_client.completions.create(
model=text_model_id,
prompt=prompt,
stream=False,
@@ -103,13 +108,13 @@ def test_openai_completion_non_streaming(openai_client, client_with_models, text
"inference:completion:sanity",
],
)
def test_openai_completion_streaming(openai_client, client_with_models, text_model_id, test_case):
def test_openai_completion_streaming(llama_stack_client, client_with_models, text_model_id, test_case):
skip_if_model_doesnt_support_openai_completion(client_with_models, text_model_id)
tc = TestCase(test_case)
# ollama needs more verbose prompting for some reason here...
prompt = "Respond to this question and explain your answer. " + tc["content"]
response = openai_client.completions.create(
response = llama_stack_client.completions.create(
model=text_model_id,
prompt=prompt,
stream=True,
@@ -127,11 +132,11 @@ def test_openai_completion_streaming(openai_client, client_with_models, text_mod
0,
],
)
def test_openai_completion_prompt_logprobs(openai_client, client_with_models, text_model_id, prompt_logprobs):
def test_openai_completion_prompt_logprobs(llama_stack_client, client_with_models, text_model_id, prompt_logprobs):
skip_if_provider_isnt_vllm(client_with_models, text_model_id)
prompt = "Hello, world!"
response = openai_client.completions.create(
response = llama_stack_client.completions.create(
model=text_model_id,
prompt=prompt,
stream=False,
@@ -144,11 +149,11 @@ def test_openai_completion_prompt_logprobs(openai_client, client_with_models, te
assert len(choice.prompt_logprobs) > 0
def test_openai_completion_guided_choice(openai_client, client_with_models, text_model_id):
def test_openai_completion_guided_choice(llama_stack_client, client_with_models, text_model_id):
skip_if_provider_isnt_vllm(client_with_models, text_model_id)
prompt = "I am feeling really sad today."
response = openai_client.completions.create(
response = llama_stack_client.completions.create(
model=text_model_id,
prompt=prompt,
stream=False,
@@ -161,6 +166,9 @@ def test_openai_completion_guided_choice(openai_client, client_with_models, text
assert choice.text in ["joy", "sadness"]
# Run the chat-completion tests with both the OpenAI client and the LlamaStack client
@pytest.mark.parametrize(
"test_case",
[
@@ -168,13 +176,13 @@ def test_openai_completion_guided_choice(openai_client, client_with_models, text
"inference:chat_completion:non_streaming_02",
],
)
def test_openai_chat_completion_non_streaming(openai_client, client_with_models, text_model_id, test_case):
def test_openai_chat_completion_non_streaming(compat_client, client_with_models, text_model_id, test_case):
skip_if_model_doesnt_support_openai_chat_completion(client_with_models, text_model_id)
tc = TestCase(test_case)
question = tc["question"]
expected = tc["expected"]
response = openai_client.chat.completions.create(
response = compat_client.chat.completions.create(
model=text_model_id,
messages=[
{
@@ -196,13 +204,13 @@ def test_openai_chat_completion_non_streaming(openai_client, client_with_models,
"inference:chat_completion:streaming_02",
],
)
def test_openai_chat_completion_streaming(openai_client, client_with_models, text_model_id, test_case):
def test_openai_chat_completion_streaming(compat_client, client_with_models, text_model_id, test_case):
skip_if_model_doesnt_support_openai_chat_completion(client_with_models, text_model_id)
tc = TestCase(test_case)
question = tc["question"]
expected = tc["expected"]
response = openai_client.chat.completions.create(
response = compat_client.chat.completions.create(
model=text_model_id,
messages=[{"role": "user", "content": question}],
stream=True,

View file

@@ -3,7 +3,6 @@
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from typing import List
import pytest
@@ -77,7 +76,7 @@ class TestPostTraining:
async def test_get_training_jobs(self, post_training_stack):
post_training_impl = post_training_stack
jobs_list = await post_training_impl.get_training_jobs()
assert isinstance(jobs_list, List)
assert isinstance(jobs_list, list)
assert jobs_list[0].job_uuid == "1234"
@pytest.mark.asyncio

View file

@@ -0,0 +1,5 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

View file

@@ -0,0 +1,14 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
import os
import pytest
# Skip all tests in this directory when running in GitHub Actions
in_github_actions = os.environ.get("GITHUB_ACTIONS") == "true"
if in_github_actions:
pytest.skip("Skipping NVIDIA tests in GitHub Actions environment", allow_module_level=True)
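Because pytest.skip(..., allow_module_level=True) raises at collection time, every test under this directory is skipped before it runs whenever GITHUB_ACTIONS is "true", as the comment above states.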

View file

@@ -0,0 +1,47 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
import pytest
# How to run this test:
#
# LLAMA_STACK_CONFIG="nvidia" pytest -v tests/integration/providers/nvidia/test_datastore.py
# nvidia provider only
@pytest.mark.parametrize(
"provider_id",
[
"nvidia",
],
)
def test_register_and_unregister(llama_stack_client, provider_id):
purpose = "eval/messages-answer"
source = {
"type": "uri",
"uri": "hf://datasets/llamastack/simpleqa?split=train",
}
dataset_id = f"test-dataset-{provider_id}"
dataset = llama_stack_client.datasets.register(
dataset_id=dataset_id,
purpose=purpose,
source=source,
metadata={"provider_id": provider_id, "format": "json", "description": "Test dataset description"},
)
assert dataset.identifier is not None
assert dataset.provider_id == provider_id
assert dataset.identifier == dataset_id
dataset_list = llama_stack_client.datasets.list()
provider_datasets = [d for d in dataset_list if d.provider_id == provider_id]
assert any(provider_datasets)
assert any(d.identifier == dataset_id for d in provider_datasets)
llama_stack_client.datasets.unregister(dataset.identifier)
dataset_list = llama_stack_client.datasets.list()
provider_datasets = [d for d in dataset_list if d.identifier == dataset.identifier]
assert not any(provider_datasets)

View file

@@ -0,0 +1,37 @@
{
"non_streaming_01": {
"data": {
"question": "Which planet do humans live on?",
"expected": "Earth"
}
},
"non_streaming_02": {
"data": {
"question": "Which planet has rings around it with a name starting with letter S?",
"expected": "Saturn"
}
},
"streaming_01": {
"data": {
"question": "What's the name of the Sun in latin?",
"expected": "Sol"
}
},
"streaming_02": {
"data": {
"question": "What is the name of the US captial?",
"expected": "Washington"
}
},
"tools_web_search_01": {
"data": {
"input": "How many experts does the Llama 4 Maverick model have?",
"tools": [
{
"type": "web_search"
}
],
"expected": "128"
}
}
}

View file

@@ -12,6 +12,7 @@ class TestCase:
_apis = [
"inference/chat_completion",
"inference/completion",
"openai/responses",
]
_jsonblob = {}
@@ -19,7 +20,7 @@
# loading all test cases
if self._jsonblob == {}:
for api in self._apis:
with open(pathlib.Path(__file__).parent / f"{api}.json", "r") as f:
with open(pathlib.Path(__file__).parent / f"{api}.json") as f:
coloned = api.replace("/", ":")
try:
loaded = json.load(f)

View file

@@ -114,7 +114,7 @@ def test_register_and_unregister_toolgroup(llama_stack_client, mcp_server):
llama_stack_client.toolgroups.unregister(toolgroup_id=test_toolgroup_id)
# Verify it is unregistered
with pytest.raises(ValueError, match=f"Tool group '{test_toolgroup_id}' not found"):
with pytest.raises(Exception, match=f"Tool group '{test_toolgroup_id}' not found"):
llama_stack_client.toolgroups.get(toolgroup_id=test_toolgroup_id)
# Verify tools are also unregistered