Merge branch 'main' into feat/litellm_sambanova_usage

jhpiedrahitao committed 2025-04-01 07:57:21 -05:00
commit 9c9f9577e2
173 changed files with 3073 additions and 3118 deletions


@@ -23,8 +23,8 @@ Model parameters can be influenced by the following options:
- `--judge-model`: comma-separated list of judge models.
- `--embedding-dimension`: output dimensionality of the embedding model to use for testing. Default: 384
-Each of these are comma-separated lists and can be used to generate multiple parameter combinations.
+Each of these are comma-separated lists and can be used to generate multiple parameter combinations. Note that tests will be skipped if no model is specified.
Experimental, under development, options:
- `--record-responses`: record new API responses instead of using cached ones
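
A minimal sketch of how the experimental `--record-responses` option could be combined with a normal run (assuming it is a boolean flag; the provider and model here are illustrative, not taken from this diff):

```bash
# Hypothetical run that records fresh API responses instead of replaying cached ones.
export TOGETHER_API_KEY=<together_api_key>
pytest -s -v tests/integration/inference/test_text_inference.py \
  --stack-config=together \
  --text-model=meta-llama/Llama-3.1-8B-Instruct \
  --record-responses
```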
@@ -36,7 +36,7 @@ Experimental, under development, options:
Run all text inference tests with the `together` distribution:
```bash
-pytest -s -v tests/api/inference/test_text_inference.py \
+pytest -s -v tests/integration/inference/test_text_inference.py \
--stack-config=together \
--text-model=meta-llama/Llama-3.1-8B-Instruct
```
@@ -44,7 +44,7 @@ pytest -s -v tests/api/inference/test_text_inference.py \
Run all text inference tests with the `together` distribution and `meta-llama/Llama-3.1-8B-Instruct`:
```bash
-pytest -s -v tests/api/inference/test_text_inference.py \
+pytest -s -v tests/integration/inference/test_text_inference.py \
--stack-config=together \
--text-model=meta-llama/Llama-3.1-8B-Instruct
```
@@ -57,7 +57,7 @@ VISION_MODELS=meta-llama/Llama-3.2-11B-Vision-Instruct
EMBEDDING_MODELS=all-MiniLM-L6-v2
export TOGETHER_API_KEY=<together_api_key>
-pytest -s -v tests/api/inference/ \
+pytest -s -v tests/integration/inference/ \
--stack-config=together \
--text-model=$TEXT_MODELS \
--vision-model=$VISION_MODELS \
@@ -69,7 +69,7 @@ Same thing but instead of using the distribution, use an adhoc stack with just o
```bash
export FIREWORKS_API_KEY=<fireworks_api_key>
-pytest -s -v tests/api/inference/ \
+pytest -s -v tests/integration/inference/ \
--stack-config=inference=fireworks \
--text-model=$TEXT_MODELS \
--vision-model=$VISION_MODELS \
@@ -81,7 +81,7 @@ Running Vector IO tests for a number of embedding models:
```bash
EMBEDDING_MODELS=all-MiniLM-L6-v2
-pytest -s -v tests/api/vector_io/ \
+pytest -s -v tests/integration/vector_io/ \
--stack-config=inference=sentence-transformers,vector_io=sqlite-vec \
--embedding-model=$EMBEDDING_MODELS
```
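
The runs above pass a single model per option. A rough sketch of the comma-separated expansion described earlier, where listing two text models yields two parameter combinations per test (the second model name is illustrative, not taken from this diff):

```bash
# Hypothetical invocation: each text inference test is collected once per listed model,
# and tests are skipped entirely if no model is supplied.
TEXT_MODELS=meta-llama/Llama-3.1-8B-Instruct,meta-llama/Llama-3.1-70B-Instruct
pytest -s -v tests/integration/inference/test_text_inference.py \
  --stack-config=together \
  --text-model=$TEXT_MODELS
```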


@@ -173,6 +173,7 @@ def test_tool_config(llama_stack_client_with_mocked_inference, agent_config):
def test_builtin_tool_web_search(llama_stack_client_with_mocked_inference, agent_config):
agent_config = {
**agent_config,
"instructions": "You are a helpful assistant that can use web search to answer questions.",
"tools": [
"builtin::websearch",
],
@@ -184,20 +185,20 @@ def test_builtin_tool_web_search(llama_stack_client_with_mocked_inference, agent
messages=[
{
"role": "user",
"content": "Search the web and tell me who the founder of Meta is.",
"content": "Search the web and tell me what is the local time in Tokyo currently.",
}
],
session_id=session_id,
stream=False,
)
-logs = [str(log) for log in AgentEventLogger().log(response) if log is not None]
-logs_str = "".join(logs)
-assert "tool_execution>" in logs_str
-assert "Tool:brave_search Response:" in logs_str
-assert "mark zuckerberg" in logs_str.lower()
-if len(agent_config["output_shields"]) > 0:
-assert "No Violation" in logs_str
+found_tool_execution = False
+for step in response.steps:
+if step.step_type == "tool_execution":
+assert step.tool_calls[0].tool_name == "brave_search"
+found_tool_execution = True
+break
+assert found_tool_execution
def test_builtin_tool_code_execution(llama_stack_client_with_mocked_inference, agent_config):
@@ -427,19 +428,7 @@ def test_rag_agent(llama_stack_client_with_mocked_inference, agent_config, rag_t
assert expected_kw in response.output_message.content.lower()
-@pytest.mark.parametrize(
-"tool",
-[
-dict(
-name="builtin::rag/knowledge_search",
-args={
-"vector_db_ids": [],
-},
-),
-"builtin::rag/knowledge_search",
-],
-)
-def test_rag_agent_with_attachments(llama_stack_client_with_mocked_inference, agent_config, tool):
+def test_rag_agent_with_attachments(llama_stack_client_with_mocked_inference, agent_config):
urls = ["chat.rst", "llama3.rst", "memory_optimizations.rst", "lora_finetune.rst"]
documents = [
Document(
@@ -452,7 +441,6 @@ def test_rag_agent_with_attachments(llama_stack_client_with_mocked_inference, ag
]
agent_config = {
**agent_config,
"tools": [tool],
}
rag_agent = Agent(llama_stack_client_with_mocked_inference, **agent_config)
session_id = rag_agent.create_session(f"test-session-{uuid4()}")
@@ -486,10 +474,6 @@ def test_rag_agent_with_attachments(llama_stack_client_with_mocked_inference, ag
stream=False,
)
-# rag is called
-tool_execution_step = [step for step in response.steps if step.step_type == "tool_execution"]
-assert len(tool_execution_step) >= 1
-assert tool_execution_step[0].tool_calls[0].tool_name == "knowledge_search"
assert "lora" in response.output_message.content.lower()
@@ -536,19 +520,7 @@ def test_rag_and_code_agent(llama_stack_client_with_mocked_inference, agent_conf
],
}
agent = Agent(llama_stack_client_with_mocked_inference, **agent_config)
-inflation_doc = Document(
-document_id="test_csv",
-content="https://raw.githubusercontent.com/meta-llama/llama-stack-apps/main/examples/resources/inflation.csv",
-mime_type="text/csv",
-metadata={},
-)
user_prompts = [
-(
-"Here is a csv file, can you describe it?",
-[inflation_doc],
-"code_interpreter",
-"",
-),
(
"when was Perplexity the company founded?",
[],


@@ -117,6 +117,33 @@ def test_text_completion_streaming(client_with_models, text_model_id, test_case)
assert len(content_str) > 10
+@pytest.mark.parametrize(
+    "test_case",
+    [
+        "inference:completion:stop_sequence",
+    ],
+)
+def test_text_completion_stop_sequence(client_with_models, text_model_id, inference_provider_type, test_case):
+    skip_if_model_doesnt_support_completion(client_with_models, text_model_id)
+    # This is only supported/tested for remote vLLM: https://github.com/meta-llama/llama-stack/issues/1771
+    if inference_provider_type != "remote::vllm":
+        pytest.xfail(f"{inference_provider_type} doesn't support 'stop' parameter yet")
+
+    tc = TestCase(test_case)
+
+    response = client_with_models.inference.completion(
+        content=tc["content"],
+        stream=True,
+        model_id=text_model_id,
+        sampling_params={
+            "max_tokens": 50,
+            "stop": ["1963"],
+        },
+    )
+
+    streamed_content = [chunk.delta for chunk in response]
+    content_str = "".join(streamed_content).lower().strip()
+    assert "1963" not in content_str
@pytest.mark.parametrize(
"test_case",
[
@@ -266,6 +293,7 @@ def test_text_chat_completion_first_token_profiling(client_with_models, text_mod
model_id=text_model_id,
messages=messages,
stream=False,
+timeout=120, # Increase timeout to 2 minutes for large conversation history
)
message_content = response.completion_message.content.lower().strip()
assert len(message_content) > 0
@@ -292,6 +320,7 @@ def test_text_chat_completion_streaming(client_with_models, text_model_id, test_
model_id=text_model_id,
messages=[{"role": "user", "content": question}],
stream=True,
+timeout=120, # Increase timeout to 2 minutes for large conversation history
)
streamed_content = [str(chunk.event.delta.text.lower().strip()) for chunk in response]
assert len(streamed_content) > 0


@@ -0,0 +1,43 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+import time
+from uuid import uuid4
+
+from llama_stack_client import Agent
+
+
+def test_agent_query_spans(llama_stack_client, text_model_id):
+    agent = Agent(llama_stack_client, model=text_model_id, instructions="You are a helpful assistant")
+    session_id = agent.create_session(f"test-session-{uuid4()}")
+    agent.create_turn(
+        messages=[
+            {
+                "role": "user",
+                "content": "Give me a sentence that contains the word: hello",
+            }
+        ],
+        session_id=session_id,
+        stream=False,
+    )
+
+    # Wait for the span to be logged
+    time.sleep(2)
+
+    agent_logs = []
+
+    for span in llama_stack_client.telemetry.query_spans(
+        attribute_filters=[
+            {"key": "session_id", "op": "eq", "value": session_id},
+        ],
+        attributes_to_return=["input", "output"],
+    ):
+        if span.attributes["output"] != "no shields":
+            agent_logs.append(span.attributes)
+
+    assert len(agent_logs) == 1
+    assert "Give me a sentence that contains the word: hello" in agent_logs[0]["input"]
+    assert "hello" in agent_logs[0]["output"].lower()


@@ -10,6 +10,11 @@
"expected": "1963"
}
},
"stop_sequence": {
"data": {
"content": "Return the exact same sentence and don't add additional words): Michael Jordan was born in the year of 1963"
}
},
"streaming": {
"data": {
"content": "Roses are red,"