Mirror of https://github.com/meta-llama/llama-stack.git (synced 2025-12-30 00:19:33 +00:00)

Merge branch 'main' into feat/litellm_sambanova_usage
Commit 9c9f9577e2
173 changed files with 3073 additions and 3118 deletions
@@ -23,8 +23,8 @@ Model parameters can be influenced by the following options:
 - `--judge-model`: comma-separated list of judge models.
 - `--embedding-dimension`: output dimensionality of the embedding model to use for testing. Default: 384
 
-Each of these are comma-separated lists and can be used to generate multiple parameter combinations.
+Each of these are comma-separated lists and can be used to generate multiple parameter combinations. Note that tests will be skipped
+if no model is specified.
 
 Experimental, under development, options:
 - `--record-responses`: record new API responses instead of using cached ones
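Because these options are comma-separated lists, a single invocation can fan out over several models. A minimal sketch, assuming the `together` distribution; the second text model here is only an illustrative placeholder:

```bash
# Hypothetical example: two text models expand into separate parameter
# combinations; any model type left unspecified causes its tests to be skipped.
pytest -s -v tests/integration/inference/ \
   --stack-config=together \
   --text-model=meta-llama/Llama-3.1-8B-Instruct,meta-llama/Llama-3.1-70B-Instruct
```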
@@ -36,7 +36,7 @@ Experimental, under development, options:
 Run all text inference tests with the `together` distribution:
 
 ```bash
-pytest -s -v tests/api/inference/test_text_inference.py \
+pytest -s -v tests/integration/inference/test_text_inference.py \
    --stack-config=together \
    --text-model=meta-llama/Llama-3.1-8B-Instruct
 ```
@@ -44,7 +44,7 @@ pytest -s -v tests/api/inference/test_text_inference.py \
 Run all text inference tests with the `together` distribution and `meta-llama/Llama-3.1-8B-Instruct`:
 
 ```bash
-pytest -s -v tests/api/inference/test_text_inference.py \
+pytest -s -v tests/integration/inference/test_text_inference.py \
    --stack-config=together \
    --text-model=meta-llama/Llama-3.1-8B-Instruct
 ```
@@ -57,7 +57,7 @@ VISION_MODELS=meta-llama/Llama-3.2-11B-Vision-Instruct
 EMBEDDING_MODELS=all-MiniLM-L6-v2
 export TOGETHER_API_KEY=<together_api_key>
 
-pytest -s -v tests/api/inference/ \
+pytest -s -v tests/integration/inference/ \
    --stack-config=together \
    --text-model=$TEXT_MODELS \
    --vision-model=$VISION_MODELS \
@@ -69,7 +69,7 @@ Same thing but instead of using the distribution, use an adhoc stack with just o
 ```bash
 export FIREWORKS_API_KEY=<fireworks_api_key>
 
-pytest -s -v tests/api/inference/ \
+pytest -s -v tests/integration/inference/ \
    --stack-config=inference=fireworks \
    --text-model=$TEXT_MODELS \
    --vision-model=$VISION_MODELS \
@@ -81,7 +81,7 @@ Running Vector IO tests for a number of embedding models:
 ```bash
 EMBEDDING_MODELS=all-MiniLM-L6-v2
 
-pytest -s -v tests/api/vector_io/ \
+pytest -s -v tests/integration/vector_io/ \
    --stack-config=inference=sentence-transformers,vector_io=sqlite-vec \
    --embedding-model=$EMBEDDING_MODELS
 ```
@@ -173,6 +173,7 @@ def test_tool_config(llama_stack_client_with_mocked_inference, agent_config):
 def test_builtin_tool_web_search(llama_stack_client_with_mocked_inference, agent_config):
     agent_config = {
         **agent_config,
+        "instructions": "You are a helpful assistant that can use web search to answer questions.",
         "tools": [
             "builtin::websearch",
         ],
@@ -184,20 +185,20 @@ def test_builtin_tool_web_search(llama_stack_client_with_mocked_inference, agent_config):
         messages=[
             {
                 "role": "user",
-                "content": "Search the web and tell me who the founder of Meta is.",
+                "content": "Search the web and tell me what is the local time in Tokyo currently.",
             }
         ],
         session_id=session_id,
         stream=False,
     )
 
-    logs = [str(log) for log in AgentEventLogger().log(response) if log is not None]
-    logs_str = "".join(logs)
-
-    assert "tool_execution>" in logs_str
-    assert "Tool:brave_search Response:" in logs_str
-    assert "mark zuckerberg" in logs_str.lower()
-    if len(agent_config["output_shields"]) > 0:
-        assert "No Violation" in logs_str
+    found_tool_execution = False
+    for step in response.steps:
+        if step.step_type == "tool_execution":
+            assert step.tool_calls[0].tool_name == "brave_search"
+            found_tool_execution = True
+            break
+    assert found_tool_execution
 
 
 def test_builtin_tool_code_execution(llama_stack_client_with_mocked_inference, agent_config):
@@ -427,19 +428,7 @@ def test_rag_agent(llama_stack_client_with_mocked_inference, agent_config, rag_t
     assert expected_kw in response.output_message.content.lower()
 
 
-@pytest.mark.parametrize(
-    "tool",
-    [
-        dict(
-            name="builtin::rag/knowledge_search",
-            args={
-                "vector_db_ids": [],
-            },
-        ),
-        "builtin::rag/knowledge_search",
-    ],
-)
-def test_rag_agent_with_attachments(llama_stack_client_with_mocked_inference, agent_config, tool):
+def test_rag_agent_with_attachments(llama_stack_client_with_mocked_inference, agent_config):
     urls = ["chat.rst", "llama3.rst", "memory_optimizations.rst", "lora_finetune.rst"]
     documents = [
         Document(
@@ -452,7 +441,6 @@ def test_rag_agent_with_attachments(llama_stack_client_with_mocked_inference, agent_config):
     ]
     agent_config = {
         **agent_config,
-        "tools": [tool],
     }
     rag_agent = Agent(llama_stack_client_with_mocked_inference, **agent_config)
     session_id = rag_agent.create_session(f"test-session-{uuid4()}")
@@ -486,10 +474,6 @@ def test_rag_agent_with_attachments(llama_stack_client_with_mocked_inference, agent_config):
         stream=False,
     )
 
-    # rag is called
-    tool_execution_step = [step for step in response.steps if step.step_type == "tool_execution"]
-    assert len(tool_execution_step) >= 1
-    assert tool_execution_step[0].tool_calls[0].tool_name == "knowledge_search"
     assert "lora" in response.output_message.content.lower()
 
 
@@ -536,19 +520,7 @@ def test_rag_and_code_agent(llama_stack_client_with_mocked_inference, agent_conf
         ],
     }
     agent = Agent(llama_stack_client_with_mocked_inference, **agent_config)
-    inflation_doc = Document(
-        document_id="test_csv",
-        content="https://raw.githubusercontent.com/meta-llama/llama-stack-apps/main/examples/resources/inflation.csv",
-        mime_type="text/csv",
-        metadata={},
-    )
     user_prompts = [
-        (
-            "Here is a csv file, can you describe it?",
-            [inflation_doc],
-            "code_interpreter",
-            "",
-        ),
         (
             "when was Perplexity the company founded?",
             [],
@@ -117,6 +117,33 @@ def test_text_completion_streaming(client_with_models, text_model_id, test_case)
     assert len(content_str) > 10
 
 
+@pytest.mark.parametrize(
+    "test_case",
+    [
+        "inference:completion:stop_sequence",
+    ],
+)
+def test_text_completion_stop_sequence(client_with_models, text_model_id, inference_provider_type, test_case):
+    skip_if_model_doesnt_support_completion(client_with_models, text_model_id)
+    # This is only supported/tested for remote vLLM: https://github.com/meta-llama/llama-stack/issues/1771
+    if inference_provider_type != "remote::vllm":
+        pytest.xfail(f"{inference_provider_type} doesn't support 'stop' parameter yet")
+    tc = TestCase(test_case)
+
+    response = client_with_models.inference.completion(
+        content=tc["content"],
+        stream=True,
+        model_id=text_model_id,
+        sampling_params={
+            "max_tokens": 50,
+            "stop": ["1963"],
+        },
+    )
+    streamed_content = [chunk.delta for chunk in response]
+    content_str = "".join(streamed_content).lower().strip()
+    assert "1963" not in content_str
+
+
 @pytest.mark.parametrize(
     "test_case",
     [
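Since the new stop-sequence test xfails on anything other than remote vLLM, a targeted run might look like the sketch below; `<vllm_distribution>` is a placeholder for whichever distribution exposes a remote vLLM inference provider, and the model ID is illustrative:

```bash
# Hypothetical: select only the stop-sequence test; on providers other than
# remote::vllm the test is expected to xfail per the check in the test body.
pytest -s -v tests/integration/inference/test_text_inference.py \
   -k test_text_completion_stop_sequence \
   --stack-config=<vllm_distribution> \
   --text-model=meta-llama/Llama-3.1-8B-Instruct
```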
@@ -266,6 +293,7 @@ def test_text_chat_completion_first_token_profiling(client_with_models, text_mod
         model_id=text_model_id,
         messages=messages,
         stream=False,
+        timeout=120,  # Increase timeout to 2 minutes for large conversation history
     )
     message_content = response.completion_message.content.lower().strip()
     assert len(message_content) > 0
@@ -292,6 +320,7 @@ def test_text_chat_completion_streaming(client_with_models, text_model_id, test_
         model_id=text_model_id,
         messages=[{"role": "user", "content": question}],
         stream=True,
+        timeout=120,  # Increase timeout to 2 minutes for large conversation history
     )
     streamed_content = [str(chunk.event.delta.text.lower().strip()) for chunk in response]
     assert len(streamed_content) > 0
tests/integration/telemetry/test_telemetry.py (new file, 43 lines)
@@ -0,0 +1,43 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+import time
+from uuid import uuid4
+
+from llama_stack_client import Agent
+
+
+def test_agent_query_spans(llama_stack_client, text_model_id):
+    agent = Agent(llama_stack_client, model=text_model_id, instructions="You are a helpful assistant")
+    session_id = agent.create_session(f"test-session-{uuid4()}")
+    agent.create_turn(
+        messages=[
+            {
+                "role": "user",
+                "content": "Give me a sentence that contains the word: hello",
+            }
+        ],
+        session_id=session_id,
+        stream=False,
+    )
+
+    # Wait for the span to be logged
+    time.sleep(2)
+
+    agent_logs = []
+
+    for span in llama_stack_client.telemetry.query_spans(
+        attribute_filters=[
+            {"key": "session_id", "op": "eq", "value": session_id},
+        ],
+        attributes_to_return=["input", "output"],
+    ):
+        if span.attributes["output"] != "no shields":
+            agent_logs.append(span.attributes)
+
+    assert len(agent_logs) == 1
+    assert "Give me a sentence that contains the word: hello" in agent_logs[0]["input"]
+    assert "hello" in agent_logs[0]["output"].lower()
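A minimal sketch of how the new telemetry test could be invoked, following the integration-test conventions shown earlier in this change; the `together` stack and model ID are illustrative assumptions, not part of the diff:

```bash
# Hypothetical invocation; any stack with telemetry enabled should work.
pytest -s -v tests/integration/telemetry/test_telemetry.py \
   --stack-config=together \
   --text-model=meta-llama/Llama-3.1-8B-Instruct
```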
@@ -10,6 +10,11 @@
             "expected": "1963"
         }
     },
+    "stop_sequence": {
+        "data": {
+            "content": "Return the exact same sentence and don't add additional words): Michael Jordan was born in the year of 1963"
+        }
+    },
     "streaming": {
         "data": {
             "content": "Roses are red,"