Merge branch 'meta-llama:main' into feat/litellm_sambanova_usage

commit 13c660f5a5
57 changed files with 10986 additions and 93 deletions
@@ -0,0 +1,3 @@
# Ollama external provider for Llama Stack

Template code to create a new external provider for Llama Stack.
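For orientation, the provider spec below references `llama_stack_provider_ollama.config.OllamaImplConfig`, and the distribution tests in this commit expect remote specs like it to live under `<external_providers_dir>/remote/inference/` (the run.yaml below sets `external_providers_dir: /tmp/providers.d`). A minimal sketch of what that config class might look like follows; the class is not part of this diff, so the field name and the `sample_run_config` helper are assumptions modeled on the `SampleConfig` pattern used in `tests/unit/distribution/test_distribution.py` and on the `url` setting in `run.yaml`:

```python
# Hypothetical sketch of the config class named in custom_ollama.yaml below.
# Only the import path comes from the spec; the field and helper are assumed.
from typing import Any, Dict

from pydantic import BaseModel, Field


class OllamaImplConfig(BaseModel):
    url: str = Field(
        default="http://localhost:11434",
        description="Base URL of the Ollama server",
    )

    @classmethod
    def sample_run_config(cls, **kwargs: Any) -> Dict[str, Any]:
        # Mirrors the ${env.OLLAMA_URL:http://localhost:11434} entry in run.yaml.
        return {"url": "${env.OLLAMA_URL:http://localhost:11434}"}
```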
@@ -0,0 +1,7 @@
adapter:
  adapter_type: custom_ollama
  pip_packages: ["ollama", "aiohttp"]
  config_class: llama_stack_provider_ollama.config.OllamaImplConfig
  module: llama_stack_provider_ollama
api_dependencies: []
optional_api_dependencies: []
@@ -0,0 +1,44 @@
[project]
dependencies = [
    "llama-stack",
    "pydantic",
    "ollama",
    "aiohttp",
    "aiosqlite",
    "autoevals",
    "blobfile",
    "chardet",
    "chromadb-client",
    "datasets",
    "faiss-cpu",
    "fastapi",
    "fire",
    "httpx",
    "matplotlib",
    "mcp",
    "nltk",
    "numpy",
    "openai",
    "opentelemetry-exporter-otlp-proto-http",
    "opentelemetry-sdk",
    "pandas",
    "pillow",
    "psycopg2-binary",
    "pymongo",
    "pypdf",
    "redis",
    "requests",
    "scikit-learn",
    "scipy",
    "sentencepiece",
    "tqdm",
    "transformers",
    "tree_sitter",
    "uvicorn",
]

name = "llama-stack-provider-ollama"
version = "0.1.0"
description = "External provider for Ollama using the Llama Stack API"
readme = "README.md"
requires-python = ">=3.10"
tests/external-provider/llama-stack-provider-ollama/run.yaml (new file, 135 lines)
@@ -0,0 +1,135 @@
version: '2'
image_name: ollama
apis:
- agents
- datasetio
- eval
- inference
- safety
- scoring
- telemetry
- tool_runtime
- vector_io
providers:
  inference:
  - provider_id: custom_ollama
    provider_type: remote::custom_ollama
    config:
      url: ${env.OLLAMA_URL:http://localhost:11434}
  vector_io:
  - provider_id: faiss
    provider_type: inline::faiss
    config:
      kvstore:
        type: sqlite
        namespace: null
        db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/ollama}/faiss_store.db
  safety:
  - provider_id: llama-guard
    provider_type: inline::llama-guard
    config:
      excluded_categories: []
  agents:
  - provider_id: meta-reference
    provider_type: inline::meta-reference
    config:
      persistence_store:
        type: sqlite
        namespace: null
        db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/ollama}/agents_store.db
  telemetry:
  - provider_id: meta-reference
    provider_type: inline::meta-reference
    config:
      service_name: ${env.OTEL_SERVICE_NAME:llama-stack}
      sinks: ${env.TELEMETRY_SINKS:console,sqlite}
      sqlite_db_path: ${env.SQLITE_DB_PATH:~/.llama/distributions/ollama/trace_store.db}
  eval:
  - provider_id: meta-reference
    provider_type: inline::meta-reference
    config:
      kvstore:
        type: sqlite
        namespace: null
        db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/ollama}/meta_reference_eval.db
  datasetio:
  - provider_id: huggingface
    provider_type: remote::huggingface
    config:
      kvstore:
        type: sqlite
        namespace: null
        db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/ollama}/huggingface_datasetio.db
  - provider_id: localfs
    provider_type: inline::localfs
    config:
      kvstore:
        type: sqlite
        namespace: null
        db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/ollama}/localfs_datasetio.db
  scoring:
  - provider_id: basic
    provider_type: inline::basic
    config: {}
  - provider_id: llm-as-judge
    provider_type: inline::llm-as-judge
    config: {}
  - provider_id: braintrust
    provider_type: inline::braintrust
    config:
      openai_api_key: ${env.OPENAI_API_KEY:}
  tool_runtime:
  - provider_id: brave-search
    provider_type: remote::brave-search
    config:
      api_key: ${env.BRAVE_SEARCH_API_KEY:}
      max_results: 3
  - provider_id: tavily-search
    provider_type: remote::tavily-search
    config:
      api_key: ${env.TAVILY_SEARCH_API_KEY:}
      max_results: 3
  - provider_id: code-interpreter
    provider_type: inline::code-interpreter
    config: {}
  - provider_id: rag-runtime
    provider_type: inline::rag-runtime
    config: {}
  - provider_id: model-context-protocol
    provider_type: remote::model-context-protocol
    config: {}
  - provider_id: wolfram-alpha
    provider_type: remote::wolfram-alpha
    config:
      api_key: ${env.WOLFRAM_ALPHA_API_KEY:}
metadata_store:
  type: sqlite
  db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/ollama}/registry.db
models:
- metadata: {}
  model_id: ${env.INFERENCE_MODEL}
  provider_id: custom_ollama
  model_type: llm
- metadata:
    embedding_dimension: 384
  model_id: all-MiniLM-L6-v2
  provider_id: custom_ollama
  provider_model_id: all-minilm:latest
  model_type: embedding
shields: []
vector_dbs: []
datasets: []
scoring_fns: []
benchmarks: []
tool_groups:
- toolgroup_id: builtin::websearch
  provider_id: tavily-search
- toolgroup_id: builtin::rag
  provider_id: rag-runtime
- toolgroup_id: builtin::code_interpreter
  provider_id: code-interpreter
- toolgroup_id: builtin::wolfram_alpha
  provider_id: wolfram-alpha
server:
  port: 8321
external_providers_dir: /tmp/providers.d
tests/integration/tool_runtime/test_registration.py (new file, 124 lines)
@@ -0,0 +1,124 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

import socket
import threading
import time

import httpx
import mcp.types as types
import pytest
import uvicorn
from llama_stack_client.types.shared_params.url import URL
from mcp.server.fastmcp import Context, FastMCP
from mcp.server.sse import SseServerTransport
from starlette.applications import Starlette
from starlette.routing import Mount, Route


@pytest.fixture(scope="module")
def mcp_server():
    server = FastMCP("FastMCP Test Server")

    @server.tool()
    async def fetch(url: str, ctx: Context) -> list[types.TextContent | types.ImageContent | types.EmbeddedResource]:
        headers = {"User-Agent": "MCP Test Server (github.com/modelcontextprotocol/python-sdk)"}
        async with httpx.AsyncClient(follow_redirects=True, headers=headers) as client:
            response = await client.get(url)
            response.raise_for_status()
            return [types.TextContent(type="text", text=response.text)]

    sse = SseServerTransport("/messages/")

    async def handle_sse(request):
        async with sse.connect_sse(request.scope, request.receive, request._send) as streams:
            await server._mcp_server.run(
                streams[0],
                streams[1],
                server._mcp_server.create_initialization_options(),
            )

    app = Starlette(
        debug=True,
        routes=[
            Route("/sse", endpoint=handle_sse),
            Mount("/messages/", app=sse.handle_post_message),
        ],
    )

    def get_open_port():
        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
            sock.bind(("", 0))
            return sock.getsockname()[1]

    port = get_open_port()

    def run_server():
        uvicorn.run(app, host="0.0.0.0", port=port)

    # Start the server in a new thread
    server_thread = threading.Thread(target=run_server, daemon=True)
    server_thread.start()

    # Poll until the server is ready
    timeout = 10
    start_time = time.time()

    while time.time() - start_time < timeout:
        try:
            response = httpx.get(f"http://localhost:{port}/sse")
            if response.status_code == 200:
                break
        except (httpx.RequestError, httpx.HTTPStatusError):
            pass
        time.sleep(0.1)

    yield port


def test_register_and_unregister_toolgroup(llama_stack_client, mcp_server):
    """
    Integration test for registering and unregistering a toolgroup using the ToolGroups API.
    """
    port = mcp_server
    test_toolgroup_id = "remote::web-fetch"
    provider_id = "model-context-protocol"

    # Clean up before running the test
    toolgroups = llama_stack_client.toolgroups.list()
    for toolgroup in toolgroups:
        if toolgroup.identifier == test_toolgroup_id:
            llama_stack_client.toolgroups.unregister(toolgroup_id=test_toolgroup_id)

    # Register the toolgroup
    llama_stack_client.toolgroups.register(
        toolgroup_id=test_toolgroup_id,
        provider_id=provider_id,
        mcp_endpoint=URL(uri=f"http://localhost:{port}/sse"),
    )

    # Verify registration
    registered_toolgroup = llama_stack_client.toolgroups.get(toolgroup_id=test_toolgroup_id)
    assert registered_toolgroup is not None
    assert registered_toolgroup.identifier == test_toolgroup_id
    assert registered_toolgroup.provider_id == provider_id

    # Verify tools listing
    tools_list_response = llama_stack_client.tools.list(toolgroup_id=test_toolgroup_id)
    assert isinstance(tools_list_response, list)
    assert tools_list_response

    # Unregister the toolgroup
    llama_stack_client.toolgroups.unregister(toolgroup_id=test_toolgroup_id)

    # Verify it is unregistered
    with pytest.raises(ValueError, match=f"Tool group '{test_toolgroup_id}' not found"):
        llama_stack_client.toolgroups.get(toolgroup_id=test_toolgroup_id)

    # Verify tools are also unregistered
    unregister_tools_list_response = llama_stack_client.tools.list(toolgroup_id=test_toolgroup_id)
    assert isinstance(unregister_tools_list_response, list)
    assert not unregister_tools_list_response
tests/unit/distribution/test_context.py (new file, 155 lines)
@@ -0,0 +1,155 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

import asyncio
from concurrent.futures import ThreadPoolExecutor
from contextvars import ContextVar

import pytest

from llama_stack.distribution.utils.context import preserve_contexts_async_generator


@pytest.mark.asyncio
async def test_preserve_contexts_with_exception():
    # Create context variable
    context_var = ContextVar("exception_var", default="initial")
    token = context_var.set("start_value")

    # Create an async generator that raises an exception
    async def exception_generator():
        yield context_var.get()
        context_var.set("modified")
        raise ValueError("Test exception")
        yield None  # This will never be reached

    # Wrap the generator
    wrapped_gen = preserve_contexts_async_generator(exception_generator(), [context_var])

    # First iteration should work
    value = await wrapped_gen.__anext__()
    assert value == "start_value"

    # Second iteration should raise the exception
    with pytest.raises(ValueError, match="Test exception"):
        await wrapped_gen.__anext__()

    # Clean up
    context_var.reset(token)


@pytest.mark.asyncio
async def test_preserve_contexts_empty_generator():
    # Create context variable
    context_var = ContextVar("empty_var", default="initial")
    token = context_var.set("value")

    # Create an empty async generator
    async def empty_generator():
        if False:  # This condition ensures the generator yields nothing
            yield None

    # Wrap the generator
    wrapped_gen = preserve_contexts_async_generator(empty_generator(), [context_var])

    # The generator should raise StopAsyncIteration immediately
    with pytest.raises(StopAsyncIteration):
        await wrapped_gen.__anext__()

    # Context variable should remain unchanged
    assert context_var.get() == "value"

    # Clean up
    context_var.reset(token)


@pytest.mark.asyncio
async def test_preserve_contexts_across_event_loops():
    """
    Test that context variables are preserved across event loop boundaries with nested generators.

    This simulates the real-world scenario where:
    1. A new event loop is created for each streaming request
    2. The async generator runs inside that loop
    3. There are multiple levels of nested generators
    4. Context needs to be preserved across these boundaries
    """
    # Create context variables (initial values are set inside the outer generator below)
    request_id = ContextVar("request_id", default=None)
    user_id = ContextVar("user_id", default=None)

    # Results container to verify values across thread boundaries
    results = []

    # Inner-most generator (level 2)
    async def inner_generator():
        # Should have the context from the outer scope
        yield (1, request_id.get(), user_id.get())

        # Modify one context variable
        user_id.set("user-modified")

        # Should reflect the modification
        yield (2, request_id.get(), user_id.get())

    # Middle generator (level 1)
    async def middle_generator():
        inner_gen = inner_generator()

        # Forward the first yield from inner
        item = await inner_gen.__anext__()
        yield item

        # Forward the second yield from inner
        item = await inner_gen.__anext__()
        yield item

        request_id.set("req-modified")

        # Add our own yield with both modified variables
        yield (3, request_id.get(), user_id.get())

    # Function to run in a separate thread with a new event loop
    def run_in_new_loop():
        # Create a new event loop for this thread
        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)

        try:
            # Outer generator (runs in the new loop)
            async def outer_generator():
                request_id.set("req-12345")
                user_id.set("user-6789")

                # Wrap the middle generator
                wrapped_gen = preserve_contexts_async_generator(middle_generator(), [request_id, user_id])

                # Process all items from the middle generator
                async for item in wrapped_gen:
                    # Store results for verification
                    results.append(item)

            # Run the outer generator in the new loop
            loop.run_until_complete(outer_generator())
        finally:
            loop.close()

    # Run the generator chain in a separate thread with a new event loop
    with ThreadPoolExecutor(max_workers=1) as executor:
        future = executor.submit(run_in_new_loop)
        future.result()  # Wait for completion

    # Verify the results
    assert len(results) == 3

    # First yield should have the original values
    assert results[0] == (1, "req-12345", "user-6789")

    # Second yield should have the modified user_id
    assert results[1] == (2, "req-12345", "user-modified")

    # Third yield should have both modified values
    assert results[2] == (3, "req-modified", "user-modified")
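The helper these tests exercise, `preserve_contexts_async_generator`, is imported from `llama_stack.distribution.utils.context` and is not part of this diff. As a rough sketch of the behaviour the tests pin down (not the actual implementation), such a wrapper re-applies the tracked `ContextVar` values before each pull from the inner generator and carries any modifications forward:

```python
# Sketch only: illustrates the contract the tests above check, not the real
# preserve_contexts_async_generator from llama_stack.
from contextvars import ContextVar
from typing import AsyncGenerator, List, TypeVar

T = TypeVar("T")


def preserve_contexts_sketch(
    gen: AsyncGenerator[T, None], context_vars: List[ContextVar]
) -> AsyncGenerator[T, None]:
    # Capture the values visible at wrap time (e.g. inside outer_generator).
    captured = {cv: cv.get() for cv in context_vars}

    async def wrapper() -> AsyncGenerator[T, None]:
        while True:
            # Re-apply the captured values before resuming the inner generator,
            # so they survive hops across threads and event loops.
            for cv, value in captured.items():
                cv.set(value)
            try:
                item = await gen.__anext__()
            except StopAsyncIteration:
                return
            yield item
            # Carry forward any changes the inner generator made, as the
            # nested-generator test expects.
            for cv in context_vars:
                captured[cv] = cv.get()

    return wrapper()
```

Exceptions other than `StopAsyncIteration` propagate unchanged, which is what `test_preserve_contexts_with_exception` relies on.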
tests/unit/distribution/test_distribution.py (new file, 223 lines)
@@ -0,0 +1,223 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

from typing import Any, Dict
from unittest.mock import patch

import pytest
import yaml
from pydantic import BaseModel, Field, ValidationError

from llama_stack.distribution.datatypes import Api, Provider, StackRunConfig
from llama_stack.distribution.distribution import get_provider_registry
from llama_stack.providers.datatypes import ProviderSpec


class SampleConfig(BaseModel):
    foo: str = Field(
        default="bar",
        description="foo",
    )

    @classmethod
    def sample_run_config(cls, **kwargs: Any) -> Dict[str, Any]:
        return {
            "foo": "baz",
        }


@pytest.fixture
def mock_providers():
    """Mock the available_providers function to return test providers."""
    with patch("llama_stack.providers.registry.inference.available_providers") as mock:
        mock.return_value = [
            ProviderSpec(
                provider_type="test_provider",
                api=Api.inference,
                adapter_type="test_adapter",
                config_class="test_provider.config.TestProviderConfig",
            )
        ]
        yield mock


@pytest.fixture
def base_config(tmp_path):
    """Create a base StackRunConfig with common settings."""
    return StackRunConfig(
        image_name="test_image",
        providers={
            "inference": [
                Provider(
                    provider_id="sample_provider",
                    provider_type="sample",
                    config=SampleConfig.sample_run_config(),
                )
            ]
        },
        external_providers_dir=str(tmp_path),
    )


@pytest.fixture
def provider_spec_yaml():
    """Common provider spec YAML for testing."""
    return """
adapter:
  adapter_type: test_provider
  config_class: test_provider.config.TestProviderConfig
  module: test_provider
api_dependencies:
  - safety
"""


@pytest.fixture
def inline_provider_spec_yaml():
    """Common inline provider spec YAML for testing."""
    return """
module: test_provider
config_class: test_provider.config.TestProviderConfig
pip_packages:
  - test-package
api_dependencies:
  - safety
optional_api_dependencies:
  - vector_io
provider_data_validator: test_provider.validator.TestValidator
container_image: test-image:latest
"""


@pytest.fixture
def api_directories(tmp_path):
    """Create the API directory structure for testing."""
    # Create remote provider directory
    remote_inference_dir = tmp_path / "remote" / "inference"
    remote_inference_dir.mkdir(parents=True, exist_ok=True)

    # Create inline provider directory
    inline_inference_dir = tmp_path / "inline" / "inference"
    inline_inference_dir.mkdir(parents=True, exist_ok=True)

    return remote_inference_dir, inline_inference_dir


class TestProviderRegistry:
    """Test suite for provider registry functionality."""

    def test_builtin_providers(self, mock_providers):
        """Test loading built-in providers."""
        registry = get_provider_registry(None)

        assert Api.inference in registry
        assert "test_provider" in registry[Api.inference]
        assert registry[Api.inference]["test_provider"].provider_type == "test_provider"
        assert registry[Api.inference]["test_provider"].api == Api.inference

    def test_external_remote_providers(self, api_directories, mock_providers, base_config, provider_spec_yaml):
        """Test loading external remote providers from YAML files."""
        remote_dir, _ = api_directories
        with open(remote_dir / "test_provider.yaml", "w") as f:
            f.write(provider_spec_yaml)

        registry = get_provider_registry(base_config)
        assert len(registry[Api.inference]) == 2

        assert Api.inference in registry
        assert "remote::test_provider" in registry[Api.inference]
        provider = registry[Api.inference]["remote::test_provider"]
        assert provider.adapter.adapter_type == "test_provider"
        assert provider.adapter.module == "test_provider"
        assert provider.adapter.config_class == "test_provider.config.TestProviderConfig"
        assert Api.safety in provider.api_dependencies

    def test_external_inline_providers(self, api_directories, mock_providers, base_config, inline_provider_spec_yaml):
        """Test loading external inline providers from YAML files."""
        _, inline_dir = api_directories
        with open(inline_dir / "test_provider.yaml", "w") as f:
            f.write(inline_provider_spec_yaml)

        registry = get_provider_registry(base_config)
        assert len(registry[Api.inference]) == 2

        assert Api.inference in registry
        assert "inline::test_provider" in registry[Api.inference]
        provider = registry[Api.inference]["inline::test_provider"]
        assert provider.provider_type == "inline::test_provider"
        assert provider.module == "test_provider"
        assert provider.config_class == "test_provider.config.TestProviderConfig"
        assert provider.pip_packages == ["test-package"]
        assert Api.safety in provider.api_dependencies
        assert Api.vector_io in provider.optional_api_dependencies
        assert provider.provider_data_validator == "test_provider.validator.TestValidator"
        assert provider.container_image == "test-image:latest"

    def test_invalid_yaml(self, api_directories, mock_providers, base_config):
        """Test handling of invalid YAML files."""
        remote_dir, inline_dir = api_directories
        with open(remote_dir / "invalid.yaml", "w") as f:
            f.write("invalid: yaml: content: -")
        with open(inline_dir / "invalid.yaml", "w") as f:
            f.write("invalid: yaml: content: -")

        with pytest.raises(yaml.YAMLError):
            get_provider_registry(base_config)

    def test_missing_directory(self, mock_providers):
        """Test handling of missing external providers directory."""
        config = StackRunConfig(
            image_name="test_image",
            providers={
                "inference": [
                    Provider(
                        provider_id="sample_provider",
                        provider_type="sample",
                        config=SampleConfig.sample_run_config(),
                    )
                ]
            },
            external_providers_dir="/nonexistent/dir",
        )
        with pytest.raises(FileNotFoundError):
            get_provider_registry(config)

    def test_empty_api_directory(self, api_directories, mock_providers, base_config):
        """Test handling of empty API directory."""
        registry = get_provider_registry(base_config)
        assert len(registry[Api.inference]) == 1  # Only built-in provider

    def test_malformed_remote_provider_spec(self, api_directories, mock_providers, base_config):
        """Test handling of malformed remote provider spec (missing required fields)."""
        remote_dir, _ = api_directories
        malformed_spec = """
adapter:
  adapter_type: test_provider
  # Missing required fields
api_dependencies:
  - safety
"""
        with open(remote_dir / "malformed.yaml", "w") as f:
            f.write(malformed_spec)

        with pytest.raises(ValidationError):
            get_provider_registry(base_config)

    def test_malformed_inline_provider_spec(self, api_directories, mock_providers, base_config):
        """Test handling of malformed inline provider spec (missing required fields)."""
        _, inline_dir = api_directories
        malformed_spec = """
module: test_provider
# Missing required config_class
pip_packages:
  - test-package
"""
        with open(inline_dir / "malformed.yaml", "w") as f:
            f.write(malformed_spec)

        with pytest.raises(KeyError) as exc_info:
            get_provider_registry(base_config)
        assert "config_class" in str(exc_info.value)
tests/verifications/README.md (new file, 65 lines)
@@ -0,0 +1,65 @@
# Llama Stack Verifications

Llama Stack Verifications provide standardized test suites to ensure API compatibility and behavior consistency across different LLM providers. These tests help verify that different models and providers implement the expected interfaces and behaviors correctly.

## Overview

This framework allows you to run the same set of verification tests against different LLM providers' OpenAI-compatible endpoints (Fireworks, Together, Groq, Cerebras, etc., and OpenAI itself) to ensure they meet the expected behavior and interface standards.

## Features

The verification suite currently tests:

- Basic chat completions (streaming and non-streaming)
- Image input capabilities
- Structured JSON output formatting
- Tool calling functionality

## Running Tests

To run the verification tests, use pytest with the following parameters:

```bash
cd llama-stack
pytest tests/verifications/openai --provider=<provider-name>
```

Example:
```bash
# Run all tests
pytest tests/verifications/openai --provider=together

# Only run tests with Llama 4 models
pytest tests/verifications/openai --provider=together -k 'Llama-4'
```

### Parameters

- `--provider`: The provider name (openai, fireworks, together, groq, cerebras, etc.)
- `--base-url`: The base URL for the provider's API (optional; defaults to the standard URL for the specified provider)
- `--api-key`: Your API key for the provider (optional; defaults to the provider's standard API key environment variable)
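For reference, the defaults behind `--base-url` and `--api-key` come from the provider fixtures added in this commit; the mapping below simply mirrors the `provider_metadata` fixture in `openai/fixtures/fixtures.py`:

```python
# Default base URL and API-key environment variable per provider, copied from
# the provider_metadata fixture in tests/verifications/openai/fixtures/fixtures.py.
PROVIDER_DEFAULTS = {
    "fireworks": ("https://api.fireworks.ai/inference/v1", "FIREWORKS_API_KEY"),
    "together": ("https://api.together.xyz/v1", "TOGETHER_API_KEY"),
    "groq": ("https://api.groq.com/openai/v1", "GROQ_API_KEY"),
    "cerebras": ("https://api.cerebras.ai/v1", "CEREBRAS_API_KEY"),
    "openai": ("https://api.openai.com/v1", "OPENAI_API_KEY"),
}
```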
## Supported Providers

The verification suite currently supports:
- OpenAI
- Fireworks
- Together
- Groq
- Cerebras

## Adding New Test Cases

To add new test cases, add entries to the YAML files in the `openai/fixtures/test_cases/` directory, following the existing patterns; they are loaded via `fixtures/load.py`. A sketch of the expected shape follows.
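As a rough illustration of the shape the loader expects (derived from `fixtures/load.py` and the chat-completion test cases added in this commit; the file name `chat_completion` is an assumption, since the YAML file's name is not visible in this excerpt):

```python
# Sketch only: load a test-case group and inspect its structure.
from tests.verifications.openai.fixtures.load import load_test_cases

# Hypothetical name; pass whichever <name>.yaml lives under fixtures/test_cases/.
cases = load_test_cases("chat_completion")

# Each top-level key (e.g. "test_chat_basic") holds a test_name plus test_params
# with an input_output list and a model list.
basic = cases["test_chat_basic"]["test_params"]
for case in basic["input_output"]:
    print(case["input"]["messages"], "->", case["output"])
print("models:", basic["model"])
```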
## Structure

- `__init__.py` - Marks the directory as a Python package
- `conftest.py` - Global pytest configuration and fixtures
- `openai/` - Tests specific to OpenAI-compatible APIs
  - `fixtures/` - Test fixtures and utilities
    - `fixtures.py` - Provider-specific fixtures
    - `load.py` - Utilities for loading test cases
    - `test_cases/` - YAML test case definitions
  - `test_chat_completion.py` - Tests for chat completion APIs
tests/verifications/REPORT.md (new file, 88 lines)
@@ -0,0 +1,88 @@
# Test Results Report

*Generated on: 2025-04-08 21:14:02*

*This report was generated by running `python tests/verifications/generate_report.py`*

## Legend

- ✅ - Test passed
- ❌ - Test failed
- ⚪ - Test not applicable or not run for this model


## Summary

| Provider | Pass Rate | Tests Passed | Total Tests |
| --- | --- | --- | --- |
| Together | 67.7% | 21 | 31 |
| Fireworks | 90.3% | 28 | 31 |
| Openai | 100.0% | 22 | 22 |


## Together

*Tests run on: 2025-04-08 16:19:59*

```bash
pytest tests/verifications/openai/test_chat_completion.py --provider=together -v
```

| Test | Llama-3.3-70B-Instruct | Llama-4-Maverick-17B-128E-Instruct | Llama-4-Scout-17B-16E-Instruct |
| --- | --- | --- | --- |
| test_chat_non_streaming_basic (case 0) | ✅ | ✅ | ✅ |
| test_chat_non_streaming_basic (case 1) | ✅ | ✅ | ✅ |
| test_chat_non_streaming_image (case 0) | ⚪ | ✅ | ✅ |
| test_chat_non_streaming_structured_output (case 0) | ✅ | ✅ | ✅ |
| test_chat_non_streaming_structured_output (case 1) | ✅ | ✅ | ✅ |
| test_chat_non_streaming_tool_calling (case 0) | ✅ | ✅ | ✅ |
| test_chat_streaming_basic (case 0) | ✅ | ❌ | ❌ |
| test_chat_streaming_basic (case 1) | ✅ | ❌ | ❌ |
| test_chat_streaming_image (case 0) | ⚪ | ❌ | ❌ |
| test_chat_streaming_structured_output (case 0) | ✅ | ❌ | ❌ |
| test_chat_streaming_structured_output (case 1) | ✅ | ❌ | ❌ |

## Fireworks

*Tests run on: 2025-04-08 16:18:28*

```bash
pytest tests/verifications/openai/test_chat_completion.py --provider=fireworks -v
```

| Test | Llama-3.3-70B-Instruct | Llama-4-Maverick-17B-128E-Instruct | Llama-4-Scout-17B-16E-Instruct |
| --- | --- | --- | --- |
| test_chat_non_streaming_basic (case 0) | ✅ | ✅ | ✅ |
| test_chat_non_streaming_basic (case 1) | ✅ | ✅ | ✅ |
| test_chat_non_streaming_image (case 0) | ⚪ | ✅ | ✅ |
| test_chat_non_streaming_structured_output (case 0) | ✅ | ✅ | ✅ |
| test_chat_non_streaming_structured_output (case 1) | ✅ | ✅ | ✅ |
| test_chat_non_streaming_tool_calling (case 0) | ✅ | ❌ | ❌ |
| test_chat_streaming_basic (case 0) | ✅ | ✅ | ✅ |
| test_chat_streaming_basic (case 1) | ✅ | ✅ | ✅ |
| test_chat_streaming_image (case 0) | ⚪ | ✅ | ✅ |
| test_chat_streaming_structured_output (case 0) | ✅ | ✅ | ✅ |
| test_chat_streaming_structured_output (case 1) | ❌ | ✅ | ✅ |

## Openai

*Tests run on: 2025-04-08 16:22:02*

```bash
pytest tests/verifications/openai/test_chat_completion.py --provider=openai -v
```

| Test | gpt-4o | gpt-4o-mini |
| --- | --- | --- |
| test_chat_non_streaming_basic (case 0) | ✅ | ✅ |
| test_chat_non_streaming_basic (case 1) | ✅ | ✅ |
| test_chat_non_streaming_image (case 0) | ✅ | ✅ |
| test_chat_non_streaming_structured_output (case 0) | ✅ | ✅ |
| test_chat_non_streaming_structured_output (case 1) | ✅ | ✅ |
| test_chat_non_streaming_tool_calling (case 0) | ✅ | ✅ |
| test_chat_streaming_basic (case 0) | ✅ | ✅ |
| test_chat_streaming_basic (case 1) | ✅ | ✅ |
| test_chat_streaming_image (case 0) | ✅ | ✅ |
| test_chat_streaming_structured_output (case 0) | ✅ | ✅ |
| test_chat_streaming_structured_output (case 1) | ✅ | ✅ |
tests/verifications/__init__.py (new file, 5 lines)
@@ -0,0 +1,5 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
tests/verifications/conftest.py (new file, 28 lines)
@@ -0,0 +1,28 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.


def pytest_addoption(parser):
    parser.addoption(
        "--base-url",
        action="store",
        help="Base URL for OpenAI compatible API",
    )
    parser.addoption(
        "--api-key",
        action="store",
        help="API key",
    )
    parser.addoption(
        "--provider",
        action="store",
        help="Provider to use for testing",
    )


pytest_plugins = [
    "tests.verifications.openai.fixtures.fixtures",
]
tests/verifications/generate_report.py (new executable file, 485 lines)
@@ -0,0 +1,485 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

"""
Test Report Generator

Requirements:
    pip install pytest-json-report

Usage:
    # Generate a report using existing test results
    python tests/verifications/generate_report.py

    # Run tests and generate a report
    python tests/verifications/generate_report.py --run-tests

    # Run tests for specific providers
    python tests/verifications/generate_report.py --run-tests --providers fireworks openai

    # Save the report to a custom location
    python tests/verifications/generate_report.py --output custom_report.md

    # Clean up old test result files
    python tests/verifications/generate_report.py --cleanup
"""

import argparse
import json
import os
import re
import subprocess
import time
from collections import defaultdict
from pathlib import Path

# Define the root directory for test results
RESULTS_DIR = Path(__file__).parent / "test_results"
RESULTS_DIR.mkdir(exist_ok=True)

# Maximum number of test result files to keep per provider
MAX_RESULTS_PER_PROVIDER = 1

# Custom order of providers
PROVIDER_ORDER = ["together", "fireworks", "groq", "cerebras", "openai"]

# Dictionary to store providers and their models (will be populated dynamically)
PROVIDERS = defaultdict(set)

# Tests will be dynamically extracted from results
ALL_TESTS = set()


def run_tests(provider):
    """Run pytest for a specific provider and save results"""
    print(f"Running tests for provider: {provider}")

    timestamp = int(time.time())
    result_file = RESULTS_DIR / f"{provider}_{timestamp}.json"
    temp_json_file = RESULTS_DIR / f"temp_{provider}_{timestamp}.json"

    # Run pytest with JSON output
    cmd = [
        "python",
        "-m",
        "pytest",
        "tests/verifications/openai/test_chat_completion.py",
        f"--provider={provider}",
        "-v",
        "--json-report",
        f"--json-report-file={temp_json_file}",
    ]

    try:
        result = subprocess.run(cmd, capture_output=True, text=True)
        print(f"Pytest exit code: {result.returncode}")

        # Check if the JSON file was created
        if temp_json_file.exists():
            # Read the JSON file and save it to our results format
            with open(temp_json_file, "r") as f:
                test_results = json.load(f)

            # Save results to our own format with a trailing newline
            with open(result_file, "w") as f:
                json.dump(test_results, f, indent=2)
                f.write("\n")  # Add a trailing newline for precommit

            # Clean up temp file
            temp_json_file.unlink()

            print(f"Test results saved to {result_file}")
            return result_file
        else:
            print(f"Error: JSON report file not created for {provider}")
            print(f"Command stdout: {result.stdout}")
            print(f"Command stderr: {result.stderr}")
            return None
    except Exception as e:
        print(f"Error running tests for {provider}: {e}")
        return None


def parse_results(result_file):
    """Parse the test results file and extract pass/fail by model and test"""
    if not os.path.exists(result_file):
        print(f"Results file does not exist: {result_file}")
        return {}

    with open(result_file, "r") as f:
        results = json.load(f)

    # Initialize results dictionary
    parsed_results = defaultdict(lambda: defaultdict(dict))
    provider = os.path.basename(result_file).split("_")[0]

    # Debug: Print summary of test results
    print(f"Test results summary for {provider}:")
    print(f"Total tests: {results.get('summary', {}).get('total', 0)}")
    print(f"Passed: {results.get('summary', {}).get('passed', 0)}")
    print(f"Failed: {results.get('summary', {}).get('failed', 0)}")
    print(f"Error: {results.get('summary', {}).get('error', 0)}")
    print(f"Skipped: {results.get('summary', {}).get('skipped', 0)}")

    # Extract test results
    if "tests" not in results or not results["tests"]:
        print(f"No test results found in {result_file}")
        return parsed_results

    # Map for normalizing model names
    model_name_map = {
        "Llama-3.3-8B-Instruct": "Llama-3.3-8B-Instruct",
        "Llama-3.3-70B-Instruct": "Llama-3.3-70B-Instruct",
        "Llama-3.2-11B-Vision-Instruct": "Llama-3.2-11B-Vision-Instruct",
        "Llama-4-Scout-17B-16E": "Llama-4-Scout-17B-16E-Instruct",
        "Llama-4-Scout-17B-16E-Instruct": "Llama-4-Scout-17B-16E-Instruct",
        "Llama-4-Maverick-17B-128E": "Llama-4-Maverick-17B-128E-Instruct",
        "Llama-4-Maverick-17B-128E-Instruct": "Llama-4-Maverick-17B-128E-Instruct",
        "gpt-4o": "gpt-4o",
        "gpt-4o-mini": "gpt-4o-mini",
    }

    # Keep track of all models found for this provider
    provider_models = set()

    # Track all unique test cases for each base test
    test_case_counts = defaultdict(int)

    # First pass: count the number of cases for each test
    for test in results["tests"]:
        test_id = test.get("nodeid", "")

        if "call" in test:
            test_name = test_id.split("::")[1].split("[")[0]
            input_output_match = re.search(r"\[input_output(\d+)-", test_id)
            if input_output_match:
                test_case_counts[test_name] += 1

    # Second pass: process the tests with case numbers only for tests with multiple cases
    for test in results["tests"]:
        test_id = test.get("nodeid", "")
        outcome = test.get("outcome", "")

        # Only process tests that have been executed (not setup errors)
        if "call" in test:
            # Regular test that actually ran
            test_name = test_id.split("::")[1].split("[")[0]

            # Extract input_output parameter to differentiate between test cases
            input_output_match = re.search(r"\[input_output(\d+)-", test_id)
            input_output_index = input_output_match.group(1) if input_output_match else ""

            # Create a more detailed test name with case number only if there are multiple cases
            detailed_test_name = test_name
            if input_output_index and test_case_counts[test_name] > 1:
                detailed_test_name = f"{test_name} (case {input_output_index})"

            # Track all unique test names
            ALL_TESTS.add(detailed_test_name)

            # Extract model name from test_id using a more robust pattern
            model_match = re.search(r"\[input_output\d+-([^\]]+)\]", test_id)
            if model_match:
                raw_model = model_match.group(1)
                model = model_name_map.get(raw_model, raw_model)

                # Add to set of known models for this provider
                provider_models.add(model)

                # Also update the global PROVIDERS dictionary
                PROVIDERS[provider].add(model)

                # Store the result
                if outcome == "passed":
                    parsed_results[provider][model][detailed_test_name] = True
                else:
                    parsed_results[provider][model][detailed_test_name] = False

                print(f"Parsed test result: {detailed_test_name} for model {model}: {outcome}")
        elif outcome == "error" and "setup" in test and test.get("setup", {}).get("outcome") == "failed":
            # This is a setup failure, which likely means a configuration issue
            # Extract the base test name and model name
            parts = test_id.split("::")
            if len(parts) > 1:
                test_name = parts[1].split("[")[0]

                # Extract input_output parameter to differentiate between test cases
                input_output_match = re.search(r"\[input_output(\d+)-", test_id)
                input_output_index = input_output_match.group(1) if input_output_match else ""

                # Create a more detailed test name with case number only if there are multiple cases
                detailed_test_name = test_name
                if input_output_index and test_case_counts[test_name] > 1:
                    detailed_test_name = f"{test_name} (case {input_output_index})"

                if detailed_test_name in ALL_TESTS:
                    # Use a more robust pattern for model extraction
                    model_match = re.search(r"\[input_output\d+-([^\]]+)\]", test_id)
                    if model_match:
                        raw_model = model_match.group(1)
                        model = model_name_map.get(raw_model, raw_model)

                        # Add to set of known models for this provider
                        provider_models.add(model)

                        # Also update the global PROVIDERS dictionary
                        PROVIDERS[provider].add(model)

                        # Mark setup failures as false (failed)
                        parsed_results[provider][model][detailed_test_name] = False
                        print(f"Parsed setup failure: {detailed_test_name} for model {model}")

    # Debug: Print parsed results
    if not parsed_results[provider]:
        print(f"Warning: No test results parsed for provider {provider}")
    else:
        for model, tests in parsed_results[provider].items():
            print(f"Model {model}: {len(tests)} test results")

    return parsed_results


def cleanup_old_results():
    """Clean up old test result files, keeping only the newest N per provider"""
    for provider in PROVIDERS.keys():
        # Get all result files for this provider
        provider_files = list(RESULTS_DIR.glob(f"{provider}_*.json"))

        # Sort by timestamp (newest first)
        provider_files.sort(key=lambda x: int(x.stem.split("_")[1]), reverse=True)

        # Remove old files beyond the max to keep
        if len(provider_files) > MAX_RESULTS_PER_PROVIDER:
            for old_file in provider_files[MAX_RESULTS_PER_PROVIDER:]:
                try:
                    old_file.unlink()
                    print(f"Removed old result file: {old_file}")
                except Exception as e:
                    print(f"Error removing file {old_file}: {e}")


def get_latest_results_by_provider():
    """Get the latest test result file for each provider"""
    provider_results = {}

    # Get all result files
    result_files = list(RESULTS_DIR.glob("*.json"))

    # Extract all provider names from filenames
    all_providers = set()
    for file in result_files:
        # File format is provider_timestamp.json
        parts = file.stem.split("_")
        if len(parts) >= 2:
            all_providers.add(parts[0])

    # Group by provider
    for provider in all_providers:
        provider_files = [f for f in result_files if f.name.startswith(f"{provider}_")]

        # Sort by timestamp (newest first)
        provider_files.sort(key=lambda x: int(x.stem.split("_")[1]), reverse=True)

        if provider_files:
            provider_results[provider] = provider_files[0]

    return provider_results


def generate_report(results_dict, output_file=None):
    """Generate the markdown report"""
    if output_file is None:
        # Default to creating the report in the same directory as this script
        output_file = Path(__file__).parent / "REPORT.md"
    else:
        output_file = Path(output_file)

    # Get the timestamp from result files
    provider_timestamps = {}
    provider_results = get_latest_results_by_provider()
    for provider, result_file in provider_results.items():
        # Extract timestamp from filename (format: provider_timestamp.json)
        try:
            timestamp_str = result_file.stem.split("_")[1]
            timestamp = int(timestamp_str)
            formatted_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(timestamp))
            provider_timestamps[provider] = formatted_time
        except (IndexError, ValueError):
            provider_timestamps[provider] = "Unknown"

    # Convert provider model sets to sorted lists
    for provider in PROVIDERS:
        PROVIDERS[provider] = sorted(PROVIDERS[provider])

    # Sort tests alphabetically
    sorted_tests = sorted(ALL_TESTS)

    report = ["# Test Results Report\n"]
    report.append(f"*Generated on: {time.strftime('%Y-%m-%d %H:%M:%S')}*\n")
    report.append("*This report was generated by running `python tests/verifications/generate_report.py`*\n")

    # Icons for pass/fail
    pass_icon = "✅"
    fail_icon = "❌"
    na_icon = "⚪"

    # Add emoji legend
    report.append("## Legend\n")
    report.append(f"- {pass_icon} - Test passed")
    report.append(f"- {fail_icon} - Test failed")
    report.append(f"- {na_icon} - Test not applicable or not run for this model")
    report.append("\n")

    # Add a summary section
    report.append("## Summary\n")

    # Count total tests and passes
    total_tests = 0
    passed_tests = 0
    provider_totals = {}

    # Prepare summary data
    for provider in PROVIDERS.keys():
        provider_passed = 0
        provider_total = 0

        if provider in results_dict:
            provider_models = PROVIDERS[provider]
            for model in provider_models:
                if model in results_dict[provider]:
                    model_results = results_dict[provider][model]
                    for test in sorted_tests:
                        if test in model_results:
                            provider_total += 1
                            total_tests += 1
                            if model_results[test]:
                                provider_passed += 1
                                passed_tests += 1

        provider_totals[provider] = (provider_passed, provider_total)

    # Add summary table
    report.append("| Provider | Pass Rate | Tests Passed | Total Tests |")
    report.append("| --- | --- | --- | --- |")

    # Use the custom order for the summary table
    for provider in [p for p in PROVIDER_ORDER if p in PROVIDERS]:
        passed, total = provider_totals.get(provider, (0, 0))
        pass_rate = f"{(passed / total * 100):.1f}%" if total > 0 else "N/A"
        report.append(f"| {provider.capitalize()} | {pass_rate} | {passed} | {total} |")

    # Add providers not in the custom order
    for provider in [p for p in PROVIDERS if p not in PROVIDER_ORDER]:
        passed, total = provider_totals.get(provider, (0, 0))
        pass_rate = f"{(passed / total * 100):.1f}%" if total > 0 else "N/A"
        report.append(f"| {provider.capitalize()} | {pass_rate} | {passed} | {total} |")

    report.append("\n")

    # Process each provider in the custom order, then any additional providers
    for provider in sorted(
        PROVIDERS.keys(), key=lambda p: (PROVIDER_ORDER.index(p) if p in PROVIDER_ORDER else float("inf"), p)
    ):
        if not PROVIDERS[provider]:
            # Skip providers with no models
            continue

        report.append(f"\n## {provider.capitalize()}\n")

        # Add timestamp when test was run
        if provider in provider_timestamps:
            report.append(f"*Tests run on: {provider_timestamps[provider]}*\n")

        # Add test command for reproducing results
        test_cmd = f"pytest tests/verifications/openai/test_chat_completion.py --provider={provider} -v"
        report.append(f"```bash\n{test_cmd}\n```\n")

        # Get the relevant models for this provider
        provider_models = PROVIDERS[provider]

        # Create table header with models as columns
        header = "| Test | " + " | ".join(provider_models) + " |"
        separator = "| --- | " + " | ".join(["---"] * len(provider_models)) + " |"

        report.append(header)
        report.append(separator)

        # Get results for this provider
        provider_results = results_dict.get(provider, {})

        # Add rows for each test
        for test in sorted_tests:
            row = f"| {test} |"

            # Add results for each model in this test
            for model in provider_models:
                if model in provider_results and test in provider_results[model]:
                    result = pass_icon if provider_results[model][test] else fail_icon
                else:
                    result = na_icon
                row += f" {result} |"

            report.append(row)

    # Write to file
    with open(output_file, "w") as f:
        f.write("\n".join(report))
        f.write("\n")

    print(f"Report generated: {output_file}")


def main():
    parser = argparse.ArgumentParser(description="Generate test report")
    parser.add_argument("--run-tests", action="store_true", help="Run tests before generating report")
    parser.add_argument(
        "--providers",
        type=str,
        nargs="+",
        help="Specify providers to test (comma-separated or space-separated, default: all)",
    )
    parser.add_argument("--output", type=str, help="Output file location (default: tests/verifications/REPORT.md)")
    args = parser.parse_args()

    all_results = {}

    if args.run_tests:
        # Get list of available providers from command line or use detected providers
        if args.providers:
            # Handle both comma-separated and space-separated lists
            test_providers = []
            for provider_arg in args.providers:
                # Split by comma if commas are present
                if "," in provider_arg:
                    test_providers.extend(provider_arg.split(","))
                else:
                    test_providers.append(provider_arg)
        else:
            # Default providers to test
            test_providers = PROVIDER_ORDER

        for provider in test_providers:
            provider = provider.strip()  # Remove any whitespace
            result_file = run_tests(provider)
            if result_file:
                provider_results = parse_results(result_file)
                all_results.update(provider_results)
    else:
        # Use existing results
        provider_result_files = get_latest_results_by_provider()

        for result_file in provider_result_files.values():
            provider_results = parse_results(result_file)
            all_results.update(provider_results)

    # Generate the report
    generate_report(all_results, args.output)

    cleanup_old_results()


if __name__ == "__main__":
    main()
tests/verifications/openai/__init__.py (new file, 5 lines)
@@ -0,0 +1,5 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
tests/verifications/openai/fixtures/__init__.py (new file, 5 lines)
@@ -0,0 +1,5 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
tests/verifications/openai/fixtures/fixtures.py (new file, 97 lines)
@@ -0,0 +1,97 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

import os

import pytest
from openai import OpenAI


@pytest.fixture
def providers_model_mapping():
    """
    Mapping from model names used in test cases to the provider's model names.
    """
    return {
        "fireworks": {
            "Llama-3.3-70B-Instruct": "accounts/fireworks/models/llama-v3p1-70b-instruct",
            "Llama-3.2-11B-Vision-Instruct": "accounts/fireworks/models/llama-v3p2-11b-vision-instruct",
            "Llama-4-Scout-17B-16E-Instruct": "accounts/fireworks/models/llama4-scout-instruct-basic",
            "Llama-4-Maverick-17B-128E-Instruct": "accounts/fireworks/models/llama4-maverick-instruct-basic",
        },
        "together": {
            "Llama-3.3-70B-Instruct": "meta-llama/Llama-3.3-70B-Instruct-Turbo",
            "Llama-3.2-11B-Vision-Instruct": "meta-llama/Llama-3.2-11B-Vision-Instruct-Turbo",
            "Llama-4-Scout-17B-16E-Instruct": "meta-llama/Llama-4-Scout-17B-16E-Instruct",
            "Llama-4-Maverick-17B-128E-Instruct": "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8",
        },
        "groq": {
            "Llama-3.3-70B-Instruct": "llama-3.3-70b-versatile",
            "Llama-3.2-11B-Vision-Instruct": "llama-3.2-11b-vision-preview",
            "Llama-4-Scout-17B-16E-Instruct": "llama-4-scout-17b-16e-instruct",
            "Llama-4-Maverick-17B-128E-Instruct": "llama-4-maverick-17b-128e-instruct",
        },
        "cerebras": {
            "Llama-3.3-70B-Instruct": "llama-3.3-70b",
        },
        "openai": {
            "gpt-4o": "gpt-4o",
            "gpt-4o-mini": "gpt-4o-mini",
        },
    }


@pytest.fixture
def provider_metadata():
    return {
        "fireworks": ("https://api.fireworks.ai/inference/v1", "FIREWORKS_API_KEY"),
        "together": ("https://api.together.xyz/v1", "TOGETHER_API_KEY"),
        "groq": ("https://api.groq.com/openai/v1", "GROQ_API_KEY"),
        "cerebras": ("https://api.cerebras.ai/v1", "CEREBRAS_API_KEY"),
        "openai": ("https://api.openai.com/v1", "OPENAI_API_KEY"),
    }


@pytest.fixture
def provider(request, provider_metadata):
    provider = request.config.getoption("--provider")
    base_url = request.config.getoption("--base-url")

    if provider and base_url and provider_metadata[provider][0] != base_url:
        raise ValueError(f"Provider {provider} is not supported for base URL {base_url}")

    if not provider:
        if not base_url:
            raise ValueError("Provider and base URL are not provided")
        # Infer the provider from the base URL
        for provider, metadata in provider_metadata.items():
            if metadata[0] == base_url:
                break

    return provider


@pytest.fixture
def base_url(request, provider, provider_metadata):
    return request.config.getoption("--base-url") or provider_metadata[provider][0]


@pytest.fixture
def api_key(request, provider, provider_metadata):
    return request.config.getoption("--api-key") or os.getenv(provider_metadata[provider][1])


@pytest.fixture
def model_mapping(provider, providers_model_mapping):
    return providers_model_mapping[provider]


@pytest.fixture
def openai_client(base_url, api_key):
    return OpenAI(
        base_url=base_url,
        api_key=api_key,
    )
tests/verifications/openai/fixtures/load.py (new file, 16 lines)
@@ -0,0 +1,16 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

from pathlib import Path

import yaml


def load_test_cases(name: str):
    fixture_dir = Path(__file__).parent / "test_cases"
    yaml_path = fixture_dir / f"{name}.yaml"
    with open(yaml_path, "r") as f:
        return yaml.safe_load(f)
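For orientation, a hedged sketch of how a test module might combine `load_test_cases` with the `openai_client` and `model_mapping` fixtures above; the actual `test_chat_completion.py` is not included in this excerpt, so the parametrization style, file name, and assertion are assumptions:

```python
# Sketch only: drive one test-case group from fixtures/test_cases/<name>.yaml.
import pytest

from tests.verifications.openai.fixtures.load import load_test_cases

chat_cases = load_test_cases("chat_completion")  # hypothetical file name


@pytest.mark.parametrize("input_output", chat_cases["test_chat_basic"]["test_params"]["input_output"])
def test_chat_basic_sketch(openai_client, model_mapping, input_output):
    # Pick one logical model name for illustration and map it to the
    # provider-specific model id.
    model = model_mapping["Llama-3.3-70B-Instruct"]
    response = openai_client.chat.completions.create(
        model=model,
        messages=input_output["input"]["messages"],
        stream=False,
    )
    assert input_output["output"].lower() in response.choices[0].message.content.lower()
```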
|
|
@ -0,0 +1,162 @@
|
|||
test_chat_basic:
  test_name: test_chat_basic
  test_params:
    input_output:
    - input:
        messages:
        - content: Which planet do humans live on?
          role: user
      output: Earth
    - input:
        messages:
        - content: Which planet has rings around it with a name starting with letter
            S?
          role: user
      output: Saturn
    model:
    - Llama-3.3-8B-Instruct
    - Llama-3.3-70B-Instruct
    - Llama-4-Scout-17B-16E
    - Llama-4-Scout-17B-16E-Instruct
    - Llama-4-Maverick-17B-128E
    - Llama-4-Maverick-17B-128E-Instruct
    - gpt-4o
    - gpt-4o-mini
test_chat_image:
  test_name: test_chat_image
  test_params:
    input_output:
    - input:
        messages:
        - content:
          - text: What is in this image?
            type: text
          - image_url:
              url: https://upload.wikimedia.org/wikipedia/commons/f/f7/Llamas%2C_Vernagt-Stausee%2C_Italy.jpg
            type: image_url
          role: user
      output: llama
    model:
    - Llama-4-Scout-17B-16E
    - Llama-4-Scout-17B-16E-Instruct
    - Llama-4-Maverick-17B-128E
    - Llama-4-Maverick-17B-128E-Instruct
    - gpt-4o
    - gpt-4o-mini
test_chat_structured_output:
  test_name: test_chat_structured_output
  test_params:
    input_output:
    - input:
        messages:
        - content: Extract the event information.
          role: system
        - content: Alice and Bob are going to a science fair on Friday.
          role: user
        response_format:
          json_schema:
            name: calendar_event
            schema:
              properties:
                date:
                  title: Date
                  type: string
                name:
                  title: Name
                  type: string
                participants:
                  items:
                    type: string
                  title: Participants
                  type: array
              required:
              - name
              - date
              - participants
              title: CalendarEvent
              type: object
          type: json_schema
      output: valid_calendar_event
    - input:
        messages:
        - content: You are a helpful math tutor. Guide the user through the solution
            step by step.
          role: system
        - content: how can I solve 8x + 7 = -23
          role: user
        response_format:
          json_schema:
            name: math_reasoning
            schema:
              $defs:
                Step:
                  properties:
                    explanation:
                      title: Explanation
                      type: string
                    output:
                      title: Output
                      type: string
                  required:
                  - explanation
                  - output
                  title: Step
                  type: object
              properties:
                final_answer:
                  title: Final Answer
                  type: string
                steps:
                  items:
                    $ref: '#/$defs/Step'
                  title: Steps
                  type: array
              required:
              - steps
              - final_answer
              title: MathReasoning
              type: object
          type: json_schema
      output: valid_math_reasoning
    model:
    - Llama-3.3-8B-Instruct
    - Llama-3.3-70B-Instruct
    - Llama-4-Scout-17B-16E
    - Llama-4-Scout-17B-16E-Instruct
    - Llama-4-Maverick-17B-128E
    - Llama-4-Maverick-17B-128E-Instruct
    - gpt-4o
    - gpt-4o-mini
test_tool_calling:
  test_name: test_tool_calling
  test_params:
    input_output:
    - input:
        messages:
        - content: You are a helpful assistant that can use tools to get information.
          role: system
        - content: What's the weather like in San Francisco?
          role: user
        tools:
        - function:
            description: Get current temperature for a given location.
            name: get_weather
            parameters:
              additionalProperties: false
              properties:
                location:
                  description: "City and country e.g. Bogot\xE1, Colombia"
                  type: string
              required:
              - location
              type: object
          type: function
      output: get_weather_tool_call
    model:
    - Llama-3.3-70B-Instruct
    - Llama-4-Scout-17B-16E
    - Llama-4-Scout-17B-16E-Instruct
    - Llama-4-Maverick-17B-128E
    - Llama-4-Maverick-17B-128E-Instruct
    - gpt-4o
    - gpt-4o-mini
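The json_schema blocks above match the schema shape that pydantic emits for the corresponding models; a sketch showing how the calendar_event schema could be regenerated (CalendarEvent mirrors the model defined in the test module below; exact key ordering may differ):

    from pydantic import BaseModel


    class CalendarEvent(BaseModel):
        name: str
        date: str
        participants: list[str]


    # Emits a schema with the same properties/required/title/type keys as the
    # calendar_event block above.
    print(CalendarEvent.model_json_schema())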
202 tests/verifications/openai/test_chat_completion.py Normal file
@@ -0,0 +1,202 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

from typing import Any

import pytest
from pydantic import BaseModel

from tests.verifications.openai.fixtures.load import load_test_cases

chat_completion_test_cases = load_test_cases("chat_completion")


@pytest.fixture
def correct_model_name(model, provider, providers_model_mapping):
    """Return the provider-specific model name based on the generic model name."""
    mapping = providers_model_mapping[provider]
    if model not in mapping:
        pytest.skip(f"Provider {provider} does not support model {model}")
    return mapping[model]


@pytest.mark.parametrize("model", chat_completion_test_cases["test_chat_basic"]["test_params"]["model"])
@pytest.mark.parametrize(
    "input_output",
    chat_completion_test_cases["test_chat_basic"]["test_params"]["input_output"],
)
def test_chat_non_streaming_basic(openai_client, input_output, correct_model_name):
    response = openai_client.chat.completions.create(
        model=correct_model_name,
        messages=input_output["input"]["messages"],
        stream=False,
    )
    assert response.choices[0].message.role == "assistant"
    assert input_output["output"].lower() in response.choices[0].message.content.lower()


@pytest.mark.parametrize("model", chat_completion_test_cases["test_chat_basic"]["test_params"]["model"])
@pytest.mark.parametrize(
    "input_output",
    chat_completion_test_cases["test_chat_basic"]["test_params"]["input_output"],
)
def test_chat_streaming_basic(openai_client, input_output, correct_model_name):
    response = openai_client.chat.completions.create(
        model=correct_model_name,
        messages=input_output["input"]["messages"],
        stream=True,
    )
    content = ""
    for chunk in response:
        content += chunk.choices[0].delta.content or ""

    # TODO: add detailed type validation

    assert input_output["output"].lower() in content.lower()


@pytest.mark.parametrize("model", chat_completion_test_cases["test_chat_image"]["test_params"]["model"])
@pytest.mark.parametrize(
    "input_output",
    chat_completion_test_cases["test_chat_image"]["test_params"]["input_output"],
)
def test_chat_non_streaming_image(openai_client, input_output, correct_model_name):
    response = openai_client.chat.completions.create(
        model=correct_model_name,
        messages=input_output["input"]["messages"],
        stream=False,
    )
    assert response.choices[0].message.role == "assistant"
    assert input_output["output"].lower() in response.choices[0].message.content.lower()


@pytest.mark.parametrize("model", chat_completion_test_cases["test_chat_image"]["test_params"]["model"])
@pytest.mark.parametrize(
    "input_output",
    chat_completion_test_cases["test_chat_image"]["test_params"]["input_output"],
)
def test_chat_streaming_image(openai_client, input_output, correct_model_name):
    response = openai_client.chat.completions.create(
        model=correct_model_name,
        messages=input_output["input"]["messages"],
        stream=True,
    )
    content = ""
    for chunk in response:
        content += chunk.choices[0].delta.content or ""

    # TODO: add detailed type validation

    assert input_output["output"].lower() in content.lower()


@pytest.mark.parametrize(
    "model",
    chat_completion_test_cases["test_chat_structured_output"]["test_params"]["model"],
)
@pytest.mark.parametrize(
    "input_output",
    chat_completion_test_cases["test_chat_structured_output"]["test_params"]["input_output"],
)
def test_chat_non_streaming_structured_output(openai_client, input_output, correct_model_name):
    response = openai_client.chat.completions.create(
        model=correct_model_name,
        messages=input_output["input"]["messages"],
        response_format=input_output["input"]["response_format"],
        stream=False,
    )

    assert response.choices[0].message.role == "assistant"
    maybe_json_content = response.choices[0].message.content

    validate_structured_output(maybe_json_content, input_output["output"])


@pytest.mark.parametrize(
    "model",
    chat_completion_test_cases["test_chat_structured_output"]["test_params"]["model"],
)
@pytest.mark.parametrize(
    "input_output",
    chat_completion_test_cases["test_chat_structured_output"]["test_params"]["input_output"],
)
def test_chat_streaming_structured_output(openai_client, input_output, correct_model_name):
    response = openai_client.chat.completions.create(
        model=correct_model_name,
        messages=input_output["input"]["messages"],
        response_format=input_output["input"]["response_format"],
        stream=True,
    )
    maybe_json_content = ""
    for chunk in response:
        maybe_json_content += chunk.choices[0].delta.content or ""
    validate_structured_output(maybe_json_content, input_output["output"])


@pytest.mark.parametrize(
    "model",
    chat_completion_test_cases["test_tool_calling"]["test_params"]["model"],
)
@pytest.mark.parametrize(
    "input_output",
    chat_completion_test_cases["test_tool_calling"]["test_params"]["input_output"],
)
def test_chat_non_streaming_tool_calling(openai_client, input_output, correct_model_name):
    response = openai_client.chat.completions.create(
        model=correct_model_name,
        messages=input_output["input"]["messages"],
        tools=input_output["input"]["tools"],
        stream=False,
    )

    assert response.choices[0].message.role == "assistant"
    assert len(response.choices[0].message.tool_calls) > 0
    assert input_output["output"] == "get_weather_tool_call"
    assert response.choices[0].message.tool_calls[0].function.name == "get_weather"
    # TODO: add detailed type validation


def get_structured_output(maybe_json_content: str, schema_name: str) -> Any | None:
    if schema_name == "valid_calendar_event":

        class CalendarEvent(BaseModel):
            name: str
            date: str
            participants: list[str]

        try:
            calendar_event = CalendarEvent.model_validate_json(maybe_json_content)
            return calendar_event
        except Exception:
            return None
    elif schema_name == "valid_math_reasoning":

        class Step(BaseModel):
            explanation: str
            output: str

        class MathReasoning(BaseModel):
            steps: list[Step]
            final_answer: str

        try:
            math_reasoning = MathReasoning.model_validate_json(maybe_json_content)
            return math_reasoning
        except Exception:
            return None

    return None


def validate_structured_output(maybe_json_content: str, schema_name: str) -> None:
    structured_output = get_structured_output(maybe_json_content, schema_name)
    assert structured_output is not None
    if schema_name == "valid_calendar_event":
        assert structured_output.name is not None
        assert structured_output.date is not None
        assert len(structured_output.participants) == 2
    elif schema_name == "valid_math_reasoning":
        assert len(structured_output.final_answer) > 0
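Usage note: the YAML output values for the structured-output cases ("valid_calendar_event", "valid_math_reasoning") select a validator rather than literal expected text. A small sketch using the helpers defined above (the JSON payload is an illustrative assumption):

    # Well-formed calendar-event JSON with two participants passes validation;
    # anything that fails to parse comes back as None.
    good = '{"name": "science fair", "date": "Friday", "participants": ["Alice", "Bob"]}'
    validate_structured_output(good, "valid_calendar_event")
    assert get_structured_output("not json", "valid_calendar_event") is None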
2744 tests/verifications/test_results/fireworks_1744154308.json Normal file
File diff suppressed because it is too large.
2672 tests/verifications/test_results/openai_1744154522.json Normal file
File diff suppressed because it is too large.
2830 tests/verifications/test_results/together_1744154399.json Normal file
File diff suppressed because it is too large.