Merge branch 'main' into feat/litellm_sambanova_usage

jhpiedrahitao 2025-04-11 19:28:02 -05:00
commit 172a918fe3
66 changed files with 9320 additions and 9446 deletions


@@ -0,0 +1,216 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

import pytest
from openai import OpenAI

from llama_stack.distribution.library_client import LlamaStackAsLibraryClient

from ..test_cases.test_case import TestCase


def provider_from_model(client_with_models, model_id):
    models = {m.identifier: m for m in client_with_models.models.list()}
    models.update({m.provider_resource_id: m for m in client_with_models.models.list()})
    provider_id = models[model_id].provider_id
    providers = {p.provider_id: p for p in client_with_models.providers.list()}
    return providers[provider_id]


def skip_if_model_doesnt_support_openai_completion(client_with_models, model_id):
    if isinstance(client_with_models, LlamaStackAsLibraryClient):
        pytest.skip("OpenAI completions are not supported when testing with library client yet.")

    provider = provider_from_model(client_with_models, model_id)
    if provider.provider_type in (
        "inline::meta-reference",
        "inline::sentence-transformers",
        "inline::vllm",
        "remote::bedrock",
        "remote::cerebras",
        "remote::databricks",
        # Technically Nvidia does support OpenAI completions, but none of their hosted models
        # support both completions and chat completions endpoint and all the Llama models are
        # just chat completions
        "remote::nvidia",
        "remote::runpod",
        "remote::sambanova",
        "remote::tgi",
    ):
        pytest.skip(f"Model {model_id} hosted by {provider.provider_type} doesn't support OpenAI completions.")


def skip_if_model_doesnt_support_openai_chat_completion(client_with_models, model_id):
    if isinstance(client_with_models, LlamaStackAsLibraryClient):
        pytest.skip("OpenAI chat completions are not supported when testing with library client yet.")

    provider = provider_from_model(client_with_models, model_id)
    if provider.provider_type in (
        "inline::meta-reference",
        "inline::sentence-transformers",
        "inline::vllm",
        "remote::bedrock",
        "remote::cerebras",
        "remote::databricks",
        "remote::runpod",
        "remote::sambanova",
        "remote::tgi",
    ):
        pytest.skip(f"Model {model_id} hosted by {provider.provider_type} doesn't support OpenAI chat completions.")


def skip_if_provider_isnt_vllm(client_with_models, model_id):
    provider = provider_from_model(client_with_models, model_id)
    if provider.provider_type != "remote::vllm":
        pytest.skip(f"Model {model_id} hosted by {provider.provider_type} doesn't support vllm extra_body parameters.")


@pytest.fixture
def openai_client(client_with_models):
    base_url = f"{client_with_models.base_url}/v1/openai/v1"
    return OpenAI(base_url=base_url, api_key="bar")


@pytest.mark.parametrize(
    "test_case",
    [
        "inference:completion:sanity",
    ],
)
def test_openai_completion_non_streaming(openai_client, client_with_models, text_model_id, test_case):
    skip_if_model_doesnt_support_openai_completion(client_with_models, text_model_id)
    tc = TestCase(test_case)

    # ollama needs more verbose prompting for some reason here...
    prompt = "Respond to this question and explain your answer. " + tc["content"]
    response = openai_client.completions.create(
        model=text_model_id,
        prompt=prompt,
        stream=False,
    )
    assert len(response.choices) > 0
    choice = response.choices[0]
    assert len(choice.text) > 10


@pytest.mark.parametrize(
    "test_case",
    [
        "inference:completion:sanity",
    ],
)
def test_openai_completion_streaming(openai_client, client_with_models, text_model_id, test_case):
    skip_if_model_doesnt_support_openai_completion(client_with_models, text_model_id)
    tc = TestCase(test_case)

    # ollama needs more verbose prompting for some reason here...
    prompt = "Respond to this question and explain your answer. " + tc["content"]
    response = openai_client.completions.create(
        model=text_model_id,
        prompt=prompt,
        stream=True,
        max_tokens=50,
    )
    streamed_content = [chunk.choices[0].text for chunk in response]
    content_str = "".join(streamed_content).lower().strip()
    assert len(content_str) > 10


@pytest.mark.parametrize(
    "prompt_logprobs",
    [
        1,
        0,
    ],
)
def test_openai_completion_prompt_logprobs(openai_client, client_with_models, text_model_id, prompt_logprobs):
    skip_if_provider_isnt_vllm(client_with_models, text_model_id)

    prompt = "Hello, world!"
    response = openai_client.completions.create(
        model=text_model_id,
        prompt=prompt,
        stream=False,
        extra_body={
            "prompt_logprobs": prompt_logprobs,
        },
    )
    assert len(response.choices) > 0
    choice = response.choices[0]
    assert len(choice.prompt_logprobs) > 0


def test_openai_completion_guided_choice(openai_client, client_with_models, text_model_id):
    skip_if_provider_isnt_vllm(client_with_models, text_model_id)

    prompt = "I am feeling really sad today."
    response = openai_client.completions.create(
        model=text_model_id,
        prompt=prompt,
        stream=False,
        extra_body={
            "guided_choice": ["joy", "sadness"],
        },
    )
    assert len(response.choices) > 0
    choice = response.choices[0]
    assert choice.text in ["joy", "sadness"]


@pytest.mark.parametrize(
    "test_case",
    [
        "inference:chat_completion:non_streaming_01",
        "inference:chat_completion:non_streaming_02",
    ],
)
def test_openai_chat_completion_non_streaming(openai_client, client_with_models, text_model_id, test_case):
    skip_if_model_doesnt_support_openai_chat_completion(client_with_models, text_model_id)
    tc = TestCase(test_case)
    question = tc["question"]
    expected = tc["expected"]

    response = openai_client.chat.completions.create(
        model=text_model_id,
        messages=[
            {
                "role": "user",
                "content": question,
            }
        ],
        stream=False,
    )
    message_content = response.choices[0].message.content.lower().strip()
    assert len(message_content) > 0
    assert expected.lower() in message_content


@pytest.mark.parametrize(
    "test_case",
    [
        "inference:chat_completion:streaming_01",
        "inference:chat_completion:streaming_02",
    ],
)
def test_openai_chat_completion_streaming(openai_client, client_with_models, text_model_id, test_case):
    skip_if_model_doesnt_support_openai_chat_completion(client_with_models, text_model_id)
    tc = TestCase(test_case)
    question = tc["question"]
    expected = tc["expected"]

    response = openai_client.chat.completions.create(
        model=text_model_id,
        messages=[{"role": "user", "content": question}],
        stream=True,
        timeout=120,  # Increase timeout to 2 minutes for large conversation history
    )
    streamed_content = []
    for chunk in response:
        if chunk.choices[0].delta.content:
            streamed_content.append(chunk.choices[0].delta.content.lower().strip())
    assert len(streamed_content) > 0
    assert expected.lower() in "".join(streamed_content)


@@ -0,0 +1,326 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

import json
import os
import unittest
from typing import Any
from unittest.mock import AsyncMock, MagicMock, patch

import pytest

from llama_stack.apis.inference.inference import CompletionMessage, UserMessage
from llama_stack.apis.safety import RunShieldResponse, ViolationLevel
from llama_stack.apis.shields import Shield
from llama_stack.providers.remote.safety.nvidia.config import NVIDIASafetyConfig
from llama_stack.providers.remote.safety.nvidia.nvidia import NVIDIASafetyAdapter


class TestNVIDIASafetyAdapter(unittest.TestCase):
    def setUp(self):
        os.environ["NVIDIA_GUARDRAILS_URL"] = "http://nemo.test"

        # Initialize the adapter
        self.config = NVIDIASafetyConfig(
            guardrails_service_url=os.environ["NVIDIA_GUARDRAILS_URL"],
        )
        self.adapter = NVIDIASafetyAdapter(config=self.config)
        self.shield_store = AsyncMock()
        self.adapter.shield_store = self.shield_store

        # Mock the HTTP request methods
        self.guardrails_post_patcher = patch(
            "llama_stack.providers.remote.safety.nvidia.nvidia.NeMoGuardrails._guardrails_post"
        )
        self.mock_guardrails_post = self.guardrails_post_patcher.start()
        self.mock_guardrails_post.return_value = {"status": "allowed"}

    def tearDown(self):
        """Clean up after each test."""
        self.guardrails_post_patcher.stop()

    @pytest.fixture(autouse=True)
    def inject_fixtures(self, run_async):
        self.run_async = run_async

    def _assert_request(
        self,
        mock_call: MagicMock,
        expected_url: str,
        expected_headers: dict[str, str] | None = None,
        expected_json: dict[str, Any] | None = None,
    ) -> None:
        """
        Helper method to verify request details in mock API calls.

        Args:
            mock_call: The MagicMock object that was called
            expected_url: The expected URL to which the request was made
            expected_headers: Optional dictionary of expected request headers
            expected_json: Optional dictionary of expected JSON payload
        """
        call_args = mock_call.call_args

        # Check URL
        assert call_args[0][0] == expected_url

        # Check headers if provided
        if expected_headers:
            for key, value in expected_headers.items():
                assert call_args[1]["headers"][key] == value

        # Check JSON if provided
        if expected_json:
            for key, value in expected_json.items():
                if isinstance(value, dict):
                    for nested_key, nested_value in value.items():
                        assert call_args[1]["json"][key][nested_key] == nested_value
                else:
                    assert call_args[1]["json"][key] == value

    def test_register_shield_with_valid_id(self):
        shield = Shield(
            provider_id="nvidia",
            type="shield",
            identifier="test-shield",
            provider_resource_id="test-model",
        )

        # Register the shield
        self.run_async(self.adapter.register_shield(shield))

    def test_register_shield_without_id(self):
        shield = Shield(
            provider_id="nvidia",
            type="shield",
            identifier="test-shield",
            provider_resource_id="",
        )

        # Register the shield should raise a ValueError
        with self.assertRaises(ValueError):
            self.run_async(self.adapter.register_shield(shield))

    def test_run_shield_allowed(self):
        # Set up the shield
        shield_id = "test-shield"
        shield = Shield(
            provider_id="nvidia",
            type="shield",
            identifier=shield_id,
            provider_resource_id="test-model",
        )
        self.shield_store.get_shield.return_value = shield

        # Mock Guardrails API response
        self.mock_guardrails_post.return_value = {"status": "allowed"}

        # Run the shield
        messages = [
            UserMessage(role="user", content="Hello, how are you?"),
            CompletionMessage(
                role="assistant",
                content="I'm doing well, thank you for asking!",
                stop_reason="end_of_message",
                tool_calls=[],
            ),
        ]
        result = self.run_async(self.adapter.run_shield(shield_id, messages))

        # Verify the shield store was called
        self.shield_store.get_shield.assert_called_once_with(shield_id)

        # Verify the Guardrails API was called correctly
        self.mock_guardrails_post.assert_called_once_with(
            path="/v1/guardrail/checks",
            data={
                "model": shield_id,
                "messages": [
                    json.loads(messages[0].model_dump_json()),
                    json.loads(messages[1].model_dump_json()),
                ],
                "temperature": 1.0,
                "top_p": 1,
                "frequency_penalty": 0,
                "presence_penalty": 0,
                "max_tokens": 160,
                "stream": False,
                "guardrails": {
                    "config_id": "self-check",
                },
            },
        )

        # Verify the result
        assert isinstance(result, RunShieldResponse)
        assert result.violation is None

    def test_run_shield_blocked(self):
        # Set up the shield
        shield_id = "test-shield"
        shield = Shield(
            provider_id="nvidia",
            type="shield",
            identifier=shield_id,
            provider_resource_id="test-model",
        )
        self.shield_store.get_shield.return_value = shield

        # Mock Guardrails API response
        self.mock_guardrails_post.return_value = {"status": "blocked", "rails_status": {"reason": "harmful_content"}}

        # Run the shield
        messages = [
            UserMessage(role="user", content="Hello, how are you?"),
            CompletionMessage(
                role="assistant",
                content="I'm doing well, thank you for asking!",
                stop_reason="end_of_message",
                tool_calls=[],
            ),
        ]
        result = self.run_async(self.adapter.run_shield(shield_id, messages))

        # Verify the shield store was called
        self.shield_store.get_shield.assert_called_once_with(shield_id)

        # Verify the Guardrails API was called correctly
        self.mock_guardrails_post.assert_called_once_with(
            path="/v1/guardrail/checks",
            data={
                "model": shield_id,
                "messages": [
                    json.loads(messages[0].model_dump_json()),
                    json.loads(messages[1].model_dump_json()),
                ],
                "temperature": 1.0,
                "top_p": 1,
                "frequency_penalty": 0,
                "presence_penalty": 0,
                "max_tokens": 160,
                "stream": False,
                "guardrails": {
                    "config_id": "self-check",
                },
            },
        )

        # Verify the result
        assert result.violation is not None
        assert isinstance(result, RunShieldResponse)
        assert result.violation.user_message == "Sorry I cannot do this."
        assert result.violation.violation_level == ViolationLevel.ERROR
        assert result.violation.metadata == {"reason": "harmful_content"}

    def test_run_shield_not_found(self):
        # Set up shield store to return None
        shield_id = "non-existent-shield"
        self.shield_store.get_shield.return_value = None

        messages = [
            UserMessage(role="user", content="Hello, how are you?"),
        ]

        with self.assertRaises(ValueError):
            self.run_async(self.adapter.run_shield(shield_id, messages))

        # Verify the shield store was called
        self.shield_store.get_shield.assert_called_once_with(shield_id)

        # Verify the Guardrails API was not called
        self.mock_guardrails_post.assert_not_called()

    def test_run_shield_http_error(self):
        shield_id = "test-shield"
        shield = Shield(
            provider_id="nvidia",
            type="shield",
            identifier=shield_id,
            provider_resource_id="test-model",
        )
        self.shield_store.get_shield.return_value = shield

        # Mock Guardrails API to raise an exception
        error_msg = "API Error: 500 Internal Server Error"
        self.mock_guardrails_post.side_effect = Exception(error_msg)

        # Running the shield should raise an exception
        messages = [
            UserMessage(role="user", content="Hello, how are you?"),
            CompletionMessage(
                role="assistant",
                content="I'm doing well, thank you for asking!",
                stop_reason="end_of_message",
                tool_calls=[],
            ),
        ]
        with self.assertRaises(Exception) as context:
            self.run_async(self.adapter.run_shield(shield_id, messages))

        # Verify the shield store was called
        self.shield_store.get_shield.assert_called_once_with(shield_id)

        # Verify the Guardrails API was called correctly
        self.mock_guardrails_post.assert_called_once_with(
            path="/v1/guardrail/checks",
            data={
                "model": shield_id,
                "messages": [
                    json.loads(messages[0].model_dump_json()),
                    json.loads(messages[1].model_dump_json()),
                ],
                "temperature": 1.0,
                "top_p": 1,
                "frequency_penalty": 0,
                "presence_penalty": 0,
                "max_tokens": 160,
                "stream": False,
                "guardrails": {
                    "config_id": "self-check",
                },
            },
        )

        # Verify the exception message
        assert error_msg in str(context.exception)

    def test_init_nemo_guardrails(self):
        from llama_stack.providers.remote.safety.nvidia.nvidia import NeMoGuardrails

        test_config_id = "test-custom-config-id"
        config = NVIDIASafetyConfig(
            guardrails_service_url=os.environ["NVIDIA_GUARDRAILS_URL"],
            config_id=test_config_id,
        )
        # Initialize with default parameters
        test_model = "test-model"
        guardrails = NeMoGuardrails(config, test_model)

        # Verify the attributes are set correctly
        assert guardrails.config_id == test_config_id
        assert guardrails.model == test_model
        assert guardrails.threshold == 0.9  # Default value
        assert guardrails.temperature == 1.0  # Default value
        assert guardrails.guardrails_service_url == os.environ["NVIDIA_GUARDRAILS_URL"]

        # Initialize with custom parameters
        guardrails = NeMoGuardrails(config, test_model, threshold=0.8, temperature=0.7)

        # Verify the attributes are set correctly
        assert guardrails.config_id == test_config_id
        assert guardrails.model == test_model
        assert guardrails.threshold == 0.8
        assert guardrails.temperature == 0.7
        assert guardrails.guardrails_service_url == os.environ["NVIDIA_GUARDRAILS_URL"]

    def test_init_nemo_guardrails_invalid_temperature(self):
        from llama_stack.providers.remote.safety.nvidia.nvidia import NeMoGuardrails

        config = NVIDIASafetyConfig(
            guardrails_service_url=os.environ["NVIDIA_GUARDRAILS_URL"],
            config_id="test-custom-config-id",
        )
        with self.assertRaises(ValueError):
            NeMoGuardrails(config, "test-model", temperature=0)


@@ -1,6 +1,6 @@
# Test Results Report
*Generated on: 2025-04-08 21:14:02*
*Generated on: 2025-04-10 16:48:18*
*This report was generated by running `python tests/verifications/generate_report.py`*
@@ -15,74 +15,118 @@
| Provider | Pass Rate | Tests Passed | Total Tests |
| --- | --- | --- | --- |
| Together | 67.7% | 21 | 31 |
| Fireworks | 90.3% | 28 | 31 |
| Openai | 100.0% | 22 | 22 |
| Together | 64.7% | 22 | 34 |
| Fireworks | 82.4% | 28 | 34 |
| Openai | 100.0% | 24 | 24 |
## Together
*Tests run on: 2025-04-08 16:19:59*
*Tests run on: 2025-04-10 16:46:35*
```bash
pytest tests/verifications/openai/test_chat_completion.py --provider=together -v
# Run all tests for this provider:
pytest tests/verifications/openai_api/test_chat_completion.py --provider=together -v
# Example: Run only the 'earth' case of test_chat_non_streaming_basic:
pytest tests/verifications/openai_api/test_chat_completion.py --provider=together -k "test_chat_non_streaming_basic and earth"
```
| Test | Llama-3.3-70B-Instruct | Llama-4-Maverick-17B-128E-Instruct | Llama-4-Scout-17B-16E-Instruct |
**Model Key (Together)**
| Display Name | Full Model ID |
| --- | --- |
| Llama-3.3-70B-Instruct | `meta-llama/Llama-3.3-70B-Instruct-Turbo` |
| Llama-4-Maverick-Instruct | `meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8` |
| Llama-4-Scout-Instruct | `meta-llama/Llama-4-Scout-17B-16E-Instruct` |
| Test | Llama-3.3-70B-Instruct | Llama-4-Maverick-Instruct | Llama-4-Scout-Instruct |
| --- | --- | --- | --- |
| test_chat_non_streaming_basic (case 0) | ✅ | ✅ | ✅ |
| test_chat_non_streaming_basic (case 1) | ✅ | ✅ | ✅ |
| test_chat_non_streaming_image (case 0) | ⚪ | ✅ | ✅ |
| test_chat_non_streaming_structured_output (case 0) | ✅ | ✅ | ✅ |
| test_chat_non_streaming_structured_output (case 1) | ✅ | ✅ | ✅ |
| test_chat_non_streaming_tool_calling (case 0) | ✅ | ✅ | ✅ |
| test_chat_streaming_basic (case 0) | ✅ | ❌ | ❌ |
| test_chat_streaming_basic (case 1) | ✅ | ❌ | ❌ |
| test_chat_streaming_image (case 0) | ⚪ | ❌ | ❌ |
| test_chat_streaming_structured_output (case 0) | ✅ | ❌ | ❌ |
| test_chat_streaming_structured_output (case 1) | ✅ | ❌ | ❌ |
| test_chat_non_streaming_basic (earth) | ✅ | ✅ | ✅ |
| test_chat_non_streaming_basic (saturn) | ✅ | ✅ | ✅ |
| test_chat_non_streaming_image | ⚪ | ✅ | ✅ |
| test_chat_non_streaming_structured_output (calendar) | ✅ | ✅ | ✅ |
| test_chat_non_streaming_structured_output (math) | ✅ | ✅ | ✅ |
| test_chat_non_streaming_tool_calling | ✅ | ✅ | ✅ |
| test_chat_streaming_basic (earth) | ✅ | ❌ | ❌ |
| test_chat_streaming_basic (saturn) | ✅ | ❌ | ❌ |
| test_chat_streaming_image | ⚪ | ❌ | ❌ |
| test_chat_streaming_structured_output (calendar) | ✅ | ❌ | ❌ |
| test_chat_streaming_structured_output (math) | ✅ | ❌ | ❌ |
| test_chat_streaming_tool_calling | ✅ | ❌ | ❌ |
## Fireworks
*Tests run on: 2025-04-08 16:18:28*
*Tests run on: 2025-04-10 16:44:44*
```bash
pytest tests/verifications/openai/test_chat_completion.py --provider=fireworks -v
# Run all tests for this provider:
pytest tests/verifications/openai_api/test_chat_completion.py --provider=fireworks -v
# Example: Run only the 'earth' case of test_chat_non_streaming_basic:
pytest tests/verifications/openai_api/test_chat_completion.py --provider=fireworks -k "test_chat_non_streaming_basic and earth"
```
| Test | Llama-3.3-70B-Instruct | Llama-4-Maverick-17B-128E-Instruct | Llama-4-Scout-17B-16E-Instruct |
**Model Key (Fireworks)**
| Display Name | Full Model ID |
| --- | --- |
| Llama-3.3-70B-Instruct | `accounts/fireworks/models/llama-v3p3-70b-instruct` |
| Llama-4-Maverick-Instruct | `accounts/fireworks/models/llama4-maverick-instruct-basic` |
| Llama-4-Scout-Instruct | `accounts/fireworks/models/llama4-scout-instruct-basic` |
| Test | Llama-3.3-70B-Instruct | Llama-4-Maverick-Instruct | Llama-4-Scout-Instruct |
| --- | --- | --- | --- |
| test_chat_non_streaming_basic (case 0) | ✅ | ✅ | ✅ |
| test_chat_non_streaming_basic (case 1) | ✅ | ✅ | ✅ |
| test_chat_non_streaming_image (case 0) | ⚪ | ✅ | ✅ |
| test_chat_non_streaming_structured_output (case 0) | ✅ | ✅ | ✅ |
| test_chat_non_streaming_structured_output (case 1) | ✅ | ✅ | ✅ |
| test_chat_non_streaming_tool_calling (case 0) | ✅ | ❌ | ❌ |
| test_chat_streaming_basic (case 0) | ✅ | ✅ | ✅ |
| test_chat_streaming_basic (case 1) | ✅ | ✅ | ✅ |
| test_chat_streaming_image (case 0) | ⚪ | ✅ | ✅ |
| test_chat_streaming_structured_output (case 0) | ✅ | ✅ | ✅ |
| test_chat_streaming_structured_output (case 1) | ❌ | ✅ | ✅ |
| test_chat_non_streaming_basic (earth) | ✅ | ✅ | ✅ |
| test_chat_non_streaming_basic (saturn) | ✅ | ✅ | ✅ |
| test_chat_non_streaming_image | ⚪ | ✅ | ✅ |
| test_chat_non_streaming_structured_output (calendar) | ✅ | ✅ | ✅ |
| test_chat_non_streaming_structured_output (math) | ✅ | ✅ | ✅ |
| test_chat_non_streaming_tool_calling | ❌ | ❌ | ❌ |
| test_chat_streaming_basic (earth) | ✅ | ✅ | ✅ |
| test_chat_streaming_basic (saturn) | ✅ | ✅ | ✅ |
| test_chat_streaming_image | ⚪ | ✅ | ✅ |
| test_chat_streaming_structured_output (calendar) | ✅ | ✅ | ✅ |
| test_chat_streaming_structured_output (math) | ✅ | ✅ | ✅ |
| test_chat_streaming_tool_calling | ❌ | ❌ | ❌ |
## Openai
*Tests run on: 2025-04-08 16:22:02*
*Tests run on: 2025-04-10 16:47:28*
```bash
pytest tests/verifications/openai/test_chat_completion.py --provider=openai -v
# Run all tests for this provider:
pytest tests/verifications/openai_api/test_chat_completion.py --provider=openai -v
# Example: Run only the 'earth' case of test_chat_non_streaming_basic:
pytest tests/verifications/openai_api/test_chat_completion.py --provider=openai -k "test_chat_non_streaming_basic and earth"
```
**Model Key (Openai)**
| Display Name | Full Model ID |
| --- | --- |
| gpt-4o | `gpt-4o` |
| gpt-4o-mini | `gpt-4o-mini` |
| Test | gpt-4o | gpt-4o-mini |
| --- | --- | --- |
| test_chat_non_streaming_basic (case 0) | ✅ | ✅ |
| test_chat_non_streaming_basic (case 1) | ✅ | ✅ |
| test_chat_non_streaming_image (case 0) | ✅ | ✅ |
| test_chat_non_streaming_structured_output (case 0) | ✅ | ✅ |
| test_chat_non_streaming_structured_output (case 1) | ✅ | ✅ |
| test_chat_non_streaming_tool_calling (case 0) | ✅ | ✅ |
| test_chat_streaming_basic (case 0) | ✅ | ✅ |
| test_chat_streaming_basic (case 1) | ✅ | ✅ |
| test_chat_streaming_image (case 0) | ✅ | ✅ |
| test_chat_streaming_structured_output (case 0) | ✅ | ✅ |
| test_chat_streaming_structured_output (case 1) | ✅ | ✅ |
| test_chat_non_streaming_basic (earth) | ✅ | ✅ |
| test_chat_non_streaming_basic (saturn) | ✅ | ✅ |
| test_chat_non_streaming_image | ✅ | ✅ |
| test_chat_non_streaming_structured_output (calendar) | ✅ | ✅ |
| test_chat_non_streaming_structured_output (math) | ✅ | ✅ |
| test_chat_non_streaming_tool_calling | ✅ | ✅ |
| test_chat_streaming_basic (earth) | ✅ | ✅ |
| test_chat_streaming_basic (saturn) | ✅ | ✅ |
| test_chat_streaming_image | ✅ | ✅ |
| test_chat_streaming_structured_output (calendar) | ✅ | ✅ |
| test_chat_streaming_structured_output (math) | ✅ | ✅ |
| test_chat_streaming_tool_calling | ✅ | ✅ |


@@ -0,0 +1,10 @@
base_url: https://api.cerebras.ai/v1
api_key_var: CEREBRAS_API_KEY
models:
- llama-3.3-70b
model_display_names:
  llama-3.3-70b: Llama-3.3-70B-Instruct
test_exclusions:
  llama-3.3-70b:
  - test_chat_non_streaming_image
  - test_chat_streaming_image


@@ -0,0 +1,14 @@
base_url: https://api.fireworks.ai/inference/v1
api_key_var: FIREWORKS_API_KEY
models:
- accounts/fireworks/models/llama-v3p3-70b-instruct
- accounts/fireworks/models/llama4-scout-instruct-basic
- accounts/fireworks/models/llama4-maverick-instruct-basic
model_display_names:
  accounts/fireworks/models/llama-v3p3-70b-instruct: Llama-3.3-70B-Instruct
  accounts/fireworks/models/llama4-scout-instruct-basic: Llama-4-Scout-Instruct
  accounts/fireworks/models/llama4-maverick-instruct-basic: Llama-4-Maverick-Instruct
test_exclusions:
  accounts/fireworks/models/llama-v3p3-70b-instruct:
  - test_chat_non_streaming_image
  - test_chat_streaming_image


@@ -0,0 +1,14 @@
base_url: https://api.groq.com/openai/v1
api_key_var: GROQ_API_KEY
models:
- llama-3.3-70b-versatile
- llama-4-scout-17b-16e-instruct
- llama-4-maverick-17b-128e-instruct
model_display_names:
  llama-3.3-70b-versatile: Llama-3.3-70B-Instruct
  llama-4-scout-17b-16e-instruct: Llama-4-Scout-Instruct
  llama-4-maverick-17b-128e-instruct: Llama-4-Maverick-Instruct
test_exclusions:
  llama-3.3-70b-versatile:
  - test_chat_non_streaming_image
  - test_chat_streaming_image


@@ -0,0 +1,9 @@
base_url: https://api.openai.com/v1
api_key_var: OPENAI_API_KEY
models:
- gpt-4o
- gpt-4o-mini
model_display_names:
  gpt-4o: gpt-4o
  gpt-4o-mini: gpt-4o-mini
test_exclusions: {}


@@ -0,0 +1,14 @@
base_url: https://api.together.xyz/v1
api_key_var: TOGETHER_API_KEY
models:
- meta-llama/Llama-3.3-70B-Instruct-Turbo
- meta-llama/Llama-4-Scout-17B-16E-Instruct
- meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8
model_display_names:
  meta-llama/Llama-3.3-70B-Instruct-Turbo: Llama-3.3-70B-Instruct
  meta-llama/Llama-4-Scout-17B-16E-Instruct: Llama-4-Scout-Instruct
  meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8: Llama-4-Maverick-Instruct
test_exclusions:
  meta-llama/Llama-3.3-70B-Instruct-Turbo:
  - test_chat_non_streaming_image
  - test_chat_streaming_image
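
For context, a minimal sketch (not part of this diff) of how one of these provider config files can be read. The filename below is illustrative; in the verification suite the files are loaded by `_load_all_verification_configs` in `tests.verifications.openai_api.fixtures.fixtures`, and `pyyaml` is declared as a dependency in the `generate_report.py` script header.

```python
import os

import yaml  # pyyaml

# Illustrative filename; the real loader walks the verification config directory.
with open("together.yaml") as f:
    conf = yaml.safe_load(f)

api_key = os.getenv(conf["api_key_var"])                     # e.g. TOGETHER_API_KEY
model_id = conf["models"][0]                                 # full provider model ID
display = conf["model_display_names"][model_id]              # e.g. "Llama-3.3-70B-Instruct"
skipped = conf.get("test_exclusions", {}).get(model_id, [])  # tests to skip for this model
```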


@@ -4,6 +4,10 @@
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
import re
import pytest
def pytest_addoption(parser):
parser.addoption(
@@ -14,7 +18,7 @@ def pytest_addoption(parser):
parser.addoption(
"--api-key",
action="store",
help="API key",
help="API key to use for the provider",
)
parser.addoption(
"--provider",
@@ -24,5 +28,64 @@ def pytest_addoption(parser):
pytest_plugins = [
"tests.verifications.openai.fixtures.fixtures",
"pytest_jsonreport",
"tests.verifications.openai_api.fixtures.fixtures",
"tests.verifications.openai_api.fixtures.load",
]
@pytest.hookimpl(optionalhook=True)
def pytest_json_runtest_metadata(item, call):
    """Add model and case_id to pytest-json report metadata."""
    metadata = {}
    nodeid = item.nodeid

    # 1. Extract model from callspec if available
    model = item.callspec.params.get("model") if hasattr(item, "callspec") else None
    if model:
        metadata["model"] = model
    else:
        # Fallback: Try parsing from nodeid (less reliable)
        match_model = re.search(r"\[(.*?)-", nodeid)
        if match_model:
            model = match_model.group(1)  # Store model even if found via fallback
            metadata["model"] = model
        else:
            print(f"Warning: Could not determine model for test {nodeid}")
            model = None  # Ensure model is None if not found

    # 2. Extract case_id using the known model string if possible
    if model:
        # Construct a regex pattern to find the case_id *after* the model name and a hyphen.
        # Escape the model name in case it contains regex special characters.
        pattern = re.escape(model) + r"-(.*?)\]$"
        match_case = re.search(pattern, nodeid)
        if match_case:
            case_id = match_case.group(1)
            metadata["case_id"] = case_id
        else:
            # Fallback if the pattern didn't match (e.g., nodeid format unexpected)
            # Try the old less specific regex as a last resort.
            match_case_fallback = re.search(r"-(.*?)\]$", nodeid)
            if match_case_fallback:
                case_id = match_case_fallback.group(1)
                metadata["case_id"] = case_id
                print(f"Warning: Used fallback regex to parse case_id from nodeid {nodeid}")
            else:
                print(f"Warning: Could not parse case_id from nodeid {nodeid} even with fallback.")
                if "case" in (item.callspec.params if hasattr(item, "callspec") else {}):
                    metadata["case_id"] = "parsing_failed"
    elif "case" in (item.callspec.params if hasattr(item, "callspec") else {}):
        # Cannot reliably parse case_id without model, but we know it's a case test.
        # Try the generic fallback regex.
        match_case_fallback = re.search(r"-(.*?)\]$", nodeid)
        if match_case_fallback:
            case_id = match_case_fallback.group(1)
            metadata["case_id"] = case_id
            print(f"Warning: Used fallback regex to parse case_id from nodeid {nodeid} (model unknown)")
        else:
            print(f"Warning: Could not parse case_id from nodeid {nodeid} (model unknown)")
            metadata["case_id"] = "parsing_failed_no_model"
    # else: Not a test with a model or case param we need to handle.

    return metadata
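
For reference, a minimal illustration (not part of this diff) of the case_id extraction the hook performs. The nodeid and model value below are made-up examples in the style of the parametrized verification tests above.

```python
import re

# Hypothetical parametrized nodeid and its model param (normally read from item.callspec.params)
nodeid = 'test_chat_completion.py::test_chat_non_streaming_basic[meta-llama/Llama-3.3-70B-Instruct-Turbo-earth]'
model = "meta-llama/Llama-3.3-70B-Instruct-Turbo"

# Same pattern the hook builds: the case_id is whatever sits between "<model>-" and the closing "]"
match = re.search(re.escape(model) + r"-(.*?)\]$", nodeid)
print(match.group(1))  # "earth" -> stored as metadata["case_id"]
```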


@@ -4,27 +4,48 @@
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
# /// script
# requires-python = ">=3.10"
# dependencies = [
# "pytest-json-report",
# "pyyaml",
# ]
# ///
"""
Test Report Generator
Requirements:
pip install pytest-json-report
Description:
This script runs pytest tests (specifically designed for OpenAI API compatibility checks)
for different providers, aggregates the results from JSON reports, and generates
a markdown summary report (REPORT.md).
It automatically cleans up old test result files, keeping only the latest
per provider.
Configuration:
- Provider details (models, display names) are loaded from `tests/verifications/config.yaml`.
- Test cases are defined in YAML files within `tests/verifications/openai_api/fixtures/test_cases/`.
- Test results are stored in `tests/verifications/test_results/`.
Usage:
# Generate a report using existing test results
# Generate a report using the latest existing test results
python tests/verifications/generate_report.py
# Run tests and generate a report
# Run tests for all configured providers and generate a report
python tests/verifications/generate_report.py --run-tests
# Run tests for specific providers
# Run tests only for specific providers (space-separated)
python tests/verifications/generate_report.py --run-tests --providers fireworks openai
# Run tests matching a keyword expression (uses pytest -k)
python tests/verifications/generate_report.py --run-tests --providers fireworks --k "streaming"
# Run a specific test case for a provider
python tests/verifications/generate_report.py --run-tests --providers fireworks --k "test_chat_streaming_basic and basic_earth"
# Save the report to a custom location
python tests/verifications/generate_report.py --output custom_report.md
# Clean up old test result files
python tests/verifications/generate_report.py --cleanup
"""
import argparse
@@ -35,6 +56,9 @@ import subprocess
import time
from collections import defaultdict
from pathlib import Path
from typing import Any, DefaultDict, Dict, Set, Tuple
from tests.verifications.openai_api.fixtures.fixtures import _load_all_verification_configs
# Define the root directory for test results
RESULTS_DIR = Path(__file__).parent / "test_results"
@@ -43,47 +67,52 @@ RESULTS_DIR.mkdir(exist_ok=True)
# Maximum number of test result files to keep per provider
MAX_RESULTS_PER_PROVIDER = 1
# Custom order of providers
PROVIDER_ORDER = ["together", "fireworks", "groq", "cerebras", "openai"]
# Dictionary to store providers and their models (will be populated dynamically)
PROVIDERS = defaultdict(set)
# Tests will be dynamically extracted from results
ALL_TESTS = set()
VERIFICATION_CONFIG = _load_all_verification_configs()
def run_tests(provider):
def run_tests(provider, keyword=None):
"""Run pytest for a specific provider and save results"""
print(f"Running tests for provider: {provider}")
timestamp = int(time.time())
result_file = RESULTS_DIR / f"{provider}_{timestamp}.json"
temp_json_file = RESULTS_DIR / f"temp_{provider}_{timestamp}.json"
# Use a constant filename for the final result and temp file
result_file = RESULTS_DIR / f"{provider}.json"
temp_json_file = RESULTS_DIR / f"temp_{provider}.json"
# Determine project root directory relative to this script
project_root = Path(__file__).parent.parent.parent
# Run pytest with JSON output
cmd = [
"python",
"-m",
"pytest",
"tests/verifications/openai/test_chat_completion.py",
"tests/verifications/openai_api/test_chat_completion.py",
f"--provider={provider}",
"-v",
"--json-report",
f"--json-report-file={temp_json_file}",
]
# Append -k argument if provided
if keyword:
cmd.extend(["-k", keyword])
try:
result = subprocess.run(cmd, capture_output=True, text=True)
# Run subprocess with cwd set to project root
result = subprocess.run(cmd, capture_output=True, text=True, cwd=project_root)
print(f"Pytest exit code: {result.returncode}")
# Check if the JSON file was created
if temp_json_file.exists():
# Read the JSON file and save it to our results format
with open(temp_json_file, "r") as f:
test_results = json.load(f)
# Save results to our own format with a trailing newline
test_results["run_timestamp"] = timestamp
# Save results to the final (overwritten) file
with open(result_file, "w") as f:
json.dump(test_results, f, indent=2)
f.write("\n") # Add a trailing newline for precommit
@@ -103,18 +132,40 @@ def run_tests(provider):
return None
def parse_results(result_file):
"""Parse the test results file and extract pass/fail by model and test"""
def parse_results(
result_file,
) -> Tuple[DefaultDict[str, DefaultDict[str, Dict[str, bool]]], DefaultDict[str, Set[str]], Set[str], str]:
"""Parse a single test results file.
Returns:
Tuple containing:
- parsed_results: DefaultDict[provider, DefaultDict[model, Dict[test_name, pass_status]]]
- providers_in_file: DefaultDict[provider, Set[model]] found in this file.
- tests_in_file: Set[test_name] found in this file.
- run_timestamp: Timestamp when the test was run
"""
if not os.path.exists(result_file):
print(f"Results file does not exist: {result_file}")
return {}
# Return empty defaultdicts/set matching the type hint
return defaultdict(lambda: defaultdict(dict)), defaultdict(set), set(), ""
with open(result_file, "r") as f:
results = json.load(f)
# Initialize results dictionary
parsed_results = defaultdict(lambda: defaultdict(dict))
provider = os.path.basename(result_file).split("_")[0]
# Initialize results dictionary with specific types
parsed_results: DefaultDict[str, DefaultDict[str, Dict[str, bool]]] = defaultdict(lambda: defaultdict(dict))
providers_in_file: DefaultDict[str, Set[str]] = defaultdict(set)
tests_in_file: Set[str] = set()
# Extract provider from filename (e.g., "openai.json" -> "openai")
provider: str = result_file.stem
# Extract run timestamp from the JSON data
run_timestamp_unix = results.get("run_timestamp")
run_timestamp_str = (
time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(run_timestamp_unix))
if run_timestamp_unix is not None
else "Unknown"
)
# Debug: Print summary of test results
print(f"Test results summary for {provider}:")
@@ -127,195 +178,131 @@ def parse_results(result_file):
# Extract test results
if "tests" not in results or not results["tests"]:
print(f"No test results found in {result_file}")
return parsed_results
# Return empty defaultdicts/set matching the type hint
return defaultdict(lambda: defaultdict(dict)), defaultdict(set), set(), ""
# Map for normalizing model names
model_name_map = {
"Llama-3.3-8B-Instruct": "Llama-3.3-8B-Instruct",
"Llama-3.3-70B-Instruct": "Llama-3.3-70B-Instruct",
"Llama-3.2-11B-Vision-Instruct": "Llama-3.2-11B-Vision-Instruct",
"Llama-4-Scout-17B-16E": "Llama-4-Scout-17B-16E-Instruct",
"Llama-4-Scout-17B-16E-Instruct": "Llama-4-Scout-17B-16E-Instruct",
"Llama-4-Maverick-17B-128E": "Llama-4-Maverick-17B-128E-Instruct",
"Llama-4-Maverick-17B-128E-Instruct": "Llama-4-Maverick-17B-128E-Instruct",
"gpt-4o": "gpt-4o",
"gpt-4o-mini": "gpt-4o-mini",
}
# Keep track of all models found for this provider
provider_models = set()
# Track all unique test cases for each base test
test_case_counts = defaultdict(int)
# First pass: count the number of cases for each test
# Process the tests
for test in results["tests"]:
test_id = test.get("nodeid", "")
if "call" in test:
test_name = test_id.split("::")[1].split("[")[0]
input_output_match = re.search(r"\[input_output(\d+)-", test_id)
if input_output_match:
test_case_counts[test_name] += 1
if not (call_phase := test.get("call")):
continue
call_outcome = call_phase.get("outcome")
if call_outcome not in ("passed", "failed"):
continue
# Second pass: process the tests with case numbers only for tests with multiple cases
for test in results["tests"]:
test_id = test.get("nodeid", "")
outcome = test.get("outcome", "")
# --- Extract data from metadata ---
metadata = test.get("metadata", {})
model = metadata.get("model")
case_id = metadata.get("case_id") # String ID (if provided)
case_index = metadata.get("case_index") # Integer index (if no ID provided)
# Only process tests that have been executed (not setup errors)
if "call" in test:
# Regular test that actually ran
test_name = test_id.split("::")[1].split("[")[0]
# Check if we have a model and at least one case identifier
if not model or (case_id is None and case_index is None):
print(
f"Warning: Missing 'model' or case identifier ('case_id'/'case_index') metadata for test: {test_id}. Skipping."
)
continue
# Extract input_output parameter to differentiate between test cases
input_output_match = re.search(r"\[input_output(\d+)-", test_id)
input_output_index = input_output_match.group(1) if input_output_match else ""
try:
test_name_base = test_id.split("::")[1].split("[")[0]
except (IndexError, ValueError) as e:
print(f"Warning: Could not parse base test name for {test_id}. Error: {e}. Skipping.")
continue
# Create a more detailed test name with case number only if there are multiple cases
detailed_test_name = test_name
if input_output_index and test_case_counts[test_name] > 1:
detailed_test_name = f"{test_name} (case {input_output_index})"
# Construct detailed test name using ID or index
if case_id is not None:
detailed_test_name = f"{test_name_base} ({case_id})"
elif case_index == 0:
# If case_id is missing and index is 0, assume single case, use base name only
detailed_test_name = test_name_base
elif case_index is not None: # case_index > 0
# Use case_index for naming if case_id wasn't provided and index > 0
detailed_test_name = f"{test_name_base} (case{case_index})"
else:
# This case should be prevented by the earlier check, but handle defensively
print(f"Error: No case identifier found for test {test_id} after initial check. Skipping.")
continue
# Track all unique test names
ALL_TESTS.add(detailed_test_name)
# Populate collections for this file
tests_in_file.add(detailed_test_name)
providers_in_file[provider].add(model)
# Extract model name from test_id using a more robust pattern
model_match = re.search(r"\[input_output\d+-([^\]]+)\]", test_id)
if model_match:
raw_model = model_match.group(1)
model = model_name_map.get(raw_model, raw_model)
if call_outcome == "passed":
parsed_results[provider][model][detailed_test_name] = True
elif call_outcome == "failed":
parsed_results[provider][model][detailed_test_name] = False
# Add to set of known models for this provider
provider_models.add(model)
# Final Summary Warning (Optional)
if not parsed_results.get(provider):
print(f"Warning: No valid test results parsed for provider {provider} from file {result_file}")
# Also update the global PROVIDERS dictionary
PROVIDERS[provider].add(model)
# Store the result
if outcome == "passed":
parsed_results[provider][model][detailed_test_name] = True
else:
parsed_results[provider][model][detailed_test_name] = False
print(f"Parsed test result: {detailed_test_name} for model {model}: {outcome}")
elif outcome == "error" and "setup" in test and test.get("setup", {}).get("outcome") == "failed":
# This is a setup failure, which likely means a configuration issue
# Extract the base test name and model name
parts = test_id.split("::")
if len(parts) > 1:
test_name = parts[1].split("[")[0]
# Extract input_output parameter to differentiate between test cases
input_output_match = re.search(r"\[input_output(\d+)-", test_id)
input_output_index = input_output_match.group(1) if input_output_match else ""
# Create a more detailed test name with case number only if there are multiple cases
detailed_test_name = test_name
if input_output_index and test_case_counts[test_name] > 1:
detailed_test_name = f"{test_name} (case {input_output_index})"
if detailed_test_name in ALL_TESTS:
# Use a more robust pattern for model extraction
model_match = re.search(r"\[input_output\d+-([^\]]+)\]", test_id)
if model_match:
raw_model = model_match.group(1)
model = model_name_map.get(raw_model, raw_model)
# Add to set of known models for this provider
provider_models.add(model)
# Also update the global PROVIDERS dictionary
PROVIDERS[provider].add(model)
# Mark setup failures as false (failed)
parsed_results[provider][model][detailed_test_name] = False
print(f"Parsed setup failure: {detailed_test_name} for model {model}")
# Debug: Print parsed results
if not parsed_results[provider]:
print(f"Warning: No test results parsed for provider {provider}")
else:
for model, tests in parsed_results[provider].items():
print(f"Model {model}: {len(tests)} test results")
return parsed_results
return parsed_results, providers_in_file, tests_in_file, run_timestamp_str
def cleanup_old_results():
"""Clean up old test result files, keeping only the newest N per provider"""
for provider in PROVIDERS.keys():
# Get all result files for this provider
provider_files = list(RESULTS_DIR.glob(f"{provider}_*.json"))
# Sort by timestamp (newest first)
provider_files.sort(key=lambda x: int(x.stem.split("_")[1]), reverse=True)
# Remove old files beyond the max to keep
if len(provider_files) > MAX_RESULTS_PER_PROVIDER:
for old_file in provider_files[MAX_RESULTS_PER_PROVIDER:]:
try:
old_file.unlink()
print(f"Removed old result file: {old_file}")
except Exception as e:
print(f"Error removing file {old_file}: {e}")
def get_latest_results_by_provider():
"""Get the latest test result file for each provider"""
def get_all_result_files_by_provider():
"""Get all test result files, keyed by provider."""
provider_results = {}
# Get all result files
result_files = list(RESULTS_DIR.glob("*.json"))
# Extract all provider names from filenames
all_providers = set()
for file in result_files:
# File format is provider_timestamp.json
parts = file.stem.split("_")
if len(parts) >= 2:
all_providers.add(parts[0])
# Group by provider
for provider in all_providers:
provider_files = [f for f in result_files if f.name.startswith(f"{provider}_")]
# Sort by timestamp (newest first)
provider_files.sort(key=lambda x: int(x.stem.split("_")[1]), reverse=True)
if provider_files:
provider_results[provider] = provider_files[0]
provider = file.stem
if provider:
provider_results[provider] = file
return provider_results
def generate_report(results_dict, output_file=None):
"""Generate the markdown report"""
def generate_report(
results_dict: Dict[str, Any],
providers: Dict[str, Set[str]],
all_tests: Set[str],
provider_timestamps: Dict[str, str],
output_file=None,
):
"""Generate the markdown report.
Args:
results_dict: Aggregated results [provider][model][test_name] -> status.
providers: Dict of all providers and their models {provider: {models}}.
all_tests: Set of all test names found.
provider_timestamps: Dict of provider to timestamp when tests were run
output_file: Optional path to save the report.
"""
if output_file is None:
# Default to creating the report in the same directory as this script
output_file = Path(__file__).parent / "REPORT.md"
else:
output_file = Path(output_file)
# Get the timestamp from result files
provider_timestamps = {}
provider_results = get_latest_results_by_provider()
for provider, result_file in provider_results.items():
# Extract timestamp from filename (format: provider_timestamp.json)
try:
timestamp_str = result_file.stem.split("_")[1]
timestamp = int(timestamp_str)
formatted_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(timestamp))
provider_timestamps[provider] = formatted_time
except (IndexError, ValueError):
provider_timestamps[provider] = "Unknown"
# Convert provider model sets to sorted lists (use passed-in providers dict)
providers_sorted = {prov: sorted(models) for prov, models in providers.items()}
# Convert provider model sets to sorted lists
for provider in PROVIDERS:
PROVIDERS[provider] = sorted(PROVIDERS[provider])
# Sort tests alphabetically (use passed-in all_tests set)
sorted_tests = sorted(all_tests)
# Sort tests alphabetically
sorted_tests = sorted(ALL_TESTS)
# Calculate counts for each base test name
base_test_case_counts: DefaultDict[str, int] = defaultdict(int)
base_test_name_map: Dict[str, str] = {}
for test_name in sorted_tests:
match = re.match(r"^(.*?)( \([^)]+\))?$", test_name)
if match:
base_name = match.group(1).strip()
base_test_case_counts[base_name] += 1
base_test_name_map[test_name] = base_name
else:
# Should not happen with current naming, but handle defensively
base_test_case_counts[test_name] += 1
base_test_name_map[test_name] = test_name
if not sorted_tests:
print("Warning: No test results found to generate a report.")
# Optionally create an empty report or return early
with open(output_file, "w") as f:
f.write("# Test Results Report\n\nNo test results found.\n")
print(f"Generated empty report: {output_file}")
return
report = ["# Test Results Report\n"]
report.append(f"*Generated on: {time.strftime('%Y-%m-%d %H:%M:%S')}*\n")
@@ -336,19 +323,15 @@ def generate_report(results_dict, output_file=None):
# Add a summary section
report.append("## Summary\n")
# Count total tests and passes
# Count total tests and passes (use passed-in providers and all_tests)
total_tests = 0
passed_tests = 0
provider_totals = {}
# Prepare summary data
for provider in PROVIDERS.keys():
for provider, models in providers_sorted.items():
provider_passed = 0
provider_total = 0
if provider in results_dict:
provider_models = PROVIDERS[provider]
for model in provider_models:
for model in models:
if model in results_dict[provider]:
model_results = results_dict[provider][model]
for test in sorted_tests:
@@ -358,33 +341,26 @@ def generate_report(results_dict, output_file=None):
if model_results[test]:
provider_passed += 1
passed_tests += 1
provider_totals[provider] = (provider_passed, provider_total)
# Add summary table
# Add summary table (use passed-in providers dict)
report.append("| Provider | Pass Rate | Tests Passed | Total Tests |")
report.append("| --- | --- | --- | --- |")
# Use the custom order for summary table
for provider in [p for p in PROVIDER_ORDER if p in PROVIDERS]:
for provider in [p for p in PROVIDER_ORDER if p in providers]: # Check against keys of passed-in dict
passed, total = provider_totals.get(provider, (0, 0))
pass_rate = f"{(passed / total * 100):.1f}%" if total > 0 else "N/A"
report.append(f"| {provider.capitalize()} | {pass_rate} | {passed} | {total} |")
# Add providers not in the custom order
for provider in [p for p in PROVIDERS if p not in PROVIDER_ORDER]:
for provider in [p for p in providers if p not in PROVIDER_ORDER]: # Check against keys of passed-in dict
passed, total = provider_totals.get(provider, (0, 0))
pass_rate = f"{(passed / total * 100):.1f}%" if total > 0 else "N/A"
report.append(f"| {provider.capitalize()} | {pass_rate} | {passed} | {total} |")
report.append("\n")
# Process each provider in the custom order, then any additional providers
for provider in sorted(
PROVIDERS.keys(), key=lambda p: (PROVIDER_ORDER.index(p) if p in PROVIDER_ORDER else float("inf"), p)
providers_sorted.keys(), key=lambda p: (PROVIDER_ORDER.index(p) if p in PROVIDER_ORDER else float("inf"), p)
):
if not PROVIDERS[provider]:
# Skip providers with no models
provider_models = providers_sorted[provider] # Use sorted models
if not provider_models:
continue
report.append(f"\n## {provider.capitalize()}\n")
@@ -394,34 +370,70 @@ def generate_report(results_dict, output_file=None):
report.append(f"*Tests run on: {provider_timestamps[provider]}*\n")
# Add test command for reproducing results
test_cmd = f"pytest tests/verifications/openai/test_chat_completion.py --provider={provider} -v"
report.append(f"```bash\n{test_cmd}\n```\n")
test_cmd_all = f"pytest tests/verifications/openai_api/test_chat_completion.py --provider={provider} -v"
report.append(f"```bash\n# Run all tests for this provider:\n{test_cmd_all}\n")
# Get the relevant models for this provider
provider_models = PROVIDERS[provider]
# Find an example test with a case ID
example_base_test_name = None
example_case_id = None
# Get first test as fallback base, handle empty list
first_test_name = sorted_tests[0] if sorted_tests else "unknown_test"
# Create table header with models as columns
header = "| Test | " + " | ".join(provider_models) + " |"
match = re.match(r"^(.*?) \((.*?)\)$", first_test_name)
if match:
example_base_test_name = match.group(1).strip()
example_case_id = match.group(2).strip()
else:
example_base_test_name = first_test_name
base_name = base_test_name_map.get(first_test_name, first_test_name) # Get base name
case_count = base_test_case_counts.get(base_name, 1) # Get count
filter_str = f"{example_base_test_name} and {example_case_id}" if case_count > 1 else example_base_test_name
test_cmd_specific_case = (
f'pytest tests/verifications/openai_api/test_chat_completion.py --provider={provider} -k "{filter_str}"'
)
report.append(
f"# Example: Run only the '{example_case_id}' case of {example_base_test_name}:\n{test_cmd_specific_case}\n```\n"
)
# Get display names (use passed-in providers dict)
provider_config = VERIFICATION_CONFIG.get("providers", {}).get(provider, {})
display_name_map = provider_config.get("model_display_names", {})
# Add Model Key Table (use provider_models)
report.append(f"\n**Model Key ({provider.capitalize()})**\n")
provider_key_lines = ["| Display Name | Full Model ID |", "| --- | --- |"]
for model_id in provider_models:
display_name = display_name_map.get(model_id, model_id)
provider_key_lines.append(f"| {display_name} | `{model_id}` |")
report.extend(provider_key_lines)
report.append("\n")
# Create results table header (use provider_models)
display_names = [display_name_map.get(m, m) for m in provider_models]
header = "| Test | " + " | ".join(display_names) + " |"
separator = "| --- | " + " | ".join(["---"] * len(provider_models)) + " |"
report.append(header)
report.append(separator)
# Get results for this provider
provider_results = results_dict.get(provider, {})
# Get results for this provider from results_dict
provider_results_data = results_dict.get(provider, {})
# Add rows for each test
# Add rows for each test (use sorted_tests)
for test in sorted_tests:
row = f"| {test} |"
# Determine display name based on case count
base_name = base_test_name_map.get(test, test) # Get base name
case_count = base_test_case_counts.get(base_name, 1) # Get count
display_test_name = base_name if case_count == 1 else test # Choose display name
row = f"| {display_test_name} |" # Use display name
# Add results for each model in this test
for model in provider_models:
if model in provider_results and test in provider_results[model]:
result = pass_icon if provider_results[model][test] else fail_icon
for model_id in provider_models:
if model_id in provider_results_data and test in provider_results_data[model_id]:
result = pass_icon if provider_results_data[model_id][test] else fail_icon
else:
result = na_icon
row += f" {result} |"
report.append(row)
# Write to file
@@ -442,9 +454,14 @@ def main():
help="Specify providers to test (comma-separated or space-separated, default: all)",
)
parser.add_argument("--output", type=str, help="Output file location (default: tests/verifications/REPORT.md)")
parser.add_argument("--k", type=str, help="Keyword expression to filter tests (passed to pytest -k)")
args = parser.parse_args()
all_results = {}
# Initialize collections to aggregate results in main
aggregated_providers = defaultdict(set)
aggregated_tests = set()
provider_timestamps = {}
if args.run_tests:
# Get list of available providers from command line or use detected providers
@@ -463,22 +480,31 @@ def main():
for provider in test_providers:
provider = provider.strip() # Remove any whitespace
result_file = run_tests(provider)
result_file = run_tests(provider, keyword=args.k)
if result_file:
provider_results = parse_results(result_file)
all_results.update(provider_results)
# Parse and aggregate results
parsed_results, providers_in_file, tests_in_file, run_timestamp = parse_results(result_file)
all_results.update(parsed_results)
for prov, models in providers_in_file.items():
aggregated_providers[prov].update(models)
if run_timestamp:
provider_timestamps[prov] = run_timestamp
aggregated_tests.update(tests_in_file)
else:
# Use existing results
provider_result_files = get_latest_results_by_provider()
provider_result_files = get_all_result_files_by_provider()
for result_file in provider_result_files.values():
provider_results = parse_results(result_file)
all_results.update(provider_results)
# Parse and aggregate results
parsed_results, providers_in_file, tests_in_file, run_timestamp = parse_results(result_file)
all_results.update(parsed_results)
for prov, models in providers_in_file.items():
aggregated_providers[prov].update(models)
if run_timestamp:
provider_timestamps[prov] = run_timestamp
aggregated_tests.update(tests_in_file)
# Generate the report
generate_report(all_results, args.output)
cleanup_old_results()
generate_report(all_results, aggregated_providers, aggregated_tests, provider_timestamps, args.output)
if __name__ == "__main__":


@@ -1,97 +0,0 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

import os

import pytest
from openai import OpenAI


@pytest.fixture
def providers_model_mapping():
    """
    Mapping from model names used in test cases to provider's model names.
    """
    return {
        "fireworks": {
            "Llama-3.3-70B-Instruct": "accounts/fireworks/models/llama-v3p1-70b-instruct",
            "Llama-3.2-11B-Vision-Instruct": "accounts/fireworks/models/llama-v3p2-11b-vision-instruct",
            "Llama-4-Scout-17B-16E-Instruct": "accounts/fireworks/models/llama4-scout-instruct-basic",
            "Llama-4-Maverick-17B-128E-Instruct": "accounts/fireworks/models/llama4-maverick-instruct-basic",
        },
        "together": {
            "Llama-3.3-70B-Instruct": "meta-llama/Llama-3.3-70B-Instruct-Turbo",
            "Llama-3.2-11B-Vision-Instruct": "meta-llama/Llama-3.2-11B-Vision-Instruct-Turbo",
            "Llama-4-Scout-17B-16E-Instruct": "meta-llama/Llama-4-Scout-17B-16E-Instruct",
            "Llama-4-Maverick-17B-128E-Instruct": "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8",
        },
        "groq": {
            "Llama-3.3-70B-Instruct": "llama-3.3-70b-versatile",
            "Llama-3.2-11B-Vision-Instruct": "llama-3.2-11b-vision-preview",
            "Llama-4-Scout-17B-16E-Instruct": "llama-4-scout-17b-16e-instruct",
            "Llama-4-Maverick-17B-128E-Instruct": "llama-4-maverick-17b-128e-instruct",
        },
        "cerebras": {
            "Llama-3.3-70B-Instruct": "llama-3.3-70b",
        },
        "openai": {
            "gpt-4o": "gpt-4o",
            "gpt-4o-mini": "gpt-4o-mini",
        },
    }


@pytest.fixture
def provider_metadata():
    return {
        "fireworks": ("https://api.fireworks.ai/inference/v1", "FIREWORKS_API_KEY"),
        "together": ("https://api.together.xyz/v1", "TOGETHER_API_KEY"),
        "groq": ("https://api.groq.com/openai/v1", "GROQ_API_KEY"),
        "cerebras": ("https://api.cerebras.ai/v1", "CEREBRAS_API_KEY"),
        "openai": ("https://api.openai.com/v1", "OPENAI_API_KEY"),
    }


@pytest.fixture
def provider(request, provider_metadata):
    provider = request.config.getoption("--provider")
    base_url = request.config.getoption("--base-url")

    if provider and base_url and provider_metadata[provider][0] != base_url:
        raise ValueError(f"Provider {provider} is not supported for base URL {base_url}")

    if not provider:
        if not base_url:
            raise ValueError("Provider and base URL are not provided")
        for provider, metadata in provider_metadata.items():
            if metadata[0] == base_url:
                provider = provider
                break

    return provider


@pytest.fixture
def base_url(request, provider, provider_metadata):
    return request.config.getoption("--base-url") or provider_metadata[provider][0]


@pytest.fixture
def api_key(request, provider, provider_metadata):
    return request.config.getoption("--api-key") or os.getenv(provider_metadata[provider][1])


@pytest.fixture
def model_mapping(provider, providers_model_mapping):
    return providers_model_mapping[provider]


@pytest.fixture
def openai_client(base_url, api_key):
    return OpenAI(
        base_url=base_url,
        api_key=api_key,
    )


@@ -1,202 +0,0 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

from typing import Any

import pytest
from pydantic import BaseModel

from tests.verifications.openai.fixtures.load import load_test_cases

chat_completion_test_cases = load_test_cases("chat_completion")


@pytest.fixture
def correct_model_name(model, provider, providers_model_mapping):
    """Return the provider-specific model name based on the generic model name."""
    mapping = providers_model_mapping[provider]
    if model not in mapping:
        pytest.skip(f"Provider {provider} does not support model {model}")
    return mapping[model]


@pytest.mark.parametrize("model", chat_completion_test_cases["test_chat_basic"]["test_params"]["model"])
@pytest.mark.parametrize(
    "input_output",
    chat_completion_test_cases["test_chat_basic"]["test_params"]["input_output"],
)
def test_chat_non_streaming_basic(openai_client, input_output, correct_model_name):
    response = openai_client.chat.completions.create(
        model=correct_model_name,
        messages=input_output["input"]["messages"],
        stream=False,
    )
    assert response.choices[0].message.role == "assistant"
    assert input_output["output"].lower() in response.choices[0].message.content.lower()


@pytest.mark.parametrize("model", chat_completion_test_cases["test_chat_basic"]["test_params"]["model"])
@pytest.mark.parametrize(
    "input_output",
    chat_completion_test_cases["test_chat_basic"]["test_params"]["input_output"],
)
def test_chat_streaming_basic(openai_client, input_output, correct_model_name):
    response = openai_client.chat.completions.create(
        model=correct_model_name,
        messages=input_output["input"]["messages"],
        stream=True,
    )
    content = ""
    for chunk in response:
        content += chunk.choices[0].delta.content or ""

    # TODO: add detailed type validation

    assert input_output["output"].lower() in content.lower()


@pytest.mark.parametrize("model", chat_completion_test_cases["test_chat_image"]["test_params"]["model"])
@pytest.mark.parametrize(
    "input_output",
    chat_completion_test_cases["test_chat_image"]["test_params"]["input_output"],
)
def test_chat_non_streaming_image(openai_client, input_output, correct_model_name):
    response = openai_client.chat.completions.create(
        model=correct_model_name,
        messages=input_output["input"]["messages"],
        stream=False,
    )
    assert response.choices[0].message.role == "assistant"
    assert input_output["output"].lower() in response.choices[0].message.content.lower()


@pytest.mark.parametrize("model", chat_completion_test_cases["test_chat_image"]["test_params"]["model"])
@pytest.mark.parametrize(
    "input_output",
    chat_completion_test_cases["test_chat_image"]["test_params"]["input_output"],
)
def test_chat_streaming_image(openai_client, input_output, correct_model_name):
    response = openai_client.chat.completions.create(
        model=correct_model_name,
        messages=input_output["input"]["messages"],
        stream=True,
    )
    content = ""
    for chunk in response:
        content += chunk.choices[0].delta.content or ""

    # TODO: add detailed type validation

    assert input_output["output"].lower() in content.lower()


@pytest.mark.parametrize(
    "model",
    chat_completion_test_cases["test_chat_structured_output"]["test_params"]["model"],
)
@pytest.mark.parametrize(
    "input_output",
    chat_completion_test_cases["test_chat_structured_output"]["test_params"]["input_output"],
)
def test_chat_non_streaming_structured_output(openai_client, input_output, correct_model_name):
    response = openai_client.chat.completions.create(
        model=correct_model_name,
        messages=input_output["input"]["messages"],
        response_format=input_output["input"]["response_format"],
        stream=False,
    )
    assert response.choices[0].message.role == "assistant"
    maybe_json_content = response.choices[0].message.content

    validate_structured_output(maybe_json_content, input_output["output"])


@pytest.mark.parametrize(
    "model",
    chat_completion_test_cases["test_chat_structured_output"]["test_params"]["model"],
)
@pytest.mark.parametrize(
    "input_output",
    chat_completion_test_cases["test_chat_structured_output"]["test_params"]["input_output"],
)
def test_chat_streaming_structured_output(openai_client, input_output, correct_model_name):
    response = openai_client.chat.completions.create(
        model=correct_model_name,
        messages=input_output["input"]["messages"],
        response_format=input_output["input"]["response_format"],
        stream=True,
    )
    maybe_json_content = ""
    for chunk in response:
        maybe_json_content += chunk.choices[0].delta.content or ""
    validate_structured_output(maybe_json_content, input_output["output"])


@pytest.mark.parametrize(
    "model",
    chat_completion_test_cases["test_tool_calling"]["test_params"]["model"],
)
@pytest.mark.parametrize(
"input_output",
chat_completion_test_cases["test_tool_calling"]["test_params"]["input_output"],
)
def test_chat_non_streaming_tool_calling(openai_client, input_output, correct_model_name):
response = openai_client.chat.completions.create(
model=correct_model_name,
messages=input_output["input"]["messages"],
tools=input_output["input"]["tools"],
stream=False,
)
assert response.choices[0].message.role == "assistant"
assert len(response.choices[0].message.tool_calls) > 0
assert input_output["output"] == "get_weather_tool_call"
assert response.choices[0].message.tool_calls[0].function.name == "get_weather"
# TODO: add detailed type validation
def get_structured_output(maybe_json_content: str, schema_name: str) -> Any | None:
if schema_name == "valid_calendar_event":
class CalendarEvent(BaseModel):
name: str
date: str
participants: list[str]
try:
calendar_event = CalendarEvent.model_validate_json(maybe_json_content)
return calendar_event
except Exception:
return None
elif schema_name == "valid_math_reasoning":
class Step(BaseModel):
explanation: str
output: str
class MathReasoning(BaseModel):
steps: list[Step]
final_answer: str
try:
math_reasoning = MathReasoning.model_validate_json(maybe_json_content)
return math_reasoning
except Exception:
return None
return None
def validate_structured_output(maybe_json_content: str, schema_name: str) -> None:
structured_output = get_structured_output(maybe_json_content, schema_name)
assert structured_output is not None
if schema_name == "valid_calendar_event":
assert structured_output.name is not None
assert structured_output.date is not None
assert len(structured_output.participants) == 2
elif schema_name == "valid_math_reasoning":
assert len(structured_output.final_answer) > 0


@ -0,0 +1,105 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
import os
from pathlib import Path
import pytest
import yaml
from openai import OpenAI
# --- Helper Function to Load Config ---
def _load_all_verification_configs():
"""Load and aggregate verification configs from the conf/ directory."""
# Note: Path is relative to *this* file (fixtures.py)
conf_dir = Path(__file__).parent.parent.parent / "conf"
if not conf_dir.is_dir():
# Raise directly here; the verification_config fixture below converts this
# into a pytest.fail at collection time, and direct callers can handle it.
raise FileNotFoundError(f"Verification config directory not found at {conf_dir}")
all_provider_configs = {}
yaml_files = list(conf_dir.glob("*.yaml"))
if not yaml_files:
raise FileNotFoundError(f"No YAML configuration files found in {conf_dir}")
for config_path in yaml_files:
provider_name = config_path.stem
try:
with open(config_path, "r") as f:
provider_config = yaml.safe_load(f)
if provider_config:
all_provider_configs[provider_name] = provider_config
else:
# Warn and skip files that parse to an empty document.
print(f"Warning: Config file {config_path} is empty or invalid.")
except Exception as e:
raise IOError(f"Error loading config file {config_path}: {e}") from e
return {"providers": all_provider_configs}
# --- End Helper Function ---
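For reference, each YAML file under conf/ is expected to parse into a mapping with at least base_url and api_key_var plus a models list, since those are the keys the fixtures here and pytest_generate_tests in the test module read. A minimal sketch of what yaml.safe_load might return for a hypothetical conf/fireworks.yaml (all values illustrative):

sample_provider_config = {
    "base_url": "https://api.fireworks.ai/inference/v1",
    "api_key_var": "FIREWORKS_API_KEY",
    "models": [
        "accounts/fireworks/models/llama-v3p2-11b-vision-instruct",
        "accounts/fireworks/models/llama4-scout-instruct-basic",
    ],
    # An optional "test_exclusions" mapping may also be present; it is consumed
    # by should_skip_test in the test module further down this diff.
}
# _load_all_verification_configs() then surfaces this as
# {"providers": {"fireworks": sample_provider_config, ...}}.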
@pytest.fixture(scope="session")
def verification_config():
"""Pytest fixture to provide the loaded verification config."""
try:
return _load_all_verification_configs()
except (FileNotFoundError, IOError) as e:
pytest.fail(str(e)) # Fail test collection if config loading fails
@pytest.fixture
def provider(request, verification_config):
provider = request.config.getoption("--provider")
base_url = request.config.getoption("--base-url")
if provider and base_url and verification_config["providers"][provider]["base_url"] != base_url:
raise ValueError(f"Provider {provider} is not supported for base URL {base_url}")
if not provider:
if not base_url:
raise ValueError("Provider and base URL are not provided")
for provider, metadata in verification_config["providers"].items():
if metadata["base_url"] == base_url:
provider = provider
break
return provider
@pytest.fixture
def base_url(request, provider, verification_config):
return request.config.getoption("--base-url") or verification_config["providers"][provider]["base_url"]
@pytest.fixture
def api_key(request, provider, verification_config):
provider_conf = verification_config.get("providers", {}).get(provider, {})
api_key_env_var = provider_conf.get("api_key_var")
key_from_option = request.config.getoption("--api-key")
key_from_env = os.getenv(api_key_env_var) if api_key_env_var else None
final_key = key_from_option or key_from_env
return final_key
@pytest.fixture
def model_mapping(provider, providers_model_mapping):
return providers_model_mapping[provider]
@pytest.fixture
def openai_client(base_url, api_key):
return OpenAI(
base_url=base_url,
api_key=api_key,
)
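Net effect of these fixtures: a plain OpenAI client pointed at the selected provider's OpenAI-compatible endpoint, with the key pulled from the configured environment variable. A self-contained sketch of that end state, reusing the Groq endpoint, env var, and model id that appear in the provider tables earlier in this diff (purely illustrative):

import os

from openai import OpenAI

client = OpenAI(
    base_url="https://api.groq.com/openai/v1",
    api_key=os.getenv("GROQ_API_KEY"),
)
response = client.chat.completions.create(
    model="llama-3.3-70b-versatile",
    messages=[{"role": "user", "content": "Which planet do humans live on?"}],
    stream=False,
)
print(response.choices[0].message.content)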


@ -1,31 +1,24 @@
test_chat_basic:
test_name: test_chat_basic
test_params:
input_output:
- input:
case:
- case_id: "earth"
input:
messages:
- content: Which planet do humans live on?
role: user
output: Earth
- input:
- case_id: "saturn"
input:
messages:
- content: Which planet has rings around it with a name starting with letter
S?
role: user
output: Saturn
model:
- Llama-3.3-8B-Instruct
- Llama-3.3-70B-Instruct
- Llama-4-Scout-17B-16E
- Llama-4-Scout-17B-16E-Instruct
- Llama-4-Maverick-17B-128E
- Llama-4-Maverick-17B-128E-Instruct
- gpt-4o
- gpt-4o-mini
test_chat_image:
test_name: test_chat_image
test_params:
input_output:
case:
- input:
messages:
- content:
@ -36,18 +29,12 @@ test_chat_image:
type: image_url
role: user
output: llama
model:
- Llama-4-Scout-17B-16E
- Llama-4-Scout-17B-16E-Instruct
- Llama-4-Maverick-17B-128E
- Llama-4-Maverick-17B-128E-Instruct
- gpt-4o
- gpt-4o-mini
test_chat_structured_output:
test_name: test_chat_structured_output
test_params:
input_output:
- input:
case:
- case_id: "calendar"
input:
messages:
- content: Extract the event information.
role: system
@ -77,7 +64,8 @@ test_chat_structured_output:
type: object
type: json_schema
output: valid_calendar_event
- input:
- case_id: "math"
input:
messages:
- content: You are a helpful math tutor. Guide the user through the solution
step by step.
@ -118,19 +106,10 @@ test_chat_structured_output:
type: object
type: json_schema
output: valid_math_reasoning
model:
- Llama-3.3-8B-Instruct
- Llama-3.3-70B-Instruct
- Llama-4-Scout-17B-16E
- Llama-4-Scout-17B-16E-Instruct
- Llama-4-Maverick-17B-128E
- Llama-4-Maverick-17B-128E-Instruct
- gpt-4o
- gpt-4o-mini
test_tool_calling:
test_name: test_tool_calling
test_params:
input_output:
case:
- input:
messages:
- content: You are a helpful assistant that can use tools to get information.
@ -152,11 +131,3 @@ test_tool_calling:
type: object
type: function
output: get_weather_tool_call
model:
- Llama-3.3-70B-Instruct
- Llama-4-Scout-17B-16E
- Llama-4-Scout-17B-16E-Instruct
- Llama-4-Maverick-17B-128E
- Llama-4-Maverick-17B-128E-Instruct
- gpt-4o
- gpt-4o-mini


@ -0,0 +1,326 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
import json
import re
from typing import Any
import pytest
from pydantic import BaseModel
from tests.verifications.openai_api.fixtures.fixtures import _load_all_verification_configs
from tests.verifications.openai_api.fixtures.load import load_test_cases
chat_completion_test_cases = load_test_cases("chat_completion")
def case_id_generator(case):
"""Generate a test ID from the case's 'case_id' field, or use a default."""
case_id = case.get("case_id")
if isinstance(case_id, (str, int)):
return re.sub(r"\W|^(?=\d)", "_", str(case_id))
return None
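For example, under Python 3.7+ zero-width match semantics a case_id of "earth" passes through untouched, while a hypothetical id such as "3.1 math" is sanitized to "_3_1_math":

assert case_id_generator({"case_id": "earth"}) == "earth"
assert case_id_generator({"case_id": "3.1 math"}) == "_3_1_math"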
def pytest_generate_tests(metafunc):
"""Dynamically parametrize tests based on the selected provider and config."""
if "model" in metafunc.fixturenames:
provider = metafunc.config.getoption("provider")
if not provider:
print("Warning: --provider not specified. Skipping model parametrization.")
metafunc.parametrize("model", [])
return
try:
config_data = _load_all_verification_configs()
except (FileNotFoundError, IOError) as e:
print(f"ERROR loading verification configs: {e}")
config_data = {"providers": {}}
provider_config = config_data.get("providers", {}).get(provider)
if provider_config:
models = provider_config.get("models", [])
if models:
metafunc.parametrize("model", models)
else:
print(f"Warning: No models found for provider '{provider}' in config.")
metafunc.parametrize("model", []) # Parametrize empty if no models found
else:
print(f"Warning: Provider '{provider}' not found in config. No models parametrized.")
metafunc.parametrize("model", []) # Parametrize empty if provider not found
def should_skip_test(verification_config, provider, model, test_name_base):
"""Check if a test should be skipped based on config exclusions."""
provider_config = verification_config.get("providers", {}).get(provider)
if not provider_config:
return False # No config for provider, don't skip
exclusions = provider_config.get("test_exclusions", {}).get(model, [])
return test_name_base in exclusions
# Helper to get the base test name from the request object
def get_base_test_name(request):
return request.node.originalname
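A concrete illustration of the exclusion lookup; the config fragment below is hypothetical, but the keys match what should_skip_test reads:

sample_config = {
    "providers": {
        "fireworks": {
            "test_exclusions": {
                "accounts/fireworks/models/llama-v3p2-11b-vision-instruct": [
                    "test_chat_non_streaming_image",
                ],
            },
        },
    },
}
model_id = "accounts/fireworks/models/llama-v3p2-11b-vision-instruct"
assert should_skip_test(sample_config, "fireworks", model_id, "test_chat_non_streaming_image")
assert not should_skip_test(sample_config, "fireworks", model_id, "test_chat_non_streaming_basic")
assert not should_skip_test(sample_config, "together", model_id, "test_chat_non_streaming_image")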
# --- Test Functions ---
@pytest.mark.parametrize(
"case",
chat_completion_test_cases["test_chat_basic"]["test_params"]["case"],
ids=case_id_generator,
)
def test_chat_non_streaming_basic(request, openai_client, model, provider, verification_config, case):
test_name_base = get_base_test_name(request)
if should_skip_test(verification_config, provider, model, test_name_base):
pytest.skip(f"Skipping {test_name_base} for model {model} on provider {provider} based on config.")
response = openai_client.chat.completions.create(
model=model,
messages=case["input"]["messages"],
stream=False,
)
assert response.choices[0].message.role == "assistant"
assert case["output"].lower() in response.choices[0].message.content.lower()
@pytest.mark.parametrize(
"case",
chat_completion_test_cases["test_chat_basic"]["test_params"]["case"],
ids=case_id_generator,
)
def test_chat_streaming_basic(request, openai_client, model, provider, verification_config, case):
test_name_base = get_base_test_name(request)
if should_skip_test(verification_config, provider, model, test_name_base):
pytest.skip(f"Skipping {test_name_base} for model {model} on provider {provider} based on config.")
response = openai_client.chat.completions.create(
model=model,
messages=case["input"]["messages"],
stream=True,
)
content = ""
for chunk in response:
content += chunk.choices[0].delta.content or ""
# TODO: add detailed type validation
assert case["output"].lower() in content.lower()
@pytest.mark.parametrize(
"case",
chat_completion_test_cases["test_chat_image"]["test_params"]["case"],
ids=case_id_generator,
)
def test_chat_non_streaming_image(request, openai_client, model, provider, verification_config, case):
test_name_base = get_base_test_name(request)
if should_skip_test(verification_config, provider, model, test_name_base):
pytest.skip(f"Skipping {test_name_base} for model {model} on provider {provider} based on config.")
response = openai_client.chat.completions.create(
model=model,
messages=case["input"]["messages"],
stream=False,
)
assert response.choices[0].message.role == "assistant"
assert case["output"].lower() in response.choices[0].message.content.lower()
@pytest.mark.parametrize(
"case",
chat_completion_test_cases["test_chat_image"]["test_params"]["case"],
ids=case_id_generator,
)
def test_chat_streaming_image(request, openai_client, model, provider, verification_config, case):
test_name_base = get_base_test_name(request)
if should_skip_test(verification_config, provider, model, test_name_base):
pytest.skip(f"Skipping {test_name_base} for model {model} on provider {provider} based on config.")
response = openai_client.chat.completions.create(
model=model,
messages=case["input"]["messages"],
stream=True,
)
content = ""
for chunk in response:
content += chunk.choices[0].delta.content or ""
# TODO: add detailed type validation
assert case["output"].lower() in content.lower()
@pytest.mark.parametrize(
"case",
chat_completion_test_cases["test_chat_structured_output"]["test_params"]["case"],
ids=case_id_generator,
)
def test_chat_non_streaming_structured_output(request, openai_client, model, provider, verification_config, case):
test_name_base = get_base_test_name(request)
if should_skip_test(verification_config, provider, model, test_name_base):
pytest.skip(f"Skipping {test_name_base} for model {model} on provider {provider} based on config.")
response = openai_client.chat.completions.create(
model=model,
messages=case["input"]["messages"],
response_format=case["input"]["response_format"],
stream=False,
)
assert response.choices[0].message.role == "assistant"
maybe_json_content = response.choices[0].message.content
validate_structured_output(maybe_json_content, case["output"])
@pytest.mark.parametrize(
"case",
chat_completion_test_cases["test_chat_structured_output"]["test_params"]["case"],
ids=case_id_generator,
)
def test_chat_streaming_structured_output(request, openai_client, model, provider, verification_config, case):
test_name_base = get_base_test_name(request)
if should_skip_test(verification_config, provider, model, test_name_base):
pytest.skip(f"Skipping {test_name_base} for model {model} on provider {provider} based on config.")
response = openai_client.chat.completions.create(
model=model,
messages=case["input"]["messages"],
response_format=case["input"]["response_format"],
stream=True,
)
maybe_json_content = ""
for chunk in response:
maybe_json_content += chunk.choices[0].delta.content or ""
validate_structured_output(maybe_json_content, case["output"])
@pytest.mark.parametrize(
"case",
chat_completion_test_cases["test_tool_calling"]["test_params"]["case"],
ids=case_id_generator,
)
def test_chat_non_streaming_tool_calling(request, openai_client, model, provider, verification_config, case):
test_name_base = get_base_test_name(request)
if should_skip_test(verification_config, provider, model, test_name_base):
pytest.skip(f"Skipping {test_name_base} for model {model} on provider {provider} based on config.")
response = openai_client.chat.completions.create(
model=model,
messages=case["input"]["messages"],
tools=case["input"]["tools"],
stream=False,
)
assert response.choices[0].message.role == "assistant"
assert len(response.choices[0].message.tool_calls) > 0
assert case["output"] == "get_weather_tool_call"
assert response.choices[0].message.tool_calls[0].function.name == "get_weather"
# TODO: add detailed type validation
@pytest.mark.parametrize(
"case",
chat_completion_test_cases["test_tool_calling"]["test_params"]["case"],
ids=case_id_generator,
)
def test_chat_streaming_tool_calling(request, openai_client, model, provider, verification_config, case):
test_name_base = get_base_test_name(request)
if should_skip_test(verification_config, provider, model, test_name_base):
pytest.skip(f"Skipping {test_name_base} for model {model} on provider {provider} based on config.")
stream = openai_client.chat.completions.create(
model=model,
messages=case["input"]["messages"],
tools=case["input"]["tools"],
stream=True,
)
# Accumulate partial tool_calls here
tool_calls_buffer = {}
current_id = None
# Process streaming chunks
for chunk in stream:
choice = chunk.choices[0]
delta = choice.delta
if delta.tool_calls is None:
continue
for tool_call_delta in delta.tool_calls:
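# Stream deltas typically carry the tool call id only on the first fragment;
# reuse the last seen id so later argument fragments attach to the same call.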
if tool_call_delta.id:
current_id = tool_call_delta.id
call_id = current_id
func_delta = tool_call_delta.function
if call_id not in tool_calls_buffer:
tool_calls_buffer[call_id] = {
"id": call_id,
"type": tool_call_delta.type,
"name": func_delta.name,
"arguments": "",
}
if func_delta.arguments:
tool_calls_buffer[call_id]["arguments"] += func_delta.arguments
assert len(tool_calls_buffer) == 1
for call in tool_calls_buffer.values():
assert len(call["id"]) > 0
assert call["name"] == "get_weather"
args_dict = json.loads(call["arguments"])
assert "san francisco" in args_dict["location"].lower()
# --- Helper functions (structured output validation) ---
def get_structured_output(maybe_json_content: str, schema_name: str) -> Any | None:
if schema_name == "valid_calendar_event":
class CalendarEvent(BaseModel):
name: str
date: str
participants: list[str]
try:
calendar_event = CalendarEvent.model_validate_json(maybe_json_content)
return calendar_event
except Exception:
return None
elif schema_name == "valid_math_reasoning":
class Step(BaseModel):
explanation: str
output: str
class MathReasoning(BaseModel):
steps: list[Step]
final_answer: str
try:
math_reasoning = MathReasoning.model_validate_json(maybe_json_content)
return math_reasoning
except Exception:
return None
return None
def validate_structured_output(maybe_json_content: str, schema_name: str) -> None:
structured_output = get_structured_output(maybe_json_content, schema_name)
assert structured_output is not None
if schema_name == "valid_calendar_event":
assert structured_output.name is not None
assert structured_output.date is not None
assert len(structured_output.participants) == 2
elif schema_name == "valid_math_reasoning":
assert len(structured_output.final_answer) > 0
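A minimal sketch of a model response that satisfies the valid_calendar_event checks above; the field values are made up, and the participants list deliberately has the two entries the assertion expects:

sample_event_json = '{"name": "Science Fair", "date": "Friday", "participants": ["Alice", "Bob"]}'
validate_structured_output(sample_event_json, "valid_calendar_event")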

File diff suppressed because it is too large

File diff suppressed because it is too large


@ -0,0 +1,945 @@
{
"created": 1744328898.0248861,
"duration": 47.561042070388794,
"exitcode": 0,
"root": "/Users/erichuang/projects/llama-stack",
"environment": {},
"summary": {
"passed": 24,
"total": 24,
"collected": 24
},
"collectors": [
{
"nodeid": "",
"outcome": "passed",
"result": [
{
"nodeid": "tests/verifications/openai_api/test_chat_completion.py",
"type": "Module"
}
]
},
{
"nodeid": "tests/verifications/openai_api/test_chat_completion.py",
"outcome": "passed",
"result": [
{
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_basic[gpt-4o-earth]",
"type": "Function",
"lineno": 73
},
{
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_basic[gpt-4o-saturn]",
"type": "Function",
"lineno": 73
},
{
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_basic[gpt-4o-mini-earth]",
"type": "Function",
"lineno": 73
},
{
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_basic[gpt-4o-mini-saturn]",
"type": "Function",
"lineno": 73
},
{
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_basic[gpt-4o-earth]",
"type": "Function",
"lineno": 92
},
{
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_basic[gpt-4o-saturn]",
"type": "Function",
"lineno": 92
},
{
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_basic[gpt-4o-mini-earth]",
"type": "Function",
"lineno": 92
},
{
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_basic[gpt-4o-mini-saturn]",
"type": "Function",
"lineno": 92
},
{
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_image[gpt-4o-case0]",
"type": "Function",
"lineno": 116
},
{
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_image[gpt-4o-mini-case0]",
"type": "Function",
"lineno": 116
},
{
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_image[gpt-4o-case0]",
"type": "Function",
"lineno": 135
},
{
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_image[gpt-4o-mini-case0]",
"type": "Function",
"lineno": 135
},
{
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_structured_output[gpt-4o-calendar]",
"type": "Function",
"lineno": 159
},
{
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_structured_output[gpt-4o-math]",
"type": "Function",
"lineno": 159
},
{
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_structured_output[gpt-4o-mini-calendar]",
"type": "Function",
"lineno": 159
},
{
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_structured_output[gpt-4o-mini-math]",
"type": "Function",
"lineno": 159
},
{
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_structured_output[gpt-4o-calendar]",
"type": "Function",
"lineno": 182
},
{
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_structured_output[gpt-4o-math]",
"type": "Function",
"lineno": 182
},
{
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_structured_output[gpt-4o-mini-calendar]",
"type": "Function",
"lineno": 182
},
{
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_structured_output[gpt-4o-mini-math]",
"type": "Function",
"lineno": 182
},
{
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_tool_calling[gpt-4o-case0]",
"type": "Function",
"lineno": 204
},
{
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_tool_calling[gpt-4o-mini-case0]",
"type": "Function",
"lineno": 204
},
{
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_tool_calling[gpt-4o-case0]",
"type": "Function",
"lineno": 228
},
{
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_tool_calling[gpt-4o-mini-case0]",
"type": "Function",
"lineno": 228
}
]
}
],
"tests": [
{
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_basic[gpt-4o-earth]",
"lineno": 73,
"outcome": "passed",
"keywords": [
"test_chat_non_streaming_basic[gpt-4o-earth]",
"parametrize",
"pytestmark",
"gpt-4o-earth",
"test_chat_completion.py",
"openai_api",
"verifications",
"tests",
"llama-stack",
""
],
"metadata": {
"model": "gpt-4o",
"case_id": "earth"
},
"setup": {
"duration": 0.0694252080284059,
"outcome": "passed"
},
"call": {
"duration": 0.5709165419684723,
"outcome": "passed"
},
"teardown": {
"duration": 0.0007626248989254236,
"outcome": "passed"
}
},
{
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_basic[gpt-4o-saturn]",
"lineno": 73,
"outcome": "passed",
"keywords": [
"test_chat_non_streaming_basic[gpt-4o-saturn]",
"parametrize",
"pytestmark",
"gpt-4o-saturn",
"test_chat_completion.py",
"openai_api",
"verifications",
"tests",
"llama-stack",
""
],
"metadata": {
"model": "gpt-4o",
"case_id": "saturn"
},
"setup": {
"duration": 0.010281750001013279,
"outcome": "passed"
},
"call": {
"duration": 0.6309260830748826,
"outcome": "passed"
},
"teardown": {
"duration": 0.0001824579667299986,
"outcome": "passed"
}
},
{
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_basic[gpt-4o-mini-earth]",
"lineno": 73,
"outcome": "passed",
"keywords": [
"test_chat_non_streaming_basic[gpt-4o-mini-earth]",
"parametrize",
"pytestmark",
"gpt-4o-mini-earth",
"test_chat_completion.py",
"openai_api",
"verifications",
"tests",
"llama-stack",
""
],
"metadata": {
"model": "gpt-4o-mini",
"case_id": "earth"
},
"setup": {
"duration": 0.007922374992631376,
"outcome": "passed"
},
"call": {
"duration": 0.31756504194345325,
"outcome": "passed"
},
"teardown": {
"duration": 0.0005268750246614218,
"outcome": "passed"
}
},
{
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_basic[gpt-4o-mini-saturn]",
"lineno": 73,
"outcome": "passed",
"keywords": [
"test_chat_non_streaming_basic[gpt-4o-mini-saturn]",
"parametrize",
"pytestmark",
"gpt-4o-mini-saturn",
"test_chat_completion.py",
"openai_api",
"verifications",
"tests",
"llama-stack",
""
],
"metadata": {
"model": "gpt-4o-mini",
"case_id": "saturn"
},
"setup": {
"duration": 0.01643404201604426,
"outcome": "passed"
},
"call": {
"duration": 0.7479908330133185,
"outcome": "passed"
},
"teardown": {
"duration": 0.0004037501057609916,
"outcome": "passed"
}
},
{
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_basic[gpt-4o-earth]",
"lineno": 92,
"outcome": "passed",
"keywords": [
"test_chat_streaming_basic[gpt-4o-earth]",
"parametrize",
"pytestmark",
"gpt-4o-earth",
"test_chat_completion.py",
"openai_api",
"verifications",
"tests",
"llama-stack",
""
],
"metadata": {
"model": "gpt-4o",
"case_id": "earth"
},
"setup": {
"duration": 0.021671707974746823,
"outcome": "passed"
},
"call": {
"duration": 0.6701172919711098,
"outcome": "passed"
},
"teardown": {
"duration": 0.0005569590721279383,
"outcome": "passed"
}
},
{
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_basic[gpt-4o-saturn]",
"lineno": 92,
"outcome": "passed",
"keywords": [
"test_chat_streaming_basic[gpt-4o-saturn]",
"parametrize",
"pytestmark",
"gpt-4o-saturn",
"test_chat_completion.py",
"openai_api",
"verifications",
"tests",
"llama-stack",
""
],
"metadata": {
"model": "gpt-4o",
"case_id": "saturn"
},
"setup": {
"duration": 0.015847125090658665,
"outcome": "passed"
},
"call": {
"duration": 0.636536999954842,
"outcome": "passed"
},
"teardown": {
"duration": 0.00029395800083875656,
"outcome": "passed"
}
},
{
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_basic[gpt-4o-mini-earth]",
"lineno": 92,
"outcome": "passed",
"keywords": [
"test_chat_streaming_basic[gpt-4o-mini-earth]",
"parametrize",
"pytestmark",
"gpt-4o-mini-earth",
"test_chat_completion.py",
"openai_api",
"verifications",
"tests",
"llama-stack",
""
],
"metadata": {
"model": "gpt-4o-mini",
"case_id": "earth"
},
"setup": {
"duration": 0.011792832985520363,
"outcome": "passed"
},
"call": {
"duration": 0.5610962919890881,
"outcome": "passed"
},
"teardown": {
"duration": 0.0003578749019652605,
"outcome": "passed"
}
},
{
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_basic[gpt-4o-mini-saturn]",
"lineno": 92,
"outcome": "passed",
"keywords": [
"test_chat_streaming_basic[gpt-4o-mini-saturn]",
"parametrize",
"pytestmark",
"gpt-4o-mini-saturn",
"test_chat_completion.py",
"openai_api",
"verifications",
"tests",
"llama-stack",
""
],
"metadata": {
"model": "gpt-4o-mini",
"case_id": "saturn"
},
"setup": {
"duration": 0.016500207944773138,
"outcome": "passed"
},
"call": {
"duration": 0.8060244580265135,
"outcome": "passed"
},
"teardown": {
"duration": 0.0005296670133247972,
"outcome": "passed"
}
},
{
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_image[gpt-4o-case0]",
"lineno": 116,
"outcome": "passed",
"keywords": [
"test_chat_non_streaming_image[gpt-4o-case0]",
"parametrize",
"pytestmark",
"gpt-4o-case0",
"test_chat_completion.py",
"openai_api",
"verifications",
"tests",
"llama-stack",
""
],
"metadata": {
"model": "gpt-4o",
"case_id": "case0"
},
"setup": {
"duration": 0.008338792016729712,
"outcome": "passed"
},
"call": {
"duration": 7.009252917021513,
"outcome": "passed"
},
"teardown": {
"duration": 0.0003042910248041153,
"outcome": "passed"
}
},
{
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_image[gpt-4o-mini-case0]",
"lineno": 116,
"outcome": "passed",
"keywords": [
"test_chat_non_streaming_image[gpt-4o-mini-case0]",
"parametrize",
"pytestmark",
"gpt-4o-mini-case0",
"test_chat_completion.py",
"openai_api",
"verifications",
"tests",
"llama-stack",
""
],
"metadata": {
"model": "gpt-4o-mini",
"case_id": "case0"
},
"setup": {
"duration": 0.007238540914840996,
"outcome": "passed"
},
"call": {
"duration": 3.134693874977529,
"outcome": "passed"
},
"teardown": {
"duration": 0.0003104590578004718,
"outcome": "passed"
}
},
{
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_image[gpt-4o-case0]",
"lineno": 135,
"outcome": "passed",
"keywords": [
"test_chat_streaming_image[gpt-4o-case0]",
"parametrize",
"pytestmark",
"gpt-4o-case0",
"test_chat_completion.py",
"openai_api",
"verifications",
"tests",
"llama-stack",
""
],
"metadata": {
"model": "gpt-4o",
"case_id": "case0"
},
"setup": {
"duration": 0.0161851670127362,
"outcome": "passed"
},
"call": {
"duration": 3.0745719589758664,
"outcome": "passed"
},
"teardown": {
"duration": 0.00022620800882577896,
"outcome": "passed"
}
},
{
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_image[gpt-4o-mini-case0]",
"lineno": 135,
"outcome": "passed",
"keywords": [
"test_chat_streaming_image[gpt-4o-mini-case0]",
"parametrize",
"pytestmark",
"gpt-4o-mini-case0",
"test_chat_completion.py",
"openai_api",
"verifications",
"tests",
"llama-stack",
""
],
"metadata": {
"model": "gpt-4o-mini",
"case_id": "case0"
},
"setup": {
"duration": 0.013220708002336323,
"outcome": "passed"
},
"call": {
"duration": 3.624867417034693,
"outcome": "passed"
},
"teardown": {
"duration": 0.00020633300300687551,
"outcome": "passed"
}
},
{
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_structured_output[gpt-4o-calendar]",
"lineno": 159,
"outcome": "passed",
"keywords": [
"test_chat_non_streaming_structured_output[gpt-4o-calendar]",
"parametrize",
"pytestmark",
"gpt-4o-calendar",
"test_chat_completion.py",
"openai_api",
"verifications",
"tests",
"llama-stack",
""
],
"metadata": {
"model": "gpt-4o",
"case_id": "calendar"
},
"setup": {
"duration": 0.017596833989955485,
"outcome": "passed"
},
"call": {
"duration": 1.248568250099197,
"outcome": "passed"
},
"teardown": {
"duration": 0.0004248750628903508,
"outcome": "passed"
}
},
{
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_structured_output[gpt-4o-math]",
"lineno": 159,
"outcome": "passed",
"keywords": [
"test_chat_non_streaming_structured_output[gpt-4o-math]",
"parametrize",
"pytestmark",
"gpt-4o-math",
"test_chat_completion.py",
"openai_api",
"verifications",
"tests",
"llama-stack",
""
],
"metadata": {
"model": "gpt-4o",
"case_id": "math"
},
"setup": {
"duration": 0.01512012502644211,
"outcome": "passed"
},
"call": {
"duration": 8.170285542029887,
"outcome": "passed"
},
"teardown": {
"duration": 0.00043537491001188755,
"outcome": "passed"
}
},
{
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_structured_output[gpt-4o-mini-calendar]",
"lineno": 159,
"outcome": "passed",
"keywords": [
"test_chat_non_streaming_structured_output[gpt-4o-mini-calendar]",
"parametrize",
"pytestmark",
"gpt-4o-mini-calendar",
"test_chat_completion.py",
"openai_api",
"verifications",
"tests",
"llama-stack",
""
],
"metadata": {
"model": "gpt-4o-mini",
"case_id": "calendar"
},
"setup": {
"duration": 0.010376665974035859,
"outcome": "passed"
},
"call": {
"duration": 0.756480542011559,
"outcome": "passed"
},
"teardown": {
"duration": 0.00025695806834846735,
"outcome": "passed"
}
},
{
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_structured_output[gpt-4o-mini-math]",
"lineno": 159,
"outcome": "passed",
"keywords": [
"test_chat_non_streaming_structured_output[gpt-4o-mini-math]",
"parametrize",
"pytestmark",
"gpt-4o-mini-math",
"test_chat_completion.py",
"openai_api",
"verifications",
"tests",
"llama-stack",
""
],
"metadata": {
"model": "gpt-4o-mini",
"case_id": "math"
},
"setup": {
"duration": 0.006846625008620322,
"outcome": "passed"
},
"call": {
"duration": 2.6833953330060467,
"outcome": "passed"
},
"teardown": {
"duration": 0.00022558309137821198,
"outcome": "passed"
}
},
{
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_structured_output[gpt-4o-calendar]",
"lineno": 182,
"outcome": "passed",
"keywords": [
"test_chat_streaming_structured_output[gpt-4o-calendar]",
"parametrize",
"pytestmark",
"gpt-4o-calendar",
"test_chat_completion.py",
"openai_api",
"verifications",
"tests",
"llama-stack",
""
],
"metadata": {
"model": "gpt-4o",
"case_id": "calendar"
},
"setup": {
"duration": 0.009646040969528258,
"outcome": "passed"
},
"call": {
"duration": 0.6117532079806551,
"outcome": "passed"
},
"teardown": {
"duration": 0.00015258300118148327,
"outcome": "passed"
}
},
{
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_structured_output[gpt-4o-math]",
"lineno": 182,
"outcome": "passed",
"keywords": [
"test_chat_streaming_structured_output[gpt-4o-math]",
"parametrize",
"pytestmark",
"gpt-4o-math",
"test_chat_completion.py",
"openai_api",
"verifications",
"tests",
"llama-stack",
""
],
"metadata": {
"model": "gpt-4o",
"case_id": "math"
},
"setup": {
"duration": 0.012024458032101393,
"outcome": "passed"
},
"call": {
"duration": 4.522625041077845,
"outcome": "passed"
},
"teardown": {
"duration": 0.0004230838967487216,
"outcome": "passed"
}
},
{
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_structured_output[gpt-4o-mini-calendar]",
"lineno": 182,
"outcome": "passed",
"keywords": [
"test_chat_streaming_structured_output[gpt-4o-mini-calendar]",
"parametrize",
"pytestmark",
"gpt-4o-mini-calendar",
"test_chat_completion.py",
"openai_api",
"verifications",
"tests",
"llama-stack",
""
],
"metadata": {
"model": "gpt-4o-mini",
"case_id": "calendar"
},
"setup": {
"duration": 0.009566582972183824,
"outcome": "passed"
},
"call": {
"duration": 2.5591942919418216,
"outcome": "passed"
},
"teardown": {
"duration": 0.0007555419579148293,
"outcome": "passed"
}
},
{
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_structured_output[gpt-4o-mini-math]",
"lineno": 182,
"outcome": "passed",
"keywords": [
"test_chat_streaming_structured_output[gpt-4o-mini-math]",
"parametrize",
"pytestmark",
"gpt-4o-mini-math",
"test_chat_completion.py",
"openai_api",
"verifications",
"tests",
"llama-stack",
""
],
"metadata": {
"model": "gpt-4o-mini",
"case_id": "math"
},
"setup": {
"duration": 0.010828875005245209,
"outcome": "passed"
},
"call": {
"duration": 2.495122667052783,
"outcome": "passed"
},
"teardown": {
"duration": 0.0002802090020850301,
"outcome": "passed"
}
},
{
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_tool_calling[gpt-4o-case0]",
"lineno": 204,
"outcome": "passed",
"keywords": [
"test_chat_non_streaming_tool_calling[gpt-4o-case0]",
"parametrize",
"pytestmark",
"gpt-4o-case0",
"test_chat_completion.py",
"openai_api",
"verifications",
"tests",
"llama-stack",
""
],
"metadata": {
"model": "gpt-4o",
"case_id": "case0"
},
"setup": {
"duration": 0.012762792059220374,
"outcome": "passed"
},
"call": {
"duration": 0.5655921660363674,
"outcome": "passed"
},
"teardown": {
"duration": 0.00022304197773337364,
"outcome": "passed"
}
},
{
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_tool_calling[gpt-4o-mini-case0]",
"lineno": 204,
"outcome": "passed",
"keywords": [
"test_chat_non_streaming_tool_calling[gpt-4o-mini-case0]",
"parametrize",
"pytestmark",
"gpt-4o-mini-case0",
"test_chat_completion.py",
"openai_api",
"verifications",
"tests",
"llama-stack",
""
],
"metadata": {
"model": "gpt-4o-mini",
"case_id": "case0"
},
"setup": {
"duration": 0.03188708401285112,
"outcome": "passed"
},
"call": {
"duration": 0.6159415419679135,
"outcome": "passed"
},
"teardown": {
"duration": 0.0005549580091610551,
"outcome": "passed"
}
},
{
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_tool_calling[gpt-4o-case0]",
"lineno": 228,
"outcome": "passed",
"keywords": [
"test_chat_streaming_tool_calling[gpt-4o-case0]",
"parametrize",
"pytestmark",
"gpt-4o-case0",
"test_chat_completion.py",
"openai_api",
"verifications",
"tests",
"llama-stack",
""
],
"metadata": {
"model": "gpt-4o",
"case_id": "case0"
},
"setup": {
"duration": 0.014768208027817309,
"outcome": "passed"
},
"call": {
"duration": 0.47373537498060614,
"outcome": "passed"
},
"teardown": {
"duration": 0.0005811670562252402,
"outcome": "passed"
}
},
{
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_tool_calling[gpt-4o-mini-case0]",
"lineno": 228,
"outcome": "passed",
"keywords": [
"test_chat_streaming_tool_calling[gpt-4o-mini-case0]",
"parametrize",
"pytestmark",
"gpt-4o-mini-case0",
"test_chat_completion.py",
"openai_api",
"verifications",
"tests",
"llama-stack",
""
],
"metadata": {
"model": "gpt-4o-mini",
"case_id": "case0"
},
"setup": {
"duration": 0.010271625011228025,
"outcome": "passed"
},
"call": {
"duration": 0.5656027499353513,
"outcome": "passed"
},
"teardown": {
"duration": 0.0025699170073494315,
"outcome": "passed"
}
}
],
"run_timestamp": 1744328848
}

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large