chore(tests): refactor and move responses tests away from verifications (#3068)

This PR kills the verifications infrastructure, which is no longer used: it was previously relocated to the `llama-stack-evals`
(https://github.com/meta-llama/llama-stack-evals) repository.

The Responses tests used this infrastructure, but that was never strictly
necessary; it was only mildly useful back when @bbrownin introduced the
tests. On Discord, we agreed that these tests can move to our regular
integration test infrastructure.
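
For context, a condensed sketch of the migration, based on the diff below (fixture and helper names are taken directly from the diff; the test body is abbreviated):

```python
# Condensed sketch of the migration, based on the diff in this PR: the old tests
# took (openai_client, model, provider, verification_config) and skipped
# themselves via should_skip_test() and per-provider YAML exclusions; the
# refactored tests take the regular integration fixtures (compat_client,
# text_model_id) and drop the exclusion logic entirely.
import pytest

from .fixtures.load import load_test_cases  # package-relative import, as in the diff

responses_test_cases = load_test_cases("responses")


@pytest.mark.parametrize(
    "case",
    responses_test_cases["test_response_basic"]["test_params"]["case"],
)
def test_response_non_streaming_basic(request, compat_client, text_model_id, case):
    response = compat_client.responses.create(
        model=text_model_id,
        input=case["input"],
        stream=False,
    )
    assert case["output"].lower() in response.output_text.lower().strip()
```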

## Test Plan

Some tests currently fail (although they do run!). I will send a
follow-up PR that makes them all pass.
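
For reviewers unfamiliar with the integration fixtures: the tests now rely on `compat_client` and `text_model_id` from the shared integration conftest, which is not part of this diff. Below is a purely hypothetical sketch of how such fixtures could be wired up; the environment variable names and the `starter` template are assumptions, not the actual conftest.

```python
# Hypothetical sketch only -- the real fixtures live in tests/integration/ and
# are not shown in this PR. Env var names and the "starter" template are assumptions.
import os

import pytest
from openai import OpenAI

from llama_stack import LlamaStackAsLibraryClient


@pytest.fixture(scope="session")
def compat_client():
    """An OpenAI-compatible client: either an HTTP client pointed at a running
    stack, or the in-process library client (which some tests skip on)."""
    base_url = os.environ.get("LLAMA_STACK_BASE_URL")  # assumed variable name
    if base_url:
        return OpenAI(base_url=f"{base_url}/v1/openai/v1", api_key="none")
    client = LlamaStackAsLibraryClient("starter")  # template name is an assumption
    client.initialize()
    return client


@pytest.fixture(scope="session")
def text_model_id():
    # Illustrative default; in practice this comes from the test run configuration.
    return os.environ.get("TEXT_MODEL_ID", "meta-llama/Llama-3.3-70B-Instruct")
```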
Ashwin Bharambe 2025-08-07 13:48:16 -07:00 committed by GitHub
parent 342550c1e2
commit 5f1ddd35e4
36 changed files with 93 additions and 13032 deletions


@ -53,7 +53,7 @@ jobs:
# Get test directories dynamically, excluding non-test directories
# NOTE: we are excluding post_training since the tests take too long
TEST_TYPES=$(find tests/integration -maxdepth 1 -mindepth 1 -type d -printf "%f\n" |
grep -Ev "^(__pycache__|fixtures|test_cases|recordings|post_training)$" |
grep -Ev "^(__pycache__|fixtures|test_cases|recordings|non_ci|post_training)$" |
sort | jq -R -s -c 'split("\n")[:-1]')
echo "test-types=$TEST_TYPES" >> $GITHUB_OUTPUT


@ -1,60 +0,0 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
import pytest
POST_TRAINING_PROVIDER_TYPES = ["remote::nvidia"]
@pytest.mark.integration
@pytest.fixture(scope="session")
def post_training_provider_available(llama_stack_client):
providers = llama_stack_client.providers.list()
post_training_providers = [p for p in providers if p.provider_type in POST_TRAINING_PROVIDER_TYPES]
return len(post_training_providers) > 0
@pytest.mark.integration
def test_post_training_provider_registration(llama_stack_client, post_training_provider_available):
"""Check if post_training is in the api list.
This is a sanity check to ensure the provider is registered."""
if not post_training_provider_available:
pytest.skip("post training provider not available")
providers = llama_stack_client.providers.list()
post_training_providers = [p for p in providers if p.provider_type in POST_TRAINING_PROVIDER_TYPES]
assert len(post_training_providers) > 0
@pytest.mark.integration
def test_get_training_jobs(llama_stack_client, post_training_provider_available):
"""Test listing all training jobs."""
if not post_training_provider_available:
pytest.skip("post training provider not available")
jobs = llama_stack_client.post_training.get_training_jobs()
assert isinstance(jobs, dict)
assert "data" in jobs
assert isinstance(jobs["data"], list)
@pytest.mark.integration
def test_get_training_job_status(llama_stack_client, post_training_provider_available):
"""Test getting status of a specific training job."""
if not post_training_provider_available:
pytest.skip("post training provider not available")
jobs = llama_stack_client.post_training.get_training_jobs()
if not jobs["data"]:
pytest.skip("No training jobs available to check status")
job_uuid = jobs["data"][0]["job_uuid"]
job_status = llama_stack_client.post_training.get_training_job_status(job_uuid=job_uuid)
assert job_status is not None
assert "job_uuid" in job_status
assert "status" in job_status
assert job_status["job_uuid"] == job_uuid


@ -56,16 +56,6 @@ def case_id_generator(case):
return None
def should_skip_test(verification_config, provider, model, test_name_base):
"""Check if a test should be skipped based on config exclusions."""
provider_config = verification_config.get("providers", {}).get(provider)
if not provider_config:
return False # No config for provider, don't skip
exclusions = provider_config.get("test_exclusions", {}).get(model, [])
return test_name_base in exclusions
# Helper to get the base test name from the request object
def get_base_test_name(request):
return request.node.originalname


[binary image file changed: 108 KiB before, 108 KiB after]


[binary image file changed: 148 KiB before, 148 KiB after]


[binary image file changed: 139 KiB before, 139 KiB after]


@ -15,12 +15,9 @@ import pytest
from llama_stack import LlamaStackAsLibraryClient
from llama_stack.core.datatypes import AuthenticationRequiredError
from tests.common.mcp import dependency_tools, make_mcp_server
from tests.verifications.openai_api.fixtures.fixtures import (
case_id_generator,
get_base_test_name,
should_skip_test,
)
from tests.verifications.openai_api.fixtures.load import load_test_cases
from .fixtures.fixtures import case_id_generator
from .fixtures.load import load_test_cases
responses_test_cases = load_test_cases("responses")
@ -55,13 +52,9 @@ def _upload_file(openai_client, name, file_path):
responses_test_cases["test_response_basic"]["test_params"]["case"],
ids=case_id_generator,
)
def test_response_non_streaming_basic(request, openai_client, model, provider, verification_config, case):
test_name_base = get_base_test_name(request)
if should_skip_test(verification_config, provider, model, test_name_base):
pytest.skip(f"Skipping {test_name_base} for model {model} on provider {provider} based on config.")
response = openai_client.responses.create(
model=model,
def test_response_non_streaming_basic(request, compat_client, text_model_id, case):
response = compat_client.responses.create(
model=text_model_id,
input=case["input"],
stream=False,
)
@ -69,11 +62,13 @@ def test_response_non_streaming_basic(request, openai_client, model, provider, v
assert len(output_text) > 0
assert case["output"].lower() in output_text
retrieved_response = openai_client.responses.retrieve(response_id=response.id)
retrieved_response = compat_client.responses.retrieve(response_id=response.id)
assert retrieved_response.output_text == response.output_text
next_response = openai_client.responses.create(
model=model, input="Repeat your previous response in all caps.", previous_response_id=response.id
next_response = compat_client.responses.create(
model=text_model_id,
input="Repeat your previous response in all caps.",
previous_response_id=response.id,
)
next_output_text = next_response.output_text.strip()
assert case["output"].upper() in next_output_text
@ -84,15 +79,11 @@ def test_response_non_streaming_basic(request, openai_client, model, provider, v
responses_test_cases["test_response_basic"]["test_params"]["case"],
ids=case_id_generator,
)
def test_response_streaming_basic(request, openai_client, model, provider, verification_config, case):
test_name_base = get_base_test_name(request)
if should_skip_test(verification_config, provider, model, test_name_base):
pytest.skip(f"Skipping {test_name_base} for model {model} on provider {provider} based on config.")
def test_response_streaming_basic(request, compat_client, text_model_id, case):
import time
response = openai_client.responses.create(
model=model,
response = compat_client.responses.create(
model=text_model_id,
input=case["input"],
stream=True,
)
@ -138,7 +129,7 @@ def test_response_streaming_basic(request, openai_client, model, provider, verif
assert created_index < completed_index, "response.created should come before response.completed"
# Verify stored response matches streamed response
retrieved_response = openai_client.responses.retrieve(response_id=response_id)
retrieved_response = compat_client.responses.retrieve(response_id=response_id)
final_event = events[-1]
assert retrieved_response.output_text == final_event.response.output_text
@ -148,16 +139,12 @@ def test_response_streaming_basic(request, openai_client, model, provider, verif
responses_test_cases["test_response_basic"]["test_params"]["case"],
ids=case_id_generator,
)
def test_response_streaming_incremental_content(request, openai_client, model, provider, verification_config, case):
def test_response_streaming_incremental_content(request, compat_client, text_model_id, case):
"""Test that streaming actually delivers content incrementally, not just at the end."""
test_name_base = get_base_test_name(request)
if should_skip_test(verification_config, provider, model, test_name_base):
pytest.skip(f"Skipping {test_name_base} for model {model} on provider {provider} based on config.")
import time
response = openai_client.responses.create(
model=model,
response = compat_client.responses.create(
model=text_model_id,
input=case["input"],
stream=True,
)
@ -241,15 +228,11 @@ def test_response_streaming_incremental_content(request, openai_client, model, p
responses_test_cases["test_response_multi_turn"]["test_params"]["case"],
ids=case_id_generator,
)
def test_response_non_streaming_multi_turn(request, openai_client, model, provider, verification_config, case):
test_name_base = get_base_test_name(request)
if should_skip_test(verification_config, provider, model, test_name_base):
pytest.skip(f"Skipping {test_name_base} for model {model} on provider {provider} based on config.")
def test_response_non_streaming_multi_turn(request, compat_client, text_model_id, case):
previous_response_id = None
for turn in case["turns"]:
response = openai_client.responses.create(
model=model,
response = compat_client.responses.create(
model=text_model_id,
input=turn["input"],
previous_response_id=previous_response_id,
tools=turn["tools"] if "tools" in turn else None,
@ -264,13 +247,9 @@ def test_response_non_streaming_multi_turn(request, openai_client, model, provid
responses_test_cases["test_response_web_search"]["test_params"]["case"],
ids=case_id_generator,
)
def test_response_non_streaming_web_search(request, openai_client, model, provider, verification_config, case):
test_name_base = get_base_test_name(request)
if should_skip_test(verification_config, provider, model, test_name_base):
pytest.skip(f"Skipping {test_name_base} for model {model} on provider {provider} based on config.")
response = openai_client.responses.create(
model=model,
def test_response_non_streaming_web_search(request, compat_client, text_model_id, case):
response = compat_client.responses.create(
model=text_model_id,
input=case["input"],
tools=case["tools"],
stream=False,
@ -290,17 +269,11 @@ def test_response_non_streaming_web_search(request, openai_client, model, provid
responses_test_cases["test_response_file_search"]["test_params"]["case"],
ids=case_id_generator,
)
def test_response_non_streaming_file_search(
request, openai_client, model, provider, verification_config, tmp_path, case
):
if isinstance(openai_client, LlamaStackAsLibraryClient):
def test_response_non_streaming_file_search(request, compat_client, text_model_id, tmp_path, case):
if isinstance(compat_client, LlamaStackAsLibraryClient):
pytest.skip("Responses API file search is not yet supported in library client.")
test_name_base = get_base_test_name(request)
if should_skip_test(verification_config, provider, model, test_name_base):
pytest.skip(f"Skipping {test_name_base} for model {model} on provider {provider} based on config.")
vector_store = _new_vector_store(openai_client, "test_vector_store")
vector_store = _new_vector_store(compat_client, "test_vector_store")
if "file_content" in case:
file_name = "test_response_non_streaming_file_search.txt"
@ -312,10 +285,10 @@ def test_response_non_streaming_file_search(
else:
raise ValueError(f"No file content or path provided for case {case['case_id']}")
file_response = _upload_file(openai_client, file_name, file_path)
file_response = _upload_file(compat_client, file_name, file_path)
# Attach our file to the vector store
file_attach_response = openai_client.vector_stores.files.create(
file_attach_response = compat_client.vector_stores.files.create(
vector_store_id=vector_store.id,
file_id=file_response.id,
)
@ -323,7 +296,7 @@ def test_response_non_streaming_file_search(
# Wait for the file to be attached
while file_attach_response.status == "in_progress":
time.sleep(0.1)
file_attach_response = openai_client.vector_stores.files.retrieve(
file_attach_response = compat_client.vector_stores.files.retrieve(
vector_store_id=vector_store.id,
file_id=file_response.id,
)
@ -337,8 +310,8 @@ def test_response_non_streaming_file_search(
tool["vector_store_ids"] = [vector_store.id]
# Create the response request, which should query our vector store
response = openai_client.responses.create(
model=model,
response = compat_client.responses.create(
model=text_model_id,
input=case["input"],
tools=tools,
stream=False,
@ -358,21 +331,15 @@ def test_response_non_streaming_file_search(
assert case["output"].lower() in response.output_text.lower().strip()
def test_response_non_streaming_file_search_empty_vector_store(
request, openai_client, model, provider, verification_config
):
if isinstance(openai_client, LlamaStackAsLibraryClient):
def test_response_non_streaming_file_search_empty_vector_store(request, compat_client, text_model_id):
if isinstance(compat_client, LlamaStackAsLibraryClient):
pytest.skip("Responses API file search is not yet supported in library client.")
test_name_base = get_base_test_name(request)
if should_skip_test(verification_config, provider, model, test_name_base):
pytest.skip(f"Skipping {test_name_base} for model {model} on provider {provider} based on config.")
vector_store = _new_vector_store(openai_client, "test_vector_store")
vector_store = _new_vector_store(compat_client, "test_vector_store")
# Create the response request, which should query our vector store
response = openai_client.responses.create(
model=model,
response = compat_client.responses.create(
model=text_model_id,
input="How many experts does the Llama 4 Maverick model have?",
tools=[{"type": "file_search", "vector_store_ids": [vector_store.id]}],
stream=False,
@ -395,19 +362,15 @@ def test_response_non_streaming_file_search_empty_vector_store(
responses_test_cases["test_response_mcp_tool"]["test_params"]["case"],
ids=case_id_generator,
)
def test_response_non_streaming_mcp_tool(request, openai_client, model, provider, verification_config, case):
test_name_base = get_base_test_name(request)
if should_skip_test(verification_config, provider, model, test_name_base):
pytest.skip(f"Skipping {test_name_base} for model {model} on provider {provider} based on config.")
def test_response_non_streaming_mcp_tool(request, compat_client, text_model_id, case):
with make_mcp_server() as mcp_server_info:
tools = case["tools"]
for tool in tools:
if tool["type"] == "mcp":
tool["server_url"] = mcp_server_info["server_url"]
response = openai_client.responses.create(
model=model,
response = compat_client.responses.create(
model=text_model_id,
input=case["input"],
tools=tools,
stream=False,
@ -418,7 +381,7 @@ def test_response_non_streaming_mcp_tool(request, openai_client, model, provider
assert list_tools.type == "mcp_list_tools"
assert list_tools.server_label == "localmcp"
assert len(list_tools.tools) == 2
assert {t["name"] for t in list_tools.tools} == {"get_boiling_point", "greet_everyone"}
assert {t.name for t in list_tools.tools} == {"get_boiling_point", "greet_everyone"}
call = response.output[1]
assert call.type == "mcp_call"
@ -440,12 +403,12 @@ def test_response_non_streaming_mcp_tool(request, openai_client, model, provider
exc_type = (
AuthenticationRequiredError
if isinstance(openai_client, LlamaStackAsLibraryClient)
if isinstance(compat_client, LlamaStackAsLibraryClient)
else (httpx.HTTPStatusError, openai.AuthenticationError)
)
with pytest.raises(exc_type):
openai_client.responses.create(
model=model,
compat_client.responses.create(
model=text_model_id,
input=case["input"],
tools=tools,
stream=False,
@ -456,8 +419,8 @@ def test_response_non_streaming_mcp_tool(request, openai_client, model, provider
tool["server_url"] = mcp_server_info["server_url"]
tool["headers"] = {"Authorization": "Bearer test-token"}
response = openai_client.responses.create(
model=model,
response = compat_client.responses.create(
model=text_model_id,
input=case["input"],
tools=tools,
stream=False,
@ -470,13 +433,9 @@ def test_response_non_streaming_mcp_tool(request, openai_client, model, provider
responses_test_cases["test_response_custom_tool"]["test_params"]["case"],
ids=case_id_generator,
)
def test_response_non_streaming_custom_tool(request, openai_client, model, provider, verification_config, case):
test_name_base = get_base_test_name(request)
if should_skip_test(verification_config, provider, model, test_name_base):
pytest.skip(f"Skipping {test_name_base} for model {model} on provider {provider} based on config.")
response = openai_client.responses.create(
model=model,
def test_response_non_streaming_custom_tool(request, compat_client, text_model_id, case):
response = compat_client.responses.create(
model=text_model_id,
input=case["input"],
tools=case["tools"],
stream=False,
@ -492,13 +451,9 @@ def test_response_non_streaming_custom_tool(request, openai_client, model, provi
responses_test_cases["test_response_image"]["test_params"]["case"],
ids=case_id_generator,
)
def test_response_non_streaming_image(request, openai_client, model, provider, verification_config, case):
test_name_base = get_base_test_name(request)
if should_skip_test(verification_config, provider, model, test_name_base):
pytest.skip(f"Skipping {test_name_base} for model {model} on provider {provider} based on config.")
response = openai_client.responses.create(
model=model,
def test_response_non_streaming_image(request, compat_client, text_model_id, case):
response = compat_client.responses.create(
model=text_model_id,
input=case["input"],
stream=False,
)
@ -511,15 +466,11 @@ def test_response_non_streaming_image(request, openai_client, model, provider, v
responses_test_cases["test_response_multi_turn_image"]["test_params"]["case"],
ids=case_id_generator,
)
def test_response_non_streaming_multi_turn_image(request, openai_client, model, provider, verification_config, case):
test_name_base = get_base_test_name(request)
if should_skip_test(verification_config, provider, model, test_name_base):
pytest.skip(f"Skipping {test_name_base} for model {model} on provider {provider} based on config.")
def test_response_non_streaming_multi_turn_image(request, compat_client, text_model_id, case):
previous_response_id = None
for turn in case["turns"]:
response = openai_client.responses.create(
model=model,
response = compat_client.responses.create(
model=text_model_id,
input=turn["input"],
previous_response_id=previous_response_id,
tools=turn["tools"] if "tools" in turn else None,
@ -534,14 +485,8 @@ def test_response_non_streaming_multi_turn_image(request, openai_client, model,
responses_test_cases["test_response_multi_turn_tool_execution"]["test_params"]["case"],
ids=case_id_generator,
)
def test_response_non_streaming_multi_turn_tool_execution(
request, openai_client, model, provider, verification_config, case
):
def test_response_non_streaming_multi_turn_tool_execution(request, compat_client, text_model_id, case):
"""Test multi-turn tool execution where multiple MCP tool calls are performed in sequence."""
test_name_base = get_base_test_name(request)
if should_skip_test(verification_config, provider, model, test_name_base):
pytest.skip(f"Skipping {test_name_base} for model {model} on provider {provider} based on config.")
with make_mcp_server(tools=dependency_tools()) as mcp_server_info:
tools = case["tools"]
# Replace the placeholder URL with the actual server URL
@ -549,14 +494,15 @@ def test_response_non_streaming_multi_turn_tool_execution(
if tool["type"] == "mcp" and tool["server_url"] == "<FILLED_BY_TEST_RUNNER>":
tool["server_url"] = mcp_server_info["server_url"]
response = openai_client.responses.create(
response = compat_client.responses.create(
input=case["input"],
model=model,
model=text_model_id,
tools=tools,
)
# Verify we have MCP tool calls in the output
mcp_list_tools = [output for output in response.output if output.type == "mcp_list_tools"]
mcp_calls = [output for output in response.output if output.type == "mcp_call"]
message_outputs = [output for output in response.output if output.type == "message"]
@ -571,7 +517,7 @@ def test_response_non_streaming_multi_turn_tool_execution(
"get_experiment_id",
"get_experiment_results",
}
assert {t["name"] for t in mcp_list_tools[0].tools} == expected_tool_names
assert {t.name for t in mcp_list_tools[0].tools} == expected_tool_names
assert len(mcp_calls) >= 1, f"Expected at least 1 mcp_call, got {len(mcp_calls)}"
for mcp_call in mcp_calls:
@ -595,14 +541,8 @@ def test_response_non_streaming_multi_turn_tool_execution(
responses_test_cases["test_response_multi_turn_tool_execution_streaming"]["test_params"]["case"],
ids=case_id_generator,
)
async def test_response_streaming_multi_turn_tool_execution(
request, openai_client, model, provider, verification_config, case
):
async def test_response_streaming_multi_turn_tool_execution(request, compat_client, text_model_id, case):
"""Test streaming multi-turn tool execution where multiple MCP tool calls are performed in sequence."""
test_name_base = get_base_test_name(request)
if should_skip_test(verification_config, provider, model, test_name_base):
pytest.skip(f"Skipping {test_name_base} for model {model} on provider {provider} based on config.")
with make_mcp_server(tools=dependency_tools()) as mcp_server_info:
tools = case["tools"]
# Replace the placeholder URL with the actual server URL
@ -610,15 +550,15 @@ async def test_response_streaming_multi_turn_tool_execution(
if tool["type"] == "mcp" and tool["server_url"] == "<FILLED_BY_TEST_RUNNER>":
tool["server_url"] = mcp_server_info["server_url"]
stream = openai_client.responses.create(
stream = compat_client.responses.create(
input=case["input"],
model=model,
model=text_model_id,
tools=tools,
stream=True,
)
chunks = []
async for chunk in stream:
for chunk in stream:
chunks.append(chunk)
# Should have at least response.created and response.completed
@ -653,7 +593,7 @@ async def test_response_streaming_multi_turn_tool_execution(
"get_experiment_id",
"get_experiment_results",
}
assert {t["name"] for t in mcp_list_tools[0].tools} == expected_tool_names
assert {t.name for t in mcp_list_tools[0].tools} == expected_tool_names
# Should have at least 1 MCP call (the model should call at least one tool)
assert len(mcp_calls) >= 1, f"Expected at least 1 mcp_call, got {len(mcp_calls)}"
@ -694,17 +634,13 @@ async def test_response_streaming_multi_turn_tool_execution(
},
],
)
def test_response_text_format(request, openai_client, model, provider, verification_config, text_format):
if isinstance(openai_client, LlamaStackAsLibraryClient):
def test_response_text_format(request, compat_client, text_model_id, text_format):
if isinstance(compat_client, LlamaStackAsLibraryClient):
pytest.skip("Responses API text format is not yet supported in library client.")
test_name_base = get_base_test_name(request)
if should_skip_test(verification_config, provider, model, test_name_base):
pytest.skip(f"Skipping {test_name_base} for model {model} on provider {provider} based on config.")
stream = False
response = openai_client.responses.create(
model=model,
response = compat_client.responses.create(
model=text_model_id,
input="What is the capital of France?",
stream=stream,
text={"format": text_format},
@ -717,16 +653,12 @@ def test_response_text_format(request, openai_client, model, provider, verificat
@pytest.fixture
def vector_store_with_filtered_files(request, openai_client, model, provider, verification_config, tmp_path_factory):
def vector_store_with_filtered_files(request, compat_client, text_model_id, tmp_path_factory):
"""Create a vector store with multiple files that have different attributes for filtering tests."""
if isinstance(openai_client, LlamaStackAsLibraryClient):
if isinstance(compat_client, LlamaStackAsLibraryClient):
pytest.skip("Responses API file search is not yet supported in library client.")
test_name_base = get_base_test_name(request)
if should_skip_test(verification_config, provider, model, test_name_base):
pytest.skip(f"Skipping {test_name_base} for model {model} on provider {provider} based on config.")
vector_store = _new_vector_store(openai_client, "test_vector_store_with_filters")
vector_store = _new_vector_store(compat_client, "test_vector_store_with_filters")
tmp_path = tmp_path_factory.mktemp("filter_test_files")
# Create multiple files with different attributes
@ -776,18 +708,18 @@ def vector_store_with_filtered_files(request, openai_client, model, provider, ve
file_path.write_text(file_data["content"])
# Upload file
file_response = _upload_file(openai_client, file_data["name"], str(file_path))
file_response = _upload_file(compat_client, file_data["name"], str(file_path))
file_ids.append(file_response.id)
# Attach file to vector store with attributes
file_attach_response = openai_client.vector_stores.files.create(
file_attach_response = compat_client.vector_stores.files.create(
vector_store_id=vector_store.id, file_id=file_response.id, attributes=file_data["attributes"]
)
# Wait for attachment
while file_attach_response.status == "in_progress":
time.sleep(0.1)
file_attach_response = openai_client.vector_stores.files.retrieve(
file_attach_response = compat_client.vector_stores.files.retrieve(
vector_store_id=vector_store.id,
file_id=file_response.id,
)
@ -797,17 +729,17 @@ def vector_store_with_filtered_files(request, openai_client, model, provider, ve
# Cleanup: delete vector store and files
try:
openai_client.vector_stores.delete(vector_store_id=vector_store.id)
compat_client.vector_stores.delete(vector_store_id=vector_store.id)
for file_id in file_ids:
try:
openai_client.files.delete(file_id=file_id)
compat_client.files.delete(file_id=file_id)
except Exception:
pass # File might already be deleted
except Exception:
pass # Best effort cleanup
def test_response_file_search_filter_by_region(openai_client, model, vector_store_with_filtered_files):
def test_response_file_search_filter_by_region(compat_client, text_model_id, vector_store_with_filtered_files):
"""Test file search with region equality filter."""
tools = [
{
@ -817,8 +749,8 @@ def test_response_file_search_filter_by_region(openai_client, model, vector_stor
}
]
response = openai_client.responses.create(
model=model,
response = compat_client.responses.create(
model=text_model_id,
input="What are the updates from the US region?",
tools=tools,
stream=False,
@ -838,7 +770,7 @@ def test_response_file_search_filter_by_region(openai_client, model, vector_stor
assert "asia" not in result.text.lower()
def test_response_file_search_filter_by_category(openai_client, model, vector_store_with_filtered_files):
def test_response_file_search_filter_by_category(compat_client, text_model_id, vector_store_with_filtered_files):
"""Test file search with category equality filter."""
tools = [
{
@ -848,8 +780,8 @@ def test_response_file_search_filter_by_category(openai_client, model, vector_st
}
]
response = openai_client.responses.create(
model=model,
response = compat_client.responses.create(
model=text_model_id,
input="Show me all marketing reports",
tools=tools,
stream=False,
@ -868,7 +800,7 @@ def test_response_file_search_filter_by_category(openai_client, model, vector_st
assert "revenue figures" not in result.text.lower()
def test_response_file_search_filter_by_date_range(openai_client, model, vector_store_with_filtered_files):
def test_response_file_search_filter_by_date_range(compat_client, text_model_id, vector_store_with_filtered_files):
"""Test file search with date range filter using compound AND."""
tools = [
{
@ -892,8 +824,8 @@ def test_response_file_search_filter_by_date_range(openai_client, model, vector_
}
]
response = openai_client.responses.create(
model=model,
response = compat_client.responses.create(
model=text_model_id,
input="What happened in Q1 2023?",
tools=tools,
stream=False,
@ -911,7 +843,7 @@ def test_response_file_search_filter_by_date_range(openai_client, model, vector_
assert "q3" not in result.text.lower()
def test_response_file_search_filter_compound_and(openai_client, model, vector_store_with_filtered_files):
def test_response_file_search_filter_compound_and(compat_client, text_model_id, vector_store_with_filtered_files):
"""Test file search with compound AND filter (region AND category)."""
tools = [
{
@ -927,8 +859,8 @@ def test_response_file_search_filter_compound_and(openai_client, model, vector_s
}
]
response = openai_client.responses.create(
model=model,
response = compat_client.responses.create(
model=text_model_id,
input="What are the engineering updates from the US?",
tools=tools,
stream=False,
@ -947,7 +879,7 @@ def test_response_file_search_filter_compound_and(openai_client, model, vector_s
assert "promotional" not in result.text.lower() and "revenue" not in result.text.lower()
def test_response_file_search_filter_compound_or(openai_client, model, vector_store_with_filtered_files):
def test_response_file_search_filter_compound_or(compat_client, text_model_id, vector_store_with_filtered_files):
"""Test file search with compound OR filter (marketing OR sales)."""
tools = [
{
@ -963,8 +895,8 @@ def test_response_file_search_filter_compound_or(openai_client, model, vector_st
}
]
response = openai_client.responses.create(
model=model,
response = compat_client.responses.create(
model=text_model_id,
input="Show me marketing and sales documents",
tools=tools,
stream=False,


@ -1,79 +0,0 @@
# Llama Stack Verifications
Llama Stack Verifications provide standardized test suites to ensure API compatibility and behavior consistency across different LLM providers. These tests help verify that different models and providers implement the expected interfaces and behaviors correctly.
## Overview
This framework allows you to run the same set of verification tests against different LLM providers' OpenAI-compatible endpoints (Fireworks, Together, Groq, Cerebras, etc., and OpenAI itself) to ensure they meet the expected behavior and interface standards.
## Features
The verification suite currently tests the following in both streaming and non-streaming modes:
- Basic chat completions
- Image input capabilities
- Structured JSON output formatting
- Tool calling functionality
## Report
The latest report can be found at [REPORT.md](REPORT.md).
To update the report, ensure you have the API keys set,
```bash
export OPENAI_API_KEY=<your_openai_api_key>
export FIREWORKS_API_KEY=<your_fireworks_api_key>
export TOGETHER_API_KEY=<your_together_api_key>
```
then run
```bash
uv run python tests/verifications/generate_report.py --run-tests
```
## Running Tests
To run the verification tests, use pytest with the following parameters:
```bash
cd llama-stack
pytest tests/verifications/openai_api --provider=<provider-name>
```
Example:
```bash
# Run all tests
pytest tests/verifications/openai_api --provider=together
# Only run tests with Llama 4 models
pytest tests/verifications/openai_api --provider=together -k 'Llama-4'
```
### Parameters
- `--provider`: The provider name (openai, fireworks, together, groq, cerebras, etc.)
- `--base-url`: The base URL for the provider's API (optional - defaults to the standard URL for the specified provider)
- `--api-key`: Your API key for the provider (optional - defaults to the standard API_KEY name for the specified provider)
## Supported Providers
The verification suite supports any provider with an OpenAI compatible endpoint.
See `tests/verifications/conf/` for the list of supported providers.
To run on a new provider, simply add a new yaml file to the `conf/` directory with the provider config. See `tests/verifications/conf/together.yaml` for an example.
## Adding New Test Cases
To add new test cases, create appropriate JSON files in the `openai_api/fixtures/test_cases/` directory following the existing patterns.
## Structure
- `__init__.py` - Marks the directory as a Python package
- `conf/` - Provider-specific configuration files
- `openai_api/` - Tests specific to OpenAI-compatible APIs
- `fixtures/` - Test fixtures and utilities
- `fixtures.py` - Provider-specific fixtures
- `load.py` - Utilities for loading test cases
- `test_cases/` - JSON test case definitions
- `test_chat_completion.py` - Tests for chat completion APIs


@ -1,232 +0,0 @@
# Test Results Report
*Generated on: 2025-04-17 12:42:33*
*This report was generated by running `python tests/verifications/generate_report.py`*
## Legend
- ✅ - Test passed
- ❌ - Test failed
- ⚪ - Test not applicable or not run for this model
## Summary
| Provider | Pass Rate | Tests Passed | Total Tests |
| --- | --- | --- | --- |
| Meta_reference | 100.0% | 28 | 28 |
| Together | 50.0% | 40 | 80 |
| Fireworks | 50.0% | 40 | 80 |
| Openai | 100.0% | 56 | 56 |
## Meta_reference
*Tests run on: 2025-04-17 12:37:11*
```bash
# Run all tests for this provider:
pytest tests/verifications/openai_api/test_chat_completion.py --provider=meta_reference -v
# Example: Run only the 'stream=False' case of test_chat_multi_turn_multiple_images:
pytest tests/verifications/openai_api/test_chat_completion.py --provider=meta_reference -k "test_chat_multi_turn_multiple_images and stream=False"
```
**Model Key (Meta_reference)**
| Display Name | Full Model ID |
| --- | --- |
| Llama-4-Scout-Instruct | `meta-llama/Llama-4-Scout-17B-16E-Instruct` |
| Test | Llama-4-Scout-Instruct |
| --- | --- |
| test_chat_multi_turn_multiple_images (stream=False) | ✅ |
| test_chat_multi_turn_multiple_images (stream=True) | ✅ |
| test_chat_non_streaming_basic (earth) | ✅ |
| test_chat_non_streaming_basic (saturn) | ✅ |
| test_chat_non_streaming_image | ✅ |
| test_chat_non_streaming_multi_turn_tool_calling (add_product_tool) | ✅ |
| test_chat_non_streaming_multi_turn_tool_calling (compare_monthly_expense_tool) | ✅ |
| test_chat_non_streaming_multi_turn_tool_calling (get_then_create_event_tool) | ✅ |
| test_chat_non_streaming_multi_turn_tool_calling (text_then_weather_tool) | ✅ |
| test_chat_non_streaming_multi_turn_tool_calling (weather_tool_then_text) | ✅ |
| test_chat_non_streaming_structured_output (calendar) | ✅ |
| test_chat_non_streaming_structured_output (math) | ✅ |
| test_chat_non_streaming_tool_calling | ✅ |
| test_chat_non_streaming_tool_choice_none | ✅ |
| test_chat_non_streaming_tool_choice_required | ✅ |
| test_chat_streaming_basic (earth) | ✅ |
| test_chat_streaming_basic (saturn) | ✅ |
| test_chat_streaming_image | ✅ |
| test_chat_streaming_multi_turn_tool_calling (add_product_tool) | ✅ |
| test_chat_streaming_multi_turn_tool_calling (compare_monthly_expense_tool) | ✅ |
| test_chat_streaming_multi_turn_tool_calling (get_then_create_event_tool) | ✅ |
| test_chat_streaming_multi_turn_tool_calling (text_then_weather_tool) | ✅ |
| test_chat_streaming_multi_turn_tool_calling (weather_tool_then_text) | ✅ |
| test_chat_streaming_structured_output (calendar) | ✅ |
| test_chat_streaming_structured_output (math) | ✅ |
| test_chat_streaming_tool_calling | ✅ |
| test_chat_streaming_tool_choice_none | ✅ |
| test_chat_streaming_tool_choice_required | ✅ |
## Together
*Tests run on: 2025-04-17 12:27:45*
```bash
# Run all tests for this provider:
pytest tests/verifications/openai_api/test_chat_completion.py --provider=together -v
# Example: Run only the 'stream=False' case of test_chat_multi_turn_multiple_images:
pytest tests/verifications/openai_api/test_chat_completion.py --provider=together -k "test_chat_multi_turn_multiple_images and stream=False"
```
**Model Key (Together)**
| Display Name | Full Model ID |
| --- | --- |
| Llama-3.3-70B-Instruct | `meta-llama/Llama-3.3-70B-Instruct-Turbo` |
| Llama-4-Maverick-Instruct | `meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8` |
| Llama-4-Scout-Instruct | `meta-llama/Llama-4-Scout-17B-16E-Instruct` |
| Test | Llama-3.3-70B-Instruct | Llama-4-Maverick-Instruct | Llama-4-Scout-Instruct |
| --- | --- | --- | --- |
| test_chat_multi_turn_multiple_images (stream=False) | ⚪ | ✅ | ✅ |
| test_chat_multi_turn_multiple_images (stream=True) | ⚪ | ❌ | ❌ |
| test_chat_non_streaming_basic (earth) | ✅ | ✅ | ✅ |
| test_chat_non_streaming_basic (saturn) | ✅ | ✅ | ✅ |
| test_chat_non_streaming_image | ⚪ | ✅ | ✅ |
| test_chat_non_streaming_multi_turn_tool_calling (add_product_tool) | ✅ | ✅ | ✅ |
| test_chat_non_streaming_multi_turn_tool_calling (compare_monthly_expense_tool) | ✅ | ✅ | ✅ |
| test_chat_non_streaming_multi_turn_tool_calling (get_then_create_event_tool) | ✅ | ❌ | ✅ |
| test_chat_non_streaming_multi_turn_tool_calling (text_then_weather_tool) | ❌ | ❌ | ❌ |
| test_chat_non_streaming_multi_turn_tool_calling (weather_tool_then_text) | ✅ | ✅ | ✅ |
| test_chat_non_streaming_structured_output (calendar) | ✅ | ✅ | ✅ |
| test_chat_non_streaming_structured_output (math) | ✅ | ✅ | ✅ |
| test_chat_non_streaming_tool_calling | ✅ | ✅ | ✅ |
| test_chat_non_streaming_tool_choice_none | ❌ | ❌ | ❌ |
| test_chat_non_streaming_tool_choice_required | ✅ | ✅ | ✅ |
| test_chat_streaming_basic (earth) | ✅ | ❌ | ❌ |
| test_chat_streaming_basic (saturn) | ✅ | ❌ | ❌ |
| test_chat_streaming_image | ⚪ | ❌ | ❌ |
| test_chat_streaming_multi_turn_tool_calling (add_product_tool) | ✅ | ❌ | ❌ |
| test_chat_streaming_multi_turn_tool_calling (compare_monthly_expense_tool) | ❌ | ❌ | ❌ |
| test_chat_streaming_multi_turn_tool_calling (get_then_create_event_tool) | ❌ | ❌ | ❌ |
| test_chat_streaming_multi_turn_tool_calling (text_then_weather_tool) | ❌ | ❌ | ❌ |
| test_chat_streaming_multi_turn_tool_calling (weather_tool_then_text) | ❌ | ❌ | ❌ |
| test_chat_streaming_structured_output (calendar) | ✅ | ❌ | ❌ |
| test_chat_streaming_structured_output (math) | ✅ | ❌ | ❌ |
| test_chat_streaming_tool_calling | ✅ | ❌ | ❌ |
| test_chat_streaming_tool_choice_none | ❌ | ❌ | ❌ |
| test_chat_streaming_tool_choice_required | ✅ | ❌ | ❌ |
## Fireworks
*Tests run on: 2025-04-17 12:29:53*
```bash
# Run all tests for this provider:
pytest tests/verifications/openai_api/test_chat_completion.py --provider=fireworks -v
# Example: Run only the 'stream=False' case of test_chat_multi_turn_multiple_images:
pytest tests/verifications/openai_api/test_chat_completion.py --provider=fireworks -k "test_chat_multi_turn_multiple_images and stream=False"
```
**Model Key (Fireworks)**
| Display Name | Full Model ID |
| --- | --- |
| Llama-3.3-70B-Instruct | `accounts/fireworks/models/llama-v3p3-70b-instruct` |
| Llama-4-Maverick-Instruct | `accounts/fireworks/models/llama4-maverick-instruct-basic` |
| Llama-4-Scout-Instruct | `accounts/fireworks/models/llama4-scout-instruct-basic` |
| Test | Llama-3.3-70B-Instruct | Llama-4-Maverick-Instruct | Llama-4-Scout-Instruct |
| --- | --- | --- | --- |
| test_chat_multi_turn_multiple_images (stream=False) | ⚪ | ✅ | ✅ |
| test_chat_multi_turn_multiple_images (stream=True) | ⚪ | ✅ | ✅ |
| test_chat_non_streaming_basic (earth) | ✅ | ✅ | ✅ |
| test_chat_non_streaming_basic (saturn) | ✅ | ✅ | ✅ |
| test_chat_non_streaming_image | ⚪ | ✅ | ✅ |
| test_chat_non_streaming_multi_turn_tool_calling (add_product_tool) | ❌ | ❌ | ❌ |
| test_chat_non_streaming_multi_turn_tool_calling (compare_monthly_expense_tool) | ❌ | ❌ | ❌ |
| test_chat_non_streaming_multi_turn_tool_calling (get_then_create_event_tool) | ❌ | ❌ | ❌ |
| test_chat_non_streaming_multi_turn_tool_calling (text_then_weather_tool) | ❌ | ❌ | ❌ |
| test_chat_non_streaming_multi_turn_tool_calling (weather_tool_then_text) | ❌ | ❌ | ❌ |
| test_chat_non_streaming_structured_output (calendar) | ✅ | ✅ | ✅ |
| test_chat_non_streaming_structured_output (math) | ✅ | ✅ | ✅ |
| test_chat_non_streaming_tool_calling | ❌ | ❌ | ❌ |
| test_chat_non_streaming_tool_choice_none | ✅ | ✅ | ✅ |
| test_chat_non_streaming_tool_choice_required | ✅ | ❌ | ❌ |
| test_chat_streaming_basic (earth) | ✅ | ✅ | ✅ |
| test_chat_streaming_basic (saturn) | ✅ | ✅ | ✅ |
| test_chat_streaming_image | ⚪ | ✅ | ✅ |
| test_chat_streaming_multi_turn_tool_calling (add_product_tool) | ❌ | ❌ | ❌ |
| test_chat_streaming_multi_turn_tool_calling (compare_monthly_expense_tool) | ❌ | ❌ | ❌ |
| test_chat_streaming_multi_turn_tool_calling (get_then_create_event_tool) | ❌ | ❌ | ❌ |
| test_chat_streaming_multi_turn_tool_calling (text_then_weather_tool) | ❌ | ❌ | ❌ |
| test_chat_streaming_multi_turn_tool_calling (weather_tool_then_text) | ❌ | ❌ | ❌ |
| test_chat_streaming_structured_output (calendar) | ✅ | ✅ | ✅ |
| test_chat_streaming_structured_output (math) | ✅ | ✅ | ✅ |
| test_chat_streaming_tool_calling | ❌ | ❌ | ❌ |
| test_chat_streaming_tool_choice_none | ✅ | ✅ | ✅ |
| test_chat_streaming_tool_choice_required | ✅ | ❌ | ❌ |
## Openai
*Tests run on: 2025-04-17 12:34:08*
```bash
# Run all tests for this provider:
pytest tests/verifications/openai_api/test_chat_completion.py --provider=openai -v
# Example: Run only the 'stream=False' case of test_chat_multi_turn_multiple_images:
pytest tests/verifications/openai_api/test_chat_completion.py --provider=openai -k "test_chat_multi_turn_multiple_images and stream=False"
```
**Model Key (Openai)**
| Display Name | Full Model ID |
| --- | --- |
| gpt-4o | `gpt-4o` |
| gpt-4o-mini | `gpt-4o-mini` |
| Test | gpt-4o | gpt-4o-mini |
| --- | --- | --- |
| test_chat_multi_turn_multiple_images (stream=False) | ✅ | ✅ |
| test_chat_multi_turn_multiple_images (stream=True) | ✅ | ✅ |
| test_chat_non_streaming_basic (earth) | ✅ | ✅ |
| test_chat_non_streaming_basic (saturn) | ✅ | ✅ |
| test_chat_non_streaming_image | ✅ | ✅ |
| test_chat_non_streaming_multi_turn_tool_calling (add_product_tool) | ✅ | ✅ |
| test_chat_non_streaming_multi_turn_tool_calling (compare_monthly_expense_tool) | ✅ | ✅ |
| test_chat_non_streaming_multi_turn_tool_calling (get_then_create_event_tool) | ✅ | ✅ |
| test_chat_non_streaming_multi_turn_tool_calling (text_then_weather_tool) | ✅ | ✅ |
| test_chat_non_streaming_multi_turn_tool_calling (weather_tool_then_text) | ✅ | ✅ |
| test_chat_non_streaming_structured_output (calendar) | ✅ | ✅ |
| test_chat_non_streaming_structured_output (math) | ✅ | ✅ |
| test_chat_non_streaming_tool_calling | ✅ | ✅ |
| test_chat_non_streaming_tool_choice_none | ✅ | ✅ |
| test_chat_non_streaming_tool_choice_required | ✅ | ✅ |
| test_chat_streaming_basic (earth) | ✅ | ✅ |
| test_chat_streaming_basic (saturn) | ✅ | ✅ |
| test_chat_streaming_image | ✅ | ✅ |
| test_chat_streaming_multi_turn_tool_calling (add_product_tool) | ✅ | ✅ |
| test_chat_streaming_multi_turn_tool_calling (compare_monthly_expense_tool) | ✅ | ✅ |
| test_chat_streaming_multi_turn_tool_calling (get_then_create_event_tool) | ✅ | ✅ |
| test_chat_streaming_multi_turn_tool_calling (text_then_weather_tool) | ✅ | ✅ |
| test_chat_streaming_multi_turn_tool_calling (weather_tool_then_text) | ✅ | ✅ |
| test_chat_streaming_structured_output (calendar) | ✅ | ✅ |
| test_chat_streaming_structured_output (math) | ✅ | ✅ |
| test_chat_streaming_tool_calling | ✅ | ✅ |
| test_chat_streaming_tool_choice_none | ✅ | ✅ |
| test_chat_streaming_tool_choice_required | ✅ | ✅ |


@ -1,11 +0,0 @@
base_url: https://api.cerebras.ai/v1
api_key_var: CEREBRAS_API_KEY
models:
- llama-3.3-70b
model_display_names:
llama-3.3-70b: Llama-3.3-70B-Instruct
test_exclusions:
llama-3.3-70b:
- test_chat_non_streaming_image
- test_chat_streaming_image
- test_chat_multi_turn_multiple_images


@ -1,17 +0,0 @@
base_url: http://localhost:8321/v1/openai/v1
api_key_var: FIREWORKS_API_KEY
models:
- fireworks/llama-v3p3-70b-instruct
- fireworks/llama4-scout-instruct-basic
- fireworks/llama4-maverick-instruct-basic
model_display_names:
fireworks/llama-v3p3-70b-instruct: Llama-3.3-70B-Instruct
fireworks/llama4-scout-instruct-basic: Llama-4-Scout-Instruct
fireworks/llama4-maverick-instruct-basic: Llama-4-Maverick-Instruct
test_exclusions:
fireworks/llama-v3p3-70b-instruct:
- test_chat_non_streaming_image
- test_chat_streaming_image
- test_chat_multi_turn_multiple_images
- test_response_non_streaming_image
- test_response_non_streaming_multi_turn_image


@ -1,15 +0,0 @@
base_url: https://api.fireworks.ai/inference/v1
api_key_var: FIREWORKS_API_KEY
models:
- accounts/fireworks/models/llama-v3p3-70b-instruct
- accounts/fireworks/models/llama4-scout-instruct-basic
- accounts/fireworks/models/llama4-maverick-instruct-basic
model_display_names:
accounts/fireworks/models/llama-v3p3-70b-instruct: Llama-3.3-70B-Instruct
accounts/fireworks/models/llama4-scout-instruct-basic: Llama-4-Scout-Instruct
accounts/fireworks/models/llama4-maverick-instruct-basic: Llama-4-Maverick-Instruct
test_exclusions:
accounts/fireworks/models/llama-v3p3-70b-instruct:
- test_chat_non_streaming_image
- test_chat_streaming_image
- test_chat_multi_turn_multiple_images


@ -1,17 +0,0 @@
base_url: http://localhost:8321/v1/openai/v1
api_key_var: GROQ_API_KEY
models:
- groq/llama-3.3-70b-versatile
- groq/llama-4-scout-17b-16e-instruct
- groq/llama-4-maverick-17b-128e-instruct
model_display_names:
groq/llama-3.3-70b-versatile: Llama-3.3-70B-Instruct
groq/llama-4-scout-17b-16e-instruct: Llama-4-Scout-Instruct
groq/llama-4-maverick-17b-128e-instruct: Llama-4-Maverick-Instruct
test_exclusions:
groq/llama-3.3-70b-versatile:
- test_chat_non_streaming_image
- test_chat_streaming_image
- test_chat_multi_turn_multiple_images
- test_response_non_streaming_image
- test_response_non_streaming_multi_turn_image


@ -1,15 +0,0 @@
base_url: https://api.groq.com/openai/v1
api_key_var: GROQ_API_KEY
models:
- llama-3.3-70b-versatile
- meta-llama/llama-4-scout-17b-16e-instruct
- meta-llama/llama-4-maverick-17b-128e-instruct
model_display_names:
llama-3.3-70b-versatile: Llama-3.3-70B-Instruct
meta-llama/llama-4-scout-17b-16e-instruct: Llama-4-Scout-Instruct
meta-llama/llama-4-maverick-17b-128e-instruct: Llama-4-Maverick-Instruct
test_exclusions:
llama-3.3-70b-versatile:
- test_chat_non_streaming_image
- test_chat_streaming_image
- test_chat_multi_turn_multiple_images


@ -1,8 +0,0 @@
# LLAMA_STACK_PORT=5002 llama stack run meta-reference-gpu --env INFERENCE_MODEL=meta-llama/Llama-4-Scout-17B-16E-Instruct --env INFERENCE_CHECKPOINT_DIR=<path_to_ckpt>
base_url: http://localhost:5002/v1/openai/v1
api_key_var: foo
models:
- meta-llama/Llama-4-Scout-17B-16E-Instruct
model_display_names:
meta-llama/Llama-4-Scout-17B-16E-Instruct: Llama-4-Scout-Instruct
test_exclusions: {}


@ -1,9 +0,0 @@
base_url: http://localhost:8321/v1/openai/v1
api_key_var: OPENAI_API_KEY
models:
- openai/gpt-4o
- openai/gpt-4o-mini
model_display_names:
openai/gpt-4o: gpt-4o
openai/gpt-4o-mini: gpt-4o-mini
test_exclusions: {}


@ -1,9 +0,0 @@
base_url: https://api.openai.com/v1
api_key_var: OPENAI_API_KEY
models:
- gpt-4o
- gpt-4o-mini
model_display_names:
gpt-4o: gpt-4o
gpt-4o-mini: gpt-4o-mini
test_exclusions: {}


@ -1,17 +0,0 @@
base_url: http://localhost:8321/v1/openai/v1
api_key_var: TOGETHER_API_KEY
models:
- together/meta-llama/Llama-3.3-70B-Instruct-Turbo
- together/meta-llama/Llama-4-Scout-17B-16E-Instruct
- together/meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8
model_display_names:
together/meta-llama/Llama-3.3-70B-Instruct-Turbo: Llama-3.3-70B-Instruct
together/meta-llama/Llama-4-Scout-17B-16E-Instruct: Llama-4-Scout-Instruct
together/meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8: Llama-4-Maverick-Instruct
test_exclusions:
together/meta-llama/Llama-3.3-70B-Instruct-Turbo:
- test_chat_non_streaming_image
- test_chat_streaming_image
- test_chat_multi_turn_multiple_images
- test_response_non_streaming_image
- test_response_non_streaming_multi_turn_image


@ -1,15 +0,0 @@
base_url: https://api.together.xyz/v1
api_key_var: TOGETHER_API_KEY
models:
- meta-llama/Llama-3.3-70B-Instruct-Turbo
- meta-llama/Llama-4-Scout-17B-16E-Instruct
- meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8
model_display_names:
meta-llama/Llama-3.3-70B-Instruct-Turbo: Llama-3.3-70B-Instruct
meta-llama/Llama-4-Scout-17B-16E-Instruct: Llama-4-Scout-Instruct
meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8: Llama-4-Maverick-Instruct
test_exclusions:
meta-llama/Llama-3.3-70B-Instruct-Turbo:
- test_chat_non_streaming_image
- test_chat_streaming_image
- test_chat_multi_turn_multiple_images


@ -1,96 +0,0 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
import re
import pytest
def pytest_addoption(parser):
parser.addoption(
"--base-url",
action="store",
help="Base URL for OpenAI compatible API",
)
parser.addoption(
"--api-key",
action="store",
help="API key to use for the provider",
)
parser.addoption(
"--provider",
action="store",
help="Provider to use for testing",
)
parser.addoption(
"--model",
action="store",
help="Model to use for testing",
)
pytest_plugins = [
"pytest_jsonreport",
"tests.verifications.openai_api.fixtures.fixtures",
"tests.verifications.openai_api.fixtures.load",
]
@pytest.hookimpl(optionalhook=True)
def pytest_json_runtest_metadata(item, call):
"""Add model and case_id to pytest-json report metadata."""
metadata = {}
nodeid = item.nodeid
# 1. Extract model from callspec if available
model = item.callspec.params.get("model") if hasattr(item, "callspec") else None
if model:
metadata["model"] = model
else:
# Fallback: Try parsing from nodeid (less reliable)
match_model = re.search(r"\[(.*?)-", nodeid)
if match_model:
model = match_model.group(1) # Store model even if found via fallback
metadata["model"] = model
else:
print(f"Warning: Could not determine model for test {nodeid}")
model = None # Ensure model is None if not found
# 2. Extract case_id using the known model string if possible
if model:
# Construct a regex pattern to find the case_id *after* the model name and a hyphen.
# Escape the model name in case it contains regex special characters.
pattern = re.escape(model) + r"-(.*?)\]$"
match_case = re.search(pattern, nodeid)
if match_case:
case_id = match_case.group(1)
metadata["case_id"] = case_id
else:
# Fallback if the pattern didn't match (e.g., nodeid format unexpected)
# Try the old less specific regex as a last resort.
match_case_fallback = re.search(r"-(.*?)\]$", nodeid)
if match_case_fallback:
case_id = match_case_fallback.group(1)
metadata["case_id"] = case_id
print(f"Warning: Used fallback regex to parse case_id from nodeid {nodeid}")
else:
print(f"Warning: Could not parse case_id from nodeid {nodeid} even with fallback.")
if "case" in (item.callspec.params if hasattr(item, "callspec") else {}):
metadata["case_id"] = "parsing_failed"
elif "case" in (item.callspec.params if hasattr(item, "callspec") else {}):
# Cannot reliably parse case_id without model, but we know it's a case test.
# Try the generic fallback regex.
match_case_fallback = re.search(r"-(.*?)\]$", nodeid)
if match_case_fallback:
case_id = match_case_fallback.group(1)
metadata["case_id"] = case_id
print(f"Warning: Used fallback regex to parse case_id from nodeid {nodeid} (model unknown)")
else:
print(f"Warning: Could not parse case_id from nodeid {nodeid} (model unknown)")
metadata["case_id"] = "parsing_failed_no_model"
# else: Not a test with a model or case param we need to handle.
return metadata


@ -1,502 +0,0 @@
#!/usr/bin/env python3
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
"""
Test Report Generator
Description:
This script runs pytest tests (specifically designed for OpenAI API compatibility checks)
for different providers, aggregates the results from JSON reports, and generates
a markdown summary report (REPORT.md).
It automatically cleans up old test result files, keeping only the latest
per provider.
Configuration:
- Provider details (models, display names) are loaded from `tests/verifications/conf/*.yaml`.
- Test cases are defined in YAML files within `tests/verifications/openai_api/fixtures/test_cases/`.
- Test results are stored in `tests/verifications/test_results/`.
Usage:
# Generate a report using the latest existing test results
python tests/verifications/generate_report.py
# Run tests for all configured providers and generate a report
python tests/verifications/generate_report.py --run-tests
# Run tests only for specific providers (space-separated)
python tests/verifications/generate_report.py --run-tests --providers fireworks openai
# Run tests matching a keyword expression (uses pytest -k)
python tests/verifications/generate_report.py --run-tests --providers fireworks --k "streaming"
# Run a specific test case for a provider
python tests/verifications/generate_report.py --run-tests --providers fireworks --k "test_chat_streaming_basic and basic_earth"
# Save the report to a custom location
python tests/verifications/generate_report.py --output custom_report.md
"""
import argparse
import json
import os
import re
import subprocess
import time
from collections import defaultdict
from pathlib import Path
from typing import Any
from tests.verifications.openai_api.fixtures.fixtures import _load_all_verification_configs
# Define the root directory for test results
RESULTS_DIR = Path(__file__).parent / "test_results"
RESULTS_DIR.mkdir(exist_ok=True)
# Maximum number of test result files to keep per provider
MAX_RESULTS_PER_PROVIDER = 1
DEFAULT_PROVIDERS = [
"meta_reference",
"together",
"fireworks",
"openai",
]
VERIFICATION_CONFIG = _load_all_verification_configs()
def run_tests(provider, keyword=None):
"""Run pytest for a specific provider and save results"""
print(f"Running tests for provider: {provider}")
timestamp = int(time.time())
# Use a constant filename for the final result and temp file
result_file = RESULTS_DIR / f"{provider}.json"
temp_json_file = RESULTS_DIR / f"temp_{provider}.json"
# Determine project root directory relative to this script
project_root = Path(__file__).parent.parent.parent
# Run pytest with JSON output
cmd = [
"python",
"-m",
"pytest",
"tests/verifications/openai_api/test_chat_completion.py",
f"--provider={provider}",
"-v",
"--json-report",
f"--json-report-file={temp_json_file}",
]
# Append -k argument if provided
if keyword:
cmd.extend(["-k", keyword])
try:
# Run subprocess with cwd set to project root
result = subprocess.run(cmd, capture_output=True, text=True, cwd=project_root)
print(f"Pytest exit code: {result.returncode}")
# Check if the JSON file was created
if temp_json_file.exists():
with open(temp_json_file) as f:
test_results = json.load(f)
test_results["run_timestamp"] = timestamp
# Save results to the final (overwritten) file
with open(result_file, "w") as f:
json.dump(test_results, f, indent=2)
f.write("\n") # Add a trailing newline for precommit
# Clean up temp file
temp_json_file.unlink()
print(f"Test results saved to {result_file}")
return result_file
else:
print(f"Error: JSON report file not created for {provider}")
print(f"Command stdout: {result.stdout}")
print(f"Command stderr: {result.stderr}")
return None
except Exception as e:
print(f"Error running tests for {provider}: {e}")
return None
def run_multiple_tests(providers_to_run: list[str], keyword: str | None):
"""Runs tests for a list of providers."""
print(f"Running tests for providers: {', '.join(providers_to_run)}")
for provider in providers_to_run:
run_tests(provider.strip(), keyword=keyword)
print("Finished running tests.")
def parse_results(
result_file,
) -> tuple[defaultdict[str, defaultdict[str, dict[str, bool]]], defaultdict[str, set[str]], set[str], str]:
"""Parse a single test results file.
Returns:
Tuple containing:
- parsed_results: DefaultDict[provider, DefaultDict[model, Dict[test_name, pass_status]]]
- providers_in_file: DefaultDict[provider, Set[model]] found in this file.
- tests_in_file: Set[test_name] found in this file.
- run_timestamp: Timestamp when the test was run
"""
if not os.path.exists(result_file):
print(f"Results file does not exist: {result_file}")
# Return empty defaultdicts/set matching the type hint
return defaultdict(lambda: defaultdict(dict)), defaultdict(set), set(), ""
with open(result_file) as f:
results = json.load(f)
# Initialize results dictionary with specific types
parsed_results: defaultdict[str, defaultdict[str, dict[str, bool]]] = defaultdict(lambda: defaultdict(dict))
providers_in_file: defaultdict[str, set[str]] = defaultdict(set)
tests_in_file: set[str] = set()
# Extract provider from filename (e.g., "openai.json" -> "openai")
provider: str = result_file.stem
# Extract run timestamp from the JSON data
run_timestamp_unix = results.get("run_timestamp")
run_timestamp_str = (
time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(run_timestamp_unix))
if run_timestamp_unix is not None
else "Unknown"
)
# Debug: Print summary of test results
print(f"Test results summary for {provider}:")
print(f"Total tests: {results.get('summary', {}).get('total', 0)}")
print(f"Passed: {results.get('summary', {}).get('passed', 0)}")
print(f"Failed: {results.get('summary', {}).get('failed', 0)}")
print(f"Error: {results.get('summary', {}).get('error', 0)}")
print(f"Skipped: {results.get('summary', {}).get('skipped', 0)}")
# Extract test results
if "tests" not in results or not results["tests"]:
print(f"No test results found in {result_file}")
# Return empty defaultdicts/set matching the type hint
return defaultdict(lambda: defaultdict(dict)), defaultdict(set), set(), ""
# Process the tests
for test in results["tests"]:
test_id = test.get("nodeid", "")
if not (call_phase := test.get("call")):
continue
call_outcome = call_phase.get("outcome")
if call_outcome not in ("passed", "failed"):
continue
# --- Extract data from metadata ---
metadata = test.get("metadata", {})
model = metadata.get("model")
case_id = metadata.get("case_id") # String ID (if provided)
case_index = metadata.get("case_index") # Integer index (if no ID provided)
# Check if we have a model and at least one case identifier
if not model or (case_id is None and case_index is None):
print(
f"Warning: Missing 'model' or case identifier ('case_id'/'case_index') metadata for test: {test_id}. Skipping."
)
continue
try:
test_name_base = test_id.split("::")[1].split("[")[0]
except (IndexError, ValueError) as e:
print(f"Warning: Could not parse base test name for {test_id}. Error: {e}. Skipping.")
continue
# Construct detailed test name using ID or index
if case_id is not None:
detailed_test_name = f"{test_name_base} ({case_id})"
elif case_index == 0:
# If case_id is missing and index is 0, assume single case, use base name only
detailed_test_name = test_name_base
elif case_index is not None: # case_index > 0
# Use case_index for naming if case_id wasn't provided and index > 0
detailed_test_name = f"{test_name_base} (case{case_index})"
else:
# This case should be prevented by the earlier check, but handle defensively
print(f"Error: No case identifier found for test {test_id} after initial check. Skipping.")
continue
# Populate collections for this file
tests_in_file.add(detailed_test_name)
providers_in_file[provider].add(model)
if call_outcome == "passed":
parsed_results[provider][model][detailed_test_name] = True
elif call_outcome == "failed":
parsed_results[provider][model][detailed_test_name] = False
# Final Summary Warning (Optional)
if not parsed_results.get(provider):
print(f"Warning: No valid test results parsed for provider {provider} from file {result_file}")
return parsed_results, providers_in_file, tests_in_file, run_timestamp_str
def generate_report(
results_dict: dict[str, Any],
providers: dict[str, set[str]],
all_tests: set[str],
provider_timestamps: dict[str, str],
output_file=None,
):
"""Generate the markdown report.
Args:
results_dict: Aggregated results [provider][model][test_name] -> status.
providers: Dict of all providers and their models {provider: {models}}.
The order of keys in this dict determines the report order.
all_tests: Set of all test names found.
provider_timestamps: Dict of provider to timestamp when tests were run
output_file: Optional path to save the report.
"""
if output_file is None:
# Default to creating the report in the same directory as this script
output_file = Path(__file__).parent / "REPORT.md"
else:
output_file = Path(output_file)
# Convert provider model sets to sorted lists (use passed-in providers dict)
providers_sorted = {prov: sorted(models) for prov, models in providers.items()}
# Sort tests alphabetically (use passed-in all_tests set)
sorted_tests = sorted(all_tests)
# Calculate counts for each base test name
base_test_case_counts: defaultdict[str, int] = defaultdict(int)
base_test_name_map: dict[str, str] = {}
for test_name in sorted_tests:
match = re.match(r"^(.*?)( \([^)]+\))?$", test_name)
if match:
base_name = match.group(1).strip()
base_test_case_counts[base_name] += 1
base_test_name_map[test_name] = base_name
else:
# Should not happen with current naming, but handle defensively
base_test_case_counts[test_name] += 1
base_test_name_map[test_name] = test_name
if not sorted_tests:
print("Warning: No test results found to generate a report.")
# Optionally create an empty report or return early
with open(output_file, "w") as f:
f.write("# Test Results Report\n\nNo test results found.\n")
print(f"Generated empty report: {output_file}")
return
report = ["# Test Results Report\n"]
report.append(f"*Generated on: {time.strftime('%Y-%m-%d %H:%M:%S')}*\n")
report.append("*This report was generated by running `python tests/verifications/generate_report.py`*\n")
    # Icons for pass/fail
    pass_icon = "✅"
    fail_icon = "❌"
    na_icon = "⚪"
# Add emoji legend
report.append("## Legend\n")
report.append(f"- {pass_icon} - Test passed")
report.append(f"- {fail_icon} - Test failed")
report.append(f"- {na_icon} - Test not applicable or not run for this model")
report.append("\n")
# Add a summary section
report.append("## Summary\n")
# Count total tests and passes (use passed-in providers and all_tests)
total_tests = 0
passed_tests = 0
provider_totals = {}
for provider, models in providers_sorted.items():
provider_passed = 0
provider_total = 0
if provider in results_dict:
for model in models:
if model in results_dict[provider]:
model_results = results_dict[provider][model]
for test in sorted_tests:
if test in model_results:
provider_total += 1
total_tests += 1
if model_results[test]:
provider_passed += 1
passed_tests += 1
provider_totals[provider] = (provider_passed, provider_total)
# Add summary table (use the order from the providers dict keys)
report.append("| Provider | Pass Rate | Tests Passed | Total Tests |")
report.append("| --- | --- | --- | --- |")
# Iterate through providers in the order they appear in the input dict
for provider in providers_sorted.keys():
passed, total = provider_totals.get(provider, (0, 0))
pass_rate = f"{(passed / total * 100):.1f}%" if total > 0 else "N/A"
report.append(f"| {provider.capitalize()} | {pass_rate} | {passed} | {total} |")
report.append("\n")
for provider in providers_sorted.keys():
provider_models = providers_sorted[provider] # Use sorted models
if not provider_models:
continue
report.append(f"\n## {provider.capitalize()}\n")
# Add timestamp when test was run
if provider in provider_timestamps:
report.append(f"*Tests run on: {provider_timestamps[provider]}*\n")
# Add test command for reproducing results
test_cmd_all = f"pytest tests/verifications/openai_api/test_chat_completion.py --provider={provider} -v"
report.append(f"```bash\n# Run all tests for this provider:\n{test_cmd_all}\n")
# Find an example test with a case ID
example_base_test_name = None
example_case_id = None
# Get first test as fallback base, handle empty list
first_test_name = sorted_tests[0] if sorted_tests else "unknown_test"
match = re.match(r"^(.*?) \((.*?)\)$", first_test_name)
if match:
example_base_test_name = match.group(1).strip()
example_case_id = match.group(2).strip()
else:
example_base_test_name = first_test_name
base_name = base_test_name_map.get(first_test_name, first_test_name) # Get base name
case_count = base_test_case_counts.get(base_name, 1) # Get count
filter_str = f"{example_base_test_name} and {example_case_id}" if case_count > 1 else example_base_test_name
test_cmd_specific_case = (
f'pytest tests/verifications/openai_api/test_chat_completion.py --provider={provider} -k "{filter_str}"'
)
report.append(
f"# Example: Run only the '{example_case_id}' case of {example_base_test_name}:\n{test_cmd_specific_case}\n```\n"
)
# Get display names (use passed-in providers dict)
provider_config = VERIFICATION_CONFIG.get("providers", {}).get(provider, {})
display_name_map = provider_config.get("model_display_names", {})
# Add Model Key Table (use provider_models)
report.append(f"\n**Model Key ({provider.capitalize()})**\n")
provider_key_lines = ["| Display Name | Full Model ID |", "| --- | --- |"]
for model_id in provider_models:
display_name = display_name_map.get(model_id, model_id)
provider_key_lines.append(f"| {display_name} | `{model_id}` |")
report.extend(provider_key_lines)
report.append("\n")
# Create results table header (use provider_models)
display_names = [display_name_map.get(m, m) for m in provider_models]
header = "| Test | " + " | ".join(display_names) + " |"
separator = "| --- | " + " | ".join(["---"] * len(provider_models)) + " |"
report.append(header)
report.append(separator)
# Get results for this provider from results_dict
provider_results_data = results_dict.get(provider, {})
# Add rows for each test (use sorted_tests)
for test in sorted_tests:
# Determine display name based on case count
base_name = base_test_name_map.get(test, test) # Get base name
case_count = base_test_case_counts.get(base_name, 1) # Get count
display_test_name = base_name if case_count == 1 else test # Choose display name
row = f"| {display_test_name} |" # Use display name
for model_id in provider_models:
if model_id in provider_results_data and test in provider_results_data[model_id]:
result = pass_icon if provider_results_data[model_id][test] else fail_icon
else:
result = na_icon
row += f" {result} |"
report.append(row)
# Write to file
with open(output_file, "w") as f:
f.write("\n".join(report))
f.write("\n")
print(f"Report generated: {output_file}")
def main():
parser = argparse.ArgumentParser(description="Generate test report")
parser.add_argument("--run-tests", action="store_true", help="Run tests before generating report")
parser.add_argument(
"--providers",
type=str,
nargs="+",
help="Specify providers to include/test (comma-separated or space-separated, default: uses DEFAULT_PROVIDERS)",
)
parser.add_argument("--output", type=str, help="Output file location (default: tests/verifications/REPORT.md)")
parser.add_argument("--k", type=str, help="Keyword expression to filter tests (passed to pytest -k)")
args = parser.parse_args()
all_results = {}
final_providers_order = {} # Dictionary to store results, preserving processing order
aggregated_tests = set()
provider_timestamps = {}
# 1. Determine the desired list and order of providers
if args.providers:
desired_providers = []
for provider_arg in args.providers:
desired_providers.extend([p.strip() for p in provider_arg.split(",")])
else:
desired_providers = DEFAULT_PROVIDERS # Use default order/list
# 2. Run tests if requested (using the desired provider list)
if args.run_tests:
run_multiple_tests(desired_providers, args.k)
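    # 3. Load and parse the result file for each desired provider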
for provider in desired_providers:
# Construct the expected result file path directly
result_file = RESULTS_DIR / f"{provider}.json"
if result_file.exists(): # Check if the specific file exists
print(f"Loading results for {provider} from {result_file}")
try:
parsed_data = parse_results(result_file)
parsed_results, providers_in_file, tests_in_file, run_timestamp = parsed_data
all_results.update(parsed_results)
aggregated_tests.update(tests_in_file)
# Add models for this provider, ensuring it's added in the correct report order
if provider in providers_in_file:
if provider not in final_providers_order:
final_providers_order[provider] = set()
final_providers_order[provider].update(providers_in_file[provider])
if run_timestamp != "Unknown":
provider_timestamps[provider] = run_timestamp
else:
print(
f"Warning: Provider '{provider}' found in desired list but not within its result file data ({result_file})."
)
except Exception as e:
print(f"Error parsing results for provider {provider} from {result_file}: {e}")
else:
# Only print warning if we expected results (i.e., provider was in the desired list)
print(f"Result file for desired provider '{provider}' not found at {result_file}. Skipping.")
    # 4. Generate the report using the filtered & ordered results
print(f"Final Provider Order for Report: {list(final_providers_order.keys())}")
generate_report(all_results, final_providers_order, aggregated_tests, provider_timestamps, args.output)
if __name__ == "__main__":
main()
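# Example invocations (flags as defined by the argparse options above):
#   python tests/verifications/generate_report.py
#   python tests/verifications/generate_report.py --run-tests --providers together,openai --k "streaming"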

View file

@ -1,162 +0,0 @@
# This is a temporary run file because model names used by the verification tests
# are not quite consistent with various pre-existing distributions.
#
version: '2'
image_name: openai-api-verification
apis:
- agents
- inference
- telemetry
- tool_runtime
- vector_io
- safety
providers:
inference:
- provider_id: together
provider_type: remote::together
config:
url: https://api.together.xyz/v1
api_key: ${env.TOGETHER_API_KEY:}
- provider_id: fireworks
provider_type: remote::fireworks
config:
url: https://api.fireworks.ai/inference/v1
api_key: ${env.FIREWORKS_API_KEY:}
- provider_id: groq
provider_type: remote::groq
config:
url: https://api.groq.com
api_key: ${env.GROQ_API_KEY:}
- provider_id: openai
provider_type: remote::openai
config:
url: https://api.openai.com/v1
api_key: ${env.OPENAI_API_KEY:}
- provider_id: sentence-transformers
provider_type: inline::sentence-transformers
config: {}
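  # Note: the ${env.VAR:} references above are resolved from the environment at
  # startup and fall back to an empty value when unset, so export the relevant
  # *_API_KEY variables before running against a provider.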
vector_io:
- provider_id: faiss
provider_type: inline::faiss
config:
kvstore:
type: sqlite
namespace: null
db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/openai}/faiss_store.db
telemetry:
- provider_id: meta-reference
provider_type: inline::meta-reference
config:
service_name: "${env.OTEL_SERVICE_NAME:\u200B}"
sinks: ${env.TELEMETRY_SINKS:console,sqlite}
sqlite_db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/openai-api-verification}/trace_store.db
safety:
- provider_id: llama-guard
provider_type: inline::llama-guard
config:
excluded_categories: []
agents:
- provider_id: meta-reference
provider_type: inline::meta-reference
config:
persistence_store:
type: sqlite
namespace: null
db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/openai}/agents_store.db
responses_store:
type: sqlite
db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/openai}/responses_store.db
tool_runtime:
- provider_id: brave-search
provider_type: remote::brave-search
config:
api_key: ${env.BRAVE_SEARCH_API_KEY:}
max_results: 3
- provider_id: tavily-search
provider_type: remote::tavily-search
config:
api_key: ${env.TAVILY_SEARCH_API_KEY:}
max_results: 3
- provider_id: rag-runtime
provider_type: inline::rag-runtime
config: {}
- provider_id: model-context-protocol
provider_type: remote::model-context-protocol
config: {}
- provider_id: wolfram-alpha
provider_type: remote::wolfram-alpha
config:
api_key: ${env.WOLFRAM_ALPHA_API_KEY:}
metadata_store:
type: sqlite
db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/openai}/registry.db
models:
- metadata: {}
model_id: together/meta-llama/Llama-3.3-70B-Instruct-Turbo
provider_id: together
provider_model_id: meta-llama/Llama-3.3-70B-Instruct-Turbo
model_type: llm
- metadata: {}
model_id: together/meta-llama/Llama-4-Scout-17B-16E-Instruct
provider_id: together
provider_model_id: meta-llama/Llama-4-Scout-17B-16E-Instruct
model_type: llm
- metadata: {}
model_id: together/meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8
provider_id: together
provider_model_id: meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8
model_type: llm
- metadata: {}
model_id: fireworks/llama-v3p3-70b-instruct
provider_id: fireworks
provider_model_id: accounts/fireworks/models/llama-v3p3-70b-instruct
model_type: llm
- metadata: {}
model_id: fireworks/llama4-scout-instruct-basic
provider_id: fireworks
provider_model_id: accounts/fireworks/models/llama4-scout-instruct-basic
model_type: llm
- metadata: {}
model_id: fireworks/llama4-maverick-instruct-basic
provider_id: fireworks
provider_model_id: accounts/fireworks/models/llama4-maverick-instruct-basic
model_type: llm
- metadata: {}
model_id: groq/llama-3.3-70b-versatile
provider_id: groq
provider_model_id: groq/llama-3.3-70b-versatile
model_type: llm
- metadata: {}
model_id: groq/llama-4-scout-17b-16e-instruct
provider_id: groq
provider_model_id: groq/meta-llama/llama-4-scout-17b-16e-instruct
model_type: llm
- metadata: {}
model_id: groq/llama-4-maverick-17b-128e-instruct
provider_id: groq
provider_model_id: groq/meta-llama/llama-4-maverick-17b-128e-instruct
model_type: llm
- metadata: {}
model_id: openai/gpt-4o
provider_id: openai
provider_model_id: openai/gpt-4o
model_type: llm
- metadata: {}
model_id: openai/gpt-4o-mini
provider_id: openai
provider_model_id: openai/gpt-4o-mini
model_type: llm
shields: []
vector_dbs: []
datasets: []
scoring_fns: []
benchmarks: []
tool_groups:
- toolgroup_id: builtin::websearch
provider_id: tavily-search
- toolgroup_id: builtin::rag
provider_id: rag-runtime
- toolgroup_id: builtin::wolfram_alpha
provider_id: wolfram-alpha
server:
port: 8321

View file

@ -1,5 +0,0 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

View file

@ -1,40 +0,0 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from tests.verifications.openai_api.fixtures.fixtures import _load_all_verification_configs
def pytest_generate_tests(metafunc):
"""Dynamically parametrize tests based on the selected provider and config."""
if "model" in metafunc.fixturenames:
model = metafunc.config.getoption("model")
if model:
metafunc.parametrize("model", [model])
return
provider = metafunc.config.getoption("provider")
if not provider:
print("Warning: --provider not specified. Skipping model parametrization.")
metafunc.parametrize("model", [])
return
try:
config_data = _load_all_verification_configs()
except (OSError, FileNotFoundError) as e:
print(f"ERROR loading verification configs: {e}")
config_data = {"providers": {}}
provider_config = config_data.get("providers", {}).get(provider)
if provider_config:
models = provider_config.get("models", [])
if models:
metafunc.parametrize("model", models)
else:
print(f"Warning: No models found for provider '{provider}' in config.")
metafunc.parametrize("model", []) # Parametrize empty if no models found
else:
print(f"Warning: Provider '{provider}' not found in config. No models parametrized.")
metafunc.parametrize("model", []) # Parametrize empty if provider not found
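# With this hook, a run such as `pytest --provider=openai` parametrizes the `model`
# fixture with every model listed for that provider in the verification config,
# unless a single model is pinned via `--model`.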

View file

@ -1,5 +0,0 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

View file

@ -1,717 +0,0 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
import base64
import copy
import json
from pathlib import Path
from typing import Any
import pytest
from openai import APIError
from pydantic import BaseModel
from tests.verifications.openai_api.fixtures.fixtures import (
case_id_generator,
get_base_test_name,
should_skip_test,
)
from tests.verifications.openai_api.fixtures.load import load_test_cases
chat_completion_test_cases = load_test_cases("chat_completion")
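# The loaded cases are grouped by test name; for example,
# chat_completion_test_cases["test_chat_basic"]["test_params"]["case"] is a list of
# case dicts carrying "input" (messages, tools, response_format, ...) and "output".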
THIS_DIR = Path(__file__).parent
@pytest.fixture
def multi_image_data():
files = [
THIS_DIR / "fixtures/images/vision_test_1.jpg",
THIS_DIR / "fixtures/images/vision_test_2.jpg",
THIS_DIR / "fixtures/images/vision_test_3.jpg",
]
encoded_files = []
for file in files:
with open(file, "rb") as image_file:
base64_data = base64.b64encode(image_file.read()).decode("utf-8")
encoded_files.append(f"data:image/jpeg;base64,{base64_data}")
return encoded_files
# --- Test Functions ---
@pytest.mark.parametrize(
"case",
chat_completion_test_cases["test_chat_basic"]["test_params"]["case"],
ids=case_id_generator,
)
def test_chat_non_streaming_basic(request, openai_client, model, provider, verification_config, case):
test_name_base = get_base_test_name(request)
if should_skip_test(verification_config, provider, model, test_name_base):
pytest.skip(f"Skipping {test_name_base} for model {model} on provider {provider} based on config.")
response = openai_client.chat.completions.create(
model=model,
messages=case["input"]["messages"],
stream=False,
)
assert response.choices[0].message.role == "assistant"
assert case["output"].lower() in response.choices[0].message.content.lower()
@pytest.mark.parametrize(
"case",
chat_completion_test_cases["test_chat_basic"]["test_params"]["case"],
ids=case_id_generator,
)
def test_chat_streaming_basic(request, openai_client, model, provider, verification_config, case):
test_name_base = get_base_test_name(request)
if should_skip_test(verification_config, provider, model, test_name_base):
pytest.skip(f"Skipping {test_name_base} for model {model} on provider {provider} based on config.")
response = openai_client.chat.completions.create(
model=model,
messages=case["input"]["messages"],
stream=True,
)
content = ""
for chunk in response:
content += chunk.choices[0].delta.content or ""
# TODO: add detailed type validation
assert case["output"].lower() in content.lower()
@pytest.mark.parametrize(
"case",
chat_completion_test_cases["test_chat_input_validation"]["test_params"]["case"],
ids=case_id_generator,
)
def test_chat_non_streaming_error_handling(request, openai_client, model, provider, verification_config, case):
test_name_base = get_base_test_name(request)
if should_skip_test(verification_config, provider, model, test_name_base):
pytest.skip(f"Skipping {test_name_base} for model {model} on provider {provider} based on config.")
with pytest.raises(APIError) as e:
openai_client.chat.completions.create(
model=model,
messages=case["input"]["messages"],
stream=False,
tool_choice=case["input"]["tool_choice"] if "tool_choice" in case["input"] else None,
tools=case["input"]["tools"] if "tools" in case["input"] else None,
)
assert case["output"]["error"]["status_code"] == e.value.status_code
@pytest.mark.parametrize(
"case",
chat_completion_test_cases["test_chat_input_validation"]["test_params"]["case"],
ids=case_id_generator,
)
def test_chat_streaming_error_handling(request, openai_client, model, provider, verification_config, case):
test_name_base = get_base_test_name(request)
if should_skip_test(verification_config, provider, model, test_name_base):
pytest.skip(f"Skipping {test_name_base} for model {model} on provider {provider} based on config.")
with pytest.raises(APIError) as e:
response = openai_client.chat.completions.create(
model=model,
messages=case["input"]["messages"],
stream=True,
tool_choice=case["input"]["tool_choice"] if "tool_choice" in case["input"] else None,
tools=case["input"]["tools"] if "tools" in case["input"] else None,
)
for _chunk in response:
pass
assert str(case["output"]["error"]["status_code"]) in e.value.message
@pytest.mark.parametrize(
"case",
chat_completion_test_cases["test_chat_image"]["test_params"]["case"],
ids=case_id_generator,
)
def test_chat_non_streaming_image(request, openai_client, model, provider, verification_config, case):
test_name_base = get_base_test_name(request)
if should_skip_test(verification_config, provider, model, test_name_base):
pytest.skip(f"Skipping {test_name_base} for model {model} on provider {provider} based on config.")
response = openai_client.chat.completions.create(
model=model,
messages=case["input"]["messages"],
stream=False,
)
assert response.choices[0].message.role == "assistant"
assert case["output"].lower() in response.choices[0].message.content.lower()
@pytest.mark.parametrize(
"case",
chat_completion_test_cases["test_chat_image"]["test_params"]["case"],
ids=case_id_generator,
)
def test_chat_streaming_image(request, openai_client, model, provider, verification_config, case):
test_name_base = get_base_test_name(request)
if should_skip_test(verification_config, provider, model, test_name_base):
pytest.skip(f"Skipping {test_name_base} for model {model} on provider {provider} based on config.")
response = openai_client.chat.completions.create(
model=model,
messages=case["input"]["messages"],
stream=True,
)
content = ""
for chunk in response:
content += chunk.choices[0].delta.content or ""
# TODO: add detailed type validation
assert case["output"].lower() in content.lower()
@pytest.mark.parametrize(
"case",
chat_completion_test_cases["test_chat_structured_output"]["test_params"]["case"],
ids=case_id_generator,
)
def test_chat_non_streaming_structured_output(request, openai_client, model, provider, verification_config, case):
test_name_base = get_base_test_name(request)
if should_skip_test(verification_config, provider, model, test_name_base):
pytest.skip(f"Skipping {test_name_base} for model {model} on provider {provider} based on config.")
response = openai_client.chat.completions.create(
model=model,
messages=case["input"]["messages"],
response_format=case["input"]["response_format"],
stream=False,
)
assert response.choices[0].message.role == "assistant"
maybe_json_content = response.choices[0].message.content
validate_structured_output(maybe_json_content, case["output"])
@pytest.mark.parametrize(
"case",
chat_completion_test_cases["test_chat_structured_output"]["test_params"]["case"],
ids=case_id_generator,
)
def test_chat_streaming_structured_output(request, openai_client, model, provider, verification_config, case):
test_name_base = get_base_test_name(request)
if should_skip_test(verification_config, provider, model, test_name_base):
pytest.skip(f"Skipping {test_name_base} for model {model} on provider {provider} based on config.")
response = openai_client.chat.completions.create(
model=model,
messages=case["input"]["messages"],
response_format=case["input"]["response_format"],
stream=True,
)
maybe_json_content = ""
for chunk in response:
maybe_json_content += chunk.choices[0].delta.content or ""
validate_structured_output(maybe_json_content, case["output"])
@pytest.mark.parametrize(
"case",
chat_completion_test_cases["test_tool_calling"]["test_params"]["case"],
ids=case_id_generator,
)
def test_chat_non_streaming_tool_calling(request, openai_client, model, provider, verification_config, case):
test_name_base = get_base_test_name(request)
if should_skip_test(verification_config, provider, model, test_name_base):
pytest.skip(f"Skipping {test_name_base} for model {model} on provider {provider} based on config.")
response = openai_client.chat.completions.create(
model=model,
messages=case["input"]["messages"],
tools=case["input"]["tools"],
stream=False,
)
assert response.choices[0].message.role == "assistant"
assert len(response.choices[0].message.tool_calls) > 0
assert case["output"] == "get_weather_tool_call"
assert response.choices[0].message.tool_calls[0].function.name == "get_weather"
# TODO: add detailed type validation
@pytest.mark.parametrize(
"case",
chat_completion_test_cases["test_tool_calling"]["test_params"]["case"],
ids=case_id_generator,
)
def test_chat_streaming_tool_calling(request, openai_client, model, provider, verification_config, case):
test_name_base = get_base_test_name(request)
if should_skip_test(verification_config, provider, model, test_name_base):
pytest.skip(f"Skipping {test_name_base} for model {model} on provider {provider} based on config.")
stream = openai_client.chat.completions.create(
model=model,
messages=case["input"]["messages"],
tools=case["input"]["tools"],
stream=True,
)
_, tool_calls_buffer = _accumulate_streaming_tool_calls(stream)
assert len(tool_calls_buffer) == 1
for call in tool_calls_buffer:
assert len(call["id"]) > 0
function = call["function"]
assert function["name"] == "get_weather"
args_dict = json.loads(function["arguments"])
assert "san francisco" in args_dict["location"].lower()
@pytest.mark.parametrize(
"case",
chat_completion_test_cases["test_tool_calling"]["test_params"]["case"], # Reusing existing case for now
ids=case_id_generator,
)
def test_chat_non_streaming_tool_choice_required(request, openai_client, model, provider, verification_config, case):
test_name_base = get_base_test_name(request)
if should_skip_test(verification_config, provider, model, test_name_base):
pytest.skip(f"Skipping {test_name_base} for model {model} on provider {provider} based on config.")
response = openai_client.chat.completions.create(
model=model,
messages=case["input"]["messages"],
tools=case["input"]["tools"],
tool_choice="required", # Force tool call
stream=False,
)
assert response.choices[0].message.role == "assistant"
assert len(response.choices[0].message.tool_calls) > 0, "Expected tool call when tool_choice='required'"
expected_tool_name = case["input"]["tools"][0]["function"]["name"]
assert response.choices[0].message.tool_calls[0].function.name == expected_tool_name
@pytest.mark.parametrize(
"case",
chat_completion_test_cases["test_tool_calling"]["test_params"]["case"], # Reusing existing case for now
ids=case_id_generator,
)
def test_chat_streaming_tool_choice_required(request, openai_client, model, provider, verification_config, case):
test_name_base = get_base_test_name(request)
if should_skip_test(verification_config, provider, model, test_name_base):
pytest.skip(f"Skipping {test_name_base} for model {model} on provider {provider} based on config.")
stream = openai_client.chat.completions.create(
model=model,
messages=case["input"]["messages"],
tools=case["input"]["tools"],
tool_choice="required", # Force tool call
stream=True,
)
_, tool_calls_buffer = _accumulate_streaming_tool_calls(stream)
assert len(tool_calls_buffer) > 0, "Expected tool call when tool_choice='required'"
expected_tool_name = case["input"]["tools"][0]["function"]["name"]
assert any(call["function"]["name"] == expected_tool_name for call in tool_calls_buffer), (
f"Expected tool call '{expected_tool_name}' not found in stream"
)
@pytest.mark.parametrize(
"case",
chat_completion_test_cases["test_tool_calling"]["test_params"]["case"], # Reusing existing case for now
ids=case_id_generator,
)
def test_chat_non_streaming_tool_choice_none(request, openai_client, model, provider, verification_config, case):
test_name_base = get_base_test_name(request)
if should_skip_test(verification_config, provider, model, test_name_base):
pytest.skip(f"Skipping {test_name_base} for model {model} on provider {provider} based on config.")
response = openai_client.chat.completions.create(
model=model,
messages=case["input"]["messages"],
tools=case["input"]["tools"],
tool_choice="none",
stream=False,
)
assert response.choices[0].message.role == "assistant"
assert response.choices[0].message.tool_calls is None, "Expected no tool calls when tool_choice='none'"
assert response.choices[0].message.content is not None, "Expected content when tool_choice='none'"
@pytest.mark.parametrize(
"case",
chat_completion_test_cases["test_tool_calling"]["test_params"]["case"], # Reusing existing case for now
ids=case_id_generator,
)
def test_chat_streaming_tool_choice_none(request, openai_client, model, provider, verification_config, case):
test_name_base = get_base_test_name(request)
if should_skip_test(verification_config, provider, model, test_name_base):
pytest.skip(f"Skipping {test_name_base} for model {model} on provider {provider} based on config.")
stream = openai_client.chat.completions.create(
model=model,
messages=case["input"]["messages"],
tools=case["input"]["tools"],
tool_choice="none",
stream=True,
)
content = ""
for chunk in stream:
delta = chunk.choices[0].delta
if delta.content:
content += delta.content
assert not delta.tool_calls, "Expected no tool call chunks when tool_choice='none'"
assert len(content) > 0, "Expected content when tool_choice='none'"
@pytest.mark.parametrize(
"case",
chat_completion_test_cases.get("test_chat_multi_turn_tool_calling", {}).get("test_params", {}).get("case", []),
ids=case_id_generator,
)
def test_chat_non_streaming_multi_turn_tool_calling(request, openai_client, model, provider, verification_config, case):
"""
Test cases for multi-turn tool calling.
Tool calls are asserted.
Tool responses are provided in the test case.
Final response is asserted.
"""
test_name_base = get_base_test_name(request)
if should_skip_test(verification_config, provider, model, test_name_base):
pytest.skip(f"Skipping {test_name_base} for model {model} on provider {provider} based on config.")
# Create a copy of the messages list to avoid modifying the original
messages = []
tools = case["input"]["tools"]
# Use deepcopy to prevent modification across runs/parametrization
expected_results = copy.deepcopy(case["expected"])
tool_responses = copy.deepcopy(case.get("tool_responses", []))
input_messages_turns = copy.deepcopy(case["input"]["messages"])
    # Keep looping while either:
    #   1. there are more input message turns to feed into the conversation, or
    #   2. the last message is a tool response, so the model still needs to reply to it
while len(input_messages_turns) > 0 or (len(messages) > 0 and messages[-1]["role"] == "tool"):
        # Do not pull in new input messages while the last message is a tool response
if len(messages) == 0 or messages[-1]["role"] != "tool":
new_messages = input_messages_turns.pop(0)
# Ensure new_messages is a list of message objects
if isinstance(new_messages, list):
messages.extend(new_messages)
else:
# If it's a single message object, add it directly
messages.append(new_messages)
# --- API Call ---
response = openai_client.chat.completions.create(
model=model,
messages=messages,
tools=tools,
stream=False,
)
# --- Process Response ---
assistant_message = response.choices[0].message
messages.append(assistant_message.model_dump(exclude_unset=True))
assert assistant_message.role == "assistant"
# Get the expected result data
expected = expected_results.pop(0)
num_tool_calls = expected["num_tool_calls"]
# --- Assertions based on expected result ---
assert len(assistant_message.tool_calls or []) == num_tool_calls, (
f"Expected {num_tool_calls} tool calls, but got {len(assistant_message.tool_calls or [])}"
)
if num_tool_calls > 0:
tool_call = assistant_message.tool_calls[0]
assert tool_call.function.name == expected["tool_name"], (
f"Expected tool '{expected['tool_name']}', got '{tool_call.function.name}'"
)
# Parse the JSON string arguments before comparing
actual_arguments = json.loads(tool_call.function.arguments)
assert actual_arguments == expected["tool_arguments"], (
f"Expected arguments '{expected['tool_arguments']}', got '{actual_arguments}'"
)
# Prepare and append the tool response for the next turn
tool_response = tool_responses.pop(0)
messages.append(
{
"role": "tool",
"tool_call_id": tool_call.id,
"content": tool_response["response"],
}
)
else:
assert assistant_message.content is not None, "Expected content, but none received."
expected_answers = expected["answer"] # This is now a list
content_lower = assistant_message.content.lower()
assert any(ans.lower() in content_lower for ans in expected_answers), (
f"Expected one of {expected_answers} in content, but got: '{assistant_message.content}'"
)
@pytest.mark.parametrize(
"case",
chat_completion_test_cases.get("test_chat_multi_turn_tool_calling", {}).get("test_params", {}).get("case", []),
ids=case_id_generator,
)
def test_chat_streaming_multi_turn_tool_calling(request, openai_client, model, provider, verification_config, case):
""" """
test_name_base = get_base_test_name(request)
if should_skip_test(verification_config, provider, model, test_name_base):
pytest.skip(f"Skipping {test_name_base} for model {model} on provider {provider} based on config.")
messages = []
tools = case["input"]["tools"]
expected_results = copy.deepcopy(case["expected"])
tool_responses = copy.deepcopy(case.get("tool_responses", []))
input_messages_turns = copy.deepcopy(case["input"]["messages"])
while len(input_messages_turns) > 0 or (len(messages) > 0 and messages[-1]["role"] == "tool"):
if len(messages) == 0 or messages[-1]["role"] != "tool":
new_messages = input_messages_turns.pop(0)
if isinstance(new_messages, list):
messages.extend(new_messages)
else:
messages.append(new_messages)
# --- API Call (Streaming) ---
stream = openai_client.chat.completions.create(
model=model,
messages=messages,
tools=tools,
stream=True,
)
# --- Process Stream ---
accumulated_content, accumulated_tool_calls = _accumulate_streaming_tool_calls(stream)
# --- Construct Assistant Message for History ---
assistant_message_dict = {"role": "assistant"}
if accumulated_content:
assistant_message_dict["content"] = accumulated_content
if accumulated_tool_calls:
assistant_message_dict["tool_calls"] = accumulated_tool_calls
messages.append(assistant_message_dict)
# --- Assertions ---
expected = expected_results.pop(0)
num_tool_calls = expected["num_tool_calls"]
assert len(accumulated_tool_calls or []) == num_tool_calls, (
f"Expected {num_tool_calls} tool calls, but got {len(accumulated_tool_calls or [])}"
)
if num_tool_calls > 0:
# Use the first accumulated tool call for assertion
tool_call = accumulated_tool_calls[0]
assert tool_call["function"]["name"] == expected["tool_name"], (
f"Expected tool '{expected['tool_name']}', got '{tool_call['function']['name']}'"
)
# Parse the accumulated arguments string for comparison
actual_arguments = json.loads(tool_call["function"]["arguments"])
assert actual_arguments == expected["tool_arguments"], (
f"Expected arguments '{expected['tool_arguments']}', got '{actual_arguments}'"
)
# Prepare and append the tool response for the next turn
tool_response = tool_responses.pop(0)
messages.append(
{
"role": "tool",
"tool_call_id": tool_call["id"],
"content": tool_response["response"],
}
)
else:
assert accumulated_content is not None and accumulated_content != "", "Expected content, but none received."
expected_answers = expected["answer"]
content_lower = accumulated_content.lower()
assert any(ans.lower() in content_lower for ans in expected_answers), (
f"Expected one of {expected_answers} in content, but got: '{accumulated_content}'"
)
@pytest.mark.parametrize("stream", [False, True], ids=["stream=False", "stream=True"])
def test_chat_multi_turn_multiple_images(
request, openai_client, model, provider, verification_config, multi_image_data, stream
):
test_name_base = get_base_test_name(request)
if should_skip_test(verification_config, provider, model, test_name_base):
pytest.skip(f"Skipping {test_name_base} for model {model} on provider {provider} based on config.")
messages_turn1 = [
{
"role": "user",
"content": [
{
"type": "image_url",
"image_url": {
"url": multi_image_data[0],
},
},
{
"type": "image_url",
"image_url": {
"url": multi_image_data[1],
},
},
{
"type": "text",
"text": "What furniture is in the first image that is not in the second image?",
},
],
},
]
# First API call
response1 = openai_client.chat.completions.create(
model=model,
messages=messages_turn1,
stream=stream,
)
if stream:
message_content1 = ""
for chunk in response1:
message_content1 += chunk.choices[0].delta.content or ""
else:
message_content1 = response1.choices[0].message.content
assert len(message_content1) > 0
assert any(expected in message_content1.lower().strip() for expected in {"chair", "table"}), message_content1
# Prepare messages for the second turn
messages_turn2 = messages_turn1 + [
{"role": "assistant", "content": message_content1},
{
"role": "user",
"content": [
{
"type": "image_url",
"image_url": {
"url": multi_image_data[2],
},
},
{"type": "text", "text": "What is in this image that is also in the first image?"},
],
},
]
# Second API call
response2 = openai_client.chat.completions.create(
model=model,
messages=messages_turn2,
stream=stream,
)
if stream:
message_content2 = ""
for chunk in response2:
message_content2 += chunk.choices[0].delta.content or ""
else:
message_content2 = response2.choices[0].message.content
assert len(message_content2) > 0
assert any(expected in message_content2.lower().strip() for expected in {"bed"}), message_content2
# --- Helper functions (structured output validation and stream accumulation) ---
def get_structured_output(maybe_json_content: str, schema_name: str) -> Any | None:
if schema_name == "valid_calendar_event":
class CalendarEvent(BaseModel):
name: str
date: str
participants: list[str]
try:
calendar_event = CalendarEvent.model_validate_json(maybe_json_content)
return calendar_event
except Exception:
return None
elif schema_name == "valid_math_reasoning":
class Step(BaseModel):
explanation: str
output: str
class MathReasoning(BaseModel):
steps: list[Step]
final_answer: str
try:
math_reasoning = MathReasoning.model_validate_json(maybe_json_content)
return math_reasoning
except Exception:
return None
return None
def validate_structured_output(maybe_json_content: str, schema_name: str) -> None:
structured_output = get_structured_output(maybe_json_content, schema_name)
assert structured_output is not None
if schema_name == "valid_calendar_event":
assert structured_output.name is not None
assert structured_output.date is not None
assert len(structured_output.participants) == 2
elif schema_name == "valid_math_reasoning":
assert len(structured_output.final_answer) > 0
def _accumulate_streaming_tool_calls(stream):
"""Accumulates tool calls and content from a streaming ChatCompletion response."""
tool_calls_buffer = {}
current_id = None
full_content = "" # Initialize content accumulator
# Process streaming chunks
for chunk in stream:
choice = chunk.choices[0]
delta = choice.delta
# Accumulate content
if delta.content:
full_content += delta.content
if delta.tool_calls is None:
continue
for tool_call_delta in delta.tool_calls:
if tool_call_delta.id:
current_id = tool_call_delta.id
call_id = current_id
# Skip if no ID seen yet for this tool call delta
if not call_id:
continue
func_delta = tool_call_delta.function
if call_id not in tool_calls_buffer:
tool_calls_buffer[call_id] = {
"id": call_id,
"type": "function", # Assume function type
"function": {"name": None, "arguments": ""}, # Nested structure
}
# Accumulate name and arguments into the nested function dict
if func_delta:
if func_delta.name:
tool_calls_buffer[call_id]["function"]["name"] = func_delta.name
if func_delta.arguments:
tool_calls_buffer[call_id]["function"]["arguments"] += func_delta.arguments
# Return content and tool calls as a list
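    # Illustrative result (IDs and argument strings depend on the model's stream):
    #   ("", [{"id": "call_0", "type": "function",
    #          "function": {"name": "get_weather", "arguments": '{"location": "San Francisco"}'}}])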
return full_content, list(tool_calls_buffer.values())

File diff suppressed because one or more lines are too long

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because one or more lines are too long