mirror of
https://github.com/meta-llama/llama-stack.git
synced 2025-10-04 12:07:34 +00:00
chore(test): migrate unit tests from unittest to pytest nvidia test eval
Signed-off-by: Mustafa Elbehery <melbeher@redhat.com>
This commit is contained in:
parent
ed418653ec
commit
352d990176
1 changed file with 178 additions and 153 deletions
|
@ -5,7 +5,6 @@
|
||||||
# the root directory of this source tree.
|
# the root directory of this source tree.
|
||||||
|
|
||||||
import os
|
import os
|
||||||
import unittest
|
|
||||||
from unittest.mock import MagicMock, patch
|
from unittest.mock import MagicMock, patch
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
|
@ -13,6 +12,8 @@ import pytest
|
||||||
from llama_stack.apis.benchmarks import Benchmark
|
from llama_stack.apis.benchmarks import Benchmark
|
||||||
from llama_stack.apis.common.job_types import Job, JobStatus
|
from llama_stack.apis.common.job_types import Job, JobStatus
|
||||||
from llama_stack.apis.eval.eval import BenchmarkConfig, EvaluateResponse, ModelCandidate, SamplingParams
|
from llama_stack.apis.eval.eval import BenchmarkConfig, EvaluateResponse, ModelCandidate, SamplingParams
|
||||||
|
from llama_stack.apis.inference.inference import TopPSamplingStrategy
|
||||||
|
from llama_stack.apis.resource import ResourceType
|
||||||
from llama_stack.models.llama.sku_types import CoreModelId
|
from llama_stack.models.llama.sku_types import CoreModelId
|
||||||
from llama_stack.providers.remote.eval.nvidia.config import NVIDIAEvalConfig
|
from llama_stack.providers.remote.eval.nvidia.config import NVIDIAEvalConfig
|
||||||
from llama_stack.providers.remote.eval.nvidia.eval import NVIDIAEvalImpl
|
from llama_stack.providers.remote.eval.nvidia.eval import NVIDIAEvalImpl
|
||||||
|
@ -21,181 +22,205 @@ MOCK_DATASET_ID = "default/test-dataset"
|
||||||
MOCK_BENCHMARK_ID = "test-benchmark"
|
MOCK_BENCHMARK_ID = "test-benchmark"
|
||||||
|
|
||||||
|
|
||||||
class TestNVIDIAEvalImpl(unittest.TestCase):
|
@pytest.fixture
|
||||||
def setUp(self):
|
def nvidia_eval_setup():
|
||||||
os.environ["NVIDIA_EVALUATOR_URL"] = "http://nemo.test"
|
"""Set up the NVIDIA eval implementation with mocked dependencies."""
|
||||||
|
os.environ["NVIDIA_EVALUATOR_URL"] = "http://nemo.test"
|
||||||
|
|
||||||
# Create mock APIs
|
# Create mock APIs
|
||||||
self.datasetio_api = MagicMock()
|
datasetio_api = MagicMock()
|
||||||
self.datasets_api = MagicMock()
|
datasets_api = MagicMock()
|
||||||
self.scoring_api = MagicMock()
|
scoring_api = MagicMock()
|
||||||
self.inference_api = MagicMock()
|
inference_api = MagicMock()
|
||||||
self.agents_api = MagicMock()
|
agents_api = MagicMock()
|
||||||
|
|
||||||
self.config = NVIDIAEvalConfig(
|
config = NVIDIAEvalConfig(
|
||||||
evaluator_url=os.environ["NVIDIA_EVALUATOR_URL"],
|
evaluator_url=os.environ["NVIDIA_EVALUATOR_URL"],
|
||||||
)
|
)
|
||||||
|
|
||||||
self.eval_impl = NVIDIAEvalImpl(
|
eval_impl = NVIDIAEvalImpl(
|
||||||
config=self.config,
|
config=config,
|
||||||
datasetio_api=self.datasetio_api,
|
datasetio_api=datasetio_api,
|
||||||
datasets_api=self.datasets_api,
|
datasets_api=datasets_api,
|
||||||
scoring_api=self.scoring_api,
|
scoring_api=scoring_api,
|
||||||
inference_api=self.inference_api,
|
inference_api=inference_api,
|
||||||
agents_api=self.agents_api,
|
agents_api=agents_api,
|
||||||
)
|
)
|
||||||
|
|
||||||
# Mock the HTTP request methods
|
# Mock the HTTP request methods
|
||||||
self.evaluator_get_patcher = patch(
|
with (
|
||||||
"llama_stack.providers.remote.eval.nvidia.eval.NVIDIAEvalImpl._evaluator_get"
|
patch("llama_stack.providers.remote.eval.nvidia.eval.NVIDIAEvalImpl._evaluator_get") as mock_evaluator_get,
|
||||||
)
|
patch("llama_stack.providers.remote.eval.nvidia.eval.NVIDIAEvalImpl._evaluator_post") as mock_evaluator_post,
|
||||||
self.evaluator_post_patcher = patch(
|
):
|
||||||
"llama_stack.providers.remote.eval.nvidia.eval.NVIDIAEvalImpl._evaluator_post"
|
yield {
|
||||||
)
|
"eval_impl": eval_impl,
|
||||||
|
"mock_evaluator_get": mock_evaluator_get,
|
||||||
self.mock_evaluator_get = self.evaluator_get_patcher.start()
|
"mock_evaluator_post": mock_evaluator_post,
|
||||||
self.mock_evaluator_post = self.evaluator_post_patcher.start()
|
"datasetio_api": datasetio_api,
|
||||||
|
"datasets_api": datasets_api,
|
||||||
def tearDown(self):
|
"scoring_api": scoring_api,
|
||||||
"""Clean up after each test."""
|
"inference_api": inference_api,
|
||||||
self.evaluator_get_patcher.stop()
|
"agents_api": agents_api,
|
||||||
self.evaluator_post_patcher.stop()
|
|
||||||
|
|
||||||
def _assert_request_body(self, expected_json):
|
|
||||||
"""Helper method to verify request body in Evaluator POST request is correct"""
|
|
||||||
call_args = self.mock_evaluator_post.call_args
|
|
||||||
actual_json = call_args[0][1]
|
|
||||||
|
|
||||||
# Check that all expected keys contain the expected values in the actual JSON
|
|
||||||
for key, value in expected_json.items():
|
|
||||||
assert key in actual_json, f"Key '{key}' missing in actual JSON"
|
|
||||||
|
|
||||||
if isinstance(value, dict):
|
|
||||||
for nested_key, nested_value in value.items():
|
|
||||||
assert nested_key in actual_json[key], f"Nested key '{nested_key}' missing in actual JSON['{key}']"
|
|
||||||
assert actual_json[key][nested_key] == nested_value, f"Value mismatch for '{key}.{nested_key}'"
|
|
||||||
else:
|
|
||||||
assert actual_json[key] == value, f"Value mismatch for '{key}'"
|
|
||||||
|
|
||||||
@pytest.fixture(autouse=True)
|
|
||||||
def inject_fixtures(self, run_async):
|
|
||||||
self.run_async = run_async
|
|
||||||
|
|
||||||
def test_register_benchmark(self):
|
|
||||||
eval_config = {
|
|
||||||
"type": "custom",
|
|
||||||
"params": {"parallelism": 8},
|
|
||||||
"tasks": {
|
|
||||||
"qa": {
|
|
||||||
"type": "completion",
|
|
||||||
"params": {"template": {"prompt": "{{prompt}}", "max_tokens": 200}},
|
|
||||||
"dataset": {"files_url": f"hf://datasets/{MOCK_DATASET_ID}/testing/testing.jsonl"},
|
|
||||||
"metrics": {"bleu": {"type": "bleu", "params": {"references": ["{{ideal_response}}"]}}},
|
|
||||||
}
|
|
||||||
},
|
|
||||||
}
|
}
|
||||||
|
|
||||||
benchmark = Benchmark(
|
|
||||||
provider_id="nvidia",
|
|
||||||
type="benchmark",
|
|
||||||
identifier=MOCK_BENCHMARK_ID,
|
|
||||||
dataset_id=MOCK_DATASET_ID,
|
|
||||||
scoring_functions=["basic::equality"],
|
|
||||||
metadata=eval_config,
|
|
||||||
)
|
|
||||||
|
|
||||||
# Mock Evaluator API response
|
def _assert_request_body(mock_evaluator_post, expected_json):
|
||||||
mock_evaluator_response = {"id": MOCK_BENCHMARK_ID, "status": "created"}
|
"""Helper method to verify request body in Evaluator POST request is correct"""
|
||||||
self.mock_evaluator_post.return_value = mock_evaluator_response
|
call_args = mock_evaluator_post.call_args
|
||||||
|
actual_json = call_args[0][1]
|
||||||
|
|
||||||
# Register the benchmark
|
# Check that all expected keys contain the expected values in the actual JSON
|
||||||
self.run_async(self.eval_impl.register_benchmark(benchmark))
|
for key, value in expected_json.items():
|
||||||
|
assert key in actual_json, f"Key '{key}' missing in actual JSON"
|
||||||
|
|
||||||
# Verify the Evaluator API was called correctly
|
if isinstance(value, dict):
|
||||||
self.mock_evaluator_post.assert_called_once()
|
for nested_key, nested_value in value.items():
|
||||||
self._assert_request_body({"namespace": benchmark.provider_id, "name": benchmark.identifier, **eval_config})
|
assert nested_key in actual_json[key], f"Nested key '{nested_key}' missing in actual JSON['{key}']"
|
||||||
|
assert actual_json[key][nested_key] == nested_value, f"Value mismatch for '{key}.{nested_key}'"
|
||||||
|
else:
|
||||||
|
assert actual_json[key] == value, f"Value mismatch for '{key}'"
|
||||||
|
|
||||||
def test_run_eval(self):
|
|
||||||
benchmark_config = BenchmarkConfig(
|
|
||||||
eval_candidate=ModelCandidate(
|
|
||||||
type="model",
|
|
||||||
model=CoreModelId.llama3_1_8b_instruct.value,
|
|
||||||
sampling_params=SamplingParams(max_tokens=100, temperature=0.7),
|
|
||||||
)
|
|
||||||
)
|
|
||||||
|
|
||||||
# Mock Evaluator API response
|
@pytest.mark.asyncio
|
||||||
mock_evaluator_response = {"id": "job-123", "status": "created"}
|
async def test_register_benchmark(nvidia_eval_setup):
|
||||||
self.mock_evaluator_post.return_value = mock_evaluator_response
|
eval_impl = nvidia_eval_setup["eval_impl"]
|
||||||
|
mock_evaluator_post = nvidia_eval_setup["mock_evaluator_post"]
|
||||||
|
|
||||||
# Run the Evaluation job
|
eval_config = {
|
||||||
result = self.run_async(
|
"type": "custom",
|
||||||
self.eval_impl.run_eval(benchmark_id=MOCK_BENCHMARK_ID, benchmark_config=benchmark_config)
|
"params": {"parallelism": 8},
|
||||||
)
|
"tasks": {
|
||||||
|
"qa": {
|
||||||
# Verify the Evaluator API was called correctly
|
"type": "completion",
|
||||||
self.mock_evaluator_post.assert_called_once()
|
"params": {"template": {"prompt": "{{prompt}}", "max_tokens": 200}},
|
||||||
self._assert_request_body(
|
"dataset": {"files_url": f"hf://datasets/{MOCK_DATASET_ID}/testing/testing.jsonl"},
|
||||||
{
|
"metrics": {"bleu": {"type": "bleu", "params": {"references": ["{{ideal_response}}"]}}},
|
||||||
"config": f"nvidia/{MOCK_BENCHMARK_ID}",
|
|
||||||
"target": {"type": "model", "model": "meta/llama-3.1-8b-instruct"},
|
|
||||||
}
|
}
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
benchmark = Benchmark(
|
||||||
|
provider_id="nvidia",
|
||||||
|
type=ResourceType.benchmark,
|
||||||
|
identifier=MOCK_BENCHMARK_ID,
|
||||||
|
dataset_id=MOCK_DATASET_ID,
|
||||||
|
scoring_functions=["basic::equality"],
|
||||||
|
metadata=eval_config,
|
||||||
|
)
|
||||||
|
|
||||||
|
# Mock Evaluator API response
|
||||||
|
mock_evaluator_response = {"id": MOCK_BENCHMARK_ID, "status": "created"}
|
||||||
|
mock_evaluator_post.return_value = mock_evaluator_response
|
||||||
|
|
||||||
|
# Register the benchmark
|
||||||
|
await eval_impl.register_benchmark(benchmark)
|
||||||
|
|
||||||
|
# Verify the Evaluator API was called correctly
|
||||||
|
mock_evaluator_post.assert_called_once()
|
||||||
|
_assert_request_body(
|
||||||
|
mock_evaluator_post, {"namespace": benchmark.provider_id, "name": benchmark.identifier, **eval_config}
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
|
||||||
|
async def test_run_eval(nvidia_eval_setup):
|
||||||
|
eval_impl = nvidia_eval_setup["eval_impl"]
|
||||||
|
mock_evaluator_post = nvidia_eval_setup["mock_evaluator_post"]
|
||||||
|
|
||||||
|
benchmark_config = BenchmarkConfig(
|
||||||
|
eval_candidate=ModelCandidate(
|
||||||
|
type="model",
|
||||||
|
model=CoreModelId.llama3_1_8b_instruct.value,
|
||||||
|
sampling_params=SamplingParams(max_tokens=100, strategy=TopPSamplingStrategy(temperature=0.7)),
|
||||||
)
|
)
|
||||||
|
)
|
||||||
|
|
||||||
# Verify the result
|
# Mock Evaluator API response
|
||||||
assert isinstance(result, Job)
|
mock_evaluator_response = {"id": "job-123", "status": "created"}
|
||||||
assert result.job_id == "job-123"
|
mock_evaluator_post.return_value = mock_evaluator_response
|
||||||
assert result.status == JobStatus.in_progress
|
|
||||||
|
|
||||||
def test_job_status(self):
|
# Run the Evaluation job
|
||||||
# Mock Evaluator API response
|
result = await eval_impl.run_eval(benchmark_id=MOCK_BENCHMARK_ID, benchmark_config=benchmark_config)
|
||||||
mock_evaluator_response = {"id": "job-123", "status": "completed"}
|
|
||||||
self.mock_evaluator_get.return_value = mock_evaluator_response
|
|
||||||
|
|
||||||
# Get the Evaluation job
|
# Verify the Evaluator API was called correctly
|
||||||
result = self.run_async(self.eval_impl.job_status(benchmark_id=MOCK_BENCHMARK_ID, job_id="job-123"))
|
mock_evaluator_post.assert_called_once()
|
||||||
|
_assert_request_body(
|
||||||
|
mock_evaluator_post,
|
||||||
|
{
|
||||||
|
"config": f"nvidia/{MOCK_BENCHMARK_ID}",
|
||||||
|
"target": {"type": "model", "model": "meta/llama-3.1-8b-instruct"},
|
||||||
|
},
|
||||||
|
)
|
||||||
|
|
||||||
# Verify the result
|
# Verify the result
|
||||||
assert isinstance(result, Job)
|
assert isinstance(result, Job)
|
||||||
assert result.job_id == "job-123"
|
assert result.job_id == "job-123"
|
||||||
assert result.status == JobStatus.completed
|
assert result.status == JobStatus.in_progress
|
||||||
|
|
||||||
# Verify the API was called correctly
|
|
||||||
self.mock_evaluator_get.assert_called_once_with(f"/v1/evaluation/jobs/{result.job_id}")
|
|
||||||
|
|
||||||
def test_job_cancel(self):
|
@pytest.mark.asyncio
|
||||||
# Mock Evaluator API response
|
async def test_job_status(nvidia_eval_setup):
|
||||||
mock_evaluator_response = {"id": "job-123", "status": "cancelled"}
|
eval_impl = nvidia_eval_setup["eval_impl"]
|
||||||
self.mock_evaluator_post.return_value = mock_evaluator_response
|
mock_evaluator_get = nvidia_eval_setup["mock_evaluator_get"]
|
||||||
|
|
||||||
# Cancel the Evaluation job
|
# Mock Evaluator API response
|
||||||
self.run_async(self.eval_impl.job_cancel(benchmark_id=MOCK_BENCHMARK_ID, job_id="job-123"))
|
mock_evaluator_response = {"id": "job-123", "status": "completed"}
|
||||||
|
mock_evaluator_get.return_value = mock_evaluator_response
|
||||||
|
|
||||||
# Verify the API was called correctly
|
# Get the Evaluation job
|
||||||
self.mock_evaluator_post.assert_called_once_with("/v1/evaluation/jobs/job-123/cancel", {})
|
result = await eval_impl.job_status(benchmark_id=MOCK_BENCHMARK_ID, job_id="job-123")
|
||||||
|
|
||||||
def test_job_result(self):
|
# Verify the result
|
||||||
# Mock Evaluator API responses
|
assert isinstance(result, Job)
|
||||||
mock_job_status_response = {"id": "job-123", "status": "completed"}
|
assert result.job_id == "job-123"
|
||||||
mock_job_results_response = {
|
assert result.status == JobStatus.completed
|
||||||
"id": "job-123",
|
|
||||||
"status": "completed",
|
|
||||||
"results": {MOCK_BENCHMARK_ID: {"score": 0.85, "details": {"accuracy": 0.85, "f1": 0.84}}},
|
|
||||||
}
|
|
||||||
self.mock_evaluator_get.side_effect = [
|
|
||||||
mock_job_status_response, # First call to retrieve job
|
|
||||||
mock_job_results_response, # Second call to retrieve job results
|
|
||||||
]
|
|
||||||
|
|
||||||
# Get the Evaluation job results
|
# Verify the API was called correctly
|
||||||
result = self.run_async(self.eval_impl.job_result(benchmark_id=MOCK_BENCHMARK_ID, job_id="job-123"))
|
mock_evaluator_get.assert_called_once_with(f"/v1/evaluation/jobs/{result.job_id}")
|
||||||
|
|
||||||
# Verify the result
|
|
||||||
assert isinstance(result, EvaluateResponse)
|
|
||||||
assert MOCK_BENCHMARK_ID in result.scores
|
|
||||||
assert result.scores[MOCK_BENCHMARK_ID].aggregated_results["results"][MOCK_BENCHMARK_ID]["score"] == 0.85
|
|
||||||
|
|
||||||
# Verify the API was called correctly
|
@pytest.mark.asyncio
|
||||||
assert self.mock_evaluator_get.call_count == 2
|
async def test_job_cancel(nvidia_eval_setup):
|
||||||
self.mock_evaluator_get.assert_any_call("/v1/evaluation/jobs/job-123")
|
eval_impl = nvidia_eval_setup["eval_impl"]
|
||||||
self.mock_evaluator_get.assert_any_call("/v1/evaluation/jobs/job-123/results")
|
mock_evaluator_post = nvidia_eval_setup["mock_evaluator_post"]
|
||||||
|
|
||||||
|
# Mock Evaluator API response
|
||||||
|
mock_evaluator_response = {"id": "job-123", "status": "cancelled"}
|
||||||
|
mock_evaluator_post.return_value = mock_evaluator_response
|
||||||
|
|
||||||
|
# Cancel the Evaluation job
|
||||||
|
await eval_impl.job_cancel(benchmark_id=MOCK_BENCHMARK_ID, job_id="job-123")
|
||||||
|
|
||||||
|
# Verify the API was called correctly
|
||||||
|
mock_evaluator_post.assert_called_once_with("/v1/evaluation/jobs/job-123/cancel", {})
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
|
||||||
|
async def test_job_result(nvidia_eval_setup):
|
||||||
|
eval_impl = nvidia_eval_setup["eval_impl"]
|
||||||
|
mock_evaluator_get = nvidia_eval_setup["mock_evaluator_get"]
|
||||||
|
|
||||||
|
# Mock Evaluator API responses
|
||||||
|
mock_job_status_response = {"id": "job-123", "status": "completed"}
|
||||||
|
mock_job_results_response = {
|
||||||
|
"id": "job-123",
|
||||||
|
"status": "completed",
|
||||||
|
"results": {MOCK_BENCHMARK_ID: {"score": 0.85, "details": {"accuracy": 0.85, "f1": 0.84}}},
|
||||||
|
}
|
||||||
|
mock_evaluator_get.side_effect = [
|
||||||
|
mock_job_status_response, # First call to retrieve job
|
||||||
|
mock_job_results_response, # Second call to retrieve job results
|
||||||
|
]
|
||||||
|
|
||||||
|
# Get the Evaluation job results
|
||||||
|
result = await eval_impl.job_result(benchmark_id=MOCK_BENCHMARK_ID, job_id="job-123")
|
||||||
|
|
||||||
|
# Verify the result
|
||||||
|
assert isinstance(result, EvaluateResponse)
|
||||||
|
assert MOCK_BENCHMARK_ID in result.scores
|
||||||
|
assert result.scores[MOCK_BENCHMARK_ID].aggregated_results["results"][MOCK_BENCHMARK_ID]["score"] == 0.85
|
||||||
|
|
||||||
|
# Verify the API was called correctly
|
||||||
|
assert mock_evaluator_get.call_count == 2
|
||||||
|
mock_evaluator_get.assert_any_call("/v1/evaluation/jobs/job-123")
|
||||||
|
mock_evaluator_get.assert_any_call("/v1/evaluation/jobs/job-123/results")
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue