llama-stack-mirror/tests/unit/providers/nvidia/test_eval.py
Charlie Doern 840ad75fe9
feat: split API and provider specs into separate llama-stack-api pkg (#3895)
# What does this PR do?

Extract API definitions and provider specifications into a standalone
llama-stack-api package that can be published to PyPI independently of
the main llama-stack server.


see: https://github.com/llamastack/llama-stack/pull/2978 and
https://github.com/llamastack/llama-stack/pull/2978#issuecomment-3145115942

## Motivation

External providers currently import from llama-stack, which overrides
the installed version and causes dependency conflicts. This separation
allows external providers to:

- Install only the type definitions they need without server
dependencies
- Avoid version conflicts with the installed llama-stack package
- Be versioned and released independently

This enables us to re-enable external provider module tests that were
previously blocked by these import conflicts.

## Changes

- Created llama-stack-api package with minimal dependencies (pydantic,
jsonschema)
- Moved APIs, providers datatypes, strong_typing, and schema_utils
- Updated all imports from llama_stack.* to llama_stack_api.* (see the example after this list)
- Configured local editable install for development workflow
- Updated linting and type-checking configuration for both packages
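
As a concrete illustration (the "before" submodule paths are approximate; only the `llama_stack.*` prefix is certain from this PR), the NVIDIA eval test below now pulls its types from the standalone package:

```python
# Before: types were imported from the server package (approximate pre-split paths)
# from llama_stack.apis.benchmarks import Benchmark
# from llama_stack.apis.eval import BenchmarkConfig, EvaluateResponse, Job, JobStatus

# After: the same types come from the standalone API package
from llama_stack_api import Benchmark, BenchmarkConfig, EvaluateResponse, Job, JobStatus
```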

## Next Steps

- Publish llama-stack-api to PyPI
- Update external provider dependencies
- Re-enable external provider module tests


Precursor PRs to this one:

- #4093 
- #3954 
- #4064 

These PRs moved key pieces _out_ of the API package, limiting the scope of
changes here.


relates to #3237 

## Test Plan

The llama-stack-api package builds successfully and can be imported independently
of llama-stack. All pre-commit hooks pass with the expected exclusions maintained.
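
For example, a minimal smoke check along these lines (hypothetical, not part of the test suite) confirms the package imports on its own:

```python
# Hypothetical smoke check: import the standalone package in an environment
# without the llama-stack server installed and touch a couple of the types
# that external providers would consume.
import llama_stack_api
from llama_stack_api import Benchmark, BenchmarkConfig

print(llama_stack_api.__name__, "imported:", Benchmark.__name__, BenchmarkConfig.__name__)
```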

---------

Signed-off-by: Charlie Doern <cdoern@redhat.com>
2025-11-13 11:51:17 -08:00


# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

import os
from unittest.mock import MagicMock, patch

import pytest

from llama_stack_api import (
    Benchmark,
    BenchmarkConfig,
    EvaluateResponse,
    Job,
    JobStatus,
    ModelCandidate,
    ResourceType,
    SamplingParams,
    TopPSamplingStrategy,
)

from llama_stack.models.llama.sku_types import CoreModelId
from llama_stack.providers.remote.eval.nvidia.config import NVIDIAEvalConfig
from llama_stack.providers.remote.eval.nvidia.eval import NVIDIAEvalImpl

MOCK_DATASET_ID = "default/test-dataset"
MOCK_BENCHMARK_ID = "test-benchmark"


@pytest.fixture
def nvidia_eval_setup():
    """Set up the NVIDIA eval implementation with mocked dependencies."""
    os.environ["NVIDIA_EVALUATOR_URL"] = "http://nemo.test"

    # Create mock APIs
    datasetio_api = MagicMock()
    datasets_api = MagicMock()
    scoring_api = MagicMock()
    inference_api = MagicMock()
    agents_api = MagicMock()

    config = NVIDIAEvalConfig(
        evaluator_url=os.environ["NVIDIA_EVALUATOR_URL"],
    )

    eval_impl = NVIDIAEvalImpl(
        config=config,
        datasetio_api=datasetio_api,
        datasets_api=datasets_api,
        scoring_api=scoring_api,
        inference_api=inference_api,
        agents_api=agents_api,
    )

    # Mock the HTTP request methods
    with (
        patch("llama_stack.providers.remote.eval.nvidia.eval.NVIDIAEvalImpl._evaluator_get") as mock_evaluator_get,
        patch("llama_stack.providers.remote.eval.nvidia.eval.NVIDIAEvalImpl._evaluator_post") as mock_evaluator_post,
    ):
        yield {
            "eval_impl": eval_impl,
            "mock_evaluator_get": mock_evaluator_get,
            "mock_evaluator_post": mock_evaluator_post,
            "datasetio_api": datasetio_api,
            "datasets_api": datasets_api,
            "scoring_api": scoring_api,
            "inference_api": inference_api,
            "agents_api": agents_api,
        }


def _assert_request_body(mock_evaluator_post, expected_json):
    """Verify that the request body of the Evaluator POST request contains the expected values."""
    call_args = mock_evaluator_post.call_args
    actual_json = call_args[0][1]

    # Check that all expected keys contain the expected values in the actual JSON
    for key, value in expected_json.items():
        assert key in actual_json, f"Key '{key}' missing in actual JSON"
        if isinstance(value, dict):
            for nested_key, nested_value in value.items():
                assert nested_key in actual_json[key], f"Nested key '{nested_key}' missing in actual JSON['{key}']"
                assert actual_json[key][nested_key] == nested_value, f"Value mismatch for '{key}.{nested_key}'"
        else:
            assert actual_json[key] == value, f"Value mismatch for '{key}'"


async def test_register_benchmark(nvidia_eval_setup):
    eval_impl = nvidia_eval_setup["eval_impl"]
    mock_evaluator_post = nvidia_eval_setup["mock_evaluator_post"]

    eval_config = {
        "type": "custom",
        "params": {"parallelism": 8},
        "tasks": {
            "qa": {
                "type": "completion",
                "params": {"template": {"prompt": "{{prompt}}", "max_tokens": 200}},
                "dataset": {"files_url": f"hf://datasets/{MOCK_DATASET_ID}/testing/testing.jsonl"},
                "metrics": {"bleu": {"type": "bleu", "params": {"references": ["{{ideal_response}}"]}}},
            }
        },
    }

    benchmark = Benchmark(
        provider_id="nvidia",
        type=ResourceType.benchmark,
        identifier=MOCK_BENCHMARK_ID,
        dataset_id=MOCK_DATASET_ID,
        scoring_functions=["basic::equality"],
        metadata=eval_config,
    )

    # Mock Evaluator API response
    mock_evaluator_response = {"id": MOCK_BENCHMARK_ID, "status": "created"}
    mock_evaluator_post.return_value = mock_evaluator_response

    # Register the benchmark
    await eval_impl.register_benchmark(benchmark)

    # Verify the Evaluator API was called correctly
    mock_evaluator_post.assert_called_once()
    _assert_request_body(
        mock_evaluator_post, {"namespace": benchmark.provider_id, "name": benchmark.identifier, **eval_config}
    )


async def test_run_eval(nvidia_eval_setup):
    eval_impl = nvidia_eval_setup["eval_impl"]
    mock_evaluator_post = nvidia_eval_setup["mock_evaluator_post"]

    benchmark_config = BenchmarkConfig(
        eval_candidate=ModelCandidate(
            type="model",
            model=CoreModelId.llama3_1_8b_instruct.value,
            sampling_params=SamplingParams(max_tokens=100, strategy=TopPSamplingStrategy(temperature=0.7)),
        )
    )

    # Mock Evaluator API response
    mock_evaluator_response = {"id": "job-123", "status": "created"}
    mock_evaluator_post.return_value = mock_evaluator_response

    # Run the Evaluation job
    result = await eval_impl.run_eval(benchmark_id=MOCK_BENCHMARK_ID, benchmark_config=benchmark_config)

    # Verify the Evaluator API was called correctly
    mock_evaluator_post.assert_called_once()
    _assert_request_body(
        mock_evaluator_post,
        {
            "config": f"nvidia/{MOCK_BENCHMARK_ID}",
            "target": {"type": "model", "model": "Llama3.1-8B-Instruct"},
        },
    )

    # Verify the result
    assert isinstance(result, Job)
    assert result.job_id == "job-123"
    assert result.status == JobStatus.in_progress


async def test_job_status(nvidia_eval_setup):
    eval_impl = nvidia_eval_setup["eval_impl"]
    mock_evaluator_get = nvidia_eval_setup["mock_evaluator_get"]

    # Mock Evaluator API response
    mock_evaluator_response = {"id": "job-123", "status": "completed"}
    mock_evaluator_get.return_value = mock_evaluator_response

    # Get the Evaluation job
    result = await eval_impl.job_status(benchmark_id=MOCK_BENCHMARK_ID, job_id="job-123")

    # Verify the result
    assert isinstance(result, Job)
    assert result.job_id == "job-123"
    assert result.status == JobStatus.completed

    # Verify the API was called correctly
    mock_evaluator_get.assert_called_once_with(f"/v1/evaluation/jobs/{result.job_id}")


async def test_job_cancel(nvidia_eval_setup):
    eval_impl = nvidia_eval_setup["eval_impl"]
    mock_evaluator_post = nvidia_eval_setup["mock_evaluator_post"]

    # Mock Evaluator API response
    mock_evaluator_response = {"id": "job-123", "status": "cancelled"}
    mock_evaluator_post.return_value = mock_evaluator_response

    # Cancel the Evaluation job
    await eval_impl.job_cancel(benchmark_id=MOCK_BENCHMARK_ID, job_id="job-123")

    # Verify the API was called correctly
    mock_evaluator_post.assert_called_once_with("/v1/evaluation/jobs/job-123/cancel", {})


async def test_job_result(nvidia_eval_setup):
    eval_impl = nvidia_eval_setup["eval_impl"]
    mock_evaluator_get = nvidia_eval_setup["mock_evaluator_get"]

    # Mock Evaluator API responses
    mock_job_status_response = {"id": "job-123", "status": "completed"}
    mock_job_results_response = {
        "id": "job-123",
        "status": "completed",
        "results": {MOCK_BENCHMARK_ID: {"score": 0.85, "details": {"accuracy": 0.85, "f1": 0.84}}},
    }
    mock_evaluator_get.side_effect = [
        mock_job_status_response,  # First call to retrieve job
        mock_job_results_response,  # Second call to retrieve job results
    ]

    # Get the Evaluation job results
    result = await eval_impl.job_result(benchmark_id=MOCK_BENCHMARK_ID, job_id="job-123")

    # Verify the result
    assert isinstance(result, EvaluateResponse)
    assert MOCK_BENCHMARK_ID in result.scores
    assert result.scores[MOCK_BENCHMARK_ID].aggregated_results["results"][MOCK_BENCHMARK_ID]["score"] == 0.85

    # Verify the API was called correctly
    assert mock_evaluator_get.call_count == 2
    mock_evaluator_get.assert_any_call("/v1/evaluation/jobs/job-123")
    mock_evaluator_get.assert_any_call("/v1/evaluation/jobs/job-123/results")