In-progress: e2e notebook with partial Eval integration

2025-07-21 03:59:42 +00:00 · 2025-04-08 14:08:01 -04:00 · 2025-04-08 14:08:01 -04:00 · c04ab0133d
commit c04ab0133d
parent 861962fa80
19 changed files with 832 additions and 624 deletions
--- a/llama_stack/providers/registry/eval.py
+++ b/llama_stack/providers/registry/eval.py
@ -6,7 +6,7 @@

 from typing import List

-from llama_stack.providers.datatypes import Api, InlineProviderSpec, ProviderSpec
+from llama_stack.providers.datatypes import AdapterSpec, Api, InlineProviderSpec, ProviderSpec, remote_provider_spec


 def available_providers() -> List[ProviderSpec]:
@ -25,4 +25,22 @@ def available_providers() -> List[ProviderSpec]:
                Api.agents,
            ],
        ),
+        remote_provider_spec(
+            api=Api.eval,
+            adapter=AdapterSpec(
+                adapter_type="nvidia",
+                pip_packages=[
+                    "requests",
+                ],
+                module="llama_stack.providers.remote.eval.nvidia",
+                config_class="llama_stack.providers.remote.eval.nvidia.NVIDIAEvalConfig",
+            ),
+            api_dependencies=[
+                Api.datasetio,
+                Api.datasets,
+                Api.scoring,
+                Api.inference,
+                Api.agents,
+            ],
+        ),
    ]
--- a/llama_stack/providers/remote/eval/__init__.py
+++ b/llama_stack/providers/remote/eval/__init__.py
@ -0,0 +1,5 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
--- a/llama_stack/providers/remote/eval/nvidia/README.md
+++ b/llama_stack/providers/remote/eval/nvidia/README.md
@ -0,0 +1,126 @@
+# NVIDIA NeMo Evaluator Eval Provider
+
+
+## Overview
+
+For the first integration, Benchmarks are mapped to Evaluation Configs in NeMo Evaluator. The full evaluation config object is provided as part of the benchmark's `metadata`. The `dataset_id` and `scoring_functions` fields are not used.
+
+Below are a few examples of how to register a benchmark (which in turn creates an evaluation config in NeMo Evaluator) and how to trigger an evaluation.
+
+### Example for registering an academic benchmark
+
+```
+POST /eval/benchmarks
+```
+```json
+{
+  "benchmark_id": "mmlu",
+  "dataset_id": "",
+  "scoring_functions": [],
+  "metadata": {
+    "type": "mmlu"
+  }
+}
+```
+
+### Example for registering a custom evaluation
+
+```
+POST /eval/benchmarks
+```
+```json
+{
+  "benchmark_id": "my-custom-benchmark",
+  "dataset_id": "",
+  "scoring_functions": [],
+  "metadata": {
+    "type": "custom",
+    "params": {
+      "parallelism": 8
+    },
+    "tasks": {
+      "qa": {
+        "type": "completion",
+        "params": {
+          "template": {
+            "prompt": "{{prompt}}",
+            "max_tokens": 200
+          }
+        },
+        "dataset": {
+          "files_url": "hf://datasets/default/sample-basic-test/testing/testing.jsonl"
+        },
+        "metrics": {
+          "bleu": {
+            "type": "bleu",
+            "params": {
+              "references": [
+                "{{ideal_response}}"
+              ]
+            }
+          }
+        }
+      }
+    }
+  }
+}
+```
+
+### Example for triggering a benchmark/custom evaluation
+
+```
+POST /eval/benchmarks/{benchmark_id}/jobs
+```
+```json
+{
+  "benchmark_id": "my-custom-benchmark",
+  "benchmark_config": {
+    "eval_candidate": {
+      "type": "model",
+      "model": "meta/llama-3.1-8b-instruct",
+      "sampling_params": {
+        "max_tokens": 100,
+        "temperature": 0.7
+      }
+    },
+    "scoring_params": {}
+  }
+}
+```
+
+Response example:
+```json
+{
+    "job_id": "1234",
+    "status": "in_progress"
+}
+```
+
+### Example for getting the status of a job
+```
+GET /eval/benchmarks/{benchmark_id}/jobs/{job_id}
+```
+
+### Example for cancelling a job
+```
+POST /eval/benchmarks/{benchmark_id}/jobs/{job_id}/cancel
+```
+
+### Example for getting the results
+```
+GET /eval/benchmarks/{benchmark_id}/jobs/{job_id}/result
+```
+```json
+{
+  "generations": [],
+  "scores": {
+    "{benchmark_id}": {
+      "score_rows": [],
+      "aggregated_results": {
+        "tasks": {},
+        "groups": {}
+      }
+    }
+  }
+}
+```
--- a/llama_stack/providers/remote/eval/nvidia/__init__.py
+++ b/llama_stack/providers/remote/eval/nvidia/__init__.py
@ -0,0 +1,31 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+from typing import Any, Dict
+
+from llama_stack.distribution.datatypes import Api
+
+from .config import NVIDIAEvalConfig
+
+
+async def get_adapter_impl(config: NVIDIAEvalConfig, deps: Dict[Api, Any]):
+    """Create the NVIDIA eval adapter, initialize it, and return it.
+
+    ``deps`` must contain the datasetio, datasets, scoring, inference and agents
+    implementations; each one is looked up by its ``Api`` key and handed to the
+    adapter positionally, in that order.
+    """
+    # Deferred import: the implementation module is only loaded when an adapter is built.
+    from .eval import NVIDIAEvalImpl
+
+    wanted = (Api.datasetio, Api.datasets, Api.scoring, Api.inference, Api.agents)
+    resolved = [deps[api] for api in wanted]
+
+    adapter = NVIDIAEvalImpl(config, *resolved)
+    await adapter.initialize()
+    return adapter
+
+
+__all__ = ["get_adapter_impl", "NVIDIAEvalConfig"]  # only names bound at module level may be listed
--- a/llama_stack/providers/remote/eval/nvidia/config.py
+++ b/llama_stack/providers/remote/eval/nvidia/config.py
@ -0,0 +1,29 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+import os
+from typing import Any, Dict
+
+from pydantic import BaseModel, Field
+
+
+class NVIDIAEvalConfig(BaseModel):
+    """
+    Configuration for the NVIDIA NeMo Evaluator microservice endpoint.
+
+    Attributes:
+        evaluator_service_url (str): Evaluator base URL; defaults to $NVIDIA_EVALUATOR_URL, else http://0.0.0.0:7331.
+    """
+
+    evaluator_service_url: str = Field(
+        default_factory=lambda: os.getenv("NVIDIA_EVALUATOR_URL", "http://0.0.0.0:7331"),
+        description="The url for accessing the evaluator service",
+    )
+
+    @classmethod
+    def sample_run_config(cls, **kwargs) -> Dict[str, Any]:  # kwargs are accepted but not used here
+        return {  # the "${env.VAR:default}" placeholder is returned verbatim, not resolved
+            "evaluator_service_url": "${env.NVIDIA_EVALUATOR_URL:http://localhost:7331}",
+        }
--- a/llama_stack/providers/remote/eval/nvidia/eval.py
+++ b/llama_stack/providers/remote/eval/nvidia/eval.py
@ -0,0 +1,147 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+from typing import Any, Dict, List
+
+import requests
+
+from llama_stack.apis.agents import Agents
+from llama_stack.apis.benchmarks import Benchmark
+from llama_stack.apis.datasetio import DatasetIO
+from llama_stack.apis.datasets import Datasets
+from llama_stack.apis.inference import Inference
+from llama_stack.apis.scoring import Scoring, ScoringResult
+from llama_stack.providers.datatypes import BenchmarksProtocolPrivate
+
+from .....apis.common.job_types import Job, JobStatus
+from .....apis.eval.eval import BenchmarkConfig, Eval, EvaluateResponse
+from .config import NVIDIAEvalConfig
+
+DEFAULT_NAMESPACE = "nvidia"
+
+
+class NVIDIAEvalImpl(Eval, BenchmarksProtocolPrivate):
+    """Eval provider that forwards benchmark and job calls to the NeMo Evaluator REST service."""
+
+    def __init__(
+        self,
+        config: NVIDIAEvalConfig,
+        datasetio_api: DatasetIO,
+        datasets_api: Datasets,
+        scoring_api: Scoring,
+        inference_api: Inference,
+        agents_api: Agents,
+    ) -> None:
+        self.config = config
+        self.datasetio_api = datasetio_api
+        self.datasets_api = datasets_api
+        self.scoring_api = scoring_api
+        self.inference_api = inference_api
+        self.agents_api = agents_api
+
+    async def initialize(self) -> None: ...
+
+    async def shutdown(self) -> None: ...
+
+    def _evaluator_url(self, path: str) -> str:
+        """Join the configured base URL and ``path`` with exactly one "/" between them."""
+        return f"{self.config.evaluator_service_url.rstrip('/')}/{path.lstrip('/')}"
+
+    async def _evaluator_get(self, path):
+        """GET ``path`` on the evaluator service and return the decoded JSON body.
+
+        NOTE(review): ``requests`` is synchronous, so the event loop is blocked while the call runs.
+        """
+        response = requests.get(url=self._evaluator_url(path))
+        response.raise_for_status()
+        return response.json()
+
+    async def _evaluator_post(self, path, data):
+        """POST ``data`` as JSON to ``path`` on the evaluator service and return the decoded JSON body."""
+        response = requests.post(url=self._evaluator_url(path), json=data)
+        response.raise_for_status()
+        return response.json()
+
+    async def register_benchmark(self, task_def: Benchmark) -> None:
+        """Register a benchmark as an evaluation configuration in the ``DEFAULT_NAMESPACE`` namespace."""
+        await self._evaluator_post(
+            "/v1/evaluation/configs",
+            {
+                "namespace": DEFAULT_NAMESPACE,
+                "name": task_def.benchmark_id,
+                # metadata is copied to request body as-is (its keys override the two above)
+                **task_def.metadata,
+            },
+        )
+
+    async def run_eval(self, benchmark_id: str, benchmark_config: BenchmarkConfig) -> Job:
+        """Start an evaluation job for a registered benchmark and return it as in-progress."""
+        candidate = benchmark_config.eval_candidate
+        # Model candidates carry the model directly; other candidates nest it under ``config``.
+        model = candidate.model if candidate.type == "model" else candidate.config.model
+        result = await self._evaluator_post(
+            "/v1/evaluation/jobs",
+            {
+                "config": f"{DEFAULT_NAMESPACE}/{benchmark_id}",
+                "target": {"type": "model", "model": model},
+            },
+        )
+
+        return Job(job_id=result["id"], status=JobStatus.in_progress)
+
+    async def evaluate_rows(
+        self,
+        benchmark_id: str,
+        input_rows: List[Dict[str, Any]],
+        scoring_functions: List[str],
+        benchmark_config: BenchmarkConfig,
+    ) -> EvaluateResponse:
+        """Row-level evaluation is not implemented by this provider; always raises."""
+        raise NotImplementedError()
+
+    async def job_status(self, benchmark_id: str, job_id: str) -> Job:
+        """Get the status of an evaluation job, mapped onto a ``JobStatus``.
+
+        EvaluatorStatus: "created", "pending", "running", "cancelled", "cancelling", "failed", "completed".
+        JobStatus: "scheduled", "in_progress", "completed", "cancelled", "failed"
+        """
+        result = await self._evaluator_get(f"/v1/evaluation/jobs/{job_id}")
+        result_status = result["status"]
+
+        job_status = JobStatus.failed  # fallback: "failed", "cancelling" and any unlisted status
+        if result_status in ["created", "pending"]:
+            job_status = JobStatus.scheduled
+        elif result_status in ["running"]:
+            job_status = JobStatus.in_progress
+        elif result_status in ["completed"]:
+            job_status = JobStatus.completed
+        elif result_status in ["cancelled"]:
+            job_status = JobStatus.cancelled
+
+        return Job(job_id=job_id, status=job_status)
+
+    async def job_cancel(self, benchmark_id: str, job_id: str) -> None:
+        """Cancel the evaluation job."""
+        await self._evaluator_post(f"/v1/evaluation/jobs/{job_id}/cancel", {})
+
+    async def job_result(self, benchmark_id: str, job_id: str) -> EvaluateResponse:
+        """Return the results of a completed evaluation job; raise ``ValueError`` if it has not completed."""
+        status = (await self.job_status(benchmark_id, job_id)).status
+        if status != JobStatus.completed:
+            # getattr keeps the message usable even when status has no ``.value`` (e.g. None).
+            raise ValueError(f"Job {job_id} not completed. Status: {getattr(status, 'value', status)}")
+
+        result = await self._evaluator_get(f"/v1/evaluation/jobs/{job_id}/results")
+
+        return EvaluateResponse(
+            # TODO: these are stored in detailed results on NeMo Evaluator side; can be added
+            generations=[],
+            scores={
+                benchmark_id: ScoringResult(
+                    score_rows=[],
+                    aggregated_results=result,
+                )
+            },
+        )
--- a/llama_stack/providers/remote/post_training/nvidia/post_training.py
+++ b/llama_stack/providers/remote/post_training/nvidia/post_training.py
@ -95,7 +95,9 @@ class NvidiaPostTrainingAdapter(ModelRegistryHelper):

        for _ in range(self.config.max_retries):
            # TODO: Remove `verify_ssl=False`. Added for testing purposes to call NMP int environment from `docs/notebooks/nvidia/`
-            async with self.session.request(method, url, params=params, json=json, verify_ssl=False, **kwargs) as response:
+            async with self.session.request(
+                method, url, params=params, json=json, verify_ssl=False, **kwargs
+            ) as response:
                if response.status >= 400:
                    error_data = await response.json()
                    raise Exception(f"API request failed: {error_data}")
--- a/llama_stack/templates/dependencies.json
+++ b/llama_stack/templates/dependencies.json
@ -437,12 +437,10 @@
    "aiosqlite",
    "blobfile",
    "chardet",
-    "emoji",
    "faiss-cpu",
    "fastapi",
    "fire",
    "httpx",
-    "langdetect",
    "matplotlib",
    "nltk",
    "numpy",
@ -454,7 +452,6 @@
    "psycopg2-binary",
    "pymongo",
    "pypdf",
-    "pythainlp",
    "redis",
    "requests",
    "scikit-learn",
@ -462,7 +459,6 @@
    "sentencepiece",
    "tqdm",
    "transformers",
-    "tree_sitter",
    "uvicorn"
  ],
  "ollama": [
--- a/llama_stack/templates/nvidia/build.yaml
+++ b/llama_stack/templates/nvidia/build.yaml
@ -1,6 +1,6 @@
 version: '2'
 distribution_spec:
-  description: Use NVIDIA NIM for running LLM inference and safety
+  description: Use NVIDIA NIM for running LLM inference, evaluation and safety
  providers:
    inference:
    - remote::nvidia
@ -13,7 +13,7 @@ distribution_spec:
    telemetry:
    - inline::meta-reference
    eval:
-    - inline::meta-reference
+    - remote::nvidia
    post_training:
    - remote::nvidia
    datasetio:
--- a/llama_stack/templates/nvidia/nvidia.py
+++ b/llama_stack/templates/nvidia/nvidia.py
@ -7,6 +7,7 @@
 from pathlib import Path

 from llama_stack.distribution.datatypes import ModelInput, Provider, ShieldInput, ToolGroupInput
+from llama_stack.providers.remote.eval.nvidia import NVIDIAEvalConfig
 from llama_stack.providers.remote.inference.nvidia import NVIDIAConfig
 from llama_stack.providers.remote.inference.nvidia.models import MODEL_ENTRIES
 from llama_stack.providers.remote.safety.nvidia import NVIDIASafetyConfig
@ -20,7 +21,7 @@ def get_distribution_template() -> DistributionTemplate:
        "safety": ["remote::nvidia"],
        "agents": ["inline::meta-reference"],
        "telemetry": ["inline::meta-reference"],
-        "eval": ["inline::meta-reference"],
+        "eval": ["remote::nvidia"],
        "post_training": ["remote::nvidia"],
        "datasetio": ["inline::localfs"],
        "scoring": ["inline::basic"],
@ -37,6 +38,11 @@ def get_distribution_template() -> DistributionTemplate:
        provider_type="remote::nvidia",
        config=NVIDIASafetyConfig.sample_run_config(),
    )
+    eval_provider = Provider(
+        provider_id="nvidia",
+        provider_type="remote::nvidia",
+        config=NVIDIAEvalConfig.sample_run_config(),
+    )
    inference_model = ModelInput(
        model_id="${env.INFERENCE_MODEL}",
        provider_id="nvidia",
@ -60,7 +66,7 @@ def get_distribution_template() -> DistributionTemplate:
    return DistributionTemplate(
        name="nvidia",
        distro_type="remote_hosted",
-        description="Use NVIDIA NIM for running LLM inference and safety",
+        description="Use NVIDIA NIM for running LLM inference, evaluation and safety",
        container_image=None,
        template_path=Path(__file__).parent / "doc_template.md",
        providers=providers,
@ -69,6 +75,7 @@ def get_distribution_template() -> DistributionTemplate:
            "run.yaml": RunConfigSettings(
                provider_overrides={
                    "inference": [inference_provider],
+                    "eval": [eval_provider],
                },
                default_models=default_models,
                default_tool_groups=default_tool_groups,
@ -78,7 +85,8 @@ def get_distribution_template() -> DistributionTemplate:
                    "inference": [
                        inference_provider,
                        safety_provider,
-                    ]
+                    ],
+                    "eval": [eval_provider],
                },
                default_models=[inference_model, safety_model],
                default_shields=[ShieldInput(shield_id="${env.SAFETY_MODEL}", provider_id="nvidia")],
@ -119,6 +127,10 @@ def get_distribution_template() -> DistributionTemplate:
                "http://0.0.0.0:7331",
                "URL for the NeMo Guardrails Service",
            ),
+            "NVIDIA_EVALUATOR_URL": (
+                "http://0.0.0.0:7331",
+                "URL for the NeMo Evaluator Service",
+            ),
            "INFERENCE_MODEL": (
                "Llama3.1-8B-Instruct",
                "Inference model",
--- a/llama_stack/templates/nvidia/run-with-safety.yaml
+++ b/llama_stack/templates/nvidia/run-with-safety.yaml
@ -53,13 +53,10 @@ providers:
      sinks: ${env.TELEMETRY_SINKS:console,sqlite}
      sqlite_db_path: ${env.SQLITE_DB_PATH:~/.llama/distributions/nvidia/trace_store.db}
  eval:
-  - provider_id: meta-reference
-    provider_type: inline::meta-reference
+  - provider_id: nvidia
+    provider_type: remote::nvidia
    config:
-      kvstore:
-        type: sqlite
-        namespace: null
-        db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/nvidia}/meta_reference_eval.db
+      evaluator_service_url: ${env.NVIDIA_EVALUATOR_URL:http://localhost:7331}
  post_training:
  - provider_id: nvidia
    provider_type: remote::nvidia
--- a/llama_stack/templates/nvidia/run.yaml
+++ b/llama_stack/templates/nvidia/run.yaml
@ -48,13 +48,10 @@ providers:
      sinks: ${env.TELEMETRY_SINKS:console,sqlite}
      sqlite_db_path: ${env.SQLITE_DB_PATH:~/.llama/distributions/nvidia/trace_store.db}
  eval:
-  - provider_id: meta-reference
-    provider_type: inline::meta-reference
+  - provider_id: nvidia
+    provider_type: remote::nvidia
    config:
-      kvstore:
-        type: sqlite
-        namespace: null
-        db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/nvidia}/meta_reference_eval.db
+      evaluator_service_url: ${env.NVIDIA_EVALUATOR_URL:http://localhost:7331}
  post_training:
  - provider_id: nvidia
    provider_type: remote::nvidia