Mirror of https://github.com/meta-llama/llama-stack.git (synced 2025-10-04 12:07:34 +00:00)
feat: create HTTP DELETE API endpoints to unregister ScoringFn and Benchmark resources in Llama Stack (#3371)
# What does this PR do?
This PR provides functionality for users to unregister ScoringFn and Benchmark resources for the `scoring` and `eval` APIs.

Closes #3051

## Test Plan
Updated integration and unit tests via the CI workflow.
Parent: 01bdcce4d2
Commit: ab321739f2
13 changed files with 241 additions and 3 deletions
```diff
@@ -93,3 +93,11 @@ class Benchmarks(Protocol):
         :param metadata: The metadata to use for the benchmark.
         """
         ...
+
+    @webmethod(route="/eval/benchmarks/{benchmark_id}", method="DELETE")
+    async def unregister_benchmark(self, benchmark_id: str) -> None:
+        """Unregister a benchmark.
+
+        :param benchmark_id: The ID of the benchmark to unregister.
+        """
+        ...
```
```diff
@@ -197,3 +197,11 @@ class ScoringFunctions(Protocol):
         :param params: The parameters for the scoring function for benchmark eval, these can be overridden for app eval.
         """
         ...
+
+    @webmethod(route="/scoring-functions/{scoring_fn_id:path}", method="DELETE")
+    async def unregister_scoring_function(self, scoring_fn_id: str) -> None:
+        """Unregister a scoring function.
+
+        :param scoring_fn_id: The ID of the scoring function to unregister.
+        """
+        ...
```
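Taken together, the two hunks above expose `DELETE` routes for both resource types. Below is a minimal sketch of exercising them over plain HTTP against a locally running stack server; the base URL, any API-version prefix your deployment adds, and the resource IDs are illustrative assumptions, not part of this diff:

```python
import requests

# Assumed local server address; adjust host/port (and any API version prefix)
# to match your deployment.
BASE_URL = "http://localhost:8321"

# Remove a previously registered benchmark.
resp = requests.delete(f"{BASE_URL}/eval/benchmarks/my-benchmark-id")
resp.raise_for_status()

# Remove a previously registered scoring function. The `{scoring_fn_id:path}`
# converter in the route presumably allows identifiers containing characters
# that a plain path segment would reject.
resp = requests.delete(f"{BASE_URL}/scoring-functions/my-provider::my-scoring-fn")
resp.raise_for_status()
```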
```diff
@@ -56,3 +56,7 @@ class BenchmarksRoutingTable(CommonRoutingTableImpl, Benchmarks):
             provider_resource_id=provider_benchmark_id,
         )
         await self.register_object(benchmark)
+
+    async def unregister_benchmark(self, benchmark_id: str) -> None:
+        existing_benchmark = await self.get_benchmark(benchmark_id)
+        await self.unregister_object(existing_benchmark)
```
```diff
@@ -64,6 +64,10 @@ async def unregister_object_from_provider(obj: RoutableObject, p: Any) -> None:
         return await p.unregister_shield(obj.identifier)
     elif api == Api.datasetio:
         return await p.unregister_dataset(obj.identifier)
+    elif api == Api.eval:
+        return await p.unregister_benchmark(obj.identifier)
+    elif api == Api.scoring:
+        return await p.unregister_scoring_function(obj.identifier)
     elif api == Api.tool_runtime:
         return await p.unregister_toolgroup(obj.identifier)
     else:
```
```diff
@@ -60,3 +60,7 @@ class ScoringFunctionsRoutingTable(CommonRoutingTableImpl, ScoringFunctions):
         )
         scoring_fn.provider_id = provider_id
         await self.register_object(scoring_fn)
+
+    async def unregister_scoring_function(self, scoring_fn_id: str) -> None:
+        existing_scoring_fn = await self.get_scoring_function(scoring_fn_id)
+        await self.unregister_object(existing_scoring_fn)
```
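Both routing tables follow the existing shield/dataset pattern: look up the registered object by ID, then pass it to `unregister_object`, which reaches the provider through the dispatch shown above. A rough unit-test shape for that delegation is sketched below; the import path and the `AsyncMock` stand-in for the routing table are assumptions for illustration, not the tests added in this PR:

```python
from unittest.mock import AsyncMock

import pytest

# Assumed import path; adjust to wherever BenchmarksRoutingTable lives in your checkout.
from llama_stack.core.routing_tables.benchmarks import BenchmarksRoutingTable


@pytest.mark.asyncio
async def test_unregister_benchmark_delegates_to_unregister_object():
    # Stand-in routing table: only the new "look up, then unregister" logic is
    # exercised, so every collaborator is mocked.
    table = AsyncMock()

    # Call the real (unbound) coroutine with the stand-in as `self`.
    await BenchmarksRoutingTable.unregister_benchmark(table, "my-benchmark-id")

    table.get_benchmark.assert_awaited_once_with("my-benchmark-id")
    table.unregister_object.assert_awaited_once_with(table.get_benchmark.return_value)
```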
```diff
@@ -75,6 +75,13 @@ class MetaReferenceEvalImpl(
         )
         self.benchmarks[task_def.identifier] = task_def
 
+    async def unregister_benchmark(self, benchmark_id: str) -> None:
+        if benchmark_id in self.benchmarks:
+            del self.benchmarks[benchmark_id]
+
+        key = f"{EVAL_TASKS_PREFIX}{benchmark_id}"
+        await self.kvstore.delete(key)
+
     async def run_eval(
         self,
         benchmark_id: str,
```
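The meta-reference implementation removes both the in-memory entry and the persisted record, and the kvstore key mirrors the one written by `register_benchmark`. A rough shape for a unit test of that behaviour follows; the import path and the bare-bones stand-in object are illustrative assumptions:

```python
from unittest.mock import AsyncMock, MagicMock

import pytest

# Assumed import path for the inline meta-reference eval provider.
from llama_stack.providers.inline.eval.meta_reference.eval import (
    EVAL_TASKS_PREFIX,
    MetaReferenceEvalImpl,
)


@pytest.mark.asyncio
async def test_unregister_benchmark_clears_memory_and_kvstore():
    impl = MagicMock()
    impl.benchmarks = {"my-benchmark-id": object()}
    impl.kvstore = AsyncMock()

    # Invoke the real coroutine with the stand-in as `self`.
    await MetaReferenceEvalImpl.unregister_benchmark(impl, "my-benchmark-id")

    assert "my-benchmark-id" not in impl.benchmarks
    impl.kvstore.delete.assert_awaited_once_with(f"{EVAL_TASKS_PREFIX}my-benchmark-id")
```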
```diff
@@ -63,6 +63,9 @@ class LlmAsJudgeScoringImpl(
     async def register_scoring_function(self, function_def: ScoringFn) -> None:
         self.llm_as_judge_fn.register_scoring_fn_def(function_def)
 
+    async def unregister_scoring_function(self, scoring_fn_id: str) -> None:
+        self.llm_as_judge_fn.unregister_scoring_fn_def(scoring_fn_id)
+
     async def score_batch(
         self,
         dataset_id: str,
```
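The llm-as-judge provider simply forwards the call to its registered scoring-fn object, so the interesting logic lives in `unregister_scoring_fn_def`, which is not shown in this hunk. A plausible shape for that counterpart, assuming the base class keeps a dict registry keyed by scoring-function identifier (class and attribute names below are illustrative, not taken from the diff):

```python
from typing import Any


class IllustrativeScoringFnRegistry:
    """Hypothetical stand-in for the base class the llm-as-judge wrapper delegates to."""

    def __init__(self) -> None:
        # Scoring-function definitions keyed by identifier.
        self.supported_fn_defs_registry: dict[str, Any] = {}

    def register_scoring_fn_def(self, scoring_fn: Any) -> None:
        self.supported_fn_defs_registry[scoring_fn.identifier] = scoring_fn

    def unregister_scoring_fn_def(self, scoring_fn_id: str) -> None:
        # Mirror image of register: drop the definition if it is present.
        self.supported_fn_defs_registry.pop(scoring_fn_id, None)
```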
```diff
@@ -51,18 +51,23 @@ class NVIDIAEvalImpl(
 
     async def shutdown(self) -> None: ...
 
-    async def _evaluator_get(self, path):
+    async def _evaluator_get(self, path: str):
         """Helper for making GET requests to the evaluator service."""
         response = requests.get(url=f"{self.config.evaluator_url}{path}")
         response.raise_for_status()
         return response.json()
 
-    async def _evaluator_post(self, path, data):
+    async def _evaluator_post(self, path: str, data: dict[str, Any]):
         """Helper for making POST requests to the evaluator service."""
         response = requests.post(url=f"{self.config.evaluator_url}{path}", json=data)
         response.raise_for_status()
         return response.json()
 
+    async def _evaluator_delete(self, path: str) -> None:
+        """Helper for making DELETE requests to the evaluator service."""
+        response = requests.delete(url=f"{self.config.evaluator_url}{path}")
+        response.raise_for_status()
+
     async def register_benchmark(self, task_def: Benchmark) -> None:
         """Register a benchmark as an evaluation configuration."""
         await self._evaluator_post(
@@ -75,6 +80,10 @@ class NVIDIAEvalImpl(
             },
         )
 
+    async def unregister_benchmark(self, benchmark_id: str) -> None:
+        """Unregister a benchmark evaluation configuration from NeMo Evaluator."""
+        await self._evaluator_delete(f"/v1/evaluation/configs/{DEFAULT_NAMESPACE}/{benchmark_id}")
+
     async def run_eval(
         self,
         benchmark_id: str,
```
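Because `_evaluator_delete` goes through the module-level `requests` import, the NeMo-side behaviour can be checked without a live Evaluator by patching `requests.delete`. A rough sketch follows; the `nvidia_eval_impl` fixture is hypothetical, and the real coverage for this provider runs in the CI workflow referenced in the test plan:

```python
from unittest.mock import MagicMock, patch

import pytest


@pytest.mark.asyncio
async def test_unregister_benchmark_sends_delete(nvidia_eval_impl):
    # `nvidia_eval_impl` is a hypothetical fixture yielding a configured NVIDIAEvalImpl.
    with patch("requests.delete") as mock_delete:
        mock_delete.return_value = MagicMock(status_code=200)
        await nvidia_eval_impl.unregister_benchmark("my-benchmark-id")

    # The provider composes the NeMo Evaluator config URL from the namespace and benchmark ID.
    called_url = mock_delete.call_args.kwargs["url"]
    assert "/v1/evaluation/configs/" in called_url
    assert called_url.endswith("/my-benchmark-id")
```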