diff --git a/docs/_static/llama-stack-spec.html b/docs/_static/llama-stack-spec.html
index a036e5dc0..9ddb070d7 100644
--- a/docs/_static/llama-stack-spec.html
+++ b/docs/_static/llama-stack-spec.html
@@ -1380,6 +1380,40 @@
}
}
]
+ },
+ "delete": {
+ "responses": {
+ "200": {
+ "description": "OK"
+ },
+ "400": {
+ "$ref": "#/components/responses/BadRequest400"
+ },
+ "429": {
+ "$ref": "#/components/responses/TooManyRequests429"
+ },
+ "500": {
+ "$ref": "#/components/responses/InternalServerError500"
+ },
+ "default": {
+ "$ref": "#/components/responses/DefaultError"
+ }
+ },
+ "tags": [
+ "Benchmarks"
+ ],
+ "description": "Unregister a benchmark.",
+ "parameters": [
+ {
+ "name": "benchmark_id",
+ "in": "path",
+ "description": "The ID of the benchmark to unregister.",
+ "required": true,
+ "schema": {
+ "type": "string"
+ }
+ }
+ ]
}
},
"/v1/openai/v1/chat/completions/{completion_id}": {
@@ -1620,6 +1654,40 @@
}
}
]
+ },
+ "delete": {
+ "responses": {
+ "200": {
+ "description": "OK"
+ },
+ "400": {
+ "$ref": "#/components/responses/BadRequest400"
+ },
+ "429": {
+ "$ref": "#/components/responses/TooManyRequests429"
+ },
+ "500": {
+ "$ref": "#/components/responses/InternalServerError500"
+ },
+ "default": {
+ "$ref": "#/components/responses/DefaultError"
+ }
+ },
+ "tags": [
+ "ScoringFunctions"
+ ],
+ "description": "Unregister a scoring function.",
+ "parameters": [
+ {
+ "name": "scoring_fn_id",
+ "in": "path",
+ "description": "The ID of the scoring function to unregister.",
+ "required": true,
+ "schema": {
+ "type": "string"
+ }
+ }
+ ]
}
},
"/v1/shields/{identifier}": {
diff --git a/docs/_static/llama-stack-spec.yaml b/docs/_static/llama-stack-spec.yaml
index 8ed04c1f8..94dc5c0f9 100644
--- a/docs/_static/llama-stack-spec.yaml
+++ b/docs/_static/llama-stack-spec.yaml
@@ -954,6 +954,30 @@ paths:
required: true
schema:
type: string
+ delete:
+ responses:
+ '200':
+ description: OK
+ '400':
+ $ref: '#/components/responses/BadRequest400'
+ '429':
+ $ref: >-
+ #/components/responses/TooManyRequests429
+ '500':
+ $ref: >-
+ #/components/responses/InternalServerError500
+ default:
+ $ref: '#/components/responses/DefaultError'
+ tags:
+ - Benchmarks
+ description: Unregister a benchmark.
+ parameters:
+ - name: benchmark_id
+ in: path
+ description: The ID of the benchmark to unregister.
+ required: true
+ schema:
+ type: string
/v1/openai/v1/chat/completions/{completion_id}:
get:
responses:
@@ -1119,6 +1143,31 @@ paths:
required: true
schema:
type: string
+ delete:
+ responses:
+ '200':
+ description: OK
+ '400':
+ $ref: '#/components/responses/BadRequest400'
+ '429':
+ $ref: >-
+ #/components/responses/TooManyRequests429
+ '500':
+ $ref: >-
+ #/components/responses/InternalServerError500
+ default:
+ $ref: '#/components/responses/DefaultError'
+ tags:
+ - ScoringFunctions
+ description: Unregister a scoring function.
+ parameters:
+ - name: scoring_fn_id
+ in: path
+ description: >-
+ The ID of the scoring function to unregister.
+ required: true
+ schema:
+ type: string
/v1/shields/{identifier}:
get:
responses:
diff --git a/llama_stack/apis/benchmarks/benchmarks.py b/llama_stack/apis/benchmarks/benchmarks.py
index 706eaed6c..8d0a25e7b 100644
--- a/llama_stack/apis/benchmarks/benchmarks.py
+++ b/llama_stack/apis/benchmarks/benchmarks.py
@@ -93,3 +93,11 @@ class Benchmarks(Protocol):
:param metadata: The metadata to use for the benchmark.
"""
...
+
+ @webmethod(route="/eval/benchmarks/{benchmark_id}", method="DELETE")
+ async def unregister_benchmark(self, benchmark_id: str) -> None:
+ """Unregister a benchmark.
+
+ :param benchmark_id: The ID of the benchmark to unregister.
+ """
+ ...
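Reviewer note: with this webmethod, a running stack exposes `DELETE /v1/eval/benchmarks/{benchmark_id}`, matching the spec changes above. A minimal HTTP smoke-test sketch; the base URL, port, and benchmark id are assumptions, as is the `{"data": [...]}` list shape:

```python
import requests

BASE_URL = "http://localhost:8321"  # assumed server address; adjust for your deployment
BENCHMARK_ID = "my-benchmark"       # hypothetical id, assumed to be registered already

# Exercise the new route: DELETE /v1/eval/benchmarks/{benchmark_id}
resp = requests.delete(f"{BASE_URL}/v1/eval/benchmarks/{BENCHMARK_ID}", timeout=30)
resp.raise_for_status()  # the spec above declares 200 on success

# The benchmark should then be absent from the listing.
listing = requests.get(f"{BASE_URL}/v1/eval/benchmarks", timeout=30).json()
assert all(b["identifier"] != BENCHMARK_ID for b in listing["data"])
```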
diff --git a/llama_stack/apis/scoring_functions/scoring_functions.py b/llama_stack/apis/scoring_functions/scoring_functions.py
index 05b6325b7..541067766 100644
--- a/llama_stack/apis/scoring_functions/scoring_functions.py
+++ b/llama_stack/apis/scoring_functions/scoring_functions.py
@@ -197,3 +197,11 @@ class ScoringFunctions(Protocol):
:param params: The parameters for the scoring function for benchmark eval, these can be overridden for app eval.
"""
...
+
+ @webmethod(route="/scoring-functions/{scoring_fn_id:path}", method="DELETE")
+ async def unregister_scoring_function(self, scoring_fn_id: str) -> None:
+ """Unregister a scoring function.
+
+ :param scoring_fn_id: The ID of the scoring function to unregister.
+ """
+ ...
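Note the `{scoring_fn_id:path}` converter: scoring function identifiers are namespaced with `::` (e.g. `basic::equality`, used in the tests below), and the `:path` suffix lets such ids occupy the final route segment. A sketch of building the DELETE URL client-side; the server address is an assumption:

```python
from urllib.parse import quote

BASE_URL = "http://localhost:8321"  # assumed server address
scoring_fn_id = "basic::equality"   # namespaced id, as in the tests below

# quote() with ':' marked safe leaves the namespace separator intact while
# defensively percent-encoding anything else unusual in the id.
url = f"{BASE_URL}/v1/scoring-functions/{quote(scoring_fn_id, safe=':')}"
print(url)  # -> http://localhost:8321/v1/scoring-functions/basic::equality
```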
diff --git a/llama_stack/core/routing_tables/benchmarks.py b/llama_stack/core/routing_tables/benchmarks.py
index c875dee5b..8c87d395d 100644
--- a/llama_stack/core/routing_tables/benchmarks.py
+++ b/llama_stack/core/routing_tables/benchmarks.py
@@ -56,3 +56,7 @@ class BenchmarksRoutingTable(CommonRoutingTableImpl, Benchmarks):
provider_resource_id=provider_benchmark_id,
)
await self.register_object(benchmark)
+
+ async def unregister_benchmark(self, benchmark_id: str) -> None:
+ existing_benchmark = await self.get_benchmark(benchmark_id)
+ await self.unregister_object(existing_benchmark)
diff --git a/llama_stack/core/routing_tables/common.py b/llama_stack/core/routing_tables/common.py
index e523746d8..ca2f3af42 100644
--- a/llama_stack/core/routing_tables/common.py
+++ b/llama_stack/core/routing_tables/common.py
@@ -64,6 +64,10 @@ async def unregister_object_from_provider(obj: RoutableObject, p: Any) -> None:
return await p.unregister_shield(obj.identifier)
elif api == Api.datasetio:
return await p.unregister_dataset(obj.identifier)
+ elif api == Api.eval:
+ return await p.unregister_benchmark(obj.identifier)
+ elif api == Api.scoring:
+ return await p.unregister_scoring_function(obj.identifier)
elif api == Api.tool_runtime:
return await p.unregister_toolgroup(obj.identifier)
else:
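With this dispatch in place, eval providers are expected to implement `unregister_benchmark` and scoring providers `unregister_scoring_function`, as the inline and remote providers below now do. For third-party providers, a minimal in-memory sketch of the two hooks; class and attribute names here are hypothetical, not code from this PR:

```python
# Hypothetical provider stubs showing the hooks the dispatcher above calls;
# they mirror the test Impl classes further down in this diff.
class MyEvalProvider:
    def __init__(self) -> None:
        self.benchmarks: dict[str, object] = {}

    async def unregister_benchmark(self, benchmark_id: str) -> None:
        # The routing table resolves the object before dispatching here,
        # so treating an unknown id as a no-op is safe.
        self.benchmarks.pop(benchmark_id, None)


class MyScoringProvider:
    def __init__(self) -> None:
        self.scoring_fns: dict[str, object] = {}

    async def unregister_scoring_function(self, scoring_fn_id: str) -> None:
        self.scoring_fns.pop(scoring_fn_id, None)
```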
diff --git a/llama_stack/core/routing_tables/scoring_functions.py b/llama_stack/core/routing_tables/scoring_functions.py
index 71e5bed63..520f07014 100644
--- a/llama_stack/core/routing_tables/scoring_functions.py
+++ b/llama_stack/core/routing_tables/scoring_functions.py
@@ -60,3 +60,7 @@ class ScoringFunctionsRoutingTable(CommonRoutingTableImpl, ScoringFunctions):
)
scoring_fn.provider_id = provider_id
await self.register_object(scoring_fn)
+
+ async def unregister_scoring_function(self, scoring_fn_id: str) -> None:
+ existing_scoring_fn = await self.get_scoring_function(scoring_fn_id)
+ await self.unregister_object(existing_scoring_fn)
diff --git a/llama_stack/providers/inline/eval/meta_reference/eval.py b/llama_stack/providers/inline/eval/meta_reference/eval.py
index 9ae2018c4..a03e8951c 100644
--- a/llama_stack/providers/inline/eval/meta_reference/eval.py
+++ b/llama_stack/providers/inline/eval/meta_reference/eval.py
@@ -75,6 +75,13 @@ class MetaReferenceEvalImpl(
)
         self.benchmarks[task_def.identifier] = task_def
 
+    async def unregister_benchmark(self, benchmark_id: str) -> None:
+ if benchmark_id in self.benchmarks:
+ del self.benchmarks[benchmark_id]
+
+ key = f"{EVAL_TASKS_PREFIX}{benchmark_id}"
+ await self.kvstore.delete(key)
+
async def run_eval(
self,
benchmark_id: str,
diff --git a/llama_stack/providers/inline/scoring/llm_as_judge/scoring.py b/llama_stack/providers/inline/scoring/llm_as_judge/scoring.py
index fd651877c..9b7628524 100644
--- a/llama_stack/providers/inline/scoring/llm_as_judge/scoring.py
+++ b/llama_stack/providers/inline/scoring/llm_as_judge/scoring.py
@@ -63,6 +63,9 @@ class LlmAsJudgeScoringImpl(
async def register_scoring_function(self, function_def: ScoringFn) -> None:
         self.llm_as_judge_fn.register_scoring_fn_def(function_def)
 
+    async def unregister_scoring_function(self, scoring_fn_id: str) -> None:
+ self.llm_as_judge_fn.unregister_scoring_fn_def(scoring_fn_id)
+
async def score_batch(
self,
dataset_id: str,
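`unregister_scoring_fn_def` is the registry-side inverse of `register_scoring_fn_def`. A shape sketch only; the dict attribute `supported_fn_defs_registry` is an assumption about the scoring-fn base class, not code in this diff:

```python
# Sketch, assuming a dict-backed registry on the scoring-fn base class.
class RegisteredScoringFnSketch:
    def __init__(self) -> None:
        self.supported_fn_defs_registry: dict[str, object] = {}

    def register_scoring_fn_def(self, scoring_fn) -> None:
        self.supported_fn_defs_registry[scoring_fn.identifier] = scoring_fn

    def unregister_scoring_fn_def(self, scoring_fn_id: str) -> None:
        if scoring_fn_id not in self.supported_fn_defs_registry:
            raise ValueError(f"Scoring function {scoring_fn_id} is not registered.")
        del self.supported_fn_defs_registry[scoring_fn_id]
```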
diff --git a/llama_stack/providers/remote/eval/nvidia/eval.py b/llama_stack/providers/remote/eval/nvidia/eval.py
index 3572de0ef..a474e78e3 100644
--- a/llama_stack/providers/remote/eval/nvidia/eval.py
+++ b/llama_stack/providers/remote/eval/nvidia/eval.py
@@ -51,18 +51,23 @@ class NVIDIAEvalImpl(
     async def shutdown(self) -> None: ...
 
-    async def _evaluator_get(self, path):
+ async def _evaluator_get(self, path: str):
"""Helper for making GET requests to the evaluator service."""
response = requests.get(url=f"{self.config.evaluator_url}{path}")
response.raise_for_status()
         return response.json()
 
-    async def _evaluator_post(self, path, data):
+ async def _evaluator_post(self, path: str, data: dict[str, Any]):
"""Helper for making POST requests to the evaluator service."""
response = requests.post(url=f"{self.config.evaluator_url}{path}", json=data)
response.raise_for_status()
         return response.json()
 
+    async def _evaluator_delete(self, path: str) -> None:
+ """Helper for making DELETE requests to the evaluator service."""
+ response = requests.delete(url=f"{self.config.evaluator_url}{path}")
+ response.raise_for_status()
+
async def register_benchmark(self, task_def: Benchmark) -> None:
"""Register a benchmark as an evaluation configuration."""
await self._evaluator_post(
@@ -75,6 +80,10 @@ class NVIDIAEvalImpl(
},
         )
 
+    async def unregister_benchmark(self, benchmark_id: str) -> None:
+ """Unregister a benchmark evaluation configuration from NeMo Evaluator."""
+ await self._evaluator_delete(f"/v1/evaluation/configs/{DEFAULT_NAMESPACE}/{benchmark_id}")
+
async def run_eval(
self,
benchmark_id: str,
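For clarity, `unregister_benchmark` deletes the evaluation config stored under the default namespace in NeMo Evaluator. A tiny sketch of the resulting request, with assumed values; the `nvidia` namespace matches the unit-test expectation below:

```python
# Sketch of the DELETE that unregister_benchmark issues; values are stand-ins.
DEFAULT_NAMESPACE = "nvidia"                  # matches the unit test below
evaluator_url = "http://nemo-evaluator.test"  # stand-in for config.evaluator_url
benchmark_id = "my-benchmark"                 # hypothetical benchmark id

path = f"/v1/evaluation/configs/{DEFAULT_NAMESPACE}/{benchmark_id}"
print(f"DELETE {evaluator_url}{path}")
# -> DELETE http://nemo-evaluator.test/v1/evaluation/configs/nvidia/my-benchmark
```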
diff --git a/tests/integration/scoring/test_scoring.py b/tests/integration/scoring/test_scoring.py
index 315ff050c..1112f9164 100644
--- a/tests/integration/scoring/test_scoring.py
+++ b/tests/integration/scoring/test_scoring.py
@@ -9,6 +9,7 @@ from pathlib import Path
import pandas as pd
import pytest
+import requests
 
 
 @pytest.fixture
@@ -77,7 +78,46 @@ def test_scoring_functions_register(
assert len(list_response) > 0
     assert any(x.identifier == sample_scoring_fn_id for x in list_response)
 
-    # TODO: add unregister api for scoring functions
+
+def test_scoring_functions_unregister(
+ llama_stack_client,
+ sample_scoring_fn_id,
+ judge_model_id,
+ sample_judge_prompt_template,
+):
+ llm_as_judge_provider = [
+ x
+ for x in llama_stack_client.providers.list()
+ if x.api == "scoring" and x.provider_type == "inline::llm-as-judge"
+ ]
+ if len(llm_as_judge_provider) == 0:
+ pytest.skip("No llm-as-judge provider found, cannot test unregister")
+
+ llm_as_judge_provider_id = llm_as_judge_provider[0].provider_id
+
+ # Register first
+ register_scoring_function(
+ llama_stack_client,
+ llm_as_judge_provider_id,
+ sample_scoring_fn_id,
+ judge_model_id,
+ sample_judge_prompt_template,
+ )
+
+ # Ensure it is present
+ list_response = llama_stack_client.scoring_functions.list()
+ assert any(x.identifier == sample_scoring_fn_id for x in list_response)
+
+ # Unregister scoring fn
+ try:
+ base_url = llama_stack_client.base_url
+ except AttributeError:
+ pytest.skip("No server base_url available; cannot test HTTP unregister in library mode")
+
+ resp = requests.delete(f"{base_url}/v1/scoring-functions/{sample_scoring_fn_id}", timeout=30)
+ assert resp.status_code in (200, 204)
+ list_after = llama_stack_client.scoring_functions.list()
+ assert all(x.identifier != sample_scoring_fn_id for x in list_after)
 
 
 @pytest.mark.parametrize("scoring_fn_id", ["basic::equality"])
diff --git a/tests/unit/distribution/routers/test_routing_tables.py b/tests/unit/distribution/routers/test_routing_tables.py
index 1ceee81c6..bbfea3f46 100644
--- a/tests/unit/distribution/routers/test_routing_tables.py
+++ b/tests/unit/distribution/routers/test_routing_tables.py
@@ -105,6 +105,9 @@ class ScoringFunctionsImpl(Impl):
async def register_scoring_function(self, scoring_fn):
         return scoring_fn
 
+    async def unregister_scoring_function(self, scoring_fn_id: str):
+ return scoring_fn_id
+
 
 class BenchmarksImpl(Impl):
def __init__(self):
@@ -113,6 +116,9 @@ class BenchmarksImpl(Impl):
async def register_benchmark(self, benchmark):
         return benchmark
 
+    async def unregister_benchmark(self, benchmark_id: str):
+ return benchmark_id
+
 
 class ToolGroupsImpl(Impl):
def __init__(self):
@@ -330,6 +336,13 @@ async def test_scoring_functions_routing_table(cached_disk_dist_registry):
assert "test-scoring-fn" in scoring_fn_ids
     assert "test-scoring-fn-2" in scoring_fn_ids
 
+    # Unregister scoring functions and verify the listing is empty afterwards
+    for scoring_fn in scoring_functions.data:
+        await table.unregister_scoring_function(scoring_fn.identifier)
+
+ scoring_functions_list_after_deletion = await table.list_scoring_functions()
+ assert len(scoring_functions_list_after_deletion.data) == 0
+
 
 async def test_benchmarks_routing_table(cached_disk_dist_registry):
table = BenchmarksRoutingTable({"test_provider": BenchmarksImpl()}, cached_disk_dist_registry, {})
@@ -347,6 +360,15 @@ async def test_benchmarks_routing_table(cached_disk_dist_registry):
benchmark_ids = {b.identifier for b in benchmarks.data}
     assert "test-benchmark" in benchmark_ids
 
+    # Unregister the benchmark and verify removal
+ await table.unregister_benchmark(benchmark_id="test-benchmark")
+ benchmarks_after = await table.list_benchmarks()
+ assert len(benchmarks_after.data) == 0
+
+ # Unregistering a non-existent benchmark should raise a clear error
+ with pytest.raises(ValueError, match="Benchmark 'dummy_benchmark' not found"):
+ await table.unregister_benchmark(benchmark_id="dummy_benchmark")
+
 
 async def test_tool_groups_routing_table(cached_disk_dist_registry):
table = ToolGroupsRoutingTable({"test_provider": ToolGroupsImpl()}, cached_disk_dist_registry, {})
diff --git a/tests/unit/providers/nvidia/test_eval.py b/tests/unit/providers/nvidia/test_eval.py
index 584ca2101..2bdcbbeba 100644
--- a/tests/unit/providers/nvidia/test_eval.py
+++ b/tests/unit/providers/nvidia/test_eval.py
@@ -52,14 +52,19 @@ class TestNVIDIAEvalImpl(unittest.TestCase):
self.evaluator_post_patcher = patch(
"llama_stack.providers.remote.eval.nvidia.eval.NVIDIAEvalImpl._evaluator_post"
)
+ self.evaluator_delete_patcher = patch(
+ "llama_stack.providers.remote.eval.nvidia.eval.NVIDIAEvalImpl._evaluator_delete"
+        )
 
         self.mock_evaluator_get = self.evaluator_get_patcher.start()
self.mock_evaluator_post = self.evaluator_post_patcher.start()
+        self.mock_evaluator_delete = self.evaluator_delete_patcher.start()
 
     def tearDown(self):
"""Clean up after each test."""
self.evaluator_get_patcher.stop()
self.evaluator_post_patcher.stop()
+        self.evaluator_delete_patcher.stop()
 
     def _assert_request_body(self, expected_json):
"""Helper method to verify request body in Evaluator POST request is correct"""
@@ -115,6 +120,13 @@ class TestNVIDIAEvalImpl(unittest.TestCase):
self.mock_evaluator_post.assert_called_once()
         self._assert_request_body({"namespace": benchmark.provider_id, "name": benchmark.identifier, **eval_config})
 
+    def test_unregister_benchmark(self):
+ # Unregister the benchmark
+ self.run_async(self.eval_impl.unregister_benchmark(benchmark_id=MOCK_BENCHMARK_ID))
+
+ # Verify the Evaluator API was called correctly
+ self.mock_evaluator_delete.assert_called_once_with(f"/v1/evaluation/configs/nvidia/{MOCK_BENCHMARK_ID}")
+
def test_run_eval(self):
benchmark_config = BenchmarkConfig(
eval_candidate=ModelCandidate(