diff --git a/docs/_static/llama-stack-spec.html b/docs/_static/llama-stack-spec.html
index a036e5dc0..9ddb070d7 100644
--- a/docs/_static/llama-stack-spec.html
+++ b/docs/_static/llama-stack-spec.html
@@ -1380,6 +1380,40 @@
                         }
                     }
                 ]
+            },
+            "delete": {
+                "responses": {
+                    "200": {
+                        "description": "OK"
+                    },
+                    "400": {
+                        "$ref": "#/components/responses/BadRequest400"
+                    },
+                    "429": {
+                        "$ref": "#/components/responses/TooManyRequests429"
+                    },
+                    "500": {
+                        "$ref": "#/components/responses/InternalServerError500"
+                    },
+                    "default": {
+                        "$ref": "#/components/responses/DefaultError"
+                    }
+                },
+                "tags": [
+                    "Benchmarks"
+                ],
+                "description": "Unregister a benchmark.",
+                "parameters": [
+                    {
+                        "name": "benchmark_id",
+                        "in": "path",
+                        "description": "The ID of the benchmark to unregister.",
+                        "required": true,
+                        "schema": {
+                            "type": "string"
+                        }
+                    }
+                ]
             }
         },
         "/v1/openai/v1/chat/completions/{completion_id}": {
@@ -1620,6 +1654,40 @@
                         }
                     }
                 ]
+            },
+            "delete": {
+                "responses": {
+                    "200": {
+                        "description": "OK"
+                    },
+                    "400": {
+                        "$ref": "#/components/responses/BadRequest400"
+                    },
+                    "429": {
+                        "$ref": "#/components/responses/TooManyRequests429"
+                    },
+                    "500": {
+                        "$ref": "#/components/responses/InternalServerError500"
+                    },
+                    "default": {
+                        "$ref": "#/components/responses/DefaultError"
+                    }
+                },
+                "tags": [
+                    "ScoringFunctions"
+                ],
+                "description": "Unregister a scoring function.",
+                "parameters": [
+                    {
+                        "name": "scoring_fn_id",
+                        "in": "path",
+                        "description": "The ID of the scoring function to unregister.",
+                        "required": true,
+                        "schema": {
+                            "type": "string"
+                        }
+                    }
+                ]
             }
         },
         "/v1/shields/{identifier}": {
diff --git a/docs/_static/llama-stack-spec.yaml b/docs/_static/llama-stack-spec.yaml
index 8ed04c1f8..94dc5c0f9 100644
--- a/docs/_static/llama-stack-spec.yaml
+++ b/docs/_static/llama-stack-spec.yaml
@@ -954,6 +954,30 @@ paths:
           required: true
           schema:
             type: string
+    delete:
+      responses:
+        '200':
+          description: OK
+        '400':
+          $ref: '#/components/responses/BadRequest400'
+        '429':
+          $ref: >-
+            #/components/responses/TooManyRequests429
+        '500':
+          $ref: >-
+            #/components/responses/InternalServerError500
+        default:
+          $ref: '#/components/responses/DefaultError'
+      tags:
+        - Benchmarks
+      description: Unregister a benchmark.
+      parameters:
+        - name: benchmark_id
+          in: path
+          description: The ID of the benchmark to unregister.
+          required: true
+          schema:
+            type: string
   /v1/openai/v1/chat/completions/{completion_id}:
     get:
       responses:
@@ -1119,6 +1143,31 @@ paths:
           required: true
          schema:
            type: string
+    delete:
+      responses:
+        '200':
+          description: OK
+        '400':
+          $ref: '#/components/responses/BadRequest400'
+        '429':
+          $ref: >-
+            #/components/responses/TooManyRequests429
+        '500':
+          $ref: >-
+            #/components/responses/InternalServerError500
+        default:
+          $ref: '#/components/responses/DefaultError'
+      tags:
+        - ScoringFunctions
+      description: Unregister a scoring function.
+      parameters:
+        - name: scoring_fn_id
+          in: path
+          description: >-
+            The ID of the scoring function to unregister.
+          required: true
+          schema:
+            type: string
   /v1/shields/{identifier}:
     get:
      responses:
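For orientation, the two routes added to the spec above can be exercised directly over HTTP once a server is running. A minimal sketch using requests; the base URL, default port 8321, and both identifiers are assumptions/placeholders, not values taken from this diff:

import requests

BASE_URL = "http://localhost:8321"  # assumed llama-stack server address (default port assumed)

# DELETE /v1/eval/benchmarks/{benchmark_id} -- unregister a benchmark
requests.delete(f"{BASE_URL}/v1/eval/benchmarks/my-benchmark", timeout=30).raise_for_status()

# DELETE /v1/scoring-functions/{scoring_fn_id} -- unregister a scoring function
# (scoring function IDs may contain "::", e.g. "llm-as-judge::my-fn")
requests.delete(f"{BASE_URL}/v1/scoring-functions/llm-as-judge::my-fn", timeout=30).raise_for_status()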
diff --git a/llama_stack/apis/benchmarks/benchmarks.py b/llama_stack/apis/benchmarks/benchmarks.py
index 706eaed6c..8d0a25e7b 100644
--- a/llama_stack/apis/benchmarks/benchmarks.py
+++ b/llama_stack/apis/benchmarks/benchmarks.py
@@ -93,3 +93,11 @@ class Benchmarks(Protocol):
         :param metadata: The metadata to use for the benchmark.
         """
         ...
+
+    @webmethod(route="/eval/benchmarks/{benchmark_id}", method="DELETE")
+    async def unregister_benchmark(self, benchmark_id: str) -> None:
+        """Unregister a benchmark.
+
+        :param benchmark_id: The ID of the benchmark to unregister.
+        """
+        ...
diff --git a/llama_stack/apis/scoring_functions/scoring_functions.py b/llama_stack/apis/scoring_functions/scoring_functions.py
index 05b6325b7..541067766 100644
--- a/llama_stack/apis/scoring_functions/scoring_functions.py
+++ b/llama_stack/apis/scoring_functions/scoring_functions.py
@@ -197,3 +197,11 @@ class ScoringFunctions(Protocol):
         :param params: The parameters for the scoring function for benchmark eval, these can be overridden for app eval.
         """
         ...
+
+    @webmethod(route="/scoring-functions/{scoring_fn_id:path}", method="DELETE")
+    async def unregister_scoring_function(self, scoring_fn_id: str) -> None:
+        """Unregister a scoring function.
+
+        :param scoring_fn_id: The ID of the scoring function to unregister.
+        """
+        ...
diff --git a/llama_stack/core/routing_tables/benchmarks.py b/llama_stack/core/routing_tables/benchmarks.py
index c875dee5b..8c87d395d 100644
--- a/llama_stack/core/routing_tables/benchmarks.py
+++ b/llama_stack/core/routing_tables/benchmarks.py
@@ -56,3 +56,7 @@ class BenchmarksRoutingTable(CommonRoutingTableImpl, Benchmarks):
             provider_resource_id=provider_benchmark_id,
         )
         await self.register_object(benchmark)
+
+    async def unregister_benchmark(self, benchmark_id: str) -> None:
+        existing_benchmark = await self.get_benchmark(benchmark_id)
+        await self.unregister_object(existing_benchmark)
diff --git a/llama_stack/core/routing_tables/common.py b/llama_stack/core/routing_tables/common.py
index e523746d8..ca2f3af42 100644
--- a/llama_stack/core/routing_tables/common.py
+++ b/llama_stack/core/routing_tables/common.py
@@ -64,6 +64,10 @@ async def unregister_object_from_provider(obj: RoutableObject, p: Any) -> None:
         return await p.unregister_shield(obj.identifier)
     elif api == Api.datasetio:
         return await p.unregister_dataset(obj.identifier)
+    elif api == Api.eval:
+        return await p.unregister_benchmark(obj.identifier)
+    elif api == Api.scoring:
+        return await p.unregister_scoring_function(obj.identifier)
     elif api == Api.tool_runtime:
         return await p.unregister_toolgroup(obj.identifier)
     else:
diff --git a/llama_stack/core/routing_tables/scoring_functions.py b/llama_stack/core/routing_tables/scoring_functions.py
index 71e5bed63..520f07014 100644
--- a/llama_stack/core/routing_tables/scoring_functions.py
+++ b/llama_stack/core/routing_tables/scoring_functions.py
@@ -60,3 +60,7 @@ class ScoringFunctionsRoutingTable(CommonRoutingTableImpl, ScoringFunctions):
         )
         scoring_fn.provider_id = provider_id
         await self.register_object(scoring_fn)
+
+    async def unregister_scoring_function(self, scoring_fn_id: str) -> None:
+        existing_scoring_fn = await self.get_scoring_function(scoring_fn_id)
+        await self.unregister_object(existing_scoring_fn)
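The routing-table methods above resolve the registered object and hand it to unregister_object, which (in CommonRoutingTableImpl, not shown in this diff) is expected to drop the record from the distribution registry and then dispatch to the owning provider through the unregister_object_from_provider helper. A toy, self-contained sketch of that control flow for the benchmark case; these are stand-in classes, not llama-stack's real ones:

import asyncio
from dataclasses import dataclass, field


@dataclass
class FakeEvalProvider:
    """Stands in for an eval provider such as MetaReferenceEvalImpl."""

    benchmarks: dict = field(default_factory=dict)

    async def unregister_benchmark(self, benchmark_id: str) -> None:
        self.benchmarks.pop(benchmark_id, None)


@dataclass
class FakeBenchmarksRoutingTable:
    """Mimics the resolve-then-delegate flow of BenchmarksRoutingTable."""

    provider: FakeEvalProvider
    registry: dict = field(default_factory=dict)

    async def unregister_benchmark(self, benchmark_id: str) -> None:
        if benchmark_id not in self.registry:
            raise ValueError(f"Benchmark '{benchmark_id}' not found")
        # Remove the routing record, then let the provider clean up its own state.
        del self.registry[benchmark_id]
        await self.provider.unregister_benchmark(benchmark_id)


async def main() -> None:
    provider = FakeEvalProvider(benchmarks={"mmlu": object()})
    table = FakeBenchmarksRoutingTable(provider=provider, registry={"mmlu": object()})
    await table.unregister_benchmark("mmlu")
    assert not table.registry and not provider.benchmarks


asyncio.run(main())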
diff --git a/llama_stack/providers/inline/eval/meta_reference/eval.py b/llama_stack/providers/inline/eval/meta_reference/eval.py
index 9ae2018c4..a03e8951c 100644
--- a/llama_stack/providers/inline/eval/meta_reference/eval.py
+++ b/llama_stack/providers/inline/eval/meta_reference/eval.py
@@ -75,6 +75,13 @@ class MetaReferenceEvalImpl(
         )
         self.benchmarks[task_def.identifier] = task_def
 
+    async def unregister_benchmark(self, benchmark_id: str) -> None:
+        if benchmark_id in self.benchmarks:
+            del self.benchmarks[benchmark_id]
+
+        key = f"{EVAL_TASKS_PREFIX}{benchmark_id}"
+        await self.kvstore.delete(key)
+
     async def run_eval(
         self,
         benchmark_id: str,
diff --git a/llama_stack/providers/inline/scoring/llm_as_judge/scoring.py b/llama_stack/providers/inline/scoring/llm_as_judge/scoring.py
index fd651877c..9b7628524 100644
--- a/llama_stack/providers/inline/scoring/llm_as_judge/scoring.py
+++ b/llama_stack/providers/inline/scoring/llm_as_judge/scoring.py
@@ -63,6 +63,9 @@ class LlmAsJudgeScoringImpl(
     async def register_scoring_function(self, function_def: ScoringFn) -> None:
         self.llm_as_judge_fn.register_scoring_fn_def(function_def)
 
+    async def unregister_scoring_function(self, scoring_fn_id: str) -> None:
+        self.llm_as_judge_fn.unregister_scoring_fn_def(scoring_fn_id)
+
     async def score_batch(
         self,
         dataset_id: str,
diff --git a/llama_stack/providers/remote/eval/nvidia/eval.py b/llama_stack/providers/remote/eval/nvidia/eval.py
index 3572de0ef..a474e78e3 100644
--- a/llama_stack/providers/remote/eval/nvidia/eval.py
+++ b/llama_stack/providers/remote/eval/nvidia/eval.py
@@ -51,18 +51,23 @@ class NVIDIAEvalImpl(
 
     async def shutdown(self) -> None: ...
 
-    async def _evaluator_get(self, path):
+    async def _evaluator_get(self, path: str):
         """Helper for making GET requests to the evaluator service."""
         response = requests.get(url=f"{self.config.evaluator_url}{path}")
         response.raise_for_status()
         return response.json()
 
-    async def _evaluator_post(self, path, data):
+    async def _evaluator_post(self, path: str, data: dict[str, Any]):
         """Helper for making POST requests to the evaluator service."""
         response = requests.post(url=f"{self.config.evaluator_url}{path}", json=data)
         response.raise_for_status()
         return response.json()
 
+    async def _evaluator_delete(self, path: str) -> None:
+        """Helper for making DELETE requests to the evaluator service."""
+        response = requests.delete(url=f"{self.config.evaluator_url}{path}")
+        response.raise_for_status()
+
     async def register_benchmark(self, task_def: Benchmark) -> None:
         """Register a benchmark as an evaluation configuration."""
         await self._evaluator_post(
@@ -75,6 +80,10 @@ class NVIDIAEvalImpl(
             },
         )
 
+    async def unregister_benchmark(self, benchmark_id: str) -> None:
+        """Unregister a benchmark evaluation configuration from NeMo Evaluator."""
+        await self._evaluator_delete(f"/v1/evaluation/configs/{DEFAULT_NAMESPACE}/{benchmark_id}")
+
     async def run_eval(
         self,
         benchmark_id: str,
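The llm-as-judge change delegates to unregister_scoring_fn_def, whose implementation is not part of this diff. Assuming the scoring-fn registry keeps definitions in an in-memory dict keyed by identifier (mirroring the register side), the helper would look roughly like the sketch below; the attribute name supported_fn_defs_registry is an assumption, not something this diff confirms:

class ScoringFnDefRegistrySketch:
    """Illustrative only -- not the actual llama-stack base class."""

    def __init__(self) -> None:
        self.supported_fn_defs_registry: dict[str, object] = {}

    def register_scoring_fn_def(self, scoring_fn) -> None:
        self.supported_fn_defs_registry[scoring_fn.identifier] = scoring_fn

    def unregister_scoring_fn_def(self, scoring_fn_id: str) -> None:
        # Raise on unknown IDs so a bad DELETE surfaces as an error
        # instead of silently succeeding.
        if scoring_fn_id not in self.supported_fn_defs_registry:
            raise ValueError(f"Scoring function {scoring_fn_id} is not registered.")
        del self.supported_fn_defs_registry[scoring_fn_id]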
diff --git a/tests/integration/scoring/test_scoring.py b/tests/integration/scoring/test_scoring.py
index 315ff050c..1112f9164 100644
--- a/tests/integration/scoring/test_scoring.py
+++ b/tests/integration/scoring/test_scoring.py
@@ -9,6 +9,7 @@ from pathlib import Path
 
 import pandas as pd
 import pytest
+import requests
 
 
 @pytest.fixture
@@ -77,7 +78,46 @@ def test_scoring_functions_register(
     assert len(list_response) > 0
     assert any(x.identifier == sample_scoring_fn_id for x in list_response)
 
-    # TODO: add unregister api for scoring functions
+
+def test_scoring_functions_unregister(
+    llama_stack_client,
+    sample_scoring_fn_id,
+    judge_model_id,
+    sample_judge_prompt_template,
+):
+    llm_as_judge_provider = [
+        x
+        for x in llama_stack_client.providers.list()
+        if x.api == "scoring" and x.provider_type == "inline::llm-as-judge"
+    ]
+    if len(llm_as_judge_provider) == 0:
+        pytest.skip("No llm-as-judge provider found, cannot test unregister")
+
+    llm_as_judge_provider_id = llm_as_judge_provider[0].provider_id
+
+    # Register first
+    register_scoring_function(
+        llama_stack_client,
+        llm_as_judge_provider_id,
+        sample_scoring_fn_id,
+        judge_model_id,
+        sample_judge_prompt_template,
+    )
+
+    # Ensure it is present
+    list_response = llama_stack_client.scoring_functions.list()
+    assert any(x.identifier == sample_scoring_fn_id for x in list_response)
+
+    # Unregister scoring fn
+    try:
+        base_url = llama_stack_client.base_url
+    except AttributeError:
+        pytest.skip("No server base_url available; cannot test HTTP unregister in library mode")
+
+    resp = requests.delete(f"{base_url}/v1/scoring-functions/{sample_scoring_fn_id}", timeout=30)
+    assert resp.status_code in (200, 204)
+    list_after = llama_stack_client.scoring_functions.list()
+    assert all(x.identifier != sample_scoring_fn_id for x in list_after)
 
 
 @pytest.mark.parametrize("scoring_fn_id", ["basic::equality"])
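Only the scoring-function route gets an HTTP-level integration test here; a matching check for the benchmark route could follow the same pattern. A sketch, where registered_benchmark_id is a hypothetical fixture (not in this diff) that registers a benchmark and yields its ID, and the module-level pytest/requests imports above are reused:

def test_benchmark_unregister_http(llama_stack_client, registered_benchmark_id):
    # Hypothetical test: mirrors test_scoring_functions_unregister for benchmarks.
    try:
        base_url = llama_stack_client.base_url
    except AttributeError:
        pytest.skip("No server base_url available; cannot test HTTP unregister in library mode")

    resp = requests.delete(f"{base_url}/v1/eval/benchmarks/{registered_benchmark_id}", timeout=30)
    assert resp.status_code in (200, 204)

    remaining = {b.identifier for b in llama_stack_client.benchmarks.list()}
    assert registered_benchmark_id not in remaining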
diff --git a/tests/unit/distribution/routers/test_routing_tables.py b/tests/unit/distribution/routers/test_routing_tables.py
index 1ceee81c6..bbfea3f46 100644
--- a/tests/unit/distribution/routers/test_routing_tables.py
+++ b/tests/unit/distribution/routers/test_routing_tables.py
@@ -105,6 +105,9 @@ class ScoringFunctionsImpl(Impl):
     async def register_scoring_function(self, scoring_fn):
         return scoring_fn
 
+    async def unregister_scoring_function(self, scoring_fn_id: str):
+        return scoring_fn_id
+
 
 class BenchmarksImpl(Impl):
     def __init__(self):
@@ -113,6 +116,9 @@ class BenchmarksImpl(Impl):
     async def register_benchmark(self, benchmark):
         return benchmark
 
+    async def unregister_benchmark(self, benchmark_id: str):
+        return benchmark_id
+
 
 class ToolGroupsImpl(Impl):
     def __init__(self):
@@ -330,6 +336,13 @@ async def test_scoring_functions_routing_table(cached_disk_dist_registry):
     assert "test-scoring-fn" in scoring_fn_ids
     assert "test-scoring-fn-2" in scoring_fn_ids
 
+    # Unregister scoring functions and verify listing
+    for i in range(len(scoring_functions.data)):
+        await table.unregister_scoring_function(scoring_functions.data[i].scoring_fn_id)
+
+    scoring_functions_list_after_deletion = await table.list_scoring_functions()
+    assert len(scoring_functions_list_after_deletion.data) == 0
+
 
 async def test_benchmarks_routing_table(cached_disk_dist_registry):
     table = BenchmarksRoutingTable({"test_provider": BenchmarksImpl()}, cached_disk_dist_registry, {})
@@ -347,6 +360,15 @@ async def test_benchmarks_routing_table(cached_disk_dist_registry):
     benchmark_ids = {b.identifier for b in benchmarks.data}
     assert "test-benchmark" in benchmark_ids
 
+    # Unregister the benchmark and verify removal
+    await table.unregister_benchmark(benchmark_id="test-benchmark")
+    benchmarks_after = await table.list_benchmarks()
+    assert len(benchmarks_after.data) == 0
+
+    # Unregistering a non-existent benchmark should raise a clear error
+    with pytest.raises(ValueError, match="Benchmark 'dummy_benchmark' not found"):
+        await table.unregister_benchmark(benchmark_id="dummy_benchmark")
+
 
 async def test_tool_groups_routing_table(cached_disk_dist_registry):
     table = ToolGroupsRoutingTable({"test_provider": ToolGroupsImpl()}, cached_disk_dist_registry, {})
diff --git a/tests/unit/providers/nvidia/test_eval.py b/tests/unit/providers/nvidia/test_eval.py
index 584ca2101..2bdcbbeba 100644
--- a/tests/unit/providers/nvidia/test_eval.py
+++ b/tests/unit/providers/nvidia/test_eval.py
@@ -52,14 +52,19 @@ class TestNVIDIAEvalImpl(unittest.TestCase):
         self.evaluator_post_patcher = patch(
             "llama_stack.providers.remote.eval.nvidia.eval.NVIDIAEvalImpl._evaluator_post"
         )
+        self.evaluator_delete_patcher = patch(
+            "llama_stack.providers.remote.eval.nvidia.eval.NVIDIAEvalImpl._evaluator_delete"
+        )
 
         self.mock_evaluator_get = self.evaluator_get_patcher.start()
         self.mock_evaluator_post = self.evaluator_post_patcher.start()
+        self.mock_evaluator_delete = self.evaluator_delete_patcher.start()
 
     def tearDown(self):
         """Clean up after each test."""
         self.evaluator_get_patcher.stop()
         self.evaluator_post_patcher.stop()
+        self.evaluator_delete_patcher.stop()
 
     def _assert_request_body(self, expected_json):
         """Helper method to verify request body in Evaluator POST request is correct"""
@@ -115,6 +120,13 @@ class TestNVIDIAEvalImpl(unittest.TestCase):
         self.mock_evaluator_post.assert_called_once()
         self._assert_request_body({"namespace": benchmark.provider_id, "name": benchmark.identifier, **eval_config})
 
+    def test_unregister_benchmark(self):
+        # Unregister the benchmark
+        self.run_async(self.eval_impl.unregister_benchmark(benchmark_id=MOCK_BENCHMARK_ID))
+
+        # Verify the Evaluator API was called correctly
+        self.mock_evaluator_delete.assert_called_once_with(f"/v1/evaluation/configs/nvidia/{MOCK_BENCHMARK_ID}")
+
     def test_run_eval(self):
         benchmark_config = BenchmarkConfig(
             eval_candidate=ModelCandidate(
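The meta-reference provider's unregister path (drop the entry from the in-memory dict, then delete the kvstore key) has no dedicated unit test in this diff. A minimal sketch of one, bypassing __init__ and mocking the kvstore; it assumes pytest-asyncio (or an equivalent async test runner) is available and that EVAL_TASKS_PREFIX is importable from the eval module, which the diff's usage suggests but does not show:

from unittest.mock import AsyncMock

import pytest

from llama_stack.providers.inline.eval.meta_reference.eval import (
    EVAL_TASKS_PREFIX,
    MetaReferenceEvalImpl,
)


@pytest.mark.asyncio
async def test_meta_reference_unregister_benchmark():
    # Skip __init__ (its provider dependencies are irrelevant here) and wire up
    # only the attributes that unregister_benchmark touches.
    impl = MetaReferenceEvalImpl.__new__(MetaReferenceEvalImpl)
    impl.benchmarks = {"test-benchmark": object()}
    impl.kvstore = AsyncMock()

    await impl.unregister_benchmark("test-benchmark")

    assert "test-benchmark" not in impl.benchmarks
    impl.kvstore.delete.assert_awaited_once_with(f"{EVAL_TASKS_PREFIX}test-benchmark")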