feat: create HTTP DELETE API endpoints to unregister ScoringFn and Benchmark resources in Llama Stack (#3371)

# What does this PR do?
This PR adds the ability to unregister ScoringFn and Benchmark resources for the `scoring` and `eval` APIs.

Closes #3051 
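
Once a stack server with these changes is running, the new routes can be exercised directly over HTTP. A minimal sketch using `requests`; the base URL, port, and resource IDs are placeholders, the scoring-function path matches the integration test in this PR, and the benchmark path is inferred from the `/eval/benchmarks/{benchmark_id}` webmethod route mounted under `/v1`:

```python
import requests

BASE_URL = "http://localhost:8321"  # assumed local Llama Stack server (placeholder)

# Unregister a benchmark: DELETE /v1/eval/benchmarks/{benchmark_id}
resp = requests.delete(f"{BASE_URL}/v1/eval/benchmarks/my-benchmark", timeout=30)
resp.raise_for_status()

# Unregister a scoring function: DELETE /v1/scoring-functions/{scoring_fn_id}
resp = requests.delete(f"{BASE_URL}/v1/scoring-functions/llm-as-judge::my-fn", timeout=30)
resp.raise_for_status()
```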

## Test Plan
Updated integration and unit tests; they are exercised by the CI workflow.
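
To re-run the new tests locally without going through CI, something like the following should work (a sketch only: it assumes `pytest` is installed and that the tests live under the repository's usual `tests/` tree; adjust the path to your checkout):

```python
import sys

import pytest

# Select only the unregister-related tests added in this PR by keyword.
# The "tests/" path is an assumption about the repository layout.
sys.exit(pytest.main(["-k", "unregister", "tests/"]))
```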
Author: IAN MILLER, 2025-09-15 20:43:38 +01:00 (committed by GitHub)
Commit: ab321739f2 (parent: 01bdcce4d2)
13 changed files with 241 additions and 3 deletions

View file

@@ -1380,6 +1380,40 @@
}
}
]
},
"delete": {
"responses": {
"200": {
"description": "OK"
},
"400": {
"$ref": "#/components/responses/BadRequest400"
},
"429": {
"$ref": "#/components/responses/TooManyRequests429"
},
"500": {
"$ref": "#/components/responses/InternalServerError500"
},
"default": {
"$ref": "#/components/responses/DefaultError"
}
},
"tags": [
"Benchmarks"
],
"description": "Unregister a benchmark.",
"parameters": [
{
"name": "benchmark_id",
"in": "path",
"description": "The ID of the benchmark to unregister.",
"required": true,
"schema": {
"type": "string"
}
}
]
}
},
"/v1/openai/v1/chat/completions/{completion_id}": {
@@ -1620,6 +1654,40 @@
}
}
]
},
"delete": {
"responses": {
"200": {
"description": "OK"
},
"400": {
"$ref": "#/components/responses/BadRequest400"
},
"429": {
"$ref": "#/components/responses/TooManyRequests429"
},
"500": {
"$ref": "#/components/responses/InternalServerError500"
},
"default": {
"$ref": "#/components/responses/DefaultError"
}
},
"tags": [
"ScoringFunctions"
],
"description": "Unregister a scoring function.",
"parameters": [
{
"name": "scoring_fn_id",
"in": "path",
"description": "The ID of the scoring function to unregister.",
"required": true,
"schema": {
"type": "string"
}
}
]
}
},
"/v1/shields/{identifier}": {

View file

@@ -954,6 +954,30 @@ paths:
required: true
schema:
type: string
delete:
responses:
'200':
description: OK
'400':
$ref: '#/components/responses/BadRequest400'
'429':
$ref: >-
#/components/responses/TooManyRequests429
'500':
$ref: >-
#/components/responses/InternalServerError500
default:
$ref: '#/components/responses/DefaultError'
tags:
- Benchmarks
description: Unregister a benchmark.
parameters:
- name: benchmark_id
in: path
description: The ID of the benchmark to unregister.
required: true
schema:
type: string
/v1/openai/v1/chat/completions/{completion_id}:
get:
responses:
@@ -1119,6 +1143,31 @@ paths:
required: true
schema:
type: string
delete:
responses:
'200':
description: OK
'400':
$ref: '#/components/responses/BadRequest400'
'429':
$ref: >-
#/components/responses/TooManyRequests429
'500':
$ref: >-
#/components/responses/InternalServerError500
default:
$ref: '#/components/responses/DefaultError'
tags:
- ScoringFunctions
description: Unregister a scoring function.
parameters:
- name: scoring_fn_id
in: path
description: >-
The ID of the scoring function to unregister.
required: true
schema:
type: string
/v1/shields/{identifier}:
get:
responses:

View file

@@ -93,3 +93,11 @@ class Benchmarks(Protocol):
:param metadata: The metadata to use for the benchmark.
"""
...
@webmethod(route="/eval/benchmarks/{benchmark_id}", method="DELETE")
async def unregister_benchmark(self, benchmark_id: str) -> None:
"""Unregister a benchmark.
:param benchmark_id: The ID of the benchmark to unregister.
"""
...

View file

@@ -197,3 +197,11 @@ class ScoringFunctions(Protocol):
:param params: The parameters for the scoring function for benchmark eval, these can be overridden for app eval.
"""
...
@webmethod(route="/scoring-functions/{scoring_fn_id:path}", method="DELETE")
async def unregister_scoring_function(self, scoring_fn_id: str) -> None:
"""Unregister a scoring function.
:param scoring_fn_id: The ID of the scoring function to unregister.
"""
...

View file

@@ -56,3 +56,7 @@ class BenchmarksRoutingTable(CommonRoutingTableImpl, Benchmarks):
provider_resource_id=provider_benchmark_id,
)
await self.register_object(benchmark)
async def unregister_benchmark(self, benchmark_id: str) -> None:
existing_benchmark = await self.get_benchmark(benchmark_id)
await self.unregister_object(existing_benchmark)

View file

@@ -64,6 +64,10 @@ async def unregister_object_from_provider(obj: RoutableObject, p: Any) -> None:
return await p.unregister_shield(obj.identifier)
elif api == Api.datasetio:
return await p.unregister_dataset(obj.identifier)
elif api == Api.eval:
return await p.unregister_benchmark(obj.identifier)
elif api == Api.scoring:
return await p.unregister_scoring_function(obj.identifier)
elif api == Api.tool_runtime:
return await p.unregister_toolgroup(obj.identifier)
else:

View file

@@ -60,3 +60,7 @@ class ScoringFunctionsRoutingTable(CommonRoutingTableImpl, ScoringFunctions):
)
scoring_fn.provider_id = provider_id
await self.register_object(scoring_fn)
async def unregister_scoring_function(self, scoring_fn_id: str) -> None:
existing_scoring_fn = await self.get_scoring_function(scoring_fn_id)
await self.unregister_object(existing_scoring_fn)

View file

@@ -75,6 +75,13 @@ class MetaReferenceEvalImpl(
)
self.benchmarks[task_def.identifier] = task_def
async def unregister_benchmark(self, benchmark_id: str) -> None:
if benchmark_id in self.benchmarks:
del self.benchmarks[benchmark_id]
key = f"{EVAL_TASKS_PREFIX}{benchmark_id}"
await self.kvstore.delete(key)
async def run_eval(
self,
benchmark_id: str,

View file

@@ -63,6 +63,9 @@ class LlmAsJudgeScoringImpl(
async def register_scoring_function(self, function_def: ScoringFn) -> None:
self.llm_as_judge_fn.register_scoring_fn_def(function_def)
async def unregister_scoring_function(self, scoring_fn_id: str) -> None:
self.llm_as_judge_fn.unregister_scoring_fn_def(scoring_fn_id)
async def score_batch(
self,
dataset_id: str,

View file

@@ -51,18 +51,23 @@ class NVIDIAEvalImpl(
async def shutdown(self) -> None: ...
async def _evaluator_get(self, path: str):
"""Helper for making GET requests to the evaluator service."""
response = requests.get(url=f"{self.config.evaluator_url}{path}")
response.raise_for_status()
return response.json()
async def _evaluator_post(self, path: str, data: dict[str, Any]):
"""Helper for making POST requests to the evaluator service."""
response = requests.post(url=f"{self.config.evaluator_url}{path}", json=data)
response.raise_for_status()
return response.json()
async def _evaluator_delete(self, path: str) -> None:
"""Helper for making DELETE requests to the evaluator service."""
response = requests.delete(url=f"{self.config.evaluator_url}{path}")
response.raise_for_status()
async def register_benchmark(self, task_def: Benchmark) -> None:
"""Register a benchmark as an evaluation configuration."""
await self._evaluator_post(
@@ -75,6 +80,10 @@ class NVIDIAEvalImpl(
},
)
async def unregister_benchmark(self, benchmark_id: str) -> None:
"""Unregister a benchmark evaluation configuration from NeMo Evaluator."""
await self._evaluator_delete(f"/v1/evaluation/configs/{DEFAULT_NAMESPACE}/{benchmark_id}")
async def run_eval(
self,
benchmark_id: str,

View file

@@ -9,6 +9,7 @@ from pathlib import Path
import pandas as pd
import pytest
import requests
@pytest.fixture
@@ -77,7 +78,46 @@ def test_scoring_functions_register(
assert len(list_response) > 0
assert any(x.identifier == sample_scoring_fn_id for x in list_response)
# TODO: add unregister api for scoring functions
def test_scoring_functions_unregister(
llama_stack_client,
sample_scoring_fn_id,
judge_model_id,
sample_judge_prompt_template,
):
llm_as_judge_provider = [
x
for x in llama_stack_client.providers.list()
if x.api == "scoring" and x.provider_type == "inline::llm-as-judge"
]
if len(llm_as_judge_provider) == 0:
pytest.skip("No llm-as-judge provider found, cannot test unregister")
llm_as_judge_provider_id = llm_as_judge_provider[0].provider_id
# Register first
register_scoring_function(
llama_stack_client,
llm_as_judge_provider_id,
sample_scoring_fn_id,
judge_model_id,
sample_judge_prompt_template,
)
# Ensure it is present
list_response = llama_stack_client.scoring_functions.list()
assert any(x.identifier == sample_scoring_fn_id for x in list_response)
# Unregister scoring fn
try:
base_url = llama_stack_client.base_url
except AttributeError:
pytest.skip("No server base_url available; cannot test HTTP unregister in library mode")
resp = requests.delete(f"{base_url}/v1/scoring-functions/{sample_scoring_fn_id}", timeout=30)
assert resp.status_code in (200, 204)
list_after = llama_stack_client.scoring_functions.list()
assert all(x.identifier != sample_scoring_fn_id for x in list_after)
@pytest.mark.parametrize("scoring_fn_id", ["basic::equality"])

View file

@@ -105,6 +105,9 @@ class ScoringFunctionsImpl(Impl):
async def register_scoring_function(self, scoring_fn):
return scoring_fn
async def unregister_scoring_function(self, scoring_fn_id: str):
return scoring_fn_id
class BenchmarksImpl(Impl):
def __init__(self):
@@ -113,6 +116,9 @@ class BenchmarksImpl(Impl):
async def register_benchmark(self, benchmark):
return benchmark
async def unregister_benchmark(self, benchmark_id: str):
return benchmark_id
class ToolGroupsImpl(Impl):
def __init__(self):
@@ -330,6 +336,13 @@ async def test_scoring_functions_routing_table(cached_disk_dist_registry):
assert "test-scoring-fn" in scoring_fn_ids
assert "test-scoring-fn-2" in scoring_fn_ids
# Unregister scoring functions and verify listing
for i in range(len(scoring_functions.data)):
await table.unregister_scoring_function(scoring_functions.data[i].scoring_fn_id)
scoring_functions_list_after_deletion = await table.list_scoring_functions()
assert len(scoring_functions_list_after_deletion.data) == 0
async def test_benchmarks_routing_table(cached_disk_dist_registry):
table = BenchmarksRoutingTable({"test_provider": BenchmarksImpl()}, cached_disk_dist_registry, {})
@@ -347,6 +360,15 @@ async def test_benchmarks_routing_table(cached_disk_dist_registry):
benchmark_ids = {b.identifier for b in benchmarks.data}
assert "test-benchmark" in benchmark_ids
# Unregister the benchmark and verify removal
await table.unregister_benchmark(benchmark_id="test-benchmark")
benchmarks_after = await table.list_benchmarks()
assert len(benchmarks_after.data) == 0
# Unregistering a non-existent benchmark should raise a clear error
with pytest.raises(ValueError, match="Benchmark 'dummy_benchmark' not found"):
await table.unregister_benchmark(benchmark_id="dummy_benchmark")
async def test_tool_groups_routing_table(cached_disk_dist_registry):
table = ToolGroupsRoutingTable({"test_provider": ToolGroupsImpl()}, cached_disk_dist_registry, {})

View file

@@ -52,14 +52,19 @@ class TestNVIDIAEvalImpl(unittest.TestCase):
self.evaluator_post_patcher = patch(
"llama_stack.providers.remote.eval.nvidia.eval.NVIDIAEvalImpl._evaluator_post"
)
self.evaluator_delete_patcher = patch(
"llama_stack.providers.remote.eval.nvidia.eval.NVIDIAEvalImpl._evaluator_delete"
)
self.mock_evaluator_get = self.evaluator_get_patcher.start()
self.mock_evaluator_post = self.evaluator_post_patcher.start()
self.mock_evaluator_delete = self.evaluator_delete_patcher.start()
def tearDown(self):
"""Clean up after each test."""
self.evaluator_get_patcher.stop()
self.evaluator_post_patcher.stop()
self.evaluator_delete_patcher.stop()
def _assert_request_body(self, expected_json):
"""Helper method to verify request body in Evaluator POST request is correct"""
@@ -115,6 +120,13 @@ class TestNVIDIAEvalImpl(unittest.TestCase):
self.mock_evaluator_post.assert_called_once()
self._assert_request_body({"namespace": benchmark.provider_id, "name": benchmark.identifier, **eval_config})
def test_unregister_benchmark(self):
# Unregister the benchmark
self.run_async(self.eval_impl.unregister_benchmark(benchmark_id=MOCK_BENCHMARK_ID))
# Verify the Evaluator API was called correctly
self.mock_evaluator_delete.assert_called_once_with(f"/v1/evaluation/configs/nvidia/{MOCK_BENCHMARK_ID}")
def test_run_eval(self):
benchmark_config = BenchmarkConfig(
eval_candidate=ModelCandidate(