test: revamp eval related integration tests (#1433)

# What does this PR do? - revamp and clean up datasets/scoring/eval integration tests - closes https://github.com/meta-llama/llama-stack/issues/1396 [//]: # (If resolving an issue, uncomment and update the line below) [//]: # (Closes #[issue-number]) ## Test Plan **dataset** ``` LLAMA_STACK_BASE_URL=http://localhost:8321 pytest -v tests/integration/datasetio/ ``` <img width="842" alt="image" src="https://github.com/user-attachments/assets/88fc2b6a-b496-47bf-bc0c-8fea48ba36ff" /> **scoring** ``` LLAMA_STACK_CONFIG=fireworks pytest -v tests/integration/scoring --text-model meta-llama/Llama-3.1-8B-Instruct --judge-model meta-llama/Llama-3.1-8B-Instruct ``` <img width="851" alt="image" src="https://github.com/user-attachments/assets/50f46415-b44c-4c37-a6c3-076f2767adb3" /> **eval** ``` LLAMA_STACK_CONFIG=fireworks pytest -v tests/integration/eval --text-model meta-llama/Llama-3.1-8B-Instruct --judge-model meta-llama/Llama-3.1-8B-Instruct ``` <img width="841" alt="image" src="https://github.com/user-attachments/assets/8eb1c65c-3b39-4d66-8ff4-f471ca783e49" /> [//]: # (## Documentation)
2025-06-28 02:53:30 +00:00 · 2025-03-06 10:51:35 -08:00 · 2025-03-06 10:51:35 -08:00 · bcb13c492f
commit bcb13c492f
parent 82e94fe22f
7 changed files with 184 additions and 222 deletions
--- a/llama_stack/providers/inline/scoring/llm_as_judge/scoring_fn/llm_as_judge_scoring_fn.py
+++ b/llama_stack/providers/inline/scoring/llm_as_judge/scoring_fn/llm_as_judge_scoring_fn.py
@ -6,7 +6,7 @@
 import re
 from typing import Any, Dict, Optional

-from llama_stack.apis.inference.inference import Inference
+from llama_stack.apis.inference.inference import Inference, UserMessage
 from llama_stack.apis.scoring import ScoringResultRow
 from llama_stack.apis.scoring_functions import ScoringFnParams
 from llama_stack.providers.utils.scoring.base_scoring_fn import RegisteredBaseScoringFn
@ -58,10 +58,9 @@ class LlmAsJudgeScoringFn(RegisteredBaseScoringFn):
        judge_response = await self.inference_api.chat_completion(
            model_id=fn_def.params.judge_model,
            messages=[
-                {
-                    "role": "user",
-                    "content": judge_input_msg,
-                }
+                UserMessage(
+                    content=judge_input_msg,
+                ),
            ],
        )
        content = judge_response.completion_message.content
--- a/llama_stack/providers/utils/scoring/base_scoring_fn.py
+++ b/llama_stack/providers/utils/scoring/base_scoring_fn.py
@ -73,6 +73,11 @@ class RegisteredBaseScoringFn(BaseScoringFn):
            raise ValueError(f"Scoring function def with identifier {scoring_fn.identifier} already exists.")
        self.supported_fn_defs_registry[scoring_fn.identifier] = scoring_fn

+    def unregister_scoring_fn_def(self, scoring_fn_id: str) -> None:
+        if scoring_fn_id not in self.supported_fn_defs_registry:
+            raise ValueError(f"Scoring function def with identifier {scoring_fn_id} does not exist.")
+        del self.supported_fn_defs_registry[scoring_fn_id]
+
    @abstractmethod
    async def score_row(
        self,