simpleqa eval

2025-12-16 12:12:37 +00:00 · 2024-11-01 00:30:36 -07:00 · 2024-11-01 00:30:36 -07:00 · 43fb522a13
commit 43fb522a13
parent f94681baac
8 changed files with 191 additions and 24 deletions
--- a/llama_stack/apis/scoring_functions/scoring_functions.py
+++ b/llama_stack/apis/scoring_functions/scoring_functions.py
@@ -43,6 +43,11 @@ class LLMAsJudgeContext(BaseModel):
        description="Regex to extract the score from the judge response",
        default=None,
    )
+    # TODO: decide whether this belongs in the scoring function context or in a separate
+    # scorer, and how the LLM-as-judge defines its response.
+    judge_grade_metrics: Optional[Dict[str, str]] = Field(
+        description="Mapping of extracted judge response to score", default=None
+    )


@json_schema_type