api refactor

2025-12-17 09:59:47 +00:00 · 2024-11-07 13:54:26 -08:00 · 2024-11-07 13:54:26 -08:00 · 51c20f9c29
commit 51c20f9c29
parent 97dcd5704c
8 changed files with 64 additions and 59 deletions
--- a/llama_stack/apis/eval/eval.py
+++ b/llama_stack/apis/eval/eval.py
@@ -38,14 +38,16 @@ EvalCandidate = Annotated[

@json_schema_type
 class BenchmarkEvalTaskConfig(BaseModel):
+    type: Literal["benchmark"] = "benchmark"
    eval_candidate: EvalCandidate


@json_schema_type
 class AppEvalTaskConfig(BaseModel):
+    type: Literal["app"] = "app"
    eval_candidate: EvalCandidate
    scoring_params: Dict[str, ScoringFnParams] = Field(
-        description="Map between scoring function id and parameters",
+        description="Map between scoring function id and parameters for each scoring function you want to run",
        default_factory=dict,
    )
    # we could optinally add any specific dataset config here
@@ -64,18 +66,18 @@ class EvaluateResponse(BaseModel):


 class Eval(Protocol):
-    @webmethod(route="/eval/run_benchmark_eval", method="POST")
-    async def run_benchmark_eval(
+    @webmethod(route="/eval/run_benchmark", method="POST")
+    async def run_benchmark(
        self,
        benchmark_id: str,
-        eval_task_config: BenchmarkEvalTaskConfig,
+        benchmark_config: BenchmarkEvalTaskConfig,
    ) -> Job: ...

    @webmethod(route="/eval/run_eval", method="POST")
    async def run_eval(
        self,
-        eval_task_def: EvalTaskDef,
-        eval_task_config: EvalTaskConfig,
+        task: EvalTaskDef,
+        task_config: AppEvalTaskConfig,
    ) -> Job: ...

    @webmethod(route="/eval/evaluate_rows", method="POST")
--- a/llama_stack/apis/scoring/scoring.py
+++ b/llama_stack/apis/scoring/scoring.py
@@ -48,8 +48,7 @@ class Scoring(Protocol):
    async def score_batch(
        self,
        dataset_id: str,
-        scoring_functions: List[str],
-        scoring_params: Optional[Dict[str, ScoringFnParams]] = None,
+        scoring_functions: Optional[Dict[str, ScoringFnParams]] = None,
        save_results_dataset: bool = False,
    ) -> ScoreBatchResponse: ...

@@ -57,6 +56,5 @@ class Scoring(Protocol):
    async def score(
        self,
        input_rows: List[Dict[str, Any]],
-        scoring_functions: List[str],
-        scoring_params: Optional[Dict[str, ScoringFnParams]] = None,
+        scoring_functions: Optional[Dict[str, ScoringFnParams]] = None,
    ) -> ScoreResponse: ...
--- a/llama_stack/apis/scoring_functions/scoring_functions.py
+++ b/llama_stack/apis/scoring_functions/scoring_functions.py
@@ -76,7 +76,7 @@ class ScoringFnDef(BaseModel):
        description="The return type of the deterministic function",
    )
    params: Optional[ScoringFnParams] = Field(  # type: ignore
-        description="The parameters for the scoring function for benchmark eval, we could override this for app eval",
+        description="The parameters for the scoring function for benchmark eval, these can be overridden for app eval",
        default=None,
    )
    # We can optionally add information here to support packaging of code, etc.