[Evals API] [1/n] Initial API (#287)

* type system api * datasets api * fix * datasetio api * kill reward scoring * scoring functions + evals * move jobs, fix errors
2025-06-28 02:53:30 +00:00 · 2024-10-22 09:31:19 -07:00 · 2024-10-22 09:31:19 -07:00 · e45f121c77
commit e45f121c77
parent b279d3bc58
15 changed files with 397 additions and 243 deletions
--- a/llama_stack/apis/eval/__init__.py
+++ b/llama_stack/apis/eval/__init__.py
@@ -0,0 +1,7 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+from .eval import *  # noqa: F401 F403
--- a/llama_stack/apis/eval/eval.py
+++ b/llama_stack/apis/eval/eval.py
@@ -0,0 +1,70 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+from typing import Literal, Optional, Protocol, Union
+
+from typing_extensions import Annotated
+
+from llama_models.llama3.api.datatypes import *  # noqa: F403
+from llama_models.schema_utils import json_schema_type, webmethod
+from llama_stack.apis.scoring_functions import *  # noqa: F403
+from llama_stack.apis.agents import AgentConfig
+from llama_stack.apis.common.job_types import Job
+from llama_stack.apis.scoring import *  # noqa: F403
+
+
+@json_schema_type
+class ModelCandidate(BaseModel):  # eval candidate variant tagged type="model"
+    type: Literal["model"] = "model"  # fixed tag; EvalCandidate discriminates on this field
+    model: str  # required; presumably a model identifier — TODO confirm
+    sampling_params: SamplingParams  # required (no default)
+    system_message: Optional[SystemMessage] = None  # optional; defaults to None
+
+
+@json_schema_type
+class AgentCandidate(BaseModel):  # eval candidate variant tagged type="agent"
+    type: Literal["agent"] = "agent"  # fixed tag; EvalCandidate discriminates on this field
+    config: AgentConfig  # required agent configuration (no default)
+
+
+EvalCandidate = Annotated[  # union of the candidate variants, selected by their "type" field
+    Union[ModelCandidate, AgentCandidate], Field(discriminator="type")
+]
+
+
+@json_schema_type
+class EvaluateResponse(BaseModel):  # payload returned by Eval.evaluate
+    generations: List[Dict[str, Any]]  # presumably one dict per evaluated row — TODO confirm
+
+    # each key in the dict is a scoring function name
+    scores: List[Dict[str, ScoringResult]]
+
+
+class Eval(Protocol):  # structural interface; every method is an async stub with a `...` body
+    @webmethod(route="/eval/evaluate_batch", method="POST")
+    async def evaluate_batch(  # takes a dataset id; returns a Job, not an EvaluateResponse
+        self,
+        dataset_id: str,
+        candidate: EvalCandidate,
+        scoring_functions: List[str],  # presumably scoring function names — TODO confirm
+    ) -> Job: ...
+
+    @webmethod(route="/eval/evaluate", method="POST")
+    async def evaluate(  # takes the rows inline; returns an EvaluateResponse directly
+        self,
+        input_rows: List[Dict[str, Any]],
+        candidate: EvalCandidate,
+        scoring_functions: List[str],  # presumably scoring function names — TODO confirm
+    ) -> EvaluateResponse: ...
+
+    @webmethod(route="/eval/job/status", method="GET")
+    async def job_status(self, job_id: str) -> None: ...  # NOTE(review): GET annotated -> None, so no status is returned — confirm intended
+
+    @webmethod(route="/eval/job/cancel", method="POST")
+    async def job_cancel(self, job_id: str) -> None: ...  # returns nothing
+
+    @webmethod(route="/eval/job/result", method="GET")
+    async def job_result(self, job_id: str) -> None: ...  # NOTE(review): GET annotated -> None, so no result is returned — confirm intended