mirror of
https://github.com/meta-llama/llama-stack.git
synced 2025-12-31 11:50:01 +00:00
Merge branch 'eval_api_final' into delete_eval_scoring_scoring_fn
This commit is contained in:
commit
e23531c9d0
4 changed files with 50 additions and 38 deletions
|
|
@ -62,8 +62,8 @@ class Benchmarks(Protocol):
|
|||
"""
|
||||
Register a new benchmark. A benchmark consists of a dataset id and a list of grader ids.
|
||||
|
||||
:param dataset_id: The ID of the dataset to be used to run the benchmark.
|
||||
:param grader_ids: List of grader ids to use for this benchmark.
|
||||
:param dataset_id: The ID of the dataset to be used to run the benchmark. ID obtained through `datasets.register()`
|
||||
:param grader_ids: List of grader ids to use for this benchmark. ID obtained through `graders.register()`
|
||||
:param benchmark_id: (Optional) The ID of the benchmark to register. If not provided, an ID will be generated.
|
||||
:param metadata: (Optional) Metadata for this benchmark for additional descriptions.
|
||||
"""
|
||||
|
|
|
|||
|
|
@ -76,30 +76,17 @@ class EvaluationJob(CommonJobFields):
|
|||
candidate: EvaluationCandidate
|
||||
|
||||
|
||||
@json_schema_type
|
||||
class ScoringResult(BaseModel):
|
||||
"""
|
||||
A scoring result for a single row.
|
||||
|
||||
:param scores: The scoring result for each row. Each row is a map of grader column name to value.
|
||||
:param metrics: Map of metric name to aggregated value.
|
||||
"""
|
||||
|
||||
scores: List[Dict[str, Any]]
|
||||
metrics: Dict[str, Any]
|
||||
|
||||
|
||||
@json_schema_type
|
||||
class EvaluationResponse(BaseModel):
|
||||
"""
|
||||
A response to an inline evaluation.
|
||||
|
||||
:param generations: The generations in rows for the evaluation.
|
||||
:param scores: The scores for the evaluation. Map of grader id to ScoringResult.
|
||||
:param result_rows: The result data containing inputs, generations and grades in each row.
|
||||
:param grades: Map of grader id to aggregated value.
|
||||
"""
|
||||
|
||||
generations: List[Dict[str, Any]]
|
||||
scores: Dict[str, ScoringResult]
|
||||
result_rows: List[Dict[str, Any]]
|
||||
grades: Dict[str, Any]
|
||||
|
||||
|
||||
class Evaluation(Protocol):
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue