llama-stack/llama_stack/apis/scoring/scoring.py
Xi Yan abdf7cddf3
[Evals API][4/n] evals with generation meta-reference impl (#303)
Co-authored-by: Dalton Flanagan <6599399+dltn@users.noreply.github.com>
Co-authored-by: Ashwin Bharambe <ashwin.bharambe@gmail.com>
2024-10-25 13:12:39 -07:00


# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

from typing import Any, Dict, List, Optional, Protocol, runtime_checkable

from llama_models.schema_utils import json_schema_type, webmethod
from pydantic import BaseModel

from llama_models.llama3.api.datatypes import *  # noqa: F403
from llama_stack.apis.scoring_functions import *  # noqa: F403


# mapping of metric name to value for a single scored row
ScoringResultRow = Dict[str, Any]
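# e.g. (illustrative; the actual keys depend on the scoring function):
#   {"score": 1.0}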


@json_schema_type
class ScoringResult(BaseModel):
    score_rows: List[ScoringResultRow]
    # aggregated metrics to value
    aggregated_results: Dict[str, Any]


@json_schema_type
class ScoreBatchResponse(BaseModel):
    # id of the dataset holding the saved results, when save_results_dataset=True
    dataset_id: Optional[str] = None
    results: Dict[str, ScoringResult]


@json_schema_type
class ScoreResponse(BaseModel):
    # each key in the dict is a scoring function name
    results: Dict[str, ScoringResult]
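
# Example (illustrative) serialized ScoreResponse, assuming an "equality"
# scoring function that emits a per-row "score" and an aggregated "accuracy":
# {
#     "results": {
#         "equality": {
#             "score_rows": [{"score": 1.0}, {"score": 0.0}],
#             "aggregated_results": {"accuracy": 0.5}
#         }
#     }
# }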


class ScoringFunctionStore(Protocol):
    def get_scoring_function(self, name: str) -> ScoringFnDefWithProvider: ...


@runtime_checkable
class Scoring(Protocol):
    scoring_function_store: ScoringFunctionStore

    @webmethod(route="/scoring/score_batch")
    async def score_batch(
        self,
        dataset_id: str,
        scoring_functions: List[str],
        save_results_dataset: bool = False,
    ) -> ScoreBatchResponse: ...

    @webmethod(route="/scoring/score")
    async def score(
        self, input_rows: List[Dict[str, Any]], scoring_functions: List[str]
    ) -> ScoreResponse: ...
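

# A minimal sketch (not part of the API) of what an implementation of the
# Scoring protocol's score() method could look like. The "exact_match" scoring
# function name and the "expected_answer"/"generated_answer" row keys are
# hypothetical stand-ins for whatever the registered scoring functions consume.
class InMemoryScoring:
    async def score(
        self, input_rows: List[Dict[str, Any]], scoring_functions: List[str]
    ) -> ScoreResponse:
        results: Dict[str, ScoringResult] = {}
        for fn_name in scoring_functions:
            # hypothetical scoring function: exact string equality of answers
            if fn_name != "exact_match":
                raise ValueError(f"Unknown scoring function: {fn_name}")
            score_rows = [
                {
                    "score": 1.0
                    if row.get("generated_answer") == row.get("expected_answer")
                    else 0.0
                }
                for row in input_rows
            ]
            accuracy = (
                sum(r["score"] for r in score_rows) / len(score_rows)
                if score_rows
                else 0.0
            )
            results[fn_name] = ScoringResult(
                score_rows=score_rows,
                aggregated_results={"accuracy": accuracy},
            )
        return ScoreResponse(results=results)


# Usage (illustrative):
#   resp = await InMemoryScoring().score(
#       input_rows=[{"expected_answer": "4", "generated_answer": "4"}],
#       scoring_functions=["exact_match"],
#   )
#   assert resp.results["exact_match"].aggregated_results["accuracy"] == 1.0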