[Evals API][4/n] evals with generation meta-reference impl (#303)

* wip

* dataset validation

* test_scoring

* cleanup

* clean up test

* comments

* error checking

* dataset client

* test client

* datasetio client

* clean up

* basic scoring function works

* scorer wip

* equality scorer (see the sketch after this list)

* score batch impl

* score batch

* update scoring test

* refactor

* validate scorer input

* address comments

* evals with generation

* add all rows scores to ScoringResult

* minor typing

* bugfix

* scoring function def rename

* rebase name

* refactor

* address comments

* Update iOS inference instructions for new quantization

* Small updates to quantization config

* Fix score threshold in faiss

* Bump version to 0.0.45

* Handle both ipv6 and ipv4 interfaces together

* update manifest for build templates

* Update getting_started.md

* chatcompletion & completion input type validation

* inclusion->subsetof

* error checking

* scoring_function -> scoring_fn rename, scorer -> scoring_fn rename

* address comments

* [Evals API][5/n] fixes to generate openapi spec (#323)

* generate openapi

* typing comment, dataset -> dataset_id

* remove custom type

* sample eval run.yaml
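
To make the "equality scorer" / "score batch" steps above concrete, here is a minimal illustrative sketch in plain Python; the row keys (generated_answer, expected_answer) are hypothetical and this is not the actual provider implementation:

def equality_score(row: dict) -> float:
    # a generation counts as correct only when it exactly matches the expected answer
    return 1.0 if row.get("generated_answer") == row.get("expected_answer") else 0.0

rows = [
    {"generated_answer": "4", "expected_answer": "4"},
    {"generated_answer": "5", "expected_answer": "4"},
]
# batch scoring: aggregate per-row scores into a single accuracy number
print(sum(equality_score(r) for r in rows) / len(rows))  # 0.5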

---------

Co-authored-by: Dalton Flanagan <6599399+dltn@users.noreply.github.com>
Co-authored-by: Ashwin Bharambe <ashwin.bharambe@gmail.com>
Xi Yan 2024-10-25 13:12:39 -07:00 committed by GitHub
parent 426d821e7f
commit abdf7cddf3
31 changed files with 3371 additions and 1296 deletions

View file

@@ -3,6 +3,8 @@
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from enum import Enum
from llama_models.schema_utils import json_schema_type
from pydantic import BaseModel
@@ -10,3 +12,9 @@ from pydantic import BaseModel
@json_schema_type
class Job(BaseModel):
    job_id: str


@json_schema_type
class JobStatus(Enum):
    completed = "completed"
    in_progress = "in_progress"
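
As a hedged illustration of how a client might use the new JobStatus enum together with the Eval job endpoints further down in this diff (eval_impl and the job id are placeholders, not part of this commit):

import asyncio

from llama_stack.apis.common.job_types import JobStatus


async def wait_for_eval(eval_impl, job_id: str):
    # poll /eval/job/status until the job reports completed, then fetch the result
    while await eval_impl.job_status(job_id=job_id) != JobStatus.completed:
        await asyncio.sleep(1)
    return await eval_impl.job_result(job_id=job_id)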

View file

@@ -4,7 +4,7 @@
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from typing import Dict, List, Literal, Union
from typing import Literal, Union
from pydantic import BaseModel, Field
from typing_extensions import Annotated
@@ -24,12 +24,10 @@ class BooleanType(BaseModel):
class ArrayType(BaseModel):
    type: Literal["array"] = "array"
    items: "ParamType"


class ObjectType(BaseModel):
    type: Literal["object"] = "object"
    properties: Dict[str, "ParamType"] = Field(default_factory=dict)


class JsonType(BaseModel):
@@ -38,12 +36,21 @@ class JsonType(BaseModel):
class UnionType(BaseModel):
    type: Literal["union"] = "union"
    options: List["ParamType"] = Field(default_factory=list)


class CustomType(BaseModel):
    type: Literal["custom"] = "custom"
    validator_class: str


class ChatCompletionInputType(BaseModel):
    # expects List[Message] for messages
    type: Literal["chat_completion_input"] = "chat_completion_input"


class CompletionInputType(BaseModel):
    # expects InterleavedTextMedia for content
    type: Literal["completion_input"] = "completion_input"


class AgentTurnInputType(BaseModel):
    # expects List[Message] for messages (may also include attachments?)
    type: Literal["agent_turn_input"] = "agent_turn_input"
ParamType = Annotated[
@@ -55,11 +62,22 @@ ParamType = Annotated[
        ObjectType,
        JsonType,
        UnionType,
        CustomType,
        ChatCompletionInputType,
        CompletionInputType,
        AgentTurnInputType,
    ],
    Field(discriminator="type"),
]
ArrayType.model_rebuild()
ObjectType.model_rebuild()
UnionType.model_rebuild()
# TODO: recursive definition of ParamType in these containers
# will cause infinite recursion in OpenAPI generation script
# since we are going with ChatCompletionInputType and CompletionInputType
# we don't need to worry about ArrayType/ObjectType/UnionType for now
# ArrayType.model_rebuild()
# ObjectType.model_rebuild()
# UnionType.model_rebuild()
# class CustomType(BaseModel):
# type: Literal["custom"] = "custom"
# validator_class: str
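
A small sketch of what the new discriminated union buys (assuming pydantic v2, which the model_rebuild calls above suggest): the "type" tag selects the concrete ParamType subclass during validation.

from pydantic import TypeAdapter

# ParamType / ChatCompletionInputType assumed in scope, as defined above
param = TypeAdapter(ParamType).validate_python({"type": "chat_completion_input"})
assert isinstance(param, ChatCompletionInputType)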

View file

@@ -12,7 +12,7 @@ from llama_models.llama3.api.datatypes import * # noqa: F403
from llama_models.schema_utils import json_schema_type, webmethod
from llama_stack.apis.scoring_functions import * # noqa: F403
from llama_stack.apis.agents import AgentConfig
from llama_stack.apis.common.job_types import Job
from llama_stack.apis.common.job_types import Job, JobStatus
from llama_stack.apis.scoring import * # noqa: F403
@@ -40,7 +40,7 @@ class EvaluateResponse(BaseModel):
    generations: List[Dict[str, Any]]
    # each key in the dict is a scoring function name
    scores: List[Dict[str, ScoringResult]]
    scores: Dict[str, ScoringResult]
class Eval(Protocol):
@@ -61,10 +61,10 @@ class Eval(Protocol):
    ) -> EvaluateResponse: ...

    @webmethod(route="/eval/job/status", method="GET")
    async def job_status(self, job_id: str) -> None: ...
    async def job_status(self, job_id: str) -> Optional[JobStatus]: ...

    @webmethod(route="/eval/job/cancel", method="POST")
    async def job_cancel(self, job_id: str) -> None: ...

    @webmethod(route="/eval/job/result", method="GET")
    async def job_result(self, job_id: str) -> None: ...
    async def job_result(self, job_id: str) -> EvaluateResponse: ...
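
A minimal sketch of consuming the reshaped response types; the response object would come from job_result (or a direct evaluate call) and is a placeholder here:

def summarize(response: EvaluateResponse) -> None:
    print(f"{len(response.generations)} generations")
    # scores is now a dict keyed by scoring function name instead of a list
    for scoring_fn_name, result in response.scores.items():
        print(scoring_fn_name, result)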

View file

@@ -14,7 +14,7 @@ from llama_models.schema_utils import json_schema_type, webmethod
from pydantic import BaseModel, Field
from llama_models.llama3.api.datatypes import * # noqa: F403
from llama_stack.apis.dataset import * # noqa: F403
from llama_stack.apis.datasets import * # noqa: F403
from llama_stack.apis.common.training_types import * # noqa: F403
@@ -107,8 +107,8 @@ class PostTrainingSFTRequest(BaseModel):
    job_uuid: str
    model: str
    dataset: TrainEvalDataset
    validation_dataset: TrainEvalDataset
    dataset_id: str
    validation_dataset_id: str
    algorithm: FinetuningAlgorithm
    algorithm_config: Union[
@@ -131,8 +131,8 @@ class PostTrainingRLHFRequest(BaseModel):
    finetuned_model: URL
    dataset: TrainEvalDataset
    validation_dataset: TrainEvalDataset
    dataset_id: str
    validation_dataset_id: str
    algorithm: RLHFAlgorithm
    algorithm_config: Union[DPOAlignmentConfig]
@@ -181,8 +181,8 @@ class PostTraining(Protocol):
        self,
        job_uuid: str,
        model: str,
        dataset: TrainEvalDataset,
        validation_dataset: TrainEvalDataset,
        dataset_id: str,
        validation_dataset_id: str,
        algorithm: FinetuningAlgorithm,
        algorithm_config: Union[
            LoraFinetuningConfig, QLoraFinetuningConfig, DoraFinetuningConfig
@@ -198,8 +198,8 @@ class PostTraining(Protocol):
        self,
        job_uuid: str,
        finetuned_model: URL,
        dataset: TrainEvalDataset,
        validation_dataset: TrainEvalDataset,
        dataset_id: str,
        validation_dataset_id: str,
        algorithm: RLHFAlgorithm,
        algorithm_config: Union[DPOAlignmentConfig],
        optimizer_config: OptimizerConfig,

View file

@@ -37,7 +37,7 @@ class ScoreResponse(BaseModel):
class ScoringFunctionStore(Protocol):
    def get_scoring_function(self, name: str) -> ScoringFunctionDefWithProvider: ...
    def get_scoring_function(self, name: str) -> ScoringFnDefWithProvider: ...
@runtime_checkable

View file

@@ -29,7 +29,7 @@ class LLMAsJudgeContext(BaseModel):
@json_schema_type
class ScoringFunctionDef(BaseModel):
class ScoringFnDef(BaseModel):
    identifier: str
    description: Optional[str] = None
    metadata: Dict[str, Any] = Field(
@@ -48,7 +48,7 @@ class ScoringFunctionDef(BaseModel):
@json_schema_type
class ScoringFunctionDefWithProvider(ScoringFunctionDef):
class ScoringFnDefWithProvider(ScoringFnDef):
    provider_id: str = Field(
        description="ID of the provider which serves this dataset",
    )
@@ -57,14 +57,14 @@ class ScoringFunctionDefWithProvider(ScoringFunctionDef):
@runtime_checkable
class ScoringFunctions(Protocol):
    @webmethod(route="/scoring_functions/list", method="GET")
    async def list_scoring_functions(self) -> List[ScoringFunctionDefWithProvider]: ...
    async def list_scoring_functions(self) -> List[ScoringFnDefWithProvider]: ...

    @webmethod(route="/scoring_functions/get", method="GET")
    async def get_scoring_function(
        self, name: str
    ) -> Optional[ScoringFunctionDefWithProvider]: ...
    ) -> Optional[ScoringFnDefWithProvider]: ...

    @webmethod(route="/scoring_functions/register", method="POST")
    async def register_scoring_function(
        self, function_def: ScoringFunctionDefWithProvider
        self, function_def: ScoringFnDefWithProvider
    ) -> None: ...
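
A hedged sketch of the read path with the renamed types; scoring_functions_impl and the "equality" identifier are placeholders:

async def show_scoring_fns(scoring_functions_impl) -> None:
    # both endpoints now return the ScoringFnDef* models
    for fn_def in await scoring_functions_impl.list_scoring_functions():
        print(fn_def.identifier, fn_def.provider_id)

    maybe_fn = await scoring_functions_impl.get_scoring_function(name="equality")
    if maybe_fn is not None:
        print("found:", maybe_fn.identifier)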

View file

@@ -13,7 +13,6 @@ from llama_models.schema_utils import json_schema_type, webmethod
from pydantic import BaseModel
from llama_models.llama3.api.datatypes import * # noqa: F403
from llama_stack.apis.reward_scoring import * # noqa: F403
class FilteringFunction(Enum):
@@ -40,7 +39,7 @@ class SyntheticDataGenerationRequest(BaseModel):
class SyntheticDataGenerationResponse(BaseModel):
    """Response from the synthetic data generation. Batch of (prompt, response, score) tuples that pass the threshold."""
    synthetic_data: List[ScoredDialogGenerations]
    synthetic_data: List[Dict[str, Any]]
    statistics: Optional[Dict[str, Any]] = None
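
With synthetic_data now a list of plain dicts, a response could be built roughly like this; the keys follow the docstring's prompt/response/score description and are illustrative rather than a fixed schema:

response = SyntheticDataGenerationResponse(
    synthetic_data=[
        {"prompt": "What is 2 + 2?", "response": "4", "score": 0.98},
    ],
    statistics={"num_passed": 1},
)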