diff --git a/llama_stack/apis/eval/eval.py b/llama_stack/apis/eval/eval.py
deleted file mode 100644
index 0e5959c37..000000000
--- a/llama_stack/apis/eval/eval.py
+++ /dev/null
@@ -1,143 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-from typing import Any, Dict, List, Literal, Optional, Protocol, Union
-
-from pydantic import BaseModel, Field
-from typing_extensions import Annotated
-
-from llama_stack.apis.agents import AgentConfig
-from llama_stack.apis.common.job_types import Job
-from llama_stack.apis.inference import SamplingParams, SystemMessage
-from llama_stack.apis.scoring import ScoringResult
-from llama_stack.apis.scoring_functions import ScoringFnParams
-from llama_stack.schema_utils import json_schema_type, register_schema, webmethod
-
-
-@json_schema_type
-class ModelCandidate(BaseModel):
-    """A model candidate for evaluation.
-
-    :param model: The model ID to evaluate.
-    :param sampling_params: The sampling parameters for the model.
-    :param system_message: (Optional) The system message providing instructions or context to the model.
-    """
-
-    type: Literal["model"] = "model"
-    model: str
-    sampling_params: SamplingParams
-    system_message: Optional[SystemMessage] = None
-
-
-@json_schema_type
-class AgentCandidate(BaseModel):
-    """An agent candidate for evaluation.
-
-    :param config: The configuration for the agent candidate.
-    """
-
-    type: Literal["agent"] = "agent"
-    config: AgentConfig
-
-
-EvalCandidate = Annotated[Union[ModelCandidate, AgentCandidate], Field(discriminator="type")]
-register_schema(EvalCandidate, name="EvalCandidate")
-
-
-@json_schema_type
-class BenchmarkConfig(BaseModel):
-    """A benchmark configuration for evaluation.
-
-    :param eval_candidate: The candidate to evaluate.
-    :param scoring_params: Map between scoring function id and parameters for each scoring function you want to run
-    :param num_examples: (Optional) The number of examples to evaluate. If not provided, all examples in the dataset will be evaluated
-    """
-
-    eval_candidate: EvalCandidate
-    scoring_params: Dict[str, ScoringFnParams] = Field(
-        description="Map between scoring function id and parameters for each scoring function you want to run",
-        default_factory=dict,
-    )
-    num_examples: Optional[int] = Field(
-        description="Number of examples to evaluate (useful for testing), if not provided, all examples in the dataset will be evaluated",
-        default=None,
-    )
-    # we could optionally add any specific dataset config here
-
-
-@json_schema_type
-class EvaluateResponse(BaseModel):
-    """The response from an evaluation.
-
-    :param generations: The generations from the evaluation.
-    :param scores: The scores from the evaluation.
-    """
-
-    generations: List[Dict[str, Any]]
-    # each key in the dict is a scoring function name
-    scores: Dict[str, ScoringResult]
-
-
-class Eval(Protocol):
-    """Llama Stack Evaluation API for running evaluations on model and agent candidates."""
-
-    @webmethod(route="/eval/benchmarks/{benchmark_id}/jobs", method="POST")
-    async def run_eval(
-        self,
-        benchmark_id: str,
-        benchmark_config: BenchmarkConfig,
-    ) -> Job:
-        """Run an evaluation on a benchmark.
-
-        :param benchmark_id: The ID of the benchmark to run the evaluation on.
-        :param benchmark_config: The configuration for the benchmark.
-        :return: The job that was created to run the evaluation.
-        """
-
-    @webmethod(route="/eval/benchmarks/{benchmark_id}/evaluations", method="POST")
-    async def evaluate_rows(
-        self,
-        benchmark_id: str,
-        input_rows: List[Dict[str, Any]],
-        scoring_functions: List[str],
-        benchmark_config: BenchmarkConfig,
-    ) -> EvaluateResponse:
-        """Evaluate a list of rows on a benchmark.
-
-        :param benchmark_id: The ID of the benchmark to run the evaluation on.
-        :param input_rows: The rows to evaluate.
-        :param scoring_functions: The scoring functions to use for the evaluation.
-        :param benchmark_config: The configuration for the benchmark.
-        :return: EvaluateResponse object containing generations and scores.
-        """
-
-    @webmethod(route="/eval/benchmarks/{benchmark_id}/jobs/{job_id}", method="GET")
-    async def job_status(self, benchmark_id: str, job_id: str) -> Job:
-        """Get the status of a job.
-
-        :param benchmark_id: The ID of the benchmark to run the evaluation on.
-        :param job_id: The ID of the job to get the status of.
-        :return: The status of the evaluation job.
-        """
-        ...
-
-    @webmethod(route="/eval/benchmarks/{benchmark_id}/jobs/{job_id}", method="DELETE")
-    async def job_cancel(self, benchmark_id: str, job_id: str) -> None:
-        """Cancel a job.
-
-        :param benchmark_id: The ID of the benchmark to run the evaluation on.
-        :param job_id: The ID of the job to cancel.
-        """
-        ...
-
-    @webmethod(route="/eval/benchmarks/{benchmark_id}/jobs/{job_id}/result", method="GET")
-    async def job_result(self, benchmark_id: str, job_id: str) -> EvaluateResponse:
-        """Get the result of a job.
-
-        :param benchmark_id: The ID of the benchmark to run the evaluation on.
-        :param job_id: The ID of the job to get the result of.
-        :return: The result of the job.
-        """
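For context, before this change the `Eval` protocol deleted above was driven roughly as follows. This is a minimal sketch, not code from the repository: `eval_impl` (any object implementing `Eval`), the model ID, and the "mmlu" benchmark ID are placeholders.

# Sketch of pre-removal usage; eval_impl, the model ID and the benchmark ID
# below are placeholders, not values taken from this diff.
from llama_stack.apis.eval import BenchmarkConfig, Eval, ModelCandidate
from llama_stack.apis.inference import SamplingParams


async def run_small_eval(eval_impl: Eval) -> None:
    candidate = ModelCandidate(
        model="meta-llama/Llama-3.1-8B-Instruct",  # placeholder model ID
        sampling_params=SamplingParams(),
    )
    config = BenchmarkConfig(
        eval_candidate=candidate,
        num_examples=5,  # evaluate only a few rows while testing
    )
    # POST /eval/benchmarks/{benchmark_id}/jobs
    job = await eval_impl.run_eval(benchmark_id="mmlu", benchmark_config=config)
    # GET /eval/benchmarks/{benchmark_id}/jobs/{job_id}/result
    result = await eval_impl.job_result(benchmark_id="mmlu", job_id=job.job_id)
    print(result.scores)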
- """ - - @webmethod(route="/eval/benchmarks/{benchmark_id}/evaluations", method="POST") - async def evaluate_rows( - self, - benchmark_id: str, - input_rows: List[Dict[str, Any]], - scoring_functions: List[str], - benchmark_config: BenchmarkConfig, - ) -> EvaluateResponse: - """Evaluate a list of rows on a benchmark. - - :param benchmark_id: The ID of the benchmark to run the evaluation on. - :param input_rows: The rows to evaluate. - :param scoring_functions: The scoring functions to use for the evaluation. - :param benchmark_config: The configuration for the benchmark. - :return: EvaluateResponse object containing generations and scores - """ - - @webmethod(route="/eval/benchmarks/{benchmark_id}/jobs/{job_id}", method="GET") - async def job_status(self, benchmark_id: str, job_id: str) -> Job: - """Get the status of a job. - - :param benchmark_id: The ID of the benchmark to run the evaluation on. - :param job_id: The ID of the job to get the status of. - :return: The status of the evaluationjob. - """ - ... - - @webmethod(route="/eval/benchmarks/{benchmark_id}/jobs/{job_id}", method="DELETE") - async def job_cancel(self, benchmark_id: str, job_id: str) -> None: - """Cancel a job. - - :param benchmark_id: The ID of the benchmark to run the evaluation on. - :param job_id: The ID of the job to cancel. - """ - ... - - @webmethod(route="/eval/benchmarks/{benchmark_id}/jobs/{job_id}/result", method="GET") - async def job_result(self, benchmark_id: str, job_id: str) -> EvaluateResponse: - """Get the result of a job. - - :param benchmark_id: The ID of the benchmark to run the evaluation on. - :param job_id: The ID of the job to get the result of. - :return: The result of the job. - """ diff --git a/llama_stack/apis/scoring_functions/scoring_functions.py b/llama_stack/apis/scoring_functions/scoring_functions.py deleted file mode 100644 index 4f85947dd..000000000 --- a/llama_stack/apis/scoring_functions/scoring_functions.py +++ /dev/null @@ -1,148 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the terms described in the LICENSE file in -# the root directory of this source tree. - -from enum import Enum -from typing import ( - Any, - Dict, - List, - Literal, - Optional, - Protocol, - Union, - runtime_checkable, -) - -from pydantic import BaseModel, Field -from typing_extensions import Annotated - -from llama_stack.apis.common.type_system import ParamType -from llama_stack.apis.resource import Resource, ResourceType -from llama_stack.schema_utils import json_schema_type, register_schema, webmethod - - -# Perhaps more structure can be imposed on these functions. Maybe they could be associated -# with standard metrics so they can be rolled up? 
-@json_schema_type
-class ScoringFnParamsType(Enum):
-    llm_as_judge = "llm_as_judge"
-    regex_parser = "regex_parser"
-    basic = "basic"
-
-
-@json_schema_type
-class AggregationFunctionType(Enum):
-    average = "average"
-    weighted_average = "weighted_average"
-    median = "median"
-    categorical_count = "categorical_count"
-    accuracy = "accuracy"
-
-
-@json_schema_type
-class LLMAsJudgeScoringFnParams(BaseModel):
-    type: Literal[ScoringFnParamsType.llm_as_judge.value] = ScoringFnParamsType.llm_as_judge.value
-    judge_model: str
-    prompt_template: Optional[str] = None
-    judge_score_regexes: Optional[List[str]] = Field(
-        description="Regexes to extract the answer from generated response",
-        default_factory=list,
-    )
-    aggregation_functions: Optional[List[AggregationFunctionType]] = Field(
-        description="Aggregation functions to apply to the scores of each row",
-        default_factory=list,
-    )
-
-
-@json_schema_type
-class RegexParserScoringFnParams(BaseModel):
-    type: Literal[ScoringFnParamsType.regex_parser.value] = ScoringFnParamsType.regex_parser.value
-    parsing_regexes: Optional[List[str]] = Field(
-        description="Regex to extract the answer from generated response",
-        default_factory=list,
-    )
-    aggregation_functions: Optional[List[AggregationFunctionType]] = Field(
-        description="Aggregation functions to apply to the scores of each row",
-        default_factory=list,
-    )
-
-
-@json_schema_type
-class BasicScoringFnParams(BaseModel):
-    type: Literal[ScoringFnParamsType.basic.value] = ScoringFnParamsType.basic.value
-    aggregation_functions: Optional[List[AggregationFunctionType]] = Field(
-        description="Aggregation functions to apply to the scores of each row",
-        default_factory=list,
-    )
-
-
-ScoringFnParams = Annotated[
-    Union[
-        LLMAsJudgeScoringFnParams,
-        RegexParserScoringFnParams,
-        BasicScoringFnParams,
-    ],
-    Field(discriminator="type"),
-]
-register_schema(ScoringFnParams, name="ScoringFnParams")
-
-
-class CommonScoringFnFields(BaseModel):
-    description: Optional[str] = None
-    metadata: Dict[str, Any] = Field(
-        default_factory=dict,
-        description="Any additional metadata for this definition",
-    )
-    return_type: ParamType = Field(
-        description="The return type of the deterministic function",
-    )
-    params: Optional[ScoringFnParams] = Field(
-        description="The parameters for the scoring function for benchmark eval, these can be overridden for app eval",
-        default=None,
-    )
-
-
-@json_schema_type
-class ScoringFn(CommonScoringFnFields, Resource):
-    type: Literal[ResourceType.scoring_function.value] = ResourceType.scoring_function.value
-
-    @property
-    def scoring_fn_id(self) -> str:
-        return self.identifier
-
-    @property
-    def provider_scoring_fn_id(self) -> str:
-        return self.provider_resource_id
-
-
-class ScoringFnInput(CommonScoringFnFields, BaseModel):
-    scoring_fn_id: str
-    provider_id: Optional[str] = None
-    provider_scoring_fn_id: Optional[str] = None
-
-
-class ListScoringFunctionsResponse(BaseModel):
-    data: List[ScoringFn]
-
-
-@runtime_checkable
-class ScoringFunctions(Protocol):
-    @webmethod(route="/scoring-functions", method="GET")
-    async def list_scoring_functions(self) -> ListScoringFunctionsResponse: ...
-
-    @webmethod(route="/scoring-functions/{scoring_fn_id:path}", method="GET")
-    async def get_scoring_function(self, scoring_fn_id: str, /) -> ScoringFn: ...
-
-    @webmethod(route="/scoring-functions", method="POST")
-    async def register_scoring_function(
-        self,
-        scoring_fn_id: str,
-        description: str,
-        return_type: ParamType,
-        provider_scoring_fn_id: Optional[str] = None,
-        provider_id: Optional[str] = None,
-        params: Optional[ScoringFnParams] = None,
-    ) -> None: ...
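Before their removal, the `ScoringFnParams` variants above were constructed to override how a scoring function behaves during an eval run. A sketch only: the judge model, prompt template, regexes, and the scoring function ID mentioned in the trailing comment are placeholder values, not taken from this diff.

# Sketch of how the removed params types were put together; the judge model,
# regexes and scoring function ID below are placeholders.
from llama_stack.apis.scoring_functions import (
    AggregationFunctionType,
    LLMAsJudgeScoringFnParams,
    RegexParserScoringFnParams,
)

judge_params = LLMAsJudgeScoringFnParams(
    judge_model="meta-llama/Llama-3.1-70B-Instruct",  # placeholder judge model
    prompt_template="Rate the answer from 0 to 10: {generated_answer}",
    judge_score_regexes=[r"(\d+)"],
    aggregation_functions=[AggregationFunctionType.average],
)

regex_params = RegexParserScoringFnParams(
    parsing_regexes=[r"Answer:\s*(.*)"],
    aggregation_functions=[AggregationFunctionType.accuracy],
)

# These would then be passed via BenchmarkConfig.scoring_params, keyed by a
# registered scoring function ID, e.g. {"llm-as-judge::base": judge_params}.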
diff --git a/llama_stack/providers/registry/eval.py b/llama_stack/providers/registry/eval.py
deleted file mode 100644
index f3e42c531..000000000
--- a/llama_stack/providers/registry/eval.py
+++ /dev/null
@@ -1,28 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-from typing import List
-
-from llama_stack.providers.datatypes import Api, InlineProviderSpec, ProviderSpec
-
-
-def available_providers() -> List[ProviderSpec]:
-    return [
-        InlineProviderSpec(
-            api=Api.eval,
-            provider_type="inline::meta-reference",
-            pip_packages=["tree_sitter", "pythainlp", "langdetect", "emoji", "nltk"],
-            module="llama_stack.providers.inline.eval.meta_reference",
-            config_class="llama_stack.providers.inline.eval.meta_reference.MetaReferenceEvalConfig",
-            api_dependencies=[
-                Api.datasetio,
-                Api.datasets,
-                Api.scoring,
-                Api.inference,
-                Api.agents,
-            ],
-        ),
-    ]
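The registry entry deleted last is what made the inline meta-reference eval provider discoverable. A small sketch of how such a spec could be inspected before this change; the lookup code is illustrative, and only available_providers() and the spec fields come from the file above.

# Illustrative pre-removal inspection of the registry entry; the dict lookup
# is a simplification, not how the stack's resolver actually works.
from llama_stack.providers.registry.eval import available_providers

specs = {spec.provider_type: spec for spec in available_providers()}
meta_ref = specs["inline::meta-reference"]

print(meta_ref.module)        # llama_stack.providers.inline.eval.meta_reference
print(meta_ref.pip_packages)  # extra packages this provider needs installed
print([api.value for api in meta_ref.api_dependencies])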