From 3a87562e8d1226bc0e8f2c6748251d056f34b1d3 Mon Sep 17 00:00:00 2001 From: Xi Yan Date: Wed, 12 Mar 2025 21:54:12 -0700 Subject: [PATCH] scoring updates --- docs/_static/llama-stack-spec.html | 1489 ++++++++--------- docs/_static/llama-stack-spec.yaml | 1161 +++++++------ llama_stack/apis/benchmarks/benchmarks.py | 1 - llama_stack/apis/eval/eval.py | 48 +- llama_stack/apis/scoring/scoring.py | 15 +- .../scoring_functions/scoring_functions.py | 98 +- 6 files changed, 1346 insertions(+), 1466 deletions(-) diff --git a/docs/_static/llama-stack-spec.html b/docs/_static/llama-stack-spec.html index d2a745655..493eeebc4 100644 --- a/docs/_static/llama-stack-spec.html +++ b/docs/_static/llama-stack-spec.html @@ -968,7 +968,60 @@ } } }, - "/v1/eval/benchmarks/{benchmark_id}/evaluations": { + "/v1/eval/benchmarks/{benchmark_id}/jobs": { + "post": { + "responses": { + "200": { + "description": "The job that was created to run the evaluation.", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/Job" + } + } + } + }, + "400": { + "$ref": "#/components/responses/BadRequest400" + }, + "429": { + "$ref": "#/components/responses/TooManyRequests429" + }, + "500": { + "$ref": "#/components/responses/InternalServerError500" + }, + "default": { + "$ref": "#/components/responses/DefaultError" + } + }, + "tags": [ + "Eval" + ], + "description": "Run an evaluation on a benchmark.", + "parameters": [ + { + "name": "benchmark_id", + "in": "path", + "description": "The ID of the benchmark to run the evaluation on.", + "required": true, + "schema": { + "type": "string" + } + } + ], + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/EvaluateBenchmarkRequest" + } + } + }, + "required": true + } + } + }, + "/v1/eval/rows": { "post": { "responses": { "200": { @@ -997,18 +1050,8 @@ "tags": [ "Eval" ], - "description": "Evaluate a list of rows on a benchmark.", - "parameters": [ - { - "name": "benchmark_id", - "in": "path", - "description": "The ID of the benchmark to run the evaluation on.", - "required": true, - "schema": { - "type": "string" - } - } - ], + "description": "Evaluate a list of rows on a candidate.", + "parameters": [], "requestBody": { "content": { "application/json": { @@ -3498,59 +3541,6 @@ } } }, - "/v1/eval/benchmarks/{benchmark_id}/jobs": { - "post": { - "responses": { - "200": { - "description": "The job that was created to run the evaluation.", - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/Job" - } - } - } - }, - "400": { - "$ref": "#/components/responses/BadRequest400" - }, - "429": { - "$ref": "#/components/responses/TooManyRequests429" - }, - "500": { - "$ref": "#/components/responses/InternalServerError500" - }, - "default": { - "$ref": "#/components/responses/DefaultError" - } - }, - "tags": [ - "Eval" - ], - "description": "Run an evaluation on a benchmark.", - "parameters": [ - { - "name": "benchmark_id", - "in": "path", - "description": "The ID of the benchmark to run the evaluation on.", - "required": true, - "schema": { - "type": "string" - } - } - ], - "requestBody": { - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/RunEvalRequest" - } - } - }, - "required": true - } - } - }, "/v1/safety/run-shield": { "post": { "responses": { @@ -3708,7 +3698,7 @@ "content": { "application/json": { "schema": { - "$ref": "#/components/schemas/ScoreBatchRequest" + "$ref": "#/components/schemas/ScoreDatasetRequest" } } }, @@ -6385,381 +6375,6 @@ "title": "AgentCandidate", "description": "An agent candidate for evaluation." }, - "AnswerCorrectnessScoringFnParams": { - "type": "object", - "properties": { - "type": { - "type": "string", - "const": "answer_correctness", - "default": "answer_correctness" - }, - "answer_correctness": { - "type": "object", - "properties": { - "aggregation_functions": { - "type": "array", - "items": { - "type": "string", - "enum": [ - "average", - "median", - "categorical_count", - "accuracy" - ], - "title": "AggregationFunctionType", - "description": "A type of aggregation function." - }, - "description": "(Optional) Aggregation functions to apply to the scores of each row. If not provided, no aggregation will be performed." - } - }, - "additionalProperties": false, - "title": "BasicScoringFnParamsFields" - } - }, - "additionalProperties": false, - "required": [ - "type", - "answer_correctness" - ], - "title": "AnswerCorrectnessScoringFnParams" - }, - "AnswerRelevancyScoringFnParams": { - "type": "object", - "properties": { - "type": { - "type": "string", - "const": "answer_relevancy", - "default": "answer_relevancy" - }, - "answer_relevancy": { - "type": "object", - "properties": { - "aggregation_functions": { - "type": "array", - "items": { - "type": "string", - "enum": [ - "average", - "median", - "categorical_count", - "accuracy" - ], - "title": "AggregationFunctionType", - "description": "A type of aggregation function." - }, - "description": "(Optional) Aggregation functions to apply to the scores of each row. If not provided, no aggregation will be performed." - } - }, - "additionalProperties": false, - "title": "BasicScoringFnParamsFields" - } - }, - "additionalProperties": false, - "required": [ - "type", - "answer_relevancy" - ], - "title": "AnswerRelevancyScoringFnParams" - }, - "AnswerSimilarityScoringFnParams": { - "type": "object", - "properties": { - "type": { - "type": "string", - "const": "answer_similarity", - "default": "answer_similarity" - }, - "answer_similarity": { - "type": "object", - "properties": { - "aggregation_functions": { - "type": "array", - "items": { - "type": "string", - "enum": [ - "average", - "median", - "categorical_count", - "accuracy" - ], - "title": "AggregationFunctionType", - "description": "A type of aggregation function." - }, - "description": "(Optional) Aggregation functions to apply to the scores of each row. If not provided, no aggregation will be performed." - } - }, - "additionalProperties": false, - "title": "BasicScoringFnParamsFields" - } - }, - "additionalProperties": false, - "required": [ - "type", - "answer_similarity" - ], - "title": "AnswerSimilarityScoringFnParams" - }, - "BenchmarkConfig": { - "type": "object", - "properties": { - "eval_candidate": { - "$ref": "#/components/schemas/EvalCandidate", - "description": "The candidate to evaluate." - }, - "scoring_params": { - "type": "object", - "additionalProperties": { - "$ref": "#/components/schemas/ScoringFnParams" - }, - "description": "Map between scoring function id and parameters for each scoring function you want to run" - }, - "num_examples": { - "type": "integer", - "description": "(Optional) The number of examples to evaluate. If not provided, all examples in the dataset will be evaluated" - } - }, - "additionalProperties": false, - "required": [ - "eval_candidate", - "scoring_params" - ], - "title": "BenchmarkConfig", - "description": "A benchmark configuration for evaluation." - }, - "ContextEntityRecallScoringFnParams": { - "type": "object", - "properties": { - "type": { - "type": "string", - "const": "context_entity_recall", - "default": "context_entity_recall" - }, - "context_entity_recall": { - "type": "object", - "properties": { - "aggregation_functions": { - "type": "array", - "items": { - "type": "string", - "enum": [ - "average", - "median", - "categorical_count", - "accuracy" - ], - "title": "AggregationFunctionType", - "description": "A type of aggregation function." - }, - "description": "(Optional) Aggregation functions to apply to the scores of each row. If not provided, no aggregation will be performed." - } - }, - "additionalProperties": false, - "title": "BasicScoringFnParamsFields" - } - }, - "additionalProperties": false, - "required": [ - "type", - "context_entity_recall" - ], - "title": "ContextEntityRecallScoringFnParams" - }, - "ContextPrecisionScoringFnParams": { - "type": "object", - "properties": { - "type": { - "type": "string", - "const": "context_precision", - "default": "context_precision" - }, - "context_precision": { - "type": "object", - "properties": { - "aggregation_functions": { - "type": "array", - "items": { - "type": "string", - "enum": [ - "average", - "median", - "categorical_count", - "accuracy" - ], - "title": "AggregationFunctionType", - "description": "A type of aggregation function." - }, - "description": "(Optional) Aggregation functions to apply to the scores of each row. If not provided, no aggregation will be performed." - } - }, - "additionalProperties": false, - "title": "BasicScoringFnParamsFields" - } - }, - "additionalProperties": false, - "required": [ - "type", - "context_precision" - ], - "title": "ContextPrecisionScoringFnParams" - }, - "ContextRecallScoringFnParams": { - "type": "object", - "properties": { - "type": { - "type": "string", - "const": "context_recall", - "default": "context_recall" - }, - "context_recall": { - "type": "object", - "properties": { - "aggregation_functions": { - "type": "array", - "items": { - "type": "string", - "enum": [ - "average", - "median", - "categorical_count", - "accuracy" - ], - "title": "AggregationFunctionType", - "description": "A type of aggregation function." - }, - "description": "(Optional) Aggregation functions to apply to the scores of each row. If not provided, no aggregation will be performed." - } - }, - "additionalProperties": false, - "title": "BasicScoringFnParamsFields" - } - }, - "additionalProperties": false, - "required": [ - "type", - "context_recall" - ], - "title": "ContextRecallScoringFnParams" - }, - "ContextRelevancyScoringFnParams": { - "type": "object", - "properties": { - "type": { - "type": "string", - "const": "context_relevancy", - "default": "context_relevancy" - }, - "context_relevancy": { - "type": "object", - "properties": { - "aggregation_functions": { - "type": "array", - "items": { - "type": "string", - "enum": [ - "average", - "median", - "categorical_count", - "accuracy" - ], - "title": "AggregationFunctionType", - "description": "A type of aggregation function." - }, - "description": "(Optional) Aggregation functions to apply to the scores of each row. If not provided, no aggregation will be performed." - } - }, - "additionalProperties": false, - "title": "BasicScoringFnParamsFields" - } - }, - "additionalProperties": false, - "required": [ - "type", - "context_relevancy" - ], - "title": "ContextRelevancyScoringFnParams" - }, - "CustomLLMAsJudgeScoringFnParams": { - "type": "object", - "properties": { - "type": { - "type": "string", - "const": "custom_llm_as_judge", - "default": "custom_llm_as_judge" - }, - "custom_llm_as_judge": { - "type": "object", - "properties": { - "type": { - "type": "string", - "const": "custom_llm_as_judge", - "default": "custom_llm_as_judge" - }, - "judge_model": { - "type": "string" - }, - "prompt_template": { - "type": "string" - }, - "judge_score_regexes": { - "type": "array", - "items": { - "type": "string" - } - } - }, - "additionalProperties": false, - "required": [ - "type", - "judge_model" - ], - "title": "CustomLLMAsJudgeScoringFnParamsFields" - } - }, - "additionalProperties": false, - "required": [ - "type", - "custom_llm_as_judge" - ], - "title": "CustomLLMAsJudgeScoringFnParams" - }, - "EqualityScoringFnParams": { - "type": "object", - "properties": { - "type": { - "type": "string", - "const": "equality", - "default": "equality" - }, - "equality": { - "type": "object", - "properties": { - "aggregation_functions": { - "type": "array", - "items": { - "type": "string", - "enum": [ - "average", - "median", - "categorical_count", - "accuracy" - ], - "title": "AggregationFunctionType", - "description": "A type of aggregation function." - }, - "description": "(Optional) Aggregation functions to apply to the scores of each row. If not provided, no aggregation will be performed." - } - }, - "additionalProperties": false, - "title": "BasicScoringFnParamsFields" - } - }, - "additionalProperties": false, - "required": [ - "type", - "equality" - ], - "title": "EqualityScoringFnParams" - }, "EvalCandidate": { "oneOf": [ { @@ -6777,82 +6392,6 @@ } } }, - "FactualityScoringFnParams": { - "type": "object", - "properties": { - "type": { - "type": "string", - "const": "factuality", - "default": "factuality" - }, - "factuality": { - "type": "object", - "properties": { - "aggregation_functions": { - "type": "array", - "items": { - "type": "string", - "enum": [ - "average", - "median", - "categorical_count", - "accuracy" - ], - "title": "AggregationFunctionType", - "description": "A type of aggregation function." - }, - "description": "(Optional) Aggregation functions to apply to the scores of each row. If not provided, no aggregation will be performed." - } - }, - "additionalProperties": false, - "title": "BasicScoringFnParamsFields" - } - }, - "additionalProperties": false, - "required": [ - "type", - "factuality" - ], - "title": "FactualityScoringFnParams" - }, - "FaithfulnessScoringFnParams": { - "type": "object", - "properties": { - "type": { - "type": "string", - "const": "faithfulness", - "default": "faithfulness" - }, - "faithfulness": { - "type": "object", - "properties": { - "aggregation_functions": { - "type": "array", - "items": { - "type": "string", - "enum": [ - "average", - "median", - "categorical_count", - "accuracy" - ], - "title": "AggregationFunctionType", - "description": "A type of aggregation function." - }, - "description": "(Optional) Aggregation functions to apply to the scores of each row. If not provided, no aggregation will be performed." - } - }, - "additionalProperties": false, - "title": "BasicScoringFnParamsFields" - } - }, - "additionalProperties": false, - "required": [ - "type", - "faithfulness" - ], - "title": "FaithfulnessScoringFnParams" - }, "ModelCandidate": { "type": "object", "properties": { @@ -6883,209 +6422,37 @@ "title": "ModelCandidate", "description": "A model candidate for evaluation." }, - "RegexParserMathScoringFnParams": { + "EvaluateBenchmarkRequest": { "type": "object", "properties": { - "type": { - "type": "string", - "const": "regex_parser_math_response", - "default": "regex_parser_math_response" - }, - "regex_parser_math_response": { - "type": "object", - "properties": { - "parsing_regexes": { - "type": "array", - "items": { - "type": "string" - }, - "description": "(Optional) Regexes to extract the answer from generated response." - }, - "aggregation_functions": { - "type": "array", - "items": { - "type": "string", - "enum": [ - "average", - "median", - "categorical_count", - "accuracy" - ], - "title": "AggregationFunctionType", - "description": "A type of aggregation function." - }, - "description": "(Optional) Aggregation functions to apply to the scores of each row. If not provided, no aggregation will be performed." - } - }, - "additionalProperties": false, - "required": [ - "parsing_regexes" - ], - "title": "RegexParserScoringFnParamsFields" + "candidate": { + "$ref": "#/components/schemas/EvalCandidate", + "description": "The candidate to evaluate on." } }, "additionalProperties": false, "required": [ - "type", - "regex_parser_math_response" + "candidate" ], - "title": "RegexParserMathScoringFnParams" + "title": "EvaluateBenchmarkRequest" }, - "RegexParserScoringFnParams": { + "Job": { "type": "object", "properties": { - "type": { - "type": "string", - "const": "regex_parser", - "default": "regex_parser" - }, - "regex_parser": { - "type": "object", - "properties": { - "parsing_regexes": { - "type": "array", - "items": { - "type": "string" - }, - "description": "(Optional) Regexes to extract the answer from generated response." - }, - "aggregation_functions": { - "type": "array", - "items": { - "type": "string", - "enum": [ - "average", - "median", - "categorical_count", - "accuracy" - ], - "title": "AggregationFunctionType", - "description": "A type of aggregation function." - }, - "description": "(Optional) Aggregation functions to apply to the scores of each row. If not provided, no aggregation will be performed." - } - }, - "additionalProperties": false, - "required": [ - "parsing_regexes" - ], - "title": "RegexParserScoringFnParamsFields" + "job_id": { + "type": "string" } }, "additionalProperties": false, "required": [ - "type", - "regex_parser" + "job_id" ], - "title": "RegexParserScoringFnParams" - }, - "ScoringFnParams": { - "oneOf": [ - { - "$ref": "#/components/schemas/CustomLLMAsJudgeScoringFnParams" - }, - { - "$ref": "#/components/schemas/RegexParserScoringFnParams" - }, - { - "$ref": "#/components/schemas/RegexParserMathScoringFnParams" - }, - { - "$ref": "#/components/schemas/EqualityScoringFnParams" - }, - { - "$ref": "#/components/schemas/SubsetOfcoringFnParams" - }, - { - "$ref": "#/components/schemas/FactualityScoringFnParams" - }, - { - "$ref": "#/components/schemas/FaithfulnessScoringFnParams" - }, - { - "$ref": "#/components/schemas/AnswerCorrectnessScoringFnParams" - }, - { - "$ref": "#/components/schemas/AnswerRelevancyScoringFnParams" - }, - { - "$ref": "#/components/schemas/AnswerSimilarityScoringFnParams" - }, - { - "$ref": "#/components/schemas/ContextEntityRecallScoringFnParams" - }, - { - "$ref": "#/components/schemas/ContextPrecisionScoringFnParams" - }, - { - "$ref": "#/components/schemas/ContextRecallScoringFnParams" - }, - { - "$ref": "#/components/schemas/ContextRelevancyScoringFnParams" - } - ], - "discriminator": { - "propertyName": "type", - "mapping": { - "custom_llm_as_judge": "#/components/schemas/CustomLLMAsJudgeScoringFnParams", - "regex_parser": "#/components/schemas/RegexParserScoringFnParams", - "regex_parser_math_response": "#/components/schemas/RegexParserMathScoringFnParams", - "equality": "#/components/schemas/EqualityScoringFnParams", - "subset_of": "#/components/schemas/SubsetOfcoringFnParams", - "factuality": "#/components/schemas/FactualityScoringFnParams", - "faithfulness": "#/components/schemas/FaithfulnessScoringFnParams", - "answer_correctness": "#/components/schemas/AnswerCorrectnessScoringFnParams", - "answer_relevancy": "#/components/schemas/AnswerRelevancyScoringFnParams", - "answer_similarity": "#/components/schemas/AnswerSimilarityScoringFnParams", - "context_entity_recall": "#/components/schemas/ContextEntityRecallScoringFnParams", - "context_precision": "#/components/schemas/ContextPrecisionScoringFnParams", - "context_recall": "#/components/schemas/ContextRecallScoringFnParams", - "context_relevancy": "#/components/schemas/ContextRelevancyScoringFnParams" - } - } - }, - "SubsetOfcoringFnParams": { - "type": "object", - "properties": { - "type": { - "type": "string", - "const": "subset_of", - "default": "subset_of" - }, - "subset_of": { - "type": "object", - "properties": { - "aggregation_functions": { - "type": "array", - "items": { - "type": "string", - "enum": [ - "average", - "median", - "categorical_count", - "accuracy" - ], - "title": "AggregationFunctionType", - "description": "A type of aggregation function." - }, - "description": "(Optional) Aggregation functions to apply to the scores of each row. If not provided, no aggregation will be performed." - } - }, - "additionalProperties": false, - "title": "BasicScoringFnParamsFields" - } - }, - "additionalProperties": false, - "required": [ - "type", - "subset_of" - ], - "title": "SubsetOfcoringFnParams" + "title": "Job" }, "EvaluateRowsRequest": { "type": "object", "properties": { - "input_rows": { + "dataset_rows": { "type": "array", "items": { "type": "object", @@ -7114,23 +6481,23 @@ }, "description": "The rows to evaluate." }, - "scoring_functions": { + "scoring_fn_ids": { "type": "array", "items": { "type": "string" }, - "description": "The scoring functions to use for the evaluation." + "description": "The scoring function ids to use for the evaluation." }, - "benchmark_config": { - "$ref": "#/components/schemas/BenchmarkConfig", - "description": "The configuration for the benchmark." + "candidate": { + "$ref": "#/components/schemas/EvalCandidate", + "description": "The candidate to evaluate on." } }, "additionalProperties": false, "required": [ - "input_rows", - "scoring_functions", - "benchmark_config" + "dataset_rows", + "scoring_fn_ids", + "candidate" ], "title": "EvaluateRowsRequest" }, @@ -7731,6 +7098,526 @@ "title": "PaginatedRowsResult", "description": "A paginated list of rows from a dataset." }, + "AnswerCorrectnessScoringFn": { + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "answer_correctness", + "default": "answer_correctness" + }, + "answer_correctness": { + "type": "object", + "properties": { + "aggregation_functions": { + "type": "array", + "items": { + "type": "string", + "enum": [ + "average", + "median", + "categorical_count", + "accuracy" + ], + "title": "AggregationFunctionType", + "description": "A type of aggregation function." + }, + "description": "(Optional) Aggregation functions to apply to the scores of each row. If not provided, no aggregation will be performed." + } + }, + "additionalProperties": false, + "title": "BasicScoringFnParams" + } + }, + "additionalProperties": false, + "required": [ + "type", + "answer_correctness" + ], + "title": "AnswerCorrectnessScoringFn" + }, + "AnswerRelevancyScoringFn": { + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "answer_relevancy", + "default": "answer_relevancy" + }, + "answer_relevancy": { + "type": "object", + "properties": { + "aggregation_functions": { + "type": "array", + "items": { + "type": "string", + "enum": [ + "average", + "median", + "categorical_count", + "accuracy" + ], + "title": "AggregationFunctionType", + "description": "A type of aggregation function." + }, + "description": "(Optional) Aggregation functions to apply to the scores of each row. If not provided, no aggregation will be performed." + } + }, + "additionalProperties": false, + "title": "BasicScoringFnParams" + } + }, + "additionalProperties": false, + "required": [ + "type", + "answer_relevancy" + ], + "title": "AnswerRelevancyScoringFn" + }, + "AnswerSimilarityScoringFn": { + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "answer_similarity", + "default": "answer_similarity" + }, + "answer_similarity": { + "type": "object", + "properties": { + "aggregation_functions": { + "type": "array", + "items": { + "type": "string", + "enum": [ + "average", + "median", + "categorical_count", + "accuracy" + ], + "title": "AggregationFunctionType", + "description": "A type of aggregation function." + }, + "description": "(Optional) Aggregation functions to apply to the scores of each row. If not provided, no aggregation will be performed." + } + }, + "additionalProperties": false, + "title": "BasicScoringFnParams" + } + }, + "additionalProperties": false, + "required": [ + "type", + "answer_similarity" + ], + "title": "AnswerSimilarityScoringFn" + }, + "ContextEntityRecallScoringFn": { + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "context_entity_recall", + "default": "context_entity_recall" + }, + "context_entity_recall": { + "type": "object", + "properties": { + "aggregation_functions": { + "type": "array", + "items": { + "type": "string", + "enum": [ + "average", + "median", + "categorical_count", + "accuracy" + ], + "title": "AggregationFunctionType", + "description": "A type of aggregation function." + }, + "description": "(Optional) Aggregation functions to apply to the scores of each row. If not provided, no aggregation will be performed." + } + }, + "additionalProperties": false, + "title": "BasicScoringFnParams" + } + }, + "additionalProperties": false, + "required": [ + "type", + "context_entity_recall" + ], + "title": "ContextEntityRecallScoringFn" + }, + "ContextPrecisionScoringFn": { + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "context_precision", + "default": "context_precision" + }, + "context_precision": { + "type": "object", + "properties": { + "aggregation_functions": { + "type": "array", + "items": { + "type": "string", + "enum": [ + "average", + "median", + "categorical_count", + "accuracy" + ], + "title": "AggregationFunctionType", + "description": "A type of aggregation function." + }, + "description": "(Optional) Aggregation functions to apply to the scores of each row. If not provided, no aggregation will be performed." + } + }, + "additionalProperties": false, + "title": "BasicScoringFnParams" + } + }, + "additionalProperties": false, + "required": [ + "type", + "context_precision" + ], + "title": "ContextPrecisionScoringFn" + }, + "ContextRecallScoringFn": { + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "context_recall", + "default": "context_recall" + }, + "context_recall": { + "type": "object", + "properties": { + "aggregation_functions": { + "type": "array", + "items": { + "type": "string", + "enum": [ + "average", + "median", + "categorical_count", + "accuracy" + ], + "title": "AggregationFunctionType", + "description": "A type of aggregation function." + }, + "description": "(Optional) Aggregation functions to apply to the scores of each row. If not provided, no aggregation will be performed." + } + }, + "additionalProperties": false, + "title": "BasicScoringFnParams" + } + }, + "additionalProperties": false, + "required": [ + "type", + "context_recall" + ], + "title": "ContextRecallScoringFn" + }, + "ContextRelevancyScoringFn": { + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "context_relevancy", + "default": "context_relevancy" + }, + "context_relevancy": { + "type": "object", + "properties": { + "aggregation_functions": { + "type": "array", + "items": { + "type": "string", + "enum": [ + "average", + "median", + "categorical_count", + "accuracy" + ], + "title": "AggregationFunctionType", + "description": "A type of aggregation function." + }, + "description": "(Optional) Aggregation functions to apply to the scores of each row. If not provided, no aggregation will be performed." + } + }, + "additionalProperties": false, + "title": "BasicScoringFnParams" + } + }, + "additionalProperties": false, + "required": [ + "type", + "context_relevancy" + ], + "title": "ContextRelevancyScoringFn" + }, + "CustomLLMAsJudgeScoringFn": { + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "custom_llm_as_judge", + "default": "custom_llm_as_judge" + }, + "custom_llm_as_judge": { + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "custom_llm_as_judge", + "default": "custom_llm_as_judge" + }, + "judge_model": { + "type": "string" + }, + "prompt_template": { + "type": "string" + }, + "judge_score_regexes": { + "type": "array", + "items": { + "type": "string" + } + } + }, + "additionalProperties": false, + "required": [ + "type", + "judge_model" + ], + "title": "CustomLLMAsJudgeScoringFnParams" + } + }, + "additionalProperties": false, + "required": [ + "type", + "custom_llm_as_judge" + ], + "title": "CustomLLMAsJudgeScoringFn" + }, + "EqualityScoringFn": { + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "equality", + "default": "equality" + }, + "equality": { + "type": "object", + "properties": { + "aggregation_functions": { + "type": "array", + "items": { + "type": "string", + "enum": [ + "average", + "median", + "categorical_count", + "accuracy" + ], + "title": "AggregationFunctionType", + "description": "A type of aggregation function." + }, + "description": "(Optional) Aggregation functions to apply to the scores of each row. If not provided, no aggregation will be performed." + } + }, + "additionalProperties": false, + "title": "BasicScoringFnParams" + } + }, + "additionalProperties": false, + "required": [ + "type", + "equality" + ], + "title": "EqualityScoringFn" + }, + "FactualityScoringFn": { + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "factuality", + "default": "factuality" + }, + "factuality": { + "type": "object", + "properties": { + "aggregation_functions": { + "type": "array", + "items": { + "type": "string", + "enum": [ + "average", + "median", + "categorical_count", + "accuracy" + ], + "title": "AggregationFunctionType", + "description": "A type of aggregation function." + }, + "description": "(Optional) Aggregation functions to apply to the scores of each row. If not provided, no aggregation will be performed." + } + }, + "additionalProperties": false, + "title": "BasicScoringFnParams" + } + }, + "additionalProperties": false, + "required": [ + "type", + "factuality" + ], + "title": "FactualityScoringFn" + }, + "FaithfulnessScoringFn": { + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "faithfulness", + "default": "faithfulness" + }, + "faithfulness": { + "type": "object", + "properties": { + "aggregation_functions": { + "type": "array", + "items": { + "type": "string", + "enum": [ + "average", + "median", + "categorical_count", + "accuracy" + ], + "title": "AggregationFunctionType", + "description": "A type of aggregation function." + }, + "description": "(Optional) Aggregation functions to apply to the scores of each row. If not provided, no aggregation will be performed." + } + }, + "additionalProperties": false, + "title": "BasicScoringFnParams" + } + }, + "additionalProperties": false, + "required": [ + "type", + "faithfulness" + ], + "title": "FaithfulnessScoringFn" + }, + "RegexParserMathScoringFn": { + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "regex_parser_math_response", + "default": "regex_parser_math_response" + }, + "regex_parser_math_response": { + "type": "object", + "properties": { + "parsing_regexes": { + "type": "array", + "items": { + "type": "string" + }, + "description": "(Optional) Regexes to extract the answer from generated response." + }, + "aggregation_functions": { + "type": "array", + "items": { + "type": "string", + "enum": [ + "average", + "median", + "categorical_count", + "accuracy" + ], + "title": "AggregationFunctionType", + "description": "A type of aggregation function." + }, + "description": "(Optional) Aggregation functions to apply to the scores of each row. If not provided, no aggregation will be performed." + } + }, + "additionalProperties": false, + "required": [ + "parsing_regexes" + ], + "title": "RegexParserScoringFnParams" + } + }, + "additionalProperties": false, + "required": [ + "type", + "regex_parser_math_response" + ], + "title": "RegexParserMathScoringFn" + }, + "RegexParserScoringFn": { + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "regex_parser", + "default": "regex_parser" + }, + "regex_parser": { + "type": "object", + "properties": { + "parsing_regexes": { + "type": "array", + "items": { + "type": "string" + }, + "description": "(Optional) Regexes to extract the answer from generated response." + }, + "aggregation_functions": { + "type": "array", + "items": { + "type": "string", + "enum": [ + "average", + "median", + "categorical_count", + "accuracy" + ], + "title": "AggregationFunctionType", + "description": "A type of aggregation function." + }, + "description": "(Optional) Aggregation functions to apply to the scores of each row. If not provided, no aggregation will be performed." + } + }, + "additionalProperties": false, + "required": [ + "parsing_regexes" + ], + "title": "RegexParserScoringFnParams" + } + }, + "additionalProperties": false, + "required": [ + "type", + "regex_parser" + ], + "title": "RegexParserScoringFn" + }, "ScoringFn": { "type": "object", "properties": { @@ -7749,7 +7636,7 @@ "default": "scoring_function" }, "fn": { - "$ref": "#/components/schemas/ScoringFnParams", + "$ref": "#/components/schemas/ScoringFnDefinition", "description": "The scoring function type and parameters." }, "metadata": { @@ -7790,6 +7677,109 @@ ], "title": "ScoringFn" }, + "ScoringFnDefinition": { + "oneOf": [ + { + "$ref": "#/components/schemas/CustomLLMAsJudgeScoringFn" + }, + { + "$ref": "#/components/schemas/RegexParserScoringFn" + }, + { + "$ref": "#/components/schemas/RegexParserMathScoringFn" + }, + { + "$ref": "#/components/schemas/EqualityScoringFn" + }, + { + "$ref": "#/components/schemas/SubsetOfScoringFn" + }, + { + "$ref": "#/components/schemas/FactualityScoringFn" + }, + { + "$ref": "#/components/schemas/FaithfulnessScoringFn" + }, + { + "$ref": "#/components/schemas/AnswerCorrectnessScoringFn" + }, + { + "$ref": "#/components/schemas/AnswerRelevancyScoringFn" + }, + { + "$ref": "#/components/schemas/AnswerSimilarityScoringFn" + }, + { + "$ref": "#/components/schemas/ContextEntityRecallScoringFn" + }, + { + "$ref": "#/components/schemas/ContextPrecisionScoringFn" + }, + { + "$ref": "#/components/schemas/ContextRecallScoringFn" + }, + { + "$ref": "#/components/schemas/ContextRelevancyScoringFn" + } + ], + "discriminator": { + "propertyName": "type", + "mapping": { + "custom_llm_as_judge": "#/components/schemas/CustomLLMAsJudgeScoringFn", + "regex_parser": "#/components/schemas/RegexParserScoringFn", + "regex_parser_math_response": "#/components/schemas/RegexParserMathScoringFn", + "equality": "#/components/schemas/EqualityScoringFn", + "subset_of": "#/components/schemas/SubsetOfScoringFn", + "factuality": "#/components/schemas/FactualityScoringFn", + "faithfulness": "#/components/schemas/FaithfulnessScoringFn", + "answer_correctness": "#/components/schemas/AnswerCorrectnessScoringFn", + "answer_relevancy": "#/components/schemas/AnswerRelevancyScoringFn", + "answer_similarity": "#/components/schemas/AnswerSimilarityScoringFn", + "context_entity_recall": "#/components/schemas/ContextEntityRecallScoringFn", + "context_precision": "#/components/schemas/ContextPrecisionScoringFn", + "context_recall": "#/components/schemas/ContextRecallScoringFn", + "context_relevancy": "#/components/schemas/ContextRelevancyScoringFn" + } + } + }, + "SubsetOfScoringFn": { + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "subset_of", + "default": "subset_of" + }, + "subset_of": { + "type": "object", + "properties": { + "aggregation_functions": { + "type": "array", + "items": { + "type": "string", + "enum": [ + "average", + "median", + "categorical_count", + "accuracy" + ], + "title": "AggregationFunctionType", + "description": "A type of aggregation function." + }, + "description": "(Optional) Aggregation functions to apply to the scores of each row. If not provided, no aggregation will be performed." + } + }, + "additionalProperties": false, + "title": "BasicScoringFnParams" + } + }, + "additionalProperties": false, + "required": [ + "type", + "subset_of" + ], + "title": "SubsetOfScoringFn" + }, "Shield": { "type": "object", "properties": { @@ -9992,7 +9982,7 @@ "type": "object", "properties": { "fn": { - "$ref": "#/components/schemas/ScoringFnParams", + "$ref": "#/components/schemas/ScoringFnDefinition", "description": "The type and parameters for the scoring function." }, "scoring_fn_id": { @@ -10168,33 +10158,6 @@ ], "title": "ResumeAgentTurnRequest" }, - "RunEvalRequest": { - "type": "object", - "properties": { - "benchmark_config": { - "$ref": "#/components/schemas/BenchmarkConfig", - "description": "The configuration for the benchmark." - } - }, - "additionalProperties": false, - "required": [ - "benchmark_config" - ], - "title": "RunEvalRequest" - }, - "Job": { - "type": "object", - "properties": { - "job_id": { - "type": "string" - } - }, - "additionalProperties": false, - "required": [ - "job_id" - ], - "title": "Job" - }, "RunShieldRequest": { "type": "object", "properties": { @@ -10284,7 +10247,7 @@ "ScoreRequest": { "type": "object", "properties": { - "input_rows": { + "dataset_rows": { "type": "array", "items": { "type": "object", @@ -10313,25 +10276,18 @@ }, "description": "The rows to score." }, - "scoring_functions": { - "type": "object", - "additionalProperties": { - "oneOf": [ - { - "$ref": "#/components/schemas/ScoringFnParams" - }, - { - "type": "null" - } - ] + "scoring_fn_ids": { + "type": "array", + "items": { + "type": "string" }, - "description": "The scoring functions to use for the scoring." + "description": "The scoring function ids to use for the scoring." } }, "additionalProperties": false, "required": [ - "input_rows", - "scoring_functions" + "dataset_rows", + "scoring_fn_ids" ], "title": "ScoreRequest" }, @@ -10353,36 +10309,25 @@ "title": "ScoreResponse", "description": "The response from scoring." }, - "ScoreBatchRequest": { + "ScoreDatasetRequest": { "type": "object", "properties": { "dataset_id": { "type": "string" }, - "scoring_functions": { - "type": "object", - "additionalProperties": { - "oneOf": [ - { - "$ref": "#/components/schemas/ScoringFnParams" - }, - { - "type": "null" - } - ] + "scoring_fn_ids": { + "type": "array", + "items": { + "type": "string" } - }, - "save_results_dataset": { - "type": "boolean" } }, "additionalProperties": false, "required": [ "dataset_id", - "scoring_functions", - "save_results_dataset" + "scoring_fn_ids" ], - "title": "ScoreBatchRequest" + "title": "ScoreDatasetRequest" }, "ScoreBatchResponse": { "type": "object", diff --git a/docs/_static/llama-stack-spec.yaml b/docs/_static/llama-stack-spec.yaml index 5b99ba5aa..310b77eb1 100644 --- a/docs/_static/llama-stack-spec.yaml +++ b/docs/_static/llama-stack-spec.yaml @@ -666,7 +666,44 @@ paths: schema: $ref: '#/components/schemas/EmbeddingsRequest' required: true - /v1/eval/benchmarks/{benchmark_id}/evaluations: + /v1/eval/benchmarks/{benchmark_id}/jobs: + post: + responses: + '200': + description: >- + The job that was created to run the evaluation. + content: + application/json: + schema: + $ref: '#/components/schemas/Job' + '400': + $ref: '#/components/responses/BadRequest400' + '429': + $ref: >- + #/components/responses/TooManyRequests429 + '500': + $ref: >- + #/components/responses/InternalServerError500 + default: + $ref: '#/components/responses/DefaultError' + tags: + - Eval + description: Run an evaluation on a benchmark. + parameters: + - name: benchmark_id + in: path + description: >- + The ID of the benchmark to run the evaluation on. + required: true + schema: + type: string + requestBody: + content: + application/json: + schema: + $ref: '#/components/schemas/EvaluateBenchmarkRequest' + required: true + /v1/eval/rows: post: responses: '200': @@ -688,15 +725,8 @@ paths: $ref: '#/components/responses/DefaultError' tags: - Eval - description: Evaluate a list of rows on a benchmark. - parameters: - - name: benchmark_id - in: path - description: >- - The ID of the benchmark to run the evaluation on. - required: true - schema: - type: string + description: Evaluate a list of rows on a candidate. + parameters: [] requestBody: content: application/json: @@ -2377,43 +2407,6 @@ paths: schema: $ref: '#/components/schemas/ResumeAgentTurnRequest' required: true - /v1/eval/benchmarks/{benchmark_id}/jobs: - post: - responses: - '200': - description: >- - The job that was created to run the evaluation. - content: - application/json: - schema: - $ref: '#/components/schemas/Job' - '400': - $ref: '#/components/responses/BadRequest400' - '429': - $ref: >- - #/components/responses/TooManyRequests429 - '500': - $ref: >- - #/components/responses/InternalServerError500 - default: - $ref: '#/components/responses/DefaultError' - tags: - - Eval - description: Run an evaluation on a benchmark. - parameters: - - name: benchmark_id - in: path - description: >- - The ID of the benchmark to run the evaluation on. - required: true - schema: - type: string - requestBody: - content: - application/json: - schema: - $ref: '#/components/schemas/RunEvalRequest' - required: true /v1/safety/run-shield: post: responses: @@ -2525,7 +2518,7 @@ paths: content: application/json: schema: - $ref: '#/components/schemas/ScoreBatchRequest' + $ref: '#/components/schemas/ScoreDatasetRequest' required: true /v1/post-training/supervised-fine-tune: post: @@ -4448,311 +4441,6 @@ components: - config title: AgentCandidate description: An agent candidate for evaluation. - AnswerCorrectnessScoringFnParams: - type: object - properties: - type: - type: string - const: answer_correctness - default: answer_correctness - answer_correctness: - type: object - properties: - aggregation_functions: - type: array - items: - type: string - enum: - - average - - median - - categorical_count - - accuracy - title: AggregationFunctionType - description: A type of aggregation function. - description: >- - (Optional) Aggregation functions to apply to the scores of each row. - If not provided, no aggregation will be performed. - additionalProperties: false - title: BasicScoringFnParamsFields - additionalProperties: false - required: - - type - - answer_correctness - title: AnswerCorrectnessScoringFnParams - AnswerRelevancyScoringFnParams: - type: object - properties: - type: - type: string - const: answer_relevancy - default: answer_relevancy - answer_relevancy: - type: object - properties: - aggregation_functions: - type: array - items: - type: string - enum: - - average - - median - - categorical_count - - accuracy - title: AggregationFunctionType - description: A type of aggregation function. - description: >- - (Optional) Aggregation functions to apply to the scores of each row. - If not provided, no aggregation will be performed. - additionalProperties: false - title: BasicScoringFnParamsFields - additionalProperties: false - required: - - type - - answer_relevancy - title: AnswerRelevancyScoringFnParams - AnswerSimilarityScoringFnParams: - type: object - properties: - type: - type: string - const: answer_similarity - default: answer_similarity - answer_similarity: - type: object - properties: - aggregation_functions: - type: array - items: - type: string - enum: - - average - - median - - categorical_count - - accuracy - title: AggregationFunctionType - description: A type of aggregation function. - description: >- - (Optional) Aggregation functions to apply to the scores of each row. - If not provided, no aggregation will be performed. - additionalProperties: false - title: BasicScoringFnParamsFields - additionalProperties: false - required: - - type - - answer_similarity - title: AnswerSimilarityScoringFnParams - BenchmarkConfig: - type: object - properties: - eval_candidate: - $ref: '#/components/schemas/EvalCandidate' - description: The candidate to evaluate. - scoring_params: - type: object - additionalProperties: - $ref: '#/components/schemas/ScoringFnParams' - description: >- - Map between scoring function id and parameters for each scoring function - you want to run - num_examples: - type: integer - description: >- - (Optional) The number of examples to evaluate. If not provided, all examples - in the dataset will be evaluated - additionalProperties: false - required: - - eval_candidate - - scoring_params - title: BenchmarkConfig - description: >- - A benchmark configuration for evaluation. - ContextEntityRecallScoringFnParams: - type: object - properties: - type: - type: string - const: context_entity_recall - default: context_entity_recall - context_entity_recall: - type: object - properties: - aggregation_functions: - type: array - items: - type: string - enum: - - average - - median - - categorical_count - - accuracy - title: AggregationFunctionType - description: A type of aggregation function. - description: >- - (Optional) Aggregation functions to apply to the scores of each row. - If not provided, no aggregation will be performed. - additionalProperties: false - title: BasicScoringFnParamsFields - additionalProperties: false - required: - - type - - context_entity_recall - title: ContextEntityRecallScoringFnParams - ContextPrecisionScoringFnParams: - type: object - properties: - type: - type: string - const: context_precision - default: context_precision - context_precision: - type: object - properties: - aggregation_functions: - type: array - items: - type: string - enum: - - average - - median - - categorical_count - - accuracy - title: AggregationFunctionType - description: A type of aggregation function. - description: >- - (Optional) Aggregation functions to apply to the scores of each row. - If not provided, no aggregation will be performed. - additionalProperties: false - title: BasicScoringFnParamsFields - additionalProperties: false - required: - - type - - context_precision - title: ContextPrecisionScoringFnParams - ContextRecallScoringFnParams: - type: object - properties: - type: - type: string - const: context_recall - default: context_recall - context_recall: - type: object - properties: - aggregation_functions: - type: array - items: - type: string - enum: - - average - - median - - categorical_count - - accuracy - title: AggregationFunctionType - description: A type of aggregation function. - description: >- - (Optional) Aggregation functions to apply to the scores of each row. - If not provided, no aggregation will be performed. - additionalProperties: false - title: BasicScoringFnParamsFields - additionalProperties: false - required: - - type - - context_recall - title: ContextRecallScoringFnParams - ContextRelevancyScoringFnParams: - type: object - properties: - type: - type: string - const: context_relevancy - default: context_relevancy - context_relevancy: - type: object - properties: - aggregation_functions: - type: array - items: - type: string - enum: - - average - - median - - categorical_count - - accuracy - title: AggregationFunctionType - description: A type of aggregation function. - description: >- - (Optional) Aggregation functions to apply to the scores of each row. - If not provided, no aggregation will be performed. - additionalProperties: false - title: BasicScoringFnParamsFields - additionalProperties: false - required: - - type - - context_relevancy - title: ContextRelevancyScoringFnParams - CustomLLMAsJudgeScoringFnParams: - type: object - properties: - type: - type: string - const: custom_llm_as_judge - default: custom_llm_as_judge - custom_llm_as_judge: - type: object - properties: - type: - type: string - const: custom_llm_as_judge - default: custom_llm_as_judge - judge_model: - type: string - prompt_template: - type: string - judge_score_regexes: - type: array - items: - type: string - additionalProperties: false - required: - - type - - judge_model - title: CustomLLMAsJudgeScoringFnParamsFields - additionalProperties: false - required: - - type - - custom_llm_as_judge - title: CustomLLMAsJudgeScoringFnParams - EqualityScoringFnParams: - type: object - properties: - type: - type: string - const: equality - default: equality - equality: - type: object - properties: - aggregation_functions: - type: array - items: - type: string - enum: - - average - - median - - categorical_count - - accuracy - title: AggregationFunctionType - description: A type of aggregation function. - description: >- - (Optional) Aggregation functions to apply to the scores of each row. - If not provided, no aggregation will be performed. - additionalProperties: false - title: BasicScoringFnParamsFields - additionalProperties: false - required: - - type - - equality - title: EqualityScoringFnParams EvalCandidate: oneOf: - $ref: '#/components/schemas/ModelCandidate' @@ -4762,68 +4450,6 @@ components: mapping: model: '#/components/schemas/ModelCandidate' agent: '#/components/schemas/AgentCandidate' - FactualityScoringFnParams: - type: object - properties: - type: - type: string - const: factuality - default: factuality - factuality: - type: object - properties: - aggregation_functions: - type: array - items: - type: string - enum: - - average - - median - - categorical_count - - accuracy - title: AggregationFunctionType - description: A type of aggregation function. - description: >- - (Optional) Aggregation functions to apply to the scores of each row. - If not provided, no aggregation will be performed. - additionalProperties: false - title: BasicScoringFnParamsFields - additionalProperties: false - required: - - type - - factuality - title: FactualityScoringFnParams - FaithfulnessScoringFnParams: - type: object - properties: - type: - type: string - const: faithfulness - default: faithfulness - faithfulness: - type: object - properties: - aggregation_functions: - type: array - items: - type: string - enum: - - average - - median - - categorical_count - - accuracy - title: AggregationFunctionType - description: A type of aggregation function. - description: >- - (Optional) Aggregation functions to apply to the scores of each row. - If not provided, no aggregation will be performed. - additionalProperties: false - title: BasicScoringFnParamsFields - additionalProperties: false - required: - - type - - faithfulness - title: FaithfulnessScoringFnParams ModelCandidate: type: object properties: @@ -4849,152 +4475,29 @@ components: - sampling_params title: ModelCandidate description: A model candidate for evaluation. - RegexParserMathScoringFnParams: + EvaluateBenchmarkRequest: type: object properties: - type: - type: string - const: regex_parser_math_response - default: regex_parser_math_response - regex_parser_math_response: - type: object - properties: - parsing_regexes: - type: array - items: - type: string - description: >- - (Optional) Regexes to extract the answer from generated response. - aggregation_functions: - type: array - items: - type: string - enum: - - average - - median - - categorical_count - - accuracy - title: AggregationFunctionType - description: A type of aggregation function. - description: >- - (Optional) Aggregation functions to apply to the scores of each row. - If not provided, no aggregation will be performed. - additionalProperties: false - required: - - parsing_regexes - title: RegexParserScoringFnParamsFields + candidate: + $ref: '#/components/schemas/EvalCandidate' + description: The candidate to evaluate on. additionalProperties: false required: - - type - - regex_parser_math_response - title: RegexParserMathScoringFnParams - RegexParserScoringFnParams: + - candidate + title: EvaluateBenchmarkRequest + Job: type: object properties: - type: + job_id: type: string - const: regex_parser - default: regex_parser - regex_parser: - type: object - properties: - parsing_regexes: - type: array - items: - type: string - description: >- - (Optional) Regexes to extract the answer from generated response. - aggregation_functions: - type: array - items: - type: string - enum: - - average - - median - - categorical_count - - accuracy - title: AggregationFunctionType - description: A type of aggregation function. - description: >- - (Optional) Aggregation functions to apply to the scores of each row. - If not provided, no aggregation will be performed. - additionalProperties: false - required: - - parsing_regexes - title: RegexParserScoringFnParamsFields additionalProperties: false required: - - type - - regex_parser - title: RegexParserScoringFnParams - ScoringFnParams: - oneOf: - - $ref: '#/components/schemas/CustomLLMAsJudgeScoringFnParams' - - $ref: '#/components/schemas/RegexParserScoringFnParams' - - $ref: '#/components/schemas/RegexParserMathScoringFnParams' - - $ref: '#/components/schemas/EqualityScoringFnParams' - - $ref: '#/components/schemas/SubsetOfcoringFnParams' - - $ref: '#/components/schemas/FactualityScoringFnParams' - - $ref: '#/components/schemas/FaithfulnessScoringFnParams' - - $ref: '#/components/schemas/AnswerCorrectnessScoringFnParams' - - $ref: '#/components/schemas/AnswerRelevancyScoringFnParams' - - $ref: '#/components/schemas/AnswerSimilarityScoringFnParams' - - $ref: '#/components/schemas/ContextEntityRecallScoringFnParams' - - $ref: '#/components/schemas/ContextPrecisionScoringFnParams' - - $ref: '#/components/schemas/ContextRecallScoringFnParams' - - $ref: '#/components/schemas/ContextRelevancyScoringFnParams' - discriminator: - propertyName: type - mapping: - custom_llm_as_judge: '#/components/schemas/CustomLLMAsJudgeScoringFnParams' - regex_parser: '#/components/schemas/RegexParserScoringFnParams' - regex_parser_math_response: '#/components/schemas/RegexParserMathScoringFnParams' - equality: '#/components/schemas/EqualityScoringFnParams' - subset_of: '#/components/schemas/SubsetOfcoringFnParams' - factuality: '#/components/schemas/FactualityScoringFnParams' - faithfulness: '#/components/schemas/FaithfulnessScoringFnParams' - answer_correctness: '#/components/schemas/AnswerCorrectnessScoringFnParams' - answer_relevancy: '#/components/schemas/AnswerRelevancyScoringFnParams' - answer_similarity: '#/components/schemas/AnswerSimilarityScoringFnParams' - context_entity_recall: '#/components/schemas/ContextEntityRecallScoringFnParams' - context_precision: '#/components/schemas/ContextPrecisionScoringFnParams' - context_recall: '#/components/schemas/ContextRecallScoringFnParams' - context_relevancy: '#/components/schemas/ContextRelevancyScoringFnParams' - SubsetOfcoringFnParams: - type: object - properties: - type: - type: string - const: subset_of - default: subset_of - subset_of: - type: object - properties: - aggregation_functions: - type: array - items: - type: string - enum: - - average - - median - - categorical_count - - accuracy - title: AggregationFunctionType - description: A type of aggregation function. - description: >- - (Optional) Aggregation functions to apply to the scores of each row. - If not provided, no aggregation will be performed. - additionalProperties: false - title: BasicScoringFnParamsFields - additionalProperties: false - required: - - type - - subset_of - title: SubsetOfcoringFnParams + - job_id + title: Job EvaluateRowsRequest: type: object properties: - input_rows: + dataset_rows: type: array items: type: object @@ -5007,20 +4510,20 @@ components: - type: array - type: object description: The rows to evaluate. - scoring_functions: + scoring_fn_ids: type: array items: type: string description: >- - The scoring functions to use for the evaluation. - benchmark_config: - $ref: '#/components/schemas/BenchmarkConfig' - description: The configuration for the benchmark. + The scoring function ids to use for the evaluation. + candidate: + $ref: '#/components/schemas/EvalCandidate' + description: The candidate to evaluate on. additionalProperties: false required: - - input_rows - - scoring_functions - - benchmark_config + - dataset_rows + - scoring_fn_ids + - candidate title: EvaluateRowsRequest EvaluateResponse: type: object @@ -5393,6 +4896,426 @@ components: - total_count title: PaginatedRowsResult description: A paginated list of rows from a dataset. + AnswerCorrectnessScoringFn: + type: object + properties: + type: + type: string + const: answer_correctness + default: answer_correctness + answer_correctness: + type: object + properties: + aggregation_functions: + type: array + items: + type: string + enum: + - average + - median + - categorical_count + - accuracy + title: AggregationFunctionType + description: A type of aggregation function. + description: >- + (Optional) Aggregation functions to apply to the scores of each row. + If not provided, no aggregation will be performed. + additionalProperties: false + title: BasicScoringFnParams + additionalProperties: false + required: + - type + - answer_correctness + title: AnswerCorrectnessScoringFn + AnswerRelevancyScoringFn: + type: object + properties: + type: + type: string + const: answer_relevancy + default: answer_relevancy + answer_relevancy: + type: object + properties: + aggregation_functions: + type: array + items: + type: string + enum: + - average + - median + - categorical_count + - accuracy + title: AggregationFunctionType + description: A type of aggregation function. + description: >- + (Optional) Aggregation functions to apply to the scores of each row. + If not provided, no aggregation will be performed. + additionalProperties: false + title: BasicScoringFnParams + additionalProperties: false + required: + - type + - answer_relevancy + title: AnswerRelevancyScoringFn + AnswerSimilarityScoringFn: + type: object + properties: + type: + type: string + const: answer_similarity + default: answer_similarity + answer_similarity: + type: object + properties: + aggregation_functions: + type: array + items: + type: string + enum: + - average + - median + - categorical_count + - accuracy + title: AggregationFunctionType + description: A type of aggregation function. + description: >- + (Optional) Aggregation functions to apply to the scores of each row. + If not provided, no aggregation will be performed. + additionalProperties: false + title: BasicScoringFnParams + additionalProperties: false + required: + - type + - answer_similarity + title: AnswerSimilarityScoringFn + ContextEntityRecallScoringFn: + type: object + properties: + type: + type: string + const: context_entity_recall + default: context_entity_recall + context_entity_recall: + type: object + properties: + aggregation_functions: + type: array + items: + type: string + enum: + - average + - median + - categorical_count + - accuracy + title: AggregationFunctionType + description: A type of aggregation function. + description: >- + (Optional) Aggregation functions to apply to the scores of each row. + If not provided, no aggregation will be performed. + additionalProperties: false + title: BasicScoringFnParams + additionalProperties: false + required: + - type + - context_entity_recall + title: ContextEntityRecallScoringFn + ContextPrecisionScoringFn: + type: object + properties: + type: + type: string + const: context_precision + default: context_precision + context_precision: + type: object + properties: + aggregation_functions: + type: array + items: + type: string + enum: + - average + - median + - categorical_count + - accuracy + title: AggregationFunctionType + description: A type of aggregation function. + description: >- + (Optional) Aggregation functions to apply to the scores of each row. + If not provided, no aggregation will be performed. + additionalProperties: false + title: BasicScoringFnParams + additionalProperties: false + required: + - type + - context_precision + title: ContextPrecisionScoringFn + ContextRecallScoringFn: + type: object + properties: + type: + type: string + const: context_recall + default: context_recall + context_recall: + type: object + properties: + aggregation_functions: + type: array + items: + type: string + enum: + - average + - median + - categorical_count + - accuracy + title: AggregationFunctionType + description: A type of aggregation function. + description: >- + (Optional) Aggregation functions to apply to the scores of each row. + If not provided, no aggregation will be performed. + additionalProperties: false + title: BasicScoringFnParams + additionalProperties: false + required: + - type + - context_recall + title: ContextRecallScoringFn + ContextRelevancyScoringFn: + type: object + properties: + type: + type: string + const: context_relevancy + default: context_relevancy + context_relevancy: + type: object + properties: + aggregation_functions: + type: array + items: + type: string + enum: + - average + - median + - categorical_count + - accuracy + title: AggregationFunctionType + description: A type of aggregation function. + description: >- + (Optional) Aggregation functions to apply to the scores of each row. + If not provided, no aggregation will be performed. + additionalProperties: false + title: BasicScoringFnParams + additionalProperties: false + required: + - type + - context_relevancy + title: ContextRelevancyScoringFn + CustomLLMAsJudgeScoringFn: + type: object + properties: + type: + type: string + const: custom_llm_as_judge + default: custom_llm_as_judge + custom_llm_as_judge: + type: object + properties: + type: + type: string + const: custom_llm_as_judge + default: custom_llm_as_judge + judge_model: + type: string + prompt_template: + type: string + judge_score_regexes: + type: array + items: + type: string + additionalProperties: false + required: + - type + - judge_model + title: CustomLLMAsJudgeScoringFnParams + additionalProperties: false + required: + - type + - custom_llm_as_judge + title: CustomLLMAsJudgeScoringFn + EqualityScoringFn: + type: object + properties: + type: + type: string + const: equality + default: equality + equality: + type: object + properties: + aggregation_functions: + type: array + items: + type: string + enum: + - average + - median + - categorical_count + - accuracy + title: AggregationFunctionType + description: A type of aggregation function. + description: >- + (Optional) Aggregation functions to apply to the scores of each row. + If not provided, no aggregation will be performed. + additionalProperties: false + title: BasicScoringFnParams + additionalProperties: false + required: + - type + - equality + title: EqualityScoringFn + FactualityScoringFn: + type: object + properties: + type: + type: string + const: factuality + default: factuality + factuality: + type: object + properties: + aggregation_functions: + type: array + items: + type: string + enum: + - average + - median + - categorical_count + - accuracy + title: AggregationFunctionType + description: A type of aggregation function. + description: >- + (Optional) Aggregation functions to apply to the scores of each row. + If not provided, no aggregation will be performed. + additionalProperties: false + title: BasicScoringFnParams + additionalProperties: false + required: + - type + - factuality + title: FactualityScoringFn + FaithfulnessScoringFn: + type: object + properties: + type: + type: string + const: faithfulness + default: faithfulness + faithfulness: + type: object + properties: + aggregation_functions: + type: array + items: + type: string + enum: + - average + - median + - categorical_count + - accuracy + title: AggregationFunctionType + description: A type of aggregation function. + description: >- + (Optional) Aggregation functions to apply to the scores of each row. + If not provided, no aggregation will be performed. + additionalProperties: false + title: BasicScoringFnParams + additionalProperties: false + required: + - type + - faithfulness + title: FaithfulnessScoringFn + RegexParserMathScoringFn: + type: object + properties: + type: + type: string + const: regex_parser_math_response + default: regex_parser_math_response + regex_parser_math_response: + type: object + properties: + parsing_regexes: + type: array + items: + type: string + description: >- + (Optional) Regexes to extract the answer from generated response. + aggregation_functions: + type: array + items: + type: string + enum: + - average + - median + - categorical_count + - accuracy + title: AggregationFunctionType + description: A type of aggregation function. + description: >- + (Optional) Aggregation functions to apply to the scores of each row. + If not provided, no aggregation will be performed. + additionalProperties: false + required: + - parsing_regexes + title: RegexParserScoringFnParams + additionalProperties: false + required: + - type + - regex_parser_math_response + title: RegexParserMathScoringFn + RegexParserScoringFn: + type: object + properties: + type: + type: string + const: regex_parser + default: regex_parser + regex_parser: + type: object + properties: + parsing_regexes: + type: array + items: + type: string + description: >- + (Optional) Regexes to extract the answer from generated response. + aggregation_functions: + type: array + items: + type: string + enum: + - average + - median + - categorical_count + - accuracy + title: AggregationFunctionType + description: A type of aggregation function. + description: >- + (Optional) Aggregation functions to apply to the scores of each row. + If not provided, no aggregation will be performed. + additionalProperties: false + required: + - parsing_regexes + title: RegexParserScoringFnParams + additionalProperties: false + required: + - type + - regex_parser + title: RegexParserScoringFn ScoringFn: type: object properties: @@ -5407,7 +5330,7 @@ components: const: scoring_function default: scoring_function fn: - $ref: '#/components/schemas/ScoringFnParams' + $ref: '#/components/schemas/ScoringFnDefinition' description: >- The scoring function type and parameters. metadata: @@ -5431,6 +5354,70 @@ components: - fn - metadata title: ScoringFn + ScoringFnDefinition: + oneOf: + - $ref: '#/components/schemas/CustomLLMAsJudgeScoringFn' + - $ref: '#/components/schemas/RegexParserScoringFn' + - $ref: '#/components/schemas/RegexParserMathScoringFn' + - $ref: '#/components/schemas/EqualityScoringFn' + - $ref: '#/components/schemas/SubsetOfScoringFn' + - $ref: '#/components/schemas/FactualityScoringFn' + - $ref: '#/components/schemas/FaithfulnessScoringFn' + - $ref: '#/components/schemas/AnswerCorrectnessScoringFn' + - $ref: '#/components/schemas/AnswerRelevancyScoringFn' + - $ref: '#/components/schemas/AnswerSimilarityScoringFn' + - $ref: '#/components/schemas/ContextEntityRecallScoringFn' + - $ref: '#/components/schemas/ContextPrecisionScoringFn' + - $ref: '#/components/schemas/ContextRecallScoringFn' + - $ref: '#/components/schemas/ContextRelevancyScoringFn' + discriminator: + propertyName: type + mapping: + custom_llm_as_judge: '#/components/schemas/CustomLLMAsJudgeScoringFn' + regex_parser: '#/components/schemas/RegexParserScoringFn' + regex_parser_math_response: '#/components/schemas/RegexParserMathScoringFn' + equality: '#/components/schemas/EqualityScoringFn' + subset_of: '#/components/schemas/SubsetOfScoringFn' + factuality: '#/components/schemas/FactualityScoringFn' + faithfulness: '#/components/schemas/FaithfulnessScoringFn' + answer_correctness: '#/components/schemas/AnswerCorrectnessScoringFn' + answer_relevancy: '#/components/schemas/AnswerRelevancyScoringFn' + answer_similarity: '#/components/schemas/AnswerSimilarityScoringFn' + context_entity_recall: '#/components/schemas/ContextEntityRecallScoringFn' + context_precision: '#/components/schemas/ContextPrecisionScoringFn' + context_recall: '#/components/schemas/ContextRecallScoringFn' + context_relevancy: '#/components/schemas/ContextRelevancyScoringFn' + SubsetOfScoringFn: + type: object + properties: + type: + type: string + const: subset_of + default: subset_of + subset_of: + type: object + properties: + aggregation_functions: + type: array + items: + type: string + enum: + - average + - median + - categorical_count + - accuracy + title: AggregationFunctionType + description: A type of aggregation function. + description: >- + (Optional) Aggregation functions to apply to the scores of each row. + If not provided, no aggregation will be performed. + additionalProperties: false + title: BasicScoringFnParams + additionalProperties: false + required: + - type + - subset_of + title: SubsetOfScoringFn Shield: type: object properties: @@ -6853,7 +6840,7 @@ components: type: object properties: fn: - $ref: '#/components/schemas/ScoringFnParams' + $ref: '#/components/schemas/ScoringFnDefinition' description: >- The type and parameters for the scoring function. scoring_fn_id: @@ -6959,25 +6946,6 @@ components: required: - tool_responses title: ResumeAgentTurnRequest - RunEvalRequest: - type: object - properties: - benchmark_config: - $ref: '#/components/schemas/BenchmarkConfig' - description: The configuration for the benchmark. - additionalProperties: false - required: - - benchmark_config - title: RunEvalRequest - Job: - type: object - properties: - job_id: - type: string - additionalProperties: false - required: - - job_id - title: Job RunShieldRequest: type: object properties: @@ -7034,7 +7002,7 @@ components: ScoreRequest: type: object properties: - input_rows: + dataset_rows: type: array items: type: object @@ -7047,18 +7015,16 @@ components: - type: array - type: object description: The rows to score. - scoring_functions: - type: object - additionalProperties: - oneOf: - - $ref: '#/components/schemas/ScoringFnParams' - - type: 'null' + scoring_fn_ids: + type: array + items: + type: string description: >- - The scoring functions to use for the scoring. + The scoring function ids to use for the scoring. additionalProperties: false required: - - input_rows - - scoring_functions + - dataset_rows + - scoring_fn_ids title: ScoreRequest ScoreResponse: type: object @@ -7074,25 +7040,20 @@ components: - results title: ScoreResponse description: The response from scoring. - ScoreBatchRequest: + ScoreDatasetRequest: type: object properties: dataset_id: type: string - scoring_functions: - type: object - additionalProperties: - oneOf: - - $ref: '#/components/schemas/ScoringFnParams' - - type: 'null' - save_results_dataset: - type: boolean + scoring_fn_ids: + type: array + items: + type: string additionalProperties: false required: - dataset_id - - scoring_functions - - save_results_dataset - title: ScoreBatchRequest + - scoring_fn_ids + title: ScoreDatasetRequest ScoreBatchResponse: type: object properties: diff --git a/llama_stack/apis/benchmarks/benchmarks.py b/llama_stack/apis/benchmarks/benchmarks.py index 01fc873e6..45edd3d6b 100644 --- a/llama_stack/apis/benchmarks/benchmarks.py +++ b/llama_stack/apis/benchmarks/benchmarks.py @@ -8,7 +8,6 @@ from typing import Any, Dict, List, Literal, Optional, Protocol, runtime_checkab from pydantic import BaseModel, Field from llama_stack.apis.resource import Resource, ResourceType -from llama_stack.apis.scoring_functions import ScoringFnParams from llama_stack.schema_utils import json_schema_type, webmethod diff --git a/llama_stack/apis/eval/eval.py b/llama_stack/apis/eval/eval.py index dec018d83..552afe0a2 100644 --- a/llama_stack/apis/eval/eval.py +++ b/llama_stack/apis/eval/eval.py @@ -13,7 +13,6 @@ from llama_stack.apis.agents import AgentConfig from llama_stack.apis.common.job_types import Job, JobStatus from llama_stack.apis.inference import SamplingParams, SystemMessage from llama_stack.apis.scoring import ScoringResult -from llama_stack.apis.scoring_functions import ScoringFnParams from llama_stack.schema_utils import json_schema_type, register_schema, webmethod @@ -49,27 +48,6 @@ EvalCandidate = register_schema( ) -@json_schema_type -class BenchmarkConfig(BaseModel): - """A benchmark configuration for evaluation. - - :param eval_candidate: The candidate to evaluate. - :param scoring_params: Map between scoring function id and parameters for each scoring function you want to run - :param num_examples: (Optional) The number of examples to evaluate. If not provided, all examples in the dataset will be evaluated - """ - - eval_candidate: EvalCandidate - scoring_params: Dict[str, ScoringFnParams] = Field( - description="Map between scoring function id and parameters for each scoring function you want to run", - default_factory=dict, - ) - num_examples: Optional[int] = Field( - description="Number of examples to evaluate (useful for testing), if not provided, all examples in the dataset will be evaluated", - default=None, - ) - # we could optinally add any specific dataset config here - - @json_schema_type class EvaluateResponse(BaseModel): """The response from an evaluation. @@ -87,32 +65,30 @@ class Eval(Protocol): """Llama Stack Evaluation API for running evaluations on model and agent candidates.""" @webmethod(route="/eval/benchmarks/{benchmark_id}/jobs", method="POST") - async def run_eval( + async def evaluate_benchmark( self, benchmark_id: str, - benchmark_config: BenchmarkConfig, + candidate: EvalCandidate, ) -> Job: """Run an evaluation on a benchmark. :param benchmark_id: The ID of the benchmark to run the evaluation on. - :param benchmark_config: The configuration for the benchmark. + :param candidate: The candidate to evaluate on. :return: The job that was created to run the evaluation. """ - @webmethod(route="/eval/benchmarks/{benchmark_id}/evaluations", method="POST") + @webmethod(route="/eval/rows", method="POST") async def evaluate_rows( self, - benchmark_id: str, - input_rows: List[Dict[str, Any]], - scoring_functions: List[str], - benchmark_config: BenchmarkConfig, + dataset_rows: List[Dict[str, Any]], + scoring_fn_ids: List[str], + candidate: EvalCandidate, ) -> EvaluateResponse: - """Evaluate a list of rows on a benchmark. - - :param benchmark_id: The ID of the benchmark to run the evaluation on. - :param input_rows: The rows to evaluate. - :param scoring_functions: The scoring functions to use for the evaluation. - :param benchmark_config: The configuration for the benchmark. + """Evaluate a list of rows on a candidate. + + :param dataset_rows: The rows to evaluate. + :param scoring_fn_ids: The scoring function ids to use for the evaluation. + :param candidate: The candidate to evaluate on. :return: EvaluateResponse object containing generations and scores """ diff --git a/llama_stack/apis/scoring/scoring.py b/llama_stack/apis/scoring/scoring.py index 54a9ac2aa..eecca7799 100644 --- a/llama_stack/apis/scoring/scoring.py +++ b/llama_stack/apis/scoring/scoring.py @@ -8,7 +8,7 @@ from typing import Any, Dict, List, Optional, Protocol, runtime_checkable from pydantic import BaseModel -from llama_stack.apis.scoring_functions import ScoringFn, ScoringFnParams +from llama_stack.apis.scoring_functions import ScoringFn from llama_stack.schema_utils import json_schema_type, webmethod # mapping of metric to value @@ -56,23 +56,22 @@ class Scoring(Protocol): scoring_function_store: ScoringFunctionStore @webmethod(route="/scoring/score-batch", method="POST") - async def score_batch( + async def score_dataset( self, dataset_id: str, - scoring_functions: Dict[str, Optional[ScoringFnParams]], - save_results_dataset: bool = False, + scoring_fn_ids: List[str], ) -> ScoreBatchResponse: ... @webmethod(route="/scoring/score", method="POST") async def score( self, - input_rows: List[Dict[str, Any]], - scoring_functions: Dict[str, Optional[ScoringFnParams]], + dataset_rows: List[Dict[str, Any]], + scoring_fn_ids: List[str], ) -> ScoreResponse: """Score a list of rows. - :param input_rows: The rows to score. - :param scoring_functions: The scoring functions to use for the scoring. + :param dataset_rows: The rows to score. + :param scoring_fn_ids: The scoring function ids to use for the scoring. :return: ScoreResponse object containing rows and aggregated results """ ... diff --git a/llama_stack/apis/scoring_functions/scoring_functions.py b/llama_stack/apis/scoring_functions/scoring_functions.py index d6ee4f975..0e7ec4354 100644 --- a/llama_stack/apis/scoring_functions/scoring_functions.py +++ b/llama_stack/apis/scoring_functions/scoring_functions.py @@ -67,7 +67,7 @@ class AggregationFunctionType(Enum): accuracy = "accuracy" -class BasicScoringFnParamsFields(BaseModel): +class BasicScoringFnParams(BaseModel): """ :param aggregation_functions: (Optional) Aggregation functions to apply to the scores of each row. If not provided, no aggregation will be performed. """ @@ -78,7 +78,7 @@ class BasicScoringFnParamsFields(BaseModel): ) -class RegexParserScoringFnParamsFields(BaseModel): +class RegexParserScoringFnParams(BaseModel): """ :param parsing_regexes: (Optional) Regexes to extract the answer from generated response. :param aggregation_functions: (Optional) Aggregation functions to apply to the scores of each row. If not provided, no aggregation will be performed. @@ -93,7 +93,7 @@ class RegexParserScoringFnParamsFields(BaseModel): default_factory=list, ) -class CustomLLMAsJudgeScoringFnParamsFields(BaseModel): +class CustomLLMAsJudgeScoringFnParams(BaseModel): type: Literal["custom_llm_as_judge"] = "custom_llm_as_judge" judge_model: str prompt_template: Optional[str] = None @@ -103,103 +103,103 @@ class CustomLLMAsJudgeScoringFnParamsFields(BaseModel): ) @json_schema_type -class RegexParserScoringFnParams(BaseModel): +class RegexParserScoringFn(BaseModel): type: Literal["regex_parser"] = "regex_parser" - regex_parser: RegexParserScoringFnParamsFields + regex_parser: RegexParserScoringFnParams @json_schema_type -class RegexParserMathScoringFnParams(BaseModel): +class RegexParserMathScoringFn(BaseModel): type: Literal["regex_parser_math_response"] = "regex_parser_math_response" - regex_parser_math_response: RegexParserScoringFnParamsFields + regex_parser_math_response: RegexParserScoringFnParams @json_schema_type -class EqualityScoringFnParams(BaseModel): +class EqualityScoringFn(BaseModel): type: Literal["equality"] = "equality" - equality: BasicScoringFnParamsFields + equality: BasicScoringFnParams @json_schema_type -class SubsetOfcoringFnParams(BaseModel): +class SubsetOfScoringFn(BaseModel): type: Literal["subset_of"] = "subset_of" - subset_of: BasicScoringFnParamsFields + subset_of: BasicScoringFnParams @json_schema_type -class FactualityScoringFnParams(BaseModel): +class FactualityScoringFn(BaseModel): type: Literal["factuality"] = "factuality" - factuality: BasicScoringFnParamsFields + factuality: BasicScoringFnParams @json_schema_type -class FaithfulnessScoringFnParams(BaseModel): +class FaithfulnessScoringFn(BaseModel): type: Literal["faithfulness"] = "faithfulness" - faithfulness: BasicScoringFnParamsFields + faithfulness: BasicScoringFnParams @json_schema_type -class AnswerCorrectnessScoringFnParams(BaseModel): +class AnswerCorrectnessScoringFn(BaseModel): type: Literal["answer_correctness"] = "answer_correctness" - answer_correctness: BasicScoringFnParamsFields + answer_correctness: BasicScoringFnParams @json_schema_type -class AnswerRelevancyScoringFnParams(BaseModel): +class AnswerRelevancyScoringFn(BaseModel): type: Literal["answer_relevancy"] = "answer_relevancy" - answer_relevancy: BasicScoringFnParamsFields + answer_relevancy: BasicScoringFnParams @json_schema_type -class AnswerSimilarityScoringFnParams(BaseModel): +class AnswerSimilarityScoringFn(BaseModel): type: Literal["answer_similarity"] = "answer_similarity" - answer_similarity: BasicScoringFnParamsFields + answer_similarity: BasicScoringFnParams @json_schema_type -class ContextEntityRecallScoringFnParams(BaseModel): +class ContextEntityRecallScoringFn(BaseModel): type: Literal["context_entity_recall"] = "context_entity_recall" - context_entity_recall: BasicScoringFnParamsFields + context_entity_recall: BasicScoringFnParams @json_schema_type -class ContextPrecisionScoringFnParams(BaseModel): +class ContextPrecisionScoringFn(BaseModel): type: Literal["context_precision"] = "context_precision" - context_precision: BasicScoringFnParamsFields + context_precision: BasicScoringFnParams @json_schema_type -class ContextRecallScoringFnParams(BaseModel): +class ContextRecallScoringFn(BaseModel): type: Literal["context_recall"] = "context_recall" - context_recall: BasicScoringFnParamsFields + context_recall: BasicScoringFnParams @json_schema_type -class ContextRelevancyScoringFnParams(BaseModel): +class ContextRelevancyScoringFn(BaseModel): type: Literal["context_relevancy"] = "context_relevancy" - context_relevancy: BasicScoringFnParamsFields + context_relevancy: BasicScoringFnParams @json_schema_type -class CustomLLMAsJudgeScoringFnParams(BaseModel): +class CustomLLMAsJudgeScoringFn(BaseModel): type: Literal["custom_llm_as_judge"] = "custom_llm_as_judge" - custom_llm_as_judge: CustomLLMAsJudgeScoringFnParamsFields + custom_llm_as_judge: CustomLLMAsJudgeScoringFnParams -ScoringFnParams = register_schema( +ScoringFnDefinition = register_schema( Annotated[ Union[ - CustomLLMAsJudgeScoringFnParams, - RegexParserScoringFnParams, - RegexParserMathScoringFnParams, - EqualityScoringFnParams, - SubsetOfcoringFnParams, - FactualityScoringFnParams, - FaithfulnessScoringFnParams, - AnswerCorrectnessScoringFnParams, - AnswerRelevancyScoringFnParams, - AnswerSimilarityScoringFnParams, - ContextEntityRecallScoringFnParams, - ContextPrecisionScoringFnParams, - ContextRecallScoringFnParams, - ContextRelevancyScoringFnParams, + CustomLLMAsJudgeScoringFn, + RegexParserScoringFn, + RegexParserMathScoringFn, + EqualityScoringFn, + SubsetOfScoringFn, + FactualityScoringFn, + FaithfulnessScoringFn, + AnswerCorrectnessScoringFn, + AnswerRelevancyScoringFn, + AnswerSimilarityScoringFn, + ContextEntityRecallScoringFn, + ContextPrecisionScoringFn, + ContextRecallScoringFn, + ContextRelevancyScoringFn, ], Field(discriminator="type"), ], - name="ScoringFnParams", + name="ScoringFnDefinition", ) @@ -208,7 +208,7 @@ class CommonScoringFnFields(BaseModel): :param fn: The scoring function type and parameters. :param metadata: (Optional) Any additional metadata for this definition (e.g. description). """ - fn: ScoringFnParams + fn: ScoringFnDefinition metadata: Dict[str, Any] = Field( default_factory=dict, description="Any additional metadata for this definition (e.g. description)", @@ -288,7 +288,7 @@ class ScoringFunctions(Protocol): @webmethod(route="/scoring-functions", method="POST") async def register_scoring_function( self, - fn: ScoringFnParams, + fn: ScoringFnDefinition, scoring_fn_id: Optional[str] = None, metadata: Optional[Dict[str, Any]] = None, ) -> ScoringFn: