diff --git a/docs/_static/llama-stack-spec.html b/docs/_static/llama-stack-spec.html
index d2a745655..493eeebc4 100644
--- a/docs/_static/llama-stack-spec.html
+++ b/docs/_static/llama-stack-spec.html
@@ -968,7 +968,60 @@
}
}
},
- "/v1/eval/benchmarks/{benchmark_id}/evaluations": {
+ "/v1/eval/benchmarks/{benchmark_id}/jobs": {
+ "post": {
+ "responses": {
+ "200": {
+ "description": "The job that was created to run the evaluation.",
+ "content": {
+ "application/json": {
+ "schema": {
+ "$ref": "#/components/schemas/Job"
+ }
+ }
+ }
+ },
+ "400": {
+ "$ref": "#/components/responses/BadRequest400"
+ },
+ "429": {
+ "$ref": "#/components/responses/TooManyRequests429"
+ },
+ "500": {
+ "$ref": "#/components/responses/InternalServerError500"
+ },
+ "default": {
+ "$ref": "#/components/responses/DefaultError"
+ }
+ },
+ "tags": [
+ "Eval"
+ ],
+ "description": "Run an evaluation on a benchmark.",
+ "parameters": [
+ {
+ "name": "benchmark_id",
+ "in": "path",
+ "description": "The ID of the benchmark to run the evaluation on.",
+ "required": true,
+ "schema": {
+ "type": "string"
+ }
+ }
+ ],
+ "requestBody": {
+ "content": {
+ "application/json": {
+ "schema": {
+ "$ref": "#/components/schemas/EvaluateBenchmarkRequest"
+ }
+ }
+ },
+ "required": true
+ }
+ }
+ },
+ "/v1/eval/rows": {
"post": {
"responses": {
"200": {
@@ -997,18 +1050,8 @@
"tags": [
"Eval"
],
- "description": "Evaluate a list of rows on a benchmark.",
- "parameters": [
- {
- "name": "benchmark_id",
- "in": "path",
- "description": "The ID of the benchmark to run the evaluation on.",
- "required": true,
- "schema": {
- "type": "string"
- }
- }
- ],
+ "description": "Evaluate a list of rows on a candidate.",
+ "parameters": [],
"requestBody": {
"content": {
"application/json": {
@@ -3498,59 +3541,6 @@
}
}
},
- "/v1/eval/benchmarks/{benchmark_id}/jobs": {
- "post": {
- "responses": {
- "200": {
- "description": "The job that was created to run the evaluation.",
- "content": {
- "application/json": {
- "schema": {
- "$ref": "#/components/schemas/Job"
- }
- }
- }
- },
- "400": {
- "$ref": "#/components/responses/BadRequest400"
- },
- "429": {
- "$ref": "#/components/responses/TooManyRequests429"
- },
- "500": {
- "$ref": "#/components/responses/InternalServerError500"
- },
- "default": {
- "$ref": "#/components/responses/DefaultError"
- }
- },
- "tags": [
- "Eval"
- ],
- "description": "Run an evaluation on a benchmark.",
- "parameters": [
- {
- "name": "benchmark_id",
- "in": "path",
- "description": "The ID of the benchmark to run the evaluation on.",
- "required": true,
- "schema": {
- "type": "string"
- }
- }
- ],
- "requestBody": {
- "content": {
- "application/json": {
- "schema": {
- "$ref": "#/components/schemas/RunEvalRequest"
- }
- }
- },
- "required": true
- }
- }
- },
"/v1/safety/run-shield": {
"post": {
"responses": {
@@ -3708,7 +3698,7 @@
"content": {
"application/json": {
"schema": {
- "$ref": "#/components/schemas/ScoreBatchRequest"
+ "$ref": "#/components/schemas/ScoreDatasetRequest"
}
}
},
@@ -6385,381 +6375,6 @@
"title": "AgentCandidate",
"description": "An agent candidate for evaluation."
},
- "AnswerCorrectnessScoringFnParams": {
- "type": "object",
- "properties": {
- "type": {
- "type": "string",
- "const": "answer_correctness",
- "default": "answer_correctness"
- },
- "answer_correctness": {
- "type": "object",
- "properties": {
- "aggregation_functions": {
- "type": "array",
- "items": {
- "type": "string",
- "enum": [
- "average",
- "median",
- "categorical_count",
- "accuracy"
- ],
- "title": "AggregationFunctionType",
- "description": "A type of aggregation function."
- },
- "description": "(Optional) Aggregation functions to apply to the scores of each row. If not provided, no aggregation will be performed."
- }
- },
- "additionalProperties": false,
- "title": "BasicScoringFnParamsFields"
- }
- },
- "additionalProperties": false,
- "required": [
- "type",
- "answer_correctness"
- ],
- "title": "AnswerCorrectnessScoringFnParams"
- },
- "AnswerRelevancyScoringFnParams": {
- "type": "object",
- "properties": {
- "type": {
- "type": "string",
- "const": "answer_relevancy",
- "default": "answer_relevancy"
- },
- "answer_relevancy": {
- "type": "object",
- "properties": {
- "aggregation_functions": {
- "type": "array",
- "items": {
- "type": "string",
- "enum": [
- "average",
- "median",
- "categorical_count",
- "accuracy"
- ],
- "title": "AggregationFunctionType",
- "description": "A type of aggregation function."
- },
- "description": "(Optional) Aggregation functions to apply to the scores of each row. If not provided, no aggregation will be performed."
- }
- },
- "additionalProperties": false,
- "title": "BasicScoringFnParamsFields"
- }
- },
- "additionalProperties": false,
- "required": [
- "type",
- "answer_relevancy"
- ],
- "title": "AnswerRelevancyScoringFnParams"
- },
- "AnswerSimilarityScoringFnParams": {
- "type": "object",
- "properties": {
- "type": {
- "type": "string",
- "const": "answer_similarity",
- "default": "answer_similarity"
- },
- "answer_similarity": {
- "type": "object",
- "properties": {
- "aggregation_functions": {
- "type": "array",
- "items": {
- "type": "string",
- "enum": [
- "average",
- "median",
- "categorical_count",
- "accuracy"
- ],
- "title": "AggregationFunctionType",
- "description": "A type of aggregation function."
- },
- "description": "(Optional) Aggregation functions to apply to the scores of each row. If not provided, no aggregation will be performed."
- }
- },
- "additionalProperties": false,
- "title": "BasicScoringFnParamsFields"
- }
- },
- "additionalProperties": false,
- "required": [
- "type",
- "answer_similarity"
- ],
- "title": "AnswerSimilarityScoringFnParams"
- },
- "BenchmarkConfig": {
- "type": "object",
- "properties": {
- "eval_candidate": {
- "$ref": "#/components/schemas/EvalCandidate",
- "description": "The candidate to evaluate."
- },
- "scoring_params": {
- "type": "object",
- "additionalProperties": {
- "$ref": "#/components/schemas/ScoringFnParams"
- },
- "description": "Map between scoring function id and parameters for each scoring function you want to run"
- },
- "num_examples": {
- "type": "integer",
- "description": "(Optional) The number of examples to evaluate. If not provided, all examples in the dataset will be evaluated"
- }
- },
- "additionalProperties": false,
- "required": [
- "eval_candidate",
- "scoring_params"
- ],
- "title": "BenchmarkConfig",
- "description": "A benchmark configuration for evaluation."
- },
- "ContextEntityRecallScoringFnParams": {
- "type": "object",
- "properties": {
- "type": {
- "type": "string",
- "const": "context_entity_recall",
- "default": "context_entity_recall"
- },
- "context_entity_recall": {
- "type": "object",
- "properties": {
- "aggregation_functions": {
- "type": "array",
- "items": {
- "type": "string",
- "enum": [
- "average",
- "median",
- "categorical_count",
- "accuracy"
- ],
- "title": "AggregationFunctionType",
- "description": "A type of aggregation function."
- },
- "description": "(Optional) Aggregation functions to apply to the scores of each row. If not provided, no aggregation will be performed."
- }
- },
- "additionalProperties": false,
- "title": "BasicScoringFnParamsFields"
- }
- },
- "additionalProperties": false,
- "required": [
- "type",
- "context_entity_recall"
- ],
- "title": "ContextEntityRecallScoringFnParams"
- },
- "ContextPrecisionScoringFnParams": {
- "type": "object",
- "properties": {
- "type": {
- "type": "string",
- "const": "context_precision",
- "default": "context_precision"
- },
- "context_precision": {
- "type": "object",
- "properties": {
- "aggregation_functions": {
- "type": "array",
- "items": {
- "type": "string",
- "enum": [
- "average",
- "median",
- "categorical_count",
- "accuracy"
- ],
- "title": "AggregationFunctionType",
- "description": "A type of aggregation function."
- },
- "description": "(Optional) Aggregation functions to apply to the scores of each row. If not provided, no aggregation will be performed."
- }
- },
- "additionalProperties": false,
- "title": "BasicScoringFnParamsFields"
- }
- },
- "additionalProperties": false,
- "required": [
- "type",
- "context_precision"
- ],
- "title": "ContextPrecisionScoringFnParams"
- },
- "ContextRecallScoringFnParams": {
- "type": "object",
- "properties": {
- "type": {
- "type": "string",
- "const": "context_recall",
- "default": "context_recall"
- },
- "context_recall": {
- "type": "object",
- "properties": {
- "aggregation_functions": {
- "type": "array",
- "items": {
- "type": "string",
- "enum": [
- "average",
- "median",
- "categorical_count",
- "accuracy"
- ],
- "title": "AggregationFunctionType",
- "description": "A type of aggregation function."
- },
- "description": "(Optional) Aggregation functions to apply to the scores of each row. If not provided, no aggregation will be performed."
- }
- },
- "additionalProperties": false,
- "title": "BasicScoringFnParamsFields"
- }
- },
- "additionalProperties": false,
- "required": [
- "type",
- "context_recall"
- ],
- "title": "ContextRecallScoringFnParams"
- },
- "ContextRelevancyScoringFnParams": {
- "type": "object",
- "properties": {
- "type": {
- "type": "string",
- "const": "context_relevancy",
- "default": "context_relevancy"
- },
- "context_relevancy": {
- "type": "object",
- "properties": {
- "aggregation_functions": {
- "type": "array",
- "items": {
- "type": "string",
- "enum": [
- "average",
- "median",
- "categorical_count",
- "accuracy"
- ],
- "title": "AggregationFunctionType",
- "description": "A type of aggregation function."
- },
- "description": "(Optional) Aggregation functions to apply to the scores of each row. If not provided, no aggregation will be performed."
- }
- },
- "additionalProperties": false,
- "title": "BasicScoringFnParamsFields"
- }
- },
- "additionalProperties": false,
- "required": [
- "type",
- "context_relevancy"
- ],
- "title": "ContextRelevancyScoringFnParams"
- },
- "CustomLLMAsJudgeScoringFnParams": {
- "type": "object",
- "properties": {
- "type": {
- "type": "string",
- "const": "custom_llm_as_judge",
- "default": "custom_llm_as_judge"
- },
- "custom_llm_as_judge": {
- "type": "object",
- "properties": {
- "type": {
- "type": "string",
- "const": "custom_llm_as_judge",
- "default": "custom_llm_as_judge"
- },
- "judge_model": {
- "type": "string"
- },
- "prompt_template": {
- "type": "string"
- },
- "judge_score_regexes": {
- "type": "array",
- "items": {
- "type": "string"
- }
- }
- },
- "additionalProperties": false,
- "required": [
- "type",
- "judge_model"
- ],
- "title": "CustomLLMAsJudgeScoringFnParamsFields"
- }
- },
- "additionalProperties": false,
- "required": [
- "type",
- "custom_llm_as_judge"
- ],
- "title": "CustomLLMAsJudgeScoringFnParams"
- },
- "EqualityScoringFnParams": {
- "type": "object",
- "properties": {
- "type": {
- "type": "string",
- "const": "equality",
- "default": "equality"
- },
- "equality": {
- "type": "object",
- "properties": {
- "aggregation_functions": {
- "type": "array",
- "items": {
- "type": "string",
- "enum": [
- "average",
- "median",
- "categorical_count",
- "accuracy"
- ],
- "title": "AggregationFunctionType",
- "description": "A type of aggregation function."
- },
- "description": "(Optional) Aggregation functions to apply to the scores of each row. If not provided, no aggregation will be performed."
- }
- },
- "additionalProperties": false,
- "title": "BasicScoringFnParamsFields"
- }
- },
- "additionalProperties": false,
- "required": [
- "type",
- "equality"
- ],
- "title": "EqualityScoringFnParams"
- },
"EvalCandidate": {
"oneOf": [
{
@@ -6777,82 +6392,6 @@
}
}
},
- "FactualityScoringFnParams": {
- "type": "object",
- "properties": {
- "type": {
- "type": "string",
- "const": "factuality",
- "default": "factuality"
- },
- "factuality": {
- "type": "object",
- "properties": {
- "aggregation_functions": {
- "type": "array",
- "items": {
- "type": "string",
- "enum": [
- "average",
- "median",
- "categorical_count",
- "accuracy"
- ],
- "title": "AggregationFunctionType",
- "description": "A type of aggregation function."
- },
- "description": "(Optional) Aggregation functions to apply to the scores of each row. If not provided, no aggregation will be performed."
- }
- },
- "additionalProperties": false,
- "title": "BasicScoringFnParamsFields"
- }
- },
- "additionalProperties": false,
- "required": [
- "type",
- "factuality"
- ],
- "title": "FactualityScoringFnParams"
- },
- "FaithfulnessScoringFnParams": {
- "type": "object",
- "properties": {
- "type": {
- "type": "string",
- "const": "faithfulness",
- "default": "faithfulness"
- },
- "faithfulness": {
- "type": "object",
- "properties": {
- "aggregation_functions": {
- "type": "array",
- "items": {
- "type": "string",
- "enum": [
- "average",
- "median",
- "categorical_count",
- "accuracy"
- ],
- "title": "AggregationFunctionType",
- "description": "A type of aggregation function."
- },
- "description": "(Optional) Aggregation functions to apply to the scores of each row. If not provided, no aggregation will be performed."
- }
- },
- "additionalProperties": false,
- "title": "BasicScoringFnParamsFields"
- }
- },
- "additionalProperties": false,
- "required": [
- "type",
- "faithfulness"
- ],
- "title": "FaithfulnessScoringFnParams"
- },
"ModelCandidate": {
"type": "object",
"properties": {
@@ -6883,209 +6422,37 @@
"title": "ModelCandidate",
"description": "A model candidate for evaluation."
},
- "RegexParserMathScoringFnParams": {
+ "EvaluateBenchmarkRequest": {
"type": "object",
"properties": {
- "type": {
- "type": "string",
- "const": "regex_parser_math_response",
- "default": "regex_parser_math_response"
- },
- "regex_parser_math_response": {
- "type": "object",
- "properties": {
- "parsing_regexes": {
- "type": "array",
- "items": {
- "type": "string"
- },
- "description": "(Optional) Regexes to extract the answer from generated response."
- },
- "aggregation_functions": {
- "type": "array",
- "items": {
- "type": "string",
- "enum": [
- "average",
- "median",
- "categorical_count",
- "accuracy"
- ],
- "title": "AggregationFunctionType",
- "description": "A type of aggregation function."
- },
- "description": "(Optional) Aggregation functions to apply to the scores of each row. If not provided, no aggregation will be performed."
- }
- },
- "additionalProperties": false,
- "required": [
- "parsing_regexes"
- ],
- "title": "RegexParserScoringFnParamsFields"
+ "candidate": {
+ "$ref": "#/components/schemas/EvalCandidate",
+ "description": "The candidate to evaluate."
}
},
"additionalProperties": false,
"required": [
- "type",
- "regex_parser_math_response"
+ "candidate"
],
- "title": "RegexParserMathScoringFnParams"
+ "title": "EvaluateBenchmarkRequest"
},
- "RegexParserScoringFnParams": {
+ "Job": {
"type": "object",
"properties": {
- "type": {
- "type": "string",
- "const": "regex_parser",
- "default": "regex_parser"
- },
- "regex_parser": {
- "type": "object",
- "properties": {
- "parsing_regexes": {
- "type": "array",
- "items": {
- "type": "string"
- },
- "description": "(Optional) Regexes to extract the answer from generated response."
- },
- "aggregation_functions": {
- "type": "array",
- "items": {
- "type": "string",
- "enum": [
- "average",
- "median",
- "categorical_count",
- "accuracy"
- ],
- "title": "AggregationFunctionType",
- "description": "A type of aggregation function."
- },
- "description": "(Optional) Aggregation functions to apply to the scores of each row. If not provided, no aggregation will be performed."
- }
- },
- "additionalProperties": false,
- "required": [
- "parsing_regexes"
- ],
- "title": "RegexParserScoringFnParamsFields"
+ "job_id": {
+ "type": "string"
}
},
"additionalProperties": false,
"required": [
- "type",
- "regex_parser"
+ "job_id"
],
- "title": "RegexParserScoringFnParams"
- },
- "ScoringFnParams": {
- "oneOf": [
- {
- "$ref": "#/components/schemas/CustomLLMAsJudgeScoringFnParams"
- },
- {
- "$ref": "#/components/schemas/RegexParserScoringFnParams"
- },
- {
- "$ref": "#/components/schemas/RegexParserMathScoringFnParams"
- },
- {
- "$ref": "#/components/schemas/EqualityScoringFnParams"
- },
- {
- "$ref": "#/components/schemas/SubsetOfcoringFnParams"
- },
- {
- "$ref": "#/components/schemas/FactualityScoringFnParams"
- },
- {
- "$ref": "#/components/schemas/FaithfulnessScoringFnParams"
- },
- {
- "$ref": "#/components/schemas/AnswerCorrectnessScoringFnParams"
- },
- {
- "$ref": "#/components/schemas/AnswerRelevancyScoringFnParams"
- },
- {
- "$ref": "#/components/schemas/AnswerSimilarityScoringFnParams"
- },
- {
- "$ref": "#/components/schemas/ContextEntityRecallScoringFnParams"
- },
- {
- "$ref": "#/components/schemas/ContextPrecisionScoringFnParams"
- },
- {
- "$ref": "#/components/schemas/ContextRecallScoringFnParams"
- },
- {
- "$ref": "#/components/schemas/ContextRelevancyScoringFnParams"
- }
- ],
- "discriminator": {
- "propertyName": "type",
- "mapping": {
- "custom_llm_as_judge": "#/components/schemas/CustomLLMAsJudgeScoringFnParams",
- "regex_parser": "#/components/schemas/RegexParserScoringFnParams",
- "regex_parser_math_response": "#/components/schemas/RegexParserMathScoringFnParams",
- "equality": "#/components/schemas/EqualityScoringFnParams",
- "subset_of": "#/components/schemas/SubsetOfcoringFnParams",
- "factuality": "#/components/schemas/FactualityScoringFnParams",
- "faithfulness": "#/components/schemas/FaithfulnessScoringFnParams",
- "answer_correctness": "#/components/schemas/AnswerCorrectnessScoringFnParams",
- "answer_relevancy": "#/components/schemas/AnswerRelevancyScoringFnParams",
- "answer_similarity": "#/components/schemas/AnswerSimilarityScoringFnParams",
- "context_entity_recall": "#/components/schemas/ContextEntityRecallScoringFnParams",
- "context_precision": "#/components/schemas/ContextPrecisionScoringFnParams",
- "context_recall": "#/components/schemas/ContextRecallScoringFnParams",
- "context_relevancy": "#/components/schemas/ContextRelevancyScoringFnParams"
- }
- }
- },
- "SubsetOfcoringFnParams": {
- "type": "object",
- "properties": {
- "type": {
- "type": "string",
- "const": "subset_of",
- "default": "subset_of"
- },
- "subset_of": {
- "type": "object",
- "properties": {
- "aggregation_functions": {
- "type": "array",
- "items": {
- "type": "string",
- "enum": [
- "average",
- "median",
- "categorical_count",
- "accuracy"
- ],
- "title": "AggregationFunctionType",
- "description": "A type of aggregation function."
- },
- "description": "(Optional) Aggregation functions to apply to the scores of each row. If not provided, no aggregation will be performed."
- }
- },
- "additionalProperties": false,
- "title": "BasicScoringFnParamsFields"
- }
- },
- "additionalProperties": false,
- "required": [
- "type",
- "subset_of"
- ],
- "title": "SubsetOfcoringFnParams"
+ "title": "Job"
},
"EvaluateRowsRequest": {
"type": "object",
"properties": {
- "input_rows": {
+ "dataset_rows": {
"type": "array",
"items": {
"type": "object",
@@ -7114,23 +6481,23 @@
},
"description": "The rows to evaluate."
},
- "scoring_functions": {
+ "scoring_fn_ids": {
"type": "array",
"items": {
"type": "string"
},
- "description": "The scoring functions to use for the evaluation."
+ "description": "The scoring function IDs to use for the evaluation."
},
- "benchmark_config": {
- "$ref": "#/components/schemas/BenchmarkConfig",
- "description": "The configuration for the benchmark."
+ "candidate": {
+ "$ref": "#/components/schemas/EvalCandidate",
+ "description": "The candidate to evaluate."
}
},
"additionalProperties": false,
"required": [
- "input_rows",
- "scoring_functions",
- "benchmark_config"
+ "dataset_rows",
+ "scoring_fn_ids",
+ "candidate"
],
"title": "EvaluateRowsRequest"
},
@@ -7731,6 +7098,526 @@
"title": "PaginatedRowsResult",
"description": "A paginated list of rows from a dataset."
},
+ "AnswerCorrectnessScoringFn": {
+ "type": "object",
+ "properties": {
+ "type": {
+ "type": "string",
+ "const": "answer_correctness",
+ "default": "answer_correctness"
+ },
+ "answer_correctness": {
+ "type": "object",
+ "properties": {
+ "aggregation_functions": {
+ "type": "array",
+ "items": {
+ "type": "string",
+ "enum": [
+ "average",
+ "median",
+ "categorical_count",
+ "accuracy"
+ ],
+ "title": "AggregationFunctionType",
+ "description": "A type of aggregation function."
+ },
+ "description": "(Optional) Aggregation functions to apply to the scores of each row. If not provided, no aggregation will be performed."
+ }
+ },
+ "additionalProperties": false,
+ "title": "BasicScoringFnParams"
+ }
+ },
+ "additionalProperties": false,
+ "required": [
+ "type",
+ "answer_correctness"
+ ],
+ "title": "AnswerCorrectnessScoringFn"
+ },
+ "AnswerRelevancyScoringFn": {
+ "type": "object",
+ "properties": {
+ "type": {
+ "type": "string",
+ "const": "answer_relevancy",
+ "default": "answer_relevancy"
+ },
+ "answer_relevancy": {
+ "type": "object",
+ "properties": {
+ "aggregation_functions": {
+ "type": "array",
+ "items": {
+ "type": "string",
+ "enum": [
+ "average",
+ "median",
+ "categorical_count",
+ "accuracy"
+ ],
+ "title": "AggregationFunctionType",
+ "description": "A type of aggregation function."
+ },
+ "description": "(Optional) Aggregation functions to apply to the scores of each row. If not provided, no aggregation will be performed."
+ }
+ },
+ "additionalProperties": false,
+ "title": "BasicScoringFnParams"
+ }
+ },
+ "additionalProperties": false,
+ "required": [
+ "type",
+ "answer_relevancy"
+ ],
+ "title": "AnswerRelevancyScoringFn"
+ },
+ "AnswerSimilarityScoringFn": {
+ "type": "object",
+ "properties": {
+ "type": {
+ "type": "string",
+ "const": "answer_similarity",
+ "default": "answer_similarity"
+ },
+ "answer_similarity": {
+ "type": "object",
+ "properties": {
+ "aggregation_functions": {
+ "type": "array",
+ "items": {
+ "type": "string",
+ "enum": [
+ "average",
+ "median",
+ "categorical_count",
+ "accuracy"
+ ],
+ "title": "AggregationFunctionType",
+ "description": "A type of aggregation function."
+ },
+ "description": "(Optional) Aggregation functions to apply to the scores of each row. If not provided, no aggregation will be performed."
+ }
+ },
+ "additionalProperties": false,
+ "title": "BasicScoringFnParams"
+ }
+ },
+ "additionalProperties": false,
+ "required": [
+ "type",
+ "answer_similarity"
+ ],
+ "title": "AnswerSimilarityScoringFn"
+ },
+ "ContextEntityRecallScoringFn": {
+ "type": "object",
+ "properties": {
+ "type": {
+ "type": "string",
+ "const": "context_entity_recall",
+ "default": "context_entity_recall"
+ },
+ "context_entity_recall": {
+ "type": "object",
+ "properties": {
+ "aggregation_functions": {
+ "type": "array",
+ "items": {
+ "type": "string",
+ "enum": [
+ "average",
+ "median",
+ "categorical_count",
+ "accuracy"
+ ],
+ "title": "AggregationFunctionType",
+ "description": "A type of aggregation function."
+ },
+ "description": "(Optional) Aggregation functions to apply to the scores of each row. If not provided, no aggregation will be performed."
+ }
+ },
+ "additionalProperties": false,
+ "title": "BasicScoringFnParams"
+ }
+ },
+ "additionalProperties": false,
+ "required": [
+ "type",
+ "context_entity_recall"
+ ],
+ "title": "ContextEntityRecallScoringFn"
+ },
+ "ContextPrecisionScoringFn": {
+ "type": "object",
+ "properties": {
+ "type": {
+ "type": "string",
+ "const": "context_precision",
+ "default": "context_precision"
+ },
+ "context_precision": {
+ "type": "object",
+ "properties": {
+ "aggregation_functions": {
+ "type": "array",
+ "items": {
+ "type": "string",
+ "enum": [
+ "average",
+ "median",
+ "categorical_count",
+ "accuracy"
+ ],
+ "title": "AggregationFunctionType",
+ "description": "A type of aggregation function."
+ },
+ "description": "(Optional) Aggregation functions to apply to the scores of each row. If not provided, no aggregation will be performed."
+ }
+ },
+ "additionalProperties": false,
+ "title": "BasicScoringFnParams"
+ }
+ },
+ "additionalProperties": false,
+ "required": [
+ "type",
+ "context_precision"
+ ],
+ "title": "ContextPrecisionScoringFn"
+ },
+ "ContextRecallScoringFn": {
+ "type": "object",
+ "properties": {
+ "type": {
+ "type": "string",
+ "const": "context_recall",
+ "default": "context_recall"
+ },
+ "context_recall": {
+ "type": "object",
+ "properties": {
+ "aggregation_functions": {
+ "type": "array",
+ "items": {
+ "type": "string",
+ "enum": [
+ "average",
+ "median",
+ "categorical_count",
+ "accuracy"
+ ],
+ "title": "AggregationFunctionType",
+ "description": "A type of aggregation function."
+ },
+ "description": "(Optional) Aggregation functions to apply to the scores of each row. If not provided, no aggregation will be performed."
+ }
+ },
+ "additionalProperties": false,
+ "title": "BasicScoringFnParams"
+ }
+ },
+ "additionalProperties": false,
+ "required": [
+ "type",
+ "context_recall"
+ ],
+ "title": "ContextRecallScoringFn"
+ },
+ "ContextRelevancyScoringFn": {
+ "type": "object",
+ "properties": {
+ "type": {
+ "type": "string",
+ "const": "context_relevancy",
+ "default": "context_relevancy"
+ },
+ "context_relevancy": {
+ "type": "object",
+ "properties": {
+ "aggregation_functions": {
+ "type": "array",
+ "items": {
+ "type": "string",
+ "enum": [
+ "average",
+ "median",
+ "categorical_count",
+ "accuracy"
+ ],
+ "title": "AggregationFunctionType",
+ "description": "A type of aggregation function."
+ },
+ "description": "(Optional) Aggregation functions to apply to the scores of each row. If not provided, no aggregation will be performed."
+ }
+ },
+ "additionalProperties": false,
+ "title": "BasicScoringFnParams"
+ }
+ },
+ "additionalProperties": false,
+ "required": [
+ "type",
+ "context_relevancy"
+ ],
+ "title": "ContextRelevancyScoringFn"
+ },
+ "CustomLLMAsJudgeScoringFn": {
+ "type": "object",
+ "properties": {
+ "type": {
+ "type": "string",
+ "const": "custom_llm_as_judge",
+ "default": "custom_llm_as_judge"
+ },
+ "custom_llm_as_judge": {
+ "type": "object",
+ "properties": {
+ "type": {
+ "type": "string",
+ "const": "custom_llm_as_judge",
+ "default": "custom_llm_as_judge"
+ },
+ "judge_model": {
+ "type": "string"
+ },
+ "prompt_template": {
+ "type": "string"
+ },
+ "judge_score_regexes": {
+ "type": "array",
+ "items": {
+ "type": "string"
+ }
+ }
+ },
+ "additionalProperties": false,
+ "required": [
+ "type",
+ "judge_model"
+ ],
+ "title": "CustomLLMAsJudgeScoringFnParams"
+ }
+ },
+ "additionalProperties": false,
+ "required": [
+ "type",
+ "custom_llm_as_judge"
+ ],
+ "title": "CustomLLMAsJudgeScoringFn"
+ },
+ "EqualityScoringFn": {
+ "type": "object",
+ "properties": {
+ "type": {
+ "type": "string",
+ "const": "equality",
+ "default": "equality"
+ },
+ "equality": {
+ "type": "object",
+ "properties": {
+ "aggregation_functions": {
+ "type": "array",
+ "items": {
+ "type": "string",
+ "enum": [
+ "average",
+ "median",
+ "categorical_count",
+ "accuracy"
+ ],
+ "title": "AggregationFunctionType",
+ "description": "A type of aggregation function."
+ },
+ "description": "(Optional) Aggregation functions to apply to the scores of each row. If not provided, no aggregation will be performed."
+ }
+ },
+ "additionalProperties": false,
+ "title": "BasicScoringFnParams"
+ }
+ },
+ "additionalProperties": false,
+ "required": [
+ "type",
+ "equality"
+ ],
+ "title": "EqualityScoringFn"
+ },
+ "FactualityScoringFn": {
+ "type": "object",
+ "properties": {
+ "type": {
+ "type": "string",
+ "const": "factuality",
+ "default": "factuality"
+ },
+ "factuality": {
+ "type": "object",
+ "properties": {
+ "aggregation_functions": {
+ "type": "array",
+ "items": {
+ "type": "string",
+ "enum": [
+ "average",
+ "median",
+ "categorical_count",
+ "accuracy"
+ ],
+ "title": "AggregationFunctionType",
+ "description": "A type of aggregation function."
+ },
+ "description": "(Optional) Aggregation functions to apply to the scores of each row. If not provided, no aggregation will be performed."
+ }
+ },
+ "additionalProperties": false,
+ "title": "BasicScoringFnParams"
+ }
+ },
+ "additionalProperties": false,
+ "required": [
+ "type",
+ "factuality"
+ ],
+ "title": "FactualityScoringFn"
+ },
+ "FaithfulnessScoringFn": {
+ "type": "object",
+ "properties": {
+ "type": {
+ "type": "string",
+ "const": "faithfulness",
+ "default": "faithfulness"
+ },
+ "faithfulness": {
+ "type": "object",
+ "properties": {
+ "aggregation_functions": {
+ "type": "array",
+ "items": {
+ "type": "string",
+ "enum": [
+ "average",
+ "median",
+ "categorical_count",
+ "accuracy"
+ ],
+ "title": "AggregationFunctionType",
+ "description": "A type of aggregation function."
+ },
+ "description": "(Optional) Aggregation functions to apply to the scores of each row. If not provided, no aggregation will be performed."
+ }
+ },
+ "additionalProperties": false,
+ "title": "BasicScoringFnParams"
+ }
+ },
+ "additionalProperties": false,
+ "required": [
+ "type",
+ "faithfulness"
+ ],
+ "title": "FaithfulnessScoringFn"
+ },
+ "RegexParserMathScoringFn": {
+ "type": "object",
+ "properties": {
+ "type": {
+ "type": "string",
+ "const": "regex_parser_math_response",
+ "default": "regex_parser_math_response"
+ },
+ "regex_parser_math_response": {
+ "type": "object",
+ "properties": {
+ "parsing_regexes": {
+ "type": "array",
+ "items": {
+ "type": "string"
+ },
+ "description": "(Optional) Regexes to extract the answer from the generated response."
+ },
+ "aggregation_functions": {
+ "type": "array",
+ "items": {
+ "type": "string",
+ "enum": [
+ "average",
+ "median",
+ "categorical_count",
+ "accuracy"
+ ],
+ "title": "AggregationFunctionType",
+ "description": "A type of aggregation function."
+ },
+ "description": "(Optional) Aggregation functions to apply to the scores of each row. If not provided, no aggregation will be performed."
+ }
+ },
+ "additionalProperties": false,
+ "required": [
+ "parsing_regexes"
+ ],
+ "title": "RegexParserScoringFnParams"
+ }
+ },
+ "additionalProperties": false,
+ "required": [
+ "type",
+ "regex_parser_math_response"
+ ],
+ "title": "RegexParserMathScoringFn"
+ },
+ "RegexParserScoringFn": {
+ "type": "object",
+ "properties": {
+ "type": {
+ "type": "string",
+ "const": "regex_parser",
+ "default": "regex_parser"
+ },
+ "regex_parser": {
+ "type": "object",
+ "properties": {
+ "parsing_regexes": {
+ "type": "array",
+ "items": {
+ "type": "string"
+ },
+ "description": "(Optional) Regexes to extract the answer from the generated response."
+ },
+ "aggregation_functions": {
+ "type": "array",
+ "items": {
+ "type": "string",
+ "enum": [
+ "average",
+ "median",
+ "categorical_count",
+ "accuracy"
+ ],
+ "title": "AggregationFunctionType",
+ "description": "A type of aggregation function."
+ },
+ "description": "(Optional) Aggregation functions to apply to the scores of each row. If not provided, no aggregation will be performed."
+ }
+ },
+ "additionalProperties": false,
+ "required": [
+ "parsing_regexes"
+ ],
+ "title": "RegexParserScoringFnParams"
+ }
+ },
+ "additionalProperties": false,
+ "required": [
+ "type",
+ "regex_parser"
+ ],
+ "title": "RegexParserScoringFn"
+ },
"ScoringFn": {
"type": "object",
"properties": {
@@ -7749,7 +7636,7 @@
"default": "scoring_function"
},
"fn": {
- "$ref": "#/components/schemas/ScoringFnParams",
+ "$ref": "#/components/schemas/ScoringFnDefinition",
"description": "The scoring function type and parameters."
},
"metadata": {
@@ -7790,6 +7677,109 @@
],
"title": "ScoringFn"
},
+ "ScoringFnDefinition": {
+ "oneOf": [
+ {
+ "$ref": "#/components/schemas/CustomLLMAsJudgeScoringFn"
+ },
+ {
+ "$ref": "#/components/schemas/RegexParserScoringFn"
+ },
+ {
+ "$ref": "#/components/schemas/RegexParserMathScoringFn"
+ },
+ {
+ "$ref": "#/components/schemas/EqualityScoringFn"
+ },
+ {
+ "$ref": "#/components/schemas/SubsetOfScoringFn"
+ },
+ {
+ "$ref": "#/components/schemas/FactualityScoringFn"
+ },
+ {
+ "$ref": "#/components/schemas/FaithfulnessScoringFn"
+ },
+ {
+ "$ref": "#/components/schemas/AnswerCorrectnessScoringFn"
+ },
+ {
+ "$ref": "#/components/schemas/AnswerRelevancyScoringFn"
+ },
+ {
+ "$ref": "#/components/schemas/AnswerSimilarityScoringFn"
+ },
+ {
+ "$ref": "#/components/schemas/ContextEntityRecallScoringFn"
+ },
+ {
+ "$ref": "#/components/schemas/ContextPrecisionScoringFn"
+ },
+ {
+ "$ref": "#/components/schemas/ContextRecallScoringFn"
+ },
+ {
+ "$ref": "#/components/schemas/ContextRelevancyScoringFn"
+ }
+ ],
+ "discriminator": {
+ "propertyName": "type",
+ "mapping": {
+ "custom_llm_as_judge": "#/components/schemas/CustomLLMAsJudgeScoringFn",
+ "regex_parser": "#/components/schemas/RegexParserScoringFn",
+ "regex_parser_math_response": "#/components/schemas/RegexParserMathScoringFn",
+ "equality": "#/components/schemas/EqualityScoringFn",
+ "subset_of": "#/components/schemas/SubsetOfScoringFn",
+ "factuality": "#/components/schemas/FactualityScoringFn",
+ "faithfulness": "#/components/schemas/FaithfulnessScoringFn",
+ "answer_correctness": "#/components/schemas/AnswerCorrectnessScoringFn",
+ "answer_relevancy": "#/components/schemas/AnswerRelevancyScoringFn",
+ "answer_similarity": "#/components/schemas/AnswerSimilarityScoringFn",
+ "context_entity_recall": "#/components/schemas/ContextEntityRecallScoringFn",
+ "context_precision": "#/components/schemas/ContextPrecisionScoringFn",
+ "context_recall": "#/components/schemas/ContextRecallScoringFn",
+ "context_relevancy": "#/components/schemas/ContextRelevancyScoringFn"
+ }
+ }
+ },
+ "SubsetOfScoringFn": {
+ "type": "object",
+ "properties": {
+ "type": {
+ "type": "string",
+ "const": "subset_of",
+ "default": "subset_of"
+ },
+ "subset_of": {
+ "type": "object",
+ "properties": {
+ "aggregation_functions": {
+ "type": "array",
+ "items": {
+ "type": "string",
+ "enum": [
+ "average",
+ "median",
+ "categorical_count",
+ "accuracy"
+ ],
+ "title": "AggregationFunctionType",
+ "description": "A type of aggregation function."
+ },
+ "description": "(Optional) Aggregation functions to apply to the scores of each row. If not provided, no aggregation will be performed."
+ }
+ },
+ "additionalProperties": false,
+ "title": "BasicScoringFnParams"
+ }
+ },
+ "additionalProperties": false,
+ "required": [
+ "type",
+ "subset_of"
+ ],
+ "title": "SubsetOfScoringFn"
+ },
"Shield": {
"type": "object",
"properties": {
@@ -9992,7 +9982,7 @@
"type": "object",
"properties": {
"fn": {
- "$ref": "#/components/schemas/ScoringFnParams",
+ "$ref": "#/components/schemas/ScoringFnDefinition",
"description": "The type and parameters for the scoring function."
},
"scoring_fn_id": {
@@ -10168,33 +10158,6 @@
],
"title": "ResumeAgentTurnRequest"
},
- "RunEvalRequest": {
- "type": "object",
- "properties": {
- "benchmark_config": {
- "$ref": "#/components/schemas/BenchmarkConfig",
- "description": "The configuration for the benchmark."
- }
- },
- "additionalProperties": false,
- "required": [
- "benchmark_config"
- ],
- "title": "RunEvalRequest"
- },
- "Job": {
- "type": "object",
- "properties": {
- "job_id": {
- "type": "string"
- }
- },
- "additionalProperties": false,
- "required": [
- "job_id"
- ],
- "title": "Job"
- },
"RunShieldRequest": {
"type": "object",
"properties": {
@@ -10284,7 +10247,7 @@
"ScoreRequest": {
"type": "object",
"properties": {
- "input_rows": {
+ "dataset_rows": {
"type": "array",
"items": {
"type": "object",
@@ -10313,25 +10276,18 @@
},
"description": "The rows to score."
},
- "scoring_functions": {
- "type": "object",
- "additionalProperties": {
- "oneOf": [
- {
- "$ref": "#/components/schemas/ScoringFnParams"
- },
- {
- "type": "null"
- }
- ]
+ "scoring_fn_ids": {
+ "type": "array",
+ "items": {
+ "type": "string"
},
- "description": "The scoring functions to use for the scoring."
+ "description": "The scoring function ids to use for the scoring."
}
},
"additionalProperties": false,
"required": [
- "input_rows",
- "scoring_functions"
+ "dataset_rows",
+ "scoring_fn_ids"
],
"title": "ScoreRequest"
},
@@ -10353,36 +10309,25 @@
"title": "ScoreResponse",
"description": "The response from scoring."
},
- "ScoreBatchRequest": {
+ "ScoreDatasetRequest": {
"type": "object",
"properties": {
"dataset_id": {
"type": "string"
},
- "scoring_functions": {
- "type": "object",
- "additionalProperties": {
- "oneOf": [
- {
- "$ref": "#/components/schemas/ScoringFnParams"
- },
- {
- "type": "null"
- }
- ]
+ "scoring_fn_ids": {
+ "type": "array",
+ "items": {
+ "type": "string"
}
- },
- "save_results_dataset": {
- "type": "boolean"
}
},
"additionalProperties": false,
"required": [
"dataset_id",
- "scoring_functions",
- "save_results_dataset"
+ "scoring_fn_ids"
],
- "title": "ScoreBatchRequest"
+ "title": "ScoreDatasetRequest"
},
"ScoreBatchResponse": {
"type": "object",
diff --git a/docs/_static/llama-stack-spec.yaml b/docs/_static/llama-stack-spec.yaml
index 5b99ba5aa..310b77eb1 100644
--- a/docs/_static/llama-stack-spec.yaml
+++ b/docs/_static/llama-stack-spec.yaml
@@ -666,7 +666,44 @@ paths:
schema:
$ref: '#/components/schemas/EmbeddingsRequest'
required: true
- /v1/eval/benchmarks/{benchmark_id}/evaluations:
+ /v1/eval/benchmarks/{benchmark_id}/jobs:
+ post:
+ responses:
+ '200':
+ description: >-
+ The job that was created to run the evaluation.
+ content:
+ application/json:
+ schema:
+ $ref: '#/components/schemas/Job'
+ '400':
+ $ref: '#/components/responses/BadRequest400'
+ '429':
+ $ref: >-
+ #/components/responses/TooManyRequests429
+ '500':
+ $ref: >-
+ #/components/responses/InternalServerError500
+ default:
+ $ref: '#/components/responses/DefaultError'
+ tags:
+ - Eval
+ description: Run an evaluation on a benchmark.
+ parameters:
+ - name: benchmark_id
+ in: path
+ description: >-
+ The ID of the benchmark to run the evaluation on.
+ required: true
+ schema:
+ type: string
+ requestBody:
+ content:
+ application/json:
+ schema:
+ $ref: '#/components/schemas/EvaluateBenchmarkRequest'
+ required: true
+ /v1/eval/rows:
post:
responses:
'200':
@@ -688,15 +725,8 @@ paths:
$ref: '#/components/responses/DefaultError'
tags:
- Eval
- description: Evaluate a list of rows on a benchmark.
- parameters:
- - name: benchmark_id
- in: path
- description: >-
- The ID of the benchmark to run the evaluation on.
- required: true
- schema:
- type: string
+ description: Evaluate a list of rows on a candidate.
+ parameters: []
requestBody:
content:
application/json:
@@ -2377,43 +2407,6 @@ paths:
schema:
$ref: '#/components/schemas/ResumeAgentTurnRequest'
required: true
- /v1/eval/benchmarks/{benchmark_id}/jobs:
- post:
- responses:
- '200':
- description: >-
- The job that was created to run the evaluation.
- content:
- application/json:
- schema:
- $ref: '#/components/schemas/Job'
- '400':
- $ref: '#/components/responses/BadRequest400'
- '429':
- $ref: >-
- #/components/responses/TooManyRequests429
- '500':
- $ref: >-
- #/components/responses/InternalServerError500
- default:
- $ref: '#/components/responses/DefaultError'
- tags:
- - Eval
- description: Run an evaluation on a benchmark.
- parameters:
- - name: benchmark_id
- in: path
- description: >-
- The ID of the benchmark to run the evaluation on.
- required: true
- schema:
- type: string
- requestBody:
- content:
- application/json:
- schema:
- $ref: '#/components/schemas/RunEvalRequest'
- required: true
/v1/safety/run-shield:
post:
responses:
@@ -2525,7 +2518,7 @@ paths:
content:
application/json:
schema:
- $ref: '#/components/schemas/ScoreBatchRequest'
+ $ref: '#/components/schemas/ScoreDatasetRequest'
required: true
/v1/post-training/supervised-fine-tune:
post:
@@ -4448,311 +4441,6 @@ components:
- config
title: AgentCandidate
description: An agent candidate for evaluation.
- AnswerCorrectnessScoringFnParams:
- type: object
- properties:
- type:
- type: string
- const: answer_correctness
- default: answer_correctness
- answer_correctness:
- type: object
- properties:
- aggregation_functions:
- type: array
- items:
- type: string
- enum:
- - average
- - median
- - categorical_count
- - accuracy
- title: AggregationFunctionType
- description: A type of aggregation function.
- description: >-
- (Optional) Aggregation functions to apply to the scores of each row.
- If not provided, no aggregation will be performed.
- additionalProperties: false
- title: BasicScoringFnParamsFields
- additionalProperties: false
- required:
- - type
- - answer_correctness
- title: AnswerCorrectnessScoringFnParams
- AnswerRelevancyScoringFnParams:
- type: object
- properties:
- type:
- type: string
- const: answer_relevancy
- default: answer_relevancy
- answer_relevancy:
- type: object
- properties:
- aggregation_functions:
- type: array
- items:
- type: string
- enum:
- - average
- - median
- - categorical_count
- - accuracy
- title: AggregationFunctionType
- description: A type of aggregation function.
- description: >-
- (Optional) Aggregation functions to apply to the scores of each row.
- If not provided, no aggregation will be performed.
- additionalProperties: false
- title: BasicScoringFnParamsFields
- additionalProperties: false
- required:
- - type
- - answer_relevancy
- title: AnswerRelevancyScoringFnParams
- AnswerSimilarityScoringFnParams:
- type: object
- properties:
- type:
- type: string
- const: answer_similarity
- default: answer_similarity
- answer_similarity:
- type: object
- properties:
- aggregation_functions:
- type: array
- items:
- type: string
- enum:
- - average
- - median
- - categorical_count
- - accuracy
- title: AggregationFunctionType
- description: A type of aggregation function.
- description: >-
- (Optional) Aggregation functions to apply to the scores of each row.
- If not provided, no aggregation will be performed.
- additionalProperties: false
- title: BasicScoringFnParamsFields
- additionalProperties: false
- required:
- - type
- - answer_similarity
- title: AnswerSimilarityScoringFnParams
- BenchmarkConfig:
- type: object
- properties:
- eval_candidate:
- $ref: '#/components/schemas/EvalCandidate'
- description: The candidate to evaluate.
- scoring_params:
- type: object
- additionalProperties:
- $ref: '#/components/schemas/ScoringFnParams'
- description: >-
- Map between scoring function id and parameters for each scoring function
- you want to run
- num_examples:
- type: integer
- description: >-
- (Optional) The number of examples to evaluate. If not provided, all examples
- in the dataset will be evaluated
- additionalProperties: false
- required:
- - eval_candidate
- - scoring_params
- title: BenchmarkConfig
- description: >-
- A benchmark configuration for evaluation.
- ContextEntityRecallScoringFnParams:
- type: object
- properties:
- type:
- type: string
- const: context_entity_recall
- default: context_entity_recall
- context_entity_recall:
- type: object
- properties:
- aggregation_functions:
- type: array
- items:
- type: string
- enum:
- - average
- - median
- - categorical_count
- - accuracy
- title: AggregationFunctionType
- description: A type of aggregation function.
- description: >-
- (Optional) Aggregation functions to apply to the scores of each row.
- If not provided, no aggregation will be performed.
- additionalProperties: false
- title: BasicScoringFnParamsFields
- additionalProperties: false
- required:
- - type
- - context_entity_recall
- title: ContextEntityRecallScoringFnParams
- ContextPrecisionScoringFnParams:
- type: object
- properties:
- type:
- type: string
- const: context_precision
- default: context_precision
- context_precision:
- type: object
- properties:
- aggregation_functions:
- type: array
- items:
- type: string
- enum:
- - average
- - median
- - categorical_count
- - accuracy
- title: AggregationFunctionType
- description: A type of aggregation function.
- description: >-
- (Optional) Aggregation functions to apply to the scores of each row.
- If not provided, no aggregation will be performed.
- additionalProperties: false
- title: BasicScoringFnParamsFields
- additionalProperties: false
- required:
- - type
- - context_precision
- title: ContextPrecisionScoringFnParams
- ContextRecallScoringFnParams:
- type: object
- properties:
- type:
- type: string
- const: context_recall
- default: context_recall
- context_recall:
- type: object
- properties:
- aggregation_functions:
- type: array
- items:
- type: string
- enum:
- - average
- - median
- - categorical_count
- - accuracy
- title: AggregationFunctionType
- description: A type of aggregation function.
- description: >-
- (Optional) Aggregation functions to apply to the scores of each row.
- If not provided, no aggregation will be performed.
- additionalProperties: false
- title: BasicScoringFnParamsFields
- additionalProperties: false
- required:
- - type
- - context_recall
- title: ContextRecallScoringFnParams
- ContextRelevancyScoringFnParams:
- type: object
- properties:
- type:
- type: string
- const: context_relevancy
- default: context_relevancy
- context_relevancy:
- type: object
- properties:
- aggregation_functions:
- type: array
- items:
- type: string
- enum:
- - average
- - median
- - categorical_count
- - accuracy
- title: AggregationFunctionType
- description: A type of aggregation function.
- description: >-
- (Optional) Aggregation functions to apply to the scores of each row.
- If not provided, no aggregation will be performed.
- additionalProperties: false
- title: BasicScoringFnParamsFields
- additionalProperties: false
- required:
- - type
- - context_relevancy
- title: ContextRelevancyScoringFnParams
- CustomLLMAsJudgeScoringFnParams:
- type: object
- properties:
- type:
- type: string
- const: custom_llm_as_judge
- default: custom_llm_as_judge
- custom_llm_as_judge:
- type: object
- properties:
- type:
- type: string
- const: custom_llm_as_judge
- default: custom_llm_as_judge
- judge_model:
- type: string
- prompt_template:
- type: string
- judge_score_regexes:
- type: array
- items:
- type: string
- additionalProperties: false
- required:
- - type
- - judge_model
- title: CustomLLMAsJudgeScoringFnParamsFields
- additionalProperties: false
- required:
- - type
- - custom_llm_as_judge
- title: CustomLLMAsJudgeScoringFnParams
- EqualityScoringFnParams:
- type: object
- properties:
- type:
- type: string
- const: equality
- default: equality
- equality:
- type: object
- properties:
- aggregation_functions:
- type: array
- items:
- type: string
- enum:
- - average
- - median
- - categorical_count
- - accuracy
- title: AggregationFunctionType
- description: A type of aggregation function.
- description: >-
- (Optional) Aggregation functions to apply to the scores of each row.
- If not provided, no aggregation will be performed.
- additionalProperties: false
- title: BasicScoringFnParamsFields
- additionalProperties: false
- required:
- - type
- - equality
- title: EqualityScoringFnParams
EvalCandidate:
oneOf:
- $ref: '#/components/schemas/ModelCandidate'
@@ -4762,68 +4450,6 @@ components:
mapping:
model: '#/components/schemas/ModelCandidate'
agent: '#/components/schemas/AgentCandidate'
- FactualityScoringFnParams:
- type: object
- properties:
- type:
- type: string
- const: factuality
- default: factuality
- factuality:
- type: object
- properties:
- aggregation_functions:
- type: array
- items:
- type: string
- enum:
- - average
- - median
- - categorical_count
- - accuracy
- title: AggregationFunctionType
- description: A type of aggregation function.
- description: >-
- (Optional) Aggregation functions to apply to the scores of each row.
- If not provided, no aggregation will be performed.
- additionalProperties: false
- title: BasicScoringFnParamsFields
- additionalProperties: false
- required:
- - type
- - factuality
- title: FactualityScoringFnParams
- FaithfulnessScoringFnParams:
- type: object
- properties:
- type:
- type: string
- const: faithfulness
- default: faithfulness
- faithfulness:
- type: object
- properties:
- aggregation_functions:
- type: array
- items:
- type: string
- enum:
- - average
- - median
- - categorical_count
- - accuracy
- title: AggregationFunctionType
- description: A type of aggregation function.
- description: >-
- (Optional) Aggregation functions to apply to the scores of each row.
- If not provided, no aggregation will be performed.
- additionalProperties: false
- title: BasicScoringFnParamsFields
- additionalProperties: false
- required:
- - type
- - faithfulness
- title: FaithfulnessScoringFnParams
ModelCandidate:
type: object
properties:
@@ -4849,152 +4475,29 @@ components:
- sampling_params
title: ModelCandidate
description: A model candidate for evaluation.
- RegexParserMathScoringFnParams:
+ EvaluateBenchmarkRequest:
type: object
properties:
- type:
- type: string
- const: regex_parser_math_response
- default: regex_parser_math_response
- regex_parser_math_response:
- type: object
- properties:
- parsing_regexes:
- type: array
- items:
- type: string
- description: >-
- (Optional) Regexes to extract the answer from generated response.
- aggregation_functions:
- type: array
- items:
- type: string
- enum:
- - average
- - median
- - categorical_count
- - accuracy
- title: AggregationFunctionType
- description: A type of aggregation function.
- description: >-
- (Optional) Aggregation functions to apply to the scores of each row.
- If not provided, no aggregation will be performed.
- additionalProperties: false
- required:
- - parsing_regexes
- title: RegexParserScoringFnParamsFields
+ candidate:
+ $ref: '#/components/schemas/EvalCandidate'
+ description: The candidate to evaluate on.
additionalProperties: false
required:
- - type
- - regex_parser_math_response
- title: RegexParserMathScoringFnParams
- RegexParserScoringFnParams:
+ - candidate
+ title: EvaluateBenchmarkRequest
+ Job:
type: object
properties:
- type:
+ job_id:
type: string
- const: regex_parser
- default: regex_parser
- regex_parser:
- type: object
- properties:
- parsing_regexes:
- type: array
- items:
- type: string
- description: >-
- (Optional) Regexes to extract the answer from generated response.
- aggregation_functions:
- type: array
- items:
- type: string
- enum:
- - average
- - median
- - categorical_count
- - accuracy
- title: AggregationFunctionType
- description: A type of aggregation function.
- description: >-
- (Optional) Aggregation functions to apply to the scores of each row.
- If not provided, no aggregation will be performed.
- additionalProperties: false
- required:
- - parsing_regexes
- title: RegexParserScoringFnParamsFields
additionalProperties: false
required:
- - type
- - regex_parser
- title: RegexParserScoringFnParams
- ScoringFnParams:
- oneOf:
- - $ref: '#/components/schemas/CustomLLMAsJudgeScoringFnParams'
- - $ref: '#/components/schemas/RegexParserScoringFnParams'
- - $ref: '#/components/schemas/RegexParserMathScoringFnParams'
- - $ref: '#/components/schemas/EqualityScoringFnParams'
- - $ref: '#/components/schemas/SubsetOfcoringFnParams'
- - $ref: '#/components/schemas/FactualityScoringFnParams'
- - $ref: '#/components/schemas/FaithfulnessScoringFnParams'
- - $ref: '#/components/schemas/AnswerCorrectnessScoringFnParams'
- - $ref: '#/components/schemas/AnswerRelevancyScoringFnParams'
- - $ref: '#/components/schemas/AnswerSimilarityScoringFnParams'
- - $ref: '#/components/schemas/ContextEntityRecallScoringFnParams'
- - $ref: '#/components/schemas/ContextPrecisionScoringFnParams'
- - $ref: '#/components/schemas/ContextRecallScoringFnParams'
- - $ref: '#/components/schemas/ContextRelevancyScoringFnParams'
- discriminator:
- propertyName: type
- mapping:
- custom_llm_as_judge: '#/components/schemas/CustomLLMAsJudgeScoringFnParams'
- regex_parser: '#/components/schemas/RegexParserScoringFnParams'
- regex_parser_math_response: '#/components/schemas/RegexParserMathScoringFnParams'
- equality: '#/components/schemas/EqualityScoringFnParams'
- subset_of: '#/components/schemas/SubsetOfcoringFnParams'
- factuality: '#/components/schemas/FactualityScoringFnParams'
- faithfulness: '#/components/schemas/FaithfulnessScoringFnParams'
- answer_correctness: '#/components/schemas/AnswerCorrectnessScoringFnParams'
- answer_relevancy: '#/components/schemas/AnswerRelevancyScoringFnParams'
- answer_similarity: '#/components/schemas/AnswerSimilarityScoringFnParams'
- context_entity_recall: '#/components/schemas/ContextEntityRecallScoringFnParams'
- context_precision: '#/components/schemas/ContextPrecisionScoringFnParams'
- context_recall: '#/components/schemas/ContextRecallScoringFnParams'
- context_relevancy: '#/components/schemas/ContextRelevancyScoringFnParams'
- SubsetOfcoringFnParams:
- type: object
- properties:
- type:
- type: string
- const: subset_of
- default: subset_of
- subset_of:
- type: object
- properties:
- aggregation_functions:
- type: array
- items:
- type: string
- enum:
- - average
- - median
- - categorical_count
- - accuracy
- title: AggregationFunctionType
- description: A type of aggregation function.
- description: >-
- (Optional) Aggregation functions to apply to the scores of each row.
- If not provided, no aggregation will be performed.
- additionalProperties: false
- title: BasicScoringFnParamsFields
- additionalProperties: false
- required:
- - type
- - subset_of
- title: SubsetOfcoringFnParams
+ - job_id
+ title: Job
EvaluateRowsRequest:
type: object
properties:
- input_rows:
+ dataset_rows:
type: array
items:
type: object
@@ -5007,20 +4510,20 @@ components:
- type: array
- type: object
description: The rows to evaluate.
- scoring_functions:
+ scoring_fn_ids:
type: array
items:
type: string
description: >-
- The scoring functions to use for the evaluation.
- benchmark_config:
- $ref: '#/components/schemas/BenchmarkConfig'
- description: The configuration for the benchmark.
+ The scoring function ids to use for the evaluation.
+ candidate:
+ $ref: '#/components/schemas/EvalCandidate'
+ description: The candidate to evaluate on.
additionalProperties: false
required:
- - input_rows
- - scoring_functions
- - benchmark_config
+ - dataset_rows
+ - scoring_fn_ids
+ - candidate
title: EvaluateRowsRequest
EvaluateResponse:
type: object
@@ -5393,6 +4896,426 @@ components:
- total_count
title: PaginatedRowsResult
description: A paginated list of rows from a dataset.
+ AnswerCorrectnessScoringFn:
+ type: object
+ properties:
+ type:
+ type: string
+ const: answer_correctness
+ default: answer_correctness
+ answer_correctness:
+ type: object
+ properties:
+ aggregation_functions:
+ type: array
+ items:
+ type: string
+ enum:
+ - average
+ - median
+ - categorical_count
+ - accuracy
+ title: AggregationFunctionType
+ description: A type of aggregation function.
+ description: >-
+ (Optional) Aggregation functions to apply to the scores of each row.
+ If not provided, no aggregation will be performed.
+ additionalProperties: false
+ title: BasicScoringFnParams
+ additionalProperties: false
+ required:
+ - type
+ - answer_correctness
+ title: AnswerCorrectnessScoringFn
+ AnswerRelevancyScoringFn:
+ type: object
+ properties:
+ type:
+ type: string
+ const: answer_relevancy
+ default: answer_relevancy
+ answer_relevancy:
+ type: object
+ properties:
+ aggregation_functions:
+ type: array
+ items:
+ type: string
+ enum:
+ - average
+ - median
+ - categorical_count
+ - accuracy
+ title: AggregationFunctionType
+ description: A type of aggregation function.
+ description: >-
+ (Optional) Aggregation functions to apply to the scores of each row.
+ If not provided, no aggregation will be performed.
+ additionalProperties: false
+ title: BasicScoringFnParams
+ additionalProperties: false
+ required:
+ - type
+ - answer_relevancy
+ title: AnswerRelevancyScoringFn
+ AnswerSimilarityScoringFn:
+ type: object
+ properties:
+ type:
+ type: string
+ const: answer_similarity
+ default: answer_similarity
+ answer_similarity:
+ type: object
+ properties:
+ aggregation_functions:
+ type: array
+ items:
+ type: string
+ enum:
+ - average
+ - median
+ - categorical_count
+ - accuracy
+ title: AggregationFunctionType
+ description: A type of aggregation function.
+ description: >-
+ (Optional) Aggregation functions to apply to the scores of each row.
+ If not provided, no aggregation will be performed.
+ additionalProperties: false
+ title: BasicScoringFnParams
+ additionalProperties: false
+ required:
+ - type
+ - answer_similarity
+ title: AnswerSimilarityScoringFn
+ ContextEntityRecallScoringFn:
+ type: object
+ properties:
+ type:
+ type: string
+ const: context_entity_recall
+ default: context_entity_recall
+ context_entity_recall:
+ type: object
+ properties:
+ aggregation_functions:
+ type: array
+ items:
+ type: string
+ enum:
+ - average
+ - median
+ - categorical_count
+ - accuracy
+ title: AggregationFunctionType
+ description: A type of aggregation function.
+ description: >-
+ (Optional) Aggregation functions to apply to the scores of each row.
+ If not provided, no aggregation will be performed.
+ additionalProperties: false
+ title: BasicScoringFnParams
+ additionalProperties: false
+ required:
+ - type
+ - context_entity_recall
+ title: ContextEntityRecallScoringFn
+ ContextPrecisionScoringFn:
+ type: object
+ properties:
+ type:
+ type: string
+ const: context_precision
+ default: context_precision
+ context_precision:
+ type: object
+ properties:
+ aggregation_functions:
+ type: array
+ items:
+ type: string
+ enum:
+ - average
+ - median
+ - categorical_count
+ - accuracy
+ title: AggregationFunctionType
+ description: A type of aggregation function.
+ description: >-
+ (Optional) Aggregation functions to apply to the scores of each row.
+ If not provided, no aggregation will be performed.
+ additionalProperties: false
+ title: BasicScoringFnParams
+ additionalProperties: false
+ required:
+ - type
+ - context_precision
+ title: ContextPrecisionScoringFn
+ ContextRecallScoringFn:
+ type: object
+ properties:
+ type:
+ type: string
+ const: context_recall
+ default: context_recall
+ context_recall:
+ type: object
+ properties:
+ aggregation_functions:
+ type: array
+ items:
+ type: string
+ enum:
+ - average
+ - median
+ - categorical_count
+ - accuracy
+ title: AggregationFunctionType
+ description: A type of aggregation function.
+ description: >-
+ (Optional) Aggregation functions to apply to the scores of each row.
+ If not provided, no aggregation will be performed.
+ additionalProperties: false
+ title: BasicScoringFnParams
+ additionalProperties: false
+ required:
+ - type
+ - context_recall
+ title: ContextRecallScoringFn
+ ContextRelevancyScoringFn:
+ type: object
+ properties:
+ type:
+ type: string
+ const: context_relevancy
+ default: context_relevancy
+ context_relevancy:
+ type: object
+ properties:
+ aggregation_functions:
+ type: array
+ items:
+ type: string
+ enum:
+ - average
+ - median
+ - categorical_count
+ - accuracy
+ title: AggregationFunctionType
+ description: A type of aggregation function.
+ description: >-
+ (Optional) Aggregation functions to apply to the scores of each row.
+ If not provided, no aggregation will be performed.
+ additionalProperties: false
+ title: BasicScoringFnParams
+ additionalProperties: false
+ required:
+ - type
+ - context_relevancy
+ title: ContextRelevancyScoringFn
+ CustomLLMAsJudgeScoringFn:
+ type: object
+ properties:
+ type:
+ type: string
+ const: custom_llm_as_judge
+ default: custom_llm_as_judge
+ custom_llm_as_judge:
+ type: object
+ properties:
+ type:
+ type: string
+ const: custom_llm_as_judge
+ default: custom_llm_as_judge
+ judge_model:
+ type: string
+ prompt_template:
+ type: string
+ judge_score_regexes:
+ type: array
+ items:
+ type: string
+ additionalProperties: false
+ required:
+ - type
+ - judge_model
+ title: CustomLLMAsJudgeScoringFnParams
+ additionalProperties: false
+ required:
+ - type
+ - custom_llm_as_judge
+ title: CustomLLMAsJudgeScoringFn
+ EqualityScoringFn:
+ type: object
+ properties:
+ type:
+ type: string
+ const: equality
+ default: equality
+ equality:
+ type: object
+ properties:
+ aggregation_functions:
+ type: array
+ items:
+ type: string
+ enum:
+ - average
+ - median
+ - categorical_count
+ - accuracy
+ title: AggregationFunctionType
+ description: A type of aggregation function.
+ description: >-
+ (Optional) Aggregation functions to apply to the scores of each row.
+ If not provided, no aggregation will be performed.
+ additionalProperties: false
+ title: BasicScoringFnParams
+ additionalProperties: false
+ required:
+ - type
+ - equality
+ title: EqualityScoringFn
+ FactualityScoringFn:
+ type: object
+ properties:
+ type:
+ type: string
+ const: factuality
+ default: factuality
+ factuality:
+ type: object
+ properties:
+ aggregation_functions:
+ type: array
+ items:
+ type: string
+ enum:
+ - average
+ - median
+ - categorical_count
+ - accuracy
+ title: AggregationFunctionType
+ description: A type of aggregation function.
+ description: >-
+ (Optional) Aggregation functions to apply to the scores of each row.
+ If not provided, no aggregation will be performed.
+ additionalProperties: false
+ title: BasicScoringFnParams
+ additionalProperties: false
+ required:
+ - type
+ - factuality
+ title: FactualityScoringFn
+ FaithfulnessScoringFn:
+ type: object
+ properties:
+ type:
+ type: string
+ const: faithfulness
+ default: faithfulness
+ faithfulness:
+ type: object
+ properties:
+ aggregation_functions:
+ type: array
+ items:
+ type: string
+ enum:
+ - average
+ - median
+ - categorical_count
+ - accuracy
+ title: AggregationFunctionType
+ description: A type of aggregation function.
+ description: >-
+ (Optional) Aggregation functions to apply to the scores of each row.
+ If not provided, no aggregation will be performed.
+ additionalProperties: false
+ title: BasicScoringFnParams
+ additionalProperties: false
+ required:
+ - type
+ - faithfulness
+ title: FaithfulnessScoringFn
+ RegexParserMathScoringFn:
+ type: object
+ properties:
+ type:
+ type: string
+ const: regex_parser_math_response
+ default: regex_parser_math_response
+ regex_parser_math_response:
+ type: object
+ properties:
+ parsing_regexes:
+ type: array
+ items:
+ type: string
+ description: >-
+ (Optional) Regexes to extract the answer from the generated response.
+ aggregation_functions:
+ type: array
+ items:
+ type: string
+ enum:
+ - average
+ - median
+ - categorical_count
+ - accuracy
+ title: AggregationFunctionType
+ description: A type of aggregation function.
+ description: >-
+ (Optional) Aggregation functions to apply to the scores of each row.
+ If not provided, no aggregation will be performed.
+ additionalProperties: false
+ required:
+ - parsing_regexes
+ title: RegexParserScoringFnParams
+ additionalProperties: false
+ required:
+ - type
+ - regex_parser_math_response
+ title: RegexParserMathScoringFn
+ RegexParserScoringFn:
+ type: object
+ properties:
+ type:
+ type: string
+ const: regex_parser
+ default: regex_parser
+ regex_parser:
+ type: object
+ properties:
+ parsing_regexes:
+ type: array
+ items:
+ type: string
+ description: >-
+ (Optional) Regexes to extract the answer from the generated response.
+ aggregation_functions:
+ type: array
+ items:
+ type: string
+ enum:
+ - average
+ - median
+ - categorical_count
+ - accuracy
+ title: AggregationFunctionType
+ description: A type of aggregation function.
+ description: >-
+ (Optional) Aggregation functions to apply to the scores of each row.
+ If not provided, no aggregation will be performed.
+ additionalProperties: false
+ required:
+ - parsing_regexes
+ title: RegexParserScoringFnParams
+ additionalProperties: false
+ required:
+ - type
+ - regex_parser
+ title: RegexParserScoringFn
ScoringFn:
type: object
properties:
@@ -5407,7 +5330,7 @@ components:
const: scoring_function
default: scoring_function
fn:
- $ref: '#/components/schemas/ScoringFnParams'
+ $ref: '#/components/schemas/ScoringFnDefinition'
description: >-
The scoring function type and parameters.
metadata:
@@ -5431,6 +5354,70 @@ components:
- fn
- metadata
title: ScoringFn
+ ScoringFnDefinition:
+ oneOf:
+ - $ref: '#/components/schemas/CustomLLMAsJudgeScoringFn'
+ - $ref: '#/components/schemas/RegexParserScoringFn'
+ - $ref: '#/components/schemas/RegexParserMathScoringFn'
+ - $ref: '#/components/schemas/EqualityScoringFn'
+ - $ref: '#/components/schemas/SubsetOfScoringFn'
+ - $ref: '#/components/schemas/FactualityScoringFn'
+ - $ref: '#/components/schemas/FaithfulnessScoringFn'
+ - $ref: '#/components/schemas/AnswerCorrectnessScoringFn'
+ - $ref: '#/components/schemas/AnswerRelevancyScoringFn'
+ - $ref: '#/components/schemas/AnswerSimilarityScoringFn'
+ - $ref: '#/components/schemas/ContextEntityRecallScoringFn'
+ - $ref: '#/components/schemas/ContextPrecisionScoringFn'
+ - $ref: '#/components/schemas/ContextRecallScoringFn'
+ - $ref: '#/components/schemas/ContextRelevancyScoringFn'
+ discriminator:
+ propertyName: type
+ mapping:
+ custom_llm_as_judge: '#/components/schemas/CustomLLMAsJudgeScoringFn'
+ regex_parser: '#/components/schemas/RegexParserScoringFn'
+ regex_parser_math_response: '#/components/schemas/RegexParserMathScoringFn'
+ equality: '#/components/schemas/EqualityScoringFn'
+ subset_of: '#/components/schemas/SubsetOfScoringFn'
+ factuality: '#/components/schemas/FactualityScoringFn'
+ faithfulness: '#/components/schemas/FaithfulnessScoringFn'
+ answer_correctness: '#/components/schemas/AnswerCorrectnessScoringFn'
+ answer_relevancy: '#/components/schemas/AnswerRelevancyScoringFn'
+ answer_similarity: '#/components/schemas/AnswerSimilarityScoringFn'
+ context_entity_recall: '#/components/schemas/ContextEntityRecallScoringFn'
+ context_precision: '#/components/schemas/ContextPrecisionScoringFn'
+ context_recall: '#/components/schemas/ContextRecallScoringFn'
+ context_relevancy: '#/components/schemas/ContextRelevancyScoringFn'
+ SubsetOfScoringFn:
+ type: object
+ properties:
+ type:
+ type: string
+ const: subset_of
+ default: subset_of
+ subset_of:
+ type: object
+ properties:
+ aggregation_functions:
+ type: array
+ items:
+ type: string
+ enum:
+ - average
+ - median
+ - categorical_count
+ - accuracy
+ title: AggregationFunctionType
+ description: A type of aggregation function.
+ description: >-
+ (Optional) Aggregation functions to apply to the scores of each row.
+ If not provided, no aggregation will be performed.
+ additionalProperties: false
+ title: BasicScoringFnParams
+ additionalProperties: false
+ required:
+ - type
+ - subset_of
+ title: SubsetOfScoringFn
Shield:
type: object
properties:
@@ -6853,7 +6840,7 @@ components:
type: object
properties:
fn:
- $ref: '#/components/schemas/ScoringFnParams'
+ $ref: '#/components/schemas/ScoringFnDefinition'
description: >-
The type and parameters for the scoring function.
scoring_fn_id:
@@ -6959,25 +6946,6 @@ components:
required:
- tool_responses
title: ResumeAgentTurnRequest
- RunEvalRequest:
- type: object
- properties:
- benchmark_config:
- $ref: '#/components/schemas/BenchmarkConfig'
- description: The configuration for the benchmark.
- additionalProperties: false
- required:
- - benchmark_config
- title: RunEvalRequest
- Job:
- type: object
- properties:
- job_id:
- type: string
- additionalProperties: false
- required:
- - job_id
- title: Job
RunShieldRequest:
type: object
properties:
@@ -7034,7 +7002,7 @@ components:
ScoreRequest:
type: object
properties:
- input_rows:
+ dataset_rows:
type: array
items:
type: object
@@ -7047,18 +7015,16 @@ components:
- type: array
- type: object
description: The rows to score.
- scoring_functions:
- type: object
- additionalProperties:
- oneOf:
- - $ref: '#/components/schemas/ScoringFnParams'
- - type: 'null'
+ scoring_fn_ids:
+ type: array
+ items:
+ type: string
description: >-
- The scoring functions to use for the scoring.
+ The scoring function ids to use for the scoring.
additionalProperties: false
required:
- - input_rows
- - scoring_functions
+ - dataset_rows
+ - scoring_fn_ids
title: ScoreRequest
ScoreResponse:
type: object
@@ -7074,25 +7040,20 @@ components:
- results
title: ScoreResponse
description: The response from scoring.
- ScoreBatchRequest:
+ ScoreDatasetRequest:
type: object
properties:
dataset_id:
type: string
- scoring_functions:
- type: object
- additionalProperties:
- oneOf:
- - $ref: '#/components/schemas/ScoringFnParams'
- - type: 'null'
- save_results_dataset:
- type: boolean
+ scoring_fn_ids:
+ type: array
+ items:
+ type: string
additionalProperties: false
required:
- dataset_id
- - scoring_functions
- - save_results_dataset
- title: ScoreBatchRequest
+ - scoring_fn_ids
+ title: ScoreDatasetRequest
ScoreBatchResponse:
type: object
properties:
diff --git a/llama_stack/apis/benchmarks/benchmarks.py b/llama_stack/apis/benchmarks/benchmarks.py
index 01fc873e6..45edd3d6b 100644
--- a/llama_stack/apis/benchmarks/benchmarks.py
+++ b/llama_stack/apis/benchmarks/benchmarks.py
@@ -8,7 +8,6 @@ from typing import Any, Dict, List, Literal, Optional, Protocol, runtime_checkab
from pydantic import BaseModel, Field
from llama_stack.apis.resource import Resource, ResourceType
-from llama_stack.apis.scoring_functions import ScoringFnParams
from llama_stack.schema_utils import json_schema_type, webmethod
diff --git a/llama_stack/apis/eval/eval.py b/llama_stack/apis/eval/eval.py
index dec018d83..552afe0a2 100644
--- a/llama_stack/apis/eval/eval.py
+++ b/llama_stack/apis/eval/eval.py
@@ -13,7 +13,6 @@ from llama_stack.apis.agents import AgentConfig
from llama_stack.apis.common.job_types import Job, JobStatus
from llama_stack.apis.inference import SamplingParams, SystemMessage
from llama_stack.apis.scoring import ScoringResult
-from llama_stack.apis.scoring_functions import ScoringFnParams
from llama_stack.schema_utils import json_schema_type, register_schema, webmethod
@@ -49,27 +48,6 @@ EvalCandidate = register_schema(
)
-@json_schema_type
-class BenchmarkConfig(BaseModel):
- """A benchmark configuration for evaluation.
-
- :param eval_candidate: The candidate to evaluate.
- :param scoring_params: Map between scoring function id and parameters for each scoring function you want to run
- :param num_examples: (Optional) The number of examples to evaluate. If not provided, all examples in the dataset will be evaluated
- """
-
- eval_candidate: EvalCandidate
- scoring_params: Dict[str, ScoringFnParams] = Field(
- description="Map between scoring function id and parameters for each scoring function you want to run",
- default_factory=dict,
- )
- num_examples: Optional[int] = Field(
- description="Number of examples to evaluate (useful for testing), if not provided, all examples in the dataset will be evaluated",
- default=None,
- )
- # we could optinally add any specific dataset config here
-
-
@json_schema_type
class EvaluateResponse(BaseModel):
"""The response from an evaluation.
@@ -87,32 +65,30 @@ class Eval(Protocol):
"""Llama Stack Evaluation API for running evaluations on model and agent candidates."""
@webmethod(route="/eval/benchmarks/{benchmark_id}/jobs", method="POST")
- async def run_eval(
+ async def evaluate_benchmark(
self,
benchmark_id: str,
- benchmark_config: BenchmarkConfig,
+ candidate: EvalCandidate,
) -> Job:
"""Run an evaluation on a benchmark.
:param benchmark_id: The ID of the benchmark to run the evaluation on.
- :param benchmark_config: The configuration for the benchmark.
+ :param candidate: The candidate to evaluate.
:return: The job that was created to run the evaluation.
"""
- @webmethod(route="/eval/benchmarks/{benchmark_id}/evaluations", method="POST")
+ @webmethod(route="/eval/rows", method="POST")
async def evaluate_rows(
self,
- benchmark_id: str,
- input_rows: List[Dict[str, Any]],
- scoring_functions: List[str],
- benchmark_config: BenchmarkConfig,
+ dataset_rows: List[Dict[str, Any]],
+ scoring_fn_ids: List[str],
+ candidate: EvalCandidate,
) -> EvaluateResponse:
- """Evaluate a list of rows on a benchmark.
-
- :param benchmark_id: The ID of the benchmark to run the evaluation on.
- :param input_rows: The rows to evaluate.
- :param scoring_functions: The scoring functions to use for the evaluation.
- :param benchmark_config: The configuration for the benchmark.
+ """Evaluate a list of rows on a candidate.
+
+ :param dataset_rows: The rows to evaluate.
+ :param scoring_fn_ids: The scoring function ids to use for the evaluation.
+ :param candidate: The candidate to evaluate.
:return: EvaluateResponse object containing generations and scores
"""
diff --git a/llama_stack/apis/scoring/scoring.py b/llama_stack/apis/scoring/scoring.py
index 54a9ac2aa..eecca7799 100644
--- a/llama_stack/apis/scoring/scoring.py
+++ b/llama_stack/apis/scoring/scoring.py
@@ -8,7 +8,7 @@ from typing import Any, Dict, List, Optional, Protocol, runtime_checkable
from pydantic import BaseModel
-from llama_stack.apis.scoring_functions import ScoringFn, ScoringFnParams
+from llama_stack.apis.scoring_functions import ScoringFn
from llama_stack.schema_utils import json_schema_type, webmethod
# mapping of metric to value
@@ -56,23 +56,22 @@ class Scoring(Protocol):
scoring_function_store: ScoringFunctionStore
@webmethod(route="/scoring/score-batch", method="POST")
- async def score_batch(
+ async def score_dataset(
self,
dataset_id: str,
- scoring_functions: Dict[str, Optional[ScoringFnParams]],
- save_results_dataset: bool = False,
+ scoring_fn_ids: List[str],
) -> ScoreBatchResponse: ...
@webmethod(route="/scoring/score", method="POST")
async def score(
self,
- input_rows: List[Dict[str, Any]],
- scoring_functions: Dict[str, Optional[ScoringFnParams]],
+ dataset_rows: List[Dict[str, Any]],
+ scoring_fn_ids: List[str],
) -> ScoreResponse:
"""Score a list of rows.
- :param input_rows: The rows to score.
- :param scoring_functions: The scoring functions to use for the scoring.
+ :param dataset_rows: The rows to score.
+ :param scoring_fn_ids: The scoring function ids to use for the scoring.
:return: ScoreResponse object containing rows and aggregated results
"""
...
diff --git a/llama_stack/apis/scoring_functions/scoring_functions.py b/llama_stack/apis/scoring_functions/scoring_functions.py
index d6ee4f975..0e7ec4354 100644
--- a/llama_stack/apis/scoring_functions/scoring_functions.py
+++ b/llama_stack/apis/scoring_functions/scoring_functions.py
@@ -67,7 +67,7 @@ class AggregationFunctionType(Enum):
accuracy = "accuracy"
-class BasicScoringFnParamsFields(BaseModel):
+class BasicScoringFnParams(BaseModel):
"""
:param aggregation_functions: (Optional) Aggregation functions to apply to the scores of each row. If not provided, no aggregation will be performed.
"""
@@ -78,7 +78,7 @@ class BasicScoringFnParamsFields(BaseModel):
)
-class RegexParserScoringFnParamsFields(BaseModel):
+class RegexParserScoringFnParams(BaseModel):
"""
:param parsing_regexes: (Optional) Regexes to extract the answer from generated response.
:param aggregation_functions: (Optional) Aggregation functions to apply to the scores of each row. If not provided, no aggregation will be performed.
@@ -93,7 +93,7 @@ class RegexParserScoringFnParamsFields(BaseModel):
default_factory=list,
)
-class CustomLLMAsJudgeScoringFnParamsFields(BaseModel):
+class CustomLLMAsJudgeScoringFnParams(BaseModel):
type: Literal["custom_llm_as_judge"] = "custom_llm_as_judge"
judge_model: str
prompt_template: Optional[str] = None
@@ -103,103 +103,103 @@ class CustomLLMAsJudgeScoringFnParamsFields(BaseModel):
)
@json_schema_type
-class RegexParserScoringFnParams(BaseModel):
+class RegexParserScoringFn(BaseModel):
type: Literal["regex_parser"] = "regex_parser"
- regex_parser: RegexParserScoringFnParamsFields
+ regex_parser: RegexParserScoringFnParams
@json_schema_type
-class RegexParserMathScoringFnParams(BaseModel):
+class RegexParserMathScoringFn(BaseModel):
type: Literal["regex_parser_math_response"] = "regex_parser_math_response"
- regex_parser_math_response: RegexParserScoringFnParamsFields
+ regex_parser_math_response: RegexParserScoringFnParams
@json_schema_type
-class EqualityScoringFnParams(BaseModel):
+class EqualityScoringFn(BaseModel):
type: Literal["equality"] = "equality"
- equality: BasicScoringFnParamsFields
+ equality: BasicScoringFnParams
@json_schema_type
-class SubsetOfcoringFnParams(BaseModel):
+class SubsetOfScoringFn(BaseModel):
type: Literal["subset_of"] = "subset_of"
- subset_of: BasicScoringFnParamsFields
+ subset_of: BasicScoringFnParams
@json_schema_type
-class FactualityScoringFnParams(BaseModel):
+class FactualityScoringFn(BaseModel):
type: Literal["factuality"] = "factuality"
- factuality: BasicScoringFnParamsFields
+ factuality: BasicScoringFnParams
@json_schema_type
-class FaithfulnessScoringFnParams(BaseModel):
+class FaithfulnessScoringFn(BaseModel):
type: Literal["faithfulness"] = "faithfulness"
- faithfulness: BasicScoringFnParamsFields
+ faithfulness: BasicScoringFnParams
@json_schema_type
-class AnswerCorrectnessScoringFnParams(BaseModel):
+class AnswerCorrectnessScoringFn(BaseModel):
type: Literal["answer_correctness"] = "answer_correctness"
- answer_correctness: BasicScoringFnParamsFields
+ answer_correctness: BasicScoringFnParams
@json_schema_type
-class AnswerRelevancyScoringFnParams(BaseModel):
+class AnswerRelevancyScoringFn(BaseModel):
type: Literal["answer_relevancy"] = "answer_relevancy"
- answer_relevancy: BasicScoringFnParamsFields
+ answer_relevancy: BasicScoringFnParams
@json_schema_type
-class AnswerSimilarityScoringFnParams(BaseModel):
+class AnswerSimilarityScoringFn(BaseModel):
type: Literal["answer_similarity"] = "answer_similarity"
- answer_similarity: BasicScoringFnParamsFields
+ answer_similarity: BasicScoringFnParams
@json_schema_type
-class ContextEntityRecallScoringFnParams(BaseModel):
+class ContextEntityRecallScoringFn(BaseModel):
type: Literal["context_entity_recall"] = "context_entity_recall"
- context_entity_recall: BasicScoringFnParamsFields
+ context_entity_recall: BasicScoringFnParams
@json_schema_type
-class ContextPrecisionScoringFnParams(BaseModel):
+class ContextPrecisionScoringFn(BaseModel):
type: Literal["context_precision"] = "context_precision"
- context_precision: BasicScoringFnParamsFields
+ context_precision: BasicScoringFnParams
@json_schema_type
-class ContextRecallScoringFnParams(BaseModel):
+class ContextRecallScoringFn(BaseModel):
type: Literal["context_recall"] = "context_recall"
- context_recall: BasicScoringFnParamsFields
+ context_recall: BasicScoringFnParams
@json_schema_type
-class ContextRelevancyScoringFnParams(BaseModel):
+class ContextRelevancyScoringFn(BaseModel):
type: Literal["context_relevancy"] = "context_relevancy"
- context_relevancy: BasicScoringFnParamsFields
+ context_relevancy: BasicScoringFnParams
@json_schema_type
-class CustomLLMAsJudgeScoringFnParams(BaseModel):
+class CustomLLMAsJudgeScoringFn(BaseModel):
type: Literal["custom_llm_as_judge"] = "custom_llm_as_judge"
- custom_llm_as_judge: CustomLLMAsJudgeScoringFnParamsFields
+ custom_llm_as_judge: CustomLLMAsJudgeScoringFnParams
-ScoringFnParams = register_schema(
+ScoringFnDefinition = register_schema(
Annotated[
Union[
- CustomLLMAsJudgeScoringFnParams,
- RegexParserScoringFnParams,
- RegexParserMathScoringFnParams,
- EqualityScoringFnParams,
- SubsetOfcoringFnParams,
- FactualityScoringFnParams,
- FaithfulnessScoringFnParams,
- AnswerCorrectnessScoringFnParams,
- AnswerRelevancyScoringFnParams,
- AnswerSimilarityScoringFnParams,
- ContextEntityRecallScoringFnParams,
- ContextPrecisionScoringFnParams,
- ContextRecallScoringFnParams,
- ContextRelevancyScoringFnParams,
+ CustomLLMAsJudgeScoringFn,
+ RegexParserScoringFn,
+ RegexParserMathScoringFn,
+ EqualityScoringFn,
+ SubsetOfScoringFn,
+ FactualityScoringFn,
+ FaithfulnessScoringFn,
+ AnswerCorrectnessScoringFn,
+ AnswerRelevancyScoringFn,
+ AnswerSimilarityScoringFn,
+ ContextEntityRecallScoringFn,
+ ContextPrecisionScoringFn,
+ ContextRecallScoringFn,
+ ContextRelevancyScoringFn,
],
Field(discriminator="type"),
],
- name="ScoringFnParams",
+ name="ScoringFnDefinition",
)
@@ -208,7 +208,7 @@ class CommonScoringFnFields(BaseModel):
:param fn: The scoring function type and parameters.
:param metadata: (Optional) Any additional metadata for this definition (e.g. description).
"""
- fn: ScoringFnParams
+ fn: ScoringFnDefinition
metadata: Dict[str, Any] = Field(
default_factory=dict,
description="Any additional metadata for this definition (e.g. description)",
@@ -288,7 +288,7 @@ class ScoringFunctions(Protocol):
@webmethod(route="/scoring-functions", method="POST")
async def register_scoring_function(
self,
- fn: ScoringFnParams,
+ fn: ScoringFnDefinition,
scoring_fn_id: Optional[str] = None,
metadata: Optional[Dict[str, Any]] = None,
) -> ScoringFn: