diff --git a/api_update_plan.md b/api_update_plan.md index ffda31b00..20a8938e1 100644 --- a/api_update_plan.md +++ b/api_update_plan.md @@ -231,9 +231,9 @@ Before finalizing documentation, verify: [x] 10. `/Users/saip/Documents/GitHub/llama-stack/llama_stack/apis/datasets/datasets.py` - Dataset management [x] 11. `/Users/saip/Documents/GitHub/llama-stack/llama_stack/apis/datasetio/datasetio.py` - Dataset I/O operations [x] 12. `/Users/saip/Documents/GitHub/llama-stack/llama_stack/apis/post_training/post_training.py` - Training and fine-tuning -13. `/Users/saip/Documents/GitHub/llama-stack/llama_stack/apis/eval/eval.py` - Evaluation framework -14. `/Users/saip/Documents/GitHub/llama-stack/llama_stack/apis/scoring/scoring.py` - Scoring system -15. `/Users/saip/Documents/GitHub/llama-stack/llama_stack/apis/scoring_functions/scoring_functions.py` - Scoring function definitions +[x] 13. `/Users/saip/Documents/GitHub/llama-stack/llama_stack/apis/eval/eval.py` - Evaluation framework +[x] 14. `/Users/saip/Documents/GitHub/llama-stack/llama_stack/apis/scoring/scoring.py` - Scoring system +[x] 15. `/Users/saip/Documents/GitHub/llama-stack/llama_stack/apis/scoring_functions/scoring_functions.py` - Scoring function definitions 16. `/Users/saip/Documents/GitHub/llama-stack/llama_stack/apis/benchmarks/benchmarks.py` - Benchmarking framework 17. `/Users/saip/Documents/GitHub/llama-stack/llama_stack/apis/shields/shields.py` - Safety shields 18. `/Users/saip/Documents/GitHub/llama-stack/llama_stack/apis/batch_inference/batch_inference.py` - Batch inference operations diff --git a/docs/_static/llama-stack-spec.html b/docs/_static/llama-stack-spec.html index aa3fe644b..264b2e6b4 100644 --- a/docs/_static/llama-stack-spec.html +++ b/docs/_static/llama-stack-spec.html @@ -9301,7 +9301,8 @@ "categorical_count", "accuracy" ], - "title": "AggregationFunctionType" + "title": "AggregationFunctionType", + "description": "Types of aggregation functions for scoring results." }, "BasicScoringFnParams": { "type": "object", @@ -9309,13 +9310,15 @@ "type": { "$ref": "#/components/schemas/ScoringFnParamsType", "const": "basic", - "default": "basic" + "default": "basic", + "description": "The type of scoring function parameters, always basic" }, "aggregation_functions": { "type": "array", "items": { "$ref": "#/components/schemas/AggregationFunctionType" - } + }, + "description": "Aggregation functions to apply to the scores of each row" } }, "additionalProperties": false, @@ -9323,7 +9326,8 @@ "type", "aggregation_functions" ], - "title": "BasicScoringFnParams" + "title": "BasicScoringFnParams", + "description": "Parameters for basic scoring function configuration." }, "BenchmarkConfig": { "type": "object", @@ -9375,25 +9379,30 @@ "type": { "$ref": "#/components/schemas/ScoringFnParamsType", "const": "llm_as_judge", - "default": "llm_as_judge" + "default": "llm_as_judge", + "description": "The type of scoring function parameters, always llm_as_judge" }, "judge_model": { - "type": "string" + "type": "string", + "description": "Identifier of the LLM model to use as a judge for scoring" }, "prompt_template": { - "type": "string" + "type": "string", + "description": "(Optional) Custom prompt template for the judge model" }, "judge_score_regexes": { "type": "array", "items": { "type": "string" - } + }, + "description": "Regexes to extract the answer from generated response" }, "aggregation_functions": { "type": "array", "items": { "$ref": "#/components/schemas/AggregationFunctionType" - } + }, + "description": "Aggregation functions to apply to the scores of each row" } }, "additionalProperties": false, @@ -9403,7 +9412,8 @@ "judge_score_regexes", "aggregation_functions" ], - "title": "LLMAsJudgeScoringFnParams" + "title": "LLMAsJudgeScoringFnParams", + "description": "Parameters for LLM-as-judge scoring function configuration." }, "ModelCandidate": { "type": "object", @@ -9441,19 +9451,22 @@ "type": { "$ref": "#/components/schemas/ScoringFnParamsType", "const": "regex_parser", - "default": "regex_parser" + "default": "regex_parser", + "description": "The type of scoring function parameters, always regex_parser" }, "parsing_regexes": { "type": "array", "items": { "type": "string" - } + }, + "description": "Regex to extract the answer from generated response" }, "aggregation_functions": { "type": "array", "items": { "$ref": "#/components/schemas/AggregationFunctionType" - } + }, + "description": "Aggregation functions to apply to the scores of each row" } }, "additionalProperties": false, @@ -9462,7 +9475,8 @@ "parsing_regexes", "aggregation_functions" ], - "title": "RegexParserScoringFnParams" + "title": "RegexParserScoringFnParams", + "description": "Parameters for regex parser scoring function configuration." }, "ScoringFnParams": { "oneOf": [ @@ -9492,7 +9506,8 @@ "regex_parser", "basic" ], - "title": "ScoringFnParamsType" + "title": "ScoringFnParamsType", + "description": "Types of scoring function parameter configurations." }, "EvaluateRowsRequest": { "type": "object", @@ -10765,9 +10780,9 @@ "tool", "tool_group" ], - "title": "ResourceType", "const": "scoring_function", - "default": "scoring_function" + "default": "scoring_function", + "description": "The resource type, always scoring_function" }, "description": { "type": "string" @@ -10812,7 +10827,8 @@ "metadata", "return_type" ], - "title": "ScoringFn" + "title": "ScoringFn", + "description": "A scoring function resource for evaluating model outputs." }, "StringType": { "type": "object", @@ -16105,20 +16121,23 @@ "type": "object", "properties": { "dataset_id": { - "type": "string" + "type": "string", + "description": "(Optional) The identifier of the dataset that was scored" }, "results": { "type": "object", "additionalProperties": { "$ref": "#/components/schemas/ScoringResult" - } + }, + "description": "A map of scoring function name to ScoringResult" } }, "additionalProperties": false, "required": [ "results" ], - "title": "ScoreBatchResponse" + "title": "ScoreBatchResponse", + "description": "Response from batch scoring operations on datasets." }, "AlgorithmConfig": { "oneOf": [ diff --git a/docs/_static/llama-stack-spec.yaml b/docs/_static/llama-stack-spec.yaml index ce7a7293f..d24276596 100644 --- a/docs/_static/llama-stack-spec.yaml +++ b/docs/_static/llama-stack-spec.yaml @@ -6681,6 +6681,8 @@ components: - categorical_count - accuracy title: AggregationFunctionType + description: >- + Types of aggregation functions for scoring results. BasicScoringFnParams: type: object properties: @@ -6688,15 +6690,21 @@ components: $ref: '#/components/schemas/ScoringFnParamsType' const: basic default: basic + description: >- + The type of scoring function parameters, always basic aggregation_functions: type: array items: $ref: '#/components/schemas/AggregationFunctionType' + description: >- + Aggregation functions to apply to the scores of each row additionalProperties: false required: - type - aggregation_functions title: BasicScoringFnParams + description: >- + Parameters for basic scoring function configuration. BenchmarkConfig: type: object properties: @@ -6738,18 +6746,28 @@ components: $ref: '#/components/schemas/ScoringFnParamsType' const: llm_as_judge default: llm_as_judge + description: >- + The type of scoring function parameters, always llm_as_judge judge_model: type: string + description: >- + Identifier of the LLM model to use as a judge for scoring prompt_template: type: string + description: >- + (Optional) Custom prompt template for the judge model judge_score_regexes: type: array items: type: string + description: >- + Regexes to extract the answer from generated response aggregation_functions: type: array items: $ref: '#/components/schemas/AggregationFunctionType' + description: >- + Aggregation functions to apply to the scores of each row additionalProperties: false required: - type @@ -6757,6 +6775,8 @@ components: - judge_score_regexes - aggregation_functions title: LLMAsJudgeScoringFnParams + description: >- + Parameters for LLM-as-judge scoring function configuration. ModelCandidate: type: object properties: @@ -6789,20 +6809,28 @@ components: $ref: '#/components/schemas/ScoringFnParamsType' const: regex_parser default: regex_parser + description: >- + The type of scoring function parameters, always regex_parser parsing_regexes: type: array items: type: string + description: >- + Regex to extract the answer from generated response aggregation_functions: type: array items: $ref: '#/components/schemas/AggregationFunctionType' + description: >- + Aggregation functions to apply to the scores of each row additionalProperties: false required: - type - parsing_regexes - aggregation_functions title: RegexParserScoringFnParams + description: >- + Parameters for regex parser scoring function configuration. ScoringFnParams: oneOf: - $ref: '#/components/schemas/LLMAsJudgeScoringFnParams' @@ -6821,6 +6849,8 @@ components: - regex_parser - basic title: ScoringFnParamsType + description: >- + Types of scoring function parameter configurations. EvaluateRowsRequest: type: object properties: @@ -7742,9 +7772,10 @@ components: - benchmark - tool - tool_group - title: ResourceType const: scoring_function default: scoring_function + description: >- + The resource type, always scoring_function description: type: string metadata: @@ -7769,6 +7800,8 @@ components: - metadata - return_type title: ScoringFn + description: >- + A scoring function resource for evaluating model outputs. StringType: type: object properties: @@ -11587,14 +11620,20 @@ components: properties: dataset_id: type: string + description: >- + (Optional) The identifier of the dataset that was scored results: type: object additionalProperties: $ref: '#/components/schemas/ScoringResult' + description: >- + A map of scoring function name to ScoringResult additionalProperties: false required: - results title: ScoreBatchResponse + description: >- + Response from batch scoring operations on datasets. AlgorithmConfig: oneOf: - $ref: '#/components/schemas/LoraFinetuningConfig' diff --git a/llama_stack/apis/scoring/scoring.py b/llama_stack/apis/scoring/scoring.py index 732e80e79..f4dc017a2 100644 --- a/llama_stack/apis/scoring/scoring.py +++ b/llama_stack/apis/scoring/scoring.py @@ -31,6 +31,11 @@ class ScoringResult(BaseModel): @json_schema_type class ScoreBatchResponse(BaseModel): + """Response from batch scoring operations on datasets. + + :param dataset_id: (Optional) The identifier of the dataset that was scored + :param results: A map of scoring function name to ScoringResult + """ dataset_id: str | None = None results: dict[str, ScoringResult] diff --git a/llama_stack/apis/scoring_functions/scoring_functions.py b/llama_stack/apis/scoring_functions/scoring_functions.py index 684041308..72bf1a42e 100644 --- a/llama_stack/apis/scoring_functions/scoring_functions.py +++ b/llama_stack/apis/scoring_functions/scoring_functions.py @@ -25,6 +25,12 @@ from llama_stack.schema_utils import json_schema_type, register_schema, webmetho # with standard metrics so they can be rolled up? @json_schema_type class ScoringFnParamsType(StrEnum): + """Types of scoring function parameter configurations. + + :cvar llm_as_judge: Use an LLM model to evaluate and score responses + :cvar regex_parser: Use regex patterns to extract and score specific parts of responses + :cvar basic: Basic scoring with simple aggregation functions + """ llm_as_judge = "llm_as_judge" regex_parser = "regex_parser" basic = "basic" @@ -32,6 +38,14 @@ class ScoringFnParamsType(StrEnum): @json_schema_type class AggregationFunctionType(StrEnum): + """Types of aggregation functions for scoring results. + + :cvar average: Calculate the arithmetic mean of scores + :cvar weighted_average: Calculate a weighted average of scores + :cvar median: Calculate the median value of scores + :cvar categorical_count: Count occurrences of categorical values + :cvar accuracy: Calculate accuracy as the proportion of correct answers + """ average = "average" weighted_average = "weighted_average" median = "median" @@ -41,6 +55,14 @@ class AggregationFunctionType(StrEnum): @json_schema_type class LLMAsJudgeScoringFnParams(BaseModel): + """Parameters for LLM-as-judge scoring function configuration. + + :param type: The type of scoring function parameters, always llm_as_judge + :param judge_model: Identifier of the LLM model to use as a judge for scoring + :param prompt_template: (Optional) Custom prompt template for the judge model + :param judge_score_regexes: Regexes to extract the answer from generated response + :param aggregation_functions: Aggregation functions to apply to the scores of each row + """ type: Literal[ScoringFnParamsType.llm_as_judge] = ScoringFnParamsType.llm_as_judge judge_model: str prompt_template: str | None = None @@ -56,6 +78,12 @@ class LLMAsJudgeScoringFnParams(BaseModel): @json_schema_type class RegexParserScoringFnParams(BaseModel): + """Parameters for regex parser scoring function configuration. + + :param type: The type of scoring function parameters, always regex_parser + :param parsing_regexes: Regex to extract the answer from generated response + :param aggregation_functions: Aggregation functions to apply to the scores of each row + """ type: Literal[ScoringFnParamsType.regex_parser] = ScoringFnParamsType.regex_parser parsing_regexes: list[str] = Field( description="Regex to extract the answer from generated response", @@ -69,6 +97,11 @@ class RegexParserScoringFnParams(BaseModel): @json_schema_type class BasicScoringFnParams(BaseModel): + """Parameters for basic scoring function configuration. + + :param type: The type of scoring function parameters, always basic + :param aggregation_functions: Aggregation functions to apply to the scores of each row + """ type: Literal[ScoringFnParamsType.basic] = ScoringFnParamsType.basic aggregation_functions: list[AggregationFunctionType] = Field( description="Aggregation functions to apply to the scores of each row", @@ -100,6 +133,10 @@ class CommonScoringFnFields(BaseModel): @json_schema_type class ScoringFn(CommonScoringFnFields, Resource): + """A scoring function resource for evaluating model outputs. + + :param type: The resource type, always scoring_function + """ type: Literal[ResourceType.scoring_function] = ResourceType.scoring_function @property