From f9ea90c4f7bb386cf20f4ec8b44c31de71edc34f Mon Sep 17 00:00:00 2001 From: Xi Yan Date: Tue, 11 Mar 2025 22:45:48 -0700 Subject: [PATCH] docs --- docs/_static/llama-stack-spec.html | 39 +++++++++++------- docs/_static/llama-stack-spec.yaml | 30 ++++++++++++-- .../scoring_functions/scoring_functions.py | 41 +++++++++++++++++-- 3 files changed, 90 insertions(+), 20 deletions(-) diff --git a/docs/_static/llama-stack-spec.html b/docs/_static/llama-stack-spec.html index e4a06c2b9..13acd6c0c 100644 --- a/docs/_static/llama-stack-spec.html +++ b/docs/_static/llama-stack-spec.html @@ -6348,7 +6348,8 @@ "categorical_count", "accuracy" ], - "title": "AggregationFunctionType" + "title": "AggregationFunctionType", + "description": "A type of aggregation function." }, "BasicScoringFnParams": { "type": "object", @@ -6362,14 +6363,16 @@ "type": "array", "items": { "$ref": "#/components/schemas/AggregationFunctionType" - } + }, + "description": "(Optional) Aggregation functions to apply to the scores of each row. No aggregation for results is calculated if not provided." } }, "additionalProperties": false, "required": [ "type" ], - "title": "BasicScoringFnParams" + "title": "BasicScoringFnParams", + "description": "Parameters for a non-parameterized scoring function." }, "BenchmarkConfig": { "type": "object", @@ -6420,26 +6423,30 @@ "properties": { "type": { "type": "string", - "const": "llm_as_judge", - "default": "llm_as_judge" + "const": "custom_llm_as_judge", + "default": "custom_llm_as_judge" }, "judge_model": { - "type": "string" + "type": "string", + "description": "The model to use for scoring." }, "prompt_template": { - "type": "string" + "type": "string", + "description": "(Optional) The prompt template to use for scoring." }, "judge_score_regexes": { "type": "array", "items": { "type": "string" - } + }, + "description": "(Optional) Regexes to extract the score from the judge model's response." 
}, "aggregation_functions": { "type": "array", "items": { "$ref": "#/components/schemas/AggregationFunctionType" - } + }, + "description": "(Optional) Aggregation functions to apply to the scores of each row. No aggregation for results is calculated if not provided." } }, "additionalProperties": false, @@ -6447,7 +6454,8 @@ "type", "judge_model" ], - "title": "LLMAsJudgeScoringFnParams" + "title": "LLMAsJudgeScoringFnParams", + "description": "Parameters for a scoring function that uses a judge model to score the answer." }, "ModelCandidate": { "type": "object", @@ -6491,20 +6499,23 @@ "type": "array", "items": { "type": "string" - } + }, + "description": "Regexes to extract the answer from generated response" }, "aggregation_functions": { "type": "array", "items": { "$ref": "#/components/schemas/AggregationFunctionType" - } + }, + "description": "(Optional) Aggregation functions to apply to the scores of each row. No aggregation for results is calculated if not provided." } }, "additionalProperties": false, "required": [ "type" ], - "title": "RegexParserScoringFnParams" + "title": "RegexParserScoringFnParams", + "description": "Parameters for a scoring function that parses the answer from the generated response using regexes, and checks against the expected answer." 
}, "ScoringFnParams": { "oneOf": [ @@ -6521,7 +6532,7 @@ "discriminator": { "propertyName": "type", "mapping": { - "llm_as_judge": "#/components/schemas/LLMAsJudgeScoringFnParams", + "custom_llm_as_judge": "#/components/schemas/LLMAsJudgeScoringFnParams", "regex_parser": "#/components/schemas/RegexParserScoringFnParams", "basic": "#/components/schemas/BasicScoringFnParams" } diff --git a/docs/_static/llama-stack-spec.yaml b/docs/_static/llama-stack-spec.yaml index f2b2296da..d7d0107f3 100644 --- a/docs/_static/llama-stack-spec.yaml +++ b/docs/_static/llama-stack-spec.yaml @@ -4419,6 +4419,7 @@ components: - categorical_count - accuracy title: AggregationFunctionType + description: A type of aggregation function. BasicScoringFnParams: type: object properties: @@ -4430,10 +4431,15 @@ components: type: array items: $ref: '#/components/schemas/AggregationFunctionType' + description: >- + (Optional) Aggregation functions to apply to the scores of each row. No + aggregation for results is calculated if not provided. additionalProperties: false required: - type title: BasicScoringFnParams + description: >- + Parameters for a non-parameterized scoring function. BenchmarkConfig: type: object properties: @@ -4473,25 +4479,35 @@ components: properties: type: type: string - const: llm_as_judge - default: llm_as_judge + const: custom_llm_as_judge + default: custom_llm_as_judge judge_model: type: string + description: The model to use for scoring. prompt_template: type: string + description: >- + (Optional) The prompt template to use for scoring. judge_score_regexes: type: array items: type: string + description: >- + (Optional) Regexes to extract the score from the judge model's response. aggregation_functions: type: array items: $ref: '#/components/schemas/AggregationFunctionType' + description: >- + (Optional) Aggregation functions to apply to the scores of each row. No + aggregation for results is calculated if not provided. 
additionalProperties: false required: - type - judge_model title: LLMAsJudgeScoringFnParams + description: >- + Parameters for a scoring function that uses a judge model to score the answer. ModelCandidate: type: object properties: @@ -4528,14 +4544,22 @@ components: type: array items: type: string + description: >- + Regexes to extract the answer from generated response aggregation_functions: type: array items: $ref: '#/components/schemas/AggregationFunctionType' + description: >- + (Optional) Aggregation functions to apply to the scores of each row. No + aggregation for results is calculated if not provided. additionalProperties: false required: - type title: RegexParserScoringFnParams + description: >- + Parameters for a scoring function that parses the answer from the generated + response using regexes, and checks against the expected answer. ScoringFnParams: oneOf: - $ref: '#/components/schemas/LLMAsJudgeScoringFnParams' @@ -4544,7 +4568,7 @@ components: discriminator: propertyName: type mapping: - llm_as_judge: '#/components/schemas/LLMAsJudgeScoringFnParams' + custom_llm_as_judge: '#/components/schemas/LLMAsJudgeScoringFnParams' regex_parser: '#/components/schemas/RegexParserScoringFnParams' basic: '#/components/schemas/BasicScoringFnParams' EvaluateRowsRequest: diff --git a/llama_stack/apis/scoring_functions/scoring_functions.py b/llama_stack/apis/scoring_functions/scoring_functions.py index f84ea1577..f67226eb1 100644 --- a/llama_stack/apis/scoring_functions/scoring_functions.py +++ b/llama_stack/apis/scoring_functions/scoring_functions.py @@ -12,8 +12,8 @@ from typing import ( Literal, Optional, Protocol, - Union, runtime_checkable, + Union, ) from pydantic import BaseModel, Field @@ -65,6 +65,15 @@ class ScoringFunctionType(Enum): @json_schema_type class AggregationFunctionType(Enum): + """ + A type of aggregation function. + + :cvar average: Average the scores of each row. + :cvar median: Take the median of the scores of each row. 
+ :cvar categorical_count: Count the number of rows that match each category. + :cvar accuracy: Number of correct results over total results. + """ + average = "average" median = "median" categorical_count = "categorical_count" @@ -73,6 +82,15 @@ class AggregationFunctionType(Enum): @json_schema_type class LLMAsJudgeScoringFnParams(BaseModel): + """ + Parameters for a scoring function that uses a judge model to score the answer. + + :param judge_model: The model to use for scoring. + :param prompt_template: (Optional) The prompt template to use for scoring. + :param judge_score_regexes: (Optional) Regexes to extract the score from the judge model's response. + :param aggregation_functions: (Optional) Aggregation functions to apply to the scores of each row. No aggregation for results is calculated if not provided. + """ + type: Literal["custom_llm_as_judge"] = "custom_llm_as_judge" judge_model: str prompt_template: Optional[str] = None @@ -88,6 +106,13 @@ class LLMAsJudgeScoringFnParams(BaseModel): @json_schema_type class RegexParserScoringFnParams(BaseModel): + """ + Parameters for a scoring function that parses the answer from the generated response using regexes, and checks against the expected answer. + + :param parsing_regexes: Regexes to extract the answer from generated response + :param aggregation_functions: (Optional) Aggregation functions to apply to the scores of each row. No aggregation for results is calculated if not provided. + """ + type: Literal["regex_parser"] = "regex_parser" parsing_regexes: Optional[List[str]] = Field( description="Regexes to extract the answer from generated response", @@ -101,6 +126,12 @@ class RegexParserScoringFnParams(BaseModel): @json_schema_type class BasicScoringFnParams(BaseModel): + """ + Parameters for a non-parameterized scoring function. + + :param aggregation_functions: (Optional) Aggregation functions to apply to the scores of each row. No aggregation for results is calculated if not provided. 
+ """ + type: Literal["basic"] = "basic" aggregation_functions: Optional[List[AggregationFunctionType]] = Field( description="Aggregation functions to apply to the scores of each row", @@ -135,7 +166,9 @@ class CommonScoringFnFields(BaseModel): @json_schema_type class ScoringFn(CommonScoringFnFields, Resource): - type: Literal[ResourceType.scoring_function.value] = ResourceType.scoring_function.value + type: Literal[ResourceType.scoring_function.value] = ( + ResourceType.scoring_function.value + ) @property def scoring_fn_id(self) -> str: @@ -162,7 +195,9 @@ class ScoringFunctions(Protocol): async def list_scoring_functions(self) -> ListScoringFunctionsResponse: ... @webmethod(route="/scoring-functions/{scoring_fn_id:path}", method="GET") - async def get_scoring_function(self, scoring_fn_id: str, /) -> Optional[ScoringFn]: ... + async def get_scoring_function( + self, scoring_fn_id: str, / + ) -> Optional[ScoringFn]: ... @webmethod(route="/scoring-functions", method="POST") async def register_scoring_function(