This commit is contained in:
Sai Soundararaj 2025-07-01 17:00:35 -07:00
parent a9d8fdef90
commit 78ef9c605f
5 changed files with 125 additions and 25 deletions

View file

@ -231,9 +231,9 @@ Before finalizing documentation, verify:
[x] 10. `/Users/saip/Documents/GitHub/llama-stack/llama_stack/apis/datasets/datasets.py` - Dataset management [x] 10. `/Users/saip/Documents/GitHub/llama-stack/llama_stack/apis/datasets/datasets.py` - Dataset management
[x] 11. `/Users/saip/Documents/GitHub/llama-stack/llama_stack/apis/datasetio/datasetio.py` - Dataset I/O operations [x] 11. `/Users/saip/Documents/GitHub/llama-stack/llama_stack/apis/datasetio/datasetio.py` - Dataset I/O operations
[x] 12. `/Users/saip/Documents/GitHub/llama-stack/llama_stack/apis/post_training/post_training.py` - Training and fine-tuning [x] 12. `/Users/saip/Documents/GitHub/llama-stack/llama_stack/apis/post_training/post_training.py` - Training and fine-tuning
13. `/Users/saip/Documents/GitHub/llama-stack/llama_stack/apis/eval/eval.py` - Evaluation framework [x] 13. `/Users/saip/Documents/GitHub/llama-stack/llama_stack/apis/eval/eval.py` - Evaluation framework
14. `/Users/saip/Documents/GitHub/llama-stack/llama_stack/apis/scoring/scoring.py` - Scoring system [x] 14. `/Users/saip/Documents/GitHub/llama-stack/llama_stack/apis/scoring/scoring.py` - Scoring system
15. `/Users/saip/Documents/GitHub/llama-stack/llama_stack/apis/scoring_functions/scoring_functions.py` - Scoring function definitions [x] 15. `/Users/saip/Documents/GitHub/llama-stack/llama_stack/apis/scoring_functions/scoring_functions.py` - Scoring function definitions
16. `/Users/saip/Documents/GitHub/llama-stack/llama_stack/apis/benchmarks/benchmarks.py` - Benchmarking framework 16. `/Users/saip/Documents/GitHub/llama-stack/llama_stack/apis/benchmarks/benchmarks.py` - Benchmarking framework
17. `/Users/saip/Documents/GitHub/llama-stack/llama_stack/apis/shields/shields.py` - Safety shields 17. `/Users/saip/Documents/GitHub/llama-stack/llama_stack/apis/shields/shields.py` - Safety shields
18. `/Users/saip/Documents/GitHub/llama-stack/llama_stack/apis/batch_inference/batch_inference.py` - Batch inference operations 18. `/Users/saip/Documents/GitHub/llama-stack/llama_stack/apis/batch_inference/batch_inference.py` - Batch inference operations

View file

@ -9301,7 +9301,8 @@
"categorical_count", "categorical_count",
"accuracy" "accuracy"
], ],
"title": "AggregationFunctionType" "title": "AggregationFunctionType",
"description": "Types of aggregation functions for scoring results."
}, },
"BasicScoringFnParams": { "BasicScoringFnParams": {
"type": "object", "type": "object",
@ -9309,13 +9310,15 @@
"type": { "type": {
"$ref": "#/components/schemas/ScoringFnParamsType", "$ref": "#/components/schemas/ScoringFnParamsType",
"const": "basic", "const": "basic",
"default": "basic" "default": "basic",
"description": "The type of scoring function parameters, always basic"
}, },
"aggregation_functions": { "aggregation_functions": {
"type": "array", "type": "array",
"items": { "items": {
"$ref": "#/components/schemas/AggregationFunctionType" "$ref": "#/components/schemas/AggregationFunctionType"
} },
"description": "Aggregation functions to apply to the scores of each row"
} }
}, },
"additionalProperties": false, "additionalProperties": false,
@ -9323,7 +9326,8 @@
"type", "type",
"aggregation_functions" "aggregation_functions"
], ],
"title": "BasicScoringFnParams" "title": "BasicScoringFnParams",
"description": "Parameters for basic scoring function configuration."
}, },
"BenchmarkConfig": { "BenchmarkConfig": {
"type": "object", "type": "object",
@ -9375,25 +9379,30 @@
"type": { "type": {
"$ref": "#/components/schemas/ScoringFnParamsType", "$ref": "#/components/schemas/ScoringFnParamsType",
"const": "llm_as_judge", "const": "llm_as_judge",
"default": "llm_as_judge" "default": "llm_as_judge",
"description": "The type of scoring function parameters, always llm_as_judge"
}, },
"judge_model": { "judge_model": {
"type": "string" "type": "string",
"description": "Identifier of the LLM model to use as a judge for scoring"
}, },
"prompt_template": { "prompt_template": {
"type": "string" "type": "string",
"description": "(Optional) Custom prompt template for the judge model"
}, },
"judge_score_regexes": { "judge_score_regexes": {
"type": "array", "type": "array",
"items": { "items": {
"type": "string" "type": "string"
} },
"description": "Regexes to extract the answer from generated response"
}, },
"aggregation_functions": { "aggregation_functions": {
"type": "array", "type": "array",
"items": { "items": {
"$ref": "#/components/schemas/AggregationFunctionType" "$ref": "#/components/schemas/AggregationFunctionType"
} },
"description": "Aggregation functions to apply to the scores of each row"
} }
}, },
"additionalProperties": false, "additionalProperties": false,
@ -9403,7 +9412,8 @@
"judge_score_regexes", "judge_score_regexes",
"aggregation_functions" "aggregation_functions"
], ],
"title": "LLMAsJudgeScoringFnParams" "title": "LLMAsJudgeScoringFnParams",
"description": "Parameters for LLM-as-judge scoring function configuration."
}, },
"ModelCandidate": { "ModelCandidate": {
"type": "object", "type": "object",
@ -9441,19 +9451,22 @@
"type": { "type": {
"$ref": "#/components/schemas/ScoringFnParamsType", "$ref": "#/components/schemas/ScoringFnParamsType",
"const": "regex_parser", "const": "regex_parser",
"default": "regex_parser" "default": "regex_parser",
"description": "The type of scoring function parameters, always regex_parser"
}, },
"parsing_regexes": { "parsing_regexes": {
"type": "array", "type": "array",
"items": { "items": {
"type": "string" "type": "string"
} },
"description": "Regex to extract the answer from generated response"
}, },
"aggregation_functions": { "aggregation_functions": {
"type": "array", "type": "array",
"items": { "items": {
"$ref": "#/components/schemas/AggregationFunctionType" "$ref": "#/components/schemas/AggregationFunctionType"
} },
"description": "Aggregation functions to apply to the scores of each row"
} }
}, },
"additionalProperties": false, "additionalProperties": false,
@ -9462,7 +9475,8 @@
"parsing_regexes", "parsing_regexes",
"aggregation_functions" "aggregation_functions"
], ],
"title": "RegexParserScoringFnParams" "title": "RegexParserScoringFnParams",
"description": "Parameters for regex parser scoring function configuration."
}, },
"ScoringFnParams": { "ScoringFnParams": {
"oneOf": [ "oneOf": [
@ -9492,7 +9506,8 @@
"regex_parser", "regex_parser",
"basic" "basic"
], ],
"title": "ScoringFnParamsType" "title": "ScoringFnParamsType",
"description": "Types of scoring function parameter configurations."
}, },
"EvaluateRowsRequest": { "EvaluateRowsRequest": {
"type": "object", "type": "object",
@ -10765,9 +10780,9 @@
"tool", "tool",
"tool_group" "tool_group"
], ],
"title": "ResourceType",
"const": "scoring_function", "const": "scoring_function",
"default": "scoring_function" "default": "scoring_function",
"description": "The resource type, always scoring_function"
}, },
"description": { "description": {
"type": "string" "type": "string"
@ -10812,7 +10827,8 @@
"metadata", "metadata",
"return_type" "return_type"
], ],
"title": "ScoringFn" "title": "ScoringFn",
"description": "A scoring function resource for evaluating model outputs."
}, },
"StringType": { "StringType": {
"type": "object", "type": "object",
@ -16105,20 +16121,23 @@
"type": "object", "type": "object",
"properties": { "properties": {
"dataset_id": { "dataset_id": {
"type": "string" "type": "string",
"description": "(Optional) The identifier of the dataset that was scored"
}, },
"results": { "results": {
"type": "object", "type": "object",
"additionalProperties": { "additionalProperties": {
"$ref": "#/components/schemas/ScoringResult" "$ref": "#/components/schemas/ScoringResult"
} },
"description": "A map of scoring function name to ScoringResult"
} }
}, },
"additionalProperties": false, "additionalProperties": false,
"required": [ "required": [
"results" "results"
], ],
"title": "ScoreBatchResponse" "title": "ScoreBatchResponse",
"description": "Response from batch scoring operations on datasets."
}, },
"AlgorithmConfig": { "AlgorithmConfig": {
"oneOf": [ "oneOf": [

View file

@ -6681,6 +6681,8 @@ components:
- categorical_count - categorical_count
- accuracy - accuracy
title: AggregationFunctionType title: AggregationFunctionType
description: >-
Types of aggregation functions for scoring results.
BasicScoringFnParams: BasicScoringFnParams:
type: object type: object
properties: properties:
@ -6688,15 +6690,21 @@ components:
$ref: '#/components/schemas/ScoringFnParamsType' $ref: '#/components/schemas/ScoringFnParamsType'
const: basic const: basic
default: basic default: basic
description: >-
The type of scoring function parameters, always basic
aggregation_functions: aggregation_functions:
type: array type: array
items: items:
$ref: '#/components/schemas/AggregationFunctionType' $ref: '#/components/schemas/AggregationFunctionType'
description: >-
Aggregation functions to apply to the scores of each row
additionalProperties: false additionalProperties: false
required: required:
- type - type
- aggregation_functions - aggregation_functions
title: BasicScoringFnParams title: BasicScoringFnParams
description: >-
Parameters for basic scoring function configuration.
BenchmarkConfig: BenchmarkConfig:
type: object type: object
properties: properties:
@ -6738,18 +6746,28 @@ components:
$ref: '#/components/schemas/ScoringFnParamsType' $ref: '#/components/schemas/ScoringFnParamsType'
const: llm_as_judge const: llm_as_judge
default: llm_as_judge default: llm_as_judge
description: >-
The type of scoring function parameters, always llm_as_judge
judge_model: judge_model:
type: string type: string
description: >-
Identifier of the LLM model to use as a judge for scoring
prompt_template: prompt_template:
type: string type: string
description: >-
(Optional) Custom prompt template for the judge model
judge_score_regexes: judge_score_regexes:
type: array type: array
items: items:
type: string type: string
description: >-
Regexes to extract the answer from generated response
aggregation_functions: aggregation_functions:
type: array type: array
items: items:
$ref: '#/components/schemas/AggregationFunctionType' $ref: '#/components/schemas/AggregationFunctionType'
description: >-
Aggregation functions to apply to the scores of each row
additionalProperties: false additionalProperties: false
required: required:
- type - type
@ -6757,6 +6775,8 @@ components:
- judge_score_regexes - judge_score_regexes
- aggregation_functions - aggregation_functions
title: LLMAsJudgeScoringFnParams title: LLMAsJudgeScoringFnParams
description: >-
Parameters for LLM-as-judge scoring function configuration.
ModelCandidate: ModelCandidate:
type: object type: object
properties: properties:
@ -6789,20 +6809,28 @@ components:
$ref: '#/components/schemas/ScoringFnParamsType' $ref: '#/components/schemas/ScoringFnParamsType'
const: regex_parser const: regex_parser
default: regex_parser default: regex_parser
description: >-
The type of scoring function parameters, always regex_parser
parsing_regexes: parsing_regexes:
type: array type: array
items: items:
type: string type: string
description: >-
Regex to extract the answer from generated response
aggregation_functions: aggregation_functions:
type: array type: array
items: items:
$ref: '#/components/schemas/AggregationFunctionType' $ref: '#/components/schemas/AggregationFunctionType'
description: >-
Aggregation functions to apply to the scores of each row
additionalProperties: false additionalProperties: false
required: required:
- type - type
- parsing_regexes - parsing_regexes
- aggregation_functions - aggregation_functions
title: RegexParserScoringFnParams title: RegexParserScoringFnParams
description: >-
Parameters for regex parser scoring function configuration.
ScoringFnParams: ScoringFnParams:
oneOf: oneOf:
- $ref: '#/components/schemas/LLMAsJudgeScoringFnParams' - $ref: '#/components/schemas/LLMAsJudgeScoringFnParams'
@ -6821,6 +6849,8 @@ components:
- regex_parser - regex_parser
- basic - basic
title: ScoringFnParamsType title: ScoringFnParamsType
description: >-
Types of scoring function parameter configurations.
EvaluateRowsRequest: EvaluateRowsRequest:
type: object type: object
properties: properties:
@ -7742,9 +7772,10 @@ components:
- benchmark - benchmark
- tool - tool
- tool_group - tool_group
title: ResourceType
const: scoring_function const: scoring_function
default: scoring_function default: scoring_function
description: >-
The resource type, always scoring_function
description: description:
type: string type: string
metadata: metadata:
@ -7769,6 +7800,8 @@ components:
- metadata - metadata
- return_type - return_type
title: ScoringFn title: ScoringFn
description: >-
A scoring function resource for evaluating model outputs.
StringType: StringType:
type: object type: object
properties: properties:
@ -11587,14 +11620,20 @@ components:
properties: properties:
dataset_id: dataset_id:
type: string type: string
description: >-
(Optional) The identifier of the dataset that was scored
results: results:
type: object type: object
additionalProperties: additionalProperties:
$ref: '#/components/schemas/ScoringResult' $ref: '#/components/schemas/ScoringResult'
description: >-
A map of scoring function name to ScoringResult
additionalProperties: false additionalProperties: false
required: required:
- results - results
title: ScoreBatchResponse title: ScoreBatchResponse
description: >-
Response from batch scoring operations on datasets.
AlgorithmConfig: AlgorithmConfig:
oneOf: oneOf:
- $ref: '#/components/schemas/LoraFinetuningConfig' - $ref: '#/components/schemas/LoraFinetuningConfig'

View file

@ -31,6 +31,11 @@ class ScoringResult(BaseModel):
@json_schema_type @json_schema_type
class ScoreBatchResponse(BaseModel): class ScoreBatchResponse(BaseModel):
"""Response from batch scoring operations on datasets.
:param dataset_id: (Optional) The identifier of the dataset that was scored
:param results: A map of scoring function name to ScoringResult
"""
dataset_id: str | None = None dataset_id: str | None = None
results: dict[str, ScoringResult] results: dict[str, ScoringResult]

View file

@ -25,6 +25,12 @@ from llama_stack.schema_utils import json_schema_type, register_schema, webmetho
# with standard metrics so they can be rolled up? # with standard metrics so they can be rolled up?
@json_schema_type @json_schema_type
class ScoringFnParamsType(StrEnum): class ScoringFnParamsType(StrEnum):
"""Types of scoring function parameter configurations.
:cvar llm_as_judge: Use an LLM model to evaluate and score responses
:cvar regex_parser: Use regex patterns to extract and score specific parts of responses
:cvar basic: Basic scoring with simple aggregation functions
"""
llm_as_judge = "llm_as_judge" llm_as_judge = "llm_as_judge"
regex_parser = "regex_parser" regex_parser = "regex_parser"
basic = "basic" basic = "basic"
@ -32,6 +38,14 @@ class ScoringFnParamsType(StrEnum):
@json_schema_type @json_schema_type
class AggregationFunctionType(StrEnum): class AggregationFunctionType(StrEnum):
"""Types of aggregation functions for scoring results.
:cvar average: Calculate the arithmetic mean of scores
:cvar weighted_average: Calculate a weighted average of scores
:cvar median: Calculate the median value of scores
:cvar categorical_count: Count occurrences of categorical values
:cvar accuracy: Calculate accuracy as the proportion of correct answers
"""
average = "average" average = "average"
weighted_average = "weighted_average" weighted_average = "weighted_average"
median = "median" median = "median"
@ -41,6 +55,14 @@ class AggregationFunctionType(StrEnum):
@json_schema_type @json_schema_type
class LLMAsJudgeScoringFnParams(BaseModel): class LLMAsJudgeScoringFnParams(BaseModel):
"""Parameters for LLM-as-judge scoring function configuration.
:param type: The type of scoring function parameters, always llm_as_judge
:param judge_model: Identifier of the LLM model to use as a judge for scoring
:param prompt_template: (Optional) Custom prompt template for the judge model
:param judge_score_regexes: Regexes to extract the answer from generated response
:param aggregation_functions: Aggregation functions to apply to the scores of each row
"""
type: Literal[ScoringFnParamsType.llm_as_judge] = ScoringFnParamsType.llm_as_judge type: Literal[ScoringFnParamsType.llm_as_judge] = ScoringFnParamsType.llm_as_judge
judge_model: str judge_model: str
prompt_template: str | None = None prompt_template: str | None = None
@ -56,6 +78,12 @@ class LLMAsJudgeScoringFnParams(BaseModel):
@json_schema_type @json_schema_type
class RegexParserScoringFnParams(BaseModel): class RegexParserScoringFnParams(BaseModel):
"""Parameters for regex parser scoring function configuration.
:param type: The type of scoring function parameters, always regex_parser
:param parsing_regexes: Regex to extract the answer from generated response
:param aggregation_functions: Aggregation functions to apply to the scores of each row
"""
type: Literal[ScoringFnParamsType.regex_parser] = ScoringFnParamsType.regex_parser type: Literal[ScoringFnParamsType.regex_parser] = ScoringFnParamsType.regex_parser
parsing_regexes: list[str] = Field( parsing_regexes: list[str] = Field(
description="Regex to extract the answer from generated response", description="Regex to extract the answer from generated response",
@ -69,6 +97,11 @@ class RegexParserScoringFnParams(BaseModel):
@json_schema_type @json_schema_type
class BasicScoringFnParams(BaseModel): class BasicScoringFnParams(BaseModel):
"""Parameters for basic scoring function configuration.
:param type: The type of scoring function parameters, always basic
:param aggregation_functions: Aggregation functions to apply to the scores of each row
"""
type: Literal[ScoringFnParamsType.basic] = ScoringFnParamsType.basic type: Literal[ScoringFnParamsType.basic] = ScoringFnParamsType.basic
aggregation_functions: list[AggregationFunctionType] = Field( aggregation_functions: list[AggregationFunctionType] = Field(
description="Aggregation functions to apply to the scores of each row", description="Aggregation functions to apply to the scores of each row",
@ -100,6 +133,10 @@ class CommonScoringFnFields(BaseModel):
@json_schema_type @json_schema_type
class ScoringFn(CommonScoringFnFields, Resource): class ScoringFn(CommonScoringFnFields, Resource):
"""A scoring function resource for evaluating model outputs.
:param type: The resource type, always scoring_function
"""
type: Literal[ResourceType.scoring_function] = ResourceType.scoring_function type: Literal[ResourceType.scoring_function] = ResourceType.scoring_function
@property @property