a

2025-12-23 07:32:26 +00:00 · 2025-07-01 17:00:35 -07:00 · 2025-07-01 17:00:35 -07:00 · 78ef9c605f
commit 78ef9c605f
parent a9d8fdef90
5 changed files with 125 additions and 25 deletions
--- a/api_update_plan.md
+++ b/api_update_plan.md
@@ -231,9 +231,9 @@ Before finalizing documentation, verify:
 [x] 10. `/Users/saip/Documents/GitHub/llama-stack/llama_stack/apis/datasets/datasets.py` - Dataset management
 [x] 11. `/Users/saip/Documents/GitHub/llama-stack/llama_stack/apis/datasetio/datasetio.py` - Dataset I/O operations
 [x] 12. `/Users/saip/Documents/GitHub/llama-stack/llama_stack/apis/post_training/post_training.py` - Training and fine-tuning
-13. `/Users/saip/Documents/GitHub/llama-stack/llama_stack/apis/eval/eval.py` - Evaluation framework
+[x] 13. `/Users/saip/Documents/GitHub/llama-stack/llama_stack/apis/eval/eval.py` - Evaluation framework
-14. `/Users/saip/Documents/GitHub/llama-stack/llama_stack/apis/scoring/scoring.py` - Scoring system
+[x] 14. `/Users/saip/Documents/GitHub/llama-stack/llama_stack/apis/scoring/scoring.py` - Scoring system
-15. `/Users/saip/Documents/GitHub/llama-stack/llama_stack/apis/scoring_functions/scoring_functions.py` - Scoring function definitions
+[x] 15. `/Users/saip/Documents/GitHub/llama-stack/llama_stack/apis/scoring_functions/scoring_functions.py` - Scoring function definitions
 16. `/Users/saip/Documents/GitHub/llama-stack/llama_stack/apis/benchmarks/benchmarks.py` - Benchmarking framework
 17. `/Users/saip/Documents/GitHub/llama-stack/llama_stack/apis/shields/shields.py` - Safety shields
 18. `/Users/saip/Documents/GitHub/llama-stack/llama_stack/apis/batch_inference/batch_inference.py` - Batch inference operations
--- a/docs/_static/llama-stack-spec.html
+++ b/docs/_static/llama-stack-spec.html
@@ -9301,7 +9301,8 @@
                    "categorical_count",
                    "accuracy"
                ],
-                "title": "AggregationFunctionType"
+                "title": "AggregationFunctionType",
                "description": "Types of aggregation functions for scoring results."
            },
            "BasicScoringFnParams": {
                "type": "object",
@@ -9309,13 +9310,15 @@
                    "type": {
                        "$ref": "#/components/schemas/ScoringFnParamsType",
                        "const": "basic",
-                        "default": "basic"
+                        "default": "basic",
                        "description": "The type of scoring function parameters, always basic"
                    },
                    "aggregation_functions": {
                        "type": "array",
                        "items": {
                            "$ref": "#/components/schemas/AggregationFunctionType"
-                        }
+                        },
                        "description": "Aggregation functions to apply to the scores of each row"
                    }
                },
                "additionalProperties": false,
@@ -9323,7 +9326,8 @@
                    "type",
                    "aggregation_functions"
                ],
-                "title": "BasicScoringFnParams"
+                "title": "BasicScoringFnParams",
                "description": "Parameters for basic scoring function configuration."
            },
            "BenchmarkConfig": {
                "type": "object",
@@ -9375,25 +9379,30 @@
                    "type": {
                        "$ref": "#/components/schemas/ScoringFnParamsType",
                        "const": "llm_as_judge",
-                        "default": "llm_as_judge"
+                        "default": "llm_as_judge",
                        "description": "The type of scoring function parameters, always llm_as_judge"
                    },
                    "judge_model": {
-                        "type": "string"
+                        "type": "string",
                        "description": "Identifier of the LLM model to use as a judge for scoring"
                    },
                    "prompt_template": {
-                        "type": "string"
+                        "type": "string",
                        "description": "(Optional) Custom prompt template for the judge model"
                    },
                    "judge_score_regexes": {
                        "type": "array",
                        "items": {
                            "type": "string"
-                        }
+                        },
                        "description": "Regexes to extract the answer from generated response"
                    },
                    "aggregation_functions": {
                        "type": "array",
                        "items": {
                            "$ref": "#/components/schemas/AggregationFunctionType"
-                        }
+                        },
                        "description": "Aggregation functions to apply to the scores of each row"
                    }
                },
                "additionalProperties": false,
@@ -9403,7 +9412,8 @@
                    "judge_score_regexes",
                    "aggregation_functions"
                ],
-                "title": "LLMAsJudgeScoringFnParams"
+                "title": "LLMAsJudgeScoringFnParams",
                "description": "Parameters for LLM-as-judge scoring function configuration."
            },
            "ModelCandidate": {
                "type": "object",
@@ -9441,19 +9451,22 @@
                    "type": {
                        "$ref": "#/components/schemas/ScoringFnParamsType",
                        "const": "regex_parser",
-                        "default": "regex_parser"
+                        "default": "regex_parser",
                        "description": "The type of scoring function parameters, always regex_parser"
                    },
                    "parsing_regexes": {
                        "type": "array",
                        "items": {
                            "type": "string"
-                        }
+                        },
                        "description": "Regex to extract the answer from generated response"
                    },
                    "aggregation_functions": {
                        "type": "array",
                        "items": {
                            "$ref": "#/components/schemas/AggregationFunctionType"
-                        }
+                        },
                        "description": "Aggregation functions to apply to the scores of each row"
                    }
                },
                "additionalProperties": false,
@@ -9462,7 +9475,8 @@
                    "parsing_regexes",
                    "aggregation_functions"
                ],
-                "title": "RegexParserScoringFnParams"
+                "title": "RegexParserScoringFnParams",
                "description": "Parameters for regex parser scoring function configuration."
            },
            "ScoringFnParams": {
                "oneOf": [
@@ -9492,7 +9506,8 @@
                    "regex_parser",
                    "basic"
                ],
-                "title": "ScoringFnParamsType"
+                "title": "ScoringFnParamsType",
                "description": "Types of scoring function parameter configurations."
            },
            "EvaluateRowsRequest": {
                "type": "object",
@@ -10765,9 +10780,9 @@
                            "tool",
                            "tool_group"
                        ],
                        "title": "ResourceType",
                        "const": "scoring_function",
-                        "default": "scoring_function"
+                        "default": "scoring_function",
                        "description": "The resource type, always scoring_function"
                    },
                    "description": {
                        "type": "string"
@@ -10812,7 +10827,8 @@
                    "metadata",
                    "return_type"
                ],
-                "title": "ScoringFn"
+                "title": "ScoringFn",
                "description": "A scoring function resource for evaluating model outputs."
            },
            "StringType": {
                "type": "object",
@@ -16105,20 +16121,23 @@
                "type": "object",
                "properties": {
                    "dataset_id": {
-                        "type": "string"
+                        "type": "string",
                        "description": "(Optional) The identifier of the dataset that was scored"
                    },
                    "results": {
                        "type": "object",
                        "additionalProperties": {
                            "$ref": "#/components/schemas/ScoringResult"
-                        }
+                        },
                        "description": "A map of scoring function name to ScoringResult"
                    }
                },
                "additionalProperties": false,
                "required": [
                    "results"
                ],
-                "title": "ScoreBatchResponse"
+                "title": "ScoreBatchResponse",
                "description": "Response from batch scoring operations on datasets."
            },
            "AlgorithmConfig": {
                "oneOf": [
--- a/docs/_static/llama-stack-spec.yaml
+++ b/docs/_static/llama-stack-spec.yaml
@@ -6681,6 +6681,8 @@ components:
        - categorical_count
        - accuracy
      title: AggregationFunctionType
      description: >-
        Types of aggregation functions for scoring results.
    BasicScoringFnParams:
      type: object
      properties:
@@ -6688,15 +6690,21 @@ components:
          $ref: '#/components/schemas/ScoringFnParamsType'
          const: basic
          default: basic
          description: >-
            The type of scoring function parameters, always basic
        aggregation_functions:
          type: array
          items:
            $ref: '#/components/schemas/AggregationFunctionType'
          description: >-
            Aggregation functions to apply to the scores of each row
      additionalProperties: false
      required:
        - type
        - aggregation_functions
      title: BasicScoringFnParams
      description: >-
        Parameters for basic scoring function configuration.
    BenchmarkConfig:
      type: object
      properties:
@@ -6738,18 +6746,28 @@ components:
          $ref: '#/components/schemas/ScoringFnParamsType'
          const: llm_as_judge
          default: llm_as_judge
          description: >-
            The type of scoring function parameters, always llm_as_judge
        judge_model:
          type: string
          description: >-
            Identifier of the LLM model to use as a judge for scoring
        prompt_template:
          type: string
          description: >-
            (Optional) Custom prompt template for the judge model
        judge_score_regexes:
          type: array
          items:
            type: string
          description: >-
            Regexes to extract the answer from generated response
        aggregation_functions:
          type: array
          items:
            $ref: '#/components/schemas/AggregationFunctionType'
          description: >-
            Aggregation functions to apply to the scores of each row
      additionalProperties: false
      required:
        - type
@@ -6757,6 +6775,8 @@ components:
        - judge_score_regexes
        - aggregation_functions
      title: LLMAsJudgeScoringFnParams
      description: >-
        Parameters for LLM-as-judge scoring function configuration.
    ModelCandidate:
      type: object
      properties:
@@ -6789,20 +6809,28 @@ components:
          $ref: '#/components/schemas/ScoringFnParamsType'
          const: regex_parser
          default: regex_parser
          description: >-
            The type of scoring function parameters, always regex_parser
        parsing_regexes:
          type: array
          items:
            type: string
          description: >-
            Regex to extract the answer from generated response
        aggregation_functions:
          type: array
          items:
            $ref: '#/components/schemas/AggregationFunctionType'
          description: >-
            Aggregation functions to apply to the scores of each row
      additionalProperties: false
      required:
        - type
        - parsing_regexes
        - aggregation_functions
      title: RegexParserScoringFnParams
      description: >-
        Parameters for regex parser scoring function configuration.
    ScoringFnParams:
      oneOf:
        - $ref: '#/components/schemas/LLMAsJudgeScoringFnParams'
@@ -6821,6 +6849,8 @@ components:
        - regex_parser
        - basic
      title: ScoringFnParamsType
      description: >-
        Types of scoring function parameter configurations.
    EvaluateRowsRequest:
      type: object
      properties:
@@ -7742,9 +7772,10 @@ components:
            - benchmark
            - tool
            - tool_group
          title: ResourceType
          const: scoring_function
          default: scoring_function
          description: >-
            The resource type, always scoring_function
        description:
          type: string
        metadata:
@@ -7769,6 +7800,8 @@ components:
        - metadata
        - return_type
      title: ScoringFn
      description: >-
        A scoring function resource for evaluating model outputs.
    StringType:
      type: object
      properties:
@@ -11587,14 +11620,20 @@ components:
      properties:
        dataset_id:
          type: string
          description: >-
            (Optional) The identifier of the dataset that was scored
        results:
          type: object
          additionalProperties:
            $ref: '#/components/schemas/ScoringResult'
          description: >-
            A map of scoring function name to ScoringResult
      additionalProperties: false
      required:
        - results
      title: ScoreBatchResponse
      description: >-
        Response from batch scoring operations on datasets.
    AlgorithmConfig:
      oneOf:
        - $ref: '#/components/schemas/LoraFinetuningConfig'
--- a/llama_stack/apis/scoring/scoring.py
+++ b/llama_stack/apis/scoring/scoring.py
@@ -31,6 +31,11 @@ class ScoringResult(BaseModel):
@json_schema_type
 class ScoreBatchResponse(BaseModel):
    """Response from batch scoring operations on datasets.
    :param dataset_id: (Optional) The identifier of the dataset that was scored
    :param results: A map of scoring function name to ScoringResult
    """
    dataset_id: str | None = None
    results: dict[str, ScoringResult]
--- a/llama_stack/apis/scoring_functions/scoring_functions.py
+++ b/llama_stack/apis/scoring_functions/scoring_functions.py
@@ -25,6 +25,12 @@ from llama_stack.schema_utils import json_schema_type, register_schema, webmetho
 # with standard metrics so they can be rolled up?
@json_schema_type
 class ScoringFnParamsType(StrEnum):
    """Types of scoring function parameter configurations.
    :cvar llm_as_judge: Use an LLM model to evaluate and score responses
    :cvar regex_parser: Use regex patterns to extract and score specific parts of responses
    :cvar basic: Basic scoring with simple aggregation functions
    """
    llm_as_judge = "llm_as_judge"
    regex_parser = "regex_parser"
    basic = "basic"
@@ -32,6 +38,14 @@ class ScoringFnParamsType(StrEnum):
@json_schema_type
 class AggregationFunctionType(StrEnum):
    """Types of aggregation functions for scoring results.
    :cvar average: Calculate the arithmetic mean of scores
    :cvar weighted_average: Calculate a weighted average of scores
    :cvar median: Calculate the median value of scores
    :cvar categorical_count: Count occurrences of categorical values
    :cvar accuracy: Calculate accuracy as the proportion of correct answers
    """
    average = "average"
    weighted_average = "weighted_average"
    median = "median"
@@ -41,6 +55,14 @@ class AggregationFunctionType(StrEnum):
@json_schema_type
 class LLMAsJudgeScoringFnParams(BaseModel):
    """Parameters for LLM-as-judge scoring function configuration.
    :param type: The type of scoring function parameters, always llm_as_judge
    :param judge_model: Identifier of the LLM model to use as a judge for scoring
    :param prompt_template: (Optional) Custom prompt template for the judge model
    :param judge_score_regexes: Regexes to extract the answer from generated response
    :param aggregation_functions: Aggregation functions to apply to the scores of each row
    """
    type: Literal[ScoringFnParamsType.llm_as_judge] = ScoringFnParamsType.llm_as_judge
    judge_model: str
    prompt_template: str | None = None
@@ -56,6 +78,12 @@ class LLMAsJudgeScoringFnParams(BaseModel):
@json_schema_type
 class RegexParserScoringFnParams(BaseModel):
    """Parameters for regex parser scoring function configuration.
    :param type: The type of scoring function parameters, always regex_parser
    :param parsing_regexes: Regex to extract the answer from generated response
    :param aggregation_functions: Aggregation functions to apply to the scores of each row
    """
    type: Literal[ScoringFnParamsType.regex_parser] = ScoringFnParamsType.regex_parser
    parsing_regexes: list[str] = Field(
        description="Regex to extract the answer from generated response",
@@ -69,6 +97,11 @@ class RegexParserScoringFnParams(BaseModel):
@json_schema_type
 class BasicScoringFnParams(BaseModel):
    """Parameters for basic scoring function configuration.
    :param type: The type of scoring function parameters, always basic
    :param aggregation_functions: Aggregation functions to apply to the scores of each row
    """
    type: Literal[ScoringFnParamsType.basic] = ScoringFnParamsType.basic
    aggregation_functions: list[AggregationFunctionType] = Field(
        description="Aggregation functions to apply to the scores of each row",
@@ -100,6 +133,10 @@ class CommonScoringFnFields(BaseModel):
@json_schema_type
 class ScoringFn(CommonScoringFnFields, Resource):
    """A scoring function resource for evaluating model outputs.
    :param type: The resource type, always scoring_function
    """
    type: Literal[ResourceType.scoring_function] = ResourceType.scoring_function
    @property