score

2025-12-31 04:03:53 +00:00 · 2025-03-13 15:35:09 -07:00 · 2025-03-13 15:35:09 -07:00 · 819ffe0518
commit 819ffe0518
parent 2cf769e05e
3 changed files with 190 additions and 176 deletions
--- a/docs/_static/llama-stack-spec.html
+++ b/docs/_static/llama-stack-spec.html
@@ -962,7 +962,7 @@
                }
            }
        },
-        "/v1/eval/rows": {
+        "/v1/eval/evaluate_rows": {
            "post": {
                "responses": {
                    "200": {
@@ -3631,49 +3631,6 @@
                }
            }
        },
-        "/v1/scoring/rows": {
-            "post": {
-                "responses": {
-                    "200": {
-                        "description": "ScoreResponse object containing rows and aggregated results",
-                        "content": {
-                            "application/json": {
-                                "schema": {
-                                    "$ref": "#/components/schemas/ScoreResponse"
-                                }
-                            }
-                        }
-                    },
-                    "400": {
-                        "$ref": "#/components/responses/BadRequest400"
-                    },
-                    "429": {
-                        "$ref": "#/components/responses/TooManyRequests429"
-                    },
-                    "500": {
-                        "$ref": "#/components/responses/InternalServerError500"
-                    },
-                    "default": {
-                        "$ref": "#/components/responses/DefaultError"
-                    }
-                },
-                "tags": [
-                    "Scoring"
-                ],
-                "description": "Score a list of rows.",
-                "parameters": [],
-                "requestBody": {
-                    "content": {
-                        "application/json": {
-                            "schema": {
-                                "$ref": "#/components/schemas/ScoreRequest"
-                            }
-                        }
-                    },
-                    "required": true
-                }
-            }
-        },
        "/v1/scoring/jobs": {
            "post": {
                "responses": {
@@ -3717,6 +3674,49 @@
                }
            }
        },
+        "/v1/scoring/score-rows": {
+            "post": {
+                "responses": {
+                    "200": {
+                        "description": "ScoreResponse object containing rows and aggregated results",
+                        "content": {
+                            "application/json": {
+                                "schema": {
+                                    "$ref": "#/components/schemas/ScoreResponse"
+                                }
+                            }
+                        }
+                    },
+                    "400": {
+                        "$ref": "#/components/responses/BadRequest400"
+                    },
+                    "429": {
+                        "$ref": "#/components/responses/TooManyRequests429"
+                    },
+                    "500": {
+                        "$ref": "#/components/responses/InternalServerError500"
+                    },
+                    "default": {
+                        "$ref": "#/components/responses/DefaultError"
+                    }
+                },
+                "tags": [
+                    "Scoring"
+                ],
+                "description": "Score a list of rows.",
+                "parameters": [],
+                "requestBody": {
+                    "content": {
+                        "application/json": {
+                            "schema": {
+                                "$ref": "#/components/schemas/ScoreRowsRequest"
+                            }
+                        }
+                    },
+                    "required": true
+                }
+            }
+        },
        "/v1/post-training/supervised-fine-tune": {
            "post": {
                "responses": {
@@ -8714,7 +8714,7 @@
                        "type": "string",
                        "description": "A description of the scoring function type. - E.g. Write your custom judge prompt to score the answer."
                    },
-                    "supported_purposes": {
+                    "supported_dataset_purposes": {
                        "type": "array",
                        "items": {
                            "type": "string",
@@ -8736,7 +8736,7 @@
                "required": [
                    "type",
                    "description",
-                    "supported_purposes"
+                    "supported_dataset_purposes"
                ],
                "title": "ScoringFnTypeInfo"
            },
@@ -10181,7 +10181,46 @@
                ],
                "title": "SaveSpansToDatasetRequest"
            },
-            "ScoreRequest": {
+            "ScoreDatasetRequest": {
+                "type": "object",
+                "properties": {
+                    "dataset_id": {
+                        "type": "string"
+                    },
+                    "scoring_fn_ids": {
+                        "type": "array",
+                        "items": {
+                            "type": "string"
+                        }
+                    }
+                },
+                "additionalProperties": false,
+                "required": [
+                    "dataset_id",
+                    "scoring_fn_ids"
+                ],
+                "title": "ScoreDatasetRequest"
+            },
+            "ScoreBatchResponse": {
+                "type": "object",
+                "properties": {
+                    "dataset_id": {
+                        "type": "string"
+                    },
+                    "results": {
+                        "type": "object",
+                        "additionalProperties": {
+                            "$ref": "#/components/schemas/ScoringResult"
+                        }
+                    }
+                },
+                "additionalProperties": false,
+                "required": [
+                    "results"
+                ],
+                "title": "ScoreBatchResponse"
+            },
+            "ScoreRowsRequest": {
                "type": "object",
                "properties": {
                    "dataset_rows": {
@@ -10226,7 +10265,7 @@
                    "dataset_rows",
                    "scoring_fn_ids"
                ],
-                "title": "ScoreRequest"
+                "title": "ScoreRowsRequest"
            },
            "ScoreResponse": {
                "type": "object",
@@ -10246,45 +10285,6 @@
                "title": "ScoreResponse",
                "description": "The response from scoring."
            },
-            "ScoreDatasetRequest": {
-                "type": "object",
-                "properties": {
-                    "dataset_id": {
-                        "type": "string"
-                    },
-                    "scoring_fn_ids": {
-                        "type": "array",
-                        "items": {
-                            "type": "string"
-                        }
-                    }
-                },
-                "additionalProperties": false,
-                "required": [
-                    "dataset_id",
-                    "scoring_fn_ids"
-                ],
-                "title": "ScoreDatasetRequest"
-            },
-            "ScoreBatchResponse": {
-                "type": "object",
-                "properties": {
-                    "dataset_id": {
-                        "type": "string"
-                    },
-                    "results": {
-                        "type": "object",
-                        "additionalProperties": {
-                            "$ref": "#/components/schemas/ScoringResult"
-                        }
-                    }
-                },
-                "additionalProperties": false,
-                "required": [
-                    "results"
-                ],
-                "title": "ScoreBatchResponse"
-            },
            "AlgorithmConfig": {
                "oneOf": [
                    {
--- a/docs/_static/llama-stack-spec.yaml
+++ b/docs/_static/llama-stack-spec.yaml
@@ -659,7 +659,7 @@ paths:
            schema:
              $ref: '#/components/schemas/EvaluateBenchmarkRequest'
        required: true
-  /v1/eval/rows:
+  /v1/eval/evaluate_rows:
    post:
      responses:
        '200':
@@ -2467,36 +2467,6 @@ paths:
            schema:
              $ref: '#/components/schemas/SaveSpansToDatasetRequest'
        required: true
-  /v1/scoring/rows:
-    post:
-      responses:
-        '200':
-          description: >-
-            ScoreResponse object containing rows and aggregated results
-          content:
-            application/json:
-              schema:
-                $ref: '#/components/schemas/ScoreResponse'
-        '400':
-          $ref: '#/components/responses/BadRequest400'
-        '429':
-          $ref: >-
-            #/components/responses/TooManyRequests429
-        '500':
-          $ref: >-
-            #/components/responses/InternalServerError500
-        default:
-          $ref: '#/components/responses/DefaultError'
-      tags:
-        - Scoring
-      description: Score a list of rows.
-      parameters: []
-      requestBody:
-        content:
-          application/json:
-            schema:
-              $ref: '#/components/schemas/ScoreRequest'
-        required: true
  /v1/scoring/jobs:
    post:
      responses:
@@ -2526,6 +2496,36 @@ paths:
            schema:
              $ref: '#/components/schemas/ScoreDatasetRequest'
        required: true
+  /v1/scoring/score-rows:
+    post:
+      responses:
+        '200':
+          description: >-
+            ScoreResponse object containing rows and aggregated results
+          content:
+            application/json:
+              schema:
+                $ref: '#/components/schemas/ScoreResponse'
+        '400':
+          $ref: '#/components/responses/BadRequest400'
+        '429':
+          $ref: >-
+            #/components/responses/TooManyRequests429
+        '500':
+          $ref: >-
+            #/components/responses/InternalServerError500
+        default:
+          $ref: '#/components/responses/DefaultError'
+      tags:
+        - Scoring
+      description: Score a list of rows.
+      parameters: []
+      requestBody:
+        content:
+          application/json:
+            schema:
+              $ref: '#/components/schemas/ScoreRowsRequest'
+        required: true
  /v1/post-training/supervised-fine-tune:
    post:
      responses:
@@ -6019,7 +6019,7 @@ components:
          description: >-
            A description of the scoring function type. - E.g. Write your custom judge
            prompt to score the answer.
-        supported_purposes:
+        supported_dataset_purposes:
          type: array
          items:
            type: string
@@ -6039,7 +6039,7 @@ components:
      required:
        - type
        - description
-        - supported_purposes
+        - supported_dataset_purposes
      title: ScoringFnTypeInfo
    ListScoringFunctionTypesResponse:
      type: object
@@ -6982,47 +6982,6 @@ components:
        - attributes_to_save
        - dataset_id
      title: SaveSpansToDatasetRequest
-    ScoreRequest:
-      type: object
-      properties:
-        dataset_rows:
-          type: array
-          items:
-            type: object
-            additionalProperties:
-              oneOf:
-                - type: 'null'
-                - type: boolean
-                - type: number
-                - type: string
-                - type: array
-                - type: object
-          description: The rows to score.
-        scoring_fn_ids:
-          type: array
-          items:
-            type: string
-          description: >-
-            The scoring function ids to use for the scoring.
-      additionalProperties: false
-      required:
-        - dataset_rows
-        - scoring_fn_ids
-      title: ScoreRequest
-    ScoreResponse:
-      type: object
-      properties:
-        results:
-          type: object
-          additionalProperties:
-            $ref: '#/components/schemas/ScoringResult'
-          description: >-
-            A map of scoring function name to ScoringResult.
-      additionalProperties: false
-      required:
-        - results
-      title: ScoreResponse
-      description: The response from scoring.
    ScoreDatasetRequest:
      type: object
      properties:
@@ -7050,6 +7009,47 @@ components:
      required:
        - results
      title: ScoreBatchResponse
+    ScoreRowsRequest:
+      type: object
+      properties:
+        dataset_rows:
+          type: array
+          items:
+            type: object
+            additionalProperties:
+              oneOf:
+                - type: 'null'
+                - type: boolean
+                - type: number
+                - type: string
+                - type: array
+                - type: object
+          description: The rows to score.
+        scoring_fn_ids:
+          type: array
+          items:
+            type: string
+          description: >-
+            The scoring function ids to use for the scoring.
+      additionalProperties: false
+      required:
+        - dataset_rows
+        - scoring_fn_ids
+      title: ScoreRowsRequest
+    ScoreResponse:
+      type: object
+      properties:
+        results:
+          type: object
+          additionalProperties:
+            $ref: '#/components/schemas/ScoringResult'
+          description: >-
+            A map of scoring function name to ScoringResult.
+      additionalProperties: false
+      required:
+        - results
+      title: ScoreResponse
+      description: The response from scoring.
    AlgorithmConfig:
      oneOf:
        - $ref: '#/components/schemas/LoraFinetuningConfig'
--- a/llama_stack/apis/scoring_functions/scoring_functions.py
+++ b/llama_stack/apis/scoring_functions/scoring_functions.py
@@ -12,16 +12,17 @@ from typing import (
    Literal,
    Optional,
    Protocol,
-    Union,
    runtime_checkable,
+    Union,
 )

 from pydantic import BaseModel, Field
 from typing_extensions import Annotated

+from llama_stack.apis.datasets import DatasetPurpose
+
 from llama_stack.apis.resource import Resource, ResourceType
 from llama_stack.schema_utils import json_schema_type, register_schema, webmethod
-from llama_stack.apis.datasets import DatasetPurpose

 # Perhaps more structure can be imposed on these functions. Maybe they could be associated
 # with standard metrics so they can be rolled up?
@@ -93,6 +94,7 @@ class RegexParserScoringFnParams(BaseModel):
        default_factory=list,
    )

+
 class CustomLLMAsJudgeScoringFnParams(BaseModel):
    type: Literal["custom_llm_as_judge"] = "custom_llm_as_judge"
    judge_model: str
@@ -102,6 +104,7 @@ class CustomLLMAsJudgeScoringFnParams(BaseModel):
        default_factory=list,
    )

+
@json_schema_type
 class RegexParserScoringFn(BaseModel):
    type: Literal["regex_parser"] = "regex_parser"
@@ -113,36 +116,43 @@ class RegexParserMathScoringFn(BaseModel):
    type: Literal["regex_parser_math_response"] = "regex_parser_math_response"
    regex_parser_math_response: RegexParserScoringFnParams

+
@json_schema_type
 class EqualityScoringFn(BaseModel):
    type: Literal["equality"] = "equality"
    equality: BasicScoringFnParams

+
@json_schema_type
 class SubsetOfScoringFn(BaseModel):
    type: Literal["subset_of"] = "subset_of"
    subset_of: BasicScoringFnParams

+
@json_schema_type
 class FactualityScoringFn(BaseModel):
    type: Literal["factuality"] = "factuality"
    factuality: BasicScoringFnParams

+
@json_schema_type
 class FaithfulnessScoringFn(BaseModel):
    type: Literal["faithfulness"] = "faithfulness"
    faithfulness: BasicScoringFnParams

+
@json_schema_type
 class AnswerCorrectnessScoringFn(BaseModel):
    type: Literal["answer_correctness"] = "answer_correctness"
    answer_correctness: BasicScoringFnParams

+
@json_schema_type
 class AnswerRelevancyScoringFn(BaseModel):
    type: Literal["answer_relevancy"] = "answer_relevancy"
    answer_relevancy: BasicScoringFnParams

+
@json_schema_type
 class AnswerSimilarityScoringFn(BaseModel):
    type: Literal["answer_similarity"] = "answer_similarity"
@@ -208,6 +218,7 @@ class CommonScoringFnFields(BaseModel):
    :param fn: The scoring function type and parameters.
    :param metadata: (Optional) Any additional metadata for this definition (e.g. description).
    """
+
    fn: ScoringFnDefinition
    metadata: Dict[str, Any] = Field(
        default_factory=dict,
@@ -217,7 +228,9 @@ class CommonScoringFnFields(BaseModel):

@json_schema_type
 class ScoringFn(CommonScoringFnFields, Resource):
-    type: Literal[ResourceType.scoring_function.value] = ResourceType.scoring_function.value
+    type: Literal[ResourceType.scoring_function.value] = (
+        ResourceType.scoring_function.value
+    )

    @property
    def scoring_fn_id(self) -> str:
@@ -234,11 +247,12 @@ class ScoringFnTypeInfo(BaseModel):
    :param type: The type of scoring function.
    :param description: A description of the scoring function type.
        - E.g. Write your custom judge prompt to score the answer.
-    :param supported_purposes: The purposes that this scoring function can be used for.
+    :param supported_dataset_purposes: The purposes that this scoring function can be used for.
    """
+
    type: ScoringFunctionType
    description: str
-    supported_purposes: List[DatasetPurpose] = Field(
+    supported_dataset_purposes: List[DatasetPurpose] = Field(
        description="The supported purposes (supported dataset schema) that this scoring function can be used for. E.g. eval/question-answer",
        default_factory=list,
    )