score

2026-01-08 00:31:29 +00:00 · 2025-03-13 15:35:09 -07:00 · 2025-03-13 15:35:09 -07:00 · 819ffe0518
commit 819ffe0518
parent 2cf769e05e
3 changed files with 190 additions and 176 deletions
--- a/docs/_static/llama-stack-spec.html
+++ b/docs/_static/llama-stack-spec.html
@ -962,7 +962,7 @@
                }
            }
        },
-        "/v1/eval/rows": {
+        "/v1/eval/evaluate_rows": {
            "post": {
                "responses": {
                    "200": {
@ -3631,49 +3631,6 @@
                }
            }
        },
        "/v1/scoring/rows": {
            "post": {
                "responses": {
                    "200": {
                        "description": "ScoreResponse object containing rows and aggregated results",
                        "content": {
                            "application/json": {
                                "schema": {
                                    "$ref": "#/components/schemas/ScoreResponse"
                                }
                            }
                        }
                    },
                    "400": {
                        "$ref": "#/components/responses/BadRequest400"
                    },
                    "429": {
                        "$ref": "#/components/responses/TooManyRequests429"
                    },
                    "500": {
                        "$ref": "#/components/responses/InternalServerError500"
                    },
                    "default": {
                        "$ref": "#/components/responses/DefaultError"
                    }
                },
                "tags": [
                    "Scoring"
                ],
                "description": "Score a list of rows.",
                "parameters": [],
                "requestBody": {
                    "content": {
                        "application/json": {
                            "schema": {
                                "$ref": "#/components/schemas/ScoreRequest"
                            }
                        }
                    },
                    "required": true
                }
            }
        },
        "/v1/scoring/jobs": {
            "post": {
                "responses": {
@ -3717,6 +3674,49 @@
                }
            }
        },
        "/v1/scoring/score-rows": {
            "post": {
                "responses": {
                    "200": {
                        "description": "ScoreResponse object containing rows and aggregated results",
                        "content": {
                            "application/json": {
                                "schema": {
                                    "$ref": "#/components/schemas/ScoreResponse"
                                }
                            }
                        }
                    },
                    "400": {
                        "$ref": "#/components/responses/BadRequest400"
                    },
                    "429": {
                        "$ref": "#/components/responses/TooManyRequests429"
                    },
                    "500": {
                        "$ref": "#/components/responses/InternalServerError500"
                    },
                    "default": {
                        "$ref": "#/components/responses/DefaultError"
                    }
                },
                "tags": [
                    "Scoring"
                ],
                "description": "Score a list of rows.",
                "parameters": [],
                "requestBody": {
                    "content": {
                        "application/json": {
                            "schema": {
                                "$ref": "#/components/schemas/ScoreRowsRequest"
                            }
                        }
                    },
                    "required": true
                }
            }
        },
        "/v1/post-training/supervised-fine-tune": {
            "post": {
                "responses": {
@ -8714,7 +8714,7 @@
                        "type": "string",
                        "description": "A description of the scoring function type. - E.g. Write your custom judge prompt to score the answer."
                    },
-                    "supported_purposes": {
+                    "supported_dataset_purposes": {
                        "type": "array",
                        "items": {
                            "type": "string",
@ -8736,7 +8736,7 @@
                "required": [
                    "type",
                    "description",
-                    "supported_purposes"
+                    "supported_dataset_purposes"
                ],
                "title": "ScoringFnTypeInfo"
            },
@ -10181,7 +10181,46 @@
                ],
                "title": "SaveSpansToDatasetRequest"
            },
-            "ScoreRequest": {
+            "ScoreDatasetRequest": {
                "type": "object",
                "properties": {
                    "dataset_id": {
                        "type": "string"
                    },
                    "scoring_fn_ids": {
                        "type": "array",
                        "items": {
                            "type": "string"
                        }
                    }
                },
                "additionalProperties": false,
                "required": [
                    "dataset_id",
                    "scoring_fn_ids"
                ],
                "title": "ScoreDatasetRequest"
            },
            "ScoreBatchResponse": {
                "type": "object",
                "properties": {
                    "dataset_id": {
                        "type": "string"
                    },
                    "results": {
                        "type": "object",
                        "additionalProperties": {
                            "$ref": "#/components/schemas/ScoringResult"
                        }
                    }
                },
                "additionalProperties": false,
                "required": [
                    "results"
                ],
                "title": "ScoreBatchResponse"
            },
            "ScoreRowsRequest": {
                "type": "object",
                "properties": {
                    "dataset_rows": {
@ -10226,7 +10265,7 @@
                    "dataset_rows",
                    "scoring_fn_ids"
                ],
-                "title": "ScoreRequest"
+                "title": "ScoreRowsRequest"
            },
            "ScoreResponse": {
                "type": "object",
@ -10246,45 +10285,6 @@
                "title": "ScoreResponse",
                "description": "The response from scoring."
            },
            "ScoreDatasetRequest": {
                "type": "object",
                "properties": {
                    "dataset_id": {
                        "type": "string"
                    },
                    "scoring_fn_ids": {
                        "type": "array",
                        "items": {
                            "type": "string"
                        }
                    }
                },
                "additionalProperties": false,
                "required": [
                    "dataset_id",
                    "scoring_fn_ids"
                ],
                "title": "ScoreDatasetRequest"
            },
            "ScoreBatchResponse": {
                "type": "object",
                "properties": {
                    "dataset_id": {
                        "type": "string"
                    },
                    "results": {
                        "type": "object",
                        "additionalProperties": {
                            "$ref": "#/components/schemas/ScoringResult"
                        }
                    }
                },
                "additionalProperties": false,
                "required": [
                    "results"
                ],
                "title": "ScoreBatchResponse"
            },
            "AlgorithmConfig": {
                "oneOf": [
                    {
--- a/docs/_static/llama-stack-spec.yaml
+++ b/docs/_static/llama-stack-spec.yaml
@ -659,7 +659,7 @@ paths:
            schema:
              $ref: '#/components/schemas/EvaluateBenchmarkRequest'
        required: true
-  /v1/eval/rows:
+  /v1/eval/evaluate_rows:
    post:
      responses:
        '200':
@ -2467,36 +2467,6 @@ paths:
            schema:
              $ref: '#/components/schemas/SaveSpansToDatasetRequest'
        required: true
  /v1/scoring/rows:
    post:
      responses:
        '200':
          description: >-
            ScoreResponse object containing rows and aggregated results
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/ScoreResponse'
        '400':
          $ref: '#/components/responses/BadRequest400'
        '429':
          $ref: >-
            #/components/responses/TooManyRequests429
        '500':
          $ref: >-
            #/components/responses/InternalServerError500
        default:
          $ref: '#/components/responses/DefaultError'
      tags:
        - Scoring
      description: Score a list of rows.
      parameters: []
      requestBody:
        content:
          application/json:
            schema:
              $ref: '#/components/schemas/ScoreRequest'
        required: true
  /v1/scoring/jobs:
    post:
      responses:
@ -2526,6 +2496,36 @@ paths:
            schema:
              $ref: '#/components/schemas/ScoreDatasetRequest'
        required: true
  /v1/scoring/score-rows:
    post:
      responses:
        '200':
          description: >-
            ScoreResponse object containing rows and aggregated results
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/ScoreResponse'
        '400':
          $ref: '#/components/responses/BadRequest400'
        '429':
          $ref: >-
            #/components/responses/TooManyRequests429
        '500':
          $ref: >-
            #/components/responses/InternalServerError500
        default:
          $ref: '#/components/responses/DefaultError'
      tags:
        - Scoring
      description: Score a list of rows.
      parameters: []
      requestBody:
        content:
          application/json:
            schema:
              $ref: '#/components/schemas/ScoreRowsRequest'
        required: true
  /v1/post-training/supervised-fine-tune:
    post:
      responses:
@ -6019,7 +6019,7 @@ components:
          description: >-
            A description of the scoring function type. - E.g. Write your custom judge
            prompt to score the answer.
-        supported_purposes:
+        supported_dataset_purposes:
          type: array
          items:
            type: string
@ -6039,7 +6039,7 @@ components:
      required:
        - type
        - description
-        - supported_purposes
+        - supported_dataset_purposes
      title: ScoringFnTypeInfo
    ListScoringFunctionTypesResponse:
      type: object
@ -6982,47 +6982,6 @@ components:
        - attributes_to_save
        - dataset_id
      title: SaveSpansToDatasetRequest
    ScoreRequest:
      type: object
      properties:
        dataset_rows:
          type: array
          items:
            type: object
            additionalProperties:
              oneOf:
                - type: 'null'
                - type: boolean
                - type: number
                - type: string
                - type: array
                - type: object
          description: The rows to score.
        scoring_fn_ids:
          type: array
          items:
            type: string
          description: >-
            The scoring function ids to use for the scoring.
      additionalProperties: false
      required:
        - dataset_rows
        - scoring_fn_ids
      title: ScoreRequest
    ScoreResponse:
      type: object
      properties:
        results:
          type: object
          additionalProperties:
            $ref: '#/components/schemas/ScoringResult'
          description: >-
            A map of scoring function name to ScoringResult.
      additionalProperties: false
      required:
        - results
      title: ScoreResponse
      description: The response from scoring.
    ScoreDatasetRequest:
      type: object
      properties:
@ -7050,6 +7009,47 @@ components:
      required:
        - results
      title: ScoreBatchResponse
    ScoreRowsRequest:
      type: object
      properties:
        dataset_rows:
          type: array
          items:
            type: object
            additionalProperties:
              oneOf:
                - type: 'null'
                - type: boolean
                - type: number
                - type: string
                - type: array
                - type: object
          description: The rows to score.
        scoring_fn_ids:
          type: array
          items:
            type: string
          description: >-
            The scoring function ids to use for the scoring.
      additionalProperties: false
      required:
        - dataset_rows
        - scoring_fn_ids
      title: ScoreRowsRequest
    ScoreResponse:
      type: object
      properties:
        results:
          type: object
          additionalProperties:
            $ref: '#/components/schemas/ScoringResult'
          description: >-
            A map of scoring function name to ScoringResult.
      additionalProperties: false
      required:
        - results
      title: ScoreResponse
      description: The response from scoring.
    AlgorithmConfig:
      oneOf:
        - $ref: '#/components/schemas/LoraFinetuningConfig'
--- a/llama_stack/apis/scoring_functions/scoring_functions.py
+++ b/llama_stack/apis/scoring_functions/scoring_functions.py
@ -12,16 +12,17 @@ from typing import (
    Literal,
    Optional,
    Protocol,
    Union,
    runtime_checkable,
    Union,
 )
 from pydantic import BaseModel, Field
 from typing_extensions import Annotated
 from llama_stack.apis.datasets import DatasetPurpose
 from llama_stack.apis.resource import Resource, ResourceType
 from llama_stack.schema_utils import json_schema_type, register_schema, webmethod
 from llama_stack.apis.datasets import DatasetPurpose
 # Perhaps more structure can be imposed on these functions. Maybe they could be associated
 # with standard metrics so they can be rolled up?
@ -93,6 +94,7 @@ class RegexParserScoringFnParams(BaseModel):
        default_factory=list,
    )
 class CustomLLMAsJudgeScoringFnParams(BaseModel):
    type: Literal["custom_llm_as_judge"] = "custom_llm_as_judge"
    judge_model: str
@ -102,6 +104,7 @@ class CustomLLMAsJudgeScoringFnParams(BaseModel):
        default_factory=list,
    )
@json_schema_type
 class RegexParserScoringFn(BaseModel):
    type: Literal["regex_parser"] = "regex_parser"
@ -113,36 +116,43 @@ class RegexParserMathScoringFn(BaseModel):
    type: Literal["regex_parser_math_response"] = "regex_parser_math_response"
    regex_parser_math_response: RegexParserScoringFnParams
@json_schema_type
 class EqualityScoringFn(BaseModel):
    type: Literal["equality"] = "equality"
    equality: BasicScoringFnParams
@json_schema_type
 class SubsetOfScoringFn(BaseModel):
    type: Literal["subset_of"] = "subset_of"
    subset_of: BasicScoringFnParams
@json_schema_type
 class FactualityScoringFn(BaseModel):
    type: Literal["factuality"] = "factuality"
    factuality: BasicScoringFnParams
@json_schema_type
 class FaithfulnessScoringFn(BaseModel):
    type: Literal["faithfulness"] = "faithfulness"
    faithfulness: BasicScoringFnParams
@json_schema_type
 class AnswerCorrectnessScoringFn(BaseModel):
    type: Literal["answer_correctness"] = "answer_correctness"
    answer_correctness: BasicScoringFnParams
@json_schema_type
 class AnswerRelevancyScoringFn(BaseModel):
    type: Literal["answer_relevancy"] = "answer_relevancy"
    answer_relevancy: BasicScoringFnParams
@json_schema_type
 class AnswerSimilarityScoringFn(BaseModel):
    type: Literal["answer_similarity"] = "answer_similarity"
@ -205,9 +215,10 @@ ScoringFnDefinition = register_schema(
 class CommonScoringFnFields(BaseModel):
    """
-    :param fn: The scoring function type and parameters. 
+    :param fn: The scoring function type and parameters.
    :param metadata: (Optional) Any additional metadata for this definition (e.g. description).
    """
    fn: ScoringFnDefinition
    metadata: Dict[str, Any] = Field(
        default_factory=dict,
@ -217,7 +228,9 @@ class CommonScoringFnFields(BaseModel):
@json_schema_type
 class ScoringFn(CommonScoringFnFields, Resource):
-    type: Literal[ResourceType.scoring_function.value] = ResourceType.scoring_function.value
+    type: Literal[ResourceType.scoring_function.value] = (
        ResourceType.scoring_function.value
    )
    @property
    def scoring_fn_id(self) -> str:
@ -231,14 +244,15 @@ class ScoringFn(CommonScoringFnFields, Resource):
@json_schema_type
 class ScoringFnTypeInfo(BaseModel):
    """
-    :param type: The type of scoring function. 
+    :param type: The type of scoring function.
-    :param description: A description of the scoring function type. 
+    :param description: A description of the scoring function type.
-        - E.g. Write your custom judge prompt to score the answer. 
+        - E.g. Write your custom judge prompt to score the answer.
-    :param supported_purposes: The purposes that this scoring function can be used for.
+    :param supported_dataset_purposes: The purposes that this scoring function can be used for.
    """
    type: ScoringFunctionType
    description: str
-    supported_purposes: List[DatasetPurpose] = Field(
+    supported_dataset_purposes: List[DatasetPurpose] = Field(
        description="The supported purposes (supported dataset schema) that this scoring function can be used for. E.g. eval/question-answer",
        default_factory=list,
    )
@ -261,16 +275,16 @@ class ListScoringFunctionTypesResponse(BaseModel):
@runtime_checkable
 class ScoringFunctions(Protocol):
    @webmethod(route="/scoring-functions", method="GET")
-    async def list_scoring_functions(self) -> ListScoringFunctionsResponse: 
+    async def list_scoring_functions(self) -> ListScoringFunctionsResponse:
        """
        List all registered scoring functions.
        """
        ...
    @webmethod(route="/scoring-functions/types", method="GET")
-    async def list_scoring_function_types(self) -> ListScoringFunctionTypesResponse: 
+    async def list_scoring_function_types(self) -> ListScoringFunctionTypesResponse:
        """
-        List all available scoring function types information and how to use them. 
+        List all available scoring function types information and how to use them.
        """
        ...
@ -278,7 +292,7 @@ class ScoringFunctions(Protocol):
    async def get_scoring_function(
        self,
        scoring_fn_id: str,
-    ) -> Optional[ScoringFn]: 
+    ) -> Optional[ScoringFn]:
        """
        Get a scoring function by its ID.
        :param scoring_fn_id: The ID of the scoring function to get.
@ -302,12 +316,12 @@ class ScoringFunctions(Protocol):
            - E.g. {"description": "This scoring function is used for ..."}
        """
        ...
-    
+
    @webmethod(route="/scoring-functions/{scoring_fn_id:path}", method="DELETE")
    async def unregister_scoring_function(
        self,
        scoring_fn_id: str,
-    ) -> None: 
+    ) -> None:
        """
        Unregister a scoring function by its ID.
        :param scoring_fn_id: The ID of the scoring function to unregister.