diff --git a/docs/_static/llama-stack-spec.html b/docs/_static/llama-stack-spec.html
index fd9792bbe..ff2c0d4a3 100644
--- a/docs/_static/llama-stack-spec.html
+++ b/docs/_static/llama-stack-spec.html
@@ -962,7 +962,7 @@
}
}
},
- "/v1/eval/rows": {
+ "/v1/eval/evaluate_rows": {
"post": {
"responses": {
"200": {
@@ -3631,49 +3631,6 @@
}
}
},
- "/v1/scoring/rows": {
- "post": {
- "responses": {
- "200": {
- "description": "ScoreResponse object containing rows and aggregated results",
- "content": {
- "application/json": {
- "schema": {
- "$ref": "#/components/schemas/ScoreResponse"
- }
- }
- }
- },
- "400": {
- "$ref": "#/components/responses/BadRequest400"
- },
- "429": {
- "$ref": "#/components/responses/TooManyRequests429"
- },
- "500": {
- "$ref": "#/components/responses/InternalServerError500"
- },
- "default": {
- "$ref": "#/components/responses/DefaultError"
- }
- },
- "tags": [
- "Scoring"
- ],
- "description": "Score a list of rows.",
- "parameters": [],
- "requestBody": {
- "content": {
- "application/json": {
- "schema": {
- "$ref": "#/components/schemas/ScoreRequest"
- }
- }
- },
- "required": true
- }
- }
- },
"/v1/scoring/jobs": {
"post": {
"responses": {
@@ -3717,6 +3674,49 @@
}
}
},
+ "/v1/scoring/score-rows": {
+ "post": {
+ "responses": {
+ "200": {
+ "description": "ScoreResponse object containing rows and aggregated results",
+ "content": {
+ "application/json": {
+ "schema": {
+ "$ref": "#/components/schemas/ScoreResponse"
+ }
+ }
+ }
+ },
+ "400": {
+ "$ref": "#/components/responses/BadRequest400"
+ },
+ "429": {
+ "$ref": "#/components/responses/TooManyRequests429"
+ },
+ "500": {
+ "$ref": "#/components/responses/InternalServerError500"
+ },
+ "default": {
+ "$ref": "#/components/responses/DefaultError"
+ }
+ },
+ "tags": [
+ "Scoring"
+ ],
+ "description": "Score a list of rows.",
+ "parameters": [],
+ "requestBody": {
+ "content": {
+ "application/json": {
+ "schema": {
+ "$ref": "#/components/schemas/ScoreRowsRequest"
+ }
+ }
+ },
+ "required": true
+ }
+ }
+ },
"/v1/post-training/supervised-fine-tune": {
"post": {
"responses": {
@@ -8714,7 +8714,7 @@
"type": "string",
"description": "A description of the scoring function type. - E.g. Write your custom judge prompt to score the answer."
},
- "supported_purposes": {
+ "supported_dataset_purposes": {
"type": "array",
"items": {
"type": "string",
@@ -8736,7 +8736,7 @@
"required": [
"type",
"description",
- "supported_purposes"
+ "supported_dataset_purposes"
],
"title": "ScoringFnTypeInfo"
},
@@ -10181,7 +10181,46 @@
],
"title": "SaveSpansToDatasetRequest"
},
- "ScoreRequest": {
+ "ScoreDatasetRequest": {
+ "type": "object",
+ "properties": {
+ "dataset_id": {
+ "type": "string"
+ },
+ "scoring_fn_ids": {
+ "type": "array",
+ "items": {
+ "type": "string"
+ }
+ }
+ },
+ "additionalProperties": false,
+ "required": [
+ "dataset_id",
+ "scoring_fn_ids"
+ ],
+ "title": "ScoreDatasetRequest"
+ },
+ "ScoreBatchResponse": {
+ "type": "object",
+ "properties": {
+ "dataset_id": {
+ "type": "string"
+ },
+ "results": {
+ "type": "object",
+ "additionalProperties": {
+ "$ref": "#/components/schemas/ScoringResult"
+ }
+ }
+ },
+ "additionalProperties": false,
+ "required": [
+ "results"
+ ],
+ "title": "ScoreBatchResponse"
+ },
+ "ScoreRowsRequest": {
"type": "object",
"properties": {
"dataset_rows": {
@@ -10226,7 +10265,7 @@
"dataset_rows",
"scoring_fn_ids"
],
- "title": "ScoreRequest"
+ "title": "ScoreRowsRequest"
},
"ScoreResponse": {
"type": "object",
@@ -10246,45 +10285,6 @@
"title": "ScoreResponse",
"description": "The response from scoring."
},
- "ScoreDatasetRequest": {
- "type": "object",
- "properties": {
- "dataset_id": {
- "type": "string"
- },
- "scoring_fn_ids": {
- "type": "array",
- "items": {
- "type": "string"
- }
- }
- },
- "additionalProperties": false,
- "required": [
- "dataset_id",
- "scoring_fn_ids"
- ],
- "title": "ScoreDatasetRequest"
- },
- "ScoreBatchResponse": {
- "type": "object",
- "properties": {
- "dataset_id": {
- "type": "string"
- },
- "results": {
- "type": "object",
- "additionalProperties": {
- "$ref": "#/components/schemas/ScoringResult"
- }
- }
- },
- "additionalProperties": false,
- "required": [
- "results"
- ],
- "title": "ScoreBatchResponse"
- },
"AlgorithmConfig": {
"oneOf": [
{
diff --git a/docs/_static/llama-stack-spec.yaml b/docs/_static/llama-stack-spec.yaml
index 6111d8e8a..aa2cca0f1 100644
--- a/docs/_static/llama-stack-spec.yaml
+++ b/docs/_static/llama-stack-spec.yaml
@@ -659,7 +659,7 @@ paths:
schema:
$ref: '#/components/schemas/EvaluateBenchmarkRequest'
required: true
- /v1/eval/rows:
+ /v1/eval/evaluate_rows:
post:
responses:
'200':
@@ -2467,36 +2467,6 @@ paths:
schema:
$ref: '#/components/schemas/SaveSpansToDatasetRequest'
required: true
- /v1/scoring/rows:
- post:
- responses:
- '200':
- description: >-
- ScoreResponse object containing rows and aggregated results
- content:
- application/json:
- schema:
- $ref: '#/components/schemas/ScoreResponse'
- '400':
- $ref: '#/components/responses/BadRequest400'
- '429':
- $ref: >-
- #/components/responses/TooManyRequests429
- '500':
- $ref: >-
- #/components/responses/InternalServerError500
- default:
- $ref: '#/components/responses/DefaultError'
- tags:
- - Scoring
- description: Score a list of rows.
- parameters: []
- requestBody:
- content:
- application/json:
- schema:
- $ref: '#/components/schemas/ScoreRequest'
- required: true
/v1/scoring/jobs:
post:
responses:
@@ -2526,6 +2496,36 @@ paths:
schema:
$ref: '#/components/schemas/ScoreDatasetRequest'
required: true
+ /v1/scoring/score-rows:
+ post:
+ responses:
+ '200':
+ description: >-
+ ScoreResponse object containing rows and aggregated results
+ content:
+ application/json:
+ schema:
+ $ref: '#/components/schemas/ScoreResponse'
+ '400':
+ $ref: '#/components/responses/BadRequest400'
+ '429':
+ $ref: >-
+ #/components/responses/TooManyRequests429
+ '500':
+ $ref: >-
+ #/components/responses/InternalServerError500
+ default:
+ $ref: '#/components/responses/DefaultError'
+ tags:
+ - Scoring
+ description: Score a list of rows.
+ parameters: []
+ requestBody:
+ content:
+ application/json:
+ schema:
+ $ref: '#/components/schemas/ScoreRowsRequest'
+ required: true
/v1/post-training/supervised-fine-tune:
post:
responses:
@@ -6019,7 +6019,7 @@ components:
description: >-
A description of the scoring function type. - E.g. Write your custom judge
prompt to score the answer.
- supported_purposes:
+ supported_dataset_purposes:
type: array
items:
type: string
@@ -6039,7 +6039,7 @@ components:
required:
- type
- description
- - supported_purposes
+ - supported_dataset_purposes
title: ScoringFnTypeInfo
ListScoringFunctionTypesResponse:
type: object
@@ -6982,47 +6982,6 @@ components:
- attributes_to_save
- dataset_id
title: SaveSpansToDatasetRequest
- ScoreRequest:
- type: object
- properties:
- dataset_rows:
- type: array
- items:
- type: object
- additionalProperties:
- oneOf:
- - type: 'null'
- - type: boolean
- - type: number
- - type: string
- - type: array
- - type: object
- description: The rows to score.
- scoring_fn_ids:
- type: array
- items:
- type: string
- description: >-
- The scoring function ids to use for the scoring.
- additionalProperties: false
- required:
- - dataset_rows
- - scoring_fn_ids
- title: ScoreRequest
- ScoreResponse:
- type: object
- properties:
- results:
- type: object
- additionalProperties:
- $ref: '#/components/schemas/ScoringResult'
- description: >-
- A map of scoring function name to ScoringResult.
- additionalProperties: false
- required:
- - results
- title: ScoreResponse
- description: The response from scoring.
ScoreDatasetRequest:
type: object
properties:
@@ -7050,6 +7009,47 @@ components:
required:
- results
title: ScoreBatchResponse
+ ScoreRowsRequest:
+ type: object
+ properties:
+ dataset_rows:
+ type: array
+ items:
+ type: object
+ additionalProperties:
+ oneOf:
+ - type: 'null'
+ - type: boolean
+ - type: number
+ - type: string
+ - type: array
+ - type: object
+ description: The rows to score.
+ scoring_fn_ids:
+ type: array
+ items:
+ type: string
+ description: >-
+ The scoring function ids to use for the scoring.
+ additionalProperties: false
+ required:
+ - dataset_rows
+ - scoring_fn_ids
+ title: ScoreRowsRequest
+ ScoreResponse:
+ type: object
+ properties:
+ results:
+ type: object
+ additionalProperties:
+ $ref: '#/components/schemas/ScoringResult'
+ description: >-
+ A map of scoring function name to ScoringResult.
+ additionalProperties: false
+ required:
+ - results
+ title: ScoreResponse
+ description: The response from scoring.
AlgorithmConfig:
oneOf:
- $ref: '#/components/schemas/LoraFinetuningConfig'
diff --git a/llama_stack/apis/scoring_functions/scoring_functions.py b/llama_stack/apis/scoring_functions/scoring_functions.py
index 0e7ec4354..a9d00a7f0 100644
--- a/llama_stack/apis/scoring_functions/scoring_functions.py
+++ b/llama_stack/apis/scoring_functions/scoring_functions.py
@@ -12,16 +12,17 @@ from typing import (
Literal,
Optional,
Protocol,
- Union,
runtime_checkable,
+ Union,
)
from pydantic import BaseModel, Field
from typing_extensions import Annotated
+from llama_stack.apis.datasets import DatasetPurpose
+
from llama_stack.apis.resource import Resource, ResourceType
from llama_stack.schema_utils import json_schema_type, register_schema, webmethod
-from llama_stack.apis.datasets import DatasetPurpose
# Perhaps more structure can be imposed on these functions. Maybe they could be associated
# with standard metrics so they can be rolled up?
@@ -93,6 +94,7 @@ class RegexParserScoringFnParams(BaseModel):
default_factory=list,
)
+
class CustomLLMAsJudgeScoringFnParams(BaseModel):
type: Literal["custom_llm_as_judge"] = "custom_llm_as_judge"
judge_model: str
@@ -102,6 +104,7 @@ class CustomLLMAsJudgeScoringFnParams(BaseModel):
default_factory=list,
)
+
@json_schema_type
class RegexParserScoringFn(BaseModel):
type: Literal["regex_parser"] = "regex_parser"
@@ -113,36 +116,43 @@ class RegexParserMathScoringFn(BaseModel):
type: Literal["regex_parser_math_response"] = "regex_parser_math_response"
regex_parser_math_response: RegexParserScoringFnParams
+
@json_schema_type
class EqualityScoringFn(BaseModel):
type: Literal["equality"] = "equality"
equality: BasicScoringFnParams
+
@json_schema_type
class SubsetOfScoringFn(BaseModel):
type: Literal["subset_of"] = "subset_of"
subset_of: BasicScoringFnParams
+
@json_schema_type
class FactualityScoringFn(BaseModel):
type: Literal["factuality"] = "factuality"
factuality: BasicScoringFnParams
+
@json_schema_type
class FaithfulnessScoringFn(BaseModel):
type: Literal["faithfulness"] = "faithfulness"
faithfulness: BasicScoringFnParams
+
@json_schema_type
class AnswerCorrectnessScoringFn(BaseModel):
type: Literal["answer_correctness"] = "answer_correctness"
answer_correctness: BasicScoringFnParams
+
@json_schema_type
class AnswerRelevancyScoringFn(BaseModel):
type: Literal["answer_relevancy"] = "answer_relevancy"
answer_relevancy: BasicScoringFnParams
+
@json_schema_type
class AnswerSimilarityScoringFn(BaseModel):
type: Literal["answer_similarity"] = "answer_similarity"
@@ -205,9 +215,10 @@ ScoringFnDefinition = register_schema(
class CommonScoringFnFields(BaseModel):
"""
- :param fn: The scoring function type and parameters.
+ :param fn: The scoring function type and parameters.
:param metadata: (Optional) Any additional metadata for this definition (e.g. description).
"""
+
fn: ScoringFnDefinition
metadata: Dict[str, Any] = Field(
default_factory=dict,
@@ -217,7 +228,9 @@ class CommonScoringFnFields(BaseModel):
@json_schema_type
class ScoringFn(CommonScoringFnFields, Resource):
- type: Literal[ResourceType.scoring_function.value] = ResourceType.scoring_function.value
+ type: Literal[ResourceType.scoring_function.value] = (
+ ResourceType.scoring_function.value
+ )
@property
def scoring_fn_id(self) -> str:
@@ -231,14 +244,15 @@ class ScoringFn(CommonScoringFnFields, Resource):
@json_schema_type
class ScoringFnTypeInfo(BaseModel):
"""
- :param type: The type of scoring function.
- :param description: A description of the scoring function type.
- - E.g. Write your custom judge prompt to score the answer.
- :param supported_purposes: The purposes that this scoring function can be used for.
+ :param type: The type of scoring function.
+ :param description: A description of the scoring function type.
+ - E.g. Write your custom judge prompt to score the answer.
+ :param supported_dataset_purposes: The purposes that this scoring function can be used for.
"""
+
type: ScoringFunctionType
description: str
- supported_purposes: List[DatasetPurpose] = Field(
+ supported_dataset_purposes: List[DatasetPurpose] = Field(
description="The supported purposes (supported dataset schema) that this scoring function can be used for. E.g. eval/question-answer",
default_factory=list,
)
@@ -261,16 +275,16 @@ class ListScoringFunctionTypesResponse(BaseModel):
@runtime_checkable
class ScoringFunctions(Protocol):
@webmethod(route="/scoring-functions", method="GET")
- async def list_scoring_functions(self) -> ListScoringFunctionsResponse:
+ async def list_scoring_functions(self) -> ListScoringFunctionsResponse:
"""
List all registered scoring functions.
"""
...
@webmethod(route="/scoring-functions/types", method="GET")
- async def list_scoring_function_types(self) -> ListScoringFunctionTypesResponse:
+ async def list_scoring_function_types(self) -> ListScoringFunctionTypesResponse:
"""
- List all available scoring function types information and how to use them.
+ List all available scoring function types information and how to use them.
"""
...
@@ -278,7 +292,7 @@ class ScoringFunctions(Protocol):
async def get_scoring_function(
self,
scoring_fn_id: str,
- ) -> Optional[ScoringFn]:
+ ) -> Optional[ScoringFn]:
"""
Get a scoring function by its ID.
:param scoring_fn_id: The ID of the scoring function to get.
@@ -302,12 +316,12 @@ class ScoringFunctions(Protocol):
- E.g. {"description": "This scoring function is used for ..."}
"""
...
-
+
@webmethod(route="/scoring-functions/{scoring_fn_id:path}", method="DELETE")
async def unregister_scoring_function(
self,
scoring_fn_id: str,
- ) -> None:
+ ) -> None:
"""
Unregister a scoring function by its ID.
:param scoring_fn_id: The ID of the scoring function to unregister.