diff --git a/docs/_static/llama-stack-spec.html b/docs/_static/llama-stack-spec.html index fd9792bbe..ff2c0d4a3 100644 --- a/docs/_static/llama-stack-spec.html +++ b/docs/_static/llama-stack-spec.html @@ -962,7 +962,7 @@ } } }, - "/v1/eval/rows": { + "/v1/eval/evaluate_rows": { "post": { "responses": { "200": { @@ -3631,49 +3631,6 @@ } } }, - "/v1/scoring/rows": { - "post": { - "responses": { - "200": { - "description": "ScoreResponse object containing rows and aggregated results", - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/ScoreResponse" - } - } - } - }, - "400": { - "$ref": "#/components/responses/BadRequest400" - }, - "429": { - "$ref": "#/components/responses/TooManyRequests429" - }, - "500": { - "$ref": "#/components/responses/InternalServerError500" - }, - "default": { - "$ref": "#/components/responses/DefaultError" - } - }, - "tags": [ - "Scoring" - ], - "description": "Score a list of rows.", - "parameters": [], - "requestBody": { - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/ScoreRequest" - } - } - }, - "required": true - } - } - }, "/v1/scoring/jobs": { "post": { "responses": { @@ -3717,6 +3674,49 @@ } } }, + "/v1/scoring/score-rows": { + "post": { + "responses": { + "200": { + "description": "ScoreResponse object containing rows and aggregated results", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/ScoreResponse" + } + } + } + }, + "400": { + "$ref": "#/components/responses/BadRequest400" + }, + "429": { + "$ref": "#/components/responses/TooManyRequests429" + }, + "500": { + "$ref": "#/components/responses/InternalServerError500" + }, + "default": { + "$ref": "#/components/responses/DefaultError" + } + }, + "tags": [ + "Scoring" + ], + "description": "Score a list of rows.", + "parameters": [], + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/ScoreRowsRequest" + } + } + }, + "required": true + } + } + }, "/v1/post-training/supervised-fine-tune": { "post": { "responses": { @@ -8714,7 +8714,7 @@ "type": "string", "description": "A description of the scoring function type. - E.g. Write your custom judge prompt to score the answer." }, - "supported_purposes": { + "supported_dataset_purposes": { "type": "array", "items": { "type": "string", @@ -8736,7 +8736,7 @@ "required": [ "type", "description", - "supported_purposes" + "supported_dataset_purposes" ], "title": "ScoringFnTypeInfo" }, @@ -10181,7 +10181,46 @@ ], "title": "SaveSpansToDatasetRequest" }, - "ScoreRequest": { + "ScoreDatasetRequest": { + "type": "object", + "properties": { + "dataset_id": { + "type": "string" + }, + "scoring_fn_ids": { + "type": "array", + "items": { + "type": "string" + } + } + }, + "additionalProperties": false, + "required": [ + "dataset_id", + "scoring_fn_ids" + ], + "title": "ScoreDatasetRequest" + }, + "ScoreBatchResponse": { + "type": "object", + "properties": { + "dataset_id": { + "type": "string" + }, + "results": { + "type": "object", + "additionalProperties": { + "$ref": "#/components/schemas/ScoringResult" + } + } + }, + "additionalProperties": false, + "required": [ + "results" + ], + "title": "ScoreBatchResponse" + }, + "ScoreRowsRequest": { "type": "object", "properties": { "dataset_rows": { @@ -10226,7 +10265,7 @@ "dataset_rows", "scoring_fn_ids" ], - "title": "ScoreRequest" + "title": "ScoreRowsRequest" }, "ScoreResponse": { "type": "object", @@ -10246,45 +10285,6 @@ "title": "ScoreResponse", "description": "The response from scoring." }, - "ScoreDatasetRequest": { - "type": "object", - "properties": { - "dataset_id": { - "type": "string" - }, - "scoring_fn_ids": { - "type": "array", - "items": { - "type": "string" - } - } - }, - "additionalProperties": false, - "required": [ - "dataset_id", - "scoring_fn_ids" - ], - "title": "ScoreDatasetRequest" - }, - "ScoreBatchResponse": { - "type": "object", - "properties": { - "dataset_id": { - "type": "string" - }, - "results": { - "type": "object", - "additionalProperties": { - "$ref": "#/components/schemas/ScoringResult" - } - } - }, - "additionalProperties": false, - "required": [ - "results" - ], - "title": "ScoreBatchResponse" - }, "AlgorithmConfig": { "oneOf": [ { diff --git a/docs/_static/llama-stack-spec.yaml b/docs/_static/llama-stack-spec.yaml index 6111d8e8a..aa2cca0f1 100644 --- a/docs/_static/llama-stack-spec.yaml +++ b/docs/_static/llama-stack-spec.yaml @@ -659,7 +659,7 @@ paths: schema: $ref: '#/components/schemas/EvaluateBenchmarkRequest' required: true - /v1/eval/rows: + /v1/eval/evaluate_rows: post: responses: '200': @@ -2467,36 +2467,6 @@ paths: schema: $ref: '#/components/schemas/SaveSpansToDatasetRequest' required: true - /v1/scoring/rows: - post: - responses: - '200': - description: >- - ScoreResponse object containing rows and aggregated results - content: - application/json: - schema: - $ref: '#/components/schemas/ScoreResponse' - '400': - $ref: '#/components/responses/BadRequest400' - '429': - $ref: >- - #/components/responses/TooManyRequests429 - '500': - $ref: >- - #/components/responses/InternalServerError500 - default: - $ref: '#/components/responses/DefaultError' - tags: - - Scoring - description: Score a list of rows. - parameters: [] - requestBody: - content: - application/json: - schema: - $ref: '#/components/schemas/ScoreRequest' - required: true /v1/scoring/jobs: post: responses: @@ -2526,6 +2496,36 @@ paths: schema: $ref: '#/components/schemas/ScoreDatasetRequest' required: true + /v1/scoring/score-rows: + post: + responses: + '200': + description: >- + ScoreResponse object containing rows and aggregated results + content: + application/json: + schema: + $ref: '#/components/schemas/ScoreResponse' + '400': + $ref: '#/components/responses/BadRequest400' + '429': + $ref: >- + #/components/responses/TooManyRequests429 + '500': + $ref: >- + #/components/responses/InternalServerError500 + default: + $ref: '#/components/responses/DefaultError' + tags: + - Scoring + description: Score a list of rows. + parameters: [] + requestBody: + content: + application/json: + schema: + $ref: '#/components/schemas/ScoreRowsRequest' + required: true /v1/post-training/supervised-fine-tune: post: responses: @@ -6019,7 +6019,7 @@ components: description: >- A description of the scoring function type. - E.g. Write your custom judge prompt to score the answer. - supported_purposes: + supported_dataset_purposes: type: array items: type: string @@ -6039,7 +6039,7 @@ components: required: - type - description - - supported_purposes + - supported_dataset_purposes title: ScoringFnTypeInfo ListScoringFunctionTypesResponse: type: object @@ -6982,47 +6982,6 @@ components: - attributes_to_save - dataset_id title: SaveSpansToDatasetRequest - ScoreRequest: - type: object - properties: - dataset_rows: - type: array - items: - type: object - additionalProperties: - oneOf: - - type: 'null' - - type: boolean - - type: number - - type: string - - type: array - - type: object - description: The rows to score. - scoring_fn_ids: - type: array - items: - type: string - description: >- - The scoring function ids to use for the scoring. - additionalProperties: false - required: - - dataset_rows - - scoring_fn_ids - title: ScoreRequest - ScoreResponse: - type: object - properties: - results: - type: object - additionalProperties: - $ref: '#/components/schemas/ScoringResult' - description: >- - A map of scoring function name to ScoringResult. - additionalProperties: false - required: - - results - title: ScoreResponse - description: The response from scoring. ScoreDatasetRequest: type: object properties: @@ -7050,6 +7009,47 @@ components: required: - results title: ScoreBatchResponse + ScoreRowsRequest: + type: object + properties: + dataset_rows: + type: array + items: + type: object + additionalProperties: + oneOf: + - type: 'null' + - type: boolean + - type: number + - type: string + - type: array + - type: object + description: The rows to score. + scoring_fn_ids: + type: array + items: + type: string + description: >- + The scoring function ids to use for the scoring. + additionalProperties: false + required: + - dataset_rows + - scoring_fn_ids + title: ScoreRowsRequest + ScoreResponse: + type: object + properties: + results: + type: object + additionalProperties: + $ref: '#/components/schemas/ScoringResult' + description: >- + A map of scoring function name to ScoringResult. + additionalProperties: false + required: + - results + title: ScoreResponse + description: The response from scoring. AlgorithmConfig: oneOf: - $ref: '#/components/schemas/LoraFinetuningConfig' diff --git a/llama_stack/apis/scoring_functions/scoring_functions.py b/llama_stack/apis/scoring_functions/scoring_functions.py index 0e7ec4354..a9d00a7f0 100644 --- a/llama_stack/apis/scoring_functions/scoring_functions.py +++ b/llama_stack/apis/scoring_functions/scoring_functions.py @@ -12,16 +12,17 @@ from typing import ( Literal, Optional, Protocol, - Union, runtime_checkable, + Union, ) from pydantic import BaseModel, Field from typing_extensions import Annotated +from llama_stack.apis.datasets import DatasetPurpose + from llama_stack.apis.resource import Resource, ResourceType from llama_stack.schema_utils import json_schema_type, register_schema, webmethod -from llama_stack.apis.datasets import DatasetPurpose # Perhaps more structure can be imposed on these functions. Maybe they could be associated # with standard metrics so they can be rolled up? @@ -93,6 +94,7 @@ class RegexParserScoringFnParams(BaseModel): default_factory=list, ) + class CustomLLMAsJudgeScoringFnParams(BaseModel): type: Literal["custom_llm_as_judge"] = "custom_llm_as_judge" judge_model: str @@ -102,6 +104,7 @@ class CustomLLMAsJudgeScoringFnParams(BaseModel): default_factory=list, ) + @json_schema_type class RegexParserScoringFn(BaseModel): type: Literal["regex_parser"] = "regex_parser" @@ -113,36 +116,43 @@ class RegexParserMathScoringFn(BaseModel): type: Literal["regex_parser_math_response"] = "regex_parser_math_response" regex_parser_math_response: RegexParserScoringFnParams + @json_schema_type class EqualityScoringFn(BaseModel): type: Literal["equality"] = "equality" equality: BasicScoringFnParams + @json_schema_type class SubsetOfScoringFn(BaseModel): type: Literal["subset_of"] = "subset_of" subset_of: BasicScoringFnParams + @json_schema_type class FactualityScoringFn(BaseModel): type: Literal["factuality"] = "factuality" factuality: BasicScoringFnParams + @json_schema_type class FaithfulnessScoringFn(BaseModel): type: Literal["faithfulness"] = "faithfulness" faithfulness: BasicScoringFnParams + @json_schema_type class AnswerCorrectnessScoringFn(BaseModel): type: Literal["answer_correctness"] = "answer_correctness" answer_correctness: BasicScoringFnParams + @json_schema_type class AnswerRelevancyScoringFn(BaseModel): type: Literal["answer_relevancy"] = "answer_relevancy" answer_relevancy: BasicScoringFnParams + @json_schema_type class AnswerSimilarityScoringFn(BaseModel): type: Literal["answer_similarity"] = "answer_similarity" @@ -205,9 +215,10 @@ ScoringFnDefinition = register_schema( class CommonScoringFnFields(BaseModel): """ - :param fn: The scoring function type and parameters. + :param fn: The scoring function type and parameters. :param metadata: (Optional) Any additional metadata for this definition (e.g. description). """ + fn: ScoringFnDefinition metadata: Dict[str, Any] = Field( default_factory=dict, @@ -217,7 +228,9 @@ class CommonScoringFnFields(BaseModel): @json_schema_type class ScoringFn(CommonScoringFnFields, Resource): - type: Literal[ResourceType.scoring_function.value] = ResourceType.scoring_function.value + type: Literal[ResourceType.scoring_function.value] = ( + ResourceType.scoring_function.value + ) @property def scoring_fn_id(self) -> str: @@ -231,14 +244,15 @@ class ScoringFn(CommonScoringFnFields, Resource): @json_schema_type class ScoringFnTypeInfo(BaseModel): """ - :param type: The type of scoring function. - :param description: A description of the scoring function type. - - E.g. Write your custom judge prompt to score the answer. - :param supported_purposes: The purposes that this scoring function can be used for. + :param type: The type of scoring function. + :param description: A description of the scoring function type. + - E.g. Write your custom judge prompt to score the answer. + :param supported_dataset_purposes: The purposes that this scoring function can be used for. """ + type: ScoringFunctionType description: str - supported_purposes: List[DatasetPurpose] = Field( + supported_dataset_purposes: List[DatasetPurpose] = Field( description="The supported purposes (supported dataset schema) that this scoring function can be used for. E.g. eval/question-answer", default_factory=list, ) @@ -261,16 +275,16 @@ class ListScoringFunctionTypesResponse(BaseModel): @runtime_checkable class ScoringFunctions(Protocol): @webmethod(route="/scoring-functions", method="GET") - async def list_scoring_functions(self) -> ListScoringFunctionsResponse: + async def list_scoring_functions(self) -> ListScoringFunctionsResponse: """ List all registered scoring functions. """ ... @webmethod(route="/scoring-functions/types", method="GET") - async def list_scoring_function_types(self) -> ListScoringFunctionTypesResponse: + async def list_scoring_function_types(self) -> ListScoringFunctionTypesResponse: """ - List all available scoring function types information and how to use them. + List all available scoring function types information and how to use them. """ ... @@ -278,7 +292,7 @@ class ScoringFunctions(Protocol): async def get_scoring_function( self, scoring_fn_id: str, - ) -> Optional[ScoringFn]: + ) -> Optional[ScoringFn]: """ Get a scoring function by its ID. :param scoring_fn_id: The ID of the scoring function to get. @@ -302,12 +316,12 @@ class ScoringFunctions(Protocol): - E.g. {"description": "This scoring function is used for ..."} """ ... - + @webmethod(route="/scoring-functions/{scoring_fn_id:path}", method="DELETE") async def unregister_scoring_function( self, scoring_fn_id: str, - ) -> None: + ) -> None: """ Unregister a scoring function by its ID. :param scoring_fn_id: The ID of the scoring function to unregister.