mirror of
https://github.com/meta-llama/llama-stack.git
synced 2025-08-07 11:08:20 +00:00
score
This commit is contained in:
parent
2cf769e05e
commit
819ffe0518
3 changed files with 190 additions and 176 deletions
174
docs/_static/llama-stack-spec.html
vendored
174
docs/_static/llama-stack-spec.html
vendored
|
@ -962,7 +962,7 @@
|
|||
}
|
||||
}
|
||||
},
|
||||
"/v1/eval/rows": {
|
||||
"/v1/eval/evaluate_rows": {
|
||||
"post": {
|
||||
"responses": {
|
||||
"200": {
|
||||
|
@ -3631,49 +3631,6 @@
|
|||
}
|
||||
}
|
||||
},
|
||||
"/v1/scoring/rows": {
|
||||
"post": {
|
||||
"responses": {
|
||||
"200": {
|
||||
"description": "ScoreResponse object containing rows and aggregated results",
|
||||
"content": {
|
||||
"application/json": {
|
||||
"schema": {
|
||||
"$ref": "#/components/schemas/ScoreResponse"
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"400": {
|
||||
"$ref": "#/components/responses/BadRequest400"
|
||||
},
|
||||
"429": {
|
||||
"$ref": "#/components/responses/TooManyRequests429"
|
||||
},
|
||||
"500": {
|
||||
"$ref": "#/components/responses/InternalServerError500"
|
||||
},
|
||||
"default": {
|
||||
"$ref": "#/components/responses/DefaultError"
|
||||
}
|
||||
},
|
||||
"tags": [
|
||||
"Scoring"
|
||||
],
|
||||
"description": "Score a list of rows.",
|
||||
"parameters": [],
|
||||
"requestBody": {
|
||||
"content": {
|
||||
"application/json": {
|
||||
"schema": {
|
||||
"$ref": "#/components/schemas/ScoreRequest"
|
||||
}
|
||||
}
|
||||
},
|
||||
"required": true
|
||||
}
|
||||
}
|
||||
},
|
||||
"/v1/scoring/jobs": {
|
||||
"post": {
|
||||
"responses": {
|
||||
|
@ -3717,6 +3674,49 @@
|
|||
}
|
||||
}
|
||||
},
|
||||
"/v1/scoring/score-rows": {
|
||||
"post": {
|
||||
"responses": {
|
||||
"200": {
|
||||
"description": "ScoreResponse object containing rows and aggregated results",
|
||||
"content": {
|
||||
"application/json": {
|
||||
"schema": {
|
||||
"$ref": "#/components/schemas/ScoreResponse"
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"400": {
|
||||
"$ref": "#/components/responses/BadRequest400"
|
||||
},
|
||||
"429": {
|
||||
"$ref": "#/components/responses/TooManyRequests429"
|
||||
},
|
||||
"500": {
|
||||
"$ref": "#/components/responses/InternalServerError500"
|
||||
},
|
||||
"default": {
|
||||
"$ref": "#/components/responses/DefaultError"
|
||||
}
|
||||
},
|
||||
"tags": [
|
||||
"Scoring"
|
||||
],
|
||||
"description": "Score a list of rows.",
|
||||
"parameters": [],
|
||||
"requestBody": {
|
||||
"content": {
|
||||
"application/json": {
|
||||
"schema": {
|
||||
"$ref": "#/components/schemas/ScoreRowsRequest"
|
||||
}
|
||||
}
|
||||
},
|
||||
"required": true
|
||||
}
|
||||
}
|
||||
},
|
||||
"/v1/post-training/supervised-fine-tune": {
|
||||
"post": {
|
||||
"responses": {
|
||||
|
@ -8714,7 +8714,7 @@
|
|||
"type": "string",
|
||||
"description": "A description of the scoring function type. - E.g. Write your custom judge prompt to score the answer."
|
||||
},
|
||||
"supported_purposes": {
|
||||
"supported_dataset_purposes": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "string",
|
||||
|
@ -8736,7 +8736,7 @@
|
|||
"required": [
|
||||
"type",
|
||||
"description",
|
||||
"supported_purposes"
|
||||
"supported_dataset_purposes"
|
||||
],
|
||||
"title": "ScoringFnTypeInfo"
|
||||
},
|
||||
|
@ -10181,7 +10181,46 @@
|
|||
],
|
||||
"title": "SaveSpansToDatasetRequest"
|
||||
},
|
||||
"ScoreRequest": {
|
||||
"ScoreDatasetRequest": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"dataset_id": {
|
||||
"type": "string"
|
||||
},
|
||||
"scoring_fn_ids": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "string"
|
||||
}
|
||||
}
|
||||
},
|
||||
"additionalProperties": false,
|
||||
"required": [
|
||||
"dataset_id",
|
||||
"scoring_fn_ids"
|
||||
],
|
||||
"title": "ScoreDatasetRequest"
|
||||
},
|
||||
"ScoreBatchResponse": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"dataset_id": {
|
||||
"type": "string"
|
||||
},
|
||||
"results": {
|
||||
"type": "object",
|
||||
"additionalProperties": {
|
||||
"$ref": "#/components/schemas/ScoringResult"
|
||||
}
|
||||
}
|
||||
},
|
||||
"additionalProperties": false,
|
||||
"required": [
|
||||
"results"
|
||||
],
|
||||
"title": "ScoreBatchResponse"
|
||||
},
|
||||
"ScoreRowsRequest": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"dataset_rows": {
|
||||
|
@ -10226,7 +10265,7 @@
|
|||
"dataset_rows",
|
||||
"scoring_fn_ids"
|
||||
],
|
||||
"title": "ScoreRequest"
|
||||
"title": "ScoreRowsRequest"
|
||||
},
|
||||
"ScoreResponse": {
|
||||
"type": "object",
|
||||
|
@ -10246,45 +10285,6 @@
|
|||
"title": "ScoreResponse",
|
||||
"description": "The response from scoring."
|
||||
},
|
||||
"ScoreDatasetRequest": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"dataset_id": {
|
||||
"type": "string"
|
||||
},
|
||||
"scoring_fn_ids": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "string"
|
||||
}
|
||||
}
|
||||
},
|
||||
"additionalProperties": false,
|
||||
"required": [
|
||||
"dataset_id",
|
||||
"scoring_fn_ids"
|
||||
],
|
||||
"title": "ScoreDatasetRequest"
|
||||
},
|
||||
"ScoreBatchResponse": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"dataset_id": {
|
||||
"type": "string"
|
||||
},
|
||||
"results": {
|
||||
"type": "object",
|
||||
"additionalProperties": {
|
||||
"$ref": "#/components/schemas/ScoringResult"
|
||||
}
|
||||
}
|
||||
},
|
||||
"additionalProperties": false,
|
||||
"required": [
|
||||
"results"
|
||||
],
|
||||
"title": "ScoreBatchResponse"
|
||||
},
|
||||
"AlgorithmConfig": {
|
||||
"oneOf": [
|
||||
{
|
||||
|
|
148
docs/_static/llama-stack-spec.yaml
vendored
148
docs/_static/llama-stack-spec.yaml
vendored
|
@ -659,7 +659,7 @@ paths:
|
|||
schema:
|
||||
$ref: '#/components/schemas/EvaluateBenchmarkRequest'
|
||||
required: true
|
||||
/v1/eval/rows:
|
||||
/v1/eval/evaluate_rows:
|
||||
post:
|
||||
responses:
|
||||
'200':
|
||||
|
@ -2467,36 +2467,6 @@ paths:
|
|||
schema:
|
||||
$ref: '#/components/schemas/SaveSpansToDatasetRequest'
|
||||
required: true
|
||||
/v1/scoring/rows:
|
||||
post:
|
||||
responses:
|
||||
'200':
|
||||
description: >-
|
||||
ScoreResponse object containing rows and aggregated results
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: '#/components/schemas/ScoreResponse'
|
||||
'400':
|
||||
$ref: '#/components/responses/BadRequest400'
|
||||
'429':
|
||||
$ref: >-
|
||||
#/components/responses/TooManyRequests429
|
||||
'500':
|
||||
$ref: >-
|
||||
#/components/responses/InternalServerError500
|
||||
default:
|
||||
$ref: '#/components/responses/DefaultError'
|
||||
tags:
|
||||
- Scoring
|
||||
description: Score a list of rows.
|
||||
parameters: []
|
||||
requestBody:
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: '#/components/schemas/ScoreRequest'
|
||||
required: true
|
||||
/v1/scoring/jobs:
|
||||
post:
|
||||
responses:
|
||||
|
@ -2526,6 +2496,36 @@ paths:
|
|||
schema:
|
||||
$ref: '#/components/schemas/ScoreDatasetRequest'
|
||||
required: true
|
||||
/v1/scoring/score-rows:
|
||||
post:
|
||||
responses:
|
||||
'200':
|
||||
description: >-
|
||||
ScoreResponse object containing rows and aggregated results
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: '#/components/schemas/ScoreResponse'
|
||||
'400':
|
||||
$ref: '#/components/responses/BadRequest400'
|
||||
'429':
|
||||
$ref: >-
|
||||
#/components/responses/TooManyRequests429
|
||||
'500':
|
||||
$ref: >-
|
||||
#/components/responses/InternalServerError500
|
||||
default:
|
||||
$ref: '#/components/responses/DefaultError'
|
||||
tags:
|
||||
- Scoring
|
||||
description: Score a list of rows.
|
||||
parameters: []
|
||||
requestBody:
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: '#/components/schemas/ScoreRowsRequest'
|
||||
required: true
|
||||
/v1/post-training/supervised-fine-tune:
|
||||
post:
|
||||
responses:
|
||||
|
@ -6019,7 +6019,7 @@ components:
|
|||
description: >-
|
||||
A description of the scoring function type. - E.g. Write your custom judge
|
||||
prompt to score the answer.
|
||||
supported_purposes:
|
||||
supported_dataset_purposes:
|
||||
type: array
|
||||
items:
|
||||
type: string
|
||||
|
@ -6039,7 +6039,7 @@ components:
|
|||
required:
|
||||
- type
|
||||
- description
|
||||
- supported_purposes
|
||||
- supported_dataset_purposes
|
||||
title: ScoringFnTypeInfo
|
||||
ListScoringFunctionTypesResponse:
|
||||
type: object
|
||||
|
@ -6982,47 +6982,6 @@ components:
|
|||
- attributes_to_save
|
||||
- dataset_id
|
||||
title: SaveSpansToDatasetRequest
|
||||
ScoreRequest:
|
||||
type: object
|
||||
properties:
|
||||
dataset_rows:
|
||||
type: array
|
||||
items:
|
||||
type: object
|
||||
additionalProperties:
|
||||
oneOf:
|
||||
- type: 'null'
|
||||
- type: boolean
|
||||
- type: number
|
||||
- type: string
|
||||
- type: array
|
||||
- type: object
|
||||
description: The rows to score.
|
||||
scoring_fn_ids:
|
||||
type: array
|
||||
items:
|
||||
type: string
|
||||
description: >-
|
||||
The scoring function ids to use for the scoring.
|
||||
additionalProperties: false
|
||||
required:
|
||||
- dataset_rows
|
||||
- scoring_fn_ids
|
||||
title: ScoreRequest
|
||||
ScoreResponse:
|
||||
type: object
|
||||
properties:
|
||||
results:
|
||||
type: object
|
||||
additionalProperties:
|
||||
$ref: '#/components/schemas/ScoringResult'
|
||||
description: >-
|
||||
A map of scoring function name to ScoringResult.
|
||||
additionalProperties: false
|
||||
required:
|
||||
- results
|
||||
title: ScoreResponse
|
||||
description: The response from scoring.
|
||||
ScoreDatasetRequest:
|
||||
type: object
|
||||
properties:
|
||||
|
@ -7050,6 +7009,47 @@ components:
|
|||
required:
|
||||
- results
|
||||
title: ScoreBatchResponse
|
||||
ScoreRowsRequest:
|
||||
type: object
|
||||
properties:
|
||||
dataset_rows:
|
||||
type: array
|
||||
items:
|
||||
type: object
|
||||
additionalProperties:
|
||||
oneOf:
|
||||
- type: 'null'
|
||||
- type: boolean
|
||||
- type: number
|
||||
- type: string
|
||||
- type: array
|
||||
- type: object
|
||||
description: The rows to score.
|
||||
scoring_fn_ids:
|
||||
type: array
|
||||
items:
|
||||
type: string
|
||||
description: >-
|
||||
The scoring function ids to use for the scoring.
|
||||
additionalProperties: false
|
||||
required:
|
||||
- dataset_rows
|
||||
- scoring_fn_ids
|
||||
title: ScoreRowsRequest
|
||||
ScoreResponse:
|
||||
type: object
|
||||
properties:
|
||||
results:
|
||||
type: object
|
||||
additionalProperties:
|
||||
$ref: '#/components/schemas/ScoringResult'
|
||||
description: >-
|
||||
A map of scoring function name to ScoringResult.
|
||||
additionalProperties: false
|
||||
required:
|
||||
- results
|
||||
title: ScoreResponse
|
||||
description: The response from scoring.
|
||||
AlgorithmConfig:
|
||||
oneOf:
|
||||
- $ref: '#/components/schemas/LoraFinetuningConfig'
|
||||
|
|
|
@ -12,16 +12,17 @@ from typing import (
|
|||
Literal,
|
||||
Optional,
|
||||
Protocol,
|
||||
Union,
|
||||
runtime_checkable,
|
||||
Union,
|
||||
)
|
||||
|
||||
from pydantic import BaseModel, Field
|
||||
from typing_extensions import Annotated
|
||||
|
||||
from llama_stack.apis.datasets import DatasetPurpose
|
||||
|
||||
from llama_stack.apis.resource import Resource, ResourceType
|
||||
from llama_stack.schema_utils import json_schema_type, register_schema, webmethod
|
||||
from llama_stack.apis.datasets import DatasetPurpose
|
||||
|
||||
# Perhaps more structure can be imposed on these functions. Maybe they could be associated
|
||||
# with standard metrics so they can be rolled up?
|
||||
|
@ -93,6 +94,7 @@ class RegexParserScoringFnParams(BaseModel):
|
|||
default_factory=list,
|
||||
)
|
||||
|
||||
|
||||
class CustomLLMAsJudgeScoringFnParams(BaseModel):
|
||||
type: Literal["custom_llm_as_judge"] = "custom_llm_as_judge"
|
||||
judge_model: str
|
||||
|
@ -102,6 +104,7 @@ class CustomLLMAsJudgeScoringFnParams(BaseModel):
|
|||
default_factory=list,
|
||||
)
|
||||
|
||||
|
||||
@json_schema_type
|
||||
class RegexParserScoringFn(BaseModel):
|
||||
type: Literal["regex_parser"] = "regex_parser"
|
||||
|
@ -113,36 +116,43 @@ class RegexParserMathScoringFn(BaseModel):
|
|||
type: Literal["regex_parser_math_response"] = "regex_parser_math_response"
|
||||
regex_parser_math_response: RegexParserScoringFnParams
|
||||
|
||||
|
||||
@json_schema_type
|
||||
class EqualityScoringFn(BaseModel):
|
||||
type: Literal["equality"] = "equality"
|
||||
equality: BasicScoringFnParams
|
||||
|
||||
|
||||
@json_schema_type
|
||||
class SubsetOfScoringFn(BaseModel):
|
||||
type: Literal["subset_of"] = "subset_of"
|
||||
subset_of: BasicScoringFnParams
|
||||
|
||||
|
||||
@json_schema_type
|
||||
class FactualityScoringFn(BaseModel):
|
||||
type: Literal["factuality"] = "factuality"
|
||||
factuality: BasicScoringFnParams
|
||||
|
||||
|
||||
@json_schema_type
|
||||
class FaithfulnessScoringFn(BaseModel):
|
||||
type: Literal["faithfulness"] = "faithfulness"
|
||||
faithfulness: BasicScoringFnParams
|
||||
|
||||
|
||||
@json_schema_type
|
||||
class AnswerCorrectnessScoringFn(BaseModel):
|
||||
type: Literal["answer_correctness"] = "answer_correctness"
|
||||
answer_correctness: BasicScoringFnParams
|
||||
|
||||
|
||||
@json_schema_type
|
||||
class AnswerRelevancyScoringFn(BaseModel):
|
||||
type: Literal["answer_relevancy"] = "answer_relevancy"
|
||||
answer_relevancy: BasicScoringFnParams
|
||||
|
||||
|
||||
@json_schema_type
|
||||
class AnswerSimilarityScoringFn(BaseModel):
|
||||
type: Literal["answer_similarity"] = "answer_similarity"
|
||||
|
@ -208,6 +218,7 @@ class CommonScoringFnFields(BaseModel):
|
|||
:param fn: The scoring function type and parameters.
|
||||
:param metadata: (Optional) Any additional metadata for this definition (e.g. description).
|
||||
"""
|
||||
|
||||
fn: ScoringFnDefinition
|
||||
metadata: Dict[str, Any] = Field(
|
||||
default_factory=dict,
|
||||
|
@ -217,7 +228,9 @@ class CommonScoringFnFields(BaseModel):
|
|||
|
||||
@json_schema_type
|
||||
class ScoringFn(CommonScoringFnFields, Resource):
|
||||
type: Literal[ResourceType.scoring_function.value] = ResourceType.scoring_function.value
|
||||
type: Literal[ResourceType.scoring_function.value] = (
|
||||
ResourceType.scoring_function.value
|
||||
)
|
||||
|
||||
@property
|
||||
def scoring_fn_id(self) -> str:
|
||||
|
@ -234,11 +247,12 @@ class ScoringFnTypeInfo(BaseModel):
|
|||
:param type: The type of scoring function.
|
||||
:param description: A description of the scoring function type.
|
||||
- E.g. Write your custom judge prompt to score the answer.
|
||||
:param supported_purposes: The purposes that this scoring function can be used for.
|
||||
:param supported_dataset_purposes: The purposes that this scoring function can be used for.
|
||||
"""
|
||||
|
||||
type: ScoringFunctionType
|
||||
description: str
|
||||
supported_purposes: List[DatasetPurpose] = Field(
|
||||
supported_dataset_purposes: List[DatasetPurpose] = Field(
|
||||
description="The supported purposes (supported dataset schema) that this scoring function can be used for. E.g. eval/question-answer",
|
||||
default_factory=list,
|
||||
)
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue