precommit

parent 205a50f10b
commit bf135f38b1

4 changed files with 11 additions and 11 deletions
docs/_static/llama-stack-spec.html (vendored): 2 changes

@@ -8548,7 +8548,7 @@
         },
         "additionalProperties": false,
         "title": "EvaluationTask",
         "description": "A task for evaluation. To specify a task, one of the following must be provided: - `benchmark_id`: Run evaluation task against a benchmark_id. Use this when you have a curated dataset and have settled on the graders. - `dataset_id` and `grader_ids`: Run evaluation task against a dataset_id and a list of grader_ids. Use this when you have datasets and / or are iterating on your graders. - `data_source` and `grader_ids`: Run evaluation task against a data source (e.g. rows, uri, etc.) and a list of grader_ids. Prefer this when you are early in your evaluation cycle and experimenting much more with your data and graders."
       },
       "GradeRequest": {
         "type": "object",
docs/_static/llama-stack-spec.yaml (vendored): 2 changes

@@ -5927,7 +5927,7 @@ components:
         - `benchmark_id`: Run evaluation task against a benchmark_id. Use this when
         you have a curated dataset and have settled on the graders. - `dataset_id`
         and `grader_ids`: Run evaluation task against a dataset_id and a list of grader_ids.
         Use this when you have datasets and / or are iterating on your graders. -
         `data_source` and `grader_ids`: Run evaluation task against a data source
         (e.g. rows, uri, etc.) and a list of grader_ids. Prefer this when you are
         early in your evaluation cycle and experimenting much more with your data
@@ -52,7 +52,7 @@ class EvaluationTask(BaseModel):
     """
     A task for evaluation. To specify a task, one of the following must be provided:
     - `benchmark_id`: Run evaluation task against a benchmark_id. Use this when you have a curated dataset and have settled on the graders.
     - `dataset_id` and `grader_ids`: Run evaluation task against a dataset_id and a list of grader_ids. Use this when you have datasets and / or are iterating on your graders.
     - `data_source` and `grader_ids`: Run evaluation task against a data source (e.g. rows, uri, etc.) and a list of grader_ids. Prefer this when you are early in your evaluation cycle and experimenting much more with your data and graders.

     :param benchmark_id: The benchmark ID to evaluate.
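The docstring above describes three mutually exclusive ways to specify a task. A minimal sketch of what each shape might look like when constructing the model, assuming a simplified field set inferred only from the names in this diff (the real EvaluationTask in the repository may differ); the IDs and row contents are made up for illustration:

    from pydantic import BaseModel

    class EvaluationTask(BaseModel):
        # Hypothetical, pared-down stand-in for the real model; only the
        # fields named in the docstring are included here.
        benchmark_id: str | None = None
        dataset_id: str | None = None
        data_source: dict | None = None
        grader_ids: list[str] | None = None

    # 1. A curated benchmark where the graders are already settled.
    by_benchmark = EvaluationTask(benchmark_id="my-benchmark")

    # 2. A registered dataset while still iterating on the graders.
    by_dataset = EvaluationTask(dataset_id="my-qa-dataset", grader_ids=["equality"])

    # 3. Raw rows as a data source, early in the evaluation cycle.
    by_rows = EvaluationTask(
        data_source={"rows": [{"input": "2 + 2", "expected": "4"}]},
        grader_ids=["equality", "factuality"],
    )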
@@ -20,7 +20,7 @@ from typing import (
 from pydantic import BaseModel, Field

 from llama_stack.apis.datasets import DatasetPurpose
-from llama_stack.apis.resource import Resource, ResourceType
+from llama_stack.apis.resource import Resource
 from llama_stack.schema_utils import json_schema_type, register_schema, webmethod

 from .graders import *  # noqa: F401 F403
@@ -96,37 +96,37 @@ class RegexParserGraderParams(BaseModel):

 @json_schema_type
 class LlmGrader(BaseModel):
-    type: Literal[GraderType.llm.value] = GraderType.llm.value
+    type: Literal["llm"] = "llm"
     llm: LlmGraderParams


 @json_schema_type
 class RegexParserGrader(BaseModel):
-    type: Literal[GraderType.regex_parser.value] = GraderType.regex_parser.value
+    type: Literal["regex_parser"] = "regex_parser"
     regex_parser: RegexParserGraderParams


 @json_schema_type
 class EqualityGrader(BaseModel):
-    type: Literal[GraderType.equality.value] = GraderType.equality.value
+    type: Literal["equality"] = "equality"
     equality: BasicGraderParams


 @json_schema_type
 class SubsetOfGrader(BaseModel):
-    type: Literal[GraderType.subset_of.value] = GraderType.subset_of.value
+    type: Literal["subset_of"] = "subset_of"
     subset_of: BasicGraderParams


 @json_schema_type
 class FactualityGrader(BaseModel):
-    type: Literal[GraderType.factuality.value] = GraderType.factuality.value
+    type: Literal["factuality"] = "factuality"
     factuality: BasicGraderParams


 @json_schema_type
 class FaithfulnessGrader(BaseModel):
-    type: Literal[GraderType.faithfulness.value] = GraderType.faithfulness.value
+    type: Literal["faithfulness"] = "faithfulness"
     faithfulness: BasicGraderParams

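The edits in this hunk replace enum-derived defaults such as Literal[GraderType.llm.value] with plain string literals on each grader's `type` field. A minimal sketch of how such string tags behave as a discriminator, assuming Pydantic v2 and a discriminated union over the grader models; the union alias GraderDefinition and the simplified params classes below are illustrative stand-ins, not the repository's definitions:

    from typing import Annotated, Literal, Union

    from pydantic import BaseModel, Field, TypeAdapter

    class LlmGraderParams(BaseModel):
        # Simplified stand-in; the real params class has its own fields.
        model: str
        prompt: str

    class BasicGraderParams(BaseModel):
        # Simplified stand-in.
        aggregation_functions: list[str] = []

    class LlmGrader(BaseModel):
        type: Literal["llm"] = "llm"
        llm: LlmGraderParams

    class EqualityGrader(BaseModel):
        type: Literal["equality"] = "equality"
        equality: BasicGraderParams

    # The string in `type` selects the concrete model when validating raw payloads.
    GraderDefinition = Annotated[
        Union[LlmGrader, EqualityGrader], Field(discriminator="type")
    ]

    parsed = TypeAdapter(GraderDefinition).validate_python(
        {"type": "equality", "equality": {"aggregation_functions": ["avg"]}}
    )
    print(type(parsed).__name__)  # EqualityGrader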
@@ -157,7 +157,7 @@ class CommonGraderFields(BaseModel):

 @json_schema_type
 class Grader(CommonGraderFields, Resource):
-    type: Literal[ResourceType.grader.value] = ResourceType.grader.value
+    type: Literal["grader"] = "grader"

     @property
     def grader_id(self) -> str:
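A heavily hedged sketch of how a Grader resource with this string-literal type tag might be built and read back; the Resource and CommonGraderFields stand-ins, and the assumption that grader_id simply mirrors the resource identifier, are illustrative guesses, since neither class body appears in this diff:

    from typing import Literal

    from pydantic import BaseModel

    class Resource(BaseModel):
        # Hypothetical stand-in for llama_stack.apis.resource.Resource.
        identifier: str
        provider_id: str

    class CommonGraderFields(BaseModel):
        # Hypothetical stand-in; the real field set is not shown in this hunk.
        grader: dict

    class Grader(CommonGraderFields, Resource):
        type: Literal["grader"] = "grader"

        @property
        def grader_id(self) -> str:
            # Assumption: exposes the underlying resource identifier.
            return self.identifier

    g = Grader(
        identifier="equality",
        provider_id="inline::basic",
        grader={"type": "equality", "equality": {}},
    )
    print(g.grader_id)  # equality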