precommit

This commit is contained in:
Xi Yan 2025-03-23 16:00:48 -07:00
parent 45f6d5cd08
commit 3f8c7a584a
8 changed files with 31 additions and 1037 deletions

View file

@ -6,12 +6,10 @@
"chardet",
"chromadb-client",
"datasets",
"emoji",
"faiss-cpu",
"fastapi",
"fire",
"httpx",
"langdetect",
"matplotlib",
"mcp",
"nltk",
@ -23,7 +21,6 @@
"psycopg2-binary",
"pymongo",
"pypdf",
"pythainlp",
"redis",
"requests",
"scikit-learn",
@ -40,12 +37,10 @@
"chardet",
"chromadb-client",
"datasets",
"emoji",
"faiss-cpu",
"fastapi",
"fire",
"httpx",
"langdetect",
"matplotlib",
"nltk",
"numpy",
@ -56,7 +51,6 @@
"psycopg2-binary",
"pymongo",
"pypdf",
"pythainlp",
"redis",
"requests",
"scikit-learn",
@ -74,12 +68,10 @@
"chardet",
"chromadb-client",
"datasets",
"emoji",
"fastapi",
"fire",
"fireworks-ai",
"httpx",
"langdetect",
"matplotlib",
"mcp",
"nltk",
@ -91,7 +83,6 @@
"psycopg2-binary",
"pymongo",
"pypdf",
"pythainlp",
"redis",
"requests",
"scikit-learn",
@ -111,13 +102,11 @@
"chardet",
"chromadb-client",
"datasets",
"emoji",
"faiss-cpu",
"fastapi",
"fire",
"httpx",
"huggingface_hub",
"langdetect",
"matplotlib",
"nltk",
"numpy",
@ -128,7 +117,6 @@
"psycopg2-binary",
"pymongo",
"pypdf",
"pythainlp",
"redis",
"requests",
"scikit-learn",
@ -146,12 +134,10 @@
"chardet",
"chromadb-client",
"datasets",
"emoji",
"fastapi",
"fire",
"fireworks-ai",
"httpx",
"langdetect",
"litellm",
"matplotlib",
"mcp",
@ -164,7 +150,6 @@
"psycopg2-binary",
"pymongo",
"pypdf",
"pythainlp",
"redis",
"requests",
"scikit-learn",
@ -183,13 +168,11 @@
"chardet",
"chromadb-client",
"datasets",
"emoji",
"faiss-cpu",
"fastapi",
"fire",
"fireworks-ai",
"httpx",
"langdetect",
"matplotlib",
"mcp",
"nltk",
@ -201,7 +184,6 @@
"psycopg2-binary",
"pymongo",
"pypdf",
"pythainlp",
"redis",
"requests",
"scikit-learn",
@ -218,12 +200,10 @@
"blobfile",
"chardet",
"datasets",
"emoji",
"faiss-cpu",
"fastapi",
"fire",
"httpx",
"langdetect",
"litellm",
"matplotlib",
"nltk",
@ -235,7 +215,6 @@
"psycopg2-binary",
"pymongo",
"pypdf",
"pythainlp",
"redis",
"requests",
"scikit-learn",
@ -252,13 +231,11 @@
"chardet",
"chromadb-client",
"datasets",
"emoji",
"faiss-cpu",
"fastapi",
"fire",
"httpx",
"huggingface_hub",
"langdetect",
"matplotlib",
"mcp",
"nltk",
@ -270,7 +247,6 @@
"psycopg2-binary",
"pymongo",
"pypdf",
"pythainlp",
"redis",
"requests",
"scikit-learn",
@ -287,13 +263,11 @@
"chardet",
"chromadb-client",
"datasets",
"emoji",
"faiss-cpu",
"fastapi",
"fire",
"httpx",
"huggingface_hub",
"langdetect",
"matplotlib",
"mcp",
"nltk",
@ -305,7 +279,6 @@
"psycopg2-binary",
"pymongo",
"pypdf",
"pythainlp",
"redis",
"requests",
"scikit-learn",
@ -324,13 +297,11 @@
"chardet",
"chromadb-client",
"datasets",
"emoji",
"fairscale",
"faiss-cpu",
"fastapi",
"fire",
"httpx",
"langdetect",
"lm-format-enforcer",
"matplotlib",
"mcp",
@ -343,7 +314,6 @@
"psycopg2-binary",
"pymongo",
"pypdf",
"pythainlp",
"redis",
"requests",
"scikit-learn",
@ -364,14 +334,12 @@
"chardet",
"chromadb-client",
"datasets",
"emoji",
"fairscale",
"faiss-cpu",
"fastapi",
"fbgemm-gpu",
"fire",
"httpx",
"langdetect",
"lm-format-enforcer",
"matplotlib",
"mcp",
@ -384,7 +352,6 @@
"psycopg2-binary",
"pymongo",
"pypdf",
"pythainlp",
"redis",
"requests",
"scikit-learn",
@ -403,12 +370,10 @@
"aiosqlite",
"blobfile",
"chardet",
"emoji",
"faiss-cpu",
"fastapi",
"fire",
"httpx",
"langdetect",
"matplotlib",
"nltk",
"numpy",
@ -420,7 +385,6 @@
"psycopg2-binary",
"pymongo",
"pypdf",
"pythainlp",
"redis",
"requests",
"scikit-learn",
@ -437,12 +401,10 @@
"chardet",
"chromadb-client",
"datasets",
"emoji",
"faiss-cpu",
"fastapi",
"fire",
"httpx",
"langdetect",
"matplotlib",
"mcp",
"nltk",
@ -455,7 +417,6 @@
"psycopg2-binary",
"pymongo",
"pypdf",
"pythainlp",
"redis",
"requests",
"scikit-learn",
@ -471,11 +432,9 @@
"chardet",
"chromadb-client",
"datasets",
"emoji",
"fastapi",
"fire",
"httpx",
"langdetect",
"litellm",
"matplotlib",
"mcp",
@ -488,7 +447,6 @@
"psycopg2-binary",
"pymongo",
"pypdf",
"pythainlp",
"redis",
"requests",
"scikit-learn",
@ -506,12 +464,10 @@
"chardet",
"chromadb-client",
"datasets",
"emoji",
"faiss-cpu",
"fastapi",
"fire",
"httpx",
"langdetect",
"matplotlib",
"mcp",
"nltk",
@ -523,7 +479,6 @@
"psycopg2-binary",
"pymongo",
"pypdf",
"pythainlp",
"redis",
"requests",
"scikit-learn",
@ -541,12 +496,10 @@
"chardet",
"chromadb-client",
"datasets",
"emoji",
"faiss-cpu",
"fastapi",
"fire",
"httpx",
"langdetect",
"matplotlib",
"mcp",
"nltk",
@ -559,7 +512,6 @@
"psycopg2-binary",
"pymongo",
"pypdf",
"pythainlp",
"redis",
"requests",
"scikit-learn",
@ -607,13 +559,11 @@
"chardet",
"chromadb-client",
"datasets",
"emoji",
"faiss-cpu",
"fastapi",
"fire",
"httpx",
"huggingface_hub",
"langdetect",
"matplotlib",
"mcp",
"nltk",
@ -625,7 +575,6 @@
"psycopg2-binary",
"pymongo",
"pypdf",
"pythainlp",
"redis",
"requests",
"scikit-learn",
@ -643,12 +592,10 @@
"chardet",
"chromadb-client",
"datasets",
"emoji",
"faiss-cpu",
"fastapi",
"fire",
"httpx",
"langdetect",
"matplotlib",
"mcp",
"nltk",
@ -660,7 +607,6 @@
"psycopg2-binary",
"pymongo",
"pypdf",
"pythainlp",
"redis",
"requests",
"scikit-learn",
@ -679,12 +625,10 @@
"chardet",
"chromadb-client",
"datasets",
"emoji",
"faiss-cpu",
"fastapi",
"fire",
"httpx",
"langdetect",
"matplotlib",
"mcp",
"nltk",
@ -696,7 +640,6 @@
"psycopg2-binary",
"pymongo",
"pypdf",
"pythainlp",
"redis",
"requests",
"scikit-learn",

View file

@ -2285,7 +2285,7 @@
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/Job"
"$ref": "#/components/schemas/ListAgentSessionsResponse"
}
}
}
@ -6192,382 +6192,6 @@
"title": "EmbeddingsResponse",
"description": "Response containing generated embeddings."
},
"AgentCandidate": {
"type": "object",
"properties": {
"type": {
"type": "string",
"const": "agent",
"default": "agent"
},
"config": {
"$ref": "#/components/schemas/AgentConfig",
"description": "The configuration for the agent candidate."
}
},
"additionalProperties": false,
"required": [
"type",
"config"
],
"title": "AgentCandidate",
"description": "An agent candidate for evaluation."
},
"AggregationFunctionType": {
"type": "string",
"enum": [
"average",
"weighted_average",
"median",
"categorical_count",
"accuracy"
],
"title": "AggregationFunctionType"
},
"BasicScoringFnParams": {
"type": "object",
"properties": {
"type": {
"type": "string",
"const": "basic",
"default": "basic"
},
"aggregation_functions": {
"type": "array",
"items": {
"$ref": "#/components/schemas/AggregationFunctionType"
}
}
},
"additionalProperties": false,
"required": [
"type"
],
"title": "BasicScoringFnParams"
},
"BenchmarkConfig": {
"type": "object",
"properties": {
"eval_candidate": {
"$ref": "#/components/schemas/EvalCandidate",
"description": "The candidate to evaluate."
},
"scoring_params": {
"type": "object",
"additionalProperties": {
"$ref": "#/components/schemas/ScoringFnParams"
},
"description": "Map between scoring function id and parameters for each scoring function you want to run"
},
"num_examples": {
"type": "integer",
"description": "(Optional) The number of examples to evaluate. If not provided, all examples in the dataset will be evaluated"
}
},
"additionalProperties": false,
"required": [
"eval_candidate",
"scoring_params"
],
"title": "BenchmarkConfig",
"description": "A benchmark configuration for evaluation."
},
"EvalCandidate": {
"oneOf": [
{
"$ref": "#/components/schemas/ModelCandidate"
},
{
"$ref": "#/components/schemas/AgentCandidate"
}
],
"discriminator": {
"propertyName": "type",
"mapping": {
"model": "#/components/schemas/ModelCandidate",
"agent": "#/components/schemas/AgentCandidate"
}
}
},
"LLMAsJudgeScoringFnParams": {
"type": "object",
"properties": {
"type": {
"type": "string",
"const": "llm_as_judge",
"default": "llm_as_judge"
},
"judge_model": {
"type": "string"
},
"prompt_template": {
"type": "string"
},
"judge_score_regexes": {
"type": "array",
"items": {
"type": "string"
}
},
"aggregation_functions": {
"type": "array",
"items": {
"$ref": "#/components/schemas/AggregationFunctionType"
}
}
},
"additionalProperties": false,
"required": [
"type",
"judge_model"
],
"title": "LLMAsJudgeScoringFnParams"
},
"ModelCandidate": {
"type": "object",
"properties": {
"type": {
"type": "string",
"const": "model",
"default": "model"
},
"model": {
"type": "string",
"description": "The model ID to evaluate."
},
"sampling_params": {
"$ref": "#/components/schemas/SamplingParams",
"description": "The sampling parameters for the model."
},
"system_message": {
"$ref": "#/components/schemas/SystemMessage",
"description": "(Optional) The system message providing instructions or context to the model."
}
},
"additionalProperties": false,
"required": [
"type",
"model",
"sampling_params"
],
"title": "ModelCandidate",
"description": "A model candidate for evaluation."
},
"RegexParserScoringFnParams": {
"type": "object",
"properties": {
"type": {
"type": "string",
"const": "regex_parser",
"default": "regex_parser"
},
"parsing_regexes": {
"type": "array",
"items": {
"type": "string"
}
},
"aggregation_functions": {
"type": "array",
"items": {
"$ref": "#/components/schemas/AggregationFunctionType"
}
}
},
"additionalProperties": false,
"required": [
"type"
],
"title": "RegexParserScoringFnParams"
},
"ScoringFnParams": {
"oneOf": [
{
"$ref": "#/components/schemas/LLMAsJudgeScoringFnParams"
},
{
"$ref": "#/components/schemas/RegexParserScoringFnParams"
},
{
"$ref": "#/components/schemas/BasicScoringFnParams"
}
],
"discriminator": {
"propertyName": "type",
"mapping": {
"llm_as_judge": "#/components/schemas/LLMAsJudgeScoringFnParams",
"regex_parser": "#/components/schemas/RegexParserScoringFnParams",
"basic": "#/components/schemas/BasicScoringFnParams"
}
}
},
"EvaluateRowsRequest": {
"type": "object",
"properties": {
"input_rows": {
"type": "array",
"items": {
"type": "object",
"additionalProperties": {
"oneOf": [
{
"type": "null"
},
{
"type": "boolean"
},
{
"type": "number"
},
{
"type": "string"
},
{
"type": "array"
},
{
"type": "object"
}
]
}
},
"description": "The rows to evaluate."
},
"scoring_functions": {
"type": "array",
"items": {
"type": "string"
},
"description": "The scoring functions to use for the evaluation."
},
"benchmark_config": {
"$ref": "#/components/schemas/BenchmarkConfig",
"description": "The configuration for the benchmark."
}
},
"additionalProperties": false,
"required": [
"input_rows",
"scoring_functions",
"benchmark_config"
],
"title": "EvaluateRowsRequest"
},
"EvaluateResponse": {
"type": "object",
"properties": {
"generations": {
"type": "array",
"items": {
"type": "object",
"additionalProperties": {
"oneOf": [
{
"type": "null"
},
{
"type": "boolean"
},
{
"type": "number"
},
{
"type": "string"
},
{
"type": "array"
},
{
"type": "object"
}
]
}
},
"description": "The generations from the evaluation."
},
"scores": {
"type": "object",
"additionalProperties": {
"$ref": "#/components/schemas/ScoringResult"
},
"description": "The scores from the evaluation."
}
},
"additionalProperties": false,
"required": [
"generations",
"scores"
],
"title": "EvaluateResponse",
"description": "The response from an evaluation."
},
"ScoringResult": {
"type": "object",
"properties": {
"score_rows": {
"type": "array",
"items": {
"type": "object",
"additionalProperties": {
"oneOf": [
{
"type": "null"
},
{
"type": "boolean"
},
{
"type": "number"
},
{
"type": "string"
},
{
"type": "array"
},
{
"type": "object"
}
]
}
},
"description": "The scoring result for each row. Each row is a map of column name to value."
},
"aggregated_results": {
"type": "object",
"additionalProperties": {
"oneOf": [
{
"type": "null"
},
{
"type": "boolean"
},
{
"type": "number"
},
{
"type": "string"
},
{
"type": "array"
},
{
"type": "object"
}
]
},
"description": "Map of metric name to aggregated value"
}
},
"additionalProperties": false,
"required": [
"score_rows",
"aggregated_results"
],
"title": "ScoringResult",
"description": "A scoring result for a single row."
},
"Agent": {
"type": "object",
"properties": {
@ -7705,7 +7329,8 @@
"completed",
"in_progress",
"failed",
"scheduled"
"scheduled",
"cancelled"
],
"title": "JobStatus"
},
@ -8400,30 +8025,6 @@
"title": "IterrowsResponse",
"description": "A paginated list of rows from a dataset."
},
"Job": {
"type": "object",
"properties": {
"job_id": {
"type": "string"
},
"status": {
"type": "string",
"enum": [
"completed",
"in_progress",
"failed",
"scheduled"
],
"title": "JobStatus"
}
},
"additionalProperties": false,
"required": [
"job_id",
"status"
],
"title": "Job"
},
"ListAgentSessionsResponse": {
"type": "object",
"properties": {
@ -10007,16 +9608,21 @@
"RunRequest": {
"type": "object",
"properties": {
"benchmark_config": {
"$ref": "#/components/schemas/BenchmarkConfig",
"description": "The configuration for the benchmark."
"task": {
"$ref": "#/components/schemas/EvaluationTask",
"description": "The task to evaluate. To specify a task, one of the following must be provided: - `benchmark_id`: Run evaluation task against a benchmark_id - `dataset_id` and `grader_ids`: Run evaluation task against a dataset_id and a list of grader_ids - `data_source` and `grader_ids`: Run evaluation task against a data source (e.g. rows, uri, etc.) and a list of grader_ids"
},
"candidate": {
"$ref": "#/components/schemas/EvaluationCandidate",
"description": "The candidate to evaluate."
}
},
"additionalProperties": false,
"required": [
"benchmark_config"
"task",
"candidate"
],
"title": "RunEvalRequest"
"title": "RunRequest"
},
"RunShieldRequest": {
"type": "object",
@ -10123,128 +9729,6 @@
],
"title": "SaveSpansToDatasetRequest"
},
"ScoreRequest": {
"type": "object",
"properties": {
"input_rows": {
"type": "array",
"items": {
"type": "object",
"additionalProperties": {
"oneOf": [
{
"type": "null"
},
{
"type": "boolean"
},
{
"type": "number"
},
{
"type": "string"
},
{
"type": "array"
},
{
"type": "object"
}
]
}
},
"description": "The rows to score."
},
"scoring_functions": {
"type": "object",
"additionalProperties": {
"oneOf": [
{
"$ref": "#/components/schemas/ScoringFnParams"
},
{
"type": "null"
}
]
},
"description": "The scoring functions to use for the scoring."
}
},
"additionalProperties": false,
"required": [
"input_rows",
"scoring_functions"
],
"title": "ScoreRequest"
},
"ScoreResponse": {
"type": "object",
"properties": {
"results": {
"type": "object",
"additionalProperties": {
"$ref": "#/components/schemas/ScoringResult"
},
"description": "A map of scoring function name to ScoringResult."
}
},
"additionalProperties": false,
"required": [
"results"
],
"title": "ScoreResponse",
"description": "The response from scoring."
},
"ScoreBatchRequest": {
"type": "object",
"properties": {
"dataset_id": {
"type": "string"
},
"scoring_functions": {
"type": "object",
"additionalProperties": {
"oneOf": [
{
"$ref": "#/components/schemas/ScoringFnParams"
},
{
"type": "null"
}
]
}
},
"save_results_dataset": {
"type": "boolean"
}
},
"additionalProperties": false,
"required": [
"dataset_id",
"scoring_functions",
"save_results_dataset"
],
"title": "ScoreBatchRequest"
},
"ScoreBatchResponse": {
"type": "object",
"properties": {
"dataset_id": {
"type": "string"
},
"results": {
"type": "object",
"additionalProperties": {
"$ref": "#/components/schemas/ScoringResult"
}
}
},
"additionalProperties": false,
"required": [
"results"
],
"title": "ScoreBatchResponse"
},
"AlgorithmConfig": {
"oneOf": [
{

View file

@ -1562,109 +1562,6 @@ paths:
required: false
schema:
type: integer
/v1/eval/benchmarks/{benchmark_id}/jobs/{job_id}:
get:
responses:
'200':
description: The status of the evaluationjob.
content:
application/json:
schema:
$ref: '#/components/schemas/Job'
'400':
$ref: '#/components/responses/BadRequest400'
'429':
$ref: >-
#/components/responses/TooManyRequests429
'500':
$ref: >-
#/components/responses/InternalServerError500
default:
$ref: '#/components/responses/DefaultError'
tags:
- Eval
description: Get the status of a job.
parameters:
- name: benchmark_id
in: path
description: >-
The ID of the benchmark to run the evaluation on.
required: true
schema:
type: string
- name: job_id
in: path
description: The ID of the job to get the status of.
required: true
schema:
type: string
delete:
responses:
'200':
description: OK
'400':
$ref: '#/components/responses/BadRequest400'
'429':
$ref: >-
#/components/responses/TooManyRequests429
'500':
$ref: >-
#/components/responses/InternalServerError500
default:
$ref: '#/components/responses/DefaultError'
tags:
- Eval
description: Cancel a job.
parameters:
- name: benchmark_id
in: path
description: >-
The ID of the benchmark to run the evaluation on.
required: true
schema:
type: string
- name: job_id
in: path
description: The ID of the job to cancel.
required: true
schema:
type: string
/v1/eval/benchmarks/{benchmark_id}/jobs/{job_id}/result:
get:
responses:
'200':
description: The result of the job.
content:
application/json:
schema:
$ref: '#/components/schemas/EvaluateResponse'
'400':
$ref: '#/components/responses/BadRequest400'
'429':
$ref: >-
#/components/responses/TooManyRequests429
'500':
$ref: >-
#/components/responses/InternalServerError500
default:
$ref: '#/components/responses/DefaultError'
tags:
- Eval
description: Get the result of a job.
parameters:
- name: benchmark_id
in: path
description: >-
The ID of the benchmark to run the evaluation on.
required: true
schema:
type: string
- name: job_id
in: path
description: The ID of the job to get the result of.
required: true
schema:
type: string
/v1/agents/{agent_id}/sessions:
get:
responses:
@ -1923,7 +1820,7 @@ paths:
default:
$ref: '#/components/responses/DefaultError'
tags:
- Providers
- Models
description: ''
parameters: []
post:
@ -1974,7 +1871,7 @@ paths:
default:
$ref: '#/components/responses/DefaultError'
tags:
- Inspect
- Providers
description: ''
parameters: []
/v1/inspect/routes:
@ -4448,252 +4345,6 @@ components:
title: EmbeddingsResponse
description: >-
Response containing generated embeddings.
AgentCandidate:
type: object
properties:
type:
type: string
const: agent
default: agent
config:
$ref: '#/components/schemas/AgentConfig'
description: >-
The configuration for the agent candidate.
additionalProperties: false
required:
- type
- config
title: AgentCandidate
description: An agent candidate for evaluation.
AggregationFunctionType:
type: string
enum:
- average
- weighted_average
- median
- categorical_count
- accuracy
title: AggregationFunctionType
BasicScoringFnParams:
type: object
properties:
type:
type: string
const: basic
default: basic
aggregation_functions:
type: array
items:
$ref: '#/components/schemas/AggregationFunctionType'
additionalProperties: false
required:
- type
title: BasicScoringFnParams
BenchmarkConfig:
type: object
properties:
eval_candidate:
$ref: '#/components/schemas/EvalCandidate'
description: The candidate to evaluate.
scoring_params:
type: object
additionalProperties:
$ref: '#/components/schemas/ScoringFnParams'
description: >-
Map between scoring function id and parameters for each scoring function
you want to run
num_examples:
type: integer
description: >-
(Optional) The number of examples to evaluate. If not provided, all examples
in the dataset will be evaluated
additionalProperties: false
required:
- eval_candidate
- scoring_params
title: BenchmarkConfig
description: >-
A benchmark configuration for evaluation.
EvalCandidate:
oneOf:
- $ref: '#/components/schemas/ModelCandidate'
- $ref: '#/components/schemas/AgentCandidate'
discriminator:
propertyName: type
mapping:
model: '#/components/schemas/ModelCandidate'
agent: '#/components/schemas/AgentCandidate'
LLMAsJudgeScoringFnParams:
type: object
properties:
type:
type: string
const: llm_as_judge
default: llm_as_judge
judge_model:
type: string
prompt_template:
type: string
judge_score_regexes:
type: array
items:
type: string
aggregation_functions:
type: array
items:
$ref: '#/components/schemas/AggregationFunctionType'
additionalProperties: false
required:
- type
- judge_model
title: LLMAsJudgeScoringFnParams
ModelCandidate:
type: object
properties:
type:
type: string
const: model
default: model
model:
type: string
description: The model ID to evaluate.
sampling_params:
$ref: '#/components/schemas/SamplingParams'
description: The sampling parameters for the model.
system_message:
$ref: '#/components/schemas/SystemMessage'
description: >-
(Optional) The system message providing instructions or context to the
model.
additionalProperties: false
required:
- type
- model
- sampling_params
title: ModelCandidate
description: A model candidate for evaluation.
RegexParserScoringFnParams:
type: object
properties:
type:
type: string
const: regex_parser
default: regex_parser
parsing_regexes:
type: array
items:
type: string
aggregation_functions:
type: array
items:
$ref: '#/components/schemas/AggregationFunctionType'
additionalProperties: false
required:
- type
title: RegexParserScoringFnParams
ScoringFnParams:
oneOf:
- $ref: '#/components/schemas/LLMAsJudgeScoringFnParams'
- $ref: '#/components/schemas/RegexParserScoringFnParams'
- $ref: '#/components/schemas/BasicScoringFnParams'
discriminator:
propertyName: type
mapping:
llm_as_judge: '#/components/schemas/LLMAsJudgeScoringFnParams'
regex_parser: '#/components/schemas/RegexParserScoringFnParams'
basic: '#/components/schemas/BasicScoringFnParams'
EvaluateRowsRequest:
type: object
properties:
input_rows:
type: array
items:
type: object
additionalProperties:
oneOf:
- type: 'null'
- type: boolean
- type: number
- type: string
- type: array
- type: object
description: The rows to evaluate.
scoring_functions:
type: array
items:
type: string
description: >-
The scoring functions to use for the evaluation.
benchmark_config:
$ref: '#/components/schemas/BenchmarkConfig'
description: The configuration for the benchmark.
additionalProperties: false
required:
- input_rows
- scoring_functions
- benchmark_config
title: EvaluateRowsRequest
EvaluateResponse:
type: object
properties:
generations:
type: array
items:
type: object
additionalProperties:
oneOf:
- type: 'null'
- type: boolean
- type: number
- type: string
- type: array
- type: object
description: The generations from the evaluation.
scores:
type: object
additionalProperties:
$ref: '#/components/schemas/ScoringResult'
description: The scores from the evaluation.
additionalProperties: false
required:
- generations
- scores
title: EvaluateResponse
description: The response from an evaluation.
ScoringResult:
type: object
properties:
score_rows:
type: array
items:
type: object
additionalProperties:
oneOf:
- type: 'null'
- type: boolean
- type: number
- type: string
- type: array
- type: object
description: >-
The scoring result for each row. Each row is a map of column name to value.
aggregated_results:
type: object
additionalProperties:
oneOf:
- type: 'null'
- type: boolean
- type: number
- type: string
- type: array
- type: object
description: Map of metric name to aggregated value
additionalProperties: false
required:
- score_rows
- aggregated_results
title: ScoringResult
description: A scoring result for a single row.
Agent:
type: object
properties:
@ -5451,6 +5102,7 @@ components:
- in_progress
- failed
- scheduled
- cancelled
title: JobStatus
scheduled_at:
type: string
@ -5901,24 +5553,6 @@ components:
- data
title: IterrowsResponse
description: A paginated list of rows from a dataset.
Job:
type: object
properties:
job_id:
type: string
status:
type: string
enum:
- completed
- in_progress
- failed
- scheduled
title: JobStatus
additionalProperties: false
required:
- job_id
- status
title: Job
ListAgentSessionsResponse:
type: object
properties:
@ -6984,8 +6618,9 @@ components:
description: The candidate to evaluate.
additionalProperties: false
required:
- benchmark_config
title: RunEvalRequest
- task
- candidate
title: RunRequest
RunShieldRequest:
type: object
properties:
@ -7058,81 +6693,6 @@ components:
- attributes_to_save
- dataset_id
title: SaveSpansToDatasetRequest
ScoreRequest:
type: object
properties:
input_rows:
type: array
items:
type: object
additionalProperties:
oneOf:
- type: 'null'
- type: boolean
- type: number
- type: string
- type: array
- type: object
description: The rows to score.
scoring_functions:
type: object
additionalProperties:
oneOf:
- $ref: '#/components/schemas/ScoringFnParams'
- type: 'null'
description: >-
The scoring functions to use for the scoring.
additionalProperties: false
required:
- input_rows
- scoring_functions
title: ScoreRequest
ScoreResponse:
type: object
properties:
results:
type: object
additionalProperties:
$ref: '#/components/schemas/ScoringResult'
description: >-
A map of scoring function name to ScoringResult.
additionalProperties: false
required:
- results
title: ScoreResponse
description: The response from scoring.
ScoreBatchRequest:
type: object
properties:
dataset_id:
type: string
scoring_functions:
type: object
additionalProperties:
oneOf:
- $ref: '#/components/schemas/ScoringFnParams'
- type: 'null'
save_results_dataset:
type: boolean
additionalProperties: false
required:
- dataset_id
- scoring_functions
- save_results_dataset
title: ScoreBatchRequest
ScoreBatchResponse:
type: object
properties:
dataset_id:
type: string
results:
type: object
additionalProperties:
$ref: '#/components/schemas/ScoringResult'
additionalProperties: false
required:
- results
title: ScoreBatchResponse
AlgorithmConfig:
oneOf:
- $ref: '#/components/schemas/LoraFinetuningConfig'

View file

@ -141,4 +141,3 @@ class Eval(Protocol):
:param job_id: The ID of the job to get the result of.
:return: The result of the job.
"""

View file

@ -146,4 +146,3 @@ class ScoringFunctions(Protocol):
provider_id: Optional[str] = None,
params: Optional[ScoringFnParams] = None,
) -> None: ...

View file

@ -43,7 +43,6 @@ from llama_stack.distribution.datatypes import (
RoutableObject,
RoutableObjectWithProvider,
RoutedProtocol,
ScoringFnWithACL,
ShieldWithACL,
ToolGroupWithACL,
ToolWithACL,

View file

@ -26,4 +26,3 @@ def available_providers() -> List[ProviderSpec]:
],
),
]

View file

@ -166,7 +166,18 @@ datasets:
uri: huggingface://datasets/llamastack/bfcl_v3?split=train
metadata: {}
dataset_id: bfcl
provider_id: huggingface
- purpose: eval/messages-answer
source:
type: uri
uri: huggingface://datasets/llamastack/IfEval?split=train
metadata: {}
dataset_id: ifeval
- purpose: eval/messages-answer
source:
type: uri
uri: huggingface://datasets/llamastack/docvqa?split=val
metadata: {}
dataset_id: docvqa
benchmarks: []
tool_groups:
- toolgroup_id: builtin::websearch