diff --git a/docs/_static/llama-stack-spec.html b/docs/_static/llama-stack-spec.html
index e3c81ddb9..d6f420cae 100644
--- a/docs/_static/llama-stack-spec.html
+++ b/docs/_static/llama-stack-spec.html
@@ -866,6 +866,83 @@
]
}
},
+ "/v1/graders/{grader_id}": {
+ "get": {
+ "responses": {
+ "200": {
+ "description": "The grader.",
+ "content": {
+ "application/json": {
+ "schema": {
+ "$ref": "#/components/schemas/Grader"
+ }
+ }
+ }
+ },
+ "400": {
+ "$ref": "#/components/responses/BadRequest400"
+ },
+ "429": {
+ "$ref": "#/components/responses/TooManyRequests429"
+ },
+ "500": {
+ "$ref": "#/components/responses/InternalServerError500"
+ },
+ "default": {
+ "$ref": "#/components/responses/DefaultError"
+ }
+ },
+ "tags": [
+ "Graders"
+ ],
+ "description": "Get a grader by ID.",
+ "parameters": [
+ {
+ "name": "grader_id",
+ "in": "path",
+ "description": "The ID of the grader.",
+ "required": true,
+ "schema": {
+ "type": "string"
+ }
+ }
+ ]
+ },
+ "delete": {
+ "responses": {
+ "200": {
+ "description": "OK"
+ },
+ "400": {
+ "$ref": "#/components/responses/BadRequest400"
+ },
+ "429": {
+ "$ref": "#/components/responses/TooManyRequests429"
+ },
+ "500": {
+ "$ref": "#/components/responses/InternalServerError500"
+ },
+ "default": {
+ "$ref": "#/components/responses/DefaultError"
+ }
+ },
+ "tags": [
+ "Graders"
+ ],
+ "description": "Delete a grader by ID.",
+ "parameters": [
+ {
+ "name": "grader_id",
+ "in": "path",
+ "description": "The ID of the grader.",
+ "required": true,
+ "schema": {
+ "type": "string"
+ }
+ }
+ ]
+ }
+ },
"/v1/inference/embeddings": {
"post": {
"responses": {
@@ -909,59 +986,6 @@
}
}
},
- "/v1/eval/benchmarks/{benchmark_id}/evaluations": {
- "post": {
- "responses": {
- "200": {
- "description": "EvaluateResponse object containing generations and scores",
- "content": {
- "application/json": {
- "schema": {
- "$ref": "#/components/schemas/EvaluateResponse"
- }
- }
- }
- },
- "400": {
- "$ref": "#/components/responses/BadRequest400"
- },
- "429": {
- "$ref": "#/components/responses/TooManyRequests429"
- },
- "500": {
- "$ref": "#/components/responses/InternalServerError500"
- },
- "default": {
- "$ref": "#/components/responses/DefaultError"
- }
- },
- "tags": [
- "Eval"
- ],
- "description": "Evaluate a list of rows on a benchmark.",
- "parameters": [
- {
- "name": "benchmark_id",
- "in": "path",
- "description": "The ID of the benchmark to run the evaluation on.",
- "required": true,
- "schema": {
- "type": "string"
- }
- }
- ],
- "requestBody": {
- "content": {
- "application/json": {
- "schema": {
- "$ref": "#/components/schemas/EvaluateRowsRequest"
- }
- }
- },
- "required": true
- }
- }
- },
"/v1/agents/{agent_id}/session/{session_id}/turn/{turn_id}/step/{step_id}": {
"get": {
"responses": {
@@ -1101,14 +1125,7 @@
"content": {
"application/json": {
"schema": {
- "oneOf": [
- {
- "$ref": "#/components/schemas/Benchmark"
- },
- {
- "type": "null"
- }
- ]
+ "$ref": "#/components/schemas/Benchmark"
}
}
}
@@ -1129,11 +1146,12 @@
"tags": [
"Benchmarks"
],
- "description": "",
+ "description": "Get a benchmark by ID.",
"parameters": [
{
"name": "benchmark_id",
"in": "path",
+ "description": "The ID of the benchmark to get.",
"required": true,
"schema": {
"type": "string"
@@ -1306,55 +1324,6 @@
]
}
},
- "/v1/scoring-functions/{scoring_fn_id}": {
- "get": {
- "responses": {
- "200": {
- "description": "OK",
- "content": {
- "application/json": {
- "schema": {
- "oneOf": [
- {
- "$ref": "#/components/schemas/ScoringFn"
- },
- {
- "type": "null"
- }
- ]
- }
- }
- }
- },
- "400": {
- "$ref": "#/components/responses/BadRequest400"
- },
- "429": {
- "$ref": "#/components/responses/TooManyRequests429"
- },
- "500": {
- "$ref": "#/components/responses/InternalServerError500"
- },
- "default": {
- "$ref": "#/components/responses/DefaultError"
- }
- },
- "tags": [
- "ScoringFunctions"
- ],
- "description": "",
- "parameters": [
- {
- "name": "scoring_fn_id",
- "in": "path",
- "required": true,
- "schema": {
- "type": "string"
- }
- }
- ]
- }
- },
"/v1/shields/{identifier}": {
"get": {
"responses": {
@@ -1987,6 +1956,92 @@
]
}
},
+ "/v1/evaluation/grade": {
+ "post": {
+ "responses": {
+ "200": {
+ "description": "The evaluation job containing grader scores.",
+ "content": {
+ "application/json": {
+ "schema": {
+ "$ref": "#/components/schemas/EvaluationJob"
+ }
+ }
+ }
+ },
+ "400": {
+ "$ref": "#/components/responses/BadRequest400"
+ },
+ "429": {
+ "$ref": "#/components/responses/TooManyRequests429"
+ },
+ "500": {
+ "$ref": "#/components/responses/InternalServerError500"
+ },
+ "default": {
+ "$ref": "#/components/responses/DefaultError"
+ }
+ },
+ "tags": [
+ "Evaluation"
+ ],
+ "description": "Run a grading job with generated results. Use this when you have generated results from inference in a dataset.",
+ "parameters": [],
+ "requestBody": {
+ "content": {
+ "application/json": {
+ "schema": {
+ "$ref": "#/components/schemas/GradeRequest"
+ }
+ }
+ },
+ "required": true
+ }
+ }
+ },
+ "/v1/evaluation/grade_inline": {
+ "post": {
+ "responses": {
+ "200": {
+ "description": "The evaluation job containing grader scores. \"generations\" is not populated in the response.",
+ "content": {
+ "application/json": {
+ "schema": {
+ "$ref": "#/components/schemas/EvaluationResponse"
+ }
+ }
+ }
+ },
+ "400": {
+ "$ref": "#/components/responses/BadRequest400"
+ },
+ "429": {
+ "$ref": "#/components/responses/TooManyRequests429"
+ },
+ "500": {
+ "$ref": "#/components/responses/InternalServerError500"
+ },
+ "default": {
+ "$ref": "#/components/responses/DefaultError"
+ }
+ },
+ "tags": [
+ "Evaluation"
+ ],
+ "description": "Run a grading job with generated results inline.",
+ "parameters": [],
+ "requestBody": {
+ "content": {
+ "application/json": {
+ "schema": {
+ "$ref": "#/components/schemas/GradeInlineRequest"
+ }
+ }
+ },
+ "required": true
+ }
+ }
+ },
"/v1/health": {
"get": {
"responses": {
@@ -2238,160 +2293,6 @@
]
}
},
- "/v1/eval/benchmarks/{benchmark_id}/jobs/{job_id}": {
- "get": {
- "responses": {
- "200": {
- "description": "The status of the evaluationjob.",
- "content": {
- "application/json": {
- "schema": {
- "oneOf": [
- {
- "$ref": "#/components/schemas/JobStatus"
- },
- {
- "type": "null"
- }
- ]
- }
- }
- }
- },
- "400": {
- "$ref": "#/components/responses/BadRequest400"
- },
- "429": {
- "$ref": "#/components/responses/TooManyRequests429"
- },
- "500": {
- "$ref": "#/components/responses/InternalServerError500"
- },
- "default": {
- "$ref": "#/components/responses/DefaultError"
- }
- },
- "tags": [
- "Eval"
- ],
- "description": "Get the status of a job.",
- "parameters": [
- {
- "name": "benchmark_id",
- "in": "path",
- "description": "The ID of the benchmark to run the evaluation on.",
- "required": true,
- "schema": {
- "type": "string"
- }
- },
- {
- "name": "job_id",
- "in": "path",
- "description": "The ID of the job to get the status of.",
- "required": true,
- "schema": {
- "type": "string"
- }
- }
- ]
- },
- "delete": {
- "responses": {
- "200": {
- "description": "OK"
- },
- "400": {
- "$ref": "#/components/responses/BadRequest400"
- },
- "429": {
- "$ref": "#/components/responses/TooManyRequests429"
- },
- "500": {
- "$ref": "#/components/responses/InternalServerError500"
- },
- "default": {
- "$ref": "#/components/responses/DefaultError"
- }
- },
- "tags": [
- "Eval"
- ],
- "description": "Cancel a job.",
- "parameters": [
- {
- "name": "benchmark_id",
- "in": "path",
- "description": "The ID of the benchmark to run the evaluation on.",
- "required": true,
- "schema": {
- "type": "string"
- }
- },
- {
- "name": "job_id",
- "in": "path",
- "description": "The ID of the job to cancel.",
- "required": true,
- "schema": {
- "type": "string"
- }
- }
- ]
- }
- },
- "/v1/eval/benchmarks/{benchmark_id}/jobs/{job_id}/result": {
- "get": {
- "responses": {
- "200": {
- "description": "The result of the job.",
- "content": {
- "application/json": {
- "schema": {
- "$ref": "#/components/schemas/EvaluateResponse"
- }
- }
- }
- },
- "400": {
- "$ref": "#/components/responses/BadRequest400"
- },
- "429": {
- "$ref": "#/components/responses/TooManyRequests429"
- },
- "500": {
- "$ref": "#/components/responses/InternalServerError500"
- },
- "default": {
- "$ref": "#/components/responses/DefaultError"
- }
- },
- "tags": [
- "Eval"
- ],
- "description": "Get the result of a job.",
- "parameters": [
- {
- "name": "benchmark_id",
- "in": "path",
- "description": "The ID of the benchmark to run the evaluation on.",
- "required": true,
- "schema": {
- "type": "string"
- }
- },
- {
- "name": "job_id",
- "in": "path",
- "description": "The ID of the job to get the result of.",
- "required": true,
- "schema": {
- "type": "string"
- }
- }
- ]
- }
- },
"/v1/agents/{agent_id}/sessions": {
"get": {
"responses": {
@@ -2464,13 +2365,20 @@
"tags": [
"Benchmarks"
],
- "description": "",
+ "description": "List all benchmarks.",
"parameters": []
},
"post": {
"responses": {
"200": {
- "description": "OK"
+ "description": "OK",
+ "content": {
+ "application/json": {
+ "schema": {
+ "$ref": "#/components/schemas/Benchmark"
+ }
+ }
+ }
},
"400": {
"$ref": "#/components/responses/BadRequest400"
@@ -2488,7 +2396,7 @@
"tags": [
"Benchmarks"
],
- "description": "",
+ "description": "Register a new benchmark.",
"parameters": [],
"requestBody": {
"content": {
@@ -2619,6 +2527,113 @@
]
}
},
+ "/v1/graders/types": {
+ "get": {
+ "responses": {
+ "200": {
+ "description": "A list of grader types and information about the types.",
+ "content": {
+ "application/json": {
+ "schema": {
+ "$ref": "#/components/schemas/ListGraderTypesResponse"
+ }
+ }
+ }
+ },
+ "400": {
+ "$ref": "#/components/responses/BadRequest400"
+ },
+ "429": {
+ "$ref": "#/components/responses/TooManyRequests429"
+ },
+ "500": {
+ "$ref": "#/components/responses/InternalServerError500"
+ },
+ "default": {
+ "$ref": "#/components/responses/DefaultError"
+ }
+ },
+ "tags": [
+ "Graders"
+ ],
+ "description": "List all grader types.",
+ "parameters": []
+ }
+ },
+ "/v1/graders": {
+ "get": {
+ "responses": {
+ "200": {
+ "description": "A list of graders.",
+ "content": {
+ "application/jsonl": {
+ "schema": {
+ "$ref": "#/components/schemas/Grader"
+ }
+ }
+ }
+ },
+ "400": {
+ "$ref": "#/components/responses/BadRequest400"
+ },
+ "429": {
+ "$ref": "#/components/responses/TooManyRequests429"
+ },
+ "500": {
+ "$ref": "#/components/responses/InternalServerError500"
+ },
+ "default": {
+ "$ref": "#/components/responses/DefaultError"
+ }
+ },
+ "tags": [
+ "Graders"
+ ],
+ "description": "List all graders.",
+ "parameters": []
+ },
+ "post": {
+ "responses": {
+ "200": {
+ "description": "The registered grader.",
+ "content": {
+ "application/json": {
+ "schema": {
+ "$ref": "#/components/schemas/Grader"
+ }
+ }
+ }
+ },
+ "400": {
+ "$ref": "#/components/responses/BadRequest400"
+ },
+ "429": {
+ "$ref": "#/components/responses/TooManyRequests429"
+ },
+ "500": {
+ "$ref": "#/components/responses/InternalServerError500"
+ },
+ "default": {
+ "$ref": "#/components/responses/DefaultError"
+ }
+ },
+ "tags": [
+ "Graders"
+ ],
+ "description": "Register a new grader.",
+ "parameters": [],
+ "requestBody": {
+ "content": {
+ "application/json": {
+ "schema": {
+ "$ref": "#/components/schemas/RegisterGraderRequest"
+ }
+ }
+ },
+ "required": true
+ }
+ }
+ },
"/v1/models": {
"get": {
"responses": {
@@ -2809,73 +2824,6 @@
]
}
},
- "/v1/scoring-functions": {
- "get": {
- "responses": {
- "200": {
- "description": "OK",
- "content": {
- "application/json": {
- "schema": {
- "$ref": "#/components/schemas/ListScoringFunctionsResponse"
- }
- }
- }
- },
- "400": {
- "$ref": "#/components/responses/BadRequest400"
- },
- "429": {
- "$ref": "#/components/responses/TooManyRequests429"
- },
- "500": {
- "$ref": "#/components/responses/InternalServerError500"
- },
- "default": {
- "$ref": "#/components/responses/DefaultError"
- }
- },
- "tags": [
- "ScoringFunctions"
- ],
- "description": "",
- "parameters": []
- },
- "post": {
- "responses": {
- "200": {
- "description": "OK"
- },
- "400": {
- "$ref": "#/components/responses/BadRequest400"
- },
- "429": {
- "$ref": "#/components/responses/TooManyRequests429"
- },
- "500": {
- "$ref": "#/components/responses/InternalServerError500"
- },
- "default": {
- "$ref": "#/components/responses/DefaultError"
- }
- },
- "tags": [
- "ScoringFunctions"
- ],
- "description": "",
- "parameters": [],
- "requestBody": {
- "content": {
- "application/json": {
- "schema": {
- "$ref": "#/components/schemas/RegisterScoringFunctionRequest"
- }
- }
- },
- "required": true
- }
- }
- },
"/v1/shields": {
"get": {
"responses": {
@@ -3460,15 +3408,15 @@
}
}
},
- "/v1/eval/benchmarks/{benchmark_id}/jobs": {
+ "/v1/evaluation/run": {
"post": {
"responses": {
"200": {
- "description": "The job that was created to run the evaluation.",
+ "description": "OK",
"content": {
"application/json": {
"schema": {
- "$ref": "#/components/schemas/Job"
+ "$ref": "#/components/schemas/EvaluationJob"
}
}
}
@@ -3487,25 +3435,58 @@
}
},
"tags": [
- "Eval"
- ],
- "description": "Run an evaluation on a benchmark.",
- "parameters": [
- {
- "name": "benchmark_id",
- "in": "path",
- "description": "The ID of the benchmark to run the evaluation on.",
- "required": true,
- "schema": {
- "type": "string"
- }
- }
+ "Evaluation"
],
+ "description": "Run an evaluation job.",
+ "parameters": [],
"requestBody": {
"content": {
"application/json": {
"schema": {
- "$ref": "#/components/schemas/RunEvalRequest"
+ "$ref": "#/components/schemas/RunRequest"
+ }
+ }
+ },
+ "required": true
+ }
+ }
+ },
+ "/v1/evaluation/run_inline": {
+ "post": {
+ "responses": {
+ "200": {
+ "description": "OK",
+ "content": {
+ "application/json": {
+ "schema": {
+ "$ref": "#/components/schemas/EvaluationResponse"
+ }
+ }
+ }
+ },
+ "400": {
+ "$ref": "#/components/responses/BadRequest400"
+ },
+ "429": {
+ "$ref": "#/components/responses/TooManyRequests429"
+ },
+ "500": {
+ "$ref": "#/components/responses/InternalServerError500"
+ },
+ "default": {
+ "$ref": "#/components/responses/DefaultError"
+ }
+ },
+ "tags": [
+ "Evaluation"
+ ],
+ "description": "Run an evaluation job inline.",
+ "parameters": [],
+ "requestBody": {
+ "content": {
+ "application/json": {
+ "schema": {
+ "$ref": "#/components/schemas/RunInlineRequest"
}
}
},
@@ -3592,92 +3573,6 @@
}
}
},
- "/v1/scoring/score": {
- "post": {
- "responses": {
- "200": {
- "description": "ScoreResponse object containing rows and aggregated results",
- "content": {
- "application/json": {
- "schema": {
- "$ref": "#/components/schemas/ScoreResponse"
- }
- }
- }
- },
- "400": {
- "$ref": "#/components/responses/BadRequest400"
- },
- "429": {
- "$ref": "#/components/responses/TooManyRequests429"
- },
- "500": {
- "$ref": "#/components/responses/InternalServerError500"
- },
- "default": {
- "$ref": "#/components/responses/DefaultError"
- }
- },
- "tags": [
- "Scoring"
- ],
- "description": "Score a list of rows.",
- "parameters": [],
- "requestBody": {
- "content": {
- "application/json": {
- "schema": {
- "$ref": "#/components/schemas/ScoreRequest"
- }
- }
- },
- "required": true
- }
- }
- },
- "/v1/scoring/score-batch": {
- "post": {
- "responses": {
- "200": {
- "description": "OK",
- "content": {
- "application/json": {
- "schema": {
- "$ref": "#/components/schemas/ScoreBatchResponse"
- }
- }
- }
- },
- "400": {
- "$ref": "#/components/responses/BadRequest400"
- },
- "429": {
- "$ref": "#/components/responses/TooManyRequests429"
- },
- "500": {
- "$ref": "#/components/responses/InternalServerError500"
- },
- "default": {
- "$ref": "#/components/responses/DefaultError"
- }
- },
- "tags": [
- "Scoring"
- ],
- "description": "",
- "parameters": [],
- "requestBody": {
- "content": {
- "application/json": {
- "schema": {
- "$ref": "#/components/schemas/ScoreBatchRequest"
- }
- }
- },
- "required": true
- }
- }
- },
"/v1/post-training/supervised-fine-tune": {
"post": {
"responses": {
@@ -6303,381 +6198,6 @@
"title": "EmbeddingsResponse",
"description": "Response containing generated embeddings."
},
- "AgentCandidate": {
- "type": "object",
- "properties": {
- "type": {
- "type": "string",
- "const": "agent",
- "default": "agent"
- },
- "config": {
- "$ref": "#/components/schemas/AgentConfig",
- "description": "The configuration for the agent candidate."
- }
- },
- "additionalProperties": false,
- "required": [
- "type",
- "config"
- ],
- "title": "AgentCandidate",
- "description": "An agent candidate for evaluation."
- },
- "AggregationFunctionType": {
- "type": "string",
- "enum": [
- "average",
- "median",
- "categorical_count",
- "accuracy"
- ],
- "title": "AggregationFunctionType"
- },
- "BasicScoringFnParams": {
- "type": "object",
- "properties": {
- "type": {
- "type": "string",
- "const": "basic",
- "default": "basic"
- },
- "aggregation_functions": {
- "type": "array",
- "items": {
- "$ref": "#/components/schemas/AggregationFunctionType"
- }
- }
- },
- "additionalProperties": false,
- "required": [
- "type"
- ],
- "title": "BasicScoringFnParams"
- },
- "BenchmarkConfig": {
- "type": "object",
- "properties": {
- "eval_candidate": {
- "$ref": "#/components/schemas/EvalCandidate",
- "description": "The candidate to evaluate."
- },
- "scoring_params": {
- "type": "object",
- "additionalProperties": {
- "$ref": "#/components/schemas/ScoringFnParams"
- },
- "description": "Map between scoring function id and parameters for each scoring function you want to run"
- },
- "num_examples": {
- "type": "integer",
- "description": "(Optional) The number of examples to evaluate. If not provided, all examples in the dataset will be evaluated"
- }
- },
- "additionalProperties": false,
- "required": [
- "eval_candidate",
- "scoring_params"
- ],
- "title": "BenchmarkConfig",
- "description": "A benchmark configuration for evaluation."
- },
- "EvalCandidate": {
- "oneOf": [
- {
- "$ref": "#/components/schemas/ModelCandidate"
- },
- {
- "$ref": "#/components/schemas/AgentCandidate"
- }
- ],
- "discriminator": {
- "propertyName": "type",
- "mapping": {
- "model": "#/components/schemas/ModelCandidate",
- "agent": "#/components/schemas/AgentCandidate"
- }
- }
- },
- "LLMAsJudgeScoringFnParams": {
- "type": "object",
- "properties": {
- "type": {
- "type": "string",
- "const": "llm_as_judge",
- "default": "llm_as_judge"
- },
- "judge_model": {
- "type": "string"
- },
- "prompt_template": {
- "type": "string"
- },
- "judge_score_regexes": {
- "type": "array",
- "items": {
- "type": "string"
- }
- },
- "aggregation_functions": {
- "type": "array",
- "items": {
- "$ref": "#/components/schemas/AggregationFunctionType"
- }
- }
- },
- "additionalProperties": false,
- "required": [
- "type",
- "judge_model"
- ],
- "title": "LLMAsJudgeScoringFnParams"
- },
- "ModelCandidate": {
- "type": "object",
- "properties": {
- "type": {
- "type": "string",
- "const": "model",
- "default": "model"
- },
- "model": {
- "type": "string",
- "description": "The model ID to evaluate."
- },
- "sampling_params": {
- "$ref": "#/components/schemas/SamplingParams",
- "description": "The sampling parameters for the model."
- },
- "system_message": {
- "$ref": "#/components/schemas/SystemMessage",
- "description": "(Optional) The system message providing instructions or context to the model."
- }
- },
- "additionalProperties": false,
- "required": [
- "type",
- "model",
- "sampling_params"
- ],
- "title": "ModelCandidate",
- "description": "A model candidate for evaluation."
- },
- "RegexParserScoringFnParams": {
- "type": "object",
- "properties": {
- "type": {
- "type": "string",
- "const": "regex_parser",
- "default": "regex_parser"
- },
- "parsing_regexes": {
- "type": "array",
- "items": {
- "type": "string"
- }
- },
- "aggregation_functions": {
- "type": "array",
- "items": {
- "$ref": "#/components/schemas/AggregationFunctionType"
- }
- }
- },
- "additionalProperties": false,
- "required": [
- "type"
- ],
- "title": "RegexParserScoringFnParams"
- },
- "ScoringFnParams": {
- "oneOf": [
- {
- "$ref": "#/components/schemas/LLMAsJudgeScoringFnParams"
- },
- {
- "$ref": "#/components/schemas/RegexParserScoringFnParams"
- },
- {
- "$ref": "#/components/schemas/BasicScoringFnParams"
- }
- ],
- "discriminator": {
- "propertyName": "type",
- "mapping": {
- "llm_as_judge": "#/components/schemas/LLMAsJudgeScoringFnParams",
- "regex_parser": "#/components/schemas/RegexParserScoringFnParams",
- "basic": "#/components/schemas/BasicScoringFnParams"
- }
- }
- },
- "EvaluateRowsRequest": {
- "type": "object",
- "properties": {
- "input_rows": {
- "type": "array",
- "items": {
- "type": "object",
- "additionalProperties": {
- "oneOf": [
- {
- "type": "null"
- },
- {
- "type": "boolean"
- },
- {
- "type": "number"
- },
- {
- "type": "string"
- },
- {
- "type": "array"
- },
- {
- "type": "object"
- }
- ]
- }
- },
- "description": "The rows to evaluate."
- },
- "scoring_functions": {
- "type": "array",
- "items": {
- "type": "string"
- },
- "description": "The scoring functions to use for the evaluation."
- },
- "benchmark_config": {
- "$ref": "#/components/schemas/BenchmarkConfig",
- "description": "The configuration for the benchmark."
- }
- },
- "additionalProperties": false,
- "required": [
- "input_rows",
- "scoring_functions",
- "benchmark_config"
- ],
- "title": "EvaluateRowsRequest"
- },
- "EvaluateResponse": {
- "type": "object",
- "properties": {
- "generations": {
- "type": "array",
- "items": {
- "type": "object",
- "additionalProperties": {
- "oneOf": [
- {
- "type": "null"
- },
- {
- "type": "boolean"
- },
- {
- "type": "number"
- },
- {
- "type": "string"
- },
- {
- "type": "array"
- },
- {
- "type": "object"
- }
- ]
- }
- },
- "description": "The generations from the evaluation."
- },
- "scores": {
- "type": "object",
- "additionalProperties": {
- "$ref": "#/components/schemas/ScoringResult"
- },
- "description": "The scores from the evaluation."
- }
- },
- "additionalProperties": false,
- "required": [
- "generations",
- "scores"
- ],
- "title": "EvaluateResponse",
- "description": "The response from an evaluation."
- },
- "ScoringResult": {
- "type": "object",
- "properties": {
- "score_rows": {
- "type": "array",
- "items": {
- "type": "object",
- "additionalProperties": {
- "oneOf": [
- {
- "type": "null"
- },
- {
- "type": "boolean"
- },
- {
- "type": "number"
- },
- {
- "type": "string"
- },
- {
- "type": "array"
- },
- {
- "type": "object"
- }
- ]
- }
- },
- "description": "The scoring result for each row. Each row is a map of column name to value."
- },
- "aggregated_results": {
- "type": "object",
- "additionalProperties": {
- "oneOf": [
- {
- "type": "null"
- },
- {
- "type": "boolean"
- },
- {
- "type": "number"
- },
- {
- "type": "string"
- },
- {
- "type": "array"
- },
- {
- "type": "object"
- }
- ]
- },
- "description": "Map of metric name to aggregated value"
- }
- },
- "additionalProperties": false,
- "required": [
- "score_rows",
- "aggregated_results"
- ],
- "title": "ScoringResult",
- "description": "A scoring result for a single row."
- },
"Agent": {
"type": "object",
"properties": {
@@ -6783,13 +6303,15 @@
"default": "benchmark"
},
"dataset_id": {
- "type": "string"
+ "type": "string",
+ "description": "The ID of the dataset used to run the benchmark."
},
- "scoring_functions": {
+ "grader_ids": {
"type": "array",
"items": {
"type": "string"
- }
+ },
+ "description": "The grader ids to use for this benchmark."
},
"metadata": {
"type": "object",
@@ -6814,7 +6336,8 @@
"type": "object"
}
]
- }
+ },
+ "description": "Metadata for this benchmark for additional descriptions."
}
},
"additionalProperties": false,
@@ -6824,7 +6347,7 @@
"provider_id",
"type",
"dataset_id",
- "scoring_functions",
+ "grader_ids",
"metadata"
],
"title": "Benchmark"
@@ -6981,6 +6504,361 @@
"title": "URIDataSource",
"description": "A dataset that can be obtained from a URI."
},
+ "EqualityGrader": {
+ "type": "object",
+ "properties": {
+ "type": {
+ "type": "string",
+ "const": "equality",
+ "default": "equality"
+ },
+ "equality": {
+ "type": "object",
+ "properties": {
+ "aggregation_functions": {
+ "type": "array",
+ "items": {
+ "type": "string",
+ "enum": [
+ "average",
+ "median",
+ "categorical_count",
+ "accuracy"
+ ],
+ "title": "AggregationFunctionType",
+ "description": "A type of aggregation function."
+ }
+ }
+ },
+ "additionalProperties": false,
+ "required": [
+ "aggregation_functions"
+ ],
+ "title": "BasicGraderParams"
+ }
+ },
+ "additionalProperties": false,
+ "required": [
+ "type",
+ "equality"
+ ],
+ "title": "EqualityGrader"
+ },
+ "FactualityGrader": {
+ "type": "object",
+ "properties": {
+ "type": {
+ "type": "string",
+ "const": "factuality",
+ "default": "factuality"
+ },
+ "factuality": {
+ "type": "object",
+ "properties": {
+ "aggregation_functions": {
+ "type": "array",
+ "items": {
+ "type": "string",
+ "enum": [
+ "average",
+ "median",
+ "categorical_count",
+ "accuracy"
+ ],
+ "title": "AggregationFunctionType",
+ "description": "A type of aggregation function."
+ }
+ }
+ },
+ "additionalProperties": false,
+ "required": [
+ "aggregation_functions"
+ ],
+ "title": "BasicGraderParams"
+ }
+ },
+ "additionalProperties": false,
+ "required": [
+ "type",
+ "factuality"
+ ],
+ "title": "FactualityGrader"
+ },
+ "FaithfulnessGrader": {
+ "type": "object",
+ "properties": {
+ "type": {
+ "type": "string",
+ "const": "faithfulness",
+ "default": "faithfulness"
+ },
+ "faithfulness": {
+ "type": "object",
+ "properties": {
+ "aggregation_functions": {
+ "type": "array",
+ "items": {
+ "type": "string",
+ "enum": [
+ "average",
+ "median",
+ "categorical_count",
+ "accuracy"
+ ],
+ "title": "AggregationFunctionType",
+ "description": "A type of aggregation function."
+ }
+ }
+ },
+ "additionalProperties": false,
+ "required": [
+ "aggregation_functions"
+ ],
+ "title": "BasicGraderParams"
+ }
+ },
+ "additionalProperties": false,
+ "required": [
+ "type",
+ "faithfulness"
+ ],
+ "title": "FaithfulnessGrader"
+ },
+ "Grader": {
+ "type": "object",
+ "properties": {
+ "identifier": {
+ "type": "string"
+ },
+ "provider_resource_id": {
+ "type": "string"
+ },
+ "provider_id": {
+ "type": "string"
+ },
+ "type": {
+ "type": "string",
+ "const": "grader",
+ "default": "grader"
+ },
+ "grader": {
+ "$ref": "#/components/schemas/GraderDefinition"
+ },
+ "description": {
+ "type": "string"
+ },
+ "metadata": {
+ "type": "object",
+ "additionalProperties": {
+ "oneOf": [
+ {
+ "type": "null"
+ },
+ {
+ "type": "boolean"
+ },
+ {
+ "type": "number"
+ },
+ {
+ "type": "string"
+ },
+ {
+ "type": "array"
+ },
+ {
+ "type": "object"
+ }
+ ]
+ }
+ }
+ },
+ "additionalProperties": false,
+ "required": [
+ "identifier",
+ "provider_resource_id",
+ "provider_id",
+ "type",
+ "grader",
+ "metadata"
+ ],
+ "title": "Grader"
+ },
+ "GraderDefinition": {
+ "oneOf": [
+ {
+ "$ref": "#/components/schemas/LlmGrader"
+ },
+ {
+ "$ref": "#/components/schemas/RegexParserGrader"
+ },
+ {
+ "$ref": "#/components/schemas/EqualityGrader"
+ },
+ {
+ "$ref": "#/components/schemas/SubsetOfGrader"
+ },
+ {
+ "$ref": "#/components/schemas/FactualityGrader"
+ },
+ {
+ "$ref": "#/components/schemas/FaithfulnessGrader"
+ }
+ ],
+ "discriminator": {
+ "propertyName": "type",
+ "mapping": {
+ "llm": "#/components/schemas/LlmGrader",
+ "regex_parser": "#/components/schemas/RegexParserGrader",
+ "equality": "#/components/schemas/EqualityGrader",
+ "subset_of": "#/components/schemas/SubsetOfGrader",
+ "factuality": "#/components/schemas/FactualityGrader",
+ "faithfulness": "#/components/schemas/FaithfulnessGrader"
+ }
+ }
+ },
+ "LlmGrader": {
+ "type": "object",
+ "properties": {
+ "type": {
+ "type": "string",
+ "const": "llm",
+ "default": "llm"
+ },
+ "llm": {
+ "type": "object",
+ "properties": {
+ "model": {
+ "type": "string"
+ },
+ "prompt": {
+ "type": "string"
+ },
+ "score_regexes": {
+ "type": "array",
+ "items": {
+ "type": "string"
+ }
+ },
+ "aggregation_functions": {
+ "type": "array",
+ "items": {
+ "type": "string",
+ "enum": [
+ "average",
+ "median",
+ "categorical_count",
+ "accuracy"
+ ],
+ "title": "AggregationFunctionType",
+ "description": "A type of aggregation function."
+ }
+ }
+ },
+ "additionalProperties": false,
+ "required": [
+ "model",
+ "prompt",
+ "score_regexes",
+ "aggregation_functions"
+ ],
+ "title": "LlmGraderParams"
+ }
+ },
+ "additionalProperties": false,
+ "required": [
+ "type",
+ "llm"
+ ],
+ "title": "LlmGrader"
+ },
+ "RegexParserGrader": {
+ "type": "object",
+ "properties": {
+ "type": {
+ "type": "string",
+ "const": "regex_parser",
+ "default": "regex_parser"
+ },
+ "regex_parser": {
+ "type": "object",
+ "properties": {
+ "parsing_regexes": {
+ "type": "array",
+ "items": {
+ "type": "string"
+ }
+ },
+ "aggregation_functions": {
+ "type": "array",
+ "items": {
+ "type": "string",
+ "enum": [
+ "average",
+ "median",
+ "categorical_count",
+ "accuracy"
+ ],
+ "title": "AggregationFunctionType",
+ "description": "A type of aggregation function."
+ }
+ }
+ },
+ "additionalProperties": false,
+ "required": [
+ "parsing_regexes",
+ "aggregation_functions"
+ ],
+ "title": "RegexParserGraderParams"
+ }
+ },
+ "additionalProperties": false,
+ "required": [
+ "type",
+ "regex_parser"
+ ],
+ "title": "RegexParserGrader"
+ },
+ "SubsetOfGrader": {
+ "type": "object",
+ "properties": {
+ "type": {
+ "type": "string",
+ "const": "subset_of",
+ "default": "subset_of"
+ },
+ "subset_of": {
+ "type": "object",
+ "properties": {
+ "aggregation_functions": {
+ "type": "array",
+ "items": {
+ "type": "string",
+ "enum": [
+ "average",
+ "median",
+ "categorical_count",
+ "accuracy"
+ ],
+ "title": "AggregationFunctionType",
+ "description": "A type of aggregation function."
+ }
+ }
+ },
+ "additionalProperties": false,
+ "required": [
+ "aggregation_functions"
+ ],
+ "title": "BasicGraderParams"
+ }
+ },
+ "additionalProperties": false,
+ "required": [
+ "type",
+ "subset_of"
+ ],
+ "title": "SubsetOfGrader"
+ },
"Model": {
"type": "object",
"properties": {
@@ -7047,268 +6925,6 @@
],
"title": "ModelType"
},
- "AgentTurnInputType": {
- "type": "object",
- "properties": {
- "type": {
- "type": "string",
- "const": "agent_turn_input",
- "default": "agent_turn_input"
- }
- },
- "additionalProperties": false,
- "required": [
- "type"
- ],
- "title": "AgentTurnInputType"
- },
- "ArrayType": {
- "type": "object",
- "properties": {
- "type": {
- "type": "string",
- "const": "array",
- "default": "array"
- }
- },
- "additionalProperties": false,
- "required": [
- "type"
- ],
- "title": "ArrayType"
- },
- "BooleanType": {
- "type": "object",
- "properties": {
- "type": {
- "type": "string",
- "const": "boolean",
- "default": "boolean"
- }
- },
- "additionalProperties": false,
- "required": [
- "type"
- ],
- "title": "BooleanType"
- },
- "ChatCompletionInputType": {
- "type": "object",
- "properties": {
- "type": {
- "type": "string",
- "const": "chat_completion_input",
- "default": "chat_completion_input"
- }
- },
- "additionalProperties": false,
- "required": [
- "type"
- ],
- "title": "ChatCompletionInputType"
- },
- "CompletionInputType": {
- "type": "object",
- "properties": {
- "type": {
- "type": "string",
- "const": "completion_input",
- "default": "completion_input"
- }
- },
- "additionalProperties": false,
- "required": [
- "type"
- ],
- "title": "CompletionInputType"
- },
- "JsonType": {
- "type": "object",
- "properties": {
- "type": {
- "type": "string",
- "const": "json",
- "default": "json"
- }
- },
- "additionalProperties": false,
- "required": [
- "type"
- ],
- "title": "JsonType"
- },
- "NumberType": {
- "type": "object",
- "properties": {
- "type": {
- "type": "string",
- "const": "number",
- "default": "number"
- }
- },
- "additionalProperties": false,
- "required": [
- "type"
- ],
- "title": "NumberType"
- },
- "ObjectType": {
- "type": "object",
- "properties": {
- "type": {
- "type": "string",
- "const": "object",
- "default": "object"
- }
- },
- "additionalProperties": false,
- "required": [
- "type"
- ],
- "title": "ObjectType"
- },
- "ParamType": {
- "oneOf": [
- {
- "$ref": "#/components/schemas/StringType"
- },
- {
- "$ref": "#/components/schemas/NumberType"
- },
- {
- "$ref": "#/components/schemas/BooleanType"
- },
- {
- "$ref": "#/components/schemas/ArrayType"
- },
- {
- "$ref": "#/components/schemas/ObjectType"
- },
- {
- "$ref": "#/components/schemas/JsonType"
- },
- {
- "$ref": "#/components/schemas/UnionType"
- },
- {
- "$ref": "#/components/schemas/ChatCompletionInputType"
- },
- {
- "$ref": "#/components/schemas/CompletionInputType"
- },
- {
- "$ref": "#/components/schemas/AgentTurnInputType"
- }
- ],
- "discriminator": {
- "propertyName": "type",
- "mapping": {
- "string": "#/components/schemas/StringType",
- "number": "#/components/schemas/NumberType",
- "boolean": "#/components/schemas/BooleanType",
- "array": "#/components/schemas/ArrayType",
- "object": "#/components/schemas/ObjectType",
- "json": "#/components/schemas/JsonType",
- "union": "#/components/schemas/UnionType",
- "chat_completion_input": "#/components/schemas/ChatCompletionInputType",
- "completion_input": "#/components/schemas/CompletionInputType",
- "agent_turn_input": "#/components/schemas/AgentTurnInputType"
- }
- }
- },
- "ScoringFn": {
- "type": "object",
- "properties": {
- "identifier": {
- "type": "string"
- },
- "provider_resource_id": {
- "type": "string"
- },
- "provider_id": {
- "type": "string"
- },
- "type": {
- "type": "string",
- "const": "scoring_function",
- "default": "scoring_function"
- },
- "description": {
- "type": "string"
- },
- "metadata": {
- "type": "object",
- "additionalProperties": {
- "oneOf": [
- {
- "type": "null"
- },
- {
- "type": "boolean"
- },
- {
- "type": "number"
- },
- {
- "type": "string"
- },
- {
- "type": "array"
- },
- {
- "type": "object"
- }
- ]
- }
- },
- "return_type": {
- "$ref": "#/components/schemas/ParamType"
- },
- "params": {
- "$ref": "#/components/schemas/ScoringFnParams"
- }
- },
- "additionalProperties": false,
- "required": [
- "identifier",
- "provider_resource_id",
- "provider_id",
- "type",
- "metadata",
- "return_type"
- ],
- "title": "ScoringFn"
- },
- "StringType": {
- "type": "object",
- "properties": {
- "type": {
- "type": "string",
- "const": "string",
- "default": "string"
- }
- },
- "additionalProperties": false,
- "required": [
- "type"
- ],
- "title": "StringType"
- },
- "UnionType": {
- "type": "object",
- "properties": {
- "type": {
- "type": "string",
- "const": "union",
- "default": "union"
- }
- },
- "additionalProperties": false,
- "required": [
- "type"
- ],
- "title": "UnionType"
- },
"Shield": {
"type": "object",
"properties": {
@@ -7707,16 +7323,6 @@
"title": "PostTrainingJobArtifactsResponse",
"description": "Artifacts of a finetuning job."
},
- "JobStatus": {
- "type": "string",
- "enum": [
- "completed",
- "in_progress",
- "failed",
- "scheduled"
- ],
- "title": "JobStatus"
- },
"PostTrainingJobStatusResponse": {
"type": "object",
"properties": {
@@ -7724,7 +7330,15 @@
"type": "string"
},
"status": {
- "$ref": "#/components/schemas/JobStatus"
+ "type": "string",
+ "enum": [
+ "completed",
+ "in_progress",
+ "failed",
+ "scheduled",
+ "cancelled"
+ ],
+ "title": "JobStatus"
},
"scheduled_at": {
"type": "string",
@@ -7840,6 +7454,363 @@
],
"title": "VectorDB"
},
+ "BenchmarkTask": {
+ "type": "object",
+ "properties": {
+ "type": {
+ "type": "string",
+ "const": "benchmark_id",
+ "default": "benchmark_id"
+ },
+ "benchmark_id": {
+ "type": "string"
+ }
+ },
+ "additionalProperties": false,
+ "required": [
+ "type",
+ "benchmark_id"
+ ],
+ "title": "BenchmarkTask"
+ },
+ "DataSourceGraderTask": {
+ "type": "object",
+ "properties": {
+ "type": {
+ "type": "string",
+ "const": "data_source_grader",
+ "default": "data_source_grader"
+ },
+ "data_source": {
+ "$ref": "#/components/schemas/DataSource"
+ },
+ "grader_ids": {
+ "type": "array",
+ "items": {
+ "type": "string"
+ }
+ }
+ },
+ "additionalProperties": false,
+ "required": [
+ "type",
+ "data_source",
+ "grader_ids"
+ ],
+ "title": "DataSourceGraderTask"
+ },
+ "DatasetGraderTask": {
+ "type": "object",
+ "properties": {
+ "type": {
+ "type": "string",
+ "const": "dataset_grader",
+ "default": "dataset_grader"
+ },
+ "dataset_id": {
+ "type": "string"
+ },
+ "grader_ids": {
+ "type": "array",
+ "items": {
+ "type": "string"
+ }
+ }
+ },
+ "additionalProperties": false,
+ "required": [
+ "type",
+ "dataset_id",
+ "grader_ids"
+ ],
+ "title": "DatasetGraderTask"
+ },
+ "EvaluationTask": {
+ "oneOf": [
+ {
+ "$ref": "#/components/schemas/BenchmarkTask"
+ },
+ {
+ "$ref": "#/components/schemas/DatasetGraderTask"
+ },
+ {
+ "$ref": "#/components/schemas/DataSourceGraderTask"
+ }
+ ],
+ "discriminator": {
+ "propertyName": "type",
+ "mapping": {
+ "benchmark_id": "#/components/schemas/BenchmarkTask",
+ "dataset_grader": "#/components/schemas/DatasetGraderTask",
+ "data_source_grader": "#/components/schemas/DataSourceGraderTask"
+ }
+ }
+ },
+ "GradeRequest": {
+ "type": "object",
+ "properties": {
+ "task": {
+ "$ref": "#/components/schemas/EvaluationTask",
+ "description": "The task to evaluate. One of: - BenchmarkTask: Run evaluation task against a benchmark_id - DatasetGraderTask: Run evaluation task against a dataset_id and a list of grader_ids - DataSourceGraderTask: Run evaluation task against a data source (e.g. rows, uri, etc.) and a list of grader_ids"
+ }
+ },
+ "additionalProperties": false,
+ "required": [
+ "task"
+ ],
+ "title": "GradeRequest"
+ },
+ "AgentCandidate": {
+ "type": "object",
+ "properties": {
+ "type": {
+ "type": "string",
+ "const": "agent",
+ "default": "agent"
+ },
+ "config": {
+ "$ref": "#/components/schemas/AgentConfig",
+ "description": "The configuration for the agent candidate."
+ }
+ },
+ "additionalProperties": false,
+ "required": [
+ "type",
+ "config"
+ ],
+ "title": "AgentCandidate",
+ "description": "An agent candidate for evaluation."
+ },
+ "EvaluationCandidate": {
+ "oneOf": [
+ {
+ "$ref": "#/components/schemas/ModelCandidate"
+ },
+ {
+ "$ref": "#/components/schemas/AgentCandidate"
+ }
+ ],
+ "discriminator": {
+ "propertyName": "type",
+ "mapping": {
+ "model": "#/components/schemas/ModelCandidate",
+ "agent": "#/components/schemas/AgentCandidate"
+ }
+ }
+ },
+ "EvaluationJob": {
+ "type": "object",
+ "properties": {
+ "id": {
+ "type": "string",
+ "description": "The ID of the job."
+ },
+ "status": {
+ "type": "string",
+ "enum": [
+ "completed",
+ "in_progress",
+ "failed",
+ "scheduled",
+ "cancelled"
+ ],
+ "description": "The status of the job."
+ },
+ "created_at": {
+ "type": "string",
+ "format": "date-time",
+ "description": "The time the job was created."
+ },
+ "ended_at": {
+ "type": "string",
+ "format": "date-time",
+ "description": "The time the job ended."
+ },
+ "error": {
+ "type": "string",
+ "description": "If status of the job is failed, this will contain the error message."
+ },
+ "type": {
+ "type": "string",
+ "const": "evaluation",
+ "default": "evaluation"
+ },
+ "task": {
+ "$ref": "#/components/schemas/EvaluationTask"
+ },
+ "candidate": {
+ "$ref": "#/components/schemas/EvaluationCandidate"
+ }
+ },
+ "additionalProperties": false,
+ "required": [
+ "id",
+ "status",
+ "created_at",
+ "type",
+ "task",
+ "candidate"
+ ],
+ "title": "EvaluationJob"
+ },
+ "ModelCandidate": {
+ "type": "object",
+ "properties": {
+ "type": {
+ "type": "string",
+ "const": "model",
+ "default": "model"
+ },
+ "model_id": {
+ "type": "string"
+ },
+ "sampling_params": {
+ "$ref": "#/components/schemas/SamplingParams",
+ "description": "The sampling parameters for the model."
+ },
+ "system_message": {
+ "$ref": "#/components/schemas/SystemMessage",
+ "description": "(Optional) The system message providing instructions or context to the model."
+ }
+ },
+ "additionalProperties": false,
+ "required": [
+ "type",
+ "model_id",
+ "sampling_params"
+ ],
+ "title": "ModelCandidate",
+ "description": "A model candidate for evaluation."
+ },
+ "GradeInlineRequest": {
+ "type": "object",
+ "properties": {
+ "task": {
+ "$ref": "#/components/schemas/EvaluationTask",
+ "description": "The task to evaluate. One of: - BenchmarkTask: Run evaluation task against a benchmark_id - DatasetGraderTask: Run evaluation task against a dataset_id and a list of grader_ids - DataSourceGraderTask: Run evaluation task against a data source (e.g. rows, uri, etc.) and a list of grader_ids"
+ }
+ },
+ "additionalProperties": false,
+ "required": [
+ "task"
+ ],
+ "title": "GradeInlineRequest"
+ },
+ "EvaluationResponse": {
+ "type": "object",
+ "properties": {
+ "generations": {
+ "type": "array",
+ "items": {
+ "type": "object",
+ "additionalProperties": {
+ "oneOf": [
+ {
+ "type": "null"
+ },
+ {
+ "type": "boolean"
+ },
+ {
+ "type": "number"
+ },
+ {
+ "type": "string"
+ },
+ {
+ "type": "array"
+ },
+ {
+ "type": "object"
+ }
+ ]
+ }
+ },
+ "description": "The generations in rows for the evaluation."
+ },
+ "scores": {
+ "type": "object",
+ "additionalProperties": {
+ "$ref": "#/components/schemas/ScoringResult"
+ },
+ "description": "The scores for the evaluation. Map of grader id to ScoringResult."
+ }
+ },
+ "additionalProperties": false,
+ "required": [
+ "generations",
+ "scores"
+ ],
+ "title": "EvaluationResponse",
+ "description": "A response to an inline evaluation."
+ },
+ "ScoringResult": {
+ "type": "object",
+ "properties": {
+ "scores": {
+ "type": "array",
+ "items": {
+ "type": "object",
+ "additionalProperties": {
+ "oneOf": [
+ {
+ "type": "null"
+ },
+ {
+ "type": "boolean"
+ },
+ {
+ "type": "number"
+ },
+ {
+ "type": "string"
+ },
+ {
+ "type": "array"
+ },
+ {
+ "type": "object"
+ }
+ ]
+ }
+ },
+ "description": "The scoring result for each row. Each row is a map of grader column name to value."
+ },
+ "metrics": {
+ "type": "object",
+ "additionalProperties": {
+ "oneOf": [
+ {
+ "type": "null"
+ },
+ {
+ "type": "boolean"
+ },
+ {
+ "type": "number"
+ },
+ {
+ "type": "string"
+ },
+ {
+ "type": "array"
+ },
+ {
+ "type": "object"
+ }
+ ]
+ },
+ "description": "Map of metric name to aggregated value."
+ }
+ },
+ "additionalProperties": false,
+ "required": [
+ "scores",
+ "metrics"
+ ],
+ "title": "ScoringResult",
+ "description": "The scoring result for a single grader: per-row scores plus aggregated metrics."
+ },
"HealthInfo": {
"type": "object",
"properties": {
@@ -8285,6 +8256,65 @@
"title": "ListFileResponse",
"description": "Response representing a list of file entries."
},
+ "GraderTypeInfo": {
+ "type": "object",
+ "properties": {
+ "grader_type": {
+ "type": "string",
+ "enum": [
+ "llm",
+ "regex_parser",
+ "equality",
+ "subset_of",
+ "factuality",
+ "faithfulness"
+ ],
+ "title": "GraderType",
+ "description": "A type of grader. Each type is a criteria for evaluating answers."
+ },
+ "description": {
+ "type": "string",
+ "description": "A description of the grader type. - E.g. Write your custom judge prompt to score the answer."
+ },
+ "supported_dataset_purposes": {
+ "type": "array",
+ "items": {
+ "type": "string",
+ "enum": [
+ "post-training/messages",
+ "eval/question-answer",
+ "eval/messages-answer"
+ ],
+ "title": "DatasetPurpose",
+ "description": "Purpose of the dataset. Each purpose has a required input data schema."
+ },
+ "description": "The purposes that this grader can be used for."
+ }
+ },
+ "additionalProperties": false,
+ "required": [
+ "grader_type",
+ "description",
+ "supported_dataset_purposes"
+ ],
+ "title": "GraderTypeInfo"
+ },
+ "ListGraderTypesResponse": {
+ "type": "object",
+ "properties": {
+ "data": {
+ "type": "array",
+ "items": {
+ "$ref": "#/components/schemas/GraderTypeInfo"
+ }
+ }
+ },
+ "additionalProperties": false,
+ "required": [
+ "data"
+ ],
+ "title": "ListGraderTypesResponse"
+ },
"ListModelsResponse": {
"type": "object",
"properties": {
@@ -8357,22 +8387,6 @@
],
"title": "ListRoutesResponse"
},
- "ListScoringFunctionsResponse": {
- "type": "object",
- "properties": {
- "data": {
- "type": "array",
- "items": {
- "$ref": "#/components/schemas/ScoringFn"
- }
- }
- },
- "additionalProperties": false,
- "required": [
- "data"
- ],
- "title": "ListScoringFunctionsResponse"
- },
"ListShieldsResponse": {
"type": "object",
"properties": {
@@ -9363,23 +9377,20 @@
"RegisterBenchmarkRequest": {
"type": "object",
"properties": {
- "benchmark_id": {
- "type": "string"
- },
"dataset_id": {
- "type": "string"
+ "type": "string",
+ "description": "The ID of the dataset used to run the benchmark."
},
- "scoring_functions": {
+ "grader_ids": {
"type": "array",
"items": {
"type": "string"
- }
+ },
+ "description": "List of grader ids to use for this benchmark."
},
- "provider_benchmark_id": {
- "type": "string"
- },
- "provider_id": {
- "type": "string"
+ "benchmark_id": {
+ "type": "string",
+ "description": "(Optional) The ID of the benchmark to register. If not provided, an ID will be generated."
},
"metadata": {
"type": "object",
@@ -9404,14 +9415,14 @@
"type": "object"
}
]
- }
+ },
+ "description": "(Optional) Metadata for this benchmark for additional descriptions."
}
},
"additionalProperties": false,
"required": [
- "benchmark_id",
"dataset_id",
- "scoring_functions"
+ "grader_ids"
],
"title": "RegisterBenchmarkRequest"
},
@@ -9469,6 +9480,50 @@
],
"title": "RegisterDatasetRequest"
},
+ "RegisterGraderRequest": {
+ "type": "object",
+ "properties": {
+ "grader": {
+ "$ref": "#/components/schemas/GraderDefinition",
+ "description": "The grader definition, E.g. - { \"type\": \"llm\", \"llm\": { \"model\": \"llama-405b\", \"prompt\": \"You are a judge. Score the answer based on the question. {question} {answer}\", } }"
+ },
+ "grader_id": {
+ "type": "string",
+ "description": "(Optional) The ID of the grader. If not provided, a random ID will be generated."
+ },
+ "metadata": {
+ "type": "object",
+ "additionalProperties": {
+ "oneOf": [
+ {
+ "type": "null"
+ },
+ {
+ "type": "boolean"
+ },
+ {
+ "type": "number"
+ },
+ {
+ "type": "string"
+ },
+ {
+ "type": "array"
+ },
+ {
+ "type": "object"
+ }
+ ]
+ },
+ "description": "(Optional) Any additional metadata for this grader. - E.g. { \"description\": \"A grader that scores the answer based on the question.\", }"
+ }
+ },
+ "additionalProperties": false,
+ "required": [
+ "grader"
+ ],
+ "title": "RegisterGraderRequest"
+ },
"RegisterModelRequest": {
"type": "object",
"properties": {
@@ -9516,36 +9571,6 @@
],
"title": "RegisterModelRequest"
},
- "RegisterScoringFunctionRequest": {
- "type": "object",
- "properties": {
- "scoring_fn_id": {
- "type": "string"
- },
- "description": {
- "type": "string"
- },
- "return_type": {
- "$ref": "#/components/schemas/ParamType"
- },
- "provider_scoring_fn_id": {
- "type": "string"
- },
- "provider_id": {
- "type": "string"
- },
- "params": {
- "$ref": "#/components/schemas/ScoringFnParams"
- }
- },
- "additionalProperties": false,
- "required": [
- "scoring_fn_id",
- "description",
- "return_type"
- ],
- "title": "RegisterScoringFunctionRequest"
- },
"RegisterShieldRequest": {
"type": "object",
"properties": {
@@ -9682,32 +9707,43 @@
],
"title": "ResumeAgentTurnRequest"
},
- "RunEvalRequest": {
+ "RunRequest": {
"type": "object",
"properties": {
- "benchmark_config": {
- "$ref": "#/components/schemas/BenchmarkConfig",
- "description": "The configuration for the benchmark."
+ "task": {
+ "$ref": "#/components/schemas/EvaluationTask",
+ "description": "The task to evaluate. One of: - BenchmarkTask: Run evaluation task against a benchmark_id - DatasetGraderTask: Run evaluation task against a dataset_id and a list of grader_ids - DataSourceGraderTask: Run evaluation task against a data source (e.g. rows, uri, etc.) and a list of grader_ids"
+ },
+ "candidate": {
+ "$ref": "#/components/schemas/EvaluationCandidate",
+ "description": "The candidate to evaluate."
}
},
"additionalProperties": false,
"required": [
- "benchmark_config"
+ "task",
+ "candidate"
],
- "title": "RunEvalRequest"
+ "title": "RunRequest"
},
- "Job": {
+ "RunInlineRequest": {
"type": "object",
"properties": {
- "job_id": {
- "type": "string"
+ "task": {
+ "$ref": "#/components/schemas/EvaluationTask",
+ "description": "The task to evaluate. One of: - BenchmarkTask: Run evaluation task against a benchmark_id - DatasetGraderTask: Run evaluation task against a dataset_id and a list of grader_ids - DataSourceGraderTask: Run evaluation task against a data source (e.g. rows, uri, etc.) and a list of grader_ids"
+ },
+ "candidate": {
+ "$ref": "#/components/schemas/EvaluationCandidate",
+ "description": "The candidate to evaluate."
}
},
"additionalProperties": false,
"required": [
- "job_id"
+ "task",
+ "candidate"
],
- "title": "Job"
+ "title": "RunInlineRequest"
},
"RunShieldRequest": {
"type": "object",
@@ -9795,128 +9831,6 @@
],
"title": "SaveSpansToDatasetRequest"
},
- "ScoreRequest": {
- "type": "object",
- "properties": {
- "input_rows": {
- "type": "array",
- "items": {
- "type": "object",
- "additionalProperties": {
- "oneOf": [
- {
- "type": "null"
- },
- {
- "type": "boolean"
- },
- {
- "type": "number"
- },
- {
- "type": "string"
- },
- {
- "type": "array"
- },
- {
- "type": "object"
- }
- ]
- }
- },
- "description": "The rows to score."
- },
- "scoring_functions": {
- "type": "object",
- "additionalProperties": {
- "oneOf": [
- {
- "$ref": "#/components/schemas/ScoringFnParams"
- },
- {
- "type": "null"
- }
- ]
- },
- "description": "The scoring functions to use for the scoring."
- }
- },
- "additionalProperties": false,
- "required": [
- "input_rows",
- "scoring_functions"
- ],
- "title": "ScoreRequest"
- },
- "ScoreResponse": {
- "type": "object",
- "properties": {
- "results": {
- "type": "object",
- "additionalProperties": {
- "$ref": "#/components/schemas/ScoringResult"
- },
- "description": "A map of scoring function name to ScoringResult."
- }
- },
- "additionalProperties": false,
- "required": [
- "results"
- ],
- "title": "ScoreResponse",
- "description": "The response from scoring."
- },
- "ScoreBatchRequest": {
- "type": "object",
- "properties": {
- "dataset_id": {
- "type": "string"
- },
- "scoring_functions": {
- "type": "object",
- "additionalProperties": {
- "oneOf": [
- {
- "$ref": "#/components/schemas/ScoringFnParams"
- },
- {
- "type": "null"
- }
- ]
- }
- },
- "save_results_dataset": {
- "type": "boolean"
- }
- },
- "additionalProperties": false,
- "required": [
- "dataset_id",
- "scoring_functions",
- "save_results_dataset"
- ],
- "title": "ScoreBatchRequest"
- },
- "ScoreBatchResponse": {
- "type": "object",
- "properties": {
- "dataset_id": {
- "type": "string"
- },
- "results": {
- "type": "object",
- "additionalProperties": {
- "$ref": "#/components/schemas/ScoringResult"
- }
- }
- },
- "additionalProperties": false,
- "required": [
- "results"
- ],
- "title": "ScoreBatchResponse"
- },
"AlgorithmConfig": {
"oneOf": [
{
@@ -10280,12 +10194,14 @@
"name": "Datasets"
},
{
- "name": "Eval",
- "x-displayName": "Llama Stack Evaluation API for running evaluations on model and agent candidates."
+ "name": "Evaluation"
},
{
"name": "Files"
},
+ {
+ "name": "Graders"
+ },
{
"name": "Inference",
"description": "This API provides the raw interface to the underlying models. Two kinds of models are supported:\n- LLM models: these models generate \"raw\" and \"chat\" (conversational) completions.\n- Embedding models: these models generate embeddings to be used for semantic search.",
@@ -10307,12 +10223,6 @@
{
"name": "Safety"
},
- {
- "name": "Scoring"
- },
- {
- "name": "ScoringFunctions"
- },
{
"name": "Shields"
},
@@ -10344,16 +10254,15 @@
"Benchmarks",
"DatasetIO",
"Datasets",
- "Eval",
+ "Evaluation",
"Files",
+ "Graders",
"Inference",
"Inspect",
"Models",
"PostTraining (Coming Soon)",
"Providers",
"Safety",
- "Scoring",
- "ScoringFunctions",
"Shields",
"SyntheticDataGeneration (Coming Soon)",
"Telemetry",
diff --git a/docs/_static/llama-stack-spec.yaml b/docs/_static/llama-stack-spec.yaml
index a3d4dbcc9..db92e7e6a 100644
--- a/docs/_static/llama-stack-spec.yaml
+++ b/docs/_static/llama-stack-spec.yaml
@@ -589,6 +589,59 @@ paths:
required: true
schema:
type: string
+ /v1/graders/{grader_id}:
+ get:
+ responses:
+ '200':
+ description: The grader.
+ content:
+ application/json:
+ schema:
+ $ref: '#/components/schemas/Grader'
+ '400':
+ $ref: '#/components/responses/BadRequest400'
+ '429':
+ $ref: >-
+ #/components/responses/TooManyRequests429
+ '500':
+ $ref: >-
+ #/components/responses/InternalServerError500
+ default:
+ $ref: '#/components/responses/DefaultError'
+ tags:
+ - Graders
+ description: Get a grader by ID.
+ parameters:
+ - name: grader_id
+ in: path
+ description: The ID of the grader.
+ required: true
+ schema:
+ type: string
+ delete:
+ responses:
+ '200':
+ description: OK
+ '400':
+ $ref: '#/components/responses/BadRequest400'
+ '429':
+ $ref: >-
+ #/components/responses/TooManyRequests429
+ '500':
+ $ref: >-
+ #/components/responses/InternalServerError500
+ default:
+ $ref: '#/components/responses/DefaultError'
+ tags:
+ - Graders
+ description: Delete a grader by ID.
+ parameters:
+ - name: grader_id
+ in: path
+ description: The ID of the grader.
+ required: true
+ schema:
+ type: string
/v1/inference/embeddings:
post:
responses:
@@ -622,43 +675,6 @@ paths:
schema:
$ref: '#/components/schemas/EmbeddingsRequest'
required: true
- /v1/eval/benchmarks/{benchmark_id}/evaluations:
- post:
- responses:
- '200':
- description: >-
- EvaluateResponse object containing generations and scores
- content:
- application/json:
- schema:
- $ref: '#/components/schemas/EvaluateResponse'
- '400':
- $ref: '#/components/responses/BadRequest400'
- '429':
- $ref: >-
- #/components/responses/TooManyRequests429
- '500':
- $ref: >-
- #/components/responses/InternalServerError500
- default:
- $ref: '#/components/responses/DefaultError'
- tags:
- - Eval
- description: Evaluate a list of rows on a benchmark.
- parameters:
- - name: benchmark_id
- in: path
- description: >-
- The ID of the benchmark to run the evaluation on.
- required: true
- schema:
- type: string
- requestBody:
- content:
- application/json:
- schema:
- $ref: '#/components/schemas/EvaluateRowsRequest'
- required: true
/v1/agents/{agent_id}/session/{session_id}/turn/{turn_id}/step/{step_id}:
get:
responses:
@@ -757,9 +773,7 @@ paths:
content:
application/json:
schema:
- oneOf:
- - $ref: '#/components/schemas/Benchmark'
- - type: 'null'
+ $ref: '#/components/schemas/Benchmark'
'400':
$ref: '#/components/responses/BadRequest400'
'429':
@@ -772,10 +786,11 @@ paths:
$ref: '#/components/responses/DefaultError'
tags:
- Benchmarks
- description: ''
+ description: Get a benchmark by ID.
parameters:
- name: benchmark_id
in: path
+ description: The ID of the benchmark to get.
required: true
schema:
type: string
@@ -885,36 +900,6 @@ paths:
required: true
schema:
type: string
- /v1/scoring-functions/{scoring_fn_id}:
- get:
- responses:
- '200':
- description: OK
- content:
- application/json:
- schema:
- oneOf:
- - $ref: '#/components/schemas/ScoringFn'
- - type: 'null'
- '400':
- $ref: '#/components/responses/BadRequest400'
- '429':
- $ref: >-
- #/components/responses/TooManyRequests429
- '500':
- $ref: >-
- #/components/responses/InternalServerError500
- default:
- $ref: '#/components/responses/DefaultError'
- tags:
- - ScoringFunctions
- description: ''
- parameters:
- - name: scoring_fn_id
- in: path
- required: true
- schema:
- type: string
/v1/shields/{identifier}:
get:
responses:
@@ -1326,6 +1311,70 @@ paths:
required: true
schema:
type: string
+ /v1/evaluation/grade:
+ post:
+ responses:
+ '200':
+ description: >-
+ The evaluation job containing grader scores.
+ content:
+ application/json:
+ schema:
+ $ref: '#/components/schemas/EvaluationJob'
+ '400':
+ $ref: '#/components/responses/BadRequest400'
+ '429':
+ $ref: >-
+ #/components/responses/TooManyRequests429
+ '500':
+ $ref: >-
+ #/components/responses/InternalServerError500
+ default:
+ $ref: '#/components/responses/DefaultError'
+ tags:
+ - Evaluation
+ description: >-
+ Run a grading job with generated results. Use this when you have generated
+ results from inference in a dataset.
+ parameters: []
+ requestBody:
+ content:
+ application/json:
+ schema:
+ $ref: '#/components/schemas/GradeRequest'
+ required: true
+ /v1/evaluation/grade_inline:
+ post:
+ responses:
+ '200':
+ description: >-
+ The evaluation response containing grader scores. "generations" is not populated
+ in the response.
+ content:
+ application/json:
+ schema:
+ $ref: '#/components/schemas/EvaluationResponse'
+ '400':
+ $ref: '#/components/responses/BadRequest400'
+ '429':
+ $ref: >-
+ #/components/responses/TooManyRequests429
+ '500':
+ $ref: >-
+ #/components/responses/InternalServerError500
+ default:
+ $ref: '#/components/responses/DefaultError'
+ tags:
+ - Evaluation
+ description: >-
+ Run a grading job with generated results inline.
+ parameters: []
+ requestBody:
+ content:
+ application/json:
+ schema:
+ $ref: '#/components/schemas/GradeInlineRequest'
+ required: true
/v1/health:
get:
responses:
@@ -1501,111 +1550,6 @@ paths:
required: false
schema:
type: integer
- /v1/eval/benchmarks/{benchmark_id}/jobs/{job_id}:
- get:
- responses:
- '200':
- description: The status of the evaluationjob.
- content:
- application/json:
- schema:
- oneOf:
- - $ref: '#/components/schemas/JobStatus'
- - type: 'null'
- '400':
- $ref: '#/components/responses/BadRequest400'
- '429':
- $ref: >-
- #/components/responses/TooManyRequests429
- '500':
- $ref: >-
- #/components/responses/InternalServerError500
- default:
- $ref: '#/components/responses/DefaultError'
- tags:
- - Eval
- description: Get the status of a job.
- parameters:
- - name: benchmark_id
- in: path
- description: >-
- The ID of the benchmark to run the evaluation on.
- required: true
- schema:
- type: string
- - name: job_id
- in: path
- description: The ID of the job to get the status of.
- required: true
- schema:
- type: string
- delete:
- responses:
- '200':
- description: OK
- '400':
- $ref: '#/components/responses/BadRequest400'
- '429':
- $ref: >-
- #/components/responses/TooManyRequests429
- '500':
- $ref: >-
- #/components/responses/InternalServerError500
- default:
- $ref: '#/components/responses/DefaultError'
- tags:
- - Eval
- description: Cancel a job.
- parameters:
- - name: benchmark_id
- in: path
- description: >-
- The ID of the benchmark to run the evaluation on.
- required: true
- schema:
- type: string
- - name: job_id
- in: path
- description: The ID of the job to cancel.
- required: true
- schema:
- type: string
- /v1/eval/benchmarks/{benchmark_id}/jobs/{job_id}/result:
- get:
- responses:
- '200':
- description: The result of the job.
- content:
- application/json:
- schema:
- $ref: '#/components/schemas/EvaluateResponse'
- '400':
- $ref: '#/components/responses/BadRequest400'
- '429':
- $ref: >-
- #/components/responses/TooManyRequests429
- '500':
- $ref: >-
- #/components/responses/InternalServerError500
- default:
- $ref: '#/components/responses/DefaultError'
- tags:
- - Eval
- description: Get the result of a job.
- parameters:
- - name: benchmark_id
- in: path
- description: >-
- The ID of the benchmark to run the evaluation on.
- required: true
- schema:
- type: string
- - name: job_id
- in: path
- description: The ID of the job to get the result of.
- required: true
- schema:
- type: string
/v1/agents/{agent_id}/sessions:
get:
responses:
@@ -1657,12 +1601,16 @@ paths:
$ref: '#/components/responses/DefaultError'
tags:
- Benchmarks
- description: ''
+ description: List all benchmarks.
parameters: []
post:
responses:
'200':
description: OK
+ content:
+ application/json:
+ schema:
+ $ref: '#/components/schemas/Benchmark'
'400':
$ref: '#/components/responses/BadRequest400'
'429':
@@ -1675,7 +1623,7 @@ paths:
$ref: '#/components/responses/DefaultError'
tags:
- Benchmarks
- description: ''
+ description: Register a new benchmark.
parameters: []
requestBody:
content:
@@ -1763,6 +1711,81 @@ paths:
required: true
schema:
type: string
+ /v1/graders/types:
+ get:
+ responses:
+ '200':
+ description: >-
+ A list of grader types and information about the types.
+ content:
+ application/json:
+ schema:
+ $ref: '#/components/schemas/ListGraderTypesResponse'
+ '400':
+ $ref: '#/components/responses/BadRequest400'
+ '429':
+ $ref: >-
+ #/components/responses/TooManyRequests429
+ '500':
+ $ref: >-
+ #/components/responses/InternalServerError500
+ default:
+ $ref: '#/components/responses/DefaultError'
+ tags:
+ - Graders
+ description: List all grader types.
+ parameters: []
+ /v1/graders:
+ get:
+ responses:
+ '200':
+ description: A list of graders.
+ content:
+ application/jsonl:
+ schema:
+ $ref: '#/components/schemas/Grader'
+ '400':
+ $ref: '#/components/responses/BadRequest400'
+ '429':
+ $ref: >-
+ #/components/responses/TooManyRequests429
+ '500':
+ $ref: >-
+ #/components/responses/InternalServerError500
+ default:
+ $ref: '#/components/responses/DefaultError'
+ tags:
+ - Graders
+ description: List all graders.
+ parameters: []
+ post:
+ responses:
+ '200':
+ description: The registered grader.
+ content:
+ application/json:
+ schema:
+ $ref: '#/components/schemas/Grader'
+ '400':
+ $ref: '#/components/responses/BadRequest400'
+ '429':
+ $ref: >-
+ #/components/responses/TooManyRequests429
+ '500':
+ $ref: >-
+ #/components/responses/InternalServerError500
+ default:
+ $ref: '#/components/responses/DefaultError'
+ tags:
+ - Graders
+ description: Register a new grader.
+ parameters: []
+ requestBody:
+ content:
+ application/json:
+ schema:
+ $ref: '#/components/schemas/RegisterGraderRequest'
+ required: true
/v1/models:
get:
responses:
@@ -1893,53 +1916,6 @@ paths:
required: false
schema:
$ref: '#/components/schemas/URL'
- /v1/scoring-functions:
- get:
- responses:
- '200':
- description: OK
- content:
- application/json:
- schema:
- $ref: '#/components/schemas/ListScoringFunctionsResponse'
- '400':
- $ref: '#/components/responses/BadRequest400'
- '429':
- $ref: >-
- #/components/responses/TooManyRequests429
- '500':
- $ref: >-
- #/components/responses/InternalServerError500
- default:
- $ref: '#/components/responses/DefaultError'
- tags:
- - ScoringFunctions
- description: ''
- parameters: []
- post:
- responses:
- '200':
- description: OK
- '400':
- $ref: '#/components/responses/BadRequest400'
- '429':
- $ref: >-
- #/components/responses/TooManyRequests429
- '500':
- $ref: >-
- #/components/responses/InternalServerError500
- default:
- $ref: '#/components/responses/DefaultError'
- tags:
- - ScoringFunctions
- description: ''
- parameters: []
- requestBody:
- content:
- application/json:
- schema:
- $ref: '#/components/schemas/RegisterScoringFunctionRequest'
- required: true
/v1/shields:
get:
responses:
@@ -2345,16 +2321,15 @@ paths:
schema:
$ref: '#/components/schemas/ResumeAgentTurnRequest'
required: true
- /v1/eval/benchmarks/{benchmark_id}/jobs:
+ /v1/evaluation/run:
post:
responses:
'200':
- description: >-
- The job that was created to run the evaluation.
+ description: OK
content:
application/json:
schema:
- $ref: '#/components/schemas/Job'
+ $ref: '#/components/schemas/EvaluationJob'
'400':
$ref: '#/components/responses/BadRequest400'
'429':
@@ -2366,21 +2341,43 @@ paths:
default:
$ref: '#/components/responses/DefaultError'
tags:
- - Eval
- description: Run an evaluation on a benchmark.
- parameters:
- - name: benchmark_id
- in: path
- description: >-
- The ID of the benchmark to run the evaluation on.
- required: true
- schema:
- type: string
+ - Evaluation
+ description: Run an evaluation job.
+ parameters: []
requestBody:
content:
application/json:
schema:
- $ref: '#/components/schemas/RunEvalRequest'
+ $ref: '#/components/schemas/RunRequest'
+ required: true
+ /v1/evaluation/run_inline:
+ post:
+ responses:
+ '200':
+ description: OK
+ content:
+ application/json:
+ schema:
+ $ref: '#/components/schemas/EvaluationResponse'
+ '400':
+ $ref: '#/components/responses/BadRequest400'
+ '429':
+ $ref: >-
+ #/components/responses/TooManyRequests429
+ '500':
+ $ref: >-
+ #/components/responses/InternalServerError500
+ default:
+ $ref: '#/components/responses/DefaultError'
+ tags:
+ - Evaluation
+ description: Run an evaluation job inline.
+ parameters: []
+ requestBody:
+ content:
+ application/json:
+ schema:
+ $ref: '#/components/schemas/RunInlineRequest'
required: true
/v1/safety/run-shield:
post:
@@ -2436,65 +2433,6 @@ paths:
schema:
$ref: '#/components/schemas/SaveSpansToDatasetRequest'
required: true
- /v1/scoring/score:
- post:
- responses:
- '200':
- description: >-
- ScoreResponse object containing rows and aggregated results
- content:
- application/json:
- schema:
- $ref: '#/components/schemas/ScoreResponse'
- '400':
- $ref: '#/components/responses/BadRequest400'
- '429':
- $ref: >-
- #/components/responses/TooManyRequests429
- '500':
- $ref: >-
- #/components/responses/InternalServerError500
- default:
- $ref: '#/components/responses/DefaultError'
- tags:
- - Scoring
- description: Score a list of rows.
- parameters: []
- requestBody:
- content:
- application/json:
- schema:
- $ref: '#/components/schemas/ScoreRequest'
- required: true
- /v1/scoring/score-batch:
- post:
- responses:
- '200':
- description: OK
- content:
- application/json:
- schema:
- $ref: '#/components/schemas/ScoreBatchResponse'
- '400':
- $ref: '#/components/responses/BadRequest400'
- '429':
- $ref: >-
- #/components/responses/TooManyRequests429
- '500':
- $ref: >-
- #/components/responses/InternalServerError500
- default:
- $ref: '#/components/responses/DefaultError'
- tags:
- - Scoring
- description: ''
- parameters: []
- requestBody:
- content:
- application/json:
- schema:
- $ref: '#/components/schemas/ScoreBatchRequest'
- required: true
/v1/post-training/supervised-fine-tune:
post:
responses:
@@ -4384,251 +4322,6 @@ components:
title: EmbeddingsResponse
description: >-
Response containing generated embeddings.
- AgentCandidate:
- type: object
- properties:
- type:
- type: string
- const: agent
- default: agent
- config:
- $ref: '#/components/schemas/AgentConfig'
- description: >-
- The configuration for the agent candidate.
- additionalProperties: false
- required:
- - type
- - config
- title: AgentCandidate
- description: An agent candidate for evaluation.
- AggregationFunctionType:
- type: string
- enum:
- - average
- - median
- - categorical_count
- - accuracy
- title: AggregationFunctionType
- BasicScoringFnParams:
- type: object
- properties:
- type:
- type: string
- const: basic
- default: basic
- aggregation_functions:
- type: array
- items:
- $ref: '#/components/schemas/AggregationFunctionType'
- additionalProperties: false
- required:
- - type
- title: BasicScoringFnParams
- BenchmarkConfig:
- type: object
- properties:
- eval_candidate:
- $ref: '#/components/schemas/EvalCandidate'
- description: The candidate to evaluate.
- scoring_params:
- type: object
- additionalProperties:
- $ref: '#/components/schemas/ScoringFnParams'
- description: >-
- Map between scoring function id and parameters for each scoring function
- you want to run
- num_examples:
- type: integer
- description: >-
- (Optional) The number of examples to evaluate. If not provided, all examples
- in the dataset will be evaluated
- additionalProperties: false
- required:
- - eval_candidate
- - scoring_params
- title: BenchmarkConfig
- description: >-
- A benchmark configuration for evaluation.
- EvalCandidate:
- oneOf:
- - $ref: '#/components/schemas/ModelCandidate'
- - $ref: '#/components/schemas/AgentCandidate'
- discriminator:
- propertyName: type
- mapping:
- model: '#/components/schemas/ModelCandidate'
- agent: '#/components/schemas/AgentCandidate'
- LLMAsJudgeScoringFnParams:
- type: object
- properties:
- type:
- type: string
- const: llm_as_judge
- default: llm_as_judge
- judge_model:
- type: string
- prompt_template:
- type: string
- judge_score_regexes:
- type: array
- items:
- type: string
- aggregation_functions:
- type: array
- items:
- $ref: '#/components/schemas/AggregationFunctionType'
- additionalProperties: false
- required:
- - type
- - judge_model
- title: LLMAsJudgeScoringFnParams
- ModelCandidate:
- type: object
- properties:
- type:
- type: string
- const: model
- default: model
- model:
- type: string
- description: The model ID to evaluate.
- sampling_params:
- $ref: '#/components/schemas/SamplingParams'
- description: The sampling parameters for the model.
- system_message:
- $ref: '#/components/schemas/SystemMessage'
- description: >-
- (Optional) The system message providing instructions or context to the
- model.
- additionalProperties: false
- required:
- - type
- - model
- - sampling_params
- title: ModelCandidate
- description: A model candidate for evaluation.
- RegexParserScoringFnParams:
- type: object
- properties:
- type:
- type: string
- const: regex_parser
- default: regex_parser
- parsing_regexes:
- type: array
- items:
- type: string
- aggregation_functions:
- type: array
- items:
- $ref: '#/components/schemas/AggregationFunctionType'
- additionalProperties: false
- required:
- - type
- title: RegexParserScoringFnParams
- ScoringFnParams:
- oneOf:
- - $ref: '#/components/schemas/LLMAsJudgeScoringFnParams'
- - $ref: '#/components/schemas/RegexParserScoringFnParams'
- - $ref: '#/components/schemas/BasicScoringFnParams'
- discriminator:
- propertyName: type
- mapping:
- llm_as_judge: '#/components/schemas/LLMAsJudgeScoringFnParams'
- regex_parser: '#/components/schemas/RegexParserScoringFnParams'
- basic: '#/components/schemas/BasicScoringFnParams'
- EvaluateRowsRequest:
- type: object
- properties:
- input_rows:
- type: array
- items:
- type: object
- additionalProperties:
- oneOf:
- - type: 'null'
- - type: boolean
- - type: number
- - type: string
- - type: array
- - type: object
- description: The rows to evaluate.
- scoring_functions:
- type: array
- items:
- type: string
- description: >-
- The scoring functions to use for the evaluation.
- benchmark_config:
- $ref: '#/components/schemas/BenchmarkConfig'
- description: The configuration for the benchmark.
- additionalProperties: false
- required:
- - input_rows
- - scoring_functions
- - benchmark_config
- title: EvaluateRowsRequest
- EvaluateResponse:
- type: object
- properties:
- generations:
- type: array
- items:
- type: object
- additionalProperties:
- oneOf:
- - type: 'null'
- - type: boolean
- - type: number
- - type: string
- - type: array
- - type: object
- description: The generations from the evaluation.
- scores:
- type: object
- additionalProperties:
- $ref: '#/components/schemas/ScoringResult'
- description: The scores from the evaluation.
- additionalProperties: false
- required:
- - generations
- - scores
- title: EvaluateResponse
- description: The response from an evaluation.
- ScoringResult:
- type: object
- properties:
- score_rows:
- type: array
- items:
- type: object
- additionalProperties:
- oneOf:
- - type: 'null'
- - type: boolean
- - type: number
- - type: string
- - type: array
- - type: object
- description: >-
- The scoring result for each row. Each row is a map of column name to value.
- aggregated_results:
- type: object
- additionalProperties:
- oneOf:
- - type: 'null'
- - type: boolean
- - type: number
- - type: string
- - type: array
- - type: object
- description: Map of metric name to aggregated value
- additionalProperties: false
- required:
- - score_rows
- - aggregated_results
- title: ScoringResult
- description: A scoring result for a single row.
Agent:
type: object
properties:
@@ -4703,10 +4396,14 @@ components:
default: benchmark
dataset_id:
type: string
- scoring_functions:
+ description: >-
+ The ID of the dataset used to run the benchmark.
+ grader_ids:
type: array
items:
type: string
+ description: >-
+ The grader ids to use for this benchmark.
metadata:
type: object
additionalProperties:
@@ -4717,6 +4414,8 @@ components:
- type: string
- type: array
- type: object
+ description: >-
+ Metadata for this benchmark for additional descriptions.
additionalProperties: false
required:
- identifier
@@ -4724,7 +4423,7 @@ components:
- provider_id
- type
- dataset_id
- - scoring_functions
+ - grader_ids
- metadata
title: Benchmark
DataSource:
@@ -4828,6 +4527,255 @@ components:
title: URIDataSource
description: >-
A dataset that can be obtained from a URI.
+ EqualityGrader:
+ type: object
+ properties:
+ type:
+ type: string
+ const: equality
+ default: equality
+ equality:
+ type: object
+ properties:
+ aggregation_functions:
+ type: array
+ items:
+ type: string
+ enum:
+ - average
+ - median
+ - categorical_count
+ - accuracy
+ title: AggregationFunctionType
+ description: A type of aggregation function.
+ additionalProperties: false
+ required:
+ - aggregation_functions
+ title: BasicGraderParams
+ additionalProperties: false
+ required:
+ - type
+ - equality
+ title: EqualityGrader
+ FactualityGrader:
+ type: object
+ properties:
+ type:
+ type: string
+ const: factuality
+ default: factuality
+ factuality:
+ type: object
+ properties:
+ aggregation_functions:
+ type: array
+ items:
+ type: string
+ enum:
+ - average
+ - median
+ - categorical_count
+ - accuracy
+ title: AggregationFunctionType
+ description: A type of aggregation function.
+ additionalProperties: false
+ required:
+ - aggregation_functions
+ title: BasicGraderParams
+ additionalProperties: false
+ required:
+ - type
+ - factuality
+ title: FactualityGrader
+ FaithfulnessGrader:
+ type: object
+ properties:
+ type:
+ type: string
+ const: faithfulness
+ default: faithfulness
+ faithfulness:
+ type: object
+ properties:
+ aggregation_functions:
+ type: array
+ items:
+ type: string
+ enum:
+ - average
+ - median
+ - categorical_count
+ - accuracy
+ title: AggregationFunctionType
+ description: A type of aggregation function.
+ additionalProperties: false
+ required:
+ - aggregation_functions
+ title: BasicGraderParams
+ additionalProperties: false
+ required:
+ - type
+ - faithfulness
+ title: FaithfulnessGrader
+ Grader:
+ type: object
+ properties:
+ identifier:
+ type: string
+ provider_resource_id:
+ type: string
+ provider_id:
+ type: string
+ type:
+ type: string
+ const: grader
+ default: grader
+ grader:
+ $ref: '#/components/schemas/GraderDefinition'
+ description:
+ type: string
+ metadata:
+ type: object
+ additionalProperties:
+ oneOf:
+ - type: 'null'
+ - type: boolean
+ - type: number
+ - type: string
+ - type: array
+ - type: object
+ additionalProperties: false
+ required:
+ - identifier
+ - provider_resource_id
+ - provider_id
+ - type
+ - grader
+ - metadata
+ title: Grader
+ GraderDefinition:
+ oneOf:
+ - $ref: '#/components/schemas/LlmGrader'
+ - $ref: '#/components/schemas/RegexParserGrader'
+ - $ref: '#/components/schemas/EqualityGrader'
+ - $ref: '#/components/schemas/SubsetOfGrader'
+ - $ref: '#/components/schemas/FactualityGrader'
+ - $ref: '#/components/schemas/FaithfulnessGrader'
+ discriminator:
+ propertyName: type
+ mapping:
+ llm: '#/components/schemas/LlmGrader'
+ regex_parser: '#/components/schemas/RegexParserGrader'
+ equality: '#/components/schemas/EqualityGrader'
+ subset_of: '#/components/schemas/SubsetOfGrader'
+ factuality: '#/components/schemas/FactualityGrader'
+ faithfulness: '#/components/schemas/FaithfulnessGrader'
+ LlmGrader:
+ type: object
+ properties:
+ type:
+ type: string
+ const: llm
+ default: llm
+ llm:
+ type: object
+ properties:
+ model:
+ type: string
+ prompt:
+ type: string
+ score_regexes:
+ type: array
+ items:
+ type: string
+ aggregation_functions:
+ type: array
+ items:
+ type: string
+ enum:
+ - average
+ - median
+ - categorical_count
+ - accuracy
+ title: AggregationFunctionType
+ description: A type of aggregation function.
+ additionalProperties: false
+ required:
+ - model
+ - prompt
+ - score_regexes
+ - aggregation_functions
+ title: LlmGraderParams
+ additionalProperties: false
+ required:
+ - type
+ - llm
+ title: LlmGrader
+ RegexParserGrader:
+ type: object
+ properties:
+ type:
+ type: string
+ const: regex_parser
+ default: regex_parser
+ regex_parser:
+ type: object
+ properties:
+ parsing_regexes:
+ type: array
+ items:
+ type: string
+ aggregation_functions:
+ type: array
+ items:
+ type: string
+ enum:
+ - average
+ - median
+ - categorical_count
+ - accuracy
+ title: AggregationFunctionType
+ description: A type of aggregation function.
+ additionalProperties: false
+ required:
+ - parsing_regexes
+ - aggregation_functions
+ title: RegexParserGraderParams
+ additionalProperties: false
+ required:
+ - type
+ - regex_parser
+ title: RegexParserGrader
+ SubsetOfGrader:
+ type: object
+ properties:
+ type:
+ type: string
+ const: subset_of
+ default: subset_of
+ subset_of:
+ type: object
+ properties:
+ aggregation_functions:
+ type: array
+ items:
+ type: string
+ enum:
+ - average
+ - median
+ - categorical_count
+ - accuracy
+ title: AggregationFunctionType
+ description: A type of aggregation function.
+ additionalProperties: false
+ required:
+ - aggregation_functions
+ title: BasicGraderParams
+ additionalProperties: false
+ required:
+ - type
+ - subset_of
+ title: SubsetOfGrader
Model:
type: object
properties:
@@ -4869,179 +4817,6 @@ components:
- llm
- embedding
title: ModelType
- AgentTurnInputType:
- type: object
- properties:
- type:
- type: string
- const: agent_turn_input
- default: agent_turn_input
- additionalProperties: false
- required:
- - type
- title: AgentTurnInputType
- ArrayType:
- type: object
- properties:
- type:
- type: string
- const: array
- default: array
- additionalProperties: false
- required:
- - type
- title: ArrayType
- BooleanType:
- type: object
- properties:
- type:
- type: string
- const: boolean
- default: boolean
- additionalProperties: false
- required:
- - type
- title: BooleanType
- ChatCompletionInputType:
- type: object
- properties:
- type:
- type: string
- const: chat_completion_input
- default: chat_completion_input
- additionalProperties: false
- required:
- - type
- title: ChatCompletionInputType
- CompletionInputType:
- type: object
- properties:
- type:
- type: string
- const: completion_input
- default: completion_input
- additionalProperties: false
- required:
- - type
- title: CompletionInputType
- JsonType:
- type: object
- properties:
- type:
- type: string
- const: json
- default: json
- additionalProperties: false
- required:
- - type
- title: JsonType
- NumberType:
- type: object
- properties:
- type:
- type: string
- const: number
- default: number
- additionalProperties: false
- required:
- - type
- title: NumberType
- ObjectType:
- type: object
- properties:
- type:
- type: string
- const: object
- default: object
- additionalProperties: false
- required:
- - type
- title: ObjectType
- ParamType:
- oneOf:
- - $ref: '#/components/schemas/StringType'
- - $ref: '#/components/schemas/NumberType'
- - $ref: '#/components/schemas/BooleanType'
- - $ref: '#/components/schemas/ArrayType'
- - $ref: '#/components/schemas/ObjectType'
- - $ref: '#/components/schemas/JsonType'
- - $ref: '#/components/schemas/UnionType'
- - $ref: '#/components/schemas/ChatCompletionInputType'
- - $ref: '#/components/schemas/CompletionInputType'
- - $ref: '#/components/schemas/AgentTurnInputType'
- discriminator:
- propertyName: type
- mapping:
- string: '#/components/schemas/StringType'
- number: '#/components/schemas/NumberType'
- boolean: '#/components/schemas/BooleanType'
- array: '#/components/schemas/ArrayType'
- object: '#/components/schemas/ObjectType'
- json: '#/components/schemas/JsonType'
- union: '#/components/schemas/UnionType'
- chat_completion_input: '#/components/schemas/ChatCompletionInputType'
- completion_input: '#/components/schemas/CompletionInputType'
- agent_turn_input: '#/components/schemas/AgentTurnInputType'
- ScoringFn:
- type: object
- properties:
- identifier:
- type: string
- provider_resource_id:
- type: string
- provider_id:
- type: string
- type:
- type: string
- const: scoring_function
- default: scoring_function
- description:
- type: string
- metadata:
- type: object
- additionalProperties:
- oneOf:
- - type: 'null'
- - type: boolean
- - type: number
- - type: string
- - type: array
- - type: object
- return_type:
- $ref: '#/components/schemas/ParamType'
- params:
- $ref: '#/components/schemas/ScoringFnParams'
- additionalProperties: false
- required:
- - identifier
- - provider_resource_id
- - provider_id
- - type
- - metadata
- - return_type
- title: ScoringFn
- StringType:
- type: object
- properties:
- type:
- type: string
- const: string
- default: string
- additionalProperties: false
- required:
- - type
- title: StringType
- UnionType:
- type: object
- properties:
- type:
- type: string
- const: union
- default: union
- additionalProperties: false
- required:
- - type
- title: UnionType
Shield:
type: object
properties:
@@ -5292,21 +5067,20 @@ components:
- checkpoints
title: PostTrainingJobArtifactsResponse
description: Artifacts of a finetuning job.
- JobStatus:
- type: string
- enum:
- - completed
- - in_progress
- - failed
- - scheduled
- title: JobStatus
PostTrainingJobStatusResponse:
type: object
properties:
job_uuid:
type: string
status:
- $ref: '#/components/schemas/JobStatus'
+ type: string
+ enum:
+ - completed
+ - in_progress
+ - failed
+ - scheduled
+ - cancelled
+ title: JobStatus
scheduled_at:
type: string
format: date-time
@@ -5381,6 +5155,255 @@ components:
- embedding_model
- embedding_dimension
title: VectorDB
+ BenchmarkTask:
+ type: object
+ properties:
+ type:
+ type: string
+ const: benchmark_id
+ default: benchmark_id
+ benchmark_id:
+ type: string
+ additionalProperties: false
+ required:
+ - type
+ - benchmark_id
+ title: BenchmarkTask
+ DataSourceGraderTask:
+ type: object
+ properties:
+ type:
+ type: string
+ const: data_source_grader
+ default: data_source_grader
+ data_source:
+ $ref: '#/components/schemas/DataSource'
+ grader_ids:
+ type: array
+ items:
+ type: string
+ additionalProperties: false
+ required:
+ - type
+ - data_source
+ - grader_ids
+ title: DataSourceGraderTask
+ DatasetGraderTask:
+ type: object
+ properties:
+ type:
+ type: string
+ const: dataset_grader
+ default: dataset_grader
+ dataset_id:
+ type: string
+ grader_ids:
+ type: array
+ items:
+ type: string
+ additionalProperties: false
+ required:
+ - type
+ - dataset_id
+ - grader_ids
+ title: DatasetGraderTask
+ EvaluationTask:
+ oneOf:
+ - $ref: '#/components/schemas/BenchmarkTask'
+ - $ref: '#/components/schemas/DatasetGraderTask'
+ - $ref: '#/components/schemas/DataSourceGraderTask'
+ discriminator:
+ propertyName: type
+ mapping:
+ benchmark_id: '#/components/schemas/BenchmarkTask'
+ dataset_grader: '#/components/schemas/DatasetGraderTask'
+ data_source_grader: '#/components/schemas/DataSourceGraderTask'
+ GradeRequest:
+ type: object
+ properties:
+ task:
+ $ref: '#/components/schemas/EvaluationTask'
+ description: >-
+ The task to evaluate. One of: - BenchmarkTask: Run evaluation task against
+ a benchmark_id - DatasetGraderTask: Run evaluation task against a dataset_id
+ and a list of grader_ids - DataSourceGraderTask: Run evaluation task against
+ a data source (e.g. rows, uri, etc.) and a list of grader_ids
+ additionalProperties: false
+ required:
+ - task
+ title: GradeRequest
+ AgentCandidate:
+ type: object
+ properties:
+ type:
+ type: string
+ const: agent
+ default: agent
+ config:
+ $ref: '#/components/schemas/AgentConfig'
+ description: >-
+ The configuration for the agent candidate.
+ additionalProperties: false
+ required:
+ - type
+ - config
+ title: AgentCandidate
+ description: An agent candidate for evaluation.
+ EvaluationCandidate:
+ oneOf:
+ - $ref: '#/components/schemas/ModelCandidate'
+ - $ref: '#/components/schemas/AgentCandidate'
+ discriminator:
+ propertyName: type
+ mapping:
+ model: '#/components/schemas/ModelCandidate'
+ agent: '#/components/schemas/AgentCandidate'
+ EvaluationJob:
+ type: object
+ properties:
+ id:
+ type: string
+ description: The ID of the job.
+ status:
+ type: string
+ enum:
+ - completed
+ - in_progress
+ - failed
+ - scheduled
+ - cancelled
+ description: The status of the job.
+ created_at:
+ type: string
+ format: date-time
+ description: The time the job was created.
+ ended_at:
+ type: string
+ format: date-time
+ description: The time the job ended.
+ error:
+ type: string
+ description: >-
+ If status of the job is failed, this will contain the error message.
+ type:
+ type: string
+ const: evaluation
+ default: evaluation
+ task:
+ $ref: '#/components/schemas/EvaluationTask'
+ candidate:
+ $ref: '#/components/schemas/EvaluationCandidate'
+ additionalProperties: false
+ required:
+ - id
+ - status
+ - created_at
+ - type
+ - task
+ - candidate
+ title: EvaluationJob
+ ModelCandidate:
+ type: object
+ properties:
+ type:
+ type: string
+ const: model
+ default: model
+ model_id:
+ type: string
+ sampling_params:
+ $ref: '#/components/schemas/SamplingParams'
+ description: The sampling parameters for the model.
+ system_message:
+ $ref: '#/components/schemas/SystemMessage'
+ description: >-
+ (Optional) The system message providing instructions or context to the
+ model.
+ additionalProperties: false
+ required:
+ - type
+ - model_id
+ - sampling_params
+ title: ModelCandidate
+ description: A model candidate for evaluation.
+ GradeInlineRequest:
+ type: object
+ properties:
+ task:
+ $ref: '#/components/schemas/EvaluationTask'
+ description: >-
+ The task to evaluate. One of: - BenchmarkTask: Run evaluation task against
+ a benchmark_id - DatasetGraderTask: Run evaluation task against a dataset_id
+ and a list of grader_ids - DataSourceGraderTask: Run evaluation task against
+ a data source (e.g. rows, uri, etc.) and a list of grader_ids
+ additionalProperties: false
+ required:
+ - task
+ title: GradeInlineRequest
+ EvaluationResponse:
+ type: object
+ properties:
+ generations:
+ type: array
+ items:
+ type: object
+ additionalProperties:
+ oneOf:
+ - type: 'null'
+ - type: boolean
+ - type: number
+ - type: string
+ - type: array
+ - type: object
+ description: >-
+ The generations in rows for the evaluation.
+ scores:
+ type: object
+ additionalProperties:
+ $ref: '#/components/schemas/ScoringResult'
+ description: >-
+ The scores for the evaluation. Map of grader id to ScoringResult.
+ additionalProperties: false
+ required:
+ - generations
+ - scores
+ title: EvaluationResponse
+ description: A response to an inline evaluation.
+ ScoringResult:
+ type: object
+ properties:
+ scores:
+ type: array
+ items:
+ type: object
+ additionalProperties:
+ oneOf:
+ - type: 'null'
+ - type: boolean
+ - type: number
+ - type: string
+ - type: array
+ - type: object
+ description: >-
+ The scoring result for each row. Each row is a map of grader column name
+ to value.
+ metrics:
+ type: object
+ additionalProperties:
+ oneOf:
+ - type: 'null'
+ - type: boolean
+ - type: number
+ - type: string
+ - type: array
+ - type: object
+ description: Map of metric name to aggregated value.
+ additionalProperties: false
+ required:
+ - scores
+ - metrics
+ title: ScoringResult
+ description: A scoring result containing per-row scores and aggregated metrics.
HealthInfo:
type: object
properties:
@@ -5648,6 +5671,56 @@ components:
title: ListFileResponse
description: >-
Response representing a list of file entries.
+ GraderTypeInfo:
+ type: object
+ properties:
+ grader_type:
+ type: string
+ enum:
+ - llm
+ - regex_parser
+ - equality
+ - subset_of
+ - factuality
+ - faithfulness
+ title: GraderType
+ description: >-
+ A type of grader. Each type is a criterion for evaluating answers.
+ description:
+ type: string
+ description: >-
+ A description of the grader type. - E.g. Write your custom judge prompt
+ to score the answer.
+ supported_dataset_purposes:
+ type: array
+ items:
+ type: string
+ enum:
+ - post-training/messages
+ - eval/question-answer
+ - eval/messages-answer
+ title: DatasetPurpose
+ description: >-
+ Purpose of the dataset. Each purpose has a required input data schema.
+ description: >-
+ The purposes that this grader can be used for.
+ additionalProperties: false
+ required:
+ - grader_type
+ - description
+ - supported_dataset_purposes
+ title: GraderTypeInfo
+ ListGraderTypesResponse:
+ type: object
+ properties:
+ data:
+ type: array
+ items:
+ $ref: '#/components/schemas/GraderTypeInfo'
+ additionalProperties: false
+ required:
+ - data
+ title: ListGraderTypesResponse
ListModelsResponse:
type: object
properties:
@@ -5698,17 +5771,6 @@ components:
required:
- data
title: ListRoutesResponse
- ListScoringFunctionsResponse:
- type: object
- properties:
- data:
- type: array
- items:
- $ref: '#/components/schemas/ScoringFn'
- additionalProperties: false
- required:
- - data
- title: ListScoringFunctionsResponse
ListShieldsResponse:
type: object
properties:
@@ -6343,18 +6405,21 @@ components:
RegisterBenchmarkRequest:
type: object
properties:
- benchmark_id:
- type: string
dataset_id:
type: string
- scoring_functions:
+ description: >-
+ The ID of the dataset used to run the benchmark.
+ grader_ids:
type: array
items:
type: string
- provider_benchmark_id:
- type: string
- provider_id:
+ description: >-
+ List of grader ids to use for this benchmark.
+ benchmark_id:
type: string
+ description: >-
+ (Optional) The ID of the benchmark to register. If not provided, an ID
+ will be generated.
metadata:
type: object
additionalProperties:
@@ -6365,11 +6430,12 @@ components:
- type: string
- type: array
- type: object
+ description: >-
+ (Optional) Metadata for this benchmark for additional descriptions.
additionalProperties: false
required:
- - benchmark_id
- dataset_id
- - scoring_functions
+ - grader_ids
title: RegisterBenchmarkRequest
RegisterDatasetRequest:
type: object
@@ -6422,6 +6488,37 @@ components:
- purpose
- source
title: RegisterDatasetRequest
+ RegisterGraderRequest:
+ type: object
+ properties:
+ grader:
+ $ref: '#/components/schemas/GraderDefinition'
+ description: >-
+ The grader definition, E.g. - { "type": "llm", "llm": { "model": "llama-405b",
+ "prompt": "You are a judge. Score the answer based on the question. {question}
+ {answer}", } }
+ grader_id:
+ type: string
+ description: >-
+ (Optional) The ID of the grader. If not provided, a random ID will be
+ generated.
+ metadata:
+ type: object
+ additionalProperties:
+ oneOf:
+ - type: 'null'
+ - type: boolean
+ - type: number
+ - type: string
+ - type: array
+ - type: object
+ description: >-
+ (Optional) Any additional metadata for this grader. - E.g. { "description":
+ "A grader that scores the answer based on the question.", }
+ additionalProperties: false
+ required:
+ - grader
+ title: RegisterGraderRequest
RegisterModelRequest:
type: object
properties:
@@ -6447,27 +6544,6 @@ components:
required:
- model_id
title: RegisterModelRequest
- RegisterScoringFunctionRequest:
- type: object
- properties:
- scoring_fn_id:
- type: string
- description:
- type: string
- return_type:
- $ref: '#/components/schemas/ParamType'
- provider_scoring_fn_id:
- type: string
- provider_id:
- type: string
- params:
- $ref: '#/components/schemas/ScoringFnParams'
- additionalProperties: false
- required:
- - scoring_fn_id
- - description
- - return_type
- title: RegisterScoringFunctionRequest
RegisterShieldRequest:
type: object
properties:
@@ -6549,25 +6625,42 @@ components:
required:
- tool_responses
title: ResumeAgentTurnRequest
- RunEvalRequest:
+ RunRequest:
type: object
properties:
- benchmark_config:
- $ref: '#/components/schemas/BenchmarkConfig'
- description: The configuration for the benchmark.
+ task:
+ $ref: '#/components/schemas/EvaluationTask'
+ description: >-
+ The task to evaluate. One of: - BenchmarkTask: Run evaluation task against
+ a benchmark_id - DatasetGraderTask: Run evaluation task against a dataset_id
+ and a list of grader_ids - DataSourceGraderTask: Run evaluation task against
+ a data source (e.g. rows, uri, etc.) and a list of grader_ids
+ candidate:
+ $ref: '#/components/schemas/EvaluationCandidate'
+ description: The candidate to evaluate.
additionalProperties: false
required:
- - benchmark_config
- title: RunEvalRequest
- Job:
+ - task
+ - candidate
+ title: RunRequest
+ RunInlineRequest:
type: object
properties:
- job_id:
- type: string
+ task:
+ $ref: '#/components/schemas/EvaluationTask'
+ description: >-
+ The task to evaluate. One of: - BenchmarkTask: Run evaluation task against
+ a benchmark_id - DatasetGraderTask: Run evaluation task against a dataset_id
+ and a list of grader_ids - DataSourceGraderTask: Run evaluation task against
+ a data source (e.g. rows, uri, etc.) and a list of grader_ids
+ candidate:
+ $ref: '#/components/schemas/EvaluationCandidate'
+ description: The candidate to evaluate.
additionalProperties: false
required:
- - job_id
- title: Job
+ - task
+ - candidate
+ title: RunInlineRequest
RunShieldRequest:
type: object
properties:
@@ -6621,81 +6714,6 @@ components:
- attributes_to_save
- dataset_id
title: SaveSpansToDatasetRequest
- ScoreRequest:
- type: object
- properties:
- input_rows:
- type: array
- items:
- type: object
- additionalProperties:
- oneOf:
- - type: 'null'
- - type: boolean
- - type: number
- - type: string
- - type: array
- - type: object
- description: The rows to score.
- scoring_functions:
- type: object
- additionalProperties:
- oneOf:
- - $ref: '#/components/schemas/ScoringFnParams'
- - type: 'null'
- description: >-
- The scoring functions to use for the scoring.
- additionalProperties: false
- required:
- - input_rows
- - scoring_functions
- title: ScoreRequest
- ScoreResponse:
- type: object
- properties:
- results:
- type: object
- additionalProperties:
- $ref: '#/components/schemas/ScoringResult'
- description: >-
- A map of scoring function name to ScoringResult.
- additionalProperties: false
- required:
- - results
- title: ScoreResponse
- description: The response from scoring.
- ScoreBatchRequest:
- type: object
- properties:
- dataset_id:
- type: string
- scoring_functions:
- type: object
- additionalProperties:
- oneOf:
- - $ref: '#/components/schemas/ScoringFnParams'
- - type: 'null'
- save_results_dataset:
- type: boolean
- additionalProperties: false
- required:
- - dataset_id
- - scoring_functions
- - save_results_dataset
- title: ScoreBatchRequest
- ScoreBatchResponse:
- type: object
- properties:
- dataset_id:
- type: string
- results:
- type: object
- additionalProperties:
- $ref: '#/components/schemas/ScoringResult'
- additionalProperties: false
- required:
- - results
- title: ScoreBatchResponse
AlgorithmConfig:
oneOf:
- $ref: '#/components/schemas/LoraFinetuningConfig'
@@ -6933,10 +6951,9 @@ tags:
- name: Benchmarks
- name: DatasetIO
- name: Datasets
- - name: Eval
- x-displayName: >-
- Llama Stack Evaluation API for running evaluations on model and agent candidates.
+ - name: Evaluation
- name: Files
+ - name: Graders
- name: Inference
description: >-
This API provides the raw interface to the underlying models. Two kinds of models
@@ -6956,8 +6973,6 @@ tags:
x-displayName: >-
Providers API for inspecting, listing, and modifying providers and their configurations.
- name: Safety
- - name: Scoring
- - name: ScoringFunctions
- name: Shields
- name: SyntheticDataGeneration (Coming Soon)
- name: Telemetry
@@ -6973,16 +6988,15 @@ x-tagGroups:
- Benchmarks
- DatasetIO
- Datasets
- - Eval
+ - Evaluation
- Files
+ - Graders
- Inference
- Inspect
- Models
- PostTraining (Coming Soon)
- Providers
- Safety
- - Scoring
- - ScoringFunctions
- Shields
- SyntheticDataGeneration (Coming Soon)
- Telemetry
diff --git a/llama_stack/apis/benchmarks/benchmarks.py b/llama_stack/apis/benchmarks/benchmarks.py
index 39ba355e9..eaaf8530b 100644
--- a/llama_stack/apis/benchmarks/benchmarks.py
+++ b/llama_stack/apis/benchmarks/benchmarks.py
@@ -12,11 +12,17 @@ from llama_stack.schema_utils import json_schema_type, webmethod
class CommonBenchmarkFields(BaseModel):
+ """
+ :param dataset_id: The ID of the dataset used to run the benchmark.
+ :param grader_ids: The grader ids to use for this benchmark.
+ :param metadata: Metadata for this benchmark for additional descriptions.
+ """
+
dataset_id: str
- scoring_functions: List[str]
+ grader_ids: List[str]
metadata: Dict[str, Any] = Field(
default_factory=dict,
- description="Metadata for this evaluation task",
+ description="Metadata for this benchmark",
)
@@ -45,22 +51,39 @@ class ListBenchmarksResponse(BaseModel):
@runtime_checkable
class Benchmarks(Protocol):
+ @webmethod(route="/eval/benchmarks", method="POST")
+ async def register_benchmark(
+ self,
+ dataset_id: str,
+ grader_ids: List[str],
+ benchmark_id: Optional[str] = None,
+ metadata: Optional[Dict[str, Any]] = None,
+ ) -> Benchmark:
+ """
+ Register a new benchmark.
+
+ :param dataset_id: The ID of the dataset used to run the benchmark.
+ :param grader_ids: List of grader ids to use for this benchmark.
+ :param benchmark_id: (Optional) The ID of the benchmark to register. If not provided, an ID will be generated.
+ :param metadata: (Optional) Metadata for this benchmark for additional descriptions.
+ """
+ ...
+
@webmethod(route="/eval/benchmarks", method="GET")
- async def list_benchmarks(self) -> ListBenchmarksResponse: ...
+ async def list_benchmarks(self) -> ListBenchmarksResponse:
+ """
+ List all benchmarks.
+ """
+ ...
@webmethod(route="/eval/benchmarks/{benchmark_id}", method="GET")
async def get_benchmark(
self,
benchmark_id: str,
- ) -> Optional[Benchmark]: ...
+ ) -> Benchmark:
+ """
+ Get a benchmark by ID.
- @webmethod(route="/eval/benchmarks", method="POST")
- async def register_benchmark(
- self,
- benchmark_id: str,
- dataset_id: str,
- scoring_functions: List[str],
- provider_benchmark_id: Optional[str] = None,
- provider_id: Optional[str] = None,
- metadata: Optional[Dict[str, Any]] = None,
- ) -> None: ...
+ :param benchmark_id: The ID of the benchmark to get.
+ """
+ ...
diff --git a/llama_stack/apis/common/job_types.py b/llama_stack/apis/common/job_types.py
index bc070017b..e27f19493 100644
--- a/llama_stack/apis/common/job_types.py
+++ b/llama_stack/apis/common/job_types.py
@@ -3,21 +3,49 @@
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
+from datetime import datetime
from enum import Enum
+from typing import Optional
from pydantic import BaseModel
from llama_stack.schema_utils import json_schema_type
-@json_schema_type
-class Job(BaseModel):
- job_id: str
+class JobType(Enum):
+ batch_inference = "batch_inference"
+ evaluation = "evaluation"
+ finetuning = "finetuning"
-@json_schema_type
class JobStatus(Enum):
completed = "completed"
in_progress = "in_progress"
failed = "failed"
scheduled = "scheduled"
+ cancelled = "cancelled"
+
+
+class JobArtifact(BaseModel):
+ """
+ A job artifact is a file or directory that is produced by a job.
+ """
+
+ path: str
+
+
+@json_schema_type
+class CommonJobFields(BaseModel):
+ """Common fields for all jobs.
+ :param id: The ID of the job.
+ :param status: The status of the job.
+ :param created_at: The time the job was created.
+ :param ended_at: The time the job ended.
+ :param error: If status of the job is failed, this will contain the error message.
+ """
+
+ id: str
+ status: JobStatus
+ created_at: datetime
+ ended_at: Optional[datetime] = None
+ error: Optional[str] = None
diff --git a/llama_stack/apis/eval/eval.py b/llama_stack/apis/eval/eval.py
index dec018d83..5b4433041 100644
--- a/llama_stack/apis/eval/eval.py
+++ b/llama_stack/apis/eval/eval.py
@@ -10,7 +10,7 @@ from pydantic import BaseModel, Field
from typing_extensions import Annotated
from llama_stack.apis.agents import AgentConfig
-from llama_stack.apis.common.job_types import Job, JobStatus
+from llama_stack.apis.common.job_types import JobStatus
from llama_stack.apis.inference import SamplingParams, SystemMessage
from llama_stack.apis.scoring import ScoringResult
from llama_stack.apis.scoring_functions import ScoringFnParams
@@ -91,7 +91,7 @@ class Eval(Protocol):
self,
benchmark_id: str,
benchmark_config: BenchmarkConfig,
- ) -> Job:
+ ) -> None:
"""Run an evaluation on a benchmark.
:param benchmark_id: The ID of the benchmark to run the evaluation on.
@@ -135,7 +135,9 @@ class Eval(Protocol):
"""
...
- @webmethod(route="/eval/benchmarks/{benchmark_id}/jobs/{job_id}/result", method="GET")
+ @webmethod(
+ route="/eval/benchmarks/{benchmark_id}/jobs/{job_id}/result", method="GET"
+ )
async def job_result(self, benchmark_id: str, job_id: str) -> EvaluateResponse:
"""Get the result of a job.
diff --git a/llama_stack/apis/evaluation/__init__.py b/llama_stack/apis/evaluation/__init__.py
new file mode 100644
index 000000000..9a168a2bc
--- /dev/null
+++ b/llama_stack/apis/evaluation/__init__.py
@@ -0,0 +1,7 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+from .evaluation import * # noqa: F401 F403
diff --git a/llama_stack/apis/evaluation/evaluation.py b/llama_stack/apis/evaluation/evaluation.py
new file mode 100644
index 000000000..444495b6e
--- /dev/null
+++ b/llama_stack/apis/evaluation/evaluation.py
@@ -0,0 +1,175 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+from typing import Any, Dict, List, Literal, Optional, Protocol, Union
+
+from pydantic import BaseModel, Field
+from typing_extensions import Annotated
+
+from llama_stack.apis.agents import AgentConfig
+from llama_stack.apis.common.job_types import CommonJobFields, JobType
+from llama_stack.apis.datasets import DataSource
+from llama_stack.apis.inference import SamplingParams, SystemMessage
+from llama_stack.schema_utils import json_schema_type, register_schema, webmethod
+
+
+@json_schema_type
+class ModelCandidate(BaseModel):
+ """A model candidate for evaluation.
+
+ :param model_id: The model ID to evaluate.
+ :param sampling_params: The sampling parameters for the model.
+ :param system_message: (Optional) The system message providing instructions or context to the model.
+ """
+
+ type: Literal["model"] = "model"
+ model_id: str
+ sampling_params: SamplingParams
+ system_message: Optional[SystemMessage] = None
+
+
+@json_schema_type
+class AgentCandidate(BaseModel):
+ """An agent candidate for evaluation.
+
+ :param config: The configuration for the agent candidate.
+ """
+
+ type: Literal["agent"] = "agent"
+ config: AgentConfig
+
+
+EvaluationCandidate = register_schema(
+ Annotated[Union[ModelCandidate, AgentCandidate], Field(discriminator="type")],
+ name="EvaluationCandidate",
+)
+
+
+@json_schema_type
+class BenchmarkTask(BaseModel):
+ type: Literal["benchmark_id"] = "benchmark_id"
+ benchmark_id: str
+
+
+@json_schema_type
+class DatasetGraderTask(BaseModel):
+ type: Literal["dataset_grader"] = "dataset_grader"
+ dataset_id: str
+ grader_ids: List[str]
+
+
+@json_schema_type
+class DataSourceGraderTask(BaseModel):
+ type: Literal["data_source_grader"] = "data_source_grader"
+ data_source: DataSource
+ grader_ids: List[str]
+
+
+EvaluationTask = register_schema(
+ Annotated[
+ Union[BenchmarkTask, DatasetGraderTask, DataSourceGraderTask],
+ Field(discriminator="type"),
+ ],
+ name="EvaluationTask",
+)
+
+
+@json_schema_type
+class EvaluationJob(CommonJobFields):
+ type: Literal[JobType.evaluation.value] = JobType.evaluation.value
+
+ # input params for the submitted evaluation job
+ task: EvaluationTask
+ candidate: EvaluationCandidate
+
+
+@json_schema_type
+class ScoringResult(BaseModel):
+ """
+ The scoring result of a single grader: per-row scores plus aggregated metrics.
+
+ :param scores: The scoring result for each row. Each row is a map of grader column name to value.
+ :param metrics: Map of metric name to aggregated value.
+ """
+
+ scores: List[Dict[str, Any]]
+ metrics: Dict[str, Any]
+
+
+@json_schema_type
+class EvaluationResponse(BaseModel):
+ """
+ A response to an inline evaluation.
+
+ :param generations: The generations in rows for the evaluation.
+ :param scores: The scores for the evaluation. Map of grader id to ScoringResult.
+ """
+
+ generations: List[Dict[str, Any]]
+ scores: Dict[str, ScoringResult]
+
+
+class Evaluation(Protocol):
+ @webmethod(route="/evaluation/run", method="POST")
+ async def run(
+ self,
+ task: EvaluationTask,
+ candidate: EvaluationCandidate,
+ ) -> EvaluationJob:
+ """
+ Run an evaluation job.
+
+ :param task: The task to evaluate. One of:
+ - BenchmarkTask: Run evaluation task against a benchmark_id
+ - DatasetGraderTask: Run evaluation task against a dataset_id and a list of grader_ids
+ - DataSourceGraderTask: Run evaluation task against a data source (e.g. rows, uri, etc.) and a list of grader_ids
+ :param candidate: The candidate to evaluate.
+ """
+ ...
+
+ @webmethod(route="/evaluation/run_inline", method="POST")
+ async def run_inline(
+ self,
+ task: EvaluationTask,
+ candidate: EvaluationCandidate,
+ ) -> EvaluationResponse:
+ """
+ Run an evaluation job inline.
+
+ :param task: The task to evaluate. One of:
+ - BenchmarkTask: Run evaluation task against a benchmark_id
+ - DatasetGraderTask: Run evaluation task against a dataset_id and a list of grader_ids
+ - DataSourceGraderTask: Run evaluation task against a data source (e.g. rows, uri, etc.) and a list of grader_ids
+ :param candidate: The candidate to evaluate.
+ """
+ ...
+
+ @webmethod(route="/evaluation/grade", method="POST")
+ async def grade(self, task: EvaluationTask) -> EvaluationJob:
+ """
+ Run a grading job with generated results. Use this when you have generated results from inference in a dataset.
+
+ :param task: The task to evaluate. One of:
+ - BenchmarkTask: Run evaluation task against a benchmark_id
+ - DatasetGraderTask: Run evaluation task against a dataset_id and a list of grader_ids
+ - DataSourceGraderTask: Run evaluation task against a data source (e.g. rows, uri, etc.) and a list of grader_ids
+
+ :return: The evaluation job containing grader scores.
+ """
+ ...
+
+ @webmethod(route="/evaluation/grade_inline", method="POST")
+ async def grade_inline(self, task: EvaluationTask) -> EvaluationResponse:
+ """
+ Run a grading job with generated results inline.
+
+ :param task: The task to evaluate. One of:
+ - BenchmarkTask: Run evaluation task against a benchmark_id
+ - DatasetGraderTask: Run evaluation task against a dataset_id and a list of grader_ids
+ - DataSourceGraderTask: Run evaluation task against a data source (e.g. rows, uri, etc.) and a list of grader_ids
+
+ :return: The evaluation response containing grader scores. "generations" is not populated in the response.
+ """
+ ...
diff --git a/llama_stack/apis/graders/__init__.py b/llama_stack/apis/graders/__init__.py
new file mode 100644
index 000000000..b5791cb88
--- /dev/null
+++ b/llama_stack/apis/graders/__init__.py
@@ -0,0 +1,7 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+from .graders import * # noqa: F401 F403
diff --git a/llama_stack/distribution/stack.py b/llama_stack/distribution/stack.py
index 9c9289a77..cd1c58348 100644
--- a/llama_stack/distribution/stack.py
+++ b/llama_stack/distribution/stack.py
@@ -17,16 +17,15 @@ from llama_stack.apis.batch_inference import BatchInference
from llama_stack.apis.benchmarks import Benchmarks
from llama_stack.apis.datasetio import DatasetIO
from llama_stack.apis.datasets import Datasets
-from llama_stack.apis.eval import Eval
+from llama_stack.apis.evaluation import Evaluation
from llama_stack.apis.files import Files
+from llama_stack.apis.graders import Graders
from llama_stack.apis.inference import Inference
from llama_stack.apis.inspect import Inspect
from llama_stack.apis.models import Models
from llama_stack.apis.post_training import PostTraining
from llama_stack.apis.providers import Providers
from llama_stack.apis.safety import Safety
-from llama_stack.apis.scoring import Scoring
-from llama_stack.apis.scoring_functions import ScoringFunctions
from llama_stack.apis.shields import Shields
from llama_stack.apis.synthetic_data_generation import SyntheticDataGeneration
from llama_stack.apis.telemetry import Telemetry
@@ -56,10 +55,7 @@ class LlamaStack(
Telemetry,
PostTraining,
VectorIO,
- Eval,
Benchmarks,
- Scoring,
- ScoringFunctions,
DatasetIO,
Models,
Shields,
@@ -68,6 +64,8 @@ class LlamaStack(
ToolRuntime,
RAGToolRuntime,
Files,
+ Graders,
+ Evaluation,
):
pass
@@ -113,7 +111,9 @@ class EnvVarError(Exception):
def __init__(self, var_name: str, path: str = ""):
self.var_name = var_name
self.path = path
- super().__init__(f"Environment variable '{var_name}' not set or empty{f' at {path}' if path else ''}")
+ super().__init__(
+ f"Environment variable '{var_name}' not set or empty{f' at {path}' if path else ''}"
+ )
def redact_sensitive_fields(data: Dict[str, Any]) -> Dict[str, Any]:
@@ -204,7 +204,9 @@ def validate_env_pair(env_pair: str) -> tuple[str, str]:
if not key:
raise ValueError(f"Empty key in environment variable pair: {env_pair}")
if not all(c.isalnum() or c == "_" for c in key):
- raise ValueError(f"Key must contain only alphanumeric characters and underscores: {key}")
+ raise ValueError(
+ f"Key must contain only alphanumeric characters and underscores: {key}"
+ )
return key, value
except ValueError as e:
raise ValueError(
@@ -217,14 +219,20 @@ def validate_env_pair(env_pair: str) -> tuple[str, str]:
async def construct_stack(
run_config: StackRunConfig, provider_registry: Optional[ProviderRegistry] = None
) -> Dict[Api, Any]:
- dist_registry, _ = await create_dist_registry(run_config.metadata_store, run_config.image_name)
- impls = await resolve_impls(run_config, provider_registry or get_provider_registry(), dist_registry)
+ dist_registry, _ = await create_dist_registry(
+ run_config.metadata_store, run_config.image_name
+ )
+ impls = await resolve_impls(
+ run_config, provider_registry or get_provider_registry(), dist_registry
+ )
await register_resources(run_config, impls)
return impls
def get_stack_run_config_from_template(template: str) -> StackRunConfig:
- template_path = importlib.resources.files("llama_stack") / f"templates/{template}/run.yaml"
+ template_path = (
+ importlib.resources.files("llama_stack") / f"templates/{template}/run.yaml"
+ )
with importlib.resources.as_file(template_path) as path:
if not path.exists():
@@ -267,7 +275,9 @@ def run_config_from_adhoc_config_spec(
# call method "sample_run_config" on the provider spec config class
provider_config_type = instantiate_class_type(provider_spec.config_class)
- provider_config = replace_env_vars(provider_config_type.sample_run_config(__distro_dir__=distro_dir))
+ provider_config = replace_env_vars(
+ provider_config_type.sample_run_config(__distro_dir__=distro_dir)
+ )
provider_configs_by_api[api_str] = [
Provider(