diff --git a/docs/_static/llama-stack-spec.html b/docs/_static/llama-stack-spec.html
index 8de7f86de..cb5959e22 100644
--- a/docs/_static/llama-stack-spec.html
+++ b/docs/_static/llama-stack-spec.html
@@ -866,83 +866,6 @@
]
}
},
- "/v1/graders/{grader_id}": {
- "get": {
- "responses": {
- "200": {
- "description": "The grader.",
- "content": {
- "application/json": {
- "schema": {
- "$ref": "#/components/schemas/Grader"
- }
- }
- }
- },
- "400": {
- "$ref": "#/components/responses/BadRequest400"
- },
- "429": {
- "$ref": "#/components/responses/TooManyRequests429"
- },
- "500": {
- "$ref": "#/components/responses/InternalServerError500"
- },
- "default": {
- "$ref": "#/components/responses/DefaultError"
- }
- },
- "tags": [
- "Graders"
- ],
- "description": "Get a grader by ID.",
- "parameters": [
- {
- "name": "grader_id",
- "in": "path",
- "description": "The ID of the grader.",
- "required": true,
- "schema": {
- "type": "string"
- }
- }
- ]
- },
- "delete": {
- "responses": {
- "200": {
- "description": "OK"
- },
- "400": {
- "$ref": "#/components/responses/BadRequest400"
- },
- "429": {
- "$ref": "#/components/responses/TooManyRequests429"
- },
- "500": {
- "$ref": "#/components/responses/InternalServerError500"
- },
- "default": {
- "$ref": "#/components/responses/DefaultError"
- }
- },
- "tags": [
- "Graders"
- ],
- "description": "Delete a grader by ID.",
- "parameters": [
- {
- "name": "grader_id",
- "in": "path",
- "description": "The ID of the grader.",
- "required": true,
- "schema": {
- "type": "string"
- }
- }
- ]
- }
- },
"/v1/inference/embeddings": {
"post": {
"responses": {
@@ -986,6 +909,59 @@
}
}
},
+ "/v1/eval/benchmarks/{benchmark_id}/evaluations": {
+ "post": {
+ "responses": {
+ "200": {
+ "description": "EvaluateResponse object containing generations and scores",
+ "content": {
+ "application/json": {
+ "schema": {
+ "$ref": "#/components/schemas/EvaluateResponse"
+ }
+ }
+ }
+ },
+ "400": {
+ "$ref": "#/components/responses/BadRequest400"
+ },
+ "429": {
+ "$ref": "#/components/responses/TooManyRequests429"
+ },
+ "500": {
+ "$ref": "#/components/responses/InternalServerError500"
+ },
+ "default": {
+ "$ref": "#/components/responses/DefaultError"
+ }
+ },
+ "tags": [
+ "Eval"
+ ],
+ "description": "Evaluate a list of rows on a benchmark.",
+ "parameters": [
+ {
+ "name": "benchmark_id",
+ "in": "path",
+ "description": "The ID of the benchmark to run the evaluation on.",
+ "required": true,
+ "schema": {
+ "type": "string"
+ }
+ }
+ ],
+ "requestBody": {
+ "content": {
+ "application/json": {
+ "schema": {
+ "$ref": "#/components/schemas/EvaluateRowsRequest"
+ }
+ }
+ },
+ "required": true
+ }
+ }
+ },
"/v1/agents/{agent_id}/session/{session_id}/turn/{turn_id}/step/{step_id}": {
"get": {
"responses": {
@@ -1158,6 +1134,39 @@
}
}
]
+ },
+ "delete": {
+ "responses": {
+ "200": {
+ "description": "OK"
+ },
+ "400": {
+ "$ref": "#/components/responses/BadRequest400"
+ },
+ "429": {
+ "$ref": "#/components/responses/TooManyRequests429"
+ },
+ "500": {
+ "$ref": "#/components/responses/InternalServerError500"
+ },
+ "default": {
+ "$ref": "#/components/responses/DefaultError"
+ }
+ },
+ "tags": [
+ "Benchmarks"
+ ],
+ "description": "Unregister a benchmark by ID.",
+ "parameters": [
+ {
+ "name": "benchmark_id",
+ "in": "path",
+ "required": true,
+ "schema": {
+ "type": "string"
+ }
+ }
+ ]
}
},
"/v1/datasets/{dataset_id}": {
@@ -1235,6 +1244,83 @@
]
}
},
+ "/v1/graders/{grader_id}": {
+ "get": {
+ "responses": {
+ "200": {
+ "description": "The grader.",
+ "content": {
+ "application/json": {
+ "schema": {
+ "$ref": "#/components/schemas/Grader"
+ }
+ }
+ }
+ },
+ "400": {
+ "$ref": "#/components/responses/BadRequest400"
+ },
+ "429": {
+ "$ref": "#/components/responses/TooManyRequests429"
+ },
+ "500": {
+ "$ref": "#/components/responses/InternalServerError500"
+ },
+ "default": {
+ "$ref": "#/components/responses/DefaultError"
+ }
+ },
+ "tags": [
+ "Graders"
+ ],
+ "description": "Get a grader by ID.",
+ "parameters": [
+ {
+ "name": "grader_id",
+ "in": "path",
+ "description": "The ID of the grader.",
+ "required": true,
+ "schema": {
+ "type": "string"
+ }
+ }
+ ]
+ },
+ "delete": {
+ "responses": {
+ "200": {
+ "description": "OK"
+ },
+ "400": {
+ "$ref": "#/components/responses/BadRequest400"
+ },
+ "429": {
+ "$ref": "#/components/responses/TooManyRequests429"
+ },
+ "500": {
+ "$ref": "#/components/responses/InternalServerError500"
+ },
+ "default": {
+ "$ref": "#/components/responses/DefaultError"
+ }
+ },
+ "tags": [
+ "Graders"
+ ],
+ "description": "Unregister a grader by ID.",
+ "parameters": [
+ {
+ "name": "grader_id",
+ "in": "path",
+ "description": "The ID of the grader.",
+ "required": true,
+ "schema": {
+ "type": "string"
+ }
+ }
+ ]
+ }
+ },
"/v1/models/{model_id}": {
"get": {
"responses": {
@@ -1310,6 +1396,48 @@
]
}
},
+ "/v1/scoring-functions/{scoring_fn_id}": {
+ "get": {
+ "responses": {
+ "200": {
+ "description": "OK",
+ "content": {
+ "application/json": {
+ "schema": {
+ "$ref": "#/components/schemas/ScoringFn"
+ }
+ }
+ }
+ },
+ "400": {
+ "$ref": "#/components/responses/BadRequest400"
+ },
+ "429": {
+ "$ref": "#/components/responses/TooManyRequests429"
+ },
+ "500": {
+ "$ref": "#/components/responses/InternalServerError500"
+ },
+ "default": {
+ "$ref": "#/components/responses/DefaultError"
+ }
+ },
+ "tags": [
+ "ScoringFunctions"
+ ],
+ "description": "Get a scoring function by its ID.",
+ "parameters": [
+ {
+ "name": "scoring_fn_id",
+ "in": "path",
+ "required": true,
+ "schema": {
+ "type": "string"
+ }
+ }
+ ]
+ }
+ },
"/v1/shields/{identifier}": {
"get": {
"responses": {
@@ -2244,6 +2372,153 @@
]
}
},
+ "/v1/eval/benchmarks/{benchmark_id}/jobs/{job_id}": {
+ "get": {
+ "responses": {
+ "200": {
+ "description": "The status of the evaluation job.",
+ "content": {
+ "application/json": {
+ "schema": {
+ "$ref": "#/components/schemas/JobStatus"
+ }
+ }
+ }
+ },
+ "400": {
+ "$ref": "#/components/responses/BadRequest400"
+ },
+ "429": {
+ "$ref": "#/components/responses/TooManyRequests429"
+ },
+ "500": {
+ "$ref": "#/components/responses/InternalServerError500"
+ },
+ "default": {
+ "$ref": "#/components/responses/DefaultError"
+ }
+ },
+ "tags": [
+ "Eval"
+ ],
+ "description": "Get the status of a job.",
+ "parameters": [
+ {
+ "name": "benchmark_id",
+ "in": "path",
+ "description": "The ID of the benchmark to run the evaluation on.",
+ "required": true,
+ "schema": {
+ "type": "string"
+ }
+ },
+ {
+ "name": "job_id",
+ "in": "path",
+ "description": "The ID of the job to get the status of.",
+ "required": true,
+ "schema": {
+ "type": "string"
+ }
+ }
+ ]
+ },
+ "delete": {
+ "responses": {
+ "200": {
+ "description": "OK"
+ },
+ "400": {
+ "$ref": "#/components/responses/BadRequest400"
+ },
+ "429": {
+ "$ref": "#/components/responses/TooManyRequests429"
+ },
+ "500": {
+ "$ref": "#/components/responses/InternalServerError500"
+ },
+ "default": {
+ "$ref": "#/components/responses/DefaultError"
+ }
+ },
+ "tags": [
+ "Eval"
+ ],
+ "description": "Cancel a job.",
+ "parameters": [
+ {
+ "name": "benchmark_id",
+ "in": "path",
+ "description": "The ID of the benchmark to run the evaluation on.",
+ "required": true,
+ "schema": {
+ "type": "string"
+ }
+ },
+ {
+ "name": "job_id",
+ "in": "path",
+ "description": "The ID of the job to cancel.",
+ "required": true,
+ "schema": {
+ "type": "string"
+ }
+ }
+ ]
+ }
+ },
+ "/v1/eval/benchmarks/{benchmark_id}/jobs/{job_id}/result": {
+ "get": {
+ "responses": {
+ "200": {
+ "description": "The result of the job.",
+ "content": {
+ "application/json": {
+ "schema": {
+ "$ref": "#/components/schemas/EvaluateResponse"
+ }
+ }
+ }
+ },
+ "400": {
+ "$ref": "#/components/responses/BadRequest400"
+ },
+ "429": {
+ "$ref": "#/components/responses/TooManyRequests429"
+ },
+ "500": {
+ "$ref": "#/components/responses/InternalServerError500"
+ },
+ "default": {
+ "$ref": "#/components/responses/DefaultError"
+ }
+ },
+ "tags": [
+ "Eval"
+ ],
+ "description": "Get the result of a job.",
+ "parameters": [
+ {
+ "name": "benchmark_id",
+ "in": "path",
+ "description": "The ID of the benchmark to run the evaluation on.",
+ "required": true,
+ "schema": {
+ "type": "string"
+ }
+ },
+ {
+ "name": "job_id",
+ "in": "path",
+ "description": "The ID of the job to get the result of.",
+ "required": true,
+ "schema": {
+ "type": "string"
+ }
+ }
+ ]
+ }
+ },
"/v1/agents/{agent_id}/sessions": {
"get": {
"responses": {
@@ -2517,9 +2792,9 @@
"200": {
"description": "A list of graders.",
"content": {
- "application/jsonl": {
+ "application/json": {
"schema": {
- "$ref": "#/components/schemas/Grader"
+ "$ref": "#/components/schemas/ListGradersResponse"
}
}
}
@@ -2775,6 +3050,73 @@
]
}
},
+ "/v1/scoring-functions": {
+ "get": {
+ "responses": {
+ "200": {
+ "description": "OK",
+ "content": {
+ "application/json": {
+ "schema": {
+ "$ref": "#/components/schemas/ListScoringFunctionsResponse"
+ }
+ }
+ }
+ },
+ "400": {
+ "$ref": "#/components/responses/BadRequest400"
+ },
+ "429": {
+ "$ref": "#/components/responses/TooManyRequests429"
+ },
+ "500": {
+ "$ref": "#/components/responses/InternalServerError500"
+ },
+ "default": {
+ "$ref": "#/components/responses/DefaultError"
+ }
+ },
+ "tags": [
+ "ScoringFunctions"
+ ],
+ "description": "List all scoring functions.",
+ "parameters": []
+ },
+ "post": {
+ "responses": {
+ "200": {
+ "description": "OK"
+ },
+ "400": {
+ "$ref": "#/components/responses/BadRequest400"
+ },
+ "429": {
+ "$ref": "#/components/responses/TooManyRequests429"
+ },
+ "500": {
+ "$ref": "#/components/responses/InternalServerError500"
+ },
+ "default": {
+ "$ref": "#/components/responses/DefaultError"
+ }
+ },
+ "tags": [
+ "ScoringFunctions"
+ ],
+ "description": "Register a scoring function.",
+ "parameters": [],
+ "requestBody": {
+ "content": {
+ "application/json": {
+ "schema": {
+ "$ref": "#/components/schemas/RegisterScoringFunctionRequest"
+ }
+ }
+ },
+ "required": true
+ }
+ }
+ },
"/v1/shields": {
"get": {
"responses": {
@@ -3402,6 +3744,59 @@
}
}
},
+ "/v1/eval/benchmarks/{benchmark_id}/jobs": {
+ "post": {
+ "responses": {
+ "200": {
+ "description": "The job that was created to run the evaluation.",
+ "content": {
+ "application/json": {
+ "schema": {
+ "$ref": "#/components/schemas/Job"
+ }
+ }
+ }
+ },
+ "400": {
+ "$ref": "#/components/responses/BadRequest400"
+ },
+ "429": {
+ "$ref": "#/components/responses/TooManyRequests429"
+ },
+ "500": {
+ "$ref": "#/components/responses/InternalServerError500"
+ },
+ "default": {
+ "$ref": "#/components/responses/DefaultError"
+ }
+ },
+ "tags": [
+ "Eval"
+ ],
+ "description": "Run an evaluation on a benchmark.",
+ "parameters": [
+ {
+ "name": "benchmark_id",
+ "in": "path",
+ "description": "The ID of the benchmark to run the evaluation on.",
+ "required": true,
+ "schema": {
+ "type": "string"
+ }
+ }
+ ],
+ "requestBody": {
+ "content": {
+ "application/json": {
+ "schema": {
+ "$ref": "#/components/schemas/RunEvalRequest"
+ }
+ }
+ },
+ "required": true
+ }
+ }
+ },
"/v1/safety/run-shield": {
"post": {
"responses": {
@@ -3524,6 +3919,92 @@
}
}
},
+ "/v1/scoring/score": {
+ "post": {
+ "responses": {
+ "200": {
+ "description": "ScoreResponse object containing rows and aggregated results",
+ "content": {
+ "application/json": {
+ "schema": {
+ "$ref": "#/components/schemas/ScoreResponse"
+ }
+ }
+ }
+ },
+ "400": {
+ "$ref": "#/components/responses/BadRequest400"
+ },
+ "429": {
+ "$ref": "#/components/responses/TooManyRequests429"
+ },
+ "500": {
+ "$ref": "#/components/responses/InternalServerError500"
+ },
+ "default": {
+ "$ref": "#/components/responses/DefaultError"
+ }
+ },
+ "tags": [
+ "Scoring"
+ ],
+ "description": "Score a list of rows.",
+ "parameters": [],
+ "requestBody": {
+ "content": {
+ "application/json": {
+ "schema": {
+ "$ref": "#/components/schemas/ScoreRequest"
+ }
+ }
+ },
+ "required": true
+ }
+ }
+ },
+ "/v1/scoring/score-batch": {
+ "post": {
+ "responses": {
+ "200": {
+ "description": "OK",
+ "content": {
+ "application/json": {
+ "schema": {
+ "$ref": "#/components/schemas/ScoreBatchResponse"
+ }
+ }
+ }
+ },
+ "400": {
+ "$ref": "#/components/responses/BadRequest400"
+ },
+ "429": {
+ "$ref": "#/components/responses/TooManyRequests429"
+ },
+ "500": {
+ "$ref": "#/components/responses/InternalServerError500"
+ },
+ "default": {
+ "$ref": "#/components/responses/DefaultError"
+ }
+ },
+ "tags": [
+ "Scoring"
+ ],
+ "description": "Score the rows of a dataset with the given scoring functions.",
+ "parameters": [],
+ "requestBody": {
+ "content": {
+ "application/json": {
+ "schema": {
+ "$ref": "#/components/schemas/ScoreBatchRequest"
+ }
+ }
+ },
+ "required": true
+ }
+ }
+ },
"/v1/post-training/supervised-fine-tune": {
"post": {
"responses": {
@@ -6149,6 +6630,381 @@
"title": "EmbeddingsResponse",
"description": "Response containing generated embeddings."
},
+ "AgentCandidate": {
+ "type": "object",
+ "properties": {
+ "type": {
+ "type": "string",
+ "const": "agent",
+ "default": "agent"
+ },
+ "config": {
+ "$ref": "#/components/schemas/AgentConfig",
+ "description": "The configuration for the agent candidate."
+ }
+ },
+ "additionalProperties": false,
+ "required": [
+ "type",
+ "config"
+ ],
+ "title": "AgentCandidate",
+ "description": "An agent candidate for evaluation."
+ },
+ "AggregationFunctionType": {
+ "type": "string",
+ "enum": [
+ "average",
+ "median",
+ "categorical_count",
+ "accuracy"
+ ],
+ "title": "AggregationFunctionType"
+ },
+ "BasicScoringFnParams": {
+ "type": "object",
+ "properties": {
+ "type": {
+ "type": "string",
+ "const": "basic",
+ "default": "basic"
+ },
+ "aggregation_functions": {
+ "type": "array",
+ "items": {
+ "$ref": "#/components/schemas/AggregationFunctionType"
+ }
+ }
+ },
+ "additionalProperties": false,
+ "required": [
+ "type"
+ ],
+ "title": "BasicScoringFnParams"
+ },
+ "BenchmarkConfig": {
+ "type": "object",
+ "properties": {
+ "eval_candidate": {
+ "$ref": "#/components/schemas/EvalCandidate",
+ "description": "The candidate to evaluate."
+ },
+ "scoring_params": {
+ "type": "object",
+ "additionalProperties": {
+ "$ref": "#/components/schemas/ScoringFnParams"
+ },
+ "description": "Map between scoring function id and parameters for each scoring function you want to run"
+ },
+ "num_examples": {
+ "type": "integer",
+ "description": "(Optional) The number of examples to evaluate. If not provided, all examples in the dataset will be evaluated"
+ }
+ },
+ "additionalProperties": false,
+ "required": [
+ "eval_candidate",
+ "scoring_params"
+ ],
+ "title": "BenchmarkConfig",
+ "description": "A benchmark configuration for evaluation."
+ },
+ "EvalCandidate": {
+ "oneOf": [
+ {
+ "$ref": "#/components/schemas/ModelCandidate"
+ },
+ {
+ "$ref": "#/components/schemas/AgentCandidate"
+ }
+ ],
+ "discriminator": {
+ "propertyName": "type",
+ "mapping": {
+ "model": "#/components/schemas/ModelCandidate",
+ "agent": "#/components/schemas/AgentCandidate"
+ }
+ }
+ },
+ "LLMAsJudgeScoringFnParams": {
+ "type": "object",
+ "properties": {
+ "type": {
+ "type": "string",
+ "const": "llm_as_judge",
+ "default": "llm_as_judge"
+ },
+ "judge_model": {
+ "type": "string"
+ },
+ "prompt_template": {
+ "type": "string"
+ },
+ "judge_score_regexes": {
+ "type": "array",
+ "items": {
+ "type": "string"
+ }
+ },
+ "aggregation_functions": {
+ "type": "array",
+ "items": {
+ "$ref": "#/components/schemas/AggregationFunctionType"
+ }
+ }
+ },
+ "additionalProperties": false,
+ "required": [
+ "type",
+ "judge_model"
+ ],
+ "title": "LLMAsJudgeScoringFnParams"
+ },
+ "ModelCandidate": {
+ "type": "object",
+ "properties": {
+ "type": {
+ "type": "string",
+ "const": "model",
+ "default": "model"
+ },
+ "model": {
+ "type": "string",
+ "description": "The model ID to evaluate."
+ },
+ "sampling_params": {
+ "$ref": "#/components/schemas/SamplingParams",
+ "description": "The sampling parameters for the model."
+ },
+ "system_message": {
+ "$ref": "#/components/schemas/SystemMessage",
+ "description": "(Optional) The system message providing instructions or context to the model."
+ }
+ },
+ "additionalProperties": false,
+ "required": [
+ "type",
+ "model",
+ "sampling_params"
+ ],
+ "title": "ModelCandidate",
+ "description": "A model candidate for evaluation."
+ },
+ "RegexParserScoringFnParams": {
+ "type": "object",
+ "properties": {
+ "type": {
+ "type": "string",
+ "const": "regex_parser",
+ "default": "regex_parser"
+ },
+ "parsing_regexes": {
+ "type": "array",
+ "items": {
+ "type": "string"
+ }
+ },
+ "aggregation_functions": {
+ "type": "array",
+ "items": {
+ "$ref": "#/components/schemas/AggregationFunctionType"
+ }
+ }
+ },
+ "additionalProperties": false,
+ "required": [
+ "type"
+ ],
+ "title": "RegexParserScoringFnParams"
+ },
+ "ScoringFnParams": {
+ "oneOf": [
+ {
+ "$ref": "#/components/schemas/LLMAsJudgeScoringFnParams"
+ },
+ {
+ "$ref": "#/components/schemas/RegexParserScoringFnParams"
+ },
+ {
+ "$ref": "#/components/schemas/BasicScoringFnParams"
+ }
+ ],
+ "discriminator": {
+ "propertyName": "type",
+ "mapping": {
+ "llm_as_judge": "#/components/schemas/LLMAsJudgeScoringFnParams",
+ "regex_parser": "#/components/schemas/RegexParserScoringFnParams",
+ "basic": "#/components/schemas/BasicScoringFnParams"
+ }
+ }
+ },
+ "EvaluateRowsRequest": {
+ "type": "object",
+ "properties": {
+ "input_rows": {
+ "type": "array",
+ "items": {
+ "type": "object",
+ "additionalProperties": {
+ "oneOf": [
+ {
+ "type": "null"
+ },
+ {
+ "type": "boolean"
+ },
+ {
+ "type": "number"
+ },
+ {
+ "type": "string"
+ },
+ {
+ "type": "array"
+ },
+ {
+ "type": "object"
+ }
+ ]
+ }
+ },
+ "description": "The rows to evaluate."
+ },
+ "scoring_functions": {
+ "type": "array",
+ "items": {
+ "type": "string"
+ },
+ "description": "The scoring functions to use for the evaluation."
+ },
+ "benchmark_config": {
+ "$ref": "#/components/schemas/BenchmarkConfig",
+ "description": "The configuration for the benchmark."
+ }
+ },
+ "additionalProperties": false,
+ "required": [
+ "input_rows",
+ "scoring_functions",
+ "benchmark_config"
+ ],
+ "title": "EvaluateRowsRequest"
+ },
+ "EvaluateResponse": {
+ "type": "object",
+ "properties": {
+ "generations": {
+ "type": "array",
+ "items": {
+ "type": "object",
+ "additionalProperties": {
+ "oneOf": [
+ {
+ "type": "null"
+ },
+ {
+ "type": "boolean"
+ },
+ {
+ "type": "number"
+ },
+ {
+ "type": "string"
+ },
+ {
+ "type": "array"
+ },
+ {
+ "type": "object"
+ }
+ ]
+ }
+ },
+ "description": "The generations from the evaluation."
+ },
+ "scores": {
+ "type": "object",
+ "additionalProperties": {
+ "$ref": "#/components/schemas/ScoringResult"
+ },
+ "description": "The scores from the evaluation."
+ }
+ },
+ "additionalProperties": false,
+ "required": [
+ "generations",
+ "scores"
+ ],
+ "title": "EvaluateResponse",
+ "description": "The response from an evaluation."
+ },
+ "ScoringResult": {
+ "type": "object",
+ "properties": {
+ "score_rows": {
+ "type": "array",
+ "items": {
+ "type": "object",
+ "additionalProperties": {
+ "oneOf": [
+ {
+ "type": "null"
+ },
+ {
+ "type": "boolean"
+ },
+ {
+ "type": "number"
+ },
+ {
+ "type": "string"
+ },
+ {
+ "type": "array"
+ },
+ {
+ "type": "object"
+ }
+ ]
+ }
+ },
+ "description": "The scoring result for each row. Each row is a map of column name to value."
+ },
+ "aggregated_results": {
+ "type": "object",
+ "additionalProperties": {
+ "oneOf": [
+ {
+ "type": "null"
+ },
+ {
+ "type": "boolean"
+ },
+ {
+ "type": "number"
+ },
+ {
+ "type": "string"
+ },
+ {
+ "type": "array"
+ },
+ {
+ "type": "object"
+ }
+ ]
+ },
+ "description": "Map of metric name to aggregated value"
+ }
+ },
+ "additionalProperties": false,
+ "required": [
+ "score_rows",
+ "aggregated_results"
+ ],
+ "title": "ScoringResult",
+ "description": "A scoring result containing the per-row scores and the aggregated results."
+ },
"Agent": {
"type": "object",
"properties": {
@@ -6876,6 +7732,268 @@
],
"title": "ModelType"
},
+ "AgentTurnInputType": {
+ "type": "object",
+ "properties": {
+ "type": {
+ "type": "string",
+ "const": "agent_turn_input",
+ "default": "agent_turn_input"
+ }
+ },
+ "additionalProperties": false,
+ "required": [
+ "type"
+ ],
+ "title": "AgentTurnInputType"
+ },
+ "ArrayType": {
+ "type": "object",
+ "properties": {
+ "type": {
+ "type": "string",
+ "const": "array",
+ "default": "array"
+ }
+ },
+ "additionalProperties": false,
+ "required": [
+ "type"
+ ],
+ "title": "ArrayType"
+ },
+ "BooleanType": {
+ "type": "object",
+ "properties": {
+ "type": {
+ "type": "string",
+ "const": "boolean",
+ "default": "boolean"
+ }
+ },
+ "additionalProperties": false,
+ "required": [
+ "type"
+ ],
+ "title": "BooleanType"
+ },
+ "ChatCompletionInputType": {
+ "type": "object",
+ "properties": {
+ "type": {
+ "type": "string",
+ "const": "chat_completion_input",
+ "default": "chat_completion_input"
+ }
+ },
+ "additionalProperties": false,
+ "required": [
+ "type"
+ ],
+ "title": "ChatCompletionInputType"
+ },
+ "CompletionInputType": {
+ "type": "object",
+ "properties": {
+ "type": {
+ "type": "string",
+ "const": "completion_input",
+ "default": "completion_input"
+ }
+ },
+ "additionalProperties": false,
+ "required": [
+ "type"
+ ],
+ "title": "CompletionInputType"
+ },
+ "JsonType": {
+ "type": "object",
+ "properties": {
+ "type": {
+ "type": "string",
+ "const": "json",
+ "default": "json"
+ }
+ },
+ "additionalProperties": false,
+ "required": [
+ "type"
+ ],
+ "title": "JsonType"
+ },
+ "NumberType": {
+ "type": "object",
+ "properties": {
+ "type": {
+ "type": "string",
+ "const": "number",
+ "default": "number"
+ }
+ },
+ "additionalProperties": false,
+ "required": [
+ "type"
+ ],
+ "title": "NumberType"
+ },
+ "ObjectType": {
+ "type": "object",
+ "properties": {
+ "type": {
+ "type": "string",
+ "const": "object",
+ "default": "object"
+ }
+ },
+ "additionalProperties": false,
+ "required": [
+ "type"
+ ],
+ "title": "ObjectType"
+ },
+ "ParamType": {
+ "oneOf": [
+ {
+ "$ref": "#/components/schemas/StringType"
+ },
+ {
+ "$ref": "#/components/schemas/NumberType"
+ },
+ {
+ "$ref": "#/components/schemas/BooleanType"
+ },
+ {
+ "$ref": "#/components/schemas/ArrayType"
+ },
+ {
+ "$ref": "#/components/schemas/ObjectType"
+ },
+ {
+ "$ref": "#/components/schemas/JsonType"
+ },
+ {
+ "$ref": "#/components/schemas/UnionType"
+ },
+ {
+ "$ref": "#/components/schemas/ChatCompletionInputType"
+ },
+ {
+ "$ref": "#/components/schemas/CompletionInputType"
+ },
+ {
+ "$ref": "#/components/schemas/AgentTurnInputType"
+ }
+ ],
+ "discriminator": {
+ "propertyName": "type",
+ "mapping": {
+ "string": "#/components/schemas/StringType",
+ "number": "#/components/schemas/NumberType",
+ "boolean": "#/components/schemas/BooleanType",
+ "array": "#/components/schemas/ArrayType",
+ "object": "#/components/schemas/ObjectType",
+ "json": "#/components/schemas/JsonType",
+ "union": "#/components/schemas/UnionType",
+ "chat_completion_input": "#/components/schemas/ChatCompletionInputType",
+ "completion_input": "#/components/schemas/CompletionInputType",
+ "agent_turn_input": "#/components/schemas/AgentTurnInputType"
+ }
+ }
+ },
+ "ScoringFn": {
+ "type": "object",
+ "properties": {
+ "identifier": {
+ "type": "string"
+ },
+ "provider_resource_id": {
+ "type": "string"
+ },
+ "provider_id": {
+ "type": "string"
+ },
+ "type": {
+ "type": "string",
+ "const": "scoring_function",
+ "default": "scoring_function"
+ },
+ "description": {
+ "type": "string"
+ },
+ "metadata": {
+ "type": "object",
+ "additionalProperties": {
+ "oneOf": [
+ {
+ "type": "null"
+ },
+ {
+ "type": "boolean"
+ },
+ {
+ "type": "number"
+ },
+ {
+ "type": "string"
+ },
+ {
+ "type": "array"
+ },
+ {
+ "type": "object"
+ }
+ ]
+ }
+ },
+ "return_type": {
+ "$ref": "#/components/schemas/ParamType"
+ },
+ "params": {
+ "$ref": "#/components/schemas/ScoringFnParams"
+ }
+ },
+ "additionalProperties": false,
+ "required": [
+ "identifier",
+ "provider_resource_id",
+ "provider_id",
+ "type",
+ "metadata",
+ "return_type"
+ ],
+ "title": "ScoringFn"
+ },
+ "StringType": {
+ "type": "object",
+ "properties": {
+ "type": {
+ "type": "string",
+ "const": "string",
+ "default": "string"
+ }
+ },
+ "additionalProperties": false,
+ "required": [
+ "type"
+ ],
+ "title": "StringType"
+ },
+ "UnionType": {
+ "type": "object",
+ "properties": {
+ "type": {
+ "type": "string",
+ "const": "union",
+ "default": "union"
+ }
+ },
+ "additionalProperties": false,
+ "required": [
+ "type"
+ ],
+ "title": "UnionType"
+ },
"Shield": {
"type": "object",
"properties": {
@@ -7405,13 +8523,13 @@
],
"title": "VectorDB"
},
- "BenchmarkTask": {
+ "BenchmarkEvaluationTask": {
"type": "object",
"properties": {
"type": {
"type": "string",
- "const": "benchmark_id",
- "default": "benchmark_id"
+ "const": "benchmark",
+ "default": "benchmark"
},
"benchmark_id": {
"type": "string"
@@ -7422,15 +8540,15 @@
"type",
"benchmark_id"
],
- "title": "BenchmarkTask"
+ "title": "BenchmarkEvaluationTask"
},
- "DataSourceGraderTask": {
+ "DataEvaluationTask": {
"type": "object",
"properties": {
"type": {
"type": "string",
- "const": "data_source_grader",
- "default": "data_source_grader"
+ "const": "data",
+ "default": "data"
},
"data_source": {
"$ref": "#/components/schemas/DataSource"
@@ -7448,15 +8566,15 @@
"data_source",
"grader_ids"
],
- "title": "DataSourceGraderTask"
+ "title": "DataEvaluationTask"
},
- "DatasetGraderTask": {
+ "DatasetEvaluationTask": {
"type": "object",
"properties": {
"type": {
"type": "string",
- "const": "dataset_grader",
- "default": "dataset_grader"
+ "const": "dataset",
+ "default": "dataset"
},
"dataset_id": {
"type": "string"
@@ -7474,26 +8592,26 @@
"dataset_id",
"grader_ids"
],
- "title": "DatasetGraderTask"
+ "title": "DatasetEvaluationTask"
},
"EvaluationTask": {
"oneOf": [
{
- "$ref": "#/components/schemas/BenchmarkTask"
+ "$ref": "#/components/schemas/BenchmarkEvaluationTask"
},
{
- "$ref": "#/components/schemas/DatasetGraderTask"
+ "$ref": "#/components/schemas/DatasetEvaluationTask"
},
{
- "$ref": "#/components/schemas/DataSourceGraderTask"
+ "$ref": "#/components/schemas/DataEvaluationTask"
}
],
"discriminator": {
"propertyName": "type",
"mapping": {
- "benchmark_id": "#/components/schemas/BenchmarkTask",
- "dataset_grader": "#/components/schemas/DatasetGraderTask",
- "data_source_grader": "#/components/schemas/DataSourceGraderTask"
+ "benchmark": "#/components/schemas/BenchmarkEvaluationTask",
+ "dataset": "#/components/schemas/DatasetEvaluationTask",
+ "data": "#/components/schemas/DataEvaluationTask"
}
}
},
@@ -7511,27 +8629,6 @@
],
"title": "GradeRequest"
},
- "AgentCandidate": {
- "type": "object",
- "properties": {
- "type": {
- "type": "string",
- "const": "agent",
- "default": "agent"
- },
- "config": {
- "$ref": "#/components/schemas/AgentConfig",
- "description": "The configuration for the agent candidate."
- }
- },
- "additionalProperties": false,
- "required": [
- "type",
- "config"
- ],
- "title": "AgentCandidate",
- "description": "An agent candidate for evaluation."
- },
"EvaluationCandidate": {
"oneOf": [
{
@@ -7572,10 +8669,10 @@
"format": "date-time",
"description": "The time the job was created."
},
- "ended_at": {
+ "completed_at": {
"type": "string",
"format": "date-time",
- "description": "The time the job ended."
+ "description": "The time the job completed."
},
"error": {
"type": "string",
@@ -7604,35 +8701,6 @@
],
"title": "EvaluationJob"
},
- "ModelCandidate": {
- "type": "object",
- "properties": {
- "type": {
- "type": "string",
- "const": "model",
- "default": "model"
- },
- "model_id": {
- "type": "string"
- },
- "sampling_params": {
- "$ref": "#/components/schemas/SamplingParams",
- "description": "The sampling parameters for the model."
- },
- "system_message": {
- "$ref": "#/components/schemas/SystemMessage",
- "description": "(Optional) The system message providing instructions or context to the model."
- }
- },
- "additionalProperties": false,
- "required": [
- "type",
- "model_id",
- "sampling_params"
- ],
- "title": "ModelCandidate",
- "description": "A model candidate for evaluation."
- },
"GradeSyncRequest": {
"type": "object",
"properties": {
@@ -7695,73 +8763,6 @@
"title": "EvaluationResponse",
"description": "A response to an inline evaluation."
},
- "ScoringResult": {
- "type": "object",
- "properties": {
- "scores": {
- "type": "array",
- "items": {
- "type": "object",
- "additionalProperties": {
- "oneOf": [
- {
- "type": "null"
- },
- {
- "type": "boolean"
- },
- {
- "type": "number"
- },
- {
- "type": "string"
- },
- {
- "type": "array"
- },
- {
- "type": "object"
- }
- ]
- }
- },
- "description": "The scoring result for each row. Each row is a map of grader column name to value."
- },
- "metrics": {
- "type": "object",
- "additionalProperties": {
- "oneOf": [
- {
- "type": "null"
- },
- {
- "type": "boolean"
- },
- {
- "type": "number"
- },
- {
- "type": "string"
- },
- {
- "type": "array"
- },
- {
- "type": "object"
- }
- ]
- },
- "description": "Map of metric name to aggregated value."
- }
- },
- "additionalProperties": false,
- "required": [
- "scores",
- "metrics"
- ],
- "title": "ScoringResult",
- "description": "A scoring result for a single row."
- },
"HealthInfo": {
"type": "object",
"properties": {
@@ -8094,6 +9095,17 @@
"title": "IterrowsResponse",
"description": "A paginated list of rows from a dataset."
},
+ "JobStatus": {
+ "type": "string",
+ "enum": [
+ "completed",
+ "in_progress",
+ "failed",
+ "scheduled",
+ "cancelled"
+ ],
+ "title": "JobStatus"
+ },
"ListAgentSessionsResponse": {
"type": "object",
"properties": {
@@ -8266,6 +9278,22 @@
],
"title": "ListGraderTypesResponse"
},
+ "ListGradersResponse": {
+ "type": "object",
+ "properties": {
+ "data": {
+ "type": "array",
+ "items": {
+ "$ref": "#/components/schemas/Grader"
+ }
+ }
+ },
+ "additionalProperties": false,
+ "required": [
+ "data"
+ ],
+ "title": "ListGradersResponse"
+ },
"ListModelsResponse": {
"type": "object",
"properties": {
@@ -8338,6 +9366,22 @@
],
"title": "ListRoutesResponse"
},
+ "ListScoringFunctionsResponse": {
+ "type": "object",
+ "properties": {
+ "data": {
+ "type": "array",
+ "items": {
+ "$ref": "#/components/schemas/ScoringFn"
+ }
+ }
+ },
+ "additionalProperties": false,
+ "required": [
+ "data"
+ ],
+ "title": "ListScoringFunctionsResponse"
+ },
"ListShieldsResponse": {
"type": "object",
"properties": {
@@ -9330,7 +10374,7 @@
"properties": {
"dataset_id": {
"type": "string",
- "description": "The ID of the dataset to used to run the benchmark."
+ "description": "The ID of the dataset to be used to run the benchmark."
},
"grader_ids": {
"type": "array",
@@ -9522,6 +10566,36 @@
],
"title": "RegisterModelRequest"
},
+ "RegisterScoringFunctionRequest": {
+ "type": "object",
+ "properties": {
+ "scoring_fn_id": {
+ "type": "string"
+ },
+ "description": {
+ "type": "string"
+ },
+ "return_type": {
+ "$ref": "#/components/schemas/ParamType"
+ },
+ "provider_scoring_fn_id": {
+ "type": "string"
+ },
+ "provider_id": {
+ "type": "string"
+ },
+ "params": {
+ "$ref": "#/components/schemas/ScoringFnParams"
+ }
+ },
+ "additionalProperties": false,
+ "required": [
+ "scoring_fn_id",
+ "description",
+ "return_type"
+ ],
+ "title": "RegisterScoringFunctionRequest"
+ },
"RegisterShieldRequest": {
"type": "object",
"properties": {
@@ -9677,6 +10751,33 @@
],
"title": "RunRequest"
},
+ "RunEvalRequest": {
+ "type": "object",
+ "properties": {
+ "benchmark_config": {
+ "$ref": "#/components/schemas/BenchmarkConfig",
+ "description": "The configuration for the benchmark."
+ }
+ },
+ "additionalProperties": false,
+ "required": [
+ "benchmark_config"
+ ],
+ "title": "RunEvalRequest"
+ },
+ "Job": {
+ "type": "object",
+ "properties": {
+ "job_id": {
+ "type": "string"
+ }
+ },
+ "additionalProperties": false,
+ "required": [
+ "job_id"
+ ],
+ "title": "Job"
+ },
"RunShieldRequest": {
"type": "object",
"properties": {
@@ -9782,6 +10883,128 @@
],
"title": "SaveSpansToDatasetRequest"
},
+ "ScoreRequest": {
+ "type": "object",
+ "properties": {
+ "input_rows": {
+ "type": "array",
+ "items": {
+ "type": "object",
+ "additionalProperties": {
+ "oneOf": [
+ {
+ "type": "null"
+ },
+ {
+ "type": "boolean"
+ },
+ {
+ "type": "number"
+ },
+ {
+ "type": "string"
+ },
+ {
+ "type": "array"
+ },
+ {
+ "type": "object"
+ }
+ ]
+ }
+ },
+ "description": "The rows to score."
+ },
+ "scoring_functions": {
+ "type": "object",
+ "additionalProperties": {
+ "oneOf": [
+ {
+ "$ref": "#/components/schemas/ScoringFnParams"
+ },
+ {
+ "type": "null"
+ }
+ ]
+ },
+ "description": "The scoring functions to use for the scoring."
+ }
+ },
+ "additionalProperties": false,
+ "required": [
+ "input_rows",
+ "scoring_functions"
+ ],
+ "title": "ScoreRequest"
+ },
+ "ScoreResponse": {
+ "type": "object",
+ "properties": {
+ "results": {
+ "type": "object",
+ "additionalProperties": {
+ "$ref": "#/components/schemas/ScoringResult"
+ },
+ "description": "A map of scoring function name to ScoringResult."
+ }
+ },
+ "additionalProperties": false,
+ "required": [
+ "results"
+ ],
+ "title": "ScoreResponse",
+ "description": "The response from scoring."
+ },
+ "ScoreBatchRequest": {
+ "type": "object",
+ "properties": {
+ "dataset_id": {
+ "type": "string"
+ },
+ "scoring_functions": {
+ "type": "object",
+ "additionalProperties": {
+ "oneOf": [
+ {
+ "$ref": "#/components/schemas/ScoringFnParams"
+ },
+ {
+ "type": "null"
+ }
+ ]
+ }
+ },
+ "save_results_dataset": {
+ "type": "boolean"
+ }
+ },
+ "additionalProperties": false,
+ "required": [
+ "dataset_id",
+ "scoring_functions",
+ "save_results_dataset"
+ ],
+ "title": "ScoreBatchRequest"
+ },
+ "ScoreBatchResponse": {
+ "type": "object",
+ "properties": {
+ "dataset_id": {
+ "type": "string"
+ },
+ "results": {
+ "type": "object",
+ "additionalProperties": {
+ "$ref": "#/components/schemas/ScoringResult"
+ }
+ }
+ },
+ "additionalProperties": false,
+ "required": [
+ "results"
+ ],
+ "title": "ScoreBatchResponse"
+ },
"AlgorithmConfig": {
"oneOf": [
{
@@ -10144,6 +11367,10 @@
{
"name": "Datasets"
},
+ {
+ "name": "Eval",
+ "x-displayName": "Llama Stack Evaluation API for running evaluations on model and agent candidates."
+ },
{
"name": "Evaluation"
},
@@ -10174,6 +11401,12 @@
{
"name": "Safety"
},
+ {
+ "name": "Scoring"
+ },
+ {
+ "name": "ScoringFunctions"
+ },
{
"name": "Shields"
},
@@ -10205,6 +11438,7 @@
"Benchmarks",
"DatasetIO",
"Datasets",
+ "Eval",
"Evaluation",
"Files",
"Graders",
@@ -10214,6 +11448,8 @@
"PostTraining (Coming Soon)",
"Providers",
"Safety",
+ "Scoring",
+ "ScoringFunctions",
"Shields",
"SyntheticDataGeneration (Coming Soon)",
"Telemetry",
diff --git a/docs/_static/llama-stack-spec.yaml b/docs/_static/llama-stack-spec.yaml
index b0db86389..ecc8104e1 100644
--- a/docs/_static/llama-stack-spec.yaml
+++ b/docs/_static/llama-stack-spec.yaml
@@ -589,59 +589,6 @@ paths:
required: true
schema:
type: string
- /v1/graders/{grader_id}:
- get:
- responses:
- '200':
- description: The grader.
- content:
- application/json:
- schema:
- $ref: '#/components/schemas/Grader'
- '400':
- $ref: '#/components/responses/BadRequest400'
- '429':
- $ref: >-
- #/components/responses/TooManyRequests429
- '500':
- $ref: >-
- #/components/responses/InternalServerError500
- default:
- $ref: '#/components/responses/DefaultError'
- tags:
- - Graders
- description: Get a grader by ID.
- parameters:
- - name: grader_id
- in: path
- description: The ID of the grader.
- required: true
- schema:
- type: string
- delete:
- responses:
- '200':
- description: OK
- '400':
- $ref: '#/components/responses/BadRequest400'
- '429':
- $ref: >-
- #/components/responses/TooManyRequests429
- '500':
- $ref: >-
- #/components/responses/InternalServerError500
- default:
- $ref: '#/components/responses/DefaultError'
- tags:
- - Graders
- description: Delete a grader by ID.
- parameters:
- - name: grader_id
- in: path
- description: The ID of the grader.
- required: true
- schema:
- type: string
/v1/inference/embeddings:
post:
responses:
@@ -675,6 +622,43 @@ paths:
schema:
$ref: '#/components/schemas/EmbeddingsRequest'
required: true
+ /v1/eval/benchmarks/{benchmark_id}/evaluations:
+ post:
+ responses:
+ '200':
+ description: >-
+ EvaluateResponse object containing generations and scores
+ content:
+ application/json:
+ schema:
+ $ref: '#/components/schemas/EvaluateResponse'
+ '400':
+ $ref: '#/components/responses/BadRequest400'
+ '429':
+ $ref: >-
+ #/components/responses/TooManyRequests429
+ '500':
+ $ref: >-
+ #/components/responses/InternalServerError500
+ default:
+ $ref: '#/components/responses/DefaultError'
+ tags:
+ - Eval
+ description: Evaluate a list of rows on a benchmark.
+ parameters:
+ - name: benchmark_id
+ in: path
+ description: >-
+ The ID of the benchmark to run the evaluation on.
+ required: true
+ schema:
+ type: string
+ requestBody:
+ content:
+ application/json:
+ schema:
+ $ref: '#/components/schemas/EvaluateRowsRequest'
+ required: true
/v1/agents/{agent_id}/session/{session_id}/turn/{turn_id}/step/{step_id}:
get:
responses:
@@ -794,6 +778,29 @@ paths:
required: true
schema:
type: string
+ delete:
+ responses:
+ '200':
+ description: OK
+ '400':
+ $ref: '#/components/responses/BadRequest400'
+ '429':
+ $ref: >-
+ #/components/responses/TooManyRequests429
+ '500':
+ $ref: >-
+ #/components/responses/InternalServerError500
+ default:
+ $ref: '#/components/responses/DefaultError'
+ tags:
+ - Benchmarks
+ description: Unregister a benchmark by ID.
+ parameters:
+ - name: benchmark_id
+ in: path
+ required: true
+ schema:
+ type: string
/v1/datasets/{dataset_id}:
get:
responses:
@@ -845,6 +852,59 @@ paths:
required: true
schema:
type: string
+ /v1/graders/{grader_id}:
+ get:
+ responses:
+ '200':
+ description: The grader.
+ content:
+ application/json:
+ schema:
+ $ref: '#/components/schemas/Grader'
+ '400':
+ $ref: '#/components/responses/BadRequest400'
+ '429':
+ $ref: >-
+ #/components/responses/TooManyRequests429
+ '500':
+ $ref: >-
+ #/components/responses/InternalServerError500
+ default:
+ $ref: '#/components/responses/DefaultError'
+ tags:
+ - Graders
+ description: Get a grader by ID.
+ parameters:
+ - name: grader_id
+ in: path
+ description: The ID of the grader.
+ required: true
+ schema:
+ type: string
+ delete:
+ responses:
+ '200':
+ description: OK
+ '400':
+ $ref: '#/components/responses/BadRequest400'
+ '429':
+ $ref: >-
+ #/components/responses/TooManyRequests429
+ '500':
+ $ref: >-
+ #/components/responses/InternalServerError500
+ default:
+ $ref: '#/components/responses/DefaultError'
+ tags:
+ - Graders
+ description: Unregister a grader by ID.
+ parameters:
+ - name: grader_id
+ in: path
+ description: The ID of the grader.
+ required: true
+ schema:
+ type: string
/v1/models/{model_id}:
get:
responses:
@@ -896,6 +956,34 @@ paths:
required: true
schema:
type: string
+ /v1/scoring-functions/{scoring_fn_id}:
+ get:
+ responses:
+ '200':
+ description: OK
+ content:
+ application/json:
+ schema:
+ $ref: '#/components/schemas/ScoringFn'
+ '400':
+ $ref: '#/components/responses/BadRequest400'
+ '429':
+ $ref: >-
+ #/components/responses/TooManyRequests429
+ '500':
+ $ref: >-
+ #/components/responses/InternalServerError500
+ default:
+ $ref: '#/components/responses/DefaultError'
+ tags:
+ - ScoringFunctions
+ description: ''
+ parameters:
+ - name: scoring_fn_id
+ in: path
+ required: true
+ schema:
+ type: string
/v1/shields/{identifier}:
get:
responses:
@@ -1536,6 +1624,109 @@ paths:
required: false
schema:
type: integer
+ /v1/eval/benchmarks/{benchmark_id}/jobs/{job_id}:
+ get:
+ responses:
+ '200':
+ description: The status of the evaluation job.
+ content:
+ application/json:
+ schema:
+ $ref: '#/components/schemas/JobStatus'
+ '400':
+ $ref: '#/components/responses/BadRequest400'
+ '429':
+ $ref: >-
+ #/components/responses/TooManyRequests429
+ '500':
+ $ref: >-
+ #/components/responses/InternalServerError500
+ default:
+ $ref: '#/components/responses/DefaultError'
+ tags:
+ - Eval
+ description: Get the status of a job.
+ parameters:
+ - name: benchmark_id
+ in: path
+ description: >-
+ The ID of the benchmark to run the evaluation on.
+ required: true
+ schema:
+ type: string
+ - name: job_id
+ in: path
+ description: The ID of the job to get the status of.
+ required: true
+ schema:
+ type: string
+ delete:
+ responses:
+ '200':
+ description: OK
+ '400':
+ $ref: '#/components/responses/BadRequest400'
+ '429':
+ $ref: >-
+ #/components/responses/TooManyRequests429
+ '500':
+ $ref: >-
+ #/components/responses/InternalServerError500
+ default:
+ $ref: '#/components/responses/DefaultError'
+ tags:
+ - Eval
+ description: Cancel a job.
+ parameters:
+ - name: benchmark_id
+ in: path
+ description: >-
+ The ID of the benchmark to run the evaluation on.
+ required: true
+ schema:
+ type: string
+ - name: job_id
+ in: path
+ description: The ID of the job to cancel.
+ required: true
+ schema:
+ type: string
+ /v1/eval/benchmarks/{benchmark_id}/jobs/{job_id}/result:
+ get:
+ responses:
+ '200':
+ description: The result of the job.
+ content:
+ application/json:
+ schema:
+ $ref: '#/components/schemas/EvaluateResponse'
+ '400':
+ $ref: '#/components/responses/BadRequest400'
+ '429':
+ $ref: >-
+ #/components/responses/TooManyRequests429
+ '500':
+ $ref: >-
+ #/components/responses/InternalServerError500
+ default:
+ $ref: '#/components/responses/DefaultError'
+ tags:
+ - Eval
+ description: Get the result of a job.
+ parameters:
+ - name: benchmark_id
+ in: path
+ description: >-
+ The ID of the benchmark to run the evaluation on.
+ required: true
+ schema:
+ type: string
+ - name: job_id
+ in: path
+ description: The ID of the job to get the result of.
+ required: true
+ schema:
+ type: string
/v1/agents/{agent_id}/sessions:
get:
responses:
@@ -1727,9 +1918,9 @@ paths:
'200':
description: A list of graders.
content:
- application/jsonl:
+ application/json:
schema:
- $ref: '#/components/schemas/Grader'
+ $ref: '#/components/schemas/ListGradersResponse'
'400':
$ref: '#/components/responses/BadRequest400'
'429':
@@ -1902,6 +2093,53 @@ paths:
required: false
schema:
$ref: '#/components/schemas/URL'
+ /v1/scoring-functions:
+ get:
+ responses:
+ '200':
+ description: OK
+ content:
+ application/json:
+ schema:
+ $ref: '#/components/schemas/ListScoringFunctionsResponse'
+ '400':
+ $ref: '#/components/responses/BadRequest400'
+ '429':
+ $ref: >-
+ #/components/responses/TooManyRequests429
+ '500':
+ $ref: >-
+ #/components/responses/InternalServerError500
+ default:
+ $ref: '#/components/responses/DefaultError'
+ tags:
+ - ScoringFunctions
+ description: ''
+ parameters: []
+ post:
+ responses:
+ '200':
+ description: OK
+ '400':
+ $ref: '#/components/responses/BadRequest400'
+ '429':
+ $ref: >-
+ #/components/responses/TooManyRequests429
+ '500':
+ $ref: >-
+ #/components/responses/InternalServerError500
+ default:
+ $ref: '#/components/responses/DefaultError'
+ tags:
+ - ScoringFunctions
+ description: ''
+ parameters: []
+ requestBody:
+ content:
+ application/json:
+ schema:
+ $ref: '#/components/schemas/RegisterScoringFunctionRequest'
+ required: true
/v1/shields:
get:
responses:
@@ -2336,6 +2574,43 @@ paths:
schema:
$ref: '#/components/schemas/RunRequest'
required: true
+ /v1/eval/benchmarks/{benchmark_id}/jobs:
+ post:
+ responses:
+ '200':
+ description: >-
+ The job that was created to run the evaluation.
+ content:
+ application/json:
+ schema:
+ $ref: '#/components/schemas/Job'
+ '400':
+ $ref: '#/components/responses/BadRequest400'
+ '429':
+ $ref: >-
+ #/components/responses/TooManyRequests429
+ '500':
+ $ref: >-
+ #/components/responses/InternalServerError500
+ default:
+ $ref: '#/components/responses/DefaultError'
+ tags:
+ - Eval
+ description: Run an evaluation on a benchmark.
+ parameters:
+ - name: benchmark_id
+ in: path
+ description: >-
+ The ID of the benchmark to run the evaluation on.
+ required: true
+ schema:
+ type: string
+ requestBody:
+ content:
+ application/json:
+ schema:
+ $ref: '#/components/schemas/RunEvalRequest'
+ required: true
/v1/safety/run-shield:
post:
responses:
@@ -2419,6 +2694,65 @@ paths:
schema:
$ref: '#/components/schemas/SaveSpansToDatasetRequest'
required: true
+ /v1/scoring/score:
+ post:
+ responses:
+ '200':
+ description: >-
+ ScoreResponse object containing rows and aggregated results
+ content:
+ application/json:
+ schema:
+ $ref: '#/components/schemas/ScoreResponse'
+ '400':
+ $ref: '#/components/responses/BadRequest400'
+ '429':
+ $ref: >-
+ #/components/responses/TooManyRequests429
+ '500':
+ $ref: >-
+ #/components/responses/InternalServerError500
+ default:
+ $ref: '#/components/responses/DefaultError'
+ tags:
+ - Scoring
+ description: Score a list of rows.
+ parameters: []
+ requestBody:
+ content:
+ application/json:
+ schema:
+ $ref: '#/components/schemas/ScoreRequest'
+ required: true
+ /v1/scoring/score-batch:
+ post:
+ responses:
+ '200':
+ description: OK
+ content:
+ application/json:
+ schema:
+ $ref: '#/components/schemas/ScoreBatchResponse'
+ '400':
+ $ref: '#/components/responses/BadRequest400'
+ '429':
+ $ref: >-
+ #/components/responses/TooManyRequests429
+ '500':
+ $ref: >-
+ #/components/responses/InternalServerError500
+ default:
+ $ref: '#/components/responses/DefaultError'
+ tags:
+ - Scoring
+ description: ''
+ parameters: []
+ requestBody:
+ content:
+ application/json:
+ schema:
+ $ref: '#/components/schemas/ScoreBatchRequest'
+ required: true
/v1/post-training/supervised-fine-tune:
post:
responses:
@@ -4308,6 +4642,251 @@ components:
title: EmbeddingsResponse
description: >-
Response containing generated embeddings.
+ AgentCandidate:
+ type: object
+ properties:
+ type:
+ type: string
+ const: agent
+ default: agent
+ config:
+ $ref: '#/components/schemas/AgentConfig'
+ description: >-
+ The configuration for the agent candidate.
+ additionalProperties: false
+ required:
+ - type
+ - config
+ title: AgentCandidate
+ description: An agent candidate for evaluation.
+ AggregationFunctionType:
+ type: string
+ enum:
+ - average
+ - median
+ - categorical_count
+ - accuracy
+ title: AggregationFunctionType
+ BasicScoringFnParams:
+ type: object
+ properties:
+ type:
+ type: string
+ const: basic
+ default: basic
+ aggregation_functions:
+ type: array
+ items:
+ $ref: '#/components/schemas/AggregationFunctionType'
+ additionalProperties: false
+ required:
+ - type
+ title: BasicScoringFnParams
+ BenchmarkConfig:
+ type: object
+ properties:
+ eval_candidate:
+ $ref: '#/components/schemas/EvalCandidate'
+ description: The candidate to evaluate.
+ scoring_params:
+ type: object
+ additionalProperties:
+ $ref: '#/components/schemas/ScoringFnParams'
+ description: >-
+ Map between scoring function id and parameters for each scoring function
+ you want to run
+ num_examples:
+ type: integer
+ description: >-
+ (Optional) The number of examples to evaluate. If not provided, all examples
+ in the dataset will be evaluated
+ additionalProperties: false
+ required:
+ - eval_candidate
+ - scoring_params
+ title: BenchmarkConfig
+ description: >-
+ A benchmark configuration for evaluation.
+ EvalCandidate:
+ oneOf:
+ - $ref: '#/components/schemas/ModelCandidate'
+ - $ref: '#/components/schemas/AgentCandidate'
+ discriminator:
+ propertyName: type
+ mapping:
+ model: '#/components/schemas/ModelCandidate'
+ agent: '#/components/schemas/AgentCandidate'
+ LLMAsJudgeScoringFnParams:
+ type: object
+ properties:
+ type:
+ type: string
+ const: llm_as_judge
+ default: llm_as_judge
+ judge_model:
+ type: string
+ prompt_template:
+ type: string
+ judge_score_regexes:
+ type: array
+ items:
+ type: string
+ aggregation_functions:
+ type: array
+ items:
+ $ref: '#/components/schemas/AggregationFunctionType'
+ additionalProperties: false
+ required:
+ - type
+ - judge_model
+ title: LLMAsJudgeScoringFnParams
+ ModelCandidate:
+ type: object
+ properties:
+ type:
+ type: string
+ const: model
+ default: model
+ model:
+ type: string
+ description: The model ID to evaluate.
+ sampling_params:
+ $ref: '#/components/schemas/SamplingParams'
+ description: The sampling parameters for the model.
+ system_message:
+ $ref: '#/components/schemas/SystemMessage'
+ description: >-
+ (Optional) The system message providing instructions or context to the
+ model.
+ additionalProperties: false
+ required:
+ - type
+ - model
+ - sampling_params
+ title: ModelCandidate
+ description: A model candidate for evaluation.
+ RegexParserScoringFnParams:
+ type: object
+ properties:
+ type:
+ type: string
+ const: regex_parser
+ default: regex_parser
+ parsing_regexes:
+ type: array
+ items:
+ type: string
+ aggregation_functions:
+ type: array
+ items:
+ $ref: '#/components/schemas/AggregationFunctionType'
+ additionalProperties: false
+ required:
+ - type
+ title: RegexParserScoringFnParams
+ ScoringFnParams:
+ oneOf:
+ - $ref: '#/components/schemas/LLMAsJudgeScoringFnParams'
+ - $ref: '#/components/schemas/RegexParserScoringFnParams'
+ - $ref: '#/components/schemas/BasicScoringFnParams'
+ discriminator:
+ propertyName: type
+ mapping:
+ llm_as_judge: '#/components/schemas/LLMAsJudgeScoringFnParams'
+ regex_parser: '#/components/schemas/RegexParserScoringFnParams'
+ basic: '#/components/schemas/BasicScoringFnParams'
+ EvaluateRowsRequest:
+ type: object
+ properties:
+ input_rows:
+ type: array
+ items:
+ type: object
+ additionalProperties:
+ oneOf:
+ - type: 'null'
+ - type: boolean
+ - type: number
+ - type: string
+ - type: array
+ - type: object
+ description: The rows to evaluate.
+ scoring_functions:
+ type: array
+ items:
+ type: string
+ description: >-
+ The scoring functions to use for the evaluation.
+ benchmark_config:
+ $ref: '#/components/schemas/BenchmarkConfig'
+ description: The configuration for the benchmark.
+ additionalProperties: false
+ required:
+ - input_rows
+ - scoring_functions
+ - benchmark_config
+ title: EvaluateRowsRequest
+ EvaluateResponse:
+ type: object
+ properties:
+ generations:
+ type: array
+ items:
+ type: object
+ additionalProperties:
+ oneOf:
+ - type: 'null'
+ - type: boolean
+ - type: number
+ - type: string
+ - type: array
+ - type: object
+ description: The generations from the evaluation.
+ scores:
+ type: object
+ additionalProperties:
+ $ref: '#/components/schemas/ScoringResult'
+ description: The scores from the evaluation.
+ additionalProperties: false
+ required:
+ - generations
+ - scores
+ title: EvaluateResponse
+ description: The response from an evaluation.
+ ScoringResult:
+ type: object
+ properties:
+ score_rows:
+ type: array
+ items:
+ type: object
+ additionalProperties:
+ oneOf:
+ - type: 'null'
+ - type: boolean
+ - type: number
+ - type: string
+ - type: array
+ - type: object
+ description: >-
+ The scoring result for each row. Each row is a map of column name to value.
+ aggregated_results:
+ type: object
+ additionalProperties:
+ oneOf:
+ - type: 'null'
+ - type: boolean
+ - type: number
+ - type: string
+ - type: array
+ - type: object
+ description: Map of metric name to aggregated value
+ additionalProperties: false
+ required:
+ - score_rows
+ - aggregated_results
+ title: ScoringResult
+ description: A scoring result for a single row.
Agent:
type: object
properties:
@@ -4803,6 +5382,179 @@ components:
- llm
- embedding
title: ModelType
+ AgentTurnInputType:
+ type: object
+ properties:
+ type:
+ type: string
+ const: agent_turn_input
+ default: agent_turn_input
+ additionalProperties: false
+ required:
+ - type
+ title: AgentTurnInputType
+ ArrayType:
+ type: object
+ properties:
+ type:
+ type: string
+ const: array
+ default: array
+ additionalProperties: false
+ required:
+ - type
+ title: ArrayType
+ BooleanType:
+ type: object
+ properties:
+ type:
+ type: string
+ const: boolean
+ default: boolean
+ additionalProperties: false
+ required:
+ - type
+ title: BooleanType
+ ChatCompletionInputType:
+ type: object
+ properties:
+ type:
+ type: string
+ const: chat_completion_input
+ default: chat_completion_input
+ additionalProperties: false
+ required:
+ - type
+ title: ChatCompletionInputType
+ CompletionInputType:
+ type: object
+ properties:
+ type:
+ type: string
+ const: completion_input
+ default: completion_input
+ additionalProperties: false
+ required:
+ - type
+ title: CompletionInputType
+ JsonType:
+ type: object
+ properties:
+ type:
+ type: string
+ const: json
+ default: json
+ additionalProperties: false
+ required:
+ - type
+ title: JsonType
+ NumberType:
+ type: object
+ properties:
+ type:
+ type: string
+ const: number
+ default: number
+ additionalProperties: false
+ required:
+ - type
+ title: NumberType
+ ObjectType:
+ type: object
+ properties:
+ type:
+ type: string
+ const: object
+ default: object
+ additionalProperties: false
+ required:
+ - type
+ title: ObjectType
+ ParamType:
+ oneOf:
+ - $ref: '#/components/schemas/StringType'
+ - $ref: '#/components/schemas/NumberType'
+ - $ref: '#/components/schemas/BooleanType'
+ - $ref: '#/components/schemas/ArrayType'
+ - $ref: '#/components/schemas/ObjectType'
+ - $ref: '#/components/schemas/JsonType'
+ - $ref: '#/components/schemas/UnionType'
+ - $ref: '#/components/schemas/ChatCompletionInputType'
+ - $ref: '#/components/schemas/CompletionInputType'
+ - $ref: '#/components/schemas/AgentTurnInputType'
+ discriminator:
+ propertyName: type
+ mapping:
+ string: '#/components/schemas/StringType'
+ number: '#/components/schemas/NumberType'
+ boolean: '#/components/schemas/BooleanType'
+ array: '#/components/schemas/ArrayType'
+ object: '#/components/schemas/ObjectType'
+ json: '#/components/schemas/JsonType'
+ union: '#/components/schemas/UnionType'
+ chat_completion_input: '#/components/schemas/ChatCompletionInputType'
+ completion_input: '#/components/schemas/CompletionInputType'
+ agent_turn_input: '#/components/schemas/AgentTurnInputType'
+ ScoringFn:
+ type: object
+ properties:
+ identifier:
+ type: string
+ provider_resource_id:
+ type: string
+ provider_id:
+ type: string
+ type:
+ type: string
+ const: scoring_function
+ default: scoring_function
+ description:
+ type: string
+ metadata:
+ type: object
+ additionalProperties:
+ oneOf:
+ - type: 'null'
+ - type: boolean
+ - type: number
+ - type: string
+ - type: array
+ - type: object
+ return_type:
+ $ref: '#/components/schemas/ParamType'
+ params:
+ $ref: '#/components/schemas/ScoringFnParams'
+ additionalProperties: false
+ required:
+ - identifier
+ - provider_resource_id
+ - provider_id
+ - type
+ - metadata
+ - return_type
+ title: ScoringFn
+ StringType:
+ type: object
+ properties:
+ type:
+ type: string
+ const: string
+ default: string
+ additionalProperties: false
+ required:
+ - type
+ title: StringType
+ UnionType:
+ type: object
+ properties:
+ type:
+ type: string
+ const: union
+ default: union
+ additionalProperties: false
+ required:
+ - type
+ title: UnionType
Shield:
type: object
properties:
@@ -5141,27 +5893,27 @@ components:
- embedding_model
- embedding_dimension
title: VectorDB
- BenchmarkTask:
+ BenchmarkEvaluationTask:
type: object
properties:
type:
type: string
- const: benchmark_id
- default: benchmark_id
+ const: benchmark
+ default: benchmark
benchmark_id:
type: string
additionalProperties: false
required:
- type
- benchmark_id
- title: BenchmarkTask
- DataSourceGraderTask:
+ title: BenchmarkEvaluationTask
+ DataEvaluationTask:
type: object
properties:
type:
type: string
- const: data_source_grader
- default: data_source_grader
+ const: data
+ default: data
data_source:
$ref: '#/components/schemas/DataSource'
grader_ids:
@@ -5173,14 +5925,14 @@ components:
- type
- data_source
- grader_ids
- title: DataSourceGraderTask
- DatasetGraderTask:
+ title: DataEvaluationTask
+ DatasetEvaluationTask:
type: object
properties:
type:
type: string
- const: dataset_grader
- default: dataset_grader
+ const: dataset
+ default: dataset
dataset_id:
type: string
grader_ids:
@@ -5192,18 +5944,18 @@ components:
- type
- dataset_id
- grader_ids
- title: DatasetGraderTask
+ title: DatasetEvaluationTask
EvaluationTask:
oneOf:
- - $ref: '#/components/schemas/BenchmarkTask'
- - $ref: '#/components/schemas/DatasetGraderTask'
- - $ref: '#/components/schemas/DataSourceGraderTask'
+ - $ref: '#/components/schemas/BenchmarkEvaluationTask'
+ - $ref: '#/components/schemas/DatasetEvaluationTask'
+ - $ref: '#/components/schemas/DataEvaluationTask'
discriminator:
propertyName: type
mapping:
- benchmark_id: '#/components/schemas/BenchmarkTask'
- dataset_grader: '#/components/schemas/DatasetGraderTask'
- data_source_grader: '#/components/schemas/DataSourceGraderTask'
+ benchmark: '#/components/schemas/BenchmarkEvaluationTask'
+ dataset: '#/components/schemas/DatasetEvaluationTask'
+ data: '#/components/schemas/DataEvaluationTask'
GradeRequest:
type: object
properties:
@@ -5218,23 +5970,6 @@ components:
required:
- task
title: GradeRequest
- AgentCandidate:
- type: object
- properties:
- type:
- type: string
- const: agent
- default: agent
- config:
- $ref: '#/components/schemas/AgentConfig'
- description: >-
- The configuration for the agent candidate.
- additionalProperties: false
- required:
- - type
- - config
- title: AgentCandidate
- description: An agent candidate for evaluation.
EvaluationCandidate:
oneOf:
- $ref: '#/components/schemas/ModelCandidate'
@@ -5263,10 +5998,10 @@ components:
type: string
format: date-time
description: The time the job was created.
- ended_at:
+ completed_at:
type: string
format: date-time
- description: The time the job ended.
+ description: The time the job completed.
error:
type: string
description: >-
@@ -5288,30 +6023,6 @@ components:
- task
- candidate
title: EvaluationJob
- ModelCandidate:
- type: object
- properties:
- type:
- type: string
- const: model
- default: model
- model_id:
- type: string
- sampling_params:
- $ref: '#/components/schemas/SamplingParams'
- description: The sampling parameters for the model.
- system_message:
- $ref: '#/components/schemas/SystemMessage'
- description: >-
- (Optional) The system message providing instructions or context to the
- model.
- additionalProperties: false
- required:
- - type
- - model_id
- - sampling_params
- title: ModelCandidate
- description: A model candidate for evaluation.
GradeSyncRequest:
type: object
properties:
@@ -5355,41 +6066,6 @@ components:
- scores
title: EvaluationResponse
description: A response to an inline evaluation.
- ScoringResult:
- type: object
- properties:
- scores:
- type: array
- items:
- type: object
- additionalProperties:
- oneOf:
- - type: 'null'
- - type: boolean
- - type: number
- - type: string
- - type: array
- - type: object
- description: >-
- The scoring result for each row. Each row is a map of grader column name
- to value.
- metrics:
- type: object
- additionalProperties:
- oneOf:
- - type: 'null'
- - type: boolean
- - type: number
- - type: string
- - type: array
- - type: object
- description: Map of metric name to aggregated value.
- additionalProperties: false
- required:
- - scores
- - metrics
- title: ScoringResult
- description: A scoring result for a single row.
HealthInfo:
type: object
properties:
@@ -5576,6 +6252,15 @@ components:
- data
title: IterrowsResponse
description: A paginated list of rows from a dataset.
+ JobStatus:
+ type: string
+ enum:
+ - completed
+ - in_progress
+ - failed
+ - scheduled
+ - cancelled
+ title: JobStatus
ListAgentSessionsResponse:
type: object
properties:
@@ -5707,6 +6392,17 @@ components:
required:
- data
title: ListGraderTypesResponse
+ ListGradersResponse:
+ type: object
+ properties:
+ data:
+ type: array
+ items:
+ $ref: '#/components/schemas/Grader'
+ additionalProperties: false
+ required:
+ - data
+ title: ListGradersResponse
ListModelsResponse:
type: object
properties:
@@ -5757,6 +6453,17 @@ components:
required:
- data
title: ListRoutesResponse
+ ListScoringFunctionsResponse:
+ type: object
+ properties:
+ data:
+ type: array
+ items:
+ $ref: '#/components/schemas/ScoringFn'
+ additionalProperties: false
+ required:
+ - data
+ title: ListScoringFunctionsResponse
ListShieldsResponse:
type: object
properties:
@@ -6394,7 +7101,7 @@ components:
dataset_id:
type: string
description: >-
- The ID of the dataset to used to run the benchmark.
+ The ID of the dataset to be used to run the benchmark.
grader_ids:
type: array
items:
@@ -6532,6 +7239,27 @@ components:
required:
- model_id
title: RegisterModelRequest
+ RegisterScoringFunctionRequest:
+ type: object
+ properties:
+ scoring_fn_id:
+ type: string
+ description:
+ type: string
+ return_type:
+ $ref: '#/components/schemas/ParamType'
+ provider_scoring_fn_id:
+ type: string
+ provider_id:
+ type: string
+ params:
+ $ref: '#/components/schemas/ScoringFnParams'
+ additionalProperties: false
+ required:
+ - scoring_fn_id
+ - description
+ - return_type
+ title: RegisterScoringFunctionRequest
RegisterShieldRequest:
type: object
properties:
@@ -6631,6 +7359,25 @@ components:
- task
- candidate
title: RunRequest
+ RunEvalRequest:
+ type: object
+ properties:
+ benchmark_config:
+ $ref: '#/components/schemas/BenchmarkConfig'
+ description: The configuration for the benchmark.
+ additionalProperties: false
+ required:
+ - benchmark_config
+ title: RunEvalRequest
+ Job:
+ type: object
+ properties:
+ job_id:
+ type: string
+ additionalProperties: false
+ required:
+ - job_id
+ title: Job
RunShieldRequest:
type: object
properties:
@@ -6702,6 +7449,81 @@ components:
- attributes_to_save
- dataset_id
title: SaveSpansToDatasetRequest
+ ScoreRequest:
+ type: object
+ properties:
+ input_rows:
+ type: array
+ items:
+ type: object
+ additionalProperties:
+ oneOf:
+ - type: 'null'
+ - type: boolean
+ - type: number
+ - type: string
+ - type: array
+ - type: object
+ description: The rows to score.
+ scoring_functions:
+ type: object
+ additionalProperties:
+ oneOf:
+ - $ref: '#/components/schemas/ScoringFnParams'
+ - type: 'null'
+ description: >-
+ The scoring functions to use for the scoring.
+ additionalProperties: false
+ required:
+ - input_rows
+ - scoring_functions
+ title: ScoreRequest
+ ScoreResponse:
+ type: object
+ properties:
+ results:
+ type: object
+ additionalProperties:
+ $ref: '#/components/schemas/ScoringResult'
+ description: >-
+ A map of scoring function name to ScoringResult.
+ additionalProperties: false
+ required:
+ - results
+ title: ScoreResponse
+ description: The response from scoring.
+ ScoreBatchRequest:
+ type: object
+ properties:
+ dataset_id:
+ type: string
+ scoring_functions:
+ type: object
+ additionalProperties:
+ oneOf:
+ - $ref: '#/components/schemas/ScoringFnParams'
+ - type: 'null'
+ save_results_dataset:
+ type: boolean
+ additionalProperties: false
+ required:
+ - dataset_id
+ - scoring_functions
+ - save_results_dataset
+ title: ScoreBatchRequest
+ ScoreBatchResponse:
+ type: object
+ properties:
+ dataset_id:
+ type: string
+ results:
+ type: object
+ additionalProperties:
+ $ref: '#/components/schemas/ScoringResult'
+ additionalProperties: false
+ required:
+ - results
+ title: ScoreBatchResponse
AlgorithmConfig:
oneOf:
- $ref: '#/components/schemas/LoraFinetuningConfig'
@@ -6939,6 +7761,9 @@ tags:
- name: Benchmarks
- name: DatasetIO
- name: Datasets
+ - name: Eval
+ x-displayName: >-
+ Llama Stack Evaluation API for running evaluations on model and agent candidates.
- name: Evaluation
- name: Files
- name: Graders
@@ -6961,6 +7786,8 @@ tags:
x-displayName: >-
Providers API for inspecting, listing, and modifying providers and their configurations.
- name: Safety
+ - name: Scoring
+ - name: ScoringFunctions
- name: Shields
- name: SyntheticDataGeneration (Coming Soon)
- name: Telemetry
@@ -6976,6 +7803,7 @@ x-tagGroups:
- Benchmarks
- DatasetIO
- Datasets
+ - Eval
- Evaluation
- Files
- Graders
@@ -6985,6 +7813,8 @@ x-tagGroups:
- PostTraining (Coming Soon)
- Providers
- Safety
+ - Scoring
+ - ScoringFunctions
- Shields
- SyntheticDataGeneration (Coming Soon)
- Telemetry
diff --git a/llama_stack/apis/benchmarks/benchmarks.py b/llama_stack/apis/benchmarks/benchmarks.py
index 11db4d350..8017e5c27 100644
--- a/llama_stack/apis/benchmarks/benchmarks.py
+++ b/llama_stack/apis/benchmarks/benchmarks.py
@@ -62,7 +62,7 @@ class Benchmarks(Protocol):
"""
Register a new benchmark.
- :param dataset_id: The ID of the dataset to used to run the benchmark.
+ :param dataset_id: The ID of the dataset to be used to run the benchmark.
:param grader_ids: List of grader ids to use for this benchmark.
:param benchmark_id: (Optional) The ID of the benchmark to register. If not provided, an ID will be generated.
:param metadata: (Optional) Metadata for this benchmark for additional descriptions.
@@ -87,3 +87,10 @@ class Benchmarks(Protocol):
:param benchmark_id: The ID of the benchmark to get.
"""
...
+
+ @webmethod(route="/benchmarks/{benchmark_id}", method="DELETE")
+ async def unregister_benchmark(self, benchmark_id: str) -> None:
+ """
+ Unregister a benchmark by ID.
+ """
+ ...
diff --git a/llama_stack/apis/common/job_types.py b/llama_stack/apis/common/job_types.py
index 57775754b..307e3fa54 100644
--- a/llama_stack/apis/common/job_types.py
+++ b/llama_stack/apis/common/job_types.py
@@ -5,7 +5,6 @@
# the root directory of this source tree.
from datetime import datetime
from enum import Enum
-from typing import Optional
from pydantic import BaseModel
@@ -38,12 +37,12 @@ class CommonJobFields(BaseModel):
:param id: The ID of the job.
:param status: The status of the job.
:param created_at: The time the job was created.
- :param ended_at: The time the job ended.
+ :param completed_at: The time the job completed.
:param error: If status of the job is failed, this will contain the error message.
"""
id: str
status: JobStatus
created_at: datetime
- ended_at: Optional[datetime] = None
- error: Optional[str] = None
+ completed_at: datetime | None = None
+ error: str | None = None
diff --git a/llama_stack/apis/evaluation/evaluation.py b/llama_stack/apis/evaluation/evaluation.py
index faa620872..e667acfd4 100644
--- a/llama_stack/apis/evaluation/evaluation.py
+++ b/llama_stack/apis/evaluation/evaluation.py
@@ -48,28 +48,28 @@ EvaluationCandidate = register_schema(
@json_schema_type
-class BenchmarkTask(BaseModel):
- type: Literal["benchmark_id"] = "benchmark_id"
+class BenchmarkEvaluationTask(BaseModel):
+ type: Literal["benchmark"] = "benchmark"
benchmark_id: str
@json_schema_type
-class DatasetGraderTask(BaseModel):
- type: Literal["dataset_grader"] = "dataset_grader"
+class DatasetEvaluationTask(BaseModel):
+ type: Literal["dataset"] = "dataset"
dataset_id: str
grader_ids: List[str]
@json_schema_type
-class DataSourceGraderTask(BaseModel):
- type: Literal["data_source_grader"] = "data_source_grader"
+class DataEvaluationTask(BaseModel):
+ type: Literal["data"] = "data"
data_source: DataSource
grader_ids: List[str]
EvaluationTask = register_schema(
Annotated[
- Union[BenchmarkTask, DatasetGraderTask, DataSourceGraderTask],
+ Union[BenchmarkEvaluationTask, DatasetEvaluationTask, DataEvaluationTask],
Field(discriminator="type"),
],
name="EvaluationTask",
diff --git a/llama_stack/apis/graders/graders.py b/llama_stack/apis/graders/graders.py
index 98b85552a..522559c3f 100644
--- a/llama_stack/apis/graders/graders.py
+++ b/llama_stack/apis/graders/graders.py
@@ -29,6 +29,13 @@ from .graders import * # noqa: F401 F403
class GraderType(Enum):
"""
A type of grader. Each type is a criteria for evaluating answers.
+
+ :cvar llm: Use an LLM to score the answer.
+ :cvar regex_parser: Use a regex parser to score the answer.
+ :cvar equality: Check if the answer is equal to the reference answer.
+ :cvar subset_of: Check if the answer is a subset of the reference answer.
+ :cvar factuality: Check if the answer is factually correct using LLM as judge.
+ :cvar faithfulness: Check if the answer is faithful to the reference answer using LLM as judge.
"""
llm = "llm"
@@ -221,9 +228,9 @@ class Graders(Protocol):
...
@webmethod(route="/graders/{grader_id:path}", method="DELETE")
- async def delete_grader(self, grader_id: str) -> None:
+ async def unregister_grader(self, grader_id: str) -> None:
"""
- Delete a grader by ID.
+ Unregister a grader by ID.
:param grader_id: The ID of the grader.
"""
...
diff --git a/llama_stack/distribution/stack.py b/llama_stack/distribution/stack.py
index b4862537a..a5fa0fe39 100644
--- a/llama_stack/distribution/stack.py
+++ b/llama_stack/distribution/stack.py
@@ -17,6 +17,7 @@ from llama_stack.apis.batch_inference import BatchInference
from llama_stack.apis.benchmarks import Benchmarks
from llama_stack.apis.datasetio import DatasetIO
from llama_stack.apis.datasets import Datasets
+from llama_stack.apis.eval import Eval
from llama_stack.apis.evaluation import Evaluation
from llama_stack.apis.files import Files
from llama_stack.apis.graders import Graders
@@ -26,6 +27,8 @@ from llama_stack.apis.models import Models
from llama_stack.apis.post_training import PostTraining
from llama_stack.apis.providers import Providers
from llama_stack.apis.safety import Safety
+from llama_stack.apis.scoring import Scoring
+from llama_stack.apis.scoring_functions import ScoringFunctions
from llama_stack.apis.shields import Shields
from llama_stack.apis.synthetic_data_generation import SyntheticDataGeneration
from llama_stack.apis.telemetry import Telemetry
@@ -66,6 +69,9 @@ class LlamaStack(
Files,
Graders,
Evaluation,
+ Eval,
+ ScoringFunctions,
+ Scoring,
):
pass
@@ -111,7 +117,9 @@ class EnvVarError(Exception):
def __init__(self, var_name: str, path: str = ""):
self.var_name = var_name
self.path = path
- super().__init__(f"Environment variable '{var_name}' not set or empty{f' at {path}' if path else ''}")
+ super().__init__(
+ f"Environment variable '{var_name}' not set or empty{f' at {path}' if path else ''}"
+ )
def redact_sensitive_fields(data: Dict[str, Any]) -> Dict[str, Any]:
@@ -202,7 +210,9 @@ def validate_env_pair(env_pair: str) -> tuple[str, str]:
if not key:
raise ValueError(f"Empty key in environment variable pair: {env_pair}")
if not all(c.isalnum() or c == "_" for c in key):
- raise ValueError(f"Key must contain only alphanumeric characters and underscores: {key}")
+ raise ValueError(
+ f"Key must contain only alphanumeric characters and underscores: {key}"
+ )
return key, value
except ValueError as e:
raise ValueError(
@@ -215,14 +225,20 @@ def validate_env_pair(env_pair: str) -> tuple[str, str]:
async def construct_stack(
run_config: StackRunConfig, provider_registry: Optional[ProviderRegistry] = None
) -> Dict[Api, Any]:
- dist_registry, _ = await create_dist_registry(run_config.metadata_store, run_config.image_name)
- impls = await resolve_impls(run_config, provider_registry or get_provider_registry(), dist_registry)
+ dist_registry, _ = await create_dist_registry(
+ run_config.metadata_store, run_config.image_name
+ )
+ impls = await resolve_impls(
+ run_config, provider_registry or get_provider_registry(), dist_registry
+ )
await register_resources(run_config, impls)
return impls
def get_stack_run_config_from_template(template: str) -> StackRunConfig:
- template_path = importlib.resources.files("llama_stack") / f"templates/{template}/run.yaml"
+ template_path = (
+ importlib.resources.files("llama_stack") / f"templates/{template}/run.yaml"
+ )
with importlib.resources.as_file(template_path) as path:
if not path.exists():
@@ -265,7 +281,9 @@ def run_config_from_adhoc_config_spec(
# call method "sample_run_config" on the provider spec config class
provider_config_type = instantiate_class_type(provider_spec.config_class)
- provider_config = replace_env_vars(provider_config_type.sample_run_config(__distro_dir__=distro_dir))
+ provider_config = replace_env_vars(
+ provider_config_type.sample_run_config(__distro_dir__=distro_dir)
+ )
provider_configs_by_api[api_str] = [
Provider(