From 035b2dcb605a2ebfe33ec2fdd132e379690284e8 Mon Sep 17 00:00:00 2001 From: Xi Yan Date: Sun, 16 Mar 2025 19:33:57 -0700 Subject: [PATCH] new apis --- docs/_static/llama-stack-spec.html | 2499 ++++++++++----------- docs/_static/llama-stack-spec.yaml | 1738 +++++++------- llama_stack/apis/benchmarks/benchmarks.py | 51 +- llama_stack/apis/common/job_types.py | 36 +- llama_stack/apis/eval/eval.py | 8 +- llama_stack/apis/evaluation/__init__.py | 7 + llama_stack/apis/evaluation/evaluation.py | 175 ++ llama_stack/apis/graders/__init__.py | 7 + llama_stack/distribution/stack.py | 34 +- 9 files changed, 2365 insertions(+), 2190 deletions(-) create mode 100644 llama_stack/apis/evaluation/__init__.py create mode 100644 llama_stack/apis/evaluation/evaluation.py create mode 100644 llama_stack/apis/graders/__init__.py diff --git a/docs/_static/llama-stack-spec.html b/docs/_static/llama-stack-spec.html index e3c81ddb9..d6f420cae 100644 --- a/docs/_static/llama-stack-spec.html +++ b/docs/_static/llama-stack-spec.html @@ -866,6 +866,83 @@ ] } }, + "/v1/graders/{grader_id}": { + "get": { + "responses": { + "200": { + "description": "The grader.", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/Grader" + } + } + } + }, + "400": { + "$ref": "#/components/responses/BadRequest400" + }, + "429": { + "$ref": "#/components/responses/TooManyRequests429" + }, + "500": { + "$ref": "#/components/responses/InternalServerError500" + }, + "default": { + "$ref": "#/components/responses/DefaultError" + } + }, + "tags": [ + "Graders" + ], + "description": "Get a grader by ID.", + "parameters": [ + { + "name": "grader_id", + "in": "path", + "description": "The ID of the grader.", + "required": true, + "schema": { + "type": "string" + } + } + ] + }, + "delete": { + "responses": { + "200": { + "description": "OK" + }, + "400": { + "$ref": "#/components/responses/BadRequest400" + }, + "429": { + "$ref": "#/components/responses/TooManyRequests429" + }, + "500": { + 
"$ref": "#/components/responses/InternalServerError500" + }, + "default": { + "$ref": "#/components/responses/DefaultError" + } + }, + "tags": [ + "Graders" + ], + "description": "Delete a grader by ID.", + "parameters": [ + { + "name": "grader_id", + "in": "path", + "description": "The ID of the grader.", + "required": true, + "schema": { + "type": "string" + } + } + ] + } + }, "/v1/inference/embeddings": { "post": { "responses": { @@ -909,59 +986,6 @@ } } }, - "/v1/eval/benchmarks/{benchmark_id}/evaluations": { - "post": { - "responses": { - "200": { - "description": "EvaluateResponse object containing generations and scores", - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/EvaluateResponse" - } - } - } - }, - "400": { - "$ref": "#/components/responses/BadRequest400" - }, - "429": { - "$ref": "#/components/responses/TooManyRequests429" - }, - "500": { - "$ref": "#/components/responses/InternalServerError500" - }, - "default": { - "$ref": "#/components/responses/DefaultError" - } - }, - "tags": [ - "Eval" - ], - "description": "Evaluate a list of rows on a benchmark.", - "parameters": [ - { - "name": "benchmark_id", - "in": "path", - "description": "The ID of the benchmark to run the evaluation on.", - "required": true, - "schema": { - "type": "string" - } - } - ], - "requestBody": { - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/EvaluateRowsRequest" - } - } - }, - "required": true - } - } - }, "/v1/agents/{agent_id}/session/{session_id}/turn/{turn_id}/step/{step_id}": { "get": { "responses": { @@ -1101,14 +1125,7 @@ "content": { "application/json": { "schema": { - "oneOf": [ - { - "$ref": "#/components/schemas/Benchmark" - }, - { - "type": "null" - } - ] + "$ref": "#/components/schemas/Benchmark" } } } @@ -1129,11 +1146,12 @@ "tags": [ "Benchmarks" ], - "description": "", + "description": "Get a benchmark by ID.", "parameters": [ { "name": "benchmark_id", "in": "path", + "description": 
"The ID of the benchmark to get.", "required": true, "schema": { "type": "string" @@ -1306,55 +1324,6 @@ ] } }, - "/v1/scoring-functions/{scoring_fn_id}": { - "get": { - "responses": { - "200": { - "description": "OK", - "content": { - "application/json": { - "schema": { - "oneOf": [ - { - "$ref": "#/components/schemas/ScoringFn" - }, - { - "type": "null" - } - ] - } - } - } - }, - "400": { - "$ref": "#/components/responses/BadRequest400" - }, - "429": { - "$ref": "#/components/responses/TooManyRequests429" - }, - "500": { - "$ref": "#/components/responses/InternalServerError500" - }, - "default": { - "$ref": "#/components/responses/DefaultError" - } - }, - "tags": [ - "ScoringFunctions" - ], - "description": "", - "parameters": [ - { - "name": "scoring_fn_id", - "in": "path", - "required": true, - "schema": { - "type": "string" - } - } - ] - } - }, "/v1/shields/{identifier}": { "get": { "responses": { @@ -1987,6 +1956,92 @@ ] } }, + "/v1/evaluation/grade": { + "post": { + "responses": { + "200": { + "description": "The evaluation job containing grader scores.", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/EvaluationJob" + } + } + } + }, + "400": { + "$ref": "#/components/responses/BadRequest400" + }, + "429": { + "$ref": "#/components/responses/TooManyRequests429" + }, + "500": { + "$ref": "#/components/responses/InternalServerError500" + }, + "default": { + "$ref": "#/components/responses/DefaultError" + } + }, + "tags": [ + "Evaluation" + ], + "description": "Run an grading job with generated results. Use this when you have generated results from inference in a dataset.", + "parameters": [], + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/GradeRequest" + } + } + }, + "required": true + } + } + }, + "/v1/evaluation/grade_inline": { + "post": { + "responses": { + "200": { + "description": "The evaluation job containing grader scores. 
\"generations\" is not populated in the response.", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/EvaluationResponse" + } + } + } + }, + "400": { + "$ref": "#/components/responses/BadRequest400" + }, + "429": { + "$ref": "#/components/responses/TooManyRequests429" + }, + "500": { + "$ref": "#/components/responses/InternalServerError500" + }, + "default": { + "$ref": "#/components/responses/DefaultError" + } + }, + "tags": [ + "Evaluation" + ], + "description": "Run an grading job with generated results inline.", + "parameters": [], + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/GradeInlineRequest" + } + } + }, + "required": true + } + } + }, "/v1/health": { "get": { "responses": { @@ -2238,160 +2293,6 @@ ] } }, - "/v1/eval/benchmarks/{benchmark_id}/jobs/{job_id}": { - "get": { - "responses": { - "200": { - "description": "The status of the evaluationjob.", - "content": { - "application/json": { - "schema": { - "oneOf": [ - { - "$ref": "#/components/schemas/JobStatus" - }, - { - "type": "null" - } - ] - } - } - } - }, - "400": { - "$ref": "#/components/responses/BadRequest400" - }, - "429": { - "$ref": "#/components/responses/TooManyRequests429" - }, - "500": { - "$ref": "#/components/responses/InternalServerError500" - }, - "default": { - "$ref": "#/components/responses/DefaultError" - } - }, - "tags": [ - "Eval" - ], - "description": "Get the status of a job.", - "parameters": [ - { - "name": "benchmark_id", - "in": "path", - "description": "The ID of the benchmark to run the evaluation on.", - "required": true, - "schema": { - "type": "string" - } - }, - { - "name": "job_id", - "in": "path", - "description": "The ID of the job to get the status of.", - "required": true, - "schema": { - "type": "string" - } - } - ] - }, - "delete": { - "responses": { - "200": { - "description": "OK" - }, - "400": { - "$ref": "#/components/responses/BadRequest400" - }, - "429": { - 
"$ref": "#/components/responses/TooManyRequests429" - }, - "500": { - "$ref": "#/components/responses/InternalServerError500" - }, - "default": { - "$ref": "#/components/responses/DefaultError" - } - }, - "tags": [ - "Eval" - ], - "description": "Cancel a job.", - "parameters": [ - { - "name": "benchmark_id", - "in": "path", - "description": "The ID of the benchmark to run the evaluation on.", - "required": true, - "schema": { - "type": "string" - } - }, - { - "name": "job_id", - "in": "path", - "description": "The ID of the job to cancel.", - "required": true, - "schema": { - "type": "string" - } - } - ] - } - }, - "/v1/eval/benchmarks/{benchmark_id}/jobs/{job_id}/result": { - "get": { - "responses": { - "200": { - "description": "The result of the job.", - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/EvaluateResponse" - } - } - } - }, - "400": { - "$ref": "#/components/responses/BadRequest400" - }, - "429": { - "$ref": "#/components/responses/TooManyRequests429" - }, - "500": { - "$ref": "#/components/responses/InternalServerError500" - }, - "default": { - "$ref": "#/components/responses/DefaultError" - } - }, - "tags": [ - "Eval" - ], - "description": "Get the result of a job.", - "parameters": [ - { - "name": "benchmark_id", - "in": "path", - "description": "The ID of the benchmark to run the evaluation on.", - "required": true, - "schema": { - "type": "string" - } - }, - { - "name": "job_id", - "in": "path", - "description": "The ID of the job to get the result of.", - "required": true, - "schema": { - "type": "string" - } - } - ] - } - }, "/v1/agents/{agent_id}/sessions": { "get": { "responses": { @@ -2464,13 +2365,20 @@ "tags": [ "Benchmarks" ], - "description": "", + "description": "List all benchmarks.", "parameters": [] }, "post": { "responses": { "200": { - "description": "OK" + "description": "OK", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/Benchmark" + } + } + } }, "400": { 
"$ref": "#/components/responses/BadRequest400" @@ -2488,7 +2396,7 @@ "tags": [ "Benchmarks" ], - "description": "", + "description": "Register a new benchmark.", "parameters": [], "requestBody": { "content": { @@ -2619,6 +2527,113 @@ ] } }, + "/v1/graders/types": { + "get": { + "responses": { + "200": { + "description": "A list of grader types and information about the types.", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/ListGraderTypesResponse" + } + } + } + }, + "400": { + "$ref": "#/components/responses/BadRequest400" + }, + "429": { + "$ref": "#/components/responses/TooManyRequests429" + }, + "500": { + "$ref": "#/components/responses/InternalServerError500" + }, + "default": { + "$ref": "#/components/responses/DefaultError" + } + }, + "tags": [ + "Graders" + ], + "description": "List all grader types.", + "parameters": [] + } + }, + "/v1/graders": { + "get": { + "responses": { + "200": { + "description": "A list of graders.", + "content": { + "application/jsonl": { + "schema": { + "$ref": "#/components/schemas/Grader" + } + } + } + }, + "400": { + "$ref": "#/components/responses/BadRequest400" + }, + "429": { + "$ref": "#/components/responses/TooManyRequests429" + }, + "500": { + "$ref": "#/components/responses/InternalServerError500" + }, + "default": { + "$ref": "#/components/responses/DefaultError" + } + }, + "tags": [ + "Graders" + ], + "description": "List all graders.", + "parameters": [] + }, + "post": { + "responses": { + "200": { + "description": "The registered grader.", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/Grader" + } + } + } + }, + "400": { + "$ref": "#/components/responses/BadRequest400" + }, + "429": { + "$ref": "#/components/responses/TooManyRequests429" + }, + "500": { + "$ref": "#/components/responses/InternalServerError500" + }, + "default": { + "$ref": "#/components/responses/DefaultError" + } + }, + "tags": [ + "Graders" + ], + "description": "Register 
a new grader.", + "parameters": [], + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/RegisterGraderRequest" + } + } + }, + "required": true + } + } + }, "/v1/models": { "get": { "responses": { @@ -2809,73 +2824,6 @@ ] } }, - "/v1/scoring-functions": { - "get": { - "responses": { - "200": { - "description": "OK", - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/ListScoringFunctionsResponse" - } - } - } - }, - "400": { - "$ref": "#/components/responses/BadRequest400" - }, - "429": { - "$ref": "#/components/responses/TooManyRequests429" - }, - "500": { - "$ref": "#/components/responses/InternalServerError500" - }, - "default": { - "$ref": "#/components/responses/DefaultError" - } - }, - "tags": [ - "ScoringFunctions" - ], - "description": "", - "parameters": [] - }, - "post": { - "responses": { - "200": { - "description": "OK" - }, - "400": { - "$ref": "#/components/responses/BadRequest400" - }, - "429": { - "$ref": "#/components/responses/TooManyRequests429" - }, - "500": { - "$ref": "#/components/responses/InternalServerError500" - }, - "default": { - "$ref": "#/components/responses/DefaultError" - } - }, - "tags": [ - "ScoringFunctions" - ], - "description": "", - "parameters": [], - "requestBody": { - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/RegisterScoringFunctionRequest" - } - } - }, - "required": true - } - } - }, "/v1/shields": { "get": { "responses": { @@ -3460,15 +3408,15 @@ } } }, - "/v1/eval/benchmarks/{benchmark_id}/jobs": { + "/v1/evaluation/run": { "post": { "responses": { "200": { - "description": "The job that was created to run the evaluation.", + "description": "OK", "content": { "application/json": { "schema": { - "$ref": "#/components/schemas/Job" + "$ref": "#/components/schemas/EvaluationJob" } } } @@ -3487,25 +3435,58 @@ } }, "tags": [ - "Eval" - ], - "description": "Run an evaluation on a benchmark.", - 
"parameters": [ - { - "name": "benchmark_id", - "in": "path", - "description": "The ID of the benchmark to run the evaluation on.", - "required": true, - "schema": { - "type": "string" - } - } + "Evaluation" ], + "description": "Run an evaluation job.", + "parameters": [], "requestBody": { "content": { "application/json": { "schema": { - "$ref": "#/components/schemas/RunEvalRequest" + "$ref": "#/components/schemas/RunRequest" + } + } + }, + "required": true + } + } + }, + "/v1/evaluation/run_inline": { + "post": { + "responses": { + "200": { + "description": "OK", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/EvaluationResponse" + } + } + } + }, + "400": { + "$ref": "#/components/responses/BadRequest400" + }, + "429": { + "$ref": "#/components/responses/TooManyRequests429" + }, + "500": { + "$ref": "#/components/responses/InternalServerError500" + }, + "default": { + "$ref": "#/components/responses/DefaultError" + } + }, + "tags": [ + "Evaluation" + ], + "description": "Run an evaluation job inline.", + "parameters": [], + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/RunInlineRequest" } } }, @@ -3592,92 +3573,6 @@ } } }, - "/v1/scoring/score": { - "post": { - "responses": { - "200": { - "description": "ScoreResponse object containing rows and aggregated results", - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/ScoreResponse" - } - } - } - }, - "400": { - "$ref": "#/components/responses/BadRequest400" - }, - "429": { - "$ref": "#/components/responses/TooManyRequests429" - }, - "500": { - "$ref": "#/components/responses/InternalServerError500" - }, - "default": { - "$ref": "#/components/responses/DefaultError" - } - }, - "tags": [ - "Scoring" - ], - "description": "Score a list of rows.", - "parameters": [], - "requestBody": { - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/ScoreRequest" - } - } - }, 
- "required": true - } - } - }, - "/v1/scoring/score-batch": { - "post": { - "responses": { - "200": { - "description": "OK", - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/ScoreBatchResponse" - } - } - } - }, - "400": { - "$ref": "#/components/responses/BadRequest400" - }, - "429": { - "$ref": "#/components/responses/TooManyRequests429" - }, - "500": { - "$ref": "#/components/responses/InternalServerError500" - }, - "default": { - "$ref": "#/components/responses/DefaultError" - } - }, - "tags": [ - "Scoring" - ], - "description": "", - "parameters": [], - "requestBody": { - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/ScoreBatchRequest" - } - } - }, - "required": true - } - } - }, "/v1/post-training/supervised-fine-tune": { "post": { "responses": { @@ -6303,381 +6198,6 @@ "title": "EmbeddingsResponse", "description": "Response containing generated embeddings." }, - "AgentCandidate": { - "type": "object", - "properties": { - "type": { - "type": "string", - "const": "agent", - "default": "agent" - }, - "config": { - "$ref": "#/components/schemas/AgentConfig", - "description": "The configuration for the agent candidate." - } - }, - "additionalProperties": false, - "required": [ - "type", - "config" - ], - "title": "AgentCandidate", - "description": "An agent candidate for evaluation." 
- }, - "AggregationFunctionType": { - "type": "string", - "enum": [ - "average", - "median", - "categorical_count", - "accuracy" - ], - "title": "AggregationFunctionType" - }, - "BasicScoringFnParams": { - "type": "object", - "properties": { - "type": { - "type": "string", - "const": "basic", - "default": "basic" - }, - "aggregation_functions": { - "type": "array", - "items": { - "$ref": "#/components/schemas/AggregationFunctionType" - } - } - }, - "additionalProperties": false, - "required": [ - "type" - ], - "title": "BasicScoringFnParams" - }, - "BenchmarkConfig": { - "type": "object", - "properties": { - "eval_candidate": { - "$ref": "#/components/schemas/EvalCandidate", - "description": "The candidate to evaluate." - }, - "scoring_params": { - "type": "object", - "additionalProperties": { - "$ref": "#/components/schemas/ScoringFnParams" - }, - "description": "Map between scoring function id and parameters for each scoring function you want to run" - }, - "num_examples": { - "type": "integer", - "description": "(Optional) The number of examples to evaluate. If not provided, all examples in the dataset will be evaluated" - } - }, - "additionalProperties": false, - "required": [ - "eval_candidate", - "scoring_params" - ], - "title": "BenchmarkConfig", - "description": "A benchmark configuration for evaluation." 
- }, - "EvalCandidate": { - "oneOf": [ - { - "$ref": "#/components/schemas/ModelCandidate" - }, - { - "$ref": "#/components/schemas/AgentCandidate" - } - ], - "discriminator": { - "propertyName": "type", - "mapping": { - "model": "#/components/schemas/ModelCandidate", - "agent": "#/components/schemas/AgentCandidate" - } - } - }, - "LLMAsJudgeScoringFnParams": { - "type": "object", - "properties": { - "type": { - "type": "string", - "const": "llm_as_judge", - "default": "llm_as_judge" - }, - "judge_model": { - "type": "string" - }, - "prompt_template": { - "type": "string" - }, - "judge_score_regexes": { - "type": "array", - "items": { - "type": "string" - } - }, - "aggregation_functions": { - "type": "array", - "items": { - "$ref": "#/components/schemas/AggregationFunctionType" - } - } - }, - "additionalProperties": false, - "required": [ - "type", - "judge_model" - ], - "title": "LLMAsJudgeScoringFnParams" - }, - "ModelCandidate": { - "type": "object", - "properties": { - "type": { - "type": "string", - "const": "model", - "default": "model" - }, - "model": { - "type": "string", - "description": "The model ID to evaluate." - }, - "sampling_params": { - "$ref": "#/components/schemas/SamplingParams", - "description": "The sampling parameters for the model." - }, - "system_message": { - "$ref": "#/components/schemas/SystemMessage", - "description": "(Optional) The system message providing instructions or context to the model." - } - }, - "additionalProperties": false, - "required": [ - "type", - "model", - "sampling_params" - ], - "title": "ModelCandidate", - "description": "A model candidate for evaluation." 
- }, - "RegexParserScoringFnParams": { - "type": "object", - "properties": { - "type": { - "type": "string", - "const": "regex_parser", - "default": "regex_parser" - }, - "parsing_regexes": { - "type": "array", - "items": { - "type": "string" - } - }, - "aggregation_functions": { - "type": "array", - "items": { - "$ref": "#/components/schemas/AggregationFunctionType" - } - } - }, - "additionalProperties": false, - "required": [ - "type" - ], - "title": "RegexParserScoringFnParams" - }, - "ScoringFnParams": { - "oneOf": [ - { - "$ref": "#/components/schemas/LLMAsJudgeScoringFnParams" - }, - { - "$ref": "#/components/schemas/RegexParserScoringFnParams" - }, - { - "$ref": "#/components/schemas/BasicScoringFnParams" - } - ], - "discriminator": { - "propertyName": "type", - "mapping": { - "llm_as_judge": "#/components/schemas/LLMAsJudgeScoringFnParams", - "regex_parser": "#/components/schemas/RegexParserScoringFnParams", - "basic": "#/components/schemas/BasicScoringFnParams" - } - } - }, - "EvaluateRowsRequest": { - "type": "object", - "properties": { - "input_rows": { - "type": "array", - "items": { - "type": "object", - "additionalProperties": { - "oneOf": [ - { - "type": "null" - }, - { - "type": "boolean" - }, - { - "type": "number" - }, - { - "type": "string" - }, - { - "type": "array" - }, - { - "type": "object" - } - ] - } - }, - "description": "The rows to evaluate." - }, - "scoring_functions": { - "type": "array", - "items": { - "type": "string" - }, - "description": "The scoring functions to use for the evaluation." - }, - "benchmark_config": { - "$ref": "#/components/schemas/BenchmarkConfig", - "description": "The configuration for the benchmark." 
- } - }, - "additionalProperties": false, - "required": [ - "input_rows", - "scoring_functions", - "benchmark_config" - ], - "title": "EvaluateRowsRequest" - }, - "EvaluateResponse": { - "type": "object", - "properties": { - "generations": { - "type": "array", - "items": { - "type": "object", - "additionalProperties": { - "oneOf": [ - { - "type": "null" - }, - { - "type": "boolean" - }, - { - "type": "number" - }, - { - "type": "string" - }, - { - "type": "array" - }, - { - "type": "object" - } - ] - } - }, - "description": "The generations from the evaluation." - }, - "scores": { - "type": "object", - "additionalProperties": { - "$ref": "#/components/schemas/ScoringResult" - }, - "description": "The scores from the evaluation." - } - }, - "additionalProperties": false, - "required": [ - "generations", - "scores" - ], - "title": "EvaluateResponse", - "description": "The response from an evaluation." - }, - "ScoringResult": { - "type": "object", - "properties": { - "score_rows": { - "type": "array", - "items": { - "type": "object", - "additionalProperties": { - "oneOf": [ - { - "type": "null" - }, - { - "type": "boolean" - }, - { - "type": "number" - }, - { - "type": "string" - }, - { - "type": "array" - }, - { - "type": "object" - } - ] - } - }, - "description": "The scoring result for each row. Each row is a map of column name to value." - }, - "aggregated_results": { - "type": "object", - "additionalProperties": { - "oneOf": [ - { - "type": "null" - }, - { - "type": "boolean" - }, - { - "type": "number" - }, - { - "type": "string" - }, - { - "type": "array" - }, - { - "type": "object" - } - ] - }, - "description": "Map of metric name to aggregated value" - } - }, - "additionalProperties": false, - "required": [ - "score_rows", - "aggregated_results" - ], - "title": "ScoringResult", - "description": "A scoring result for a single row." 
- }, "Agent": { "type": "object", "properties": { @@ -6783,13 +6303,15 @@ "default": "benchmark" }, "dataset_id": { - "type": "string" + "type": "string", + "description": "The ID of the dataset to used to run the benchmark." }, - "scoring_functions": { + "grader_ids": { "type": "array", "items": { "type": "string" - } + }, + "description": "The grader ids to use for this benchmark." }, "metadata": { "type": "object", @@ -6814,7 +6336,8 @@ "type": "object" } ] - } + }, + "description": "Metadata for this benchmark for additional descriptions." } }, "additionalProperties": false, @@ -6824,7 +6347,7 @@ "provider_id", "type", "dataset_id", - "scoring_functions", + "grader_ids", "metadata" ], "title": "Benchmark" @@ -6981,6 +6504,361 @@ "title": "URIDataSource", "description": "A dataset that can be obtained from a URI." }, + "EqualityGrader": { + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "equality", + "default": "equality" + }, + "equality": { + "type": "object", + "properties": { + "aggregation_functions": { + "type": "array", + "items": { + "type": "string", + "enum": [ + "average", + "median", + "categorical_count", + "accuracy" + ], + "title": "AggregationFunctionType", + "description": "A type of aggregation function." + } + } + }, + "additionalProperties": false, + "required": [ + "aggregation_functions" + ], + "title": "BasicGraderParams" + } + }, + "additionalProperties": false, + "required": [ + "type", + "equality" + ], + "title": "EqualityGrader" + }, + "FactualityGrader": { + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "factuality", + "default": "factuality" + }, + "factuality": { + "type": "object", + "properties": { + "aggregation_functions": { + "type": "array", + "items": { + "type": "string", + "enum": [ + "average", + "median", + "categorical_count", + "accuracy" + ], + "title": "AggregationFunctionType", + "description": "A type of aggregation function." 
+ } + } + }, + "additionalProperties": false, + "required": [ + "aggregation_functions" + ], + "title": "BasicGraderParams" + } + }, + "additionalProperties": false, + "required": [ + "type", + "factuality" + ], + "title": "FactualityGrader" + }, + "FaithfulnessGrader": { + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "faithfulness", + "default": "faithfulness" + }, + "faithfulness": { + "type": "object", + "properties": { + "aggregation_functions": { + "type": "array", + "items": { + "type": "string", + "enum": [ + "average", + "median", + "categorical_count", + "accuracy" + ], + "title": "AggregationFunctionType", + "description": "A type of aggregation function." + } + } + }, + "additionalProperties": false, + "required": [ + "aggregation_functions" + ], + "title": "BasicGraderParams" + } + }, + "additionalProperties": false, + "required": [ + "type", + "faithfulness" + ], + "title": "FaithfulnessGrader" + }, + "Grader": { + "type": "object", + "properties": { + "identifier": { + "type": "string" + }, + "provider_resource_id": { + "type": "string" + }, + "provider_id": { + "type": "string" + }, + "type": { + "type": "string", + "const": "grader", + "default": "grader" + }, + "grader": { + "$ref": "#/components/schemas/GraderDefinition" + }, + "description": { + "type": "string" + }, + "metadata": { + "type": "object", + "additionalProperties": { + "oneOf": [ + { + "type": "null" + }, + { + "type": "boolean" + }, + { + "type": "number" + }, + { + "type": "string" + }, + { + "type": "array" + }, + { + "type": "object" + } + ] + } + } + }, + "additionalProperties": false, + "required": [ + "identifier", + "provider_resource_id", + "provider_id", + "type", + "grader", + "metadata" + ], + "title": "Grader" + }, + "GraderDefinition": { + "oneOf": [ + { + "$ref": "#/components/schemas/LlmGrader" + }, + { + "$ref": "#/components/schemas/RegexParserGrader" + }, + { + "$ref": "#/components/schemas/EqualityGrader" + }, + { + "$ref": 
"#/components/schemas/SubsetOfGrader" + }, + { + "$ref": "#/components/schemas/FactualityGrader" + }, + { + "$ref": "#/components/schemas/FaithfulnessGrader" + } + ], + "discriminator": { + "propertyName": "type", + "mapping": { + "llm": "#/components/schemas/LlmGrader", + "regex_parser": "#/components/schemas/RegexParserGrader", + "equality": "#/components/schemas/EqualityGrader", + "subset_of": "#/components/schemas/SubsetOfGrader", + "factuality": "#/components/schemas/FactualityGrader", + "faithfulness": "#/components/schemas/FaithfulnessGrader" + } + } + }, + "LlmGrader": { + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "llm", + "default": "llm" + }, + "llm": { + "type": "object", + "properties": { + "model": { + "type": "string" + }, + "prompt": { + "type": "string" + }, + "score_regexes": { + "type": "array", + "items": { + "type": "string" + } + }, + "aggregation_functions": { + "type": "array", + "items": { + "type": "string", + "enum": [ + "average", + "median", + "categorical_count", + "accuracy" + ], + "title": "AggregationFunctionType", + "description": "A type of aggregation function." + } + } + }, + "additionalProperties": false, + "required": [ + "model", + "prompt", + "score_regexes", + "aggregation_functions" + ], + "title": "LlmGraderParams" + } + }, + "additionalProperties": false, + "required": [ + "type", + "llm" + ], + "title": "LlmGrader" + }, + "RegexParserGrader": { + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "regex_parser", + "default": "regex_parser" + }, + "regex_parser": { + "type": "object", + "properties": { + "parsing_regexes": { + "type": "array", + "items": { + "type": "string" + } + }, + "aggregation_functions": { + "type": "array", + "items": { + "type": "string", + "enum": [ + "average", + "median", + "categorical_count", + "accuracy" + ], + "title": "AggregationFunctionType", + "description": "A type of aggregation function." 
+ } + } + }, + "additionalProperties": false, + "required": [ + "parsing_regexes", + "aggregation_functions" + ], + "title": "RegexParserGraderParams" + } + }, + "additionalProperties": false, + "required": [ + "type", + "regex_parser" + ], + "title": "RegexParserGrader" + }, + "SubsetOfGrader": { + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "subset_of", + "default": "subset_of" + }, + "subset_of": { + "type": "object", + "properties": { + "aggregation_functions": { + "type": "array", + "items": { + "type": "string", + "enum": [ + "average", + "median", + "categorical_count", + "accuracy" + ], + "title": "AggregationFunctionType", + "description": "A type of aggregation function." + } + } + }, + "additionalProperties": false, + "required": [ + "aggregation_functions" + ], + "title": "BasicGraderParams" + } + }, + "additionalProperties": false, + "required": [ + "type", + "subset_of" + ], + "title": "SubsetOfGrader" + }, "Model": { "type": "object", "properties": { @@ -7047,268 +6925,6 @@ ], "title": "ModelType" }, - "AgentTurnInputType": { - "type": "object", - "properties": { - "type": { - "type": "string", - "const": "agent_turn_input", - "default": "agent_turn_input" - } - }, - "additionalProperties": false, - "required": [ - "type" - ], - "title": "AgentTurnInputType" - }, - "ArrayType": { - "type": "object", - "properties": { - "type": { - "type": "string", - "const": "array", - "default": "array" - } - }, - "additionalProperties": false, - "required": [ - "type" - ], - "title": "ArrayType" - }, - "BooleanType": { - "type": "object", - "properties": { - "type": { - "type": "string", - "const": "boolean", - "default": "boolean" - } - }, - "additionalProperties": false, - "required": [ - "type" - ], - "title": "BooleanType" - }, - "ChatCompletionInputType": { - "type": "object", - "properties": { - "type": { - "type": "string", - "const": "chat_completion_input", - "default": "chat_completion_input" - } - }, - 
"additionalProperties": false, - "required": [ - "type" - ], - "title": "ChatCompletionInputType" - }, - "CompletionInputType": { - "type": "object", - "properties": { - "type": { - "type": "string", - "const": "completion_input", - "default": "completion_input" - } - }, - "additionalProperties": false, - "required": [ - "type" - ], - "title": "CompletionInputType" - }, - "JsonType": { - "type": "object", - "properties": { - "type": { - "type": "string", - "const": "json", - "default": "json" - } - }, - "additionalProperties": false, - "required": [ - "type" - ], - "title": "JsonType" - }, - "NumberType": { - "type": "object", - "properties": { - "type": { - "type": "string", - "const": "number", - "default": "number" - } - }, - "additionalProperties": false, - "required": [ - "type" - ], - "title": "NumberType" - }, - "ObjectType": { - "type": "object", - "properties": { - "type": { - "type": "string", - "const": "object", - "default": "object" - } - }, - "additionalProperties": false, - "required": [ - "type" - ], - "title": "ObjectType" - }, - "ParamType": { - "oneOf": [ - { - "$ref": "#/components/schemas/StringType" - }, - { - "$ref": "#/components/schemas/NumberType" - }, - { - "$ref": "#/components/schemas/BooleanType" - }, - { - "$ref": "#/components/schemas/ArrayType" - }, - { - "$ref": "#/components/schemas/ObjectType" - }, - { - "$ref": "#/components/schemas/JsonType" - }, - { - "$ref": "#/components/schemas/UnionType" - }, - { - "$ref": "#/components/schemas/ChatCompletionInputType" - }, - { - "$ref": "#/components/schemas/CompletionInputType" - }, - { - "$ref": "#/components/schemas/AgentTurnInputType" - } - ], - "discriminator": { - "propertyName": "type", - "mapping": { - "string": "#/components/schemas/StringType", - "number": "#/components/schemas/NumberType", - "boolean": "#/components/schemas/BooleanType", - "array": "#/components/schemas/ArrayType", - "object": "#/components/schemas/ObjectType", - "json": "#/components/schemas/JsonType", - 
"union": "#/components/schemas/UnionType", - "chat_completion_input": "#/components/schemas/ChatCompletionInputType", - "completion_input": "#/components/schemas/CompletionInputType", - "agent_turn_input": "#/components/schemas/AgentTurnInputType" - } - } - }, - "ScoringFn": { - "type": "object", - "properties": { - "identifier": { - "type": "string" - }, - "provider_resource_id": { - "type": "string" - }, - "provider_id": { - "type": "string" - }, - "type": { - "type": "string", - "const": "scoring_function", - "default": "scoring_function" - }, - "description": { - "type": "string" - }, - "metadata": { - "type": "object", - "additionalProperties": { - "oneOf": [ - { - "type": "null" - }, - { - "type": "boolean" - }, - { - "type": "number" - }, - { - "type": "string" - }, - { - "type": "array" - }, - { - "type": "object" - } - ] - } - }, - "return_type": { - "$ref": "#/components/schemas/ParamType" - }, - "params": { - "$ref": "#/components/schemas/ScoringFnParams" - } - }, - "additionalProperties": false, - "required": [ - "identifier", - "provider_resource_id", - "provider_id", - "type", - "metadata", - "return_type" - ], - "title": "ScoringFn" - }, - "StringType": { - "type": "object", - "properties": { - "type": { - "type": "string", - "const": "string", - "default": "string" - } - }, - "additionalProperties": false, - "required": [ - "type" - ], - "title": "StringType" - }, - "UnionType": { - "type": "object", - "properties": { - "type": { - "type": "string", - "const": "union", - "default": "union" - } - }, - "additionalProperties": false, - "required": [ - "type" - ], - "title": "UnionType" - }, "Shield": { "type": "object", "properties": { @@ -7707,16 +7323,6 @@ "title": "PostTrainingJobArtifactsResponse", "description": "Artifacts of a finetuning job." 
}, - "JobStatus": { - "type": "string", - "enum": [ - "completed", - "in_progress", - "failed", - "scheduled" - ], - "title": "JobStatus" - }, "PostTrainingJobStatusResponse": { "type": "object", "properties": { @@ -7724,7 +7330,15 @@ "type": "string" }, "status": { - "$ref": "#/components/schemas/JobStatus" + "type": "string", + "enum": [ + "completed", + "in_progress", + "failed", + "scheduled", + "cancelled" + ], + "title": "JobStatus" }, "scheduled_at": { "type": "string", @@ -7840,6 +7454,363 @@ ], "title": "VectorDB" }, + "BenchmarkTask": { + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "benchmark_id", + "default": "benchmark_id" + }, + "benchmark_id": { + "type": "string" + } + }, + "additionalProperties": false, + "required": [ + "type", + "benchmark_id" + ], + "title": "BenchmarkTask" + }, + "DataSourceGraderTask": { + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "data_source_grader", + "default": "data_source_grader" + }, + "data_source": { + "$ref": "#/components/schemas/DataSource" + }, + "grader_ids": { + "type": "array", + "items": { + "type": "string" + } + } + }, + "additionalProperties": false, + "required": [ + "type", + "data_source", + "grader_ids" + ], + "title": "DataSourceGraderTask" + }, + "DatasetGraderTask": { + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "dataset_grader", + "default": "dataset_grader" + }, + "dataset_id": { + "type": "string" + }, + "grader_ids": { + "type": "array", + "items": { + "type": "string" + } + } + }, + "additionalProperties": false, + "required": [ + "type", + "dataset_id", + "grader_ids" + ], + "title": "DatasetGraderTask" + }, + "EvaluationTask": { + "oneOf": [ + { + "$ref": "#/components/schemas/BenchmarkTask" + }, + { + "$ref": "#/components/schemas/DatasetGraderTask" + }, + { + "$ref": "#/components/schemas/DataSourceGraderTask" + } + ], + "discriminator": { + "propertyName": "type", + "mapping": { + 
"benchmark_id": "#/components/schemas/BenchmarkTask", + "dataset_grader": "#/components/schemas/DatasetGraderTask", + "data_source_grader": "#/components/schemas/DataSourceGraderTask" + } + } + }, + "GradeRequest": { + "type": "object", + "properties": { + "task": { + "$ref": "#/components/schemas/EvaluationTask", + "description": "The task to evaluate. One of: - BenchmarkTask: Run evaluation task against a benchmark_id - DatasetGraderTask: Run evaluation task against a dataset_id and a list of grader_ids - DataSourceGraderTask: Run evaluation task against a data source (e.g. rows, uri, etc.) and a list of grader_ids" + } + }, + "additionalProperties": false, + "required": [ + "task" + ], + "title": "GradeRequest" + }, + "AgentCandidate": { + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "agent", + "default": "agent" + }, + "config": { + "$ref": "#/components/schemas/AgentConfig", + "description": "The configuration for the agent candidate." + } + }, + "additionalProperties": false, + "required": [ + "type", + "config" + ], + "title": "AgentCandidate", + "description": "An agent candidate for evaluation." + }, + "EvaluationCandidate": { + "oneOf": [ + { + "$ref": "#/components/schemas/ModelCandidate" + }, + { + "$ref": "#/components/schemas/AgentCandidate" + } + ], + "discriminator": { + "propertyName": "type", + "mapping": { + "model": "#/components/schemas/ModelCandidate", + "agent": "#/components/schemas/AgentCandidate" + } + } + }, + "EvaluationJob": { + "type": "object", + "properties": { + "id": { + "type": "string", + "description": "The ID of the job." + }, + "status": { + "type": "string", + "enum": [ + "completed", + "in_progress", + "failed", + "scheduled", + "cancelled" + ], + "description": "The status of the job." + }, + "created_at": { + "type": "string", + "format": "date-time", + "description": "The time the job was created." 
+ }, + "ended_at": { + "type": "string", + "format": "date-time", + "description": "The time the job ended." + }, + "error": { + "type": "string", + "description": "If status of the job is failed, this will contain the error message." + }, + "type": { + "type": "string", + "const": "evaluation", + "default": "evaluation" + }, + "task": { + "$ref": "#/components/schemas/EvaluationTask" + }, + "candidate": { + "$ref": "#/components/schemas/EvaluationCandidate" + } + }, + "additionalProperties": false, + "required": [ + "id", + "status", + "created_at", + "type", + "task", + "candidate" + ], + "title": "EvaluationJob" + }, + "ModelCandidate": { + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "model", + "default": "model" + }, + "model_id": { + "type": "string" + }, + "sampling_params": { + "$ref": "#/components/schemas/SamplingParams", + "description": "The sampling parameters for the model." + }, + "system_message": { + "$ref": "#/components/schemas/SystemMessage", + "description": "(Optional) The system message providing instructions or context to the model." + } + }, + "additionalProperties": false, + "required": [ + "type", + "model_id", + "sampling_params" + ], + "title": "ModelCandidate", + "description": "A model candidate for evaluation." + }, + "GradeInlineRequest": { + "type": "object", + "properties": { + "task": { + "$ref": "#/components/schemas/EvaluationTask", + "description": "The task to evaluate. One of: - BenchmarkTask: Run evaluation task against a benchmark_id - DatasetGraderTask: Run evaluation task against a dataset_id and a list of grader_ids - DataSourceGraderTask: Run evaluation task against a data source (e.g. rows, uri, etc.) 
and a list of grader_ids" + } + }, + "additionalProperties": false, + "required": [ + "task" + ], + "title": "GradeInlineRequest" + }, + "EvaluationResponse": { + "type": "object", + "properties": { + "generations": { + "type": "array", + "items": { + "type": "object", + "additionalProperties": { + "oneOf": [ + { + "type": "null" + }, + { + "type": "boolean" + }, + { + "type": "number" + }, + { + "type": "string" + }, + { + "type": "array" + }, + { + "type": "object" + } + ] + } + }, + "description": "The generations in rows for the evaluation." + }, + "scores": { + "type": "object", + "additionalProperties": { + "$ref": "#/components/schemas/ScoringResult" + }, + "description": "The scores for the evaluation. Map of grader id to ScoringResult." + } + }, + "additionalProperties": false, + "required": [ + "generations", + "scores" + ], + "title": "EvaluationResponse", + "description": "A response to an inline evaluation." + }, + "ScoringResult": { + "type": "object", + "properties": { + "scores": { + "type": "array", + "items": { + "type": "object", + "additionalProperties": { + "oneOf": [ + { + "type": "null" + }, + { + "type": "boolean" + }, + { + "type": "number" + }, + { + "type": "string" + }, + { + "type": "array" + }, + { + "type": "object" + } + ] + } + }, + "description": "The scoring result for each row. Each row is a map of grader column name to value." + }, + "metrics": { + "type": "object", + "additionalProperties": { + "oneOf": [ + { + "type": "null" + }, + { + "type": "boolean" + }, + { + "type": "number" + }, + { + "type": "string" + }, + { + "type": "array" + }, + { + "type": "object" + } + ] + }, + "description": "Map of metric name to aggregated value." + } + }, + "additionalProperties": false, + "required": [ + "scores", + "metrics" + ], + "title": "ScoringResult", + "description": "A scoring result for a single row." 
+ }, "HealthInfo": { "type": "object", "properties": { @@ -8285,6 +8256,65 @@ "title": "ListFileResponse", "description": "Response representing a list of file entries." }, + "GraderTypeInfo": { + "type": "object", + "properties": { + "grader_type": { + "type": "string", + "enum": [ + "llm", + "regex_parser", + "equality", + "subset_of", + "factuality", + "faithfulness" + ], + "title": "GraderType", + "description": "A type of grader. Each type is a criteria for evaluating answers." + }, + "description": { + "type": "string", + "description": "A description of the grader type. - E.g. Write your custom judge prompt to score the answer." + }, + "supported_dataset_purposes": { + "type": "array", + "items": { + "type": "string", + "enum": [ + "post-training/messages", + "eval/question-answer", + "eval/messages-answer" + ], + "title": "DatasetPurpose", + "description": "Purpose of the dataset. Each purpose has a required input data schema." + }, + "description": "The purposes that this grader can be used for." 
+ } + }, + "additionalProperties": false, + "required": [ + "grader_type", + "description", + "supported_dataset_purposes" + ], + "title": "GraderTypeInfo" + }, + "ListGraderTypesResponse": { + "type": "object", + "properties": { + "data": { + "type": "array", + "items": { + "$ref": "#/components/schemas/GraderTypeInfo" + } + } + }, + "additionalProperties": false, + "required": [ + "data" + ], + "title": "ListGraderTypesResponse" + }, "ListModelsResponse": { "type": "object", "properties": { @@ -8357,22 +8387,6 @@ ], "title": "ListRoutesResponse" }, - "ListScoringFunctionsResponse": { - "type": "object", - "properties": { - "data": { - "type": "array", - "items": { - "$ref": "#/components/schemas/ScoringFn" - } - } - }, - "additionalProperties": false, - "required": [ - "data" - ], - "title": "ListScoringFunctionsResponse" - }, "ListShieldsResponse": { "type": "object", "properties": { @@ -9363,23 +9377,20 @@ "RegisterBenchmarkRequest": { "type": "object", "properties": { - "benchmark_id": { - "type": "string" - }, "dataset_id": { - "type": "string" + "type": "string", + "description": "The ID of the dataset used to run the benchmark." }, - "scoring_functions": { + "grader_ids": { "type": "array", "items": { "type": "string" - } + }, + "description": "List of grader ids to use for this benchmark." }, - "provider_benchmark_id": { - "type": "string" - }, - "provider_id": { - "type": "string" + "benchmark_id": { + "type": "string", + "description": "(Optional) The ID of the benchmark to register. If not provided, an ID will be generated." }, "metadata": { "type": "object", @@ -9404,14 +9415,14 @@ "type": "object" } ] - } + }, + "description": "(Optional) Metadata for this benchmark for additional descriptions."
} }, "additionalProperties": false, "required": [ - "benchmark_id", "dataset_id", - "scoring_functions" + "grader_ids" ], "title": "RegisterBenchmarkRequest" }, @@ -9469,6 +9480,50 @@ ], "title": "RegisterDatasetRequest" }, + "RegisterGraderRequest": { + "type": "object", + "properties": { + "grader": { + "$ref": "#/components/schemas/GraderDefinition", + "description": "The grader definition, E.g. - { \"type\": \"llm\", \"llm\": { \"model\": \"llama-405b\", \"prompt\": \"You are a judge. Score the answer based on the question. {question} {answer}\", } }" + }, + "grader_id": { + "type": "string", + "description": "(Optional) The ID of the grader. If not provided, a random ID will be generated." + }, + "metadata": { + "type": "object", + "additionalProperties": { + "oneOf": [ + { + "type": "null" + }, + { + "type": "boolean" + }, + { + "type": "number" + }, + { + "type": "string" + }, + { + "type": "array" + }, + { + "type": "object" + } + ] + }, + "description": "(Optional) Any additional metadata for this grader. - E.g. 
{ \"description\": \"A grader that scores the answer based on the question.\", }" + } + }, + "additionalProperties": false, + "required": [ + "grader" + ], + "title": "RegisterGraderRequest" + }, "RegisterModelRequest": { "type": "object", "properties": { @@ -9516,36 +9571,6 @@ ], "title": "RegisterModelRequest" }, - "RegisterScoringFunctionRequest": { - "type": "object", - "properties": { - "scoring_fn_id": { - "type": "string" - }, - "description": { - "type": "string" - }, - "return_type": { - "$ref": "#/components/schemas/ParamType" - }, - "provider_scoring_fn_id": { - "type": "string" - }, - "provider_id": { - "type": "string" - }, - "params": { - "$ref": "#/components/schemas/ScoringFnParams" - } - }, - "additionalProperties": false, - "required": [ - "scoring_fn_id", - "description", - "return_type" - ], - "title": "RegisterScoringFunctionRequest" - }, "RegisterShieldRequest": { "type": "object", "properties": { @@ -9682,32 +9707,43 @@ ], "title": "ResumeAgentTurnRequest" }, - "RunEvalRequest": { + "RunRequest": { "type": "object", "properties": { - "benchmark_config": { - "$ref": "#/components/schemas/BenchmarkConfig", - "description": "The configuration for the benchmark." + "task": { + "$ref": "#/components/schemas/EvaluationTask", + "description": "The task to evaluate. One of: - BenchmarkTask: Run evaluation task against a benchmark_id - DatasetGraderTask: Run evaluation task against a dataset_id and a list of grader_ids - DataSourceGraderTask: Run evaluation task against a data source (e.g. rows, uri, etc.) and a list of grader_ids" + }, + "candidate": { + "$ref": "#/components/schemas/EvaluationCandidate", + "description": "The candidate to evaluate." 
} }, "additionalProperties": false, "required": [ - "benchmark_config" + "task", + "candidate" ], - "title": "RunEvalRequest" + "title": "RunRequest" }, - "Job": { + "RunInlineRequest": { "type": "object", "properties": { - "job_id": { - "type": "string" + "task": { + "$ref": "#/components/schemas/EvaluationTask", + "description": "The task to evaluate. One of: - BenchmarkTask: Run evaluation task against a benchmark_id - DatasetGraderTask: Run evaluation task against a dataset_id and a list of grader_ids - DataSourceGraderTask: Run evaluation task against a data source (e.g. rows, uri, etc.) and a list of grader_ids" + }, + "candidate": { + "$ref": "#/components/schemas/EvaluationCandidate", + "description": "The candidate to evaluate." } }, "additionalProperties": false, "required": [ - "job_id" + "task", + "candidate" ], - "title": "Job" + "title": "RunInlineRequest" }, "RunShieldRequest": { "type": "object", @@ -9795,128 +9831,6 @@ ], "title": "SaveSpansToDatasetRequest" }, - "ScoreRequest": { - "type": "object", - "properties": { - "input_rows": { - "type": "array", - "items": { - "type": "object", - "additionalProperties": { - "oneOf": [ - { - "type": "null" - }, - { - "type": "boolean" - }, - { - "type": "number" - }, - { - "type": "string" - }, - { - "type": "array" - }, - { - "type": "object" - } - ] - } - }, - "description": "The rows to score." - }, - "scoring_functions": { - "type": "object", - "additionalProperties": { - "oneOf": [ - { - "$ref": "#/components/schemas/ScoringFnParams" - }, - { - "type": "null" - } - ] - }, - "description": "The scoring functions to use for the scoring." - } - }, - "additionalProperties": false, - "required": [ - "input_rows", - "scoring_functions" - ], - "title": "ScoreRequest" - }, - "ScoreResponse": { - "type": "object", - "properties": { - "results": { - "type": "object", - "additionalProperties": { - "$ref": "#/components/schemas/ScoringResult" - }, - "description": "A map of scoring function name to ScoringResult." 
- } - }, - "additionalProperties": false, - "required": [ - "results" - ], - "title": "ScoreResponse", - "description": "The response from scoring." - }, - "ScoreBatchRequest": { - "type": "object", - "properties": { - "dataset_id": { - "type": "string" - }, - "scoring_functions": { - "type": "object", - "additionalProperties": { - "oneOf": [ - { - "$ref": "#/components/schemas/ScoringFnParams" - }, - { - "type": "null" - } - ] - } - }, - "save_results_dataset": { - "type": "boolean" - } - }, - "additionalProperties": false, - "required": [ - "dataset_id", - "scoring_functions", - "save_results_dataset" - ], - "title": "ScoreBatchRequest" - }, - "ScoreBatchResponse": { - "type": "object", - "properties": { - "dataset_id": { - "type": "string" - }, - "results": { - "type": "object", - "additionalProperties": { - "$ref": "#/components/schemas/ScoringResult" - } - } - }, - "additionalProperties": false, - "required": [ - "results" - ], - "title": "ScoreBatchResponse" - }, "AlgorithmConfig": { "oneOf": [ { @@ -10280,12 +10194,14 @@ "name": "Datasets" }, { - "name": "Eval", - "x-displayName": "Llama Stack Evaluation API for running evaluations on model and agent candidates." + "name": "Evaluation" }, { "name": "Files" }, + { + "name": "Graders" + }, { "name": "Inference", "description": "This API provides the raw interface to the underlying models. 
Two kinds of models are supported:\n- LLM models: these models generate \"raw\" and \"chat\" (conversational) completions.\n- Embedding models: these models generate embeddings to be used for semantic search.", @@ -10307,12 +10223,6 @@ { "name": "Safety" }, - { - "name": "Scoring" - }, - { - "name": "ScoringFunctions" - }, { "name": "Shields" }, @@ -10344,16 +10254,15 @@ "Benchmarks", "DatasetIO", "Datasets", - "Eval", + "Evaluation", "Files", + "Graders", "Inference", "Inspect", "Models", "PostTraining (Coming Soon)", "Providers", "Safety", - "Scoring", - "ScoringFunctions", "Shields", "SyntheticDataGeneration (Coming Soon)", "Telemetry", diff --git a/docs/_static/llama-stack-spec.yaml b/docs/_static/llama-stack-spec.yaml index a3d4dbcc9..db92e7e6a 100644 --- a/docs/_static/llama-stack-spec.yaml +++ b/docs/_static/llama-stack-spec.yaml @@ -589,6 +589,59 @@ paths: required: true schema: type: string + /v1/graders/{grader_id}: + get: + responses: + '200': + description: The grader. + content: + application/json: + schema: + $ref: '#/components/schemas/Grader' + '400': + $ref: '#/components/responses/BadRequest400' + '429': + $ref: >- + #/components/responses/TooManyRequests429 + '500': + $ref: >- + #/components/responses/InternalServerError500 + default: + $ref: '#/components/responses/DefaultError' + tags: + - Graders + description: Get a grader by ID. + parameters: + - name: grader_id + in: path + description: The ID of the grader. + required: true + schema: + type: string + delete: + responses: + '200': + description: OK + '400': + $ref: '#/components/responses/BadRequest400' + '429': + $ref: >- + #/components/responses/TooManyRequests429 + '500': + $ref: >- + #/components/responses/InternalServerError500 + default: + $ref: '#/components/responses/DefaultError' + tags: + - Graders + description: Delete a grader by ID. + parameters: + - name: grader_id + in: path + description: The ID of the grader. 
+ required: true + schema: + type: string /v1/inference/embeddings: post: responses: @@ -622,43 +675,6 @@ paths: schema: $ref: '#/components/schemas/EmbeddingsRequest' required: true - /v1/eval/benchmarks/{benchmark_id}/evaluations: - post: - responses: - '200': - description: >- - EvaluateResponse object containing generations and scores - content: - application/json: - schema: - $ref: '#/components/schemas/EvaluateResponse' - '400': - $ref: '#/components/responses/BadRequest400' - '429': - $ref: >- - #/components/responses/TooManyRequests429 - '500': - $ref: >- - #/components/responses/InternalServerError500 - default: - $ref: '#/components/responses/DefaultError' - tags: - - Eval - description: Evaluate a list of rows on a benchmark. - parameters: - - name: benchmark_id - in: path - description: >- - The ID of the benchmark to run the evaluation on. - required: true - schema: - type: string - requestBody: - content: - application/json: - schema: - $ref: '#/components/schemas/EvaluateRowsRequest' - required: true /v1/agents/{agent_id}/session/{session_id}/turn/{turn_id}/step/{step_id}: get: responses: @@ -757,9 +773,7 @@ paths: content: application/json: schema: - oneOf: - - $ref: '#/components/schemas/Benchmark' - - type: 'null' + $ref: '#/components/schemas/Benchmark' '400': $ref: '#/components/responses/BadRequest400' '429': @@ -772,10 +786,11 @@ paths: $ref: '#/components/responses/DefaultError' tags: - Benchmarks - description: '' + description: Get a benchmark by ID. parameters: - name: benchmark_id in: path + description: The ID of the benchmark to get. 
required: true schema: type: string @@ -885,36 +900,6 @@ paths: required: true schema: type: string - /v1/scoring-functions/{scoring_fn_id}: - get: - responses: - '200': - description: OK - content: - application/json: - schema: - oneOf: - - $ref: '#/components/schemas/ScoringFn' - - type: 'null' - '400': - $ref: '#/components/responses/BadRequest400' - '429': - $ref: >- - #/components/responses/TooManyRequests429 - '500': - $ref: >- - #/components/responses/InternalServerError500 - default: - $ref: '#/components/responses/DefaultError' - tags: - - ScoringFunctions - description: '' - parameters: - - name: scoring_fn_id - in: path - required: true - schema: - type: string /v1/shields/{identifier}: get: responses: @@ -1326,6 +1311,70 @@ paths: required: true schema: type: string + /v1/evaluation/grade: + post: + responses: + '200': + description: >- + The evaluation job containing grader scores. + content: + application/json: + schema: + $ref: '#/components/schemas/EvaluationJob' + '400': + $ref: '#/components/responses/BadRequest400' + '429': + $ref: >- + #/components/responses/TooManyRequests429 + '500': + $ref: >- + #/components/responses/InternalServerError500 + default: + $ref: '#/components/responses/DefaultError' + tags: + - Evaluation + description: >- + Run a grading job with generated results. Use this when you have generated + results from inference in a dataset. + parameters: [] + requestBody: + content: + application/json: + schema: + $ref: '#/components/schemas/GradeRequest' + required: true + /v1/evaluation/grade_inline: + post: + responses: + '200': + description: >- + The evaluation job containing grader scores. "generations" is not populated + in the response.
+ content: + application/json: + schema: + $ref: '#/components/schemas/EvaluationResponse' + '400': + $ref: '#/components/responses/BadRequest400' + '429': + $ref: >- + #/components/responses/TooManyRequests429 + '500': + $ref: >- + #/components/responses/InternalServerError500 + default: + $ref: '#/components/responses/DefaultError' + tags: + - Evaluation + description: >- + Run a grading job with generated results inline. + parameters: [] + requestBody: + content: + application/json: + schema: + $ref: '#/components/schemas/GradeInlineRequest' + required: true /v1/health: get: responses: @@ -1501,111 +1550,6 @@ paths: required: false schema: type: integer - /v1/eval/benchmarks/{benchmark_id}/jobs/{job_id}: - get: - responses: - '200': - description: The status of the evaluationjob. - content: - application/json: - schema: - oneOf: - - $ref: '#/components/schemas/JobStatus' - - type: 'null' - '400': - $ref: '#/components/responses/BadRequest400' - '429': - $ref: >- - #/components/responses/TooManyRequests429 - '500': - $ref: >- - #/components/responses/InternalServerError500 - default: - $ref: '#/components/responses/DefaultError' - tags: - - Eval - description: Get the status of a job. - parameters: - - name: benchmark_id - in: path - description: >- - The ID of the benchmark to run the evaluation on. - required: true - schema: - type: string - - name: job_id - in: path - description: The ID of the job to get the status of. - required: true - schema: - type: string - delete: - responses: - '200': - description: OK - '400': - $ref: '#/components/responses/BadRequest400' - '429': - $ref: >- - #/components/responses/TooManyRequests429 - '500': - $ref: >- - #/components/responses/InternalServerError500 - default: - $ref: '#/components/responses/DefaultError' - tags: - - Eval - description: Cancel a job. - parameters: - - name: benchmark_id - in: path - description: >- - The ID of the benchmark to run the evaluation on.
- required: true - schema: - type: string - - name: job_id - in: path - description: The ID of the job to cancel. - required: true - schema: - type: string - /v1/eval/benchmarks/{benchmark_id}/jobs/{job_id}/result: - get: - responses: - '200': - description: The result of the job. - content: - application/json: - schema: - $ref: '#/components/schemas/EvaluateResponse' - '400': - $ref: '#/components/responses/BadRequest400' - '429': - $ref: >- - #/components/responses/TooManyRequests429 - '500': - $ref: >- - #/components/responses/InternalServerError500 - default: - $ref: '#/components/responses/DefaultError' - tags: - - Eval - description: Get the result of a job. - parameters: - - name: benchmark_id - in: path - description: >- - The ID of the benchmark to run the evaluation on. - required: true - schema: - type: string - - name: job_id - in: path - description: The ID of the job to get the result of. - required: true - schema: - type: string /v1/agents/{agent_id}/sessions: get: responses: @@ -1657,12 +1601,16 @@ paths: $ref: '#/components/responses/DefaultError' tags: - Benchmarks - description: '' + description: List all benchmarks. parameters: [] post: responses: '200': description: OK + content: + application/json: + schema: + $ref: '#/components/schemas/Benchmark' '400': $ref: '#/components/responses/BadRequest400' '429': @@ -1675,7 +1623,7 @@ paths: $ref: '#/components/responses/DefaultError' tags: - Benchmarks - description: '' + description: Register a new benchmark. parameters: [] requestBody: content: @@ -1763,6 +1711,81 @@ paths: required: true schema: type: string + /v1/graders/types: + get: + responses: + '200': + description: >- + A list of grader types and information about the types. 
+ content: + application/json: + schema: + $ref: '#/components/schemas/ListGraderTypesResponse' + '400': + $ref: '#/components/responses/BadRequest400' + '429': + $ref: >- + #/components/responses/TooManyRequests429 + '500': + $ref: >- + #/components/responses/InternalServerError500 + default: + $ref: '#/components/responses/DefaultError' + tags: + - Graders + description: List all grader types. + parameters: [] + /v1/graders: + get: + responses: + '200': + description: A list of graders. + content: + application/jsonl: + schema: + $ref: '#/components/schemas/Grader' + '400': + $ref: '#/components/responses/BadRequest400' + '429': + $ref: >- + #/components/responses/TooManyRequests429 + '500': + $ref: >- + #/components/responses/InternalServerError500 + default: + $ref: '#/components/responses/DefaultError' + tags: + - Graders + description: List all graders. + parameters: [] + post: + responses: + '200': + description: The registered grader. + content: + application/json: + schema: + $ref: '#/components/schemas/Grader' + '400': + $ref: '#/components/responses/BadRequest400' + '429': + $ref: >- + #/components/responses/TooManyRequests429 + '500': + $ref: >- + #/components/responses/InternalServerError500 + default: + $ref: '#/components/responses/DefaultError' + tags: + - Graders + description: Register a new grader. 
+ parameters: [] + requestBody: + content: + application/json: + schema: + $ref: '#/components/schemas/RegisterGraderRequest' + required: true /v1/models: get: responses: @@ -1893,53 +1916,6 @@ paths: required: false schema: $ref: '#/components/schemas/URL' - /v1/scoring-functions: - get: - responses: - '200': - description: OK - content: - application/json: - schema: - $ref: '#/components/schemas/ListScoringFunctionsResponse' - '400': - $ref: '#/components/responses/BadRequest400' - '429': - $ref: >- - #/components/responses/TooManyRequests429 - '500': - $ref: >- - #/components/responses/InternalServerError500 - default: - $ref: '#/components/responses/DefaultError' - tags: - - ScoringFunctions - description: '' - parameters: [] - post: - responses: - '200': - description: OK - '400': - $ref: '#/components/responses/BadRequest400' - '429': - $ref: >- - #/components/responses/TooManyRequests429 - '500': - $ref: >- - #/components/responses/InternalServerError500 - default: - $ref: '#/components/responses/DefaultError' - tags: - - ScoringFunctions - description: '' - parameters: [] - requestBody: - content: - application/json: - schema: - $ref: '#/components/schemas/RegisterScoringFunctionRequest' - required: true /v1/shields: get: responses: @@ -2345,16 +2321,15 @@ paths: schema: $ref: '#/components/schemas/ResumeAgentTurnRequest' required: true - /v1/eval/benchmarks/{benchmark_id}/jobs: + /v1/evaluation/run: post: responses: '200': - description: >- - The job that was created to run the evaluation. + description: OK content: application/json: schema: - $ref: '#/components/schemas/Job' + $ref: '#/components/schemas/EvaluationJob' '400': $ref: '#/components/responses/BadRequest400' '429': @@ -2366,21 +2341,43 @@ paths: default: $ref: '#/components/responses/DefaultError' tags: - - Eval - description: Run an evaluation on a benchmark. - parameters: - - name: benchmark_id - in: path - description: >- - The ID of the benchmark to run the evaluation on. 
- required: true - schema: - type: string + - Evaluation + description: Run an evaluation job. + parameters: [] requestBody: content: application/json: schema: - $ref: '#/components/schemas/RunEvalRequest' + $ref: '#/components/schemas/RunRequest' + required: true + /v1/evaluation/run_inline: + post: + responses: + '200': + description: OK + content: + application/json: + schema: + $ref: '#/components/schemas/EvaluationResponse' + '400': + $ref: '#/components/responses/BadRequest400' + '429': + $ref: >- + #/components/responses/TooManyRequests429 + '500': + $ref: >- + #/components/responses/InternalServerError500 + default: + $ref: '#/components/responses/DefaultError' + tags: + - Evaluation + description: Run an evaluation job inline. + parameters: [] + requestBody: + content: + application/json: + schema: + $ref: '#/components/schemas/RunInlineRequest' required: true /v1/safety/run-shield: post: @@ -2436,65 +2433,6 @@ paths: schema: $ref: '#/components/schemas/SaveSpansToDatasetRequest' required: true - /v1/scoring/score: - post: - responses: - '200': - description: >- - ScoreResponse object containing rows and aggregated results - content: - application/json: - schema: - $ref: '#/components/schemas/ScoreResponse' - '400': - $ref: '#/components/responses/BadRequest400' - '429': - $ref: >- - #/components/responses/TooManyRequests429 - '500': - $ref: >- - #/components/responses/InternalServerError500 - default: - $ref: '#/components/responses/DefaultError' - tags: - - Scoring - description: Score a list of rows. 
- parameters: [] - requestBody: - content: - application/json: - schema: - $ref: '#/components/schemas/ScoreRequest' - required: true - /v1/scoring/score-batch: - post: - responses: - '200': - description: OK - content: - application/json: - schema: - $ref: '#/components/schemas/ScoreBatchResponse' - '400': - $ref: '#/components/responses/BadRequest400' - '429': - $ref: >- - #/components/responses/TooManyRequests429 - '500': - $ref: >- - #/components/responses/InternalServerError500 - default: - $ref: '#/components/responses/DefaultError' - tags: - - Scoring - description: '' - parameters: [] - requestBody: - content: - application/json: - schema: - $ref: '#/components/schemas/ScoreBatchRequest' - required: true /v1/post-training/supervised-fine-tune: post: responses: @@ -4384,251 +4322,6 @@ components: title: EmbeddingsResponse description: >- Response containing generated embeddings. - AgentCandidate: - type: object - properties: - type: - type: string - const: agent - default: agent - config: - $ref: '#/components/schemas/AgentConfig' - description: >- - The configuration for the agent candidate. - additionalProperties: false - required: - - type - - config - title: AgentCandidate - description: An agent candidate for evaluation. - AggregationFunctionType: - type: string - enum: - - average - - median - - categorical_count - - accuracy - title: AggregationFunctionType - BasicScoringFnParams: - type: object - properties: - type: - type: string - const: basic - default: basic - aggregation_functions: - type: array - items: - $ref: '#/components/schemas/AggregationFunctionType' - additionalProperties: false - required: - - type - title: BasicScoringFnParams - BenchmarkConfig: - type: object - properties: - eval_candidate: - $ref: '#/components/schemas/EvalCandidate' - description: The candidate to evaluate. 
- scoring_params: - type: object - additionalProperties: - $ref: '#/components/schemas/ScoringFnParams' - description: >- - Map between scoring function id and parameters for each scoring function - you want to run - num_examples: - type: integer - description: >- - (Optional) The number of examples to evaluate. If not provided, all examples - in the dataset will be evaluated - additionalProperties: false - required: - - eval_candidate - - scoring_params - title: BenchmarkConfig - description: >- - A benchmark configuration for evaluation. - EvalCandidate: - oneOf: - - $ref: '#/components/schemas/ModelCandidate' - - $ref: '#/components/schemas/AgentCandidate' - discriminator: - propertyName: type - mapping: - model: '#/components/schemas/ModelCandidate' - agent: '#/components/schemas/AgentCandidate' - LLMAsJudgeScoringFnParams: - type: object - properties: - type: - type: string - const: llm_as_judge - default: llm_as_judge - judge_model: - type: string - prompt_template: - type: string - judge_score_regexes: - type: array - items: - type: string - aggregation_functions: - type: array - items: - $ref: '#/components/schemas/AggregationFunctionType' - additionalProperties: false - required: - - type - - judge_model - title: LLMAsJudgeScoringFnParams - ModelCandidate: - type: object - properties: - type: - type: string - const: model - default: model - model: - type: string - description: The model ID to evaluate. - sampling_params: - $ref: '#/components/schemas/SamplingParams' - description: The sampling parameters for the model. - system_message: - $ref: '#/components/schemas/SystemMessage' - description: >- - (Optional) The system message providing instructions or context to the - model. - additionalProperties: false - required: - - type - - model - - sampling_params - title: ModelCandidate - description: A model candidate for evaluation. 
- RegexParserScoringFnParams: - type: object - properties: - type: - type: string - const: regex_parser - default: regex_parser - parsing_regexes: - type: array - items: - type: string - aggregation_functions: - type: array - items: - $ref: '#/components/schemas/AggregationFunctionType' - additionalProperties: false - required: - - type - title: RegexParserScoringFnParams - ScoringFnParams: - oneOf: - - $ref: '#/components/schemas/LLMAsJudgeScoringFnParams' - - $ref: '#/components/schemas/RegexParserScoringFnParams' - - $ref: '#/components/schemas/BasicScoringFnParams' - discriminator: - propertyName: type - mapping: - llm_as_judge: '#/components/schemas/LLMAsJudgeScoringFnParams' - regex_parser: '#/components/schemas/RegexParserScoringFnParams' - basic: '#/components/schemas/BasicScoringFnParams' - EvaluateRowsRequest: - type: object - properties: - input_rows: - type: array - items: - type: object - additionalProperties: - oneOf: - - type: 'null' - - type: boolean - - type: number - - type: string - - type: array - - type: object - description: The rows to evaluate. - scoring_functions: - type: array - items: - type: string - description: >- - The scoring functions to use for the evaluation. - benchmark_config: - $ref: '#/components/schemas/BenchmarkConfig' - description: The configuration for the benchmark. - additionalProperties: false - required: - - input_rows - - scoring_functions - - benchmark_config - title: EvaluateRowsRequest - EvaluateResponse: - type: object - properties: - generations: - type: array - items: - type: object - additionalProperties: - oneOf: - - type: 'null' - - type: boolean - - type: number - - type: string - - type: array - - type: object - description: The generations from the evaluation. - scores: - type: object - additionalProperties: - $ref: '#/components/schemas/ScoringResult' - description: The scores from the evaluation. 
- additionalProperties: false - required: - - generations - - scores - title: EvaluateResponse - description: The response from an evaluation. - ScoringResult: - type: object - properties: - score_rows: - type: array - items: - type: object - additionalProperties: - oneOf: - - type: 'null' - - type: boolean - - type: number - - type: string - - type: array - - type: object - description: >- - The scoring result for each row. Each row is a map of column name to value. - aggregated_results: - type: object - additionalProperties: - oneOf: - - type: 'null' - - type: boolean - - type: number - - type: string - - type: array - - type: object - description: Map of metric name to aggregated value - additionalProperties: false - required: - - score_rows - - aggregated_results - title: ScoringResult - description: A scoring result for a single row. Agent: type: object properties: @@ -4703,10 +4396,14 @@ components: default: benchmark dataset_id: type: string - scoring_functions: + description: >- + The ID of the dataset to use to run the benchmark. + grader_ids: type: array items: type: string + description: >- + The grader ids to use for this benchmark. metadata: type: object additionalProperties: @@ -4717,6 +4414,8 @@ components: - type: string - type: array - type: object + description: >- + Metadata for this benchmark for additional descriptions. additionalProperties: false required: - identifier @@ -4724,7 +4423,7 @@ components: - provider_id - type - dataset_id - - scoring_functions + - grader_ids - metadata title: Benchmark DataSource: @@ -4828,6 +4527,255 @@ components: title: URIDataSource description: >- A dataset that can be obtained from a URI.
+ EqualityGrader: + type: object + properties: + type: + type: string + const: equality + default: equality + equality: + type: object + properties: + aggregation_functions: + type: array + items: + type: string + enum: + - average + - median + - categorical_count + - accuracy + title: AggregationFunctionType + description: A type of aggregation function. + additionalProperties: false + required: + - aggregation_functions + title: BasicGraderParams + additionalProperties: false + required: + - type + - equality + title: EqualityGrader + FactualityGrader: + type: object + properties: + type: + type: string + const: factuality + default: factuality + factuality: + type: object + properties: + aggregation_functions: + type: array + items: + type: string + enum: + - average + - median + - categorical_count + - accuracy + title: AggregationFunctionType + description: A type of aggregation function. + additionalProperties: false + required: + - aggregation_functions + title: BasicGraderParams + additionalProperties: false + required: + - type + - factuality + title: FactualityGrader + FaithfulnessGrader: + type: object + properties: + type: + type: string + const: faithfulness + default: faithfulness + faithfulness: + type: object + properties: + aggregation_functions: + type: array + items: + type: string + enum: + - average + - median + - categorical_count + - accuracy + title: AggregationFunctionType + description: A type of aggregation function. 
+ additionalProperties: false + required: + - aggregation_functions + title: BasicGraderParams + additionalProperties: false + required: + - type + - faithfulness + title: FaithfulnessGrader + Grader: + type: object + properties: + identifier: + type: string + provider_resource_id: + type: string + provider_id: + type: string + type: + type: string + const: grader + default: grader + grader: + $ref: '#/components/schemas/GraderDefinition' + description: + type: string + metadata: + type: object + additionalProperties: + oneOf: + - type: 'null' + - type: boolean + - type: number + - type: string + - type: array + - type: object + additionalProperties: false + required: + - identifier + - provider_resource_id + - provider_id + - type + - grader + - metadata + title: Grader + GraderDefinition: + oneOf: + - $ref: '#/components/schemas/LlmGrader' + - $ref: '#/components/schemas/RegexParserGrader' + - $ref: '#/components/schemas/EqualityGrader' + - $ref: '#/components/schemas/SubsetOfGrader' + - $ref: '#/components/schemas/FactualityGrader' + - $ref: '#/components/schemas/FaithfulnessGrader' + discriminator: + propertyName: type + mapping: + llm: '#/components/schemas/LlmGrader' + regex_parser: '#/components/schemas/RegexParserGrader' + equality: '#/components/schemas/EqualityGrader' + subset_of: '#/components/schemas/SubsetOfGrader' + factuality: '#/components/schemas/FactualityGrader' + faithfulness: '#/components/schemas/FaithfulnessGrader' + LlmGrader: + type: object + properties: + type: + type: string + const: llm + default: llm + llm: + type: object + properties: + model: + type: string + prompt: + type: string + score_regexes: + type: array + items: + type: string + aggregation_functions: + type: array + items: + type: string + enum: + - average + - median + - categorical_count + - accuracy + title: AggregationFunctionType + description: A type of aggregation function. 
+ additionalProperties: false + required: + - model + - prompt + - score_regexes + - aggregation_functions + title: LlmGraderParams + additionalProperties: false + required: + - type + - llm + title: LlmGrader + RegexParserGrader: + type: object + properties: + type: + type: string + const: regex_parser + default: regex_parser + regex_parser: + type: object + properties: + parsing_regexes: + type: array + items: + type: string + aggregation_functions: + type: array + items: + type: string + enum: + - average + - median + - categorical_count + - accuracy + title: AggregationFunctionType + description: A type of aggregation function. + additionalProperties: false + required: + - parsing_regexes + - aggregation_functions + title: RegexParserGraderParams + additionalProperties: false + required: + - type + - regex_parser + title: RegexParserGrader + SubsetOfGrader: + type: object + properties: + type: + type: string + const: subset_of + default: subset_of + subset_of: + type: object + properties: + aggregation_functions: + type: array + items: + type: string + enum: + - average + - median + - categorical_count + - accuracy + title: AggregationFunctionType + description: A type of aggregation function. 
+ additionalProperties: false + required: + - aggregation_functions + title: BasicGraderParams + additionalProperties: false + required: + - type + - subset_of + title: SubsetOfGrader Model: type: object properties: @@ -4869,179 +4817,6 @@ components: - llm - embedding title: ModelType - AgentTurnInputType: - type: object - properties: - type: - type: string - const: agent_turn_input - default: agent_turn_input - additionalProperties: false - required: - - type - title: AgentTurnInputType - ArrayType: - type: object - properties: - type: - type: string - const: array - default: array - additionalProperties: false - required: - - type - title: ArrayType - BooleanType: - type: object - properties: - type: - type: string - const: boolean - default: boolean - additionalProperties: false - required: - - type - title: BooleanType - ChatCompletionInputType: - type: object - properties: - type: - type: string - const: chat_completion_input - default: chat_completion_input - additionalProperties: false - required: - - type - title: ChatCompletionInputType - CompletionInputType: - type: object - properties: - type: - type: string - const: completion_input - default: completion_input - additionalProperties: false - required: - - type - title: CompletionInputType - JsonType: - type: object - properties: - type: - type: string - const: json - default: json - additionalProperties: false - required: - - type - title: JsonType - NumberType: - type: object - properties: - type: - type: string - const: number - default: number - additionalProperties: false - required: - - type - title: NumberType - ObjectType: - type: object - properties: - type: - type: string - const: object - default: object - additionalProperties: false - required: - - type - title: ObjectType - ParamType: - oneOf: - - $ref: '#/components/schemas/StringType' - - $ref: '#/components/schemas/NumberType' - - $ref: '#/components/schemas/BooleanType' - - $ref: '#/components/schemas/ArrayType' - - $ref: 
'#/components/schemas/ObjectType' - - $ref: '#/components/schemas/JsonType' - - $ref: '#/components/schemas/UnionType' - - $ref: '#/components/schemas/ChatCompletionInputType' - - $ref: '#/components/schemas/CompletionInputType' - - $ref: '#/components/schemas/AgentTurnInputType' - discriminator: - propertyName: type - mapping: - string: '#/components/schemas/StringType' - number: '#/components/schemas/NumberType' - boolean: '#/components/schemas/BooleanType' - array: '#/components/schemas/ArrayType' - object: '#/components/schemas/ObjectType' - json: '#/components/schemas/JsonType' - union: '#/components/schemas/UnionType' - chat_completion_input: '#/components/schemas/ChatCompletionInputType' - completion_input: '#/components/schemas/CompletionInputType' - agent_turn_input: '#/components/schemas/AgentTurnInputType' - ScoringFn: - type: object - properties: - identifier: - type: string - provider_resource_id: - type: string - provider_id: - type: string - type: - type: string - const: scoring_function - default: scoring_function - description: - type: string - metadata: - type: object - additionalProperties: - oneOf: - - type: 'null' - - type: boolean - - type: number - - type: string - - type: array - - type: object - return_type: - $ref: '#/components/schemas/ParamType' - params: - $ref: '#/components/schemas/ScoringFnParams' - additionalProperties: false - required: - - identifier - - provider_resource_id - - provider_id - - type - - metadata - - return_type - title: ScoringFn - StringType: - type: object - properties: - type: - type: string - const: string - default: string - additionalProperties: false - required: - - type - title: StringType - UnionType: - type: object - properties: - type: - type: string - const: union - default: union - additionalProperties: false - required: - - type - title: UnionType Shield: type: object properties: @@ -5292,21 +5067,20 @@ components: - checkpoints title: PostTrainingJobArtifactsResponse description: Artifacts of a 
finetuning job. - JobStatus: - type: string - enum: - - completed - - in_progress - - failed - - scheduled - title: JobStatus PostTrainingJobStatusResponse: type: object properties: job_uuid: type: string status: - $ref: '#/components/schemas/JobStatus' + type: string + enum: + - completed + - in_progress + - failed + - scheduled + - cancelled + title: JobStatus scheduled_at: type: string format: date-time @@ -5381,6 +5155,255 @@ components: - embedding_model - embedding_dimension title: VectorDB + BenchmarkTask: + type: object + properties: + type: + type: string + const: benchmark_id + default: benchmark_id + benchmark_id: + type: string + additionalProperties: false + required: + - type + - benchmark_id + title: BenchmarkTask + DataSourceGraderTask: + type: object + properties: + type: + type: string + const: data_source_grader + default: data_source_grader + data_source: + $ref: '#/components/schemas/DataSource' + grader_ids: + type: array + items: + type: string + additionalProperties: false + required: + - type + - data_source + - grader_ids + title: DataSourceGraderTask + DatasetGraderTask: + type: object + properties: + type: + type: string + const: dataset_grader + default: dataset_grader + dataset_id: + type: string + grader_ids: + type: array + items: + type: string + additionalProperties: false + required: + - type + - dataset_id + - grader_ids + title: DatasetGraderTask + EvaluationTask: + oneOf: + - $ref: '#/components/schemas/BenchmarkTask' + - $ref: '#/components/schemas/DatasetGraderTask' + - $ref: '#/components/schemas/DataSourceGraderTask' + discriminator: + propertyName: type + mapping: + benchmark_id: '#/components/schemas/BenchmarkTask' + dataset_grader: '#/components/schemas/DatasetGraderTask' + data_source_grader: '#/components/schemas/DataSourceGraderTask' + GradeRequest: + type: object + properties: + task: + $ref: '#/components/schemas/EvaluationTask' + description: >- + The task to evaluate. 
One of: - BenchmarkTask: Run evaluation task against + a benchmark_id - DatasetGraderTask: Run evaluation task against a dataset_id + and a list of grader_ids - DataSourceGraderTask: Run evaluation task against + a data source (e.g. rows, uri, etc.) and a list of grader_ids + additionalProperties: false + required: + - task + title: GradeRequest + AgentCandidate: + type: object + properties: + type: + type: string + const: agent + default: agent + config: + $ref: '#/components/schemas/AgentConfig' + description: >- + The configuration for the agent candidate. + additionalProperties: false + required: + - type + - config + title: AgentCandidate + description: An agent candidate for evaluation. + EvaluationCandidate: + oneOf: + - $ref: '#/components/schemas/ModelCandidate' + - $ref: '#/components/schemas/AgentCandidate' + discriminator: + propertyName: type + mapping: + model: '#/components/schemas/ModelCandidate' + agent: '#/components/schemas/AgentCandidate' + EvaluationJob: + type: object + properties: + id: + type: string + description: The ID of the job. + status: + type: string + enum: + - completed + - in_progress + - failed + - scheduled + - cancelled + description: The status of the job. + created_at: + type: string + format: date-time + description: The time the job was created. + ended_at: + type: string + format: date-time + description: The time the job ended. + error: + type: string + description: >- + If status of the job is failed, this will contain the error message. 
+ type: + type: string + const: evaluation + default: evaluation + task: + $ref: '#/components/schemas/EvaluationTask' + candidate: + $ref: '#/components/schemas/EvaluationCandidate' + additionalProperties: false + required: + - id + - status + - created_at + - type + - task + - candidate + title: EvaluationJob + ModelCandidate: + type: object + properties: + type: + type: string + const: model + default: model + model_id: + type: string + sampling_params: + $ref: '#/components/schemas/SamplingParams' + description: The sampling parameters for the model. + system_message: + $ref: '#/components/schemas/SystemMessage' + description: >- + (Optional) The system message providing instructions or context to the + model. + additionalProperties: false + required: + - type + - model_id + - sampling_params + title: ModelCandidate + description: A model candidate for evaluation. + GradeInlineRequest: + type: object + properties: + task: + $ref: '#/components/schemas/EvaluationTask' + description: >- + The task to evaluate. One of: - BenchmarkTask: Run evaluation task against + a benchmark_id - DatasetGraderTask: Run evaluation task against a dataset_id + and a list of grader_ids - DataSourceGraderTask: Run evaluation task against + a data source (e.g. rows, uri, etc.) and a list of grader_ids + additionalProperties: false + required: + - task + title: GradeInlineRequest + EvaluationResponse: + type: object + properties: + generations: + type: array + items: + type: object + additionalProperties: + oneOf: + - type: 'null' + - type: boolean + - type: number + - type: string + - type: array + - type: object + description: >- + The generations in rows for the evaluation. + scores: + type: object + additionalProperties: + $ref: '#/components/schemas/ScoringResult' + description: >- + The scores for the evaluation. Map of grader id to ScoringResult. 
+ additionalProperties: false + required: + - generations + - scores + title: EvaluationResponse + description: A response to an inline evaluation. + ScoringResult: + type: object + properties: + scores: + type: array + items: + type: object + additionalProperties: + oneOf: + - type: 'null' + - type: boolean + - type: number + - type: string + - type: array + - type: object + description: >- + The scoring result for each row. Each row is a map of grader column name + to value. + metrics: + type: object + additionalProperties: + oneOf: + - type: 'null' + - type: boolean + - type: number + - type: string + - type: array + - type: object + description: Map of metric name to aggregated value. + additionalProperties: false + required: + - scores + - metrics + title: ScoringResult + description: A scoring result for a single row. HealthInfo: type: object properties: @@ -5648,6 +5671,56 @@ components: title: ListFileResponse description: >- Response representing a list of file entries. + GraderTypeInfo: + type: object + properties: + grader_type: + type: string + enum: + - llm + - regex_parser + - equality + - subset_of + - factuality + - faithfulness + title: GraderType + description: >- + A type of grader. Each type is a criteria for evaluating answers. + description: + type: string + description: >- + A description of the grader type. - E.g. Write your custom judge prompt + to score the answer. + supported_dataset_purposes: + type: array + items: + type: string + enum: + - post-training/messages + - eval/question-answer + - eval/messages-answer + title: DatasetPurpose + description: >- + Purpose of the dataset. Each purpose has a required input data schema. + description: >- + The purposes that this grader can be used for. 
+ additionalProperties: false + required: + - grader_type + - description + - supported_dataset_purposes + title: GraderTypeInfo + ListGraderTypesResponse: + type: object + properties: + data: + type: array + items: + $ref: '#/components/schemas/GraderTypeInfo' + additionalProperties: false + required: + - data + title: ListGraderTypesResponse ListModelsResponse: type: object properties: @@ -5698,17 +5771,6 @@ components: required: - data title: ListRoutesResponse - ListScoringFunctionsResponse: - type: object - properties: - data: - type: array - items: - $ref: '#/components/schemas/ScoringFn' - additionalProperties: false - required: - - data - title: ListScoringFunctionsResponse ListShieldsResponse: type: object properties: @@ -6343,18 +6405,21 @@ components: RegisterBenchmarkRequest: type: object properties: - benchmark_id: - type: string dataset_id: type: string - scoring_functions: + description: >- + The ID of the dataset to use to run the benchmark. + grader_ids: type: array items: type: string - provider_benchmark_id: - type: string - provider_id: + description: >- + List of grader ids to use for this benchmark. + benchmark_id: type: string + description: >- + (Optional) The ID of the benchmark to register. If not provided, an ID + will be generated. metadata: type: object additionalProperties: @@ -6365,11 +6430,12 @@ components: - type: string - type: array - type: object + description: >- + (Optional) Metadata for this benchmark for additional descriptions. additionalProperties: false required: - - benchmark_id - dataset_id - - scoring_functions + - grader_ids title: RegisterBenchmarkRequest RegisterDatasetRequest: type: object @@ -6422,6 +6488,37 @@ components: - purpose - source title: RegisterDatasetRequest + RegisterGraderRequest: + type: object + properties: + grader: + $ref: '#/components/schemas/GraderDefinition' + description: >- + The grader definition, E.g. - { "type": "llm", "llm": { "model": "llama-405b", + "prompt": "You are a judge.
Score the answer based on the question. {question} + {answer}", } } + grader_id: + type: string + description: >- + (Optional) The ID of the grader. If not provided, a random ID will be + generated. + metadata: + type: object + additionalProperties: + oneOf: + - type: 'null' + - type: boolean + - type: number + - type: string + - type: array + - type: object + description: >- + (Optional) Any additional metadata for this grader. - E.g. { "description": + "A grader that scores the answer based on the question.", } + additionalProperties: false + required: + - grader + title: RegisterGraderRequest RegisterModelRequest: type: object properties: @@ -6447,27 +6544,6 @@ components: required: - model_id title: RegisterModelRequest - RegisterScoringFunctionRequest: - type: object - properties: - scoring_fn_id: - type: string - description: - type: string - return_type: - $ref: '#/components/schemas/ParamType' - provider_scoring_fn_id: - type: string - provider_id: - type: string - params: - $ref: '#/components/schemas/ScoringFnParams' - additionalProperties: false - required: - - scoring_fn_id - - description - - return_type - title: RegisterScoringFunctionRequest RegisterShieldRequest: type: object properties: @@ -6549,25 +6625,42 @@ components: required: - tool_responses title: ResumeAgentTurnRequest - RunEvalRequest: + RunRequest: type: object properties: - benchmark_config: - $ref: '#/components/schemas/BenchmarkConfig' - description: The configuration for the benchmark. + task: + $ref: '#/components/schemas/EvaluationTask' + description: >- + The task to evaluate. One of: - BenchmarkTask: Run evaluation task against + a benchmark_id - DatasetGraderTask: Run evaluation task against a dataset_id + and a list of grader_ids - DataSourceGraderTask: Run evaluation task against + a data source (e.g. rows, uri, etc.) and a list of grader_ids + candidate: + $ref: '#/components/schemas/EvaluationCandidate' + description: The candidate to evaluate. 
additionalProperties: false required: - - benchmark_config - title: RunEvalRequest - Job: + - task + - candidate + title: RunRequest + RunInlineRequest: type: object properties: - job_id: - type: string + task: + $ref: '#/components/schemas/EvaluationTask' + description: >- + The task to evaluate. One of: - BenchmarkTask: Run evaluation task against + a benchmark_id - DatasetGraderTask: Run evaluation task against a dataset_id + and a list of grader_ids - DataSourceGraderTask: Run evaluation task against + a data source (e.g. rows, uri, etc.) and a list of grader_ids + candidate: + $ref: '#/components/schemas/EvaluationCandidate' + description: The candidate to evaluate. additionalProperties: false required: - - job_id - title: Job + - task + - candidate + title: RunInlineRequest RunShieldRequest: type: object properties: @@ -6621,81 +6714,6 @@ components: - attributes_to_save - dataset_id title: SaveSpansToDatasetRequest - ScoreRequest: - type: object - properties: - input_rows: - type: array - items: - type: object - additionalProperties: - oneOf: - - type: 'null' - - type: boolean - - type: number - - type: string - - type: array - - type: object - description: The rows to score. - scoring_functions: - type: object - additionalProperties: - oneOf: - - $ref: '#/components/schemas/ScoringFnParams' - - type: 'null' - description: >- - The scoring functions to use for the scoring. - additionalProperties: false - required: - - input_rows - - scoring_functions - title: ScoreRequest - ScoreResponse: - type: object - properties: - results: - type: object - additionalProperties: - $ref: '#/components/schemas/ScoringResult' - description: >- - A map of scoring function name to ScoringResult. - additionalProperties: false - required: - - results - title: ScoreResponse - description: The response from scoring. 
- ScoreBatchRequest: - type: object - properties: - dataset_id: - type: string - scoring_functions: - type: object - additionalProperties: - oneOf: - - $ref: '#/components/schemas/ScoringFnParams' - - type: 'null' - save_results_dataset: - type: boolean - additionalProperties: false - required: - - dataset_id - - scoring_functions - - save_results_dataset - title: ScoreBatchRequest - ScoreBatchResponse: - type: object - properties: - dataset_id: - type: string - results: - type: object - additionalProperties: - $ref: '#/components/schemas/ScoringResult' - additionalProperties: false - required: - - results - title: ScoreBatchResponse AlgorithmConfig: oneOf: - $ref: '#/components/schemas/LoraFinetuningConfig' @@ -6933,10 +6951,9 @@ tags: - name: Benchmarks - name: DatasetIO - name: Datasets - - name: Eval - x-displayName: >- - Llama Stack Evaluation API for running evaluations on model and agent candidates. + - name: Evaluation - name: Files + - name: Graders - name: Inference description: >- This API provides the raw interface to the underlying models. Two kinds of models @@ -6956,8 +6973,6 @@ tags: x-displayName: >- Providers API for inspecting, listing, and modifying providers and their configurations. 
- name: Safety - - name: Scoring - - name: ScoringFunctions - name: Shields - name: SyntheticDataGeneration (Coming Soon) - name: Telemetry @@ -6973,16 +6988,15 @@ x-tagGroups: - Benchmarks - DatasetIO - Datasets - - Eval + - Evaluation - Files + - Graders - Inference - Inspect - Models - PostTraining (Coming Soon) - Providers - Safety - - Scoring - - ScoringFunctions - Shields - SyntheticDataGeneration (Coming Soon) - Telemetry diff --git a/llama_stack/apis/benchmarks/benchmarks.py b/llama_stack/apis/benchmarks/benchmarks.py index 39ba355e9..eaaf8530b 100644 --- a/llama_stack/apis/benchmarks/benchmarks.py +++ b/llama_stack/apis/benchmarks/benchmarks.py @@ -12,11 +12,17 @@ from llama_stack.schema_utils import json_schema_type, webmethod class CommonBenchmarkFields(BaseModel): + """ + :param dataset_id: The ID of the dataset to use to run the benchmark. + :param grader_ids: The grader ids to use for this benchmark. + :param metadata: Metadata for this benchmark for additional descriptions. + """ + dataset_id: str - scoring_functions: List[str] + grader_ids: List[str] metadata: Dict[str, Any] = Field( default_factory=dict, - description="Metadata for this evaluation task", + description="Metadata for this benchmark", ) @@ -45,22 +51,39 @@ class ListBenchmarksResponse(BaseModel): @runtime_checkable class Benchmarks(Protocol): + @webmethod(route="/eval/benchmarks", method="POST") + async def register_benchmark( + self, + dataset_id: str, + grader_ids: List[str], + benchmark_id: Optional[str] = None, + metadata: Optional[Dict[str, Any]] = None, + ) -> Benchmark: + """ + Register a new benchmark. + + :param dataset_id: The ID of the dataset to use to run the benchmark. + :param grader_ids: List of grader ids to use for this benchmark. + :param benchmark_id: (Optional) The ID of the benchmark to register. If not provided, an ID will be generated. + :param metadata: (Optional) Metadata for this benchmark for additional descriptions. + """ + ...
+ @webmethod(route="/eval/benchmarks", method="GET") - async def list_benchmarks(self) -> ListBenchmarksResponse: ... + async def list_benchmarks(self) -> ListBenchmarksResponse: + """ + List all benchmarks. + """ + ... @webmethod(route="/eval/benchmarks/{benchmark_id}", method="GET") async def get_benchmark( self, benchmark_id: str, - ) -> Optional[Benchmark]: ... + ) -> Benchmark: + """ + Get a benchmark by ID. - @webmethod(route="/eval/benchmarks", method="POST") - async def register_benchmark( - self, - benchmark_id: str, - dataset_id: str, - scoring_functions: List[str], - provider_benchmark_id: Optional[str] = None, - provider_id: Optional[str] = None, - metadata: Optional[Dict[str, Any]] = None, - ) -> None: ... + :param benchmark_id: The ID of the benchmark to get. + """ + ... diff --git a/llama_stack/apis/common/job_types.py b/llama_stack/apis/common/job_types.py index bc070017b..e27f19493 100644 --- a/llama_stack/apis/common/job_types.py +++ b/llama_stack/apis/common/job_types.py @@ -3,21 +3,49 @@ # # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. +from datetime import datetime from enum import Enum +from typing import Optional from pydantic import BaseModel from llama_stack.schema_utils import json_schema_type -@json_schema_type -class Job(BaseModel): - job_id: str +class JobType(Enum): + batch_inference = "batch_inference" + evaluation = "evaluation" + finetuning = "finetuning" -@json_schema_type class JobStatus(Enum): completed = "completed" in_progress = "in_progress" failed = "failed" scheduled = "scheduled" + cancelled = "cancelled" + + +class JobArtifact(BaseModel): + """ + A job artifact is a file or directory that is produced by a job. + """ + + path: str + + +@json_schema_type +class CommonJobFields(BaseModel): + """Common fields for all jobs. + :param id: The ID of the job. + :param status: The status of the job. + :param created_at: The time the job was created. 
+ :param ended_at: The time the job ended. + :param error: If status of the job is failed, this will contain the error message. + """ + + id: str + status: JobStatus + created_at: datetime + ended_at: Optional[datetime] = None + error: Optional[str] = None diff --git a/llama_stack/apis/eval/eval.py b/llama_stack/apis/eval/eval.py index dec018d83..5b4433041 100644 --- a/llama_stack/apis/eval/eval.py +++ b/llama_stack/apis/eval/eval.py @@ -10,7 +10,7 @@ from pydantic import BaseModel, Field from typing_extensions import Annotated from llama_stack.apis.agents import AgentConfig -from llama_stack.apis.common.job_types import Job, JobStatus +from llama_stack.apis.common.job_types import JobStatus from llama_stack.apis.inference import SamplingParams, SystemMessage from llama_stack.apis.scoring import ScoringResult from llama_stack.apis.scoring_functions import ScoringFnParams @@ -91,7 +91,7 @@ class Eval(Protocol): self, benchmark_id: str, benchmark_config: BenchmarkConfig, - ) -> Job: + ) -> None: """Run an evaluation on a benchmark. :param benchmark_id: The ID of the benchmark to run the evaluation on. @@ -135,7 +135,9 @@ class Eval(Protocol): """ ... - @webmethod(route="/eval/benchmarks/{benchmark_id}/jobs/{job_id}/result", method="GET") + @webmethod( + route="/eval/benchmarks/{benchmark_id}/jobs/{job_id}/result", method="GET" + ) async def job_result(self, benchmark_id: str, job_id: str) -> EvaluateResponse: """Get the result of a job. diff --git a/llama_stack/apis/evaluation/__init__.py b/llama_stack/apis/evaluation/__init__.py new file mode 100644 index 000000000..9a168a2bc --- /dev/null +++ b/llama_stack/apis/evaluation/__init__.py @@ -0,0 +1,7 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. 
+ +from .evaluation import * # noqa: F401 F403 diff --git a/llama_stack/apis/evaluation/evaluation.py b/llama_stack/apis/evaluation/evaluation.py new file mode 100644 index 000000000..444495b6e --- /dev/null +++ b/llama_stack/apis/evaluation/evaluation.py @@ -0,0 +1,175 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. +from typing import Any, Dict, List, Literal, Optional, Protocol, Union + +from pydantic import BaseModel, Field +from typing_extensions import Annotated + +from llama_stack.apis.agents import AgentConfig +from llama_stack.apis.common.job_types import CommonJobFields, JobType +from llama_stack.apis.datasets import DataSource +from llama_stack.apis.inference import SamplingParams, SystemMessage +from llama_stack.schema_utils import json_schema_type, register_schema, webmethod + + +@json_schema_type +class ModelCandidate(BaseModel): + """A model candidate for evaluation. + + :param model_id: The model ID to evaluate. + :param sampling_params: The sampling parameters for the model. + :param system_message: (Optional) The system message providing instructions or context to the model. + """ + + type: Literal["model"] = "model" + model_id: str + sampling_params: SamplingParams + system_message: Optional[SystemMessage] = None + + +@json_schema_type +class AgentCandidate(BaseModel): + """An agent candidate for evaluation. + + :param config: The configuration for the agent candidate.
+ """ + + type: Literal["agent"] = "agent" + config: AgentConfig + + +EvaluationCandidate = register_schema( + Annotated[Union[ModelCandidate, AgentCandidate], Field(discriminator="type")], + name="EvaluationCandidate", +) + + +@json_schema_type +class BenchmarkTask(BaseModel): + type: Literal["benchmark_id"] = "benchmark_id" + benchmark_id: str + + +@json_schema_type +class DatasetGraderTask(BaseModel): + type: Literal["dataset_grader"] = "dataset_grader" + dataset_id: str + grader_ids: List[str] + + +@json_schema_type +class DataSourceGraderTask(BaseModel): + type: Literal["data_source_grader"] = "data_source_grader" + data_source: DataSource + grader_ids: List[str] + + +EvaluationTask = register_schema( + Annotated[ + Union[BenchmarkTask, DatasetGraderTask, DataSourceGraderTask], + Field(discriminator="type"), + ], + name="EvaluationTask", +) + + +@json_schema_type +class EvaluationJob(CommonJobFields): + type: Literal[JobType.evaluation.value] = JobType.evaluation.value + + # input params for the submitted evaluation job + task: EvaluationTask + candidate: EvaluationCandidate + + +@json_schema_type +class ScoringResult(BaseModel): + """ + A scoring result for a single row. + + :param scores: The scoring result for each row. Each row is a map of grader column name to value. + :param metrics: Map of metric name to aggregated value. + """ + + scores: List[Dict[str, Any]] + metrics: Dict[str, Any] + + +@json_schema_type +class EvaluationResponse(BaseModel): + """ + A response to an inline evaluation. + + :param generations: The generations in rows for the evaluation. + :param scores: The scores for the evaluation. Map of grader id to ScoringResult. + """ + + generations: List[Dict[str, Any]] + scores: Dict[str, ScoringResult] + + +class Evaluation(Protocol): + @webmethod(route="/evaluation/run", method="POST") + async def run( + self, + task: EvaluationTask, + candidate: EvaluationCandidate, + ) -> EvaluationJob: + """ + Run an evaluation job. 
+ + :param task: The task to evaluate. One of: + - BenchmarkTask: Run evaluation task against a benchmark_id + - DatasetGraderTask: Run evaluation task against a dataset_id and a list of grader_ids + - DataSourceGraderTask: Run evaluation task against a data source (e.g. rows, uri, etc.) and a list of grader_ids + :param candidate: The candidate to evaluate. + """ + ... + + @webmethod(route="/evaluation/run_inline", method="POST") + async def run_inline( + self, + task: EvaluationTask, + candidate: EvaluationCandidate, + ) -> EvaluationResponse: + """ + Run an evaluation job inline. + + :param task: The task to evaluate. One of: + - BenchmarkTask: Run evaluation task against a benchmark_id + - DatasetGraderTask: Run evaluation task against a dataset_id and a list of grader_ids + - DataSourceGraderTask: Run evaluation task against a data source (e.g. rows, uri, etc.) and a list of grader_ids + :param candidate: The candidate to evaluate. + """ + ... + + @webmethod(route="/evaluation/grade", method="POST") + async def grade(self, task: EvaluationTask) -> EvaluationJob: + """ + Run a grading job with generated results. Use this when you have generated results from inference in a dataset. + + :param task: The task to evaluate. One of: + - BenchmarkTask: Run evaluation task against a benchmark_id + - DatasetGraderTask: Run evaluation task against a dataset_id and a list of grader_ids + - DataSourceGraderTask: Run evaluation task against a data source (e.g. rows, uri, etc.) and a list of grader_ids + + :return: The evaluation job containing grader scores. + """ + ... + + @webmethod(route="/evaluation/grade_inline", method="POST") + async def grade_inline(self, task: EvaluationTask) -> EvaluationResponse: + """ + Run a grading job with generated results inline. + + :param task: The task to evaluate.
One of: + - BenchmarkTask: Run evaluation task against a benchmark_id + - DatasetGraderTask: Run evaluation task against a dataset_id and a list of grader_ids + - DataSourceGraderTask: Run evaluation task against a data source (e.g. rows, uri, etc.) and a list of grader_ids + + :return: The evaluation response containing grader scores. "generations" is not populated in the response. + """ + ... diff --git a/llama_stack/apis/graders/__init__.py b/llama_stack/apis/graders/__init__.py new file mode 100644 index 000000000..b5791cb88 --- /dev/null +++ b/llama_stack/apis/graders/__init__.py @@ -0,0 +1,7 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. + +from .graders import * # noqa: F401 F403 diff --git a/llama_stack/distribution/stack.py b/llama_stack/distribution/stack.py index 9c9289a77..cd1c58348 100644 --- a/llama_stack/distribution/stack.py +++ b/llama_stack/distribution/stack.py @@ -17,16 +17,15 @@ from llama_stack.apis.batch_inference import BatchInference from llama_stack.apis.benchmarks import Benchmarks from llama_stack.apis.datasetio import DatasetIO from llama_stack.apis.datasets import Datasets -from llama_stack.apis.eval import Eval +from llama_stack.apis.evaluation import Evaluation from llama_stack.apis.files import Files +from llama_stack.apis.graders import Graders from llama_stack.apis.inference import Inference from llama_stack.apis.inspect import Inspect from llama_stack.apis.models import Models from llama_stack.apis.post_training import PostTraining from llama_stack.apis.providers import Providers from llama_stack.apis.safety import Safety -from llama_stack.apis.scoring import Scoring -from llama_stack.apis.scoring_functions import ScoringFunctions from llama_stack.apis.shields import Shields from llama_stack.apis.synthetic_data_generation import SyntheticDataGeneration from
llama_stack.apis.telemetry import Telemetry @@ -56,10 +55,7 @@ class LlamaStack( Telemetry, PostTraining, VectorIO, - Eval, Benchmarks, - Scoring, - ScoringFunctions, DatasetIO, Models, Shields, @@ -68,6 +64,8 @@ class LlamaStack( ToolRuntime, RAGToolRuntime, Files, + Graders, + Evaluation, ): pass @@ -113,7 +111,9 @@ class EnvVarError(Exception): def __init__(self, var_name: str, path: str = ""): self.var_name = var_name self.path = path - super().__init__(f"Environment variable '{var_name}' not set or empty{f' at {path}' if path else ''}") + super().__init__( + f"Environment variable '{var_name}' not set or empty{f' at {path}' if path else ''}" + ) def redact_sensitive_fields(data: Dict[str, Any]) -> Dict[str, Any]: @@ -204,7 +204,9 @@ def validate_env_pair(env_pair: str) -> tuple[str, str]: if not key: raise ValueError(f"Empty key in environment variable pair: {env_pair}") if not all(c.isalnum() or c == "_" for c in key): - raise ValueError(f"Key must contain only alphanumeric characters and underscores: {key}") + raise ValueError( + f"Key must contain only alphanumeric characters and underscores: {key}" + ) return key, value except ValueError as e: raise ValueError( @@ -217,14 +219,20 @@ def validate_env_pair(env_pair: str) -> tuple[str, str]: async def construct_stack( run_config: StackRunConfig, provider_registry: Optional[ProviderRegistry] = None ) -> Dict[Api, Any]: - dist_registry, _ = await create_dist_registry(run_config.metadata_store, run_config.image_name) - impls = await resolve_impls(run_config, provider_registry or get_provider_registry(), dist_registry) + dist_registry, _ = await create_dist_registry( + run_config.metadata_store, run_config.image_name + ) + impls = await resolve_impls( + run_config, provider_registry or get_provider_registry(), dist_registry + ) await register_resources(run_config, impls) return impls def get_stack_run_config_from_template(template: str) -> StackRunConfig: - template_path = importlib.resources.files("llama_stack") 
/ f"templates/{template}/run.yaml" + template_path = ( + importlib.resources.files("llama_stack") / f"templates/{template}/run.yaml" + ) with importlib.resources.as_file(template_path) as path: if not path.exists(): @@ -267,7 +275,9 @@ def run_config_from_adhoc_config_spec( # call method "sample_run_config" on the provider spec config class provider_config_type = instantiate_class_type(provider_spec.config_class) - provider_config = replace_env_vars(provider_config_type.sample_run_config(__distro_dir__=distro_dir)) + provider_config = replace_env_vars( + provider_config_type.sample_run_config(__distro_dir__=distro_dir) + ) provider_configs_by_api[api_str] = [ Provider(