From a69759613a0f024b54dbe97229d3de7cac5109c5 Mon Sep 17 00:00:00 2001 From: Xi Yan Date: Tue, 18 Mar 2025 15:01:41 -0700 Subject: [PATCH] comments --- docs/_static/llama-stack-spec.html | 1670 ++++++++++++++++++--- docs/_static/llama-stack-spec.yaml | 1134 ++++++++++++-- llama_stack/apis/benchmarks/benchmarks.py | 9 +- llama_stack/apis/common/job_types.py | 7 +- llama_stack/apis/evaluation/evaluation.py | 14 +- llama_stack/apis/graders/graders.py | 11 +- llama_stack/distribution/stack.py | 30 +- 7 files changed, 2486 insertions(+), 389 deletions(-) diff --git a/docs/_static/llama-stack-spec.html b/docs/_static/llama-stack-spec.html index 8de7f86de..cb5959e22 100644 --- a/docs/_static/llama-stack-spec.html +++ b/docs/_static/llama-stack-spec.html @@ -866,83 +866,6 @@ ] } }, - "/v1/graders/{grader_id}": { - "get": { - "responses": { - "200": { - "description": "The grader.", - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/Grader" - } - } - } - }, - "400": { - "$ref": "#/components/responses/BadRequest400" - }, - "429": { - "$ref": "#/components/responses/TooManyRequests429" - }, - "500": { - "$ref": "#/components/responses/InternalServerError500" - }, - "default": { - "$ref": "#/components/responses/DefaultError" - } - }, - "tags": [ - "Graders" - ], - "description": "Get a grader by ID.", - "parameters": [ - { - "name": "grader_id", - "in": "path", - "description": "The ID of the grader.", - "required": true, - "schema": { - "type": "string" - } - } - ] - }, - "delete": { - "responses": { - "200": { - "description": "OK" - }, - "400": { - "$ref": "#/components/responses/BadRequest400" - }, - "429": { - "$ref": "#/components/responses/TooManyRequests429" - }, - "500": { - "$ref": "#/components/responses/InternalServerError500" - }, - "default": { - "$ref": "#/components/responses/DefaultError" - } - }, - "tags": [ - "Graders" - ], - "description": "Delete a grader by ID.", - "parameters": [ - { - "name": "grader_id", - "in": 
"path", - "description": "The ID of the grader.", - "required": true, - "schema": { - "type": "string" - } - } - ] - } - }, "/v1/inference/embeddings": { "post": { "responses": { @@ -986,6 +909,59 @@ } } }, + "/v1/eval/benchmarks/{benchmark_id}/evaluations": { + "post": { + "responses": { + "200": { + "description": "EvaluateResponse object containing generations and scores", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/EvaluateResponse" + } + } + } + }, + "400": { + "$ref": "#/components/responses/BadRequest400" + }, + "429": { + "$ref": "#/components/responses/TooManyRequests429" + }, + "500": { + "$ref": "#/components/responses/InternalServerError500" + }, + "default": { + "$ref": "#/components/responses/DefaultError" + } + }, + "tags": [ + "Eval" + ], + "description": "Evaluate a list of rows on a benchmark.", + "parameters": [ + { + "name": "benchmark_id", + "in": "path", + "description": "The ID of the benchmark to run the evaluation on.", + "required": true, + "schema": { + "type": "string" + } + } + ], + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/EvaluateRowsRequest" + } + } + }, + "required": true + } + } + }, "/v1/agents/{agent_id}/session/{session_id}/turn/{turn_id}/step/{step_id}": { "get": { "responses": { @@ -1158,6 +1134,39 @@ } } ] + }, + "delete": { + "responses": { + "200": { + "description": "OK" + }, + "400": { + "$ref": "#/components/responses/BadRequest400" + }, + "429": { + "$ref": "#/components/responses/TooManyRequests429" + }, + "500": { + "$ref": "#/components/responses/InternalServerError500" + }, + "default": { + "$ref": "#/components/responses/DefaultError" + } + }, + "tags": [ + "Benchmarks" + ], + "description": "Unregister a benchmark by ID.", + "parameters": [ + { + "name": "benchmark_id", + "in": "path", + "required": true, + "schema": { + "type": "string" + } + } + ] } }, "/v1/datasets/{dataset_id}": { @@ -1235,6 +1244,83 @@ ] } }, 
+ "/v1/graders/{grader_id}": { + "get": { + "responses": { + "200": { + "description": "The grader.", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/Grader" + } + } + } + }, + "400": { + "$ref": "#/components/responses/BadRequest400" + }, + "429": { + "$ref": "#/components/responses/TooManyRequests429" + }, + "500": { + "$ref": "#/components/responses/InternalServerError500" + }, + "default": { + "$ref": "#/components/responses/DefaultError" + } + }, + "tags": [ + "Graders" + ], + "description": "Get a grader by ID.", + "parameters": [ + { + "name": "grader_id", + "in": "path", + "description": "The ID of the grader.", + "required": true, + "schema": { + "type": "string" + } + } + ] + }, + "delete": { + "responses": { + "200": { + "description": "OK" + }, + "400": { + "$ref": "#/components/responses/BadRequest400" + }, + "429": { + "$ref": "#/components/responses/TooManyRequests429" + }, + "500": { + "$ref": "#/components/responses/InternalServerError500" + }, + "default": { + "$ref": "#/components/responses/DefaultError" + } + }, + "tags": [ + "Graders" + ], + "description": "Unregister a grader by ID.", + "parameters": [ + { + "name": "grader_id", + "in": "path", + "description": "The ID of the grader.", + "required": true, + "schema": { + "type": "string" + } + } + ] + } + }, "/v1/models/{model_id}": { "get": { "responses": { @@ -1310,6 +1396,48 @@ ] } }, + "/v1/scoring-functions/{scoring_fn_id}": { + "get": { + "responses": { + "200": { + "description": "OK", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/ScoringFn" + } + } + } + }, + "400": { + "$ref": "#/components/responses/BadRequest400" + }, + "429": { + "$ref": "#/components/responses/TooManyRequests429" + }, + "500": { + "$ref": "#/components/responses/InternalServerError500" + }, + "default": { + "$ref": "#/components/responses/DefaultError" + } + }, + "tags": [ + "ScoringFunctions" + ], + "description": "", + "parameters": [ + 
{ + "name": "scoring_fn_id", + "in": "path", + "required": true, + "schema": { + "type": "string" + } + } + ] + } + }, "/v1/shields/{identifier}": { "get": { "responses": { @@ -2244,6 +2372,153 @@ ] } }, + "/v1/eval/benchmarks/{benchmark_id}/jobs/{job_id}": { + "get": { + "responses": { + "200": { + "description": "The status of the evaluation job.", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/JobStatus" + } + } + } + }, + "400": { + "$ref": "#/components/responses/BadRequest400" + }, + "429": { + "$ref": "#/components/responses/TooManyRequests429" + }, + "500": { + "$ref": "#/components/responses/InternalServerError500" + }, + "default": { + "$ref": "#/components/responses/DefaultError" + } + }, + "tags": [ + "Eval" + ], + "description": "Get the status of a job.", + "parameters": [ + { + "name": "benchmark_id", + "in": "path", + "description": "The ID of the benchmark to run the evaluation on.", + "required": true, + "schema": { + "type": "string" + } + }, + { + "name": "job_id", + "in": "path", + "description": "The ID of the job to get the status of.", + "required": true, + "schema": { + "type": "string" + } + } + ] + }, + "delete": { + "responses": { + "200": { + "description": "OK" + }, + "400": { + "$ref": "#/components/responses/BadRequest400" + }, + "429": { + "$ref": "#/components/responses/TooManyRequests429" + }, + "500": { + "$ref": "#/components/responses/InternalServerError500" + }, + "default": { + "$ref": "#/components/responses/DefaultError" + } + }, + "tags": [ + "Eval" + ], + "description": "Cancel a job.", + "parameters": [ + { + "name": "benchmark_id", + "in": "path", + "description": "The ID of the benchmark to run the evaluation on.", + "required": true, + "schema": { + "type": "string" + } + }, + { + "name": "job_id", + "in": "path", + "description": "The ID of the job to cancel.", + "required": true, + "schema": { + "type": "string" + } + } + ] + } + }, + 
"/v1/eval/benchmarks/{benchmark_id}/jobs/{job_id}/result": { + "get": { + "responses": { + "200": { + "description": "The result of the job.", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/EvaluateResponse" + } + } + } + }, + "400": { + "$ref": "#/components/responses/BadRequest400" + }, + "429": { + "$ref": "#/components/responses/TooManyRequests429" + }, + "500": { + "$ref": "#/components/responses/InternalServerError500" + }, + "default": { + "$ref": "#/components/responses/DefaultError" + } + }, + "tags": [ + "Eval" + ], + "description": "Get the result of a job.", + "parameters": [ + { + "name": "benchmark_id", + "in": "path", + "description": "The ID of the benchmark to run the evaluation on.", + "required": true, + "schema": { + "type": "string" + } + }, + { + "name": "job_id", + "in": "path", + "description": "The ID of the job to get the result of.", + "required": true, + "schema": { + "type": "string" + } + } + ] + } + }, "/v1/agents/{agent_id}/sessions": { "get": { "responses": { @@ -2517,9 +2792,9 @@ "200": { "description": "A list of graders.", "content": { - "application/jsonl": { + "application/json": { "schema": { - "$ref": "#/components/schemas/Grader" + "$ref": "#/components/schemas/ListGradersResponse" } } } @@ -2775,6 +3050,73 @@ ] } }, + "/v1/scoring-functions": { + "get": { + "responses": { + "200": { + "description": "OK", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/ListScoringFunctionsResponse" + } + } + } + }, + "400": { + "$ref": "#/components/responses/BadRequest400" + }, + "429": { + "$ref": "#/components/responses/TooManyRequests429" + }, + "500": { + "$ref": "#/components/responses/InternalServerError500" + }, + "default": { + "$ref": "#/components/responses/DefaultError" + } + }, + "tags": [ + "ScoringFunctions" + ], + "description": "", + "parameters": [] + }, + "post": { + "responses": { + "200": { + "description": "OK" + }, + "400": { + "$ref": 
"#/components/responses/BadRequest400" + }, + "429": { + "$ref": "#/components/responses/TooManyRequests429" + }, + "500": { + "$ref": "#/components/responses/InternalServerError500" + }, + "default": { + "$ref": "#/components/responses/DefaultError" + } + }, + "tags": [ + "ScoringFunctions" + ], + "description": "", + "parameters": [], + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/RegisterScoringFunctionRequest" + } + } + }, + "required": true + } + } + }, "/v1/shields": { "get": { "responses": { @@ -3402,6 +3744,59 @@ } } }, + "/v1/eval/benchmarks/{benchmark_id}/jobs": { + "post": { + "responses": { + "200": { + "description": "The job that was created to run the evaluation.", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/Job" + } + } + } + }, + "400": { + "$ref": "#/components/responses/BadRequest400" + }, + "429": { + "$ref": "#/components/responses/TooManyRequests429" + }, + "500": { + "$ref": "#/components/responses/InternalServerError500" + }, + "default": { + "$ref": "#/components/responses/DefaultError" + } + }, + "tags": [ + "Eval" + ], + "description": "Run an evaluation on a benchmark.", + "parameters": [ + { + "name": "benchmark_id", + "in": "path", + "description": "The ID of the benchmark to run the evaluation on.", + "required": true, + "schema": { + "type": "string" + } + } + ], + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/RunEvalRequest" + } + } + }, + "required": true + } + } + }, "/v1/safety/run-shield": { "post": { "responses": { @@ -3524,6 +3919,92 @@ } } }, + "/v1/scoring/score": { + "post": { + "responses": { + "200": { + "description": "ScoreResponse object containing rows and aggregated results", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/ScoreResponse" + } + } + } + }, + "400": { + "$ref": "#/components/responses/BadRequest400" + }, + "429": { 
+ "$ref": "#/components/responses/TooManyRequests429" + }, + "500": { + "$ref": "#/components/responses/InternalServerError500" + }, + "default": { + "$ref": "#/components/responses/DefaultError" + } + }, + "tags": [ + "Scoring" + ], + "description": "Score a list of rows.", + "parameters": [], + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/ScoreRequest" + } + } + }, + "required": true + } + } + }, + "/v1/scoring/score-batch": { + "post": { + "responses": { + "200": { + "description": "OK", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/ScoreBatchResponse" + } + } + } + }, + "400": { + "$ref": "#/components/responses/BadRequest400" + }, + "429": { + "$ref": "#/components/responses/TooManyRequests429" + }, + "500": { + "$ref": "#/components/responses/InternalServerError500" + }, + "default": { + "$ref": "#/components/responses/DefaultError" + } + }, + "tags": [ + "Scoring" + ], + "description": "", + "parameters": [], + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/ScoreBatchRequest" + } + } + }, + "required": true + } + } + }, "/v1/post-training/supervised-fine-tune": { "post": { "responses": { @@ -6149,6 +6630,381 @@ "title": "EmbeddingsResponse", "description": "Response containing generated embeddings." }, + "AgentCandidate": { + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "agent", + "default": "agent" + }, + "config": { + "$ref": "#/components/schemas/AgentConfig", + "description": "The configuration for the agent candidate." + } + }, + "additionalProperties": false, + "required": [ + "type", + "config" + ], + "title": "AgentCandidate", + "description": "An agent candidate for evaluation." 
+ }, + "AggregationFunctionType": { + "type": "string", + "enum": [ + "average", + "median", + "categorical_count", + "accuracy" + ], + "title": "AggregationFunctionType" + }, + "BasicScoringFnParams": { + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "basic", + "default": "basic" + }, + "aggregation_functions": { + "type": "array", + "items": { + "$ref": "#/components/schemas/AggregationFunctionType" + } + } + }, + "additionalProperties": false, + "required": [ + "type" + ], + "title": "BasicScoringFnParams" + }, + "BenchmarkConfig": { + "type": "object", + "properties": { + "eval_candidate": { + "$ref": "#/components/schemas/EvalCandidate", + "description": "The candidate to evaluate." + }, + "scoring_params": { + "type": "object", + "additionalProperties": { + "$ref": "#/components/schemas/ScoringFnParams" + }, + "description": "Map between scoring function id and parameters for each scoring function you want to run" + }, + "num_examples": { + "type": "integer", + "description": "(Optional) The number of examples to evaluate. If not provided, all examples in the dataset will be evaluated" + } + }, + "additionalProperties": false, + "required": [ + "eval_candidate", + "scoring_params" + ], + "title": "BenchmarkConfig", + "description": "A benchmark configuration for evaluation." 
+ }, + "EvalCandidate": { + "oneOf": [ + { + "$ref": "#/components/schemas/ModelCandidate" + }, + { + "$ref": "#/components/schemas/AgentCandidate" + } + ], + "discriminator": { + "propertyName": "type", + "mapping": { + "model": "#/components/schemas/ModelCandidate", + "agent": "#/components/schemas/AgentCandidate" + } + } + }, + "LLMAsJudgeScoringFnParams": { + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "llm_as_judge", + "default": "llm_as_judge" + }, + "judge_model": { + "type": "string" + }, + "prompt_template": { + "type": "string" + }, + "judge_score_regexes": { + "type": "array", + "items": { + "type": "string" + } + }, + "aggregation_functions": { + "type": "array", + "items": { + "$ref": "#/components/schemas/AggregationFunctionType" + } + } + }, + "additionalProperties": false, + "required": [ + "type", + "judge_model" + ], + "title": "LLMAsJudgeScoringFnParams" + }, + "ModelCandidate": { + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "model", + "default": "model" + }, + "model": { + "type": "string", + "description": "The model ID to evaluate." + }, + "sampling_params": { + "$ref": "#/components/schemas/SamplingParams", + "description": "The sampling parameters for the model." + }, + "system_message": { + "$ref": "#/components/schemas/SystemMessage", + "description": "(Optional) The system message providing instructions or context to the model." + } + }, + "additionalProperties": false, + "required": [ + "type", + "model", + "sampling_params" + ], + "title": "ModelCandidate", + "description": "A model candidate for evaluation." 
+ }, + "RegexParserScoringFnParams": { + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "regex_parser", + "default": "regex_parser" + }, + "parsing_regexes": { + "type": "array", + "items": { + "type": "string" + } + }, + "aggregation_functions": { + "type": "array", + "items": { + "$ref": "#/components/schemas/AggregationFunctionType" + } + } + }, + "additionalProperties": false, + "required": [ + "type" + ], + "title": "RegexParserScoringFnParams" + }, + "ScoringFnParams": { + "oneOf": [ + { + "$ref": "#/components/schemas/LLMAsJudgeScoringFnParams" + }, + { + "$ref": "#/components/schemas/RegexParserScoringFnParams" + }, + { + "$ref": "#/components/schemas/BasicScoringFnParams" + } + ], + "discriminator": { + "propertyName": "type", + "mapping": { + "llm_as_judge": "#/components/schemas/LLMAsJudgeScoringFnParams", + "regex_parser": "#/components/schemas/RegexParserScoringFnParams", + "basic": "#/components/schemas/BasicScoringFnParams" + } + } + }, + "EvaluateRowsRequest": { + "type": "object", + "properties": { + "input_rows": { + "type": "array", + "items": { + "type": "object", + "additionalProperties": { + "oneOf": [ + { + "type": "null" + }, + { + "type": "boolean" + }, + { + "type": "number" + }, + { + "type": "string" + }, + { + "type": "array" + }, + { + "type": "object" + } + ] + } + }, + "description": "The rows to evaluate." + }, + "scoring_functions": { + "type": "array", + "items": { + "type": "string" + }, + "description": "The scoring functions to use for the evaluation." + }, + "benchmark_config": { + "$ref": "#/components/schemas/BenchmarkConfig", + "description": "The configuration for the benchmark." 
+ } + }, + "additionalProperties": false, + "required": [ + "input_rows", + "scoring_functions", + "benchmark_config" + ], + "title": "EvaluateRowsRequest" + }, + "EvaluateResponse": { + "type": "object", + "properties": { + "generations": { + "type": "array", + "items": { + "type": "object", + "additionalProperties": { + "oneOf": [ + { + "type": "null" + }, + { + "type": "boolean" + }, + { + "type": "number" + }, + { + "type": "string" + }, + { + "type": "array" + }, + { + "type": "object" + } + ] + } + }, + "description": "The generations from the evaluation." + }, + "scores": { + "type": "object", + "additionalProperties": { + "$ref": "#/components/schemas/ScoringResult" + }, + "description": "The scores from the evaluation." + } + }, + "additionalProperties": false, + "required": [ + "generations", + "scores" + ], + "title": "EvaluateResponse", + "description": "The response from an evaluation." + }, + "ScoringResult": { + "type": "object", + "properties": { + "score_rows": { + "type": "array", + "items": { + "type": "object", + "additionalProperties": { + "oneOf": [ + { + "type": "null" + }, + { + "type": "boolean" + }, + { + "type": "number" + }, + { + "type": "string" + }, + { + "type": "array" + }, + { + "type": "object" + } + ] + } + }, + "description": "The scoring result for each row. Each row is a map of column name to value." + }, + "aggregated_results": { + "type": "object", + "additionalProperties": { + "oneOf": [ + { + "type": "null" + }, + { + "type": "boolean" + }, + { + "type": "number" + }, + { + "type": "string" + }, + { + "type": "array" + }, + { + "type": "object" + } + ] + }, + "description": "Map of metric name to aggregated value" + } + }, + "additionalProperties": false, + "required": [ + "score_rows", + "aggregated_results" + ], + "title": "ScoringResult", + "description": "A scoring result for a single row." 
+ }, "Agent": { "type": "object", "properties": { @@ -6876,6 +7732,268 @@ ], "title": "ModelType" }, + "AgentTurnInputType": { + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "agent_turn_input", + "default": "agent_turn_input" + } + }, + "additionalProperties": false, + "required": [ + "type" + ], + "title": "AgentTurnInputType" + }, + "ArrayType": { + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "array", + "default": "array" + } + }, + "additionalProperties": false, + "required": [ + "type" + ], + "title": "ArrayType" + }, + "BooleanType": { + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "boolean", + "default": "boolean" + } + }, + "additionalProperties": false, + "required": [ + "type" + ], + "title": "BooleanType" + }, + "ChatCompletionInputType": { + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "chat_completion_input", + "default": "chat_completion_input" + } + }, + "additionalProperties": false, + "required": [ + "type" + ], + "title": "ChatCompletionInputType" + }, + "CompletionInputType": { + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "completion_input", + "default": "completion_input" + } + }, + "additionalProperties": false, + "required": [ + "type" + ], + "title": "CompletionInputType" + }, + "JsonType": { + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "json", + "default": "json" + } + }, + "additionalProperties": false, + "required": [ + "type" + ], + "title": "JsonType" + }, + "NumberType": { + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "number", + "default": "number" + } + }, + "additionalProperties": false, + "required": [ + "type" + ], + "title": "NumberType" + }, + "ObjectType": { + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "object", + "default": "object" + } + }, + 
"additionalProperties": false, + "required": [ + "type" + ], + "title": "ObjectType" + }, + "ParamType": { + "oneOf": [ + { + "$ref": "#/components/schemas/StringType" + }, + { + "$ref": "#/components/schemas/NumberType" + }, + { + "$ref": "#/components/schemas/BooleanType" + }, + { + "$ref": "#/components/schemas/ArrayType" + }, + { + "$ref": "#/components/schemas/ObjectType" + }, + { + "$ref": "#/components/schemas/JsonType" + }, + { + "$ref": "#/components/schemas/UnionType" + }, + { + "$ref": "#/components/schemas/ChatCompletionInputType" + }, + { + "$ref": "#/components/schemas/CompletionInputType" + }, + { + "$ref": "#/components/schemas/AgentTurnInputType" + } + ], + "discriminator": { + "propertyName": "type", + "mapping": { + "string": "#/components/schemas/StringType", + "number": "#/components/schemas/NumberType", + "boolean": "#/components/schemas/BooleanType", + "array": "#/components/schemas/ArrayType", + "object": "#/components/schemas/ObjectType", + "json": "#/components/schemas/JsonType", + "union": "#/components/schemas/UnionType", + "chat_completion_input": "#/components/schemas/ChatCompletionInputType", + "completion_input": "#/components/schemas/CompletionInputType", + "agent_turn_input": "#/components/schemas/AgentTurnInputType" + } + } + }, + "ScoringFn": { + "type": "object", + "properties": { + "identifier": { + "type": "string" + }, + "provider_resource_id": { + "type": "string" + }, + "provider_id": { + "type": "string" + }, + "type": { + "type": "string", + "const": "scoring_function", + "default": "scoring_function" + }, + "description": { + "type": "string" + }, + "metadata": { + "type": "object", + "additionalProperties": { + "oneOf": [ + { + "type": "null" + }, + { + "type": "boolean" + }, + { + "type": "number" + }, + { + "type": "string" + }, + { + "type": "array" + }, + { + "type": "object" + } + ] + } + }, + "return_type": { + "$ref": "#/components/schemas/ParamType" + }, + "params": { + "$ref": 
"#/components/schemas/ScoringFnParams" + } + }, + "additionalProperties": false, + "required": [ + "identifier", + "provider_resource_id", + "provider_id", + "type", + "metadata", + "return_type" + ], + "title": "ScoringFn" + }, + "StringType": { + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "string", + "default": "string" + } + }, + "additionalProperties": false, + "required": [ + "type" + ], + "title": "StringType" + }, + "UnionType": { + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "union", + "default": "union" + } + }, + "additionalProperties": false, + "required": [ + "type" + ], + "title": "UnionType" + }, "Shield": { "type": "object", "properties": { @@ -7405,13 +8523,13 @@ ], "title": "VectorDB" }, - "BenchmarkTask": { + "BenchmarkEvaluationTask": { "type": "object", "properties": { "type": { "type": "string", - "const": "benchmark_id", - "default": "benchmark_id" + "const": "benchmark", + "default": "benchmark" }, "benchmark_id": { "type": "string" @@ -7422,15 +8540,15 @@ "type", "benchmark_id" ], - "title": "BenchmarkTask" + "title": "BenchmarkEvaluationTask" }, - "DataSourceGraderTask": { + "DataEvaluationTask": { "type": "object", "properties": { "type": { "type": "string", - "const": "data_source_grader", - "default": "data_source_grader" + "const": "data", + "default": "data" }, "data_source": { "$ref": "#/components/schemas/DataSource" @@ -7448,15 +8566,15 @@ "data_source", "grader_ids" ], - "title": "DataSourceGraderTask" + "title": "DataEvaluationTask" }, - "DatasetGraderTask": { + "DatasetEvaluationTask": { "type": "object", "properties": { "type": { "type": "string", - "const": "dataset_grader", - "default": "dataset_grader" + "const": "dataset", + "default": "dataset" }, "dataset_id": { "type": "string" @@ -7474,26 +8592,26 @@ "dataset_id", "grader_ids" ], - "title": "DatasetGraderTask" + "title": "DatasetEvaluationTask" }, "EvaluationTask": { "oneOf": [ { - "$ref": 
"#/components/schemas/BenchmarkTask" + "$ref": "#/components/schemas/BenchmarkEvaluationTask" }, { - "$ref": "#/components/schemas/DatasetGraderTask" + "$ref": "#/components/schemas/DatasetEvaluationTask" }, { - "$ref": "#/components/schemas/DataSourceGraderTask" + "$ref": "#/components/schemas/DataEvaluationTask" } ], "discriminator": { "propertyName": "type", "mapping": { - "benchmark_id": "#/components/schemas/BenchmarkTask", - "dataset_grader": "#/components/schemas/DatasetGraderTask", - "data_source_grader": "#/components/schemas/DataSourceGraderTask" + "benchmark": "#/components/schemas/BenchmarkEvaluationTask", + "dataset": "#/components/schemas/DatasetEvaluationTask", + "data": "#/components/schemas/DataEvaluationTask" } } }, @@ -7511,27 +8629,6 @@ ], "title": "GradeRequest" }, - "AgentCandidate": { - "type": "object", - "properties": { - "type": { - "type": "string", - "const": "agent", - "default": "agent" - }, - "config": { - "$ref": "#/components/schemas/AgentConfig", - "description": "The configuration for the agent candidate." - } - }, - "additionalProperties": false, - "required": [ - "type", - "config" - ], - "title": "AgentCandidate", - "description": "An agent candidate for evaluation." - }, "EvaluationCandidate": { "oneOf": [ { @@ -7572,10 +8669,10 @@ "format": "date-time", "description": "The time the job was created." }, - "ended_at": { + "completed_at": { "type": "string", "format": "date-time", - "description": "The time the job ended." + "description": "The time the job completed." }, "error": { "type": "string", @@ -7604,35 +8701,6 @@ ], "title": "EvaluationJob" }, - "ModelCandidate": { - "type": "object", - "properties": { - "type": { - "type": "string", - "const": "model", - "default": "model" - }, - "model_id": { - "type": "string" - }, - "sampling_params": { - "$ref": "#/components/schemas/SamplingParams", - "description": "The sampling parameters for the model." 
- }, - "system_message": { - "$ref": "#/components/schemas/SystemMessage", - "description": "(Optional) The system message providing instructions or context to the model." - } - }, - "additionalProperties": false, - "required": [ - "type", - "model_id", - "sampling_params" - ], - "title": "ModelCandidate", - "description": "A model candidate for evaluation." - }, "GradeSyncRequest": { "type": "object", "properties": { @@ -7695,73 +8763,6 @@ "title": "EvaluationResponse", "description": "A response to an inline evaluation." }, - "ScoringResult": { - "type": "object", - "properties": { - "scores": { - "type": "array", - "items": { - "type": "object", - "additionalProperties": { - "oneOf": [ - { - "type": "null" - }, - { - "type": "boolean" - }, - { - "type": "number" - }, - { - "type": "string" - }, - { - "type": "array" - }, - { - "type": "object" - } - ] - } - }, - "description": "The scoring result for each row. Each row is a map of grader column name to value." - }, - "metrics": { - "type": "object", - "additionalProperties": { - "oneOf": [ - { - "type": "null" - }, - { - "type": "boolean" - }, - { - "type": "number" - }, - { - "type": "string" - }, - { - "type": "array" - }, - { - "type": "object" - } - ] - }, - "description": "Map of metric name to aggregated value." - } - }, - "additionalProperties": false, - "required": [ - "scores", - "metrics" - ], - "title": "ScoringResult", - "description": "A scoring result for a single row." - }, "HealthInfo": { "type": "object", "properties": { @@ -8094,6 +9095,17 @@ "title": "IterrowsResponse", "description": "A paginated list of rows from a dataset." 
}, + "JobStatus": { + "type": "string", + "enum": [ + "completed", + "in_progress", + "failed", + "scheduled", + "cancelled" + ], + "title": "JobStatus" + }, "ListAgentSessionsResponse": { "type": "object", "properties": { @@ -8266,6 +9278,22 @@ ], "title": "ListGraderTypesResponse" }, + "ListGradersResponse": { + "type": "object", + "properties": { + "data": { + "type": "array", + "items": { + "$ref": "#/components/schemas/Grader" + } + } + }, + "additionalProperties": false, + "required": [ + "data" + ], + "title": "ListGradersResponse" + }, "ListModelsResponse": { "type": "object", "properties": { @@ -8338,6 +9366,22 @@ ], "title": "ListRoutesResponse" }, + "ListScoringFunctionsResponse": { + "type": "object", + "properties": { + "data": { + "type": "array", + "items": { + "$ref": "#/components/schemas/ScoringFn" + } + } + }, + "additionalProperties": false, + "required": [ + "data" + ], + "title": "ListScoringFunctionsResponse" + }, "ListShieldsResponse": { "type": "object", "properties": { @@ -9330,7 +10374,7 @@ "properties": { "dataset_id": { "type": "string", - "description": "The ID of the dataset to used to run the benchmark." + "description": "The ID of the dataset to be used to run the benchmark." 
}, "grader_ids": { "type": "array", @@ -9522,6 +10566,36 @@ ], "title": "RegisterModelRequest" }, + "RegisterScoringFunctionRequest": { + "type": "object", + "properties": { + "scoring_fn_id": { + "type": "string" + }, + "description": { + "type": "string" + }, + "return_type": { + "$ref": "#/components/schemas/ParamType" + }, + "provider_scoring_fn_id": { + "type": "string" + }, + "provider_id": { + "type": "string" + }, + "params": { + "$ref": "#/components/schemas/ScoringFnParams" + } + }, + "additionalProperties": false, + "required": [ + "scoring_fn_id", + "description", + "return_type" + ], + "title": "RegisterScoringFunctionRequest" + }, "RegisterShieldRequest": { "type": "object", "properties": { @@ -9677,6 +10751,33 @@ ], "title": "RunRequest" }, + "RunEvalRequest": { + "type": "object", + "properties": { + "benchmark_config": { + "$ref": "#/components/schemas/BenchmarkConfig", + "description": "The configuration for the benchmark." + } + }, + "additionalProperties": false, + "required": [ + "benchmark_config" + ], + "title": "RunEvalRequest" + }, + "Job": { + "type": "object", + "properties": { + "job_id": { + "type": "string" + } + }, + "additionalProperties": false, + "required": [ + "job_id" + ], + "title": "Job" + }, "RunShieldRequest": { "type": "object", "properties": { @@ -9782,6 +10883,128 @@ ], "title": "SaveSpansToDatasetRequest" }, + "ScoreRequest": { + "type": "object", + "properties": { + "input_rows": { + "type": "array", + "items": { + "type": "object", + "additionalProperties": { + "oneOf": [ + { + "type": "null" + }, + { + "type": "boolean" + }, + { + "type": "number" + }, + { + "type": "string" + }, + { + "type": "array" + }, + { + "type": "object" + } + ] + } + }, + "description": "The rows to score." 
+ }, + "scoring_functions": { + "type": "object", + "additionalProperties": { + "oneOf": [ + { + "$ref": "#/components/schemas/ScoringFnParams" + }, + { + "type": "null" + } + ] + }, + "description": "The scoring functions to use for the scoring." + } + }, + "additionalProperties": false, + "required": [ + "input_rows", + "scoring_functions" + ], + "title": "ScoreRequest" + }, + "ScoreResponse": { + "type": "object", + "properties": { + "results": { + "type": "object", + "additionalProperties": { + "$ref": "#/components/schemas/ScoringResult" + }, + "description": "A map of scoring function name to ScoringResult." + } + }, + "additionalProperties": false, + "required": [ + "results" + ], + "title": "ScoreResponse", + "description": "The response from scoring." + }, + "ScoreBatchRequest": { + "type": "object", + "properties": { + "dataset_id": { + "type": "string" + }, + "scoring_functions": { + "type": "object", + "additionalProperties": { + "oneOf": [ + { + "$ref": "#/components/schemas/ScoringFnParams" + }, + { + "type": "null" + } + ] + } + }, + "save_results_dataset": { + "type": "boolean" + } + }, + "additionalProperties": false, + "required": [ + "dataset_id", + "scoring_functions", + "save_results_dataset" + ], + "title": "ScoreBatchRequest" + }, + "ScoreBatchResponse": { + "type": "object", + "properties": { + "dataset_id": { + "type": "string" + }, + "results": { + "type": "object", + "additionalProperties": { + "$ref": "#/components/schemas/ScoringResult" + } + } + }, + "additionalProperties": false, + "required": [ + "results" + ], + "title": "ScoreBatchResponse" + }, "AlgorithmConfig": { "oneOf": [ { @@ -10144,6 +11367,10 @@ { "name": "Datasets" }, + { + "name": "Eval", + "x-displayName": "Llama Stack Evaluation API for running evaluations on model and agent candidates." 
+ }, { "name": "Evaluation" }, @@ -10174,6 +11401,12 @@ { "name": "Safety" }, + { + "name": "Scoring" + }, + { + "name": "ScoringFunctions" + }, { "name": "Shields" }, @@ -10205,6 +11438,7 @@ "Benchmarks", "DatasetIO", "Datasets", + "Eval", "Evaluation", "Files", "Graders", @@ -10214,6 +11448,8 @@ "PostTraining (Coming Soon)", "Providers", "Safety", + "Scoring", + "ScoringFunctions", "Shields", "SyntheticDataGeneration (Coming Soon)", "Telemetry", diff --git a/docs/_static/llama-stack-spec.yaml b/docs/_static/llama-stack-spec.yaml index b0db86389..ecc8104e1 100644 --- a/docs/_static/llama-stack-spec.yaml +++ b/docs/_static/llama-stack-spec.yaml @@ -589,59 +589,6 @@ paths: required: true schema: type: string - /v1/graders/{grader_id}: - get: - responses: - '200': - description: The grader. - content: - application/json: - schema: - $ref: '#/components/schemas/Grader' - '400': - $ref: '#/components/responses/BadRequest400' - '429': - $ref: >- - #/components/responses/TooManyRequests429 - '500': - $ref: >- - #/components/responses/InternalServerError500 - default: - $ref: '#/components/responses/DefaultError' - tags: - - Graders - description: Get a grader by ID. - parameters: - - name: grader_id - in: path - description: The ID of the grader. - required: true - schema: - type: string - delete: - responses: - '200': - description: OK - '400': - $ref: '#/components/responses/BadRequest400' - '429': - $ref: >- - #/components/responses/TooManyRequests429 - '500': - $ref: >- - #/components/responses/InternalServerError500 - default: - $ref: '#/components/responses/DefaultError' - tags: - - Graders - description: Delete a grader by ID. - parameters: - - name: grader_id - in: path - description: The ID of the grader. 
- required: true - schema: - type: string /v1/inference/embeddings: post: responses: @@ -675,6 +622,43 @@ paths: schema: $ref: '#/components/schemas/EmbeddingsRequest' required: true + /v1/eval/benchmarks/{benchmark_id}/evaluations: + post: + responses: + '200': + description: >- + EvaluateResponse object containing generations and scores + content: + application/json: + schema: + $ref: '#/components/schemas/EvaluateResponse' + '400': + $ref: '#/components/responses/BadRequest400' + '429': + $ref: >- + #/components/responses/TooManyRequests429 + '500': + $ref: >- + #/components/responses/InternalServerError500 + default: + $ref: '#/components/responses/DefaultError' + tags: + - Eval + description: Evaluate a list of rows on a benchmark. + parameters: + - name: benchmark_id + in: path + description: >- + The ID of the benchmark to run the evaluation on. + required: true + schema: + type: string + requestBody: + content: + application/json: + schema: + $ref: '#/components/schemas/EvaluateRowsRequest' + required: true /v1/agents/{agent_id}/session/{session_id}/turn/{turn_id}/step/{step_id}: get: responses: @@ -794,6 +778,29 @@ paths: required: true schema: type: string + delete: + responses: + '200': + description: OK + '400': + $ref: '#/components/responses/BadRequest400' + '429': + $ref: >- + #/components/responses/TooManyRequests429 + '500': + $ref: >- + #/components/responses/InternalServerError500 + default: + $ref: '#/components/responses/DefaultError' + tags: + - Benchmarks + description: Unregister a benchmark by ID. + parameters: + - name: benchmark_id + in: path + required: true + schema: + type: string /v1/datasets/{dataset_id}: get: responses: @@ -845,6 +852,59 @@ paths: required: true schema: type: string + /v1/graders/{grader_id}: + get: + responses: + '200': + description: The grader. 
+ content: + application/json: + schema: + $ref: '#/components/schemas/Grader' + '400': + $ref: '#/components/responses/BadRequest400' + '429': + $ref: >- + #/components/responses/TooManyRequests429 + '500': + $ref: >- + #/components/responses/InternalServerError500 + default: + $ref: '#/components/responses/DefaultError' + tags: + - Graders + description: Get a grader by ID. + parameters: + - name: grader_id + in: path + description: The ID of the grader. + required: true + schema: + type: string + delete: + responses: + '200': + description: OK + '400': + $ref: '#/components/responses/BadRequest400' + '429': + $ref: >- + #/components/responses/TooManyRequests429 + '500': + $ref: >- + #/components/responses/InternalServerError500 + default: + $ref: '#/components/responses/DefaultError' + tags: + - Graders + description: Unregister a grader by ID. + parameters: + - name: grader_id + in: path + description: The ID of the grader. + required: true + schema: + type: string /v1/models/{model_id}: get: responses: @@ -896,6 +956,34 @@ paths: required: true schema: type: string + /v1/scoring-functions/{scoring_fn_id}: + get: + responses: + '200': + description: OK + content: + application/json: + schema: + $ref: '#/components/schemas/ScoringFn' + '400': + $ref: '#/components/responses/BadRequest400' + '429': + $ref: >- + #/components/responses/TooManyRequests429 + '500': + $ref: >- + #/components/responses/InternalServerError500 + default: + $ref: '#/components/responses/DefaultError' + tags: + - ScoringFunctions + description: '' + parameters: + - name: scoring_fn_id + in: path + required: true + schema: + type: string /v1/shields/{identifier}: get: responses: @@ -1536,6 +1624,109 @@ paths: required: false schema: type: integer + /v1/eval/benchmarks/{benchmark_id}/jobs/{job_id}: + get: + responses: + '200': + description: The status of the evaluationjob. 
+ content: + application/json: + schema: + $ref: '#/components/schemas/JobStatus' + '400': + $ref: '#/components/responses/BadRequest400' + '429': + $ref: >- + #/components/responses/TooManyRequests429 + '500': + $ref: >- + #/components/responses/InternalServerError500 + default: + $ref: '#/components/responses/DefaultError' + tags: + - Eval + description: Get the status of a job. + parameters: + - name: benchmark_id + in: path + description: >- + The ID of the benchmark to run the evaluation on. + required: true + schema: + type: string + - name: job_id + in: path + description: The ID of the job to get the status of. + required: true + schema: + type: string + delete: + responses: + '200': + description: OK + '400': + $ref: '#/components/responses/BadRequest400' + '429': + $ref: >- + #/components/responses/TooManyRequests429 + '500': + $ref: >- + #/components/responses/InternalServerError500 + default: + $ref: '#/components/responses/DefaultError' + tags: + - Eval + description: Cancel a job. + parameters: + - name: benchmark_id + in: path + description: >- + The ID of the benchmark to run the evaluation on. + required: true + schema: + type: string + - name: job_id + in: path + description: The ID of the job to cancel. + required: true + schema: + type: string + /v1/eval/benchmarks/{benchmark_id}/jobs/{job_id}/result: + get: + responses: + '200': + description: The result of the job. + content: + application/json: + schema: + $ref: '#/components/schemas/EvaluateResponse' + '400': + $ref: '#/components/responses/BadRequest400' + '429': + $ref: >- + #/components/responses/TooManyRequests429 + '500': + $ref: >- + #/components/responses/InternalServerError500 + default: + $ref: '#/components/responses/DefaultError' + tags: + - Eval + description: Get the result of a job. + parameters: + - name: benchmark_id + in: path + description: >- + The ID of the benchmark to run the evaluation on. 
+ required: true + schema: + type: string + - name: job_id + in: path + description: The ID of the job to get the result of. + required: true + schema: + type: string /v1/agents/{agent_id}/sessions: get: responses: @@ -1727,9 +1918,9 @@ paths: '200': description: A list of graders. content: - application/jsonl: + application/json: schema: - $ref: '#/components/schemas/Grader' + $ref: '#/components/schemas/ListGradersResponse' '400': $ref: '#/components/responses/BadRequest400' '429': @@ -1902,6 +2093,53 @@ paths: required: false schema: $ref: '#/components/schemas/URL' + /v1/scoring-functions: + get: + responses: + '200': + description: OK + content: + application/json: + schema: + $ref: '#/components/schemas/ListScoringFunctionsResponse' + '400': + $ref: '#/components/responses/BadRequest400' + '429': + $ref: >- + #/components/responses/TooManyRequests429 + '500': + $ref: >- + #/components/responses/InternalServerError500 + default: + $ref: '#/components/responses/DefaultError' + tags: + - ScoringFunctions + description: '' + parameters: [] + post: + responses: + '200': + description: OK + '400': + $ref: '#/components/responses/BadRequest400' + '429': + $ref: >- + #/components/responses/TooManyRequests429 + '500': + $ref: >- + #/components/responses/InternalServerError500 + default: + $ref: '#/components/responses/DefaultError' + tags: + - ScoringFunctions + description: '' + parameters: [] + requestBody: + content: + application/json: + schema: + $ref: '#/components/schemas/RegisterScoringFunctionRequest' + required: true /v1/shields: get: responses: @@ -2336,6 +2574,43 @@ paths: schema: $ref: '#/components/schemas/RunRequest' required: true + /v1/eval/benchmarks/{benchmark_id}/jobs: + post: + responses: + '200': + description: >- + The job that was created to run the evaluation. 
+ content: + application/json: + schema: + $ref: '#/components/schemas/Job' + '400': + $ref: '#/components/responses/BadRequest400' + '429': + $ref: >- + #/components/responses/TooManyRequests429 + '500': + $ref: >- + #/components/responses/InternalServerError500 + default: + $ref: '#/components/responses/DefaultError' + tags: + - Eval + description: Run an evaluation on a benchmark. + parameters: + - name: benchmark_id + in: path + description: >- + The ID of the benchmark to run the evaluation on. + required: true + schema: + type: string + requestBody: + content: + application/json: + schema: + $ref: '#/components/schemas/RunEvalRequest' + required: true /v1/safety/run-shield: post: responses: @@ -2419,6 +2694,65 @@ paths: schema: $ref: '#/components/schemas/SaveSpansToDatasetRequest' required: true + /v1/scoring/score: + post: + responses: + '200': + description: >- + ScoreResponse object containing rows and aggregated results + content: + application/json: + schema: + $ref: '#/components/schemas/ScoreResponse' + '400': + $ref: '#/components/responses/BadRequest400' + '429': + $ref: >- + #/components/responses/TooManyRequests429 + '500': + $ref: >- + #/components/responses/InternalServerError500 + default: + $ref: '#/components/responses/DefaultError' + tags: + - Scoring + description: Score a list of rows. 
+ parameters: [] + requestBody: + content: + application/json: + schema: + $ref: '#/components/schemas/ScoreRequest' + required: true + /v1/scoring/score-batch: + post: + responses: + '200': + description: OK + content: + application/json: + schema: + $ref: '#/components/schemas/ScoreBatchResponse' + '400': + $ref: '#/components/responses/BadRequest400' + '429': + $ref: >- + #/components/responses/TooManyRequests429 + '500': + $ref: >- + #/components/responses/InternalServerError500 + default: + $ref: '#/components/responses/DefaultError' + tags: + - Scoring + description: '' + parameters: [] + requestBody: + content: + application/json: + schema: + $ref: '#/components/schemas/ScoreBatchRequest' + required: true /v1/post-training/supervised-fine-tune: post: responses: @@ -4308,6 +4642,251 @@ components: title: EmbeddingsResponse description: >- Response containing generated embeddings. + AgentCandidate: + type: object + properties: + type: + type: string + const: agent + default: agent + config: + $ref: '#/components/schemas/AgentConfig' + description: >- + The configuration for the agent candidate. + additionalProperties: false + required: + - type + - config + title: AgentCandidate + description: An agent candidate for evaluation. + AggregationFunctionType: + type: string + enum: + - average + - median + - categorical_count + - accuracy + title: AggregationFunctionType + BasicScoringFnParams: + type: object + properties: + type: + type: string + const: basic + default: basic + aggregation_functions: + type: array + items: + $ref: '#/components/schemas/AggregationFunctionType' + additionalProperties: false + required: + - type + title: BasicScoringFnParams + BenchmarkConfig: + type: object + properties: + eval_candidate: + $ref: '#/components/schemas/EvalCandidate' + description: The candidate to evaluate. 
+ scoring_params: + type: object + additionalProperties: + $ref: '#/components/schemas/ScoringFnParams' + description: >- + Map between scoring function id and parameters for each scoring function + you want to run + num_examples: + type: integer + description: >- + (Optional) The number of examples to evaluate. If not provided, all examples + in the dataset will be evaluated + additionalProperties: false + required: + - eval_candidate + - scoring_params + title: BenchmarkConfig + description: >- + A benchmark configuration for evaluation. + EvalCandidate: + oneOf: + - $ref: '#/components/schemas/ModelCandidate' + - $ref: '#/components/schemas/AgentCandidate' + discriminator: + propertyName: type + mapping: + model: '#/components/schemas/ModelCandidate' + agent: '#/components/schemas/AgentCandidate' + LLMAsJudgeScoringFnParams: + type: object + properties: + type: + type: string + const: llm_as_judge + default: llm_as_judge + judge_model: + type: string + prompt_template: + type: string + judge_score_regexes: + type: array + items: + type: string + aggregation_functions: + type: array + items: + $ref: '#/components/schemas/AggregationFunctionType' + additionalProperties: false + required: + - type + - judge_model + title: LLMAsJudgeScoringFnParams + ModelCandidate: + type: object + properties: + type: + type: string + const: model + default: model + model: + type: string + description: The model ID to evaluate. + sampling_params: + $ref: '#/components/schemas/SamplingParams' + description: The sampling parameters for the model. + system_message: + $ref: '#/components/schemas/SystemMessage' + description: >- + (Optional) The system message providing instructions or context to the + model. + additionalProperties: false + required: + - type + - model + - sampling_params + title: ModelCandidate + description: A model candidate for evaluation. 
+ RegexParserScoringFnParams: + type: object + properties: + type: + type: string + const: regex_parser + default: regex_parser + parsing_regexes: + type: array + items: + type: string + aggregation_functions: + type: array + items: + $ref: '#/components/schemas/AggregationFunctionType' + additionalProperties: false + required: + - type + title: RegexParserScoringFnParams + ScoringFnParams: + oneOf: + - $ref: '#/components/schemas/LLMAsJudgeScoringFnParams' + - $ref: '#/components/schemas/RegexParserScoringFnParams' + - $ref: '#/components/schemas/BasicScoringFnParams' + discriminator: + propertyName: type + mapping: + llm_as_judge: '#/components/schemas/LLMAsJudgeScoringFnParams' + regex_parser: '#/components/schemas/RegexParserScoringFnParams' + basic: '#/components/schemas/BasicScoringFnParams' + EvaluateRowsRequest: + type: object + properties: + input_rows: + type: array + items: + type: object + additionalProperties: + oneOf: + - type: 'null' + - type: boolean + - type: number + - type: string + - type: array + - type: object + description: The rows to evaluate. + scoring_functions: + type: array + items: + type: string + description: >- + The scoring functions to use for the evaluation. + benchmark_config: + $ref: '#/components/schemas/BenchmarkConfig' + description: The configuration for the benchmark. + additionalProperties: false + required: + - input_rows + - scoring_functions + - benchmark_config + title: EvaluateRowsRequest + EvaluateResponse: + type: object + properties: + generations: + type: array + items: + type: object + additionalProperties: + oneOf: + - type: 'null' + - type: boolean + - type: number + - type: string + - type: array + - type: object + description: The generations from the evaluation. + scores: + type: object + additionalProperties: + $ref: '#/components/schemas/ScoringResult' + description: The scores from the evaluation. 
+ additionalProperties: false + required: + - generations + - scores + title: EvaluateResponse + description: The response from an evaluation. + ScoringResult: + type: object + properties: + score_rows: + type: array + items: + type: object + additionalProperties: + oneOf: + - type: 'null' + - type: boolean + - type: number + - type: string + - type: array + - type: object + description: >- + The scoring result for each row. Each row is a map of column name to value. + aggregated_results: + type: object + additionalProperties: + oneOf: + - type: 'null' + - type: boolean + - type: number + - type: string + - type: array + - type: object + description: Map of metric name to aggregated value + additionalProperties: false + required: + - score_rows + - aggregated_results + title: ScoringResult + description: A scoring result for a single row. Agent: type: object properties: @@ -4803,6 +5382,179 @@ components: - llm - embedding title: ModelType + AgentTurnInputType: + type: object + properties: + type: + type: string + const: agent_turn_input + default: agent_turn_input + additionalProperties: false + required: + - type + title: AgentTurnInputType + ArrayType: + type: object + properties: + type: + type: string + const: array + default: array + additionalProperties: false + required: + - type + title: ArrayType + BooleanType: + type: object + properties: + type: + type: string + const: boolean + default: boolean + additionalProperties: false + required: + - type + title: BooleanType + ChatCompletionInputType: + type: object + properties: + type: + type: string + const: chat_completion_input + default: chat_completion_input + additionalProperties: false + required: + - type + title: ChatCompletionInputType + CompletionInputType: + type: object + properties: + type: + type: string + const: completion_input + default: completion_input + additionalProperties: false + required: + - type + title: CompletionInputType + JsonType: + type: object + properties: + type: + type: 
string + const: json + default: json + additionalProperties: false + required: + - type + title: JsonType + NumberType: + type: object + properties: + type: + type: string + const: number + default: number + additionalProperties: false + required: + - type + title: NumberType + ObjectType: + type: object + properties: + type: + type: string + const: object + default: object + additionalProperties: false + required: + - type + title: ObjectType + ParamType: + oneOf: + - $ref: '#/components/schemas/StringType' + - $ref: '#/components/schemas/NumberType' + - $ref: '#/components/schemas/BooleanType' + - $ref: '#/components/schemas/ArrayType' + - $ref: '#/components/schemas/ObjectType' + - $ref: '#/components/schemas/JsonType' + - $ref: '#/components/schemas/UnionType' + - $ref: '#/components/schemas/ChatCompletionInputType' + - $ref: '#/components/schemas/CompletionInputType' + - $ref: '#/components/schemas/AgentTurnInputType' + discriminator: + propertyName: type + mapping: + string: '#/components/schemas/StringType' + number: '#/components/schemas/NumberType' + boolean: '#/components/schemas/BooleanType' + array: '#/components/schemas/ArrayType' + object: '#/components/schemas/ObjectType' + json: '#/components/schemas/JsonType' + union: '#/components/schemas/UnionType' + chat_completion_input: '#/components/schemas/ChatCompletionInputType' + completion_input: '#/components/schemas/CompletionInputType' + agent_turn_input: '#/components/schemas/AgentTurnInputType' + ScoringFn: + type: object + properties: + identifier: + type: string + provider_resource_id: + type: string + provider_id: + type: string + type: + type: string + const: scoring_function + default: scoring_function + description: + type: string + metadata: + type: object + additionalProperties: + oneOf: + - type: 'null' + - type: boolean + - type: number + - type: string + - type: array + - type: object + return_type: + $ref: '#/components/schemas/ParamType' + params: + $ref: 
'#/components/schemas/ScoringFnParams' + additionalProperties: false + required: + - identifier + - provider_resource_id + - provider_id + - type + - metadata + - return_type + title: ScoringFn + StringType: + type: object + properties: + type: + type: string + const: string + default: string + additionalProperties: false + required: + - type + title: StringType + UnionType: + type: object + properties: + type: + type: string + const: union + default: union + additionalProperties: false + required: + - type + title: UnionType Shield: type: object properties: @@ -5141,27 +5893,27 @@ components: - embedding_model - embedding_dimension title: VectorDB - BenchmarkTask: + BenchmarkEvaluationTask: type: object properties: type: type: string - const: benchmark_id - default: benchmark_id + const: benchmark + default: benchmark benchmark_id: type: string additionalProperties: false required: - type - benchmark_id - title: BenchmarkTask - DataSourceGraderTask: + title: BenchmarkEvaluationTask + DataEvaluationTask: type: object properties: type: type: string - const: data_source_grader - default: data_source_grader + const: data + default: data data_source: $ref: '#/components/schemas/DataSource' grader_ids: @@ -5173,14 +5925,14 @@ components: - type - data_source - grader_ids - title: DataSourceGraderTask - DatasetGraderTask: + title: DataEvaluationTask + DatasetEvaluationTask: type: object properties: type: type: string - const: dataset_grader - default: dataset_grader + const: dataset + default: dataset dataset_id: type: string grader_ids: @@ -5192,18 +5944,18 @@ components: - type - dataset_id - grader_ids - title: DatasetGraderTask + title: DatasetEvaluationTask EvaluationTask: oneOf: - - $ref: '#/components/schemas/BenchmarkTask' - - $ref: '#/components/schemas/DatasetGraderTask' - - $ref: '#/components/schemas/DataSourceGraderTask' + - $ref: '#/components/schemas/BenchmarkEvaluationTask' + - $ref: '#/components/schemas/DatasetEvaluationTask' + - $ref: 
'#/components/schemas/DataEvaluationTask' discriminator: propertyName: type mapping: - benchmark_id: '#/components/schemas/BenchmarkTask' - dataset_grader: '#/components/schemas/DatasetGraderTask' - data_source_grader: '#/components/schemas/DataSourceGraderTask' + benchmark: '#/components/schemas/BenchmarkEvaluationTask' + dataset: '#/components/schemas/DatasetEvaluationTask' + data: '#/components/schemas/DataEvaluationTask' GradeRequest: type: object properties: @@ -5218,23 +5970,6 @@ components: required: - task title: GradeRequest - AgentCandidate: - type: object - properties: - type: - type: string - const: agent - default: agent - config: - $ref: '#/components/schemas/AgentConfig' - description: >- - The configuration for the agent candidate. - additionalProperties: false - required: - - type - - config - title: AgentCandidate - description: An agent candidate for evaluation. EvaluationCandidate: oneOf: - $ref: '#/components/schemas/ModelCandidate' @@ -5263,10 +5998,10 @@ components: type: string format: date-time description: The time the job was created. - ended_at: + completed_at: type: string format: date-time - description: The time the job ended. + description: The time the job completed. error: type: string description: >- @@ -5288,30 +6023,6 @@ components: - task - candidate title: EvaluationJob - ModelCandidate: - type: object - properties: - type: - type: string - const: model - default: model - model_id: - type: string - sampling_params: - $ref: '#/components/schemas/SamplingParams' - description: The sampling parameters for the model. - system_message: - $ref: '#/components/schemas/SystemMessage' - description: >- - (Optional) The system message providing instructions or context to the - model. - additionalProperties: false - required: - - type - - model_id - - sampling_params - title: ModelCandidate - description: A model candidate for evaluation. 
GradeSyncRequest: type: object properties: @@ -5355,41 +6066,6 @@ components: - scores title: EvaluationResponse description: A response to an inline evaluation. - ScoringResult: - type: object - properties: - scores: - type: array - items: - type: object - additionalProperties: - oneOf: - - type: 'null' - - type: boolean - - type: number - - type: string - - type: array - - type: object - description: >- - The scoring result for each row. Each row is a map of grader column name - to value. - metrics: - type: object - additionalProperties: - oneOf: - - type: 'null' - - type: boolean - - type: number - - type: string - - type: array - - type: object - description: Map of metric name to aggregated value. - additionalProperties: false - required: - - scores - - metrics - title: ScoringResult - description: A scoring result for a single row. HealthInfo: type: object properties: @@ -5576,6 +6252,15 @@ components: - data title: IterrowsResponse description: A paginated list of rows from a dataset. + JobStatus: + type: string + enum: + - completed + - in_progress + - failed + - scheduled + - cancelled + title: JobStatus ListAgentSessionsResponse: type: object properties: @@ -5707,6 +6392,17 @@ components: required: - data title: ListGraderTypesResponse + ListGradersResponse: + type: object + properties: + data: + type: array + items: + $ref: '#/components/schemas/Grader' + additionalProperties: false + required: + - data + title: ListGradersResponse ListModelsResponse: type: object properties: @@ -5757,6 +6453,17 @@ components: required: - data title: ListRoutesResponse + ListScoringFunctionsResponse: + type: object + properties: + data: + type: array + items: + $ref: '#/components/schemas/ScoringFn' + additionalProperties: false + required: + - data + title: ListScoringFunctionsResponse ListShieldsResponse: type: object properties: @@ -6394,7 +7101,7 @@ components: dataset_id: type: string description: >- - The ID of the dataset to used to run the benchmark. 
+ The ID of the dataset to be used to run the benchmark. grader_ids: type: array items: @@ -6532,6 +7239,27 @@ components: required: - model_id title: RegisterModelRequest + RegisterScoringFunctionRequest: + type: object + properties: + scoring_fn_id: + type: string + description: + type: string + return_type: + $ref: '#/components/schemas/ParamType' + provider_scoring_fn_id: + type: string + provider_id: + type: string + params: + $ref: '#/components/schemas/ScoringFnParams' + additionalProperties: false + required: + - scoring_fn_id + - description + - return_type + title: RegisterScoringFunctionRequest RegisterShieldRequest: type: object properties: @@ -6631,6 +7359,25 @@ components: - task - candidate title: RunRequest + RunEvalRequest: + type: object + properties: + benchmark_config: + $ref: '#/components/schemas/BenchmarkConfig' + description: The configuration for the benchmark. + additionalProperties: false + required: + - benchmark_config + title: RunEvalRequest + Job: + type: object + properties: + job_id: + type: string + additionalProperties: false + required: + - job_id + title: Job RunShieldRequest: type: object properties: @@ -6702,6 +7449,81 @@ components: - attributes_to_save - dataset_id title: SaveSpansToDatasetRequest + ScoreRequest: + type: object + properties: + input_rows: + type: array + items: + type: object + additionalProperties: + oneOf: + - type: 'null' + - type: boolean + - type: number + - type: string + - type: array + - type: object + description: The rows to score. + scoring_functions: + type: object + additionalProperties: + oneOf: + - $ref: '#/components/schemas/ScoringFnParams' + - type: 'null' + description: >- + The scoring functions to use for the scoring. 
+ additionalProperties: false + required: + - input_rows + - scoring_functions + title: ScoreRequest + ScoreResponse: + type: object + properties: + results: + type: object + additionalProperties: + $ref: '#/components/schemas/ScoringResult' + description: >- + A map of scoring function name to ScoringResult. + additionalProperties: false + required: + - results + title: ScoreResponse + description: The response from scoring. + ScoreBatchRequest: + type: object + properties: + dataset_id: + type: string + scoring_functions: + type: object + additionalProperties: + oneOf: + - $ref: '#/components/schemas/ScoringFnParams' + - type: 'null' + save_results_dataset: + type: boolean + additionalProperties: false + required: + - dataset_id + - scoring_functions + - save_results_dataset + title: ScoreBatchRequest + ScoreBatchResponse: + type: object + properties: + dataset_id: + type: string + results: + type: object + additionalProperties: + $ref: '#/components/schemas/ScoringResult' + additionalProperties: false + required: + - results + title: ScoreBatchResponse AlgorithmConfig: oneOf: - $ref: '#/components/schemas/LoraFinetuningConfig' @@ -6939,6 +7761,9 @@ tags: - name: Benchmarks - name: DatasetIO - name: Datasets + - name: Eval + x-displayName: >- + Llama Stack Evaluation API for running evaluations on model and agent candidates. - name: Evaluation - name: Files - name: Graders @@ -6961,6 +7786,8 @@ tags: x-displayName: >- Providers API for inspecting, listing, and modifying providers and their configurations. 
- name: Safety + - name: Scoring + - name: ScoringFunctions - name: Shields - name: SyntheticDataGeneration (Coming Soon) - name: Telemetry @@ -6976,6 +7803,7 @@ x-tagGroups: - Benchmarks - DatasetIO - Datasets + - Eval - Evaluation - Files - Graders @@ -6985,6 +7813,8 @@ x-tagGroups: - PostTraining (Coming Soon) - Providers - Safety + - Scoring + - ScoringFunctions - Shields - SyntheticDataGeneration (Coming Soon) - Telemetry diff --git a/llama_stack/apis/benchmarks/benchmarks.py b/llama_stack/apis/benchmarks/benchmarks.py index 11db4d350..8017e5c27 100644 --- a/llama_stack/apis/benchmarks/benchmarks.py +++ b/llama_stack/apis/benchmarks/benchmarks.py @@ -62,7 +62,7 @@ class Benchmarks(Protocol): """ Register a new benchmark. - :param dataset_id: The ID of the dataset to used to run the benchmark. + :param dataset_id: The ID of the dataset to be used to run the benchmark. :param grader_ids: List of grader ids to use for this benchmark. :param benchmark_id: (Optional) The ID of the benchmark to register. If not provided, an ID will be generated. :param metadata: (Optional) Metadata for this benchmark for additional descriptions. @@ -87,3 +87,10 @@ class Benchmarks(Protocol): :param benchmark_id: The ID of the benchmark to get. """ ... + + @webmethod(route="/benchmarks/{benchmark_id}", method="DELETE") + async def unregister_benchmark(self, benchmark_id: str) -> None: + """ + Unregister a benchmark by ID. + """ + ... diff --git a/llama_stack/apis/common/job_types.py b/llama_stack/apis/common/job_types.py index 57775754b..307e3fa54 100644 --- a/llama_stack/apis/common/job_types.py +++ b/llama_stack/apis/common/job_types.py @@ -5,7 +5,6 @@ # the root directory of this source tree. from datetime import datetime from enum import Enum -from typing import Optional from pydantic import BaseModel @@ -38,12 +37,12 @@ class CommonJobFields(BaseModel): :param id: The ID of the job. :param status: The status of the job. :param created_at: The time the job was created. 
- :param ended_at: The time the job ended. + :param completed_at: The time the job completed. :param error: If status of the job is failed, this will contain the error message. """ id: str status: JobStatus created_at: datetime - ended_at: Optional[datetime] = None - error: Optional[str] = None + completed_at: datetime | None = None + error: str | None = None diff --git a/llama_stack/apis/evaluation/evaluation.py b/llama_stack/apis/evaluation/evaluation.py index faa620872..e667acfd4 100644 --- a/llama_stack/apis/evaluation/evaluation.py +++ b/llama_stack/apis/evaluation/evaluation.py @@ -48,28 +48,28 @@ EvaluationCandidate = register_schema( @json_schema_type -class BenchmarkTask(BaseModel): - type: Literal["benchmark_id"] = "benchmark_id" +class BenchmarkEvaluationTask(BaseModel): + type: Literal["benchmark"] = "benchmark" benchmark_id: str @json_schema_type -class DatasetGraderTask(BaseModel): - type: Literal["dataset_grader"] = "dataset_grader" +class DatasetEvaluationTask(BaseModel): + type: Literal["dataset"] = "dataset" dataset_id: str grader_ids: List[str] @json_schema_type -class DataSourceGraderTask(BaseModel): - type: Literal["data_source_grader"] = "data_source_grader" +class DataEvaluationTask(BaseModel): + type: Literal["data"] = "data" data_source: DataSource grader_ids: List[str] EvaluationTask = register_schema( Annotated[ - Union[BenchmarkTask, DatasetGraderTask, DataSourceGraderTask], + Union[BenchmarkEvaluationTask, DatasetEvaluationTask, DataEvaluationTask], Field(discriminator="type"), ], name="EvaluationTask", diff --git a/llama_stack/apis/graders/graders.py b/llama_stack/apis/graders/graders.py index 98b85552a..522559c3f 100644 --- a/llama_stack/apis/graders/graders.py +++ b/llama_stack/apis/graders/graders.py @@ -29,6 +29,13 @@ from .graders import * # noqa: F401 F403 class GraderType(Enum): """ A type of grader. Each type is a criteria for evaluating answers. + + :cvar llm: Use an LLM to score the answer. 
+ :cvar regex_parser: Use a regex parser to score the answer. + :cvar equality: Check if the answer is equal to the reference answer. + :cvar subset_of: Check if the answer is a subset of the reference answer. + :cvar factuality: Check if the answer is factually correct using LLM as judge. + :cvar faithfulness: Check if the answer is faithful to the reference answer using LLM as judge. """ llm = "llm" @@ -221,9 +228,9 @@ class Graders(Protocol): ... @webmethod(route="/graders/{grader_id:path}", method="DELETE") - async def delete_grader(self, grader_id: str) -> None: + async def unregister_grader(self, grader_id: str) -> None: """ - Delete a grader by ID. + Unregister a grader by ID. :param grader_id: The ID of the grader. """ ... diff --git a/llama_stack/distribution/stack.py b/llama_stack/distribution/stack.py index b4862537a..a5fa0fe39 100644 --- a/llama_stack/distribution/stack.py +++ b/llama_stack/distribution/stack.py @@ -17,6 +17,7 @@ from llama_stack.apis.batch_inference import BatchInference from llama_stack.apis.benchmarks import Benchmarks from llama_stack.apis.datasetio import DatasetIO from llama_stack.apis.datasets import Datasets +from llama_stack.apis.eval import Eval from llama_stack.apis.evaluation import Evaluation from llama_stack.apis.files import Files from llama_stack.apis.graders import Graders @@ -26,6 +27,8 @@ from llama_stack.apis.models import Models from llama_stack.apis.post_training import PostTraining from llama_stack.apis.providers import Providers from llama_stack.apis.safety import Safety +from llama_stack.apis.scoring import Scoring +from llama_stack.apis.scoring_functions import ScoringFunctions from llama_stack.apis.shields import Shields from llama_stack.apis.synthetic_data_generation import SyntheticDataGeneration from llama_stack.apis.telemetry import Telemetry @@ -66,6 +69,9 @@ class LlamaStack( Files, Graders, Evaluation, + Eval, + ScoringFunctions, + Scoring, ): pass @@ -111,7 +117,9 @@ class EnvVarError(Exception): def 
__init__(self, var_name: str, path: str = ""): self.var_name = var_name self.path = path - super().__init__(f"Environment variable '{var_name}' not set or empty{f' at {path}' if path else ''}") + super().__init__( + f"Environment variable '{var_name}' not set or empty{f' at {path}' if path else ''}" + ) def redact_sensitive_fields(data: Dict[str, Any]) -> Dict[str, Any]: @@ -202,7 +210,9 @@ def validate_env_pair(env_pair: str) -> tuple[str, str]: if not key: raise ValueError(f"Empty key in environment variable pair: {env_pair}") if not all(c.isalnum() or c == "_" for c in key): - raise ValueError(f"Key must contain only alphanumeric characters and underscores: {key}") + raise ValueError( + f"Key must contain only alphanumeric characters and underscores: {key}" + ) return key, value except ValueError as e: raise ValueError( @@ -215,14 +225,20 @@ def validate_env_pair(env_pair: str) -> tuple[str, str]: async def construct_stack( run_config: StackRunConfig, provider_registry: Optional[ProviderRegistry] = None ) -> Dict[Api, Any]: - dist_registry, _ = await create_dist_registry(run_config.metadata_store, run_config.image_name) - impls = await resolve_impls(run_config, provider_registry or get_provider_registry(), dist_registry) + dist_registry, _ = await create_dist_registry( + run_config.metadata_store, run_config.image_name + ) + impls = await resolve_impls( + run_config, provider_registry or get_provider_registry(), dist_registry + ) await register_resources(run_config, impls) return impls def get_stack_run_config_from_template(template: str) -> StackRunConfig: - template_path = importlib.resources.files("llama_stack") / f"templates/{template}/run.yaml" + template_path = ( + importlib.resources.files("llama_stack") / f"templates/{template}/run.yaml" + ) with importlib.resources.as_file(template_path) as path: if not path.exists(): @@ -265,7 +281,9 @@ def run_config_from_adhoc_config_spec( # call method "sample_run_config" on the provider spec config class 
provider_config_type = instantiate_class_type(provider_spec.config_class) - provider_config = replace_env_vars(provider_config_type.sample_run_config(__distro_dir__=distro_dir)) + provider_config = replace_env_vars( + provider_config_type.sample_run_config(__distro_dir__=distro_dir) + ) provider_configs_by_api[api_str] = [ Provider(