diff --git a/docs/_static/llama-stack-spec.html b/docs/_static/llama-stack-spec.html
index 98270f7b8..b93f6a380 100644
--- a/docs/_static/llama-stack-spec.html
+++ b/docs/_static/llama-stack-spec.html
@@ -40,6 +40,286 @@
}
],
"paths": {
+ "/v1/eval/tasks/{task_id}/evaluations": {
+ "post": {
+ "responses": {
+ "200": {
+ "description": "OK",
+ "content": {
+ "application/json": {
+ "schema": {
+ "$ref": "#/components/schemas/EvaluateResponse"
+ }
+ }
+ }
+ }
+ },
+ "tags": [
+ "Eval"
+ ],
+ "description": "",
+ "parameters": [
+ {
+ "name": "task_id",
+ "in": "path",
+ "required": true,
+ "schema": {
+ "type": "string"
+ }
+ }
+ ],
+ "requestBody": {
+ "content": {
+ "application/json": {
+ "schema": {
+ "$ref": "#/components/schemas/DeprecatedEvaluateRowsRequest"
+ }
+ }
+ },
+ "required": true
+ },
+ "deprecated": true
+ }
+ },
+ "/v1/eval-tasks/{task_id}": {
+ "get": {
+ "responses": {
+ "200": {
+ "description": "OK",
+ "content": {
+ "application/json": {
+ "schema": {
+ "oneOf": [
+ {
+ "$ref": "#/components/schemas/Benchmark"
+ },
+ {
+ "type": "null"
+ }
+ ]
+ }
+ }
+ }
+ }
+ },
+ "tags": [
+ "Benchmarks"
+ ],
+ "description": "",
+ "parameters": [
+ {
+          "name": "task_id",
+          "in": "path",
+ "required": true,
+ "schema": {
+ "type": "string"
+ }
+ }
+ ],
+ "deprecated": true
+ }
+ },
+ "/v1/eval/tasks/{task_id}/jobs/{job_id}": {
+ "get": {
+ "responses": {
+ "200": {
+ "description": "OK",
+ "content": {
+ "application/json": {
+ "schema": {
+ "oneOf": [
+ {
+ "$ref": "#/components/schemas/JobStatus"
+ },
+ {
+ "type": "null"
+ }
+ ]
+ }
+ }
+ }
+ }
+ },
+ "tags": [
+ "Eval"
+ ],
+ "description": "",
+ "parameters": [
+ {
+ "name": "task_id",
+ "in": "path",
+ "required": true,
+ "schema": {
+ "type": "string"
+ }
+ },
+ {
+ "name": "job_id",
+ "in": "path",
+ "required": true,
+ "schema": {
+ "type": "string"
+ }
+ }
+ ],
+ "deprecated": true
+ },
+ "delete": {
+ "responses": {
+ "200": {
+ "description": "OK"
+ }
+ },
+ "tags": [
+ "Eval"
+ ],
+ "description": "",
+ "parameters": [
+ {
+ "name": "task_id",
+ "in": "path",
+ "required": true,
+ "schema": {
+ "type": "string"
+ }
+ },
+ {
+ "name": "job_id",
+ "in": "path",
+ "required": true,
+ "schema": {
+ "type": "string"
+ }
+ }
+ ],
+ "deprecated": true
+ }
+ },
+ "/v1/eval/tasks/{task_id}/jobs/{job_id}/result": {
+ "get": {
+ "responses": {
+ "200": {
+ "description": "OK",
+ "content": {
+ "application/json": {
+ "schema": {
+ "$ref": "#/components/schemas/EvaluateResponse"
+ }
+ }
+ }
+ }
+ },
+ "tags": [
+ "Eval"
+ ],
+ "description": "",
+ "parameters": [
+ {
+ "name": "task_id",
+ "in": "path",
+ "required": true,
+ "schema": {
+ "type": "string"
+ }
+ },
+ {
+ "name": "job_id",
+ "in": "path",
+ "required": true,
+ "schema": {
+ "type": "string"
+ }
+ }
+ ],
+ "deprecated": true
+ }
+ },
+ "/v1/eval-tasks": {
+ "get": {
+ "responses": {
+ "200": {
+ "description": "OK",
+ "content": {
+ "application/json": {
+ "schema": {
+ "$ref": "#/components/schemas/ListBenchmarksResponse"
+ }
+ }
+ }
+ }
+ },
+ "tags": [
+ "Benchmarks"
+ ],
+ "description": "",
+ "parameters": [],
+ "deprecated": true
+ },
+ "post": {
+ "responses": {
+ "200": {
+ "description": "OK"
+ }
+ },
+ "tags": [
+ "Benchmarks"
+ ],
+ "description": "",
+ "parameters": [],
+ "requestBody": {
+ "content": {
+ "application/json": {
+ "schema": {
+ "$ref": "#/components/schemas/DeprecatedRegisterEvalTaskRequest"
+ }
+ }
+ },
+ "required": true
+ },
+ "deprecated": true
+ }
+ },
+ "/v1/eval/tasks/{task_id}/jobs": {
+ "post": {
+ "responses": {
+ "200": {
+ "description": "OK",
+ "content": {
+ "application/json": {
+ "schema": {
+ "$ref": "#/components/schemas/Job"
+ }
+ }
+ }
+ }
+ },
+ "tags": [
+ "Eval"
+ ],
+ "description": "",
+ "parameters": [
+ {
+ "name": "task_id",
+ "in": "path",
+ "required": true,
+ "schema": {
+ "type": "string"
+ }
+ }
+ ],
+ "requestBody": {
+ "content": {
+ "application/json": {
+ "schema": {
+ "$ref": "#/components/schemas/DeprecatedRunEvalRequest"
+ }
+ }
+ },
+ "required": true
+ },
+ "deprecated": true
+ }
+ },
"/v1/datasetio/rows": {
"get": {
"responses": {
@@ -530,7 +810,7 @@
}
}
},
- "/v1/eval/tasks/{task_id}/evaluations": {
+ "/v1/eval/benchmarks/{benchmark_id}/evaluations": {
"post": {
"responses": {
"200": {
@@ -550,7 +830,7 @@
"description": "",
"parameters": [
{
- "name": "task_id",
+ "name": "benchmark_id",
"in": "path",
"required": true,
"schema": {
@@ -670,6 +950,43 @@
]
}
},
+ "/v1/eval/benchmarks/{benchmark_id}": {
+ "get": {
+ "responses": {
+ "200": {
+ "description": "OK",
+ "content": {
+ "application/json": {
+ "schema": {
+ "oneOf": [
+ {
+ "$ref": "#/components/schemas/Benchmark"
+ },
+ {
+ "type": "null"
+ }
+ ]
+ }
+ }
+ }
+ }
+ },
+ "tags": [
+ "Benchmarks"
+ ],
+ "description": "",
+ "parameters": [
+ {
+ "name": "benchmark_id",
+ "in": "path",
+ "required": true,
+ "schema": {
+ "type": "string"
+ }
+ }
+ ]
+ }
+ },
"/v1/datasets/{dataset_id}": {
"get": {
"responses": {
@@ -728,43 +1045,6 @@
]
}
},
- "/v1/eval-tasks/{eval_task_id}": {
- "get": {
- "responses": {
- "200": {
- "description": "OK",
- "content": {
- "application/json": {
- "schema": {
- "oneOf": [
- {
- "$ref": "#/components/schemas/EvalTask"
- },
- {
- "type": "null"
- }
- ]
- }
- }
- }
- }
- },
- "tags": [
- "EvalTasks"
- ],
- "description": "",
- "parameters": [
- {
- "name": "eval_task_id",
- "in": "path",
- "required": true,
- "schema": {
- "type": "string"
- }
- }
- ]
- }
- },
"/v1/models/{model_id}": {
"get": {
"responses": {
@@ -1348,7 +1628,7 @@
}
}
},
- "/v1/eval/tasks/{task_id}/jobs/{job_id}": {
+ "/v1/eval/benchmarks/{benchmark_id}/jobs/{job_id}": {
"get": {
"responses": {
"200": {
@@ -1375,7 +1655,7 @@
"description": "",
"parameters": [
{
- "name": "task_id",
+ "name": "benchmark_id",
"in": "path",
"required": true,
"schema": {
@@ -1404,7 +1684,7 @@
"description": "",
"parameters": [
{
- "name": "task_id",
+ "name": "benchmark_id",
"in": "path",
"required": true,
"schema": {
@@ -1422,7 +1702,7 @@
]
}
},
- "/v1/eval/tasks/{task_id}/jobs/{job_id}/result": {
+ "/v1/eval/benchmarks/{benchmark_id}/jobs/{job_id}/result": {
"get": {
"responses": {
"200": {
@@ -1442,7 +1722,7 @@
"description": "",
"parameters": [
{
- "name": "job_id",
+ "name": "benchmark_id",
"in": "path",
"required": true,
"schema": {
@@ -1450,7 +1730,7 @@
}
},
{
- "name": "task_id",
+ "name": "job_id",
"in": "path",
"required": true,
"schema": {
@@ -1460,6 +1740,49 @@
]
}
},
+ "/v1/eval/benchmarks": {
+ "get": {
+ "responses": {
+ "200": {
+ "description": "OK",
+ "content": {
+ "application/json": {
+ "schema": {
+ "$ref": "#/components/schemas/ListBenchmarksResponse"
+ }
+ }
+ }
+ }
+ },
+ "tags": [
+ "Benchmarks"
+ ],
+ "description": "",
+ "parameters": []
+ },
+ "post": {
+ "responses": {
+ "200": {
+ "description": "OK"
+ }
+ },
+ "tags": [
+ "Benchmarks"
+ ],
+ "description": "",
+ "parameters": [],
+ "requestBody": {
+ "content": {
+ "application/json": {
+ "schema": {
+ "$ref": "#/components/schemas/RegisterBenchmarkRequest"
+ }
+ }
+ },
+ "required": true
+ }
+ }
+ },
"/v1/datasets": {
"get": {
"responses": {
@@ -1503,49 +1826,6 @@
}
}
},
- "/v1/eval-tasks": {
- "get": {
- "responses": {
- "200": {
- "description": "OK",
- "content": {
- "application/json": {
- "schema": {
- "$ref": "#/components/schemas/ListEvalTasksResponse"
- }
- }
- }
- }
- },
- "tags": [
- "EvalTasks"
- ],
- "description": "",
- "parameters": []
- },
- "post": {
- "responses": {
- "200": {
- "description": "OK"
- }
- },
- "tags": [
- "EvalTasks"
- ],
- "description": "",
- "parameters": [],
- "requestBody": {
- "content": {
- "application/json": {
- "schema": {
- "$ref": "#/components/schemas/RegisterEvalTaskRequest"
- }
- }
- },
- "required": true
- }
- }
- },
"/v1/models": {
"get": {
"responses": {
@@ -2121,7 +2401,7 @@
]
}
},
- "/v1/eval/tasks/{task_id}/jobs": {
+ "/v1/eval/benchmarks/{benchmark_id}/jobs": {
"post": {
"responses": {
"200": {
@@ -2141,7 +2421,7 @@
"description": "",
"parameters": [
{
- "name": "task_id",
+ "name": "benchmark_id",
"in": "path",
"required": true,
"schema": {
@@ -2365,84 +2645,216 @@
"jsonSchemaDialect": "https://json-schema.org/draft/2020-12/schema",
"components": {
"schemas": {
- "AppendRowsRequest": {
+ "AgentCandidate": {
"type": "object",
"properties": {
- "dataset_id": {
- "type": "string"
+ "type": {
+ "type": "string",
+ "const": "agent",
+ "default": "agent"
},
- "rows": {
+ "config": {
+ "$ref": "#/components/schemas/AgentConfig"
+ }
+ },
+ "additionalProperties": false,
+ "required": [
+ "type",
+ "config"
+ ]
+ },
+ "AgentConfig": {
+ "type": "object",
+ "properties": {
+ "sampling_params": {
+ "$ref": "#/components/schemas/SamplingParams"
+ },
+ "input_shields": {
"type": "array",
"items": {
- "type": "object",
- "additionalProperties": {
- "oneOf": [
- {
- "type": "null"
- },
- {
- "type": "boolean"
- },
- {
- "type": "number"
- },
- {
- "type": "string"
- },
- {
- "type": "array"
- },
- {
- "type": "object"
- }
- ]
+ "type": "string"
+ }
+ },
+ "output_shields": {
+ "type": "array",
+ "items": {
+ "type": "string"
+ }
+ },
+ "toolgroups": {
+ "type": "array",
+ "items": {
+ "$ref": "#/components/schemas/AgentTool"
+ }
+ },
+ "client_tools": {
+ "type": "array",
+ "items": {
+ "$ref": "#/components/schemas/ToolDef"
+ }
+ },
+ "tool_choice": {
+ "type": "string",
+ "enum": [
+ "auto",
+ "required"
+ ],
+ "description": "Whether tool use is required or automatic. This is a hint to the model which may not be followed. It depends on the Instruction Following capabilities of the model."
+ },
+ "tool_prompt_format": {
+ "type": "string",
+ "enum": [
+ "json",
+ "function_tag",
+ "python_list"
+ ],
+ "description": "Prompt format for calling custom / zero shot tools."
+ },
+ "tool_config": {
+ "$ref": "#/components/schemas/ToolConfig"
+ },
+ "max_infer_iters": {
+ "type": "integer",
+ "default": 10
+ },
+ "model": {
+ "type": "string"
+ },
+ "instructions": {
+ "type": "string"
+ },
+ "enable_session_persistence": {
+ "type": "boolean"
+ },
+ "response_format": {
+ "$ref": "#/components/schemas/ResponseFormat"
+ }
+ },
+ "additionalProperties": false,
+ "required": [
+ "model",
+ "instructions",
+ "enable_session_persistence"
+ ]
+ },
+ "AgentTool": {
+ "oneOf": [
+ {
+ "type": "string"
+ },
+ {
+ "type": "object",
+ "properties": {
+ "name": {
+ "type": "string"
+ },
+ "args": {
+ "type": "object",
+ "additionalProperties": {
+ "oneOf": [
+ {
+ "type": "null"
+ },
+ {
+ "type": "boolean"
+ },
+ {
+ "type": "number"
+ },
+ {
+ "type": "string"
+ },
+ {
+ "type": "array"
+ },
+ {
+ "type": "object"
+ }
+ ]
+ }
}
+ },
+ "additionalProperties": false,
+ "required": [
+ "name",
+ "args"
+ ]
+ }
+ ]
+ },
+ "AggregationFunctionType": {
+ "type": "string",
+ "enum": [
+ "average",
+ "median",
+ "categorical_count",
+ "accuracy"
+ ]
+ },
+ "BasicScoringFnParams": {
+ "type": "object",
+ "properties": {
+ "type": {
+ "type": "string",
+ "const": "basic",
+ "default": "basic"
+ },
+ "aggregation_functions": {
+ "type": "array",
+ "items": {
+ "$ref": "#/components/schemas/AggregationFunctionType"
}
}
},
"additionalProperties": false,
"required": [
- "dataset_id",
- "rows"
+ "type"
]
},
- "CompletionMessage": {
+ "BenchmarkConfig": {
"type": "object",
"properties": {
- "role": {
+ "type": {
"type": "string",
- "const": "assistant",
- "default": "assistant",
- "description": "Must be \"assistant\" to identify this as the model's response"
+ "const": "benchmark",
+ "default": "benchmark"
},
- "content": {
- "$ref": "#/components/schemas/InterleavedContent",
- "description": "The content of the model's response"
+ "eval_candidate": {
+ "$ref": "#/components/schemas/EvalCandidate"
},
- "stop_reason": {
- "type": "string",
- "enum": [
- "end_of_turn",
- "end_of_message",
- "out_of_tokens"
- ],
- "description": "Reason why the model stopped generating. Options are: - `StopReason.end_of_turn`: The model finished generating the entire response. - `StopReason.end_of_message`: The model finished generating but generated a partial response -- usually, a tool call. The user may call the tool and continue the conversation with the tool's response. - `StopReason.out_of_tokens`: The model ran out of token budget."
+ "scoring_params": {
+ "type": "object",
+ "additionalProperties": {
+ "$ref": "#/components/schemas/ScoringFnParams"
+ }
},
- "tool_calls": {
- "type": "array",
- "items": {
- "$ref": "#/components/schemas/ToolCall"
- },
- "description": "List of tool calls. Each tool call is a ToolCall object."
+ "num_examples": {
+ "type": "integer"
}
},
"additionalProperties": false,
"required": [
- "role",
- "content",
- "stop_reason"
+ "type",
+ "eval_candidate",
+ "scoring_params"
+ ]
+ },
+ "EvalCandidate": {
+ "oneOf": [
+ {
+ "$ref": "#/components/schemas/ModelCandidate"
+ },
+ {
+ "$ref": "#/components/schemas/AgentCandidate"
+ }
],
- "description": "A message containing the model's (assistant) response in a chat conversation."
+ "discriminator": {
+ "propertyName": "type",
+ "mapping": {
+ "model": "#/components/schemas/ModelCandidate",
+ "agent": "#/components/schemas/AgentCandidate"
+ }
+ }
},
"GrammarResponseFormat": {
"type": "object",
@@ -2610,30 +3022,89 @@
],
"description": "Configuration for JSON schema-guided response generation."
},
- "Message": {
- "oneOf": [
- {
- "$ref": "#/components/schemas/UserMessage"
+ "LLMAsJudgeScoringFnParams": {
+ "type": "object",
+ "properties": {
+ "type": {
+ "type": "string",
+ "const": "llm_as_judge",
+ "default": "llm_as_judge"
},
- {
+ "judge_model": {
+ "type": "string"
+ },
+ "prompt_template": {
+ "type": "string"
+ },
+ "judge_score_regexes": {
+ "type": "array",
+ "items": {
+ "type": "string"
+ }
+ },
+ "aggregation_functions": {
+ "type": "array",
+ "items": {
+ "$ref": "#/components/schemas/AggregationFunctionType"
+ }
+ }
+ },
+ "additionalProperties": false,
+ "required": [
+ "type",
+ "judge_model"
+ ]
+ },
+ "ModelCandidate": {
+ "type": "object",
+ "properties": {
+ "type": {
+ "type": "string",
+ "const": "model",
+ "default": "model"
+ },
+ "model": {
+ "type": "string"
+ },
+ "sampling_params": {
+ "$ref": "#/components/schemas/SamplingParams"
+ },
+ "system_message": {
"$ref": "#/components/schemas/SystemMessage"
- },
- {
- "$ref": "#/components/schemas/ToolResponseMessage"
- },
- {
- "$ref": "#/components/schemas/CompletionMessage"
}
- ],
- "discriminator": {
- "propertyName": "role",
- "mapping": {
- "user": "#/components/schemas/UserMessage",
- "system": "#/components/schemas/SystemMessage",
- "tool": "#/components/schemas/ToolResponseMessage",
- "assistant": "#/components/schemas/CompletionMessage"
+ },
+ "additionalProperties": false,
+ "required": [
+ "type",
+ "model",
+ "sampling_params"
+ ]
+ },
+ "RegexParserScoringFnParams": {
+ "type": "object",
+ "properties": {
+ "type": {
+ "type": "string",
+ "const": "regex_parser",
+ "default": "regex_parser"
+ },
+ "parsing_regexes": {
+ "type": "array",
+ "items": {
+ "type": "string"
+ }
+ },
+ "aggregation_functions": {
+ "type": "array",
+ "items": {
+ "$ref": "#/components/schemas/AggregationFunctionType"
+ }
}
- }
+ },
+ "additionalProperties": false,
+ "required": [
+ "type"
+ ]
},
"ResponseFormat": {
"oneOf": [
@@ -2693,6 +3164,27 @@
}
}
},
+ "ScoringFnParams": {
+ "oneOf": [
+ {
+ "$ref": "#/components/schemas/LLMAsJudgeScoringFnParams"
+ },
+ {
+ "$ref": "#/components/schemas/RegexParserScoringFnParams"
+ },
+ {
+ "$ref": "#/components/schemas/BasicScoringFnParams"
+ }
+ ],
+ "discriminator": {
+ "propertyName": "type",
+ "mapping": {
+ "llm_as_judge": "#/components/schemas/LLMAsJudgeScoringFnParams",
+ "regex_parser": "#/components/schemas/RegexParserScoringFnParams",
+ "basic": "#/components/schemas/BasicScoringFnParams"
+ }
+ }
+ },
"SystemMessage": {
"type": "object",
"properties": {
@@ -2735,6 +3227,611 @@
],
"description": "A text content item"
},
+ "ToolConfig": {
+ "type": "object",
+ "properties": {
+ "tool_choice": {
+ "type": "string",
+ "enum": [
+ "auto",
+ "required"
+ ],
+ "description": "(Optional) Whether tool use is required or automatic. Defaults to ToolChoice.auto.",
+ "default": "auto"
+ },
+ "tool_prompt_format": {
+ "type": "string",
+ "enum": [
+ "json",
+ "function_tag",
+ "python_list"
+ ],
+ "description": "(Optional) Instructs the model how to format tool calls. By default, Llama Stack will attempt to use a format that is best adapted to the model. - `ToolPromptFormat.json`: The tool calls are formatted as a JSON object. - `ToolPromptFormat.function_tag`: The tool calls are enclosed in a tag. - `ToolPromptFormat.python_list`: The tool calls are output as Python syntax -- a list of function calls."
+ },
+ "system_message_behavior": {
+ "type": "string",
+ "enum": [
+ "append",
+ "replace"
+ ],
+ "description": "(Optional) Config for how to override the default system prompt. - `SystemMessageBehavior.append`: Appends the provided system message to the default system prompt. - `SystemMessageBehavior.replace`: Replaces the default system prompt with the provided system message. The system message can include the string '{{function_definitions}}' to indicate where the function definitions should be inserted.",
+ "default": "append"
+ }
+ },
+ "additionalProperties": false,
+ "required": [
+ "system_message_behavior"
+ ],
+ "description": "Configuration for tool use."
+ },
+ "ToolDef": {
+ "type": "object",
+ "properties": {
+ "name": {
+ "type": "string"
+ },
+ "description": {
+ "type": "string"
+ },
+ "parameters": {
+ "type": "array",
+ "items": {
+ "$ref": "#/components/schemas/ToolParameter"
+ }
+ },
+ "metadata": {
+ "type": "object",
+ "additionalProperties": {
+ "oneOf": [
+ {
+ "type": "null"
+ },
+ {
+ "type": "boolean"
+ },
+ {
+ "type": "number"
+ },
+ {
+ "type": "string"
+ },
+ {
+ "type": "array"
+ },
+ {
+ "type": "object"
+ }
+ ]
+ }
+ }
+ },
+ "additionalProperties": false,
+ "required": [
+ "name"
+ ]
+ },
+ "ToolParameter": {
+ "type": "object",
+ "properties": {
+ "name": {
+ "type": "string"
+ },
+ "parameter_type": {
+ "type": "string"
+ },
+ "description": {
+ "type": "string"
+ },
+ "required": {
+ "type": "boolean",
+ "default": true
+ },
+ "default": {
+ "oneOf": [
+ {
+ "type": "null"
+ },
+ {
+ "type": "boolean"
+ },
+ {
+ "type": "number"
+ },
+ {
+ "type": "string"
+ },
+ {
+ "type": "array"
+ },
+ {
+ "type": "object"
+ }
+ ]
+ }
+ },
+ "additionalProperties": false,
+ "required": [
+ "name",
+ "parameter_type",
+ "description",
+ "required"
+ ]
+ },
+ "TopKSamplingStrategy": {
+ "type": "object",
+ "properties": {
+ "type": {
+ "type": "string",
+ "const": "top_k",
+ "default": "top_k"
+ },
+ "top_k": {
+ "type": "integer"
+ }
+ },
+ "additionalProperties": false,
+ "required": [
+ "type",
+ "top_k"
+ ]
+ },
+ "TopPSamplingStrategy": {
+ "type": "object",
+ "properties": {
+ "type": {
+ "type": "string",
+ "const": "top_p",
+ "default": "top_p"
+ },
+ "temperature": {
+ "type": "number"
+ },
+ "top_p": {
+ "type": "number",
+ "default": 0.95
+ }
+ },
+ "additionalProperties": false,
+ "required": [
+ "type"
+ ]
+ },
+ "URL": {
+ "type": "object",
+ "properties": {
+ "uri": {
+ "type": "string"
+ }
+ },
+ "additionalProperties": false,
+ "required": [
+ "uri"
+ ]
+ },
+ "DeprecatedEvaluateRowsRequest": {
+ "type": "object",
+ "properties": {
+ "input_rows": {
+ "type": "array",
+ "items": {
+ "type": "object",
+ "additionalProperties": {
+ "oneOf": [
+ {
+ "type": "null"
+ },
+ {
+ "type": "boolean"
+ },
+ {
+ "type": "number"
+ },
+ {
+ "type": "string"
+ },
+ {
+ "type": "array"
+ },
+ {
+ "type": "object"
+ }
+ ]
+ }
+ }
+ },
+ "scoring_functions": {
+ "type": "array",
+ "items": {
+ "type": "string"
+ }
+ },
+ "task_config": {
+ "$ref": "#/components/schemas/BenchmarkConfig"
+ }
+ },
+ "additionalProperties": false,
+ "required": [
+ "input_rows",
+ "scoring_functions",
+ "task_config"
+ ]
+ },
+ "EvaluateResponse": {
+ "type": "object",
+ "properties": {
+ "generations": {
+ "type": "array",
+ "items": {
+ "type": "object",
+ "additionalProperties": {
+ "oneOf": [
+ {
+ "type": "null"
+ },
+ {
+ "type": "boolean"
+ },
+ {
+ "type": "number"
+ },
+ {
+ "type": "string"
+ },
+ {
+ "type": "array"
+ },
+ {
+ "type": "object"
+ }
+ ]
+ }
+ }
+ },
+ "scores": {
+ "type": "object",
+ "additionalProperties": {
+ "$ref": "#/components/schemas/ScoringResult"
+ }
+ }
+ },
+ "additionalProperties": false,
+ "required": [
+ "generations",
+ "scores"
+ ]
+ },
+ "ScoringResult": {
+ "type": "object",
+ "properties": {
+ "score_rows": {
+ "type": "array",
+ "items": {
+ "type": "object",
+ "additionalProperties": {
+ "oneOf": [
+ {
+ "type": "null"
+ },
+ {
+ "type": "boolean"
+ },
+ {
+ "type": "number"
+ },
+ {
+ "type": "string"
+ },
+ {
+ "type": "array"
+ },
+ {
+ "type": "object"
+ }
+ ]
+ }
+ }
+ },
+ "aggregated_results": {
+ "type": "object",
+ "additionalProperties": {
+ "oneOf": [
+ {
+ "type": "null"
+ },
+ {
+ "type": "boolean"
+ },
+ {
+ "type": "number"
+ },
+ {
+ "type": "string"
+ },
+ {
+ "type": "array"
+ },
+ {
+ "type": "object"
+ }
+ ]
+ }
+ }
+ },
+ "additionalProperties": false,
+ "required": [
+ "score_rows",
+ "aggregated_results"
+ ]
+ },
+ "Benchmark": {
+ "type": "object",
+ "properties": {
+ "identifier": {
+ "type": "string"
+ },
+ "provider_resource_id": {
+ "type": "string"
+ },
+ "provider_id": {
+ "type": "string"
+ },
+ "type": {
+ "type": "string",
+ "const": "benchmark",
+ "default": "benchmark"
+ },
+ "dataset_id": {
+ "type": "string"
+ },
+ "scoring_functions": {
+ "type": "array",
+ "items": {
+ "type": "string"
+ }
+ },
+ "metadata": {
+ "type": "object",
+ "additionalProperties": {
+ "oneOf": [
+ {
+ "type": "null"
+ },
+ {
+ "type": "boolean"
+ },
+ {
+ "type": "number"
+ },
+ {
+ "type": "string"
+ },
+ {
+ "type": "array"
+ },
+ {
+ "type": "object"
+ }
+ ]
+ }
+ }
+ },
+ "additionalProperties": false,
+ "required": [
+ "identifier",
+ "provider_resource_id",
+ "provider_id",
+ "type",
+ "dataset_id",
+ "scoring_functions",
+ "metadata"
+ ]
+ },
+ "JobStatus": {
+ "type": "string",
+ "enum": [
+ "completed",
+ "in_progress",
+ "failed",
+ "scheduled"
+ ]
+ },
+ "ListBenchmarksResponse": {
+ "type": "object",
+ "properties": {
+ "data": {
+ "type": "array",
+ "items": {
+ "$ref": "#/components/schemas/Benchmark"
+ }
+ }
+ },
+ "additionalProperties": false,
+ "required": [
+ "data"
+ ]
+ },
+ "DeprecatedRegisterEvalTaskRequest": {
+ "type": "object",
+ "properties": {
+ "eval_task_id": {
+ "type": "string"
+ },
+ "dataset_id": {
+ "type": "string"
+ },
+ "scoring_functions": {
+ "type": "array",
+ "items": {
+ "type": "string"
+ }
+ },
+ "provider_benchmark_id": {
+ "type": "string"
+ },
+ "provider_id": {
+ "type": "string"
+ },
+ "metadata": {
+ "type": "object",
+ "additionalProperties": {
+ "oneOf": [
+ {
+ "type": "null"
+ },
+ {
+ "type": "boolean"
+ },
+ {
+ "type": "number"
+ },
+ {
+ "type": "string"
+ },
+ {
+ "type": "array"
+ },
+ {
+ "type": "object"
+ }
+ ]
+ }
+ }
+ },
+ "additionalProperties": false,
+ "required": [
+ "eval_task_id",
+ "dataset_id",
+ "scoring_functions"
+ ]
+ },
+ "DeprecatedRunEvalRequest": {
+ "type": "object",
+ "properties": {
+ "task_config": {
+ "$ref": "#/components/schemas/BenchmarkConfig"
+ }
+ },
+ "additionalProperties": false,
+ "required": [
+ "task_config"
+ ]
+ },
+ "Job": {
+ "type": "object",
+ "properties": {
+ "job_id": {
+ "type": "string"
+ }
+ },
+ "additionalProperties": false,
+ "required": [
+ "job_id"
+ ]
+ },
+ "AppendRowsRequest": {
+ "type": "object",
+ "properties": {
+ "dataset_id": {
+ "type": "string"
+ },
+ "rows": {
+ "type": "array",
+ "items": {
+ "type": "object",
+ "additionalProperties": {
+ "oneOf": [
+ {
+ "type": "null"
+ },
+ {
+ "type": "boolean"
+ },
+ {
+ "type": "number"
+ },
+ {
+ "type": "string"
+ },
+ {
+ "type": "array"
+ },
+ {
+ "type": "object"
+ }
+ ]
+ }
+ }
+ }
+ },
+ "additionalProperties": false,
+ "required": [
+ "dataset_id",
+ "rows"
+ ]
+ },
+ "CompletionMessage": {
+ "type": "object",
+ "properties": {
+ "role": {
+ "type": "string",
+ "const": "assistant",
+ "default": "assistant",
+ "description": "Must be \"assistant\" to identify this as the model's response"
+ },
+ "content": {
+ "$ref": "#/components/schemas/InterleavedContent",
+ "description": "The content of the model's response"
+ },
+ "stop_reason": {
+ "type": "string",
+ "enum": [
+ "end_of_turn",
+ "end_of_message",
+ "out_of_tokens"
+ ],
+ "description": "Reason why the model stopped generating. Options are: - `StopReason.end_of_turn`: The model finished generating the entire response. - `StopReason.end_of_message`: The model finished generating but generated a partial response -- usually, a tool call. The user may call the tool and continue the conversation with the tool's response. - `StopReason.out_of_tokens`: The model ran out of token budget."
+ },
+ "tool_calls": {
+ "type": "array",
+ "items": {
+ "$ref": "#/components/schemas/ToolCall"
+ },
+ "description": "List of tool calls. Each tool call is a ToolCall object."
+ }
+ },
+ "additionalProperties": false,
+ "required": [
+ "role",
+ "content",
+ "stop_reason"
+ ],
+ "description": "A message containing the model's (assistant) response in a chat conversation."
+ },
+ "Message": {
+ "oneOf": [
+ {
+ "$ref": "#/components/schemas/UserMessage"
+ },
+ {
+ "$ref": "#/components/schemas/SystemMessage"
+ },
+ {
+ "$ref": "#/components/schemas/ToolResponseMessage"
+ },
+ {
+ "$ref": "#/components/schemas/CompletionMessage"
+ }
+ ],
+ "discriminator": {
+ "propertyName": "role",
+ "mapping": {
+ "user": "#/components/schemas/UserMessage",
+ "system": "#/components/schemas/SystemMessage",
+ "tool": "#/components/schemas/ToolResponseMessage",
+ "assistant": "#/components/schemas/CompletionMessage"
+ }
+ }
+ },
"ToolCall": {
"type": "object",
"properties": {
@@ -2950,57 +4047,6 @@
],
"description": "A message representing the result of a tool invocation."
},
- "TopKSamplingStrategy": {
- "type": "object",
- "properties": {
- "type": {
- "type": "string",
- "const": "top_k",
- "default": "top_k"
- },
- "top_k": {
- "type": "integer"
- }
- },
- "additionalProperties": false,
- "required": [
- "type",
- "top_k"
- ]
- },
- "TopPSamplingStrategy": {
- "type": "object",
- "properties": {
- "type": {
- "type": "string",
- "const": "top_p",
- "default": "top_p"
- },
- "temperature": {
- "type": "number"
- },
- "top_p": {
- "type": "number",
- "default": 0.95
- }
- },
- "additionalProperties": false,
- "required": [
- "type"
- ]
- },
- "URL": {
- "type": "object",
- "properties": {
- "uri": {
- "type": "string"
- }
- },
- "additionalProperties": false,
- "required": [
- "uri"
- ]
- },
"UserMessage": {
"type": "object",
"properties": {
@@ -3309,43 +4355,6 @@
"job_uuid"
]
},
- "ToolConfig": {
- "type": "object",
- "properties": {
- "tool_choice": {
- "type": "string",
- "enum": [
- "auto",
- "required"
- ],
- "description": "(Optional) Whether tool use is required or automatic. Defaults to ToolChoice.auto.",
- "default": "auto"
- },
- "tool_prompt_format": {
- "type": "string",
- "enum": [
- "json",
- "function_tag",
- "python_list"
- ],
- "description": "(Optional) Instructs the model how to format tool calls. By default, Llama Stack will attempt to use a format that is best adapted to the model. - `ToolPromptFormat.json`: The tool calls are formatted as a JSON object. - `ToolPromptFormat.function_tag`: The tool calls are enclosed in a tag. - `ToolPromptFormat.python_list`: The tool calls are output as Python syntax -- a list of function calls."
- },
- "system_message_behavior": {
- "type": "string",
- "enum": [
- "append",
- "replace"
- ],
- "description": "(Optional) Config for how to override the default system prompt. - `SystemMessageBehavior.append`: Appends the provided system message to the default system prompt. - `SystemMessageBehavior.replace`: Replaces the default system prompt with the provided system message. The system message can include the string '{{function_definitions}}' to indicate where the function definitions should be inserted.",
- "default": "append"
- }
- },
- "additionalProperties": false,
- "required": [
- "system_message_behavior"
- ],
- "description": "Configuration for tool use."
- },
"ChatCompletionRequest": {
"type": "object",
"properties": {
@@ -3644,218 +4653,6 @@
],
"description": "A chunk of a streamed completion response."
},
- "AgentConfig": {
- "type": "object",
- "properties": {
- "sampling_params": {
- "$ref": "#/components/schemas/SamplingParams"
- },
- "input_shields": {
- "type": "array",
- "items": {
- "type": "string"
- }
- },
- "output_shields": {
- "type": "array",
- "items": {
- "type": "string"
- }
- },
- "toolgroups": {
- "type": "array",
- "items": {
- "$ref": "#/components/schemas/AgentTool"
- }
- },
- "client_tools": {
- "type": "array",
- "items": {
- "$ref": "#/components/schemas/ToolDef"
- }
- },
- "tool_choice": {
- "type": "string",
- "enum": [
- "auto",
- "required"
- ],
- "description": "Whether tool use is required or automatic. This is a hint to the model which may not be followed. It depends on the Instruction Following capabilities of the model."
- },
- "tool_prompt_format": {
- "type": "string",
- "enum": [
- "json",
- "function_tag",
- "python_list"
- ],
- "description": "Prompt format for calling custom / zero shot tools."
- },
- "tool_config": {
- "$ref": "#/components/schemas/ToolConfig"
- },
- "max_infer_iters": {
- "type": "integer",
- "default": 10
- },
- "model": {
- "type": "string"
- },
- "instructions": {
- "type": "string"
- },
- "enable_session_persistence": {
- "type": "boolean"
- },
- "response_format": {
- "$ref": "#/components/schemas/ResponseFormat"
- }
- },
- "additionalProperties": false,
- "required": [
- "model",
- "instructions",
- "enable_session_persistence"
- ]
- },
- "AgentTool": {
- "oneOf": [
- {
- "type": "string"
- },
- {
- "type": "object",
- "properties": {
- "name": {
- "type": "string"
- },
- "args": {
- "type": "object",
- "additionalProperties": {
- "oneOf": [
- {
- "type": "null"
- },
- {
- "type": "boolean"
- },
- {
- "type": "number"
- },
- {
- "type": "string"
- },
- {
- "type": "array"
- },
- {
- "type": "object"
- }
- ]
- }
- }
- },
- "additionalProperties": false,
- "required": [
- "name",
- "args"
- ]
- }
- ]
- },
- "ToolDef": {
- "type": "object",
- "properties": {
- "name": {
- "type": "string"
- },
- "description": {
- "type": "string"
- },
- "parameters": {
- "type": "array",
- "items": {
- "$ref": "#/components/schemas/ToolParameter"
- }
- },
- "metadata": {
- "type": "object",
- "additionalProperties": {
- "oneOf": [
- {
- "type": "null"
- },
- {
- "type": "boolean"
- },
- {
- "type": "number"
- },
- {
- "type": "string"
- },
- {
- "type": "array"
- },
- {
- "type": "object"
- }
- ]
- }
- }
- },
- "additionalProperties": false,
- "required": [
- "name"
- ]
- },
- "ToolParameter": {
- "type": "object",
- "properties": {
- "name": {
- "type": "string"
- },
- "parameter_type": {
- "type": "string"
- },
- "description": {
- "type": "string"
- },
- "required": {
- "type": "boolean",
- "default": true
- },
- "default": {
- "oneOf": [
- {
- "type": "null"
- },
- {
- "type": "boolean"
- },
- {
- "type": "number"
- },
- {
- "type": "string"
- },
- {
- "type": "array"
- },
- {
- "type": "object"
- }
- ]
- }
- },
- "additionalProperties": false,
- "required": [
- "name",
- "parameter_type",
- "description",
- "required"
- ]
- },
"CreateAgentRequest": {
"type": "object",
"properties": {
@@ -4582,241 +5379,6 @@
],
"description": "Response containing generated embeddings."
},
- "AgentCandidate": {
- "type": "object",
- "properties": {
- "type": {
- "type": "string",
- "const": "agent",
- "default": "agent"
- },
- "config": {
- "$ref": "#/components/schemas/AgentConfig"
- }
- },
- "additionalProperties": false,
- "required": [
- "type",
- "config"
- ]
- },
- "AggregationFunctionType": {
- "type": "string",
- "enum": [
- "average",
- "median",
- "categorical_count",
- "accuracy"
- ]
- },
- "AppEvalTaskConfig": {
- "type": "object",
- "properties": {
- "type": {
- "type": "string",
- "const": "app",
- "default": "app"
- },
- "eval_candidate": {
- "$ref": "#/components/schemas/EvalCandidate"
- },
- "scoring_params": {
- "type": "object",
- "additionalProperties": {
- "$ref": "#/components/schemas/ScoringFnParams"
- }
- },
- "num_examples": {
- "type": "integer"
- }
- },
- "additionalProperties": false,
- "required": [
- "type",
- "eval_candidate",
- "scoring_params"
- ]
- },
- "BasicScoringFnParams": {
- "type": "object",
- "properties": {
- "type": {
- "type": "string",
- "const": "basic",
- "default": "basic"
- },
- "aggregation_functions": {
- "type": "array",
- "items": {
- "$ref": "#/components/schemas/AggregationFunctionType"
- }
- }
- },
- "additionalProperties": false,
- "required": [
- "type"
- ]
- },
- "BenchmarkEvalTaskConfig": {
- "type": "object",
- "properties": {
- "type": {
- "type": "string",
- "const": "benchmark",
- "default": "benchmark"
- },
- "eval_candidate": {
- "$ref": "#/components/schemas/EvalCandidate"
- },
- "num_examples": {
- "type": "integer"
- }
- },
- "additionalProperties": false,
- "required": [
- "type",
- "eval_candidate"
- ]
- },
- "EvalCandidate": {
- "oneOf": [
- {
- "$ref": "#/components/schemas/ModelCandidate"
- },
- {
- "$ref": "#/components/schemas/AgentCandidate"
- }
- ],
- "discriminator": {
- "propertyName": "type",
- "mapping": {
- "model": "#/components/schemas/ModelCandidate",
- "agent": "#/components/schemas/AgentCandidate"
- }
- }
- },
- "EvalTaskConfig": {
- "oneOf": [
- {
- "$ref": "#/components/schemas/BenchmarkEvalTaskConfig"
- },
- {
- "$ref": "#/components/schemas/AppEvalTaskConfig"
- }
- ],
- "discriminator": {
- "propertyName": "type",
- "mapping": {
- "benchmark": "#/components/schemas/BenchmarkEvalTaskConfig",
- "app": "#/components/schemas/AppEvalTaskConfig"
- }
- }
- },
- "LLMAsJudgeScoringFnParams": {
- "type": "object",
- "properties": {
- "type": {
- "type": "string",
- "const": "llm_as_judge",
- "default": "llm_as_judge"
- },
- "judge_model": {
- "type": "string"
- },
- "prompt_template": {
- "type": "string"
- },
- "judge_score_regexes": {
- "type": "array",
- "items": {
- "type": "string"
- }
- },
- "aggregation_functions": {
- "type": "array",
- "items": {
- "$ref": "#/components/schemas/AggregationFunctionType"
- }
- }
- },
- "additionalProperties": false,
- "required": [
- "type",
- "judge_model"
- ]
- },
- "ModelCandidate": {
- "type": "object",
- "properties": {
- "type": {
- "type": "string",
- "const": "model",
- "default": "model"
- },
- "model": {
- "type": "string"
- },
- "sampling_params": {
- "$ref": "#/components/schemas/SamplingParams"
- },
- "system_message": {
- "$ref": "#/components/schemas/SystemMessage"
- }
- },
- "additionalProperties": false,
- "required": [
- "type",
- "model",
- "sampling_params"
- ]
- },
- "RegexParserScoringFnParams": {
- "type": "object",
- "properties": {
- "type": {
- "type": "string",
- "const": "regex_parser",
- "default": "regex_parser"
- },
- "parsing_regexes": {
- "type": "array",
- "items": {
- "type": "string"
- }
- },
- "aggregation_functions": {
- "type": "array",
- "items": {
- "$ref": "#/components/schemas/AggregationFunctionType"
- }
- }
- },
- "additionalProperties": false,
- "required": [
- "type"
- ]
- },
- "ScoringFnParams": {
- "oneOf": [
- {
- "$ref": "#/components/schemas/LLMAsJudgeScoringFnParams"
- },
- {
- "$ref": "#/components/schemas/RegexParserScoringFnParams"
- },
- {
- "$ref": "#/components/schemas/BasicScoringFnParams"
- }
- ],
- "discriminator": {
- "propertyName": "type",
- "mapping": {
- "llm_as_judge": "#/components/schemas/LLMAsJudgeScoringFnParams",
- "regex_parser": "#/components/schemas/RegexParserScoringFnParams",
- "basic": "#/components/schemas/BasicScoringFnParams"
- }
- }
- },
"EvaluateRowsRequest": {
"type": "object",
"properties": {
@@ -4855,7 +5417,7 @@
}
},
"task_config": {
- "$ref": "#/components/schemas/EvalTaskConfig"
+ "$ref": "#/components/schemas/BenchmarkConfig"
}
},
"additionalProperties": false,
@@ -4865,113 +5427,6 @@
"task_config"
]
},
- "EvaluateResponse": {
- "type": "object",
- "properties": {
- "generations": {
- "type": "array",
- "items": {
- "type": "object",
- "additionalProperties": {
- "oneOf": [
- {
- "type": "null"
- },
- {
- "type": "boolean"
- },
- {
- "type": "number"
- },
- {
- "type": "string"
- },
- {
- "type": "array"
- },
- {
- "type": "object"
- }
- ]
- }
- }
- },
- "scores": {
- "type": "object",
- "additionalProperties": {
- "$ref": "#/components/schemas/ScoringResult"
- }
- }
- },
- "additionalProperties": false,
- "required": [
- "generations",
- "scores"
- ]
- },
- "ScoringResult": {
- "type": "object",
- "properties": {
- "score_rows": {
- "type": "array",
- "items": {
- "type": "object",
- "additionalProperties": {
- "oneOf": [
- {
- "type": "null"
- },
- {
- "type": "boolean"
- },
- {
- "type": "number"
- },
- {
- "type": "string"
- },
- {
- "type": "array"
- },
- {
- "type": "object"
- }
- ]
- }
- }
- },
- "aggregated_results": {
- "type": "object",
- "additionalProperties": {
- "oneOf": [
- {
- "type": "null"
- },
- {
- "type": "boolean"
- },
- {
- "type": "number"
- },
- {
- "type": "string"
- },
- {
- "type": "array"
- },
- {
- "type": "object"
- }
- ]
- }
- }
- },
- "additionalProperties": false,
- "required": [
- "score_rows",
- "aggregated_results"
- ]
- },
"Session": {
"type": "object",
"properties": {
@@ -5287,69 +5742,6 @@
"type"
]
},
- "EvalTask": {
- "type": "object",
- "properties": {
- "identifier": {
- "type": "string"
- },
- "provider_resource_id": {
- "type": "string"
- },
- "provider_id": {
- "type": "string"
- },
- "type": {
- "type": "string",
- "const": "eval_task",
- "default": "eval_task"
- },
- "dataset_id": {
- "type": "string"
- },
- "scoring_functions": {
- "type": "array",
- "items": {
- "type": "string"
- }
- },
- "metadata": {
- "type": "object",
- "additionalProperties": {
- "oneOf": [
- {
- "type": "null"
- },
- {
- "type": "boolean"
- },
- {
- "type": "number"
- },
- {
- "type": "string"
- },
- {
- "type": "array"
- },
- {
- "type": "object"
- }
- ]
- }
- }
- },
- "additionalProperties": false,
- "required": [
- "identifier",
- "provider_resource_id",
- "provider_id",
- "type",
- "dataset_id",
- "scoring_functions",
- "metadata"
- ]
- },
"Model": {
"type": "object",
"properties": {
@@ -5891,15 +6283,6 @@
],
"description": "Artifacts of a finetuning job."
},
- "JobStatus": {
- "type": "string",
- "enum": [
- "completed",
- "in_progress",
- "failed",
- "scheduled"
- ]
- },
"PostTrainingJobStatusResponse": {
"type": "object",
"properties": {
@@ -6243,21 +6626,6 @@
"data"
]
},
- "ListEvalTasksResponse": {
- "type": "object",
- "properties": {
- "data": {
- "type": "array",
- "items": {
- "$ref": "#/components/schemas/EvalTask"
- }
- }
- },
- "additionalProperties": false,
- "required": [
- "data"
- ]
- },
"ListModelsResponse": {
"type": "object",
"properties": {
@@ -7169,6 +7537,60 @@
"data"
]
},
+ "RegisterBenchmarkRequest": {
+ "type": "object",
+ "properties": {
+ "benchmark_id": {
+ "type": "string"
+ },
+ "dataset_id": {
+ "type": "string"
+ },
+ "scoring_functions": {
+ "type": "array",
+ "items": {
+ "type": "string"
+ }
+ },
+ "provider_benchmark_id": {
+ "type": "string"
+ },
+ "provider_id": {
+ "type": "string"
+ },
+ "metadata": {
+ "type": "object",
+ "additionalProperties": {
+ "oneOf": [
+ {
+ "type": "null"
+ },
+ {
+ "type": "boolean"
+ },
+ {
+ "type": "number"
+ },
+ {
+ "type": "string"
+ },
+ {
+ "type": "array"
+ },
+ {
+ "type": "object"
+ }
+ ]
+ }
+ }
+ },
+ "additionalProperties": false,
+ "required": [
+ "benchmark_id",
+ "dataset_id",
+ "scoring_functions"
+ ]
+ },
"RegisterDatasetRequest": {
"type": "object",
"properties": {
@@ -7223,60 +7645,6 @@
"url"
]
},
- "RegisterEvalTaskRequest": {
- "type": "object",
- "properties": {
- "eval_task_id": {
- "type": "string"
- },
- "dataset_id": {
- "type": "string"
- },
- "scoring_functions": {
- "type": "array",
- "items": {
- "type": "string"
- }
- },
- "provider_eval_task_id": {
- "type": "string"
- },
- "provider_id": {
- "type": "string"
- },
- "metadata": {
- "type": "object",
- "additionalProperties": {
- "oneOf": [
- {
- "type": "null"
- },
- {
- "type": "boolean"
- },
- {
- "type": "number"
- },
- {
- "type": "string"
- },
- {
- "type": "array"
- },
- {
- "type": "object"
- }
- ]
- }
- }
- },
- "additionalProperties": false,
- "required": [
- "eval_task_id",
- "dataset_id",
- "scoring_functions"
- ]
- },
"RegisterModelRequest": {
"type": "object",
"properties": {
@@ -7468,7 +7836,7 @@
"type": "object",
"properties": {
"task_config": {
- "$ref": "#/components/schemas/EvalTaskConfig"
+ "$ref": "#/components/schemas/BenchmarkConfig"
}
},
"additionalProperties": false,
@@ -7476,18 +7844,6 @@
"task_config"
]
},
- "Job": {
- "type": "object",
- "properties": {
- "job_id": {
- "type": "string"
- }
- },
- "additionalProperties": false,
- "required": [
- "job_id"
- ]
- },
"RunShieldRequest": {
"type": "object",
"properties": {
@@ -7970,6 +8326,9 @@
{
"name": "BatchInference (Coming Soon)"
},
+ {
+ "name": "Benchmarks"
+ },
{
"name": "DatasetIO"
},
@@ -7979,9 +8338,6 @@
{
"name": "Eval"
},
- {
- "name": "EvalTasks"
- },
{
"name": "Inference",
"description": "This API provides the raw interface to the underlying models. Two kinds of models are supported:\n- LLM models: these models generate \"raw\" and \"chat\" (conversational) completions.\n- Embedding models: these models generate embeddings to be used for semantic search.",
@@ -8033,10 +8389,10 @@
"tags": [
"Agents",
"BatchInference (Coming Soon)",
+ "Benchmarks",
"DatasetIO",
"Datasets",
"Eval",
- "EvalTasks",
"Inference",
"Inspect",
"Models",
diff --git a/docs/_static/llama-stack-spec.yaml b/docs/_static/llama-stack-spec.yaml
index a646d7e08..b30025020 100644
--- a/docs/_static/llama-stack-spec.yaml
+++ b/docs/_static/llama-stack-spec.yaml
@@ -10,6 +10,175 @@ info:
servers:
- url: http://any-hosted-llama-stack.com
paths:
+ /v1/eval/tasks/{task_id}/evaluations:
+ post:
+ responses:
+ '200':
+ description: OK
+ content:
+ application/json:
+ schema:
+ $ref: '#/components/schemas/EvaluateResponse'
+ tags:
+ - Eval
+ description: ''
+ parameters:
+ - name: task_id
+ in: path
+ required: true
+ schema:
+ type: string
+ requestBody:
+ content:
+ application/json:
+ schema:
+ $ref: '#/components/schemas/DeprecatedEvaluateRowsRequest'
+ required: true
+ deprecated: true
+ /v1/eval-tasks/{task_id}:
+ get:
+ responses:
+ '200':
+ description: OK
+ content:
+ application/json:
+ schema:
+ oneOf:
+ - $ref: '#/components/schemas/Benchmark'
+ - type: 'null'
+ tags:
+ - Benchmarks
+ description: ''
+ parameters:
+      - name: task_id
+        in: path
+ required: true
+ schema:
+ type: string
+ deprecated: true
+ /v1/eval/tasks/{task_id}/jobs/{job_id}:
+ get:
+ responses:
+ '200':
+ description: OK
+ content:
+ application/json:
+ schema:
+ oneOf:
+ - $ref: '#/components/schemas/JobStatus'
+ - type: 'null'
+ tags:
+ - Eval
+ description: ''
+ parameters:
+ - name: task_id
+ in: path
+ required: true
+ schema:
+ type: string
+ - name: job_id
+ in: path
+ required: true
+ schema:
+ type: string
+ deprecated: true
+ delete:
+ responses:
+ '200':
+ description: OK
+ tags:
+ - Eval
+ description: ''
+ parameters:
+ - name: task_id
+ in: path
+ required: true
+ schema:
+ type: string
+ - name: job_id
+ in: path
+ required: true
+ schema:
+ type: string
+ deprecated: true
+ /v1/eval/tasks/{task_id}/jobs/{job_id}/result:
+ get:
+ responses:
+ '200':
+ description: OK
+ content:
+ application/json:
+ schema:
+ $ref: '#/components/schemas/EvaluateResponse'
+ tags:
+ - Eval
+ description: ''
+ parameters:
+ - name: task_id
+ in: path
+ required: true
+ schema:
+ type: string
+ - name: job_id
+ in: path
+ required: true
+ schema:
+ type: string
+ deprecated: true
+ /v1/eval-tasks:
+ get:
+ responses:
+ '200':
+ description: OK
+ content:
+ application/json:
+ schema:
+ $ref: '#/components/schemas/ListBenchmarksResponse'
+ tags:
+ - Benchmarks
+ description: ''
+ parameters: []
+ deprecated: true
+ post:
+ responses:
+ '200':
+ description: OK
+ tags:
+ - Benchmarks
+ description: ''
+ parameters: []
+ requestBody:
+ content:
+ application/json:
+ schema:
+ $ref: '#/components/schemas/DeprecatedRegisterEvalTaskRequest'
+ required: true
+ deprecated: true
+ /v1/eval/tasks/{task_id}/jobs:
+ post:
+ responses:
+ '200':
+ description: OK
+ content:
+ application/json:
+ schema:
+ $ref: '#/components/schemas/Job'
+ tags:
+ - Eval
+ description: ''
+ parameters:
+ - name: task_id
+ in: path
+ required: true
+ schema:
+ type: string
+ requestBody:
+ content:
+ application/json:
+ schema:
+ $ref: '#/components/schemas/DeprecatedRunEvalRequest'
+ required: true
+ deprecated: true
/v1/datasetio/rows:
get:
responses:
@@ -322,7 +491,7 @@ paths:
schema:
$ref: '#/components/schemas/EmbeddingsRequest'
required: true
- /v1/eval/tasks/{task_id}/evaluations:
+ /v1/eval/benchmarks/{benchmark_id}/evaluations:
post:
responses:
'200':
@@ -335,7 +504,7 @@ paths:
- Eval
description: ''
parameters:
- - name: task_id
+ - name: benchmark_id
in: path
required: true
schema:
@@ -407,6 +576,26 @@ paths:
required: true
schema:
type: string
+ /v1/eval/benchmarks/{benchmark_id}:
+ get:
+ responses:
+ '200':
+ description: OK
+ content:
+ application/json:
+ schema:
+ oneOf:
+ - $ref: '#/components/schemas/Benchmark'
+ - type: 'null'
+ tags:
+ - Benchmarks
+ description: ''
+ parameters:
+ - name: benchmark_id
+ in: path
+ required: true
+ schema:
+ type: string
/v1/datasets/{dataset_id}:
get:
responses:
@@ -440,26 +629,6 @@ paths:
required: true
schema:
type: string
- /v1/eval-tasks/{eval_task_id}:
- get:
- responses:
- '200':
- description: OK
- content:
- application/json:
- schema:
- oneOf:
- - $ref: '#/components/schemas/EvalTask'
- - type: 'null'
- tags:
- - EvalTasks
- description: ''
- parameters:
- - name: eval_task_id
- in: path
- required: true
- schema:
- type: string
/v1/models/{model_id}:
get:
responses:
@@ -802,7 +971,7 @@ paths:
schema:
$ref: '#/components/schemas/InvokeToolRequest'
required: true
- /v1/eval/tasks/{task_id}/jobs/{job_id}:
+ /v1/eval/benchmarks/{benchmark_id}/jobs/{job_id}:
get:
responses:
'200':
@@ -817,7 +986,7 @@ paths:
- Eval
description: ''
parameters:
- - name: task_id
+ - name: benchmark_id
in: path
required: true
schema:
@@ -835,7 +1004,7 @@ paths:
- Eval
description: ''
parameters:
- - name: task_id
+ - name: benchmark_id
in: path
required: true
schema:
@@ -845,7 +1014,7 @@ paths:
required: true
schema:
type: string
- /v1/eval/tasks/{task_id}/jobs/{job_id}/result:
+ /v1/eval/benchmarks/{benchmark_id}/jobs/{job_id}/result:
get:
responses:
'200':
@@ -858,16 +1027,43 @@ paths:
- Eval
description: ''
parameters:
+ - name: benchmark_id
+ in: path
+ required: true
+ schema:
+ type: string
- name: job_id
in: path
required: true
schema:
type: string
- - name: task_id
- in: path
- required: true
- schema:
- type: string
+ /v1/eval/benchmarks:
+ get:
+ responses:
+ '200':
+ description: OK
+ content:
+ application/json:
+ schema:
+ $ref: '#/components/schemas/ListBenchmarksResponse'
+ tags:
+ - Benchmarks
+ description: ''
+ parameters: []
+ post:
+ responses:
+ '200':
+ description: OK
+ tags:
+ - Benchmarks
+ description: ''
+ parameters: []
+ requestBody:
+ content:
+ application/json:
+ schema:
+ $ref: '#/components/schemas/RegisterBenchmarkRequest'
+ required: true
/v1/datasets:
get:
responses:
@@ -895,33 +1091,6 @@ paths:
schema:
$ref: '#/components/schemas/RegisterDatasetRequest'
required: true
- /v1/eval-tasks:
- get:
- responses:
- '200':
- description: OK
- content:
- application/json:
- schema:
- $ref: '#/components/schemas/ListEvalTasksResponse'
- tags:
- - EvalTasks
- description: ''
- parameters: []
- post:
- responses:
- '200':
- description: OK
- tags:
- - EvalTasks
- description: ''
- parameters: []
- requestBody:
- content:
- application/json:
- schema:
- $ref: '#/components/schemas/RegisterEvalTaskRequest'
- required: true
/v1/models:
get:
responses:
@@ -1278,7 +1447,7 @@ paths:
type: array
items:
type: string
- /v1/eval/tasks/{task_id}/jobs:
+ /v1/eval/benchmarks/{benchmark_id}/jobs:
post:
responses:
'200':
@@ -1291,7 +1460,7 @@ paths:
- Eval
description: ''
parameters:
- - name: task_id
+ - name: benchmark_id
in: path
required: true
schema:
@@ -1429,65 +1598,146 @@ jsonSchemaDialect: >-
https://json-schema.org/draft/2020-12/schema
components:
schemas:
- AppendRowsRequest:
+ AgentCandidate:
type: object
properties:
- dataset_id:
+ type:
type: string
- rows:
- type: array
- items:
- type: object
- additionalProperties:
- oneOf:
- - type: 'null'
- - type: boolean
- - type: number
- - type: string
- - type: array
- - type: object
+ const: agent
+ default: agent
+ config:
+ $ref: '#/components/schemas/AgentConfig'
additionalProperties: false
required:
- - dataset_id
- - rows
- CompletionMessage:
+ - type
+ - config
+ AgentConfig:
type: object
properties:
- role:
- type: string
- const: assistant
- default: assistant
- description: >-
- Must be "assistant" to identify this as the model's response
- content:
- $ref: '#/components/schemas/InterleavedContent'
- description: The content of the model's response
- stop_reason:
+ sampling_params:
+ $ref: '#/components/schemas/SamplingParams'
+ input_shields:
+ type: array
+ items:
+ type: string
+ output_shields:
+ type: array
+ items:
+ type: string
+ toolgroups:
+ type: array
+ items:
+ $ref: '#/components/schemas/AgentTool'
+ client_tools:
+ type: array
+ items:
+ $ref: '#/components/schemas/ToolDef'
+ tool_choice:
type: string
enum:
- - end_of_turn
- - end_of_message
- - out_of_tokens
+ - auto
+ - required
description: >-
- Reason why the model stopped generating. Options are: - `StopReason.end_of_turn`:
- The model finished generating the entire response. - `StopReason.end_of_message`:
- The model finished generating but generated a partial response -- usually,
- a tool call. The user may call the tool and continue the conversation
- with the tool's response. - `StopReason.out_of_tokens`: The model ran
- out of token budget.
- tool_calls:
- type: array
- items:
- $ref: '#/components/schemas/ToolCall'
+ Whether tool use is required or automatic. This is a hint to the model
+ which may not be followed. It depends on the Instruction Following capabilities
+ of the model.
+ tool_prompt_format:
+ type: string
+ enum:
+ - json
+ - function_tag
+ - python_list
description: >-
- List of tool calls. Each tool call is a ToolCall object.
+ Prompt format for calling custom / zero shot tools.
+ tool_config:
+ $ref: '#/components/schemas/ToolConfig'
+ max_infer_iters:
+ type: integer
+ default: 10
+ model:
+ type: string
+ instructions:
+ type: string
+ enable_session_persistence:
+ type: boolean
+ response_format:
+ $ref: '#/components/schemas/ResponseFormat'
additionalProperties: false
required:
- - role
- - content
- - stop_reason
- description: >-
- A message containing the model's (assistant) response in a chat conversation.
+ - model
+ - instructions
+ - enable_session_persistence
+ AgentTool:
+ oneOf:
+ - type: string
+ - type: object
+ properties:
+ name:
+ type: string
+ args:
+ type: object
+ additionalProperties:
+ oneOf:
+ - type: 'null'
+ - type: boolean
+ - type: number
+ - type: string
+ - type: array
+ - type: object
+ additionalProperties: false
+ required:
+ - name
+ - args
+ AggregationFunctionType:
+ type: string
+ enum:
+ - average
+ - median
+ - categorical_count
+ - accuracy
+ BasicScoringFnParams:
+ type: object
+ properties:
+ type:
+ type: string
+ const: basic
+ default: basic
+ aggregation_functions:
+ type: array
+ items:
+ $ref: '#/components/schemas/AggregationFunctionType'
+ additionalProperties: false
+ required:
+ - type
+ BenchmarkConfig:
+ type: object
+ properties:
+ type:
+ type: string
+ const: benchmark
+ default: benchmark
+ eval_candidate:
+ $ref: '#/components/schemas/EvalCandidate'
+ scoring_params:
+ type: object
+ additionalProperties:
+ $ref: '#/components/schemas/ScoringFnParams'
+ num_examples:
+ type: integer
+ additionalProperties: false
+ required:
+ - type
+ - eval_candidate
+ - scoring_params
+ EvalCandidate:
+ oneOf:
+ - $ref: '#/components/schemas/ModelCandidate'
+ - $ref: '#/components/schemas/AgentCandidate'
+ discriminator:
+ propertyName: type
+ mapping:
+ model: '#/components/schemas/ModelCandidate'
+ agent: '#/components/schemas/AgentCandidate'
GrammarResponseFormat:
type: object
properties:
@@ -1598,19 +1848,65 @@ components:
- json_schema
description: >-
Configuration for JSON schema-guided response generation.
- Message:
- oneOf:
- - $ref: '#/components/schemas/UserMessage'
- - $ref: '#/components/schemas/SystemMessage'
- - $ref: '#/components/schemas/ToolResponseMessage'
- - $ref: '#/components/schemas/CompletionMessage'
- discriminator:
- propertyName: role
- mapping:
- user: '#/components/schemas/UserMessage'
- system: '#/components/schemas/SystemMessage'
- tool: '#/components/schemas/ToolResponseMessage'
- assistant: '#/components/schemas/CompletionMessage'
+ LLMAsJudgeScoringFnParams:
+ type: object
+ properties:
+ type:
+ type: string
+ const: llm_as_judge
+ default: llm_as_judge
+ judge_model:
+ type: string
+ prompt_template:
+ type: string
+ judge_score_regexes:
+ type: array
+ items:
+ type: string
+ aggregation_functions:
+ type: array
+ items:
+ $ref: '#/components/schemas/AggregationFunctionType'
+ additionalProperties: false
+ required:
+ - type
+ - judge_model
+ ModelCandidate:
+ type: object
+ properties:
+ type:
+ type: string
+ const: model
+ default: model
+ model:
+ type: string
+ sampling_params:
+ $ref: '#/components/schemas/SamplingParams'
+ system_message:
+ $ref: '#/components/schemas/SystemMessage'
+ additionalProperties: false
+ required:
+ - type
+ - model
+ - sampling_params
+ RegexParserScoringFnParams:
+ type: object
+ properties:
+ type:
+ type: string
+ const: regex_parser
+ default: regex_parser
+ parsing_regexes:
+ type: array
+ items:
+ type: string
+ aggregation_functions:
+ type: array
+ items:
+ $ref: '#/components/schemas/AggregationFunctionType'
+ additionalProperties: false
+ required:
+ - type
ResponseFormat:
oneOf:
- $ref: '#/components/schemas/JsonSchemaResponseFormat'
@@ -1645,6 +1941,17 @@ components:
greedy: '#/components/schemas/GreedySamplingStrategy'
top_p: '#/components/schemas/TopPSamplingStrategy'
top_k: '#/components/schemas/TopKSamplingStrategy'
+ ScoringFnParams:
+ oneOf:
+ - $ref: '#/components/schemas/LLMAsJudgeScoringFnParams'
+ - $ref: '#/components/schemas/RegexParserScoringFnParams'
+ - $ref: '#/components/schemas/BasicScoringFnParams'
+ discriminator:
+ propertyName: type
+ mapping:
+ llm_as_judge: '#/components/schemas/LLMAsJudgeScoringFnParams'
+ regex_parser: '#/components/schemas/RegexParserScoringFnParams'
+ basic: '#/components/schemas/BasicScoringFnParams'
SystemMessage:
type: object
properties:
@@ -1683,6 +1990,383 @@ components:
- type
- text
description: A text content item
+ ToolConfig:
+ type: object
+ properties:
+ tool_choice:
+ type: string
+ enum:
+ - auto
+ - required
+ description: >-
+ (Optional) Whether tool use is required or automatic. Defaults to ToolChoice.auto.
+ default: auto
+ tool_prompt_format:
+ type: string
+ enum:
+ - json
+ - function_tag
+ - python_list
+ description: >-
+ (Optional) Instructs the model how to format tool calls. By default, Llama
+ Stack will attempt to use a format that is best adapted to the model.
+ - `ToolPromptFormat.json`: The tool calls are formatted as a JSON object.
+ - `ToolPromptFormat.function_tag`: The tool calls are enclosed in a
+ tag. - `ToolPromptFormat.python_list`: The tool calls are output as Python
+ syntax -- a list of function calls.
+ system_message_behavior:
+ type: string
+ enum:
+ - append
+ - replace
+ description: >-
+ (Optional) Config for how to override the default system prompt. - `SystemMessageBehavior.append`:
+ Appends the provided system message to the default system prompt. - `SystemMessageBehavior.replace`:
+ Replaces the default system prompt with the provided system message. The
+ system message can include the string '{{function_definitions}}' to indicate
+ where the function definitions should be inserted.
+ default: append
+ additionalProperties: false
+ required:
+ - system_message_behavior
+ description: Configuration for tool use.
+ ToolDef:
+ type: object
+ properties:
+ name:
+ type: string
+ description:
+ type: string
+ parameters:
+ type: array
+ items:
+ $ref: '#/components/schemas/ToolParameter'
+ metadata:
+ type: object
+ additionalProperties:
+ oneOf:
+ - type: 'null'
+ - type: boolean
+ - type: number
+ - type: string
+ - type: array
+ - type: object
+ additionalProperties: false
+ required:
+ - name
+ ToolParameter:
+ type: object
+ properties:
+ name:
+ type: string
+ parameter_type:
+ type: string
+ description:
+ type: string
+ required:
+ type: boolean
+ default: true
+ default:
+ oneOf:
+ - type: 'null'
+ - type: boolean
+ - type: number
+ - type: string
+ - type: array
+ - type: object
+ additionalProperties: false
+ required:
+ - name
+ - parameter_type
+ - description
+ - required
+ TopKSamplingStrategy:
+ type: object
+ properties:
+ type:
+ type: string
+ const: top_k
+ default: top_k
+ top_k:
+ type: integer
+ additionalProperties: false
+ required:
+ - type
+ - top_k
+ TopPSamplingStrategy:
+ type: object
+ properties:
+ type:
+ type: string
+ const: top_p
+ default: top_p
+ temperature:
+ type: number
+ top_p:
+ type: number
+ default: 0.95
+ additionalProperties: false
+ required:
+ - type
+ URL:
+ type: object
+ properties:
+ uri:
+ type: string
+ additionalProperties: false
+ required:
+ - uri
+ DeprecatedEvaluateRowsRequest:
+ type: object
+ properties:
+ input_rows:
+ type: array
+ items:
+ type: object
+ additionalProperties:
+ oneOf:
+ - type: 'null'
+ - type: boolean
+ - type: number
+ - type: string
+ - type: array
+ - type: object
+ scoring_functions:
+ type: array
+ items:
+ type: string
+ task_config:
+ $ref: '#/components/schemas/BenchmarkConfig'
+ additionalProperties: false
+ required:
+ - input_rows
+ - scoring_functions
+ - task_config
+ EvaluateResponse:
+ type: object
+ properties:
+ generations:
+ type: array
+ items:
+ type: object
+ additionalProperties:
+ oneOf:
+ - type: 'null'
+ - type: boolean
+ - type: number
+ - type: string
+ - type: array
+ - type: object
+ scores:
+ type: object
+ additionalProperties:
+ $ref: '#/components/schemas/ScoringResult'
+ additionalProperties: false
+ required:
+ - generations
+ - scores
+ ScoringResult:
+ type: object
+ properties:
+ score_rows:
+ type: array
+ items:
+ type: object
+ additionalProperties:
+ oneOf:
+ - type: 'null'
+ - type: boolean
+ - type: number
+ - type: string
+ - type: array
+ - type: object
+ aggregated_results:
+ type: object
+ additionalProperties:
+ oneOf:
+ - type: 'null'
+ - type: boolean
+ - type: number
+ - type: string
+ - type: array
+ - type: object
+ additionalProperties: false
+ required:
+ - score_rows
+ - aggregated_results
+ Benchmark:
+ type: object
+ properties:
+ identifier:
+ type: string
+ provider_resource_id:
+ type: string
+ provider_id:
+ type: string
+ type:
+ type: string
+ const: benchmark
+ default: benchmark
+ dataset_id:
+ type: string
+ scoring_functions:
+ type: array
+ items:
+ type: string
+ metadata:
+ type: object
+ additionalProperties:
+ oneOf:
+ - type: 'null'
+ - type: boolean
+ - type: number
+ - type: string
+ - type: array
+ - type: object
+ additionalProperties: false
+ required:
+ - identifier
+ - provider_resource_id
+ - provider_id
+ - type
+ - dataset_id
+ - scoring_functions
+ - metadata
+ JobStatus:
+ type: string
+ enum:
+ - completed
+ - in_progress
+ - failed
+ - scheduled
+ ListBenchmarksResponse:
+ type: object
+ properties:
+ data:
+ type: array
+ items:
+ $ref: '#/components/schemas/Benchmark'
+ additionalProperties: false
+ required:
+ - data
+ DeprecatedRegisterEvalTaskRequest:
+ type: object
+ properties:
+ eval_task_id:
+ type: string
+ dataset_id:
+ type: string
+ scoring_functions:
+ type: array
+ items:
+ type: string
+ provider_benchmark_id:
+ type: string
+ provider_id:
+ type: string
+ metadata:
+ type: object
+ additionalProperties:
+ oneOf:
+ - type: 'null'
+ - type: boolean
+ - type: number
+ - type: string
+ - type: array
+ - type: object
+ additionalProperties: false
+ required:
+ - eval_task_id
+ - dataset_id
+ - scoring_functions
+ DeprecatedRunEvalRequest:
+ type: object
+ properties:
+ task_config:
+ $ref: '#/components/schemas/BenchmarkConfig'
+ additionalProperties: false
+ required:
+ - task_config
+ Job:
+ type: object
+ properties:
+ job_id:
+ type: string
+ additionalProperties: false
+ required:
+ - job_id
+ AppendRowsRequest:
+ type: object
+ properties:
+ dataset_id:
+ type: string
+ rows:
+ type: array
+ items:
+ type: object
+ additionalProperties:
+ oneOf:
+ - type: 'null'
+ - type: boolean
+ - type: number
+ - type: string
+ - type: array
+ - type: object
+ additionalProperties: false
+ required:
+ - dataset_id
+ - rows
+ CompletionMessage:
+ type: object
+ properties:
+ role:
+ type: string
+ const: assistant
+ default: assistant
+ description: >-
+ Must be "assistant" to identify this as the model's response
+ content:
+ $ref: '#/components/schemas/InterleavedContent'
+ description: The content of the model's response
+ stop_reason:
+ type: string
+ enum:
+ - end_of_turn
+ - end_of_message
+ - out_of_tokens
+ description: >-
+ Reason why the model stopped generating. Options are: - `StopReason.end_of_turn`:
+ The model finished generating the entire response. - `StopReason.end_of_message`:
+ The model finished generating but generated a partial response -- usually,
+ a tool call. The user may call the tool and continue the conversation
+ with the tool's response. - `StopReason.out_of_tokens`: The model ran
+ out of token budget.
+ tool_calls:
+ type: array
+ items:
+ $ref: '#/components/schemas/ToolCall'
+ description: >-
+ List of tool calls. Each tool call is a ToolCall object.
+ additionalProperties: false
+ required:
+ - role
+ - content
+ - stop_reason
+ description: >-
+ A message containing the model's (assistant) response in a chat conversation.
+ Message:
+ oneOf:
+ - $ref: '#/components/schemas/UserMessage'
+ - $ref: '#/components/schemas/SystemMessage'
+ - $ref: '#/components/schemas/ToolResponseMessage'
+ - $ref: '#/components/schemas/CompletionMessage'
+ discriminator:
+ propertyName: role
+ mapping:
+ user: '#/components/schemas/UserMessage'
+ system: '#/components/schemas/SystemMessage'
+ tool: '#/components/schemas/ToolResponseMessage'
+ assistant: '#/components/schemas/CompletionMessage'
ToolCall:
type: object
properties:
@@ -1803,42 +2487,6 @@ components:
- content
description: >-
A message representing the result of a tool invocation.
- TopKSamplingStrategy:
- type: object
- properties:
- type:
- type: string
- const: top_k
- default: top_k
- top_k:
- type: integer
- additionalProperties: false
- required:
- - type
- - top_k
- TopPSamplingStrategy:
- type: object
- properties:
- type:
- type: string
- const: top_p
- default: top_p
- temperature:
- type: number
- top_p:
- type: number
- default: 0.95
- additionalProperties: false
- required:
- - type
- URL:
- type: object
- properties:
- uri:
- type: string
- additionalProperties: false
- required:
- - uri
UserMessage:
type: object
properties:
@@ -2063,46 +2711,6 @@ components:
additionalProperties: false
required:
- job_uuid
- ToolConfig:
- type: object
- properties:
- tool_choice:
- type: string
- enum:
- - auto
- - required
- description: >-
- (Optional) Whether tool use is required or automatic. Defaults to ToolChoice.auto.
- default: auto
- tool_prompt_format:
- type: string
- enum:
- - json
- - function_tag
- - python_list
- description: >-
- (Optional) Instructs the model how to format tool calls. By default, Llama
- Stack will attempt to use a format that is best adapted to the model.
- - `ToolPromptFormat.json`: The tool calls are formatted as a JSON object.
- - `ToolPromptFormat.function_tag`: The tool calls are enclosed in a <function=function_name>
- tag. - `ToolPromptFormat.python_list`: The tool calls are output as Python
- syntax -- a list of function calls.
- system_message_behavior:
- type: string
- enum:
- - append
- - replace
- description: >-
- (Optional) Config for how to override the default system prompt. - `SystemMessageBehavior.append`:
- Appends the provided system message to the default system prompt. - `SystemMessageBehavior.replace`:
- Replaces the default system prompt with the provided system message. The
- system message can include the string '{{function_definitions}}' to indicate
- where the function definitions should be inserted.
- default: append
- additionalProperties: false
- required:
- - system_message_behavior
- description: Configuration for tool use.
ChatCompletionRequest:
type: object
properties:
@@ -2356,133 +2964,6 @@ components:
- delta
description: >-
A chunk of a streamed completion response.
- AgentConfig:
- type: object
- properties:
- sampling_params:
- $ref: '#/components/schemas/SamplingParams'
- input_shields:
- type: array
- items:
- type: string
- output_shields:
- type: array
- items:
- type: string
- toolgroups:
- type: array
- items:
- $ref: '#/components/schemas/AgentTool'
- client_tools:
- type: array
- items:
- $ref: '#/components/schemas/ToolDef'
- tool_choice:
- type: string
- enum:
- - auto
- - required
- description: >-
- Whether tool use is required or automatic. This is a hint to the model
- which may not be followed. It depends on the Instruction Following capabilities
- of the model.
- tool_prompt_format:
- type: string
- enum:
- - json
- - function_tag
- - python_list
- description: >-
- Prompt format for calling custom / zero shot tools.
- tool_config:
- $ref: '#/components/schemas/ToolConfig'
- max_infer_iters:
- type: integer
- default: 10
- model:
- type: string
- instructions:
- type: string
- enable_session_persistence:
- type: boolean
- response_format:
- $ref: '#/components/schemas/ResponseFormat'
- additionalProperties: false
- required:
- - model
- - instructions
- - enable_session_persistence
- AgentTool:
- oneOf:
- - type: string
- - type: object
- properties:
- name:
- type: string
- args:
- type: object
- additionalProperties:
- oneOf:
- - type: 'null'
- - type: boolean
- - type: number
- - type: string
- - type: array
- - type: object
- additionalProperties: false
- required:
- - name
- - args
- ToolDef:
- type: object
- properties:
- name:
- type: string
- description:
- type: string
- parameters:
- type: array
- items:
- $ref: '#/components/schemas/ToolParameter'
- metadata:
- type: object
- additionalProperties:
- oneOf:
- - type: 'null'
- - type: boolean
- - type: number
- - type: string
- - type: array
- - type: object
- additionalProperties: false
- required:
- - name
- ToolParameter:
- type: object
- properties:
- name:
- type: string
- parameter_type:
- type: string
- description:
- type: string
- required:
- type: boolean
- default: true
- default:
- oneOf:
- - type: 'null'
- - type: boolean
- - type: number
- - type: string
- - type: array
- - type: object
- additionalProperties: false
- required:
- - name
- - parameter_type
- - description
- - required
CreateAgentRequest:
type: object
properties:
@@ -2962,163 +3443,6 @@ components:
- embeddings
description: >-
Response containing generated embeddings.
- AgentCandidate:
- type: object
- properties:
- type:
- type: string
- const: agent
- default: agent
- config:
- $ref: '#/components/schemas/AgentConfig'
- additionalProperties: false
- required:
- - type
- - config
- AggregationFunctionType:
- type: string
- enum:
- - average
- - median
- - categorical_count
- - accuracy
- AppEvalTaskConfig:
- type: object
- properties:
- type:
- type: string
- const: app
- default: app
- eval_candidate:
- $ref: '#/components/schemas/EvalCandidate'
- scoring_params:
- type: object
- additionalProperties:
- $ref: '#/components/schemas/ScoringFnParams'
- num_examples:
- type: integer
- additionalProperties: false
- required:
- - type
- - eval_candidate
- - scoring_params
- BasicScoringFnParams:
- type: object
- properties:
- type:
- type: string
- const: basic
- default: basic
- aggregation_functions:
- type: array
- items:
- $ref: '#/components/schemas/AggregationFunctionType'
- additionalProperties: false
- required:
- - type
- BenchmarkEvalTaskConfig:
- type: object
- properties:
- type:
- type: string
- const: benchmark
- default: benchmark
- eval_candidate:
- $ref: '#/components/schemas/EvalCandidate'
- num_examples:
- type: integer
- additionalProperties: false
- required:
- - type
- - eval_candidate
- EvalCandidate:
- oneOf:
- - $ref: '#/components/schemas/ModelCandidate'
- - $ref: '#/components/schemas/AgentCandidate'
- discriminator:
- propertyName: type
- mapping:
- model: '#/components/schemas/ModelCandidate'
- agent: '#/components/schemas/AgentCandidate'
- EvalTaskConfig:
- oneOf:
- - $ref: '#/components/schemas/BenchmarkEvalTaskConfig'
- - $ref: '#/components/schemas/AppEvalTaskConfig'
- discriminator:
- propertyName: type
- mapping:
- benchmark: '#/components/schemas/BenchmarkEvalTaskConfig'
- app: '#/components/schemas/AppEvalTaskConfig'
- LLMAsJudgeScoringFnParams:
- type: object
- properties:
- type:
- type: string
- const: llm_as_judge
- default: llm_as_judge
- judge_model:
- type: string
- prompt_template:
- type: string
- judge_score_regexes:
- type: array
- items:
- type: string
- aggregation_functions:
- type: array
- items:
- $ref: '#/components/schemas/AggregationFunctionType'
- additionalProperties: false
- required:
- - type
- - judge_model
- ModelCandidate:
- type: object
- properties:
- type:
- type: string
- const: model
- default: model
- model:
- type: string
- sampling_params:
- $ref: '#/components/schemas/SamplingParams'
- system_message:
- $ref: '#/components/schemas/SystemMessage'
- additionalProperties: false
- required:
- - type
- - model
- - sampling_params
- RegexParserScoringFnParams:
- type: object
- properties:
- type:
- type: string
- const: regex_parser
- default: regex_parser
- parsing_regexes:
- type: array
- items:
- type: string
- aggregation_functions:
- type: array
- items:
- $ref: '#/components/schemas/AggregationFunctionType'
- additionalProperties: false
- required:
- - type
- ScoringFnParams:
- oneOf:
- - $ref: '#/components/schemas/LLMAsJudgeScoringFnParams'
- - $ref: '#/components/schemas/RegexParserScoringFnParams'
- - $ref: '#/components/schemas/BasicScoringFnParams'
- discriminator:
- propertyName: type
- mapping:
- llm_as_judge: '#/components/schemas/LLMAsJudgeScoringFnParams'
- regex_parser: '#/components/schemas/RegexParserScoringFnParams'
- basic: '#/components/schemas/BasicScoringFnParams'
EvaluateRowsRequest:
type: object
properties:
@@ -3139,64 +3463,12 @@ components:
items:
type: string
task_config:
- $ref: '#/components/schemas/EvalTaskConfig'
+ $ref: '#/components/schemas/BenchmarkConfig'
additionalProperties: false
required:
- input_rows
- scoring_functions
- task_config
- EvaluateResponse:
- type: object
- properties:
- generations:
- type: array
- items:
- type: object
- additionalProperties:
- oneOf:
- - type: 'null'
- - type: boolean
- - type: number
- - type: string
- - type: array
- - type: object
- scores:
- type: object
- additionalProperties:
- $ref: '#/components/schemas/ScoringResult'
- additionalProperties: false
- required:
- - generations
- - scores
- ScoringResult:
- type: object
- properties:
- score_rows:
- type: array
- items:
- type: object
- additionalProperties:
- oneOf:
- - type: 'null'
- - type: boolean
- - type: number
- - type: string
- - type: array
- - type: object
- aggregated_results:
- type: object
- additionalProperties:
- oneOf:
- - type: 'null'
- - type: boolean
- - type: number
- - type: string
- - type: array
- - type: object
- additionalProperties: false
- required:
- - score_rows
- - aggregated_results
Session:
type: object
properties:
@@ -3401,44 +3673,6 @@ components:
additionalProperties: false
required:
- type
- EvalTask:
- type: object
- properties:
- identifier:
- type: string
- provider_resource_id:
- type: string
- provider_id:
- type: string
- type:
- type: string
- const: eval_task
- default: eval_task
- dataset_id:
- type: string
- scoring_functions:
- type: array
- items:
- type: string
- metadata:
- type: object
- additionalProperties:
- oneOf:
- - type: 'null'
- - type: boolean
- - type: number
- - type: string
- - type: array
- - type: object
- additionalProperties: false
- required:
- - identifier
- - provider_resource_id
- - provider_id
- - type
- - dataset_id
- - scoring_functions
- - metadata
Model:
type: object
properties:
@@ -3766,13 +4000,6 @@ components:
- job_uuid
- checkpoints
description: Artifacts of a finetuning job.
- JobStatus:
- type: string
- enum:
- - completed
- - in_progress
- - failed
- - scheduled
PostTrainingJobStatusResponse:
type: object
properties:
@@ -3977,16 +4204,6 @@ components:
additionalProperties: false
required:
- data
- ListEvalTasksResponse:
- type: object
- properties:
- data:
- type: array
- items:
- $ref: '#/components/schemas/EvalTask'
- additionalProperties: false
- required:
- - data
ListModelsResponse:
type: object
properties:
@@ -4569,6 +4786,36 @@ components:
additionalProperties: false
required:
- data
+ RegisterBenchmarkRequest:
+ type: object
+ properties:
+ benchmark_id:
+ type: string
+ dataset_id:
+ type: string
+ scoring_functions:
+ type: array
+ items:
+ type: string
+ provider_benchmark_id:
+ type: string
+ provider_id:
+ type: string
+ metadata:
+ type: object
+ additionalProperties:
+ oneOf:
+ - type: 'null'
+ - type: boolean
+ - type: number
+ - type: string
+ - type: array
+ - type: object
+ additionalProperties: false
+ required:
+ - benchmark_id
+ - dataset_id
+ - scoring_functions
RegisterDatasetRequest:
type: object
properties:
@@ -4599,36 +4846,6 @@ components:
- dataset_id
- dataset_schema
- url
- RegisterEvalTaskRequest:
- type: object
- properties:
- eval_task_id:
- type: string
- dataset_id:
- type: string
- scoring_functions:
- type: array
- items:
- type: string
- provider_eval_task_id:
- type: string
- provider_id:
- type: string
- metadata:
- type: object
- additionalProperties:
- oneOf:
- - type: 'null'
- - type: boolean
- - type: number
- - type: string
- - type: array
- - type: object
- additionalProperties: false
- required:
- - eval_task_id
- - dataset_id
- - scoring_functions
RegisterModelRequest:
type: object
properties:
@@ -4739,18 +4956,10 @@ components:
type: object
properties:
task_config:
- $ref: '#/components/schemas/EvalTaskConfig'
+ $ref: '#/components/schemas/BenchmarkConfig'
additionalProperties: false
required:
- task_config
- Job:
- type: object
- properties:
- job_id:
- type: string
- additionalProperties: false
- required:
- - job_id
RunShieldRequest:
type: object
properties:
@@ -5049,10 +5258,10 @@ tags:
x-displayName: >-
Agents API for creating and interacting with agentic systems.
- name: BatchInference (Coming Soon)
+ - name: Benchmarks
- name: DatasetIO
- name: Datasets
- name: Eval
- - name: EvalTasks
- name: Inference
description: >-
This API provides the raw interface to the underlying models. Two kinds of models
@@ -5083,10 +5292,10 @@ x-tagGroups:
tags:
- Agents
- BatchInference (Coming Soon)
+ - Benchmarks
- DatasetIO
- Datasets
- Eval
- - EvalTasks
- Inference
- Inspect
- Models
diff --git a/docs/getting_started.ipynb b/docs/getting_started.ipynb
index abe537c8e..ee616b471 100644
--- a/docs/getting_started.ipynb
+++ b/docs/getting_started.ipynb
@@ -324,7 +324,7 @@
"- vector_io\n",
"container_image: null\n",
"datasets: []\n",
- "eval_tasks: []\n",
+ "benchmarks: []\n",
"image_name: together\n",
"metadata_store:\n",
" db_path: /Users/ashwin/.llama/distributions/together/registry.db\n",
@@ -508,7 +508,7 @@
"- vector_io\n",
"container_image: null\n",
"datasets: \u001b[1m[\u001b[0m\u001b[1m]\u001b[0m\n",
- "eval_tasks: \u001b[1m[\u001b[0m\u001b[1m]\u001b[0m\n",
+ "benchmarks: \u001b[1m[\u001b[0m\u001b[1m]\u001b[0m\n",
"image_name: together\n",
"metadata_store:\n",
" db_path: \u001b[35m/Users/ashwin/.llama/distributions/together/\u001b[0m\u001b[95mregistry.db\u001b[0m\n",
diff --git a/docs/notebooks/Llama_Stack_Benchmark_Evals.ipynb b/docs/notebooks/Llama_Stack_Benchmark_Evals.ipynb
index 84da25246..8eecf84ab 100644
--- a/docs/notebooks/Llama_Stack_Benchmark_Evals.ipynb
+++ b/docs/notebooks/Llama_Stack_Benchmark_Evals.ipynb
@@ -370,7 +370,7 @@
"- tool_runtime\n",
"datasets: []\n",
"container_image: null\n",
- "eval_tasks: []\n",
+ "benchmarks: []\n",
"image_name: together\n",
"memory_banks: []\n",
"metadata_store:\n",
@@ -551,7 +551,7 @@
"- tool_runtime\n",
"datasets: \u001b[1m[\u001b[0m\u001b[1m]\u001b[0m\n",
"container_image: null\n",
- "eval_tasks: \u001b[1m[\u001b[0m\u001b[1m]\u001b[0m\n",
+ "benchmarks: \u001b[1m[\u001b[0m\u001b[1m]\u001b[0m\n",
"image_name: together\n",
"memory_banks: \u001b[1m[\u001b[0m\u001b[1m]\u001b[0m\n",
"metadata_store:\n",
diff --git a/docs/openapi_generator/pyopenapi/generator.py b/docs/openapi_generator/pyopenapi/generator.py
index a0385cae0..0f3b99784 100644
--- a/docs/openapi_generator/pyopenapi/generator.py
+++ b/docs/openapi_generator/pyopenapi/generator.py
@@ -647,6 +647,7 @@ class Generator:
description = "\n".join(
filter(None, [doc_string.short_description, doc_string.long_description])
)
+
return Operation(
tags=[op.defining_class.__name__],
summary=None,
@@ -656,6 +657,7 @@ class Generator:
requestBody=requestBody,
responses=responses,
callbacks=callbacks,
+ deprecated=True if "DEPRECATED" in op.func_name else None,
security=[] if op.public else None,
)
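The new `deprecated` flag is derived purely from the handler name; a minimal sketch of the convention, assuming nothing beyond the string check in the hunk above:

```python
# Any webmethod whose Python name contains "DEPRECATED" is emitted with
# "deprecated": true in the generated spec; other operations omit the key.
def deprecated_flag(func_name: str):
    return True if "DEPRECATED" in func_name else None

assert deprecated_flag("DEPRECATED_run_eval") is True
assert deprecated_flag("run_eval") is None
```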
diff --git a/docs/openapi_generator/pyopenapi/specification.py b/docs/openapi_generator/pyopenapi/specification.py
index 4b54295c5..f96de58b6 100644
--- a/docs/openapi_generator/pyopenapi/specification.py
+++ b/docs/openapi_generator/pyopenapi/specification.py
@@ -117,6 +117,7 @@ class Operation:
requestBody: Optional[RequestBody] = None
callbacks: Optional[Dict[str, "Callback"]] = None
security: Optional[List["SecurityRequirement"]] = None
+ deprecated: Optional[bool] = None
@dataclass
diff --git a/docs/source/building_applications/evals.md b/docs/source/building_applications/evals.md
index c4cb476e4..f28e0d5fd 100644
--- a/docs/source/building_applications/evals.md
+++ b/docs/source/building_applications/evals.md
@@ -41,14 +41,14 @@ system_message = {
"content": SYSTEM_PROMPT_TEMPLATE,
}
-client.eval_tasks.register(
- eval_task_id="meta-reference::mmmu",
+client.benchmarks.register(
+ benchmark_id="meta-reference::mmmu",
dataset_id=f"mmmu-{subset}-{split}",
scoring_functions=["basic::regex_parser_multiple_choice_answer"],
)
response = client.eval.evaluate_rows(
- task_id="meta-reference::mmmu",
+ benchmark_id="meta-reference::mmmu",
input_rows=eval_rows,
scoring_functions=["basic::regex_parser_multiple_choice_answer"],
task_config={
@@ -99,14 +99,14 @@ eval_rows = client.datasetio.get_rows_paginated(
```
```python
-client.eval_tasks.register(
- eval_task_id="meta-reference::simpleqa",
+client.benchmarks.register(
+ benchmark_id="meta-reference::simpleqa",
dataset_id=simpleqa_dataset_id,
scoring_functions=["llm-as-judge::405b-simpleqa"],
)
response = client.eval.evaluate_rows(
- task_id="meta-reference::simpleqa",
+ benchmark_id="meta-reference::simpleqa",
input_rows=eval_rows.rows,
scoring_functions=["llm-as-judge::405b-simpleqa"],
task_config={
@@ -156,7 +156,7 @@ agent_config = {
}
response = client.eval.evaluate_rows(
- task_id="meta-reference::simpleqa",
+ benchmark_id="meta-reference::simpleqa",
input_rows=eval_rows.rows,
scoring_functions=["llm-as-judge::405b-simpleqa"],
task_config={
diff --git a/docs/source/building_applications/evaluation.md b/docs/source/building_applications/evaluation.md
index 91e5c552b..ad220f751 100644
--- a/docs/source/building_applications/evaluation.md
+++ b/docs/source/building_applications/evaluation.md
@@ -10,15 +10,15 @@ Here's how to set up basic evaluation:
```python
# Create an evaluation task
-response = client.eval_tasks.register(
- eval_task_id="my_eval",
+response = client.benchmarks.register(
+ benchmark_id="my_eval",
dataset_id="my_dataset",
scoring_functions=["accuracy", "relevance"],
)
# Run evaluation
job = client.eval.run_eval(
- task_id="my_eval",
+ benchmark_id="my_eval",
task_config={
"type": "app",
"eval_candidate": {"type": "agent", "config": agent_config},
@@ -26,5 +26,5 @@ job = client.eval.run_eval(
)
# Get results
-result = client.eval.job_result(task_id="my_eval", job_id=job.job_id)
+result = client.eval.job_result(benchmark_id="my_eval", job_id=job.job_id)
```
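Since `run_eval` returns a job handle, callers generally wait for the job to finish before asking for results. A hedged sketch of that wait, assuming the renamed `client.eval.jobs.status` helper and string-valued statuses (the exact return type depends on the client version):

```python
import time

# Block until the job is no longer queued or running; JobStatus values in the
# spec are "scheduled", "in_progress", "completed", and "failed".
while client.eval.jobs.status(job_id=job.job_id, benchmark_id="my_eval") in (
    "scheduled",
    "in_progress",
):
    time.sleep(5)
```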
diff --git a/docs/source/concepts/evaluation_concepts.md b/docs/source/concepts/evaluation_concepts.md
index 399d99d92..3ca4b0ac8 100644
--- a/docs/source/concepts/evaluation_concepts.md
+++ b/docs/source/concepts/evaluation_concepts.md
@@ -5,7 +5,7 @@ The Llama Stack Evaluation flow allows you to run evaluations on your GenAI appl
We introduce a set of APIs in Llama Stack to support running evaluations of LLM applications.
- `/datasetio` + `/datasets` API
- `/scoring` + `/scoring_functions` API
-- `/eval` + `/eval_tasks` API
+- `/eval` + `/benchmarks` API
This guide goes over the set of APIs and the developer experience of using Llama Stack to run evaluations for different use cases. Check out our Colab notebook with working evaluation examples [here](https://colab.research.google.com/drive/10CHyykee9j2OigaIcRv47BKG9mrNm0tJ?usp=sharing).
@@ -21,7 +21,7 @@ The Evaluation APIs are associated with a set of Resources as shown in the follo
- **Scoring**: evaluate outputs of the system.
- Associated with `ScoringFunction` resource. We provide a suite of out-of-the-box scoring functions and the ability to add custom evaluators. These scoring functions are the core part of defining an evaluation task and the metrics it outputs.
- **Eval**: generate outputs (via Inference or Agents) and perform scoring.
- - Associated with `EvalTask` resource.
+ - Associated with `Benchmark` resource.
Use the following decision tree to decide how to use the Llama Stack evaluation flow.
diff --git a/docs/source/concepts/index.md b/docs/source/concepts/index.md
index 1437ec623..403e47c48 100644
--- a/docs/source/concepts/index.md
+++ b/docs/source/concepts/index.md
@@ -42,7 +42,7 @@ Some of these APIs are associated with a set of **Resources**. Here is the mappi
- **Tool Runtime** is associated with `ToolGroup` resources.
- **DatasetIO** is associated with `Dataset` resources.
- **Scoring** is associated with `ScoringFunction` resources.
-- **Eval** is associated with `Model` and `EvalTask` resources.
+- **Eval** is associated with `Model` and `Benchmark` resources.
Furthermore, we allow these resources to be **federated** across multiple providers. For example, you may have some Llama models served by Fireworks while others are served by AWS Bedrock. Regardless, they will all work seamlessly with the same uniform Inference API provided by Llama Stack.
diff --git a/docs/source/playground/index.md b/docs/source/playground/index.md
index d74bf1a03..9691609ab 100644
--- a/docs/source/playground/index.md
+++ b/docs/source/playground/index.md
@@ -64,7 +64,7 @@ Interactive pages for users to play with and explore Llama Stack API capabilitie
```
```bash
- $ llama-stack-client eval_tasks register \
+ $ llama-stack-client benchmarks register \
--eval-task-id meta-reference-mmlu \
--provider-id meta-reference \
--dataset-id mmlu \
@@ -86,7 +86,7 @@ Interactive pages for users to play with and explore Llama Stack API capabilitie
- Under the hood, it uses Llama Stack's `/providers` API to get information about the providers.
- **API Resources**: Inspect Llama Stack API resources
- - This page allows you to inspect Llama Stack API resources (`models`, `datasets`, `memory_banks`, `eval_tasks`, `shields`).
+ - This page allows you to inspect Llama Stack API resources (`models`, `datasets`, `memory_banks`, `benchmarks`, `shields`).
- Under the hood, it uses Llama Stack's `/<resources>/list` API to get information about each resource.
- Please visit [Core Concepts](https://llama-stack.readthedocs.io/en/latest/concepts/index.html) for more details about the resources.
diff --git a/docs/source/references/evals_reference/index.md b/docs/source/references/evals_reference/index.md
index 86f66208a..71dbb47e5 100644
--- a/docs/source/references/evals_reference/index.md
+++ b/docs/source/references/evals_reference/index.md
@@ -5,7 +5,7 @@ The Llama Stack Evaluation flow allows you to run evaluations on your GenAI appl
We introduce a set of APIs in Llama Stack to support running evaluations of LLM applications.
- `/datasetio` + `/datasets` API
- `/scoring` + `/scoring_functions` API
-- `/eval` + `/eval_tasks` API
+- `/eval` + `/benchmarks` API
This guide goes over the set of APIs and the developer experience of using Llama Stack to run evaluations for different use cases. Check out our Colab notebook with working evaluation examples [here](https://colab.research.google.com/drive/10CHyykee9j2OigaIcRv47BKG9mrNm0tJ?usp=sharing).
@@ -21,7 +21,7 @@ The Evaluation APIs are associated with a set of Resources as shown in the follo
- **Scoring**: evaluate outputs of the system.
- Associated with `ScoringFunction` resource. We provide a suite of out-of-the-box scoring functions and the ability to add custom evaluators. These scoring functions are the core part of defining an evaluation task and the metrics it outputs.
- **Eval**: generate outputs (via Inference or Agents) and perform scoring.
- - Associated with `EvalTask` resource.
+ - Associated with `Benchmark` resource.
Use the following decision tree to decide how to use the Llama Stack evaluation flow.
@@ -77,14 +77,14 @@ system_message = {
"content": SYSTEM_PROMPT_TEMPLATE,
}
-client.eval_tasks.register(
- eval_task_id="meta-reference::mmmu",
+client.benchmarks.register(
+ benchmark_id="meta-reference::mmmu",
dataset_id=f"mmmu-{subset}-{split}",
scoring_functions=["basic::regex_parser_multiple_choice_answer"],
)
response = client.eval.evaluate_rows(
- task_id="meta-reference::mmmu",
+ benchmark_id="meta-reference::mmmu",
input_rows=eval_rows,
scoring_functions=["basic::regex_parser_multiple_choice_answer"],
task_config={
@@ -135,14 +135,14 @@ eval_rows = client.datasetio.get_rows_paginated(
```
```python
-client.eval_tasks.register(
- eval_task_id="meta-reference::simpleqa",
+client.benchmarks.register(
+ benchmark_id="meta-reference::simpleqa",
dataset_id=simpleqa_dataset_id,
scoring_functions=["llm-as-judge::405b-simpleqa"],
)
response = client.eval.evaluate_rows(
- task_id="meta-reference::simpleqa",
+ benchmark_id="meta-reference::simpleqa",
input_rows=eval_rows.rows,
scoring_functions=["llm-as-judge::405b-simpleqa"],
task_config={
@@ -192,7 +192,7 @@ agent_config = {
}
response = client.eval.evaluate_rows(
- task_id="meta-reference::simpleqa",
+ benchmark_id="meta-reference::simpleqa",
input_rows=eval_rows.rows,
scoring_functions=["llm-as-judge::405b-simpleqa"],
task_config={
@@ -281,7 +281,7 @@ The following examples give the quick steps to start running evaluations using t
#### Benchmark Evaluation CLI
Usage: There are 2 inputs necessary for running a benchmark eval
-- `eval-task-id`: the identifier associated with the eval task. Each `EvalTask` is parametrized by
+- `eval-task-id`: the identifier associated with the eval task. Each `Benchmark` is parametrized by
- `dataset_id`: the identifier associated with the dataset.
- `List[scoring_function_id]`: list of scoring function identifiers.
- `eval-task-config`: specifies the configuration of the model / agent to evaluate on.
@@ -289,7 +289,7 @@ Usage: There are 2 inputs necessary for running a benchmark eval
```
llama-stack-client eval run_benchmark <eval-task-id> \
---eval-task-config ~/eval_task_config.json \
+--eval-task-config ~/benchmark_config.json \
--visualize
```
@@ -309,15 +309,15 @@ llama-stack-client eval run_scoring ... --dataset-id --scoring-functions [ ...] [--provider-id ] [--provider-eval-task-id ] [--metadata ]
+$ llama-stack-client benchmarks register --eval-task-id <eval-task-id> --dataset-id <dataset-id> --scoring-functions <scoring-fn-id-1> [<scoring-fn-id-2> ...] [--provider-id <provider-id>] [--provider-eval-task-id <provider-eval-task-id>] [--metadata <metadata>]
```
Options:
@@ -191,7 +191,7 @@ Options:
- `--num-examples`: Optional. Number of examples to evaluate (useful for debugging)
- `--visualize`: Optional flag. If set, visualizes evaluation results after completion
-Example eval_task_config.json:
+Example benchmark_config.json:
```json
{
"type": "benchmark",
diff --git a/docs/source/references/python_sdk_reference/index.md b/docs/source/references/python_sdk_reference/index.md
index 8a06e2244..9d1130422 100644
--- a/docs/source/references/python_sdk_reference/index.md
+++ b/docs/source/references/python_sdk_reference/index.md
@@ -181,8 +181,8 @@ from llama_stack_client.types import EvaluateResponse, Job
Methods:
-- client.eval.evaluate_rows(task_id, \*\*params) -> EvaluateResponse
-- client.eval.run_eval(task_id, \*\*params) -> Job
+- client.eval.evaluate_rows(benchmark_id, \*\*params) -> EvaluateResponse
+- client.eval.run_eval(benchmark_id, \*\*params) -> Job
### Jobs
@@ -194,9 +194,9 @@ from llama_stack_client.types.eval import JobStatusResponse
Methods:
-- client.eval.jobs.retrieve(job_id, \*, task_id) -> EvaluateResponse
-- client.eval.jobs.cancel(job_id, \*, task_id) -> None
-- client.eval.jobs.status(job_id, \*, task_id) -> Optional[JobStatusResponse]
+- client.eval.jobs.retrieve(job_id, \*, benchmark_id) -> EvaluateResponse
+- client.eval.jobs.cancel(job_id, \*, benchmark_id) -> None
+- client.eval.jobs.status(job_id, \*, benchmark_id) -> Optional[JobStatusResponse]
## Inspect
@@ -443,20 +443,20 @@ Methods:
- client.scoring_functions.list() -> ScoringFunctionListResponse
- client.scoring_functions.register(\*\*params) -> None
-## EvalTasks
+## Benchmarks
Types:
```python
from llama_stack_client.types import (
- EvalTask,
- ListEvalTasksResponse,
- EvalTaskListResponse,
+ Benchmark,
+ ListBenchmarksResponse,
+ BenchmarkListResponse,
)
```
Methods:
-- client.eval_tasks.retrieve(eval_task_id) -> Optional[EvalTask]
-- client.eval_tasks.list() -> EvalTaskListResponse
-- client.eval_tasks.register(\*\*params) -> None
+- client.benchmarks.retrieve(benchmark_id) -> Optional[Benchmark]
+- client.benchmarks.list() -> BenchmarkListResponse
+- client.benchmarks.register(\*\*params) -> None
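Put together, the renamed namespace might be used like the following sketch (the benchmark, dataset, and scoring-function ids are placeholders drawn from examples elsewhere in these docs):

```python
# Register a benchmark, then list and retrieve it via the new client surface.
client.benchmarks.register(
    benchmark_id="meta-reference-mmlu",
    dataset_id="mmlu",
    scoring_functions=["basic::regex_parser_multiple_choice_answer"],
)

for benchmark in client.benchmarks.list():
    print(benchmark.identifier)

mmlu = client.benchmarks.retrieve("meta-reference-mmlu")
```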
diff --git a/llama_stack/apis/eval_tasks/__init__.py b/llama_stack/apis/benchmarks/__init__.py
similarity index 81%
rename from llama_stack/apis/eval_tasks/__init__.py
rename to llama_stack/apis/benchmarks/__init__.py
index 7ca216706..f8f564957 100644
--- a/llama_stack/apis/eval_tasks/__init__.py
+++ b/llama_stack/apis/benchmarks/__init__.py
@@ -4,4 +4,4 @@
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
-from .eval_tasks import * # noqa: F401 F403
+from .benchmarks import * # noqa: F401 F403
diff --git a/llama_stack/apis/benchmarks/benchmarks.py b/llama_stack/apis/benchmarks/benchmarks.py
new file mode 100644
index 000000000..50019b18c
--- /dev/null
+++ b/llama_stack/apis/benchmarks/benchmarks.py
@@ -0,0 +1,86 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+from typing import Any, Dict, List, Literal, Optional, Protocol, runtime_checkable
+
+from llama_models.schema_utils import json_schema_type, webmethod
+from pydantic import BaseModel, Field
+
+from llama_stack.apis.resource import Resource, ResourceType
+
+
+class CommonBenchmarkFields(BaseModel):
+ dataset_id: str
+ scoring_functions: List[str]
+ metadata: Dict[str, Any] = Field(
+ default_factory=dict,
+ description="Metadata for this evaluation task",
+ )
+
+
+@json_schema_type
+class Benchmark(CommonBenchmarkFields, Resource):
+ type: Literal[ResourceType.benchmark.value] = ResourceType.benchmark.value
+
+ @property
+ def benchmark_id(self) -> str:
+ return self.identifier
+
+ @property
+ def provider_benchmark_id(self) -> str:
+ return self.provider_resource_id
+
+
+class BenchmarkInput(CommonBenchmarkFields, BaseModel):
+ benchmark_id: str
+ provider_id: Optional[str] = None
+ provider_benchmark_id: Optional[str] = None
+
+
+class ListBenchmarksResponse(BaseModel):
+ data: List[Benchmark]
+
+
+@runtime_checkable
+class Benchmarks(Protocol):
+ @webmethod(route="/eval/benchmarks", method="GET")
+ async def list_benchmarks(self) -> ListBenchmarksResponse: ...
+
+ @webmethod(route="/eval/benchmarks/{benchmark_id}", method="GET")
+ async def get_benchmark(
+ self,
+ benchmark_id: str,
+ ) -> Optional[Benchmark]: ...
+
+ @webmethod(route="/eval/benchmarks", method="POST")
+ async def register_benchmark(
+ self,
+ benchmark_id: str,
+ dataset_id: str,
+ scoring_functions: List[str],
+ provider_benchmark_id: Optional[str] = None,
+ provider_id: Optional[str] = None,
+ metadata: Optional[Dict[str, Any]] = None,
+ ) -> None: ...
+
+ @webmethod(route="/eval-tasks", method="GET")
+ async def DEPRECATED_list_eval_tasks(self) -> ListBenchmarksResponse: ...
+
+ @webmethod(route="/eval-tasks/{task_id}", method="GET")
+ async def DEPRECATED_get_eval_task(
+ self,
+ eval_task_id: str,
+ ) -> Optional[Benchmark]: ...
+
+ @webmethod(route="/eval-tasks", method="POST")
+ async def DEPRECATED_register_eval_task(
+ self,
+ eval_task_id: str,
+ dataset_id: str,
+ scoring_functions: List[str],
+ provider_benchmark_id: Optional[str] = None,
+ provider_id: Optional[str] = None,
+ metadata: Optional[Dict[str, Any]] = None,
+ ) -> None: ...
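For distributions that pre-register benchmarks in their run configuration (the `benchmarks:` list added to the stack config later in this patch), the input model can be populated like this sketch; the identifiers are placeholders:

```python
from llama_stack.apis.benchmarks import BenchmarkInput

# A pre-registered benchmark entry; field names come from CommonBenchmarkFields
# and BenchmarkInput above. metadata is optional free-form data.
mmlu_benchmark = BenchmarkInput(
    benchmark_id="meta-reference-mmlu",
    dataset_id="mmlu",
    scoring_functions=["basic::regex_parser_multiple_choice_answer"],
    metadata={"split": "test"},
)
```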
diff --git a/llama_stack/apis/datatypes.py b/llama_stack/apis/datatypes.py
index ccc395b80..0751b2c9b 100644
--- a/llama_stack/apis/datatypes.py
+++ b/llama_stack/apis/datatypes.py
@@ -28,7 +28,7 @@ class Api(Enum):
vector_dbs = "vector_dbs"
datasets = "datasets"
scoring_functions = "scoring_functions"
- eval_tasks = "eval_tasks"
+ benchmarks = "benchmarks"
tool_groups = "tool_groups"
# built-in API
diff --git a/llama_stack/apis/eval/eval.py b/llama_stack/apis/eval/eval.py
index ae13a5bd9..e5c782150 100644
--- a/llama_stack/apis/eval/eval.py
+++ b/llama_stack/apis/eval/eval.py
@@ -38,19 +38,9 @@ EvalCandidate = register_schema(
@json_schema_type
-class BenchmarkEvalTaskConfig(BaseModel):
+class BenchmarkConfig(BaseModel):
type: Literal["benchmark"] = "benchmark"
eval_candidate: EvalCandidate
- num_examples: Optional[int] = Field(
- description="Number of examples to evaluate (useful for testing), if not provided, all examples in the dataset will be evaluated",
- default=None,
- )
-
-
-@json_schema_type
-class AppEvalTaskConfig(BaseModel):
- type: Literal["app"] = "app"
- eval_candidate: EvalCandidate
scoring_params: Dict[str, ScoringFnParams] = Field(
description="Map between scoring function id and parameters for each scoring function you want to run",
default_factory=dict,
@@ -62,12 +52,6 @@ class AppEvalTaskConfig(BaseModel):
# we could optionally add any specific dataset config here
-EvalTaskConfig = register_schema(
- Annotated[Union[BenchmarkEvalTaskConfig, AppEvalTaskConfig], Field(discriminator="type")],
- name="EvalTaskConfig",
-)
-
-
@json_schema_type
class EvaluateResponse(BaseModel):
generations: List[Dict[str, Any]]
@@ -76,27 +60,52 @@ class EvaluateResponse(BaseModel):
class Eval(Protocol):
- @webmethod(route="/eval/tasks/{task_id}/jobs", method="POST")
+ @webmethod(route="/eval/benchmarks/{benchmark_id}/jobs", method="POST")
async def run_eval(
+ self,
+ benchmark_id: str,
+ task_config: BenchmarkConfig,
+ ) -> Job: ...
+
+ @webmethod(route="/eval/benchmarks/{benchmark_id}/evaluations", method="POST")
+ async def evaluate_rows(
+ self,
+ benchmark_id: str,
+ input_rows: List[Dict[str, Any]],
+ scoring_functions: List[str],
+ task_config: BenchmarkConfig,
+ ) -> EvaluateResponse: ...
+
+ @webmethod(route="/eval/benchmarks/{benchmark_id}/jobs/{job_id}", method="GET")
+ async def job_status(self, benchmark_id: str, job_id: str) -> Optional[JobStatus]: ...
+
+ @webmethod(route="/eval/benchmarks/{benchmark_id}/jobs/{job_id}", method="DELETE")
+ async def job_cancel(self, benchmark_id: str, job_id: str) -> None: ...
+
+ @webmethod(route="/eval/benchmarks/{benchmark_id}/jobs/{job_id}/result", method="GET")
+ async def job_result(self, benchmark_id: str, job_id: str) -> EvaluateResponse: ...
+
+ @webmethod(route="/eval/tasks/{task_id}/jobs", method="POST")
+ async def DEPRECATED_run_eval(
self,
task_id: str,
- task_config: EvalTaskConfig,
+ task_config: BenchmarkConfig,
) -> Job: ...
@webmethod(route="/eval/tasks/{task_id}/evaluations", method="POST")
- async def evaluate_rows(
+ async def DEPRECATED_evaluate_rows(
self,
task_id: str,
input_rows: List[Dict[str, Any]],
scoring_functions: List[str],
- task_config: EvalTaskConfig,
+ task_config: BenchmarkConfig,
) -> EvaluateResponse: ...
@webmethod(route="/eval/tasks/{task_id}/jobs/{job_id}", method="GET")
- async def job_status(self, task_id: str, job_id: str) -> Optional[JobStatus]: ...
+ async def DEPRECATED_job_status(self, task_id: str, job_id: str) -> Optional[JobStatus]: ...
@webmethod(route="/eval/tasks/{task_id}/jobs/{job_id}", method="DELETE")
- async def job_cancel(self, task_id: str, job_id: str) -> None: ...
+ async def DEPRECATED_job_cancel(self, task_id: str, job_id: str) -> None: ...
@webmethod(route="/eval/tasks/{task_id}/jobs/{job_id}/result", method="GET")
- async def job_result(self, job_id: str, task_id: str) -> EvaluateResponse: ...
+ async def DEPRECATED_job_result(self, task_id: str, job_id: str) -> EvaluateResponse: ...
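Client-side, the renamed methods take a `benchmark_id` plus a `BenchmarkConfig`; a hedged sketch of kicking off an eval against the new benchmark-scoped route (the model id and sampling strategy are placeholders, and the config fields should be checked against the generated spec):

```python
# Start an eval job for a registered benchmark using a plain model candidate.
job = client.eval.run_eval(
    benchmark_id="meta-reference-mmlu",
    task_config={
        "type": "benchmark",
        "eval_candidate": {
            "type": "model",
            "model": "meta-llama/Llama-3.1-8B-Instruct",  # placeholder model id
            "sampling_params": {"strategy": {"type": "greedy"}},
        },
    },
)
print(job.job_id)
```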
diff --git a/llama_stack/apis/eval_tasks/eval_tasks.py b/llama_stack/apis/eval_tasks/eval_tasks.py
deleted file mode 100644
index a0a533055..000000000
--- a/llama_stack/apis/eval_tasks/eval_tasks.py
+++ /dev/null
@@ -1,66 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-from typing import Any, Dict, List, Literal, Optional, Protocol, runtime_checkable
-
-from llama_models.schema_utils import json_schema_type, webmethod
-from pydantic import BaseModel, Field
-
-from llama_stack.apis.resource import Resource, ResourceType
-
-
-class CommonEvalTaskFields(BaseModel):
- dataset_id: str
- scoring_functions: List[str]
- metadata: Dict[str, Any] = Field(
- default_factory=dict,
- description="Metadata for this evaluation task",
- )
-
-
-@json_schema_type
-class EvalTask(CommonEvalTaskFields, Resource):
- type: Literal[ResourceType.eval_task.value] = ResourceType.eval_task.value
-
- @property
- def eval_task_id(self) -> str:
- return self.identifier
-
- @property
- def provider_eval_task_id(self) -> str:
- return self.provider_resource_id
-
-
-class EvalTaskInput(CommonEvalTaskFields, BaseModel):
- eval_task_id: str
- provider_id: Optional[str] = None
- provider_eval_task_id: Optional[str] = None
-
-
-class ListEvalTasksResponse(BaseModel):
- data: List[EvalTask]
-
-
-@runtime_checkable
-class EvalTasks(Protocol):
- @webmethod(route="/eval-tasks", method="GET")
- async def list_eval_tasks(self) -> ListEvalTasksResponse: ...
-
- @webmethod(route="/eval-tasks/{eval_task_id}", method="GET")
- async def get_eval_task(
- self,
- eval_task_id: str,
- ) -> Optional[EvalTask]: ...
-
- @webmethod(route="/eval-tasks", method="POST")
- async def register_eval_task(
- self,
- eval_task_id: str,
- dataset_id: str,
- scoring_functions: List[str],
- provider_eval_task_id: Optional[str] = None,
- provider_id: Optional[str] = None,
- metadata: Optional[Dict[str, Any]] = None,
- ) -> None: ...
diff --git a/llama_stack/apis/resource.py b/llama_stack/apis/resource.py
index 145113a5d..70ec63c55 100644
--- a/llama_stack/apis/resource.py
+++ b/llama_stack/apis/resource.py
@@ -15,7 +15,7 @@ class ResourceType(Enum):
vector_db = "vector_db"
dataset = "dataset"
scoring_function = "scoring_function"
- eval_task = "eval_task"
+ benchmark = "benchmark"
tool = "tool"
tool_group = "tool_group"
diff --git a/llama_stack/distribution/datatypes.py b/llama_stack/distribution/datatypes.py
index 97706f22a..f62996081 100644
--- a/llama_stack/distribution/datatypes.py
+++ b/llama_stack/distribution/datatypes.py
@@ -8,10 +8,10 @@ from typing import Annotated, Any, Dict, List, Optional, Union
from pydantic import BaseModel, Field
+from llama_stack.apis.benchmarks import Benchmark, BenchmarkInput
from llama_stack.apis.datasetio import DatasetIO
from llama_stack.apis.datasets import Dataset, DatasetInput
from llama_stack.apis.eval import Eval
-from llama_stack.apis.eval_tasks import EvalTask, EvalTaskInput
from llama_stack.apis.inference import Inference
from llama_stack.apis.models import Model, ModelInput
from llama_stack.apis.safety import Safety
@@ -37,7 +37,7 @@ RoutableObject = Union[
VectorDB,
Dataset,
ScoringFn,
- EvalTask,
+ Benchmark,
Tool,
ToolGroup,
]
@@ -50,7 +50,7 @@ RoutableObjectWithProvider = Annotated[
VectorDB,
Dataset,
ScoringFn,
- EvalTask,
+ Benchmark,
Tool,
ToolGroup,
],
@@ -173,7 +173,7 @@ a default SQLite store will be used.""",
vector_dbs: List[VectorDBInput] = Field(default_factory=list)
datasets: List[DatasetInput] = Field(default_factory=list)
scoring_fns: List[ScoringFnInput] = Field(default_factory=list)
- eval_tasks: List[EvalTaskInput] = Field(default_factory=list)
+ benchmarks: List[BenchmarkInput] = Field(default_factory=list)
tool_groups: List[ToolGroupInput] = Field(default_factory=list)
server: ServerConfig = Field(
diff --git a/llama_stack/distribution/distribution.py b/llama_stack/distribution/distribution.py
index 2dcf38463..384e2c3c8 100644
--- a/llama_stack/distribution/distribution.py
+++ b/llama_stack/distribution/distribution.py
@@ -44,7 +44,7 @@ def builtin_automatically_routed_apis() -> List[AutoRoutedApiInfo]:
router_api=Api.scoring,
),
AutoRoutedApiInfo(
- routing_table_api=Api.eval_tasks,
+ routing_table_api=Api.benchmarks,
router_api=Api.eval,
),
AutoRoutedApiInfo(
diff --git a/llama_stack/distribution/resolver.py b/llama_stack/distribution/resolver.py
index 353c2971b..0bc2e774c 100644
--- a/llama_stack/distribution/resolver.py
+++ b/llama_stack/distribution/resolver.py
@@ -9,10 +9,10 @@ import logging
from typing import Any, Dict, List, Set
from llama_stack.apis.agents import Agents
+from llama_stack.apis.benchmarks import Benchmarks
from llama_stack.apis.datasetio import DatasetIO
from llama_stack.apis.datasets import Datasets
from llama_stack.apis.eval import Eval
-from llama_stack.apis.eval_tasks import EvalTasks
from llama_stack.apis.inference import Inference
from llama_stack.apis.inspect import Inspect
from llama_stack.apis.models import Models
@@ -37,8 +37,8 @@ from llama_stack.distribution.store import DistributionRegistry
from llama_stack.distribution.utils.dynamic import instantiate_class_type
from llama_stack.providers.datatypes import (
Api,
+ BenchmarksProtocolPrivate,
DatasetsProtocolPrivate,
- EvalTasksProtocolPrivate,
InlineProviderSpec,
ModelsProtocolPrivate,
ProviderSpec,
@@ -73,7 +73,7 @@ def api_protocol_map() -> Dict[Api, Any]:
Api.scoring: Scoring,
Api.scoring_functions: ScoringFunctions,
Api.eval: Eval,
- Api.eval_tasks: EvalTasks,
+ Api.benchmarks: Benchmarks,
Api.post_training: PostTraining,
Api.tool_groups: ToolGroups,
Api.tool_runtime: ToolRuntime,
@@ -92,7 +92,7 @@ def additional_protocols_map() -> Dict[Api, Any]:
ScoringFunctions,
Api.scoring_functions,
),
- Api.eval: (EvalTasksProtocolPrivate, EvalTasks, Api.eval_tasks),
+ Api.eval: (BenchmarksProtocolPrivate, Benchmarks, Api.benchmarks),
}
diff --git a/llama_stack/distribution/routers/__init__.py b/llama_stack/distribution/routers/__init__.py
index 18197ca7f..a54f57fb3 100644
--- a/llama_stack/distribution/routers/__init__.py
+++ b/llama_stack/distribution/routers/__init__.py
@@ -11,8 +11,8 @@ from llama_stack.distribution.store import DistributionRegistry
from llama_stack.providers.datatypes import Api, RoutingTable
from .routing_tables import (
+ BenchmarksRoutingTable,
DatasetsRoutingTable,
- EvalTasksRoutingTable,
ModelsRoutingTable,
ScoringFunctionsRoutingTable,
ShieldsRoutingTable,
@@ -33,7 +33,7 @@ async def get_routing_table_impl(
"shields": ShieldsRoutingTable,
"datasets": DatasetsRoutingTable,
"scoring_functions": ScoringFunctionsRoutingTable,
- "eval_tasks": EvalTasksRoutingTable,
+ "benchmarks": BenchmarksRoutingTable,
"tool_groups": ToolGroupsRoutingTable,
}
diff --git a/llama_stack/distribution/routers/routers.py b/llama_stack/distribution/routers/routers.py
index e716e44b0..f45975189 100644
--- a/llama_stack/distribution/routers/routers.py
+++ b/llama_stack/distribution/routers/routers.py
@@ -9,9 +9,8 @@ from typing import Any, AsyncGenerator, Dict, List, Optional
from llama_stack.apis.common.content_types import URL, InterleavedContent
from llama_stack.apis.datasetio import DatasetIO, PaginatedRowsResult
from llama_stack.apis.eval import (
- AppEvalTaskConfig,
+ BenchmarkConfig,
Eval,
- EvalTaskConfig,
EvaluateResponse,
Job,
JobStatus,
@@ -347,23 +346,23 @@ class EvalRouter(Eval):
async def run_eval(
self,
- task_id: str,
- task_config: AppEvalTaskConfig,
+ benchmark_id: str,
+ task_config: BenchmarkConfig,
) -> Job:
- return await self.routing_table.get_provider_impl(task_id).run_eval(
- task_id=task_id,
+ return await self.routing_table.get_provider_impl(benchmark_id).run_eval(
+ benchmark_id=benchmark_id,
task_config=task_config,
)
async def evaluate_rows(
self,
- task_id: str,
+ benchmark_id: str,
input_rows: List[Dict[str, Any]],
scoring_functions: List[str],
- task_config: EvalTaskConfig,
+ task_config: BenchmarkConfig,
) -> EvaluateResponse:
- return await self.routing_table.get_provider_impl(task_id).evaluate_rows(
- task_id=task_id,
+ return await self.routing_table.get_provider_impl(benchmark_id).evaluate_rows(
+ benchmark_id=benchmark_id,
input_rows=input_rows,
scoring_functions=scoring_functions,
task_config=task_config,
@@ -371,30 +370,72 @@ class EvalRouter(Eval):
async def job_status(
self,
- task_id: str,
+ benchmark_id: str,
job_id: str,
) -> Optional[JobStatus]:
- return await self.routing_table.get_provider_impl(task_id).job_status(task_id, job_id)
+ return await self.routing_table.get_provider_impl(benchmark_id).job_status(benchmark_id, job_id)
async def job_cancel(
self,
- task_id: str,
+ benchmark_id: str,
job_id: str,
) -> None:
- await self.routing_table.get_provider_impl(task_id).job_cancel(
- task_id,
+ await self.routing_table.get_provider_impl(benchmark_id).job_cancel(
+ benchmark_id,
job_id,
)
async def job_result(
+ self,
+ benchmark_id: str,
+ job_id: str,
+ ) -> EvaluateResponse:
+ return await self.routing_table.get_provider_impl(benchmark_id).job_result(
+ benchmark_id,
+ job_id,
+ )
+
+ async def DEPRECATED_run_eval(
+ self,
+ task_id: str,
+ task_config: BenchmarkConfig,
+ ) -> Job:
+ return await self.run_eval(benchmark_id=task_id, task_config=task_config)
+
+ async def DEPRECATED_evaluate_rows(
+ self,
+ task_id: str,
+ input_rows: List[Dict[str, Any]],
+ scoring_functions: List[str],
+ task_config: BenchmarkConfig,
+ ) -> EvaluateResponse:
+ return await self.evaluate_rows(
+ benchmark_id=task_id,
+ input_rows=input_rows,
+ scoring_functions=scoring_functions,
+ task_config=task_config,
+ )
+
+ async def DEPRECATED_job_status(
+ self,
+ task_id: str,
+ job_id: str,
+ ) -> Optional[JobStatus]:
+ return await self.job_status(benchmark_id=task_id, job_id=job_id)
+
+ async def DEPRECATED_job_cancel(
+ self,
+ task_id: str,
+ job_id: str,
+ ) -> None:
+ return await self.job_cancel(benchmark_id=task_id, job_id=job_id)
+
+ async def DEPRECATED_job_result(
self,
task_id: str,
job_id: str,
) -> EvaluateResponse:
- return await self.routing_table.get_provider_impl(task_id).job_result(
- task_id,
- job_id,
- )
+ return await self.job_result(benchmark_id=task_id, job_id=job_id)
class ToolRuntimeRouter(ToolRuntime):
diff --git a/llama_stack/distribution/routers/routing_tables.py b/llama_stack/distribution/routers/routing_tables.py
index 009775ca5..2cddc3970 100644
--- a/llama_stack/distribution/routers/routing_tables.py
+++ b/llama_stack/distribution/routers/routing_tables.py
@@ -4,14 +4,15 @@
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
+import logging
from typing import Any, Dict, List, Optional
from pydantic import TypeAdapter
+from llama_stack.apis.benchmarks import Benchmark, Benchmarks, ListBenchmarksResponse
from llama_stack.apis.common.content_types import URL
from llama_stack.apis.common.type_system import ParamType
from llama_stack.apis.datasets import Dataset, Datasets, ListDatasetsResponse
-from llama_stack.apis.eval_tasks import EvalTask, EvalTasks, ListEvalTasksResponse
from llama_stack.apis.models import ListModelsResponse, Model, Models, ModelType
from llama_stack.apis.resource import ResourceType
from llama_stack.apis.scoring_functions import (
@@ -38,6 +39,8 @@ from llama_stack.distribution.datatypes import (
from llama_stack.distribution.store import DistributionRegistry
from llama_stack.providers.datatypes import Api, RoutingTable
+logger = logging.getLogger(__name__)
+
def get_impl_api(p: Any) -> Api:
return p.__provider_spec__.api
@@ -60,7 +63,7 @@ async def register_object_with_provider(obj: RoutableObject, p: Any) -> Routable
elif api == Api.scoring:
return await p.register_scoring_function(obj)
elif api == Api.eval:
- return await p.register_eval_task(obj)
+ return await p.register_benchmark(obj)
elif api == Api.tool_runtime:
return await p.register_tool(obj)
else:
@@ -121,7 +124,7 @@ class CommonRoutingTableImpl(RoutingTable):
scoring_functions = await p.list_scoring_functions()
await add_objects(scoring_functions, pid, ScoringFn)
elif api == Api.eval:
- p.eval_task_store = self
+ p.benchmark_store = self
elif api == Api.tool_runtime:
p.tool_store = self
@@ -141,8 +144,8 @@ class CommonRoutingTableImpl(RoutingTable):
return ("DatasetIO", "dataset")
elif isinstance(self, ScoringFunctionsRoutingTable):
return ("Scoring", "scoring_function")
- elif isinstance(self, EvalTasksRoutingTable):
- return ("Eval", "eval_task")
+ elif isinstance(self, BenchmarksRoutingTable):
+ return ("Eval", "benchmark")
elif isinstance(self, ToolGroupsRoutingTable):
return ("Tools", "tool")
else:
@@ -428,20 +431,20 @@ class ScoringFunctionsRoutingTable(CommonRoutingTableImpl, ScoringFunctions):
await self.register_object(scoring_fn)
-class EvalTasksRoutingTable(CommonRoutingTableImpl, EvalTasks):
- async def list_eval_tasks(self) -> ListEvalTasksResponse:
- return ListEvalTasksResponse(data=await self.get_all_with_type("eval_task"))
+class BenchmarksRoutingTable(CommonRoutingTableImpl, Benchmarks):
+ async def list_benchmarks(self) -> ListBenchmarksResponse:
+ return ListBenchmarksResponse(data=await self.get_all_with_type("benchmark"))
- async def get_eval_task(self, eval_task_id: str) -> Optional[EvalTask]:
- return await self.get_object_by_identifier("eval_task", eval_task_id)
+ async def get_benchmark(self, benchmark_id: str) -> Optional[Benchmark]:
+ return await self.get_object_by_identifier("benchmark", benchmark_id)
- async def register_eval_task(
+ async def register_benchmark(
self,
- eval_task_id: str,
+ benchmark_id: str,
dataset_id: str,
scoring_functions: List[str],
metadata: Optional[Dict[str, Any]] = None,
- provider_eval_task_id: Optional[str] = None,
+ provider_benchmark_id: Optional[str] = None,
provider_id: Optional[str] = None,
) -> None:
if metadata is None:
@@ -453,17 +456,46 @@ class EvalTasksRoutingTable(CommonRoutingTableImpl, EvalTasks):
raise ValueError(
"No provider specified and multiple providers available. Please specify a provider_id."
)
- if provider_eval_task_id is None:
- provider_eval_task_id = eval_task_id
- eval_task = EvalTask(
- identifier=eval_task_id,
+ if provider_benchmark_id is None:
+ provider_benchmark_id = benchmark_id
+ benchmark = Benchmark(
+ identifier=benchmark_id,
dataset_id=dataset_id,
scoring_functions=scoring_functions,
metadata=metadata,
provider_id=provider_id,
- provider_resource_id=provider_eval_task_id,
+ provider_resource_id=provider_benchmark_id,
+ )
+ await self.register_object(benchmark)
+
+ async def DEPRECATED_list_eval_tasks(self) -> ListBenchmarksResponse:
+ logger.warning("DEPRECATED: Use /eval/benchmarks instead")
+ return await self.list_benchmarks()
+
+ async def DEPRECATED_get_eval_task(
+ self,
+ eval_task_id: str,
+ ) -> Optional[Benchmark]:
+ logger.warning("DEPRECATED: Use /eval/benchmarks instead")
+ return await self.get_benchmark(eval_task_id)
+
+ async def DEPRECATED_register_eval_task(
+ self,
+ eval_task_id: str,
+ dataset_id: str,
+ scoring_functions: List[str],
+ provider_benchmark_id: Optional[str] = None,
+ provider_id: Optional[str] = None,
+ metadata: Optional[Dict[str, Any]] = None,
+ ) -> None:
+ logger.warning("DEPRECATED: Use /eval/benchmarks instead")
+ return await self.register_benchmark(
+ benchmark_id=eval_task_id,
+ dataset_id=dataset_id,
+ scoring_functions=scoring_functions,
+ metadata=metadata,
+ provider_benchmark_id=provider_benchmark_id,
+ provider_id=provider_id,
)
- await self.register_object(eval_task)
class ToolGroupsRoutingTable(CommonRoutingTableImpl, ToolGroups):
diff --git a/llama_stack/distribution/stack.py b/llama_stack/distribution/stack.py
index 2baad8ac4..9335dc3a9 100644
--- a/llama_stack/distribution/stack.py
+++ b/llama_stack/distribution/stack.py
@@ -15,10 +15,10 @@ from termcolor import colored
from llama_stack.apis.agents import Agents
from llama_stack.apis.batch_inference import BatchInference
+from llama_stack.apis.benchmarks import Benchmarks
from llama_stack.apis.datasetio import DatasetIO
from llama_stack.apis.datasets import Datasets
from llama_stack.apis.eval import Eval
-from llama_stack.apis.eval_tasks import EvalTasks
from llama_stack.apis.inference import Inference
from llama_stack.apis.inspect import Inspect
from llama_stack.apis.models import Models
@@ -53,7 +53,7 @@ class LlamaStack(
PostTraining,
VectorIO,
Eval,
- EvalTasks,
+ Benchmarks,
Scoring,
ScoringFunctions,
DatasetIO,
@@ -78,7 +78,7 @@ RESOURCES = [
"register_scoring_function",
"list_scoring_functions",
),
- ("eval_tasks", Api.eval_tasks, "register_eval_task", "list_eval_tasks"),
+ ("benchmarks", Api.benchmarks, "register_benchmark", "list_benchmarks"),
("tool_groups", Api.tool_groups, "register_tool_group", "list_tool_groups"),
]
diff --git a/llama_stack/distribution/ui/README.md b/llama_stack/distribution/ui/README.md
index c0a2597af..8fceb5c63 100644
--- a/llama_stack/distribution/ui/README.md
+++ b/llama_stack/distribution/ui/README.md
@@ -26,7 +26,7 @@ $ llama-stack-client datasets register \
```
```bash
-$ llama-stack-client eval_tasks register \
+$ llama-stack-client benchmarks register \
--eval-task-id meta-reference-mmlu \
--provider-id meta-reference \
--dataset-id mmlu \
diff --git a/llama_stack/distribution/ui/page/distribution/eval_tasks.py b/llama_stack/distribution/ui/page/distribution/benchmarks.py
rename from llama_stack/distribution/ui/page/distribution/eval_tasks.py
rename to llama_stack/distribution/ui/page/distribution/benchmarks.py
index f58969663..1428ae9ab 100644
--- a/llama_stack/distribution/ui/page/distribution/eval_tasks.py
+++ b/llama_stack/distribution/ui/page/distribution/benchmarks.py
@@ -8,12 +8,12 @@ import streamlit as st
from modules.api import llama_stack_api
-def eval_tasks():
- # Eval Tasks Section
- st.header("Eval Tasks")
+def benchmarks():
+ # Benchmarks Section
+ st.header("Benchmarks")
- eval_tasks_info = {d.identifier: d.to_dict() for d in llama_stack_api.client.eval_tasks.list()}
+ benchmarks_info = {d.identifier: d.to_dict() for d in llama_stack_api.client.benchmarks.list()}
- if len(eval_tasks_info) > 0:
- selected_eval_task = st.selectbox("Select an eval task", list(eval_tasks_info.keys()), key="eval_task_inspect")
- st.json(eval_tasks_info[selected_eval_task], expanded=True)
+ if len(benchmarks_info) > 0:
+ selected_benchmark = st.selectbox("Select a benchmark", list(benchmarks_info.keys()), key="benchmark_inspect")
+ st.json(benchmarks_info[selected_benchmark], expanded=True)
diff --git a/llama_stack/distribution/ui/page/distribution/resources.py b/llama_stack/distribution/ui/page/distribution/resources.py
index 94b840bcb..684270d4d 100644
--- a/llama_stack/distribution/ui/page/distribution/resources.py
+++ b/llama_stack/distribution/ui/page/distribution/resources.py
@@ -4,8 +4,8 @@
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
+from page.distribution.benchmarks import benchmarks
from page.distribution.datasets import datasets
-from page.distribution.eval_tasks import eval_tasks
from page.distribution.models import models
from page.distribution.scoring_functions import scoring_functions
from page.distribution.shields import shields
@@ -20,7 +20,7 @@ def resources_page():
"Shields",
"Scoring Functions",
"Datasets",
- "Eval Tasks",
+ "Benchmarks",
]
icons = ["magic", "memory", "shield", "file-bar-graph", "database", "list-task"]
selected_resource = option_menu(
@@ -34,8 +34,8 @@ def resources_page():
},
},
)
- if selected_resource == "Eval Tasks":
- eval_tasks()
+ if selected_resource == "Benchmarks":
+ benchmarks()
elif selected_resource == "Vector Databases":
vector_dbs()
elif selected_resource == "Datasets":
diff --git a/llama_stack/distribution/ui/page/evaluations/native_eval.py b/llama_stack/distribution/ui/page/evaluations/native_eval.py
index 112d9cff0..f1cae714a 100644
--- a/llama_stack/distribution/ui/page/evaluations/native_eval.py
+++ b/llama_stack/distribution/ui/page/evaluations/native_eval.py
@@ -11,28 +11,28 @@ import streamlit as st
from modules.api import llama_stack_api
-def select_eval_task_1():
- # Select Eval Tasks
+def select_benchmark_1():
+ # Select Benchmarks
st.subheader("1. Choose An Eval Task")
- eval_tasks = llama_stack_api.client.eval_tasks.list()
- eval_tasks = {et.identifier: et for et in eval_tasks}
- eval_tasks_names = list(eval_tasks.keys())
- selected_eval_task = st.selectbox(
+ benchmarks = llama_stack_api.client.benchmarks.list()
+ benchmarks = {et.identifier: et for et in benchmarks}
+ benchmarks_names = list(benchmarks.keys())
+ selected_benchmark = st.selectbox(
"Choose an eval task.",
- options=eval_tasks_names,
+ options=benchmarks_names,
help="Choose an eval task. Each eval task is parameterized by a dataset, and list of scoring functions.",
)
with st.expander("View Eval Task"):
- st.json(eval_tasks[selected_eval_task], expanded=True)
+ st.json(benchmarks[selected_benchmark], expanded=True)
- st.session_state["selected_eval_task"] = selected_eval_task
- st.session_state["eval_tasks"] = eval_tasks
+ st.session_state["selected_benchmark"] = selected_benchmark
+ st.session_state["benchmarks"] = benchmarks
if st.button("Confirm", key="confirm_1"):
- st.session_state["selected_eval_task_1_next"] = True
+ st.session_state["selected_benchmark_1_next"] = True
def define_eval_candidate_2():
- if not st.session_state.get("selected_eval_task_1_next", None):
+ if not st.session_state.get("selected_benchmark_1_next", None):
return
st.subheader("2. Define Eval Candidate")
@@ -161,11 +161,11 @@ def run_evaluation_3():
Review the configurations that will be used for this evaluation run, make any necessary changes, and then click the "Run Evaluation" button.
"""
)
- selected_eval_task = st.session_state["selected_eval_task"]
- eval_tasks = st.session_state["eval_tasks"]
+ selected_benchmark = st.session_state["selected_benchmark"]
+ benchmarks = st.session_state["benchmarks"]
eval_candidate = st.session_state["eval_candidate"]
- dataset_id = eval_tasks[selected_eval_task].dataset_id
+ dataset_id = benchmarks[selected_benchmark].dataset_id
rows = llama_stack_api.client.datasetio.get_rows_paginated(
dataset_id=dataset_id,
rows_in_page=-1,
@@ -180,16 +180,16 @@ def run_evaluation_3():
help="Number of examples from the dataset to evaluate. ",
)
- eval_task_config = {
+ benchmark_config = {
"type": "benchmark",
"eval_candidate": eval_candidate,
"scoring_params": {},
}
with st.expander("View Evaluation Task", expanded=True):
- st.json(eval_tasks[selected_eval_task], expanded=True)
+ st.json(benchmarks[selected_benchmark], expanded=True)
with st.expander("View Evaluation Task Configuration", expanded=True):
- st.json(eval_task_config, expanded=True)
+ st.json(benchmark_config, expanded=True)
# Add run button and handle evaluation
if st.button("Run Evaluation"):
@@ -209,10 +209,10 @@ def run_evaluation_3():
progress_bar.progress(progress, text=progress_text)
# Run evaluation for current row
eval_res = llama_stack_api.client.eval.evaluate_rows(
- task_id=selected_eval_task,
+ benchmark_id=selected_benchmark,
input_rows=[r],
- scoring_functions=eval_tasks[selected_eval_task].scoring_functions,
- task_config=eval_task_config,
+ scoring_functions=benchmarks[selected_benchmark].scoring_functions,
+ task_config=benchmark_config,
)
for k in r.keys():
@@ -225,7 +225,7 @@ def run_evaluation_3():
output_res[k] = []
output_res[k].append(eval_res.generations[0][k])
- for scoring_fn in eval_tasks[selected_eval_task].scoring_functions:
+ for scoring_fn in benchmarks[selected_benchmark].scoring_functions:
if scoring_fn not in output_res:
output_res[scoring_fn] = []
output_res[scoring_fn].append(eval_res.scores[scoring_fn].score_rows[0])
@@ -245,7 +245,7 @@ def native_evaluation_page():
st.set_page_config(page_title="Evaluations (Generation + Scoring)", page_icon="🦙")
st.title("📊 Evaluations (Generation + Scoring)")
- select_eval_task_1()
+ select_benchmark_1()
define_eval_candidate_2()
run_evaluation_3()
diff --git a/llama_stack/providers/datatypes.py b/llama_stack/providers/datatypes.py
index ccdaf76e7..b92f9dc0a 100644
--- a/llama_stack/providers/datatypes.py
+++ b/llama_stack/providers/datatypes.py
@@ -10,9 +10,9 @@ from urllib.parse import urlparse
from llama_models.schema_utils import json_schema_type
from pydantic import BaseModel, Field
+from llama_stack.apis.benchmarks import Benchmark
from llama_stack.apis.datasets import Dataset
from llama_stack.apis.datatypes import Api
-from llama_stack.apis.eval_tasks import EvalTask
from llama_stack.apis.models import Model
from llama_stack.apis.scoring_functions import ScoringFn
from llama_stack.apis.shields import Shield
@@ -48,8 +48,8 @@ class ScoringFunctionsProtocolPrivate(Protocol):
async def register_scoring_function(self, scoring_fn: ScoringFn) -> None: ...
-class EvalTasksProtocolPrivate(Protocol):
- async def register_eval_task(self, eval_task: EvalTask) -> None: ...
+class BenchmarksProtocolPrivate(Protocol):
+ async def register_benchmark(self, benchmark: Benchmark) -> None: ...
class ToolsProtocolPrivate(Protocol):
diff --git a/llama_stack/providers/inline/eval/meta_reference/eval.py b/llama_stack/providers/inline/eval/meta_reference/eval.py
index 1c44caf7f..cd99c9ad8 100644
--- a/llama_stack/providers/inline/eval/meta_reference/eval.py
+++ b/llama_stack/providers/inline/eval/meta_reference/eval.py
@@ -8,13 +8,13 @@ from typing import Any, Dict, List, Optional
from tqdm import tqdm
from llama_stack.apis.agents import Agents, StepType
+from llama_stack.apis.benchmarks import Benchmark
from llama_stack.apis.datasetio import DatasetIO
from llama_stack.apis.datasets import Datasets
-from llama_stack.apis.eval_tasks import EvalTask
from llama_stack.apis.inference import Inference, UserMessage
from llama_stack.apis.scoring import Scoring
from llama_stack.distribution.datatypes import Api
-from llama_stack.providers.datatypes import EvalTasksProtocolPrivate
+from llama_stack.providers.datatypes import BenchmarksProtocolPrivate
from llama_stack.providers.inline.agents.meta_reference.agent_instance import (
MEMORY_QUERY_TOOL,
)
@@ -26,15 +26,15 @@ from llama_stack.providers.utils.common.data_schema_validator import (
from llama_stack.providers.utils.kvstore import kvstore_impl
from .....apis.common.job_types import Job
-from .....apis.eval.eval import Eval, EvalTaskConfig, EvaluateResponse, JobStatus
+from .....apis.eval.eval import BenchmarkConfig, Eval, EvaluateResponse, JobStatus
from .config import MetaReferenceEvalConfig
-EVAL_TASKS_PREFIX = "eval_tasks:"
+EVAL_TASKS_PREFIX = "benchmarks:"
class MetaReferenceEvalImpl(
Eval,
- EvalTasksProtocolPrivate,
+ BenchmarksProtocolPrivate,
):
def __init__(
self,
@@ -55,36 +55,36 @@ class MetaReferenceEvalImpl(
# TODO: assume sync job, will need jobs API for async scheduling
self.jobs = {}
- self.eval_tasks = {}
+ self.benchmarks = {}
async def initialize(self) -> None:
self.kvstore = await kvstore_impl(self.config.kvstore)
- # Load existing eval_tasks from kvstore
+ # Load existing benchmarks from kvstore
start_key = EVAL_TASKS_PREFIX
end_key = f"{EVAL_TASKS_PREFIX}\xff"
- stored_eval_tasks = await self.kvstore.range(start_key, end_key)
+ stored_benchmarks = await self.kvstore.range(start_key, end_key)
- for eval_task in stored_eval_tasks:
- eval_task = EvalTask.model_validate_json(eval_task)
- self.eval_tasks[eval_task.identifier] = eval_task
+ for benchmark in stored_benchmarks:
+ benchmark = Benchmark.model_validate_json(benchmark)
+ self.benchmarks[benchmark.identifier] = benchmark
async def shutdown(self) -> None: ...
- async def register_eval_task(self, task_def: EvalTask) -> None:
+ async def register_benchmark(self, task_def: Benchmark) -> None:
# Store in kvstore
key = f"{EVAL_TASKS_PREFIX}{task_def.identifier}"
await self.kvstore.set(
key=key,
value=task_def.model_dump_json(),
)
- self.eval_tasks[task_def.identifier] = task_def
+ self.benchmarks[task_def.identifier] = task_def
async def run_eval(
self,
- task_id: str,
- task_config: EvalTaskConfig,
+ benchmark_id: str,
+ task_config: BenchmarkConfig,
) -> Job:
- task_def = self.eval_tasks[task_id]
+ task_def = self.benchmarks[benchmark_id]
dataset_id = task_def.dataset_id
candidate = task_config.eval_candidate
scoring_functions = task_def.scoring_functions
@@ -95,7 +95,7 @@ class MetaReferenceEvalImpl(
rows_in_page=(-1 if task_config.num_examples is None else task_config.num_examples),
)
res = await self.evaluate_rows(
- task_id=task_id,
+ benchmark_id=benchmark_id,
input_rows=all_rows.rows,
scoring_functions=scoring_functions,
task_config=task_config,
@@ -108,7 +108,7 @@ class MetaReferenceEvalImpl(
return Job(job_id=job_id)
async def _run_agent_generation(
- self, input_rows: List[Dict[str, Any]], task_config: EvalTaskConfig
+ self, input_rows: List[Dict[str, Any]], task_config: BenchmarkConfig
) -> List[Dict[str, Any]]:
candidate = task_config.eval_candidate
create_response = await self.agents_api.create_agent(candidate.config)
@@ -151,7 +151,7 @@ class MetaReferenceEvalImpl(
return generations
async def _run_model_generation(
- self, input_rows: List[Dict[str, Any]], task_config: EvalTaskConfig
+ self, input_rows: List[Dict[str, Any]], task_config: BenchmarkConfig
) -> List[Dict[str, Any]]:
candidate = task_config.eval_candidate
assert candidate.sampling_params.max_tokens is not None, "SamplingParams.max_tokens must be provided"
@@ -187,10 +187,10 @@ class MetaReferenceEvalImpl(
async def evaluate_rows(
self,
- task_id: str,
+ benchmark_id: str,
input_rows: List[Dict[str, Any]],
scoring_functions: List[str],
- task_config: EvalTaskConfig,
+ task_config: BenchmarkConfig,
) -> EvaluateResponse:
candidate = task_config.eval_candidate
if candidate.type == "agent":
@@ -203,7 +203,7 @@ class MetaReferenceEvalImpl(
# scoring with generated_answer
score_input_rows = [input_r | generated_r for input_r, generated_r in zip(input_rows, generations)]
- if task_config.type == "app" and task_config.scoring_params is not None:
+ if task_config.scoring_params is not None:
scoring_functions_dict = {
scoring_fn_id: task_config.scoring_params.get(scoring_fn_id, None)
for scoring_fn_id in scoring_functions
@@ -217,18 +217,60 @@ class MetaReferenceEvalImpl(
return EvaluateResponse(generations=generations, scores=score_response.results)
- async def job_status(self, task_id: str, job_id: str) -> Optional[JobStatus]:
+ async def job_status(self, benchmark_id: str, job_id: str) -> Optional[JobStatus]:
if job_id in self.jobs:
return JobStatus.completed
return None
- async def job_cancel(self, task_id: str, job_id: str) -> None:
+ async def job_cancel(self, benchmark_id: str, job_id: str) -> None:
raise NotImplementedError("Job cancel is not implemented yet")
- async def job_result(self, task_id: str, job_id: str) -> EvaluateResponse:
- status = await self.job_status(task_id, job_id)
+ async def job_result(self, benchmark_id: str, job_id: str) -> EvaluateResponse:
+ status = await self.job_status(benchmark_id, job_id)
if not status or status != JobStatus.completed:
raise ValueError(f"Job is not completed, Status: {status.value}")
return self.jobs[job_id]
+
+ async def DEPRECATED_run_eval(
+ self,
+ task_id: str,
+ task_config: BenchmarkConfig,
+ ) -> Job:
+ return await self.run_eval(benchmark_id=task_id, task_config=task_config)
+
+ async def DEPRECATED_evaluate_rows(
+ self,
+ task_id: str,
+ input_rows: List[Dict[str, Any]],
+ scoring_functions: List[str],
+ task_config: BenchmarkConfig,
+ ) -> EvaluateResponse:
+ return await self.evaluate_rows(
+ benchmark_id=task_id,
+ input_rows=input_rows,
+ scoring_functions=scoring_functions,
+ task_config=task_config,
+ )
+
+ async def DEPRECATED_job_status(
+ self,
+ task_id: str,
+ job_id: str,
+ ) -> Optional[JobStatus]:
+ return await self.job_status(benchmark_id=task_id, job_id=job_id)
+
+ async def DEPRECATED_job_cancel(
+ self,
+ task_id: str,
+ job_id: str,
+ ) -> None:
+ return await self.job_cancel(benchmark_id=task_id, job_id=job_id)
+
+ async def DEPRECATED_job_result(
+ self,
+ task_id: str,
+ job_id: str,
+ ) -> EvaluateResponse:
+ return await self.job_result(benchmark_id=task_id, job_id=job_id)
diff --git a/llama_stack/providers/tests/eval/test_eval.py b/llama_stack/providers/tests/eval/test_eval.py
index ec3d08728..ad80b8601 100644
--- a/llama_stack/providers/tests/eval/test_eval.py
+++ b/llama_stack/providers/tests/eval/test_eval.py
@@ -10,8 +10,8 @@ import pytest
from llama_stack.apis.common.content_types import URL
from llama_stack.apis.common.type_system import ChatCompletionInputType, StringType
from llama_stack.apis.eval.eval import (
- AppEvalTaskConfig,
- BenchmarkEvalTaskConfig,
+ AppBenchmarkConfig,
+ BenchmarkBenchmarkConfig,
ModelCandidate,
)
from llama_stack.apis.inference import SamplingParams
@@ -30,18 +30,18 @@ from .constants import JUDGE_PROMPT
class Testeval:
@pytest.mark.asyncio
- async def test_eval_tasks_list(self, eval_stack):
+ async def test_benchmarks_list(self, eval_stack):
# NOTE: this needs you to ensure that you are starting from a clean state
# but so far we don't have an unregister API unfortunately, so be careful
- eval_tasks_impl = eval_stack[Api.eval_tasks]
- response = await eval_tasks_impl.list_eval_tasks()
+ benchmarks_impl = eval_stack[Api.benchmarks]
+ response = await benchmarks_impl.list_benchmarks()
assert isinstance(response, list)
@pytest.mark.asyncio
async def test_eval_evaluate_rows(self, eval_stack, inference_model, judge_model):
- eval_impl, eval_tasks_impl, datasetio_impl, datasets_impl, models_impl = (
+ eval_impl, benchmarks_impl, datasetio_impl, datasets_impl, models_impl = (
eval_stack[Api.eval],
- eval_stack[Api.eval_tasks],
+ eval_stack[Api.benchmarks],
eval_stack[Api.datasetio],
eval_stack[Api.datasets],
eval_stack[Api.models],
@@ -59,17 +59,17 @@ class Testeval:
scoring_functions = [
"basic::equality",
]
- task_id = "meta-reference::app_eval"
- await eval_tasks_impl.register_eval_task(
- eval_task_id=task_id,
+ benchmark_id = "meta-reference::app_eval"
+ await benchmarks_impl.register_benchmark(
+ benchmark_id=benchmark_id,
dataset_id="test_dataset_for_eval",
scoring_functions=scoring_functions,
)
response = await eval_impl.evaluate_rows(
- task_id=task_id,
+ benchmark_id=benchmark_id,
input_rows=rows.rows,
scoring_functions=scoring_functions,
- task_config=AppEvalTaskConfig(
+ task_config=AppBenchmarkConfig(
eval_candidate=ModelCandidate(
model=inference_model,
sampling_params=SamplingParams(),
@@ -92,9 +92,9 @@ class Testeval:
@pytest.mark.asyncio
async def test_eval_run_eval(self, eval_stack, inference_model, judge_model):
- eval_impl, eval_tasks_impl, datasets_impl, models_impl = (
+ eval_impl, benchmarks_impl, datasets_impl, models_impl = (
eval_stack[Api.eval],
- eval_stack[Api.eval_tasks],
+ eval_stack[Api.benchmarks],
eval_stack[Api.datasets],
eval_stack[Api.models],
)
@@ -105,15 +105,15 @@ class Testeval:
"basic::subset_of",
]
- task_id = "meta-reference::app_eval-2"
- await eval_tasks_impl.register_eval_task(
- eval_task_id=task_id,
+ benchmark_id = "meta-reference::app_eval-2"
+ await benchmarks_impl.register_benchmark(
+ benchmark_id=benchmark_id,
dataset_id="test_dataset_for_eval",
scoring_functions=scoring_functions,
)
response = await eval_impl.run_eval(
- task_id=task_id,
- task_config=AppEvalTaskConfig(
+ benchmark_id=benchmark_id,
+ task_config=AppBenchmarkConfig(
eval_candidate=ModelCandidate(
model=inference_model,
sampling_params=SamplingParams(),
@@ -121,9 +121,9 @@ class Testeval:
),
)
assert response.job_id == "0"
- job_status = await eval_impl.job_status(task_id, response.job_id)
+ job_status = await eval_impl.job_status(benchmark_id, response.job_id)
assert job_status and job_status.value == "completed"
- eval_response = await eval_impl.job_result(task_id, response.job_id)
+ eval_response = await eval_impl.job_result(benchmark_id, response.job_id)
assert eval_response is not None
assert len(eval_response.generations) == 5
@@ -131,9 +131,9 @@ class Testeval:
@pytest.mark.asyncio
async def test_eval_run_benchmark_eval(self, eval_stack, inference_model):
- eval_impl, eval_tasks_impl, datasets_impl, models_impl = (
+ eval_impl, benchmarks_impl, datasets_impl, models_impl = (
eval_stack[Api.eval],
- eval_stack[Api.eval_tasks],
+ eval_stack[Api.benchmarks],
eval_stack[Api.datasets],
eval_stack[Api.models],
)
@@ -159,20 +159,20 @@ class Testeval:
)
# register eval task
- await eval_tasks_impl.register_eval_task(
- eval_task_id="meta-reference-mmlu",
+ await benchmarks_impl.register_benchmark(
+ benchmark_id="meta-reference-mmlu",
dataset_id="mmlu",
scoring_functions=["basic::regex_parser_multiple_choice_answer"],
)
# list benchmarks
- response = await eval_tasks_impl.list_eval_tasks()
+ response = await benchmarks_impl.list_benchmarks()
assert len(response) > 0
benchmark_id = "meta-reference-mmlu"
response = await eval_impl.run_eval(
- task_id=benchmark_id,
- task_config=BenchmarkEvalTaskConfig(
+ benchmark_id=benchmark_id,
+ task_config=BenchmarkBenchmarkConfig(
eval_candidate=ModelCandidate(
model=inference_model,
sampling_params=SamplingParams(),
diff --git a/llama_stack/providers/tests/resolver.py b/llama_stack/providers/tests/resolver.py
index 0ff632717..76343b7f4 100644
--- a/llama_stack/providers/tests/resolver.py
+++ b/llama_stack/providers/tests/resolver.py
@@ -10,8 +10,8 @@ from typing import Any, Dict, List, Optional
from pydantic import BaseModel
+from llama_stack.apis.benchmarks import BenchmarkInput
from llama_stack.apis.datasets import DatasetInput
-from llama_stack.apis.eval_tasks import EvalTaskInput
from llama_stack.apis.models import ModelInput
from llama_stack.apis.scoring_functions import ScoringFnInput
from llama_stack.apis.shields import ShieldInput
@@ -42,7 +42,7 @@ async def construct_stack_for_test(
vector_dbs: Optional[List[VectorDBInput]] = None,
datasets: Optional[List[DatasetInput]] = None,
scoring_fns: Optional[List[ScoringFnInput]] = None,
- eval_tasks: Optional[List[EvalTaskInput]] = None,
+ benchmarks: Optional[List[BenchmarkInput]] = None,
tool_groups: Optional[List[ToolGroupInput]] = None,
) -> TestStack:
sqlite_file = tempfile.NamedTemporaryFile(delete=False, suffix=".db")
@@ -56,7 +56,7 @@ async def construct_stack_for_test(
vector_dbs=vector_dbs or [],
datasets=datasets or [],
scoring_fns=scoring_fns or [],
- eval_tasks=eval_tasks or [],
+ benchmarks=benchmarks or [],
tool_groups=tool_groups or [],
)
run_config = parse_and_maybe_upgrade_config(run_config)
diff --git a/llama_stack/templates/bedrock/run.yaml b/llama_stack/templates/bedrock/run.yaml
index be6c9a928..7d03b7c29 100644
--- a/llama_stack/templates/bedrock/run.yaml
+++ b/llama_stack/templates/bedrock/run.yaml
@@ -107,7 +107,7 @@ shields: []
vector_dbs: []
datasets: []
scoring_fns: []
-eval_tasks: []
+benchmarks: []
tool_groups:
- toolgroup_id: builtin::websearch
provider_id: tavily-search
diff --git a/llama_stack/templates/cerebras/run.yaml b/llama_stack/templates/cerebras/run.yaml
index 05d3f4525..6afff2be2 100644
--- a/llama_stack/templates/cerebras/run.yaml
+++ b/llama_stack/templates/cerebras/run.yaml
@@ -109,7 +109,7 @@ shields: []
vector_dbs: []
datasets: []
scoring_fns: []
-eval_tasks: []
+benchmarks: []
tool_groups:
- toolgroup_id: builtin::websearch
provider_id: tavily-search
diff --git a/llama_stack/templates/dell/run-with-safety.yaml b/llama_stack/templates/dell/run-with-safety.yaml
index 04c5957d4..ddec3a715 100644
--- a/llama_stack/templates/dell/run-with-safety.yaml
+++ b/llama_stack/templates/dell/run-with-safety.yaml
@@ -108,7 +108,7 @@ shields:
vector_dbs: []
datasets: []
scoring_fns: []
-eval_tasks: []
+benchmarks: []
tool_groups:
- toolgroup_id: builtin::websearch
provider_id: brave-search
diff --git a/llama_stack/templates/dell/run.yaml b/llama_stack/templates/dell/run.yaml
index 706444eb1..9394c94ef 100644
--- a/llama_stack/templates/dell/run.yaml
+++ b/llama_stack/templates/dell/run.yaml
@@ -99,7 +99,7 @@ shields: []
vector_dbs: []
datasets: []
scoring_fns: []
-eval_tasks: []
+benchmarks: []
tool_groups:
- toolgroup_id: builtin::websearch
provider_id: brave-search
diff --git a/llama_stack/templates/experimental-post-training/run.yaml b/llama_stack/templates/experimental-post-training/run.yaml
index 75d103c9f..e70ccdd2d 100644
--- a/llama_stack/templates/experimental-post-training/run.yaml
+++ b/llama_stack/templates/experimental-post-training/run.yaml
@@ -85,4 +85,4 @@ shields: []
vector_dbs: []
datasets: []
scoring_fns: []
-eval_tasks: []
+benchmarks: []
diff --git a/llama_stack/templates/fireworks/run-with-safety.yaml b/llama_stack/templates/fireworks/run-with-safety.yaml
index 0fbe14a5a..8f95e9d59 100644
--- a/llama_stack/templates/fireworks/run-with-safety.yaml
+++ b/llama_stack/templates/fireworks/run-with-safety.yaml
@@ -164,7 +164,7 @@ shields:
vector_dbs: []
datasets: []
scoring_fns: []
-eval_tasks: []
+benchmarks: []
tool_groups:
- toolgroup_id: builtin::websearch
provider_id: tavily-search
diff --git a/llama_stack/templates/fireworks/run.yaml b/llama_stack/templates/fireworks/run.yaml
index ccf67dcbb..64229a5d8 100644
--- a/llama_stack/templates/fireworks/run.yaml
+++ b/llama_stack/templates/fireworks/run.yaml
@@ -153,7 +153,7 @@ shields:
vector_dbs: []
datasets: []
scoring_fns: []
-eval_tasks: []
+benchmarks: []
tool_groups:
- toolgroup_id: builtin::websearch
provider_id: tavily-search
diff --git a/llama_stack/templates/hf-endpoint/run-with-safety.yaml b/llama_stack/templates/hf-endpoint/run-with-safety.yaml
index f520a2fda..867d7a076 100644
--- a/llama_stack/templates/hf-endpoint/run-with-safety.yaml
+++ b/llama_stack/templates/hf-endpoint/run-with-safety.yaml
@@ -116,7 +116,7 @@ shields:
vector_dbs: []
datasets: []
scoring_fns: []
-eval_tasks: []
+benchmarks: []
tool_groups:
- toolgroup_id: builtin::websearch
provider_id: tavily-search
diff --git a/llama_stack/templates/hf-endpoint/run.yaml b/llama_stack/templates/hf-endpoint/run.yaml
index 708cb1bcc..d60acdefd 100644
--- a/llama_stack/templates/hf-endpoint/run.yaml
+++ b/llama_stack/templates/hf-endpoint/run.yaml
@@ -106,7 +106,7 @@ shields: []
vector_dbs: []
datasets: []
scoring_fns: []
-eval_tasks: []
+benchmarks: []
tool_groups:
- toolgroup_id: builtin::websearch
provider_id: tavily-search
diff --git a/llama_stack/templates/hf-serverless/run-with-safety.yaml b/llama_stack/templates/hf-serverless/run-with-safety.yaml
index 7f0abf5be..e58ad15b3 100644
--- a/llama_stack/templates/hf-serverless/run-with-safety.yaml
+++ b/llama_stack/templates/hf-serverless/run-with-safety.yaml
@@ -116,7 +116,7 @@ shields:
vector_dbs: []
datasets: []
scoring_fns: []
-eval_tasks: []
+benchmarks: []
tool_groups:
- toolgroup_id: builtin::websearch
provider_id: tavily-search
diff --git a/llama_stack/templates/hf-serverless/run.yaml b/llama_stack/templates/hf-serverless/run.yaml
index c0b7a4c60..5045e821a 100644
--- a/llama_stack/templates/hf-serverless/run.yaml
+++ b/llama_stack/templates/hf-serverless/run.yaml
@@ -106,7 +106,7 @@ shields: []
vector_dbs: []
datasets: []
scoring_fns: []
-eval_tasks: []
+benchmarks: []
tool_groups:
- toolgroup_id: builtin::websearch
provider_id: tavily-search
diff --git a/llama_stack/templates/meta-reference-gpu/run-with-safety.yaml b/llama_stack/templates/meta-reference-gpu/run-with-safety.yaml
index c5286fc6b..caac65c8c 100644
--- a/llama_stack/templates/meta-reference-gpu/run-with-safety.yaml
+++ b/llama_stack/templates/meta-reference-gpu/run-with-safety.yaml
@@ -118,7 +118,7 @@ shields:
vector_dbs: []
datasets: []
scoring_fns: []
-eval_tasks: []
+benchmarks: []
tool_groups:
- toolgroup_id: builtin::websearch
provider_id: tavily-search
diff --git a/llama_stack/templates/meta-reference-gpu/run.yaml b/llama_stack/templates/meta-reference-gpu/run.yaml
index 310585f23..bade9a076 100644
--- a/llama_stack/templates/meta-reference-gpu/run.yaml
+++ b/llama_stack/templates/meta-reference-gpu/run.yaml
@@ -107,7 +107,7 @@ shields: []
vector_dbs: []
datasets: []
scoring_fns: []
-eval_tasks: []
+benchmarks: []
tool_groups:
- toolgroup_id: builtin::websearch
provider_id: tavily-search
diff --git a/llama_stack/templates/meta-reference-quantized-gpu/run.yaml b/llama_stack/templates/meta-reference-quantized-gpu/run.yaml
index d43cf3917..f131e8ea6 100644
--- a/llama_stack/templates/meta-reference-quantized-gpu/run.yaml
+++ b/llama_stack/templates/meta-reference-quantized-gpu/run.yaml
@@ -109,7 +109,7 @@ shields: []
vector_dbs: []
datasets: []
scoring_fns: []
-eval_tasks: []
+benchmarks: []
tool_groups:
- toolgroup_id: builtin::websearch
provider_id: tavily-search
diff --git a/llama_stack/templates/nvidia/run.yaml b/llama_stack/templates/nvidia/run.yaml
index c8ae362f5..14fb28354 100644
--- a/llama_stack/templates/nvidia/run.yaml
+++ b/llama_stack/templates/nvidia/run.yaml
@@ -139,7 +139,7 @@ shields: []
vector_dbs: []
datasets: []
scoring_fns: []
-eval_tasks: []
+benchmarks: []
tool_groups:
- toolgroup_id: builtin::websearch
provider_id: tavily-search
diff --git a/llama_stack/templates/ollama/run-with-safety.yaml b/llama_stack/templates/ollama/run-with-safety.yaml
index ac5dab755..9d5bfc7a0 100644
--- a/llama_stack/templates/ollama/run-with-safety.yaml
+++ b/llama_stack/templates/ollama/run-with-safety.yaml
@@ -113,7 +113,7 @@ shields:
vector_dbs: []
datasets: []
scoring_fns: []
-eval_tasks: []
+benchmarks: []
tool_groups:
- toolgroup_id: builtin::websearch
provider_id: tavily-search
diff --git a/llama_stack/templates/ollama/run.yaml b/llama_stack/templates/ollama/run.yaml
index 3a60fe61f..9ac1f3267 100644
--- a/llama_stack/templates/ollama/run.yaml
+++ b/llama_stack/templates/ollama/run.yaml
@@ -110,7 +110,7 @@ shields: []
vector_dbs: []
datasets: []
scoring_fns: []
-eval_tasks: []
+benchmarks: []
tool_groups:
- toolgroup_id: builtin::websearch
provider_id: tavily-search
diff --git a/llama_stack/templates/remote-vllm/run-with-safety.yaml b/llama_stack/templates/remote-vllm/run-with-safety.yaml
index 1fe998a1f..dd43f21f6 100644
--- a/llama_stack/templates/remote-vllm/run-with-safety.yaml
+++ b/llama_stack/templates/remote-vllm/run-with-safety.yaml
@@ -118,7 +118,7 @@ shields:
vector_dbs: []
datasets: []
scoring_fns: []
-eval_tasks: []
+benchmarks: []
tool_groups:
- toolgroup_id: builtin::websearch
provider_id: tavily-search
diff --git a/llama_stack/templates/remote-vllm/run.yaml b/llama_stack/templates/remote-vllm/run.yaml
index 9d3db8a31..24cd207c7 100644
--- a/llama_stack/templates/remote-vllm/run.yaml
+++ b/llama_stack/templates/remote-vllm/run.yaml
@@ -107,7 +107,7 @@ shields: []
vector_dbs: []
datasets: []
scoring_fns: []
-eval_tasks: []
+benchmarks: []
tool_groups:
- toolgroup_id: builtin::websearch
provider_id: tavily-search
diff --git a/llama_stack/templates/sambanova/run.yaml b/llama_stack/templates/sambanova/run.yaml
index 39b0f3c4e..26815dcd0 100644
--- a/llama_stack/templates/sambanova/run.yaml
+++ b/llama_stack/templates/sambanova/run.yaml
@@ -118,7 +118,7 @@ shields:
vector_dbs: []
datasets: []
scoring_fns: []
-eval_tasks: []
+benchmarks: []
tool_groups:
- toolgroup_id: builtin::websearch
provider_id: tavily-search
diff --git a/llama_stack/templates/tgi/run-with-safety.yaml b/llama_stack/templates/tgi/run-with-safety.yaml
index ed6c9ef6f..e1d85f59a 100644
--- a/llama_stack/templates/tgi/run-with-safety.yaml
+++ b/llama_stack/templates/tgi/run-with-safety.yaml
@@ -106,7 +106,7 @@ shields:
vector_dbs: []
datasets: []
scoring_fns: []
-eval_tasks: []
+benchmarks: []
tool_groups:
- toolgroup_id: builtin::websearch
provider_id: tavily-search
diff --git a/llama_stack/templates/tgi/run.yaml b/llama_stack/templates/tgi/run.yaml
index 8bf76f37b..fc73e0978 100644
--- a/llama_stack/templates/tgi/run.yaml
+++ b/llama_stack/templates/tgi/run.yaml
@@ -105,7 +105,7 @@ shields: []
vector_dbs: []
datasets: []
scoring_fns: []
-eval_tasks: []
+benchmarks: []
tool_groups:
- toolgroup_id: builtin::websearch
provider_id: tavily-search
diff --git a/llama_stack/templates/together/run-with-safety.yaml b/llama_stack/templates/together/run-with-safety.yaml
index 298926630..f101a5d60 100644
--- a/llama_stack/templates/together/run-with-safety.yaml
+++ b/llama_stack/templates/together/run-with-safety.yaml
@@ -159,7 +159,7 @@ shields:
vector_dbs: []
datasets: []
scoring_fns: []
-eval_tasks: []
+benchmarks: []
tool_groups:
- toolgroup_id: builtin::websearch
provider_id: tavily-search
diff --git a/llama_stack/templates/together/run.yaml b/llama_stack/templates/together/run.yaml
index 920003759..8af85979d 100644
--- a/llama_stack/templates/together/run.yaml
+++ b/llama_stack/templates/together/run.yaml
@@ -148,7 +148,7 @@ shields:
vector_dbs: []
datasets: []
scoring_fns: []
-eval_tasks: []
+benchmarks: []
tool_groups:
- toolgroup_id: builtin::websearch
provider_id: tavily-search
diff --git a/llama_stack/templates/vllm-gpu/run.yaml b/llama_stack/templates/vllm-gpu/run.yaml
index 41a545e1a..cdce5510d 100644
--- a/llama_stack/templates/vllm-gpu/run.yaml
+++ b/llama_stack/templates/vllm-gpu/run.yaml
@@ -109,7 +109,7 @@ shields: []
vector_dbs: []
datasets: []
scoring_fns: []
-eval_tasks: []
+benchmarks: []
tool_groups:
- toolgroup_id: builtin::websearch
provider_id: tavily-search