diff --git a/docs/_static/llama-stack-spec.html b/docs/_static/llama-stack-spec.html index 98270f7b8..b93f6a380 100644 --- a/docs/_static/llama-stack-spec.html +++ b/docs/_static/llama-stack-spec.html @@ -40,6 +40,286 @@ } ], "paths": { + "/v1/eval/tasks/{task_id}/evaluations": { + "post": { + "responses": { + "200": { + "description": "OK", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/EvaluateResponse" + } + } + } + } + }, + "tags": [ + "Eval" + ], + "description": "", + "parameters": [ + { + "name": "task_id", + "in": "path", + "required": true, + "schema": { + "type": "string" + } + } + ], + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/DeprecatedEvaluateRowsRequest" + } + } + }, + "required": true + }, + "deprecated": true + } + }, + "/v1/eval-tasks/{task_id}": { + "get": { + "responses": { + "200": { + "description": "OK", + "content": { + "application/json": { + "schema": { + "oneOf": [ + { + "$ref": "#/components/schemas/Benchmark" + }, + { + "type": "null" + } + ] + } + } + } + } + }, + "tags": [ + "Benchmarks" + ], + "description": "", + "parameters": [ + { + "name": "eval_task_id", + "in": "query", + "required": true, + "schema": { + "type": "string" + } + } + ], + "deprecated": true + } + }, + "/v1/eval/tasks/{task_id}/jobs/{job_id}": { + "get": { + "responses": { + "200": { + "description": "OK", + "content": { + "application/json": { + "schema": { + "oneOf": [ + { + "$ref": "#/components/schemas/JobStatus" + }, + { + "type": "null" + } + ] + } + } + } + } + }, + "tags": [ + "Eval" + ], + "description": "", + "parameters": [ + { + "name": "task_id", + "in": "path", + "required": true, + "schema": { + "type": "string" + } + }, + { + "name": "job_id", + "in": "path", + "required": true, + "schema": { + "type": "string" + } + } + ], + "deprecated": true + }, + "delete": { + "responses": { + "200": { + "description": "OK" + } + }, + "tags": [ + "Eval" + ], + "description": "", + "parameters": [ + { + "name": "task_id", + "in": "path", + "required": true, + "schema": { + "type": "string" + } + }, + { + "name": "job_id", + "in": "path", + "required": true, + "schema": { + "type": "string" + } + } + ], + "deprecated": true + } + }, + "/v1/eval/tasks/{task_id}/jobs/{job_id}/result": { + "get": { + "responses": { + "200": { + "description": "OK", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/EvaluateResponse" + } + } + } + } + }, + "tags": [ + "Eval" + ], + "description": "", + "parameters": [ + { + "name": "task_id", + "in": "path", + "required": true, + "schema": { + "type": "string" + } + }, + { + "name": "job_id", + "in": "path", + "required": true, + "schema": { + "type": "string" + } + } + ], + "deprecated": true + } + }, + "/v1/eval-tasks": { + "get": { + "responses": { + "200": { + "description": "OK", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/ListBenchmarksResponse" + } + } + } + } + }, + "tags": [ + "Benchmarks" + ], + "description": "", + "parameters": [], + "deprecated": true + }, + "post": { + "responses": { + "200": { + "description": "OK" + } + }, + "tags": [ + "Benchmarks" + ], + "description": "", + "parameters": [], + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/DeprecatedRegisterEvalTaskRequest" + } + } + }, + "required": true + }, + "deprecated": true + } + }, + "/v1/eval/tasks/{task_id}/jobs": { + "post": { + "responses": { + "200": { 
+ "description": "OK", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/Job" + } + } + } + } + }, + "tags": [ + "Eval" + ], + "description": "", + "parameters": [ + { + "name": "task_id", + "in": "path", + "required": true, + "schema": { + "type": "string" + } + } + ], + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/DeprecatedRunEvalRequest" + } + } + }, + "required": true + }, + "deprecated": true + } + }, "/v1/datasetio/rows": { "get": { "responses": { @@ -530,7 +810,7 @@ } } }, - "/v1/eval/tasks/{task_id}/evaluations": { + "/v1/eval/benchmarks/{benchmark_id}/evaluations": { "post": { "responses": { "200": { @@ -550,7 +830,7 @@ "description": "", "parameters": [ { - "name": "task_id", + "name": "benchmark_id", "in": "path", "required": true, "schema": { @@ -670,6 +950,43 @@ ] } }, + "/v1/eval/benchmarks/{benchmark_id}": { + "get": { + "responses": { + "200": { + "description": "OK", + "content": { + "application/json": { + "schema": { + "oneOf": [ + { + "$ref": "#/components/schemas/Benchmark" + }, + { + "type": "null" + } + ] + } + } + } + } + }, + "tags": [ + "Benchmarks" + ], + "description": "", + "parameters": [ + { + "name": "benchmark_id", + "in": "path", + "required": true, + "schema": { + "type": "string" + } + } + ] + } + }, "/v1/datasets/{dataset_id}": { "get": { "responses": { @@ -728,43 +1045,6 @@ ] } }, - "/v1/eval-tasks/{eval_task_id}": { - "get": { - "responses": { - "200": { - "description": "OK", - "content": { - "application/json": { - "schema": { - "oneOf": [ - { - "$ref": "#/components/schemas/EvalTask" - }, - { - "type": "null" - } - ] - } - } - } - } - }, - "tags": [ - "EvalTasks" - ], - "description": "", - "parameters": [ - { - "name": "eval_task_id", - "in": "path", - "required": true, - "schema": { - "type": "string" - } - } - ] - } - }, "/v1/models/{model_id}": { "get": { "responses": { @@ -1348,7 +1628,7 @@ } } }, - "/v1/eval/tasks/{task_id}/jobs/{job_id}": { + "/v1/eval/benchmarks/{benchmark_id}/jobs/{job_id}": { "get": { "responses": { "200": { @@ -1375,7 +1655,7 @@ "description": "", "parameters": [ { - "name": "task_id", + "name": "benchmark_id", "in": "path", "required": true, "schema": { @@ -1404,7 +1684,7 @@ "description": "", "parameters": [ { - "name": "task_id", + "name": "benchmark_id", "in": "path", "required": true, "schema": { @@ -1422,7 +1702,7 @@ ] } }, - "/v1/eval/tasks/{task_id}/jobs/{job_id}/result": { + "/v1/eval/benchmarks/{benchmark_id}/jobs/{job_id}/result": { "get": { "responses": { "200": { @@ -1442,7 +1722,7 @@ "description": "", "parameters": [ { - "name": "job_id", + "name": "benchmark_id", "in": "path", "required": true, "schema": { @@ -1450,7 +1730,7 @@ } }, { - "name": "task_id", + "name": "job_id", "in": "path", "required": true, "schema": { @@ -1460,6 +1740,49 @@ ] } }, + "/v1/eval/benchmarks": { + "get": { + "responses": { + "200": { + "description": "OK", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/ListBenchmarksResponse" + } + } + } + } + }, + "tags": [ + "Benchmarks" + ], + "description": "", + "parameters": [] + }, + "post": { + "responses": { + "200": { + "description": "OK" + } + }, + "tags": [ + "Benchmarks" + ], + "description": "", + "parameters": [], + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/RegisterBenchmarkRequest" + } + } + }, + "required": true + } + } + }, "/v1/datasets": { "get": { "responses": { @@ -1503,49 +1826,6 @@ 
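For orientation: the hunks above re-add the old `/v1/eval/tasks/...` and `/v1/eval-tasks` routes as deprecated aliases and introduce the renamed `/v1/eval/benchmarks/...` routes. The sketch below is illustrative only (not part of the generated spec): it shows what a call to the renamed synchronous evaluation endpoint might look like. The server address, benchmark id, model id, and scoring-function id are placeholders, and the request body follows the `EvaluateRowsRequest` / `BenchmarkConfig` schemas that appear later in this diff; the exact `SamplingParams` shape is not shown in this excerpt and is assumed.

```python
# Illustrative sketch only -- not part of the generated spec.
# Assumes a Llama Stack server at BASE_URL; all identifiers are placeholders.
import requests

BASE_URL = "http://localhost:8321"      # assumed server address
BENCHMARK_ID = "my-benchmark"           # placeholder benchmark id

body = {
    "input_rows": [{"input_query": "What is 2 + 2?", "expected_answer": "4"}],
    "scoring_functions": ["basic::equality"],          # placeholder scoring fn id
    "task_config": {                                   # BenchmarkConfig (replaces EvalTaskConfig)
        "type": "benchmark",
        "eval_candidate": {                            # ModelCandidate
            "type": "model",
            "model": "my-model",                       # placeholder model id
            "sampling_params": {"strategy": {"type": "greedy"}},  # assumed shape
        },
        "scoring_params": {},
    },
}

# New route introduced in this diff:
resp = requests.post(
    f"{BASE_URL}/v1/eval/benchmarks/{BENCHMARK_ID}/evaluations", json=body
)
print(resp.json())  # EvaluateResponse: {"generations": [...], "scores": {...}}

# The old route remains as a deprecated alias:
#   POST /v1/eval/tasks/{task_id}/evaluations  (DeprecatedEvaluateRowsRequest)
```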
} } }, - "/v1/eval-tasks": { - "get": { - "responses": { - "200": { - "description": "OK", - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/ListEvalTasksResponse" - } - } - } - } - }, - "tags": [ - "EvalTasks" - ], - "description": "", - "parameters": [] - }, - "post": { - "responses": { - "200": { - "description": "OK" - } - }, - "tags": [ - "EvalTasks" - ], - "description": "", - "parameters": [], - "requestBody": { - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/RegisterEvalTaskRequest" - } - } - }, - "required": true - } - } - }, "/v1/models": { "get": { "responses": { @@ -2121,7 +2401,7 @@ ] } }, - "/v1/eval/tasks/{task_id}/jobs": { + "/v1/eval/benchmarks/{benchmark_id}/jobs": { "post": { "responses": { "200": { @@ -2141,7 +2421,7 @@ "description": "", "parameters": [ { - "name": "task_id", + "name": "benchmark_id", "in": "path", "required": true, "schema": { @@ -2365,84 +2645,216 @@ "jsonSchemaDialect": "https://json-schema.org/draft/2020-12/schema", "components": { "schemas": { - "AppendRowsRequest": { + "AgentCandidate": { "type": "object", "properties": { - "dataset_id": { - "type": "string" + "type": { + "type": "string", + "const": "agent", + "default": "agent" }, - "rows": { + "config": { + "$ref": "#/components/schemas/AgentConfig" + } + }, + "additionalProperties": false, + "required": [ + "type", + "config" + ] + }, + "AgentConfig": { + "type": "object", + "properties": { + "sampling_params": { + "$ref": "#/components/schemas/SamplingParams" + }, + "input_shields": { "type": "array", "items": { - "type": "object", - "additionalProperties": { - "oneOf": [ - { - "type": "null" - }, - { - "type": "boolean" - }, - { - "type": "number" - }, - { - "type": "string" - }, - { - "type": "array" - }, - { - "type": "object" - } - ] + "type": "string" + } + }, + "output_shields": { + "type": "array", + "items": { + "type": "string" + } + }, + "toolgroups": { + "type": "array", + "items": { + "$ref": "#/components/schemas/AgentTool" + } + }, + "client_tools": { + "type": "array", + "items": { + "$ref": "#/components/schemas/ToolDef" + } + }, + "tool_choice": { + "type": "string", + "enum": [ + "auto", + "required" + ], + "description": "Whether tool use is required or automatic. This is a hint to the model which may not be followed. It depends on the Instruction Following capabilities of the model." + }, + "tool_prompt_format": { + "type": "string", + "enum": [ + "json", + "function_tag", + "python_list" + ], + "description": "Prompt format for calling custom / zero shot tools." 
+ }, + "tool_config": { + "$ref": "#/components/schemas/ToolConfig" + }, + "max_infer_iters": { + "type": "integer", + "default": 10 + }, + "model": { + "type": "string" + }, + "instructions": { + "type": "string" + }, + "enable_session_persistence": { + "type": "boolean" + }, + "response_format": { + "$ref": "#/components/schemas/ResponseFormat" + } + }, + "additionalProperties": false, + "required": [ + "model", + "instructions", + "enable_session_persistence" + ] + }, + "AgentTool": { + "oneOf": [ + { + "type": "string" + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "args": { + "type": "object", + "additionalProperties": { + "oneOf": [ + { + "type": "null" + }, + { + "type": "boolean" + }, + { + "type": "number" + }, + { + "type": "string" + }, + { + "type": "array" + }, + { + "type": "object" + } + ] + } } + }, + "additionalProperties": false, + "required": [ + "name", + "args" + ] + } + ] + }, + "AggregationFunctionType": { + "type": "string", + "enum": [ + "average", + "median", + "categorical_count", + "accuracy" + ] + }, + "BasicScoringFnParams": { + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "basic", + "default": "basic" + }, + "aggregation_functions": { + "type": "array", + "items": { + "$ref": "#/components/schemas/AggregationFunctionType" } } }, "additionalProperties": false, "required": [ - "dataset_id", - "rows" + "type" ] }, - "CompletionMessage": { + "BenchmarkConfig": { "type": "object", "properties": { - "role": { + "type": { "type": "string", - "const": "assistant", - "default": "assistant", - "description": "Must be \"assistant\" to identify this as the model's response" + "const": "benchmark", + "default": "benchmark" }, - "content": { - "$ref": "#/components/schemas/InterleavedContent", - "description": "The content of the model's response" + "eval_candidate": { + "$ref": "#/components/schemas/EvalCandidate" }, - "stop_reason": { - "type": "string", - "enum": [ - "end_of_turn", - "end_of_message", - "out_of_tokens" - ], - "description": "Reason why the model stopped generating. Options are: - `StopReason.end_of_turn`: The model finished generating the entire response. - `StopReason.end_of_message`: The model finished generating but generated a partial response -- usually, a tool call. The user may call the tool and continue the conversation with the tool's response. - `StopReason.out_of_tokens`: The model ran out of token budget." + "scoring_params": { + "type": "object", + "additionalProperties": { + "$ref": "#/components/schemas/ScoringFnParams" + } }, - "tool_calls": { - "type": "array", - "items": { - "$ref": "#/components/schemas/ToolCall" - }, - "description": "List of tool calls. Each tool call is a ToolCall object." + "num_examples": { + "type": "integer" } }, "additionalProperties": false, "required": [ - "role", - "content", - "stop_reason" + "type", + "eval_candidate", + "scoring_params" + ] + }, + "EvalCandidate": { + "oneOf": [ + { + "$ref": "#/components/schemas/ModelCandidate" + }, + { + "$ref": "#/components/schemas/AgentCandidate" + } ], - "description": "A message containing the model's (assistant) response in a chat conversation." + "discriminator": { + "propertyName": "type", + "mapping": { + "model": "#/components/schemas/ModelCandidate", + "agent": "#/components/schemas/AgentCandidate" + } + } }, "GrammarResponseFormat": { "type": "object", @@ -2610,30 +3022,89 @@ ], "description": "Configuration for JSON schema-guided response generation." 
}, - "Message": { - "oneOf": [ - { - "$ref": "#/components/schemas/UserMessage" + "LLMAsJudgeScoringFnParams": { + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "llm_as_judge", + "default": "llm_as_judge" }, - { + "judge_model": { + "type": "string" + }, + "prompt_template": { + "type": "string" + }, + "judge_score_regexes": { + "type": "array", + "items": { + "type": "string" + } + }, + "aggregation_functions": { + "type": "array", + "items": { + "$ref": "#/components/schemas/AggregationFunctionType" + } + } + }, + "additionalProperties": false, + "required": [ + "type", + "judge_model" + ] + }, + "ModelCandidate": { + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "model", + "default": "model" + }, + "model": { + "type": "string" + }, + "sampling_params": { + "$ref": "#/components/schemas/SamplingParams" + }, + "system_message": { "$ref": "#/components/schemas/SystemMessage" - }, - { - "$ref": "#/components/schemas/ToolResponseMessage" - }, - { - "$ref": "#/components/schemas/CompletionMessage" } - ], - "discriminator": { - "propertyName": "role", - "mapping": { - "user": "#/components/schemas/UserMessage", - "system": "#/components/schemas/SystemMessage", - "tool": "#/components/schemas/ToolResponseMessage", - "assistant": "#/components/schemas/CompletionMessage" + }, + "additionalProperties": false, + "required": [ + "type", + "model", + "sampling_params" + ] + }, + "RegexParserScoringFnParams": { + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "regex_parser", + "default": "regex_parser" + }, + "parsing_regexes": { + "type": "array", + "items": { + "type": "string" + } + }, + "aggregation_functions": { + "type": "array", + "items": { + "$ref": "#/components/schemas/AggregationFunctionType" + } } - } + }, + "additionalProperties": false, + "required": [ + "type" + ] }, "ResponseFormat": { "oneOf": [ @@ -2693,6 +3164,27 @@ } } }, + "ScoringFnParams": { + "oneOf": [ + { + "$ref": "#/components/schemas/LLMAsJudgeScoringFnParams" + }, + { + "$ref": "#/components/schemas/RegexParserScoringFnParams" + }, + { + "$ref": "#/components/schemas/BasicScoringFnParams" + } + ], + "discriminator": { + "propertyName": "type", + "mapping": { + "llm_as_judge": "#/components/schemas/LLMAsJudgeScoringFnParams", + "regex_parser": "#/components/schemas/RegexParserScoringFnParams", + "basic": "#/components/schemas/BasicScoringFnParams" + } + } + }, "SystemMessage": { "type": "object", "properties": { @@ -2735,6 +3227,611 @@ ], "description": "A text content item" }, + "ToolConfig": { + "type": "object", + "properties": { + "tool_choice": { + "type": "string", + "enum": [ + "auto", + "required" + ], + "description": "(Optional) Whether tool use is required or automatic. Defaults to ToolChoice.auto.", + "default": "auto" + }, + "tool_prompt_format": { + "type": "string", + "enum": [ + "json", + "function_tag", + "python_list" + ], + "description": "(Optional) Instructs the model how to format tool calls. By default, Llama Stack will attempt to use a format that is best adapted to the model. - `ToolPromptFormat.json`: The tool calls are formatted as a JSON object. - `ToolPromptFormat.function_tag`: The tool calls are enclosed in a tag. - `ToolPromptFormat.python_list`: The tool calls are output as Python syntax -- a list of function calls." 
+ }, + "system_message_behavior": { + "type": "string", + "enum": [ + "append", + "replace" + ], + "description": "(Optional) Config for how to override the default system prompt. - `SystemMessageBehavior.append`: Appends the provided system message to the default system prompt. - `SystemMessageBehavior.replace`: Replaces the default system prompt with the provided system message. The system message can include the string '{{function_definitions}}' to indicate where the function definitions should be inserted.", + "default": "append" + } + }, + "additionalProperties": false, + "required": [ + "system_message_behavior" + ], + "description": "Configuration for tool use." + }, + "ToolDef": { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "description": { + "type": "string" + }, + "parameters": { + "type": "array", + "items": { + "$ref": "#/components/schemas/ToolParameter" + } + }, + "metadata": { + "type": "object", + "additionalProperties": { + "oneOf": [ + { + "type": "null" + }, + { + "type": "boolean" + }, + { + "type": "number" + }, + { + "type": "string" + }, + { + "type": "array" + }, + { + "type": "object" + } + ] + } + } + }, + "additionalProperties": false, + "required": [ + "name" + ] + }, + "ToolParameter": { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "parameter_type": { + "type": "string" + }, + "description": { + "type": "string" + }, + "required": { + "type": "boolean", + "default": true + }, + "default": { + "oneOf": [ + { + "type": "null" + }, + { + "type": "boolean" + }, + { + "type": "number" + }, + { + "type": "string" + }, + { + "type": "array" + }, + { + "type": "object" + } + ] + } + }, + "additionalProperties": false, + "required": [ + "name", + "parameter_type", + "description", + "required" + ] + }, + "TopKSamplingStrategy": { + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "top_k", + "default": "top_k" + }, + "top_k": { + "type": "integer" + } + }, + "additionalProperties": false, + "required": [ + "type", + "top_k" + ] + }, + "TopPSamplingStrategy": { + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "top_p", + "default": "top_p" + }, + "temperature": { + "type": "number" + }, + "top_p": { + "type": "number", + "default": 0.95 + } + }, + "additionalProperties": false, + "required": [ + "type" + ] + }, + "URL": { + "type": "object", + "properties": { + "uri": { + "type": "string" + } + }, + "additionalProperties": false, + "required": [ + "uri" + ] + }, + "DeprecatedEvaluateRowsRequest": { + "type": "object", + "properties": { + "input_rows": { + "type": "array", + "items": { + "type": "object", + "additionalProperties": { + "oneOf": [ + { + "type": "null" + }, + { + "type": "boolean" + }, + { + "type": "number" + }, + { + "type": "string" + }, + { + "type": "array" + }, + { + "type": "object" + } + ] + } + } + }, + "scoring_functions": { + "type": "array", + "items": { + "type": "string" + } + }, + "task_config": { + "$ref": "#/components/schemas/BenchmarkConfig" + } + }, + "additionalProperties": false, + "required": [ + "input_rows", + "scoring_functions", + "task_config" + ] + }, + "EvaluateResponse": { + "type": "object", + "properties": { + "generations": { + "type": "array", + "items": { + "type": "object", + "additionalProperties": { + "oneOf": [ + { + "type": "null" + }, + { + "type": "boolean" + }, + { + "type": "number" + }, + { + "type": "string" + }, + { + "type": "array" + }, + { + "type": "object" + } + ] + } + } + }, + 
"scores": { + "type": "object", + "additionalProperties": { + "$ref": "#/components/schemas/ScoringResult" + } + } + }, + "additionalProperties": false, + "required": [ + "generations", + "scores" + ] + }, + "ScoringResult": { + "type": "object", + "properties": { + "score_rows": { + "type": "array", + "items": { + "type": "object", + "additionalProperties": { + "oneOf": [ + { + "type": "null" + }, + { + "type": "boolean" + }, + { + "type": "number" + }, + { + "type": "string" + }, + { + "type": "array" + }, + { + "type": "object" + } + ] + } + } + }, + "aggregated_results": { + "type": "object", + "additionalProperties": { + "oneOf": [ + { + "type": "null" + }, + { + "type": "boolean" + }, + { + "type": "number" + }, + { + "type": "string" + }, + { + "type": "array" + }, + { + "type": "object" + } + ] + } + } + }, + "additionalProperties": false, + "required": [ + "score_rows", + "aggregated_results" + ] + }, + "Benchmark": { + "type": "object", + "properties": { + "identifier": { + "type": "string" + }, + "provider_resource_id": { + "type": "string" + }, + "provider_id": { + "type": "string" + }, + "type": { + "type": "string", + "const": "benchmark", + "default": "benchmark" + }, + "dataset_id": { + "type": "string" + }, + "scoring_functions": { + "type": "array", + "items": { + "type": "string" + } + }, + "metadata": { + "type": "object", + "additionalProperties": { + "oneOf": [ + { + "type": "null" + }, + { + "type": "boolean" + }, + { + "type": "number" + }, + { + "type": "string" + }, + { + "type": "array" + }, + { + "type": "object" + } + ] + } + } + }, + "additionalProperties": false, + "required": [ + "identifier", + "provider_resource_id", + "provider_id", + "type", + "dataset_id", + "scoring_functions", + "metadata" + ] + }, + "JobStatus": { + "type": "string", + "enum": [ + "completed", + "in_progress", + "failed", + "scheduled" + ] + }, + "ListBenchmarksResponse": { + "type": "object", + "properties": { + "data": { + "type": "array", + "items": { + "$ref": "#/components/schemas/Benchmark" + } + } + }, + "additionalProperties": false, + "required": [ + "data" + ] + }, + "DeprecatedRegisterEvalTaskRequest": { + "type": "object", + "properties": { + "eval_task_id": { + "type": "string" + }, + "dataset_id": { + "type": "string" + }, + "scoring_functions": { + "type": "array", + "items": { + "type": "string" + } + }, + "provider_benchmark_id": { + "type": "string" + }, + "provider_id": { + "type": "string" + }, + "metadata": { + "type": "object", + "additionalProperties": { + "oneOf": [ + { + "type": "null" + }, + { + "type": "boolean" + }, + { + "type": "number" + }, + { + "type": "string" + }, + { + "type": "array" + }, + { + "type": "object" + } + ] + } + } + }, + "additionalProperties": false, + "required": [ + "eval_task_id", + "dataset_id", + "scoring_functions" + ] + }, + "DeprecatedRunEvalRequest": { + "type": "object", + "properties": { + "task_config": { + "$ref": "#/components/schemas/BenchmarkConfig" + } + }, + "additionalProperties": false, + "required": [ + "task_config" + ] + }, + "Job": { + "type": "object", + "properties": { + "job_id": { + "type": "string" + } + }, + "additionalProperties": false, + "required": [ + "job_id" + ] + }, + "AppendRowsRequest": { + "type": "object", + "properties": { + "dataset_id": { + "type": "string" + }, + "rows": { + "type": "array", + "items": { + "type": "object", + "additionalProperties": { + "oneOf": [ + { + "type": "null" + }, + { + "type": "boolean" + }, + { + "type": "number" + }, + { + "type": "string" + }, + { + "type": 
"array" + }, + { + "type": "object" + } + ] + } + } + } + }, + "additionalProperties": false, + "required": [ + "dataset_id", + "rows" + ] + }, + "CompletionMessage": { + "type": "object", + "properties": { + "role": { + "type": "string", + "const": "assistant", + "default": "assistant", + "description": "Must be \"assistant\" to identify this as the model's response" + }, + "content": { + "$ref": "#/components/schemas/InterleavedContent", + "description": "The content of the model's response" + }, + "stop_reason": { + "type": "string", + "enum": [ + "end_of_turn", + "end_of_message", + "out_of_tokens" + ], + "description": "Reason why the model stopped generating. Options are: - `StopReason.end_of_turn`: The model finished generating the entire response. - `StopReason.end_of_message`: The model finished generating but generated a partial response -- usually, a tool call. The user may call the tool and continue the conversation with the tool's response. - `StopReason.out_of_tokens`: The model ran out of token budget." + }, + "tool_calls": { + "type": "array", + "items": { + "$ref": "#/components/schemas/ToolCall" + }, + "description": "List of tool calls. Each tool call is a ToolCall object." + } + }, + "additionalProperties": false, + "required": [ + "role", + "content", + "stop_reason" + ], + "description": "A message containing the model's (assistant) response in a chat conversation." + }, + "Message": { + "oneOf": [ + { + "$ref": "#/components/schemas/UserMessage" + }, + { + "$ref": "#/components/schemas/SystemMessage" + }, + { + "$ref": "#/components/schemas/ToolResponseMessage" + }, + { + "$ref": "#/components/schemas/CompletionMessage" + } + ], + "discriminator": { + "propertyName": "role", + "mapping": { + "user": "#/components/schemas/UserMessage", + "system": "#/components/schemas/SystemMessage", + "tool": "#/components/schemas/ToolResponseMessage", + "assistant": "#/components/schemas/CompletionMessage" + } + } + }, "ToolCall": { "type": "object", "properties": { @@ -2950,57 +4047,6 @@ ], "description": "A message representing the result of a tool invocation." }, - "TopKSamplingStrategy": { - "type": "object", - "properties": { - "type": { - "type": "string", - "const": "top_k", - "default": "top_k" - }, - "top_k": { - "type": "integer" - } - }, - "additionalProperties": false, - "required": [ - "type", - "top_k" - ] - }, - "TopPSamplingStrategy": { - "type": "object", - "properties": { - "type": { - "type": "string", - "const": "top_p", - "default": "top_p" - }, - "temperature": { - "type": "number" - }, - "top_p": { - "type": "number", - "default": 0.95 - } - }, - "additionalProperties": false, - "required": [ - "type" - ] - }, - "URL": { - "type": "object", - "properties": { - "uri": { - "type": "string" - } - }, - "additionalProperties": false, - "required": [ - "uri" - ] - }, "UserMessage": { "type": "object", "properties": { @@ -3309,43 +4355,6 @@ "job_uuid" ] }, - "ToolConfig": { - "type": "object", - "properties": { - "tool_choice": { - "type": "string", - "enum": [ - "auto", - "required" - ], - "description": "(Optional) Whether tool use is required or automatic. Defaults to ToolChoice.auto.", - "default": "auto" - }, - "tool_prompt_format": { - "type": "string", - "enum": [ - "json", - "function_tag", - "python_list" - ], - "description": "(Optional) Instructs the model how to format tool calls. By default, Llama Stack will attempt to use a format that is best adapted to the model. - `ToolPromptFormat.json`: The tool calls are formatted as a JSON object. 
- `ToolPromptFormat.function_tag`: The tool calls are enclosed in a tag. - `ToolPromptFormat.python_list`: The tool calls are output as Python syntax -- a list of function calls." - }, - "system_message_behavior": { - "type": "string", - "enum": [ - "append", - "replace" - ], - "description": "(Optional) Config for how to override the default system prompt. - `SystemMessageBehavior.append`: Appends the provided system message to the default system prompt. - `SystemMessageBehavior.replace`: Replaces the default system prompt with the provided system message. The system message can include the string '{{function_definitions}}' to indicate where the function definitions should be inserted.", - "default": "append" - } - }, - "additionalProperties": false, - "required": [ - "system_message_behavior" - ], - "description": "Configuration for tool use." - }, "ChatCompletionRequest": { "type": "object", "properties": { @@ -3644,218 +4653,6 @@ ], "description": "A chunk of a streamed completion response." }, - "AgentConfig": { - "type": "object", - "properties": { - "sampling_params": { - "$ref": "#/components/schemas/SamplingParams" - }, - "input_shields": { - "type": "array", - "items": { - "type": "string" - } - }, - "output_shields": { - "type": "array", - "items": { - "type": "string" - } - }, - "toolgroups": { - "type": "array", - "items": { - "$ref": "#/components/schemas/AgentTool" - } - }, - "client_tools": { - "type": "array", - "items": { - "$ref": "#/components/schemas/ToolDef" - } - }, - "tool_choice": { - "type": "string", - "enum": [ - "auto", - "required" - ], - "description": "Whether tool use is required or automatic. This is a hint to the model which may not be followed. It depends on the Instruction Following capabilities of the model." - }, - "tool_prompt_format": { - "type": "string", - "enum": [ - "json", - "function_tag", - "python_list" - ], - "description": "Prompt format for calling custom / zero shot tools." 
- }, - "tool_config": { - "$ref": "#/components/schemas/ToolConfig" - }, - "max_infer_iters": { - "type": "integer", - "default": 10 - }, - "model": { - "type": "string" - }, - "instructions": { - "type": "string" - }, - "enable_session_persistence": { - "type": "boolean" - }, - "response_format": { - "$ref": "#/components/schemas/ResponseFormat" - } - }, - "additionalProperties": false, - "required": [ - "model", - "instructions", - "enable_session_persistence" - ] - }, - "AgentTool": { - "oneOf": [ - { - "type": "string" - }, - { - "type": "object", - "properties": { - "name": { - "type": "string" - }, - "args": { - "type": "object", - "additionalProperties": { - "oneOf": [ - { - "type": "null" - }, - { - "type": "boolean" - }, - { - "type": "number" - }, - { - "type": "string" - }, - { - "type": "array" - }, - { - "type": "object" - } - ] - } - } - }, - "additionalProperties": false, - "required": [ - "name", - "args" - ] - } - ] - }, - "ToolDef": { - "type": "object", - "properties": { - "name": { - "type": "string" - }, - "description": { - "type": "string" - }, - "parameters": { - "type": "array", - "items": { - "$ref": "#/components/schemas/ToolParameter" - } - }, - "metadata": { - "type": "object", - "additionalProperties": { - "oneOf": [ - { - "type": "null" - }, - { - "type": "boolean" - }, - { - "type": "number" - }, - { - "type": "string" - }, - { - "type": "array" - }, - { - "type": "object" - } - ] - } - } - }, - "additionalProperties": false, - "required": [ - "name" - ] - }, - "ToolParameter": { - "type": "object", - "properties": { - "name": { - "type": "string" - }, - "parameter_type": { - "type": "string" - }, - "description": { - "type": "string" - }, - "required": { - "type": "boolean", - "default": true - }, - "default": { - "oneOf": [ - { - "type": "null" - }, - { - "type": "boolean" - }, - { - "type": "number" - }, - { - "type": "string" - }, - { - "type": "array" - }, - { - "type": "object" - } - ] - } - }, - "additionalProperties": false, - "required": [ - "name", - "parameter_type", - "description", - "required" - ] - }, "CreateAgentRequest": { "type": "object", "properties": { @@ -4582,241 +5379,6 @@ ], "description": "Response containing generated embeddings." 
}, - "AgentCandidate": { - "type": "object", - "properties": { - "type": { - "type": "string", - "const": "agent", - "default": "agent" - }, - "config": { - "$ref": "#/components/schemas/AgentConfig" - } - }, - "additionalProperties": false, - "required": [ - "type", - "config" - ] - }, - "AggregationFunctionType": { - "type": "string", - "enum": [ - "average", - "median", - "categorical_count", - "accuracy" - ] - }, - "AppEvalTaskConfig": { - "type": "object", - "properties": { - "type": { - "type": "string", - "const": "app", - "default": "app" - }, - "eval_candidate": { - "$ref": "#/components/schemas/EvalCandidate" - }, - "scoring_params": { - "type": "object", - "additionalProperties": { - "$ref": "#/components/schemas/ScoringFnParams" - } - }, - "num_examples": { - "type": "integer" - } - }, - "additionalProperties": false, - "required": [ - "type", - "eval_candidate", - "scoring_params" - ] - }, - "BasicScoringFnParams": { - "type": "object", - "properties": { - "type": { - "type": "string", - "const": "basic", - "default": "basic" - }, - "aggregation_functions": { - "type": "array", - "items": { - "$ref": "#/components/schemas/AggregationFunctionType" - } - } - }, - "additionalProperties": false, - "required": [ - "type" - ] - }, - "BenchmarkEvalTaskConfig": { - "type": "object", - "properties": { - "type": { - "type": "string", - "const": "benchmark", - "default": "benchmark" - }, - "eval_candidate": { - "$ref": "#/components/schemas/EvalCandidate" - }, - "num_examples": { - "type": "integer" - } - }, - "additionalProperties": false, - "required": [ - "type", - "eval_candidate" - ] - }, - "EvalCandidate": { - "oneOf": [ - { - "$ref": "#/components/schemas/ModelCandidate" - }, - { - "$ref": "#/components/schemas/AgentCandidate" - } - ], - "discriminator": { - "propertyName": "type", - "mapping": { - "model": "#/components/schemas/ModelCandidate", - "agent": "#/components/schemas/AgentCandidate" - } - } - }, - "EvalTaskConfig": { - "oneOf": [ - { - "$ref": "#/components/schemas/BenchmarkEvalTaskConfig" - }, - { - "$ref": "#/components/schemas/AppEvalTaskConfig" - } - ], - "discriminator": { - "propertyName": "type", - "mapping": { - "benchmark": "#/components/schemas/BenchmarkEvalTaskConfig", - "app": "#/components/schemas/AppEvalTaskConfig" - } - } - }, - "LLMAsJudgeScoringFnParams": { - "type": "object", - "properties": { - "type": { - "type": "string", - "const": "llm_as_judge", - "default": "llm_as_judge" - }, - "judge_model": { - "type": "string" - }, - "prompt_template": { - "type": "string" - }, - "judge_score_regexes": { - "type": "array", - "items": { - "type": "string" - } - }, - "aggregation_functions": { - "type": "array", - "items": { - "$ref": "#/components/schemas/AggregationFunctionType" - } - } - }, - "additionalProperties": false, - "required": [ - "type", - "judge_model" - ] - }, - "ModelCandidate": { - "type": "object", - "properties": { - "type": { - "type": "string", - "const": "model", - "default": "model" - }, - "model": { - "type": "string" - }, - "sampling_params": { - "$ref": "#/components/schemas/SamplingParams" - }, - "system_message": { - "$ref": "#/components/schemas/SystemMessage" - } - }, - "additionalProperties": false, - "required": [ - "type", - "model", - "sampling_params" - ] - }, - "RegexParserScoringFnParams": { - "type": "object", - "properties": { - "type": { - "type": "string", - "const": "regex_parser", - "default": "regex_parser" - }, - "parsing_regexes": { - "type": "array", - "items": { - "type": "string" - } - }, - 
"aggregation_functions": { - "type": "array", - "items": { - "$ref": "#/components/schemas/AggregationFunctionType" - } - } - }, - "additionalProperties": false, - "required": [ - "type" - ] - }, - "ScoringFnParams": { - "oneOf": [ - { - "$ref": "#/components/schemas/LLMAsJudgeScoringFnParams" - }, - { - "$ref": "#/components/schemas/RegexParserScoringFnParams" - }, - { - "$ref": "#/components/schemas/BasicScoringFnParams" - } - ], - "discriminator": { - "propertyName": "type", - "mapping": { - "llm_as_judge": "#/components/schemas/LLMAsJudgeScoringFnParams", - "regex_parser": "#/components/schemas/RegexParserScoringFnParams", - "basic": "#/components/schemas/BasicScoringFnParams" - } - } - }, "EvaluateRowsRequest": { "type": "object", "properties": { @@ -4855,7 +5417,7 @@ } }, "task_config": { - "$ref": "#/components/schemas/EvalTaskConfig" + "$ref": "#/components/schemas/BenchmarkConfig" } }, "additionalProperties": false, @@ -4865,113 +5427,6 @@ "task_config" ] }, - "EvaluateResponse": { - "type": "object", - "properties": { - "generations": { - "type": "array", - "items": { - "type": "object", - "additionalProperties": { - "oneOf": [ - { - "type": "null" - }, - { - "type": "boolean" - }, - { - "type": "number" - }, - { - "type": "string" - }, - { - "type": "array" - }, - { - "type": "object" - } - ] - } - } - }, - "scores": { - "type": "object", - "additionalProperties": { - "$ref": "#/components/schemas/ScoringResult" - } - } - }, - "additionalProperties": false, - "required": [ - "generations", - "scores" - ] - }, - "ScoringResult": { - "type": "object", - "properties": { - "score_rows": { - "type": "array", - "items": { - "type": "object", - "additionalProperties": { - "oneOf": [ - { - "type": "null" - }, - { - "type": "boolean" - }, - { - "type": "number" - }, - { - "type": "string" - }, - { - "type": "array" - }, - { - "type": "object" - } - ] - } - } - }, - "aggregated_results": { - "type": "object", - "additionalProperties": { - "oneOf": [ - { - "type": "null" - }, - { - "type": "boolean" - }, - { - "type": "number" - }, - { - "type": "string" - }, - { - "type": "array" - }, - { - "type": "object" - } - ] - } - } - }, - "additionalProperties": false, - "required": [ - "score_rows", - "aggregated_results" - ] - }, "Session": { "type": "object", "properties": { @@ -5287,69 +5742,6 @@ "type" ] }, - "EvalTask": { - "type": "object", - "properties": { - "identifier": { - "type": "string" - }, - "provider_resource_id": { - "type": "string" - }, - "provider_id": { - "type": "string" - }, - "type": { - "type": "string", - "const": "eval_task", - "default": "eval_task" - }, - "dataset_id": { - "type": "string" - }, - "scoring_functions": { - "type": "array", - "items": { - "type": "string" - } - }, - "metadata": { - "type": "object", - "additionalProperties": { - "oneOf": [ - { - "type": "null" - }, - { - "type": "boolean" - }, - { - "type": "number" - }, - { - "type": "string" - }, - { - "type": "array" - }, - { - "type": "object" - } - ] - } - } - }, - "additionalProperties": false, - "required": [ - "identifier", - "provider_resource_id", - "provider_id", - "type", - "dataset_id", - "scoring_functions", - "metadata" - ] - }, "Model": { "type": "object", "properties": { @@ -5891,15 +6283,6 @@ ], "description": "Artifacts of a finetuning job." 
}, - "JobStatus": { - "type": "string", - "enum": [ - "completed", - "in_progress", - "failed", - "scheduled" - ] - }, "PostTrainingJobStatusResponse": { "type": "object", "properties": { @@ -6243,21 +6626,6 @@ "data" ] }, - "ListEvalTasksResponse": { - "type": "object", - "properties": { - "data": { - "type": "array", - "items": { - "$ref": "#/components/schemas/EvalTask" - } - } - }, - "additionalProperties": false, - "required": [ - "data" - ] - }, "ListModelsResponse": { "type": "object", "properties": { @@ -7169,6 +7537,60 @@ "data" ] }, + "RegisterBenchmarkRequest": { + "type": "object", + "properties": { + "benchmark_id": { + "type": "string" + }, + "dataset_id": { + "type": "string" + }, + "scoring_functions": { + "type": "array", + "items": { + "type": "string" + } + }, + "provider_benchmark_id": { + "type": "string" + }, + "provider_id": { + "type": "string" + }, + "metadata": { + "type": "object", + "additionalProperties": { + "oneOf": [ + { + "type": "null" + }, + { + "type": "boolean" + }, + { + "type": "number" + }, + { + "type": "string" + }, + { + "type": "array" + }, + { + "type": "object" + } + ] + } + } + }, + "additionalProperties": false, + "required": [ + "benchmark_id", + "dataset_id", + "scoring_functions" + ] + }, "RegisterDatasetRequest": { "type": "object", "properties": { @@ -7223,60 +7645,6 @@ "url" ] }, - "RegisterEvalTaskRequest": { - "type": "object", - "properties": { - "eval_task_id": { - "type": "string" - }, - "dataset_id": { - "type": "string" - }, - "scoring_functions": { - "type": "array", - "items": { - "type": "string" - } - }, - "provider_eval_task_id": { - "type": "string" - }, - "provider_id": { - "type": "string" - }, - "metadata": { - "type": "object", - "additionalProperties": { - "oneOf": [ - { - "type": "null" - }, - { - "type": "boolean" - }, - { - "type": "number" - }, - { - "type": "string" - }, - { - "type": "array" - }, - { - "type": "object" - } - ] - } - } - }, - "additionalProperties": false, - "required": [ - "eval_task_id", - "dataset_id", - "scoring_functions" - ] - }, "RegisterModelRequest": { "type": "object", "properties": { @@ -7468,7 +7836,7 @@ "type": "object", "properties": { "task_config": { - "$ref": "#/components/schemas/EvalTaskConfig" + "$ref": "#/components/schemas/BenchmarkConfig" } }, "additionalProperties": false, @@ -7476,18 +7844,6 @@ "task_config" ] }, - "Job": { - "type": "object", - "properties": { - "job_id": { - "type": "string" - } - }, - "additionalProperties": false, - "required": [ - "job_id" - ] - }, "RunShieldRequest": { "type": "object", "properties": { @@ -7970,6 +8326,9 @@ { "name": "BatchInference (Coming Soon)" }, + { + "name": "Benchmarks" + }, { "name": "DatasetIO" }, @@ -7979,9 +8338,6 @@ { "name": "Eval" }, - { - "name": "EvalTasks" - }, { "name": "Inference", "description": "This API provides the raw interface to the underlying models. 
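The schema hunks above introduce `RegisterBenchmarkRequest` for the new registration route and drop `RegisterEvalTaskRequest`, while `DeprecatedRegisterEvalTaskRequest` (added earlier in this diff) continues to back the old `/v1/eval-tasks` alias. A hedged sketch of registering and listing benchmarks against the renamed routes follows; the server address and identifiers are placeholders, not values from the spec.

```python
# Illustrative sketch only; server address and identifiers are placeholders.
import requests

BASE_URL = "http://localhost:8321"  # assumed server address

# New route: POST /v1/eval/benchmarks with RegisterBenchmarkRequest
requests.post(
    f"{BASE_URL}/v1/eval/benchmarks",
    json={
        "benchmark_id": "my-benchmark",            # placeholder
        "dataset_id": "my-eval-dataset",           # placeholder
        "scoring_functions": ["basic::equality"],  # placeholder scoring fn id
        # optional: "provider_benchmark_id", "provider_id", "metadata"
    },
)

# Deprecated alias: POST /v1/eval-tasks with DeprecatedRegisterEvalTaskRequest,
# which still takes "eval_task_id" instead of "benchmark_id".

# Both list routes return the shared ListBenchmarksResponse:
benchmarks = requests.get(f"{BASE_URL}/v1/eval/benchmarks").json()
# -> {"data": [Benchmark, ...]}
```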
Two kinds of models are supported:\n- LLM models: these models generate \"raw\" and \"chat\" (conversational) completions.\n- Embedding models: these models generate embeddings to be used for semantic search.", @@ -8033,10 +8389,10 @@ "tags": [ "Agents", "BatchInference (Coming Soon)", + "Benchmarks", "DatasetIO", "Datasets", "Eval", - "EvalTasks", "Inference", "Inspect", "Models", diff --git a/docs/_static/llama-stack-spec.yaml b/docs/_static/llama-stack-spec.yaml index a646d7e08..b30025020 100644 --- a/docs/_static/llama-stack-spec.yaml +++ b/docs/_static/llama-stack-spec.yaml @@ -10,6 +10,175 @@ info: servers: - url: http://any-hosted-llama-stack.com paths: + /v1/eval/tasks/{task_id}/evaluations: + post: + responses: + '200': + description: OK + content: + application/json: + schema: + $ref: '#/components/schemas/EvaluateResponse' + tags: + - Eval + description: '' + parameters: + - name: task_id + in: path + required: true + schema: + type: string + requestBody: + content: + application/json: + schema: + $ref: '#/components/schemas/DeprecatedEvaluateRowsRequest' + required: true + deprecated: true + /v1/eval-tasks/{task_id}: + get: + responses: + '200': + description: OK + content: + application/json: + schema: + oneOf: + - $ref: '#/components/schemas/Benchmark' + - type: 'null' + tags: + - Benchmarks + description: '' + parameters: + - name: eval_task_id + in: query + required: true + schema: + type: string + deprecated: true + /v1/eval/tasks/{task_id}/jobs/{job_id}: + get: + responses: + '200': + description: OK + content: + application/json: + schema: + oneOf: + - $ref: '#/components/schemas/JobStatus' + - type: 'null' + tags: + - Eval + description: '' + parameters: + - name: task_id + in: path + required: true + schema: + type: string + - name: job_id + in: path + required: true + schema: + type: string + deprecated: true + delete: + responses: + '200': + description: OK + tags: + - Eval + description: '' + parameters: + - name: task_id + in: path + required: true + schema: + type: string + - name: job_id + in: path + required: true + schema: + type: string + deprecated: true + /v1/eval/tasks/{task_id}/jobs/{job_id}/result: + get: + responses: + '200': + description: OK + content: + application/json: + schema: + $ref: '#/components/schemas/EvaluateResponse' + tags: + - Eval + description: '' + parameters: + - name: task_id + in: path + required: true + schema: + type: string + - name: job_id + in: path + required: true + schema: + type: string + deprecated: true + /v1/eval-tasks: + get: + responses: + '200': + description: OK + content: + application/json: + schema: + $ref: '#/components/schemas/ListBenchmarksResponse' + tags: + - Benchmarks + description: '' + parameters: [] + deprecated: true + post: + responses: + '200': + description: OK + tags: + - Benchmarks + description: '' + parameters: [] + requestBody: + content: + application/json: + schema: + $ref: '#/components/schemas/DeprecatedRegisterEvalTaskRequest' + required: true + deprecated: true + /v1/eval/tasks/{task_id}/jobs: + post: + responses: + '200': + description: OK + content: + application/json: + schema: + $ref: '#/components/schemas/Job' + tags: + - Eval + description: '' + parameters: + - name: task_id + in: path + required: true + schema: + type: string + requestBody: + content: + application/json: + schema: + $ref: '#/components/schemas/DeprecatedRunEvalRequest' + required: true + deprecated: true /v1/datasetio/rows: get: responses: @@ -322,7 +491,7 @@ paths: schema: $ref: 
'#/components/schemas/EmbeddingsRequest' required: true - /v1/eval/tasks/{task_id}/evaluations: + /v1/eval/benchmarks/{benchmark_id}/evaluations: post: responses: '200': @@ -335,7 +504,7 @@ paths: - Eval description: '' parameters: - - name: task_id + - name: benchmark_id in: path required: true schema: @@ -407,6 +576,26 @@ paths: required: true schema: type: string + /v1/eval/benchmarks/{benchmark_id}: + get: + responses: + '200': + description: OK + content: + application/json: + schema: + oneOf: + - $ref: '#/components/schemas/Benchmark' + - type: 'null' + tags: + - Benchmarks + description: '' + parameters: + - name: benchmark_id + in: path + required: true + schema: + type: string /v1/datasets/{dataset_id}: get: responses: @@ -440,26 +629,6 @@ paths: required: true schema: type: string - /v1/eval-tasks/{eval_task_id}: - get: - responses: - '200': - description: OK - content: - application/json: - schema: - oneOf: - - $ref: '#/components/schemas/EvalTask' - - type: 'null' - tags: - - EvalTasks - description: '' - parameters: - - name: eval_task_id - in: path - required: true - schema: - type: string /v1/models/{model_id}: get: responses: @@ -802,7 +971,7 @@ paths: schema: $ref: '#/components/schemas/InvokeToolRequest' required: true - /v1/eval/tasks/{task_id}/jobs/{job_id}: + /v1/eval/benchmarks/{benchmark_id}/jobs/{job_id}: get: responses: '200': @@ -817,7 +986,7 @@ paths: - Eval description: '' parameters: - - name: task_id + - name: benchmark_id in: path required: true schema: @@ -835,7 +1004,7 @@ paths: - Eval description: '' parameters: - - name: task_id + - name: benchmark_id in: path required: true schema: @@ -845,7 +1014,7 @@ paths: required: true schema: type: string - /v1/eval/tasks/{task_id}/jobs/{job_id}/result: + /v1/eval/benchmarks/{benchmark_id}/jobs/{job_id}/result: get: responses: '200': @@ -858,16 +1027,43 @@ paths: - Eval description: '' parameters: + - name: benchmark_id + in: path + required: true + schema: + type: string - name: job_id in: path required: true schema: type: string - - name: task_id - in: path - required: true - schema: - type: string + /v1/eval/benchmarks: + get: + responses: + '200': + description: OK + content: + application/json: + schema: + $ref: '#/components/schemas/ListBenchmarksResponse' + tags: + - Benchmarks + description: '' + parameters: [] + post: + responses: + '200': + description: OK + tags: + - Benchmarks + description: '' + parameters: [] + requestBody: + content: + application/json: + schema: + $ref: '#/components/schemas/RegisterBenchmarkRequest' + required: true /v1/datasets: get: responses: @@ -895,33 +1091,6 @@ paths: schema: $ref: '#/components/schemas/RegisterDatasetRequest' required: true - /v1/eval-tasks: - get: - responses: - '200': - description: OK - content: - application/json: - schema: - $ref: '#/components/schemas/ListEvalTasksResponse' - tags: - - EvalTasks - description: '' - parameters: [] - post: - responses: - '200': - description: OK - tags: - - EvalTasks - description: '' - parameters: [] - requestBody: - content: - application/json: - schema: - $ref: '#/components/schemas/RegisterEvalTaskRequest' - required: true /v1/models: get: responses: @@ -1278,7 +1447,7 @@ paths: type: array items: type: string - /v1/eval/tasks/{task_id}/jobs: + /v1/eval/benchmarks/{benchmark_id}/jobs: post: responses: '200': @@ -1291,7 +1460,7 @@ paths: - Eval description: '' parameters: - - name: task_id + - name: benchmark_id in: path required: true schema: @@ -1429,65 +1598,146 @@ jsonSchemaDialect: >- 
https://json-schema.org/draft/2020-12/schema components: schemas: - AppendRowsRequest: + AgentCandidate: type: object properties: - dataset_id: + type: type: string - rows: - type: array - items: - type: object - additionalProperties: - oneOf: - - type: 'null' - - type: boolean - - type: number - - type: string - - type: array - - type: object + const: agent + default: agent + config: + $ref: '#/components/schemas/AgentConfig' additionalProperties: false required: - - dataset_id - - rows - CompletionMessage: + - type + - config + AgentConfig: type: object properties: - role: - type: string - const: assistant - default: assistant - description: >- - Must be "assistant" to identify this as the model's response - content: - $ref: '#/components/schemas/InterleavedContent' - description: The content of the model's response - stop_reason: + sampling_params: + $ref: '#/components/schemas/SamplingParams' + input_shields: + type: array + items: + type: string + output_shields: + type: array + items: + type: string + toolgroups: + type: array + items: + $ref: '#/components/schemas/AgentTool' + client_tools: + type: array + items: + $ref: '#/components/schemas/ToolDef' + tool_choice: type: string enum: - - end_of_turn - - end_of_message - - out_of_tokens + - auto + - required description: >- - Reason why the model stopped generating. Options are: - `StopReason.end_of_turn`: - The model finished generating the entire response. - `StopReason.end_of_message`: - The model finished generating but generated a partial response -- usually, - a tool call. The user may call the tool and continue the conversation - with the tool's response. - `StopReason.out_of_tokens`: The model ran - out of token budget. - tool_calls: - type: array - items: - $ref: '#/components/schemas/ToolCall' + Whether tool use is required or automatic. This is a hint to the model + which may not be followed. It depends on the Instruction Following capabilities + of the model. + tool_prompt_format: + type: string + enum: + - json + - function_tag + - python_list description: >- - List of tool calls. Each tool call is a ToolCall object. + Prompt format for calling custom / zero shot tools. + tool_config: + $ref: '#/components/schemas/ToolConfig' + max_infer_iters: + type: integer + default: 10 + model: + type: string + instructions: + type: string + enable_session_persistence: + type: boolean + response_format: + $ref: '#/components/schemas/ResponseFormat' additionalProperties: false required: - - role - - content - - stop_reason - description: >- - A message containing the model's (assistant) response in a chat conversation. 
+ - model + - instructions + - enable_session_persistence + AgentTool: + oneOf: + - type: string + - type: object + properties: + name: + type: string + args: + type: object + additionalProperties: + oneOf: + - type: 'null' + - type: boolean + - type: number + - type: string + - type: array + - type: object + additionalProperties: false + required: + - name + - args + AggregationFunctionType: + type: string + enum: + - average + - median + - categorical_count + - accuracy + BasicScoringFnParams: + type: object + properties: + type: + type: string + const: basic + default: basic + aggregation_functions: + type: array + items: + $ref: '#/components/schemas/AggregationFunctionType' + additionalProperties: false + required: + - type + BenchmarkConfig: + type: object + properties: + type: + type: string + const: benchmark + default: benchmark + eval_candidate: + $ref: '#/components/schemas/EvalCandidate' + scoring_params: + type: object + additionalProperties: + $ref: '#/components/schemas/ScoringFnParams' + num_examples: + type: integer + additionalProperties: false + required: + - type + - eval_candidate + - scoring_params + EvalCandidate: + oneOf: + - $ref: '#/components/schemas/ModelCandidate' + - $ref: '#/components/schemas/AgentCandidate' + discriminator: + propertyName: type + mapping: + model: '#/components/schemas/ModelCandidate' + agent: '#/components/schemas/AgentCandidate' GrammarResponseFormat: type: object properties: @@ -1598,19 +1848,65 @@ components: - json_schema description: >- Configuration for JSON schema-guided response generation. - Message: - oneOf: - - $ref: '#/components/schemas/UserMessage' - - $ref: '#/components/schemas/SystemMessage' - - $ref: '#/components/schemas/ToolResponseMessage' - - $ref: '#/components/schemas/CompletionMessage' - discriminator: - propertyName: role - mapping: - user: '#/components/schemas/UserMessage' - system: '#/components/schemas/SystemMessage' - tool: '#/components/schemas/ToolResponseMessage' - assistant: '#/components/schemas/CompletionMessage' + LLMAsJudgeScoringFnParams: + type: object + properties: + type: + type: string + const: llm_as_judge + default: llm_as_judge + judge_model: + type: string + prompt_template: + type: string + judge_score_regexes: + type: array + items: + type: string + aggregation_functions: + type: array + items: + $ref: '#/components/schemas/AggregationFunctionType' + additionalProperties: false + required: + - type + - judge_model + ModelCandidate: + type: object + properties: + type: + type: string + const: model + default: model + model: + type: string + sampling_params: + $ref: '#/components/schemas/SamplingParams' + system_message: + $ref: '#/components/schemas/SystemMessage' + additionalProperties: false + required: + - type + - model + - sampling_params + RegexParserScoringFnParams: + type: object + properties: + type: + type: string + const: regex_parser + default: regex_parser + parsing_regexes: + type: array + items: + type: string + aggregation_functions: + type: array + items: + $ref: '#/components/schemas/AggregationFunctionType' + additionalProperties: false + required: + - type ResponseFormat: oneOf: - $ref: '#/components/schemas/JsonSchemaResponseFormat' @@ -1645,6 +1941,17 @@ components: greedy: '#/components/schemas/GreedySamplingStrategy' top_p: '#/components/schemas/TopPSamplingStrategy' top_k: '#/components/schemas/TopKSamplingStrategy' + ScoringFnParams: + oneOf: + - $ref: '#/components/schemas/LLMAsJudgeScoringFnParams' + - $ref: '#/components/schemas/RegexParserScoringFnParams' + - 
$ref: '#/components/schemas/BasicScoringFnParams' + discriminator: + propertyName: type + mapping: + llm_as_judge: '#/components/schemas/LLMAsJudgeScoringFnParams' + regex_parser: '#/components/schemas/RegexParserScoringFnParams' + basic: '#/components/schemas/BasicScoringFnParams' SystemMessage: type: object properties: @@ -1683,6 +1990,383 @@ components: - type - text description: A text content item + ToolConfig: + type: object + properties: + tool_choice: + type: string + enum: + - auto + - required + description: >- + (Optional) Whether tool use is required or automatic. Defaults to ToolChoice.auto. + default: auto + tool_prompt_format: + type: string + enum: + - json + - function_tag + - python_list + description: >- + (Optional) Instructs the model how to format tool calls. By default, Llama + Stack will attempt to use a format that is best adapted to the model. + - `ToolPromptFormat.json`: The tool calls are formatted as a JSON object. + - `ToolPromptFormat.function_tag`: The tool calls are enclosed in a + tag. - `ToolPromptFormat.python_list`: The tool calls are output as Python + syntax -- a list of function calls. + system_message_behavior: + type: string + enum: + - append + - replace + description: >- + (Optional) Config for how to override the default system prompt. - `SystemMessageBehavior.append`: + Appends the provided system message to the default system prompt. - `SystemMessageBehavior.replace`: + Replaces the default system prompt with the provided system message. The + system message can include the string '{{function_definitions}}' to indicate + where the function definitions should be inserted. + default: append + additionalProperties: false + required: + - system_message_behavior + description: Configuration for tool use. + ToolDef: + type: object + properties: + name: + type: string + description: + type: string + parameters: + type: array + items: + $ref: '#/components/schemas/ToolParameter' + metadata: + type: object + additionalProperties: + oneOf: + - type: 'null' + - type: boolean + - type: number + - type: string + - type: array + - type: object + additionalProperties: false + required: + - name + ToolParameter: + type: object + properties: + name: + type: string + parameter_type: + type: string + description: + type: string + required: + type: boolean + default: true + default: + oneOf: + - type: 'null' + - type: boolean + - type: number + - type: string + - type: array + - type: object + additionalProperties: false + required: + - name + - parameter_type + - description + - required + TopKSamplingStrategy: + type: object + properties: + type: + type: string + const: top_k + default: top_k + top_k: + type: integer + additionalProperties: false + required: + - type + - top_k + TopPSamplingStrategy: + type: object + properties: + type: + type: string + const: top_p + default: top_p + temperature: + type: number + top_p: + type: number + default: 0.95 + additionalProperties: false + required: + - type + URL: + type: object + properties: + uri: + type: string + additionalProperties: false + required: + - uri + DeprecatedEvaluateRowsRequest: + type: object + properties: + input_rows: + type: array + items: + type: object + additionalProperties: + oneOf: + - type: 'null' + - type: boolean + - type: number + - type: string + - type: array + - type: object + scoring_functions: + type: array + items: + type: string + task_config: + $ref: '#/components/schemas/BenchmarkConfig' + additionalProperties: false + required: + - input_rows + - scoring_functions + - 
task_config + EvaluateResponse: + type: object + properties: + generations: + type: array + items: + type: object + additionalProperties: + oneOf: + - type: 'null' + - type: boolean + - type: number + - type: string + - type: array + - type: object + scores: + type: object + additionalProperties: + $ref: '#/components/schemas/ScoringResult' + additionalProperties: false + required: + - generations + - scores + ScoringResult: + type: object + properties: + score_rows: + type: array + items: + type: object + additionalProperties: + oneOf: + - type: 'null' + - type: boolean + - type: number + - type: string + - type: array + - type: object + aggregated_results: + type: object + additionalProperties: + oneOf: + - type: 'null' + - type: boolean + - type: number + - type: string + - type: array + - type: object + additionalProperties: false + required: + - score_rows + - aggregated_results + Benchmark: + type: object + properties: + identifier: + type: string + provider_resource_id: + type: string + provider_id: + type: string + type: + type: string + const: benchmark + default: benchmark + dataset_id: + type: string + scoring_functions: + type: array + items: + type: string + metadata: + type: object + additionalProperties: + oneOf: + - type: 'null' + - type: boolean + - type: number + - type: string + - type: array + - type: object + additionalProperties: false + required: + - identifier + - provider_resource_id + - provider_id + - type + - dataset_id + - scoring_functions + - metadata + JobStatus: + type: string + enum: + - completed + - in_progress + - failed + - scheduled + ListBenchmarksResponse: + type: object + properties: + data: + type: array + items: + $ref: '#/components/schemas/Benchmark' + additionalProperties: false + required: + - data + DeprecatedRegisterEvalTaskRequest: + type: object + properties: + eval_task_id: + type: string + dataset_id: + type: string + scoring_functions: + type: array + items: + type: string + provider_benchmark_id: + type: string + provider_id: + type: string + metadata: + type: object + additionalProperties: + oneOf: + - type: 'null' + - type: boolean + - type: number + - type: string + - type: array + - type: object + additionalProperties: false + required: + - eval_task_id + - dataset_id + - scoring_functions + DeprecatedRunEvalRequest: + type: object + properties: + task_config: + $ref: '#/components/schemas/BenchmarkConfig' + additionalProperties: false + required: + - task_config + Job: + type: object + properties: + job_id: + type: string + additionalProperties: false + required: + - job_id + AppendRowsRequest: + type: object + properties: + dataset_id: + type: string + rows: + type: array + items: + type: object + additionalProperties: + oneOf: + - type: 'null' + - type: boolean + - type: number + - type: string + - type: array + - type: object + additionalProperties: false + required: + - dataset_id + - rows + CompletionMessage: + type: object + properties: + role: + type: string + const: assistant + default: assistant + description: >- + Must be "assistant" to identify this as the model's response + content: + $ref: '#/components/schemas/InterleavedContent' + description: The content of the model's response + stop_reason: + type: string + enum: + - end_of_turn + - end_of_message + - out_of_tokens + description: >- + Reason why the model stopped generating. Options are: - `StopReason.end_of_turn`: + The model finished generating the entire response. 
- `StopReason.end_of_message`: + The model finished generating but generated a partial response -- usually, + a tool call. The user may call the tool and continue the conversation + with the tool's response. - `StopReason.out_of_tokens`: The model ran + out of token budget. + tool_calls: + type: array + items: + $ref: '#/components/schemas/ToolCall' + description: >- + List of tool calls. Each tool call is a ToolCall object. + additionalProperties: false + required: + - role + - content + - stop_reason + description: >- + A message containing the model's (assistant) response in a chat conversation. + Message: + oneOf: + - $ref: '#/components/schemas/UserMessage' + - $ref: '#/components/schemas/SystemMessage' + - $ref: '#/components/schemas/ToolResponseMessage' + - $ref: '#/components/schemas/CompletionMessage' + discriminator: + propertyName: role + mapping: + user: '#/components/schemas/UserMessage' + system: '#/components/schemas/SystemMessage' + tool: '#/components/schemas/ToolResponseMessage' + assistant: '#/components/schemas/CompletionMessage' ToolCall: type: object properties: @@ -1803,42 +2487,6 @@ components: - content description: >- A message representing the result of a tool invocation. - TopKSamplingStrategy: - type: object - properties: - type: - type: string - const: top_k - default: top_k - top_k: - type: integer - additionalProperties: false - required: - - type - - top_k - TopPSamplingStrategy: - type: object - properties: - type: - type: string - const: top_p - default: top_p - temperature: - type: number - top_p: - type: number - default: 0.95 - additionalProperties: false - required: - - type - URL: - type: object - properties: - uri: - type: string - additionalProperties: false - required: - - uri UserMessage: type: object properties: @@ -2063,46 +2711,6 @@ components: additionalProperties: false required: - job_uuid - ToolConfig: - type: object - properties: - tool_choice: - type: string - enum: - - auto - - required - description: >- - (Optional) Whether tool use is required or automatic. Defaults to ToolChoice.auto. - default: auto - tool_prompt_format: - type: string - enum: - - json - - function_tag - - python_list - description: >- - (Optional) Instructs the model how to format tool calls. By default, Llama - Stack will attempt to use a format that is best adapted to the model. - - `ToolPromptFormat.json`: The tool calls are formatted as a JSON object. - - `ToolPromptFormat.function_tag`: The tool calls are enclosed in a - tag. - `ToolPromptFormat.python_list`: The tool calls are output as Python - syntax -- a list of function calls. - system_message_behavior: - type: string - enum: - - append - - replace - description: >- - (Optional) Config for how to override the default system prompt. - `SystemMessageBehavior.append`: - Appends the provided system message to the default system prompt. - `SystemMessageBehavior.replace`: - Replaces the default system prompt with the provided system message. The - system message can include the string '{{function_definitions}}' to indicate - where the function definitions should be inserted. - default: append - additionalProperties: false - required: - - system_message_behavior - description: Configuration for tool use. ChatCompletionRequest: type: object properties: @@ -2356,133 +2964,6 @@ components: - delta description: >- A chunk of a streamed completion response. 
- AgentConfig: - type: object - properties: - sampling_params: - $ref: '#/components/schemas/SamplingParams' - input_shields: - type: array - items: - type: string - output_shields: - type: array - items: - type: string - toolgroups: - type: array - items: - $ref: '#/components/schemas/AgentTool' - client_tools: - type: array - items: - $ref: '#/components/schemas/ToolDef' - tool_choice: - type: string - enum: - - auto - - required - description: >- - Whether tool use is required or automatic. This is a hint to the model - which may not be followed. It depends on the Instruction Following capabilities - of the model. - tool_prompt_format: - type: string - enum: - - json - - function_tag - - python_list - description: >- - Prompt format for calling custom / zero shot tools. - tool_config: - $ref: '#/components/schemas/ToolConfig' - max_infer_iters: - type: integer - default: 10 - model: - type: string - instructions: - type: string - enable_session_persistence: - type: boolean - response_format: - $ref: '#/components/schemas/ResponseFormat' - additionalProperties: false - required: - - model - - instructions - - enable_session_persistence - AgentTool: - oneOf: - - type: string - - type: object - properties: - name: - type: string - args: - type: object - additionalProperties: - oneOf: - - type: 'null' - - type: boolean - - type: number - - type: string - - type: array - - type: object - additionalProperties: false - required: - - name - - args - ToolDef: - type: object - properties: - name: - type: string - description: - type: string - parameters: - type: array - items: - $ref: '#/components/schemas/ToolParameter' - metadata: - type: object - additionalProperties: - oneOf: - - type: 'null' - - type: boolean - - type: number - - type: string - - type: array - - type: object - additionalProperties: false - required: - - name - ToolParameter: - type: object - properties: - name: - type: string - parameter_type: - type: string - description: - type: string - required: - type: boolean - default: true - default: - oneOf: - - type: 'null' - - type: boolean - - type: number - - type: string - - type: array - - type: object - additionalProperties: false - required: - - name - - parameter_type - - description - - required CreateAgentRequest: type: object properties: @@ -2962,163 +3443,6 @@ components: - embeddings description: >- Response containing generated embeddings. 
- AgentCandidate: - type: object - properties: - type: - type: string - const: agent - default: agent - config: - $ref: '#/components/schemas/AgentConfig' - additionalProperties: false - required: - - type - - config - AggregationFunctionType: - type: string - enum: - - average - - median - - categorical_count - - accuracy - AppEvalTaskConfig: - type: object - properties: - type: - type: string - const: app - default: app - eval_candidate: - $ref: '#/components/schemas/EvalCandidate' - scoring_params: - type: object - additionalProperties: - $ref: '#/components/schemas/ScoringFnParams' - num_examples: - type: integer - additionalProperties: false - required: - - type - - eval_candidate - - scoring_params - BasicScoringFnParams: - type: object - properties: - type: - type: string - const: basic - default: basic - aggregation_functions: - type: array - items: - $ref: '#/components/schemas/AggregationFunctionType' - additionalProperties: false - required: - - type - BenchmarkEvalTaskConfig: - type: object - properties: - type: - type: string - const: benchmark - default: benchmark - eval_candidate: - $ref: '#/components/schemas/EvalCandidate' - num_examples: - type: integer - additionalProperties: false - required: - - type - - eval_candidate - EvalCandidate: - oneOf: - - $ref: '#/components/schemas/ModelCandidate' - - $ref: '#/components/schemas/AgentCandidate' - discriminator: - propertyName: type - mapping: - model: '#/components/schemas/ModelCandidate' - agent: '#/components/schemas/AgentCandidate' - EvalTaskConfig: - oneOf: - - $ref: '#/components/schemas/BenchmarkEvalTaskConfig' - - $ref: '#/components/schemas/AppEvalTaskConfig' - discriminator: - propertyName: type - mapping: - benchmark: '#/components/schemas/BenchmarkEvalTaskConfig' - app: '#/components/schemas/AppEvalTaskConfig' - LLMAsJudgeScoringFnParams: - type: object - properties: - type: - type: string - const: llm_as_judge - default: llm_as_judge - judge_model: - type: string - prompt_template: - type: string - judge_score_regexes: - type: array - items: - type: string - aggregation_functions: - type: array - items: - $ref: '#/components/schemas/AggregationFunctionType' - additionalProperties: false - required: - - type - - judge_model - ModelCandidate: - type: object - properties: - type: - type: string - const: model - default: model - model: - type: string - sampling_params: - $ref: '#/components/schemas/SamplingParams' - system_message: - $ref: '#/components/schemas/SystemMessage' - additionalProperties: false - required: - - type - - model - - sampling_params - RegexParserScoringFnParams: - type: object - properties: - type: - type: string - const: regex_parser - default: regex_parser - parsing_regexes: - type: array - items: - type: string - aggregation_functions: - type: array - items: - $ref: '#/components/schemas/AggregationFunctionType' - additionalProperties: false - required: - - type - ScoringFnParams: - oneOf: - - $ref: '#/components/schemas/LLMAsJudgeScoringFnParams' - - $ref: '#/components/schemas/RegexParserScoringFnParams' - - $ref: '#/components/schemas/BasicScoringFnParams' - discriminator: - propertyName: type - mapping: - llm_as_judge: '#/components/schemas/LLMAsJudgeScoringFnParams' - regex_parser: '#/components/schemas/RegexParserScoringFnParams' - basic: '#/components/schemas/BasicScoringFnParams' EvaluateRowsRequest: type: object properties: @@ -3139,64 +3463,12 @@ components: items: type: string task_config: - $ref: '#/components/schemas/EvalTaskConfig' + $ref: '#/components/schemas/BenchmarkConfig' 
additionalProperties: false required: - input_rows - scoring_functions - task_config - EvaluateResponse: - type: object - properties: - generations: - type: array - items: - type: object - additionalProperties: - oneOf: - - type: 'null' - - type: boolean - - type: number - - type: string - - type: array - - type: object - scores: - type: object - additionalProperties: - $ref: '#/components/schemas/ScoringResult' - additionalProperties: false - required: - - generations - - scores - ScoringResult: - type: object - properties: - score_rows: - type: array - items: - type: object - additionalProperties: - oneOf: - - type: 'null' - - type: boolean - - type: number - - type: string - - type: array - - type: object - aggregated_results: - type: object - additionalProperties: - oneOf: - - type: 'null' - - type: boolean - - type: number - - type: string - - type: array - - type: object - additionalProperties: false - required: - - score_rows - - aggregated_results Session: type: object properties: @@ -3401,44 +3673,6 @@ components: additionalProperties: false required: - type - EvalTask: - type: object - properties: - identifier: - type: string - provider_resource_id: - type: string - provider_id: - type: string - type: - type: string - const: eval_task - default: eval_task - dataset_id: - type: string - scoring_functions: - type: array - items: - type: string - metadata: - type: object - additionalProperties: - oneOf: - - type: 'null' - - type: boolean - - type: number - - type: string - - type: array - - type: object - additionalProperties: false - required: - - identifier - - provider_resource_id - - provider_id - - type - - dataset_id - - scoring_functions - - metadata Model: type: object properties: @@ -3766,13 +4000,6 @@ components: - job_uuid - checkpoints description: Artifacts of a finetuning job. 
- JobStatus: - type: string - enum: - - completed - - in_progress - - failed - - scheduled PostTrainingJobStatusResponse: type: object properties: @@ -3977,16 +4204,6 @@ components: additionalProperties: false required: - data - ListEvalTasksResponse: - type: object - properties: - data: - type: array - items: - $ref: '#/components/schemas/EvalTask' - additionalProperties: false - required: - - data ListModelsResponse: type: object properties: @@ -4569,6 +4786,36 @@ components: additionalProperties: false required: - data + RegisterBenchmarkRequest: + type: object + properties: + benchmark_id: + type: string + dataset_id: + type: string + scoring_functions: + type: array + items: + type: string + provider_benchmark_id: + type: string + provider_id: + type: string + metadata: + type: object + additionalProperties: + oneOf: + - type: 'null' + - type: boolean + - type: number + - type: string + - type: array + - type: object + additionalProperties: false + required: + - benchmark_id + - dataset_id + - scoring_functions RegisterDatasetRequest: type: object properties: @@ -4599,36 +4846,6 @@ components: - dataset_id - dataset_schema - url - RegisterEvalTaskRequest: - type: object - properties: - eval_task_id: - type: string - dataset_id: - type: string - scoring_functions: - type: array - items: - type: string - provider_eval_task_id: - type: string - provider_id: - type: string - metadata: - type: object - additionalProperties: - oneOf: - - type: 'null' - - type: boolean - - type: number - - type: string - - type: array - - type: object - additionalProperties: false - required: - - eval_task_id - - dataset_id - - scoring_functions RegisterModelRequest: type: object properties: @@ -4739,18 +4956,10 @@ components: type: object properties: task_config: - $ref: '#/components/schemas/EvalTaskConfig' + $ref: '#/components/schemas/BenchmarkConfig' additionalProperties: false required: - task_config - Job: - type: object - properties: - job_id: - type: string - additionalProperties: false - required: - - job_id RunShieldRequest: type: object properties: @@ -5049,10 +5258,10 @@ tags: x-displayName: >- Agents API for creating and interacting with agentic systems. - name: BatchInference (Coming Soon) + - name: Benchmarks - name: DatasetIO - name: Datasets - name: Eval - - name: EvalTasks - name: Inference description: >- This API provides the raw interface to the underlying models. 
Two kinds of models @@ -5083,10 +5292,10 @@ x-tagGroups: tags: - Agents - BatchInference (Coming Soon) + - Benchmarks - DatasetIO - Datasets - Eval - - EvalTasks - Inference - Inspect - Models diff --git a/docs/getting_started.ipynb b/docs/getting_started.ipynb index abe537c8e..ee616b471 100644 --- a/docs/getting_started.ipynb +++ b/docs/getting_started.ipynb @@ -324,7 +324,7 @@ "- vector_io\n", "container_image: null\n", "datasets: []\n", - "eval_tasks: []\n", + "benchmarks: []\n", "image_name: together\n", "metadata_store:\n", " db_path: /Users/ashwin/.llama/distributions/together/registry.db\n", @@ -508,7 +508,7 @@ "- vector_io\n", "container_image: null\n", "datasets: \u001b[1m[\u001b[0m\u001b[1m]\u001b[0m\n", - "eval_tasks: \u001b[1m[\u001b[0m\u001b[1m]\u001b[0m\n", + "benchmarks: \u001b[1m[\u001b[0m\u001b[1m]\u001b[0m\n", "image_name: together\n", "metadata_store:\n", " db_path: \u001b[35m/Users/ashwin/.llama/distributions/together/\u001b[0m\u001b[95mregistry.db\u001b[0m\n", diff --git a/docs/notebooks/Llama_Stack_Benchmark_Evals.ipynb b/docs/notebooks/Llama_Stack_Benchmark_Evals.ipynb index 84da25246..8eecf84ab 100644 --- a/docs/notebooks/Llama_Stack_Benchmark_Evals.ipynb +++ b/docs/notebooks/Llama_Stack_Benchmark_Evals.ipynb @@ -370,7 +370,7 @@ "- tool_runtime\n", "datasets: []\n", "container_image: null\n", - "eval_tasks: []\n", + "benchmarks: []\n", "image_name: together\n", "memory_banks: []\n", "metadata_store:\n", @@ -551,7 +551,7 @@ "- tool_runtime\n", "datasets: \u001b[1m[\u001b[0m\u001b[1m]\u001b[0m\n", "container_image: null\n", - "eval_tasks: \u001b[1m[\u001b[0m\u001b[1m]\u001b[0m\n", + "benchmarks: \u001b[1m[\u001b[0m\u001b[1m]\u001b[0m\n", "image_name: together\n", "memory_banks: \u001b[1m[\u001b[0m\u001b[1m]\u001b[0m\n", "metadata_store:\n", diff --git a/docs/openapi_generator/pyopenapi/generator.py b/docs/openapi_generator/pyopenapi/generator.py index a0385cae0..0f3b99784 100644 --- a/docs/openapi_generator/pyopenapi/generator.py +++ b/docs/openapi_generator/pyopenapi/generator.py @@ -647,6 +647,7 @@ class Generator: description = "\n".join( filter(None, [doc_string.short_description, doc_string.long_description]) ) + return Operation( tags=[op.defining_class.__name__], summary=None, @@ -656,6 +657,7 @@ class Generator: requestBody=requestBody, responses=responses, callbacks=callbacks, + deprecated=True if "DEPRECATED" in op.func_name else None, security=[] if op.public else None, ) diff --git a/docs/openapi_generator/pyopenapi/specification.py b/docs/openapi_generator/pyopenapi/specification.py index 4b54295c5..f96de58b6 100644 --- a/docs/openapi_generator/pyopenapi/specification.py +++ b/docs/openapi_generator/pyopenapi/specification.py @@ -117,6 +117,7 @@ class Operation: requestBody: Optional[RequestBody] = None callbacks: Optional[Dict[str, "Callback"]] = None security: Optional[List["SecurityRequirement"]] = None + deprecated: Optional[bool] = None @dataclass diff --git a/docs/source/building_applications/evals.md b/docs/source/building_applications/evals.md index c4cb476e4..f28e0d5fd 100644 --- a/docs/source/building_applications/evals.md +++ b/docs/source/building_applications/evals.md @@ -41,14 +41,14 @@ system_message = { "content": SYSTEM_PROMPT_TEMPLATE, } -client.eval_tasks.register( - eval_task_id="meta-reference::mmmu", +client.benchmarks.register( + benchmark_id="meta-reference::mmmu", dataset_id=f"mmmu-{subset}-{split}", scoring_functions=["basic::regex_parser_multiple_choice_answer"], ) response = client.eval.evaluate_rows( - 
task_id="meta-reference::mmmu", + benchmark_id="meta-reference::mmmu", input_rows=eval_rows, scoring_functions=["basic::regex_parser_multiple_choice_answer"], task_config={ @@ -99,14 +99,14 @@ eval_rows = client.datasetio.get_rows_paginated( ``` ```python -client.eval_tasks.register( - eval_task_id="meta-reference::simpleqa", +client.benchmarks.register( + benchmark_id="meta-reference::simpleqa", dataset_id=simpleqa_dataset_id, scoring_functions=["llm-as-judge::405b-simpleqa"], ) response = client.eval.evaluate_rows( - task_id="meta-reference::simpleqa", + benchmark_id="meta-reference::simpleqa", input_rows=eval_rows.rows, scoring_functions=["llm-as-judge::405b-simpleqa"], task_config={ @@ -156,7 +156,7 @@ agent_config = { } response = client.eval.evaluate_rows( - task_id="meta-reference::simpleqa", + benchmark_id="meta-reference::simpleqa", input_rows=eval_rows.rows, scoring_functions=["llm-as-judge::405b-simpleqa"], task_config={ diff --git a/docs/source/building_applications/evaluation.md b/docs/source/building_applications/evaluation.md index 91e5c552b..ad220f751 100644 --- a/docs/source/building_applications/evaluation.md +++ b/docs/source/building_applications/evaluation.md @@ -10,15 +10,15 @@ Here's how to set up basic evaluation: ```python # Create an evaluation task -response = client.eval_tasks.register( - eval_task_id="my_eval", +response = client.benchmarks.register( + benchmark_id="my_eval", dataset_id="my_dataset", scoring_functions=["accuracy", "relevance"], ) # Run evaluation job = client.eval.run_eval( - task_id="my_eval", + benchmark_id="my_eval", task_config={ "type": "app", "eval_candidate": {"type": "agent", "config": agent_config}, @@ -26,5 +26,5 @@ job = client.eval.run_eval( ) # Get results -result = client.eval.job_result(task_id="my_eval", job_id=job.job_id) +result = client.eval.job_result(benchmark_id="my_eval", job_id=job.job_id) ``` diff --git a/docs/source/concepts/evaluation_concepts.md b/docs/source/concepts/evaluation_concepts.md index 399d99d92..3ca4b0ac8 100644 --- a/docs/source/concepts/evaluation_concepts.md +++ b/docs/source/concepts/evaluation_concepts.md @@ -5,7 +5,7 @@ The Llama Stack Evaluation flow allows you to run evaluations on your GenAI appl We introduce a set of APIs in Llama Stack for supporting running evaluations of LLM applications. - `/datasetio` + `/datasets` API - `/scoring` + `/scoring_functions` API -- `/eval` + `/eval_tasks` API +- `/eval` + `/benchmarks` API This guide goes over the sets of APIs and developer experience flow of using Llama Stack to run evaluations for different use cases. Checkout our Colab notebook on working examples with evaluations [here](https://colab.research.google.com/drive/10CHyykee9j2OigaIcRv47BKG9mrNm0tJ?usp=sharing). @@ -21,7 +21,7 @@ The Evaluation APIs are associated with a set of Resources as shown in the follo - **Scoring**: evaluate outputs of the system. - Associated with `ScoringFunction` resource. We provide a suite of out-of-the box scoring functions and also the ability for you to add custom evaluators. These scoring functions are the core part of defining an evaluation task to output evaluation metrics. - **Eval**: generate outputs (via Inference or Agents) and perform scoring. - - Associated with `EvalTask` resource. + - Associated with `Benchmark` resource. Use the following decision tree to decide how to use LlamaStack Evaluation flow. 
diff --git a/docs/source/concepts/index.md b/docs/source/concepts/index.md index 1437ec623..403e47c48 100644 --- a/docs/source/concepts/index.md +++ b/docs/source/concepts/index.md @@ -42,7 +42,7 @@ Some of these APIs are associated with a set of **Resources**. Here is the mappi - **Tool Runtime** is associated with `ToolGroup` resources. - **DatasetIO** is associated with `Dataset` resources. - **Scoring** is associated with `ScoringFunction` resources. -- **Eval** is associated with `Model` and `EvalTask` resources. +- **Eval** is associated with `Model` and `Benchmark` resources. Furthermore, we allow these resources to be **federated** across multiple providers. For example, you may have some Llama models served by Fireworks while others are served by AWS Bedrock. Regardless, they will all work seamlessly with the same uniform Inference API provided by Llama Stack. diff --git a/docs/source/playground/index.md b/docs/source/playground/index.md index d74bf1a03..9691609ab 100644 --- a/docs/source/playground/index.md +++ b/docs/source/playground/index.md @@ -64,7 +64,7 @@ Interactive pages for users to play with and explore Llama Stack API capabilitie ``` ```bash - $ llama-stack-client eval_tasks register \ + $ llama-stack-client benchmarks register \ --eval-task-id meta-reference-mmlu \ --provider-id meta-reference \ --dataset-id mmlu \ @@ -86,7 +86,7 @@ Interactive pages for users to play with and explore Llama Stack API capabilitie - Under the hood, it uses Llama Stack's `/providers` API to get information about the providers. - **API Resources**: Inspect Llama Stack API resources - - This page allows you to inspect Llama Stack API resources (`models`, `datasets`, `memory_banks`, `eval_tasks`, `shields`). + - This page allows you to inspect Llama Stack API resources (`models`, `datasets`, `memory_banks`, `benchmarks`, `shields`). - Under the hood, it uses Llama Stack's `//list` API to get information about each resources. - Please visit [Core Concepts](https://llama-stack.readthedocs.io/en/latest/concepts/index.html) for more details about the resources. diff --git a/docs/source/references/evals_reference/index.md b/docs/source/references/evals_reference/index.md index 86f66208a..71dbb47e5 100644 --- a/docs/source/references/evals_reference/index.md +++ b/docs/source/references/evals_reference/index.md @@ -5,7 +5,7 @@ The Llama Stack Evaluation flow allows you to run evaluations on your GenAI appl We introduce a set of APIs in Llama Stack for supporting running evaluations of LLM applications. - `/datasetio` + `/datasets` API - `/scoring` + `/scoring_functions` API -- `/eval` + `/eval_tasks` API +- `/eval` + `/benchmarks` API This guide goes over the sets of APIs and developer experience flow of using Llama Stack to run evaluations for different use cases. Checkout our Colab notebook on working examples with evaluations [here](https://colab.research.google.com/drive/10CHyykee9j2OigaIcRv47BKG9mrNm0tJ?usp=sharing). @@ -21,7 +21,7 @@ The Evaluation APIs are associated with a set of Resources as shown in the follo - **Scoring**: evaluate outputs of the system. - Associated with `ScoringFunction` resource. We provide a suite of out-of-the box scoring functions and also the ability for you to add custom evaluators. These scoring functions are the core part of defining an evaluation task to output evaluation metrics. - **Eval**: generate outputs (via Inference or Agents) and perform scoring. - - Associated with `EvalTask` resource. + - Associated with `Benchmark` resource. 
Use the following decision tree to decide how to use LlamaStack Evaluation flow. @@ -77,14 +77,14 @@ system_message = { "content": SYSTEM_PROMPT_TEMPLATE, } -client.eval_tasks.register( - eval_task_id="meta-reference::mmmu", +client.benchmarks.register( + benchmark_id="meta-reference::mmmu", dataset_id=f"mmmu-{subset}-{split}", scoring_functions=["basic::regex_parser_multiple_choice_answer"], ) response = client.eval.evaluate_rows( - task_id="meta-reference::mmmu", + benchmark_id="meta-reference::mmmu", input_rows=eval_rows, scoring_functions=["basic::regex_parser_multiple_choice_answer"], task_config={ @@ -135,14 +135,14 @@ eval_rows = client.datasetio.get_rows_paginated( ``` ```python -client.eval_tasks.register( - eval_task_id="meta-reference::simpleqa", +client.benchmarks.register( + benchmark_id="meta-reference::simpleqa", dataset_id=simpleqa_dataset_id, scoring_functions=["llm-as-judge::405b-simpleqa"], ) response = client.eval.evaluate_rows( - task_id="meta-reference::simpleqa", + benchmark_id="meta-reference::simpleqa", input_rows=eval_rows.rows, scoring_functions=["llm-as-judge::405b-simpleqa"], task_config={ @@ -192,7 +192,7 @@ agent_config = { } response = client.eval.evaluate_rows( - task_id="meta-reference::simpleqa", + benchmark_id="meta-reference::simpleqa", input_rows=eval_rows.rows, scoring_functions=["llm-as-judge::405b-simpleqa"], task_config={ @@ -281,7 +281,7 @@ The following examples give the quick steps to start running evaluations using t #### Benchmark Evaluation CLI Usage: There are 2 inputs necessary for running a benchmark eval -- `eval-task-id`: the identifier associated with the eval task. Each `EvalTask` is parametrized by +- `eval-task-id`: the identifier associated with the eval task. Each `Benchmark` is parametrized by - `dataset_id`: the identifier associated with the dataset. - `List[scoring_function_id]`: list of scoring function identifiers. - `eval-task-config`: specifies the configuration of the model / agent to evaluate on. @@ -289,7 +289,7 @@ Usage: There are 2 inputs necessary for running a benchmark eval ``` llama-stack-client eval run_benchmark \ ---eval-task-config ~/eval_task_config.json \ +--eval-task-config ~/benchmark_config.json \ --visualize ``` @@ -309,15 +309,15 @@ llama-stack-client eval run_scoring ... --dataset-id --scoring-functions [ ...] [--provider-id ] [--provider-eval-task-id ] [--metadata ] +$ llama-stack-client benchmarks register --eval-task-id --dataset-id --scoring-functions [ ...] [--provider-id ] [--provider-eval-task-id ] [--metadata ] ``` Options: @@ -191,7 +191,7 @@ Options: - `--num-examples`: Optional. Number of examples to evaluate (useful for debugging) - `--visualize`: Optional flag. 
If set, visualizes evaluation results after completion -Example eval_task_config.json: +Example benchmark_config.json: ```json { "type": "benchmark", diff --git a/docs/source/references/python_sdk_reference/index.md b/docs/source/references/python_sdk_reference/index.md index 8a06e2244..9d1130422 100644 --- a/docs/source/references/python_sdk_reference/index.md +++ b/docs/source/references/python_sdk_reference/index.md @@ -181,8 +181,8 @@ from llama_stack_client.types import EvaluateResponse, Job Methods: -- client.eval.evaluate_rows(task_id, \*\*params) -> EvaluateResponse -- client.eval.run_eval(task_id, \*\*params) -> Job +- client.eval.evaluate_rows(benchmark_id, \*\*params) -> EvaluateResponse +- client.eval.run_eval(benchmark_id, \*\*params) -> Job ### Jobs @@ -194,9 +194,9 @@ from llama_stack_client.types.eval import JobStatusResponse Methods: -- client.eval.jobs.retrieve(job_id, \*, task_id) -> EvaluateResponse -- client.eval.jobs.cancel(job_id, \*, task_id) -> None -- client.eval.jobs.status(job_id, \*, task_id) -> Optional[JobStatusResponse] +- client.eval.jobs.retrieve(job_id, \*, benchmark_id) -> EvaluateResponse +- client.eval.jobs.cancel(job_id, \*, benchmark_id) -> None +- client.eval.jobs.status(job_id, \*, benchmark_id) -> Optional[JobStatusResponse] ## Inspect @@ -443,20 +443,20 @@ Methods: - client.scoring_functions.list() -> ScoringFunctionListResponse - client.scoring_functions.register(\*\*params) -> None -## EvalTasks +## Benchmarks Types: ```python from llama_stack_client.types import ( - EvalTask, - ListEvalTasksResponse, - EvalTaskListResponse, + Benchmark, + ListBenchmarksResponse, + BenchmarkListResponse, ) ``` Methods: -- client.eval_tasks.retrieve(eval_task_id) -> Optional[EvalTask] -- client.eval_tasks.list() -> EvalTaskListResponse -- client.eval_tasks.register(\*\*params) -> None +- client.benchmarks.retrieve(benchmark_id) -> Optional[Benchmark] +- client.benchmarks.list() -> BenchmarkListResponse +- client.benchmarks.register(\*\*params) -> None diff --git a/llama_stack/apis/eval_tasks/__init__.py b/llama_stack/apis/benchmarks/__init__.py similarity index 81% rename from llama_stack/apis/eval_tasks/__init__.py rename to llama_stack/apis/benchmarks/__init__.py index 7ca216706..f8f564957 100644 --- a/llama_stack/apis/eval_tasks/__init__.py +++ b/llama_stack/apis/benchmarks/__init__.py @@ -4,4 +4,4 @@ # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. -from .eval_tasks import * # noqa: F401 F403 +from .benchmarks import * # noqa: F401 F403 diff --git a/llama_stack/apis/benchmarks/benchmarks.py b/llama_stack/apis/benchmarks/benchmarks.py new file mode 100644 index 000000000..50019b18c --- /dev/null +++ b/llama_stack/apis/benchmarks/benchmarks.py @@ -0,0 +1,86 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. 
+from typing import Any, Dict, List, Literal, Optional, Protocol, runtime_checkable + +from llama_models.schema_utils import json_schema_type, webmethod +from pydantic import BaseModel, Field + +from llama_stack.apis.resource import Resource, ResourceType + + +class CommonBenchmarkFields(BaseModel): + dataset_id: str + scoring_functions: List[str] + metadata: Dict[str, Any] = Field( + default_factory=dict, + description="Metadata for this evaluation task", + ) + + +@json_schema_type +class Benchmark(CommonBenchmarkFields, Resource): + type: Literal[ResourceType.benchmark.value] = ResourceType.benchmark.value + + @property + def benchmark_id(self) -> str: + return self.identifier + + @property + def provider_benchmark_id(self) -> str: + return self.provider_resource_id + + +class BenchmarkInput(CommonBenchmarkFields, BaseModel): + benchmark_id: str + provider_id: Optional[str] = None + provider_benchmark_id: Optional[str] = None + + +class ListBenchmarksResponse(BaseModel): + data: List[Benchmark] + + +@runtime_checkable +class Benchmarks(Protocol): + @webmethod(route="/eval/benchmarks", method="GET") + async def list_benchmarks(self) -> ListBenchmarksResponse: ... + + @webmethod(route="/eval/benchmarks/{benchmark_id}", method="GET") + async def get_benchmark( + self, + benchmark_id: str, + ) -> Optional[Benchmark]: ... + + @webmethod(route="/eval/benchmarks", method="POST") + async def register_benchmark( + self, + benchmark_id: str, + dataset_id: str, + scoring_functions: List[str], + provider_benchmark_id: Optional[str] = None, + provider_id: Optional[str] = None, + metadata: Optional[Dict[str, Any]] = None, + ) -> None: ... + + @webmethod(route="/eval-tasks", method="GET") + async def DEPRECATED_list_eval_tasks(self) -> ListBenchmarksResponse: ... + + @webmethod(route="/eval-tasks/{task_id}", method="GET") + async def DEPRECATED_get_eval_task( + self, + eval_task_id: str, + ) -> Optional[Benchmark]: ... + + @webmethod(route="/eval-tasks", method="POST") + async def DEPRECATED_register_eval_task( + self, + eval_task_id: str, + dataset_id: str, + scoring_functions: List[str], + provider_benchmark_id: Optional[str] = None, + provider_id: Optional[str] = None, + metadata: Optional[Dict[str, Any]] = None, + ) -> None: ... 
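The new `Benchmark` resource keeps the same underlying `Resource` fields as the old `EvalTask`; `benchmark_id` and `provider_benchmark_id` are read-only aliases over `identifier` and `provider_resource_id`. A quick illustration of constructing one directly (identifiers are made up; in practice the routing table builds this object inside `register_benchmark`):

```python
from llama_stack.apis.benchmarks import Benchmark

bm = Benchmark(
    identifier="meta-reference::mmlu",               # illustrative benchmark id
    provider_id="meta-reference",
    provider_resource_id="meta-reference::mmlu",
    dataset_id="mmlu",
    scoring_functions=["basic::regex_parser_multiple_choice_answer"],
    metadata={},
)

# The properties simply mirror the Resource fields under the new names.
assert bm.benchmark_id == bm.identifier
assert bm.provider_benchmark_id == bm.provider_resource_id
```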
diff --git a/llama_stack/apis/datatypes.py b/llama_stack/apis/datatypes.py index ccc395b80..0751b2c9b 100644 --- a/llama_stack/apis/datatypes.py +++ b/llama_stack/apis/datatypes.py @@ -28,7 +28,7 @@ class Api(Enum): vector_dbs = "vector_dbs" datasets = "datasets" scoring_functions = "scoring_functions" - eval_tasks = "eval_tasks" + benchmarks = "benchmarks" tool_groups = "tool_groups" # built-in API diff --git a/llama_stack/apis/eval/eval.py b/llama_stack/apis/eval/eval.py index ae13a5bd9..e5c782150 100644 --- a/llama_stack/apis/eval/eval.py +++ b/llama_stack/apis/eval/eval.py @@ -38,19 +38,9 @@ EvalCandidate = register_schema( @json_schema_type -class BenchmarkEvalTaskConfig(BaseModel): +class BenchmarkConfig(BaseModel): type: Literal["benchmark"] = "benchmark" eval_candidate: EvalCandidate - num_examples: Optional[int] = Field( - description="Number of examples to evaluate (useful for testing), if not provided, all examples in the dataset will be evaluated", - default=None, - ) - - -@json_schema_type -class AppEvalTaskConfig(BaseModel): - type: Literal["app"] = "app" - eval_candidate: EvalCandidate scoring_params: Dict[str, ScoringFnParams] = Field( description="Map between scoring function id and parameters for each scoring function you want to run", default_factory=dict, @@ -62,12 +52,6 @@ class AppEvalTaskConfig(BaseModel): # we could optinally add any specific dataset config here -EvalTaskConfig = register_schema( - Annotated[Union[BenchmarkEvalTaskConfig, AppEvalTaskConfig], Field(discriminator="type")], - name="EvalTaskConfig", -) - - @json_schema_type class EvaluateResponse(BaseModel): generations: List[Dict[str, Any]] @@ -76,27 +60,52 @@ class EvaluateResponse(BaseModel): class Eval(Protocol): - @webmethod(route="/eval/tasks/{task_id}/jobs", method="POST") + @webmethod(route="/eval/benchmarks/{benchmark_id}/jobs", method="POST") async def run_eval( + self, + benchmark_id: str, + task_config: BenchmarkConfig, + ) -> Job: ... + + @webmethod(route="/eval/benchmarks/{benchmark_id}/evaluations", method="POST") + async def evaluate_rows( + self, + benchmark_id: str, + input_rows: List[Dict[str, Any]], + scoring_functions: List[str], + task_config: BenchmarkConfig, + ) -> EvaluateResponse: ... + + @webmethod(route="/eval/benchmarks/{benchmark_id}/jobs/{job_id}", method="GET") + async def job_status(self, benchmark_id: str, job_id: str) -> Optional[JobStatus]: ... + + @webmethod(route="/eval/benchmarks/{benchmark_id}/jobs/{job_id}", method="DELETE") + async def job_cancel(self, benchmark_id: str, job_id: str) -> None: ... + + @webmethod(route="/eval/benchmarks/{benchmark_id}/jobs/{job_id}/result", method="GET") + async def job_result(self, benchmark_id: str, job_id: str) -> EvaluateResponse: ... + + @webmethod(route="/eval/tasks/{task_id}/jobs", method="POST") + async def DEPRECATED_run_eval( self, task_id: str, - task_config: EvalTaskConfig, + task_config: BenchmarkConfig, ) -> Job: ... @webmethod(route="/eval/tasks/{task_id}/evaluations", method="POST") - async def evaluate_rows( + async def DEPRECATED_evaluate_rows( self, task_id: str, input_rows: List[Dict[str, Any]], scoring_functions: List[str], - task_config: EvalTaskConfig, + task_config: BenchmarkConfig, ) -> EvaluateResponse: ... @webmethod(route="/eval/tasks/{task_id}/jobs/{job_id}", method="GET") - async def job_status(self, task_id: str, job_id: str) -> Optional[JobStatus]: ... + async def DEPRECATED_job_status(self, task_id: str, job_id: str) -> Optional[JobStatus]: ... 
@webmethod(route="/eval/tasks/{task_id}/jobs/{job_id}", method="DELETE") - async def job_cancel(self, task_id: str, job_id: str) -> None: ... + async def DEPRECATED_job_cancel(self, task_id: str, job_id: str) -> None: ... @webmethod(route="/eval/tasks/{task_id}/jobs/{job_id}/result", method="GET") - async def job_result(self, job_id: str, task_id: str) -> EvaluateResponse: ... + async def DEPRECATED_job_result(self, task_id: str, job_id: str) -> EvaluateResponse: ... diff --git a/llama_stack/apis/eval_tasks/eval_tasks.py b/llama_stack/apis/eval_tasks/eval_tasks.py deleted file mode 100644 index a0a533055..000000000 --- a/llama_stack/apis/eval_tasks/eval_tasks.py +++ /dev/null @@ -1,66 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the terms described in the LICENSE file in -# the root directory of this source tree. -from typing import Any, Dict, List, Literal, Optional, Protocol, runtime_checkable - -from llama_models.schema_utils import json_schema_type, webmethod -from pydantic import BaseModel, Field - -from llama_stack.apis.resource import Resource, ResourceType - - -class CommonEvalTaskFields(BaseModel): - dataset_id: str - scoring_functions: List[str] - metadata: Dict[str, Any] = Field( - default_factory=dict, - description="Metadata for this evaluation task", - ) - - -@json_schema_type -class EvalTask(CommonEvalTaskFields, Resource): - type: Literal[ResourceType.eval_task.value] = ResourceType.eval_task.value - - @property - def eval_task_id(self) -> str: - return self.identifier - - @property - def provider_eval_task_id(self) -> str: - return self.provider_resource_id - - -class EvalTaskInput(CommonEvalTaskFields, BaseModel): - eval_task_id: str - provider_id: Optional[str] = None - provider_eval_task_id: Optional[str] = None - - -class ListEvalTasksResponse(BaseModel): - data: List[EvalTask] - - -@runtime_checkable -class EvalTasks(Protocol): - @webmethod(route="/eval-tasks", method="GET") - async def list_eval_tasks(self) -> ListEvalTasksResponse: ... - - @webmethod(route="/eval-tasks/{eval_task_id}", method="GET") - async def get_eval_task( - self, - eval_task_id: str, - ) -> Optional[EvalTask]: ... - - @webmethod(route="/eval-tasks", method="POST") - async def register_eval_task( - self, - eval_task_id: str, - dataset_id: str, - scoring_functions: List[str], - provider_eval_task_id: Optional[str] = None, - provider_id: Optional[str] = None, - metadata: Optional[Dict[str, Any]] = None, - ) -> None: ... 
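With `AppEvalTaskConfig` and `BenchmarkEvalTaskConfig` collapsed into a single `BenchmarkConfig`, every eval call now passes `type: "benchmark"` plus an `eval_candidate`, with `scoring_params` and `num_examples` optional. A hedged sketch of the wire shape that feeds `task_config=` on the renamed `/eval/benchmarks/...` endpoints (model id, scoring parameters, and sampling strategy are placeholders):

```python
# Mirrors the BenchmarkConfig schema added to the OpenAPI spec in this change.
benchmark_config = {
    "type": "benchmark",
    "eval_candidate": {
        "type": "model",
        "model": "meta-llama/Llama-3.1-8B-Instruct",         # placeholder model id
        "sampling_params": {"strategy": {"type": "greedy"}},  # illustrative sampling params
    },
    "scoring_params": {},   # optional: per-scoring-function parameters, keyed by scoring fn id
    "num_examples": 5,      # optional: evaluate only a few rows while testing
}
```

The `DEPRECATED_*` methods accept the same `BenchmarkConfig`, so requests that still hit `/eval/tasks/{task_id}/...` differ only in the path parameter name.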
diff --git a/llama_stack/apis/resource.py b/llama_stack/apis/resource.py index 145113a5d..70ec63c55 100644 --- a/llama_stack/apis/resource.py +++ b/llama_stack/apis/resource.py @@ -15,7 +15,7 @@ class ResourceType(Enum): vector_db = "vector_db" dataset = "dataset" scoring_function = "scoring_function" - eval_task = "eval_task" + benchmark = "benchmark" tool = "tool" tool_group = "tool_group" diff --git a/llama_stack/distribution/datatypes.py b/llama_stack/distribution/datatypes.py index 97706f22a..f62996081 100644 --- a/llama_stack/distribution/datatypes.py +++ b/llama_stack/distribution/datatypes.py @@ -8,10 +8,10 @@ from typing import Annotated, Any, Dict, List, Optional, Union from pydantic import BaseModel, Field +from llama_stack.apis.benchmarks import Benchmark, BenchmarkInput from llama_stack.apis.datasetio import DatasetIO from llama_stack.apis.datasets import Dataset, DatasetInput from llama_stack.apis.eval import Eval -from llama_stack.apis.eval_tasks import EvalTask, EvalTaskInput from llama_stack.apis.inference import Inference from llama_stack.apis.models import Model, ModelInput from llama_stack.apis.safety import Safety @@ -37,7 +37,7 @@ RoutableObject = Union[ VectorDB, Dataset, ScoringFn, - EvalTask, + Benchmark, Tool, ToolGroup, ] @@ -50,7 +50,7 @@ RoutableObjectWithProvider = Annotated[ VectorDB, Dataset, ScoringFn, - EvalTask, + Benchmark, Tool, ToolGroup, ], @@ -173,7 +173,7 @@ a default SQLite store will be used.""", vector_dbs: List[VectorDBInput] = Field(default_factory=list) datasets: List[DatasetInput] = Field(default_factory=list) scoring_fns: List[ScoringFnInput] = Field(default_factory=list) - eval_tasks: List[EvalTaskInput] = Field(default_factory=list) + benchmarks: List[BenchmarkInput] = Field(default_factory=list) tool_groups: List[ToolGroupInput] = Field(default_factory=list) server: ServerConfig = Field( diff --git a/llama_stack/distribution/distribution.py b/llama_stack/distribution/distribution.py index 2dcf38463..384e2c3c8 100644 --- a/llama_stack/distribution/distribution.py +++ b/llama_stack/distribution/distribution.py @@ -44,7 +44,7 @@ def builtin_automatically_routed_apis() -> List[AutoRoutedApiInfo]: router_api=Api.scoring, ), AutoRoutedApiInfo( - routing_table_api=Api.eval_tasks, + routing_table_api=Api.benchmarks, router_api=Api.eval, ), AutoRoutedApiInfo( diff --git a/llama_stack/distribution/resolver.py b/llama_stack/distribution/resolver.py index 353c2971b..0bc2e774c 100644 --- a/llama_stack/distribution/resolver.py +++ b/llama_stack/distribution/resolver.py @@ -9,10 +9,10 @@ import logging from typing import Any, Dict, List, Set from llama_stack.apis.agents import Agents +from llama_stack.apis.benchmarks import Benchmarks from llama_stack.apis.datasetio import DatasetIO from llama_stack.apis.datasets import Datasets from llama_stack.apis.eval import Eval -from llama_stack.apis.eval_tasks import EvalTasks from llama_stack.apis.inference import Inference from llama_stack.apis.inspect import Inspect from llama_stack.apis.models import Models @@ -37,8 +37,8 @@ from llama_stack.distribution.store import DistributionRegistry from llama_stack.distribution.utils.dynamic import instantiate_class_type from llama_stack.providers.datatypes import ( Api, + BenchmarksProtocolPrivate, DatasetsProtocolPrivate, - EvalTasksProtocolPrivate, InlineProviderSpec, ModelsProtocolPrivate, ProviderSpec, @@ -73,7 +73,7 @@ def api_protocol_map() -> Dict[Api, Any]: Api.scoring: Scoring, Api.scoring_functions: ScoringFunctions, Api.eval: Eval, - Api.eval_tasks: 
EvalTasks, + Api.benchmarks: Benchmarks, Api.post_training: PostTraining, Api.tool_groups: ToolGroups, Api.tool_runtime: ToolRuntime, @@ -92,7 +92,7 @@ def additional_protocols_map() -> Dict[Api, Any]: ScoringFunctions, Api.scoring_functions, ), - Api.eval: (EvalTasksProtocolPrivate, EvalTasks, Api.eval_tasks), + Api.eval: (BenchmarksProtocolPrivate, Benchmarks, Api.benchmarks), } diff --git a/llama_stack/distribution/routers/__init__.py b/llama_stack/distribution/routers/__init__.py index 18197ca7f..a54f57fb3 100644 --- a/llama_stack/distribution/routers/__init__.py +++ b/llama_stack/distribution/routers/__init__.py @@ -11,8 +11,8 @@ from llama_stack.distribution.store import DistributionRegistry from llama_stack.providers.datatypes import Api, RoutingTable from .routing_tables import ( + BenchmarksRoutingTable, DatasetsRoutingTable, - EvalTasksRoutingTable, ModelsRoutingTable, ScoringFunctionsRoutingTable, ShieldsRoutingTable, @@ -33,7 +33,7 @@ async def get_routing_table_impl( "shields": ShieldsRoutingTable, "datasets": DatasetsRoutingTable, "scoring_functions": ScoringFunctionsRoutingTable, - "eval_tasks": EvalTasksRoutingTable, + "benchmarks": BenchmarksRoutingTable, "tool_groups": ToolGroupsRoutingTable, } diff --git a/llama_stack/distribution/routers/routers.py b/llama_stack/distribution/routers/routers.py index e716e44b0..f45975189 100644 --- a/llama_stack/distribution/routers/routers.py +++ b/llama_stack/distribution/routers/routers.py @@ -9,9 +9,8 @@ from typing import Any, AsyncGenerator, Dict, List, Optional from llama_stack.apis.common.content_types import URL, InterleavedContent from llama_stack.apis.datasetio import DatasetIO, PaginatedRowsResult from llama_stack.apis.eval import ( - AppEvalTaskConfig, + BenchmarkConfig, Eval, - EvalTaskConfig, EvaluateResponse, Job, JobStatus, @@ -347,23 +346,23 @@ class EvalRouter(Eval): async def run_eval( self, - task_id: str, - task_config: AppEvalTaskConfig, + benchmark_id: str, + task_config: BenchmarkConfig, ) -> Job: - return await self.routing_table.get_provider_impl(task_id).run_eval( - task_id=task_id, + return await self.routing_table.get_provider_impl(benchmark_id).run_eval( + benchmark_id=benchmark_id, task_config=task_config, ) async def evaluate_rows( self, - task_id: str, + benchmark_id: str, input_rows: List[Dict[str, Any]], scoring_functions: List[str], - task_config: EvalTaskConfig, + task_config: BenchmarkConfig, ) -> EvaluateResponse: - return await self.routing_table.get_provider_impl(task_id).evaluate_rows( - task_id=task_id, + return await self.routing_table.get_provider_impl(benchmark_id).evaluate_rows( + benchmark_id=benchmark_id, input_rows=input_rows, scoring_functions=scoring_functions, task_config=task_config, @@ -371,30 +370,72 @@ class EvalRouter(Eval): async def job_status( self, - task_id: str, + benchmark_id: str, job_id: str, ) -> Optional[JobStatus]: - return await self.routing_table.get_provider_impl(task_id).job_status(task_id, job_id) + return await self.routing_table.get_provider_impl(benchmark_id).job_status(benchmark_id, job_id) async def job_cancel( self, - task_id: str, + benchmark_id: str, job_id: str, ) -> None: - await self.routing_table.get_provider_impl(task_id).job_cancel( - task_id, + await self.routing_table.get_provider_impl(benchmark_id).job_cancel( + benchmark_id, job_id, ) async def job_result( + self, + benchmark_id: str, + job_id: str, + ) -> EvaluateResponse: + return await self.routing_table.get_provider_impl(benchmark_id).job_result( + benchmark_id, + job_id, + ) + + async def 
DEPRECATED_run_eval( + self, + task_id: str, + task_config: BenchmarkConfig, + ) -> Job: + return await self.run_eval(benchmark_id=task_id, task_config=task_config) + + async def DEPRECATED_evaluate_rows( + self, + task_id: str, + input_rows: List[Dict[str, Any]], + scoring_functions: List[str], + task_config: BenchmarkConfig, + ) -> EvaluateResponse: + return await self.evaluate_rows( + benchmark_id=task_id, + input_rows=input_rows, + scoring_functions=scoring_functions, + task_config=task_config, + ) + + async def DEPRECATED_job_status( + self, + task_id: str, + job_id: str, + ) -> Optional[JobStatus]: + return await self.job_status(benchmark_id=task_id, job_id=job_id) + + async def DEPRECATED_job_cancel( + self, + task_id: str, + job_id: str, + ) -> None: + return await self.job_cancel(benchmark_id=task_id, job_id=job_id) + + async def DEPRECATED_job_result( self, task_id: str, job_id: str, ) -> EvaluateResponse: - return await self.routing_table.get_provider_impl(task_id).job_result( - task_id, - job_id, - ) + return await self.job_result(benchmark_id=task_id, job_id=job_id) class ToolRuntimeRouter(ToolRuntime): diff --git a/llama_stack/distribution/routers/routing_tables.py b/llama_stack/distribution/routers/routing_tables.py index 009775ca5..2cddc3970 100644 --- a/llama_stack/distribution/routers/routing_tables.py +++ b/llama_stack/distribution/routers/routing_tables.py @@ -4,14 +4,15 @@ # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. +import logging from typing import Any, Dict, List, Optional from pydantic import TypeAdapter +from llama_stack.apis.benchmarks import Benchmark, Benchmarks, ListBenchmarksResponse from llama_stack.apis.common.content_types import URL from llama_stack.apis.common.type_system import ParamType from llama_stack.apis.datasets import Dataset, Datasets, ListDatasetsResponse -from llama_stack.apis.eval_tasks import EvalTask, EvalTasks, ListEvalTasksResponse from llama_stack.apis.models import ListModelsResponse, Model, Models, ModelType from llama_stack.apis.resource import ResourceType from llama_stack.apis.scoring_functions import ( @@ -38,6 +39,8 @@ from llama_stack.distribution.datatypes import ( from llama_stack.distribution.store import DistributionRegistry from llama_stack.providers.datatypes import Api, RoutingTable +logger = logging.getLogger(__name__) + def get_impl_api(p: Any) -> Api: return p.__provider_spec__.api @@ -60,7 +63,7 @@ async def register_object_with_provider(obj: RoutableObject, p: Any) -> Routable elif api == Api.scoring: return await p.register_scoring_function(obj) elif api == Api.eval: - return await p.register_eval_task(obj) + return await p.register_benchmark(obj) elif api == Api.tool_runtime: return await p.register_tool(obj) else: @@ -121,7 +124,7 @@ class CommonRoutingTableImpl(RoutingTable): scoring_functions = await p.list_scoring_functions() await add_objects(scoring_functions, pid, ScoringFn) elif api == Api.eval: - p.eval_task_store = self + p.benchmark_store = self elif api == Api.tool_runtime: p.tool_store = self @@ -141,8 +144,8 @@ class CommonRoutingTableImpl(RoutingTable): return ("DatasetIO", "dataset") elif isinstance(self, ScoringFunctionsRoutingTable): return ("Scoring", "scoring_function") - elif isinstance(self, EvalTasksRoutingTable): - return ("Eval", "eval_task") + elif isinstance(self, BenchmarksRoutingTable): + return ("Eval", "benchmark") elif isinstance(self, ToolGroupsRoutingTable): return ("Tools", "tool") else: @@ -428,20 
+431,20 @@ class ScoringFunctionsRoutingTable(CommonRoutingTableImpl, ScoringFunctions): await self.register_object(scoring_fn) -class EvalTasksRoutingTable(CommonRoutingTableImpl, EvalTasks): - async def list_eval_tasks(self) -> ListEvalTasksResponse: - return ListEvalTasksResponse(data=await self.get_all_with_type("eval_task")) +class BenchmarksRoutingTable(CommonRoutingTableImpl, Benchmarks): + async def list_benchmarks(self) -> ListBenchmarksResponse: + return ListBenchmarksResponse(data=await self.get_all_with_type("benchmark")) - async def get_eval_task(self, eval_task_id: str) -> Optional[EvalTask]: - return await self.get_object_by_identifier("eval_task", eval_task_id) + async def get_benchmark(self, benchmark_id: str) -> Optional[Benchmark]: + return await self.get_object_by_identifier("benchmark", benchmark_id) - async def register_eval_task( + async def register_benchmark( self, - eval_task_id: str, + benchmark_id: str, dataset_id: str, scoring_functions: List[str], metadata: Optional[Dict[str, Any]] = None, - provider_eval_task_id: Optional[str] = None, + provider_benchmark_id: Optional[str] = None, provider_id: Optional[str] = None, ) -> None: if metadata is None: @@ -453,17 +456,46 @@ class EvalTasksRoutingTable(CommonRoutingTableImpl, EvalTasks): raise ValueError( "No provider specified and multiple providers available. Please specify a provider_id." ) - if provider_eval_task_id is None: - provider_eval_task_id = eval_task_id - eval_task = EvalTask( - identifier=eval_task_id, + if provider_benchmark_id is None: + provider_benchmark_id = benchmark_id + benchmark = Benchmark( + identifier=benchmark_id, dataset_id=dataset_id, scoring_functions=scoring_functions, metadata=metadata, provider_id=provider_id, - provider_resource_id=provider_eval_task_id, + provider_resource_id=provider_benchmark_id, + ) + await self.register_object(benchmark) + + async def DEPRECATED_list_eval_tasks(self) -> ListBenchmarksResponse: + logger.warning("DEPRECATED: Use /eval/benchmarks instead") + return await self.list_benchmarks() + + async def DEPRECATED_get_eval_task( + self, + eval_task_id: str, + ) -> Optional[Benchmark]: + logger.warning("DEPRECATED: Use /eval/benchmarks instead") + return await self.get_benchmark(eval_task_id) + + async def DEPRECATED_register_eval_task( + self, + eval_task_id: str, + dataset_id: str, + scoring_functions: List[str], + provider_benchmark_id: Optional[str] = None, + provider_id: Optional[str] = None, + metadata: Optional[Dict[str, Any]] = None, + ) -> None: + logger.warning("DEPRECATED: Use /eval/benchmarks instead") + return await self.register_benchmark( + benchmark_id=eval_task_id, + dataset_id=dataset_id, + scoring_functions=scoring_functions, + metadata=metadata, + provider_benchmark_id=provider_benchmark_id, ) - await self.register_object(eval_task) class ToolGroupsRoutingTable(CommonRoutingTableImpl, ToolGroups): diff --git a/llama_stack/distribution/stack.py b/llama_stack/distribution/stack.py index 2baad8ac4..9335dc3a9 100644 --- a/llama_stack/distribution/stack.py +++ b/llama_stack/distribution/stack.py @@ -15,10 +15,10 @@ from termcolor import colored from llama_stack.apis.agents import Agents from llama_stack.apis.batch_inference import BatchInference +from llama_stack.apis.benchmarks import Benchmarks from llama_stack.apis.datasetio import DatasetIO from llama_stack.apis.datasets import Datasets from llama_stack.apis.eval import Eval -from llama_stack.apis.eval_tasks import EvalTasks from llama_stack.apis.inference import Inference from 
llama_stack.apis.inspect import Inspect from llama_stack.apis.models import Models @@ -53,7 +53,7 @@ class LlamaStack( PostTraining, VectorIO, Eval, - EvalTasks, + Benchmarks, Scoring, ScoringFunctions, DatasetIO, @@ -78,7 +78,7 @@ RESOURCES = [ "register_scoring_function", "list_scoring_functions", ), - ("eval_tasks", Api.eval_tasks, "register_eval_task", "list_eval_tasks"), + ("benchmarks", Api.benchmarks, "register_benchmark", "list_benchmarks"), ("tool_groups", Api.tool_groups, "register_tool_group", "list_tool_groups"), ] diff --git a/llama_stack/distribution/ui/README.md b/llama_stack/distribution/ui/README.md index c0a2597af..8fceb5c63 100644 --- a/llama_stack/distribution/ui/README.md +++ b/llama_stack/distribution/ui/README.md @@ -26,7 +26,7 @@ $ llama-stack-client datasets register \ ``` ```bash -$ llama-stack-client eval_tasks register \ +$ llama-stack-client benchmarks register \ --eval-task-id meta-reference-mmlu \ --provider-id meta-reference \ --dataset-id mmlu \ diff --git a/llama_stack/distribution/ui/page/distribution/eval_tasks.py b/llama_stack/distribution/ui/page/distribution/eval_tasks.py index f58969663..1428ae9ab 100644 --- a/llama_stack/distribution/ui/page/distribution/eval_tasks.py +++ b/llama_stack/distribution/ui/page/distribution/eval_tasks.py @@ -8,12 +8,12 @@ import streamlit as st from modules.api import llama_stack_api -def eval_tasks(): - # Eval Tasks Section - st.header("Eval Tasks") +def benchmarks(): + # Benchmarks Section + st.header("Benchmarks") - eval_tasks_info = {d.identifier: d.to_dict() for d in llama_stack_api.client.eval_tasks.list()} + benchmarks_info = {d.identifier: d.to_dict() for d in llama_stack_api.client.benchmarks.list()} - if len(eval_tasks_info) > 0: - selected_eval_task = st.selectbox("Select an eval task", list(eval_tasks_info.keys()), key="eval_task_inspect") - st.json(eval_tasks_info[selected_eval_task], expanded=True) + if len(benchmarks_info) > 0: + selected_benchmark = st.selectbox("Select an eval task", list(benchmarks_info.keys()), key="benchmark_inspect") + st.json(benchmarks_info[selected_benchmark], expanded=True) diff --git a/llama_stack/distribution/ui/page/distribution/resources.py b/llama_stack/distribution/ui/page/distribution/resources.py index 94b840bcb..684270d4d 100644 --- a/llama_stack/distribution/ui/page/distribution/resources.py +++ b/llama_stack/distribution/ui/page/distribution/resources.py @@ -4,8 +4,8 @@ # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. 
+from page.distribution.benchmarks import benchmarks from page.distribution.datasets import datasets -from page.distribution.eval_tasks import eval_tasks from page.distribution.models import models from page.distribution.scoring_functions import scoring_functions from page.distribution.shields import shields @@ -20,7 +20,7 @@ def resources_page(): "Shields", "Scoring Functions", "Datasets", - "Eval Tasks", + "Benchmarks", ] icons = ["magic", "memory", "shield", "file-bar-graph", "database", "list-task"] selected_resource = option_menu( @@ -34,8 +34,8 @@ def resources_page(): }, }, ) - if selected_resource == "Eval Tasks": - eval_tasks() + if selected_resource == "Benchmarks": + benchmarks() elif selected_resource == "Vector Databases": vector_dbs() elif selected_resource == "Datasets": diff --git a/llama_stack/distribution/ui/page/evaluations/native_eval.py b/llama_stack/distribution/ui/page/evaluations/native_eval.py index 112d9cff0..f1cae714a 100644 --- a/llama_stack/distribution/ui/page/evaluations/native_eval.py +++ b/llama_stack/distribution/ui/page/evaluations/native_eval.py @@ -11,28 +11,28 @@ import streamlit as st from modules.api import llama_stack_api -def select_eval_task_1(): - # Select Eval Tasks +def select_benchmark_1(): + # Select Benchmarks st.subheader("1. Choose An Eval Task") - eval_tasks = llama_stack_api.client.eval_tasks.list() - eval_tasks = {et.identifier: et for et in eval_tasks} - eval_tasks_names = list(eval_tasks.keys()) - selected_eval_task = st.selectbox( + benchmarks = llama_stack_api.client.benchmarks.list() + benchmarks = {et.identifier: et for et in benchmarks} + benchmarks_names = list(benchmarks.keys()) + selected_benchmark = st.selectbox( "Choose an eval task.", - options=eval_tasks_names, + options=benchmarks_names, help="Choose an eval task. Each eval task is parameterized by a dataset, and list of scoring functions.", ) with st.expander("View Eval Task"): - st.json(eval_tasks[selected_eval_task], expanded=True) + st.json(benchmarks[selected_benchmark], expanded=True) - st.session_state["selected_eval_task"] = selected_eval_task - st.session_state["eval_tasks"] = eval_tasks + st.session_state["selected_benchmark"] = selected_benchmark + st.session_state["benchmarks"] = benchmarks if st.button("Confirm", key="confirm_1"): - st.session_state["selected_eval_task_1_next"] = True + st.session_state["selected_benchmark_1_next"] = True def define_eval_candidate_2(): - if not st.session_state.get("selected_eval_task_1_next", None): + if not st.session_state.get("selected_benchmark_1_next", None): return st.subheader("2. Define Eval Candidate") @@ -161,11 +161,11 @@ def run_evaluation_3(): Review the configurations that will be used for this evaluation run, make any necessary changes, and then click the "Run Evaluation" button. """ ) - selected_eval_task = st.session_state["selected_eval_task"] - eval_tasks = st.session_state["eval_tasks"] + selected_benchmark = st.session_state["selected_benchmark"] + benchmarks = st.session_state["benchmarks"] eval_candidate = st.session_state["eval_candidate"] - dataset_id = eval_tasks[selected_eval_task].dataset_id + dataset_id = benchmarks[selected_benchmark].dataset_id rows = llama_stack_api.client.datasetio.get_rows_paginated( dataset_id=dataset_id, rows_in_page=-1, @@ -180,16 +180,16 @@ def run_evaluation_3(): help="Number of examples from the dataset to evaluate. 
", ) - eval_task_config = { + benchmark_config = { "type": "benchmark", "eval_candidate": eval_candidate, "scoring_params": {}, } with st.expander("View Evaluation Task", expanded=True): - st.json(eval_tasks[selected_eval_task], expanded=True) + st.json(benchmarks[selected_benchmark], expanded=True) with st.expander("View Evaluation Task Configuration", expanded=True): - st.json(eval_task_config, expanded=True) + st.json(benchmark_config, expanded=True) # Add run button and handle evaluation if st.button("Run Evaluation"): @@ -209,10 +209,10 @@ def run_evaluation_3(): progress_bar.progress(progress, text=progress_text) # Run evaluation for current row eval_res = llama_stack_api.client.eval.evaluate_rows( - task_id=selected_eval_task, + benchmark_id=selected_benchmark, input_rows=[r], - scoring_functions=eval_tasks[selected_eval_task].scoring_functions, - task_config=eval_task_config, + scoring_functions=benchmarks[selected_benchmark].scoring_functions, + task_config=benchmark_config, ) for k in r.keys(): @@ -225,7 +225,7 @@ def run_evaluation_3(): output_res[k] = [] output_res[k].append(eval_res.generations[0][k]) - for scoring_fn in eval_tasks[selected_eval_task].scoring_functions: + for scoring_fn in benchmarks[selected_benchmark].scoring_functions: if scoring_fn not in output_res: output_res[scoring_fn] = [] output_res[scoring_fn].append(eval_res.scores[scoring_fn].score_rows[0]) @@ -245,7 +245,7 @@ def native_evaluation_page(): st.set_page_config(page_title="Evaluations (Generation + Scoring)", page_icon="🦙") st.title("📊 Evaluations (Generation + Scoring)") - select_eval_task_1() + select_benchmark_1() define_eval_candidate_2() run_evaluation_3() diff --git a/llama_stack/providers/datatypes.py b/llama_stack/providers/datatypes.py index ccdaf76e7..b92f9dc0a 100644 --- a/llama_stack/providers/datatypes.py +++ b/llama_stack/providers/datatypes.py @@ -10,9 +10,9 @@ from urllib.parse import urlparse from llama_models.schema_utils import json_schema_type from pydantic import BaseModel, Field +from llama_stack.apis.benchmarks import Benchmark from llama_stack.apis.datasets import Dataset from llama_stack.apis.datatypes import Api -from llama_stack.apis.eval_tasks import EvalTask from llama_stack.apis.models import Model from llama_stack.apis.scoring_functions import ScoringFn from llama_stack.apis.shields import Shield @@ -48,8 +48,8 @@ class ScoringFunctionsProtocolPrivate(Protocol): async def register_scoring_function(self, scoring_fn: ScoringFn) -> None: ... -class EvalTasksProtocolPrivate(Protocol): - async def register_eval_task(self, eval_task: EvalTask) -> None: ... +class BenchmarksProtocolPrivate(Protocol): + async def register_benchmark(self, benchmark: Benchmark) -> None: ... 
class ToolsProtocolPrivate(Protocol): diff --git a/llama_stack/providers/inline/eval/meta_reference/eval.py b/llama_stack/providers/inline/eval/meta_reference/eval.py index 1c44caf7f..cd99c9ad8 100644 --- a/llama_stack/providers/inline/eval/meta_reference/eval.py +++ b/llama_stack/providers/inline/eval/meta_reference/eval.py @@ -8,13 +8,13 @@ from typing import Any, Dict, List, Optional from tqdm import tqdm from llama_stack.apis.agents import Agents, StepType +from llama_stack.apis.benchmarks import Benchmark from llama_stack.apis.datasetio import DatasetIO from llama_stack.apis.datasets import Datasets -from llama_stack.apis.eval_tasks import EvalTask from llama_stack.apis.inference import Inference, UserMessage from llama_stack.apis.scoring import Scoring from llama_stack.distribution.datatypes import Api -from llama_stack.providers.datatypes import EvalTasksProtocolPrivate +from llama_stack.providers.datatypes import BenchmarksProtocolPrivate from llama_stack.providers.inline.agents.meta_reference.agent_instance import ( MEMORY_QUERY_TOOL, ) @@ -26,15 +26,15 @@ from llama_stack.providers.utils.common.data_schema_validator import ( from llama_stack.providers.utils.kvstore import kvstore_impl from .....apis.common.job_types import Job -from .....apis.eval.eval import Eval, EvalTaskConfig, EvaluateResponse, JobStatus +from .....apis.eval.eval import BenchmarkConfig, Eval, EvaluateResponse, JobStatus from .config import MetaReferenceEvalConfig -EVAL_TASKS_PREFIX = "eval_tasks:" +EVAL_TASKS_PREFIX = "benchmarks:" class MetaReferenceEvalImpl( Eval, - EvalTasksProtocolPrivate, + BenchmarksProtocolPrivate, ): def __init__( self, @@ -55,36 +55,36 @@ class MetaReferenceEvalImpl( # TODO: assume sync job, will need jobs API for async scheduling self.jobs = {} - self.eval_tasks = {} + self.benchmarks = {} async def initialize(self) -> None: self.kvstore = await kvstore_impl(self.config.kvstore) - # Load existing eval_tasks from kvstore + # Load existing benchmarks from kvstore start_key = EVAL_TASKS_PREFIX end_key = f"{EVAL_TASKS_PREFIX}\xff" - stored_eval_tasks = await self.kvstore.range(start_key, end_key) + stored_benchmarks = await self.kvstore.range(start_key, end_key) - for eval_task in stored_eval_tasks: - eval_task = EvalTask.model_validate_json(eval_task) - self.eval_tasks[eval_task.identifier] = eval_task + for benchmark in stored_benchmarks: + benchmark = Benchmark.model_validate_json(benchmark) + self.benchmarks[benchmark.identifier] = benchmark async def shutdown(self) -> None: ... 
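
The `initialize()` above reloads previously registered benchmarks by range-scanning the kvstore from the bare `benchmarks:` prefix to the prefix followed by `\xff`. A self-contained sketch of why that upper bound captures exactly the prefixed keys, with a plain dict standing in for the async `KVStore` (an assumption made purely for illustration):

```python
# Plain-dict stand-in for the async KVStore, used only to illustrate the
# ("benchmarks:", "benchmarks:\xff") range scan performed in initialize().
from typing import Dict, List

EVAL_TASKS_PREFIX = "benchmarks:"


def range_scan(store: Dict[str, str], start_key: str, end_key: str) -> List[str]:
    # Lexicographic filter, mirroring what KVStore.range(start_key, end_key) returns.
    return [value for key, value in sorted(store.items()) if start_key <= key <= end_key]


store = {
    "benchmarks:meta-reference-mmlu": '{"identifier": "meta-reference-mmlu"}',
    "benchmarks:my-app-eval": '{"identifier": "my-app-eval"}',
    "scoring_fns:basic::equality": "...",  # different prefix, must not be reloaded
}

# "\xff" sorts after every ASCII character, so every key that starts with the
# prefix falls inside the range while keys under other prefixes fall outside it.
values = range_scan(store, EVAL_TASKS_PREFIX, f"{EVAL_TASKS_PREFIX}\xff")
assert len(values) == 2
```
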
- async def register_eval_task(self, task_def: EvalTask) -> None: + async def register_benchmark(self, task_def: Benchmark) -> None: # Store in kvstore key = f"{EVAL_TASKS_PREFIX}{task_def.identifier}" await self.kvstore.set( key=key, value=task_def.model_dump_json(), ) - self.eval_tasks[task_def.identifier] = task_def + self.benchmarks[task_def.identifier] = task_def async def run_eval( self, - task_id: str, - task_config: EvalTaskConfig, + benchmark_id: str, + task_config: BenchmarkConfig, ) -> Job: - task_def = self.eval_tasks[task_id] + task_def = self.benchmarks[benchmark_id] dataset_id = task_def.dataset_id candidate = task_config.eval_candidate scoring_functions = task_def.scoring_functions @@ -95,7 +95,7 @@ class MetaReferenceEvalImpl( rows_in_page=(-1 if task_config.num_examples is None else task_config.num_examples), ) res = await self.evaluate_rows( - task_id=task_id, + benchmark_id=benchmark_id, input_rows=all_rows.rows, scoring_functions=scoring_functions, task_config=task_config, @@ -108,7 +108,7 @@ class MetaReferenceEvalImpl( return Job(job_id=job_id) async def _run_agent_generation( - self, input_rows: List[Dict[str, Any]], task_config: EvalTaskConfig + self, input_rows: List[Dict[str, Any]], task_config: BenchmarkConfig ) -> List[Dict[str, Any]]: candidate = task_config.eval_candidate create_response = await self.agents_api.create_agent(candidate.config) @@ -151,7 +151,7 @@ class MetaReferenceEvalImpl( return generations async def _run_model_generation( - self, input_rows: List[Dict[str, Any]], task_config: EvalTaskConfig + self, input_rows: List[Dict[str, Any]], task_config: BenchmarkConfig ) -> List[Dict[str, Any]]: candidate = task_config.eval_candidate assert candidate.sampling_params.max_tokens is not None, "SamplingParams.max_tokens must be provided" @@ -187,10 +187,10 @@ class MetaReferenceEvalImpl( async def evaluate_rows( self, - task_id: str, + benchmark_id: str, input_rows: List[Dict[str, Any]], scoring_functions: List[str], - task_config: EvalTaskConfig, + task_config: BenchmarkConfig, ) -> EvaluateResponse: candidate = task_config.eval_candidate if candidate.type == "agent": @@ -203,7 +203,7 @@ class MetaReferenceEvalImpl( # scoring with generated_answer score_input_rows = [input_r | generated_r for input_r, generated_r in zip(input_rows, generations)] - if task_config.type == "app" and task_config.scoring_params is not None: + if task_config.scoring_params is not None: scoring_functions_dict = { scoring_fn_id: task_config.scoring_params.get(scoring_fn_id, None) for scoring_fn_id in scoring_functions @@ -217,18 +217,60 @@ class MetaReferenceEvalImpl( return EvaluateResponse(generations=generations, scores=score_response.results) - async def job_status(self, task_id: str, job_id: str) -> Optional[JobStatus]: + async def job_status(self, benchmark_id: str, job_id: str) -> Optional[JobStatus]: if job_id in self.jobs: return JobStatus.completed return None - async def job_cancel(self, task_id: str, job_id: str) -> None: + async def job_cancel(self, benchmark_id: str, job_id: str) -> None: raise NotImplementedError("Job cancel is not implemented yet") - async def job_result(self, task_id: str, job_id: str) -> EvaluateResponse: - status = await self.job_status(task_id, job_id) + async def job_result(self, benchmark_id: str, job_id: str) -> EvaluateResponse: + status = await self.job_status(benchmark_id, job_id) if not status or status != JobStatus.completed: raise ValueError(f"Job is not completed, Status: {status.value}") return self.jobs[job_id] + + async def 
DEPRECATED_run_eval( + self, + task_id: str, + task_config: BenchmarkConfig, + ) -> Job: + return await self.run_eval(benchmark_id=task_id, task_config=task_config) + + async def DEPRECATED_evaluate_rows( + self, + task_id: str, + input_rows: List[Dict[str, Any]], + scoring_functions: List[str], + task_config: BenchmarkConfig, + ) -> EvaluateResponse: + return await self.evaluate_rows( + benchmark_id=task_id, + input_rows=input_rows, + scoring_functions=scoring_functions, + task_config=task_config, + ) + + async def DEPRECATED_job_status( + self, + task_id: str, + job_id: str, + ) -> Optional[JobStatus]: + return await self.job_status(benchmark_id=task_id, job_id=job_id) + + async def DEPRECATED_job_cancel( + self, + task_id: str, + job_id: str, + ) -> None: + return await self.job_cancel(benchmark_id=task_id, job_id=job_id) + + async def DEPRECATED_job_result( + self, + task_id: str, + job_id: str, + ) -> EvaluateResponse: + return await self.job_result(benchmark_id=task_id, job_id=job_id) diff --git a/llama_stack/providers/tests/eval/test_eval.py b/llama_stack/providers/tests/eval/test_eval.py index ec3d08728..ad80b8601 100644 --- a/llama_stack/providers/tests/eval/test_eval.py +++ b/llama_stack/providers/tests/eval/test_eval.py @@ -10,8 +10,8 @@ import pytest from llama_stack.apis.common.content_types import URL from llama_stack.apis.common.type_system import ChatCompletionInputType, StringType from llama_stack.apis.eval.eval import ( - AppEvalTaskConfig, - BenchmarkEvalTaskConfig, + AppBenchmarkConfig, + BenchmarkBenchmarkConfig, ModelCandidate, ) from llama_stack.apis.inference import SamplingParams @@ -30,18 +30,18 @@ from .constants import JUDGE_PROMPT class Testeval: @pytest.mark.asyncio - async def test_eval_tasks_list(self, eval_stack): + async def test_benchmarks_list(self, eval_stack): # NOTE: this needs you to ensure that you are starting from a clean state # but so far we don't have an unregister API unfortunately, so be careful - eval_tasks_impl = eval_stack[Api.eval_tasks] - response = await eval_tasks_impl.list_eval_tasks() + benchmarks_impl = eval_stack[Api.benchmarks] + response = await benchmarks_impl.list_benchmarks() assert isinstance(response, list) @pytest.mark.asyncio async def test_eval_evaluate_rows(self, eval_stack, inference_model, judge_model): - eval_impl, eval_tasks_impl, datasetio_impl, datasets_impl, models_impl = ( + eval_impl, benchmarks_impl, datasetio_impl, datasets_impl, models_impl = ( eval_stack[Api.eval], - eval_stack[Api.eval_tasks], + eval_stack[Api.benchmarks], eval_stack[Api.datasetio], eval_stack[Api.datasets], eval_stack[Api.models], @@ -59,17 +59,17 @@ class Testeval: scoring_functions = [ "basic::equality", ] - task_id = "meta-reference::app_eval" - await eval_tasks_impl.register_eval_task( - eval_task_id=task_id, + benchmark_id = "meta-reference::app_eval" + await benchmarks_impl.register_benchmark( + benchmark_id=benchmark_id, dataset_id="test_dataset_for_eval", scoring_functions=scoring_functions, ) response = await eval_impl.evaluate_rows( - task_id=task_id, + benchmark_id=benchmark_id, input_rows=rows.rows, scoring_functions=scoring_functions, - task_config=AppEvalTaskConfig( + task_config=AppBenchmarkConfig( eval_candidate=ModelCandidate( model=inference_model, sampling_params=SamplingParams(), @@ -92,9 +92,9 @@ class Testeval: @pytest.mark.asyncio async def test_eval_run_eval(self, eval_stack, inference_model, judge_model): - eval_impl, eval_tasks_impl, datasets_impl, models_impl = ( + eval_impl, benchmarks_impl, datasets_impl, 
models_impl = ( eval_stack[Api.eval], - eval_stack[Api.eval_tasks], + eval_stack[Api.benchmarks], eval_stack[Api.datasets], eval_stack[Api.models], ) @@ -105,15 +105,15 @@ class Testeval: "basic::subset_of", ] - task_id = "meta-reference::app_eval-2" - await eval_tasks_impl.register_eval_task( - eval_task_id=task_id, + benchmark_id = "meta-reference::app_eval-2" + await benchmarks_impl.register_benchmark( + benchmark_id=benchmark_id, dataset_id="test_dataset_for_eval", scoring_functions=scoring_functions, ) response = await eval_impl.run_eval( - task_id=task_id, - task_config=AppEvalTaskConfig( + benchmark_id=benchmark_id, + task_config=AppBenchmarkConfig( eval_candidate=ModelCandidate( model=inference_model, sampling_params=SamplingParams(), @@ -121,9 +121,9 @@ class Testeval: ), ) assert response.job_id == "0" - job_status = await eval_impl.job_status(task_id, response.job_id) + job_status = await eval_impl.job_status(benchmark_id, response.job_id) assert job_status and job_status.value == "completed" - eval_response = await eval_impl.job_result(task_id, response.job_id) + eval_response = await eval_impl.job_result(benchmark_id, response.job_id) assert eval_response is not None assert len(eval_response.generations) == 5 @@ -131,9 +131,9 @@ class Testeval: @pytest.mark.asyncio async def test_eval_run_benchmark_eval(self, eval_stack, inference_model): - eval_impl, eval_tasks_impl, datasets_impl, models_impl = ( + eval_impl, benchmarks_impl, datasets_impl, models_impl = ( eval_stack[Api.eval], - eval_stack[Api.eval_tasks], + eval_stack[Api.benchmarks], eval_stack[Api.datasets], eval_stack[Api.models], ) @@ -159,20 +159,20 @@ class Testeval: ) # register eval task - await eval_tasks_impl.register_eval_task( - eval_task_id="meta-reference-mmlu", + await benchmarks_impl.register_benchmark( + benchmark_id="meta-reference-mmlu", dataset_id="mmlu", scoring_functions=["basic::regex_parser_multiple_choice_answer"], ) # list benchmarks - response = await eval_tasks_impl.list_eval_tasks() + response = await benchmarks_impl.list_benchmarks() assert len(response) > 0 benchmark_id = "meta-reference-mmlu" response = await eval_impl.run_eval( - task_id=benchmark_id, - task_config=BenchmarkEvalTaskConfig( + benchmark_id=benchmark_id, + task_config=BenchmarkBenchmarkConfig( eval_candidate=ModelCandidate( model=inference_model, sampling_params=SamplingParams(), diff --git a/llama_stack/providers/tests/resolver.py b/llama_stack/providers/tests/resolver.py index 0ff632717..76343b7f4 100644 --- a/llama_stack/providers/tests/resolver.py +++ b/llama_stack/providers/tests/resolver.py @@ -10,8 +10,8 @@ from typing import Any, Dict, List, Optional from pydantic import BaseModel +from llama_stack.apis.benchmarks import BenchmarkInput from llama_stack.apis.datasets import DatasetInput -from llama_stack.apis.eval_tasks import EvalTaskInput from llama_stack.apis.models import ModelInput from llama_stack.apis.scoring_functions import ScoringFnInput from llama_stack.apis.shields import ShieldInput @@ -42,7 +42,7 @@ async def construct_stack_for_test( vector_dbs: Optional[List[VectorDBInput]] = None, datasets: Optional[List[DatasetInput]] = None, scoring_fns: Optional[List[ScoringFnInput]] = None, - eval_tasks: Optional[List[EvalTaskInput]] = None, + benchmarks: Optional[List[BenchmarkInput]] = None, tool_groups: Optional[List[ToolGroupInput]] = None, ) -> TestStack: sqlite_file = tempfile.NamedTemporaryFile(delete=False, suffix=".db") @@ -56,7 +56,7 @@ async def construct_stack_for_test( vector_dbs=vector_dbs or [], 
datasets=datasets or [], scoring_fns=scoring_fns or [], - eval_tasks=eval_tasks or [], + benchmarks=benchmarks or [], tool_groups=tool_groups or [], ) run_config = parse_and_maybe_upgrade_config(run_config) diff --git a/llama_stack/templates/bedrock/run.yaml b/llama_stack/templates/bedrock/run.yaml index be6c9a928..7d03b7c29 100644 --- a/llama_stack/templates/bedrock/run.yaml +++ b/llama_stack/templates/bedrock/run.yaml @@ -107,7 +107,7 @@ shields: [] vector_dbs: [] datasets: [] scoring_fns: [] -eval_tasks: [] +benchmarks: [] tool_groups: - toolgroup_id: builtin::websearch provider_id: tavily-search diff --git a/llama_stack/templates/cerebras/run.yaml b/llama_stack/templates/cerebras/run.yaml index 05d3f4525..6afff2be2 100644 --- a/llama_stack/templates/cerebras/run.yaml +++ b/llama_stack/templates/cerebras/run.yaml @@ -109,7 +109,7 @@ shields: [] vector_dbs: [] datasets: [] scoring_fns: [] -eval_tasks: [] +benchmarks: [] tool_groups: - toolgroup_id: builtin::websearch provider_id: tavily-search diff --git a/llama_stack/templates/dell/run-with-safety.yaml b/llama_stack/templates/dell/run-with-safety.yaml index 04c5957d4..ddec3a715 100644 --- a/llama_stack/templates/dell/run-with-safety.yaml +++ b/llama_stack/templates/dell/run-with-safety.yaml @@ -108,7 +108,7 @@ shields: vector_dbs: [] datasets: [] scoring_fns: [] -eval_tasks: [] +benchmarks: [] tool_groups: - toolgroup_id: builtin::websearch provider_id: brave-search diff --git a/llama_stack/templates/dell/run.yaml b/llama_stack/templates/dell/run.yaml index 706444eb1..9394c94ef 100644 --- a/llama_stack/templates/dell/run.yaml +++ b/llama_stack/templates/dell/run.yaml @@ -99,7 +99,7 @@ shields: [] vector_dbs: [] datasets: [] scoring_fns: [] -eval_tasks: [] +benchmarks: [] tool_groups: - toolgroup_id: builtin::websearch provider_id: brave-search diff --git a/llama_stack/templates/experimental-post-training/run.yaml b/llama_stack/templates/experimental-post-training/run.yaml index 75d103c9f..e70ccdd2d 100644 --- a/llama_stack/templates/experimental-post-training/run.yaml +++ b/llama_stack/templates/experimental-post-training/run.yaml @@ -85,4 +85,4 @@ shields: [] vector_dbs: [] datasets: [] scoring_fns: [] -eval_tasks: [] +benchmarks: [] diff --git a/llama_stack/templates/fireworks/run-with-safety.yaml b/llama_stack/templates/fireworks/run-with-safety.yaml index 0fbe14a5a..8f95e9d59 100644 --- a/llama_stack/templates/fireworks/run-with-safety.yaml +++ b/llama_stack/templates/fireworks/run-with-safety.yaml @@ -164,7 +164,7 @@ shields: vector_dbs: [] datasets: [] scoring_fns: [] -eval_tasks: [] +benchmarks: [] tool_groups: - toolgroup_id: builtin::websearch provider_id: tavily-search diff --git a/llama_stack/templates/fireworks/run.yaml b/llama_stack/templates/fireworks/run.yaml index ccf67dcbb..64229a5d8 100644 --- a/llama_stack/templates/fireworks/run.yaml +++ b/llama_stack/templates/fireworks/run.yaml @@ -153,7 +153,7 @@ shields: vector_dbs: [] datasets: [] scoring_fns: [] -eval_tasks: [] +benchmarks: [] tool_groups: - toolgroup_id: builtin::websearch provider_id: tavily-search diff --git a/llama_stack/templates/hf-endpoint/run-with-safety.yaml b/llama_stack/templates/hf-endpoint/run-with-safety.yaml index f520a2fda..867d7a076 100644 --- a/llama_stack/templates/hf-endpoint/run-with-safety.yaml +++ b/llama_stack/templates/hf-endpoint/run-with-safety.yaml @@ -116,7 +116,7 @@ shields: vector_dbs: [] datasets: [] scoring_fns: [] -eval_tasks: [] +benchmarks: [] tool_groups: - toolgroup_id: builtin::websearch provider_id: tavily-search diff 
--git a/llama_stack/templates/hf-endpoint/run.yaml b/llama_stack/templates/hf-endpoint/run.yaml index 708cb1bcc..d60acdefd 100644 --- a/llama_stack/templates/hf-endpoint/run.yaml +++ b/llama_stack/templates/hf-endpoint/run.yaml @@ -106,7 +106,7 @@ shields: [] vector_dbs: [] datasets: [] scoring_fns: [] -eval_tasks: [] +benchmarks: [] tool_groups: - toolgroup_id: builtin::websearch provider_id: tavily-search diff --git a/llama_stack/templates/hf-serverless/run-with-safety.yaml b/llama_stack/templates/hf-serverless/run-with-safety.yaml index 7f0abf5be..e58ad15b3 100644 --- a/llama_stack/templates/hf-serverless/run-with-safety.yaml +++ b/llama_stack/templates/hf-serverless/run-with-safety.yaml @@ -116,7 +116,7 @@ shields: vector_dbs: [] datasets: [] scoring_fns: [] -eval_tasks: [] +benchmarks: [] tool_groups: - toolgroup_id: builtin::websearch provider_id: tavily-search diff --git a/llama_stack/templates/hf-serverless/run.yaml b/llama_stack/templates/hf-serverless/run.yaml index c0b7a4c60..5045e821a 100644 --- a/llama_stack/templates/hf-serverless/run.yaml +++ b/llama_stack/templates/hf-serverless/run.yaml @@ -106,7 +106,7 @@ shields: [] vector_dbs: [] datasets: [] scoring_fns: [] -eval_tasks: [] +benchmarks: [] tool_groups: - toolgroup_id: builtin::websearch provider_id: tavily-search diff --git a/llama_stack/templates/meta-reference-gpu/run-with-safety.yaml b/llama_stack/templates/meta-reference-gpu/run-with-safety.yaml index c5286fc6b..caac65c8c 100644 --- a/llama_stack/templates/meta-reference-gpu/run-with-safety.yaml +++ b/llama_stack/templates/meta-reference-gpu/run-with-safety.yaml @@ -118,7 +118,7 @@ shields: vector_dbs: [] datasets: [] scoring_fns: [] -eval_tasks: [] +benchmarks: [] tool_groups: - toolgroup_id: builtin::websearch provider_id: tavily-search diff --git a/llama_stack/templates/meta-reference-gpu/run.yaml b/llama_stack/templates/meta-reference-gpu/run.yaml index 310585f23..bade9a076 100644 --- a/llama_stack/templates/meta-reference-gpu/run.yaml +++ b/llama_stack/templates/meta-reference-gpu/run.yaml @@ -107,7 +107,7 @@ shields: [] vector_dbs: [] datasets: [] scoring_fns: [] -eval_tasks: [] +benchmarks: [] tool_groups: - toolgroup_id: builtin::websearch provider_id: tavily-search diff --git a/llama_stack/templates/meta-reference-quantized-gpu/run.yaml b/llama_stack/templates/meta-reference-quantized-gpu/run.yaml index d43cf3917..f131e8ea6 100644 --- a/llama_stack/templates/meta-reference-quantized-gpu/run.yaml +++ b/llama_stack/templates/meta-reference-quantized-gpu/run.yaml @@ -109,7 +109,7 @@ shields: [] vector_dbs: [] datasets: [] scoring_fns: [] -eval_tasks: [] +benchmarks: [] tool_groups: - toolgroup_id: builtin::websearch provider_id: tavily-search diff --git a/llama_stack/templates/nvidia/run.yaml b/llama_stack/templates/nvidia/run.yaml index c8ae362f5..14fb28354 100644 --- a/llama_stack/templates/nvidia/run.yaml +++ b/llama_stack/templates/nvidia/run.yaml @@ -139,7 +139,7 @@ shields: [] vector_dbs: [] datasets: [] scoring_fns: [] -eval_tasks: [] +benchmarks: [] tool_groups: - toolgroup_id: builtin::websearch provider_id: tavily-search diff --git a/llama_stack/templates/ollama/run-with-safety.yaml b/llama_stack/templates/ollama/run-with-safety.yaml index ac5dab755..9d5bfc7a0 100644 --- a/llama_stack/templates/ollama/run-with-safety.yaml +++ b/llama_stack/templates/ollama/run-with-safety.yaml @@ -113,7 +113,7 @@ shields: vector_dbs: [] datasets: [] scoring_fns: [] -eval_tasks: [] +benchmarks: [] tool_groups: - toolgroup_id: builtin::websearch provider_id: 
tavily-search diff --git a/llama_stack/templates/ollama/run.yaml b/llama_stack/templates/ollama/run.yaml index 3a60fe61f..9ac1f3267 100644 --- a/llama_stack/templates/ollama/run.yaml +++ b/llama_stack/templates/ollama/run.yaml @@ -110,7 +110,7 @@ shields: [] vector_dbs: [] datasets: [] scoring_fns: [] -eval_tasks: [] +benchmarks: [] tool_groups: - toolgroup_id: builtin::websearch provider_id: tavily-search diff --git a/llama_stack/templates/remote-vllm/run-with-safety.yaml b/llama_stack/templates/remote-vllm/run-with-safety.yaml index 1fe998a1f..dd43f21f6 100644 --- a/llama_stack/templates/remote-vllm/run-with-safety.yaml +++ b/llama_stack/templates/remote-vllm/run-with-safety.yaml @@ -118,7 +118,7 @@ shields: vector_dbs: [] datasets: [] scoring_fns: [] -eval_tasks: [] +benchmarks: [] tool_groups: - toolgroup_id: builtin::websearch provider_id: tavily-search diff --git a/llama_stack/templates/remote-vllm/run.yaml b/llama_stack/templates/remote-vllm/run.yaml index 9d3db8a31..24cd207c7 100644 --- a/llama_stack/templates/remote-vllm/run.yaml +++ b/llama_stack/templates/remote-vllm/run.yaml @@ -107,7 +107,7 @@ shields: [] vector_dbs: [] datasets: [] scoring_fns: [] -eval_tasks: [] +benchmarks: [] tool_groups: - toolgroup_id: builtin::websearch provider_id: tavily-search diff --git a/llama_stack/templates/sambanova/run.yaml b/llama_stack/templates/sambanova/run.yaml index 39b0f3c4e..26815dcd0 100644 --- a/llama_stack/templates/sambanova/run.yaml +++ b/llama_stack/templates/sambanova/run.yaml @@ -118,7 +118,7 @@ shields: vector_dbs: [] datasets: [] scoring_fns: [] -eval_tasks: [] +benchmarks: [] tool_groups: - toolgroup_id: builtin::websearch provider_id: tavily-search diff --git a/llama_stack/templates/tgi/run-with-safety.yaml b/llama_stack/templates/tgi/run-with-safety.yaml index ed6c9ef6f..e1d85f59a 100644 --- a/llama_stack/templates/tgi/run-with-safety.yaml +++ b/llama_stack/templates/tgi/run-with-safety.yaml @@ -106,7 +106,7 @@ shields: vector_dbs: [] datasets: [] scoring_fns: [] -eval_tasks: [] +benchmarks: [] tool_groups: - toolgroup_id: builtin::websearch provider_id: tavily-search diff --git a/llama_stack/templates/tgi/run.yaml b/llama_stack/templates/tgi/run.yaml index 8bf76f37b..fc73e0978 100644 --- a/llama_stack/templates/tgi/run.yaml +++ b/llama_stack/templates/tgi/run.yaml @@ -105,7 +105,7 @@ shields: [] vector_dbs: [] datasets: [] scoring_fns: [] -eval_tasks: [] +benchmarks: [] tool_groups: - toolgroup_id: builtin::websearch provider_id: tavily-search diff --git a/llama_stack/templates/together/run-with-safety.yaml b/llama_stack/templates/together/run-with-safety.yaml index 298926630..f101a5d60 100644 --- a/llama_stack/templates/together/run-with-safety.yaml +++ b/llama_stack/templates/together/run-with-safety.yaml @@ -159,7 +159,7 @@ shields: vector_dbs: [] datasets: [] scoring_fns: [] -eval_tasks: [] +benchmarks: [] tool_groups: - toolgroup_id: builtin::websearch provider_id: tavily-search diff --git a/llama_stack/templates/together/run.yaml b/llama_stack/templates/together/run.yaml index 920003759..8af85979d 100644 --- a/llama_stack/templates/together/run.yaml +++ b/llama_stack/templates/together/run.yaml @@ -148,7 +148,7 @@ shields: vector_dbs: [] datasets: [] scoring_fns: [] -eval_tasks: [] +benchmarks: [] tool_groups: - toolgroup_id: builtin::websearch provider_id: tavily-search diff --git a/llama_stack/templates/vllm-gpu/run.yaml b/llama_stack/templates/vllm-gpu/run.yaml index 41a545e1a..cdce5510d 100644 --- a/llama_stack/templates/vllm-gpu/run.yaml +++ 
b/llama_stack/templates/vllm-gpu/run.yaml @@ -109,7 +109,7 @@ shields: [] vector_dbs: [] datasets: [] scoring_fns: [] -eval_tasks: [] +benchmarks: [] tool_groups: - toolgroup_id: builtin::websearch provider_id: tavily-search
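
With the templates switched from `eval_tasks: []` to `benchmarks: []`, clients address everything by benchmark id. A rough end-to-end sketch of the renamed client surface, mirroring the CLI and UI calls earlier in this diff; the keyword names are assumed to mirror the server-side `register_benchmark` signature, and the base URL, dataset, provider, and scoring-function ids are placeholders:

```python
# Sketch of the post-rename client flow. Keyword names are assumed to mirror
# register_benchmark() on the server; an installed llama-stack-client may still
# expect the older eval_task_id spelling during the deprecation window.
from llama_stack_client import LlamaStackClient

client = LlamaStackClient(base_url="http://localhost:5001")  # placeholder URL

# Register a benchmark (formerly an "eval task") against an existing dataset.
client.benchmarks.register(
    benchmark_id="meta-reference-mmlu",
    dataset_id="mmlu",
    scoring_functions=["basic::regex_parser_multiple_choice_answer"],
    provider_id="meta-reference",
)

# Benchmarks are listed under client.benchmarks instead of client.eval_tasks.
for benchmark in client.benchmarks.list():
    print(benchmark.identifier, benchmark.dataset_id, benchmark.scoring_functions)
```

Evaluation calls follow the same pattern: `evaluate_rows` and `run_eval` now take `benchmark_id=` rather than `task_id=`, as the Streamlit page and the meta-reference eval tests above show.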