diff --git a/docs/_static/llama-stack-spec.html b/docs/_static/llama-stack-spec.html
index c656808a6..652dae562 100644
--- a/docs/_static/llama-stack-spec.html
+++ b/docs/_static/llama-stack-spec.html
@@ -67,8 +67,8 @@
"description": "",
"parameters": [
{
- "name": "benchmark_id",
- "in": "path",
+ "name": "task_id",
+ "in": "query",
"required": true,
"schema": {
"type": "string"
@@ -114,7 +114,7 @@
"content": {
"application/json": {
"schema": {
- "$ref": "#/components/schemas/DeprecatedRegisterBenchmarkRequest"
+ "$ref": "#/components/schemas/DeprecatedRegisterEvalTaskRequest"
}
}
},
@@ -613,7 +613,7 @@
}
}
},
- "/v1/eval/tasks/{benchmark_id}/evaluations": {
+ "/v1/eval/benchmarks/{benchmark_id}/evaluations": {
"post": {
"responses": {
"200": {
@@ -653,6 +653,47 @@
}
}
},
+ "/v1/eval/tasks/{task_id}/evaluations": {
+ "post": {
+ "responses": {
+ "200": {
+ "description": "OK",
+ "content": {
+ "application/json": {
+ "schema": {
+ "$ref": "#/components/schemas/EvaluateResponse"
+ }
+ }
+ }
+ }
+ },
+ "tags": [
+ "Eval"
+ ],
+ "description": "",
+ "parameters": [
+ {
+ "name": "task_id",
+ "in": "path",
+ "required": true,
+ "schema": {
+ "type": "string"
+ }
+ }
+ ],
+ "requestBody": {
+ "content": {
+ "application/json": {
+ "schema": {
+ "$ref": "#/components/schemas/EvaluateRowsDeprecatedRequest"
+ }
+ }
+ },
+ "required": true
+ },
+ "deprecated": true
+ }
+ },
"/v1/agents/{agent_id}/session/{session_id}/turn/{turn_id}/step/{step_id}": {
"get": {
"responses": {
@@ -753,6 +794,43 @@
]
}
},
+ "/v1/eval/benchmarks/{benchmark_id}": {
+ "get": {
+ "responses": {
+ "200": {
+ "description": "OK",
+ "content": {
+ "application/json": {
+ "schema": {
+ "oneOf": [
+ {
+ "$ref": "#/components/schemas/Benchmark"
+ },
+ {
+ "type": "null"
+ }
+ ]
+ }
+ }
+ }
+ }
+ },
+ "tags": [
+ "Benchmarks"
+ ],
+ "description": "",
+ "parameters": [
+ {
+ "name": "benchmark_id",
+ "in": "path",
+ "required": true,
+ "schema": {
+ "type": "string"
+ }
+ }
+ ]
+ }
+ },
"/v1/datasets/{dataset_id}": {
"get": {
"responses": {
@@ -811,43 +889,6 @@
]
}
},
- "/v1/eval/tasks/{benchmark_id}": {
- "get": {
- "responses": {
- "200": {
- "description": "OK",
- "content": {
- "application/json": {
- "schema": {
- "oneOf": [
- {
- "$ref": "#/components/schemas/Benchmark"
- },
- {
- "type": "null"
- }
- ]
- }
- }
- }
- }
- },
- "tags": [
- "Benchmarks"
- ],
- "description": "",
- "parameters": [
- {
- "name": "benchmark_id",
- "in": "path",
- "required": true,
- "schema": {
- "type": "string"
- }
- }
- ]
- }
- },
"/v1/models/{model_id}": {
"get": {
"responses": {
@@ -1431,7 +1472,7 @@
}
}
},
- "/v1/eval/tasks/{benchmark_id}/jobs/{job_id}": {
+ "/v1/eval/benchmarks/{benchmark_id}/jobs/{job_id}": {
"get": {
"responses": {
"200": {
@@ -1505,7 +1546,83 @@
]
}
},
- "/v1/eval/tasks/{benchmark_id}/jobs/{job_id}/result": {
+ "/v1/eval/tasks/{task_id}/jobs/{job_id}": {
+ "get": {
+ "responses": {
+ "200": {
+ "description": "OK",
+ "content": {
+ "application/json": {
+ "schema": {
+ "oneOf": [
+ {
+ "$ref": "#/components/schemas/JobStatus"
+ },
+ {
+ "type": "null"
+ }
+ ]
+ }
+ }
+ }
+ }
+ },
+ "tags": [
+ "Eval"
+ ],
+ "description": "",
+ "parameters": [
+ {
+ "name": "task_id",
+ "in": "path",
+ "required": true,
+ "schema": {
+ "type": "string"
+ }
+ },
+ {
+ "name": "job_id",
+ "in": "path",
+ "required": true,
+ "schema": {
+ "type": "string"
+ }
+ }
+ ],
+ "deprecated": true
+ },
+ "delete": {
+ "responses": {
+ "200": {
+ "description": "OK"
+ }
+ },
+ "tags": [
+ "Eval"
+ ],
+ "description": "",
+ "parameters": [
+ {
+ "name": "task_id",
+ "in": "path",
+ "required": true,
+ "schema": {
+ "type": "string"
+ }
+ },
+ {
+ "name": "job_id",
+ "in": "path",
+ "required": true,
+ "schema": {
+ "type": "string"
+ }
+ }
+ ],
+ "deprecated": true
+ }
+ },
+ "/v1/eval/benchmarks/{benchmark_id}/jobs/{job_id}/result": {
"get": {
"responses": {
"200": {
@@ -1525,7 +1642,7 @@
"description": "",
"parameters": [
{
- "name": "job_id",
+ "name": "benchmark_id",
"in": "path",
"required": true,
"schema": {
@@ -1533,7 +1650,7 @@
}
},
{
- "name": "benchmark_id",
+ "name": "job_id",
"in": "path",
"required": true,
"schema": {
@@ -1543,6 +1660,88 @@
]
}
},
+ "/v1/eval/tasks/{task_id}/jobs/{job_id}/result": {
+ "get": {
+ "responses": {
+ "200": {
+ "description": "OK",
+ "content": {
+ "application/json": {
+ "schema": {
+ "$ref": "#/components/schemas/EvaluateResponse"
+ }
+ }
+ }
+ }
+ },
+ "tags": [
+ "Eval"
+ ],
+ "description": "",
+ "parameters": [
+ {
+ "name": "task_id",
+ "in": "path",
+ "required": true,
+ "schema": {
+ "type": "string"
+ }
+ },
+ {
+ "name": "job_id",
+ "in": "path",
+ "required": true,
+ "schema": {
+ "type": "string"
+ }
+ }
+ ],
+ "deprecated": true
+ }
+ },
+ "/v1/eval/benchmarks": {
+ "get": {
+ "responses": {
+ "200": {
+ "description": "OK",
+ "content": {
+ "application/json": {
+ "schema": {
+ "$ref": "#/components/schemas/ListBenchmarksResponse"
+ }
+ }
+ }
+ }
+ },
+ "tags": [
+ "Benchmarks"
+ ],
+ "description": "",
+ "parameters": []
+ },
+ "post": {
+ "responses": {
+ "200": {
+ "description": "OK"
+ }
+ },
+ "tags": [
+ "Benchmarks"
+ ],
+ "description": "",
+ "parameters": [],
+ "requestBody": {
+ "content": {
+ "application/json": {
+ "schema": {
+ "$ref": "#/components/schemas/RegisterBenchmarkRequest"
+ }
+ }
+ },
+ "required": true
+ }
+ }
+ },
"/v1/datasets": {
"get": {
"responses": {
@@ -1586,49 +1785,6 @@
}
}
},
- "/v1/eval/tasks": {
- "get": {
- "responses": {
- "200": {
- "description": "OK",
- "content": {
- "application/json": {
- "schema": {
- "$ref": "#/components/schemas/ListBenchmarksResponse"
- }
- }
- }
- }
- },
- "tags": [
- "Benchmarks"
- ],
- "description": "",
- "parameters": []
- },
- "post": {
- "responses": {
- "200": {
- "description": "OK"
- }
- },
- "tags": [
- "Benchmarks"
- ],
- "description": "",
- "parameters": [],
- "requestBody": {
- "content": {
- "application/json": {
- "schema": {
- "$ref": "#/components/schemas/RegisterBenchmarkRequest"
- }
- }
- },
- "required": true
- }
- }
- },
"/v1/models": {
"get": {
"responses": {
@@ -2204,7 +2360,7 @@
]
}
},
- "/v1/eval/tasks/{benchmark_id}/jobs": {
+ "/v1/eval/benchmarks/{benchmark_id}/jobs": {
"post": {
"responses": {
"200": {
@@ -2244,6 +2400,47 @@
}
}
},
+ "/v1/eval/tasks/{task_id}/jobs": {
+ "post": {
+ "responses": {
+ "200": {
+ "description": "OK",
+ "content": {
+ "application/json": {
+ "schema": {
+ "$ref": "#/components/schemas/Job"
+ }
+ }
+ }
+ }
+ },
+ "tags": [
+ "Eval"
+ ],
+ "description": "",
+ "parameters": [
+ {
+ "name": "task_id",
+ "in": "path",
+ "required": true,
+ "schema": {
+ "type": "string"
+ }
+ }
+ ],
+ "requestBody": {
+ "content": {
+ "application/json": {
+ "schema": {
+ "$ref": "#/components/schemas/RunEvalDeprecatedRequest"
+ }
+ }
+ },
+ "required": true
+ },
+ "deprecated": true
+ }
+ },
"/v1/safety/run-shield": {
"post": {
"responses": {
@@ -2526,10 +2723,10 @@
"data"
]
},
- "DeprecatedRegisterBenchmarkRequest": {
+ "DeprecatedRegisterEvalTaskRequest": {
"type": "object",
"properties": {
- "benchmark_id": {
+ "task_id": {
"type": "string"
},
"dataset_id": {
@@ -2575,7 +2772,7 @@
},
"additionalProperties": false,
"required": [
- "benchmark_id",
+ "task_id",
"dataset_id",
"scoring_functions"
]
@@ -4745,34 +4942,6 @@
"accuracy"
]
},
- "AppBenchmarkConfig": {
- "type": "object",
- "properties": {
- "type": {
- "type": "string",
- "const": "app",
- "default": "app"
- },
- "eval_candidate": {
- "$ref": "#/components/schemas/EvalCandidate"
- },
- "scoring_params": {
- "type": "object",
- "additionalProperties": {
- "$ref": "#/components/schemas/ScoringFnParams"
- }
- },
- "num_examples": {
- "type": "integer"
- }
- },
- "additionalProperties": false,
- "required": [
- "type",
- "eval_candidate",
- "scoring_params"
- ]
- },
"BasicScoringFnParams": {
"type": "object",
"properties": {
@@ -4793,25 +4962,26 @@
"type"
]
},
- "BenchmarkBenchmarkConfig": {
+ "BenchmarkConfig": {
"type": "object",
"properties": {
- "type": {
- "type": "string",
- "const": "benchmark",
- "default": "benchmark"
- },
"eval_candidate": {
"$ref": "#/components/schemas/EvalCandidate"
},
+ "scoring_params": {
+ "type": "object",
+ "additionalProperties": {
+ "$ref": "#/components/schemas/ScoringFnParams"
+ }
+ },
"num_examples": {
"type": "integer"
}
},
"additionalProperties": false,
"required": [
- "type",
- "eval_candidate"
+ "eval_candidate",
+ "scoring_params"
]
},
"EvalCandidate": {
@@ -4831,23 +5001,6 @@
}
}
},
- "BenchmarkConfig": {
- "oneOf": [
- {
- "$ref": "#/components/schemas/BenchmarkBenchmarkConfig"
- },
- {
- "$ref": "#/components/schemas/AppBenchmarkConfig"
- }
- ],
- "discriminator": {
- "propertyName": "type",
- "mapping": {
- "benchmark": "#/components/schemas/BenchmarkBenchmarkConfig",
- "app": "#/components/schemas/AppBenchmarkConfig"
- }
- }
- },
"LLMAsJudgeScoringFnParams": {
"type": "object",
"properties": {
@@ -5108,6 +5261,54 @@
"aggregated_results"
]
},
+ "EvaluateRowsDeprecatedRequest": {
+ "type": "object",
+ "properties": {
+ "input_rows": {
+ "type": "array",
+ "items": {
+ "type": "object",
+ "additionalProperties": {
+ "oneOf": [
+ {
+ "type": "null"
+ },
+ {
+ "type": "boolean"
+ },
+ {
+ "type": "number"
+ },
+ {
+ "type": "string"
+ },
+ {
+ "type": "array"
+ },
+ {
+ "type": "object"
+ }
+ ]
+ }
+ }
+ },
+ "scoring_functions": {
+ "type": "array",
+ "items": {
+ "type": "string"
+ }
+ },
+ "task_config": {
+ "$ref": "#/components/schemas/BenchmarkConfig"
+ }
+ },
+ "additionalProperties": false,
+ "required": [
+ "input_rows",
+ "scoring_functions",
+ "task_config"
+ ]
+ },
"Session": {
"type": "object",
"properties": {
@@ -7304,60 +7505,6 @@
"data"
]
},
- "RegisterDatasetRequest": {
- "type": "object",
- "properties": {
- "dataset_id": {
- "type": "string"
- },
- "dataset_schema": {
- "type": "object",
- "additionalProperties": {
- "$ref": "#/components/schemas/ParamType"
- }
- },
- "url": {
- "$ref": "#/components/schemas/URL"
- },
- "provider_dataset_id": {
- "type": "string"
- },
- "provider_id": {
- "type": "string"
- },
- "metadata": {
- "type": "object",
- "additionalProperties": {
- "oneOf": [
- {
- "type": "null"
- },
- {
- "type": "boolean"
- },
- {
- "type": "number"
- },
- {
- "type": "string"
- },
- {
- "type": "array"
- },
- {
- "type": "object"
- }
- ]
- }
- }
- },
- "additionalProperties": false,
- "required": [
- "dataset_id",
- "dataset_schema",
- "url"
- ]
- },
"RegisterBenchmarkRequest": {
"type": "object",
"properties": {
@@ -7412,6 +7559,60 @@
"scoring_functions"
]
},
+ "RegisterDatasetRequest": {
+ "type": "object",
+ "properties": {
+ "dataset_id": {
+ "type": "string"
+ },
+ "dataset_schema": {
+ "type": "object",
+ "additionalProperties": {
+ "$ref": "#/components/schemas/ParamType"
+ }
+ },
+ "url": {
+ "$ref": "#/components/schemas/URL"
+ },
+ "provider_dataset_id": {
+ "type": "string"
+ },
+ "provider_id": {
+ "type": "string"
+ },
+ "metadata": {
+ "type": "object",
+ "additionalProperties": {
+ "oneOf": [
+ {
+ "type": "null"
+ },
+ {
+ "type": "boolean"
+ },
+ {
+ "type": "number"
+ },
+ {
+ "type": "string"
+ },
+ {
+ "type": "array"
+ },
+ {
+ "type": "object"
+ }
+ ]
+ }
+ }
+ },
+ "additionalProperties": false,
+ "required": [
+ "dataset_id",
+ "dataset_schema",
+ "url"
+ ]
+ },
"RegisterModelRequest": {
"type": "object",
"properties": {
@@ -7623,6 +7824,18 @@
"job_id"
]
},
+ "RunEvalDeprecatedRequest": {
+ "type": "object",
+ "properties": {
+ "task_config": {
+ "$ref": "#/components/schemas/BenchmarkConfig"
+ }
+ },
+ "additionalProperties": false,
+ "required": [
+ "task_config"
+ ]
+ },
"RunShieldRequest": {
"type": "object",
"properties": {
@@ -8105,6 +8318,9 @@
{
"name": "BatchInference (Coming Soon)"
},
+ {
+ "name": "Benchmarks"
+ },
{
"name": "DatasetIO"
},
@@ -8114,9 +8330,6 @@
{
"name": "Eval"
},
- {
- "name": "Benchmarks"
- },
{
"name": "Inference",
"description": "This API provides the raw interface to the underlying models. Two kinds of models are supported:\n- LLM models: these models generate \"raw\" and \"chat\" (conversational) completions.\n- Embedding models: these models generate embeddings to be used for semantic search.",
@@ -8168,10 +8381,10 @@
"tags": [
"Agents",
"BatchInference (Coming Soon)",
+ "Benchmarks",
"DatasetIO",
"Datasets",
"Eval",
- "Benchmarks",
"Inference",
"Inspect",
"Models",
diff --git a/docs/_static/llama-stack-spec.yaml b/docs/_static/llama-stack-spec.yaml
index 0f0a613a8..89e066917 100644
--- a/docs/_static/llama-stack-spec.yaml
+++ b/docs/_static/llama-stack-spec.yaml
@@ -25,8 +25,8 @@ paths:
- Benchmarks
description: ''
parameters:
- - name: benchmark_id
- in: path
+ - name: task_id
+ in: query
required: true
schema:
type: string
@@ -57,7 +57,7 @@ paths:
content:
application/json:
schema:
- $ref: '#/components/schemas/DeprecatedRegisterBenchmarkRequest'
+ $ref: '#/components/schemas/DeprecatedRegisterEvalTaskRequest'
required: true
deprecated: true
/v1/datasetio/rows:
@@ -372,7 +372,7 @@ paths:
schema:
$ref: '#/components/schemas/EmbeddingsRequest'
required: true
- /v1/eval/tasks/{benchmark_id}/evaluations:
+ /v1/eval/benchmarks/{benchmark_id}/evaluations:
post:
responses:
'200':
@@ -396,6 +396,31 @@ paths:
schema:
$ref: '#/components/schemas/EvaluateRowsRequest'
required: true
+ /v1/eval/tasks/{task_id}/evaluations:
+ post:
+ responses:
+ '200':
+ description: OK
+ content:
+ application/json:
+ schema:
+ $ref: '#/components/schemas/EvaluateResponse'
+ tags:
+ - Eval
+ description: ''
+ parameters:
+ - name: task_id
+ in: path
+ required: true
+ schema:
+ type: string
+ requestBody:
+ content:
+ application/json:
+ schema:
+ $ref: '#/components/schemas/EvaluateRowsDeprecatedRequest'
+ required: true
+ deprecated: true
/v1/agents/{agent_id}/session/{session_id}/turn/{turn_id}/step/{step_id}:
get:
responses:
@@ -457,6 +482,26 @@ paths:
required: true
schema:
type: string
+ /v1/eval/benchmarks/{benchmark_id}:
+ get:
+ responses:
+ '200':
+ description: OK
+ content:
+ application/json:
+ schema:
+ oneOf:
+ - $ref: '#/components/schemas/Benchmark'
+ - type: 'null'
+ tags:
+ - Benchmarks
+ description: ''
+ parameters:
+ - name: benchmark_id
+ in: path
+ required: true
+ schema:
+ type: string
/v1/datasets/{dataset_id}:
get:
responses:
@@ -490,26 +535,6 @@ paths:
required: true
schema:
type: string
- /v1/eval/tasks/{benchmark_id}:
- get:
- responses:
- '200':
- description: OK
- content:
- application/json:
- schema:
- oneOf:
- - $ref: '#/components/schemas/Benchmark'
- - type: 'null'
- tags:
- - Benchmarks
- description: ''
- parameters:
- - name: benchmark_id
- in: path
- required: true
- schema:
- type: string
/v1/models/{model_id}:
get:
responses:
@@ -852,7 +877,7 @@ paths:
schema:
$ref: '#/components/schemas/InvokeToolRequest'
required: true
- /v1/eval/tasks/{benchmark_id}/jobs/{job_id}:
+ /v1/eval/benchmarks/{benchmark_id}/jobs/{job_id}:
get:
responses:
'200':
@@ -895,7 +920,52 @@ paths:
required: true
schema:
type: string
- /v1/eval/tasks/{benchmark_id}/jobs/{job_id}/result:
+ /v1/eval/tasks/{task_id}/jobs/{job_id}:
+ get:
+ responses:
+ '200':
+ description: OK
+ content:
+ application/json:
+ schema:
+ oneOf:
+ - $ref: '#/components/schemas/JobStatus'
+ - type: 'null'
+ tags:
+ - Eval
+ description: ''
+ parameters:
+ - name: task_id
+ in: path
+ required: true
+ schema:
+ type: string
+ - name: job_id
+ in: path
+ required: true
+ schema:
+ type: string
+ deprecated: true
+ delete:
+ responses:
+ '200':
+ description: OK
+ tags:
+ - Eval
+ description: ''
+ parameters:
+ - name: task_id
+ in: path
+ required: true
+ schema:
+ type: string
+ - name: job_id
+ in: path
+ required: true
+ schema:
+ type: string
+ deprecated: true
+ /v1/eval/benchmarks/{benchmark_id}/jobs/{job_id}/result:
get:
responses:
'200':
@@ -908,16 +978,67 @@ paths:
- Eval
description: ''
parameters:
- - name: job_id
- in: path
- required: true
- schema:
- type: string
- name: benchmark_id
in: path
required: true
schema:
type: string
+ - name: job_id
+ in: path
+ required: true
+ schema:
+ type: string
+ /v1/eval/tasks/{task_id}/jobs/{job_id}/result:
+ get:
+ responses:
+ '200':
+ description: OK
+ content:
+ application/json:
+ schema:
+ $ref: '#/components/schemas/EvaluateResponse'
+ tags:
+ - Eval
+ description: ''
+ parameters:
+ - name: task_id
+ in: path
+ required: true
+ schema:
+ type: string
+ - name: job_id
+ in: path
+ required: true
+ schema:
+ type: string
+ deprecated: true
+ /v1/eval/benchmarks:
+ get:
+ responses:
+ '200':
+ description: OK
+ content:
+ application/json:
+ schema:
+ $ref: '#/components/schemas/ListBenchmarksResponse'
+ tags:
+ - Benchmarks
+ description: ''
+ parameters: []
+ post:
+ responses:
+ '200':
+ description: OK
+ tags:
+ - Benchmarks
+ description: ''
+ parameters: []
+ requestBody:
+ content:
+ application/json:
+ schema:
+ $ref: '#/components/schemas/RegisterBenchmarkRequest'
+ required: true
/v1/datasets:
get:
responses:
@@ -945,33 +1066,6 @@ paths:
schema:
$ref: '#/components/schemas/RegisterDatasetRequest'
required: true
- /v1/eval/tasks:
- get:
- responses:
- '200':
- description: OK
- content:
- application/json:
- schema:
- $ref: '#/components/schemas/ListBenchmarksResponse'
- tags:
- - Benchmarks
- description: ''
- parameters: []
- post:
- responses:
- '200':
- description: OK
- tags:
- - Benchmarks
- description: ''
- parameters: []
- requestBody:
- content:
- application/json:
- schema:
- $ref: '#/components/schemas/RegisterBenchmarkRequest'
- required: true
/v1/models:
get:
responses:
@@ -1328,7 +1422,7 @@ paths:
type: array
items:
type: string
- /v1/eval/tasks/{benchmark_id}/jobs:
+ /v1/eval/benchmarks/{benchmark_id}/jobs:
post:
responses:
'200':
@@ -1352,6 +1446,31 @@ paths:
schema:
$ref: '#/components/schemas/RunEvalRequest'
required: true
+ /v1/eval/tasks/{task_id}/jobs:
+ post:
+ responses:
+ '200':
+ description: OK
+ content:
+ application/json:
+ schema:
+ $ref: '#/components/schemas/Job'
+ tags:
+ - Eval
+ description: ''
+ parameters:
+ - name: task_id
+ in: path
+ required: true
+ schema:
+ type: string
+ requestBody:
+ content:
+ application/json:
+ schema:
+ $ref: '#/components/schemas/RunEvalDeprecatedRequest'
+ required: true
+ deprecated: true
/v1/safety/run-shield:
post:
responses:
@@ -1527,10 +1646,10 @@ components:
additionalProperties: false
required:
- data
- DeprecatedRegisterBenchmarkRequest:
+ DeprecatedRegisterEvalTaskRequest:
type: object
properties:
- benchmark_id:
+ task_id:
type: string
dataset_id:
type: string
@@ -1554,7 +1673,7 @@ components:
- type: object
additionalProperties: false
required:
- - benchmark_id
+ - task_id
- dataset_id
- scoring_functions
AppendRowsRequest:
@@ -3063,26 +3182,6 @@ components:
- median
- categorical_count
- accuracy
- AppBenchmarkConfig:
- type: object
- properties:
- type:
- type: string
- const: app
- default: app
- eval_candidate:
- $ref: '#/components/schemas/EvalCandidate'
- scoring_params:
- type: object
- additionalProperties:
- $ref: '#/components/schemas/ScoringFnParams'
- num_examples:
- type: integer
- additionalProperties: false
- required:
- - type
- - eval_candidate
- - scoring_params
BasicScoringFnParams:
type: object
properties:
@@ -3097,21 +3196,21 @@ components:
additionalProperties: false
required:
- type
- BenchmarkBenchmarkConfig:
+ BenchmarkConfig:
type: object
properties:
- type:
- type: string
- const: benchmark
- default: benchmark
eval_candidate:
$ref: '#/components/schemas/EvalCandidate'
+ scoring_params:
+ type: object
+ additionalProperties:
+ $ref: '#/components/schemas/ScoringFnParams'
num_examples:
type: integer
additionalProperties: false
required:
- - type
- eval_candidate
+ - scoring_params
EvalCandidate:
oneOf:
- $ref: '#/components/schemas/ModelCandidate'
@@ -3121,15 +3220,6 @@ components:
mapping:
model: '#/components/schemas/ModelCandidate'
agent: '#/components/schemas/AgentCandidate'
- BenchmarkConfig:
- oneOf:
- - $ref: '#/components/schemas/BenchmarkBenchmarkConfig'
- - $ref: '#/components/schemas/AppBenchmarkConfig'
- discriminator:
- propertyName: type
- mapping:
- benchmark: '#/components/schemas/BenchmarkBenchmarkConfig'
- app: '#/components/schemas/AppBenchmarkConfig'
LLMAsJudgeScoringFnParams:
type: object
properties:
@@ -3278,6 +3368,32 @@ components:
required:
- score_rows
- aggregated_results
+ EvaluateRowsDeprecatedRequest:
+ type: object
+ properties:
+ input_rows:
+ type: array
+ items:
+ type: object
+ additionalProperties:
+ oneOf:
+ - type: 'null'
+ - type: boolean
+ - type: number
+ - type: string
+ - type: array
+ - type: object
+ scoring_functions:
+ type: array
+ items:
+ type: string
+ task_config:
+ $ref: '#/components/schemas/BenchmarkConfig'
+ additionalProperties: false
+ required:
+ - input_rows
+ - scoring_functions
+ - task_config
Session:
type: object
properties:
@@ -4645,36 +4761,6 @@ components:
additionalProperties: false
required:
- data
- RegisterDatasetRequest:
- type: object
- properties:
- dataset_id:
- type: string
- dataset_schema:
- type: object
- additionalProperties:
- $ref: '#/components/schemas/ParamType'
- url:
- $ref: '#/components/schemas/URL'
- provider_dataset_id:
- type: string
- provider_id:
- type: string
- metadata:
- type: object
- additionalProperties:
- oneOf:
- - type: 'null'
- - type: boolean
- - type: number
- - type: string
- - type: array
- - type: object
- additionalProperties: false
- required:
- - dataset_id
- - dataset_schema
- - url
RegisterBenchmarkRequest:
type: object
properties:
@@ -4705,6 +4791,36 @@ components:
- benchmark_id
- dataset_id
- scoring_functions
+ RegisterDatasetRequest:
+ type: object
+ properties:
+ dataset_id:
+ type: string
+ dataset_schema:
+ type: object
+ additionalProperties:
+ $ref: '#/components/schemas/ParamType'
+ url:
+ $ref: '#/components/schemas/URL'
+ provider_dataset_id:
+ type: string
+ provider_id:
+ type: string
+ metadata:
+ type: object
+ additionalProperties:
+ oneOf:
+ - type: 'null'
+ - type: boolean
+ - type: number
+ - type: string
+ - type: array
+ - type: object
+ additionalProperties: false
+ required:
+ - dataset_id
+ - dataset_schema
+ - url
RegisterModelRequest:
type: object
properties:
@@ -4827,6 +4943,14 @@ components:
additionalProperties: false
required:
- job_id
+ RunEvalDeprecatedRequest:
+ type: object
+ properties:
+ task_config:
+ $ref: '#/components/schemas/BenchmarkConfig'
+ additionalProperties: false
+ required:
+ - task_config
RunShieldRequest:
type: object
properties:
@@ -5125,10 +5249,10 @@ tags:
x-displayName: >-
Agents API for creating and interacting with agentic systems.
- name: BatchInference (Coming Soon)
+ - name: Benchmarks
- name: DatasetIO
- name: Datasets
- name: Eval
- - name: Benchmarks
- name: Inference
description: >-
This API provides the raw interface to the underlying models. Two kinds of models
@@ -5159,10 +5283,10 @@ x-tagGroups:
tags:
- Agents
- BatchInference (Coming Soon)
+ - Benchmarks
- DatasetIO
- Datasets
- Eval
- - Benchmarks
- Inference
- Inspect
- Models
diff --git a/llama_stack/apis/eval/eval.py b/llama_stack/apis/eval/eval.py
index 90b14131f..b805e4976 100644
--- a/llama_stack/apis/eval/eval.py
+++ b/llama_stack/apis/eval/eval.py
@@ -83,3 +83,28 @@ class Eval(Protocol):
@webmethod(route="/eval/benchmarks/{benchmark_id}/jobs/{job_id}/result", method="GET")
async def job_result(self, benchmark_id: str, job_id: str) -> EvaluateResponse: ...
+
+ @webmethod(route="/eval/tasks/{task_id}/jobs", method="POST")
+ async def run_eval_DEPRECATED(
+ self,
+ task_id: str,
+ task_config: BenchmarkConfig,
+ ) -> Job: ...
+
+ @webmethod(route="/eval/tasks/{task_id}/evaluations", method="POST")
+ async def evaluate_rows_DEPRECATED(
+ self,
+ task_id: str,
+ input_rows: List[Dict[str, Any]],
+ scoring_functions: List[str],
+ task_config: BenchmarkConfig,
+ ) -> EvaluateResponse: ...
+
+ @webmethod(route="/eval/tasks/{task_id}/jobs/{job_id}", method="GET")
+ async def job_status_DEPRECATED(self, task_id: str, job_id: str) -> Optional[JobStatus]: ...
+
+ @webmethod(route="/eval/tasks/{task_id}/jobs/{job_id}", method="DELETE")
+ async def job_cancel_DEPRECATED(self, task_id: str, job_id: str) -> None: ...
+
+ @webmethod(route="/eval/tasks/{task_id}/jobs/{job_id}/result", method="GET")
+ async def job_result_DEPRECATED(self, task_id: str, job_id: str) -> EvaluateResponse: ...
diff --git a/llama_stack/distribution/routers/routers.py b/llama_stack/distribution/routers/routers.py
index f9f306767..9945ad367 100644
--- a/llama_stack/distribution/routers/routers.py
+++ b/llama_stack/distribution/routers/routers.py
@@ -9,7 +9,6 @@ from typing import Any, AsyncGenerator, Dict, List, Optional
from llama_stack.apis.common.content_types import InterleavedContent, URL
from llama_stack.apis.datasetio import DatasetIO, PaginatedRowsResult
from llama_stack.apis.eval import (
- AppBenchmarkConfig,
BenchmarkConfig,
Eval,
EvaluateResponse,
@@ -348,7 +347,7 @@ class EvalRouter(Eval):
async def run_eval(
self,
benchmark_id: str,
- task_config: AppBenchmarkConfig,
+ task_config: BenchmarkConfig,
) -> Job:
return await self.routing_table.get_provider_impl(benchmark_id).run_eval(
benchmark_id=benchmark_id,