This commit is contained in:
Xi Yan 2025-02-12 20:48:05 -08:00
parent e07776fff6
commit ec721b3867
4 changed files with 695 additions and 334 deletions

View file

@ -67,8 +67,8 @@
"description": "", "description": "",
"parameters": [ "parameters": [
{ {
"name": "benchmark_id", "name": "task_id",
"in": "path", "in": "query",
"required": true, "required": true,
"schema": { "schema": {
"type": "string" "type": "string"
@ -114,7 +114,7 @@
"content": { "content": {
"application/json": { "application/json": {
"schema": { "schema": {
"$ref": "#/components/schemas/DeprecatedRegisterBenchmarkRequest" "$ref": "#/components/schemas/DeprecatedRegisterEvalTaskRequest"
} }
} }
}, },
@ -613,7 +613,7 @@
} }
} }
}, },
"/v1/eval/tasks/{benchmark_id}/evaluations": { "/v1/eval/benchmarks/{benchmark_id}/evaluations": {
"post": { "post": {
"responses": { "responses": {
"200": { "200": {
@ -653,6 +653,47 @@
} }
} }
}, },
"/v1/eval/tasks/{task_id}/evaluations": {
"post": {
"responses": {
"200": {
"description": "OK",
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/EvaluateResponse"
}
}
}
}
},
"tags": [
"Eval"
],
"description": "",
"parameters": [
{
"name": "task_id",
"in": "path",
"required": true,
"schema": {
"type": "string"
}
}
],
"requestBody": {
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/EvaluateRowsDeprecatedRequest"
}
}
},
"required": true
},
"deprecated": true
}
},
"/v1/agents/{agent_id}/session/{session_id}/turn/{turn_id}/step/{step_id}": { "/v1/agents/{agent_id}/session/{session_id}/turn/{turn_id}/step/{step_id}": {
"get": { "get": {
"responses": { "responses": {
@ -753,6 +794,43 @@
] ]
} }
}, },
"/v1/eval/benchmarks/{benchmark_id}": {
"get": {
"responses": {
"200": {
"description": "OK",
"content": {
"application/json": {
"schema": {
"oneOf": [
{
"$ref": "#/components/schemas/Benchmark"
},
{
"type": "null"
}
]
}
}
}
}
},
"tags": [
"Benchmarks"
],
"description": "",
"parameters": [
{
"name": "benchmark_id",
"in": "path",
"required": true,
"schema": {
"type": "string"
}
}
]
}
},
"/v1/datasets/{dataset_id}": { "/v1/datasets/{dataset_id}": {
"get": { "get": {
"responses": { "responses": {
@ -811,43 +889,6 @@
] ]
} }
}, },
"/v1/eval/tasks/{benchmark_id}": {
"get": {
"responses": {
"200": {
"description": "OK",
"content": {
"application/json": {
"schema": {
"oneOf": [
{
"$ref": "#/components/schemas/Benchmark"
},
{
"type": "null"
}
]
}
}
}
}
},
"tags": [
"Benchmarks"
],
"description": "",
"parameters": [
{
"name": "benchmark_id",
"in": "path",
"required": true,
"schema": {
"type": "string"
}
}
]
}
},
"/v1/models/{model_id}": { "/v1/models/{model_id}": {
"get": { "get": {
"responses": { "responses": {
@ -1431,7 +1472,7 @@
} }
} }
}, },
"/v1/eval/tasks/{benchmark_id}/jobs/{job_id}": { "/v1/eval/benchmarks/{benchmark_id}/jobs/{job_id}": {
"get": { "get": {
"responses": { "responses": {
"200": { "200": {
@ -1505,7 +1546,83 @@
] ]
} }
}, },
"/v1/eval/tasks/{benchmark_id}/jobs/{job_id}/result": { "/v1/eval/tasks/{task_id}/jobs/{job_id}": {
"get": {
"responses": {
"200": {
"description": "OK",
"content": {
"application/json": {
"schema": {
"oneOf": [
{
"$ref": "#/components/schemas/JobStatus"
},
{
"type": "null"
}
]
}
}
}
}
},
"tags": [
"Eval"
],
"description": "",
"parameters": [
{
"name": "task_id",
"in": "path",
"required": true,
"schema": {
"type": "string"
}
},
{
"name": "job_id",
"in": "path",
"required": true,
"schema": {
"type": "string"
}
}
],
"deprecated": true
},
"delete": {
"responses": {
"200": {
"description": "OK"
}
},
"tags": [
"Eval"
],
"description": "",
"parameters": [
{
"name": "task_id",
"in": "path",
"required": true,
"schema": {
"type": "string"
}
},
{
"name": "job_id",
"in": "path",
"required": true,
"schema": {
"type": "string"
}
}
],
"deprecated": true
}
},
"/v1/eval/benchmarks/{benchmark_id}/jobs/{job_id}/result": {
"get": { "get": {
"responses": { "responses": {
"200": { "200": {
@ -1525,7 +1642,7 @@
"description": "", "description": "",
"parameters": [ "parameters": [
{ {
"name": "job_id", "name": "benchmark_id",
"in": "path", "in": "path",
"required": true, "required": true,
"schema": { "schema": {
@ -1533,7 +1650,7 @@
} }
}, },
{ {
"name": "benchmark_id", "name": "job_id",
"in": "path", "in": "path",
"required": true, "required": true,
"schema": { "schema": {
@ -1543,6 +1660,88 @@
] ]
} }
}, },
"/v1/eval/tasks/{task_id}/jobs/{job_id}/result": {
"get": {
"responses": {
"200": {
"description": "OK",
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/EvaluateResponse"
}
}
}
}
},
"tags": [
"Eval"
],
"description": "",
"parameters": [
{
"name": "task_id",
"in": "path",
"required": true,
"schema": {
"type": "string"
}
},
{
"name": "job_id",
"in": "path",
"required": true,
"schema": {
"type": "string"
}
}
],
"deprecated": true
}
},
"/v1/eval/benchmarks": {
"get": {
"responses": {
"200": {
"description": "OK",
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/ListBenchmarksResponse"
}
}
}
}
},
"tags": [
"Benchmarks"
],
"description": "",
"parameters": []
},
"post": {
"responses": {
"200": {
"description": "OK"
}
},
"tags": [
"Benchmarks"
],
"description": "",
"parameters": [],
"requestBody": {
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/RegisterBenchmarkRequest"
}
}
},
"required": true
}
}
},
"/v1/datasets": { "/v1/datasets": {
"get": { "get": {
"responses": { "responses": {
@ -1586,49 +1785,6 @@
} }
} }
}, },
"/v1/eval/tasks": {
"get": {
"responses": {
"200": {
"description": "OK",
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/ListBenchmarksResponse"
}
}
}
}
},
"tags": [
"Benchmarks"
],
"description": "",
"parameters": []
},
"post": {
"responses": {
"200": {
"description": "OK"
}
},
"tags": [
"Benchmarks"
],
"description": "",
"parameters": [],
"requestBody": {
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/RegisterBenchmarkRequest"
}
}
},
"required": true
}
}
},
"/v1/models": { "/v1/models": {
"get": { "get": {
"responses": { "responses": {
@ -2204,7 +2360,7 @@
] ]
} }
}, },
"/v1/eval/tasks/{benchmark_id}/jobs": { "/v1/eval/benchmarks/{benchmark_id}/jobs": {
"post": { "post": {
"responses": { "responses": {
"200": { "200": {
@ -2244,6 +2400,47 @@
} }
} }
}, },
"/v1/eval/tasks/{task_id}/jobs": {
"post": {
"responses": {
"200": {
"description": "OK",
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/Job"
}
}
}
}
},
"tags": [
"Eval"
],
"description": "",
"parameters": [
{
"name": "task_id",
"in": "path",
"required": true,
"schema": {
"type": "string"
}
}
],
"requestBody": {
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/RunEvalDeprecatedRequest"
}
}
},
"required": true
},
"deprecated": true
}
},
"/v1/safety/run-shield": { "/v1/safety/run-shield": {
"post": { "post": {
"responses": { "responses": {
@ -2526,10 +2723,10 @@
"data" "data"
] ]
}, },
"DeprecatedRegisterBenchmarkRequest": { "DeprecatedRegisterEvalTaskRequest": {
"type": "object", "type": "object",
"properties": { "properties": {
"benchmark_id": { "task_id": {
"type": "string" "type": "string"
}, },
"dataset_id": { "dataset_id": {
@ -2575,7 +2772,7 @@
}, },
"additionalProperties": false, "additionalProperties": false,
"required": [ "required": [
"benchmark_id", "task_id",
"dataset_id", "dataset_id",
"scoring_functions" "scoring_functions"
] ]
@ -4745,34 +4942,6 @@
"accuracy" "accuracy"
] ]
}, },
"AppBenchmarkConfig": {
"type": "object",
"properties": {
"type": {
"type": "string",
"const": "app",
"default": "app"
},
"eval_candidate": {
"$ref": "#/components/schemas/EvalCandidate"
},
"scoring_params": {
"type": "object",
"additionalProperties": {
"$ref": "#/components/schemas/ScoringFnParams"
}
},
"num_examples": {
"type": "integer"
}
},
"additionalProperties": false,
"required": [
"type",
"eval_candidate",
"scoring_params"
]
},
"BasicScoringFnParams": { "BasicScoringFnParams": {
"type": "object", "type": "object",
"properties": { "properties": {
@ -4793,25 +4962,26 @@
"type" "type"
] ]
}, },
"BenchmarkBenchmarkConfig": { "BenchmarkConfig": {
"type": "object", "type": "object",
"properties": { "properties": {
"type": {
"type": "string",
"const": "benchmark",
"default": "benchmark"
},
"eval_candidate": { "eval_candidate": {
"$ref": "#/components/schemas/EvalCandidate" "$ref": "#/components/schemas/EvalCandidate"
}, },
"scoring_params": {
"type": "object",
"additionalProperties": {
"$ref": "#/components/schemas/ScoringFnParams"
}
},
"num_examples": { "num_examples": {
"type": "integer" "type": "integer"
} }
}, },
"additionalProperties": false, "additionalProperties": false,
"required": [ "required": [
"type", "eval_candidate",
"eval_candidate" "scoring_params"
] ]
}, },
"EvalCandidate": { "EvalCandidate": {
@ -4831,23 +5001,6 @@
} }
} }
}, },
"BenchmarkConfig": {
"oneOf": [
{
"$ref": "#/components/schemas/BenchmarkBenchmarkConfig"
},
{
"$ref": "#/components/schemas/AppBenchmarkConfig"
}
],
"discriminator": {
"propertyName": "type",
"mapping": {
"benchmark": "#/components/schemas/BenchmarkBenchmarkConfig",
"app": "#/components/schemas/AppBenchmarkConfig"
}
}
},
"LLMAsJudgeScoringFnParams": { "LLMAsJudgeScoringFnParams": {
"type": "object", "type": "object",
"properties": { "properties": {
@ -5108,6 +5261,54 @@
"aggregated_results" "aggregated_results"
] ]
}, },
"EvaluateRowsDeprecatedRequest": {
"type": "object",
"properties": {
"input_rows": {
"type": "array",
"items": {
"type": "object",
"additionalProperties": {
"oneOf": [
{
"type": "null"
},
{
"type": "boolean"
},
{
"type": "number"
},
{
"type": "string"
},
{
"type": "array"
},
{
"type": "object"
}
]
}
}
},
"scoring_functions": {
"type": "array",
"items": {
"type": "string"
}
},
"task_config": {
"$ref": "#/components/schemas/BenchmarkConfig"
}
},
"additionalProperties": false,
"required": [
"input_rows",
"scoring_functions",
"task_config"
]
},
"Session": { "Session": {
"type": "object", "type": "object",
"properties": { "properties": {
@ -7304,60 +7505,6 @@
"data" "data"
] ]
}, },
"RegisterDatasetRequest": {
"type": "object",
"properties": {
"dataset_id": {
"type": "string"
},
"dataset_schema": {
"type": "object",
"additionalProperties": {
"$ref": "#/components/schemas/ParamType"
}
},
"url": {
"$ref": "#/components/schemas/URL"
},
"provider_dataset_id": {
"type": "string"
},
"provider_id": {
"type": "string"
},
"metadata": {
"type": "object",
"additionalProperties": {
"oneOf": [
{
"type": "null"
},
{
"type": "boolean"
},
{
"type": "number"
},
{
"type": "string"
},
{
"type": "array"
},
{
"type": "object"
}
]
}
}
},
"additionalProperties": false,
"required": [
"dataset_id",
"dataset_schema",
"url"
]
},
"RegisterBenchmarkRequest": { "RegisterBenchmarkRequest": {
"type": "object", "type": "object",
"properties": { "properties": {
@ -7412,6 +7559,60 @@
"scoring_functions" "scoring_functions"
] ]
}, },
"RegisterDatasetRequest": {
"type": "object",
"properties": {
"dataset_id": {
"type": "string"
},
"dataset_schema": {
"type": "object",
"additionalProperties": {
"$ref": "#/components/schemas/ParamType"
}
},
"url": {
"$ref": "#/components/schemas/URL"
},
"provider_dataset_id": {
"type": "string"
},
"provider_id": {
"type": "string"
},
"metadata": {
"type": "object",
"additionalProperties": {
"oneOf": [
{
"type": "null"
},
{
"type": "boolean"
},
{
"type": "number"
},
{
"type": "string"
},
{
"type": "array"
},
{
"type": "object"
}
]
}
}
},
"additionalProperties": false,
"required": [
"dataset_id",
"dataset_schema",
"url"
]
},
"RegisterModelRequest": { "RegisterModelRequest": {
"type": "object", "type": "object",
"properties": { "properties": {
@ -7623,6 +7824,18 @@
"job_id" "job_id"
] ]
}, },
"RunEvalDeprecatedRequest": {
"type": "object",
"properties": {
"task_config": {
"$ref": "#/components/schemas/BenchmarkConfig"
}
},
"additionalProperties": false,
"required": [
"task_config"
]
},
"RunShieldRequest": { "RunShieldRequest": {
"type": "object", "type": "object",
"properties": { "properties": {
@ -8105,6 +8318,9 @@
{ {
"name": "BatchInference (Coming Soon)" "name": "BatchInference (Coming Soon)"
}, },
{
"name": "Benchmarks"
},
{ {
"name": "DatasetIO" "name": "DatasetIO"
}, },
@ -8114,9 +8330,6 @@
{ {
"name": "Eval" "name": "Eval"
}, },
{
"name": "Benchmarks"
},
{ {
"name": "Inference", "name": "Inference",
"description": "This API provides the raw interface to the underlying models. Two kinds of models are supported:\n- LLM models: these models generate \"raw\" and \"chat\" (conversational) completions.\n- Embedding models: these models generate embeddings to be used for semantic search.", "description": "This API provides the raw interface to the underlying models. Two kinds of models are supported:\n- LLM models: these models generate \"raw\" and \"chat\" (conversational) completions.\n- Embedding models: these models generate embeddings to be used for semantic search.",
@ -8168,10 +8381,10 @@
"tags": [ "tags": [
"Agents", "Agents",
"BatchInference (Coming Soon)", "BatchInference (Coming Soon)",
"Benchmarks",
"DatasetIO", "DatasetIO",
"Datasets", "Datasets",
"Eval", "Eval",
"Benchmarks",
"Inference", "Inference",
"Inspect", "Inspect",
"Models", "Models",

View file

@ -25,8 +25,8 @@ paths:
- Benchmarks - Benchmarks
description: '' description: ''
parameters: parameters:
- name: benchmark_id - name: task_id
in: path in: query
required: true required: true
schema: schema:
type: string type: string
@ -57,7 +57,7 @@ paths:
content: content:
application/json: application/json:
schema: schema:
$ref: '#/components/schemas/DeprecatedRegisterBenchmarkRequest' $ref: '#/components/schemas/DeprecatedRegisterEvalTaskRequest'
required: true required: true
deprecated: true deprecated: true
/v1/datasetio/rows: /v1/datasetio/rows:
@ -372,7 +372,7 @@ paths:
schema: schema:
$ref: '#/components/schemas/EmbeddingsRequest' $ref: '#/components/schemas/EmbeddingsRequest'
required: true required: true
/v1/eval/tasks/{benchmark_id}/evaluations: /v1/eval/benchmarks/{benchmark_id}/evaluations:
post: post:
responses: responses:
'200': '200':
@ -396,6 +396,31 @@ paths:
schema: schema:
$ref: '#/components/schemas/EvaluateRowsRequest' $ref: '#/components/schemas/EvaluateRowsRequest'
required: true required: true
/v1/eval/tasks/{task_id}/evaluations:
post:
responses:
'200':
description: OK
content:
application/json:
schema:
$ref: '#/components/schemas/EvaluateResponse'
tags:
- Eval
description: ''
parameters:
- name: task_id
in: path
required: true
schema:
type: string
requestBody:
content:
application/json:
schema:
$ref: '#/components/schemas/EvaluateRowsDeprecatedRequest'
required: true
deprecated: true
/v1/agents/{agent_id}/session/{session_id}/turn/{turn_id}/step/{step_id}: /v1/agents/{agent_id}/session/{session_id}/turn/{turn_id}/step/{step_id}:
get: get:
responses: responses:
@ -457,6 +482,26 @@ paths:
required: true required: true
schema: schema:
type: string type: string
/v1/eval/benchmarks/{benchmark_id}:
get:
responses:
'200':
description: OK
content:
application/json:
schema:
oneOf:
- $ref: '#/components/schemas/Benchmark'
- type: 'null'
tags:
- Benchmarks
description: ''
parameters:
- name: benchmark_id
in: path
required: true
schema:
type: string
/v1/datasets/{dataset_id}: /v1/datasets/{dataset_id}:
get: get:
responses: responses:
@ -490,26 +535,6 @@ paths:
required: true required: true
schema: schema:
type: string type: string
/v1/eval/tasks/{benchmark_id}:
get:
responses:
'200':
description: OK
content:
application/json:
schema:
oneOf:
- $ref: '#/components/schemas/Benchmark'
- type: 'null'
tags:
- Benchmarks
description: ''
parameters:
- name: benchmark_id
in: path
required: true
schema:
type: string
/v1/models/{model_id}: /v1/models/{model_id}:
get: get:
responses: responses:
@ -852,7 +877,7 @@ paths:
schema: schema:
$ref: '#/components/schemas/InvokeToolRequest' $ref: '#/components/schemas/InvokeToolRequest'
required: true required: true
/v1/eval/tasks/{benchmark_id}/jobs/{job_id}: /v1/eval/benchmarks/{benchmark_id}/jobs/{job_id}:
get: get:
responses: responses:
'200': '200':
@ -895,7 +920,52 @@ paths:
required: true required: true
schema: schema:
type: string type: string
/v1/eval/tasks/{benchmark_id}/jobs/{job_id}/result: /v1/eval/tasks/{task_id}/jobs/{job_id}:
get:
responses:
'200':
description: OK
content:
application/json:
schema:
oneOf:
- $ref: '#/components/schemas/JobStatus'
- type: 'null'
tags:
- Eval
description: ''
parameters:
- name: task_id
in: path
required: true
schema:
type: string
- name: job_id
in: path
required: true
schema:
type: string
deprecated: true
delete:
responses:
'200':
description: OK
tags:
- Eval
description: ''
parameters:
- name: task_id
in: path
required: true
schema:
type: string
- name: job_id
in: path
required: true
schema:
type: string
deprecated: true
/v1/eval/benchmarks/{benchmark_id}/jobs/{job_id}/result:
get: get:
responses: responses:
'200': '200':
@ -908,16 +978,67 @@ paths:
- Eval - Eval
description: '' description: ''
parameters: parameters:
- name: job_id
in: path
required: true
schema:
type: string
- name: benchmark_id - name: benchmark_id
in: path in: path
required: true required: true
schema: schema:
type: string type: string
- name: job_id
in: path
required: true
schema:
type: string
/v1/eval/tasks/{task_id}/jobs/{job_id}/result:
get:
responses:
'200':
description: OK
content:
application/json:
schema:
$ref: '#/components/schemas/EvaluateResponse'
tags:
- Eval
description: ''
parameters:
- name: task_id
in: path
required: true
schema:
type: string
- name: job_id
in: path
required: true
schema:
type: string
deprecated: true
/v1/eval/benchmarks:
get:
responses:
'200':
description: OK
content:
application/json:
schema:
$ref: '#/components/schemas/ListBenchmarksResponse'
tags:
- Benchmarks
description: ''
parameters: []
post:
responses:
'200':
description: OK
tags:
- Benchmarks
description: ''
parameters: []
requestBody:
content:
application/json:
schema:
$ref: '#/components/schemas/RegisterBenchmarkRequest'
required: true
/v1/datasets: /v1/datasets:
get: get:
responses: responses:
@ -945,33 +1066,6 @@ paths:
schema: schema:
$ref: '#/components/schemas/RegisterDatasetRequest' $ref: '#/components/schemas/RegisterDatasetRequest'
required: true required: true
/v1/eval/tasks:
get:
responses:
'200':
description: OK
content:
application/json:
schema:
$ref: '#/components/schemas/ListBenchmarksResponse'
tags:
- Benchmarks
description: ''
parameters: []
post:
responses:
'200':
description: OK
tags:
- Benchmarks
description: ''
parameters: []
requestBody:
content:
application/json:
schema:
$ref: '#/components/schemas/RegisterBenchmarkRequest'
required: true
/v1/models: /v1/models:
get: get:
responses: responses:
@ -1328,7 +1422,7 @@ paths:
type: array type: array
items: items:
type: string type: string
/v1/eval/tasks/{benchmark_id}/jobs: /v1/eval/benchmarks/{benchmark_id}/jobs:
post: post:
responses: responses:
'200': '200':
@ -1352,6 +1446,31 @@ paths:
schema: schema:
$ref: '#/components/schemas/RunEvalRequest' $ref: '#/components/schemas/RunEvalRequest'
required: true required: true
/v1/eval/tasks/{task_id}/jobs:
post:
responses:
'200':
description: OK
content:
application/json:
schema:
$ref: '#/components/schemas/Job'
tags:
- Eval
description: ''
parameters:
- name: task_id
in: path
required: true
schema:
type: string
requestBody:
content:
application/json:
schema:
$ref: '#/components/schemas/RunEvalDeprecatedRequest'
required: true
deprecated: true
/v1/safety/run-shield: /v1/safety/run-shield:
post: post:
responses: responses:
@ -1527,10 +1646,10 @@ components:
additionalProperties: false additionalProperties: false
required: required:
- data - data
DeprecatedRegisterBenchmarkRequest: DeprecatedRegisterEvalTaskRequest:
type: object type: object
properties: properties:
benchmark_id: task_id:
type: string type: string
dataset_id: dataset_id:
type: string type: string
@ -1554,7 +1673,7 @@ components:
- type: object - type: object
additionalProperties: false additionalProperties: false
required: required:
- benchmark_id - task_id
- dataset_id - dataset_id
- scoring_functions - scoring_functions
AppendRowsRequest: AppendRowsRequest:
@ -3063,26 +3182,6 @@ components:
- median - median
- categorical_count - categorical_count
- accuracy - accuracy
AppBenchmarkConfig:
type: object
properties:
type:
type: string
const: app
default: app
eval_candidate:
$ref: '#/components/schemas/EvalCandidate'
scoring_params:
type: object
additionalProperties:
$ref: '#/components/schemas/ScoringFnParams'
num_examples:
type: integer
additionalProperties: false
required:
- type
- eval_candidate
- scoring_params
BasicScoringFnParams: BasicScoringFnParams:
type: object type: object
properties: properties:
@ -3097,21 +3196,21 @@ components:
additionalProperties: false additionalProperties: false
required: required:
- type - type
BenchmarkBenchmarkConfig: BenchmarkConfig:
type: object type: object
properties: properties:
type:
type: string
const: benchmark
default: benchmark
eval_candidate: eval_candidate:
$ref: '#/components/schemas/EvalCandidate' $ref: '#/components/schemas/EvalCandidate'
scoring_params:
type: object
additionalProperties:
$ref: '#/components/schemas/ScoringFnParams'
num_examples: num_examples:
type: integer type: integer
additionalProperties: false additionalProperties: false
required: required:
- type
- eval_candidate - eval_candidate
- scoring_params
EvalCandidate: EvalCandidate:
oneOf: oneOf:
- $ref: '#/components/schemas/ModelCandidate' - $ref: '#/components/schemas/ModelCandidate'
@ -3121,15 +3220,6 @@ components:
mapping: mapping:
model: '#/components/schemas/ModelCandidate' model: '#/components/schemas/ModelCandidate'
agent: '#/components/schemas/AgentCandidate' agent: '#/components/schemas/AgentCandidate'
BenchmarkConfig:
oneOf:
- $ref: '#/components/schemas/BenchmarkBenchmarkConfig'
- $ref: '#/components/schemas/AppBenchmarkConfig'
discriminator:
propertyName: type
mapping:
benchmark: '#/components/schemas/BenchmarkBenchmarkConfig'
app: '#/components/schemas/AppBenchmarkConfig'
LLMAsJudgeScoringFnParams: LLMAsJudgeScoringFnParams:
type: object type: object
properties: properties:
@ -3278,6 +3368,32 @@ components:
required: required:
- score_rows - score_rows
- aggregated_results - aggregated_results
EvaluateRowsDeprecatedRequest:
type: object
properties:
input_rows:
type: array
items:
type: object
additionalProperties:
oneOf:
- type: 'null'
- type: boolean
- type: number
- type: string
- type: array
- type: object
scoring_functions:
type: array
items:
type: string
task_config:
$ref: '#/components/schemas/BenchmarkConfig'
additionalProperties: false
required:
- input_rows
- scoring_functions
- task_config
Session: Session:
type: object type: object
properties: properties:
@ -4645,36 +4761,6 @@ components:
additionalProperties: false additionalProperties: false
required: required:
- data - data
RegisterDatasetRequest:
type: object
properties:
dataset_id:
type: string
dataset_schema:
type: object
additionalProperties:
$ref: '#/components/schemas/ParamType'
url:
$ref: '#/components/schemas/URL'
provider_dataset_id:
type: string
provider_id:
type: string
metadata:
type: object
additionalProperties:
oneOf:
- type: 'null'
- type: boolean
- type: number
- type: string
- type: array
- type: object
additionalProperties: false
required:
- dataset_id
- dataset_schema
- url
RegisterBenchmarkRequest: RegisterBenchmarkRequest:
type: object type: object
properties: properties:
@ -4705,6 +4791,36 @@ components:
- benchmark_id - benchmark_id
- dataset_id - dataset_id
- scoring_functions - scoring_functions
RegisterDatasetRequest:
type: object
properties:
dataset_id:
type: string
dataset_schema:
type: object
additionalProperties:
$ref: '#/components/schemas/ParamType'
url:
$ref: '#/components/schemas/URL'
provider_dataset_id:
type: string
provider_id:
type: string
metadata:
type: object
additionalProperties:
oneOf:
- type: 'null'
- type: boolean
- type: number
- type: string
- type: array
- type: object
additionalProperties: false
required:
- dataset_id
- dataset_schema
- url
RegisterModelRequest: RegisterModelRequest:
type: object type: object
properties: properties:
@ -4827,6 +4943,14 @@ components:
additionalProperties: false additionalProperties: false
required: required:
- job_id - job_id
RunEvalDeprecatedRequest:
type: object
properties:
task_config:
$ref: '#/components/schemas/BenchmarkConfig'
additionalProperties: false
required:
- task_config
RunShieldRequest: RunShieldRequest:
type: object type: object
properties: properties:
@ -5125,10 +5249,10 @@ tags:
x-displayName: >- x-displayName: >-
Agents API for creating and interacting with agentic systems. Agents API for creating and interacting with agentic systems.
- name: BatchInference (Coming Soon) - name: BatchInference (Coming Soon)
- name: Benchmarks
- name: DatasetIO - name: DatasetIO
- name: Datasets - name: Datasets
- name: Eval - name: Eval
- name: Benchmarks
- name: Inference - name: Inference
description: >- description: >-
This API provides the raw interface to the underlying models. Two kinds of models This API provides the raw interface to the underlying models. Two kinds of models
@ -5159,10 +5283,10 @@ x-tagGroups:
tags: tags:
- Agents - Agents
- BatchInference (Coming Soon) - BatchInference (Coming Soon)
- Benchmarks
- DatasetIO - DatasetIO
- Datasets - Datasets
- Eval - Eval
- Benchmarks
- Inference - Inference
- Inspect - Inspect
- Models - Models

View file

@ -83,3 +83,28 @@ class Eval(Protocol):
@webmethod(route="/eval/benchmarks/{benchmark_id}/jobs/{job_id}/result", method="GET") @webmethod(route="/eval/benchmarks/{benchmark_id}/jobs/{job_id}/result", method="GET")
async def job_result(self, benchmark_id: str, job_id: str) -> EvaluateResponse: ... async def job_result(self, benchmark_id: str, job_id: str) -> EvaluateResponse: ...
@webmethod(route="/eval/tasks/{task_id}/jobs", method="POST")
async def run_eval_DEPRECATED(
self,
task_id: str,
task_config: BenchmarkConfig,
) -> Job: ...
@webmethod(route="/eval/tasks/{task_id}/evaluations", method="POST")
async def evaluate_rows_DEPRECATED(
self,
task_id: str,
input_rows: List[Dict[str, Any]],
scoring_functions: List[str],
task_config: BenchmarkConfig,
) -> EvaluateResponse: ...
@webmethod(route="/eval/tasks/{task_id}/jobs/{job_id}", method="GET")
async def job_status_DEPRECATED(self, task_id: str, job_id: str) -> Optional[JobStatus]: ...
@webmethod(route="/eval/tasks/{task_id}/jobs/{job_id}", method="DELETE")
async def job_cancel_DEPRECATED(self, task_id: str, job_id: str) -> None: ...
@webmethod(route="/eval/tasks/{task_id}/jobs/{job_id}/result", method="GET")
async def job_result_DEPRECATED(self, task_id: str, job_id: str) -> EvaluateResponse: ...

View file

@ -9,7 +9,6 @@ from typing import Any, AsyncGenerator, Dict, List, Optional
from llama_stack.apis.common.content_types import InterleavedContent, URL from llama_stack.apis.common.content_types import InterleavedContent, URL
from llama_stack.apis.datasetio import DatasetIO, PaginatedRowsResult from llama_stack.apis.datasetio import DatasetIO, PaginatedRowsResult
from llama_stack.apis.eval import ( from llama_stack.apis.eval import (
AppBenchmarkConfig,
BenchmarkConfig, BenchmarkConfig,
Eval, Eval,
EvaluateResponse, EvaluateResponse,
@ -348,7 +347,7 @@ class EvalRouter(Eval):
async def run_eval( async def run_eval(
self, self,
benchmark_id: str, benchmark_id: str,
task_config: AppBenchmarkConfig, task_config: BenchmarkConfig,
) -> Job: ) -> Job:
return await self.routing_table.get_provider_impl(benchmark_id).run_eval( return await self.routing_table.get_provider_impl(benchmark_id).run_eval(
benchmark_id=benchmark_id, benchmark_id=benchmark_id,