diff --git a/docs/_static/llama-stack-spec.html b/docs/_static/llama-stack-spec.html
index 7ba26acb7..817a65ca8 100644
--- a/docs/_static/llama-stack-spec.html
+++ b/docs/_static/llama-stack-spec.html
@@ -2426,7 +2426,14 @@
"post": {
"responses": {
"200": {
- "description": "OK"
+ "description": "OK",
+ "content": {
+ "application/json": {
+ "schema": {
+ "$ref": "#/components/schemas/Benchmark"
+ }
+ }
+ }
},
"400": {
"$ref": "#/components/responses/BadRequest400"
@@ -2444,7 +2451,7 @@
"tags": [
"Benchmarks"
],
- "description": "",
+ "description": "Register a new benchmark.",
"parameters": [],
"requestBody": {
"content": {
@@ -7098,13 +7105,15 @@
"default": "benchmark"
},
"dataset_id": {
- "type": "string"
+ "type": "string",
+ "description": "The ID of the dataset to be used to run the benchmark."
},
"scoring_functions": {
"type": "array",
"items": {
- "type": "string"
- }
+ "$ref": "#/components/schemas/ScoringFnParams"
+ },
+ "description": "The scoring functions with parameters to use for this benchmark."
},
"metadata": {
"type": "object",
@@ -7129,7 +7138,8 @@
"type": "object"
}
]
- }
+ },
+ "description": "Metadata for this benchmark for additional descriptions."
}
},
"additionalProperties": false,
@@ -9448,23 +9458,20 @@
"RegisterBenchmarkRequest": {
"type": "object",
"properties": {
- "benchmark_id": {
- "type": "string"
- },
"dataset_id": {
- "type": "string"
+ "type": "string",
+ "description": "The ID of the dataset to be used to run the benchmark."
},
"scoring_functions": {
"type": "array",
"items": {
- "type": "string"
- }
+ "$ref": "#/components/schemas/ScoringFnParams"
+ },
+ "description": "The scoring functions with parameters to use for this benchmark."
},
- "provider_benchmark_id": {
- "type": "string"
- },
- "provider_id": {
- "type": "string"
+ "benchmark_id": {
+ "type": "string",
+ "description": "(Optional) The ID of the benchmark to register. If not provided, a random ID will be generated."
},
"metadata": {
"type": "object",
@@ -9489,12 +9496,12 @@
"type": "object"
}
]
- }
+ },
+ "description": "(Optional) Metadata for this benchmark for additional descriptions."
}
},
"additionalProperties": false,
"required": [
- "benchmark_id",
"dataset_id",
"scoring_functions"
],
diff --git a/docs/_static/llama-stack-spec.yaml b/docs/_static/llama-stack-spec.yaml
index e37c49100..62fb02651 100644
--- a/docs/_static/llama-stack-spec.yaml
+++ b/docs/_static/llama-stack-spec.yaml
@@ -1635,6 +1635,10 @@ paths:
responses:
'200':
description: OK
+ content:
+ application/json:
+ schema:
+ $ref: '#/components/schemas/Benchmark'
'400':
$ref: '#/components/responses/BadRequest400'
'429':
@@ -1647,7 +1651,7 @@ paths:
$ref: '#/components/responses/DefaultError'
tags:
- Benchmarks
- description: ''
+ description: Register a new benchmark.
parameters: []
requestBody:
content:
@@ -4950,10 +4954,14 @@ components:
default: benchmark
dataset_id:
type: string
+ description: >-
+ The ID of the dataset to be used to run the benchmark.
scoring_functions:
type: array
items:
- type: string
+ $ref: '#/components/schemas/ScoringFnParams'
+ description: >-
+ The scoring functions with parameters to use for this benchmark.
metadata:
type: object
additionalProperties:
@@ -4964,6 +4972,8 @@ components:
- type: string
- type: array
- type: object
+ description: >-
+ Metadata for this benchmark for additional descriptions.
additionalProperties: false
required:
- identifier
@@ -6438,18 +6448,21 @@ components:
RegisterBenchmarkRequest:
type: object
properties:
- benchmark_id:
- type: string
dataset_id:
type: string
+ description: >-
+ The ID of the dataset to be used to run the benchmark.
scoring_functions:
type: array
items:
- type: string
- provider_benchmark_id:
- type: string
- provider_id:
+ $ref: '#/components/schemas/ScoringFnParams'
+ description: >-
+ The scoring functions with parameters to use for this benchmark.
+ benchmark_id:
type: string
+ description: >-
+ (Optional) The ID of the benchmark to register. If not provided, a random
+ ID will be generated.
metadata:
type: object
additionalProperties:
@@ -6460,9 +6473,10 @@ components:
- type: string
- type: array
- type: object
+ description: >-
+ (Optional) Metadata for this benchmark for additional descriptions.
additionalProperties: false
required:
- - benchmark_id
- dataset_id
- scoring_functions
title: RegisterBenchmarkRequest
diff --git a/llama_stack/apis/benchmarks/benchmarks.py b/llama_stack/apis/benchmarks/benchmarks.py
index 39ba355e9..6738a6653 100644
--- a/llama_stack/apis/benchmarks/benchmarks.py
+++ b/llama_stack/apis/benchmarks/benchmarks.py
@@ -8,15 +8,22 @@ from typing import Any, Dict, List, Literal, Optional, Protocol, runtime_checkab
from pydantic import BaseModel, Field
from llama_stack.apis.resource import Resource, ResourceType
+from llama_stack.apis.scoring_functions import ScoringFnParams
from llama_stack.schema_utils import json_schema_type, webmethod
class CommonBenchmarkFields(BaseModel):
+ """
+ :param dataset_id: The ID of the dataset to be used to run the benchmark.
+ :param scoring_functions: The scoring functions with parameters to use for this benchmark.
+ :param metadata: Metadata for this benchmark for additional descriptions.
+ """
+
dataset_id: str
- scoring_functions: List[str]
+ scoring_functions: List[ScoringFnParams]
metadata: Dict[str, Any] = Field(
default_factory=dict,
- description="Metadata for this evaluation task",
+ description="Metadata for this benchmark",
)
@@ -57,10 +64,17 @@ class Benchmarks(Protocol):
@webmethod(route="/eval/benchmarks", method="POST")
async def register_benchmark(
self,
- benchmark_id: str,
dataset_id: str,
- scoring_functions: List[str],
- provider_benchmark_id: Optional[str] = None,
- provider_id: Optional[str] = None,
+ scoring_functions: List[ScoringFnParams],
+ benchmark_id: Optional[str] = None,
metadata: Optional[Dict[str, Any]] = None,
- ) -> None: ...
+ ) -> Benchmark:
+ """
+ Register a new benchmark.
+
+ :param dataset_id: The ID of the dataset to be used to run the benchmark.
+ :param scoring_functions: The scoring functions with parameters to use for this benchmark.
+ :param benchmark_id: (Optional) The ID of the benchmark to register. If not provided, a random ID will be generated.
+ :param metadata: (Optional) Metadata for this benchmark for additional descriptions.
+ """
+ ...