From b4d868a1e55f1755a51e29d354dc84a5667b56cb Mon Sep 17 00:00:00 2001
From: Xi Yan
Date: Wed, 12 Mar 2025 00:43:24 -0700
Subject: [PATCH] include benchmarks

---
 docs/_static/llama-stack-spec.html        | 45 +++++++++++++----------
 docs/_static/llama-stack-spec.yaml        | 32 +++++++++++-----
 llama_stack/apis/benchmarks/benchmarks.py | 28 ++++++++++----
 3 files changed, 70 insertions(+), 35 deletions(-)

diff --git a/docs/_static/llama-stack-spec.html b/docs/_static/llama-stack-spec.html
index 7ba26acb7..817a65ca8 100644
--- a/docs/_static/llama-stack-spec.html
+++ b/docs/_static/llama-stack-spec.html
@@ -2426,7 +2426,14 @@
       "post": {
         "responses": {
           "200": {
-            "description": "OK"
+            "description": "OK",
+            "content": {
+              "application/json": {
+                "schema": {
+                  "$ref": "#/components/schemas/Benchmark"
+                }
+              }
+            }
           },
           "400": {
             "$ref": "#/components/responses/BadRequest400"
@@ -2444,7 +2451,7 @@
         "tags": [
           "Benchmarks"
         ],
-        "description": "",
+        "description": "Register a new benchmark.",
         "parameters": [],
         "requestBody": {
           "content": {
@@ -7098,13 +7105,15 @@
             "default": "benchmark"
           },
           "dataset_id": {
-            "type": "string"
+            "type": "string",
+            "description": "The ID of the dataset used to run the benchmark."
           },
           "scoring_functions": {
             "type": "array",
             "items": {
-              "type": "string"
-            }
+              "$ref": "#/components/schemas/ScoringFnParams"
+            },
+            "description": "The scoring functions with parameters to use for this benchmark."
           },
           "metadata": {
             "type": "object",
@@ -7129,7 +7138,8 @@
                 "type": "object"
               }
             ]
-          }
+          },
+          "description": "Metadata for this benchmark, used for additional descriptions."
         }
       },
       "additionalProperties": false,
@@ -9448,23 +9458,20 @@
       "RegisterBenchmarkRequest": {
         "type": "object",
         "properties": {
-          "benchmark_id": {
-            "type": "string"
-          },
           "dataset_id": {
-            "type": "string"
+            "type": "string",
+            "description": "The ID of the dataset used to run the benchmark."
           },
           "scoring_functions": {
             "type": "array",
             "items": {
-              "type": "string"
-            }
+              "$ref": "#/components/schemas/ScoringFnParams"
+            },
+            "description": "The scoring functions with parameters to use for this benchmark."
           },
-          "provider_benchmark_id": {
-            "type": "string"
-          },
-          "provider_id": {
-            "type": "string"
+          "benchmark_id": {
+            "type": "string",
+            "description": "(Optional) The ID of the benchmark to register. If not provided, a random ID will be generated."
          },
           "metadata": {
             "type": "object",
@@ -9489,12 +9496,12 @@
                 "type": "object"
               }
             ]
-          }
+          },
+          "description": "(Optional) Metadata for this benchmark, used for additional descriptions."
         }
       },
       "additionalProperties": false,
       "required": [
-        "benchmark_id",
         "dataset_id",
         "scoring_functions"
       ],
diff --git a/docs/_static/llama-stack-spec.yaml b/docs/_static/llama-stack-spec.yaml
index e37c49100..62fb02651 100644
--- a/docs/_static/llama-stack-spec.yaml
+++ b/docs/_static/llama-stack-spec.yaml
@@ -1635,6 +1635,10 @@ paths:
       responses:
         '200':
           description: OK
+          content:
+            application/json:
+              schema:
+                $ref: '#/components/schemas/Benchmark'
         '400':
           $ref: '#/components/responses/BadRequest400'
         '429':
@@ -1647,7 +1651,7 @@ paths:
           $ref: '#/components/responses/DefaultError'
       tags:
         - Benchmarks
-      description: ''
+      description: Register a new benchmark.
       parameters: []
       requestBody:
         content:
@@ -4950,10 +4954,14 @@ components:
          default: benchmark
        dataset_id:
          type: string
+          description: >-
+            The ID of the dataset used to run the benchmark.
        scoring_functions:
          type: array
          items:
-            type: string
+            $ref: '#/components/schemas/ScoringFnParams'
+          description: >-
+            The scoring functions with parameters to use for this benchmark.
        metadata:
          type: object
          additionalProperties:
@@ -4964,6 +4972,8 @@ components:
            - type: string
            - type: array
            - type: object
+          description: >-
+            Metadata for this benchmark, used for additional descriptions.
      additionalProperties: false
      required:
        - identifier
@@ -6438,18 +6448,21 @@ components:
    RegisterBenchmarkRequest:
      type: object
      properties:
-        benchmark_id:
-          type: string
        dataset_id:
          type: string
+          description: >-
+            The ID of the dataset used to run the benchmark.
        scoring_functions:
          type: array
          items:
-            type: string
-        provider_benchmark_id:
-          type: string
-        provider_id:
+            $ref: '#/components/schemas/ScoringFnParams'
+          description: >-
+            The scoring functions with parameters to use for this benchmark.
+        benchmark_id:
          type: string
+          description: >-
+            (Optional) The ID of the benchmark to register. If not provided, a random
+            ID will be generated.
        metadata:
          type: object
          additionalProperties:
@@ -6460,9 +6473,10 @@ components:
            - type: string
            - type: array
            - type: object
+          description: >-
+            (Optional) Metadata for this benchmark, used for additional descriptions.
      additionalProperties: false
      required:
-        - benchmark_id
        - dataset_id
        - scoring_functions
      title: RegisterBenchmarkRequest
diff --git a/llama_stack/apis/benchmarks/benchmarks.py b/llama_stack/apis/benchmarks/benchmarks.py
index 39ba355e9..6738a6653 100644
--- a/llama_stack/apis/benchmarks/benchmarks.py
+++ b/llama_stack/apis/benchmarks/benchmarks.py
@@ -8,15 +8,22 @@ from typing import Any, Dict, List, Literal, Optional, Protocol, runtime_checkab
 from pydantic import BaseModel, Field
 
 from llama_stack.apis.resource import Resource, ResourceType
+from llama_stack.apis.scoring_functions import ScoringFnParams
 from llama_stack.schema_utils import json_schema_type, webmethod
 
 
 class CommonBenchmarkFields(BaseModel):
+    """
+    :param dataset_id: The ID of the dataset used to run the benchmark.
+    :param scoring_functions: The scoring functions with parameters to use for this benchmark.
+    :param metadata: Metadata for this benchmark, used for additional descriptions.
+    """
+
     dataset_id: str
-    scoring_functions: List[str]
+    scoring_functions: List[ScoringFnParams]
     metadata: Dict[str, Any] = Field(
         default_factory=dict,
-        description="Metadata for this evaluation task",
+        description="Metadata for this benchmark",
     )
 
 
@@ -57,10 +64,17 @@ class Benchmarks(Protocol):
     @webmethod(route="/eval/benchmarks", method="POST")
     async def register_benchmark(
         self,
-        benchmark_id: str,
         dataset_id: str,
-        scoring_functions: List[str],
-        provider_benchmark_id: Optional[str] = None,
-        provider_id: Optional[str] = None,
+        scoring_functions: List[ScoringFnParams],
+        benchmark_id: Optional[str] = None,
         metadata: Optional[Dict[str, Any]] = None,
-    ) -> None: ...
+    ) -> Benchmark:
+        """
+        Register a new benchmark.
+
+        :param dataset_id: The ID of the dataset used to run the benchmark.
+        :param scoring_functions: The scoring functions with parameters to use for this benchmark.
+        :param benchmark_id: (Optional) The ID of the benchmark to register. If not provided, a random ID will be generated.
+        :param metadata: (Optional) Metadata for this benchmark, used for additional descriptions.
+        """
+        ...
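
Reviewer note (not part of the patch): the change makes benchmark_id optional, types scoring_functions as List[ScoringFnParams] instead of List[str], and has register_benchmark return the created Benchmark. Below is a minimal sketch of how a provider might satisfy the updated protocol. It assumes the Resource base class exposes identifier, provider_resource_id, and provider_id fields (only "identifier" is confirmed by the spec excerpt above); InMemoryBenchmarks, the "in-memory" provider name, and the uuid-based ID scheme are illustrative, not part of llama-stack.

    import uuid
    from typing import Any, Dict, List, Optional

    from llama_stack.apis.benchmarks.benchmarks import Benchmark, Benchmarks
    from llama_stack.apis.scoring_functions import ScoringFnParams


    class InMemoryBenchmarks(Benchmarks):
        """Toy registry; a real provider would persist benchmarks elsewhere."""

        def __init__(self) -> None:
            self._store: Dict[str, Benchmark] = {}

        async def register_benchmark(
            self,
            dataset_id: str,
            scoring_functions: List[ScoringFnParams],
            benchmark_id: Optional[str] = None,
            metadata: Optional[Dict[str, Any]] = None,
        ) -> Benchmark:
            # Mirror the documented contract: generate a random ID when none is given.
            benchmark_id = benchmark_id or f"benchmark-{uuid.uuid4().hex[:8]}"
            benchmark = Benchmark(
                identifier=benchmark_id,
                # Assumption: Resource carries provider_resource_id/provider_id;
                # here we reuse the identifier and a made-up provider name.
                provider_resource_id=benchmark_id,
                provider_id="in-memory",
                dataset_id=dataset_id,
                scoring_functions=scoring_functions,
                metadata=metadata or {},
            )
            self._store[benchmark_id] = benchmark
            return benchmark

Returning the Benchmark (rather than None) lets callers that omit benchmark_id learn the server-assigned ID, which is what the new 200-response schema in both spec files documents.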