From 10e8c964539e332ba45583acbdb41561376258ce Mon Sep 17 00:00:00 2001
From: Xi Yan
Date: Wed, 12 Feb 2025 21:18:48 -0800
Subject: [PATCH] add benchmarks

---
 llama_stack/apis/benchmarks/__init__.py   |  7 ++
 llama_stack/apis/benchmarks/benchmarks.py | 86 +++++++++++++++++++++++
 2 files changed, 93 insertions(+)
 create mode 100644 llama_stack/apis/benchmarks/__init__.py
 create mode 100644 llama_stack/apis/benchmarks/benchmarks.py

diff --git a/llama_stack/apis/benchmarks/__init__.py b/llama_stack/apis/benchmarks/__init__.py
new file mode 100644
index 000000000..f8f564957
--- /dev/null
+++ b/llama_stack/apis/benchmarks/__init__.py
@@ -0,0 +1,7 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+from .benchmarks import *  # noqa: F401 F403
diff --git a/llama_stack/apis/benchmarks/benchmarks.py b/llama_stack/apis/benchmarks/benchmarks.py
new file mode 100644
index 000000000..75f2b3d05
--- /dev/null
+++ b/llama_stack/apis/benchmarks/benchmarks.py
@@ -0,0 +1,86 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+from typing import Any, Dict, List, Literal, Optional, Protocol, runtime_checkable
+
+from llama_models.schema_utils import json_schema_type, webmethod
+from pydantic import BaseModel, Field
+
+from llama_stack.apis.resource import Resource, ResourceType
+
+
+class CommonBenchmarkFields(BaseModel):
+    dataset_id: str
+    scoring_functions: List[str]
+    metadata: Dict[str, Any] = Field(
+        default_factory=dict,
+        description="Metadata for this evaluation task",
+    )
+
+
+@json_schema_type
+class Benchmark(CommonBenchmarkFields, Resource):
+    type: Literal[ResourceType.benchmark.value] = ResourceType.benchmark.value
+
+    @property
+    def benchmark_id(self) -> str:
+        return self.identifier
+
+    @property
+    def provider_benchmark_id(self) -> str:
+        return self.provider_resource_id
+
+
+class BenchmarkInput(CommonBenchmarkFields, BaseModel):
+    benchmark_id: str
+    provider_id: Optional[str] = None
+    provider_benchmark_id: Optional[str] = None
+
+
+class ListBenchmarksResponse(BaseModel):
+    data: List[Benchmark]
+
+
+@runtime_checkable
+class Benchmarks(Protocol):
+    @webmethod(route="/eval/benchmarks", method="GET")
+    async def list_benchmarks(self) -> ListBenchmarksResponse: ...
+
+    @webmethod(route="/eval/benchmarks/{benchmark_id}", method="GET")
+    async def get_benchmark(
+        self,
+        benchmark_id: str,
+    ) -> Optional[Benchmark]: ...
+
+    @webmethod(route="/eval/benchmarks", method="POST")
+    async def register_benchmark(
+        self,
+        benchmark_id: str,
+        dataset_id: str,
+        scoring_functions: List[str],
+        provider_benchmark_id: Optional[str] = None,
+        provider_id: Optional[str] = None,
+        metadata: Optional[Dict[str, Any]] = None,
+    ) -> None: ...
+
+    @webmethod(route="/eval-tasks", method="GET")
+    async def DEPRECATED_list_eval_tasks(self) -> ListBenchmarksResponse: ...
+
+    @webmethod(route="/eval-tasks/{task_id}", method="GET")
+    async def DEPRECATED_get_eval_task(
+        self,
+        task_id: str,
+    ) -> Optional[Benchmark]: ...
+
+    @webmethod(route="/eval-tasks", method="POST")
+    async def DEPRECATED_register_eval_task(
+        self,
+        task_id: str,
+        dataset_id: str,
+        scoring_functions: List[str],
+        provider_benchmark_id: Optional[str] = None,
+        provider_id: Optional[str] = None,
+        metadata: Optional[Dict[str, Any]] = None,
+    ) -> None: ...
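Note (not part of the patch): the sketch below is a minimal, hypothetical in-memory implementation of the Benchmarks protocol added above, to show how the register/get/list routes fit together. The class name InMemoryBenchmarks and the "inline" provider id are illustrative only, the deprecated /eval-tasks routes are omitted for brevity, and the identifier / provider_resource_id / provider_id constructor fields are assumed to be inherited from Resource.

# Hypothetical sketch only; not included in this patch.
from typing import Any, Dict, List, Optional

from llama_stack.apis.benchmarks import (
    Benchmark,
    Benchmarks,
    ListBenchmarksResponse,
)


class InMemoryBenchmarks(Benchmarks):
    """Keeps registered benchmarks in a dict keyed by benchmark_id."""

    def __init__(self) -> None:
        self._benchmarks: Dict[str, Benchmark] = {}

    async def list_benchmarks(self) -> ListBenchmarksResponse:
        return ListBenchmarksResponse(data=list(self._benchmarks.values()))

    async def get_benchmark(self, benchmark_id: str) -> Optional[Benchmark]:
        return self._benchmarks.get(benchmark_id)

    async def register_benchmark(
        self,
        benchmark_id: str,
        dataset_id: str,
        scoring_functions: List[str],
        provider_benchmark_id: Optional[str] = None,
        provider_id: Optional[str] = None,
        metadata: Optional[Dict[str, Any]] = None,
    ) -> None:
        # identifier / provider_resource_id / provider_id are assumed to be
        # the fields Benchmark inherits from Resource.
        self._benchmarks[benchmark_id] = Benchmark(
            identifier=benchmark_id,
            provider_resource_id=provider_benchmark_id or benchmark_id,
            provider_id=provider_id or "inline",
            dataset_id=dataset_id,
            scoring_functions=scoring_functions,
            metadata=metadata or {},
        )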