Mirror of https://github.com/meta-llama/llama-stack.git, synced 2025-08-07 19:12:09 +00:00
include benchmarks
This commit is contained in:
parent e68e8c96ae
commit b4d868a1e5

3 changed files with 70 additions and 35 deletions
43 docs/_static/llama-stack-spec.html (vendored)
@@ -2426,7 +2426,14 @@
             "post": {
                 "responses": {
                     "200": {
-                        "description": "OK"
+                        "description": "OK",
+                        "content": {
+                            "application/json": {
+                                "schema": {
+                                    "$ref": "#/components/schemas/Benchmark"
+                                }
+                            }
+                        }
                     },
                     "400": {
                         "$ref": "#/components/responses/BadRequest400"
@@ -2444,7 +2451,7 @@
                 "tags": [
                     "Benchmarks"
                 ],
-                "description": "",
+                "description": "Register a new benchmark.",
                 "parameters": [],
                 "requestBody": {
                     "content": {
@@ -7098,13 +7105,15 @@
                         "default": "benchmark"
                     },
                     "dataset_id": {
-                        "type": "string"
+                        "type": "string",
+                        "description": "The ID of the dataset to used to run the benchmark."
                     },
                     "scoring_functions": {
                         "type": "array",
                         "items": {
-                            "type": "string"
-                        }
+                            "$ref": "#/components/schemas/ScoringFnParams"
+                        },
+                        "description": "The scoring functions with parameters to use for this benchmark."
                     },
                     "metadata": {
                         "type": "object",
@@ -7129,7 +7138,8 @@
                                    "type": "object"
                                }
                            ]
-                        }
+                        },
+                        "description": "Metadata for this benchmark for additional descriptions."
                    }
                },
                "additionalProperties": false,
@@ -9448,23 +9458,20 @@
            "RegisterBenchmarkRequest": {
                "type": "object",
                "properties": {
-                    "benchmark_id": {
-                        "type": "string"
-                    },
                    "dataset_id": {
-                        "type": "string"
+                        "type": "string",
+                        "description": "The ID of the dataset to used to run the benchmark."
                    },
                    "scoring_functions": {
                        "type": "array",
                        "items": {
-                            "type": "string"
-                        }
+                            "$ref": "#/components/schemas/ScoringFnParams"
+                        },
+                        "description": "The scoring functions with parameters to use for this benchmark."
                    },
-                    "provider_benchmark_id": {
-                        "type": "string"
-                    },
-                    "provider_id": {
-                        "type": "string"
+                    "benchmark_id": {
+                        "type": "string",
+                        "description": "(Optional) The ID of the benchmark to register. If not provided, a random ID will be generated."
                    },
                    "metadata": {
                        "type": "object",
@@ -9489,12 +9496,12 @@
                                    "type": "object"
                                }
                            ]
-                        }
+                        },
+                        "description": "(Optional) Metadata for this benchmark for additional descriptions."
                    }
                },
                "additionalProperties": false,
                "required": [
-                    "benchmark_id",
                    "dataset_id",
                    "scoring_functions"
                ],
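For reference, a request body satisfying the updated RegisterBenchmarkRequest schema could look like the sketch below. This is an illustration rather than part of the commit: the concrete values are invented, and the shape of each scoring_functions entry is governed by the ScoringFnParams schema referenced above, whose fields are not shown in this diff. Note that benchmark_id is no longer required; when omitted, a random ID is generated.

# Hypothetical RegisterBenchmarkRequest payload under the updated schema.
# Values are placeholders; only the key names and required/optional status
# come from the schema changes shown above.
register_benchmark_request = {
    # required
    "dataset_id": "my-eval-dataset",
    "scoring_functions": [
        # each entry must validate against #/components/schemas/ScoringFnParams,
        # whose fields are defined elsewhere in the spec
    ],
    # optional: omit to have a random benchmark ID generated
    "benchmark_id": "my-benchmark",
    # optional free-form metadata
    "metadata": {"owner": "eval-team"},
}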
32 docs/_static/llama-stack-spec.yaml (vendored)
@@ -1635,6 +1635,10 @@ paths:
       responses:
         '200':
           description: OK
+          content:
+            application/json:
+              schema:
+                $ref: '#/components/schemas/Benchmark'
         '400':
           $ref: '#/components/responses/BadRequest400'
         '429':
@@ -1647,7 +1651,7 @@ paths:
           $ref: '#/components/responses/DefaultError'
       tags:
         - Benchmarks
-      description: ''
+      description: Register a new benchmark.
       parameters: []
       requestBody:
         content:
@@ -4950,10 +4954,14 @@ components:
           default: benchmark
         dataset_id:
           type: string
+          description: >-
+            The ID of the dataset to used to run the benchmark.
         scoring_functions:
           type: array
           items:
-            type: string
+            $ref: '#/components/schemas/ScoringFnParams'
+          description: >-
+            The scoring functions with parameters to use for this benchmark.
         metadata:
           type: object
           additionalProperties:
@@ -4964,6 +4972,8 @@ components:
             - type: string
             - type: array
             - type: object
+          description: >-
+            Metadata for this benchmark for additional descriptions.
       additionalProperties: false
       required:
         - identifier
@@ -6438,18 +6448,21 @@ components:
     RegisterBenchmarkRequest:
       type: object
       properties:
-        benchmark_id:
-          type: string
         dataset_id:
           type: string
+          description: >-
+            The ID of the dataset to used to run the benchmark.
         scoring_functions:
           type: array
           items:
-            type: string
+            $ref: '#/components/schemas/ScoringFnParams'
+          description: >-
+            The scoring functions with parameters to use for this benchmark.
-        provider_benchmark_id:
-          type: string
-        provider_id:
-          type: string
+        benchmark_id:
+          type: string
+          description: >-
+            (Optional) The ID of the benchmark to register. If not provided, a random
+            ID will be generated.
         metadata:
           type: object
           additionalProperties:
@@ -6460,9 +6473,10 @@ components:
             - type: string
             - type: array
             - type: object
+          description: >-
+            (Optional) Metadata for this benchmark for additional descriptions.
       additionalProperties: false
       required:
-        - benchmark_id
         - dataset_id
         - scoring_functions
       title: RegisterBenchmarkRequest
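Because the 200 response now carries the registered Benchmark object rather than a bare OK, a client can read the stored resource straight from the response body. A minimal sketch follows, assuming a plain urllib client; the endpoint URL, port, and payload values are placeholders, and only the response field names (identifier, dataset_id, metadata) come from the Benchmark schema updated above.

# Minimal sketch of consuming the updated 200 response. The base URL and the
# request payload are assumptions; only the response field names are taken
# from the Benchmark schema in this spec.
import json
from urllib.request import Request, urlopen

req = Request(
    "http://localhost:8321/v1/eval/benchmarks",  # hypothetical base URL and route prefix
    data=json.dumps({"dataset_id": "my-eval-dataset", "scoring_functions": []}).encode(),
    headers={"Content-Type": "application/json"},
    method="POST",
)
with urlopen(req) as resp:
    benchmark = json.load(resp)  # body is now the registered Benchmark object, not just "OK"
print(benchmark["identifier"], benchmark["dataset_id"], benchmark["metadata"])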
30 llama_stack/apis/benchmarks/benchmarks.py
@@ -8,15 +8,22 @@ from typing import Any, Dict, List, Literal, Optional, Protocol, runtime_checkable
 from pydantic import BaseModel, Field
 
 from llama_stack.apis.resource import Resource, ResourceType
+from llama_stack.apis.scoring_functions import ScoringFnParams
 from llama_stack.schema_utils import json_schema_type, webmethod
 
 
 class CommonBenchmarkFields(BaseModel):
+    """
+    :param dataset_id: The ID of the dataset to used to run the benchmark.
+    :param scoring_functions: The scoring functions with parameters to use for this benchmark.
+    :param metadata: Metadata for this benchmark for additional descriptions.
+    """
+
     dataset_id: str
-    scoring_functions: List[str]
+    scoring_functions: List[ScoringFnParams]
     metadata: Dict[str, Any] = Field(
         default_factory=dict,
-        description="Metadata for this evaluation task",
+        description="Metadata for this benchmark",
     )
 
 
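The field-type change above (scoring_functions going from List[str] to List[ScoringFnParams]) means callers now pass structured parameter objects rather than scoring-function names. A minimal sketch of what that implies for validation, using a stand-in model, since the actual ScoringFnParams union lives in llama_stack.apis.scoring_functions and its fields are not shown in this diff.

# Stand-in sketch: ScoringFnParamsStub is a placeholder for the real
# ScoringFnParams union; the real variants and their fields are not part
# of this diff.
from typing import Any, Dict, List

from pydantic import BaseModel, Field


class ScoringFnParamsStub(BaseModel):
    type: str  # placeholder discriminator field


class CommonBenchmarkFieldsSketch(BaseModel):
    dataset_id: str
    scoring_functions: List[ScoringFnParamsStub]  # was List[str] before this commit
    metadata: Dict[str, Any] = Field(default_factory=dict)


fields = CommonBenchmarkFieldsSketch(
    dataset_id="my-eval-dataset",
    scoring_functions=[ScoringFnParamsStub(type="llm_as_judge")],  # illustrative value
)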
@@ -57,10 +64,17 @@ class Benchmarks(Protocol):
     @webmethod(route="/eval/benchmarks", method="POST")
     async def register_benchmark(
         self,
-        benchmark_id: str,
         dataset_id: str,
-        scoring_functions: List[str],
-        provider_benchmark_id: Optional[str] = None,
-        provider_id: Optional[str] = None,
+        scoring_functions: List[ScoringFnParams],
+        benchmark_id: Optional[str] = None,
         metadata: Optional[Dict[str, Any]] = None,
-    ) -> None: ...
+    ) -> Benchmark:
+        """
+        Register a new benchmark.
+
+        :param dataset_id: The ID of the dataset to used to run the benchmark.
+        :param scoring_functions: The scoring functions with parameters to use for this benchmark.
+        :param benchmark_id: (Optional) The ID of the benchmark to register. If not provided, a random ID will be generated.
+        :param metadata: (Optional) Metadata for this benchmark for additional descriptions.
+        """
+        ...
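Against the new signature, registering a benchmark looks roughly like the sketch below. The benchmarks_impl object is a placeholder for any implementation of the Benchmarks protocol, and the ScoringFnParams construction is elided because it is defined outside this diff.

# Hypothetical caller of the updated register_benchmark signature.
# `benchmarks_impl` stands for any object implementing the Benchmarks protocol;
# `scoring_params` stands for a list of ScoringFnParams instances built elsewhere.
async def register_example(benchmarks_impl, scoring_params):
    benchmark = await benchmarks_impl.register_benchmark(
        dataset_id="my-eval-dataset",
        scoring_functions=scoring_params,
        # benchmark_id omitted: a random ID will be generated
        metadata={"owner": "eval-team"},
    )
    return benchmark.identifier  # the registered Benchmark resource is returned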