feat: convert Benchmarks API to use FastAPI router (#4309)

# What does this PR do?

Convert the Benchmarks API from `@webmethod` decorators to the FastAPI router
pattern, matching the Batches API structure.

One notable change is an update to `stack.py` so that `register_resources()`
can handle list methods that take request models.
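
For reference, a condensed sketch of the new router pattern (trimmed from the
`fastapi_routes.py` module added in this PR; the generated query/path dependency
helpers are replaced with a bare `Depends()` and the get/unregister endpoints are
omitted for brevity):

```python
# Condensed sketch of llama_stack_api/benchmarks/fastapi_routes.py (this PR).
from typing import Annotated

from fastapi import APIRouter, Body, Depends

from llama_stack_api import (
    Benchmarks,
    ListBenchmarksRequest,
    ListBenchmarksResponse,
    RegisterBenchmarkRequest,
)
from llama_stack_api.version import LLAMA_STACK_API_V1ALPHA


def create_router(impl: Benchmarks) -> APIRouter:
    router = APIRouter(prefix=f"/{LLAMA_STACK_API_V1ALPHA}", tags=["Benchmarks"])

    @router.get("/eval/benchmarks", response_model=ListBenchmarksResponse)
    async def list_benchmarks(
        request: Annotated[ListBenchmarksRequest, Depends()],
    ) -> ListBenchmarksResponse:
        # Query parameters are collected into a request model and handed to the impl.
        return await impl.list_benchmarks(request)

    @router.post("/eval/benchmarks", deprecated=True)
    async def register_benchmark(
        request: Annotated[RegisterBenchmarkRequest, Body(...)],
    ) -> None:
        # The JSON body is parsed into a single request model instead of loose kwargs.
        return await impl.register_benchmark(request)

    return router
```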

Closes: #4308 

## Test Plan

CI and `curl http://localhost:8321/v1/inspect/routes | jq '.data[] |
select(.route | contains("benchmark"))'`
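
The same check can also be scripted; a minimal sketch, assuming a stack running on
the default local port and the route payload shape targeted by the `jq` filter above:

```python
# Sketch: list registered routes and keep those mentioning "benchmark"
# (assumes a stack serving on localhost:8321).
import httpx

resp = httpx.get("http://localhost:8321/v1/inspect/routes")
resp.raise_for_status()

benchmark_routes = [r for r in resp.json()["data"] if "benchmark" in r["route"]]
assert benchmark_routes, "expected benchmark routes to be registered"
for route in benchmark_routes:
    print(route)
```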

---------

Signed-off-by: Sébastien Han <seb@redhat.com>
Sébastien Han 2025-12-10 15:04:27 +01:00 committed by GitHub
parent 661985e240
commit ff375f1abb
18 changed files with 862 additions and 195 deletions

View file

@@ -3404,7 +3404,7 @@ paths:
  $ref: '#/components/responses/DefaultError'
  tags:
  - Benchmarks
- summary: List Benchmarks
+ summary: List all benchmarks.
  description: List all benchmarks.
  operationId: list_benchmarks_v1alpha_eval_benchmarks_get
  post:
@@ -3422,10 +3422,10 @@ paths:
  description: Default Response
  $ref: '#/components/responses/DefaultError'
  '204':
- description: Successful Response
+ description: The benchmark was successfully registered.
  tags:
  - Benchmarks
- summary: Register Benchmark
+ summary: Register a benchmark.
  description: Register a benchmark.
  operationId: register_benchmark_v1alpha_eval_benchmarks_post
  requestBody:
@@ -3445,20 +3445,20 @@ paths:
  schema:
  $ref: '#/components/schemas/Benchmark'
  '400':
- description: Bad Request
  $ref: '#/components/responses/BadRequest400'
+ description: Bad Request
  '429':
- description: Too Many Requests
  $ref: '#/components/responses/TooManyRequests429'
+ description: Too Many Requests
  '500':
- description: Internal Server Error
  $ref: '#/components/responses/InternalServerError500'
+ description: Internal Server Error
  default:
- description: Default Response
  $ref: '#/components/responses/DefaultError'
+ description: Default Response
  tags:
  - Benchmarks
- summary: Get Benchmark
+ summary: Get a benchmark by its ID.
  description: Get a benchmark by its ID.
  operationId: get_benchmark_v1alpha_eval_benchmarks__benchmark_id__get
  parameters:
@@ -3467,26 +3467,28 @@ paths:
  required: true
  schema:
  type: string
- description: 'Path parameter: benchmark_id'
+ description: The ID of the benchmark to get.
+ title: Benchmark Id
+ description: The ID of the benchmark to get.
  delete:
  responses:
  '400':
- description: Bad Request
  $ref: '#/components/responses/BadRequest400'
+ description: Bad Request
  '429':
- description: Too Many Requests
  $ref: '#/components/responses/TooManyRequests429'
+ description: Too Many Requests
  '500':
- description: Internal Server Error
  $ref: '#/components/responses/InternalServerError500'
+ description: Internal Server Error
  default:
- description: Default Response
  $ref: '#/components/responses/DefaultError'
+ description: Default Response
  '204':
- description: Successful Response
+ description: The benchmark was successfully unregistered.
  tags:
  - Benchmarks
- summary: Unregister Benchmark
+ summary: Unregister a benchmark.
  description: Unregister a benchmark.
  operationId: unregister_benchmark_v1alpha_eval_benchmarks__benchmark_id__delete
  parameters:
@@ -3495,7 +3497,9 @@ paths:
  required: true
  schema:
  type: string
- description: 'Path parameter: benchmark_id'
+ description: The ID of the benchmark to unregister.
+ title: Benchmark Id
+ description: The ID of the benchmark to unregister.
  deprecated: true
  /v1alpha/eval/benchmarks/{benchmark_id}/evaluations:
  post:
@@ -10391,20 +10395,23 @@ components:
  type: string
  const: benchmark
  title: Type
+ description: The resource type, always benchmark.
  default: benchmark
  dataset_id:
  type: string
  title: Dataset Id
+ description: Identifier of the dataset to use for the benchmark evaluation.
  scoring_functions:
  items:
  type: string
  type: array
  title: Scoring Functions
+ description: List of scoring function identifiers to apply during evaluation.
  metadata:
  additionalProperties: true
  type: object
  title: Metadata
- description: Metadata for this evaluation task
+ description: Metadata for this evaluation task.
  type: object
  required:
  - identifier
@@ -10420,10 +10427,12 @@ components:
  $ref: '#/components/schemas/Benchmark'
  type: array
  title: Data
+ description: List of benchmark objects.
  type: object
  required:
  - data
  title: ListBenchmarksResponse
+ description: Response containing a list of benchmark objects.
  BenchmarkConfig:
  properties:
  eval_candidate:
@@ -11385,33 +11394,40 @@ components:
  benchmark_id:
  type: string
  title: Benchmark Id
+ description: The ID of the benchmark to register.
  dataset_id:
  type: string
  title: Dataset Id
+ description: The ID of the dataset to use for the benchmark.
  scoring_functions:
  items:
  type: string
  type: array
  title: Scoring Functions
+ description: The scoring functions to use for the benchmark.
  provider_benchmark_id:
  anyOf:
  - type: string
  - type: 'null'
+ description: The ID of the provider benchmark to use for the benchmark.
  provider_id:
  anyOf:
  - type: string
  - type: 'null'
+ description: The ID of the provider to use for the benchmark.
  metadata:
  anyOf:
  - additionalProperties: true
  type: object
  - type: 'null'
+ description: The metadata to use for the benchmark.
  type: object
  required:
  - benchmark_id
  - dataset_id
  - scoring_functions
  title: RegisterBenchmarkRequest
+ description: Request model for registering a benchmark.
  AllowedToolsFilter:
  properties:
  tool_names:
@@ -12617,6 +12633,33 @@ components:
  - batch_id
  title: CancelBatchRequest
  type: object
ListBenchmarksRequest:
description: Request model for listing benchmarks.
properties: {}
title: ListBenchmarksRequest
type: object
GetBenchmarkRequest:
description: Request model for getting a benchmark.
properties:
benchmark_id:
description: The ID of the benchmark to get.
title: Benchmark Id
type: string
required:
- benchmark_id
title: GetBenchmarkRequest
type: object
UnregisterBenchmarkRequest:
description: Request model for unregistering a benchmark.
properties:
benchmark_id:
description: The ID of the benchmark to unregister.
title: Benchmark Id
type: string
required:
- benchmark_id
title: UnregisterBenchmarkRequest
type: object
  DialogType:
  description: Parameter type for dialog data with semantic output labels.
  properties:

View file

@ -794,7 +794,7 @@ paths:
$ref: '#/components/responses/DefaultError' $ref: '#/components/responses/DefaultError'
tags: tags:
- Benchmarks - Benchmarks
summary: List Benchmarks summary: List all benchmarks.
description: List all benchmarks. description: List all benchmarks.
operationId: list_benchmarks_v1alpha_eval_benchmarks_get operationId: list_benchmarks_v1alpha_eval_benchmarks_get
post: post:
@ -812,10 +812,10 @@ paths:
description: Default Response description: Default Response
$ref: '#/components/responses/DefaultError' $ref: '#/components/responses/DefaultError'
'204': '204':
description: Successful Response description: The benchmark was successfully registered.
tags: tags:
- Benchmarks - Benchmarks
summary: Register Benchmark summary: Register a benchmark.
description: Register a benchmark. description: Register a benchmark.
operationId: register_benchmark_v1alpha_eval_benchmarks_post operationId: register_benchmark_v1alpha_eval_benchmarks_post
requestBody: requestBody:
@ -835,20 +835,20 @@ paths:
schema: schema:
$ref: '#/components/schemas/Benchmark' $ref: '#/components/schemas/Benchmark'
'400': '400':
description: Bad Request
$ref: '#/components/responses/BadRequest400' $ref: '#/components/responses/BadRequest400'
description: Bad Request
'429': '429':
description: Too Many Requests
$ref: '#/components/responses/TooManyRequests429' $ref: '#/components/responses/TooManyRequests429'
description: Too Many Requests
'500': '500':
description: Internal Server Error
$ref: '#/components/responses/InternalServerError500' $ref: '#/components/responses/InternalServerError500'
description: Internal Server Error
default: default:
description: Default Response
$ref: '#/components/responses/DefaultError' $ref: '#/components/responses/DefaultError'
description: Default Response
tags: tags:
- Benchmarks - Benchmarks
summary: Get Benchmark summary: Get a benchmark by its ID.
description: Get a benchmark by its ID. description: Get a benchmark by its ID.
operationId: get_benchmark_v1alpha_eval_benchmarks__benchmark_id__get operationId: get_benchmark_v1alpha_eval_benchmarks__benchmark_id__get
parameters: parameters:
@ -857,26 +857,28 @@ paths:
required: true required: true
schema: schema:
type: string type: string
description: 'Path parameter: benchmark_id' description: The ID of the benchmark to get.
title: Benchmark Id
description: The ID of the benchmark to get.
delete: delete:
responses: responses:
'400': '400':
description: Bad Request
$ref: '#/components/responses/BadRequest400' $ref: '#/components/responses/BadRequest400'
description: Bad Request
'429': '429':
description: Too Many Requests
$ref: '#/components/responses/TooManyRequests429' $ref: '#/components/responses/TooManyRequests429'
description: Too Many Requests
'500': '500':
description: Internal Server Error
$ref: '#/components/responses/InternalServerError500' $ref: '#/components/responses/InternalServerError500'
description: Internal Server Error
default: default:
description: Default Response
$ref: '#/components/responses/DefaultError' $ref: '#/components/responses/DefaultError'
description: Default Response
'204': '204':
description: Successful Response description: The benchmark was successfully unregistered.
tags: tags:
- Benchmarks - Benchmarks
summary: Unregister Benchmark summary: Unregister a benchmark.
description: Unregister a benchmark. description: Unregister a benchmark.
operationId: unregister_benchmark_v1alpha_eval_benchmarks__benchmark_id__delete operationId: unregister_benchmark_v1alpha_eval_benchmarks__benchmark_id__delete
parameters: parameters:
@ -885,7 +887,9 @@ paths:
required: true required: true
schema: schema:
type: string type: string
description: 'Path parameter: benchmark_id' description: The ID of the benchmark to unregister.
title: Benchmark Id
description: The ID of the benchmark to unregister.
deprecated: true deprecated: true
components: components:
schemas: schemas:
@ -7375,20 +7379,23 @@ components:
type: string type: string
const: benchmark const: benchmark
title: Type title: Type
description: The resource type, always benchmark.
default: benchmark default: benchmark
dataset_id: dataset_id:
type: string type: string
title: Dataset Id title: Dataset Id
description: Identifier of the dataset to use for the benchmark evaluation.
scoring_functions: scoring_functions:
items: items:
type: string type: string
type: array type: array
title: Scoring Functions title: Scoring Functions
description: List of scoring function identifiers to apply during evaluation.
metadata: metadata:
additionalProperties: true additionalProperties: true
type: object type: object
title: Metadata title: Metadata
description: Metadata for this evaluation task description: Metadata for this evaluation task.
type: object type: object
required: required:
- identifier - identifier
@ -7404,10 +7411,12 @@ components:
$ref: '#/components/schemas/Benchmark' $ref: '#/components/schemas/Benchmark'
type: array type: array
title: Data title: Data
description: List of benchmark objects.
type: object type: object
required: required:
- data - data
title: ListBenchmarksResponse title: ListBenchmarksResponse
description: Response containing a list of benchmark objects.
BenchmarkConfig: BenchmarkConfig:
properties: properties:
eval_candidate: eval_candidate:
@ -8369,33 +8378,40 @@ components:
benchmark_id: benchmark_id:
type: string type: string
title: Benchmark Id title: Benchmark Id
description: The ID of the benchmark to register.
dataset_id: dataset_id:
type: string type: string
title: Dataset Id title: Dataset Id
description: The ID of the dataset to use for the benchmark.
scoring_functions: scoring_functions:
items: items:
type: string type: string
type: array type: array
title: Scoring Functions title: Scoring Functions
description: The scoring functions to use for the benchmark.
provider_benchmark_id: provider_benchmark_id:
anyOf: anyOf:
- type: string - type: string
- type: 'null' - type: 'null'
description: The ID of the provider benchmark to use for the benchmark.
provider_id: provider_id:
anyOf: anyOf:
- type: string - type: string
- type: 'null' - type: 'null'
description: The ID of the provider to use for the benchmark.
metadata: metadata:
anyOf: anyOf:
- additionalProperties: true - additionalProperties: true
type: object type: object
- type: 'null' - type: 'null'
description: The metadata to use for the benchmark.
type: object type: object
required: required:
- benchmark_id - benchmark_id
- dataset_id - dataset_id
- scoring_functions - scoring_functions
title: RegisterBenchmarkRequest title: RegisterBenchmarkRequest
description: Request model for registering a benchmark.
AllowedToolsFilter: AllowedToolsFilter:
properties: properties:
tool_names: tool_names:
@ -9601,6 +9617,33 @@ components:
- batch_id - batch_id
title: CancelBatchRequest title: CancelBatchRequest
type: object type: object
ListBenchmarksRequest:
description: Request model for listing benchmarks.
properties: {}
title: ListBenchmarksRequest
type: object
GetBenchmarkRequest:
description: Request model for getting a benchmark.
properties:
benchmark_id:
description: The ID of the benchmark to get.
title: Benchmark Id
type: string
required:
- benchmark_id
title: GetBenchmarkRequest
type: object
UnregisterBenchmarkRequest:
description: Request model for unregistering a benchmark.
properties:
benchmark_id:
description: The ID of the benchmark to unregister.
title: Benchmark Id
type: string
required:
- benchmark_id
title: UnregisterBenchmarkRequest
type: object
DialogType: DialogType:
description: Parameter type for dialog data with semantic output labels. description: Parameter type for dialog data with semantic output labels.
properties: properties:

View file

@ -188,7 +188,7 @@ paths:
$ref: '#/components/responses/DefaultError' $ref: '#/components/responses/DefaultError'
tags: tags:
- Benchmarks - Benchmarks
summary: List Benchmarks summary: List all benchmarks.
description: List all benchmarks. description: List all benchmarks.
operationId: list_benchmarks_v1alpha_eval_benchmarks_get operationId: list_benchmarks_v1alpha_eval_benchmarks_get
/v1alpha/eval/benchmarks/{benchmark_id}: /v1alpha/eval/benchmarks/{benchmark_id}:
@ -201,20 +201,20 @@ paths:
schema: schema:
$ref: '#/components/schemas/Benchmark' $ref: '#/components/schemas/Benchmark'
'400': '400':
description: Bad Request
$ref: '#/components/responses/BadRequest400' $ref: '#/components/responses/BadRequest400'
description: Bad Request
'429': '429':
description: Too Many Requests
$ref: '#/components/responses/TooManyRequests429' $ref: '#/components/responses/TooManyRequests429'
description: Too Many Requests
'500': '500':
description: Internal Server Error
$ref: '#/components/responses/InternalServerError500' $ref: '#/components/responses/InternalServerError500'
description: Internal Server Error
default: default:
description: Default Response
$ref: '#/components/responses/DefaultError' $ref: '#/components/responses/DefaultError'
description: Default Response
tags: tags:
- Benchmarks - Benchmarks
summary: Get Benchmark summary: Get a benchmark by its ID.
description: Get a benchmark by its ID. description: Get a benchmark by its ID.
operationId: get_benchmark_v1alpha_eval_benchmarks__benchmark_id__get operationId: get_benchmark_v1alpha_eval_benchmarks__benchmark_id__get
parameters: parameters:
@ -223,7 +223,9 @@ paths:
required: true required: true
schema: schema:
type: string type: string
description: 'Path parameter: benchmark_id' description: The ID of the benchmark to get.
title: Benchmark Id
description: The ID of the benchmark to get.
/v1alpha/eval/benchmarks/{benchmark_id}/evaluations: /v1alpha/eval/benchmarks/{benchmark_id}/evaluations:
post: post:
responses: responses:
@ -6517,20 +6519,23 @@ components:
type: string type: string
const: benchmark const: benchmark
title: Type title: Type
description: The resource type, always benchmark.
default: benchmark default: benchmark
dataset_id: dataset_id:
type: string type: string
title: Dataset Id title: Dataset Id
description: Identifier of the dataset to use for the benchmark evaluation.
scoring_functions: scoring_functions:
items: items:
type: string type: string
type: array type: array
title: Scoring Functions title: Scoring Functions
description: List of scoring function identifiers to apply during evaluation.
metadata: metadata:
additionalProperties: true additionalProperties: true
type: object type: object
title: Metadata title: Metadata
description: Metadata for this evaluation task description: Metadata for this evaluation task.
type: object type: object
required: required:
- identifier - identifier
@ -6546,10 +6551,12 @@ components:
$ref: '#/components/schemas/Benchmark' $ref: '#/components/schemas/Benchmark'
type: array type: array
title: Data title: Data
description: List of benchmark objects.
type: object type: object
required: required:
- data - data
title: ListBenchmarksResponse title: ListBenchmarksResponse
description: Response containing a list of benchmark objects.
BenchmarkConfig: BenchmarkConfig:
properties: properties:
eval_candidate: eval_candidate:
@ -7346,6 +7353,45 @@ components:
- $ref: '#/components/schemas/RowsDataSource' - $ref: '#/components/schemas/RowsDataSource'
title: RowsDataSource title: RowsDataSource
title: URIDataSource | RowsDataSource title: URIDataSource | RowsDataSource
RegisterBenchmarkRequest:
properties:
benchmark_id:
type: string
title: Benchmark Id
description: The ID of the benchmark to register.
dataset_id:
type: string
title: Dataset Id
description: The ID of the dataset to use for the benchmark.
scoring_functions:
items:
type: string
type: array
title: Scoring Functions
description: The scoring functions to use for the benchmark.
provider_benchmark_id:
anyOf:
- type: string
- type: 'null'
description: The ID of the provider benchmark to use for the benchmark.
provider_id:
anyOf:
- type: string
- type: 'null'
description: The ID of the provider to use for the benchmark.
metadata:
anyOf:
- additionalProperties: true
type: object
- type: 'null'
description: The metadata to use for the benchmark.
type: object
required:
- benchmark_id
- dataset_id
- scoring_functions
title: RegisterBenchmarkRequest
description: Request model for registering a benchmark.
AllowedToolsFilter: AllowedToolsFilter:
properties: properties:
tool_names: tool_names:
@ -8395,6 +8441,33 @@ components:
- batch_id - batch_id
title: CancelBatchRequest title: CancelBatchRequest
type: object type: object
ListBenchmarksRequest:
description: Request model for listing benchmarks.
properties: {}
title: ListBenchmarksRequest
type: object
GetBenchmarkRequest:
description: Request model for getting a benchmark.
properties:
benchmark_id:
description: The ID of the benchmark to get.
title: Benchmark Id
type: string
required:
- benchmark_id
title: GetBenchmarkRequest
type: object
UnregisterBenchmarkRequest:
description: Request model for unregistering a benchmark.
properties:
benchmark_id:
description: The ID of the benchmark to unregister.
title: Benchmark Id
type: string
required:
- benchmark_id
title: UnregisterBenchmarkRequest
type: object
DialogType: DialogType:
description: Parameter type for dialog data with semantic output labels. description: Parameter type for dialog data with semantic output labels.
properties: properties:

View file

@ -9166,20 +9166,23 @@ components:
type: string type: string
const: benchmark const: benchmark
title: Type title: Type
description: The resource type, always benchmark.
default: benchmark default: benchmark
dataset_id: dataset_id:
type: string type: string
title: Dataset Id title: Dataset Id
description: Identifier of the dataset to use for the benchmark evaluation.
scoring_functions: scoring_functions:
items: items:
type: string type: string
type: array type: array
title: Scoring Functions title: Scoring Functions
description: List of scoring function identifiers to apply during evaluation.
metadata: metadata:
additionalProperties: true additionalProperties: true
type: object type: object
title: Metadata title: Metadata
description: Metadata for this evaluation task description: Metadata for this evaluation task.
type: object type: object
required: required:
- identifier - identifier
@ -9195,10 +9198,12 @@ components:
$ref: '#/components/schemas/Benchmark' $ref: '#/components/schemas/Benchmark'
type: array type: array
title: Data title: Data
description: List of benchmark objects.
type: object type: object
required: required:
- data - data
title: ListBenchmarksResponse title: ListBenchmarksResponse
description: Response containing a list of benchmark objects.
BenchmarkConfig: BenchmarkConfig:
properties: properties:
eval_candidate: eval_candidate:
@ -9848,6 +9853,45 @@ components:
- $ref: '#/components/schemas/RowsDataSource' - $ref: '#/components/schemas/RowsDataSource'
title: RowsDataSource title: RowsDataSource
title: URIDataSource | RowsDataSource title: URIDataSource | RowsDataSource
RegisterBenchmarkRequest:
properties:
benchmark_id:
type: string
title: Benchmark Id
description: The ID of the benchmark to register.
dataset_id:
type: string
title: Dataset Id
description: The ID of the dataset to use for the benchmark.
scoring_functions:
items:
type: string
type: array
title: Scoring Functions
description: The scoring functions to use for the benchmark.
provider_benchmark_id:
anyOf:
- type: string
- type: 'null'
description: The ID of the provider benchmark to use for the benchmark.
provider_id:
anyOf:
- type: string
- type: 'null'
description: The ID of the provider to use for the benchmark.
metadata:
anyOf:
- additionalProperties: true
type: object
- type: 'null'
description: The metadata to use for the benchmark.
type: object
required:
- benchmark_id
- dataset_id
- scoring_functions
title: RegisterBenchmarkRequest
description: Request model for registering a benchmark.
AllowedToolsFilter: AllowedToolsFilter:
properties: properties:
tool_names: tool_names:
@ -11053,6 +11097,33 @@ components:
- batch_id - batch_id
title: CancelBatchRequest title: CancelBatchRequest
type: object type: object
ListBenchmarksRequest:
description: Request model for listing benchmarks.
properties: {}
title: ListBenchmarksRequest
type: object
GetBenchmarkRequest:
description: Request model for getting a benchmark.
properties:
benchmark_id:
description: The ID of the benchmark to get.
title: Benchmark Id
type: string
required:
- benchmark_id
title: GetBenchmarkRequest
type: object
UnregisterBenchmarkRequest:
description: Request model for unregistering a benchmark.
properties:
benchmark_id:
description: The ID of the benchmark to unregister.
title: Benchmark Id
type: string
required:
- benchmark_id
title: UnregisterBenchmarkRequest
type: object
DialogType: DialogType:
description: Parameter type for dialog data with semantic output labels. description: Parameter type for dialog data with semantic output labels.
properties: properties:

View file

@@ -3404,7 +3404,7 @@ paths:
  $ref: '#/components/responses/DefaultError'
  tags:
  - Benchmarks
- summary: List Benchmarks
+ summary: List all benchmarks.
  description: List all benchmarks.
  operationId: list_benchmarks_v1alpha_eval_benchmarks_get
  post:
@@ -3422,10 +3422,10 @@ paths:
  description: Default Response
  $ref: '#/components/responses/DefaultError'
  '204':
- description: Successful Response
+ description: The benchmark was successfully registered.
  tags:
  - Benchmarks
- summary: Register Benchmark
+ summary: Register a benchmark.
  description: Register a benchmark.
  operationId: register_benchmark_v1alpha_eval_benchmarks_post
  requestBody:
@@ -3445,20 +3445,20 @@ paths:
  schema:
  $ref: '#/components/schemas/Benchmark'
  '400':
- description: Bad Request
  $ref: '#/components/responses/BadRequest400'
+ description: Bad Request
  '429':
- description: Too Many Requests
  $ref: '#/components/responses/TooManyRequests429'
+ description: Too Many Requests
  '500':
- description: Internal Server Error
  $ref: '#/components/responses/InternalServerError500'
+ description: Internal Server Error
  default:
- description: Default Response
  $ref: '#/components/responses/DefaultError'
+ description: Default Response
  tags:
  - Benchmarks
- summary: Get Benchmark
+ summary: Get a benchmark by its ID.
  description: Get a benchmark by its ID.
  operationId: get_benchmark_v1alpha_eval_benchmarks__benchmark_id__get
  parameters:
@@ -3467,26 +3467,28 @@ paths:
  required: true
  schema:
  type: string
- description: 'Path parameter: benchmark_id'
+ description: The ID of the benchmark to get.
+ title: Benchmark Id
+ description: The ID of the benchmark to get.
  delete:
  responses:
  '400':
- description: Bad Request
  $ref: '#/components/responses/BadRequest400'
+ description: Bad Request
  '429':
- description: Too Many Requests
  $ref: '#/components/responses/TooManyRequests429'
+ description: Too Many Requests
  '500':
- description: Internal Server Error
  $ref: '#/components/responses/InternalServerError500'
+ description: Internal Server Error
  default:
- description: Default Response
  $ref: '#/components/responses/DefaultError'
+ description: Default Response
  '204':
- description: Successful Response
+ description: The benchmark was successfully unregistered.
  tags:
  - Benchmarks
- summary: Unregister Benchmark
+ summary: Unregister a benchmark.
  description: Unregister a benchmark.
  operationId: unregister_benchmark_v1alpha_eval_benchmarks__benchmark_id__delete
  parameters:
@@ -3495,7 +3497,9 @@ paths:
  required: true
  schema:
  type: string
- description: 'Path parameter: benchmark_id'
+ description: The ID of the benchmark to unregister.
+ title: Benchmark Id
+ description: The ID of the benchmark to unregister.
  deprecated: true
  /v1alpha/eval/benchmarks/{benchmark_id}/evaluations:
  post:
@@ -10391,20 +10395,23 @@ components:
  type: string
  const: benchmark
  title: Type
+ description: The resource type, always benchmark.
  default: benchmark
  dataset_id:
  type: string
  title: Dataset Id
+ description: Identifier of the dataset to use for the benchmark evaluation.
  scoring_functions:
  items:
  type: string
  type: array
  title: Scoring Functions
+ description: List of scoring function identifiers to apply during evaluation.
  metadata:
  additionalProperties: true
  type: object
  title: Metadata
- description: Metadata for this evaluation task
+ description: Metadata for this evaluation task.
  type: object
  required:
  - identifier
@@ -10420,10 +10427,12 @@ components:
  $ref: '#/components/schemas/Benchmark'
  type: array
  title: Data
+ description: List of benchmark objects.
  type: object
  required:
  - data
  title: ListBenchmarksResponse
+ description: Response containing a list of benchmark objects.
  BenchmarkConfig:
  properties:
  eval_candidate:
@@ -11385,33 +11394,40 @@ components:
  benchmark_id:
  type: string
  title: Benchmark Id
+ description: The ID of the benchmark to register.
  dataset_id:
  type: string
  title: Dataset Id
+ description: The ID of the dataset to use for the benchmark.
  scoring_functions:
  items:
  type: string
  type: array
  title: Scoring Functions
+ description: The scoring functions to use for the benchmark.
  provider_benchmark_id:
  anyOf:
  - type: string
  - type: 'null'
+ description: The ID of the provider benchmark to use for the benchmark.
  provider_id:
  anyOf:
  - type: string
  - type: 'null'
+ description: The ID of the provider to use for the benchmark.
  metadata:
  anyOf:
  - additionalProperties: true
  type: object
  - type: 'null'
+ description: The metadata to use for the benchmark.
  type: object
  required:
  - benchmark_id
  - dataset_id
  - scoring_functions
  title: RegisterBenchmarkRequest
+ description: Request model for registering a benchmark.
  AllowedToolsFilter:
  properties:
  tool_names:
@@ -12617,6 +12633,33 @@ components:
  - batch_id
  title: CancelBatchRequest
  type: object
ListBenchmarksRequest:
description: Request model for listing benchmarks.
properties: {}
title: ListBenchmarksRequest
type: object
GetBenchmarkRequest:
description: Request model for getting a benchmark.
properties:
benchmark_id:
description: The ID of the benchmark to get.
title: Benchmark Id
type: string
required:
- benchmark_id
title: GetBenchmarkRequest
type: object
UnregisterBenchmarkRequest:
description: Request model for unregistering a benchmark.
properties:
benchmark_id:
description: The ID of the benchmark to unregister.
title: Benchmark Id
type: string
required:
- benchmark_id
title: UnregisterBenchmarkRequest
type: object
  DialogType:
  description: Parameter type for dialog data with semantic output labels.
  properties:

View file

@@ -9,6 +9,7 @@ from importlib.metadata import version
  from pydantic import BaseModel
  from llama_stack.core.datatypes import StackConfig
+ from llama_stack.core.distribution import builtin_automatically_routed_apis
  from llama_stack.core.external import load_external_apis
  from llama_stack.core.server.fastapi_router_registry import (
  _ROUTER_FACTORIES,
@@ -65,6 +66,17 @@ class DistributionInspectImpl(Inspect):
  def _get_provider_types(api: Api) -> list[str]:
  if api.value in ["providers", "inspect"]:
  return [] # These APIs don't have "real" providers they're internal to the stack
+ # For routing table APIs, look up providers from their router API
+ # (e.g., benchmarks -> eval, models -> inference, etc.)
+ auto_routed_apis = builtin_automatically_routed_apis()
+ for auto_routed in auto_routed_apis:
+ if auto_routed.routing_table_api == api:
+ # This is a routing table API, use its router API for providers
+ providers = config.providers.get(auto_routed.router_api.value, [])
+ return [p.provider_type for p in providers] if providers else []
+ # Regular API, look up providers directly
  providers = config.providers.get(api.value, [])
  return [p.provider_type for p in providers] if providers else []

View file

@@ -10,6 +10,7 @@ import json
  import logging # allow-direct-logging
  import os
  import sys
+ import typing
  from enum import Enum
  from io import BytesIO
  from pathlib import Path
@@ -490,6 +491,25 @@ class AsyncLlamaStackAsLibraryClient(AsyncLlamaStackClient):
  unwrapped_body_param = param
  break
# Check for parameters with Depends() annotation (FastAPI router endpoints)
# These need special handling: construct the request model from body
depends_param = None
for param in params_list:
param_type = param.annotation
if get_origin(param_type) is typing.Annotated:
args = get_args(param_type)
if len(args) > 1:
# Check if any metadata is Depends
metadata = args[1:]
for item in metadata:
# Check if it's a Depends object (has dependency attribute or is a callable)
# Depends objects typically have a 'dependency' attribute or are callable functions
if hasattr(item, "dependency") or callable(item) or "Depends" in str(type(item)):
depends_param = param
break
if depends_param:
break
  # Convert parameters to Pydantic models where needed
  converted_body = {}
  for param_name, param in sig.parameters.items():
@@ -500,6 +520,27 @@ class AsyncLlamaStackAsLibraryClient(AsyncLlamaStackClient):
  else:
  converted_body[param_name] = convert_to_pydantic(param.annotation, value)
# Handle Depends parameter: construct request model from body
if depends_param and depends_param.name not in converted_body:
param_type = depends_param.annotation
if get_origin(param_type) is typing.Annotated:
base_type = get_args(param_type)[0]
# Handle Union types (e.g., SomeRequestModel | None) - extract the non-None type
# In Python 3.10+, Union types created with | syntax are still typing.Union
origin = get_origin(base_type)
if origin is Union:
# Get the first non-None type from the Union
union_args = get_args(base_type)
base_type = next(
(t for t in union_args if t is not type(None) and t is not None),
union_args[0] if union_args else None,
)
# Only try to instantiate if it's a class (not a Union or other non-callable type)
if base_type is not None and inspect.isclass(base_type) and callable(base_type):
# Construct the request model from all body parameters
converted_body[depends_param.name] = base_type(**body)
  # handle unwrapped body parameter after processing all named parameters
  if unwrapped_body_param:
  base_type = get_args(unwrapped_body_param.annotation)[0]

View file

@@ -4,13 +4,20 @@
  # This source code is licensed under the terms described in the LICENSE file in
  # the root directory of this source tree.
- from typing import Any
  from llama_stack.core.datatypes import (
  BenchmarkWithOwner,
  )
  from llama_stack.log import get_logger
- from llama_stack_api import Benchmark, Benchmarks, ListBenchmarksResponse
+ from llama_stack_api import (
+ Benchmark,
+ Benchmarks,
+ GetBenchmarkRequest,
+ ListBenchmarksRequest,
+ ListBenchmarksResponse,
+ RegisterBenchmarkRequest,
+ UnregisterBenchmarkRequest,
+ )
  from .common import CommonRoutingTableImpl
@@ -18,26 +25,21 @@ logger = get_logger(name=__name__, category="core::routing_tables")
  class BenchmarksRoutingTable(CommonRoutingTableImpl, Benchmarks):
- async def list_benchmarks(self) -> ListBenchmarksResponse:
+ async def list_benchmarks(self, request: ListBenchmarksRequest) -> ListBenchmarksResponse:
  return ListBenchmarksResponse(data=await self.get_all_with_type("benchmark"))
- async def get_benchmark(self, benchmark_id: str) -> Benchmark:
+ async def get_benchmark(self, request: GetBenchmarkRequest) -> Benchmark:
- benchmark = await self.get_object_by_identifier("benchmark", benchmark_id)
+ benchmark = await self.get_object_by_identifier("benchmark", request.benchmark_id)
  if benchmark is None:
- raise ValueError(f"Benchmark '{benchmark_id}' not found")
+ raise ValueError(f"Benchmark '{request.benchmark_id}' not found")
  return benchmark
  async def register_benchmark(
  self,
- benchmark_id: str,
+ request: RegisterBenchmarkRequest,
- dataset_id: str,
- scoring_functions: list[str],
- metadata: dict[str, Any] | None = None,
- provider_benchmark_id: str | None = None,
- provider_id: str | None = None,
  ) -> None:
- if metadata is None:
+ metadata = request.metadata if request.metadata is not None else {}
- metadata = {}
+ provider_id = request.provider_id
  if provider_id is None:
  if len(self.impls_by_provider_id) == 1:
  provider_id = list(self.impls_by_provider_id.keys())[0]
@@ -45,18 +47,20 @@ class BenchmarksRoutingTable(CommonRoutingTableImpl, Benchmarks):
  raise ValueError(
  "No provider specified and multiple providers available. Please specify a provider_id."
  )
+ provider_benchmark_id = request.provider_benchmark_id
  if provider_benchmark_id is None:
- provider_benchmark_id = benchmark_id
+ provider_benchmark_id = request.benchmark_id
  benchmark = BenchmarkWithOwner(
- identifier=benchmark_id,
+ identifier=request.benchmark_id,
- dataset_id=dataset_id,
+ dataset_id=request.dataset_id,
- scoring_functions=scoring_functions,
+ scoring_functions=request.scoring_functions,
  metadata=metadata,
  provider_id=provider_id,
  provider_resource_id=provider_benchmark_id,
  )
  await self.register_object(benchmark)
- async def unregister_benchmark(self, benchmark_id: str) -> None:
+ async def unregister_benchmark(self, request: UnregisterBenchmarkRequest) -> None:
- existing_benchmark = await self.get_benchmark(benchmark_id)
+ get_request = GetBenchmarkRequest(benchmark_id=request.benchmark_id)
+ existing_benchmark = await self.get_benchmark(get_request)
  await self.unregister_object(existing_benchmark)

View file

@@ -17,7 +17,7 @@ from fastapi import APIRouter
  from fastapi.routing import APIRoute
  from starlette.routing import Route
- from llama_stack_api import batches
+ from llama_stack_api import batches, benchmarks
  # Router factories for APIs that have FastAPI routers
  # Add new APIs here as they are migrated to the router system
@@ -25,6 +25,7 @@ from llama_stack_api.datatypes import Api
  _ROUTER_FACTORIES: dict[str, Callable[[Any], APIRouter]] = {
  "batches": batches.fastapi_routes.create_router,
+ "benchmarks": benchmarks.fastapi_routes.create_router,
  }

View file

@@ -13,6 +13,11 @@ from aiohttp import hdrs
  from starlette.routing import Route
  from llama_stack.core.resolver import api_protocol_map
from llama_stack.core.server.fastapi_router_registry import (
_ROUTER_FACTORIES,
build_fastapi_router,
get_router_routes,
)
  from llama_stack_api import Api, ExternalApiSpec, WebMethod
  EndpointFunc = Callable[..., Any]
@@ -85,7 +90,53 @@ def initialize_route_impls(impls, external_apis: dict[Api, ExternalApiSpec] | No
  return f"^{pattern}$"
# Process routes from FastAPI routers
for api_name in _ROUTER_FACTORIES.keys():
api = Api(api_name)
if api not in impls:
continue
impl = impls[api]
router = build_fastapi_router(api, impl)
if router:
router_routes = get_router_routes(router)
for route in router_routes:
# Get the endpoint function from the route
# For FastAPI routes, the endpoint is the actual function
func = route.endpoint
if func is None:
continue
# Get the first (and typically only) method from the set, filtering out HEAD
available_methods = [m for m in (route.methods or []) if m != "HEAD"]
if not available_methods:
continue # Skip if only HEAD method is available
method = available_methods[0].lower()
if method not in route_impls:
route_impls[method] = {}
# Create a minimal WebMethod for router routes (needed for RouteMatch tuple)
# We don't have webmethod metadata for router routes, so create a minimal one
# that has the attributes used by the library client (descriptive_name for tracing)
#
# TODO: Long-term migration plan (once all APIs are migrated to FastAPI routers):
# - Extract summary from APIRoute: route.summary (available on FastAPI APIRoute objects)
# - Pass summary directly in RouteMatch instead of WebMethod
# - Remove this WebMethod() instantiation entirely
# - Update library_client.py to use the extracted summary instead of webmethod.descriptive_name
webmethod = WebMethod(descriptive_name=None)
route_impls[method][_convert_path_to_regex(route.path)] = (
func,
route.path,
webmethod,
)
# Process routes from legacy webmethod-based APIs
  for api, api_routes in api_to_routes.items():
+ # Skip APIs that have routers (already processed above)
+ if api.value in _ROUTER_FACTORIES:
+ continue
  if api not in impls:
  continue
  for route, webmethod in api_routes:

View file

@@ -6,12 +6,14 @@
  import asyncio
  import importlib.resources
+ import inspect
  import os
  import re
  import tempfile
- from typing import Any
+ from typing import Any, get_type_hints
  import yaml
+ from pydantic import BaseModel
  from llama_stack.core.conversations.conversations import ConversationServiceConfig, ConversationServiceImpl
  from llama_stack.core.datatypes import Provider, SafetyConfig, StackConfig, VectorStoresConfig
@@ -108,6 +110,81 @@ REGISTRY_REFRESH_TASK = None
  TEST_RECORDING_CONTEXT = None
def is_request_model(t: Any) -> bool:
"""Check if a type is a request model (Pydantic BaseModel).
Args:
t: The type to check
Returns:
True if the type is a Pydantic BaseModel subclass, False otherwise
"""
return inspect.isclass(t) and issubclass(t, BaseModel)
async def invoke_with_optional_request(method: Any) -> Any:
"""Invoke a method, automatically creating a request instance if needed.
For APIs that use request models, this will create an empty request object.
For backward compatibility, falls back to calling without arguments.
Uses get_type_hints() to resolve forward references (e.g., "ListBenchmarksRequest" -> actual class).
Handles methods with:
- No parameters: calls without arguments
- One or more request model parameters: creates empty instances for each
- Mixed parameters: creates request models, uses defaults for others
- Required non-request-model parameters without defaults: falls back to calling without arguments
Args:
method: The method to invoke
Returns:
The result of calling the method
"""
try:
hints = get_type_hints(method)
except Exception:
# Forward references can't be resolved, fall back to calling without request
return await method()
params = list(inspect.signature(method).parameters.values())
params = [p for p in params if p.name != "self"]
if not params:
return await method()
# Build arguments for the method call
args: dict[str, Any] = {}
can_call = True
for param in params:
param_type = hints.get(param.name)
# If it's a request model, try to create an empty instance
if param_type and is_request_model(param_type):
try:
args[param.name] = param_type()
except Exception:
# Request model requires arguments, can't create empty instance
can_call = False
break
# If it has a default value, we can skip it (will use default)
elif param.default != inspect.Parameter.empty:
continue
# Required parameter that's not a request model - can't provide it
else:
can_call = False
break
if can_call and args:
return await method(**args)
# Fall back to calling without arguments for backward compatibility
return await method()
  async def register_resources(run_config: StackConfig, impls: dict[Api, Any]):
  for rsrc, api, register_method, list_method in RESOURCES:
  objects = getattr(run_config.registered_resources, rsrc)
@@ -129,7 +206,7 @@ async def register_resources(run_config: StackConfig, impls: dict[Api, Any]):
  await method(**{k: getattr(obj, k) for k in obj.model_dump().keys()})
  method = getattr(impls[api], list_method)
- response = await method()
+ response = await invoke_with_optional_request(method)
  objects_to_process = response.data if hasattr(response, "data") else response

View file

@@ -40,7 +40,11 @@ from .benchmarks import (
  BenchmarkInput,
  Benchmarks,
  CommonBenchmarkFields,
+ GetBenchmarkRequest,
+ ListBenchmarksRequest,
  ListBenchmarksResponse,
+ RegisterBenchmarkRequest,
+ UnregisterBenchmarkRequest,
  )
  # Import commonly used types from common submodule
@@ -567,7 +571,11 @@ __all__ = [
  "LLMRAGQueryGeneratorConfig",
  "ListBatchesResponse",
  "RetrieveBatchRequest",
+ "GetBenchmarkRequest",
+ "ListBenchmarksRequest",
  "ListBenchmarksResponse",
+ "RegisterBenchmarkRequest",
+ "UnregisterBenchmarkRequest",
  "ListDatasetsResponse",
  "ListModelsResponse",
  "ListOpenAIChatCompletionResponse",

View file

@ -1,105 +0,0 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from typing import Any, Literal, Protocol, runtime_checkable
from pydantic import BaseModel, Field
from llama_stack_api.resource import Resource, ResourceType
from llama_stack_api.schema_utils import json_schema_type, webmethod
from llama_stack_api.version import LLAMA_STACK_API_V1ALPHA
class CommonBenchmarkFields(BaseModel):
dataset_id: str
scoring_functions: list[str]
metadata: dict[str, Any] = Field(
default_factory=dict,
description="Metadata for this evaluation task",
)
@json_schema_type
class Benchmark(CommonBenchmarkFields, Resource):
"""A benchmark resource for evaluating model performance.
:param dataset_id: Identifier of the dataset to use for the benchmark evaluation
:param scoring_functions: List of scoring function identifiers to apply during evaluation
:param metadata: Metadata for this evaluation task
:param type: The resource type, always benchmark
"""
type: Literal[ResourceType.benchmark] = ResourceType.benchmark
@property
def benchmark_id(self) -> str:
return self.identifier
@property
def provider_benchmark_id(self) -> str | None:
return self.provider_resource_id
class BenchmarkInput(CommonBenchmarkFields, BaseModel):
benchmark_id: str
provider_id: str | None = None
provider_benchmark_id: str | None = None
@json_schema_type
class ListBenchmarksResponse(BaseModel):
data: list[Benchmark]
@runtime_checkable
class Benchmarks(Protocol):
@webmethod(route="/eval/benchmarks", method="GET", level=LLAMA_STACK_API_V1ALPHA)
async def list_benchmarks(self) -> ListBenchmarksResponse:
"""List all benchmarks.
:returns: A ListBenchmarksResponse.
"""
...
@webmethod(route="/eval/benchmarks/{benchmark_id}", method="GET", level=LLAMA_STACK_API_V1ALPHA)
async def get_benchmark(
self,
benchmark_id: str,
) -> Benchmark:
"""Get a benchmark by its ID.
:param benchmark_id: The ID of the benchmark to get.
:returns: A Benchmark.
"""
...
@webmethod(route="/eval/benchmarks", method="POST", level=LLAMA_STACK_API_V1ALPHA, deprecated=True)
async def register_benchmark(
self,
benchmark_id: str,
dataset_id: str,
scoring_functions: list[str],
provider_benchmark_id: str | None = None,
provider_id: str | None = None,
metadata: dict[str, Any] | None = None,
) -> None:
"""Register a benchmark.
:param benchmark_id: The ID of the benchmark to register.
:param dataset_id: The ID of the dataset to use for the benchmark.
:param scoring_functions: The scoring functions to use for the benchmark.
:param provider_benchmark_id: The ID of the provider benchmark to use for the benchmark.
:param provider_id: The ID of the provider to use for the benchmark.
:param metadata: The metadata to use for the benchmark.
"""
...
@webmethod(route="/eval/benchmarks/{benchmark_id}", method="DELETE", level=LLAMA_STACK_API_V1ALPHA, deprecated=True)
async def unregister_benchmark(self, benchmark_id: str) -> None:
"""Unregister a benchmark.
:param benchmark_id: The ID of the benchmark to unregister.
"""
...

View file

@ -0,0 +1,43 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
"""Benchmarks API protocol and models.
This module contains the Benchmarks protocol definition.
Pydantic models are defined in llama_stack_api.benchmarks.models.
The FastAPI router is defined in llama_stack_api.benchmarks.fastapi_routes.
"""
# Import fastapi_routes for router factory access
from . import fastapi_routes
# Import protocol for re-export
from .api import Benchmarks
# Import models for re-export
from .models import (
Benchmark,
BenchmarkInput,
CommonBenchmarkFields,
GetBenchmarkRequest,
ListBenchmarksRequest,
ListBenchmarksResponse,
RegisterBenchmarkRequest,
UnregisterBenchmarkRequest,
)
__all__ = [
"Benchmarks",
"Benchmark",
"BenchmarkInput",
"CommonBenchmarkFields",
"ListBenchmarksResponse",
"ListBenchmarksRequest",
"GetBenchmarkRequest",
"RegisterBenchmarkRequest",
"UnregisterBenchmarkRequest",
"fastapi_routes",
]

View file

@ -0,0 +1,39 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from typing import Protocol, runtime_checkable
from .models import (
Benchmark,
GetBenchmarkRequest,
ListBenchmarksRequest,
ListBenchmarksResponse,
RegisterBenchmarkRequest,
UnregisterBenchmarkRequest,
)
@runtime_checkable
class Benchmarks(Protocol):
async def list_benchmarks(
self,
request: ListBenchmarksRequest,
) -> ListBenchmarksResponse: ...
async def get_benchmark(
self,
request: GetBenchmarkRequest,
) -> Benchmark: ...
async def register_benchmark(
self,
request: RegisterBenchmarkRequest,
) -> None: ...
async def unregister_benchmark(
self,
request: UnregisterBenchmarkRequest,
) -> None: ...

View file

@ -0,0 +1,109 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
"""FastAPI router for the Benchmarks API.
This module defines the FastAPI router for the Benchmarks API using standard
FastAPI route decorators. The router is defined in the API package to keep
all API-related code together.
"""
from typing import Annotated
from fastapi import APIRouter, Body, Depends
from llama_stack_api.router_utils import create_path_dependency, create_query_dependency, standard_responses
from llama_stack_api.version import LLAMA_STACK_API_V1ALPHA
from .api import Benchmarks
from .models import (
Benchmark,
GetBenchmarkRequest,
ListBenchmarksRequest,
ListBenchmarksResponse,
RegisterBenchmarkRequest,
UnregisterBenchmarkRequest,
)
# Automatically generate dependency functions from Pydantic models
# This ensures the models are the single source of truth for descriptions
get_list_benchmarks_request = create_query_dependency(ListBenchmarksRequest)
get_get_benchmark_request = create_path_dependency(GetBenchmarkRequest)
get_unregister_benchmark_request = create_path_dependency(UnregisterBenchmarkRequest)
def create_router(impl: Benchmarks) -> APIRouter:
"""Create a FastAPI router for the Benchmarks API.
Args:
impl: The Benchmarks implementation instance
Returns:
APIRouter configured for the Benchmarks API
"""
router = APIRouter(
prefix=f"/{LLAMA_STACK_API_V1ALPHA}",
tags=["Benchmarks"],
responses=standard_responses,
)
@router.get(
"/eval/benchmarks",
response_model=ListBenchmarksResponse,
summary="List all benchmarks.",
description="List all benchmarks.",
responses={
200: {"description": "A ListBenchmarksResponse."},
},
)
async def list_benchmarks(
request: Annotated[ListBenchmarksRequest, Depends(get_list_benchmarks_request)],
) -> ListBenchmarksResponse:
return await impl.list_benchmarks(request)
@router.get(
"/eval/benchmarks/{benchmark_id}",
response_model=Benchmark,
summary="Get a benchmark by its ID.",
description="Get a benchmark by its ID.",
responses={
200: {"description": "A Benchmark."},
},
)
async def get_benchmark(
request: Annotated[GetBenchmarkRequest, Depends(get_get_benchmark_request)],
) -> Benchmark:
return await impl.get_benchmark(request)
@router.post(
"/eval/benchmarks",
summary="Register a benchmark.",
description="Register a benchmark.",
responses={
200: {"description": "The benchmark was successfully registered."},
},
deprecated=True,
)
async def register_benchmark(
request: Annotated[RegisterBenchmarkRequest, Body(...)],
) -> None:
return await impl.register_benchmark(request)
@router.delete(
"/eval/benchmarks/{benchmark_id}",
summary="Unregister a benchmark.",
description="Unregister a benchmark.",
responses={
200: {"description": "The benchmark was successfully unregistered."},
},
deprecated=True,
)
async def unregister_benchmark(
request: Annotated[UnregisterBenchmarkRequest, Depends(get_unregister_benchmark_request)],
) -> None:
return await impl.unregister_benchmark(request)
return router

View file

@ -0,0 +1,109 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
"""Pydantic models for Benchmarks API requests and responses.
This module defines the request and response models for the Benchmarks API
using Pydantic with Field descriptions for OpenAPI schema generation.
"""
from typing import Any, Literal
from pydantic import BaseModel, Field
from llama_stack_api.resource import Resource, ResourceType
from llama_stack_api.schema_utils import json_schema_type
@json_schema_type
class ListBenchmarksRequest(BaseModel):
"""Request model for listing benchmarks."""
pass
@json_schema_type
class GetBenchmarkRequest(BaseModel):
"""Request model for getting a benchmark."""
benchmark_id: str = Field(..., description="The ID of the benchmark to get.")
@json_schema_type
class RegisterBenchmarkRequest(BaseModel):
"""Request model for registering a benchmark."""
benchmark_id: str = Field(..., description="The ID of the benchmark to register.")
dataset_id: str = Field(..., description="The ID of the dataset to use for the benchmark.")
scoring_functions: list[str] = Field(..., description="The scoring functions to use for the benchmark.")
provider_benchmark_id: str | None = Field(
default=None, description="The ID of the provider benchmark to use for the benchmark."
)
provider_id: str | None = Field(default=None, description="The ID of the provider to use for the benchmark.")
metadata: dict[str, Any] | None = Field(default=None, description="The metadata to use for the benchmark.")
@json_schema_type
class UnregisterBenchmarkRequest(BaseModel):
"""Request model for unregistering a benchmark."""
benchmark_id: str = Field(..., description="The ID of the benchmark to unregister.")
class CommonBenchmarkFields(BaseModel):
dataset_id: str = Field(..., description="Identifier of the dataset to use for the benchmark evaluation.")
scoring_functions: list[str] = Field(
..., description="List of scoring function identifiers to apply during evaluation."
)
metadata: dict[str, Any] = Field(
default_factory=dict,
description="Metadata for this evaluation task.",
)
@json_schema_type
class Benchmark(CommonBenchmarkFields, Resource):
"""A benchmark resource for evaluating model performance."""
type: Literal[ResourceType.benchmark] = Field(
default=ResourceType.benchmark,
description="The resource type, always benchmark.",
)
@property
def benchmark_id(self) -> str:
return self.identifier
@property
def provider_benchmark_id(self) -> str | None:
return self.provider_resource_id
class BenchmarkInput(CommonBenchmarkFields, BaseModel):
benchmark_id: str = Field(..., description="The ID of the benchmark.")
provider_id: str | None = Field(default=None, description="The ID of the provider to use for the benchmark.")
provider_benchmark_id: str | None = Field(
default=None, description="The ID of the provider benchmark to use for the benchmark."
)
@json_schema_type
class ListBenchmarksResponse(BaseModel):
"""Response containing a list of benchmark objects."""
data: list[Benchmark] = Field(..., description="List of benchmark objects.")
__all__ = [
"ListBenchmarksRequest",
"GetBenchmarkRequest",
"RegisterBenchmarkRequest",
"UnregisterBenchmarkRequest",
"CommonBenchmarkFields",
"Benchmark",
"BenchmarkInput",
"ListBenchmarksResponse",
]

View file

@ -22,14 +22,17 @@ from llama_stack_api import (
Api, Api,
Dataset, Dataset,
DatasetPurpose, DatasetPurpose,
ListBenchmarksRequest,
ListToolDefsResponse, ListToolDefsResponse,
Model, Model,
ModelNotFoundError, ModelNotFoundError,
ModelType, ModelType,
NumberType, NumberType,
RegisterBenchmarkRequest,
Shield, Shield,
ToolDef, ToolDef,
ToolGroup, ToolGroup,
UnregisterBenchmarkRequest,
URIDataSource, URIDataSource,
) )
@ -420,24 +423,26 @@ async def test_benchmarks_routing_table(cached_disk_dist_registry):
# Register multiple benchmarks and verify listing # Register multiple benchmarks and verify listing
await table.register_benchmark( await table.register_benchmark(
benchmark_id="test-benchmark", RegisterBenchmarkRequest(
dataset_id="test-dataset", benchmark_id="test-benchmark",
scoring_functions=["test-scoring-fn", "test-scoring-fn-2"], dataset_id="test-dataset",
scoring_functions=["test-scoring-fn", "test-scoring-fn-2"],
)
) )
benchmarks = await table.list_benchmarks() benchmarks = await table.list_benchmarks(ListBenchmarksRequest())
assert len(benchmarks.data) == 1 assert len(benchmarks.data) == 1
benchmark_ids = {b.identifier for b in benchmarks.data} benchmark_ids = {b.identifier for b in benchmarks.data}
assert "test-benchmark" in benchmark_ids assert "test-benchmark" in benchmark_ids
# Unregister the benchmark and verify removal # Unregister the benchmark and verify removal
await table.unregister_benchmark(benchmark_id="test-benchmark") await table.unregister_benchmark(UnregisterBenchmarkRequest(benchmark_id="test-benchmark"))
benchmarks_after = await table.list_benchmarks() benchmarks_after = await table.list_benchmarks(ListBenchmarksRequest())
assert len(benchmarks_after.data) == 0 assert len(benchmarks_after.data) == 0
# Unregistering a non-existent benchmark should raise a clear error # Unregistering a non-existent benchmark should raise a clear error
with pytest.raises(ValueError, match="Benchmark 'dummy_benchmark' not found"): with pytest.raises(ValueError, match="Benchmark 'dummy_benchmark' not found"):
await table.unregister_benchmark(benchmark_id="dummy_benchmark") await table.unregister_benchmark(UnregisterBenchmarkRequest(benchmark_id="dummy_benchmark"))
async def test_tool_groups_routing_table(cached_disk_dist_registry): async def test_tool_groups_routing_table(cached_disk_dist_registry):