feat: convert Benchmarks API to use FastAPI router (#4309)

# What does this PR do?

Convert the Benchmarks API from @webmethod decorators to the FastAPI router pattern, matching the Batches API structure. One notable change is the update of stack.py to handle request models in register_resources().

Closes: #4308

## Test Plan

CI and `curl http://localhost:8321/v1/inspect/routes | jq '.data[] | select(.route | contains("benchmark"))'`

---------

Signed-off-by: Sébastien Han <seb@redhat.com>
2025-12-17 09:29:47 +00:00 · 2025-12-10 15:04:27 +01:00 · 2025-12-10 15:04:27 +01:00 · ff375f1abb
commit ff375f1abb
parent 661985e240
18 changed files with 862 additions and 195 deletions
--- a/docs/static/deprecated-llama-stack-spec.yaml
+++ b/docs/static/deprecated-llama-stack-spec.yaml
@ -794,7 +794,7 @@ paths:
          $ref: '#/components/responses/DefaultError'
      tags:
      - Benchmarks
-      summary: List Benchmarks
+      summary: List all benchmarks.
      description: List all benchmarks.
      operationId: list_benchmarks_v1alpha_eval_benchmarks_get
    post:
@ -812,10 +812,10 @@ paths:
          description: Default Response
          $ref: '#/components/responses/DefaultError'
        '204':
-          description: Successful Response
+          description: The benchmark was successfully registered.
      tags:
      - Benchmarks
-      summary: Register Benchmark
+      summary: Register a benchmark.
      description: Register a benchmark.
      operationId: register_benchmark_v1alpha_eval_benchmarks_post
      requestBody:
@ -835,20 +835,20 @@ paths:
              schema:
                $ref: '#/components/schemas/Benchmark'
        '400':
-          description: Bad Request
          $ref: '#/components/responses/BadRequest400'
+          description: Bad Request
        '429':
-          description: Too Many Requests
          $ref: '#/components/responses/TooManyRequests429'
+          description: Too Many Requests
        '500':
-          description: Internal Server Error
          $ref: '#/components/responses/InternalServerError500'
+          description: Internal Server Error
        default:
-          description: Default Response
          $ref: '#/components/responses/DefaultError'
+          description: Default Response
      tags:
      - Benchmarks
-      summary: Get Benchmark
+      summary: Get a benchmark by its ID.
      description: Get a benchmark by its ID.
      operationId: get_benchmark_v1alpha_eval_benchmarks__benchmark_id__get
      parameters:
@ -857,26 +857,28 @@ paths:
        required: true
        schema:
          type: string
-        description: 'Path parameter: benchmark_id'
+          description: The ID of the benchmark to get.
+          title: Benchmark Id
+        description: The ID of the benchmark to get.
    delete:
      responses:
        '400':
-          description: Bad Request
          $ref: '#/components/responses/BadRequest400'
+          description: Bad Request
        '429':
-          description: Too Many Requests
          $ref: '#/components/responses/TooManyRequests429'
+          description: Too Many Requests
        '500':
-          description: Internal Server Error
          $ref: '#/components/responses/InternalServerError500'
+          description: Internal Server Error
        default:
-          description: Default Response
          $ref: '#/components/responses/DefaultError'
+          description: Default Response
        '204':
-          description: Successful Response
+          description: The benchmark was successfully unregistered.
      tags:
      - Benchmarks
-      summary: Unregister Benchmark
+      summary: Unregister a benchmark.
      description: Unregister a benchmark.
      operationId: unregister_benchmark_v1alpha_eval_benchmarks__benchmark_id__delete
      parameters:
@ -885,7 +887,9 @@ paths:
        required: true
        schema:
          type: string
-        description: 'Path parameter: benchmark_id'
+          description: The ID of the benchmark to unregister.
+          title: Benchmark Id
+        description: The ID of the benchmark to unregister.
      deprecated: true
 components:
  schemas:
@ -7375,20 +7379,23 @@ components:
          type: string
          const: benchmark
          title: Type
+          description: The resource type, always benchmark.
          default: benchmark
        dataset_id:
          type: string
          title: Dataset Id
+          description: Identifier of the dataset to use for the benchmark evaluation.
        scoring_functions:
          items:
            type: string
          type: array
          title: Scoring Functions
+          description: List of scoring function identifiers to apply during evaluation.
        metadata:
          additionalProperties: true
          type: object
          title: Metadata
-          description: Metadata for this evaluation task
+          description: Metadata for this evaluation task.
      type: object
      required:
      - identifier
@ -7404,10 +7411,12 @@ components:
            $ref: '#/components/schemas/Benchmark'
          type: array
          title: Data
+          description: List of benchmark objects.
      type: object
      required:
      - data
      title: ListBenchmarksResponse
+      description: Response containing a list of benchmark objects.
    BenchmarkConfig:
      properties:
        eval_candidate:
@ -8369,33 +8378,40 @@ components:
        benchmark_id:
          type: string
          title: Benchmark Id
+          description: The ID of the benchmark to register.
        dataset_id:
          type: string
          title: Dataset Id
+          description: The ID of the dataset to use for the benchmark.
        scoring_functions:
          items:
            type: string
          type: array
          title: Scoring Functions
+          description: The scoring functions to use for the benchmark.
        provider_benchmark_id:
          anyOf:
          - type: string
          - type: 'null'
+          description: The ID of the provider benchmark to use for the benchmark.
        provider_id:
          anyOf:
          - type: string
          - type: 'null'
+          description: The ID of the provider to use for the benchmark.
        metadata:
          anyOf:
          - additionalProperties: true
            type: object
          - type: 'null'
+          description: The metadata to use for the benchmark.
      type: object
      required:
      - benchmark_id
      - dataset_id
      - scoring_functions
      title: RegisterBenchmarkRequest
+      description: Request model for registering a benchmark.
    AllowedToolsFilter:
      properties:
        tool_names:
@ -9601,6 +9617,33 @@ components:
      - batch_id
      title: CancelBatchRequest
      type: object
+    ListBenchmarksRequest:
+      description: Request model for listing benchmarks.
+      properties: {}
+      title: ListBenchmarksRequest
+      type: object
+    GetBenchmarkRequest:
+      description: Request model for getting a benchmark.
+      properties:
+        benchmark_id:
+          description: The ID of the benchmark to get.
+          title: Benchmark Id
+          type: string
+      required:
+      - benchmark_id
+      title: GetBenchmarkRequest
+      type: object
+    UnregisterBenchmarkRequest:
+      description: Request model for unregistering a benchmark.
+      properties:
+        benchmark_id:
+          description: The ID of the benchmark to unregister.
+          title: Benchmark Id
+          type: string
+      required:
+      - benchmark_id
+      title: UnregisterBenchmarkRequest
+      type: object
    DialogType:
      description: Parameter type for dialog data with semantic output labels.
      properties:
--- a/docs/static/experimental-llama-stack-spec.yaml
+++ b/docs/static/experimental-llama-stack-spec.yaml
@ -188,7 +188,7 @@ paths:
          $ref: '#/components/responses/DefaultError'
      tags:
      - Benchmarks
-      summary: List Benchmarks
+      summary: List all benchmarks.
      description: List all benchmarks.
      operationId: list_benchmarks_v1alpha_eval_benchmarks_get
  /v1alpha/eval/benchmarks/{benchmark_id}:
@ -201,20 +201,20 @@ paths:
              schema:
                $ref: '#/components/schemas/Benchmark'
        '400':
-          description: Bad Request
          $ref: '#/components/responses/BadRequest400'
+          description: Bad Request
        '429':
-          description: Too Many Requests
          $ref: '#/components/responses/TooManyRequests429'
+          description: Too Many Requests
        '500':
-          description: Internal Server Error
          $ref: '#/components/responses/InternalServerError500'
+          description: Internal Server Error
        default:
-          description: Default Response
          $ref: '#/components/responses/DefaultError'
+          description: Default Response
      tags:
      - Benchmarks
-      summary: Get Benchmark
+      summary: Get a benchmark by its ID.
      description: Get a benchmark by its ID.
      operationId: get_benchmark_v1alpha_eval_benchmarks__benchmark_id__get
      parameters:
@ -223,7 +223,9 @@ paths:
        required: true
        schema:
          type: string
-        description: 'Path parameter: benchmark_id'
+          description: The ID of the benchmark to get.
+          title: Benchmark Id
+        description: The ID of the benchmark to get.
  /v1alpha/eval/benchmarks/{benchmark_id}/evaluations:
    post:
      responses:
@ -6517,20 +6519,23 @@ components:
          type: string
          const: benchmark
          title: Type
+          description: The resource type, always benchmark.
          default: benchmark
        dataset_id:
          type: string
          title: Dataset Id
+          description: Identifier of the dataset to use for the benchmark evaluation.
        scoring_functions:
          items:
            type: string
          type: array
          title: Scoring Functions
+          description: List of scoring function identifiers to apply during evaluation.
        metadata:
          additionalProperties: true
          type: object
          title: Metadata
-          description: Metadata for this evaluation task
+          description: Metadata for this evaluation task.
      type: object
      required:
      - identifier
@ -6546,10 +6551,12 @@ components:
            $ref: '#/components/schemas/Benchmark'
          type: array
          title: Data
+          description: List of benchmark objects.
      type: object
      required:
      - data
      title: ListBenchmarksResponse
+      description: Response containing a list of benchmark objects.
    BenchmarkConfig:
      properties:
        eval_candidate:
@ -7346,6 +7353,45 @@ components:
      - $ref: '#/components/schemas/RowsDataSource'
        title: RowsDataSource
      title: URIDataSource | RowsDataSource
+    RegisterBenchmarkRequest:
+      properties:
+        benchmark_id:
+          type: string
+          title: Benchmark Id
+          description: The ID of the benchmark to register.
+        dataset_id:
+          type: string
+          title: Dataset Id
+          description: The ID of the dataset to use for the benchmark.
+        scoring_functions:
+          items:
+            type: string
+          type: array
+          title: Scoring Functions
+          description: The scoring functions to use for the benchmark.
+        provider_benchmark_id:
+          anyOf:
+          - type: string
+          - type: 'null'
+          description: The ID of the provider benchmark to use for the benchmark.
+        provider_id:
+          anyOf:
+          - type: string
+          - type: 'null'
+          description: The ID of the provider to use for the benchmark.
+        metadata:
+          anyOf:
+          - additionalProperties: true
+            type: object
+          - type: 'null'
+          description: The metadata to use for the benchmark.
+      type: object
+      required:
+      - benchmark_id
+      - dataset_id
+      - scoring_functions
+      title: RegisterBenchmarkRequest
+      description: Request model for registering a benchmark.
    AllowedToolsFilter:
      properties:
        tool_names:
@ -8395,6 +8441,33 @@ components:
      - batch_id
      title: CancelBatchRequest
      type: object
+    ListBenchmarksRequest:
+      description: Request model for listing benchmarks.
+      properties: {}
+      title: ListBenchmarksRequest
+      type: object
+    GetBenchmarkRequest:
+      description: Request model for getting a benchmark.
+      properties:
+        benchmark_id:
+          description: The ID of the benchmark to get.
+          title: Benchmark Id
+          type: string
+      required:
+      - benchmark_id
+      title: GetBenchmarkRequest
+      type: object
+    UnregisterBenchmarkRequest:
+      description: Request model for unregistering a benchmark.
+      properties:
+        benchmark_id:
+          description: The ID of the benchmark to unregister.
+          title: Benchmark Id
+          type: string
+      required:
+      - benchmark_id
+      title: UnregisterBenchmarkRequest
+      type: object
    DialogType:
      description: Parameter type for dialog data with semantic output labels.
      properties:
--- a/docs/static/llama-stack-spec.yaml
+++ b/docs/static/llama-stack-spec.yaml
@ -9166,20 +9166,23 @@ components:
          type: string
          const: benchmark
          title: Type
+          description: The resource type, always benchmark.
          default: benchmark
        dataset_id:
          type: string
          title: Dataset Id
+          description: Identifier of the dataset to use for the benchmark evaluation.
        scoring_functions:
          items:
            type: string
          type: array
          title: Scoring Functions
+          description: List of scoring function identifiers to apply during evaluation.
        metadata:
          additionalProperties: true
          type: object
          title: Metadata
-          description: Metadata for this evaluation task
+          description: Metadata for this evaluation task.
      type: object
      required:
      - identifier
@ -9195,10 +9198,12 @@ components:
            $ref: '#/components/schemas/Benchmark'
          type: array
          title: Data
+          description: List of benchmark objects.
      type: object
      required:
      - data
      title: ListBenchmarksResponse
+      description: Response containing a list of benchmark objects.
    BenchmarkConfig:
      properties:
        eval_candidate:
@ -9848,6 +9853,45 @@ components:
      - $ref: '#/components/schemas/RowsDataSource'
        title: RowsDataSource
      title: URIDataSource | RowsDataSource
+    RegisterBenchmarkRequest:
+      properties:
+        benchmark_id:
+          type: string
+          title: Benchmark Id
+          description: The ID of the benchmark to register.
+        dataset_id:
+          type: string
+          title: Dataset Id
+          description: The ID of the dataset to use for the benchmark.
+        scoring_functions:
+          items:
+            type: string
+          type: array
+          title: Scoring Functions
+          description: The scoring functions to use for the benchmark.
+        provider_benchmark_id:
+          anyOf:
+          - type: string
+          - type: 'null'
+          description: The ID of the provider benchmark to use for the benchmark.
+        provider_id:
+          anyOf:
+          - type: string
+          - type: 'null'
+          description: The ID of the provider to use for the benchmark.
+        metadata:
+          anyOf:
+          - additionalProperties: true
+            type: object
+          - type: 'null'
+          description: The metadata to use for the benchmark.
+      type: object
+      required:
+      - benchmark_id
+      - dataset_id
+      - scoring_functions
+      title: RegisterBenchmarkRequest
+      description: Request model for registering a benchmark.
    AllowedToolsFilter:
      properties:
        tool_names:
@ -11053,6 +11097,33 @@ components:
      - batch_id
      title: CancelBatchRequest
      type: object
+    ListBenchmarksRequest:
+      description: Request model for listing benchmarks.
+      properties: {}
+      title: ListBenchmarksRequest
+      type: object
+    GetBenchmarkRequest:
+      description: Request model for getting a benchmark.
+      properties:
+        benchmark_id:
+          description: The ID of the benchmark to get.
+          title: Benchmark Id
+          type: string
+      required:
+      - benchmark_id
+      title: GetBenchmarkRequest
+      type: object
+    UnregisterBenchmarkRequest:
+      description: Request model for unregistering a benchmark.
+      properties:
+        benchmark_id:
+          description: The ID of the benchmark to unregister.
+          title: Benchmark Id
+          type: string
+      required:
+      - benchmark_id
+      title: UnregisterBenchmarkRequest
+      type: object
    DialogType:
      description: Parameter type for dialog data with semantic output labels.
      properties:
--- a/docs/static/stainless-llama-stack-spec.yaml
+++ b/docs/static/stainless-llama-stack-spec.yaml
@ -3404,7 +3404,7 @@ paths:
          $ref: '#/components/responses/DefaultError'
      tags:
      - Benchmarks
-      summary: List Benchmarks
+      summary: List all benchmarks.
      description: List all benchmarks.
      operationId: list_benchmarks_v1alpha_eval_benchmarks_get
    post:
@ -3422,10 +3422,10 @@ paths:
          description: Default Response
          $ref: '#/components/responses/DefaultError'
        '204':
-          description: Successful Response
+          description: The benchmark was successfully registered.
      tags:
      - Benchmarks
-      summary: Register Benchmark
+      summary: Register a benchmark.
      description: Register a benchmark.
      operationId: register_benchmark_v1alpha_eval_benchmarks_post
      requestBody:
@ -3445,20 +3445,20 @@ paths:
              schema:
                $ref: '#/components/schemas/Benchmark'
        '400':
-          description: Bad Request
          $ref: '#/components/responses/BadRequest400'
+          description: Bad Request
        '429':
-          description: Too Many Requests
          $ref: '#/components/responses/TooManyRequests429'
+          description: Too Many Requests
        '500':
-          description: Internal Server Error
          $ref: '#/components/responses/InternalServerError500'
+          description: Internal Server Error
        default:
-          description: Default Response
          $ref: '#/components/responses/DefaultError'
+          description: Default Response
      tags:
      - Benchmarks
-      summary: Get Benchmark
+      summary: Get a benchmark by its ID.
      description: Get a benchmark by its ID.
      operationId: get_benchmark_v1alpha_eval_benchmarks__benchmark_id__get
      parameters:
@ -3467,26 +3467,28 @@ paths:
        required: true
        schema:
          type: string
-        description: 'Path parameter: benchmark_id'
+          description: The ID of the benchmark to get.
+          title: Benchmark Id
+        description: The ID of the benchmark to get.
    delete:
      responses:
        '400':
-          description: Bad Request
          $ref: '#/components/responses/BadRequest400'
+          description: Bad Request
        '429':
-          description: Too Many Requests
          $ref: '#/components/responses/TooManyRequests429'
+          description: Too Many Requests
        '500':
-          description: Internal Server Error
          $ref: '#/components/responses/InternalServerError500'
+          description: Internal Server Error
        default:
-          description: Default Response
          $ref: '#/components/responses/DefaultError'
+          description: Default Response
        '204':
-          description: Successful Response
+          description: The benchmark was successfully unregistered.
      tags:
      - Benchmarks
-      summary: Unregister Benchmark
+      summary: Unregister a benchmark.
      description: Unregister a benchmark.
      operationId: unregister_benchmark_v1alpha_eval_benchmarks__benchmark_id__delete
      parameters:
@ -3495,7 +3497,9 @@ paths:
        required: true
        schema:
          type: string
-        description: 'Path parameter: benchmark_id'
+          description: The ID of the benchmark to unregister.
+          title: Benchmark Id
+        description: The ID of the benchmark to unregister.
      deprecated: true
  /v1alpha/eval/benchmarks/{benchmark_id}/evaluations:
    post:
@ -10391,20 +10395,23 @@ components:
          type: string
          const: benchmark
          title: Type
+          description: The resource type, always benchmark.
          default: benchmark
        dataset_id:
          type: string
          title: Dataset Id
+          description: Identifier of the dataset to use for the benchmark evaluation.
        scoring_functions:
          items:
            type: string
          type: array
          title: Scoring Functions
+          description: List of scoring function identifiers to apply during evaluation.
        metadata:
          additionalProperties: true
          type: object
          title: Metadata
-          description: Metadata for this evaluation task
+          description: Metadata for this evaluation task.
      type: object
      required:
      - identifier
@ -10420,10 +10427,12 @@ components:
            $ref: '#/components/schemas/Benchmark'
          type: array
          title: Data
+          description: List of benchmark objects.
      type: object
      required:
      - data
      title: ListBenchmarksResponse
+      description: Response containing a list of benchmark objects.
    BenchmarkConfig:
      properties:
        eval_candidate:
@ -11385,33 +11394,40 @@ components:
        benchmark_id:
          type: string
          title: Benchmark Id
+          description: The ID of the benchmark to register.
        dataset_id:
          type: string
          title: Dataset Id
+          description: The ID of the dataset to use for the benchmark.
        scoring_functions:
          items:
            type: string
          type: array
          title: Scoring Functions
+          description: The scoring functions to use for the benchmark.
        provider_benchmark_id:
          anyOf:
          - type: string
          - type: 'null'
+          description: The ID of the provider benchmark to use for the benchmark.
        provider_id:
          anyOf:
          - type: string
          - type: 'null'
+          description: The ID of the provider to use for the benchmark.
        metadata:
          anyOf:
          - additionalProperties: true
            type: object
          - type: 'null'
+          description: The metadata to use for the benchmark.
      type: object
      required:
      - benchmark_id
      - dataset_id
      - scoring_functions
      title: RegisterBenchmarkRequest
+      description: Request model for registering a benchmark.
    AllowedToolsFilter:
      properties:
        tool_names:
@ -12617,6 +12633,33 @@ components:
      - batch_id
      title: CancelBatchRequest
      type: object
+    ListBenchmarksRequest:
+      description: Request model for listing benchmarks.
+      properties: {}
+      title: ListBenchmarksRequest
+      type: object
+    GetBenchmarkRequest:
+      description: Request model for getting a benchmark.
+      properties:
+        benchmark_id:
+          description: The ID of the benchmark to get.
+          title: Benchmark Id
+          type: string
+      required:
+      - benchmark_id
+      title: GetBenchmarkRequest
+      type: object
+    UnregisterBenchmarkRequest:
+      description: Request model for unregistering a benchmark.
+      properties:
+        benchmark_id:
+          description: The ID of the benchmark to unregister.
+          title: Benchmark Id
+          type: string
+      required:
+      - benchmark_id
+      title: UnregisterBenchmarkRequest
+      type: object
    DialogType:
      description: Parameter type for dialog data with semantic output labels.
      properties: