commit bcdb6fcc15
parent 78ef9c605f
Author: Sai Soundararaj
Date:   2025-07-01 17:03:45 -07:00

4 changed files with 26 additions and 8 deletions

@@ -234,7 +234,7 @@ Before finalizing documentation, verify:
 [x] 13. `/Users/saip/Documents/GitHub/llama-stack/llama_stack/apis/eval/eval.py` - Evaluation framework
 [x] 14. `/Users/saip/Documents/GitHub/llama-stack/llama_stack/apis/scoring/scoring.py` - Scoring system
 [x] 15. `/Users/saip/Documents/GitHub/llama-stack/llama_stack/apis/scoring_functions/scoring_functions.py` - Scoring function definitions
-16. `/Users/saip/Documents/GitHub/llama-stack/llama_stack/apis/benchmarks/benchmarks.py` - Benchmarking framework
+[x] 16. `/Users/saip/Documents/GitHub/llama-stack/llama_stack/apis/benchmarks/benchmarks.py` - Benchmarking framework
 17. `/Users/saip/Documents/GitHub/llama-stack/llama_stack/apis/shields/shields.py` - Safety shields
 18. `/Users/saip/Documents/GitHub/llama-stack/llama_stack/apis/batch_inference/batch_inference.py` - Batch inference operations
 19. `/Users/saip/Documents/GitHub/llama-stack/llama_stack/apis/synthetic_data_generation/synthetic_data_generation.py` - Data generation

@@ -9797,18 +9797,20 @@
             "tool",
             "tool_group"
           ],
-          "title": "ResourceType",
           "const": "benchmark",
-          "default": "benchmark"
+          "default": "benchmark",
+          "description": "The resource type, always benchmark"
         },
         "dataset_id": {
-          "type": "string"
+          "type": "string",
+          "description": "Identifier of the dataset to use for the benchmark evaluation"
         },
         "scoring_functions": {
           "type": "array",
           "items": {
             "type": "string"
-          }
+          },
+          "description": "List of scoring function identifiers to apply during evaluation"
         },
         "metadata": {
           "type": "object",
@@ -9833,7 +9835,8 @@
                 "type": "object"
               }
             ]
-          }
+          },
+          "description": "Metadata for this evaluation task"
         }
       },
       "additionalProperties": false,
@@ -9845,7 +9848,8 @@
         "scoring_functions",
         "metadata"
       ],
-      "title": "Benchmark"
+      "title": "Benchmark",
+      "description": "A benchmark resource for evaluating model performance."
     },
     "OpenAIAssistantMessageParam": {
       "type": "object",

@@ -7037,15 +7037,19 @@ components:
             - benchmark
             - tool
             - tool_group
-          title: ResourceType
           const: benchmark
           default: benchmark
+          description: The resource type, always benchmark
         dataset_id:
           type: string
+          description: >-
+            Identifier of the dataset to use for the benchmark evaluation
         scoring_functions:
           type: array
           items:
             type: string
+          description: >-
+            List of scoring function identifiers to apply during evaluation
         metadata:
           type: object
           additionalProperties:
@@ -7056,6 +7060,7 @@ components:
               - type: string
               - type: array
               - type: object
+          description: Metadata for this evaluation task
       additionalProperties: false
       required:
         - identifier
@@ -7065,6 +7070,8 @@ components:
         - scoring_functions
         - metadata
       title: Benchmark
+      description: >-
+        A benchmark resource for evaluating model performance.
     OpenAIAssistantMessageParam:
       type: object
       properties:
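
A note on the `>-` markers in the YAML hunks: they introduce folded block scalars, so the wrapped description text loads back as a single-line string. A quick sanity check with PyYAML (illustrative only, not part of this commit):

import yaml  # PyYAML

snippet = """
dataset_id:
  type: string
  description: >-
    Identifier of the dataset to use
    for the benchmark evaluation
"""
loaded = yaml.safe_load(snippet)
# '>-' folds the wrapped lines into one space-joined string, and the '-'
# strips the trailing newline.
assert loaded["dataset_id"]["description"] == (
    "Identifier of the dataset to use for the benchmark evaluation"
)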

@@ -22,6 +22,13 @@ class CommonBenchmarkFields(BaseModel):
 
 @json_schema_type
 class Benchmark(CommonBenchmarkFields, Resource):
+    """A benchmark resource for evaluating model performance.
+    :param dataset_id: Identifier of the dataset to use for the benchmark evaluation
+    :param scoring_functions: List of scoring function identifiers to apply during evaluation
+    :param metadata: Metadata for this evaluation task
+    :param type: The resource type, always benchmark
+    """
+
     type: Literal[ResourceType.benchmark] = ResourceType.benchmark
 
     @property
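
The descriptions in the generated JSON and YAML specs match these :param lines one-to-one, which suggests the spec generator lifts field descriptions straight from the docstring. Below is a rough sketch of that kind of extraction; it assumes nothing about llama-stack's actual generator, and parse_param_descriptions is a made-up helper, not an API from the repo:

import re

# Docstring text copied from the Benchmark class above.
DOC = """A benchmark resource for evaluating model performance.
:param dataset_id: Identifier of the dataset to use for the benchmark evaluation
:param scoring_functions: List of scoring function identifiers to apply during evaluation
:param metadata: Metadata for this evaluation task
:param type: The resource type, always benchmark
"""

PARAM_RE = re.compile(r"^:param\s+(\w+):\s*(.+)$")

def parse_param_descriptions(doc: str) -> dict:
    # Map each field name to its description from Sphinx-style ':param' lines.
    params = {}
    for line in doc.splitlines():
        match = PARAM_RE.match(line.strip())
        if match:
            params[match.group(1)] = match.group(2).strip()
    return params

print(parse_param_descriptions(DOC)["dataset_id"])
# -> Identifier of the dataset to use for the benchmark evaluation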