From bcdb6fcc1503fe2c40cf63da8f03a2956c193959 Mon Sep 17 00:00:00 2001
From: Sai Soundararaj
Date: Tue, 1 Jul 2025 17:03:45 -0700
Subject: [PATCH] docs: add descriptions to the Benchmark resource API

---
 api_update_plan.md                        |  2 +-
 docs/_static/llama-stack-spec.html        | 16 ++++++++++------
 docs/_static/llama-stack-spec.yaml        |  9 ++++++++-
 llama_stack/apis/benchmarks/benchmarks.py |  7 +++++++
 4 files changed, 26 insertions(+), 8 deletions(-)

diff --git a/api_update_plan.md b/api_update_plan.md
index 20a8938e1..ec93f1440 100644
--- a/api_update_plan.md
+++ b/api_update_plan.md
@@ -234,7 +234,7 @@ Before finalizing documentation, verify:
 [x] 13. `/Users/saip/Documents/GitHub/llama-stack/llama_stack/apis/eval/eval.py` - Evaluation framework
 [x] 14. `/Users/saip/Documents/GitHub/llama-stack/llama_stack/apis/scoring/scoring.py` - Scoring system
 [x] 15. `/Users/saip/Documents/GitHub/llama-stack/llama_stack/apis/scoring_functions/scoring_functions.py` - Scoring function definitions
-16. `/Users/saip/Documents/GitHub/llama-stack/llama_stack/apis/benchmarks/benchmarks.py` - Benchmarking framework
+[x] 16. `/Users/saip/Documents/GitHub/llama-stack/llama_stack/apis/benchmarks/benchmarks.py` - Benchmarking framework
 17. `/Users/saip/Documents/GitHub/llama-stack/llama_stack/apis/shields/shields.py` - Safety shields
 18. `/Users/saip/Documents/GitHub/llama-stack/llama_stack/apis/batch_inference/batch_inference.py` - Batch inference operations
 19. `/Users/saip/Documents/GitHub/llama-stack/llama_stack/apis/synthetic_data_generation/synthetic_data_generation.py` - Data generation
diff --git a/docs/_static/llama-stack-spec.html b/docs/_static/llama-stack-spec.html
index 264b2e6b4..c28135dd2 100644
--- a/docs/_static/llama-stack-spec.html
+++ b/docs/_static/llama-stack-spec.html
@@ -9797,18 +9797,20 @@
                         "tool",
                         "tool_group"
                     ],
-                    "title": "ResourceType",
                     "const": "benchmark",
-                    "default": "benchmark"
+                    "default": "benchmark",
+                    "description": "The resource type, always benchmark"
                 },
                 "dataset_id": {
-                    "type": "string"
+                    "type": "string",
+                    "description": "Identifier of the dataset to use for the benchmark evaluation"
                 },
                 "scoring_functions": {
                     "type": "array",
                     "items": {
                         "type": "string"
-                    }
+                    },
+                    "description": "List of scoring function identifiers to apply during evaluation"
                 },
                 "metadata": {
                     "type": "object",
@@ -9833,7 +9835,8 @@
                                 "type": "object"
                             }
                         ]
-                    }
+                    },
+                    "description": "Metadata for this evaluation task"
                 }
             },
             "additionalProperties": false,
@@ -9845,7 +9848,8 @@
                 "scoring_functions",
                 "metadata"
             ],
-            "title": "Benchmark"
+            "title": "Benchmark",
+            "description": "A benchmark resource for evaluating model performance."
         },
         "OpenAIAssistantMessageParam": {
             "type": "object",
diff --git a/docs/_static/llama-stack-spec.yaml b/docs/_static/llama-stack-spec.yaml
index d24276596..995ff8601 100644
--- a/docs/_static/llama-stack-spec.yaml
+++ b/docs/_static/llama-stack-spec.yaml
@@ -7037,15 +7037,19 @@ components:
           - benchmark
           - tool
           - tool_group
-          title: ResourceType
           const: benchmark
           default: benchmark
+          description: The resource type, always benchmark
         dataset_id:
           type: string
+          description: >-
+            Identifier of the dataset to use for the benchmark evaluation
         scoring_functions:
           type: array
           items:
             type: string
+          description: >-
+            List of scoring function identifiers to apply during evaluation
         metadata:
           type: object
           additionalProperties:
@@ -7056,6 +7060,7 @@ components:
             - type: string
             - type: array
             - type: object
+          description: Metadata for this evaluation task
       additionalProperties: false
       required:
       - identifier
@@ -7065,6 +7070,8 @@ components:
       - scoring_functions
       - metadata
       title: Benchmark
+      description: >-
+        A benchmark resource for evaluating model performance.
     OpenAIAssistantMessageParam:
       type: object
       properties:
diff --git a/llama_stack/apis/benchmarks/benchmarks.py b/llama_stack/apis/benchmarks/benchmarks.py
index d80c767f8..ff36007c6 100644
--- a/llama_stack/apis/benchmarks/benchmarks.py
+++ b/llama_stack/apis/benchmarks/benchmarks.py
@@ -22,6 +22,13 @@ class CommonBenchmarkFields(BaseModel):
 
 @json_schema_type
 class Benchmark(CommonBenchmarkFields, Resource):
+    """A benchmark resource for evaluating model performance.
+
+    :param dataset_id: Identifier of the dataset to use for the benchmark evaluation
+    :param scoring_functions: List of scoring function identifiers to apply during evaluation
+    :param metadata: Metadata for this evaluation task
+    :param type: The resource type, always benchmark
+    """
     type: Literal[ResourceType.benchmark] = ResourceType.benchmark
 
     @property
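
As a quick sanity check on the documented shape, a minimal, self-contained Pydantic sketch follows. It mirrors only the fields described in this patch (dataset_id, scoring_functions, metadata, type); it is not the real llama_stack.apis.benchmarks.Benchmark class, which also inherits identifier and provider fields from Resource, and the example identifiers are hypothetical.

# Illustrative sketch only: mirrors the fields documented in this patch, not the
# actual llama_stack.apis.benchmarks.Benchmark class (which also inherits
# identifier/provider fields from Resource).
from typing import Any, Literal

from pydantic import BaseModel, Field


class BenchmarkSketch(BaseModel):
    """A benchmark resource for evaluating model performance."""

    # Identifier of the dataset to use for the benchmark evaluation
    dataset_id: str
    # List of scoring function identifiers to apply during evaluation
    scoring_functions: list[str]
    # Metadata for this evaluation task
    metadata: dict[str, Any] = Field(default_factory=dict)
    # The resource type, always benchmark
    type: Literal["benchmark"] = "benchmark"


if __name__ == "__main__":
    # Hypothetical identifiers, used only to exercise the model.
    bench = BenchmarkSketch(
        dataset_id="example-eval-dataset",
        scoring_functions=["basic::equality"],
    )
    # The real class documents these fields via docstring :param entries (see the
    # patch above); this sketch only demonstrates the documented field shapes.
    print(bench.model_dump_json(indent=2))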