From 3f8c7a584aede5a666bdc3eb5e7ac21185dc0ca8 Mon Sep 17 00:00:00 2001 From: Xi Yan Date: Sun, 23 Mar 2025 16:00:48 -0700 Subject: [PATCH] precommit --- distributions/dependencies.json | 57 -- docs/_static/llama-stack-spec.html | 542 +----------------- docs/_static/llama-stack-spec.yaml | 452 +-------------- llama_stack/apis/eval/eval.py | 1 - .../scoring_functions/scoring_functions.py | 1 - .../distribution/routers/routing_tables.py | 1 - llama_stack/providers/registry/eval.py | 1 - llama_stack/templates/open-benchmark/run.yaml | 13 +- 8 files changed, 31 insertions(+), 1037 deletions(-) diff --git a/distributions/dependencies.json b/distributions/dependencies.json index fe1509483..1767523d6 100644 --- a/distributions/dependencies.json +++ b/distributions/dependencies.json @@ -6,12 +6,10 @@ "chardet", "chromadb-client", "datasets", - "emoji", "faiss-cpu", "fastapi", "fire", "httpx", - "langdetect", "matplotlib", "mcp", "nltk", @@ -23,7 +21,6 @@ "psycopg2-binary", "pymongo", "pypdf", - "pythainlp", "redis", "requests", "scikit-learn", @@ -40,12 +37,10 @@ "chardet", "chromadb-client", "datasets", - "emoji", "faiss-cpu", "fastapi", "fire", "httpx", - "langdetect", "matplotlib", "nltk", "numpy", @@ -56,7 +51,6 @@ "psycopg2-binary", "pymongo", "pypdf", - "pythainlp", "redis", "requests", "scikit-learn", @@ -74,12 +68,10 @@ "chardet", "chromadb-client", "datasets", - "emoji", "fastapi", "fire", "fireworks-ai", "httpx", - "langdetect", "matplotlib", "mcp", "nltk", @@ -91,7 +83,6 @@ "psycopg2-binary", "pymongo", "pypdf", - "pythainlp", "redis", "requests", "scikit-learn", @@ -111,13 +102,11 @@ "chardet", "chromadb-client", "datasets", - "emoji", "faiss-cpu", "fastapi", "fire", "httpx", "huggingface_hub", - "langdetect", "matplotlib", "nltk", "numpy", @@ -128,7 +117,6 @@ "psycopg2-binary", "pymongo", "pypdf", - "pythainlp", "redis", "requests", "scikit-learn", @@ -146,12 +134,10 @@ "chardet", "chromadb-client", "datasets", - "emoji", "fastapi", "fire", "fireworks-ai", "httpx", - "langdetect", "litellm", "matplotlib", "mcp", @@ -164,7 +150,6 @@ "psycopg2-binary", "pymongo", "pypdf", - "pythainlp", "redis", "requests", "scikit-learn", @@ -183,13 +168,11 @@ "chardet", "chromadb-client", "datasets", - "emoji", "faiss-cpu", "fastapi", "fire", "fireworks-ai", "httpx", - "langdetect", "matplotlib", "mcp", "nltk", @@ -201,7 +184,6 @@ "psycopg2-binary", "pymongo", "pypdf", - "pythainlp", "redis", "requests", "scikit-learn", @@ -218,12 +200,10 @@ "blobfile", "chardet", "datasets", - "emoji", "faiss-cpu", "fastapi", "fire", "httpx", - "langdetect", "litellm", "matplotlib", "nltk", @@ -235,7 +215,6 @@ "psycopg2-binary", "pymongo", "pypdf", - "pythainlp", "redis", "requests", "scikit-learn", @@ -252,13 +231,11 @@ "chardet", "chromadb-client", "datasets", - "emoji", "faiss-cpu", "fastapi", "fire", "httpx", "huggingface_hub", - "langdetect", "matplotlib", "mcp", "nltk", @@ -270,7 +247,6 @@ "psycopg2-binary", "pymongo", "pypdf", - "pythainlp", "redis", "requests", "scikit-learn", @@ -287,13 +263,11 @@ "chardet", "chromadb-client", "datasets", - "emoji", "faiss-cpu", "fastapi", "fire", "httpx", "huggingface_hub", - "langdetect", "matplotlib", "mcp", "nltk", @@ -305,7 +279,6 @@ "psycopg2-binary", "pymongo", "pypdf", - "pythainlp", "redis", "requests", "scikit-learn", @@ -324,13 +297,11 @@ "chardet", "chromadb-client", "datasets", - "emoji", "fairscale", "faiss-cpu", "fastapi", "fire", "httpx", - "langdetect", "lm-format-enforcer", "matplotlib", "mcp", @@ -343,7 +314,6 @@ "psycopg2-binary", "pymongo", "pypdf", - "pythainlp", "redis", "requests", "scikit-learn", @@ -364,14 +334,12 @@ "chardet", "chromadb-client", "datasets", - "emoji", "fairscale", "faiss-cpu", "fastapi", "fbgemm-gpu", "fire", "httpx", - "langdetect", "lm-format-enforcer", "matplotlib", "mcp", @@ -384,7 +352,6 @@ "psycopg2-binary", "pymongo", "pypdf", - "pythainlp", "redis", "requests", "scikit-learn", @@ -403,12 +370,10 @@ "aiosqlite", "blobfile", "chardet", - "emoji", "faiss-cpu", "fastapi", "fire", "httpx", - "langdetect", "matplotlib", "nltk", "numpy", @@ -420,7 +385,6 @@ "psycopg2-binary", "pymongo", "pypdf", - "pythainlp", "redis", "requests", "scikit-learn", @@ -437,12 +401,10 @@ "chardet", "chromadb-client", "datasets", - "emoji", "faiss-cpu", "fastapi", "fire", "httpx", - "langdetect", "matplotlib", "mcp", "nltk", @@ -455,7 +417,6 @@ "psycopg2-binary", "pymongo", "pypdf", - "pythainlp", "redis", "requests", "scikit-learn", @@ -471,11 +432,9 @@ "chardet", "chromadb-client", "datasets", - "emoji", "fastapi", "fire", "httpx", - "langdetect", "litellm", "matplotlib", "mcp", @@ -488,7 +447,6 @@ "psycopg2-binary", "pymongo", "pypdf", - "pythainlp", "redis", "requests", "scikit-learn", @@ -506,12 +464,10 @@ "chardet", "chromadb-client", "datasets", - "emoji", "faiss-cpu", "fastapi", "fire", "httpx", - "langdetect", "matplotlib", "mcp", "nltk", @@ -523,7 +479,6 @@ "psycopg2-binary", "pymongo", "pypdf", - "pythainlp", "redis", "requests", "scikit-learn", @@ -541,12 +496,10 @@ "chardet", "chromadb-client", "datasets", - "emoji", "faiss-cpu", "fastapi", "fire", "httpx", - "langdetect", "matplotlib", "mcp", "nltk", @@ -559,7 +512,6 @@ "psycopg2-binary", "pymongo", "pypdf", - "pythainlp", "redis", "requests", "scikit-learn", @@ -607,13 +559,11 @@ "chardet", "chromadb-client", "datasets", - "emoji", "faiss-cpu", "fastapi", "fire", "httpx", "huggingface_hub", - "langdetect", "matplotlib", "mcp", "nltk", @@ -625,7 +575,6 @@ "psycopg2-binary", "pymongo", "pypdf", - "pythainlp", "redis", "requests", "scikit-learn", @@ -643,12 +592,10 @@ "chardet", "chromadb-client", "datasets", - "emoji", "faiss-cpu", "fastapi", "fire", "httpx", - "langdetect", "matplotlib", "mcp", "nltk", @@ -660,7 +607,6 @@ "psycopg2-binary", "pymongo", "pypdf", - "pythainlp", "redis", "requests", "scikit-learn", @@ -679,12 +625,10 @@ "chardet", "chromadb-client", "datasets", - "emoji", "faiss-cpu", "fastapi", "fire", "httpx", - "langdetect", "matplotlib", "mcp", "nltk", @@ -696,7 +640,6 @@ "psycopg2-binary", "pymongo", "pypdf", - "pythainlp", "redis", "requests", "scikit-learn", diff --git a/docs/_static/llama-stack-spec.html b/docs/_static/llama-stack-spec.html index a7b95d9fe..ab73dc345 100644 --- a/docs/_static/llama-stack-spec.html +++ b/docs/_static/llama-stack-spec.html @@ -2285,7 +2285,7 @@ "content": { "application/json": { "schema": { - "$ref": "#/components/schemas/Job" + "$ref": "#/components/schemas/ListAgentSessionsResponse" } } } @@ -6192,382 +6192,6 @@ "title": "EmbeddingsResponse", "description": "Response containing generated embeddings." }, - "AgentCandidate": { - "type": "object", - "properties": { - "type": { - "type": "string", - "const": "agent", - "default": "agent" - }, - "config": { - "$ref": "#/components/schemas/AgentConfig", - "description": "The configuration for the agent candidate." - } - }, - "additionalProperties": false, - "required": [ - "type", - "config" - ], - "title": "AgentCandidate", - "description": "An agent candidate for evaluation." - }, - "AggregationFunctionType": { - "type": "string", - "enum": [ - "average", - "weighted_average", - "median", - "categorical_count", - "accuracy" - ], - "title": "AggregationFunctionType" - }, - "BasicScoringFnParams": { - "type": "object", - "properties": { - "type": { - "type": "string", - "const": "basic", - "default": "basic" - }, - "aggregation_functions": { - "type": "array", - "items": { - "$ref": "#/components/schemas/AggregationFunctionType" - } - } - }, - "additionalProperties": false, - "required": [ - "type" - ], - "title": "BasicScoringFnParams" - }, - "BenchmarkConfig": { - "type": "object", - "properties": { - "eval_candidate": { - "$ref": "#/components/schemas/EvalCandidate", - "description": "The candidate to evaluate." - }, - "scoring_params": { - "type": "object", - "additionalProperties": { - "$ref": "#/components/schemas/ScoringFnParams" - }, - "description": "Map between scoring function id and parameters for each scoring function you want to run" - }, - "num_examples": { - "type": "integer", - "description": "(Optional) The number of examples to evaluate. If not provided, all examples in the dataset will be evaluated" - } - }, - "additionalProperties": false, - "required": [ - "eval_candidate", - "scoring_params" - ], - "title": "BenchmarkConfig", - "description": "A benchmark configuration for evaluation." - }, - "EvalCandidate": { - "oneOf": [ - { - "$ref": "#/components/schemas/ModelCandidate" - }, - { - "$ref": "#/components/schemas/AgentCandidate" - } - ], - "discriminator": { - "propertyName": "type", - "mapping": { - "model": "#/components/schemas/ModelCandidate", - "agent": "#/components/schemas/AgentCandidate" - } - } - }, - "LLMAsJudgeScoringFnParams": { - "type": "object", - "properties": { - "type": { - "type": "string", - "const": "llm_as_judge", - "default": "llm_as_judge" - }, - "judge_model": { - "type": "string" - }, - "prompt_template": { - "type": "string" - }, - "judge_score_regexes": { - "type": "array", - "items": { - "type": "string" - } - }, - "aggregation_functions": { - "type": "array", - "items": { - "$ref": "#/components/schemas/AggregationFunctionType" - } - } - }, - "additionalProperties": false, - "required": [ - "type", - "judge_model" - ], - "title": "LLMAsJudgeScoringFnParams" - }, - "ModelCandidate": { - "type": "object", - "properties": { - "type": { - "type": "string", - "const": "model", - "default": "model" - }, - "model": { - "type": "string", - "description": "The model ID to evaluate." - }, - "sampling_params": { - "$ref": "#/components/schemas/SamplingParams", - "description": "The sampling parameters for the model." - }, - "system_message": { - "$ref": "#/components/schemas/SystemMessage", - "description": "(Optional) The system message providing instructions or context to the model." - } - }, - "additionalProperties": false, - "required": [ - "type", - "model", - "sampling_params" - ], - "title": "ModelCandidate", - "description": "A model candidate for evaluation." - }, - "RegexParserScoringFnParams": { - "type": "object", - "properties": { - "type": { - "type": "string", - "const": "regex_parser", - "default": "regex_parser" - }, - "parsing_regexes": { - "type": "array", - "items": { - "type": "string" - } - }, - "aggregation_functions": { - "type": "array", - "items": { - "$ref": "#/components/schemas/AggregationFunctionType" - } - } - }, - "additionalProperties": false, - "required": [ - "type" - ], - "title": "RegexParserScoringFnParams" - }, - "ScoringFnParams": { - "oneOf": [ - { - "$ref": "#/components/schemas/LLMAsJudgeScoringFnParams" - }, - { - "$ref": "#/components/schemas/RegexParserScoringFnParams" - }, - { - "$ref": "#/components/schemas/BasicScoringFnParams" - } - ], - "discriminator": { - "propertyName": "type", - "mapping": { - "llm_as_judge": "#/components/schemas/LLMAsJudgeScoringFnParams", - "regex_parser": "#/components/schemas/RegexParserScoringFnParams", - "basic": "#/components/schemas/BasicScoringFnParams" - } - } - }, - "EvaluateRowsRequest": { - "type": "object", - "properties": { - "input_rows": { - "type": "array", - "items": { - "type": "object", - "additionalProperties": { - "oneOf": [ - { - "type": "null" - }, - { - "type": "boolean" - }, - { - "type": "number" - }, - { - "type": "string" - }, - { - "type": "array" - }, - { - "type": "object" - } - ] - } - }, - "description": "The rows to evaluate." - }, - "scoring_functions": { - "type": "array", - "items": { - "type": "string" - }, - "description": "The scoring functions to use for the evaluation." - }, - "benchmark_config": { - "$ref": "#/components/schemas/BenchmarkConfig", - "description": "The configuration for the benchmark." - } - }, - "additionalProperties": false, - "required": [ - "input_rows", - "scoring_functions", - "benchmark_config" - ], - "title": "EvaluateRowsRequest" - }, - "EvaluateResponse": { - "type": "object", - "properties": { - "generations": { - "type": "array", - "items": { - "type": "object", - "additionalProperties": { - "oneOf": [ - { - "type": "null" - }, - { - "type": "boolean" - }, - { - "type": "number" - }, - { - "type": "string" - }, - { - "type": "array" - }, - { - "type": "object" - } - ] - } - }, - "description": "The generations from the evaluation." - }, - "scores": { - "type": "object", - "additionalProperties": { - "$ref": "#/components/schemas/ScoringResult" - }, - "description": "The scores from the evaluation." - } - }, - "additionalProperties": false, - "required": [ - "generations", - "scores" - ], - "title": "EvaluateResponse", - "description": "The response from an evaluation." - }, - "ScoringResult": { - "type": "object", - "properties": { - "score_rows": { - "type": "array", - "items": { - "type": "object", - "additionalProperties": { - "oneOf": [ - { - "type": "null" - }, - { - "type": "boolean" - }, - { - "type": "number" - }, - { - "type": "string" - }, - { - "type": "array" - }, - { - "type": "object" - } - ] - } - }, - "description": "The scoring result for each row. Each row is a map of column name to value." - }, - "aggregated_results": { - "type": "object", - "additionalProperties": { - "oneOf": [ - { - "type": "null" - }, - { - "type": "boolean" - }, - { - "type": "number" - }, - { - "type": "string" - }, - { - "type": "array" - }, - { - "type": "object" - } - ] - }, - "description": "Map of metric name to aggregated value" - } - }, - "additionalProperties": false, - "required": [ - "score_rows", - "aggregated_results" - ], - "title": "ScoringResult", - "description": "A scoring result for a single row." - }, "Agent": { "type": "object", "properties": { @@ -7705,7 +7329,8 @@ "completed", "in_progress", "failed", - "scheduled" + "scheduled", + "cancelled" ], "title": "JobStatus" }, @@ -8400,30 +8025,6 @@ "title": "IterrowsResponse", "description": "A paginated list of rows from a dataset." }, - "Job": { - "type": "object", - "properties": { - "job_id": { - "type": "string" - }, - "status": { - "type": "string", - "enum": [ - "completed", - "in_progress", - "failed", - "scheduled" - ], - "title": "JobStatus" - } - }, - "additionalProperties": false, - "required": [ - "job_id", - "status" - ], - "title": "Job" - }, "ListAgentSessionsResponse": { "type": "object", "properties": { @@ -10007,16 +9608,21 @@ "RunRequest": { "type": "object", "properties": { - "benchmark_config": { - "$ref": "#/components/schemas/BenchmarkConfig", - "description": "The configuration for the benchmark." + "task": { + "$ref": "#/components/schemas/EvaluationTask", + "description": "The task to evaluate. To specify a task, one of the following must be provided: - `benchmark_id`: Run evaluation task against a benchmark_id - `dataset_id` and `grader_ids`: Run evaluation task against a dataset_id and a list of grader_ids - `data_source` and `grader_ids`: Run evaluation task against a data source (e.g. rows, uri, etc.) and a list of grader_ids" + }, + "candidate": { + "$ref": "#/components/schemas/EvaluationCandidate", + "description": "The candidate to evaluate." } }, "additionalProperties": false, "required": [ - "benchmark_config" + "task", + "candidate" ], - "title": "RunEvalRequest" + "title": "RunRequest" }, "RunShieldRequest": { "type": "object", @@ -10123,128 +9729,6 @@ ], "title": "SaveSpansToDatasetRequest" }, - "ScoreRequest": { - "type": "object", - "properties": { - "input_rows": { - "type": "array", - "items": { - "type": "object", - "additionalProperties": { - "oneOf": [ - { - "type": "null" - }, - { - "type": "boolean" - }, - { - "type": "number" - }, - { - "type": "string" - }, - { - "type": "array" - }, - { - "type": "object" - } - ] - } - }, - "description": "The rows to score." - }, - "scoring_functions": { - "type": "object", - "additionalProperties": { - "oneOf": [ - { - "$ref": "#/components/schemas/ScoringFnParams" - }, - { - "type": "null" - } - ] - }, - "description": "The scoring functions to use for the scoring." - } - }, - "additionalProperties": false, - "required": [ - "input_rows", - "scoring_functions" - ], - "title": "ScoreRequest" - }, - "ScoreResponse": { - "type": "object", - "properties": { - "results": { - "type": "object", - "additionalProperties": { - "$ref": "#/components/schemas/ScoringResult" - }, - "description": "A map of scoring function name to ScoringResult." - } - }, - "additionalProperties": false, - "required": [ - "results" - ], - "title": "ScoreResponse", - "description": "The response from scoring." - }, - "ScoreBatchRequest": { - "type": "object", - "properties": { - "dataset_id": { - "type": "string" - }, - "scoring_functions": { - "type": "object", - "additionalProperties": { - "oneOf": [ - { - "$ref": "#/components/schemas/ScoringFnParams" - }, - { - "type": "null" - } - ] - } - }, - "save_results_dataset": { - "type": "boolean" - } - }, - "additionalProperties": false, - "required": [ - "dataset_id", - "scoring_functions", - "save_results_dataset" - ], - "title": "ScoreBatchRequest" - }, - "ScoreBatchResponse": { - "type": "object", - "properties": { - "dataset_id": { - "type": "string" - }, - "results": { - "type": "object", - "additionalProperties": { - "$ref": "#/components/schemas/ScoringResult" - } - } - }, - "additionalProperties": false, - "required": [ - "results" - ], - "title": "ScoreBatchResponse" - }, "AlgorithmConfig": { "oneOf": [ { diff --git a/docs/_static/llama-stack-spec.yaml b/docs/_static/llama-stack-spec.yaml index 42ea4bd29..66044bb65 100644 --- a/docs/_static/llama-stack-spec.yaml +++ b/docs/_static/llama-stack-spec.yaml @@ -1562,109 +1562,6 @@ paths: required: false schema: type: integer - /v1/eval/benchmarks/{benchmark_id}/jobs/{job_id}: - get: - responses: - '200': - description: The status of the evaluationjob. - content: - application/json: - schema: - $ref: '#/components/schemas/Job' - '400': - $ref: '#/components/responses/BadRequest400' - '429': - $ref: >- - #/components/responses/TooManyRequests429 - '500': - $ref: >- - #/components/responses/InternalServerError500 - default: - $ref: '#/components/responses/DefaultError' - tags: - - Eval - description: Get the status of a job. - parameters: - - name: benchmark_id - in: path - description: >- - The ID of the benchmark to run the evaluation on. - required: true - schema: - type: string - - name: job_id - in: path - description: The ID of the job to get the status of. - required: true - schema: - type: string - delete: - responses: - '200': - description: OK - '400': - $ref: '#/components/responses/BadRequest400' - '429': - $ref: >- - #/components/responses/TooManyRequests429 - '500': - $ref: >- - #/components/responses/InternalServerError500 - default: - $ref: '#/components/responses/DefaultError' - tags: - - Eval - description: Cancel a job. - parameters: - - name: benchmark_id - in: path - description: >- - The ID of the benchmark to run the evaluation on. - required: true - schema: - type: string - - name: job_id - in: path - description: The ID of the job to cancel. - required: true - schema: - type: string - /v1/eval/benchmarks/{benchmark_id}/jobs/{job_id}/result: - get: - responses: - '200': - description: The result of the job. - content: - application/json: - schema: - $ref: '#/components/schemas/EvaluateResponse' - '400': - $ref: '#/components/responses/BadRequest400' - '429': - $ref: >- - #/components/responses/TooManyRequests429 - '500': - $ref: >- - #/components/responses/InternalServerError500 - default: - $ref: '#/components/responses/DefaultError' - tags: - - Eval - description: Get the result of a job. - parameters: - - name: benchmark_id - in: path - description: >- - The ID of the benchmark to run the evaluation on. - required: true - schema: - type: string - - name: job_id - in: path - description: The ID of the job to get the result of. - required: true - schema: - type: string /v1/agents/{agent_id}/sessions: get: responses: @@ -1923,7 +1820,7 @@ paths: default: $ref: '#/components/responses/DefaultError' tags: - - Providers + - Models description: '' parameters: [] post: @@ -1974,7 +1871,7 @@ paths: default: $ref: '#/components/responses/DefaultError' tags: - - Inspect + - Providers description: '' parameters: [] /v1/inspect/routes: @@ -4448,252 +4345,6 @@ components: title: EmbeddingsResponse description: >- Response containing generated embeddings. - AgentCandidate: - type: object - properties: - type: - type: string - const: agent - default: agent - config: - $ref: '#/components/schemas/AgentConfig' - description: >- - The configuration for the agent candidate. - additionalProperties: false - required: - - type - - config - title: AgentCandidate - description: An agent candidate for evaluation. - AggregationFunctionType: - type: string - enum: - - average - - weighted_average - - median - - categorical_count - - accuracy - title: AggregationFunctionType - BasicScoringFnParams: - type: object - properties: - type: - type: string - const: basic - default: basic - aggregation_functions: - type: array - items: - $ref: '#/components/schemas/AggregationFunctionType' - additionalProperties: false - required: - - type - title: BasicScoringFnParams - BenchmarkConfig: - type: object - properties: - eval_candidate: - $ref: '#/components/schemas/EvalCandidate' - description: The candidate to evaluate. - scoring_params: - type: object - additionalProperties: - $ref: '#/components/schemas/ScoringFnParams' - description: >- - Map between scoring function id and parameters for each scoring function - you want to run - num_examples: - type: integer - description: >- - (Optional) The number of examples to evaluate. If not provided, all examples - in the dataset will be evaluated - additionalProperties: false - required: - - eval_candidate - - scoring_params - title: BenchmarkConfig - description: >- - A benchmark configuration for evaluation. - EvalCandidate: - oneOf: - - $ref: '#/components/schemas/ModelCandidate' - - $ref: '#/components/schemas/AgentCandidate' - discriminator: - propertyName: type - mapping: - model: '#/components/schemas/ModelCandidate' - agent: '#/components/schemas/AgentCandidate' - LLMAsJudgeScoringFnParams: - type: object - properties: - type: - type: string - const: llm_as_judge - default: llm_as_judge - judge_model: - type: string - prompt_template: - type: string - judge_score_regexes: - type: array - items: - type: string - aggregation_functions: - type: array - items: - $ref: '#/components/schemas/AggregationFunctionType' - additionalProperties: false - required: - - type - - judge_model - title: LLMAsJudgeScoringFnParams - ModelCandidate: - type: object - properties: - type: - type: string - const: model - default: model - model: - type: string - description: The model ID to evaluate. - sampling_params: - $ref: '#/components/schemas/SamplingParams' - description: The sampling parameters for the model. - system_message: - $ref: '#/components/schemas/SystemMessage' - description: >- - (Optional) The system message providing instructions or context to the - model. - additionalProperties: false - required: - - type - - model - - sampling_params - title: ModelCandidate - description: A model candidate for evaluation. - RegexParserScoringFnParams: - type: object - properties: - type: - type: string - const: regex_parser - default: regex_parser - parsing_regexes: - type: array - items: - type: string - aggregation_functions: - type: array - items: - $ref: '#/components/schemas/AggregationFunctionType' - additionalProperties: false - required: - - type - title: RegexParserScoringFnParams - ScoringFnParams: - oneOf: - - $ref: '#/components/schemas/LLMAsJudgeScoringFnParams' - - $ref: '#/components/schemas/RegexParserScoringFnParams' - - $ref: '#/components/schemas/BasicScoringFnParams' - discriminator: - propertyName: type - mapping: - llm_as_judge: '#/components/schemas/LLMAsJudgeScoringFnParams' - regex_parser: '#/components/schemas/RegexParserScoringFnParams' - basic: '#/components/schemas/BasicScoringFnParams' - EvaluateRowsRequest: - type: object - properties: - input_rows: - type: array - items: - type: object - additionalProperties: - oneOf: - - type: 'null' - - type: boolean - - type: number - - type: string - - type: array - - type: object - description: The rows to evaluate. - scoring_functions: - type: array - items: - type: string - description: >- - The scoring functions to use for the evaluation. - benchmark_config: - $ref: '#/components/schemas/BenchmarkConfig' - description: The configuration for the benchmark. - additionalProperties: false - required: - - input_rows - - scoring_functions - - benchmark_config - title: EvaluateRowsRequest - EvaluateResponse: - type: object - properties: - generations: - type: array - items: - type: object - additionalProperties: - oneOf: - - type: 'null' - - type: boolean - - type: number - - type: string - - type: array - - type: object - description: The generations from the evaluation. - scores: - type: object - additionalProperties: - $ref: '#/components/schemas/ScoringResult' - description: The scores from the evaluation. - additionalProperties: false - required: - - generations - - scores - title: EvaluateResponse - description: The response from an evaluation. - ScoringResult: - type: object - properties: - score_rows: - type: array - items: - type: object - additionalProperties: - oneOf: - - type: 'null' - - type: boolean - - type: number - - type: string - - type: array - - type: object - description: >- - The scoring result for each row. Each row is a map of column name to value. - aggregated_results: - type: object - additionalProperties: - oneOf: - - type: 'null' - - type: boolean - - type: number - - type: string - - type: array - - type: object - description: Map of metric name to aggregated value - additionalProperties: false - required: - - score_rows - - aggregated_results - title: ScoringResult - description: A scoring result for a single row. Agent: type: object properties: @@ -5451,6 +5102,7 @@ components: - in_progress - failed - scheduled + - cancelled title: JobStatus scheduled_at: type: string @@ -5901,24 +5553,6 @@ components: - data title: IterrowsResponse description: A paginated list of rows from a dataset. - Job: - type: object - properties: - job_id: - type: string - status: - type: string - enum: - - completed - - in_progress - - failed - - scheduled - title: JobStatus - additionalProperties: false - required: - - job_id - - status - title: Job ListAgentSessionsResponse: type: object properties: @@ -6984,8 +6618,9 @@ components: description: The candidate to evaluate. additionalProperties: false required: - - benchmark_config - title: RunEvalRequest + - task + - candidate + title: RunRequest RunShieldRequest: type: object properties: @@ -7058,81 +6693,6 @@ components: - attributes_to_save - dataset_id title: SaveSpansToDatasetRequest - ScoreRequest: - type: object - properties: - input_rows: - type: array - items: - type: object - additionalProperties: - oneOf: - - type: 'null' - - type: boolean - - type: number - - type: string - - type: array - - type: object - description: The rows to score. - scoring_functions: - type: object - additionalProperties: - oneOf: - - $ref: '#/components/schemas/ScoringFnParams' - - type: 'null' - description: >- - The scoring functions to use for the scoring. - additionalProperties: false - required: - - input_rows - - scoring_functions - title: ScoreRequest - ScoreResponse: - type: object - properties: - results: - type: object - additionalProperties: - $ref: '#/components/schemas/ScoringResult' - description: >- - A map of scoring function name to ScoringResult. - additionalProperties: false - required: - - results - title: ScoreResponse - description: The response from scoring. - ScoreBatchRequest: - type: object - properties: - dataset_id: - type: string - scoring_functions: - type: object - additionalProperties: - oneOf: - - $ref: '#/components/schemas/ScoringFnParams' - - type: 'null' - save_results_dataset: - type: boolean - additionalProperties: false - required: - - dataset_id - - scoring_functions - - save_results_dataset - title: ScoreBatchRequest - ScoreBatchResponse: - type: object - properties: - dataset_id: - type: string - results: - type: object - additionalProperties: - $ref: '#/components/schemas/ScoringResult' - additionalProperties: false - required: - - results - title: ScoreBatchResponse AlgorithmConfig: oneOf: - $ref: '#/components/schemas/LoraFinetuningConfig' diff --git a/llama_stack/apis/eval/eval.py b/llama_stack/apis/eval/eval.py index 83b50d175..0e5959c37 100644 --- a/llama_stack/apis/eval/eval.py +++ b/llama_stack/apis/eval/eval.py @@ -141,4 +141,3 @@ class Eval(Protocol): :param job_id: The ID of the job to get the result of. :return: The result of the job. """ - diff --git a/llama_stack/apis/scoring_functions/scoring_functions.py b/llama_stack/apis/scoring_functions/scoring_functions.py index 4f6f4d824..4f85947dd 100644 --- a/llama_stack/apis/scoring_functions/scoring_functions.py +++ b/llama_stack/apis/scoring_functions/scoring_functions.py @@ -146,4 +146,3 @@ class ScoringFunctions(Protocol): provider_id: Optional[str] = None, params: Optional[ScoringFnParams] = None, ) -> None: ... - diff --git a/llama_stack/distribution/routers/routing_tables.py b/llama_stack/distribution/routers/routing_tables.py index d56d6f672..84fe52632 100644 --- a/llama_stack/distribution/routers/routing_tables.py +++ b/llama_stack/distribution/routers/routing_tables.py @@ -43,7 +43,6 @@ from llama_stack.distribution.datatypes import ( RoutableObject, RoutableObjectWithProvider, RoutedProtocol, - ScoringFnWithACL, ShieldWithACL, ToolGroupWithACL, ToolWithACL, diff --git a/llama_stack/providers/registry/eval.py b/llama_stack/providers/registry/eval.py index b6aaacaa7..f3e42c531 100644 --- a/llama_stack/providers/registry/eval.py +++ b/llama_stack/providers/registry/eval.py @@ -26,4 +26,3 @@ def available_providers() -> List[ProviderSpec]: ], ), ] - diff --git a/llama_stack/templates/open-benchmark/run.yaml b/llama_stack/templates/open-benchmark/run.yaml index 6ed8a7a71..d9ca11a84 100644 --- a/llama_stack/templates/open-benchmark/run.yaml +++ b/llama_stack/templates/open-benchmark/run.yaml @@ -166,7 +166,18 @@ datasets: uri: huggingface://datasets/llamastack/bfcl_v3?split=train metadata: {} dataset_id: bfcl - provider_id: huggingface +- purpose: eval/messages-answer + source: + type: uri + uri: huggingface://datasets/llamastack/IfEval?split=train + metadata: {} + dataset_id: ifeval +- purpose: eval/messages-answer + source: + type: uri + uri: huggingface://datasets/llamastack/docvqa?split=val + metadata: {} + dataset_id: docvqa benchmarks: [] tool_groups: - toolgroup_id: builtin::websearch