From 3f8c7a584aede5a666bdc3eb5e7ac21185dc0ca8 Mon Sep 17 00:00:00 2001
From: Xi Yan <xiyan@meta.com>
Date: Sun, 23 Mar 2025 16:00:48 -0700
Subject: [PATCH] precommit

---
 distributions/dependencies.json               |  57 --
 docs/_static/llama-stack-spec.html            | 542 +-----------------
 docs/_static/llama-stack-spec.yaml            | 452 +--------------
 llama_stack/apis/eval/eval.py                 |   1 -
 .../scoring_functions/scoring_functions.py    |   1 -
 .../distribution/routers/routing_tables.py    |   1 -
 llama_stack/providers/registry/eval.py        |   1 -
 llama_stack/templates/open-benchmark/run.yaml |  13 +-
 8 files changed, 31 insertions(+), 1037 deletions(-)

diff --git a/distributions/dependencies.json b/distributions/dependencies.json
index fe1509483..1767523d6 100644
--- a/distributions/dependencies.json
+++ b/distributions/dependencies.json
@@ -6,12 +6,10 @@
     "chardet",
     "chromadb-client",
     "datasets",
-    "emoji",
     "faiss-cpu",
     "fastapi",
     "fire",
     "httpx",
-    "langdetect",
     "matplotlib",
     "mcp",
     "nltk",
@@ -23,7 +21,6 @@
     "psycopg2-binary",
     "pymongo",
     "pypdf",
-    "pythainlp",
     "redis",
     "requests",
     "scikit-learn",
@@ -40,12 +37,10 @@
     "chardet",
     "chromadb-client",
     "datasets",
-    "emoji",
     "faiss-cpu",
     "fastapi",
     "fire",
     "httpx",
-    "langdetect",
     "matplotlib",
     "nltk",
     "numpy",
@@ -56,7 +51,6 @@
     "psycopg2-binary",
     "pymongo",
     "pypdf",
-    "pythainlp",
     "redis",
     "requests",
     "scikit-learn",
@@ -74,12 +68,10 @@
     "chardet",
     "chromadb-client",
     "datasets",
-    "emoji",
     "fastapi",
     "fire",
     "fireworks-ai",
     "httpx",
-    "langdetect",
     "matplotlib",
     "mcp",
     "nltk",
@@ -91,7 +83,6 @@
     "psycopg2-binary",
     "pymongo",
     "pypdf",
-    "pythainlp",
     "redis",
     "requests",
     "scikit-learn",
@@ -111,13 +102,11 @@
     "chardet",
     "chromadb-client",
     "datasets",
-    "emoji",
     "faiss-cpu",
     "fastapi",
     "fire",
     "httpx",
     "huggingface_hub",
-    "langdetect",
     "matplotlib",
     "nltk",
     "numpy",
@@ -128,7 +117,6 @@
     "psycopg2-binary",
     "pymongo",
     "pypdf",
-    "pythainlp",
     "redis",
     "requests",
     "scikit-learn",
@@ -146,12 +134,10 @@
     "chardet",
     "chromadb-client",
     "datasets",
-    "emoji",
     "fastapi",
     "fire",
     "fireworks-ai",
     "httpx",
-    "langdetect",
     "litellm",
     "matplotlib",
     "mcp",
@@ -164,7 +150,6 @@
     "psycopg2-binary",
     "pymongo",
     "pypdf",
-    "pythainlp",
     "redis",
     "requests",
     "scikit-learn",
@@ -183,13 +168,11 @@
     "chardet",
     "chromadb-client",
     "datasets",
-    "emoji",
     "faiss-cpu",
     "fastapi",
     "fire",
     "fireworks-ai",
     "httpx",
-    "langdetect",
     "matplotlib",
     "mcp",
     "nltk",
@@ -201,7 +184,6 @@
     "psycopg2-binary",
     "pymongo",
     "pypdf",
-    "pythainlp",
     "redis",
     "requests",
     "scikit-learn",
@@ -218,12 +200,10 @@
     "blobfile",
     "chardet",
     "datasets",
-    "emoji",
     "faiss-cpu",
     "fastapi",
     "fire",
     "httpx",
-    "langdetect",
     "litellm",
     "matplotlib",
     "nltk",
@@ -235,7 +215,6 @@
     "psycopg2-binary",
     "pymongo",
     "pypdf",
-    "pythainlp",
     "redis",
     "requests",
     "scikit-learn",
@@ -252,13 +231,11 @@
     "chardet",
     "chromadb-client",
     "datasets",
-    "emoji",
     "faiss-cpu",
     "fastapi",
     "fire",
     "httpx",
     "huggingface_hub",
-    "langdetect",
     "matplotlib",
     "mcp",
     "nltk",
@@ -270,7 +247,6 @@
     "psycopg2-binary",
     "pymongo",
     "pypdf",
-    "pythainlp",
     "redis",
     "requests",
     "scikit-learn",
@@ -287,13 +263,11 @@
     "chardet",
     "chromadb-client",
     "datasets",
-    "emoji",
     "faiss-cpu",
     "fastapi",
     "fire",
     "httpx",
     "huggingface_hub",
-    "langdetect",
     "matplotlib",
     "mcp",
     "nltk",
@@ -305,7 +279,6 @@
     "psycopg2-binary",
     "pymongo",
     "pypdf",
-    "pythainlp",
     "redis",
     "requests",
     "scikit-learn",
@@ -324,13 +297,11 @@
     "chardet",
     "chromadb-client",
     "datasets",
-    "emoji",
     "fairscale",
     "faiss-cpu",
     "fastapi",
     "fire",
     "httpx",
-    "langdetect",
     "lm-format-enforcer",
     "matplotlib",
     "mcp",
@@ -343,7 +314,6 @@
     "psycopg2-binary",
     "pymongo",
     "pypdf",
-    "pythainlp",
     "redis",
     "requests",
     "scikit-learn",
@@ -364,14 +334,12 @@
     "chardet",
     "chromadb-client",
     "datasets",
-    "emoji",
     "fairscale",
     "faiss-cpu",
     "fastapi",
     "fbgemm-gpu",
     "fire",
     "httpx",
-    "langdetect",
     "lm-format-enforcer",
     "matplotlib",
     "mcp",
@@ -384,7 +352,6 @@
     "psycopg2-binary",
     "pymongo",
     "pypdf",
-    "pythainlp",
     "redis",
     "requests",
     "scikit-learn",
@@ -403,12 +370,10 @@
     "aiosqlite",
     "blobfile",
     "chardet",
-    "emoji",
     "faiss-cpu",
     "fastapi",
     "fire",
     "httpx",
-    "langdetect",
     "matplotlib",
     "nltk",
     "numpy",
@@ -420,7 +385,6 @@
     "psycopg2-binary",
     "pymongo",
     "pypdf",
-    "pythainlp",
     "redis",
     "requests",
     "scikit-learn",
@@ -437,12 +401,10 @@
     "chardet",
     "chromadb-client",
     "datasets",
-    "emoji",
     "faiss-cpu",
     "fastapi",
     "fire",
     "httpx",
-    "langdetect",
     "matplotlib",
     "mcp",
     "nltk",
@@ -455,7 +417,6 @@
     "psycopg2-binary",
     "pymongo",
     "pypdf",
-    "pythainlp",
     "redis",
     "requests",
     "scikit-learn",
@@ -471,11 +432,9 @@
     "chardet",
     "chromadb-client",
     "datasets",
-    "emoji",
     "fastapi",
     "fire",
     "httpx",
-    "langdetect",
     "litellm",
     "matplotlib",
     "mcp",
@@ -488,7 +447,6 @@
     "psycopg2-binary",
     "pymongo",
     "pypdf",
-    "pythainlp",
     "redis",
     "requests",
     "scikit-learn",
@@ -506,12 +464,10 @@
     "chardet",
     "chromadb-client",
     "datasets",
-    "emoji",
     "faiss-cpu",
     "fastapi",
     "fire",
     "httpx",
-    "langdetect",
     "matplotlib",
     "mcp",
     "nltk",
@@ -523,7 +479,6 @@
     "psycopg2-binary",
     "pymongo",
     "pypdf",
-    "pythainlp",
     "redis",
     "requests",
     "scikit-learn",
@@ -541,12 +496,10 @@
     "chardet",
     "chromadb-client",
     "datasets",
-    "emoji",
     "faiss-cpu",
     "fastapi",
     "fire",
     "httpx",
-    "langdetect",
     "matplotlib",
     "mcp",
     "nltk",
@@ -559,7 +512,6 @@
     "psycopg2-binary",
     "pymongo",
     "pypdf",
-    "pythainlp",
     "redis",
     "requests",
     "scikit-learn",
@@ -607,13 +559,11 @@
     "chardet",
     "chromadb-client",
     "datasets",
-    "emoji",
     "faiss-cpu",
     "fastapi",
     "fire",
     "httpx",
     "huggingface_hub",
-    "langdetect",
     "matplotlib",
     "mcp",
     "nltk",
@@ -625,7 +575,6 @@
     "psycopg2-binary",
     "pymongo",
     "pypdf",
-    "pythainlp",
     "redis",
     "requests",
     "scikit-learn",
@@ -643,12 +592,10 @@
     "chardet",
     "chromadb-client",
     "datasets",
-    "emoji",
     "faiss-cpu",
     "fastapi",
     "fire",
     "httpx",
-    "langdetect",
     "matplotlib",
     "mcp",
     "nltk",
@@ -660,7 +607,6 @@
     "psycopg2-binary",
     "pymongo",
     "pypdf",
-    "pythainlp",
     "redis",
     "requests",
     "scikit-learn",
@@ -679,12 +625,10 @@
     "chardet",
     "chromadb-client",
     "datasets",
-    "emoji",
     "faiss-cpu",
     "fastapi",
     "fire",
     "httpx",
-    "langdetect",
     "matplotlib",
     "mcp",
     "nltk",
@@ -696,7 +640,6 @@
     "psycopg2-binary",
     "pymongo",
     "pypdf",
-    "pythainlp",
     "redis",
     "requests",
     "scikit-learn",
diff --git a/docs/_static/llama-stack-spec.html b/docs/_static/llama-stack-spec.html
index a7b95d9fe..ab73dc345 100644
--- a/docs/_static/llama-stack-spec.html
+++ b/docs/_static/llama-stack-spec.html
@@ -2285,7 +2285,7 @@
                         "content": {
                             "application/json": {
                                 "schema": {
-                                    "$ref": "#/components/schemas/Job"
+                                    "$ref": "#/components/schemas/ListAgentSessionsResponse"
                                 }
                             }
                         }
@@ -6192,382 +6192,6 @@
                 "title": "EmbeddingsResponse",
                 "description": "Response containing generated embeddings."
             },
-            "AgentCandidate": {
-                "type": "object",
-                "properties": {
-                    "type": {
-                        "type": "string",
-                        "const": "agent",
-                        "default": "agent"
-                    },
-                    "config": {
-                        "$ref": "#/components/schemas/AgentConfig",
-                        "description": "The configuration for the agent candidate."
-                    }
-                },
-                "additionalProperties": false,
-                "required": [
-                    "type",
-                    "config"
-                ],
-                "title": "AgentCandidate",
-                "description": "An agent candidate for evaluation."
-            },
-            "AggregationFunctionType": {
-                "type": "string",
-                "enum": [
-                    "average",
-                    "weighted_average",
-                    "median",
-                    "categorical_count",
-                    "accuracy"
-                ],
-                "title": "AggregationFunctionType"
-            },
-            "BasicScoringFnParams": {
-                "type": "object",
-                "properties": {
-                    "type": {
-                        "type": "string",
-                        "const": "basic",
-                        "default": "basic"
-                    },
-                    "aggregation_functions": {
-                        "type": "array",
-                        "items": {
-                            "$ref": "#/components/schemas/AggregationFunctionType"
-                        }
-                    }
-                },
-                "additionalProperties": false,
-                "required": [
-                    "type"
-                ],
-                "title": "BasicScoringFnParams"
-            },
-            "BenchmarkConfig": {
-                "type": "object",
-                "properties": {
-                    "eval_candidate": {
-                        "$ref": "#/components/schemas/EvalCandidate",
-                        "description": "The candidate to evaluate."
-                    },
-                    "scoring_params": {
-                        "type": "object",
-                        "additionalProperties": {
-                            "$ref": "#/components/schemas/ScoringFnParams"
-                        },
-                        "description": "Map between scoring function id and parameters for each scoring function you want to run"
-                    },
-                    "num_examples": {
-                        "type": "integer",
-                        "description": "(Optional) The number of examples to evaluate. If not provided, all examples in the dataset will be evaluated"
-                    }
-                },
-                "additionalProperties": false,
-                "required": [
-                    "eval_candidate",
-                    "scoring_params"
-                ],
-                "title": "BenchmarkConfig",
-                "description": "A benchmark configuration for evaluation."
-            },
-            "EvalCandidate": {
-                "oneOf": [
-                    {
-                        "$ref": "#/components/schemas/ModelCandidate"
-                    },
-                    {
-                        "$ref": "#/components/schemas/AgentCandidate"
-                    }
-                ],
-                "discriminator": {
-                    "propertyName": "type",
-                    "mapping": {
-                        "model": "#/components/schemas/ModelCandidate",
-                        "agent": "#/components/schemas/AgentCandidate"
-                    }
-                }
-            },
-            "LLMAsJudgeScoringFnParams": {
-                "type": "object",
-                "properties": {
-                    "type": {
-                        "type": "string",
-                        "const": "llm_as_judge",
-                        "default": "llm_as_judge"
-                    },
-                    "judge_model": {
-                        "type": "string"
-                    },
-                    "prompt_template": {
-                        "type": "string"
-                    },
-                    "judge_score_regexes": {
-                        "type": "array",
-                        "items": {
-                            "type": "string"
-                        }
-                    },
-                    "aggregation_functions": {
-                        "type": "array",
-                        "items": {
-                            "$ref": "#/components/schemas/AggregationFunctionType"
-                        }
-                    }
-                },
-                "additionalProperties": false,
-                "required": [
-                    "type",
-                    "judge_model"
-                ],
-                "title": "LLMAsJudgeScoringFnParams"
-            },
-            "ModelCandidate": {
-                "type": "object",
-                "properties": {
-                    "type": {
-                        "type": "string",
-                        "const": "model",
-                        "default": "model"
-                    },
-                    "model": {
-                        "type": "string",
-                        "description": "The model ID to evaluate."
-                    },
-                    "sampling_params": {
-                        "$ref": "#/components/schemas/SamplingParams",
-                        "description": "The sampling parameters for the model."
-                    },
-                    "system_message": {
-                        "$ref": "#/components/schemas/SystemMessage",
-                        "description": "(Optional) The system message providing instructions or context to the model."
-                    }
-                },
-                "additionalProperties": false,
-                "required": [
-                    "type",
-                    "model",
-                    "sampling_params"
-                ],
-                "title": "ModelCandidate",
-                "description": "A model candidate for evaluation."
-            },
-            "RegexParserScoringFnParams": {
-                "type": "object",
-                "properties": {
-                    "type": {
-                        "type": "string",
-                        "const": "regex_parser",
-                        "default": "regex_parser"
-                    },
-                    "parsing_regexes": {
-                        "type": "array",
-                        "items": {
-                            "type": "string"
-                        }
-                    },
-                    "aggregation_functions": {
-                        "type": "array",
-                        "items": {
-                            "$ref": "#/components/schemas/AggregationFunctionType"
-                        }
-                    }
-                },
-                "additionalProperties": false,
-                "required": [
-                    "type"
-                ],
-                "title": "RegexParserScoringFnParams"
-            },
-            "ScoringFnParams": {
-                "oneOf": [
-                    {
-                        "$ref": "#/components/schemas/LLMAsJudgeScoringFnParams"
-                    },
-                    {
-                        "$ref": "#/components/schemas/RegexParserScoringFnParams"
-                    },
-                    {
-                        "$ref": "#/components/schemas/BasicScoringFnParams"
-                    }
-                ],
-                "discriminator": {
-                    "propertyName": "type",
-                    "mapping": {
-                        "llm_as_judge": "#/components/schemas/LLMAsJudgeScoringFnParams",
-                        "regex_parser": "#/components/schemas/RegexParserScoringFnParams",
-                        "basic": "#/components/schemas/BasicScoringFnParams"
-                    }
-                }
-            },
-            "EvaluateRowsRequest": {
-                "type": "object",
-                "properties": {
-                    "input_rows": {
-                        "type": "array",
-                        "items": {
-                            "type": "object",
-                            "additionalProperties": {
-                                "oneOf": [
-                                    {
-                                        "type": "null"
-                                    },
-                                    {
-                                        "type": "boolean"
-                                    },
-                                    {
-                                        "type": "number"
-                                    },
-                                    {
-                                        "type": "string"
-                                    },
-                                    {
-                                        "type": "array"
-                                    },
-                                    {
-                                        "type": "object"
-                                    }
-                                ]
-                            }
-                        },
-                        "description": "The rows to evaluate."
-                    },
-                    "scoring_functions": {
-                        "type": "array",
-                        "items": {
-                            "type": "string"
-                        },
-                        "description": "The scoring functions to use for the evaluation."
-                    },
-                    "benchmark_config": {
-                        "$ref": "#/components/schemas/BenchmarkConfig",
-                        "description": "The configuration for the benchmark."
-                    }
-                },
-                "additionalProperties": false,
-                "required": [
-                    "input_rows",
-                    "scoring_functions",
-                    "benchmark_config"
-                ],
-                "title": "EvaluateRowsRequest"
-            },
-            "EvaluateResponse": {
-                "type": "object",
-                "properties": {
-                    "generations": {
-                        "type": "array",
-                        "items": {
-                            "type": "object",
-                            "additionalProperties": {
-                                "oneOf": [
-                                    {
-                                        "type": "null"
-                                    },
-                                    {
-                                        "type": "boolean"
-                                    },
-                                    {
-                                        "type": "number"
-                                    },
-                                    {
-                                        "type": "string"
-                                    },
-                                    {
-                                        "type": "array"
-                                    },
-                                    {
-                                        "type": "object"
-                                    }
-                                ]
-                            }
-                        },
-                        "description": "The generations from the evaluation."
-                    },
-                    "scores": {
-                        "type": "object",
-                        "additionalProperties": {
-                            "$ref": "#/components/schemas/ScoringResult"
-                        },
-                        "description": "The scores from the evaluation."
-                    }
-                },
-                "additionalProperties": false,
-                "required": [
-                    "generations",
-                    "scores"
-                ],
-                "title": "EvaluateResponse",
-                "description": "The response from an evaluation."
-            },
-            "ScoringResult": {
-                "type": "object",
-                "properties": {
-                    "score_rows": {
-                        "type": "array",
-                        "items": {
-                            "type": "object",
-                            "additionalProperties": {
-                                "oneOf": [
-                                    {
-                                        "type": "null"
-                                    },
-                                    {
-                                        "type": "boolean"
-                                    },
-                                    {
-                                        "type": "number"
-                                    },
-                                    {
-                                        "type": "string"
-                                    },
-                                    {
-                                        "type": "array"
-                                    },
-                                    {
-                                        "type": "object"
-                                    }
-                                ]
-                            }
-                        },
-                        "description": "The scoring result for each row. Each row is a map of column name to value."
-                    },
-                    "aggregated_results": {
-                        "type": "object",
-                        "additionalProperties": {
-                            "oneOf": [
-                                {
-                                    "type": "null"
-                                },
-                                {
-                                    "type": "boolean"
-                                },
-                                {
-                                    "type": "number"
-                                },
-                                {
-                                    "type": "string"
-                                },
-                                {
-                                    "type": "array"
-                                },
-                                {
-                                    "type": "object"
-                                }
-                            ]
-                        },
-                        "description": "Map of metric name to aggregated value"
-                    }
-                },
-                "additionalProperties": false,
-                "required": [
-                    "score_rows",
-                    "aggregated_results"
-                ],
-                "title": "ScoringResult",
-                "description": "A scoring result for a single row."
-            },
             "Agent": {
                 "type": "object",
                 "properties": {
@@ -7705,7 +7329,8 @@
                             "completed",
                             "in_progress",
                             "failed",
-                            "scheduled"
+                            "scheduled",
+                            "cancelled"
                         ],
                         "title": "JobStatus"
                     },
@@ -8400,30 +8025,6 @@
                 "title": "IterrowsResponse",
                 "description": "A paginated list of rows from a dataset."
             },
-            "Job": {
-                "type": "object",
-                "properties": {
-                    "job_id": {
-                        "type": "string"
-                    },
-                    "status": {
-                        "type": "string",
-                        "enum": [
-                            "completed",
-                            "in_progress",
-                            "failed",
-                            "scheduled"
-                        ],
-                        "title": "JobStatus"
-                    }
-                },
-                "additionalProperties": false,
-                "required": [
-                    "job_id",
-                    "status"
-                ],
-                "title": "Job"
-            },
             "ListAgentSessionsResponse": {
                 "type": "object",
                 "properties": {
@@ -10007,16 +9608,21 @@
             "RunRequest": {
                 "type": "object",
                 "properties": {
-                    "benchmark_config": {
-                        "$ref": "#/components/schemas/BenchmarkConfig",
-                        "description": "The configuration for the benchmark."
+                    "task": {
+                        "$ref": "#/components/schemas/EvaluationTask",
+                        "description": "The task to evaluate. To specify a task, one of the following must be provided: - `benchmark_id`: Run evaluation task against a benchmark_id - `dataset_id` and `grader_ids`: Run evaluation task against a dataset_id and a list of grader_ids - `data_source` and `grader_ids`: Run evaluation task against a data source (e.g. rows, uri, etc.) and a list of grader_ids"
+                    },
+                    "candidate": {
+                        "$ref": "#/components/schemas/EvaluationCandidate",
+                        "description": "The candidate to evaluate."
                     }
                 },
                 "additionalProperties": false,
                 "required": [
-                    "benchmark_config"
+                    "task",
+                    "candidate"
                 ],
-                "title": "RunEvalRequest"
+                "title": "RunRequest"
             },
             "RunShieldRequest": {
                 "type": "object",
@@ -10123,128 +9729,6 @@
                 ],
                 "title": "SaveSpansToDatasetRequest"
             },
-            "ScoreRequest": {
-                "type": "object",
-                "properties": {
-                    "input_rows": {
-                        "type": "array",
-                        "items": {
-                            "type": "object",
-                            "additionalProperties": {
-                                "oneOf": [
-                                    {
-                                        "type": "null"
-                                    },
-                                    {
-                                        "type": "boolean"
-                                    },
-                                    {
-                                        "type": "number"
-                                    },
-                                    {
-                                        "type": "string"
-                                    },
-                                    {
-                                        "type": "array"
-                                    },
-                                    {
-                                        "type": "object"
-                                    }
-                                ]
-                            }
-                        },
-                        "description": "The rows to score."
-                    },
-                    "scoring_functions": {
-                        "type": "object",
-                        "additionalProperties": {
-                            "oneOf": [
-                                {
-                                    "$ref": "#/components/schemas/ScoringFnParams"
-                                },
-                                {
-                                    "type": "null"
-                                }
-                            ]
-                        },
-                        "description": "The scoring functions to use for the scoring."
-                    }
-                },
-                "additionalProperties": false,
-                "required": [
-                    "input_rows",
-                    "scoring_functions"
-                ],
-                "title": "ScoreRequest"
-            },
-            "ScoreResponse": {
-                "type": "object",
-                "properties": {
-                    "results": {
-                        "type": "object",
-                        "additionalProperties": {
-                            "$ref": "#/components/schemas/ScoringResult"
-                        },
-                        "description": "A map of scoring function name to ScoringResult."
-                    }
-                },
-                "additionalProperties": false,
-                "required": [
-                    "results"
-                ],
-                "title": "ScoreResponse",
-                "description": "The response from scoring."
-            },
-            "ScoreBatchRequest": {
-                "type": "object",
-                "properties": {
-                    "dataset_id": {
-                        "type": "string"
-                    },
-                    "scoring_functions": {
-                        "type": "object",
-                        "additionalProperties": {
-                            "oneOf": [
-                                {
-                                    "$ref": "#/components/schemas/ScoringFnParams"
-                                },
-                                {
-                                    "type": "null"
-                                }
-                            ]
-                        }
-                    },
-                    "save_results_dataset": {
-                        "type": "boolean"
-                    }
-                },
-                "additionalProperties": false,
-                "required": [
-                    "dataset_id",
-                    "scoring_functions",
-                    "save_results_dataset"
-                ],
-                "title": "ScoreBatchRequest"
-            },
-            "ScoreBatchResponse": {
-                "type": "object",
-                "properties": {
-                    "dataset_id": {
-                        "type": "string"
-                    },
-                    "results": {
-                        "type": "object",
-                        "additionalProperties": {
-                            "$ref": "#/components/schemas/ScoringResult"
-                        }
-                    }
-                },
-                "additionalProperties": false,
-                "required": [
-                    "results"
-                ],
-                "title": "ScoreBatchResponse"
-            },
             "AlgorithmConfig": {
                 "oneOf": [
                     {
diff --git a/docs/_static/llama-stack-spec.yaml b/docs/_static/llama-stack-spec.yaml
index 42ea4bd29..66044bb65 100644
--- a/docs/_static/llama-stack-spec.yaml
+++ b/docs/_static/llama-stack-spec.yaml
@@ -1562,109 +1562,6 @@ paths:
           required: false
           schema:
             type: integer
-  /v1/eval/benchmarks/{benchmark_id}/jobs/{job_id}:
-    get:
-      responses:
-        '200':
-          description: The status of the evaluationjob.
-          content:
-            application/json:
-              schema:
-                $ref: '#/components/schemas/Job'
-        '400':
-          $ref: '#/components/responses/BadRequest400'
-        '429':
-          $ref: >-
-            #/components/responses/TooManyRequests429
-        '500':
-          $ref: >-
-            #/components/responses/InternalServerError500
-        default:
-          $ref: '#/components/responses/DefaultError'
-      tags:
-        - Eval
-      description: Get the status of a job.
-      parameters:
-        - name: benchmark_id
-          in: path
-          description: >-
-            The ID of the benchmark to run the evaluation on.
-          required: true
-          schema:
-            type: string
-        - name: job_id
-          in: path
-          description: The ID of the job to get the status of.
-          required: true
-          schema:
-            type: string
-    delete:
-      responses:
-        '200':
-          description: OK
-        '400':
-          $ref: '#/components/responses/BadRequest400'
-        '429':
-          $ref: >-
-            #/components/responses/TooManyRequests429
-        '500':
-          $ref: >-
-            #/components/responses/InternalServerError500
-        default:
-          $ref: '#/components/responses/DefaultError'
-      tags:
-        - Eval
-      description: Cancel a job.
-      parameters:
-        - name: benchmark_id
-          in: path
-          description: >-
-            The ID of the benchmark to run the evaluation on.
-          required: true
-          schema:
-            type: string
-        - name: job_id
-          in: path
-          description: The ID of the job to cancel.
-          required: true
-          schema:
-            type: string
-  /v1/eval/benchmarks/{benchmark_id}/jobs/{job_id}/result:
-    get:
-      responses:
-        '200':
-          description: The result of the job.
-          content:
-            application/json:
-              schema:
-                $ref: '#/components/schemas/EvaluateResponse'
-        '400':
-          $ref: '#/components/responses/BadRequest400'
-        '429':
-          $ref: >-
-            #/components/responses/TooManyRequests429
-        '500':
-          $ref: >-
-            #/components/responses/InternalServerError500
-        default:
-          $ref: '#/components/responses/DefaultError'
-      tags:
-        - Eval
-      description: Get the result of a job.
-      parameters:
-        - name: benchmark_id
-          in: path
-          description: >-
-            The ID of the benchmark to run the evaluation on.
-          required: true
-          schema:
-            type: string
-        - name: job_id
-          in: path
-          description: The ID of the job to get the result of.
-          required: true
-          schema:
-            type: string
   /v1/agents/{agent_id}/sessions:
     get:
       responses:
@@ -1923,7 +1820,7 @@ paths:
         default:
           $ref: '#/components/responses/DefaultError'
       tags:
-        - Providers
+        - Models
       description: ''
       parameters: []
     post:
@@ -1974,7 +1871,7 @@ paths:
         default:
           $ref: '#/components/responses/DefaultError'
       tags:
-        - Inspect
+        - Providers
       description: ''
       parameters: []
   /v1/inspect/routes:
@@ -4448,252 +4345,6 @@ components:
       title: EmbeddingsResponse
       description: >-
         Response containing generated embeddings.
-    AgentCandidate:
-      type: object
-      properties:
-        type:
-          type: string
-          const: agent
-          default: agent
-        config:
-          $ref: '#/components/schemas/AgentConfig'
-          description: >-
-            The configuration for the agent candidate.
-      additionalProperties: false
-      required:
-        - type
-        - config
-      title: AgentCandidate
-      description: An agent candidate for evaluation.
-    AggregationFunctionType:
-      type: string
-      enum:
-        - average
-        - weighted_average
-        - median
-        - categorical_count
-        - accuracy
-      title: AggregationFunctionType
-    BasicScoringFnParams:
-      type: object
-      properties:
-        type:
-          type: string
-          const: basic
-          default: basic
-        aggregation_functions:
-          type: array
-          items:
-            $ref: '#/components/schemas/AggregationFunctionType'
-      additionalProperties: false
-      required:
-        - type
-      title: BasicScoringFnParams
-    BenchmarkConfig:
-      type: object
-      properties:
-        eval_candidate:
-          $ref: '#/components/schemas/EvalCandidate'
-          description: The candidate to evaluate.
-        scoring_params:
-          type: object
-          additionalProperties:
-            $ref: '#/components/schemas/ScoringFnParams'
-          description: >-
-            Map between scoring function id and parameters for each scoring function
-            you want to run
-        num_examples:
-          type: integer
-          description: >-
-            (Optional) The number of examples to evaluate. If not provided, all examples
-            in the dataset will be evaluated
-      additionalProperties: false
-      required:
-        - eval_candidate
-        - scoring_params
-      title: BenchmarkConfig
-      description: >-
-        A benchmark configuration for evaluation.
-    EvalCandidate:
-      oneOf:
-        - $ref: '#/components/schemas/ModelCandidate'
-        - $ref: '#/components/schemas/AgentCandidate'
-      discriminator:
-        propertyName: type
-        mapping:
-          model: '#/components/schemas/ModelCandidate'
-          agent: '#/components/schemas/AgentCandidate'
-    LLMAsJudgeScoringFnParams:
-      type: object
-      properties:
-        type:
-          type: string
-          const: llm_as_judge
-          default: llm_as_judge
-        judge_model:
-          type: string
-        prompt_template:
-          type: string
-        judge_score_regexes:
-          type: array
-          items:
-            type: string
-        aggregation_functions:
-          type: array
-          items:
-            $ref: '#/components/schemas/AggregationFunctionType'
-      additionalProperties: false
-      required:
-        - type
-        - judge_model
-      title: LLMAsJudgeScoringFnParams
-    ModelCandidate:
-      type: object
-      properties:
-        type:
-          type: string
-          const: model
-          default: model
-        model:
-          type: string
-          description: The model ID to evaluate.
-        sampling_params:
-          $ref: '#/components/schemas/SamplingParams'
-          description: The sampling parameters for the model.
-        system_message:
-          $ref: '#/components/schemas/SystemMessage'
-          description: >-
-            (Optional) The system message providing instructions or context to the
-            model.
-      additionalProperties: false
-      required:
-        - type
-        - model
-        - sampling_params
-      title: ModelCandidate
-      description: A model candidate for evaluation.
-    RegexParserScoringFnParams:
-      type: object
-      properties:
-        type:
-          type: string
-          const: regex_parser
-          default: regex_parser
-        parsing_regexes:
-          type: array
-          items:
-            type: string
-        aggregation_functions:
-          type: array
-          items:
-            $ref: '#/components/schemas/AggregationFunctionType'
-      additionalProperties: false
-      required:
-        - type
-      title: RegexParserScoringFnParams
-    ScoringFnParams:
-      oneOf:
-        - $ref: '#/components/schemas/LLMAsJudgeScoringFnParams'
-        - $ref: '#/components/schemas/RegexParserScoringFnParams'
-        - $ref: '#/components/schemas/BasicScoringFnParams'
-      discriminator:
-        propertyName: type
-        mapping:
-          llm_as_judge: '#/components/schemas/LLMAsJudgeScoringFnParams'
-          regex_parser: '#/components/schemas/RegexParserScoringFnParams'
-          basic: '#/components/schemas/BasicScoringFnParams'
-    EvaluateRowsRequest:
-      type: object
-      properties:
-        input_rows:
-          type: array
-          items:
-            type: object
-            additionalProperties:
-              oneOf:
-                - type: 'null'
-                - type: boolean
-                - type: number
-                - type: string
-                - type: array
-                - type: object
-          description: The rows to evaluate.
-        scoring_functions:
-          type: array
-          items:
-            type: string
-          description: >-
-            The scoring functions to use for the evaluation.
-        benchmark_config:
-          $ref: '#/components/schemas/BenchmarkConfig'
-          description: The configuration for the benchmark.
-      additionalProperties: false
-      required:
-        - input_rows
-        - scoring_functions
-        - benchmark_config
-      title: EvaluateRowsRequest
-    EvaluateResponse:
-      type: object
-      properties:
-        generations:
-          type: array
-          items:
-            type: object
-            additionalProperties:
-              oneOf:
-                - type: 'null'
-                - type: boolean
-                - type: number
-                - type: string
-                - type: array
-                - type: object
-          description: The generations from the evaluation.
-        scores:
-          type: object
-          additionalProperties:
-            $ref: '#/components/schemas/ScoringResult'
-          description: The scores from the evaluation.
-      additionalProperties: false
-      required:
-        - generations
-        - scores
-      title: EvaluateResponse
-      description: The response from an evaluation.
-    ScoringResult:
-      type: object
-      properties:
-        score_rows:
-          type: array
-          items:
-            type: object
-            additionalProperties:
-              oneOf:
-                - type: 'null'
-                - type: boolean
-                - type: number
-                - type: string
-                - type: array
-                - type: object
-          description: >-
-            The scoring result for each row. Each row is a map of column name to value.
-        aggregated_results:
-          type: object
-          additionalProperties:
-            oneOf:
-              - type: 'null'
-              - type: boolean
-              - type: number
-              - type: string
-              - type: array
-              - type: object
-          description: Map of metric name to aggregated value
-      additionalProperties: false
-      required:
-        - score_rows
-        - aggregated_results
-      title: ScoringResult
-      description: A scoring result for a single row.
     Agent:
       type: object
       properties:
@@ -5451,6 +5102,7 @@ components:
             - in_progress
             - failed
             - scheduled
+            - cancelled
           title: JobStatus
         scheduled_at:
           type: string
@@ -5901,24 +5553,6 @@ components:
         - data
       title: IterrowsResponse
       description: A paginated list of rows from a dataset.
-    Job:
-      type: object
-      properties:
-        job_id:
-          type: string
-        status:
-          type: string
-          enum:
-            - completed
-            - in_progress
-            - failed
-            - scheduled
-          title: JobStatus
-      additionalProperties: false
-      required:
-        - job_id
-        - status
-      title: Job
     ListAgentSessionsResponse:
       type: object
       properties:
@@ -6984,8 +6618,9 @@ components:
           description: The candidate to evaluate.
       additionalProperties: false
       required:
-        - benchmark_config
-      title: RunEvalRequest
+        - task
+        - candidate
+      title: RunRequest
     RunShieldRequest:
       type: object
       properties:
@@ -7058,81 +6693,6 @@ components:
         - attributes_to_save
         - dataset_id
       title: SaveSpansToDatasetRequest
-    ScoreRequest:
-      type: object
-      properties:
-        input_rows:
-          type: array
-          items:
-            type: object
-            additionalProperties:
-              oneOf:
-                - type: 'null'
-                - type: boolean
-                - type: number
-                - type: string
-                - type: array
-                - type: object
-          description: The rows to score.
-        scoring_functions:
-          type: object
-          additionalProperties:
-            oneOf:
-              - $ref: '#/components/schemas/ScoringFnParams'
-              - type: 'null'
-          description: >-
-            The scoring functions to use for the scoring.
-      additionalProperties: false
-      required:
-        - input_rows
-        - scoring_functions
-      title: ScoreRequest
-    ScoreResponse:
-      type: object
-      properties:
-        results:
-          type: object
-          additionalProperties:
-            $ref: '#/components/schemas/ScoringResult'
-          description: >-
-            A map of scoring function name to ScoringResult.
-      additionalProperties: false
-      required:
-        - results
-      title: ScoreResponse
-      description: The response from scoring.
-    ScoreBatchRequest:
-      type: object
-      properties:
-        dataset_id:
-          type: string
-        scoring_functions:
-          type: object
-          additionalProperties:
-            oneOf:
-              - $ref: '#/components/schemas/ScoringFnParams'
-              - type: 'null'
-        save_results_dataset:
-          type: boolean
-      additionalProperties: false
-      required:
-        - dataset_id
-        - scoring_functions
-        - save_results_dataset
-      title: ScoreBatchRequest
-    ScoreBatchResponse:
-      type: object
-      properties:
-        dataset_id:
-          type: string
-        results:
-          type: object
-          additionalProperties:
-            $ref: '#/components/schemas/ScoringResult'
-      additionalProperties: false
-      required:
-        - results
-      title: ScoreBatchResponse
     AlgorithmConfig:
       oneOf:
         - $ref: '#/components/schemas/LoraFinetuningConfig'
diff --git a/llama_stack/apis/eval/eval.py b/llama_stack/apis/eval/eval.py
index 83b50d175..0e5959c37 100644
--- a/llama_stack/apis/eval/eval.py
+++ b/llama_stack/apis/eval/eval.py
@@ -141,4 +141,3 @@ class Eval(Protocol):
         :param job_id: The ID of the job to get the result of.
         :return: The result of the job.
         """
-
diff --git a/llama_stack/apis/scoring_functions/scoring_functions.py b/llama_stack/apis/scoring_functions/scoring_functions.py
index 4f6f4d824..4f85947dd 100644
--- a/llama_stack/apis/scoring_functions/scoring_functions.py
+++ b/llama_stack/apis/scoring_functions/scoring_functions.py
@@ -146,4 +146,3 @@ class ScoringFunctions(Protocol):
         provider_id: Optional[str] = None,
         params: Optional[ScoringFnParams] = None,
     ) -> None: ...
-
diff --git a/llama_stack/distribution/routers/routing_tables.py b/llama_stack/distribution/routers/routing_tables.py
index d56d6f672..84fe52632 100644
--- a/llama_stack/distribution/routers/routing_tables.py
+++ b/llama_stack/distribution/routers/routing_tables.py
@@ -43,7 +43,6 @@ from llama_stack.distribution.datatypes import (
     RoutableObject,
     RoutableObjectWithProvider,
     RoutedProtocol,
-    ScoringFnWithACL,
     ShieldWithACL,
     ToolGroupWithACL,
     ToolWithACL,
diff --git a/llama_stack/providers/registry/eval.py b/llama_stack/providers/registry/eval.py
index b6aaacaa7..f3e42c531 100644
--- a/llama_stack/providers/registry/eval.py
+++ b/llama_stack/providers/registry/eval.py
@@ -26,4 +26,3 @@ def available_providers() -> List[ProviderSpec]:
             ],
         ),
     ]
-
diff --git a/llama_stack/templates/open-benchmark/run.yaml b/llama_stack/templates/open-benchmark/run.yaml
index 6ed8a7a71..d9ca11a84 100644
--- a/llama_stack/templates/open-benchmark/run.yaml
+++ b/llama_stack/templates/open-benchmark/run.yaml
@@ -166,7 +166,18 @@ datasets:
     uri: huggingface://datasets/llamastack/bfcl_v3?split=train
   metadata: {}
   dataset_id: bfcl
-  provider_id: huggingface
+- purpose: eval/messages-answer
+  source:
+    type: uri
+    uri: huggingface://datasets/llamastack/IfEval?split=train
+  metadata: {}
+  dataset_id: ifeval
+- purpose: eval/messages-answer
+  source:
+    type: uri
+    uri: huggingface://datasets/llamastack/docvqa?split=val
+  metadata: {}
+  dataset_id: docvqa
 benchmarks: []
 tool_groups:
 - toolgroup_id: builtin::websearch