precommit

2025-03-23 16:00:48 -07:00 · 2025-03-23 16:00:48 -07:00 · 3f8c7a584a
commit 3f8c7a584a
parent 45f6d5cd08
8 changed files with 31 additions and 1037 deletions
--- a/distributions/dependencies.json
+++ b/distributions/dependencies.json
@ -6,12 +6,10 @@
    "chardet",
    "chromadb-client",
    "datasets",
    "emoji",
    "faiss-cpu",
    "fastapi",
    "fire",
    "httpx",
    "langdetect",
    "matplotlib",
    "mcp",
    "nltk",
@ -23,7 +21,6 @@
    "psycopg2-binary",
    "pymongo",
    "pypdf",
    "pythainlp",
    "redis",
    "requests",
    "scikit-learn",
@ -40,12 +37,10 @@
    "chardet",
    "chromadb-client",
    "datasets",
    "emoji",
    "faiss-cpu",
    "fastapi",
    "fire",
    "httpx",
    "langdetect",
    "matplotlib",
    "nltk",
    "numpy",
@ -56,7 +51,6 @@
    "psycopg2-binary",
    "pymongo",
    "pypdf",
    "pythainlp",
    "redis",
    "requests",
    "scikit-learn",
@ -74,12 +68,10 @@
    "chardet",
    "chromadb-client",
    "datasets",
    "emoji",
    "fastapi",
    "fire",
    "fireworks-ai",
    "httpx",
    "langdetect",
    "matplotlib",
    "mcp",
    "nltk",
@ -91,7 +83,6 @@
    "psycopg2-binary",
    "pymongo",
    "pypdf",
    "pythainlp",
    "redis",
    "requests",
    "scikit-learn",
@ -111,13 +102,11 @@
    "chardet",
    "chromadb-client",
    "datasets",
    "emoji",
    "faiss-cpu",
    "fastapi",
    "fire",
    "httpx",
    "huggingface_hub",
    "langdetect",
    "matplotlib",
    "nltk",
    "numpy",
@ -128,7 +117,6 @@
    "psycopg2-binary",
    "pymongo",
    "pypdf",
    "pythainlp",
    "redis",
    "requests",
    "scikit-learn",
@ -146,12 +134,10 @@
    "chardet",
    "chromadb-client",
    "datasets",
    "emoji",
    "fastapi",
    "fire",
    "fireworks-ai",
    "httpx",
    "langdetect",
    "litellm",
    "matplotlib",
    "mcp",
@ -164,7 +150,6 @@
    "psycopg2-binary",
    "pymongo",
    "pypdf",
    "pythainlp",
    "redis",
    "requests",
    "scikit-learn",
@ -183,13 +168,11 @@
    "chardet",
    "chromadb-client",
    "datasets",
    "emoji",
    "faiss-cpu",
    "fastapi",
    "fire",
    "fireworks-ai",
    "httpx",
    "langdetect",
    "matplotlib",
    "mcp",
    "nltk",
@ -201,7 +184,6 @@
    "psycopg2-binary",
    "pymongo",
    "pypdf",
    "pythainlp",
    "redis",
    "requests",
    "scikit-learn",
@ -218,12 +200,10 @@
    "blobfile",
    "chardet",
    "datasets",
    "emoji",
    "faiss-cpu",
    "fastapi",
    "fire",
    "httpx",
    "langdetect",
    "litellm",
    "matplotlib",
    "nltk",
@ -235,7 +215,6 @@
    "psycopg2-binary",
    "pymongo",
    "pypdf",
    "pythainlp",
    "redis",
    "requests",
    "scikit-learn",
@ -252,13 +231,11 @@
    "chardet",
    "chromadb-client",
    "datasets",
    "emoji",
    "faiss-cpu",
    "fastapi",
    "fire",
    "httpx",
    "huggingface_hub",
    "langdetect",
    "matplotlib",
    "mcp",
    "nltk",
@ -270,7 +247,6 @@
    "psycopg2-binary",
    "pymongo",
    "pypdf",
    "pythainlp",
    "redis",
    "requests",
    "scikit-learn",
@ -287,13 +263,11 @@
    "chardet",
    "chromadb-client",
    "datasets",
    "emoji",
    "faiss-cpu",
    "fastapi",
    "fire",
    "httpx",
    "huggingface_hub",
    "langdetect",
    "matplotlib",
    "mcp",
    "nltk",
@ -305,7 +279,6 @@
    "psycopg2-binary",
    "pymongo",
    "pypdf",
    "pythainlp",
    "redis",
    "requests",
    "scikit-learn",
@ -324,13 +297,11 @@
    "chardet",
    "chromadb-client",
    "datasets",
    "emoji",
    "fairscale",
    "faiss-cpu",
    "fastapi",
    "fire",
    "httpx",
    "langdetect",
    "lm-format-enforcer",
    "matplotlib",
    "mcp",
@ -343,7 +314,6 @@
    "psycopg2-binary",
    "pymongo",
    "pypdf",
    "pythainlp",
    "redis",
    "requests",
    "scikit-learn",
@ -364,14 +334,12 @@
    "chardet",
    "chromadb-client",
    "datasets",
    "emoji",
    "fairscale",
    "faiss-cpu",
    "fastapi",
    "fbgemm-gpu",
    "fire",
    "httpx",
    "langdetect",
    "lm-format-enforcer",
    "matplotlib",
    "mcp",
@ -384,7 +352,6 @@
    "psycopg2-binary",
    "pymongo",
    "pypdf",
    "pythainlp",
    "redis",
    "requests",
    "scikit-learn",
@ -403,12 +370,10 @@
    "aiosqlite",
    "blobfile",
    "chardet",
    "emoji",
    "faiss-cpu",
    "fastapi",
    "fire",
    "httpx",
    "langdetect",
    "matplotlib",
    "nltk",
    "numpy",
@ -420,7 +385,6 @@
    "psycopg2-binary",
    "pymongo",
    "pypdf",
    "pythainlp",
    "redis",
    "requests",
    "scikit-learn",
@ -437,12 +401,10 @@
    "chardet",
    "chromadb-client",
    "datasets",
    "emoji",
    "faiss-cpu",
    "fastapi",
    "fire",
    "httpx",
    "langdetect",
    "matplotlib",
    "mcp",
    "nltk",
@ -455,7 +417,6 @@
    "psycopg2-binary",
    "pymongo",
    "pypdf",
    "pythainlp",
    "redis",
    "requests",
    "scikit-learn",
@ -471,11 +432,9 @@
    "chardet",
    "chromadb-client",
    "datasets",
    "emoji",
    "fastapi",
    "fire",
    "httpx",
    "langdetect",
    "litellm",
    "matplotlib",
    "mcp",
@ -488,7 +447,6 @@
    "psycopg2-binary",
    "pymongo",
    "pypdf",
    "pythainlp",
    "redis",
    "requests",
    "scikit-learn",
@ -506,12 +464,10 @@
    "chardet",
    "chromadb-client",
    "datasets",
    "emoji",
    "faiss-cpu",
    "fastapi",
    "fire",
    "httpx",
    "langdetect",
    "matplotlib",
    "mcp",
    "nltk",
@ -523,7 +479,6 @@
    "psycopg2-binary",
    "pymongo",
    "pypdf",
    "pythainlp",
    "redis",
    "requests",
    "scikit-learn",
@ -541,12 +496,10 @@
    "chardet",
    "chromadb-client",
    "datasets",
    "emoji",
    "faiss-cpu",
    "fastapi",
    "fire",
    "httpx",
    "langdetect",
    "matplotlib",
    "mcp",
    "nltk",
@ -559,7 +512,6 @@
    "psycopg2-binary",
    "pymongo",
    "pypdf",
    "pythainlp",
    "redis",
    "requests",
    "scikit-learn",
@ -607,13 +559,11 @@
    "chardet",
    "chromadb-client",
    "datasets",
    "emoji",
    "faiss-cpu",
    "fastapi",
    "fire",
    "httpx",
    "huggingface_hub",
    "langdetect",
    "matplotlib",
    "mcp",
    "nltk",
@ -625,7 +575,6 @@
    "psycopg2-binary",
    "pymongo",
    "pypdf",
    "pythainlp",
    "redis",
    "requests",
    "scikit-learn",
@ -643,12 +592,10 @@
    "chardet",
    "chromadb-client",
    "datasets",
    "emoji",
    "faiss-cpu",
    "fastapi",
    "fire",
    "httpx",
    "langdetect",
    "matplotlib",
    "mcp",
    "nltk",
@ -660,7 +607,6 @@
    "psycopg2-binary",
    "pymongo",
    "pypdf",
    "pythainlp",
    "redis",
    "requests",
    "scikit-learn",
@ -679,12 +625,10 @@
    "chardet",
    "chromadb-client",
    "datasets",
    "emoji",
    "faiss-cpu",
    "fastapi",
    "fire",
    "httpx",
    "langdetect",
    "matplotlib",
    "mcp",
    "nltk",
@ -696,7 +640,6 @@
    "psycopg2-binary",
    "pymongo",
    "pypdf",
    "pythainlp",
    "redis",
    "requests",
    "scikit-learn",
--- a/docs/_static/llama-stack-spec.html
+++ b/docs/_static/llama-stack-spec.html
@ -2285,7 +2285,7 @@
                        "content": {
                            "application/json": {
                                "schema": {
-                                    "$ref": "#/components/schemas/Job"
+                                    "$ref": "#/components/schemas/ListAgentSessionsResponse"
                                }
                            }
                        }
@ -6192,382 +6192,6 @@
                "title": "EmbeddingsResponse",
                "description": "Response containing generated embeddings."
            },
            "AgentCandidate": {
                "type": "object",
                "properties": {
                    "type": {
                        "type": "string",
                        "const": "agent",
                        "default": "agent"
                    },
                    "config": {
                        "$ref": "#/components/schemas/AgentConfig",
                        "description": "The configuration for the agent candidate."
                    }
                },
                "additionalProperties": false,
                "required": [
                    "type",
                    "config"
                ],
                "title": "AgentCandidate",
                "description": "An agent candidate for evaluation."
            },
            "AggregationFunctionType": {
                "type": "string",
                "enum": [
                    "average",
                    "weighted_average",
                    "median",
                    "categorical_count",
                    "accuracy"
                ],
                "title": "AggregationFunctionType"
            },
            "BasicScoringFnParams": {
                "type": "object",
                "properties": {
                    "type": {
                        "type": "string",
                        "const": "basic",
                        "default": "basic"
                    },
                    "aggregation_functions": {
                        "type": "array",
                        "items": {
                            "$ref": "#/components/schemas/AggregationFunctionType"
                        }
                    }
                },
                "additionalProperties": false,
                "required": [
                    "type"
                ],
                "title": "BasicScoringFnParams"
            },
            "BenchmarkConfig": {
                "type": "object",
                "properties": {
                    "eval_candidate": {
                        "$ref": "#/components/schemas/EvalCandidate",
                        "description": "The candidate to evaluate."
                    },
                    "scoring_params": {
                        "type": "object",
                        "additionalProperties": {
                            "$ref": "#/components/schemas/ScoringFnParams"
                        },
                        "description": "Map between scoring function id and parameters for each scoring function you want to run"
                    },
                    "num_examples": {
                        "type": "integer",
                        "description": "(Optional) The number of examples to evaluate. If not provided, all examples in the dataset will be evaluated"
                    }
                },
                "additionalProperties": false,
                "required": [
                    "eval_candidate",
                    "scoring_params"
                ],
                "title": "BenchmarkConfig",
                "description": "A benchmark configuration for evaluation."
            },
            "EvalCandidate": {
                "oneOf": [
                    {
                        "$ref": "#/components/schemas/ModelCandidate"
                    },
                    {
                        "$ref": "#/components/schemas/AgentCandidate"
                    }
                ],
                "discriminator": {
                    "propertyName": "type",
                    "mapping": {
                        "model": "#/components/schemas/ModelCandidate",
                        "agent": "#/components/schemas/AgentCandidate"
                    }
                }
            },
            "LLMAsJudgeScoringFnParams": {
                "type": "object",
                "properties": {
                    "type": {
                        "type": "string",
                        "const": "llm_as_judge",
                        "default": "llm_as_judge"
                    },
                    "judge_model": {
                        "type": "string"
                    },
                    "prompt_template": {
                        "type": "string"
                    },
                    "judge_score_regexes": {
                        "type": "array",
                        "items": {
                            "type": "string"
                        }
                    },
                    "aggregation_functions": {
                        "type": "array",
                        "items": {
                            "$ref": "#/components/schemas/AggregationFunctionType"
                        }
                    }
                },
                "additionalProperties": false,
                "required": [
                    "type",
                    "judge_model"
                ],
                "title": "LLMAsJudgeScoringFnParams"
            },
            "ModelCandidate": {
                "type": "object",
                "properties": {
                    "type": {
                        "type": "string",
                        "const": "model",
                        "default": "model"
                    },
                    "model": {
                        "type": "string",
                        "description": "The model ID to evaluate."
                    },
                    "sampling_params": {
                        "$ref": "#/components/schemas/SamplingParams",
                        "description": "The sampling parameters for the model."
                    },
                    "system_message": {
                        "$ref": "#/components/schemas/SystemMessage",
                        "description": "(Optional) The system message providing instructions or context to the model."
                    }
                },
                "additionalProperties": false,
                "required": [
                    "type",
                    "model",
                    "sampling_params"
                ],
                "title": "ModelCandidate",
                "description": "A model candidate for evaluation."
            },
            "RegexParserScoringFnParams": {
                "type": "object",
                "properties": {
                    "type": {
                        "type": "string",
                        "const": "regex_parser",
                        "default": "regex_parser"
                    },
                    "parsing_regexes": {
                        "type": "array",
                        "items": {
                            "type": "string"
                        }
                    },
                    "aggregation_functions": {
                        "type": "array",
                        "items": {
                            "$ref": "#/components/schemas/AggregationFunctionType"
                        }
                    }
                },
                "additionalProperties": false,
                "required": [
                    "type"
                ],
                "title": "RegexParserScoringFnParams"
            },
            "ScoringFnParams": {
                "oneOf": [
                    {
                        "$ref": "#/components/schemas/LLMAsJudgeScoringFnParams"
                    },
                    {
                        "$ref": "#/components/schemas/RegexParserScoringFnParams"
                    },
                    {
                        "$ref": "#/components/schemas/BasicScoringFnParams"
                    }
                ],
                "discriminator": {
                    "propertyName": "type",
                    "mapping": {
                        "llm_as_judge": "#/components/schemas/LLMAsJudgeScoringFnParams",
                        "regex_parser": "#/components/schemas/RegexParserScoringFnParams",
                        "basic": "#/components/schemas/BasicScoringFnParams"
                    }
                }
            },
            "EvaluateRowsRequest": {
                "type": "object",
                "properties": {
                    "input_rows": {
                        "type": "array",
                        "items": {
                            "type": "object",
                            "additionalProperties": {
                                "oneOf": [
                                    {
                                        "type": "null"
                                    },
                                    {
                                        "type": "boolean"
                                    },
                                    {
                                        "type": "number"
                                    },
                                    {
                                        "type": "string"
                                    },
                                    {
                                        "type": "array"
                                    },
                                    {
                                        "type": "object"
                                    }
                                ]
                            }
                        },
                        "description": "The rows to evaluate."
                    },
                    "scoring_functions": {
                        "type": "array",
                        "items": {
                            "type": "string"
                        },
                        "description": "The scoring functions to use for the evaluation."
                    },
                    "benchmark_config": {
                        "$ref": "#/components/schemas/BenchmarkConfig",
                        "description": "The configuration for the benchmark."
                    }
                },
                "additionalProperties": false,
                "required": [
                    "input_rows",
                    "scoring_functions",
                    "benchmark_config"
                ],
                "title": "EvaluateRowsRequest"
            },
            "EvaluateResponse": {
                "type": "object",
                "properties": {
                    "generations": {
                        "type": "array",
                        "items": {
                            "type": "object",
                            "additionalProperties": {
                                "oneOf": [
                                    {
                                        "type": "null"
                                    },
                                    {
                                        "type": "boolean"
                                    },
                                    {
                                        "type": "number"
                                    },
                                    {
                                        "type": "string"
                                    },
                                    {
                                        "type": "array"
                                    },
                                    {
                                        "type": "object"
                                    }
                                ]
                            }
                        },
                        "description": "The generations from the evaluation."
                    },
                    "scores": {
                        "type": "object",
                        "additionalProperties": {
                            "$ref": "#/components/schemas/ScoringResult"
                        },
                        "description": "The scores from the evaluation."
                    }
                },
                "additionalProperties": false,
                "required": [
                    "generations",
                    "scores"
                ],
                "title": "EvaluateResponse",
                "description": "The response from an evaluation."
            },
            "ScoringResult": {
                "type": "object",
                "properties": {
                    "score_rows": {
                        "type": "array",
                        "items": {
                            "type": "object",
                            "additionalProperties": {
                                "oneOf": [
                                    {
                                        "type": "null"
                                    },
                                    {
                                        "type": "boolean"
                                    },
                                    {
                                        "type": "number"
                                    },
                                    {
                                        "type": "string"
                                    },
                                    {
                                        "type": "array"
                                    },
                                    {
                                        "type": "object"
                                    }
                                ]
                            }
                        },
                        "description": "The scoring result for each row. Each row is a map of column name to value."
                    },
                    "aggregated_results": {
                        "type": "object",
                        "additionalProperties": {
                            "oneOf": [
                                {
                                    "type": "null"
                                },
                                {
                                    "type": "boolean"
                                },
                                {
                                    "type": "number"
                                },
                                {
                                    "type": "string"
                                },
                                {
                                    "type": "array"
                                },
                                {
                                    "type": "object"
                                }
                            ]
                        },
                        "description": "Map of metric name to aggregated value"
                    }
                },
                "additionalProperties": false,
                "required": [
                    "score_rows",
                    "aggregated_results"
                ],
                "title": "ScoringResult",
                "description": "A scoring result for a single row."
            },
            "Agent": {
                "type": "object",
                "properties": {
@ -7705,7 +7329,8 @@
                            "completed",
                            "in_progress",
                            "failed",
-                            "scheduled"
+                            "scheduled",
                            "cancelled"
                        ],
                        "title": "JobStatus"
                    },
@ -8400,30 +8025,6 @@
                "title": "IterrowsResponse",
                "description": "A paginated list of rows from a dataset."
            },
            "Job": {
                "type": "object",
                "properties": {
                    "job_id": {
                        "type": "string"
                    },
                    "status": {
                        "type": "string",
                        "enum": [
                            "completed",
                            "in_progress",
                            "failed",
                            "scheduled"
                        ],
                        "title": "JobStatus"
                    }
                },
                "additionalProperties": false,
                "required": [
                    "job_id",
                    "status"
                ],
                "title": "Job"
            },
            "ListAgentSessionsResponse": {
                "type": "object",
                "properties": {
@ -10007,16 +9608,21 @@
            "RunRequest": {
                "type": "object",
                "properties": {
-                    "benchmark_config": {
+                    "task": {
-                        "$ref": "#/components/schemas/BenchmarkConfig",
+                        "$ref": "#/components/schemas/EvaluationTask",
-                        "description": "The configuration for the benchmark."
+                        "description": "The task to evaluate. To specify a task, one of the following must be provided: - `benchmark_id`: Run evaluation task against a benchmark_id - `dataset_id` and `grader_ids`: Run evaluation task against a dataset_id and a list of grader_ids - `data_source` and `grader_ids`: Run evaluation task against a data source (e.g. rows, uri, etc.) and a list of grader_ids"
                    },
                    "candidate": {
                        "$ref": "#/components/schemas/EvaluationCandidate",
                        "description": "The candidate to evaluate."
                    }
                },
                "additionalProperties": false,
                "required": [
-                    "benchmark_config"
+                    "task",
                    "candidate"
                ],
-                "title": "RunEvalRequest"
+                "title": "RunRequest"
            },
            "RunShieldRequest": {
                "type": "object",
@ -10123,128 +9729,6 @@
                ],
                "title": "SaveSpansToDatasetRequest"
            },
            "ScoreRequest": {
                "type": "object",
                "properties": {
                    "input_rows": {
                        "type": "array",
                        "items": {
                            "type": "object",
                            "additionalProperties": {
                                "oneOf": [
                                    {
                                        "type": "null"
                                    },
                                    {
                                        "type": "boolean"
                                    },
                                    {
                                        "type": "number"
                                    },
                                    {
                                        "type": "string"
                                    },
                                    {
                                        "type": "array"
                                    },
                                    {
                                        "type": "object"
                                    }
                                ]
                            }
                        },
                        "description": "The rows to score."
                    },
                    "scoring_functions": {
                        "type": "object",
                        "additionalProperties": {
                            "oneOf": [
                                {
                                    "$ref": "#/components/schemas/ScoringFnParams"
                                },
                                {
                                    "type": "null"
                                }
                            ]
                        },
                        "description": "The scoring functions to use for the scoring."
                    }
                },
                "additionalProperties": false,
                "required": [
                    "input_rows",
                    "scoring_functions"
                ],
                "title": "ScoreRequest"
            },
            "ScoreResponse": {
                "type": "object",
                "properties": {
                    "results": {
                        "type": "object",
                        "additionalProperties": {
                            "$ref": "#/components/schemas/ScoringResult"
                        },
                        "description": "A map of scoring function name to ScoringResult."
                    }
                },
                "additionalProperties": false,
                "required": [
                    "results"
                ],
                "title": "ScoreResponse",
                "description": "The response from scoring."
            },
            "ScoreBatchRequest": {
                "type": "object",
                "properties": {
                    "dataset_id": {
                        "type": "string"
                    },
                    "scoring_functions": {
                        "type": "object",
                        "additionalProperties": {
                            "oneOf": [
                                {
                                    "$ref": "#/components/schemas/ScoringFnParams"
                                },
                                {
                                    "type": "null"
                                }
                            ]
                        }
                    },
                    "save_results_dataset": {
                        "type": "boolean"
                    }
                },
                "additionalProperties": false,
                "required": [
                    "dataset_id",
                    "scoring_functions",
                    "save_results_dataset"
                ],
                "title": "ScoreBatchRequest"
            },
            "ScoreBatchResponse": {
                "type": "object",
                "properties": {
                    "dataset_id": {
                        "type": "string"
                    },
                    "results": {
                        "type": "object",
                        "additionalProperties": {
                            "$ref": "#/components/schemas/ScoringResult"
                        }
                    }
                },
                "additionalProperties": false,
                "required": [
                    "results"
                ],
                "title": "ScoreBatchResponse"
            },
            "AlgorithmConfig": {
                "oneOf": [
                    {
--- a/docs/_static/llama-stack-spec.yaml
+++ b/docs/_static/llama-stack-spec.yaml
@ -1562,109 +1562,6 @@ paths:
          required: false
          schema:
            type: integer
  /v1/eval/benchmarks/{benchmark_id}/jobs/{job_id}:
    get:
      responses:
        '200':
          description: The status of the evaluation job.
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/Job'
        '400':
          $ref: '#/components/responses/BadRequest400'
        '429':
          $ref: >-
            #/components/responses/TooManyRequests429
        '500':
          $ref: >-
            #/components/responses/InternalServerError500
        default:
          $ref: '#/components/responses/DefaultError'
      tags:
        - Eval
      description: Get the status of a job.
      parameters:
        - name: benchmark_id
          in: path
          description: >-
            The ID of the benchmark to run the evaluation on.
          required: true
          schema:
            type: string
        - name: job_id
          in: path
          description: The ID of the job to get the status of.
          required: true
          schema:
            type: string
    delete:
      responses:
        '200':
          description: OK
        '400':
          $ref: '#/components/responses/BadRequest400'
        '429':
          $ref: >-
            #/components/responses/TooManyRequests429
        '500':
          $ref: >-
            #/components/responses/InternalServerError500
        default:
          $ref: '#/components/responses/DefaultError'
      tags:
        - Eval
      description: Cancel a job.
      parameters:
        - name: benchmark_id
          in: path
          description: >-
            The ID of the benchmark to run the evaluation on.
          required: true
          schema:
            type: string
        - name: job_id
          in: path
          description: The ID of the job to cancel.
          required: true
          schema:
            type: string
  /v1/eval/benchmarks/{benchmark_id}/jobs/{job_id}/result:
    get:
      responses:
        '200':
          description: The result of the job.
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/EvaluateResponse'
        '400':
          $ref: '#/components/responses/BadRequest400'
        '429':
          $ref: >-
            #/components/responses/TooManyRequests429
        '500':
          $ref: >-
            #/components/responses/InternalServerError500
        default:
          $ref: '#/components/responses/DefaultError'
      tags:
        - Eval
      description: Get the result of a job.
      parameters:
        - name: benchmark_id
          in: path
          description: >-
            The ID of the benchmark to run the evaluation on.
          required: true
          schema:
            type: string
        - name: job_id
          in: path
          description: The ID of the job to get the result of.
          required: true
          schema:
            type: string
  /v1/agents/{agent_id}/sessions:
    get:
      responses:
@ -1923,7 +1820,7 @@ paths:
        default:
          $ref: '#/components/responses/DefaultError'
      tags:
-        - Providers
+        - Models
      description: ''
      parameters: []
    post:
@ -1974,7 +1871,7 @@ paths:
        default:
          $ref: '#/components/responses/DefaultError'
      tags:
-        - Inspect
+        - Providers
      description: ''
      parameters: []
  /v1/inspect/routes:
@ -4448,252 +4345,6 @@ components:
      title: EmbeddingsResponse
      description: >-
        Response containing generated embeddings.
    AgentCandidate:
      type: object
      properties:
        type:
          type: string
          const: agent
          default: agent
        config:
          $ref: '#/components/schemas/AgentConfig'
          description: >-
            The configuration for the agent candidate.
      additionalProperties: false
      required:
        - type
        - config
      title: AgentCandidate
      description: An agent candidate for evaluation.
    AggregationFunctionType:
      type: string
      enum:
        - average
        - weighted_average
        - median
        - categorical_count
        - accuracy
      title: AggregationFunctionType
    BasicScoringFnParams:
      type: object
      properties:
        type:
          type: string
          const: basic
          default: basic
        aggregation_functions:
          type: array
          items:
            $ref: '#/components/schemas/AggregationFunctionType'
      additionalProperties: false
      required:
        - type
      title: BasicScoringFnParams
    BenchmarkConfig:
      type: object
      properties:
        eval_candidate:
          $ref: '#/components/schemas/EvalCandidate'
          description: The candidate to evaluate.
        scoring_params:
          type: object
          additionalProperties:
            $ref: '#/components/schemas/ScoringFnParams'
          description: >-
            Map between scoring function id and parameters for each scoring function
            you want to run
        num_examples:
          type: integer
          description: >-
            (Optional) The number of examples to evaluate. If not provided, all examples
            in the dataset will be evaluated
      additionalProperties: false
      required:
        - eval_candidate
        - scoring_params
      title: BenchmarkConfig
      description: >-
        A benchmark configuration for evaluation.
    EvalCandidate:
      oneOf:
        - $ref: '#/components/schemas/ModelCandidate'
        - $ref: '#/components/schemas/AgentCandidate'
      discriminator:
        propertyName: type
        mapping:
          model: '#/components/schemas/ModelCandidate'
          agent: '#/components/schemas/AgentCandidate'
    LLMAsJudgeScoringFnParams:
      type: object
      properties:
        type:
          type: string
          const: llm_as_judge
          default: llm_as_judge
        judge_model:
          type: string
        prompt_template:
          type: string
        judge_score_regexes:
          type: array
          items:
            type: string
        aggregation_functions:
          type: array
          items:
            $ref: '#/components/schemas/AggregationFunctionType'
      additionalProperties: false
      required:
        - type
        - judge_model
      title: LLMAsJudgeScoringFnParams
    ModelCandidate:
      type: object
      properties:
        type:
          type: string
          const: model
          default: model
        model:
          type: string
          description: The model ID to evaluate.
        sampling_params:
          $ref: '#/components/schemas/SamplingParams'
          description: The sampling parameters for the model.
        system_message:
          $ref: '#/components/schemas/SystemMessage'
          description: >-
            (Optional) The system message providing instructions or context to the
            model.
      additionalProperties: false
      required:
        - type
        - model
        - sampling_params
      title: ModelCandidate
      description: A model candidate for evaluation.
    RegexParserScoringFnParams:
      type: object
      properties:
        type:
          type: string
          const: regex_parser
          default: regex_parser
        parsing_regexes:
          type: array
          items:
            type: string
        aggregation_functions:
          type: array
          items:
            $ref: '#/components/schemas/AggregationFunctionType'
      additionalProperties: false
      required:
        - type
      title: RegexParserScoringFnParams
    ScoringFnParams:
      oneOf:
        - $ref: '#/components/schemas/LLMAsJudgeScoringFnParams'
        - $ref: '#/components/schemas/RegexParserScoringFnParams'
        - $ref: '#/components/schemas/BasicScoringFnParams'
      discriminator:
        propertyName: type
        mapping:
          llm_as_judge: '#/components/schemas/LLMAsJudgeScoringFnParams'
          regex_parser: '#/components/schemas/RegexParserScoringFnParams'
          basic: '#/components/schemas/BasicScoringFnParams'
    EvaluateRowsRequest:
      type: object
      properties:
        input_rows:
          type: array
          items:
            type: object
            additionalProperties:
              oneOf:
                - type: 'null'
                - type: boolean
                - type: number
                - type: string
                - type: array
                - type: object
          description: The rows to evaluate.
        scoring_functions:
          type: array
          items:
            type: string
          description: >-
            The scoring functions to use for the evaluation.
        benchmark_config:
          $ref: '#/components/schemas/BenchmarkConfig'
          description: The configuration for the benchmark.
      additionalProperties: false
      required:
        - input_rows
        - scoring_functions
        - benchmark_config
      title: EvaluateRowsRequest
    EvaluateResponse:
      type: object
      properties:
        generations:
          type: array
          items:
            type: object
            additionalProperties:
              oneOf:
                - type: 'null'
                - type: boolean
                - type: number
                - type: string
                - type: array
                - type: object
          description: The generations from the evaluation.
        scores:
          type: object
          additionalProperties:
            $ref: '#/components/schemas/ScoringResult'
          description: The scores from the evaluation.
      additionalProperties: false
      required:
        - generations
        - scores
      title: EvaluateResponse
      description: The response from an evaluation.
    ScoringResult:
      type: object
      properties:
        score_rows:
          type: array
          items:
            type: object
            additionalProperties:
              oneOf:
                - type: 'null'
                - type: boolean
                - type: number
                - type: string
                - type: array
                - type: object
          description: >-
            The scoring result for each row. Each row is a map of column name to value.
        aggregated_results:
          type: object
          additionalProperties:
            oneOf:
              - type: 'null'
              - type: boolean
              - type: number
              - type: string
              - type: array
              - type: object
          description: Map of metric name to aggregated value
      additionalProperties: false
      required:
        - score_rows
        - aggregated_results
      title: ScoringResult
      description: Per-row and aggregated results for one scoring function.
    Agent:
      type: object
      properties:
@ -5451,6 +5102,7 @@ components:
            - in_progress
            - failed
            - scheduled
            - cancelled
          title: JobStatus
        scheduled_at:
          type: string
@ -5901,24 +5553,6 @@ components:
        - data
      title: IterrowsResponse
      description: A paginated list of rows from a dataset.
    Job:
      type: object
      properties:
        job_id:
          type: string
        status:
          type: string
          enum:
            - completed
            - in_progress
            - failed
            - scheduled
          title: JobStatus
      additionalProperties: false
      required:
        - job_id
        - status
      title: Job
    ListAgentSessionsResponse:
      type: object
      properties:
@ -6984,8 +6618,9 @@ components:
          description: The candidate to evaluate.
      additionalProperties: false
      required:
-        - benchmark_config
+        - task
-      title: RunEvalRequest
+        - candidate
      title: RunRequest
    RunShieldRequest:
      type: object
      properties:
@ -7058,81 +6693,6 @@ components:
        - attributes_to_save
        - dataset_id
      title: SaveSpansToDatasetRequest
    ScoreRequest:
      type: object
      properties:
        input_rows:
          type: array
          items:
            type: object
            additionalProperties:
              oneOf:
                - type: 'null'
                - type: boolean
                - type: number
                - type: string
                - type: array
                - type: object
          description: The rows to score.
        scoring_functions:
          type: object
          additionalProperties:
            oneOf:
              - $ref: '#/components/schemas/ScoringFnParams'
              - type: 'null'
          description: >-
            The scoring functions to use for the scoring.
      additionalProperties: false
      required:
        - input_rows
        - scoring_functions
      title: ScoreRequest
    ScoreResponse:
      type: object
      properties:
        results:
          type: object
          additionalProperties:
            $ref: '#/components/schemas/ScoringResult'
          description: >-
            A map of scoring function name to ScoringResult.
      additionalProperties: false
      required:
        - results
      title: ScoreResponse
      description: The response from scoring.
    ScoreBatchRequest:
      type: object
      properties:
        dataset_id:
          type: string
        scoring_functions:
          type: object
          additionalProperties:
            oneOf:
              - $ref: '#/components/schemas/ScoringFnParams'
              - type: 'null'
        save_results_dataset:
          type: boolean
      additionalProperties: false
      required:
        - dataset_id
        - scoring_functions
        - save_results_dataset
      title: ScoreBatchRequest
    ScoreBatchResponse:
      type: object
      properties:
        dataset_id:
          type: string
        results:
          type: object
          additionalProperties:
            $ref: '#/components/schemas/ScoringResult'
      additionalProperties: false
      required:
        - results
      title: ScoreBatchResponse
    AlgorithmConfig:
      oneOf:
        - $ref: '#/components/schemas/LoraFinetuningConfig'
--- a/llama_stack/apis/eval/eval.py
+++ b/llama_stack/apis/eval/eval.py
@ -141,4 +141,3 @@ class Eval(Protocol):
        :param job_id: The ID of the job to get the result of.
        :return: The result of the job.
        """
--- a/llama_stack/apis/scoring_functions/scoring_functions.py
+++ b/llama_stack/apis/scoring_functions/scoring_functions.py
@ -146,4 +146,3 @@ class ScoringFunctions(Protocol):
        provider_id: Optional[str] = None,
        params: Optional[ScoringFnParams] = None,
    ) -> None: ...
--- a/llama_stack/distribution/routers/routing_tables.py
+++ b/llama_stack/distribution/routers/routing_tables.py
@ -43,7 +43,6 @@ from llama_stack.distribution.datatypes import (
    RoutableObject,
    RoutableObjectWithProvider,
    RoutedProtocol,
    ScoringFnWithACL,
    ShieldWithACL,
    ToolGroupWithACL,
    ToolWithACL,
--- a/llama_stack/providers/registry/eval.py
+++ b/llama_stack/providers/registry/eval.py
@ -26,4 +26,3 @@ def available_providers() -> List[ProviderSpec]:
            ],
        ),
    ]
--- a/llama_stack/templates/open-benchmark/run.yaml
+++ b/llama_stack/templates/open-benchmark/run.yaml
@ -166,7 +166,18 @@ datasets:
    uri: huggingface://datasets/llamastack/bfcl_v3?split=train
  metadata: {}
  dataset_id: bfcl
-  provider_id: huggingface
+- purpose: eval/messages-answer
  source:
    type: uri
    uri: huggingface://datasets/llamastack/IfEval?split=train
  metadata: {}
  dataset_id: ifeval
 - purpose: eval/messages-answer
  source:
    type: uri
    uri: huggingface://datasets/llamastack/docvqa?split=val
  metadata: {}
  dataset_id: docvqa
 benchmarks: []
 tool_groups:
 - toolgroup_id: builtin::websearch