diff --git a/docs/openapi_generator/generate.py b/docs/openapi_generator/generate.py
index 871c01a80..994b06e58 100644
--- a/docs/openapi_generator/generate.py
+++ b/docs/openapi_generator/generate.py
@@ -33,7 +33,7 @@ schema_utils.json_schema_type = json_schema_type
 
 from llama_models.llama3.api.datatypes import *  # noqa: F403
 from llama_stack.apis.agents import *  # noqa: F403
-from llama_stack.apis.dataset import *  # noqa: F403
+from llama_stack.apis.datasets import *  # noqa: F403
 from llama_stack.apis.evals import *  # noqa: F403
 from llama_stack.apis.inference import *  # noqa: F403
 from llama_stack.apis.batch_inference import *  # noqa: F403
@@ -61,7 +61,7 @@ class LlamaStack(
     Telemetry,
     PostTraining,
     Memory,
-    Evaluations,
+    Evals,
     Models,
     Shields,
     Inspect,
diff --git a/docs/resources/llama-stack-spec.html b/docs/resources/llama-stack-spec.html
index 96ef7e4bb..ac75dbf04 100644
--- a/docs/resources/llama-stack-spec.html
+++ b/docs/resources/llama-stack-spec.html
@@ -21,7 +21,7 @@
     "info": {
         "title": "[DRAFT] Llama Stack Specification",
         "version": "0.0.1",
-        "description": "This is the specification of the llama stack that provides\n                a set of endpoints and their corresponding interfaces that are tailored to\n                best leverage Llama Models. The specification is still in draft and subject to change.\n                Generated at 2024-10-09 21:10:09.073430"
+        "description": "This is the specification of the llama stack that provides\n                a set of endpoints and their corresponding interfaces that are tailored to\n                best leverage Llama Models. The specification is still in draft and subject to change.\n                Generated at 2024-10-15 00:44:26.278642"
     },
     "servers": [
         {
@@ -109,39 +109,6 @@
                 }
             }
         },
-        "/evaluate/job/cancel": {
-            "post": {
-                "responses": {
-                    "200": {
-                        "description": "OK"
-                    }
-                },
-                "tags": [
-                    "Evaluations"
-                ],
-                "parameters": [
-                    {
-                        "name": "X-LlamaStack-ProviderData",
-                        "in": "header",
-                        "description": "JSON-encoded provider data which will be made available to the adapter servicing the API",
-                        "required": false,
-                        "schema": {
-                            "type": "string"
-                        }
-                    }
-                ],
-                "requestBody": {
-                    "content": {
-                        "application/json": {
-                            "schema": {
-                                "$ref": "#/components/schemas/CancelEvaluationJobRequest"
-                            }
-                        }
-                    },
-                    "required": true
-                }
-            }
-        },
         "/post_training/job/cancel": {
             "post": {
                 "responses": {
@@ -355,7 +322,7 @@
                     "200": {
                         "description": "OK",
                         "content": {
-                            "application/json": {
+                            "text/event-stream": {
                                 "schema": {
                                     "$ref": "#/components/schemas/AgentTurnResponseStreamChunk"
                                 }
@@ -393,7 +360,14 @@
             "post": {
                 "responses": {
                     "200": {
-                        "description": "OK"
+                        "description": "OK",
+                        "content": {
+                            "application/json": {
+                                "schema": {
+                                    "$ref": "#/components/schemas/CreateDatasetResponse"
+                                }
+                            }
+                        }
                     }
                 },
                 "tags": [
@@ -492,7 +466,14 @@
             "post": {
                 "responses": {
                     "200": {
-                        "description": "OK"
+                        "description": "OK",
+                        "content": {
+                            "application/json": {
+                                "schema": {
+                                    "$ref": "#/components/schemas/DeleteDatasetResponse"
+                                }
+                            }
+                        }
                     }
                 },
                 "tags": [
@@ -561,126 +542,6 @@
                 }
             }
         },
-        "/evaluate/question_answering/": {
-            "post": {
-                "responses": {
-                    "200": {
-                        "description": "OK",
-                        "content": {
-                            "application/json": {
-                                "schema": {
-                                    "$ref": "#/components/schemas/EvaluationJob"
-                                }
-                            }
-                        }
-                    }
-                },
-                "tags": [
-                    "Evaluations"
-                ],
-                "parameters": [
-                    {
-                        "name": "X-LlamaStack-ProviderData",
-                        "in": "header",
-                        "description": "JSON-encoded provider data which will be made available to the adapter servicing the API",
-                        "required": false,
-                        "schema": {
-                            "type": "string"
-                        }
-                    }
-                ],
-                "requestBody": {
-                    "content": {
-                        "application/json": {
-                            "schema": {
-                                "$ref": "#/components/schemas/EvaluateQuestionAnsweringRequest"
-                            }
-                        }
-                    },
-                    "required": true
-                }
-            }
-        },
-        "/evaluate/summarization/": {
-            "post": {
-                "responses": {
-                    "200": {
-                        "description": "OK",
-                        "content": {
-                            "application/json": {
-                                "schema": {
-                                    "$ref": "#/components/schemas/EvaluationJob"
-                                }
-                            }
-                        }
-                    }
-                },
-                "tags": [
-                    "Evaluations"
-                ],
-                "parameters": [
-                    {
-                        "name": "X-LlamaStack-ProviderData",
-                        "in": "header",
-                        "description": "JSON-encoded provider data which will be made available to the adapter servicing the API",
-                        "required": false,
-                        "schema": {
-                            "type": "string"
-                        }
-                    }
-                ],
-                "requestBody": {
-                    "content": {
-                        "application/json": {
-                            "schema": {
-                                "$ref": "#/components/schemas/EvaluateSummarizationRequest"
-                            }
-                        }
-                    },
-                    "required": true
-                }
-            }
-        },
-        "/evaluate/text_generation/": {
-            "post": {
-                "responses": {
-                    "200": {
-                        "description": "OK",
-                        "content": {
-                            "application/json": {
-                                "schema": {
-                                    "$ref": "#/components/schemas/EvaluationJob"
-                                }
-                            }
-                        }
-                    }
-                },
-                "tags": [
-                    "Evaluations"
-                ],
-                "parameters": [
-                    {
-                        "name": "X-LlamaStack-ProviderData",
-                        "in": "header",
-                        "description": "JSON-encoded provider data which will be made available to the adapter servicing the API",
-                        "required": false,
-                        "schema": {
-                            "type": "string"
-                        }
-                    }
-                ],
-                "requestBody": {
-                    "content": {
-                        "application/json": {
-                            "schema": {
-                                "$ref": "#/components/schemas/EvaluateTextGenerationRequest"
-                            }
-                        }
-                    },
-                    "required": true
-                }
-            }
-        },
         "/agents/session/get": {
             "post": {
                 "responses": {
@@ -845,7 +706,21 @@
                         "content": {
                             "application/json": {
                                 "schema": {
-                                    "$ref": "#/components/schemas/TrainEvalDataset"
+                                    "oneOf": [
+                                        {
+                                            "oneOf": [
+                                                {
+                                                    "$ref": "#/components/schemas/HuggingfaceDatasetDef"
+                                                },
+                                                {
+                                                    "$ref": "#/components/schemas/CustomDatasetDef"
+                                                }
+                                            ]
+                                        },
+                                        {
+                                            "type": "null"
+                                        }
+                                    ]
                                 }
                             }
                         }
@@ -856,7 +731,7 @@
                 ],
                 "parameters": [
                     {
-                        "name": "dataset_uuid",
+                        "name": "dataset_identifier",
                         "in": "query",
                         "required": true,
                         "schema": {
@@ -875,150 +750,6 @@
                 ]
             }
         },
-        "/evaluate/job/artifacts": {
-            "get": {
-                "responses": {
-                    "200": {
-                        "description": "OK",
-                        "content": {
-                            "application/json": {
-                                "schema": {
-                                    "$ref": "#/components/schemas/EvaluationJobArtifactsResponse"
-                                }
-                            }
-                        }
-                    }
-                },
-                "tags": [
-                    "Evaluations"
-                ],
-                "parameters": [
-                    {
-                        "name": "job_uuid",
-                        "in": "query",
-                        "required": true,
-                        "schema": {
-                            "type": "string"
-                        }
-                    },
-                    {
-                        "name": "X-LlamaStack-ProviderData",
-                        "in": "header",
-                        "description": "JSON-encoded provider data which will be made available to the adapter servicing the API",
-                        "required": false,
-                        "schema": {
-                            "type": "string"
-                        }
-                    }
-                ]
-            }
-        },
-        "/evaluate/job/logs": {
-            "get": {
-                "responses": {
-                    "200": {
-                        "description": "OK",
-                        "content": {
-                            "application/json": {
-                                "schema": {
-                                    "$ref": "#/components/schemas/EvaluationJobLogStream"
-                                }
-                            }
-                        }
-                    }
-                },
-                "tags": [
-                    "Evaluations"
-                ],
-                "parameters": [
-                    {
-                        "name": "job_uuid",
-                        "in": "query",
-                        "required": true,
-                        "schema": {
-                            "type": "string"
-                        }
-                    },
-                    {
-                        "name": "X-LlamaStack-ProviderData",
-                        "in": "header",
-                        "description": "JSON-encoded provider data which will be made available to the adapter servicing the API",
-                        "required": false,
-                        "schema": {
-                            "type": "string"
-                        }
-                    }
-                ]
-            }
-        },
-        "/evaluate/job/status": {
-            "get": {
-                "responses": {
-                    "200": {
-                        "description": "OK",
-                        "content": {
-                            "application/json": {
-                                "schema": {
-                                    "$ref": "#/components/schemas/EvaluationJobStatusResponse"
-                                }
-                            }
-                        }
-                    }
-                },
-                "tags": [
-                    "Evaluations"
-                ],
-                "parameters": [
-                    {
-                        "name": "job_uuid",
-                        "in": "query",
-                        "required": true,
-                        "schema": {
-                            "type": "string"
-                        }
-                    },
-                    {
-                        "name": "X-LlamaStack-ProviderData",
-                        "in": "header",
-                        "description": "JSON-encoded provider data which will be made available to the adapter servicing the API",
-                        "required": false,
-                        "schema": {
-                            "type": "string"
-                        }
-                    }
-                ]
-            }
-        },
-        "/evaluate/jobs": {
-            "get": {
-                "responses": {
-                    "200": {
-                        "description": "OK",
-                        "content": {
-                            "application/jsonl": {
-                                "schema": {
-                                    "$ref": "#/components/schemas/EvaluationJob"
-                                }
-                            }
-                        }
-                    }
-                },
-                "tags": [
-                    "Evaluations"
-                ],
-                "parameters": [
-                    {
-                        "name": "X-LlamaStack-ProviderData",
-                        "in": "header",
-                        "description": "JSON-encoded provider data which will be made available to the adapter servicing the API",
-                        "required": false,
-                        "schema": {
-                            "type": "string"
-                        }
-                    }
-                ]
-            }
-        },
         "/memory_banks/get": {
             "get": {
                 "responses": {
@@ -1412,6 +1143,43 @@
                 }
             }
         },
+        "/datasets/list": {
+            "get": {
+                "responses": {
+                    "200": {
+                        "description": "OK",
+                        "content": {
+                            "application/jsonl": {
+                                "schema": {
+                                    "oneOf": [
+                                        {
+                                            "$ref": "#/components/schemas/HuggingfaceDatasetDef"
+                                        },
+                                        {
+                                            "$ref": "#/components/schemas/CustomDatasetDef"
+                                        }
+                                    ]
+                                }
+                            }
+                        }
+                    }
+                },
+                "tags": [
+                    "Datasets"
+                ],
+                "parameters": [
+                    {
+                        "name": "X-LlamaStack-ProviderData",
+                        "in": "header",
+                        "description": "JSON-encoded provider data which will be made available to the adapter servicing the API",
+                        "required": false,
+                        "schema": {
+                            "type": "string"
+                        }
+                    }
+                ]
+            }
+        },
         "/memory_banks/list": {
             "get": {
                 "responses": {
@@ -1836,6 +1604,86 @@
                 }
             }
         },
+        "/evals/run_eval_task": {
+            "post": {
+                "responses": {
+                    "200": {
+                        "description": "OK",
+                        "content": {
+                            "application/json": {
+                                "schema": {
+                                    "$ref": "#/components/schemas/EvaluateResponse"
+                                }
+                            }
+                        }
+                    }
+                },
+                "tags": [
+                    "Evals"
+                ],
+                "parameters": [
+                    {
+                        "name": "X-LlamaStack-ProviderData",
+                        "in": "header",
+                        "description": "JSON-encoded provider data which will be made available to the adapter servicing the API",
+                        "required": false,
+                        "schema": {
+                            "type": "string"
+                        }
+                    }
+                ],
+                "requestBody": {
+                    "content": {
+                        "application/json": {
+                            "schema": {
+                                "$ref": "#/components/schemas/RunEvalTaskRequest"
+                            }
+                        }
+                    },
+                    "required": true
+                }
+            }
+        },
+        "/evals/run_scorer": {
+            "post": {
+                "responses": {
+                    "200": {
+                        "description": "OK",
+                        "content": {
+                            "application/json": {
+                                "schema": {
+                                    "$ref": "#/components/schemas/EvaluateResponse"
+                                }
+                            }
+                        }
+                    }
+                },
+                "tags": [
+                    "Evals"
+                ],
+                "parameters": [
+                    {
+                        "name": "X-LlamaStack-ProviderData",
+                        "in": "header",
+                        "description": "JSON-encoded provider data which will be made available to the adapter servicing the API",
+                        "required": false,
+                        "schema": {
+                            "type": "string"
+                        }
+                    }
+                ],
+                "requestBody": {
+                    "content": {
+                        "application/json": {
+                            "schema": {
+                                "$ref": "#/components/schemas/RunScorerRequest"
+                            }
+                        }
+                    },
+                    "required": true
+                }
+            }
+        },
         "/safety/run_shield": {
             "post": {
                 "responses": {
@@ -2571,18 +2419,6 @@
                     "completion_message_batch"
                 ]
             },
-            "CancelEvaluationJobRequest": {
-                "type": "object",
-                "properties": {
-                    "job_uuid": {
-                        "type": "string"
-                    }
-                },
-                "additionalProperties": false,
-                "required": [
-                    "job_uuid"
-                ]
-            },
             "CancelTrainingJobRequest": {
                 "type": "object",
                 "properties": {
@@ -4090,19 +3926,58 @@
                     "error"
                 ]
             },
-            "TrainEvalDataset": {
+            "CustomDatasetDef": {
                 "type": "object",
                 "properties": {
-                    "columns": {
+                    "type": {
+                        "type": "string",
+                        "const": "custom",
+                        "default": "custom"
+                    },
+                    "identifier": {
+                        "type": "string"
+                    },
+                    "url": {
+                        "type": "string"
+                    },
+                    "rename_columns_map": {
                         "type": "object",
                         "additionalProperties": {
-                            "$ref": "#/components/schemas/TrainEvalDatasetColumnType"
+                            "type": "string"
+                        }
+                    }
+                },
+                "additionalProperties": false,
+                "required": [
+                    "type",
+                    "identifier",
+                    "url"
+                ]
+            },
+            "HuggingfaceDatasetDef": {
+                "type": "object",
+                "properties": {
+                    "type": {
+                        "type": "string",
+                        "const": "huggingface",
+                        "default": "huggingface"
+                    },
+                    "identifier": {
+                        "type": "string"
+                    },
+                    "dataset_path": {
+                        "type": "string"
+                    },
+                    "dataset_name": {
+                        "type": "string"
+                    },
+                    "rename_columns_map": {
+                        "type": "object",
+                        "additionalProperties": {
+                            "type": "string"
                         }
                     },
-                    "content_url": {
-                        "$ref": "#/components/schemas/URL"
-                    },
-                    "metadata": {
+                    "kwargs": {
                         "type": "object",
                         "additionalProperties": {
                             "oneOf": [
@@ -4130,35 +4005,48 @@
                 },
                 "additionalProperties": false,
                 "required": [
-                    "columns",
-                    "content_url"
-                ],
-                "title": "Dataset to be used for training or evaluating language models."
-            },
-            "TrainEvalDatasetColumnType": {
-                "type": "string",
-                "enum": [
-                    "dialog",
-                    "text",
-                    "media",
-                    "number",
-                    "json"
+                    "type",
+                    "identifier",
+                    "dataset_path",
+                    "kwargs"
                 ]
             },
             "CreateDatasetRequest": {
                 "type": "object",
                 "properties": {
-                    "uuid": {
-                        "type": "string"
-                    },
-                    "dataset": {
-                        "$ref": "#/components/schemas/TrainEvalDataset"
+                    "dataset_def": {
+                        "oneOf": [
+                            {
+                                "$ref": "#/components/schemas/HuggingfaceDatasetDef"
+                            },
+                            {
+                                "$ref": "#/components/schemas/CustomDatasetDef"
+                            }
+                        ]
                     }
                 },
                 "additionalProperties": false,
                 "required": [
-                    "uuid",
-                    "dataset"
+                    "dataset_def"
+                ]
+            },
+            "CreateDatasetResponse": {
+                "type": "object",
+                "properties": {
+                    "status": {
+                        "type": "string",
+                        "enum": [
+                            "success",
+                            "fail"
+                        ]
+                    },
+                    "msg": {
+                        "type": "string"
+                    }
+                },
+                "additionalProperties": false,
+                "required": [
+                    "status"
                 ]
             },
             "DeleteAgentsRequest": {
@@ -4192,13 +4080,32 @@
             "DeleteDatasetRequest": {
                 "type": "object",
                 "properties": {
-                    "dataset_uuid": {
+                    "dataset_identifier": {
                         "type": "string"
                     }
                 },
                 "additionalProperties": false,
                 "required": [
-                    "dataset_uuid"
+                    "dataset_identifier"
+                ]
+            },
+            "DeleteDatasetResponse": {
+                "type": "object",
+                "properties": {
+                    "status": {
+                        "type": "string",
+                        "enum": [
+                            "success",
+                            "fail"
+                        ]
+                    },
+                    "msg": {
+                        "type": "string"
+                    }
+                },
+                "additionalProperties": false,
+                "required": [
+                    "status"
                 ]
             },
             "EmbeddingsRequest": {
@@ -4258,76 +4165,6 @@
                     "embeddings"
                 ]
             },
-            "EvaluateQuestionAnsweringRequest": {
-                "type": "object",
-                "properties": {
-                    "metrics": {
-                        "type": "array",
-                        "items": {
-                            "type": "string",
-                            "enum": [
-                                "em",
-                                "f1"
-                            ]
-                        }
-                    }
-                },
-                "additionalProperties": false,
-                "required": [
-                    "metrics"
-                ]
-            },
-            "EvaluationJob": {
-                "type": "object",
-                "properties": {
-                    "job_uuid": {
-                        "type": "string"
-                    }
-                },
-                "additionalProperties": false,
-                "required": [
-                    "job_uuid"
-                ]
-            },
-            "EvaluateSummarizationRequest": {
-                "type": "object",
-                "properties": {
-                    "metrics": {
-                        "type": "array",
-                        "items": {
-                            "type": "string",
-                            "enum": [
-                                "rouge",
-                                "bleu"
-                            ]
-                        }
-                    }
-                },
-                "additionalProperties": false,
-                "required": [
-                    "metrics"
-                ]
-            },
-            "EvaluateTextGenerationRequest": {
-                "type": "object",
-                "properties": {
-                    "metrics": {
-                        "type": "array",
-                        "items": {
-                            "type": "string",
-                            "enum": [
-                                "perplexity",
-                                "rouge",
-                                "bleu"
-                            ]
-                        }
-                    }
-                },
-                "additionalProperties": false,
-                "required": [
-                    "metrics"
-                ]
-            },
             "GetAgentsSessionRequest": {
                 "type": "object",
                 "properties": {
@@ -4513,43 +4350,6 @@
                     "step"
                 ]
             },
-            "EvaluationJobArtifactsResponse": {
-                "type": "object",
-                "properties": {
-                    "job_uuid": {
-                        "type": "string"
-                    }
-                },
-                "additionalProperties": false,
-                "required": [
-                    "job_uuid"
-                ],
-                "title": "Artifacts of a evaluation job."
-            },
-            "EvaluationJobLogStream": {
-                "type": "object",
-                "properties": {
-                    "job_uuid": {
-                        "type": "string"
-                    }
-                },
-                "additionalProperties": false,
-                "required": [
-                    "job_uuid"
-                ]
-            },
-            "EvaluationJobStatusResponse": {
-                "type": "object",
-                "properties": {
-                    "job_uuid": {
-                        "type": "string"
-                    }
-                },
-                "additionalProperties": false,
-                "required": [
-                    "job_uuid"
-                ]
-            },
             "ModelDefWithProvider": {
                 "type": "object",
                 "properties": {
@@ -5265,6 +5065,61 @@
                     "dpo"
                 ]
             },
+            "TrainEvalDataset": {
+                "type": "object",
+                "properties": {
+                    "columns": {
+                        "type": "object",
+                        "additionalProperties": {
+                            "$ref": "#/components/schemas/TrainEvalDatasetColumnType"
+                        }
+                    },
+                    "content_url": {
+                        "$ref": "#/components/schemas/URL"
+                    },
+                    "metadata": {
+                        "type": "object",
+                        "additionalProperties": {
+                            "oneOf": [
+                                {
+                                    "type": "null"
+                                },
+                                {
+                                    "type": "boolean"
+                                },
+                                {
+                                    "type": "number"
+                                },
+                                {
+                                    "type": "string"
+                                },
+                                {
+                                    "type": "array"
+                                },
+                                {
+                                    "type": "object"
+                                }
+                            ]
+                        }
+                    }
+                },
+                "additionalProperties": false,
+                "required": [
+                    "columns",
+                    "content_url"
+                ],
+                "title": "Dataset to be used for training or evaluating language models."
+            },
+            "TrainEvalDatasetColumnType": {
+                "type": "string",
+                "enum": [
+                    "dialog",
+                    "text",
+                    "media",
+                    "number",
+                    "json"
+                ]
+            },
             "TrainingConfig": {
                 "type": "object",
                 "properties": {
@@ -5709,6 +5564,314 @@
                     "score"
                 ]
             },
+            "EvaluateDatasetConfig": {
+                "type": "object",
+                "properties": {
+                    "dataset_identifier": {
+                        "type": "string"
+                    },
+                    "row_limit": {
+                        "type": "integer"
+                    },
+                    "kwargs": {
+                        "type": "object",
+                        "additionalProperties": {
+                            "oneOf": [
+                                {
+                                    "type": "null"
+                                },
+                                {
+                                    "type": "boolean"
+                                },
+                                {
+                                    "type": "number"
+                                },
+                                {
+                                    "type": "string"
+                                },
+                                {
+                                    "type": "array"
+                                },
+                                {
+                                    "type": "object"
+                                }
+                            ]
+                        }
+                    }
+                },
+                "additionalProperties": false,
+                "required": [
+                    "dataset_identifier"
+                ]
+            },
+            "EvaluateJudgeScoringConfig": {
+                "type": "object"
+            },
+            "EvaluateModelGenerationConfig": {
+                "type": "object",
+                "properties": {
+                    "model": {
+                        "type": "string"
+                    },
+                    "sampling_params": {
+                        "$ref": "#/components/schemas/SamplingParams"
+                    },
+                    "kwargs": {
+                        "type": "object",
+                        "additionalProperties": {
+                            "oneOf": [
+                                {
+                                    "type": "null"
+                                },
+                                {
+                                    "type": "boolean"
+                                },
+                                {
+                                    "type": "number"
+                                },
+                                {
+                                    "type": "string"
+                                },
+                                {
+                                    "type": "array"
+                                },
+                                {
+                                    "type": "object"
+                                }
+                            ]
+                        }
+                    }
+                },
+                "additionalProperties": false,
+                "required": [
+                    "model",
+                    "sampling_params"
+                ]
+            },
+            "EvaluatePostprocessConfig": {
+                "type": "object",
+                "properties": {
+                    "kwargs": {
+                        "type": "object",
+                        "additionalProperties": {
+                            "oneOf": [
+                                {
+                                    "type": "null"
+                                },
+                                {
+                                    "type": "boolean"
+                                },
+                                {
+                                    "type": "number"
+                                },
+                                {
+                                    "type": "string"
+                                },
+                                {
+                                    "type": "array"
+                                },
+                                {
+                                    "type": "object"
+                                }
+                            ]
+                        }
+                    }
+                },
+                "additionalProperties": false
+            },
+            "EvaluatePreprocessConfig": {
+                "type": "object",
+                "properties": {
+                    "kwargs": {
+                        "type": "object",
+                        "additionalProperties": {
+                            "oneOf": [
+                                {
+                                    "type": "null"
+                                },
+                                {
+                                    "type": "boolean"
+                                },
+                                {
+                                    "type": "number"
+                                },
+                                {
+                                    "type": "string"
+                                },
+                                {
+                                    "type": "array"
+                                },
+                                {
+                                    "type": "object"
+                                }
+                            ]
+                        }
+                    }
+                },
+                "additionalProperties": false
+            },
+            "EvaluateProcessorConfig": {
+                "type": "object",
+                "properties": {
+                    "processor_identifier": {
+                        "type": "string"
+                    },
+                    "preprocess_config": {
+                        "$ref": "#/components/schemas/EvaluatePreprocessConfig"
+                    },
+                    "postprocess_config": {
+                        "$ref": "#/components/schemas/EvaluatePostprocessConfig"
+                    }
+                },
+                "additionalProperties": false,
+                "required": [
+                    "processor_identifier"
+                ]
+            },
+            "EvaluateScoringConfig": {
+                "type": "object",
+                "properties": {
+                    "scorer_config_list": {
+                        "type": "array",
+                        "items": {
+                            "$ref": "#/components/schemas/EvaluateSingleScorerConfig"
+                        }
+                    }
+                },
+                "additionalProperties": false,
+                "required": [
+                    "scorer_config_list"
+                ]
+            },
+            "EvaluateSingleScorerConfig": {
+                "type": "object",
+                "properties": {
+                    "scorer_name": {
+                        "type": "string"
+                    },
+                    "llm_judge_config": {
+                        "$ref": "#/components/schemas/LLMJudgeConfig"
+                    }
+                },
+                "additionalProperties": false,
+                "required": [
+                    "scorer_name"
+                ]
+            },
+            "EvaluateTaskConfig": {
+                "type": "object",
+                "properties": {
+                    "dataset_config": {
+                        "$ref": "#/components/schemas/EvaluateDatasetConfig"
+                    },
+                    "processor_config": {
+                        "$ref": "#/components/schemas/EvaluateProcessorConfig"
+                    },
+                    "generation_config": {
+                        "$ref": "#/components/schemas/EvaluateModelGenerationConfig"
+                    },
+                    "scoring_config": {
+                        "$ref": "#/components/schemas/EvaluateScoringConfig"
+                    }
+                },
+                "additionalProperties": false,
+                "required": [
+                    "dataset_config",
+                    "processor_config",
+                    "generation_config",
+                    "scoring_config"
+                ]
+            },
+            "LLMJudgeConfig": {
+                "type": "object",
+                "properties": {
+                    "judge_processor_config": {
+                        "$ref": "#/components/schemas/EvaluateProcessorConfig"
+                    },
+                    "judge_model_generation_config": {
+                        "$ref": "#/components/schemas/EvaluateModelGenerationConfig"
+                    },
+                    "judge_scoring_config": {
+                        "$ref": "#/components/schemas/EvaluateJudgeScoringConfig"
+                    }
+                },
+                "additionalProperties": false,
+                "required": [
+                    "judge_processor_config",
+                    "judge_model_generation_config",
+                    "judge_scoring_config"
+                ]
+            },
+            "RunEvalTaskRequest": {
+                "type": "object",
+                "properties": {
+                    "model": {
+                        "type": "string"
+                    },
+                    "task": {
+                        "type": "string"
+                    },
+                    "dataset": {
+                        "type": "string"
+                    },
+                    "eval_task_config": {
+                        "$ref": "#/components/schemas/EvaluateTaskConfig"
+                    }
+                },
+                "additionalProperties": false,
+                "required": [
+                    "model",
+                    "task"
+                ]
+            },
+            "EvalResult": {
+                "type": "object",
+                "properties": {
+                    "metrics": {
+                        "type": "object",
+                        "additionalProperties": {
+                            "type": "number"
+                        }
+                    }
+                },
+                "additionalProperties": false,
+                "required": [
+                    "metrics"
+                ],
+                "title": "Aggregated final evaluation result."
+            },
+            "EvaluateResponse": {
+                "type": "object",
+                "properties": {
+                    "eval_result": {
+                        "$ref": "#/components/schemas/EvalResult"
+                    },
+                    "formatted_report": {
+                        "type": "string"
+                    }
+                },
+                "additionalProperties": false,
+                "required": [
+                    "eval_result"
+                ],
+                "title": "Scores for evaluation."
+            },
+            "RunScorerRequest": {
+                "type": "object",
+                "properties": {
+                    "dataset_config": {
+                        "$ref": "#/components/schemas/EvaluateDatasetConfig"
+                    },
+                    "eval_scoring_config": {
+                        "$ref": "#/components/schemas/EvaluateScoringConfig"
+                    }
+                },
+                "additionalProperties": false,
+                "required": [
+                    "dataset_config",
+                    "eval_scoring_config"
+                ]
+            },
             "RunShieldRequest": {
                 "type": "object",
                 "properties": {
@@ -6075,7 +6238,28 @@
     ],
     "tags": [
         {
-            "name": "RewardScoring"
+            "name": "Models"
+        },
+        {
+            "name": "BatchInference"
+        },
+        {
+            "name": "Inspect"
+        },
+        {
+            "name": "Evals"
+        },
+        {
+            "name": "Safety"
+        },
+        {
+            "name": "Shields"
+        },
+        {
+            "name": "Telemetry"
+        },
+        {
+            "name": "Agents"
         },
         {
             "name": "Memory"
@@ -6084,37 +6268,16 @@
             "name": "SyntheticDataGeneration"
         },
         {
-            "name": "Models"
-        },
-        {
-            "name": "Safety"
-        },
-        {
-            "name": "BatchInference"
-        },
-        {
-            "name": "Agents"
-        },
-        {
-            "name": "MemoryBanks"
-        },
-        {
-            "name": "Shields"
+            "name": "PostTraining"
         },
         {
             "name": "Datasets"
         },
         {
-            "name": "Evaluations"
+            "name": "MemoryBanks"
         },
         {
-            "name": "Inspect"
-        },
-        {
-            "name": "PostTraining"
-        },
-        {
-            "name": "Telemetry"
+            "name": "RewardScoring"
         },
         {
             "name": "Inference"
@@ -6195,10 +6358,6 @@
             "name": "BatchCompletionResponse",
             "description": "<SchemaDefinition schemaRef=\"#/components/schemas/BatchCompletionResponse\" />"
         },
-        {
-            "name": "CancelEvaluationJobRequest",
-            "description": "<SchemaDefinition schemaRef=\"#/components/schemas/CancelEvaluationJobRequest\" />"
-        },
         {
             "name": "CancelTrainingJobRequest",
             "description": "<SchemaDefinition schemaRef=\"#/components/schemas/CancelTrainingJobRequest\" />"
@@ -6368,17 +6527,21 @@
             "description": "<SchemaDefinition schemaRef=\"#/components/schemas/ViolationLevel\" />"
         },
         {
-            "name": "TrainEvalDataset",
-            "description": "Dataset to be used for training or evaluating language models.\n\n<SchemaDefinition schemaRef=\"#/components/schemas/TrainEvalDataset\" />"
+            "name": "CustomDatasetDef",
+            "description": "<SchemaDefinition schemaRef=\"#/components/schemas/CustomDatasetDef\" />"
         },
         {
-            "name": "TrainEvalDatasetColumnType",
-            "description": "<SchemaDefinition schemaRef=\"#/components/schemas/TrainEvalDatasetColumnType\" />"
+            "name": "HuggingfaceDatasetDef",
+            "description": "<SchemaDefinition schemaRef=\"#/components/schemas/HuggingfaceDatasetDef\" />"
         },
         {
             "name": "CreateDatasetRequest",
             "description": "<SchemaDefinition schemaRef=\"#/components/schemas/CreateDatasetRequest\" />"
         },
+        {
+            "name": "CreateDatasetResponse",
+            "description": "<SchemaDefinition schemaRef=\"#/components/schemas/CreateDatasetResponse\" />"
+        },
         {
             "name": "DeleteAgentsRequest",
             "description": "<SchemaDefinition schemaRef=\"#/components/schemas/DeleteAgentsRequest\" />"
@@ -6391,6 +6554,10 @@
             "name": "DeleteDatasetRequest",
             "description": "<SchemaDefinition schemaRef=\"#/components/schemas/DeleteDatasetRequest\" />"
         },
+        {
+            "name": "DeleteDatasetResponse",
+            "description": "<SchemaDefinition schemaRef=\"#/components/schemas/DeleteDatasetResponse\" />"
+        },
         {
             "name": "EmbeddingsRequest",
             "description": "<SchemaDefinition schemaRef=\"#/components/schemas/EmbeddingsRequest\" />"
@@ -6399,22 +6566,6 @@
             "name": "EmbeddingsResponse",
             "description": "<SchemaDefinition schemaRef=\"#/components/schemas/EmbeddingsResponse\" />"
         },
-        {
-            "name": "EvaluateQuestionAnsweringRequest",
-            "description": "<SchemaDefinition schemaRef=\"#/components/schemas/EvaluateQuestionAnsweringRequest\" />"
-        },
-        {
-            "name": "EvaluationJob",
-            "description": "<SchemaDefinition schemaRef=\"#/components/schemas/EvaluationJob\" />"
-        },
-        {
-            "name": "EvaluateSummarizationRequest",
-            "description": "<SchemaDefinition schemaRef=\"#/components/schemas/EvaluateSummarizationRequest\" />"
-        },
-        {
-            "name": "EvaluateTextGenerationRequest",
-            "description": "<SchemaDefinition schemaRef=\"#/components/schemas/EvaluateTextGenerationRequest\" />"
-        },
         {
             "name": "GetAgentsSessionRequest",
             "description": "<SchemaDefinition schemaRef=\"#/components/schemas/GetAgentsSessionRequest\" />"
@@ -6443,18 +6594,6 @@
             "name": "AgentStepResponse",
             "description": "<SchemaDefinition schemaRef=\"#/components/schemas/AgentStepResponse\" />"
         },
-        {
-            "name": "EvaluationJobArtifactsResponse",
-            "description": "Artifacts of a evaluation job.\n\n<SchemaDefinition schemaRef=\"#/components/schemas/EvaluationJobArtifactsResponse\" />"
-        },
-        {
-            "name": "EvaluationJobLogStream",
-            "description": "<SchemaDefinition schemaRef=\"#/components/schemas/EvaluationJobLogStream\" />"
-        },
-        {
-            "name": "EvaluationJobStatusResponse",
-            "description": "<SchemaDefinition schemaRef=\"#/components/schemas/EvaluationJobStatusResponse\" />"
-        },
         {
             "name": "ModelDefWithProvider",
             "description": "<SchemaDefinition schemaRef=\"#/components/schemas/ModelDefWithProvider\" />"
@@ -6555,6 +6694,14 @@
             "name": "RLHFAlgorithm",
             "description": "<SchemaDefinition schemaRef=\"#/components/schemas/RLHFAlgorithm\" />"
         },
+        {
+            "name": "TrainEvalDataset",
+            "description": "Dataset to be used for training or evaluating language models.\n\n<SchemaDefinition schemaRef=\"#/components/schemas/TrainEvalDataset\" />"
+        },
+        {
+            "name": "TrainEvalDatasetColumnType",
+            "description": "<SchemaDefinition schemaRef=\"#/components/schemas/TrainEvalDatasetColumnType\" />"
+        },
         {
             "name": "TrainingConfig",
             "description": "<SchemaDefinition schemaRef=\"#/components/schemas/TrainingConfig\" />"
@@ -6603,6 +6750,62 @@
             "name": "ScoredMessage",
             "description": "<SchemaDefinition schemaRef=\"#/components/schemas/ScoredMessage\" />"
         },
+        {
+            "name": "EvaluateDatasetConfig",
+            "description": "<SchemaDefinition schemaRef=\"#/components/schemas/EvaluateDatasetConfig\" />"
+        },
+        {
+            "name": "EvaluateJudgeScoringConfig",
+            "description": "<SchemaDefinition schemaRef=\"#/components/schemas/EvaluateJudgeScoringConfig\" />"
+        },
+        {
+            "name": "EvaluateModelGenerationConfig",
+            "description": "<SchemaDefinition schemaRef=\"#/components/schemas/EvaluateModelGenerationConfig\" />"
+        },
+        {
+            "name": "EvaluatePostprocessConfig",
+            "description": "<SchemaDefinition schemaRef=\"#/components/schemas/EvaluatePostprocessConfig\" />"
+        },
+        {
+            "name": "EvaluatePreprocessConfig",
+            "description": "<SchemaDefinition schemaRef=\"#/components/schemas/EvaluatePreprocessConfig\" />"
+        },
+        {
+            "name": "EvaluateProcessorConfig",
+            "description": "<SchemaDefinition schemaRef=\"#/components/schemas/EvaluateProcessorConfig\" />"
+        },
+        {
+            "name": "EvaluateScoringConfig",
+            "description": "<SchemaDefinition schemaRef=\"#/components/schemas/EvaluateScoringConfig\" />"
+        },
+        {
+            "name": "EvaluateSingleScorerConfig",
+            "description": "<SchemaDefinition schemaRef=\"#/components/schemas/EvaluateSingleScorerConfig\" />"
+        },
+        {
+            "name": "EvaluateTaskConfig",
+            "description": "<SchemaDefinition schemaRef=\"#/components/schemas/EvaluateTaskConfig\" />"
+        },
+        {
+            "name": "LLMJudgeConfig",
+            "description": "<SchemaDefinition schemaRef=\"#/components/schemas/LLMJudgeConfig\" />"
+        },
+        {
+            "name": "RunEvalTaskRequest",
+            "description": "<SchemaDefinition schemaRef=\"#/components/schemas/RunEvalTaskRequest\" />"
+        },
+        {
+            "name": "EvalResult",
+            "description": "Aggregated final evaluation result.\n\n<SchemaDefinition schemaRef=\"#/components/schemas/EvalResult\" />"
+        },
+        {
+            "name": "EvaluateResponse",
+            "description": "Scores for evaluation.\n\n<SchemaDefinition schemaRef=\"#/components/schemas/EvaluateResponse\" />"
+        },
+        {
+            "name": "RunScorerRequest",
+            "description": "<SchemaDefinition schemaRef=\"#/components/schemas/RunScorerRequest\" />"
+        },
         {
             "name": "RunShieldRequest",
             "description": "<SchemaDefinition schemaRef=\"#/components/schemas/RunShieldRequest\" />"
@@ -6647,7 +6850,7 @@
                 "Agents",
                 "BatchInference",
                 "Datasets",
-                "Evaluations",
+                "Evals",
                 "Inference",
                 "Inspect",
                 "Memory",
@@ -6681,7 +6884,6 @@
                 "BatchCompletionRequest",
                 "BatchCompletionResponse",
                 "BuiltinTool",
-                "CancelEvaluationJobRequest",
                 "CancelTrainingJobRequest",
                 "ChatCompletionRequest",
                 "ChatCompletionResponse",
@@ -6698,31 +6900,40 @@
                 "CreateAgentSessionRequest",
                 "CreateAgentTurnRequest",
                 "CreateDatasetRequest",
+                "CreateDatasetResponse",
+                "CustomDatasetDef",
                 "DPOAlignmentConfig",
                 "DeleteAgentsRequest",
                 "DeleteAgentsSessionRequest",
                 "DeleteDatasetRequest",
+                "DeleteDatasetResponse",
                 "DialogGenerations",
                 "DoraFinetuningConfig",
                 "EmbeddingsRequest",
                 "EmbeddingsResponse",
-                "EvaluateQuestionAnsweringRequest",
-                "EvaluateSummarizationRequest",
-                "EvaluateTextGenerationRequest",
-                "EvaluationJob",
-                "EvaluationJobArtifactsResponse",
-                "EvaluationJobLogStream",
-                "EvaluationJobStatusResponse",
+                "EvalResult",
+                "EvaluateDatasetConfig",
+                "EvaluateJudgeScoringConfig",
+                "EvaluateModelGenerationConfig",
+                "EvaluatePostprocessConfig",
+                "EvaluatePreprocessConfig",
+                "EvaluateProcessorConfig",
+                "EvaluateResponse",
+                "EvaluateScoringConfig",
+                "EvaluateSingleScorerConfig",
+                "EvaluateTaskConfig",
                 "FinetuningAlgorithm",
                 "FunctionCallToolDefinition",
                 "GetAgentsSessionRequest",
                 "GraphMemoryBankDef",
                 "HealthInfo",
+                "HuggingfaceDatasetDef",
                 "ImageMedia",
                 "InferenceStep",
                 "InsertDocumentsRequest",
                 "KeyValueMemoryBankDef",
                 "KeywordMemoryBankDef",
+                "LLMJudgeConfig",
                 "LogEventRequest",
                 "LogSeverity",
                 "LoraFinetuningConfig",
@@ -6752,6 +6963,8 @@
                 "RewardScoreRequest",
                 "RewardScoringResponse",
                 "RouteInfo",
+                "RunEvalTaskRequest",
+                "RunScorerRequest",
                 "RunShieldRequest",
                 "RunShieldResponse",
                 "SafetyViolation",
diff --git a/docs/resources/llama-stack-spec.yaml b/docs/resources/llama-stack-spec.yaml
index 9307ee47b..ab54c4c09 100644
--- a/docs/resources/llama-stack-spec.yaml
+++ b/docs/resources/llama-stack-spec.yaml
@@ -315,14 +315,6 @@ components:
       - photogen
       - code_interpreter
       type: string
-    CancelEvaluationJobRequest:
-      additionalProperties: false
-      properties:
-        job_uuid:
-          type: string
-      required:
-      - job_uuid
-      type: object
     CancelTrainingJobRequest:
       additionalProperties: false
       properties:
@@ -572,13 +564,45 @@ components:
     CreateDatasetRequest:
       additionalProperties: false
       properties:
-        dataset:
-          $ref: '#/components/schemas/TrainEvalDataset'
-        uuid:
+        dataset_def:
+          oneOf:
+          - $ref: '#/components/schemas/HuggingfaceDatasetDef'
+          - $ref: '#/components/schemas/CustomDatasetDef'
+      required:
+      - dataset_def
+      type: object
+    CreateDatasetResponse:
+      additionalProperties: false
+      properties:
+        msg:
+          type: string
+        status:
+          enum:
+          - success
+          - fail
           type: string
       required:
-      - uuid
-      - dataset
+      - status
+      type: object
+    CustomDatasetDef:
+      additionalProperties: false
+      properties:
+        identifier:
+          type: string
+        rename_columns_map:
+          additionalProperties:
+            type: string
+          type: object
+        type:
+          const: custom
+          default: custom
+          type: string
+        url:
+          type: string
+      required:
+      - type
+      - identifier
+      - url
       type: object
     DPOAlignmentConfig:
       additionalProperties: false
@@ -619,10 +643,23 @@ components:
     DeleteDatasetRequest:
       additionalProperties: false
       properties:
-        dataset_uuid:
+        dataset_identifier:
           type: string
       required:
-      - dataset_uuid
+      - dataset_identifier
+      type: object
+    DeleteDatasetResponse:
+      additionalProperties: false
+      properties:
+        msg:
+          type: string
+        status:
+          enum:
+          - success
+          - fail
+          type: string
+      required:
+      - status
       type: object
     DialogGenerations:
       additionalProperties: false
@@ -701,78 +738,147 @@ components:
       required:
       - embeddings
       type: object
-    EvaluateQuestionAnsweringRequest:
+    EvalResult:
       additionalProperties: false
       properties:
         metrics:
-          items:
-            enum:
-            - em
-            - f1
-            type: string
-          type: array
+          additionalProperties:
+            type: number
+          type: object
       required:
       - metrics
+      title: Aggregated final evaluation result.
       type: object
-    EvaluateSummarizationRequest:
+    EvaluateDatasetConfig:
       additionalProperties: false
       properties:
-        metrics:
+        dataset_identifier:
+          type: string
+        kwargs:
+          additionalProperties:
+            oneOf:
+            - type: 'null'
+            - type: boolean
+            - type: number
+            - type: string
+            - type: array
+            - type: object
+          type: object
+        row_limit:
+          type: integer
+      required:
+      - dataset_identifier
+      type: object
+    EvaluateJudgeScoringConfig:
+      type: object
+    EvaluateModelGenerationConfig:
+      additionalProperties: false
+      properties:
+        kwargs:
+          additionalProperties:
+            oneOf:
+            - type: 'null'
+            - type: boolean
+            - type: number
+            - type: string
+            - type: array
+            - type: object
+          type: object
+        model:
+          type: string
+        sampling_params:
+          $ref: '#/components/schemas/SamplingParams'
+      required:
+      - model
+      - sampling_params
+      type: object
+    EvaluatePostprocessConfig:
+      additionalProperties: false
+      properties:
+        kwargs:
+          additionalProperties:
+            oneOf:
+            - type: 'null'
+            - type: boolean
+            - type: number
+            - type: string
+            - type: array
+            - type: object
+          type: object
+      type: object
+    EvaluatePreprocessConfig:
+      additionalProperties: false
+      properties:
+        kwargs:
+          additionalProperties:
+            oneOf:
+            - type: 'null'
+            - type: boolean
+            - type: number
+            - type: string
+            - type: array
+            - type: object
+          type: object
+      type: object
+    EvaluateProcessorConfig:
+      additionalProperties: false
+      properties:
+        postprocess_config:
+          $ref: '#/components/schemas/EvaluatePostprocessConfig'
+        preprocess_config:
+          $ref: '#/components/schemas/EvaluatePreprocessConfig'
+        processor_identifier:
+          type: string
+      required:
+      - processor_identifier
+      type: object
+    EvaluateResponse:
+      additionalProperties: false
+      properties:
+        eval_result:
+          $ref: '#/components/schemas/EvalResult'
+        formatted_report:
+          type: string
+      required:
+      - eval_result
+      title: Scores for evaluation.
+      type: object
+    EvaluateScoringConfig:
+      additionalProperties: false
+      properties:
+        scorer_config_list:
           items:
-            enum:
-            - rouge
-            - bleu
-            type: string
+            $ref: '#/components/schemas/EvaluateSingleScorerConfig'
           type: array
       required:
-      - metrics
+      - scorer_config_list
       type: object
-    EvaluateTextGenerationRequest:
+    EvaluateSingleScorerConfig:
       additionalProperties: false
       properties:
-        metrics:
-          items:
-            enum:
-            - perplexity
-            - rouge
-            - bleu
-            type: string
-          type: array
-      required:
-      - metrics
-      type: object
-    EvaluationJob:
-      additionalProperties: false
-      properties:
-        job_uuid:
+        llm_judge_config:
+          $ref: '#/components/schemas/LLMJudgeConfig'
+        scorer_name:
           type: string
       required:
-      - job_uuid
+      - scorer_name
       type: object
-    EvaluationJobArtifactsResponse:
+    EvaluateTaskConfig:
       additionalProperties: false
       properties:
-        job_uuid:
-          type: string
+        dataset_config:
+          $ref: '#/components/schemas/EvaluateDatasetConfig'
+        generation_config:
+          $ref: '#/components/schemas/EvaluateModelGenerationConfig'
+        processor_config:
+          $ref: '#/components/schemas/EvaluateProcessorConfig'
+        scoring_config:
+          $ref: '#/components/schemas/EvaluateScoringConfig'
       required:
-      - job_uuid
-      title: Artifacts of a evaluation job.
-      type: object
-    EvaluationJobLogStream:
-      additionalProperties: false
-      properties:
-        job_uuid:
-          type: string
-      required:
-      - job_uuid
-      type: object
-    EvaluationJobStatusResponse:
-      additionalProperties: false
-      properties:
-        job_uuid:
-          type: string
-      required:
-      - job_uuid
+      - dataset_config
+      - processor_config
+      - generation_config
+      - scoring_config
       type: object
     FinetuningAlgorithm:
       enum:
@@ -845,6 +951,39 @@ components:
       required:
       - status
       type: object
+    HuggingfaceDatasetDef:
+      additionalProperties: false
+      properties:
+        dataset_name:
+          type: string
+        dataset_path:
+          type: string
+        identifier:
+          type: string
+        kwargs:
+          additionalProperties:
+            oneOf:
+            - type: 'null'
+            - type: boolean
+            - type: number
+            - type: string
+            - type: array
+            - type: object
+          type: object
+        rename_columns_map:
+          additionalProperties:
+            type: string
+          type: object
+        type:
+          const: huggingface
+          default: huggingface
+          type: string
+      required:
+      - type
+      - identifier
+      - dataset_path
+      - kwargs
+      type: object
     ImageMedia:
       additionalProperties: false
       properties:
@@ -936,6 +1075,20 @@ components:
       - provider_id
       - type
       type: object
+    LLMJudgeConfig:
+      additionalProperties: false
+      properties:
+        judge_model_generation_config:
+          $ref: '#/components/schemas/EvaluateModelGenerationConfig'
+        judge_processor_config:
+          $ref: '#/components/schemas/EvaluateProcessorConfig'
+        judge_scoring_config:
+          $ref: '#/components/schemas/EvaluateJudgeScoringConfig'
+      required:
+      - judge_processor_config
+      - judge_model_generation_config
+      - judge_scoring_config
+      type: object
     LogEventRequest:
       additionalProperties: false
       properties:
@@ -1629,6 +1782,32 @@ components:
       - method
       - provider_types
       type: object
+    RunEvalTaskRequest:
+      additionalProperties: false
+      properties:
+        dataset:
+          type: string
+        eval_task_config:
+          $ref: '#/components/schemas/EvaluateTaskConfig'
+        model:
+          type: string
+        task:
+          type: string
+      required:
+      - model
+      - task
+      type: object
+    RunScorerRequest:
+      additionalProperties: false
+      properties:
+        dataset_config:
+          $ref: '#/components/schemas/EvaluateDatasetConfig'
+        eval_scoring_config:
+          $ref: '#/components/schemas/EvaluateScoringConfig'
+      required:
+      - dataset_config
+      - eval_scoring_config
+      type: object
     RunShieldRequest:
       additionalProperties: false
       properties:
@@ -2507,7 +2686,7 @@ info:
   description: "This is the specification of the llama stack that provides\n     \
     \           a set of endpoints and their corresponding interfaces that are tailored\
     \ to\n                best leverage Llama Models. The specification is still in\
-    \ draft and subject to change.\n                Generated at 2024-10-09 21:10:09.073430"
+    \ draft and subject to change.\n                Generated at 2024-10-15 00:44:26.278642"
   title: '[DRAFT] Llama Stack Specification'
   version: 0.0.1
 jsonSchemaDialect: https://json-schema.org/draft/2020-12/schema
@@ -2693,7 +2872,7 @@ paths:
       responses:
         '200':
           content:
-            application/json:
+            text/event-stream:
               schema:
                 $ref: '#/components/schemas/AgentTurnResponseStreamChunk'
           description: OK
@@ -2796,6 +2975,10 @@ paths:
         required: true
       responses:
         '200':
+          content:
+            application/json:
+              schema:
+                $ref: '#/components/schemas/CreateDatasetResponse'
           description: OK
       tags:
       - Datasets
@@ -2817,6 +3000,10 @@ paths:
         required: true
       responses:
         '200':
+          content:
+            application/json:
+              schema:
+                $ref: '#/components/schemas/DeleteDatasetResponse'
           description: OK
       tags:
       - Datasets
@@ -2824,7 +3011,7 @@ paths:
     get:
       parameters:
       - in: query
-        name: dataset_uuid
+        name: dataset_identifier
         required: true
         schema:
           type: string
@@ -2840,104 +3027,15 @@ paths:
           content:
             application/json:
               schema:
-                $ref: '#/components/schemas/TrainEvalDataset'
+                oneOf:
+                - oneOf:
+                  - $ref: '#/components/schemas/HuggingfaceDatasetDef'
+                  - $ref: '#/components/schemas/CustomDatasetDef'
+                - type: 'null'
           description: OK
       tags:
       - Datasets
-  /evaluate/job/artifacts:
-    get:
-      parameters:
-      - in: query
-        name: job_uuid
-        required: true
-        schema:
-          type: string
-      - description: JSON-encoded provider data which will be made available to the
-          adapter servicing the API
-        in: header
-        name: X-LlamaStack-ProviderData
-        required: false
-        schema:
-          type: string
-      responses:
-        '200':
-          content:
-            application/json:
-              schema:
-                $ref: '#/components/schemas/EvaluationJobArtifactsResponse'
-          description: OK
-      tags:
-      - Evaluations
-  /evaluate/job/cancel:
-    post:
-      parameters:
-      - description: JSON-encoded provider data which will be made available to the
-          adapter servicing the API
-        in: header
-        name: X-LlamaStack-ProviderData
-        required: false
-        schema:
-          type: string
-      requestBody:
-        content:
-          application/json:
-            schema:
-              $ref: '#/components/schemas/CancelEvaluationJobRequest'
-        required: true
-      responses:
-        '200':
-          description: OK
-      tags:
-      - Evaluations
-  /evaluate/job/logs:
-    get:
-      parameters:
-      - in: query
-        name: job_uuid
-        required: true
-        schema:
-          type: string
-      - description: JSON-encoded provider data which will be made available to the
-          adapter servicing the API
-        in: header
-        name: X-LlamaStack-ProviderData
-        required: false
-        schema:
-          type: string
-      responses:
-        '200':
-          content:
-            application/json:
-              schema:
-                $ref: '#/components/schemas/EvaluationJobLogStream'
-          description: OK
-      tags:
-      - Evaluations
-  /evaluate/job/status:
-    get:
-      parameters:
-      - in: query
-        name: job_uuid
-        required: true
-        schema:
-          type: string
-      - description: JSON-encoded provider data which will be made available to the
-          adapter servicing the API
-        in: header
-        name: X-LlamaStack-ProviderData
-        required: false
-        schema:
-          type: string
-      responses:
-        '200':
-          content:
-            application/json:
-              schema:
-                $ref: '#/components/schemas/EvaluationJobStatusResponse'
-          description: OK
-      tags:
-      - Evaluations
-  /evaluate/jobs:
+  /datasets/list:
     get:
       parameters:
       - description: JSON-encoded provider data which will be made available to the
@@ -2952,11 +3050,13 @@ paths:
           content:
             application/jsonl:
               schema:
-                $ref: '#/components/schemas/EvaluationJob'
+                oneOf:
+                - $ref: '#/components/schemas/HuggingfaceDatasetDef'
+                - $ref: '#/components/schemas/CustomDatasetDef'
           description: OK
       tags:
-      - Evaluations
-  /evaluate/question_answering/:
+      - Datasets
+  /evals/run_eval_task:
     post:
       parameters:
       - description: JSON-encoded provider data which will be made available to the
@@ -2970,18 +3070,18 @@ paths:
         content:
           application/json:
             schema:
-              $ref: '#/components/schemas/EvaluateQuestionAnsweringRequest'
+              $ref: '#/components/schemas/RunEvalTaskRequest'
         required: true
       responses:
         '200':
           content:
             application/json:
               schema:
-                $ref: '#/components/schemas/EvaluationJob'
+                $ref: '#/components/schemas/EvaluateResponse'
           description: OK
       tags:
-      - Evaluations
-  /evaluate/summarization/:
+      - Evals
+  /evals/run_scorer:
     post:
       parameters:
       - description: JSON-encoded provider data which will be made available to the
@@ -2995,42 +3095,17 @@ paths:
         content:
           application/json:
             schema:
-              $ref: '#/components/schemas/EvaluateSummarizationRequest'
+              $ref: '#/components/schemas/RunScorerRequest'
         required: true
       responses:
         '200':
           content:
             application/json:
               schema:
-                $ref: '#/components/schemas/EvaluationJob'
+                $ref: '#/components/schemas/EvaluateResponse'
           description: OK
       tags:
-      - Evaluations
-  /evaluate/text_generation/:
-    post:
-      parameters:
-      - description: JSON-encoded provider data which will be made available to the
-          adapter servicing the API
-        in: header
-        name: X-LlamaStack-ProviderData
-        required: false
-        schema:
-          type: string
-      requestBody:
-        content:
-          application/json:
-            schema:
-              $ref: '#/components/schemas/EvaluateTextGenerationRequest'
-        required: true
-      responses:
-        '200':
-          content:
-            application/json:
-              schema:
-                $ref: '#/components/schemas/EvaluationJob'
-          description: OK
-      tags:
-      - Evaluations
+      - Evals
   /health:
     get:
       parameters:
@@ -3712,20 +3787,20 @@ security:
 servers:
 - url: http://any-hosted-llama-stack.com
 tags:
-- name: RewardScoring
+- name: Models
+- name: BatchInference
+- name: Inspect
+- name: Evals
+- name: Safety
+- name: Shields
+- name: Telemetry
+- name: Agents
 - name: Memory
 - name: SyntheticDataGeneration
-- name: Models
-- name: Safety
-- name: BatchInference
-- name: Agents
-- name: MemoryBanks
-- name: Shields
-- name: Datasets
-- name: Evaluations
-- name: Inspect
 - name: PostTraining
-- name: Telemetry
+- name: Datasets
+- name: MemoryBanks
+- name: RewardScoring
 - name: Inference
 - description: <SchemaDefinition schemaRef="#/components/schemas/BuiltinTool" />
   name: BuiltinTool
@@ -3782,9 +3857,6 @@ tags:
 - description: <SchemaDefinition schemaRef="#/components/schemas/BatchCompletionResponse"
     />
   name: BatchCompletionResponse
-- description: <SchemaDefinition schemaRef="#/components/schemas/CancelEvaluationJobRequest"
-    />
-  name: CancelEvaluationJobRequest
 - description: <SchemaDefinition schemaRef="#/components/schemas/CancelTrainingJobRequest"
     />
   name: CancelTrainingJobRequest
@@ -3919,17 +3991,18 @@ tags:
   name: Turn
 - description: <SchemaDefinition schemaRef="#/components/schemas/ViolationLevel" />
   name: ViolationLevel
-- description: 'Dataset to be used for training or evaluating language models.
-
-
-    <SchemaDefinition schemaRef="#/components/schemas/TrainEvalDataset" />'
-  name: TrainEvalDataset
-- description: <SchemaDefinition schemaRef="#/components/schemas/TrainEvalDatasetColumnType"
+- description: <SchemaDefinition schemaRef="#/components/schemas/CustomDatasetDef"
     />
-  name: TrainEvalDatasetColumnType
+  name: CustomDatasetDef
+- description: <SchemaDefinition schemaRef="#/components/schemas/HuggingfaceDatasetDef"
+    />
+  name: HuggingfaceDatasetDef
 - description: <SchemaDefinition schemaRef="#/components/schemas/CreateDatasetRequest"
     />
   name: CreateDatasetRequest
+- description: <SchemaDefinition schemaRef="#/components/schemas/CreateDatasetResponse"
+    />
+  name: CreateDatasetResponse
 - description: <SchemaDefinition schemaRef="#/components/schemas/DeleteAgentsRequest"
     />
   name: DeleteAgentsRequest
@@ -3939,23 +4012,15 @@ tags:
 - description: <SchemaDefinition schemaRef="#/components/schemas/DeleteDatasetRequest"
     />
   name: DeleteDatasetRequest
+- description: <SchemaDefinition schemaRef="#/components/schemas/DeleteDatasetResponse"
+    />
+  name: DeleteDatasetResponse
 - description: <SchemaDefinition schemaRef="#/components/schemas/EmbeddingsRequest"
     />
   name: EmbeddingsRequest
 - description: <SchemaDefinition schemaRef="#/components/schemas/EmbeddingsResponse"
     />
   name: EmbeddingsResponse
-- description: <SchemaDefinition schemaRef="#/components/schemas/EvaluateQuestionAnsweringRequest"
-    />
-  name: EvaluateQuestionAnsweringRequest
-- description: <SchemaDefinition schemaRef="#/components/schemas/EvaluationJob" />
-  name: EvaluationJob
-- description: <SchemaDefinition schemaRef="#/components/schemas/EvaluateSummarizationRequest"
-    />
-  name: EvaluateSummarizationRequest
-- description: <SchemaDefinition schemaRef="#/components/schemas/EvaluateTextGenerationRequest"
-    />
-  name: EvaluateTextGenerationRequest
 - description: <SchemaDefinition schemaRef="#/components/schemas/GetAgentsSessionRequest"
     />
   name: GetAgentsSessionRequest
@@ -3979,18 +4044,6 @@ tags:
 - description: <SchemaDefinition schemaRef="#/components/schemas/AgentStepResponse"
     />
   name: AgentStepResponse
-- description: 'Artifacts of a evaluation job.
-
-
-    <SchemaDefinition schemaRef="#/components/schemas/EvaluationJobArtifactsResponse"
-    />'
-  name: EvaluationJobArtifactsResponse
-- description: <SchemaDefinition schemaRef="#/components/schemas/EvaluationJobLogStream"
-    />
-  name: EvaluationJobLogStream
-- description: <SchemaDefinition schemaRef="#/components/schemas/EvaluationJobStatusResponse"
-    />
-  name: EvaluationJobStatusResponse
 - description: <SchemaDefinition schemaRef="#/components/schemas/ModelDefWithProvider"
     />
   name: ModelDefWithProvider
@@ -4067,6 +4120,14 @@ tags:
   name: OptimizerConfig
 - description: <SchemaDefinition schemaRef="#/components/schemas/RLHFAlgorithm" />
   name: RLHFAlgorithm
+- description: 'Dataset to be used for training or evaluating language models.
+
+
+    <SchemaDefinition schemaRef="#/components/schemas/TrainEvalDataset" />'
+  name: TrainEvalDataset
+- description: <SchemaDefinition schemaRef="#/components/schemas/TrainEvalDatasetColumnType"
+    />
+  name: TrainEvalDatasetColumnType
 - description: <SchemaDefinition schemaRef="#/components/schemas/TrainingConfig" />
   name: TrainingConfig
 - description: <SchemaDefinition schemaRef="#/components/schemas/PreferenceOptimizeRequest"
@@ -4104,6 +4165,51 @@ tags:
   name: ScoredDialogGenerations
 - description: <SchemaDefinition schemaRef="#/components/schemas/ScoredMessage" />
   name: ScoredMessage
+- description: <SchemaDefinition schemaRef="#/components/schemas/EvaluateDatasetConfig"
+    />
+  name: EvaluateDatasetConfig
+- description: <SchemaDefinition schemaRef="#/components/schemas/EvaluateJudgeScoringConfig"
+    />
+  name: EvaluateJudgeScoringConfig
+- description: <SchemaDefinition schemaRef="#/components/schemas/EvaluateModelGenerationConfig"
+    />
+  name: EvaluateModelGenerationConfig
+- description: <SchemaDefinition schemaRef="#/components/schemas/EvaluatePostprocessConfig"
+    />
+  name: EvaluatePostprocessConfig
+- description: <SchemaDefinition schemaRef="#/components/schemas/EvaluatePreprocessConfig"
+    />
+  name: EvaluatePreprocessConfig
+- description: <SchemaDefinition schemaRef="#/components/schemas/EvaluateProcessorConfig"
+    />
+  name: EvaluateProcessorConfig
+- description: <SchemaDefinition schemaRef="#/components/schemas/EvaluateScoringConfig"
+    />
+  name: EvaluateScoringConfig
+- description: <SchemaDefinition schemaRef="#/components/schemas/EvaluateSingleScorerConfig"
+    />
+  name: EvaluateSingleScorerConfig
+- description: <SchemaDefinition schemaRef="#/components/schemas/EvaluateTaskConfig"
+    />
+  name: EvaluateTaskConfig
+- description: <SchemaDefinition schemaRef="#/components/schemas/LLMJudgeConfig" />
+  name: LLMJudgeConfig
+- description: <SchemaDefinition schemaRef="#/components/schemas/RunEvalTaskRequest"
+    />
+  name: RunEvalTaskRequest
+- description: 'Aggregated final evaluation result.
+
+
+    <SchemaDefinition schemaRef="#/components/schemas/EvalResult" />'
+  name: EvalResult
+- description: 'Scores for evaluation.
+
+
+    <SchemaDefinition schemaRef="#/components/schemas/EvaluateResponse" />'
+  name: EvaluateResponse
+- description: <SchemaDefinition schemaRef="#/components/schemas/RunScorerRequest"
+    />
+  name: RunScorerRequest
 - description: <SchemaDefinition schemaRef="#/components/schemas/RunShieldRequest"
     />
   name: RunShieldRequest
@@ -4141,7 +4247,7 @@ x-tagGroups:
   - Agents
   - BatchInference
   - Datasets
-  - Evaluations
+  - Evals
   - Inference
   - Inspect
   - Memory
@@ -4172,7 +4278,6 @@ x-tagGroups:
   - BatchCompletionRequest
   - BatchCompletionResponse
   - BuiltinTool
-  - CancelEvaluationJobRequest
   - CancelTrainingJobRequest
   - ChatCompletionRequest
   - ChatCompletionResponse
@@ -4189,31 +4294,40 @@ x-tagGroups:
   - CreateAgentSessionRequest
   - CreateAgentTurnRequest
   - CreateDatasetRequest
+  - CreateDatasetResponse
+  - CustomDatasetDef
   - DPOAlignmentConfig
   - DeleteAgentsRequest
   - DeleteAgentsSessionRequest
   - DeleteDatasetRequest
+  - DeleteDatasetResponse
   - DialogGenerations
   - DoraFinetuningConfig
   - EmbeddingsRequest
   - EmbeddingsResponse
-  - EvaluateQuestionAnsweringRequest
-  - EvaluateSummarizationRequest
-  - EvaluateTextGenerationRequest
-  - EvaluationJob
-  - EvaluationJobArtifactsResponse
-  - EvaluationJobLogStream
-  - EvaluationJobStatusResponse
+  - EvalResult
+  - EvaluateDatasetConfig
+  - EvaluateJudgeScoringConfig
+  - EvaluateModelGenerationConfig
+  - EvaluatePostprocessConfig
+  - EvaluatePreprocessConfig
+  - EvaluateProcessorConfig
+  - EvaluateResponse
+  - EvaluateScoringConfig
+  - EvaluateSingleScorerConfig
+  - EvaluateTaskConfig
   - FinetuningAlgorithm
   - FunctionCallToolDefinition
   - GetAgentsSessionRequest
   - GraphMemoryBankDef
   - HealthInfo
+  - HuggingfaceDatasetDef
   - ImageMedia
   - InferenceStep
   - InsertDocumentsRequest
   - KeyValueMemoryBankDef
   - KeywordMemoryBankDef
+  - LLMJudgeConfig
   - LogEventRequest
   - LogSeverity
   - LoraFinetuningConfig
@@ -4243,6 +4357,8 @@ x-tagGroups:
   - RewardScoreRequest
   - RewardScoringResponse
   - RouteInfo
+  - RunEvalTaskRequest
+  - RunScorerRequest
   - RunShieldRequest
   - RunShieldResponse
   - SafetyViolation
diff --git a/llama_stack/apis/datasets/datasets.py b/llama_stack/apis/datasets/datasets.py
index c0aa4d161..f5991c52e 100644
--- a/llama_stack/apis/datasets/datasets.py
+++ b/llama_stack/apis/datasets/datasets.py
@@ -15,6 +15,26 @@ from pydantic import BaseModel, Field
 from typing_extensions import Annotated
 
 
+@json_schema_type
+class TrainEvalDatasetColumnType(Enum):  # value type of TrainEvalDataset.columns
+    dialog = "dialog"  # every member's value is its own name as a string
+    text = "text"
+    media = "media"
+    number = "number"
+    json = "json"
+
+
+@json_schema_type
+class TrainEvalDataset(BaseModel):
+    """Dataset to be used for training or evaluating language models."""
+
+    # TODO(ashwin): figure out if we need to add an enum for a "dataset type"
+
+    columns: Dict[str, TrainEvalDatasetColumnType]  # no default; str key (presumably the column name) -> column type
+    content_url: URL  # no default; presumably where the dataset content lives -- TODO confirm
+    metadata: Optional[Dict[str, Any]] = None  # optional free-form dict; defaults to None
+
+
 @json_schema_type
 class GenerationInput(BaseModel):
     messages: List[Message]
diff --git a/llama_stack/apis/post_training/post_training.py b/llama_stack/apis/post_training/post_training.py
index d943f48b2..cdfe5c467 100644
--- a/llama_stack/apis/post_training/post_training.py
+++ b/llama_stack/apis/post_training/post_training.py
@@ -14,7 +14,7 @@ from llama_models.schema_utils import json_schema_type, webmethod
 from pydantic import BaseModel, Field
 
 from llama_models.llama3.api.datatypes import *  # noqa: F403
-from llama_stack.apis.dataset import *  # noqa: F403
+from llama_stack.apis.datasets import *  # noqa: F403
 from llama_stack.apis.common.training_types import *  # noqa: F403