diff --git a/docs/_static/llama-stack-spec.html b/docs/_static/llama-stack-spec.html
index 16847f542..68f27ef3b 100644
--- a/docs/_static/llama-stack-spec.html
+++ b/docs/_static/llama-stack-spec.html
@@ -69,11 +69,12 @@
                 "tags": [
                     "DatasetIO"
                 ],
-                "description": "",
+                "description": "Get a paginated list of rows from a dataset.",
                 "parameters": [
                     {
                         "name": "dataset_id",
                         "in": "query",
+                        "description": "The ID of the dataset to get the rows from.",
                         "required": true,
                         "schema": {
                             "type": "string"
@@ -82,6 +83,7 @@
                     {
                         "name": "rows_in_page",
                         "in": "query",
+                        "description": "The number of rows to get per page.",
                         "required": true,
                         "schema": {
                             "type": "integer"
@@ -90,6 +92,7 @@
                     {
                         "name": "page_token",
                         "in": "query",
+                        "description": "The token to get the next page of rows.",
                         "required": false,
                         "schema": {
                             "type": "string"
@@ -98,6 +101,7 @@
                     {
                         "name": "filter_condition",
                         "in": "query",
+                        "description": "(Optional) A condition to filter the rows by.",
                         "required": false,
                         "schema": {
                             "type": "string"
@@ -896,7 +900,7 @@
            "post": {
                "responses": {
                    "200": {
-                        "description": "OK",
+                        "description": "EvaluateResponse object containing generations and scores",
                        "content": {
                            "application/json": {
                                "schema": {
@@ -921,11 +925,12 @@
                 "tags": [
                     "Eval"
                 ],
-                "description": "",
+                "description": "Evaluate a list of rows on a benchmark.",
                 "parameters": [
                     {
                         "name": "benchmark_id",
                         "in": "path",
+                        "description": "The ID of the benchmark to run the evaluation on.",
                         "required": true,
                         "schema": {
                             "type": "string"
@@ -2121,7 +2126,7 @@
            "get": {
                "responses": {
                    "200": {
-                        "description": "OK",
+                        "description": "The status of the evaluation job.",
                        "content": {
                            "application/json": {
                                "schema": {
@@ -2153,11 +2158,12 @@
                 "tags": [
                     "Eval"
                 ],
-                "description": "",
+                "description": "Get the status of a job.",
                 "parameters": [
                     {
                         "name": "benchmark_id",
                         "in": "path",
+                        "description": "The ID of the benchmark to run the evaluation on.",
                         "required": true,
                         "schema": {
                             "type": "string"
@@ -2166,6 +2172,7 @@
                     {
                         "name": "job_id",
                         "in": "path",
+                        "description": "The ID of the job to get the status of.",
                         "required": true,
                         "schema": {
                             "type": "string"
@@ -2194,11 +2201,12 @@
                 "tags": [
                     "Eval"
                 ],
-                "description": "",
+                "description": "Cancel a job.",
                 "parameters": [
                     {
                         "name": "benchmark_id",
                         "in": "path",
+                        "description": "The ID of the benchmark to run the evaluation on.",
                         "required": true,
                         "schema": {
                             "type": "string"
@@ -2207,6 +2215,7 @@
                     {
                         "name": "job_id",
                         "in": "path",
+                        "description": "The ID of the job to cancel.",
                         "required": true,
                         "schema": {
                             "type": "string"
@@ -2219,7 +2228,7 @@
            "get": {
                "responses": {
                    "200": {
-                        "description": "OK",
+                        "description": "The result of the job.",
                        "content": {
                            "application/json": {
                                "schema": {
@@ -2244,11 +2253,12 @@
                 "tags": [
                     "Eval"
                 ],
-                "description": "",
+                "description": "Get the result of a job.",
                 "parameters": [
                     {
                         "name": "benchmark_id",
                         "in": "path",
+                        "description": "The ID of the benchmark to run the evaluation on.",
                         "required": true,
                         "schema": {
                             "type": "string"
@@ -2257,6 +2267,7 @@
                     {
                         "name": "job_id",
                         "in": "path",
+                        "description": "The ID of the job to get the result of.",
                         "required": true,
                         "schema": {
                             "type": "string"
@@ -3287,7 +3298,7 @@
            "post": {
                "responses": {
                    "200": {
-                        "description": "OK",
+                        "description": "The job that was created to run the evaluation.",
                        "content": {
                            "application/json": {
                                "schema": {
@@ -3312,11 +3323,12 @@
                 "tags": [
                     "Eval"
                 ],
-                "description": "",
+                "description": "Run an evaluation on a benchmark.",
                 "parameters": [
                     {
                         "name": "benchmark_id",
                         "in": "path",
+                        "description": "The ID of the benchmark to run the evaluation on.",
                         "required": true,
                         "schema": {
                             "type": "string"
@@ -3418,7 +3430,7 @@
            "post": {
                "responses": {
                    "200": {
-                        "description": "OK",
+                        "description": "ScoreResponse object containing rows and aggregated results",
                        "content": {
                            "application/json": {
                                "schema": {
@@ -3443,7 +3455,7 @@
                "tags": [
                    "Scoring"
                ],
-                "description": "",
+                "description": "Score a list of rows.",
                "parameters": [],
                "requestBody": {
                    "content": {
@@ -6187,7 +6199,8 @@
                    "default": "agent"
                },
                "config": {
-                    "$ref": "#/components/schemas/AgentConfig"
+                    "$ref": "#/components/schemas/AgentConfig",
+                    "description": "The configuration for the agent candidate."
                }
            },
            "additionalProperties": false,
@@ -6195,7 +6208,8 @@
                "type",
                "config"
            ],
-            "title": "AgentCandidate"
+            "title": "AgentCandidate",
+            "description": "An agent candidate for evaluation."
        },
        "AggregationFunctionType": {
            "type": "string",
@@ -6232,16 +6246,19 @@
            "type": "object",
            "properties": {
                "eval_candidate": {
-                    "$ref": "#/components/schemas/EvalCandidate"
+                    "$ref": "#/components/schemas/EvalCandidate",
+                    "description": "The candidate to evaluate."
                },
                "scoring_params": {
                    "type": "object",
                    "additionalProperties": {
                        "$ref": "#/components/schemas/ScoringFnParams"
-                    }
+                    },
+                    "description": "Map between scoring function id and parameters for each scoring function you want to run"
                },
                "num_examples": {
-                    "type": "integer"
+                    "type": "integer",
+                    "description": "(Optional) The number of examples to evaluate. If not provided, all examples in the dataset will be evaluated"
                }
            },
            "additionalProperties": false,
@@ -6249,7 +6266,8 @@
                "eval_candidate",
                "scoring_params"
            ],
-            "title": "BenchmarkConfig"
+            "title": "BenchmarkConfig",
+            "description": "A benchmark configuration for evaluation."
        },
        "EvalCandidate": {
            "oneOf": [
@@ -6311,13 +6329,16 @@
                    "default": "model"
                },
                "model": {
-                    "type": "string"
+                    "type": "string",
+                    "description": "The model ID to evaluate."
                },
                "sampling_params": {
-                    "$ref": "#/components/schemas/SamplingParams"
+                    "$ref": "#/components/schemas/SamplingParams",
+                    "description": "The sampling parameters for the model."
                },
                "system_message": {
-                    "$ref": "#/components/schemas/SystemMessage"
+                    "$ref": "#/components/schemas/SystemMessage",
+                    "description": "(Optional) The system message providing instructions or context to the model."
                }
            },
            "additionalProperties": false,
@@ -6326,7 +6347,8 @@
                "type",
                "model",
                "sampling_params"
            ],
-            "title": "ModelCandidate"
+            "title": "ModelCandidate",
+            "description": "A model candidate for evaluation."
        },
        "RegexParserScoringFnParams": {
            "type": "object",
@@ -6405,16 +6427,19 @@
                                }
                            ]
                        }
-                    }
+                    },
+                    "description": "The rows to evaluate."
                },
                "scoring_functions": {
                    "type": "array",
                    "items": {
                        "type": "string"
-                    }
+                    },
+                    "description": "The scoring functions to use for the evaluation."
                },
                "benchmark_config": {
-                    "$ref": "#/components/schemas/BenchmarkConfig"
+                    "$ref": "#/components/schemas/BenchmarkConfig",
+                    "description": "The configuration for the benchmark."
                }
            },
            "additionalProperties": false,
@@ -6454,13 +6479,15 @@
                                }
                            ]
                        }
-                    }
+                    },
+                    "description": "The generations from the evaluation."
                },
                "scores": {
                    "type": "object",
                    "additionalProperties": {
                        "$ref": "#/components/schemas/ScoringResult"
-                    }
+                    },
+                    "description": "The scores from the evaluation."
                }
            },
            "additionalProperties": false,
@@ -6468,7 +6495,8 @@
                "generations",
                "scores"
            ],
-            "title": "EvaluateResponse"
+            "title": "EvaluateResponse",
+            "description": "The response from an evaluation."
}, "ScoringResult": { "type": "object", @@ -6499,7 +6527,8 @@ } ] } - } + }, + "description": "The scoring result for each row. Each row is a map of column name to value." }, "aggregated_results": { "type": "object", @@ -6524,7 +6553,8 @@ "type": "object" } ] - } + }, + "description": "Map of metric name to aggregated value" } }, "additionalProperties": false, @@ -6532,7 +6562,8 @@ "score_rows", "aggregated_results" ], - "title": "ScoringResult" + "title": "ScoringResult", + "description": "A scoring result for a single row." }, "Session": { "type": "object", @@ -7021,13 +7052,16 @@ } ] } - } + }, + "description": "The rows in the current page." }, "total_count": { - "type": "integer" + "type": "integer", + "description": "The total number of rows in the dataset." }, "next_page_token": { - "type": "string" + "type": "string", + "description": "The token to get the next page of rows." } }, "additionalProperties": false, @@ -7035,7 +7069,8 @@ "rows", "total_count" ], - "title": "PaginatedRowsResult" + "title": "PaginatedRowsResult", + "description": "A paginated list of rows from a dataset." }, "ScoringFn": { "type": "object", @@ -9307,7 +9342,8 @@ "type": "object", "properties": { "benchmark_config": { - "$ref": "#/components/schemas/BenchmarkConfig" + "$ref": "#/components/schemas/BenchmarkConfig", + "description": "The configuration for the benchmark." } }, "additionalProperties": false, @@ -9444,7 +9480,8 @@ } ] } - } + }, + "description": "The rows to score." }, "scoring_functions": { "type": "object", @@ -9457,7 +9494,8 @@ "type": "null" } ] - } + }, + "description": "The scoring functions to use for the scoring." } }, "additionalProperties": false, @@ -9474,14 +9512,16 @@ "type": "object", "additionalProperties": { "$ref": "#/components/schemas/ScoringResult" - } + }, + "description": "A map of scoring function name to ScoringResult." } }, "additionalProperties": false, "required": [ "results" ], - "title": "ScoreResponse" + "title": "ScoreResponse", + "description": "The response from scoring." }, "ScoreBatchRequest": { "type": "object", @@ -9896,7 +9936,8 @@ "name": "Datasets" }, { - "name": "Eval" + "name": "Eval", + "x-displayName": "Llama Stack Evaluation API for running evaluations on model and agent candidates." }, { "name": "Files (Coming Soon)" diff --git a/docs/_static/llama-stack-spec.yaml b/docs/_static/llama-stack-spec.yaml index cfca894fb..bb994b0c5 100644 --- a/docs/_static/llama-stack-spec.yaml +++ b/docs/_static/llama-stack-spec.yaml @@ -31,25 +31,32 @@ paths: $ref: '#/components/responses/DefaultError' tags: - DatasetIO - description: '' + description: >- + Get a paginated list of rows from a dataset. parameters: - name: dataset_id in: query + description: >- + The ID of the dataset to get the rows from. required: true schema: type: string - name: rows_in_page in: query + description: The number of rows to get per page. required: true schema: type: integer - name: page_token in: query + description: The token to get the next page of rows. required: false schema: type: string - name: filter_condition in: query + description: >- + (Optional) A condition to filter the rows by. required: false schema: type: string @@ -613,7 +620,8 @@ paths: post: responses: '200': - description: OK + description: >- + EvaluateResponse object containing generations and scores content: application/json: schema: @@ -630,10 +638,12 @@ paths: $ref: '#/components/responses/DefaultError' tags: - Eval - description: '' + description: Evaluate a list of rows on a benchmark. 
diff --git a/docs/_static/llama-stack-spec.yaml b/docs/_static/llama-stack-spec.yaml
index cfca894fb..bb994b0c5 100644
--- a/docs/_static/llama-stack-spec.yaml
+++ b/docs/_static/llama-stack-spec.yaml
@@ -31,25 +31,32 @@ paths:
            $ref: '#/components/responses/DefaultError'
      tags:
        - DatasetIO
-      description: ''
+      description: >-
+        Get a paginated list of rows from a dataset.
      parameters:
        - name: dataset_id
          in: query
+          description: >-
+            The ID of the dataset to get the rows from.
          required: true
          schema:
            type: string
        - name: rows_in_page
          in: query
+          description: The number of rows to get per page.
          required: true
          schema:
            type: integer
        - name: page_token
          in: query
+          description: The token to get the next page of rows.
          required: false
          schema:
            type: string
        - name: filter_condition
          in: query
+          description: >-
+            (Optional) A condition to filter the rows by.
          required: false
          schema:
            type: string
@@ -613,7 +620,8 @@ paths:
    post:
      responses:
        '200':
-          description: OK
+          description: >-
+            EvaluateResponse object containing generations and scores
          content:
            application/json:
              schema:
@@ -630,10 +638,12 @@ paths:
            $ref: '#/components/responses/DefaultError'
      tags:
        - Eval
-      description: ''
+      description: Evaluate a list of rows on a benchmark.
      parameters:
        - name: benchmark_id
          in: path
+          description: >-
+            The ID of the benchmark to run the evaluation on.
          required: true
          schema:
            type: string
@@ -1417,7 +1427,7 @@ paths:
    get:
      responses:
        '200':
-          description: OK
+          description: The status of the evaluation job.
          content:
            application/json:
              schema:
@@ -1436,15 +1446,18 @@ paths:
            $ref: '#/components/responses/DefaultError'
      tags:
        - Eval
-      description: ''
+      description: Get the status of a job.
      parameters:
        - name: benchmark_id
          in: path
+          description: >-
+            The ID of the benchmark to run the evaluation on.
          required: true
          schema:
            type: string
        - name: job_id
          in: path
+          description: The ID of the job to get the status of.
          required: true
          schema:
            type: string
@@ -1464,15 +1477,18 @@ paths:
            $ref: '#/components/responses/DefaultError'
      tags:
        - Eval
-      description: ''
+      description: Cancel a job.
      parameters:
        - name: benchmark_id
          in: path
+          description: >-
+            The ID of the benchmark to run the evaluation on.
          required: true
          schema:
            type: string
        - name: job_id
          in: path
+          description: The ID of the job to cancel.
          required: true
          schema:
            type: string
@@ -1480,7 +1496,7 @@ paths:
    get:
      responses:
        '200':
-          description: OK
+          description: The result of the job.
          content:
            application/json:
              schema:
@@ -1497,15 +1513,18 @@ paths:
            $ref: '#/components/responses/DefaultError'
      tags:
        - Eval
-      description: ''
+      description: Get the result of a job.
      parameters:
        - name: benchmark_id
          in: path
+          description: >-
+            The ID of the benchmark to run the evaluation on.
          required: true
          schema:
            type: string
        - name: job_id
          in: path
+          description: The ID of the job to get the result of.
          required: true
          schema:
            type: string
@@ -2218,7 +2237,8 @@ paths:
    post:
      responses:
        '200':
-          description: OK
+          description: >-
+            The job that was created to run the evaluation.
          content:
            application/json:
              schema:
@@ -2235,10 +2255,12 @@ paths:
            $ref: '#/components/responses/DefaultError'
      tags:
        - Eval
-      description: ''
+      description: Run an evaluation on a benchmark.
      parameters:
        - name: benchmark_id
          in: path
+          description: >-
+            The ID of the benchmark to run the evaluation on.
          required: true
          schema:
            type: string
@@ -2306,7 +2328,8 @@ paths:
    post:
      responses:
        '200':
-          description: OK
+          description: >-
+            ScoreResponse object containing rows and aggregated results
          content:
            application/json:
              schema:
@@ -2323,7 +2346,7 @@ paths:
            $ref: '#/components/responses/DefaultError'
      tags:
        - Scoring
-      description: ''
+      description: Score a list of rows.
      parameters: []
      requestBody:
        content:
@@ -4290,11 +4313,14 @@ components:
          default: agent
        config:
          $ref: '#/components/schemas/AgentConfig'
+          description: >-
+            The configuration for the agent candidate.
      additionalProperties: false
      required:
        - type
        - config
      title: AgentCandidate
+      description: An agent candidate for evaluation.
    AggregationFunctionType:
      type: string
      enum:
@@ -4323,17 +4349,26 @@ components:
      properties:
        eval_candidate:
          $ref: '#/components/schemas/EvalCandidate'
+          description: The candidate to evaluate.
        scoring_params:
          type: object
          additionalProperties:
            $ref: '#/components/schemas/ScoringFnParams'
+          description: >-
+            Map between scoring function id and parameters for each scoring function
+            you want to run
        num_examples:
          type: integer
+          description: >-
+            (Optional) The number of examples to evaluate. If not provided, all examples
+            in the dataset will be evaluated
      additionalProperties: false
      required:
        - eval_candidate
        - scoring_params
      title: BenchmarkConfig
+      description: >-
+        A benchmark configuration for evaluation.
    EvalCandidate:
      oneOf:
        - $ref: '#/components/schemas/ModelCandidate'
        - $ref: '#/components/schemas/AgentCandidate'
      discriminator:
        propertyName: type
        mapping:
          model: '#/components/schemas/ModelCandidate'
          agent: '#/components/schemas/AgentCandidate'
@@ -4376,16 +4411,22 @@ components:
          default: model
        model:
          type: string
+          description: The model ID to evaluate.
        sampling_params:
          $ref: '#/components/schemas/SamplingParams'
+          description: The sampling parameters for the model.
        system_message:
          $ref: '#/components/schemas/SystemMessage'
+          description: >-
+            (Optional) The system message providing instructions or context to the
+            model.
      additionalProperties: false
      required:
        - type
        - model
        - sampling_params
      title: ModelCandidate
+      description: A model candidate for evaluation.
    RegexParserScoringFnParams:
      type: object
      properties:
@@ -4431,12 +4472,16 @@ components:
            - type: string
            - type: array
            - type: object
+          description: The rows to evaluate.
        scoring_functions:
          type: array
          items:
            type: string
+          description: >-
+            The scoring functions to use for the evaluation.
        benchmark_config:
          $ref: '#/components/schemas/BenchmarkConfig'
+          description: The configuration for the benchmark.
      additionalProperties: false
      required:
        - input_rows
@@ -4458,15 +4503,18 @@ components:
            - type: string
            - type: array
            - type: object
+          description: The generations from the evaluation.
        scores:
          type: object
          additionalProperties:
            $ref: '#/components/schemas/ScoringResult'
+          description: The scores from the evaluation.
      additionalProperties: false
      required:
        - generations
        - scores
      title: EvaluateResponse
+      description: The response from an evaluation.
    ScoringResult:
      type: object
      properties:
@@ -4482,6 +4530,8 @@ components:
            - type: string
            - type: array
            - type: object
+          description: >-
+            The scoring result for each row. Each row is a map of column name to value.
        aggregated_results:
          type: object
          additionalProperties:
@@ -4492,11 +4542,13 @@ components:
            - type: string
            - type: array
            - type: object
+          description: Map of metric name to aggregated value
      additionalProperties: false
      required:
        - score_rows
        - aggregated_results
      title: ScoringResult
+      description: A scoring result for a single row.
    Session:
      type: object
      properties:
@@ -4809,15 +4861,19 @@ components:
            - type: string
            - type: array
            - type: object
+          description: The rows in the current page.
        total_count:
          type: integer
+          description: The total number of rows in the dataset.
        next_page_token:
          type: string
+          description: The token to get the next page of rows.
      additionalProperties: false
      required:
        - rows
        - total_count
      title: PaginatedRowsResult
+      description: A paginated list of rows from a dataset.
    ScoringFn:
      type: object
      properties:
@@ -6248,6 +6304,7 @@ components:
      properties:
        benchmark_config:
          $ref: '#/components/schemas/BenchmarkConfig'
+          description: The configuration for the benchmark.
      additionalProperties: false
      required:
        - benchmark_config
@@ -6329,12 +6386,15 @@ components:
            - type: string
            - type: array
            - type: object
+          description: The rows to score.
        scoring_functions:
          type: object
          additionalProperties:
            oneOf:
              - $ref: '#/components/schemas/ScoringFnParams'
              - type: 'null'
+          description: >-
+            The scoring functions to use for the scoring.
      additionalProperties: false
      required:
        - input_rows
@@ -6347,10 +6407,13 @@ components:
      type: object
      additionalProperties:
        $ref: '#/components/schemas/ScoringResult'
+      description: >-
+        A map of scoring function name to ScoringResult.
      additionalProperties: false
      required:
        - results
      title: ScoreResponse
+      description: The response from scoring.
    ScoreBatchRequest:
      type: object
      properties:
@@ -6621,6 +6684,8 @@ tags:
 - name: DatasetIO
 - name: Datasets
 - name: Eval
+  x-displayName: >-
+    Llama Stack Evaluation API for running evaluations on model and agent candidates.
 - name: Files (Coming Soon)
 - name: Inference
   description: >-
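The YAML spec mirrors the JSON spec above. As a concrete illustration of the schemas both files document, a minimal request body for the evaluate-rows operation (`POST /eval/benchmarks/{benchmark_id}/evaluations`, per the `webmethod` route in `eval.py` below) could look like the sketch here; the model ID and scoring function ID are placeholders, not values mandated by the spec.

```python
# A minimal EvaluateRowsRequest-shaped body, following the schemas documented
# above. "basic::equality" and the model ID are placeholder identifiers.
evaluate_rows_request = {
    "input_rows": [{"input_query": "What is 2 + 2?", "expected_answer": "4"}],
    "scoring_functions": ["basic::equality"],
    "benchmark_config": {
        "eval_candidate": {
            "type": "model",  # discriminator selecting ModelCandidate
            "model": "meta-llama/Llama-3.1-8B-Instruct",
            "sampling_params": {},  # rely on SamplingParams defaults
        },
        # Map of scoring function id -> params; empty uses each function's defaults.
        "scoring_params": {},
        "num_examples": 1,  # optional; omit to evaluate all rows
    },
}
```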
diff --git a/llama_stack/apis/datasetio/datasetio.py b/llama_stack/apis/datasetio/datasetio.py
index d85d22876..6a04a6329 100644
--- a/llama_stack/apis/datasetio/datasetio.py
+++ b/llama_stack/apis/datasetio/datasetio.py
@@ -14,6 +14,14 @@ from llama_stack.schema_utils import json_schema_type, webmethod
 
 @json_schema_type
 class PaginatedRowsResult(BaseModel):
+    """
+    A paginated list of rows from a dataset.
+
+    :param rows: The rows in the current page.
+    :param total_count: The total number of rows in the dataset.
+    :param next_page_token: The token to get the next page of rows.
+    """
+
     # the rows obey the DatasetSchema for the given dataset
     rows: List[Dict[str, Any]]
     total_count: int
@@ -36,7 +44,15 @@ class DatasetIO(Protocol):
         rows_in_page: int,
         page_token: Optional[str] = None,
         filter_condition: Optional[str] = None,
-    ) -> PaginatedRowsResult: ...
+    ) -> PaginatedRowsResult:
+        """Get a paginated list of rows from a dataset.
+
+        :param dataset_id: The ID of the dataset to get the rows from.
+        :param rows_in_page: The number of rows to get per page.
+        :param page_token: The token to get the next page of rows.
+        :param filter_condition: (Optional) A condition to filter the rows by.
+        """
+        ...
 
     @webmethod(route="/datasetio/rows", method="POST")
     async def append_rows(self, dataset_id: str, rows: List[Dict[str, Any]]) -> None: ...
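To make the pagination contract concrete, here is a minimal sketch of walking a whole dataset with `get_rows_paginated`. The `datasetio_impl` object is hypothetical and stands in for any provider implementing the `DatasetIO` protocol above.

```python
# Sketch: fetch every row of a dataset by following next_page_token.
# `datasetio_impl` is a placeholder for an implementation of DatasetIO.
import asyncio
from typing import Any, Dict, List


async def fetch_all_rows(datasetio_impl, dataset_id: str) -> List[Dict[str, Any]]:
    rows: List[Dict[str, Any]] = []
    page_token = None
    while True:
        result = await datasetio_impl.get_rows_paginated(
            dataset_id=dataset_id,
            rows_in_page=100,       # page size
            page_token=page_token,  # None requests the first page
        )
        rows.extend(result.rows)
        # Stop once the dataset is exhausted or no continuation token is returned.
        if not result.next_page_token or len(rows) >= result.total_count:
            return rows
        page_token = result.next_page_token


# e.g. asyncio.run(fetch_all_rows(my_datasetio_impl, "my-dataset"))
```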
+ """ + generations: List[Dict[str, Any]] # each key in the dict is a scoring function name scores: Dict[str, ScoringResult] class Eval(Protocol): + """Llama Stack Evaluation API for running evaluations on model and agent candidates.""" + @webmethod(route="/eval/benchmarks/{benchmark_id}/jobs", method="POST") async def run_eval( self, benchmark_id: str, benchmark_config: BenchmarkConfig, - ) -> Job: ... + ) -> Job: + """Run an evaluation on a benchmark. + + :param benchmark_id: The ID of the benchmark to run the evaluation on. + :param benchmark_config: The configuration for the benchmark. + :return: The job that was created to run the evaluation. + """ @webmethod(route="/eval/benchmarks/{benchmark_id}/evaluations", method="POST") async def evaluate_rows( @@ -73,13 +106,40 @@ class Eval(Protocol): input_rows: List[Dict[str, Any]], scoring_functions: List[str], benchmark_config: BenchmarkConfig, - ) -> EvaluateResponse: ... + ) -> EvaluateResponse: + """Evaluate a list of rows on a benchmark. + + :param benchmark_id: The ID of the benchmark to run the evaluation on. + :param input_rows: The rows to evaluate. + :param scoring_functions: The scoring functions to use for the evaluation. + :param benchmark_config: The configuration for the benchmark. + :return: EvaluateResponse object containing generations and scores + """ @webmethod(route="/eval/benchmarks/{benchmark_id}/jobs/{job_id}", method="GET") - async def job_status(self, benchmark_id: str, job_id: str) -> Optional[JobStatus]: ... + async def job_status(self, benchmark_id: str, job_id: str) -> Optional[JobStatus]: + """Get the status of a job. + + :param benchmark_id: The ID of the benchmark to run the evaluation on. + :param job_id: The ID of the job to get the status of. + :return: The status of the evaluationjob. + """ + ... @webmethod(route="/eval/benchmarks/{benchmark_id}/jobs/{job_id}", method="DELETE") - async def job_cancel(self, benchmark_id: str, job_id: str) -> None: ... + async def job_cancel(self, benchmark_id: str, job_id: str) -> None: + """Cancel a job. + + :param benchmark_id: The ID of the benchmark to run the evaluation on. + :param job_id: The ID of the job to cancel. + """ + ... @webmethod(route="/eval/benchmarks/{benchmark_id}/jobs/{job_id}/result", method="GET") - async def job_result(self, benchmark_id: str, job_id: str) -> EvaluateResponse: ... + async def job_result(self, benchmark_id: str, job_id: str) -> EvaluateResponse: + """Get the result of a job. + + :param benchmark_id: The ID of the benchmark to run the evaluation on. + :param job_id: The ID of the job to get the result of. + :return: The result of the job. + """ diff --git a/llama_stack/apis/scoring/scoring.py b/llama_stack/apis/scoring/scoring.py index 960149476..54a9ac2aa 100644 --- a/llama_stack/apis/scoring/scoring.py +++ b/llama_stack/apis/scoring/scoring.py @@ -17,6 +17,13 @@ ScoringResultRow = Dict[str, Any] @json_schema_type class ScoringResult(BaseModel): + """ + A scoring result for a single row. + + :param score_rows: The scoring result for each row. Each row is a map of column name to value. + :param aggregated_results: Map of metric name to aggregated value + """ + score_rows: List[ScoringResultRow] # aggregated metrics to value aggregated_results: Dict[str, Any] @@ -30,6 +37,12 @@ class ScoreBatchResponse(BaseModel): @json_schema_type class ScoreResponse(BaseModel): + """ + The response from scoring. + + :param results: A map of scoring function name to ScoringResult. 
+ """ + # each key in the dict is a scoring function name results: Dict[str, ScoringResult] @@ -55,4 +68,11 @@ class Scoring(Protocol): self, input_rows: List[Dict[str, Any]], scoring_functions: Dict[str, Optional[ScoringFnParams]], - ) -> ScoreResponse: ... + ) -> ScoreResponse: + """Score a list of rows. + + :param input_rows: The rows to score. + :param scoring_functions: The scoring functions to use for the scoring. + :return: ScoreResponse object containing rows and aggregated results + """ + ...