From ec721b3867d664a486faebb6a2a2b7a77ecd0b71 Mon Sep 17 00:00:00 2001 From: Xi Yan Date: Wed, 12 Feb 2025 20:48:05 -0800 Subject: [PATCH] update --- docs/_static/llama-stack-spec.html | 619 +++++++++++++------- docs/_static/llama-stack-spec.yaml | 382 ++++++++---- llama_stack/apis/eval/eval.py | 25 + llama_stack/distribution/routers/routers.py | 3 +- 4 files changed, 695 insertions(+), 334 deletions(-) diff --git a/docs/_static/llama-stack-spec.html b/docs/_static/llama-stack-spec.html index c656808a6..652dae562 100644 --- a/docs/_static/llama-stack-spec.html +++ b/docs/_static/llama-stack-spec.html @@ -67,8 +67,8 @@ "description": "", "parameters": [ { - "name": "benchmark_id", - "in": "path", + "name": "task_id", + "in": "query", "required": true, "schema": { "type": "string" @@ -114,7 +114,7 @@ "content": { "application/json": { "schema": { - "$ref": "#/components/schemas/DeprecatedRegisterBenchmarkRequest" + "$ref": "#/components/schemas/DeprecatedRegisterEvalTaskRequest" } } }, @@ -613,7 +613,7 @@ } } }, - "/v1/eval/tasks/{benchmark_id}/evaluations": { + "/v1/eval/benchmarks/{benchmark_id}/evaluations": { "post": { "responses": { "200": { @@ -653,6 +653,47 @@ } } }, + "/v1/eval/tasks/{task_id}/evaluations": { + "post": { + "responses": { + "200": { + "description": "OK", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/EvaluateResponse" + } + } + } + } + }, + "tags": [ + "Eval" + ], + "description": "", + "parameters": [ + { + "name": "task_id", + "in": "path", + "required": true, + "schema": { + "type": "string" + } + } + ], + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/EvaluateRowsDeprecatedRequest" + } + } + }, + "required": true + }, + "deprecated": true + } + }, "/v1/agents/{agent_id}/session/{session_id}/turn/{turn_id}/step/{step_id}": { "get": { "responses": { @@ -753,6 +794,43 @@ ] } }, + "/v1/eval/benchmarks/{benchmark_id}": { + "get": { + "responses": { + "200": { + "description": "OK", + "content": { + "application/json": { + "schema": { + "oneOf": [ + { + "$ref": "#/components/schemas/Benchmark" + }, + { + "type": "null" + } + ] + } + } + } + } + }, + "tags": [ + "Benchmarks" + ], + "description": "", + "parameters": [ + { + "name": "benchmark_id", + "in": "path", + "required": true, + "schema": { + "type": "string" + } + } + ] + } + }, "/v1/datasets/{dataset_id}": { "get": { "responses": { @@ -811,43 +889,6 @@ ] } }, - "/v1/eval/tasks/{benchmark_id}": { - "get": { - "responses": { - "200": { - "description": "OK", - "content": { - "application/json": { - "schema": { - "oneOf": [ - { - "$ref": "#/components/schemas/Benchmark" - }, - { - "type": "null" - } - ] - } - } - } - } - }, - "tags": [ - "Benchmarks" - ], - "description": "", - "parameters": [ - { - "name": "benchmark_id", - "in": "path", - "required": true, - "schema": { - "type": "string" - } - } - ] - } - }, "/v1/models/{model_id}": { "get": { "responses": { @@ -1431,7 +1472,7 @@ } } }, - "/v1/eval/tasks/{benchmark_id}/jobs/{job_id}": { + "/v1/eval/benchmarks/{benchmark_id}/jobs/{job_id}": { "get": { "responses": { "200": { @@ -1505,7 +1546,83 @@ ] } }, - "/v1/eval/tasks/{benchmark_id}/jobs/{job_id}/result": { + "/v1/eval/tasks/{task_id}/jobs/{job_id}": { + "get": { + "responses": { + "200": { + "description": "OK", + "content": { + "application/json": { + "schema": { + "oneOf": [ + { + "$ref": "#/components/schemas/JobStatus" + }, + { + "type": "null" + } + ] + } + } + } + } + }, + "tags": [ + "Eval" + ], + "description": "", + "parameters": [ + { + "name": "task_id", + "in": "path", + "required": true, + "schema": { + "type": "string" + } + }, + { + "name": "job_id", + "in": "path", + "required": true, + "schema": { + "type": "string" + } + } + ], + "deprecated": true + }, + "delete": { + "responses": { + "200": { + "description": "OK" + } + }, + "tags": [ + "Eval" + ], + "description": "", + "parameters": [ + { + "name": "task_id", + "in": "path", + "required": true, + "schema": { + "type": "string" + } + }, + { + "name": "job_id", + "in": "path", + "required": true, + "schema": { + "type": "string" + } + } + ], + "deprecated": true + } + }, + "/v1/eval/benchmarks/{benchmark_id}/jobs/{job_id}/result": { "get": { "responses": { "200": { @@ -1525,7 +1642,7 @@ "description": "", "parameters": [ { - "name": "job_id", + "name": "benchmark_id", "in": "path", "required": true, "schema": { @@ -1533,7 +1650,7 @@ } }, { - "name": "benchmark_id", + "name": "job_id", "in": "path", "required": true, "schema": { @@ -1543,6 +1660,88 @@ ] } }, + "/v1/eval/tasks/{task_id}/jobs/{job_id}/result": { + "get": { + "responses": { + "200": { + "description": "OK", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/EvaluateResponse" + } + } + } + } + }, + "tags": [ + "Eval" + ], + "description": "", + "parameters": [ + { + "name": "task_id", + "in": "path", + "required": true, + "schema": { + "type": "string" + } + }, + { + "name": "job_id", + "in": "path", + "required": true, + "schema": { + "type": "string" + } + } + ], + "deprecated": true + } + }, + "/v1/eval/benchmarks": { + "get": { + "responses": { + "200": { + "description": "OK", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/ListBenchmarksResponse" + } + } + } + } + }, + "tags": [ + "Benchmarks" + ], + "description": "", + "parameters": [] + }, + "post": { + "responses": { + "200": { + "description": "OK" + } + }, + "tags": [ + "Benchmarks" + ], + "description": "", + "parameters": [], + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/RegisterBenchmarkRequest" + } + } + }, + "required": true + } + } + }, "/v1/datasets": { "get": { "responses": { @@ -1586,49 +1785,6 @@ } } }, - "/v1/eval/tasks": { - "get": { - "responses": { - "200": { - "description": "OK", - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/ListBenchmarksResponse" - } - } - } - } - }, - "tags": [ - "Benchmarks" - ], - "description": "", - "parameters": [] - }, - "post": { - "responses": { - "200": { - "description": "OK" - } - }, - "tags": [ - "Benchmarks" - ], - "description": "", - "parameters": [], - "requestBody": { - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/RegisterBenchmarkRequest" - } - } - }, - "required": true - } - } - }, "/v1/models": { "get": { "responses": { @@ -2204,7 +2360,7 @@ ] } }, - "/v1/eval/tasks/{benchmark_id}/jobs": { + "/v1/eval/benchmarks/{benchmark_id}/jobs": { "post": { "responses": { "200": { @@ -2244,6 +2400,47 @@ } } }, + "/v1/eval/tasks/{task_id}/jobs": { + "post": { + "responses": { + "200": { + "description": "OK", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/Job" + } + } + } + } + }, + "tags": [ + "Eval" + ], + "description": "", + "parameters": [ + { + "name": "task_id", + "in": "path", + "required": true, + "schema": { + "type": "string" + } + } + ], + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/RunEvalDeprecatedRequest" + } + } + }, + "required": true + }, + "deprecated": true + } + }, "/v1/safety/run-shield": { "post": { "responses": { @@ -2526,10 +2723,10 @@ "data" ] }, - "DeprecatedRegisterBenchmarkRequest": { + "DeprecatedRegisterEvalTaskRequest": { "type": "object", "properties": { - "benchmark_id": { + "task_id": { "type": "string" }, "dataset_id": { @@ -2575,7 +2772,7 @@ }, "additionalProperties": false, "required": [ - "benchmark_id", + "task_id", "dataset_id", "scoring_functions" ] @@ -4745,34 +4942,6 @@ "accuracy" ] }, - "AppBenchmarkConfig": { - "type": "object", - "properties": { - "type": { - "type": "string", - "const": "app", - "default": "app" - }, - "eval_candidate": { - "$ref": "#/components/schemas/EvalCandidate" - }, - "scoring_params": { - "type": "object", - "additionalProperties": { - "$ref": "#/components/schemas/ScoringFnParams" - } - }, - "num_examples": { - "type": "integer" - } - }, - "additionalProperties": false, - "required": [ - "type", - "eval_candidate", - "scoring_params" - ] - }, "BasicScoringFnParams": { "type": "object", "properties": { @@ -4793,25 +4962,26 @@ "type" ] }, - "BenchmarkBenchmarkConfig": { + "BenchmarkConfig": { "type": "object", "properties": { - "type": { - "type": "string", - "const": "benchmark", - "default": "benchmark" - }, "eval_candidate": { "$ref": "#/components/schemas/EvalCandidate" }, + "scoring_params": { + "type": "object", + "additionalProperties": { + "$ref": "#/components/schemas/ScoringFnParams" + } + }, "num_examples": { "type": "integer" } }, "additionalProperties": false, "required": [ - "type", - "eval_candidate" + "eval_candidate", + "scoring_params" ] }, "EvalCandidate": { @@ -4831,23 +5001,6 @@ } } }, - "BenchmarkConfig": { - "oneOf": [ - { - "$ref": "#/components/schemas/BenchmarkBenchmarkConfig" - }, - { - "$ref": "#/components/schemas/AppBenchmarkConfig" - } - ], - "discriminator": { - "propertyName": "type", - "mapping": { - "benchmark": "#/components/schemas/BenchmarkBenchmarkConfig", - "app": "#/components/schemas/AppBenchmarkConfig" - } - } - }, "LLMAsJudgeScoringFnParams": { "type": "object", "properties": { @@ -5108,6 +5261,54 @@ "aggregated_results" ] }, + "EvaluateRowsDeprecatedRequest": { + "type": "object", + "properties": { + "input_rows": { + "type": "array", + "items": { + "type": "object", + "additionalProperties": { + "oneOf": [ + { + "type": "null" + }, + { + "type": "boolean" + }, + { + "type": "number" + }, + { + "type": "string" + }, + { + "type": "array" + }, + { + "type": "object" + } + ] + } + } + }, + "scoring_functions": { + "type": "array", + "items": { + "type": "string" + } + }, + "task_config": { + "$ref": "#/components/schemas/BenchmarkConfig" + } + }, + "additionalProperties": false, + "required": [ + "input_rows", + "scoring_functions", + "task_config" + ] + }, "Session": { "type": "object", "properties": { @@ -7304,60 +7505,6 @@ "data" ] }, - "RegisterDatasetRequest": { - "type": "object", - "properties": { - "dataset_id": { - "type": "string" - }, - "dataset_schema": { - "type": "object", - "additionalProperties": { - "$ref": "#/components/schemas/ParamType" - } - }, - "url": { - "$ref": "#/components/schemas/URL" - }, - "provider_dataset_id": { - "type": "string" - }, - "provider_id": { - "type": "string" - }, - "metadata": { - "type": "object", - "additionalProperties": { - "oneOf": [ - { - "type": "null" - }, - { - "type": "boolean" - }, - { - "type": "number" - }, - { - "type": "string" - }, - { - "type": "array" - }, - { - "type": "object" - } - ] - } - } - }, - "additionalProperties": false, - "required": [ - "dataset_id", - "dataset_schema", - "url" - ] - }, "RegisterBenchmarkRequest": { "type": "object", "properties": { @@ -7412,6 +7559,60 @@ "scoring_functions" ] }, + "RegisterDatasetRequest": { + "type": "object", + "properties": { + "dataset_id": { + "type": "string" + }, + "dataset_schema": { + "type": "object", + "additionalProperties": { + "$ref": "#/components/schemas/ParamType" + } + }, + "url": { + "$ref": "#/components/schemas/URL" + }, + "provider_dataset_id": { + "type": "string" + }, + "provider_id": { + "type": "string" + }, + "metadata": { + "type": "object", + "additionalProperties": { + "oneOf": [ + { + "type": "null" + }, + { + "type": "boolean" + }, + { + "type": "number" + }, + { + "type": "string" + }, + { + "type": "array" + }, + { + "type": "object" + } + ] + } + } + }, + "additionalProperties": false, + "required": [ + "dataset_id", + "dataset_schema", + "url" + ] + }, "RegisterModelRequest": { "type": "object", "properties": { @@ -7623,6 +7824,18 @@ "job_id" ] }, + "RunEvalDeprecatedRequest": { + "type": "object", + "properties": { + "task_config": { + "$ref": "#/components/schemas/BenchmarkConfig" + } + }, + "additionalProperties": false, + "required": [ + "task_config" + ] + }, "RunShieldRequest": { "type": "object", "properties": { @@ -8105,6 +8318,9 @@ { "name": "BatchInference (Coming Soon)" }, + { + "name": "Benchmarks" + }, { "name": "DatasetIO" }, @@ -8114,9 +8330,6 @@ { "name": "Eval" }, - { - "name": "Benchmarks" - }, { "name": "Inference", "description": "This API provides the raw interface to the underlying models. Two kinds of models are supported:\n- LLM models: these models generate \"raw\" and \"chat\" (conversational) completions.\n- Embedding models: these models generate embeddings to be used for semantic search.", @@ -8168,10 +8381,10 @@ "tags": [ "Agents", "BatchInference (Coming Soon)", + "Benchmarks", "DatasetIO", "Datasets", "Eval", - "Benchmarks", "Inference", "Inspect", "Models", diff --git a/docs/_static/llama-stack-spec.yaml b/docs/_static/llama-stack-spec.yaml index 0f0a613a8..89e066917 100644 --- a/docs/_static/llama-stack-spec.yaml +++ b/docs/_static/llama-stack-spec.yaml @@ -25,8 +25,8 @@ paths: - Benchmarks description: '' parameters: - - name: benchmark_id - in: path + - name: task_id + in: query required: true schema: type: string @@ -57,7 +57,7 @@ paths: content: application/json: schema: - $ref: '#/components/schemas/DeprecatedRegisterBenchmarkRequest' + $ref: '#/components/schemas/DeprecatedRegisterEvalTaskRequest' required: true deprecated: true /v1/datasetio/rows: @@ -372,7 +372,7 @@ paths: schema: $ref: '#/components/schemas/EmbeddingsRequest' required: true - /v1/eval/tasks/{benchmark_id}/evaluations: + /v1/eval/benchmarks/{benchmark_id}/evaluations: post: responses: '200': @@ -396,6 +396,31 @@ paths: schema: $ref: '#/components/schemas/EvaluateRowsRequest' required: true + /v1/eval/tasks/{task_id}/evaluations: + post: + responses: + '200': + description: OK + content: + application/json: + schema: + $ref: '#/components/schemas/EvaluateResponse' + tags: + - Eval + description: '' + parameters: + - name: task_id + in: path + required: true + schema: + type: string + requestBody: + content: + application/json: + schema: + $ref: '#/components/schemas/EvaluateRowsDeprecatedRequest' + required: true + deprecated: true /v1/agents/{agent_id}/session/{session_id}/turn/{turn_id}/step/{step_id}: get: responses: @@ -457,6 +482,26 @@ paths: required: true schema: type: string + /v1/eval/benchmarks/{benchmark_id}: + get: + responses: + '200': + description: OK + content: + application/json: + schema: + oneOf: + - $ref: '#/components/schemas/Benchmark' + - type: 'null' + tags: + - Benchmarks + description: '' + parameters: + - name: benchmark_id + in: path + required: true + schema: + type: string /v1/datasets/{dataset_id}: get: responses: @@ -490,26 +535,6 @@ paths: required: true schema: type: string - /v1/eval/tasks/{benchmark_id}: - get: - responses: - '200': - description: OK - content: - application/json: - schema: - oneOf: - - $ref: '#/components/schemas/Benchmark' - - type: 'null' - tags: - - Benchmarks - description: '' - parameters: - - name: benchmark_id - in: path - required: true - schema: - type: string /v1/models/{model_id}: get: responses: @@ -852,7 +877,7 @@ paths: schema: $ref: '#/components/schemas/InvokeToolRequest' required: true - /v1/eval/tasks/{benchmark_id}/jobs/{job_id}: + /v1/eval/benchmarks/{benchmark_id}/jobs/{job_id}: get: responses: '200': @@ -895,7 +920,52 @@ paths: required: true schema: type: string - /v1/eval/tasks/{benchmark_id}/jobs/{job_id}/result: + /v1/eval/tasks/{task_id}/jobs/{job_id}: + get: + responses: + '200': + description: OK + content: + application/json: + schema: + oneOf: + - $ref: '#/components/schemas/JobStatus' + - type: 'null' + tags: + - Eval + description: '' + parameters: + - name: task_id + in: path + required: true + schema: + type: string + - name: job_id + in: path + required: true + schema: + type: string + deprecated: true + delete: + responses: + '200': + description: OK + tags: + - Eval + description: '' + parameters: + - name: task_id + in: path + required: true + schema: + type: string + - name: job_id + in: path + required: true + schema: + type: string + deprecated: true + /v1/eval/benchmarks/{benchmark_id}/jobs/{job_id}/result: get: responses: '200': @@ -908,16 +978,67 @@ paths: - Eval description: '' parameters: - - name: job_id - in: path - required: true - schema: - type: string - name: benchmark_id in: path required: true schema: type: string + - name: job_id + in: path + required: true + schema: + type: string + /v1/eval/tasks/{task_id}/jobs/{job_id}/result: + get: + responses: + '200': + description: OK + content: + application/json: + schema: + $ref: '#/components/schemas/EvaluateResponse' + tags: + - Eval + description: '' + parameters: + - name: task_id + in: path + required: true + schema: + type: string + - name: job_id + in: path + required: true + schema: + type: string + deprecated: true + /v1/eval/benchmarks: + get: + responses: + '200': + description: OK + content: + application/json: + schema: + $ref: '#/components/schemas/ListBenchmarksResponse' + tags: + - Benchmarks + description: '' + parameters: [] + post: + responses: + '200': + description: OK + tags: + - Benchmarks + description: '' + parameters: [] + requestBody: + content: + application/json: + schema: + $ref: '#/components/schemas/RegisterBenchmarkRequest' + required: true /v1/datasets: get: responses: @@ -945,33 +1066,6 @@ paths: schema: $ref: '#/components/schemas/RegisterDatasetRequest' required: true - /v1/eval/tasks: - get: - responses: - '200': - description: OK - content: - application/json: - schema: - $ref: '#/components/schemas/ListBenchmarksResponse' - tags: - - Benchmarks - description: '' - parameters: [] - post: - responses: - '200': - description: OK - tags: - - Benchmarks - description: '' - parameters: [] - requestBody: - content: - application/json: - schema: - $ref: '#/components/schemas/RegisterBenchmarkRequest' - required: true /v1/models: get: responses: @@ -1328,7 +1422,7 @@ paths: type: array items: type: string - /v1/eval/tasks/{benchmark_id}/jobs: + /v1/eval/benchmarks/{benchmark_id}/jobs: post: responses: '200': @@ -1352,6 +1446,31 @@ paths: schema: $ref: '#/components/schemas/RunEvalRequest' required: true + /v1/eval/tasks/{task_id}/jobs: + post: + responses: + '200': + description: OK + content: + application/json: + schema: + $ref: '#/components/schemas/Job' + tags: + - Eval + description: '' + parameters: + - name: task_id + in: path + required: true + schema: + type: string + requestBody: + content: + application/json: + schema: + $ref: '#/components/schemas/RunEvalDeprecatedRequest' + required: true + deprecated: true /v1/safety/run-shield: post: responses: @@ -1527,10 +1646,10 @@ components: additionalProperties: false required: - data - DeprecatedRegisterBenchmarkRequest: + DeprecatedRegisterEvalTaskRequest: type: object properties: - benchmark_id: + task_id: type: string dataset_id: type: string @@ -1554,7 +1673,7 @@ components: - type: object additionalProperties: false required: - - benchmark_id + - task_id - dataset_id - scoring_functions AppendRowsRequest: @@ -3063,26 +3182,6 @@ components: - median - categorical_count - accuracy - AppBenchmarkConfig: - type: object - properties: - type: - type: string - const: app - default: app - eval_candidate: - $ref: '#/components/schemas/EvalCandidate' - scoring_params: - type: object - additionalProperties: - $ref: '#/components/schemas/ScoringFnParams' - num_examples: - type: integer - additionalProperties: false - required: - - type - - eval_candidate - - scoring_params BasicScoringFnParams: type: object properties: @@ -3097,21 +3196,21 @@ components: additionalProperties: false required: - type - BenchmarkBenchmarkConfig: + BenchmarkConfig: type: object properties: - type: - type: string - const: benchmark - default: benchmark eval_candidate: $ref: '#/components/schemas/EvalCandidate' + scoring_params: + type: object + additionalProperties: + $ref: '#/components/schemas/ScoringFnParams' num_examples: type: integer additionalProperties: false required: - - type - eval_candidate + - scoring_params EvalCandidate: oneOf: - $ref: '#/components/schemas/ModelCandidate' @@ -3121,15 +3220,6 @@ components: mapping: model: '#/components/schemas/ModelCandidate' agent: '#/components/schemas/AgentCandidate' - BenchmarkConfig: - oneOf: - - $ref: '#/components/schemas/BenchmarkBenchmarkConfig' - - $ref: '#/components/schemas/AppBenchmarkConfig' - discriminator: - propertyName: type - mapping: - benchmark: '#/components/schemas/BenchmarkBenchmarkConfig' - app: '#/components/schemas/AppBenchmarkConfig' LLMAsJudgeScoringFnParams: type: object properties: @@ -3278,6 +3368,32 @@ components: required: - score_rows - aggregated_results + EvaluateRowsDeprecatedRequest: + type: object + properties: + input_rows: + type: array + items: + type: object + additionalProperties: + oneOf: + - type: 'null' + - type: boolean + - type: number + - type: string + - type: array + - type: object + scoring_functions: + type: array + items: + type: string + task_config: + $ref: '#/components/schemas/BenchmarkConfig' + additionalProperties: false + required: + - input_rows + - scoring_functions + - task_config Session: type: object properties: @@ -4645,36 +4761,6 @@ components: additionalProperties: false required: - data - RegisterDatasetRequest: - type: object - properties: - dataset_id: - type: string - dataset_schema: - type: object - additionalProperties: - $ref: '#/components/schemas/ParamType' - url: - $ref: '#/components/schemas/URL' - provider_dataset_id: - type: string - provider_id: - type: string - metadata: - type: object - additionalProperties: - oneOf: - - type: 'null' - - type: boolean - - type: number - - type: string - - type: array - - type: object - additionalProperties: false - required: - - dataset_id - - dataset_schema - - url RegisterBenchmarkRequest: type: object properties: @@ -4705,6 +4791,36 @@ components: - benchmark_id - dataset_id - scoring_functions + RegisterDatasetRequest: + type: object + properties: + dataset_id: + type: string + dataset_schema: + type: object + additionalProperties: + $ref: '#/components/schemas/ParamType' + url: + $ref: '#/components/schemas/URL' + provider_dataset_id: + type: string + provider_id: + type: string + metadata: + type: object + additionalProperties: + oneOf: + - type: 'null' + - type: boolean + - type: number + - type: string + - type: array + - type: object + additionalProperties: false + required: + - dataset_id + - dataset_schema + - url RegisterModelRequest: type: object properties: @@ -4827,6 +4943,14 @@ components: additionalProperties: false required: - job_id + RunEvalDeprecatedRequest: + type: object + properties: + task_config: + $ref: '#/components/schemas/BenchmarkConfig' + additionalProperties: false + required: + - task_config RunShieldRequest: type: object properties: @@ -5125,10 +5249,10 @@ tags: x-displayName: >- Agents API for creating and interacting with agentic systems. - name: BatchInference (Coming Soon) + - name: Benchmarks - name: DatasetIO - name: Datasets - name: Eval - - name: Benchmarks - name: Inference description: >- This API provides the raw interface to the underlying models. Two kinds of models @@ -5159,10 +5283,10 @@ x-tagGroups: tags: - Agents - BatchInference (Coming Soon) + - Benchmarks - DatasetIO - Datasets - Eval - - Benchmarks - Inference - Inspect - Models diff --git a/llama_stack/apis/eval/eval.py b/llama_stack/apis/eval/eval.py index 90b14131f..b805e4976 100644 --- a/llama_stack/apis/eval/eval.py +++ b/llama_stack/apis/eval/eval.py @@ -83,3 +83,28 @@ class Eval(Protocol): @webmethod(route="/eval/benchmarks/{benchmark_id}/jobs/{job_id}/result", method="GET") async def job_result(self, benchmark_id: str, job_id: str) -> EvaluateResponse: ... + + @webmethod(route="/eval/tasks/{task_id}/jobs", method="POST") + async def run_eval_DEPRECATED( + self, + task_id: str, + task_config: BenchmarkConfig, + ) -> Job: ... + + @webmethod(route="/eval/tasks/{task_id}/evaluations", method="POST") + async def evaluate_rows_DEPRECATED( + self, + task_id: str, + input_rows: List[Dict[str, Any]], + scoring_functions: List[str], + task_config: BenchmarkConfig, + ) -> EvaluateResponse: ... + + @webmethod(route="/eval/tasks/{task_id}/jobs/{job_id}", method="GET") + async def job_status_DEPRECATED(self, task_id: str, job_id: str) -> Optional[JobStatus]: ... + + @webmethod(route="/eval/tasks/{task_id}/jobs/{job_id}", method="DELETE") + async def job_cancel_DEPRECATED(self, task_id: str, job_id: str) -> None: ... + + @webmethod(route="/eval/tasks/{task_id}/jobs/{job_id}/result", method="GET") + async def job_result_DEPRECATED(self, task_id: str, job_id: str) -> EvaluateResponse: ... diff --git a/llama_stack/distribution/routers/routers.py b/llama_stack/distribution/routers/routers.py index f9f306767..9945ad367 100644 --- a/llama_stack/distribution/routers/routers.py +++ b/llama_stack/distribution/routers/routers.py @@ -9,7 +9,6 @@ from typing import Any, AsyncGenerator, Dict, List, Optional from llama_stack.apis.common.content_types import InterleavedContent, URL from llama_stack.apis.datasetio import DatasetIO, PaginatedRowsResult from llama_stack.apis.eval import ( - AppBenchmarkConfig, BenchmarkConfig, Eval, EvaluateResponse, @@ -348,7 +347,7 @@ class EvalRouter(Eval): async def run_eval( self, benchmark_id: str, - task_config: AppBenchmarkConfig, + task_config: BenchmarkConfig, ) -> Job: return await self.routing_table.get_provider_impl(benchmark_id).run_eval( benchmark_id=benchmark_id,