diff --git a/docs/openapi_generator/generate.py b/docs/openapi_generator/generate.py index 871c01a80..f9f56119b 100644 --- a/docs/openapi_generator/generate.py +++ b/docs/openapi_generator/generate.py @@ -33,14 +33,16 @@ schema_utils.json_schema_type = json_schema_type from llama_models.llama3.api.datatypes import * # noqa: F403 from llama_stack.apis.agents import * # noqa: F403 -from llama_stack.apis.dataset import * # noqa: F403 -from llama_stack.apis.evals import * # noqa: F403 +from llama_stack.apis.datasets import * # noqa: F403 +from llama_stack.apis.datasetio import * # noqa: F403 +from llama_stack.apis.scoring import * # noqa: F403 +from llama_stack.apis.scoring_functions import * # noqa: F403 +from llama_stack.apis.eval import * # noqa: F403 from llama_stack.apis.inference import * # noqa: F403 from llama_stack.apis.batch_inference import * # noqa: F403 from llama_stack.apis.memory import * # noqa: F403 from llama_stack.apis.telemetry import * # noqa: F403 from llama_stack.apis.post_training import * # noqa: F403 -from llama_stack.apis.reward_scoring import * # noqa: F403 from llama_stack.apis.synthetic_data_generation import * # noqa: F403 from llama_stack.apis.safety import * # noqa: F403 from llama_stack.apis.models import * # noqa: F403 @@ -54,14 +56,16 @@ class LlamaStack( Inference, BatchInference, Agents, - RewardScoring, Safety, SyntheticDataGeneration, Datasets, Telemetry, PostTraining, Memory, - Evaluations, + Eval, + Scoring, + ScoringFunctions, + DatasetIO, Models, Shields, Inspect, diff --git a/docs/resources/llama-stack-spec.html b/docs/resources/llama-stack-spec.html index 8e6683931..886634fba 100644 --- a/docs/resources/llama-stack-spec.html +++ b/docs/resources/llama-stack-spec.html @@ -21,7 +21,7 @@ "info": { "title": "[DRAFT] Llama Stack Specification", "version": "0.0.1", - "description": "This is the specification of the llama stack that provides\n a set of endpoints and their corresponding interfaces that are tailored to\n best 
leverage Llama Models. The specification is still in draft and subject to change.\n Generated at 2024-10-18 20:48:17.730988" + "description": "This is the specification of the llama stack that provides\n a set of endpoints and their corresponding interfaces that are tailored to\n best leverage Llama Models. The specification is still in draft and subject to change.\n Generated at 2024-10-24 17:40:59.576117" }, "servers": [ { @@ -109,39 +109,6 @@ } } }, - "/evaluate/job/cancel": { - "post": { - "responses": { - "200": { - "description": "OK" - } - }, - "tags": [ - "Evaluations" - ], - "parameters": [ - { - "name": "X-LlamaStack-ProviderData", - "in": "header", - "description": "JSON-encoded provider data which will be made available to the adapter servicing the API", - "required": false, - "schema": { - "type": "string" - } - } - ], - "requestBody": { - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/CancelEvaluationJobRequest" - } - } - }, - "required": true - } - } - }, "/post_training/job/cancel": { "post": { "responses": { @@ -389,39 +356,6 @@ } } }, - "/datasets/create": { - "post": { - "responses": { - "200": { - "description": "OK" - } - }, - "tags": [ - "Datasets" - ], - "parameters": [ - { - "name": "X-LlamaStack-ProviderData", - "in": "header", - "description": "JSON-encoded provider data which will be made available to the adapter servicing the API", - "required": false, - "schema": { - "type": "string" - } - } - ], - "requestBody": { - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/CreateDatasetRequest" - } - } - }, - "required": true - } - } - }, "/agents/delete": { "post": { "responses": { @@ -488,39 +422,6 @@ } } }, - "/datasets/delete": { - "post": { - "responses": { - "200": { - "description": "OK" - } - }, - "tags": [ - "Datasets" - ], - "parameters": [ - { - "name": "X-LlamaStack-ProviderData", - "in": "header", - "description": "JSON-encoded provider data which will be made 
available to the adapter servicing the API", - "required": false, - "schema": { - "type": "string" - } - } - ], - "requestBody": { - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/DeleteDatasetRequest" - } - } - }, - "required": true - } - } - }, "/inference/embeddings": { "post": { "responses": { @@ -561,7 +462,7 @@ } } }, - "/evaluate/question_answering/": { + "/eval/evaluate": { "post": { "responses": { "200": { @@ -569,14 +470,14 @@ "content": { "application/json": { "schema": { - "$ref": "#/components/schemas/EvaluationJob" + "$ref": "#/components/schemas/EvaluateResponse" } } } } }, "tags": [ - "Evaluations" + "Eval" ], "parameters": [ { @@ -593,7 +494,7 @@ "content": { "application/json": { "schema": { - "$ref": "#/components/schemas/EvaluateQuestionAnsweringRequest" + "$ref": "#/components/schemas/EvaluateRequest" } } }, @@ -601,7 +502,7 @@ } } }, - "/evaluate/summarization/": { + "/eval/evaluate_batch": { "post": { "responses": { "200": { @@ -609,14 +510,14 @@ "content": { "application/json": { "schema": { - "$ref": "#/components/schemas/EvaluationJob" + "$ref": "#/components/schemas/Job" } } } } }, "tags": [ - "Evaluations" + "Eval" ], "parameters": [ { @@ -633,47 +534,7 @@ "content": { "application/json": { "schema": { - "$ref": "#/components/schemas/EvaluateSummarizationRequest" - } - } - }, - "required": true - } - } - }, - "/evaluate/text_generation/": { - "post": { - "responses": { - "200": { - "description": "OK", - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/EvaluationJob" - } - } - } - } - }, - "tags": [ - "Evaluations" - ], - "parameters": [ - { - "name": "X-LlamaStack-ProviderData", - "in": "header", - "description": "JSON-encoded provider data which will be made available to the adapter servicing the API", - "required": false, - "schema": { - "type": "string" - } - } - ], - "requestBody": { - "content": { - "application/json": { - "schema": { - "$ref": 
"#/components/schemas/EvaluateTextGenerationRequest" + "$ref": "#/components/schemas/EvaluateBatchRequest" } } }, @@ -763,6 +624,14 @@ "type": "string" } }, + { + "name": "session_id", + "in": "query", + "required": true, + "schema": { + "type": "string" + } + }, { "name": "turn_id", "in": "query", @@ -817,6 +686,14 @@ "type": "string" } }, + { + "name": "session_id", + "in": "query", + "required": true, + "schema": { + "type": "string" + } + }, { "name": "turn_id", "in": "query", @@ -845,7 +722,14 @@ "content": { "application/json": { "schema": { - "$ref": "#/components/schemas/TrainEvalDataset" + "oneOf": [ + { + "$ref": "#/components/schemas/DatasetDefWithProvider" + }, + { + "type": "null" + } + ] } } } @@ -856,7 +740,7 @@ ], "parameters": [ { - "name": "dataset_uuid", + "name": "dataset_identifier", "in": "query", "required": true, "schema": { @@ -875,150 +759,6 @@ ] } }, - "/evaluate/job/artifacts": { - "get": { - "responses": { - "200": { - "description": "OK", - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/EvaluationJobArtifactsResponse" - } - } - } - } - }, - "tags": [ - "Evaluations" - ], - "parameters": [ - { - "name": "job_uuid", - "in": "query", - "required": true, - "schema": { - "type": "string" - } - }, - { - "name": "X-LlamaStack-ProviderData", - "in": "header", - "description": "JSON-encoded provider data which will be made available to the adapter servicing the API", - "required": false, - "schema": { - "type": "string" - } - } - ] - } - }, - "/evaluate/job/logs": { - "get": { - "responses": { - "200": { - "description": "OK", - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/EvaluationJobLogStream" - } - } - } - } - }, - "tags": [ - "Evaluations" - ], - "parameters": [ - { - "name": "job_uuid", - "in": "query", - "required": true, - "schema": { - "type": "string" - } - }, - { - "name": "X-LlamaStack-ProviderData", - "in": "header", - "description": "JSON-encoded provider 
data which will be made available to the adapter servicing the API", - "required": false, - "schema": { - "type": "string" - } - } - ] - } - }, - "/evaluate/job/status": { - "get": { - "responses": { - "200": { - "description": "OK", - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/EvaluationJobStatusResponse" - } - } - } - } - }, - "tags": [ - "Evaluations" - ], - "parameters": [ - { - "name": "job_uuid", - "in": "query", - "required": true, - "schema": { - "type": "string" - } - }, - { - "name": "X-LlamaStack-ProviderData", - "in": "header", - "description": "JSON-encoded provider data which will be made available to the adapter servicing the API", - "required": false, - "schema": { - "type": "string" - } - } - ] - } - }, - "/evaluate/jobs": { - "get": { - "responses": { - "200": { - "description": "OK", - "content": { - "application/jsonl": { - "schema": { - "$ref": "#/components/schemas/EvaluationJob" - } - } - } - } - }, - "tags": [ - "Evaluations" - ], - "parameters": [ - { - "name": "X-LlamaStack-ProviderData", - "in": "header", - "description": "JSON-encoded provider data which will be made available to the adapter servicing the API", - "required": false, - "schema": { - "type": "string" - } - } - ] - } - }, "/memory_banks/get": { "get": { "responses": { @@ -1122,6 +862,113 @@ ] } }, + "/datasetio/get_rows_paginated": { + "get": { + "responses": { + "200": { + "description": "OK", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/PaginatedRowsResult" + } + } + } + } + }, + "tags": [ + "DatasetIO" + ], + "parameters": [ + { + "name": "dataset_id", + "in": "query", + "required": true, + "schema": { + "type": "string" + } + }, + { + "name": "rows_in_page", + "in": "query", + "required": true, + "schema": { + "type": "integer" + } + }, + { + "name": "page_token", + "in": "query", + "required": false, + "schema": { + "type": "string" + } + }, + { + "name": "filter_condition", + "in": "query", 
+ "required": false, + "schema": { + "type": "string" + } + }, + { + "name": "X-LlamaStack-ProviderData", + "in": "header", + "description": "JSON-encoded provider data which will be made available to the adapter servicing the API", + "required": false, + "schema": { + "type": "string" + } + } + ] + } + }, + "/scoring_functions/get": { + "get": { + "responses": { + "200": { + "description": "OK", + "content": { + "application/json": { + "schema": { + "oneOf": [ + { + "$ref": "#/components/schemas/ScoringFunctionDefWithProvider" + }, + { + "type": "null" + } + ] + } + } + } + } + }, + "tags": [ + "ScoringFunctions" + ], + "parameters": [ + { + "name": "name", + "in": "query", + "required": true, + "schema": { + "type": "string" + } + }, + { + "name": "X-LlamaStack-ProviderData", + "in": "header", + "description": "JSON-encoded provider data which will be made available to the adapter servicing the API", + "required": false, + "schema": { + "type": "string" + } + } + ] + } + }, "/shields/get": { "get": { "responses": { @@ -1412,6 +1259,152 @@ } } }, + "/eval/job/cancel": { + "post": { + "responses": { + "200": { + "description": "OK" + } + }, + "tags": [ + "Eval" + ], + "parameters": [ + { + "name": "X-LlamaStack-ProviderData", + "in": "header", + "description": "JSON-encoded provider data which will be made available to the adapter servicing the API", + "required": false, + "schema": { + "type": "string" + } + } + ], + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/JobCancelRequest" + } + } + }, + "required": true + } + } + }, + "/eval/job/result": { + "get": { + "responses": { + "200": { + "description": "OK", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/EvaluateResponse" + } + } + } + } + }, + "tags": [ + "Eval" + ], + "parameters": [ + { + "name": "job_id", + "in": "query", + "required": true, + "schema": { + "type": "string" + } + }, + { + "name": 
"X-LlamaStack-ProviderData", + "in": "header", + "description": "JSON-encoded provider data which will be made available to the adapter servicing the API", + "required": false, + "schema": { + "type": "string" + } + } + ] + } + }, + "/eval/job/status": { + "get": { + "responses": { + "200": { + "description": "OK", + "content": { + "application/json": { + "schema": { + "oneOf": [ + { + "$ref": "#/components/schemas/JobStatus" + }, + { + "type": "null" + } + ] + } + } + } + } + }, + "tags": [ + "Eval" + ], + "parameters": [ + { + "name": "job_id", + "in": "query", + "required": true, + "schema": { + "type": "string" + } + }, + { + "name": "X-LlamaStack-ProviderData", + "in": "header", + "description": "JSON-encoded provider data which will be made available to the adapter servicing the API", + "required": false, + "schema": { + "type": "string" + } + } + ] + } + }, + "/datasets/list": { + "get": { + "responses": { + "200": { + "description": "OK", + "content": { + "application/jsonl": { + "schema": { + "$ref": "#/components/schemas/DatasetDefWithProvider" + } + } + } + } + }, + "tags": [ + "Datasets" + ], + "parameters": [ + { + "name": "X-LlamaStack-ProviderData", + "in": "header", + "description": "JSON-encoded provider data which will be made available to the adapter servicing the API", + "required": false, + "schema": { + "type": "string" + } + } + ] + } + }, "/memory_banks/list": { "get": { "responses": { @@ -1554,6 +1547,36 @@ ] } }, + "/scoring_functions/list": { + "get": { + "responses": { + "200": { + "description": "OK", + "content": { + "application/jsonl": { + "schema": { + "$ref": "#/components/schemas/ScoringFunctionDefWithProvider" + } + } + } + } + }, + "tags": [ + "ScoringFunctions" + ], + "parameters": [ + { + "name": "X-LlamaStack-ProviderData", + "in": "header", + "description": "JSON-encoded provider data which will be made available to the adapter servicing the API", + "required": false, + "schema": { + "type": "string" + } + } + ] + } + }, 
"/shields/list": { "get": { "responses": { @@ -1697,6 +1720,39 @@ } } }, + "/datasets/register": { + "post": { + "responses": { + "200": { + "description": "OK" + } + }, + "tags": [ + "Datasets" + ], + "parameters": [ + { + "name": "X-LlamaStack-ProviderData", + "in": "header", + "description": "JSON-encoded provider data which will be made available to the adapter servicing the API", + "required": false, + "schema": { + "type": "string" + } + } + ], + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/RegisterDatasetRequest" + } + } + }, + "required": true + } + } + }, "/memory_banks/register": { "post": { "responses": { @@ -1763,6 +1819,39 @@ } } }, + "/scoring_functions/register": { + "post": { + "responses": { + "200": { + "description": "OK" + } + }, + "tags": [ + "ScoringFunctions" + ], + "parameters": [ + { + "name": "X-LlamaStack-ProviderData", + "in": "header", + "description": "JSON-encoded provider data which will be made available to the adapter servicing the API", + "required": false, + "schema": { + "type": "string" + } + } + ], + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/RegisterScoringFunctionRequest" + } + } + }, + "required": true + } + } + }, "/shields/register": { "post": { "responses": { @@ -1796,46 +1885,6 @@ } } }, - "/reward_scoring/score": { - "post": { - "responses": { - "200": { - "description": "OK", - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/RewardScoringResponse" - } - } - } - } - }, - "tags": [ - "RewardScoring" - ], - "parameters": [ - { - "name": "X-LlamaStack-ProviderData", - "in": "header", - "description": "JSON-encoded provider data which will be made available to the adapter servicing the API", - "required": false, - "schema": { - "type": "string" - } - } - ], - "requestBody": { - "content": { - "application/json": { - "schema": { - "$ref": 
"#/components/schemas/RewardScoreRequest" - } - } - }, - "required": true - } - } - }, "/safety/run_shield": { "post": { "responses": { @@ -1876,6 +1925,86 @@ } } }, + "/scoring/score": { + "post": { + "responses": { + "200": { + "description": "OK", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/ScoreResponse" + } + } + } + } + }, + "tags": [ + "Scoring" + ], + "parameters": [ + { + "name": "X-LlamaStack-ProviderData", + "in": "header", + "description": "JSON-encoded provider data which will be made available to the adapter servicing the API", + "required": false, + "schema": { + "type": "string" + } + } + ], + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/ScoreRequest" + } + } + }, + "required": true + } + } + }, + "/scoring/score_batch": { + "post": { + "responses": { + "200": { + "description": "OK", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/ScoreBatchResponse" + } + } + } + } + }, + "tags": [ + "Scoring" + ], + "parameters": [ + { + "name": "X-LlamaStack-ProviderData", + "in": "header", + "description": "JSON-encoded provider data which will be made available to the adapter servicing the API", + "required": false, + "schema": { + "type": "string" + } + } + ], + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/ScoreBatchRequest" + } + } + }, + "required": true + } + } + }, "/post_training/supervised_fine_tune": { "post": { "responses": { @@ -2571,18 +2700,6 @@ "completion_message_batch" ] }, - "CancelEvaluationJobRequest": { - "type": "object", - "properties": { - "job_uuid": { - "type": "string" - } - }, - "additionalProperties": false, - "required": [ - "job_uuid" - ] - }, "CancelTrainingJobRequest": { "type": "object", "properties": { @@ -2635,6 +2752,90 @@ "tool_prompt_format": { "$ref": "#/components/schemas/ToolPromptFormat" }, + "response_format": { + "oneOf": [ + { + 
"type": "object", + "properties": { + "type": { + "type": "string", + "const": "json_schema", + "default": "json_schema" + }, + "schema": { + "type": "object", + "additionalProperties": { + "oneOf": [ + { + "type": "null" + }, + { + "type": "boolean" + }, + { + "type": "number" + }, + { + "type": "string" + }, + { + "type": "array" + }, + { + "type": "object" + } + ] + } + } + }, + "additionalProperties": false, + "required": [ + "type", + "schema" + ] + }, + { + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "grammar", + "default": "grammar" + }, + "bnf": { + "type": "object", + "additionalProperties": { + "oneOf": [ + { + "type": "null" + }, + { + "type": "boolean" + }, + { + "type": "number" + }, + { + "type": "string" + }, + { + "type": "array" + }, + { + "type": "object" + } + ] + } + } + }, + "additionalProperties": false, + "required": [ + "type", + "bnf" + ] + } + ] + }, "stream": { "type": "boolean" }, @@ -2807,6 +3008,90 @@ "sampling_params": { "$ref": "#/components/schemas/SamplingParams" }, + "response_format": { + "oneOf": [ + { + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "json_schema", + "default": "json_schema" + }, + "schema": { + "type": "object", + "additionalProperties": { + "oneOf": [ + { + "type": "null" + }, + { + "type": "boolean" + }, + { + "type": "number" + }, + { + "type": "string" + }, + { + "type": "array" + }, + { + "type": "object" + } + ] + } + } + }, + "additionalProperties": false, + "required": [ + "type", + "schema" + ] + }, + { + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "grammar", + "default": "grammar" + }, + "bnf": { + "type": "object", + "additionalProperties": { + "oneOf": [ + { + "type": "null" + }, + { + "type": "boolean" + }, + { + "type": "number" + }, + { + "type": "string" + }, + { + "type": "array" + }, + { + "type": "object" + } + ] + } + } + }, + "additionalProperties": false, + "required": [ + "type", + "bnf" 
+ ] + } + ] + }, "stream": { "type": "boolean" }, @@ -4094,77 +4379,6 @@ "error" ] }, - "TrainEvalDataset": { - "type": "object", - "properties": { - "columns": { - "type": "object", - "additionalProperties": { - "$ref": "#/components/schemas/TrainEvalDatasetColumnType" - } - }, - "content_url": { - "$ref": "#/components/schemas/URL" - }, - "metadata": { - "type": "object", - "additionalProperties": { - "oneOf": [ - { - "type": "null" - }, - { - "type": "boolean" - }, - { - "type": "number" - }, - { - "type": "string" - }, - { - "type": "array" - }, - { - "type": "object" - } - ] - } - } - }, - "additionalProperties": false, - "required": [ - "columns", - "content_url" - ], - "title": "Dataset to be used for training or evaluating language models." - }, - "TrainEvalDatasetColumnType": { - "type": "string", - "enum": [ - "dialog", - "text", - "media", - "number", - "json" - ] - }, - "CreateDatasetRequest": { - "type": "object", - "properties": { - "uuid": { - "type": "string" - }, - "dataset": { - "$ref": "#/components/schemas/TrainEvalDataset" - } - }, - "additionalProperties": false, - "required": [ - "uuid", - "dataset" - ] - }, "DeleteAgentsRequest": { "type": "object", "properties": { @@ -4193,18 +4407,6 @@ "session_id" ] }, - "DeleteDatasetRequest": { - "type": "object", - "properties": { - "dataset_uuid": { - "type": "string" - } - }, - "additionalProperties": false, - "required": [ - "dataset_uuid" - ] - }, "EmbeddingsRequest": { "type": "object", "properties": { @@ -4262,74 +4464,251 @@ "embeddings" ] }, - "EvaluateQuestionAnsweringRequest": { + "AgentCandidate": { "type": "object", "properties": { - "metrics": { + "type": { + "type": "string", + "const": "agent", + "default": "agent" + }, + "config": { + "$ref": "#/components/schemas/AgentConfig" + } + }, + "additionalProperties": false, + "required": [ + "type", + "config" + ] + }, + "ModelCandidate": { + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "model", + "default": 
"model" + }, + "model": { + "type": "string" + }, + "sampling_params": { + "$ref": "#/components/schemas/SamplingParams" + }, + "system_message": { + "$ref": "#/components/schemas/SystemMessage" + } + }, + "additionalProperties": false, + "required": [ + "type", + "model", + "sampling_params" + ] + }, + "EvaluateRequest": { + "type": "object", + "properties": { + "input_rows": { "type": "array", "items": { - "type": "string", - "enum": [ - "em", - "f1" + "type": "object", + "additionalProperties": { + "oneOf": [ + { + "type": "null" + }, + { + "type": "boolean" + }, + { + "type": "number" + }, + { + "type": "string" + }, + { + "type": "array" + }, + { + "type": "object" + } + ] + } + } + }, + "candidate": { + "oneOf": [ + { + "$ref": "#/components/schemas/ModelCandidate" + }, + { + "$ref": "#/components/schemas/AgentCandidate" + } + ] + }, + "scoring_functions": { + "type": "array", + "items": { + "type": "string" + } + } + }, + "additionalProperties": false, + "required": [ + "input_rows", + "candidate", + "scoring_functions" + ] + }, + "EvaluateResponse": { + "type": "object", + "properties": { + "generations": { + "type": "array", + "items": { + "type": "object", + "additionalProperties": { + "oneOf": [ + { + "type": "null" + }, + { + "type": "boolean" + }, + { + "type": "number" + }, + { + "type": "string" + }, + { + "type": "array" + }, + { + "type": "object" + } + ] + } + } + }, + "scores": { + "type": "object", + "additionalProperties": { + "$ref": "#/components/schemas/ScoringResult" + } + } + }, + "additionalProperties": false, + "required": [ + "generations", + "scores" + ] + }, + "ScoringResult": { + "type": "object", + "properties": { + "score_rows": { + "type": "array", + "items": { + "type": "object", + "additionalProperties": { + "oneOf": [ + { + "type": "null" + }, + { + "type": "boolean" + }, + { + "type": "number" + }, + { + "type": "string" + }, + { + "type": "array" + }, + { + "type": "object" + } + ] + } + } + }, + "aggregated_results": { + 
"type": "object", + "additionalProperties": { + "oneOf": [ + { + "type": "null" + }, + { + "type": "boolean" + }, + { + "type": "number" + }, + { + "type": "string" + }, + { + "type": "array" + }, + { + "type": "object" + } ] } } }, "additionalProperties": false, "required": [ - "metrics" + "score_rows", + "aggregated_results" ] }, - "EvaluationJob": { + "EvaluateBatchRequest": { "type": "object", "properties": { - "job_uuid": { + "dataset_id": { + "type": "string" + }, + "candidate": { + "oneOf": [ + { + "$ref": "#/components/schemas/ModelCandidate" + }, + { + "$ref": "#/components/schemas/AgentCandidate" + } + ] + }, + "scoring_functions": { + "type": "array", + "items": { + "type": "string" + } + } + }, + "additionalProperties": false, + "required": [ + "dataset_id", + "candidate", + "scoring_functions" + ] + }, + "Job": { + "type": "object", + "properties": { + "job_id": { "type": "string" } }, "additionalProperties": false, "required": [ - "job_uuid" - ] - }, - "EvaluateSummarizationRequest": { - "type": "object", - "properties": { - "metrics": { - "type": "array", - "items": { - "type": "string", - "enum": [ - "rouge", - "bleu" - ] - } - } - }, - "additionalProperties": false, - "required": [ - "metrics" - ] - }, - "EvaluateTextGenerationRequest": { - "type": "object", - "properties": { - "metrics": { - "type": "array", - "items": { - "type": "string", - "enum": [ - "perplexity", - "rouge", - "bleu" - ] - } - } - }, - "additionalProperties": false, - "required": [ - "metrics" + "job_id" ] }, "GetAgentsSessionRequest": { @@ -4517,41 +4896,216 @@ "step" ] }, - "EvaluationJobArtifactsResponse": { + "DatasetDefWithProvider": { "type": "object", "properties": { - "job_uuid": { + "identifier": { + "type": "string" + }, + "dataset_schema": { + "type": "object", + "additionalProperties": { + "oneOf": [ + { + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "string", + "default": "string" + } + }, + "additionalProperties": false, + 
"required": [ + "type" + ] + }, + { + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "number", + "default": "number" + } + }, + "additionalProperties": false, + "required": [ + "type" + ] + }, + { + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "boolean", + "default": "boolean" + } + }, + "additionalProperties": false, + "required": [ + "type" + ] + }, + { + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "array", + "default": "array" + } + }, + "additionalProperties": false, + "required": [ + "type" + ] + }, + { + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "object", + "default": "object" + } + }, + "additionalProperties": false, + "required": [ + "type" + ] + }, + { + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "json", + "default": "json" + } + }, + "additionalProperties": false, + "required": [ + "type" + ] + }, + { + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "union", + "default": "union" + } + }, + "additionalProperties": false, + "required": [ + "type" + ] + }, + { + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "custom", + "default": "custom" + }, + "validator_class": { + "type": "string" + } + }, + "additionalProperties": false, + "required": [ + "type", + "validator_class" + ] + }, + { + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "chat_completion_input", + "default": "chat_completion_input" + } + }, + "additionalProperties": false, + "required": [ + "type" + ] + }, + { + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "completion_input", + "default": "completion_input" + } + }, + "additionalProperties": false, + "required": [ + "type" + ] + }, + { + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "agent_turn_input", + "default": 
"agent_turn_input" + } + }, + "additionalProperties": false, + "required": [ + "type" + ] + } + ] + } + }, + "url": { + "$ref": "#/components/schemas/URL" + }, + "metadata": { + "type": "object", + "additionalProperties": { + "oneOf": [ + { + "type": "null" + }, + { + "type": "boolean" + }, + { + "type": "number" + }, + { + "type": "string" + }, + { + "type": "array" + }, + { + "type": "object" + } + ] + } + }, + "provider_id": { "type": "string" } }, "additionalProperties": false, "required": [ - "job_uuid" - ], - "title": "Artifacts of a evaluation job." - }, - "EvaluationJobLogStream": { - "type": "object", - "properties": { - "job_uuid": { - "type": "string" - } - }, - "additionalProperties": false, - "required": [ - "job_uuid" - ] - }, - "EvaluationJobStatusResponse": { - "type": "object", - "properties": { - "job_uuid": { - "type": "string" - } - }, - "additionalProperties": false, - "required": [ - "job_uuid" + "identifier", + "dataset_schema", + "url", + "metadata", + "provider_id" ] }, "ModelDefWithProvider": { @@ -4600,6 +5154,458 @@ "provider_id" ] }, + "PaginatedRowsResult": { + "type": "object", + "properties": { + "rows": { + "type": "array", + "items": { + "type": "object", + "additionalProperties": { + "oneOf": [ + { + "type": "null" + }, + { + "type": "boolean" + }, + { + "type": "number" + }, + { + "type": "string" + }, + { + "type": "array" + }, + { + "type": "object" + } + ] + } + } + }, + "total_count": { + "type": "integer" + }, + "next_page_token": { + "type": "string" + } + }, + "additionalProperties": false, + "required": [ + "rows", + "total_count" + ] + }, + "Parameter": { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "type": { + "oneOf": [ + { + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "string", + "default": "string" + } + }, + "additionalProperties": false, + "required": [ + "type" + ] + }, + { + "type": "object", + "properties": { + "type": { + "type": "string", + 
"const": "number", + "default": "number" + } + }, + "additionalProperties": false, + "required": [ + "type" + ] + }, + { + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "boolean", + "default": "boolean" + } + }, + "additionalProperties": false, + "required": [ + "type" + ] + }, + { + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "array", + "default": "array" + } + }, + "additionalProperties": false, + "required": [ + "type" + ] + }, + { + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "object", + "default": "object" + } + }, + "additionalProperties": false, + "required": [ + "type" + ] + }, + { + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "json", + "default": "json" + } + }, + "additionalProperties": false, + "required": [ + "type" + ] + }, + { + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "union", + "default": "union" + } + }, + "additionalProperties": false, + "required": [ + "type" + ] + }, + { + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "custom", + "default": "custom" + }, + "validator_class": { + "type": "string" + } + }, + "additionalProperties": false, + "required": [ + "type", + "validator_class" + ] + }, + { + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "chat_completion_input", + "default": "chat_completion_input" + } + }, + "additionalProperties": false, + "required": [ + "type" + ] + }, + { + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "completion_input", + "default": "completion_input" + } + }, + "additionalProperties": false, + "required": [ + "type" + ] + }, + { + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "agent_turn_input", + "default": "agent_turn_input" + } + }, + "additionalProperties": false, + "required": [ + "type" + ] + } + ] + }, + 
"description": { + "type": "string" + } + }, + "additionalProperties": false, + "required": [ + "name", + "type" + ] + }, + "ScoringFunctionDefWithProvider": { + "type": "object", + "properties": { + "identifier": { + "type": "string" + }, + "description": { + "type": "string" + }, + "metadata": { + "type": "object", + "additionalProperties": { + "oneOf": [ + { + "type": "null" + }, + { + "type": "boolean" + }, + { + "type": "number" + }, + { + "type": "string" + }, + { + "type": "array" + }, + { + "type": "object" + } + ] + } + }, + "parameters": { + "type": "array", + "items": { + "$ref": "#/components/schemas/Parameter" + } + }, + "return_type": { + "oneOf": [ + { + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "string", + "default": "string" + } + }, + "additionalProperties": false, + "required": [ + "type" + ] + }, + { + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "number", + "default": "number" + } + }, + "additionalProperties": false, + "required": [ + "type" + ] + }, + { + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "boolean", + "default": "boolean" + } + }, + "additionalProperties": false, + "required": [ + "type" + ] + }, + { + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "array", + "default": "array" + } + }, + "additionalProperties": false, + "required": [ + "type" + ] + }, + { + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "object", + "default": "object" + } + }, + "additionalProperties": false, + "required": [ + "type" + ] + }, + { + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "json", + "default": "json" + } + }, + "additionalProperties": false, + "required": [ + "type" + ] + }, + { + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "union", + "default": "union" + } + }, + "additionalProperties": false, + "required": [ 
+ "type" + ] + }, + { + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "custom", + "default": "custom" + }, + "validator_class": { + "type": "string" + } + }, + "additionalProperties": false, + "required": [ + "type", + "validator_class" + ] + }, + { + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "chat_completion_input", + "default": "chat_completion_input" + } + }, + "additionalProperties": false, + "required": [ + "type" + ] + }, + { + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "completion_input", + "default": "completion_input" + } + }, + "additionalProperties": false, + "required": [ + "type" + ] + }, + { + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "agent_turn_input", + "default": "agent_turn_input" + } + }, + "additionalProperties": false, + "required": [ + "type" + ] + } + ] + }, + "context": { + "type": "object", + "properties": { + "judge_model": { + "type": "string" + }, + "prompt_template": { + "type": "string" + } + }, + "additionalProperties": false, + "required": [ + "judge_model" + ] + }, + "provider_id": { + "type": "string" + } + }, + "additionalProperties": false, + "required": [ + "identifier", + "metadata", + "parameters", + "return_type", + "provider_id" + ] + }, "ShieldDefWithProvider": { "type": "object", "properties": { @@ -4898,6 +5904,25 @@ "documents" ] }, + "JobCancelRequest": { + "type": "object", + "properties": { + "job_id": { + "type": "string" + } + }, + "additionalProperties": false, + "required": [ + "job_id" + ] + }, + "JobStatus": { + "type": "string", + "enum": [ + "completed", + "in_progress" + ] + }, "ProviderInfo": { "type": "object", "properties": { @@ -5315,10 +6340,10 @@ "$ref": "#/components/schemas/URL" }, "dataset": { - "$ref": "#/components/schemas/TrainEvalDataset" + "type": "string" }, "validation_dataset": { - "$ref": "#/components/schemas/TrainEvalDataset" + "type": "string" }, 
"algorithm": { "$ref": "#/components/schemas/RLHFAlgorithm" @@ -5517,6 +6542,18 @@ "scores" ] }, + "RegisterDatasetRequest": { + "type": "object", + "properties": { + "dataset_def": { + "$ref": "#/components/schemas/DatasetDefWithProvider" + } + }, + "additionalProperties": false, + "required": [ + "dataset_def" + ] + }, "RegisterMemoryBankRequest": { "type": "object", "properties": { @@ -5554,6 +6591,18 @@ "model" ] }, + "RegisterScoringFunctionRequest": { + "type": "object", + "properties": { + "function_def": { + "$ref": "#/components/schemas/ScoringFunctionDefWithProvider" + } + }, + "additionalProperties": false, + "required": [ + "function_def" + ] + }, "RegisterShieldRequest": { "type": "object", "properties": { @@ -5566,153 +6615,6 @@ "shield" ] }, - "DialogGenerations": { - "type": "object", - "properties": { - "dialog": { - "type": "array", - "items": { - "oneOf": [ - { - "$ref": "#/components/schemas/UserMessage" - }, - { - "$ref": "#/components/schemas/SystemMessage" - }, - { - "$ref": "#/components/schemas/ToolResponseMessage" - }, - { - "$ref": "#/components/schemas/CompletionMessage" - } - ] - } - }, - "sampled_generations": { - "type": "array", - "items": { - "oneOf": [ - { - "$ref": "#/components/schemas/UserMessage" - }, - { - "$ref": "#/components/schemas/SystemMessage" - }, - { - "$ref": "#/components/schemas/ToolResponseMessage" - }, - { - "$ref": "#/components/schemas/CompletionMessage" - } - ] - } - } - }, - "additionalProperties": false, - "required": [ - "dialog", - "sampled_generations" - ] - }, - "RewardScoreRequest": { - "type": "object", - "properties": { - "dialog_generations": { - "type": "array", - "items": { - "$ref": "#/components/schemas/DialogGenerations" - } - }, - "model": { - "type": "string" - } - }, - "additionalProperties": false, - "required": [ - "dialog_generations", - "model" - ] - }, - "RewardScoringResponse": { - "type": "object", - "properties": { - "scored_generations": { - "type": "array", - "items": { - "$ref": 
"#/components/schemas/ScoredDialogGenerations" - } - } - }, - "additionalProperties": false, - "required": [ - "scored_generations" - ], - "title": "Response from the reward scoring. Batch of (prompt, response, score) tuples that pass the threshold." - }, - "ScoredDialogGenerations": { - "type": "object", - "properties": { - "dialog": { - "type": "array", - "items": { - "oneOf": [ - { - "$ref": "#/components/schemas/UserMessage" - }, - { - "$ref": "#/components/schemas/SystemMessage" - }, - { - "$ref": "#/components/schemas/ToolResponseMessage" - }, - { - "$ref": "#/components/schemas/CompletionMessage" - } - ] - } - }, - "scored_generations": { - "type": "array", - "items": { - "$ref": "#/components/schemas/ScoredMessage" - } - } - }, - "additionalProperties": false, - "required": [ - "dialog", - "scored_generations" - ] - }, - "ScoredMessage": { - "type": "object", - "properties": { - "message": { - "oneOf": [ - { - "$ref": "#/components/schemas/UserMessage" - }, - { - "$ref": "#/components/schemas/SystemMessage" - }, - { - "$ref": "#/components/schemas/ToolResponseMessage" - }, - { - "$ref": "#/components/schemas/CompletionMessage" - } - ] - }, - "score": { - "type": "number" - } - }, - "additionalProperties": false, - "required": [ - "message", - "score" - ] - }, "RunShieldRequest": { "type": "object", "properties": { @@ -5780,6 +6682,106 @@ }, "additionalProperties": false }, + "ScoreRequest": { + "type": "object", + "properties": { + "input_rows": { + "type": "array", + "items": { + "type": "object", + "additionalProperties": { + "oneOf": [ + { + "type": "null" + }, + { + "type": "boolean" + }, + { + "type": "number" + }, + { + "type": "string" + }, + { + "type": "array" + }, + { + "type": "object" + } + ] + } + } + }, + "scoring_functions": { + "type": "array", + "items": { + "type": "string" + } + } + }, + "additionalProperties": false, + "required": [ + "input_rows", + "scoring_functions" + ] + }, + "ScoreResponse": { + "type": "object", + "properties": { 
+ "results": { + "type": "object", + "additionalProperties": { + "$ref": "#/components/schemas/ScoringResult" + } + } + }, + "additionalProperties": false, + "required": [ + "results" + ] + }, + "ScoreBatchRequest": { + "type": "object", + "properties": { + "dataset_id": { + "type": "string" + }, + "scoring_functions": { + "type": "array", + "items": { + "type": "string" + } + }, + "save_results_dataset": { + "type": "boolean" + } + }, + "additionalProperties": false, + "required": [ + "dataset_id", + "scoring_functions", + "save_results_dataset" + ] + }, + "ScoreBatchResponse": { + "type": "object", + "properties": { + "dataset_id": { + "type": "string" + }, + "results": { + "type": "object", + "additionalProperties": { + "$ref": "#/components/schemas/ScoringResult" + } + } + }, + "additionalProperties": false, + "required": [ + "results" + ] + }, "DoraFinetuningConfig": { "type": "object", "properties": { @@ -5892,10 +6894,10 @@ "type": "string" }, "dataset": { - "$ref": "#/components/schemas/TrainEvalDataset" + "type": "string" }, "validation_dataset": { - "$ref": "#/components/schemas/TrainEvalDataset" + "type": "string" }, "algorithm": { "$ref": "#/components/schemas/FinetuningAlgorithm" @@ -6034,7 +7036,29 @@ "synthetic_data": { "type": "array", "items": { - "$ref": "#/components/schemas/ScoredDialogGenerations" + "type": "object", + "additionalProperties": { + "oneOf": [ + { + "type": "null" + }, + { + "type": "boolean" + }, + { + "type": "number" + }, + { + "type": "string" + }, + { + "type": "array" + }, + { + "type": "object" + } + ] + } } }, "statistics": { @@ -6079,49 +7103,55 @@ ], "tags": [ { - "name": "Models" + "name": "Eval" }, { - "name": "RewardScoring" - }, - { - "name": "MemoryBanks" - }, - { - "name": "Shields" + "name": "ScoringFunctions" }, { "name": "SyntheticDataGeneration" }, - { - "name": "Inference" - }, { "name": "Inspect" }, - { - "name": "BatchInference" - }, - { - "name": "Memory" - }, - { - "name": "Datasets" - }, - { - "name": 
"Agents" - }, { "name": "PostTraining" }, { - "name": "Telemetry" + "name": "Models" }, { "name": "Safety" }, { - "name": "Evaluations" + "name": "MemoryBanks" + }, + { + "name": "DatasetIO" + }, + { + "name": "Memory" + }, + { + "name": "Scoring" + }, + { + "name": "Shields" + }, + { + "name": "Datasets" + }, + { + "name": "Inference" + }, + { + "name": "Telemetry" + }, + { + "name": "BatchInference" + }, + { + "name": "Agents" }, { "name": "BuiltinTool", @@ -6199,10 +7229,6 @@ "name": "BatchCompletionResponse", "description": "" }, - { - "name": "CancelEvaluationJobRequest", - "description": "" - }, { "name": "CancelTrainingJobRequest", "description": "" @@ -6371,18 +7397,6 @@ "name": "ViolationLevel", "description": "" }, - { - "name": "TrainEvalDataset", - "description": "Dataset to be used for training or evaluating language models.\n\n" - }, - { - "name": "TrainEvalDatasetColumnType", - "description": "" - }, - { - "name": "CreateDatasetRequest", - "description": "" - }, { "name": "DeleteAgentsRequest", "description": "" @@ -6391,10 +7405,6 @@ "name": "DeleteAgentsSessionRequest", "description": "" }, - { - "name": "DeleteDatasetRequest", - "description": "" - }, { "name": "EmbeddingsRequest", "description": "" @@ -6404,20 +7414,32 @@ "description": "" }, { - "name": "EvaluateQuestionAnsweringRequest", - "description": "" + "name": "AgentCandidate", + "description": "" }, { - "name": "EvaluationJob", - "description": "" + "name": "ModelCandidate", + "description": "" }, { - "name": "EvaluateSummarizationRequest", - "description": "" + "name": "EvaluateRequest", + "description": "" }, { - "name": "EvaluateTextGenerationRequest", - "description": "" + "name": "EvaluateResponse", + "description": "" + }, + { + "name": "ScoringResult", + "description": "" + }, + { + "name": "EvaluateBatchRequest", + "description": "" + }, + { + "name": "Job", + "description": "" }, { "name": "GetAgentsSessionRequest", @@ -6448,21 +7470,25 @@ "description": "" }, { - "name": 
"EvaluationJobArtifactsResponse", - "description": "Artifacts of a evaluation job.\n\n" - }, - { - "name": "EvaluationJobLogStream", - "description": "" - }, - { - "name": "EvaluationJobStatusResponse", - "description": "" + "name": "DatasetDefWithProvider", + "description": "" }, { "name": "ModelDefWithProvider", "description": "" }, + { + "name": "PaginatedRowsResult", + "description": "" + }, + { + "name": "Parameter", + "description": "" + }, + { + "name": "ScoringFunctionDefWithProvider", + "description": "" + }, { "name": "ShieldDefWithProvider", "description": "" @@ -6507,6 +7533,14 @@ "name": "InsertDocumentsRequest", "description": "" }, + { + "name": "JobCancelRequest", + "description": "" + }, + { + "name": "JobStatus", + "description": "" + }, { "name": "ProviderInfo", "description": "" @@ -6575,6 +7609,10 @@ "name": "QueryDocumentsResponse", "description": "" }, + { + "name": "RegisterDatasetRequest", + "description": "" + }, { "name": "RegisterMemoryBankRequest", "description": "" @@ -6583,30 +7621,14 @@ "name": "RegisterModelRequest", "description": "" }, + { + "name": "RegisterScoringFunctionRequest", + "description": "" + }, { "name": "RegisterShieldRequest", "description": "" }, - { - "name": "DialogGenerations", - "description": "" - }, - { - "name": "RewardScoreRequest", - "description": "" - }, - { - "name": "RewardScoringResponse", - "description": "Response from the reward scoring. 
Batch of (prompt, response, score) tuples that pass the threshold.\n\n" - }, - { - "name": "ScoredDialogGenerations", - "description": "" - }, - { - "name": "ScoredMessage", - "description": "" - }, { "name": "RunShieldRequest", "description": "" @@ -6615,6 +7637,22 @@ "name": "RunShieldResponse", "description": "" }, + { + "name": "ScoreRequest", + "description": "" + }, + { + "name": "ScoreResponse", + "description": "" + }, + { + "name": "ScoreBatchRequest", + "description": "" + }, + { + "name": "ScoreBatchResponse", + "description": "" + }, { "name": "DoraFinetuningConfig", "description": "" @@ -6650,16 +7688,18 @@ "tags": [ "Agents", "BatchInference", + "DatasetIO", "Datasets", - "Evaluations", + "Eval", "Inference", "Inspect", "Memory", "MemoryBanks", "Models", "PostTraining", - "RewardScoring", "Safety", + "Scoring", + "ScoringFunctions", "Shields", "SyntheticDataGeneration", "Telemetry" @@ -6668,6 +7708,7 @@ { "name": "Types", "tags": [ + "AgentCandidate", "AgentConfig", "AgentCreateResponse", "AgentSessionCreateResponse", @@ -6685,7 +7726,6 @@ "BatchCompletionRequest", "BatchCompletionResponse", "BuiltinTool", - "CancelEvaluationJobRequest", "CancelTrainingJobRequest", "ChatCompletionRequest", "ChatCompletionResponse", @@ -6701,22 +7741,16 @@ "CreateAgentRequest", "CreateAgentSessionRequest", "CreateAgentTurnRequest", - "CreateDatasetRequest", "DPOAlignmentConfig", + "DatasetDefWithProvider", "DeleteAgentsRequest", "DeleteAgentsSessionRequest", - "DeleteDatasetRequest", - "DialogGenerations", "DoraFinetuningConfig", "EmbeddingsRequest", "EmbeddingsResponse", - "EvaluateQuestionAnsweringRequest", - "EvaluateSummarizationRequest", - "EvaluateTextGenerationRequest", - "EvaluationJob", - "EvaluationJobArtifactsResponse", - "EvaluationJobLogStream", - "EvaluationJobStatusResponse", + "EvaluateBatchRequest", + "EvaluateRequest", + "EvaluateResponse", "FinetuningAlgorithm", "FunctionCallToolDefinition", "GetAgentsSessionRequest", @@ -6725,6 +7759,9 @@ 
"ImageMedia", "InferenceStep", "InsertDocumentsRequest", + "Job", + "JobCancelRequest", + "JobStatus", "KeyValueMemoryBankDef", "KeywordMemoryBankDef", "LogEventRequest", @@ -6734,8 +7771,11 @@ "MemoryRetrievalStep", "MemoryToolDefinition", "MetricEvent", + "ModelCandidate", "ModelDefWithProvider", "OptimizerConfig", + "PaginatedRowsResult", + "Parameter", "PhotogenToolDefinition", "PostTrainingJob", "PostTrainingJobArtifactsResponse", @@ -6748,21 +7788,25 @@ "QueryDocumentsRequest", "QueryDocumentsResponse", "RLHFAlgorithm", + "RegisterDatasetRequest", "RegisterMemoryBankRequest", "RegisterModelRequest", + "RegisterScoringFunctionRequest", "RegisterShieldRequest", "RestAPIExecutionConfig", "RestAPIMethod", - "RewardScoreRequest", - "RewardScoringResponse", "RouteInfo", "RunShieldRequest", "RunShieldResponse", "SafetyViolation", "SamplingParams", "SamplingStrategy", - "ScoredDialogGenerations", - "ScoredMessage", + "ScoreBatchRequest", + "ScoreBatchResponse", + "ScoreRequest", + "ScoreResponse", + "ScoringFunctionDefWithProvider", + "ScoringResult", "SearchToolDefinition", "Session", "ShieldCallStep", @@ -6788,8 +7832,6 @@ "ToolResponse", "ToolResponseMessage", "Trace", - "TrainEvalDataset", - "TrainEvalDatasetColumnType", "TrainingConfig", "Turn", "URL", diff --git a/docs/resources/llama-stack-spec.yaml b/docs/resources/llama-stack-spec.yaml index 906d3934a..9dcdbb028 100644 --- a/docs/resources/llama-stack-spec.yaml +++ b/docs/resources/llama-stack-spec.yaml @@ -1,6 +1,19 @@ components: responses: {} schemas: + AgentCandidate: + additionalProperties: false + properties: + config: + $ref: '#/components/schemas/AgentConfig' + type: + const: agent + default: agent + type: string + required: + - type + - config + type: object AgentConfig: additionalProperties: false properties: @@ -315,14 +328,6 @@ components: - photogen - code_interpreter type: string - CancelEvaluationJobRequest: - additionalProperties: false - properties: - job_uuid: - type: string - required: - - 
job_uuid - type: object CancelTrainingJobRequest: additionalProperties: false properties: @@ -351,6 +356,48 @@ components: type: array model: type: string + response_format: + oneOf: + - additionalProperties: false + properties: + schema: + additionalProperties: + oneOf: + - type: 'null' + - type: boolean + - type: number + - type: string + - type: array + - type: object + type: object + type: + const: json_schema + default: json_schema + type: string + required: + - type + - schema + type: object + - additionalProperties: false + properties: + bnf: + additionalProperties: + oneOf: + - type: 'null' + - type: boolean + - type: number + - type: string + - type: array + - type: object + type: object + type: + const: grammar + default: grammar + type: string + required: + - type + - bnf + type: object sampling_params: $ref: '#/components/schemas/SamplingParams' stream: @@ -490,6 +537,48 @@ components: type: object model: type: string + response_format: + oneOf: + - additionalProperties: false + properties: + schema: + additionalProperties: + oneOf: + - type: 'null' + - type: boolean + - type: number + - type: string + - type: array + - type: object + type: object + type: + const: json_schema + default: json_schema + type: string + required: + - type + - schema + type: object + - additionalProperties: false + properties: + bnf: + additionalProperties: + oneOf: + - type: 'null' + - type: boolean + - type: number + - type: string + - type: array + - type: object + type: object + type: + const: grammar + default: grammar + type: string + required: + - type + - bnf + type: object sampling_params: $ref: '#/components/schemas/SamplingParams' stream: @@ -572,17 +661,6 @@ components: - session_id - messages type: object - CreateDatasetRequest: - additionalProperties: false - properties: - dataset: - $ref: '#/components/schemas/TrainEvalDataset' - uuid: - type: string - required: - - uuid - - dataset - type: object DPOAlignmentConfig: additionalProperties: false properties: @@ 
-600,6 +678,138 @@ components: - epsilon - gamma type: object + DatasetDefWithProvider: + additionalProperties: false + properties: + dataset_schema: + additionalProperties: + oneOf: + - additionalProperties: false + properties: + type: + const: string + default: string + type: string + required: + - type + type: object + - additionalProperties: false + properties: + type: + const: number + default: number + type: string + required: + - type + type: object + - additionalProperties: false + properties: + type: + const: boolean + default: boolean + type: string + required: + - type + type: object + - additionalProperties: false + properties: + type: + const: array + default: array + type: string + required: + - type + type: object + - additionalProperties: false + properties: + type: + const: object + default: object + type: string + required: + - type + type: object + - additionalProperties: false + properties: + type: + const: json + default: json + type: string + required: + - type + type: object + - additionalProperties: false + properties: + type: + const: union + default: union + type: string + required: + - type + type: object + - additionalProperties: false + properties: + type: + const: custom + default: custom + type: string + validator_class: + type: string + required: + - type + - validator_class + type: object + - additionalProperties: false + properties: + type: + const: chat_completion_input + default: chat_completion_input + type: string + required: + - type + type: object + - additionalProperties: false + properties: + type: + const: completion_input + default: completion_input + type: string + required: + - type + type: object + - additionalProperties: false + properties: + type: + const: agent_turn_input + default: agent_turn_input + type: string + required: + - type + type: object + type: object + identifier: + type: string + metadata: + additionalProperties: + oneOf: + - type: 'null' + - type: boolean + - type: number + - type: string + - type: 
array + - type: object + type: object + provider_id: + type: string + url: + $ref: '#/components/schemas/URL' + required: + - identifier + - dataset_schema + - url + - metadata + - provider_id + type: object DeleteAgentsRequest: additionalProperties: false properties: @@ -619,37 +829,6 @@ components: - agent_id - session_id type: object - DeleteDatasetRequest: - additionalProperties: false - properties: - dataset_uuid: - type: string - required: - - dataset_uuid - type: object - DialogGenerations: - additionalProperties: false - properties: - dialog: - items: - oneOf: - - $ref: '#/components/schemas/UserMessage' - - $ref: '#/components/schemas/SystemMessage' - - $ref: '#/components/schemas/ToolResponseMessage' - - $ref: '#/components/schemas/CompletionMessage' - type: array - sampled_generations: - items: - oneOf: - - $ref: '#/components/schemas/UserMessage' - - $ref: '#/components/schemas/SystemMessage' - - $ref: '#/components/schemas/ToolResponseMessage' - - $ref: '#/components/schemas/CompletionMessage' - type: array - required: - - dialog - - sampled_generations - type: object DoraFinetuningConfig: additionalProperties: false properties: @@ -704,78 +883,74 @@ components: required: - embeddings type: object - EvaluateQuestionAnsweringRequest: + EvaluateBatchRequest: additionalProperties: false properties: - metrics: + candidate: + oneOf: + - $ref: '#/components/schemas/ModelCandidate' + - $ref: '#/components/schemas/AgentCandidate' + dataset_id: + type: string + scoring_functions: items: - enum: - - em - - f1 type: string type: array required: - - metrics + - dataset_id + - candidate + - scoring_functions type: object - EvaluateSummarizationRequest: + EvaluateRequest: additionalProperties: false properties: - metrics: + candidate: + oneOf: + - $ref: '#/components/schemas/ModelCandidate' + - $ref: '#/components/schemas/AgentCandidate' + input_rows: + items: + additionalProperties: + oneOf: + - type: 'null' + - type: boolean + - type: number + - type: string + - 
type: array + - type: object + type: object + type: array + scoring_functions: items: - enum: - - rouge - - bleu type: string type: array required: - - metrics + - input_rows + - candidate + - scoring_functions type: object - EvaluateTextGenerationRequest: + EvaluateResponse: additionalProperties: false properties: - metrics: + generations: items: - enum: - - perplexity - - rouge - - bleu - type: string + additionalProperties: + oneOf: + - type: 'null' + - type: boolean + - type: number + - type: string + - type: array + - type: object + type: object type: array + scores: + additionalProperties: + $ref: '#/components/schemas/ScoringResult' + type: object required: - - metrics - type: object - EvaluationJob: - additionalProperties: false - properties: - job_uuid: - type: string - required: - - job_uuid - type: object - EvaluationJobArtifactsResponse: - additionalProperties: false - properties: - job_uuid: - type: string - required: - - job_uuid - title: Artifacts of a evaluation job. - type: object - EvaluationJobLogStream: - additionalProperties: false - properties: - job_uuid: - type: string - required: - - job_uuid - type: object - EvaluationJobStatusResponse: - additionalProperties: false - properties: - job_uuid: - type: string - required: - - job_uuid + - generations + - scores type: object FinetuningAlgorithm: enum: @@ -905,6 +1080,27 @@ components: - bank_id - documents type: object + Job: + additionalProperties: false + properties: + job_id: + type: string + required: + - job_id + type: object + JobCancelRequest: + additionalProperties: false + properties: + job_id: + type: string + required: + - job_id + type: object + JobStatus: + enum: + - completed + - in_progress + type: string KeyValueMemoryBankDef: additionalProperties: false properties: @@ -1220,6 +1416,24 @@ components: - value - unit type: object + ModelCandidate: + additionalProperties: false + properties: + model: + type: string + sampling_params: + $ref: '#/components/schemas/SamplingParams' + 
system_message: + $ref: '#/components/schemas/SystemMessage' + type: + const: model + default: model + type: string + required: + - type + - model + - sampling_params + type: object ModelDefWithProvider: additionalProperties: false properties: @@ -1266,6 +1480,144 @@ components: - lr_min - weight_decay type: object + PaginatedRowsResult: + additionalProperties: false + properties: + next_page_token: + type: string + rows: + items: + additionalProperties: + oneOf: + - type: 'null' + - type: boolean + - type: number + - type: string + - type: array + - type: object + type: object + type: array + total_count: + type: integer + required: + - rows + - total_count + type: object + Parameter: + additionalProperties: false + properties: + description: + type: string + name: + type: string + type: + oneOf: + - additionalProperties: false + properties: + type: + const: string + default: string + type: string + required: + - type + type: object + - additionalProperties: false + properties: + type: + const: number + default: number + type: string + required: + - type + type: object + - additionalProperties: false + properties: + type: + const: boolean + default: boolean + type: string + required: + - type + type: object + - additionalProperties: false + properties: + type: + const: array + default: array + type: string + required: + - type + type: object + - additionalProperties: false + properties: + type: + const: object + default: object + type: string + required: + - type + type: object + - additionalProperties: false + properties: + type: + const: json + default: json + type: string + required: + - type + type: object + - additionalProperties: false + properties: + type: + const: union + default: union + type: string + required: + - type + type: object + - additionalProperties: false + properties: + type: + const: custom + default: custom + type: string + validator_class: + type: string + required: + - type + - validator_class + type: object + - additionalProperties: 
false + properties: + type: + const: chat_completion_input + default: chat_completion_input + type: string + required: + - type + type: object + - additionalProperties: false + properties: + type: + const: completion_input + default: completion_input + type: string + required: + - type + type: object + - additionalProperties: false + properties: + type: + const: agent_turn_input + default: agent_turn_input + type: string + required: + - type + type: object + required: + - name + - type + type: object PhotogenToolDefinition: additionalProperties: false properties: @@ -1373,7 +1725,7 @@ components: algorithm_config: $ref: '#/components/schemas/DPOAlignmentConfig' dataset: - $ref: '#/components/schemas/TrainEvalDataset' + type: string finetuned_model: $ref: '#/components/schemas/URL' hyperparam_search_config: @@ -1403,7 +1755,7 @@ components: training_config: $ref: '#/components/schemas/TrainingConfig' validation_dataset: - $ref: '#/components/schemas/TrainEvalDataset' + type: string required: - job_uuid - finetuned_model @@ -1515,6 +1867,14 @@ components: enum: - dpo type: string + RegisterDatasetRequest: + additionalProperties: false + properties: + dataset_def: + $ref: '#/components/schemas/DatasetDefWithProvider' + required: + - dataset_def + type: object RegisterMemoryBankRequest: additionalProperties: false properties: @@ -1535,6 +1895,14 @@ components: required: - model type: object + RegisterScoringFunctionRequest: + additionalProperties: false + properties: + function_def: + $ref: '#/components/schemas/ScoringFunctionDefWithProvider' + required: + - function_def + type: object RegisterShieldRequest: additionalProperties: false properties: @@ -1591,31 +1959,6 @@ components: - PUT - DELETE type: string - RewardScoreRequest: - additionalProperties: false - properties: - dialog_generations: - items: - $ref: '#/components/schemas/DialogGenerations' - type: array - model: - type: string - required: - - dialog_generations - - model - type: object - 
RewardScoringResponse: - additionalProperties: false - properties: - scored_generations: - items: - $ref: '#/components/schemas/ScoredDialogGenerations' - type: array - required: - - scored_generations - title: Response from the reward scoring. Batch of (prompt, response, score) - tuples that pass the threshold. - type: object RouteInfo: additionalProperties: false properties: @@ -1717,39 +2060,239 @@ components: - top_p - top_k type: string - ScoredDialogGenerations: + ScoreBatchRequest: additionalProperties: false properties: - dialog: + dataset_id: + type: string + save_results_dataset: + type: boolean + scoring_functions: items: - oneOf: - - $ref: '#/components/schemas/UserMessage' - - $ref: '#/components/schemas/SystemMessage' - - $ref: '#/components/schemas/ToolResponseMessage' - - $ref: '#/components/schemas/CompletionMessage' - type: array - scored_generations: - items: - $ref: '#/components/schemas/ScoredMessage' + type: string type: array required: - - dialog - - scored_generations + - dataset_id + - scoring_functions + - save_results_dataset type: object - ScoredMessage: + ScoreBatchResponse: additionalProperties: false properties: - message: - oneOf: - - $ref: '#/components/schemas/UserMessage' - - $ref: '#/components/schemas/SystemMessage' - - $ref: '#/components/schemas/ToolResponseMessage' - - $ref: '#/components/schemas/CompletionMessage' - score: - type: number + dataset_id: + type: string + results: + additionalProperties: + $ref: '#/components/schemas/ScoringResult' + type: object required: - - message - - score + - results + type: object + ScoreRequest: + additionalProperties: false + properties: + input_rows: + items: + additionalProperties: + oneOf: + - type: 'null' + - type: boolean + - type: number + - type: string + - type: array + - type: object + type: object + type: array + scoring_functions: + items: + type: string + type: array + required: + - input_rows + - scoring_functions + type: object + ScoreResponse: + additionalProperties: 
false + properties: + results: + additionalProperties: + $ref: '#/components/schemas/ScoringResult' + type: object + required: + - results + type: object + ScoringFunctionDefWithProvider: + additionalProperties: false + properties: + context: + additionalProperties: false + properties: + judge_model: + type: string + prompt_template: + type: string + required: + - judge_model + type: object + description: + type: string + identifier: + type: string + metadata: + additionalProperties: + oneOf: + - type: 'null' + - type: boolean + - type: number + - type: string + - type: array + - type: object + type: object + parameters: + items: + $ref: '#/components/schemas/Parameter' + type: array + provider_id: + type: string + return_type: + oneOf: + - additionalProperties: false + properties: + type: + const: string + default: string + type: string + required: + - type + type: object + - additionalProperties: false + properties: + type: + const: number + default: number + type: string + required: + - type + type: object + - additionalProperties: false + properties: + type: + const: boolean + default: boolean + type: string + required: + - type + type: object + - additionalProperties: false + properties: + type: + const: array + default: array + type: string + required: + - type + type: object + - additionalProperties: false + properties: + type: + const: object + default: object + type: string + required: + - type + type: object + - additionalProperties: false + properties: + type: + const: json + default: json + type: string + required: + - type + type: object + - additionalProperties: false + properties: + type: + const: union + default: union + type: string + required: + - type + type: object + - additionalProperties: false + properties: + type: + const: custom + default: custom + type: string + validator_class: + type: string + required: + - type + - validator_class + type: object + - additionalProperties: false + properties: + type: + const: chat_completion_input + 
default: chat_completion_input + type: string + required: + - type + type: object + - additionalProperties: false + properties: + type: + const: completion_input + default: completion_input + type: string + required: + - type + type: object + - additionalProperties: false + properties: + type: + const: agent_turn_input + default: agent_turn_input + type: string + required: + - type + type: object + required: + - identifier + - metadata + - parameters + - return_type + - provider_id + type: object + ScoringResult: + additionalProperties: false + properties: + aggregated_results: + additionalProperties: + oneOf: + - type: 'null' + - type: boolean + - type: number + - type: string + - type: array + - type: object + type: object + score_rows: + items: + additionalProperties: + oneOf: + - type: 'null' + - type: boolean + - type: number + - type: string + - type: array + - type: object + type: object + type: array + required: + - score_rows + - aggregated_results type: object SearchToolDefinition: additionalProperties: false @@ -1942,7 +2485,7 @@ components: - $ref: '#/components/schemas/QLoraFinetuningConfig' - $ref: '#/components/schemas/DoraFinetuningConfig' dataset: - $ref: '#/components/schemas/TrainEvalDataset' + type: string hyperparam_search_config: additionalProperties: oneOf: @@ -1972,7 +2515,7 @@ components: training_config: $ref: '#/components/schemas/TrainingConfig' validation_dataset: - $ref: '#/components/schemas/TrainEvalDataset' + type: string required: - job_uuid - model @@ -2027,7 +2570,15 @@ components: type: object synthetic_data: items: - $ref: '#/components/schemas/ScoredDialogGenerations' + additionalProperties: + oneOf: + - type: 'null' + - type: boolean + - type: number + - type: string + - type: array + - type: object + type: object type: array required: - synthetic_data @@ -2282,38 +2833,6 @@ components: - root_span_id - start_time type: object - TrainEvalDataset: - additionalProperties: false - properties: - columns: - additionalProperties: - 
$ref: '#/components/schemas/TrainEvalDatasetColumnType' - type: object - content_url: - $ref: '#/components/schemas/URL' - metadata: - additionalProperties: - oneOf: - - type: 'null' - - type: boolean - - type: number - - type: string - - type: array - - type: object - type: object - required: - - columns - - content_url - title: Dataset to be used for training or evaluating language models. - type: object - TrainEvalDatasetColumnType: - enum: - - dialog - - text - - media - - number - - json - type: string TrainingConfig: additionalProperties: false properties: @@ -2510,7 +3029,7 @@ info: description: "This is the specification of the llama stack that provides\n \ \ a set of endpoints and their corresponding interfaces that are tailored\ \ to\n best leverage Llama Models. The specification is still in\ - \ draft and subject to change.\n Generated at 2024-10-18 20:48:17.730988" + \ draft and subject to change.\n Generated at 2024-10-24 17:40:59.576117" title: '[DRAFT] Llama Stack Specification' version: 0.0.1 jsonSchemaDialect: https://json-schema.org/draft/2020-12/schema @@ -2651,6 +3170,11 @@ paths: required: true schema: type: string + - in: query + name: session_id + required: true + schema: + type: string - in: query name: turn_id required: true @@ -2710,6 +3234,11 @@ paths: required: true schema: type: string + - in: query + name: session_id + required: true + schema: + type: string - in: query name: turn_id required: true @@ -2781,9 +3310,29 @@ paths: description: OK tags: - BatchInference - /datasets/create: - post: + /datasetio/get_rows_paginated: + get: parameters: + - in: query + name: dataset_id + required: true + schema: + type: string + - in: query + name: rows_in_page + required: true + schema: + type: integer + - in: query + name: page_token + required: false + schema: + type: string + - in: query + name: filter_condition + required: false + schema: + type: string - description: JSON-encoded provider data which will be made available to the adapter 
servicing the API in: header @@ -2791,43 +3340,20 @@ paths: required: false schema: type: string - requestBody: - content: - application/json: - schema: - $ref: '#/components/schemas/CreateDatasetRequest' - required: true responses: '200': + content: + application/json: + schema: + $ref: '#/components/schemas/PaginatedRowsResult' description: OK tags: - - Datasets - /datasets/delete: - post: - parameters: - - description: JSON-encoded provider data which will be made available to the - adapter servicing the API - in: header - name: X-LlamaStack-ProviderData - required: false - schema: - type: string - requestBody: - content: - application/json: - schema: - $ref: '#/components/schemas/DeleteDatasetRequest' - required: true - responses: - '200': - description: OK - tags: - - Datasets + - DatasetIO /datasets/get: get: parameters: - in: query - name: dataset_uuid + name: dataset_identifier required: true schema: type: string @@ -2843,104 +3369,13 @@ paths: content: application/json: schema: - $ref: '#/components/schemas/TrainEvalDataset' + oneOf: + - $ref: '#/components/schemas/DatasetDefWithProvider' + - type: 'null' description: OK tags: - Datasets - /evaluate/job/artifacts: - get: - parameters: - - in: query - name: job_uuid - required: true - schema: - type: string - - description: JSON-encoded provider data which will be made available to the - adapter servicing the API - in: header - name: X-LlamaStack-ProviderData - required: false - schema: - type: string - responses: - '200': - content: - application/json: - schema: - $ref: '#/components/schemas/EvaluationJobArtifactsResponse' - description: OK - tags: - - Evaluations - /evaluate/job/cancel: - post: - parameters: - - description: JSON-encoded provider data which will be made available to the - adapter servicing the API - in: header - name: X-LlamaStack-ProviderData - required: false - schema: - type: string - requestBody: - content: - application/json: - schema: - $ref: 
'#/components/schemas/CancelEvaluationJobRequest' - required: true - responses: - '200': - description: OK - tags: - - Evaluations - /evaluate/job/logs: - get: - parameters: - - in: query - name: job_uuid - required: true - schema: - type: string - - description: JSON-encoded provider data which will be made available to the - adapter servicing the API - in: header - name: X-LlamaStack-ProviderData - required: false - schema: - type: string - responses: - '200': - content: - application/json: - schema: - $ref: '#/components/schemas/EvaluationJobLogStream' - description: OK - tags: - - Evaluations - /evaluate/job/status: - get: - parameters: - - in: query - name: job_uuid - required: true - schema: - type: string - - description: JSON-encoded provider data which will be made available to the - adapter servicing the API - in: header - name: X-LlamaStack-ProviderData - required: false - schema: - type: string - responses: - '200': - content: - application/json: - schema: - $ref: '#/components/schemas/EvaluationJobStatusResponse' - description: OK - tags: - - Evaluations - /evaluate/jobs: + /datasets/list: get: parameters: - description: JSON-encoded provider data which will be made available to the @@ -2955,11 +3390,11 @@ paths: content: application/jsonl: schema: - $ref: '#/components/schemas/EvaluationJob' + $ref: '#/components/schemas/DatasetDefWithProvider' description: OK tags: - - Evaluations - /evaluate/question_answering/: + - Datasets + /datasets/register: post: parameters: - description: JSON-encoded provider data which will be made available to the @@ -2973,18 +3408,14 @@ paths: content: application/json: schema: - $ref: '#/components/schemas/EvaluateQuestionAnsweringRequest' + $ref: '#/components/schemas/RegisterDatasetRequest' required: true responses: '200': - content: - application/json: - schema: - $ref: '#/components/schemas/EvaluationJob' description: OK tags: - - Evaluations - /evaluate/summarization/: + - Datasets + /eval/evaluate: post: 
parameters: - description: JSON-encoded provider data which will be made available to the @@ -2998,18 +3429,18 @@ paths: content: application/json: schema: - $ref: '#/components/schemas/EvaluateSummarizationRequest' + $ref: '#/components/schemas/EvaluateRequest' required: true responses: '200': content: application/json: schema: - $ref: '#/components/schemas/EvaluationJob' + $ref: '#/components/schemas/EvaluateResponse' description: OK tags: - - Evaluations - /evaluate/text_generation/: + - Eval + /eval/evaluate_batch: post: parameters: - description: JSON-encoded provider data which will be made available to the @@ -3023,17 +3454,88 @@ paths: content: application/json: schema: - $ref: '#/components/schemas/EvaluateTextGenerationRequest' + $ref: '#/components/schemas/EvaluateBatchRequest' required: true responses: '200': content: application/json: schema: - $ref: '#/components/schemas/EvaluationJob' + $ref: '#/components/schemas/Job' description: OK tags: - - Evaluations + - Eval + /eval/job/cancel: + post: + parameters: + - description: JSON-encoded provider data which will be made available to the + adapter servicing the API + in: header + name: X-LlamaStack-ProviderData + required: false + schema: + type: string + requestBody: + content: + application/json: + schema: + $ref: '#/components/schemas/JobCancelRequest' + required: true + responses: + '200': + description: OK + tags: + - Eval + /eval/job/result: + get: + parameters: + - in: query + name: job_id + required: true + schema: + type: string + - description: JSON-encoded provider data which will be made available to the + adapter servicing the API + in: header + name: X-LlamaStack-ProviderData + required: false + schema: + type: string + responses: + '200': + content: + application/json: + schema: + $ref: '#/components/schemas/EvaluateResponse' + description: OK + tags: + - Eval + /eval/job/status: + get: + parameters: + - in: query + name: job_id + required: true + schema: + type: string + - description: 
JSON-encoded provider data which will be made available to the + adapter servicing the API + in: header + name: X-LlamaStack-ProviderData + required: false + schema: + type: string + responses: + '200': + content: + application/json: + schema: + oneOf: + - $ref: '#/components/schemas/JobStatus' + - type: 'null' + description: OK + tags: + - Eval /health: get: parameters: @@ -3501,31 +4003,6 @@ paths: description: OK tags: - Inspect - /reward_scoring/score: - post: - parameters: - - description: JSON-encoded provider data which will be made available to the - adapter servicing the API - in: header - name: X-LlamaStack-ProviderData - required: false - schema: - type: string - requestBody: - content: - application/json: - schema: - $ref: '#/components/schemas/RewardScoreRequest' - required: true - responses: - '200': - content: - application/json: - schema: - $ref: '#/components/schemas/RewardScoringResponse' - description: OK - tags: - - RewardScoring /routes/list: get: parameters: @@ -3574,6 +4051,122 @@ paths: description: OK tags: - Safety + /scoring/score: + post: + parameters: + - description: JSON-encoded provider data which will be made available to the + adapter servicing the API + in: header + name: X-LlamaStack-ProviderData + required: false + schema: + type: string + requestBody: + content: + application/json: + schema: + $ref: '#/components/schemas/ScoreRequest' + required: true + responses: + '200': + content: + application/json: + schema: + $ref: '#/components/schemas/ScoreResponse' + description: OK + tags: + - Scoring + /scoring/score_batch: + post: + parameters: + - description: JSON-encoded provider data which will be made available to the + adapter servicing the API + in: header + name: X-LlamaStack-ProviderData + required: false + schema: + type: string + requestBody: + content: + application/json: + schema: + $ref: '#/components/schemas/ScoreBatchRequest' + required: true + responses: + '200': + content: + application/json: + schema: + $ref: 
'#/components/schemas/ScoreBatchResponse' + description: OK + tags: + - Scoring + /scoring_functions/get: + get: + parameters: + - in: query + name: name + required: true + schema: + type: string + - description: JSON-encoded provider data which will be made available to the + adapter servicing the API + in: header + name: X-LlamaStack-ProviderData + required: false + schema: + type: string + responses: + '200': + content: + application/json: + schema: + oneOf: + - $ref: '#/components/schemas/ScoringFunctionDefWithProvider' + - type: 'null' + description: OK + tags: + - ScoringFunctions + /scoring_functions/list: + get: + parameters: + - description: JSON-encoded provider data which will be made available to the + adapter servicing the API + in: header + name: X-LlamaStack-ProviderData + required: false + schema: + type: string + responses: + '200': + content: + application/jsonl: + schema: + $ref: '#/components/schemas/ScoringFunctionDefWithProvider' + description: OK + tags: + - ScoringFunctions + /scoring_functions/register: + post: + parameters: + - description: JSON-encoded provider data which will be made available to the + adapter servicing the API + in: header + name: X-LlamaStack-ProviderData + required: false + schema: + type: string + requestBody: + content: + application/json: + schema: + $ref: '#/components/schemas/RegisterScoringFunctionRequest' + required: true + responses: + '200': + description: OK + tags: + - ScoringFunctions /shields/get: get: parameters: @@ -3715,21 +4308,23 @@ security: servers: - url: http://any-hosted-llama-stack.com tags: -- name: Models -- name: RewardScoring -- name: MemoryBanks -- name: Shields +- name: Eval +- name: ScoringFunctions - name: SyntheticDataGeneration -- name: Inference - name: Inspect -- name: BatchInference -- name: Memory -- name: Datasets -- name: Agents - name: PostTraining -- name: Telemetry +- name: Models - name: Safety -- name: Evaluations +- name: MemoryBanks +- name: DatasetIO +- name: Memory +- 
name: Scoring +- name: Shields +- name: Datasets +- name: Inference +- name: Telemetry +- name: BatchInference +- name: Agents - description: name: BuiltinTool - description: name: BatchCompletionResponse -- description: - name: CancelEvaluationJobRequest - description: name: CancelTrainingJobRequest @@ -3922,43 +4514,35 @@ tags: name: Turn - description: name: ViolationLevel -- description: 'Dataset to be used for training or evaluating language models. - - - ' - name: TrainEvalDataset -- description: - name: TrainEvalDatasetColumnType -- description: - name: CreateDatasetRequest - description: name: DeleteAgentsRequest - description: name: DeleteAgentsSessionRequest -- description: - name: DeleteDatasetRequest - description: name: EmbeddingsRequest - description: name: EmbeddingsResponse -- description: + name: AgentCandidate +- description: + name: ModelCandidate +- description: - name: EvaluateQuestionAnsweringRequest -- description: - name: EvaluationJob -- description: - name: EvaluateSummarizationRequest -- description: + name: ScoringResult +- description: - name: EvaluateTextGenerationRequest + name: EvaluateBatchRequest +- description: + name: Job - description: name: GetAgentsSessionRequest @@ -3982,21 +4566,20 @@ tags: - description: name: AgentStepResponse -- description: 'Artifacts of a evaluation job. 
- - - ' - name: EvaluationJobArtifactsResponse -- description: - name: EvaluationJobLogStream -- description: - name: EvaluationJobStatusResponse + name: DatasetDefWithProvider - description: name: ModelDefWithProvider +- description: + name: PaginatedRowsResult +- description: + name: Parameter +- description: + name: ScoringFunctionDefWithProvider - description: name: ShieldDefWithProvider @@ -4038,6 +4621,11 @@ tags: - description: name: InsertDocumentsRequest +- description: + name: JobCancelRequest +- description: + name: JobStatus - description: name: ProviderInfo - description: @@ -4081,38 +4669,37 @@ tags: - description: name: QueryDocumentsResponse +- description: + name: RegisterDatasetRequest - description: name: RegisterMemoryBankRequest - description: name: RegisterModelRequest +- description: + name: RegisterScoringFunctionRequest - description: name: RegisterShieldRequest -- description: - name: DialogGenerations -- description: - name: RewardScoreRequest -- description: 'Response from the reward scoring. Batch of (prompt, response, score) - tuples that pass the threshold. 
- - - ' - name: RewardScoringResponse -- description: - name: ScoredDialogGenerations -- description: - name: ScoredMessage - description: name: RunShieldRequest - description: name: RunShieldResponse +- description: + name: ScoreRequest +- description: + name: ScoreResponse +- description: + name: ScoreBatchRequest +- description: + name: ScoreBatchResponse - description: name: DoraFinetuningConfig @@ -4143,21 +4730,24 @@ x-tagGroups: tags: - Agents - BatchInference + - DatasetIO - Datasets - - Evaluations + - Eval - Inference - Inspect - Memory - MemoryBanks - Models - PostTraining - - RewardScoring - Safety + - Scoring + - ScoringFunctions - Shields - SyntheticDataGeneration - Telemetry - name: Types tags: + - AgentCandidate - AgentConfig - AgentCreateResponse - AgentSessionCreateResponse @@ -4175,7 +4765,6 @@ x-tagGroups: - BatchCompletionRequest - BatchCompletionResponse - BuiltinTool - - CancelEvaluationJobRequest - CancelTrainingJobRequest - ChatCompletionRequest - ChatCompletionResponse @@ -4191,22 +4780,16 @@ x-tagGroups: - CreateAgentRequest - CreateAgentSessionRequest - CreateAgentTurnRequest - - CreateDatasetRequest - DPOAlignmentConfig + - DatasetDefWithProvider - DeleteAgentsRequest - DeleteAgentsSessionRequest - - DeleteDatasetRequest - - DialogGenerations - DoraFinetuningConfig - EmbeddingsRequest - EmbeddingsResponse - - EvaluateQuestionAnsweringRequest - - EvaluateSummarizationRequest - - EvaluateTextGenerationRequest - - EvaluationJob - - EvaluationJobArtifactsResponse - - EvaluationJobLogStream - - EvaluationJobStatusResponse + - EvaluateBatchRequest + - EvaluateRequest + - EvaluateResponse - FinetuningAlgorithm - FunctionCallToolDefinition - GetAgentsSessionRequest @@ -4215,6 +4798,9 @@ x-tagGroups: - ImageMedia - InferenceStep - InsertDocumentsRequest + - Job + - JobCancelRequest + - JobStatus - KeyValueMemoryBankDef - KeywordMemoryBankDef - LogEventRequest @@ -4224,8 +4810,11 @@ x-tagGroups: - MemoryRetrievalStep - MemoryToolDefinition - 
MetricEvent + - ModelCandidate - ModelDefWithProvider - OptimizerConfig + - PaginatedRowsResult + - Parameter - PhotogenToolDefinition - PostTrainingJob - PostTrainingJobArtifactsResponse @@ -4238,21 +4827,25 @@ x-tagGroups: - QueryDocumentsRequest - QueryDocumentsResponse - RLHFAlgorithm + - RegisterDatasetRequest - RegisterMemoryBankRequest - RegisterModelRequest + - RegisterScoringFunctionRequest - RegisterShieldRequest - RestAPIExecutionConfig - RestAPIMethod - - RewardScoreRequest - - RewardScoringResponse - RouteInfo - RunShieldRequest - RunShieldResponse - SafetyViolation - SamplingParams - SamplingStrategy - - ScoredDialogGenerations - - ScoredMessage + - ScoreBatchRequest + - ScoreBatchResponse + - ScoreRequest + - ScoreResponse + - ScoringFunctionDefWithProvider + - ScoringResult - SearchToolDefinition - Session - ShieldCallStep @@ -4278,8 +4871,6 @@ x-tagGroups: - ToolResponse - ToolResponseMessage - Trace - - TrainEvalDataset - - TrainEvalDatasetColumnType - TrainingConfig - Turn - URL diff --git a/llama_stack/apis/common/job_types.py b/llama_stack/apis/common/job_types.py index ab203ebb8..ab8ab22dc 100644 --- a/llama_stack/apis/common/job_types.py +++ b/llama_stack/apis/common/job_types.py @@ -3,6 +3,8 @@ # # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. 
+from enum import Enum + from llama_models.schema_utils import json_schema_type from pydantic import BaseModel @@ -10,3 +12,9 @@ from pydantic import BaseModel @json_schema_type class Job(BaseModel): job_id: str + + +@json_schema_type +class JobStatus(Enum): + completed = "completed" + in_progress = "in_progress" diff --git a/llama_stack/apis/common/type_system.py b/llama_stack/apis/common/type_system.py index 35a26e9ef..93a3c0339 100644 --- a/llama_stack/apis/common/type_system.py +++ b/llama_stack/apis/common/type_system.py @@ -4,7 +4,7 @@ # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. -from typing import Dict, List, Literal, Union +from typing import Literal, Union from pydantic import BaseModel, Field from typing_extensions import Annotated @@ -24,12 +24,10 @@ class BooleanType(BaseModel): class ArrayType(BaseModel): type: Literal["array"] = "array" - items: "ParamType" class ObjectType(BaseModel): type: Literal["object"] = "object" - properties: Dict[str, "ParamType"] = Field(default_factory=dict) class JsonType(BaseModel): @@ -38,12 +36,21 @@ class JsonType(BaseModel): class UnionType(BaseModel): type: Literal["union"] = "union" - options: List["ParamType"] = Field(default_factory=list) -class CustomType(BaseModel): - type: Literal["custom"] = "custom" - validator_class: str +class ChatCompletionInputType(BaseModel): + # expects List[Message] for messages + type: Literal["chat_completion_input"] = "chat_completion_input" + + +class CompletionInputType(BaseModel): + # expects InterleavedTextMedia for content + type: Literal["completion_input"] = "completion_input" + + +class AgentTurnInputType(BaseModel): + # expects List[Message] for messages (may also include attachments?) 
+ type: Literal["agent_turn_input"] = "agent_turn_input" ParamType = Annotated[ @@ -55,11 +62,22 @@ ParamType = Annotated[ ObjectType, JsonType, UnionType, - CustomType, + ChatCompletionInputType, + CompletionInputType, + AgentTurnInputType, ], Field(discriminator="type"), ] -ArrayType.model_rebuild() -ObjectType.model_rebuild() -UnionType.model_rebuild() +# TODO: recursive definition of ParamType in these containers +# will cause infinite recursion in OpenAPI generation script +# since we are going with ChatCompletionInputType and CompletionInputType +# we don't need to worry about ArrayType/ObjectType/UnionType for now +# ArrayType.model_rebuild() +# ObjectType.model_rebuild() +# UnionType.model_rebuild() + + +# class CustomType(BaseModel): +# type: Literal["custom"] = "custom" +# validator_class: str diff --git a/llama_stack/apis/eval/eval.py b/llama_stack/apis/eval/eval.py index a97af1fc0..51f49da15 100644 --- a/llama_stack/apis/eval/eval.py +++ b/llama_stack/apis/eval/eval.py @@ -12,7 +12,7 @@ from llama_models.llama3.api.datatypes import * # noqa: F403 from llama_models.schema_utils import json_schema_type, webmethod from llama_stack.apis.scoring_functions import * # noqa: F403 from llama_stack.apis.agents import AgentConfig -from llama_stack.apis.common.job_types import Job +from llama_stack.apis.common.job_types import Job, JobStatus from llama_stack.apis.scoring import * # noqa: F403 @@ -40,7 +40,7 @@ class EvaluateResponse(BaseModel): generations: List[Dict[str, Any]] # each key in the dict is a scoring function name - scores: List[Dict[str, ScoringResult]] + scores: Dict[str, ScoringResult] class Eval(Protocol): @@ -61,10 +61,10 @@ class Eval(Protocol): ) -> EvaluateResponse: ... @webmethod(route="/eval/job/status", method="GET") - async def job_status(self, job_id: str) -> None: ... + async def job_status(self, job_id: str) -> Optional[JobStatus]: ... 
@webmethod(route="/eval/job/cancel", method="POST") async def job_cancel(self, job_id: str) -> None: ... @webmethod(route="/eval/job/result", method="GET") - async def job_result(self, job_id: str) -> None: ... + async def job_result(self, job_id: str) -> EvaluateResponse: ... diff --git a/llama_stack/apis/post_training/post_training.py b/llama_stack/apis/post_training/post_training.py index d943f48b2..eb4992cc6 100644 --- a/llama_stack/apis/post_training/post_training.py +++ b/llama_stack/apis/post_training/post_training.py @@ -14,7 +14,7 @@ from llama_models.schema_utils import json_schema_type, webmethod from pydantic import BaseModel, Field from llama_models.llama3.api.datatypes import * # noqa: F403 -from llama_stack.apis.dataset import * # noqa: F403 +from llama_stack.apis.datasets import * # noqa: F403 from llama_stack.apis.common.training_types import * # noqa: F403 @@ -107,8 +107,8 @@ class PostTrainingSFTRequest(BaseModel): job_uuid: str model: str - dataset: TrainEvalDataset - validation_dataset: TrainEvalDataset + dataset_id: str + validation_dataset_id: str algorithm: FinetuningAlgorithm algorithm_config: Union[ @@ -131,8 +131,8 @@ class PostTrainingRLHFRequest(BaseModel): finetuned_model: URL - dataset: TrainEvalDataset - validation_dataset: TrainEvalDataset + dataset_id: str + validation_dataset_id: str algorithm: RLHFAlgorithm algorithm_config: Union[DPOAlignmentConfig] @@ -181,8 +181,8 @@ class PostTraining(Protocol): self, job_uuid: str, model: str, - dataset: TrainEvalDataset, - validation_dataset: TrainEvalDataset, + dataset_id: str, + validation_dataset_id: str, algorithm: FinetuningAlgorithm, algorithm_config: Union[ LoraFinetuningConfig, QLoraFinetuningConfig, DoraFinetuningConfig @@ -198,8 +198,8 @@ class PostTraining(Protocol): self, job_uuid: str, finetuned_model: URL, - dataset: TrainEvalDataset, - validation_dataset: TrainEvalDataset, + dataset_id: str, + validation_dataset_id: str, algorithm: RLHFAlgorithm, algorithm_config: 
Union[DPOAlignmentConfig], optimizer_config: OptimizerConfig, diff --git a/llama_stack/apis/scoring/scoring.py b/llama_stack/apis/scoring/scoring.py index adac34d55..1fd523dcb 100644 --- a/llama_stack/apis/scoring/scoring.py +++ b/llama_stack/apis/scoring/scoring.py @@ -37,7 +37,7 @@ class ScoreResponse(BaseModel): class ScoringFunctionStore(Protocol): - def get_scoring_function(self, name: str) -> ScoringFunctionDefWithProvider: ... + def get_scoring_function(self, name: str) -> ScoringFnDefWithProvider: ... @runtime_checkable diff --git a/llama_stack/apis/scoring_functions/scoring_functions.py b/llama_stack/apis/scoring_functions/scoring_functions.py index a242215c6..fc3584f90 100644 --- a/llama_stack/apis/scoring_functions/scoring_functions.py +++ b/llama_stack/apis/scoring_functions/scoring_functions.py @@ -29,7 +29,7 @@ class LLMAsJudgeContext(BaseModel): @json_schema_type -class ScoringFunctionDef(BaseModel): +class ScoringFnDef(BaseModel): identifier: str description: Optional[str] = None metadata: Dict[str, Any] = Field( @@ -48,7 +48,7 @@ class ScoringFunctionDef(BaseModel): @json_schema_type -class ScoringFunctionDefWithProvider(ScoringFunctionDef): +class ScoringFnDefWithProvider(ScoringFnDef): provider_id: str = Field( description="ID of the provider which serves this dataset", ) @@ -57,14 +57,14 @@ class ScoringFunctionDefWithProvider(ScoringFunctionDef): @runtime_checkable class ScoringFunctions(Protocol): @webmethod(route="/scoring_functions/list", method="GET") - async def list_scoring_functions(self) -> List[ScoringFunctionDefWithProvider]: ... + async def list_scoring_functions(self) -> List[ScoringFnDefWithProvider]: ... @webmethod(route="/scoring_functions/get", method="GET") async def get_scoring_function( self, name: str - ) -> Optional[ScoringFunctionDefWithProvider]: ... + ) -> Optional[ScoringFnDefWithProvider]: ... 
@webmethod(route="/scoring_functions/register", method="POST") async def register_scoring_function( - self, function_def: ScoringFunctionDefWithProvider + self, function_def: ScoringFnDefWithProvider ) -> None: ... diff --git a/llama_stack/apis/synthetic_data_generation/synthetic_data_generation.py b/llama_stack/apis/synthetic_data_generation/synthetic_data_generation.py index 60c756128..05b49036d 100644 --- a/llama_stack/apis/synthetic_data_generation/synthetic_data_generation.py +++ b/llama_stack/apis/synthetic_data_generation/synthetic_data_generation.py @@ -13,7 +13,6 @@ from llama_models.schema_utils import json_schema_type, webmethod from pydantic import BaseModel from llama_models.llama3.api.datatypes import * # noqa: F403 -from llama_stack.apis.reward_scoring import * # noqa: F403 class FilteringFunction(Enum): @@ -40,7 +39,7 @@ class SyntheticDataGenerationRequest(BaseModel): class SyntheticDataGenerationResponse(BaseModel): """Response from the synthetic data generation. Batch of (prompt, response, score) tuples that pass the threshold.""" - synthetic_data: List[ScoredDialogGenerations] + synthetic_data: List[Dict[str, Any]] statistics: Optional[Dict[str, Any]] = None diff --git a/llama_stack/distribution/datatypes.py b/llama_stack/distribution/datatypes.py index 318809baf..9ad82cd79 100644 --- a/llama_stack/distribution/datatypes.py +++ b/llama_stack/distribution/datatypes.py @@ -34,7 +34,7 @@ RoutableObject = Union[ ShieldDef, MemoryBankDef, DatasetDef, - ScoringFunctionDef, + ScoringFnDef, ] RoutableObjectWithProvider = Union[ @@ -42,7 +42,7 @@ RoutableObjectWithProvider = Union[ ShieldDefWithProvider, MemoryBankDefWithProvider, DatasetDefWithProvider, - ScoringFunctionDefWithProvider, + ScoringFnDefWithProvider, ] RoutedProtocol = Union[ diff --git a/llama_stack/distribution/resolver.py b/llama_stack/distribution/resolver.py index b9b9fb229..cfe31a21d 100644 --- a/llama_stack/distribution/resolver.py +++ b/llama_stack/distribution/resolver.py @@ -14,6 
+14,7 @@ from llama_stack.distribution.datatypes import * # noqa: F403 from llama_stack.apis.agents import Agents from llama_stack.apis.datasetio import DatasetIO from llama_stack.apis.datasets import Datasets +from llama_stack.apis.eval import Eval from llama_stack.apis.inference import Inference from llama_stack.apis.inspect import Inspect from llama_stack.apis.memory import Memory @@ -46,6 +47,7 @@ def api_protocol_map() -> Dict[Api, Any]: Api.datasetio: DatasetIO, Api.scoring_functions: ScoringFunctions, Api.scoring: Scoring, + Api.eval: Eval, } diff --git a/llama_stack/distribution/routers/routing_tables.py b/llama_stack/distribution/routers/routing_tables.py index dcd588a9e..3e07b9162 100644 --- a/llama_stack/distribution/routers/routing_tables.py +++ b/llama_stack/distribution/routers/routing_tables.py @@ -100,7 +100,7 @@ class CommonRoutingTableImpl(RoutingTable): scoring_functions = await p.list_scoring_functions() add_objects( [ - ScoringFunctionDefWithProvider(**s.dict(), provider_id=pid) + ScoringFnDefWithProvider(**s.dict(), provider_id=pid) for s in scoring_functions ] ) @@ -239,7 +239,7 @@ class DatasetsRoutingTable(CommonRoutingTableImpl, Datasets): class ScoringFunctionsRoutingTable(CommonRoutingTableImpl, Scoring): - async def list_scoring_functions(self) -> List[ScoringFunctionDefWithProvider]: + async def list_scoring_functions(self) -> List[ScoringFnDefWithProvider]: objects = [] for objs in self.registry.values(): objects.extend(objs) @@ -247,10 +247,10 @@ class ScoringFunctionsRoutingTable(CommonRoutingTableImpl, Scoring): async def get_scoring_function( self, name: str - ) -> Optional[ScoringFunctionDefWithProvider]: + ) -> Optional[ScoringFnDefWithProvider]: return self.get_object_by_identifier(name) async def register_scoring_function( - self, function_def: ScoringFunctionDefWithProvider + self, function_def: ScoringFnDefWithProvider ) -> None: await self.register_object(function_def) diff --git a/llama_stack/providers/datatypes.py 
b/llama_stack/providers/datatypes.py index 903ff5438..eace0ea1a 100644 --- a/llama_stack/providers/datatypes.py +++ b/llama_stack/providers/datatypes.py @@ -13,7 +13,7 @@ from pydantic import BaseModel, Field from llama_stack.apis.datasets import DatasetDef from llama_stack.apis.memory_banks import MemoryBankDef from llama_stack.apis.models import ModelDef -from llama_stack.apis.scoring_functions import ScoringFunctionDef +from llama_stack.apis.scoring_functions import ScoringFnDef from llama_stack.apis.shields import ShieldDef @@ -25,6 +25,7 @@ class Api(Enum): memory = "memory" datasetio = "datasetio" scoring = "scoring" + eval = "eval" telemetry = "telemetry" @@ -63,11 +64,9 @@ class DatasetsProtocolPrivate(Protocol): class ScoringFunctionsProtocolPrivate(Protocol): - async def list_scoring_functions(self) -> List[ScoringFunctionDef]: ... + async def list_scoring_functions(self) -> List[ScoringFnDef]: ... - async def register_scoring_function( - self, function_def: ScoringFunctionDef - ) -> None: ... + async def register_scoring_function(self, function_def: ScoringFnDef) -> None: ... 
@json_schema_type diff --git a/llama_stack/providers/impls/meta_reference/datasetio/datasetio.py b/llama_stack/providers/impls/meta_reference/datasetio/datasetio.py index 43664f394..a96d9bcab 100644 --- a/llama_stack/providers/impls/meta_reference/datasetio/datasetio.py +++ b/llama_stack/providers/impls/meta_reference/datasetio/datasetio.py @@ -143,11 +143,12 @@ class MetaReferenceDatasetIOImpl(DatasetIO, DatasetsProtocolPrivate): else: next_page_token = int(page_token) - if rows_in_page == -1: - rows = dataset_info.dataset_impl[next_page_token:] - start = next_page_token - end = min(start + rows_in_page, len(dataset_info.dataset_impl)) + if rows_in_page == -1: + end = len(dataset_info.dataset_impl) + else: + end = min(start + rows_in_page, len(dataset_info.dataset_impl)) + rows = dataset_info.dataset_impl[start:end] return PaginatedRowsResult( diff --git a/llama_stack/providers/impls/meta_reference/eval/__init__.py b/llama_stack/providers/impls/meta_reference/eval/__init__.py new file mode 100644 index 000000000..fb285c668 --- /dev/null +++ b/llama_stack/providers/impls/meta_reference/eval/__init__.py @@ -0,0 +1,27 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. 
# eval/__init__.py
async def get_provider_impl(
    config: MetaReferenceEvalConfig,
    deps: Dict[Api, ProviderSpec],
):
    """Create the meta-reference eval provider and initialize it.

    Args:
        config: Configuration object passed through to the implementation.
        deps: Resolved dependencies; the datasetio, datasets, scoring and
            inference entries are looked up (a missing one raises KeyError).

    Returns:
        The initialized ``MetaReferenceEvalImpl`` instance.
    """
    # Kept as a function-scope import, exactly as in the original.
    from .eval import MetaReferenceEvalImpl

    provider = MetaReferenceEvalImpl(
        config=config,
        datasetio_api=deps[Api.datasetio],
        datasets_api=deps[Api.datasets],
        scoring_api=deps[Api.scoring],
        inference_api=deps[Api.inference],
    )
    await provider.initialize()
    return provider


# eval/config.py
# Configuration model for the meta-reference eval provider; declares no fields.
class MetaReferenceEvalConfig(BaseModel): ...
class ColumnName(Enum):
    """Names of the dataset columns the eval provider reads or produces."""

    expected_answer = "expected_answer"
    chat_completion_input = "chat_completion_input"
    completion_input = "completion_input"
    generated_answer = "generated_answer"


class MetaReferenceEvalImpl(Eval):
    """Eval provider that generates answers and scores them synchronously.

    Each input row is sent to the inference API, the generated answer is
    merged into the row, and the merged rows are passed to the scoring API.
    Finished results are kept in an in-memory dict keyed by job id.
    """

    def __init__(
        self,
        config: MetaReferenceEvalConfig,
        datasetio_api: DatasetIO,
        datasets_api: Datasets,
        scoring_api: Scoring,
        inference_api: Inference,
    ) -> None:
        self.config = config
        self.datasetio_api = datasetio_api
        self.datasets_api = datasets_api
        self.scoring_api = scoring_api
        self.inference_api = inference_api

        # TODO: assume sync job, will need jobs API for async scheduling
        self.jobs = {}

    async def initialize(self) -> None: ...

    async def shutdown(self) -> None: ...

    async def validate_eval_input_dataset_schema(self, dataset_id: str) -> None:
        """Check that the dataset schema is one of the supported input schemas.

        Raises:
            ValueError: if the dataset has no schema, or its schema is not
                exactly one of the two accepted layouts below.
        """
        dataset_def = await self.datasets_api.get_dataset(dataset_identifier=dataset_id)
        if not dataset_def.dataset_schema:
            raise ValueError(f"Dataset {dataset_id} does not have a schema defined.")

        expected_schemas = [
            {
                ColumnName.expected_answer.value: StringType(),
                ColumnName.chat_completion_input.value: ChatCompletionInputType(),
            },
            {
                ColumnName.expected_answer.value: StringType(),
                ColumnName.completion_input.value: CompletionInputType(),
            },
        ]

        if dataset_def.dataset_schema not in expected_schemas:
            raise ValueError(
                f"Dataset {dataset_id} does not have a correct input schema in {expected_schemas}"
            )

    async def evaluate_batch(
        self,
        dataset_id: str,
        candidate: EvalCandidate,
        scoring_functions: List[str],
    ) -> Job:
        """Evaluate every row of a registered dataset and store the result.

        Returns:
            A ``Job`` whose id can be passed to ``job_status`` / ``job_result``.
        """
        await self.validate_eval_input_dataset_schema(dataset_id=dataset_id)
        # rows_in_page=-1 requests all rows in a single page.
        all_rows = await self.datasetio_api.get_rows_paginated(
            dataset_id=dataset_id,
            rows_in_page=-1,
        )
        res = await self.evaluate(
            input_rows=all_rows.rows,
            candidate=candidate,
            scoring_functions=scoring_functions,
        )

        # TODO: currently needs to wait for generation before returning
        # need job scheduler queue (ray/celery) w/ jobs api
        job_id = str(len(self.jobs))
        self.jobs[job_id] = res
        return Job(job_id=job_id)

    async def evaluate(
        self,
        input_rows: List[Dict[str, Any]],
        candidate: EvalCandidate,
        scoring_functions: List[str],
    ) -> EvaluateResponse:
        """Generate an answer for each row, then score the rows.

        A row must contain either a ``completion_input`` or a
        ``chat_completion_input`` column whose text is a Python literal
        (e.g. ``"[{'role': 'user', 'content': '...'}]"``).

        Raises:
            NotImplementedError: for agent candidates.
            ValueError: if a row has neither input column.
        """
        # Function-scope import so this block does not depend on the
        # module's import section.
        import ast

        if candidate.type == "agent":
            raise NotImplementedError(
                "Evaluation with generation has not been implemented for agents"
            )
        assert (
            candidate.sampling_params.max_tokens is not None
        ), "SamplingParams.max_tokens must be provided"

        generations = []
        for row in input_rows:
            if ColumnName.completion_input.value in row:
                # Dataset cells are external data: parse them as literals
                # only, never execute them with eval().
                input_content = ast.literal_eval(
                    str(row[ColumnName.completion_input.value])
                )
                response = await self.inference_api.completion(
                    model=candidate.model,
                    content=input_content,
                    sampling_params=candidate.sampling_params,
                )
            elif ColumnName.chat_completion_input.value in row:
                raw_messages = ast.literal_eval(
                    str(row[ColumnName.chat_completion_input.value])
                )
                messages = []
                if candidate.system_message:
                    messages.append(candidate.system_message)
                messages += [UserMessage(**message) for message in raw_messages]
                response = await self.inference_api.chat_completion(
                    model=candidate.model,
                    messages=messages,
                    sampling_params=candidate.sampling_params,
                )
            else:
                raise ValueError("Invalid input row")

            generations.append(
                {ColumnName.generated_answer.value: response.completion_message.content}
            )

        # scoring with generated_answer
        score_input_rows = [
            input_r | generated_r
            for input_r, generated_r in zip(input_rows, generations)
        ]

        score_response = await self.scoring_api.score(
            input_rows=score_input_rows, scoring_functions=scoring_functions
        )

        return EvaluateResponse(generations=generations, scores=score_response.results)

    async def job_status(self, job_id: str) -> Optional[JobStatus]:
        """Return ``JobStatus.completed`` for a known job id, else ``None``."""
        if job_id in self.jobs:
            return JobStatus.completed

        return None

    async def job_cancel(self, job_id: str) -> None:
        raise NotImplementedError("Job cancel is not implemented yet")

    async def job_result(self, job_id: str) -> EvaluateResponse:
        """Return the stored result of a completed job.

        Raises:
            ValueError: if the job id is unknown or the job is not completed.
        """
        status = await self.job_status(job_id)
        # An unknown job has status None; report it explicitly instead of
        # failing with AttributeError on ``status.value``.
        if status is None:
            raise ValueError(f"Job {job_id} not found")
        if status != JobStatus.completed:
            raise ValueError(f"Job is not completed, Status: {status.value}")

        return self.jobs[job_id]
+13,22 @@ from llama_stack.apis.datasetio import * # noqa: F403 from llama_stack.apis.datasets import * # noqa: F403 from llama_stack.providers.datatypes import ScoringFunctionsProtocolPrivate -from llama_stack.providers.impls.meta_reference.scoring.scorer.equality_scorer import ( - EqualityScorer, +from llama_stack.providers.impls.meta_reference.scoring.scoring_fn.equality_scoring_fn import ( + EqualityScoringFn, +) + +from llama_stack.providers.impls.meta_reference.scoring.scoring_fn.subset_of_scoring_fn import ( + SubsetOfScoringFn, ) from .config import MetaReferenceScoringConfig -SUPPORTED_SCORERS = [ - EqualityScorer, +SUPPORTED_SCORING_FNS = [ + EqualityScoringFn, + SubsetOfScoringFn, ] -SCORER_REGISTRY = {x.scoring_function_def.identifier: x for x in SUPPORTED_SCORERS} +SCORER_REGISTRY = {x.scoring_function_def.identifier: x for x in SUPPORTED_SCORING_FNS} class MetaReferenceScoringImpl(Scoring, ScoringFunctionsProtocolPrivate): @@ -41,10 +46,10 @@ class MetaReferenceScoringImpl(Scoring, ScoringFunctionsProtocolPrivate): async def shutdown(self) -> None: ... 
- async def list_scoring_functions(self) -> List[ScoringFunctionDef]: - return [x.scoring_function_def for x in SUPPORTED_SCORERS] + async def list_scoring_functions(self) -> List[ScoringFnDef]: + return [x.scoring_function_def for x in SUPPORTED_SCORING_FNS] - async def register_scoring_function(self, function_def: ScoringFunctionDef) -> None: + async def register_scoring_function(self, function_def: ScoringFnDef) -> None: raise NotImplementedError( "Dynamically registering scoring functions is not supported" ) @@ -96,9 +101,9 @@ class MetaReferenceScoringImpl(Scoring, ScoringFunctionsProtocolPrivate): for scoring_fn_id in scoring_functions: if scoring_fn_id not in SCORER_REGISTRY: raise ValueError(f"Scoring function {scoring_fn_id} is not supported.") - scorer = SCORER_REGISTRY[scoring_fn_id]() - score_results = scorer.score(input_rows) - agg_results = scorer.aggregate(score_results) + scoring_fn = SCORER_REGISTRY[scoring_fn_id]() + score_results = scoring_fn.score(input_rows) + agg_results = scoring_fn.aggregate(score_results) res[scoring_fn_id] = ScoringResult( score_rows=score_results, aggregated_results=agg_results, diff --git a/llama_stack/providers/impls/meta_reference/scoring/scorer/__init__.py b/llama_stack/providers/impls/meta_reference/scoring/scoring_fn/__init__.py similarity index 100% rename from llama_stack/providers/impls/meta_reference/scoring/scorer/__init__.py rename to llama_stack/providers/impls/meta_reference/scoring/scoring_fn/__init__.py diff --git a/llama_stack/providers/impls/meta_reference/scoring/scorer/base_scorer.py b/llama_stack/providers/impls/meta_reference/scoring/scoring_fn/base_scoring_fn.py similarity index 81% rename from llama_stack/providers/impls/meta_reference/scoring/scorer/base_scorer.py rename to llama_stack/providers/impls/meta_reference/scoring/scoring_fn/base_scoring_fn.py index ea8a3f063..952d46bb2 100644 --- a/llama_stack/providers/impls/meta_reference/scoring/scorer/base_scorer.py +++ 
b/llama_stack/providers/impls/meta_reference/scoring/scoring_fn/base_scoring_fn.py @@ -9,15 +9,15 @@ from llama_stack.apis.scoring_functions import * # noqa: F401, F403 from llama_stack.apis.scoring import * # noqa: F401, F403 -class BaseScorer(ABC): +class BaseScoringFn(ABC): """ - Base interface class for all meta-reference scorers. - Each scorer needs to implement the following methods: + Base interface class for all meta-reference scoring_fns. + Each scoring_fn needs to implement the following methods: - score_row(self, row) - - aggregate(self, scorer_results) + - aggregate(self, scoring_fn_results) """ - scoring_function_def: ScoringFunctionDef + scoring_function_def: ScoringFnDef def __init__(self, *args, **kwargs) -> None: super().__init__(*args, **kwargs) diff --git a/llama_stack/providers/impls/meta_reference/scoring/scoring_fn/common.py b/llama_stack/providers/impls/meta_reference/scoring/scoring_fn/common.py new file mode 100644 index 000000000..52eabea2e --- /dev/null +++ b/llama_stack/providers/impls/meta_reference/scoring/scoring_fn/common.py @@ -0,0 +1,19 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. 
def aggregate_accuracy(scoring_results: List[ScoringResultRow]) -> Dict[str, Any]:
    """Aggregate per-row scores into accuracy statistics.

    Args:
        scoring_results: Rows that each carry a numeric ``"score"`` entry.

    Returns:
        A dict with ``accuracy`` (sum of scores divided by row count),
        ``num_correct`` (sum of scores) and ``num_total`` (row count).

    Raises:
        ValueError: if ``scoring_results`` is empty, since the accuracy of
            zero rows is undefined (previously a bare ZeroDivisionError).
    """
    if not scoring_results:
        raise ValueError("Empty scoring results provided.")

    num_total = len(scoring_results)
    num_correct = sum(result["score"] for result in scoring_results)

    return {
        "accuracy": num_correct / num_total,
        "num_correct": num_correct,
        "num_total": num_total,
    }
""" - scoring_function_def = ScoringFunctionDef( + scoring_function_def = ScoringFnDef( identifier="equality", description="Returns 1.0 if the input is equal to the target, 0.0 otherwise.", parameters=[], @@ -38,12 +41,4 @@ class EqualityScorer(BaseScorer): } def aggregate(self, scoring_results: List[ScoringResultRow]) -> Dict[str, Any]: - assert len(scoring_results) > 0, "Empty scoring results provided." - num_correct = sum(result["score"] for result in scoring_results) - avg_score = num_correct / len(scoring_results) - - return { - "accuracy": avg_score, - "num_correct": num_correct, - "num_total": len(scoring_results), - } + return aggregate_accuracy(scoring_results) diff --git a/llama_stack/providers/impls/meta_reference/scoring/scoring_fn/subset_of_scoring_fn.py b/llama_stack/providers/impls/meta_reference/scoring/scoring_fn/subset_of_scoring_fn.py new file mode 100644 index 000000000..c7ee68e26 --- /dev/null +++ b/llama_stack/providers/impls/meta_reference/scoring/scoring_fn/subset_of_scoring_fn.py @@ -0,0 +1,44 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. + +from llama_stack.providers.impls.meta_reference.scoring.scoring_fn.base_scoring_fn import ( + BaseScoringFn, +) +from llama_stack.apis.scoring_functions import * # noqa: F401, F403 +from llama_stack.apis.scoring import * # noqa: F401, F403 +from llama_stack.apis.common.type_system import * # noqa: F403 +from llama_stack.providers.impls.meta_reference.scoring.scoring_fn.common import ( + aggregate_accuracy, +) + + +class SubsetOfScoringFn(BaseScoringFn): + """ + A scoring_fn that assigns a score of 1.0 if the expected string is included in the generated string, and 0.0 otherwise. 
+ """ + + scoring_function_def = ScoringFnDef( + identifier="subset_of", + description="Returns 1.0 if the expected is included in generated, 0.0 otherwise.", + parameters=[], + return_type=NumberType(), + ) + + def score_row(self, input_row: Dict[str, Any]) -> ScoringResultRow: + assert "expected_answer" in input_row, "Expected answer not found in input row." + assert ( + "generated_answer" in input_row + ), "Generated answer not found in input row." + + expected_answer = input_row["expected_answer"] + generated_answer = input_row["generated_answer"] + score = 1.0 if expected_answer in generated_answer else 0.0 + return { + "score": score, + } + + def aggregate(self, scoring_results: List[ScoringResultRow]) -> Dict[str, Any]: + return aggregate_accuracy(scoring_results) diff --git a/llama_stack/providers/registry/eval.py b/llama_stack/providers/registry/eval.py new file mode 100644 index 000000000..fc7c923d9 --- /dev/null +++ b/llama_stack/providers/registry/eval.py @@ -0,0 +1,27 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. 
def available_providers() -> List[ProviderSpec]:
    """Return the provider specs registered for the eval API.

    Only the inline ``meta-reference`` provider is listed; it declares
    datasetio, datasets, scoring and inference as API dependencies.
    """
    eval_package = "llama_stack.providers.impls.meta_reference.eval"
    meta_reference_spec = InlineProviderSpec(
        api=Api.eval,
        provider_type="meta-reference",
        pip_packages=[],
        module=eval_package,
        config_class=f"{eval_package}.MetaReferenceEvalConfig",
        api_dependencies=[
            Api.datasetio,
            Api.datasets,
            Api.scoring,
            Api.inference,
        ],
    )
    return [meta_reference_spec]
a/llama_stack/providers/tests/datasetio/test_datasetio.py +++ b/llama_stack/providers/tests/datasetio/test_datasetio.py @@ -61,20 +61,31 @@ def data_url_from_file(file_path: str) -> str: return data_url -async def register_dataset(datasets_impl: Datasets): +async def register_dataset( + datasets_impl: Datasets, for_generation=False, dataset_id="test_dataset" +): test_file = Path(os.path.abspath(__file__)).parent / "test_dataset.csv" test_url = data_url_from_file(str(test_file)) + + if for_generation: + dataset_schema = { + "expected_answer": StringType(), + "chat_completion_input": ChatCompletionInputType(), + } + else: + dataset_schema = { + "expected_answer": StringType(), + "input_query": StringType(), + "generated_answer": StringType(), + } + dataset = DatasetDefWithProvider( - identifier="test_dataset", + identifier=dataset_id, provider_id=os.environ["PROVIDER_ID"], url=URL( uri=test_url, ), - dataset_schema={ - "generated_answer": StringType(), - "expected_answer": StringType(), - "input_query": StringType(), - }, + dataset_schema=dataset_schema, ) await datasets_impl.register_dataset(dataset) diff --git a/llama_stack/providers/tests/eval/__init__.py b/llama_stack/providers/tests/eval/__init__.py new file mode 100644 index 000000000..756f351d8 --- /dev/null +++ b/llama_stack/providers/tests/eval/__init__.py @@ -0,0 +1,5 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. 
diff --git a/llama_stack/providers/tests/eval/provider_config_example.yaml b/llama_stack/providers/tests/eval/provider_config_example.yaml new file mode 100644 index 000000000..1576d2ef0 --- /dev/null +++ b/llama_stack/providers/tests/eval/provider_config_example.yaml @@ -0,0 +1,18 @@ +providers: + datasetio: + - provider_id: test-meta + provider_type: meta-reference + config: {} + scoring: + - provider_id: test-meta + provider_type: meta-reference + config: {} + eval: + - provider_id: test-meta + provider_type: meta-reference + config: {} + inference: + - provider_id: test-tgi + provider_type: remote::tgi + config: + url: http://127.0.0.1:5009 diff --git a/llama_stack/providers/tests/eval/test_eval.py b/llama_stack/providers/tests/eval/test_eval.py new file mode 100644 index 000000000..6b0d99a22 --- /dev/null +++ b/llama_stack/providers/tests/eval/test_eval.py @@ -0,0 +1,79 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. +import pytest +import pytest_asyncio + +from llama_stack.apis.common.type_system import * # noqa: F403 +from llama_stack.apis.datasetio import * # noqa: F403 +from llama_stack.apis.eval.eval import ModelCandidate +from llama_stack.distribution.datatypes import * # noqa: F403 + +from llama_models.llama3.api import SamplingParams + +from llama_stack.providers.tests.datasetio.test_datasetio import register_dataset +from llama_stack.providers.tests.resolver import resolve_impls_for_test + +# How to run this test: +# +# 1. Ensure you have a conda with the right dependencies installed. This is a bit tricky +# since it depends on the provider you are testing. On top of that you need +# `pytest` and `pytest-asyncio` installed. +# +# 2. Copy and modify the provider_config_example.yaml depending on the provider you are testing. +# +# 3. 
# Run:
#
# ```bash
# PROVIDER_ID= \
#   PROVIDER_CONFIG=provider_config.yaml \
#   pytest -s llama_stack/providers/tests/eval/test_eval.py \
#   --tb=short --disable-warnings
# ```


@pytest_asyncio.fixture(scope="session")
async def eval_settings():
    """Resolve the eval provider and expose the impls the test needs."""
    resolved = await resolve_impls_for_test(
        Api.eval, deps=[Api.datasetio, Api.scoring, Api.inference]
    )
    settings = {
        "eval_impl": resolved[Api.eval],
        "scoring_impl": resolved[Api.scoring],
        "datasets_impl": resolved[Api.datasets],
    }
    return settings


@pytest.mark.asyncio
async def test_eval(eval_settings):
    """Register a generation dataset, run a batch eval, and check the result."""
    datasets_impl = eval_settings["datasets_impl"]
    eval_impl = eval_settings["eval_impl"]

    await register_dataset(
        datasets_impl,
        for_generation=True,
        dataset_id="test_dataset_for_eval",
    )

    registered = await datasets_impl.list_datasets()
    assert len(registered) == 1

    candidate = ModelCandidate(
        model="Llama3.2-1B-Instruct",
        sampling_params=SamplingParams(),
    )
    job = await eval_impl.evaluate_batch(
        dataset_id=registered[0].identifier,
        candidate=candidate,
        scoring_functions=["subset_of"],
    )
    # The first job stored by the provider gets id "0".
    assert job.job_id == "0"

    job_status = await eval_impl.job_status(job.job_id)
    assert job_status and job_status.value == "completed"

    eval_response = await eval_impl.job_result(job.job_id)
    assert eval_response is not None
    # test_dataset.csv has five data rows, so five generations are expected.
    assert len(eval_response.generations) == 5
    assert "subset_of" in eval_response.scores