From be4f395032930f8ba9b7a21da6d8a9644396a631 Mon Sep 17 00:00:00 2001 From: Xi Yan Date: Tue, 15 Oct 2024 10:17:45 -0700 Subject: [PATCH] full evals / full scoring flow --- docs/resources/llama-stack-spec.html | 74 ++++++++--------- docs/resources/llama-stack-spec.yaml | 39 ++++----- llama_stack/apis/evals/client.py | 82 +++++++++---------- .../registry/datasets/dataset_wrappers.py | 2 +- 4 files changed, 88 insertions(+), 109 deletions(-) diff --git a/docs/resources/llama-stack-spec.html b/docs/resources/llama-stack-spec.html index ac75dbf04..7787001ff 100644 --- a/docs/resources/llama-stack-spec.html +++ b/docs/resources/llama-stack-spec.html @@ -21,7 +21,7 @@ "info": { "title": "[DRAFT] Llama Stack Specification", "version": "0.0.1", - "description": "This is the specification of the llama stack that provides\n a set of endpoints and their corresponding interfaces that are tailored to\n best leverage Llama Models. The specification is still in draft and subject to change.\n Generated at 2024-10-15 00:44:26.278642" + "description": "This is the specification of the llama stack that provides\n a set of endpoints and their corresponding interfaces that are tailored to\n best leverage Llama Models. The specification is still in draft and subject to change.\n Generated at 2024-10-15 10:15:15.195382" }, "servers": [ { @@ -5805,23 +5805,13 @@ "RunEvalTaskRequest": { "type": "object", "properties": { - "model": { - "type": "string" - }, - "task": { - "type": "string" - }, - "dataset": { - "type": "string" - }, "eval_task_config": { "$ref": "#/components/schemas/EvaluateTaskConfig" } }, "additionalProperties": false, "required": [ - "model", - "task" + "eval_task_config" ] }, "EvalResult": { @@ -6238,49 +6228,49 @@ ], "tags": [ { - "name": "Models" - }, - { - "name": "BatchInference" - }, - { - "name": "Inspect" - }, - { - "name": "Evals" - }, - { - "name": "Safety" - }, - { - "name": "Shields" - }, - { - "name": "Telemetry" - }, - { - "name": "Agents" - }, - { - "name": "Memory" - }, - { - "name": "SyntheticDataGeneration" + "name": "Inference" }, { "name": "PostTraining" }, { - "name": "Datasets" + "name": "Agents" }, { "name": "MemoryBanks" }, + { + "name": "Inspect" + }, + { + "name": "Models" + }, + { + "name": "Safety" + }, + { + "name": "Evals" + }, + { + "name": "BatchInference" + }, + { + "name": "Shields" + }, + { + "name": "SyntheticDataGeneration" + }, + { + "name": "Telemetry" + }, { "name": "RewardScoring" }, { - "name": "Inference" + "name": "Datasets" + }, + { + "name": "Memory" }, { "name": "BuiltinTool", diff --git a/docs/resources/llama-stack-spec.yaml b/docs/resources/llama-stack-spec.yaml index ab54c4c09..d601435d7 100644 --- a/docs/resources/llama-stack-spec.yaml +++ b/docs/resources/llama-stack-spec.yaml @@ -1785,17 +1785,10 @@ components: RunEvalTaskRequest: additionalProperties: false properties: - dataset: - type: string eval_task_config: $ref: '#/components/schemas/EvaluateTaskConfig' - model: - type: string - task: - type: string required: - - model - - task + - eval_task_config type: object RunScorerRequest: additionalProperties: false @@ -2686,7 +2679,7 @@ info: description: "This is the specification of the llama stack that provides\n \ \ a set of endpoints and their corresponding interfaces that are tailored\ \ to\n best leverage Llama Models. The specification is still in\ - \ draft and subject to change.\n Generated at 2024-10-15 00:44:26.278642" + \ draft and subject to change.\n Generated at 2024-10-15 10:15:15.195382" title: '[DRAFT] Llama Stack Specification' version: 0.0.1 jsonSchemaDialect: https://json-schema.org/draft/2020-12/schema @@ -3787,21 +3780,21 @@ security: servers: - url: http://any-hosted-llama-stack.com tags: -- name: Models -- name: BatchInference -- name: Inspect -- name: Evals -- name: Safety -- name: Shields -- name: Telemetry -- name: Agents -- name: Memory -- name: SyntheticDataGeneration -- name: PostTraining -- name: Datasets -- name: MemoryBanks -- name: RewardScoring - name: Inference +- name: PostTraining +- name: Agents +- name: MemoryBanks +- name: Inspect +- name: Models +- name: Safety +- name: Evals +- name: BatchInference +- name: Shields +- name: SyntheticDataGeneration +- name: Telemetry +- name: RewardScoring +- name: Datasets +- name: Memory - description: name: BuiltinTool - description: