full evals / full scoring flow

commit be4f395032
parent cccd5be090
Author: Xi Yan
Date:   2024-10-15 10:17:45 -07:00

4 changed files with 88 additions and 109 deletions

File 1 of 4: OpenAPI specification (JSON)

@@ -21,7 +21,7 @@
     "info": {
         "title": "[DRAFT] Llama Stack Specification",
         "version": "0.0.1",
-        "description": "This is the specification of the llama stack that provides\n a set of endpoints and their corresponding interfaces that are tailored to\n best leverage Llama Models. The specification is still in draft and subject to change.\n Generated at 2024-10-15 00:44:26.278642"
+        "description": "This is the specification of the llama stack that provides\n a set of endpoints and their corresponding interfaces that are tailored to\n best leverage Llama Models. The specification is still in draft and subject to change.\n Generated at 2024-10-15 10:15:15.195382"
     },
     "servers": [
         {
@@ -5805,23 +5805,13 @@
             "RunEvalTaskRequest": {
                 "type": "object",
                 "properties": {
-                    "model": {
-                        "type": "string"
-                    },
-                    "task": {
-                        "type": "string"
-                    },
-                    "dataset": {
-                        "type": "string"
-                    },
                     "eval_task_config": {
                         "$ref": "#/components/schemas/EvaluateTaskConfig"
                     }
                 },
                 "additionalProperties": false,
                 "required": [
-                    "model",
-                    "task"
+                    "eval_task_config"
                 ]
             },
             "EvalResult": {
@@ -6238,49 +6228,49 @@
     ],
     "tags": [
         {
-            "name": "Models"
-        },
-        {
-            "name": "BatchInference"
-        },
-        {
-            "name": "Inspect"
-        },
-        {
-            "name": "Evals"
-        },
-        {
-            "name": "Safety"
-        },
-        {
-            "name": "Shields"
-        },
-        {
-            "name": "Telemetry"
-        },
-        {
-            "name": "Agents"
-        },
-        {
-            "name": "Memory"
-        },
-        {
-            "name": "SyntheticDataGeneration"
+            "name": "Inference"
         },
         {
             "name": "PostTraining"
         },
         {
-            "name": "Datasets"
+            "name": "Agents"
         },
         {
             "name": "MemoryBanks"
         },
+        {
+            "name": "Inspect"
+        },
+        {
+            "name": "Models"
+        },
+        {
+            "name": "Safety"
+        },
+        {
+            "name": "Evals"
+        },
+        {
+            "name": "BatchInference"
+        },
+        {
+            "name": "Shields"
+        },
+        {
+            "name": "SyntheticDataGeneration"
+        },
+        {
+            "name": "Telemetry"
+        },
         {
             "name": "RewardScoring"
         },
         {
-            "name": "Inference"
+            "name": "Datasets"
+        },
+        {
+            "name": "Memory"
         },
         {
             "name": "BuiltinTool",

File 2 of 4: OpenAPI specification (YAML)

@@ -1785,17 +1785,10 @@ components:
     RunEvalTaskRequest:
       additionalProperties: false
       properties:
-        dataset:
-          type: string
        eval_task_config:
          $ref: '#/components/schemas/EvaluateTaskConfig'
-        model:
-          type: string
-        task:
-          type: string
      required:
-      - model
-      - task
+      - eval_task_config
      type: object
    RunScorerRequest:
      additionalProperties: false
@@ -2686,7 +2679,7 @@ info:
   description: "This is the specification of the llama stack that provides\n \
     \ a set of endpoints and their corresponding interfaces that are tailored\
     \ to\n best leverage Llama Models. The specification is still in\
-    \ draft and subject to change.\n Generated at 2024-10-15 00:44:26.278642"
+    \ draft and subject to change.\n Generated at 2024-10-15 10:15:15.195382"
   title: '[DRAFT] Llama Stack Specification'
   version: 0.0.1
 jsonSchemaDialect: https://json-schema.org/draft/2020-12/schema
@@ -3787,21 +3780,21 @@ security:
 servers:
 - url: http://any-hosted-llama-stack.com
 tags:
-- name: Models
-- name: BatchInference
-- name: Inspect
-- name: Evals
-- name: Safety
-- name: Shields
-- name: Telemetry
-- name: Agents
-- name: Memory
-- name: SyntheticDataGeneration
-- name: PostTraining
-- name: Datasets
-- name: MemoryBanks
-- name: RewardScoring
 - name: Inference
+- name: PostTraining
+- name: Agents
+- name: MemoryBanks
+- name: Inspect
+- name: Models
+- name: Safety
+- name: Evals
+- name: BatchInference
+- name: Shields
+- name: SyntheticDataGeneration
+- name: Telemetry
+- name: RewardScoring
+- name: Datasets
+- name: Memory
 - description: <SchemaDefinition schemaRef="#/components/schemas/BuiltinTool" />
   name: BuiltinTool
 - description: <SchemaDefinition schemaRef="#/components/schemas/CompletionMessage"

File 3 of 4: eval/scoring client script (Python)

@@ -119,52 +119,48 @@ async def run_main(host: str, port: int, eval_dataset_path: str = ""):
         cprint(f"{k}: {v}", "green")

     # Scoring Task
-    # # 1. register huggingface dataset
-    # response = await dataset_client.create_dataset(
-    #     dataset_def=HuggingfaceDatasetDef(
-    #         identifier="Llama-3.1-8B-Instruct-evals__mmlu_pro__details",
-    #         dataset_path="meta-llama/Llama-3.1-8B-Instruct-evals",
-    #         dataset_name="Llama-3.1-8B-Instruct-evals__mmlu_pro__details",
-    #         rename_columns_map={
-    #             "output_parsed_answer": "generated_answer",
-    #             "input_correct_responses": "expected_answer",
-    #         },
-    #         kwargs={"split": "latest"},
-    #     )
-    # )
-    # cprint(response, "cyan")
+    # 1. register huggingface dataset
+    response = await dataset_client.create_dataset(
+        dataset_def=HuggingfaceDatasetDef(
+            identifier="Llama-3.1-8B-Instruct-evals__mmlu_pro__details",
+            dataset_path="meta-llama/Llama-3.1-8B-Instruct-evals",
+            dataset_name="Llama-3.1-8B-Instruct-evals__mmlu_pro__details",
+            rename_columns_map={
+                "output_parsed_answer": "generated_answer",
+                "input_correct_responses": "expected_answer",
+            },
+            kwargs={"split": "latest"},
+        )
+    )
+    cprint(response, "cyan")

-    # # register custom dataset from file path
-    # response = await dataset_client.create_dataset(
-    #     dataset_def=CustomDatasetDef(
-    #         identifier="rag-evals",
-    #         url=data_url_from_file(eval_dataset_path),
-    #         rename_columns_map={
-    #             "query": "input_query",
-    #         },
-    #     )
-    # )
-    # cprint(response, "cyan")
+    # register custom dataset from file path
+    response = await dataset_client.create_dataset(
+        dataset_def=CustomDatasetDef(
+            identifier="rag-evals",
+            url=data_url_from_file(eval_dataset_path),
+        )
+    )
+    cprint(response, "cyan")

-    # # 2. run evals on the registered dataset
-    # response = await client.run_scorer(
-    #     dataset_config=EvaluateDatasetConfig(
-    #         dataset_identifier="rag-evals",
-    #         # dataset_identifier="Llama-3.1-8B-Instruct-evals__mmlu_pro__details",
-    #         row_limit=10,
-    #     ),
-    #     eval_scoring_config=EvaluateScoringConfig(
-    #         scorer_config_list=[
-    #             EvaluateSingleScorerConfig(scorer_name="accuracy"),
-    #             EvaluateSingleScorerConfig(
-    #                 scorer_name="braintrust::answer-correctness"
-    #             ),
-    #         ]
-    #     ),
-    # )
-    # for k, v in response.eval_result.metrics.items():
-    #     cprint(f"{k}: {v}", "green")
+    # 2. run evals on the registered dataset
+    response = await client.run_scorer(
+        dataset_config=EvaluateDatasetConfig(
+            dataset_identifier="rag-evals",
+            row_limit=10,
+        ),
+        eval_scoring_config=EvaluateScoringConfig(
+            scorer_config_list=[
+                EvaluateSingleScorerConfig(scorer_name="accuracy"),
+                EvaluateSingleScorerConfig(
+                    scorer_name="braintrust::answer-correctness"
+                ),
+            ]
+        ),
+    )
+
+    for k, v in response.eval_result.metrics.items():
+        cprint(f"{k}: {v}", "green")


 def main(host: str, port: int, eval_dataset_path: str = ""):
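
A note on the flow above: rename_columns_map is what lets a dataset with arbitrary column names line up with the columns the scorers read, generated_answer and expected_answer in the huggingface registration. The toy pandas sketch below illustrates the same idea; the accuracy arithmetic is a stand-in for illustration, not the llama-stack scorer implementation.

# Toy illustration of what rename_columns_map accomplishes: mapping a
# dataset's native column names onto the ones scorers consume. The target
# names come from the HuggingfaceDatasetDef registration above; the
# accuracy computation is a stand-in, not the actual scorer.
import pandas as pd

df = pd.DataFrame(
    {
        "output_parsed_answer": ["A", "C", "B"],
        "input_correct_responses": ["A", "B", "B"],
    }
)
df = df.rename(
    columns={
        "output_parsed_answer": "generated_answer",
        "input_correct_responses": "expected_answer",
    }
)
accuracy = (df["generated_answer"] == df["expected_answer"]).mean()
print(f"accuracy: {accuracy:.2f}")  # accuracy: 0.67
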

File 4 of 4: CustomDataset implementation (Python)

@@ -67,7 +67,7 @@ class CustomDataset(BaseDataset[DictSample]):
             raise ValueError(f"Unsupported file type: {self.config.url}")
         if n_samples is not None:
-            df = df.sample(n=n_samples)
+            df = df.sample(n=min(n_samples, len(df)))
         self.dataset = Dataset.from_pandas(df)
         if self.config.rename_columns_map:
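
The min() clamp is the substantive fix in this file: pandas refuses to draw more rows than exist when sampling without replacement, so asking for more samples than the dataset holds (for example, a row_limit of 10 against a smaller eval file) previously raised a ValueError. A standalone reproduction:

# DataFrame.sample raises ValueError when n exceeds the row count and
# replace=False (the default); clamping with min() avoids the crash.
import pandas as pd

df = pd.DataFrame({"x": [1, 2, 3]})

try:
    df.sample(n=10)
except ValueError as err:
    print(f"unclamped sample fails: {err}")

clamped = df.sample(n=min(10, len(df)))  # all 3 rows, shuffled
print(len(clamped))  # 3
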