native eval

Xi Yan 2024-12-02 15:38:58 -08:00
parent de2ab1243a
commit b59810cd9a


@@ -3,6 +3,11 @@
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
import json

import pandas as pd
import streamlit as st

from modules.api import llama_stack_api
@@ -13,21 +18,208 @@ def native_evaluation_page():
    st.set_page_config(page_title="Native Evaluations", page_icon="🦙")
    st.title("🦙 Llama Stack Native Evaluations")

    # Create tabs
    task_tab, candidate_tab, params_tab, run_tab = st.tabs(
        [
            "(1) Select Eval Task",
            "(2) Define Eval Candidate",
            "(3) Define Scoring Parameters",
            "(4) Run Evaluation",
        ]
    )

    with task_tab:
        # Select Eval Tasks
        eval_tasks = llama_stack_api.client.eval_tasks.list()
        eval_tasks = {et.identifier: et for et in eval_tasks}
        eval_tasks_names = list(eval_tasks.keys())
        selected_eval_task = st.selectbox(
            "Choose an eval task.",
            options=eval_tasks_names,
            help="Choose an eval task. Each eval task is parameterized by a dataset and a list of scoring functions.",
        )
        st.json(eval_tasks[selected_eval_task], expanded=True)
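Aside: the task tab above is a thin wrapper over a single client call. Below is a minimal sketch of the same lookup outside Streamlit, assuming `llama_stack_api.client` is a standard `llama_stack_client.LlamaStackClient`; the base URL is a hypothetical local endpoint.

# Sketch (not part of this commit): list eval tasks directly against a running
# Llama Stack server, assuming llama_stack_api.client wraps a LlamaStackClient.
from llama_stack_client import LlamaStackClient

client = LlamaStackClient(base_url="http://localhost:5000")  # hypothetical endpoint
eval_tasks = {et.identifier: et for et in client.eval_tasks.list()}
print(list(eval_tasks.keys()))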
    with candidate_tab:
        # Define Eval Candidate
        candidate_type = st.radio("Candidate Type", ["model", "agent"])

        available_models = llama_stack_api.client.models.list()
        available_models = [model.identifier for model in available_models]
        selected_model = st.selectbox(
            "Choose a model",
            available_models,
            index=0,
        )

        # Sampling Parameters
        st.markdown("##### Sampling Parameters")
        strategy = st.selectbox(
            "Strategy",
            ["greedy", "top_p", "top_k"],
            index=0,
        )
        temperature = st.slider(
            "Temperature",
            min_value=0.0,
            max_value=1.0,
            value=0.0,
            step=0.1,
            help="Controls the randomness of the response. Higher values make the output more creative and unexpected; lower values make it more conservative and predictable.",
        )
        top_p = st.slider(
            "Top P",
            min_value=0.0,
            max_value=1.0,
            value=0.95,
            step=0.1,
        )
        max_tokens = st.slider(
            "Max Tokens",
            min_value=0,
            max_value=4096,
            value=512,
            step=1,
            help="The maximum number of tokens to generate",
        )
        repetition_penalty = st.slider(
            "Repetition Penalty",
            min_value=1.0,
            max_value=2.0,
            value=1.0,
            step=0.1,
            help="Controls how likely the model is to repeat the same word or phrase. 1 implies no penalty; 2 strongly discourages repetition.",
        )

        if candidate_type == "model":
            eval_candidate = {
                "type": "model",
                "model": selected_model,
                "sampling_params": {
                    "strategy": strategy,
                    "temperature": temperature,
                    "top_p": top_p,
                    "max_tokens": max_tokens,
                    "repetition_penalty": repetition_penalty,
                },
            }
        elif candidate_type == "agent":
            system_prompt = st.text_area(
                "System Prompt",
                value="You are a helpful AI assistant.",
                help="Initial instructions given to the AI to set its behavior and context",
            )
            tools_json = st.text_area(
                "Tools Configuration (JSON)",
                value=json.dumps(
                    [
                        {
                            "type": "brave_search",
                            "engine": "brave",
                            "api_key": "ENTER_BRAVE_API_KEY_HERE",
                        }
                    ]
                ),
                help="Enter tool configurations in JSON format. Each tool should have a name, description, and parameters.",
                height=200,
            )
            try:
                tools = json.loads(tools_json)
            except json.JSONDecodeError:
                st.error("Invalid JSON format for tools configuration")
                tools = []

            eval_candidate = {
                "type": "agent",
                "config": {
                    "model": selected_model,
                    "instructions": system_prompt,
                    "tools": tools,
                    "tool_choice": "auto",
                    "tool_prompt_format": "json",
                    "input_shields": [],
                    "output_shields": [],
                    "enable_session_persistence": False,
                },
            }
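For reference, a fully resolved "model" candidate as assembled above would have roughly this shape; the model identifier is a hypothetical placeholder and the numbers are the slider defaults.

# Illustrative only: an eval_candidate produced by the candidate tab, using a
# hypothetical model identifier and the default slider values.
example_model_candidate = {
    "type": "model",
    "model": "meta-llama/Llama-3.1-8B-Instruct",  # hypothetical identifier
    "sampling_params": {
        "strategy": "greedy",
        "temperature": 0.0,
        "top_p": 0.95,
        "max_tokens": 512,
        "repetition_penalty": 1.0,
    },
}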
    with params_tab:
        st.write("Define scoring function parameters here")

    with run_tab:
        dataset_id = eval_tasks[selected_eval_task].dataset_id
        rows = llama_stack_api.client.datasetio.get_rows_paginated(
            dataset_id=dataset_id,
            rows_in_page=-1,
        )
        total_rows = len(rows.rows)

        # Add number of examples control
        num_rows = st.number_input(
            "Number of Examples to Evaluate",
            min_value=1,
            max_value=total_rows,
            value=5,
            help="Number of examples from the dataset to evaluate.",
        )

        eval_task_config = {
            "type": "benchmark",
            "eval_candidate": eval_candidate,
            "scoring_params": {},
        }

        st.json(eval_tasks[selected_eval_task], expanded=True)
        st.json(eval_task_config, expanded=True)
        # Add run button and handle evaluation
        if st.button("Run Evaluation"):
            progress_text = "Running evaluation..."
            progress_bar = st.progress(0, text=progress_text)

            rows = rows.rows
            if num_rows < total_rows:
                rows = rows[:num_rows]

            # Create separate containers for progress text and results
            progress_text_container = st.empty()
            results_container = st.empty()

            output_res = {}
            for i, r in enumerate(rows):
                # Update progress
                progress = i / len(rows)
                progress_bar.progress(progress, text=progress_text)

                # Run evaluation for current row
                eval_res = llama_stack_api.client.eval.evaluate_rows(
                    task_id=selected_eval_task,
                    input_rows=[r],
                    scoring_functions=eval_tasks[selected_eval_task].scoring_functions,
                    task_config=eval_task_config,
                )

                for k in r.keys():
                    if k not in output_res:
                        output_res[k] = []
                    output_res[k].append(r[k])

                for k in eval_res.generations[0].keys():
                    if k not in output_res:
                        output_res[k] = []
                    output_res[k].append(eval_res.generations[0][k])

                for scoring_fn in eval_tasks[selected_eval_task].scoring_functions:
                    if scoring_fn not in output_res:
                        output_res[scoring_fn] = []
                    output_res[scoring_fn].append(
                        eval_res.scores[scoring_fn].score_rows[0]
                    )

                progress_text_container.write(
                    f"Expand to see current processed result ({i+1}/{len(rows)})"
                )
                results_container.json(eval_res, expanded=2)

            progress_bar.progress(1.0, text="Evaluation complete!")

            # Display results in dataframe
            if output_res:
                output_df = pd.DataFrame(output_res)
                st.subheader("Evaluation Results")
                st.dataframe(output_df)
native_evaluation_page()
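The run tab calls `evaluate_rows` one row at a time so the progress bar can advance per example. Outside the UI, the same evaluation can be done in one batched call; the sketch below makes the same assumptions as the earlier one (a reachable Llama Stack server, a registered eval task, and a candidate shaped like the one sketched above), and the endpoint and task id are hypothetical.

# Sketch (not part of this commit): run the evaluation in one batched call
# instead of looping per row. Endpoint and task id below are hypothetical.
from llama_stack_client import LlamaStackClient

client = LlamaStackClient(base_url="http://localhost:5000")  # hypothetical endpoint

task_id = "meta-reference-mmlu"  # hypothetical registered eval task
task = {et.identifier: et for et in client.eval_tasks.list()}[task_id]

rows = client.datasetio.get_rows_paginated(dataset_id=task.dataset_id, rows_in_page=-1)
input_rows = rows.rows[:5]  # first 5 examples, mirroring the page's default

eval_res = client.eval.evaluate_rows(
    task_id=task_id,
    input_rows=input_rows,
    scoring_functions=task.scoring_functions,
    task_config={
        "type": "benchmark",
        "eval_candidate": {
            "type": "model",
            "model": "meta-llama/Llama-3.1-8B-Instruct",  # hypothetical model id
            "sampling_params": {
                "strategy": "greedy",
                "temperature": 0.0,
                "top_p": 0.95,
                "max_tokens": 512,
                "repetition_penalty": 1.0,
            },
        },
        "scoring_params": {},
    },
)
print(eval_res.scores)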