expander refactor

This commit is contained in:
Xi Yan 2024-12-03 16:20:31 -08:00
parent e245f459bb
commit 92f79d4dfb

View file

@ -13,34 +13,27 @@ import streamlit as st
from modules.api import llama_stack_api from modules.api import llama_stack_api
def select_eval_task_1():
    """Step 1: let the user pick an eval task and preview its definition.

    Fetches all registered eval tasks from the llama-stack API, renders a
    selectbox, shows the chosen task's JSON inside an expander, and stashes
    the selection plus the task lookup table in ``st.session_state`` for the
    later steps on this page.
    """
    st.subheader("1. Choose An Eval Task")
    eval_tasks = llama_stack_api.client.eval_tasks.list()
    # Re-index by identifier so later steps can look a task up by name.
    eval_tasks = {et.identifier: et for et in eval_tasks}
    eval_tasks_names = list(eval_tasks.keys())
    selected_eval_task = st.selectbox(
        "Choose an eval task.",
        options=eval_tasks_names,
        help="Choose an eval task. Each eval task is parameterized by a dataset, and list of scoring functions.",
    )
    with st.expander("View Eval Task"):
        st.json(eval_tasks[selected_eval_task], expanded=True)
    # Shared with run_evaluation_4 via session state.
    st.session_state["selected_eval_task"] = selected_eval_task
    st.session_state["eval_tasks"] = eval_tasks
def define_eval_candidate_2():
st.subheader("2. Define Eval Candidate")
with st.expander("Define Eval Candidate"):
# Define Eval Candidate # Define Eval Candidate
candidate_type = st.radio("Candidate Type", ["model", "agent"]) candidate_type = st.radio("Candidate Type", ["model", "agent"])
@ -140,100 +133,124 @@ def native_evaluation_page():
"enable_session_persistence": False, "enable_session_persistence": False,
}, },
} }
st.session_state["eval_candidate"] = eval_candidate
with params_tab:
st.write("(Optional) Define scoring function parameters here")
def define_scoring_params_3():
    """Step 3: placeholder for optional scoring-function parameters.

    Renders nothing until the ``selected_eval_candidate_2_next`` session-state
    flag is truthy — presumably set by a "next" control in
    ``define_eval_candidate_2``; TODO confirm against that step.
    """
    if not st.session_state.get("selected_eval_candidate_2_next", None):
        return
    st.write("(Optional) Define scoring function parameters here")
def run_evaluation_4():
st.subheader("3. Run Evaluation")
# Add info box to explain configurations being used
st.info(
"""
Review the configurations that will be used for this evaluation run, make any necessary changes, and then click the "Run Evaluation" button. Review the configurations that will be used for this evaluation run, make any necessary changes, and then click the "Run Evaluation" button.
""" """
) )
selected_eval_task = st.session_state["selected_eval_task"]
eval_tasks = st.session_state["eval_tasks"]
eval_candidate = st.session_state["eval_candidate"]
dataset_id = eval_tasks[selected_eval_task].dataset_id dataset_id = eval_tasks[selected_eval_task].dataset_id
rows = llama_stack_api.client.datasetio.get_rows_paginated( rows = llama_stack_api.client.datasetio.get_rows_paginated(
dataset_id=dataset_id, dataset_id=dataset_id,
rows_in_page=-1, rows_in_page=-1,
) )
total_rows = len(rows.rows) total_rows = len(rows.rows)
# Add number of examples control # Add number of examples control
num_rows = st.number_input( num_rows = st.number_input(
"Number of Examples to Evaluate", "Number of Examples to Evaluate",
min_value=1, min_value=1,
max_value=total_rows, max_value=total_rows,
value=5, value=5,
help="Number of examples from the dataset to evaluate. ", help="Number of examples from the dataset to evaluate. ",
) )
eval_task_config = { eval_task_config = {
"type": "benchmark", "type": "benchmark",
"eval_candidate": eval_candidate, "eval_candidate": eval_candidate,
"scoring_params": {}, "scoring_params": {},
} }
st.markdown("##### Evaluation Task")
st.write("Go back to (1) Select Eval Task to make changes to the eval task. ") with st.expander("View Evaluation Task"):
st.json(eval_tasks[selected_eval_task], expanded=True) st.json(eval_tasks[selected_eval_task], expanded=True)
st.markdown("##### Evaluation Task Configuration") with st.expander("View Evaluation Task Configuration"):
st.write(
"Go back to (2) Define Eval Candidate and (3) Define Scoring Parameters to make changes to the configuration. "
)
st.json(eval_task_config, expanded=True) st.json(eval_task_config, expanded=True)
# Add run button and handle evaluation # Add run button and handle evaluation
if st.button("Run Evaluation"): if st.button("Run Evaluation"):
progress_text = "Running evaluation..." progress_text = "Running evaluation..."
progress_bar = st.progress(0, text=progress_text) progress_bar = st.progress(0, text=progress_text)
rows = rows.rows rows = rows.rows
if num_rows < total_rows: if num_rows < total_rows:
rows = rows[:num_rows] rows = rows[:num_rows]
# Create separate containers for progress text and results # Create separate containers for progress text and results
progress_text_container = st.empty() progress_text_container = st.empty()
results_container = st.empty() results_container = st.empty()
output_res = {} output_res = {}
for i, r in enumerate(rows): for i, r in enumerate(rows):
# Update progress # Update progress
progress = i / len(rows) progress = i / len(rows)
progress_bar.progress(progress, text=progress_text) progress_bar.progress(progress, text=progress_text)
# Run evaluation for current row # Run evaluation for current row
eval_res = llama_stack_api.client.eval.evaluate_rows( eval_res = llama_stack_api.client.eval.evaluate_rows(
task_id=selected_eval_task, task_id=selected_eval_task,
input_rows=[r], input_rows=[r],
scoring_functions=eval_tasks[selected_eval_task].scoring_functions, scoring_functions=eval_tasks[selected_eval_task].scoring_functions,
task_config=eval_task_config, task_config=eval_task_config,
) )
for k in r.keys(): for k in r.keys():
if k not in output_res: if k not in output_res:
output_res[k] = [] output_res[k] = []
output_res[k].append(r[k]) output_res[k].append(r[k])
for k in eval_res.generations[0].keys(): for k in eval_res.generations[0].keys():
if k not in output_res: if k not in output_res:
output_res[k] = [] output_res[k] = []
output_res[k].append(eval_res.generations[0][k]) output_res[k].append(eval_res.generations[0][k])
for scoring_fn in eval_tasks[selected_eval_task].scoring_functions: for scoring_fn in eval_tasks[selected_eval_task].scoring_functions:
if scoring_fn not in output_res: if scoring_fn not in output_res:
output_res[scoring_fn] = [] output_res[scoring_fn] = []
output_res[scoring_fn].append( output_res[scoring_fn].append(eval_res.scores[scoring_fn].score_rows[0])
eval_res.scores[scoring_fn].score_rows[0]
)
progress_text_container.write( progress_text_container.write(
f"Expand to see current processed result ({i+1}/{len(rows)})" f"Expand to see current processed result ({i+1}/{len(rows)})"
) )
results_container.json(eval_res, expanded=2) results_container.json(eval_res, expanded=2)
progress_bar.progress(1.0, text="Evaluation complete!") progress_bar.progress(1.0, text="Evaluation complete!")
# Display results in dataframe # Display results in dataframe
if output_res: if output_res:
output_df = pd.DataFrame(output_res) output_df = pd.DataFrame(output_res)
st.subheader("Evaluation Results") st.subheader("Evaluation Results")
st.dataframe(output_df) st.dataframe(output_df)
def native_evaluation_page():
    """Top-level Streamlit page: run the four evaluation steps in order.

    Each step function communicates with the next through
    ``st.session_state`` rather than return values.
    """
    st.set_page_config(page_title="Evaluations (Generation + Scoring)", page_icon="🦙")
    st.title("📊 Evaluations (Generation + Scoring)")

    # The former tab-based layout was replaced by sequential expander steps;
    # the dead commented-out st.tabs() scaffold has been removed.
    select_eval_task_1()
    define_eval_candidate_2()
    define_scoring_params_3()
    run_evaluation_4()


# Streamlit executes this module as a script, so the page is rendered by a
# plain module-level call.
native_evaluation_page()