Mirror of https://github.com/meta-llama/llama-stack.git, synced 2025-08-02 08:44:44 +00:00
expander refactor
commit 92f79d4dfb (parent e245f459bb)
1 changed file with 120 additions and 103 deletions
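The commit replaces the page's single tab-driven function with one function per step, each hiding its details behind an st.expander. A minimal standalone sketch of the two layouts, with placeholder step names and task options rather than the code from this commit:

import streamlit as st

# Before: a single page function laying the steps out as tabs.
def tabbed_page():
    task_tab, run_tab = st.tabs(["(1) Select Eval Task", "(2) Run Evaluation"])
    with task_tab:
        st.selectbox("Choose an eval task.", options=["task_a", "task_b"])
    with run_tab:
        st.button("Run Evaluation")

# After: one function per step; details collapse into expanders.
def select_step():
    st.subheader("1. Choose An Eval Task")
    choice = st.selectbox("Choose an eval task.", options=["task_a", "task_b"])
    with st.expander("View Eval Task"):
        st.json({"identifier": choice})

def run_step():
    st.subheader("2. Run Evaluation")
    st.button("Run Evaluation")

select_step()
run_step()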
@@ -13,34 +13,27 @@ import streamlit as st
 from modules.api import llama_stack_api


-def native_evaluation_page():
-    st.set_page_config(page_title="Evaluations (Generation + Scoring)", page_icon="🦙")
-    st.title("📊 Evaluations (Generation + Scoring)")
-
-    # Create tabs
-    task_tab, candidate_tab, params_tab, run_tab = st.tabs(
-        [
-            "(1) Select Eval Task",
-            "(2) Define Eval Candidate",
-            "(3) Define Scoring Parameters",
-            "(4) Run Evaluation",
-        ]
-    )
-
-    with task_tab:
-        # Select Eval Tasks
-        eval_tasks = llama_stack_api.client.eval_tasks.list()
-        eval_tasks = {et.identifier: et for et in eval_tasks}
-        eval_tasks_names = list(eval_tasks.keys())
-        selected_eval_task = st.selectbox(
-            "Choose an eval task.",
-            options=eval_tasks_names,
-            help="Choose an eval task. Each eval task is parameterized by a dataset, and list of scoring functions.",
-        )
-        st.json(eval_tasks[selected_eval_task], expanded=True)
-
-    with candidate_tab:
+def select_eval_task_1():
+    # Select Eval Tasks
+    st.subheader("1. Choose An Eval Task")
+    eval_tasks = llama_stack_api.client.eval_tasks.list()
+    eval_tasks = {et.identifier: et for et in eval_tasks}
+    eval_tasks_names = list(eval_tasks.keys())
+    selected_eval_task = st.selectbox(
+        "Choose an eval task.",
+        options=eval_tasks_names,
+        help="Choose an eval task. Each eval task is parameterized by a dataset, and list of scoring functions.",
+    )
+    with st.expander("View Eval Task"):
+        st.json(eval_tasks[selected_eval_task], expanded=True)
+
+    st.session_state["selected_eval_task"] = selected_eval_task
+    st.session_state["eval_tasks"] = eval_tasks
+
+
+def define_eval_candidate_2():
+    st.subheader("2. Define Eval Candidate")
+    with st.expander("Define Eval Candidate"):
         # Define Eval Candidate
         candidate_type = st.radio("Candidate Type", ["model", "agent"])

@@ -140,100 +133,124 @@ def native_evaluation_page():
                     "enable_session_persistence": False,
                 },
             }
+        st.session_state["eval_candidate"] = eval_candidate

-    with params_tab:
-        st.write("(Optional) Define scoring function parameters here")
-
-    with run_tab:
-        # Add info box to explain configurations being used
-        st.info(
-            """
+
+def define_scoring_params_3():
+    if not st.session_state.get("selected_eval_candidate_2_next", None):
+        return
+    st.write("(Optional) Define scoring function parameters here")
+
+
+def run_evaluation_4():
+    st.subheader("3. Run Evaluation")
+    # Add info box to explain configurations being used
+    st.info(
+        """
         Review the configurations that will be used for this evaluation run, make any necessary changes, and then click the "Run Evaluation" button.
         """
     )
+    selected_eval_task = st.session_state["selected_eval_task"]
+    eval_tasks = st.session_state["eval_tasks"]
+    eval_candidate = st.session_state["eval_candidate"]

     dataset_id = eval_tasks[selected_eval_task].dataset_id
     rows = llama_stack_api.client.datasetio.get_rows_paginated(
         dataset_id=dataset_id,
         rows_in_page=-1,
     )
     total_rows = len(rows.rows)
     # Add number of examples control
     num_rows = st.number_input(
         "Number of Examples to Evaluate",
         min_value=1,
         max_value=total_rows,
         value=5,
         help="Number of examples from the dataset to evaluate. ",
     )

     eval_task_config = {
         "type": "benchmark",
         "eval_candidate": eval_candidate,
         "scoring_params": {},
     }
-        st.markdown("##### Evaluation Task")
-        st.write("Go back to (1) Select Eval Task to make changes to the eval task. ")
+    with st.expander("View Evaluation Task"):
         st.json(eval_tasks[selected_eval_task], expanded=True)
-        st.markdown("##### Evaluation Task Configuration")
-        st.write(
-            "Go back to (2) Define Eval Candidate and (3) Define Scoring Parameters to make changes to the configuration. "
-        )
+    with st.expander("View Evaluation Task Configuration"):
         st.json(eval_task_config, expanded=True)

     # Add run button and handle evaluation
     if st.button("Run Evaluation"):

         progress_text = "Running evaluation..."
         progress_bar = st.progress(0, text=progress_text)
         rows = rows.rows
         if num_rows < total_rows:
             rows = rows[:num_rows]

         # Create separate containers for progress text and results
         progress_text_container = st.empty()
         results_container = st.empty()
         output_res = {}
         for i, r in enumerate(rows):
             # Update progress
             progress = i / len(rows)
             progress_bar.progress(progress, text=progress_text)
             # Run evaluation for current row
             eval_res = llama_stack_api.client.eval.evaluate_rows(
                 task_id=selected_eval_task,
                 input_rows=[r],
                 scoring_functions=eval_tasks[selected_eval_task].scoring_functions,
                 task_config=eval_task_config,
             )

             for k in r.keys():
                 if k not in output_res:
                     output_res[k] = []
                 output_res[k].append(r[k])

             for k in eval_res.generations[0].keys():
                 if k not in output_res:
                     output_res[k] = []
                 output_res[k].append(eval_res.generations[0][k])

             for scoring_fn in eval_tasks[selected_eval_task].scoring_functions:
                 if scoring_fn not in output_res:
                     output_res[scoring_fn] = []
-                    output_res[scoring_fn].append(
-                        eval_res.scores[scoring_fn].score_rows[0]
-                    )
+                output_res[scoring_fn].append(eval_res.scores[scoring_fn].score_rows[0])

             progress_text_container.write(
                 f"Expand to see current processed result ({i+1}/{len(rows)})"
             )
             results_container.json(eval_res, expanded=2)

         progress_bar.progress(1.0, text="Evaluation complete!")
         # Display results in dataframe
         if output_res:
             output_df = pd.DataFrame(output_res)
             st.subheader("Evaluation Results")
             st.dataframe(output_df)


+def native_evaluation_page():
+    st.set_page_config(page_title="Evaluations (Generation + Scoring)", page_icon="🦙")
+    st.title("📊 Evaluations (Generation + Scoring)")
+
+    # Create tabs
+    # task_tab, candidate_tab, params_tab, run_tab = st.tabs(
+    #     [
+    #         "(1) Select Eval Task",
+    #         "(2) Define Eval Candidate",
+    #         "(3) Define Scoring Parameters",
+    #         "(4) Run Evaluation",
+    #     ]
+    # )
+    select_eval_task_1()
+    define_eval_candidate_2()
+    define_scoring_params_3()
+    run_evaluation_4()
+

 native_evaluation_page()
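Because the steps now live in separate functions (select_eval_task_1, define_eval_candidate_2, define_scoring_params_3, run_evaluation_4), they can no longer share local variables, so the commit hands the chosen task, the task list, and the eval candidate from one step to the next through st.session_state. A minimal sketch of that hand-off, with placeholder step functions and task names rather than the real ones:

import streamlit as st

def select_eval_task():
    # Earlier step: store the selection where later steps can read it.
    selected = st.selectbox("Choose an eval task.", options=["task_a", "task_b"])
    st.session_state["selected_eval_task"] = selected

def run_evaluation():
    # Later step: pull the earlier selection back out of session state.
    selected = st.session_state.get("selected_eval_task")
    if selected is None:
        st.info("Pick an eval task first.")
        return
    if st.button("Run Evaluation"):
        st.write(f"Would evaluate: {selected}")

select_eval_task()
run_evaluation()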