Mirror of https://github.com/meta-llama/llama-stack.git, synced 2025-08-02 08:44:44 +00:00
expander refactor
commit 92f79d4dfb (parent e245f459bb)
1 changed file with 120 additions and 103 deletions
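The commit replaces the page's single tab-driven function with one function per step, each hiding its details behind an st.expander. A minimal standalone sketch of the two layouts, with placeholder step names and task options rather than the code from this commit:

import streamlit as st

# Before: a single page function laying the steps out as tabs.
def tabbed_page():
    task_tab, run_tab = st.tabs(["(1) Select Eval Task", "(2) Run Evaluation"])
    with task_tab:
        st.selectbox("Choose an eval task.", options=["task_a", "task_b"])
    with run_tab:
        st.button("Run Evaluation")

# After: one function per step; details collapse into expanders.
def select_step():
    st.subheader("1. Choose An Eval Task")
    choice = st.selectbox("Choose an eval task.", options=["task_a", "task_b"])
    with st.expander("View Eval Task"):
        st.json({"identifier": choice})

def run_step():
    st.subheader("2. Run Evaluation")
    st.button("Run Evaluation")

select_step()
run_step()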
@@ -13,34 +13,27 @@ import streamlit as st
 from modules.api import llama_stack_api


-def native_evaluation_page():
-    st.set_page_config(page_title="Evaluations (Generation + Scoring)", page_icon="🦙")
-    st.title("📊 Evaluations (Generation + Scoring)")
-
-    # Create tabs
-    task_tab, candidate_tab, params_tab, run_tab = st.tabs(
-        [
-            "(1) Select Eval Task",
-            "(2) Define Eval Candidate",
-            "(3) Define Scoring Parameters",
-            "(4) Run Evaluation",
-        ]
-    )
-
-    with task_tab:
-        # Select Eval Tasks
-        eval_tasks = llama_stack_api.client.eval_tasks.list()
-        eval_tasks = {et.identifier: et for et in eval_tasks}
-        eval_tasks_names = list(eval_tasks.keys())
-        selected_eval_task = st.selectbox(
-            "Choose an eval task.",
-            options=eval_tasks_names,
-            help="Choose an eval task. Each eval task is parameterized by a dataset, and list of scoring functions.",
-        )
-        st.json(eval_tasks[selected_eval_task], expanded=True)
-
-    with candidate_tab:
+def select_eval_task_1():
+    # Select Eval Tasks
+    st.subheader("1. Choose An Eval Task")
+    eval_tasks = llama_stack_api.client.eval_tasks.list()
+    eval_tasks = {et.identifier: et for et in eval_tasks}
+    eval_tasks_names = list(eval_tasks.keys())
+    selected_eval_task = st.selectbox(
+        "Choose an eval task.",
+        options=eval_tasks_names,
+        help="Choose an eval task. Each eval task is parameterized by a dataset, and list of scoring functions.",
+    )
+    with st.expander("View Eval Task"):
+        st.json(eval_tasks[selected_eval_task], expanded=True)
+
+    st.session_state["selected_eval_task"] = selected_eval_task
+    st.session_state["eval_tasks"] = eval_tasks
+
+
+def define_eval_candidate_2():
+    st.subheader("2. Define Eval Candidate")
+    with st.expander("Define Eval Candidate"):
         # Define Eval Candidate
         candidate_type = st.radio("Candidate Type", ["model", "agent"])

@@ -140,100 +133,124 @@ def native_evaluation_page():
                     "enable_session_persistence": False,
                 },
             }
+        st.session_state["eval_candidate"] = eval_candidate

-    with params_tab:
-        st.write("(Optional) Define scoring function parameters here")
-
-    with run_tab:
-        # Add info box to explain configurations being used
-        st.info(
-            """
+
+def define_scoring_params_3():
+    if not st.session_state.get("selected_eval_candidate_2_next", None):
+        return
+    st.write("(Optional) Define scoring function parameters here")
+
+
+def run_evaluation_4():
+    st.subheader("3. Run Evaluation")
+    # Add info box to explain configurations being used
+    st.info(
+        """
         Review the configurations that will be used for this evaluation run, make any necessary changes, and then click the "Run Evaluation" button.
         """
     )
+    selected_eval_task = st.session_state["selected_eval_task"]
+    eval_tasks = st.session_state["eval_tasks"]
+    eval_candidate = st.session_state["eval_candidate"]

     dataset_id = eval_tasks[selected_eval_task].dataset_id
     rows = llama_stack_api.client.datasetio.get_rows_paginated(
         dataset_id=dataset_id,
         rows_in_page=-1,
     )
     total_rows = len(rows.rows)
     # Add number of examples control
     num_rows = st.number_input(
         "Number of Examples to Evaluate",
         min_value=1,
         max_value=total_rows,
         value=5,
         help="Number of examples from the dataset to evaluate. ",
     )

     eval_task_config = {
         "type": "benchmark",
         "eval_candidate": eval_candidate,
         "scoring_params": {},
     }
-        st.markdown("##### Evaluation Task")
-        st.write("Go back to (1) Select Eval Task to make changes to the eval task. ")
+    with st.expander("View Evaluation Task"):
         st.json(eval_tasks[selected_eval_task], expanded=True)
-        st.markdown("##### Evaluation Task Configuration")
-        st.write(
-            "Go back to (2) Define Eval Candidate and (3) Define Scoring Parameters to make changes to the configuration. "
-        )
+    with st.expander("View Evaluation Task Configuration"):
         st.json(eval_task_config, expanded=True)

     # Add run button and handle evaluation
     if st.button("Run Evaluation"):

         progress_text = "Running evaluation..."
         progress_bar = st.progress(0, text=progress_text)
         rows = rows.rows
         if num_rows < total_rows:
             rows = rows[:num_rows]

         # Create separate containers for progress text and results
         progress_text_container = st.empty()
         results_container = st.empty()
         output_res = {}
         for i, r in enumerate(rows):
             # Update progress
             progress = i / len(rows)
             progress_bar.progress(progress, text=progress_text)
             # Run evaluation for current row
             eval_res = llama_stack_api.client.eval.evaluate_rows(
                 task_id=selected_eval_task,
                 input_rows=[r],
                 scoring_functions=eval_tasks[selected_eval_task].scoring_functions,
                 task_config=eval_task_config,
             )

             for k in r.keys():
                 if k not in output_res:
                     output_res[k] = []
                 output_res[k].append(r[k])

             for k in eval_res.generations[0].keys():
                 if k not in output_res:
                     output_res[k] = []
                 output_res[k].append(eval_res.generations[0][k])

             for scoring_fn in eval_tasks[selected_eval_task].scoring_functions:
                 if scoring_fn not in output_res:
                     output_res[scoring_fn] = []
-                    output_res[scoring_fn].append(
-                        eval_res.scores[scoring_fn].score_rows[0]
-                    )
+                output_res[scoring_fn].append(eval_res.scores[scoring_fn].score_rows[0])

             progress_text_container.write(
                 f"Expand to see current processed result ({i+1}/{len(rows)})"
             )
             results_container.json(eval_res, expanded=2)

         progress_bar.progress(1.0, text="Evaluation complete!")
         # Display results in dataframe
         if output_res:
             output_df = pd.DataFrame(output_res)
             st.subheader("Evaluation Results")
             st.dataframe(output_df)


+def native_evaluation_page():
+    st.set_page_config(page_title="Evaluations (Generation + Scoring)", page_icon="🦙")
+    st.title("📊 Evaluations (Generation + Scoring)")
+
+    # Create tabs
+    # task_tab, candidate_tab, params_tab, run_tab = st.tabs(
+    #     [
+    #         "(1) Select Eval Task",
+    #         "(2) Define Eval Candidate",
+    #         "(3) Define Scoring Parameters",
+    #         "(4) Run Evaluation",
+    #     ]
+    # )
+    select_eval_task_1()
+    define_eval_candidate_2()
+    define_scoring_params_3()
+    run_evaluation_4()
+

 native_evaluation_page()
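Because the steps now live in separate functions (select_eval_task_1, define_eval_candidate_2, define_scoring_params_3, run_evaluation_4), they can no longer share local variables, so the commit hands the chosen task, the task list, and the eval candidate from one step to the next through st.session_state. A minimal sketch of that hand-off, with placeholder step functions and task names rather than the real ones:

import streamlit as st

def select_eval_task():
    # Earlier step: store the selection where later steps can read it.
    selected = st.selectbox("Choose an eval task.", options=["task_a", "task_b"])
    st.session_state["selected_eval_task"] = selected

def run_evaluation():
    # Later step: pull the earlier selection back out of session state.
    selected = st.session_state.get("selected_eval_task")
    if selected is None:
        st.info("Pick an eval task first.")
        return
    if st.button("Run Evaluation"):
        st.write(f"Would evaluate: {selected}")

select_eval_task()
run_evaluation()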