native eval

2025-12-17 05:32:36 +00:00 · 2024-12-02 15:38:58 -08:00 · 2024-12-02 15:38:58 -08:00 · b59810cd9a
commit b59810cd9a
parent de2ab1243a
1 changed file with 205 additions and 13 deletions
--- a/llama_stack/distribution/ui/page/evaluations/native_eval.py
+++ b/llama_stack/distribution/ui/page/evaluations/native_eval.py
@@ -3,6 +3,11 @@
 #
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
+
+import json
+
+import pandas as pd
+
 import streamlit as st

 from modules.api import llama_stack_api
@@ -13,21 +18,208 @@ def native_evaluation_page():
    st.set_page_config(page_title="Native Evaluations", page_icon="🦙")
    st.title("🦙 Llama Stack Native Evaluations")

-    # Select Eval Tasks
-    st.subheader("Select Eval Tasks")
-    eval_tasks = llama_stack_api.client.eval_tasks.list()
-    eval_tasks = {et.identifier: et for et in eval_tasks}
-    eval_tasks_names = list(eval_tasks.keys())
-    selected_eval_task = st.selectbox(
-        "Choose an eval task.",
-        options=eval_tasks_names,
-        help="Choose an eval task. Each eval task is parameterized by a dataset, and list of scoring functions.",
+    # Create tabs
+    task_tab, candidate_tab, params_tab, run_tab = st.tabs(
+        [
+            "(1) Select Eval Task",
+            "(2) Define Eval Candidate",
+            "(3) Define Scoring Parameters",
+            "(4) Run Evaluation",
+        ]
    )
-    st.json(eval_tasks[selected_eval_task], expanded=True)

-    # Define Eval Candidate
-    st.subheader("Define Eval Candidate")
-    # eval_candidate = {}
+    with task_tab:
+        # Select Eval Tasks
+        eval_tasks = llama_stack_api.client.eval_tasks.list()
+        eval_tasks = {et.identifier: et for et in eval_tasks}
+        eval_tasks_names = list(eval_tasks.keys())
+        selected_eval_task = st.selectbox(
+            "Choose an eval task.",
+            options=eval_tasks_names,
+            help="Choose an eval task. Each eval task is parameterized by a dataset, and list of scoring functions.",
+        )
+        st.json(eval_tasks[selected_eval_task], expanded=True)
+
+    with candidate_tab:
+        # Define Eval Candidate
+        candidate_type = st.radio("Candidate Type", ["model", "agent"])
+
+        available_models = llama_stack_api.client.models.list()
+        available_models = [model.identifier for model in available_models]
+        selected_model = st.selectbox(
+            "Choose a model",
+            available_models,
+            index=0,
+        )
+
+        # Sampling Parameters
+        st.markdown("##### Sampling Parameters")
+        strategy = st.selectbox(
+            "Strategy",
+            ["greedy", "top_p", "top_k"],
+            index=0,
+        )
+        temperature = st.slider(
+            "Temperature",
+            min_value=0.0,
+            max_value=1.0,
+            value=0.0,
+            step=0.1,
+            help="Controls the randomness of the response. Higher values make the output more creative and unexpected, lower values make it more conservative and predictable",
+        )
+        top_p = st.slider(
+            "Top P",
+            min_value=0.0,
+            max_value=1.0,
+            value=0.95,
+            step=0.1,
+        )
+        max_tokens = st.slider(
+            "Max Tokens",
+            min_value=0,
+            max_value=4096,
+            value=512,
+            step=1,
+            help="The maximum number of tokens to generate",
+        )
+        repetition_penalty = st.slider(
+            "Repetition Penalty",
+            min_value=1.0,
+            max_value=2.0,
+            value=1.0,
+            step=0.1,
+            help="Controls the likelihood for generating the same word or phrase multiple times in the same sentence or paragraph. 1 implies no penalty, 2 will strongly discourage model to repeat words or phrases.",
+        )
+        if candidate_type == "model":
+            eval_candidate = {
+                "type": "model",
+                "model": selected_model,
+                "sampling_params": {
+                    "strategy": strategy,
+                    "temperature": temperature,
+                    "top_p": top_p,
+                    "max_tokens": max_tokens,
+                    "repetition_penalty": repetition_penalty,
+                },
+            }
+        elif candidate_type == "agent":
+            system_prompt = st.text_area(
+                "System Prompt",
+                value="You are a helpful AI assistant.",
+                help="Initial instructions given to the AI to set its behavior and context",
+            )
+            tools_json = st.text_area(
+                "Tools Configuration (JSON)",
+                value=json.dumps(
+                    [
+                        {
+                            "type": "brave_search",
+                            "engine": "brave",
+                            "api_key": "ENTER_BRAVE_API_KEY_HERE",
+                        }
+                    ]
+                ),
+                help="Enter tool configurations in JSON format. Each tool should have a name, description, and parameters.",
+                height=200,
+            )
+            try:
+                tools = json.loads(tools_json)
+            except json.JSONDecodeError:
+                st.error("Invalid JSON format for tools configuration")
+                tools = []
+            eval_candidate = {
+                "type": "agent",
+                "config": {
+                    "model": selected_model,
+                    "instructions": system_prompt,
+                    "tools": tools,
+                    "tool_choice": "auto",
+                    "tool_prompt_format": "json",
+                    "input_shields": [],
+                    "output_shields": [],
+                    "enable_session_persistence": False,
+                },
+            }
+
+    with params_tab:
+        st.write("Define scoring function parameters here")
+
+    with run_tab:
+        dataset_id = eval_tasks[selected_eval_task].dataset_id
+        rows = llama_stack_api.client.datasetio.get_rows_paginated(
+            dataset_id=dataset_id,
+            rows_in_page=-1,
+        )
+        total_rows = len(rows.rows)
+        # Add number of examples control
+        num_rows = st.number_input(
+            "Number of Examples to Evaluate",
+            min_value=1,
+            max_value=total_rows,
+            value=5,
+            help="Number of examples from the dataset to evaluate. ",
+        )
+
+        eval_task_config = {
+            "type": "benchmark",
+            "eval_candidate": eval_candidate,
+            "scoring_params": {},
+        }
+        st.json(eval_tasks[selected_eval_task], expanded=True)
+        st.json(eval_task_config, expanded=True)
+
+        # Add run button and handle evaluation
+        if st.button("Run Evaluation"):
+            progress_text = "Running evaluation..."
+            progress_bar = st.progress(0, text=progress_text)
+            rows = rows.rows
+            if num_rows < total_rows:
+                rows = rows[:num_rows]
+
+            # Create separate containers for progress text and results
+            progress_text_container = st.empty()
+            results_container = st.empty()
+            output_res = {}
+            for i, r in enumerate(rows):
+                # Update progress
+                progress = i / len(rows)
+                progress_bar.progress(progress, text=progress_text)
+                # Run evaluation for current row
+                eval_res = llama_stack_api.client.eval.evaluate_rows(
+                    task_id=selected_eval_task,
+                    input_rows=[r],
+                    scoring_functions=eval_tasks[selected_eval_task].scoring_functions,
+                    task_config=eval_task_config,
+                )
+
+                for k in r.keys():
+                    if k not in output_res:
+                        output_res[k] = []
+                    output_res[k].append(r[k])
+
+                for k in eval_res.generations[0].keys():
+                    if k not in output_res:
+                        output_res[k] = []
+                    output_res[k].append(eval_res.generations[0][k])
+
+                for scoring_fn in eval_tasks[selected_eval_task].scoring_functions:
+                    if scoring_fn not in output_res:
+                        output_res[scoring_fn] = []
+                    output_res[scoring_fn].append(
+                        eval_res.scores[scoring_fn].score_rows[0]
+                    )
+
+                progress_text_container.write(
+                    f"Expand to see current processed result ({i+1}/{len(rows)})"
+                )
+                results_container.json(eval_res, expanded=2)
+
+            progress_bar.progress(1.0, text="Evaluation complete!")
+            # Display results in dataframe
+            if output_res:
+                output_df = pd.DataFrame(output_res)
+                st.subheader("Evaluation Results")
+                st.dataframe(output_df)


 native_evaluation_page()