Mirror of https://github.com/meta-llama/llama-stack.git, synced 2025-08-02 08:44:44 +00:00
native eval
parent de2ab1243a
commit b59810cd9a
1 changed file with 205 additions and 13 deletions
@@ -3,6 +3,11 @@
 #
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 
+import json
+
+import pandas as pd
+
 import streamlit as st
+
 from modules.api import llama_stack_api
@@ -13,21 +18,208 @@ def native_evaluation_page():
     st.set_page_config(page_title="Native Evaluations", page_icon="🦙")
     st.title("🦙 Llama Stack Native Evaluations")
 
-    # Select Eval Tasks
-    st.subheader("Select Eval Tasks")
-    eval_tasks = llama_stack_api.client.eval_tasks.list()
-    eval_tasks = {et.identifier: et for et in eval_tasks}
-    eval_tasks_names = list(eval_tasks.keys())
-    selected_eval_task = st.selectbox(
-        "Choose an eval task.",
-        options=eval_tasks_names,
-        help="Choose an eval task. Each eval task is parameterized by a dataset, and list of scoring functions.",
+    # Create tabs
+    task_tab, candidate_tab, params_tab, run_tab = st.tabs(
+        [
+            "(1) Select Eval Task",
+            "(2) Define Eval Candidate",
+            "(3) Define Scoring Parameters",
+            "(4) Run Evaluation",
+        ]
     )
-    st.json(eval_tasks[selected_eval_task], expanded=True)
 
-    # Define Eval Candidate
-    st.subheader("Define Eval Candidate")
-    # eval_candidate = {}
+    with task_tab:
+        # Select Eval Tasks
+        eval_tasks = llama_stack_api.client.eval_tasks.list()
+        eval_tasks = {et.identifier: et for et in eval_tasks}
+        eval_tasks_names = list(eval_tasks.keys())
+        selected_eval_task = st.selectbox(
+            "Choose an eval task.",
+            options=eval_tasks_names,
+            help="Choose an eval task. Each eval task is parameterized by a dataset, and list of scoring functions.",
+        )
+        st.json(eval_tasks[selected_eval_task], expanded=True)
+
+    with candidate_tab:
+        # Define Eval Candidate
+        candidate_type = st.radio("Candidate Type", ["model", "agent"])
+
+        available_models = llama_stack_api.client.models.list()
+        available_models = [model.identifier for model in available_models]
+        selected_model = st.selectbox(
+            "Choose a model",
+            available_models,
+            index=0,
+        )
+
+        # Sampling Parameters
+        st.markdown("##### Sampling Parameters")
+        strategy = st.selectbox(
+            "Strategy",
+            ["greedy", "top_p", "top_k"],
+            index=0,
+        )
+        temperature = st.slider(
+            "Temperature",
+            min_value=0.0,
+            max_value=1.0,
+            value=0.0,
+            step=0.1,
+            help="Controls the randomness of the response. Higher values make the output more creative and unexpected, lower values make it more conservative and predictable",
+        )
+        top_p = st.slider(
+            "Top P",
+            min_value=0.0,
+            max_value=1.0,
+            value=0.95,
+            step=0.1,
+        )
+        max_tokens = st.slider(
+            "Max Tokens",
+            min_value=0,
+            max_value=4096,
+            value=512,
+            step=1,
+            help="The maximum number of tokens to generate",
+        )
+        repetition_penalty = st.slider(
+            "Repetition Penalty",
+            min_value=1.0,
+            max_value=2.0,
+            value=1.0,
+            step=0.1,
+            help="Controls the likelihood for generating the same word or phrase multiple times in the same sentence or paragraph. 1 implies no penalty, 2 will strongly discourage model to repeat words or phrases.",
+        )
+        if candidate_type == "model":
+            eval_candidate = {
+                "type": "model",
+                "model": selected_model,
+                "sampling_params": {
+                    "strategy": strategy,
+                    "temperature": temperature,
+                    "top_p": top_p,
+                    "max_tokens": max_tokens,
+                    "repetition_penalty": repetition_penalty,
+                },
+            }
+        elif candidate_type == "agent":
+            system_prompt = st.text_area(
+                "System Prompt",
+                value="You are a helpful AI assistant.",
+                help="Initial instructions given to the AI to set its behavior and context",
+            )
+            tools_json = st.text_area(
+                "Tools Configuration (JSON)",
+                value=json.dumps(
+                    [
+                        {
+                            "type": "brave_search",
+                            "engine": "brave",
+                            "api_key": "ENTER_BRAVE_API_KEY_HERE",
+                        }
+                    ]
+                ),
+                help="Enter tool configurations in JSON format. Each tool should have a name, description, and parameters.",
+                height=200,
+            )
+            try:
+                tools = json.loads(tools_json)
+            except json.JSONDecodeError:
+                st.error("Invalid JSON format for tools configuration")
+                tools = []
+            eval_candidate = {
+                "type": "agent",
+                "config": {
+                    "model": selected_model,
+                    "instructions": system_prompt,
+                    "tools": tools,
+                    "tool_choice": "auto",
+                    "tool_prompt_format": "json",
+                    "input_shields": [],
+                    "output_shields": [],
+                    "enable_session_persistence": False,
+                },
+            }
+
+    with params_tab:
+        st.write("Define scoring function parameters here")
+
+    with run_tab:
+        dataset_id = eval_tasks[selected_eval_task].dataset_id
+        rows = llama_stack_api.client.datasetio.get_rows_paginated(
+            dataset_id=dataset_id,
+            rows_in_page=-1,
+        )
+        total_rows = len(rows.rows)
+        # Add number of examples control
+        num_rows = st.number_input(
+            "Number of Examples to Evaluate",
+            min_value=1,
+            max_value=total_rows,
+            value=5,
+            help="Number of examples from the dataset to evaluate. ",
+        )
+
+        eval_task_config = {
+            "type": "benchmark",
+            "eval_candidate": eval_candidate,
+            "scoring_params": {},
+        }
+        st.json(eval_tasks[selected_eval_task], expanded=True)
+        st.json(eval_task_config, expanded=True)
+
+        # Add run button and handle evaluation
+        if st.button("Run Evaluation"):
+            progress_text = "Running evaluation..."
+            progress_bar = st.progress(0, text=progress_text)
+            rows = rows.rows
+            if num_rows < total_rows:
+                rows = rows[:num_rows]
+
+            # Create separate containers for progress text and results
+            progress_text_container = st.empty()
+            results_container = st.empty()
+            output_res = {}
+            for i, r in enumerate(rows):
+                # Update progress
+                progress = i / len(rows)
+                progress_bar.progress(progress, text=progress_text)
+                # Run evaluation for current row
+                eval_res = llama_stack_api.client.eval.evaluate_rows(
+                    task_id=selected_eval_task,
+                    input_rows=[r],
+                    scoring_functions=eval_tasks[selected_eval_task].scoring_functions,
+                    task_config=eval_task_config,
+                )
+
+                for k in r.keys():
+                    if k not in output_res:
+                        output_res[k] = []
+                    output_res[k].append(r[k])
+
+                for k in eval_res.generations[0].keys():
+                    if k not in output_res:
+                        output_res[k] = []
+                    output_res[k].append(eval_res.generations[0][k])
+
+                for scoring_fn in eval_tasks[selected_eval_task].scoring_functions:
+                    if scoring_fn not in output_res:
+                        output_res[scoring_fn] = []
+                    output_res[scoring_fn].append(
+                        eval_res.scores[scoring_fn].score_rows[0]
+                    )
+
+                progress_text_container.write(
+                    f"Expand to see current processed result ({i+1}/{len(rows)})"
+                )
+                results_container.json(eval_res, expanded=2)
+
+            progress_bar.progress(1.0, text="Evaluation complete!")
+            # Display results in dataframe
+            if output_res:
+                output_df = pd.DataFrame(output_res)
+                st.subheader("Evaluation Results")
+                st.dataframe(output_df)
 
 
 native_evaluation_page()
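
Reviewer note: the Run Evaluation tab above boils down to one evaluate_rows call per dataset row. Below is a minimal sketch of the equivalent flow outside Streamlit, reusing only the calls that appear in this diff; it assumes the same modules.api wrapper, at least one registered eval task, and a model-type candidate. The model identifier and row count are illustrative, not part of this commit.

from modules.api import llama_stack_api

# Pick a registered eval task, exactly as the page's task tab does.
eval_tasks = {et.identifier: et for et in llama_stack_api.client.eval_tasks.list()}
task_id = next(iter(eval_tasks))  # illustrative: first registered task
eval_task = eval_tasks[task_id]

# Benchmark config with a model candidate, mirroring the UI defaults.
eval_task_config = {
    "type": "benchmark",
    "eval_candidate": {
        "type": "model",
        "model": "Llama3.1-8B-Instruct",  # illustrative model identifier
        "sampling_params": {
            "strategy": "greedy",
            "temperature": 0.0,
            "top_p": 0.95,
            "max_tokens": 512,
            "repetition_penalty": 1.0,
        },
    },
    "scoring_params": {},
}

# Fetch a few rows from the task's dataset and score them one at a time.
rows = llama_stack_api.client.datasetio.get_rows_paginated(
    dataset_id=eval_task.dataset_id,
    rows_in_page=5,
)
for row in rows.rows:
    eval_res = llama_stack_api.client.eval.evaluate_rows(
        task_id=task_id,
        input_rows=[row],
        scoring_functions=eval_task.scoring_functions,
        task_config=eval_task_config,
    )
    print(eval_res.scores)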