# llama-stack-mirror/llama_stack/distribution/ui/page/evaluations/native_eval.py

# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

import json

import pandas as pd

import streamlit as st

from modules.api import llama_stack_api


def native_evaluation_page():
    """Render the "Evaluations (Generation + Scoring)" Streamlit page.

    The page is organised in four tabs:

    1. Select an eval task from those returned by the Llama Stack client.
    2. Define the eval candidate (a model or an agent) and its settings.
    3. Placeholder for optional scoring-function parameters.
    4. Review the configuration and run the evaluation row by row, showing
       progress, the latest per-row result, and a final results dataframe.

    The function returns early (after showing a warning) when no eval task
    is registered or when the selected task's dataset has no rows, since
    nothing can be evaluated in either case.
    """
    st.set_page_config(page_title="Evaluations (Generation + Scoring)", page_icon="🦙")
    st.title("📊 Evaluations (Generation + Scoring)")

    # Create tabs
    task_tab, candidate_tab, params_tab, run_tab = st.tabs(
        [
            "(1) Select Eval Task",
            "(2) Define Eval Candidate",
            "(3) Define Scoring Parameters",
            "(4) Run Evaluation",
        ]
    )

    with task_tab:
        # Index the registered eval tasks by identifier for lookup below.
        eval_tasks = {
            et.identifier: et for et in llama_stack_api.client.eval_tasks.list()
        }
        if not eval_tasks:
            # With no options the selectbox yields None, and the lookup that
            # follows would fail with a KeyError; stop with a clear message.
            st.warning(
                "No eval tasks are registered. Register an eval task to run an evaluation."
            )
            return

        selected_eval_task = st.selectbox(
            "Choose an eval task.",
            options=list(eval_tasks),
            help="Choose an eval task. Each eval task is parameterized by a dataset, and list of scoring functions.",
        )
        eval_task = eval_tasks[selected_eval_task]
        st.json(eval_task, expanded=True)

    with candidate_tab:
        # Define Eval Candidate
        candidate_type = st.radio("Candidate Type", ["model", "agent"])

        available_models = [
            model.identifier for model in llama_stack_api.client.models.list()
        ]
        selected_model = st.selectbox(
            "Choose a model",
            available_models,
            index=0,
        )

        # Sampling Parameters (the widgets are always shown; their values are
        # only placed in the candidate config for the "model" candidate type).
        st.markdown("##### Sampling Parameters")
        strategy = st.selectbox(
            "Strategy",
            ["greedy", "top_p", "top_k"],
            index=0,
        )
        temperature = st.slider(
            "Temperature",
            min_value=0.0,
            max_value=1.0,
            value=0.0,
            step=0.1,
            help="Controls the randomness of the response. Higher values make the output more creative and unexpected, lower values make it more conservative and predictable",
        )
        top_p = st.slider(
            "Top P",
            min_value=0.0,
            max_value=1.0,
            value=0.95,
            step=0.1,
        )
        max_tokens = st.slider(
            "Max Tokens",
            min_value=0,
            max_value=4096,
            value=512,
            step=1,
            help="The maximum number of tokens to generate",
        )
        repetition_penalty = st.slider(
            "Repetition Penalty",
            min_value=1.0,
            max_value=2.0,
            value=1.0,
            step=0.1,
            help="Controls the likelihood for generating the same word or phrase multiple times in the same sentence or paragraph. 1 implies no penalty, 2 will strongly discourage model to repeat words or phrases.",
        )
        if candidate_type == "model":
            eval_candidate = {
                "type": "model",
                "model": selected_model,
                "sampling_params": {
                    "strategy": strategy,
                    "temperature": temperature,
                    "top_p": top_p,
                    "max_tokens": max_tokens,
                    "repetition_penalty": repetition_penalty,
                },
            }
        else:
            # The radio only offers "model" and "agent", so this is the agent
            # branch; using `else` guarantees `eval_candidate` is always bound.
            system_prompt = st.text_area(
                "System Prompt",
                value="You are a helpful AI assistant.",
                help="Initial instructions given to the AI to set its behavior and context",
            )
            tools_json = st.text_area(
                "Tools Configuration (JSON)",
                value=json.dumps(
                    [
                        {
                            "type": "brave_search",
                            "engine": "brave",
                            "api_key": "ENTER_BRAVE_API_KEY_HERE",
                        }
                    ]
                ),
                help="Enter tool configurations in JSON format. Each tool should have a name, description, and parameters.",
                height=200,
            )
            try:
                tools = json.loads(tools_json)
            except json.JSONDecodeError:
                # Surface the problem but keep the page usable with no tools.
                st.error("Invalid JSON format for tools configuration")
                tools = []
            eval_candidate = {
                "type": "agent",
                "config": {
                    "model": selected_model,
                    "instructions": system_prompt,
                    "tools": tools,
                    "tool_choice": "auto",
                    "tool_prompt_format": "json",
                    "input_shields": [],
                    "output_shields": [],
                    "enable_session_persistence": False,
                },
            }

    with params_tab:
        st.write("(Optional) Define scoring function parameters here")

    with run_tab:
        # Add info box to explain configurations being used
        st.info(
            """
        Review the configurations that will be used for this evaluation run, make any necessary changes, and then click the "Run Evaluation" button.
        """
        )

        # Fetch every row of the task's dataset (rows_in_page=-1).
        paginated_rows = llama_stack_api.client.datasetio.get_rows_paginated(
            dataset_id=eval_task.dataset_id,
            rows_in_page=-1,
        )
        dataset_rows = paginated_rows.rows
        total_rows = len(dataset_rows)
        if total_rows == 0:
            # number_input cannot be built with min_value=1 > max_value=0.
            st.warning("The dataset for the selected eval task has no rows to evaluate.")
            return

        # Add number of examples control. The default is clamped to the
        # dataset size: Streamlit rejects a default above max_value, which
        # previously broke the page for datasets with fewer than 5 rows.
        num_rows = st.number_input(
            "Number of Examples to Evaluate",
            min_value=1,
            max_value=total_rows,
            value=min(5, total_rows),
            help="Number of examples from the dataset to evaluate. ",
        )

        eval_task_config = {
            "type": "benchmark",
            "eval_candidate": eval_candidate,
            "scoring_params": {},
        }
        st.markdown("##### Evaluation Task")
        st.write("Go back to (1) Select Eval Task to make changes to the eval task. ")
        st.json(eval_task, expanded=True)
        st.markdown("##### Evaluation Task Configuration")
        st.write(
            "Go back to (2) Define Eval Candidate and (3) Define Scoring Parameters to make changes to the configuration. "
        )
        st.json(eval_task_config, expanded=True)

        # Add run button and handle evaluation
        if st.button("Run Evaluation"):
            progress_text = "Running evaluation..."
            progress_bar = st.progress(0, text=progress_text)
            rows_to_evaluate = dataset_rows[:num_rows]
            scoring_functions = eval_task.scoring_functions

            # Create separate containers for progress text and results
            progress_text_container = st.empty()
            results_container = st.empty()

            # Column name -> list of per-row values, in first-seen order.
            output_res = {}
            for i, row in enumerate(rows_to_evaluate):
                # Update progress
                progress_bar.progress(i / len(rows_to_evaluate), text=progress_text)

                # Run evaluation for current row
                eval_res = llama_stack_api.client.eval.evaluate_rows(
                    task_id=selected_eval_task,
                    input_rows=[row],
                    scoring_functions=scoring_functions,
                    task_config=eval_task_config,
                )

                # Input columns, then generated columns, then one column per
                # scoring function (a single row is sent, hence index 0).
                for key, value in row.items():
                    output_res.setdefault(key, []).append(value)

                for key, value in eval_res.generations[0].items():
                    output_res.setdefault(key, []).append(value)

                for scoring_fn in scoring_functions:
                    output_res.setdefault(scoring_fn, []).append(
                        eval_res.scores[scoring_fn].score_rows[0]
                    )

                progress_text_container.write(
                    f"Expand to see current processed result ({i+1}/{len(rows_to_evaluate)})"
                )
                results_container.json(eval_res, expanded=2)

            progress_bar.progress(1.0, text="Evaluation complete!")
            # Display results in dataframe
            if output_res:
                output_df = pd.DataFrame(output_res)
                st.subheader("Evaluation Results")
                st.dataframe(output_df)


# Render the page whenever this module is executed: the call is made at module
# level with no `__main__` guard.
# NOTE(review): presumably required because Streamlit runs page files as
# scripts — confirm before wrapping this in a guard.
native_evaluation_page()