# llama-stack-mirror/llama_stack/distribution/ui/page/evaluations/native_eval.py

# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

import json

import pandas as pd
import streamlit as st
from modules.api import llama_stack_api


def select_benchmark_1():
    """Render step 1: let the user choose a benchmark (eval task) and confirm it.

    Lists benchmarks through ``llama_stack_api.client.benchmarks.list()`` and
    writes to ``st.session_state``:

    * ``"selected_benchmark"`` - identifier picked in the select box,
    * ``"benchmarks"`` - mapping of identifier to benchmark object,
    * ``"selected_benchmark_1_next"`` - set to ``True`` once "Confirm" is clicked.

    When no benchmarks are returned, a warning is shown and nothing is written
    to the session state.
    """
    st.subheader("1. Choose An Eval Task")
    benchmarks = {benchmark.identifier: benchmark for benchmark in llama_stack_api.client.benchmarks.list()}
    if not benchmarks:
        # With no options the select box yields None, and the lookup below
        # would raise KeyError; stop here with an actionable message instead.
        st.warning("No eval tasks are available. Register a benchmark before running an evaluation.")
        return

    selected_benchmark = st.selectbox(
        "Choose an eval task.",
        options=list(benchmarks),
        help="Choose an eval task. Each eval task is parameterized by a dataset, and list of scoring functions.",
    )
    with st.expander("View Eval Task"):
        st.json(benchmarks[selected_benchmark], expanded=True)

    # Later steps read the selection back from the session state.
    st.session_state["selected_benchmark"] = selected_benchmark
    st.session_state["benchmarks"] = benchmarks
    if st.button("Confirm", key="confirm_1"):
        st.session_state["selected_benchmark_1_next"] = True


def _build_sampling_strategy(temperature, top_p):
    """Return the sampling strategy dict: top-p when temperature > 0, greedy otherwise."""
    if temperature > 0.0:
        return {
            "type": "top_p",
            "temperature": temperature,
            "top_p": top_p,
        }
    return {"type": "greedy"}


def _parse_tools_config(tools_json):
    """Parse the tools JSON text; return ``(tools, error)`` with ``error`` None on success.

    On failure ``tools`` is an empty list and ``error`` is a user-facing message.
    """
    try:
        tools = json.loads(tools_json)
    except json.JSONDecodeError:
        return [], "Invalid JSON format for tools configuration"
    if not isinstance(tools, list):
        # Valid JSON such as an object or a string is still not a usable tool list.
        return [], "Tools configuration must be a JSON list of tool definitions"
    return tools, None


def define_eval_candidate_2():
    """Render step 2: collect the eval candidate (model or agent) configuration.

    Does nothing until step 1 has been confirmed (``"selected_benchmark_1_next"``
    in ``st.session_state``). The resulting candidate dict is stored under
    ``st.session_state["eval_candidate"]``; clicking "Confirm" sets
    ``"selected_eval_candidate_2_next"`` to ``True``.
    """
    if not st.session_state.get("selected_benchmark_1_next", None):
        return

    st.subheader("2. Define Eval Candidate")
    st.info(
        """
        Define the configurations for the evaluation candidate model or agent used for generation.
        Select "model" if you want to run generation with inference API, or "agent" if you want to run generation with agent API through specifying AgentConfig.
        """
    )
    with st.expander("Define Eval Candidate", expanded=True):
        # Define Eval Candidate
        candidate_type = st.radio("Candidate Type", ["model", "agent"])

        available_models = [model.identifier for model in llama_stack_api.client.models.list()]
        selected_model = st.selectbox(
            "Choose a model",
            available_models,
            index=0,
        )

        # Sampling Parameters
        st.markdown("##### Sampling Parameters")
        temperature = st.slider(
            "Temperature",
            min_value=0.0,
            max_value=1.0,
            value=0.0,
            step=0.1,
            help="Controls the randomness of the response. Higher values make the output more creative and unexpected, lower values make it more conservative and predictable",
        )
        top_p = st.slider(
            "Top P",
            min_value=0.0,
            max_value=1.0,
            value=0.95,
            step=0.1,
        )
        max_tokens = st.slider(
            "Max Tokens",
            min_value=0,
            max_value=4096,
            value=512,
            step=1,
            help="The maximum number of tokens to generate",
        )
        repetition_penalty = st.slider(
            "Repetition Penalty",
            min_value=1.0,
            max_value=2.0,
            value=1.0,
            step=0.1,
            help="Controls the likelihood for generating the same word or phrase multiple times in the same sentence or paragraph. 1 implies no penalty, 2 will strongly discourage model to repeat words or phrases.",
        )
        if candidate_type == "model":
            eval_candidate = {
                "type": "model",
                "model": selected_model,
                "sampling_params": {
                    "strategy": _build_sampling_strategy(temperature, top_p),
                    "max_tokens": max_tokens,
                    "repetition_penalty": repetition_penalty,
                },
            }
        elif candidate_type == "agent":
            # NOTE(review): the sampling sliders above are not forwarded for agent
            # candidates - confirm whether the agent config should carry them.
            system_prompt = st.text_area(
                "System Prompt",
                value="You are a helpful AI assistant.",
                help="Initial instructions given to the AI to set its behavior and context",
            )
            tools_json = st.text_area(
                "Tools Configuration (JSON)",
                value=json.dumps(
                    [
                        {
                            "type": "brave_search",
                            "engine": "brave",
                            "api_key": "ENTER_BRAVE_API_KEY_HERE",
                        }
                    ]
                ),
                help="Enter tool configurations in JSON format. Each tool should have a name, description, and parameters.",
                height=200,
            )
            tools, tools_error = _parse_tools_config(tools_json)
            if tools_error is not None:
                # Fall back to an empty tool list so the rest of the page still renders.
                st.error(tools_error)
            eval_candidate = {
                "type": "agent",
                "config": {
                    "model": selected_model,
                    "instructions": system_prompt,
                    "tools": tools,
                    "tool_choice": "auto",
                    "tool_prompt_format": "json",
                    "input_shields": [],
                    "output_shields": [],
                    "enable_session_persistence": False,
                },
            }
        st.session_state["eval_candidate"] = eval_candidate

    if st.button("Confirm", key="confirm_2"):
        st.session_state["selected_eval_candidate_2_next"] = True


def run_evaluation_3():
    """Render step 3: review the configuration and run the evaluation row by row.

    Does nothing until step 2 has been confirmed (``"selected_eval_candidate_2_next"``
    in ``st.session_state``). Reads ``"selected_benchmark"``, ``"benchmarks"`` and
    ``"eval_candidate"`` from the session state, fetches the benchmark's dataset
    rows, and on "Run Evaluation" calls ``eval.evaluate_rows`` once per selected
    row. Inputs, generations and scores are collected column-wise and shown as a
    dataframe when the run finishes.
    """
    if not st.session_state.get("selected_eval_candidate_2_next", None):
        return

    st.subheader("3. Run Evaluation")
    # Add info box to explain configurations being used
    st.info(
        """
        Review the configurations that will be used for this evaluation run, make any necessary changes, and then click the "Run Evaluation" button.
        """
    )
    selected_benchmark = st.session_state["selected_benchmark"]
    benchmarks = st.session_state["benchmarks"]
    eval_candidate = st.session_state["eval_candidate"]
    benchmark = benchmarks[selected_benchmark]

    dataset_rows = llama_stack_api.client.datasets.iterrows(
        dataset_id=benchmark.dataset_id,
    ).data
    total_rows = len(dataset_rows)
    if total_rows == 0:
        # number_input cannot be built with min_value=1 > max_value=0, and there
        # is nothing to evaluate anyway.
        st.warning("The dataset for this eval task has no rows to evaluate.")
        return

    # Add number of examples control
    num_rows = st.number_input(
        "Number of Examples to Evaluate",
        min_value=1,
        max_value=total_rows,
        # Clamp the default: a value above max_value makes Streamlit raise.
        value=min(5, total_rows),
        help="Number of examples from the dataset to evaluate. ",
    )

    benchmark_config = {
        "type": "benchmark",
        "eval_candidate": eval_candidate,
        "scoring_params": {},
    }

    with st.expander("View Evaluation Task", expanded=True):
        st.json(benchmark, expanded=True)
    with st.expander("View Evaluation Task Configuration", expanded=True):
        st.json(benchmark_config, expanded=True)

    # Add run button and handle evaluation
    if st.button("Run Evaluation"):
        progress_text = "Running evaluation..."
        progress_bar = st.progress(0, text=progress_text)
        rows = dataset_rows[:num_rows]
        row_count = len(rows)
        scoring_functions = benchmark.scoring_functions

        # Create separate containers for progress text and results
        progress_text_container = st.empty()
        results_container = st.empty()
        # Column name -> list of per-row values, later turned into a dataframe.
        output_res = {}
        for i, row in enumerate(rows):
            # Update progress
            progress_bar.progress(i / row_count, text=progress_text)
            # Run evaluation for current row
            eval_res = llama_stack_api.client.eval.evaluate_rows(
                benchmark_id=selected_benchmark,
                input_rows=[row],
                scoring_functions=scoring_functions,
                benchmark_config=benchmark_config,
            )

            # Only one row was submitted, so index 0 holds its generation and score.
            for key, value in row.items():
                output_res.setdefault(key, []).append(value)

            for key, value in eval_res.generations[0].items():
                output_res.setdefault(key, []).append(value)

            for scoring_fn in scoring_functions:
                output_res.setdefault(scoring_fn, []).append(eval_res.scores[scoring_fn].score_rows[0])

            progress_text_container.write(f"Expand to see current processed result ({i + 1} / {row_count})")
            results_container.json(eval_res, expanded=2)

        progress_bar.progress(1.0, text="Evaluation complete!")
        # Display results in dataframe
        if output_res:
            output_df = pd.DataFrame(output_res)
            st.subheader("Evaluation Results")
            st.dataframe(output_df)


def native_evaluation_page():
    """Set up the page chrome, then render the three evaluation steps in order."""
    st.set_page_config(page_title="Evaluations (Generation + Scoring)", page_icon="🦙")
    st.title("📊 Evaluations (Generation + Scoring)")

    # Each step decides for itself whether to render, based on the session state.
    wizard_steps = (select_benchmark_1, define_eval_candidate_2, run_evaluation_3)
    for render_step in wizard_steps:
        render_step()


# Render the page whenever this module is executed (the call runs at import/exec time).
native_evaluation_page()