diff --git a/llama_stack/distribution/ui/page/evaluations/native_eval.py b/llama_stack/distribution/ui/page/evaluations/native_eval.py index a79513e8f..772932d1d 100644 --- a/llama_stack/distribution/ui/page/evaluations/native_eval.py +++ b/llama_stack/distribution/ui/page/evaluations/native_eval.py @@ -3,6 +3,11 @@ # # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. + +import json + +import pandas as pd + import streamlit as st from modules.api import llama_stack_api @@ -13,21 +18,208 @@ def native_evaluation_page(): st.set_page_config(page_title="Native Evaluations", page_icon="🦙") st.title("🦙 Llama Stack Native Evaluations") - # Select Eval Tasks - st.subheader("Select Eval Tasks") - eval_tasks = llama_stack_api.client.eval_tasks.list() - eval_tasks = {et.identifier: et for et in eval_tasks} - eval_tasks_names = list(eval_tasks.keys()) - selected_eval_task = st.selectbox( - "Choose an eval task.", - options=eval_tasks_names, - help="Choose an eval task. Each eval task is parameterized by a dataset, and list of scoring functions.", + # Create tabs + task_tab, candidate_tab, params_tab, run_tab = st.tabs( + [ + "(1) Select Eval Task", + "(2) Define Eval Candidate", + "(3) Define Scoring Parameters", + "(4) Run Evaluation", + ] ) - st.json(eval_tasks[selected_eval_task], expanded=True) - # Define Eval Candidate - st.subheader("Define Eval Candidate") - # eval_candidate = {} + with task_tab: + # Select Eval Tasks + eval_tasks = llama_stack_api.client.eval_tasks.list() + eval_tasks = {et.identifier: et for et in eval_tasks} + eval_tasks_names = list(eval_tasks.keys()) + selected_eval_task = st.selectbox( + "Choose an eval task.", + options=eval_tasks_names, + help="Choose an eval task. Each eval task is parameterized by a dataset, and list of scoring functions.", + ) + st.json(eval_tasks[selected_eval_task], expanded=True) + + with candidate_tab: + # Define Eval Candidate + candidate_type = st.radio("Candidate Type", ["model", "agent"]) + + available_models = llama_stack_api.client.models.list() + available_models = [model.identifier for model in available_models] + selected_model = st.selectbox( + "Choose a model", + available_models, + index=0, + ) + + # Sampling Parameters + st.markdown("##### Sampling Parameters") + strategy = st.selectbox( + "Strategy", + ["greedy", "top_p", "top_k"], + index=0, + ) + temperature = st.slider( + "Temperature", + min_value=0.0, + max_value=1.0, + value=0.0, + step=0.1, + help="Controls the randomness of the response. Higher values make the output more creative and unexpected, lower values make it more conservative and predictable", + ) + top_p = st.slider( + "Top P", + min_value=0.0, + max_value=1.0, + value=0.95, + step=0.1, + ) + max_tokens = st.slider( + "Max Tokens", + min_value=0, + max_value=4096, + value=512, + step=1, + help="The maximum number of tokens to generate", + ) + repetition_penalty = st.slider( + "Repetition Penalty", + min_value=1.0, + max_value=2.0, + value=1.0, + step=0.1, + help="Controls the likelihood for generating the same word or phrase multiple times in the same sentence or paragraph. 1 implies no penalty, 2 will strongly discourage model to repeat words or phrases.", + ) + if candidate_type == "model": + eval_candidate = { + "type": "model", + "model": selected_model, + "sampling_params": { + "strategy": strategy, + "temperature": temperature, + "top_p": top_p, + "max_tokens": max_tokens, + "repetition_penalty": repetition_penalty, + }, + } + elif candidate_type == "agent": + system_prompt = st.text_area( + "System Prompt", + value="You are a helpful AI assistant.", + help="Initial instructions given to the AI to set its behavior and context", + ) + tools_json = st.text_area( + "Tools Configuration (JSON)", + value=json.dumps( + [ + { + "type": "brave_search", + "engine": "brave", + "api_key": "ENTER_BRAVE_API_KEY_HERE", + } + ] + ), + help="Enter tool configurations in JSON format. Each tool should have a name, description, and parameters.", + height=200, + ) + try: + tools = json.loads(tools_json) + except json.JSONDecodeError: + st.error("Invalid JSON format for tools configuration") + tools = [] + eval_candidate = { + "type": "agent", + "config": { + "model": selected_model, + "instructions": system_prompt, + "tools": tools, + "tool_choice": "auto", + "tool_prompt_format": "json", + "input_shields": [], + "output_shields": [], + "enable_session_persistence": False, + }, + } + + with params_tab: + st.write("Define scoring function parameters here") + + with run_tab: + dataset_id = eval_tasks[selected_eval_task].dataset_id + rows = llama_stack_api.client.datasetio.get_rows_paginated( + dataset_id=dataset_id, + rows_in_page=-1, + ) + total_rows = len(rows.rows) + # Add number of examples control + num_rows = st.number_input( + "Number of Examples to Evaluate", + min_value=1, + max_value=total_rows, + value=5, + help="Number of examples from the dataset to evaluate. ", + ) + + eval_task_config = { + "type": "benchmark", + "eval_candidate": eval_candidate, + "scoring_params": {}, + } + st.json(eval_tasks[selected_eval_task], expanded=True) + st.json(eval_task_config, expanded=True) + + # Add run button and handle evaluation + if st.button("Run Evaluation"): + progress_text = "Running evaluation..." + progress_bar = st.progress(0, text=progress_text) + rows = rows.rows + if num_rows < total_rows: + rows = rows[:num_rows] + + # Create separate containers for progress text and results + progress_text_container = st.empty() + results_container = st.empty() + output_res = {} + for i, r in enumerate(rows): + # Update progress + progress = i / len(rows) + progress_bar.progress(progress, text=progress_text) + # Run evaluation for current row + eval_res = llama_stack_api.client.eval.evaluate_rows( + task_id=selected_eval_task, + input_rows=[r], + scoring_functions=eval_tasks[selected_eval_task].scoring_functions, + task_config=eval_task_config, + ) + + for k in r.keys(): + if k not in output_res: + output_res[k] = [] + output_res[k].append(r[k]) + + for k in eval_res.generations[0].keys(): + if k not in output_res: + output_res[k] = [] + output_res[k].append(eval_res.generations[0][k]) + + for scoring_fn in eval_tasks[selected_eval_task].scoring_functions: + if scoring_fn not in output_res: + output_res[scoring_fn] = [] + output_res[scoring_fn].append( + eval_res.scores[scoring_fn].score_rows[0] + ) + + progress_text_container.write( + f"Expand to see current processed result ({i+1}/{len(rows)})" + ) + results_container.json(eval_res, expanded=2) + + progress_bar.progress(1.0, text="Evaluation complete!") + # Display results in dataframe + if output_res: + output_df = pd.DataFrame(output_res) + st.subheader("Evaluation Results") + st.dataframe(output_df) native_evaluation_page()