# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

import json

import pandas as pd
import streamlit as st

from modules.api import llama_stack_api


def native_evaluation_page():
    st.set_page_config(page_title="Native Evaluations", page_icon="🦙")
    st.title("🦙 Llama Stack Native Evaluations")

    # Create tabs for the four evaluation steps
    task_tab, candidate_tab, params_tab, run_tab = st.tabs(
        [
            "(1) Select Eval Task",
            "(2) Define Eval Candidate",
            "(3) Define Scoring Parameters",
            "(4) Run Evaluation",
        ]
    )

    with task_tab:
        # Select an eval task
        eval_tasks = llama_stack_api.client.eval_tasks.list()
        eval_tasks = {et.identifier: et for et in eval_tasks}
        eval_tasks_names = list(eval_tasks.keys())
        selected_eval_task = st.selectbox(
            "Choose an eval task.",
            options=eval_tasks_names,
            help="Choose an eval task. Each eval task is parameterized by a dataset and a list of scoring functions.",
        )
        st.json(eval_tasks[selected_eval_task], expanded=True)

    with candidate_tab:
        # Define the eval candidate (model or agent)
        candidate_type = st.radio("Candidate Type", ["model", "agent"])

        available_models = llama_stack_api.client.models.list()
        available_models = [model.identifier for model in available_models]
        selected_model = st.selectbox(
            "Choose a model",
            available_models,
            index=0,
        )

        # Sampling Parameters
        st.markdown("##### Sampling Parameters")
        strategy = st.selectbox(
            "Strategy",
            ["greedy", "top_p", "top_k"],
            index=0,
        )
        temperature = st.slider(
            "Temperature",
            min_value=0.0,
            max_value=1.0,
            value=0.0,
            step=0.1,
            help="Controls the randomness of the response. Higher values make the output more creative and unexpected; lower values make it more conservative and predictable.",
        )
        top_p = st.slider(
            "Top P",
            min_value=0.0,
            max_value=1.0,
            value=0.95,
            step=0.1,
        )
        max_tokens = st.slider(
            "Max Tokens",
            min_value=0,
            max_value=4096,
            value=512,
            step=1,
            help="The maximum number of tokens to generate",
        )
        repetition_penalty = st.slider(
            "Repetition Penalty",
            min_value=1.0,
            max_value=2.0,
            value=1.0,
            step=0.1,
            help="Controls how strongly the model is penalized for repeating the same word or phrase. 1 applies no penalty; 2 strongly discourages repetition.",
        )

        if candidate_type == "model":
            # Model candidate: evaluate the base model directly with the chosen sampling parameters
            eval_candidate = {
                "type": "model",
                "model": selected_model,
                "sampling_params": {
                    "strategy": strategy,
                    "temperature": temperature,
                    "top_p": top_p,
                    "max_tokens": max_tokens,
                    "repetition_penalty": repetition_penalty,
                },
            }
        elif candidate_type == "agent":
            # Agent candidate: evaluate an agent configured with a system prompt and tools
            system_prompt = st.text_area(
                "System Prompt",
                value="You are a helpful AI assistant.",
                help="Initial instructions given to the AI to set its behavior and context",
            )
            tools_json = st.text_area(
                "Tools Configuration (JSON)",
                value=json.dumps(
                    [
                        {
                            "type": "brave_search",
                            "engine": "brave",
                            "api_key": "ENTER_BRAVE_API_KEY_HERE",
                        }
                    ]
                ),
                help="Enter tool configurations in JSON format. Each tool should have a name, description, and parameters.",
                height=200,
            )
            try:
                tools = json.loads(tools_json)
            except json.JSONDecodeError:
                st.error("Invalid JSON format for tools configuration")
                tools = []

            eval_candidate = {
                "type": "agent",
                "config": {
                    "model": selected_model,
                    "instructions": system_prompt,
                    "tools": tools,
                    "tool_choice": "auto",
                    "tool_prompt_format": "json",
                    "input_shields": [],
                    "output_shields": [],
                    "enable_session_persistence": False,
                },
            }

    with params_tab:
        st.write("(Optional) Define scoring function parameters here")

    with run_tab:
        # Load the dataset backing the selected eval task
        dataset_id = eval_tasks[selected_eval_task].dataset_id
        rows = llama_stack_api.client.datasetio.get_rows_paginated(
            dataset_id=dataset_id,
            rows_in_page=-1,
        )
        total_rows = len(rows.rows)
        # Control for the number of examples to evaluate
        num_rows = st.number_input(
            "Number of Examples to Evaluate",
            min_value=1,
            max_value=total_rows,
            value=5,
            help="Number of examples from the dataset to evaluate.",
        )

        eval_task_config = {
            "type": "benchmark",
            "eval_candidate": eval_candidate,
            "scoring_params": {},
        }

        st.json(eval_tasks[selected_eval_task], expanded=True)
        st.json(eval_task_config, expanded=True)

        # Run button and evaluation loop
        if st.button("Run Evaluation"):
            progress_text = "Running evaluation..."
            progress_bar = st.progress(0, text=progress_text)
            rows = rows.rows
            if num_rows < total_rows:
                rows = rows[:num_rows]

            # Separate containers for progress text and intermediate results
            progress_text_container = st.empty()
            results_container = st.empty()
            output_res = {}
            for i, r in enumerate(rows):
                # Update progress
                progress = i / len(rows)
                progress_bar.progress(progress, text=progress_text)

                # Run evaluation for the current row
                eval_res = llama_stack_api.client.eval.evaluate_rows(
                    task_id=selected_eval_task,
                    input_rows=[r],
                    scoring_functions=eval_tasks[selected_eval_task].scoring_functions,
                    task_config=eval_task_config,
                )

                # Accumulate input columns, generations, and per-scoring-function scores
                for k in r.keys():
                    if k not in output_res:
                        output_res[k] = []
                    output_res[k].append(r[k])

                for k in eval_res.generations[0].keys():
                    if k not in output_res:
                        output_res[k] = []
                    output_res[k].append(eval_res.generations[0][k])

                for scoring_fn in eval_tasks[selected_eval_task].scoring_functions:
                    if scoring_fn not in output_res:
                        output_res[scoring_fn] = []
                    output_res[scoring_fn].append(
                        eval_res.scores[scoring_fn].score_rows[0]
                    )

                progress_text_container.write(
                    f"Expand to see current processed result ({i + 1}/{len(rows)})"
                )
                results_container.json(eval_res, expanded=2)

            progress_bar.progress(1.0, text="Evaluation complete!")

            # Display aggregated results in a dataframe
            if output_res:
                output_df = pd.DataFrame(output_res)
                st.subheader("Evaluation Results")
                st.dataframe(output_df)


native_evaluation_page()