# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

import json

import pandas as pd
import streamlit as st

from modules.api import llama_stack_api
from modules.utils import process_dataset


def application_evaluation_page():
    st.set_page_config(page_title="Evaluations (Scoring)", page_icon="🦙")
    st.title("📊 Evaluations (Scoring)")

    # File uploader
    uploaded_file = st.file_uploader("Upload Dataset", type=["csv", "xlsx", "xls"])

    if uploaded_file is None:
        st.error("No file uploaded")
        return

    # Process uploaded file
    df = process_dataset(uploaded_file)
    if df is None:
        st.error("Error processing file")
        return

    # Display dataset information
    st.success("Dataset loaded successfully!")

    # Display dataframe preview
    st.subheader("Dataset Preview")
    st.dataframe(df)

    # Select Scoring Functions to Run Evaluation On
    st.subheader("Select Scoring Functions")
    scoring_functions = llama_stack_api.client.scoring_functions.list()
    scoring_functions = {sf.identifier: sf for sf in scoring_functions}
    scoring_functions_names = list(scoring_functions.keys())
    selected_scoring_functions = st.multiselect(
        "Choose one or more scoring functions",
        options=scoring_functions_names,
        help="Choose one or more scoring functions.",
    )

    available_models = llama_stack_api.client.models.list()
    available_models = [m.identifier for m in available_models]

    scoring_params = {}
    if selected_scoring_functions:
        st.write("Selected:")
        for scoring_fn_id in selected_scoring_functions:
            scoring_fn = scoring_functions[scoring_fn_id]
            st.write(f"- **{scoring_fn_id}**: {scoring_fn.description}")
            new_params = None
            if scoring_fn.params:
                new_params = {}
                for param_name, param_value in scoring_fn.params.to_dict().items():
                    if param_name == "type":
                        new_params[param_name] = param_value
                        continue

                    if param_name == "judge_model":
                        value = st.selectbox(
                            f"Select **{param_name}** for {scoring_fn_id}",
                            options=available_models,
                            index=0,
                            key=f"{scoring_fn_id}_{param_name}",
                        )
                        new_params[param_name] = value
                    else:
                        value = st.text_area(
                            f"Enter value for **{param_name}** in {scoring_fn_id} in valid JSON format",
                            value=json.dumps(param_value, indent=2),
                            height=80,
                        )
                        try:
                            new_params[param_name] = json.loads(value)
                        except json.JSONDecodeError:
                            st.error(f"Invalid JSON for **{param_name}** in {scoring_fn_id}")

                st.json(new_params)

            scoring_params[scoring_fn_id] = new_params

        # Add run evaluation button & slider
        total_rows = len(df)
        num_rows = st.slider("Number of rows to evaluate", 1, total_rows, total_rows)

        if st.button("Run Evaluation"):
            progress_text = "Running evaluation..."
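            # Streamlit reruns this script top-to-bottom on every widget
            # interaction; the progress bar and containers below are created
            # once per run and then updated in place as each row is scored.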
            progress_bar = st.progress(0, text=progress_text)

            rows = df.to_dict(orient="records")
            if num_rows < total_rows:
                rows = rows[:num_rows]

            # Create separate containers for progress text and results
            progress_text_container = st.empty()
            results_container = st.empty()

            output_res = {}
            for i, r in enumerate(rows):
                # Update progress
                progress = i / len(rows)
                progress_bar.progress(progress, text=progress_text)

                # Run evaluation for current row
                score_res = llama_stack_api.run_scoring(
                    r,
                    scoring_function_ids=selected_scoring_functions,
                    scoring_params=scoring_params,
                )

                # Collect the input columns for the results dataframe
                for k in r.keys():
                    if k not in output_res:
                        output_res[k] = []
                    output_res[k].append(r[k])

                # Collect one score row per selected scoring function
                for fn_id in selected_scoring_functions:
                    if fn_id not in output_res:
                        output_res[fn_id] = []
                    output_res[fn_id].append(score_res.results[fn_id].score_rows[0])

                # Display current row results using separate containers
                progress_text_container.write(
                    f"Expand to see current processed result ({i + 1}/{len(rows)})"
                )
                results_container.json(
                    score_res.to_json(),
                    expanded=2,
                )

            progress_bar.progress(1.0, text="Evaluation complete!")

            # Display results in dataframe
            if output_res:
                output_df = pd.DataFrame(output_res)
                st.subheader("Evaluation Results")
                st.dataframe(output_df)


application_evaluation_page()