forked from phoenix-oss/llama-stack-mirror
		
	This fixes the pre-commit check when running locally (not sure why this was not caught by the CI check):

	```
	> pre-commit run --show-diff-on-failure --color=always --all-files
	trim trailing whitespace.................................................Passed
	check python ast.........................................................Passed
	check for merge conflicts................................................Passed
	check for added large files..............................................Passed
	fix end of files.........................................................Passed
	Insert license in comments...............................................Passed
	flake8...................................................................Failed
	- hook id: flake8
	- exit code: 1

	llama_stack/distribution/ui/page/evaluations/app_eval.py:132:65: E226 missing whitespace around arithmetic operator
	llama_stack/distribution/ui/page/evaluations/native_eval.py:235:61: E226 missing whitespace around arithmetic operator
	llama_stack/providers/utils/telemetry/trace_protocol.py:56:78: E226 missing whitespace around arithmetic operator
	```

	Signed-off-by: Yuan Tang <terrytangyuan@gmail.com>
		
			
				
	
	
		
			148 lines
		
	
	
	
		
			5.4 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			148 lines
		
	
	
	
		
			5.4 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
| # Copyright (c) Meta Platforms, Inc. and affiliates.
 | |
| # All rights reserved.
 | |
| #
 | |
| # This source code is licensed under the terms described in the LICENSE file in
 | |
| # the root directory of this source tree.
 | |
| 
 | |
| import json
 | |
| 
 | |
| import pandas as pd
 | |
| import streamlit as st
 | |
| 
 | |
| from modules.api import llama_stack_api
 | |
| from modules.utils import process_dataset
 | |
| 
 | |
| 
 | |
def _collect_scoring_params(selected_scoring_functions, scoring_functions, available_models):
    """Render one parameter editor per selected scoring function.

    Args:
        selected_scoring_functions: Identifiers picked in the multiselect.
        scoring_functions: Mapping of identifier -> scoring function object.
        available_models: Model identifiers offered for ``judge_model``.

    Returns:
        A ``(scoring_params, params_valid)`` tuple. ``scoring_params`` maps
        each selected identifier to its edited parameter dict (or ``None``
        when the function declares no params). ``params_valid`` is ``False``
        when at least one JSON text area could not be parsed.
    """
    scoring_params = {}
    params_valid = True

    st.write("Selected:")
    for scoring_fn_id in selected_scoring_functions:
        scoring_fn = scoring_functions[scoring_fn_id]
        st.write(f"- **{scoring_fn_id}**: {scoring_fn.description}")

        new_params = None
        if scoring_fn.params:
            new_params = {}
            for param_name, param_value in scoring_fn.params.to_dict().items():
                # "type" is passed through unchanged; it is not user-editable.
                if param_name == "type":
                    new_params[param_name] = param_value
                    continue

                if param_name == "judge_model":
                    new_params[param_name] = st.selectbox(
                        f"Select **{param_name}** for {scoring_fn_id}",
                        options=available_models,
                        index=0,
                        key=f"{scoring_fn_id}_{param_name}",
                    )
                    continue

                raw_value = st.text_area(
                    f"Enter value for **{param_name}** in {scoring_fn_id} in valid JSON format",
                    value=json.dumps(param_value, indent=2),
                    height=80,
                )
                try:
                    new_params[param_name] = json.loads(raw_value)
                except json.JSONDecodeError:
                    # Remember the failure so the caller can refuse to run
                    # with a parameter silently missing.
                    params_valid = False
                    st.error(
                        f"Invalid JSON for **{param_name}** in {scoring_fn_id}"
                    )

            st.json(new_params)
        scoring_params[scoring_fn_id] = new_params

    return scoring_params, params_valid


def _run_evaluation(rows, selected_scoring_functions, scoring_params):
    """Score ``rows`` one at a time and render progress plus a results table.

    Args:
        rows: List of row dicts (one per dataset record) to score.
        selected_scoring_functions: Identifiers of the scoring functions to run.
        scoring_params: Mapping of identifier -> parameter dict (or ``None``).
    """
    progress_text = "Running evaluation..."
    progress_bar = st.progress(0, text=progress_text)

    # Separate placeholders so each row overwrites the previous row's output.
    progress_text_container = st.empty()
    results_container = st.empty()

    total = len(rows)
    output_res = {}
    for i, row in enumerate(rows):
        progress_bar.progress(i / total, text=progress_text)

        score_res = llama_stack_api.run_scoring(
            row,
            scoring_function_ids=selected_scoring_functions,
            scoring_params=scoring_params,
        )

        # Accumulate column-wise: the input columns first, then one column
        # per scoring function holding that row's score entry.
        for column, value in row.items():
            output_res.setdefault(column, []).append(value)
        for fn_id in selected_scoring_functions:
            output_res.setdefault(fn_id, []).append(
                score_res.results[fn_id].score_rows[0]
            )

        progress_text_container.write(
            f"Expand to see current processed result ({i + 1} / {total})"
        )
        results_container.json(
            score_res.to_json(),
            expanded=2,
        )

    progress_bar.progress(1.0, text="Evaluation complete!")

    if output_res:
        st.subheader("Evaluation Results")
        st.dataframe(pd.DataFrame(output_res))


def application_evaluation_page():
    """Streamlit page: upload a dataset and score it with chosen functions.

    Flow: upload a CSV/Excel file, preview it, pick scoring functions, edit
    their parameters, choose how many rows to evaluate, then run scoring row
    by row and display the collected results. Returns ``None``; all output
    is rendered through Streamlit.
    """
    st.set_page_config(page_title="Evaluations (Scoring)", page_icon="🦙")
    st.title("📊 Evaluations (Scoring)")

    uploaded_file = st.file_uploader("Upload Dataset", type=["csv", "xlsx", "xls"])
    if uploaded_file is None:
        st.error("No file uploaded")
        return

    df = process_dataset(uploaded_file)
    if df is None:
        st.error("Error processing file")
        return

    st.success("Dataset loaded successfully!")
    st.subheader("Dataset Preview")
    st.dataframe(df)

    st.subheader("Select Scoring Functions")
    scoring_functions = {
        sf.identifier: sf for sf in llama_stack_api.client.scoring_functions.list()
    }
    selected_scoring_functions = st.multiselect(
        "Choose one or more scoring functions",
        options=list(scoring_functions),
        help="Choose one or more scoring functions.",
    )

    available_models = [m.identifier for m in llama_stack_api.client.models.list()]

    if not selected_scoring_functions:
        return

    scoring_params, params_valid = _collect_scoring_params(
        selected_scoring_functions, scoring_functions, available_models
    )

    total_rows = len(df)
    if total_rows == 0:
        st.warning("Dataset is empty; nothing to evaluate")
        return

    # A slider needs a real range; with a single row there is nothing to pick.
    if total_rows == 1:
        num_rows = 1
    else:
        num_rows = st.slider("Number of rows to evaluate", 1, total_rows, total_rows)

    if not st.button("Run Evaluation"):
        return

    if not params_valid:
        # Do not submit a run with parameters dropped because of bad JSON.
        st.error("Fix the invalid JSON parameters above before running the evaluation")
        return

    # Slice before converting so only the requested rows become dicts.
    rows = df.head(num_rows).to_dict(orient="records")
    _run_evaluation(rows, selected_scoring_functions, scoring_params)


# Streamlit executes this file as a script, so render the page at import time.
application_evaluation_page()
 |