forked from phoenix-oss/llama-stack-mirror
		
	# What does this PR do? - Update `/eval-tasks` to `/benchmarks` - ⚠️ Remove differentiation between `app` vs. `benchmark` eval task config. Now we only have `BenchmarkConfig`. The overloaded `benchmark` is confusing and does not add any value. Backward compatibility is preserved because the "type" is not used anywhere. [//]: # (If resolving an issue, uncomment and update the line below) [//]: # (Closes #[issue-number]) ## Test Plan - This change is backward compatible - Run notebook test with ``` pytest -v -s --nbval-lax ./docs/getting_started.ipynb pytest -v -s --nbval-lax ./docs/notebooks/Llama_Stack_Benchmark_Evals.ipynb ``` <img width="846" alt="image" src="https://github.com/user-attachments/assets/d2fc06a7-593a-444f-bc1f-10ab9b0c843d" /> [//]: # (## Documentation) [//]: # (- [ ] Added a Changelog entry if the change is significant) --------- Signed-off-by: Ihar Hrachyshka <ihar.hrachyshka@gmail.com> Signed-off-by: Ben Browning <bbrownin@redhat.com> Signed-off-by: Sébastien Han <seb@redhat.com> Signed-off-by: reidliu <reid201711@gmail.com> Co-authored-by: Ihar Hrachyshka <ihar.hrachyshka@gmail.com> Co-authored-by: Ben Browning <ben324@gmail.com> Co-authored-by: Sébastien Han <seb@redhat.com> Co-authored-by: Reid <61492567+reidliu41@users.noreply.github.com> Co-authored-by: reidliu <reid201711@gmail.com> Co-authored-by: Yuan Tang <terrytangyuan@gmail.com>
		
			
				
	
	
		
			253 lines
		
	
	
	
		
			9 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			253 lines
		
	
	
	
		
			9 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
| # Copyright (c) Meta Platforms, Inc. and affiliates.
 | |
| # All rights reserved.
 | |
| #
 | |
| # This source code is licensed under the terms described in the LICENSE file in
 | |
| # the root directory of this source tree.
 | |
| 
 | |
| import json
 | |
| 
 | |
| import pandas as pd
 | |
| import streamlit as st
 | |
| from modules.api import llama_stack_api
 | |
| 
 | |
| 
 | |
def select_benchmark_1():
    """Render step 1 of the wizard: pick the benchmark (eval task) to run.

    Lists the benchmarks known to the Llama Stack client, lets the user choose
    one by identifier and shows its definition. The chosen identifier and the
    identifier -> benchmark mapping are stored in ``st.session_state`` under
    ``"selected_benchmark"`` and ``"benchmarks"``; pressing "Confirm" sets
    ``"selected_benchmark_1_next"`` so the next step is rendered.
    """
    st.subheader("1. Choose An Eval Task")
    benchmarks = {et.identifier: et for et in llama_stack_api.client.benchmarks.list()}

    if not benchmarks:
        # With no options the selectbox yields None and the lookup below would
        # raise KeyError, so stop here and keep the later steps hidden.
        st.warning("No eval tasks are registered. Register a benchmark before running an evaluation.")
        st.session_state["selected_benchmark_1_next"] = False
        return

    selected_benchmark = st.selectbox(
        "Choose an eval task.",
        options=list(benchmarks),
        help="Choose an eval task. Each eval task is parameterized by a dataset, and list of scoring functions.",
    )
    with st.expander("View Eval Task"):
        st.json(benchmarks[selected_benchmark], expanded=True)

    st.session_state["selected_benchmark"] = selected_benchmark
    st.session_state["benchmarks"] = benchmarks
    if st.button("Confirm", key="confirm_1"):
        st.session_state["selected_benchmark_1_next"] = True
 | |
| 
 | |
| 
 | |
def define_eval_candidate_2():
    """Render step 2 of the wizard: configure the candidate to evaluate.

    Returns immediately unless step 1 was confirmed
    (``"selected_benchmark_1_next"`` is truthy in the session state).
    Otherwise draws the candidate form, stores the resulting candidate dict
    under ``st.session_state["eval_candidate"]`` and, when "Confirm" is
    pressed, sets ``"selected_eval_candidate_2_next"``.
    """
    step_one_confirmed = st.session_state.get("selected_benchmark_1_next", None)
    if not step_one_confirmed:
        return

    st.subheader("2. Define Eval Candidate")
    st.info(
        """
        Define the configurations for the evaluation candidate model or agent used for generation.
        Select "model" if you want to run generation with inference API, or "agent" if you want to run generation with agent API through specifying AgentConfig.
        """
    )

    # (result key, widget label, slider options) in on-screen order.
    slider_specs = (
        (
            "temperature",
            "Temperature",
            {
                "min_value": 0.0,
                "max_value": 1.0,
                "value": 0.0,
                "step": 0.1,
                "help": "Controls the randomness of the response. Higher values make the output more creative and unexpected, lower values make it more conservative and predictable",
            },
        ),
        (
            "top_p",
            "Top P",
            {"min_value": 0.0, "max_value": 1.0, "value": 0.95, "step": 0.1},
        ),
        (
            "max_tokens",
            "Max Tokens",
            {
                "min_value": 0,
                "max_value": 4096,
                "value": 512,
                "step": 1,
                "help": "The maximum number of tokens to generate",
            },
        ),
        (
            "repetition_penalty",
            "Repetition Penalty",
            {
                "min_value": 1.0,
                "max_value": 2.0,
                "value": 1.0,
                "step": 0.1,
                "help": "Controls the likelihood for generating the same word or phrase multiple times in the same sentence or paragraph. 1 implies no penalty, 2 will strongly discourage model to repeat words or phrases.",
            },
        ),
    )

    with st.expander("Define Eval Candidate", expanded=True):
        candidate_type = st.radio("Candidate Type", ["model", "agent"])

        model_ids = [model.identifier for model in llama_stack_api.client.models.list()]
        selected_model = st.selectbox("Choose a model", model_ids, index=0)

        st.markdown("##### Sampling Parameters")
        sampling = {name: st.slider(label, **options) for name, label, options in slider_specs}

        if candidate_type == "agent":
            # The sampling sliders above are not part of the agent config built here.
            default_tools = [
                {
                    "type": "brave_search",
                    "engine": "brave",
                    "api_key": "ENTER_BRAVE_API_KEY_HERE",
                }
            ]
            system_prompt = st.text_area(
                "System Prompt",
                value="You are a helpful AI assistant.",
                help="Initial instructions given to the AI to set its behavior and context",
            )
            tools_json = st.text_area(
                "Tools Configuration (JSON)",
                value=json.dumps(default_tools),
                help="Enter tool configurations in JSON format. Each tool should have a name, description, and parameters.",
                height=200,
            )
            try:
                tools = json.loads(tools_json)
            except json.JSONDecodeError:
                # Report the problem and fall back to an agent without tools.
                st.error("Invalid JSON format for tools configuration")
                tools = []
            eval_candidate = {
                "type": "agent",
                "config": {
                    "model": selected_model,
                    "instructions": system_prompt,
                    "tools": tools,
                    "tool_choice": "auto",
                    "tool_prompt_format": "json",
                    "input_shields": [],
                    "output_shields": [],
                    "enable_session_persistence": False,
                },
            }
        elif candidate_type == "model":
            # A zero temperature selects greedy decoding; anything above it uses top-p sampling.
            strategy = (
                {
                    "type": "top_p",
                    "temperature": sampling["temperature"],
                    "top_p": sampling["top_p"],
                }
                if sampling["temperature"] > 0.0
                else {"type": "greedy"}
            )
            eval_candidate = {
                "type": "model",
                "model": selected_model,
                "sampling_params": {
                    "strategy": strategy,
                    "max_tokens": sampling["max_tokens"],
                    "repetition_penalty": sampling["repetition_penalty"],
                },
            }
        st.session_state["eval_candidate"] = eval_candidate

    if st.button("Confirm", key="confirm_2"):
        st.session_state["selected_eval_candidate_2_next"] = True
 | |
| 
 | |
| 
 | |
def run_evaluation_3():
    """Render step 3 of the wizard: review the configuration and run the evaluation.

    Returns immediately unless step 2 was confirmed
    (``"selected_eval_candidate_2_next"`` is truthy in the session state).
    Reads ``"selected_benchmark"``, ``"benchmarks"`` and ``"eval_candidate"``
    from the session state, fetches the rows of the benchmark's dataset and
    lets the user choose how many of them to evaluate. When "Run Evaluation"
    is pressed, each selected row is sent to ``eval.evaluate_rows`` on its
    own; the input columns, the first generation and the first score row per
    scoring function are collected and finally shown as a dataframe.
    """
    if not st.session_state.get("selected_eval_candidate_2_next", None):
        return

    st.subheader("3. Run Evaluation")
    # Add info box to explain configurations being used
    st.info(
        """
        Review the configurations that will be used for this evaluation run, make any necessary changes, and then click the "Run Evaluation" button.
        """
    )
    selected_benchmark = st.session_state["selected_benchmark"]
    benchmarks = st.session_state["benchmarks"]
    eval_candidate = st.session_state["eval_candidate"]
    benchmark = benchmarks[selected_benchmark]

    rows_response = llama_stack_api.client.datasetio.get_rows_paginated(
        dataset_id=benchmark.dataset_id,
        rows_in_page=-1,
    )
    dataset_rows = rows_response.rows
    total_rows = len(dataset_rows)
    if total_rows == 0:
        # number_input cannot be built with min_value=1 and max_value=0, and
        # there would be nothing to evaluate anyway.
        st.warning(f"Dataset '{benchmark.dataset_id}' has no rows to evaluate.")
        return

    # Add number of examples control
    num_rows = st.number_input(
        "Number of Examples to Evaluate",
        min_value=1,
        max_value=total_rows,
        # Keep the default inside [min_value, max_value] for datasets with fewer than 5 rows.
        value=min(5, total_rows),
        help="Number of examples from the dataset to evaluate. ",
    )

    benchmark_config = {
        "type": "benchmark",
        "eval_candidate": eval_candidate,
        "scoring_params": {},
    }

    with st.expander("View Evaluation Task", expanded=True):
        st.json(benchmark, expanded=True)
    with st.expander("View Evaluation Task Configuration", expanded=True):
        st.json(benchmark_config, expanded=True)

    # Add run button and handle evaluation
    if st.button("Run Evaluation"):
        progress_text = "Running evaluation..."
        progress_bar = st.progress(0, text=progress_text)
        rows = dataset_rows
        if num_rows < total_rows:
            rows = rows[:num_rows]

        # Create separate containers for progress text and results
        progress_text_container = st.empty()
        results_container = st.empty()
        # Column name -> list of per-row values, later turned into a dataframe.
        output_res = {}
        for i, r in enumerate(rows):
            # Update progress
            progress_bar.progress(i / len(rows), text=progress_text)
            # Run evaluation for current row
            eval_res = llama_stack_api.client.eval.evaluate_rows(
                benchmark_id=selected_benchmark,
                input_rows=[r],
                scoring_functions=benchmark.scoring_functions,
                task_config=benchmark_config,
            )

            # Only one row is submitted per call, so index 0 is that row's result.
            generation = eval_res.generations[0]
            for k in r.keys():
                output_res.setdefault(k, []).append(r[k])

            for k in generation.keys():
                output_res.setdefault(k, []).append(generation[k])

            for scoring_fn in benchmark.scoring_functions:
                output_res.setdefault(scoring_fn, []).append(eval_res.scores[scoring_fn].score_rows[0])

            progress_text_container.write(f"Expand to see current processed result ({i + 1} / {len(rows)})")
            results_container.json(eval_res, expanded=2)

        progress_bar.progress(1.0, text="Evaluation complete!")
        # Display results in dataframe
        if output_res:
            output_df = pd.DataFrame(output_res)
            st.subheader("Evaluation Results")
            st.dataframe(output_df)
 | |
| 
 | |
| 
 | |
def native_evaluation_page():
    """Configure the page, show its title and render the three wizard steps in order."""
    st.set_page_config(page_title="Evaluations (Generation + Scoring)", page_icon="🦙")
    st.title("📊 Evaluations (Generation + Scoring)")

    # Steps 2 and 3 check the session state themselves and return early
    # until the preceding step has been confirmed.
    for render_step in (select_benchmark_1, define_eval_candidate_2, run_evaluation_3):
        render_step()


# Render the page whenever this module is executed.
native_evaluation_page()
 |