[llama stack ui] add native eval & inspect distro & playground pages (#541)

# What does this PR do?

New Pages Added: 

- (1) Inspect Distro
- (2) Evaluations: 
  - (a) native evaluations (including generation)
  - (b) application evaluations (no generation, scoring only)
- (3) Playground: 
  - (a) chat
  - (b) RAG  

## Test Plan

```
streamlit run app.py
```

#### Playground

https://github.com/user-attachments/assets/6ca617e8-32ca-49b2-9774-185020ff5204

#### Inspect

https://github.com/user-attachments/assets/01d52b2d-92af-4e3a-b623-a9b8ba22ba99


#### Evaluations (Generation + Scoring)

https://github.com/user-attachments/assets/345845c7-2a2b-4095-960a-9ae40f6a93cf

#### Evaluations (Scoring)

https://github.com/user-attachments/assets/6cc1659f-eba4-49ca-a0a5-7c243557b4f5


## Before submitting

- [ ] This PR fixes a typo or improves the docs (you can dismiss the
other checks if that's the case).
- [ ] Ran pre-commit to handle lint / formatting issues.
- [ ] Read the [contributor
guideline](https://github.com/meta-llama/llama-stack/blob/main/CONTRIBUTING.md),
      Pull Request section?
- [ ] Updated relevant documentation.
- [ ] Wrote necessary unit or integration tests.
This commit is contained in:
Xi Yan 2024-12-04 09:47:09 -08:00 committed by GitHub
parent caf1dac114
commit 16769256b7
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
22 changed files with 1000 additions and 166 deletions

View file

@ -3,170 +3,54 @@
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
import json
import pandas as pd
import streamlit as st
from modules.api import LlamaStackEvaluation
from modules.utils import process_dataset
EVALUATION_API = LlamaStackEvaluation()
def main():
# Add collapsible sidebar
with st.sidebar:
# Add collapse button
if "sidebar_state" not in st.session_state:
st.session_state.sidebar_state = True
if st.session_state.sidebar_state:
st.title("Navigation")
page = st.radio(
"Select a Page",
["Application Evaluation"],
index=0,
)
else:
page = "Application Evaluation" # Default page when sidebar is collapsed
# Main content area
st.title("🦙 Llama Stack Evaluations")
if page == "Application Evaluation":
application_evaluation_page()
def application_evaluation_page():
# File uploader
uploaded_file = st.file_uploader("Upload Dataset", type=["csv", "xlsx", "xls"])
if uploaded_file is None:
st.error("No file uploaded")
return
# Process uploaded file
df = process_dataset(uploaded_file)
if df is None:
st.error("Error processing file")
return
# Display dataset information
st.success("Dataset loaded successfully!")
# Display dataframe preview
st.subheader("Dataset Preview")
st.dataframe(df)
# Select Scoring Functions to Run Evaluation On
st.subheader("Select Scoring Functions")
scoring_functions = EVALUATION_API.list_scoring_functions()
scoring_functions = {sf.identifier: sf for sf in scoring_functions}
scoring_functions_names = list(scoring_functions.keys())
selected_scoring_functions = st.multiselect(
"Choose one or more scoring functions",
options=scoring_functions_names,
help="Choose one or more scoring functions.",
# Evaluation pages
application_evaluation_page = st.Page(
"page/evaluations/app_eval.py",
title="Evaluations (Scoring)",
icon="📊",
default=False,
)
native_evaluation_page = st.Page(
"page/evaluations/native_eval.py",
title="Evaluations (Generation + Scoring)",
icon="📊",
default=False,
)
available_models = EVALUATION_API.list_models()
available_models = [m.identifier for m in available_models]
# Playground pages
chat_page = st.Page(
"page/playground/chat.py", title="Chat", icon="💬", default=True
)
rag_page = st.Page("page/playground/rag.py", title="RAG", icon="💬", default=False)
scoring_params = {}
if selected_scoring_functions:
st.write("Selected:")
for scoring_fn_id in selected_scoring_functions:
scoring_fn = scoring_functions[scoring_fn_id]
st.write(f"- **{scoring_fn_id}**: {scoring_fn.description}")
new_params = None
if scoring_fn.params:
new_params = {}
for param_name, param_value in scoring_fn.params.to_dict().items():
if param_name == "type":
new_params[param_name] = param_value
continue
# Distribution pages
resources_page = st.Page(
"page/distribution/resources.py", title="Resources", icon="🔍", default=False
)
provider_page = st.Page(
"page/distribution/providers.py",
title="API Providers",
icon="🔍",
default=False,
)
if param_name == "judge_model":
value = st.selectbox(
f"Select **{param_name}** for {scoring_fn_id}",
options=available_models,
index=0,
key=f"{scoring_fn_id}_{param_name}",
)
new_params[param_name] = value
else:
value = st.text_area(
f"Enter value for **{param_name}** in {scoring_fn_id} in valid JSON format",
value=json.dumps(param_value, indent=2),
height=80,
)
try:
new_params[param_name] = json.loads(value)
except json.JSONDecodeError:
st.error(
f"Invalid JSON for **{param_name}** in {scoring_fn_id}"
)
st.json(new_params)
scoring_params[scoring_fn_id] = new_params
# Add run evaluation button & slider
total_rows = len(df)
num_rows = st.slider("Number of rows to evaluate", 1, total_rows, total_rows)
if st.button("Run Evaluation"):
progress_text = "Running evaluation..."
progress_bar = st.progress(0, text=progress_text)
rows = df.to_dict(orient="records")
if num_rows < total_rows:
rows = rows[:num_rows]
# Create separate containers for progress text and results
progress_text_container = st.empty()
results_container = st.empty()
output_res = {}
for i, r in enumerate(rows):
# Update progress
progress = i / len(rows)
progress_bar.progress(progress, text=progress_text)
# Run evaluation for current row
score_res = EVALUATION_API.run_scoring(
r,
scoring_function_ids=selected_scoring_functions,
scoring_params=scoring_params,
)
for k in r.keys():
if k not in output_res:
output_res[k] = []
output_res[k].append(r[k])
for fn_id in selected_scoring_functions:
if fn_id not in output_res:
output_res[fn_id] = []
output_res[fn_id].append(score_res.results[fn_id].score_rows[0])
# Display current row results using separate containers
progress_text_container.write(
f"Expand to see current processed result ({i+1}/{len(rows)})"
)
results_container.json(
score_res.to_json(),
expanded=2,
)
progress_bar.progress(1.0, text="Evaluation complete!")
# Display results in dataframe
if output_res:
output_df = pd.DataFrame(output_res)
st.subheader("Evaluation Results")
st.dataframe(output_df)
pg = st.navigation(
{
"Playground": [
chat_page,
rag_page,
application_evaluation_page,
native_evaluation_page,
],
"Inspect": [provider_page, resources_page],
},
expanded=False,
)
pg.run()
if __name__ == "__main__":