## PR Stack

- https://github.com/meta-llama/llama-stack/pull/1573
- https://github.com/meta-llama/llama-stack/pull/1625
- https://github.com/meta-llama/llama-stack/pull/1656
- https://github.com/meta-llama/llama-stack/pull/1657
- https://github.com/meta-llama/llama-stack/pull/1658
- https://github.com/meta-llama/llama-stack/pull/1659
- https://github.com/meta-llama/llama-stack/pull/1660

**Client SDK**

- https://github.com/meta-llama/llama-stack-client-python/pull/203

**CI**

- 1391130488
<img width="1042" alt="image" src="https://github.com/user-attachments/assets/69636067-376d-436b-9204-896e2dd490ca" />

Note: the `test_rag_agent_with_attachments` failure is flaky and not related to this PR.

## Doc

<img width="789" alt="image" src="https://github.com/user-attachments/assets/b88390f3-73d6-4483-b09a-a192064e32d9" />

## Client Usage

```python
client.datasets.register(
    source={
        "type": "uri",
        "uri": "lsfs://mydata.jsonl",
    },
    schema="jsonl_messages",
    # optional
    dataset_id="my_first_train_data",
)

# quick prototype debugging
client.datasets.register(
    data_reference={
        "type": "rows",
        "rows": [
            {"messages": [...]},
        ],
    },
    schema="jsonl_messages",
)
```
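Rows can then be read back through the same client. A minimal sketch, assuming the `iterrows` endpoint introduced in this stack and the `dataset_id` registered above:

```python
# Sketch only: assumes the iterrows endpoint from this PR stack and the
# dataset_id registered in the example above.
rows = client.datasets.iterrows(
    dataset_id="my_first_train_data",
)
for row in rows.data[:5]:
    print(row)
```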
## Test Plan

- CI: 1387805545

```
LLAMA_STACK_CONFIG=fireworks pytest -v tests/integration/datasets/test_datasets.py
```

```
LLAMA_STACK_CONFIG=fireworks pytest -v tests/integration/scoring/test_scoring.py
```

```
pytest -v -s --nbval-lax ./docs/notebooks/Llama_Stack_Benchmark_Evals.ipynb
```
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

import json

import pandas as pd
import streamlit as st
from modules.api import llama_stack_api

def select_benchmark_1():
    # Select Benchmarks
    st.subheader("1. Choose An Eval Task")
    benchmarks = llama_stack_api.client.benchmarks.list()
    benchmarks = {et.identifier: et for et in benchmarks}
    benchmarks_names = list(benchmarks.keys())
    selected_benchmark = st.selectbox(
        "Choose an eval task.",
        options=benchmarks_names,
        help="Choose an eval task. Each eval task is parameterized by a dataset and a list of scoring functions.",
    )
    with st.expander("View Eval Task"):
        st.json(benchmarks[selected_benchmark], expanded=True)
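
    # Stash the selection in session_state so steps 2 and 3 can read it;
    # the confirm button sets a flag that unlocks the next step.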
    st.session_state["selected_benchmark"] = selected_benchmark
    st.session_state["benchmarks"] = benchmarks
    if st.button("Confirm", key="confirm_1"):
        st.session_state["selected_benchmark_1_next"] = True


def define_eval_candidate_2():
    if not st.session_state.get("selected_benchmark_1_next", None):
        return

    st.subheader("2. Define Eval Candidate")
    st.info(
        """
        Define the configuration for the evaluation candidate model or agent used for generation.
        Select "model" to run generation with the inference API, or "agent" to run generation with the agent API by specifying an AgentConfig.
        """
    )
    with st.expander("Define Eval Candidate", expanded=True):
        # Define Eval Candidate
        candidate_type = st.radio("Candidate Type", ["model", "agent"])

        available_models = llama_stack_api.client.models.list()
        available_models = [model.identifier for model in available_models]
        selected_model = st.selectbox(
            "Choose a model",
            available_models,
            index=0,
        )

        # Sampling Parameters
        st.markdown("##### Sampling Parameters")
        temperature = st.slider(
            "Temperature",
            min_value=0.0,
            max_value=1.0,
            value=0.0,
            step=0.1,
            help="Controls the randomness of the response. Higher values make the output more creative and unexpected; lower values make it more conservative and predictable.",
        )
        top_p = st.slider(
            "Top P",
            min_value=0.0,
            max_value=1.0,
            value=0.95,
            step=0.1,
        )
        max_tokens = st.slider(
            "Max Tokens",
            min_value=0,
            max_value=4096,
            value=512,
            step=1,
            help="The maximum number of tokens to generate",
        )
        repetition_penalty = st.slider(
            "Repetition Penalty",
            min_value=1.0,
            max_value=2.0,
            value=1.0,
            step=0.1,
            help="Controls the likelihood of generating the same word or phrase multiple times. 1 implies no penalty; 2 strongly discourages the model from repeating words or phrases.",
        )
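
        # A temperature of 0.0 maps to greedy decoding below; any positive
        # temperature switches to top-p (nucleus) sampling with the chosen top_p.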
        if candidate_type == "model":
            if temperature > 0.0:
                strategy = {
                    "type": "top_p",
                    "temperature": temperature,
                    "top_p": top_p,
                }
            else:
                strategy = {"type": "greedy"}

            eval_candidate = {
                "type": "model",
                "model": selected_model,
                "sampling_params": {
                    "strategy": strategy,
                    "max_tokens": max_tokens,
                    "repetition_penalty": repetition_penalty,
                },
            }
        elif candidate_type == "agent":
            system_prompt = st.text_area(
                "System Prompt",
                value="You are a helpful AI assistant.",
                help="Initial instructions given to the AI to set its behavior and context",
            )
            tools_json = st.text_area(
                "Tools Configuration (JSON)",
                value=json.dumps(
                    [
                        {
                            "type": "brave_search",
                            "engine": "brave",
                            "api_key": "ENTER_BRAVE_API_KEY_HERE",
                        }
                    ]
                ),
                help="Enter tool configurations in JSON format. Each tool should have a name, description, and parameters.",
                height=200,
            )
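            # Fall back to an empty tool list if the JSON doesn't parse;
            # the error is surfaced in the UI instead of crashing the page.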
            try:
                tools = json.loads(tools_json)
            except json.JSONDecodeError:
                st.error("Invalid JSON format for tools configuration")
                tools = []
            eval_candidate = {
                "type": "agent",
                "config": {
                    "model": selected_model,
                    "instructions": system_prompt,
                    "tools": tools,
                    "tool_choice": "auto",
                    "tool_prompt_format": "json",
                    "input_shields": [],
                    "output_shields": [],
                    "enable_session_persistence": False,
                },
            }
        st.session_state["eval_candidate"] = eval_candidate

    if st.button("Confirm", key="confirm_2"):
        st.session_state["selected_eval_candidate_2_next"] = True


def run_evaluation_3():
    if not st.session_state.get("selected_eval_candidate_2_next", None):
        return

    st.subheader("3. Run Evaluation")
    # Add info box to explain configurations being used
    st.info(
        """
        Review the configurations that will be used for this evaluation run, make any necessary changes, and then click the "Run Evaluation" button.
        """
    )
    selected_benchmark = st.session_state["selected_benchmark"]
    benchmarks = st.session_state["benchmarks"]
    eval_candidate = st.session_state["eval_candidate"]

    dataset_id = benchmarks[selected_benchmark].dataset_id
    rows = llama_stack_api.client.datasets.iterrows(
        dataset_id=dataset_id,
    )
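    # iterrows returns a page of dataset rows; the fetched records live in rows.data.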
    total_rows = len(rows.data)
    # Add number of examples control
    num_rows = st.number_input(
        "Number of Examples to Evaluate",
        min_value=1,
        max_value=total_rows,
        value=5,
        help="Number of examples from the dataset to evaluate.",
    )

    benchmark_config = {
        "type": "benchmark",
        "eval_candidate": eval_candidate,
        "scoring_params": {},
    }

    with st.expander("View Evaluation Task", expanded=True):
        st.json(benchmarks[selected_benchmark], expanded=True)
    with st.expander("View Evaluation Task Configuration", expanded=True):
        st.json(benchmark_config, expanded=True)

    # Add run button and handle evaluation
    if st.button("Run Evaluation"):
        progress_text = "Running evaluation..."
        progress_bar = st.progress(0, text=progress_text)
        rows = rows.data
        if num_rows < total_rows:
            rows = rows[:num_rows]

        # Create separate containers for progress text and results
        progress_text_container = st.empty()
        results_container = st.empty()
        output_res = {}
        for i, r in enumerate(rows):
            # Update progress
            progress = i / len(rows)
            progress_bar.progress(progress, text=progress_text)
            # Run evaluation for current row
            eval_res = llama_stack_api.client.eval.evaluate_rows(
                benchmark_id=selected_benchmark,
                input_rows=[r],
                scoring_functions=benchmarks[selected_benchmark].scoring_functions,
                benchmark_config=benchmark_config,
            )
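
            # Merge the input row, the generation, and each scoring function's
            # result into column-shaped lists for the final results dataframe.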
            for k in r.keys():
                if k not in output_res:
                    output_res[k] = []
                output_res[k].append(r[k])

            for k in eval_res.generations[0].keys():
                if k not in output_res:
                    output_res[k] = []
                output_res[k].append(eval_res.generations[0][k])

            for scoring_fn in benchmarks[selected_benchmark].scoring_functions:
                if scoring_fn not in output_res:
                    output_res[scoring_fn] = []
                output_res[scoring_fn].append(eval_res.scores[scoring_fn].score_rows[0])

            progress_text_container.write(f"Expand to see current processed result ({i + 1} / {len(rows)})")
            results_container.json(eval_res, expanded=2)

        progress_bar.progress(1.0, text="Evaluation complete!")
        # Display results in dataframe
        if output_res:
            output_df = pd.DataFrame(output_res)
            st.subheader("Evaluation Results")
            st.dataframe(output_df)


def native_evaluation_page():
    st.set_page_config(page_title="Evaluations (Generation + Scoring)", page_icon="🦙")
    st.title("📊 Evaluations (Generation + Scoring)")

    select_benchmark_1()
    define_eval_candidate_2()
    run_evaluation_3()


native_evaluation_page()