[llama stack ui] add native eval & inspect distro & playground pages (#541)

# What does this PR do?

New pages added (see the navigation sketch below):

- (1) Inspect Distro
- (2) Evaluations: 
  - (a) native evaluations (including generation)
  - (b) application evaluations (no generation, scoring only)
- (3) Playground: 
  - (a) chat
  - (b) RAG  

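These pages are registered through Streamlit's multipage navigation. For reference, a condensed sketch of the `st.Page` / `st.navigation` wiring, assembled from the `app.py` changes in this diff:

```
import streamlit as st

# Playground pages
chat_page = st.Page("page/playground/chat.py", title="Chat", icon="💬", default=True)
rag_page = st.Page("page/playground/rag.py", title="RAG", icon="💬", default=False)

# Evaluation pages
application_evaluation_page = st.Page(
    "page/evaluations/app_eval.py", title="Evaluations (Scoring)", icon="📊", default=False
)
native_evaluation_page = st.Page(
    "page/evaluations/native_eval.py",
    title="Evaluations (Generation + Scoring)",
    icon="📊",
    default=False,
)

# Distribution (Inspect) pages
resources_page = st.Page(
    "page/distribution/resources.py", title="Resources", icon="🔍", default=False
)
provider_page = st.Page(
    "page/distribution/providers.py", title="API Providers", icon="🔍", default=False
)

pg = st.navigation(
    {
        "Playground": [chat_page, rag_page, application_evaluation_page, native_evaluation_page],
        "Inspect": [provider_page, resources_page],
    },
    expanded=False,
)
pg.run()
```
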
## Test Plan

```
streamlit run app.py
```
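
The UI expects a running Llama Stack server. `LlamaStackApi` in `modules/api.py` (in this diff) reads the server endpoint from the `LLAMA_STACK_ENDPOINT` environment variable, defaulting to `http://localhost:5000`, so a typical local run looks roughly like:

```
# terminal 1: start the Llama Stack server (see the README prerequisite)
llama stack run

# terminal 2: launch the UI, optionally pointing it at a non-default endpoint
export LLAMA_STACK_ENDPOINT=http://localhost:5000
streamlit run app.py
```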

#### Playground

https://github.com/user-attachments/assets/6ca617e8-32ca-49b2-9774-185020ff5204

#### Inspect

https://github.com/user-attachments/assets/01d52b2d-92af-4e3a-b623-a9b8ba22ba99


#### Evaluations (Generation + Scoring)

https://github.com/user-attachments/assets/345845c7-2a2b-4095-960a-9ae40f6a93cf

#### Evaluations (Scoring)

https://github.com/user-attachments/assets/6cc1659f-eba4-49ca-a0a5-7c243557b4f5


## Before submitting

- [ ] This PR fixes a typo or improves the docs (you can dismiss the
other checks if that's the case).
- [ ] Ran pre-commit to handle lint / formatting issues.
- [ ] Read the [contributor
guideline](https://github.com/meta-llama/llama-stack/blob/main/CONTRIBUTING.md),
      Pull Request section?
- [ ] Updated relevant documentation.
- [ ] Wrote necessary unit or integration tests.

Commit 16769256b7 (parent caf1dac114)
Author: Xi Yan, 2024-12-04 09:47:09 -08:00 (committed via GitHub)
22 changed files with 1000 additions and 166 deletions

View file

@ -2,6 +2,12 @@
> [!NOTE]
> This is a work in progress.
## Prerequisite
- Start up Llama Stack Server
```
llama stack run
```
## Running Streamlit App
```

View file

@ -3,170 +3,54 @@
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
import json
import pandas as pd
import streamlit as st
from modules.api import LlamaStackEvaluation
from modules.utils import process_dataset
EVALUATION_API = LlamaStackEvaluation()
def main():
# Add collapsible sidebar
with st.sidebar:
# Add collapse button
if "sidebar_state" not in st.session_state:
st.session_state.sidebar_state = True
if st.session_state.sidebar_state:
st.title("Navigation")
page = st.radio(
"Select a Page",
["Application Evaluation"],
index=0,
)
else:
page = "Application Evaluation" # Default page when sidebar is collapsed
# Main content area
st.title("🦙 Llama Stack Evaluations")
if page == "Application Evaluation":
application_evaluation_page()
def application_evaluation_page():
# File uploader
uploaded_file = st.file_uploader("Upload Dataset", type=["csv", "xlsx", "xls"])
if uploaded_file is None:
st.error("No file uploaded")
return
# Process uploaded file
df = process_dataset(uploaded_file)
if df is None:
st.error("Error processing file")
return
# Display dataset information
st.success("Dataset loaded successfully!")
# Display dataframe preview
st.subheader("Dataset Preview")
st.dataframe(df)
# Select Scoring Functions to Run Evaluation On
st.subheader("Select Scoring Functions")
scoring_functions = EVALUATION_API.list_scoring_functions()
scoring_functions = {sf.identifier: sf for sf in scoring_functions}
scoring_functions_names = list(scoring_functions.keys())
selected_scoring_functions = st.multiselect(
"Choose one or more scoring functions",
options=scoring_functions_names,
help="Choose one or more scoring functions.",
# Evaluation pages
application_evaluation_page = st.Page(
"page/evaluations/app_eval.py",
title="Evaluations (Scoring)",
icon="📊",
default=False,
)
native_evaluation_page = st.Page(
"page/evaluations/native_eval.py",
title="Evaluations (Generation + Scoring)",
icon="📊",
default=False,
)
available_models = EVALUATION_API.list_models()
available_models = [m.identifier for m in available_models]
# Playground pages
chat_page = st.Page(
"page/playground/chat.py", title="Chat", icon="💬", default=True
)
rag_page = st.Page("page/playground/rag.py", title="RAG", icon="💬", default=False)
scoring_params = {}
if selected_scoring_functions:
st.write("Selected:")
for scoring_fn_id in selected_scoring_functions:
scoring_fn = scoring_functions[scoring_fn_id]
st.write(f"- **{scoring_fn_id}**: {scoring_fn.description}")
new_params = None
if scoring_fn.params:
new_params = {}
for param_name, param_value in scoring_fn.params.to_dict().items():
if param_name == "type":
new_params[param_name] = param_value
continue
# Distribution pages
resources_page = st.Page(
"page/distribution/resources.py", title="Resources", icon="🔍", default=False
)
provider_page = st.Page(
"page/distribution/providers.py",
title="API Providers",
icon="🔍",
default=False,
)
if param_name == "judge_model":
value = st.selectbox(
f"Select **{param_name}** for {scoring_fn_id}",
options=available_models,
index=0,
key=f"{scoring_fn_id}_{param_name}",
)
new_params[param_name] = value
else:
value = st.text_area(
f"Enter value for **{param_name}** in {scoring_fn_id} in valid JSON format",
value=json.dumps(param_value, indent=2),
height=80,
)
try:
new_params[param_name] = json.loads(value)
except json.JSONDecodeError:
st.error(
f"Invalid JSON for **{param_name}** in {scoring_fn_id}"
)
st.json(new_params)
scoring_params[scoring_fn_id] = new_params
# Add run evaluation button & slider
total_rows = len(df)
num_rows = st.slider("Number of rows to evaluate", 1, total_rows, total_rows)
if st.button("Run Evaluation"):
progress_text = "Running evaluation..."
progress_bar = st.progress(0, text=progress_text)
rows = df.to_dict(orient="records")
if num_rows < total_rows:
rows = rows[:num_rows]
# Create separate containers for progress text and results
progress_text_container = st.empty()
results_container = st.empty()
output_res = {}
for i, r in enumerate(rows):
# Update progress
progress = i / len(rows)
progress_bar.progress(progress, text=progress_text)
# Run evaluation for current row
score_res = EVALUATION_API.run_scoring(
r,
scoring_function_ids=selected_scoring_functions,
scoring_params=scoring_params,
)
for k in r.keys():
if k not in output_res:
output_res[k] = []
output_res[k].append(r[k])
for fn_id in selected_scoring_functions:
if fn_id not in output_res:
output_res[fn_id] = []
output_res[fn_id].append(score_res.results[fn_id].score_rows[0])
# Display current row results using separate containers
progress_text_container.write(
f"Expand to see current processed result ({i+1}/{len(rows)})"
)
results_container.json(
score_res.to_json(),
expanded=2,
)
progress_bar.progress(1.0, text="Evaluation complete!")
# Display results in dataframe
if output_res:
output_df = pd.DataFrame(output_res)
st.subheader("Evaluation Results")
st.dataframe(output_df)
pg = st.navigation(
{
"Playground": [
chat_page,
rag_page,
application_evaluation_page,
native_evaluation_page,
],
"Inspect": [provider_page, resources_page],
},
expanded=False,
)
pg.run()
if __name__ == "__main__":
    main()

View file

@ -0,0 +1,5 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

View file

@ -11,7 +11,7 @@ from typing import Optional
from llama_stack_client import LlamaStackClient
class LlamaStackEvaluation:
class LlamaStackApi:
def __init__(self):
self.client = LlamaStackClient(
base_url=os.environ.get("LLAMA_STACK_ENDPOINT", "http://localhost:5000"),
@ -22,14 +22,6 @@ class LlamaStackEvaluation:
},
)
def list_scoring_functions(self):
"""List all available scoring functions"""
return self.client.scoring_functions.list()
def list_models(self):
"""List all available judge models"""
return self.client.models.list()
def run_scoring(
self, row, scoring_function_ids: list[str], scoring_params: Optional[dict]
):
@ -39,3 +31,6 @@ class LlamaStackEvaluation:
return self.client.scoring.score(
input_rows=[row], scoring_functions=scoring_params
)
llama_stack_api = LlamaStackApi()
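
For context, the `LlamaStackEvaluation` wrapper is renamed to `LlamaStackApi` and exposed as a module-level singleton that every new page imports. A minimal usage sketch (assuming a server is reachable at the configured endpoint):

```
from modules.api import llama_stack_api

# list registered models through the shared client, as the new pages do
available_models = [m.identifier for m in llama_stack_api.client.models.list()]
```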

View file

@ -4,6 +4,7 @@
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
import base64
import os
import pandas as pd
@ -29,3 +30,13 @@ def process_dataset(file):
except Exception as e:
st.error(f"Error processing file: {str(e)}")
return None
def data_url_from_file(file) -> str:
    file_content = file.getvalue()
    base64_content = base64.b64encode(file_content).decode("utf-8")
    mime_type = file.type

    data_url = f"data:{mime_type};base64,{base64_content}"

    return data_url

View file

@ -0,0 +1,5 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

View file

@ -0,0 +1,19 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
import streamlit as st

from modules.api import llama_stack_api


def datasets():
    st.header("Datasets")

    datasets_info = {
        d.identifier: d.to_dict() for d in llama_stack_api.client.datasets.list()
    }

    selected_dataset = st.selectbox("Select a dataset", list(datasets_info.keys()))
    st.json(datasets_info[selected_dataset], expanded=True)

View file

@ -0,0 +1,22 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
import streamlit as st

from modules.api import llama_stack_api


def eval_tasks():
    # Eval Tasks Section
    st.header("Eval Tasks")

    eval_tasks_info = {
        d.identifier: d.to_dict() for d in llama_stack_api.client.eval_tasks.list()
    }

    selected_eval_task = st.selectbox(
        "Select an eval task", list(eval_tasks_info.keys()), key="eval_task_inspect"
    )
    st.json(eval_tasks_info[selected_eval_task], expanded=True)

View file

@ -0,0 +1,23 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
import streamlit as st

from modules.api import llama_stack_api


def memory_banks():
    st.header("Memory Banks")

    memory_banks_info = {
        m.identifier: m.to_dict() for m in llama_stack_api.client.memory_banks.list()
    }

    if len(memory_banks_info) > 0:
        selected_memory_bank = st.selectbox(
            "Select a memory bank", list(memory_banks_info.keys())
        )
        st.json(memory_banks_info[selected_memory_bank])
    else:
        st.info("No memory banks found")

View file

@ -0,0 +1,19 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
import streamlit as st

from modules.api import llama_stack_api


def models():
    # Models Section
    st.header("Models")

    models_info = {
        m.identifier: m.to_dict() for m in llama_stack_api.client.models.list()
    }

    selected_model = st.selectbox("Select a model", list(models_info.keys()))
    st.json(models_info[selected_model])

View file

@ -0,0 +1,20 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
import streamlit as st

from modules.api import llama_stack_api


def providers():
    st.header("🔍 API Providers")
    apis_providers_info = llama_stack_api.client.providers.list()
    # selected_api = st.selectbox("Select an API", list(apis_providers_info.keys()))
    for api in apis_providers_info.keys():
        st.markdown(f"###### {api}")
        st.dataframe([p.to_dict() for p in apis_providers_info[api]], width=500)


providers()

View file

@ -0,0 +1,52 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from page.distribution.datasets import datasets
from page.distribution.eval_tasks import eval_tasks
from page.distribution.memory_banks import memory_banks
from page.distribution.models import models
from page.distribution.scoring_functions import scoring_functions
from page.distribution.shields import shields
from streamlit_option_menu import option_menu


def resources_page():
    options = [
        "Models",
        "Memory Banks",
        "Shields",
        "Scoring Functions",
        "Datasets",
        "Eval Tasks",
    ]
    icons = ["magic", "memory", "shield", "file-bar-graph", "database", "list-task"]
    selected_resource = option_menu(
        None,
        options,
        icons=icons,
        orientation="horizontal",
        styles={
            "nav-link": {
                "font-size": "12px",
            },
        },
    )
    if selected_resource == "Eval Tasks":
        eval_tasks()
    elif selected_resource == "Memory Banks":
        memory_banks()
    elif selected_resource == "Datasets":
        datasets()
    elif selected_resource == "Models":
        models()
    elif selected_resource == "Scoring Functions":
        scoring_functions()
    elif selected_resource == "Shields":
        shields()


resources_page()

View file

@ -0,0 +1,22 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
import streamlit as st

from modules.api import llama_stack_api


def scoring_functions():
    st.header("Scoring Functions")

    scoring_functions_info = {
        s.identifier: s.to_dict()
        for s in llama_stack_api.client.scoring_functions.list()
    }

    selected_scoring_function = st.selectbox(
        "Select a scoring function", list(scoring_functions_info.keys())
    )
    st.json(scoring_functions_info[selected_scoring_function], expanded=True)

View file

@ -0,0 +1,20 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
import streamlit as st

from modules.api import llama_stack_api


def shields():
    # Shields Section
    st.header("Shields")

    shields_info = {
        s.identifier: s.to_dict() for s in llama_stack_api.client.shields.list()
    }

    selected_shield = st.selectbox("Select a shield", list(shields_info.keys()))
    st.json(shields_info[selected_shield])

View file

@ -0,0 +1,5 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

View file

@ -0,0 +1,148 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
import json
import pandas as pd
import streamlit as st
from modules.api import llama_stack_api
from modules.utils import process_dataset
def application_evaluation_page():
st.set_page_config(page_title="Evaluations (Scoring)", page_icon="🦙")
st.title("📊 Evaluations (Scoring)")
# File uploader
uploaded_file = st.file_uploader("Upload Dataset", type=["csv", "xlsx", "xls"])
if uploaded_file is None:
st.error("No file uploaded")
return
# Process uploaded file
df = process_dataset(uploaded_file)
if df is None:
st.error("Error processing file")
return
# Display dataset information
st.success("Dataset loaded successfully!")
# Display dataframe preview
st.subheader("Dataset Preview")
st.dataframe(df)
# Select Scoring Functions to Run Evaluation On
st.subheader("Select Scoring Functions")
scoring_functions = llama_stack_api.client.scoring_functions.list()
scoring_functions = {sf.identifier: sf for sf in scoring_functions}
scoring_functions_names = list(scoring_functions.keys())
selected_scoring_functions = st.multiselect(
"Choose one or more scoring functions",
options=scoring_functions_names,
help="Choose one or more scoring functions.",
)
available_models = llama_stack_api.client.models.list()
available_models = [m.identifier for m in available_models]
scoring_params = {}
if selected_scoring_functions:
st.write("Selected:")
for scoring_fn_id in selected_scoring_functions:
scoring_fn = scoring_functions[scoring_fn_id]
st.write(f"- **{scoring_fn_id}**: {scoring_fn.description}")
new_params = None
if scoring_fn.params:
new_params = {}
for param_name, param_value in scoring_fn.params.to_dict().items():
if param_name == "type":
new_params[param_name] = param_value
continue
if param_name == "judge_model":
value = st.selectbox(
f"Select **{param_name}** for {scoring_fn_id}",
options=available_models,
index=0,
key=f"{scoring_fn_id}_{param_name}",
)
new_params[param_name] = value
else:
value = st.text_area(
f"Enter value for **{param_name}** in {scoring_fn_id} in valid JSON format",
value=json.dumps(param_value, indent=2),
height=80,
)
try:
new_params[param_name] = json.loads(value)
except json.JSONDecodeError:
st.error(
f"Invalid JSON for **{param_name}** in {scoring_fn_id}"
)
st.json(new_params)
scoring_params[scoring_fn_id] = new_params
# Add run evaluation button & slider
total_rows = len(df)
num_rows = st.slider("Number of rows to evaluate", 1, total_rows, total_rows)
if st.button("Run Evaluation"):
progress_text = "Running evaluation..."
progress_bar = st.progress(0, text=progress_text)
rows = df.to_dict(orient="records")
if num_rows < total_rows:
rows = rows[:num_rows]
# Create separate containers for progress text and results
progress_text_container = st.empty()
results_container = st.empty()
output_res = {}
for i, r in enumerate(rows):
# Update progress
progress = i / len(rows)
progress_bar.progress(progress, text=progress_text)
# Run evaluation for current row
score_res = llama_stack_api.run_scoring(
r,
scoring_function_ids=selected_scoring_functions,
scoring_params=scoring_params,
)
for k in r.keys():
if k not in output_res:
output_res[k] = []
output_res[k].append(r[k])
for fn_id in selected_scoring_functions:
if fn_id not in output_res:
output_res[fn_id] = []
output_res[fn_id].append(score_res.results[fn_id].score_rows[0])
# Display current row results using separate containers
progress_text_container.write(
f"Expand to see current processed result ({i+1}/{len(rows)})"
)
results_container.json(
score_res.to_json(),
expanded=2,
)
progress_bar.progress(1.0, text="Evaluation complete!")
# Display results in dataframe
if output_res:
output_df = pd.DataFrame(output_res)
st.subheader("Evaluation Results")
st.dataframe(output_df)
application_evaluation_page()

View file

@ -0,0 +1,257 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
import json
import pandas as pd
import streamlit as st
from modules.api import llama_stack_api
def select_eval_task_1():
# Select Eval Tasks
st.subheader("1. Choose An Eval Task")
eval_tasks = llama_stack_api.client.eval_tasks.list()
eval_tasks = {et.identifier: et for et in eval_tasks}
eval_tasks_names = list(eval_tasks.keys())
selected_eval_task = st.selectbox(
"Choose an eval task.",
options=eval_tasks_names,
help="Choose an eval task. Each eval task is parameterized by a dataset, and list of scoring functions.",
)
with st.expander("View Eval Task"):
st.json(eval_tasks[selected_eval_task], expanded=True)
st.session_state["selected_eval_task"] = selected_eval_task
st.session_state["eval_tasks"] = eval_tasks
if st.button("Confirm", key="confirm_1"):
st.session_state["selected_eval_task_1_next"] = True
def define_eval_candidate_2():
if not st.session_state.get("selected_eval_task_1_next", None):
return
st.subheader("2. Define Eval Candidate")
st.info(
"""
Define the configurations for the evaluation candidate model or agent used for generation.
Select "model" if you want to run generation with inference API, or "agent" if you want to run generation with agent API through specifying AgentConfig.
"""
)
with st.expander("Define Eval Candidate", expanded=True):
# Define Eval Candidate
candidate_type = st.radio("Candidate Type", ["model", "agent"])
available_models = llama_stack_api.client.models.list()
available_models = [model.identifier for model in available_models]
selected_model = st.selectbox(
"Choose a model",
available_models,
index=0,
)
# Sampling Parameters
st.markdown("##### Sampling Parameters")
strategy = st.selectbox(
"Strategy",
["greedy", "top_p", "top_k"],
index=0,
)
temperature = st.slider(
"Temperature",
min_value=0.0,
max_value=1.0,
value=0.0,
step=0.1,
help="Controls the randomness of the response. Higher values make the output more creative and unexpected, lower values make it more conservative and predictable",
)
top_p = st.slider(
"Top P",
min_value=0.0,
max_value=1.0,
value=0.95,
step=0.1,
)
max_tokens = st.slider(
"Max Tokens",
min_value=0,
max_value=4096,
value=512,
step=1,
help="The maximum number of tokens to generate",
)
repetition_penalty = st.slider(
"Repetition Penalty",
min_value=1.0,
max_value=2.0,
value=1.0,
step=0.1,
help="Controls the likelihood for generating the same word or phrase multiple times in the same sentence or paragraph. 1 implies no penalty, 2 will strongly discourage model to repeat words or phrases.",
)
if candidate_type == "model":
eval_candidate = {
"type": "model",
"model": selected_model,
"sampling_params": {
"strategy": strategy,
"temperature": temperature,
"top_p": top_p,
"max_tokens": max_tokens,
"repetition_penalty": repetition_penalty,
},
}
elif candidate_type == "agent":
system_prompt = st.text_area(
"System Prompt",
value="You are a helpful AI assistant.",
help="Initial instructions given to the AI to set its behavior and context",
)
tools_json = st.text_area(
"Tools Configuration (JSON)",
value=json.dumps(
[
{
"type": "brave_search",
"engine": "brave",
"api_key": "ENTER_BRAVE_API_KEY_HERE",
}
]
),
help="Enter tool configurations in JSON format. Each tool should have a name, description, and parameters.",
height=200,
)
try:
tools = json.loads(tools_json)
except json.JSONDecodeError:
st.error("Invalid JSON format for tools configuration")
tools = []
eval_candidate = {
"type": "agent",
"config": {
"model": selected_model,
"instructions": system_prompt,
"tools": tools,
"tool_choice": "auto",
"tool_prompt_format": "json",
"input_shields": [],
"output_shields": [],
"enable_session_persistence": False,
},
}
st.session_state["eval_candidate"] = eval_candidate
if st.button("Confirm", key="confirm_2"):
st.session_state["selected_eval_candidate_2_next"] = True
def run_evaluation_3():
if not st.session_state.get("selected_eval_candidate_2_next", None):
return
st.subheader("3. Run Evaluation")
# Add info box to explain configurations being used
st.info(
"""
Review the configurations that will be used for this evaluation run, make any necessary changes, and then click the "Run Evaluation" button.
"""
)
selected_eval_task = st.session_state["selected_eval_task"]
eval_tasks = st.session_state["eval_tasks"]
eval_candidate = st.session_state["eval_candidate"]
dataset_id = eval_tasks[selected_eval_task].dataset_id
rows = llama_stack_api.client.datasetio.get_rows_paginated(
dataset_id=dataset_id,
rows_in_page=-1,
)
total_rows = len(rows.rows)
# Add number of examples control
num_rows = st.number_input(
"Number of Examples to Evaluate",
min_value=1,
max_value=total_rows,
value=5,
help="Number of examples from the dataset to evaluate. ",
)
eval_task_config = {
"type": "benchmark",
"eval_candidate": eval_candidate,
"scoring_params": {},
}
with st.expander("View Evaluation Task", expanded=True):
st.json(eval_tasks[selected_eval_task], expanded=True)
with st.expander("View Evaluation Task Configuration", expanded=True):
st.json(eval_task_config, expanded=True)
# Add run button and handle evaluation
if st.button("Run Evaluation"):
progress_text = "Running evaluation..."
progress_bar = st.progress(0, text=progress_text)
rows = rows.rows
if num_rows < total_rows:
rows = rows[:num_rows]
# Create separate containers for progress text and results
progress_text_container = st.empty()
results_container = st.empty()
output_res = {}
for i, r in enumerate(rows):
# Update progress
progress = i / len(rows)
progress_bar.progress(progress, text=progress_text)
# Run evaluation for current row
eval_res = llama_stack_api.client.eval.evaluate_rows(
task_id=selected_eval_task,
input_rows=[r],
scoring_functions=eval_tasks[selected_eval_task].scoring_functions,
task_config=eval_task_config,
)
for k in r.keys():
if k not in output_res:
output_res[k] = []
output_res[k].append(r[k])
for k in eval_res.generations[0].keys():
if k not in output_res:
output_res[k] = []
output_res[k].append(eval_res.generations[0][k])
for scoring_fn in eval_tasks[selected_eval_task].scoring_functions:
if scoring_fn not in output_res:
output_res[scoring_fn] = []
output_res[scoring_fn].append(eval_res.scores[scoring_fn].score_rows[0])
progress_text_container.write(
f"Expand to see current processed result ({i+1}/{len(rows)})"
)
results_container.json(eval_res, expanded=2)
progress_bar.progress(1.0, text="Evaluation complete!")
# Display results in dataframe
if output_res:
output_df = pd.DataFrame(output_res)
st.subheader("Evaluation Results")
st.dataframe(output_df)
def native_evaluation_page():
st.set_page_config(page_title="Evaluations (Generation + Scoring)", page_icon="🦙")
st.title("📊 Evaluations (Generation + Scoring)")
select_eval_task_1()
define_eval_candidate_2()
run_evaluation_3()
native_evaluation_page()

View file

@ -0,0 +1,5 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

View file

@ -0,0 +1,123 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
import streamlit as st
from modules.api import llama_stack_api
# Sidebar configurations
with st.sidebar:
st.header("Configuration")
available_models = llama_stack_api.client.models.list()
available_models = [model.identifier for model in available_models]
selected_model = st.selectbox(
"Choose a model",
available_models,
index=0,
)
temperature = st.slider(
"Temperature",
min_value=0.0,
max_value=1.0,
value=0.0,
step=0.1,
help="Controls the randomness of the response. Higher values make the output more creative and unexpected, lower values make it more conservative and predictable",
)
top_p = st.slider(
"Top P",
min_value=0.0,
max_value=1.0,
value=0.95,
step=0.1,
)
max_tokens = st.slider(
"Max Tokens",
min_value=0,
max_value=4096,
value=512,
step=1,
help="The maximum number of tokens to generate",
)
repetition_penalty = st.slider(
"Repetition Penalty",
min_value=1.0,
max_value=2.0,
value=1.0,
step=0.1,
help="Controls the likelihood for generating the same word or phrase multiple times in the same sentence or paragraph. 1 implies no penalty, 2 will strongly discourage model to repeat words or phrases.",
)
stream = st.checkbox("Stream", value=True)
system_prompt = st.text_area(
"System Prompt",
value="You are a helpful AI assistant.",
help="Initial instructions given to the AI to set its behavior and context",
)
# Add clear chat button to sidebar
if st.button("Clear Chat", use_container_width=True):
st.session_state.messages = []
st.rerun()
# Main chat interface
st.title("🦙 Chat")
# Initialize chat history
if "messages" not in st.session_state:
st.session_state.messages = []
# Display chat messages
for message in st.session_state.messages:
with st.chat_message(message["role"]):
st.markdown(message["content"])
# Chat input
if prompt := st.chat_input("Example: What is Llama Stack?"):
# Add user message to chat history
st.session_state.messages.append({"role": "user", "content": prompt})
# Display user message
with st.chat_message("user"):
st.markdown(prompt)
# Display assistant response
with st.chat_message("assistant"):
message_placeholder = st.empty()
full_response = ""
response = llama_stack_api.client.inference.chat_completion(
messages=[
{"role": "system", "content": system_prompt},
{"role": "user", "content": prompt},
],
model_id=selected_model,
stream=stream,
sampling_params={
"temperature": temperature,
"top_p": top_p,
"max_tokens": max_tokens,
"repetition_penalty": repetition_penalty,
},
)
if stream:
for chunk in response:
if chunk.event.event_type == "progress":
full_response += chunk.event.delta
message_placeholder.markdown(full_response + "▌")
message_placeholder.markdown(full_response)
else:
full_response = response
message_placeholder.markdown(full_response.completion_message.content)
st.session_state.messages.append(
{"role": "assistant", "content": full_response}
)

View file

@ -0,0 +1,188 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
import streamlit as st
from llama_stack_client.lib.agents.agent import Agent
from llama_stack_client.lib.agents.event_logger import EventLogger
from llama_stack_client.types.agent_create_params import AgentConfig
from llama_stack_client.types.memory_insert_params import Document
from modules.api import llama_stack_api
from modules.utils import data_url_from_file
def rag_chat_page():
st.title("🦙 RAG")
with st.sidebar:
# File/Directory Upload Section
st.subheader("Upload Documents")
uploaded_files = st.file_uploader(
"Upload file(s) or directory",
accept_multiple_files=True,
type=["txt", "pdf", "doc", "docx"], # Add more file types as needed
)
# Process uploaded files
if uploaded_files:
st.success(f"Successfully uploaded {len(uploaded_files)} files")
# Add memory bank name input field
memory_bank_name = st.text_input(
"Memory Bank Name",
value="rag_bank",
help="Enter a unique identifier for this memory bank",
)
if st.button("Create Memory Bank"):
documents = [
Document(
document_id=uploaded_file.name,
content=data_url_from_file(uploaded_file),
)
for i, uploaded_file in enumerate(uploaded_files)
]
providers = llama_stack_api.client.providers.list()
llama_stack_api.client.memory_banks.register(
memory_bank_id=memory_bank_name, # Use the user-provided name
params={
"embedding_model": "all-MiniLM-L6-v2",
"chunk_size_in_tokens": 512,
"overlap_size_in_tokens": 64,
},
provider_id=providers["memory"][0].provider_id,
)
# insert documents using the custom bank name
llama_stack_api.client.memory.insert(
bank_id=memory_bank_name, # Use the user-provided name
documents=documents,
)
st.success("Memory bank created successfully!")
st.subheader("Configure Agent")
# select memory banks
memory_banks = llama_stack_api.client.memory_banks.list()
memory_banks = [bank.identifier for bank in memory_banks]
selected_memory_banks = st.multiselect(
"Select Memory Banks",
memory_banks,
)
memory_bank_configs = [
{"bank_id": bank_id, "type": "vector"} for bank_id in selected_memory_banks
]
available_models = llama_stack_api.client.models.list()
available_models = [model.identifier for model in available_models]
selected_model = st.selectbox(
"Choose a model",
available_models,
index=0,
)
system_prompt = st.text_area(
"System Prompt",
value="You are a helpful assistant. ",
help="Initial instructions given to the AI to set its behavior and context",
)
temperature = st.slider(
"Temperature",
min_value=0.0,
max_value=1.0,
value=0.0,
step=0.1,
help="Controls the randomness of the response. Higher values make the output more creative and unexpected, lower values make it more conservative and predictable",
)
top_p = st.slider(
"Top P",
min_value=0.0,
max_value=1.0,
value=0.95,
step=0.1,
)
# Add clear chat button to sidebar
if st.button("Clear Chat", use_container_width=True):
st.session_state.messages = []
st.rerun()
# Chat Interface
if "messages" not in st.session_state:
st.session_state.messages = []
# Display chat history
for message in st.session_state.messages:
with st.chat_message(message["role"]):
st.markdown(message["content"])
selected_model = llama_stack_api.client.models.list()[0].identifier
agent_config = AgentConfig(
model=selected_model,
instructions=system_prompt,
sampling_params={
"strategy": "greedy",
"temperature": temperature,
"top_p": top_p,
},
tools=[
{
"type": "memory",
"memory_bank_configs": memory_bank_configs,
"query_generator_config": {"type": "default", "sep": " "},
"max_tokens_in_context": 4096,
"max_chunks": 10,
}
],
tool_choice="auto",
tool_prompt_format="json",
input_shields=[],
output_shields=[],
enable_session_persistence=False,
)
agent = Agent(llama_stack_api.client, agent_config)
session_id = agent.create_session("rag-session")
# Chat input
if prompt := st.chat_input("Ask a question about your documents"):
# Add user message to chat history
st.session_state.messages.append({"role": "user", "content": prompt})
# Display user message
with st.chat_message("user"):
st.markdown(prompt)
response = agent.create_turn(
messages=[
{
"role": "user",
"content": prompt,
}
],
session_id=session_id,
)
# Display assistant response
with st.chat_message("assistant"):
retrieval_message_placeholder = st.empty()
message_placeholder = st.empty()
full_response = ""
retrieval_response = ""
for log in EventLogger().log(response):
log.print()
if log.role == "memory_retrieval":
retrieval_response += log.content.replace("====", "").strip()
retrieval_message_placeholder.info(retrieval_response)
else:
full_response += log.content
message_placeholder.markdown(full_response + "▌")
message_placeholder.markdown(full_response)
st.session_state.messages.append(
{"role": "assistant", "content": full_response}
)
rag_chat_page()

View file

@ -1,3 +1,4 @@
streamlit
pandas
llama-stack-client>=0.0.55
streamlit-option-menu

View file

@ -5,7 +5,7 @@
# the root directory of this source tree.
from llama_stack.apis.common.type_system import NumberType
from llama_stack.apis.scoring_functions import ScoringFn
from llama_stack.apis.scoring_functions import LLMAsJudgeScoringFnParams, ScoringFn
llm_as_judge_base = ScoringFn(
@ -14,4 +14,8 @@ llm_as_judge_base = ScoringFn(
return_type=NumberType(),
provider_id="llm-as-judge",
provider_resource_id="llm-as-judge-base",
params=LLMAsJudgeScoringFnParams(
judge_model="meta-llama/Llama-3.1-405B-Instruct",
prompt_template="Enter custom LLM as Judge Prompt Template",
),
)