mirror of https://github.com/meta-llama/llama-stack.git (synced 2025-12-31 10:23:52 +00:00)

commit 7e211f8553 (parent 967dd0aa08): pre-commit fixes
314 changed files with 5574 additions and 11369 deletions

docs/_static/llama-stack-spec.html (vendored): 625 changes; file diff suppressed because it is too large
docs/_static/llama-stack-spec.yaml (vendored): 419 changes; file diff suppressed because it is too large
|
|
@ -141,7 +141,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"execution_count": 18,
|
||||
"id": "E1UFuJC570Tk",
|
||||
"metadata": {
|
||||
"colab": {
|
||||
|
|
@ -326,54 +326,108 @@
|
|||
" type: sqlite\n",
|
||||
"models:\n",
|
||||
"- metadata: <span style=\"font-weight: bold\">{}</span>\n",
|
||||
" model_id: meta-llama/Meta-Llama-<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">3.1</span>-8B-Instruct-Turbo\n",
|
||||
" model_type: !!python/object/apply:llama_stack.apis.models.models.ModelType\n",
|
||||
" - llm\n",
|
||||
" provider_id: together\n",
|
||||
" provider_model_id: meta-llama/Meta-Llama-<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">3.1</span>-8B-Instruct-Turbo\n",
|
||||
"- metadata: <span style=\"font-weight: bold\">{}</span>\n",
|
||||
" model_id: meta-llama/Llama-<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">3.1</span>-8B-Instruct\n",
|
||||
" model_type: !!python/object/apply:llama_stack.apis.models.models.ModelType\n",
|
||||
" - llm\n",
|
||||
" provider_id: together\n",
|
||||
" provider_model_id: meta-llama/Meta-Llama-<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">3.1</span>-8B-Instruct-Turbo\n",
|
||||
"- metadata: <span style=\"font-weight: bold\">{}</span>\n",
|
||||
" model_id: meta-llama/Meta-Llama-<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">3.1</span>-70B-Instruct-Turbo\n",
|
||||
" model_type: !!python/object/apply:llama_stack.apis.models.models.ModelType\n",
|
||||
" - llm\n",
|
||||
" provider_id: together\n",
|
||||
" provider_model_id: meta-llama/Meta-Llama-<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">3.1</span>-70B-Instruct-Turbo\n",
|
||||
"- metadata: <span style=\"font-weight: bold\">{}</span>\n",
|
||||
" model_id: meta-llama/Llama-<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">3.1</span>-70B-Instruct\n",
|
||||
" model_type: !!python/object/apply:llama_stack.apis.models.models.ModelType\n",
|
||||
" - llm\n",
|
||||
" provider_id: together\n",
|
||||
" provider_model_id: meta-llama/Meta-Llama-<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">3.1</span>-70B-Instruct-Turbo\n",
|
||||
"- metadata: <span style=\"font-weight: bold\">{}</span>\n",
|
||||
" model_id: meta-llama/Meta-Llama-<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">3.1</span>-405B-Instruct-Turbo\n",
|
||||
" model_type: !!python/object/apply:llama_stack.apis.models.models.ModelType\n",
|
||||
" - llm\n",
|
||||
" provider_id: together\n",
|
||||
" provider_model_id: meta-llama/Meta-Llama-<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">3.1</span>-405B-Instruct-Turbo\n",
|
||||
"- metadata: <span style=\"font-weight: bold\">{}</span>\n",
|
||||
" model_id: meta-llama/Llama-<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">3.1</span>-405B-Instruct-FP8\n",
|
||||
" model_type: !!python/object/apply:llama_stack.apis.models.models.ModelType\n",
|
||||
" - llm\n",
|
||||
" provider_id: together\n",
|
||||
" provider_model_id: meta-llama/Meta-Llama-<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">3.1</span>-405B-Instruct-Turbo\n",
|
||||
"- metadata: <span style=\"font-weight: bold\">{}</span>\n",
|
||||
" model_id: meta-llama/Llama-<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">3.2</span>-3B-Instruct-Turbo\n",
|
||||
" model_type: !!python/object/apply:llama_stack.apis.models.models.ModelType\n",
|
||||
" - llm\n",
|
||||
" provider_id: together\n",
|
||||
" provider_model_id: meta-llama/Llama-<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">3.2</span>-3B-Instruct-Turbo\n",
|
||||
"- metadata: <span style=\"font-weight: bold\">{}</span>\n",
|
||||
" model_id: meta-llama/Llama-<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">3.2</span>-3B-Instruct\n",
|
||||
" model_type: !!python/object/apply:llama_stack.apis.models.models.ModelType\n",
|
||||
" - llm\n",
|
||||
" provider_id: together\n",
|
||||
" provider_model_id: meta-llama/Llama-<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">3.2</span>-3B-Instruct-Turbo\n",
|
||||
"- metadata: <span style=\"font-weight: bold\">{}</span>\n",
|
||||
" model_id: meta-llama/Llama-<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">3.2</span>-11B-Vision-Instruct-Turbo\n",
|
||||
" model_type: !!python/object/apply:llama_stack.apis.models.models.ModelType\n",
|
||||
" - llm\n",
|
||||
" provider_id: together\n",
|
||||
" provider_model_id: meta-llama/Llama-<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">3.2</span>-11B-Vision-Instruct-Turbo\n",
|
||||
"- metadata: <span style=\"font-weight: bold\">{}</span>\n",
|
||||
" model_id: meta-llama/Llama-<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">3.2</span>-11B-Vision-Instruct\n",
|
||||
" model_type: !!python/object/apply:llama_stack.apis.models.models.ModelType\n",
|
||||
" - llm\n",
|
||||
" provider_id: together\n",
|
||||
" provider_model_id: meta-llama/Llama-<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">3.2</span>-11B-Vision-Instruct-Turbo\n",
|
||||
"- metadata: <span style=\"font-weight: bold\">{}</span>\n",
|
||||
" model_id: meta-llama/Llama-<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">3.2</span>-90B-Vision-Instruct-Turbo\n",
|
||||
" model_type: !!python/object/apply:llama_stack.apis.models.models.ModelType\n",
|
||||
" - llm\n",
|
||||
" provider_id: together\n",
|
||||
" provider_model_id: meta-llama/Llama-<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">3.2</span>-90B-Vision-Instruct-Turbo\n",
|
||||
"- metadata: <span style=\"font-weight: bold\">{}</span>\n",
|
||||
" model_id: meta-llama/Llama-<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">3.2</span>-90B-Vision-Instruct\n",
|
||||
" model_type: !!python/object/apply:llama_stack.apis.models.models.ModelType\n",
|
||||
" - llm\n",
|
||||
" provider_id: together\n",
|
||||
" provider_model_id: meta-llama/Llama-<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">3.2</span>-90B-Vision-Instruct-Turbo\n",
|
||||
"- metadata: <span style=\"font-weight: bold\">{}</span>\n",
|
||||
" model_id: meta-llama/Llama-<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">3.3</span>-70B-Instruct-Turbo\n",
|
||||
" model_type: !!python/object/apply:llama_stack.apis.models.models.ModelType\n",
|
||||
" - llm\n",
|
||||
" provider_id: together\n",
|
||||
" provider_model_id: meta-llama/Llama-<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">3.3</span>-70B-Instruct-Turbo\n",
|
||||
"- metadata: <span style=\"font-weight: bold\">{}</span>\n",
|
||||
" model_id: meta-llama/Llama-<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">3.3</span>-70B-Instruct\n",
|
||||
" model_type: !!python/object/apply:llama_stack.apis.models.models.ModelType\n",
|
||||
" - llm\n",
|
||||
" provider_id: together\n",
|
||||
" provider_model_id: meta-llama/Llama-<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">3.3</span>-70B-Instruct-Turbo\n",
|
||||
"- metadata: <span style=\"font-weight: bold\">{}</span>\n",
|
||||
" model_id: meta-llama/Meta-Llama-Guard-<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">3</span>-8B\n",
|
||||
" model_type: !!python/object/apply:llama_stack.apis.models.models.ModelType\n",
|
||||
" - llm\n",
|
||||
" provider_id: together\n",
|
||||
" provider_model_id: meta-llama/Meta-Llama-Guard-<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">3</span>-8B\n",
|
||||
"- metadata: <span style=\"font-weight: bold\">{}</span>\n",
|
||||
" model_id: meta-llama/Llama-Guard-<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">3</span>-8B\n",
|
||||
" model_type: !!python/object/apply:llama_stack.apis.models.models.ModelType\n",
|
||||
" - llm\n",
|
||||
" provider_id: together\n",
|
||||
" provider_model_id: meta-llama/Meta-Llama-Guard-<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">3</span>-8B\n",
|
||||
"- metadata: <span style=\"font-weight: bold\">{}</span>\n",
|
||||
" model_id: meta-llama/Llama-Guard-<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">3</span>-11B-Vision-Turbo\n",
|
||||
" model_type: !!python/object/apply:llama_stack.apis.models.models.ModelType\n",
|
||||
" - llm\n",
|
||||
" provider_id: together\n",
|
||||
" provider_model_id: meta-llama/Llama-Guard-<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">3</span>-11B-Vision-Turbo\n",
|
||||
"- metadata: <span style=\"font-weight: bold\">{}</span>\n",
|
||||
" model_id: meta-llama/Llama-Guard-<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">3</span>-11B-Vision\n",
|
||||
" model_type: !!python/object/apply:llama_stack.apis.models.models.ModelType\n",
|
||||
" - llm\n",
|
||||
|
|
@ -473,6 +527,9 @@
|
|||
" - config: <span style=\"font-weight: bold\">{}</span>\n",
|
||||
" provider_id: model-context-protocol\n",
|
||||
" provider_type: remote::model-context-protocol\n",
|
||||
" - config: <span style=\"font-weight: bold\">{}</span>\n",
|
||||
" provider_id: wolfram-alpha\n",
|
||||
" provider_type: remote::wolfram-alpha\n",
|
||||
" vector_io:\n",
|
||||
" - config:\n",
|
||||
" kvstore:\n",
|
||||
|
|
@ -504,6 +561,10 @@
|
|||
" mcp_endpoint: null\n",
|
||||
" provider_id: code-interpreter\n",
|
||||
" toolgroup_id: builtin::code_interpreter\n",
|
||||
"- args: null\n",
|
||||
" mcp_endpoint: null\n",
|
||||
" provider_id: wolfram-alpha\n",
|
||||
" toolgroup_id: builtin::wolfram_alpha\n",
|
||||
"vector_dbs: <span style=\"font-weight: bold\">[]</span>\n",
|
||||
"version: <span style=\"color: #008000; text-decoration-color: #008000\">'2'</span>\n",
|
||||
"\n",
|
||||
|
|
@ -530,54 +591,108 @@
|
|||
" type: sqlite\n",
|
||||
"models:\n",
|
||||
"- metadata: \u001b[1m{\u001b[0m\u001b[1m}\u001b[0m\n",
|
||||
" model_id: meta-llama/Meta-Llama-\u001b[1;36m3.1\u001b[0m-8B-Instruct-Turbo\n",
|
||||
" model_type: !!python/object/apply:llama_stack.apis.models.models.ModelType\n",
|
||||
" - llm\n",
|
||||
" provider_id: together\n",
|
||||
" provider_model_id: meta-llama/Meta-Llama-\u001b[1;36m3.1\u001b[0m-8B-Instruct-Turbo\n",
|
||||
"- metadata: \u001b[1m{\u001b[0m\u001b[1m}\u001b[0m\n",
|
||||
" model_id: meta-llama/Llama-\u001b[1;36m3.1\u001b[0m-8B-Instruct\n",
|
||||
" model_type: !!python/object/apply:llama_stack.apis.models.models.ModelType\n",
|
||||
" - llm\n",
|
||||
" provider_id: together\n",
|
||||
" provider_model_id: meta-llama/Meta-Llama-\u001b[1;36m3.1\u001b[0m-8B-Instruct-Turbo\n",
|
||||
"- metadata: \u001b[1m{\u001b[0m\u001b[1m}\u001b[0m\n",
|
||||
" model_id: meta-llama/Meta-Llama-\u001b[1;36m3.1\u001b[0m-70B-Instruct-Turbo\n",
|
||||
" model_type: !!python/object/apply:llama_stack.apis.models.models.ModelType\n",
|
||||
" - llm\n",
|
||||
" provider_id: together\n",
|
||||
" provider_model_id: meta-llama/Meta-Llama-\u001b[1;36m3.1\u001b[0m-70B-Instruct-Turbo\n",
|
||||
"- metadata: \u001b[1m{\u001b[0m\u001b[1m}\u001b[0m\n",
|
||||
" model_id: meta-llama/Llama-\u001b[1;36m3.1\u001b[0m-70B-Instruct\n",
|
||||
" model_type: !!python/object/apply:llama_stack.apis.models.models.ModelType\n",
|
||||
" - llm\n",
|
||||
" provider_id: together\n",
|
||||
" provider_model_id: meta-llama/Meta-Llama-\u001b[1;36m3.1\u001b[0m-70B-Instruct-Turbo\n",
|
||||
"- metadata: \u001b[1m{\u001b[0m\u001b[1m}\u001b[0m\n",
|
||||
" model_id: meta-llama/Meta-Llama-\u001b[1;36m3.1\u001b[0m-405B-Instruct-Turbo\n",
|
||||
" model_type: !!python/object/apply:llama_stack.apis.models.models.ModelType\n",
|
||||
" - llm\n",
|
||||
" provider_id: together\n",
|
||||
" provider_model_id: meta-llama/Meta-Llama-\u001b[1;36m3.1\u001b[0m-405B-Instruct-Turbo\n",
|
||||
"- metadata: \u001b[1m{\u001b[0m\u001b[1m}\u001b[0m\n",
|
||||
" model_id: meta-llama/Llama-\u001b[1;36m3.1\u001b[0m-405B-Instruct-FP8\n",
|
||||
" model_type: !!python/object/apply:llama_stack.apis.models.models.ModelType\n",
|
||||
" - llm\n",
|
||||
" provider_id: together\n",
|
||||
" provider_model_id: meta-llama/Meta-Llama-\u001b[1;36m3.1\u001b[0m-405B-Instruct-Turbo\n",
|
||||
"- metadata: \u001b[1m{\u001b[0m\u001b[1m}\u001b[0m\n",
|
||||
" model_id: meta-llama/Llama-\u001b[1;36m3.2\u001b[0m-3B-Instruct-Turbo\n",
|
||||
" model_type: !!python/object/apply:llama_stack.apis.models.models.ModelType\n",
|
||||
" - llm\n",
|
||||
" provider_id: together\n",
|
||||
" provider_model_id: meta-llama/Llama-\u001b[1;36m3.2\u001b[0m-3B-Instruct-Turbo\n",
|
||||
"- metadata: \u001b[1m{\u001b[0m\u001b[1m}\u001b[0m\n",
|
||||
" model_id: meta-llama/Llama-\u001b[1;36m3.2\u001b[0m-3B-Instruct\n",
|
||||
" model_type: !!python/object/apply:llama_stack.apis.models.models.ModelType\n",
|
||||
" - llm\n",
|
||||
" provider_id: together\n",
|
||||
" provider_model_id: meta-llama/Llama-\u001b[1;36m3.2\u001b[0m-3B-Instruct-Turbo\n",
|
||||
"- metadata: \u001b[1m{\u001b[0m\u001b[1m}\u001b[0m\n",
|
||||
" model_id: meta-llama/Llama-\u001b[1;36m3.2\u001b[0m-11B-Vision-Instruct-Turbo\n",
|
||||
" model_type: !!python/object/apply:llama_stack.apis.models.models.ModelType\n",
|
||||
" - llm\n",
|
||||
" provider_id: together\n",
|
||||
" provider_model_id: meta-llama/Llama-\u001b[1;36m3.2\u001b[0m-11B-Vision-Instruct-Turbo\n",
|
||||
"- metadata: \u001b[1m{\u001b[0m\u001b[1m}\u001b[0m\n",
|
||||
" model_id: meta-llama/Llama-\u001b[1;36m3.2\u001b[0m-11B-Vision-Instruct\n",
|
||||
" model_type: !!python/object/apply:llama_stack.apis.models.models.ModelType\n",
|
||||
" - llm\n",
|
||||
" provider_id: together\n",
|
||||
" provider_model_id: meta-llama/Llama-\u001b[1;36m3.2\u001b[0m-11B-Vision-Instruct-Turbo\n",
|
||||
"- metadata: \u001b[1m{\u001b[0m\u001b[1m}\u001b[0m\n",
|
||||
" model_id: meta-llama/Llama-\u001b[1;36m3.2\u001b[0m-90B-Vision-Instruct-Turbo\n",
|
||||
" model_type: !!python/object/apply:llama_stack.apis.models.models.ModelType\n",
|
||||
" - llm\n",
|
||||
" provider_id: together\n",
|
||||
" provider_model_id: meta-llama/Llama-\u001b[1;36m3.2\u001b[0m-90B-Vision-Instruct-Turbo\n",
|
||||
"- metadata: \u001b[1m{\u001b[0m\u001b[1m}\u001b[0m\n",
|
||||
" model_id: meta-llama/Llama-\u001b[1;36m3.2\u001b[0m-90B-Vision-Instruct\n",
|
||||
" model_type: !!python/object/apply:llama_stack.apis.models.models.ModelType\n",
|
||||
" - llm\n",
|
||||
" provider_id: together\n",
|
||||
" provider_model_id: meta-llama/Llama-\u001b[1;36m3.2\u001b[0m-90B-Vision-Instruct-Turbo\n",
|
||||
"- metadata: \u001b[1m{\u001b[0m\u001b[1m}\u001b[0m\n",
|
||||
" model_id: meta-llama/Llama-\u001b[1;36m3.3\u001b[0m-70B-Instruct-Turbo\n",
|
||||
" model_type: !!python/object/apply:llama_stack.apis.models.models.ModelType\n",
|
||||
" - llm\n",
|
||||
" provider_id: together\n",
|
||||
" provider_model_id: meta-llama/Llama-\u001b[1;36m3.3\u001b[0m-70B-Instruct-Turbo\n",
|
||||
"- metadata: \u001b[1m{\u001b[0m\u001b[1m}\u001b[0m\n",
|
||||
" model_id: meta-llama/Llama-\u001b[1;36m3.3\u001b[0m-70B-Instruct\n",
|
||||
" model_type: !!python/object/apply:llama_stack.apis.models.models.ModelType\n",
|
||||
" - llm\n",
|
||||
" provider_id: together\n",
|
||||
" provider_model_id: meta-llama/Llama-\u001b[1;36m3.3\u001b[0m-70B-Instruct-Turbo\n",
|
||||
"- metadata: \u001b[1m{\u001b[0m\u001b[1m}\u001b[0m\n",
|
||||
" model_id: meta-llama/Meta-Llama-Guard-\u001b[1;36m3\u001b[0m-8B\n",
|
||||
" model_type: !!python/object/apply:llama_stack.apis.models.models.ModelType\n",
|
||||
" - llm\n",
|
||||
" provider_id: together\n",
|
||||
" provider_model_id: meta-llama/Meta-Llama-Guard-\u001b[1;36m3\u001b[0m-8B\n",
|
||||
"- metadata: \u001b[1m{\u001b[0m\u001b[1m}\u001b[0m\n",
|
||||
" model_id: meta-llama/Llama-Guard-\u001b[1;36m3\u001b[0m-8B\n",
|
||||
" model_type: !!python/object/apply:llama_stack.apis.models.models.ModelType\n",
|
||||
" - llm\n",
|
||||
" provider_id: together\n",
|
||||
" provider_model_id: meta-llama/Meta-Llama-Guard-\u001b[1;36m3\u001b[0m-8B\n",
|
||||
"- metadata: \u001b[1m{\u001b[0m\u001b[1m}\u001b[0m\n",
|
||||
" model_id: meta-llama/Llama-Guard-\u001b[1;36m3\u001b[0m-11B-Vision-Turbo\n",
|
||||
" model_type: !!python/object/apply:llama_stack.apis.models.models.ModelType\n",
|
||||
" - llm\n",
|
||||
" provider_id: together\n",
|
||||
" provider_model_id: meta-llama/Llama-Guard-\u001b[1;36m3\u001b[0m-11B-Vision-Turbo\n",
|
||||
"- metadata: \u001b[1m{\u001b[0m\u001b[1m}\u001b[0m\n",
|
||||
" model_id: meta-llama/Llama-Guard-\u001b[1;36m3\u001b[0m-11B-Vision\n",
|
||||
" model_type: !!python/object/apply:llama_stack.apis.models.models.ModelType\n",
|
||||
" - llm\n",
|
||||
|
|
@ -677,6 +792,9 @@
|
|||
" - config: \u001b[1m{\u001b[0m\u001b[1m}\u001b[0m\n",
|
||||
" provider_id: model-context-protocol\n",
|
||||
" provider_type: remote::model-context-protocol\n",
|
||||
" - config: \u001b[1m{\u001b[0m\u001b[1m}\u001b[0m\n",
|
||||
" provider_id: wolfram-alpha\n",
|
||||
" provider_type: remote::wolfram-alpha\n",
|
||||
" vector_io:\n",
|
||||
" - config:\n",
|
||||
" kvstore:\n",
|
||||
|
|
@ -708,6 +826,10 @@
|
|||
" mcp_endpoint: null\n",
|
||||
" provider_id: code-interpreter\n",
|
||||
" toolgroup_id: builtin::code_interpreter\n",
|
||||
"- args: null\n",
|
||||
" mcp_endpoint: null\n",
|
||||
" provider_id: wolfram-alpha\n",
|
||||
" toolgroup_id: builtin::wolfram_alpha\n",
|
||||
"vector_dbs: \u001b[1m[\u001b[0m\u001b[1m]\u001b[0m\n",
|
||||
"version: \u001b[32m'2'\u001b[0m\n",
|
||||
"\n"
|
||||
|
|
@ -1145,7 +1267,6 @@
|
|||
}
|
||||
],
|
||||
"source": [
|
||||
"# NBVAL_SKIP\n",
|
||||
"from pydantic import BaseModel\n",
|
||||
"\n",
|
||||
"\n",
|
||||
|
|
@ -1157,7 +1278,7 @@
|
|||
"\n",
|
||||
"user_input = \"Michael Jordan was born in 1963. He played basketball for the Chicago Bulls. He retired in 2003. Extract this information into JSON for me. \"\n",
|
||||
"response = client.inference.completion(\n",
|
||||
" model_id=model_id,\n",
|
||||
" model_id=\"meta-llama/Llama-3.1-8B-Instruct\",\n",
|
||||
" content=user_input,\n",
|
||||
" stream=False,\n",
|
||||
" sampling_params={\n",
|
||||
|
|
@ -1513,18 +1634,14 @@
|
|||
"source": [
|
||||
"from llama_stack_client.lib.agents.agent import Agent\n",
|
||||
"from llama_stack_client.lib.agents.event_logger import EventLogger\n",
|
||||
"from llama_stack_client.types.agent_create_params import AgentConfig\n",
|
||||
"from termcolor import cprint\n",
|
||||
"\n",
|
||||
"agent_config = AgentConfig(\n",
|
||||
"agent = Agent(\n",
|
||||
" client, \n",
|
||||
" model=model_id,\n",
|
||||
" instructions=\"You are a helpful assistant\",\n",
|
||||
" toolgroups=[\"builtin::websearch\"],\n",
|
||||
" input_shields=[],\n",
|
||||
" output_shields=[],\n",
|
||||
" enable_session_persistence=False,\n",
|
||||
" instructions=\"You are a helpful assistant. Use websearch tool to help answer questions.\",\n",
|
||||
" tools=[\"builtin::websearch\"],\n",
|
||||
")\n",
|
||||
"agent = Agent(client, agent_config)\n",
|
||||
"user_prompts = [\n",
|
||||
" \"Hello\",\n",
|
||||
" \"Which teams played in the NBA western conference finals of 2024\",\n",
|
||||
|
|
@ -1693,7 +1810,6 @@
|
|||
"import uuid\n",
|
||||
"from llama_stack_client.lib.agents.agent import Agent\n",
|
||||
"from llama_stack_client.lib.agents.event_logger import EventLogger\n",
|
||||
"from llama_stack_client.types.agent_create_params import AgentConfig\n",
|
||||
"from termcolor import cprint\n",
|
||||
"from llama_stack_client.types import Document\n",
|
||||
"\n",
|
||||
|
|
@ -1719,11 +1835,11 @@
|
|||
" vector_db_id=vector_db_id,\n",
|
||||
" chunk_size_in_tokens=512,\n",
|
||||
")\n",
|
||||
"agent_config = AgentConfig(\n",
|
||||
"rag_agent = Agent(\n",
|
||||
" client, \n",
|
||||
" model=model_id,\n",
|
||||
" instructions=\"You are a helpful assistant\",\n",
|
||||
" enable_session_persistence=False,\n",
|
||||
" toolgroups = [\n",
|
||||
" tools = [\n",
|
||||
" {\n",
|
||||
" \"name\": \"builtin::rag/knowledge_search\",\n",
|
||||
" \"args\" : {\n",
|
||||
|
|
@ -1732,7 +1848,6 @@
|
|||
" }\n",
|
||||
" ],\n",
|
||||
")\n",
|
||||
"rag_agent = Agent(client, agent_config)\n",
|
||||
"session_id = rag_agent.create_session(\"test-session\")\n",
|
||||
"user_prompts = [\n",
|
||||
" \"What are the top 5 topics that were explained? Only list succinct bullet points.\",\n",
|
||||
|
|
@ -1856,23 +1971,19 @@
|
|||
"source": [
|
||||
"from llama_stack_client.types.agents.turn_create_params import Document\n",
|
||||
"\n",
|
||||
"agent_config = AgentConfig(\n",
|
||||
"codex_agent = Agent(\n",
|
||||
" client, \n",
|
||||
" model=\"meta-llama/Llama-3.1-8B-Instruct\",\n",
|
||||
" instructions=\"You are a helpful assistant\",\n",
|
||||
" tools=[\n",
|
||||
" \"builtin::code_interpreter\",\n",
|
||||
" \"builtin::websearch\"\n",
|
||||
" ],\n",
|
||||
" sampling_params = {\n",
|
||||
" \"max_tokens\" : 4096,\n",
|
||||
" \"temperature\": 0.0\n",
|
||||
" },\n",
|
||||
" model=\"meta-llama/Llama-3.1-8B-Instruct\",\n",
|
||||
" instructions=\"You are a helpful assistant\",\n",
|
||||
" toolgroups=[\n",
|
||||
" \"builtin::code_interpreter\",\n",
|
||||
" \"builtin::websearch\"\n",
|
||||
" ],\n",
|
||||
" tool_choice=\"auto\",\n",
|
||||
" input_shields=[],\n",
|
||||
" output_shields=[],\n",
|
||||
" enable_session_persistence=False,\n",
|
||||
")\n",
|
||||
"codex_agent = Agent(client, agent_config)\n",
|
||||
"session_id = codex_agent.create_session(\"test-session\")\n",
|
||||
"\n",
|
||||
"\n",
|
||||
|
|
@ -2782,18 +2893,14 @@
|
|||
"# NBVAL_SKIP\n",
|
||||
"from llama_stack_client.lib.agents.agent import Agent\n",
|
||||
"from llama_stack_client.lib.agents.event_logger import EventLogger\n",
|
||||
"from llama_stack_client.types.agent_create_params import AgentConfig\n",
|
||||
"from termcolor import cprint\n",
|
||||
"\n",
|
||||
"agent_config = AgentConfig(\n",
|
||||
"agent = Agent(\n",
|
||||
" client, \n",
|
||||
" model=model_id,\n",
|
||||
" instructions=\"You are a helpful assistant\",\n",
|
||||
" toolgroups=[\"mcp::filesystem\"],\n",
|
||||
" input_shields=[],\n",
|
||||
" output_shields=[],\n",
|
||||
" enable_session_persistence=False,\n",
|
||||
" tools=[\"mcp::filesystem\"],\n",
|
||||
")\n",
|
||||
"agent = Agent(client, agent_config)\n",
|
||||
"user_prompts = [\n",
|
||||
" \"Hello\",\n",
|
||||
" \"list all the files /content\",\n",
|
||||
|
|
@ -2888,17 +2995,13 @@
|
|||
"source": [
|
||||
"from llama_stack_client.lib.agents.agent import Agent\n",
|
||||
"from llama_stack_client.lib.agents.event_logger import EventLogger\n",
|
||||
"from llama_stack_client.types.agent_create_params import AgentConfig\n",
|
||||
"\n",
|
||||
"agent_config = AgentConfig(\n",
|
||||
"agent = Agent(\n",
|
||||
" client, \n",
|
||||
" model=\"meta-llama/Llama-3.3-70B-Instruct\",\n",
|
||||
" instructions=\"You are a helpful assistant. Use search tool to answer the questions. \",\n",
|
||||
" toolgroups=[\"builtin::websearch\"],\n",
|
||||
" input_shields=[],\n",
|
||||
" output_shields=[],\n",
|
||||
" enable_session_persistence=False,\n",
|
||||
" tools=[\"builtin::websearch\"],\n",
|
||||
")\n",
|
||||
"agent = Agent(client, agent_config)\n",
|
||||
"user_prompts = [\n",
|
||||
" \"Which teams played in the NBA western conference finals of 2024. Search the web for the answer.\",\n",
|
||||
" \"In which episode and season of South Park does Bill Cosby (BSM-471) first appear? Give me the number and title. Search the web for the answer.\",\n",
|
||||
|
|
@ -4098,7 +4201,7 @@
|
|||
"source": [
|
||||
"## 4. Image Understanding with Llama 3.2\n",
|
||||
"\n",
|
||||
"Below is a complete example of using Together's Llama Stack 0.1 server at https://llama-stack.together.ai to ask Llama 3.2 questions about an image."
|
||||
"Below is a complete example of to ask Llama 3.2 questions about an image."
|
||||
]
|
||||
},
|
||||
{
|
||||
|
|
@ -4106,14 +4209,12 @@
|
|||
"id": "82e381ec",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### 4.1 Setup and helpers\n",
|
||||
"\n",
|
||||
"Below we install the Llama Stack client 0.1, download the example image, define two image helpers, and set Llama Stack Together server URL and Llama 3.2 model name.\n"
|
||||
"### 4.1 Setup and helpers\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"execution_count": 1,
|
||||
"id": "44e05e16",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
|
|
@ -4123,7 +4224,7 @@
|
|||
"text": [
|
||||
" % Total % Received % Xferd Average Speed Time Time Time Current\n",
|
||||
" Dload Upload Total Spent Left Speed\n",
|
||||
"100 275k 100 275k 0 0 780k 0 --:--:-- --:--:-- --:--:-- 780k\n"
|
||||
"100 275k 100 275k 0 0 905k 0 --:--:-- --:--:-- --:--:-- 906k\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
|
|
@ -4133,32 +4234,13 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "469750f7",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# NBVAL_SKIP\n",
|
||||
"from PIL import Image\n",
|
||||
"import matplotlib.pyplot as plt\n",
|
||||
"\n",
|
||||
"def display_image(path):\n",
|
||||
" img = Image.open(path)\n",
|
||||
" plt.imshow(img)\n",
|
||||
" plt.axis('off')\n",
|
||||
" plt.show()\n",
|
||||
"\n",
|
||||
"display_image(\"Llama_Repo.jpeg\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"execution_count": 20,
|
||||
"id": "a2c1e1c2",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import base64\n",
|
||||
"vision_model_id = \"meta-llama/Llama-3.2-11B-Vision-Instruct\"\n",
|
||||
"\n",
|
||||
"def encode_image(image_path):\n",
|
||||
" with open(image_path, \"rb\") as image_file:\n",
|
||||
|
|
@ -4167,19 +4249,6 @@
|
|||
" return base64_url"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "c565f99e",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from llama_stack_client import LlamaStackClient\n",
|
||||
"\n",
|
||||
"LLAMA_STACK_API_TOGETHER_URL=\"https://llama-stack.together.ai\"\n",
|
||||
"LLAMA32_11B_INSTRUCT = \"meta-llama/Llama-3.2-11B-Vision-Instruct\""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "7737cd41",
|
||||
|
|
@ -4192,55 +4261,44 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"execution_count": 21,
|
||||
"id": "d7914894",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"There are three llamas in the image. The llama in the middle is purple, the llama on the left is white, and the llama on the right is also white, but it is wearing a blue party hat. Therefore, there are two different colors of llama in the image: purple and white.\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"from llama_stack_client.lib.inference.event_logger import EventLogger\n",
|
||||
"\n",
|
||||
"async def run_main(image_path: str, prompt):\n",
|
||||
" client = LlamaStackClient(\n",
|
||||
" base_url=LLAMA_STACK_API_TOGETHER_URL,\n",
|
||||
" )\n",
|
||||
"\n",
|
||||
" message = {\n",
|
||||
" \"role\": \"user\",\n",
|
||||
" \"content\": [\n",
|
||||
" {\n",
|
||||
" \"type\": \"image\",\n",
|
||||
" \"image\": {\n",
|
||||
" \"url\": {\n",
|
||||
" \"uri\": encode_image(image_path)\n",
|
||||
" }\n",
|
||||
"response = client.inference.chat_completion(\n",
|
||||
" messages=[\n",
|
||||
" {\n",
|
||||
" \"role\": \"user\",\n",
|
||||
" \"content\": [\n",
|
||||
" {\n",
|
||||
" \"type\": \"image\",\n",
|
||||
" \"image\": {\n",
|
||||
" \"url\": {\n",
|
||||
" \"uri\": encode_image(\"Llama_Repo.jpeg\")\n",
|
||||
" }\n",
|
||||
" }\n",
|
||||
" },\n",
|
||||
" {\n",
|
||||
" \"type\": \"text\",\n",
|
||||
" \"text\": \"How many different colors are those llamas? What are those colors?\",\n",
|
||||
" }\n",
|
||||
" },\n",
|
||||
" {\n",
|
||||
" \"type\": \"text\",\n",
|
||||
" \"text\": prompt,\n",
|
||||
" }\n",
|
||||
" ]\n",
|
||||
" }\n",
|
||||
" ]\n",
|
||||
" }\n",
|
||||
" ],\n",
|
||||
" model_id=vision_model_id,\n",
|
||||
" stream=False,\n",
|
||||
")\n",
|
||||
"\n",
|
||||
" response = client.inference.chat_completion(\n",
|
||||
" messages=[message],\n",
|
||||
" model_id=LLAMA32_11B_INSTRUCT,\n",
|
||||
" stream=False,\n",
|
||||
" )\n",
|
||||
"\n",
|
||||
" print(response.completion_message.content.lower().strip())"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "4ee09b97",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"await run_main(\"Llama_Repo.jpeg\",\n",
|
||||
" \"How many different colors are those llamas?\\\n",
|
||||
" What are those colors?\")"
|
||||
"print(response.completion_message.content)"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
|
@ -4255,68 +4313,60 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"execution_count": 19,
|
||||
"id": "f9a83275",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"\u001b[33minference> \u001b[0m\u001b[33mThere\u001b[0m\u001b[33m are\u001b[0m\u001b[33m three\u001b[0m\u001b[33m different\u001b[0m\u001b[33m colors\u001b[0m\u001b[33m of\u001b[0m\u001b[33m ll\u001b[0m\u001b[33mamas\u001b[0m\u001b[33m in\u001b[0m\u001b[33m the\u001b[0m\u001b[33m image\u001b[0m\u001b[33m.\u001b[0m\u001b[33m The\u001b[0m\u001b[33m first\u001b[0m\u001b[33m llama\u001b[0m\u001b[33m on\u001b[0m\u001b[33m the\u001b[0m\u001b[33m left\u001b[0m\u001b[33m is\u001b[0m\u001b[33m white\u001b[0m\u001b[33m,\u001b[0m\u001b[33m the\u001b[0m\u001b[33m second\u001b[0m\u001b[33m llama\u001b[0m\u001b[33m in\u001b[0m\u001b[33m the\u001b[0m\u001b[33m middle\u001b[0m\u001b[33m is\u001b[0m\u001b[33m purple\u001b[0m\u001b[33m,\u001b[0m\u001b[33m and\u001b[0m\u001b[33m the\u001b[0m\u001b[33m third\u001b[0m\u001b[33m llama\u001b[0m\u001b[33m on\u001b[0m\u001b[33m the\u001b[0m\u001b[33m right\u001b[0m\u001b[33m is\u001b[0m\u001b[33m white\u001b[0m\u001b[33m with\u001b[0m\u001b[33m a\u001b[0m\u001b[33m blue\u001b[0m\u001b[33m party\u001b[0m\u001b[33m hat\u001b[0m\u001b[33m.\u001b[0m\u001b[97m\u001b[0m\n",
|
||||
"\u001b[30m\u001b[0m"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"from llama_stack_client.lib.agents.agent import Agent\n",
|
||||
"from llama_stack_client.lib.agents.event_logger import EventLogger\n",
|
||||
"from llama_stack_client.types.agent_create_params import AgentConfig\n",
|
||||
"agent = Agent(\n",
|
||||
" client, \n",
|
||||
" model=vision_model_id,\n",
|
||||
" instructions=\"You are a helpful assistant\",\n",
|
||||
")\n",
|
||||
"session_id = agent.create_session(\"test-session\")\n",
|
||||
"\n",
|
||||
"async def run_main(image_path, prompt):\n",
|
||||
" base64_image = encode_image(image_path)\n",
|
||||
"\n",
|
||||
" client = LlamaStackClient(\n",
|
||||
" base_url=LLAMA_STACK_API_TOGETHER_URL,\n",
|
||||
" )\n",
|
||||
"\n",
|
||||
" agent_config = AgentConfig(\n",
|
||||
" model=LLAMA32_11B_INSTRUCT,\n",
|
||||
" instructions=\"You are a helpful assistant\",\n",
|
||||
" enable_session_persistence=False,\n",
|
||||
" toolgroups=[],\n",
|
||||
" )\n",
|
||||
"\n",
|
||||
" agent = Agent(client, agent_config)\n",
|
||||
" session_id = agent.create_session(\"test-session\")\n",
|
||||
"\n",
|
||||
" response = agent.create_turn(\n",
|
||||
" messages=[{\n",
|
||||
" \"role\": \"user\",\n",
|
||||
" \"content\": [\n",
|
||||
" {\n",
|
||||
" \"type\": \"image\",\n",
|
||||
" \"image\": {\n",
|
||||
" \"url\": {\n",
|
||||
" \"uri\": encode_image(image_path)\n",
|
||||
" }\n",
|
||||
" }\n",
|
||||
" },\n",
|
||||
" {\n",
|
||||
" \"type\": \"text\",\n",
|
||||
" \"text\": prompt,\n",
|
||||
"response = agent.create_turn(\n",
|
||||
" messages=[{\n",
|
||||
" \"role\": \"user\",\n",
|
||||
" \"content\": [\n",
|
||||
" {\n",
|
||||
" \"type\": \"image\",\n",
|
||||
" \"image\": {\n",
|
||||
" \"url\": {\n",
|
||||
" \"uri\": encode_image(\"Llama_Repo.jpeg\")\n",
|
||||
" }\n",
|
||||
" }\n",
|
||||
" ]\n",
|
||||
" }],\n",
|
||||
" session_id=session_id,\n",
|
||||
" )\n",
|
||||
" },\n",
|
||||
" {\n",
|
||||
" \"type\": \"text\",\n",
|
||||
" \"text\": \"How many different colors are those llamas? What are those colors?\",\n",
|
||||
" }\n",
|
||||
" ]\n",
|
||||
" }],\n",
|
||||
" session_id=session_id,\n",
|
||||
")\n",
|
||||
"\n",
|
||||
" for log in EventLogger().log(response):\n",
|
||||
" log.print()"
|
||||
"for log in EventLogger().log(response):\n",
|
||||
" log.print()\n",
|
||||
" "
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "15d0098b",
|
||||
"id": "f3352379",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"await run_main(\"Llama_Repo.jpeg\",\n",
|
||||
" \"How many different colors are those llamas?\\\n",
|
||||
" What are those colors?\")"
|
||||
]
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
|
|
|
|||
|
|
@ -3675,7 +3675,7 @@
|
|||
" benchmark_id=\"llama3.2-3B-instruct:tax_eval\",\n",
|
||||
" input_rows=eval_rows.rows,\n",
|
||||
" scoring_functions=[\"braintrust::answer-similarity\"],\n",
|
||||
" task_config={\n",
|
||||
" benchmark_config={\n",
|
||||
" \"type\": \"benchmark\",\n",
|
||||
" \"eval_candidate\": {\n",
|
||||
" \"type\": \"model\",\n",
|
||||
|
|
@ -6383,7 +6383,7 @@
|
|||
" benchmark_id=\"Llama-3.2-3B-Instruct-sft-0:tax_eval\",\n",
|
||||
" input_rows=eval_rows.rows,\n",
|
||||
" scoring_functions=[\"braintrust::answer-similarity\"],\n",
|
||||
" task_config={\n",
|
||||
" benchmark_config={\n",
|
||||
" \"type\": \"benchmark\",\n",
|
||||
" \"eval_candidate\": {\n",
|
||||
" \"type\": \"model\",\n",
|
||||
|
|
|
|||
|
|
@ -781,7 +781,7 @@
|
|||
" benchmark_id=\"meta-reference::mmmu\",\n",
|
||||
" input_rows=eval_rows,\n",
|
||||
" scoring_functions=[\"basic::regex_parser_multiple_choice_answer\"],\n",
|
||||
" task_config={\n",
|
||||
" benchmark_config={\n",
|
||||
" \"type\": \"benchmark\",\n",
|
||||
" \"eval_candidate\": {\n",
|
||||
" \"type\": \"model\",\n",
|
||||
|
|
@ -826,10 +826,9 @@
|
|||
"_ = client.datasets.register(\n",
|
||||
" dataset_id=simpleqa_dataset_id,\n",
|
||||
" provider_id=\"huggingface\",\n",
|
||||
" url={\"uri\": \"https://huggingface.co/datasets/llamastack/evals\"},\n",
|
||||
" url={\"uri\": \"https://huggingface.co/datasets/llamastack/simpleqa\"},\n",
|
||||
" metadata={\n",
|
||||
" \"path\": \"llamastack/evals\",\n",
|
||||
" \"name\": \"evals__simpleqa\",\n",
|
||||
" \"path\": \"llamastack/simpleqa\",\n",
|
||||
" \"split\": \"train\",\n",
|
||||
" },\n",
|
||||
" dataset_schema={\n",
|
||||
|
|
@ -960,7 +959,7 @@
|
|||
" benchmark_id=\"meta-reference::simpleqa\",\n",
|
||||
" input_rows=eval_rows.rows,\n",
|
||||
" scoring_functions=[\"llm-as-judge::405b-simpleqa\"],\n",
|
||||
" task_config={\n",
|
||||
" benchmark_config={\n",
|
||||
" \"type\": \"benchmark\",\n",
|
||||
" \"eval_candidate\": {\n",
|
||||
" \"type\": \"model\",\n",
|
||||
|
|
@ -1109,7 +1108,7 @@
|
|||
" benchmark_id=\"meta-reference::simpleqa\",\n",
|
||||
" input_rows=eval_rows.rows,\n",
|
||||
" scoring_functions=[\"llm-as-judge::405b-simpleqa\"],\n",
|
||||
" task_config={\n",
|
||||
" benchmark_config={\n",
|
||||
" \"type\": \"benchmark\",\n",
|
||||
" \"eval_candidate\": {\n",
|
||||
" \"type\": \"agent\",\n",
|
||||
|
|
|
|||
|
|
@ -1,9 +1 @@
|
|||
The RFC Specification (OpenAPI format) is generated from the set of API endpoints located in `llama_stack/distribution/server/endpoints.py` using the `generate.py` utility.
|
||||
|
||||
Please install the following packages before running the script:
|
||||
|
||||
```
|
||||
pip install fire PyYAML
|
||||
```
|
||||
|
||||
Then simply run `sh run_openapi_generator.sh`
|
||||
|
|
|
|||
|
|
@ -14,18 +14,16 @@ Agents are configured using the `AgentConfig` class, which includes:
|
|||
- **Safety Shields**: Guardrails to ensure responsible AI behavior
|
||||
|
||||
```python
|
||||
from llama_stack_client.types.agent_create_params import AgentConfig
|
||||
from llama_stack_client.lib.agents.agent import Agent
|
||||
|
||||
# Configure an agent
|
||||
agent_config = AgentConfig(
|
||||
model="meta-llama/Llama-3-70b-chat",
|
||||
instructions="You are a helpful assistant that can use tools to answer questions.",
|
||||
toolgroups=["builtin::code_interpreter", "builtin::rag/knowledge_search"],
|
||||
)
|
||||
|
||||
# Create the agent
|
||||
agent = Agent(llama_stack_client, agent_config)
|
||||
agent = Agent(
|
||||
llama_stack_client,
|
||||
model="meta-llama/Llama-3-70b-chat",
|
||||
instructions="You are a helpful assistant that can use tools to answer questions.",
|
||||
tools=["builtin::code_interpreter", "builtin::rag/knowledge_search"],
|
||||
)
|
||||
```
|
||||
|
||||
### 2. Sessions
|
||||
|
|
|
|||
|
|
@ -70,18 +70,18 @@ Each step in this process can be monitored and controlled through configurations
|
|||
from llama_stack_client import LlamaStackClient
|
||||
from llama_stack_client.lib.agents.agent import Agent
|
||||
from llama_stack_client.lib.agents.event_logger import EventLogger
|
||||
from llama_stack_client.types.agent_create_params import AgentConfig
|
||||
from rich.pretty import pprint
|
||||
|
||||
# Replace host and port
|
||||
client = LlamaStackClient(base_url=f"http://{HOST}:{PORT}")
|
||||
|
||||
agent_config = AgentConfig(
|
||||
agent = Agent(
|
||||
client,
|
||||
# Check with `llama-stack-client models list`
|
||||
model="Llama3.2-3B-Instruct",
|
||||
instructions="You are a helpful assistant",
|
||||
# Enable both RAG and tool usage
|
||||
toolgroups=[
|
||||
tools=[
|
||||
{
|
||||
"name": "builtin::rag/knowledge_search",
|
||||
"args": {"vector_db_ids": ["my_docs"]},
|
||||
|
|
@ -98,8 +98,6 @@ agent_config = AgentConfig(
|
|||
"max_tokens": 2048,
|
||||
},
|
||||
)
|
||||
|
||||
agent = Agent(client, agent_config)
|
||||
session_id = agent.create_session("monitored_session")
|
||||
|
||||
# Stream the agent's execution steps
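# A minimal sketch of the streaming step; the user prompt below is illustrative,
# while create_turn and EventLogger mirror the other agent examples in this guide
# (EventLogger is imported at the top of this block).
response = agent.create_turn(
    messages=[{"role": "user", "content": "Search for the latest PyTorch release notes and summarize them."}],
    session_id=session_id,
)
for log in EventLogger().log(response):
    log.print()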
|
||||
|
|
|
|||
|
|
@ -1,169 +1,127 @@
|
|||
# Evals
|
||||
# Evaluations
|
||||
|
||||
[](https://colab.research.google.com/drive/10CHyykee9j2OigaIcRv47BKG9mrNm0tJ?usp=sharing)
|
||||
Llama Stack provides a set of APIs for running evaluations of LLM applications.
|
||||
- `/datasetio` + `/datasets` API
|
||||
- `/scoring` + `/scoring_functions` API
|
||||
- `/eval` + `/benchmarks` API
|
||||
|
||||
Llama Stack provides the building blocks needed to run benchmark and application evaluations. This guide will walk you through how to use these components to run open benchmark evaluations. Visit our [Evaluation Concepts](../concepts/evaluation_concepts.md) guide for more details on how evaluations work in Llama Stack, and our [Evaluation Reference](../references/evals_reference/index.md) guide for a comprehensive reference on the APIs.
|
||||
|
||||
### 1. Open Benchmark Model Evaluation
|
||||
|
||||
This first example walks you through how to evaluate a model candidate served by Llama Stack on open benchmarks. We will use the following benchmarks:
|
||||
- [MMMU](https://arxiv.org/abs/2311.16502) (A Massive Multi-discipline Multimodal Understanding and Reasoning Benchmark for Expert AGI): Benchmark designed to evaluate multimodal models.
|
||||
- [SimpleQA](https://openai.com/index/introducing-simpleqa/): Benchmark designed to assess a model's ability to answer short, fact-seeking questions.
|
||||
This guide walks you through the process of evaluating an LLM application built using Llama Stack. The [Evaluation Reference](../references/evals_reference/index.md) guide covers the set of APIs and the developer experience of using Llama Stack to run evaluations for both benchmark and application use cases. Check out our Colab notebook with working evaluation examples [here](https://colab.research.google.com/drive/10CHyykee9j2OigaIcRv47BKG9mrNm0tJ?usp=sharing).
|
||||
|
||||
#### 1.1 Running MMMU
|
||||
- We will use a pre-processed MMMU dataset from [llamastack/mmmu](https://huggingface.co/datasets/llamastack/mmmu). The preprocessing code is shown in this [GitHub Gist](https://gist.github.com/yanxi0830/118e9c560227d27132a7fd10e2c92840). The dataset is obtained by transforming the original [MMMU/MMMU](https://huggingface.co/datasets/MMMU/MMMU) dataset into the format expected by the `inference/chat-completion` API.
|
||||
|
||||
## Application Evaluation
|
||||
|
||||
[](https://colab.research.google.com/github/meta-llama/llama-stack/blob/main/docs/getting_started.ipynb)
|
||||
|
||||
Llama Stack offers a library of scoring functions and the `/scoring` API, allowing you to run evaluations on your pre-annotated AI application datasets.
|
||||
|
||||
In this example, we will show you how to:
|
||||
1. Build an Agent with Llama Stack
|
||||
2. Query the agent's sessions, turns, and steps
|
||||
3. Evaluate the results.
|
||||
|
||||
##### Building a Search Agent
|
||||
```python
|
||||
import datasets
|
||||
from llama_stack_client import LlamaStackClient
|
||||
from llama_stack_client.lib.agents.agent import Agent
|
||||
from llama_stack_client.lib.agents.event_logger import EventLogger
|
||||
|
||||
ds = datasets.load_dataset(path="llamastack/mmmu", name="Agriculture", split="dev")
|
||||
ds = ds.select_columns(["chat_completion_input", "input_query", "expected_answer"])
|
||||
eval_rows = ds.to_pandas().to_dict(orient="records")
|
||||
```
|
||||
client = LlamaStackClient(base_url=f"http://{HOST}:{PORT}")
|
||||
|
||||
- Next, we will run an evaluation on a model candidate. To do so, we will need to:
|
||||
- Define a system prompt
|
||||
- Define an EvalCandidate
|
||||
- Run evaluate on the dataset
|
||||
|
||||
```python
|
||||
SYSTEM_PROMPT_TEMPLATE = """
|
||||
You are an expert in Agriculture whose job is to answer questions from the user using images.
|
||||
First, reason about the correct answer.
|
||||
Then write the answer in the following format where X is exactly one of A,B,C,D:
|
||||
Answer: X
|
||||
Make sure X is one of A,B,C,D.
|
||||
If you are uncertain of the correct answer, guess the most likely one.
|
||||
"""
|
||||
|
||||
system_message = {
|
||||
"role": "system",
|
||||
"content": SYSTEM_PROMPT_TEMPLATE,
|
||||
}
|
||||
|
||||
client.benchmarks.register(
|
||||
benchmark_id="meta-reference::mmmu",
|
||||
dataset_id=f"mmmu-{subset}-{split}",
|
||||
scoring_functions=["basic::regex_parser_multiple_choice_answer"],
|
||||
agent = Agent(
|
||||
client,
|
||||
model="meta-llama/Llama-3.3-70B-Instruct",
|
||||
instructions="You are a helpful assistant. Use search tool to answer the questions. ",
|
||||
tools=["builtin::websearch"],
|
||||
)
|
||||
user_prompts = [
|
||||
"Which teams played in the NBA Western Conference Finals of 2024. Search the web for the answer.",
|
||||
"In which episode and season of South Park does Bill Cosby (BSM-471) first appear? Give me the number and title. Search the web for the answer.",
|
||||
"What is the British-American kickboxer Andrew Tate's kickboxing name? Search the web for the answer.",
|
||||
]
|
||||
|
||||
response = client.eval.evaluate_rows(
|
||||
benchmark_id="meta-reference::mmmu",
|
||||
input_rows=eval_rows,
|
||||
scoring_functions=["basic::regex_parser_multiple_choice_answer"],
|
||||
task_config={
|
||||
"type": "benchmark",
|
||||
"eval_candidate": {
|
||||
"type": "model",
|
||||
"model": "meta-llama/Llama-3.2-90B-Vision-Instruct",
|
||||
"sampling_params": {
|
||||
"strategy": {
|
||||
"type": "greedy",
|
||||
},
|
||||
"max_tokens": 4096,
|
||||
"repeat_penalty": 1.0,
|
||||
},
|
||||
"system_message": system_message,
|
||||
},
|
||||
},
|
||||
)
|
||||
```
|
||||
session_id = agent.create_session("test-session")
|
||||
|
||||
#### 1.2. Running SimpleQA
|
||||
- We will use a pre-processed SimpleQA dataset from [llamastack/evals](https://huggingface.co/datasets/llamastack/evals/viewer/evals__simpleqa), which is obtained by transforming the input query into the format accepted by the `inference/chat-completion` API.
|
||||
- Since we will be using this same dataset in our next example for agentic evaluation, we will register it using the `/datasets` API and interact with it through the `/datasetio` API.
|
||||
for prompt in user_prompts:
|
||||
response = agent.create_turn(
|
||||
messages=[
|
||||
{
|
||||
"role": "user",
|
||||
"content": prompt,
|
||||
}
|
||||
],
|
||||
session_id=session_id,
|
||||
)
|
||||
|
||||
```python
|
||||
simpleqa_dataset_id = "huggingface::simpleqa"
|
||||
|
||||
_ = client.datasets.register(
|
||||
dataset_id=simpleqa_dataset_id,
|
||||
provider_id="huggingface",
|
||||
url={"uri": "https://huggingface.co/datasets/llamastack/evals"},
|
||||
metadata={
|
||||
"path": "llamastack/evals",
|
||||
"name": "evals__simpleqa",
|
||||
"split": "train",
|
||||
},
|
||||
dataset_schema={
|
||||
"input_query": {"type": "string"},
|
||||
"expected_answer": {"type": "string"},
|
||||
"chat_completion_input": {"type": "chat_completion_input"},
|
||||
},
|
||||
)
|
||||
|
||||
eval_rows = client.datasetio.get_rows_paginated(
|
||||
dataset_id=simpleqa_dataset_id,
|
||||
rows_in_page=5,
|
||||
)
|
||||
```
|
||||
|
||||
```python
|
||||
client.benchmarks.register(
|
||||
benchmark_id="meta-reference::simpleqa",
|
||||
dataset_id=simpleqa_dataset_id,
|
||||
scoring_functions=["llm-as-judge::405b-simpleqa"],
|
||||
)
|
||||
|
||||
response = client.eval.evaluate_rows(
|
||||
benchmark_id="meta-reference::simpleqa",
|
||||
input_rows=eval_rows.rows,
|
||||
scoring_functions=["llm-as-judge::405b-simpleqa"],
|
||||
task_config={
|
||||
"type": "benchmark",
|
||||
"eval_candidate": {
|
||||
"type": "model",
|
||||
"model": "meta-llama/Llama-3.2-90B-Vision-Instruct",
|
||||
"sampling_params": {
|
||||
"strategy": {
|
||||
"type": "greedy",
|
||||
},
|
||||
"max_tokens": 4096,
|
||||
"repeat_penalty": 1.0,
|
||||
},
|
||||
},
|
||||
},
|
||||
)
|
||||
for log in EventLogger().log(response):
|
||||
log.print()
|
||||
```
|
||||
|
||||
|
||||
### 2. Agentic Evaluation
|
||||
- In this example, we will demonstrate how to evaluate an agent candidate served by Llama Stack via the `/agent` API.
|
||||
- We will continue to use the SimpleQA dataset we used in the previous example.
|
||||
- Instead of running the evaluation on a model, we will run it on a Search Agent with access to a search tool. We will define our agent evaluation candidate through `AgentConfig`.
|
||||
##### Query Agent Execution Steps
|
||||
|
||||
Now, let's look deeper into the agent's execution steps and see how well our agent performs.
|
||||
```python
|
||||
# query the agents session
|
||||
from rich.pretty import pprint
|
||||
|
||||
session_response = client.agents.session.retrieve(
|
||||
session_id=session_id,
|
||||
agent_id=agent.agent_id,
|
||||
)
|
||||
|
||||
pprint(session_response)
|
||||
```
|
||||
|
||||
As a sanity check, we will first check whether each user prompt is followed by a tool call to `brave_search`.
|
||||
```python
|
||||
num_tool_call = 0
|
||||
for turn in session_response.turns:
|
||||
for step in turn.steps:
|
||||
if (
|
||||
step.step_type == "tool_execution"
|
||||
and step.tool_calls[0].tool_name == "brave_search"
|
||||
):
|
||||
num_tool_call += 1
|
||||
|
||||
print(
|
||||
f"{num_tool_call}/{len(session_response.turns)} user prompts are followed by a tool call to `brave_search`"
|
||||
)
|
||||
```
|
||||
|
||||
##### Evaluate Agent Responses
|
||||
Now, we want to evaluate the agent's responses to the user prompts.
|
||||
|
||||
1. First, we will process the agent's execution history into a list of rows that can be used for evaluation.
|
||||
2. Next, we will label the rows with the expected answer.
|
||||
3. Finally, we will use the `/scoring` API to score the agent's responses.
|
||||
|
||||
```python
|
||||
agent_config = {
|
||||
"model": "meta-llama/Llama-3.1-405B-Instruct",
|
||||
"instructions": "You are a helpful assistant",
|
||||
"sampling_params": {
|
||||
"strategy": {
|
||||
"type": "greedy",
|
||||
},
|
||||
},
|
||||
"tools": [
|
||||
eval_rows = []
|
||||
|
||||
expected_answers = [
|
||||
"Dallas Mavericks and the Minnesota Timberwolves",
|
||||
"Season 4, Episode 12",
|
||||
"King Cobra",
|
||||
]
|
||||
|
||||
for i, turn in enumerate(session_response.turns):
|
||||
eval_rows.append(
|
||||
{
|
||||
"type": "brave_search",
|
||||
"engine": "tavily",
|
||||
"api_key": userdata.get("TAVILY_SEARCH_API_KEY"),
|
||||
"input_query": turn.input_messages[0].content,
|
||||
"generated_answer": turn.output_message.content,
|
||||
"expected_answer": expected_answers[i],
|
||||
}
|
||||
],
|
||||
"tool_choice": "auto",
|
||||
"input_shields": [],
|
||||
"output_shields": [],
|
||||
"enable_session_persistence": False,
|
||||
}
|
||||
)
|
||||
|
||||
response = client.eval.evaluate_rows(
|
||||
benchmark_id="meta-reference::simpleqa",
|
||||
input_rows=eval_rows.rows,
|
||||
scoring_functions=["llm-as-judge::405b-simpleqa"],
|
||||
task_config={
|
||||
"type": "benchmark",
|
||||
"eval_candidate": {
|
||||
"type": "agent",
|
||||
"config": agent_config,
|
||||
},
|
||||
},
|
||||
pprint(eval_rows)
|
||||
|
||||
scoring_params = {
|
||||
"basic::subset_of": None,
|
||||
}
|
||||
scoring_response = client.scoring.score(
|
||||
input_rows=eval_rows, scoring_functions=scoring_params
|
||||
)
|
||||
pprint(scoring_response)
|
||||
```
|
||||
|
|
|
|||
|
|
@ -1,30 +0,0 @@
|
|||
## Testing & Evaluation
|
||||
|
||||
Llama Stack provides built-in tools for evaluating your applications:
|
||||
|
||||
1. **Benchmarking**: Test against standard datasets
|
||||
2. **Application Evaluation**: Score your application's outputs
|
||||
3. **Custom Metrics**: Define your own evaluation criteria
|
||||
|
||||
Here's how to set up basic evaluation:
|
||||
|
||||
```python
|
||||
# Create an evaluation task
|
||||
response = client.benchmarks.register(
|
||||
benchmark_id="my_eval",
|
||||
dataset_id="my_dataset",
|
||||
scoring_functions=["accuracy", "relevance"],
|
||||
)
|
||||
|
||||
# Run evaluation
|
||||
job = client.eval.run_eval(
|
||||
benchmark_id="my_eval",
|
||||
task_config={
|
||||
"type": "app",
|
||||
"eval_candidate": {"type": "agent", "config": agent_config},
|
||||
},
|
||||
)
|
||||
|
||||
# Get results
|
||||
result = client.eval.job_result(benchmark_id="my_eval", job_id=job.job_id)
|
||||
```
|
||||
|
|
@ -20,6 +20,11 @@ We may add more storage types like Graph IO in the future.
|
|||
Here's how to set up a vector database for RAG:
|
||||
|
||||
```python
|
||||
# Create http client
|
||||
from llama_stack_client import LlamaStackClient
|
||||
|
||||
client = LlamaStackClient(base_url=f"http://localhost:{os.environ['LLAMA_STACK_PORT']}")
|
||||
|
||||
# Register a vector db
|
||||
vector_db_id = "my_documents"
|
||||
response = client.vector_dbs.register(
|
||||
|
|
@ -81,15 +86,14 @@ results = client.tool_runtime.rag_tool.query(
|
|||
One of the most powerful patterns is combining agents with RAG capabilities. Here's a complete example:
|
||||
|
||||
```python
|
||||
from llama_stack_client.types.agent_create_params import AgentConfig
|
||||
from llama_stack_client.lib.agents.agent import Agent
|
||||
|
||||
# Configure agent with memory
|
||||
agent_config = AgentConfig(
|
||||
# Create agent with memory
|
||||
agent = Agent(
|
||||
client,
|
||||
model="meta-llama/Llama-3.3-70B-Instruct",
|
||||
instructions="You are a helpful assistant",
|
||||
enable_session_persistence=False,
|
||||
toolgroups=[
|
||||
tools=[
|
||||
{
|
||||
"name": "builtin::rag/knowledge_search",
|
||||
"args": {
|
||||
|
|
@ -98,8 +102,6 @@ agent_config = AgentConfig(
|
|||
}
|
||||
],
|
||||
)
|
||||
|
||||
agent = Agent(client, agent_config)
|
||||
session_id = agent.create_session("rag_session")
|
||||
|
||||
|
||||
|
|
@ -122,7 +124,7 @@ response = agent.create_turn(
|
|||
],
|
||||
documents=[
|
||||
{
|
||||
"content": "https://raw.githubusercontent.com/example/doc.rst",
|
||||
"content": "https://raw.githubusercontent.com/pytorch/torchtune/main/docs/source/tutorials/memory_optimizations.rst",
|
||||
"mime_type": "text/plain",
|
||||
}
|
||||
],
|
||||
|
|
@ -136,6 +138,14 @@ response = agent.create_turn(
|
|||
)
|
||||
```
|
||||
|
||||
You can print the response as shown below.
|
||||
```python
|
||||
from llama_stack_client.lib.agents.event_logger import EventLogger
|
||||
|
||||
for log in EventLogger().log(response):
|
||||
log.print()
|
||||
```
|
||||
|
||||
### Unregistering Vector DBs
|
||||
|
||||
If you need to clean up and unregister vector databases, you can do so as follows:
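A minimal sketch of that cleanup (the `list()`/`unregister()` calls and the `identifier` field are assumptions based on the client APIs used elsewhere in this guide):

```python
# Unregister every vector database currently known to the client
for vector_db in client.vector_dbs.list():
    print(f"Unregistering vector database: {vector_db.identifier}")
    client.vector_dbs.unregister(vector_db_id=vector_db.identifier)
```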
|
||||
|
|
|
|||
|
|
@ -5,7 +5,7 @@ An example of this would be a "db_access" tool group that contains tools for int
|
|||
|
||||
Tools are treated like any other resource in Llama Stack, such as models. You can register them, have providers for them, etc.
|
||||
|
||||
When instatiating an agent, you can provide it a list of tool groups that it has access to. Agent gets the corresponding tool definitions for the specified tool groups and passes them along to the model.
|
||||
When instantiating an agent, you can provide it a list of tool groups that it has access to. Agent gets the corresponding tool definitions for the specified tool groups and passes them along to the model.
|
||||
|
||||
Refer to the [Building AI Applications](https://github.com/meta-llama/llama-stack/blob/main/docs/getting_started.ipynb) notebook for more examples on how to use tools.
|
||||
|
||||
|
|
@ -60,7 +60,7 @@ Features:
|
|||
- Disabled dangerous system operations
|
||||
- Configurable execution timeouts
|
||||
|
||||
> ⚠️ Important: The code interpreter tool can operate in a controlled enviroment locally or on Podman containers. To ensure proper functionality in containerised environments:
|
||||
> ⚠️ Important: The code interpreter tool can operate in a controlled environment locally or on Podman containers. To ensure proper functionality in containerized environments:
|
||||
> - The container requires privileged access (e.g., --privileged).
|
||||
> - Users without sufficient permissions may encounter permission errors. (`bwrap: Can't mount devpts on /newroot/dev/pts: Permission denied`)
|
||||
> - 🔒 Security Warning: Privileged mode grants elevated access and bypasses security restrictions. Use only in local, isolated, or controlled environments.
|
||||
|
|
@ -127,15 +127,11 @@ MCP tools require:
|
|||
|
||||
## Adding Custom Tools
|
||||
|
||||
When you want to use tools other than the built-in tools, you can implement a python function and decorate it with `@client_tool`.
|
||||
When you want to use tools other than the built-in tools, you just need to implement a Python function with a docstring. The content of the docstring will be used to describe the tool and its parameters, and is passed
|
||||
along to the generative model.
|
||||
|
||||
To define a custom tool, you need to use the `@client_tool` decorator.
|
||||
```python
|
||||
from llama_stack_client.lib.agents.client_tool import client_tool
|
||||
|
||||
|
||||
# Example tool definition
|
||||
@client_tool
|
||||
def my_tool(input: int) -> int:
|
||||
"""
|
||||
Runs my awesome tool.
|
||||
|
|
@ -149,15 +145,7 @@ def my_tool(input: int) -> int:
|
|||
Once defined, simply pass the tool to the agent. `Agent` will take care of the rest (calling the model with the tool definition, executing the tool, and returning the result to the model for the next iteration).
|
||||
```python
|
||||
# Example agent config with client provided tools
|
||||
client_tools = [
|
||||
my_tool,
|
||||
]
|
||||
|
||||
agent_config = AgentConfig(
|
||||
...,
|
||||
client_tools=[client_tool.get_tool_definition() for client_tool in client_tools],
|
||||
)
|
||||
agent = Agent(client, agent_config, client_tools)
|
||||
agent = Agent(client, ..., tools=[my_tool])
|
||||
```
|
||||
|
||||
Refer to [llama-stack-apps](https://github.com/meta-llama/llama-stack-apps/blob/main/examples/agents/e2e_loop_with_client_tools.py) for an example of how to use client provided tools.
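For a quick local check, a minimal usage sketch (the prompt is illustrative; `agent` and `my_tool` come from the snippets above):

```python
from llama_stack_client.lib.agents.event_logger import EventLogger

# Drive one turn that should trigger the custom tool and print the events.
session_id = agent.create_session("client-tool-session")
response = agent.create_turn(
    messages=[{"role": "user", "content": "Run my awesome tool with input 3"}],
    session_id=session_id,
)
for log in EventLogger().log(response):
    log.print()
```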
|
||||
|
|
@ -194,10 +182,10 @@ group_tools = client.tools.list_tools(toolgroup_id="search_tools")
|
|||
|
||||
```python
|
||||
from llama_stack_client.lib.agents.agent import Agent
|
||||
from llama_stack_client.types.agent_create_params import AgentConfig
|
||||
|
||||
# Configure the AI agent with necessary parameters
|
||||
agent_config = AgentConfig(
|
||||
# Instantiate the AI agent with the given configuration
|
||||
agent = Agent(
|
||||
client,
|
||||
name="code-interpreter",
|
||||
description="A code interpreter agent for executing Python code snippets",
|
||||
instructions="""
|
||||
|
|
@ -205,14 +193,10 @@ agent_config = AgentConfig(
|
|||
Always show the generated code, never generate your own code, and never anticipate results.
|
||||
""",
|
||||
model="meta-llama/Llama-3.2-3B-Instruct",
|
||||
toolgroups=["builtin::code_interpreter"],
|
||||
tools=["builtin::code_interpreter"],
|
||||
max_infer_iters=5,
|
||||
enable_session_persistence=False,
|
||||
)
|
||||
|
||||
# Instantiate the AI agent with the given configuration
|
||||
agent = Agent(client, agent_config)
|
||||
|
||||
# Start a session
|
||||
session_id = agent.create_session("tool_session")
|
||||
|
||||
|
|
|
|||
|
|
@ -24,17 +24,58 @@ The Evaluation APIs are associated with a set of Resources as shown in the follo
|
|||
- Associated with `Benchmark` resource.
|
||||
|
||||
|
||||
Use the following decision tree to decide how to use the Llama Stack evaluation flow.
|
||||

|
||||
## Open-benchmark Eval
|
||||
|
||||
### List of open-benchmarks Llama Stack support
|
||||
|
||||
Llama Stack pre-registers several popular open-benchmarks so you can easily evaluate model performance via the CLI.
|
||||
|
||||
The list of open-benchmarks we currently support:
|
||||
- [MMLU-COT](https://arxiv.org/abs/2009.03300) (Measuring Massive Multitask Language Understanding): Benchmark designed to comprehensively evaluate the breadth and depth of a model's academic and professional understanding
|
||||
- [GPQA-COT](https://arxiv.org/abs/2311.12022) (A Graduate-Level Google-Proof Q&A Benchmark): A challenging benchmark of 448 multiple-choice questions written by domain experts in biology, physics, and chemistry.
|
||||
- [SimpleQA](https://openai.com/index/introducing-simpleqa/): Benchmark designed to assess models' ability to answer short, fact-seeking questions.
|
||||
- [MMMU](https://arxiv.org/abs/2311.16502) (A Massive Multi-discipline Multimodal Understanding and Reasoning Benchmark for Expert AGI): Benchmark designed to evaluate multimodal models.
|
||||
|
||||
|
||||
```{admonition} Note on Benchmark v.s. Application Evaluation
|
||||
:class: tip
|
||||
- **Benchmark Evaluation** is a well-defined eval-task consisting of `dataset` and `scoring_function`. The generation (inference or agent) will be done as part of evaluation.
|
||||
- **Application Evaluation** assumes users already have app inputs & generated outputs. Evaluation will purely focus on scoring the generated outputs via scoring functions (e.g. LLM-as-judge).
```
|
||||
You can follow this [contributing guide](https://llama-stack.readthedocs.io/en/latest/references/evals_reference/index.html#open-benchmark-contributing-guide) to add more open-benchmarks to Llama Stack.
|
||||
|
||||
### Run evaluation on open-benchmarks via CLI
|
||||
|
||||
We have built-in functionality to run the supported open-benchmarks using the llama-stack-client CLI.
|
||||
|
||||
#### Spin up Llama Stack server
|
||||
|
||||
Spin up the Llama Stack server with the 'open-benchmark' template:
|
||||
```
|
||||
llama stack run llama_stack/templates/open-benchmark/run.yaml
|
||||
|
||||
```
|
||||
|
||||
#### Run eval CLI
|
||||
There are 3 necessary inputs to run a benchmark eval
|
||||
- `list of benchmark_ids`: The list of benchmark ids to run evaluation on
|
||||
- `model-id`: The model id to evaluate on
|
||||
- `output_dir`: Path to store the evaluation results
|
||||
```
|
||||
llama-stack-client eval run-benchmark <benchmark_id_1> <benchmark_id_2> ... \
|
||||
--model_id <model id to evaluate on> \
|
||||
--output_dir <directory to store the evaluation results>
|
||||
```
|
||||
|
||||
You can run
|
||||
```
|
||||
llama-stack-client eval run-benchmark help
|
||||
```
|
||||
to see the description of all the flags that `eval run-benchmark` supports.
|
||||
|
||||
|
||||
In the output log, you can find the file path that has your evaluation results. Open that file to see your aggregate
|
||||
evaluation results.
|
||||
|
||||
|
||||
|
||||
## What's Next?
|
||||
|
||||
- Check out our Colab notebook on working examples with evaluations [here](https://colab.research.google.com/drive/10CHyykee9j2OigaIcRv47BKG9mrNm0tJ?usp=sharing).
|
||||
- Check out our Colab notebook on working examples with running benchmark evaluations [here](https://colab.research.google.com/github/meta-llama/llama-stack/blob/main/docs/notebooks/Llama_Stack_Benchmark_Evals.ipynb#scrollTo=mxLCsP4MvFqP).
|
||||
- Check out our [Building Applications - Evaluation](../building_applications/evals.md) guide for more details on how to use the Evaluation APIs to evaluate your applications.
|
||||
- Check out our [Evaluation Reference](../references/evals_reference/index.md) for more details on the APIs.
|
||||
|
|
|
|||
|
|
@ -1,5 +1,13 @@
|
|||
# Core Concepts
|
||||
|
||||
|
||||
```{toctree}
|
||||
:maxdepth: 1
|
||||
:hidden:
|
||||
|
||||
evaluation_concepts
|
||||
```
|
||||
|
||||
Given Llama Stack's service-oriented philosophy, a few concepts and workflows arise which may not feel completely natural in the LLM landscape, especially if you are coming with a background in other frameworks.
|
||||
|
||||
|
||||
|
|
@ -26,7 +34,7 @@ We are working on adding a few more APIs to complete the application lifecycle.
|
|||
|
||||
The goal of Llama Stack is to build an ecosystem where users can easily swap out different implementations for the same API. Examples for these include:
|
||||
- LLM inference providers (e.g., Fireworks, Together, AWS Bedrock, Groq, Cerebras, SambaNova, vLLM, etc.),
|
||||
- Vector databases (e.g., ChromaDB, Weaviate, Qdrant, FAISS, PGVector, etc.),
|
||||
- Vector databases (e.g., ChromaDB, Weaviate, Qdrant, Milvus, FAISS, PGVector, etc.),
|
||||
- Safety providers (e.g., Meta's Llama Guard, AWS Bedrock Guardrails, etc.)
|
||||
|
||||
Providers come in two flavors:
|
||||
|
|
|
|||
|
|
@ -13,16 +13,18 @@
|
|||
# https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information
|
||||
|
||||
from docutils import nodes
|
||||
import tomli # Import tomli for TOML parsing
|
||||
from pathlib import Path
|
||||
import requests
|
||||
import json
|
||||
|
||||
# Read version from pyproject.toml
|
||||
with Path(__file__).parent.parent.parent.joinpath("pyproject.toml").open("rb") as f:
|
||||
pyproject = tomli.load(f)
|
||||
llama_stack_version = pyproject["project"]["version"]
|
||||
pypi_url = "https://pypi.org/pypi/llama-stack/json"
|
||||
version_tag = json.loads(requests.get(pypi_url).text)["info"]["version"]
|
||||
print(f"{version_tag=}")
|
||||
|
||||
# generate the full link including text and url here
|
||||
llama_stack_version_url = f"https://github.com/meta-llama/llama-stack/releases/tag/v{llama_stack_version}"
|
||||
llama_stack_version_url = f"https://github.com/meta-llama/llama-stack/releases/tag/v{version_tag}"
|
||||
llama_stack_version_link = f"<a href='{llama_stack_version_url}'>release notes</a>"
|
||||
|
||||
project = "llama-stack"
|
||||
|
|
@ -77,7 +79,7 @@ myst_enable_extensions = [
|
|||
|
||||
myst_substitutions = {
|
||||
"docker_hub": "https://hub.docker.com/repository/docker/llamastack",
|
||||
"llama_stack_version": llama_stack_version,
|
||||
"llama_stack_version": version_tag,
|
||||
"llama_stack_version_link": llama_stack_version_link,
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -17,25 +17,31 @@ Here are some example PRs to help you get started:
|
|||
|
||||
## Testing the Provider
|
||||
|
||||
Before running tests, you must have the required dependencies installed. This depends on the providers or distributions you are testing. For example, if you are testing the `together` distribution, you should install dependencies via `llama stack build --template together`.
|
||||
|
||||
### 1. Integration Testing
|
||||
- Create integration tests that use real provider instances and configurations
|
||||
- For remote services, test actual API interactions
|
||||
- Avoid mocking at the provider level since adapter layers tend to be thin
|
||||
- Reference examples in {repopath}`tests/api`
|
||||
|
||||
### 2. Unit Testing (Optional)
|
||||
- Add unit tests for provider-specific functionality
|
||||
- See examples in {repopath}`llama_stack/providers/tests/inference/test_text_inference.py`
|
||||
Integration tests are located in {repopath}`tests/integration`. These tests use the python client-SDK APIs (from the `llama_stack_client` package) to test functionality. Since these tests use client APIs, they can be run either by pointing to an instance of the Llama Stack server or "inline" by using `LlamaStackAsLibraryClient`.
|
||||
|
||||
Consult {repopath}`tests/integration/README.md` for more details on how to run the tests.
|
||||
|
||||
Note that each provider's `sample_run_config()` method (in the configuration class for that provider)
|
||||
typically references some environment variables for specifying API keys and the like. You can set these in the environment or pass these via the `--env` flag to the test command.
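For reference, a minimal sketch of the "inline" mode (the import path and distribution name here are assumptions; consult the README above for the authoritative invocation):

```python
# Exercise the client-SDK APIs against an in-process stack instead of a running server.
from llama_stack import LlamaStackAsLibraryClient

client = LlamaStackAsLibraryClient("together")  # distribution/template name is illustrative
client.initialize()
print([m.identifier for m in client.models.list()])
```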
|
||||
|
||||
|
||||
### 2. Unit Testing
|
||||
|
||||
Unit tests are located in {repopath}`tests/unit`. Provider-specific unit tests are located in {repopath}`tests/unit/providers`. These tests are all run automatically as part of the CI process.
|
||||
|
||||
|
||||
### 3. Additional end-to-end testing
|
||||
|
||||
### 3. End-to-End Testing
|
||||
1. Start a Llama Stack server with your new provider
|
||||
2. Test using client requests
|
||||
3. Verify compatibility with existing client scripts in the [llama-stack-apps](https://github.com/meta-llama/llama-stack-apps/tree/main) repository
|
||||
4. Document which scripts are compatible with your provider
|
||||
2. Verify compatibility with existing client scripts in the [llama-stack-apps](https://github.com/meta-llama/llama-stack-apps/tree/main) repository
|
||||
3. Document which scripts are compatible with your provider
|
||||
|
||||
## Submitting Your PR
|
||||
|
||||
1. Ensure all tests pass
|
||||
2. Include a comprehensive test plan in your PR summary
|
||||
3. Document any known limitations or considerations
|
||||
4. Submit your pull request for review
|
||||
|
|
|
|||
|
|
@ -4,6 +4,37 @@
|
|||
This guide will walk you through the steps to get started with building a Llama Stack distribution from scratch with your choice of API providers.
|
||||
|
||||
|
||||
### Setting your log level
|
||||
|
||||
To specify the log level, users can set the `LLAMA_STACK_LOGGING` environment variable using the following format:
|
||||
|
||||
`LLAMA_STACK_LOGGING=server=debug;core=info`
|
||||
|
||||
Where each category is one of the following:
|
||||
|
||||
- all
|
||||
- core
|
||||
- server
|
||||
- router
|
||||
- inference
|
||||
- agents
|
||||
- safety
|
||||
- eval
|
||||
- tools
|
||||
- client
|
||||
|
||||
Can be set to any of the following log levels:
|
||||
|
||||
- debug
|
||||
- info
|
||||
- warning
|
||||
- error
|
||||
- critical
|
||||
|
||||
The default global log level is `info`. `all` sets the log level for all components.
|
||||
|
||||
A user can also set `LLAMA_STACK_LOG_FILE` which will pipe the logs to the specified path as well as to the terminal. An example would be: `export LLAMA_STACK_LOG_FILE=server.log`
|
||||
|
||||
### Llama Stack Build
|
||||
|
||||
In order to build your own distribution, we recommend you clone the `llama-stack` repository.
|
||||
|
|
@ -22,25 +53,25 @@ The main points to consider are:
|
|||
|
||||
```
|
||||
llama stack build -h
|
||||
|
||||
usage: llama stack build [-h] [--config CONFIG] [--template TEMPLATE] [--list-templates]
|
||||
[--image-type {conda,container,venv}] [--image-name IMAGE_NAME] [--print-deps-only]
|
||||
usage: llama stack build [-h] [--config CONFIG] [--template TEMPLATE] [--list-templates] [--image-type {conda,container,venv}] [--image-name IMAGE_NAME] [--print-deps-only] [--run]
|
||||
|
||||
Build a Llama stack container
|
||||
|
||||
options:
|
||||
-h, --help show this help message and exit
|
||||
--config CONFIG Path to a config file to use for the build. You can find example configs in llama_stack/distributions/**/build.yaml.
|
||||
If this argument is not provided, you will be prompted to enter information interactively
|
||||
--template TEMPLATE Name of the example template config to use for build. You may use `llama stack build --list-templates` to check out the available templates
|
||||
--list-templates Show the available templates for building a Llama Stack distribution
|
||||
--config CONFIG Path to a config file to use for the build. You can find example configs in llama_stack/distributions/**/build.yaml. If this argument is not provided, you will
|
||||
be prompted to enter information interactively (default: None)
|
||||
--template TEMPLATE Name of the example template config to use for build. You may use `llama stack build --list-templates` to check out the available templates (default: None)
|
||||
--list-templates Show the available templates for building a Llama Stack distribution (default: False)
|
||||
--image-type {conda,container,venv}
|
||||
Image Type to use for the build. This can be either conda or container or venv. If not specified, will use the image type from the template config.
|
||||
Image Type to use for the build. This can be either conda or container or venv. If not specified, will use the image type from the template config. (default:
|
||||
conda)
|
||||
--image-name IMAGE_NAME
|
||||
[for image-type=conda] Name of the conda environment to use for the build. If
|
||||
not specified, currently active Conda environment will be used. If no Conda
|
||||
environment is active, you must specify a name.
|
||||
--print-deps-only Print the dependencies for the stack only, without building the stack
|
||||
[for image-type=conda|venv] Name of the conda or virtual environment to use for the build. If not specified, currently active Conda environment will be used if
|
||||
found. (default: None)
|
||||
--print-deps-only Print the dependencies for the stack only, without building the stack (default: False)
|
||||
--run Run the stack after building using the same image type, name, and other applicable arguments (default: False)
|
||||
|
||||
```
|
||||
|
||||
After this step is complete, a file named `<name>-build.yaml` and template file `<name>-run.yaml` will be generated and saved at the output file path specified at the end of the command.
|
||||
|
|
@ -183,8 +214,8 @@ Now, let's start the Llama Stack Distribution Server. You will need the YAML con
|
|||
|
||||
```
|
||||
llama stack run -h
|
||||
usage: llama stack run [-h] [--port PORT] [--image-name IMAGE_NAME] [--disable-ipv6] [--env KEY=VALUE] [--tls-keyfile TLS_KEYFILE]
|
||||
[--tls-certfile TLS_CERTFILE] [--image-type {conda,container,venv}]
|
||||
usage: llama stack run [-h] [--port PORT] [--image-name IMAGE_NAME] [--disable-ipv6] [--env KEY=VALUE] [--tls-keyfile TLS_KEYFILE] [--tls-certfile TLS_CERTFILE]
|
||||
[--image-type {conda,container,venv}]
|
||||
config
|
||||
|
||||
Start the server for a Llama Stack Distribution. You should have already built (or downloaded) and configured the distribution.
|
||||
|
|
@ -194,17 +225,17 @@ positional arguments:
|
|||
|
||||
options:
|
||||
-h, --help show this help message and exit
|
||||
--port PORT Port to run the server on. It can also be passed via the env var LLAMA_STACK_PORT. Defaults to 8321
|
||||
--port PORT Port to run the server on. It can also be passed via the env var LLAMA_STACK_PORT. (default: 8321)
|
||||
--image-name IMAGE_NAME
|
||||
Name of the image to run. Defaults to the current conda environment
|
||||
--disable-ipv6 Disable IPv6 support
|
||||
--env KEY=VALUE Environment variables to pass to the server in KEY=VALUE format. Can be specified multiple times.
|
||||
Name of the image to run. Defaults to the current conda environment (default: None)
|
||||
--disable-ipv6 Disable IPv6 support (default: False)
|
||||
--env KEY=VALUE Environment variables to pass to the server in KEY=VALUE format. Can be specified multiple times. (default: [])
|
||||
--tls-keyfile TLS_KEYFILE
|
||||
Path to TLS key file for HTTPS
|
||||
Path to TLS key file for HTTPS (default: None)
|
||||
--tls-certfile TLS_CERTFILE
|
||||
Path to TLS certificate file for HTTPS
|
||||
Path to TLS certificate file for HTTPS (default: None)
|
||||
--image-type {conda,container,venv}
|
||||
Image Type used during the build. This can be either conda or container or venv.
|
||||
Image Type used during the build. This can be either conda or container or venv. (default: conda)
|
||||
|
||||
```
|
||||
|
||||
|
|
|
|||
|
|
@ -17,26 +17,4 @@ $ llama-stack-client configure --endpoint https://llamastack-preview.fireworks.a
|
|||
$ llama-stack-client models list
|
||||
```
|
||||
|
||||
You will see outputs:
|
||||
```
|
||||
$ llama-stack-client models list
|
||||
+------------------------------+------------------------------+---------------+------------+
|
||||
| identifier | llama_model | provider_id | metadata |
|
||||
+==============================+==============================+===============+============+
|
||||
| Llama3.1-8B-Instruct | Llama3.1-8B-Instruct | fireworks0 | {} |
|
||||
+------------------------------+------------------------------+---------------+------------+
|
||||
| Llama3.1-70B-Instruct | Llama3.1-70B-Instruct | fireworks0 | {} |
|
||||
+------------------------------+------------------------------+---------------+------------+
|
||||
| Llama3.1-405B-Instruct | Llama3.1-405B-Instruct | fireworks0 | {} |
|
||||
+------------------------------+------------------------------+---------------+------------+
|
||||
| Llama3.2-1B-Instruct | Llama3.2-1B-Instruct | fireworks0 | {} |
|
||||
+------------------------------+------------------------------+---------------+------------+
|
||||
| Llama3.2-3B-Instruct | Llama3.2-3B-Instruct | fireworks0 | {} |
|
||||
+------------------------------+------------------------------+---------------+------------+
|
||||
| Llama3.2-11B-Vision-Instruct | Llama3.2-11B-Vision-Instruct | fireworks0 | {} |
|
||||
+------------------------------+------------------------------+---------------+------------+
|
||||
| Llama3.2-90B-Vision-Instruct | Llama3.2-90B-Vision-Instruct | fireworks0 | {} |
|
||||
+------------------------------+------------------------------+---------------+------------+
|
||||
```
|
||||
|
||||
Check out the [llama-stack-client-python](https://github.com/meta-llama/llama-stack-client-python/blob/main/docs/cli_reference.md) repo for more details on how to use the `llama-stack-client` CLI. Check out [llama-stack-app](https://github.com/meta-llama/llama-stack-apps/tree/main) for example applications built on top of Llama Stack.
|
||||
|
|
|
|||
|
|
@ -22,7 +22,7 @@ The `llamastack/distribution-fireworks` distribution consists of the following p
|
|||
| safety | `inline::llama-guard` |
|
||||
| scoring | `inline::basic`, `inline::llm-as-judge`, `inline::braintrust` |
|
||||
| telemetry | `inline::meta-reference` |
|
||||
| tool_runtime | `remote::brave-search`, `remote::tavily-search`, `inline::code-interpreter`, `inline::rag-runtime`, `remote::model-context-protocol` |
|
||||
| tool_runtime | `remote::brave-search`, `remote::tavily-search`, `remote::wolfram-alpha`, `inline::code-interpreter`, `inline::rag-runtime`, `remote::model-context-protocol` |
|
||||
| vector_io | `inline::faiss`, `remote::chromadb`, `remote::pgvector` |
|
||||
|
||||
|
||||
|
|
@ -40,7 +40,6 @@ The following models are available by default:
|
|||
- `accounts/fireworks/models/llama-v3p1-8b-instruct (aliases: meta-llama/Llama-3.1-8B-Instruct)`
|
||||
- `accounts/fireworks/models/llama-v3p1-70b-instruct (aliases: meta-llama/Llama-3.1-70B-Instruct)`
|
||||
- `accounts/fireworks/models/llama-v3p1-405b-instruct (aliases: meta-llama/Llama-3.1-405B-Instruct-FP8)`
|
||||
- `accounts/fireworks/models/llama-v3p2-1b-instruct (aliases: meta-llama/Llama-3.2-1B-Instruct)`
|
||||
- `accounts/fireworks/models/llama-v3p2-3b-instruct (aliases: meta-llama/Llama-3.2-3B-Instruct)`
|
||||
- `accounts/fireworks/models/llama-v3p2-11b-vision-instruct (aliases: meta-llama/Llama-3.2-11B-Vision-Instruct)`
|
||||
- `accounts/fireworks/models/llama-v3p2-90b-vision-instruct (aliases: meta-llama/Llama-3.2-90B-Vision-Instruct)`
|
||||
|
|
|
|||
|
|
@ -22,8 +22,8 @@ The `llamastack/distribution-ollama` distribution consists of the following prov
|
|||
| safety | `inline::llama-guard` |
|
||||
| scoring | `inline::basic`, `inline::llm-as-judge`, `inline::braintrust` |
|
||||
| telemetry | `inline::meta-reference` |
|
||||
| tool_runtime | `remote::brave-search`, `remote::tavily-search`, `inline::code-interpreter`, `inline::rag-runtime`, `remote::model-context-protocol` |
|
||||
| vector_io | `inline::sqlite-vec`, `remote::chromadb`, `remote::pgvector` |
|
||||
| tool_runtime | `remote::brave-search`, `remote::tavily-search`, `inline::code-interpreter`, `inline::rag-runtime`, `remote::model-context-protocol`, `remote::wolfram-alpha` |
|
||||
| vector_io | `inline::faiss`, `remote::chromadb`, `remote::pgvector` |
|
||||
|
||||
|
||||
You should use this distribution if you have a regular desktop machine without very powerful GPUs. Of course, if you have powerful GPUs, you can still continue using this distribution since Ollama supports GPU acceleration.
|
||||
|
|
@ -130,7 +130,7 @@ llama stack run ./run-with-safety.yaml \
|
|||
### (Optional) Update Model Serving Configuration
|
||||
|
||||
```{note}
|
||||
Please check the [model_entries](https://github.com/meta-llama/llama-stack/blob/main/llama_stack/providers/remote/inference/ollama/ollama.py#L45) for the supported Ollama models.
|
||||
Please check the [model_entries](https://github.com/meta-llama/llama-stack/blob/main/llama_stack/providers/remote/inference/ollama/models.py) for the supported Ollama models.
|
||||
```
|
||||
|
||||
To serve a new model with `ollama`
|
||||
|
|
|
|||
|
|
@ -21,7 +21,7 @@ The `llamastack/distribution-remote-vllm` distribution consists of the following
|
|||
| safety | `inline::llama-guard` |
|
||||
| scoring | `inline::basic`, `inline::llm-as-judge`, `inline::braintrust` |
|
||||
| telemetry | `inline::meta-reference` |
|
||||
| tool_runtime | `remote::brave-search`, `remote::tavily-search`, `inline::code-interpreter`, `inline::rag-runtime`, `remote::model-context-protocol` |
|
||||
| tool_runtime | `remote::brave-search`, `remote::tavily-search`, `inline::code-interpreter`, `inline::rag-runtime`, `remote::model-context-protocol`, `remote::wolfram-alpha` |
|
||||
| vector_io | `inline::faiss`, `remote::chromadb`, `remote::pgvector` |
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -35,7 +35,7 @@ The following environment variables can be configured:
|
|||
|
||||
- `LLAMA_STACK_PORT`: Port for the Llama Stack distribution server (default: `5001`)
|
||||
- `INFERENCE_MODEL`: Inference model loaded into the TGI server (default: `meta-llama/Llama-3.2-3B-Instruct`)
|
||||
- `TGI_URL`: URL of the TGI server with the main inference model (default: `http://127.0.0.1:8080}/v1`)
|
||||
- `TGI_URL`: URL of the TGI server with the main inference model (default: `http://127.0.0.1:8080/v1`)
|
||||
- `TGI_SAFETY_URL`: URL of the TGI server with the safety model (default: `http://127.0.0.1:8081/v1`)
|
||||
- `SAFETY_MODEL`: Name of the safety (Llama-Guard) model to use (default: `meta-llama/Llama-Guard-3-1B`)
|
||||
|
||||
|
|
|
|||
|
|
@ -22,7 +22,7 @@ The `llamastack/distribution-together` distribution consists of the following pr
|
|||
| safety | `inline::llama-guard` |
|
||||
| scoring | `inline::basic`, `inline::llm-as-judge`, `inline::braintrust` |
|
||||
| telemetry | `inline::meta-reference` |
|
||||
| tool_runtime | `remote::brave-search`, `remote::tavily-search`, `inline::code-interpreter`, `inline::rag-runtime`, `remote::model-context-protocol` |
|
||||
| tool_runtime | `remote::brave-search`, `remote::tavily-search`, `inline::code-interpreter`, `inline::rag-runtime`, `remote::model-context-protocol`, `remote::wolfram-alpha` |
|
||||
| vector_io | `inline::faiss`, `remote::chromadb`, `remote::pgvector` |
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -184,7 +184,6 @@ from termcolor import cprint
|
|||
|
||||
from llama_stack_client.lib.agents.agent import Agent
|
||||
from llama_stack_client.lib.agents.event_logger import EventLogger
|
||||
from llama_stack_client.types.agent_create_params import AgentConfig
|
||||
from llama_stack_client.types import Document
|
||||
|
||||
|
||||
|
|
@ -241,13 +240,14 @@ client.tool_runtime.rag_tool.insert(
|
|||
chunk_size_in_tokens=512,
|
||||
)
|
||||
|
||||
agent_config = AgentConfig(
|
||||
rag_agent = Agent(
|
||||
client,
|
||||
model=os.environ["INFERENCE_MODEL"],
|
||||
# Define instructions for the agent ( aka system prompt)
|
||||
instructions="You are a helpful assistant",
|
||||
enable_session_persistence=False,
|
||||
# Define tools available to the agent
|
||||
toolgroups=[
|
||||
tools=[
|
||||
{
|
||||
"name": "builtin::rag/knowledge_search",
|
||||
"args": {
|
||||
|
|
@ -256,8 +256,6 @@ agent_config = AgentConfig(
|
|||
}
|
||||
],
|
||||
)
|
||||
|
||||
rag_agent = Agent(client, agent_config)
|
||||
session_id = rag_agent.create_session("test-session")
|
||||
|
||||
user_prompts = [
|
||||
|
|
|
|||
|
|
@ -68,6 +68,7 @@ A number of "adapters" are available for some popular Inference and Vector Store
|
|||
| FAISS | Single Node |
|
||||
| SQLite-Vec| Single Node |
|
||||
| Chroma | Hosted and Single Node |
|
||||
| Milvus | Hosted and Single Node |
|
||||
| Postgres (PGVector) | Hosted and Single Node |
|
||||
| Weaviate | Hosted |
|
||||
|
||||
|
|
|
|||
|
|
@ -2,7 +2,7 @@
|
|||
|
||||
The goal of Llama Stack is to build an ecosystem where users can easily swap out different implementations for the same API. Examples for these include:
|
||||
- LLM inference providers (e.g., Fireworks, Together, AWS Bedrock, Groq, Cerebras, SambaNova, vLLM, etc.),
|
||||
- Vector databases (e.g., ChromaDB, Weaviate, Qdrant, FAISS, PGVector, etc.),
|
||||
- Vector databases (e.g., ChromaDB, Weaviate, Qdrant, Milvus, FAISS, PGVector, etc.),
|
||||
- Safety providers (e.g., Meta's Llama Guard, AWS Bedrock Guardrails, etc.)
|
||||
|
||||
Providers come in two flavors:
|
||||
|
|
@ -55,5 +55,6 @@ vector_io/sqlite-vec
|
|||
vector_io/chromadb
|
||||
vector_io/pgvector
|
||||
vector_io/qdrant
|
||||
vector_io/milvus
|
||||
vector_io/weaviate
|
||||
```
|
||||
|
|
|
|||
|
|
@ -24,19 +24,9 @@ The Evaluation APIs are associated with a set of Resources as shown in the follo
|
|||
- Associated with `Benchmark` resource.
|
||||
|
||||
|
||||
Use the following decision tree to decide how to use LlamaStack Evaluation flow.
|
||||

|
||||
|
||||
|
||||
```{admonition} Note on Benchmark v.s. Application Evaluation
|
||||
:class: tip
|
||||
- **Benchmark Evaluation** is a well-defined eval-task consisting of `dataset` and `scoring_function`. The generation (inference or agent) will be done as part of evaluation.
|
||||
- **Application Evaluation** assumes users already have app inputs & generated outputs. Evaluation will purely focus on scoring the generated outputs via scoring functions (e.g. LLM-as-judge).
|
||||
```
|
||||
|
||||
## Evaluation Examples Walkthrough
|
||||
|
||||
[](https://colab.research.google.com/drive/10CHyykee9j2OigaIcRv47BKG9mrNm0tJ?usp=sharing)
|
||||
[](https://colab.research.google.com/github/meta-llama/llama-stack/blob/main/docs/notebooks/Llama_Stack_Benchmark_Evals.ipynb)
|
||||
|
||||
It is best to open this notebook in Colab to follow along with the examples.
|
||||
|
||||
|
|
@ -63,20 +53,29 @@ eval_rows = ds.to_pandas().to_dict(orient="records")
|
|||
- Run evaluate on the dataset
|
||||
|
||||
```python
|
||||
from rich.pretty import pprint
|
||||
from tqdm import tqdm
|
||||
|
||||
SYSTEM_PROMPT_TEMPLATE = """
|
||||
You are an expert in Agriculture whose job is to answer questions from the user using images.
|
||||
You are an expert in {subject} whose job is to answer questions from the user using images.
|
||||
|
||||
First, reason about the correct answer.
|
||||
|
||||
Then write the answer in the following format where X is exactly one of A,B,C,D:
|
||||
|
||||
Answer: X
|
||||
|
||||
Make sure X is one of A,B,C,D.
|
||||
|
||||
If you are uncertain of the correct answer, guess the most likely one.
|
||||
"""
|
||||
|
||||
system_message = {
|
||||
"role": "system",
|
||||
"content": SYSTEM_PROMPT_TEMPLATE,
|
||||
"content": SYSTEM_PROMPT_TEMPLATE.format(subject=subset),
|
||||
}
|
||||
|
||||
# register the evaluation benchmark task with the dataset and scoring function
|
||||
client.benchmarks.register(
|
||||
benchmark_id="meta-reference::mmmu",
|
||||
dataset_id=f"mmmu-{subset}-{split}",
|
||||
|
|
@ -87,14 +86,15 @@ response = client.eval.evaluate_rows(
|
|||
benchmark_id="meta-reference::mmmu",
|
||||
input_rows=eval_rows,
|
||||
scoring_functions=["basic::regex_parser_multiple_choice_answer"],
|
||||
task_config={
|
||||
"type": "benchmark",
|
||||
benchmark_config={
|
||||
"eval_candidate": {
|
||||
"type": "model",
|
||||
"model": "meta-llama/Llama-3.2-90B-Vision-Instruct",
|
||||
"sampling_params": {
|
||||
"strategy": {
|
||||
"type": "greedy",
|
||||
"type": "top_p",
|
||||
"temperature": 1.0,
|
||||
"top_p": 0.95,
|
||||
},
|
||||
"max_tokens": 4096,
|
||||
"repeat_penalty": 1.0,
|
||||
|
|
@ -103,6 +103,7 @@ response = client.eval.evaluate_rows(
|
|||
},
|
||||
},
|
||||
)
|
||||
pprint(response)
|
||||
```
|
||||
|
||||
#### 1.2. Running SimpleQA
|
||||
|
|
@ -115,10 +116,9 @@ simpleqa_dataset_id = "huggingface::simpleqa"
|
|||
_ = client.datasets.register(
|
||||
dataset_id=simpleqa_dataset_id,
|
||||
provider_id="huggingface",
|
||||
url={"uri": "https://huggingface.co/datasets/llamastack/evals"},
|
||||
url={"uri": "https://huggingface.co/datasets/llamastack/simpleqa"},
|
||||
metadata={
|
||||
"path": "llamastack/evals",
|
||||
"name": "evals__simpleqa",
|
||||
"path": "llamastack/simpleqa",
|
||||
"split": "train",
|
||||
},
|
||||
dataset_schema={
|
||||
|
|
@ -145,8 +145,7 @@ response = client.eval.evaluate_rows(
|
|||
benchmark_id="meta-reference::simpleqa",
|
||||
input_rows=eval_rows.rows,
|
||||
scoring_functions=["llm-as-judge::405b-simpleqa"],
|
||||
task_config={
|
||||
"type": "benchmark",
|
||||
benchmark_config={
|
||||
"eval_candidate": {
|
||||
"type": "model",
|
||||
"model": "meta-llama/Llama-3.2-90B-Vision-Instruct",
|
||||
|
|
@ -160,6 +159,7 @@ response = client.eval.evaluate_rows(
|
|||
},
|
||||
},
|
||||
)
|
||||
pprint(response)
|
||||
```
|
||||
|
||||
|
||||
|
|
@ -170,19 +170,17 @@ response = client.eval.evaluate_rows(
|
|||
|
||||
```python
|
||||
agent_config = {
|
||||
"model": "meta-llama/Llama-3.1-405B-Instruct",
|
||||
"instructions": "You are a helpful assistant",
|
||||
"model": "meta-llama/Llama-3.3-70B-Instruct",
|
||||
"instructions": "You are a helpful assistant that have access to tool to search the web. ",
|
||||
"sampling_params": {
|
||||
"strategy": {
|
||||
"type": "greedy",
|
||||
},
|
||||
},
|
||||
"tools": [
|
||||
{
|
||||
"type": "brave_search",
|
||||
"engine": "tavily",
|
||||
"api_key": userdata.get("TAVILY_SEARCH_API_KEY"),
|
||||
"type": "top_p",
|
||||
"temperature": 0.5,
|
||||
"top_p": 0.9,
|
||||
}
|
||||
},
|
||||
"toolgroups": [
|
||||
"builtin::websearch",
|
||||
],
|
||||
"tool_choice": "auto",
|
||||
"tool_prompt_format": "json",
|
||||
|
|
@ -195,25 +193,22 @@ response = client.eval.evaluate_rows(
|
|||
benchmark_id="meta-reference::simpleqa",
|
||||
input_rows=eval_rows.rows,
|
||||
scoring_functions=["llm-as-judge::405b-simpleqa"],
|
||||
task_config={
|
||||
"type": "benchmark",
|
||||
benchmark_config={
|
||||
"eval_candidate": {
|
||||
"type": "agent",
|
||||
"config": agent_config,
|
||||
},
|
||||
},
|
||||
)
|
||||
pprint(response)
|
||||
```
|
||||
|
||||
### 3. Agentic Application Dataset Scoring
|
||||
- Llama Stack offers a library of scoring functions and the `/scoring` API, allowing you to run evaluations on your pre-annotated AI application datasets.
|
||||
[](https://colab.research.google.com/github/meta-llama/llama-stack/blob/main/docs/getting_started.ipynb)
|
||||
|
||||
- In this example, we will work with an example RAG dataset and couple of scoring functions for evaluation.
|
||||
- `llm-as-judge::base`: LLM-As-Judge with custom judge prompt & model.
|
||||
- `braintrust::factuality`: Factuality scorer from [braintrust](https://github.com/braintrustdata/autoevals).
|
||||
- `basic::subset_of`: Basic checking if generated answer is a subset of expected answer.
|
||||
Llama Stack offers a library of scoring functions and the `/scoring` API, allowing you to run evaluations on your pre-annotated AI application datasets.
|
||||
|
||||
- Please checkout our [Llama Stack Playground](https://llama-stack.readthedocs.io/en/latest/playground/index.html) for an interactive interface to upload datasets and run scorings.
|
||||
In this example, we will work with an example RAG dataset you have built previously, labeled with annotations, and use LLM-As-Judge with a custom judge prompt for scoring. Please check out our [Llama Stack Playground](https://llama-stack.readthedocs.io/en/latest/playground/index.html) for an interactive interface to upload datasets and run scoring.
|
||||
|
||||
```python
|
||||
judge_model_id = "meta-llama/Llama-3.1-405B-Instruct-FP8"
|
||||
|
|
@ -280,18 +275,25 @@ response = client.scoring.score(
|
|||
The following examples give the quick steps to start running evaluations using the llama-stack-client CLI.
|
||||
|
||||
#### Benchmark Evaluation CLI
|
||||
Usage: There are 2 inputs necessary for running a benchmark eval
|
||||
- `eval-task-id`: the identifier associated with the eval task. Each `Benchmark` is parametrized by
|
||||
- `dataset_id`: the identifier associated with the dataset.
|
||||
- `List[scoring_function_id]`: list of scoring function identifiers.
|
||||
- `eval-task-config`: specifies the configuration of the model / agent to evaluate on.
|
||||
There are 3 necessary inputs for running a benchmark eval
|
||||
- `list of benchmark_ids`: The list of benchmark ids to run evaluation on
|
||||
- `model-id`: The model id to evaluate on
|
||||
- `output_dir`: Path to store the evaluation results
|
||||
```
|
||||
llama-stack-client eval run-benchmark <benchmark_id_1> <benchmark_id_2> ... \
|
||||
--model_id <model id to evaluate on> \
|
||||
--output_dir <directory to store the evaluation results>
|
||||
```
|
||||
|
||||
You can run
|
||||
```
|
||||
llama-stack-client eval run-benchmark help
|
||||
```
|
||||
to see the description of all the flags for running a benchmark eval.
|
||||
|
||||
|
||||
```
|
||||
llama-stack-client eval run_benchmark <eval-task-id> \
|
||||
--eval-task-config ~/benchmark_config.json \
|
||||
--visualize
|
||||
```
|
||||
In the output log, you can find the path to the file that has your evaluation results. Open that file to see your aggregate
|
||||
evaluation results.
|
||||
|
||||
|
||||
#### Application Evaluation CLI
|
||||
|
|
@ -317,28 +319,9 @@ The `BenchmarkConfig` are user specified config to define:
|
|||
2. Optionally scoring function params to allow customization of scoring function behaviour. This is useful to parameterize generic scoring functions such as LLMAsJudge with custom `judge_model` / `judge_prompt`.
|
||||
|
||||
|
||||
**Example Benchmark BenchmarkConfig**
|
||||
**Example BenchmarkConfig**
|
||||
```json
|
||||
{
|
||||
"type": "benchmark",
|
||||
"eval_candidate": {
|
||||
"type": "model",
|
||||
"model": "Llama3.2-3B-Instruct",
|
||||
"sampling_params": {
|
||||
"strategy": {
|
||||
"type": "greedy",
|
||||
},
|
||||
"max_tokens": 0,
|
||||
"repetition_penalty": 1.0
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
**Example Application BenchmarkConfig**
|
||||
```json
|
||||
{
|
||||
"type": "app",
|
||||
"eval_candidate": {
|
||||
"type": "model",
|
||||
"model": "Llama3.1-405B-Instruct",
|
||||
|
|
@ -362,3 +345,52 @@ The `BenchmarkConfig` are user specified config to define:
|
|||
}
|
||||
}
|
||||
```
|
||||
|
||||
|
||||
## Open-benchmark Contributing Guide
|
||||
|
||||
### Create the new dataset for your new benchmark
|
||||
An eval open-benchmark essentially contains 2 parts:
|
||||
- `raw data`: The raw dataset associated with the benchmark. You typically need to search the original paper that introduces the benchmark and find the canonical dataset (usually hosted on huggingface)
|
||||
- `prompt template`: How to ask the candidate model to generate the answer (the prompt template plays a critical role in the evaluation results). Typically, you can find the reference prompt template associated with the benchmark in the benchmark author's repo ([example](https://github.com/idavidrein/gpqa/blob/main/prompts/chain_of_thought.txt)) or in other popular open source repos ([example](https://github.com/openai/simple-evals/blob/0a6e8f62e52bc5ae915f752466be3af596caf392/common.py#L14))
|
||||
|
||||
To create a new open-benchmark in Llama Stack, you need to combine the prompt template and the raw data into the `chat_completion_input` column of the evaluation dataset.
|
||||
|
||||
Llama Stack enforces that the evaluation dataset schema contains at least 3 columns:
|
||||
- `chat_completion_input`: The actual input to the model to run the generation for eval
|
||||
- `input_query`: The raw input from the raw dataset without the prompt template
|
||||
- `expected_answer`: The ground truth for scoring functions to calculate the score from.
|
||||
|
||||
|
||||
You need to write a script ([example convert script](https://gist.github.com/yanxi0830/118e9c560227d27132a7fd10e2c92840)) to convert the benchmark's raw dataset into the Llama Stack eval dataset format and upload the converted dataset to Hugging Face ([example benchmark dataset](https://huggingface.co/datasets/llamastack/mmmu)).
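A minimal sketch of such a conversion script (the raw dataset name, its column names, and the prompt template are assumptions; check the example convert script and example benchmark dataset above for the exact expected serialization of `chat_completion_input`):

```python
import json

from datasets import load_dataset

# The benchmark's reference prompt template (see the links above); illustrative here.
PROMPT_TEMPLATE = "Answer the following multiple-choice question.\n\n{question}\n\nAnswer:"

raw = load_dataset("some-org/raw-benchmark", split="test")  # hypothetical raw dataset


def to_llama_stack_row(example):
    # Combine the prompt template with the raw data into chat_completion_input.
    messages = [
        {"role": "user", "content": PROMPT_TEMPLATE.format(question=example["question"])}
    ]
    return {
        "input_query": example["question"],
        "chat_completion_input": json.dumps(messages),
        "expected_answer": example["answer"],
    }


eval_ds = raw.map(to_llama_stack_row, remove_columns=raw.column_names)
eval_ds.push_to_hub("your-org/your-benchmark-eval")  # upload the converted dataset
```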
|
||||
|
||||
|
||||
### Find scoring function for your new benchmark
|
||||
The purpose of a scoring function is to calculate the score for each example based on the candidate model's generation result and the expected_answer. It also aggregates the scores from all the examples and generates the final evaluation results.
|
||||
|
||||
|
||||
First, check whether the existing [llama stack scoring functions](https://github.com/meta-llama/llama-stack/tree/main/llama_stack/providers/inline/scoring) can fulfill your need. If not, you need to write a new scoring function based on what the benchmark author or other open source repos describe.
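For intuition only, here is the core logic a multiple-choice scoring function needs; this is an illustrative sketch, not the actual Llama Stack scoring provider interface linked above:

```python
import re


def score_row(generated_answer: str, expected_answer: str) -> float:
    """Extract 'Answer: X' from the generation and compare it to the ground truth."""
    match = re.search(r"Answer:\s*([A-D])", generated_answer)
    return 1.0 if match and match.group(1) == expected_answer.strip() else 0.0


def aggregate(scores: list[float]) -> dict:
    """Aggregate per-example scores into the final evaluation result."""
    return {"accuracy": sum(scores) / len(scores) if scores else 0.0}
```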
|
||||
|
||||
### Add new benchmark into template
|
||||
First, you need to add the evaluation dataset associated with your benchmark under the `datasets` resource in the [open-benchmark](https://github.com/meta-llama/llama-stack/blob/main/llama_stack/templates/open-benchmark/run.yaml) template.
|
||||
|
||||
Second, you need to add the new benchmark you just created under the `benchmarks` resource in the same template. To add the new benchmark, you need to provide the following (an alternative runtime-registration sketch using the Python client follows this list):
|
||||
- `benchmark_id`: identifier of the benchmark
|
||||
- `dataset_id`: identifier of the dataset associated with your benchmark
|
||||
- `scoring_functions`: scoring function to calculate the score based on generation results and expected_answer
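For quick experimentation, the same resources can also be registered at runtime through the client APIs shown earlier in this guide. A sketch with illustrative ids; the schema and metadata fields should mirror what you put in the template:

```python
# Hypothetical ids; mirror whatever you add to the open-benchmark template.
client.datasets.register(
    dataset_id="my-new-benchmark-dataset",
    provider_id="huggingface",
    url={"uri": "https://huggingface.co/datasets/your-org/your-benchmark-eval"},
    metadata={"path": "your-org/your-benchmark-eval", "split": "test"},
    dataset_schema={
        "input_query": {"type": "string"},
        "chat_completion_input": {"type": "string"},
        "expected_answer": {"type": "string"},
    },
)

client.benchmarks.register(
    benchmark_id="meta-reference::my-new-benchmark",
    dataset_id="my-new-benchmark-dataset",
    scoring_functions=["basic::regex_parser_multiple_choice_answer"],
)
```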
|
||||
|
||||
|
||||
### Test the new benchmark
|
||||
|
||||
Spin up the Llama Stack server with the 'open-benchmark' template:
|
||||
```
|
||||
llama stack run llama_stack/templates/open-benchmark/run.yaml
|
||||
|
||||
```
|
||||
|
||||
Run the eval benchmark CLI with your new benchmark id:
|
||||
```
|
||||
llama-stack-client eval run-benchmark <new_benchmark_id> \
|
||||
--model_id <model id to evaluate on> \
|
||||
--output_dir <directory to store the evaluation results>
|
||||
```
|
||||
|
|
|
|||
|
|
@ -1,6 +1,6 @@
|
|||
# llama (server-side) CLI Reference
|
||||
|
||||
The `llama` CLI tool helps you setup and use the Llama Stack. It should be available on your path after installing the `llama-stack` package.
|
||||
The `llama` CLI tool helps you set up and use the Llama Stack. The CLI is available on your path after installing the `llama-stack` package.
|
||||
|
||||
## Installation
|
||||
|
||||
|
|
@ -27,9 +27,9 @@ You have two ways to install Llama Stack:
|
|||
|
||||
|
||||
## `llama` subcommands
|
||||
1. `download`: `llama` cli tools supports downloading the model from Meta or Hugging Face.
|
||||
2. `model`: Lists available models and their properties.
|
||||
3. `stack`: Allows you to build and run a Llama Stack server. You can read more about this [here](../../distributions/building_distro).
|
||||
1. `download`: Supports downloading models from Meta or Hugging Face. [Downloading models](#downloading-models)
|
||||
2. `model`: Lists available models and their properties. [Understanding models](#understand-the-models)
|
||||
3. `stack`: Allows you to build a stack using the `llama stack` distribution and run a Llama Stack server. You can read more about how to build a Llama Stack distribution in the [Build your own Distribution](../../distributions/building_distro) documentation.
|
||||
|
||||
### Sample Usage
|
||||
|
||||
|
|
@ -117,7 +117,7 @@ You should see a table like this:
|
|||
+----------------------------------+------------------------------------------+----------------+
|
||||
```
|
||||
|
||||
To download models, you can use the llama download command.
|
||||
To download models, you can use the `llama download` command.
|
||||
|
||||
### Downloading from [Meta](https://llama.meta.com/llama-downloads/)
|
||||
|
||||
|
|
@ -191,7 +191,7 @@ You should see a table like this:
|
|||
The `llama model` command helps you explore the model’s interface.
|
||||
|
||||
1. `download`: Download the model from different sources. (meta, huggingface)
|
||||
2. `list`: Lists all the models available for download with hardware requirements to deploy the models.
|
||||
2. `list`: Lists all the models available for download with hardware requirements for deploying the models.
|
||||
3. `prompt-format`: Show llama model message formats.
|
||||
4. `describe`: Describes all the properties of the model.
|
||||
|
||||
|
|
@ -262,13 +262,12 @@ llama model prompt-format -m Llama3.2-3B-Instruct
|
|||

|
||||
|
||||
|
||||
|
||||
You will be shown a Markdown formatted description of the model interface and how prompts / messages are formatted for various scenarios.
|
||||
|
||||
**NOTE**: Outputs in terminal are color printed to show special tokens.
|
||||
|
||||
### Remove model
|
||||
You can run `llama model remove` to remove unecessary model:
|
||||
You can run `llama model remove` to remove an unnecessary model:
|
||||
|
||||
```
|
||||
llama model remove -m Llama-Guard-3-8B-int8
|
||||
|
|
|
|||
|
|
@ -294,8 +294,9 @@
|
|||
" # Initialize custom tool (ensure `WebSearchTool` is defined earlier in the notebook)\n",
|
||||
" webSearchTool = WebSearchTool(api_key=BRAVE_SEARCH_API_KEY)\n",
|
||||
"\n",
|
||||
" # Define the agent configuration, including the model and tool setup\n",
|
||||
" agent_config = AgentConfig(\n",
|
||||
" # Create an agent instance with the client and configuration\n",
|
||||
" agent = Agent(\n",
|
||||
" client, \n",
|
||||
" model=MODEL_NAME,\n",
|
||||
" instructions=\"\"\"You are a helpful assistant that responds to user queries with relevant information and cites sources when available.\"\"\",\n",
|
||||
" sampling_params={\n",
|
||||
|
|
@ -303,17 +304,12 @@
|
|||
" \"type\": \"greedy\",\n",
|
||||
" },\n",
|
||||
" },\n",
|
||||
" tools=[webSearchTool.get_tool_definition()],\n",
|
||||
" tool_choice=\"auto\",\n",
|
||||
" tool_prompt_format=\"python_list\",\n",
|
||||
" tools=[webSearchTool],\n",
|
||||
" input_shields=input_shields,\n",
|
||||
" output_shields=output_shields,\n",
|
||||
" enable_session_persistence=False,\n",
|
||||
" )\n",
|
||||
"\n",
|
||||
" # Create an agent instance with the client and configuration\n",
|
||||
" agent = Agent(client, agent_config, [webSearchTool])\n",
|
||||
"\n",
|
||||
" # Create a session for interaction and print the session ID\n",
|
||||
" session_id = agent.create_session(\"test-session\")\n",
|
||||
" print(f\"Created session_id={session_id} for Agent({agent.agent_id})\")\n",
|
||||
|
|
|
|||
|
|
@ -110,12 +110,12 @@
|
|||
"from llama_stack_client import LlamaStackClient\n",
|
||||
"from llama_stack_client.lib.agents.agent import Agent\n",
|
||||
"from llama_stack_client.lib.agents.event_logger import EventLogger\n",
|
||||
"from llama_stack_client.types.agent_create_params import AgentConfig\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"async def agent_example():\n",
|
||||
" client = LlamaStackClient(base_url=f\"http://{HOST}:{PORT}\")\n",
|
||||
" agent_config = AgentConfig(\n",
|
||||
" agent = Agent(\n",
|
||||
" client, \n",
|
||||
" model=MODEL_NAME,\n",
|
||||
" instructions=\"You are a helpful assistant! If you call builtin tools like brave search, follow the syntax brave_search.call(…)\",\n",
|
||||
" sampling_params={\n",
|
||||
|
|
@ -130,14 +130,7 @@
|
|||
" \"api_key\": BRAVE_SEARCH_API_KEY,\n",
|
||||
" }\n",
|
||||
" ],\n",
|
||||
" tool_choice=\"auto\",\n",
|
||||
" tool_prompt_format=\"function_tag\",\n",
|
||||
" input_shields=[],\n",
|
||||
" output_shields=[],\n",
|
||||
" enable_session_persistence=False,\n",
|
||||
" )\n",
|
||||
"\n",
|
||||
" agent = Agent(client, agent_config)\n",
|
||||
" session_id = agent.create_session(\"test-session\")\n",
|
||||
" print(f\"Created session_id={session_id} for Agent({agent.agent_id})\")\n",
|
||||
"\n",
|
||||
|
|
|
|||
|
|
@ -40,7 +40,7 @@ If you're looking for more specific topics, we have a [Zero to Hero Guide](#next
|
|||
ollama run llama3.2:3b-instruct-fp16 --keepalive -1m
|
||||
```
|
||||
**Note**:
|
||||
- The supported models for llama stack for now is listed in [here](https://github.com/meta-llama/llama-stack/blob/main/llama_stack/providers/remote/inference/ollama/ollama.py#L43)
|
||||
- The models currently supported by Llama Stack are listed [here](https://github.com/meta-llama/llama-stack/blob/main/llama_stack/providers/remote/inference/ollama/models.py)
|
||||
- `keepalive -1m` is used so that ollama continues to keep the model in memory indefinitely. Otherwise, ollama frees up memory and you would have to run `ollama run` again.
|
||||
|
||||
---
|
||||
|
|
|
|||
|
|
@ -103,7 +103,6 @@
|
|||
"from llama_stack_client.lib.agents.agent import Agent\n",
|
||||
"from llama_stack_client.lib.agents.event_logger import EventLogger\n",
|
||||
"from llama_stack_client.types.agent_create_params import (\n",
|
||||
" AgentConfig,\n",
|
||||
" AgentConfigToolSearchToolDefinition,\n",
|
||||
")\n",
|
||||
"\n",
|
||||
|
|
@ -117,7 +116,8 @@
|
|||
") -> Agent:\n",
|
||||
" \"\"\"Create an agent with specified tools.\"\"\"\n",
|
||||
" print(\"Using the following model: \", model)\n",
|
||||
" agent_config = AgentConfig(\n",
|
||||
" return Agent(\n",
|
||||
" client, \n",
|
||||
" model=model,\n",
|
||||
" instructions=instructions,\n",
|
||||
" sampling_params={\n",
|
||||
|
|
@ -126,12 +126,7 @@
|
|||
" },\n",
|
||||
" },\n",
|
||||
" tools=tools,\n",
|
||||
" tool_choice=\"auto\",\n",
|
||||
" tool_prompt_format=\"json\",\n",
|
||||
" enable_session_persistence=True,\n",
|
||||
" )\n",
|
||||
"\n",
|
||||
" return Agent(client, agent_config)\n"
|
||||
" )\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
|
@ -360,9 +355,9 @@
|
|||
" # Create the agent with the tool\n",
|
||||
" weather_tool = WeatherTool()\n",
|
||||
"\n",
|
||||
" agent_config = AgentConfig(\n",
|
||||
" agent = Agent(\n",
|
||||
" client=client, \n",
|
||||
" model=LLAMA31_8B_INSTRUCT,\n",
|
||||
" # model=model_name,\n",
|
||||
" instructions=\"\"\"\n",
|
||||
" You are a weather assistant that can provide weather information.\n",
|
||||
" Always specify the location clearly in your responses.\n",
|
||||
|
|
@ -373,16 +368,9 @@
|
|||
" \"type\": \"greedy\",\n",
|
||||
" },\n",
|
||||
" },\n",
|
||||
" tools=[weather_tool.get_tool_definition()],\n",
|
||||
" tool_choice=\"auto\",\n",
|
||||
" tool_prompt_format=\"json\",\n",
|
||||
" input_shields=[],\n",
|
||||
" output_shields=[],\n",
|
||||
" enable_session_persistence=True,\n",
|
||||
" tools=[weather_tool],\n",
|
||||
" )\n",
|
||||
"\n",
|
||||
" agent = Agent(client=client, agent_config=agent_config, custom_tools=[weather_tool])\n",
|
||||
"\n",
|
||||
" return agent\n",
|
||||
"\n",
|
||||
"\n",
|
||||
|
|
|
|||