Merge branch 'main' into chroma
Commit c71bcd5479: 124 changed files with 25574 additions and 2425 deletions.
@@ -1,6 +1,6 @@
 # Llama Stack Documentation

-Here's a collection of comprehensive guides, examples, and resources for building AI applications with Llama Stack. For the complete documentation, visit our [ReadTheDocs page](https://llama-stack.readthedocs.io/en/latest/index.html).
+Here's a collection of comprehensive guides, examples, and resources for building AI applications with Llama Stack. For the complete documentation, visit our [Github page](https://llamastack.github.io/latest/getting_started/index.html).

 ## Render locally
docs/_static/llama-stack-spec.html (vendored): 68 lines changed
@@ -1380,6 +1380,40 @@
             }
           }
         ]
       },
+      "delete": {
+        "responses": {
+          "200": {
+            "description": "OK"
+          },
+          "400": {
+            "$ref": "#/components/responses/BadRequest400"
+          },
+          "429": {
+            "$ref": "#/components/responses/TooManyRequests429"
+          },
+          "500": {
+            "$ref": "#/components/responses/InternalServerError500"
+          },
+          "default": {
+            "$ref": "#/components/responses/DefaultError"
+          }
+        },
+        "tags": [
+          "Benchmarks"
+        ],
+        "description": "Unregister a benchmark.",
+        "parameters": [
+          {
+            "name": "benchmark_id",
+            "in": "path",
+            "description": "The ID of the benchmark to unregister.",
+            "required": true,
+            "schema": {
+              "type": "string"
+            }
+          }
+        ]
+      }
     },
     "/v1/openai/v1/chat/completions/{completion_id}": {
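The hunk above only shows the new delete operation, not the path it hangs off. As a quick smoke test, something like the following should exercise the new benchmark-unregister route; the `/v1/eval/benchmarks/{benchmark_id}` path and the local port are assumptions, since neither appears in this hunk.

```python
import requests

# Assumed path for the new delete operation; adjust to the path shown in the full spec.
resp = requests.delete("http://localhost:8321/v1/eval/benchmarks/my-benchmark-id")
resp.raise_for_status()  # 400/429/500 map to the error responses declared above
print(resp.status_code)  # 200 on success
```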
@@ -1620,6 +1654,40 @@
             }
           }
         ]
       },
+      "delete": {
+        "responses": {
+          "200": {
+            "description": "OK"
+          },
+          "400": {
+            "$ref": "#/components/responses/BadRequest400"
+          },
+          "429": {
+            "$ref": "#/components/responses/TooManyRequests429"
+          },
+          "500": {
+            "$ref": "#/components/responses/InternalServerError500"
+          },
+          "default": {
+            "$ref": "#/components/responses/DefaultError"
+          }
+        },
+        "tags": [
+          "ScoringFunctions"
+        ],
+        "description": "Unregister a scoring function.",
+        "parameters": [
+          {
+            "name": "scoring_fn_id",
+            "in": "path",
+            "description": "The ID of the scoring function to unregister.",
+            "required": true,
+            "schema": {
+              "type": "string"
+            }
+          }
+        ]
+      }
     },
     "/v1/shields/{identifier}": {
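Same shape as the benchmark deletion above, for scoring functions. A hedged smoke test; the `/v1/scoring-functions/{scoring_fn_id}` path is an assumption, since the enclosing path key is not visible in this hunk.

```python
import requests

resp = requests.delete("http://localhost:8321/v1/scoring-functions/my-scoring-fn-id")
resp.raise_for_status()
```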
docs/_static/llama-stack-spec.yaml (vendored): 49 lines changed
@@ -954,6 +954,30 @@ paths:
           required: true
           schema:
             type: string
+    delete:
+      responses:
+        '200':
+          description: OK
+        '400':
+          $ref: '#/components/responses/BadRequest400'
+        '429':
+          $ref: >-
+            #/components/responses/TooManyRequests429
+        '500':
+          $ref: >-
+            #/components/responses/InternalServerError500
+        default:
+          $ref: '#/components/responses/DefaultError'
+      tags:
+        - Benchmarks
+      description: Unregister a benchmark.
+      parameters:
+        - name: benchmark_id
+          in: path
+          description: The ID of the benchmark to unregister.
+          required: true
+          schema:
+            type: string
   /v1/openai/v1/chat/completions/{completion_id}:
     get:
       responses:
@@ -1119,6 +1143,31 @@ paths:
           required: true
           schema:
             type: string
+    delete:
+      responses:
+        '200':
+          description: OK
+        '400':
+          $ref: '#/components/responses/BadRequest400'
+        '429':
+          $ref: >-
+            #/components/responses/TooManyRequests429
+        '500':
+          $ref: >-
+            #/components/responses/InternalServerError500
+        default:
+          $ref: '#/components/responses/DefaultError'
+      tags:
+        - ScoringFunctions
+      description: Unregister a scoring function.
+      parameters:
+        - name: scoring_fn_id
+          in: path
+          description: >-
+            The ID of the scoring function to unregister.
+          required: true
+          schema:
+            type: string
   /v1/shields/{identifier}:
     get:
       responses:
@@ -11,11 +11,11 @@
 "\n",
 "# Llama Stack - Building AI Applications\n",
 "\n",
-"<img src=\"https://llama-stack.readthedocs.io/en/latest/_images/llama-stack.png\" alt=\"drawing\" width=\"500\"/>\n",
+"<img src=\"https://llamastack.github.io/latest/_images/llama-stack.png\" alt=\"drawing\" width=\"500\"/>\n",
 "\n",
 "[Llama Stack](https://github.com/meta-llama/llama-stack) defines and standardizes the set of core building blocks needed to bring generative AI applications to market. These building blocks are presented in the form of interoperable APIs with a broad set of Service Providers providing their implementations.\n",
 "\n",
-"Read more about the project here: https://llama-stack.readthedocs.io/en/latest/index.html\n",
+"Read more about the project here: https://llamastack.github.io/latest/getting_started/index.html\n",
 "\n",
 "In this guide, we will showcase how you can build LLM-powered agentic applications using Llama Stack.\n",
 "\n",
@@ -75,7 +75,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 1,
+"execution_count": null,
 "id": "J2kGed0R5PSf",
 "metadata": {
 "colab": {
@@ -113,17 +113,17 @@
 }
 ],
 "source": [
-"import os \n",
+"import os\n",
 "import subprocess\n",
 "import time\n",
 "\n",
-"!pip install uv \n",
+"!pip install uv\n",
 "\n",
 "if \"UV_SYSTEM_PYTHON\" in os.environ:\n",
 " del os.environ[\"UV_SYSTEM_PYTHON\"]\n",
 "\n",
 "# this command installs all the dependencies needed for the llama stack server with the together inference provider\n",
-"!uv run --with llama-stack llama stack build --distro together --image-type venv \n",
+"!uv run --with llama-stack llama stack build --distro together --image-type venv\n",
 "\n",
 "def run_llama_stack_server_background():\n",
 " log_file = open(\"llama_stack_server.log\", \"w\")\n",
@@ -134,7 +134,7 @@
 " stderr=log_file,\n",
 " text=True\n",
 " )\n",
-" \n",
+"\n",
 " print(f\"Starting Llama Stack server with PID: {process.pid}\")\n",
 " return process\n",
 "\n",
@@ -142,11 +142,11 @@
 " import requests\n",
 " from requests.exceptions import ConnectionError\n",
 " import time\n",
-" \n",
+"\n",
 " url = \"http://0.0.0.0:8321/v1/health\"\n",
 " max_retries = 30\n",
 " retry_interval = 1\n",
-" \n",
+"\n",
 " print(\"Waiting for server to start\", end=\"\")\n",
 " for _ in range(max_retries):\n",
 " try:\n",
@@ -157,12 +157,12 @@
 " except ConnectionError:\n",
 " print(\".\", end=\"\", flush=True)\n",
 " time.sleep(retry_interval)\n",
-" \n",
+"\n",
 " print(\"\\nServer failed to start after\", max_retries * retry_interval, \"seconds\")\n",
 " return False\n",
 "\n",
 "\n",
-"# use this helper if needed to kill the server \n",
+"# use this helper if needed to kill the server\n",
 "def kill_llama_stack_server():\n",
 " # Kill any existing llama stack server processes\n",
 " os.system(\"ps aux | grep -v grep | grep llama_stack.core.server.server | awk '{print $2}' | xargs kill -9\")\n"
@@ -242,7 +242,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 4,
+"execution_count": null,
 "id": "E1UFuJC570Tk",
 "metadata": {
 "colab": {
@@ -407,9 +407,9 @@
 "from llama_stack_client import LlamaStackClient\n",
 "\n",
 "client = LlamaStackClient(\n",
-" base_url=\"http://0.0.0.0:8321\", \n",
+" base_url=\"http://0.0.0.0:8321\",\n",
 " provider_data = {\n",
-" \"tavily_search_api_key\": os.environ['TAVILY_SEARCH_API_KEY'], \n",
+" \"tavily_search_api_key\": os.environ['TAVILY_SEARCH_API_KEY'],\n",
 " \"together_api_key\": os.environ['TOGETHER_API_KEY']\n",
 " }\n",
 ")"
@@ -1177,7 +1177,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 13,
+"execution_count": null,
 "id": "WS8Gu5b0APHs",
 "metadata": {
 "colab": {
@@ -1207,7 +1207,7 @@
 "from termcolor import cprint\n",
 "\n",
 "agent = Agent(\n",
-" client, \n",
+" client,\n",
 " model=\"meta-llama/Llama-3.3-70B-Instruct\",\n",
 " instructions=\"You are a helpful assistant. Use websearch tool to help answer questions.\",\n",
 " tools=[\"builtin::websearch\"],\n",
@@ -1249,7 +1249,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 14,
+"execution_count": null,
 "id": "GvLWltzZCNkg",
 "metadata": {
 "colab": {
@@ -1367,7 +1367,7 @@
 " chunk_size_in_tokens=512,\n",
 ")\n",
 "rag_agent = Agent(\n",
-" client, \n",
+" client,\n",
 " model=model_id,\n",
 " instructions=\"You are a helpful assistant\",\n",
 " tools = [\n",
@@ -2154,7 +2154,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 21,
+"execution_count": null,
 "id": "vttLbj_YO01f",
 "metadata": {
 "colab": {
@@ -2217,7 +2217,7 @@
 "from termcolor import cprint\n",
 "\n",
 "agent = Agent(\n",
-" client, \n",
+" client,\n",
 " model=model_id,\n",
 " instructions=\"You are a helpful assistant\",\n",
 " tools=[\"mcp::filesystem\"],\n",
@@ -2283,7 +2283,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 22,
+"execution_count": null,
 "id": "4iCO59kP20Zs",
 "metadata": {
 "colab": {
@@ -2317,7 +2317,7 @@
 "from llama_stack_client import Agent, AgentEventLogger\n",
 "\n",
 "agent = Agent(\n",
-" client, \n",
+" client,\n",
 " model=\"meta-llama/Llama-3.3-70B-Instruct\",\n",
 " instructions=\"You are a helpful assistant. Use web_search tool to answer the questions.\",\n",
 " tools=[\"builtin::websearch\"],\n",
@@ -2846,7 +2846,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 29,
+"execution_count": null,
 "id": "44e05e16",
 "metadata": {},
 "outputs": [
@@ -2880,8 +2880,7 @@
 "!curl -O https://raw.githubusercontent.com/meta-llama/llama-models/refs/heads/main/Llama_Repo.jpeg\n",
 "\n",
 "from IPython.display import Image\n",
-"Image(\"Llama_Repo.jpeg\", width=256, height=256)\n",
-"\n"
+"Image(\"Llama_Repo.jpeg\", width=256, height=256)\n"
 ]
 },
 {
@@ -11,11 +11,11 @@
 "\n",
 "# Getting Started with Llama 4 in Llama Stack\n",
 "\n",
-"<img src=\"https://llama-stack.readthedocs.io/en/latest/_images/llama-stack.png\" alt=\"drawing\" width=\"500\"/>\n",
+"<img src=\"https://llamastack.github.io/latest/_images/llama-stack.png\" alt=\"drawing\" width=\"500\"/>\n",
 "\n",
 "[Llama Stack](https://github.com/meta-llama/llama-stack) defines and standardizes the set of core building blocks needed to bring generative AI applications to market. These building blocks are presented in the form of interoperable APIs with a broad set of Service Providers providing their implementations.\n",
 "\n",
-"Read more about the project here: https://llama-stack.readthedocs.io/en/latest/index.html\n",
+"Read more about the project here: https://llamastack.github.io/latest/index.html\n",
 "\n",
 "In this guide, we will showcase how you can get started with using Llama 4 in Llama Stack.\n",
 "\n",
@@ -51,7 +51,7 @@
 "metadata": {},
 "outputs": [],
 "source": [
-"!pip install uv \n",
+"!pip install uv\n",
 "\n",
 "MODEL=\"Llama-4-Scout-17B-16E-Instruct\"\n",
 "# get meta url from llama.com\n",
@@ -223,7 +223,7 @@
 }
 ],
 "source": [
-"import os \n",
+"import os\n",
 "import subprocess\n",
 "import time\n",
 "\n",
@@ -232,8 +232,8 @@
 "if \"UV_SYSTEM_PYTHON\" in os.environ:\n",
 " del os.environ[\"UV_SYSTEM_PYTHON\"]\n",
 "\n",
-"# this command installs all the dependencies needed for the llama stack server \n",
-"!uv run --with llama-stack llama stack build --distro meta-reference-gpu --image-type venv \n",
+"# this command installs all the dependencies needed for the llama stack server\n",
+"!uv run --with llama-stack llama stack build --distro meta-reference-gpu --image-type venv\n",
 "\n",
 "def run_llama_stack_server_background():\n",
 " log_file = open(\"llama_stack_server.log\", \"w\")\n",
@@ -244,7 +244,7 @@
 " stderr=log_file,\n",
 " text=True\n",
 " )\n",
-" \n",
+"\n",
 " print(f\"Starting Llama Stack server with PID: {process.pid}\")\n",
 " return process\n",
 "\n",
@@ -252,11 +252,11 @@
 " import requests\n",
 " from requests.exceptions import ConnectionError\n",
 " import time\n",
-" \n",
+"\n",
 " url = \"http://0.0.0.0:8321/v1/health\"\n",
 " max_retries = 30\n",
 " retry_interval = 1\n",
-" \n",
+"\n",
 " print(\"Waiting for server to start\", end=\"\")\n",
 " for _ in range(max_retries):\n",
 " try:\n",
@@ -267,12 +267,12 @@
 " except ConnectionError:\n",
 " print(\".\", end=\"\", flush=True)\n",
 " time.sleep(retry_interval)\n",
-" \n",
+"\n",
 " print(\"\\nServer failed to start after\", max_retries * retry_interval, \"seconds\")\n",
 " return False\n",
 "\n",
 "\n",
-"# use this helper if needed to kill the server \n",
+"# use this helper if needed to kill the server\n",
 "def kill_llama_stack_server():\n",
 " # Kill any existing llama stack server processes\n",
 " os.system(\"ps aux | grep -v grep | grep llama_stack.core.server.server | awk '{print $2}' | xargs kill -9\")\n"
File diff suppressed because one or more lines are too long
@@ -14,7 +14,7 @@
 "We will also showcase how to leverage existing Llama stack [inference APIs](https://github.com/meta-llama/llama-stack/blob/main/llama_stack/apis/inference/inference.py) (ollama as provider) to get the new model's output and the [eval APIs](https://github.com/meta-llama/llama-stack/blob/main/llama_stack/apis/eval/eval.py) to help you better measure the new model performance. We hope the flywheel of post-training -> eval -> inference can greatly empower agentic apps development.\n",
 "\n",
 "\n",
-"- Read more about Llama Stack: https://llama-stack.readthedocs.io/en/latest/introduction/index.html\n",
+"- Read more about Llama Stack: https://llamastack.github.io/latest/index.html\n",
 "- Read more about post training APIs definition: https://github.com/meta-llama/llama-stack/blob/main/llama_stack/apis/post_training/post_training.py\n",
 "\n",
 "\n",
@@ -3632,7 +3632,7 @@
 },
 "source": [
 "#### 1.2. Kick-off eval job\n",
-"- More details on Llama-stack eval: https://llama-stack.readthedocs.io/en/latest/benchmark_evaluations/index.html\n",
+"- More details on Llama-stack eval: https://llamastack.github.io/latest/references/evals_reference/index.html\n",
 " - Define an EvalCandidate\n",
 " - Run evaluate on datasets (we choose brainstrust's answer-similarity as scoring function with OpenAI's model as judge model)\n",
 "\n",
@@ -12,7 +12,7 @@
 "\n",
 "This notebook will walk you through the main sets of APIs we offer with Llama Stack for supporting running benchmark evaluations of your with working examples to explore the possibilities that Llama Stack opens up for you.\n",
 "\n",
-"Read more about Llama Stack: https://llama-stack.readthedocs.io/en/latest/index.html"
+"Read more about Llama Stack: https://llamastack.github.io/latest/index.html"
 ]
 },
 {
@@ -373,7 +373,7 @@
 " metadata={\n",
 " \"format\": \"json\",\n",
 " \"description\": \"Tool calling xLAM dataset in OpenAI ChatCompletions format\",\n",
-" \"provider\": \"nvidia\"\n",
+" \"provider_id\": \"nvidia\"\n",
 " }\n",
 ")\n",
 "print(response)"
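The only substantive change in this hunk is the metadata key used to route the dataset registration to the NVIDIA provider. Shown standalone below as a small sketch; the surrounding register call is not visible in the hunk, so treat the variable name as illustrative only.

```python
# Corrected metadata block: the key is "provider_id", not "provider".
dataset_metadata = {
    "format": "json",
    "description": "Tool calling xLAM dataset in OpenAI ChatCompletions format",
    "provider_id": "nvidia",
}
```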
@@ -11,7 +11,7 @@
 "\n",
 "# Llama Stack - Building AI Applications\n",
 "\n",
-"<img src=\"https://llama-stack.readthedocs.io/en/latest/_images/llama-stack.png\" alt=\"drawing\" width=\"500\"/>\n",
+"<img src=\"https://llamastack.github.io/latest/_images/llama-stack.png\" alt=\"drawing\" width=\"500\"/>\n",
 "\n",
 "Get started with Llama Stack in minutes!\n",
 "\n",
@@ -138,7 +138,7 @@
 },
 "outputs": [],
 "source": [
-"import os \n",
+"import os\n",
 "import subprocess\n",
 "\n",
 "if \"UV_SYSTEM_PYTHON\" in os.environ:\n",
@@ -150,13 +150,13 @@
 "def run_llama_stack_server_background():\n",
 " log_file = open(\"llama_stack_server.log\", \"w\")\n",
 " process = subprocess.Popen(\n",
-" f\"OLLAMA_URL=http://localhost:11434 uv run --with llama-stack llama stack run starter --image-type venv",
+" f\"OLLAMA_URL=http://localhost:11434 uv run --with llama-stack llama stack run starter --image-type venv\n",
 " shell=True,\n",
 " stdout=log_file,\n",
 " stderr=log_file,\n",
 " text=True\n",
 " )\n",
-" \n",
+"\n",
 " print(f\"Starting Llama Stack server with PID: {process.pid}\")\n",
 " return process\n",
 "\n",
@@ -164,11 +164,11 @@
 " import requests\n",
 " from requests.exceptions import ConnectionError\n",
 " import time\n",
-" \n",
+"\n",
 " url = \"http://0.0.0.0:8321/v1/health\"\n",
 " max_retries = 30\n",
 " retry_interval = 1\n",
-" \n",
+"\n",
 " print(\"Waiting for server to start\", end=\"\")\n",
 " for _ in range(max_retries):\n",
 " try:\n",
@@ -179,12 +179,12 @@
 " except ConnectionError:\n",
 " print(\".\", end=\"\", flush=True)\n",
 " time.sleep(retry_interval)\n",
-" \n",
+"\n",
 " print(\"\\nServer failed to start after\", max_retries * retry_interval, \"seconds\")\n",
 " return False\n",
 "\n",
 "\n",
-"# use this helper if needed to kill the server \n",
+"# use this helper if needed to kill the server\n",
 "def kill_llama_stack_server():\n",
 " # Kill any existing llama stack server processes\n",
 " os.system(\"ps aux | grep -v grep | grep llama_stack.core.server.server | awk '{print $2}' | xargs kill -9\")\n"
docs/source/apis/api_leveling.md (new file): 94 lines added
@@ -0,0 +1,94 @@
# Llama Stack API Stability Leveling

In order to provide a stable experience in Llama Stack, the various APIs need different stability levels indicating the level of support, backwards compatibility, and overall production readiness.

## Different Levels

### v1alpha

- Little to no expectation of support between versions
- Breaking changes are permitted
- Datatypes and parameters can break
- Routes can be added and removed

#### Graduation Criteria

- An API can graduate from `v1alpha` to `v1beta` if the team has identified the extent of the non-optional routes and the shape of their parameters/return types for the API, e.g. `/v1/openai/chat/completions`. Optional types can change.
- CRUD must stay stable once in `v1beta`. This is a commitment to backward compatibility, guaranteeing that most code you write against the v1beta version will not break during future updates. We may make additive changes (like adding a new, optional field to a response), but we will not make breaking changes (like renaming an existing "modelName" field to "name", changing an ID's data type from an integer to a string, or altering an endpoint URL).
- For OpenAI APIs, a comparison to the OpenAI spec for the specific API can be done to ensure completeness (see the sketch after this list).
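A minimal sketch of what "OpenAI APIs" refers to here: the stack exposes OpenAI-compatible routes (the `/v1/openai/v1/...` paths touched elsewhere in this commit), so completeness can be checked by pointing the standard `openai` client at the stack. The port, model id, and exact base path are assumptions taken from other files in this commit, not from this document.

```python
from openai import OpenAI

# Llama Stack's OpenAI-compatible surface, assumed to be mounted under /v1/openai/v1.
client = OpenAI(base_url="http://localhost:8321/v1/openai/v1", api_key="none")

resp = client.chat.completions.create(
    model="meta-llama/Llama-3.3-70B-Instruct",
    messages=[{"role": "user", "content": "Say hello in one sentence."}],
)
print(resp.choices[0].message.content)
```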
### v1beta

- API routes remain consistent between versions
- Parameters and return types are not ensured between versions
- API, besides minor fixes and adjustments, should be _almost_ v1. Changes should not be drastic.

#### Graduation Criteria

- An API can graduate from `v1beta` to `v1` if the API surface and datatypes are complete as identified by the team. The parameters and return types that are mandatory for each route are stable. All aspects of graduating from `v1alpha` to `v1beta` apply as well.
- Optional parameters, routes, or parts of the return type can be added after graduating to `v1`

### v1 (stable)

- Considered stable
- Backwards compatible between Z-streams
- Y-stream breaking changes must go through the proper approval and announcement process.
- Datatypes for a route and its return types cannot change between Z-streams
- Y-stream datatype changes should be sparing, unless the changes are additional net-new parameters
- Must have proper conformance testing as outlined in https://github.com/llamastack/llama-stack/issues/3237

### v2+ (Major Versions)

Introducing a new major version like `/v2` is a significant and disruptive event that should be treated as a last resort. It is reserved for essential changes to a stable `/v1` API that are fundamentally backward-incompatible and cannot be implemented through additive, non-breaking changes or breaking changes across X/Y-Stream releases (x.y.z).

If a `/v2` version is deemed absolutely necessary, it must adhere to the following protocol to ensure a sane and predictable transition for users:

#### Lifecycle Progression

A new major version must follow the same stability lifecycle as `/v1`. It will be introduced as `/v2alpha`, mature to `/v2beta`, and finally become stable as `/v2`.

#### Coexistence

The new `/v2` API must be introduced alongside the existing `/v1` API and run in parallel. It must not replace the `/v1` API immediately.

#### Deprecation Policy

When a `/v2` API is introduced, a clear and generous deprecation policy for the `/v1` API must be published simultaneously. This policy must outline the timeline for the eventual removal of the `/v1` API, giving users ample time to migrate.

### API Stability vs. Provider Stability

The leveling introduced in this document relates to the stability of the API and not specifically the providers within the API.

Providers can iterate as much as they want on functionality as long as they work within the bounds of an API. If they need to change the API, then the API should not be `/v1`, or those breaking changes can only happen on a y-stream release basis.

### Approval and Announcement Process for Breaking Changes

- **PR Labeling**: Any pull request that introduces a breaking API change must be clearly labeled with `breaking-change`.
- **PR Title/Commit**: Any pull request that introduces a breaking API change must contain `BREAKING CHANGE` in the title and commit footer. Alternatively, the commit can include `!`, e.g. `feat(api)!: title goes here`. This is outlined in the [conventional commits documentation](https://www.conventionalcommits.org/en/v1.0.0/#specification).
- **Maintainer Review**: At least one maintainer must explicitly acknowledge the breaking change during review by applying the `breaking-change` label. An approval must come with this label or the acknowledgement that this label has already been applied.
- **Announcement**: Breaking changes require inclusion in release notes and, if applicable, a separate communication (e.g., Discord, GitHub Issues, or GitHub Discussions) prior to release.

If a PR has proper approvals, labels, and commit/title hygiene, the failing API conformance tests will be bypassed.

## Enforcement

### Migration of API routes under `/v1alpha`, `/v1beta`, and `/v1`

Instead of placing every API under `/v1`, any API that is not fully stable or complete should go under `/v1alpha` or `/v1beta`. For example, at the time of this writing, `post_training` belongs here, as well as any OpenAI-compatible API whose surface does not exactly match the upstream OpenAI API it mimics.

This migration is crucial as we get Llama Stack in the hands of users who intend to productize various APIs. A clear view of what is stable and what is actively being developed will enable users to pick and choose various APIs to build their products on.

This migration will be a breaking change for any API moving out of `/v1`. Ideally, this should happen before 0.3.0 and especially 1.0.0.
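To make the impact concrete, a hypothetical before/after sketch of a client hit by such a migration; the route below is illustrative only and is not taken from the spec in this commit.

```python
import requests

BASE = "http://localhost:8321"

# Before the migration: the API was reachable under the stable prefix.
requests.get(f"{BASE}/v1/post-training/jobs")       # hypothetical route

# After the migration: callers must opt in to the alpha surface explicitly,
# which is exactly why the move is a breaking change.
requests.get(f"{BASE}/v1alpha/post-training/jobs")  # hypothetical route
```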
### `x-stability` tags in the OpenAPI spec for oasdiff

`x-stability` tags allow tools like oasdiff to enforce different rules for different stability levels; these tags should match the routes: [oasdiff stability](https://github.com/oasdiff/oasdiff/blob/main/docs/STABILITY.md)

### Testing

The testing of each stable API is already outlined in [issue #3237](https://github.com/llamastack/llama-stack/issues/3237) and is being worked on. These sorts of conformance tests should apply primarily to `/v1` APIs only, with `/v1alpha` and `/v1beta` having any tests the maintainers see fit as well as basic testing to ensure the routing works properly.

### New APIs going forward

Any subsequently introduced APIs should be introduced as `/v1alpha`.
@@ -11,6 +11,7 @@ The `llamastack/distribution-nvidia` distribution consists of the following prov
 | agents | `inline::meta-reference` |
 | datasetio | `inline::localfs`, `remote::nvidia` |
 | eval | `remote::nvidia` |
+| files | `inline::localfs` |
 | inference | `remote::nvidia` |
 | post_training | `remote::nvidia` |
 | safety | `remote::nvidia` |
@@ -9,8 +9,8 @@ IBM WatsonX inference provider for accessing AI models on IBM's WatsonX platform
 | Field | Type | Required | Default | Description |
 |-------|------|----------|---------|-------------|
 | `url` | `<class 'str'>` | No | https://us-south.ml.cloud.ibm.com | A base url for accessing the watsonx.ai |
-| `api_key` | `pydantic.types.SecretStr \| None` | No | | The watsonx API key, only needed of using the hosted service |
-| `project_id` | `str \| None` | No | | The Project ID key, only needed of using the hosted service |
+| `api_key` | `pydantic.types.SecretStr \| None` | No | | The watsonx API key |
+| `project_id` | `str \| None` | No | | The Project ID key |
 | `timeout` | `<class 'int'>` | No | 60 | Timeout for the HTTP requests |

 ## Sample Configuration
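For reference, the fields from the table above collected into a plain Python mapping; this is only a sketch of the shape (the environment variable names are assumptions, and the real provider config lives in the distribution's run.yaml).

```python
import os

watsonx_provider_config = {
    "url": "https://us-south.ml.cloud.ibm.com",          # default from the table
    "api_key": os.environ.get("WATSONX_API_KEY"),        # SecretStr | None
    "project_id": os.environ.get("WATSONX_PROJECT_ID"),  # str | None
    "timeout": 60,                                        # seconds, default 60
}
```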
@@ -9,7 +9,7 @@
 "\n",
 "This document provides instructions on how to use Llama Stack's `chat_completion` function for generating text using the `Llama3.2-3B-Instruct` model. \n",
 "\n",
-"Before you begin, please ensure Llama Stack is installed and set up by following the [Getting Started Guide](https://llama-stack.readthedocs.io/en/latest/getting_started/index.html).\n",
+"Before you begin, please ensure Llama Stack is installed and set up by following the [Getting Started Guide](https://llamastack.github.io/latest/getting_started/index.html).\n",
 "\n",
 "\n",
 "### Table of Contents\n",
@@ -10,7 +10,7 @@
 "This guide provides a streamlined setup to switch between local and cloud clients for text generation with Llama Stack’s `chat_completion` API. This setup enables automatic fallback to a cloud instance if the local client is unavailable.\n",
 "\n",
 "### Prerequisites\n",
-"Before you begin, please ensure Llama Stack is installed and the distribution is set up by following the [Getting Started Guide](https://llama-stack.readthedocs.io/en/latest/). You will need to run two distributions, a local and a cloud distribution, for this demo to work.\n",
+"Before you begin, please ensure Llama Stack is installed and the distribution is set up by following the [Getting Started Guide](https://llamastack.github.io/latest/getting_started/index.html). You will need to run two distributions, a local and a cloud distribution, for this demo to work.\n",
 "\n",
 "### Implementation"
 ]
@@ -11,7 +11,7 @@
 "\n",
 "This interactive guide covers prompt engineering & best practices with Llama 3.2 and Llama Stack.\n",
 "\n",
-"Before you begin, please ensure Llama Stack is installed and set up by following the [Getting Started Guide](https://llama-stack.readthedocs.io/en/latest/getting_started/index.html)."
+"Before you begin, please ensure Llama Stack is installed and set up by following the [Getting Started Guide](https://llamastack.github.io/latest/getting_started/index.html)."
 ]
 },
 {
@@ -7,7 +7,7 @@
 "source": [
 "## Getting Started with LlamaStack Vision API\n",
 "\n",
-"Before you begin, please ensure Llama Stack is installed and set up by following the [Getting Started Guide](https://llama-stack.readthedocs.io/en/latest/getting_started/index.html).\n",
+"Before you begin, please ensure Llama Stack is installed and set up by following the [Getting Started Guide](https://llamastack.github.io/latest/getting_started/index.html).\n",
 "\n",
 "Let's import the necessary packages"
 ]
@@ -26,7 +26,7 @@
 "A running instance of the Llama Stack server (we'll use localhost in \n",
 "this tutorial)\n",
 "\n",
-"Before you begin, please ensure Llama Stack is installed and set up by following the [Getting Started Guide](https://llama-stack.readthedocs.io/en/latest/getting_started/index.html).\n",
+"Before you begin, please ensure Llama Stack is installed and set up by following the [Getting Started Guide](https://llamastack.github.io/latest/getting_started/index.html).\n",
 "\n",
 "Let's start by installing the required packages:"
 ]
@@ -268,7 +268,7 @@
 " # Split document content into chunks of 512 characters\n",
 " content = doc.content\n",
 " chunk_size = 512\n",
-" \n",
+"\n",
 " # Create chunks of the specified size\n",
 " for i in range(0, len(content), chunk_size):\n",
 " chunk_content = content[i:i+chunk_size]\n",
@@ -6,7 +6,7 @@
 "source": [
 "## Safety API 101\n",
 "\n",
-"This document talks about the Safety APIs in Llama Stack. Before you begin, please ensure Llama Stack is installed and set up by following the [Getting Started Guide](https://llama-stack.readthedocs.io/en/latest/getting_started/index.html).\n",
+"This document talks about the Safety APIs in Llama Stack. Before you begin, please ensure Llama Stack is installed and set up by following the [Getting Started Guide](https://llamastack.github.io/latest/getting_started/index.html).\n",
 "\n",
 "As outlined in our [Responsible Use Guide](https://www.llama.com/docs/how-to-guides/responsible-use-guide-resources/), LLM apps should deploy appropriate system level safeguards to mitigate safety and security risks of LLM system, similar to the following diagram:\n",
 "\n",
@@ -6,7 +6,7 @@
 "source": [
 "## Agentic API 101\n",
 "\n",
-"This document talks about the Agentic APIs in Llama Stack. Before you begin, please ensure Llama Stack is installed and set up by following the [Getting Started Guide](https://llama-stack.readthedocs.io/en/latest/getting_started/index.html).\n",
+"This document talks about the Agentic APIs in Llama Stack. Before you begin, please ensure Llama Stack is installed and set up by following the [Getting Started Guide](https://llamastack.github.io/latest/getting_started/index.html).\n",
 "\n",
 "Starting Llama 3.1 you can build agentic applications capable of:\n",
 "\n",
@@ -9,13 +9,18 @@ If you're looking for more specific topics, we have a [Zero to Hero Guide](#next
 > If you'd prefer not to set up a local server, explore our notebook on [tool calling with the Together API](Tool_Calling101_Using_Together_Llama_Stack_Server.ipynb). This notebook will show you how to leverage together.ai's Llama Stack Server API, allowing you to get started with Llama Stack without the need for a locally built and running server.

 ## Table of Contents
-1. [Setup and run ollama](#setup-ollama)
-2. [Install Dependencies and Set Up Environment](#install-dependencies-and-set-up-environment)
-3. [Build, Configure, and Run Llama Stack](#build-configure-and-run-llama-stack)
-4. [Test with llama-stack-client CLI](#test-with-llama-stack-client-cli)
-5. [Test with curl](#test-with-curl)
-6. [Test with Python](#test-with-python)
-7. [Next Steps](#next-steps)
+- [Llama Stack: from Zero to Hero](#llama-stack-from-zero-to-hero)
+  - [Table of Contents](#table-of-contents)
+  - [Setup ollama](#setup-ollama)
+  - [Install Dependencies and Set Up Environment](#install-dependencies-and-set-up-environment)
+  - [Build, Configure, and Run Llama Stack](#build-configure-and-run-llama-stack)
+  - [Test with `llama-stack-client` CLI](#test-with-llama-stack-client-cli)
+  - [Test with `curl`](#test-with-curl)
+  - [Test with Python](#test-with-python)
+    - [1. Create Python Script (`test_llama_stack.py`)](#1-create-python-script-test_llama_stackpy)
+    - [2. Create a Chat Completion Request in Python](#2-create-a-chat-completion-request-in-python)
+    - [3. Run the Python Script](#3-run-the-python-script)
+  - [Next Steps](#next-steps)

 ---

@@ -242,7 +247,7 @@ This command initializes the model to interact with your local Llama Stack insta
 ## Next Steps

 **Explore Other Guides**: Dive deeper into specific topics by following these guides:
-- [Understanding Distribution](https://llama-stack.readthedocs.io/en/latest/concepts/index.html#distributions)
+- [Understanding Distribution](https://llamastack.github.io/latest/concepts/index.html#distributions)
 - [Inference 101](00_Inference101.ipynb)
 - [Local and Cloud Model Toggling 101](01_Local_Cloud_Inference101.ipynb)
 - [Prompt Engineering](02_Prompt_Engineering101.ipynb)
@@ -259,7 +264,7 @@ This command initializes the model to interact with your local Llama Stack insta
 - [Swift SDK](https://github.com/meta-llama/llama-stack-client-swift)
 - [Kotlin SDK](https://github.com/meta-llama/llama-stack-client-kotlin)

-**Advanced Configuration**: Learn how to customize your Llama Stack distribution by referring to the [Building a Llama Stack Distribution](https://llama-stack.readthedocs.io/en/latest/distributions/building_distro.html) guide.
+**Advanced Configuration**: Learn how to customize your Llama Stack distribution by referring to the [Building a Llama Stack Distribution](https://llamastack.github.io/latest/distributions/building_distro.html) guide.

 **Explore Example Apps**: Check out [llama-stack-apps](https://github.com/meta-llama/llama-stack-apps/tree/main/examples) for example applications built using Llama Stack.