feat(api): level inference/rerank and remove experimental (#3565)

# What does this PR do?

inference/rerank is the one route in the API that is not intended to be deprecated, so level it as v1alpha. Additionally, remove `experimental` and use `v1alpha` instead, which itself implies an experimental state based on the original proposal.

Signed-off-by: Charlie Doern <cdoern@redhat.com>
2025-10-04 04:04:14 +00:00 · 2025-09-29 15:42:09 -04:00 · 2025-09-29 15:42:09 -04:00 · aac42ddcc2
commit aac42ddcc2
parent 975ead1d6a
5 changed files with 13 additions and 10 deletions
--- a/docs/static/llama-stack-spec.html
+++ b/docs/static/llama-stack-spec.html
@ -5431,7 +5431,7 @@
                }
            }
        },
-        "/v1/inference/rerank": {
+        "/v1alpha/inference/rerank": {
            "post": {
                "responses": {
                    "200": {
--- a/docs/static/llama-stack-spec.yaml
+++ b/docs/static/llama-stack-spec.yaml
@ -3895,7 +3895,7 @@ paths:
            schema:
              $ref: '#/components/schemas/QueryTracesRequest'
        required: true
-  /v1/inference/rerank:
+  /v1alpha/inference/rerank:
    post:
      responses:
        '200':
--- a/llama_stack/apis/inference/inference.py
+++ b/llama_stack/apis/inference/inference.py
@ -21,7 +21,7 @@ from llama_stack.apis.common.content_types import ContentDelta, InterleavedConte
 from llama_stack.apis.common.responses import Order
 from llama_stack.apis.models import Model
 from llama_stack.apis.telemetry import MetricResponseMixin
-from llama_stack.apis.version import LLAMA_STACK_API_V1
+from llama_stack.apis.version import LLAMA_STACK_API_V1, LLAMA_STACK_API_V1ALPHA
 from llama_stack.models.llama.datatypes import (
    BuiltinTool,
    StopReason,
@ -1070,7 +1070,7 @@ class InferenceProvider(Protocol):
        """
        ...
-    @webmethod(route="/inference/rerank", method="POST", experimental=True, level=LLAMA_STACK_API_V1)
+    @webmethod(route="/inference/rerank", method="POST", level=LLAMA_STACK_API_V1ALPHA)
    async def rerank(
        self,
        model: str,
--- a/llama_stack/core/resolver.py
+++ b/llama_stack/core/resolver.py
@ -29,6 +29,7 @@ from llama_stack.apis.telemetry import Telemetry
 from llama_stack.apis.tools import ToolGroups, ToolRuntime
 from llama_stack.apis.vector_dbs import VectorDBs
 from llama_stack.apis.vector_io import VectorIO
 from llama_stack.apis.version import LLAMA_STACK_API_V1ALPHA
 from llama_stack.core.client import get_client_impl
 from llama_stack.core.datatypes import (
    AccessRule,
@ -412,8 +413,14 @@ def check_protocol_compliance(obj: Any, protocol: Any) -> None:
    mro = type(obj).__mro__
    for name, value in inspect.getmembers(protocol):
-        if inspect.isfunction(value) and hasattr(value, "__webmethod__"):
+        if inspect.isfunction(value) and hasattr(value, "__webmethods__"):
-            if value.__webmethod__.experimental:
+            has_alpha_api = False
            for webmethod in value.__webmethods__:
                if webmethod.level == LLAMA_STACK_API_V1ALPHA:
                    has_alpha_api = True
                    break
            # if this API has multiple webmethods, and one of them is an alpha API, this API should be skipped when checking for missing or not callable routes
            if has_alpha_api:
                continue
            if not hasattr(obj, name):
                missing_methods.append((name, "missing"))
--- a/llama_stack/schema_utils.py
+++ b/llama_stack/schema_utils.py
@ -22,7 +22,6 @@ class WebMethod:
    raw_bytes_request_body: bool | None = False
    # A descriptive name of the corresponding span created by tracing
    descriptive_name: str | None = None
    experimental: bool | None = False
    required_scope: str | None = None
    deprecated: bool | None = False
@ -39,7 +38,6 @@ def webmethod(
    response_examples: list[Any] | None = None,
    raw_bytes_request_body: bool | None = False,
    descriptive_name: str | None = None,
    experimental: bool | None = False,
    required_scope: str | None = None,
    deprecated: bool | None = False,
 ) -> Callable[[T], T]:
@ -50,7 +48,6 @@ def webmethod(
    :param public: True if the operation can be invoked without prior authentication.
    :param request_examples: Sample requests that the operation might take. Pass a list of objects, not JSON.
    :param response_examples: Sample responses that the operation might produce. Pass a list of objects, not JSON.
    :param experimental: True if the operation is experimental and subject to change.
    :param required_scope: Required scope for this endpoint (e.g., 'monitoring.viewer').
    """
@ -64,7 +61,6 @@ def webmethod(
            response_examples=response_examples,
            raw_bytes_request_body=raw_bytes_request_body,
            descriptive_name=descriptive_name,
            experimental=experimental,
            required_scope=required_scope,
            deprecated=deprecated,
        )