From ad52849072424a56b5c586d2964fa699dcdbada9 Mon Sep 17 00:00:00 2001
From: Jiayi <jiayin@nvidia.com>
Date: Fri, 17 Oct 2025 14:51:17 -0700
Subject: [PATCH] Remove Experimental label from rerank docs; document rerank_model_list

---
 docs/docs/providers/inference/index.mdx               | 4 ++--
 docs/static/deprecated-llama-stack-spec.html          | 2 +-
 docs/static/deprecated-llama-stack-spec.yaml          | 4 ++--
 docs/static/llama-stack-spec.html                     | 2 +-
 docs/static/llama-stack-spec.yaml                     | 4 ++--
 docs/static/stainless-llama-stack-spec.html           | 2 +-
 docs/static/stainless-llama-stack-spec.yaml           | 4 ++--
 llama_stack/apis/inference/inference.py               | 2 +-
 llama_stack/providers/utils/inference/openai_mixin.py | 1 +
 9 files changed, 13 insertions(+), 12 deletions(-)

diff --git a/docs/docs/providers/inference/index.mdx b/docs/docs/providers/inference/index.mdx
index bc31caf5f..478611420 100644
--- a/docs/docs/providers/inference/index.mdx
+++ b/docs/docs/providers/inference/index.mdx
@@ -6,7 +6,7 @@ description: "Inference
     This API provides the raw interface to the underlying models. Three kinds of models are supported:
     - LLM models: these models generate \"raw\" and \"chat\" (conversational) completions.
     - Embedding models: these models generate embeddings to be used for semantic search.
-    - Rerank models (Experimental): these models reorder the documents based on their relevance to a query."
+    - Rerank models: these models reorder the documents based on their relevance to a query."
 sidebar_label: Inference
 title: Inference
 ---
@@ -22,6 +22,6 @@ Inference
     This API provides the raw interface to the underlying models. Three kinds of models are supported:
     - LLM models: these models generate "raw" and "chat" (conversational) completions.
     - Embedding models: these models generate embeddings to be used for semantic search.
-    - Rerank models (Experimental): these models reorder the documents based on their relevance to a query.
+    - Rerank models: these models reorder the documents based on their relevance to a query.
 
 This section contains documentation for all available providers for the **inference** API.
diff --git a/docs/static/deprecated-llama-stack-spec.html b/docs/static/deprecated-llama-stack-spec.html
index ef8ec0464..f038a910c 100644
--- a/docs/static/deprecated-llama-stack-spec.html
+++ b/docs/static/deprecated-llama-stack-spec.html
@@ -13459,7 +13459,7 @@
         },
         {
             "name": "Inference",
-            "description": "Llama Stack Inference API for generating completions, chat completions, and embeddings.\n\nThis API provides the raw interface to the underlying models. Three kinds of models are supported:\n- LLM models: these models generate \"raw\" and \"chat\" (conversational) completions.\n- Embedding models: these models generate embeddings to be used for semantic search.\n- Rerank models (Experimental): these models reorder the documents based on their relevance to a query.",
+            "description": "Llama Stack Inference API for generating completions, chat completions, and embeddings.\n\nThis API provides the raw interface to the underlying models. Three kinds of models are supported:\n- LLM models: these models generate \"raw\" and \"chat\" (conversational) completions.\n- Embedding models: these models generate embeddings to be used for semantic search.\n- Rerank models: these models reorder the documents based on their relevance to a query.",
             "x-displayName": "Inference"
         },
         {
diff --git a/docs/static/deprecated-llama-stack-spec.yaml b/docs/static/deprecated-llama-stack-spec.yaml
index 74cd3ac56..47f009635 100644
--- a/docs/static/deprecated-llama-stack-spec.yaml
+++ b/docs/static/deprecated-llama-stack-spec.yaml
@@ -10218,8 +10218,8 @@ tags:
       - Embedding models: these models generate embeddings to be used for semantic
       search.
 
-      - Rerank models (Experimental): these models reorder the documents based on
-      their relevance to a query.
+      - Rerank models: these models reorder the documents based on their relevance
+      to a query.
     x-displayName: Inference
   - name: Models
     description: ''
diff --git a/docs/static/llama-stack-spec.html b/docs/static/llama-stack-spec.html
index 4a011de66..ca12ef485 100644
--- a/docs/static/llama-stack-spec.html
+++ b/docs/static/llama-stack-spec.html
@@ -13262,7 +13262,7 @@
         },
         {
             "name": "Inference",
-            "description": "Llama Stack Inference API for generating completions, chat completions, and embeddings.\n\nThis API provides the raw interface to the underlying models. Three kinds of models are supported:\n- LLM models: these models generate \"raw\" and \"chat\" (conversational) completions.\n- Embedding models: these models generate embeddings to be used for semantic search.\n- Rerank models (Experimental): these models reorder the documents based on their relevance to a query.",
+            "description": "Llama Stack Inference API for generating completions, chat completions, and embeddings.\n\nThis API provides the raw interface to the underlying models. Three kinds of models are supported:\n- LLM models: these models generate \"raw\" and \"chat\" (conversational) completions.\n- Embedding models: these models generate embeddings to be used for semantic search.\n- Rerank models: these models reorder the documents based on their relevance to a query.",
             "x-displayName": "Inference"
         },
         {
diff --git a/docs/static/llama-stack-spec.yaml b/docs/static/llama-stack-spec.yaml
index 2819cde1f..066b38f9b 100644
--- a/docs/static/llama-stack-spec.yaml
+++ b/docs/static/llama-stack-spec.yaml
@@ -10191,8 +10191,8 @@ tags:
       - Embedding models: these models generate embeddings to be used for semantic
       search.
 
-      - Rerank models (Experimental): these models reorder the documents based on
-      their relevance to a query.
+      - Rerank models: these models reorder the documents based on their relevance
+      to a query.
     x-displayName: Inference
   - name: Inspect
     description: >-
diff --git a/docs/static/stainless-llama-stack-spec.html b/docs/static/stainless-llama-stack-spec.html
index 29edfa6b4..563ad54d0 100644
--- a/docs/static/stainless-llama-stack-spec.html
+++ b/docs/static/stainless-llama-stack-spec.html
@@ -17952,7 +17952,7 @@
         },
         {
             "name": "Inference",
-            "description": "Llama Stack Inference API for generating completions, chat completions, and embeddings.\n\nThis API provides the raw interface to the underlying models. Three kinds of models are supported:\n- LLM models: these models generate \"raw\" and \"chat\" (conversational) completions.\n- Embedding models: these models generate embeddings to be used for semantic search.\n- Rerank models (Experimental): these models reorder the documents based on their relevance to a query.",
+            "description": "Llama Stack Inference API for generating completions, chat completions, and embeddings.\n\nThis API provides the raw interface to the underlying models. Three kinds of models are supported:\n- LLM models: these models generate \"raw\" and \"chat\" (conversational) completions.\n- Embedding models: these models generate embeddings to be used for semantic search.\n- Rerank models: these models reorder the documents based on their relevance to a query.",
             "x-displayName": "Inference"
         },
         {
diff --git a/docs/static/stainless-llama-stack-spec.yaml b/docs/static/stainless-llama-stack-spec.yaml
index 1705ecce0..a06c20df4 100644
--- a/docs/static/stainless-llama-stack-spec.yaml
+++ b/docs/static/stainless-llama-stack-spec.yaml
@@ -13586,8 +13586,8 @@ tags:
       - Embedding models: these models generate embeddings to be used for semantic
       search.
 
-      - Rerank models (Experimental): these models reorder the documents based on
-      their relevance to a query.
+      - Rerank models: these models reorder the documents based on their relevance
+      to a query.
     x-displayName: Inference
   - name: Inspect
     description: >-
diff --git a/llama_stack/apis/inference/inference.py b/llama_stack/apis/inference/inference.py
index 6dc16305c..049482837 100644
--- a/llama_stack/apis/inference/inference.py
+++ b/llama_stack/apis/inference/inference.py
@@ -1237,7 +1237,7 @@ class Inference(InferenceProvider):
     This API provides the raw interface to the underlying models. Three kinds of models are supported:
     - LLM models: these models generate "raw" and "chat" (conversational) completions.
     - Embedding models: these models generate embeddings to be used for semantic search.
-    - Rerank models (Experimental): these models reorder the documents based on their relevance to a query.
+    - Rerank models: these models reorder the documents based on their relevance to a query.
     """
 
     @webmethod(route="/openai/v1/chat/completions", method="GET", level=LLAMA_STACK_API_V1, deprecated=True)
diff --git a/llama_stack/providers/utils/inference/openai_mixin.py b/llama_stack/providers/utils/inference/openai_mixin.py
index adbe4dcb0..e207b1a43 100644
--- a/llama_stack/providers/utils/inference/openai_mixin.py
+++ b/llama_stack/providers/utils/inference/openai_mixin.py
@@ -48,6 +48,7 @@ class OpenAIMixin(NeedsRequestProviderData, ABC, BaseModel):
     - overwrite_completion_id: If True, overwrites the 'id' field in OpenAI responses
     - download_images: If True, downloads images and converts to base64 for providers that require it
     - embedding_model_metadata: A dictionary mapping model IDs to their embedding metadata
+    - rerank_model_list: A list of model IDs for rerank models
     - provider_data_api_key_field: Optional field name in provider data to look for API key
     - list_provider_model_ids: Method to list available models from the provider
     - get_extra_client_params: Method to provide extra parameters to the AsyncOpenAI client