feat(api): introduce /rerank

# What does this PR do?

Introduces a `rerank` method on the inference API and stubs it out in three remote inference adapters (Llama OpenAI Compat, Ollama, and vLLM), each raising `NotImplementedError` until a concrete implementation lands. Along the way it corrects a copy-pasted error message in the vLLM adapter's batch chat completion stub, which previously said "Ollama".
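For illustration, a minimal client-side sketch of exercising the new route. The URL path `/v1/inference/rerank`, the port, and the response shape are assumptions on my part; only the request fields (`model`, `query`, `items`, `max_num_results`) come from the adapter signature introduced here:

```python
import httpx

# Hypothetical call against the new /rerank route. The mount point and the
# response schema are assumptions; the request fields mirror the rerank()
# signature added in this PR.
resp = httpx.post(
    "http://localhost:8321/v1/inference/rerank",  # assumed path and default port
    json={
        "model": "rerank-model",  # placeholder model id
        "query": "What is the capital of France?",
        "items": ["Berlin is in Germany.", "Paris is the capital of France."],
        "max_num_results": 1,
    },
)
resp.raise_for_status()
print(resp.json())  # expected to rank the Paris passage first
```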
## Test Plan
Commit 306625025a (parent e69acbafbf) by Eric Huang, 2025-08-14 16:14:38 -07:00.
8 changed files with 336 additions and 1 deletion.

### llama_openai_compat adapter

```diff
@@ -5,6 +5,11 @@
 # the root directory of this source tree.
 import logging
+from llama_stack.apis.inference import (
+    OpenAIChatCompletionContentPartImageParam,
+    OpenAIChatCompletionContentPartTextParam,
+    RerankResponse,
+)
 from llama_stack.providers.remote.inference.llama_openai_compat.config import LlamaCompatConfig
 from llama_stack.providers.utils.inference.litellm_openai_mixin import LiteLLMOpenAIMixin
 from llama_stack.providers.utils.inference.openai_mixin import OpenAIMixin
@@ -55,3 +60,12 @@ class LlamaCompatInferenceAdapter(OpenAIMixin, LiteLLMOpenAIMixin):
     async def shutdown(self):
         await super().shutdown()
+
+    async def rerank(
+        self,
+        model: str,
+        query: str | OpenAIChatCompletionContentPartTextParam | OpenAIChatCompletionContentPartImageParam,
+        items: list[str | OpenAIChatCompletionContentPartTextParam | OpenAIChatCompletionContentPartImageParam],
+        max_num_results: int | None = None,
+    ) -> RerankResponse:
+        raise NotImplementedError("Reranking is not supported for Llama OpenAI Compat")
```
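For context on what a non-stub override of this signature could look like, here is a minimal sketch assuming a hypothetical remote reranking backend. The backend URL, payload keys, and the `RerankResponse(data=...)` shape are assumptions, not taken from this diff:

```python
import httpx

from llama_stack.apis.inference import RerankResponse

# Hypothetical backend endpoint; everything below except the parameter names
# (model, query, items, max_num_results) is an assumption for illustration.
RERANK_URL = "https://example.com/v1/rerank"

async def rerank_via_remote(
    model: str,
    query: str,
    items: list[str],
    max_num_results: int | None = None,
) -> RerankResponse:
    # Forward the request to the assumed backend and wrap its answer.
    async with httpx.AsyncClient() as client:
        resp = await client.post(
            RERANK_URL,
            json={
                "model": model,
                "query": query,
                "documents": items,
                "top_n": max_num_results,
            },
        )
        resp.raise_for_status()
    # Assumed response shape: {"results": [{"index": ..., "relevance_score": ...}]}.
    # The RerankResponse field name here is a guess; check llama_stack.apis.inference.
    return RerankResponse(data=resp.json()["results"])
```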
### ollama adapter

```diff
@@ -37,11 +37,14 @@ from llama_stack.apis.inference import (
     Message,
     OpenAIChatCompletion,
     OpenAIChatCompletionChunk,
+    OpenAIChatCompletionContentPartImageParam,
+    OpenAIChatCompletionContentPartTextParam,
     OpenAICompletion,
     OpenAIEmbeddingsResponse,
     OpenAIEmbeddingUsage,
     OpenAIMessageParam,
     OpenAIResponseFormatParam,
+    RerankResponse,
     ResponseFormat,
     SamplingParams,
     TextTruncation,
@@ -641,6 +644,15 @@ class OllamaInferenceAdapter(
     ):
         raise NotImplementedError("Batch chat completion is not supported for Ollama")
+
+    async def rerank(
+        self,
+        model: str,
+        query: str | OpenAIChatCompletionContentPartTextParam | OpenAIChatCompletionContentPartImageParam,
+        items: list[str | OpenAIChatCompletionContentPartTextParam | OpenAIChatCompletionContentPartImageParam],
+        max_num_results: int | None = None,
+    ) -> RerankResponse:
+        raise NotImplementedError("Reranking is not supported for Ollama")
 
 async def convert_message_to_openai_dict_for_ollama(message: Message) -> list[dict]:
     async def _convert_content(content) -> dict:
```
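Since all three adapters raise `NotImplementedError` for now, in-process callers that want to degrade gracefully can probe for support. The fallback policy below is my own illustration, with only the `rerank` signature taken from this PR:

```python
# Try the adapter's rerank; return None so the caller can keep the items
# in their original order when the provider has no reranking support yet.
async def rerank_or_none(adapter, model, query, items, max_num_results=None):
    try:
        return await adapter.rerank(
            model=model, query=query, items=items, max_num_results=max_num_results
        )
    except NotImplementedError:
        return None  # provider stub; caller falls back to original ordering
```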
### vllm adapter

```diff
@@ -39,12 +39,15 @@ from llama_stack.apis.inference import (
     Message,
     ModelStore,
     OpenAIChatCompletion,
+    OpenAIChatCompletionContentPartImageParam,
+    OpenAIChatCompletionContentPartTextParam,
     OpenAICompletion,
     OpenAIEmbeddingData,
     OpenAIEmbeddingsResponse,
     OpenAIEmbeddingUsage,
     OpenAIMessageParam,
     OpenAIResponseFormatParam,
+    RerankResponse,
     ResponseFormat,
     SamplingParams,
     TextTruncation,
@@ -732,4 +735,13 @@ class VLLMInferenceAdapter(Inference, ModelsProtocolPrivate):
         response_format: ResponseFormat | None = None,
         logprobs: LogProbConfig | None = None,
     ):
-        raise NotImplementedError("Batch chat completion is not supported for Ollama")
+        raise NotImplementedError("Batch chat completion is not supported for vLLM")
+
+    async def rerank(
+        self,
+        model: str,
+        query: str | OpenAIChatCompletionContentPartTextParam | OpenAIChatCompletionContentPartImageParam,
+        items: list[str | OpenAIChatCompletionContentPartTextParam | OpenAIChatCompletionContentPartImageParam],
+        max_num_results: int | None = None,
+    ) -> RerankResponse:
+        raise NotImplementedError("Reranking is not supported for vLLM")
```
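Note that the `query`/`items` unions accept OpenAI-style content parts as well as plain strings. A sketch of constructing them follows, where the constructor fields (`type`, `text`, `image_url`) are assumed to mirror the OpenAI chat content-part schema rather than confirmed by this diff:

```python
from llama_stack.apis.inference import (
    OpenAIChatCompletionContentPartImageParam,
    OpenAIChatCompletionContentPartTextParam,
)

# Assumed constructor fields, mirroring the OpenAI content-part schema.
text_query = OpenAIChatCompletionContentPartTextParam(
    type="text",
    text="Which product photo matches this description?",
)
image_item = OpenAIChatCompletionContentPartImageParam(
    type="image_url",
    # Assuming pydantic coerces the dict into the nested image-URL model.
    image_url={"url": "https://example.com/photo.png"},
)

# Either form is accepted by rerank(); plain strings remain the simple case.
items = ["a red bicycle", image_item]
```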