Mirror of https://github.com/meta-llama/llama-stack.git, synced 2025-10-04 04:04:14 +00:00
feat(api): introduce /rerank (#2940)
Some checks failed
Integration Tests (Replay) / Integration Tests (, , , client=, vision=) (push) Failing after 1s
Test External Providers Installed via Module / test-external-providers-from-module (venv) (push) Has been skipped
Vector IO Integration Tests / test-matrix (push) Failing after 6s
Pre-commit / pre-commit (push) Failing after 7s
Test Llama Stack Build / build-single-provider (push) Failing after 6s
Python Package Build Test / build (3.13) (push) Failing after 8s
Test Llama Stack Build / build-ubi9-container-distribution (push) Failing after 9s
Python Package Build Test / build (3.12) (push) Failing after 9s
Unit Tests / unit-tests (3.12) (push) Failing after 8s
Test External API and Providers / test-external (venv) (push) Failing after 10s
Update ReadTheDocs / update-readthedocs (push) Failing after 11s
Test Llama Stack Build / build-custom-container-distribution (push) Failing after 14s
Unit Tests / unit-tests (3.13) (push) Failing after 12s
Integration Auth Tests / test-matrix (oauth2_token) (push) Failing after 19s
SqlStore Integration Tests / test-postgres (3.12) (push) Failing after 19s
SqlStore Integration Tests / test-postgres (3.13) (push) Failing after 21s
Test Llama Stack Build / generate-matrix (push) Failing after 21s
Test Llama Stack Build / build (push) Has been skipped
UI Tests / ui-tests (22) (push) Failing after 21s
# What does this PR do?

Context: https://github.com/meta-llama/llama-stack/issues/2937

The API design is inspired by existing offerings but is not identical:

* `top_n` as the parameter controlling the number of results, instead of `top_k`, since `n` is the conventional suffix for a result count.
* A `truncation` bool instead of `max_token_per_doc`, since truncation should be handled automatically based on model capability rather than requiring the user to set the context length manually.
* A `data` field in the response, for consistency with other OpenAI APIs (even though they do not offer a rerank API). It is also one less name to learn.

## Test Plan

Co-authored-by: Ashwin Bharambe <ashwin.bharambe@gmail.com>
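For illustration, here is a rough sketch of what a rerank request and response could look like, written as plain Python dicts. The `model`, `query`, `items`, and `max_num_results` names mirror the adapter signature added in this PR, and the `data` field comes from the description above; the per-item fields (`index`, `relevance_score`) and all values are assumptions for illustration only, not the authoritative schema.

```python
# Illustrative only: plausible shapes for a /rerank request and response.
# `model`, `query`, `items`, and `max_num_results` mirror the adapter signature
# added in this PR; the per-item `index`/`relevance_score` fields are assumptions.
rerank_request = {
    "model": "example/reranker",  # placeholder model identifier
    "query": "What is the capital of France?",
    "items": [
        "Paris is the capital of France.",
        "Berlin is the capital of Germany.",
        "The Eiffel Tower is in Paris.",
    ],
    "max_num_results": 2,  # cap on how many scored items come back
}

# Results are returned under a `data` field, mirroring other OpenAI-style APIs.
rerank_response = {
    "data": [
        {"index": 0, "relevance_score": 0.97},
        {"index": 2, "relevance_score": 0.41},
    ],
}
```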
Parent: d78ac434bd
Commit: c5e2e269e2
8 changed files with 336 additions and 1 deletion
Excerpt from the Ollama inference adapter (one of the 8 changed files):

@@ -37,11 +37,14 @@ from llama_stack.apis.inference import (
     Message,
     OpenAIChatCompletion,
     OpenAIChatCompletionChunk,
+    OpenAIChatCompletionContentPartImageParam,
+    OpenAIChatCompletionContentPartTextParam,
     OpenAICompletion,
     OpenAIEmbeddingsResponse,
     OpenAIEmbeddingUsage,
     OpenAIMessageParam,
     OpenAIResponseFormatParam,
+    RerankResponse,
     ResponseFormat,
     SamplingParams,
     TextTruncation,

@@ -641,6 +644,15 @@ class OllamaInferenceAdapter(
     ):
         raise NotImplementedError("Batch chat completion is not supported for Ollama")

+    async def rerank(
+        self,
+        model: str,
+        query: str | OpenAIChatCompletionContentPartTextParam | OpenAIChatCompletionContentPartImageParam,
+        items: list[str | OpenAIChatCompletionContentPartTextParam | OpenAIChatCompletionContentPartImageParam],
+        max_num_results: int | None = None,
+    ) -> RerankResponse:
+        raise NotImplementedError("Reranking is not supported for Ollama")
+

 async def convert_message_to_openai_dict_for_ollama(message: Message) -> list[dict]:
     async def _convert_content(content) -> dict:
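The Ollama adapter stubs `rerank()` with `NotImplementedError`. For context, the following standalone sketch shows roughly what a provider that does support reranking might do with this signature. The `RerankResult`/`RerankResponse` dataclasses below are local stand-ins (the real types live in `llama_stack.apis.inference` and their exact fields are not shown in this excerpt), and the overlap-based scoring is purely illustrative, not how any real provider in this PR scores items.

```python
# Standalone sketch of provider-side rerank logic, mirroring the method signature
# added in the diff above. RerankResult/RerankResponse are local stand-ins for the
# real llama_stack.apis.inference types; the scoring here is purely illustrative.
from dataclasses import dataclass


@dataclass
class RerankResult:
    index: int              # position of the item in the input list
    relevance_score: float  # higher means more relevant to the query


@dataclass
class RerankResponse:
    data: list[RerankResult]  # the PR description notes results live under `data`


def simple_rerank(
    query: str,
    items: list[str],
    max_num_results: int | None = None,
) -> RerankResponse:
    """Score items by naive token overlap with the query and return the top results."""
    query_tokens = set(query.lower().split())
    scored = []
    for index, item in enumerate(items):
        item_tokens = set(item.lower().split())
        overlap = len(query_tokens & item_tokens)
        score = overlap / max(len(query_tokens), 1)
        scored.append(RerankResult(index=index, relevance_score=score))
    scored.sort(key=lambda r: r.relevance_score, reverse=True)
    if max_num_results is not None:
        scored = scored[:max_num_results]
    return RerankResponse(data=scored)


if __name__ == "__main__":
    response = simple_rerank(
        query="capital of France",
        items=[
            "Paris is the capital of France.",
            "Berlin is the capital of Germany.",
            "The Eiffel Tower is in Paris.",
        ],
        max_num_results=2,
    )
    for result in response.data:
        print(result.index, round(result.relevance_score, 2))
```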