feat(api): introduce /rerank (#2940)
Some checks failed
Integration Tests (Replay) / Integration Tests (, , , client=, vision=) (push) Failing after 1s
Test External Providers Installed via Module / test-external-providers-from-module (venv) (push) Has been skipped
Vector IO Integration Tests / test-matrix (push) Failing after 6s
Pre-commit / pre-commit (push) Failing after 7s
Test Llama Stack Build / build-single-provider (push) Failing after 6s
Python Package Build Test / build (3.13) (push) Failing after 8s
Test Llama Stack Build / build-ubi9-container-distribution (push) Failing after 9s
Python Package Build Test / build (3.12) (push) Failing after 9s
Unit Tests / unit-tests (3.12) (push) Failing after 8s
Test External API and Providers / test-external (venv) (push) Failing after 10s
Update ReadTheDocs / update-readthedocs (push) Failing after 11s
Test Llama Stack Build / build-custom-container-distribution (push) Failing after 14s
Unit Tests / unit-tests (3.13) (push) Failing after 12s
Integration Auth Tests / test-matrix (oauth2_token) (push) Failing after 19s
SqlStore Integration Tests / test-postgres (3.12) (push) Failing after 19s
SqlStore Integration Tests / test-postgres (3.13) (push) Failing after 21s
Test Llama Stack Build / generate-matrix (push) Failing after 21s
Test Llama Stack Build / build (push) Has been skipped
UI Tests / ui-tests (22) (push) Failing after 21s

# What does this PR do?
Context: https://github.com/meta-llama/llama-stack/issues/2937

The API design is inspired by existing offerings, but not exactly the
same:
* `top_n` as the parameter to control number of results, instead of
`top_k`, since `n` is conventional to control number
* `truncation` bool instead of `max_token_per_doc`, since we should just
handle the truncation automatically depending on model capability,
instead of user setting the context length manually.
* `data` field in the response, to be consistent with other OpenAI APIs
(though they don't have a rerank API). Also, it is one less name to
learn in the API.

## Test Plan

Co-authored-by: Ashwin Bharambe <ashwin.bharambe@gmail.com>
This commit is contained in:
ehhuang 2025-08-21 18:23:16 -07:00 committed by GitHub
parent d78ac434bd
commit c5e2e269e2
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
8 changed files with 336 additions and 1 deletions

View file

@ -473,6 +473,28 @@ class EmbeddingsResponse(BaseModel):
embeddings: list[list[float]]
@json_schema_type
class RerankData(BaseModel):
"""A single rerank result from a reranking response.
:param index: The original index of the document in the input list
:param relevance_score: The relevance score from the model output. Values are inverted when applicable so that higher scores indicate greater relevance.
"""
index: int
relevance_score: float
@json_schema_type
class RerankResponse(BaseModel):
"""Response from a reranking request.
:param data: List of rerank result objects, sorted by relevance score (descending)
"""
data: list[RerankData]
@json_schema_type
class OpenAIChatCompletionContentPartTextParam(BaseModel):
"""Text content part for OpenAI-compatible chat completion messages.
@ -1131,6 +1153,24 @@ class InferenceProvider(Protocol):
"""
...
@webmethod(route="/inference/rerank", method="POST", experimental=True)
async def rerank(
self,
model: str,
query: str | OpenAIChatCompletionContentPartTextParam | OpenAIChatCompletionContentPartImageParam,
items: list[str | OpenAIChatCompletionContentPartTextParam | OpenAIChatCompletionContentPartImageParam],
max_num_results: int | None = None,
) -> RerankResponse:
"""Rerank a list of documents based on their relevance to a query.
:param model: The identifier of the reranking model to use.
:param query: The search query to rank items against. Can be a string, text content part, or image content part. The input must not exceed the model's max input token length.
:param items: List of items to rerank. Each item can be a string, text content part, or image content part. Each input must not exceed the model's max input token length.
:param max_num_results: (Optional) Maximum number of results to return. Default: returns all.
:returns: RerankResponse with indices sorted by relevance score (descending).
"""
raise NotImplementedError("Reranking is not implemented")
@webmethod(route="/openai/v1/completions", method="POST")
async def openai_completion(
self,

View file

@ -33,6 +33,9 @@ from llama_stack.apis.inference import (
InterleavedContent,
LogProbConfig,
Message,
OpenAIChatCompletionContentPartImageParam,
OpenAIChatCompletionContentPartTextParam,
RerankResponse,
ResponseFormat,
SamplingParams,
StopReason,
@ -442,6 +445,15 @@ class MetaReferenceInferenceImpl(
results = await self._nonstream_chat_completion(request_batch)
return BatchChatCompletionResponse(batch=results)
async def rerank(
self,
model: str,
query: str | OpenAIChatCompletionContentPartTextParam | OpenAIChatCompletionContentPartImageParam,
items: list[str | OpenAIChatCompletionContentPartTextParam | OpenAIChatCompletionContentPartImageParam],
max_num_results: int | None = None,
) -> RerankResponse:
raise NotImplementedError("Reranking is not supported for Meta Reference")
async def _nonstream_chat_completion(
self, request_batch: list[ChatCompletionRequest]
) -> list[ChatCompletionResponse]:

View file

@ -12,6 +12,9 @@ from llama_stack.apis.inference import (
InterleavedContent,
LogProbConfig,
Message,
OpenAIChatCompletionContentPartImageParam,
OpenAIChatCompletionContentPartTextParam,
RerankResponse,
ResponseFormat,
SamplingParams,
ToolChoice,
@ -122,3 +125,12 @@ class SentenceTransformersInferenceImpl(
logprobs: LogProbConfig | None = None,
):
raise NotImplementedError("Batch chat completion is not supported for Sentence Transformers")
async def rerank(
self,
model: str,
query: str | OpenAIChatCompletionContentPartTextParam | OpenAIChatCompletionContentPartImageParam,
items: list[str | OpenAIChatCompletionContentPartTextParam | OpenAIChatCompletionContentPartImageParam],
max_num_results: int | None = None,
) -> RerankResponse:
raise NotImplementedError("Reranking is not supported for Sentence Transformers")

View file

@ -3,6 +3,11 @@
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from llama_stack.apis.inference import (
OpenAIChatCompletionContentPartImageParam,
OpenAIChatCompletionContentPartTextParam,
RerankResponse,
)
from llama_stack.log import get_logger
from llama_stack.providers.remote.inference.llama_openai_compat.config import LlamaCompatConfig
from llama_stack.providers.utils.inference.litellm_openai_mixin import LiteLLMOpenAIMixin
@ -54,3 +59,12 @@ class LlamaCompatInferenceAdapter(OpenAIMixin, LiteLLMOpenAIMixin):
async def shutdown(self):
await super().shutdown()
async def rerank(
self,
model: str,
query: str | OpenAIChatCompletionContentPartTextParam | OpenAIChatCompletionContentPartImageParam,
items: list[str | OpenAIChatCompletionContentPartTextParam | OpenAIChatCompletionContentPartImageParam],
max_num_results: int | None = None,
) -> RerankResponse:
raise NotImplementedError("Reranking is not supported for Llama OpenAI Compat")

View file

@ -37,11 +37,14 @@ from llama_stack.apis.inference import (
Message,
OpenAIChatCompletion,
OpenAIChatCompletionChunk,
OpenAIChatCompletionContentPartImageParam,
OpenAIChatCompletionContentPartTextParam,
OpenAICompletion,
OpenAIEmbeddingsResponse,
OpenAIEmbeddingUsage,
OpenAIMessageParam,
OpenAIResponseFormatParam,
RerankResponse,
ResponseFormat,
SamplingParams,
TextTruncation,
@ -641,6 +644,15 @@ class OllamaInferenceAdapter(
):
raise NotImplementedError("Batch chat completion is not supported for Ollama")
async def rerank(
self,
model: str,
query: str | OpenAIChatCompletionContentPartTextParam | OpenAIChatCompletionContentPartImageParam,
items: list[str | OpenAIChatCompletionContentPartTextParam | OpenAIChatCompletionContentPartImageParam],
max_num_results: int | None = None,
) -> RerankResponse:
raise NotImplementedError("Reranking is not supported for Ollama")
async def convert_message_to_openai_dict_for_ollama(message: Message) -> list[dict]:
async def _convert_content(content) -> dict:

View file

@ -39,12 +39,15 @@ from llama_stack.apis.inference import (
Message,
ModelStore,
OpenAIChatCompletion,
OpenAIChatCompletionContentPartImageParam,
OpenAIChatCompletionContentPartTextParam,
OpenAICompletion,
OpenAIEmbeddingData,
OpenAIEmbeddingsResponse,
OpenAIEmbeddingUsage,
OpenAIMessageParam,
OpenAIResponseFormatParam,
RerankResponse,
ResponseFormat,
SamplingParams,
TextTruncation,
@ -732,4 +735,13 @@ class VLLMInferenceAdapter(Inference, ModelsProtocolPrivate):
response_format: ResponseFormat | None = None,
logprobs: LogProbConfig | None = None,
):
raise NotImplementedError("Batch chat completion is not supported for Ollama")
raise NotImplementedError("Batch chat completion is not supported for vLLM")
async def rerank(
self,
model: str,
query: str | OpenAIChatCompletionContentPartTextParam | OpenAIChatCompletionContentPartImageParam,
items: list[str | OpenAIChatCompletionContentPartTextParam | OpenAIChatCompletionContentPartImageParam],
max_num_results: int | None = None,
) -> RerankResponse:
raise NotImplementedError("Reranking is not supported for vLLM")