diff --git a/docs/_static/llama-stack-spec.html b/docs/_static/llama-stack-spec.html
index 0549dda21..38ebb71f6 100644
--- a/docs/_static/llama-stack-spec.html
+++ b/docs/_static/llama-stack-spec.html
@@ -4605,6 +4605,49 @@
}
}
},
+ "/v1/inference/rerank": {
+ "post": {
+ "responses": {
+ "200": {
+ "description": "RerankResponse with indices sorted by relevance score (descending).",
+ "content": {
+ "application/json": {
+ "schema": {
+ "$ref": "#/components/schemas/RerankResponse"
+ }
+ }
+ }
+ },
+ "400": {
+ "$ref": "#/components/responses/BadRequest400"
+ },
+ "429": {
+ "$ref": "#/components/responses/TooManyRequests429"
+ },
+ "500": {
+ "$ref": "#/components/responses/InternalServerError500"
+ },
+ "default": {
+ "$ref": "#/components/responses/DefaultError"
+ }
+ },
+ "tags": [
+ "Inference"
+ ],
+ "description": "Rerank a list of documents based on their relevance to a query.",
+ "parameters": [],
+ "requestBody": {
+ "content": {
+ "application/json": {
+ "schema": {
+ "$ref": "#/components/schemas/RerankRequest"
+ }
+ }
+ },
+ "required": true
+ }
+ }
+ },
"/v1/agents/{agent_id}/session/{session_id}/turn/{turn_id}/resume": {
"post": {
"responses": {
@@ -16585,6 +16628,95 @@
],
"title": "RegisterVectorDbRequest"
},
+ "RerankRequest": {
+ "type": "object",
+ "properties": {
+ "model": {
+ "type": "string",
+ "description": "The identifier of the reranking model to use."
+ },
+ "query": {
+ "oneOf": [
+ {
+ "type": "string"
+ },
+ {
+ "$ref": "#/components/schemas/OpenAIChatCompletionContentPartTextParam"
+ },
+ {
+ "$ref": "#/components/schemas/OpenAIChatCompletionContentPartImageParam"
+ }
+ ],
+ "description": "The search query to rank items against. Can be a string, text content part, or image content part. The input must not exceed the model's max input token length."
+ },
+ "items": {
+ "type": "array",
+ "items": {
+ "oneOf": [
+ {
+ "type": "string"
+ },
+ {
+ "$ref": "#/components/schemas/OpenAIChatCompletionContentPartTextParam"
+ },
+ {
+ "$ref": "#/components/schemas/OpenAIChatCompletionContentPartImageParam"
+ }
+ ]
+ },
+ "description": "List of items to rerank. Each item can be a string, text content part, or image content part. Each input must not exceed the model's max input token length."
+ },
+ "max_num_results": {
+ "type": "integer",
+ "description": "(Optional) Maximum number of results to return. Default: returns all."
+ }
+ },
+ "additionalProperties": false,
+ "required": [
+ "model",
+ "query",
+ "items"
+ ],
+ "title": "RerankRequest"
+ },
+ "RerankData": {
+ "type": "object",
+ "properties": {
+ "index": {
+ "type": "integer",
+ "description": "The original index of the document in the input list"
+ },
+ "relevance_score": {
+ "type": "number",
+ "description": "The relevance score from the model output. Values are inverted when applicable so that higher scores indicate greater relevance."
+ }
+ },
+ "additionalProperties": false,
+ "required": [
+ "index",
+ "relevance_score"
+ ],
+ "title": "RerankData",
+ "description": "A single rerank result from a reranking response."
+ },
+ "RerankResponse": {
+ "type": "object",
+ "properties": {
+ "data": {
+ "type": "array",
+ "items": {
+ "$ref": "#/components/schemas/RerankData"
+ },
+ "description": "List of rerank result objects, sorted by relevance score (descending)"
+ }
+ },
+ "additionalProperties": false,
+ "required": [
+ "data"
+ ],
+ "title": "RerankResponse",
+ "description": "Response from a reranking request."
+ },
"ResumeAgentTurnRequest": {
"type": "object",
"properties": {
diff --git a/docs/_static/llama-stack-spec.yaml b/docs/_static/llama-stack-spec.yaml
index aa47cd58d..bf4bd2143 100644
--- a/docs/_static/llama-stack-spec.yaml
+++ b/docs/_static/llama-stack-spec.yaml
@@ -3264,6 +3264,37 @@ paths:
schema:
$ref: '#/components/schemas/QueryTracesRequest'
required: true
+ /v1/inference/rerank:
+ post:
+ responses:
+ '200':
+ description: >-
+ RerankResponse with indices sorted by relevance score (descending).
+ content:
+ application/json:
+ schema:
+ $ref: '#/components/schemas/RerankResponse'
+ '400':
+ $ref: '#/components/responses/BadRequest400'
+ '429':
+ $ref: >-
+ #/components/responses/TooManyRequests429
+ '500':
+ $ref: >-
+ #/components/responses/InternalServerError500
+ default:
+ $ref: '#/components/responses/DefaultError'
+ tags:
+ - Inference
+ description: >-
+ Rerank a list of documents based on their relevance to a query.
+ parameters: []
+ requestBody:
+ content:
+ application/json:
+ schema:
+ $ref: '#/components/schemas/RerankRequest'
+ required: true
/v1/agents/{agent_id}/session/{session_id}/turn/{turn_id}/resume:
post:
responses:
@@ -12335,6 +12366,76 @@ components:
- vector_db_id
- embedding_model
title: RegisterVectorDbRequest
+ RerankRequest:
+ type: object
+ properties:
+ model:
+ type: string
+ description: >-
+ The identifier of the reranking model to use.
+ query:
+ oneOf:
+ - type: string
+ - $ref: '#/components/schemas/OpenAIChatCompletionContentPartTextParam'
+ - $ref: '#/components/schemas/OpenAIChatCompletionContentPartImageParam'
+ description: >-
+ The search query to rank items against. Can be a string, text content
+ part, or image content part. The input must not exceed the model's max
+ input token length.
+ items:
+ type: array
+ items:
+ oneOf:
+ - type: string
+ - $ref: '#/components/schemas/OpenAIChatCompletionContentPartTextParam'
+ - $ref: '#/components/schemas/OpenAIChatCompletionContentPartImageParam'
+ description: >-
+ List of items to rerank. Each item can be a string, text content part,
+ or image content part. Each input must not exceed the model's max input
+ token length.
+ max_num_results:
+ type: integer
+ description: >-
+ (Optional) Maximum number of results to return. Default: returns all.
+ additionalProperties: false
+ required:
+ - model
+ - query
+ - items
+ title: RerankRequest
+ RerankData:
+ type: object
+ properties:
+ index:
+ type: integer
+ description: >-
+          The original index of the item in the input list
+ relevance_score:
+ type: number
+ description: >-
+ The relevance score from the model output. Values are inverted when applicable
+ so that higher scores indicate greater relevance.
+ additionalProperties: false
+ required:
+ - index
+ - relevance_score
+ title: RerankData
+ description: >-
+ A single rerank result from a reranking response.
+ RerankResponse:
+ type: object
+ properties:
+ data:
+ type: array
+ items:
+ $ref: '#/components/schemas/RerankData'
+ description: >-
+ List of rerank result objects, sorted by relevance score (descending)
+ additionalProperties: false
+ required:
+ - data
+ title: RerankResponse
+ description: Response from a reranking request.
ResumeAgentTurnRequest:
type: object
properties:
diff --git a/llama_stack/apis/inference/inference.py b/llama_stack/apis/inference/inference.py
index 7e7bd0a3d..19630bfb8 100644
--- a/llama_stack/apis/inference/inference.py
+++ b/llama_stack/apis/inference/inference.py
@@ -473,6 +473,28 @@ class EmbeddingsResponse(BaseModel):
embeddings: list[list[float]]
+@json_schema_type
+class RerankData(BaseModel):
+ """A single rerank result from a reranking response.
+
+    :param index: The original index of the item in the input list
+ :param relevance_score: The relevance score from the model output. Values are inverted when applicable so that higher scores indicate greater relevance.
+ """
+
+ index: int
+ relevance_score: float
+
+
+@json_schema_type
+class RerankResponse(BaseModel):
+ """Response from a reranking request.
+
+ :param data: List of rerank result objects, sorted by relevance score (descending)
+ """
+
+ data: list[RerankData]
+
+
@json_schema_type
class OpenAIChatCompletionContentPartTextParam(BaseModel):
"""Text content part for OpenAI-compatible chat completion messages.
@@ -1131,6 +1153,24 @@ class InferenceProvider(Protocol):
"""
...
+ @webmethod(route="/inference/rerank", method="POST", experimental=True)
+ async def rerank(
+ self,
+ model: str,
+ query: str | OpenAIChatCompletionContentPartTextParam | OpenAIChatCompletionContentPartImageParam,
+ items: list[str | OpenAIChatCompletionContentPartTextParam | OpenAIChatCompletionContentPartImageParam],
+ max_num_results: int | None = None,
+ ) -> RerankResponse:
+ """Rerank a list of documents based on their relevance to a query.
+
+ :param model: The identifier of the reranking model to use.
+ :param query: The search query to rank items against. Can be a string, text content part, or image content part. The input must not exceed the model's max input token length.
+ :param items: List of items to rerank. Each item can be a string, text content part, or image content part. Each input must not exceed the model's max input token length.
+ :param max_num_results: (Optional) Maximum number of results to return. Default: returns all.
+ :returns: RerankResponse with indices sorted by relevance score (descending).
+ """
+ raise NotImplementedError("Reranking is not implemented")
+
@webmethod(route="/openai/v1/completions", method="POST")
async def openai_completion(
self,
diff --git a/llama_stack/providers/inline/inference/meta_reference/inference.py b/llama_stack/providers/inline/inference/meta_reference/inference.py
index 88d7a98ec..904a343d5 100644
--- a/llama_stack/providers/inline/inference/meta_reference/inference.py
+++ b/llama_stack/providers/inline/inference/meta_reference/inference.py
@@ -33,6 +33,9 @@ from llama_stack.apis.inference import (
InterleavedContent,
LogProbConfig,
Message,
+ OpenAIChatCompletionContentPartImageParam,
+ OpenAIChatCompletionContentPartTextParam,
+ RerankResponse,
ResponseFormat,
SamplingParams,
StopReason,
@@ -442,6 +445,15 @@ class MetaReferenceInferenceImpl(
results = await self._nonstream_chat_completion(request_batch)
return BatchChatCompletionResponse(batch=results)
+ async def rerank(
+ self,
+ model: str,
+ query: str | OpenAIChatCompletionContentPartTextParam | OpenAIChatCompletionContentPartImageParam,
+ items: list[str | OpenAIChatCompletionContentPartTextParam | OpenAIChatCompletionContentPartImageParam],
+ max_num_results: int | None = None,
+ ) -> RerankResponse:
+ raise NotImplementedError("Reranking is not supported for Meta Reference")
+
async def _nonstream_chat_completion(
self, request_batch: list[ChatCompletionRequest]
) -> list[ChatCompletionResponse]:
diff --git a/llama_stack/providers/inline/inference/sentence_transformers/sentence_transformers.py b/llama_stack/providers/inline/inference/sentence_transformers/sentence_transformers.py
index fea8a8189..199ac2c61 100644
--- a/llama_stack/providers/inline/inference/sentence_transformers/sentence_transformers.py
+++ b/llama_stack/providers/inline/inference/sentence_transformers/sentence_transformers.py
@@ -13,6 +13,9 @@ from llama_stack.apis.inference import (
InterleavedContent,
LogProbConfig,
Message,
+ OpenAIChatCompletionContentPartImageParam,
+ OpenAIChatCompletionContentPartTextParam,
+ RerankResponse,
ResponseFormat,
SamplingParams,
ToolChoice,
@@ -122,3 +125,12 @@ class SentenceTransformersInferenceImpl(
logprobs: LogProbConfig | None = None,
):
raise NotImplementedError("Batch chat completion is not supported for Sentence Transformers")
+
+ async def rerank(
+ self,
+ model: str,
+ query: str | OpenAIChatCompletionContentPartTextParam | OpenAIChatCompletionContentPartImageParam,
+ items: list[str | OpenAIChatCompletionContentPartTextParam | OpenAIChatCompletionContentPartImageParam],
+ max_num_results: int | None = None,
+ ) -> RerankResponse:
+ raise NotImplementedError("Reranking is not supported for Sentence Transformers")
diff --git a/llama_stack/providers/remote/inference/llama_openai_compat/llama.py b/llama_stack/providers/remote/inference/llama_openai_compat/llama.py
index 4857c6723..5aacb5a59 100644
--- a/llama_stack/providers/remote/inference/llama_openai_compat/llama.py
+++ b/llama_stack/providers/remote/inference/llama_openai_compat/llama.py
@@ -5,6 +5,11 @@
# the root directory of this source tree.
import logging
+from llama_stack.apis.inference import (
+ OpenAIChatCompletionContentPartImageParam,
+ OpenAIChatCompletionContentPartTextParam,
+ RerankResponse,
+)
from llama_stack.providers.remote.inference.llama_openai_compat.config import LlamaCompatConfig
from llama_stack.providers.utils.inference.litellm_openai_mixin import LiteLLMOpenAIMixin
from llama_stack.providers.utils.inference.openai_mixin import OpenAIMixin
@@ -55,3 +60,12 @@ class LlamaCompatInferenceAdapter(OpenAIMixin, LiteLLMOpenAIMixin):
async def shutdown(self):
await super().shutdown()
+
+ async def rerank(
+ self,
+ model: str,
+ query: str | OpenAIChatCompletionContentPartTextParam | OpenAIChatCompletionContentPartImageParam,
+ items: list[str | OpenAIChatCompletionContentPartTextParam | OpenAIChatCompletionContentPartImageParam],
+ max_num_results: int | None = None,
+ ) -> RerankResponse:
+ raise NotImplementedError("Reranking is not supported for Llama OpenAI Compat")
diff --git a/llama_stack/providers/remote/inference/ollama/ollama.py b/llama_stack/providers/remote/inference/ollama/ollama.py
index a93421536..08226695e 100644
--- a/llama_stack/providers/remote/inference/ollama/ollama.py
+++ b/llama_stack/providers/remote/inference/ollama/ollama.py
@@ -37,11 +37,14 @@ from llama_stack.apis.inference import (
Message,
OpenAIChatCompletion,
OpenAIChatCompletionChunk,
+ OpenAIChatCompletionContentPartImageParam,
+ OpenAIChatCompletionContentPartTextParam,
OpenAICompletion,
OpenAIEmbeddingsResponse,
OpenAIEmbeddingUsage,
OpenAIMessageParam,
OpenAIResponseFormatParam,
+ RerankResponse,
ResponseFormat,
SamplingParams,
TextTruncation,
@@ -641,6 +644,15 @@ class OllamaInferenceAdapter(
):
raise NotImplementedError("Batch chat completion is not supported for Ollama")
+ async def rerank(
+ self,
+ model: str,
+ query: str | OpenAIChatCompletionContentPartTextParam | OpenAIChatCompletionContentPartImageParam,
+ items: list[str | OpenAIChatCompletionContentPartTextParam | OpenAIChatCompletionContentPartImageParam],
+ max_num_results: int | None = None,
+ ) -> RerankResponse:
+ raise NotImplementedError("Reranking is not supported for Ollama")
+
async def convert_message_to_openai_dict_for_ollama(message: Message) -> list[dict]:
async def _convert_content(content) -> dict:
diff --git a/llama_stack/providers/remote/inference/vllm/vllm.py b/llama_stack/providers/remote/inference/vllm/vllm.py
index ac626874c..5f04c83ed 100644
--- a/llama_stack/providers/remote/inference/vllm/vllm.py
+++ b/llama_stack/providers/remote/inference/vllm/vllm.py
@@ -39,12 +39,15 @@ from llama_stack.apis.inference import (
Message,
ModelStore,
OpenAIChatCompletion,
+ OpenAIChatCompletionContentPartImageParam,
+ OpenAIChatCompletionContentPartTextParam,
OpenAICompletion,
OpenAIEmbeddingData,
OpenAIEmbeddingsResponse,
OpenAIEmbeddingUsage,
OpenAIMessageParam,
OpenAIResponseFormatParam,
+ RerankResponse,
ResponseFormat,
SamplingParams,
TextTruncation,
@@ -732,4 +735,13 @@ class VLLMInferenceAdapter(Inference, ModelsProtocolPrivate):
response_format: ResponseFormat | None = None,
logprobs: LogProbConfig | None = None,
):
- raise NotImplementedError("Batch chat completion is not supported for Ollama")
+ raise NotImplementedError("Batch chat completion is not supported for vLLM")
+
+ async def rerank(
+ self,
+ model: str,
+ query: str | OpenAIChatCompletionContentPartTextParam | OpenAIChatCompletionContentPartImageParam,
+ items: list[str | OpenAIChatCompletionContentPartTextParam | OpenAIChatCompletionContentPartImageParam],
+ max_num_results: int | None = None,
+ ) -> RerankResponse:
+ raise NotImplementedError("Reranking is not supported for vLLM")