diff --git a/docs/_static/llama-stack-spec.html b/docs/_static/llama-stack-spec.html index 0549dda21..38ebb71f6 100644 --- a/docs/_static/llama-stack-spec.html +++ b/docs/_static/llama-stack-spec.html @@ -4605,6 +4605,49 @@ } } }, + "/v1/inference/rerank": { + "post": { + "responses": { + "200": { + "description": "RerankResponse with indices sorted by relevance score (descending).", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/RerankResponse" + } + } + } + }, + "400": { + "$ref": "#/components/responses/BadRequest400" + }, + "429": { + "$ref": "#/components/responses/TooManyRequests429" + }, + "500": { + "$ref": "#/components/responses/InternalServerError500" + }, + "default": { + "$ref": "#/components/responses/DefaultError" + } + }, + "tags": [ + "Inference" + ], + "description": "Rerank a list of documents based on their relevance to a query.", + "parameters": [], + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/RerankRequest" + } + } + }, + "required": true + } + } + }, "/v1/agents/{agent_id}/session/{session_id}/turn/{turn_id}/resume": { "post": { "responses": { @@ -16585,6 +16628,95 @@ ], "title": "RegisterVectorDbRequest" }, + "RerankRequest": { + "type": "object", + "properties": { + "model": { + "type": "string", + "description": "The identifier of the reranking model to use." + }, + "query": { + "oneOf": [ + { + "type": "string" + }, + { + "$ref": "#/components/schemas/OpenAIChatCompletionContentPartTextParam" + }, + { + "$ref": "#/components/schemas/OpenAIChatCompletionContentPartImageParam" + } + ], + "description": "The search query to rank items against. Can be a string, text content part, or image content part. The input must not exceed the model's max input token length." + }, + "items": { + "type": "array", + "items": { + "oneOf": [ + { + "type": "string" + }, + { + "$ref": "#/components/schemas/OpenAIChatCompletionContentPartTextParam" + }, + { + "$ref": "#/components/schemas/OpenAIChatCompletionContentPartImageParam" + } + ] + }, + "description": "List of items to rerank. Each item can be a string, text content part, or image content part. Each input must not exceed the model's max input token length." + }, + "max_num_results": { + "type": "integer", + "description": "(Optional) Maximum number of results to return. Default: returns all." + } + }, + "additionalProperties": false, + "required": [ + "model", + "query", + "items" + ], + "title": "RerankRequest" + }, + "RerankData": { + "type": "object", + "properties": { + "index": { + "type": "integer", + "description": "The original index of the document in the input list" + }, + "relevance_score": { + "type": "number", + "description": "The relevance score from the model output. Values are inverted when applicable so that higher scores indicate greater relevance." + } + }, + "additionalProperties": false, + "required": [ + "index", + "relevance_score" + ], + "title": "RerankData", + "description": "A single rerank result from a reranking response." + }, + "RerankResponse": { + "type": "object", + "properties": { + "data": { + "type": "array", + "items": { + "$ref": "#/components/schemas/RerankData" + }, + "description": "List of rerank result objects, sorted by relevance score (descending)" + } + }, + "additionalProperties": false, + "required": [ + "data" + ], + "title": "RerankResponse", + "description": "Response from a reranking request." + }, "ResumeAgentTurnRequest": { "type": "object", "properties": { diff --git a/docs/_static/llama-stack-spec.yaml b/docs/_static/llama-stack-spec.yaml index aa47cd58d..bf4bd2143 100644 --- a/docs/_static/llama-stack-spec.yaml +++ b/docs/_static/llama-stack-spec.yaml @@ -3264,6 +3264,37 @@ paths: schema: $ref: '#/components/schemas/QueryTracesRequest' required: true + /v1/inference/rerank: + post: + responses: + '200': + description: >- + RerankResponse with indices sorted by relevance score (descending). + content: + application/json: + schema: + $ref: '#/components/schemas/RerankResponse' + '400': + $ref: '#/components/responses/BadRequest400' + '429': + $ref: >- + #/components/responses/TooManyRequests429 + '500': + $ref: >- + #/components/responses/InternalServerError500 + default: + $ref: '#/components/responses/DefaultError' + tags: + - Inference + description: >- + Rerank a list of documents based on their relevance to a query. + parameters: [] + requestBody: + content: + application/json: + schema: + $ref: '#/components/schemas/RerankRequest' + required: true /v1/agents/{agent_id}/session/{session_id}/turn/{turn_id}/resume: post: responses: @@ -12335,6 +12366,76 @@ components: - vector_db_id - embedding_model title: RegisterVectorDbRequest + RerankRequest: + type: object + properties: + model: + type: string + description: >- + The identifier of the reranking model to use. + query: + oneOf: + - type: string + - $ref: '#/components/schemas/OpenAIChatCompletionContentPartTextParam' + - $ref: '#/components/schemas/OpenAIChatCompletionContentPartImageParam' + description: >- + The search query to rank items against. Can be a string, text content + part, or image content part. The input must not exceed the model's max + input token length. + items: + type: array + items: + oneOf: + - type: string + - $ref: '#/components/schemas/OpenAIChatCompletionContentPartTextParam' + - $ref: '#/components/schemas/OpenAIChatCompletionContentPartImageParam' + description: >- + List of items to rerank. Each item can be a string, text content part, + or image content part. Each input must not exceed the model's max input + token length. + max_num_results: + type: integer + description: >- + (Optional) Maximum number of results to return. Default: returns all. + additionalProperties: false + required: + - model + - query + - items + title: RerankRequest + RerankData: + type: object + properties: + index: + type: integer + description: >- + The original index of the document in the input list + relevance_score: + type: number + description: >- + The relevance score from the model output. Values are inverted when applicable + so that higher scores indicate greater relevance. + additionalProperties: false + required: + - index + - relevance_score + title: RerankData + description: >- + A single rerank result from a reranking response. + RerankResponse: + type: object + properties: + data: + type: array + items: + $ref: '#/components/schemas/RerankData' + description: >- + List of rerank result objects, sorted by relevance score (descending) + additionalProperties: false + required: + - data + title: RerankResponse + description: Response from a reranking request. ResumeAgentTurnRequest: type: object properties: diff --git a/llama_stack/apis/inference/inference.py b/llama_stack/apis/inference/inference.py index 7e7bd0a3d..19630bfb8 100644 --- a/llama_stack/apis/inference/inference.py +++ b/llama_stack/apis/inference/inference.py @@ -473,6 +473,28 @@ class EmbeddingsResponse(BaseModel): embeddings: list[list[float]] +@json_schema_type +class RerankData(BaseModel): + """A single rerank result from a reranking response. + + :param index: The original index of the document in the input list + :param relevance_score: The relevance score from the model output. Values are inverted when applicable so that higher scores indicate greater relevance. + """ + + index: int + relevance_score: float + + +@json_schema_type +class RerankResponse(BaseModel): + """Response from a reranking request. + + :param data: List of rerank result objects, sorted by relevance score (descending) + """ + + data: list[RerankData] + + @json_schema_type class OpenAIChatCompletionContentPartTextParam(BaseModel): """Text content part for OpenAI-compatible chat completion messages. @@ -1131,6 +1153,24 @@ class InferenceProvider(Protocol): """ ... + @webmethod(route="/inference/rerank", method="POST", experimental=True) + async def rerank( + self, + model: str, + query: str | OpenAIChatCompletionContentPartTextParam | OpenAIChatCompletionContentPartImageParam, + items: list[str | OpenAIChatCompletionContentPartTextParam | OpenAIChatCompletionContentPartImageParam], + max_num_results: int | None = None, + ) -> RerankResponse: + """Rerank a list of documents based on their relevance to a query. + + :param model: The identifier of the reranking model to use. + :param query: The search query to rank items against. Can be a string, text content part, or image content part. The input must not exceed the model's max input token length. + :param items: List of items to rerank. Each item can be a string, text content part, or image content part. Each input must not exceed the model's max input token length. + :param max_num_results: (Optional) Maximum number of results to return. Default: returns all. + :returns: RerankResponse with indices sorted by relevance score (descending). + """ + raise NotImplementedError("Reranking is not implemented") + @webmethod(route="/openai/v1/completions", method="POST") async def openai_completion( self, diff --git a/llama_stack/providers/inline/inference/meta_reference/inference.py b/llama_stack/providers/inline/inference/meta_reference/inference.py index 88d7a98ec..904a343d5 100644 --- a/llama_stack/providers/inline/inference/meta_reference/inference.py +++ b/llama_stack/providers/inline/inference/meta_reference/inference.py @@ -33,6 +33,9 @@ from llama_stack.apis.inference import ( InterleavedContent, LogProbConfig, Message, + OpenAIChatCompletionContentPartImageParam, + OpenAIChatCompletionContentPartTextParam, + RerankResponse, ResponseFormat, SamplingParams, StopReason, @@ -442,6 +445,15 @@ class MetaReferenceInferenceImpl( results = await self._nonstream_chat_completion(request_batch) return BatchChatCompletionResponse(batch=results) + async def rerank( + self, + model: str, + query: str | OpenAIChatCompletionContentPartTextParam | OpenAIChatCompletionContentPartImageParam, + items: list[str | OpenAIChatCompletionContentPartTextParam | OpenAIChatCompletionContentPartImageParam], + max_num_results: int | None = None, + ) -> RerankResponse: + raise NotImplementedError("Reranking is not supported for Meta Reference") + async def _nonstream_chat_completion( self, request_batch: list[ChatCompletionRequest] ) -> list[ChatCompletionResponse]: diff --git a/llama_stack/providers/inline/inference/sentence_transformers/sentence_transformers.py b/llama_stack/providers/inline/inference/sentence_transformers/sentence_transformers.py index fea8a8189..199ac2c61 100644 --- a/llama_stack/providers/inline/inference/sentence_transformers/sentence_transformers.py +++ b/llama_stack/providers/inline/inference/sentence_transformers/sentence_transformers.py @@ -13,6 +13,9 @@ from llama_stack.apis.inference import ( InterleavedContent, LogProbConfig, Message, + OpenAIChatCompletionContentPartImageParam, + OpenAIChatCompletionContentPartTextParam, + RerankResponse, ResponseFormat, SamplingParams, ToolChoice, @@ -122,3 +125,12 @@ class SentenceTransformersInferenceImpl( logprobs: LogProbConfig | None = None, ): raise NotImplementedError("Batch chat completion is not supported for Sentence Transformers") + + async def rerank( + self, + model: str, + query: str | OpenAIChatCompletionContentPartTextParam | OpenAIChatCompletionContentPartImageParam, + items: list[str | OpenAIChatCompletionContentPartTextParam | OpenAIChatCompletionContentPartImageParam], + max_num_results: int | None = None, + ) -> RerankResponse: + raise NotImplementedError("Reranking is not supported for Sentence Transformers") diff --git a/llama_stack/providers/remote/inference/llama_openai_compat/llama.py b/llama_stack/providers/remote/inference/llama_openai_compat/llama.py index 4857c6723..5aacb5a59 100644 --- a/llama_stack/providers/remote/inference/llama_openai_compat/llama.py +++ b/llama_stack/providers/remote/inference/llama_openai_compat/llama.py @@ -5,6 +5,11 @@ # the root directory of this source tree. import logging +from llama_stack.apis.inference import ( + OpenAIChatCompletionContentPartImageParam, + OpenAIChatCompletionContentPartTextParam, + RerankResponse, +) from llama_stack.providers.remote.inference.llama_openai_compat.config import LlamaCompatConfig from llama_stack.providers.utils.inference.litellm_openai_mixin import LiteLLMOpenAIMixin from llama_stack.providers.utils.inference.openai_mixin import OpenAIMixin @@ -55,3 +60,12 @@ class LlamaCompatInferenceAdapter(OpenAIMixin, LiteLLMOpenAIMixin): async def shutdown(self): await super().shutdown() + + async def rerank( + self, + model: str, + query: str | OpenAIChatCompletionContentPartTextParam | OpenAIChatCompletionContentPartImageParam, + items: list[str | OpenAIChatCompletionContentPartTextParam | OpenAIChatCompletionContentPartImageParam], + max_num_results: int | None = None, + ) -> RerankResponse: + raise NotImplementedError("Reranking is not supported for Llama OpenAI Compat") diff --git a/llama_stack/providers/remote/inference/ollama/ollama.py b/llama_stack/providers/remote/inference/ollama/ollama.py index a93421536..08226695e 100644 --- a/llama_stack/providers/remote/inference/ollama/ollama.py +++ b/llama_stack/providers/remote/inference/ollama/ollama.py @@ -37,11 +37,14 @@ from llama_stack.apis.inference import ( Message, OpenAIChatCompletion, OpenAIChatCompletionChunk, + OpenAIChatCompletionContentPartImageParam, + OpenAIChatCompletionContentPartTextParam, OpenAICompletion, OpenAIEmbeddingsResponse, OpenAIEmbeddingUsage, OpenAIMessageParam, OpenAIResponseFormatParam, + RerankResponse, ResponseFormat, SamplingParams, TextTruncation, @@ -641,6 +644,15 @@ class OllamaInferenceAdapter( ): raise NotImplementedError("Batch chat completion is not supported for Ollama") + async def rerank( + self, + model: str, + query: str | OpenAIChatCompletionContentPartTextParam | OpenAIChatCompletionContentPartImageParam, + items: list[str | OpenAIChatCompletionContentPartTextParam | OpenAIChatCompletionContentPartImageParam], + max_num_results: int | None = None, + ) -> RerankResponse: + raise NotImplementedError("Reranking is not supported for Ollama") + async def convert_message_to_openai_dict_for_ollama(message: Message) -> list[dict]: async def _convert_content(content) -> dict: diff --git a/llama_stack/providers/remote/inference/vllm/vllm.py b/llama_stack/providers/remote/inference/vllm/vllm.py index ac626874c..5f04c83ed 100644 --- a/llama_stack/providers/remote/inference/vllm/vllm.py +++ b/llama_stack/providers/remote/inference/vllm/vllm.py @@ -39,12 +39,15 @@ from llama_stack.apis.inference import ( Message, ModelStore, OpenAIChatCompletion, + OpenAIChatCompletionContentPartImageParam, + OpenAIChatCompletionContentPartTextParam, OpenAICompletion, OpenAIEmbeddingData, OpenAIEmbeddingsResponse, OpenAIEmbeddingUsage, OpenAIMessageParam, OpenAIResponseFormatParam, + RerankResponse, ResponseFormat, SamplingParams, TextTruncation, @@ -732,4 +735,13 @@ class VLLMInferenceAdapter(Inference, ModelsProtocolPrivate): response_format: ResponseFormat | None = None, logprobs: LogProbConfig | None = None, ): - raise NotImplementedError("Batch chat completion is not supported for Ollama") + raise NotImplementedError("Batch chat completion is not supported for vLLM") + + async def rerank( + self, + model: str, + query: str | OpenAIChatCompletionContentPartTextParam | OpenAIChatCompletionContentPartImageParam, + items: list[str | OpenAIChatCompletionContentPartTextParam | OpenAIChatCompletionContentPartImageParam], + max_num_results: int | None = None, + ) -> RerankResponse: + raise NotImplementedError("Reranking is not supported for vLLM")