diff --git a/llama_stack/apis/inference/inference.py b/llama_stack/apis/inference/inference.py
index 19630bfb8..570ed3d2b 100644
--- a/llama_stack/apis/inference/inference.py
+++ b/llama_stack/apis/inference/inference.py
@@ -1170,6 +1170,7 @@ class InferenceProvider(Protocol):
         :returns: RerankResponse with indices sorted by relevance score (descending).
         """
         raise NotImplementedError("Reranking is not implemented")
+        return  # this is so mypy's safe-super rule will consider the method concrete
 
     @webmethod(route="/openai/v1/completions", method="POST")
     async def openai_completion(
diff --git a/llama_stack/providers/inline/inference/meta_reference/inference.py b/llama_stack/providers/inline/inference/meta_reference/inference.py
index 904a343d5..88d7a98ec 100644
--- a/llama_stack/providers/inline/inference/meta_reference/inference.py
+++ b/llama_stack/providers/inline/inference/meta_reference/inference.py
@@ -33,9 +33,6 @@ from llama_stack.apis.inference import (
     InterleavedContent,
     LogProbConfig,
     Message,
-    OpenAIChatCompletionContentPartImageParam,
-    OpenAIChatCompletionContentPartTextParam,
-    RerankResponse,
     ResponseFormat,
     SamplingParams,
     StopReason,
@@ -445,15 +442,6 @@ class MetaReferenceInferenceImpl(
         results = await self._nonstream_chat_completion(request_batch)
         return BatchChatCompletionResponse(batch=results)
 
-    async def rerank(
-        self,
-        model: str,
-        query: str | OpenAIChatCompletionContentPartTextParam | OpenAIChatCompletionContentPartImageParam,
-        items: list[str | OpenAIChatCompletionContentPartTextParam | OpenAIChatCompletionContentPartImageParam],
-        max_num_results: int | None = None,
-    ) -> RerankResponse:
-        raise NotImplementedError("Reranking is not supported for Meta Reference")
-
     async def _nonstream_chat_completion(
         self, request_batch: list[ChatCompletionRequest]
     ) -> list[ChatCompletionResponse]:
diff --git a/llama_stack/providers/inline/inference/sentence_transformers/sentence_transformers.py b/llama_stack/providers/inline/inference/sentence_transformers/sentence_transformers.py
index 4b68cc926..600a5bd37 100644
--- a/llama_stack/providers/inline/inference/sentence_transformers/sentence_transformers.py
+++ b/llama_stack/providers/inline/inference/sentence_transformers/sentence_transformers.py
@@ -12,9 +12,6 @@ from llama_stack.apis.inference import (
     InterleavedContent,
     LogProbConfig,
     Message,
-    OpenAIChatCompletionContentPartImageParam,
-    OpenAIChatCompletionContentPartTextParam,
-    RerankResponse,
     ResponseFormat,
     SamplingParams,
     ToolChoice,
@@ -125,12 +122,3 @@ class SentenceTransformersInferenceImpl(
         logprobs: LogProbConfig | None = None,
     ):
         raise NotImplementedError("Batch chat completion is not supported for Sentence Transformers")
-
-    async def rerank(
-        self,
-        model: str,
-        query: str | OpenAIChatCompletionContentPartTextParam | OpenAIChatCompletionContentPartImageParam,
-        items: list[str | OpenAIChatCompletionContentPartTextParam | OpenAIChatCompletionContentPartImageParam],
-        max_num_results: int | None = None,
-    ) -> RerankResponse:
-        raise NotImplementedError("Reranking is not supported for Sentence Transformers")
diff --git a/llama_stack/providers/remote/inference/llama_openai_compat/llama.py b/llama_stack/providers/remote/inference/llama_openai_compat/llama.py
index 0edff882f..f2069b5e5 100644
--- a/llama_stack/providers/remote/inference/llama_openai_compat/llama.py
+++ b/llama_stack/providers/remote/inference/llama_openai_compat/llama.py
@@ -3,11 +3,6 @@
 #
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
-from llama_stack.apis.inference import (
-    OpenAIChatCompletionContentPartImageParam,
-    OpenAIChatCompletionContentPartTextParam,
-    RerankResponse,
-)
 from llama_stack.log import get_logger
 from llama_stack.providers.remote.inference.llama_openai_compat.config import LlamaCompatConfig
 from llama_stack.providers.utils.inference.litellm_openai_mixin import LiteLLMOpenAIMixin
@@ -59,12 +54,3 @@ class LlamaCompatInferenceAdapter(OpenAIMixin, LiteLLMOpenAIMixin):
 
     async def shutdown(self):
         await super().shutdown()
-
-    async def rerank(
-        self,
-        model: str,
-        query: str | OpenAIChatCompletionContentPartTextParam | OpenAIChatCompletionContentPartImageParam,
-        items: list[str | OpenAIChatCompletionContentPartTextParam | OpenAIChatCompletionContentPartImageParam],
-        max_num_results: int | None = None,
-    ) -> RerankResponse:
-        raise NotImplementedError("Reranking is not supported for Llama OpenAI Compat")
diff --git a/llama_stack/providers/remote/inference/ollama/ollama.py b/llama_stack/providers/remote/inference/ollama/ollama.py
index d72a94615..d8b331ef7 100644
--- a/llama_stack/providers/remote/inference/ollama/ollama.py
+++ b/llama_stack/providers/remote/inference/ollama/ollama.py
@@ -37,14 +37,11 @@ from llama_stack.apis.inference import (
     Message,
     OpenAIChatCompletion,
     OpenAIChatCompletionChunk,
-    OpenAIChatCompletionContentPartImageParam,
-    OpenAIChatCompletionContentPartTextParam,
     OpenAICompletion,
     OpenAIEmbeddingsResponse,
     OpenAIEmbeddingUsage,
     OpenAIMessageParam,
     OpenAIResponseFormatParam,
-    RerankResponse,
     ResponseFormat,
     SamplingParams,
     TextTruncation,
@@ -644,15 +641,6 @@ class OllamaInferenceAdapter(
     ):
         raise NotImplementedError("Batch chat completion is not supported for Ollama")
 
-    async def rerank(
-        self,
-        model: str,
-        query: str | OpenAIChatCompletionContentPartTextParam | OpenAIChatCompletionContentPartImageParam,
-        items: list[str | OpenAIChatCompletionContentPartTextParam | OpenAIChatCompletionContentPartImageParam],
-        max_num_results: int | None = None,
-    ) -> RerankResponse:
-        raise NotImplementedError("Reranking is not supported for Ollama")
-
 
 async def convert_message_to_openai_dict_for_ollama(message: Message) -> list[dict]:
     async def _convert_content(content) -> dict:
diff --git a/llama_stack/providers/remote/inference/vllm/vllm.py b/llama_stack/providers/remote/inference/vllm/vllm.py
index a5f7ba52f..f71068318 100644
--- a/llama_stack/providers/remote/inference/vllm/vllm.py
+++ b/llama_stack/providers/remote/inference/vllm/vllm.py
@@ -39,15 +39,12 @@ from llama_stack.apis.inference import (
     Message,
     ModelStore,
     OpenAIChatCompletion,
-    OpenAIChatCompletionContentPartImageParam,
-    OpenAIChatCompletionContentPartTextParam,
     OpenAICompletion,
     OpenAIEmbeddingData,
     OpenAIEmbeddingsResponse,
     OpenAIEmbeddingUsage,
     OpenAIMessageParam,
     OpenAIResponseFormatParam,
-    RerankResponse,
     ResponseFormat,
     SamplingParams,
     TextTruncation,
@@ -736,12 +733,3 @@ class VLLMInferenceAdapter(Inference, ModelsProtocolPrivate):
         logprobs: LogProbConfig | None = None,
     ):
         raise NotImplementedError("Batch chat completion is not supported for vLLM")
-
-    async def rerank(
-        self,
-        model: str,
-        query: str | OpenAIChatCompletionContentPartTextParam | OpenAIChatCompletionContentPartImageParam,
-        items: list[str | OpenAIChatCompletionContentPartTextParam | OpenAIChatCompletionContentPartImageParam],
-        max_num_results: int | None = None,
-    ) -> RerankResponse:
-        raise NotImplementedError("Reranking is not supported for vLLM")
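
Note on the one-line `return` added to InferenceProvider.rerank: mypy treats a method body
containing only a docstring plus `raise NotImplementedError` as trivial and therefore
implicitly abstract, and its safe-super check flags reliance on such methods in subclasses.
An unreachable `return` after the raise makes the body non-trivial, so mypy counts the
default implementation as concrete, which is what lets every per-provider rerank stub above
be deleted. The minimal sketch below illustrates the pattern under that assumption;
`Provider` and `MyProvider` are hypothetical names, not part of llama_stack, and the exact
diagnostic text depends on your mypy version and enabled error codes.

    # Hypothetical sketch of the mypy safe-super pattern; not llama_stack code.
    from typing import Protocol

    class Provider(Protocol):
        async def rerank(self, model: str) -> None:
            """Default implementation: unsupported unless a subclass overrides it."""
            raise NotImplementedError("Reranking is not implemented")
            return  # unreachable, but makes mypy treat the body as concrete

    class MyProvider(Provider):
        async def rerank(self, model: str) -> None:
            # Without the `return` in Provider.rerank, mypy reports roughly:
            #   Call to abstract method "rerank" of "Provider" with trivial body
            #   via super() is unsafe  [safe-super]
            await super().rerank(model)

On this reading, adapters such as Ollama or vLLM simply inherit the raising default, which
is why their explicit rerank overrides are removed in this diff; providers that do not
support reranking still fail with NotImplementedError at runtime.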