indicate to mypy that InferenceProvider.batch_completion/batch_chat_completion is concrete

Matthew Farrellee 2025-08-22 15:22:20 -04:00
parent 2ee898cc4c
commit 5f0d3d473e
5 changed files with 2 additions and 89 deletions
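
For context, a minimal sketch of the mypy behavior this change relies on; the Greeter names below are illustrative and not part of the repository. mypy treats a method body that consists only of a bare raise NotImplementedError(...) as trivial, so it is handled like an abstract member: explicit subclasses are expected to provide their own implementation, and the safe-super check flags super() calls into it. Appending an unreachable return after the raise makes the body non-trivial, so mypy accepts the method as a concrete default.

from typing import Protocol


class Greeter(Protocol):
    def greet(self, name: str) -> str:
        """Default body: raises unless a subclass overrides it."""
        raise NotImplementedError("greet is not implemented")
        return  # unreachable, but makes mypy treat the method as concrete


class LoggingGreeter(Greeter):
    def greet(self, name: str) -> str:
        # Without the unreachable return in Greeter.greet, mypy's safe-super
        # check would flag this call as unsafe (super() into a trivial body).
        return super().greet(name)


class PlainGreeter(Greeter):
    # No override needed: the default body is inherited as a concrete method.
    pass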


@@ -1068,6 +1068,7 @@ class InferenceProvider(Protocol):
         :returns: A BatchCompletionResponse with the full completions.
         """
         raise NotImplementedError("Batch completion is not implemented")
+        return  # this is so mypy's safe-super rule will consider the method concrete
 
     @webmethod(route="/inference/chat-completion", method="POST")
     async def chat_completion(
@@ -1132,6 +1133,7 @@ class InferenceProvider(Protocol):
         :returns: A BatchChatCompletionResponse with the full completions.
         """
         raise NotImplementedError("Batch chat completion is not implemented")
+        return  # this is so mypy's safe-super rule will consider the method concrete
 
     @webmethod(route="/inference/embeddings", method="POST")
     async def embeddings(
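
With the protocol defaults above treated as concrete, the per-provider stubs removed in the remaining files become redundant: a provider that does not support batching simply inherits the default body, and callers get the NotImplementedError at runtime. A rough, self-contained illustration under assumed names (MiniInference and EchoProvider are not part of the codebase):

import asyncio
from typing import Protocol


class MiniInference(Protocol):
    async def completion(self, model_id: str, content: str) -> str:
        raise NotImplementedError("Completion is not implemented")
        return  # unreachable; keeps the default body concrete for mypy

    async def batch_completion(self, model_id: str, content_batch: list[str]) -> list[str]:
        raise NotImplementedError("Batch completion is not implemented")
        return  # unreachable; keeps the default body concrete for mypy


class EchoProvider(MiniInference):
    # Only the single-item path is implemented; batch_completion is inherited
    # from the protocol default and raises NotImplementedError if called.
    async def completion(self, model_id: str, content: str) -> str:
        return f"{model_id}: {content}"


async def main() -> None:
    provider = EchoProvider()
    print(await provider.completion("demo-model", "hello"))
    try:
        await provider.batch_completion("demo-model", ["a", "b"])
    except NotImplementedError as err:
        print(f"batch path unavailable: {err}")


if __name__ == "__main__":
    asyncio.run(main())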


@@ -9,7 +9,6 @@ from collections.abc import AsyncGenerator
 from llama_stack.apis.inference import (
     CompletionResponse,
     InferenceProvider,
-    InterleavedContent,
     LogProbConfig,
     Message,
     ResponseFormat,
@@ -100,25 +99,3 @@ class SentenceTransformersInferenceImpl(
         tool_config: ToolConfig | None = None,
     ) -> AsyncGenerator:
         raise ValueError("Sentence transformers don't support chat completion")
-
-    async def batch_completion(
-        self,
-        model_id: str,
-        content_batch: list[InterleavedContent],
-        sampling_params: SamplingParams | None = None,
-        response_format: ResponseFormat | None = None,
-        logprobs: LogProbConfig | None = None,
-    ):
-        raise NotImplementedError("Batch completion is not supported for Sentence Transformers")
-
-    async def batch_chat_completion(
-        self,
-        model_id: str,
-        messages_batch: list[list[Message]],
-        sampling_params: SamplingParams | None = None,
-        tools: list[ToolDefinition] | None = None,
-        tool_config: ToolConfig | None = None,
-        response_format: ResponseFormat | None = None,
-        logprobs: LogProbConfig | None = None,
-    ):
-        raise NotImplementedError("Batch chat completion is not supported for Sentence Transformers")


@@ -619,28 +619,6 @@ class OllamaInferenceAdapter(
         response.id = id
         return response
-
-    async def batch_completion(
-        self,
-        model_id: str,
-        content_batch: list[InterleavedContent],
-        sampling_params: SamplingParams | None = None,
-        response_format: ResponseFormat | None = None,
-        logprobs: LogProbConfig | None = None,
-    ):
-        raise NotImplementedError("Batch completion is not supported for Ollama")
-
-    async def batch_chat_completion(
-        self,
-        model_id: str,
-        messages_batch: list[list[Message]],
-        sampling_params: SamplingParams | None = None,
-        tools: list[ToolDefinition] | None = None,
-        tool_config: ToolConfig | None = None,
-        response_format: ResponseFormat | None = None,
-        logprobs: LogProbConfig | None = None,
-    ):
-        raise NotImplementedError("Batch chat completion is not supported for Ollama")
 
 
 async def convert_message_to_openai_dict_for_ollama(message: Message) -> list[dict]:
     async def _convert_content(content) -> dict:


@@ -711,25 +711,3 @@ class VLLMInferenceAdapter(Inference, ModelsProtocolPrivate):
             user=user,
         )
         return await self.client.chat.completions.create(**params)  # type: ignore
-
-    async def batch_completion(
-        self,
-        model_id: str,
-        content_batch: list[InterleavedContent],
-        sampling_params: SamplingParams | None = None,
-        response_format: ResponseFormat | None = None,
-        logprobs: LogProbConfig | None = None,
-    ):
-        raise NotImplementedError("Batch completion is not supported for Ollama")
-
-    async def batch_chat_completion(
-        self,
-        model_id: str,
-        messages_batch: list[list[Message]],
-        sampling_params: SamplingParams | None = None,
-        tools: list[ToolDefinition] | None = None,
-        tool_config: ToolConfig | None = None,
-        response_format: ResponseFormat | None = None,
-        logprobs: LogProbConfig | None = None,
-    ):
-        raise NotImplementedError("Batch chat completion is not supported for vLLM")


@@ -429,28 +429,6 @@ class LiteLLMOpenAIMixin(
         )
         return await litellm.acompletion(**params)
-
-    async def batch_completion(
-        self,
-        model_id: str,
-        content_batch: list[InterleavedContent],
-        sampling_params: SamplingParams | None = None,
-        response_format: ResponseFormat | None = None,
-        logprobs: LogProbConfig | None = None,
-    ):
-        raise NotImplementedError("Batch completion is not supported for OpenAI Compat")
-
-    async def batch_chat_completion(
-        self,
-        model_id: str,
-        messages_batch: list[list[Message]],
-        sampling_params: SamplingParams | None = None,
-        tools: list[ToolDefinition] | None = None,
-        tool_config: ToolConfig | None = None,
-        response_format: ResponseFormat | None = None,
-        logprobs: LogProbConfig | None = None,
-    ):
-        raise NotImplementedError("Batch chat completion is not supported for OpenAI Compat")
 
     async def check_model_availability(self, model: str) -> bool:
         """
         Check if a specific model is available via LiteLLM for the current