feat: add batch inference API to llama stack inference

2026-01-02 20:54:30 +00:00 · 2025-04-08 13:50:52 -07:00 · 2025-04-08 13:50:52 -07:00 · 0cfb2e2473
commit 0cfb2e2473
parent ed58a94b30
24 changed files with 1041 additions and 377 deletions
--- a/llama_stack/providers/inline/inference/sentence_transformers/sentence_transformers.py
+++ b/llama_stack/providers/inline/inference/sentence_transformers/sentence_transformers.py
@@ -10,6 +10,7 @@ from typing import AsyncGenerator, List, Optional, Union
 from llama_stack.apis.inference import (
    CompletionResponse,
    Inference,
+    InterleavedContent,
    LogProbConfig,
    Message,
    ResponseFormat,
@@ -80,3 +81,25 @@ class SentenceTransformersInferenceImpl(
        tool_config: Optional[ToolConfig] = None,
    ) -> AsyncGenerator:
        raise ValueError("Sentence transformers don't support chat completion")
+
+    async def batch_completion(
+        self,
+        model_id: str,
+        content_batch: List[InterleavedContent],
+        sampling_params: Optional[SamplingParams] = None,
+        response_format: Optional[ResponseFormat] = None,
+        logprobs: Optional[LogProbConfig] = None,
+    ):
+        """Reject batch completion requests for this provider.
+
+        Every argument is accepted but never inspected; the body raises
+        unconditionally. Because this is a coroutine function, the exception
+        surfaces when the returned coroutine is awaited, not when the method
+        is called.
+
+        NOTE(review): the signature presumably mirrors the batch completion
+        method of the imported ``Inference`` API -- confirm against that
+        definition.
+
+        Raises:
+            NotImplementedError: Always.
+        """
+        raise NotImplementedError("Batch completion is not supported for Sentence Transformers")
+
+    async def batch_chat_completion(
+        self,
+        model_id: str,
+        messages_batch: List[List[Message]],
+        sampling_params: Optional[SamplingParams] = None,
+        tools: Optional[List[ToolDefinition]] = None,
+        tool_config: Optional[ToolConfig] = None,
+        response_format: Optional[ResponseFormat] = None,
+        logprobs: Optional[LogProbConfig] = None,
+    ):
+        """Reject batch chat completion requests for this provider.
+
+        Every argument is accepted but never inspected; the body raises
+        unconditionally. Because this is a coroutine function, the exception
+        surfaces when the returned coroutine is awaited, not when the method
+        is called.
+
+        NOTE(review): the signature presumably mirrors the batch chat
+        completion method of the imported ``Inference`` API -- confirm against
+        that definition.
+
+        Raises:
+            NotImplementedError: Always.
+        """
+        raise NotImplementedError("Batch chat completion is not supported for Sentence Transformers")