Separate chat_completion stream and non-stream implementations

This is a pretty important requirement. The streaming response type is
an AsyncGenerator while the non-stream one is a single object. So far
this has worked _sometimes_ due to various pre-existing hacks (and in
some cases, it simply failed).
This commit is contained in:
Ashwin Bharambe 2024-10-08 10:52:16 -07:00 committed by Ashwin Bharambe
parent f8752ab8dc
commit 0c9eb3341c
5 changed files with 346 additions and 287 deletions

View file

@@ -180,8 +180,10 @@ class ModelStore(Protocol):
class Inference(Protocol):
model_store: ModelStore
# This method is not `async def` because it can result in either an
# `AsyncGenerator` or a `CompletionResponse` depending on the value of `stream`.
@webmethod(route="/inference/completion")
async def completion(
def completion(
self,
model: str,
content: InterleavedTextMedia,
@@ -190,8 +192,10 @@ class Inference(Protocol):
logprobs: Optional[LogProbConfig] = None,
) -> Union[CompletionResponse, CompletionResponseStreamChunk]: ...
# This method is not `async def` because it can result in either an
# `AsyncGenerator` or a `ChatCompletionResponse` depending on the value of `stream`.
@webmethod(route="/inference/chat_completion")
async def chat_completion(
def chat_completion(
self,
model: str,
messages: List[Message],