add streaming support for ollama inference with tests

Hardik Shah 2024-07-31 19:33:36 -07:00
parent 0e75e73fa7
commit 0e985648f5
4 changed files with 491 additions and 61 deletions


@@ -103,13 +103,15 @@ class InferenceImpl(Inference):
                 )
             else:
                 delta = text
-            yield ChatCompletionResponseStreamChunk(
-                event=ChatCompletionResponseEvent(
-                    event_type=ChatCompletionResponseEventType.progress,
-                    delta=delta,
-                    stop_reason=stop_reason,
-                )
-            )
+            if stop_reason is None:
+                yield ChatCompletionResponseStreamChunk(
+                    event=ChatCompletionResponseEvent(
+                        event_type=ChatCompletionResponseEventType.progress,
+                        delta=delta,
+                        stop_reason=stop_reason,
+                    )
+                )
 
         if stop_reason is None:
             stop_reason = StopReason.out_of_tokens
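
The change guards the per-token progress chunk so that nothing is yielded once a stop reason has been observed, and defaults the stop reason to out_of_tokens if the generator exhausts its tokens without seeing one. Below is a minimal, self-contained sketch of that streaming pattern. The dataclasses, the stream_chat helper, and the "<eot>" marker are illustrative stand-ins assumed for this example, not llama-stack's actual definitions; only the event/chunk type names come from the diff above.

import asyncio
from dataclasses import dataclass
from enum import Enum
from typing import AsyncGenerator, Iterable, Optional


class StopReason(Enum):
    end_of_turn = "end_of_turn"
    out_of_tokens = "out_of_tokens"


class ChatCompletionResponseEventType(Enum):
    progress = "progress"
    complete = "complete"


@dataclass
class ChatCompletionResponseEvent:
    event_type: ChatCompletionResponseEventType
    delta: str
    stop_reason: Optional[StopReason] = None


@dataclass
class ChatCompletionResponseStreamChunk:
    event: ChatCompletionResponseEvent


async def stream_chat(
    tokens: Iterable[str],
) -> AsyncGenerator[ChatCompletionResponseStreamChunk, None]:
    stop_reason: Optional[StopReason] = None
    for text in tokens:
        if text == "<eot>":  # hypothetical end-of-turn marker
            stop_reason = StopReason.end_of_turn
            text = ""
        # The guard added by this commit: stop emitting progress
        # chunks once a stop reason has been recorded.
        if stop_reason is None:
            yield ChatCompletionResponseStreamChunk(
                event=ChatCompletionResponseEvent(
                    event_type=ChatCompletionResponseEventType.progress,
                    delta=text,
                    stop_reason=stop_reason,
                )
            )
    # Mirrors the trailing lines of the hunk: a stream that ends
    # without an explicit stop reason is treated as out_of_tokens.
    if stop_reason is None:
        stop_reason = StopReason.out_of_tokens
    yield ChatCompletionResponseStreamChunk(
        event=ChatCompletionResponseEvent(
            event_type=ChatCompletionResponseEventType.complete,
            delta="",
            stop_reason=stop_reason,
        )
    )


async def main() -> None:
    async for chunk in stream_chat(["Hello", ", ", "world", "<eot>"]):
        print(chunk.event.event_type.value, repr(chunk.event.delta), chunk.event.stop_reason)


asyncio.run(main())

Running the sketch prints three progress chunks followed by a single complete chunk carrying end_of_turn; without the added guard, the "<eot>" iteration would also emit a progress chunk with an empty delta.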