mirror of https://github.com/BerriAI/litellm.git (synced 2025-04-26 19:24:27 +00:00)
fix(streaming_handler.py): support logging complete streaming response on cache hit
This commit is contained in:
parent ba6369e359
commit 301375bf84

3 changed files with 66 additions and 22 deletions
@@ -1481,6 +1481,15 @@ class CustomStreamWrapper:
                 processed_chunk
             )
 
+    async def async_cache_streaming_response(self, processed_chunk, cache_hit: bool):
+        """
+        Caches the streaming response
+        """
+        if not cache_hit and self.logging_obj._llm_caching_handler is not None:
+            await self.logging_obj._llm_caching_handler._add_streaming_response_to_cache(
+                processed_chunk
+            )
+
     def run_success_logging_and_cache_storage(self, processed_chunk, cache_hit: bool):
         """
         Runs success logging in a thread and adds the response to the cache
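The first hunk adds an async helper that only writes to the cache when the response was not itself served from cache and a caching handler is configured. Below is a minimal sketch of that guard, with a hypothetical CacheHandler and StreamWrapperSketch standing in for litellm's internals; only the method name _add_streaming_response_to_cache comes from the diff above.

import asyncio
from typing import Any, List, Optional


class CacheHandler:
    """Hypothetical stand-in for litellm's _llm_caching_handler."""

    def __init__(self) -> None:
        self._store: List[Any] = []

    async def _add_streaming_response_to_cache(self, processed_chunk: Any) -> None:
        # Stand-in for the real async cache write.
        self._store.append(processed_chunk)


class StreamWrapperSketch:
    """Illustrates only the guard applied by async_cache_streaming_response."""

    def __init__(self, caching_handler: Optional[CacheHandler]) -> None:
        self.caching_handler = caching_handler

    async def async_cache_streaming_response(
        self, processed_chunk: Any, cache_hit: bool
    ) -> None:
        # Skip the write when the response was served from cache,
        # or when no caching handler is configured.
        if not cache_hit and self.caching_handler is not None:
            await self.caching_handler._add_streaming_response_to_cache(processed_chunk)


async def _demo() -> None:
    wrapper = StreamWrapperSketch(CacheHandler())
    await wrapper.async_cache_streaming_response({"content": "hello"}, cache_hit=False)
    await wrapper.async_cache_streaming_response({"content": "cached"}, cache_hit=True)  # no-op


asyncio.run(_demo())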
@@ -1711,13 +1720,6 @@ class CustomStreamWrapper:
                     if processed_chunk is None:
                         continue
 
-                    if self.logging_obj._llm_caching_handler is not None:
-                        asyncio.create_task(
-                            self.logging_obj._llm_caching_handler._add_streaming_response_to_cache(
-                                processed_chunk=cast(ModelResponse, processed_chunk),
-                            )
-                        )
-
                     choice = processed_chunk.choices[0]
                     if isinstance(choice, StreamingChoices):
                         self.response_uptil_now += choice.delta.get("content", "") or ""
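The second hunk removes the per-chunk cache task from the async iteration path, so nothing is written to the cache while individual chunks are being yielded. A hedged sketch (not litellm code) of the resulting shape: chunks are only accumulated during iteration, and a single cache write happens for the complete response, as the next hunk does.

import asyncio
from typing import AsyncIterator, List


async def fake_stream() -> AsyncIterator[str]:
    # Hypothetical stand-in for an LLM token stream.
    for piece in ("Hel", "lo ", "world"):
        yield piece


async def consume_and_cache_once(cache: List[str]) -> str:
    chunks: List[str] = []
    async for chunk in fake_stream():
        chunks.append(chunk)  # no cache write per chunk
    complete = "".join(chunks)
    cache.append(complete)  # single write for the whole response
    return complete


cache: List[str] = []
print(asyncio.run(consume_and_cache_once(cache)))  # Hello world
print(cache)  # ['Hello world']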
@@ -1788,6 +1790,14 @@ class CustomStreamWrapper:
                         "usage",
                         getattr(complete_streaming_response, "usage"),
                     )
+                asyncio.create_task(
+                    self.async_cache_streaming_response(
+                        processed_chunk=complete_streaming_response.model_copy(
+                            deep=True
+                        ),
+                        cache_hit=cache_hit,
+                    )
+                )
                 if self.sent_stream_usage is False and self.send_stream_usage is True:
                     self.sent_stream_usage = True
                     return response
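The third hunk schedules the new helper as a fire-and-forget task at the point where the complete streaming response has been assembled, passing a deep copy so the cached object is isolated from anything done with the response that is returned to the caller. A small sketch of that pattern, using a hypothetical Response model and slow_cache_write coroutine; model_copy(deep=True) is the Pydantic v2 deep-copy call seen in the diff.

import asyncio

from pydantic import BaseModel


class Response(BaseModel):
    # Hypothetical response model standing in for the complete streaming response.
    content: str


async def slow_cache_write(resp: Response) -> None:
    await asyncio.sleep(0.1)  # pretend this is an async cache backend
    print("cached:", resp.content)


async def main() -> None:
    resp = Response(content="complete streaming response")
    # Schedule the cache write without awaiting it, so returning the final
    # response to the caller is not delayed by the cache backend.
    task = asyncio.create_task(slow_cache_write(resp.model_copy(deep=True)))
    # Later mutations of the original do not affect the deep-copied object
    # handed to the cache task.
    resp.content = "mutated after return"
    await task  # prints "cached: complete streaming response"


asyncio.run(main())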