Fix: Prevent cache token overwrite by last chunk in streaming usage

Author: magic, 2025-04-24 11:17:02 -07:00
parent b82af5b826
commit 314a8f7243
2 changed files with 97 additions and 2 deletions
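
Anthropic-style streams report the real cache usage (cache_creation_input_tokens, cache_read_input_tokens) on an early chunk, while the final usage chunk carries the completion-token total alongside zeroed cache counters. The old unconditional assignment in ChunkProcessor let that trailing zero overwrite the real counts. The fix only accepts a chunk's value when it is nonzero or when nothing has been recorded yet. A minimal standalone sketch of that merge rule, not the actual litellm code; the helper name and driver loop are illustrative assumptions:

from typing import Optional


def merge_cache_counter(
    current: Optional[int], incoming: Optional[int]
) -> Optional[int]:
    # Mirror the guard in the diff below: accept `incoming` only if it is
    # present and either positive or the first value seen.
    if incoming is not None and (incoming > 0 or current is None):
        return incoming
    return current


# Anthropic-style stream: the first chunk carries the real cache usage,
# the final (finish_reason="stop") chunk reports zero.
per_chunk_cache_read = [11775, 0]

cache_read_input_tokens: Optional[int] = None
for count in per_chunk_cache_read:
    cache_read_input_tokens = merge_cache_counter(cache_read_input_tokens, count)

assert cache_read_input_tokens == 11775  # before this fix, the trailing 0 won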


@@ -348,11 +348,17 @@ class ChunkProcessor:
                     and usage_chunk_dict["completion_tokens"] > 0
                 ):
                     completion_tokens = usage_chunk_dict["completion_tokens"]
-                if usage_chunk_dict["cache_creation_input_tokens"] is not None:
+                if usage_chunk_dict["cache_creation_input_tokens"] is not None and (
+                    usage_chunk_dict["cache_creation_input_tokens"] > 0
+                    or cache_creation_input_tokens is None
+                ):
                     cache_creation_input_tokens = usage_chunk_dict[
                         "cache_creation_input_tokens"
                     ]
-                if usage_chunk_dict["cache_read_input_tokens"] is not None:
+                if usage_chunk_dict["cache_read_input_tokens"] is not None and (
+                    usage_chunk_dict["cache_read_input_tokens"] > 0
+                    or cache_read_input_tokens is None
+                ):
                     cache_read_input_tokens = usage_chunk_dict[
                         "cache_read_input_tokens"
                     ]
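
The guard keeps the existing fold-over-chunks shape: a chunk's counter is still copied whenever it is present, but a zero is now accepted only when no earlier chunk has reported a value. Providers that genuinely report zero cache usage still end up with 0 rather than None, while a trailing zero can no longer clobber an earlier nonzero count.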


@@ -16,6 +16,8 @@ from litellm.types.utils import (
     Function,
     ModelResponseStream,
     StreamingChoices,
+    Usage,
+    PromptTokensDetails,
 )
@@ -153,3 +155,90 @@ def test_get_combined_tool_content():
             type="function",
         ),
     ]
+
+
+def test_cache_read_input_tokens_retained():
+    chunk1 = ModelResponseStream(
+        id="chatcmpl-95aabb85-c39f-443d-ae96-0370c404d70c",
+        created=1745513206,
+        model="claude-3-7-sonnet-20250219",
+        object="chat.completion.chunk",
+        system_fingerprint=None,
+        choices=[
+            StreamingChoices(
+                finish_reason=None,
+                index=0,
+                delta=Delta(
+                    provider_specific_fields=None,
+                    content="",
+                    role=None,
+                    function_call=None,
+                    tool_calls=None,
+                    audio=None,
+                ),
+                logprobs=None,
+            )
+        ],
+        provider_specific_fields=None,
+        stream_options={"include_usage": True},
+        usage=Usage(
+            completion_tokens=5,
+            prompt_tokens=11779,
+            total_tokens=11784,
+            completion_tokens_details=None,
+            prompt_tokens_details=PromptTokensDetails(
+                audio_tokens=None, cached_tokens=11775
+            ),
+            cache_creation_input_tokens=4,
+            cache_read_input_tokens=11775,
+        ),
+    )
+    chunk2 = ModelResponseStream(
+        id="chatcmpl-95aabb85-c39f-443d-ae96-0370c404d70c",
+        created=1745513207,
+        model="claude-3-7-sonnet-20250219",
+        object="chat.completion.chunk",
+        system_fingerprint=None,
+        choices=[
+            StreamingChoices(
+                finish_reason="stop",
+                index=0,
+                delta=Delta(
+                    provider_specific_fields=None,
+                    content=None,
+                    role=None,
+                    function_call=None,
+                    tool_calls=None,
+                    audio=None,
+                ),
+                logprobs=None,
+            )
+        ],
+        provider_specific_fields=None,
+        stream_options={"include_usage": True},
+        usage=Usage(
+            completion_tokens=214,
+            prompt_tokens=0,
+            total_tokens=214,
+            completion_tokens_details=None,
+            prompt_tokens_details=PromptTokensDetails(
+                audio_tokens=None, cached_tokens=0
+            ),
+            cache_creation_input_tokens=0,
+            cache_read_input_tokens=0,
+        ),
+    )
+    # Combine both chunks: chunk2's zeroed cache counters must not
+    # overwrite the values reported by chunk1.
+    chunks = [chunk1, chunk2]
+    processor = ChunkProcessor(chunks=chunks)
+    usage = processor.calculate_usage(
+        chunks=chunks,
+        model="claude-3-7-sonnet",
+        completion_output="",
+    )
+    assert usage.cache_creation_input_tokens == 4
+    assert usage.cache_read_input_tokens == 11775
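
Assuming the repository's usual pytest layout, the new test can be selected by name:

pytest -k test_cache_read_input_tokens_retained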