Fix: Prevent cache token overwrite by last chunk in streaming usage

Author: magic, 2025-04-24 11:17:02 -07:00
parent b82af5b826
commit 314a8f7243
2 changed files with 97 additions and 2 deletions
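
Anthropic-style streams report the real cache usage (cache_creation_input_tokens, cache_read_input_tokens) on an early chunk, while the final usage chunk carries the completion-token total alongside zeroed cache counters. The old unconditional assignment in ChunkProcessor let that trailing zero overwrite the real counts. The fix only accepts a chunk's value when it is nonzero or when nothing has been recorded yet. A minimal standalone sketch of that merge rule, not the actual litellm code; the helper name and driver loop are illustrative assumptions:

from typing import Optional


def merge_cache_counter(
    current: Optional[int], incoming: Optional[int]
) -> Optional[int]:
    # Mirror the guard in the diff below: accept `incoming` only if it is
    # present and either positive or the first value seen.
    if incoming is not None and (incoming > 0 or current is None):
        return incoming
    return current


# Anthropic-style stream: the first chunk carries the real cache usage,
# the final (finish_reason="stop") chunk reports zero.
per_chunk_cache_read = [11775, 0]

cache_read_input_tokens: Optional[int] = None
for count in per_chunk_cache_read:
    cache_read_input_tokens = merge_cache_counter(cache_read_input_tokens, count)

assert cache_read_input_tokens == 11775  # before this fix, the trailing 0 won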


@@ -348,11 +348,17 @@ class ChunkProcessor:
                     and usage_chunk_dict["completion_tokens"] > 0
                 ):
                     completion_tokens = usage_chunk_dict["completion_tokens"]
-                if usage_chunk_dict["cache_creation_input_tokens"] is not None:
+                if usage_chunk_dict["cache_creation_input_tokens"] is not None and (
+                    usage_chunk_dict["cache_creation_input_tokens"] > 0
+                    or cache_creation_input_tokens is None
+                ):
                     cache_creation_input_tokens = usage_chunk_dict[
                         "cache_creation_input_tokens"
                     ]
-                if usage_chunk_dict["cache_read_input_tokens"] is not None:
+                if usage_chunk_dict["cache_read_input_tokens"] is not None and (
+                    usage_chunk_dict["cache_read_input_tokens"] > 0
+                    or cache_read_input_tokens is None
+                ):
                     cache_read_input_tokens = usage_chunk_dict[
                         "cache_read_input_tokens"
                     ]
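
The guard keeps the existing fold-over-chunks shape: a chunk's counter is still copied whenever it is present, but a zero is now accepted only when no earlier chunk has reported a value. Providers that genuinely report zero cache usage still end up with 0 rather than None, while a trailing zero can no longer clobber an earlier nonzero count.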


@@ -16,6 +16,8 @@ from litellm.types.utils import (
     Function,
     ModelResponseStream,
     StreamingChoices,
+    Usage,
+    PromptTokensDetails,
 )
@@ -153,3 +155,90 @@ def test_get_combined_tool_content():
             type="function",
         ),
     ]
+
+
+def test_cache_read_input_tokens_retained():
+    chunk1 = ModelResponseStream(
+        id="chatcmpl-95aabb85-c39f-443d-ae96-0370c404d70c",
+        created=1745513206,
+        model="claude-3-7-sonnet-20250219",
+        object="chat.completion.chunk",
+        system_fingerprint=None,
+        choices=[
+            StreamingChoices(
+                finish_reason=None,
+                index=0,
+                delta=Delta(
+                    provider_specific_fields=None,
+                    content="",
+                    role=None,
+                    function_call=None,
+                    tool_calls=None,
+                    audio=None,
+                ),
+                logprobs=None,
+            )
+        ],
+        provider_specific_fields=None,
+        stream_options={"include_usage": True},
+        usage=Usage(
+            completion_tokens=5,
+            prompt_tokens=11779,
+            total_tokens=11784,
+            completion_tokens_details=None,
+            prompt_tokens_details=PromptTokensDetails(
+                audio_tokens=None, cached_tokens=11775
+            ),
+            cache_creation_input_tokens=4,
+            cache_read_input_tokens=11775,
+        ),
+    )
+    chunk2 = ModelResponseStream(
+        id="chatcmpl-95aabb85-c39f-443d-ae96-0370c404d70c",
+        created=1745513207,
+        model="claude-3-7-sonnet-20250219",
+        object="chat.completion.chunk",
+        system_fingerprint=None,
+        choices=[
+            StreamingChoices(
+                finish_reason="stop",
+                index=0,
+                delta=Delta(
+                    provider_specific_fields=None,
+                    content=None,
+                    role=None,
+                    function_call=None,
+                    tool_calls=None,
+                    audio=None,
+                ),
+                logprobs=None,
+            )
+        ],
+        provider_specific_fields=None,
+        stream_options={"include_usage": True},
+        usage=Usage(
+            completion_tokens=214,
+            prompt_tokens=0,
+            total_tokens=214,
+            completion_tokens_details=None,
+            prompt_tokens_details=PromptTokensDetails(
+                audio_tokens=None, cached_tokens=0
+            ),
+            cache_creation_input_tokens=0,
+            cache_read_input_tokens=0,
+        ),
+    )
+    # Combine both chunks: chunk2's zeroed cache counters must not
+    # overwrite the values reported by chunk1.
+    chunks = [chunk1, chunk2]
+    processor = ChunkProcessor(chunks=chunks)
+    usage = processor.calculate_usage(
+        chunks=chunks,
+        model="claude-3-7-sonnet",
+        completion_output="",
+    )
+    assert usage.cache_creation_input_tokens == 4
+    assert usage.cache_read_input_tokens == 11775
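
Assuming the repository's usual pytest layout, the new test can be selected by name:

pytest -k test_cache_read_input_tokens_retained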