Mirror of https://github.com/BerriAI/litellm.git
Fix: Prevent cache token overwrite by last chunk in streaming usage
parent b82af5b826
commit 314a8f7243
2 changed files with 97 additions and 2 deletions
@@ -348,11 +348,17 @@ class ChunkProcessor:
                     and usage_chunk_dict["completion_tokens"] > 0
                 ):
                     completion_tokens = usage_chunk_dict["completion_tokens"]
-                if usage_chunk_dict["cache_creation_input_tokens"] is not None:
+                if usage_chunk_dict["cache_creation_input_tokens"] is not None and (
+                    usage_chunk_dict["cache_creation_input_tokens"] > 0
+                    or cache_creation_input_tokens is None
+                ):
                     cache_creation_input_tokens = usage_chunk_dict[
                         "cache_creation_input_tokens"
                     ]
-                if usage_chunk_dict["cache_read_input_tokens"] is not None:
+                if usage_chunk_dict["cache_read_input_tokens"] is not None and (
+                    usage_chunk_dict["cache_read_input_tokens"] > 0
+                    or cache_read_input_tokens is None
+                ):
                     cache_read_input_tokens = usage_chunk_dict[
                         "cache_read_input_tokens"
                     ]
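Why the extra clause matters: providers that support prompt caching report the cache counters on the chunk that carries them, and the final usage chunk of a stream may report those fields as 0; without the guard, that trailing zero overwrote the earlier non-zero value. A standalone sketch of the merge rule the diff encodes (the helper name and loop are illustrative, not the actual ChunkProcessor code):

from typing import Optional


def merge_cache_counter(current: Optional[int], incoming: Optional[int]) -> Optional[int]:
    # Mirrors the new condition: accept the incoming value only if it is
    # non-zero, or if nothing has been recorded yet; otherwise keep the
    # value already accumulated from an earlier chunk.
    if incoming is not None and (incoming > 0 or current is None):
        return incoming
    return current


cache_read = None
for chunk_value in (11775, 0):  # first chunk carries the real count, the last chunk reports 0
    cache_read = merge_cache_counter(cache_read, chunk_value)
assert cache_read == 11775  # the trailing zero no longer wins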
@@ -16,6 +16,8 @@ from litellm.types.utils import (
     Function,
     ModelResponseStream,
     StreamingChoices,
+    Usage,
+    PromptTokensDetails,
 )
 
 
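The two added imports are the types the new test uses to attach provider cache accounting to each synthetic chunk. A minimal sketch of such a usage payload (the values here are illustrative, not part of this commit):

from litellm.types.utils import PromptTokensDetails, Usage

# Illustrative only: a usage block where most of the prompt was served
# from the provider's prompt cache.
usage = Usage(
    completion_tokens=5,
    prompt_tokens=11779,
    total_tokens=11784,
    prompt_tokens_details=PromptTokensDetails(audio_tokens=None, cached_tokens=11775),
    cache_creation_input_tokens=4,      # tokens newly written to the cache
    cache_read_input_tokens=11775,      # tokens read back from the cache
)
assert usage.cache_read_input_tokens == 11775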
@@ -153,3 +155,90 @@ def test_get_combined_tool_content():
             type="function",
         ),
     ]
+
+
+def test_cache_read_input_tokens_retained():
+    chunk1 = ModelResponseStream(
+        id="chatcmpl-95aabb85-c39f-443d-ae96-0370c404d70c",
+        created=1745513206,
+        model="claude-3-7-sonnet-20250219",
+        object="chat.completion.chunk",
+        system_fingerprint=None,
+        choices=[
+            StreamingChoices(
+                finish_reason=None,
+                index=0,
+                delta=Delta(
+                    provider_specific_fields=None,
+                    content="",
+                    role=None,
+                    function_call=None,
+                    tool_calls=None,
+                    audio=None,
+                ),
+                logprobs=None,
+            )
+        ],
+        provider_specific_fields=None,
+        stream_options={"include_usage": True},
+        usage=Usage(
+            completion_tokens=5,
+            prompt_tokens=11779,
+            total_tokens=11784,
+            completion_tokens_details=None,
+            prompt_tokens_details=PromptTokensDetails(
+                audio_tokens=None, cached_tokens=11775
+            ),
+            cache_creation_input_tokens=4,
+            cache_read_input_tokens=11775,
+        ),
+    )
+
+    chunk2 = ModelResponseStream(
+        id="chatcmpl-95aabb85-c39f-443d-ae96-0370c404d70c",
+        created=1745513207,
+        model="claude-3-7-sonnet-20250219",
+        object="chat.completion.chunk",
+        system_fingerprint=None,
+        choices=[
+            StreamingChoices(
+                finish_reason="stop",
+                index=0,
+                delta=Delta(
+                    provider_specific_fields=None,
+                    content=None,
+                    role=None,
+                    function_call=None,
+                    tool_calls=None,
+                    audio=None,
+                ),
+                logprobs=None,
+            )
+        ],
+        provider_specific_fields=None,
+        stream_options={"include_usage": True},
+        usage=Usage(
+            completion_tokens=214,
+            prompt_tokens=0,
+            total_tokens=214,
+            completion_tokens_details=None,
+            prompt_tokens_details=PromptTokensDetails(
+                audio_tokens=None, cached_tokens=0
+            ),
+            cache_creation_input_tokens=0,
+            cache_read_input_tokens=0,
+        ),
+    )
+
+    # Combine both streaming chunks through ChunkProcessor
+    chunks = [chunk1, chunk2]
+    processor = ChunkProcessor(chunks=chunks)
+
+    usage = processor.calculate_usage(
+        chunks=chunks,
+        model="claude-3-7-sonnet",
+        completion_output="",
+    )
+
+    assert usage.cache_creation_input_tokens == 4
+    assert usage.cache_read_input_tokens == 11775
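In normal use these counters reach ChunkProcessor when a streamed response is re-assembled, for example via litellm.stream_chunk_builder. A caller-side sketch, assuming an Anthropic model with prompt caching in play and usage included in the stream (model name and prompt are placeholders):

import litellm

messages = [{"role": "user", "content": "Hello"}]

# Stream with usage reporting enabled, collecting every chunk.
chunks = []
for chunk in litellm.completion(
    model="anthropic/claude-3-7-sonnet-20250219",
    messages=messages,
    stream=True,
    stream_options={"include_usage": True},
):
    chunks.append(chunk)

# Rebuild the full response; with this fix, the combined usage keeps the
# cache counters reported mid-stream instead of the final chunk's zeros.
rebuilt = litellm.stream_chunk_builder(chunks, messages=messages)
print(rebuilt.usage.cache_read_input_tokens)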