Mirror of https://github.com/BerriAI/litellm.git (synced 2025-04-27 03:34:10 +00:00)
LiteLLM Minor Fixes & Improvements (12/23/2024) - p3 (#7394)
* build(model_prices_and_context_window.json): add gemini-1.5-flash context caching
* fix(context_caching/transformation.py): just use last identified cache point
  Fixes https://github.com/BerriAI/litellm/issues/6738
* fix(context_caching/transformation.py): pick first contiguous block - handles system message error from google
  Fixes https://github.com/BerriAI/litellm/issues/6738
* fix(vertex_ai/gemini/): track context caching tokens
* refactor(gemini/): place transformation.py inside `chat/` folder
  make it easy for user to know we support the equivalent endpoint
* fix: fix import
* refactor(vertex_ai/): move vertex_ai cost calc inside vertex_ai/ folder
  make it easier to see cost calculation logic
* fix: fix linting errors
* fix: fix circular import
* feat(gemini/cost_calculator.py): support gemini context caching cost calculation
  generifies anthropic's cost calculation function and uses it across anthropic + gemini
* build(model_prices_and_context_window.json): add cost tracking for gemini-1.5-flash-002 w/ context caching
  Closes https://github.com/BerriAI/litellm/issues/6891
* docs(gemini.md): add gemini context caching architecture diagram
  make it easier for user to understand how context caching works
* docs(gemini.md): link to relevant gemini context caching code
* docs(gemini/context_caching): add readme in github, make it easy for dev to know context caching is supported + where to go for code
* fix(llm_cost_calc/utils.py): handle gemini 128k token diff cost calc scenario
* fix(deepseek/cost_calculator.py): support deepseek context caching cost calculation
* test: fix test
This commit is contained in:
parent 905e89bf60
commit 8fe1356406
20 changed files with 719 additions and 447 deletions
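The cost-tracking items above (Gemini, Anthropic, and DeepSeek context caching, plus the Gemini 128k-token tier) all come down to the same idea: split the prompt tokens into cached and non-cached buckets and price each bucket at its own per-token rate. The sketch below only illustrates that idea; the function name, parameters, and tier handling are assumptions made for illustration, not LiteLLM's actual cost-calculator API.

from typing import Optional


def prompt_cost_with_context_caching(
    prompt_tokens: int,
    cached_tokens: int,
    input_cost_per_token: float,
    cache_read_cost_per_token: float,
    input_cost_per_token_above_128k: Optional[float] = None,
) -> float:
    """Price a prompt where part of it was served from the provider's context cache.

    Illustrative only -- names and the tier handling are placeholders.
    """
    non_cached_tokens = prompt_tokens - cached_tokens
    # Cached tokens are billed at the cheaper cache-read rate.
    cost = cached_tokens * cache_read_cost_per_token
    # Non-cached tokens are billed at the normal input rate, or at the higher
    # tier when the prompt crosses the 128k threshold and such a rate exists.
    if input_cost_per_token_above_128k is not None and prompt_tokens > 128_000:
        cost += non_cached_tokens * input_cost_per_token_above_128k
    else:
        cost += non_cached_tokens * input_cost_per_token
    return cost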
@@ -5,6 +5,7 @@ import sys
from typing import Any, Dict, List
from unittest.mock import MagicMock, Mock, patch
import os
import uuid

sys.path.insert(
    0, os.path.abspath("../..")
@@ -45,6 +46,7 @@ def _usage_format_tests(usage: litellm.Usage):
    }
    ```
    """
    print(f"usage={usage}")
    assert usage.total_tokens == usage.prompt_tokens + usage.completion_tokens

    assert usage.prompt_tokens > usage.prompt_tokens_details.cached_tokens
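The two assertions in _usage_format_tests encode the shape providers report for cached prompts: cached_tokens is counted inside prompt_tokens, and total_tokens is still prompt plus completion tokens. A toy illustration with made-up numbers, using plain namedtuples as stand-ins for litellm.Usage:

from collections import namedtuple

# Stand-ins for litellm.Usage and its prompt_tokens_details, with made-up numbers.
PromptTokensDetails = namedtuple("PromptTokensDetails", ["cached_tokens"])
Usage = namedtuple(
    "Usage",
    ["prompt_tokens", "completion_tokens", "total_tokens", "prompt_tokens_details"],
)

usage = Usage(
    prompt_tokens=2006,  # includes the cached tokens
    completion_tokens=300,
    total_tokens=2306,  # 2006 + 300
    prompt_tokens_details=PromptTokensDetails(cached_tokens=1920),
)

assert usage.total_tokens == usage.prompt_tokens + usage.completion_tokens
assert usage.prompt_tokens > usage.prompt_tokens_details.cached_tokens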
@@ -342,54 +344,75 @@ class BaseLLMChatTest(ABC):
            print("Model does not support prompt caching")
            pytest.skip("Model does not support prompt caching")

        try:
            for _ in range(2):
                response = self.completion_function(
                    **base_completion_call_args,
                    messages=[
                        # System Message
                        {
                            "role": "system",
                            "content": [
                                {
                                    "type": "text",
                                    "text": "Here is the full text of a complex legal agreement"
                                    * 400,
                                    "cache_control": {"type": "ephemeral"},
                                }
                            ],
                        },
                        # marked for caching with the cache_control parameter, so that this checkpoint can read from the previous cache.
                        {
                            "role": "user",
                            "content": [
                                {
                                    "type": "text",
                                    "text": "What are the key terms and conditions in this agreement?",
                                    "cache_control": {"type": "ephemeral"},
                                }
                            ],
                        },
                        {
                            "role": "assistant",
                            "content": "Certainly! the key terms and conditions are the following: the contract is 1 year long for $10/mo",
                        },
                        # The final turn is marked with cache-control, for continuing in followups.
                        {
                            "role": "user",
                            "content": [
                                {
                                    "type": "text",
                                    "text": "What are the key terms and conditions in this agreement?",
                                    "cache_control": {"type": "ephemeral"},
                                }
                            ],
                        },
                    ],
                    max_tokens=10,
                )
        uuid_str = str(uuid.uuid4())
        messages = [
            # System Message
            {
                "role": "system",
                "content": [
                    {
                        "type": "text",
                        "text": "Here is the full text of a complex legal agreement {}".format(
                            uuid_str
                        )
                        * 400,
                        "cache_control": {"type": "ephemeral"},
                    }
                ],
            },
            # marked for caching with the cache_control parameter, so that this checkpoint can read from the previous cache.
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": "What are the key terms and conditions in this agreement?",
                        "cache_control": {"type": "ephemeral"},
                    }
                ],
            },
            {
                "role": "assistant",
                "content": "Certainly! the key terms and conditions are the following: the contract is 1 year long for $10/mo",
            },
            # The final turn is marked with cache-control, for continuing in followups.
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": "What are the key terms and conditions in this agreement?",
                        "cache_control": {"type": "ephemeral"},
                    }
                ],
            },
        ]

        _usage_format_tests(response.usage)
        try:
            ## call 1
            response = self.completion_function(
                **base_completion_call_args,
                messages=messages,
                max_tokens=10,
            )

            initial_cost = response._hidden_params["response_cost"]
            ## call 2
            response = self.completion_function(
                **base_completion_call_args,
                messages=messages,
                max_tokens=10,
            )

            cached_cost = response._hidden_params["response_cost"]

            assert (
                cached_cost <= initial_cost
            ), "Cached cost={} should be less than initial cost={}".format(
                cached_cost, initial_cost
            )

            _usage_format_tests(response.usage)

            print("response=", response)
            print("response.usage=", response.usage)
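For orientation, the calling pattern this test exercises looks roughly like the sketch below. This is not code from the commit: the model name is a placeholder, and it assumes a prompt-caching-capable provider for which LiteLLM populates usage.prompt_tokens_details.cached_tokens and _hidden_params["response_cost"], the same fields the test above checks.

import litellm

# Placeholder model; any prompt-caching-capable model supported by LiteLLM could be used.
model = "anthropic/claude-3-5-sonnet-20240620"

messages = [
    {
        "role": "system",
        "content": [
            {
                "type": "text",
                "text": "Here is the full text of a complex legal agreement " * 400,
                # Mark this block as cacheable so later calls can read from the cache.
                "cache_control": {"type": "ephemeral"},
            }
        ],
    },
    {"role": "user", "content": "What are the key terms and conditions in this agreement?"},
]

# The first call populates the provider-side cache; the second should read from it.
first = litellm.completion(model=model, messages=messages, max_tokens=10)
second = litellm.completion(model=model, messages=messages, max_tokens=10)

print("cached prompt tokens:", second.usage.prompt_tokens_details.cached_tokens)
print("first cost:", first._hidden_params["response_cost"])
print("second cost:", second._hidden_params["response_cost"])  # expected to be <= the first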