forked from phoenix/litellm-mirror
LiteLLM Minor Fixes & Improvements (11/12/2024) (#6705)
* fix(caching): convert args to equivalent kwargs in the LLM caching handler to prevent unexpected errors
* fix(caching_handler.py): don't pass args to caching
* fix(caching): remove all *args from caching.py
* fix(caching): consistent function signatures + abc method
* test(caching_unit_tests.py): add unit tests for LLM caching, ensuring coverage for common caching scenarios across different implementations
* refactor(litellm_logging.py): move to using the cache key from hidden params instead of regenerating one
* fix(router.py): drop the redis password requirement
* fix(proxy_server.py): fix faulty slack alerting check
* fix(langfuse.py): avoid copying functions/thread lock objects in metadata; fixes a metadata copy error when a parent OTEL span is in the metadata
* test: update test
This commit is contained in:
parent d39fd60801
commit 9160d80fa5
23 changed files with 525 additions and 204 deletions
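The first fix in the message above converts positional arguments to their keyword equivalents before the caching handler builds a cache key, so a model passed positionally and a model passed as model=... resolve to the same cache entry. Below is a minimal sketch of that idea; the helper name and the completion() stub are illustrative assumptions, not LiteLLM's internal code.

# Illustrative sketch only: args_to_kwargs and the completion() stub are
# assumptions, not LiteLLM's actual caching-handler internals.
import inspect
from typing import Any, Callable, Dict, Tuple


def args_to_kwargs(func: Callable, args: Tuple[Any, ...], kwargs: Dict[str, Any]) -> Dict[str, Any]:
    """Bind positional args to parameter names so the cache key only ever sees kwargs."""
    bound = inspect.signature(func).bind_partial(*args, **kwargs)
    return dict(bound.arguments)


def completion(model, messages=None, **kwargs):
    ...  # stand-in for litellm.completion


# Positional and keyword call styles now normalize to the same mapping,
# so they hash to the same cache key.
print(args_to_kwargs(completion, ("gpt-3.5-turbo",), {"messages": [{"role": "user", "content": "hi"}]}))
print(args_to_kwargs(completion, (), {"model": "gpt-3.5-turbo", "messages": [{"role": "user", "content": "hi"}]}))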
tests/local_testing/cache_unit_tests.py (new file, 223 lines)

@@ -0,0 +1,223 @@
from abc import ABC, abstractmethod
from litellm.caching import LiteLLMCacheType
import os
import sys
import time
import traceback
import uuid

from dotenv import load_dotenv
from test_rerank import assert_response_shape

load_dotenv()
import os

sys.path.insert(
    0, os.path.abspath("../..")
)  # Adds the parent directory to the system path
import asyncio
import hashlib
import random

import pytest

import litellm
from litellm.caching import Cache
from litellm import completion, embedding


class LLMCachingUnitTests(ABC):

    @abstractmethod
    def get_cache_type(self) -> LiteLLMCacheType:
        pass

    @pytest.mark.parametrize("sync_mode", [True, False])
    @pytest.mark.asyncio
    async def test_cache_completion(self, sync_mode):
        litellm._turn_on_debug()

        random_number = random.randint(
            1, 100000
        )  # add a random number to ensure it's always adding / reading from cache
        messages = [
            {
                "role": "user",
                "content": f"write a one sentence poem about: {random_number}",
            }
        ]

        cache_type = self.get_cache_type()
        litellm.cache = Cache(
            type=cache_type,
        )

        if sync_mode:
            response1 = completion(
                "gpt-3.5-turbo",
                messages=messages,
                caching=True,
                max_tokens=20,
                mock_response="This number is so great!",
            )
        else:
            response1 = await litellm.acompletion(
                "gpt-3.5-turbo",
                messages=messages,
                caching=True,
                max_tokens=20,
                mock_response="This number is so great!",
            )
        # response2 is mocked to a different response from response1,
        # but the completion from the cache should be used instead of the mock
        # response since the input is the same as response1
        await asyncio.sleep(0.5)
        if sync_mode:
            response2 = completion(
                "gpt-3.5-turbo",
                messages=messages,
                caching=True,
                max_tokens=20,
                mock_response="This number is great!",
            )
        else:
            response2 = await litellm.acompletion(
                "gpt-3.5-turbo",
                messages=messages,
                caching=True,
                max_tokens=20,
                mock_response="This number is great!",
            )
        if (
            response1["choices"][0]["message"]["content"]
            != response2["choices"][0]["message"]["content"]
        ):  # 1 and 2 should be the same
            # 1 & 2 have the exact same input params. This MUST be a CACHE HIT
            print(f"response1: {response1}")
            print(f"response2: {response2}")
            pytest.fail(
                f"Error occurred: response1 - {response1['choices'][0]['message']['content']} != response2 - {response2['choices'][0]['message']['content']}"
            )
        # Since the parameters are not the same as response1, response3 should actually
        # be the mock response
        if sync_mode:
            response3 = completion(
                "gpt-3.5-turbo",
                messages=messages,
                caching=True,
                temperature=0.5,
                mock_response="This number is awful!",
            )
        else:
            response3 = await litellm.acompletion(
                "gpt-3.5-turbo",
                messages=messages,
                caching=True,
                temperature=0.5,
                mock_response="This number is awful!",
            )

        print("\nresponse 1", response1)
        print("\nresponse 2", response2)
        print("\nresponse 3", response3)
        # print("\nresponse 4", response4)
        litellm.cache = None
        litellm.success_callback = []
        litellm._async_success_callback = []

        # 1 & 2 should be exactly the same
        # 1 & 3 should be different, since input params are diff

        if (
            response1["choices"][0]["message"]["content"]
            == response3["choices"][0]["message"]["content"]
        ):
            # if input params like max_tokens, temperature are diff it should NOT be a cache hit
            print(f"response1: {response1}")
            print(f"response3: {response3}")
            pytest.fail(
                "Response 1 == response 3. Same model, diff params should not cache."
            )

        assert response1.id == response2.id
        assert response1.created == response2.created
        assert (
            response1.choices[0].message.content == response2.choices[0].message.content
        )

    @pytest.mark.parametrize("sync_mode", [True, False])
    @pytest.mark.asyncio
    async def test_disk_cache_embedding(self, sync_mode):
        litellm._turn_on_debug()

        random_number = random.randint(
            1, 100000
        )  # add a random number to ensure it's always adding / reading from cache
        input = [f"hello {random_number}"]
        litellm.cache = Cache(
            type="disk",
        )

        if sync_mode:
            response1 = embedding(
                "openai/text-embedding-ada-002",
                input=input,
                caching=True,
            )
        else:
            response1 = await litellm.aembedding(
                "openai/text-embedding-ada-002",
                input=input,
                caching=True,
            )
        # response2 is mocked to a different response from response1,
        # but the completion from the cache should be used instead of the mock
        # response since the input is the same as response1
        await asyncio.sleep(0.5)
        if sync_mode:
            response2 = embedding(
                "openai/text-embedding-ada-002",
                input=input,
                caching=True,
            )
        else:
            response2 = await litellm.aembedding(
                "openai/text-embedding-ada-002",
                input=input,
                caching=True,
            )

        if response2._hidden_params["cache_hit"] is not True:
            pytest.fail("Cache hit should be True")

        # Since the parameters are not the same as response1, response3 should actually
        # be the mock response
        if sync_mode:
            response3 = embedding(
                "openai/text-embedding-ada-002",
                input=input,
                user="charlie",
                caching=True,
            )
        else:
            response3 = await litellm.aembedding(
                "openai/text-embedding-ada-002",
                input=input,
                caching=True,
                user="charlie",
            )

        print("\nresponse 1", response1)
        print("\nresponse 2", response2)
        print("\nresponse 3", response3)
        # print("\nresponse 4", response4)
        litellm.cache = None
        litellm.success_callback = []
        litellm._async_success_callback = []

        # 1 & 2 should be exactly the same
        # 1 & 3 should be different, since input params are diff

        if response3._hidden_params.get("cache_hit") is True:
            pytest.fail("Cache hit should not be True")
@@ -438,7 +438,7 @@ async def test_send_daily_reports_ignores_zero_values():
     slack_alerting.internal_usage_cache.async_batch_get_cache = AsyncMock(
         return_value=[None, 0, 10, 0, 0, None]
     )
-    slack_alerting.internal_usage_cache.async_batch_set_cache = AsyncMock()
+    slack_alerting.internal_usage_cache.async_set_cache_pipeline = AsyncMock()

     router.get_model_info.side_effect = lambda x: {"litellm_params": {"model": x}}
@@ -1103,81 +1103,6 @@ async def test_redis_cache_acompletion_stream_bedrock():
         raise e


-def test_disk_cache_completion():
-    litellm.set_verbose = False
-
-    random_number = random.randint(
-        1, 100000
-    )  # add a random number to ensure it's always adding / reading from cache
-    messages = [
-        {"role": "user", "content": f"write a one sentence poem about: {random_number}"}
-    ]
-    litellm.cache = Cache(
-        type="disk",
-    )
-
-    response1 = completion(
-        model="gpt-3.5-turbo",
-        messages=messages,
-        caching=True,
-        max_tokens=20,
-        mock_response="This number is so great!",
-    )
-    # response2 is mocked to a different response from response1,
-    # but the completion from the cache should be used instead of the mock
-    # response since the input is the same as response1
-    response2 = completion(
-        model="gpt-3.5-turbo",
-        messages=messages,
-        caching=True,
-        max_tokens=20,
-        mock_response="This number is awful!",
-    )
-    # Since the parameters are not the same as response1, response3 should actually
-    # be the mock response
-    response3 = completion(
-        model="gpt-3.5-turbo",
-        messages=messages,
-        caching=True,
-        temperature=0.5,
-        mock_response="This number is awful!",
-    )
-
-    print("\nresponse 1", response1)
-    print("\nresponse 2", response2)
-    print("\nresponse 3", response3)
-    # print("\nresponse 4", response4)
-    litellm.cache = None
-    litellm.success_callback = []
-    litellm._async_success_callback = []
-
-    # 1 & 2 should be exactly the same
-    # 1 & 3 should be different, since input params are diff
-    if (
-        response1["choices"][0]["message"]["content"]
-        != response2["choices"][0]["message"]["content"]
-    ):  # 1 and 2 should be the same
-        # 1&2 have the exact same input params. This MUST Be a CACHE HIT
-        print(f"response1: {response1}")
-        print(f"response2: {response2}")
-        pytest.fail(f"Error occurred:")
-    if (
-        response1["choices"][0]["message"]["content"]
-        == response3["choices"][0]["message"]["content"]
-    ):
-        # if input params like max_tokens, temperature are diff it should NOT be a cache hit
-        print(f"response1: {response1}")
-        print(f"response3: {response3}")
-        pytest.fail(
-            f"Response 1 == response 3. Same model, diff params shoudl not cache Error"
-            f" occurred:"
-        )
-
-    assert response1.id == response2.id
-    assert response1.created == response2.created
-    assert response1.choices[0].message.content == response2.choices[0].message.content
-
-
 # @pytest.mark.skip(reason="AWS Suspended Account")
 @pytest.mark.parametrize("sync_mode", [True, False])
 @pytest.mark.asyncio
tests/local_testing/test_disk_cache_unit_tests.py (new file, 11 lines)

@@ -0,0 +1,11 @@
from cache_unit_tests import LLMCachingUnitTests
from litellm.caching import LiteLLMCacheType


class TestDiskCacheUnitTests(LLMCachingUnitTests):
    def get_cache_type(self) -> LiteLLMCacheType:
        return LiteLLMCacheType.DISK


# if __name__ == "__main__":
#     pytest.main([__file__, "-v", "-s"])
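The disk backend is the only concrete subclass added in this commit, but the point of the ABC is that another backend only needs to override get_cache_type. A hypothetical variant for a second backend, assuming a REDIS member on the LiteLLMCacheType enum, might look like this (not part of this commit):

# Hypothetical example, not part of this commit: reusing the shared suite for
# another backend only requires returning a different LiteLLMCacheType member.
from cache_unit_tests import LLMCachingUnitTests
from litellm.caching import LiteLLMCacheType


class TestRedisCacheUnitTests(LLMCachingUnitTests):
    def get_cache_type(self) -> LiteLLMCacheType:
        return LiteLLMCacheType.REDIS

Note that test_disk_cache_embedding in the base class still hard-codes type="disk", so as written only test_cache_completion is fully backend-agnostic.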
@@ -146,7 +146,7 @@ async def test_dual_cache_batch_operations(is_async):

     # Set values
     if is_async:
-        await dual_cache.async_batch_set_cache(cache_list)
+        await dual_cache.async_set_cache_pipeline(cache_list)
     else:
         for key, value in cache_list:
             dual_cache.set_cache(key, value)
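This hunk and the slack-alerting hunk above make the same swap, from async_batch_set_cache to async_set_cache_pipeline, in line with the commit's "consistent function signatures" fix. Below is a minimal usage sketch of the call pattern the updated test exercises; the DualCache() construction, the import path, and the async_get_cache read-back are assumptions rather than something shown in this diff.

# Sketch only: assumes DualCache() defaults to an in-memory backing store and
# exposes async_get_cache(key); the pipeline write mirrors the test above.
import asyncio

from litellm.caching import DualCache


async def warm_cache() -> None:
    dual_cache = DualCache()
    cache_list = [(f"key_{i}", f"value_{i}") for i in range(3)]
    # one pipelined write for the whole batch, replacing the old async_batch_set_cache call
    await dual_cache.async_set_cache_pipeline(cache_list)
    print(await dual_cache.async_get_cache("key_0"))


asyncio.run(warm_cache())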
@@ -212,26 +212,48 @@ def test_get_langfuse_logger_for_request_with_cached_logger():
     assert result == cached_logger
     mock_cache.get_cache.assert_called_once()

-@pytest.mark.parametrize("metadata", [
-    {'a': 1, 'b': 2, 'c': 3},
-    {'a': {'nested_a': 1}, 'b': {'nested_b': 2}},
-    {'a': [1, 2, 3], 'b': {4, 5, 6}},
-    {'a': (1, 2), 'b': frozenset([3, 4]), 'c': {'d': [5, 6]}},
-    {'lock': threading.Lock()},
-    {'func': lambda x: x + 1},
-    {
-        'int': 42,
-        'str': 'hello',
-        'list': [1, 2, 3],
-        'set': {4, 5},
-        'dict': {'nested': 'value'},
-        'non_copyable': threading.Lock(),
-        'function': print
-    },
-    ['list', 'not', 'a', 'dict'],
-    {'timestamp': datetime.now()},
-    {},
-    None,
-])
-def test_langfuse_logger_prepare_metadata(metadata):
-    global_langfuse_logger._prepare_metadata(metadata)
+
+@pytest.mark.parametrize(
+    "metadata, expected_metadata",
+    [
+        ({"a": 1, "b": 2, "c": 3}, {"a": 1, "b": 2, "c": 3}),
+        (
+            {"a": {"nested_a": 1}, "b": {"nested_b": 2}},
+            {"a": {"nested_a": 1}, "b": {"nested_b": 2}},
+        ),
+        ({"a": [1, 2, 3], "b": {4, 5, 6}}, {"a": [1, 2, 3], "b": {4, 5, 6}}),
+        (
+            {"a": (1, 2), "b": frozenset([3, 4]), "c": {"d": [5, 6]}},
+            {"a": (1, 2), "b": frozenset([3, 4]), "c": {"d": [5, 6]}},
+        ),
+        ({"lock": threading.Lock()}, {}),
+        ({"func": lambda x: x + 1}, {}),
+        (
+            {
+                "int": 42,
+                "str": "hello",
+                "list": [1, 2, 3],
+                "set": {4, 5},
+                "dict": {"nested": "value"},
+                "non_copyable": threading.Lock(),
+                "function": print,
+            },
+            {
+                "int": 42,
+                "str": "hello",
+                "list": [1, 2, 3],
+                "set": {4, 5},
+                "dict": {"nested": "value"},
+            },
+        ),
+        (
+            {"list": ["list", "not", "a", "dict"]},
+            {"list": ["list", "not", "a", "dict"]},
+        ),
+        ({}, {}),
+        (None, None),
+    ],
+)
+def test_langfuse_logger_prepare_metadata(metadata, expected_metadata):
+    result = global_langfuse_logger._prepare_metadata(metadata)
+    assert result == expected_metadata
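The expected values encode the langfuse fix from the commit message: metadata entries that cannot be safely deep-copied, such as thread locks and functions, are dropped instead of raising, while plain containers and None pass through untouched. A minimal sketch of that behavior, not LiteLLM's actual _prepare_metadata implementation, which satisfies the parametrized cases above:

# Sketch of the asserted behavior only; the real helper in litellm's langfuse
# integration may differ in naming and structure.
import copy
from typing import Any, Dict, Optional


def prepare_metadata(metadata: Optional[Dict[str, Any]]) -> Optional[Dict[str, Any]]:
    if not isinstance(metadata, dict):
        return metadata  # e.g. None comes back unchanged
    cleaned: Dict[str, Any] = {}
    for key, value in metadata.items():
        if callable(value):
            continue  # drop lambdas and functions such as print
        try:
            cleaned[key] = copy.deepcopy(value)  # keeps lists, sets, tuples, nested dicts
        except (TypeError, copy.Error):
            continue  # drop thread locks and other non-copyable objects
    return cleaned

Copying value by value is what keeps one bad entry, like a lock injected via a parent OTEL span, from taking down the whole metadata dict.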