forked from phoenix/litellm-mirror
test(caching_unit_tests.py): add unit tests for llm caching
Ensures coverage for common caching scenarios across the different cache implementations.
parent 0bc9864c09
commit 16bbed72d4
5 changed files with 244 additions and 188 deletions
@@ -595,6 +595,7 @@ class LLMCachingHandler:
                     model_response_object=EmbeddingResponse(),
                     response_type="embedding",
                 )
             elif (
                 call_type == CallTypes.arerank.value or call_type == CallTypes.rerank.value
             ) and isinstance(cached_result, dict):
@@ -618,6 +619,13 @@ class LLMCachingHandler:
                     response_type="audio_transcription",
                     hidden_params=hidden_params,
                 )
+
+            if (
+                hasattr(cached_result, "_hidden_params")
+                and cached_result._hidden_params is not None
+                and isinstance(cached_result._hidden_params, dict)
+            ):
+                cached_result._hidden_params["cache_hit"] = True
             return cached_result

     def _convert_cached_stream_response(
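The block added above stamps a cache_hit flag into the cached object's _hidden_params before the cached result is returned, which is what the new unit tests assert on. A minimal sketch of how calling code might read that flag back, assuming only that the response object carries a _hidden_params dict (as the tests below do):

# Hedged sketch, not a litellm API: read back the cache_hit flag set above.
def was_cache_hit(response) -> bool:
    hidden = getattr(response, "_hidden_params", None)
    # the handler only sets the flag when _hidden_params is a dict
    return isinstance(hidden, dict) and hidden.get("cache_hit") is True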
@@ -796,7 +796,7 @@ def client(original_function):  # noqa: PLR0915
                 and kwargs.get("_arealtime", False) is not True
             ):  # allow users to control returning cached responses from the completion function
                 # checking cache
-                print_verbose("INSIDE CHECKING CACHE")
+                verbose_logger.debug("INSIDE CHECKING SYNC CACHE")
                 caching_handler_response: CachingHandlerResponse = (
                     _llm_caching_handler._sync_get_cache(
                         model=model or "",
@@ -808,6 +808,7 @@ def client(original_function):  # noqa: PLR0915
                         args=args,
                     )
                 )

                 if caching_handler_response.cached_result is not None:
                     return caching_handler_response.cached_result
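The decorator's sync path consults the caching handler before calling the wrapped completion function and short-circuits on a hit. A rough sketch of that check-then-call shape, using hypothetical stand-ins (lookup_cache, call_provider) rather than litellm's internals:

# Hedged sketch of the check-cache-then-call flow; lookup_cache and
# call_provider are hypothetical stand-ins, not litellm functions.
def cached_call(model, messages, lookup_cache, call_provider):
    cached = lookup_cache(model=model, messages=messages)
    if cached is not None:
        return cached  # cache hit: skip the provider call
    # cache miss: call the provider (a callback may write the result back)
    return call_provider(model=model, messages=messages)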
tests/local_testing/cache_unit_tests.py (new file, 223 additions)
@@ -0,0 +1,223 @@
from abc import ABC, abstractmethod

from litellm.caching import LiteLLMCacheType
import os
import sys
import time
import traceback
import uuid

from dotenv import load_dotenv
from test_rerank import assert_response_shape

load_dotenv()
import os

sys.path.insert(
    0, os.path.abspath("../..")
)  # Adds the parent directory to the system path
import asyncio
import hashlib
import random

import pytest

import litellm
from litellm.caching import Cache
from litellm import completion, embedding


class LLMCachingUnitTests(ABC):

    @abstractmethod
    def get_cache_type(self) -> LiteLLMCacheType:
        pass

    @pytest.mark.parametrize("sync_mode", [True, False])
    @pytest.mark.asyncio
    async def test_cache_completion(self, sync_mode):
        litellm._turn_on_debug()

        random_number = random.randint(
            1, 100000
        )  # add a random number to ensure it's always adding / reading from cache
        messages = [
            {
                "role": "user",
                "content": f"write a one sentence poem about: {random_number}",
            }
        ]

        cache_type = self.get_cache_type()
        litellm.cache = Cache(
            type=cache_type,
        )

        if sync_mode:
            response1 = completion(
                "gpt-3.5-turbo",
                messages=messages,
                caching=True,
                max_tokens=20,
                mock_response="This number is so great!",
            )
        else:
            response1 = await litellm.acompletion(
                "gpt-3.5-turbo",
                messages=messages,
                caching=True,
                max_tokens=20,
                mock_response="This number is so great!",
            )
        # response2 is mocked to a different response from response1,
        # but the completion from the cache should be used instead of the mock
        # response since the input is the same as response1
        await asyncio.sleep(0.5)
        if sync_mode:
            response2 = completion(
                "gpt-3.5-turbo",
                messages=messages,
                caching=True,
                max_tokens=20,
                mock_response="This number is great!",
            )
        else:
            response2 = await litellm.acompletion(
                "gpt-3.5-turbo",
                messages=messages,
                caching=True,
                max_tokens=20,
                mock_response="This number is great!",
            )
        if (
            response1["choices"][0]["message"]["content"]
            != response2["choices"][0]["message"]["content"]
        ):  # 1 and 2 should be the same
            # 1 & 2 have the exact same input params. This MUST be a cache hit
            print(f"response1: {response1}")
            print(f"response2: {response2}")
            pytest.fail(
                f"Error occurred: response1 - {response1['choices'][0]['message']['content']} != response2 - {response2['choices'][0]['message']['content']}"
            )
        # Since the parameters are not the same as response1, response3 should actually
        # be the mock response
        if sync_mode:
            response3 = completion(
                "gpt-3.5-turbo",
                messages=messages,
                caching=True,
                temperature=0.5,
                mock_response="This number is awful!",
            )
        else:
            response3 = await litellm.acompletion(
                "gpt-3.5-turbo",
                messages=messages,
                caching=True,
                temperature=0.5,
                mock_response="This number is awful!",
            )

        print("\nresponse 1", response1)
        print("\nresponse 2", response2)
        print("\nresponse 3", response3)
        # print("\nresponse 4", response4)
        litellm.cache = None
        litellm.success_callback = []
        litellm._async_success_callback = []

        # 1 & 2 should be exactly the same
        # 1 & 3 should be different, since input params are diff

        if (
            response1["choices"][0]["message"]["content"]
            == response3["choices"][0]["message"]["content"]
        ):
            # if input params like max_tokens, temperature are diff it should NOT be a cache hit
            print(f"response1: {response1}")
            print(f"response3: {response3}")
            pytest.fail(
                "Response 1 == response 3. Same model, diff params should not cache."
            )

        assert response1.id == response2.id
        assert response1.created == response2.created
        assert (
            response1.choices[0].message.content == response2.choices[0].message.content
        )

    @pytest.mark.parametrize("sync_mode", [True, False])
    @pytest.mark.asyncio
    async def test_cache_embedding(self, sync_mode):
        litellm._turn_on_debug()

        random_number = random.randint(
            1, 100000
        )  # add a random number to ensure it's always adding / reading from cache
        input = [f"hello {random_number}"]
        litellm.cache = Cache(
            type=self.get_cache_type(),
        )

        if sync_mode:
            response1 = embedding(
                "openai/text-embedding-ada-002",
                input=input,
                caching=True,
            )
        else:
            response1 = await litellm.aembedding(
                "openai/text-embedding-ada-002",
                input=input,
                caching=True,
            )
        # response2 should be served from the cache instead of hitting the
        # provider again, since the input is the same as response1
        await asyncio.sleep(0.5)
        if sync_mode:
            response2 = embedding(
                "openai/text-embedding-ada-002",
                input=input,
                caching=True,
            )
        else:
            response2 = await litellm.aembedding(
                "openai/text-embedding-ada-002",
                input=input,
                caching=True,
            )

        if response2._hidden_params["cache_hit"] is not True:
            pytest.fail("Cache hit should be True")

        # Since the parameters are not the same as response1, response3 should
        # be a fresh (non-cached) response
        if sync_mode:
            response3 = embedding(
                "openai/text-embedding-ada-002",
                input=input,
                user="charlie",
                caching=True,
            )
        else:
            response3 = await litellm.aembedding(
                "openai/text-embedding-ada-002",
                input=input,
                caching=True,
                user="charlie",
            )

        print("\nresponse 1", response1)
        print("\nresponse 2", response2)
        print("\nresponse 3", response3)
        # print("\nresponse 4", response4)
        litellm.cache = None
        litellm.success_callback = []
        litellm._async_success_callback = []

        # 1 & 2 should be exactly the same
        # 1 & 3 should be different, since input params are diff

        if response3._hidden_params.get("cache_hit") is True:
            pytest.fail("Cache hit should not be True")
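The assertions above hinge on how the cache key is derived from the call parameters: identical params must map to the same entry, while a changed max_tokens, temperature, or user must miss. The toy key below only illustrates that idea under the assumption of a canonical-JSON hash; it is not litellm's actual key function:

# Illustration only (NOT litellm's key function): hash the canonicalised
# kwargs so identical calls collide and any changed param misses.
import hashlib
import json

def toy_cache_key(**kwargs) -> str:
    canonical = json.dumps(kwargs, sort_keys=True, default=str)
    return hashlib.sha256(canonical.encode("utf-8")).hexdigest()

msgs = [{"role": "user", "content": "write a one sentence poem about: 42"}]
key_a = toy_cache_key(model="gpt-3.5-turbo", messages=msgs, max_tokens=20)
key_b = toy_cache_key(model="gpt-3.5-turbo", messages=msgs, max_tokens=20)
key_c = toy_cache_key(model="gpt-3.5-turbo", messages=msgs, temperature=0.5)
assert key_a == key_b  # same params -> same key -> cache hit
assert key_a != key_c  # diff params -> diff key -> cache miss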
@@ -1103,193 +1103,6 @@ async def test_redis_cache_acompletion_stream_bedrock():
         raise e


-@pytest.mark.parametrize("sync_mode", [True, False])
-@pytest.mark.asyncio
-async def test_disk_cache_completion(sync_mode):
-    litellm._turn_on_debug()
-
-    random_number = random.randint(
-        1, 100000
-    )  # add a random number to ensure it's always adding / reading from cache
-    messages = [
-        {"role": "user", "content": f"write a one sentence poem about: {random_number}"}
-    ]
-    litellm.cache = Cache(
-        type="disk",
-    )
-
-    if sync_mode:
-        response1 = completion(
-            "gpt-3.5-turbo",
-            messages=messages,
-            caching=True,
-            max_tokens=20,
-            mock_response="This number is so great!",
-        )
-    else:
-        response1 = await litellm.acompletion(
-            "gpt-3.5-turbo",
-            messages=messages,
-            caching=True,
-            max_tokens=20,
-            mock_response="This number is so great!",
-        )
-    # response2 is mocked to a different response from response1,
-    # but the completion from the cache should be used instead of the mock
-    # response since the input is the same as response1
-    await asyncio.sleep(0.5)
-    if sync_mode:
-        response2 = completion(
-            "gpt-3.5-turbo",
-            messages=messages,
-            caching=True,
-            max_tokens=20,
-            mock_response="This number is great!",
-        )
-    else:
-        response2 = await litellm.acompletion(
-            "gpt-3.5-turbo",
-            messages=messages,
-            caching=True,
-            max_tokens=20,
-            mock_response="This number is great!",
-        )
-    if (
-        response1["choices"][0]["message"]["content"]
-        != response2["choices"][0]["message"]["content"]
-    ):  # 1 and 2 should be the same
-        # 1&2 have the exact same input params. This MUST Be a CACHE HIT
-        print(f"response1: {response1}")
-        print(f"response2: {response2}")
-        pytest.fail(
-            f"Error occurred: response1 - {response1['choices'][0]['message']['content']} != response2 - {response2['choices'][0]['message']['content']}"
-        )
-    # Since the parameters are not the same as response1, response3 should actually
-    # be the mock response
-    if sync_mode:
-        response3 = completion(
-            "gpt-3.5-turbo",
-            messages=messages,
-            caching=True,
-            temperature=0.5,
-            mock_response="This number is awful!",
-        )
-    else:
-        response3 = await litellm.acompletion(
-            "gpt-3.5-turbo",
-            messages=messages,
-            caching=True,
-            temperature=0.5,
-            mock_response="This number is awful!",
-        )
-
-    print("\nresponse 1", response1)
-    print("\nresponse 2", response2)
-    print("\nresponse 3", response3)
-    # print("\nresponse 4", response4)
-    litellm.cache = None
-    litellm.success_callback = []
-    litellm._async_success_callback = []
-
-    # 1 & 2 should be exactly the same
-    # 1 & 3 should be different, since input params are diff
-
-    if (
-        response1["choices"][0]["message"]["content"]
-        == response3["choices"][0]["message"]["content"]
-    ):
-        # if input params like max_tokens, temperature are diff it should NOT be a cache hit
-        print(f"response1: {response1}")
-        print(f"response3: {response3}")
-        pytest.fail(
-            f"Response 1 == response 3. Same model, diff params shoudl not cache Error"
-            f" occurred:"
-        )
-
-    assert response1.id == response2.id
-    assert response1.created == response2.created
-    assert response1.choices[0].message.content == response2.choices[0].message.content
-
-
-@pytest.mark.parametrize("sync_mode", [True, False])
-@pytest.mark.asyncio
-async def test_disk_cache_embedding(sync_mode):
-    litellm._turn_on_debug()
-
-    random_number = random.randint(
-        1, 100000
-    )  # add a random number to ensure it's always adding / reading from cache
-    input = [f"hello {random_number}"]
-    litellm.cache = Cache(
-        type="disk",
-    )
-
-    if sync_mode:
-        response1 = embedding(
-            "openai/text-embedding-ada-002",
-            input=input,
-            caching=True,
-        )
-    else:
-        response1 = await litellm.aembedding(
-            "openai/text-embedding-ada-002",
-            input=input,
-            caching=True,
-        )
-    # response2 is mocked to a different response from response1,
-    # but the completion from the cache should be used instead of the mock
-    # response since the input is the same as response1
-    await asyncio.sleep(0.5)
-    if sync_mode:
-        response2 = embedding(
-            "openai/text-embedding-ada-002",
-            input=input,
-            caching=True,
-        )
-    else:
-        response2 = await litellm.aembedding(
-            "openai/text-embedding-ada-002",
-            input=input,
-            caching=True,
-        )
-
-    if response2._hidden_params["cache_hit"] is not True:
-        pytest.fail("Cache hit should be True")
-    assert response1.id == response2.id
-    # Since the parameters are not the same as response1, response3 should actually
-    # be the mock response
-    if sync_mode:
-        response3 = embedding(
-            "openai/text-embedding-ada-002",
-            input=input,
-            user="charlie",
-            caching=True,
-        )
-    else:
-        response3 = await litellm.acompletion(
-            "openai/text-embedding-ada-002",
-            input=input,
-            caching=True,
-            user="charlie",
-        )
-
-    print("\nresponse 1", response1)
-    print("\nresponse 2", response2)
-    print("\nresponse 3", response3)
-    # print("\nresponse 4", response4)
-    litellm.cache = None
-    litellm.success_callback = []
-    litellm._async_success_callback = []
-
-    # 1 & 2 should be exactly the same
-    # 1 & 3 should be different, since input params are diff
-
-    if response3._hidden_params["cache_hit"] is True:
-        pytest.fail("Cache hit should not be True")
-
-    assert response1.id != response3.id
-
-
 # @pytest.mark.skip(reason="AWS Suspended Account")
 @pytest.mark.parametrize("sync_mode", [True, False])
 @pytest.mark.asyncio
tests/local_testing/test_disk_cache_unit_tests.py (new file, 11 additions)
@@ -0,0 +1,11 @@
from cache_unit_tests import LLMCachingUnitTests
from litellm.caching import LiteLLMCacheType


class TestDiskCacheUnitTests(LLMCachingUnitTests):
    def get_cache_type(self) -> LiteLLMCacheType:
        return LiteLLMCacheType.DISK


# if __name__ == "__main__":
#     pytest.main([__file__, "-v", "-s"])
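The concrete class above is all a backend needs: subclass the shared suite and return its cache type. A hypothetical second subclass (not part of this commit) would look the same, assuming the LiteLLMCacheType enum exposes a member for that backend, e.g. REDIS:

# Hypothetical example, not in this commit: reuse the shared suite for a
# Redis-backed cache (assumes LiteLLMCacheType.REDIS exists and Redis
# connection env vars are configured).
from cache_unit_tests import LLMCachingUnitTests
from litellm.caching import LiteLLMCacheType


class TestRedisCacheUnitTests(LLMCachingUnitTests):
    def get_cache_type(self) -> LiteLLMCacheType:
        return LiteLLMCacheType.REDIS

# Either suite can be run directly with pytest, e.g.:
#   pytest tests/local_testing/test_disk_cache_unit_tests.py -v -s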