diff --git a/litellm/caching/caching_handler.py b/litellm/caching/caching_handler.py
index 9cf6c4189..11ae600b7 100644
--- a/litellm/caching/caching_handler.py
+++ b/litellm/caching/caching_handler.py
@@ -595,6 +595,7 @@ class LLMCachingHandler:
                 model_response_object=EmbeddingResponse(),
                 response_type="embedding",
             )
+
         elif (
             call_type == CallTypes.arerank.value or call_type == CallTypes.rerank.value
         ) and isinstance(cached_result, dict):
@@ -618,6 +619,13 @@ class LLMCachingHandler:
                 response_type="audio_transcription",
                 hidden_params=hidden_params,
             )
+
+        if (
+            hasattr(cached_result, "_hidden_params")
+            and cached_result._hidden_params is not None
+            and isinstance(cached_result._hidden_params, dict)
+        ):
+            cached_result._hidden_params["cache_hit"] = True
         return cached_result
 
     def _convert_cached_stream_response(
diff --git a/litellm/utils.py b/litellm/utils.py
index d07d86f7d..a8b7f32b4 100644
--- a/litellm/utils.py
+++ b/litellm/utils.py
@@ -796,7 +796,7 @@ def client(original_function):  # noqa: PLR0915
                 and kwargs.get("_arealtime", False) is not True
             ):  # allow users to control returning cached responses from the completion function
                 # checking cache
-                print_verbose("INSIDE CHECKING CACHE")
+                verbose_logger.debug("INSIDE CHECKING SYNC CACHE")
                 caching_handler_response: CachingHandlerResponse = (
                     _llm_caching_handler._sync_get_cache(
                         model=model or "",
@@ -808,6 +808,7 @@ def client(original_function):  # noqa: PLR0915
                         args=args,
                     )
                 )
+
                 if caching_handler_response.cached_result is not None:
                     return caching_handler_response.cached_result
 
diff --git a/tests/local_testing/cache_unit_tests.py b/tests/local_testing/cache_unit_tests.py
new file mode 100644
index 000000000..da56c773f
--- /dev/null
+++ b/tests/local_testing/cache_unit_tests.py
@@ -0,0 +1,223 @@
+from abc import ABC, abstractmethod
+from litellm.caching import LiteLLMCacheType
+import os
+import sys
+import time
+import traceback
+import uuid
+
+from dotenv import load_dotenv
+from test_rerank import assert_response_shape
+
+load_dotenv()
+import os
+
+sys.path.insert(
+    0, os.path.abspath("../..")
+)  # Adds the parent directory to the system path
+import asyncio
+import hashlib
+import random
+
+import pytest
+
+import litellm
+from litellm.caching import Cache
+from litellm import completion, embedding
+
+
+class LLMCachingUnitTests(ABC):
+
+    @abstractmethod
+    def get_cache_type(self) -> LiteLLMCacheType:
+        pass
+
+    @pytest.mark.parametrize("sync_mode", [True, False])
+    @pytest.mark.asyncio
+    async def test_cache_completion(self, sync_mode):
+        litellm._turn_on_debug()
+
+        random_number = random.randint(
+            1, 100000
+        )  # add a random number to ensure it's always adding / reading from cache
+        messages = [
+            {
+                "role": "user",
+                "content": f"write a one sentence poem about: {random_number}",
+            }
+        ]
+
+        cache_type = self.get_cache_type()
+        litellm.cache = Cache(
+            type=cache_type,
+        )
+
+        if sync_mode:
+            response1 = completion(
+                "gpt-3.5-turbo",
+                messages=messages,
+                caching=True,
+                max_tokens=20,
+                mock_response="This number is so great!",
+            )
+        else:
+            response1 = await litellm.acompletion(
+                "gpt-3.5-turbo",
+                messages=messages,
+                caching=True,
+                max_tokens=20,
+                mock_response="This number is so great!",
+            )
+        # response2 is mocked to a different response from response1,
+        # but the completion from the cache should be used instead of the mock
+        # response since the input is the same as response1
+        await asyncio.sleep(0.5)
+        if sync_mode:
+            response2 = completion(
+                "gpt-3.5-turbo",
+                messages=messages,
+                caching=True,
+                max_tokens=20,
+                mock_response="This number is great!",
+            )
+        else:
+            response2 = await litellm.acompletion(
+                "gpt-3.5-turbo",
+                messages=messages,
+                caching=True,
+                max_tokens=20,
+                mock_response="This number is great!",
+            )
+        if (
+            response1["choices"][0]["message"]["content"]
+            != response2["choices"][0]["message"]["content"]
+        ):  # 1 and 2 should be the same
+            # 1&2 have the exact same input params. This MUST Be a CACHE HIT
+            print(f"response1: {response1}")
+            print(f"response2: {response2}")
+            pytest.fail(
+                f"Error occurred: response1 - {response1['choices'][0]['message']['content']} != response2 - {response2['choices'][0]['message']['content']}"
+            )
+        # Since the parameters are not the same as response1, response3 should actually
+        # be the mock response
+        if sync_mode:
+            response3 = completion(
+                "gpt-3.5-turbo",
+                messages=messages,
+                caching=True,
+                temperature=0.5,
+                mock_response="This number is awful!",
+            )
+        else:
+            response3 = await litellm.acompletion(
+                "gpt-3.5-turbo",
+                messages=messages,
+                caching=True,
+                temperature=0.5,
+                mock_response="This number is awful!",
+            )
+
+        print("\nresponse 1", response1)
+        print("\nresponse 2", response2)
+        print("\nresponse 3", response3)
+        # print("\nresponse 4", response4)
+        litellm.cache = None
+        litellm.success_callback = []
+        litellm._async_success_callback = []
+
+        # 1 & 2 should be exactly the same
+        # 1 & 3 should be different, since input params are diff
+
+        if (
+            response1["choices"][0]["message"]["content"]
+            == response3["choices"][0]["message"]["content"]
+        ):
+            # if input params like max_tokens, temperature are diff it should NOT be a cache hit
+            print(f"response1: {response1}")
+            print(f"response3: {response3}")
+            pytest.fail(
+                "Response 1 == response 3. Same model with different params "
+                "should not be a cache hit."
+            )
+
+        assert response1.id == response2.id
+        assert response1.created == response2.created
+        assert (
+            response1.choices[0].message.content == response2.choices[0].message.content
+        )
+
+    @pytest.mark.parametrize("sync_mode", [True, False])
+    @pytest.mark.asyncio
+    async def test_cache_embedding(self, sync_mode):
+        litellm._turn_on_debug()
+
+        random_number = random.randint(
+            1, 100000
+        )  # add a random number to ensure it's always adding / reading from cache
+        input = [f"hello {random_number}"]
+        litellm.cache = Cache(
+            type=self.get_cache_type(),
+        )
+
+        if sync_mode:
+            response1 = embedding(
+                "openai/text-embedding-ada-002",
+                input=input,
+                caching=True,
+            )
+        else:
+            response1 = await litellm.aembedding(
+                "openai/text-embedding-ada-002",
+                input=input,
+                caching=True,
+            )
+        # response2 uses the same input as response1, so the embedding should be
+        # served from the cache rather than re-computed, and the response should
+        # be flagged as a cache hit
+        await asyncio.sleep(0.5)
+        if sync_mode:
+            response2 = embedding(
+                "openai/text-embedding-ada-002",
+                input=input,
+                caching=True,
+            )
+        else:
+            response2 = await litellm.aembedding(
+                "openai/text-embedding-ada-002",
+                input=input,
+                caching=True,
+            )
+
+        if response2._hidden_params["cache_hit"] is not True:
+            pytest.fail("Cache hit should be True")
+
+        # Since the parameters are not the same as response1, response3 should
+        # not be served from the cache
+        if sync_mode:
+            response3 = embedding(
+                "openai/text-embedding-ada-002",
+                input=input,
+                user="charlie",
+                caching=True,
+            )
+        else:
+            response3 = await litellm.aembedding(
+                "openai/text-embedding-ada-002",
+                input=input,
+                caching=True,
+                user="charlie",
+            )
+
+        print("\nresponse 1", response1)
+        print("\nresponse 2", response2)
+        print("\nresponse 3", response3)
+        # print("\nresponse 4", response4)
+        litellm.cache = None
+        litellm.success_callback = []
+        litellm._async_success_callback = []
+
+        # 1 & 2 should be exactly the same
+        # 1 & 3 should be different, since input params are diff
+
+        if response3._hidden_params.get("cache_hit") is True:
+            pytest.fail("Cache hit should not be True")
diff --git a/tests/local_testing/test_caching.py b/tests/local_testing/test_caching.py
index d8295af47..222013a86 100644
--- a/tests/local_testing/test_caching.py
+++ b/tests/local_testing/test_caching.py
@@ -1103,193 +1103,6 @@ async def test_redis_cache_acompletion_stream_bedrock():
         raise e
 
 
-@pytest.mark.parametrize("sync_mode", [True, False])
-@pytest.mark.asyncio
-async def test_disk_cache_completion(sync_mode):
-    litellm._turn_on_debug()
-
-    random_number = random.randint(
-        1, 100000
-    )  # add a random number to ensure it's always adding / reading from cache
-    messages = [
-        {"role": "user", "content": f"write a one sentence poem about: {random_number}"}
-    ]
-    litellm.cache = Cache(
-        type="disk",
-    )
-
-    if sync_mode:
-        response1 = completion(
-            "gpt-3.5-turbo",
-            messages=messages,
-            caching=True,
-            max_tokens=20,
-            mock_response="This number is so great!",
-        )
-    else:
-        response1 = await litellm.acompletion(
-            "gpt-3.5-turbo",
-            messages=messages,
-            caching=True,
-            max_tokens=20,
-            mock_response="This number is so great!",
-        )
-    # response2 is mocked to a different response from response1,
-    # but the completion from the cache should be used instead of the mock
-    # response since the input is the same as response1
-    await asyncio.sleep(0.5)
-    if sync_mode:
-        response2 = completion(
-            "gpt-3.5-turbo",
-            messages=messages,
-            caching=True,
-            max_tokens=20,
-            mock_response="This number is great!",
-        )
-    else:
-        response2 = await litellm.acompletion(
-            "gpt-3.5-turbo",
-            messages=messages,
-            caching=True,
-            max_tokens=20,
-            mock_response="This number is great!",
-        )
-    if (
-        response1["choices"][0]["message"]["content"]
-        != response2["choices"][0]["message"]["content"]
-    ):  # 1 and 2 should be the same
-        # 1&2 have the exact same input params. This MUST Be a CACHE HIT
-        print(f"response1: {response1}")
-        print(f"response2: {response2}")
-        pytest.fail(
-            f"Error occurred: response1 - {response1['choices'][0]['message']['content']} != response2 - {response2['choices'][0]['message']['content']}"
-        )
-    # Since the parameters are not the same as response1, response3 should actually
-    # be the mock response
-    if sync_mode:
-        response3 = completion(
-            "gpt-3.5-turbo",
-            messages=messages,
-            caching=True,
-            temperature=0.5,
-            mock_response="This number is awful!",
-        )
-    else:
-        response3 = await litellm.acompletion(
-            "gpt-3.5-turbo",
-            messages=messages,
-            caching=True,
-            temperature=0.5,
-            mock_response="This number is awful!",
-        )
-
-    print("\nresponse 1", response1)
-    print("\nresponse 2", response2)
-    print("\nresponse 3", response3)
-    # print("\nresponse 4", response4)
-    litellm.cache = None
-    litellm.success_callback = []
-    litellm._async_success_callback = []
-
-    # 1 & 2 should be exactly the same
-    # 1 & 3 should be different, since input params are diff
-
-    if (
-        response1["choices"][0]["message"]["content"]
-        == response3["choices"][0]["message"]["content"]
-    ):
-        # if input params like max_tokens, temperature are diff it should NOT be a cache hit
-        print(f"response1: {response1}")
-        print(f"response3: {response3}")
-        pytest.fail(
-            f"Response 1 == response 3. Same model, diff params shoudl not cache Error"
-            f" occurred:"
-        )
-
-    assert response1.id == response2.id
-    assert response1.created == response2.created
-    assert response1.choices[0].message.content == response2.choices[0].message.content
-
-
-@pytest.mark.parametrize("sync_mode", [True, False])
-@pytest.mark.asyncio
-async def test_disk_cache_embedding(sync_mode):
-    litellm._turn_on_debug()
-
-    random_number = random.randint(
-        1, 100000
-    )  # add a random number to ensure it's always adding / reading from cache
-    input = [f"hello {random_number}"]
-    litellm.cache = Cache(
-        type="disk",
-    )
-
-    if sync_mode:
-        response1 = embedding(
-            "openai/text-embedding-ada-002",
-            input=input,
-            caching=True,
-        )
-    else:
-        response1 = await litellm.aembedding(
-            "openai/text-embedding-ada-002",
-            input=input,
-            caching=True,
-        )
-    # response2 is mocked to a different response from response1,
-    # but the completion from the cache should be used instead of the mock
-    # response since the input is the same as response1
-    await asyncio.sleep(0.5)
-    if sync_mode:
-        response2 = embedding(
-            "openai/text-embedding-ada-002",
-            input=input,
-            caching=True,
-        )
-    else:
-        response2 = await litellm.aembedding(
-            "openai/text-embedding-ada-002",
-            input=input,
-            caching=True,
-        )
-
-    if response2._hidden_params["cache_hit"] is not True:
-        pytest.fail("Cache hit should be True")
-    assert response1.id == response2.id
-    # Since the parameters are not the same as response1, response3 should actually
-    # be the mock response
-    if sync_mode:
-        response3 = embedding(
-            "openai/text-embedding-ada-002",
-            input=input,
-            user="charlie",
-            caching=True,
-        )
-    else:
-        response3 = await litellm.acompletion(
-            "openai/text-embedding-ada-002",
-            input=input,
-            caching=True,
-            user="charlie",
-        )
-
-    print("\nresponse 1", response1)
-    print("\nresponse 2", response2)
-    print("\nresponse 3", response3)
-    # print("\nresponse 4", response4)
-    litellm.cache = None
-    litellm.success_callback = []
-    litellm._async_success_callback = []
-
-    # 1 & 2 should be exactly the same
-    # 1 & 3 should be different, since input params are diff
-
-    if response3._hidden_params["cache_hit"] is True:
-        pytest.fail("Cache hit should not be True")
-
-    assert response1.id != response3.id
-
-
 # @pytest.mark.skip(reason="AWS Suspended Account")
 @pytest.mark.parametrize("sync_mode", [True, False])
 @pytest.mark.asyncio
diff --git a/tests/local_testing/test_disk_cache_unit_tests.py b/tests/local_testing/test_disk_cache_unit_tests.py
new file mode 100644
index 000000000..c777d04ec
--- /dev/null
+++ b/tests/local_testing/test_disk_cache_unit_tests.py
@@ -0,0 +1,11 @@
+from cache_unit_tests import LLMCachingUnitTests
+from litellm.caching import LiteLLMCacheType
+
+
+class TestDiskCacheUnitTests(LLMCachingUnitTests):
+    def get_cache_type(self) -> LiteLLMCacheType:
+        return LiteLLMCacheType.DISK
+
+
+# if __name__ == "__main__":
+#     pytest.main([__file__, "-v", "-s"])
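
Note (not part of the patch above): the new LLMCachingUnitTests base class is structured so that a backend only has to override get_cache_type() to inherit the full completion and embedding cache-hit suite, and the caching_handler change marks every cached object that carries a dict-valued _hidden_params with _hidden_params["cache_hit"] = True, which is what the embedding test asserts on. A minimal sketch of how another backend could reuse the suite, assuming LiteLLMCacheType exposes a REDIS member and that Redis connection settings (e.g. REDIS_HOST / REDIS_PORT / REDIS_PASSWORD) are supplied via the environment; the module and class names below are hypothetical, mirroring test_disk_cache_unit_tests.py:

    # hypothetical follow-up: tests/local_testing/test_redis_cache_unit_tests.py
    # assumes LiteLLMCacheType.REDIS exists and Redis credentials are set in the env
    from cache_unit_tests import LLMCachingUnitTests
    from litellm.caching import LiteLLMCacheType


    class TestRedisCacheUnitTests(LLMCachingUnitTests):
        def get_cache_type(self) -> LiteLLMCacheType:
            # the inherited tests build litellm.cache = Cache(type=self.get_cache_type())
            return LiteLLMCacheType.REDIS

Because the cache_hit flag is set centrally in LLMCachingHandler, such a subclass gets the response2._hidden_params["cache_hit"] assertion for free regardless of backend.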