test(cache_unit_tests.py): add unit tests for LLM caching

ensures coverage for common caching scenarios across different implementations
2024-11-12 13:21:22 +05:30 · 2024-11-12 13:21:22 +05:30 · 16bbed72d4
commit 16bbed72d4
parent 0bc9864c09
5 changed files with 244 additions and 188 deletions
--- a/litellm/caching/caching_handler.py
+++ b/litellm/caching/caching_handler.py
@ -595,6 +595,7 @@ class LLMCachingHandler:
                model_response_object=EmbeddingResponse(),
                response_type="embedding",
            )
+
        elif (
            call_type == CallTypes.arerank.value or call_type == CallTypes.rerank.value
        ) and isinstance(cached_result, dict):
@ -618,6 +619,13 @@ class LLMCachingHandler:
                response_type="audio_transcription",
                hidden_params=hidden_params,
            )
+
+        if (
+            hasattr(cached_result, "_hidden_params")
+            and cached_result._hidden_params is not None
+            and isinstance(cached_result._hidden_params, dict)
+        ):
+            cached_result._hidden_params["cache_hit"] = True
        return cached_result

    def _convert_cached_stream_response(
--- a/litellm/utils.py
+++ b/litellm/utils.py
@ -796,7 +796,7 @@ def client(original_function):  # noqa: PLR0915
                and kwargs.get("_arealtime", False) is not True
            ):  # allow users to control returning cached responses from the completion function
                # checking cache
-                print_verbose("INSIDE CHECKING CACHE")
+                verbose_logger.debug("INSIDE CHECKING SYNC CACHE")
                caching_handler_response: CachingHandlerResponse = (
                    _llm_caching_handler._sync_get_cache(
                        model=model or "",
@ -808,6 +808,7 @@ def client(original_function):  # noqa: PLR0915
                        args=args,
                    )
                )
+
                if caching_handler_response.cached_result is not None:
                    return caching_handler_response.cached_result

--- a/tests/local_testing/cache_unit_tests.py
+++ b/tests/local_testing/cache_unit_tests.py
@ -0,0 +1,223 @@
+from abc import ABC, abstractmethod
+from litellm.caching import LiteLLMCacheType
+import os
+import sys
+import time
+import traceback
+import uuid
+
+from dotenv import load_dotenv
+from test_rerank import assert_response_shape
+
+load_dotenv()
+import os
+
+sys.path.insert(
+    0, os.path.abspath("../..")
+)  # Adds the parent directory to the system path
+import asyncio
+import hashlib
+import random
+
+import pytest
+
+import litellm
+from litellm.caching import Cache
+from litellm import completion, embedding
+
+
+class LLMCachingUnitTests(ABC):
+    """Cache tests shared by every backend; a subclass picks the cache type."""
+
+    @abstractmethod
+    def get_cache_type(self) -> LiteLLMCacheType:
+        """Return the cache type each test passes to ``Cache(type=...)``."""
+
+    @pytest.mark.parametrize("sync_mode", [True, False])
+    @pytest.mark.asyncio
+    async def test_cache_completion(self, sync_mode):
+        """Same request twice must hit the cache; changed params must miss.
+
+        ``sync_mode`` selects ``completion`` (True) or ``litellm.acompletion``.
+        """
+        litellm._turn_on_debug()
+
+        # Random prompt suffix so the first call is very unlikely to be cached.
+        random_number = random.randint(1, 100000)
+        messages = [
+            {
+                "role": "user",
+                "content": f"write a one sentence poem about: {random_number}",
+            }
+        ]
+
+        litellm.cache = Cache(type=self.get_cache_type())
+        try:
+            if sync_mode:
+                response1 = completion(
+                    "gpt-3.5-turbo",
+                    messages=messages,
+                    caching=True,
+                    max_tokens=20,
+                    mock_response="This number is so great!",
+                )
+            else:
+                response1 = await litellm.acompletion(
+                    "gpt-3.5-turbo",
+                    messages=messages,
+                    caching=True,
+                    max_tokens=20,
+                    mock_response="This number is so great!",
+                )
+            # response2 is mocked to a different response from response1,
+            # but the completion from the cache should be used instead of the
+            # mock response since the input is the same as response1
+            await asyncio.sleep(0.5)
+            if sync_mode:
+                response2 = completion(
+                    "gpt-3.5-turbo",
+                    messages=messages,
+                    caching=True,
+                    max_tokens=20,
+                    mock_response="This number is great!",
+                )
+            else:
+                response2 = await litellm.acompletion(
+                    "gpt-3.5-turbo",
+                    messages=messages,
+                    caching=True,
+                    max_tokens=20,
+                    mock_response="This number is great!",
+                )
+            if (
+                response1["choices"][0]["message"]["content"]
+                != response2["choices"][0]["message"]["content"]
+            ):  # 1 and 2 should be the same
+                # 1&2 have the exact same input params. This MUST Be a CACHE HIT
+                print(f"response1: {response1}")
+                print(f"response2: {response2}")
+                pytest.fail(
+                    f"Error occurred: response1 - {response1['choices'][0]['message']['content']} != response2 - {response2['choices'][0]['message']['content']}"
+                )
+            # Since the parameters are not the same as response1, response3
+            # should actually be the mock response
+            if sync_mode:
+                response3 = completion(
+                    "gpt-3.5-turbo",
+                    messages=messages,
+                    caching=True,
+                    temperature=0.5,
+                    mock_response="This number is awful!",
+                )
+            else:
+                response3 = await litellm.acompletion(
+                    "gpt-3.5-turbo",
+                    messages=messages,
+                    caching=True,
+                    temperature=0.5,
+                    mock_response="This number is awful!",
+                )
+
+            print("\nresponse 1", response1)
+            print("\nresponse 2", response2)
+            print("\nresponse 3", response3)
+        finally:
+            # Reset global cache/callbacks even if a request or check fails.
+            litellm.cache = None
+            litellm.success_callback = []
+            litellm._async_success_callback = []
+
+        # 1 & 2 should be exactly the same
+        # 1 & 3 should be different, since input params are diff
+        if (
+            response1["choices"][0]["message"]["content"]
+            == response3["choices"][0]["message"]["content"]
+        ):
+            # if input params like max_tokens, temperature are diff it should NOT be a cache hit
+            print(f"response1: {response1}")
+            print(f"response3: {response3}")
+            pytest.fail(
+                f"Response 1 == response 3. Same model, diff params shoudl not cache Error"
+                f" occurred:"
+            )
+
+        assert response1.id == response2.id
+        assert response1.created == response2.created
+        assert (
+            response1.choices[0].message.content == response2.choices[0].message.content
+        )
+
+    @pytest.mark.parametrize("sync_mode", [True, False])
+    @pytest.mark.asyncio
+    async def test_disk_cache_embedding(self, sync_mode):
+        """Same embedding input twice must hit the cache; new params must miss.
+
+        Uses ``get_cache_type()`` like the completion test; the ``disk`` in the
+        name is carried over from the disk-only test this method replaces.
+        """
+        litellm._turn_on_debug()
+
+        # Random input suffix so the first call is very unlikely to be cached.
+        random_number = random.randint(1, 100000)
+        embedding_input = [f"hello {random_number}"]
+        litellm.cache = Cache(type=self.get_cache_type())
+        try:
+            if sync_mode:
+                response1 = embedding(
+                    "openai/text-embedding-ada-002",
+                    input=embedding_input,
+                    caching=True,
+                )
+            else:
+                response1 = await litellm.aembedding(
+                    "openai/text-embedding-ada-002",
+                    input=embedding_input,
+                    caching=True,
+                )
+            # response2 repeats the request unchanged, so it should be served
+            # from the cache entry written by response1.
+            await asyncio.sleep(0.5)
+            if sync_mode:
+                response2 = embedding(
+                    "openai/text-embedding-ada-002",
+                    input=embedding_input,
+                    caching=True,
+                )
+            else:
+                response2 = await litellm.aembedding(
+                    "openai/text-embedding-ada-002",
+                    input=embedding_input,
+                    caching=True,
+                )
+
+            if response2._hidden_params.get("cache_hit") is not True:
+                pytest.fail("Cache hit should be True")
+
+            # response3 adds ``user``, so its parameters differ from response1
+            # and it should not be answered from the cache.
+            if sync_mode:
+                response3 = embedding(
+                    "openai/text-embedding-ada-002",
+                    input=embedding_input,
+                    user="charlie",
+                    caching=True,
+                )
+            else:
+                response3 = await litellm.aembedding(
+                    "openai/text-embedding-ada-002",
+                    input=embedding_input,
+                    caching=True,
+                    user="charlie",
+                )
+            print("\nresponse 1", response1)
+            print("\nresponse 2", response2)
+            print("\nresponse 3", response3)
+        finally:
+            # Reset global cache/callbacks even if a request or check fails.
+            litellm.cache = None
+            litellm.success_callback = []
+            litellm._async_success_callback = []
+
+        # 1 & 3 should be different, since input params are diff
+        if response3._hidden_params.get("cache_hit") is True:
+            pytest.fail("Cache hit should not be True")
--- a/tests/local_testing/test_caching.py
+++ b/tests/local_testing/test_caching.py
@ -1103,193 +1103,6 @@ async def test_redis_cache_acompletion_stream_bedrock():
        raise e


-@pytest.mark.parametrize("sync_mode", [True, False])
-@pytest.mark.asyncio
-async def test_disk_cache_completion(sync_mode):
-    litellm._turn_on_debug()
-
-    random_number = random.randint(
-        1, 100000
-    )  # add a random number to ensure it's always adding / reading from cache
-    messages = [
-        {"role": "user", "content": f"write a one sentence poem about: {random_number}"}
-    ]
-    litellm.cache = Cache(
-        type="disk",
-    )
-
-    if sync_mode:
-        response1 = completion(
-            "gpt-3.5-turbo",
-            messages=messages,
-            caching=True,
-            max_tokens=20,
-            mock_response="This number is so great!",
-        )
-    else:
-        response1 = await litellm.acompletion(
-            "gpt-3.5-turbo",
-            messages=messages,
-            caching=True,
-            max_tokens=20,
-            mock_response="This number is so great!",
-        )
-    # response2 is mocked to a different response from response1,
-    # but the completion from the cache should be used instead of the mock
-    # response since the input is the same as response1
-    await asyncio.sleep(0.5)
-    if sync_mode:
-        response2 = completion(
-            "gpt-3.5-turbo",
-            messages=messages,
-            caching=True,
-            max_tokens=20,
-            mock_response="This number is great!",
-        )
-    else:
-        response2 = await litellm.acompletion(
-            "gpt-3.5-turbo",
-            messages=messages,
-            caching=True,
-            max_tokens=20,
-            mock_response="This number is great!",
-        )
-    if (
-        response1["choices"][0]["message"]["content"]
-        != response2["choices"][0]["message"]["content"]
-    ):  # 1 and 2 should be the same
-        # 1&2 have the exact same input params. This MUST Be a CACHE HIT
-        print(f"response1: {response1}")
-        print(f"response2: {response2}")
-        pytest.fail(
-            f"Error occurred: response1 - {response1['choices'][0]['message']['content']} != response2 - {response2['choices'][0]['message']['content']}"
-        )
-    # Since the parameters are not the same as response1, response3 should actually
-    # be the mock response
-    if sync_mode:
-        response3 = completion(
-            "gpt-3.5-turbo",
-            messages=messages,
-            caching=True,
-            temperature=0.5,
-            mock_response="This number is awful!",
-        )
-    else:
-        response3 = await litellm.acompletion(
-            "gpt-3.5-turbo",
-            messages=messages,
-            caching=True,
-            temperature=0.5,
-            mock_response="This number is awful!",
-        )
-
-    print("\nresponse 1", response1)
-    print("\nresponse 2", response2)
-    print("\nresponse 3", response3)
-    # print("\nresponse 4", response4)
-    litellm.cache = None
-    litellm.success_callback = []
-    litellm._async_success_callback = []
-
-    # 1 & 2 should be exactly the same
-    # 1 & 3 should be different, since input params are diff
-
-    if (
-        response1["choices"][0]["message"]["content"]
-        == response3["choices"][0]["message"]["content"]
-    ):
-        # if input params like max_tokens, temperature are diff it should NOT be a cache hit
-        print(f"response1: {response1}")
-        print(f"response3: {response3}")
-        pytest.fail(
-            f"Response 1 == response 3. Same model, diff params shoudl not cache Error"
-            f" occurred:"
-        )
-
-    assert response1.id == response2.id
-    assert response1.created == response2.created
-    assert response1.choices[0].message.content == response2.choices[0].message.content
-
-
-@pytest.mark.parametrize("sync_mode", [True, False])
-@pytest.mark.asyncio
-async def test_disk_cache_embedding(sync_mode):
-    litellm._turn_on_debug()
-
-    random_number = random.randint(
-        1, 100000
-    )  # add a random number to ensure it's always adding / reading from cache
-    input = [f"hello {random_number}"]
-    litellm.cache = Cache(
-        type="disk",
-    )
-
-    if sync_mode:
-        response1 = embedding(
-            "openai/text-embedding-ada-002",
-            input=input,
-            caching=True,
-        )
-    else:
-        response1 = await litellm.aembedding(
-            "openai/text-embedding-ada-002",
-            input=input,
-            caching=True,
-        )
-    # response2 is mocked to a different response from response1,
-    # but the completion from the cache should be used instead of the mock
-    # response since the input is the same as response1
-    await asyncio.sleep(0.5)
-    if sync_mode:
-        response2 = embedding(
-            "openai/text-embedding-ada-002",
-            input=input,
-            caching=True,
-        )
-    else:
-        response2 = await litellm.aembedding(
-            "openai/text-embedding-ada-002",
-            input=input,
-            caching=True,
-        )
-
-    if response2._hidden_params["cache_hit"] is not True:
-        pytest.fail("Cache hit should be True")
-    assert response1.id == response2.id
-    # Since the parameters are not the same as response1, response3 should actually
-    # be the mock response
-    if sync_mode:
-        response3 = embedding(
-            "openai/text-embedding-ada-002",
-            input=input,
-            user="charlie",
-            caching=True,
-        )
-    else:
-        response3 = await litellm.acompletion(
-            "openai/text-embedding-ada-002",
-            input=input,
-            caching=True,
-            user="charlie",
-        )
-
-    print("\nresponse 1", response1)
-    print("\nresponse 2", response2)
-    print("\nresponse 3", response3)
-    # print("\nresponse 4", response4)
-    litellm.cache = None
-    litellm.success_callback = []
-    litellm._async_success_callback = []
-
-    # 1 & 2 should be exactly the same
-    # 1 & 3 should be different, since input params are diff
-
-    if response3._hidden_params["cache_hit"] is True:
-        pytest.fail("Cache hit should not be True")
-
-    assert response1.id != response3.id
-
-
 # @pytest.mark.skip(reason="AWS Suspended Account")
@pytest.mark.parametrize("sync_mode", [True, False])
@pytest.mark.asyncio
--- a/tests/local_testing/test_disk_cache_unit_tests.py
+++ b/tests/local_testing/test_disk_cache_unit_tests.py
@ -0,0 +1,11 @@
+from cache_unit_tests import LLMCachingUnitTests
+from litellm.caching import LiteLLMCacheType
+
+
+class TestDiskCacheUnitTests(LLMCachingUnitTests):  # disk-backed run of the shared cache tests
+    def get_cache_type(self) -> LiteLLMCacheType:
+        return LiteLLMCacheType.DISK  # cache type the inherited tests ask this subclass for
+
+
+# if __name__ == "__main__":
+#     pytest.main([__file__, "-v", "-s"])