test(caching_unit_tests.py): add unit tests for llm caching

ensures coverage for common caching scenarios across different implementations
2024-11-12 13:21:22 +05:30 · 2024-11-12 13:21:22 +05:30 · 16bbed72d4
commit 16bbed72d4
parent 0bc9864c09
5 changed files with 244 additions and 188 deletions
--- a/tests/local_testing/cache_unit_tests.py
+++ b/tests/local_testing/cache_unit_tests.py
@ -0,0 +1,223 @@
+from abc import ABC, abstractmethod
+from litellm.caching import LiteLLMCacheType
+import os
+import sys
+import time
+import traceback
+import uuid
+
+from dotenv import load_dotenv
+from test_rerank import assert_response_shape
+
+load_dotenv()
+import os
+
+sys.path.insert(
+    0, os.path.abspath("../..")
+)  # Adds the parent directory to the system path
+import asyncio
+import hashlib
+import random
+
+import pytest
+
+import litellm
+from litellm.caching import Cache
+from litellm import completion, embedding
+
+
+class LLMCachingUnitTests(ABC):
+
+    @abstractmethod
+    def get_cache_type(self) -> LiteLLMCacheType:
+        pass
+
+    @pytest.mark.parametrize("sync_mode", [True, False])
+    @pytest.mark.asyncio
+    async def test_cache_completion(self, sync_mode):
+        litellm._turn_on_debug()
+
+        random_number = random.randint(
+            1, 100000
+        )  # add a random number to ensure it's always adding / reading from cache
+        messages = [
+            {
+                "role": "user",
+                "content": f"write a one sentence poem about: {random_number}",
+            }
+        ]
+
+        cache_type = self.get_cache_type()
+        litellm.cache = Cache(
+            type=cache_type,
+        )
+
+        if sync_mode:
+            response1 = completion(
+                "gpt-3.5-turbo",
+                messages=messages,
+                caching=True,
+                max_tokens=20,
+                mock_response="This number is so great!",
+            )
+        else:
+            response1 = await litellm.acompletion(
+                "gpt-3.5-turbo",
+                messages=messages,
+                caching=True,
+                max_tokens=20,
+                mock_response="This number is so great!",
+            )
+        # response2 is mocked to a different response from response1,
+        # but the completion from the cache should be used instead of the mock
+        # response since the input is the same as response1
+        await asyncio.sleep(0.5)
+        if sync_mode:
+            response2 = completion(
+                "gpt-3.5-turbo",
+                messages=messages,
+                caching=True,
+                max_tokens=20,
+                mock_response="This number is great!",
+            )
+        else:
+            response2 = await litellm.acompletion(
+                "gpt-3.5-turbo",
+                messages=messages,
+                caching=True,
+                max_tokens=20,
+                mock_response="This number is great!",
+            )
+        if (
+            response1["choices"][0]["message"]["content"]
+            != response2["choices"][0]["message"]["content"]
+        ):  # 1 and 2 should be the same
+            # 1&2 have the exact same input params. This MUST Be a CACHE HIT
+            print(f"response1: {response1}")
+            print(f"response2: {response2}")
+            pytest.fail(
+                f"Error occurred: response1 - {response1['choices'][0]['message']['content']} != response2 - {response2['choices'][0]['message']['content']}"
+            )
+        # Since the parameters are not the same as response1, response3 should actually
+        # be the mock response
+        if sync_mode:
+            response3 = completion(
+                "gpt-3.5-turbo",
+                messages=messages,
+                caching=True,
+                temperature=0.5,
+                mock_response="This number is awful!",
+            )
+        else:
+            response3 = await litellm.acompletion(
+                "gpt-3.5-turbo",
+                messages=messages,
+                caching=True,
+                temperature=0.5,
+                mock_response="This number is awful!",
+            )
+
+        print("\nresponse 1", response1)
+        print("\nresponse 2", response2)
+        print("\nresponse 3", response3)
+        # print("\nresponse 4", response4)
+        litellm.cache = None
+        litellm.success_callback = []
+        litellm._async_success_callback = []
+
+        # 1 & 2 should be exactly the same
+        # 1 & 3 should be different, since input params are diff
+
+        if (
+            response1["choices"][0]["message"]["content"]
+            == response3["choices"][0]["message"]["content"]
+        ):
+            # if input params like max_tokens, temperature are diff it should NOT be a cache hit
+            print(f"response1: {response1}")
+            print(f"response3: {response3}")
+            pytest.fail(
+                f"Response 1 == response 3. Same model, diff params shoudl not cache Error"
+                f" occurred:"
+            )
+
+        assert response1.id == response2.id
+        assert response1.created == response2.created
+        assert (
+            response1.choices[0].message.content == response2.choices[0].message.content
+        )
+
+    @pytest.mark.parametrize("sync_mode", [True, False])
+    @pytest.mark.asyncio
+    async def test_disk_cache_embedding(self, sync_mode):
+        litellm._turn_on_debug()
+
+        random_number = random.randint(
+            1, 100000
+        )  # add a random number to ensure it's always adding / reading from cache
+        input = [f"hello {random_number}"]
+        litellm.cache = Cache(
+            type="disk",
+        )
+
+        if sync_mode:
+            response1 = embedding(
+                "openai/text-embedding-ada-002",
+                input=input,
+                caching=True,
+            )
+        else:
+            response1 = await litellm.aembedding(
+                "openai/text-embedding-ada-002",
+                input=input,
+                caching=True,
+            )
+        # response2 is mocked to a different response from response1,
+        # but the completion from the cache should be used instead of the mock
+        # response since the input is the same as response1
+        await asyncio.sleep(0.5)
+        if sync_mode:
+            response2 = embedding(
+                "openai/text-embedding-ada-002",
+                input=input,
+                caching=True,
+            )
+        else:
+            response2 = await litellm.aembedding(
+                "openai/text-embedding-ada-002",
+                input=input,
+                caching=True,
+            )
+
+        if response2._hidden_params["cache_hit"] is not True:
+            pytest.fail("Cache hit should be True")
+
+        # Since the parameters are not the same as response1, response3 should actually
+        # be the mock response
+        if sync_mode:
+            response3 = embedding(
+                "openai/text-embedding-ada-002",
+                input=input,
+                user="charlie",
+                caching=True,
+            )
+        else:
+            response3 = await litellm.aembedding(
+                "openai/text-embedding-ada-002",
+                input=input,
+                caching=True,
+                user="charlie",
+            )
+
+        print("\nresponse 1", response1)
+        print("\nresponse 2", response2)
+        print("\nresponse 3", response3)
+        # print("\nresponse 4", response4)
+        litellm.cache = None
+        litellm.success_callback = []
+        litellm._async_success_callback = []
+
+        # 1 & 2 should be exactly the same
+        # 1 & 3 should be different, since input params are diff
+
+        if response3._hidden_params.get("cache_hit") is True:
+            pytest.fail("Cache hit should not be True")