forked from phoenix/litellm-mirror
test(caching_unit_tests.py): add unit tests for llm caching
Ensures coverage for common caching scenarios across the different cache implementations.
parent 0bc9864c09
commit 16bbed72d4
5 changed files with 244 additions and 188 deletions
@@ -595,6 +595,7 @@ class LLMCachingHandler:
                     model_response_object=EmbeddingResponse(),
                     response_type="embedding",
                 )
             elif (
                 call_type == CallTypes.arerank.value or call_type == CallTypes.rerank.value
             ) and isinstance(cached_result, dict):
@@ -618,6 +619,13 @@ class LLMCachingHandler:
                     response_type="audio_transcription",
                     hidden_params=hidden_params,
                 )
+
+            if (
+                hasattr(cached_result, "_hidden_params")
+                and cached_result._hidden_params is not None
+                and isinstance(cached_result._hidden_params, dict)
+            ):
+                cached_result._hidden_params["cache_hit"] = True
             return cached_result

     def _convert_cached_stream_response(
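The block added above stamps a cache_hit flag into the cached object's _hidden_params before the cached result is returned, which is what the new unit tests assert on. A minimal sketch of how calling code might read that flag back, assuming only that the response object carries a _hidden_params dict (as the tests below do):

# Hedged sketch, not a litellm API: read back the cache_hit flag set above.
def was_cache_hit(response) -> bool:
    hidden = getattr(response, "_hidden_params", None)
    # the handler only sets the flag when _hidden_params is a dict
    return isinstance(hidden, dict) and hidden.get("cache_hit") is True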
@@ -796,7 +796,7 @@ def client(original_function):  # noqa: PLR0915
                 and kwargs.get("_arealtime", False) is not True
             ):  # allow users to control returning cached responses from the completion function
                 # checking cache
-                print_verbose("INSIDE CHECKING CACHE")
+                verbose_logger.debug("INSIDE CHECKING SYNC CACHE")
                 caching_handler_response: CachingHandlerResponse = (
                     _llm_caching_handler._sync_get_cache(
                         model=model or "",
@@ -808,6 +808,7 @@ def client(original_function):  # noqa: PLR0915
                         args=args,
                     )
                 )

                 if caching_handler_response.cached_result is not None:
                     return caching_handler_response.cached_result
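The decorator's sync path consults the caching handler before calling the wrapped completion function and short-circuits on a hit. A rough sketch of that check-then-call shape, using hypothetical stand-ins (lookup_cache, call_provider) rather than litellm's internals:

# Hedged sketch of the check-cache-then-call flow; lookup_cache and
# call_provider are hypothetical stand-ins, not litellm functions.
def cached_call(model, messages, lookup_cache, call_provider):
    cached = lookup_cache(model=model, messages=messages)
    if cached is not None:
        return cached  # cache hit: skip the provider call
    # cache miss: call the provider (a callback may write the result back)
    return call_provider(model=model, messages=messages)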
tests/local_testing/cache_unit_tests.py (new file, 223 additions)
@@ -0,0 +1,223 @@
from abc import ABC, abstractmethod

from litellm.caching import LiteLLMCacheType
import os
import sys
import time
import traceback
import uuid

from dotenv import load_dotenv
from test_rerank import assert_response_shape

load_dotenv()
import os

sys.path.insert(
    0, os.path.abspath("../..")
)  # Adds the parent directory to the system path
import asyncio
import hashlib
import random

import pytest

import litellm
from litellm.caching import Cache
from litellm import completion, embedding


class LLMCachingUnitTests(ABC):

    @abstractmethod
    def get_cache_type(self) -> LiteLLMCacheType:
        pass

    @pytest.mark.parametrize("sync_mode", [True, False])
    @pytest.mark.asyncio
    async def test_cache_completion(self, sync_mode):
        litellm._turn_on_debug()

        random_number = random.randint(
            1, 100000
        )  # add a random number to ensure it's always adding / reading from cache
        messages = [
            {
                "role": "user",
                "content": f"write a one sentence poem about: {random_number}",
            }
        ]

        cache_type = self.get_cache_type()
        litellm.cache = Cache(
            type=cache_type,
        )

        if sync_mode:
            response1 = completion(
                "gpt-3.5-turbo",
                messages=messages,
                caching=True,
                max_tokens=20,
                mock_response="This number is so great!",
            )
        else:
            response1 = await litellm.acompletion(
                "gpt-3.5-turbo",
                messages=messages,
                caching=True,
                max_tokens=20,
                mock_response="This number is so great!",
            )
        # response2 is mocked to a different response from response1,
        # but the completion from the cache should be used instead of the mock
        # response since the input is the same as response1
        await asyncio.sleep(0.5)
        if sync_mode:
            response2 = completion(
                "gpt-3.5-turbo",
                messages=messages,
                caching=True,
                max_tokens=20,
                mock_response="This number is great!",
            )
        else:
            response2 = await litellm.acompletion(
                "gpt-3.5-turbo",
                messages=messages,
                caching=True,
                max_tokens=20,
                mock_response="This number is great!",
            )
        if (
            response1["choices"][0]["message"]["content"]
            != response2["choices"][0]["message"]["content"]
        ):  # 1 and 2 should be the same
            # 1 & 2 have the exact same input params. This MUST be a cache hit
            print(f"response1: {response1}")
            print(f"response2: {response2}")
            pytest.fail(
                f"Error occurred: response1 - {response1['choices'][0]['message']['content']} != response2 - {response2['choices'][0]['message']['content']}"
            )
        # Since the parameters are not the same as response1, response3 should actually
        # be the mock response
        if sync_mode:
            response3 = completion(
                "gpt-3.5-turbo",
                messages=messages,
                caching=True,
                temperature=0.5,
                mock_response="This number is awful!",
            )
        else:
            response3 = await litellm.acompletion(
                "gpt-3.5-turbo",
                messages=messages,
                caching=True,
                temperature=0.5,
                mock_response="This number is awful!",
            )

        print("\nresponse 1", response1)
        print("\nresponse 2", response2)
        print("\nresponse 3", response3)
        # print("\nresponse 4", response4)
        litellm.cache = None
        litellm.success_callback = []
        litellm._async_success_callback = []

        # 1 & 2 should be exactly the same
        # 1 & 3 should be different, since input params are diff

        if (
            response1["choices"][0]["message"]["content"]
            == response3["choices"][0]["message"]["content"]
        ):
            # if input params like max_tokens, temperature are diff it should NOT be a cache hit
            print(f"response1: {response1}")
            print(f"response3: {response3}")
            pytest.fail(
                "Response 1 == response 3. Same model, diff params should not cache."
            )

        assert response1.id == response2.id
        assert response1.created == response2.created
        assert (
            response1.choices[0].message.content == response2.choices[0].message.content
        )

    @pytest.mark.parametrize("sync_mode", [True, False])
    @pytest.mark.asyncio
    async def test_cache_embedding(self, sync_mode):
        litellm._turn_on_debug()

        random_number = random.randint(
            1, 100000
        )  # add a random number to ensure it's always adding / reading from cache
        input = [f"hello {random_number}"]
        litellm.cache = Cache(
            type=self.get_cache_type(),
        )

        if sync_mode:
            response1 = embedding(
                "openai/text-embedding-ada-002",
                input=input,
                caching=True,
            )
        else:
            response1 = await litellm.aembedding(
                "openai/text-embedding-ada-002",
                input=input,
                caching=True,
            )
        # response2 should be served from the cache instead of hitting the
        # provider again, since the input is the same as response1
        await asyncio.sleep(0.5)
        if sync_mode:
            response2 = embedding(
                "openai/text-embedding-ada-002",
                input=input,
                caching=True,
            )
        else:
            response2 = await litellm.aembedding(
                "openai/text-embedding-ada-002",
                input=input,
                caching=True,
            )

        if response2._hidden_params["cache_hit"] is not True:
            pytest.fail("Cache hit should be True")

        # Since the parameters are not the same as response1, response3 should
        # be a fresh (non-cached) response
        if sync_mode:
            response3 = embedding(
                "openai/text-embedding-ada-002",
                input=input,
                user="charlie",
                caching=True,
            )
        else:
            response3 = await litellm.aembedding(
                "openai/text-embedding-ada-002",
                input=input,
                caching=True,
                user="charlie",
            )

        print("\nresponse 1", response1)
        print("\nresponse 2", response2)
        print("\nresponse 3", response3)
        # print("\nresponse 4", response4)
        litellm.cache = None
        litellm.success_callback = []
        litellm._async_success_callback = []

        # 1 & 2 should be exactly the same
        # 1 & 3 should be different, since input params are diff

        if response3._hidden_params.get("cache_hit") is True:
            pytest.fail("Cache hit should not be True")
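The assertions above hinge on how the cache key is derived from the call parameters: identical params must map to the same entry, while a changed max_tokens, temperature, or user must miss. The toy key below only illustrates that idea under the assumption of a canonical-JSON hash; it is not litellm's actual key function:

# Illustration only (NOT litellm's key function): hash the canonicalised
# kwargs so identical calls collide and any changed param misses.
import hashlib
import json

def toy_cache_key(**kwargs) -> str:
    canonical = json.dumps(kwargs, sort_keys=True, default=str)
    return hashlib.sha256(canonical.encode("utf-8")).hexdigest()

msgs = [{"role": "user", "content": "write a one sentence poem about: 42"}]
key_a = toy_cache_key(model="gpt-3.5-turbo", messages=msgs, max_tokens=20)
key_b = toy_cache_key(model="gpt-3.5-turbo", messages=msgs, max_tokens=20)
key_c = toy_cache_key(model="gpt-3.5-turbo", messages=msgs, temperature=0.5)
assert key_a == key_b  # same params -> same key -> cache hit
assert key_a != key_c  # diff params -> diff key -> cache miss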
@@ -1103,193 +1103,6 @@ async def test_redis_cache_acompletion_stream_bedrock():
         raise e


-@pytest.mark.parametrize("sync_mode", [True, False])
-@pytest.mark.asyncio
-async def test_disk_cache_completion(sync_mode):
-    litellm._turn_on_debug()
-
-    random_number = random.randint(
-        1, 100000
-    )  # add a random number to ensure it's always adding / reading from cache
-    messages = [
-        {"role": "user", "content": f"write a one sentence poem about: {random_number}"}
-    ]
-    litellm.cache = Cache(
-        type="disk",
-    )
-
-    if sync_mode:
-        response1 = completion(
-            "gpt-3.5-turbo",
-            messages=messages,
-            caching=True,
-            max_tokens=20,
-            mock_response="This number is so great!",
-        )
-    else:
-        response1 = await litellm.acompletion(
-            "gpt-3.5-turbo",
-            messages=messages,
-            caching=True,
-            max_tokens=20,
-            mock_response="This number is so great!",
-        )
-    # response2 is mocked to a different response from response1,
-    # but the completion from the cache should be used instead of the mock
-    # response since the input is the same as response1
-    await asyncio.sleep(0.5)
-    if sync_mode:
-        response2 = completion(
-            "gpt-3.5-turbo",
-            messages=messages,
-            caching=True,
-            max_tokens=20,
-            mock_response="This number is great!",
-        )
-    else:
-        response2 = await litellm.acompletion(
-            "gpt-3.5-turbo",
-            messages=messages,
-            caching=True,
-            max_tokens=20,
-            mock_response="This number is great!",
-        )
-    if (
-        response1["choices"][0]["message"]["content"]
-        != response2["choices"][0]["message"]["content"]
-    ):  # 1 and 2 should be the same
-        # 1&2 have the exact same input params. This MUST Be a CACHE HIT
-        print(f"response1: {response1}")
-        print(f"response2: {response2}")
-        pytest.fail(
-            f"Error occurred: response1 - {response1['choices'][0]['message']['content']} != response2 - {response2['choices'][0]['message']['content']}"
-        )
-    # Since the parameters are not the same as response1, response3 should actually
-    # be the mock response
-    if sync_mode:
-        response3 = completion(
-            "gpt-3.5-turbo",
-            messages=messages,
-            caching=True,
-            temperature=0.5,
-            mock_response="This number is awful!",
-        )
-    else:
-        response3 = await litellm.acompletion(
-            "gpt-3.5-turbo",
-            messages=messages,
-            caching=True,
-            temperature=0.5,
-            mock_response="This number is awful!",
-        )
-
-    print("\nresponse 1", response1)
-    print("\nresponse 2", response2)
-    print("\nresponse 3", response3)
-    # print("\nresponse 4", response4)
-    litellm.cache = None
-    litellm.success_callback = []
-    litellm._async_success_callback = []
-
-    # 1 & 2 should be exactly the same
-    # 1 & 3 should be different, since input params are diff
-
-    if (
-        response1["choices"][0]["message"]["content"]
-        == response3["choices"][0]["message"]["content"]
-    ):
-        # if input params like max_tokens, temperature are diff it should NOT be a cache hit
-        print(f"response1: {response1}")
-        print(f"response3: {response3}")
-        pytest.fail(
-            f"Response 1 == response 3. Same model, diff params shoudl not cache Error"
-            f" occurred:"
-        )
-
-    assert response1.id == response2.id
-    assert response1.created == response2.created
-    assert response1.choices[0].message.content == response2.choices[0].message.content
-
-
-@pytest.mark.parametrize("sync_mode", [True, False])
-@pytest.mark.asyncio
-async def test_disk_cache_embedding(sync_mode):
-    litellm._turn_on_debug()
-
-    random_number = random.randint(
-        1, 100000
-    )  # add a random number to ensure it's always adding / reading from cache
-    input = [f"hello {random_number}"]
-    litellm.cache = Cache(
-        type="disk",
-    )
-
-    if sync_mode:
-        response1 = embedding(
-            "openai/text-embedding-ada-002",
-            input=input,
-            caching=True,
-        )
-    else:
-        response1 = await litellm.aembedding(
-            "openai/text-embedding-ada-002",
-            input=input,
-            caching=True,
-        )
-    # response2 is mocked to a different response from response1,
-    # but the completion from the cache should be used instead of the mock
-    # response since the input is the same as response1
-    await asyncio.sleep(0.5)
-    if sync_mode:
-        response2 = embedding(
-            "openai/text-embedding-ada-002",
-            input=input,
-            caching=True,
-        )
-    else:
-        response2 = await litellm.aembedding(
-            "openai/text-embedding-ada-002",
-            input=input,
-            caching=True,
-        )
-
-    if response2._hidden_params["cache_hit"] is not True:
-        pytest.fail("Cache hit should be True")
-    assert response1.id == response2.id
-    # Since the parameters are not the same as response1, response3 should actually
-    # be the mock response
-    if sync_mode:
-        response3 = embedding(
-            "openai/text-embedding-ada-002",
-            input=input,
-            user="charlie",
-            caching=True,
-        )
-    else:
-        response3 = await litellm.acompletion(
-            "openai/text-embedding-ada-002",
-            input=input,
-            caching=True,
-            user="charlie",
-        )
-
-    print("\nresponse 1", response1)
-    print("\nresponse 2", response2)
-    print("\nresponse 3", response3)
-    # print("\nresponse 4", response4)
-    litellm.cache = None
-    litellm.success_callback = []
-    litellm._async_success_callback = []
-
-    # 1 & 2 should be exactly the same
-    # 1 & 3 should be different, since input params are diff
-
-    if response3._hidden_params["cache_hit"] is True:
-        pytest.fail("Cache hit should not be True")
-
-    assert response1.id != response3.id
-
-
 # @pytest.mark.skip(reason="AWS Suspended Account")
 @pytest.mark.parametrize("sync_mode", [True, False])
 @pytest.mark.asyncio
tests/local_testing/test_disk_cache_unit_tests.py (new file, 11 additions)
@@ -0,0 +1,11 @@
from cache_unit_tests import LLMCachingUnitTests
from litellm.caching import LiteLLMCacheType


class TestDiskCacheUnitTests(LLMCachingUnitTests):
    def get_cache_type(self) -> LiteLLMCacheType:
        return LiteLLMCacheType.DISK


# if __name__ == "__main__":
#     pytest.main([__file__, "-v", "-s"])
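The concrete class above is all a backend needs: subclass the shared suite and return its cache type. A hypothetical second subclass (not part of this commit) would look the same, assuming the LiteLLMCacheType enum exposes a member for that backend, e.g. REDIS:

# Hypothetical example, not in this commit: reuse the shared suite for a
# Redis-backed cache (assumes LiteLLMCacheType.REDIS exists and Redis
# connection env vars are configured).
from cache_unit_tests import LLMCachingUnitTests
from litellm.caching import LiteLLMCacheType


class TestRedisCacheUnitTests(LLMCachingUnitTests):
    def get_cache_type(self) -> LiteLLMCacheType:
        return LiteLLMCacheType.REDIS

# Either suite can be run directly with pytest, e.g.:
#   pytest tests/local_testing/test_disk_cache_unit_tests.py -v -s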