test(caching_unit_tests.py): add unit tests for llm caching

Ensures coverage for common caching scenarios across different cache implementations.
Krrish Dholakia 2024-11-12 13:21:22 +05:30
parent 0bc9864c09
commit 16bbed72d4
5 changed files with 244 additions and 188 deletions

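For context, the core pattern these caching unit tests exercise boils down to the sketch below: make the same request twice with caching enabled and assert that the second call is served from the cache rather than from its (different) mock response. This is a minimal sketch distilled from the removed tests in this diff, not the new test file itself; it only uses calls that appear in the diff (Cache(type="disk"), completion with caching=True and mock_response), and the model name, prompt, and mock strings are arbitrary placeholders.

import random
import time

import litellm
from litellm import Cache, completion


def test_cache_hit_on_identical_request():
    # disk cache, as used by the tests removed in this diff
    litellm.cache = Cache(type="disk")

    # a random number keeps the prompt unique, so stale cache entries from
    # earlier runs cannot satisfy this request
    random_number = random.randint(1, 100000)
    messages = [
        {"role": "user", "content": f"write a one sentence poem about: {random_number}"}
    ]

    # first call populates the cache; mock_response avoids a real API call
    response1 = completion(
        "gpt-3.5-turbo",
        messages=messages,
        caching=True,
        mock_response="This number is so great!",
    )

    time.sleep(0.5)  # give the cache write a moment to land, as the tests below do

    # identical input: the cached response should be returned, not this new mock
    response2 = completion(
        "gpt-3.5-turbo",
        messages=messages,
        caching=True,
        mock_response="This number is great!",
    )

    assert response1.id == response2.id
    assert response1.choices[0].message.content == response2.choices[0].message.content

    # reset global state so other tests are unaffected
    litellm.cache = None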

@@ -1103,193 +1103,6 @@ async def test_redis_cache_acompletion_stream_bedrock():
        raise e


@pytest.mark.parametrize("sync_mode", [True, False])
@pytest.mark.asyncio
async def test_disk_cache_completion(sync_mode):
    litellm._turn_on_debug()
    random_number = random.randint(
        1, 100000
    )  # add a random number to ensure it's always adding / reading from cache
    messages = [
        {"role": "user", "content": f"write a one sentence poem about: {random_number}"}
    ]
    litellm.cache = Cache(
        type="disk",
    )
    if sync_mode:
        response1 = completion(
            "gpt-3.5-turbo",
            messages=messages,
            caching=True,
            max_tokens=20,
            mock_response="This number is so great!",
        )
    else:
        response1 = await litellm.acompletion(
            "gpt-3.5-turbo",
            messages=messages,
            caching=True,
            max_tokens=20,
            mock_response="This number is so great!",
        )
    # response2 is mocked to a different response from response1,
    # but the completion from the cache should be used instead of the mock
    # response since the input is the same as response1
    await asyncio.sleep(0.5)
    if sync_mode:
        response2 = completion(
            "gpt-3.5-turbo",
            messages=messages,
            caching=True,
            max_tokens=20,
            mock_response="This number is great!",
        )
    else:
        response2 = await litellm.acompletion(
            "gpt-3.5-turbo",
            messages=messages,
            caching=True,
            max_tokens=20,
            mock_response="This number is great!",
        )
    if (
        response1["choices"][0]["message"]["content"]
        != response2["choices"][0]["message"]["content"]
    ):  # 1 and 2 should be the same
        # 1 & 2 have the exact same input params. This MUST be a CACHE HIT
        print(f"response1: {response1}")
        print(f"response2: {response2}")
        pytest.fail(
            f"Error occurred: response1 - {response1['choices'][0]['message']['content']} != response2 - {response2['choices'][0]['message']['content']}"
        )
    # Since the parameters are not the same as response1, response3 should actually
    # be the mock response
    if sync_mode:
        response3 = completion(
            "gpt-3.5-turbo",
            messages=messages,
            caching=True,
            temperature=0.5,
            mock_response="This number is awful!",
        )
    else:
        response3 = await litellm.acompletion(
            "gpt-3.5-turbo",
            messages=messages,
            caching=True,
            temperature=0.5,
            mock_response="This number is awful!",
        )
    print("\nresponse 1", response1)
    print("\nresponse 2", response2)
    print("\nresponse 3", response3)
    # print("\nresponse 4", response4)
    litellm.cache = None
    litellm.success_callback = []
    litellm._async_success_callback = []
    # 1 & 2 should be exactly the same
    # 1 & 3 should be different, since input params are diff
    if (
        response1["choices"][0]["message"]["content"]
        == response3["choices"][0]["message"]["content"]
    ):
        # if input params like max_tokens, temperature are diff it should NOT be a cache hit
        print(f"response1: {response1}")
        print(f"response3: {response3}")
        pytest.fail(
            "Response 1 == response 3. Same model with different params should not be a cache hit."
        )
    assert response1.id == response2.id
    assert response1.created == response2.created
    assert response1.choices[0].message.content == response2.choices[0].message.content


@pytest.mark.parametrize("sync_mode", [True, False])
@pytest.mark.asyncio
async def test_disk_cache_embedding(sync_mode):
    litellm._turn_on_debug()
    random_number = random.randint(
        1, 100000
    )  # add a random number to ensure it's always adding / reading from cache
    input = [f"hello {random_number}"]
    litellm.cache = Cache(
        type="disk",
    )
    if sync_mode:
        response1 = embedding(
            "openai/text-embedding-ada-002",
            input=input,
            caching=True,
        )
    else:
        response1 = await litellm.aembedding(
            "openai/text-embedding-ada-002",
            input=input,
            caching=True,
        )
    # response2 should be served from the cache, since the input is the same
    # as response1
    await asyncio.sleep(0.5)
    if sync_mode:
        response2 = embedding(
            "openai/text-embedding-ada-002",
            input=input,
            caching=True,
        )
    else:
        response2 = await litellm.aembedding(
            "openai/text-embedding-ada-002",
            input=input,
            caching=True,
        )
    if response2._hidden_params["cache_hit"] is not True:
        pytest.fail("Cache hit should be True")
    assert response1.id == response2.id
    # Since the parameters are not the same as response1, response3 should not
    # be served from the cache
    if sync_mode:
        response3 = embedding(
            "openai/text-embedding-ada-002",
            input=input,
            user="charlie",
            caching=True,
        )
    else:
        response3 = await litellm.aembedding(
            "openai/text-embedding-ada-002",
            input=input,
            caching=True,
            user="charlie",
        )
    print("\nresponse 1", response1)
    print("\nresponse 2", response2)
    print("\nresponse 3", response3)
    # print("\nresponse 4", response4)
    litellm.cache = None
    litellm.success_callback = []
    litellm._async_success_callback = []
    # 1 & 2 should be exactly the same
    # 1 & 3 should be different, since input params are diff
    if response3._hidden_params["cache_hit"] is True:
        pytest.fail("Cache hit should not be True")
    assert response1.id != response3.id


# @pytest.mark.skip(reason="AWS Suspended Account")
@pytest.mark.parametrize("sync_mode", [True, False])
@pytest.mark.asyncio