LiteLLM Minor Fixes & Improvements (11/12/2024) (#6705)

* fix(caching): convert positional args to equivalent kwargs in the llm caching handler

prevents unexpected errors when callers mix positional and keyword arguments
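
The core idea, as a minimal sketch (the helper name and exact normalization are assumptions, not LiteLLM's actual internals): bind positional arguments to their parameter names so the cache key is always derived from one canonical kwargs dict.

import inspect
from typing import Any, Callable, Dict, Tuple


def args_to_kwargs(
    func: Callable, args: Tuple[Any, ...], kwargs: Dict[str, Any]
) -> Dict[str, Any]:
    # Bind positional args to their declared parameter names, so that
    # completion("gpt-3.5-turbo", ...) and completion(model="gpt-3.5-turbo", ...)
    # normalize to the same kwargs dict -- and therefore the same cache key.
    bound = inspect.signature(func).bind_partial(*args, **kwargs)
    return dict(bound.arguments)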

* fix(caching_handler.py): don't pass args to caching

* fix(caching): remove all *args from caching.py

* fix(caching): consistent function signatures + abc method
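
A sketch of the shape this takes, with illustrative method names rather than necessarily the exact ones in caching.py: an abstract base class pins down one *args-free signature that every cache backend must implement.

from abc import ABC, abstractmethod
from typing import Any, Optional


class BaseCache(ABC):
    @abstractmethod
    def set_cache(self, key: str, value: Any, **kwargs) -> None:
        """Store a value; no *args, so every backend takes the same inputs."""

    @abstractmethod
    def get_cache(self, key: str, **kwargs) -> Optional[Any]:
        """Return the cached value for key, or None on a miss."""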

* test(caching_unit_tests.py): add unit tests for llm caching

ensures coverage for common caching scenarios across different implementations

* refactor(litellm_logging.py): move to using cache key from hidden params instead of regenerating one
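
Roughly, the logging path changes from recomputing the key to reading the one the caching handler already stored. A sketch under the assumption that the key travels in the response's _hidden_params (the field name and fallback call are illustrative):

import litellm


def _get_cache_key(response, kwargs) -> str:
    # Prefer the key the caching handler already computed and stashed on the
    # response (field name here is an assumption for illustration) ...
    hidden_params = getattr(response, "_hidden_params", None) or {}
    cache_key = hidden_params.get("cache_key")
    if cache_key is not None:
        return cache_key
    # ... and only fall back to regenerating it from the call's kwargs.
    return litellm.cache.get_cache_key(**kwargs)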

* fix(router.py): drop redis password requirement
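
In practice this means a Redis-backed cache can point at an unauthenticated instance (e.g. local dev) without a dummy password. A minimal usage sketch; host and port are placeholders:

import litellm
from litellm.caching import Cache

# No password kwarg needed for a local Redis instance with auth disabled.
litellm.cache = Cache(type="redis", host="localhost", port="6379")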

* fix(proxy_server.py): fix faulty slack alerting check

* fix(langfuse.py): avoid copying functions/thread lock objects in metadata

fixes the metadata copy error raised when a parent OTEL span is present in the metadata
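
A minimal sketch of the copy-safe behavior _prepare_metadata needs to guarantee, consistent with the test expectations further down but not the actual implementation: deep-copy what can be copied, drop what cannot.

import copy


def prepare_metadata(metadata):
    if not isinstance(metadata, dict):
        return metadata  # e.g. None passes through unchanged
    cleaned = {}
    for key, value in metadata.items():
        if callable(value):
            continue  # functions/lambdas: not meaningful to copy or serialize
        try:
            cleaned[key] = copy.deepcopy(value)
        except (TypeError, copy.Error):
            continue  # locks, spans, other non-copyable objects are dropped
    return cleaned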

* test: update test
Krish Dholakia 2024-11-12 22:50:51 +05:30 committed by GitHub
parent d39fd60801
commit 9160d80fa5
23 changed files with 525 additions and 204 deletions


@@ -0,0 +1,223 @@
from abc import ABC, abstractmethod
from litellm.caching import LiteLLMCacheType
import os
import sys
import time
import traceback
import uuid
from dotenv import load_dotenv
from test_rerank import assert_response_shape
load_dotenv()

sys.path.insert(
    0, os.path.abspath("../..")
)  # Adds the parent directory to the system path
import asyncio
import hashlib
import random
import pytest
import litellm
from litellm.caching import Cache
from litellm import completion, embedding


class LLMCachingUnitTests(ABC):
    @abstractmethod
    def get_cache_type(self) -> LiteLLMCacheType:
        pass

    @pytest.mark.parametrize("sync_mode", [True, False])
    @pytest.mark.asyncio
    async def test_cache_completion(self, sync_mode):
litellm._turn_on_debug()
random_number = random.randint(
1, 100000
) # add a random number to ensure it's always adding / reading from cache
messages = [
{
"role": "user",
"content": f"write a one sentence poem about: {random_number}",
}
]
cache_type = self.get_cache_type()
litellm.cache = Cache(
type=cache_type,
)
if sync_mode:
response1 = completion(
"gpt-3.5-turbo",
messages=messages,
caching=True,
max_tokens=20,
mock_response="This number is so great!",
)
else:
response1 = await litellm.acompletion(
"gpt-3.5-turbo",
messages=messages,
caching=True,
max_tokens=20,
mock_response="This number is so great!",
)
# response2 is mocked to a different response from response1,
# but the completion from the cache should be used instead of the mock
# response since the input is the same as response1
await asyncio.sleep(0.5)
if sync_mode:
response2 = completion(
"gpt-3.5-turbo",
messages=messages,
caching=True,
max_tokens=20,
mock_response="This number is great!",
)
else:
response2 = await litellm.acompletion(
"gpt-3.5-turbo",
messages=messages,
caching=True,
max_tokens=20,
mock_response="This number is great!",
)
if (
response1["choices"][0]["message"]["content"]
!= response2["choices"][0]["message"]["content"]
): # 1 and 2 should be the same
# 1&2 have the exact same input params. This MUST Be a CACHE HIT
print(f"response1: {response1}")
print(f"response2: {response2}")
pytest.fail(
f"Error occurred: response1 - {response1['choices'][0]['message']['content']} != response2 - {response2['choices'][0]['message']['content']}"
)
# Since the parameters are not the same as response1, response3 should actually
# be the mock response
if sync_mode:
response3 = completion(
"gpt-3.5-turbo",
messages=messages,
caching=True,
temperature=0.5,
mock_response="This number is awful!",
)
else:
response3 = await litellm.acompletion(
"gpt-3.5-turbo",
messages=messages,
caching=True,
temperature=0.5,
mock_response="This number is awful!",
)
print("\nresponse 1", response1)
print("\nresponse 2", response2)
print("\nresponse 3", response3)
litellm.cache = None
litellm.success_callback = []
litellm._async_success_callback = []
# 1 & 2 should be exactly the same
# 1 & 3 should be different, since input params are diff
if (
response1["choices"][0]["message"]["content"]
== response3["choices"][0]["message"]["content"]
):
# if input params like max_tokens, temperature are diff it should NOT be a cache hit
print(f"response1: {response1}")
print(f"response3: {response3}")
            pytest.fail(
                "Response 1 == response 3. Same model with different params"
                " should not be a cache hit."
            )
assert response1.id == response2.id
assert response1.created == response2.created
assert (
response1.choices[0].message.content == response2.choices[0].message.content
)

    @pytest.mark.parametrize("sync_mode", [True, False])
    @pytest.mark.asyncio
    async def test_cache_embedding(self, sync_mode):
        litellm._turn_on_debug()
        random_number = random.randint(
            1, 100000
        )  # add a random number to ensure it's always adding / reading from cache
        input = [f"hello {random_number}"]
        cache_type = self.get_cache_type()
        litellm.cache = Cache(
            type=cache_type,
        )
if sync_mode:
response1 = embedding(
"openai/text-embedding-ada-002",
input=input,
caching=True,
)
else:
response1 = await litellm.aembedding(
"openai/text-embedding-ada-002",
input=input,
caching=True,
)
        # response2 should be served from the cache, since the input is
        # identical to response1
await asyncio.sleep(0.5)
if sync_mode:
response2 = embedding(
"openai/text-embedding-ada-002",
input=input,
caching=True,
)
else:
response2 = await litellm.aembedding(
"openai/text-embedding-ada-002",
input=input,
caching=True,
)
if response2._hidden_params["cache_hit"] is not True:
pytest.fail("Cache hit should be True")
# Since the parameters are not the same as response1, response3 should actually
# be the mock response
if sync_mode:
response3 = embedding(
"openai/text-embedding-ada-002",
input=input,
user="charlie",
caching=True,
)
else:
response3 = await litellm.aembedding(
"openai/text-embedding-ada-002",
input=input,
caching=True,
user="charlie",
)
print("\nresponse 1", response1)
print("\nresponse 2", response2)
print("\nresponse 3", response3)
litellm.cache = None
litellm.success_callback = []
litellm._async_success_callback = []
# 1 & 2 should be exactly the same
# 1 & 3 should be different, since input params are diff
if response3._hidden_params.get("cache_hit") is True:
pytest.fail("Cache hit should not be True")


@@ -438,7 +438,7 @@ async def test_send_daily_reports_ignores_zero_values():
slack_alerting.internal_usage_cache.async_batch_get_cache = AsyncMock(
return_value=[None, 0, 10, 0, 0, None]
)
-    slack_alerting.internal_usage_cache.async_batch_set_cache = AsyncMock()
+    slack_alerting.internal_usage_cache.async_set_cache_pipeline = AsyncMock()
router.get_model_info.side_effect = lambda x: {"litellm_params": {"model": x}}


@@ -1103,81 +1103,6 @@ async def test_redis_cache_acompletion_stream_bedrock():
raise e
def test_disk_cache_completion():
litellm.set_verbose = False
random_number = random.randint(
1, 100000
) # add a random number to ensure it's always adding / reading from cache
messages = [
{"role": "user", "content": f"write a one sentence poem about: {random_number}"}
]
litellm.cache = Cache(
type="disk",
)
response1 = completion(
model="gpt-3.5-turbo",
messages=messages,
caching=True,
max_tokens=20,
mock_response="This number is so great!",
)
# response2 is mocked to a different response from response1,
# but the completion from the cache should be used instead of the mock
# response since the input is the same as response1
response2 = completion(
model="gpt-3.5-turbo",
messages=messages,
caching=True,
max_tokens=20,
mock_response="This number is awful!",
)
# Since the parameters are not the same as response1, response3 should actually
# be the mock response
response3 = completion(
model="gpt-3.5-turbo",
messages=messages,
caching=True,
temperature=0.5,
mock_response="This number is awful!",
)
print("\nresponse 1", response1)
print("\nresponse 2", response2)
print("\nresponse 3", response3)
litellm.cache = None
litellm.success_callback = []
litellm._async_success_callback = []
# 1 & 2 should be exactly the same
# 1 & 3 should be different, since input params are diff
if (
response1["choices"][0]["message"]["content"]
!= response2["choices"][0]["message"]["content"]
): # 1 and 2 should be the same
# 1&2 have the exact same input params. This MUST Be a CACHE HIT
print(f"response1: {response1}")
print(f"response2: {response2}")
pytest.fail(f"Error occurred:")
if (
response1["choices"][0]["message"]["content"]
== response3["choices"][0]["message"]["content"]
):
# if input params like max_tokens, temperature are diff it should NOT be a cache hit
print(f"response1: {response1}")
print(f"response3: {response3}")
        pytest.fail(
            "Response 1 == response 3. Same model with different params"
            " should not be a cache hit."
        )
assert response1.id == response2.id
assert response1.created == response2.created
assert response1.choices[0].message.content == response2.choices[0].message.content
# @pytest.mark.skip(reason="AWS Suspended Account")
@pytest.mark.parametrize("sync_mode", [True, False])
@pytest.mark.asyncio


@@ -0,0 +1,11 @@
from cache_unit_tests import LLMCachingUnitTests
from litellm.caching import LiteLLMCacheType


class TestDiskCacheUnitTests(LLMCachingUnitTests):
    def get_cache_type(self) -> LiteLLMCacheType:
        return LiteLLMCacheType.DISK


# if __name__ == "__main__":
#     pytest.main([__file__, "-v", "-s"])
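
Other backends can reuse the same suite by overriding only get_cache_type, e.g. a hypothetical Redis variant (not part of this commit):

class TestRedisCacheUnitTests(LLMCachingUnitTests):
    def get_cache_type(self) -> LiteLLMCacheType:
        return LiteLLMCacheType.REDIS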


@@ -146,7 +146,7 @@ async def test_dual_cache_batch_operations(is_async):
# Set values
if is_async:
-        await dual_cache.async_batch_set_cache(cache_list)
+        await dual_cache.async_set_cache_pipeline(cache_list)
else:
for key, value in cache_list:
dual_cache.set_cache(key, value)
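
For context on the rename above: judging from this diff, async_set_cache_pipeline takes the same list of (key, value) pairs the old async_batch_set_cache did, so call sites only change the method name. A small sketch, assuming an already-initialized DualCache:

async def warm_cache(dual_cache) -> None:
    # Same (key, value) pairs the old async_batch_set_cache accepted;
    # only the method name changes at the call site.
    cache_list = [("key_1", "value_1"), ("key_2", "value_2")]
    await dual_cache.async_set_cache_pipeline(cache_list)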


@@ -212,26 +212,48 @@ def test_get_langfuse_logger_for_request_with_cached_logger():
assert result == cached_logger
mock_cache.get_cache.assert_called_once()
@pytest.mark.parametrize("metadata", [
{'a': 1, 'b': 2, 'c': 3},
{'a': {'nested_a': 1}, 'b': {'nested_b': 2}},
{'a': [1, 2, 3], 'b': {4, 5, 6}},
{'a': (1, 2), 'b': frozenset([3, 4]), 'c': {'d': [5, 6]}},
{'lock': threading.Lock()},
{'func': lambda x: x + 1},
{
'int': 42,
'str': 'hello',
'list': [1, 2, 3],
'set': {4, 5},
'dict': {'nested': 'value'},
'non_copyable': threading.Lock(),
'function': print
},
['list', 'not', 'a', 'dict'],
{'timestamp': datetime.now()},
{},
None,
])
def test_langfuse_logger_prepare_metadata(metadata):
global_langfuse_logger._prepare_metadata(metadata)
@pytest.mark.parametrize(
"metadata, expected_metadata",
[
({"a": 1, "b": 2, "c": 3}, {"a": 1, "b": 2, "c": 3}),
(
{"a": {"nested_a": 1}, "b": {"nested_b": 2}},
{"a": {"nested_a": 1}, "b": {"nested_b": 2}},
),
({"a": [1, 2, 3], "b": {4, 5, 6}}, {"a": [1, 2, 3], "b": {4, 5, 6}}),
(
{"a": (1, 2), "b": frozenset([3, 4]), "c": {"d": [5, 6]}},
{"a": (1, 2), "b": frozenset([3, 4]), "c": {"d": [5, 6]}},
),
({"lock": threading.Lock()}, {}),
({"func": lambda x: x + 1}, {}),
(
{
"int": 42,
"str": "hello",
"list": [1, 2, 3],
"set": {4, 5},
"dict": {"nested": "value"},
"non_copyable": threading.Lock(),
"function": print,
},
{
"int": 42,
"str": "hello",
"list": [1, 2, 3],
"set": {4, 5},
"dict": {"nested": "value"},
},
),
(
{"list": ["list", "not", "a", "dict"]},
{"list": ["list", "not", "a", "dict"]},
),
({}, {}),
(None, None),
],
)
def test_langfuse_logger_prepare_metadata(metadata, expected_metadata):
result = global_langfuse_logger._prepare_metadata(metadata)
assert result == expected_metadata