Merge pull request #5018 from haadirakhangi/main

Qdrant Semantic Caching
Ishaan Jaff 2024-08-21 08:50:43 -07:00 committed by GitHub
commit 7d0196191f
5 changed files with 694 additions and 6 deletions


@@ -121,7 +121,7 @@ import importlib.metadata
 from openai import OpenAIError as OriginalError
 from ._logging import verbose_logger
-from .caching import RedisCache, RedisSemanticCache, S3Cache
+from .caching import RedisCache, RedisSemanticCache, S3Cache, QdrantSemanticCache
 from .exceptions import (
     APIConnectionError,
     APIError,
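
For context, here is a minimal sketch of how the newly imported QdrantSemanticCache might be exercised from user code, assuming semantic caching is enabled globally via litellm.Cache with a qdrant-semantic type; the keyword argument names below are assumptions for illustration, not taken from this commit (the litellm caching docs define the exact parameters).

import litellm
from litellm import Cache

# Enable semantic caching backed by Qdrant (parameter names are assumed for illustration).
litellm.cache = Cache(
    type="qdrant-semantic",                           # assumed cache type string
    qdrant_api_base="https://your-qdrant-host:6333",  # assumed: Qdrant endpoint
    qdrant_api_key="your-qdrant-api-key",             # assumed: Qdrant credentials
    qdrant_collection_name="litellm-semantic-cache",  # assumed: collection holding cached embeddings
    similarity_threshold=0.8,                         # assumed: minimum similarity for a cache hit
)

# Subsequent completion calls go through the cached client wrapper patched in this diff.
response = litellm.completion(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "What is semantic caching?"}],
)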
@@ -1164,6 +1164,14 @@ def client(original_function):
             cached_result = await litellm.cache.async_get_cache(
                 *args, **kwargs
             )
+        elif isinstance(litellm.cache.cache, QdrantSemanticCache):
+            preset_cache_key = litellm.cache.get_cache_key(*args, **kwargs)
+            kwargs["preset_cache_key"] = (
+                preset_cache_key  # for streaming calls, we need to pass the preset_cache_key
+            )
+            cached_result = await litellm.cache.async_get_cache(
+                *args, **kwargs
+            )
         else:  # for s3 caching. [NOT RECOMMENDED IN PROD - this will slow down responses since boto3 is sync]
             preset_cache_key = litellm.cache.get_cache_key(*args, **kwargs)
             kwargs["preset_cache_key"] = (