(refactor caching) use LLMCachingHandler for caching streaming responses (#6210)

* use a folder for caching

* fix caching imports

* fix clickhouse pyright errors

* fix linting

* correctly pass kwargs and args

* fix test case for embedding

* fix linting

* fix embedding caching logic

* refactor cache handling in utils.py

* refactor async set stream cache

* fix linting
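For context on what the refactor is driving at: the inline cache-write logic for async streaming responses moves out of utils.py and behind a dedicated LLMCachingHandler. A minimal sketch of that separation of concerns, using illustrative names (only `async_add_cache`/`add_cache` appear in the diff below; everything else here is hypothetical):

```python
from typing import Any, Optional


class CachingHandlerSketch:
    """Illustrative stand-in for LLMCachingHandler.

    Owns the decision of when and how a response is written to the
    cache, so logging/callback code no longer needs to know about
    cache backends or stream completion.
    """

    def __init__(self, cache: Optional[Any] = None) -> None:
        self.cache = cache  # e.g. litellm.cache in the real code

    async def async_set_cache(self, result: Any, **kwargs: Any) -> None:
        # No-op when caching is disabled.
        if self.cache is None:
            return
        # Prefer the async write path when the backend supports it.
        if hasattr(self.cache, "async_add_cache"):
            await self.cache.async_add_cache(result, **kwargs)
        else:
            self.cache.add_cache(result, **kwargs)
```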
Ishaan Jaff authored 2024-10-14 17:46:45 +05:30 · committed by GitHub
parent 78f3228e17
commit d1bef4ad81
3 changed files with 75 additions and 37 deletions


@@ -24,6 +24,7 @@ from litellm import (
     verbose_logger,
 )
 from litellm.caching.caching import DualCache, InMemoryCache, S3Cache
+from litellm.caching.caching_handler import LLMCachingHandler
 from litellm.cost_calculator import _select_model_name_for_cost_calc
 from litellm.integrations.custom_guardrail import CustomGuardrail
 from litellm.integrations.custom_logger import CustomLogger
@@ -271,6 +272,7 @@ class Logging:
         ## TIME TO FIRST TOKEN LOGGING ##
         self.completion_start_time: Optional[datetime.datetime] = None
+        self._llm_caching_handler: Optional[LLMCachingHandler] = None

     def process_dynamic_callbacks(self):
         """
@@ -1625,35 +1627,6 @@ class Logging:
                 if kwargs.get("no-log", False) is True:
                     print_verbose("no-log request, skipping logging")
                     continue
-                if (
-                    callback == "cache"
-                    and litellm.cache is not None
-                    and self.model_call_details.get("litellm_params", {}).get(
-                        "acompletion", False
-                    )
-                    is True
-                ):
-                    # set_cache once complete streaming response is built
-                    print_verbose("async success_callback: reaches cache for logging!")
-                    kwargs = self.model_call_details
-                    if self.stream:
-                        if "async_complete_streaming_response" not in kwargs:
-                            print_verbose(
-                                f"async success_callback: reaches cache for logging, there is no async_complete_streaming_response. Kwargs={kwargs}\n\n"
-                            )
-                            pass
-                        else:
-                            print_verbose(
-                                "async success_callback: reaches cache for logging, there is a async_complete_streaming_response. Adding to cache"
-                            )
-                            result = kwargs["async_complete_streaming_response"]
-                            # only add to cache once we have a complete streaming response
-                            if litellm.cache is not None and not isinstance(
-                                litellm.cache.cache, S3Cache
-                            ):
-                                await litellm.cache.async_add_cache(result, **kwargs)
-                            else:
-                                litellm.cache.add_cache(result, **kwargs)
                 if callback == "openmeter" and openMeterLogger is not None:
                     if self.stream is True:
                         if (
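For reference, the block deleted above gated the cache write on four conditions: the callback is "cache", litellm.cache is configured, the call was async (acompletion), and, for streams, the fully assembled async_complete_streaming_response is present; S3-backed caches took the synchronous add_cache path. A hedged sketch of that logic relocated into a handler method (method and parameter names are illustrative, not LLMCachingHandler's actual API):

```python
from typing import Any, Optional


class StreamCacheWriterSketch:
    """Illustrative relocation of the caching logic removed above."""

    def __init__(
        self, cache: Optional[Any] = None, supports_async_write: bool = True
    ) -> None:
        self.cache = cache
        # False models the S3Cache case above, which used the sync path.
        self.supports_async_write = supports_async_write

    async def async_set_streaming_cache(
        self, complete_response: Optional[Any], **kwargs: Any
    ) -> None:
        # Mirrors "litellm.cache is not None" in the removed block.
        if self.cache is None:
            return
        if complete_response is None:
            # Stream not fully assembled yet; caching a partial
            # response would poison later cache hits.
            return
        if self.supports_async_write:
            await self.cache.async_add_cache(complete_response, **kwargs)
        else:
            self.cache.add_cache(complete_response, **kwargs)
```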