(refactor caching) use LLMCachingHandler for caching streaming responses (#6210)

* use a folder for caching

* fix caching imports

* fix clickhouse pyright errors

* fix linting

* correctly pass kwargs and args

* fix test case for embedding

* fix linting

* fix embedding caching logic

* refactor cache handling in utils.py

* refactor async set stream cache

* fix linting
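For context on what the refactor is driving at: the inline cache-write logic for async streaming responses moves out of utils.py and behind a dedicated LLMCachingHandler. A minimal sketch of that separation of concerns, using illustrative names (only `async_add_cache`/`add_cache` appear in the diff below; everything else here is hypothetical):

```python
from typing import Any, Optional


class CachingHandlerSketch:
    """Illustrative stand-in for LLMCachingHandler.

    Owns the decision of when and how a response is written to the
    cache, so logging/callback code no longer needs to know about
    cache backends or stream completion.
    """

    def __init__(self, cache: Optional[Any] = None) -> None:
        self.cache = cache  # e.g. litellm.cache in the real code

    async def async_set_cache(self, result: Any, **kwargs: Any) -> None:
        # No-op when caching is disabled.
        if self.cache is None:
            return
        # Prefer the async write path when the backend supports it.
        if hasattr(self.cache, "async_add_cache"):
            await self.cache.async_add_cache(result, **kwargs)
        else:
            self.cache.add_cache(result, **kwargs)
```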
Ishaan Jaff authored 2024-10-14 17:46:45 +05:30 · committed by GitHub
parent 78f3228e17
commit d1bef4ad81
3 changed files with 75 additions and 37 deletions


@@ -24,6 +24,7 @@ from litellm import (
     verbose_logger,
 )
 from litellm.caching.caching import DualCache, InMemoryCache, S3Cache
+from litellm.caching.caching_handler import LLMCachingHandler
 from litellm.cost_calculator import _select_model_name_for_cost_calc
 from litellm.integrations.custom_guardrail import CustomGuardrail
 from litellm.integrations.custom_logger import CustomLogger
@@ -271,6 +272,7 @@ class Logging:
         ## TIME TO FIRST TOKEN LOGGING ##
         self.completion_start_time: Optional[datetime.datetime] = None
+        self._llm_caching_handler: Optional[LLMCachingHandler] = None

     def process_dynamic_callbacks(self):
         """
@@ -1625,35 +1627,6 @@ class Logging:
                 if kwargs.get("no-log", False) is True:
                     print_verbose("no-log request, skipping logging")
                     continue
-                if (
-                    callback == "cache"
-                    and litellm.cache is not None
-                    and self.model_call_details.get("litellm_params", {}).get(
-                        "acompletion", False
-                    )
-                    is True
-                ):
-                    # set_cache once complete streaming response is built
-                    print_verbose("async success_callback: reaches cache for logging!")
-                    kwargs = self.model_call_details
-                    if self.stream:
-                        if "async_complete_streaming_response" not in kwargs:
-                            print_verbose(
-                                f"async success_callback: reaches cache for logging, there is no async_complete_streaming_response. Kwargs={kwargs}\n\n"
-                            )
-                            pass
-                        else:
-                            print_verbose(
-                                "async success_callback: reaches cache for logging, there is a async_complete_streaming_response. Adding to cache"
-                            )
-                            result = kwargs["async_complete_streaming_response"]
-                            # only add to cache once we have a complete streaming response
-                            if litellm.cache is not None and not isinstance(
-                                litellm.cache.cache, S3Cache
-                            ):
-                                await litellm.cache.async_add_cache(result, **kwargs)
-                            else:
-                                litellm.cache.add_cache(result, **kwargs)
                 if callback == "openmeter" and openMeterLogger is not None:
                     if self.stream is True:
                         if (
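For reference, the block deleted above gated the cache write on four conditions: the callback is "cache", litellm.cache is configured, the call was async (acompletion), and, for streams, the fully assembled async_complete_streaming_response is present; S3-backed caches took the synchronous add_cache path. A hedged sketch of that logic relocated into a handler method (method and parameter names are illustrative, not LLMCachingHandler's actual API):

```python
from typing import Any, Optional


class StreamCacheWriterSketch:
    """Illustrative relocation of the caching logic removed above."""

    def __init__(
        self, cache: Optional[Any] = None, supports_async_write: bool = True
    ) -> None:
        self.cache = cache
        # False models the S3Cache case above, which used the sync path.
        self.supports_async_write = supports_async_write

    async def async_set_streaming_cache(
        self, complete_response: Optional[Any], **kwargs: Any
    ) -> None:
        # Mirrors "litellm.cache is not None" in the removed block.
        if self.cache is None:
            return
        if complete_response is None:
            # Stream not fully assembled yet; caching a partial
            # response would poison later cache hits.
            return
        if self.supports_async_write:
            await self.cache.async_add_cache(complete_response, **kwargs)
        else:
            self.cache.add_cache(complete_response, **kwargs)
```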