(refactor) caching - use _sync_set_cache (#6224)
* caching - use _sync_set_cache
* add sync _sync_add_streaming_response_to_cache
* use caching class for cache storage
parent b8d4973661
commit da6a7c3a55
3 changed files with 89 additions and 28 deletions
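The hunks below move the inline `litellm.cache` checks (in the async set-cache path and in the sync `client` wrapper) behind two `LLMCachingHandler` methods, `_sync_set_cache` and `_should_store_result_in_cache`. As a reading aid, here is a minimal, self-contained sketch of the gate that helper applies; the standalone function, the `object()` cache stand-in, and the `completion` stub are illustrative, not litellm code:

# Sketch (not litellm source): the storage gate that _should_store_result_in_cache
# centralizes, mirroring the conditions in the diff below.
from typing import Any, Callable, Dict, List, Optional


def should_store_result_in_cache(
    cache: Optional[Any],
    supported_call_types: Optional[List[str]],
    original_function: Callable,
    kwargs: Dict[str, Any],
) -> bool:
    # store only if a cache is configured, the call type is supported,
    # and the request did not opt out via cache={"no-store": True}
    return (
        cache is not None
        and supported_call_types is not None
        and str(original_function.__name__) in supported_call_types
        and kwargs.get("cache", {}).get("no-store", False) is not True
    )


def completion(**kwargs: Any) -> None:  # stand-in for a supported call type
    return None


print(should_store_result_in_cache(object(), ["completion"], completion, {}))  # True
print(should_store_result_in_cache(object(), ["completion"], completion, {"cache": {"no-store": True}}))  # False

A request can still opt out of storage per call by passing cache={"no-store": True}, which the helper checks last.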
@@ -67,6 +67,7 @@ class LLMCachingHandler:
         start_time: datetime.datetime,
     ):
         self.async_streaming_chunks: List[ModelResponse] = []
+        self.sync_streaming_chunks: List[ModelResponse] = []
         self.request_kwargs = request_kwargs
         self.original_function = original_function
         self.start_time = start_time
@@ -470,12 +471,11 @@ class LLMCachingHandler:
             None
         """
         args = args or ()
+        if litellm.cache is None:
+            return
         # [OPTIONAL] ADD TO CACHE
-        if (
-            (litellm.cache is not None)
-            and litellm.cache.supported_call_types is not None
-            and (str(original_function.__name__) in litellm.cache.supported_call_types)
-            and (kwargs.get("cache", {}).get("no-store", False) is not True)
+        if self._should_store_result_in_cache(
+            original_function=original_function, kwargs=kwargs
         ):
             if (
                 isinstance(result, litellm.ModelResponse)
@@ -509,6 +509,42 @@ class LLMCachingHandler:
                     litellm.cache.async_add_cache(result, *args, **kwargs)
                 )

+    def _sync_set_cache(
+        self,
+        result: Any,
+        kwargs: Dict[str, Any],
+        args: Optional[Tuple[Any, ...]] = None,
+    ):
+        """
+        Sync internal method to add the result to the cache
+        """
+        if litellm.cache is None:
+            return
+
+        args = args or ()
+        if self._should_store_result_in_cache(
+            original_function=self.original_function, kwargs=kwargs
+        ):
+            litellm.cache.add_cache(result, *args, **kwargs)
+
+        return
+
+    def _should_store_result_in_cache(
+        self, original_function: Callable, kwargs: Dict[str, Any]
+    ) -> bool:
+        """
+        Helper function to determine if the result should be stored in the cache.
+
+        Returns:
+            bool: True if the result should be stored in the cache, False otherwise.
+        """
+        return (
+            (litellm.cache is not None)
+            and litellm.cache.supported_call_types is not None
+            and (str(original_function.__name__) in litellm.cache.supported_call_types)
+            and (kwargs.get("cache", {}).get("no-store", False) is not True)
+        )
+
     async def _add_streaming_response_to_cache(self, processed_chunk: ModelResponse):
         """
         Internal method to add the streaming response to the cache
@@ -536,3 +572,25 @@ class LLMCachingHandler:
                 original_function=self.original_function,
                 kwargs=self.request_kwargs,
             )
+
+    def _sync_add_streaming_response_to_cache(self, processed_chunk: ModelResponse):
+        """
+        Sync internal method to add the streaming response to the cache
+        """
+        complete_streaming_response: Optional[
+            Union[ModelResponse, TextCompletionResponse]
+        ] = _assemble_complete_response_from_streaming_chunks(
+            result=processed_chunk,
+            start_time=self.start_time,
+            end_time=datetime.datetime.now(),
+            request_kwargs=self.request_kwargs,
+            streaming_chunks=self.sync_streaming_chunks,
+            is_async=False,
+        )
+
+        # if a complete_streaming_response is assembled, add it to the cache
+        if complete_streaming_response is not None:
+            self._sync_set_cache(
+                result=complete_streaming_response,
+                kwargs=self.request_kwargs,
+            )
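The new `_sync_add_streaming_response_to_cache` mirrors the async variant: chunks collected in `self.sync_streaming_chunks` are handed to `_assemble_complete_response_from_streaming_chunks`, and only a fully assembled response reaches `_sync_set_cache`. A toy sketch of that accumulate-then-store pattern (the assembler and dict cache here are stand-ins, not litellm internals):

# Sketch (stand-ins only): accumulate streaming chunks, then store a single
# assembled response in the cache once the stream has finished.
from typing import Any, Dict, List, Optional


class ToyStreamingCacheHandler:
    def __init__(self) -> None:
        self.sync_streaming_chunks: List[str] = []
        self.cache: Dict[str, Any] = {}

    def add_chunk(self, chunk: str, is_last: bool) -> None:
        self.sync_streaming_chunks.append(chunk)
        complete: Optional[str] = (
            "".join(self.sync_streaming_chunks) if is_last else None
        )
        # only add to cache once we have a complete streaming response
        if complete is not None:
            self.cache["last_response"] = complete


handler = ToyStreamingCacheHandler()
for i, chunk in enumerate(["Hel", "lo"]):
    handler.add_chunk(chunk, is_last=(i == 1))
print(handler.cache)  # {'last_response': 'Hello'}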
@@ -938,19 +938,6 @@ class Logging:
         else:
             callbacks = litellm.success_callback

-        ## STREAMING CACHING ##
-        if "cache" in callbacks and litellm.cache is not None:
-            # this only logs streaming once, complete_streaming_response exists i.e when stream ends
-            print_verbose("success_callback: reaches cache for logging!")
-            kwargs = self.model_call_details
-            if self.stream and _caching_complete_streaming_response is not None:
-                print_verbose(
-                    "success_callback: reaches cache for logging, there is a complete_streaming_response. Adding to cache"
-                )
-                result = _caching_complete_streaming_response
-                # only add to cache once we have a complete streaming response
-                litellm.cache.add_cache(result, **kwargs)
-
         ## REDACT MESSAGES ##
         result = redact_message_input_output_from_logging(
             model_call_details=(
@@ -765,7 +765,9 @@ def client(original_function):
         print_args_passed_to_litellm(original_function, args, kwargs)
         start_time = datetime.datetime.now()
         result = None
-        logging_obj = kwargs.get("litellm_logging_obj", None)
+        logging_obj: Optional[LiteLLMLoggingObject] = kwargs.get(
+            "litellm_logging_obj", None
+        )

         # only set litellm_call_id if its not in kwargs
         call_type = original_function.__name__
@@ -787,6 +789,12 @@ def client(original_function):
                 original_function.__name__, rules_obj, start_time, *args, **kwargs
             )
         kwargs["litellm_logging_obj"] = logging_obj
+        _llm_caching_handler: LLMCachingHandler = LLMCachingHandler(
+            original_function=original_function,
+            request_kwargs=kwargs,
+            start_time=start_time,
+        )
+        logging_obj._llm_caching_handler = _llm_caching_handler

         # CHECK FOR 'os.environ/' in kwargs
         for k, v in kwargs.items():
@@ -1013,12 +1021,11 @@ def client(original_function):
                 )

             # [OPTIONAL] ADD TO CACHE
-            if (
-                litellm.cache is not None
-                and litellm.cache.supported_call_types is not None
-                and call_type in litellm.cache.supported_call_types
-            ) and (kwargs.get("cache", {}).get("no-store", False) is not True):
-                litellm.cache.add_cache(result, *args, **kwargs)
+            _llm_caching_handler._sync_set_cache(
+                result=result,
+                args=args,
+                kwargs=kwargs,
+            )

             # LOG SUCCESS - handle streaming success logging in the _next_ object, remove `handle_success` once it's deprecated
             verbose_logger.info("Wrapper: Completed Call, calling success_handler")
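From a caller's perspective the sync wrapper change above is meant to be behavior-preserving: with a cache configured, results are stored via `_sync_set_cache` unless the request opts out. A usage sketch, assuming litellm's documented `litellm.Cache()` setup; the model name is a placeholder and a provider API key is assumed to be set in the environment:

# Usage sketch (assumptions noted above); storage still respects the supported
# call types and no-store checks shown in the earlier hunks.
import litellm

litellm.cache = litellm.Cache()  # in-memory cache by default

messages = [{"role": "user", "content": "Hello"}]

# stored in the cache (subject to the gate)
resp = litellm.completion(model="gpt-4o-mini", messages=messages)

# same call, but the result is not stored
resp_no_store = litellm.completion(
    model="gpt-4o-mini", messages=messages, cache={"no-store": True}
)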
@@ -7886,7 +7893,10 @@ class CustomStreamWrapper:
         """
         self.logging_loop = loop

-    def run_success_logging_in_thread(self, processed_chunk, cache_hit: bool):
+    def run_success_logging_and_cache_storage(self, processed_chunk, cache_hit: bool):
+        """
+        Runs success logging in a thread and adds the response to the cache
+        """
         if litellm.disable_streaming_logging is True:
             """
             [NOT RECOMMENDED]
@@ -7914,6 +7924,12 @@ class CustomStreamWrapper:
         ## SYNC LOGGING
         self.logging_obj.success_handler(processed_chunk, None, None, cache_hit)

+        ## Sync store in cache
+        if self.logging_obj._llm_caching_handler is not None:
+            self.logging_obj._llm_caching_handler._sync_add_streaming_response_to_cache(
+                processed_chunk
+            )
+
     def finish_reason_handler(self):
         model_response = self.model_response_creator()
         if self.received_finish_reason is not None:
@@ -7960,7 +7976,7 @@ class CustomStreamWrapper:
                     continue
                 ## LOGGING
                 threading.Thread(
-                    target=self.run_success_logging_in_thread,
+                    target=self.run_success_logging_and_cache_storage,
                     args=(response, cache_hit),
                 ).start() # log response
                 choice = response.choices[0]
@@ -8028,7 +8044,7 @@ class CustomStreamWrapper:
                 processed_chunk._hidden_params["usage"] = usage
                 ## LOGGING
                 threading.Thread(
-                    target=self.run_success_logging_in_thread,
+                    target=self.run_success_logging_and_cache_storage,
                     args=(processed_chunk, cache_hit),
                 ).start() # log response
                 return processed_chunk