LiteLLM Minor Fixes & Improvements (10/09/2024) (#6139)

* fix(utils.py): don't return 'None' response headers

Fixes https://github.com/BerriAI/litellm/issues/6123
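
A minimal sketch of the intent (the helper name is hypothetical; the real change lives in `utils.py`): header entries whose value is `None` are dropped rather than passed back to the caller.

```python
from typing import Dict, Optional


def clean_response_headers(headers: Optional[Dict[str, str]]) -> Optional[Dict[str, str]]:
    """Drop header entries whose value is None instead of forwarding them."""
    if headers is None:
        return None
    cleaned = {k: v for k, v in headers.items() if v is not None}
    return cleaned or None
```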

* fix(vertex_and_google_ai_studio_gemini.py): support parsing out 'additionalProperties' and 'strict' values for tool calls

Fixes https://github.com/BerriAI/litellm/issues/6136
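
Gemini's function-declaration schema does not accept OpenAI-style `additionalProperties` or `strict` keys, so they have to be parsed out of the tool definition before the request is sent. A hedged sketch of that idea (the helper name is illustrative, not the file's actual function):

```python
from typing import Any, Dict


def strip_unsupported_schema_keys(schema: Dict[str, Any]) -> Dict[str, Any]:
    """Recursively remove JSON-schema keys that Gemini's tool schema rejects."""
    cleaned = {}
    for key, value in schema.items():
        if key in ("additionalProperties", "strict"):
            continue
        cleaned[key] = strip_unsupported_schema_keys(value) if isinstance(value, dict) else value
    return cleaned
```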

* fix(cost_calculator.py): set default character value to None

Fixes https://github.com/BerriAI/litellm/issues/6133#issuecomment-2403290196
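
A hedged sketch of the change (hypothetical names): character counts default to `None` so the calculator can distinguish "no count supplied" from a genuine count of zero, and fall back to the text length instead of silently billing zero characters.

```python
from typing import Optional


def character_cost(
    characters: Optional[int],
    cost_per_character: float,
    fallback_text: str = "",
) -> float:
    if characters is None:
        # not supplied -> derive a count from the text rather than billing 0 characters
        characters = len(fallback_text)
    return characters * cost_per_character
```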

* fix(google.py): fix cost per token / cost per char conversion

Fixes https://github.com/BerriAI/litellm/issues/6133#issuecomment-2403370287
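
Google publishes Gemini prices per character while most of litellm works in per-token units, so the two have to be converted consistently. A sketch of that conversion, assuming the common ~4 characters-per-token approximation (the exact factor here is an assumption, not the file's literal constant):

```python
def char_price_to_token_price(cost_per_character: float, chars_per_token: float = 4.0) -> float:
    return cost_per_character * chars_per_token


def token_price_to_char_price(cost_per_token: float, chars_per_token: float = 4.0) -> float:
    return cost_per_token / chars_per_token
```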

* build(model_prices_and_context_window.json): update gemini pricing

Fixes https://github.com/BerriAI/litellm/issues/6133

* build(model_prices_and_context_window.json): update gemini pricing

* fix(litellm_logging.py): fix streaming caching logging when 'turn_off_message_logging' is enabled

Stores unredacted response in cache
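
The diff below moves the cache write ahead of message redaction: an unredacted deep copy of the completed streaming response is written to the cache, and only the copy handed to logging callbacks gets redacted. A usage sketch of the behaviour this preserves (the model name and credentials are placeholders):

```python
import litellm
from litellm.caching import Cache

litellm.turn_off_message_logging = True  # redact messages/responses in logging callbacks
litellm.cache = Cache()  # in-memory cache by default

response = litellm.completion(
    model="gemini/gemini-1.5-flash",
    messages=[{"role": "user", "content": "hi"}],
    stream=True,
)
for _chunk in response:
    pass  # once the stream completes, the full (unredacted) response is cached
```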

* build(model_prices_and_context_window.json): update gemini-1.5-flash pricing

* fix(cost_calculator.py): fix default prompt_character count logic

Fixes an error in Gemini cost calculation
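
A hedged sketch of the fallback (helper name is hypothetical): when no prompt character count is passed in, derive it from the message text instead of treating it as zero, which is what threw off the Gemini character-based cost calculation.

```python
from typing import List, Optional


def resolve_prompt_characters(prompt_characters: Optional[int], messages: List[dict]) -> int:
    if prompt_characters is not None:
        return prompt_characters
    # count characters in string message contents when no explicit count was given
    return sum(len(m["content"]) for m in messages if isinstance(m.get("content"), str))
```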

* fix(cost_calculator.py): fix cost calc for tts models
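
Speech models are billed on input characters rather than tokens, so the cost is the input length times the per-character price. A minimal sketch under that assumption (names are illustrative):

```python
def tts_cost(input_text: str, input_cost_per_character: float) -> float:
    return len(input_text) * input_cost_per_character
```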
Krish Dholakia, 2024-10-10 00:42:11 -07:00 (committed by GitHub)
parent 60baa65e0e
commit 6005450c8f
16 changed files with 788 additions and 534 deletions


@@ -901,7 +901,9 @@ class Logging:
complete_streaming_response = None
else:
self.sync_streaming_chunks.append(result)
_caching_complete_streaming_response: Optional[
Union[ModelResponse, TextCompletionResponse]
] = None
if complete_streaming_response is not None:
verbose_logger.debug(
"Logging Details LiteLLM-Success Call streaming complete"
@@ -909,6 +911,9 @@ class Logging:
self.model_call_details["complete_streaming_response"] = (
complete_streaming_response
)
_caching_complete_streaming_response = copy.deepcopy(
complete_streaming_response
)
self.model_call_details["response_cost"] = (
self._response_cost_calculator(result=complete_streaming_response)
)
@@ -937,6 +942,20 @@ class Logging:
else:
callbacks = litellm.success_callback
## STREAMING CACHING ##
if "cache" in callbacks and litellm.cache is not None:
# this only logs streaming once, complete_streaming_response exists i.e when stream ends
print_verbose("success_callback: reaches cache for logging!")
kwargs = self.model_call_details
if self.stream and _caching_complete_streaming_response is not None:
print_verbose(
"success_callback: reaches cache for logging, there is a complete_streaming_response. Adding to cache"
)
result = _caching_complete_streaming_response
# only add to cache once we have a complete streaming response
litellm.cache.add_cache(result, **kwargs)
## REDACT MESSAGES ##
result = redact_message_input_output_from_logging(
model_call_details=(
self.model_call_details
@@ -1302,23 +1321,6 @@ class Logging:
end_time=end_time,
print_verbose=print_verbose,
)
if callback == "cache" and litellm.cache is not None:
# this only logs streaming once, complete_streaming_response exists i.e when stream ends
print_verbose("success_callback: reaches cache for logging!")
kwargs = self.model_call_details
if self.stream:
if "complete_streaming_response" not in kwargs:
print_verbose(
f"success_callback: reaches cache for logging, there is no complete_streaming_response. Kwargs={kwargs}\n\n"
)
pass
else:
print_verbose(
"success_callback: reaches cache for logging, there is a complete_streaming_response. Adding to cache"
)
result = kwargs["complete_streaming_response"]
# only add to cache once we have a complete streaming response
litellm.cache.add_cache(result, **kwargs)
if callback == "athina" and athinaLogger is not None:
deep_copy = {}
for k, v in self.model_call_details.items():