LiteLLM Minor Fixes & Improvements (10/09/2024) (#6139)

* fix(utils.py): don't return 'None' response headers

Fixes https://github.com/BerriAI/litellm/issues/6123
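
A minimal sketch of the intent (the helper name is hypothetical; the real change lives in `utils.py`): header entries whose value is `None` are dropped rather than passed back to the caller.

```python
from typing import Dict, Optional


def clean_response_headers(headers: Optional[Dict[str, str]]) -> Optional[Dict[str, str]]:
    """Drop header entries whose value is None instead of forwarding them."""
    if headers is None:
        return None
    cleaned = {k: v for k, v in headers.items() if v is not None}
    return cleaned or None
```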

* fix(vertex_and_google_ai_studio_gemini.py): support parsing out 'additionalProperties' and 'strict' values for tool calls

Fixes https://github.com/BerriAI/litellm/issues/6136
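
Gemini's function-declaration schema does not accept OpenAI-style `additionalProperties` or `strict` keys, so they have to be parsed out of the tool definition before the request is sent. A hedged sketch of that idea (the helper name is illustrative, not the file's actual function):

```python
from typing import Any, Dict


def strip_unsupported_schema_keys(schema: Dict[str, Any]) -> Dict[str, Any]:
    """Recursively remove JSON-schema keys that Gemini's tool schema rejects."""
    cleaned = {}
    for key, value in schema.items():
        if key in ("additionalProperties", "strict"):
            continue
        cleaned[key] = strip_unsupported_schema_keys(value) if isinstance(value, dict) else value
    return cleaned
```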

* fix(cost_calculator.py): set default character value to None

Fixes https://github.com/BerriAI/litellm/issues/6133#issuecomment-2403290196
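
A hedged sketch of the change (hypothetical names): character counts default to `None` so the calculator can distinguish "no count supplied" from a genuine count of zero, and fall back to the text length instead of silently billing zero characters.

```python
from typing import Optional


def character_cost(
    characters: Optional[int],
    cost_per_character: float,
    fallback_text: str = "",
) -> float:
    if characters is None:
        # not supplied -> derive a count from the text rather than billing 0 characters
        characters = len(fallback_text)
    return characters * cost_per_character
```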

* fix(google.py): fix cost per token / cost per char conversion

Fixes https://github.com/BerriAI/litellm/issues/6133#issuecomment-2403370287
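
Google publishes Gemini prices per character while most of litellm works in per-token units, so the two have to be converted consistently. A sketch of that conversion, assuming the common ~4 characters-per-token approximation (the exact factor here is an assumption, not the file's literal constant):

```python
def char_price_to_token_price(cost_per_character: float, chars_per_token: float = 4.0) -> float:
    return cost_per_character * chars_per_token


def token_price_to_char_price(cost_per_token: float, chars_per_token: float = 4.0) -> float:
    return cost_per_token / chars_per_token
```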

* build(model_prices_and_context_window.json): update gemini pricing

Fixes https://github.com/BerriAI/litellm/issues/6133

* build(model_prices_and_context_window.json): update gemini pricing

* fix(litellm_logging.py): fix streaming caching logging when 'turn_off_message_logging' is enabled

Stores unredacted response in cache
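
The diff below moves the cache write ahead of message redaction: an unredacted deep copy of the completed streaming response is written to the cache, and only the copy handed to logging callbacks gets redacted. A usage sketch of the behaviour this preserves (the model name and credentials are placeholders):

```python
import litellm
from litellm.caching import Cache

litellm.turn_off_message_logging = True  # redact messages/responses in logging callbacks
litellm.cache = Cache()  # in-memory cache by default

response = litellm.completion(
    model="gemini/gemini-1.5-flash",
    messages=[{"role": "user", "content": "hi"}],
    stream=True,
)
for _chunk in response:
    pass  # once the stream completes, the full (unredacted) response is cached
```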

* build(model_prices_and_context_window.json): update gemini-1.5-flash pricing

* fix(cost_calculator.py): fix default prompt_character count logic

Fixes an error in Gemini cost calculation
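
A hedged sketch of the fallback (helper name is hypothetical): when no prompt character count is passed in, derive it from the message text instead of treating it as zero, which is what threw off the Gemini character-based cost calculation.

```python
from typing import List, Optional


def resolve_prompt_characters(prompt_characters: Optional[int], messages: List[dict]) -> int:
    if prompt_characters is not None:
        return prompt_characters
    # count characters in string message contents when no explicit count was given
    return sum(len(m["content"]) for m in messages if isinstance(m.get("content"), str))
```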

* fix(cost_calculator.py): fix cost calc for tts models
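
Speech models are billed on input characters rather than tokens, so the cost is the input length times the per-character price. A minimal sketch under that assumption (names are illustrative):

```python
def tts_cost(input_text: str, input_cost_per_character: float) -> float:
    return len(input_text) * input_cost_per_character
```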
Krish Dholakia, 2024-10-10 00:42:11 -07:00 (committed by GitHub)
parent 60baa65e0e
commit 6005450c8f
16 changed files with 788 additions and 534 deletions


@@ -901,7 +901,9 @@ class Logging:
complete_streaming_response = None
else:
self.sync_streaming_chunks.append(result)
_caching_complete_streaming_response: Optional[
Union[ModelResponse, TextCompletionResponse]
] = None
if complete_streaming_response is not None:
verbose_logger.debug(
"Logging Details LiteLLM-Success Call streaming complete"
@@ -909,6 +911,9 @@ class Logging:
self.model_call_details["complete_streaming_response"] = (
complete_streaming_response
)
_caching_complete_streaming_response = copy.deepcopy(
complete_streaming_response
)
self.model_call_details["response_cost"] = (
self._response_cost_calculator(result=complete_streaming_response)
)
@@ -937,6 +942,20 @@ class Logging:
else:
callbacks = litellm.success_callback
## STREAMING CACHING ##
if "cache" in callbacks and litellm.cache is not None:
# this only logs streaming once, complete_streaming_response exists i.e when stream ends
print_verbose("success_callback: reaches cache for logging!")
kwargs = self.model_call_details
if self.stream and _caching_complete_streaming_response is not None:
print_verbose(
"success_callback: reaches cache for logging, there is a complete_streaming_response. Adding to cache"
)
result = _caching_complete_streaming_response
# only add to cache once we have a complete streaming response
litellm.cache.add_cache(result, **kwargs)
## REDACT MESSAGES ##
result = redact_message_input_output_from_logging(
model_call_details=(
self.model_call_details
@@ -1302,23 +1321,6 @@ class Logging:
end_time=end_time,
print_verbose=print_verbose,
)
if callback == "cache" and litellm.cache is not None:
# this only logs streaming once, complete_streaming_response exists i.e when stream ends
print_verbose("success_callback: reaches cache for logging!")
kwargs = self.model_call_details
if self.stream:
if "complete_streaming_response" not in kwargs:
print_verbose(
f"success_callback: reaches cache for logging, there is no complete_streaming_response. Kwargs={kwargs}\n\n"
)
pass
else:
print_verbose(
"success_callback: reaches cache for logging, there is a complete_streaming_response. Adding to cache"
)
result = kwargs["complete_streaming_response"]
# only add to cache once we have a complete streaming response
litellm.cache.add_cache(result, **kwargs)
if callback == "athina" and athinaLogger is not None:
deep_copy = {}
for k, v in self.model_call_details.items():