(refactor) caching - use _sync_set_cache (#6224)
* caching - use _sync_set_cache
* add sync _sync_add_streaming_response_to_cache
* use caching class for cache storage
parent b8d4973661
commit da6a7c3a55
3 changed files with 89 additions and 28 deletions
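The hunks below move the inline `litellm.cache` checks (in the async set-cache path and in the sync `client` wrapper) behind two `LLMCachingHandler` methods, `_sync_set_cache` and `_should_store_result_in_cache`. As a reading aid, here is a minimal, self-contained sketch of the gate that helper applies; the standalone function, the `object()` cache stand-in, and the `completion` stub are illustrative, not litellm code:

# Sketch (not litellm source): the storage gate that _should_store_result_in_cache
# centralizes, mirroring the conditions in the diff below.
from typing import Any, Callable, Dict, List, Optional


def should_store_result_in_cache(
    cache: Optional[Any],
    supported_call_types: Optional[List[str]],
    original_function: Callable,
    kwargs: Dict[str, Any],
) -> bool:
    # store only if a cache is configured, the call type is supported,
    # and the request did not opt out via cache={"no-store": True}
    return (
        cache is not None
        and supported_call_types is not None
        and str(original_function.__name__) in supported_call_types
        and kwargs.get("cache", {}).get("no-store", False) is not True
    )


def completion(**kwargs: Any) -> None:  # stand-in for a supported call type
    return None


print(should_store_result_in_cache(object(), ["completion"], completion, {}))  # True
print(should_store_result_in_cache(object(), ["completion"], completion, {"cache": {"no-store": True}}))  # False

A request can still opt out of storage per call by passing cache={"no-store": True}, which the helper checks last.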
@@ -67,6 +67,7 @@ class LLMCachingHandler:
         start_time: datetime.datetime,
     ):
         self.async_streaming_chunks: List[ModelResponse] = []
+        self.sync_streaming_chunks: List[ModelResponse] = []
         self.request_kwargs = request_kwargs
         self.original_function = original_function
         self.start_time = start_time
@@ -470,12 +471,11 @@ class LLMCachingHandler:
             None
         """
         args = args or ()
+        if litellm.cache is None:
+            return
         # [OPTIONAL] ADD TO CACHE
-        if (
-            (litellm.cache is not None)
-            and litellm.cache.supported_call_types is not None
-            and (str(original_function.__name__) in litellm.cache.supported_call_types)
-            and (kwargs.get("cache", {}).get("no-store", False) is not True)
+        if self._should_store_result_in_cache(
+            original_function=original_function, kwargs=kwargs
         ):
             if (
                 isinstance(result, litellm.ModelResponse)
@@ -509,6 +509,42 @@ class LLMCachingHandler:
                     litellm.cache.async_add_cache(result, *args, **kwargs)
                 )

+    def _sync_set_cache(
+        self,
+        result: Any,
+        kwargs: Dict[str, Any],
+        args: Optional[Tuple[Any, ...]] = None,
+    ):
+        """
+        Sync internal method to add the result to the cache
+        """
+        if litellm.cache is None:
+            return
+
+        args = args or ()
+        if self._should_store_result_in_cache(
+            original_function=self.original_function, kwargs=kwargs
+        ):
+            litellm.cache.add_cache(result, *args, **kwargs)
+
+        return
+
+    def _should_store_result_in_cache(
+        self, original_function: Callable, kwargs: Dict[str, Any]
+    ) -> bool:
+        """
+        Helper function to determine if the result should be stored in the cache.
+
+        Returns:
+            bool: True if the result should be stored in the cache, False otherwise.
+        """
+        return (
+            (litellm.cache is not None)
+            and litellm.cache.supported_call_types is not None
+            and (str(original_function.__name__) in litellm.cache.supported_call_types)
+            and (kwargs.get("cache", {}).get("no-store", False) is not True)
+        )
+
     async def _add_streaming_response_to_cache(self, processed_chunk: ModelResponse):
         """
         Internal method to add the streaming response to the cache
@@ -536,3 +572,25 @@ class LLMCachingHandler:
                 original_function=self.original_function,
                 kwargs=self.request_kwargs,
             )
+
+    def _sync_add_streaming_response_to_cache(self, processed_chunk: ModelResponse):
+        """
+        Sync internal method to add the streaming response to the cache
+        """
+        complete_streaming_response: Optional[
+            Union[ModelResponse, TextCompletionResponse]
+        ] = _assemble_complete_response_from_streaming_chunks(
+            result=processed_chunk,
+            start_time=self.start_time,
+            end_time=datetime.datetime.now(),
+            request_kwargs=self.request_kwargs,
+            streaming_chunks=self.sync_streaming_chunks,
+            is_async=False,
+        )
+
+        # if a complete_streaming_response is assembled, add it to the cache
+        if complete_streaming_response is not None:
+            self._sync_set_cache(
+                result=complete_streaming_response,
+                kwargs=self.request_kwargs,
+            )
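The new `_sync_add_streaming_response_to_cache` mirrors the async variant: chunks collected in `self.sync_streaming_chunks` are handed to `_assemble_complete_response_from_streaming_chunks`, and only a fully assembled response reaches `_sync_set_cache`. A toy sketch of that accumulate-then-store pattern (the assembler and dict cache here are stand-ins, not litellm internals):

# Sketch (stand-ins only): accumulate streaming chunks, then store a single
# assembled response in the cache once the stream has finished.
from typing import Any, Dict, List, Optional


class ToyStreamingCacheHandler:
    def __init__(self) -> None:
        self.sync_streaming_chunks: List[str] = []
        self.cache: Dict[str, Any] = {}

    def add_chunk(self, chunk: str, is_last: bool) -> None:
        self.sync_streaming_chunks.append(chunk)
        complete: Optional[str] = (
            "".join(self.sync_streaming_chunks) if is_last else None
        )
        # only add to cache once we have a complete streaming response
        if complete is not None:
            self.cache["last_response"] = complete


handler = ToyStreamingCacheHandler()
for i, chunk in enumerate(["Hel", "lo"]):
    handler.add_chunk(chunk, is_last=(i == 1))
print(handler.cache)  # {'last_response': 'Hello'}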
@@ -938,19 +938,6 @@ class Logging:
         else:
             callbacks = litellm.success_callback

-        ## STREAMING CACHING ##
-        if "cache" in callbacks and litellm.cache is not None:
-            # this only logs streaming once, complete_streaming_response exists i.e when stream ends
-            print_verbose("success_callback: reaches cache for logging!")
-            kwargs = self.model_call_details
-            if self.stream and _caching_complete_streaming_response is not None:
-                print_verbose(
-                    "success_callback: reaches cache for logging, there is a complete_streaming_response. Adding to cache"
-                )
-                result = _caching_complete_streaming_response
-                # only add to cache once we have a complete streaming response
-                litellm.cache.add_cache(result, **kwargs)
-
         ## REDACT MESSAGES ##
         result = redact_message_input_output_from_logging(
             model_call_details=(
@@ -765,7 +765,9 @@ def client(original_function):
         print_args_passed_to_litellm(original_function, args, kwargs)
         start_time = datetime.datetime.now()
         result = None
-        logging_obj = kwargs.get("litellm_logging_obj", None)
+        logging_obj: Optional[LiteLLMLoggingObject] = kwargs.get(
+            "litellm_logging_obj", None
+        )

         # only set litellm_call_id if its not in kwargs
         call_type = original_function.__name__
@@ -787,6 +789,12 @@ def client(original_function):
                 original_function.__name__, rules_obj, start_time, *args, **kwargs
             )
         kwargs["litellm_logging_obj"] = logging_obj
+        _llm_caching_handler: LLMCachingHandler = LLMCachingHandler(
+            original_function=original_function,
+            request_kwargs=kwargs,
+            start_time=start_time,
+        )
+        logging_obj._llm_caching_handler = _llm_caching_handler

         # CHECK FOR 'os.environ/' in kwargs
         for k, v in kwargs.items():
@@ -1013,12 +1021,11 @@ def client(original_function):
                 )

             # [OPTIONAL] ADD TO CACHE
-            if (
-                litellm.cache is not None
-                and litellm.cache.supported_call_types is not None
-                and call_type in litellm.cache.supported_call_types
-            ) and (kwargs.get("cache", {}).get("no-store", False) is not True):
-                litellm.cache.add_cache(result, *args, **kwargs)
+            _llm_caching_handler._sync_set_cache(
+                result=result,
+                args=args,
+                kwargs=kwargs,
+            )

             # LOG SUCCESS - handle streaming success logging in the _next_ object, remove `handle_success` once it's deprecated
             verbose_logger.info("Wrapper: Completed Call, calling success_handler")
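From a caller's perspective the sync wrapper change above is meant to be behavior-preserving: with a cache configured, results are stored via `_sync_set_cache` unless the request opts out. A usage sketch, assuming litellm's documented `litellm.Cache()` setup; the model name is a placeholder and a provider API key is assumed to be set in the environment:

# Usage sketch (assumptions noted above); storage still respects the supported
# call types and no-store checks shown in the earlier hunks.
import litellm

litellm.cache = litellm.Cache()  # in-memory cache by default

messages = [{"role": "user", "content": "Hello"}]

# stored in the cache (subject to the gate)
resp = litellm.completion(model="gpt-4o-mini", messages=messages)

# same call, but the result is not stored
resp_no_store = litellm.completion(
    model="gpt-4o-mini", messages=messages, cache={"no-store": True}
)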
@@ -7886,7 +7893,10 @@ class CustomStreamWrapper:
         """
         self.logging_loop = loop

-    def run_success_logging_in_thread(self, processed_chunk, cache_hit: bool):
+    def run_success_logging_and_cache_storage(self, processed_chunk, cache_hit: bool):
+        """
+        Runs success logging in a thread and adds the response to the cache
+        """
         if litellm.disable_streaming_logging is True:
             """
             [NOT RECOMMENDED]
@@ -7914,6 +7924,12 @@ class CustomStreamWrapper:
         ## SYNC LOGGING
         self.logging_obj.success_handler(processed_chunk, None, None, cache_hit)

+        ## Sync store in cache
+        if self.logging_obj._llm_caching_handler is not None:
+            self.logging_obj._llm_caching_handler._sync_add_streaming_response_to_cache(
+                processed_chunk
+            )
+
     def finish_reason_handler(self):
         model_response = self.model_response_creator()
         if self.received_finish_reason is not None:
@@ -7960,7 +7976,7 @@ class CustomStreamWrapper:
                     continue
                 ## LOGGING
                 threading.Thread(
-                    target=self.run_success_logging_in_thread,
+                    target=self.run_success_logging_and_cache_storage,
                     args=(response, cache_hit),
                 ).start() # log response
                 choice = response.choices[0]
@@ -8028,7 +8044,7 @@ class CustomStreamWrapper:
                 processed_chunk._hidden_params["usage"] = usage
                 ## LOGGING
                 threading.Thread(
-                    target=self.run_success_logging_in_thread,
+                    target=self.run_success_logging_and_cache_storage,
                     args=(processed_chunk, cache_hit),
                 ).start() # log response
                 return processed_chunk