forked from phoenix/litellm-mirror
(feat) async completion caching
parent 67c730e264
commit d18d5a3133
1 changed file with 82 additions and 1 deletion
@@ -1056,6 +1056,19 @@ class Logging:
         start_time, end_time, result = self._success_handler_helper_fn(start_time=start_time, end_time=end_time, result=result)
         for callback in litellm._async_success_callback:
             try:
+                if callback == "cache":
+                    # set_cache once complete streaming response is built
+                    print_verbose("async success_callback: reaches cache for logging!")
+                    kwargs = self.model_call_details
+                    if self.stream:
+                        if "complete_streaming_response" not in kwargs:
+                            print_verbose(f"async success_callback: reaches cache for logging, there is no complete_streaming_response. Kwargs={kwargs}\n\n")
+                            return
+                        else:
+                            print_verbose("async success_callback: reaches cache for logging, there is a complete_streaming_response. Adding to cache")
+                            result = kwargs["complete_streaming_response"]
+                            # only add to cache once we have a complete streaming response
+                            litellm.cache.add_cache(result, **kwargs)
                 if isinstance(callback, CustomLogger): # custom logger class
                     print_verbose(f"Async success callbacks: CustomLogger")
                     if self.stream:
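
Note: the "cache" branch above only writes to the cache for streamed calls once the wrapper has stitched the chunks into complete_streaming_response inside self.model_call_details. A minimal standalone sketch of that control flow, where FakeCache and its key derivation are illustrative stand-ins rather than litellm's actual cache class:

# Standalone sketch (not litellm's real classes): the "cache" callback only
# stores a streamed call after the complete response has been assembled.
import asyncio

class FakeCache:
    """Illustrative stand-in for litellm's cache object."""
    def __init__(self):
        self.store = {}

    def add_cache(self, result, **kwargs):
        # Hypothetical key; litellm derives its cache key from the call params.
        self.store[repr(kwargs.get("messages"))] = result

async def async_success_handler(callbacks, cache, model_call_details, stream):
    for callback in callbacks:
        if callback == "cache":
            if stream and "complete_streaming_response" not in model_call_details:
                return  # mid-stream: nothing complete to cache yet
            cache.add_cache(model_call_details["complete_streaming_response"],
                            **model_call_details)

async def main():
    details = {
        "messages": [{"role": "user", "content": "hi"}],
        "complete_streaming_response": {
            "choices": [{"message": {"role": "assistant", "content": "hello"}}]
        },
    }
    cache = FakeCache()
    await async_success_handler(["cache"], cache, details, stream=True)
    print(cache.store)

asyncio.run(main())
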
@@ -1599,7 +1612,12 @@ def client(original_function):
                print_verbose(f"Cache Hit!")
                call_type = original_function.__name__
                if call_type == CallTypes.acompletion.value and isinstance(cached_result, dict):
-                    return convert_to_model_response_object(response_object=cached_result, model_response_object=ModelResponse())
+                    if kwargs.get("stream", False) == True:
+                        return convert_to_streaming_response_async(
+                            response_object=cached_result,
+                        )
+                    else:
+                        return convert_to_model_response_object(response_object=cached_result, model_response_object=ModelResponse())
                else:
                    return cached_result
            # MODEL CALL
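
Note: with this change a cache hit for acompletion can hand back either a plain response object (non-streaming) or an async generator of chunks (when stream=True). A rough consumer-side sketch of handling both shapes, using inspect.isasyncgen and stand-in data rather than litellm types:

import asyncio
import inspect

async def consume(result):
    # A streaming cache hit is an async generator; iterate it chunk by chunk.
    if inspect.isasyncgen(result):
        async for chunk in result:
            print("chunk:", chunk)
    else:
        # A non-streaming cache hit is a fully formed response object.
        print("response:", result)

async def fake_cached_stream():
    # Stand-in for convert_to_streaming_response_async(response_object=...)
    yield {"choices": [{"delta": {"role": "assistant", "content": "hello"}}]}

asyncio.run(consume(fake_cached_stream()))
asyncio.run(consume({"choices": [{"message": {"content": "hello"}}]}))
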
@@ -3494,6 +3512,69 @@ def handle_failure(exception, traceback_exception, start_time, end_time, args, k
         exception_logging(logger_fn=user_logger_fn, exception=e)
         pass
 
+async def convert_to_streaming_response_async(response_object: Optional[dict]=None):
+    """
+    Asynchronously converts a response object to a streaming response.
+
+    Args:
+        response_object (Optional[dict]): The response object to be converted. Defaults to None.
+
+    Raises:
+        Exception: If the response object is None.
+
+    Yields:
+        ModelResponse: The converted streaming response object.
+
+    Returns:
+        None
+    """
+    if response_object is None:
+        raise Exception("Error in response object format")
+
+    model_response_object = ModelResponse(stream=True)
+
+    if model_response_object is None:
+        raise Exception("Error in response creating model response object")
+
+    choice_list = []
+
+    for idx, choice in enumerate(response_object["choices"]):
+        delta = Delta(
+            content=choice["message"].get("content", None),
+            role=choice["message"]["role"],
+            function_call=choice["message"].get("function_call", None),
+            tool_calls=choice["message"].get("tool_calls", None)
+        )
+        finish_reason = choice.get("finish_reason", None)
+
+        if finish_reason is None:
+            finish_reason = choice.get("finish_details")
+
+        choice = StreamingChoices(finish_reason=finish_reason, index=idx, delta=delta)
+        choice_list.append(choice)
+
+    model_response_object.choices = choice_list
+
+    if "usage" in response_object and response_object["usage"] is not None:
+        model_response_object.usage = Usage(
+            completion_tokens=response_object["usage"].get("completion_tokens", 0),
+            prompt_tokens=response_object["usage"].get("prompt_tokens", 0),
+            total_tokens=response_object["usage"].get("total_tokens", 0)
+        )
+
+    if "id" in response_object:
+        model_response_object.id = response_object["id"]
+
+    if "system_fingerprint" in response_object:
+        model_response_object.system_fingerprint = response_object["system_fingerprint"]
+
+    if "model" in response_object:
+        model_response_object.model = response_object["model"]
+
+    yield model_response_object
+    await asyncio.sleep(0)
+
+
 def convert_to_streaming_response(response_object: Optional[dict]=None):
     # used for yielding Cache hits when stream == True
     if response_object is None:
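
Note: the new helper takes an OpenAI-style chat completion dict (the shape the cache stores) and yields a single streaming-flavoured ModelResponse built from StreamingChoices/Delta. A usage sketch, assuming the function is importable as litellm.utils.convert_to_streaming_response_async (adjust the import to wherever the diffed file lives); the sample payload is illustrative:

import asyncio
from litellm.utils import convert_to_streaming_response_async  # assumed import path

cached_result = {
    "id": "chatcmpl-123",
    "model": "gpt-3.5-turbo",
    "choices": [
        {
            "index": 0,
            "finish_reason": "stop",
            "message": {"role": "assistant", "content": "Hello from the cache!"},
        }
    ],
    "usage": {"prompt_tokens": 5, "completion_tokens": 4, "total_tokens": 9},
}

async def main():
    # The generator yields one ModelResponse(stream=True) built from the dict.
    async for chunk in convert_to_streaming_response_async(response_object=cached_result):
        print(chunk)

asyncio.run(main())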