diff --git a/litellm/utils.py b/litellm/utils.py
index 7bef853fe..30e6ec69c 100644
--- a/litellm/utils.py
+++ b/litellm/utils.py
@@ -2996,7 +2996,7 @@ def client(original_function):
                 )
             ):  # allow users to control returning cached responses from the completion function
                 # checking cache
-                print_verbose(f"INSIDE CHECKING CACHE")
+                print_verbose("INSIDE CHECKING CACHE")
                 if (
                     litellm.cache is not None
                     and str(original_function.__name__)
@@ -3103,6 +3103,22 @@ def client(original_function):
                                     response_object=cached_result,
                                     model_response_object=ModelResponse(),
                                 )
+                        if (
+                            call_type == CallTypes.atext_completion.value
+                            and isinstance(cached_result, dict)
+                        ):
+                            if kwargs.get("stream", False) == True:
+                                cached_result = convert_to_streaming_response_async(
+                                    response_object=cached_result,
+                                )
+                                cached_result = CustomStreamWrapper(
+                                    completion_stream=cached_result,
+                                    model=model,
+                                    custom_llm_provider="cached_response",
+                                    logging_obj=logging_obj,
+                                )
+                            else:
+                                cached_result = TextCompletionResponse(**cached_result)
                         elif call_type == CallTypes.aembedding.value and isinstance(
                             cached_result, dict
                         ):
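
For context, a minimal usage sketch (not part of the diff) of the code path the second hunk adds: an async text-completion call answered from the cache, which should now come back as a TextCompletionResponse rebuilt from the cached dict (or a CustomStreamWrapper when stream=True). The model name and the per-request caching=True flag are illustrative assumptions about the caller's setup, not something this patch introduces.

# sketch only -- assumes litellm.Cache() and per-request caching=True behave as elsewhere in litellm
import asyncio
import litellm

litellm.cache = litellm.Cache()  # local in-memory cache

async def main():
    prompt = "Say this is a test."
    # First call populates the cache; the repeated call should take the new
    # atext_completion branch in wrapper_async and return a TextCompletionResponse
    # reconstructed from the cached dict instead of re-calling the provider.
    first = await litellm.atext_completion(
        model="gpt-3.5-turbo-instruct", prompt=prompt, caching=True
    )
    second = await litellm.atext_completion(
        model="gpt-3.5-turbo-instruct", prompt=prompt, caching=True
    )
    print(type(second))  # expected on a cache hit: TextCompletionResponse

asyncio.run(main())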