working write to cache with streaming

2023-08-28 18:46:25 -07:00 · 2023-08-28 18:46:25 -07:00 · 8af6d967eb
commit 8af6d967eb
parent 2ff82873e6
4 changed files with 84 additions and 46 deletions
--- a/litellm/caching.py
+++ b/litellm/caching.py
@ -1,3 +1,4 @@
+import litellm
 def get_prompt(*args, **kwargs):
    # make this safe checks, it should not throw any exceptions
    if len(args) > 1:
@ -29,8 +30,9 @@ class InMemoryCache():
        self.cache_dict = {}

    def set_cache(self, key, value):
-        #print("in set cache for inmem")
+        print("in set cache for inmem")
        self.cache_dict[key] = value
+        print(self.cache_dict)

    def get_cache(self, key):
        #print("in get cache for inmem")
@ -46,6 +48,8 @@ class Cache():
            self.cache = RedisCache(type, host, port, password)
        if type == "local":
            self.cache = InMemoryCache()
+        litellm.input_callback.append("cache")
+        litellm.success_callback.append("cache")

    def get_cache_key(self, *args, **kwargs):
        prompt = get_prompt(*args, **kwargs)
@ -71,8 +75,11 @@ class Cache():

    def add_cache(self, result, *args, **kwargs):
        try:
+            # print("adding to cache", result)
            cache_key = self.get_cache_key(*args, **kwargs)
+            # print(cache_key)
            if cache_key is not None:
+                # print("adding to cache", cache_key, result)
                self.cache.set_cache(cache_key, result)
        except:
            pass
--- a/litellm/main.py
+++ b/litellm/main.py
@ -196,7 +196,7 @@ def completion(
                    engine=model, messages=messages, **optional_params
                )
            if "stream" in optional_params and optional_params["stream"] == True:
-                response = CustomStreamWrapper(response, model)
+                response = CustomStreamWrapper(response, model, logging_obj=logging)
                return response
            ## LOGGING
            logging.post_call(
@ -254,7 +254,7 @@ def completion(
                    model=model, messages=messages, **optional_params
                )
            if "stream" in optional_params and optional_params["stream"] == True:
-                response = CustomStreamWrapper(response, model)
+                response = CustomStreamWrapper(response, model, logging_obj=logging)
                return response
            ## LOGGING
            logging.post_call(
--- a/litellm/tests/test_caching.py
+++ b/litellm/tests/test_caching.py
@ -153,14 +153,18 @@ def test_embedding_caching():


 # # test caching with streaming
-# messages = [{"role": "user", "content": "draft a 2 pg legal document on applying to litellm"}]
+# messages = [{"role": "user", "content": "hello gm who are u"}]
 # def test_caching_v2_stream():
 #     try:
 #         litellm.cache = Cache()
+#         # litellm.token="ishaan@berri.ai"
 #         response1 = completion(model="gpt-3.5-turbo", messages=messages, stream=True)
 #         for chunk in response1:
-#             print(chunk)
-#             response1_id = chunk['id']
+#             # 
+#             pass
+#             # print("chunk")
+#             pass
+#             # response1_id = chunk['id']

 #         # response2 = completion(model="gpt-3.5-turbo", messages=messages, stream=True)
 #         # for chunk in response2:
--- a/litellm/utils.py
+++ b/litellm/utils.py
@ -70,7 +70,7 @@ last_fetched_at_keys = None


 class Message(OpenAIObject):
-    def __init__(self, content="default", role="assistant", **params):
+    def __init__(self, content=" ", role="assistant", **params):
        super(Message, self).__init__(**params)
        self.content = content
        self.role = role
@ -285,6 +285,26 @@ class Logging:
                            call_type = self.call_type, 
                            stream = self.stream
                        )
+                    if callback == "cache":
+                        try:
+                            #print("in cache callback2", self.stream)
+                            #print(original_response)
+                            #print(self.model_call_details)
+                            
+                            if litellm.cache != None:
+                                if self.litellm_params["stream_response"] == None:
+                                    self.litellm_params["stream_response"] = ModelResponse()
+                                else:
+                                    #self.litellm_call_id["stream_response"]["id"] = self.litellm_params["litellm_call_id"]
+                                    self.litellm_params["stream_response"]["choices"][0]["message"]["content"] += original_response
+                                #print("cache is not none")
+                                # convert original_response to format of Model Object
+                                # Set the model
+                                litellm.cache.add_cache(self.litellm_params["stream_response"], **self.model_call_details)
+                            #print(self.litellm_params["stream_response"])
+                        except Exception as e:
+                            print("got exception")
+                            print(e)
                except:
                    print_verbose(
                        f"LiteLLM.LoggingError: [Non-Blocking] Exception occurred while post-call logging with integrations {traceback.format_exc()}"
@ -624,6 +644,7 @@ def get_litellm_params(
        "custom_api_base": custom_api_base,
        "litellm_call_id": litellm_call_id,
        "model_alias_map": model_alias_map,
+        "stream_response": None
    }

    return litellm_params
@ -1577,6 +1598,9 @@ class CustomStreamWrapper:
        except:
            raise ValueError(f"Unable to parse response. Original response: {chunk}")

+    def handle_openai_chat_completion_chunk(self, chunk):
+        return chunk["choices"][0]["delta"]["content"]
+
    def handle_baseten_chunk(self, chunk):
        chunk = chunk.decode("utf-8")
        data_json = json.loads(chunk)
@ -1593,6 +1617,7 @@ class CustomStreamWrapper:
            return ""

    def __next__(self):
+        try:
            completion_obj = {"role": "assistant", "content": ""}
            if self.model in litellm.anthropic_models:
                chunk = next(self.completion_stream)
@ -1625,12 +1650,14 @@ class CustomStreamWrapper:
                completion_obj["content"] = self.handle_openai_text_completion_chunk(chunk)
            else: # openai chat/azure models
                chunk = next(self.completion_stream)
-            return chunk
+                completion_obj["content"] = self.handle_openai_chat_completion_chunk(chunk)

            # LOGGING
            self.logging_obj.post_call(completion_obj["content"])
            # return this for all models
            return {"choices": [{"delta": completion_obj}]}
+        except:
+            raise StopIteration


 ########## Reading Config File ############################