forked from phoenix/litellm-mirror
fix anthropic and together ai streaming
parent 93e897da48
commit 9155ba068f
5 changed files with 105 additions and 28 deletions
Binary file not shown.
Binary file not shown.
@@ -540,16 +540,9 @@ def completion(
         ## LOGGING
         logging.pre_call(input=prompt, api_key=TOGETHER_AI_TOKEN)
-        if stream == True:
-            return together_ai_completion_streaming(
-                {
-                    "model": model,
-                    "prompt": prompt,
-                    "request_type": "language-model-inference",
-                    **optional_params,
-                },
-                headers=headers,
-            )
+
+        print(f"TOGETHER_AI_TOKEN: {TOGETHER_AI_TOKEN}")
+
 
         res = requests.post(
             endpoint,
             json={
@@ -560,6 +553,12 @@ def completion(
             },
             headers=headers,
         )
+
+        if "stream_tokens" in optional_params and optional_params["stream_tokens"] == True:
+            response = CustomStreamWrapper(
+                res.iter_lines(), model, custom_llm_provider="together_ai"
+            )
+            return response
         ## LOGGING
         logging.post_call(
             input=prompt, api_key=TOGETHER_AI_TOKEN, original_response=res.text
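Taken together, the two hunks above drop the dedicated together_ai_completion_streaming() helper: the request is always issued with requests.post, and when the caller asked for streaming (stream_tokens), the raw response is handed line by line to the generic CustomStreamWrapper. A minimal sketch of how such a wrapper can turn Together AI's server-sent-event lines into the OpenAI-style delta chunks consumed in the tests below — the SSE payload shape and the helper name together_ai_chunks are illustrative assumptions, not litellm's exact implementation:

import json

def together_ai_chunks(byte_lines):
    # byte_lines is typically res.iter_lines() from a streaming requests.post
    for line in byte_lines:
        if not line:
            continue  # skip SSE keep-alive blank lines
        decoded = line.decode("utf-8")
        if not decoded.startswith("data:"):
            continue
        payload = decoded[len("data:"):].strip()
        if payload == "[DONE]":
            break
        data = json.loads(payload)
        # assumed Together AI shape: new text under choices[0]["text"];
        # re-shape it into the OpenAI-style delta format the callers expect
        yield {"choices": [{"delta": {"content": data["choices"][0]["text"]}}]}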
@@ -9,13 +9,14 @@ sys.path.insert(
 )  # Adds the parent directory to the system path
 import litellm
 from litellm import completion
-litellm.logging = True
-litellm.set_verbose = True
+litellm.logging = False
+litellm.set_verbose = False
 
 score = 0
 
 
 def logger_fn(model_call_object: dict):
+    return
     print(f"model call details: {model_call_object}")
@@ -81,17 +82,91 @@ except:
 
 # # test on huggingface completion call
 # try:
 #     start_time = time.time()
 #     response = completion(
-#         model="gpt-3.5-turbo", messages=messages, stream=True, logger_fn=logger_fn
+#         model="meta-llama/Llama-2-7b-chat-hf",
+#         messages=messages,
+#         custom_llm_provider="huggingface",
+#         custom_api_base="https://s7c7gytn18vnu4tw.us-east-1.aws.endpoints.huggingface.cloud",
+#         stream=True,
+#         logger_fn=logger_fn,
 #     )
+#     complete_response = ""
 #     for chunk in response:
 #         chunk_time = time.time()
 #         print(f"time since initial request: {chunk_time - start_time:.2f}")
 #         print(chunk["choices"][0]["delta"])
-#         score += 1
+#         complete_response += chunk["choices"][0]["delta"]["content"] if len(chunk["choices"][0]["delta"].keys()) > 0 else ""
+#     if complete_response == "":
+#         raise Exception("Empty response received")
 # except:
 #     print(f"error occurred: {traceback.format_exc()}")
 #     pass
 
+# test on together ai completion call
+try:
+    start_time = time.time()
+    response = completion(
+        model="Replit-Code-3B", messages=messages, logger_fn=logger_fn, stream=True
+    )
+    complete_response = ""
+    print(f"returned response object: {response}")
+    for chunk in response:
+        chunk_time = time.time()
+        print(f"time since initial request: {chunk_time - start_time:.2f}")
+        print(chunk["choices"][0]["delta"])
+        complete_response += chunk["choices"][0]["delta"]["content"] if len(chunk["choices"][0]["delta"].keys()) > 0 else ""
+    if complete_response == "":
+        raise Exception("Empty response received")
+except:
+    print(f"error occurred: {traceback.format_exc()}")
+    pass
+
+
+# # test on azure completion call
+# try:
+#     response = completion(
+#         model="azure/chatgpt-test", messages=messages, stream=True, logger_fn=logger_fn
+#     )
+#     response = ""
+#     for chunk in response:
+#         chunk_time = time.time()
+#         print(f"time since initial request: {chunk_time - start_time:.2f}")
+#         print(chunk["choices"][0]["delta"])
+#         response += chunk["choices"][0]["delta"]
+#     if response == "":
+#         raise Exception("Empty response received")
+# except:
+#     print(f"error occurred: {traceback.format_exc()}")
+#     pass
+
+
+# # test on anthropic completion call
+# try:
+#     response = completion(
+#         model="claude-instant-1", messages=messages, stream=True, logger_fn=logger_fn
+#     )
+#     response = ""
+#     for chunk in response:
+#         chunk_time = time.time()
+#         print(f"time since initial request: {chunk_time - start_time:.2f}")
+#         print(chunk["choices"][0]["delta"])
+#         response += chunk["choices"][0]["delta"]
+#     if response == "":
+#         raise Exception("Empty response received")
+# except:
+#     print(f"error occurred: {traceback.format_exc()}")
+#     pass
+
+
+# # # test on huggingface completion call
+# # try:
+# #     response = completion(
+# #         model="meta-llama/Llama-2-7b-chat-hf",
+# #         messages=messages,
+# #         custom_llm_provider="huggingface",
+# #         custom_api_base="https://s7c7gytn18vnu4tw.us-east-1.aws.endpoints.huggingface.cloud",
+# #         stream=True,
+# #         logger_fn=logger_fn,
+# #     )
+# #     for chunk in response:
+# #         print(chunk["choices"][0]["delta"])
+# #         score += 1
+# # except:
+# #     print(f"error occurred: {traceback.format_exc()}")
+# #     pass
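The live together_ai test above guards the delta access with a conditional expression because the final streamed chunk usually arrives with an empty delta. The same accumulation written as a small helper with dict.get — an illustrative refactor, not part of this commit:

def accumulate_deltas(chunks):
    # collect streamed content; tolerate the empty delta on the final chunk
    complete_response = ""
    for chunk in chunks:
        delta = chunk["choices"][0]["delta"]
        complete_response += delta.get("content", "")
    if complete_response == "":
        raise Exception("Empty response received")
    return complete_response

# accumulate_deltas([{"choices": [{"delta": {"content": "Hi"}}]},
#                    {"choices": [{"delta": {}}]}]) returns "Hi"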
@@ -371,6 +371,8 @@ def client(original_function):
             )
             if "logger_fn" in kwargs:
                 user_logger_fn = kwargs["logger_fn"]
+            # LOG SUCCESS
+            crash_reporting(*args, **kwargs)
         except:  # DO NOT BLOCK running the function because of this
             print_verbose(f"[Non-Blocking] {traceback.format_exc()}")
             pass
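The crash_reporting call is added inside function_setup's blanket try/except, so a failure in pre-call bookkeeping can never take down the actual model call. The pattern in isolation — report() is a placeholder for the real reporter:

import traceback

def report(*args, **kwargs):
    pass  # placeholder for the real crash reporter

def function_setup_sketch(*args, **kwargs):
    try:
        report(*args, **kwargs)
    except:  # DO NOT BLOCK running the function because of this
        print(f"[Non-Blocking] {traceback.format_exc()}")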
@@ -444,26 +446,27 @@ def client(original_function):
             function_setup(*args, **kwargs)
             litellm_call_id = str(uuid.uuid4())
             kwargs["litellm_call_id"] = litellm_call_id
-            # [OPTIONAL] CHECK CACHE
             start_time = datetime.datetime.now()
+            # [OPTIONAL] CHECK CACHE
             if (litellm.caching or litellm.caching_with_models) and (
                     cached_result := check_cache(*args, **kwargs)) is not None:
                 result = cached_result
-            else:
-                # MODEL CALL
-                result = original_function(*args, **kwargs)
+                return result
+            # MODEL CALL
+            result = original_function(*args, **kwargs)
+            if "stream" in kwargs and kwargs["stream"] == True:
+                return result
             end_time = datetime.datetime.now()
-            # Add response to CACHE
-            if litellm.caching:
+            # [OPTIONAL] ADD TO CACHE
+            if (litellm.caching or litellm.caching_with_models):
                 add_cache(result, *args, **kwargs)
             # LOG SUCCESS
             crash_reporting(*args, **kwargs)
 
             my_thread = threading.Thread(
                 target=handle_success,
                 args=(args, kwargs, result, start_time,
                       end_time))  # don't interrupt execution of main thread
             my_thread.start()
             # RETURN RESULT
             return result
         except Exception as e:
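This hunk changes two behaviors: a cache hit now returns immediately instead of falling through to the logging path, and a streaming call hands its generator straight back before any timing, caching, or success logging, since consuming the generator here would exhaust it for the caller. That early return is plausibly the anthropic half of the fix, as anthropic streaming also returns a generator from completion(). A self-contained sketch of the resulting control flow; the dict-backed cache and its key argument are assumptions for illustration:

import datetime

_cache = {}

def check_cache(key):
    return _cache.get(key)

def add_cache(key, result):
    _cache[key] = result

def call_with_cache(fn, key, stream=False, **kwargs):
    start_time = datetime.datetime.now()
    # [OPTIONAL] CHECK CACHE: the walrus operator binds the lookup and
    # tests it in a single expression, mirroring the hunk above
    if (cached_result := check_cache(key)) is not None:
        return cached_result
    result = fn(**kwargs)
    if stream:
        # a streaming result is a live generator: return it untouched and
        # skip caching/logging, which would consume the chunks
        return result
    end_time = datetime.datetime.now()  # only meaningful for non-streaming calls
    add_cache(key, result)
    return result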
@@ -1465,7 +1468,7 @@ class CustomStreamWrapper:
         if model in litellm.cohere_models:
             # cohere does not return an iterator, so we need to wrap it in one
             self.completion_stream = iter(completion_stream)
-        elif model == "together_ai":
+        elif custom_llm_provider == "together_ai":
             self.completion_stream = iter(completion_stream)
         else:
             self.completion_stream = completion_stream
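This one-line dispatch fix is why together_ai streams were never wrapped before: model holds a model name such as "Replit-Code-3B" (see the test above), never the literal string "together_ai", so the old elif could not match. Keying on custom_llm_provider matches how completion() now constructs the wrapper. A reduced sketch of the corrected branch; the cohere_models default stands in for litellm.cohere_models:

def wrap_stream(completion_stream, model, custom_llm_provider=None,
                cohere_models=("command-nightly",)):
    if model in cohere_models:
        # cohere does not return an iterator, so wrap it in one
        return iter(completion_stream)
    elif custom_llm_provider == "together_ai":
        # together_ai passes raw response lines; normalize to an iterator too
        return iter(completion_stream)
    else:
        return completion_stream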