forked from phoenix/litellm-mirror
fix anthropic and together ai streaming
parent 93e897da48
commit 9155ba068f
5 changed files with 105 additions and 28 deletions
Binary file not shown.
Binary file not shown.
@@ -540,16 +540,9 @@ def completion(
         ## LOGGING
         logging.pre_call(input=prompt, api_key=TOGETHER_AI_TOKEN)
-        if stream == True:
-            return together_ai_completion_streaming(
-                {
-                    "model": model,
-                    "prompt": prompt,
-                    "request_type": "language-model-inference",
-                    **optional_params,
-                },
-                headers=headers,
-            )
+        print(f"TOGETHER_AI_TOKEN: {TOGETHER_AI_TOKEN}")
         res = requests.post(
             endpoint,
             json={
@@ -560,6 +553,12 @@ def completion(
             },
             headers=headers,
         )
+        if "stream_tokens" in optional_params and optional_params["stream_tokens"] == True:
+            response = CustomStreamWrapper(
+                res.iter_lines(), model, custom_llm_provider="together_ai"
+            )
+            return response
         ## LOGGING
         logging.post_call(
             input=prompt, api_key=TOGETHER_AI_TOKEN, original_response=res.text
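For reference, the caller-side pattern this streaming path serves looks roughly like the sketch below. It is not part of the commit; it simply mirrors the Together AI streaming test added later in this diff, assumes stream=True is mapped through to Together AI's stream_tokens parameter (which the hunk above checks), and uses a placeholder model name and prompt.

from litellm import completion

# Placeholder prompt; Together AI credentials are assumed to be configured the
# same way the rest of the test suite configures them.
messages = [{"role": "user", "content": "write a one-line poem"}]

# stream=True makes completion() hand back a CustomStreamWrapper to iterate over.
response = completion(model="Replit-Code-3B", messages=messages, stream=True)

complete_response = ""
for chunk in response:
    delta = chunk["choices"][0]["delta"]
    # chunks near the end of the stream can carry an empty delta
    complete_response += delta["content"] if len(delta.keys()) > 0 else ""

print(complete_response)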
@@ -9,13 +9,14 @@ sys.path.insert(
 ) # Adds the parent directory to the system path
 import litellm
 from litellm import completion
-litellm.logging = True
-litellm.set_verbose = True
+litellm.logging = False
+litellm.set_verbose = False

 score = 0


 def logger_fn(model_call_object: dict):
+    return
     print(f"model call details: {model_call_object}")
@@ -81,17 +82,91 @@ except:
 # # test on huggingface completion call
 # try:
+#     start_time = time.time()
 #     response = completion(
-#         model="meta-llama/Llama-2-7b-chat-hf",
-#         messages=messages,
-#         custom_llm_provider="huggingface",
-#         custom_api_base="https://s7c7gytn18vnu4tw.us-east-1.aws.endpoints.huggingface.cloud",
-#         stream=True,
-#         logger_fn=logger_fn,
+#         model="gpt-3.5-turbo", messages=messages, stream=True, logger_fn=logger_fn
 #     )
+#     complete_response = ""
 #     for chunk in response:
+#         chunk_time = time.time()
+#         print(f"time since initial request: {chunk_time - start_time:.2f}")
 #         print(chunk["choices"][0]["delta"])
-#         score += 1
+#         complete_response += chunk["choices"][0]["delta"]["content"] if len(chunk["choices"][0]["delta"].keys()) > 0 else ""
+#     if complete_response == "":
+#         raise Exception("Empty response received")
 # except:
 #     print(f"error occurred: {traceback.format_exc()}")
 #     pass

+# test on together ai completion call
+try:
+    start_time = time.time()
+    response = completion(
+        model="Replit-Code-3B", messages=messages, logger_fn=logger_fn, stream= True
+    )
+    complete_response = ""
+    print(f"returned response object: {response}")
+    for chunk in response:
+        chunk_time = time.time()
+        print(f"time since initial request: {chunk_time - start_time:.2f}")
+        print(chunk["choices"][0]["delta"])
+        complete_response += chunk["choices"][0]["delta"]["content"] if len(chunk["choices"][0]["delta"].keys()) > 0 else ""
+    if complete_response == "":
+        raise Exception("Empty response received")
+except:
+    print(f"error occurred: {traceback.format_exc()}")
+    pass
+
+
+# # test on azure completion call
+# try:
+#     response = completion(
+#         model="azure/chatgpt-test", messages=messages, stream=True, logger_fn=logger_fn
+#     )
+#     response = ""
+#     for chunk in response:
+#         chunk_time = time.time()
+#         print(f"time since initial request: {chunk_time - start_time:.2f}")
+#         print(chunk["choices"][0]["delta"])
+#         response += chunk["choices"][0]["delta"]
+#     if response == "":
+#         raise Exception("Empty response received")
+# except:
+#     print(f"error occurred: {traceback.format_exc()}")
+#     pass
+
+
+# # test on anthropic completion call
+# try:
+#     response = completion(
+#         model="claude-instant-1", messages=messages, stream=True, logger_fn=logger_fn
+#     )
+#     response = ""
+#     for chunk in response:
+#         chunk_time = time.time()
+#         print(f"time since initial request: {chunk_time - start_time:.2f}")
+#         print(chunk["choices"][0]["delta"])
+#         response += chunk["choices"][0]["delta"]
+#     if response == "":
+#         raise Exception("Empty response received")
+# except:
+#     print(f"error occurred: {traceback.format_exc()}")
+#     pass
+
+
+# # # test on huggingface completion call
+# # try:
+# #     response = completion(
+# #         model="meta-llama/Llama-2-7b-chat-hf",
+# #         messages=messages,
+# #         custom_llm_provider="huggingface",
+# #         custom_api_base="https://s7c7gytn18vnu4tw.us-east-1.aws.endpoints.huggingface.cloud",
+# #         stream=True,
+# #         logger_fn=logger_fn,
+# #     )
+# #     for chunk in response:
+# #         print(chunk["choices"][0]["delta"])
+# #         score += 1
+# # except:
+# #     print(f"error occurred: {traceback.format_exc()}")
+# #     pass
@@ -371,6 +371,8 @@ def client(original_function):
         )
         if "logger_fn" in kwargs:
             user_logger_fn = kwargs["logger_fn"]
+        # LOG SUCCESS
+        crash_reporting(*args, **kwargs)
     except:  # DO NOT BLOCK running the function because of this
         print_verbose(f"[Non-Blocking] {traceback.format_exc()}")
         pass
@@ -444,26 +446,27 @@ def client(original_function):
             function_setup(*args, **kwargs)
             litellm_call_id = str(uuid.uuid4())
             kwargs["litellm_call_id"] = litellm_call_id
-            # [OPTIONAL] CHECK CACHE
             start_time = datetime.datetime.now()
+            # [OPTIONAL] CHECK CACHE
             if (litellm.caching or litellm.caching_with_models) and (
                     cached_result := check_cache(*args, **kwargs)) is not None:
                 result = cached_result
-            else:
+                return result
             # MODEL CALL
             result = original_function(*args, **kwargs)
+            if "stream" in kwargs and kwargs["stream"] == True:
+                return result
             end_time = datetime.datetime.now()
-            # Add response to CACHE
-            if litellm.caching:
+            # [OPTIONAL] ADD TO CACHE
+            if (litellm.caching or litellm.caching_with_models):
                 add_cache(result, *args, **kwargs)
             # LOG SUCCESS
-            crash_reporting(*args, **kwargs)

             my_thread = threading.Thread(
                 target=handle_success,
                 args=(args, kwargs, result, start_time,
                       end_time))  # don't interrupt execution of main thread
             my_thread.start()
+            # RETURN RESULT
             return result
         except Exception as e:
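The reordering above (cache check after start_time, early returns for cache hits and for streaming calls, success logging kept on a background thread) is easier to see as a stripped-down toy. A minimal, self-contained sketch, with stand-in helpers instead of litellm's real add_cache / handle_success:

import datetime
import threading

def add_cache_stub(result):  # stand-in for add_cache in the diff above
    print(f"cached: {result!r}")

def handle_success_stub(result, start_time, end_time):  # stand-in for handle_success
    elapsed = (end_time - start_time).total_seconds()
    print(f"logged success for {result!r} after {elapsed:.3f}s")

def client(fn):
    def wrapper(*args, **kwargs):
        start_time = datetime.datetime.now()
        # MODEL CALL
        result = fn(*args, **kwargs)
        # streaming results are iterators: hand them back untouched, since timing,
        # caching, or logging here would mean consuming the stream before the caller
        if kwargs.get("stream") == True:
            return result
        end_time = datetime.datetime.now()
        add_cache_stub(result)
        # LOG SUCCESS on a separate thread so the caller is not blocked
        threading.Thread(
            target=handle_success_stub, args=(result, start_time, end_time)
        ).start()
        return result
    return wrapper

@client
def completion(prompt, stream=False):
    if stream:
        return (token for token in prompt.split())  # pretend token stream
    return prompt.upper()

print(completion("hello world"))                      # cached + logged
print(list(completion("hello world", stream=True)))   # handed straight back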
@@ -1465,7 +1468,7 @@ class CustomStreamWrapper:
         if model in litellm.cohere_models:
             # cohere does not return an iterator, so we need to wrap it in one
             self.completion_stream = iter(completion_stream)
-        elif model == "together_ai":
+        elif custom_llm_provider == "together_ai":
             self.completion_stream = iter(completion_stream)
         else:
             self.completion_stream = completion_stream
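The one-line change above is the together_ai half of the fix: the wrapper used to branch on the model string, which for a Together AI call is typically the model's own name and so never equals "together_ai". A tiny illustration (the values are examples, not pinned by the code):

model = "Replit-Code-3B"             # what the caller passes for a Together AI model
custom_llm_provider = "together_ai"  # what the completion path passes to CustomStreamWrapper

print(model == "together_ai")                # False -> the old branch never took the iter() path
print(custom_llm_provider == "together_ai")  # True  -> the new branch wraps the stream correctly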