fix(utils.py): fix azure streaming bug

2023-12-04 12:38:15 -08:00 · 2023-12-04 12:38:15 -08:00 · 728b879c33
commit 728b879c33
parent 90c13d39ac
4 changed files with 91 additions and 7 deletions
--- a/litellm/tests/test_config.yaml
+++ b/litellm/tests/test_config.yaml
@ -1,3 +1,20 @@
+model_list: 
+  - model_name: "azure-model"
+    litellm_params: 
+      model: "azure/gpt-35-turbo"
+      api_key: "os.environ/AZURE_EUROPE_API_KEY"
+      api_base: "https://my-endpoint-europe-berri-992.openai.azure.com/"
+  - model_name: "azure-model"
+    litellm_params:
+      model: "azure/gpt-35-turbo"
+      api_key: "os.environ/AZURE_CANADA_API_KEY"
+      api_base: "https://my-endpoint-canada-berri992.openai.azure.com"
+  - model_name: "azure-model"
+    litellm_params:
+      model: "azure/gpt-turbo"
+      api_key: "os.environ/AZURE-FRANCE-API-KEY"
+      api_base: "https://openai-france-1234.openai.azure.com"
+
 litellm_settings:
  drop_params: True
  set_verbose: True
--- a/litellm/tests/test_proxy_server_cost.py
+++ b/litellm/tests/test_proxy_server_cost.py
@ -0,0 +1,27 @@
+# #### What this tests ####
+# #    This tests the cost tracking function works with consecutive calls (~10 consecutive calls)
+
+# import sys, os
+# import traceback
+# import pytest
+# sys.path.insert(
+#     0, os.path.abspath("../..")
+# )  # Adds the parent directory to the system path
+# import litellm
+
+# async def test_proxy_cost_tracking(): 
+#     """
+#     Get expected cost. 
+#     Create new key.
+#     Run 10 parallel calls. 
+#     Check cost for key at the end. 
+#     assert it's = expected cost. 
+#     """
+#     model = "gpt-3.5-turbo"
+#     messages = [{"role": "user", "content": "Hey, how's it going?"}]
+#     number_of_calls = 10
+#     expected_cost = litellm.completion_cost(model=model, messages=messages) * number_of_calls
+#     async def litellm_acompletion(): 
+
+
+
--- a/litellm/tests/test_stream_chunk_builder.py
+++ b/litellm/tests/test_stream_chunk_builder.py
@ -110,4 +110,26 @@ def test_stream_chunk_builder_litellm_tool_call():
    except Exception as e: 
       pytest.fail(f"An exception occurred - {str(e)}")

-test_stream_chunk_builder_litellm_tool_call()
+# test_stream_chunk_builder_litellm_tool_call()
+
+def test_stream_chunk_builder_litellm_tool_call_regular_message():
+    try: 
+      messages = [{"role": "user", "content": "Hey, how's it going?"}]
+      litellm.set_verbose = False
+      response = litellm.completion(
+          model="azure/gpt-4-nov-release",
+          messages=messages,
+          tools=tools_schema,
+          stream=True,
+          api_key="os.environ/AZURE_FRANCE_API_KEY",
+          api_base="https://openai-france-1234.openai.azure.com",
+          complete_response = True
+      )
+
+      print(f"complete response: {response}")
+      print(f"complete response usage: {response.usage}")
+      assert response.system_fingerprint is not None
+    except Exception as e: 
+       pytest.fail(f"An exception occurred - {str(e)}")
+
+test_stream_chunk_builder_litellm_tool_call_regular_message()
--- a/litellm/utils.py
+++ b/litellm/utils.py
@ -5266,9 +5266,27 @@ class CustomStreamWrapper:
            print_verbose(f"model_response: {model_response}; completion_obj: {completion_obj}")
            print_verbose(f"model_response finish reason 3: {model_response.choices[0].finish_reason}")
            if len(completion_obj["content"]) > 0: # cannot set content of an OpenAI Object to be an empty string
-                hold, model_response_str = self.check_special_tokens(chunk=completion_obj["content"], finish_reason=model_response.choices[0].finish_reason)
+                hold, model_response_str = self.check_special_tokens(chunk=completion_obj["content"], finish_reason=model_response.choices[0].finish_reason) # filter out bos/eos tokens from openai-compatible hf endpoints
                print_verbose(f"hold - {hold}, model_response_str - {model_response_str}")
                if hold is False: 
+                    ## check if openai/azure chunk 
+                    original_chunk = response_obj.get("original_chunk", None)
+                    if original_chunk: 
+                        model_response.id = original_chunk.id
+                        if len(original_chunk.choices) > 0:
+                            try:
+                                delta = dict(original_chunk.choices[0].delta)
+                                model_response.choices[0].delta = Delta(**delta)
+                            except Exception as e:
+                                model_response.choices[0].delta = Delta()
+                        else: 
+                            return 
+                        model_response.system_fingerprint = original_chunk.system_fingerprint
+                        if self.sent_first_chunk == False:
+                            model_response.choices[0].delta["role"] = "assistant"
+                            self.sent_first_chunk = True
+                    else: 
+                        ## else 
                        completion_obj["content"] = model_response_str  
                        if self.sent_first_chunk == False:
                            completion_obj["role"] = "assistant"