forked from phoenix/litellm-mirror
fix(utils.py): fix azure streaming bug
This commit is contained in:
parent 90c13d39ac
commit 728b879c33
4 changed files with 91 additions and 7 deletions
@@ -1,3 +1,20 @@
+model_list:
+  - model_name: "azure-model"
+    litellm_params:
+      model: "azure/gpt-35-turbo"
+      api_key: "os.environ/AZURE_EUROPE_API_KEY"
+      api_base: "https://my-endpoint-europe-berri-992.openai.azure.com/"
+  - model_name: "azure-model"
+    litellm_params:
+      model: "azure/gpt-35-turbo"
+      api_key: "os.environ/AZURE_CANADA_API_KEY"
+      api_base: "https://my-endpoint-canada-berri992.openai.azure.com"
+  - model_name: "azure-model"
+    litellm_params:
+      model: "azure/gpt-turbo"
+      api_key: "os.environ/AZURE-FRANCE-API-KEY"
+      api_base: "https://openai-france-1234.openai.azure.com"
+
 litellm_settings:
   drop_params: True
   set_verbose: True
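For context, the config hunk above registers three Azure deployments under the single logical name "azure-model"; the `os.environ/...` values tell the proxy to read each key from the environment at load time. A minimal sketch of the equivalent in-code setup with litellm's Router, which load-balances across deployments that share a model_name (the two deployments and the direct `os.environ` reads here are illustrative, not part of this commit):

```python
# Sketch only: how a model_list like the YAML above is consumed in code.
# The Router usage and direct os.environ reads are illustrative; this commit
# only touches the YAML config.
import os
from litellm import Router

model_list = [
    {
        "model_name": "azure-model",
        "litellm_params": {
            "model": "azure/gpt-35-turbo",
            "api_key": os.environ.get("AZURE_EUROPE_API_KEY"),
            "api_base": "https://my-endpoint-europe-berri-992.openai.azure.com/",
        },
    },
    {
        "model_name": "azure-model",
        "litellm_params": {
            "model": "azure/gpt-35-turbo",
            "api_key": os.environ.get("AZURE_CANADA_API_KEY"),
            "api_base": "https://my-endpoint-canada-berri992.openai.azure.com",
        },
    },
]

router = Router(model_list=model_list)

# Both deployments share the logical name "azure-model", so the router
# picks one of them for this call.
response = router.completion(
    model="azure-model",
    messages=[{"role": "user", "content": "Hey, how's it going?"}],
)
print(response)
```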
litellm/tests/test_proxy_server_cost.py (new file, +27)

@@ -0,0 +1,27 @@
+# #### What this tests ####
+# # This tests the cost tracking function works with consecutive calls (~10 consecutive calls)
+
+# import sys, os
+# import traceback
+# import pytest
+# sys.path.insert(
+#     0, os.path.abspath("../..")
+# )  # Adds the parent directory to the system path
+# import litellm
+
+# async def test_proxy_cost_tracking():
+#     """
+#     Get expected cost.
+#     Create new key.
+#     Run 10 parallel calls.
+#     Check cost for key at the end.
+#     assert it's = expected cost.
+#     """
+#     model = "gpt-3.5-turbo"
+#     messages = [{"role": "user", "content": "Hey, how's it going?"}]
+#     number_of_calls = 10
+#     expected_cost = litellm.completion_cost(model=model, messages=messages) * number_of_calls
+#     async def litellm_acompletion():
+
+
+
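The new test file is committed entirely commented out and cuts off at the inner `litellm_acompletion` helper. A hedged sketch of how the fan-out could be completed, assuming `asyncio.gather` over `litellm.acompletion` and pricing each finished response with `litellm.completion_cost` (the comparison against the proxy key's tracked spend described in the docstring is not implemented here):

```python
# Sketch only: fills in the commented-out test above. The asyncio.gather fan-out
# and the use of completion_cost(completion_response=...) are assumptions, not
# part of this commit.
import asyncio
import litellm

async def proxy_cost_tracking_sketch():
    model = "gpt-3.5-turbo"
    messages = [{"role": "user", "content": "Hey, how's it going?"}]
    number_of_calls = 10

    async def litellm_acompletion():
        # one async completion; the real test would route this through a proxy key
        return await litellm.acompletion(model=model, messages=messages)

    responses = await asyncio.gather(
        *[litellm_acompletion() for _ in range(number_of_calls)]
    )

    # completion_cost(completion_response=...) prices a finished response,
    # so summing over all responses gives the spend the proxy should have tracked
    total_cost = sum(
        litellm.completion_cost(completion_response=r) for r in responses
    )
    print(f"total cost across {number_of_calls} calls: {total_cost}")
    return total_cost

if __name__ == "__main__":
    asyncio.run(proxy_cost_tracking_sketch())
```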
@@ -110,4 +110,26 @@ def test_stream_chunk_builder_litellm_tool_call():
     except Exception as e:
         pytest.fail(f"An exception occurred - {str(e)}")
 
-test_stream_chunk_builder_litellm_tool_call()
+# test_stream_chunk_builder_litellm_tool_call()
+
+def test_stream_chunk_builder_litellm_tool_call_regular_message():
+    try:
+        messages = [{"role": "user", "content": "Hey, how's it going?"}]
+        litellm.set_verbose = False
+        response = litellm.completion(
+            model="azure/gpt-4-nov-release",
+            messages=messages,
+            tools=tools_schema,
+            stream=True,
+            api_key="os.environ/AZURE_FRANCE_API_KEY",
+            api_base="https://openai-france-1234.openai.azure.com",
+            complete_response = True
+        )
+
+        print(f"complete response: {response}")
+        print(f"complete response usage: {response.usage}")
+        assert response.system_fingerprint is not None
+    except Exception as e:
+        pytest.fail(f"An exception occurred - {str(e)}")
+
+test_stream_chunk_builder_litellm_tool_call_regular_message()
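The new regular-message test streams with `complete_response=True` and then checks that `usage` and `system_fingerprint` survive on the rebuilt response. For readers unfamiliar with the helper under test, the manual equivalent collects the chunks and passes them to `litellm.stream_chunk_builder`; a minimal sketch (the model name is a placeholder, not the Azure deployment used in the test):

```python
# Sketch only: the manual equivalent of streaming with complete_response=True.
# The model name is a placeholder; any configured chat model works.
import litellm

messages = [{"role": "user", "content": "Hey, how's it going?"}]

stream = litellm.completion(model="gpt-3.5-turbo", messages=messages, stream=True)
chunks = list(stream)

# stream_chunk_builder stitches the streamed deltas back into one response
# object, including the usage block the test above prints.
rebuilt = litellm.stream_chunk_builder(chunks, messages=messages)
print(rebuilt.usage)
```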
@@ -5266,14 +5266,32 @@ class CustomStreamWrapper:
             print_verbose(f"model_response: {model_response}; completion_obj: {completion_obj}")
             print_verbose(f"model_response finish reason 3: {model_response.choices[0].finish_reason}")
             if len(completion_obj["content"]) > 0: # cannot set content of an OpenAI Object to be an empty string
-                hold, model_response_str = self.check_special_tokens(chunk=completion_obj["content"], finish_reason=model_response.choices[0].finish_reason)
+                hold, model_response_str = self.check_special_tokens(chunk=completion_obj["content"], finish_reason=model_response.choices[0].finish_reason) # filter out bos/eos tokens from openai-compatible hf endpoints
                 print_verbose(f"hold - {hold}, model_response_str - {model_response_str}")
                 if hold is False:
-                    completion_obj["content"] = model_response_str
-                    if self.sent_first_chunk == False:
-                        completion_obj["role"] = "assistant"
-                        self.sent_first_chunk = True
-                    model_response.choices[0].delta = Delta(**completion_obj)
+                    ## check if openai/azure chunk
+                    original_chunk = response_obj.get("original_chunk", None)
+                    if original_chunk:
+                        model_response.id = original_chunk.id
+                        if len(original_chunk.choices) > 0:
+                            try:
+                                delta = dict(original_chunk.choices[0].delta)
+                                model_response.choices[0].delta = Delta(**delta)
+                            except Exception as e:
+                                model_response.choices[0].delta = Delta()
+                        else:
+                            return
+                        model_response.system_fingerprint = original_chunk.system_fingerprint
+                        if self.sent_first_chunk == False:
+                            model_response.choices[0].delta["role"] = "assistant"
+                            self.sent_first_chunk = True
+                    else:
+                        ## else
+                        completion_obj["content"] = model_response_str
+                        if self.sent_first_chunk == False:
+                            completion_obj["role"] = "assistant"
+                            self.sent_first_chunk = True
+                        model_response.choices[0].delta = Delta(**completion_obj)
                     # LOGGING
                     threading.Thread(target=self.logging_obj.success_handler, args=(model_response,)).start()
                     print_verbose(f"model_response: {model_response}")
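The utils.py hunk is the streaming fix named in the commit title: when a chunk carries an `original_chunk` from an OpenAI/Azure SDK response, CustomStreamWrapper now copies that chunk's id, delta, and system_fingerprint onto the outgoing response (returning early when the chunk has no choices) instead of building a text-only Delta. At the call site the effect looks roughly like this (the deployment name is a placeholder):

```python
# Sketch only: what the CustomStreamWrapper change preserves on Azure/OpenAI
# streams. The deployment name is a placeholder.
import litellm

response = litellm.completion(
    model="azure/my-gpt-35-deployment",  # placeholder Azure deployment
    messages=[{"role": "user", "content": "Hey, how's it going?"}],
    stream=True,
)

for chunk in response:
    # With the fix, each chunk keeps the provider's id and system_fingerprint,
    # and tool-call deltas are copied through instead of being replaced by a
    # text-only Delta.
    print(chunk.id, chunk.system_fingerprint, chunk.choices[0].delta)
```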