forked from phoenix/litellm-mirror
fix(utils.py): remove eos token for zephyr models
This commit is contained in:
parent f24786095a
commit 94dc3f66f3
3 changed files with 33 additions and 18 deletions
@@ -286,21 +286,6 @@ def hf_test_completion_tgi():
         pytest.fail(f"Error occurred: {e}")
 # hf_test_completion_tgi()
 
-def hf_test_completion_tgi_stream():
-    try:
-        response = completion(
-            model = 'huggingface/HuggingFaceH4/zephyr-7b-beta',
-            messages = [{ "content": "Hello, how are you?","role": "user"}],
-            stream=True
-        )
-        # Add any assertions here to check the response
-        print(response)
-        for chunk in response:
-            print(chunk["choices"][0]["delta"]["content"])
-    except Exception as e:
-        pytest.fail(f"Error occurred: {e}")
-# hf_test_completion_tgi_stream()
-
 # ################### Hugging Face Conversational models ########################
 # def hf_test_completion_conv():
 #     try:
@@ -631,6 +631,29 @@ def ai21_completion_call_bad_key():
 
 # ai21_completion_call_bad_key()
 
+def hf_test_completion_tgi_stream():
+    try:
+        response = completion(
+            model = 'huggingface/HuggingFaceH4/zephyr-7b-beta',
+            messages = [{ "content": "Hello, how are you?","role": "user"}],
+            stream=True
+        )
+        # Add any assertions here to check the response
+        print(f"response: {response}")
+        complete_response = ""
+        start_time = time.time()
+        for idx, chunk in enumerate(response):
+            chunk, finished = streaming_format_tests(idx, chunk)
+            complete_response += chunk
+            if finished:
+                break
+        if complete_response.strip() == "":
+            raise Exception("Empty response received")
+        print(f"completion_response: {complete_response}")
+    except Exception as e:
+        pytest.fail(f"Error occurred: {e}")
+hf_test_completion_tgi_stream()
+
 # def test_completion_aleph_alpha():
 #     try:
 #         response = completion(
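
The relocated streaming test leans on streaming_format_tests, a helper defined elsewhere in this test suite whose body is not part of this diff. From the call above its contract is (idx, chunk) -> (text, finished). Below is a hedged stand-in for illustration only; the specific field checks are assumptions, not the suite's actual assertions:

def streaming_format_tests(idx, chunk):
    # Hypothetical stand-in: the signature is inferred from the call in the test
    # above; the checks here are illustrative assumptions, not litellm's real helper.
    finished = False
    if idx == 0:
        # first chunk is assumed to carry OpenAI-style response metadata
        assert "id" in chunk and "created" in chunk and "model" in chunk
    if chunk["choices"][0]["finish_reason"] is not None:
        finished = True
    text = chunk["choices"][0]["delta"]["content"] or ""
    return text, finished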
@@ -706,7 +729,7 @@ def test_openai_chat_completion_call():
         print(f"error occurred: {traceback.format_exc()}")
         pass
 
-test_openai_chat_completion_call()
+# test_openai_chat_completion_call()
 
 def test_openai_chat_completion_complete_response_call():
     try:
@@ -4538,8 +4538,14 @@ class CustomStreamWrapper:
         if self.logging_obj:
             self.logging_obj.post_call(text)
 
-    def check_special_tokens(self, chunk: str):
+    def check_special_tokens(self, chunk: str, finish_reason: Optional[str]):
         hold = False
+        if finish_reason:
+            for token in self.special_tokens:
+                if token in chunk:
+                    chunk = chunk.replace(token, "")
+            return hold, chunk
+
         if self.sent_first_chunk is True:
             return hold, chunk
 
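
With the new finish_reason parameter, check_special_tokens scrubs special tokens only when the stream reports a finish_reason, i.e. on the terminating chunk. A minimal sketch of that branch, assuming the wrapper's special_tokens list contains markers such as zephyr's "</s>" EOS token (the actual list is not shown in this diff):

from typing import Optional

SPECIAL_TOKENS = ["</s>", "<s>", "<|assistant|>", "<|user|>", "<|system|>"]  # illustrative list only

def strip_special_tokens(chunk: str, finish_reason: Optional[str]) -> str:
    # Mirrors the new branch: scrub tokens only once finish_reason is set.
    if finish_reason:
        for token in SPECIAL_TOKENS:
            chunk = chunk.replace(token, "")
    return chunk

print(strip_special_tokens("I'm doing well.</s>", finish_reason="stop"))  # -> "I'm doing well."
print(strip_special_tokens("I'm doi", finish_reason=None))                # mid-stream chunks pass through untouched

Gating on finish_reason keeps mid-stream chunks untouched, so a partially streamed token is never held back or mangled before the stream actually ends.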
@@ -4996,8 +5002,9 @@ class CustomStreamWrapper:
             model_response.model = self.model
             print_verbose(f"model_response: {model_response}; completion_obj: {completion_obj}")
             print_verbose(f"model_response finish reason 3: {model_response.choices[0].finish_reason}")
+
             if len(completion_obj["content"]) > 0: # cannot set content of an OpenAI Object to be an empty string
-                hold, model_response_str = self.check_special_tokens(completion_obj["content"])
+                hold, model_response_str = self.check_special_tokens(chunk=completion_obj["content"], finish_reason=model_response.choices[0].finish_reason)
                 if hold is False:
                     completion_obj["content"] = model_response_str
                     if self.sent_first_chunk == False:
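
Combined with the call-site change above, the final streamed chunk reaches the caller with zephyr's EOS marker already removed. A hedged end-to-end check, mirroring the test added earlier in this diff (it needs Hugging Face credentials and network access; the delta access pattern is the one used by the removed test):

from litellm import completion

response = completion(
    model="huggingface/HuggingFaceH4/zephyr-7b-beta",
    messages=[{"content": "Hello, how are you?", "role": "user"}],
    stream=True,
)
assembled = ""
for chunk in response:
    piece = chunk["choices"][0]["delta"]["content"]
    if piece:
        assembled += piece
print(assembled)
assert "</s>" not in assembled  # EOS token no longer leaks into the streamed content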