fix(utils.py): remove eos token for zephyr models

2025-04-24 10:14:26 +00:00 · 2023-11-23 17:47:39 -08:00 · 2023-11-23 17:47:39 -08:00 · 94dc3f66f3
commit 94dc3f66f3
parent f24786095a
3 changed files with 33 additions and 18 deletions
--- a/litellm/tests/test_completion.py
+++ b/litellm/tests/test_completion.py
@@ -286,21 +286,6 @@ def hf_test_completion_tgi():
        pytest.fail(f"Error occurred: {e}")
 # hf_test_completion_tgi()

-def hf_test_completion_tgi_stream():
-    try:
-        response = completion(
-            model = 'huggingface/HuggingFaceH4/zephyr-7b-beta', 
-            messages = [{ "content": "Hello, how are you?","role": "user"}],
-            stream=True
-        )
-        # Add any assertions here to check the response
-        print(response)
-        for chunk in response:
-            print(chunk["choices"][0]["delta"]["content"])
-    except Exception as e:
-        pytest.fail(f"Error occurred: {e}")
-# hf_test_completion_tgi_stream()
-
 # ################### Hugging Face Conversational models ########################
 # def hf_test_completion_conv():
 #     try:
--- a/litellm/tests/test_streaming.py
+++ b/litellm/tests/test_streaming.py
@@ -631,6 +631,29 @@ def ai21_completion_call_bad_key():

 # ai21_completion_call_bad_key()

+def hf_test_completion_tgi_stream():
+    try:
+        response = completion(
+            model = 'huggingface/HuggingFaceH4/zephyr-7b-beta', 
+            messages = [{ "content": "Hello, how are you?","role": "user"}],
+            stream=True
+        )
+        # Add any assertions here to check the response
+        print(f"response: {response}")
+        complete_response = ""
+        start_time = time.time()
+        for idx, chunk in enumerate(response):
+            chunk, finished = streaming_format_tests(idx, chunk)
+            complete_response += chunk
+            if finished:
+                break
+        if complete_response.strip() == "": 
+            raise Exception("Empty response received")
+        print(f"completion_response: {complete_response}")
+    except Exception as e:
+        pytest.fail(f"Error occurred: {e}")
+hf_test_completion_tgi_stream()
+
 # def test_completion_aleph_alpha():
 #     try:
 #         response = completion(
@@ -706,7 +729,7 @@ def test_openai_chat_completion_call():
        print(f"error occurred: {traceback.format_exc()}")
        pass

-test_openai_chat_completion_call()
+# test_openai_chat_completion_call()

 def test_openai_chat_completion_complete_response_call():
    try:
--- a/litellm/utils.py
+++ b/litellm/utils.py
@@ -4538,8 +4538,14 @@ class CustomStreamWrapper:
        if self.logging_obj: 
            self.logging_obj.post_call(text)
    
-    def check_special_tokens(self, chunk: str): 
+    def check_special_tokens(self, chunk: str, finish_reason: Optional[str]): 
        hold = False
+        if finish_reason: 
+            for token in self.special_tokens: 
+                if token in chunk:
+                    chunk = chunk.replace(token, "") 
+            return hold, chunk
+        
        if self.sent_first_chunk is True:
            return hold, chunk

@@ -4996,8 +5002,9 @@ class CustomStreamWrapper:
            model_response.model = self.model
            print_verbose(f"model_response: {model_response}; completion_obj: {completion_obj}")
            print_verbose(f"model_response finish reason 3: {model_response.choices[0].finish_reason}")
+
            if len(completion_obj["content"]) > 0: # cannot set content of an OpenAI Object to be an empty string
-                hold, model_response_str = self.check_special_tokens(completion_obj["content"])
+                hold, model_response_str = self.check_special_tokens(chunk=completion_obj["content"], finish_reason=model_response.choices[0].finish_reason)
                if hold is False: 
                    completion_obj["content"] = model_response_str  
                    if self.sent_first_chunk == False: