From 9e1e97528d51fed59cfc4c8bcce71a871d67fd77 Mon Sep 17 00:00:00 2001 From: Krrish Dholakia Date: Mon, 25 Mar 2024 16:33:41 -0700 Subject: [PATCH 1/9] fix(utils.py): ensure last chunk is always empty delta w/ finish reason makes sure we're openai-compatible with our streaming. Adds stricter tests for this as well --- litellm/tests/log.txt | 125 +------------ litellm/tests/test_streaming.py | 80 ++++++++- litellm/utils.py | 301 +++++++++++++++----------------- 3 files changed, 221 insertions(+), 285 deletions(-) diff --git a/litellm/tests/log.txt b/litellm/tests/log.txt index 74a7259bf..79aef9819 100644 --- a/litellm/tests/log.txt +++ b/litellm/tests/log.txt @@ -1,119 +1,6 @@ -============================= test session starts ============================== -platform darwin -- Python 3.11.6, pytest-7.3.1, pluggy-1.3.0 -rootdir: /Users/krrishdholakia/Documents/litellm/litellm/tests -plugins: timeout-2.2.0, asyncio-0.23.2, anyio-3.7.1, xdist-3.3.1 -asyncio: mode=Mode.STRICT -collected 1 item - -test_completion.py . [100%] - -=============================== warnings summary =============================== -../../../../../../opt/homebrew/lib/python3.11/site-packages/pydantic/_internal/_config.py:271 -../../../../../../opt/homebrew/lib/python3.11/site-packages/pydantic/_internal/_config.py:271 -../../../../../../opt/homebrew/lib/python3.11/site-packages/pydantic/_internal/_config.py:271 -../../../../../../opt/homebrew/lib/python3.11/site-packages/pydantic/_internal/_config.py:271 -../../../../../../opt/homebrew/lib/python3.11/site-packages/pydantic/_internal/_config.py:271 -../../../../../../opt/homebrew/lib/python3.11/site-packages/pydantic/_internal/_config.py:271 -../../../../../../opt/homebrew/lib/python3.11/site-packages/pydantic/_internal/_config.py:271 -../../../../../../opt/homebrew/lib/python3.11/site-packages/pydantic/_internal/_config.py:271 -../../../../../../opt/homebrew/lib/python3.11/site-packages/pydantic/_internal/_config.py:271 - /opt/homebrew/lib/python3.11/site-packages/pydantic/_internal/_config.py:271: PydanticDeprecatedSince20: Support for class-based `config` is deprecated, use ConfigDict instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.5/migration/ - warnings.warn(DEPRECATION_MESSAGE, DeprecationWarning) - -../proxy/_types.py:102 - /Users/krrishdholakia/Documents/litellm/litellm/proxy/_types.py:102: PydanticDeprecatedSince20: `pydantic.config.Extra` is deprecated, use literal values instead (e.g. `extra='allow'`). Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.5/migration/ - extra = Extra.allow # Allow extra fields - -../proxy/_types.py:105 - /Users/krrishdholakia/Documents/litellm/litellm/proxy/_types.py:105: PydanticDeprecatedSince20: Pydantic V1 style `@root_validator` validators are deprecated. You should migrate to Pydantic V2 style `@model_validator` validators, see the migration guide for more details. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.5/migration/ - @root_validator(pre=True) - -../proxy/_types.py:134 - /Users/krrishdholakia/Documents/litellm/litellm/proxy/_types.py:134: PydanticDeprecatedSince20: Pydantic V1 style `@root_validator` validators are deprecated. You should migrate to Pydantic V2 style `@model_validator` validators, see the migration guide for more details. Deprecated in Pydantic V2.0 to be removed in V3.0. 
See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.5/migration/ - @root_validator(pre=True) - -../proxy/_types.py:180 - /Users/krrishdholakia/Documents/litellm/litellm/proxy/_types.py:180: PydanticDeprecatedSince20: Pydantic V1 style `@root_validator` validators are deprecated. You should migrate to Pydantic V2 style `@model_validator` validators, see the migration guide for more details. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.5/migration/ - @root_validator(pre=True) - -../proxy/_types.py:241 - /Users/krrishdholakia/Documents/litellm/litellm/proxy/_types.py:241: PydanticDeprecatedSince20: Pydantic V1 style `@root_validator` validators are deprecated. You should migrate to Pydantic V2 style `@model_validator` validators, see the migration guide for more details. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.5/migration/ - @root_validator(pre=True) - -../proxy/_types.py:253 - /Users/krrishdholakia/Documents/litellm/litellm/proxy/_types.py:253: PydanticDeprecatedSince20: Pydantic V1 style `@root_validator` validators are deprecated. You should migrate to Pydantic V2 style `@model_validator` validators, see the migration guide for more details. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.5/migration/ - @root_validator(pre=True) - -../proxy/_types.py:292 - /Users/krrishdholakia/Documents/litellm/litellm/proxy/_types.py:292: PydanticDeprecatedSince20: Pydantic V1 style `@root_validator` validators are deprecated. You should migrate to Pydantic V2 style `@model_validator` validators, see the migration guide for more details. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.5/migration/ - @root_validator(pre=True) - -../proxy/_types.py:319 - /Users/krrishdholakia/Documents/litellm/litellm/proxy/_types.py:319: PydanticDeprecatedSince20: Pydantic V1 style `@root_validator` validators are deprecated. You should migrate to Pydantic V2 style `@model_validator` validators, see the migration guide for more details. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.5/migration/ - @root_validator(pre=True) - -../proxy/_types.py:570 - /Users/krrishdholakia/Documents/litellm/litellm/proxy/_types.py:570: PydanticDeprecatedSince20: Pydantic V1 style `@root_validator` validators are deprecated. You should migrate to Pydantic V2 style `@model_validator` validators, see the migration guide for more details. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.5/migration/ - @root_validator(pre=True) - -../proxy/_types.py:591 - /Users/krrishdholakia/Documents/litellm/litellm/proxy/_types.py:591: PydanticDeprecatedSince20: Pydantic V1 style `@root_validator` validators are deprecated. You should migrate to Pydantic V2 style `@model_validator` validators, see the migration guide for more details. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.5/migration/ - @root_validator(pre=True) - -../utils.py:35 - /Users/krrishdholakia/Documents/litellm/litellm/utils.py:35: DeprecationWarning: pkg_resources is deprecated as an API. 
See https://setuptools.pypa.io/en/latest/pkg_resources.html - import pkg_resources - -../../../../../../opt/homebrew/lib/python3.11/site-packages/pkg_resources/__init__.py:2871: 10 warnings - /opt/homebrew/lib/python3.11/site-packages/pkg_resources/__init__.py:2871: DeprecationWarning: Deprecated call to `pkg_resources.declare_namespace('google')`. - Implementing implicit namespace packages (as specified in PEP 420) is preferred to `pkg_resources.declare_namespace`. See https://setuptools.pypa.io/en/latest/references/keywords.html#keyword-namespace-packages - declare_namespace(pkg) - -../../../../../../opt/homebrew/lib/python3.11/site-packages/pkg_resources/__init__.py:2871 -../../../../../../opt/homebrew/lib/python3.11/site-packages/pkg_resources/__init__.py:2871 -../../../../../../opt/homebrew/lib/python3.11/site-packages/pkg_resources/__init__.py:2871 -../../../../../../opt/homebrew/lib/python3.11/site-packages/pkg_resources/__init__.py:2871 -../../../../../../opt/homebrew/lib/python3.11/site-packages/pkg_resources/__init__.py:2871 - /opt/homebrew/lib/python3.11/site-packages/pkg_resources/__init__.py:2871: DeprecationWarning: Deprecated call to `pkg_resources.declare_namespace('google.cloud')`. - Implementing implicit namespace packages (as specified in PEP 420) is preferred to `pkg_resources.declare_namespace`. See https://setuptools.pypa.io/en/latest/references/keywords.html#keyword-namespace-packages - declare_namespace(pkg) - -../../../../../../opt/homebrew/lib/python3.11/site-packages/pkg_resources/__init__.py:2350 -../../../../../../opt/homebrew/lib/python3.11/site-packages/pkg_resources/__init__.py:2350 -../../../../../../opt/homebrew/lib/python3.11/site-packages/pkg_resources/__init__.py:2350 - /opt/homebrew/lib/python3.11/site-packages/pkg_resources/__init__.py:2350: DeprecationWarning: Deprecated call to `pkg_resources.declare_namespace('google')`. - Implementing implicit namespace packages (as specified in PEP 420) is preferred to `pkg_resources.declare_namespace`. See https://setuptools.pypa.io/en/latest/references/keywords.html#keyword-namespace-packages - declare_namespace(parent) - -../../../../../../opt/homebrew/lib/python3.11/site-packages/pkg_resources/__init__.py:2871 - /opt/homebrew/lib/python3.11/site-packages/pkg_resources/__init__.py:2871: DeprecationWarning: Deprecated call to `pkg_resources.declare_namespace('google.logging')`. - Implementing implicit namespace packages (as specified in PEP 420) is preferred to `pkg_resources.declare_namespace`. See https://setuptools.pypa.io/en/latest/references/keywords.html#keyword-namespace-packages - declare_namespace(pkg) - -../../../../../../opt/homebrew/lib/python3.11/site-packages/pkg_resources/__init__.py:2871 - /opt/homebrew/lib/python3.11/site-packages/pkg_resources/__init__.py:2871: DeprecationWarning: Deprecated call to `pkg_resources.declare_namespace('google.iam')`. - Implementing implicit namespace packages (as specified in PEP 420) is preferred to `pkg_resources.declare_namespace`. See https://setuptools.pypa.io/en/latest/references/keywords.html#keyword-namespace-packages - declare_namespace(pkg) - -../../../../../../opt/homebrew/lib/python3.11/site-packages/pkg_resources/__init__.py:2871 - /opt/homebrew/lib/python3.11/site-packages/pkg_resources/__init__.py:2871: DeprecationWarning: Deprecated call to `pkg_resources.declare_namespace('mpl_toolkits')`. - Implementing implicit namespace packages (as specified in PEP 420) is preferred to `pkg_resources.declare_namespace`. 
See https://setuptools.pypa.io/en/latest/references/keywords.html#keyword-namespace-packages - declare_namespace(pkg) - -../../../../../../opt/homebrew/lib/python3.11/site-packages/pkg_resources/__init__.py:2871 - /opt/homebrew/lib/python3.11/site-packages/pkg_resources/__init__.py:2871: DeprecationWarning: Deprecated call to `pkg_resources.declare_namespace('sphinxcontrib')`. - Implementing implicit namespace packages (as specified in PEP 420) is preferred to `pkg_resources.declare_namespace`. See https://setuptools.pypa.io/en/latest/references/keywords.html#keyword-namespace-packages - declare_namespace(pkg) - -../llms/prompt_templates/factory.py:6 - /Users/krrishdholakia/Documents/litellm/litellm/llms/prompt_templates/factory.py:6: DeprecationWarning: 'imghdr' is deprecated and slated for removal in Python 3.13 - import imghdr, base64 - -test_completion.py::test_completion_claude_3_stream -../utils.py:3249 -../utils.py:3249 - /Users/krrishdholakia/Documents/litellm/litellm/utils.py:3249: DeprecationWarning: open_text is deprecated. Use files() instead. Refer to https://importlib-resources.readthedocs.io/en/latest/using.html#migrating-from-legacy for migration advice. - with resources.open_text( - --- Docs: https://docs.pytest.org/en/stable/how-to/capture-warnings.html -======================== 1 passed, 46 warnings in 3.14s ======================== + +chunk: ModelResponse(id='chatcmpl-95b7d389-ff5a-4e09-a084-02584ba2cf1e', choices=[StreamingChoices(finish_reason=None, index=0, delta=Delta(content='In the United States of America, the Supreme Court has ultimate judicial authority, and it is the one that rules on legal disputes between the states, or on the interpretation of the', role='assistant', function_call=None, tool_calls=None), logprobs=None)], created=1711406570, model='ai21.j2-mid-v1', object='chat.completion.chunk', system_fingerprint=None, usage=Usage()) +extracted chunk: In the United States of America, the Supreme Court has ultimate judicial authority, and it is the one that rules on legal disputes between the states, or on the interpretation of the +chunk: ModelResponse(id='chatcmpl-95b7d389-ff5a-4e09-a084-02584ba2cf1e', choices=[StreamingChoices(finish_reason='stop', index=0, delta=Delta(content=None, role=None, function_call=None, tool_calls=None), logprobs=None)], created=1711406570, model='ai21.j2-mid-v1', object='chat.completion.chunk', system_fingerprint=None, usage=Usage()) +extracted chunk: +completion_response: In the United States of America, the Supreme Court has ultimate judicial authority, and it is the one that rules on legal disputes between the states, or on the interpretation of the diff --git a/litellm/tests/test_streaming.py b/litellm/tests/test_streaming.py index d854177aa..79036ab01 100644 --- a/litellm/tests/test_streaming.py +++ b/litellm/tests/test_streaming.py @@ -108,8 +108,19 @@ last_openai_chunk_example = { "choices": [{"index": 0, "delta": {}, "finish_reason": "stop"}], } +""" +Final chunk (sdk): +chunk: ChatCompletionChunk(id='chatcmpl-96mM3oNBlxh2FDWVLKsgaFBBcULmI', +choices=[Choice(delta=ChoiceDelta(content=None, function_call=None, role=None, +tool_calls=None), finish_reason='stop', index=0, logprobs=None)], +created=1711402871, model='gpt-3.5-turbo-0125', object='chat.completion.chunk', system_fingerprint='fp_3bc1b5746c') +""" + def validate_last_format(chunk): + """ + Ensure last chunk has no remaining content or tools + """ assert isinstance(chunk, ModelResponse), "Chunk should be a dictionary." 
assert isinstance(chunk["id"], str), "'id' should be a string." assert isinstance(chunk["object"], str), "'object' should be a string." @@ -119,6 +130,10 @@ def validate_last_format(chunk): for choice in chunk["choices"]: assert isinstance(choice["index"], int), "'index' should be an integer." + assert choice["delta"]["content"] is None + assert choice["delta"]["function_call"] is None + assert choice["delta"]["role"] is None + assert choice["delta"]["tool_calls"] is None assert isinstance( choice["finish_reason"], str ), "'finish_reason' should be a string." @@ -493,13 +508,15 @@ def test_completion_mistral_api_stream(): stream=True, ) complete_response = "" + has_finish_reason = False for idx, chunk in enumerate(response): - print(chunk) - # print(chunk.choices[0].delta) chunk, finished = streaming_format_tests(idx, chunk) if finished: + has_finish_reason = True break complete_response += chunk + if has_finish_reason == False: + raise Exception("finish reason not set") if complete_response.strip() == "": raise Exception("Empty response received") print(f"completion_response: {complete_response}") @@ -534,11 +551,15 @@ def test_completion_deep_infra_stream(): complete_response = "" # Add any assertions here to check the response + has_finish_reason = False for idx, chunk in enumerate(response): chunk, finished = streaming_format_tests(idx, chunk) if finished: + has_finish_reason = True break complete_response += chunk + if has_finish_reason == False: + raise Exception("finish reason not set") if complete_response.strip() == "": raise Exception("Empty response received") print(f"completion_response: {complete_response}") @@ -608,11 +629,15 @@ def test_completion_claude_stream_bad_key(): ) complete_response = "" # Add any assertions here to check the response + has_finish_reason = False for idx, chunk in enumerate(response): chunk, finished = streaming_format_tests(idx, chunk) if finished: + has_finish_reason = True break complete_response += chunk + if has_finish_reason == False: + raise Exception("finish reason not set") if complete_response.strip() == "": raise Exception("Empty response received") print(f"1234completion_response: {complete_response}") @@ -626,6 +651,45 @@ def test_completion_claude_stream_bad_key(): # test_completion_claude_stream_bad_key() # test_completion_replicate_stream() + +def test_vertex_ai_stream(): + from litellm.tests.test_amazing_vertex_completion import load_vertex_ai_credentials + + load_vertex_ai_credentials() + litellm.set_verbose = True + litellm.vertex_project = "reliablekeys" + import random + + test_models = ["gemini-1.0-pro"] + for model in test_models: + try: + print("making request", model) + response = completion( + model=model, + messages=[ + {"role": "user", "content": "write 10 line code code for saying hi"} + ], + stream=True, + ) + complete_response = "" + is_finished = False + for idx, chunk in enumerate(response): + print(f"chunk in response: {chunk}") + chunk, finished = streaming_format_tests(idx, chunk) + if finished: + is_finished = True + break + complete_response += chunk + if complete_response.strip() == "": + raise Exception("Empty response received") + print(f"completion_response: {complete_response}") + assert is_finished == True + except litellm.RateLimitError as e: + pass + except Exception as e: + pytest.fail(f"Error occurred: {e}") + + # def test_completion_vertexai_stream(): # try: # import os @@ -742,11 +806,15 @@ def test_bedrock_claude_3_streaming(): ) complete_response = "" # Add any assertions here to check the response 
+ has_finish_reason = False for idx, chunk in enumerate(response): chunk, finished = streaming_format_tests(idx, chunk) if finished: + has_finish_reason = True break complete_response += chunk + if has_finish_reason == False: + raise Exception("finish reason not set") if complete_response.strip() == "": raise Exception("Empty response received") print(f"completion_response: {complete_response}") @@ -1705,7 +1773,7 @@ def test_success_callback_streaming(): messages = [{"role": "user", "content": "hello"}] print("TESTING LITELLM COMPLETION CALL") response = litellm.completion( - model="j2-light", + model="gpt-3.5-turbo", messages=messages, stream=True, max_tokens=5, @@ -2072,7 +2140,7 @@ def test_completion_claude_3_function_call_with_streaming(): ) idx = 0 for chunk in response: - # print(f"chunk: {chunk}") + print(f"chunk in response: {chunk}") if idx == 0: assert ( chunk.choices[0].delta.tool_calls[0].function.arguments is not None @@ -2081,7 +2149,7 @@ def test_completion_claude_3_function_call_with_streaming(): chunk.choices[0].delta.tool_calls[0].function.arguments, str ) validate_first_streaming_function_calling_chunk(chunk=chunk) - elif idx == 1: + elif idx == 1 and chunk.choices[0].finish_reason is None: validate_second_streaming_function_calling_chunk(chunk=chunk) elif chunk.choices[0].finish_reason is not None: # last chunk validate_final_streaming_function_calling_chunk(chunk=chunk) @@ -2136,7 +2204,7 @@ async def test_acompletion_claude_3_function_call_with_streaming(): chunk.choices[0].delta.tool_calls[0].function.arguments, str ) validate_first_streaming_function_calling_chunk(chunk=chunk) - elif idx == 1: + elif idx == 1 and chunk.choices[0].finish_reason is None: validate_second_streaming_function_calling_chunk(chunk=chunk) elif chunk.choices[0].finish_reason is not None: # last chunk validate_final_streaming_function_calling_chunk(chunk=chunk) diff --git a/litellm/utils.py b/litellm/utils.py index 2e16d1d97..6543a4769 100644 --- a/litellm/utils.py +++ b/litellm/utils.py @@ -8458,6 +8458,7 @@ class CustomStreamWrapper: self.completion_stream = completion_stream self.sent_first_chunk = False self.sent_last_chunk = False + self.received_finish_reason: Optional[str] = None self.special_tokens = ["<|assistant|>", "<|system|>", "<|user|>", "", ""] self.holding_chunk = "" self.complete_response = "" @@ -9131,7 +9132,7 @@ class CustomStreamWrapper: "finish_reason": finish_reason, } - def chunk_creator(self, chunk): + def model_response_creator(self): model_response = ModelResponse(stream=True, model=self.model) if self.response_id is not None: model_response.id = self.response_id @@ -9141,6 +9142,20 @@ class CustomStreamWrapper: model_response._hidden_params["created_at"] = time.time() model_response.choices = [StreamingChoices()] model_response.choices[0].finish_reason = None + return model_response + + def is_delta_empty(self, delta: Delta) -> bool: + is_empty = True + if delta.content is not None: + is_empty = False + elif delta.tool_calls is not None: + is_empty = False + elif delta.function_call is not None: + is_empty = False + return is_empty + + def chunk_creator(self, chunk): + model_response = self.model_response_creator() response_obj = {} try: # return this for all models @@ -9149,30 +9164,22 @@ class CustomStreamWrapper: response_obj = self.handle_anthropic_chunk(chunk) completion_obj["content"] = response_obj["text"] if response_obj["is_finished"]: - model_response.choices[0].finish_reason = response_obj[ - "finish_reason" - ] + self.received_finish_reason = 
response_obj["finish_reason"] elif self.model == "replicate" or self.custom_llm_provider == "replicate": response_obj = self.handle_replicate_chunk(chunk) completion_obj["content"] = response_obj["text"] if response_obj["is_finished"]: - model_response.choices[0].finish_reason = response_obj[ - "finish_reason" - ] + self.received_finish_reason = response_obj["finish_reason"] elif self.custom_llm_provider and self.custom_llm_provider == "together_ai": response_obj = self.handle_together_ai_chunk(chunk) completion_obj["content"] = response_obj["text"] if response_obj["is_finished"]: - model_response.choices[0].finish_reason = response_obj[ - "finish_reason" - ] + self.received_finish_reason = response_obj["finish_reason"] elif self.custom_llm_provider and self.custom_llm_provider == "huggingface": response_obj = self.handle_huggingface_chunk(chunk) completion_obj["content"] = response_obj["text"] if response_obj["is_finished"]: - model_response.choices[0].finish_reason = response_obj[ - "finish_reason" - ] + self.received_finish_reason = response_obj["finish_reason"] elif ( self.custom_llm_provider and self.custom_llm_provider == "baseten" ): # baseten doesn't provide streaming @@ -9183,16 +9190,12 @@ class CustomStreamWrapper: response_obj = self.handle_ai21_chunk(chunk) completion_obj["content"] = response_obj["text"] if response_obj["is_finished"]: - model_response.choices[0].finish_reason = response_obj[ - "finish_reason" - ] + self.received_finish_reason = response_obj["finish_reason"] elif self.custom_llm_provider and self.custom_llm_provider == "maritalk": response_obj = self.handle_maritalk_chunk(chunk) completion_obj["content"] = response_obj["text"] if response_obj["is_finished"]: - model_response.choices[0].finish_reason = response_obj[ - "finish_reason" - ] + self.received_finish_reason = response_obj["finish_reason"] elif self.custom_llm_provider and self.custom_llm_provider == "vllm": completion_obj["content"] = chunk[0].outputs[0].text elif ( @@ -9201,152 +9204,116 @@ class CustomStreamWrapper: response_obj = self.handle_aleph_alpha_chunk(chunk) completion_obj["content"] = response_obj["text"] if response_obj["is_finished"]: - model_response.choices[0].finish_reason = response_obj[ - "finish_reason" - ] + self.received_finish_reason = response_obj["finish_reason"] elif self.custom_llm_provider == "nlp_cloud": try: response_obj = self.handle_nlp_cloud_chunk(chunk) completion_obj["content"] = response_obj["text"] if response_obj["is_finished"]: - model_response.choices[0].finish_reason = response_obj[ - "finish_reason" - ] + self.received_finish_reason = response_obj["finish_reason"] except Exception as e: - if self.sent_last_chunk: + if self.received_finish_reason: raise e else: if self.sent_first_chunk is False: raise Exception("An unknown error occurred with the stream") - model_response.choices[0].finish_reason = "stop" - self.sent_last_chunk = True + self.received_finish_reason = "stop" elif self.custom_llm_provider == "gemini": - try: - if hasattr(chunk, "parts") == True: - try: - if len(chunk.parts) > 0: - completion_obj["content"] = chunk.parts[0].text - if hasattr(chunk.parts[0], "finish_reason"): - model_response.choices[0].finish_reason = ( - map_finish_reason(chunk.parts[0].finish_reason.name) - ) - except: - if chunk.parts[0].finish_reason.name == "SAFETY": - raise Exception( - f"The response was blocked by VertexAI. 
{str(chunk)}" - ) - else: - completion_obj["content"] = str(chunk) - except StopIteration as e: - if self.sent_last_chunk: - raise e - else: - model_response.choices[0].finish_reason = "stop" - self.sent_last_chunk = True + if hasattr(chunk, "parts") == True: + try: + if len(chunk.parts) > 0: + completion_obj["content"] = chunk.parts[0].text + if hasattr(chunk.parts[0], "finish_reason"): + self.received_finish_reason = chunk.parts[ + 0 + ].finish_reason.name + except: + if chunk.parts[0].finish_reason.name == "SAFETY": + raise Exception( + f"The response was blocked by VertexAI. {str(chunk)}" + ) + else: + completion_obj["content"] = str(chunk) elif self.custom_llm_provider and (self.custom_llm_provider == "vertex_ai"): - try: - if hasattr(chunk, "candidates") == True: + if hasattr(chunk, "candidates") == True: + try: try: - try: - completion_obj["content"] = chunk.text - except Exception as e: - if "Part has no text." in str(e): - ## check for function calling - function_call = ( - chunk.candidates[0] - .content.parts[0] - .function_call - ) - args_dict = {} - for k, v in function_call.args.items(): - args_dict[k] = v - args_str = json.dumps(args_dict) - _delta_obj = litellm.utils.Delta( - content=None, - tool_calls=[ - { - "id": f"call_{str(uuid.uuid4())}", - "function": { - "arguments": args_str, - "name": function_call.name, - }, - "type": "function", - } - ], - ) - _streaming_response = StreamingChoices( - delta=_delta_obj - ) - _model_response = ModelResponse(stream=True) - _model_response.choices = [_streaming_response] - response_obj = {"original_chunk": _model_response} - else: - raise e - if ( - hasattr(chunk.candidates[0], "finish_reason") - and chunk.candidates[0].finish_reason.name - != "FINISH_REASON_UNSPECIFIED" - ): # every non-final chunk in vertex ai has this - model_response.choices[0].finish_reason = ( - map_finish_reason( - chunk.candidates[0].finish_reason.name - ) - ) + completion_obj["content"] = chunk.text except Exception as e: - if chunk.candidates[0].finish_reason.name == "SAFETY": - raise Exception( - f"The response was blocked by VertexAI. {str(chunk)}" + if "Part has no text." in str(e): + ## check for function calling + function_call = ( + chunk.candidates[0].content.parts[0].function_call ) - else: - completion_obj["content"] = str(chunk) - except StopIteration as e: - if self.sent_last_chunk: - raise e - else: - model_response.choices[0].finish_reason = "stop" - self.sent_last_chunk = True + args_dict = {} + for k, v in function_call.args.items(): + args_dict[k] = v + args_str = json.dumps(args_dict) + _delta_obj = litellm.utils.Delta( + content=None, + tool_calls=[ + { + "id": f"call_{str(uuid.uuid4())}", + "function": { + "arguments": args_str, + "name": function_call.name, + }, + "type": "function", + } + ], + ) + _streaming_response = StreamingChoices(delta=_delta_obj) + _model_response = ModelResponse(stream=True) + _model_response.choices = [_streaming_response] + response_obj = {"original_chunk": _model_response} + else: + raise e + if ( + hasattr(chunk.candidates[0], "finish_reason") + and chunk.candidates[0].finish_reason.name + != "FINISH_REASON_UNSPECIFIED" + ): # every non-final chunk in vertex ai has this + self.received_finish_reason = chunk.candidates[ + 0 + ].finish_reason.name + except Exception as e: + if chunk.candidates[0].finish_reason.name == "SAFETY": + raise Exception( + f"The response was blocked by VertexAI. 
{str(chunk)}" + ) + else: + completion_obj["content"] = str(chunk) elif self.custom_llm_provider == "cohere": response_obj = self.handle_cohere_chunk(chunk) completion_obj["content"] = response_obj["text"] if response_obj["is_finished"]: - model_response.choices[0].finish_reason = response_obj[ - "finish_reason" - ] + self.received_finish_reason = response_obj["finish_reason"] elif self.custom_llm_provider == "cohere_chat": response_obj = self.handle_cohere_chat_chunk(chunk) if response_obj is None: return completion_obj["content"] = response_obj["text"] if response_obj["is_finished"]: - model_response.choices[0].finish_reason = response_obj[ - "finish_reason" - ] + self.received_finish_reason = response_obj["finish_reason"] elif self.custom_llm_provider == "bedrock": - if self.sent_last_chunk: + if self.received_finish_reason is not None: raise StopIteration response_obj = self.handle_bedrock_stream(chunk) completion_obj["content"] = response_obj["text"] if response_obj["is_finished"]: - model_response.choices[0].finish_reason = response_obj[ - "finish_reason" - ] - self.sent_last_chunk = True + self.received_finish_reason = response_obj["finish_reason"] elif self.custom_llm_provider == "sagemaker": - verbose_logger.debug(f"ENTERS SAGEMAKER STREAMING for chunk {chunk}") + print_verbose(f"ENTERS SAGEMAKER STREAMING for chunk {chunk}") response_obj = self.handle_sagemaker_stream(chunk) completion_obj["content"] = response_obj["text"] if response_obj["is_finished"]: - model_response.choices[0].finish_reason = response_obj[ - "finish_reason" - ] - self.sent_last_chunk = True + self.received_finish_reason = response_obj["finish_reason"] elif self.custom_llm_provider == "petals": if len(self.completion_stream) == 0: - if self.sent_last_chunk: + if self.received_finish_reason is not None: raise StopIteration else: - model_response.choices[0].finish_reason = "stop" - self.sent_last_chunk = True + self.received_finish_reason = "stop" chunk_size = 30 new_chunk = self.completion_stream[:chunk_size] completion_obj["content"] = new_chunk @@ -9356,11 +9323,10 @@ class CustomStreamWrapper: # fake streaming response_obj = {} if len(self.completion_stream) == 0: - if self.sent_last_chunk: + if self.received_finish_reason is not None: raise StopIteration else: - model_response.choices[0].finish_reason = "stop" - self.sent_last_chunk = True + self.received_finish_reason = "stop" chunk_size = 30 new_chunk = self.completion_stream[:chunk_size] completion_obj["content"] = new_chunk @@ -9371,41 +9337,31 @@ class CustomStreamWrapper: completion_obj["content"] = response_obj["text"] print_verbose(f"completion obj content: {completion_obj['content']}") if response_obj["is_finished"]: - model_response.choices[0].finish_reason = response_obj[ - "finish_reason" - ] + self.received_finish_reason = response_obj["finish_reason"] elif self.custom_llm_provider == "ollama_chat": response_obj = self.handle_ollama_chat_stream(chunk) completion_obj["content"] = response_obj["text"] print_verbose(f"completion obj content: {completion_obj['content']}") if response_obj["is_finished"]: - model_response.choices[0].finish_reason = response_obj[ - "finish_reason" - ] + self.received_finish_reason = response_obj["finish_reason"] elif self.custom_llm_provider == "cloudflare": response_obj = self.handle_cloudlfare_stream(chunk) completion_obj["content"] = response_obj["text"] print_verbose(f"completion obj content: {completion_obj['content']}") if response_obj["is_finished"]: - model_response.choices[0].finish_reason = response_obj[ - 
"finish_reason" - ] + self.received_finish_reason = response_obj["finish_reason"] elif self.custom_llm_provider == "text-completion-openai": response_obj = self.handle_openai_text_completion_chunk(chunk) completion_obj["content"] = response_obj["text"] print_verbose(f"completion obj content: {completion_obj['content']}") if response_obj["is_finished"]: - model_response.choices[0].finish_reason = response_obj[ - "finish_reason" - ] + self.received_finish_reason = response_obj["finish_reason"] elif self.custom_llm_provider == "azure_text": response_obj = self.handle_azure_text_completion_chunk(chunk) completion_obj["content"] = response_obj["text"] print_verbose(f"completion obj content: {completion_obj['content']}") if response_obj["is_finished"]: - model_response.choices[0].finish_reason = response_obj[ - "finish_reason" - ] + self.received_finish_reason = response_obj["finish_reason"] elif self.custom_llm_provider == "cached_response": response_obj = { "text": chunk.choices[0].delta.content, @@ -9419,9 +9375,7 @@ class CustomStreamWrapper: if hasattr(chunk, "id"): model_response.id = chunk.id if response_obj["is_finished"]: - model_response.choices[0].finish_reason = response_obj[ - "finish_reason" - ] + self.received_finish_reason = response_obj["finish_reason"] else: # openai / azure chat model if self.custom_llm_provider == "azure": if hasattr(chunk, "model"): @@ -9437,9 +9391,7 @@ class CustomStreamWrapper: raise Exception( "Mistral API raised a streaming error - finish_reason: error, no content string given." ) - model_response.choices[0].finish_reason = response_obj[ - "finish_reason" - ] + self.received_finish_reason = response_obj["finish_reason"] if response_obj.get("original_chunk", None) is not None: model_response.system_fingerprint = getattr( response_obj["original_chunk"], "system_fingerprint", None @@ -9451,7 +9403,7 @@ class CustomStreamWrapper: model_response.model = self.model print_verbose( - f"model_response finish reason 3: {model_response.choices[0].finish_reason}; response_obj={response_obj}" + f"model_response finish reason 3: {self.received_finish_reason}; response_obj={response_obj}" ) ## FUNCTION CALL PARSING if ( @@ -9580,7 +9532,7 @@ class CustomStreamWrapper: return model_response else: return - elif model_response.choices[0].finish_reason is not None: + elif self.received_finish_reason is not None: # flush any remaining holding chunk if len(self.holding_chunk) > 0: if model_response.choices[0].delta.content is None: @@ -9590,10 +9542,17 @@ class CustomStreamWrapper: self.holding_chunk + model_response.choices[0].delta.content ) self.holding_chunk = "" - # get any function call arguments - model_response.choices[0].finish_reason = map_finish_reason( - model_response.choices[0].finish_reason - ) # ensure consistent output to openai + # if delta is None + is_delta_empty = self.is_delta_empty( + delta=model_response.choices[0].delta + ) + if is_delta_empty: + # get any function call arguments + model_response.choices[0].finish_reason = map_finish_reason( + finish_reason=self.received_finish_reason + ) # ensure consistent output to openai + self.sent_last_chunk = True + return model_response elif ( model_response.choices[0].delta.tool_calls is not None @@ -9653,6 +9612,16 @@ class CustomStreamWrapper: ## SYNC LOGGING self.logging_obj.success_handler(processed_chunk) + def finish_reason_handler(self): + model_response = self.model_response_creator() + if self.received_finish_reason is not None: + model_response.choices[0].finish_reason = map_finish_reason( + 
finish_reason=self.received_finish_reason + ) + else: + model_response.choices[0].finish_reason = "stop" + return model_response + ## needs to handle the empty string case (even starting chunk can be an empty string) def __next__(self): try: @@ -9687,7 +9656,11 @@ class CustomStreamWrapper: # RETURN RESULT return response except StopIteration: - raise # Re-raise StopIteration + if self.sent_last_chunk == True: + raise # Re-raise StopIteration + else: + self.sent_last_chunk = True + return self.finish_reason_handler() except Exception as e: traceback_exception = traceback.format_exc() # LOG FAILURE - handle streaming failure logging in the _next_ object, remove `handle_failure` once it's deprecated @@ -9792,9 +9765,17 @@ class CustomStreamWrapper: # RETURN RESULT return processed_chunk except StopAsyncIteration: - raise + if self.sent_last_chunk == True: + raise # Re-raise StopIteration + else: + self.sent_last_chunk = True + return self.finish_reason_handler() except StopIteration: - raise StopAsyncIteration # Re-raise StopIteration + if self.sent_last_chunk == True: + raise StopAsyncIteration + else: + self.sent_last_chunk = True + return self.finish_reason_handler() except Exception as e: traceback_exception = traceback.format_exc() # Handle any exceptions that might occur during streaming From dc2c4af631b9d7f43caa9a9224cd9b2a4b539756 Mon Sep 17 00:00:00 2001 From: Krrish Dholakia Date: Mon, 25 Mar 2024 16:47:17 -0700 Subject: [PATCH 2/9] fix(utils.py): fix text completion streaming --- litellm/tests/test_text_completion.py | 3 ++- litellm/utils.py | 1 - 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/litellm/tests/test_text_completion.py b/litellm/tests/test_text_completion.py index a16b86449..1a7d2de6e 100644 --- a/litellm/tests/test_text_completion.py +++ b/litellm/tests/test_text_completion.py @@ -2907,6 +2907,7 @@ def test_async_text_completion_stream(): async def test_get_response(): try: + litellm.set_verbose = True response = await litellm.atext_completion( model="gpt-3.5-turbo-instruct", prompt="good morning", @@ -2930,7 +2931,7 @@ def test_async_text_completion_stream(): asyncio.run(test_get_response()) -test_async_text_completion_stream() +# test_async_text_completion_stream() @pytest.mark.asyncio diff --git a/litellm/utils.py b/litellm/utils.py index 6543a4769..bd47e08ba 100644 --- a/litellm/utils.py +++ b/litellm/utils.py @@ -8899,7 +8899,6 @@ class CustomStreamWrapper: if data_json["choices"][0].get("finish_reason", None): is_finished = True finish_reason = data_json["choices"][0]["finish_reason"] - self.sent_last_chunk = True print_verbose( f"text: {text}; is_finished: {is_finished}; finish_reason: {finish_reason}" ) From 1ac641165be20b0123704693e2d1747ebb9f6fa0 Mon Sep 17 00:00:00 2001 From: Krrish Dholakia Date: Mon, 25 Mar 2024 18:20:43 -0700 Subject: [PATCH 3/9] fix(utils.py): persist response id across chunks --- litellm/tests/test_custom_logger.py | 9 ++++++--- litellm/utils.py | 5 +++++ 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/litellm/tests/test_custom_logger.py b/litellm/tests/test_custom_logger.py index b2e2b7d22..7e8a53561 100644 --- a/litellm/tests/test_custom_logger.py +++ b/litellm/tests/test_custom_logger.py @@ -490,7 +490,7 @@ def test_redis_cache_completion_stream(): response_1_content += chunk.choices[0].delta.content or "" print(response_1_content) - time.sleep(0.1) # sleep for 0.1 seconds allow set cache to occur + time.sleep(1) # sleep for 0.1 seconds allow set cache to occur response2 = completion( 
model="gpt-3.5-turbo", messages=messages, @@ -505,8 +505,10 @@ def test_redis_cache_completion_stream(): response_2_id = chunk.id print(chunk) response_2_content += chunk.choices[0].delta.content or "" - print("\nresponse 1", response_1_content) - print("\nresponse 2", response_2_content) + print( + f"\nresponse 1: {response_1_content}", + ) + print(f"\nresponse 2: {response_2_content}") assert ( response_1_id == response_2_id ), f"Response 1 != Response 2. Same params, Response 1{response_1_content} != Response 2{response_2_content}" @@ -516,6 +518,7 @@ def test_redis_cache_completion_stream(): litellm.success_callback = [] litellm._async_success_callback = [] litellm.cache = None + raise Exception("it worked!") except Exception as e: print(e) litellm.success_callback = [] diff --git a/litellm/utils.py b/litellm/utils.py index bd47e08ba..496bb75ec 100644 --- a/litellm/utils.py +++ b/litellm/utils.py @@ -8458,6 +8458,7 @@ class CustomStreamWrapper: self.completion_stream = completion_stream self.sent_first_chunk = False self.sent_last_chunk = False + self.system_fingerprint: Optional[str] = None self.received_finish_reason: Optional[str] = None self.special_tokens = ["<|assistant|>", "<|system|>", "<|user|>", "", ""] self.holding_chunk = "" @@ -9373,6 +9374,7 @@ class CustomStreamWrapper: print_verbose(f"completion obj content: {completion_obj['content']}") if hasattr(chunk, "id"): model_response.id = chunk.id + self.response_id = chunk.id if response_obj["is_finished"]: self.received_finish_reason = response_obj["finish_reason"] else: # openai / azure chat model @@ -9397,6 +9399,7 @@ class CustomStreamWrapper: ) if hasattr(response_obj["original_chunk"], "id"): model_response.id = response_obj["original_chunk"].id + self.response_id = model_response.id if response_obj["logprobs"] is not None: model_response.choices[0].logprobs = response_obj["logprobs"] @@ -9412,6 +9415,7 @@ class CustomStreamWrapper: # enter this branch when no content has been passed in response original_chunk = response_obj.get("original_chunk", None) model_response.id = original_chunk.id + self.response_id = original_chunk.id if len(original_chunk.choices) > 0: if ( original_chunk.choices[0].delta.function_call is not None @@ -9493,6 +9497,7 @@ class CustomStreamWrapper: original_chunk = response_obj.get("original_chunk", None) if original_chunk: model_response.id = original_chunk.id + self.response_id = original_chunk.id if len(original_chunk.choices) > 0: try: delta = dict(original_chunk.choices[0].delta) From a5776a3054aad5ce45f2b29cf7198b1d78e96e8f Mon Sep 17 00:00:00 2001 From: Krrish Dholakia Date: Mon, 25 Mar 2024 18:32:12 -0700 Subject: [PATCH 4/9] test(test_custom_logger.py): cleanup test --- litellm/tests/test_custom_logger.py | 1 - 1 file changed, 1 deletion(-) diff --git a/litellm/tests/test_custom_logger.py b/litellm/tests/test_custom_logger.py index 7e8a53561..0b85b463c 100644 --- a/litellm/tests/test_custom_logger.py +++ b/litellm/tests/test_custom_logger.py @@ -518,7 +518,6 @@ def test_redis_cache_completion_stream(): litellm.success_callback = [] litellm._async_success_callback = [] litellm.cache = None - raise Exception("it worked!") except Exception as e: print(e) litellm.success_callback = [] From bd75498913a399c4a1050b5e1212381d142640ff Mon Sep 17 00:00:00 2001 From: Krrish Dholakia Date: Mon, 25 Mar 2024 19:03:10 -0700 Subject: [PATCH 5/9] fix(utils.py): log success event for streaming --- litellm/tests/test_custom_callback_input.py | 2 + litellm/utils.py | 43 +++++++++++++++------ 2 files 
changed, 33 insertions(+), 12 deletions(-) diff --git a/litellm/tests/test_custom_callback_input.py b/litellm/tests/test_custom_callback_input.py index 5c52867f9..4296f188d 100644 --- a/litellm/tests/test_custom_callback_input.py +++ b/litellm/tests/test_custom_callback_input.py @@ -651,6 +651,7 @@ async def test_async_chat_vertex_ai_stream(): try: load_vertex_ai_credentials() customHandler = CompletionCustomHandler() + litellm.set_verbose = True litellm.callbacks = [customHandler] # test streaming response = await litellm.acompletion( @@ -667,6 +668,7 @@ async def test_async_chat_vertex_ai_stream(): async for chunk in response: print(f"chunk: {chunk}") continue + await asyncio.sleep(10) print(f"customHandler.states: {customHandler.states}") assert ( customHandler.states.count("async_success") == 1 diff --git a/litellm/utils.py b/litellm/utils.py index 496bb75ec..51623ce91 100644 --- a/litellm/utils.py +++ b/litellm/utils.py @@ -1774,16 +1774,14 @@ class Logging: end_time=end_time, ) except Exception as e: - verbose_logger.debug( + print_verbose( f"Error occurred building stream chunk: {traceback.format_exc()}" ) complete_streaming_response = None else: self.streaming_chunks.append(result) if complete_streaming_response is not None: - verbose_logger.debug( - "Async success callbacks: Got a complete streaming response" - ) + print_verbose("Async success callbacks: Got a complete streaming response") self.model_call_details["async_complete_streaming_response"] = ( complete_streaming_response ) @@ -1824,7 +1822,7 @@ class Logging: callbacks.append(callback) else: callbacks = litellm._async_success_callback - verbose_logger.debug(f"Async success callbacks: {callbacks}") + print_verbose(f"Async success callbacks: {callbacks}") for callback in callbacks: # check if callback can run for this request litellm_params = self.model_call_details.get("litellm_params", {}) @@ -1894,10 +1892,6 @@ class Logging: end_time=end_time, ) if callable(callback): # custom logger functions - # print_verbose( - # f"Making async function logging call for {callback}, result={result} - {self.model_call_details}", - # logger_only=True, - # ) if self.stream: if ( "async_complete_streaming_response" @@ -9664,7 +9658,12 @@ class CustomStreamWrapper: raise # Re-raise StopIteration else: self.sent_last_chunk = True - return self.finish_reason_handler() + processed_chunk = self.finish_reason_handler() + ## LOGGING + threading.Thread( + target=self.logging_obj.success_handler, args=(processed_chunk,) + ).start() # log response + return processed_chunk except Exception as e: traceback_exception = traceback.format_exc() # LOG FAILURE - handle streaming failure logging in the _next_ object, remove `handle_failure` once it's deprecated @@ -9773,13 +9772,33 @@ class CustomStreamWrapper: raise # Re-raise StopIteration else: self.sent_last_chunk = True - return self.finish_reason_handler() + processed_chunk = self.finish_reason_handler() + ## LOGGING + threading.Thread( + target=self.logging_obj.success_handler, args=(processed_chunk,) + ).start() # log response + asyncio.create_task( + self.logging_obj.async_success_handler( + processed_chunk, + ) + ) + return processed_chunk except StopIteration: if self.sent_last_chunk == True: raise StopAsyncIteration else: self.sent_last_chunk = True - return self.finish_reason_handler() + processed_chunk = self.finish_reason_handler() + ## LOGGING + threading.Thread( + target=self.logging_obj.success_handler, args=(processed_chunk,) + ).start() # log response + asyncio.create_task( + 
self.logging_obj.async_success_handler( + processed_chunk, + ) + ) + return processed_chunk except Exception as e: traceback_exception = traceback.format_exc() # Handle any exceptions that might occur during streaming From fa297b67ca55e43c018b2c2bc7ea9452276ac759 Mon Sep 17 00:00:00 2001 From: Krrish Dholakia Date: Mon, 25 Mar 2024 19:11:39 -0700 Subject: [PATCH 6/9] fix(test_amazing_vertex_completion.py): fix test to check if content is none --- litellm/tests/test_amazing_vertex_completion.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/litellm/tests/test_amazing_vertex_completion.py b/litellm/tests/test_amazing_vertex_completion.py index 264bb7a70..89862a4c1 100644 --- a/litellm/tests/test_amazing_vertex_completion.py +++ b/litellm/tests/test_amazing_vertex_completion.py @@ -281,7 +281,8 @@ async def test_async_vertexai_streaming_response(): complete_response = "" async for chunk in response: print(f"chunk: {chunk}") - complete_response += chunk.choices[0].delta.content + if chunk.choices[0].delta.content is not None: + complete_response += chunk.choices[0].delta.content print(f"complete_response: {complete_response}") assert len(complete_response) > 0 except litellm.RateLimitError as e: From 1c55f2ccc53d35f8da4a8123b10a6b52f7ce0195 Mon Sep 17 00:00:00 2001 From: Krrish Dholakia Date: Mon, 25 Mar 2024 19:24:09 -0700 Subject: [PATCH 7/9] fix(utils.py): persist system fingerprint across chunks --- litellm/utils.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/litellm/utils.py b/litellm/utils.py index 51623ce91..200bb8607 100644 --- a/litellm/utils.py +++ b/litellm/utils.py @@ -9132,6 +9132,8 @@ class CustomStreamWrapper: model_response.id = self.response_id else: self.response_id = model_response.id + if self.system_fingerprint is not None: + model_response.system_fingerprint = self.system_fingerprint model_response._hidden_params["custom_llm_provider"] = self.custom_llm_provider model_response._hidden_params["created_at"] = time.time() model_response.choices = [StreamingChoices()] @@ -9369,6 +9371,8 @@ class CustomStreamWrapper: if hasattr(chunk, "id"): model_response.id = chunk.id self.response_id = chunk.id + if hasattr(chunk, "system_fingerprint"): + self.system_fingerprint = chunk.system_fingerprint if response_obj["is_finished"]: self.received_finish_reason = response_obj["finish_reason"] else: # openai / azure chat model @@ -9388,12 +9392,16 @@ class CustomStreamWrapper: ) self.received_finish_reason = response_obj["finish_reason"] if response_obj.get("original_chunk", None) is not None: - model_response.system_fingerprint = getattr( - response_obj["original_chunk"], "system_fingerprint", None - ) if hasattr(response_obj["original_chunk"], "id"): model_response.id = response_obj["original_chunk"].id self.response_id = model_response.id + if hasattr(response_obj["original_chunk"], "system_fingerprint"): + model_response.system_fingerprint = response_obj[ + "original_chunk" + ].system_fingerprint + self.system_fingerprint = response_obj[ + "original_chunk" + ].system_fingerprint if response_obj["logprobs"] is not None: model_response.choices[0].logprobs = response_obj["logprobs"] From 4d85387b5ae484066fa267bb2e5b9993c7c1ba0d Mon Sep 17 00:00:00 2001 From: Krrish Dholakia Date: Mon, 25 Mar 2024 19:33:57 -0700 Subject: [PATCH 8/9] test(test_azure_astreaming_and_function_calling): fix test to handle caching --- litellm/tests/test_streaming.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/litellm/tests/test_streaming.py b/litellm/tests/test_streaming.py index 79036ab01..ee7cb64cd 100644 --- a/litellm/tests/test_streaming.py +++ b/litellm/tests/test_streaming.py @@ -2096,7 +2096,7 @@ async def test_azure_astreaming_and_function_calling(): chunk.choices[0].delta.tool_calls[0].function.arguments, str ) validate_first_streaming_function_calling_chunk(chunk=chunk) - elif idx == 1: + elif idx == 1 and chunk.choices[0].finish_reason is None: validate_second_streaming_function_calling_chunk(chunk=chunk) elif chunk.choices[0].finish_reason is not None: # last chunk validate_final_streaming_function_calling_chunk(chunk=chunk) From 643fd6ac965166179758c8e1552a39d70879ae4e Mon Sep 17 00:00:00 2001 From: Krrish Dholakia Date: Mon, 25 Mar 2024 21:36:47 -0700 Subject: [PATCH 9/9] test(test_caching.py): fix test_redis_cache_acompletion_stream --- litellm/tests/test_caching.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/litellm/tests/test_caching.py b/litellm/tests/test_caching.py index 4bc5d1d5a..100b29847 100644 --- a/litellm/tests/test_caching.py +++ b/litellm/tests/test_caching.py @@ -505,7 +505,7 @@ def test_redis_cache_completion_stream(): response_2_id = "" for chunk in response2: print(chunk) - response_2_id += chunk.id + response_2_id = chunk.id assert ( response_1_id == response_2_id ), f"Response 1 != Response 2. Same params, Response 1{response_1_id} != Response 2{response_2_id}"
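
Note: for reference, a minimal consumer-side sketch of the behavior this series enforces (the model name is a placeholder; any streaming-capable provider is expected to behave the same way). It mirrors the new `validate_last_format` assertions: the final streamed chunk carries a finish_reason and an otherwise empty delta, matching the OpenAI SDK.

    from litellm import completion

    response = completion(
        model="gpt-3.5-turbo",  # placeholder; any streaming-capable model
        messages=[{"role": "user", "content": "hello"}],
        stream=True,
    )

    last_chunk = None
    for chunk in response:
        last_chunk = chunk

    # After this series, the last chunk is OpenAI-compatible:
    # an empty delta (no content / tool_calls / function_call) plus a finish_reason.
    assert last_chunk.choices[0].finish_reason is not None
    assert last_chunk.choices[0].delta.content is None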