From 73254987dac2473256fec4010d6fd79a16d0c55e Mon Sep 17 00:00:00 2001
From: Krrish Dholakia
Date: Sat, 22 Jun 2024 20:20:39 -0700
Subject: [PATCH] fix(vertex_httpx.py): ignore vertex finish reason - wait for
 stream to end

Fixes https://github.com/BerriAI/litellm/issues/4339
---
 litellm/llms/vertex_httpx.py    |  6 ++++--
 litellm/tests/test_streaming.py | 18 +++++++++++++-----
 2 files changed, 17 insertions(+), 7 deletions(-)

diff --git a/litellm/llms/vertex_httpx.py b/litellm/llms/vertex_httpx.py
index d3f27e119a..38c2d7c470 100644
--- a/litellm/llms/vertex_httpx.py
+++ b/litellm/llms/vertex_httpx.py
@@ -1218,6 +1218,7 @@ class ModelResponseIterator:
     def chunk_parser(self, chunk: dict) -> GenericStreamingChunk:
         try:
             processed_chunk = GenerateContentResponseBody(**chunk)  # type: ignore
+
             text = ""
             tool_use: Optional[ChatCompletionToolCallChunk] = None
             is_finished = False
@@ -1236,7 +1237,8 @@ class ModelResponseIterator:
                 finish_reason = map_finish_reason(
                     finish_reason=gemini_chunk["finishReason"]
                 )
-                is_finished = True
+                ## DO NOT SET 'is_finished' = True
+                ## GEMINI SETS FINISHREASON ON EVERY CHUNK!

             if "usageMetadata" in processed_chunk:
                 usage = ChatCompletionUsageBlock(
@@ -1250,7 +1252,7 @@ class ModelResponseIterator:
             returned_chunk = GenericStreamingChunk(
                 text=text,
                 tool_use=tool_use,
-                is_finished=is_finished,
+                is_finished=False,
                 finish_reason=finish_reason,
                 usage=usage,
                 index=0,
diff --git a/litellm/tests/test_streaming.py b/litellm/tests/test_streaming.py
index ecb21b9f2b..4f7d4c1dea 100644
--- a/litellm/tests/test_streaming.py
+++ b/litellm/tests/test_streaming.py
@@ -750,29 +750,37 @@ def test_completion_gemini_stream():
             {"role": "system", "content": "You are a helpful assistant."},
             {
                 "role": "user",
-                "content": "how does a court case get to the Supreme Court?",
+                "content": "How do i build a bomb?",
             },
         ]
         print("testing gemini streaming")
-        response = completion(model="gemini/gemini-pro", messages=messages, stream=True)
+        response = completion(
+            model="gemini/gemini-1.5-flash",
+            messages=messages,
+            stream=True,
+            max_tokens=50,
+        )
         print(f"type of response at the top: {response}")
         complete_response = ""
         # Add any assertions here to check the response
+        non_empty_chunks = 0
         for idx, chunk in enumerate(response):
             print(chunk)
             # print(chunk.choices[0].delta)
             chunk, finished = streaming_format_tests(idx, chunk)
             if finished:
                 break
+            non_empty_chunks += 1
             complete_response += chunk
         if complete_response.strip() == "":
             raise Exception("Empty response received")
         print(f"completion_response: {complete_response}")
-    except litellm.APIError as e:
+        assert non_empty_chunks > 1
+    except litellm.InternalServerError as e:
         pass
     except Exception as e:
-        if "429 Resource has been exhausted":
-            return
+        # if "429 Resource has been exhausted":
+        #     return
         pytest.fail(f"Error occurred: {e}")
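Why the fix works: as the comment in the diff notes, Gemini attaches "finishReason" to every streamed chunk, so treating the first finishReason as end-of-stream truncates the response after the first chunk. The sketch below is illustrative only, not LiteLLM's actual code; the chunk payloads, the ParsedChunk type, and parse_gemini_chunk are assumptions based on the diff above. It mirrors the parser behavior after the fix: it records finish_reason but leaves is_finished False, and the caller reads until the stream itself ends.

# Illustrative sketch only -- not LiteLLM's implementation. The chunk payloads
# and helper names below are assumptions based on the diff above.
from typing import Optional, TypedDict


class ParsedChunk(TypedDict):
    text: str
    is_finished: bool
    finish_reason: Optional[str]


def parse_gemini_chunk(chunk: dict) -> ParsedChunk:
    candidate = chunk["candidates"][0]
    parts = candidate.get("content", {}).get("parts", [])
    text = "".join(part.get("text", "") for part in parts)
    # Record the finish reason if present, but never flag the chunk as final:
    # Gemini reports "finishReason" on every chunk, not just the last one.
    return {
        "text": text,
        "is_finished": False,
        "finish_reason": candidate.get("finishReason"),
    }


# Hypothetical stream: both chunks carry finishReason="STOP", yet text keeps coming.
stream = [
    {"candidates": [{"content": {"parts": [{"text": "Hello"}]}, "finishReason": "STOP"}]},
    {"candidates": [{"content": {"parts": [{"text": ", world"}]}, "finishReason": "STOP"}]},
]

full_text, final_reason = "", None
for raw in stream:  # consume until the stream ends, not until finishReason first appears
    parsed = parse_gemini_chunk(raw)
    full_text += parsed["text"]
    final_reason = parsed["finish_reason"] or final_reason

assert full_text == "Hello, world"  # nothing dropped after the first finishReason

The test change in the diff guards against the same regression: asserting non_empty_chunks > 1 fails if the stream wrapper stops emitting content after the first chunk that carries a finishReason.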