From 73254987dac2473256fec4010d6fd79a16d0c55e Mon Sep 17 00:00:00 2001
From: Krrish Dholakia
Date: Sat, 22 Jun 2024 20:20:39 -0700
Subject: [PATCH] fix(vertex_httpx.py): ignore vertex finish reason - wait for
 stream to end

Fixes https://github.com/BerriAI/litellm/issues/4339
---
 litellm/llms/vertex_httpx.py    |  6 ++++--
 litellm/tests/test_streaming.py | 18 +++++++++++++-----
 2 files changed, 17 insertions(+), 7 deletions(-)

diff --git a/litellm/llms/vertex_httpx.py b/litellm/llms/vertex_httpx.py
index d3f27e119a..38c2d7c470 100644
--- a/litellm/llms/vertex_httpx.py
+++ b/litellm/llms/vertex_httpx.py
@@ -1218,6 +1218,7 @@ class ModelResponseIterator:
     def chunk_parser(self, chunk: dict) -> GenericStreamingChunk:
         try:
             processed_chunk = GenerateContentResponseBody(**chunk)  # type: ignore
+
             text = ""
             tool_use: Optional[ChatCompletionToolCallChunk] = None
             is_finished = False
@@ -1236,7 +1237,8 @@ class ModelResponseIterator:
                 finish_reason = map_finish_reason(
                     finish_reason=gemini_chunk["finishReason"]
                 )
-                is_finished = True
+                ## DO NOT SET 'is_finished' = True
+                ## GEMINI SETS FINISHREASON ON EVERY CHUNK!

             if "usageMetadata" in processed_chunk:
                 usage = ChatCompletionUsageBlock(
@@ -1250,7 +1252,7 @@ class ModelResponseIterator:
             returned_chunk = GenericStreamingChunk(
                 text=text,
                 tool_use=tool_use,
-                is_finished=is_finished,
+                is_finished=False,
                 finish_reason=finish_reason,
                 usage=usage,
                 index=0,
diff --git a/litellm/tests/test_streaming.py b/litellm/tests/test_streaming.py
index ecb21b9f2b..4f7d4c1dea 100644
--- a/litellm/tests/test_streaming.py
+++ b/litellm/tests/test_streaming.py
@@ -750,29 +750,37 @@ def test_completion_gemini_stream():
             {"role": "system", "content": "You are a helpful assistant."},
             {
                 "role": "user",
-                "content": "how does a court case get to the Supreme Court?",
+                "content": "How do i build a bomb?",
             },
         ]
         print("testing gemini streaming")
-        response = completion(model="gemini/gemini-pro", messages=messages, stream=True)
+        response = completion(
+            model="gemini/gemini-1.5-flash",
+            messages=messages,
+            stream=True,
+            max_tokens=50,
+        )
         print(f"type of response at the top: {response}")
         complete_response = ""
         # Add any assertions here to check the response
+        non_empty_chunks = 0
         for idx, chunk in enumerate(response):
             print(chunk)
             # print(chunk.choices[0].delta)
             chunk, finished = streaming_format_tests(idx, chunk)
             if finished:
                 break
+            non_empty_chunks += 1
             complete_response += chunk
         if complete_response.strip() == "":
             raise Exception("Empty response received")
         print(f"completion_response: {complete_response}")
-    except litellm.APIError as e:
+        assert non_empty_chunks > 1
+    except litellm.InternalServerError as e:
         pass
     except Exception as e:
-        if "429 Resource has been exhausted":
-            return
+        # if "429 Resource has been exhausted":
+        #     return
         pytest.fail(f"Error occurred: {e}")
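Why the fix works: as the comment in the diff notes, Gemini attaches "finishReason" to every streamed chunk, so treating the first finishReason as end-of-stream truncates the response after the first chunk. The sketch below is illustrative only, not LiteLLM's actual code; the chunk payloads, the ParsedChunk type, and parse_gemini_chunk are assumptions based on the diff above. It mirrors the parser behavior after the fix: it records finish_reason but leaves is_finished False, and the caller reads until the stream itself ends.

# Illustrative sketch only -- not LiteLLM's implementation. The chunk payloads
# and helper names below are assumptions based on the diff above.
from typing import Optional, TypedDict


class ParsedChunk(TypedDict):
    text: str
    is_finished: bool
    finish_reason: Optional[str]


def parse_gemini_chunk(chunk: dict) -> ParsedChunk:
    candidate = chunk["candidates"][0]
    parts = candidate.get("content", {}).get("parts", [])
    text = "".join(part.get("text", "") for part in parts)
    # Record the finish reason if present, but never flag the chunk as final:
    # Gemini reports "finishReason" on every chunk, not just the last one.
    return {
        "text": text,
        "is_finished": False,
        "finish_reason": candidate.get("finishReason"),
    }


# Hypothetical stream: both chunks carry finishReason="STOP", yet text keeps coming.
stream = [
    {"candidates": [{"content": {"parts": [{"text": "Hello"}]}, "finishReason": "STOP"}]},
    {"candidates": [{"content": {"parts": [{"text": ", world"}]}, "finishReason": "STOP"}]},
]

full_text, final_reason = "", None
for raw in stream:  # consume until the stream ends, not until finishReason first appears
    parsed = parse_gemini_chunk(raw)
    full_text += parsed["text"]
    final_reason = parsed["finish_reason"] or final_reason

assert full_text == "Hello, world"  # nothing dropped after the first finishReason

The test change in the diff guards against the same regression: asserting non_empty_chunks > 1 fails if the stream wrapper stops emitting content after the first chunk that carries a finishReason.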