diff --git a/litellm/tests/test_streaming.py b/litellm/tests/test_streaming.py
index 53a7278bb..384404d5d 100644
--- a/litellm/tests/test_streaming.py
+++ b/litellm/tests/test_streaming.py
@@ -1997,20 +1997,42 @@ def test_openai_chat_completion_complete_response_call():
     "model",
     ["gpt-3.5-turbo", "azure/chatgpt-v-2"],
 )
-def test_openai_stream_options_call(model):
+@pytest.mark.parametrize(
+    "sync",
+    [True, False],
+)
+@pytest.mark.asyncio
+async def test_openai_stream_options_call(model, sync):
     litellm.set_verbose = False
-    response = litellm.completion(
-        model=model,
-        messages=[{"role": "system", "content": "say GM - we're going to make it "}],
-        stream=True,
-        stream_options={"include_usage": True},
-        max_tokens=10,
-    )
     usage = None
     chunks = []
-    for chunk in response:
-        print("chunk: ", chunk)
-        chunks.append(chunk)
+    if sync:
+        response = litellm.completion(
+            model=model,
+            messages=[
+                {"role": "system", "content": "say GM - we're going to make it "}
+            ],
+            stream=True,
+            stream_options={"include_usage": True},
+            max_tokens=10,
+        )
+        for chunk in response:
+            print("chunk: ", chunk)
+            chunks.append(chunk)
+    else:
+        response = await litellm.acompletion(
+            model=model,
+            messages=[
+                {"role": "system", "content": "say GM - we're going to make it "}
+            ],
+            stream=True,
+            stream_options={"include_usage": True},
+            max_tokens=10,
+        )
+
+        async for chunk in response:
+            print("chunk: ", chunk)
+            chunks.append(chunk)
 
     last_chunk = chunks[-1]
     print("last chunk: ", last_chunk)
diff --git a/litellm/utils.py b/litellm/utils.py
index 76aee1218..28cc776ed 100644
--- a/litellm/utils.py
+++ b/litellm/utils.py
@@ -11914,7 +11914,7 @@ class CustomStreamWrapper:
                         input=self.response_uptil_now, model=self.model
                     )
                     print_verbose(f"final returned processed chunk: {processed_chunk}")
-                    self.chunks.append(response)
+                    self.chunks.append(processed_chunk)
                     return processed_chunk
                 raise StopAsyncIteration
             else:  # temporary patch for non-aiohttp async calls
@@ -11954,7 +11954,7 @@
                             input=self.response_uptil_now, model=self.model
                         )
                         # RETURN RESULT
-                        self.chunks.append(response)
+                        self.chunks.append(processed_chunk)
                         return processed_chunk
         except StopAsyncIteration:
             if self.sent_last_chunk == True:
@@ -11965,17 +11965,17 @@
                 ):  # send the final chunk with stream options
                     complete_streaming_response = litellm.stream_chunk_builder(
-                        chunks=self.chunks
+                        chunks=self.chunks, messages=self.messages
                     )
                     response = self.model_response_creator()
                     response.usage = complete_streaming_response.usage
                     ## LOGGING
                     threading.Thread(
-                        target=self.logging_obj.success_handler, args=(processed_chunk,)
+                        target=self.logging_obj.success_handler, args=(response,)
                     ).start()  # log response
                     asyncio.create_task(
                         self.logging_obj.async_success_handler(
-                            processed_chunk,
+                            response,
                         )
                     )
                     self.sent_stream_usage = True
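
For reference, here is a minimal sketch (not part of the diff) of the streaming pattern the updated test exercises, assuming `litellm` is installed and provider credentials are configured; the model name and prompt are placeholders:

```python
import litellm

# Stream a completion and ask the provider to attach a usage block to the
# final chunk via stream_options={"include_usage": True}.
response = litellm.completion(
    model="gpt-3.5-turbo",  # placeholder model
    messages=[{"role": "user", "content": "say GM - we're going to make it "}],
    stream=True,
    stream_options={"include_usage": True},
    max_tokens=10,
)

chunks = []
for chunk in response:
    chunks.append(chunk)

# With include_usage, the last chunk is expected to carry token usage; this is
# what the parametrized test checks for both the sync (litellm.completion) and
# async (litellm.acompletion) paths.
last_chunk = chunks[-1]
print(getattr(last_chunk, "usage", None))
```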