fix(utils.py): fix

Krrish Dholakia 2024-06-04 19:41:20 -07:00
parent 54dacfdf61
commit 43af5575c8
2 changed files with 38 additions and 16 deletions

@@ -1997,20 +1997,42 @@ def test_openai_chat_completion_complete_response_call():
     "model",
     ["gpt-3.5-turbo", "azure/chatgpt-v-2"],
 )
-def test_openai_stream_options_call(model):
+@pytest.mark.parametrize(
+    "sync",
+    [True, False],
+)
+@pytest.mark.asyncio
+async def test_openai_stream_options_call(model, sync):
     litellm.set_verbose = False
+    usage = None
+    chunks = []
+    if sync:
         response = litellm.completion(
             model=model,
-        messages=[{"role": "system", "content": "say GM - we're going to make it "}],
+            messages=[
+                {"role": "system", "content": "say GM - we're going to make it "}
+            ],
             stream=True,
             stream_options={"include_usage": True},
             max_tokens=10,
         )
-    usage = None
-    chunks = []
         for chunk in response:
             print("chunk: ", chunk)
             chunks.append(chunk)
+    else:
+        response = await litellm.acompletion(
+            model=model,
+            messages=[
+                {"role": "system", "content": "say GM - we're going to make it "}
+            ],
+            stream=True,
+            stream_options={"include_usage": True},
+            max_tokens=10,
+        )
+        async for chunk in response:
+            print("chunk: ", chunk)
+            chunks.append(chunk)
     last_chunk = chunks[-1]
     print("last chunk: ", last_chunk)

@@ -11914,7 +11914,7 @@ class CustomStreamWrapper:
                         input=self.response_uptil_now, model=self.model
                     )
                     print_verbose(f"final returned processed chunk: {processed_chunk}")
-                    self.chunks.append(response)
+                    self.chunks.append(processed_chunk)
                     return processed_chunk
                 raise StopAsyncIteration
             else: # temporary patch for non-aiohttp async calls
@@ -11954,7 +11954,7 @@ class CustomStreamWrapper:
                         input=self.response_uptil_now, model=self.model
                     )
                     # RETURN RESULT
-                    self.chunks.append(response)
+                    self.chunks.append(processed_chunk)
                     return processed_chunk
         except StopAsyncIteration:
             if self.sent_last_chunk == True:
@@ -11965,17 +11965,17 @@ class CustomStreamWrapper:
                 ):
                     # send the final chunk with stream options
                     complete_streaming_response = litellm.stream_chunk_builder(
-                        chunks=self.chunks
+                        chunks=self.chunks, messages=self.messages
                     )
                     response = self.model_response_creator()
                     response.usage = complete_streaming_response.usage
                     ## LOGGING
                     threading.Thread(
-                        target=self.logging_obj.success_handler, args=(processed_chunk,)
+                        target=self.logging_obj.success_handler, args=(response,)
                     ).start() # log response
                     asyncio.create_task(
                         self.logging_obj.async_success_handler(
-                            processed_chunk,
+                            response,
                         )
                     )
                     self.sent_stream_usage = True
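
Taken together, the utils.py hunks change how the final usage chunk is produced when the stream ends: the accumulated chunks (now the processed chunks rather than the raw response) are rebuilt with litellm.stream_chunk_builder, which is additionally given the original messages so prompt tokens can be counted, and the rebuilt usage is attached to a fresh response object that is also what gets logged. A minimal standalone sketch of the same helper outside the wrapper, with a placeholder model and prompt:

import litellm

messages = [{"role": "user", "content": "say GM - we're going to make it "}]

# Stream a completion and collect every chunk.
chunks = []
response = litellm.completion(
    model="gpt-3.5-turbo",
    messages=messages,
    stream=True,
    max_tokens=10,
)
for chunk in response:
    chunks.append(chunk)

# Rebuild a complete response from the chunks; passing messages lets the builder
# count prompt tokens, mirroring the messages=self.messages added in this commit.
complete_response = litellm.stream_chunk_builder(chunks, messages=messages)
print(complete_response.usage)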