Mirror of https://github.com/BerriAI/litellm.git (synced 2025-04-24 10:14:26 +00:00)

fix(main.py): fix streaming_chunk_builder to return usage

This commit is contained in:
parent 8edfcd8e5d
commit 4a5dae3941

6 changed files with 133 additions and 117 deletions
@@ -2066,14 +2066,14 @@ def config_completion(**kwargs):
             "No config path set, please set a config path using `litellm.config_path = 'path/to/config.json'`"
         )

-def stream_chunk_builder(chunks: list):
+def stream_chunk_builder(chunks: list, messages: Optional[list]=None):
     id = chunks[0]["id"]
     object = chunks[0]["object"]
     created = chunks[0]["created"]
     model = chunks[0]["model"]
     role = chunks[0]["choices"][0]["delta"]["role"]
     finish_reason = chunks[-1]["choices"][0]["finish_reason"]

     # Initialize the response dictionary
     response = {
         "id": id,
@@ -2105,7 +2105,7 @@ def stream_chunk_builder(chunks: list):
         argument_list = []
         delta = chunks[0]["choices"][0]["delta"]
         function_call = delta.get("function_call", "")
-        function_call_name = function_call.get("name", "")
+        function_call_name = function_call.name

         message = response["choices"][0]["message"]
         message["function_call"] = {}
@@ -2120,7 +2120,7 @@ def stream_chunk_builder(chunks: list):
             # Check if a function call is present
             if function_call:
                 # Now, function_call is expected to be a dictionary
-                arguments = function_call.get("arguments", "")
+                arguments = function_call.arguments
                 argument_list.append(arguments)

         combined_arguments = "".join(argument_list)
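For readers following the function-calling path, here is a rough, self-contained sketch of how the streamed arguments end up concatenated. The exact chunk layout is an assumption based on the surrounding code, not taken from any docs; the point is that delta.function_call is now read with attribute access (.name, .arguments), matching the hunks above:

    # sketch: each streamed chunk carries a fragment of the function-call arguments;
    # reading .arguments (attribute access) matches the object-style delta used above
    argument_list = []
    for chunk in chunks:
        delta = chunk["choices"][0]["delta"]
        function_call = delta.get("function_call", "")
        if function_call:
            argument_list.append(function_call.arguments)
    combined_arguments = "".join(argument_list)  # the full JSON arguments string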
@@ -2144,5 +2144,8 @@ def stream_chunk_builder(chunks: list):


     # # Update usage information if needed
-    return response
+    if messages:
+        response["usage"]["prompt_tokens"] = litellm.utils.token_counter(model=model, messages=messages)
+    response["usage"]["completion_tokens"] = litellm.utils.token_counter(model=model, text=combined_content)
+    response["usage"]["total_tokens"] = response["usage"]["prompt_tokens"] + response["usage"]["completion_tokens"]
+    return litellm.utils.convert_to_model_response_object(response_object=response, model_response_object=litellm.ModelResponse())
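The net effect of the main.py hunks: the rebuilt response now carries token usage, with prompt tokens counted only when the original messages are provided. A minimal caller-side sketch, assuming an OpenAI key is configured and using a placeholder prompt (not part of this diff):

    import litellm
    from litellm import completion, stream_chunk_builder

    messages = [{"role": "user", "content": "Hey, how's it going?"}]

    # collect the raw stream, then rebuild one complete response from the chunks
    chunks = [chunk for chunk in completion(model="gpt-3.5-turbo", messages=messages, stream=True)]

    # passing messages lets the builder count prompt tokens via litellm.utils.token_counter,
    # so usage is populated instead of left empty
    rebuilt = stream_chunk_builder(chunks, messages=messages)
    print(rebuilt["usage"])  # prompt_tokens, completion_tokens, total_tokens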
@@ -13,78 +13,6 @@ from concurrent.futures import ThreadPoolExecutor
 from dotenv import load_dotenv
 load_dotenv()

-def test_multiple_deployments():
-    import concurrent, time
-    litellm.set_verbose=False
-    futures = {}
-    model_list = [{ # list of model deployments
-        "model_name": "gpt-3.5-turbo", # openai model name
-        "litellm_params": { # params for litellm completion/embedding call
-            "model": "azure/chatgpt-v-2",
-            "api_key": "bad-key",
-            "api_version": os.getenv("AZURE_API_VERSION"),
-            "api_base": os.getenv("AZURE_API_BASE")
-        },
-        "tpm": 240000,
-        "rpm": 1800
-    },
-    {
-        "model_name": "gpt-3.5-turbo", # openai model name
-        "litellm_params": { # params for litellm completion/embedding call
-            "model": "gpt-3.5-turbo",
-            "api_key": os.getenv("OPENAI_API_KEY"),
-        },
-        "tpm": 1000000,
-        "rpm": 9000
-    }
-    ]
-
-    router = Router(model_list=model_list,
-                    redis_host=os.getenv("REDIS_HOST"),
-                    redis_password=os.getenv("REDIS_PASSWORD"),
-                    redis_port=int(os.getenv("REDIS_PORT")),
-                    routing_strategy="simple-shuffle",
-                    set_verbose=False,
-                    num_retries=1) # type: ignore
-    kwargs = {"model": "gpt-3.5-turbo", "messages": [{"role": "user", "content": "Hey, how's it going?"}],}
-
-    results = []
-
-    try:
-        for _ in range(3):
-            response = router.completion(**kwargs)
-            results.append(response)
-        router.flush_cache()
-    except Exception as e:
-        print(f"FAILED TEST!")
-        pytest.fail(f"An error occurred - {str(e)}")
-
-    # start_time = time.time()
-    # for _ in range(1000):
-    #     future = executor.submit(router.completion, **kwargs)
-    #     futures[future] = future
-
-    # # Retrieve the results from the futures
-    # while futures:
-    #     done, not_done = concurrent.futures.wait(futures, timeout=10, return_when=concurrent.futures.FIRST_COMPLETED)
-    #     for future in done:
-    #         try:
-    #             result = future.result()
-    #             results.append(result)
-    #             futures.pop(future) # Remove the done future
-    #         except Exception as e:
-    #             print(f"Exception: {e}; traceback: {traceback.format_exc()}")
-    #             futures.pop(future) # Remove the done future with exception
-
-    #     print(f"Remaining futures: {len(futures)}")
-
-    # end_time = time.time()
-    # print(f"ELAPSED TIME: {end_time-start_time}")
-    # Check results
-
-
-# test_multiple_deployments()
-
 def test_exception_raising():
     # this tests if the router raises an exception when invalid params are set
     # in this test both deployments have bad keys - Keep this test. It validates if the router raises the most recent exception
litellm/tests/test_router_cooldowns.py (new file, 99 lines)
@@ -0,0 +1,99 @@
+#### What this tests ####
+# This tests calling batch_completions by running 100 messages together
+
+import sys, os, time
+import traceback, asyncio
+import pytest
+sys.path.insert(
+    0, os.path.abspath("../..")
+)  # Adds the parent directory to the system path
+import litellm
+from litellm import Router
+import concurrent
+from dotenv import load_dotenv
+load_dotenv()
+
+model_list = [{ # list of model deployments
+    "model_name": "gpt-3.5-turbo", # openai model name
+    "litellm_params": { # params for litellm completion/embedding call
+        "model": "azure/chatgpt-v-2",
+        "api_key": "bad-key",
+        "api_version": os.getenv("AZURE_API_VERSION"),
+        "api_base": os.getenv("AZURE_API_BASE")
+    },
+    "tpm": 240000,
+    "rpm": 1800
+},
+{
+    "model_name": "gpt-3.5-turbo", # openai model name
+    "litellm_params": { # params for litellm completion/embedding call
+        "model": "gpt-3.5-turbo",
+        "api_key": os.getenv("OPENAI_API_KEY"),
+    },
+    "tpm": 1000000,
+    "rpm": 9000
+}
+]
+
+router = Router(model_list=model_list,
+                redis_host=os.getenv("REDIS_HOST"),
+                redis_password=os.getenv("REDIS_PASSWORD"),
+                redis_port=int(os.getenv("REDIS_PORT")),
+                routing_strategy="simple-shuffle",
+                set_verbose=True,
+                num_retries=1) # type: ignore
+kwargs = {"model": "gpt-3.5-turbo", "messages": [{"role": "user", "content": "Hey, how's it going?"}],}
+
+
+def test_multiple_deployments_sync():
+    import concurrent, time
+    litellm.set_verbose=False
+    results = []
+
+    try:
+        router.flush_cache()
+        for _ in range(3):
+            response = router.completion(**kwargs)
+            results.append(response)
+        print(results)
+        router.flush_cache()
+    except Exception as e:
+        print(f"FAILED TEST!")
+        pytest.fail(f"An error occurred - {str(e)}")
+
+# test_multiple_deployments_sync()
+
+
+def test_multiple_deployments_parallel():
+    litellm.set_verbose = False  # Corrected the syntax for setting verbose to False
+    results = []
+    futures = {}
+    start_time = time.time()
+    router.flush_cache()
+    # Assuming you have an executor instance defined somewhere in your code
+    with concurrent.futures.ThreadPoolExecutor() as executor:
+        for _ in range(5):
+            future = executor.submit(router.completion, **kwargs)
+            futures[future] = future
+
+        # Retrieve the results from the futures
+        while futures:
+            done, not_done = concurrent.futures.wait(futures.values(), timeout=10, return_when=concurrent.futures.FIRST_COMPLETED)
+            for future in done:
+                try:
+                    result = future.result()
+                    results.append(result)
+                    del futures[future] # Remove the done future
+                except Exception as e:
+                    print(f"Exception: {e}; traceback: {traceback.format_exc()}")
+                    del futures[future] # Remove the done future with exception
+
+            print(f"Remaining futures: {len(futures)}")
+
+        end_time = time.time()
+        print(results)
+        print(f"ELAPSED TIME: {end_time - start_time}")
+
+# Assuming litellm, router, and executor are defined somewhere in your code
+
+test_multiple_deployments_parallel()
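For orientation, a minimal sketch of the Router pattern these tests exercise, stripped of the Redis settings and the intentionally bad Azure key used above. It assumes OPENAI_API_KEY is set and that the Redis-backed cache is optional for a single process; none of this is part of the diff itself:

    import os
    from litellm import Router

    # one logical model name backed by a single real deployment
    model_list = [{
        "model_name": "gpt-3.5-turbo",
        "litellm_params": {"model": "gpt-3.5-turbo", "api_key": os.getenv("OPENAI_API_KEY")},
    }]

    router = Router(model_list=model_list, num_retries=1)
    response = router.completion(model="gpt-3.5-turbo",
                                 messages=[{"role": "user", "content": "Hey, how's it going?"}])
    print(response)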
@@ -93,11 +93,11 @@ def test_async_fallbacks():
             response = await router.acompletion(**kwargs)
             # response = await response
             print(f"response: {response}")
+            router.flush_cache()
         except litellm.Timeout as e:
             pass
         except Exception as e:
             pytest.fail(f"An exception occurred: {e}")
-            router.flush_cache()

     asyncio.run(test_get_response())

@@ -110,8 +110,8 @@ def test_sync_context_window_fallbacks():
         kwargs["messages"] = [{"role": "user", "content": sample_text}]
         response = router.completion(**kwargs)
         print(f"response: {response}")
+        router.flush_cache()
     except Exception as e:
         print(e)
-        router.flush_cache()

 # test_sync_context_window_fallbacks()
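Both fallback tests touch router.flush_cache(): the shared Router caches deployment state, so the tests clear it to avoid interference between runs. A pattern sketch (not from the diff, and using the same module-level router as an assumption) that guarantees the flush regardless of whether the call succeeds:

    def run_once(router, **kwargs):
        # flush in finally so a failed call cannot leave stale cached state behind
        try:
            return router.completion(**kwargs)
        finally:
            router.flush_cache()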
@@ -1,3 +1,9 @@
+import sys, os, time
+import traceback, asyncio
+import pytest
+sys.path.insert(
+    0, os.path.abspath("../..")
+)  # Adds the parent directory to the system path
 from litellm import completion, stream_chunk_builder
 import litellm
 import os, dotenv
@@ -24,40 +30,21 @@ function_schema = {
     },
 }

-@pytest.mark.skip
 def test_stream_chunk_builder():
-    litellm.set_verbose = False
-    litellm.api_key = os.environ["OPENAI_API_KEY"]
-    response = completion(
-        model="gpt-3.5-turbo",
-        messages=messages,
-        functions=[function_schema],
-        stream=True,
-    )
-
-    chunks = []
-
-    for chunk in response:
-        # print(chunk)
-        chunks.append(chunk)
-
-    try:
-        print(f"chunks: {chunks}")
-        rebuilt_response = stream_chunk_builder(chunks)
-
-        # exract the response from the rebuilt response
-        rebuilt_response["id"]
-        rebuilt_response["object"]
-        rebuilt_response["created"]
-        rebuilt_response["model"]
-        rebuilt_response["choices"]
-        rebuilt_response["choices"][0]["index"]
-        choices = rebuilt_response["choices"][0]
-        message = choices["message"]
-        role = message["role"]
-        content = message["content"]
-        finish_reason = choices["finish_reason"]
-        print(role, content, finish_reason)
-    except Exception as e:
-        raise Exception("stream_chunk_builder failed to rebuild response", e)
+    try:
+        litellm.set_verbose = False
+        litellm.api_key = os.environ["OPENAI_API_KEY"]
+        response = completion(
+            model="gpt-3.5-turbo",
+            messages=messages,
+            functions=[function_schema],
+            stream=True,
+            complete_response=True # runs stream_chunk_builder under-the-hood
+        )
+
+        print(f"response: {response}")
+        print(f"response usage: {response['usage']}")
+    except Exception as e:
+        pytest.fail(f"An exception occurred - {str(e)}")

 test_stream_chunk_builder()
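Since the rebuilt response now reports usage, a natural follow-up check on the response object from the test above (shown here only as a sketch, not part of the diff) is that the totals are internally consistent:

    usage = response["usage"]
    # total should be the sum of the two counters computed in stream_chunk_builder
    assert usage["total_tokens"] == usage["prompt_tokens"] + usage["completion_tokens"]
    assert usage["completion_tokens"] > 0  # the streamed content was actually tokenized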
@@ -710,7 +710,7 @@ class Logging:
             if self.stream:
                 if result.choices[0].finish_reason is not None: # if it's the last chunk
                     self.streaming_chunks.append(result)
-                    complete_streaming_response = litellm.stream_chunk_builder(self.streaming_chunks)
+                    complete_streaming_response = litellm.stream_chunk_builder(self.streaming_chunks, messages=self.model_call_details.get("messages", None))
                 else:
                     self.streaming_chunks.append(result)
             elif isinstance(result, OpenAIObject):
@@ -1250,7 +1250,7 @@ def client(original_function):
                 chunks = []
                 for idx, chunk in enumerate(result):
                     chunks.append(chunk)
-                return litellm.stream_chunk_builder(chunks)
+                return litellm.stream_chunk_builder(chunks, messages=kwargs.get("messages", None))
             else:
                 return result
         elif "acompletion" in kwargs and kwargs["acompletion"] == True:
@@ -1360,7 +1360,7 @@ def client(original_function):
                 chunks = []
                 for idx, chunk in enumerate(result):
                     chunks.append(chunk)
-                return litellm.stream_chunk_builder(chunks)
+                return litellm.stream_chunk_builder(chunks, messages=kwargs.get("messages", None))
             else:
                 return result

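With the client wrapper changes above, the complete_response=True path forwards the caller's messages into stream_chunk_builder, so the assembled response includes usage with no extra work by the caller. A minimal sketch, assuming an OpenAI key in the environment and a placeholder prompt:

    from litellm import completion

    response = completion(
        model="gpt-3.5-turbo",
        messages=[{"role": "user", "content": "Hey, how's it going?"}],
        stream=True,
        complete_response=True,  # collect the chunks and rebuild one response internally
    )
    print(response["usage"])  # populated because messages are passed through to the builder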
@@ -5012,7 +5012,6 @@ class CustomStreamWrapper:
                         return
                     completion_obj["content"] = response_obj["text"]
                     print_verbose(f"completion obj content: {completion_obj['content']}")
-                    print_verbose(f"len(completion_obj['content']: {len(completion_obj['content'])}")
                     if response_obj["is_finished"]:
                         model_response.choices[0].finish_reason = response_obj["finish_reason"]
