From 343a06fd844f25fc619c73b7b7ba9d0ed3d03136 Mon Sep 17 00:00:00 2001
From: Krrish Dholakia
Date: Mon, 25 Dec 2023 07:17:54 +0530
Subject: [PATCH] fix(proxy_server.py): raise streaming exceptions

---
 litellm/main.py                 |  5 +----
 litellm/proxy/proxy_server.py   | 17 ++++++++++-------
 litellm/tests/test_streaming.py |  7 ++++---
 litellm/utils.py                | 13 ++++++++++---
 4 files changed, 25 insertions(+), 17 deletions(-)

diff --git a/litellm/main.py b/litellm/main.py
index 0c3ab4562..fb3be6233 100644
--- a/litellm/main.py
+++ b/litellm/main.py
@@ -213,10 +213,7 @@ async def _async_streaming(response, model, custom_llm_provider, args):
             print_verbose(f"line in async streaming: {line}")
             yield line
     except Exception as e:
-        print_verbose(f"error raised _async_streaming: {traceback.format_exc()}")
-        raise exception_type(
-                model=model, custom_llm_provider=custom_llm_provider, original_exception=e, completion_kwargs=args,
-            )
+        raise e
 
 def mock_completion(model: str, messages: List, stream: Optional[bool] = False, mock_response: str = "This is a mock request", **kwargs):
     """
diff --git a/litellm/proxy/proxy_server.py b/litellm/proxy/proxy_server.py
index effc8297d..3e4e7f25d 100644
--- a/litellm/proxy/proxy_server.py
+++ b/litellm/proxy/proxy_server.py
@@ -826,13 +826,15 @@ def data_generator(response):
 
 async def async_data_generator(response, user_api_key_dict):
     print_verbose("inside generator")
-    async for chunk in response:
-        print_verbose(f"returned chunk: {chunk}")
-        try:
-            yield f"data: {json.dumps(chunk.dict())}\n\n"
-        except:
-            yield f"data: {json.dumps(chunk)}\n\n"
-
+    try:
+        async for chunk in response:
+            print_verbose(f"returned chunk: {chunk}")
+            try:
+                yield f"data: {json.dumps(chunk.dict())}\n\n"
+            except Exception as e:
+                yield f"data: {str(e)}\n\n"
+    except Exception as e:
+        yield f"data: {str(e)}\n\n"
 def get_litellm_model_info(model: dict = {}):
     model_info = model.get("model_info", {})
     model_to_lookup = model.get("litellm_params", {}).get("model", None)
@@ -971,6 +973,7 @@ async def completion(request: Request, model: Optional[str] = None, user_api_key
         background_tasks.add_task(log_input_output, request, response) # background task for logging to OTEL
         return response
     except Exception as e:
+        print(f"EXCEPTION RAISED IN PROXY MAIN.PY")
         print(f"\033[1;31mAn error occurred: {e}\n\n Debug this by setting `--debug`, e.g. `litellm --model gpt-3.5-turbo --debug`")
         traceback.print_exc()
         error_traceback = traceback.format_exc()
diff --git a/litellm/tests/test_streaming.py b/litellm/tests/test_streaming.py
index 2dfd5778a..e02440e8d 100644
--- a/litellm/tests/test_streaming.py
+++ b/litellm/tests/test_streaming.py
@@ -975,11 +975,12 @@ def test_openai_text_completion_call():
 # test_openai_text_completion_call()
 
 # # test on together ai completion call - starcoder
-def test_together_ai_completion_call_starcoder():
+def test_together_ai_completion_call_mistral():
     try:
+        litellm.set_verbose = False
         start_time = time.time()
         response = completion(
-            model="together_ai/bigcode/starcoder",
+            model="together_ai/mistralai/Mistral-7B-Instruct-v0.2",
             messages=messages,
             logger_fn=logger_fn,
             stream=True,
@@ -1002,7 +1003,7 @@
         print(f"error occurred: {traceback.format_exc()}")
         pass
 
-# test_together_ai_completion_call_starcoder()
+test_together_ai_completion_call_mistral()
 
 def test_together_ai_completion_call_starcoder_bad_key():
     try:
diff --git a/litellm/utils.py b/litellm/utils.py
index 767bc0b07..94fc46039 100644
--- a/litellm/utils.py
+++ b/litellm/utils.py
@@ -5146,6 +5146,14 @@ def exception_type(
                         llm_provider="together_ai",
                         request=original_exception.request
                     )
+                elif original_exception.status_code == 422:
+                    exception_mapping_worked = True
+                    raise BadRequestError(
+                        message=f"TogetherAIException - {error_response['error']}",
+                        model=model,
+                        llm_provider="together_ai",
+                        response=original_exception.response
+                    )
                 elif original_exception.status_code == 429:
                     exception_mapping_worked = True
                     raise RateLimitError(
@@ -5584,7 +5592,7 @@ class CustomStreamWrapper:
             elif "[DONE]" in chunk:
                 return {"text": text, "is_finished": True, "finish_reason": "stop"}
             elif "error" in chunk:
-                raise ValueError(chunk)
+                raise litellm.together_ai.TogetherAIError(status_code=422, message=f"{str(chunk)}")
             else:
                 return {"text": text, "is_finished": is_finished, "finish_reason": finish_reason}
 
@@ -6131,7 +6139,6 @@ class CustomStreamWrapper:
         except StopIteration:
             raise # Re-raise StopIteration
         except Exception as e:
-            print_verbose(f"HITS AN ERROR: {str(e)}\n\n {traceback.format_exc()}")
            traceback_exception = traceback.format_exc()
             # LOG FAILURE - handle streaming failure logging in the _next_ object, remove `handle_failure` once it's deprecated
             threading.Thread(target=self.logging_obj.failure_handler, args=(e, traceback_exception)).start()
@@ -6180,7 +6187,7 @@ class CustomStreamWrapper:
             traceback_exception = traceback.format_exc()
             # Handle any exceptions that might occur during streaming
             asyncio.create_task(self.logging_obj.async_failure_handler(e, traceback_exception))
-            raise StopAsyncIteration
+            raise e
 
 class TextCompletionStreamWrapper:
     def __init__(self, completion_stream, model):
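
Note (not part of the patch): the behavioral change above is that mid-stream exceptions are re-raised by CustomStreamWrapper.__anext__ and forwarded to SSE clients by async_data_generator, instead of being swallowed as StopAsyncIteration. The following is a minimal, self-contained sketch of that forwarding pattern under assumed names; fake_llm_stream and forward_as_sse are hypothetical and are not litellm code.

# Sketch only: illustrates the error-forwarding pattern the patch applies,
# not litellm's actual implementation.
import asyncio
import json

async def fake_llm_stream():
    # Hypothetical upstream stream that fails partway through.
    yield {"choices": [{"delta": {"content": "Hello"}}]}
    raise RuntimeError("TogetherAIException - model is overloaded")

async def forward_as_sse(stream):
    # Wrap the whole async iteration in try/except and emit the error text as
    # a final SSE "data:" line instead of ending the stream silently.
    try:
        async for chunk in stream:
            yield f"data: {json.dumps(chunk)}\n\n"
    except Exception as e:
        yield f"data: {str(e)}\n\n"

async def main():
    async for line in forward_as_sse(fake_llm_stream()):
        print(line, end="")

asyncio.run(main())

Run as a script, this prints one normal data event followed by a data event carrying the error text, which is what a proxy client now observes when a Together AI stream errors; on the non-streaming path, the new 422 mapping in exception_type converts the raised TogetherAIError into a BadRequestError.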