From 09ec6d645851fcc62b2851eb4b421a2a77e89468 Mon Sep 17 00:00:00 2001 From: Krrish Dholakia Date: Thu, 25 Jan 2024 12:49:45 -0800 Subject: [PATCH 01/27] fix(utils.py): fix sagemaker async logging for sync streaming https://github.com/BerriAI/litellm/issues/1592 --- .circleci/config.yml | 3 + litellm/llms/sagemaker.py | 35 +++++--- litellm/main.py | 11 +-- litellm/proxy/proxy_server.py | 3 + litellm/proxy/utils.py | 5 +- litellm/tests/test_custom_callback_input.py | 41 +++++++++ litellm/tests/test_streaming.py | 70 ++++++++------- litellm/utils.py | 94 ++++++++++++++++++--- proxy_server_config.yaml | 4 + tests/test_keys.py | 45 +++++++++- 10 files changed, 247 insertions(+), 64 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 1de72a156..e0e6f5743 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -147,6 +147,9 @@ jobs: -e AZURE_API_KEY=$AZURE_API_KEY \ -e AZURE_FRANCE_API_KEY=$AZURE_FRANCE_API_KEY \ -e AZURE_EUROPE_API_KEY=$AZURE_EUROPE_API_KEY \ + -e AWS_ACCESS_KEY_ID=$AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY=$AWS_SECRET_ACCESS_KEY \ + -e AWS_REGION_NAME=$AWS_REGION_NAME \ --name my-app \ -v $(pwd)/proxy_server_config.yaml:/app/config.yaml \ my-app:latest \ diff --git a/litellm/llms/sagemaker.py b/litellm/llms/sagemaker.py index 1608f7a0f..78aafe7f7 100644 --- a/litellm/llms/sagemaker.py +++ b/litellm/llms/sagemaker.py @@ -34,22 +34,35 @@ class TokenIterator: self.byte_iterator = iter(stream) self.buffer = io.BytesIO() self.read_pos = 0 + self.end_of_data = False def __iter__(self): return self def __next__(self): - while True: - self.buffer.seek(self.read_pos) - line = self.buffer.readline() - if line and line[-1] == ord("\n"): - self.read_pos += len(line) + 1 - full_line = line[:-1].decode("utf-8") - line_data = json.loads(full_line.lstrip("data:").rstrip("/n")) - return line_data["token"]["text"] - chunk = next(self.byte_iterator) - self.buffer.seek(0, io.SEEK_END) - self.buffer.write(chunk["PayloadPart"]["Bytes"]) + try: + while True: + self.buffer.seek(self.read_pos) + line = self.buffer.readline() + if line and line[-1] == ord("\n"): + response_obj = {"text": "", "is_finished": False} + self.read_pos += len(line) + 1 + full_line = line[:-1].decode("utf-8") + line_data = json.loads(full_line.lstrip("data:").rstrip("/n")) + if line_data.get("generated_text", None) is not None: + self.end_of_data = True + response_obj["is_finished"] = True + response_obj["text"] = line_data["token"]["text"] + return response_obj + chunk = next(self.byte_iterator) + self.buffer.seek(0, io.SEEK_END) + self.buffer.write(chunk["PayloadPart"]["Bytes"]) + except StopIteration as e: + if self.end_of_data == True: + raise e # Re-raise StopIteration + else: + self.end_of_data = True + return "data: [DONE]" class SagemakerConfig: diff --git a/litellm/main.py b/litellm/main.py index 6b9a0bb18..fca3bd2b2 100644 --- a/litellm/main.py +++ b/litellm/main.py @@ -1514,11 +1514,6 @@ def completion( if ( "stream" in optional_params and optional_params["stream"] == True ): ## [BETA] - # sagemaker does not support streaming as of now so we're faking streaming: - # https://discuss.huggingface.co/t/streaming-output-text-when-deploying-on-sagemaker/39611 - # "SageMaker is currently not supporting streaming responses." 
- - # fake streaming for sagemaker print_verbose(f"ENTERS SAGEMAKER CUSTOMSTREAMWRAPPER") from .llms.sagemaker import TokenIterator @@ -1529,6 +1524,12 @@ def completion( custom_llm_provider="sagemaker", logging_obj=logging, ) + ## LOGGING + logging.post_call( + input=messages, + api_key=None, + original_response=response, + ) return response ## RESPONSE OBJECT diff --git a/litellm/proxy/proxy_server.py b/litellm/proxy/proxy_server.py index b53484b86..493ad9731 100644 --- a/litellm/proxy/proxy_server.py +++ b/litellm/proxy/proxy_server.py @@ -690,6 +690,9 @@ async def update_database( existing_spend_obj = await custom_db_client.get_data( key=id, table_name="user" ) + verbose_proxy_logger.debug( + f"Updating existing_spend_obj: {existing_spend_obj}" + ) if existing_spend_obj is None: existing_spend = 0 existing_spend_obj = LiteLLM_UserTable( diff --git a/litellm/proxy/utils.py b/litellm/proxy/utils.py index faa73d70b..728716886 100644 --- a/litellm/proxy/utils.py +++ b/litellm/proxy/utils.py @@ -409,7 +409,9 @@ class PrismaClient: hashed_token = token if token.startswith("sk-"): hashed_token = self.hash_token(token=token) - print_verbose("PrismaClient: find_unique") + verbose_proxy_logger.debug( + f"PrismaClient: find_unique for token: {hashed_token}" + ) if query_type == "find_unique": response = await self.db.litellm_verificationtoken.find_unique( where={"token": hashed_token} @@ -716,7 +718,6 @@ class PrismaClient: Batch write update queries """ batcher = self.db.batch_() - verbose_proxy_logger.debug(f"data list for user table: {data_list}") for idx, user in enumerate(data_list): try: data_json = self.jsonify_object(data=user.model_dump()) diff --git a/litellm/tests/test_custom_callback_input.py b/litellm/tests/test_custom_callback_input.py index 556628d82..a61cc843e 100644 --- a/litellm/tests/test_custom_callback_input.py +++ b/litellm/tests/test_custom_callback_input.py @@ -556,6 +556,47 @@ async def test_async_chat_bedrock_stream(): # asyncio.run(test_async_chat_bedrock_stream()) + +## Test Sagemaker + Async +@pytest.mark.asyncio +async def test_async_chat_sagemaker_stream(): + try: + customHandler = CompletionCustomHandler() + litellm.callbacks = [customHandler] + response = await litellm.acompletion( + model="sagemaker/berri-benchmarking-Llama-2-70b-chat-hf-4", + messages=[{"role": "user", "content": "Hi 👋 - i'm async sagemaker"}], + ) + # test streaming + response = await litellm.acompletion( + model="sagemaker/berri-benchmarking-Llama-2-70b-chat-hf-4", + messages=[{"role": "user", "content": "Hi 👋 - i'm async sagemaker"}], + stream=True, + ) + print(f"response: {response}") + async for chunk in response: + print(f"chunk: {chunk}") + continue + ## test failure callback + try: + response = await litellm.acompletion( + model="sagemaker/berri-benchmarking-Llama-2-70b-chat-hf-4", + messages=[{"role": "user", "content": "Hi 👋 - i'm async sagemaker"}], + aws_region_name="my-bad-key", + stream=True, + ) + async for chunk in response: + continue + except: + pass + time.sleep(1) + print(f"customHandler.errors: {customHandler.errors}") + assert len(customHandler.errors) == 0 + litellm.callbacks = [] + except Exception as e: + pytest.fail(f"An exception occurred: {str(e)}") + + # Text Completion diff --git a/litellm/tests/test_streaming.py b/litellm/tests/test_streaming.py index 14b1a7210..d9f99bece 100644 --- a/litellm/tests/test_streaming.py +++ b/litellm/tests/test_streaming.py @@ -872,41 +872,53 @@ async def test_sagemaker_streaming_async(): ) # Add any assertions here to check the 
response + print(response) complete_response = "" + has_finish_reason = False + # Add any assertions here to check the response + idx = 0 async for chunk in response: - complete_response += chunk.choices[0].delta.content or "" - print(f"complete_response: {complete_response}") - assert len(complete_response) > 0 + # print + chunk, finished = streaming_format_tests(idx, chunk) + has_finish_reason = finished + complete_response += chunk + if finished: + break + idx += 1 + if has_finish_reason is False: + raise Exception("finish reason not set for last chunk") + if complete_response.strip() == "": + raise Exception("Empty response received") + print(f"completion_response: {complete_response}") except Exception as e: pytest.fail(f"An exception occurred - {str(e)}") -# def test_completion_sagemaker_stream(): -# try: -# response = completion( -# model="sagemaker/jumpstart-dft-meta-textgeneration-llama-2-7b", -# messages=messages, -# temperature=0.2, -# max_tokens=80, -# stream=True, -# ) -# complete_response = "" -# has_finish_reason = False -# # Add any assertions here to check the response -# for idx, chunk in enumerate(response): -# chunk, finished = streaming_format_tests(idx, chunk) -# has_finish_reason = finished -# if finished: -# break -# complete_response += chunk -# if has_finish_reason is False: -# raise Exception("finish reason not set for last chunk") -# if complete_response.strip() == "": -# raise Exception("Empty response received") -# except InvalidRequestError as e: -# pass -# except Exception as e: -# pytest.fail(f"Error occurred: {e}") +def test_completion_sagemaker_stream(): + try: + response = completion( + model="sagemaker/berri-benchmarking-Llama-2-70b-chat-hf-4", + messages=messages, + temperature=0.2, + max_tokens=80, + stream=True, + ) + complete_response = "" + has_finish_reason = False + # Add any assertions here to check the response + for idx, chunk in enumerate(response): + chunk, finished = streaming_format_tests(idx, chunk) + has_finish_reason = finished + if finished: + break + complete_response += chunk + if has_finish_reason is False: + raise Exception("finish reason not set for last chunk") + if complete_response.strip() == "": + raise Exception("Empty response received") + except Exception as e: + pytest.fail(f"Error occurred: {e}") + # test_completion_sagemaker_stream() diff --git a/litellm/utils.py b/litellm/utils.py index 0e12463b9..fb3210b1d 100644 --- a/litellm/utils.py +++ b/litellm/utils.py @@ -1417,7 +1417,9 @@ class Logging: """ Implementing async callbacks, to handle asyncio event loop issues when custom integrations need to use async functions. 
""" - print_verbose(f"Async success callbacks: {litellm._async_success_callback}") + verbose_logger.debug( + f"Async success callbacks: {litellm._async_success_callback}" + ) start_time, end_time, result = self._success_handler_helper_fn( start_time=start_time, end_time=end_time, result=result, cache_hit=cache_hit ) @@ -1426,7 +1428,7 @@ class Logging: if self.stream: if result.choices[0].finish_reason is not None: # if it's the last chunk self.streaming_chunks.append(result) - # print_verbose(f"final set of received chunks: {self.streaming_chunks}") + # verbose_logger.debug(f"final set of received chunks: {self.streaming_chunks}") try: complete_streaming_response = litellm.stream_chunk_builder( self.streaming_chunks, @@ -1435,14 +1437,16 @@ class Logging: end_time=end_time, ) except Exception as e: - print_verbose( + verbose_logger.debug( f"Error occurred building stream chunk: {traceback.format_exc()}" ) complete_streaming_response = None else: self.streaming_chunks.append(result) if complete_streaming_response is not None: - print_verbose("Async success callbacks: Got a complete streaming response") + verbose_logger.debug( + "Async success callbacks: Got a complete streaming response" + ) self.model_call_details[ "complete_streaming_response" ] = complete_streaming_response @@ -7682,6 +7686,27 @@ class CustomStreamWrapper: } return "" + def handle_sagemaker_stream(self, chunk): + if "data: [DONE]" in chunk: + text = "" + is_finished = True + finish_reason = "stop" + return { + "text": text, + "is_finished": is_finished, + "finish_reason": finish_reason, + } + elif isinstance(chunk, dict): + if chunk["is_finished"] == True: + finish_reason = "stop" + else: + finish_reason = "" + return { + "text": chunk["text"], + "is_finished": chunk["is_finished"], + "finish_reason": finish_reason, + } + def chunk_creator(self, chunk): model_response = ModelResponse(stream=True, model=self.model) if self.response_id is not None: @@ -7807,8 +7832,14 @@ class CustomStreamWrapper: ] self.sent_last_chunk = True elif self.custom_llm_provider == "sagemaker": - print_verbose(f"ENTERS SAGEMAKER STREAMING for chunk {chunk}") - completion_obj["content"] = chunk + verbose_logger.debug(f"ENTERS SAGEMAKER STREAMING for chunk {chunk}") + response_obj = self.handle_sagemaker_stream(chunk) + completion_obj["content"] = response_obj["text"] + if response_obj["is_finished"]: + model_response.choices[0].finish_reason = response_obj[ + "finish_reason" + ] + self.sent_last_chunk = True elif self.custom_llm_provider == "petals": if len(self.completion_stream) == 0: if self.sent_last_chunk: @@ -7984,6 +8015,19 @@ class CustomStreamWrapper: original_exception=e, ) + def run_success_logging_in_thread(self, processed_chunk): + # Create an event loop for the new thread + ## ASYNC LOGGING + # Run the asynchronous function in the new thread's event loop + asyncio.run( + self.logging_obj.async_success_handler( + processed_chunk, + ) + ) + + ## SYNC LOGGING + self.logging_obj.success_handler(processed_chunk) + ## needs to handle the empty string case (even starting chunk can be an empty string) def __next__(self): try: @@ -8002,8 +8046,9 @@ class CustomStreamWrapper: continue ## LOGGING threading.Thread( - target=self.logging_obj.success_handler, args=(response,) + target=self.run_success_logging_in_thread, args=(response,) ).start() # log response + # RETURN RESULT return response except StopIteration: @@ -8059,13 +8104,34 @@ class CustomStreamWrapper: raise StopAsyncIteration else: # temporary patch for non-aiohttp async calls # 
example - boto3 bedrock llms - processed_chunk = next(self) - asyncio.create_task( - self.logging_obj.async_success_handler( - processed_chunk, - ) - ) - return processed_chunk + while True: + if isinstance(self.completion_stream, str) or isinstance( + self.completion_stream, bytes + ): + chunk = self.completion_stream + else: + chunk = next(self.completion_stream) + if chunk is not None and chunk != b"": + print_verbose(f"PROCESSED CHUNK PRE CHUNK CREATOR: {chunk}") + processed_chunk = self.chunk_creator(chunk=chunk) + print_verbose( + f"PROCESSED CHUNK POST CHUNK CREATOR: {processed_chunk}" + ) + if processed_chunk is None: + continue + ## LOGGING + threading.Thread( + target=self.logging_obj.success_handler, + args=(processed_chunk,), + ).start() # log processed_chunk + asyncio.create_task( + self.logging_obj.async_success_handler( + processed_chunk, + ) + ) + + # RETURN RESULT + return processed_chunk except StopAsyncIteration: raise except StopIteration: diff --git a/proxy_server_config.yaml b/proxy_server_config.yaml index dfa8e1151..2c123d156 100644 --- a/proxy_server_config.yaml +++ b/proxy_server_config.yaml @@ -11,6 +11,10 @@ model_list: api_base: https://openai-gpt-4-test-v-1.openai.azure.com/ api_version: "2023-05-15" api_key: os.environ/AZURE_API_KEY # The `os.environ/` prefix tells litellm to read this from the env. See https://docs.litellm.ai/docs/simple_proxy#load-api-keys-from-vault + - model_name: sagemaker-completion-model + litellm_params: + model: sagemaker/berri-benchmarking-Llama-2-70b-chat-hf-4 + input_cost_per_second: 0.000420 - model_name: gpt-4 litellm_params: model: azure/gpt-turbo diff --git a/tests/test_keys.py b/tests/test_keys.py index f05204c03..cb06e1f7e 100644 --- a/tests/test_keys.py +++ b/tests/test_keys.py @@ -13,17 +13,21 @@ sys.path.insert( import litellm -async def generate_key(session, i, budget=None, budget_duration=None): +async def generate_key( + session, i, budget=None, budget_duration=None, models=["azure-models", "gpt-4"] +): url = "http://0.0.0.0:4000/key/generate" headers = {"Authorization": "Bearer sk-1234", "Content-Type": "application/json"} data = { - "models": ["azure-models", "gpt-4"], + "models": models, "aliases": {"mistral-7b": "gpt-3.5-turbo"}, "duration": None, "max_budget": budget, "budget_duration": budget_duration, } + print(f"data: {data}") + async with session.post(url, headers=headers, json=data) as response: status = response.status response_text = await response.text() @@ -293,7 +297,7 @@ async def test_key_info_spend_values(): rounded_response_cost = round(response_cost, 8) rounded_key_info_spend = round(key_info["info"]["spend"], 8) assert rounded_response_cost == rounded_key_info_spend - ## streaming + ## streaming - azure key_gen = await generate_key(session=session, i=0) new_key = key_gen["key"] prompt_tokens, completion_tokens = await chat_completion_streaming( @@ -318,6 +322,41 @@ async def test_key_info_spend_values(): assert rounded_response_cost == rounded_key_info_spend +@pytest.mark.asyncio +async def test_key_info_spend_values_sagemaker(): + """ + Tests the sync streaming loop to ensure spend is correctly calculated. 
+ - create key + - make completion call + - assert cost is expected value + """ + async with aiohttp.ClientSession() as session: + ## streaming - sagemaker + key_gen = await generate_key(session=session, i=0, models=[]) + new_key = key_gen["key"] + prompt_tokens, completion_tokens = await chat_completion_streaming( + session=session, key=new_key, model="sagemaker-completion-model" + ) + # print(f"prompt_tokens: {prompt_tokens}, completion_tokens: {completion_tokens}") + # prompt_cost, completion_cost = litellm.cost_per_token( + # model="azure/gpt-35-turbo", + # prompt_tokens=prompt_tokens, + # completion_tokens=completion_tokens, + # ) + # response_cost = prompt_cost + completion_cost + await asyncio.sleep(5) # allow db log to be updated + key_info = await get_key_info( + session=session, get_key=new_key, call_key=new_key + ) + # print( + # f"response_cost: {response_cost}; key_info spend: {key_info['info']['spend']}" + # ) + # rounded_response_cost = round(response_cost, 8) + rounded_key_info_spend = round(key_info["info"]["spend"], 8) + assert rounded_key_info_spend > 0 + # assert rounded_response_cost == rounded_key_info_spend + + @pytest.mark.asyncio async def test_key_with_budgets(): """ From bbe6a92eb999bdfd97149286d59d1c37f72178e8 Mon Sep 17 00:00:00 2001 From: Krrish Dholakia Date: Thu, 25 Jan 2024 14:51:08 -0800 Subject: [PATCH 02/27] fix(main.py): fix order of assembly for streaming chunks --- litellm/main.py | 6 ++++++ litellm/tests/test_custom_logger.py | 2 +- litellm/utils.py | 1 + 3 files changed, 8 insertions(+), 1 deletion(-) diff --git a/litellm/main.py b/litellm/main.py index fca3bd2b2..6b4035473 100644 --- a/litellm/main.py +++ b/litellm/main.py @@ -3343,6 +3343,12 @@ def stream_chunk_builder( chunks: list, messages: Optional[list] = None, start_time=None, end_time=None ): model_response = litellm.ModelResponse() + ### SORT CHUNKS BASED ON CREATED ORDER ## + if chunks[0]._hidden_params.get("created_at", None): + # Sort chunks based on created_at in ascending order + chunks = sorted( + chunks, key=lambda x: x._hidden_params.get("created_at", float("inf")) + ) # set hidden params from chunk to model_response if model_response is not None and hasattr(model_response, "_hidden_params"): model_response._hidden_params = chunks[0].get("_hidden_params", {}) diff --git a/litellm/tests/test_custom_logger.py b/litellm/tests/test_custom_logger.py index 565df5b25..e403c3afe 100644 --- a/litellm/tests/test_custom_logger.py +++ b/litellm/tests/test_custom_logger.py @@ -211,7 +211,7 @@ def test_azure_completion_stream(): {"role": "system", "content": "You are a helpful assistant."}, { "role": "user", - "content": "write 1 sentence about litellm being amazing", + "content": f"write 1 sentence about litellm being amazing {time.time()}", }, ] complete_streaming_response = "" diff --git a/litellm/utils.py b/litellm/utils.py index fb3210b1d..02ac83d06 100644 --- a/litellm/utils.py +++ b/litellm/utils.py @@ -7714,6 +7714,7 @@ class CustomStreamWrapper: else: self.response_id = model_response.id model_response._hidden_params["custom_llm_provider"] = self.custom_llm_provider + model_response._hidden_params["created_at"] = time.time() model_response.choices = [StreamingChoices()] model_response.choices[0].finish_reason = None response_obj = {} From 72275ad8cb1a8fe7558726cacda6289a76cdb559 Mon Sep 17 00:00:00 2001 From: Krrish Dholakia Date: Thu, 25 Jan 2024 15:59:53 -0800 Subject: [PATCH 03/27] fix(main.py): fix logging event loop for async logging but sync streaming --- litellm/main.py | 12 
++++-------- litellm/utils.py | 21 +++++++++++++++------ 2 files changed, 19 insertions(+), 14 deletions(-) diff --git a/litellm/main.py b/litellm/main.py index 6b4035473..89750ef46 100644 --- a/litellm/main.py +++ b/litellm/main.py @@ -274,14 +274,10 @@ async def acompletion( else: # Call the synchronous function using run_in_executor response = await loop.run_in_executor(None, func_with_context) # type: ignore - # if kwargs.get("stream", False): # return an async generator - # return _async_streaming( - # response=response, - # model=model, - # custom_llm_provider=custom_llm_provider, - # args=args, - # ) - # else: + if isinstance(response, CustomStreamWrapper): + response.set_logging_event_loop( + loop=loop + ) # sets the logging event loop if the user does sync streaming (e.g. on proxy for sagemaker calls) return response except Exception as e: custom_llm_provider = custom_llm_provider or "openai" diff --git a/litellm/utils.py b/litellm/utils.py index 02ac83d06..2bc1d34e9 100644 --- a/litellm/utils.py +++ b/litellm/utils.py @@ -7116,6 +7116,7 @@ class CustomStreamWrapper: "model_id": (_model_info.get("id", None)) } # returned as x-litellm-model-id response header in proxy self.response_id = None + self.logging_loop = None def __iter__(self): return self @@ -8016,16 +8017,24 @@ class CustomStreamWrapper: original_exception=e, ) + def set_logging_event_loop(self, loop): + self.logging_loop = loop + + async def your_async_function(self): + # Your asynchronous code here + return "Your asynchronous code is running" + def run_success_logging_in_thread(self, processed_chunk): # Create an event loop for the new thread ## ASYNC LOGGING - # Run the asynchronous function in the new thread's event loop - asyncio.run( - self.logging_obj.async_success_handler( - processed_chunk, + if self.logging_loop is not None: + future = asyncio.run_coroutine_threadsafe( + self.logging_obj.async_success_handler(processed_chunk), + loop=self.logging_loop, ) - ) - + result = future.result() + else: + asyncio.run(self.logging_obj.async_success_handler(processed_chunk)) ## SYNC LOGGING self.logging_obj.success_handler(processed_chunk) From 565531fe9e54ec8d0cd9962fb44d82c0ec673d44 Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Thu, 25 Jan 2024 09:58:43 -0800 Subject: [PATCH 04/27] v0 basic structure --- litellm/proxy/proxy_server.py | 26 ++++++++++++++++++++++++++ litellm/proxy/utils.py | 10 ++++++++++ 2 files changed, 36 insertions(+) diff --git a/litellm/proxy/proxy_server.py b/litellm/proxy/proxy_server.py index 493ad9731..ca8ad027d 100644 --- a/litellm/proxy/proxy_server.py +++ b/litellm/proxy/proxy_server.py @@ -407,6 +407,14 @@ async def user_api_key_auth( user_max_budget is not None and user_current_spend is not None ): + asyncio.create_task( + proxy_logging_obj.budget_alerts( + user_max_budget=user_max_budget, + user_current_spend=user_current_spend, + type="user_and_proxy_budget", + user_info=_user, + ) + ) if user_current_spend > user_max_budget: raise Exception( f"ExceededBudget: User {valid_token.user_id} has exceeded their budget. 
Current spend: {user_current_spend}; Max Budget: {user_max_budget}" @@ -422,6 +430,15 @@ async def user_api_key_auth( user_max_budget is not None and user_current_spend is not None ): + asyncio.create_task( + proxy_logging_obj.budget_alerts( + user_max_budget=user_max_budget, + user_current_spend=user_current_spend, + type="user_budget", + user_info=user_id_information, + ) + ) + if user_current_spend > user_max_budget: raise Exception( f"ExceededBudget: User {valid_token.user_id} has exceeded their budget. Current spend: {user_current_spend}; Max Budget: {user_max_budget}" @@ -448,6 +465,15 @@ async def user_api_key_auth( # Check 4. Token Spend is under budget if valid_token.spend is not None and valid_token.max_budget is not None: + asyncio.create_task( + proxy_logging_obj.budget_alerts( + user_max_budget=valid_token.max_budget, + user_current_spend=valid_token.spend, + type="token_budget", + user_info=valid_token, + ) + ) + if valid_token.spend > valid_token.max_budget: raise Exception( f"ExceededTokenBudget: Current spend for token: {valid_token.spend}; Max Budget for Token: {valid_token.max_budget}" diff --git a/litellm/proxy/utils.py b/litellm/proxy/utils.py index 728716886..9f7dd1c87 100644 --- a/litellm/proxy/utils.py +++ b/litellm/proxy/utils.py @@ -181,6 +181,14 @@ class ProxyLogging: level="Low", ) + async def budget_alerts( + self, + type: Literal["token_budget", "user_budget", "user_and_proxy_budget"], + user_max_budget: float, + user_current_spend: float, + ): + pass + async def alerting_handler( self, message: str, level: Literal["Low", "Medium", "High"] ): @@ -191,6 +199,8 @@ class ProxyLogging: - Requests are hanging - Calls are failing - DB Read/Writes are failing + - Proxy Close to max budget + - Key Close to max budget Parameters: level: str - Low|Medium|High - if calls might fail (Medium) or are failing (High); Currently, no alerts would be 'Low'. 
From 55115a75b02f79b978976fdf26084d8844b4bf1c Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Thu, 25 Jan 2024 10:01:32 -0800 Subject: [PATCH 05/27] (feat) alerts proxy budgets --- litellm/proxy/utils.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/litellm/proxy/utils.py b/litellm/proxy/utils.py index 9f7dd1c87..ab14411bb 100644 --- a/litellm/proxy/utils.py +++ b/litellm/proxy/utils.py @@ -186,7 +186,23 @@ class ProxyLogging: type: Literal["token_budget", "user_budget", "user_and_proxy_budget"], user_max_budget: float, user_current_spend: float, + user_info=None, ): + # percent of max_budget left to spend + percent_left = (user_max_budget - user_current_spend) / user_max_budget + + # check if 15% of max budget is left + if percent_left <= 0.15: + pass + + # check if 5% of max budget is left + if percent_left <= 0.05: + pass + + # check if crossed budget + if user_current_spend >= user_max_budget: + pass + pass async def alerting_handler( From 7a2a7e047f6c8fdc72d006814e42f0654775d71e Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Thu, 25 Jan 2024 11:18:06 -0800 Subject: [PATCH 06/27] (feat) slack alerting budgets --- litellm/proxy/utils.py | 45 +++++++++++++++++++++++++++++++++--------- 1 file changed, 36 insertions(+), 9 deletions(-) diff --git a/litellm/proxy/utils.py b/litellm/proxy/utils.py index ab14411bb..f176687f9 100644 --- a/litellm/proxy/utils.py +++ b/litellm/proxy/utils.py @@ -188,20 +188,47 @@ class ProxyLogging: user_current_spend: float, user_info=None, ): + if type == "user_and_proxy_budget": + user_info = dict(user_info) + user_id = user_info["user_id"] + max_budget = user_info["max_budget"] + spend = user_info["spend"] + user_email = user_info["user_email"] + user_info = f"""\nUser ID: {user_id}\nMax Budget: {max_budget}\nSpend: {spend}\nUser Email: {user_email}""" + else: + user_info = str(user_info) # percent of max_budget left to spend percent_left = (user_max_budget - user_current_spend) / user_max_budget - - # check if 15% of max budget is left - if percent_left <= 0.15: - pass - - # check if 5% of max budget is left - if percent_left <= 0.05: - pass + verbose_proxy_logger.debug( + f"Bduget Alerts: Percent left: {percent_left} for {user_info}" + ) # check if crossed budget if user_current_spend >= user_max_budget: - pass + message = "Budget Crossed for" + user_info + await self.alerting_handler( + message=message, + level="High", + ) + return + + # check if 5% of max budget is left + if percent_left <= 0.05: + message = "5 Percent budget left for" + user_info + await self.alerting_handler( + message=message, + level="Medium", + ) + return + + # check if 15% of max budget is left + if percent_left <= 0.15: + message = "15 Percent budget left for" + user_info + await self.alerting_handler( + message=message, + level="Low", + ) + return pass From 56f49a87bafe0d9bbc1c27147879679501e88612 Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Thu, 25 Jan 2024 11:32:05 -0800 Subject: [PATCH 07/27] (fix) raise exception budget_duration is set and max_budget is Not --- litellm/proxy/proxy_server.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/litellm/proxy/proxy_server.py b/litellm/proxy/proxy_server.py index ca8ad027d..c2c15d7e9 100644 --- a/litellm/proxy/proxy_server.py +++ b/litellm/proxy/proxy_server.py @@ -1654,11 +1654,12 @@ async def startup_event(): user_id="default_user_id", ) - if ( - prisma_client is not None - and litellm.max_budget > 0 - and litellm.budget_duration is not None - ): + if prisma_client is not 
None and litellm.max_budget > 0: + if litellm.budget_duration is None: + raise Exception( + "budget_duration not set on Proxy. budget_duration is required to use max_budget." + ) + # add proxy budget to db in the user table await generate_key_helper_fn( user_id=litellm_proxy_budget_name, From 81c528f6ce678dfb59f7f56e64211ae20c9d9b21 Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Thu, 25 Jan 2024 11:39:57 -0800 Subject: [PATCH 08/27] (fix) raise correct error when proxy crossed budget --- litellm/proxy/proxy_server.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/litellm/proxy/proxy_server.py b/litellm/proxy/proxy_server.py index c2c15d7e9..43e2ec5f5 100644 --- a/litellm/proxy/proxy_server.py +++ b/litellm/proxy/proxy_server.py @@ -415,9 +415,11 @@ async def user_api_key_auth( user_info=_user, ) ) + + _user_id = _user.get("user_id", None) if user_current_spend > user_max_budget: raise Exception( - f"ExceededBudget: User {valid_token.user_id} has exceeded their budget. Current spend: {user_current_spend}; Max Budget: {user_max_budget}" + f"ExceededBudget: User {_user_id} has exceeded their budget. Current spend: {user_current_spend}; Max Budget: {user_max_budget}" ) else: # Token exists, not expired now check if its in budget for the user From 229e4920dfcdedf7d4784fb35693ff6dffc23c09 Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Thu, 25 Jan 2024 11:40:20 -0800 Subject: [PATCH 09/27] (fix) better alert message on budgets --- litellm/proxy/utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/litellm/proxy/utils.py b/litellm/proxy/utils.py index f176687f9..9bef04034 100644 --- a/litellm/proxy/utils.py +++ b/litellm/proxy/utils.py @@ -214,7 +214,7 @@ class ProxyLogging: # check if 5% of max budget is left if percent_left <= 0.05: - message = "5 Percent budget left for" + user_info + message = "5% budget left for" + user_info await self.alerting_handler( message=message, level="Medium", @@ -223,7 +223,7 @@ class ProxyLogging: # check if 15% of max budget is left if percent_left <= 0.15: - message = "15 Percent budget left for" + user_info + message = "15% budget left for" + user_info await self.alerting_handler( message=message, level="Low", From 3a1c8f453f061b9af0d7aefe0486f59537bff05b Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Thu, 25 Jan 2024 11:40:56 -0800 Subject: [PATCH 10/27] (docs) track max_budget on proxy config.yaml --- litellm/proxy/proxy_config.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/litellm/proxy/proxy_config.yaml b/litellm/proxy/proxy_config.yaml index b06faac32..65aa21d04 100644 --- a/litellm/proxy/proxy_config.yaml +++ b/litellm/proxy/proxy_config.yaml @@ -61,6 +61,8 @@ model_list: litellm_settings: fallbacks: [{"openai-gpt-3.5": ["azure-gpt-3.5"]}] success_callback: ['langfuse'] + max_budget: 0.025 + budget_duration: 30d # cache: True # setting callback class # callbacks: custom_callbacks.proxy_handler_instance # sets litellm.callbacks = [proxy_handler_instance] From b8f917624f47c8d65daa7bd31e36cc36ffc7d2e0 Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Thu, 25 Jan 2024 11:41:35 -0800 Subject: [PATCH 11/27] (docs) config.yaml --- litellm/proxy/proxy_config.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/litellm/proxy/proxy_config.yaml b/litellm/proxy/proxy_config.yaml index 65aa21d04..7cb2714f4 100644 --- a/litellm/proxy/proxy_config.yaml +++ b/litellm/proxy/proxy_config.yaml @@ -61,8 +61,8 @@ model_list: litellm_settings: fallbacks: [{"openai-gpt-3.5": ["azure-gpt-3.5"]}] 
success_callback: ['langfuse'] - max_budget: 0.025 - budget_duration: 30d + max_budget: 0.025 # global budget for proxy + budget_duration: 30d # global budget duration, will reset after 30d # cache: True # setting callback class # callbacks: custom_callbacks.proxy_handler_instance # sets litellm.callbacks = [proxy_handler_instance] From 1c9b02ad99e62044208b0617c659644a3b6be3fd Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Thu, 25 Jan 2024 11:56:52 -0800 Subject: [PATCH 12/27] (fix) alerting debug statements --- litellm/proxy/utils.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/litellm/proxy/utils.py b/litellm/proxy/utils.py index 9bef04034..4c6030e5b 100644 --- a/litellm/proxy/utils.py +++ b/litellm/proxy/utils.py @@ -200,11 +200,12 @@ class ProxyLogging: # percent of max_budget left to spend percent_left = (user_max_budget - user_current_spend) / user_max_budget verbose_proxy_logger.debug( - f"Bduget Alerts: Percent left: {percent_left} for {user_info}" + f"Budget Alerts: Percent left: {percent_left} for {user_info}" ) # check if crossed budget if user_current_spend >= user_max_budget: + verbose_proxy_logger.debug(f"Budget Crossed for {user_info}") message = "Budget Crossed for" + user_info await self.alerting_handler( message=message, @@ -230,7 +231,7 @@ class ProxyLogging: ) return - pass + return async def alerting_handler( self, message: str, level: Literal["Low", "Medium", "High"] From 5264a3eb53eb441e5f56d9b4b388b655021d8d33 Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Thu, 25 Jan 2024 11:58:55 -0800 Subject: [PATCH 13/27] (fix) do nothing if alerting is not switched on --- litellm/proxy/utils.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/litellm/proxy/utils.py b/litellm/proxy/utils.py index 4c6030e5b..12605cf40 100644 --- a/litellm/proxy/utils.py +++ b/litellm/proxy/utils.py @@ -188,6 +188,10 @@ class ProxyLogging: user_current_spend: float, user_info=None, ): + if self.alerting is None: + # do nothing if alerting is not switched on + return + if type == "user_and_proxy_budget": user_info = dict(user_info) user_id = user_info["user_id"] From 7e1b9158fe3aec3f2379fbf9e9a01c385f35ae59 Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Thu, 25 Jan 2024 14:30:49 -0800 Subject: [PATCH 14/27] (test) embedding models --- litellm/tests/test_embedding.py | 39 +++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) diff --git a/litellm/tests/test_embedding.py b/litellm/tests/test_embedding.py index 630b41d72..18a6447e1 100644 --- a/litellm/tests/test_embedding.py +++ b/litellm/tests/test_embedding.py @@ -57,6 +57,45 @@ def test_openai_embedding(): # test_openai_embedding() +def test_openai_embedding_3(): + try: + litellm.set_verbose = True + response = embedding( + model="text-embedding-3-small", + input=["good morning from litellm", "this is another item"], + metadata={"anything": "good day"}, + ) + litellm_response = dict(response) + litellm_response_keys = set(litellm_response.keys()) + litellm_response_keys.discard("_response_ms") + + print(litellm_response_keys) + print("LiteLLM Response\n") + # print(litellm_response) + + # same request with OpenAI 1.0+ + import openai + + client = openai.OpenAI(api_key=os.environ["OPENAI_API_KEY"]) + response = client.embeddings.create( + model="text-embedding-3-small", + input=["good morning from litellm", "this is another item"], + ) + + response = dict(response) + openai_response_keys = set(response.keys()) + print(openai_response_keys) + assert ( + litellm_response_keys == openai_response_keys + 
) # ENSURE the Keys in litellm response is exactly what the openai package returns + assert ( + len(litellm_response["data"]) == 2 + ) # expect two embedding responses from litellm_response since input had two + print(openai_response_keys) + except Exception as e: + pytest.fail(f"Error occurred: {e}") + + def test_openai_azure_embedding_simple(): try: litellm.set_verbose = True From 4b15ae41f43ed53368b725991f21c76a8cf43588 Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Thu, 25 Jan 2024 14:34:15 -0800 Subject: [PATCH 15/27] (feat) add new OpenAI text-embedding-3 --- model_prices_and_context_window.json | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/model_prices_and_context_window.json b/model_prices_and_context_window.json index 7e5f66990..3fe186908 100644 --- a/model_prices_and_context_window.json +++ b/model_prices_and_context_window.json @@ -143,6 +143,20 @@ "litellm_provider": "openai", "mode": "chat" }, + "text-embedding-3-large": { + "max_tokens": 8191, + "input_cost_per_token": 0.00000013, + "output_cost_per_token": 0.000000, + "litellm_provider": "openai", + "mode": "embedding" + }, + "text-embedding-3-small": { + "max_tokens": 8191, + "input_cost_per_token": 0.00000002, + "output_cost_per_token": 0.000000, + "litellm_provider": "openai", + "mode": "embedding" + }, "text-embedding-ada-002": { "max_tokens": 8191, "input_cost_per_token": 0.0000001, From c10bda3d30c5b2c1ec9dbdccfe2b01dffddacf2b Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Thu, 25 Jan 2024 14:36:11 -0800 Subject: [PATCH 16/27] (chore) cleanup testing file --- litellm/tests/test_embedding.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/litellm/tests/test_embedding.py b/litellm/tests/test_embedding.py index 18a6447e1..42ac6f7f9 100644 --- a/litellm/tests/test_embedding.py +++ b/litellm/tests/test_embedding.py @@ -225,7 +225,7 @@ def test_cohere_embedding3(): pytest.fail(f"Error occurred: {e}") -test_cohere_embedding3() +# test_cohere_embedding3() def test_bedrock_embedding_titan(): From 53961d641e96bfd0fb82bddd23f48cab5bc0f9c8 Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Thu, 25 Jan 2024 14:38:03 -0800 Subject: [PATCH 17/27] (docs) new OpenAI embedding models --- docs/my-website/docs/embedding/supported_embedding.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docs/my-website/docs/embedding/supported_embedding.md b/docs/my-website/docs/embedding/supported_embedding.md index 462cc1e70..735aa01c8 100644 --- a/docs/my-website/docs/embedding/supported_embedding.md +++ b/docs/my-website/docs/embedding/supported_embedding.md @@ -71,6 +71,8 @@ response = embedding('text-embedding-ada-002', input=["good morning from litellm | Model Name | Function Call | Required OS Variables | |----------------------|---------------------------------------------|--------------------------------------| +| text-embedding-3-small | `embedding('text-embedding-3-small', input)` | `os.environ['OPENAI_API_KEY']` | +| text-embedding-3-large | `embedding('text-embedding-3-large', input)` | `os.environ['OPENAI_API_KEY']` | | text-embedding-ada-002 | `embedding('text-embedding-ada-002', input)` | `os.environ['OPENAI_API_KEY']` | ## Azure OpenAI Embedding Models From c6a6deaa413ea2929cf1a2589bb0eee025fb17fd Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Thu, 25 Jan 2024 14:41:46 -0800 Subject: [PATCH 18/27] (feat) add gpt-4-0125-preview --- model_prices_and_context_window.json | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/model_prices_and_context_window.json 
b/model_prices_and_context_window.json index 3fe186908..458ac05a4 100644 --- a/model_prices_and_context_window.json +++ b/model_prices_and_context_window.json @@ -62,6 +62,15 @@ "litellm_provider": "openai", "mode": "chat" }, + "gpt-4-0125-preview": { + "max_tokens": 128000, + "max_input_tokens": 128000, + "max_output_tokens": 4096, + "input_cost_per_token": 0.00001, + "output_cost_per_token": 0.00003, + "litellm_provider": "openai", + "mode": "chat" + }, "gpt-4-vision-preview": { "max_tokens": 128000, "max_input_tokens": 128000, From e00f46a6e9c14924be833a822113fe2d397f25f7 Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Thu, 25 Jan 2024 14:42:10 -0800 Subject: [PATCH 19/27] (test) gpt-4-0125-preview --- litellm/tests/test_completion.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/litellm/tests/test_completion.py b/litellm/tests/test_completion.py index b2c69804c..e24248bee 100644 --- a/litellm/tests/test_completion.py +++ b/litellm/tests/test_completion.py @@ -191,6 +191,21 @@ def test_completion_gpt4_turbo(): # test_completion_gpt4_turbo() +def test_completion_gpt4_turbo_0125(): + try: + response = completion( + model="gpt-4-0125-preview", + messages=messages, + max_tokens=10, + ) + print(response) + except openai.RateLimitError: + print("got a rate liimt error") + pass + except Exception as e: + pytest.fail(f"Error occurred: {e}") + + @pytest.mark.skip(reason="this test is flaky") def test_completion_gpt4_vision(): try: From 8ff00ad8d58d135944389a47bcb0d13231bdad6e Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Thu, 25 Jan 2024 14:48:56 -0800 Subject: [PATCH 20/27] (docs) new gpt-4-0125-preview --- docs/my-website/docs/providers/openai.md | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/my-website/docs/providers/openai.md b/docs/my-website/docs/providers/openai.md index 1a515dea3..26f4a7d69 100644 --- a/docs/my-website/docs/providers/openai.md +++ b/docs/my-website/docs/providers/openai.md @@ -34,6 +34,7 @@ os.environ["OPENAI_API_BASE"] = "openaiai-api-base" # OPTIONAL | Model Name | Function Call | |-----------------------|-----------------------------------------------------------------| +| gpt-4-0125-preview | `response = completion(model="gpt-4-0125-preview", messages=messages)` | | gpt-4-1106-preview | `response = completion(model="gpt-4-1106-preview", messages=messages)` | | gpt-3.5-turbo-1106 | `response = completion(model="gpt-3.5-turbo-1106", messages=messages)` | | gpt-3.5-turbo | `response = completion(model="gpt-3.5-turbo", messages=messages)` | From 5e7c43ebf74624662de56b8d80830e927cdfdb32 Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Thu, 25 Jan 2024 14:50:51 -0800 Subject: [PATCH 21/27] =?UTF-8?q?bump:=20version=201.19.2=20=E2=86=92=201.?= =?UTF-8?q?19.3?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- pyproject.toml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 6f20d92ee..567f08587 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "litellm" -version = "1.19.2" +version = "1.19.3" description = "Library to easily interface with LLM API providers" authors = ["BerriAI"] license = "MIT" @@ -63,7 +63,7 @@ requires = ["poetry-core", "wheel"] build-backend = "poetry.core.masonry.api" [tool.commitizen] -version = "1.19.2" +version = "1.19.3" version_files = [ "pyproject.toml:^version" ] From 014f83c847291008ec5c9a328e73eafb319ed01d Mon Sep 17 00:00:00 2001 From: Krrish Dholakia Date: Thu, 25 Jan 2024 
15:00:51 -0800 Subject: [PATCH 22/27] fix(main.py): allow vertex ai project and location to be set in completion() call --- litellm/main.py | 12 +++++++++--- litellm/tests/test_amazing_vertex_completion.py | 4 +++- litellm/utils.py | 4 ++++ 3 files changed, 16 insertions(+), 4 deletions(-) diff --git a/litellm/main.py b/litellm/main.py index 89750ef46..ae5d675c6 100644 --- a/litellm/main.py +++ b/litellm/main.py @@ -1417,9 +1417,15 @@ def completion( return response response = model_response elif custom_llm_provider == "vertex_ai": - vertex_ai_project = litellm.vertex_project or get_secret("VERTEXAI_PROJECT") - vertex_ai_location = litellm.vertex_location or get_secret( - "VERTEXAI_LOCATION" + vertex_ai_project = ( + optional_params.pop("vertex_ai_project", None) + or litellm.vertex_project + or get_secret("VERTEXAI_PROJECT") + ) + vertex_ai_location = ( + optional_params.pop("vertex_ai_location", None) + or litellm.vertex_location + or get_secret("VERTEXAI_LOCATION") ) model_response = vertex_ai.completion( diff --git a/litellm/tests/test_amazing_vertex_completion.py b/litellm/tests/test_amazing_vertex_completion.py index 8467e4434..85c1cb933 100644 --- a/litellm/tests/test_amazing_vertex_completion.py +++ b/litellm/tests/test_amazing_vertex_completion.py @@ -95,7 +95,8 @@ def test_vertex_ai(): + litellm.vertex_code_text_models ) litellm.set_verbose = False - litellm.vertex_project = "reliablekeys" + vertex_ai_project = "reliablekeys" + # litellm.vertex_project = "reliablekeys" test_models = random.sample(test_models, 1) # test_models += litellm.vertex_language_models # always test gemini-pro @@ -117,6 +118,7 @@ def test_vertex_ai(): model=model, messages=[{"role": "user", "content": "hi"}], temperature=0.7, + vertex_ai_project=vertex_ai_project, ) print("\nModel Response", response) print(response) diff --git a/litellm/utils.py b/litellm/utils.py index 2bc1d34e9..63fab74cc 100644 --- a/litellm/utils.py +++ b/litellm/utils.py @@ -3351,6 +3351,10 @@ def get_optional_params( custom_llm_provider != "bedrock" and custom_llm_provider != "sagemaker" ): # allow dynamically setting boto3 init logic continue + elif ( + k.startswith("vertex_") and custom_llm_provider != "vertex_ai" + ): # allow dynamically setting vertex ai init logic + continue passed_params[k] = v default_params = { "functions": None, From 1ae22ea16db10d0e1e0dc8ffe0d32a5c863228eb Mon Sep 17 00:00:00 2001 From: Krrish Dholakia Date: Thu, 25 Jan 2024 16:06:01 -0800 Subject: [PATCH 23/27] refactor: trigger new bump --- litellm/utils.py | 1 + 1 file changed, 1 insertion(+) diff --git a/litellm/utils.py b/litellm/utils.py index 63fab74cc..033990896 100644 --- a/litellm/utils.py +++ b/litellm/utils.py @@ -10,6 +10,7 @@ import sys, re, binascii, struct import litellm import dotenv, json, traceback, threading, base64, ast + import subprocess, os import litellm, openai import itertools From 13776b1df75d9f3775af869cc80e42a54487cbf4 Mon Sep 17 00:00:00 2001 From: Krrish Dholakia Date: Thu, 25 Jan 2024 16:06:12 -0800 Subject: [PATCH 24/27] =?UTF-8?q?bump:=20version=201.19.3=20=E2=86=92=201.?= =?UTF-8?q?19.4?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- pyproject.toml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 567f08587..82eab7fc2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "litellm" -version = "1.19.3" +version = "1.19.4" description = "Library to easily interface with LLM API providers" 
authors = ["BerriAI"] license = "MIT" @@ -63,7 +63,7 @@ requires = ["poetry-core", "wheel"] build-backend = "poetry.core.masonry.api" [tool.commitizen] -version = "1.19.3" +version = "1.19.4" version_files = [ "pyproject.toml:^version" ] From 39aec43b8660001869e60d49b64bff8376c44b61 Mon Sep 17 00:00:00 2001 From: Krrish Dholakia Date: Thu, 25 Jan 2024 18:15:24 -0800 Subject: [PATCH 25/27] test(main.py): adding more logging --- litellm/main.py | 6 +++++- litellm/tests/test_custom_logger.py | 2 +- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/litellm/main.py b/litellm/main.py index ae5d675c6..f9f1139f6 100644 --- a/litellm/main.py +++ b/litellm/main.py @@ -15,7 +15,7 @@ import dotenv, traceback, random, asyncio, time, contextvars from copy import deepcopy import httpx import litellm - +from ._logging import verbose_logger from litellm import ( # type: ignore client, exception_type, @@ -3346,11 +3346,15 @@ def stream_chunk_builder( ): model_response = litellm.ModelResponse() ### SORT CHUNKS BASED ON CREATED ORDER ## + print_verbose("Goes into checking if chunk has hiddden created at param") if chunks[0]._hidden_params.get("created_at", None): + print_verbose("Chunks have a created at hidden param") # Sort chunks based on created_at in ascending order chunks = sorted( chunks, key=lambda x: x._hidden_params.get("created_at", float("inf")) ) + print_verbose("Chunks sorted") + # set hidden params from chunk to model_response if model_response is not None and hasattr(model_response, "_hidden_params"): model_response._hidden_params = chunks[0].get("_hidden_params", {}) diff --git a/litellm/tests/test_custom_logger.py b/litellm/tests/test_custom_logger.py index e403c3afe..e1c532a88 100644 --- a/litellm/tests/test_custom_logger.py +++ b/litellm/tests/test_custom_logger.py @@ -206,7 +206,7 @@ def test_azure_completion_stream(): # checks if the model response available in the async + stream callbacks is equal to the received response customHandler2 = MyCustomHandler() litellm.callbacks = [customHandler2] - litellm.set_verbose = False + litellm.set_verbose = True messages = [ {"role": "system", "content": "You are a helpful assistant."}, { From 554f1a090d50d734490df7a2b89acfbee2c052cd Mon Sep 17 00:00:00 2001 From: Krrish Dholakia Date: Thu, 25 Jan 2024 18:31:07 -0800 Subject: [PATCH 26/27] test(test_keys.py): add delay for test check n --- tests/test_keys.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_keys.py b/tests/test_keys.py index cb06e1f7e..348be63af 100644 --- a/tests/test_keys.py +++ b/tests/test_keys.py @@ -376,7 +376,7 @@ async def test_key_with_budgets(): print(f"hashed_token: {hashed_token}") key_info = await get_key_info(session=session, get_key=key, call_key=key) reset_at_init_value = key_info["info"]["budget_reset_at"] - await asyncio.sleep(15) + await asyncio.sleep(30) key_info = await get_key_info(session=session, get_key=key, call_key=key) reset_at_new_value = key_info["info"]["budget_reset_at"] assert reset_at_init_value != reset_at_new_value From e948b39e3ab545b6502e9159d9415d6c55f91a66 Mon Sep 17 00:00:00 2001 From: Krrish Dholakia Date: Thu, 25 Jan 2024 18:34:13 -0800 Subject: [PATCH 27/27] test(test_streaming.py): fix test to handle none chunk --- litellm/tests/test_streaming.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/litellm/tests/test_streaming.py b/litellm/tests/test_streaming.py index d9f99bece..fda640c96 100644 --- a/litellm/tests/test_streaming.py +++ b/litellm/tests/test_streaming.py @@ 
-847,9 +847,13 @@ def test_sagemaker_weird_response(): logging_obj=logging_obj, ) complete_response = "" - for chunk in response: - print(chunk) - complete_response += chunk["choices"][0]["delta"]["content"] + for idx, chunk in enumerate(response): + # print + chunk, finished = streaming_format_tests(idx, chunk) + has_finish_reason = finished + complete_response += chunk + if finished: + break assert len(complete_response) > 0 except Exception as e: pytest.fail(f"An exception occurred - {str(e)}")
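
For reference, a minimal sketch of the sync streaming path these patches exercise: calling a SageMaker model through litellm with stream=True and assembling the streamed chunks. This is illustrative only, not part of the patches; it assumes a deployed SageMaker endpoint with the name used in the tests above (berri-benchmarking-Llama-2-70b-chat-hf-4) and AWS_ACCESS_KEY_ID / AWS_SECRET_ACCESS_KEY / AWS_REGION_NAME set in the environment.

# Minimal sketch of SageMaker sync streaming through litellm. The endpoint name
# below is taken from the tests in this series and is an assumption; a deployed
# SageMaker endpoint plus AWS credentials in the environment are required.
import litellm

litellm.set_verbose = True  # surface CustomStreamWrapper / logging debug output

response = litellm.completion(
    model="sagemaker/berri-benchmarking-Llama-2-70b-chat-hf-4",  # assumed endpoint name
    messages=[{"role": "user", "content": "Hi - i'm testing sagemaker streaming"}],
    max_tokens=80,
    stream=True,
)

complete_response = ""
for chunk in response:
    # each chunk is a streaming ModelResponse; the final chunk carries finish_reason="stop"
    complete_response += chunk.choices[0].delta.content or ""
    if chunk.choices[0].finish_reason is not None:
        break

print(complete_response)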