diff --git a/litellm/__init__.py b/litellm/__init__.py index 0fda1f351..1bff66e6c 100644 --- a/litellm/__init__.py +++ b/litellm/__init__.py @@ -1,16 +1,17 @@ import threading + success_callback = [] failure_callback = [] -set_verbose=False -telemetry=True -max_tokens = 256 # OpenAI Defaults +set_verbose = False +telemetry = True +max_tokens = 256 # OpenAI Defaults retry = True api_key = None -openai_key = None -azure_key = None -anthropic_key = None -replicate_key = None -cohere_key = None +openai_key = None +azure_key = None +anthropic_key = None +replicate_key = None +cohere_key = None openrouter_key = None huggingface_key = None vertex_project = None @@ -19,33 +20,99 @@ caching = False hugging_api_token = None togetherai_api_key = None model_cost = { - "gpt-3.5-turbo": {"max_tokens": 4000, "input_cost_per_token": 0.0000015, "output_cost_per_token": 0.000002}, - "gpt-35-turbo": {"max_tokens": 4000, "input_cost_per_token": 0.0000015, "output_cost_per_token": 0.000002}, # azure model name - "gpt-3.5-turbo-0613": {"max_tokens": 4000, "input_cost_per_token": 0.0000015, "output_cost_per_token": 0.000002}, - "gpt-3.5-turbo-0301": {"max_tokens": 4000, "input_cost_per_token": 0.0000015, "output_cost_per_token": 0.000002}, - "gpt-3.5-turbo-16k": {"max_tokens": 16000, "input_cost_per_token": 0.000003, "output_cost_per_token": 0.000004}, - "gpt-35-turbo-16k": {"max_tokens": 16000, "input_cost_per_token": 0.000003, "output_cost_per_token": 0.000004}, # azure model name - "gpt-3.5-turbo-16k-0613": {"max_tokens": 16000, "input_cost_per_token": 0.000003, "output_cost_per_token": 0.000004}, - "gpt-4": {"max_tokens": 8000, "input_cost_per_token": 0.000003, "output_cost_per_token": 0.00006}, - "gpt-4-0613": {"max_tokens": 8000, "input_cost_per_token": 0.000003, "output_cost_per_token": 0.00006}, - "gpt-4-32k": {"max_tokens": 8000, "input_cost_per_token": 0.00006, "output_cost_per_token": 0.00012}, - "claude-instant-1": {"max_tokens": 100000, "input_cost_per_token": 0.00000163, "output_cost_per_token": 0.00000551}, - "claude-2": {"max_tokens": 100000, "input_cost_per_token": 0.00001102, "output_cost_per_token": 0.00003268}, - "text-bison-001": {"max_tokens": 8192, "input_cost_per_token": 0.000004, "output_cost_per_token": 0.000004}, - "chat-bison-001": {"max_tokens": 4096, "input_cost_per_token": 0.000002, "output_cost_per_token": 0.000002}, - "command-nightly": {"max_tokens": 4096, "input_cost_per_token": 0.000015, "output_cost_per_token": 0.000015}, + "gpt-3.5-turbo": { + "max_tokens": 4000, + "input_cost_per_token": 0.0000015, + "output_cost_per_token": 0.000002, + }, + "gpt-35-turbo": { + "max_tokens": 4000, + "input_cost_per_token": 0.0000015, + "output_cost_per_token": 0.000002, + }, # azure model name + "gpt-3.5-turbo-0613": { + "max_tokens": 4000, + "input_cost_per_token": 0.0000015, + "output_cost_per_token": 0.000002, + }, + "gpt-3.5-turbo-0301": { + "max_tokens": 4000, + "input_cost_per_token": 0.0000015, + "output_cost_per_token": 0.000002, + }, + "gpt-3.5-turbo-16k": { + "max_tokens": 16000, + "input_cost_per_token": 0.000003, + "output_cost_per_token": 0.000004, + }, + "gpt-35-turbo-16k": { + "max_tokens": 16000, + "input_cost_per_token": 0.000003, + "output_cost_per_token": 0.000004, + }, # azure model name + "gpt-3.5-turbo-16k-0613": { + "max_tokens": 16000, + "input_cost_per_token": 0.000003, + "output_cost_per_token": 0.000004, + }, + "gpt-4": { + "max_tokens": 8000, + "input_cost_per_token": 0.000003, + "output_cost_per_token": 0.00006, + }, + "gpt-4-0613": { + "max_tokens": 8000, + 
"input_cost_per_token": 0.000003, + "output_cost_per_token": 0.00006, + }, + "gpt-4-32k": { + "max_tokens": 8000, + "input_cost_per_token": 0.00006, + "output_cost_per_token": 0.00012, + }, + "claude-instant-1": { + "max_tokens": 100000, + "input_cost_per_token": 0.00000163, + "output_cost_per_token": 0.00000551, + }, + "claude-2": { + "max_tokens": 100000, + "input_cost_per_token": 0.00001102, + "output_cost_per_token": 0.00003268, + }, + "text-bison-001": { + "max_tokens": 8192, + "input_cost_per_token": 0.000004, + "output_cost_per_token": 0.000004, + }, + "chat-bison-001": { + "max_tokens": 4096, + "input_cost_per_token": 0.000002, + "output_cost_per_token": 0.000002, + }, + "command-nightly": { + "max_tokens": 4096, + "input_cost_per_token": 0.000015, + "output_cost_per_token": 0.000015, + }, } + ####### THREAD-SPECIFIC DATA ################### class MyLocal(threading.local): def __init__(self): self.user = "Hello World" + _thread_context = MyLocal() + + def identify(event_details): # Store user in thread local data if "user" in event_details: _thread_context.user = event_details["user"] + + ####### ADDITIONAL PARAMS ################### configurable params if you use proxy models like Helicone, map spend to org id, etc. api_base = None headers = None @@ -56,60 +123,48 @@ config_path = None secret_manager_client = None ####### COMPLETION MODELS ################### open_ai_chat_completion_models = [ - "gpt-4", - "gpt-4-0613", - "gpt-4-32k", - "gpt-4-32k-0613", - ################# - "gpt-3.5-turbo", - "gpt-3.5-turbo-16k", - "gpt-3.5-turbo-0613", - "gpt-3.5-turbo-16k-0613", -] -open_ai_text_completion_models = [ - 'text-davinci-003' + "gpt-4", + "gpt-4-0613", + "gpt-4-32k", + "gpt-4-32k-0613", + ################# + "gpt-3.5-turbo", + "gpt-3.5-turbo-16k", + "gpt-3.5-turbo-0613", + "gpt-3.5-turbo-16k-0613", ] +open_ai_text_completion_models = ["text-davinci-003"] cohere_models = [ - 'command-nightly', - "command", - "command-light", - "command-medium-beta", - "command-xlarge-beta" + "command-nightly", + "command", + "command-light", + "command-medium-beta", + "command-xlarge-beta", ] -anthropic_models = [ - "claude-2", - "claude-instant-1", - "claude-instant-1.2" -] +anthropic_models = ["claude-2", "claude-instant-1", "claude-instant-1.2"] replicate_models = [ "replicate/" -] # placeholder, to make sure we accept any replicate model in our model_list +] # placeholder, to make sure we accept any replicate model in our model_list openrouter_models = [ - 'google/palm-2-codechat-bison', - 'google/palm-2-chat-bison', - 'openai/gpt-3.5-turbo', - 'openai/gpt-3.5-turbo-16k', - 'openai/gpt-4-32k', - 'anthropic/claude-2', - 'anthropic/claude-instant-v1', - 'meta-llama/llama-2-13b-chat', - 'meta-llama/llama-2-70b-chat' + "google/palm-2-codechat-bison", + "google/palm-2-chat-bison", + "openai/gpt-3.5-turbo", + "openai/gpt-3.5-turbo-16k", + "openai/gpt-4-32k", + "anthropic/claude-2", + "anthropic/claude-instant-v1", + "meta-llama/llama-2-13b-chat", + "meta-llama/llama-2-70b-chat", ] -vertex_chat_models = [ - "chat-bison", - "chat-bison@001" -] +vertex_chat_models = ["chat-bison", "chat-bison@001"] -vertex_text_models = [ - "text-bison", - "text-bison@001" -] +vertex_text_models = ["text-bison", "text-bison@001"] huggingface_models = [ "meta-llama/Llama-2-7b-hf", @@ -124,25 +179,56 @@ huggingface_models = [ "meta-llama/Llama-2-13b-chat", "meta-llama/Llama-2-70b", "meta-llama/Llama-2-70b-chat", -] # these have been tested on extensively. 
But by default all text2text-generation and text-generation models are supported by liteLLM. - https://docs.litellm.ai/docs/completion/supported +] # these have been tested on extensively. But by default all text2text-generation and text-generation models are supported by liteLLM. - https://docs.litellm.ai/docs/completion/supported -ai21_models = [ - "j2-ultra", - "j2-mid", - "j2-light" +ai21_models = ["j2-ultra", "j2-mid", "j2-light"] + +model_list = ( + open_ai_chat_completion_models + + open_ai_text_completion_models + + cohere_models + + anthropic_models + + replicate_models + + openrouter_models + + huggingface_models + + vertex_chat_models + + vertex_text_models + + ai21_models +) + +provider_list = [ + "openai", + "cohere", + "anthropic", + "replicate", + "huggingface", + "together_ai", + "openrouter", + "vertex_ai", + "ai21", ] - -model_list = open_ai_chat_completion_models + open_ai_text_completion_models + cohere_models + anthropic_models + replicate_models + openrouter_models + huggingface_models + vertex_chat_models + vertex_text_models + ai21_models - -provider_list = ["openai", "cohere", "anthropic", "replicate", "huggingface", "together_ai", "openrouter", "vertex_ai", "ai21"] ####### EMBEDDING MODELS ################### -open_ai_embedding_models = [ - 'text-embedding-ada-002' -] +open_ai_embedding_models = ["text-embedding-ada-002"] from .timeout import timeout from .testing import * -from .utils import client, logging, exception_type, get_optional_params, modify_integration, token_counter, cost_per_token, completion_cost, get_litellm_params +from .utils import ( + client, + logging, + exception_type, + get_optional_params, + modify_integration, + token_counter, + cost_per_token, + completion_cost, + get_litellm_params, +) from .main import * # Import all the symbols from main.py from .integrations import * -from openai.error import AuthenticationError, InvalidRequestError, RateLimitError, ServiceUnavailableError, OpenAIError \ No newline at end of file +from openai.error import ( + AuthenticationError, + InvalidRequestError, + RateLimitError, + ServiceUnavailableError, + OpenAIError, +) diff --git a/litellm/exceptions.py b/litellm/exceptions.py index 82f2f5165..51923f86e 100644 --- a/litellm/exceptions.py +++ b/litellm/exceptions.py @@ -1,12 +1,21 @@ ## LiteLLM versions of the OpenAI Exception Types -from openai.error import AuthenticationError, InvalidRequestError, RateLimitError, ServiceUnavailableError, OpenAIError +from openai.error import ( + AuthenticationError, + InvalidRequestError, + RateLimitError, + ServiceUnavailableError, + OpenAIError, +) + class AuthenticationError(AuthenticationError): def __init__(self, message, llm_provider): self.status_code = 401 self.message = message self.llm_provider = llm_provider - super().__init__(self.message) # Call the base class constructor with the parameters it needs + super().__init__( + self.message + ) # Call the base class constructor with the parameters it needs class InvalidRequestError(InvalidRequestError): @@ -15,7 +24,9 @@ class InvalidRequestError(InvalidRequestError): self.message = message self.model = model self.llm_provider = llm_provider - super().__init__(self.message, f"{self.model}") # Call the base class constructor with the parameters it needs + super().__init__( + self.message, f"{self.model}" + ) # Call the base class constructor with the parameters it needs class RateLimitError(RateLimitError): @@ -23,21 +34,29 @@ class RateLimitError(RateLimitError): self.status_code = 429 self.message = message 
self.llm_provider = llm_provider - super().__init__(self.message) # Call the base class constructor with the parameters it needs + super().__init__( + self.message + ) # Call the base class constructor with the parameters it needs + class ServiceUnavailableError(ServiceUnavailableError): def __init__(self, message, llm_provider): self.status_code = 500 self.message = message self.llm_provider = llm_provider - super().__init__(self.message) # Call the base class constructor with the parameters it needs + super().__init__( + self.message + ) # Call the base class constructor with the parameters it needs + class OpenAIError(OpenAIError): def __init__(self, original_exception): self.status_code = original_exception.http_status - super().__init__(http_body=original_exception.http_body, - http_status=original_exception.http_status, - json_body=original_exception.json_body, - headers=original_exception.headers, - code=original_exception.code) - self.llm_provider = "openai" \ No newline at end of file + super().__init__( + http_body=original_exception.http_body, + http_status=original_exception.http_status, + json_body=original_exception.json_body, + headers=original_exception.headers, + code=original_exception.code, + ) + self.llm_provider = "openai" diff --git a/litellm/integrations/__init__.py b/litellm/integrations/__init__.py index b9742821a..b6e690fd5 100644 --- a/litellm/integrations/__init__.py +++ b/litellm/integrations/__init__.py @@ -1 +1 @@ -from . import * \ No newline at end of file +from . import * diff --git a/litellm/integrations/aispend.py b/litellm/integrations/aispend.py index 6723a6227..2015d45dd 100644 --- a/litellm/integrations/aispend.py +++ b/litellm/integrations/aispend.py @@ -1,53 +1,121 @@ #### What this does #### -# On success + failure, log events to aispend.io +# On success + failure, log events to aispend.io import dotenv, os import requests -dotenv.load_dotenv() # Loading env variables using dotenv + +dotenv.load_dotenv() # Loading env variables using dotenv import traceback import datetime model_cost = { - "gpt-3.5-turbo": {"max_tokens": 4000, "input_cost_per_token": 0.0000015, "output_cost_per_token": 0.000002}, - "gpt-35-turbo": {"max_tokens": 4000, "input_cost_per_token": 0.0000015, "output_cost_per_token": 0.000002}, # azure model name - "gpt-3.5-turbo-0613": {"max_tokens": 4000, "input_cost_per_token": 0.0000015, "output_cost_per_token": 0.000002}, - "gpt-3.5-turbo-0301": {"max_tokens": 4000, "input_cost_per_token": 0.0000015, "output_cost_per_token": 0.000002}, - "gpt-3.5-turbo-16k": {"max_tokens": 16000, "input_cost_per_token": 0.000003, "output_cost_per_token": 0.000004}, - "gpt-35-turbo-16k": {"max_tokens": 16000, "input_cost_per_token": 0.000003, "output_cost_per_token": 0.000004}, # azure model name - "gpt-3.5-turbo-16k-0613": {"max_tokens": 16000, "input_cost_per_token": 0.000003, "output_cost_per_token": 0.000004}, - "gpt-4": {"max_tokens": 8000, "input_cost_per_token": 0.000003, "output_cost_per_token": 0.00006}, - "gpt-4-0613": {"max_tokens": 8000, "input_cost_per_token": 0.000003, "output_cost_per_token": 0.00006}, - "gpt-4-32k": {"max_tokens": 8000, "input_cost_per_token": 0.00006, "output_cost_per_token": 0.00012}, - "claude-instant-1": {"max_tokens": 100000, "input_cost_per_token": 0.00000163, "output_cost_per_token": 0.00000551}, - "claude-2": {"max_tokens": 100000, "input_cost_per_token": 0.00001102, "output_cost_per_token": 0.00003268}, - "text-bison-001": {"max_tokens": 8192, "input_cost_per_token": 0.000004, "output_cost_per_token": 
0.000004}, - "chat-bison-001": {"max_tokens": 4096, "input_cost_per_token": 0.000002, "output_cost_per_token": 0.000002}, - "command-nightly": {"max_tokens": 4096, "input_cost_per_token": 0.000015, "output_cost_per_token": 0.000015}, + "gpt-3.5-turbo": { + "max_tokens": 4000, + "input_cost_per_token": 0.0000015, + "output_cost_per_token": 0.000002, + }, + "gpt-35-turbo": { + "max_tokens": 4000, + "input_cost_per_token": 0.0000015, + "output_cost_per_token": 0.000002, + }, # azure model name + "gpt-3.5-turbo-0613": { + "max_tokens": 4000, + "input_cost_per_token": 0.0000015, + "output_cost_per_token": 0.000002, + }, + "gpt-3.5-turbo-0301": { + "max_tokens": 4000, + "input_cost_per_token": 0.0000015, + "output_cost_per_token": 0.000002, + }, + "gpt-3.5-turbo-16k": { + "max_tokens": 16000, + "input_cost_per_token": 0.000003, + "output_cost_per_token": 0.000004, + }, + "gpt-35-turbo-16k": { + "max_tokens": 16000, + "input_cost_per_token": 0.000003, + "output_cost_per_token": 0.000004, + }, # azure model name + "gpt-3.5-turbo-16k-0613": { + "max_tokens": 16000, + "input_cost_per_token": 0.000003, + "output_cost_per_token": 0.000004, + }, + "gpt-4": { + "max_tokens": 8000, + "input_cost_per_token": 0.000003, + "output_cost_per_token": 0.00006, + }, + "gpt-4-0613": { + "max_tokens": 8000, + "input_cost_per_token": 0.000003, + "output_cost_per_token": 0.00006, + }, + "gpt-4-32k": { + "max_tokens": 8000, + "input_cost_per_token": 0.00006, + "output_cost_per_token": 0.00012, + }, + "claude-instant-1": { + "max_tokens": 100000, + "input_cost_per_token": 0.00000163, + "output_cost_per_token": 0.00000551, + }, + "claude-2": { + "max_tokens": 100000, + "input_cost_per_token": 0.00001102, + "output_cost_per_token": 0.00003268, + }, + "text-bison-001": { + "max_tokens": 8192, + "input_cost_per_token": 0.000004, + "output_cost_per_token": 0.000004, + }, + "chat-bison-001": { + "max_tokens": 4096, + "input_cost_per_token": 0.000002, + "output_cost_per_token": 0.000002, + }, + "command-nightly": { + "max_tokens": 4096, + "input_cost_per_token": 0.000015, + "output_cost_per_token": 0.000015, + }, } + class AISpendLogger: # Class variables or attributes def __init__(self): # Instance variables self.account_id = os.getenv("AISPEND_ACCOUNT_ID") self.api_key = os.getenv("AISPEND_API_KEY") - + def price_calculator(self, model, response_obj, start_time, end_time): # try and find if the model is in the model_cost map # else default to the average of the costs prompt_tokens_cost_usd_dollar = 0 completion_tokens_cost_usd_dollar = 0 if model in model_cost: - prompt_tokens_cost_usd_dollar = model_cost[model]["input_cost_per_token"] * response_obj["usage"]["prompt_tokens"] - completion_tokens_cost_usd_dollar = model_cost[model]["output_cost_per_token"] * response_obj["usage"]["completion_tokens"] - elif "replicate" in model: + prompt_tokens_cost_usd_dollar = ( + model_cost[model]["input_cost_per_token"] + * response_obj["usage"]["prompt_tokens"] + ) + completion_tokens_cost_usd_dollar = ( + model_cost[model]["output_cost_per_token"] + * response_obj["usage"]["completion_tokens"] + ) + elif "replicate" in model: # replicate models are charged based on time # llama 2 runs on an nvidia a100 which costs $0.0032 per second - https://replicate.com/replicate/llama-2-70b-chat - model_run_time = end_time - start_time # assuming time in seconds + model_run_time = end_time - start_time # assuming time in seconds cost_usd_dollar = model_run_time * 0.0032 prompt_tokens_cost_usd_dollar = cost_usd_dollar / 2 
completion_tokens_cost_usd_dollar = cost_usd_dollar / 2 else: - # calculate average input cost + # calculate average input cost input_cost_sum = 0 output_cost_sum = 0 for model in model_cost: @@ -55,37 +123,52 @@ class AISpendLogger: output_cost_sum += model_cost[model]["output_cost_per_token"] avg_input_cost = input_cost_sum / len(model_cost.keys()) avg_output_cost = output_cost_sum / len(model_cost.keys()) - prompt_tokens_cost_usd_dollar = model_cost[model]["input_cost_per_token"] * response_obj["usage"]["prompt_tokens"] - completion_tokens_cost_usd_dollar = model_cost[model]["output_cost_per_token"] * response_obj["usage"]["completion_tokens"] + prompt_tokens_cost_usd_dollar = ( + model_cost[model]["input_cost_per_token"] + * response_obj["usage"]["prompt_tokens"] + ) + completion_tokens_cost_usd_dollar = ( + model_cost[model]["output_cost_per_token"] + * response_obj["usage"]["completion_tokens"] + ) return prompt_tokens_cost_usd_dollar, completion_tokens_cost_usd_dollar - + def log_event(self, model, response_obj, start_time, end_time, print_verbose): # Method definition try: - print_verbose(f"AISpend Logging - Enters logging function for model {model}") + print_verbose( + f"AISpend Logging - Enters logging function for model {model}" + ) url = f"https://aispend.io/api/v1/accounts/{self.account_id}/data" headers = { - 'Authorization': f'Bearer {self.api_key}', - 'Content-Type': 'application/json' + "Authorization": f"Bearer {self.api_key}", + "Content-Type": "application/json", } - response_timestamp = datetime.datetime.fromtimestamp(int(response_obj["created"])).strftime('%Y-%m-%d') + response_timestamp = datetime.datetime.fromtimestamp( + int(response_obj["created"]) + ).strftime("%Y-%m-%d") - prompt_tokens_cost_usd_dollar, completion_tokens_cost_usd_dollar = self.price_calculator(model, response_obj, start_time, end_time) + ( + prompt_tokens_cost_usd_dollar, + completion_tokens_cost_usd_dollar, + ) = self.price_calculator(model, response_obj, start_time, end_time) prompt_tokens_cost_usd_cent = prompt_tokens_cost_usd_dollar * 100 completion_tokens_cost_usd_cent = completion_tokens_cost_usd_dollar * 100 - data = [{ - "requests": 1, - "requests_context": 1, - "context_tokens": response_obj["usage"]["prompt_tokens"], - "requests_generated": 1, - "generated_tokens": response_obj["usage"]["completion_tokens"], - "recorded_date": response_timestamp, - "model_id": response_obj["model"], - "generated_tokens_cost_usd_cent": prompt_tokens_cost_usd_cent, - "context_tokens_cost_usd_cent": completion_tokens_cost_usd_cent - }] + data = [ + { + "requests": 1, + "requests_context": 1, + "context_tokens": response_obj["usage"]["prompt_tokens"], + "requests_generated": 1, + "generated_tokens": response_obj["usage"]["completion_tokens"], + "recorded_date": response_timestamp, + "model_id": response_obj["model"], + "generated_tokens_cost_usd_cent": prompt_tokens_cost_usd_cent, + "context_tokens_cost_usd_cent": completion_tokens_cost_usd_cent, + } + ] print_verbose(f"AISpend Logging - final data object: {data}") except: diff --git a/litellm/integrations/berrispend.py b/litellm/integrations/berrispend.py index 1742bfed7..7d91ffca7 100644 --- a/litellm/integrations/berrispend.py +++ b/litellm/integrations/berrispend.py @@ -1,52 +1,120 @@ #### What this does #### -# On success + failure, log events to aispend.io +# On success + failure, log events to aispend.io import dotenv, os import requests -dotenv.load_dotenv() # Loading env variables using dotenv + +dotenv.load_dotenv() # Loading env variables using 
dotenv import traceback import datetime model_cost = { - "gpt-3.5-turbo": {"max_tokens": 4000, "input_cost_per_token": 0.0000015, "output_cost_per_token": 0.000002}, - "gpt-35-turbo": {"max_tokens": 4000, "input_cost_per_token": 0.0000015, "output_cost_per_token": 0.000002}, # azure model name - "gpt-3.5-turbo-0613": {"max_tokens": 4000, "input_cost_per_token": 0.0000015, "output_cost_per_token": 0.000002}, - "gpt-3.5-turbo-0301": {"max_tokens": 4000, "input_cost_per_token": 0.0000015, "output_cost_per_token": 0.000002}, - "gpt-3.5-turbo-16k": {"max_tokens": 16000, "input_cost_per_token": 0.000003, "output_cost_per_token": 0.000004}, - "gpt-35-turbo-16k": {"max_tokens": 16000, "input_cost_per_token": 0.000003, "output_cost_per_token": 0.000004}, # azure model name - "gpt-3.5-turbo-16k-0613": {"max_tokens": 16000, "input_cost_per_token": 0.000003, "output_cost_per_token": 0.000004}, - "gpt-4": {"max_tokens": 8000, "input_cost_per_token": 0.000003, "output_cost_per_token": 0.00006}, - "gpt-4-0613": {"max_tokens": 8000, "input_cost_per_token": 0.000003, "output_cost_per_token": 0.00006}, - "gpt-4-32k": {"max_tokens": 8000, "input_cost_per_token": 0.00006, "output_cost_per_token": 0.00012}, - "claude-instant-1": {"max_tokens": 100000, "input_cost_per_token": 0.00000163, "output_cost_per_token": 0.00000551}, - "claude-2": {"max_tokens": 100000, "input_cost_per_token": 0.00001102, "output_cost_per_token": 0.00003268}, - "text-bison-001": {"max_tokens": 8192, "input_cost_per_token": 0.000004, "output_cost_per_token": 0.000004}, - "chat-bison-001": {"max_tokens": 4096, "input_cost_per_token": 0.000002, "output_cost_per_token": 0.000002}, - "command-nightly": {"max_tokens": 4096, "input_cost_per_token": 0.000015, "output_cost_per_token": 0.000015}, + "gpt-3.5-turbo": { + "max_tokens": 4000, + "input_cost_per_token": 0.0000015, + "output_cost_per_token": 0.000002, + }, + "gpt-35-turbo": { + "max_tokens": 4000, + "input_cost_per_token": 0.0000015, + "output_cost_per_token": 0.000002, + }, # azure model name + "gpt-3.5-turbo-0613": { + "max_tokens": 4000, + "input_cost_per_token": 0.0000015, + "output_cost_per_token": 0.000002, + }, + "gpt-3.5-turbo-0301": { + "max_tokens": 4000, + "input_cost_per_token": 0.0000015, + "output_cost_per_token": 0.000002, + }, + "gpt-3.5-turbo-16k": { + "max_tokens": 16000, + "input_cost_per_token": 0.000003, + "output_cost_per_token": 0.000004, + }, + "gpt-35-turbo-16k": { + "max_tokens": 16000, + "input_cost_per_token": 0.000003, + "output_cost_per_token": 0.000004, + }, # azure model name + "gpt-3.5-turbo-16k-0613": { + "max_tokens": 16000, + "input_cost_per_token": 0.000003, + "output_cost_per_token": 0.000004, + }, + "gpt-4": { + "max_tokens": 8000, + "input_cost_per_token": 0.000003, + "output_cost_per_token": 0.00006, + }, + "gpt-4-0613": { + "max_tokens": 8000, + "input_cost_per_token": 0.000003, + "output_cost_per_token": 0.00006, + }, + "gpt-4-32k": { + "max_tokens": 8000, + "input_cost_per_token": 0.00006, + "output_cost_per_token": 0.00012, + }, + "claude-instant-1": { + "max_tokens": 100000, + "input_cost_per_token": 0.00000163, + "output_cost_per_token": 0.00000551, + }, + "claude-2": { + "max_tokens": 100000, + "input_cost_per_token": 0.00001102, + "output_cost_per_token": 0.00003268, + }, + "text-bison-001": { + "max_tokens": 8192, + "input_cost_per_token": 0.000004, + "output_cost_per_token": 0.000004, + }, + "chat-bison-001": { + "max_tokens": 4096, + "input_cost_per_token": 0.000002, + "output_cost_per_token": 0.000002, + }, + "command-nightly": { + 
"max_tokens": 4096, + "input_cost_per_token": 0.000015, + "output_cost_per_token": 0.000015, + }, } + class BerriSpendLogger: # Class variables or attributes def __init__(self): # Instance variables self.account_id = os.getenv("BERRISPEND_ACCOUNT_ID") - + def price_calculator(self, model, response_obj, start_time, end_time): # try and find if the model is in the model_cost map # else default to the average of the costs prompt_tokens_cost_usd_dollar = 0 completion_tokens_cost_usd_dollar = 0 if model in model_cost: - prompt_tokens_cost_usd_dollar = model_cost[model]["input_cost_per_token"] * response_obj["usage"]["prompt_tokens"] - completion_tokens_cost_usd_dollar = model_cost[model]["output_cost_per_token"] * response_obj["usage"]["completion_tokens"] - elif "replicate" in model: + prompt_tokens_cost_usd_dollar = ( + model_cost[model]["input_cost_per_token"] + * response_obj["usage"]["prompt_tokens"] + ) + completion_tokens_cost_usd_dollar = ( + model_cost[model]["output_cost_per_token"] + * response_obj["usage"]["completion_tokens"] + ) + elif "replicate" in model: # replicate models are charged based on time # llama 2 runs on an nvidia a100 which costs $0.0032 per second - https://replicate.com/replicate/llama-2-70b-chat - model_run_time = end_time - start_time # assuming time in seconds + model_run_time = end_time - start_time # assuming time in seconds cost_usd_dollar = model_run_time * 0.0032 prompt_tokens_cost_usd_dollar = cost_usd_dollar / 2 completion_tokens_cost_usd_dollar = cost_usd_dollar / 2 else: - # calculate average input cost + # calculate average input cost input_cost_sum = 0 output_cost_sum = 0 for model in model_cost: @@ -54,42 +122,59 @@ class BerriSpendLogger: output_cost_sum += model_cost[model]["output_cost_per_token"] avg_input_cost = input_cost_sum / len(model_cost.keys()) avg_output_cost = output_cost_sum / len(model_cost.keys()) - prompt_tokens_cost_usd_dollar = model_cost[model]["input_cost_per_token"] * response_obj["usage"]["prompt_tokens"] - completion_tokens_cost_usd_dollar = model_cost[model]["output_cost_per_token"] * response_obj["usage"]["completion_tokens"] + prompt_tokens_cost_usd_dollar = ( + model_cost[model]["input_cost_per_token"] + * response_obj["usage"]["prompt_tokens"] + ) + completion_tokens_cost_usd_dollar = ( + model_cost[model]["output_cost_per_token"] + * response_obj["usage"]["completion_tokens"] + ) return prompt_tokens_cost_usd_dollar, completion_tokens_cost_usd_dollar - - def log_event(self, model, messages, response_obj, start_time, end_time, print_verbose): + + def log_event( + self, model, messages, response_obj, start_time, end_time, print_verbose + ): # Method definition try: - print_verbose(f"BerriSpend Logging - Enters logging function for model {model}") + print_verbose( + f"BerriSpend Logging - Enters logging function for model {model}" + ) url = f"https://berrispend.berri.ai/spend" - headers = { - 'Content-Type': 'application/json' - } + headers = {"Content-Type": "application/json"} - prompt_tokens_cost_usd_dollar, completion_tokens_cost_usd_dollar = self.price_calculator(model, response_obj, start_time, end_time) - total_cost = prompt_tokens_cost_usd_dollar + completion_tokens_cost_usd_dollar + ( + prompt_tokens_cost_usd_dollar, + completion_tokens_cost_usd_dollar, + ) = self.price_calculator(model, response_obj, start_time, end_time) + total_cost = ( + prompt_tokens_cost_usd_dollar + completion_tokens_cost_usd_dollar + ) - response_time = (end_time-start_time).total_seconds() + response_time = (end_time - 
start_time).total_seconds() if "response" in response_obj: - data = [{ - "response_time": response_time, - "model_id": response_obj["model"], - "total_cost": total_cost, - "messages": messages, - "response": response_obj['choices'][0]['message']['content'], - "account_id": self.account_id - }] + data = [ + { + "response_time": response_time, + "model_id": response_obj["model"], + "total_cost": total_cost, + "messages": messages, + "response": response_obj["choices"][0]["message"]["content"], + "account_id": self.account_id, + } + ] elif "error" in response_obj: - data = [{ - "response_time": response_time, - "model_id": response_obj["model"], - "total_cost": total_cost, - "messages": messages, - "error": response_obj['error'], - "account_id": self.account_id - }] + data = [ + { + "response_time": response_time, + "model_id": response_obj["model"], + "total_cost": total_cost, + "messages": messages, + "error": response_obj["error"], + "account_id": self.account_id, + } + ] print_verbose(f"BerriSpend Logging - final data object: {data}") response = requests.post(url, headers=headers, json=data) diff --git a/litellm/integrations/helicone.py b/litellm/integrations/helicone.py index 9e74b246f..f9dff85db 100644 --- a/litellm/integrations/helicone.py +++ b/litellm/integrations/helicone.py @@ -2,19 +2,24 @@ # On success, logs events to Helicone import dotenv, os import requests -dotenv.load_dotenv() # Loading env variables using dotenv + +dotenv.load_dotenv() # Loading env variables using dotenv import traceback + + class HeliconeLogger: # Class variables or attributes helicone_model_list = ["gpt", "claude"] + def __init__(self): # Instance variables self.provider_url = "https://api.openai.com/v1" - self.key = os.getenv('HELICONE_API_KEY') + self.key = os.getenv("HELICONE_API_KEY") def claude_mapping(self, model, messages, response_obj): from anthropic import HUMAN_PROMPT, AI_PROMPT - prompt = f"{HUMAN_PROMPT}" + + prompt = f"{HUMAN_PROMPT}" for message in messages: if "role" in message: if message["role"] == "user": @@ -26,48 +31,84 @@ class HeliconeLogger: prompt += f"{AI_PROMPT}" claude_provider_request = {"model": model, "prompt": prompt} - claude_response_obj = {"completion": response_obj['choices'][0]['message']['content'], "model": model, "stop_reason": "stop_sequence"} + claude_response_obj = { + "completion": response_obj["choices"][0]["message"]["content"], + "model": model, + "stop_reason": "stop_sequence", + } return claude_provider_request, claude_response_obj - - def log_success(self, model, messages, response_obj, start_time, end_time, print_verbose): + + def log_success( + self, model, messages, response_obj, start_time, end_time, print_verbose + ): # Method definition try: - print_verbose(f"Helicone Logging - Enters logging function for model {model}") - model = model if any(accepted_model in model for accepted_model in self.helicone_model_list) else "gpt-3.5-turbo" + print_verbose( + f"Helicone Logging - Enters logging function for model {model}" + ) + model = ( + model + if any( + accepted_model in model + for accepted_model in self.helicone_model_list + ) + else "gpt-3.5-turbo" + ) provider_request = {"model": model, "messages": messages} - if "claude" in model: - provider_request, response_obj = self.claude_mapping(model=model, messages=messages, response_obj=response_obj) + if "claude" in model: + provider_request, response_obj = self.claude_mapping( + model=model, messages=messages, response_obj=response_obj + ) providerResponse = { - "json": response_obj, - "headers": 
{"openai-version": "2020-10-01"}, - "status": 200 + "json": response_obj, + "headers": {"openai-version": "2020-10-01"}, + "status": 200, } # Code to be executed url = "https://api.hconeai.com/oai/v1/log" headers = { - 'Authorization': f'Bearer {self.key}', - 'Content-Type': 'application/json' + "Authorization": f"Bearer {self.key}", + "Content-Type": "application/json", } start_time_seconds = int(start_time.timestamp()) - start_time_milliseconds = int((start_time.timestamp() - start_time_seconds) * 1000) + start_time_milliseconds = int( + (start_time.timestamp() - start_time_seconds) * 1000 + ) end_time_seconds = int(end_time.timestamp()) - end_time_milliseconds = int((end_time.timestamp() - end_time_seconds) * 1000) + end_time_milliseconds = int( + (end_time.timestamp() - end_time_seconds) * 1000 + ) data = { - "providerRequest": {"url": self.provider_url, "json": provider_request, "meta": {"Helicone-Auth": f"Bearer {self.key}"}}, + "providerRequest": { + "url": self.provider_url, + "json": provider_request, + "meta": {"Helicone-Auth": f"Bearer {self.key}"}, + }, "providerResponse": providerResponse, - "timing": {"startTime": {"seconds": start_time_seconds, "milliseconds": start_time_milliseconds}, "endTime": {"seconds": end_time_seconds, "milliseconds": end_time_milliseconds}} # {"seconds": .., "milliseconds": ..} + "timing": { + "startTime": { + "seconds": start_time_seconds, + "milliseconds": start_time_milliseconds, + }, + "endTime": { + "seconds": end_time_seconds, + "milliseconds": end_time_milliseconds, + }, + }, # {"seconds": .., "milliseconds": ..} } response = requests.post(url, headers=headers, json=data) if response.status_code == 200: print_verbose("Helicone Logging - Success!") else: - print_verbose(f"Helicone Logging - Error Request was not successful. Status Code: {response.status_code}") + print_verbose( + f"Helicone Logging - Error Request was not successful. 
Status Code: {response.status_code}" + ) print_verbose(f"Helicone Logging - Error {response.text}") except: # traceback.print_exc() print_verbose(f"Helicone Logging Error - {traceback.format_exc()}") - pass \ No newline at end of file + pass diff --git a/litellm/integrations/supabase.py b/litellm/integrations/supabase.py index 1ac28763f..d27277589 100644 --- a/litellm/integrations/supabase.py +++ b/litellm/integrations/supabase.py @@ -3,31 +3,94 @@ import dotenv, os import requests -dotenv.load_dotenv() # Loading env variables using dotenv + +dotenv.load_dotenv() # Loading env variables using dotenv import traceback import datetime, subprocess, sys model_cost = { - "gpt-3.5-turbo": {"max_tokens": 4000, "input_cost_per_token": 0.0000015, "output_cost_per_token": 0.000002}, - "gpt-35-turbo": {"max_tokens": 4000, "input_cost_per_token": 0.0000015, "output_cost_per_token": 0.000002}, # azure model name - "gpt-3.5-turbo-0613": {"max_tokens": 4000, "input_cost_per_token": 0.0000015, "output_cost_per_token": 0.000002}, - "gpt-3.5-turbo-0301": {"max_tokens": 4000, "input_cost_per_token": 0.0000015, "output_cost_per_token": 0.000002}, - "gpt-3.5-turbo-16k": {"max_tokens": 16000, "input_cost_per_token": 0.000003, "output_cost_per_token": 0.000004}, - "gpt-35-turbo-16k": {"max_tokens": 16000, "input_cost_per_token": 0.000003, "output_cost_per_token": 0.000004}, # azure model name - "gpt-3.5-turbo-16k-0613": {"max_tokens": 16000, "input_cost_per_token": 0.000003, "output_cost_per_token": 0.000004}, - "gpt-4": {"max_tokens": 8000, "input_cost_per_token": 0.000003, "output_cost_per_token": 0.00006}, - "gpt-4-0613": {"max_tokens": 8000, "input_cost_per_token": 0.000003, "output_cost_per_token": 0.00006}, - "gpt-4-32k": {"max_tokens": 8000, "input_cost_per_token": 0.00006, "output_cost_per_token": 0.00012}, - "claude-instant-1": {"max_tokens": 100000, "input_cost_per_token": 0.00000163, "output_cost_per_token": 0.00000551}, - "claude-2": {"max_tokens": 100000, "input_cost_per_token": 0.00001102, "output_cost_per_token": 0.00003268}, - "text-bison-001": {"max_tokens": 8192, "input_cost_per_token": 0.000004, "output_cost_per_token": 0.000004}, - "chat-bison-001": {"max_tokens": 4096, "input_cost_per_token": 0.000002, "output_cost_per_token": 0.000002}, - "command-nightly": {"max_tokens": 4096, "input_cost_per_token": 0.000015, "output_cost_per_token": 0.000015}, + "gpt-3.5-turbo": { + "max_tokens": 4000, + "input_cost_per_token": 0.0000015, + "output_cost_per_token": 0.000002, + }, + "gpt-35-turbo": { + "max_tokens": 4000, + "input_cost_per_token": 0.0000015, + "output_cost_per_token": 0.000002, + }, # azure model name + "gpt-3.5-turbo-0613": { + "max_tokens": 4000, + "input_cost_per_token": 0.0000015, + "output_cost_per_token": 0.000002, + }, + "gpt-3.5-turbo-0301": { + "max_tokens": 4000, + "input_cost_per_token": 0.0000015, + "output_cost_per_token": 0.000002, + }, + "gpt-3.5-turbo-16k": { + "max_tokens": 16000, + "input_cost_per_token": 0.000003, + "output_cost_per_token": 0.000004, + }, + "gpt-35-turbo-16k": { + "max_tokens": 16000, + "input_cost_per_token": 0.000003, + "output_cost_per_token": 0.000004, + }, # azure model name + "gpt-3.5-turbo-16k-0613": { + "max_tokens": 16000, + "input_cost_per_token": 0.000003, + "output_cost_per_token": 0.000004, + }, + "gpt-4": { + "max_tokens": 8000, + "input_cost_per_token": 0.000003, + "output_cost_per_token": 0.00006, + }, + "gpt-4-0613": { + "max_tokens": 8000, + "input_cost_per_token": 0.000003, + "output_cost_per_token": 0.00006, + }, + "gpt-4-32k": { + 
"max_tokens": 8000, + "input_cost_per_token": 0.00006, + "output_cost_per_token": 0.00012, + }, + "claude-instant-1": { + "max_tokens": 100000, + "input_cost_per_token": 0.00000163, + "output_cost_per_token": 0.00000551, + }, + "claude-2": { + "max_tokens": 100000, + "input_cost_per_token": 0.00001102, + "output_cost_per_token": 0.00003268, + }, + "text-bison-001": { + "max_tokens": 8192, + "input_cost_per_token": 0.000004, + "output_cost_per_token": 0.000004, + }, + "chat-bison-001": { + "max_tokens": 4096, + "input_cost_per_token": 0.000002, + "output_cost_per_token": 0.000002, + }, + "command-nightly": { + "max_tokens": 4096, + "input_cost_per_token": 0.000015, + "output_cost_per_token": 0.000015, + }, } + class Supabase: # Class variables or attributes supabase_table_name = "request_logs" + def __init__(self): # Instance variables self.supabase_url = os.getenv("SUPABASE_URL") @@ -35,9 +98,11 @@ class Supabase: try: import supabase except ImportError: - subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'supabase']) + subprocess.check_call([sys.executable, "-m", "pip", "install", "supabase"]) import supabase - self.supabase_client = supabase.create_client(self.supabase_url, self.supabase_key) + self.supabase_client = supabase.create_client( + self.supabase_url, self.supabase_key + ) def price_calculator(self, model, response_obj, start_time, end_time): # try and find if the model is in the model_cost map @@ -45,17 +110,23 @@ class Supabase: prompt_tokens_cost_usd_dollar = 0 completion_tokens_cost_usd_dollar = 0 if model in model_cost: - prompt_tokens_cost_usd_dollar = model_cost[model]["input_cost_per_token"] * response_obj["usage"]["prompt_tokens"] - completion_tokens_cost_usd_dollar = model_cost[model]["output_cost_per_token"] * response_obj["usage"]["completion_tokens"] - elif "replicate" in model: + prompt_tokens_cost_usd_dollar = ( + model_cost[model]["input_cost_per_token"] + * response_obj["usage"]["prompt_tokens"] + ) + completion_tokens_cost_usd_dollar = ( + model_cost[model]["output_cost_per_token"] + * response_obj["usage"]["completion_tokens"] + ) + elif "replicate" in model: # replicate models are charged based on time # llama 2 runs on an nvidia a100 which costs $0.0032 per second - https://replicate.com/replicate/llama-2-70b-chat - model_run_time = end_time - start_time # assuming time in seconds + model_run_time = end_time - start_time # assuming time in seconds cost_usd_dollar = model_run_time * 0.0032 prompt_tokens_cost_usd_dollar = cost_usd_dollar / 2 completion_tokens_cost_usd_dollar = cost_usd_dollar / 2 else: - # calculate average input cost + # calculate average input cost input_cost_sum = 0 output_cost_sum = 0 for model in model_cost: @@ -63,41 +134,75 @@ class Supabase: output_cost_sum += model_cost[model]["output_cost_per_token"] avg_input_cost = input_cost_sum / len(model_cost.keys()) avg_output_cost = output_cost_sum / len(model_cost.keys()) - prompt_tokens_cost_usd_dollar = model_cost[model]["input_cost_per_token"] * response_obj["usage"]["prompt_tokens"] - completion_tokens_cost_usd_dollar = model_cost[model]["output_cost_per_token"] * response_obj["usage"]["completion_tokens"] + prompt_tokens_cost_usd_dollar = ( + model_cost[model]["input_cost_per_token"] + * response_obj["usage"]["prompt_tokens"] + ) + completion_tokens_cost_usd_dollar = ( + model_cost[model]["output_cost_per_token"] + * response_obj["usage"]["completion_tokens"] + ) return prompt_tokens_cost_usd_dollar, completion_tokens_cost_usd_dollar - - def log_event(self, model, 
messages, end_user, response_obj, start_time, end_time, print_verbose): + + def log_event( + self, + model, + messages, + end_user, + response_obj, + start_time, + end_time, + print_verbose, + ): try: - print_verbose(f"Supabase Logging - Enters logging function for model {model}, response_obj: {response_obj}") + print_verbose( + f"Supabase Logging - Enters logging function for model {model}, response_obj: {response_obj}" + ) - prompt_tokens_cost_usd_dollar, completion_tokens_cost_usd_dollar = self.price_calculator(model, response_obj, start_time, end_time) - total_cost = prompt_tokens_cost_usd_dollar + completion_tokens_cost_usd_dollar + ( + prompt_tokens_cost_usd_dollar, + completion_tokens_cost_usd_dollar, + ) = self.price_calculator(model, response_obj, start_time, end_time) + total_cost = ( + prompt_tokens_cost_usd_dollar + completion_tokens_cost_usd_dollar + ) - response_time = (end_time-start_time).total_seconds() + response_time = (end_time - start_time).total_seconds() if "choices" in response_obj: supabase_data_obj = { "response_time": response_time, "model": response_obj["model"], - "total_cost": total_cost, + "total_cost": total_cost, "messages": messages, - "response": response_obj['choices'][0]['message']['content'], - "end_user": end_user + "response": response_obj["choices"][0]["message"]["content"], + "end_user": end_user, } - print_verbose(f"Supabase Logging - final data object: {supabase_data_obj}") - data, count = self.supabase_client.table(self.supabase_table_name).insert(supabase_data_obj).execute() + print_verbose( + f"Supabase Logging - final data object: {supabase_data_obj}" + ) + data, count = ( + self.supabase_client.table(self.supabase_table_name) + .insert(supabase_data_obj) + .execute() + ) elif "error" in response_obj: supabase_data_obj = { "response_time": response_time, "model": response_obj["model"], - "total_cost": total_cost, + "total_cost": total_cost, "messages": messages, - "error": response_obj['error'], - "end_user": end_user + "error": response_obj["error"], + "end_user": end_user, } - print_verbose(f"Supabase Logging - final data object: {supabase_data_obj}") - data, count = self.supabase_client.table(self.supabase_table_name).insert(supabase_data_obj).execute() - + print_verbose( + f"Supabase Logging - final data object: {supabase_data_obj}" + ) + data, count = ( + self.supabase_client.table(self.supabase_table_name) + .insert(supabase_data_obj) + .execute() + ) + except: # traceback.print_exc() print_verbose(f"Supabase Logging Error - {traceback.format_exc()}") diff --git a/litellm/llms/__init__.py b/litellm/llms/__init__.py index b9742821a..b6e690fd5 100644 --- a/litellm/llms/__init__.py +++ b/litellm/llms/__init__.py @@ -1 +1 @@ -from . import * \ No newline at end of file +from . 
import * diff --git a/litellm/llms/anthropic.py b/litellm/llms/anthropic.py index 67666ee92..c90b61a11 100644 --- a/litellm/llms/anthropic.py +++ b/litellm/llms/anthropic.py @@ -2,54 +2,77 @@ import os, json from enum import Enum import requests from litellm import logging -import time +import time from typing import Callable from litellm.utils import ModelResponse + class AnthropicConstants(Enum): HUMAN_PROMPT = "\n\nHuman:" AI_PROMPT = "\n\nAssistant:" + class AnthropicError(Exception): def __init__(self, status_code, message): self.status_code = status_code self.message = message - super().__init__(self.message) # Call the base class constructor with the parameters it needs + super().__init__( + self.message + ) # Call the base class constructor with the parameters it needs -class AnthropicLLM: - + +class AnthropicLLM: def __init__(self, encoding, default_max_tokens_to_sample, api_key=None): self.encoding = encoding self.default_max_tokens_to_sample = default_max_tokens_to_sample self.completion_url = "https://api.anthropic.com/v1/complete" self.api_key = api_key self.validate_environment(api_key=api_key) - - def validate_environment(self, api_key): # set up the environment required to run the model + + def validate_environment( + self, api_key + ): # set up the environment required to run the model # set the api key if self.api_key == None: - raise ValueError("Missing Anthropic API Key - A call is being made to anthropic but no key is set either in the environment variables or via params") + raise ValueError( + "Missing Anthropic API Key - A call is being made to anthropic but no key is set either in the environment variables or via params" + ) self.api_key = api_key self.headers = { "accept": "application/json", "anthropic-version": "2023-06-01", "content-type": "application/json", - "x-api-key": self.api_key + "x-api-key": self.api_key, } - def completion(self, model: str, messages: list, model_response: ModelResponse, print_verbose: Callable, optional_params=None, litellm_params=None, logger_fn=None): # logic for parsing in - calling - parsing out model completion calls + def completion( + self, + model: str, + messages: list, + model_response: ModelResponse, + print_verbose: Callable, + optional_params=None, + litellm_params=None, + logger_fn=None, + ): # logic for parsing in - calling - parsing out model completion calls model = model prompt = f"{AnthropicConstants.HUMAN_PROMPT.value}" for message in messages: if "role" in message: if message["role"] == "user": - prompt += f"{AnthropicConstants.HUMAN_PROMPT.value}{message['content']}" + prompt += ( + f"{AnthropicConstants.HUMAN_PROMPT.value}{message['content']}" + ) else: - prompt += f"{AnthropicConstants.AI_PROMPT.value}{message['content']}" + prompt += ( + f"{AnthropicConstants.AI_PROMPT.value}{message['content']}" + ) else: prompt += f"{AnthropicConstants.HUMAN_PROMPT.value}{message['content']}" prompt += f"{AnthropicConstants.AI_PROMPT.value}" - if "max_tokens" in optional_params and optional_params["max_tokens"] != float('inf'): + if "max_tokens" in optional_params and optional_params["max_tokens"] != float( + "inf" + ): max_tokens = optional_params["max_tokens"] else: max_tokens = self.default_max_tokens_to_sample @@ -57,39 +80,66 @@ class AnthropicLLM: "model": model, "prompt": prompt, "max_tokens_to_sample": max_tokens, - **optional_params + **optional_params, } ## LOGGING - logging(model=model, input=prompt, additional_args={"litellm_params": litellm_params, "optional_params": optional_params}, logger_fn=logger_fn) + 
logging( + model=model, + input=prompt, + additional_args={ + "litellm_params": litellm_params, + "optional_params": optional_params, + }, + logger_fn=logger_fn, + ) ## COMPLETION CALL - response = requests.post(self.completion_url, headers=self.headers, data=json.dumps(data)) + response = requests.post( + self.completion_url, headers=self.headers, data=json.dumps(data) + ) if "stream" in optional_params and optional_params["stream"] == True: return response.iter_lines() else: ## LOGGING - logging(model=model, input=prompt, additional_args={"litellm_params": litellm_params, "optional_params": optional_params, "original_response": response.text}, logger_fn=logger_fn) + logging( + model=model, + input=prompt, + additional_args={ + "litellm_params": litellm_params, + "optional_params": optional_params, + "original_response": response.text, + }, + logger_fn=logger_fn, + ) print_verbose(f"raw model_response: {response.text}") ## RESPONSE OBJECT completion_response = response.json() if "error" in completion_response: - raise AnthropicError(message=completion_response["error"], status_code=response.status_code) + raise AnthropicError( + message=completion_response["error"], + status_code=response.status_code, + ) else: - model_response["choices"][0]["message"]["content"] = completion_response["completion"] - + model_response["choices"][0]["message"][ + "content" + ] = completion_response["completion"] + ## CALCULATING USAGE - prompt_tokens = len(self.encoding.encode(prompt)) ##[TODO] use the anthropic tokenizer here - completion_tokens = len(self.encoding.encode(model_response["choices"][0]["message"]["content"])) ##[TODO] use the anthropic tokenizer here - - + prompt_tokens = len( + self.encoding.encode(prompt) + ) ##[TODO] use the anthropic tokenizer here + completion_tokens = len( + self.encoding.encode(model_response["choices"][0]["message"]["content"]) + ) ##[TODO] use the anthropic tokenizer here + model_response["created"] = time.time() model_response["model"] = model model_response["usage"] = { "prompt_tokens": prompt_tokens, "completion_tokens": completion_tokens, - "total_tokens": prompt_tokens + completion_tokens - } + "total_tokens": prompt_tokens + completion_tokens, + } return model_response - - def embedding(): # logic for parsing in - calling - parsing out model embedding calls - pass \ No newline at end of file + + def embedding(): # logic for parsing in - calling - parsing out model embedding calls + pass diff --git a/litellm/llms/base.py b/litellm/llms/base.py index 368df9624..bde09f2fb 100644 --- a/litellm/llms/base.py +++ b/litellm/llms/base.py @@ -1,11 +1,12 @@ ## This is a template base class to be used for adding new LLM providers via API calls -class BaseLLM(): - def validate_environment(): # set up the environment required to run the model - pass - def completion(): # logic for parsing in - calling - parsing out model completion calls +class BaseLLM: + def validate_environment(): # set up the environment required to run the model pass - def embedding(): # logic for parsing in - calling - parsing out model embedding calls - pass \ No newline at end of file + def completion(): # logic for parsing in - calling - parsing out model completion calls + pass + + def embedding(): # logic for parsing in - calling - parsing out model embedding calls + pass diff --git a/litellm/llms/huggingface_restapi.py b/litellm/llms/huggingface_restapi.py index 30d67727f..974a6c049 100644 --- a/litellm/llms/huggingface_restapi.py +++ b/litellm/llms/huggingface_restapi.py @@ -3,31 +3,47 @@ import 
os, json from enum import Enum import requests from litellm import logging -import time +import time from typing import Callable from litellm.utils import ModelResponse + class HuggingfaceError(Exception): def __init__(self, status_code, message): self.status_code = status_code self.message = message - super().__init__(self.message) # Call the base class constructor with the parameters it needs + super().__init__( + self.message + ) # Call the base class constructor with the parameters it needs -class HuggingfaceRestAPILLM(): + +class HuggingfaceRestAPILLM: def __init__(self, encoding, api_key=None) -> None: self.encoding = encoding self.validate_environment(api_key=api_key) - def validate_environment(self, api_key): # set up the environment required to run the model + def validate_environment( + self, api_key + ): # set up the environment required to run the model self.headers = { "content-type": "application/json", } # get the api key if it exists in the environment or is passed in, but don't require it self.api_key = api_key if self.api_key != None: - self.headers["Authorization"] = f"Bearer {self.api_key}" + self.headers["Authorization"] = f"Bearer {self.api_key}" - def completion(self, model: str, messages: list, custom_api_base: str, model_response: ModelResponse, print_verbose: Callable, optional_params=None, litellm_params=None, logger_fn=None): # logic for parsing in - calling - parsing out model completion calls + def completion( + self, + model: str, + messages: list, + custom_api_base: str, + model_response: ModelResponse, + print_verbose: Callable, + optional_params=None, + litellm_params=None, + logger_fn=None, + ): # logic for parsing in - calling - parsing out model completion calls if custom_api_base: completion_url = custom_api_base elif "HF_API_BASE" in os.environ: @@ -35,7 +51,9 @@ class HuggingfaceRestAPILLM(): else: completion_url = f"https://api-inference.huggingface.co/models/{model}" prompt = "" - if "meta-llama" in model and "chat" in model: # use the required special tokens for meta-llama - https://huggingface.co/blog/llama2#how-to-prompt-llama-2 + if ( + "meta-llama" in model and "chat" in model + ): # use the required special tokens for meta-llama - https://huggingface.co/blog/llama2#how-to-prompt-llama-2 prompt = "" for message in messages: if message["role"] == "system": @@ -47,8 +65,8 @@ class HuggingfaceRestAPILLM(): else: for message in messages: prompt += f"{message['content']}" - ### MAP INPUT PARAMS - # max tokens + ### MAP INPUT PARAMS + # max tokens if "max_tokens" in optional_params: value = optional_params.pop("max_tokens") optional_params["max_new_tokens"] = value @@ -57,14 +75,33 @@ class HuggingfaceRestAPILLM(): # "parameters": optional_params } ## LOGGING - logging(model=model, input=prompt, additional_args={"litellm_params": litellm_params, "optional_params": optional_params}, logger_fn=logger_fn) + logging( + model=model, + input=prompt, + additional_args={ + "litellm_params": litellm_params, + "optional_params": optional_params, + }, + logger_fn=logger_fn, + ) ## COMPLETION CALL - response = requests.post(completion_url, headers=self.headers, data=json.dumps(data)) + response = requests.post( + completion_url, headers=self.headers, data=json.dumps(data) + ) if "stream" in optional_params and optional_params["stream"] == True: return response.iter_lines() else: ## LOGGING - logging(model=model, input=prompt, additional_args={"litellm_params": litellm_params, "optional_params": optional_params, "original_response": response.text}, 
logger_fn=logger_fn) + logging( + model=model, + input=prompt, + additional_args={ + "litellm_params": litellm_params, + "optional_params": optional_params, + "original_response": response.text, + }, + logger_fn=logger_fn, + ) print_verbose(f"raw model_response: {response.text}") ## RESPONSE OBJECT completion_response = response.json() @@ -72,24 +109,32 @@ class HuggingfaceRestAPILLM(): if isinstance(completion_response, dict) and "error" in completion_response: print_verbose(f"completion error: {completion_response['error']}") print_verbose(f"response.status_code: {response.status_code}") - raise HuggingfaceError(message=completion_response["error"], status_code=response.status_code) + raise HuggingfaceError( + message=completion_response["error"], + status_code=response.status_code, + ) else: - model_response["choices"][0]["message"]["content"] = completion_response[0]["generated_text"] - + model_response["choices"][0]["message"][ + "content" + ] = completion_response[0]["generated_text"] + ## CALCULATING USAGE - prompt_tokens = len(self.encoding.encode(prompt)) ##[TODO] use the llama2 tokenizer here - completion_tokens = len(self.encoding.encode(model_response["choices"][0]["message"]["content"])) ##[TODO] use the llama2 tokenizer here - - + prompt_tokens = len( + self.encoding.encode(prompt) + ) ##[TODO] use the llama2 tokenizer here + completion_tokens = len( + self.encoding.encode(model_response["choices"][0]["message"]["content"]) + ) ##[TODO] use the llama2 tokenizer here + model_response["created"] = time.time() model_response["model"] = model model_response["usage"] = { "prompt_tokens": prompt_tokens, "completion_tokens": completion_tokens, - "total_tokens": prompt_tokens + completion_tokens - } + "total_tokens": prompt_tokens + completion_tokens, + } return model_response pass - def embedding(): # logic for parsing in - calling - parsing out model embedding calls - pass \ No newline at end of file + def embedding(): # logic for parsing in - calling - parsing out model embedding calls + pass diff --git a/litellm/main.py b/litellm/main.py index 9a809b098..713a21ed6 100644 --- a/litellm/main.py +++ b/litellm/main.py @@ -4,533 +4,865 @@ from functools import partial import dotenv, traceback, random, asyncio, time from copy import deepcopy import litellm -from litellm import client, logging, exception_type, timeout, get_optional_params, get_litellm_params -from litellm.utils import get_secret, install_and_import, CustomStreamWrapper, read_config_args +from litellm import ( + client, + logging, + exception_type, + timeout, + get_optional_params, + get_litellm_params, +) +from litellm.utils import ( + get_secret, + install_and_import, + CustomStreamWrapper, + read_config_args, +) from .llms.anthropic import AnthropicLLM from .llms.huggingface_restapi import HuggingfaceRestAPILLM import tiktoken from concurrent.futures import ThreadPoolExecutor + encoding = tiktoken.get_encoding("cl100k_base") -from litellm.utils import get_secret, install_and_import, CustomStreamWrapper, ModelResponse, read_config_args -from litellm.utils import get_ollama_response_stream, stream_to_string, together_ai_completion_streaming +from litellm.utils import ( + get_secret, + install_and_import, + CustomStreamWrapper, + ModelResponse, + read_config_args, +) +from litellm.utils import ( + get_ollama_response_stream, + stream_to_string, + together_ai_completion_streaming, +) + ####### ENVIRONMENT VARIABLES ################### -dotenv.load_dotenv() # Loading env variables using dotenv +dotenv.load_dotenv() # 
Loading env variables using dotenv + + ####### COMPLETION ENDPOINTS ################ ############################################# async def acompletion(*args, **kwargs): - loop = asyncio.get_event_loop() - - # Use a partial function to pass your keyword arguments - func = partial(completion, *args, **kwargs) + loop = asyncio.get_event_loop() + + # Use a partial function to pass your keyword arguments + func = partial(completion, *args, **kwargs) + + # Call the synchronous function using run_in_executor + return await loop.run_in_executor(None, func) - # Call the synchronous function using run_in_executor - return await loop.run_in_executor(None, func) @client # @retry(wait=wait_random_exponential(min=1, max=60), stop=stop_after_attempt(2), reraise=True, retry_error_callback=lambda retry_state: setattr(retry_state.outcome, 'retry_variable', litellm.retry)) # retry call, turn this off by setting `litellm.retry = False` -@timeout(600) ## set timeouts, in case calls hang (e.g. Azure) - default is 60s, override with `force_timeout` +@timeout( + 600 +) ## set timeouts, in case calls hang (e.g. Azure) - default is 60s, override with `force_timeout` def completion( - model, messages,# required params + model, + messages, # required params # Optional OpenAI params: see https://platform.openai.com/docs/api-reference/chat/create - functions=[], function_call="", # optional params - temperature=1, top_p=1, n=1, stream=False, stop=None, max_tokens=float('inf'), - presence_penalty=0, frequency_penalty=0, logit_bias={}, user="", deployment_id=None, + functions=[], + function_call="", # optional params + temperature=1, + top_p=1, + n=1, + stream=False, + stop=None, + max_tokens=float("inf"), + presence_penalty=0, + frequency_penalty=0, + logit_bias={}, + user="", + deployment_id=None, # Optional liteLLM function params - *, return_async=False, api_key=None, force_timeout=600, logger_fn=None, verbose=False, azure=False, custom_llm_provider=None, custom_api_base=None, + *, + return_async=False, + api_key=None, + force_timeout=600, + logger_fn=None, + verbose=False, + azure=False, + custom_llm_provider=None, + custom_api_base=None, # model specific optional params # used by text-bison only - top_k=40, request_timeout=0, # unused var for old version of OpenAI API - ) -> ModelResponse: - try: - model_response = ModelResponse() - if azure: # this flag is deprecated, remove once notebooks are also updated. 
- custom_llm_provider="azure" - args = locals() - # check if user passed in any of the OpenAI optional params - optional_params = get_optional_params( - functions=functions, function_call=function_call, - temperature=temperature, top_p=top_p, n=n, stream=stream, stop=stop, max_tokens=max_tokens, - presence_penalty=presence_penalty, frequency_penalty=frequency_penalty, logit_bias=logit_bias, user=user, deployment_id=deployment_id, - # params to identify the model - model=model, custom_llm_provider=custom_llm_provider, top_k=top_k, - ) - # For logging - save the values of the litellm-specific params passed in - litellm_params = get_litellm_params( - return_async=return_async, api_key=api_key, force_timeout=force_timeout, - logger_fn=logger_fn, verbose=verbose, custom_llm_provider=custom_llm_provider, - custom_api_base=custom_api_base) - - if custom_llm_provider == "azure": - # azure configs - openai.api_type = "azure" - openai.api_base = litellm.api_base if litellm.api_base is not None else get_secret("AZURE_API_BASE") - openai.api_version = litellm.api_version if litellm.api_version is not None else get_secret("AZURE_API_VERSION") - # set key - openai.api_key = api_key or litellm.azure_key or get_secret("AZURE_API_KEY") - ## LOGGING - logging(model=model, input=messages, additional_args=optional_params, custom_llm_provider=custom_llm_provider, logger_fn=logger_fn) - ## COMPLETION CALL - if litellm.headers: - response = openai.ChatCompletion.create( - engine=model, - messages = messages, - headers = litellm.headers, - **optional_params, - ) - else: - response = openai.ChatCompletion.create( - model=model, - messages = messages, - **optional_params - ) - elif model in litellm.open_ai_chat_completion_models or custom_llm_provider == "custom_openai": # allow user to make an openai call with a custom base - openai.api_type = "openai" - # note: if a user sets a custom base - we should ensure this works - api_base = custom_api_base if custom_api_base is not None else litellm.api_base # allow for the setting of dynamic and stateful api-bases - openai.api_base = api_base if api_base is not None else "https://api.openai.com/v1" - openai.api_version = None - if litellm.organization: - openai.organization = litellm.organization - # set API KEY - openai.api_key = api_key or litellm.openai_key or get_secret("OPENAI_API_KEY") - - ## LOGGING - logging(model=model, input=messages, additional_args=args, custom_llm_provider=custom_llm_provider, logger_fn=logger_fn) - ## COMPLETION CALL - if litellm.headers: - response = openai.ChatCompletion.create( - model=model, - messages = messages, - headers = litellm.headers, - **optional_params - ) - else: - response = openai.ChatCompletion.create( - model=model, - messages = messages, - **optional_params - ) - elif model in litellm.open_ai_text_completion_models: - openai.api_type = "openai" - openai.api_base = litellm.api_base if litellm.api_base is not None else "https://api.openai.com/v1" - openai.api_version = None - openai.api_key = api_key or litellm.openai_key or get_secret("OPENAI_API_KEY") - if litellm.organization: - openai.organization = litellm.organization - prompt = " ".join([message["content"] for message in messages]) - ## LOGGING - logging(model=model, input=prompt, additional_args=optional_params, custom_llm_provider=custom_llm_provider, logger_fn=logger_fn) - ## COMPLETION CALL - if litellm.headers: - response = openai.Completion.create( - model=model, - prompt = prompt, - headers = litellm.headers, - ) - else: - response = openai.Completion.create( 
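As a usage note for the completion() branches reformatted in this hunk: the same entry point can be pointed at any OpenAI-compatible server via custom_llm_provider and custom_api_base. A minimal sketch, assuming an OpenAI key in the environment; the localhost URL is a placeholder and not part of this diff:

import litellm

# route an OpenAI-format call through a custom, OpenAI-compatible base URL
response = litellm.completion(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "Hello, how are you?"}],
    custom_llm_provider="custom_openai",
    custom_api_base="http://localhost:8000/v1",  # placeholder proxy endpoint
)
print(response["choices"][0]["message"]["content"])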
+ top_k=40, + request_timeout=0, # unused var for old version of OpenAI API +) -> ModelResponse: + try: + model_response = ModelResponse() + if azure: # this flag is deprecated, remove once notebooks are also updated. + custom_llm_provider = "azure" + args = locals() + # check if user passed in any of the OpenAI optional params + optional_params = get_optional_params( + functions=functions, + function_call=function_call, + temperature=temperature, + top_p=top_p, + n=n, + stream=stream, + stop=stop, + max_tokens=max_tokens, + presence_penalty=presence_penalty, + frequency_penalty=frequency_penalty, + logit_bias=logit_bias, + user=user, + deployment_id=deployment_id, + # params to identify the model model=model, - prompt = prompt + custom_llm_provider=custom_llm_provider, + top_k=top_k, ) - completion_response = response["choices"][0]["text"] - ## LOGGING - logging(model=model, input=prompt, custom_llm_provider=custom_llm_provider, additional_args={"max_tokens": max_tokens, "original_response": completion_response}, logger_fn=logger_fn) - ## RESPONSE OBJECT - model_response["choices"][0]["message"]["content"] = completion_response - model_response["created"] = response["created"] - model_response["model"] = model - model_response["usage"] = response["usage"] - response = model_response - elif "replicate" in model or custom_llm_provider == "replicate": - # import replicate/if it fails then pip install replicate - install_and_import("replicate") - import replicate - # Setting the relevant API KEY for replicate, replicate defaults to using os.environ.get("REPLICATE_API_TOKEN") - replicate_key = os.environ.get("REPLICATE_API_TOKEN") - if replicate_key == None: - # user did not set REPLICATE_API_TOKEN in .env - replicate_key = get_secret("REPLICATE_API_KEY") or get_secret("REPLICATE_API_TOKEN") or api_key or litellm.replicate_key - # set replicate kye - os.environ["REPLICATE_API_TOKEN"] = replicate_key - prompt = " ".join([message["content"] for message in messages]) - input = {"prompt": prompt} - if "max_tokens" in optional_params: - input["max_length"] = max_tokens # for t5 models - input["max_new_tokens"] = max_tokens # for llama2 models - ## LOGGING - logging(model=model, input=input, custom_llm_provider=custom_llm_provider, additional_args={"max_tokens": max_tokens}, logger_fn=logger_fn) - ## COMPLETION CALL - output = replicate.run( - model, - input=input) - if 'stream' in optional_params and optional_params['stream'] == True: - # don't try to access stream object, - # let the stream handler know this is replicate - response = CustomStreamWrapper(output, "replicate") - return response - response = "" - for item in output: - response += item - completion_response = response - ## LOGGING - logging(model=model, input=prompt, custom_llm_provider=custom_llm_provider, additional_args={"max_tokens": max_tokens, "original_response": completion_response}, logger_fn=logger_fn) - prompt_tokens = len(encoding.encode(prompt)) - completion_tokens = len(encoding.encode(completion_response)) - ## RESPONSE OBJECT - model_response["choices"][0]["message"]["content"] = completion_response - model_response["created"] = time.time() - model_response["model"] = model - model_response["usage"] = { - "prompt_tokens": prompt_tokens, - "completion_tokens": completion_tokens, - "total_tokens": prompt_tokens + completion_tokens - } - response = model_response - elif model in litellm.anthropic_models: - anthropic_key = api_key or litellm.anthropic_key or os.environ.get("ANTHROPIC_API_KEY") - anthropic_client = 
AnthropicLLM(encoding=encoding, default_max_tokens_to_sample=litellm.max_tokens, api_key=anthropic_key) - model_response = anthropic_client.completion(model=model, messages=messages, model_response=model_response, print_verbose=print_verbose, optional_params=optional_params, litellm_params=litellm_params, logger_fn=logger_fn) - if 'stream' in optional_params and optional_params['stream'] == True: - # don't try to access stream object, - response = CustomStreamWrapper(model_response, model) - return response - response = model_response - elif model in litellm.openrouter_models or custom_llm_provider == "openrouter": - openai.api_type = "openai" - # not sure if this will work after someone first uses another API - openai.api_base = litellm.api_base if litellm.api_base is not None else "https://openrouter.ai/api/v1" - openai.api_version = None - if litellm.organization: - openai.organization = litellm.organization - if api_key: - openai.api_key = api_key - elif litellm.openrouter_key: - openai.api_key = litellm.openrouter_key - else: - openai.api_key = get_secret("OPENROUTER_API_KEY") or get_secret("OR_API_KEY") - ## LOGGING - logging(model=model, input=messages, additional_args=optional_params, custom_llm_provider=custom_llm_provider, logger_fn=logger_fn) - ## COMPLETION CALL - if litellm.headers: - response = openai.ChatCompletion.create( - model=model, - messages = messages, - headers = litellm.headers, - **optional_params + # For logging - save the values of the litellm-specific params passed in + litellm_params = get_litellm_params( + return_async=return_async, + api_key=api_key, + force_timeout=force_timeout, + logger_fn=logger_fn, + verbose=verbose, + custom_llm_provider=custom_llm_provider, + custom_api_base=custom_api_base, ) - else: - openrouter_site_url = get_secret("OR_SITE_URL") - openrouter_app_name = get_secret("OR_APP_NAME") - # if openrouter_site_url is None, set it to https://litellm.ai - if openrouter_site_url is None: - openrouter_site_url = "https://litellm.ai" - # if openrouter_app_name is None, set it to liteLLM - if openrouter_app_name is None: - openrouter_app_name = "liteLLM" - response = openai.ChatCompletion.create( - model=model, - messages = messages, - headers = - { - "HTTP-Referer": openrouter_site_url, # To identify your site - "X-Title": openrouter_app_name # To identify your app - }, - **optional_params + + if custom_llm_provider == "azure": + # azure configs + openai.api_type = "azure" + openai.api_base = ( + litellm.api_base + if litellm.api_base is not None + else get_secret("AZURE_API_BASE") + ) + openai.api_version = ( + litellm.api_version + if litellm.api_version is not None + else get_secret("AZURE_API_VERSION") + ) + # set key + openai.api_key = api_key or litellm.azure_key or get_secret("AZURE_API_KEY") + ## LOGGING + logging( + model=model, + input=messages, + additional_args=optional_params, + custom_llm_provider=custom_llm_provider, + logger_fn=logger_fn, + ) + ## COMPLETION CALL + if litellm.headers: + response = openai.ChatCompletion.create( + engine=model, + messages=messages, + headers=litellm.headers, + **optional_params, + ) + else: + response = openai.ChatCompletion.create( + model=model, messages=messages, **optional_params + ) + elif ( + model in litellm.open_ai_chat_completion_models + or custom_llm_provider == "custom_openai" + ): # allow user to make an openai call with a custom base + openai.api_type = "openai" + # note: if a user sets a custom base - we should ensure this works + api_base = ( + custom_api_base if custom_api_base is 
not None else litellm.api_base + ) # allow for the setting of dynamic and stateful api-bases + openai.api_base = ( + api_base if api_base is not None else "https://api.openai.com/v1" + ) + openai.api_version = None + if litellm.organization: + openai.organization = litellm.organization + # set API KEY + openai.api_key = ( + api_key or litellm.openai_key or get_secret("OPENAI_API_KEY") + ) + + ## LOGGING + logging( + model=model, + input=messages, + additional_args=args, + custom_llm_provider=custom_llm_provider, + logger_fn=logger_fn, + ) + ## COMPLETION CALL + if litellm.headers: + response = openai.ChatCompletion.create( + model=model, + messages=messages, + headers=litellm.headers, + **optional_params, + ) + else: + response = openai.ChatCompletion.create( + model=model, messages=messages, **optional_params + ) + elif model in litellm.open_ai_text_completion_models: + openai.api_type = "openai" + openai.api_base = ( + litellm.api_base + if litellm.api_base is not None + else "https://api.openai.com/v1" + ) + openai.api_version = None + openai.api_key = ( + api_key or litellm.openai_key or get_secret("OPENAI_API_KEY") + ) + if litellm.organization: + openai.organization = litellm.organization + prompt = " ".join([message["content"] for message in messages]) + ## LOGGING + logging( + model=model, + input=prompt, + additional_args=optional_params, + custom_llm_provider=custom_llm_provider, + logger_fn=logger_fn, + ) + ## COMPLETION CALL + if litellm.headers: + response = openai.Completion.create( + model=model, + prompt=prompt, + headers=litellm.headers, + ) + else: + response = openai.Completion.create(model=model, prompt=prompt) + completion_response = response["choices"][0]["text"] + ## LOGGING + logging( + model=model, + input=prompt, + custom_llm_provider=custom_llm_provider, + additional_args={ + "max_tokens": max_tokens, + "original_response": completion_response, + }, + logger_fn=logger_fn, + ) + ## RESPONSE OBJECT + model_response["choices"][0]["message"]["content"] = completion_response + model_response["created"] = response["created"] + model_response["model"] = model + model_response["usage"] = response["usage"] + response = model_response + elif "replicate" in model or custom_llm_provider == "replicate": + # import replicate/if it fails then pip install replicate + install_and_import("replicate") + import replicate + + # Setting the relevant API KEY for replicate, replicate defaults to using os.environ.get("REPLICATE_API_TOKEN") + replicate_key = os.environ.get("REPLICATE_API_TOKEN") + if replicate_key == None: + # user did not set REPLICATE_API_TOKEN in .env + replicate_key = ( + get_secret("REPLICATE_API_KEY") + or get_secret("REPLICATE_API_TOKEN") + or api_key + or litellm.replicate_key + ) + # set replicate kye + os.environ["REPLICATE_API_TOKEN"] = replicate_key + prompt = " ".join([message["content"] for message in messages]) + input = {"prompt": prompt} + if "max_tokens" in optional_params: + input["max_length"] = max_tokens # for t5 models + input["max_new_tokens"] = max_tokens # for llama2 models + ## LOGGING + logging( + model=model, + input=input, + custom_llm_provider=custom_llm_provider, + additional_args={"max_tokens": max_tokens}, + logger_fn=logger_fn, + ) + ## COMPLETION CALL + output = replicate.run(model, input=input) + if "stream" in optional_params and optional_params["stream"] == True: + # don't try to access stream object, + # let the stream handler know this is replicate + response = CustomStreamWrapper(output, "replicate") + return response + response = 
"" + for item in output: + response += item + completion_response = response + ## LOGGING + logging( + model=model, + input=prompt, + custom_llm_provider=custom_llm_provider, + additional_args={ + "max_tokens": max_tokens, + "original_response": completion_response, + }, + logger_fn=logger_fn, + ) + prompt_tokens = len(encoding.encode(prompt)) + completion_tokens = len(encoding.encode(completion_response)) + ## RESPONSE OBJECT + model_response["choices"][0]["message"]["content"] = completion_response + model_response["created"] = time.time() + model_response["model"] = model + model_response["usage"] = { + "prompt_tokens": prompt_tokens, + "completion_tokens": completion_tokens, + "total_tokens": prompt_tokens + completion_tokens, + } + response = model_response + elif model in litellm.anthropic_models: + anthropic_key = ( + api_key or litellm.anthropic_key or os.environ.get("ANTHROPIC_API_KEY") + ) + anthropic_client = AnthropicLLM( + encoding=encoding, + default_max_tokens_to_sample=litellm.max_tokens, + api_key=anthropic_key, + ) + model_response = anthropic_client.completion( + model=model, + messages=messages, + model_response=model_response, + print_verbose=print_verbose, + optional_params=optional_params, + litellm_params=litellm_params, + logger_fn=logger_fn, + ) + if "stream" in optional_params and optional_params["stream"] == True: + # don't try to access stream object, + response = CustomStreamWrapper(model_response, model) + return response + response = model_response + elif model in litellm.openrouter_models or custom_llm_provider == "openrouter": + openai.api_type = "openai" + # not sure if this will work after someone first uses another API + openai.api_base = ( + litellm.api_base + if litellm.api_base is not None + else "https://openrouter.ai/api/v1" + ) + openai.api_version = None + if litellm.organization: + openai.organization = litellm.organization + if api_key: + openai.api_key = api_key + elif litellm.openrouter_key: + openai.api_key = litellm.openrouter_key + else: + openai.api_key = get_secret("OPENROUTER_API_KEY") or get_secret( + "OR_API_KEY" + ) + ## LOGGING + logging( + model=model, + input=messages, + additional_args=optional_params, + custom_llm_provider=custom_llm_provider, + logger_fn=logger_fn, + ) + ## COMPLETION CALL + if litellm.headers: + response = openai.ChatCompletion.create( + model=model, + messages=messages, + headers=litellm.headers, + **optional_params, + ) + else: + openrouter_site_url = get_secret("OR_SITE_URL") + openrouter_app_name = get_secret("OR_APP_NAME") + # if openrouter_site_url is None, set it to https://litellm.ai + if openrouter_site_url is None: + openrouter_site_url = "https://litellm.ai" + # if openrouter_app_name is None, set it to liteLLM + if openrouter_app_name is None: + openrouter_app_name = "liteLLM" + response = openai.ChatCompletion.create( + model=model, + messages=messages, + headers={ + "HTTP-Referer": openrouter_site_url, # To identify your site + "X-Title": openrouter_app_name, # To identify your app + }, + **optional_params, + ) + elif model in litellm.cohere_models: + # import cohere/if it fails then pip install cohere + install_and_import("cohere") + import cohere + + cohere_key = ( + api_key + or litellm.cohere_key + or get_secret("COHERE_API_KEY") + or get_secret("CO_API_KEY") + ) + co = cohere.Client(cohere_key) + prompt = " ".join([message["content"] for message in messages]) + ## LOGGING + logging( + model=model, + input=prompt, + custom_llm_provider=custom_llm_provider, + logger_fn=logger_fn, + ) + ## 
COMPLETION CALL + response = co.generate(model=model, prompt=prompt, **optional_params) + if "stream" in optional_params and optional_params["stream"] == True: + # don't try to access stream object, + response = CustomStreamWrapper(response, model) + return response + + completion_response = response[0].text + ## LOGGING + logging( + model=model, + input=prompt, + custom_llm_provider=custom_llm_provider, + additional_args={ + "max_tokens": max_tokens, + "original_response": completion_response, + }, + logger_fn=logger_fn, + ) + prompt_tokens = len(encoding.encode(prompt)) + completion_tokens = len(encoding.encode(completion_response)) + ## RESPONSE OBJECT + model_response["choices"][0]["message"]["content"] = completion_response + model_response["created"] = time.time() + model_response["model"] = model + model_response["usage"] = { + "prompt_tokens": prompt_tokens, + "completion_tokens": completion_tokens, + "total_tokens": prompt_tokens + completion_tokens, + } + response = model_response + elif ( + model in litellm.huggingface_models or custom_llm_provider == "huggingface" + ): + custom_llm_provider = "huggingface" + huggingface_key = ( + api_key + or litellm.huggingface_key + or os.environ.get("HF_TOKEN") + or os.environ.get("HUGGINGFACE_API_KEY") + ) + huggingface_client = HuggingfaceRestAPILLM( + encoding=encoding, api_key=huggingface_key + ) + model_response = huggingface_client.completion( + model=model, + messages=messages, + custom_api_base=custom_api_base, + model_response=model_response, + print_verbose=print_verbose, + optional_params=optional_params, + litellm_params=litellm_params, + logger_fn=logger_fn, + ) + if "stream" in optional_params and optional_params["stream"] == True: + # don't try to access stream object, + response = CustomStreamWrapper( + model_response, model, custom_llm_provider="huggingface" + ) + return response + response = model_response + elif custom_llm_provider == "together_ai" or ("togethercomputer" in model): + import requests + + TOGETHER_AI_TOKEN = ( + get_secret("TOGETHER_AI_TOKEN") + or get_secret("TOGETHERAI_API_KEY") + or api_key + or litellm.togetherai_api_key + ) + headers = {"Authorization": f"Bearer {TOGETHER_AI_TOKEN}"} + endpoint = "https://api.together.xyz/inference" + prompt = " ".join( + [message["content"] for message in messages] + ) # TODO: Add chat support for together AI + + ## LOGGING + logging( + model=model, + input=prompt, + custom_llm_provider=custom_llm_provider, + logger_fn=logger_fn, + ) + if stream == True: + return together_ai_completion_streaming( + { + "model": model, + "prompt": prompt, + "request_type": "language-model-inference", + **optional_params, + }, + headers=headers, + ) + res = requests.post( + endpoint, + json={ + "model": model, + "prompt": prompt, + "request_type": "language-model-inference", + **optional_params, + }, + headers=headers, + ) + ## LOGGING + logging( + model=model, + input=prompt, + custom_llm_provider=custom_llm_provider, + additional_args={ + "max_tokens": max_tokens, + "original_response": res.text, + }, + logger_fn=logger_fn, + ) + + # make this safe for reading, if output does not exist raise an error + json_response = res.json() + if "output" not in json_response: + raise Exception( + f"liteLLM: Error Making TogetherAI request, JSON Response {json_response}" + ) + completion_response = json_response["output"]["choices"][0]["text"] + prompt_tokens = len(encoding.encode(prompt)) + completion_tokens = len(encoding.encode(completion_response)) + ## RESPONSE OBJECT + 
model_response["choices"][0]["message"]["content"] = completion_response + model_response["created"] = time.time() + model_response["model"] = model + model_response["usage"] = { + "prompt_tokens": prompt_tokens, + "completion_tokens": completion_tokens, + "total_tokens": prompt_tokens + completion_tokens, + } + response = model_response + elif model in litellm.vertex_chat_models: + # import vertexai/if it fails then pip install vertexai# import cohere/if it fails then pip install cohere + install_and_import("vertexai") + import vertexai + from vertexai.preview.language_models import ChatModel, InputOutputTextPair + + vertexai.init( + project=litellm.vertex_project, location=litellm.vertex_location + ) + # vertexai does not use an API key, it looks for credentials.json in the environment + + prompt = " ".join([message["content"] for message in messages]) + ## LOGGING + logging( + model=model, + input=prompt, + custom_llm_provider=custom_llm_provider, + additional_args={ + "litellm_params": litellm_params, + "optional_params": optional_params, + }, + logger_fn=logger_fn, + ) + + chat_model = ChatModel.from_pretrained(model) + + chat = chat_model.start_chat() + completion_response = chat.send_message(prompt, **optional_params) + + ## LOGGING + logging( + model=model, + input=prompt, + custom_llm_provider=custom_llm_provider, + additional_args={ + "max_tokens": max_tokens, + "original_response": completion_response, + }, + logger_fn=logger_fn, + ) + + ## RESPONSE OBJECT + model_response["choices"][0]["message"]["content"] = completion_response + model_response["created"] = time.time() + model_response["model"] = model + elif model in litellm.vertex_text_models: + # import vertexai/if it fails then pip install vertexai# import cohere/if it fails then pip install cohere + install_and_import("vertexai") + import vertexai + from vertexai.language_models import TextGenerationModel + + vertexai.init( + project=litellm.vertex_project, location=litellm.vertex_location + ) + # vertexai does not use an API key, it looks for credentials.json in the environment + + prompt = " ".join([message["content"] for message in messages]) + ## LOGGING + logging( + model=model, + input=prompt, + custom_llm_provider=custom_llm_provider, + logger_fn=logger_fn, + ) + vertex_model = TextGenerationModel.from_pretrained(model) + completion_response = vertex_model.predict(prompt, **optional_params) + + ## LOGGING + logging( + model=model, + input=prompt, + custom_llm_provider=custom_llm_provider, + additional_args={ + "max_tokens": max_tokens, + "original_response": completion_response, + }, + logger_fn=logger_fn, + ) + + ## RESPONSE OBJECT + model_response["choices"][0]["message"]["content"] = completion_response + model_response["created"] = time.time() + model_response["model"] = model + response = model_response + elif model in litellm.ai21_models: + install_and_import("ai21") + import ai21 + + ai21.api_key = get_secret("AI21_API_KEY") + + prompt = " ".join([message["content"] for message in messages]) + ## LOGGING + logging( + model=model, + input=prompt, + custom_llm_provider=custom_llm_provider, + logger_fn=logger_fn, + ) + + ai21_response = ai21.Completion.execute( + model=model, + prompt=prompt, + ) + completion_response = ai21_response["completions"][0]["data"]["text"] + + ## LOGGING + logging( + model=model, + input=prompt, + custom_llm_provider=custom_llm_provider, + additional_args={ + "max_tokens": max_tokens, + "original_response": completion_response, + }, + logger_fn=logger_fn, + ) + + ## RESPONSE OBJECT + 
model_response["choices"][0]["message"]["content"] = completion_response + model_response["created"] = time.time() + model_response["model"] = model + response = model_response + elif custom_llm_provider == "ollama": + endpoint = ( + litellm.api_base if litellm.api_base is not None else custom_api_base + ) + prompt = " ".join([message["content"] for message in messages]) + + ## LOGGING + logging(model=model, input=prompt, azure=azure, logger_fn=logger_fn) + generator = get_ollama_response_stream(endpoint, model, prompt) + # assume all responses are streamed + return generator + elif ( + custom_llm_provider == "baseten" + or litellm.api_base == "https://app.baseten.co" + ): + import baseten + + base_ten_key = get_secret("BASETEN_API_KEY") + baseten.login(base_ten_key) + + prompt = " ".join([message["content"] for message in messages]) + ## LOGGING + logging( + model=model, + input=prompt, + custom_llm_provider=custom_llm_provider, + logger_fn=logger_fn, + ) + + base_ten__model = baseten.deployed_model_version_id(model) + + completion_response = base_ten__model.predict({"prompt": prompt}) + if type(completion_response) == dict: + completion_response = completion_response["data"] + if type(completion_response) == dict: + completion_response = completion_response["generated_text"] + + logging( + model=model, + input=prompt, + custom_llm_provider=custom_llm_provider, + additional_args={ + "max_tokens": max_tokens, + "original_response": completion_response, + }, + logger_fn=logger_fn, + ) + + ## RESPONSE OBJECT + model_response["choices"][0]["message"]["content"] = completion_response + model_response["created"] = time.time() + model_response["model"] = model + response = model_response + + elif custom_llm_provider == "petals" or ( + litellm.api_base and "chat.petals.dev" in litellm.api_base + ): + url = "https://chat.petals.dev/api/v1/generate" + import requests + + prompt = " ".join([message["content"] for message in messages]) + + ## LOGGING + logging( + model=model, + input=prompt, + custom_llm_provider=custom_llm_provider, + logger_fn=logger_fn, + ) + response = requests.post( + url, data={"inputs": prompt, "max_new_tokens": 100, "model": model} + ) + ## LOGGING + logging( + model=model, + input=prompt, + custom_llm_provider=custom_llm_provider, + additional_args={ + "max_tokens": max_tokens, + "original_response": response, + }, + logger_fn=logger_fn, + ) + completion_response = response.json()["outputs"] + + # RESPONSE OBJECT + model_response["choices"][0]["message"]["content"] = completion_response + model_response["created"] = time.time() + model_response["model"] = model + response = model_response + else: + ## LOGGING + logging( + model=model, + input=messages, + custom_llm_provider=custom_llm_provider, + logger_fn=logger_fn, + ) + args = locals() + raise ValueError( + f"Unable to map your input to a model. 
Check your input - {args}" + ) + return response + except Exception as e: + ## LOGGING + logging( + model=model, + input=messages, + custom_llm_provider=custom_llm_provider, + additional_args={"max_tokens": max_tokens}, + logger_fn=logger_fn, + exception=e, + ) + ## Map to OpenAI Exception + raise exception_type( + model=model, custom_llm_provider=custom_llm_provider, original_exception=e ) - elif model in litellm.cohere_models: - # import cohere/if it fails then pip install cohere - install_and_import("cohere") - import cohere - cohere_key = api_key or litellm.cohere_key or get_secret("COHERE_API_KEY") or get_secret("CO_API_KEY") - co = cohere.Client(cohere_key) - prompt = " ".join([message["content"] for message in messages]) - ## LOGGING - logging(model=model, input=prompt, custom_llm_provider=custom_llm_provider, logger_fn=logger_fn) - ## COMPLETION CALL - response = co.generate( - model=model, - prompt = prompt, - **optional_params - ) - if 'stream' in optional_params and optional_params['stream'] == True: - # don't try to access stream object, - response = CustomStreamWrapper(response, model) - return response - completion_response = response[0].text - ## LOGGING - logging(model=model, input=prompt, custom_llm_provider=custom_llm_provider, additional_args={"max_tokens": max_tokens, "original_response": completion_response}, logger_fn=logger_fn) - prompt_tokens = len(encoding.encode(prompt)) - completion_tokens = len(encoding.encode(completion_response)) - ## RESPONSE OBJECT - model_response["choices"][0]["message"]["content"] = completion_response - model_response["created"] = time.time() - model_response["model"] = model - model_response["usage"] = { - "prompt_tokens": prompt_tokens, - "completion_tokens": completion_tokens, - "total_tokens": prompt_tokens + completion_tokens - } - response = model_response - elif model in litellm.huggingface_models or custom_llm_provider == "huggingface": - custom_llm_provider = "huggingface" - huggingface_key = api_key or litellm.huggingface_key or os.environ.get("HF_TOKEN") or os.environ.get("HUGGINGFACE_API_KEY") - huggingface_client = HuggingfaceRestAPILLM(encoding=encoding, api_key=huggingface_key) - model_response = huggingface_client.completion(model=model, messages=messages, custom_api_base=custom_api_base, model_response=model_response, print_verbose=print_verbose, optional_params=optional_params, litellm_params=litellm_params, logger_fn=logger_fn) - if 'stream' in optional_params and optional_params['stream'] == True: - # don't try to access stream object, - response = CustomStreamWrapper(model_response, model, custom_llm_provider="huggingface") - return response - response = model_response - elif custom_llm_provider == "together_ai" or ("togethercomputer" in model): - import requests - TOGETHER_AI_TOKEN = get_secret("TOGETHER_AI_TOKEN") or get_secret("TOGETHERAI_API_KEY") or api_key or litellm.togetherai_api_key - headers = {"Authorization": f"Bearer {TOGETHER_AI_TOKEN}"} - endpoint = 'https://api.together.xyz/inference' - prompt = " ".join([message["content"] for message in messages]) # TODO: Add chat support for together AI - - ## LOGGING - logging(model=model, input=prompt, custom_llm_provider=custom_llm_provider, logger_fn=logger_fn) - if stream == True: - return together_ai_completion_streaming({ - "model": model, - "prompt": prompt, - "request_type": "language-model-inference", - **optional_params - }, - headers=headers) - res = requests.post(endpoint, json={ - "model": model, - "prompt": prompt, - "request_type": 
"language-model-inference", - **optional_params - }, - headers=headers - ) - ## LOGGING - logging(model=model, input=prompt, custom_llm_provider=custom_llm_provider, additional_args={"max_tokens": max_tokens, "original_response": res.text}, logger_fn=logger_fn) - - # make this safe for reading, if output does not exist raise an error - json_response = res.json() - if "output" not in json_response: - raise Exception(f"liteLLM: Error Making TogetherAI request, JSON Response {json_response}") - completion_response = json_response['output']['choices'][0]['text'] - prompt_tokens = len(encoding.encode(prompt)) - completion_tokens = len(encoding.encode(completion_response)) - ## RESPONSE OBJECT - model_response["choices"][0]["message"]["content"] = completion_response - model_response["created"] = time.time() - model_response["model"] = model - model_response["usage"] = { - "prompt_tokens": prompt_tokens, - "completion_tokens": completion_tokens, - "total_tokens": prompt_tokens + completion_tokens - } - response = model_response - elif model in litellm.vertex_chat_models: - # import vertexai/if it fails then pip install vertexai# import cohere/if it fails then pip install cohere - install_and_import("vertexai") - import vertexai - from vertexai.preview.language_models import ChatModel, InputOutputTextPair - vertexai.init(project=litellm.vertex_project, location=litellm.vertex_location) - # vertexai does not use an API key, it looks for credentials.json in the environment - - prompt = " ".join([message["content"] for message in messages]) - ## LOGGING - logging(model=model, input=prompt, custom_llm_provider=custom_llm_provider, additional_args={"litellm_params": litellm_params, "optional_params": optional_params}, logger_fn=logger_fn) - - chat_model = ChatModel.from_pretrained(model) - - - chat = chat_model.start_chat() - completion_response = chat.send_message(prompt, **optional_params) - - ## LOGGING - logging(model=model, input=prompt, custom_llm_provider=custom_llm_provider, additional_args={"max_tokens": max_tokens, "original_response": completion_response}, logger_fn=logger_fn) - - ## RESPONSE OBJECT - model_response["choices"][0]["message"]["content"] = completion_response - model_response["created"] = time.time() - model_response["model"] = model - elif model in litellm.vertex_text_models: - # import vertexai/if it fails then pip install vertexai# import cohere/if it fails then pip install cohere - install_and_import("vertexai") - import vertexai - from vertexai.language_models import TextGenerationModel - - vertexai.init(project=litellm.vertex_project, location=litellm.vertex_location) - # vertexai does not use an API key, it looks for credentials.json in the environment - - prompt = " ".join([message["content"] for message in messages]) - ## LOGGING - logging(model=model, input=prompt, custom_llm_provider=custom_llm_provider, logger_fn=logger_fn) - vertex_model = TextGenerationModel.from_pretrained(model) - completion_response= vertex_model.predict(prompt, **optional_params) - - ## LOGGING - logging(model=model, input=prompt, custom_llm_provider=custom_llm_provider, additional_args={"max_tokens": max_tokens, "original_response": completion_response}, logger_fn=logger_fn) - - ## RESPONSE OBJECT - model_response["choices"][0]["message"]["content"] = completion_response - model_response["created"] = time.time() - model_response["model"] = model - response = model_response - elif model in litellm.ai21_models: - install_and_import("ai21") - import ai21 - ai21.api_key = 
get_secret("AI21_API_KEY") - - prompt = " ".join([message["content"] for message in messages]) - ## LOGGING - logging(model=model, input=prompt, custom_llm_provider=custom_llm_provider, logger_fn=logger_fn) - - ai21_response = ai21.Completion.execute( - model=model, - prompt=prompt, - ) - completion_response = ai21_response['completions'][0]['data']['text'] - - ## LOGGING - logging(model=model, input=prompt, custom_llm_provider=custom_llm_provider, additional_args={"max_tokens": max_tokens, "original_response": completion_response}, logger_fn=logger_fn) - - ## RESPONSE OBJECT - model_response["choices"][0]["message"]["content"] = completion_response - model_response["created"] = time.time() - model_response["model"] = model - response = model_response - elif custom_llm_provider == "ollama": - endpoint = litellm.api_base if litellm.api_base is not None else custom_api_base - prompt = " ".join([message["content"] for message in messages]) - - ## LOGGING - logging(model=model, input=prompt, azure=azure, logger_fn=logger_fn) - generator = get_ollama_response_stream(endpoint, model, prompt) - # assume all responses are streamed - return generator - elif custom_llm_provider == "baseten" or litellm.api_base=="https://app.baseten.co": - import baseten - base_ten_key = get_secret('BASETEN_API_KEY') - baseten.login(base_ten_key) - - prompt = " ".join([message["content"] for message in messages]) - ## LOGGING - logging(model=model, input=prompt, custom_llm_provider=custom_llm_provider, logger_fn=logger_fn) - - base_ten__model = baseten.deployed_model_version_id(model) - - completion_response = base_ten__model.predict({"prompt": prompt}) - if type(completion_response) == dict: - completion_response = completion_response["data"] - if type(completion_response) == dict: - completion_response = completion_response["generated_text"] - - logging(model=model, input=prompt, custom_llm_provider=custom_llm_provider, additional_args={"max_tokens": max_tokens, "original_response": completion_response}, logger_fn=logger_fn) - - ## RESPONSE OBJECT - model_response["choices"][0]["message"]["content"] = completion_response - model_response["created"] = time.time() - model_response["model"] = model - response = model_response - - elif custom_llm_provider == "petals" or (litellm.api_base and "chat.petals.dev" in litellm.api_base): - url = "https://chat.petals.dev/api/v1/generate" - import requests - prompt = " ".join([message["content"] for message in messages]) - - ## LOGGING - logging(model=model, input=prompt, custom_llm_provider=custom_llm_provider, logger_fn=logger_fn) - response = requests.post(url, data={"inputs": prompt, "max_new_tokens": 100, "model": model}) - ## LOGGING - logging(model=model, input=prompt, custom_llm_provider=custom_llm_provider, additional_args={"max_tokens": max_tokens, "original_response": response}, logger_fn=logger_fn) - completion_response = response.json()["outputs"] - - # RESPONSE OBJECT - model_response["choices"][0]["message"]["content"] = completion_response - model_response["created"] = time.time() - model_response["model"] = model - response = model_response - else: - ## LOGGING - logging(model=model, input=messages, custom_llm_provider=custom_llm_provider, logger_fn=logger_fn) - args = locals() - raise ValueError(f"Unable to map your input to a model. 
Check your input - {args}") - return response - except Exception as e: - ## LOGGING - logging(model=model, input=messages, custom_llm_provider=custom_llm_provider, additional_args={"max_tokens": max_tokens}, logger_fn=logger_fn, exception=e) - ## Map to OpenAI Exception - raise exception_type(model=model, custom_llm_provider=custom_llm_provider, original_exception=e) def batch_completion(*args, **kwargs): - batch_messages = args[1] if len(args) > 1 else kwargs.get("messages") - completions = [] - with ThreadPoolExecutor() as executor: - for message_list in batch_messages: - if len(args) > 1: - args_modified = list(args) - args_modified[1] = message_list - future = executor.submit(completion, *args_modified) - else: - kwargs_modified = dict(kwargs) - kwargs_modified["messages"] = message_list - future = executor.submit(completion, *args, **kwargs_modified) - completions.append(future) - - # Retrieve the results from the futures - results = [future.result() for future in completions] - return results + batch_messages = args[1] if len(args) > 1 else kwargs.get("messages") + completions = [] + with ThreadPoolExecutor() as executor: + for message_list in batch_messages: + if len(args) > 1: + args_modified = list(args) + args_modified[1] = message_list + future = executor.submit(completion, *args_modified) + else: + kwargs_modified = dict(kwargs) + kwargs_modified["messages"] = message_list + future = executor.submit(completion, *args, **kwargs_modified) + completions.append(future) + + # Retrieve the results from the futures + results = [future.result() for future in completions] + return results + ### EMBEDDING ENDPOINTS #################### @client -@timeout(60) ## set timeouts, in case calls hang (e.g. Azure) - default is 60s, override with `force_timeout` +@timeout( + 60 +) ## set timeouts, in case calls hang (e.g. 
Azure) - default is 60s, override with `force_timeout` def embedding(model, input=[], azure=False, force_timeout=60, logger_fn=None): - try: - response = None - if azure == True: - # azure configs - openai.api_type = "azure" - openai.api_base = get_secret("AZURE_API_BASE") - openai.api_version = get_secret("AZURE_API_VERSION") - openai.api_key = get_secret("AZURE_API_KEY") - ## LOGGING - logging(model=model, input=input, azure=azure, logger_fn=logger_fn) - ## EMBEDDING CALL - response = openai.Embedding.create(input=input, engine=model) - print_verbose(f"response_value: {str(response)[:50]}") - elif model in litellm.open_ai_embedding_models: - openai.api_type = "openai" - openai.api_base = "https://api.openai.com/v1" - openai.api_version = None - openai.api_key = get_secret("OPENAI_API_KEY") - ## LOGGING - logging(model=model, input=input, azure=azure, logger_fn=logger_fn) - ## EMBEDDING CALL - response = openai.Embedding.create(input=input, model=model) - print_verbose(f"response_value: {str(response)[:50]}") - else: - logging(model=model, input=input, azure=azure, logger_fn=logger_fn) - args = locals() - raise ValueError(f"No valid embedding model args passed in - {args}") - - return response - except Exception as e: - # log the original exception - logging(model=model, input=input, azure=azure, logger_fn=logger_fn, exception=e) - ## Map to OpenAI Exception - raise exception_type(model=model, original_exception=e) - raise e + try: + response = None + if azure == True: + # azure configs + openai.api_type = "azure" + openai.api_base = get_secret("AZURE_API_BASE") + openai.api_version = get_secret("AZURE_API_VERSION") + openai.api_key = get_secret("AZURE_API_KEY") + ## LOGGING + logging(model=model, input=input, azure=azure, logger_fn=logger_fn) + ## EMBEDDING CALL + response = openai.Embedding.create(input=input, engine=model) + print_verbose(f"response_value: {str(response)[:50]}") + elif model in litellm.open_ai_embedding_models: + openai.api_type = "openai" + openai.api_base = "https://api.openai.com/v1" + openai.api_version = None + openai.api_key = get_secret("OPENAI_API_KEY") + ## LOGGING + logging(model=model, input=input, azure=azure, logger_fn=logger_fn) + ## EMBEDDING CALL + response = openai.Embedding.create(input=input, model=model) + print_verbose(f"response_value: {str(response)[:50]}") + else: + logging(model=model, input=input, azure=azure, logger_fn=logger_fn) + args = locals() + raise ValueError(f"No valid embedding model args passed in - {args}") + + return response + except Exception as e: + # log the original exception + logging(model=model, input=input, azure=azure, logger_fn=logger_fn, exception=e) + ## Map to OpenAI Exception + raise exception_type(model=model, original_exception=e) + raise e + + ####### HELPER FUNCTIONS ################ -## Set verbose to true -> ```litellm.set_verbose = True``` +## Set verbose to true -> ```litellm.set_verbose = True``` def print_verbose(print_statement): - if litellm.set_verbose: - print(f"LiteLLM: {print_statement}") - if random.random() <= 0.3: - print("Get help - https://discord.com/invite/wuPM9dRgDw") + if litellm.set_verbose: + print(f"LiteLLM: {print_statement}") + if random.random() <= 0.3: + print("Get help - https://discord.com/invite/wuPM9dRgDw") + def config_completion(**kwargs): - if litellm.config_path != None: - config_args = read_config_args(litellm.config_path) - # overwrite any args passed in with config args - return completion(**kwargs, **config_args) - else: - raise ValueError("No config path set, please set a 
config path using `litellm.config_path = 'path/to/config.json'`") \ No newline at end of file + if litellm.config_path != None: + config_args = read_config_args(litellm.config_path) + # overwrite any args passed in with config args + return completion(**kwargs, **config_args) + else: + raise ValueError( + "No config path set, please set a config path using `litellm.config_path = 'path/to/config.json'`" + ) diff --git a/litellm/testing.py b/litellm/testing.py index 2442bab7c..3e3ce286e 100644 --- a/litellm/testing.py +++ b/litellm/testing.py @@ -1,53 +1,82 @@ import litellm -import time +import time from concurrent.futures import ThreadPoolExecutor import traceback + def testing_batch_completion(*args, **kwargs): - try: - batch_models = args[0] if len(args) > 0 else kwargs.pop("models") ## expected input format- ["gpt-3.5-turbo", {"model": "qvv0xeq", "custom_llm_provider"="baseten"}...] - batch_messages = args[1] if len(args) > 1 else kwargs.pop("messages") - results = [] - completions = [] - exceptions = [] - times = [] - with ThreadPoolExecutor() as executor: - for model in batch_models: - kwargs_modified = dict(kwargs) - args_modified = list(args) - if len(args) > 0: - args_modified[0] = model["model"] - else: - kwargs_modified["model"] = model["model"] if isinstance(model, dict) and "model" in model else model # if model is a dictionary get it's value else assume it's a string - kwargs_modified["custom_llm_provider"] = model["custom_llm_provider"] if isinstance(model, dict) and "custom_llm_provider" in model else None - kwargs_modified["custom_api_base"] = model["custom_api_base"] if isinstance(model, dict) and "custom_api_base" in model else None - for message_list in batch_messages: - if len(args) > 1: - args_modified[1] = message_list - future = executor.submit(litellm.completion, *args_modified, **kwargs_modified) + try: + batch_models = ( + args[0] if len(args) > 0 else kwargs.pop("models") + ) ## expected input format- ["gpt-3.5-turbo", {"model": "qvv0xeq", "custom_llm_provider"="baseten"}...] 
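To make the expected input format noted above concrete, a small illustrative call; the model ids mirror the placeholders in that comment and are not real deployments, and the import path is inferred from the litellm/testing.py module location:

from litellm.testing import testing_batch_completion

models = ["gpt-3.5-turbo", {"model": "qvv0xeq", "custom_llm_provider": "baseten"}]
messages = [[{"role": "user", "content": "Hey, how's it going?"}]]  # one message list per prompt
results = testing_batch_completion(models=models, messages=messages)
# each result entry reports a status, the response or exception, and the elapsed time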
+ batch_messages = args[1] if len(args) > 1 else kwargs.pop("messages") + results = [] + completions = [] + exceptions = [] + times = [] + with ThreadPoolExecutor() as executor: + for model in batch_models: + kwargs_modified = dict(kwargs) + args_modified = list(args) + if len(args) > 0: + args_modified[0] = model["model"] else: - kwargs_modified["messages"] = message_list - future = executor.submit(litellm.completion, *args_modified, **kwargs_modified) - completions.append((future, message_list)) - - # Retrieve the results and calculate elapsed time for each completion call - for completion in completions: - future, message_list = completion - start_time = time.time() - try: - result = future.result() - end_time = time.time() - elapsed_time = end_time - start_time - result_dict = {"status": "succeeded", "response": future.result(), "prompt": message_list, "response_time": elapsed_time} - results.append(result_dict) - except Exception as e: - end_time = time.time() - elapsed_time = end_time - start_time - result_dict = {"status": "failed", "response": e, "response_time": elapsed_time} - results.append(result_dict) - return results - except: - traceback.print_exc() + kwargs_modified["model"] = ( + model["model"] + if isinstance(model, dict) and "model" in model + else model + ) # if model is a dictionary get it's value else assume it's a string + kwargs_modified["custom_llm_provider"] = ( + model["custom_llm_provider"] + if isinstance(model, dict) and "custom_llm_provider" in model + else None + ) + kwargs_modified["custom_api_base"] = ( + model["custom_api_base"] + if isinstance(model, dict) and "custom_api_base" in model + else None + ) + for message_list in batch_messages: + if len(args) > 1: + args_modified[1] = message_list + future = executor.submit( + litellm.completion, *args_modified, **kwargs_modified + ) + else: + kwargs_modified["messages"] = message_list + future = executor.submit( + litellm.completion, *args_modified, **kwargs_modified + ) + completions.append((future, message_list)) + + # Retrieve the results and calculate elapsed time for each completion call + for completion in completions: + future, message_list = completion + start_time = time.time() + try: + result = future.result() + end_time = time.time() + elapsed_time = end_time - start_time + result_dict = { + "status": "succeeded", + "response": future.result(), + "prompt": message_list, + "response_time": elapsed_time, + } + results.append(result_dict) + except Exception as e: + end_time = time.time() + elapsed_time = end_time - start_time + result_dict = { + "status": "failed", + "response": e, + "response_time": elapsed_time, + } + results.append(result_dict) + return results + except: + traceback.print_exc() + def duration_test_model(original_function): def wrapper_function(*args, **kwargs): @@ -70,22 +99,39 @@ def duration_test_model(original_function): # Return the wrapper function return wrapper_function + @duration_test_model def load_test_model(models: list, prompt: str = None, num_calls: int = None): - test_calls = 100 - if num_calls: - test_calls = num_calls - input_prompt = prompt if prompt else "Hey, how's it going?" 
- messages = [{"role": "user", "content": prompt}] if prompt else [{"role": "user", "content": input_prompt}] - full_message_list = [messages for _ in range(test_calls)] # call it as many times as set by user to load test models - start_time = time.time() - try: - results = testing_batch_completion(models=models, messages=full_message_list) - end_time = time.time() - response_time = end_time - start_time - return {"total_response_time": response_time, "calls_made": test_calls, "prompt": input_prompt, "results": results} - except Exception as e: - traceback.print_exc() - end_time = time.time() - response_time = end_time - start_time - return {"total_response_time": response_time, "calls_made": test_calls, "prompt": input_prompt, "exception": e} \ No newline at end of file + test_calls = 100 + if num_calls: + test_calls = num_calls + input_prompt = prompt if prompt else "Hey, how's it going?" + messages = ( + [{"role": "user", "content": prompt}] + if prompt + else [{"role": "user", "content": input_prompt}] + ) + full_message_list = [ + messages for _ in range(test_calls) + ] # call it as many times as set by user to load test models + start_time = time.time() + try: + results = testing_batch_completion(models=models, messages=full_message_list) + end_time = time.time() + response_time = end_time - start_time + return { + "total_response_time": response_time, + "calls_made": test_calls, + "prompt": input_prompt, + "results": results, + } + except Exception as e: + traceback.print_exc() + end_time = time.time() + response_time = end_time - start_time + return { + "total_response_time": response_time, + "calls_made": test_calls, + "prompt": input_prompt, + "exception": e, + } diff --git a/litellm/tests/test_api_key_param.py b/litellm/tests/test_api_key_param.py index 6213730f5..cebcb1a37 100644 --- a/litellm/tests/test_api_key_param.py +++ b/litellm/tests/test_api_key_param.py @@ -3,27 +3,37 @@ import sys, os import traceback -sys.path.insert(0, os.path.abspath('../..')) # Adds the parent directory to the system path + +sys.path.insert( + 0, os.path.abspath("../..") +) # Adds the parent directory to the system path import litellm from litellm import embedding, completion litellm.set_verbose = False + def logger_fn(model_call_object: dict): print(f"model call details: {model_call_object}") + user_message = "Hello, how are you?" 
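Before the dynamic-key tests below, a hedged sketch of the pattern they exercise: the key is passed per call instead of being read implicitly from the environment.

import os
import litellm

response = litellm.completion(
    model="claude-instant-1",
    messages=[{"content": "Hello, how are you?", "role": "user"}],
    api_key=os.environ.get("ANTHROPIC_API_KEY"),  # per-call key takes precedence over env lookup
)
print(response)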
-messages = [{ "content": user_message,"role": "user"}] +messages = [{"content": user_message, "role": "user"}] ## Test 1: Setting key dynamically temp_key = os.environ.get("ANTHROPIC_API_KEY") os.environ["ANTHROPIC_API_KEY"] = "bad-key" -# test on openai completion call +# test on openai completion call try: - response = completion(model="claude-instant-1", messages=messages, logger_fn=logger_fn, api_key=temp_key) + response = completion( + model="claude-instant-1", + messages=messages, + logger_fn=logger_fn, + api_key=temp_key, + ) print(f"response: {response}") except: - print(f"error occurred: {traceback.format_exc()}") + print(f"error occurred: {traceback.format_exc()}") pass os.environ["ANTHROPIC_API_KEY"] = temp_key @@ -31,11 +41,13 @@ os.environ["ANTHROPIC_API_KEY"] = temp_key ## Test 2: Setting key via __init__ params litellm.anthropic_key = os.environ.get("ANTHROPIC_API_KEY") os.environ.pop("ANTHROPIC_API_KEY") -# test on openai completion call +# test on openai completion call try: - response = completion(model="claude-instant-1", messages=messages, logger_fn=logger_fn) + response = completion( + model="claude-instant-1", messages=messages, logger_fn=logger_fn + ) print(f"response: {response}") except: - print(f"error occurred: {traceback.format_exc()}") + print(f"error occurred: {traceback.format_exc()}") pass os.environ["ANTHROPIC_API_KEY"] = temp_key diff --git a/litellm/tests/test_async_fn.py b/litellm/tests/test_async_fn.py index b0925c4b5..c20c5cde6 100644 --- a/litellm/tests/test_async_fn.py +++ b/litellm/tests/test_async_fn.py @@ -5,17 +5,22 @@ import sys, os import pytest import traceback import asyncio -sys.path.insert(0, os.path.abspath('../..')) # Adds the parent directory to the system path + +sys.path.insert( + 0, os.path.abspath("../..") +) # Adds the parent directory to the system path from litellm import acompletion + async def test_get_response(): user_message = "Hello, how are you?" - messages = [{ "content": user_message,"role": "user"}] + messages = [{"content": user_message, "role": "user"}] try: response = await acompletion(model="gpt-3.5-turbo", messages=messages) except Exception as e: pytest.fail(f"error occurred: {e}") return response + response = asyncio.run(test_get_response()) -print(response) \ No newline at end of file +print(response) diff --git a/litellm/tests/test_bad_params.py b/litellm/tests/test_bad_params.py index 0a2313c78..71cbffe56 100644 --- a/litellm/tests/test_bad_params.py +++ b/litellm/tests/test_bad_params.py @@ -1,16 +1,17 @@ #### What this tests #### # This tests chaos monkeys - if random parts of the system are broken / things aren't sent correctly - what happens. -# Expect to add more edge cases to this over time. +# Expect to add more edge cases to this over time. import sys, os import traceback from dotenv import load_dotenv + load_dotenv() # Get the current directory of the script current_dir = os.path.dirname(os.path.abspath(__file__)) # Get the parent directory by joining the current directory with '..' -parent_dir = os.path.join(current_dir, '../..') +parent_dir = os.path.join(current_dir, "../..") # Add the parent directory to the system path sys.path.append(parent_dir) @@ -26,7 +27,7 @@ litellm.failure_callback = ["slack", "sentry", "posthog"] user_message = "Hello, how are you?" 
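A short sketch of the failure path these chaos tests probe; passing api_key directly is an illustrative variant of the env-var swap used below, and the mapped exception is expected rather than a crash:

import litellm

try:
    litellm.completion(
        model="gpt-3.5-turbo",
        messages=[{"content": "Hello, how are you?", "role": "user"}],
        api_key="bad-key",  # deliberately invalid; expect a mapped provider exception
    )
except Exception as e:
    print(f"error occurred: {e}")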
-messages = [{ "content": user_message,"role": "user"}] +messages = [{"content": user_message, "role": "user"}] model_val = None @@ -35,18 +36,18 @@ def test_completion_with_empty_model(): try: response = completion(model=model_val, messages=messages) except Exception as e: - print(f"error occurred: {e}") + print(f"error occurred: {e}") pass -#bad key +# bad key temp_key = os.environ.get("OPENAI_API_KEY") os.environ["OPENAI_API_KEY"] = "bad-key" -# test on openai completion call +# test on openai completion call try: response = completion(model="gpt-3.5-turbo", messages=messages) print(f"response: {response}") except: - print(f"error occurred: {traceback.format_exc()}") + print(f"error occurred: {traceback.format_exc()}") pass -os.environ["OPENAI_API_KEY"] = temp_key \ No newline at end of file +os.environ["OPENAI_API_KEY"] = temp_key diff --git a/litellm/tests/test_batch_completions.py b/litellm/tests/test_batch_completions.py index d15628f56..a136351ba 100644 --- a/litellm/tests/test_batch_completions.py +++ b/litellm/tests/test_batch_completions.py @@ -3,7 +3,10 @@ import sys, os import traceback -sys.path.insert(0, os.path.abspath('../..')) # Adds the parent directory to the system path + +sys.path.insert( + 0, os.path.abspath("../..") +) # Adds the parent directory to the system path import litellm from litellm import batch_completion @@ -14,4 +17,4 @@ model = "gpt-3.5-turbo" result = batch_completion(model=model, messages=messages) print(result) -print(len(result)) \ No newline at end of file +print(len(result)) diff --git a/litellm/tests/test_berrispend_integration.py b/litellm/tests/test_berrispend_integration.py index 122c9201d..500285b85 100644 --- a/litellm/tests/test_berrispend_integration.py +++ b/litellm/tests/test_berrispend_integration.py @@ -19,7 +19,7 @@ # #openai call -# response = completion(model="gpt-3.5-turbo", messages=[{"role": "user", "content": "Hi 👋 - i'm openai"}]) +# response = completion(model="gpt-3.5-turbo", messages=[{"role": "user", "content": "Hi 👋 - i'm openai"}]) # #bad request call -# response = completion(model="chatgpt-test", messages=[{"role": "user", "content": "Hi 👋 - i'm a bad request"}]) \ No newline at end of file +# response = completion(model="chatgpt-test", messages=[{"role": "user", "content": "Hi 👋 - i'm a bad request"}]) diff --git a/litellm/tests/test_caching.py b/litellm/tests/test_caching.py index c6500c557..5d7e962cf 100644 --- a/litellm/tests/test_caching.py +++ b/litellm/tests/test_caching.py @@ -1,9 +1,13 @@ import sys, os import traceback from dotenv import load_dotenv + load_dotenv() import os -sys.path.insert(0, os.path.abspath('../..')) # Adds the parent directory to the system path + +sys.path.insert( + 0, os.path.abspath("../..") +) # Adds the parent directory to the system path import pytest import litellm from litellm import embedding, completion @@ -12,7 +16,6 @@ litellm.caching = True messages = [{"role": "user", "content": "who is ishaan Github? 
"}] - # test if response cached def test_caching(): try: @@ -27,9 +30,5 @@ def test_caching(): pytest.fail(f"Error occurred: {e}") except Exception as e: litellm.caching = False - print(f"error occurred: {traceback.format_exc()}") + print(f"error occurred: {traceback.format_exc()}") pytest.fail(f"Error occurred: {e}") - - - - diff --git a/litellm/tests/test_client.py b/litellm/tests/test_client.py index 3c591d4cd..f29ae5a94 100644 --- a/litellm/tests/test_client.py +++ b/litellm/tests/test_client.py @@ -5,7 +5,9 @@ import sys, os import traceback import pytest -sys.path.insert(0, os.path.abspath('../..')) # Adds the parent directory to the system path +sys.path.insert( + 0, os.path.abspath("../..") +) # Adds the parent directory to the system path import litellm from litellm import embedding, completion @@ -14,17 +16,22 @@ litellm.failure_callback = ["slack", "sentry", "posthog"] litellm.set_verbose = True + def logger_fn(model_call_object: dict): # print(f"model call details: {model_call_object}") pass + user_message = "Hello, how are you?" -messages = [{ "content": user_message,"role": "user"}] +messages = [{"content": user_message, "role": "user"}] + def test_completion_openai(): try: print("running query") - response = completion(model="gpt-3.5-turbo", messages=messages, logger_fn=logger_fn) + response = completion( + model="gpt-3.5-turbo", messages=messages, logger_fn=logger_fn + ) print(f"response: {response}") # Add any assertions here to check the response except Exception as e: @@ -34,33 +41,46 @@ def test_completion_openai(): def test_completion_claude(): try: - response = completion(model="claude-instant-1", messages=messages, logger_fn=logger_fn) + response = completion( + model="claude-instant-1", messages=messages, logger_fn=logger_fn + ) # Add any assertions here to check the response except Exception as e: pytest.fail(f"Error occurred: {e}") + def test_completion_non_openai(): try: - response = completion(model="command-nightly", messages=messages, logger_fn=logger_fn) + response = completion( + model="command-nightly", messages=messages, logger_fn=logger_fn + ) # Add any assertions here to check the response except Exception as e: pytest.fail(f"Error occurred: {e}") + def test_embedding_openai(): try: - response = embedding(model='text-embedding-ada-002', input=[user_message], logger_fn=logger_fn) + response = embedding( + model="text-embedding-ada-002", input=[user_message], logger_fn=logger_fn + ) # Add any assertions here to check the response print(f"response: {str(response)[:50]}") except Exception as e: pytest.fail(f"Error occurred: {e}") + def test_bad_azure_embedding(): try: - response = embedding(model='chatgpt-test', input=[user_message], logger_fn=logger_fn) + response = embedding( + model="chatgpt-test", input=[user_message], logger_fn=logger_fn + ) # Add any assertions here to check the response print(f"response: {str(response)[:50]}") except Exception as e: pass + + # def test_good_azure_embedding(): # try: # response = embedding(model='azure-embedding-model', input=[user_message], azure=True, logger_fn=logger_fn) @@ -68,4 +88,3 @@ def test_bad_azure_embedding(): # print(f"response: {str(response)[:50]}") # except Exception as e: # pytest.fail(f"Error occurred: {e}") - diff --git a/litellm/tests/test_completion.py b/litellm/tests/test_completion.py index 00054d6a8..370668afb 100644 --- a/litellm/tests/test_completion.py +++ b/litellm/tests/test_completion.py @@ -1,44 +1,58 @@ import sys, os import traceback from dotenv import load_dotenv + load_dotenv() 
import os -sys.path.insert(0, os.path.abspath('../..')) # Adds the parent directory to the system path + +sys.path.insert( + 0, os.path.abspath("../..") +) # Adds the parent directory to the system path import pytest import litellm from litellm import embedding, completion + # from infisical import InfisicalClient # litellm.set_verbose = True # litellm.secret_manager_client = InfisicalClient(token=os.environ["INFISICAL_TOKEN"]) user_message = "Hello, whats the weather in San Francisco??" -messages = [{ "content": user_message,"role": "user"}] +messages = [{"content": user_message, "role": "user"}] + def logger_fn(user_model_dict): print(f"user_model_dict: {user_model_dict}") + def test_completion_claude(): try: - response = completion(model="claude-instant-1", messages=messages, logger_fn=logger_fn) + response = completion( + model="claude-instant-1", messages=messages, logger_fn=logger_fn + ) # Add any assertions here to check the response print(response) except Exception as e: pytest.fail(f"Error occurred: {e}") + def test_completion_claude_stream(): try: messages = [ {"role": "system", "content": "You are a helpful assistant."}, - {"role": "user", "content": "how does a court case get to the Supreme Court?"} + { + "role": "user", + "content": "how does a court case get to the Supreme Court?", + }, ] response = completion(model="claude-2", messages=messages, stream=True) # Add any assertions here to check the response for chunk in response: - print(chunk['choices'][0]['delta']) # same as openai format + print(chunk["choices"][0]["delta"]) # same as openai format except Exception as e: pytest.fail(f"Error occurred: {e}") + # def test_completion_hf_api(): # try: # user_message = "write some code to find the sum of two numbers" @@ -62,10 +76,12 @@ def test_completion_claude_stream(): def test_completion_cohere(): try: - response = completion(model="command-nightly", messages=messages, max_tokens=100) + response = completion( + model="command-nightly", messages=messages, max_tokens=100 + ) # Add any assertions here to check the response print(response) - response_str = response['choices'][0]['message']['content'] + response_str = response["choices"][0]["message"]["content"] print(f"str response{response_str}") response_str_2 = response.choices[0].message.content if type(response_str) != str: @@ -75,24 +91,31 @@ def test_completion_cohere(): except Exception as e: pytest.fail(f"Error occurred: {e}") + def test_completion_cohere_stream(): try: messages = [ {"role": "system", "content": "You are a helpful assistant."}, - {"role": "user", "content": "how does a court case get to the Supreme Court?"} + { + "role": "user", + "content": "how does a court case get to the Supreme Court?", + }, ] - response = completion(model="command-nightly", messages=messages, stream=True, max_tokens=50) + response = completion( + model="command-nightly", messages=messages, stream=True, max_tokens=50 + ) # Add any assertions here to check the response for chunk in response: - print(chunk['choices'][0]['delta']) # same as openai format + print(chunk["choices"][0]["delta"]) # same as openai format except Exception as e: pytest.fail(f"Error occurred: {e}") + def test_completion_openai(): try: response = completion(model="gpt-3.5-turbo", messages=messages) - response_str = response['choices'][0]['message']['content'] + response_str = response["choices"][0]["message"]["content"] response_str_2 = response.choices[0].message.content assert response_str == response_str_2 assert type(response_str) == str @@ -100,6 +123,7 @@ 
def test_completion_openai(): except Exception as e: pytest.fail(f"Error occurred: {e}") + def test_completion_text_openai(): try: response = completion(model="text-davinci-003", messages=messages) @@ -108,17 +132,31 @@ def test_completion_text_openai(): except Exception as e: pytest.fail(f"Error occurred: {e}") + def test_completion_openai_with_optional_params(): try: - response = completion(model="gpt-3.5-turbo", messages=messages, temperature=0.5, top_p=0.1, user="ishaan_dev@berri.ai") + response = completion( + model="gpt-3.5-turbo", + messages=messages, + temperature=0.5, + top_p=0.1, + user="ishaan_dev@berri.ai", + ) # Add any assertions here to check the response print(response) except Exception as e: pytest.fail(f"Error occurred: {e}") + def test_completion_openrouter(): try: - response = completion(model="google/palm-2-chat-bison", messages=messages, temperature=0.5, top_p=0.1, user="ishaan_dev@berri.ai") + response = completion( + model="google/palm-2-chat-bison", + messages=messages, + temperature=0.5, + top_p=0.1, + user="ishaan_dev@berri.ai", + ) # Add any assertions here to check the response print(response) except Exception as e: @@ -127,12 +165,23 @@ def test_completion_openrouter(): def test_completion_openai_with_more_optional_params(): try: - response = completion(model="gpt-3.5-turbo", messages=messages, temperature=0.5, top_p=0.1, n=2, max_tokens=150, presence_penalty=0.5, frequency_penalty=-0.5, logit_bias={123: 5}, user="ishaan_dev@berri.ai") + response = completion( + model="gpt-3.5-turbo", + messages=messages, + temperature=0.5, + top_p=0.1, + n=2, + max_tokens=150, + presence_penalty=0.5, + frequency_penalty=-0.5, + logit_bias={123: 5}, + user="ishaan_dev@berri.ai", + ) # Add any assertions here to check the response print(response) - response_str = response['choices'][0]['message']['content'] + response_str = response["choices"][0]["message"]["content"] response_str_2 = response.choices[0].message.content - print(response['choices'][0]['message']['content']) + print(response["choices"][0]["message"]["content"]) print(response.choices[0].message.content) if type(response_str) != str: pytest.fail(f"Error occurred: {e}") @@ -141,14 +190,28 @@ def test_completion_openai_with_more_optional_params(): except Exception as e: pytest.fail(f"Error occurred: {e}") + def test_completion_openai_with_stream(): try: - response = completion(model="gpt-3.5-turbo", messages=messages, temperature=0.5, top_p=0.1, n=2, max_tokens=150, presence_penalty=0.5, stream=True, frequency_penalty=-0.5, logit_bias={27000: 5}, user="ishaan_dev@berri.ai") + response = completion( + model="gpt-3.5-turbo", + messages=messages, + temperature=0.5, + top_p=0.1, + n=2, + max_tokens=150, + presence_penalty=0.5, + stream=True, + frequency_penalty=-0.5, + logit_bias={27000: 5}, + user="ishaan_dev@berri.ai", + ) # Add any assertions here to check the response print(response) except Exception as e: pytest.fail(f"Error occurred: {e}") + def test_completion_openai_with_functions(): function1 = [ { @@ -159,33 +222,39 @@ def test_completion_openai_with_functions(): "properties": { "location": { "type": "string", - "description": "The city and state, e.g. San Francisco, CA" + "description": "The city and state, e.g. 
San Francisco, CA", }, - "unit": { - "type": "string", - "enum": ["celsius", "fahrenheit"] - } + "unit": {"type": "string", "enum": ["celsius", "fahrenheit"]}, }, - "required": ["location"] - } + "required": ["location"], + }, } ] try: - response = completion(model="gpt-3.5-turbo", messages=messages, functions=function1) + response = completion( + model="gpt-3.5-turbo", messages=messages, functions=function1 + ) # Add any assertions here to check the response print(response) except Exception as e: pytest.fail(f"Error occurred: {e}") + def test_completion_azure(): try: - response = completion(model="gpt-3.5-turbo", deployment_id="chatgpt-test", messages=messages, custom_llm_provider="azure") + response = completion( + model="gpt-3.5-turbo", + deployment_id="chatgpt-test", + messages=messages, + custom_llm_provider="azure", + ) # Add any assertions here to check the response print(response) except Exception as e: pytest.fail(f"Error occurred: {e}") -# Replicate API endpoints are unstable -> throw random CUDA errors -> this means our tests can fail even if our tests weren't incorrect. + +# Replicate API endpoints are unstable -> throw random CUDA errors -> this means our tests can fail even if our tests weren't incorrect. def test_completion_replicate_llama_stream(): model_name = "replicate/llama-2-70b-chat:2c1608e18606fad2812020dc541930f2d0495ce32eee50074220b87300bc16e1" try: @@ -197,23 +266,32 @@ def test_completion_replicate_llama_stream(): except Exception as e: pytest.fail(f"Error occurred: {e}") + def test_completion_replicate_stability_stream(): model_name = "stability-ai/stablelm-tuned-alpha-7b:c49dae362cbaecd2ceabb5bd34fdb68413c4ff775111fea065d259d577757beb" try: - response = completion(model=model_name, messages=messages, stream=True, custom_llm_provider="replicate") + response = completion( + model=model_name, + messages=messages, + stream=True, + custom_llm_provider="replicate", + ) # Add any assertions here to check the response for chunk in response: - print(chunk['choices'][0]['delta']) + print(chunk["choices"][0]["delta"]) print(response) except Exception as e: pytest.fail(f"Error occurred: {e}") + def test_completion_replicate_stability(): model_name = "stability-ai/stablelm-tuned-alpha-7b:c49dae362cbaecd2ceabb5bd34fdb68413c4ff775111fea065d259d577757beb" try: - response = completion(model=model_name, messages=messages, custom_llm_provider="replicate") + response = completion( + model=model_name, messages=messages, custom_llm_provider="replicate" + ) # Add any assertions here to check the response - response_str = response['choices'][0]['message']['content'] + response_str = response["choices"][0]["message"]["content"] response_str_2 = response.choices[0].message.content print(response_str) print(response_str_2) @@ -224,6 +302,7 @@ def test_completion_replicate_stability(): except Exception as e: pytest.fail(f"Error occurred: {e}") + ######## Test TogetherAI ######## def test_completion_together_ai(): model_name = "togethercomputer/llama-2-70b-chat" @@ -234,15 +313,22 @@ def test_completion_together_ai(): except Exception as e: pytest.fail(f"Error occurred: {e}") + def test_petals(): model_name = "stabilityai/StableBeluga2" try: - response = completion(model=model_name, messages=messages, custom_llm_provider="petals", force_timeout=120) + response = completion( + model=model_name, + messages=messages, + custom_llm_provider="petals", + force_timeout=120, + ) # Add any assertions here to check the response print(response) except Exception as e: pytest.fail(f"Error occurred: 
{e}") + # def test_baseten_falcon_7bcompletion(): # model_name = "qvv0xeq" # try: @@ -290,7 +376,6 @@ def test_petals(): # pytest.fail(f"Error occurred: {e}") - #### Test A121 ################### # def test_completion_ai21(): # model_name = "j2-light" @@ -301,7 +386,7 @@ def test_petals(): # except Exception as e: # pytest.fail(f"Error occurred: {e}") -# test config file with completion # +# test config file with completion # # def test_completion_openai_config(): # try: # litellm.config_path = "../config.json" @@ -333,4 +418,3 @@ def test_petals(): # return # test_completion_together_ai_stream() - diff --git a/litellm/tests/test_custom_api_base.py b/litellm/tests/test_custom_api_base.py index 966fff954..70a477eab 100644 --- a/litellm/tests/test_custom_api_base.py +++ b/litellm/tests/test_custom_api_base.py @@ -1,20 +1,33 @@ import sys, os import traceback from dotenv import load_dotenv + load_dotenv() import os -sys.path.insert(0, os.path.abspath('../..')) # Adds the parent directory to the system path -import litellm -from litellm import completion + +sys.path.insert( + 0, os.path.abspath("../..") +) # Adds the parent directory to the system path +import litellm +from litellm import completion + def logging_fn(model_call_dict): print(f"model call details: {model_call_dict}") + + models = ["gorilla-7b-hf-v1", "gpt-4"] custom_llm_provider = None messages = [{"role": "user", "content": "Hey, how's it going?"}] -for model in models: # iterate through list +for model in models: # iterate through list custom_api_base = None - if model == "gorilla-7b-hf-v1": + if model == "gorilla-7b-hf-v1": custom_llm_provider = "custom_openai" custom_api_base = "http://zanino.millennium.berkeley.edu:8000/v1" - completion(model=model, messages=messages, custom_llm_provider=custom_llm_provider, custom_api_base=custom_api_base, logger_fn=logging_fn) + completion( + model=model, + messages=messages, + custom_llm_provider=custom_llm_provider, + custom_api_base=custom_api_base, + logger_fn=logging_fn, + ) diff --git a/litellm/tests/test_embedding.py b/litellm/tests/test_embedding.py index a31d2a4fa..a9b3f2b79 100644 --- a/litellm/tests/test_embedding.py +++ b/litellm/tests/test_embedding.py @@ -1,9 +1,10 @@ - import sys, os import traceback import pytest -sys.path.insert(0, os.path.abspath('../..')) # Adds the parent directory to the system path +sys.path.insert( + 0, os.path.abspath("../..") +) # Adds the parent directory to the system path import litellm from litellm import embedding, completion from infisical import InfisicalClient @@ -11,10 +12,13 @@ from infisical import InfisicalClient # # litellm.set_verbose = True # litellm.secret_manager_client = InfisicalClient(token=os.environ["INFISICAL_TOKEN"]) + def test_openai_embedding(): try: - response = embedding(model='text-embedding-ada-002', input=["good morning from litellm"]) + response = embedding( + model="text-embedding-ada-002", input=["good morning from litellm"] + ) # Add any assertions here to check the response print(f"response: {str(response)}") except Exception as e: - pytest.fail(f"Error occurred: {e}") \ No newline at end of file + pytest.fail(f"Error occurred: {e}") diff --git a/litellm/tests/test_exceptions.py b/litellm/tests/test_exceptions.py index 76cff6bdb..6620eb2ae 100644 --- a/litellm/tests/test_exceptions.py +++ b/litellm/tests/test_exceptions.py @@ -1,10 +1,21 @@ # from openai.error import AuthenticationError, InvalidRequestError, RateLimitError, OpenAIError -import os +import os import sys import traceback -sys.path.insert(0, 
os.path.abspath('../..')) # Adds the parent directory to the system path + +sys.path.insert( + 0, os.path.abspath("../..") +) # Adds the parent directory to the system path import litellm -from litellm import embedding, completion, AuthenticationError, InvalidRequestError, RateLimitError, ServiceUnavailableError, OpenAIError +from litellm import ( + embedding, + completion, + AuthenticationError, + InvalidRequestError, + RateLimitError, + ServiceUnavailableError, + OpenAIError, +) from concurrent.futures import ThreadPoolExecutor import pytest @@ -23,8 +34,10 @@ litellm.failure_callback = ["sentry"] # models = ["gpt-3.5-turbo", "chatgpt-test", "claude-instant-1", "command-nightly"] test_model = "claude-instant-1" models = ["claude-instant-1"] + + def logging_fn(model_call_dict): - if "model" in model_call_dict: + if "model" in model_call_dict: print(f"model_call_dict: {model_call_dict['model']}") else: print(f"model_call_dict: {model_call_dict}") @@ -38,7 +51,12 @@ def test_context_window(model): try: model = "chatgpt-test" print(f"model: {model}") - response = completion(model=model, messages=messages, custom_llm_provider="azure", logger_fn=logging_fn) + response = completion( + model=model, + messages=messages, + custom_llm_provider="azure", + logger_fn=logging_fn, + ) print(f"response: {response}") except InvalidRequestError as e: print(f"InvalidRequestError: {e.llm_provider}") @@ -52,14 +70,17 @@ def test_context_window(model): print(f"Uncaught Exception - {e}") pytest.fail(f"Error occurred: {e}") return + + test_context_window(test_model) + # Test 2: InvalidAuth Errors @pytest.mark.parametrize("model", models) -def invalid_auth(model): # set the model key to an invalid key, depending on the model - messages = [{ "content": "Hello, how are you?","role": "user"}] +def invalid_auth(model): # set the model key to an invalid key, depending on the model + messages = [{"content": "Hello, how are you?", "role": "user"}] temporary_key = None - try: + try: custom_llm_provider = None if model == "gpt-3.5-turbo": temporary_key = os.environ["OPENAI_API_KEY"] @@ -74,22 +95,29 @@ def invalid_auth(model): # set the model key to an invalid key, depending on the elif model == "command-nightly": temporary_key = os.environ["COHERE_API_KEY"] os.environ["COHERE_API_KEY"] = "bad-key" - elif model == "replicate/llama-2-70b-chat:2c1608e18606fad2812020dc541930f2d0495ce32eee50074220b87300bc16e1": - temporary_key = os.environ["REPLICATE_API_KEY"] + elif ( + model + == "replicate/llama-2-70b-chat:2c1608e18606fad2812020dc541930f2d0495ce32eee50074220b87300bc16e1" + ): + temporary_key = os.environ["REPLICATE_API_KEY"] os.environ["REPLICATE_API_KEY"] = "bad-key" print(f"model: {model}") - response = completion(model=model, messages=messages, custom_llm_provider=custom_llm_provider) + response = completion( + model=model, messages=messages, custom_llm_provider=custom_llm_provider + ) print(f"response: {response}") except AuthenticationError as e: print(f"AuthenticationError Caught Exception - {e.llm_provider}") - except OpenAIError: # is at least an openai error -> in case of random model errors - e.g. overloaded server + except ( + OpenAIError + ): # is at least an openai error -> in case of random model errors - e.g. 
overloaded server print(f"OpenAIError Caught Exception - {e}") except Exception as e: print(type(e)) print(e.__class__.__name__) print(f"Uncaught Exception - {e}") pytest.fail(f"Error occurred: {e}") - if temporary_key != None: # reset the key + if temporary_key != None: # reset the key if model == "gpt-3.5-turbo": os.environ["OPENAI_API_KEY"] = temporary_key elif model == "chatgpt-test": @@ -99,13 +127,18 @@ def invalid_auth(model): # set the model key to an invalid key, depending on the os.environ["ANTHROPIC_API_KEY"] = temporary_key elif model == "command-nightly": os.environ["COHERE_API_KEY"] = temporary_key - elif model == "replicate/llama-2-70b-chat:2c1608e18606fad2812020dc541930f2d0495ce32eee50074220b87300bc16e1": + elif ( + model + == "replicate/llama-2-70b-chat:2c1608e18606fad2812020dc541930f2d0495ce32eee50074220b87300bc16e1" + ): os.environ["REPLICATE_API_KEY"] = temporary_key return + + invalid_auth(test_model) -# # Test 3: Rate Limit Errors +# # Test 3: Rate Limit Errors # def test_model(model): -# try: +# try: # sample_text = "how does a court case get to the Supreme Court?" * 50000 # messages = [{ "content": sample_text,"role": "user"}] # custom_llm_provider = None @@ -142,5 +175,3 @@ invalid_auth(test_model) # accuracy_score = counts[True]/(counts[True] + counts[False]) # print(f"accuracy_score: {accuracy_score}") - - diff --git a/litellm/tests/test_helicone_integration.py b/litellm/tests/test_helicone_integration.py index 0b1d6ce8a..66e375d17 100644 --- a/litellm/tests/test_helicone_integration.py +++ b/litellm/tests/test_helicone_integration.py @@ -5,7 +5,9 @@ import sys, os import traceback import pytest -sys.path.insert(0, os.path.abspath('../..')) # Adds the parent directory to the system path +sys.path.insert( + 0, os.path.abspath("../..") +) # Adds the parent directory to the system path import litellm from litellm import embedding, completion @@ -14,11 +16,15 @@ litellm.success_callback = ["helicone"] litellm.set_verbose = True user_message = "Hello, how are you?" 
-messages = [{ "content": user_message,"role": "user"}] +messages = [{"content": user_message, "role": "user"}] -#openai call -response = completion(model="gpt-3.5-turbo", messages=[{"role": "user", "content": "Hi 👋 - i'm openai"}]) +# openai call +response = completion( + model="gpt-3.5-turbo", messages=[{"role": "user", "content": "Hi 👋 - i'm openai"}] +) -#cohere call -response = completion(model="command-nightly", messages=[{"role": "user", "content": "Hi 👋 - i'm cohere"}]) \ No newline at end of file +# cohere call +response = completion( + model="command-nightly", messages=[{"role": "user", "content": "Hi 👋 - i'm cohere"}] +) diff --git a/litellm/tests/test_load_test_model.py b/litellm/tests/test_load_test_model.py index 8040dabe7..0820990c2 100644 --- a/litellm/tests/test_load_test_model.py +++ b/litellm/tests/test_load_test_model.py @@ -1,22 +1,37 @@ import sys, os import traceback -sys.path.insert(0, os.path.abspath('../..')) # Adds the parent directory to the system path + +sys.path.insert( + 0, os.path.abspath("../..") +) # Adds the parent directory to the system path import litellm from litellm import load_test_model, testing_batch_completion -# ## Load Test Model +# ## Load Test Model # model="gpt-3.5-turbo" # result = load_test_model(model=model, num_calls=5) # print(result) # print(len(result["results"])) -# ## Duration Test Model +# ## Duration Test Model # model="gpt-3.5-turbo" # result = load_test_model(model=model, num_calls=5, duration=15, interval=15) # duration test the model for 2 minutes, sending 5 calls every 15s # print(result) -## Quality Test across Model -models = ["gpt-3.5-turbo", "gpt-3.5-turbo-16k", "gpt-4", "claude-instant-1", {"model": "replicate/llama-2-70b-chat:58d078176e02c219e11eb4da5a02a7830a283b14cf8f94537af893ccff5ee781", "custom_llm_provider": "replicate"}] -messages = [[{"role": "user", "content": "What is your name?"}], [{"role": "user", "content": "Hey, how's it going?"}]] +## Quality Test across Model +models = [ + "gpt-3.5-turbo", + "gpt-3.5-turbo-16k", + "gpt-4", + "claude-instant-1", + { + "model": "replicate/llama-2-70b-chat:58d078176e02c219e11eb4da5a02a7830a283b14cf8f94537af893ccff5ee781", + "custom_llm_provider": "replicate", + }, +] +messages = [ + [{"role": "user", "content": "What is your name?"}], + [{"role": "user", "content": "Hey, how's it going?"}], +] result = testing_batch_completion(models=models, messages=messages) -print(result) \ No newline at end of file +print(result) diff --git a/litellm/tests/test_logging.py b/litellm/tests/test_logging.py index 3174083ef..37caeffa9 100644 --- a/litellm/tests/test_logging.py +++ b/litellm/tests/test_logging.py @@ -3,7 +3,10 @@ import sys, os import traceback -sys.path.insert(0, os.path.abspath('../..')) # Adds the parent directory to the system path + +sys.path.insert( + 0, os.path.abspath("../..") +) # Adds the parent directory to the system path import litellm from litellm import embedding, completion @@ -11,49 +14,53 @@ litellm.set_verbose = False score = 0 + def logger_fn(model_call_object: dict): print(f"model call details: {model_call_object}") -user_message = "Hello, how are you?" -messages = [{ "content": user_message,"role": "user"}] -# test on openai completion call +user_message = "Hello, how are you?" 
+messages = [{"content": user_message, "role": "user"}] + +# test on openai completion call try: response = completion(model="gpt-3.5-turbo", messages=messages, logger_fn=logger_fn) - score +=1 + score += 1 except: - print(f"error occurred: {traceback.format_exc()}") + print(f"error occurred: {traceback.format_exc()}") pass -# test on non-openai completion call +# test on non-openai completion call try: - response = completion(model="claude-instant-1", messages=messages, logger_fn=logger_fn) + response = completion( + model="claude-instant-1", messages=messages, logger_fn=logger_fn + ) print(f"claude response: {response}") - score +=1 + score += 1 except: - print(f"error occurred: {traceback.format_exc()}") + print(f"error occurred: {traceback.format_exc()}") pass -# # test on openai embedding call -# try: +# # test on openai embedding call +# try: # response = embedding(model='text-embedding-ada-002', input=[user_message], logger_fn=logger_fn) -# score +=1 +# score +=1 # except: # traceback.print_exc() # # test on bad azure openai embedding call -> missing azure flag and this isn't an embedding model -# try: +# try: # response = embedding(model='chatgpt-test', input=[user_message], logger_fn=logger_fn) # except: # score +=1 # expect this to fail # traceback.print_exc() -# # test on good azure openai embedding call -# try: +# # test on good azure openai embedding call +# try: # response = embedding(model='azure-embedding-model', input=[user_message], azure=True, logger_fn=logger_fn) -# score +=1 +# score +=1 # except: # traceback.print_exc() -# print(f"Score: {score}, Overall score: {score/5}") \ No newline at end of file +# print(f"Score: {score}, Overall score: {score/5}") diff --git a/litellm/tests/test_model_fallback.py b/litellm/tests/test_model_fallback.py index 69dc1f68d..82535f77a 100644 --- a/litellm/tests/test_model_fallback.py +++ b/litellm/tests/test_model_fallback.py @@ -3,7 +3,10 @@ import sys, os import traceback -sys.path.insert(0, os.path.abspath('../..')) # Adds the parent directory to the system path + +sys.path.insert( + 0, os.path.abspath("../..") +) # Adds the parent directory to the system path import litellm from litellm import embedding, completion @@ -15,11 +18,11 @@ litellm.set_verbose = True model_fallback_list = ["claude-instant-1", "gpt-3.5-turbo", "chatgpt-test"] user_message = "Hello, how are you?" 
-messages = [{ "content": user_message,"role": "user"}] +messages = [{"content": user_message, "role": "user"}] for model in model_fallback_list: try: response = embedding(model="text-embedding-ada-002", input=[user_message]) response = completion(model=model, messages=messages) except Exception as e: - print(f"error occurred: {traceback.format_exc()}") + print(f"error occurred: {traceback.format_exc()}") diff --git a/litellm/tests/test_model_response_typing/server.py b/litellm/tests/test_model_response_typing/server.py index 0399f0d91..80dbc33af 100644 --- a/litellm/tests/test_model_response_typing/server.py +++ b/litellm/tests/test_model_response_typing/server.py @@ -20,4 +20,4 @@ # if __name__ == '__main__': # from waitress import serve -# serve(app, host='localhost', port=8080, threads=10) \ No newline at end of file +# serve(app, host='localhost', port=8080, threads=10) diff --git a/litellm/tests/test_model_response_typing/test.py b/litellm/tests/test_model_response_typing/test.py index 12d2b259b..95d404809 100644 --- a/litellm/tests/test_model_response_typing/test.py +++ b/litellm/tests/test_model_response_typing/test.py @@ -1,4 +1,4 @@ -# import requests, json +# import requests, json # BASE_URL = 'http://localhost:8080' @@ -11,4 +11,4 @@ # print("Hello route test passed!") # if __name__ == '__main__': -# test_hello_route() \ No newline at end of file +# test_hello_route() diff --git a/litellm/tests/test_no_client.py b/litellm/tests/test_no_client.py index 79c47d0da..05badddb6 100644 --- a/litellm/tests/test_no_client.py +++ b/litellm/tests/test_no_client.py @@ -4,7 +4,10 @@ import sys, os import traceback -sys.path.insert(0, os.path.abspath('../..')) # Adds the parent directory to the system path + +sys.path.insert( + 0, os.path.abspath("../..") +) # Adds the parent directory to the system path import litellm from litellm import embedding, completion @@ -13,11 +16,11 @@ litellm.set_verbose = True model_fallback_list = ["claude-instant-1", "gpt-3.5-turbo", "chatgpt-test"] user_message = "Hello, how are you?" -messages = [{ "content": user_message,"role": "user"}] +messages = [{"content": user_message, "role": "user"}] for model in model_fallback_list: try: response = embedding(model="text-embedding-ada-002", input=[user_message]) response = completion(model=model, messages=messages) except Exception as e: - print(f"error occurred: {traceback.format_exc()}") + print(f"error occurred: {traceback.format_exc()}") diff --git a/litellm/tests/test_ollama.py b/litellm/tests/test_ollama.py index d95414560..8e0732a2c 100644 --- a/litellm/tests/test_ollama.py +++ b/litellm/tests/test_ollama.py @@ -53,7 +53,6 @@ # # # return this generator to the client for streaming requests - # # async def get_response(): # # global generator # # async for elem in generator: diff --git a/litellm/tests/test_ollama_local.py b/litellm/tests/test_ollama_local.py index 22544f4cf..a9431a932 100644 --- a/litellm/tests/test_ollama_local.py +++ b/litellm/tests/test_ollama_local.py @@ -12,7 +12,6 @@ # import asyncio - # user_message = "respond in 20 words. who are you?" 
# messages = [{ "content": user_message,"role": "user"}] @@ -45,8 +44,3 @@ # pytest.fail(f"Error occurred: {e}") # test_completion_ollama_stream() - - - - - diff --git a/litellm/tests/test_secrets.py b/litellm/tests/test_secrets.py index b262044c4..9b9757015 100644 --- a/litellm/tests/test_secrets.py +++ b/litellm/tests/test_secrets.py @@ -4,7 +4,10 @@ import sys, os import traceback -sys.path.insert(0, os.path.abspath('../..')) # Adds the parent directory to the system path + +sys.path.insert( + 0, os.path.abspath("../..") +) # Adds the parent directory to the system path import litellm from litellm import embedding, completion from infisical import InfisicalClient @@ -15,7 +18,7 @@ infisical_token = os.environ["INFISICAL_TOKEN"] litellm.secret_manager_client = InfisicalClient(token=infisical_token) user_message = "Hello, whats the weather in San Francisco??" -messages = [{ "content": user_message,"role": "user"}] +messages = [{"content": user_message, "role": "user"}] def test_completion_openai(): @@ -28,5 +31,5 @@ def test_completion_openai(): pytest.fail(f"Error occurred: {e}") litellm.secret_manager_client = None -test_completion_openai() +test_completion_openai() diff --git a/litellm/tests/test_streaming.py b/litellm/tests/test_streaming.py index 317dea904..ef2063828 100644 --- a/litellm/tests/test_streaming.py +++ b/litellm/tests/test_streaming.py @@ -3,7 +3,10 @@ import sys, os import traceback -sys.path.insert(0, os.path.abspath('../..')) # Adds the parent directory to the system path + +sys.path.insert( + 0, os.path.abspath("../..") +) # Adds the parent directory to the system path import litellm from litellm import completion @@ -11,29 +14,40 @@ litellm.set_verbose = False score = 0 + def logger_fn(model_call_object: dict): print(f"model call details: {model_call_object}") -user_message = "Hello, how are you?" -messages = [{ "content": user_message,"role": "user"}] -# test on anthropic completion call +user_message = "Hello, how are you?" 
+messages = [{"content": user_message, "role": "user"}] + +# test on anthropic completion call try: - response = completion(model="claude-instant-1", messages=messages, stream=True, logger_fn=logger_fn) + response = completion( + model="claude-instant-1", messages=messages, stream=True, logger_fn=logger_fn + ) for chunk in response: - print(chunk['choices'][0]['delta']) - score +=1 + print(chunk["choices"][0]["delta"]) + score += 1 except: - print(f"error occurred: {traceback.format_exc()}") + print(f"error occurred: {traceback.format_exc()}") pass -# test on anthropic completion call +# test on anthropic completion call try: - response = completion(model="meta-llama/Llama-2-7b-chat-hf", messages=messages, custom_llm_provider="huggingface", custom_api_base="https://s7c7gytn18vnu4tw.us-east-1.aws.endpoints.huggingface.cloud", stream=True, logger_fn=logger_fn) + response = completion( + model="meta-llama/Llama-2-7b-chat-hf", + messages=messages, + custom_llm_provider="huggingface", + custom_api_base="https://s7c7gytn18vnu4tw.us-east-1.aws.endpoints.huggingface.cloud", + stream=True, + logger_fn=logger_fn, + ) for chunk in response: - print(chunk['choices'][0]['delta']) - score +=1 + print(chunk["choices"][0]["delta"]) + score += 1 except: - print(f"error occurred: {traceback.format_exc()}") - pass \ No newline at end of file + print(f"error occurred: {traceback.format_exc()}") + pass diff --git a/litellm/tests/test_supabase_integration.py b/litellm/tests/test_supabase_integration.py index ac4e31b58..882d0bbc6 100644 --- a/litellm/tests/test_supabase_integration.py +++ b/litellm/tests/test_supabase_integration.py @@ -21,7 +21,7 @@ # #openai call -# response = completion(model="gpt-3.5-turbo", messages=[{"role": "user", "content": "Hi 👋 - i'm openai"}]) +# response = completion(model="gpt-3.5-turbo", messages=[{"role": "user", "content": "Hi 👋 - i'm openai"}]) # #bad request call -# response = completion(model="chatgpt-test", messages=[{"role": "user", "content": "Hi 👋 - i'm a bad request"}]) \ No newline at end of file +# response = completion(model="chatgpt-test", messages=[{"role": "user", "content": "Hi 👋 - i'm a bad request"}]) diff --git a/litellm/tests/test_timeout.py b/litellm/tests/test_timeout.py index 31f27e12b..b2bc43ed8 100644 --- a/litellm/tests/test_timeout.py +++ b/litellm/tests/test_timeout.py @@ -3,10 +3,14 @@ import sys, os import traceback -sys.path.insert(0, os.path.abspath('../..')) # Adds the parent directory to the system path + +sys.path.insert( + 0, os.path.abspath("../..") +) # Adds the parent directory to the system path import time from litellm import timeout + @timeout(10) def stop_after_10_s(force_timeout=60): print("Stopping after 10 seconds") @@ -14,14 +18,14 @@ def stop_after_10_s(force_timeout=60): return -start_time = time.time() +start_time = time.time() try: - stop_after_10_s(force_timeout=1) + stop_after_10_s(force_timeout=1) except Exception as e: - print(e) - pass + print(e) + pass end_time = time.time() -print(f"total time: {end_time-start_time}") \ No newline at end of file +print(f"total time: {end_time-start_time}") diff --git a/litellm/tests/test_vertex.py b/litellm/tests/test_vertex.py index 468ba8d32..01088ec89 100644 --- a/litellm/tests/test_vertex.py +++ b/litellm/tests/test_vertex.py @@ -49,4 +49,4 @@ # # chat = chat_model.start_chat() # # response = chat.send_message("who are u? 
write a sentence", **parameters) -# # print(f"Response from Model: {response.text}") \ No newline at end of file +# # print(f"Response from Model: {response.text}") diff --git a/litellm/timeout.py b/litellm/timeout.py index 81d99e7de..cca4b06e7 100644 --- a/litellm/timeout.py +++ b/litellm/timeout.py @@ -11,9 +11,7 @@ from threading import Thread from openai.error import Timeout -def timeout( - timeout_duration: float = None, exception_to_raise = Timeout -): +def timeout(timeout_duration: float = None, exception_to_raise=Timeout): """ Wraps a function to raise the specified exception if execution time is greater than the specified timeout. @@ -44,7 +42,9 @@ def timeout( result = future.result(timeout=local_timeout_duration) except futures.TimeoutError: thread.stop_loop() - raise exception_to_raise(f"A timeout error occurred. The function call took longer than {local_timeout_duration} second(s).") + raise exception_to_raise( + f"A timeout error occurred. The function call took longer than {local_timeout_duration} second(s)." + ) thread.stop_loop() return result @@ -59,7 +59,9 @@ def timeout( ) return value except asyncio.TimeoutError: - raise exception_to_raise(f"A timeout error occurred. The function call took longer than {local_timeout_duration} second(s).") + raise exception_to_raise( + f"A timeout error occurred. The function call took longer than {local_timeout_duration} second(s)." + ) if iscoroutinefunction(func): return async_wrapper @@ -80,4 +82,4 @@ class _LoopWrapper(Thread): def stop_loop(self): for task in asyncio.all_tasks(self.loop): task.cancel() - self.loop.call_soon_threadsafe(self.loop.stop) \ No newline at end of file + self.loop.call_soon_threadsafe(self.loop.stop) diff --git a/litellm/utils.py b/litellm/utils.py index b45418933..3190b56d6 100644 --- a/litellm/utils.py +++ b/litellm/utils.py @@ -1,10 +1,11 @@ import sys import dotenv, json, traceback, threading -import subprocess, os -import litellm, openai +import subprocess, os +import litellm, openai import random, uuid, requests import datetime, time import tiktoken + encoding = tiktoken.get_encoding("cl100k_base") import pkg_resources from .integrations.helicone import HeliconeLogger @@ -13,10 +14,17 @@ from .integrations.berrispend import BerriSpendLogger from .integrations.supabase import Supabase from openai.error import OpenAIError as OriginalError from openai.openai_object import OpenAIObject -from .exceptions import AuthenticationError, InvalidRequestError, RateLimitError, ServiceUnavailableError, OpenAIError +from .exceptions import ( + AuthenticationError, + InvalidRequestError, + RateLimitError, + ServiceUnavailableError, + OpenAIError, +) from typing import List, Dict, Union + ####### ENVIRONMENT VARIABLES ################### -dotenv.load_dotenv() # Loading env variables using dotenv +dotenv.load_dotenv() # Loading env variables using dotenv sentry_sdk_instance = None capture_exception = None add_breadcrumb = None @@ -51,12 +59,14 @@ local_cache = {} # 'usage': {'prompt_tokens': 18, 'completion_tokens': 23, 'total_tokens': 41} # } + class Message(OpenAIObject): def __init__(self, content="default", role="assistant", **params): super(Message, self).__init__(**params) self.content = content self.role = role + class Choices(OpenAIObject): def __init__(self, finish_reason="stop", index=0, message=Message(), **params): super(Choices, self).__init__(**params) @@ -64,38 +74,48 @@ class Choices(OpenAIObject): self.index = index self.message = message + class ModelResponse(OpenAIObject): def __init__(self, 
choices=None, created=None, model=None, usage=None, **params): super(ModelResponse, self).__init__(**params) self.choices = choices if choices else [Choices()] self.created = created self.model = model - self.usage = usage if usage else { - "prompt_tokens": None, - "completion_tokens": None, - "total_tokens": None - } + self.usage = ( + usage + if usage + else { + "prompt_tokens": None, + "completion_tokens": None, + "total_tokens": None, + } + ) def to_dict_recursive(self): d = super().to_dict_recursive() - d['choices'] = [choice.to_dict_recursive() for choice in self.choices] + d["choices"] = [choice.to_dict_recursive() for choice in self.choices] return d + + ############################################################ def print_verbose(print_statement): - if litellm.set_verbose: - print(f"LiteLLM: {print_statement}") - if random.random() <= 0.3: - print("Get help - https://discord.com/invite/wuPM9dRgDw") + if litellm.set_verbose: + print(f"LiteLLM: {print_statement}") + if random.random() <= 0.3: + print("Get help - https://discord.com/invite/wuPM9dRgDw") + ####### Package Import Handler ################### import importlib import subprocess + + def install_and_import(package: str): if package in globals().keys(): - print_verbose(f"{package} has already been imported.") - return + print_verbose(f"{package} has already been imported.") + return try: - # Import the module + # Import the module module = importlib.import_module(package) except ImportError: print_verbose(f"{package} is not installed. Installing...") @@ -108,200 +128,262 @@ def install_and_import(package: str): finally: if package not in globals().keys(): globals()[package] = importlib.import_module(package) + + ################################################## -####### LOGGING ################### -#Logging function -> log the exact model details + what's being sent | Non-Blocking -def logging(model=None, input=None, custom_llm_provider=None, azure=False, additional_args={}, logger_fn=None, exception=None): - try: - model_call_details = {} - if model: - model_call_details["model"] = model - if azure: - model_call_details["azure"] = azure - if custom_llm_provider: - model_call_details["custom_llm_provider"] = custom_llm_provider - if exception: - model_call_details["exception"] = exception - if input: - model_call_details["input"] = input - - if len(additional_args): - model_call_details["additional_args"] = additional_args - # log additional call details -> api key, etc. 
- if model: - if azure == True or model in litellm.open_ai_chat_completion_models or model in litellm.open_ai_chat_completion_models or model in litellm.open_ai_embedding_models: - model_call_details["api_type"] = openai.api_type - model_call_details["api_base"] = openai.api_base - model_call_details["api_version"] = openai.api_version - model_call_details["api_key"] = openai.api_key - elif "replicate" in model: - model_call_details["api_key"] = os.environ.get("REPLICATE_API_TOKEN") - elif model in litellm.anthropic_models: - model_call_details["api_key"] = os.environ.get("ANTHROPIC_API_KEY") - elif model in litellm.cohere_models: - model_call_details["api_key"] = os.environ.get("COHERE_API_KEY") - ## User Logging -> if you pass in a custom logging function or want to use sentry breadcrumbs - print_verbose(f"Logging Details: logger_fn - {logger_fn} | callable(logger_fn) - {callable(logger_fn)}") - if logger_fn and callable(logger_fn): - try: - logger_fn(model_call_details) # Expectation: any logger function passed in by the user should accept a dict object - except Exception as e: - print(f"LiteLLM.LoggingError: [Non-Blocking] Exception occurred while logging {traceback.format_exc()}") - except Exception as e: - print(f"LiteLLM.LoggingError: [Non-Blocking] Exception occurred while logging {traceback.format_exc()}") - pass -####### CLIENT ################### +####### LOGGING ################### +# Logging function -> log the exact model details + what's being sent | Non-Blocking +def logging( + model=None, + input=None, + custom_llm_provider=None, + azure=False, + additional_args={}, + logger_fn=None, + exception=None, +): + try: + model_call_details = {} + if model: + model_call_details["model"] = model + if azure: + model_call_details["azure"] = azure + if custom_llm_provider: + model_call_details["custom_llm_provider"] = custom_llm_provider + if exception: + model_call_details["exception"] = exception + if input: + model_call_details["input"] = input + + if len(additional_args): + model_call_details["additional_args"] = additional_args + # log additional call details -> api key, etc. 
+ if model: + if ( + azure == True + or model in litellm.open_ai_chat_completion_models + or model in litellm.open_ai_text_completion_models + or model in litellm.open_ai_embedding_models + ): + model_call_details["api_type"] = openai.api_type + model_call_details["api_base"] = openai.api_base + model_call_details["api_version"] = openai.api_version + model_call_details["api_key"] = openai.api_key + elif "replicate" in model: + model_call_details["api_key"] = os.environ.get("REPLICATE_API_TOKEN") + elif model in litellm.anthropic_models: + model_call_details["api_key"] = os.environ.get("ANTHROPIC_API_KEY") + elif model in litellm.cohere_models: + model_call_details["api_key"] = os.environ.get("COHERE_API_KEY") + ## User Logging -> if you pass in a custom logging function or want to use sentry breadcrumbs + print_verbose( + f"Logging Details: logger_fn - {logger_fn} | callable(logger_fn) - {callable(logger_fn)}" + ) + if logger_fn and callable(logger_fn): + try: + logger_fn( + model_call_details + ) # Expectation: any logger function passed in by the user should accept a dict object + except Exception as e: + print( + f"LiteLLM.LoggingError: [Non-Blocking] Exception occurred while logging {traceback.format_exc()}" + ) + except Exception as e: + print( + f"LiteLLM.LoggingError: [Non-Blocking] Exception occurred while logging {traceback.format_exc()}" + ) + pass + + +####### CLIENT ################### # make it easy to log if completion/embedding runs succeeded or failed + see what happened | Non-Blocking def client(original_function): - def function_setup(*args, **kwargs): #just run once to check if user wants to send their data anywhere - PostHog/Sentry/Slack/etc. + def function_setup( + *args, **kwargs + ): # just run once to check if user wants to send their data anywhere - PostHog/Sentry/Slack/etc.
+ try: + global callback_list, add_breadcrumb, user_logger_fn + if ( + len(litellm.success_callback) > 0 or len(litellm.failure_callback) > 0 + ) and len(callback_list) == 0: + callback_list = list( + set(litellm.success_callback + litellm.failure_callback) + ) + set_callbacks( + callback_list=callback_list, + ) + if add_breadcrumb: + add_breadcrumb( + category="litellm.llm_call", + message=f"Positional Args: {args}, Keyword Args: {kwargs}", + level="info", + ) + if "logger_fn" in kwargs: + user_logger_fn = kwargs["logger_fn"] + except: # DO NOT BLOCK running the function because of this + print_verbose(f"[Non-Blocking] {traceback.format_exc()}") + pass def crash_reporting(*args, **kwargs): - if litellm.telemetry: - try: - model = args[0] if len(args) > 0 else kwargs["model"] - exception = kwargs["exception"] if "exception" in kwargs else None - custom_llm_provider = kwargs["custom_llm_provider"] if "custom_llm_provider" in kwargs else None - safe_crash_reporting(model=model, exception=exception, custom_llm_provider=custom_llm_provider) # log usage-crash details. Do not log any user details. If you want to turn this off, set `litellm.telemetry=False`. - except: - #[Non-Blocking Error] - pass + if litellm.telemetry: + try: + model = args[0] if len(args) > 0 else kwargs["model"] + exception = kwargs["exception"] if "exception" in kwargs else None + custom_llm_provider = ( + kwargs["custom_llm_provider"] + if "custom_llm_provider" in kwargs + else None + ) + safe_crash_reporting( + model=model, + exception=exception, + custom_llm_provider=custom_llm_provider, + ) # log usage-crash details. Do not log any user details. If you want to turn this off, set `litellm.telemetry=False`. + except: + # [Non-Blocking Error] + pass def get_prompt(*args, **kwargs): - # make this safe checks, it should not throw any exceptions - if len(args) > 1: - messages = args[1] - prompt = " ".join(message["content"] for message in messages) - return prompt - if "messages" in kwargs: - messages = kwargs["messages"] - prompt = " ".join(message["content"] for message in messages) - return prompt - return None + # make this safe checks, it should not throw any exceptions + if len(args) > 1: + messages = args[1] + prompt = " ".join(message["content"] for message in messages) + return prompt + if "messages" in kwargs: + messages = kwargs["messages"] + prompt = " ".join(message["content"] for message in messages) + return prompt + return None def check_cache(*args, **kwargs): - try: # never block execution - prompt = get_prompt(*args, **kwargs) - if prompt != None and prompt in local_cache: # check if messages / prompt exists - result = local_cache[prompt] - return result - else: - return None - except: - return None - + try: # never block execution + prompt = get_prompt(*args, **kwargs) + if ( + prompt != None and prompt in local_cache + ): # check if messages / prompt exists + result = local_cache[prompt] + return result + else: + return None + except: + return None + def add_cache(result, *args, **kwargs): - try: # never block execution - prompt = get_prompt(*args, **kwargs) - local_cache[prompt] = result - except: - pass + try: # never block execution + prompt = get_prompt(*args, **kwargs) + local_cache[prompt] = result + except: + pass def wrapper(*args, **kwargs): start_time = None result = None try: - function_setup(*args, **kwargs) - ## MODEL CALL - start_time = datetime.datetime.now() - if litellm.caching and (cached_result := check_cache(*args, **kwargs)) is not None: - result = cached_result - else: - result = 
original_function(*args, **kwargs) - end_time = datetime.datetime.now() - ## Add response to CACHE - if litellm.caching: - add_cache(result, *args, **kwargs) - ## LOG SUCCESS - crash_reporting(*args, **kwargs) - my_thread = threading.Thread(target=handle_success, args=(args, kwargs, result, start_time, end_time)) # don't interrupt execution of main thread - my_thread.start() - return result + function_setup(*args, **kwargs) + ## MODEL CALL + start_time = datetime.datetime.now() + if ( + litellm.caching + and (cached_result := check_cache(*args, **kwargs)) is not None + ): + result = cached_result + else: + result = original_function(*args, **kwargs) + end_time = datetime.datetime.now() + ## Add response to CACHE + if litellm.caching: + add_cache(result, *args, **kwargs) + ## LOG SUCCESS + crash_reporting(*args, **kwargs) + my_thread = threading.Thread( + target=handle_success, args=(args, kwargs, result, start_time, end_time) + ) # don't interrupt execution of main thread + my_thread.start() + return result except Exception as e: - traceback_exception = traceback.format_exc() - crash_reporting(*args, **kwargs, exception=traceback_exception) - end_time = datetime.datetime.now() - my_thread = threading.Thread(target=handle_failure, args=(e, traceback_exception, start_time, end_time, args, kwargs)) # don't interrupt execution of main thread - my_thread.start() - raise e + traceback_exception = traceback.format_exc() + crash_reporting(*args, **kwargs, exception=traceback_exception) + end_time = datetime.datetime.now() + my_thread = threading.Thread( + target=handle_failure, + args=(e, traceback_exception, start_time, end_time, args, kwargs), + ) # don't interrupt execution of main thread + my_thread.start() + raise e + return wrapper + ####### USAGE CALCULATOR ################ + def token_counter(model, text): - # use tiktoken or anthropic's tokenizer depending on the model - num_tokens = 0 - if "claude" in model: - install_and_import('anthropic') - from anthropic import Anthropic, HUMAN_PROMPT, AI_PROMPT - anthropic = Anthropic() - num_tokens = anthropic.count_tokens(text) - else: - num_tokens = len(encoding.encode(text)) - return num_tokens + # use tiktoken or anthropic's tokenizer depending on the model + num_tokens = 0 + if "claude" in model: + install_and_import("anthropic") + from anthropic import Anthropic, HUMAN_PROMPT, AI_PROMPT + + anthropic = Anthropic() + num_tokens = anthropic.count_tokens(text) + else: + num_tokens = len(encoding.encode(text)) + return num_tokens -def cost_per_token(model="gpt-3.5-turbo", prompt_tokens = 0, completion_tokens = 0): - ## given - prompt_tokens_cost_usd_dollar = 0 - completion_tokens_cost_usd_dollar = 0 - model_cost_ref = litellm.model_cost - if model in model_cost_ref: - prompt_tokens_cost_usd_dollar = model_cost_ref[model]["input_cost_per_token"] * prompt_tokens - completion_tokens_cost_usd_dollar = model_cost_ref[model]["output_cost_per_token"] * completion_tokens - return prompt_tokens_cost_usd_dollar, completion_tokens_cost_usd_dollar - else: - # calculate average input cost - input_cost_sum = 0 - output_cost_sum = 0 +def cost_per_token(model="gpt-3.5-turbo", prompt_tokens=0, completion_tokens=0): + ## given + prompt_tokens_cost_usd_dollar = 0 + completion_tokens_cost_usd_dollar = 0 model_cost_ref = litellm.model_cost - for model in model_cost_ref: - input_cost_sum += model_cost_ref[model]["input_cost_per_token"] - output_cost_sum += model_cost_ref[model]["output_cost_per_token"] - avg_input_cost = input_cost_sum / len(model_cost_ref.keys()) - 
avg_output_cost = output_cost_sum / len(model_cost_ref.keys()) - prompt_tokens_cost_usd_dollar = avg_input_cost * prompt_tokens - completion_tokens_cost_usd_dollar = avg_output_cost * completion_tokens - return prompt_tokens_cost_usd_dollar, completion_tokens_cost_usd_dollar - + if model in model_cost_ref: + prompt_tokens_cost_usd_dollar = ( + model_cost_ref[model]["input_cost_per_token"] * prompt_tokens + ) + completion_tokens_cost_usd_dollar = ( + model_cost_ref[model]["output_cost_per_token"] * completion_tokens + ) + return prompt_tokens_cost_usd_dollar, completion_tokens_cost_usd_dollar + else: + # calculate average input cost + input_cost_sum = 0 + output_cost_sum = 0 + model_cost_ref = litellm.model_cost + for model in model_cost_ref: + input_cost_sum += model_cost_ref[model]["input_cost_per_token"] + output_cost_sum += model_cost_ref[model]["output_cost_per_token"] + avg_input_cost = input_cost_sum / len(model_cost_ref.keys()) + avg_output_cost = output_cost_sum / len(model_cost_ref.keys()) + prompt_tokens_cost_usd_dollar = avg_input_cost * prompt_tokens + completion_tokens_cost_usd_dollar = avg_output_cost * completion_tokens + return prompt_tokens_cost_usd_dollar, completion_tokens_cost_usd_dollar + def completion_cost(model="gpt-3.5-turbo", prompt="", completion=""): - prompt_tokens = token_counter(model=model, text=prompt) - completion_tokens = token_counter(model=model, text=completion) - prompt_tokens_cost_usd_dollar, completion_tokens_cost_usd_dollar = cost_per_token(model=model, prompt_tokens = prompt_tokens, completion_tokens = completion_tokens) - return prompt_tokens_cost_usd_dollar + completion_tokens_cost_usd_dollar + prompt_tokens = token_counter(model=model, text=prompt) + completion_tokens = token_counter(model=model, text=completion) + prompt_tokens_cost_usd_dollar, completion_tokens_cost_usd_dollar = cost_per_token( + model=model, prompt_tokens=prompt_tokens, completion_tokens=completion_tokens + ) + return prompt_tokens_cost_usd_dollar + completion_tokens_cost_usd_dollar + ####### HELPER FUNCTIONS ################ def get_litellm_params( return_async=False, - api_key=None, - force_timeout=600, - azure=False, - logger_fn=None, + api_key=None, + force_timeout=600, + azure=False, + logger_fn=None, verbose=False, - hugging_face=False, + hugging_face=False, replicate=False, - together_ai=False, - custom_llm_provider=None, - custom_api_base=None -): + together_ai=False, + custom_llm_provider=None, + custom_api_base=None, +): litellm_params = { "return_async": return_async, "api_key": api_key, @@ -309,463 +391,656 @@ def get_litellm_params( "logger_fn": logger_fn, "verbose": verbose, "custom_llm_provider": custom_llm_provider, - "custom_api_base": custom_api_base + "custom_api_base": custom_api_base, } - + return litellm_params def get_optional_params( # 12 optional params - functions = [], - function_call = "", - temperature = 1, - top_p = 1, - n = 1, - stream = False, - stop = None, - max_tokens = float('inf'), - presence_penalty = 0, - frequency_penalty = 0, - logit_bias = {}, - user = "", - deployment_id = None, - model = None, - custom_llm_provider = "", - top_k = 40, + functions=[], + function_call="", + temperature=1, + top_p=1, + n=1, + stream=False, + stop=None, + max_tokens=float("inf"), + presence_penalty=0, + frequency_penalty=0, + logit_bias={}, + user="", + deployment_id=None, + model=None, + custom_llm_provider="", + top_k=40, ): - optional_params = {} - if model in litellm.anthropic_models: - # handle anthropic params - if stream: - 
optional_params["stream"] = stream - if stop != None: - optional_params["stop_sequences"] = stop - if temperature != 1: + optional_params = {} + if model in litellm.anthropic_models: + # handle anthropic params + if stream: + optional_params["stream"] = stream + if stop != None: + optional_params["stop_sequences"] = stop + if temperature != 1: + optional_params["temperature"] = temperature + if top_p != 1: + optional_params["top_p"] = top_p + return optional_params + elif model in litellm.cohere_models: + # handle cohere params + if stream: + optional_params["stream"] = stream + if temperature != 1: + optional_params["temperature"] = temperature + if max_tokens != float("inf"): + optional_params["max_tokens"] = max_tokens + return optional_params + elif custom_llm_provider == "replicate": + # any replicate models + # TODO: handle translating remaining replicate params + if stream: + optional_params["stream"] = stream + return optional_params + elif custom_llm_provider == "together_ai" or ("togethercomputer" in model): + if stream: + optional_params["stream_tokens"] = stream + if temperature != 1: + optional_params["temperature"] = temperature + if top_p != 1: + optional_params["top_p"] = top_p + if max_tokens != float("inf"): + optional_params["max_tokens"] = max_tokens + if frequency_penalty != 0: + optional_params["frequency_penalty"] = frequency_penalty + elif ( + model == "chat-bison" + ): # chat-bison has diff args from chat-bison@001 ty Google + if temperature != 1: + optional_params["temperature"] = temperature + if top_p != 1: + optional_params["top_p"] = top_p + if max_tokens != float("inf"): + optional_params["max_output_tokens"] = max_tokens + elif model in litellm.vertex_text_models: + # required params for all text vertex calls + # temperature=0.2, top_p=0.1, top_k=20 + # always set temperature, top_p, top_k else, text bison fails optional_params["temperature"] = temperature - if top_p != 1: optional_params["top_p"] = top_p - return optional_params - elif model in litellm.cohere_models: - # handle cohere params - if stream: - optional_params["stream"] = stream - if temperature != 1: - optional_params["temperature"] = temperature - if max_tokens != float('inf'): - optional_params["max_tokens"] = max_tokens - return optional_params - elif custom_llm_provider == "replicate": - # any replicate models - # TODO: handle translating remaining replicate params - if stream: - optional_params["stream"] = stream - return optional_params - elif custom_llm_provider == "together_ai" or ("togethercomputer" in model): - if stream: - optional_params["stream_tokens"] = stream - if temperature != 1: - optional_params["temperature"] = temperature - if top_p != 1: - optional_params["top_p"] = top_p - if max_tokens != float('inf'): - optional_params["max_tokens"] = max_tokens - if frequency_penalty != 0: - optional_params["frequency_penalty"] = frequency_penalty - elif model == "chat-bison": # chat-bison has diff args from chat-bison@001 ty Google - if temperature != 1: - optional_params["temperature"] = temperature - if top_p != 1: - optional_params["top_p"] = top_p - if max_tokens != float('inf'): - optional_params["max_output_tokens"] = max_tokens - elif model in litellm.vertex_text_models: - # required params for all text vertex calls - # temperature=0.2, top_p=0.1, top_k=20 - # always set temperature, top_p, top_k else, text bison fails - optional_params["temperature"] = temperature - optional_params["top_p"] = top_p - optional_params["top_k"] = top_k + optional_params["top_k"] = top_k - else:# 
assume passing in params for openai/azure openai - if functions != []: - optional_params["functions"] = functions - if function_call != "": - optional_params["function_call"] = function_call - if temperature != 1: - optional_params["temperature"] = temperature - if top_p != 1: - optional_params["top_p"] = top_p - if n != 1: - optional_params["n"] = n - if stream: - optional_params["stream"] = stream - if stop != None: - optional_params["stop"] = stop - if max_tokens != float('inf'): - optional_params["max_tokens"] = max_tokens - if presence_penalty != 0: - optional_params["presence_penalty"] = presence_penalty - if frequency_penalty != 0: - optional_params["frequency_penalty"] = frequency_penalty - if logit_bias != {}: - optional_params["logit_bias"] = logit_bias - if user != "": - optional_params["user"] = user - if deployment_id != None: - optional_params["deployment_id"] = deployment_id + else: # assume passing in params for openai/azure openai + if functions != []: + optional_params["functions"] = functions + if function_call != "": + optional_params["function_call"] = function_call + if temperature != 1: + optional_params["temperature"] = temperature + if top_p != 1: + optional_params["top_p"] = top_p + if n != 1: + optional_params["n"] = n + if stream: + optional_params["stream"] = stream + if stop != None: + optional_params["stop"] = stop + if max_tokens != float("inf"): + optional_params["max_tokens"] = max_tokens + if presence_penalty != 0: + optional_params["presence_penalty"] = presence_penalty + if frequency_penalty != 0: + optional_params["frequency_penalty"] = frequency_penalty + if logit_bias != {}: + optional_params["logit_bias"] = logit_bias + if user != "": + optional_params["user"] = user + if deployment_id != None: + optional_params["deployment_id"] = deployment_id + return optional_params return optional_params - return optional_params -def load_test_model(model: str, custom_llm_provider: str = None, custom_api_base: str = None, prompt: str = None, num_calls: int = None, force_timeout: int = None): - test_prompt = "Hey, how's it going" - test_calls = 100 - if prompt: - test_prompt = prompt - if num_calls: - test_calls = num_calls - messages = [[{"role": "user", "content": test_prompt}] for _ in range(test_calls)] - start_time = time.time() - try: - litellm.batch_completion(model=model, messages=messages, custom_llm_provider=custom_llm_provider, custom_api_base = custom_api_base, force_timeout=force_timeout) - end_time = time.time() - response_time = end_time - start_time - return {"total_response_time": response_time, "calls_made": 100, "status": "success", "exception": None} - except Exception as e: - end_time = time.time() - response_time = end_time - start_time - return {"total_response_time": response_time, "calls_made": 100, "status": "failed", "exception": e} + +def load_test_model( + model: str, + custom_llm_provider: str = None, + custom_api_base: str = None, + prompt: str = None, + num_calls: int = None, + force_timeout: int = None, +): + test_prompt = "Hey, how's it going" + test_calls = 100 + if prompt: + test_prompt = prompt + if num_calls: + test_calls = num_calls + messages = [[{"role": "user", "content": test_prompt}] for _ in range(test_calls)] + start_time = time.time() + try: + litellm.batch_completion( + model=model, + messages=messages, + custom_llm_provider=custom_llm_provider, + custom_api_base=custom_api_base, + force_timeout=force_timeout, + ) + end_time = time.time() + response_time = end_time - start_time + return { + "total_response_time": 
response_time, + "calls_made": 100, + "status": "success", + "exception": None, + } + except Exception as e: + end_time = time.time() + response_time = end_time - start_time + return { + "total_response_time": response_time, + "calls_made": 100, + "status": "failed", + "exception": e, + } + def set_callbacks(callback_list): - global sentry_sdk_instance, capture_exception, add_breadcrumb, posthog, slack_app, alerts_channel, heliconeLogger, aispendLogger, berrispendLogger, supabaseClient - try: - for callback in callback_list: - if callback == "sentry" or "SENTRY_API_URL" in os.environ: - try: - import sentry_sdk - except ImportError: - print_verbose("Package 'sentry_sdk' is missing. Installing it...") - subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'sentry_sdk']) - import sentry_sdk - sentry_sdk_instance = sentry_sdk - sentry_trace_rate = os.environ.get("SENTRY_API_TRACE_RATE") if "SENTRY_API_TRACE_RATE" in os.environ else "1.0" - sentry_sdk_instance.init(dsn=os.environ.get("SENTRY_API_URL"), traces_sample_rate=float(sentry_trace_rate)) - capture_exception = sentry_sdk_instance.capture_exception - add_breadcrumb = sentry_sdk_instance.add_breadcrumb - elif callback == "posthog": - try: - from posthog import Posthog - except ImportError: - print_verbose("Package 'posthog' is missing. Installing it...") - subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'posthog']) - from posthog import Posthog - posthog = Posthog( - project_api_key=os.environ.get("POSTHOG_API_KEY"), - host=os.environ.get("POSTHOG_API_URL")) - elif callback == "slack": - try: - from slack_bolt import App - except ImportError: - print_verbose("Package 'slack_bolt' is missing. Installing it...") - subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'slack_bolt']) - from slack_bolt import App - slack_app = App( - token=os.environ.get("SLACK_API_TOKEN"), - signing_secret=os.environ.get("SLACK_API_SECRET") - ) - alerts_channel = os.environ["SLACK_API_CHANNEL"] - print_verbose(f"Initialized Slack App: {slack_app}") - elif callback == "helicone": - heliconeLogger = HeliconeLogger() - elif callback == "aispend": - aispendLogger = AISpendLogger() - elif callback == "berrispend": - berrispendLogger = BerriSpendLogger() - elif callback == "supabase": - supabaseClient = Supabase() - except Exception as e: - raise e + global sentry_sdk_instance, capture_exception, add_breadcrumb, posthog, slack_app, alerts_channel, heliconeLogger, aispendLogger, berrispendLogger, supabaseClient + try: + for callback in callback_list: + if callback == "sentry" or "SENTRY_API_URL" in os.environ: + try: + import sentry_sdk + except ImportError: + print_verbose("Package 'sentry_sdk' is missing. Installing it...") + subprocess.check_call( + [sys.executable, "-m", "pip", "install", "sentry_sdk"] + ) + import sentry_sdk + sentry_sdk_instance = sentry_sdk + sentry_trace_rate = ( + os.environ.get("SENTRY_API_TRACE_RATE") + if "SENTRY_API_TRACE_RATE" in os.environ + else "1.0" + ) + sentry_sdk_instance.init( + dsn=os.environ.get("SENTRY_API_URL"), + traces_sample_rate=float(sentry_trace_rate), + ) + capture_exception = sentry_sdk_instance.capture_exception + add_breadcrumb = sentry_sdk_instance.add_breadcrumb + elif callback == "posthog": + try: + from posthog import Posthog + except ImportError: + print_verbose("Package 'posthog' is missing. 
Installing it...") + subprocess.check_call( + [sys.executable, "-m", "pip", "install", "posthog"] + ) + from posthog import Posthog + posthog = Posthog( + project_api_key=os.environ.get("POSTHOG_API_KEY"), + host=os.environ.get("POSTHOG_API_URL"), + ) + elif callback == "slack": + try: + from slack_bolt import App + except ImportError: + print_verbose("Package 'slack_bolt' is missing. Installing it...") + subprocess.check_call( + [sys.executable, "-m", "pip", "install", "slack_bolt"] + ) + from slack_bolt import App + slack_app = App( + token=os.environ.get("SLACK_API_TOKEN"), + signing_secret=os.environ.get("SLACK_API_SECRET"), + ) + alerts_channel = os.environ["SLACK_API_CHANNEL"] + print_verbose(f"Initialized Slack App: {slack_app}") + elif callback == "helicone": + heliconeLogger = HeliconeLogger() + elif callback == "aispend": + aispendLogger = AISpendLogger() + elif callback == "berrispend": + berrispendLogger = BerriSpendLogger() + elif callback == "supabase": + supabaseClient = Supabase() + except Exception as e: + raise e def handle_failure(exception, traceback_exception, start_time, end_time, args, kwargs): global sentry_sdk_instance, capture_exception, add_breadcrumb, posthog, slack_app, alerts_channel, aispendLogger, berrispendLogger try: - # print_verbose(f"handle_failure args: {args}") - # print_verbose(f"handle_failure kwargs: {kwargs}") - - success_handler = additional_details.pop("success_handler", None) - failure_handler = additional_details.pop("failure_handler", None) - - additional_details["Event_Name"] = additional_details.pop("failed_event_name", "litellm.failed_query") - print_verbose(f"self.failure_callback: {litellm.failure_callback}") + # print_verbose(f"handle_failure args: {args}") + # print_verbose(f"handle_failure kwargs: {kwargs}") + success_handler = additional_details.pop("success_handler", None) + failure_handler = additional_details.pop("failure_handler", None) - # print_verbose(f"additional_details: {additional_details}") - for callback in litellm.failure_callback: - try: - if callback == "slack": - slack_msg = "" - if len(kwargs) > 0: - for key in kwargs: - slack_msg += f"{key}: {kwargs[key]}\n" - if len(args) > 0: - for i, arg in enumerate(args): - slack_msg += f"LiteLLM_Args_{str(i)}: {arg}" - for detail in additional_details: - slack_msg += f"{detail}: {additional_details[detail]}\n" - slack_msg += f"Traceback: {traceback_exception}" - slack_app.client.chat_postMessage(channel=alerts_channel, text=slack_msg) - elif callback == "sentry": - capture_exception(exception) - elif callback == "posthog": - print_verbose(f"inside posthog, additional_details: {len(additional_details.keys())}") - ph_obj = {} - if len(kwargs) > 0: - ph_obj = kwargs - if len(args) > 0: - for i, arg in enumerate(args): - ph_obj["litellm_args_" + str(i)] = arg - for detail in additional_details: - ph_obj[detail] = additional_details[detail] - event_name = additional_details["Event_Name"] - print_verbose(f"ph_obj: {ph_obj}") - print_verbose(f"PostHog Event Name: {event_name}") - if "user_id" in additional_details: - posthog.capture(additional_details["user_id"], event_name, ph_obj) - else: # PostHog calls require a unique id to identify a user - https://posthog.com/docs/libraries/python - unique_id = str(uuid.uuid4()) - posthog.capture(unique_id, event_name) - print_verbose(f"successfully logged to PostHog!") - elif callback == "berrispend": - print_verbose("reaches berrispend for logging!") - model = args[0] if len(args) > 0 else kwargs["model"] - messages = args[1] if len(args) > 
1 else kwargs["messages"] - result = { - "model": model, - "created": time.time(), - "error": traceback_exception, - "usage": { - "prompt_tokens": prompt_token_calculator(model, messages=messages), - "completion_tokens": 0 - } - } - berrispendLogger.log_event(model=model, messages=messages, response_obj=result, start_time=start_time, end_time=end_time, print_verbose=print_verbose) - elif callback == "aispend": - print_verbose("reaches aispend for logging!") - model = args[0] if len(args) > 0 else kwargs["model"] - messages = args[1] if len(args) > 1 else kwargs["messages"] - result = { - "model": model, - "created": time.time(), - "usage": { - "prompt_tokens": prompt_token_calculator(model, messages=messages), - "completion_tokens": 0 - } - } - aispendLogger.log_event(model=model, response_obj=result, start_time=start_time, end_time=end_time, print_verbose=print_verbose) - elif callback == "supabase": - print_verbose("reaches supabase for logging!") - model = args[0] if len(args) > 0 else kwargs["model"] - messages = args[1] if len(args) > 1 else kwargs["messages"] - result = { - "model": model, - "created": time.time(), - "error": traceback_exception, - "usage": { - "prompt_tokens": prompt_token_calculator(model, messages=messages), - "completion_tokens": 0 - } - } - print(f"litellm._thread_context: {litellm._thread_context}") - supabaseClient.log_event(model=model, messages=messages, end_user=litellm._thread_context.user, response_obj=result, start_time=start_time, end_time=end_time, print_verbose=print_verbose) + additional_details["Event_Name"] = additional_details.pop( + "failed_event_name", "litellm.failed_query" + ) + print_verbose(f"self.failure_callback: {litellm.failure_callback}") - except: - print_verbose(f"Error Occurred while logging failure: {traceback.format_exc()}") - pass - - if failure_handler and callable(failure_handler): - call_details = { - "exception": exception, - "additional_details": additional_details - } - failure_handler(call_details) - pass + # print_verbose(f"additional_details: {additional_details}") + for callback in litellm.failure_callback: + try: + if callback == "slack": + slack_msg = "" + if len(kwargs) > 0: + for key in kwargs: + slack_msg += f"{key}: {kwargs[key]}\n" + if len(args) > 0: + for i, arg in enumerate(args): + slack_msg += f"LiteLLM_Args_{str(i)}: {arg}" + for detail in additional_details: + slack_msg += f"{detail}: {additional_details[detail]}\n" + slack_msg += f"Traceback: {traceback_exception}" + slack_app.client.chat_postMessage( + channel=alerts_channel, text=slack_msg + ) + elif callback == "sentry": + capture_exception(exception) + elif callback == "posthog": + print_verbose( + f"inside posthog, additional_details: {len(additional_details.keys())}" + ) + ph_obj = {} + if len(kwargs) > 0: + ph_obj = kwargs + if len(args) > 0: + for i, arg in enumerate(args): + ph_obj["litellm_args_" + str(i)] = arg + for detail in additional_details: + ph_obj[detail] = additional_details[detail] + event_name = additional_details["Event_Name"] + print_verbose(f"ph_obj: {ph_obj}") + print_verbose(f"PostHog Event Name: {event_name}") + if "user_id" in additional_details: + posthog.capture( + additional_details["user_id"], event_name, ph_obj + ) + else: # PostHog calls require a unique id to identify a user - https://posthog.com/docs/libraries/python + unique_id = str(uuid.uuid4()) + posthog.capture(unique_id, event_name) + print_verbose(f"successfully logged to PostHog!") + elif callback == "berrispend": + print_verbose("reaches berrispend for 
logging!") + model = args[0] if len(args) > 0 else kwargs["model"] + messages = args[1] if len(args) > 1 else kwargs["messages"] + result = { + "model": model, + "created": time.time(), + "error": traceback_exception, + "usage": { + "prompt_tokens": prompt_token_calculator( + model, messages=messages + ), + "completion_tokens": 0, + }, + } + berrispendLogger.log_event( + model=model, + messages=messages, + response_obj=result, + start_time=start_time, + end_time=end_time, + print_verbose=print_verbose, + ) + elif callback == "aispend": + print_verbose("reaches aispend for logging!") + model = args[0] if len(args) > 0 else kwargs["model"] + messages = args[1] if len(args) > 1 else kwargs["messages"] + result = { + "model": model, + "created": time.time(), + "usage": { + "prompt_tokens": prompt_token_calculator( + model, messages=messages + ), + "completion_tokens": 0, + }, + } + aispendLogger.log_event( + model=model, + response_obj=result, + start_time=start_time, + end_time=end_time, + print_verbose=print_verbose, + ) + elif callback == "supabase": + print_verbose("reaches supabase for logging!") + model = args[0] if len(args) > 0 else kwargs["model"] + messages = args[1] if len(args) > 1 else kwargs["messages"] + result = { + "model": model, + "created": time.time(), + "error": traceback_exception, + "usage": { + "prompt_tokens": prompt_token_calculator( + model, messages=messages + ), + "completion_tokens": 0, + }, + } + print(f"litellm._thread_context: {litellm._thread_context}") + supabaseClient.log_event( + model=model, + messages=messages, + end_user=litellm._thread_context.user, + response_obj=result, + start_time=start_time, + end_time=end_time, + print_verbose=print_verbose, + ) + + except: + print_verbose( + f"Error Occurred while logging failure: {traceback.format_exc()}" + ) + pass + + if failure_handler and callable(failure_handler): + call_details = { + "exception": exception, + "additional_details": additional_details, + } + failure_handler(call_details) + pass except Exception as e: - ## LOGGING - logging(logger_fn=user_logger_fn, exception=e) - pass - -def handle_success(args, kwargs, result, start_time, end_time): - global heliconeLogger, aispendLogger - try: - success_handler = additional_details.pop("success_handler", None) - failure_handler = additional_details.pop("failure_handler", None) - additional_details["Event_Name"] = additional_details.pop("successful_event_name", "litellm.succes_query") - for callback in litellm.success_callback: - try: - if callback == "posthog": - ph_obj = {} - for detail in additional_details: - ph_obj[detail] = additional_details[detail] - event_name = additional_details["Event_Name"] - if "user_id" in additional_details: - posthog.capture(additional_details["user_id"], event_name, ph_obj) - else: # PostHog calls require a unique id to identify a user - https://posthog.com/docs/libraries/python - unique_id = str(uuid.uuid4()) - posthog.capture(unique_id, event_name, ph_obj) - pass - elif callback == "slack": - slack_msg = "" - for detail in additional_details: - slack_msg += f"{detail}: {additional_details[detail]}\n" - slack_app.client.chat_postMessage(channel=alerts_channel, text=slack_msg) - elif callback == "helicone": - print_verbose("reaches helicone for logging!") - model = args[0] if len(args) > 0 else kwargs["model"] - messages = args[1] if len(args) > 1 else kwargs["messages"] - heliconeLogger.log_success(model=model, messages=messages, response_obj=result, start_time=start_time, end_time=end_time, print_verbose=print_verbose) 
- elif callback == "aispend": - print_verbose("reaches aispend for logging!") - model = args[0] if len(args) > 0 else kwargs["model"] - aispendLogger.log_event(model=model, response_obj=result, start_time=start_time, end_time=end_time, print_verbose=print_verbose) - elif callback == "berrispend": - print_verbose("reaches berrispend for logging!") - model = args[0] if len(args) > 0 else kwargs["model"] - messages = args[1] if len(args) > 1 else kwargs["messages"] - berrispendLogger.log_event(model=model, messages=messages, response_obj=result, start_time=start_time, end_time=end_time, print_verbose=print_verbose) - elif callback == "supabase": - print_verbose("reaches supabase for logging!") - model = args[0] if len(args) > 0 else kwargs["model"] - messages = args[1] if len(args) > 1 else kwargs["messages"] - print(f"litellm._thread_context: {litellm._thread_context}") - supabaseClient.log_event(model=model, messages=messages, end_user=litellm._thread_context.user, response_obj=result, start_time=start_time, end_time=end_time, print_verbose=print_verbose) - except Exception as e: ## LOGGING logging(logger_fn=user_logger_fn, exception=e) - print_verbose(f"[Non-Blocking] Success Callback Error - {traceback.format_exc()}") pass - if success_handler and callable(success_handler): - success_handler(args, kwargs) - pass - except Exception as e: - ## LOGGING - logging(logger_fn=user_logger_fn, exception=e) - print_verbose(f"[Non-Blocking] Success Callback Error - {traceback.format_exc()}") - pass + +def handle_success(args, kwargs, result, start_time, end_time): + global heliconeLogger, aispendLogger + try: + success_handler = additional_details.pop("success_handler", None) + failure_handler = additional_details.pop("failure_handler", None) + additional_details["Event_Name"] = additional_details.pop( + "successful_event_name", "litellm.succes_query" + ) + for callback in litellm.success_callback: + try: + if callback == "posthog": + ph_obj = {} + for detail in additional_details: + ph_obj[detail] = additional_details[detail] + event_name = additional_details["Event_Name"] + if "user_id" in additional_details: + posthog.capture( + additional_details["user_id"], event_name, ph_obj + ) + else: # PostHog calls require a unique id to identify a user - https://posthog.com/docs/libraries/python + unique_id = str(uuid.uuid4()) + posthog.capture(unique_id, event_name, ph_obj) + pass + elif callback == "slack": + slack_msg = "" + for detail in additional_details: + slack_msg += f"{detail}: {additional_details[detail]}\n" + slack_app.client.chat_postMessage( + channel=alerts_channel, text=slack_msg + ) + elif callback == "helicone": + print_verbose("reaches helicone for logging!") + model = args[0] if len(args) > 0 else kwargs["model"] + messages = args[1] if len(args) > 1 else kwargs["messages"] + heliconeLogger.log_success( + model=model, + messages=messages, + response_obj=result, + start_time=start_time, + end_time=end_time, + print_verbose=print_verbose, + ) + elif callback == "aispend": + print_verbose("reaches aispend for logging!") + model = args[0] if len(args) > 0 else kwargs["model"] + aispendLogger.log_event( + model=model, + response_obj=result, + start_time=start_time, + end_time=end_time, + print_verbose=print_verbose, + ) + elif callback == "berrispend": + print_verbose("reaches berrispend for logging!") + model = args[0] if len(args) > 0 else kwargs["model"] + messages = args[1] if len(args) > 1 else kwargs["messages"] + berrispendLogger.log_event( + model=model, + messages=messages, + 
response_obj=result, + start_time=start_time, + end_time=end_time, + print_verbose=print_verbose, + ) + elif callback == "supabase": + print_verbose("reaches supabase for logging!") + model = args[0] if len(args) > 0 else kwargs["model"] + messages = args[1] if len(args) > 1 else kwargs["messages"] + print(f"litellm._thread_context: {litellm._thread_context}") + supabaseClient.log_event( + model=model, + messages=messages, + end_user=litellm._thread_context.user, + response_obj=result, + start_time=start_time, + end_time=end_time, + print_verbose=print_verbose, + ) + except Exception as e: + ## LOGGING + logging(logger_fn=user_logger_fn, exception=e) + print_verbose( + f"[Non-Blocking] Success Callback Error - {traceback.format_exc()}" + ) + pass + + if success_handler and callable(success_handler): + success_handler(args, kwargs) + pass + except Exception as e: + ## LOGGING + logging(logger_fn=user_logger_fn, exception=e) + print_verbose( + f"[Non-Blocking] Success Callback Error - {traceback.format_exc()}" + ) + pass + def prompt_token_calculator(model, messages): - # use tiktoken or anthropic's tokenizer depending on the model - text = " ".join(message["content"] for message in messages) - num_tokens = 0 - if "claude" in model: - install_and_import('anthropic') - from anthropic import Anthropic, HUMAN_PROMPT, AI_PROMPT - anthropic = Anthropic() - num_tokens = anthropic.count_tokens(text) - else: - num_tokens = len(encoding.encode(text)) - return num_tokens + # use tiktoken or anthropic's tokenizer depending on the model + text = " ".join(message["content"] for message in messages) + num_tokens = 0 + if "claude" in model: + install_and_import("anthropic") + from anthropic import Anthropic, HUMAN_PROMPT, AI_PROMPT -# integration helper function + anthropic = Anthropic() + num_tokens = anthropic.count_tokens(text) + else: + num_tokens = len(encoding.encode(text)) + return num_tokens + + +# integration helper function def modify_integration(integration_name, integration_params): - global supabaseClient - if integration_name == "supabase": - if "table_name" in integration_params: - Supabase.supabase_table_name = integration_params["table_name"] + global supabaseClient + if integration_name == "supabase": + if "table_name" in integration_params: + Supabase.supabase_table_name = integration_params["table_name"] + def exception_type(model, original_exception, custom_llm_provider): global user_logger_fn exception_mapping_worked = False try: - if isinstance(original_exception, OriginalError): - # Handle the OpenAIError - exception_mapping_worked = True - if custom_llm_provider == "azure": - original_exception.llm_provider = "azure" - else: - original_exception.llm_provider = "openai" - raise original_exception - elif model: - error_str = str(original_exception) - if isinstance(original_exception, BaseException): - exception_type = type(original_exception).__name__ + if isinstance(original_exception, OriginalError): + # Handle the OpenAIError + exception_mapping_worked = True + if custom_llm_provider == "azure": + original_exception.llm_provider = "azure" + else: + original_exception.llm_provider = "openai" + raise original_exception + elif model: + error_str = str(original_exception) + if isinstance(original_exception, BaseException): + exception_type = type(original_exception).__name__ + else: + exception_type = "" + logging( + model=model, + additional_args={ + "error_str": error_str, + "exception_type": exception_type, + "original_exception": original_exception, + }, + 
logger_fn=user_logger_fn, + ) + if "claude" in model: # one of the anthropics + if hasattr(original_exception, "status_code"): + print_verbose(f"status_code: {original_exception.status_code}") + if original_exception.status_code == 401: + exception_mapping_worked = True + raise AuthenticationError( + message=f"AnthropicException - {original_exception.message}", + llm_provider="anthropic", + ) + elif original_exception.status_code == 400: + exception_mapping_worked = True + raise InvalidRequestError( + message=f"AnthropicException - {original_exception.message}", + model=model, + llm_provider="anthropic", + ) + elif original_exception.status_code == 429: + exception_mapping_worked = True + raise RateLimitError( + message=f"AnthropicException - {original_exception.message}", + llm_provider="anthropic", + ) + elif ( + "Could not resolve authentication method. Expected either api_key or auth_token to be set." + in error_str + ): + exception_mapping_worked = True + raise AuthenticationError( + message=f"AnthropicException - {original_exception.message}", + llm_provider="anthropic", + ) + elif "replicate" in model: + if "Incorrect authentication token" in error_str: + exception_mapping_worked = True + raise AuthenticationError( + message=f"ReplicateException - {error_str}", + llm_provider="replicate", + ) + elif exception_type == "ModelError": + exception_mapping_worked = True + raise InvalidRequestError( + message=f"ReplicateException - {error_str}", + model=model, + llm_provider="replicate", + ) + elif "Request was throttled" in error_str: + exception_mapping_worked = True + raise RateLimitError( + message=f"ReplicateException - {error_str}", + llm_provider="replicate", + ) + elif ( + exception_type == "ReplicateError" + ): ## ReplicateError implies an error on Replicate server side, not user side + raise ServiceUnavailableError( + message=f"ReplicateException - {error_str}", + llm_provider="replicate", + ) + elif model == "command-nightly": # Cohere + if ( + "invalid api token" in error_str + or "No API key provided." 
in error_str + ): + exception_mapping_worked = True + raise AuthenticationError( + message=f"CohereException - {original_exception.message}", + llm_provider="cohere", + ) + elif "too many tokens" in error_str: + exception_mapping_worked = True + raise InvalidRequestError( + message=f"CohereException - {original_exception.message}", + model=model, + llm_provider="cohere", + ) + elif ( + "CohereConnectionError" in exception_type + ): # cohere seems to fire these errors when we load test it (1k+ messages / min) + exception_mapping_worked = True + raise RateLimitError( + message=f"CohereException - {original_exception.message}", + llm_provider="cohere", + ) + elif custom_llm_provider == "huggingface": + if hasattr(original_exception, "status_code"): + if original_exception.status_code == 401: + exception_mapping_worked = True + raise AuthenticationError( + message=f"HuggingfaceException - {original_exception.message}", + llm_provider="huggingface", + ) + elif original_exception.status_code == 400: + exception_mapping_worked = True + raise InvalidRequestError( + message=f"HuggingfaceException - {original_exception.message}", + model=model, + llm_provider="huggingface", + ) + elif original_exception.status_code == 429: + exception_mapping_worked = True + raise RateLimitError( + message=f"HuggingfaceException - {original_exception.message}", + llm_provider="huggingface", + ) + raise original_exception # base case - return the original exception else: - exception_type = "" - logging(model=model, additional_args={"error_str": error_str, "exception_type": exception_type, "original_exception": original_exception}, logger_fn=user_logger_fn) - if "claude" in model: #one of the anthropics - if hasattr(original_exception, "status_code"): - print_verbose(f"status_code: {original_exception.status_code}") - if original_exception.status_code == 401: - exception_mapping_worked = True - raise AuthenticationError(message=f"AnthropicException - {original_exception.message}", llm_provider="anthropic") - elif original_exception.status_code == 400: - exception_mapping_worked = True - raise InvalidRequestError(message=f"AnthropicException - {original_exception.message}", model=model, llm_provider="anthropic") - elif original_exception.status_code == 429: - exception_mapping_worked = True - raise RateLimitError(message=f"AnthropicException - {original_exception.message}", llm_provider="anthropic") - elif "Could not resolve authentication method. Expected either api_key or auth_token to be set." 
in error_str: - exception_mapping_worked = True - raise AuthenticationError(message=f"AnthropicException - {original_exception.message}", llm_provider="anthropic") - elif "replicate" in model: - if "Incorrect authentication token" in error_str: - exception_mapping_worked = True - raise AuthenticationError(message=f"ReplicateException - {error_str}", llm_provider="replicate") - elif exception_type == "ModelError": - exception_mapping_worked = True - raise InvalidRequestError(message=f"ReplicateException - {error_str}", model=model, llm_provider="replicate") - elif "Request was throttled" in error_str: - exception_mapping_worked = True - raise RateLimitError(message=f"ReplicateException - {error_str}", llm_provider="replicate") - elif exception_type == "ReplicateError": ## ReplicateError implies an error on Replicate server side, not user side - raise ServiceUnavailableError(message=f"ReplicateException - {error_str}", llm_provider="replicate") - elif model == "command-nightly": #Cohere - if "invalid api token" in error_str or "No API key provided." in error_str: - exception_mapping_worked = True - raise AuthenticationError(message=f"CohereException - {original_exception.message}", llm_provider="cohere") - elif "too many tokens" in error_str: - exception_mapping_worked = True - raise InvalidRequestError(message=f"CohereException - {original_exception.message}", model=model, llm_provider="cohere") - elif "CohereConnectionError" in exception_type: # cohere seems to fire these errors when we load test it (1k+ messages / min) - exception_mapping_worked = True - raise RateLimitError(message=f"CohereException - {original_exception.message}", llm_provider="cohere") - elif custom_llm_provider == "huggingface": - if hasattr(original_exception, "status_code"): - if original_exception.status_code == 401: - exception_mapping_worked = True - raise AuthenticationError(message=f"HuggingfaceException - {original_exception.message}", llm_provider="huggingface") - elif original_exception.status_code == 400: - exception_mapping_worked = True - raise InvalidRequestError(message=f"HuggingfaceException - {original_exception.message}", model=model, llm_provider="huggingface") - elif original_exception.status_code == 429: - exception_mapping_worked = True - raise RateLimitError(message=f"HuggingfaceException - {original_exception.message}", llm_provider="huggingface") - raise original_exception # base case - return the original exception - else: - raise original_exception + raise original_exception except Exception as e: - ## LOGGING - logging(logger_fn=user_logger_fn, additional_args={"exception_mapping_worked": exception_mapping_worked, "original_exception": original_exception}, exception=e) - if exception_mapping_worked: - raise e - else: # don't let an error with mapping interrupt the user from receiving an error from the llm api calls - raise original_exception + ## LOGGING + logging( + logger_fn=user_logger_fn, + additional_args={ + "exception_mapping_worked": exception_mapping_worked, + "original_exception": original_exception, + }, + exception=e, + ) + if exception_mapping_worked: + raise e + else: # don't let an error with mapping interrupt the user from receiving an error from the llm api calls + raise original_exception + def safe_crash_reporting(model=None, exception=None, custom_llm_provider=None): data = { - "model": model, - "exception": str(exception), - "custom_llm_provider": custom_llm_provider + "model": model, + "exception": str(exception), + "custom_llm_provider": custom_llm_provider, } 
threading.Thread(target=litellm_telemetry, args=(data,)).start() + def litellm_telemetry(data): # Load or generate the UUID - uuid_file = 'litellm_uuid.txt' + uuid_file = "litellm_uuid.txt" try: # Try to open the file and load the UUID - with open(uuid_file, 'r') as file: + with open(uuid_file, "r") as file: uuid_value = file.read() if uuid_value: uuid_value = uuid_value.strip() @@ -775,42 +1050,48 @@ def litellm_telemetry(data): # Generate a new UUID if the file doesn't exist or is empty new_uuid = uuid.uuid4() uuid_value = str(new_uuid) - with open(uuid_file, 'w') as file: + with open(uuid_file, "w") as file: file.write(uuid_value) - except: - # [Non-Blocking Error] - return - - try: - # Prepare the data to send to litellm logging api - payload = { - 'uuid': uuid_value, - 'data': data, - 'version': pkg_resources.get_distribution("litellm").version - } - # Make the POST request to litellm logging api - response = requests.post('https://litellm.berri.ai/logging', headers={"Content-Type": "application/json"}, json=payload) - response.raise_for_status() # Raise an exception for HTTP errors except: # [Non-Blocking Error] return + try: + # Prepare the data to send to litellm logging api + payload = { + "uuid": uuid_value, + "data": data, + "version": pkg_resources.get_distribution("litellm").version, + } + # Make the POST request to litellm logging api + response = requests.post( + "https://litellm.berri.ai/logging", + headers={"Content-Type": "application/json"}, + json=payload, + ) + response.raise_for_status() # Raise an exception for HTTP errors + except: + # [Non-Blocking Error] + return + + ######### Secret Manager ############################ # checks if user has passed in a secret manager client # if passed in then checks the secret there def get_secret(secret_name): - if litellm.secret_manager_client != None: - # TODO: check which secret manager is being used - # currently only supports Infisical - secret = litellm.secret_manager_client.get_secret(secret_name).secret_value - if secret != None: - return secret # if secret found in secret manager return it - else: - raise ValueError(f"Secret '{secret_name}' not found in secret manager") - elif litellm.api_key != None: # if users use litellm default key - return litellm.api_key - else: - return os.environ.get(secret_name) + if litellm.secret_manager_client != None: + # TODO: check which secret manager is being used + # currently only supports Infisical + secret = litellm.secret_manager_client.get_secret(secret_name).secret_value + if secret != None: + return secret # if secret found in secret manager return it + else: + raise ValueError(f"Secret '{secret_name}' not found in secret manager") + elif litellm.api_key != None: # if users use litellm default key + return litellm.api_key + else: + return os.environ.get(secret_name) + ######## Streaming Class ############################ # wraps the completion stream to return the correct format for the model @@ -820,73 +1101,73 @@ class CustomStreamWrapper: self.model = model self.custom_llm_provider = custom_llm_provider if model in litellm.cohere_models: - # cohere does not return an iterator, so we need to wrap it in one - self.completion_stream = iter(completion_stream) + # cohere does not return an iterator, so we need to wrap it in one + self.completion_stream = iter(completion_stream) elif model == "together_ai": self.completion_stream = iter(completion_stream) - else: - self.completion_stream = completion_stream + else: + self.completion_stream = completion_stream def __iter__(self): 
return self def handle_anthropic_chunk(self, chunk): - str_line = chunk.decode('utf-8') # Convert bytes to string - if str_line.startswith('data:'): - data_json = json.loads(str_line[5:]) - return data_json.get("completion", "") - return "" + str_line = chunk.decode("utf-8") # Convert bytes to string + if str_line.startswith("data:"): + data_json = json.loads(str_line[5:]) + return data_json.get("completion", "") + return "" - def handle_together_ai_chunk(self, chunk): - chunk = chunk.decode("utf-8") - text_index = chunk.find('"text":"') # this checks if text: exists - text_start = text_index + len('"text":"') - text_end = chunk.find('"}', text_start) - if text_index != -1 and text_end != -1: - extracted_text = chunk[text_start:text_end] - return extracted_text - else: - return "" - - def handle_huggingface_chunk(self, chunk): - chunk = chunk.decode("utf-8") - if chunk.startswith('data:'): - data_json = json.loads(chunk[5:]) - if "token" in data_json and "text" in data_json["token"]: - return data_json["token"]["text"] - else: - return "" - return "" + def handle_together_ai_chunk(self, chunk): + chunk = chunk.decode("utf-8") + text_index = chunk.find('"text":"') # this checks if text: exists + text_start = text_index + len('"text":"') + text_end = chunk.find('"}', text_start) + if text_index != -1 and text_end != -1: + extracted_text = chunk[text_start:text_end] + return extracted_text + else: + return "" + + def handle_huggingface_chunk(self, chunk): + chunk = chunk.decode("utf-8") + if chunk.startswith("data:"): + data_json = json.loads(chunk[5:]) + if "token" in data_json and "text" in data_json["token"]: + return data_json["token"]["text"] + else: + return "" + return "" def __next__(self): - completion_obj ={ "role": "assistant", "content": ""} + completion_obj = {"role": "assistant", "content": ""} if self.model in litellm.anthropic_models: - chunk = next(self.completion_stream) - completion_obj["content"] = self.handle_anthropic_chunk(chunk) + chunk = next(self.completion_stream) + completion_obj["content"] = self.handle_anthropic_chunk(chunk) elif self.model == "replicate": - chunk = next(self.completion_stream) - completion_obj["content"] = chunk + chunk = next(self.completion_stream) + completion_obj["content"] = chunk elif (self.model == "together_ai") or ("togethercomputer" in self.model): - chunk = next(self.completion_stream) - text_data = self.handle_together_ai_chunk(chunk) - if text_data == "": - return self.__next__() - completion_obj["content"] = text_data + chunk = next(self.completion_stream) + text_data = self.handle_together_ai_chunk(chunk) + if text_data == "": + return self.__next__() + completion_obj["content"] = text_data elif self.model in litellm.cohere_models: - chunk = next(self.completion_stream) - completion_obj["content"] = chunk.text + chunk = next(self.completion_stream) + completion_obj["content"] = chunk.text elif self.custom_llm_provider and self.custom_llm_provider == "huggingface": - chunk = next(self.completion_stream) - completion_obj["content"] = self.handle_huggingface_chunk(chunk) + chunk = next(self.completion_stream) + completion_obj["content"] = self.handle_huggingface_chunk(chunk) # return this for all models return {"choices": [{"delta": completion_obj}]} - ########## Reading Config File ############################ def read_config_args(config_path): try: import os + current_path = os.getcwd() with open(config_path, "r") as config_file: config = json.load(config_file) @@ -900,9 +1181,13 @@ def read_config_args(config_path): ########## 
ollama implementation ############################ import aiohttp -async def get_ollama_response_stream(api_base="http://localhost:11434", model="llama2", prompt="Why is the sky blue?"): + + +async def get_ollama_response_stream( + api_base="http://localhost:11434", model="llama2", prompt="Why is the sky blue?" +): session = aiohttp.ClientSession() - url = f'{api_base}/api/generate' + url = f"{api_base}/api/generate" data = { "model": model, "prompt": prompt, @@ -918,7 +1203,10 @@ async def get_ollama_response_stream(api_base="http://localhost:11434", model="l if chunk.strip() != "": j = json.loads(chunk) if "response" in j: - completion_obj ={ "role": "assistant", "content": ""} + completion_obj = { + "role": "assistant", + "content": "", + } completion_obj["content"] = j["response"] yield {"choices": [{"delta": completion_obj}]} # self.responses.append(j["response"]) @@ -930,16 +1218,16 @@ async def get_ollama_response_stream(api_base="http://localhost:11434", model="l async def stream_to_string(generator): - response = "" - async for chunk in generator: - response += chunk["content"] - return response + response = "" + async for chunk in generator: + response += chunk["content"] + return response ########## Together AI streaming ############################# async def together_ai_completion_streaming(json_data, headers): session = aiohttp.ClientSession() - url = 'https://api.together.xyz/inference' + url = "https://api.together.xyz/inference" # headers = { # 'Authorization': f'Bearer {together_ai_token}', # 'Content-Type': 'application/json' @@ -962,15 +1250,14 @@ async def together_ai_completion_streaming(json_data, headers): if line: try: json_chunk = line.decode("utf-8") - json_string = json_chunk.split('data: ')[1] + json_string = json_chunk.split("data: ")[1] # Convert the JSON string to a dictionary data_dict = json.loads(json_string) - completion_response = data_dict['choices'][0]['text'] - completion_obj ={ "role": "assistant", "content": ""} + completion_response = data_dict["choices"][0]["text"] + completion_obj = {"role": "assistant", "content": ""} completion_obj["content"] = completion_response yield {"choices": [{"delta": completion_obj}]} except: pass finally: await session.close() -
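
For reference, the cost helpers touched in this patch multiply token counts by the per-token prices in `litellm.model_cost`, and fall back to an average across all known models when the model is missing from the table. Below is a minimal standalone sketch of that arithmetic, not part of the patch itself: the price table is a trimmed, illustrative excerpt of the real one, and `cost_per_token_sketch` is a hypothetical stand-in that mirrors the logic of `cost_per_token()`/`completion_cost()` above.

# Minimal sketch of the pricing arithmetic in cost_per_token()/completion_cost().
# The table below is an illustrative excerpt; the full table lives in litellm.model_cost.
model_cost = {
    "gpt-3.5-turbo": {"input_cost_per_token": 0.0000015, "output_cost_per_token": 0.000002},
    "claude-2": {"input_cost_per_token": 0.00001102, "output_cost_per_token": 0.00003268},
}


def cost_per_token_sketch(model, prompt_tokens, completion_tokens):
    if model in model_cost:
        entry = model_cost[model]
    else:
        # Fallback mirroring the else-branch above: average the per-token
        # prices across every model in the table.
        entry = {
            "input_cost_per_token": sum(m["input_cost_per_token"] for m in model_cost.values()) / len(model_cost),
            "output_cost_per_token": sum(m["output_cost_per_token"] for m in model_cost.values()) / len(model_cost),
        }
    return (
        entry["input_cost_per_token"] * prompt_tokens,
        entry["output_cost_per_token"] * completion_tokens,
    )


if __name__ == "__main__":
    prompt_cost, completion_cost = cost_per_token_sketch("gpt-3.5-turbo", 1000, 200)
    print(f"total: ${prompt_cost + completion_cost:.6f}")  # total: $0.001900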