forked from phoenix/litellm-mirror

Commit: add linting
Commit 15b1da9dc8 (parent 8ef47524bf)
40 changed files with 3110 additions and 1709 deletions
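The reformatting throughout this commit matches the default output of an opinionated Python formatter; the commit message only says "add linting", so black is an assumption here. A minimal sketch of how such a formatter might have been run over the package before committing:

    # Hypothetical invocation -- the commit does not name the formatter/linter actually used.
    import subprocess

    subprocess.run(["black", "litellm/"], check=True)  # rewrite files in place with black's default style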
@@ -1,4 +1,5 @@
 import threading
 success_callback = []
 failure_callback = []
 set_verbose = False
@@ -19,33 +20,99 @@ caching = False
 hugging_api_token = None
 togetherai_api_key = None
 model_cost = {
-    "gpt-3.5-turbo": {"max_tokens": 4000, "input_cost_per_token": 0.0000015, "output_cost_per_token": 0.000002},
-    "gpt-35-turbo": {"max_tokens": 4000, "input_cost_per_token": 0.0000015, "output_cost_per_token": 0.000002}, # azure model name
-    "gpt-3.5-turbo-0613": {"max_tokens": 4000, "input_cost_per_token": 0.0000015, "output_cost_per_token": 0.000002},
-    "gpt-3.5-turbo-0301": {"max_tokens": 4000, "input_cost_per_token": 0.0000015, "output_cost_per_token": 0.000002},
-    "gpt-3.5-turbo-16k": {"max_tokens": 16000, "input_cost_per_token": 0.000003, "output_cost_per_token": 0.000004},
-    "gpt-35-turbo-16k": {"max_tokens": 16000, "input_cost_per_token": 0.000003, "output_cost_per_token": 0.000004}, # azure model name
-    "gpt-3.5-turbo-16k-0613": {"max_tokens": 16000, "input_cost_per_token": 0.000003, "output_cost_per_token": 0.000004},
-    "gpt-4": {"max_tokens": 8000, "input_cost_per_token": 0.000003, "output_cost_per_token": 0.00006},
-    "gpt-4-0613": {"max_tokens": 8000, "input_cost_per_token": 0.000003, "output_cost_per_token": 0.00006},
-    "gpt-4-32k": {"max_tokens": 8000, "input_cost_per_token": 0.00006, "output_cost_per_token": 0.00012},
-    "claude-instant-1": {"max_tokens": 100000, "input_cost_per_token": 0.00000163, "output_cost_per_token": 0.00000551},
-    "claude-2": {"max_tokens": 100000, "input_cost_per_token": 0.00001102, "output_cost_per_token": 0.00003268},
-    "text-bison-001": {"max_tokens": 8192, "input_cost_per_token": 0.000004, "output_cost_per_token": 0.000004},
-    "chat-bison-001": {"max_tokens": 4096, "input_cost_per_token": 0.000002, "output_cost_per_token": 0.000002},
-    "command-nightly": {"max_tokens": 4096, "input_cost_per_token": 0.000015, "output_cost_per_token": 0.000015},
+    "gpt-3.5-turbo": {
+        "max_tokens": 4000,
+        "input_cost_per_token": 0.0000015,
+        "output_cost_per_token": 0.000002,
+    },
+    "gpt-35-turbo": {
+        "max_tokens": 4000,
+        "input_cost_per_token": 0.0000015,
+        "output_cost_per_token": 0.000002,
+    },  # azure model name
+    "gpt-3.5-turbo-0613": {
+        "max_tokens": 4000,
+        "input_cost_per_token": 0.0000015,
+        "output_cost_per_token": 0.000002,
+    },
+    "gpt-3.5-turbo-0301": {
+        "max_tokens": 4000,
+        "input_cost_per_token": 0.0000015,
+        "output_cost_per_token": 0.000002,
+    },
+    "gpt-3.5-turbo-16k": {
+        "max_tokens": 16000,
+        "input_cost_per_token": 0.000003,
+        "output_cost_per_token": 0.000004,
+    },
+    "gpt-35-turbo-16k": {
+        "max_tokens": 16000,
+        "input_cost_per_token": 0.000003,
+        "output_cost_per_token": 0.000004,
+    },  # azure model name
+    "gpt-3.5-turbo-16k-0613": {
+        "max_tokens": 16000,
+        "input_cost_per_token": 0.000003,
+        "output_cost_per_token": 0.000004,
+    },
+    "gpt-4": {
+        "max_tokens": 8000,
+        "input_cost_per_token": 0.000003,
+        "output_cost_per_token": 0.00006,
+    },
+    "gpt-4-0613": {
+        "max_tokens": 8000,
+        "input_cost_per_token": 0.000003,
+        "output_cost_per_token": 0.00006,
+    },
+    "gpt-4-32k": {
+        "max_tokens": 8000,
+        "input_cost_per_token": 0.00006,
+        "output_cost_per_token": 0.00012,
+    },
+    "claude-instant-1": {
+        "max_tokens": 100000,
+        "input_cost_per_token": 0.00000163,
+        "output_cost_per_token": 0.00000551,
+    },
+    "claude-2": {
+        "max_tokens": 100000,
+        "input_cost_per_token": 0.00001102,
+        "output_cost_per_token": 0.00003268,
+    },
+    "text-bison-001": {
+        "max_tokens": 8192,
+        "input_cost_per_token": 0.000004,
+        "output_cost_per_token": 0.000004,
+    },
+    "chat-bison-001": {
+        "max_tokens": 4096,
+        "input_cost_per_token": 0.000002,
+        "output_cost_per_token": 0.000002,
+    },
+    "command-nightly": {
+        "max_tokens": 4096,
+        "input_cost_per_token": 0.000015,
+        "output_cost_per_token": 0.000015,
+    },
 }


 ####### THREAD-SPECIFIC DATA ###################
 class MyLocal(threading.local):
     def __init__(self):
         self.user = "Hello World"


 _thread_context = MyLocal()


 def identify(event_details):
     # Store user in thread local data
     if "user" in event_details:
         _thread_context.user = event_details["user"]


 ####### ADDITIONAL PARAMS ################### configurable params if you use proxy models like Helicone, map spend to org id, etc.
 api_base = None
 headers = None
@@ -66,50 +133,38 @@ open_ai_chat_completion_models = [
     "gpt-3.5-turbo-0613",
     "gpt-3.5-turbo-16k-0613",
 ]
-open_ai_text_completion_models = [
-    'text-davinci-003'
-]
+open_ai_text_completion_models = ["text-davinci-003"]

 cohere_models = [
-    'command-nightly',
+    "command-nightly",
     "command",
     "command-light",
     "command-medium-beta",
-    "command-xlarge-beta"
+    "command-xlarge-beta",
 ]

-anthropic_models = [
-    "claude-2",
-    "claude-instant-1",
-    "claude-instant-1.2"
-]
+anthropic_models = ["claude-2", "claude-instant-1", "claude-instant-1.2"]

 replicate_models = [
     "replicate/"
 ] # placeholder, to make sure we accept any replicate model in our model_list

 openrouter_models = [
-    'google/palm-2-codechat-bison',
-    'google/palm-2-chat-bison',
-    'openai/gpt-3.5-turbo',
-    'openai/gpt-3.5-turbo-16k',
-    'openai/gpt-4-32k',
-    'anthropic/claude-2',
-    'anthropic/claude-instant-v1',
-    'meta-llama/llama-2-13b-chat',
-    'meta-llama/llama-2-70b-chat'
+    "google/palm-2-codechat-bison",
+    "google/palm-2-chat-bison",
+    "openai/gpt-3.5-turbo",
+    "openai/gpt-3.5-turbo-16k",
+    "openai/gpt-4-32k",
+    "anthropic/claude-2",
+    "anthropic/claude-instant-v1",
+    "meta-llama/llama-2-13b-chat",
+    "meta-llama/llama-2-70b-chat",
 ]

-vertex_chat_models = [
-    "chat-bison",
-    "chat-bison@001"
-]
+vertex_chat_models = ["chat-bison", "chat-bison@001"]

-vertex_text_models = [
-    "text-bison",
-    "text-bison@001"
-]
+vertex_text_models = ["text-bison", "text-bison@001"]

 huggingface_models = [
     "meta-llama/Llama-2-7b-hf",
@@ -126,23 +181,54 @@ huggingface_models = [
     "meta-llama/Llama-2-70b-chat",
 ] # these have been tested on extensively. But by default all text2text-generation and text-generation models are supported by liteLLM. - https://docs.litellm.ai/docs/completion/supported

-ai21_models = [
-    "j2-ultra",
-    "j2-mid",
-    "j2-light"
-]
+ai21_models = ["j2-ultra", "j2-mid", "j2-light"]

-model_list = open_ai_chat_completion_models + open_ai_text_completion_models + cohere_models + anthropic_models + replicate_models + openrouter_models + huggingface_models + vertex_chat_models + vertex_text_models + ai21_models
+model_list = (
+    open_ai_chat_completion_models
+    + open_ai_text_completion_models
+    + cohere_models
+    + anthropic_models
+    + replicate_models
+    + openrouter_models
+    + huggingface_models
+    + vertex_chat_models
+    + vertex_text_models
+    + ai21_models
+)

-provider_list = ["openai", "cohere", "anthropic", "replicate", "huggingface", "together_ai", "openrouter", "vertex_ai", "ai21"]
+provider_list = [
+    "openai",
+    "cohere",
+    "anthropic",
+    "replicate",
+    "huggingface",
+    "together_ai",
+    "openrouter",
+    "vertex_ai",
+    "ai21",
+]

 ####### EMBEDDING MODELS ###################
-open_ai_embedding_models = [
-    'text-embedding-ada-002'
-]
+open_ai_embedding_models = ["text-embedding-ada-002"]

 from .timeout import timeout
 from .testing import *
-from .utils import client, logging, exception_type, get_optional_params, modify_integration, token_counter, cost_per_token, completion_cost, get_litellm_params
+from .utils import (
+    client,
+    logging,
+    exception_type,
+    get_optional_params,
+    modify_integration,
+    token_counter,
+    cost_per_token,
+    completion_cost,
+    get_litellm_params,
+)
 from .main import * # Import all the symbols from main.py
 from .integrations import *
-from openai.error import AuthenticationError, InvalidRequestError, RateLimitError, ServiceUnavailableError, OpenAIError
+from openai.error import (
+    AuthenticationError,
+    InvalidRequestError,
+    RateLimitError,
+    ServiceUnavailableError,
+    OpenAIError,
+)
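The cost map above feeds the token-pricing logic that appears in the price_calculator methods later in this diff. A minimal sketch of that calculation, using one entry from model_cost and a made-up usage payload (the numbers are illustrative only):

    # Illustrative only: the usage counts are invented; the cost values mirror model_cost above.
    model_cost = {
        "gpt-3.5-turbo": {
            "max_tokens": 4000,
            "input_cost_per_token": 0.0000015,
            "output_cost_per_token": 0.000002,
        },
    }
    usage = {"prompt_tokens": 1200, "completion_tokens": 300}
    model = "gpt-3.5-turbo"
    prompt_cost = model_cost[model]["input_cost_per_token"] * usage["prompt_tokens"]            # 0.0018
    completion_cost = model_cost[model]["output_cost_per_token"] * usage["completion_tokens"]   # 0.0006
    total_cost_usd = prompt_cost + completion_cost                                              # 0.0024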
@@ -1,12 +1,21 @@
 ## LiteLLM versions of the OpenAI Exception Types
-from openai.error import AuthenticationError, InvalidRequestError, RateLimitError, ServiceUnavailableError, OpenAIError
+from openai.error import (
+    AuthenticationError,
+    InvalidRequestError,
+    RateLimitError,
+    ServiceUnavailableError,
+    OpenAIError,
+)


 class AuthenticationError(AuthenticationError):
     def __init__(self, message, llm_provider):
         self.status_code = 401
         self.message = message
         self.llm_provider = llm_provider
-        super().__init__(self.message) # Call the base class constructor with the parameters it needs
+        super().__init__(
+            self.message
+        ) # Call the base class constructor with the parameters it needs


 class InvalidRequestError(InvalidRequestError):
@@ -15,7 +24,9 @@ class InvalidRequestError(InvalidRequestError):
         self.message = message
         self.model = model
         self.llm_provider = llm_provider
-        super().__init__(self.message, f"{self.model}") # Call the base class constructor with the parameters it needs
+        super().__init__(
+            self.message, f"{self.model}"
+        ) # Call the base class constructor with the parameters it needs


 class RateLimitError(RateLimitError):
@@ -23,21 +34,29 @@ class RateLimitError(RateLimitError):
         self.status_code = 429
         self.message = message
         self.llm_provider = llm_provider
-        super().__init__(self.message) # Call the base class constructor with the parameters it needs
+        super().__init__(
+            self.message
+        ) # Call the base class constructor with the parameters it needs


 class ServiceUnavailableError(ServiceUnavailableError):
     def __init__(self, message, llm_provider):
         self.status_code = 500
         self.message = message
         self.llm_provider = llm_provider
-        super().__init__(self.message) # Call the base class constructor with the parameters it needs
+        super().__init__(
+            self.message
+        ) # Call the base class constructor with the parameters it needs


 class OpenAIError(OpenAIError):
     def __init__(self, original_exception):
         self.status_code = original_exception.http_status
-        super().__init__(http_body=original_exception.http_body,
+        super().__init__(
+            http_body=original_exception.http_body,
             http_status=original_exception.http_status,
             json_body=original_exception.json_body,
             headers=original_exception.headers,
-            code=original_exception.code)
+            code=original_exception.code,
+        )
         self.llm_provider = "openai"
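These wrappers keep the OpenAI exception hierarchy but attach a status_code and the originating llm_provider. A short sketch of how they might be raised and inspected (the import path is an assumption based on this diff):

    # Sketch only; assumes the classes above live in litellm's exceptions module.
    from litellm.exceptions import AuthenticationError

    try:
        raise AuthenticationError(message="invalid api key", llm_provider="anthropic")
    except AuthenticationError as e:
        print(e.status_code, e.llm_provider, e.message)  # 401 anthropic invalid api key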
@@ -2,28 +2,90 @@
 # On success + failure, log events to aispend.io
 import dotenv, os
 import requests

 dotenv.load_dotenv() # Loading env variables using dotenv
 import traceback
 import datetime

 model_cost = {
     # same 15-model cost map as defined above (gpt-3.5-turbo through command-nightly),
     # with identical values, reformatted from one line per model to one key per line
 }


 class AISpendLogger:
     # Class variables or attributes
     def __init__(self):
@@ -37,8 +99,14 @@ class AISpendLogger:
         prompt_tokens_cost_usd_dollar = 0
         completion_tokens_cost_usd_dollar = 0
         if model in model_cost:
-            prompt_tokens_cost_usd_dollar = model_cost[model]["input_cost_per_token"] * response_obj["usage"]["prompt_tokens"]
-            completion_tokens_cost_usd_dollar = model_cost[model]["output_cost_per_token"] * response_obj["usage"]["completion_tokens"]
+            prompt_tokens_cost_usd_dollar = (
+                model_cost[model]["input_cost_per_token"]
+                * response_obj["usage"]["prompt_tokens"]
+            )
+            completion_tokens_cost_usd_dollar = (
+                model_cost[model]["output_cost_per_token"]
+                * response_obj["usage"]["completion_tokens"]
+            )
         elif "replicate" in model:
             # replicate models are charged based on time
             # llama 2 runs on an nvidia a100 which costs $0.0032 per second - https://replicate.com/replicate/llama-2-70b-chat
@@ -55,27 +123,41 @@ class AISpendLogger:
             output_cost_sum += model_cost[model]["output_cost_per_token"]
         avg_input_cost = input_cost_sum / len(model_cost.keys())
         avg_output_cost = output_cost_sum / len(model_cost.keys())
-        prompt_tokens_cost_usd_dollar = model_cost[model]["input_cost_per_token"] * response_obj["usage"]["prompt_tokens"]
-        completion_tokens_cost_usd_dollar = model_cost[model]["output_cost_per_token"] * response_obj["usage"]["completion_tokens"]
+        prompt_tokens_cost_usd_dollar = (
+            model_cost[model]["input_cost_per_token"]
+            * response_obj["usage"]["prompt_tokens"]
+        )
+        completion_tokens_cost_usd_dollar = (
+            model_cost[model]["output_cost_per_token"]
+            * response_obj["usage"]["completion_tokens"]
+        )
         return prompt_tokens_cost_usd_dollar, completion_tokens_cost_usd_dollar

     def log_event(self, model, response_obj, start_time, end_time, print_verbose):
         # Method definition
         try:
-            print_verbose(f"AISpend Logging - Enters logging function for model {model}")
+            print_verbose(
+                f"AISpend Logging - Enters logging function for model {model}"
+            )

             url = f"https://aispend.io/api/v1/accounts/{self.account_id}/data"
             headers = {
-                'Authorization': f'Bearer {self.api_key}',
-                'Content-Type': 'application/json'
+                "Authorization": f"Bearer {self.api_key}",
+                "Content-Type": "application/json",
             }

-            response_timestamp = datetime.datetime.fromtimestamp(int(response_obj["created"])).strftime('%Y-%m-%d')
+            response_timestamp = datetime.datetime.fromtimestamp(
+                int(response_obj["created"])
+            ).strftime("%Y-%m-%d")

-            prompt_tokens_cost_usd_dollar, completion_tokens_cost_usd_dollar = self.price_calculator(model, response_obj, start_time, end_time)
+            (
+                prompt_tokens_cost_usd_dollar,
+                completion_tokens_cost_usd_dollar,
+            ) = self.price_calculator(model, response_obj, start_time, end_time)
             prompt_tokens_cost_usd_cent = prompt_tokens_cost_usd_dollar * 100
             completion_tokens_cost_usd_cent = completion_tokens_cost_usd_dollar * 100
-            data = [{
+            data = [
+                {
                     "requests": 1,
                     "requests_context": 1,
                     "context_tokens": response_obj["usage"]["prompt_tokens"],
@@ -84,8 +166,9 @@ class AISpendLogger:
                     "recorded_date": response_timestamp,
                     "model_id": response_obj["model"],
                     "generated_tokens_cost_usd_cent": prompt_tokens_cost_usd_cent,
-                    "context_tokens_cost_usd_cent": completion_tokens_cost_usd_cent
-            }]
+                    "context_tokens_cost_usd_cent": completion_tokens_cost_usd_cent,
+                }
+            ]

             print_verbose(f"AISpend Logging - final data object: {data}")
         except:
@@ -2,28 +2,90 @@
 # On success + failure, log events to aispend.io
 import dotenv, os
 import requests

 dotenv.load_dotenv() # Loading env variables using dotenv
 import traceback
 import datetime

 model_cost = {
     # same 15-model cost map as defined above (gpt-3.5-turbo through command-nightly),
     # with identical values, reformatted from one line per model to one key per line
 }


 class BerriSpendLogger:
     # Class variables or attributes
     def __init__(self):
@@ -36,8 +98,14 @@ class BerriSpendLogger:
         prompt_tokens_cost_usd_dollar = 0
         completion_tokens_cost_usd_dollar = 0
         if model in model_cost:
-            prompt_tokens_cost_usd_dollar = model_cost[model]["input_cost_per_token"] * response_obj["usage"]["prompt_tokens"]
-            completion_tokens_cost_usd_dollar = model_cost[model]["output_cost_per_token"] * response_obj["usage"]["completion_tokens"]
+            prompt_tokens_cost_usd_dollar = (
+                model_cost[model]["input_cost_per_token"]
+                * response_obj["usage"]["prompt_tokens"]
+            )
+            completion_tokens_cost_usd_dollar = (
+                model_cost[model]["output_cost_per_token"]
+                * response_obj["usage"]["completion_tokens"]
+            )
         elif "replicate" in model:
             # replicate models are charged based on time
             # llama 2 runs on an nvidia a100 which costs $0.0032 per second - https://replicate.com/replicate/llama-2-70b-chat
@@ -54,42 +122,59 @@ class BerriSpendLogger:
             output_cost_sum += model_cost[model]["output_cost_per_token"]
         avg_input_cost = input_cost_sum / len(model_cost.keys())
         avg_output_cost = output_cost_sum / len(model_cost.keys())
-        prompt_tokens_cost_usd_dollar = model_cost[model]["input_cost_per_token"] * response_obj["usage"]["prompt_tokens"]
-        completion_tokens_cost_usd_dollar = model_cost[model]["output_cost_per_token"] * response_obj["usage"]["completion_tokens"]
+        prompt_tokens_cost_usd_dollar = (
+            model_cost[model]["input_cost_per_token"]
+            * response_obj["usage"]["prompt_tokens"]
+        )
+        completion_tokens_cost_usd_dollar = (
+            model_cost[model]["output_cost_per_token"]
+            * response_obj["usage"]["completion_tokens"]
+        )
         return prompt_tokens_cost_usd_dollar, completion_tokens_cost_usd_dollar

-    def log_event(self, model, messages, response_obj, start_time, end_time, print_verbose):
+    def log_event(
+        self, model, messages, response_obj, start_time, end_time, print_verbose
+    ):
         # Method definition
         try:
-            print_verbose(f"BerriSpend Logging - Enters logging function for model {model}")
+            print_verbose(
+                f"BerriSpend Logging - Enters logging function for model {model}"
+            )

             url = f"https://berrispend.berri.ai/spend"
-            headers = {
-                'Content-Type': 'application/json'
-            }
+            headers = {"Content-Type": "application/json"}

-            prompt_tokens_cost_usd_dollar, completion_tokens_cost_usd_dollar = self.price_calculator(model, response_obj, start_time, end_time)
-            total_cost = prompt_tokens_cost_usd_dollar + completion_tokens_cost_usd_dollar
+            (
+                prompt_tokens_cost_usd_dollar,
+                completion_tokens_cost_usd_dollar,
+            ) = self.price_calculator(model, response_obj, start_time, end_time)
+            total_cost = (
+                prompt_tokens_cost_usd_dollar + completion_tokens_cost_usd_dollar
+            )

             response_time = (end_time - start_time).total_seconds()
             if "response" in response_obj:
-                data = [{
+                data = [
+                    {
                         "response_time": response_time,
                         "model_id": response_obj["model"],
                         "total_cost": total_cost,
                         "messages": messages,
-                        "response": response_obj['choices'][0]['message']['content'],
-                        "account_id": self.account_id
-                }]
+                        "response": response_obj["choices"][0]["message"]["content"],
+                        "account_id": self.account_id,
+                    }
+                ]
             elif "error" in response_obj:
-                data = [{
+                data = [
+                    {
                         "response_time": response_time,
                         "model_id": response_obj["model"],
                         "total_cost": total_cost,
                         "messages": messages,
-                        "error": response_obj['error'],
-                        "account_id": self.account_id
-                }]
+                        "error": response_obj["error"],
+                        "account_id": self.account_id,
+                    }
+                ]

             print_verbose(f"BerriSpend Logging - final data object: {data}")
             response = requests.post(url, headers=headers, json=data)
@@ -2,18 +2,23 @@
 # On success, logs events to Helicone
 import dotenv, os
 import requests

 dotenv.load_dotenv() # Loading env variables using dotenv
 import traceback


 class HeliconeLogger:
     # Class variables or attributes
     helicone_model_list = ["gpt", "claude"]

     def __init__(self):
         # Instance variables
         self.provider_url = "https://api.openai.com/v1"
-        self.key = os.getenv('HELICONE_API_KEY')
+        self.key = os.getenv("HELICONE_API_KEY")

     def claude_mapping(self, model, messages, response_obj):
         from anthropic import HUMAN_PROMPT, AI_PROMPT

         prompt = f"{HUMAN_PROMPT}"
         for message in messages:
             if "role" in message:
@@ -26,46 +31,82 @@ class HeliconeLogger:
         prompt += f"{AI_PROMPT}"
         claude_provider_request = {"model": model, "prompt": prompt}

-        claude_response_obj = {"completion": response_obj['choices'][0]['message']['content'], "model": model, "stop_reason": "stop_sequence"}
+        claude_response_obj = {
+            "completion": response_obj["choices"][0]["message"]["content"],
+            "model": model,
+            "stop_reason": "stop_sequence",
+        }

         return claude_provider_request, claude_response_obj

-    def log_success(self, model, messages, response_obj, start_time, end_time, print_verbose):
+    def log_success(
+        self, model, messages, response_obj, start_time, end_time, print_verbose
+    ):
         # Method definition
         try:
-            print_verbose(f"Helicone Logging - Enters logging function for model {model}")
-            model = model if any(accepted_model in model for accepted_model in self.helicone_model_list) else "gpt-3.5-turbo"
+            print_verbose(
+                f"Helicone Logging - Enters logging function for model {model}"
+            )
+            model = (
+                model
+                if any(
+                    accepted_model in model
+                    for accepted_model in self.helicone_model_list
+                )
+                else "gpt-3.5-turbo"
+            )
             provider_request = {"model": model, "messages": messages}

             if "claude" in model:
-                provider_request, response_obj = self.claude_mapping(model=model, messages=messages, response_obj=response_obj)
+                provider_request, response_obj = self.claude_mapping(
+                    model=model, messages=messages, response_obj=response_obj
+                )

             providerResponse = {
                 "json": response_obj,
                 "headers": {"openai-version": "2020-10-01"},
-                "status": 200
+                "status": 200,
             }

             # Code to be executed
             url = "https://api.hconeai.com/oai/v1/log"
             headers = {
-                'Authorization': f'Bearer {self.key}',
-                'Content-Type': 'application/json'
+                "Authorization": f"Bearer {self.key}",
+                "Content-Type": "application/json",
             }
             start_time_seconds = int(start_time.timestamp())
-            start_time_milliseconds = int((start_time.timestamp() - start_time_seconds) * 1000)
+            start_time_milliseconds = int(
+                (start_time.timestamp() - start_time_seconds) * 1000
+            )
             end_time_seconds = int(end_time.timestamp())
-            end_time_milliseconds = int((end_time.timestamp() - end_time_seconds) * 1000)
+            end_time_milliseconds = int(
+                (end_time.timestamp() - end_time_seconds) * 1000
+            )
             data = {
-                "providerRequest": {"url": self.provider_url, "json": provider_request, "meta": {"Helicone-Auth": f"Bearer {self.key}"}},
+                "providerRequest": {
+                    "url": self.provider_url,
+                    "json": provider_request,
+                    "meta": {"Helicone-Auth": f"Bearer {self.key}"},
+                },
                 "providerResponse": providerResponse,
-                "timing": {"startTime": {"seconds": start_time_seconds, "milliseconds": start_time_milliseconds}, "endTime": {"seconds": end_time_seconds, "milliseconds": end_time_milliseconds}} # {"seconds": .., "milliseconds": ..}
+                "timing": {
+                    "startTime": {
+                        "seconds": start_time_seconds,
+                        "milliseconds": start_time_milliseconds,
+                    },
+                    "endTime": {
+                        "seconds": end_time_seconds,
+                        "milliseconds": end_time_milliseconds,
+                    },
+                }, # {"seconds": .., "milliseconds": ..}
             }
             response = requests.post(url, headers=headers, json=data)
             if response.status_code == 200:
                 print_verbose("Helicone Logging - Success!")
             else:
-                print_verbose(f"Helicone Logging - Error Request was not successful. Status Code: {response.status_code}")
+                print_verbose(
+                    f"Helicone Logging - Error Request was not successful. Status Code: {response.status_code}"
+                )
                 print_verbose(f"Helicone Logging - Error {response.text}")
         except:
             # traceback.print_exc()
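The Helicone payload above splits each timestamp into whole seconds plus a millisecond remainder. A small standalone sketch of that conversion:

    import datetime

    start_time = datetime.datetime.now()
    seconds = int(start_time.timestamp())
    milliseconds = int((start_time.timestamp() - seconds) * 1000)
    # the logger above sends {"seconds": seconds, "milliseconds": milliseconds}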
@@ -3,31 +3,94 @@

 import dotenv, os
 import requests

 dotenv.load_dotenv() # Loading env variables using dotenv
 import traceback
 import datetime, subprocess, sys

 model_cost = {
     # same 15-model cost map as defined above (gpt-3.5-turbo through command-nightly),
     # with identical values, reformatted from one line per model to one key per line
 }


 class Supabase:
     # Class variables or attributes
     supabase_table_name = "request_logs"

     def __init__(self):
         # Instance variables
         self.supabase_url = os.getenv("SUPABASE_URL")
@@ -35,9 +98,11 @@ class Supabase:
         try:
             import supabase
         except ImportError:
-            subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'supabase'])
+            subprocess.check_call([sys.executable, "-m", "pip", "install", "supabase"])
             import supabase
-        self.supabase_client = supabase.create_client(self.supabase_url, self.supabase_key)
+        self.supabase_client = supabase.create_client(
+            self.supabase_url, self.supabase_key
+        )

     def price_calculator(self, model, response_obj, start_time, end_time):
         # try and find if the model is in the model_cost map
@@ -45,8 +110,14 @@ class Supabase:
         prompt_tokens_cost_usd_dollar = 0
         completion_tokens_cost_usd_dollar = 0
         if model in model_cost:
-            prompt_tokens_cost_usd_dollar = model_cost[model]["input_cost_per_token"] * response_obj["usage"]["prompt_tokens"]
-            completion_tokens_cost_usd_dollar = model_cost[model]["output_cost_per_token"] * response_obj["usage"]["completion_tokens"]
+            prompt_tokens_cost_usd_dollar = (
+                model_cost[model]["input_cost_per_token"]
+                * response_obj["usage"]["prompt_tokens"]
+            )
+            completion_tokens_cost_usd_dollar = (
+                model_cost[model]["output_cost_per_token"]
+                * response_obj["usage"]["completion_tokens"]
+            )
         elif "replicate" in model:
             # replicate models are charged based on time
             # llama 2 runs on an nvidia a100 which costs $0.0032 per second - https://replicate.com/replicate/llama-2-70b-chat
@@ -63,16 +134,38 @@ class Supabase:
             output_cost_sum += model_cost[model]["output_cost_per_token"]
         avg_input_cost = input_cost_sum / len(model_cost.keys())
         avg_output_cost = output_cost_sum / len(model_cost.keys())
-        prompt_tokens_cost_usd_dollar = model_cost[model]["input_cost_per_token"] * response_obj["usage"]["prompt_tokens"]
-        completion_tokens_cost_usd_dollar = model_cost[model]["output_cost_per_token"] * response_obj["usage"]["completion_tokens"]
+        prompt_tokens_cost_usd_dollar = (
+            model_cost[model]["input_cost_per_token"]
+            * response_obj["usage"]["prompt_tokens"]
+        )
+        completion_tokens_cost_usd_dollar = (
+            model_cost[model]["output_cost_per_token"]
+            * response_obj["usage"]["completion_tokens"]
+        )
         return prompt_tokens_cost_usd_dollar, completion_tokens_cost_usd_dollar

-    def log_event(self, model, messages, end_user, response_obj, start_time, end_time, print_verbose):
+    def log_event(
+        self,
+        model,
+        messages,
+        end_user,
+        response_obj,
+        start_time,
+        end_time,
+        print_verbose,
+    ):
         try:
-            print_verbose(f"Supabase Logging - Enters logging function for model {model}, response_obj: {response_obj}")
+            print_verbose(
+                f"Supabase Logging - Enters logging function for model {model}, response_obj: {response_obj}"
+            )

-            prompt_tokens_cost_usd_dollar, completion_tokens_cost_usd_dollar = self.price_calculator(model, response_obj, start_time, end_time)
-            total_cost = prompt_tokens_cost_usd_dollar + completion_tokens_cost_usd_dollar
+            (
+                prompt_tokens_cost_usd_dollar,
+                completion_tokens_cost_usd_dollar,
+            ) = self.price_calculator(model, response_obj, start_time, end_time)
+            total_cost = (
+                prompt_tokens_cost_usd_dollar + completion_tokens_cost_usd_dollar
+            )

             response_time = (end_time - start_time).total_seconds()
             if "choices" in response_obj:
@@ -81,22 +174,34 @@ class Supabase:
                     "model": response_obj["model"],
                     "total_cost": total_cost,
                     "messages": messages,
-                    "response": response_obj['choices'][0]['message']['content'],
-                    "end_user": end_user
+                    "response": response_obj["choices"][0]["message"]["content"],
+                    "end_user": end_user,
                 }
-                print_verbose(f"Supabase Logging - final data object: {supabase_data_obj}")
-                data, count = self.supabase_client.table(self.supabase_table_name).insert(supabase_data_obj).execute()
+                print_verbose(
+                    f"Supabase Logging - final data object: {supabase_data_obj}"
+                )
+                data, count = (
+                    self.supabase_client.table(self.supabase_table_name)
+                    .insert(supabase_data_obj)
+                    .execute()
+                )
             elif "error" in response_obj:
                 supabase_data_obj = {
                     "response_time": response_time,
                     "model": response_obj["model"],
                     "total_cost": total_cost,
                     "messages": messages,
-                    "error": response_obj['error'],
-                    "end_user": end_user
+                    "error": response_obj["error"],
+                    "end_user": end_user,
                 }
-                print_verbose(f"Supabase Logging - final data object: {supabase_data_obj}")
-                data, count = self.supabase_client.table(self.supabase_table_name).insert(supabase_data_obj).execute()
+                print_verbose(
+                    f"Supabase Logging - final data object: {supabase_data_obj}"
+                )
+                data, count = (
+                    self.supabase_client.table(self.supabase_table_name)
+                    .insert(supabase_data_obj)
+                    .execute()
+                )

         except:
             # traceback.print_exc()
@@ -6,18 +6,22 @@ import time
from typing import Callable
from litellm.utils import ModelResponse


class AnthropicConstants(Enum):
    HUMAN_PROMPT = "\n\nHuman:"
    AI_PROMPT = "\n\nAssistant:"


class AnthropicError(Exception):
    def __init__(self, status_code, message):
        self.status_code = status_code
        self.message = message
        super().__init__(
            self.message
        )  # Call the base class constructor with the parameters it needs


class AnthropicLLM:
    def __init__(self, encoding, default_max_tokens_to_sample, api_key=None):
        self.encoding = encoding
        self.default_max_tokens_to_sample = default_max_tokens_to_sample
@@ -25,31 +29,50 @@ class AnthropicLLM:
        self.api_key = api_key
        self.validate_environment(api_key=api_key)

    def validate_environment(
        self, api_key
    ):  # set up the environment required to run the model
        # set the api key
        if self.api_key == None:
            raise ValueError(
                "Missing Anthropic API Key - A call is being made to anthropic but no key is set either in the environment variables or via params"
            )
        self.api_key = api_key
        self.headers = {
            "accept": "application/json",
            "anthropic-version": "2023-06-01",
            "content-type": "application/json",
            "x-api-key": self.api_key,
        }

    def completion(
        self,
        model: str,
        messages: list,
        model_response: ModelResponse,
        print_verbose: Callable,
        optional_params=None,
        litellm_params=None,
        logger_fn=None,
    ):  # logic for parsing in - calling - parsing out model completion calls
        model = model
        prompt = f"{AnthropicConstants.HUMAN_PROMPT.value}"
        for message in messages:
            if "role" in message:
                if message["role"] == "user":
                    prompt += (
                        f"{AnthropicConstants.HUMAN_PROMPT.value}{message['content']}"
                    )
                else:
                    prompt += (
                        f"{AnthropicConstants.AI_PROMPT.value}{message['content']}"
                    )
            else:
                prompt += f"{AnthropicConstants.HUMAN_PROMPT.value}{message['content']}"
        prompt += f"{AnthropicConstants.AI_PROMPT.value}"
        if "max_tokens" in optional_params and optional_params["max_tokens"] != float(
            "inf"
        ):
            max_tokens = optional_params["max_tokens"]
        else:
            max_tokens = self.default_max_tokens_to_sample
@@ -57,37 +80,64 @@ class AnthropicLLM:
            "model": model,
            "prompt": prompt,
            "max_tokens_to_sample": max_tokens,
            **optional_params,
        }

        ## LOGGING
        logging(
            model=model,
            input=prompt,
            additional_args={
                "litellm_params": litellm_params,
                "optional_params": optional_params,
            },
            logger_fn=logger_fn,
        )
        ## COMPLETION CALL
        response = requests.post(
            self.completion_url, headers=self.headers, data=json.dumps(data)
        )
        if "stream" in optional_params and optional_params["stream"] == True:
            return response.iter_lines()
        else:
            ## LOGGING
            logging(
                model=model,
                input=prompt,
                additional_args={
                    "litellm_params": litellm_params,
                    "optional_params": optional_params,
                    "original_response": response.text,
                },
                logger_fn=logger_fn,
            )
            print_verbose(f"raw model_response: {response.text}")
            ## RESPONSE OBJECT
            completion_response = response.json()
            if "error" in completion_response:
                raise AnthropicError(
                    message=completion_response["error"],
                    status_code=response.status_code,
                )
            else:
                model_response["choices"][0]["message"][
                    "content"
                ] = completion_response["completion"]

            ## CALCULATING USAGE
            prompt_tokens = len(
                self.encoding.encode(prompt)
            )  ##[TODO] use the anthropic tokenizer here
            completion_tokens = len(
                self.encoding.encode(model_response["choices"][0]["message"]["content"])
            )  ##[TODO] use the anthropic tokenizer here

            model_response["created"] = time.time()
            model_response["model"] = model
            model_response["usage"] = {
                "prompt_tokens": prompt_tokens,
                "completion_tokens": completion_tokens,
                "total_tokens": prompt_tokens + completion_tokens,
            }
            return model_response
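Note on the reformatted handler above: AnthropicLLM.completion flattens the OpenAI-style messages list into a single prompt string by alternating the Human/Assistant markers and leaving an open Assistant turn at the end. A minimal standalone sketch of that transformation (illustration only; the helper name build_anthropic_prompt is not part of the diff):

HUMAN_PROMPT = "\n\nHuman:"
AI_PROMPT = "\n\nAssistant:"

def build_anthropic_prompt(messages):
    # mirror AnthropicLLM.completion: user (or role-less) messages get the
    # Human marker, everything else gets the Assistant marker
    prompt = HUMAN_PROMPT
    for message in messages:
        if "role" in message and message["role"] != "user":
            prompt += f"{AI_PROMPT}{message['content']}"
        else:
            prompt += f"{HUMAN_PROMPT}{message['content']}"
    return prompt + AI_PROMPT  # the model completes the final Assistant turn

print(build_anthropic_prompt([{"role": "user", "content": "Hello"}]))
# -> "\n\nHuman:\n\nHuman:Hello\n\nAssistant:" (the doubled marker mirrors the handler)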
@@ -1,6 +1,7 @@
## This is a template base class to be used for adding new LLM providers via API calls


class BaseLLM:
    def validate_environment():  # set up the environment required to run the model
        pass
@@ -7,18 +7,24 @@ import time
from typing import Callable
from litellm.utils import ModelResponse


class HuggingfaceError(Exception):
    def __init__(self, status_code, message):
        self.status_code = status_code
        self.message = message
        super().__init__(
            self.message
        )  # Call the base class constructor with the parameters it needs


class HuggingfaceRestAPILLM:
    def __init__(self, encoding, api_key=None) -> None:
        self.encoding = encoding
        self.validate_environment(api_key=api_key)

    def validate_environment(
        self, api_key
    ):  # set up the environment required to run the model
        self.headers = {
            "content-type": "application/json",
        }
@@ -27,7 +33,17 @@ class HuggingfaceRestAPILLM():
        if self.api_key != None:
            self.headers["Authorization"] = f"Bearer {self.api_key}"

    def completion(
        self,
        model: str,
        messages: list,
        custom_api_base: str,
        model_response: ModelResponse,
        print_verbose: Callable,
        optional_params=None,
        litellm_params=None,
        logger_fn=None,
    ):  # logic for parsing in - calling - parsing out model completion calls
        if custom_api_base:
            completion_url = custom_api_base
        elif "HF_API_BASE" in os.environ:
@@ -35,7 +51,9 @@ class HuggingfaceRestAPILLM():
        else:
            completion_url = f"https://api-inference.huggingface.co/models/{model}"
        prompt = ""
        if (
            "meta-llama" in model and "chat" in model
        ):  # use the required special tokens for meta-llama - https://huggingface.co/blog/llama2#how-to-prompt-llama-2
            prompt = "<s>"
            for message in messages:
                if message["role"] == "system":
@@ -57,14 +75,33 @@ class HuggingfaceRestAPILLM():
            # "parameters": optional_params
        }
        ## LOGGING
        logging(
            model=model,
            input=prompt,
            additional_args={
                "litellm_params": litellm_params,
                "optional_params": optional_params,
            },
            logger_fn=logger_fn,
        )
        ## COMPLETION CALL
        response = requests.post(
            completion_url, headers=self.headers, data=json.dumps(data)
        )
        if "stream" in optional_params and optional_params["stream"] == True:
            return response.iter_lines()
        else:
            ## LOGGING
            logging(
                model=model,
                input=prompt,
                additional_args={
                    "litellm_params": litellm_params,
                    "optional_params": optional_params,
                    "original_response": response.text,
                },
                logger_fn=logger_fn,
            )
            print_verbose(f"raw model_response: {response.text}")
            ## RESPONSE OBJECT
            completion_response = response.json()
@@ -72,21 +109,29 @@ class HuggingfaceRestAPILLM():
            if isinstance(completion_response, dict) and "error" in completion_response:
                print_verbose(f"completion error: {completion_response['error']}")
                print_verbose(f"response.status_code: {response.status_code}")
                raise HuggingfaceError(
                    message=completion_response["error"],
                    status_code=response.status_code,
                )
            else:
                model_response["choices"][0]["message"][
                    "content"
                ] = completion_response[0]["generated_text"]

            ## CALCULATING USAGE
            prompt_tokens = len(
                self.encoding.encode(prompt)
            )  ##[TODO] use the llama2 tokenizer here
            completion_tokens = len(
                self.encoding.encode(model_response["choices"][0]["message"]["content"])
            )  ##[TODO] use the llama2 tokenizer here

            model_response["created"] = time.time()
            model_response["model"] = model
            model_response["usage"] = {
                "prompt_tokens": prompt_tokens,
                "completion_tokens": completion_tokens,
                "total_tokens": prompt_tokens + completion_tokens,
            }
            return model_response
        pass
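For orientation, a rough sketch of how this Hugging Face handler gets driven (illustration only, not part of the commit; the model id is a placeholder and an HF token is assumed in the environment):

import os
import tiktoken
from litellm.utils import ModelResponse
from litellm.llms.huggingface_restapi import HuggingfaceRestAPILLM

encoding = tiktoken.get_encoding("cl100k_base")
hf_client = HuggingfaceRestAPILLM(encoding=encoding, api_key=os.environ.get("HF_TOKEN"))
response = hf_client.completion(
    model="meta-llama/Llama-2-7b-chat-hf",  # placeholder model id
    messages=[{"role": "user", "content": "Hello"}],
    custom_api_base=None,  # fall back to the public inference API
    model_response=ModelResponse(),
    print_verbose=print,
    optional_params={},
    litellm_params={},
    logger_fn=None,
)
print(response["choices"][0]["message"]["content"])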
562 litellm/main.py
@@ -4,17 +4,43 @@ from functools import partial
import dotenv, traceback, random, asyncio, time
from copy import deepcopy
import litellm
from litellm import (
    client,
    logging,
    exception_type,
    timeout,
    get_optional_params,
    get_litellm_params,
)
from litellm.utils import (
    get_secret,
    install_and_import,
    CustomStreamWrapper,
    read_config_args,
)
from .llms.anthropic import AnthropicLLM
from .llms.huggingface_restapi import HuggingfaceRestAPILLM
import tiktoken
from concurrent.futures import ThreadPoolExecutor

encoding = tiktoken.get_encoding("cl100k_base")
from litellm.utils import (
    get_secret,
    install_and_import,
    CustomStreamWrapper,
    ModelResponse,
    read_config_args,
)
from litellm.utils import (
    get_ollama_response_stream,
    stream_to_string,
    together_ai_completion_streaming,
)

####### ENVIRONMENT VARIABLES ###################
dotenv.load_dotenv()  # Loading env variables using dotenv


####### COMPLETION ENDPOINTS ################
#############################################
async def acompletion(*args, **kwargs):
@@ -26,20 +52,43 @@ async def acompletion(*args, **kwargs):
    # Call the synchronous function using run_in_executor
    return await loop.run_in_executor(None, func)


@client
# @retry(wait=wait_random_exponential(min=1, max=60), stop=stop_after_attempt(2), reraise=True, retry_error_callback=lambda retry_state: setattr(retry_state.outcome, 'retry_variable', litellm.retry))  # retry call, turn this off by setting `litellm.retry = False`
@timeout(
    600
)  ## set timeouts, in case calls hang (e.g. Azure) - default is 60s, override with `force_timeout`
def completion(
    model,
    messages,  # required params
    # Optional OpenAI params: see https://platform.openai.com/docs/api-reference/chat/create
    functions=[],
    function_call="",  # optional params
    temperature=1,
    top_p=1,
    n=1,
    stream=False,
    stop=None,
    max_tokens=float("inf"),
    presence_penalty=0,
    frequency_penalty=0,
    logit_bias={},
    user="",
    deployment_id=None,
    # Optional liteLLM function params
    *,
    return_async=False,
    api_key=None,
    force_timeout=600,
    logger_fn=None,
    verbose=False,
    azure=False,
    custom_llm_provider=None,
    custom_api_base=None,
    # model specific optional params
    # used by text-bison only
    top_k=40,
    request_timeout=0,  # unused var for old version of OpenAI API
) -> ModelResponse:
    try:
        model_response = ModelResponse()
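Given the widened signature above, a brief usage sketch (illustrative; assumes the matching provider key, e.g. OPENAI_API_KEY, is exported):

import asyncio
from litellm import completion, acompletion

messages = [{"role": "user", "content": "Hey, how's it going?"}]

# synchronous call - provider routing is inferred from the model name
response = completion(model="gpt-3.5-turbo", messages=messages)
print(response["choices"][0]["message"]["content"])

# acompletion wraps the same call in run_in_executor for async callers
async_response = asyncio.run(acompletion(model="gpt-3.5-turbo", messages=messages))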
@@ -48,27 +97,58 @@ def completion(
        args = locals()
        # check if user passed in any of the OpenAI optional params
        optional_params = get_optional_params(
            functions=functions,
            function_call=function_call,
            temperature=temperature,
            top_p=top_p,
            n=n,
            stream=stream,
            stop=stop,
            max_tokens=max_tokens,
            presence_penalty=presence_penalty,
            frequency_penalty=frequency_penalty,
            logit_bias=logit_bias,
            user=user,
            deployment_id=deployment_id,
            # params to identify the model
            model=model,
            custom_llm_provider=custom_llm_provider,
            top_k=top_k,
        )
        # For logging - save the values of the litellm-specific params passed in
        litellm_params = get_litellm_params(
            return_async=return_async,
            api_key=api_key,
            force_timeout=force_timeout,
            logger_fn=logger_fn,
            verbose=verbose,
            custom_llm_provider=custom_llm_provider,
            custom_api_base=custom_api_base,
        )

        if custom_llm_provider == "azure":
            # azure configs
            openai.api_type = "azure"
            openai.api_base = (
                litellm.api_base
                if litellm.api_base is not None
                else get_secret("AZURE_API_BASE")
            )
            openai.api_version = (
                litellm.api_version
                if litellm.api_version is not None
                else get_secret("AZURE_API_VERSION")
            )
            # set key
            openai.api_key = api_key or litellm.azure_key or get_secret("AZURE_API_KEY")
            ## LOGGING
            logging(
                model=model,
                input=messages,
                additional_args=optional_params,
                custom_llm_provider=custom_llm_provider,
                logger_fn=logger_fn,
            )
            ## COMPLETION CALL
            if litellm.headers:
                response = openai.ChatCompletion.create(
@@ -79,47 +159,70 @@ def completion(
                )
            else:
                response = openai.ChatCompletion.create(
                    model=model, messages=messages, **optional_params
                )
        elif (
            model in litellm.open_ai_chat_completion_models
            or custom_llm_provider == "custom_openai"
        ):  # allow user to make an openai call with a custom base
            openai.api_type = "openai"
            # note: if a user sets a custom base - we should ensure this works
            api_base = (
                custom_api_base if custom_api_base is not None else litellm.api_base
            )  # allow for the setting of dynamic and stateful api-bases
            openai.api_base = (
                api_base if api_base is not None else "https://api.openai.com/v1"
            )
            openai.api_version = None
            if litellm.organization:
                openai.organization = litellm.organization
            # set API KEY
            openai.api_key = (
                api_key or litellm.openai_key or get_secret("OPENAI_API_KEY")
            )

            ## LOGGING
            logging(
                model=model,
                input=messages,
                additional_args=args,
                custom_llm_provider=custom_llm_provider,
                logger_fn=logger_fn,
            )
            ## COMPLETION CALL
            if litellm.headers:
                response = openai.ChatCompletion.create(
                    model=model,
                    messages=messages,
                    headers=litellm.headers,
                    **optional_params,
                )
            else:
                response = openai.ChatCompletion.create(
                    model=model, messages=messages, **optional_params
                )
        elif model in litellm.open_ai_text_completion_models:
            openai.api_type = "openai"
            openai.api_base = (
                litellm.api_base
                if litellm.api_base is not None
                else "https://api.openai.com/v1"
            )
            openai.api_version = None
            openai.api_key = (
                api_key or litellm.openai_key or get_secret("OPENAI_API_KEY")
            )
            if litellm.organization:
                openai.organization = litellm.organization
            prompt = " ".join([message["content"] for message in messages])
            ## LOGGING
            logging(
                model=model,
                input=prompt,
                additional_args=optional_params,
                custom_llm_provider=custom_llm_provider,
                logger_fn=logger_fn,
            )
            ## COMPLETION CALL
            if litellm.headers:
                response = openai.Completion.create(
@@ -128,13 +231,19 @@ def completion(
                    headers=litellm.headers,
                )
            else:
                response = openai.Completion.create(model=model, prompt=prompt)
            completion_response = response["choices"][0]["text"]
            ## LOGGING
            logging(
                model=model,
                input=prompt,
                custom_llm_provider=custom_llm_provider,
                additional_args={
                    "max_tokens": max_tokens,
                    "original_response": completion_response,
                },
                logger_fn=logger_fn,
            )
            ## RESPONSE OBJECT
            model_response["choices"][0]["message"]["content"] = completion_response
            model_response["created"] = response["created"]
@@ -145,11 +254,17 @@ def completion(
            # import replicate/if it fails then pip install replicate
            install_and_import("replicate")
            import replicate

            # Setting the relevant API KEY for replicate, replicate defaults to using os.environ.get("REPLICATE_API_TOKEN")
            replicate_key = os.environ.get("REPLICATE_API_TOKEN")
            if replicate_key == None:
                # user did not set REPLICATE_API_TOKEN in .env
                replicate_key = (
                    get_secret("REPLICATE_API_KEY")
                    or get_secret("REPLICATE_API_TOKEN")
                    or api_key
                    or litellm.replicate_key
                )
            # set replicate kye
            os.environ["REPLICATE_API_TOKEN"] = replicate_key
            prompt = " ".join([message["content"] for message in messages])
@@ -158,12 +273,16 @@ def completion(
            input["max_length"] = max_tokens  # for t5 models
            input["max_new_tokens"] = max_tokens  # for llama2 models
            ## LOGGING
            logging(
                model=model,
                input=input,
                custom_llm_provider=custom_llm_provider,
                additional_args={"max_tokens": max_tokens},
                logger_fn=logger_fn,
            )
            ## COMPLETION CALL
            output = replicate.run(model, input=input)
            if "stream" in optional_params and optional_params["stream"] == True:
                # don't try to access stream object,
                # let the stream handler know this is replicate
                response = CustomStreamWrapper(output, "replicate")
@@ -173,7 +292,16 @@ def completion(
                response += item
            completion_response = response
            ## LOGGING
            logging(
                model=model,
                input=prompt,
                custom_llm_provider=custom_llm_provider,
                additional_args={
                    "max_tokens": max_tokens,
                    "original_response": completion_response,
                },
                logger_fn=logger_fn,
            )
            prompt_tokens = len(encoding.encode(prompt))
            completion_tokens = len(encoding.encode(completion_response))
            ## RESPONSE OBJECT
|
||||||
model_response["usage"] = {
|
model_response["usage"] = {
|
||||||
"prompt_tokens": prompt_tokens,
|
"prompt_tokens": prompt_tokens,
|
||||||
"completion_tokens": completion_tokens,
|
"completion_tokens": completion_tokens,
|
||||||
"total_tokens": prompt_tokens + completion_tokens
|
"total_tokens": prompt_tokens + completion_tokens,
|
||||||
}
|
}
|
||||||
response = model_response
|
response = model_response
|
||||||
elif model in litellm.anthropic_models:
|
elif model in litellm.anthropic_models:
|
||||||
anthropic_key = api_key or litellm.anthropic_key or os.environ.get("ANTHROPIC_API_KEY")
|
anthropic_key = (
|
||||||
anthropic_client = AnthropicLLM(encoding=encoding, default_max_tokens_to_sample=litellm.max_tokens, api_key=anthropic_key)
|
api_key or litellm.anthropic_key or os.environ.get("ANTHROPIC_API_KEY")
|
||||||
model_response = anthropic_client.completion(model=model, messages=messages, model_response=model_response, print_verbose=print_verbose, optional_params=optional_params, litellm_params=litellm_params, logger_fn=logger_fn)
|
)
|
||||||
if 'stream' in optional_params and optional_params['stream'] == True:
|
anthropic_client = AnthropicLLM(
|
||||||
|
encoding=encoding,
|
||||||
|
default_max_tokens_to_sample=litellm.max_tokens,
|
||||||
|
api_key=anthropic_key,
|
||||||
|
)
|
||||||
|
model_response = anthropic_client.completion(
|
||||||
|
model=model,
|
||||||
|
messages=messages,
|
||||||
|
model_response=model_response,
|
||||||
|
print_verbose=print_verbose,
|
||||||
|
optional_params=optional_params,
|
||||||
|
litellm_params=litellm_params,
|
||||||
|
logger_fn=logger_fn,
|
||||||
|
)
|
||||||
|
if "stream" in optional_params and optional_params["stream"] == True:
|
||||||
# don't try to access stream object,
|
# don't try to access stream object,
|
||||||
response = CustomStreamWrapper(model_response, model)
|
response = CustomStreamWrapper(model_response, model)
|
||||||
return response
|
return response
|
||||||
|
@@ -198,7 +340,11 @@ def completion(
        elif model in litellm.openrouter_models or custom_llm_provider == "openrouter":
            openai.api_type = "openai"
            # not sure if this will work after someone first uses another API
            openai.api_base = (
                litellm.api_base
                if litellm.api_base is not None
                else "https://openrouter.ai/api/v1"
            )
            openai.api_version = None
            if litellm.organization:
                openai.organization = litellm.organization
@@ -207,16 +353,24 @@ def completion(
            elif litellm.openrouter_key:
                openai.api_key = litellm.openrouter_key
            else:
                openai.api_key = get_secret("OPENROUTER_API_KEY") or get_secret(
                    "OR_API_KEY"
                )
            ## LOGGING
            logging(
                model=model,
                input=messages,
                additional_args=optional_params,
                custom_llm_provider=custom_llm_provider,
                logger_fn=logger_fn,
            )
            ## COMPLETION CALL
            if litellm.headers:
                response = openai.ChatCompletion.create(
                    model=model,
                    messages=messages,
                    headers=litellm.headers,
                    **optional_params,
                )
            else:
                openrouter_site_url = get_secret("OR_SITE_URL")
@@ -230,36 +384,51 @@ def completion(
                response = openai.ChatCompletion.create(
                    model=model,
                    messages=messages,
                    headers={
                        "HTTP-Referer": openrouter_site_url,  # To identify your site
                        "X-Title": openrouter_app_name,  # To identify your app
                    },
                    **optional_params,
                )
        elif model in litellm.cohere_models:
            # import cohere/if it fails then pip install cohere
            install_and_import("cohere")
            import cohere

            cohere_key = (
                api_key
                or litellm.cohere_key
                or get_secret("COHERE_API_KEY")
                or get_secret("CO_API_KEY")
            )
            co = cohere.Client(cohere_key)
            prompt = " ".join([message["content"] for message in messages])
            ## LOGGING
            logging(
                model=model,
                input=prompt,
                custom_llm_provider=custom_llm_provider,
                logger_fn=logger_fn,
            )
            ## COMPLETION CALL
            response = co.generate(model=model, prompt=prompt, **optional_params)
            if "stream" in optional_params and optional_params["stream"] == True:
                # don't try to access stream object,
                response = CustomStreamWrapper(response, model)
                return response

            completion_response = response[0].text
            ## LOGGING
            logging(
                model=model,
                input=prompt,
                custom_llm_provider=custom_llm_provider,
                additional_args={
                    "max_tokens": max_tokens,
                    "original_response": completion_response,
                },
                logger_fn=logger_fn,
            )
            prompt_tokens = len(encoding.encode(prompt))
            completion_tokens = len(encoding.encode(completion_response))
            ## RESPONSE OBJECT
@@ -269,52 +438,100 @@ def completion(
            model_response["usage"] = {
                "prompt_tokens": prompt_tokens,
                "completion_tokens": completion_tokens,
                "total_tokens": prompt_tokens + completion_tokens,
            }
            response = model_response
        elif (
            model in litellm.huggingface_models or custom_llm_provider == "huggingface"
        ):
            custom_llm_provider = "huggingface"
            huggingface_key = (
                api_key
                or litellm.huggingface_key
                or os.environ.get("HF_TOKEN")
                or os.environ.get("HUGGINGFACE_API_KEY")
            )
            huggingface_client = HuggingfaceRestAPILLM(
                encoding=encoding, api_key=huggingface_key
            )
            model_response = huggingface_client.completion(
                model=model,
                messages=messages,
                custom_api_base=custom_api_base,
                model_response=model_response,
                print_verbose=print_verbose,
                optional_params=optional_params,
                litellm_params=litellm_params,
                logger_fn=logger_fn,
            )
            if "stream" in optional_params and optional_params["stream"] == True:
                # don't try to access stream object,
                response = CustomStreamWrapper(
                    model_response, model, custom_llm_provider="huggingface"
                )
                return response
            response = model_response
        elif custom_llm_provider == "together_ai" or ("togethercomputer" in model):
            import requests

            TOGETHER_AI_TOKEN = (
                get_secret("TOGETHER_AI_TOKEN")
                or get_secret("TOGETHERAI_API_KEY")
                or api_key
                or litellm.togetherai_api_key
            )
            headers = {"Authorization": f"Bearer {TOGETHER_AI_TOKEN}"}
            endpoint = "https://api.together.xyz/inference"
            prompt = " ".join(
                [message["content"] for message in messages]
            )  # TODO: Add chat support for together AI

            ## LOGGING
            logging(
                model=model,
                input=prompt,
                custom_llm_provider=custom_llm_provider,
                logger_fn=logger_fn,
            )
            if stream == True:
                return together_ai_completion_streaming(
                    {
                        "model": model,
                        "prompt": prompt,
                        "request_type": "language-model-inference",
                        **optional_params,
                    },
                    headers=headers,
                )
            res = requests.post(
                endpoint,
                json={
                    "model": model,
                    "prompt": prompt,
                    "request_type": "language-model-inference",
                    **optional_params,
                },
                headers=headers,
            )
            ## LOGGING
            logging(
                model=model,
                input=prompt,
                custom_llm_provider=custom_llm_provider,
                additional_args={
                    "max_tokens": max_tokens,
                    "original_response": res.text,
                },
                logger_fn=logger_fn,
            )

            # make this safe for reading, if output does not exist raise an error
            json_response = res.json()
            if "output" not in json_response:
                raise Exception(
                    f"liteLLM: Error Making TogetherAI request, JSON Response {json_response}"
                )
            completion_response = json_response["output"]["choices"][0]["text"]
            prompt_tokens = len(encoding.encode(prompt))
            completion_tokens = len(encoding.encode(completion_response))
            ## RESPONSE OBJECT
@@ -324,7 +541,7 @@ def completion(
            model_response["usage"] = {
                "prompt_tokens": prompt_tokens,
                "completion_tokens": completion_tokens,
                "total_tokens": prompt_tokens + completion_tokens,
            }
            response = model_response
        elif model in litellm.vertex_chat_models:
@@ -332,21 +549,41 @@ def completion(
            install_and_import("vertexai")
            import vertexai
            from vertexai.preview.language_models import ChatModel, InputOutputTextPair

            vertexai.init(
                project=litellm.vertex_project, location=litellm.vertex_location
            )
            # vertexai does not use an API key, it looks for credentials.json in the environment

            prompt = " ".join([message["content"] for message in messages])
            ## LOGGING
            logging(
                model=model,
                input=prompt,
                custom_llm_provider=custom_llm_provider,
                additional_args={
                    "litellm_params": litellm_params,
                    "optional_params": optional_params,
                },
                logger_fn=logger_fn,
            )

            chat_model = ChatModel.from_pretrained(model)

            chat = chat_model.start_chat()
            completion_response = chat.send_message(prompt, **optional_params)

            ## LOGGING
            logging(
                model=model,
                input=prompt,
                custom_llm_provider=custom_llm_provider,
                additional_args={
                    "max_tokens": max_tokens,
                    "original_response": completion_response,
                },
                logger_fn=logger_fn,
            )

            ## RESPONSE OBJECT
            model_response["choices"][0]["message"]["content"] = completion_response
@@ -358,17 +595,33 @@ def completion(
            import vertexai
            from vertexai.language_models import TextGenerationModel

            vertexai.init(
                project=litellm.vertex_project, location=litellm.vertex_location
            )
            # vertexai does not use an API key, it looks for credentials.json in the environment

            prompt = " ".join([message["content"] for message in messages])
            ## LOGGING
            logging(
                model=model,
                input=prompt,
                custom_llm_provider=custom_llm_provider,
                logger_fn=logger_fn,
            )
            vertex_model = TextGenerationModel.from_pretrained(model)
            completion_response = vertex_model.predict(prompt, **optional_params)

            ## LOGGING
            logging(
                model=model,
                input=prompt,
                custom_llm_provider=custom_llm_provider,
                additional_args={
                    "max_tokens": max_tokens,
                    "original_response": completion_response,
                },
                logger_fn=logger_fn,
            )

            ## RESPONSE OBJECT
            model_response["choices"][0]["message"]["content"] = completion_response
@@ -378,20 +631,35 @@ def completion(
        elif model in litellm.ai21_models:
            install_and_import("ai21")
            import ai21

            ai21.api_key = get_secret("AI21_API_KEY")

            prompt = " ".join([message["content"] for message in messages])
            ## LOGGING
            logging(
                model=model,
                input=prompt,
                custom_llm_provider=custom_llm_provider,
                logger_fn=logger_fn,
            )

            ai21_response = ai21.Completion.execute(
                model=model,
                prompt=prompt,
            )
            completion_response = ai21_response["completions"][0]["data"]["text"]

            ## LOGGING
            logging(
                model=model,
                input=prompt,
                custom_llm_provider=custom_llm_provider,
                additional_args={
                    "max_tokens": max_tokens,
                    "original_response": completion_response,
                },
                logger_fn=logger_fn,
            )

            ## RESPONSE OBJECT
            model_response["choices"][0]["message"]["content"] = completion_response
@@ -399,7 +667,9 @@ def completion(
            model_response["model"] = model
            response = model_response
        elif custom_llm_provider == "ollama":
            endpoint = (
                litellm.api_base if litellm.api_base is not None else custom_api_base
            )
            prompt = " ".join([message["content"] for message in messages])

            ## LOGGING
@@ -407,14 +677,23 @@ def completion(
            generator = get_ollama_response_stream(endpoint, model, prompt)
            # assume all responses are streamed
            return generator
        elif (
            custom_llm_provider == "baseten"
            or litellm.api_base == "https://app.baseten.co"
        ):
            import baseten

            base_ten_key = get_secret("BASETEN_API_KEY")
            baseten.login(base_ten_key)

            prompt = " ".join([message["content"] for message in messages])
            ## LOGGING
            logging(
                model=model,
                input=prompt,
                custom_llm_provider=custom_llm_provider,
                logger_fn=logger_fn,
            )

            base_ten__model = baseten.deployed_model_version_id(model)

@@ -424,7 +703,16 @@ def completion(
            if type(completion_response) == dict:
                completion_response = completion_response["generated_text"]

            logging(
                model=model,
                input=prompt,
                custom_llm_provider=custom_llm_provider,
                additional_args={
                    "max_tokens": max_tokens,
                    "original_response": completion_response,
                },
                logger_fn=logger_fn,
            )

            ## RESPONSE OBJECT
            model_response["choices"][0]["message"]["content"] = completion_response
@@ -432,16 +720,35 @@ def completion(
            model_response["model"] = model
            response = model_response

        elif custom_llm_provider == "petals" or (
            litellm.api_base and "chat.petals.dev" in litellm.api_base
        ):
            url = "https://chat.petals.dev/api/v1/generate"
            import requests

            prompt = " ".join([message["content"] for message in messages])

            ## LOGGING
            logging(
                model=model,
                input=prompt,
                custom_llm_provider=custom_llm_provider,
                logger_fn=logger_fn,
            )
            response = requests.post(
                url, data={"inputs": prompt, "max_new_tokens": 100, "model": model}
            )
            ## LOGGING
            logging(
                model=model,
                input=prompt,
                custom_llm_provider=custom_llm_provider,
                additional_args={
                    "max_tokens": max_tokens,
                    "original_response": response,
                },
                logger_fn=logger_fn,
            )
            completion_response = response.json()["outputs"]

            # RESPONSE OBJECT
@@ -451,15 +758,32 @@ def completion(
            response = model_response
        else:
            ## LOGGING
            logging(
                model=model,
                input=messages,
                custom_llm_provider=custom_llm_provider,
                logger_fn=logger_fn,
            )
            args = locals()
            raise ValueError(
                f"Unable to map your input to a model. Check your input - {args}"
            )
        return response
    except Exception as e:
        ## LOGGING
        logging(
            model=model,
            input=messages,
            custom_llm_provider=custom_llm_provider,
            additional_args={"max_tokens": max_tokens},
            logger_fn=logger_fn,
            exception=e,
        )
        ## Map to OpenAI Exception
        raise exception_type(
            model=model, custom_llm_provider=custom_llm_provider, original_exception=e
        )


def batch_completion(*args, **kwargs):
    batch_messages = args[1] if len(args) > 1 else kwargs.get("messages")
@ -480,9 +804,12 @@ def batch_completion(*args, **kwargs):
|
||||||
results = [future.result() for future in completions]
|
results = [future.result() for future in completions]
|
||||||
return results
|
return results
|
||||||
|
|
||||||
|
|
||||||
### EMBEDDING ENDPOINTS ####################
|
### EMBEDDING ENDPOINTS ####################
|
||||||
@client
|
@client
|
||||||
@timeout(60) ## set timeouts, in case calls hang (e.g. Azure) - default is 60s, override with `force_timeout`
|
@timeout(
|
||||||
|
60
|
||||||
|
) ## set timeouts, in case calls hang (e.g. Azure) - default is 60s, override with `force_timeout`
|
||||||
def embedding(model, input=[], azure=False, force_timeout=60, logger_fn=None):
|
def embedding(model, input=[], azure=False, force_timeout=60, logger_fn=None):
|
||||||
try:
|
try:
|
||||||
response = None
|
response = None
|
||||||
|
@ -519,6 +846,8 @@ def embedding(model, input=[], azure=False, force_timeout=60, logger_fn=None):
|
||||||
## Map to OpenAI Exception
|
## Map to OpenAI Exception
|
||||||
raise exception_type(model=model, original_exception=e)
|
raise exception_type(model=model, original_exception=e)
|
||||||
raise e
|
raise e
|
||||||
|
|
||||||
|
|
||||||
####### HELPER FUNCTIONS ################
|
####### HELPER FUNCTIONS ################
|
||||||
## Set verbose to true -> ```litellm.set_verbose = True```
|
## Set verbose to true -> ```litellm.set_verbose = True```
|
||||||
def print_verbose(print_statement):
|
def print_verbose(print_statement):
|
||||||
|
@ -527,10 +856,13 @@ def print_verbose(print_statement):
|
||||||
if random.random() <= 0.3:
|
if random.random() <= 0.3:
|
||||||
print("Get help - https://discord.com/invite/wuPM9dRgDw")
|
print("Get help - https://discord.com/invite/wuPM9dRgDw")
|
||||||
|
|
||||||
|
|
||||||
def config_completion(**kwargs):
|
def config_completion(**kwargs):
|
||||||
if litellm.config_path != None:
|
if litellm.config_path != None:
|
||||||
config_args = read_config_args(litellm.config_path)
|
config_args = read_config_args(litellm.config_path)
|
||||||
# overwrite any args passed in with config args
|
# overwrite any args passed in with config args
|
||||||
return completion(**kwargs, **config_args)
|
return completion(**kwargs, **config_args)
|
||||||
else:
|
else:
|
||||||
raise ValueError("No config path set, please set a config path using `litellm.config_path = 'path/to/config.json'`")
|
raise ValueError(
|
||||||
|
"No config path set, please set a config path using `litellm.config_path = 'path/to/config.json'`"
|
||||||
|
)
|
||||||
|
|
|
@@ -3,9 +3,12 @@ import time
 from concurrent.futures import ThreadPoolExecutor
 import traceback


 def testing_batch_completion(*args, **kwargs):
 try:
-batch_models = args[0] if len(args) > 0 else kwargs.pop("models") ## expected input format- ["gpt-3.5-turbo", {"model": "qvv0xeq", "custom_llm_provider"="baseten"}...]
+batch_models = (
+args[0] if len(args) > 0 else kwargs.pop("models")
+) ## expected input format- ["gpt-3.5-turbo", {"model": "qvv0xeq", "custom_llm_provider"="baseten"}...]
 batch_messages = args[1] if len(args) > 1 else kwargs.pop("messages")
 results = []
 completions = []
@@ -18,16 +21,32 @@ def testing_batch_completion(*args, **kwargs):
 if len(args) > 0:
 args_modified[0] = model["model"]
 else:
-kwargs_modified["model"] = model["model"] if isinstance(model, dict) and "model" in model else model # if model is a dictionary get it's value else assume it's a string
-kwargs_modified["custom_llm_provider"] = model["custom_llm_provider"] if isinstance(model, dict) and "custom_llm_provider" in model else None
-kwargs_modified["custom_api_base"] = model["custom_api_base"] if isinstance(model, dict) and "custom_api_base" in model else None
+kwargs_modified["model"] = (
+model["model"]
+if isinstance(model, dict) and "model" in model
+else model
+) # if model is a dictionary get it's value else assume it's a string
+kwargs_modified["custom_llm_provider"] = (
+model["custom_llm_provider"]
+if isinstance(model, dict) and "custom_llm_provider" in model
+else None
+)
+kwargs_modified["custom_api_base"] = (
+model["custom_api_base"]
+if isinstance(model, dict) and "custom_api_base" in model
+else None
+)
 for message_list in batch_messages:
 if len(args) > 1:
 args_modified[1] = message_list
-future = executor.submit(litellm.completion, *args_modified, **kwargs_modified)
+future = executor.submit(
+litellm.completion, *args_modified, **kwargs_modified
+)
 else:
 kwargs_modified["messages"] = message_list
-future = executor.submit(litellm.completion, *args_modified, **kwargs_modified)
+future = executor.submit(
+litellm.completion, *args_modified, **kwargs_modified
+)
 completions.append((future, message_list))

 # Retrieve the results and calculate elapsed time for each completion call
@@ -38,17 +57,27 @@ def testing_batch_completion(*args, **kwargs):
 result = future.result()
 end_time = time.time()
 elapsed_time = end_time - start_time
-result_dict = {"status": "succeeded", "response": future.result(), "prompt": message_list, "response_time": elapsed_time}
+result_dict = {
+"status": "succeeded",
+"response": future.result(),
+"prompt": message_list,
+"response_time": elapsed_time,
+}
 results.append(result_dict)
 except Exception as e:
 end_time = time.time()
 elapsed_time = end_time - start_time
-result_dict = {"status": "failed", "response": e, "response_time": elapsed_time}
+result_dict = {
+"status": "failed",
+"response": e,
+"response_time": elapsed_time,
+}
 results.append(result_dict)
 return results
 except:
 traceback.print_exc()


 def duration_test_model(original_function):
 def wrapper_function(*args, **kwargs):
 # Code to be executed before the original function
@@ -70,22 +99,39 @@ def duration_test_model(original_function):
 # Return the wrapper function
 return wrapper_function


 @duration_test_model
 def load_test_model(models: list, prompt: str = None, num_calls: int = None):
 test_calls = 100
 if num_calls:
 test_calls = num_calls
 input_prompt = prompt if prompt else "Hey, how's it going?"
-messages = [{"role": "user", "content": prompt}] if prompt else [{"role": "user", "content": input_prompt}]
-full_message_list = [messages for _ in range(test_calls)] # call it as many times as set by user to load test models
+messages = (
+[{"role": "user", "content": prompt}]
+if prompt
+else [{"role": "user", "content": input_prompt}]
+)
+full_message_list = [
+messages for _ in range(test_calls)
+] # call it as many times as set by user to load test models
 start_time = time.time()
 try:
 results = testing_batch_completion(models=models, messages=full_message_list)
 end_time = time.time()
 response_time = end_time - start_time
-return {"total_response_time": response_time, "calls_made": test_calls, "prompt": input_prompt, "results": results}
+return {
+"total_response_time": response_time,
+"calls_made": test_calls,
+"prompt": input_prompt,
+"results": results,
+}
 except Exception as e:
 traceback.print_exc()
 end_time = time.time()
 response_time = end_time - start_time
-return {"total_response_time": response_time, "calls_made": test_calls, "prompt": input_prompt, "exception": e}
+return {
+"total_response_time": response_time,
+"calls_made": test_calls,
+"prompt": input_prompt,
+"exception": e,
+}
@@ -3,15 +3,20 @@

 import sys, os
 import traceback
-sys.path.insert(0, os.path.abspath('../..')) # Adds the parent directory to the system path
+sys.path.insert(
+0, os.path.abspath("../..")
+) # Adds the parent directory to the system path
 import litellm
 from litellm import embedding, completion

 litellm.set_verbose = False


 def logger_fn(model_call_object: dict):
 print(f"model call details: {model_call_object}")


 user_message = "Hello, how are you?"
 messages = [{"content": user_message, "role": "user"}]

@@ -20,7 +25,12 @@ temp_key = os.environ.get("ANTHROPIC_API_KEY")
 os.environ["ANTHROPIC_API_KEY"] = "bad-key"
 # test on openai completion call
 try:
-response = completion(model="claude-instant-1", messages=messages, logger_fn=logger_fn, api_key=temp_key)
+response = completion(
+model="claude-instant-1",
+messages=messages,
+logger_fn=logger_fn,
+api_key=temp_key,
+)
 print(f"response: {response}")
 except:
 print(f"error occurred: {traceback.format_exc()}")
@@ -33,7 +43,9 @@ litellm.anthropic_key = os.environ.get("ANTHROPIC_API_KEY")
 os.environ.pop("ANTHROPIC_API_KEY")
 # test on openai completion call
 try:
-response = completion(model="claude-instant-1", messages=messages, logger_fn=logger_fn)
+response = completion(
+model="claude-instant-1", messages=messages, logger_fn=logger_fn
+)
 print(f"response: {response}")
 except:
 print(f"error occurred: {traceback.format_exc()}")
@@ -5,9 +5,13 @@ import sys, os
 import pytest
 import traceback
 import asyncio
-sys.path.insert(0, os.path.abspath('../..')) # Adds the parent directory to the system path
+sys.path.insert(
+0, os.path.abspath("../..")
+) # Adds the parent directory to the system path
 from litellm import acompletion


 async def test_get_response():
 user_message = "Hello, how are you?"
 messages = [{"content": user_message, "role": "user"}]
@@ -17,5 +21,6 @@ async def test_get_response():
 pytest.fail(f"error occurred: {e}")
 return response


 response = asyncio.run(test_get_response())
 print(response)
@@ -5,12 +5,13 @@
 import sys, os
 import traceback
 from dotenv import load_dotenv

 load_dotenv()
 # Get the current directory of the script
 current_dir = os.path.dirname(os.path.abspath(__file__))

 # Get the parent directory by joining the current directory with '..'
-parent_dir = os.path.join(current_dir, '../..')
+parent_dir = os.path.join(current_dir, "../..")

 # Add the parent directory to the system path
 sys.path.append(parent_dir)

@@ -3,7 +3,10 @@

 import sys, os
 import traceback
-sys.path.insert(0, os.path.abspath('../..')) # Adds the parent directory to the system path
+sys.path.insert(
+0, os.path.abspath("../..")
+) # Adds the parent directory to the system path
 import litellm
 from litellm import batch_completion
@@ -1,9 +1,13 @@
 import sys, os
 import traceback
 from dotenv import load_dotenv

 load_dotenv()
 import os
-sys.path.insert(0, os.path.abspath('../..')) # Adds the parent directory to the system path
+sys.path.insert(
+0, os.path.abspath("../..")
+) # Adds the parent directory to the system path
 import pytest
 import litellm
 from litellm import embedding, completion
@@ -12,7 +16,6 @@ litellm.caching = True
 messages = [{"role": "user", "content": "who is ishaan Github? "}]


 # test if response cached
 def test_caching():
 try:
@@ -29,7 +32,3 @@ def test_caching():
 litellm.caching = False
 print(f"error occurred: {traceback.format_exc()}")
 pytest.fail(f"Error occurred: {e}")
@@ -5,7 +5,9 @@ import sys, os
 import traceback
 import pytest

-sys.path.insert(0, os.path.abspath('../..')) # Adds the parent directory to the system path
+sys.path.insert(
+0, os.path.abspath("../..")
+) # Adds the parent directory to the system path
 import litellm
 from litellm import embedding, completion

@@ -14,17 +16,22 @@ litellm.failure_callback = ["slack", "sentry", "posthog"]

 litellm.set_verbose = True


 def logger_fn(model_call_object: dict):
 # print(f"model call details: {model_call_object}")
 pass


 user_message = "Hello, how are you?"
 messages = [{"content": user_message, "role": "user"}]


 def test_completion_openai():
 try:
 print("running query")
-response = completion(model="gpt-3.5-turbo", messages=messages, logger_fn=logger_fn)
+response = completion(
+model="gpt-3.5-turbo", messages=messages, logger_fn=logger_fn
+)
 print(f"response: {response}")
 # Add any assertions here to check the response
 except Exception as e:
@@ -34,33 +41,46 @@ def test_completion_openai():

 def test_completion_claude():
 try:
-response = completion(model="claude-instant-1", messages=messages, logger_fn=logger_fn)
+response = completion(
+model="claude-instant-1", messages=messages, logger_fn=logger_fn
+)
 # Add any assertions here to check the response
 except Exception as e:
 pytest.fail(f"Error occurred: {e}")


 def test_completion_non_openai():
 try:
-response = completion(model="command-nightly", messages=messages, logger_fn=logger_fn)
+response = completion(
+model="command-nightly", messages=messages, logger_fn=logger_fn
+)
 # Add any assertions here to check the response
 except Exception as e:
 pytest.fail(f"Error occurred: {e}")


 def test_embedding_openai():
 try:
-response = embedding(model='text-embedding-ada-002', input=[user_message], logger_fn=logger_fn)
+response = embedding(
+model="text-embedding-ada-002", input=[user_message], logger_fn=logger_fn
+)
 # Add any assertions here to check the response
 print(f"response: {str(response)[:50]}")
 except Exception as e:
 pytest.fail(f"Error occurred: {e}")


 def test_bad_azure_embedding():
 try:
-response = embedding(model='chatgpt-test', input=[user_message], logger_fn=logger_fn)
+response = embedding(
+model="chatgpt-test", input=[user_message], logger_fn=logger_fn
+)
 # Add any assertions here to check the response
 print(f"response: {str(response)[:50]}")
 except Exception as e:
 pass


 # def test_good_azure_embedding():
 # try:
 # response = embedding(model='azure-embedding-model', input=[user_message], azure=True, logger_fn=logger_fn)
@@ -68,4 +88,3 @@ def test_bad_azure_embedding():
 # print(f"response: {str(response)[:50]}")
 # except Exception as e:
 # pytest.fail(f"Error occurred: {e}")
@@ -1,12 +1,17 @@
 import sys, os
 import traceback
 from dotenv import load_dotenv

 load_dotenv()
 import os
-sys.path.insert(0, os.path.abspath('../..')) # Adds the parent directory to the system path
+sys.path.insert(
+0, os.path.abspath("../..")
+) # Adds the parent directory to the system path
 import pytest
 import litellm
 from litellm import embedding, completion

 # from infisical import InfisicalClient

 # litellm.set_verbose = True
@@ -15,30 +20,39 @@ from litellm import embedding, completion
 user_message = "Hello, whats the weather in San Francisco??"
 messages = [{"content": user_message, "role": "user"}]


 def logger_fn(user_model_dict):
 print(f"user_model_dict: {user_model_dict}")


 def test_completion_claude():
 try:
-response = completion(model="claude-instant-1", messages=messages, logger_fn=logger_fn)
+response = completion(
+model="claude-instant-1", messages=messages, logger_fn=logger_fn
+)
 # Add any assertions here to check the response
 print(response)
 except Exception as e:
 pytest.fail(f"Error occurred: {e}")


 def test_completion_claude_stream():
 try:
 messages = [
 {"role": "system", "content": "You are a helpful assistant."},
-{"role": "user", "content": "how does a court case get to the Supreme Court?"}
+{
+"role": "user",
+"content": "how does a court case get to the Supreme Court?",
+},
 ]
 response = completion(model="claude-2", messages=messages, stream=True)
 # Add any assertions here to check the response
 for chunk in response:
-print(chunk['choices'][0]['delta']) # same as openai format
+print(chunk["choices"][0]["delta"]) # same as openai format
 except Exception as e:
 pytest.fail(f"Error occurred: {e}")


 # def test_completion_hf_api():
 # try:
 # user_message = "write some code to find the sum of two numbers"
@@ -62,10 +76,12 @@ def test_completion_claude_stream():

 def test_completion_cohere():
 try:
-response = completion(model="command-nightly", messages=messages, max_tokens=100)
+response = completion(
+model="command-nightly", messages=messages, max_tokens=100
+)
 # Add any assertions here to check the response
 print(response)
-response_str = response['choices'][0]['message']['content']
+response_str = response["choices"][0]["message"]["content"]
 print(f"str response{response_str}")
 response_str_2 = response.choices[0].message.content
 if type(response_str) != str:
@@ -75,24 +91,31 @@ def test_completion_cohere():
 except Exception as e:
 pytest.fail(f"Error occurred: {e}")


 def test_completion_cohere_stream():
 try:
 messages = [
 {"role": "system", "content": "You are a helpful assistant."},
-{"role": "user", "content": "how does a court case get to the Supreme Court?"}
+{
+"role": "user",
+"content": "how does a court case get to the Supreme Court?",
+},
 ]
-response = completion(model="command-nightly", messages=messages, stream=True, max_tokens=50)
+response = completion(
+model="command-nightly", messages=messages, stream=True, max_tokens=50
+)
 # Add any assertions here to check the response
 for chunk in response:
-print(chunk['choices'][0]['delta']) # same as openai format
+print(chunk["choices"][0]["delta"]) # same as openai format
 except Exception as e:
 pytest.fail(f"Error occurred: {e}")


 def test_completion_openai():
 try:
 response = completion(model="gpt-3.5-turbo", messages=messages)

-response_str = response['choices'][0]['message']['content']
+response_str = response["choices"][0]["message"]["content"]
 response_str_2 = response.choices[0].message.content
 assert response_str == response_str_2
 assert type(response_str) == str
@@ -100,6 +123,7 @@ def test_completion_openai():
 except Exception as e:
 pytest.fail(f"Error occurred: {e}")


 def test_completion_text_openai():
 try:
 response = completion(model="text-davinci-003", messages=messages)
@@ -108,17 +132,31 @@ def test_completion_text_openai():
 except Exception as e:
 pytest.fail(f"Error occurred: {e}")


 def test_completion_openai_with_optional_params():
 try:
-response = completion(model="gpt-3.5-turbo", messages=messages, temperature=0.5, top_p=0.1, user="ishaan_dev@berri.ai")
+response = completion(
+model="gpt-3.5-turbo",
+messages=messages,
+temperature=0.5,
+top_p=0.1,
+user="ishaan_dev@berri.ai",
+)
 # Add any assertions here to check the response
 print(response)
 except Exception as e:
 pytest.fail(f"Error occurred: {e}")


 def test_completion_openrouter():
 try:
-response = completion(model="google/palm-2-chat-bison", messages=messages, temperature=0.5, top_p=0.1, user="ishaan_dev@berri.ai")
+response = completion(
+model="google/palm-2-chat-bison",
+messages=messages,
+temperature=0.5,
+top_p=0.1,
+user="ishaan_dev@berri.ai",
+)
 # Add any assertions here to check the response
 print(response)
 except Exception as e:
@@ -127,12 +165,23 @@ def test_completion_openrouter():

 def test_completion_openai_with_more_optional_params():
 try:
-response = completion(model="gpt-3.5-turbo", messages=messages, temperature=0.5, top_p=0.1, n=2, max_tokens=150, presence_penalty=0.5, frequency_penalty=-0.5, logit_bias={123: 5}, user="ishaan_dev@berri.ai")
+response = completion(
+model="gpt-3.5-turbo",
+messages=messages,
+temperature=0.5,
+top_p=0.1,
+n=2,
+max_tokens=150,
+presence_penalty=0.5,
+frequency_penalty=-0.5,
+logit_bias={123: 5},
+user="ishaan_dev@berri.ai",
+)
 # Add any assertions here to check the response
 print(response)
-response_str = response['choices'][0]['message']['content']
+response_str = response["choices"][0]["message"]["content"]
 response_str_2 = response.choices[0].message.content
-print(response['choices'][0]['message']['content'])
+print(response["choices"][0]["message"]["content"])
 print(response.choices[0].message.content)
 if type(response_str) != str:
 pytest.fail(f"Error occurred: {e}")
@@ -141,14 +190,28 @@ def test_completion_openai_with_more_optional_params():
 except Exception as e:
 pytest.fail(f"Error occurred: {e}")


 def test_completion_openai_with_stream():
 try:
-response = completion(model="gpt-3.5-turbo", messages=messages, temperature=0.5, top_p=0.1, n=2, max_tokens=150, presence_penalty=0.5, stream=True, frequency_penalty=-0.5, logit_bias={27000: 5}, user="ishaan_dev@berri.ai")
+response = completion(
+model="gpt-3.5-turbo",
+messages=messages,
+temperature=0.5,
+top_p=0.1,
+n=2,
+max_tokens=150,
+presence_penalty=0.5,
+stream=True,
+frequency_penalty=-0.5,
+logit_bias={27000: 5},
+user="ishaan_dev@berri.ai",
+)
 # Add any assertions here to check the response
 print(response)
 except Exception as e:
 pytest.fail(f"Error occurred: {e}")


 def test_completion_openai_with_functions():
 function1 = [
 {
@@ -159,32 +222,38 @@ def test_completion_openai_with_functions():
 "properties": {
 "location": {
 "type": "string",
-"description": "The city and state, e.g. San Francisco, CA"
+"description": "The city and state, e.g. San Francisco, CA",
 },
-"unit": {
-"type": "string",
-"enum": ["celsius", "fahrenheit"]
-}
+"unit": {"type": "string", "enum": ["celsius", "fahrenheit"]},
+},
+"required": ["location"],
 },
-"required": ["location"]
-}
 }
 ]
 try:
-response = completion(model="gpt-3.5-turbo", messages=messages, functions=function1)
+response = completion(
+model="gpt-3.5-turbo", messages=messages, functions=function1
+)
 # Add any assertions here to check the response
 print(response)
 except Exception as e:
 pytest.fail(f"Error occurred: {e}")


 def test_completion_azure():
 try:
-response = completion(model="gpt-3.5-turbo", deployment_id="chatgpt-test", messages=messages, custom_llm_provider="azure")
+response = completion(
+model="gpt-3.5-turbo",
+deployment_id="chatgpt-test",
+messages=messages,
+custom_llm_provider="azure",
+)
 # Add any assertions here to check the response
 print(response)
 except Exception as e:
 pytest.fail(f"Error occurred: {e}")


 # Replicate API endpoints are unstable -> throw random CUDA errors -> this means our tests can fail even if our tests weren't incorrect.
 def test_completion_replicate_llama_stream():
 model_name = "replicate/llama-2-70b-chat:2c1608e18606fad2812020dc541930f2d0495ce32eee50074220b87300bc16e1"
@@ -197,23 +266,32 @@ def test_completion_replicate_llama_stream():
 except Exception as e:
 pytest.fail(f"Error occurred: {e}")


 def test_completion_replicate_stability_stream():
 model_name = "stability-ai/stablelm-tuned-alpha-7b:c49dae362cbaecd2ceabb5bd34fdb68413c4ff775111fea065d259d577757beb"
 try:
-response = completion(model=model_name, messages=messages, stream=True, custom_llm_provider="replicate")
+response = completion(
+model=model_name,
+messages=messages,
+stream=True,
+custom_llm_provider="replicate",
+)
 # Add any assertions here to check the response
 for chunk in response:
-print(chunk['choices'][0]['delta'])
+print(chunk["choices"][0]["delta"])
 print(response)
 except Exception as e:
 pytest.fail(f"Error occurred: {e}")


 def test_completion_replicate_stability():
 model_name = "stability-ai/stablelm-tuned-alpha-7b:c49dae362cbaecd2ceabb5bd34fdb68413c4ff775111fea065d259d577757beb"
 try:
-response = completion(model=model_name, messages=messages, custom_llm_provider="replicate")
+response = completion(
+model=model_name, messages=messages, custom_llm_provider="replicate"
+)
 # Add any assertions here to check the response
-response_str = response['choices'][0]['message']['content']
+response_str = response["choices"][0]["message"]["content"]
 response_str_2 = response.choices[0].message.content
 print(response_str)
 print(response_str_2)
@@ -224,6 +302,7 @@ def test_completion_replicate_stability():
 except Exception as e:
 pytest.fail(f"Error occurred: {e}")


 ######## Test TogetherAI ########
 def test_completion_together_ai():
 model_name = "togethercomputer/llama-2-70b-chat"
@@ -234,15 +313,22 @@ def test_completion_together_ai():
 except Exception as e:
 pytest.fail(f"Error occurred: {e}")


 def test_petals():
 model_name = "stabilityai/StableBeluga2"
 try:
-response = completion(model=model_name, messages=messages, custom_llm_provider="petals", force_timeout=120)
+response = completion(
+model=model_name,
+messages=messages,
+custom_llm_provider="petals",
+force_timeout=120,
+)
 # Add any assertions here to check the response
 print(response)
 except Exception as e:
 pytest.fail(f"Error occurred: {e}")


 # def test_baseten_falcon_7bcompletion():
 # model_name = "qvv0xeq"
 # try:
@@ -290,7 +376,6 @@ def test_petals():
 # pytest.fail(f"Error occurred: {e}")


 #### Test A121 ###################
 # def test_completion_ai21():
 # model_name = "j2-light"
@@ -333,4 +418,3 @@ def test_petals():
 # return

 # test_completion_together_ai_stream()
@@ -1,14 +1,21 @@
 import sys, os
 import traceback
 from dotenv import load_dotenv

 load_dotenv()
 import os
-sys.path.insert(0, os.path.abspath('../..')) # Adds the parent directory to the system path
+sys.path.insert(
+0, os.path.abspath("../..")
+) # Adds the parent directory to the system path
 import litellm
 from litellm import completion


 def logging_fn(model_call_dict):
 print(f"model call details: {model_call_dict}")


 models = ["gorilla-7b-hf-v1", "gpt-4"]
 custom_llm_provider = None
 messages = [{"role": "user", "content": "Hey, how's it going?"}]
@@ -17,4 +24,10 @@ for model in models: # iterate through list
 if model == "gorilla-7b-hf-v1":
 custom_llm_provider = "custom_openai"
 custom_api_base = "http://zanino.millennium.berkeley.edu:8000/v1"
-completion(model=model, messages=messages, custom_llm_provider=custom_llm_provider, custom_api_base=custom_api_base, logger_fn=logging_fn)
+completion(
+model=model,
+messages=messages,
+custom_llm_provider=custom_llm_provider,
+custom_api_base=custom_api_base,
+logger_fn=logging_fn,
+)
@@ -1,9 +1,10 @@

 import sys, os
 import traceback
 import pytest

-sys.path.insert(0, os.path.abspath('../..')) # Adds the parent directory to the system path
+sys.path.insert(
+0, os.path.abspath("../..")
+) # Adds the parent directory to the system path
 import litellm
 from litellm import embedding, completion
 from infisical import InfisicalClient
@@ -11,9 +12,12 @@ from infisical import InfisicalClient
 # # litellm.set_verbose = True
 # litellm.secret_manager_client = InfisicalClient(token=os.environ["INFISICAL_TOKEN"])


 def test_openai_embedding():
 try:
-response = embedding(model='text-embedding-ada-002', input=["good morning from litellm"])
+response = embedding(
+model="text-embedding-ada-002", input=["good morning from litellm"]
+)
 # Add any assertions here to check the response
 print(f"response: {str(response)}")
 except Exception as e:
@@ -2,9 +2,20 @@
 import os
 import sys
 import traceback
-sys.path.insert(0, os.path.abspath('../..')) # Adds the parent directory to the system path
+sys.path.insert(
+0, os.path.abspath("../..")
+) # Adds the parent directory to the system path
 import litellm
-from litellm import embedding, completion, AuthenticationError, InvalidRequestError, RateLimitError, ServiceUnavailableError, OpenAIError
+from litellm import (
+embedding,
+completion,
+AuthenticationError,
+InvalidRequestError,
+RateLimitError,
+ServiceUnavailableError,
+OpenAIError,
+)
 from concurrent.futures import ThreadPoolExecutor
 import pytest

@@ -23,6 +34,8 @@ litellm.failure_callback = ["sentry"]
 # models = ["gpt-3.5-turbo", "chatgpt-test", "claude-instant-1", "command-nightly"]
 test_model = "claude-instant-1"
 models = ["claude-instant-1"]


 def logging_fn(model_call_dict):
 if "model" in model_call_dict:
 print(f"model_call_dict: {model_call_dict['model']}")
@@ -38,7 +51,12 @@ def test_context_window(model):
 try:
 model = "chatgpt-test"
 print(f"model: {model}")
-response = completion(model=model, messages=messages, custom_llm_provider="azure", logger_fn=logging_fn)
+response = completion(
+model=model,
+messages=messages,
+custom_llm_provider="azure",
+logger_fn=logging_fn,
+)
 print(f"response: {response}")
 except InvalidRequestError as e:
 print(f"InvalidRequestError: {e.llm_provider}")
@@ -52,8 +70,11 @@ def test_context_window(model):
 print(f"Uncaught Exception - {e}")
 pytest.fail(f"Error occurred: {e}")
 return


 test_context_window(test_model)


 # Test 2: InvalidAuth Errors
 @pytest.mark.parametrize("model", models)
 def invalid_auth(model): # set the model key to an invalid key, depending on the model
@@ -74,15 +95,22 @@ def invalid_auth(model): # set the model key to an invalid key, depending on the
 elif model == "command-nightly":
 temporary_key = os.environ["COHERE_API_KEY"]
 os.environ["COHERE_API_KEY"] = "bad-key"
-elif model == "replicate/llama-2-70b-chat:2c1608e18606fad2812020dc541930f2d0495ce32eee50074220b87300bc16e1":
+elif (
+model
+== "replicate/llama-2-70b-chat:2c1608e18606fad2812020dc541930f2d0495ce32eee50074220b87300bc16e1"
+):
 temporary_key = os.environ["REPLICATE_API_KEY"]
 os.environ["REPLICATE_API_KEY"] = "bad-key"
 print(f"model: {model}")
-response = completion(model=model, messages=messages, custom_llm_provider=custom_llm_provider)
+response = completion(
+model=model, messages=messages, custom_llm_provider=custom_llm_provider
+)
 print(f"response: {response}")
 except AuthenticationError as e:
 print(f"AuthenticationError Caught Exception - {e.llm_provider}")
-except OpenAIError: # is at least an openai error -> in case of random model errors - e.g. overloaded server
+except (
+OpenAIError
+): # is at least an openai error -> in case of random model errors - e.g. overloaded server
 print(f"OpenAIError Caught Exception - {e}")
 except Exception as e:
 print(type(e))
@@ -99,9 +127,14 @@ def invalid_auth(model): # set the model key to an invalid key, depending on the
 os.environ["ANTHROPIC_API_KEY"] = temporary_key
 elif model == "command-nightly":
 os.environ["COHERE_API_KEY"] = temporary_key
-elif model == "replicate/llama-2-70b-chat:2c1608e18606fad2812020dc541930f2d0495ce32eee50074220b87300bc16e1":
+elif (
+model
+== "replicate/llama-2-70b-chat:2c1608e18606fad2812020dc541930f2d0495ce32eee50074220b87300bc16e1"
+):
 os.environ["REPLICATE_API_KEY"] = temporary_key
 return


 invalid_auth(test_model)
 # # Test 3: Rate Limit Errors
 # def test_model(model):
@@ -142,5 +175,3 @@ invalid_auth(test_model)

 # accuracy_score = counts[True]/(counts[True] + counts[False])
 # print(f"accuracy_score: {accuracy_score}")
@@ -5,7 +5,9 @@ import sys, os
 import traceback
 import pytest

-sys.path.insert(0, os.path.abspath('../..')) # Adds the parent directory to the system path
+sys.path.insert(
+0, os.path.abspath("../..")
+) # Adds the parent directory to the system path
 import litellm
 from litellm import embedding, completion

@@ -18,7 +20,11 @@ messages = [{ "content": user_message,"role": "user"}]


 # openai call
-response = completion(model="gpt-3.5-turbo", messages=[{"role": "user", "content": "Hi 👋 - i'm openai"}])
+response = completion(
+model="gpt-3.5-turbo", messages=[{"role": "user", "content": "Hi 👋 - i'm openai"}]
+)

 # cohere call
-response = completion(model="command-nightly", messages=[{"role": "user", "content": "Hi 👋 - i'm cohere"}])
+response = completion(
+model="command-nightly", messages=[{"role": "user", "content": "Hi 👋 - i'm cohere"}]
+)

@@ -1,6 +1,9 @@
 import sys, os
 import traceback
-sys.path.insert(0, os.path.abspath('../..')) # Adds the parent directory to the system path
+sys.path.insert(
+0, os.path.abspath("../..")
+) # Adds the parent directory to the system path
 import litellm
 from litellm import load_test_model, testing_batch_completion

@@ -16,7 +19,19 @@ from litellm import load_test_model, testing_batch_completion
 # print(result)

 ## Quality Test across Model
-models = ["gpt-3.5-turbo", "gpt-3.5-turbo-16k", "gpt-4", "claude-instant-1", {"model": "replicate/llama-2-70b-chat:58d078176e02c219e11eb4da5a02a7830a283b14cf8f94537af893ccff5ee781", "custom_llm_provider": "replicate"}]
-messages = [[{"role": "user", "content": "What is your name?"}], [{"role": "user", "content": "Hey, how's it going?"}]]
+models = [
+"gpt-3.5-turbo",
+"gpt-3.5-turbo-16k",
+"gpt-4",
+"claude-instant-1",
+{
+"model": "replicate/llama-2-70b-chat:58d078176e02c219e11eb4da5a02a7830a283b14cf8f94537af893ccff5ee781",
+"custom_llm_provider": "replicate",
+},
+]
+messages = [
+[{"role": "user", "content": "What is your name?"}],
+[{"role": "user", "content": "Hey, how's it going?"}],
+]
 result = testing_batch_completion(models=models, messages=messages)
 print(result)
@@ -3,7 +3,10 @@

 import sys, os
 import traceback
-sys.path.insert(0, os.path.abspath('../..')) # Adds the parent directory to the system path
+sys.path.insert(
+0, os.path.abspath("../..")
+) # Adds the parent directory to the system path
 import litellm
 from litellm import embedding, completion

@@ -11,9 +14,11 @@ litellm.set_verbose = False

 score = 0


 def logger_fn(model_call_object: dict):
 print(f"model call details: {model_call_object}")


 user_message = "Hello, how are you?"
 messages = [{"content": user_message, "role": "user"}]

@@ -27,7 +32,9 @@ except:

 # test on non-openai completion call
 try:
-response = completion(model="claude-instant-1", messages=messages, logger_fn=logger_fn)
+response = completion(
+model="claude-instant-1", messages=messages, logger_fn=logger_fn
+)
 print(f"claude response: {response}")
 score += 1
 except:

@@ -3,7 +3,10 @@

 import sys, os
 import traceback
-sys.path.insert(0, os.path.abspath('../..')) # Adds the parent directory to the system path
+sys.path.insert(
+0, os.path.abspath("../..")
+) # Adds the parent directory to the system path
 import litellm
 from litellm import embedding, completion
@ -4,7 +4,10 @@
|
||||||
|
|
||||||
import sys, os
|
import sys, os
|
||||||
import traceback
|
import traceback
|
||||||
sys.path.insert(0, os.path.abspath('../..')) # Adds the parent directory to the system path
|
|
||||||
|
sys.path.insert(
|
||||||
|
0, os.path.abspath("../..")
|
||||||
|
) # Adds the parent directory to the system path
|
||||||
import litellm
|
import litellm
|
||||||
from litellm import embedding, completion
|
from litellm import embedding, completion
|
||||||
|
|
||||||
|
|
|
@@ -53,7 +53,6 @@

# # # return this generator to the client for streaming requests

# # async def get_response():
# # global generator
# # async for elem in generator:

@@ -12,7 +12,6 @@

# import asyncio

# user_message = "respond in 20 words. who are you?"
# messages = [{ "content": user_message,"role": "user"}]

@@ -45,8 +44,3 @@

# pytest.fail(f"Error occurred: {e}")

# test_completion_ollama_stream()
@@ -4,7 +4,10 @@

import sys, os
import traceback

sys.path.insert(
    0, os.path.abspath("../..")
)  # Adds the parent directory to the system path
import litellm
from litellm import embedding, completion
from infisical import InfisicalClient

@@ -28,5 +31,5 @@ def test_completion_openai():

        pytest.fail(f"Error occurred: {e}")
    litellm.secret_manager_client = None


test_completion_openai()
@@ -3,7 +3,10 @@

import sys, os
import traceback

sys.path.insert(
    0, os.path.abspath("../..")
)  # Adds the parent directory to the system path
import litellm
from litellm import completion

@@ -11,17 +14,21 @@ litellm.set_verbose = False

score = 0


def logger_fn(model_call_object: dict):
    print(f"model call details: {model_call_object}")


user_message = "Hello, how are you?"
messages = [{"content": user_message, "role": "user"}]

# test on anthropic completion call
try:
    response = completion(
        model="claude-instant-1", messages=messages, stream=True, logger_fn=logger_fn
    )
    for chunk in response:
        print(chunk["choices"][0]["delta"])
    score += 1
except:
    print(f"error occurred: {traceback.format_exc()}")

@@ -30,9 +37,16 @@ except:

# test on anthropic completion call
try:
    response = completion(
        model="meta-llama/Llama-2-7b-chat-hf",
        messages=messages,
        custom_llm_provider="huggingface",
        custom_api_base="https://s7c7gytn18vnu4tw.us-east-1.aws.endpoints.huggingface.cloud",
        stream=True,
        logger_fn=logger_fn,
    )
    for chunk in response:
        print(chunk["choices"][0]["delta"])
    score += 1
except:
    print(f"error occurred: {traceback.format_exc()}")
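The streaming tests above print each delta as it arrives; a common follow-on step is stitching the deltas back into one string. A small, illustrative sketch using the same chunk["choices"][0]["delta"] access pattern; the delta's exact shape can vary by provider, so it is handled defensively here.

# Illustrative only: accumulating streamed deltas into a full reply.
from litellm import completion

messages = [{"role": "user", "content": "Hello, how are you?"}]
response = completion(model="claude-instant-1", messages=messages, stream=True)

full_text = ""
for chunk in response:
    delta = chunk["choices"][0]["delta"]  # same access pattern as the tests above
    # delta may be a dict with a "content" field or already a string, depending on provider
    full_text += delta.get("content", "") if isinstance(delta, dict) else str(delta)

print(full_text)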
@@ -3,10 +3,14 @@

import sys, os
import traceback

sys.path.insert(
    0, os.path.abspath("../..")
)  # Adds the parent directory to the system path
import time
from litellm import timeout


@timeout(10)
def stop_after_10_s(force_timeout=60):
    print("Stopping after 10 seconds")
@@ -11,9 +11,7 @@ from threading import Thread

from openai.error import Timeout


def timeout(timeout_duration: float = None, exception_to_raise=Timeout):
    """
    Wraps a function to raise the specified exception if execution time
    is greater than the specified timeout.

@@ -44,7 +42,9 @@ def timeout(

                result = future.result(timeout=local_timeout_duration)
            except futures.TimeoutError:
                thread.stop_loop()
                raise exception_to_raise(
                    f"A timeout error occurred. The function call took longer than {local_timeout_duration} second(s)."
                )
            thread.stop_loop()
            return result

@@ -59,7 +59,9 @@ def timeout(

                )
                return value
            except asyncio.TimeoutError:
                raise exception_to_raise(
                    f"A timeout error occurred. The function call took longer than {local_timeout_duration} second(s)."
                )

        if iscoroutinefunction(func):
            return async_wrapper
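A small usage sketch of the timeout decorator shown above: the wrapped call gets the configured number of seconds before the default exception (openai.error.Timeout, per the signature above) is raised. Purely illustrative.

# Illustrative only: wrapping a deliberately slow function with litellm's timeout decorator.
import time

from litellm import timeout
from openai.error import Timeout


@timeout(2)  # allow roughly two seconds before raising
def slow_call():
    time.sleep(10)
    return "done"


try:
    slow_call()
except Timeout:
    print("call was cut off by the timeout decorator")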
509  litellm/utils.py
@@ -5,6 +5,7 @@ import litellm, openai

import random, uuid, requests
import datetime, time
import tiktoken

encoding = tiktoken.get_encoding("cl100k_base")
import pkg_resources
from .integrations.helicone import HeliconeLogger

@@ -13,8 +14,15 @@ from .integrations.berrispend import BerriSpendLogger

from .integrations.supabase import Supabase
from openai.error import OpenAIError as OriginalError
from openai.openai_object import OpenAIObject
from .exceptions import (
    AuthenticationError,
    InvalidRequestError,
    RateLimitError,
    ServiceUnavailableError,
    OpenAIError,
)
from typing import List, Dict, Union

####### ENVIRONMENT VARIABLES ###################
dotenv.load_dotenv()  # Loading env variables using dotenv
sentry_sdk_instance = None
@@ -51,12 +59,14 @@ local_cache = {}

# 'usage': {'prompt_tokens': 18, 'completion_tokens': 23, 'total_tokens': 41}
# }


class Message(OpenAIObject):
    def __init__(self, content="default", role="assistant", **params):
        super(Message, self).__init__(**params)
        self.content = content
        self.role = role


class Choices(OpenAIObject):
    def __init__(self, finish_reason="stop", index=0, message=Message(), **params):
        super(Choices, self).__init__(**params)

@@ -64,22 +74,29 @@ class Choices(OpenAIObject):

        self.index = index
        self.message = message


class ModelResponse(OpenAIObject):
    def __init__(self, choices=None, created=None, model=None, usage=None, **params):
        super(ModelResponse, self).__init__(**params)
        self.choices = choices if choices else [Choices()]
        self.created = created
        self.model = model
        self.usage = (
            usage
            if usage
            else {
                "prompt_tokens": None,
                "completion_tokens": None,
                "total_tokens": None,
            }
        )

    def to_dict_recursive(self):
        d = super().to_dict_recursive()
        d["choices"] = [choice.to_dict_recursive() for choice in self.choices]
        return d


############################################################
def print_verbose(print_statement):
    if litellm.set_verbose:
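A short, purely illustrative sketch of the response wrapper defined above: a ModelResponse holds Choices, each wrapping a Message, plus a usage dict. Field values here are made up.

# Illustrative only: constructing a ModelResponse by hand to show its shape.
response = ModelResponse(
    choices=[Choices(message=Message(content="Hi there", role="assistant"))],
    created=1692000000,
    model="gpt-3.5-turbo",
    usage={"prompt_tokens": 10, "completion_tokens": 3, "total_tokens": 13},
)

print(response.choices[0].message.content)  # -> "Hi there"
print(response.to_dict_recursive())  # plain-dict view, as defined above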
@@ -87,9 +104,12 @@ def print_verbose(print_statement):

        if random.random() <= 0.3:
            print("Get help - https://discord.com/invite/wuPM9dRgDw")


####### Package Import Handler ###################
import importlib
import subprocess


def install_and_import(package: str):
    if package in globals().keys():
        print_verbose(f"{package} has already been imported.")

@@ -108,11 +128,22 @@ def install_and_import(package: str):

    finally:
        if package not in globals().keys():
            globals()[package] = importlib.import_module(package)


##################################################


####### LOGGING ###################
# Logging function -> log the exact model details + what's being sent | Non-Blocking
def logging(
    model=None,
    input=None,
    custom_llm_provider=None,
    azure=False,
    additional_args={},
    logger_fn=None,
    exception=None,
):
    try:
        model_call_details = {}
        if model:
@@ -130,7 +161,12 @@ def logging(model=None, input=None, custom_llm_provider=None, azure=False, addit

        model_call_details["additional_args"] = additional_args
        # log additional call details -> api key, etc.
        if model:
            if (
                azure == True
                or model in litellm.open_ai_chat_completion_models
                or model in litellm.open_ai_chat_completion_models
                or model in litellm.open_ai_embedding_models
            ):
                model_call_details["api_type"] = openai.api_type
                model_call_details["api_base"] = openai.api_base
                model_call_details["api_version"] = openai.api_version

@@ -142,25 +178,42 @@ def logging(model=None, input=None, custom_llm_provider=None, azure=False, addit

            elif model in litellm.cohere_models:
                model_call_details["api_key"] = os.environ.get("COHERE_API_KEY")
        ## User Logging -> if you pass in a custom logging function or want to use sentry breadcrumbs
        print_verbose(
            f"Logging Details: logger_fn - {logger_fn} | callable(logger_fn) - {callable(logger_fn)}"
        )
        if logger_fn and callable(logger_fn):
            try:
                logger_fn(
                    model_call_details
                )  # Expectation: any logger function passed in by the user should accept a dict object
            except Exception as e:
                print(
                    f"LiteLLM.LoggingError: [Non-Blocking] Exception occurred while logging {traceback.format_exc()}"
                )
    except Exception as e:
        print(
            f"LiteLLM.LoggingError: [Non-Blocking] Exception occurred while logging {traceback.format_exc()}"
        )
        pass


####### CLIENT ###################
# make it easy to log if completion/embedding runs succeeded or failed + see what happened | Non-Blocking
def client(original_function):
    def function_setup(
        *args, **kwargs
    ):  # just run once to check if user wants to send their data anywhere - PostHog/Sentry/Slack/etc.
        try:
            global callback_list, add_breadcrumb, user_logger_fn
            if (
                len(litellm.success_callback) > 0 or len(litellm.failure_callback) > 0
            ) and len(callback_list) == 0:
                callback_list = list(
                    set(litellm.success_callback + litellm.failure_callback)
                )
                set_callbacks(
                    callback_list=callback_list,
                )
            if add_breadcrumb:
                add_breadcrumb(
                    category="litellm.llm_call",
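As the wrapper above expects, a user-supplied logger_fn is just a callable that receives one dict of call details. A minimal, illustrative callback; the keys accessed are ones the logging() helper above populates, and .get is used in case a key is absent.

# Illustrative only: a user-defined logger_fn that completion() calls with a dict.
def my_logger_fn(model_call_details: dict):
    print(f"model: {model_call_details.get('model')}")
    print(f"additional args: {model_call_details.get('additional_args')}")


# Hypothetical usage, mirroring the test files earlier in this diff:
# completion(model="gpt-3.5-turbo", messages=messages, logger_fn=my_logger_fn)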
@@ -178,8 +231,16 @@ def client(original_function):

        try:
            model = args[0] if len(args) > 0 else kwargs["model"]
            exception = kwargs["exception"] if "exception" in kwargs else None
            custom_llm_provider = (
                kwargs["custom_llm_provider"]
                if "custom_llm_provider" in kwargs
                else None
            )
            safe_crash_reporting(
                model=model,
                exception=exception,
                custom_llm_provider=custom_llm_provider,
            )  # log usage-crash details. Do not log any user details. If you want to turn this off, set `litellm.telemetry=False`.
        except:
            # [Non-Blocking Error]
            pass

@@ -199,7 +260,9 @@ def client(original_function):

    def check_cache(*args, **kwargs):
        try:  # never block execution
            prompt = get_prompt(*args, **kwargs)
            if (
                prompt != None and prompt in local_cache
            ):  # check if messages / prompt exists
                result = local_cache[prompt]
                return result
            else:

@@ -221,7 +284,10 @@ def client(original_function):

            function_setup(*args, **kwargs)
            ## MODEL CALL
            start_time = datetime.datetime.now()
            if (
                litellm.caching
                and (cached_result := check_cache(*args, **kwargs)) is not None
            ):
                result = cached_result
            else:
                result = original_function(*args, **kwargs)

@@ -231,26 +297,35 @@ def client(original_function):

                add_cache(result, *args, **kwargs)
            ## LOG SUCCESS
            crash_reporting(*args, **kwargs)
            my_thread = threading.Thread(
                target=handle_success, args=(args, kwargs, result, start_time, end_time)
            )  # don't interrupt execution of main thread
            my_thread.start()
            return result
        except Exception as e:
            traceback_exception = traceback.format_exc()
            crash_reporting(*args, **kwargs, exception=traceback_exception)
            end_time = datetime.datetime.now()
            my_thread = threading.Thread(
                target=handle_failure,
                args=(e, traceback_exception, start_time, end_time, args, kwargs),
            )  # don't interrupt execution of main thread
            my_thread.start()
            raise e

    return wrapper


####### USAGE CALCULATOR ################


def token_counter(model, text):
    # use tiktoken or anthropic's tokenizer depending on the model
    num_tokens = 0
    if "claude" in model:
        install_and_import("anthropic")
        from anthropic import Anthropic, HUMAN_PROMPT, AI_PROMPT

        anthropic = Anthropic()
        num_tokens = anthropic.count_tokens(text)
    else:
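A hedged sketch of the caching path the wrapper above implements: with litellm.caching turned on, an identical prompt is looked up in local_cache before the provider is called again. Illustrative only; cache keying follows check_cache/add_cache above.

# Illustrative only: exercising the in-memory cache in the client wrapper above.
import litellm
from litellm import completion

litellm.caching = True  # enables the check_cache()/add_cache() branch

messages = [{"role": "user", "content": "What is the capital of France?"}]

first = completion(model="gpt-3.5-turbo", messages=messages)   # real provider call
second = completion(model="gpt-3.5-turbo", messages=messages)  # served from local_cache
print(second)  # should match the first response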
@@ -264,8 +339,12 @@ def cost_per_token(model="gpt-3.5-turbo", prompt_tokens = 0, completion_tokens =

    completion_tokens_cost_usd_dollar = 0
    model_cost_ref = litellm.model_cost
    if model in model_cost_ref:
        prompt_tokens_cost_usd_dollar = (
            model_cost_ref[model]["input_cost_per_token"] * prompt_tokens
        )
        completion_tokens_cost_usd_dollar = (
            model_cost_ref[model]["output_cost_per_token"] * completion_tokens
        )
        return prompt_tokens_cost_usd_dollar, completion_tokens_cost_usd_dollar
    else:
        # calculate average input cost

@@ -285,9 +364,12 @@ def cost_per_token(model="gpt-3.5-turbo", prompt_tokens = 0, completion_tokens =

def completion_cost(model="gpt-3.5-turbo", prompt="", completion=""):
    prompt_tokens = token_counter(model=model, text=prompt)
    completion_tokens = token_counter(model=model, text=completion)
    prompt_tokens_cost_usd_dollar, completion_tokens_cost_usd_dollar = cost_per_token(
        model=model, prompt_tokens=prompt_tokens, completion_tokens=completion_tokens
    )
    return prompt_tokens_cost_usd_dollar + completion_tokens_cost_usd_dollar
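A worked example of the cost math above. Assuming illustrative per-token prices of 0.0000015 USD for input and 0.000002 USD for output on gpt-3.5-turbo, 18 prompt tokens and 23 completion tokens come to 18 * 0.0000015 + 23 * 0.000002 = 0.000073 USD. The sketch drives cost_per_token directly with those token counts.

# Illustrative only: token counts are supplied directly; prices come from litellm.model_cost.
prompt_cost, completion_cost_usd = cost_per_token(
    model="gpt-3.5-turbo", prompt_tokens=18, completion_tokens=23
)
# Named completion_cost_usd to avoid shadowing the completion_cost() helper above.
print(f"prompt cost:     ${prompt_cost:.6f}")
print(f"completion cost: ${completion_cost_usd:.6f}")
print(f"total cost:      ${prompt_cost + completion_cost_usd:.6f}")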
####### HELPER FUNCTIONS ################
def get_litellm_params(
    return_async=False,

@@ -300,7 +382,7 @@ def get_litellm_params(

    replicate=False,
    together_ai=False,
    custom_llm_provider=None,
    custom_api_base=None,
):
    litellm_params = {
        "return_async": return_async,

@@ -309,7 +391,7 @@ def get_litellm_params(

        "logger_fn": logger_fn,
        "verbose": verbose,
        "custom_llm_provider": custom_llm_provider,
        "custom_api_base": custom_api_base,
    }

    return litellm_params

@@ -324,7 +406,7 @@ def get_optional_params(

    n=1,
    stream=False,
    stop=None,
    max_tokens=float("inf"),
    presence_penalty=0,
    frequency_penalty=0,
    logit_bias={},

@@ -352,7 +434,7 @@ def get_optional_params(

        optional_params["stream"] = stream
        if temperature != 1:
            optional_params["temperature"] = temperature
        if max_tokens != float("inf"):
            optional_params["max_tokens"] = max_tokens
        return optional_params
    elif custom_llm_provider == "replicate":

@@ -368,16 +450,18 @@ def get_optional_params(

            optional_params["temperature"] = temperature
        if top_p != 1:
            optional_params["top_p"] = top_p
        if max_tokens != float("inf"):
            optional_params["max_tokens"] = max_tokens
        if frequency_penalty != 0:
            optional_params["frequency_penalty"] = frequency_penalty
    elif (
        model == "chat-bison"
    ):  # chat-bison has diff args from chat-bison@001 ty Google
        if temperature != 1:
            optional_params["temperature"] = temperature
        if top_p != 1:
            optional_params["top_p"] = top_p
        if max_tokens != float("inf"):
            optional_params["max_output_tokens"] = max_tokens
    elif model in litellm.vertex_text_models:
        # required params for all text vertex calls

@@ -402,7 +486,7 @@ def get_optional_params(

        optional_params["stream"] = stream
        if stop != None:
            optional_params["stop"] = stop
        if max_tokens != float("inf"):
            optional_params["max_tokens"] = max_tokens
        if presence_penalty != 0:
            optional_params["presence_penalty"] = presence_penalty

@@ -417,7 +501,15 @@ def get_optional_params(

        return optional_params
    return optional_params


def load_test_model(
    model: str,
    custom_llm_provider: str = None,
    custom_api_base: str = None,
    prompt: str = None,
    num_calls: int = None,
    force_timeout: int = None,
):
    test_prompt = "Hey, how's it going"
    test_calls = 100
    if prompt:
@@ -427,14 +519,31 @@ def load_test_model(model: str, custom_llm_provider: str = None, custom_api_base

    messages = [[{"role": "user", "content": test_prompt}] for _ in range(test_calls)]
    start_time = time.time()
    try:
        litellm.batch_completion(
            model=model,
            messages=messages,
            custom_llm_provider=custom_llm_provider,
            custom_api_base=custom_api_base,
            force_timeout=force_timeout,
        )
        end_time = time.time()
        response_time = end_time - start_time
        return {
            "total_response_time": response_time,
            "calls_made": 100,
            "status": "success",
            "exception": None,
        }
    except Exception as e:
        end_time = time.time()
        response_time = end_time - start_time
        return {
            "total_response_time": response_time,
            "calls_made": 100,
            "status": "failed",
            "exception": e,
        }
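A usage sketch for the load-test helper above: by default it builds 100 copies of a canned prompt, runs them through litellm.batch_completion, and reports total wall-clock time plus success or failure. Argument values here are illustrative.

# Illustrative only: a quick load test against a single model.
stats = load_test_model(model="gpt-3.5-turbo", num_calls=100, force_timeout=120)
print(stats["status"], stats["total_response_time"], stats["calls_made"])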
def set_callbacks(callback_list):
    global sentry_sdk_instance, capture_exception, add_breadcrumb, posthog, slack_app, alerts_channel, heliconeLogger, aispendLogger, berrispendLogger, supabaseClient

@@ -445,11 +554,20 @@ def set_callbacks(callback_list):

                import sentry_sdk
            except ImportError:
                print_verbose("Package 'sentry_sdk' is missing. Installing it...")
                subprocess.check_call(
                    [sys.executable, "-m", "pip", "install", "sentry_sdk"]
                )
                import sentry_sdk
            sentry_sdk_instance = sentry_sdk
            sentry_trace_rate = (
                os.environ.get("SENTRY_API_TRACE_RATE")
                if "SENTRY_API_TRACE_RATE" in os.environ
                else "1.0"
            )
            sentry_sdk_instance.init(
                dsn=os.environ.get("SENTRY_API_URL"),
                traces_sample_rate=float(sentry_trace_rate),
            )
            capture_exception = sentry_sdk_instance.capture_exception
            add_breadcrumb = sentry_sdk_instance.add_breadcrumb
        elif callback == "posthog":

@@ -457,21 +575,26 @@ def set_callbacks(callback_list):

                from posthog import Posthog
            except ImportError:
                print_verbose("Package 'posthog' is missing. Installing it...")
                subprocess.check_call(
                    [sys.executable, "-m", "pip", "install", "posthog"]
                )
                from posthog import Posthog
            posthog = Posthog(
                project_api_key=os.environ.get("POSTHOG_API_KEY"),
                host=os.environ.get("POSTHOG_API_URL"),
            )
        elif callback == "slack":
            try:
                from slack_bolt import App
            except ImportError:
                print_verbose("Package 'slack_bolt' is missing. Installing it...")
                subprocess.check_call(
                    [sys.executable, "-m", "pip", "install", "slack_bolt"]
                )
                from slack_bolt import App
            slack_app = App(
                token=os.environ.get("SLACK_API_TOKEN"),
                signing_secret=os.environ.get("SLACK_API_SECRET"),
            )
            alerts_channel = os.environ["SLACK_API_CHANNEL"]
            print_verbose(f"Initialized Slack App: {slack_app}")
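A configuration sketch for the callback machinery above: callbacks are chosen by name on litellm.success_callback / litellm.failure_callback, and set_callbacks() then initializes the matching client from environment variables (POSTHOG_API_KEY, SLACK_API_TOKEN, and so on, as read above). Values below are placeholders.

# Illustrative only: selecting callbacks by name; credentials are placeholders.
import os
import litellm

os.environ["POSTHOG_API_KEY"] = "<posthog-project-key>"
os.environ["POSTHOG_API_URL"] = "https://app.posthog.com"
# Slack additionally needs SLACK_API_TOKEN, SLACK_API_SECRET and SLACK_API_CHANNEL.

litellm.success_callback = ["posthog"]
litellm.failure_callback = ["posthog", "slack"]

# The first completion() call after this triggers function_setup(), which passes
# the union of both lists to set_callbacks().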
@@ -496,10 +619,11 @@ def handle_failure(exception, traceback_exception, start_time, end_time, args, k

        success_handler = additional_details.pop("success_handler", None)
        failure_handler = additional_details.pop("failure_handler", None)

        additional_details["Event_Name"] = additional_details.pop(
            "failed_event_name", "litellm.failed_query"
        )
        print_verbose(f"self.failure_callback: {litellm.failure_callback}")

        # print_verbose(f"additional_details: {additional_details}")
        for callback in litellm.failure_callback:
            try:

@@ -514,11 +638,15 @@ def handle_failure(exception, traceback_exception, start_time, end_time, args, k

                    for detail in additional_details:
                        slack_msg += f"{detail}: {additional_details[detail]}\n"
                    slack_msg += f"Traceback: {traceback_exception}"
                    slack_app.client.chat_postMessage(
                        channel=alerts_channel, text=slack_msg
                    )
                elif callback == "sentry":
                    capture_exception(exception)
                elif callback == "posthog":
                    print_verbose(
                        f"inside posthog, additional_details: {len(additional_details.keys())}"
                    )
                    ph_obj = {}
                    if len(kwargs) > 0:
                        ph_obj = kwargs

@@ -531,7 +659,9 @@ def handle_failure(exception, traceback_exception, start_time, end_time, args, k

                    print_verbose(f"ph_obj: {ph_obj}")
                    print_verbose(f"PostHog Event Name: {event_name}")
                    if "user_id" in additional_details:
                        posthog.capture(
                            additional_details["user_id"], event_name, ph_obj
                        )
                    else:  # PostHog calls require a unique id to identify a user - https://posthog.com/docs/libraries/python
                        unique_id = str(uuid.uuid4())
                        posthog.capture(unique_id, event_name)

@@ -545,11 +675,20 @@ def handle_failure(exception, traceback_exception, start_time, end_time, args, k

                        "created": time.time(),
                        "error": traceback_exception,
                        "usage": {
                            "prompt_tokens": prompt_token_calculator(
                                model, messages=messages
                            ),
                            "completion_tokens": 0,
                        },
                    }
                    berrispendLogger.log_event(
                        model=model,
                        messages=messages,
                        response_obj=result,
                        start_time=start_time,
                        end_time=end_time,
                        print_verbose=print_verbose,
                    )
                elif callback == "aispend":
                    print_verbose("reaches aispend for logging!")
                    model = args[0] if len(args) > 0 else kwargs["model"]

@@ -558,11 +697,19 @@ def handle_failure(exception, traceback_exception, start_time, end_time, args, k

                        "model": model,
                        "created": time.time(),
                        "usage": {
                            "prompt_tokens": prompt_token_calculator(
                                model, messages=messages
                            ),
                            "completion_tokens": 0,
                        },
                    }
                    aispendLogger.log_event(
                        model=model,
                        response_obj=result,
                        start_time=start_time,
                        end_time=end_time,
                        print_verbose=print_verbose,
                    )
                elif callback == "supabase":
                    print_verbose("reaches supabase for logging!")
                    model = args[0] if len(args) > 0 else kwargs["model"]

@@ -572,21 +719,33 @@ def handle_failure(exception, traceback_exception, start_time, end_time, args, k

                        "created": time.time(),
                        "error": traceback_exception,
                        "usage": {
                            "prompt_tokens": prompt_token_calculator(
                                model, messages=messages
                            ),
                            "completion_tokens": 0,
                        },
                    }
                    print(f"litellm._thread_context: {litellm._thread_context}")
                    supabaseClient.log_event(
                        model=model,
                        messages=messages,
                        end_user=litellm._thread_context.user,
                        response_obj=result,
                        start_time=start_time,
                        end_time=end_time,
                        print_verbose=print_verbose,
                    )

            except:
                print_verbose(
                    f"Error Occurred while logging failure: {traceback.format_exc()}"
                )
                pass

        if failure_handler and callable(failure_handler):
            call_details = {
                "exception": exception,
                "additional_details": additional_details,
            }
            failure_handler(call_details)
        pass
@@ -595,12 +754,15 @@ def handle_failure(exception, traceback_exception, start_time, end_time, args, k

        logging(logger_fn=user_logger_fn, exception=e)
        pass


def handle_success(args, kwargs, result, start_time, end_time):
    global heliconeLogger, aispendLogger
    try:
        success_handler = additional_details.pop("success_handler", None)
        failure_handler = additional_details.pop("failure_handler", None)
        additional_details["Event_Name"] = additional_details.pop(
            "successful_event_name", "litellm.succes_query"
        )
        for callback in litellm.success_callback:
            try:
                if callback == "posthog":

@@ -609,7 +771,9 @@ def handle_success(args, kwargs, result, start_time, end_time):

                        ph_obj[detail] = additional_details[detail]
                    event_name = additional_details["Event_Name"]
                    if "user_id" in additional_details:
                        posthog.capture(
                            additional_details["user_id"], event_name, ph_obj
                        )
                    else:  # PostHog calls require a unique id to identify a user - https://posthog.com/docs/libraries/python
                        unique_id = str(uuid.uuid4())
                        posthog.capture(unique_id, event_name, ph_obj)

@@ -618,31 +782,63 @@ def handle_success(args, kwargs, result, start_time, end_time):

                    slack_msg = ""
                    for detail in additional_details:
                        slack_msg += f"{detail}: {additional_details[detail]}\n"
                    slack_app.client.chat_postMessage(
                        channel=alerts_channel, text=slack_msg
                    )
                elif callback == "helicone":
                    print_verbose("reaches helicone for logging!")
                    model = args[0] if len(args) > 0 else kwargs["model"]
                    messages = args[1] if len(args) > 1 else kwargs["messages"]
                    heliconeLogger.log_success(
                        model=model,
                        messages=messages,
                        response_obj=result,
                        start_time=start_time,
                        end_time=end_time,
                        print_verbose=print_verbose,
                    )
                elif callback == "aispend":
                    print_verbose("reaches aispend for logging!")
                    model = args[0] if len(args) > 0 else kwargs["model"]
                    aispendLogger.log_event(
                        model=model,
                        response_obj=result,
                        start_time=start_time,
                        end_time=end_time,
                        print_verbose=print_verbose,
                    )
                elif callback == "berrispend":
                    print_verbose("reaches berrispend for logging!")
                    model = args[0] if len(args) > 0 else kwargs["model"]
                    messages = args[1] if len(args) > 1 else kwargs["messages"]
                    berrispendLogger.log_event(
                        model=model,
                        messages=messages,
                        response_obj=result,
                        start_time=start_time,
                        end_time=end_time,
                        print_verbose=print_verbose,
                    )
                elif callback == "supabase":
                    print_verbose("reaches supabase for logging!")
                    model = args[0] if len(args) > 0 else kwargs["model"]
                    messages = args[1] if len(args) > 1 else kwargs["messages"]
                    print(f"litellm._thread_context: {litellm._thread_context}")
                    supabaseClient.log_event(
                        model=model,
                        messages=messages,
                        end_user=litellm._thread_context.user,
                        response_obj=result,
                        start_time=start_time,
                        end_time=end_time,
                        print_verbose=print_verbose,
                    )
            except Exception as e:
                ## LOGGING
                logging(logger_fn=user_logger_fn, exception=e)
                print_verbose(
                    f"[Non-Blocking] Success Callback Error - {traceback.format_exc()}"
                )
                pass

        if success_handler and callable(success_handler):
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
## LOGGING
|
## LOGGING
|
||||||
logging(logger_fn=user_logger_fn, exception=e)
|
logging(logger_fn=user_logger_fn, exception=e)
|
||||||
print_verbose(f"[Non-Blocking] Success Callback Error - {traceback.format_exc()}")
|
print_verbose(
|
||||||
|
f"[Non-Blocking] Success Callback Error - {traceback.format_exc()}"
|
||||||
|
)
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
|
||||||
def prompt_token_calculator(model, messages):
|
def prompt_token_calculator(model, messages):
|
||||||
# use tiktoken or anthropic's tokenizer depending on the model
|
# use tiktoken or anthropic's tokenizer depending on the model
|
||||||
text = " ".join(message["content"] for message in messages)
|
text = " ".join(message["content"] for message in messages)
|
||||||
num_tokens = 0
|
num_tokens = 0
|
||||||
if "claude" in model:
|
if "claude" in model:
|
||||||
install_and_import('anthropic')
|
install_and_import("anthropic")
|
||||||
from anthropic import Anthropic, HUMAN_PROMPT, AI_PROMPT
|
from anthropic import Anthropic, HUMAN_PROMPT, AI_PROMPT
|
||||||
|
|
||||||
anthropic = Anthropic()
|
anthropic = Anthropic()
|
||||||
num_tokens = anthropic.count_tokens(text)
|
num_tokens = anthropic.count_tokens(text)
|
||||||
else:
|
else:
|
||||||
num_tokens = len(encoding.encode(text))
|
num_tokens = len(encoding.encode(text))
|
||||||
return num_tokens
|
return num_tokens
|
||||||
|
|
||||||
|
|
||||||
# integration helper function
|
# integration helper function
|
||||||
def modify_integration(integration_name, integration_params):
|
def modify_integration(integration_name, integration_params):
|
||||||
global supabaseClient
|
global supabaseClient
|
||||||
|
@ -674,6 +875,7 @@ def modify_integration(integration_name, integration_params):
|
||||||
if "table_name" in integration_params:
|
if "table_name" in integration_params:
|
||||||
Supabase.supabase_table_name = integration_params["table_name"]
|
Supabase.supabase_table_name = integration_params["table_name"]
|
||||||
|
|
||||||
|
|
||||||
def exception_type(model, original_exception, custom_llm_provider):
|
def exception_type(model, original_exception, custom_llm_provider):
|
||||||
global user_logger_fn
|
global user_logger_fn
|
||||||
exception_mapping_worked = False
|
exception_mapping_worked = False
|
||||||
|
@@ -692,80 +894,153 @@ def exception_type(model, original_exception, custom_llm_provider):

            exception_type = type(original_exception).__name__
        else:
            exception_type = ""
        logging(
            model=model,
            additional_args={
                "error_str": error_str,
                "exception_type": exception_type,
                "original_exception": original_exception,
            },
            logger_fn=user_logger_fn,
        )
        if "claude" in model:  # one of the anthropics
            if hasattr(original_exception, "status_code"):
                print_verbose(f"status_code: {original_exception.status_code}")
                if original_exception.status_code == 401:
                    exception_mapping_worked = True
                    raise AuthenticationError(
                        message=f"AnthropicException - {original_exception.message}",
                        llm_provider="anthropic",
                    )
                elif original_exception.status_code == 400:
                    exception_mapping_worked = True
                    raise InvalidRequestError(
                        message=f"AnthropicException - {original_exception.message}",
                        model=model,
                        llm_provider="anthropic",
                    )
                elif original_exception.status_code == 429:
                    exception_mapping_worked = True
                    raise RateLimitError(
                        message=f"AnthropicException - {original_exception.message}",
                        llm_provider="anthropic",
                    )
            elif (
                "Could not resolve authentication method. Expected either api_key or auth_token to be set."
                in error_str
            ):
                exception_mapping_worked = True
                raise AuthenticationError(
                    message=f"AnthropicException - {original_exception.message}",
                    llm_provider="anthropic",
                )
        elif "replicate" in model:
            if "Incorrect authentication token" in error_str:
                exception_mapping_worked = True
                raise AuthenticationError(
                    message=f"ReplicateException - {error_str}",
                    llm_provider="replicate",
                )
            elif exception_type == "ModelError":
                exception_mapping_worked = True
                raise InvalidRequestError(
                    message=f"ReplicateException - {error_str}",
                    model=model,
                    llm_provider="replicate",
                )
            elif "Request was throttled" in error_str:
                exception_mapping_worked = True
                raise RateLimitError(
                    message=f"ReplicateException - {error_str}",
                    llm_provider="replicate",
                )
            elif (
                exception_type == "ReplicateError"
            ):  ## ReplicateError implies an error on Replicate server side, not user side
                raise ServiceUnavailableError(
                    message=f"ReplicateException - {error_str}",
                    llm_provider="replicate",
                )
        elif model == "command-nightly":  # Cohere
            if (
                "invalid api token" in error_str
                or "No API key provided." in error_str
            ):
                exception_mapping_worked = True
                raise AuthenticationError(
                    message=f"CohereException - {original_exception.message}",
                    llm_provider="cohere",
                )
            elif "too many tokens" in error_str:
                exception_mapping_worked = True
                raise InvalidRequestError(
                    message=f"CohereException - {original_exception.message}",
                    model=model,
                    llm_provider="cohere",
                )
            elif (
                "CohereConnectionError" in exception_type
            ):  # cohere seems to fire these errors when we load test it (1k+ messages / min)
                exception_mapping_worked = True
                raise RateLimitError(
                    message=f"CohereException - {original_exception.message}",
                    llm_provider="cohere",
                )
        elif custom_llm_provider == "huggingface":
            if hasattr(original_exception, "status_code"):
                if original_exception.status_code == 401:
                    exception_mapping_worked = True
                    raise AuthenticationError(
                        message=f"HuggingfaceException - {original_exception.message}",
                        llm_provider="huggingface",
                    )
                elif original_exception.status_code == 400:
                    exception_mapping_worked = True
                    raise InvalidRequestError(
                        message=f"HuggingfaceException - {original_exception.message}",
                        model=model,
                        llm_provider="huggingface",
                    )
                elif original_exception.status_code == 429:
                    exception_mapping_worked = True
                    raise RateLimitError(
                        message=f"HuggingfaceException - {original_exception.message}",
                        llm_provider="huggingface",
                    )
            raise original_exception  # base case - return the original exception
        else:
            raise original_exception
    except Exception as e:
        ## LOGGING
        logging(
            logger_fn=user_logger_fn,
            additional_args={
                "exception_mapping_worked": exception_mapping_worked,
                "original_exception": original_exception,
            },
            exception=e,
        )
|
||||||
|
additional_args={
|
||||||
|
"exception_mapping_worked": exception_mapping_worked,
|
||||||
|
"original_exception": original_exception,
|
||||||
|
},
|
||||||
|
exception=e,
|
||||||
|
)
|
||||||
if exception_mapping_worked:
|
if exception_mapping_worked:
|
||||||
raise e
|
raise e
|
||||||
else: # don't let an error with mapping interrupt the user from receiving an error from the llm api calls
|
else: # don't let an error with mapping interrupt the user from receiving an error from the llm api calls
|
||||||
raise original_exception
|
raise original_exception
|
||||||
|
|
||||||
|
|
||||||
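Taken together, the mapping above is what lets a caller handle one set of litellm exception types regardless of provider. A minimal call-site sketch, assuming these exception classes are exported at the package top level as the raises above suggest; the model name and handling are illustrative:

import litellm
from litellm import AuthenticationError, InvalidRequestError, RateLimitError

try:
    response = litellm.completion(
        model="claude-instant-1",  # illustrative model choice
        messages=[{"role": "user", "content": "Hey, how's it going?"}],
    )
except AuthenticationError:
    print("bad or missing API key")  # e.g. mapped from Anthropic's 401 / Cohere's invalid token
except RateLimitError:
    print("rate limited - back off and retry")  # e.g. mapped from a provider 429
except InvalidRequestError as e:
    print(f"bad request: {e}")  # e.g. mapped from a provider 400
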
def safe_crash_reporting(model=None, exception=None, custom_llm_provider=None):
    data = {
        "model": model,
        "exception": str(exception),
        "custom_llm_provider": custom_llm_provider,
    }
    threading.Thread(target=litellm_telemetry, args=(data,)).start()

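For reference, a hypothetical call site for this helper; the surrounding try/except and the exception itself are invented for illustration. Pushing the payload from a background thread keeps the re-raise from being delayed by telemetry:

try:
    raise TimeoutError("request to provider timed out")  # stand-in for a real provider failure
except Exception as e:
    safe_crash_reporting(model="gpt-3.5-turbo", exception=e, custom_llm_provider=None)
    raise  # the user still sees the original error; telemetry runs on its own thread
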
def litellm_telemetry(data):
    # Load or generate the UUID
    uuid_file = "litellm_uuid.txt"
    try:
        # Try to open the file and load the UUID
        with open(uuid_file, "r") as file:
            uuid_value = file.read()
            if uuid_value:
                uuid_value = uuid_value.strip()
@@ -775,7 +1050,7 @@ def litellm_telemetry(data):
        # Generate a new UUID if the file doesn't exist or is empty
        new_uuid = uuid.uuid4()
        uuid_value = str(new_uuid)
        with open(uuid_file, "w") as file:
            file.write(uuid_value)
    except:
        # [Non-Blocking Error]
@@ -784,17 +1059,22 @@ def litellm_telemetry(data):
    try:
        # Prepare the data to send to litellm logging api
        payload = {
            "uuid": uuid_value,
            "data": data,
            "version": pkg_resources.get_distribution("litellm").version,
        }
        # Make the POST request to litellm logging api
        response = requests.post(
            "https://litellm.berri.ai/logging",
            headers={"Content-Type": "application/json"},
            json=payload,
        )
        response.raise_for_status()  # Raise an exception for HTTP errors
    except:
        # [Non-Blocking Error]
        return

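The UUID handling above is the usual anonymous-install-id pattern: reuse an id from a local file if present, otherwise mint one and persist it. A standalone sketch of just that part, assuming only that the file name stays litellm_uuid.txt:

import uuid


def load_or_create_install_id(path="litellm_uuid.txt"):
    # Reuse an existing id if the file is present and non-empty.
    try:
        with open(path, "r") as f:
            value = f.read().strip()
            if value:
                return value
    except FileNotFoundError:
        pass
    # Otherwise generate a fresh UUID and persist it for next time.
    value = str(uuid.uuid4())
    with open(path, "w") as f:
        f.write(value)
    return value
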
######### Secret Manager ############################
# checks if user has passed in a secret manager client
# if passed in then checks the secret there
@@ -812,6 +1092,7 @@ def get_secret(secret_name):
    else:
        return os.environ.get(secret_name)

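Only the environment-variable fallback of get_secret is visible in this hunk; the secret-manager branch is elided. A sketch of the overall shape under that assumption, with the client check and function name invented for illustration:

import os

secret_manager_client = None  # assumed module-level setting, normally supplied by the user


def get_secret_sketch(secret_name):
    # If a secret manager client were configured it would be consulted first
    # (that branch is elided in the hunk above); otherwise fall back to env vars.
    if secret_manager_client is not None:
        raise NotImplementedError("secret manager lookup not shown in this hunk")
    return os.environ.get(secret_name)
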
######## Streaming Class ############################
# wraps the completion stream to return the correct format for the model
# replicate/anthropic/cohere
@@ -831,8 +1112,8 @@ class CustomStreamWrapper:
        return self

    def handle_anthropic_chunk(self, chunk):
        str_line = chunk.decode("utf-8")  # Convert bytes to string
        if str_line.startswith("data:"):
            data_json = json.loads(str_line[5:])
            return data_json.get("completion", "")
        return ""
@@ -850,7 +1131,7 @@ class CustomStreamWrapper:

    def handle_huggingface_chunk(self, chunk):
        chunk = chunk.decode("utf-8")
        if chunk.startswith("data:"):
            data_json = json.loads(chunk[5:])
            if "token" in data_json and "text" in data_json["token"]:
                return data_json["token"]["text"]
@@ -882,11 +1163,11 @@ class CustomStreamWrapper:
        return {"choices": [{"delta": completion_obj}]}

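Both handlers above strip an SSE "data:" prefix and pull a single field out of the JSON payload. A quick self-contained check of that parsing logic; the sample bytes are invented, not captured from a real stream:

import json


def extract_completion(chunk: bytes) -> str:
    # Mirrors handle_anthropic_chunk: drop the "data:" prefix, parse the JSON,
    # and return the "completion" field if present.
    str_line = chunk.decode("utf-8")
    if str_line.startswith("data:"):
        data_json = json.loads(str_line[5:])
        return data_json.get("completion", "")
    return ""


print(extract_completion(b'data: {"completion": " Hello", "stop_reason": null}'))  # " Hello"
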
########## Reading Config File ############################
def read_config_args(config_path):
    try:
        import os

        current_path = os.getcwd()
        with open(config_path, "r") as config_file:
            config = json.load(config_file)
@@ -900,9 +1181,13 @@ def read_config_args(config_path):

########## ollama implementation ############################
import aiohttp


async def get_ollama_response_stream(
    api_base="http://localhost:11434", model="llama2", prompt="Why is the sky blue?"
):
    session = aiohttp.ClientSession()
    url = f"{api_base}/api/generate"
    data = {
        "model": model,
        "prompt": prompt,
@@ -918,7 +1203,10 @@ async def get_ollama_response_stream(api_base="http://localhost:11434", model="l
            if chunk.strip() != "":
                j = json.loads(chunk)
                if "response" in j:
                    completion_obj = {
                        "role": "assistant",
                        "content": "",
                    }
                    completion_obj["content"] = j["response"]
                    yield {"choices": [{"delta": completion_obj}]}
                    # self.responses.append(j["response"])
@@ -939,7 +1227,7 @@ async def stream_to_string(generator):

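A quick way to exercise this streaming helper, assuming an Ollama server is actually listening on the default address (otherwise the connection simply fails):

import asyncio


async def main():
    # Consume the async generator above and print tokens as they arrive.
    async for chunk in get_ollama_response_stream(
        api_base="http://localhost:11434", model="llama2", prompt="Why is the sky blue?"
    ):
        print(chunk["choices"][0]["delta"]["content"], end="", flush=True)


if __name__ == "__main__":
    asyncio.run(main())
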
########## Together AI streaming #############################
async def together_ai_completion_streaming(json_data, headers):
    session = aiohttp.ClientSession()
    url = "https://api.together.xyz/inference"
    # headers = {
    #     'Authorization': f'Bearer {together_ai_token}',
    #     'Content-Type': 'application/json'
@@ -962,10 +1250,10 @@ async def together_ai_completion_streaming(json_data, headers):
                if line:
                    try:
                        json_chunk = line.decode("utf-8")
                        json_string = json_chunk.split("data: ")[1]
                        # Convert the JSON string to a dictionary
                        data_dict = json.loads(json_string)
                        completion_response = data_dict["choices"][0]["text"]
                        completion_obj = {"role": "assistant", "content": ""}
                        completion_obj["content"] = completion_response
                        yield {"choices": [{"delta": completion_obj}]}
@@ -973,4 +1261,3 @@ async def together_ai_completion_streaming(json_data, headers):
                        pass
            finally:
                await session.close()