forked from phoenix/litellm-mirror
Merge branch 'main' into litellm_budget_per_key
commit db68774d60
20 changed files with 731 additions and 183 deletions
34 docs/my-website/docs/proxy/custom_pricing.md Normal file
@ -0,0 +1,34 @@
import Image from '@theme/IdealImage';

# Custom Pricing - Sagemaker, etc.

Use this to register custom pricing (cost per token, or cost per second) for models.

## Quick Start

Register custom pricing for Sagemaker completion and embedding models.

For cost-per-second pricing, you only need to register `input_cost_per_second`.

**Step 1: Add pricing to config.yaml**

```yaml
model_list:
  - model_name: sagemaker-completion-model
    litellm_params:
      model: sagemaker/berri-benchmarking-Llama-2-70b-chat-hf-4
      input_cost_per_second: 0.000420
  - model_name: sagemaker-embedding-model
    litellm_params:
      model: sagemaker/berri-benchmarking-gpt-j-6b-fp16
      input_cost_per_second: 0.000420
```

**Step 2: Start proxy**

```bash
litellm /path/to/config.yaml
```

**Step 3: View Spend Logs**

<Image img={require('../../img/spend_logs_table.png')} />
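
As a rough worked example: with `input_cost_per_second: 0.000420`, a call whose response takes 5 seconds is billed 5 × 0.000420 ≈ $0.0021. The same pricing can also be passed per-request through the Python SDK instead of the proxy config; the snippet below is a minimal sketch, assuming Sagemaker credentials are already configured in your environment:

```python
from litellm import completion, completion_cost

# Passing input_cost_per_second registers custom pricing for this model,
# so completion_cost() bills by response time instead of token counts.
response = completion(
    model="sagemaker/berri-benchmarking-Llama-2-70b-chat-hf-4",
    messages=[{"role": "user", "content": "Hey, how's it going?"}],
    max_tokens=80,
    input_cost_per_second=0.000420,
)

cost = completion_cost(completion_response=response)
print(f"cost for this call: ${cost:.6f}")
```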
@ -440,6 +440,97 @@ general_settings:
$ litellm --config /path/to/config.yaml
```

## Custom /key/generate

Use this if you need to run custom logic before generating a Proxy API Key (for example, validating a `team_id`).

### 1. Write a custom `custom_generate_key_fn`

The input to your `custom_generate_key_fn` is a single parameter: `data` [(Type: GenerateKeyRequest)](https://github.com/BerriAI/litellm/blob/main/litellm/proxy/_types.py#L125)

The output of your `custom_generate_key_fn` should be a dictionary with the following structure:
```python
{
    "decision": False,
    "message": "This violates LiteLLM Proxy Rules. No team id provided.",
}
```

- decision (Type: bool): A boolean value indicating whether key generation is allowed (True) or not (False).

- message (Type: str, Optional): An optional message providing additional information about the decision. This field should be included when the decision is False.

```python
async def custom_generate_key_fn(data: GenerateKeyRequest) -> dict:
    """
    Asynchronous function for generating a key based on the input data.

    Args:
        data (GenerateKeyRequest): The input data for key generation.

    Returns:
        dict: A dictionary containing the decision and an optional message.
        {
            "decision": False,
            "message": "This violates LiteLLM Proxy Rules. No team id provided.",
        }
    """

    # decide if a key should be generated or not
    print("using custom auth function!")
    data_json = data.json()  # type: ignore

    # Unpacking variables
    team_id = data_json.get("team_id")
    duration = data_json.get("duration")
    models = data_json.get("models")
    aliases = data_json.get("aliases")
    config = data_json.get("config")
    spend = data_json.get("spend")
    user_id = data_json.get("user_id")
    max_parallel_requests = data_json.get("max_parallel_requests")
    metadata = data_json.get("metadata")
    tpm_limit = data_json.get("tpm_limit")
    rpm_limit = data_json.get("rpm_limit")

    if team_id is not None and team_id == "litellm-core-infra@gmail.com":
        # only team_id="litellm-core-infra@gmail.com" can make keys
        return {
            "decision": True,
        }
    else:
        print("Failed custom auth")
        return {
            "decision": False,
            "message": "This violates LiteLLM Proxy Rules. No team id provided.",
        }
```

### 2. Pass the filepath (relative to the config.yaml)

Pass the filepath of your custom function in the config.yaml.

e.g. if `./config.yaml` and `./custom_auth.py` are in the same directory, it looks like this:
```yaml
model_list:
  - model_name: "openai-model"
    litellm_params:
      model: "gpt-3.5-turbo"

litellm_settings:
  drop_params: True
  set_verbose: True

general_settings:
  custom_key_generate: custom_auth.custom_generate_key_fn
```
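
Once the proxy is restarted with this config, the rule can be exercised from any HTTP client. Below is a minimal sketch, assuming the proxy is running locally on port 8000 and `master_key: sk-1234` is set in `general_settings` (adjust the base URL and key to your deployment):

```python
import requests  # assumes the `requests` package is installed

PROXY_BASE = "http://0.0.0.0:8000"  # wherever `litellm --config` is listening
MASTER_KEY = "sk-1234"              # must match general_settings.master_key
headers = {"Authorization": f"Bearer {MASTER_KEY}"}

# No team_id -> custom_generate_key_fn returns decision=False -> proxy responds 403
resp = requests.post(f"{PROXY_BASE}/key/generate", headers=headers, json={})
print(resp.status_code, resp.json())  # expect 403, with the custom "message" as the detail

# Allowed team_id -> decision=True -> a new virtual key is returned
resp = requests.post(
    f"{PROXY_BASE}/key/generate",
    headers=headers,
    json={"team_id": "litellm-core-infra@gmail.com"},
)
print(resp.status_code, resp.json())  # expect 200 with a generated "key"
```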

## [BETA] Dynamo DB
BIN
docs/my-website/img/spend_logs_table.png
Normal file
Binary file not shown.
Size: 189 KiB

@ -139,6 +139,7 @@ const sidebars = {
        "items": [
          "proxy/call_hooks",
          "proxy/rules",
          "proxy/custom_pricing"
        ]
      },
      "proxy/deploy",
|
|
@ -12,15 +12,6 @@ formatter = logging.Formatter("\033[92m%(name)s - %(levelname)s\033[0m: %(messag
|
|||
|
||||
handler.setFormatter(formatter)
|
||||
|
||||
|
||||
def print_verbose(print_statement):
|
||||
try:
|
||||
if set_verbose:
|
||||
print(print_statement) # noqa
|
||||
except:
|
||||
pass
|
||||
|
||||
|
||||
verbose_proxy_logger = logging.getLogger("LiteLLM Proxy")
|
||||
verbose_router_logger = logging.getLogger("LiteLLM Router")
|
||||
verbose_logger = logging.getLogger("LiteLLM")
|
||||
|
@ -29,3 +20,18 @@ verbose_logger = logging.getLogger("LiteLLM")
|
|||
verbose_router_logger.addHandler(handler)
|
||||
verbose_proxy_logger.addHandler(handler)
|
||||
verbose_logger.addHandler(handler)
|
||||
|
||||
|
||||
def print_verbose(print_statement):
|
||||
try:
|
||||
if set_verbose:
|
||||
print(print_statement) # noqa
|
||||
verbose_logger.setLevel(level=logging.DEBUG) # set package log to debug
|
||||
verbose_router_logger.setLevel(
|
||||
level=logging.DEBUG
|
||||
) # set router logs to debug
|
||||
verbose_proxy_logger.setLevel(
|
||||
level=logging.DEBUG
|
||||
) # set proxy logs to debug
|
||||
except:
|
||||
pass
|
||||
|
|
|
@ -629,12 +629,23 @@ class AzureChatCompletion(BaseLLM):
|
|||
client_session = litellm.aclient_session or httpx.AsyncClient(
|
||||
transport=AsyncCustomHTTPTransport(),
|
||||
)
|
||||
openai_aclient = AsyncAzureOpenAI(
|
||||
azure_client = AsyncAzureOpenAI(
|
||||
http_client=client_session, **azure_client_params
|
||||
)
|
||||
else:
|
||||
openai_aclient = client
|
||||
response = await openai_aclient.images.generate(**data, timeout=timeout)
|
||||
azure_client = client
|
||||
## LOGGING
|
||||
logging_obj.pre_call(
|
||||
input=data["prompt"],
|
||||
api_key=azure_client.api_key,
|
||||
additional_args={
|
||||
"headers": {"api_key": azure_client.api_key},
|
||||
"api_base": azure_client._base_url._uri_reference,
|
||||
"acompletion": True,
|
||||
"complete_input_dict": data,
|
||||
},
|
||||
)
|
||||
response = await azure_client.images.generate(**data, timeout=timeout)
|
||||
stringified_response = response.model_dump()
|
||||
## LOGGING
|
||||
logging_obj.post_call(
|
||||
|
@ -719,7 +730,7 @@ class AzureChatCompletion(BaseLLM):
|
|||
input=prompt,
|
||||
api_key=azure_client.api_key,
|
||||
additional_args={
|
||||
"headers": {"Authorization": f"Bearer {azure_client.api_key}"},
|
||||
"headers": {"api_key": azure_client.api_key},
|
||||
"api_base": azure_client._base_url._uri_reference,
|
||||
"acompletion": False,
|
||||
"complete_input_dict": data,
|
||||
|
|
|
@ -43,7 +43,7 @@ class AsyncCustomHTTPTransport(httpx.AsyncHTTPTransport):
|
|||
request=request,
|
||||
)
|
||||
|
||||
time.sleep(int(response.headers.get("retry-after")) or 10)
|
||||
await asyncio.sleep(int(response.headers.get("retry-after") or 10))
|
||||
response = await super().handle_async_request(request)
|
||||
await response.aread()
|
||||
|
||||
|
@ -95,7 +95,6 @@ class CustomHTTPTransport(httpx.HTTPTransport):
|
|||
request.method = "GET"
|
||||
response = super().handle_request(request)
|
||||
response.read()
|
||||
|
||||
timeout_secs: int = 120
|
||||
start_time = time.time()
|
||||
while response.json()["status"] not in ["succeeded", "failed"]:
|
||||
|
@ -112,11 +111,9 @@ class CustomHTTPTransport(httpx.HTTPTransport):
|
|||
content=json.dumps(timeout).encode("utf-8"),
|
||||
request=request,
|
||||
)
|
||||
|
||||
time.sleep(int(response.headers.get("retry-after")) or 10)
|
||||
time.sleep(int(response.headers.get("retry-after", None) or 10))
|
||||
response = super().handle_request(request)
|
||||
response.read()
|
||||
|
||||
if response.json()["status"] == "failed":
|
||||
error_data = response.json()
|
||||
return httpx.Response(
|
||||
|
|
|
@ -348,6 +348,13 @@ def mock_completion(
|
|||
prompt_tokens=10, completion_tokens=20, total_tokens=30
|
||||
)
|
||||
|
||||
try:
|
||||
_, custom_llm_provider, _, _ = litellm.utils.get_llm_provider(model=model)
|
||||
model_response._hidden_params["custom_llm_provider"] = custom_llm_provider
|
||||
except:
|
||||
# dont let setting a hidden param block a mock_respose
|
||||
pass
|
||||
|
||||
return model_response
|
||||
|
||||
except:
|
||||
|
@ -450,6 +457,8 @@ def completion(
|
|||
### CUSTOM MODEL COST ###
|
||||
input_cost_per_token = kwargs.get("input_cost_per_token", None)
|
||||
output_cost_per_token = kwargs.get("output_cost_per_token", None)
|
||||
input_cost_per_second = kwargs.get("input_cost_per_second", None)
|
||||
output_cost_per_second = kwargs.get("output_cost_per_second", None)
|
||||
### CUSTOM PROMPT TEMPLATE ###
|
||||
initial_prompt_value = kwargs.get("initial_prompt_value", None)
|
||||
roles = kwargs.get("roles", None)
|
||||
|
@ -527,6 +536,8 @@ def completion(
|
|||
"tpm",
|
||||
"input_cost_per_token",
|
||||
"output_cost_per_token",
|
||||
"input_cost_per_second",
|
||||
"output_cost_per_second",
|
||||
"hf_model_name",
|
||||
"model_info",
|
||||
"proxy_server_request",
|
||||
|
@ -589,6 +600,19 @@ def completion(
|
|||
}
|
||||
}
|
||||
)
|
||||
if (
|
||||
input_cost_per_second is not None
|
||||
): # time based pricing just needs cost in place
|
||||
output_cost_per_second = output_cost_per_second or 0.0
|
||||
litellm.register_model(
|
||||
{
|
||||
model: {
|
||||
"input_cost_per_second": input_cost_per_second,
|
||||
"output_cost_per_second": output_cost_per_second,
|
||||
"litellm_provider": custom_llm_provider,
|
||||
}
|
||||
}
|
||||
)
|
||||
### BUILD CUSTOM PROMPT TEMPLATE -- IF GIVEN ###
|
||||
custom_prompt_dict = {} # type: ignore
|
||||
if (
|
||||
|
@ -2240,6 +2264,11 @@ def embedding(
|
|||
encoding_format = kwargs.get("encoding_format", None)
|
||||
proxy_server_request = kwargs.get("proxy_server_request", None)
|
||||
aembedding = kwargs.get("aembedding", None)
|
||||
### CUSTOM MODEL COST ###
|
||||
input_cost_per_token = kwargs.get("input_cost_per_token", None)
|
||||
output_cost_per_token = kwargs.get("output_cost_per_token", None)
|
||||
input_cost_per_second = kwargs.get("input_cost_per_second", None)
|
||||
output_cost_per_second = kwargs.get("output_cost_per_second", None)
|
||||
openai_params = [
|
||||
"user",
|
||||
"request_timeout",
|
||||
|
@ -2288,6 +2317,8 @@ def embedding(
|
|||
"tpm",
|
||||
"input_cost_per_token",
|
||||
"output_cost_per_token",
|
||||
"input_cost_per_second",
|
||||
"output_cost_per_second",
|
||||
"hf_model_name",
|
||||
"proxy_server_request",
|
||||
"model_info",
|
||||
|
@ -2313,6 +2344,28 @@ def embedding(
|
|||
custom_llm_provider=custom_llm_provider,
|
||||
**non_default_params,
|
||||
)
|
||||
### REGISTER CUSTOM MODEL PRICING -- IF GIVEN ###
|
||||
if input_cost_per_token is not None and output_cost_per_token is not None:
|
||||
litellm.register_model(
|
||||
{
|
||||
model: {
|
||||
"input_cost_per_token": input_cost_per_token,
|
||||
"output_cost_per_token": output_cost_per_token,
|
||||
"litellm_provider": custom_llm_provider,
|
||||
}
|
||||
}
|
||||
)
|
||||
if input_cost_per_second is not None: # time based pricing just needs cost in place
|
||||
output_cost_per_second = output_cost_per_second or 0.0
|
||||
litellm.register_model(
|
||||
{
|
||||
model: {
|
||||
"input_cost_per_second": input_cost_per_second,
|
||||
"output_cost_per_second": output_cost_per_second,
|
||||
"litellm_provider": custom_llm_provider,
|
||||
}
|
||||
}
|
||||
)
|
||||
try:
|
||||
response = None
|
||||
logging = litellm_logging_obj
|
||||
|
@ -3281,7 +3334,9 @@ def stream_chunk_builder_text_completion(chunks: list, messages: Optional[List]
|
|||
return response
|
||||
|
||||
|
||||
def stream_chunk_builder(chunks: list, messages: Optional[list] = None):
|
||||
def stream_chunk_builder(
|
||||
chunks: list, messages: Optional[list] = None, start_time=None, end_time=None
|
||||
):
|
||||
model_response = litellm.ModelResponse()
|
||||
# set hidden params from chunk to model_response
|
||||
if model_response is not None and hasattr(model_response, "_hidden_params"):
|
||||
|
@ -3456,5 +3511,8 @@ def stream_chunk_builder(chunks: list, messages: Optional[list] = None):
|
|||
response["usage"]["prompt_tokens"] + response["usage"]["completion_tokens"]
|
||||
)
|
||||
return convert_to_model_response_object(
|
||||
response_object=response, model_response_object=model_response
|
||||
response_object=response,
|
||||
model_response_object=model_response,
|
||||
start_time=start_time,
|
||||
end_time=end_time,
|
||||
)
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
from litellm.proxy._types import UserAPIKeyAuth
|
||||
from litellm.proxy._types import UserAPIKeyAuth, GenerateKeyRequest
|
||||
from fastapi import Request
|
||||
from dotenv import load_dotenv
|
||||
import os
|
||||
|
@ -14,3 +14,40 @@ async def user_api_key_auth(request: Request, api_key: str) -> UserAPIKeyAuth:
|
|||
raise Exception
|
||||
except:
|
||||
raise Exception
|
||||
|
||||
|
||||
async def generate_key_fn(data: GenerateKeyRequest):
|
||||
"""
|
||||
Asynchronously decides if a key should be generated or not based on the provided data.
|
||||
|
||||
Args:
|
||||
data (GenerateKeyRequest): The data to be used for decision making.
|
||||
|
||||
Returns:
|
||||
bool: True if a key should be generated, False otherwise.
|
||||
"""
|
||||
# decide if a key should be generated or not
|
||||
data_json = data.json() # type: ignore
|
||||
|
||||
# Unpacking variables
|
||||
team_id = data_json.get("team_id")
|
||||
duration = data_json.get("duration")
|
||||
models = data_json.get("models")
|
||||
aliases = data_json.get("aliases")
|
||||
config = data_json.get("config")
|
||||
spend = data_json.get("spend")
|
||||
user_id = data_json.get("user_id")
|
||||
max_parallel_requests = data_json.get("max_parallel_requests")
|
||||
metadata = data_json.get("metadata")
|
||||
tpm_limit = data_json.get("tpm_limit")
|
||||
rpm_limit = data_json.get("rpm_limit")
|
||||
|
||||
if team_id is not None and len(team_id) > 0:
|
||||
return {
|
||||
"decision": True,
|
||||
}
|
||||
else:
|
||||
return {
|
||||
"decision": True,
|
||||
"message": "This violates LiteLLM Proxy Rules. No team id provided.",
|
||||
}
|
||||
|
|
|
@ -62,8 +62,9 @@ litellm_settings:
|
|||
# setting callback class
|
||||
# callbacks: custom_callbacks.proxy_handler_instance # sets litellm.callbacks = [proxy_handler_instance]
|
||||
|
||||
# general_settings:
|
||||
# master_key: sk-1234
|
||||
general_settings:
|
||||
master_key: sk-1234
|
||||
custom_key_generate: custom_auth.generate_key_fn
|
||||
# database_type: "dynamo_db"
|
||||
# database_args: { # 👈 all args - https://github.com/BerriAI/litellm/blob/befbcbb7ac8f59835ce47415c128decf37aac328/litellm/proxy/_types.py#L190
|
||||
# "billing_mode": "PAY_PER_REQUEST",
|
||||
|
|
|
@ -187,6 +187,7 @@ prisma_client: Optional[PrismaClient] = None
|
|||
custom_db_client: Optional[DBClient] = None
|
||||
user_api_key_cache = DualCache()
|
||||
user_custom_auth = None
|
||||
user_custom_key_generate = None
|
||||
use_background_health_checks = None
|
||||
use_queue = False
|
||||
health_check_interval = None
|
||||
|
@ -584,7 +585,7 @@ async def track_cost_callback(
|
|||
"user_api_key_user_id", None
|
||||
)
|
||||
|
||||
verbose_proxy_logger.debug(
|
||||
verbose_proxy_logger.info(
|
||||
f"streaming response_cost {response_cost}, for user_id {user_id}"
|
||||
)
|
||||
if user_api_key and (
|
||||
|
@ -609,7 +610,7 @@ async def track_cost_callback(
|
|||
user_id = user_id or kwargs["litellm_params"]["metadata"].get(
|
||||
"user_api_key_user_id", None
|
||||
)
|
||||
verbose_proxy_logger.debug(
|
||||
verbose_proxy_logger.info(
|
||||
f"response_cost {response_cost}, for user_id {user_id}"
|
||||
)
|
||||
if user_api_key and (
|
||||
|
@ -896,7 +897,7 @@ class ProxyConfig:
|
|||
"""
|
||||
Load config values into proxy global state
|
||||
"""
|
||||
global master_key, user_config_file_path, otel_logging, user_custom_auth, user_custom_auth_path, use_background_health_checks, health_check_interval, use_queue, custom_db_client
|
||||
global master_key, user_config_file_path, otel_logging, user_custom_auth, user_custom_auth_path, user_custom_key_generate, use_background_health_checks, health_check_interval, use_queue, custom_db_client
|
||||
|
||||
# Load existing config
|
||||
config = await self.get_config(config_file_path=config_file_path)
|
||||
|
@ -1074,6 +1075,12 @@ class ProxyConfig:
|
|||
user_custom_auth = get_instance_fn(
|
||||
value=custom_auth, config_file_path=config_file_path
|
||||
)
|
||||
|
||||
custom_key_generate = general_settings.get("custom_key_generate", None)
|
||||
if custom_key_generate is not None:
|
||||
user_custom_key_generate = get_instance_fn(
|
||||
value=custom_key_generate, config_file_path=config_file_path
|
||||
)
|
||||
## dynamodb
|
||||
database_type = general_settings.get("database_type", None)
|
||||
if database_type is not None and (
|
||||
|
@ -2189,7 +2196,16 @@ async def generate_key_fn(
|
|||
- expires: (datetime) Datetime object for when key expires.
|
||||
- user_id: (str) Unique user id - used for tracking spend across multiple keys for same user id.
|
||||
"""
|
||||
global user_custom_key_generate
|
||||
verbose_proxy_logger.debug("entered /key/generate")
|
||||
|
||||
if user_custom_key_generate is not None:
|
||||
result = await user_custom_key_generate(data)
|
||||
decision = result.get("decision", True)
|
||||
message = result.get("message", "Authentication Failed - Custom Auth Rule")
|
||||
if not decision:
|
||||
raise HTTPException(status_code=status.HTTP_403_FORBIDDEN, detail=message)
|
||||
|
||||
data_json = data.json() # type: ignore
|
||||
|
||||
# if we get max_budget passed to /key/generate, then use it as key_max_budget. Since generate_key_helper_fn is used to make new users
|
||||
|
@ -2978,7 +2994,7 @@ async def get_routes():
|
|||
|
||||
@router.on_event("shutdown")
|
||||
async def shutdown_event():
|
||||
global prisma_client, master_key, user_custom_auth
|
||||
global prisma_client, master_key, user_custom_auth, user_custom_key_generate
|
||||
if prisma_client:
|
||||
verbose_proxy_logger.debug("Disconnecting from Prisma")
|
||||
await prisma_client.disconnect()
|
||||
|
@ -2988,7 +3004,7 @@ async def shutdown_event():
|
|||
|
||||
|
||||
def cleanup_router_config_variables():
|
||||
global master_key, user_config_file_path, otel_logging, user_custom_auth, user_custom_auth_path, use_background_health_checks, health_check_interval, prisma_client, custom_db_client
|
||||
global master_key, user_config_file_path, otel_logging, user_custom_auth, user_custom_auth_path, user_custom_key_generate, use_background_health_checks, health_check_interval, prisma_client, custom_db_client
|
||||
|
||||
# Set all variables to None
|
||||
master_key = None
|
||||
|
@ -2996,6 +3012,7 @@ def cleanup_router_config_variables():
|
|||
otel_logging = None
|
||||
user_custom_auth = None
|
||||
user_custom_auth_path = None
|
||||
user_custom_key_generate = None
|
||||
use_background_health_checks = None
|
||||
health_check_interval = None
|
||||
prisma_client = None
|
||||
|
|
|
@ -449,6 +449,7 @@ class PrismaClient:
|
|||
"update": {}, # don't do anything if it already exists
|
||||
},
|
||||
)
|
||||
verbose_proxy_logger.info(f"Data Inserted into Keys Table")
|
||||
return new_verification_token
|
||||
elif table_name == "user":
|
||||
db_data = self.jsonify_object(data=data)
|
||||
|
@ -459,6 +460,7 @@ class PrismaClient:
|
|||
"update": {}, # don't do anything if it already exists
|
||||
},
|
||||
)
|
||||
verbose_proxy_logger.info(f"Data Inserted into User Table")
|
||||
return new_user_row
|
||||
elif table_name == "config":
|
||||
"""
|
||||
|
@ -483,6 +485,7 @@ class PrismaClient:
|
|||
|
||||
tasks.append(updated_table_row)
|
||||
await asyncio.gather(*tasks)
|
||||
verbose_proxy_logger.info(f"Data Inserted into Config Table")
|
||||
elif table_name == "spend":
|
||||
db_data = self.jsonify_object(data=data)
|
||||
new_spend_row = await self.db.litellm_spendlogs.upsert(
|
||||
|
@ -492,6 +495,7 @@ class PrismaClient:
|
|||
"update": {}, # don't do anything if it already exists
|
||||
},
|
||||
)
|
||||
verbose_proxy_logger.info(f"Data Inserted into Spend Table")
|
||||
return new_spend_row
|
||||
|
||||
except Exception as e:
|
||||
|
|
|
@ -997,6 +997,9 @@ class Router:
|
|||
"""
|
||||
try:
|
||||
kwargs["model"] = mg
|
||||
kwargs.setdefault("metadata", {}).update(
|
||||
{"model_group": mg}
|
||||
) # update model_group used, if fallbacks are done
|
||||
response = await self.async_function_with_retries(
|
||||
*args, **kwargs
|
||||
)
|
||||
|
@ -1025,8 +1028,10 @@ class Router:
|
|||
f"Falling back to model_group = {mg}"
|
||||
)
|
||||
kwargs["model"] = mg
|
||||
kwargs["metadata"]["model_group"] = mg
|
||||
response = await self.async_function_with_retries(
|
||||
kwargs.setdefault("metadata", {}).update(
|
||||
{"model_group": mg}
|
||||
) # update model_group used, if fallbacks are done
|
||||
response = await self.async_function_with_fallbacks(
|
||||
*args, **kwargs
|
||||
)
|
||||
return response
|
||||
|
@ -1191,6 +1196,9 @@ class Router:
|
|||
## LOGGING
|
||||
kwargs = self.log_retry(kwargs=kwargs, e=original_exception)
|
||||
kwargs["model"] = mg
|
||||
kwargs.setdefault("metadata", {}).update(
|
||||
{"model_group": mg}
|
||||
) # update model_group used, if fallbacks are done
|
||||
response = self.function_with_fallbacks(*args, **kwargs)
|
||||
return response
|
||||
except Exception as e:
|
||||
|
@ -1214,6 +1222,9 @@ class Router:
|
|||
## LOGGING
|
||||
kwargs = self.log_retry(kwargs=kwargs, e=original_exception)
|
||||
kwargs["model"] = mg
|
||||
kwargs.setdefault("metadata", {}).update(
|
||||
{"model_group": mg}
|
||||
) # update model_group used, if fallbacks are done
|
||||
response = self.function_with_fallbacks(*args, **kwargs)
|
||||
return response
|
||||
except Exception as e:
|
||||
|
|
|
@ -1372,16 +1372,21 @@ def test_customprompt_together_ai():
|
|||
|
||||
def test_completion_sagemaker():
|
||||
try:
|
||||
print("testing sagemaker")
|
||||
litellm.set_verbose = True
|
||||
print("testing sagemaker")
|
||||
response = completion(
|
||||
model="sagemaker/berri-benchmarking-Llama-2-70b-chat-hf-4",
|
||||
messages=messages,
|
||||
temperature=0.2,
|
||||
max_tokens=80,
|
||||
input_cost_per_second=0.000420,
|
||||
)
|
||||
# Add any assertions here to check the response
|
||||
print(response)
|
||||
cost = completion_cost(completion_response=response)
|
||||
assert (
|
||||
cost > 0.0 and cost < 1.0
|
||||
) # should never be > $1 for a single completion call
|
||||
except Exception as e:
|
||||
pytest.fail(f"Error occurred: {e}")
|
||||
|
||||
|
|
|
@ -1,14 +1,17 @@
|
|||
### What this tests ####
|
||||
import sys, os, time, inspect, asyncio, traceback
|
||||
import pytest
|
||||
sys.path.insert(0, os.path.abspath('../..'))
|
||||
|
||||
sys.path.insert(0, os.path.abspath("../.."))
|
||||
|
||||
from litellm import completion, embedding
|
||||
import litellm
|
||||
from litellm.integrations.custom_logger import CustomLogger
|
||||
|
||||
|
||||
class MyCustomHandler(CustomLogger):
|
||||
complete_streaming_response_in_callback = ""
|
||||
|
||||
def __init__(self):
|
||||
self.success: bool = False # type: ignore
|
||||
self.failure: bool = False # type: ignore
|
||||
|
@ -45,7 +48,6 @@ class MyCustomHandler(CustomLogger):
|
|||
if kwargs.get("stream") == True:
|
||||
self.sync_stream_collected_response = response_obj
|
||||
|
||||
|
||||
def log_failure_event(self, kwargs, response_obj, start_time, end_time):
|
||||
print(f"On Failure")
|
||||
self.failure = True
|
||||
|
@ -72,14 +74,20 @@ class MyCustomHandler(CustomLogger):
|
|||
|
||||
self.async_completion_kwargs_fail = kwargs
|
||||
|
||||
|
||||
class TmpFunction:
|
||||
complete_streaming_response_in_callback = ""
|
||||
async_success: bool = False
|
||||
|
||||
async def async_test_logging_fn(self, kwargs, completion_obj, start_time, end_time):
|
||||
print(f"ON ASYNC LOGGING")
|
||||
self.async_success = True
|
||||
print(f'kwargs.get("complete_streaming_response"): {kwargs.get("complete_streaming_response")}')
|
||||
self.complete_streaming_response_in_callback = kwargs.get("complete_streaming_response")
|
||||
print(
|
||||
f'kwargs.get("complete_streaming_response"): {kwargs.get("complete_streaming_response")}'
|
||||
)
|
||||
self.complete_streaming_response_in_callback = kwargs.get(
|
||||
"complete_streaming_response"
|
||||
)
|
||||
|
||||
|
||||
def test_async_chat_openai_stream():
|
||||
|
@ -88,29 +96,39 @@ def test_async_chat_openai_stream():
|
|||
# litellm.set_verbose = True
|
||||
litellm.success_callback = [tmp_function.async_test_logging_fn]
|
||||
complete_streaming_response = ""
|
||||
|
||||
async def call_gpt():
|
||||
nonlocal complete_streaming_response
|
||||
response = await litellm.acompletion(model="gpt-3.5-turbo",
|
||||
messages=[{
|
||||
"role": "user",
|
||||
"content": "Hi 👋 - i'm openai"
|
||||
}],
|
||||
stream=True)
|
||||
response = await litellm.acompletion(
|
||||
model="gpt-3.5-turbo",
|
||||
messages=[{"role": "user", "content": "Hi 👋 - i'm openai"}],
|
||||
stream=True,
|
||||
)
|
||||
async for chunk in response:
|
||||
complete_streaming_response += chunk["choices"][0]["delta"]["content"] or ""
|
||||
complete_streaming_response += (
|
||||
chunk["choices"][0]["delta"]["content"] or ""
|
||||
)
|
||||
print(complete_streaming_response)
|
||||
|
||||
asyncio.run(call_gpt())
|
||||
complete_streaming_response = complete_streaming_response.strip("'")
|
||||
response1 = tmp_function.complete_streaming_response_in_callback["choices"][0]["message"]["content"]
|
||||
response1 = tmp_function.complete_streaming_response_in_callback["choices"][0][
|
||||
"message"
|
||||
]["content"]
|
||||
response2 = complete_streaming_response
|
||||
# assert [ord(c) for c in response1] == [ord(c) for c in response2]
|
||||
print(f"response1: {response1}")
|
||||
print(f"response2: {response2}")
|
||||
assert response1 == response2
|
||||
assert tmp_function.async_success == True
|
||||
except Exception as e:
|
||||
print(e)
|
||||
pytest.fail(f"An error occurred - {str(e)}")
|
||||
|
||||
|
||||
# test_async_chat_openai_stream()
|
||||
|
||||
|
||||
def test_completion_azure_stream_moderation_failure():
|
||||
try:
|
||||
customHandler = MyCustomHandler()
|
||||
|
@ -152,27 +170,32 @@ def test_async_custom_handler_stream():
|
|||
},
|
||||
]
|
||||
complete_streaming_response = ""
|
||||
|
||||
async def test_1():
|
||||
nonlocal complete_streaming_response
|
||||
response = await litellm.acompletion(
|
||||
model="azure/chatgpt-v-2",
|
||||
messages=messages,
|
||||
stream=True
|
||||
model="azure/chatgpt-v-2", messages=messages, stream=True
|
||||
)
|
||||
async for chunk in response:
|
||||
complete_streaming_response += chunk["choices"][0]["delta"]["content"] or ""
|
||||
complete_streaming_response += (
|
||||
chunk["choices"][0]["delta"]["content"] or ""
|
||||
)
|
||||
print(complete_streaming_response)
|
||||
|
||||
asyncio.run(test_1())
|
||||
|
||||
response_in_success_handler = customHandler2.stream_collected_response
|
||||
response_in_success_handler = response_in_success_handler["choices"][0]["message"]["content"]
|
||||
response_in_success_handler = response_in_success_handler["choices"][0][
|
||||
"message"
|
||||
]["content"]
|
||||
print("\n\n")
|
||||
print("response_in_success_handler: ", response_in_success_handler)
|
||||
print("complete_streaming_response: ", complete_streaming_response)
|
||||
assert response_in_success_handler == complete_streaming_response
|
||||
except Exception as e:
|
||||
pytest.fail(f"Error occurred: {e}")
|
||||
|
||||
|
||||
# test_async_custom_handler_stream()
|
||||
|
||||
|
||||
|
@ -194,9 +217,7 @@ def test_azure_completion_stream():
|
|||
complete_streaming_response = ""
|
||||
|
||||
response = litellm.completion(
|
||||
model="azure/chatgpt-v-2",
|
||||
messages=messages,
|
||||
stream=True
|
||||
model="azure/chatgpt-v-2", messages=messages, stream=True
|
||||
)
|
||||
for chunk in response:
|
||||
complete_streaming_response += chunk["choices"][0]["delta"]["content"] or ""
|
||||
|
@ -204,7 +225,9 @@ def test_azure_completion_stream():
|
|||
|
||||
time.sleep(0.5) # wait 1/2 second before checking callbacks
|
||||
response_in_success_handler = customHandler2.sync_stream_collected_response
|
||||
response_in_success_handler = response_in_success_handler["choices"][0]["message"]["content"]
|
||||
response_in_success_handler = response_in_success_handler["choices"][0][
|
||||
"message"
|
||||
]["content"]
|
||||
print("\n\n")
|
||||
print("response_in_success_handler: ", response_in_success_handler)
|
||||
print("complete_streaming_response: ", complete_streaming_response)
|
||||
|
@ -212,6 +235,7 @@ def test_azure_completion_stream():
|
|||
except Exception as e:
|
||||
pytest.fail(f"Error occurred: {e}")
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_async_custom_handler_completion():
|
||||
try:
|
||||
|
@ -222,14 +246,21 @@ async def test_async_custom_handler_completion():
|
|||
litellm.callbacks = [customHandler_success]
|
||||
response = await litellm.acompletion(
|
||||
model="gpt-3.5-turbo",
|
||||
messages=[{
|
||||
messages=[
|
||||
{
|
||||
"role": "user",
|
||||
"content": "hello from litellm test",
|
||||
}]
|
||||
}
|
||||
],
|
||||
)
|
||||
await asyncio.sleep(1)
|
||||
assert customHandler_success.async_success == True, "async success is not set to True even after success"
|
||||
assert customHandler_success.async_completion_kwargs.get("model") == "gpt-3.5-turbo"
|
||||
assert (
|
||||
customHandler_success.async_success == True
|
||||
), "async success is not set to True even after success"
|
||||
assert (
|
||||
customHandler_success.async_completion_kwargs.get("model")
|
||||
== "gpt-3.5-turbo"
|
||||
)
|
||||
# failure
|
||||
litellm.callbacks = [customHandler_failure]
|
||||
messages = [
|
||||
|
@ -249,15 +280,28 @@ async def test_async_custom_handler_completion():
|
|||
)
|
||||
except:
|
||||
pass
|
||||
assert customHandler_failure.async_failure == True, "async failure is not set to True even after failure"
|
||||
assert customHandler_failure.async_completion_kwargs_fail.get("model") == "gpt-3.5-turbo"
|
||||
assert len(str(customHandler_failure.async_completion_kwargs_fail.get("exception"))) > 10 # expect APIError("OpenAIException - Error code: 401 - {'error': {'message': 'Incorrect API key provided: test. You can find your API key at https://platform.openai.com/account/api-keys.', 'type': 'invalid_request_error', 'param': None, 'code': 'invalid_api_key'}}"), 'traceback_exception': 'Traceback (most recent call last):\n File "/Users/ishaanjaffer/Github/litellm/litellm/llms/openai.py", line 269, in acompletion\n response = await openai_aclient.chat.completions.create(**data)\n File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/openai/resources/chat/completions.py", line 119
|
||||
assert (
|
||||
customHandler_failure.async_failure == True
|
||||
), "async failure is not set to True even after failure"
|
||||
assert (
|
||||
customHandler_failure.async_completion_kwargs_fail.get("model")
|
||||
== "gpt-3.5-turbo"
|
||||
)
|
||||
assert (
|
||||
len(
|
||||
str(customHandler_failure.async_completion_kwargs_fail.get("exception"))
|
||||
)
|
||||
> 10
|
||||
) # expect APIError("OpenAIException - Error code: 401 - {'error': {'message': 'Incorrect API key provided: test. You can find your API key at https://platform.openai.com/account/api-keys.', 'type': 'invalid_request_error', 'param': None, 'code': 'invalid_api_key'}}"), 'traceback_exception': 'Traceback (most recent call last):\n File "/Users/ishaanjaffer/Github/litellm/litellm/llms/openai.py", line 269, in acompletion\n response = await openai_aclient.chat.completions.create(**data)\n File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/openai/resources/chat/completions.py", line 119
|
||||
litellm.callbacks = []
|
||||
print("Passed setting async failure")
|
||||
except Exception as e:
|
||||
pytest.fail(f"An exception occurred - {str(e)}")
|
||||
|
||||
|
||||
# asyncio.run(test_async_custom_handler_completion())
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_async_custom_handler_embedding():
|
||||
try:
|
||||
|
@ -270,9 +314,17 @@ async def test_async_custom_handler_embedding():
|
|||
input=["hello world"],
|
||||
)
|
||||
await asyncio.sleep(1)
|
||||
assert customHandler_embedding.async_success_embedding == True, "async_success_embedding is not set to True even after success"
|
||||
assert customHandler_embedding.async_embedding_kwargs.get("model") == "text-embedding-ada-002"
|
||||
assert customHandler_embedding.async_embedding_response["usage"]["prompt_tokens"] ==2
|
||||
assert (
|
||||
customHandler_embedding.async_success_embedding == True
|
||||
), "async_success_embedding is not set to True even after success"
|
||||
assert (
|
||||
customHandler_embedding.async_embedding_kwargs.get("model")
|
||||
== "text-embedding-ada-002"
|
||||
)
|
||||
assert (
|
||||
customHandler_embedding.async_embedding_response["usage"]["prompt_tokens"]
|
||||
== 2
|
||||
)
|
||||
print("Passed setting async success: Embedding")
|
||||
# failure
|
||||
assert customHandler_embedding.async_failure_embedding == False
|
||||
|
@ -284,13 +336,28 @@ async def test_async_custom_handler_embedding():
|
|||
)
|
||||
except:
|
||||
pass
|
||||
assert customHandler_embedding.async_failure_embedding == True, "async failure embedding is not set to True even after failure"
|
||||
assert customHandler_embedding.async_embedding_kwargs_fail.get("model") == "text-embedding-ada-002"
|
||||
assert len(str(customHandler_embedding.async_embedding_kwargs_fail.get("exception"))) > 10 # exppect APIError("OpenAIException - Error code: 401 - {'error': {'message': 'Incorrect API key provided: test. You can find your API key at https://platform.openai.com/account/api-keys.', 'type': 'invalid_request_error', 'param': None, 'code': 'invalid_api_key'}}"), 'traceback_exception': 'Traceback (most recent call last):\n File "/Users/ishaanjaffer/Github/litellm/litellm/llms/openai.py", line 269, in acompletion\n response = await openai_aclient.chat.completions.create(**data)\n File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/openai/resources/chat/completions.py", line 119
|
||||
assert (
|
||||
customHandler_embedding.async_failure_embedding == True
|
||||
), "async failure embedding is not set to True even after failure"
|
||||
assert (
|
||||
customHandler_embedding.async_embedding_kwargs_fail.get("model")
|
||||
== "text-embedding-ada-002"
|
||||
)
|
||||
assert (
|
||||
len(
|
||||
str(
|
||||
customHandler_embedding.async_embedding_kwargs_fail.get("exception")
|
||||
)
|
||||
)
|
||||
> 10
|
||||
) # exppect APIError("OpenAIException - Error code: 401 - {'error': {'message': 'Incorrect API key provided: test. You can find your API key at https://platform.openai.com/account/api-keys.', 'type': 'invalid_request_error', 'param': None, 'code': 'invalid_api_key'}}"), 'traceback_exception': 'Traceback (most recent call last):\n File "/Users/ishaanjaffer/Github/litellm/litellm/llms/openai.py", line 269, in acompletion\n response = await openai_aclient.chat.completions.create(**data)\n File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/openai/resources/chat/completions.py", line 119
|
||||
except Exception as e:
|
||||
pytest.fail(f"An exception occurred - {str(e)}")
|
||||
|
||||
|
||||
# asyncio.run(test_async_custom_handler_embedding())
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_async_custom_handler_embedding_optional_param():
|
||||
"""
|
||||
|
@ -300,16 +367,19 @@ async def test_async_custom_handler_embedding_optional_param():
|
|||
customHandler_optional_params = MyCustomHandler()
|
||||
litellm.callbacks = [customHandler_optional_params]
|
||||
response = await litellm.aembedding(
|
||||
model="azure/azure-embedding-model",
|
||||
input = ["hello world"],
|
||||
user = "John"
|
||||
model="azure/azure-embedding-model", input=["hello world"], user="John"
|
||||
)
|
||||
await asyncio.sleep(1) # success callback is async
|
||||
assert customHandler_optional_params.user == "John"
|
||||
assert customHandler_optional_params.user == customHandler_optional_params.data_sent_to_api["user"]
|
||||
assert (
|
||||
customHandler_optional_params.user
|
||||
== customHandler_optional_params.data_sent_to_api["user"]
|
||||
)
|
||||
|
||||
|
||||
# asyncio.run(test_async_custom_handler_embedding_optional_param())
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_async_custom_handler_embedding_optional_param_bedrock():
|
||||
"""
|
||||
|
@ -323,9 +393,7 @@ async def test_async_custom_handler_embedding_optional_param_bedrock():
|
|||
customHandler_optional_params = MyCustomHandler()
|
||||
litellm.callbacks = [customHandler_optional_params]
|
||||
response = await litellm.aembedding(
|
||||
model="bedrock/amazon.titan-embed-text-v1",
|
||||
input = ["hello world"],
|
||||
user = "John"
|
||||
model="bedrock/amazon.titan-embed-text-v1", input=["hello world"], user="John"
|
||||
)
|
||||
await asyncio.sleep(1) # success callback is async
|
||||
assert customHandler_optional_params.user == "John"
|
||||
|
@ -334,16 +402,36 @@ async def test_async_custom_handler_embedding_optional_param_bedrock():
|
|||
|
||||
def test_redis_cache_completion_stream():
|
||||
from litellm import Cache
|
||||
|
||||
# Important Test - This tests if we can add to streaming cache, when custom callbacks are set
|
||||
import random
|
||||
|
||||
try:
|
||||
print("\nrunning test_redis_cache_completion_stream")
|
||||
litellm.set_verbose = True
|
||||
random_number = random.randint(1, 100000) # add a random number to ensure it's always adding / reading from cache
|
||||
messages = [{"role": "user", "content": f"write a one sentence poem about: {random_number}"}]
|
||||
litellm.cache = Cache(type="redis", host=os.environ['REDIS_HOST'], port=os.environ['REDIS_PORT'], password=os.environ['REDIS_PASSWORD'])
|
||||
random_number = random.randint(
|
||||
1, 100000
|
||||
) # add a random number to ensure it's always adding / reading from cache
|
||||
messages = [
|
||||
{
|
||||
"role": "user",
|
||||
"content": f"write a one sentence poem about: {random_number}",
|
||||
}
|
||||
]
|
||||
litellm.cache = Cache(
|
||||
type="redis",
|
||||
host=os.environ["REDIS_HOST"],
|
||||
port=os.environ["REDIS_PORT"],
|
||||
password=os.environ["REDIS_PASSWORD"],
|
||||
)
|
||||
print("test for caching, streaming + completion")
|
||||
response1 = completion(model="gpt-3.5-turbo", messages=messages, max_tokens=40, temperature=0.2, stream=True)
|
||||
response1 = completion(
|
||||
model="gpt-3.5-turbo",
|
||||
messages=messages,
|
||||
max_tokens=40,
|
||||
temperature=0.2,
|
||||
stream=True,
|
||||
)
|
||||
response_1_content = ""
|
||||
for chunk in response1:
|
||||
print(chunk)
|
||||
|
@ -351,14 +439,22 @@ def test_redis_cache_completion_stream():
|
|||
print(response_1_content)
|
||||
|
||||
time.sleep(0.1) # sleep for 0.1 seconds allow set cache to occur
|
||||
response2 = completion(model="gpt-3.5-turbo", messages=messages, max_tokens=40, temperature=0.2, stream=True)
|
||||
response2 = completion(
|
||||
model="gpt-3.5-turbo",
|
||||
messages=messages,
|
||||
max_tokens=40,
|
||||
temperature=0.2,
|
||||
stream=True,
|
||||
)
|
||||
response_2_content = ""
|
||||
for chunk in response2:
|
||||
print(chunk)
|
||||
response_2_content += chunk.choices[0].delta.content or ""
|
||||
print("\nresponse 1", response_1_content)
|
||||
print("\nresponse 2", response_2_content)
|
||||
assert response_1_content == response_2_content, f"Response 1 != Response 2. Same params, Response 1{response_1_content} != Response 2{response_2_content}"
|
||||
assert (
|
||||
response_1_content == response_2_content
|
||||
), f"Response 1 != Response 2. Same params, Response 1{response_1_content} != Response 2{response_2_content}"
|
||||
litellm.success_callback = []
|
||||
litellm._async_success_callback = []
|
||||
litellm.cache = None
|
||||
|
@ -366,4 +462,6 @@ def test_redis_cache_completion_stream():
|
|||
print(e)
|
||||
litellm.success_callback = []
|
||||
raise e
|
||||
|
||||
|
||||
# test_redis_cache_completion_stream()
|
|
@ -10,7 +10,7 @@ sys.path.insert(
|
|||
0, os.path.abspath("../..")
|
||||
) # Adds the parent directory to the system path
|
||||
import litellm
|
||||
from litellm import embedding, completion
|
||||
from litellm import embedding, completion, completion_cost
|
||||
|
||||
litellm.set_verbose = False
|
||||
|
||||
|
@ -341,8 +341,30 @@ def test_sagemaker_embeddings():
|
|||
response = litellm.embedding(
|
||||
model="sagemaker/berri-benchmarking-gpt-j-6b-fp16",
|
||||
input=["good morning from litellm", "this is another item"],
|
||||
input_cost_per_second=0.000420,
|
||||
)
|
||||
print(f"response: {response}")
|
||||
cost = completion_cost(completion_response=response)
|
||||
assert (
|
||||
cost > 0.0 and cost < 1.0
|
||||
) # should never be > $1 for a single embedding call
|
||||
except Exception as e:
|
||||
pytest.fail(f"Error occurred: {e}")
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_sagemaker_aembeddings():
|
||||
try:
|
||||
response = await litellm.aembedding(
|
||||
model="sagemaker/berri-benchmarking-gpt-j-6b-fp16",
|
||||
input=["good morning from litellm", "this is another item"],
|
||||
input_cost_per_second=0.000420,
|
||||
)
|
||||
print(f"response: {response}")
|
||||
cost = completion_cost(completion_response=response)
|
||||
assert (
|
||||
cost > 0.0 and cost < 1.0
|
||||
) # should never be > $1 for a single embedding call
|
||||
except Exception as e:
|
||||
pytest.fail(f"Error occurred: {e}")
|
||||
|
||||
|
|
|
@ -35,6 +35,7 @@ import pytest, logging, asyncio
|
|||
import litellm, asyncio
|
||||
from litellm.proxy.proxy_server import (
|
||||
new_user,
|
||||
generate_key_fn,
|
||||
user_api_key_auth,
|
||||
user_update,
|
||||
delete_key_fn,
|
||||
|
@ -53,6 +54,7 @@ from litellm.proxy._types import (
|
|||
DynamoDBArgs,
|
||||
DeleteKeyRequest,
|
||||
UpdateKeyRequest,
|
||||
GenerateKeyRequest,
|
||||
)
|
||||
from litellm.proxy.utils import DBClient
|
||||
from starlette.datastructures import URL
|
||||
|
@ -598,6 +600,85 @@ def test_generate_and_update_key(prisma_client):
|
|||
print(e.detail)
|
||||
pytest.fail(f"An exception occurred - {str(e)}")
|
||||
|
||||
def test_key_generate_with_custom_auth(prisma_client):
|
||||
# custom - generate key function
|
||||
async def custom_generate_key_fn(data: GenerateKeyRequest) -> dict:
|
||||
"""
|
||||
Asynchronous function for generating a key based on the input data.
|
||||
|
||||
Args:
|
||||
data (GenerateKeyRequest): The input data for key generation.
|
||||
|
||||
Returns:
|
||||
dict: A dictionary containing the decision and an optional message.
|
||||
{
|
||||
"decision": False,
|
||||
"message": "This violates LiteLLM Proxy Rules. No team id provided.",
|
||||
}
|
||||
"""
|
||||
|
||||
# decide if a key should be generated or not
|
||||
print("using custom auth function!")
|
||||
data_json = data.json() # type: ignore
|
||||
|
||||
# Unpacking variables
|
||||
team_id = data_json.get("team_id")
|
||||
duration = data_json.get("duration")
|
||||
models = data_json.get("models")
|
||||
aliases = data_json.get("aliases")
|
||||
config = data_json.get("config")
|
||||
spend = data_json.get("spend")
|
||||
user_id = data_json.get("user_id")
|
||||
max_parallel_requests = data_json.get("max_parallel_requests")
|
||||
metadata = data_json.get("metadata")
|
||||
tpm_limit = data_json.get("tpm_limit")
|
||||
rpm_limit = data_json.get("rpm_limit")
|
||||
|
||||
if team_id is not None and team_id == "litellm-core-infra@gmail.com":
|
||||
# only team_id="litellm-core-infra@gmail.com" can make keys
|
||||
return {
|
||||
"decision": True,
|
||||
}
|
||||
else:
|
||||
print("Failed custom auth")
|
||||
return {
|
||||
"decision": False,
|
||||
"message": "This violates LiteLLM Proxy Rules. No team id provided.",
|
||||
}
|
||||
|
||||
setattr(litellm.proxy.proxy_server, "prisma_client", prisma_client)
|
||||
setattr(litellm.proxy.proxy_server, "master_key", "sk-1234")
|
||||
setattr(
|
||||
litellm.proxy.proxy_server, "user_custom_key_generate", custom_generate_key_fn
|
||||
)
|
||||
try:
|
||||
request = GenerateKeyRequest()
|
||||
key = await generate_key_fn(request)
|
||||
pytest.fail(f"Expected an exception. Got {key}")
|
||||
except Exception as e:
|
||||
# this should fail
|
||||
print("Got Exception", e)
|
||||
print(e.detail)
|
||||
print("First request failed!. This is expected")
|
||||
assert (
|
||||
"This violates LiteLLM Proxy Rules. No team id provided."
|
||||
in e.detail
|
||||
)
|
||||
|
||||
request_2 = GenerateKeyRequest(
|
||||
team_id="litellm-core-infra@gmail.com",
|
||||
)
|
||||
|
||||
key = await generate_key_fn(request_2)
|
||||
print(key)
|
||||
generated_key = key.key
|
||||
|
||||
asyncio.run(test())
|
||||
except Exception as e:
|
||||
print("Got Exception", e)
|
||||
print(e.detail)
|
||||
pytest.fail(f"An exception occurred - {str(e)}")
|
||||
|
||||
|
||||
def test_call_with_key_over_budget(prisma_client):
|
||||
# 12. Make a call with a key over budget, expect to fail
|
||||
|
|
|
@ -716,7 +716,7 @@ def test_usage_based_routing_fallbacks():
|
|||
# Constants for TPM and RPM allocation
|
||||
AZURE_FAST_TPM = 3
|
||||
AZURE_BASIC_TPM = 4
|
||||
OPENAI_TPM = 2000
|
||||
OPENAI_TPM = 400
|
||||
ANTHROPIC_TPM = 100000
|
||||
|
||||
def get_azure_params(deployment_name: str):
|
||||
|
@ -775,6 +775,7 @@ def test_usage_based_routing_fallbacks():
|
|||
model_list=model_list,
|
||||
fallbacks=fallbacks_list,
|
||||
set_verbose=True,
|
||||
debug_level="DEBUG",
|
||||
routing_strategy="usage-based-routing",
|
||||
redis_host=os.environ["REDIS_HOST"],
|
||||
redis_port=os.environ["REDIS_PORT"],
|
||||
|
@ -783,17 +784,32 @@ def test_usage_based_routing_fallbacks():
|
|||
messages = [
|
||||
{"content": "Tell me a joke.", "role": "user"},
|
||||
]
|
||||
|
||||
response = router.completion(
|
||||
model="azure/gpt-4-fast", messages=messages, timeout=5
|
||||
model="azure/gpt-4-fast",
|
||||
messages=messages,
|
||||
timeout=5,
|
||||
mock_response="very nice to meet you",
|
||||
)
|
||||
print("response: ", response)
|
||||
print("response._hidden_params: ", response._hidden_params)
|
||||
|
||||
# in this test, we expect azure/gpt-4 fast to fail, then azure-gpt-4 basic to fail and then openai-gpt-4 to pass
|
||||
# the token count of this message is > AZURE_FAST_TPM, > AZURE_BASIC_TPM
|
||||
assert response._hidden_params["custom_llm_provider"] == "openai"
|
||||
|
||||
# now make 100 mock requests to OpenAI - expect it to fallback to anthropic-claude-instant-1.2
|
||||
for i in range(20):
|
||||
response = router.completion(
|
||||
model="azure/gpt-4-fast",
|
||||
messages=messages,
|
||||
timeout=5,
|
||||
mock_response="very nice to meet you",
|
||||
)
|
||||
print("response: ", response)
|
||||
print("response._hidden_params: ", response._hidden_params)
|
||||
if i == 19:
|
||||
# by the 19th call we should have hit TPM LIMIT for OpenAI, it should fallback to anthropic-claude-instant-1.2
|
||||
assert response._hidden_params["custom_llm_provider"] == "anthropic"
|
||||
|
||||
except Exception as e:
|
||||
pytest.fail(f"An exception occurred {e}")
|
||||
|
||||
|
|
112 litellm/utils.py
|
@ -765,6 +765,7 @@ class Logging:
|
|||
self.litellm_call_id = litellm_call_id
|
||||
self.function_id = function_id
|
||||
self.streaming_chunks = [] # for generating complete stream response
|
||||
self.sync_streaming_chunks = [] # for generating complete stream response
|
||||
self.model_call_details = {}
|
||||
|
||||
def update_environment_variables(
|
||||
|
@ -828,7 +829,7 @@ class Logging:
|
|||
[f"-H '{k}: {v}'" for k, v in masked_headers.items()]
|
||||
)
|
||||
|
||||
print_verbose(f"PRE-API-CALL ADDITIONAL ARGS: {additional_args}")
|
||||
verbose_logger.debug(f"PRE-API-CALL ADDITIONAL ARGS: {additional_args}")
|
||||
|
||||
curl_command = "\n\nPOST Request Sent from LiteLLM:\n"
|
||||
curl_command += "curl -X POST \\\n"
|
||||
|
@ -994,13 +995,10 @@ class Logging:
|
|||
self.model_call_details["log_event_type"] = "post_api_call"
|
||||
|
||||
# User Logging -> if you pass in a custom logging function
|
||||
print_verbose(
|
||||
verbose_logger.debug(
|
||||
f"RAW RESPONSE:\n{self.model_call_details.get('original_response', self.model_call_details)}\n\n"
|
||||
)
|
||||
print_verbose(
|
||||
f"Logging Details Post-API Call: logger_fn - {self.logger_fn} | callable(logger_fn) - {callable(self.logger_fn)}"
|
||||
)
|
||||
print_verbose(
|
||||
verbose_logger.debug(
|
||||
f"Logging Details Post-API Call: LiteLLM Params: {self.model_call_details}"
|
||||
)
|
||||
if self.logger_fn and callable(self.logger_fn):
|
||||
|
@ -1094,20 +1092,20 @@ class Logging:
|
|||
if (
|
||||
result.choices[0].finish_reason is not None
|
||||
): # if it's the last chunk
|
||||
self.streaming_chunks.append(result)
|
||||
# print_verbose(f"final set of received chunks: {self.streaming_chunks}")
|
||||
self.sync_streaming_chunks.append(result)
|
||||
# print_verbose(f"final set of received chunks: {self.sync_streaming_chunks}")
|
||||
try:
|
||||
complete_streaming_response = litellm.stream_chunk_builder(
|
||||
self.streaming_chunks,
|
||||
self.sync_streaming_chunks,
|
||||
messages=self.model_call_details.get("messages", None),
|
||||
)
|
||||
except:
|
||||
complete_streaming_response = None
|
||||
else:
|
||||
self.streaming_chunks.append(result)
|
||||
self.sync_streaming_chunks.append(result)
|
||||
|
||||
if complete_streaming_response:
|
||||
verbose_logger.info(
|
||||
verbose_logger.debug(
|
||||
f"Logging Details LiteLLM-Success Call streaming complete"
|
||||
)
|
||||
self.model_call_details[
|
||||
|
@ -1307,7 +1305,9 @@ class Logging:
|
|||
)
|
||||
== False
|
||||
): # custom logger class
|
||||
print_verbose(f"success callbacks: Running Custom Logger Class")
|
||||
verbose_logger.info(
|
||||
f"success callbacks: Running SYNC Custom Logger Class"
|
||||
)
|
||||
if self.stream and complete_streaming_response is None:
|
||||
callback.log_stream_event(
|
||||
kwargs=self.model_call_details,
|
||||
|
@ -1329,7 +1329,17 @@ class Logging:
|
|||
start_time=start_time,
|
||||
end_time=end_time,
|
||||
)
|
||||
if callable(callback): # custom logger functions
|
||||
elif (
|
||||
callable(callback) == True
|
||||
and self.model_call_details.get("litellm_params", {}).get(
|
||||
"acompletion", False
|
||||
)
|
||||
== False
|
||||
and self.model_call_details.get("litellm_params", {}).get(
|
||||
"aembedding", False
|
||||
)
|
||||
== False
|
||||
): # custom logger functions
|
||||
print_verbose(
|
||||
f"success callbacks: Running Custom Callback Function"
|
||||
)
|
||||
|
@ -1364,6 +1374,9 @@ class Logging:
|
|||
Implementing async callbacks, to handle asyncio event loop issues when custom integrations need to use async functions.
|
||||
"""
|
||||
print_verbose(f"Async success callbacks: {litellm._async_success_callback}")
|
||||
start_time, end_time, result = self._success_handler_helper_fn(
|
||||
start_time=start_time, end_time=end_time, result=result, cache_hit=cache_hit
|
||||
)
|
||||
## BUILD COMPLETE STREAMED RESPONSE
|
||||
complete_streaming_response = None
|
||||
if self.stream:
|
||||
|
@ -1374,6 +1387,8 @@ class Logging:
|
|||
complete_streaming_response = litellm.stream_chunk_builder(
|
||||
self.streaming_chunks,
|
||||
messages=self.model_call_details.get("messages", None),
|
||||
start_time=start_time,
|
||||
end_time=end_time,
|
||||
)
|
||||
except Exception as e:
|
||||
print_verbose(
|
||||
|
@ -1387,9 +1402,7 @@ class Logging:
|
|||
self.model_call_details[
|
||||
"complete_streaming_response"
|
||||
] = complete_streaming_response
|
||||
start_time, end_time, result = self._success_handler_helper_fn(
|
||||
start_time=start_time, end_time=end_time, result=result, cache_hit=cache_hit
|
||||
)
|
||||
|
||||
for callback in litellm._async_success_callback:
|
||||
try:
|
||||
if callback == "cache" and litellm.cache is not None:
|
||||
|
@ -1436,7 +1449,6 @@ class Logging:
|
|||
end_time=end_time,
|
||||
)
|
||||
if callable(callback): # custom logger functions
|
||||
print_verbose(f"Async success callbacks: async_log_event")
|
||||
await customLogger.async_log_event(
|
||||
kwargs=self.model_call_details,
|
||||
response_obj=result,
|
||||
|
@ -2134,7 +2146,7 @@ def client(original_function):
|
|||
litellm.cache.add_cache(result, *args, **kwargs)
|
||||
|
||||
# LOG SUCCESS - handle streaming success logging in the _next_ object, remove `handle_success` once it's deprecated
|
||||
print_verbose(f"Wrapper: Completed Call, calling success_handler")
|
||||
verbose_logger.info(f"Wrapper: Completed Call, calling success_handler")
|
||||
threading.Thread(
|
||||
target=logging_obj.success_handler, args=(result, start_time, end_time)
|
||||
).start()
|
||||
|
@ -2373,7 +2385,9 @@ def client(original_function):
|
|||
result._hidden_params["model_id"] = kwargs.get("model_info", {}).get(
|
||||
"id", None
|
||||
)
|
||||
if isinstance(result, ModelResponse):
|
||||
if isinstance(result, ModelResponse) or isinstance(
|
||||
result, EmbeddingResponse
|
||||
):
|
||||
result._response_ms = (
|
||||
end_time - start_time
|
||||
).total_seconds() * 1000 # return response latency in ms like openai
|
||||
|
@ -2806,7 +2820,11 @@ def token_counter(
|
|||
|
||||
|
||||
def cost_per_token(
|
||||
model="", prompt_tokens=0, completion_tokens=0, custom_llm_provider=None
|
||||
model="",
|
||||
prompt_tokens=0,
|
||||
completion_tokens=0,
|
||||
response_time_ms=None,
|
||||
custom_llm_provider=None,
|
||||
):
|
||||
"""
|
||||
Calculates the cost per token for a given model, prompt tokens, and completion tokens.
|
||||
|
@ -2828,15 +2846,36 @@ def cost_per_token(
|
|||
else:
|
||||
model_with_provider = model
|
||||
# see this https://learn.microsoft.com/en-us/azure/ai-services/openai/concepts/models
|
||||
print_verbose(f"Looking up model={model} in model_cost_map")
|
||||
verbose_logger.debug(f"Looking up model={model} in model_cost_map")
|
||||
|
||||
if model in model_cost_ref:
|
||||
verbose_logger.debug(f"Success: model={model} in model_cost_map")
|
||||
if (
|
||||
model_cost_ref[model].get("input_cost_per_token", None) is not None
|
||||
and model_cost_ref[model].get("output_cost_per_token", None) is not None
|
||||
):
|
||||
## COST PER TOKEN ##
|
||||
prompt_tokens_cost_usd_dollar = (
|
||||
model_cost_ref[model]["input_cost_per_token"] * prompt_tokens
|
||||
)
|
||||
completion_tokens_cost_usd_dollar = (
|
||||
model_cost_ref[model]["output_cost_per_token"] * completion_tokens
|
||||
)
|
||||
elif (
|
||||
model_cost_ref[model].get("input_cost_per_second", None) is not None
|
||||
and response_time_ms is not None
|
||||
):
|
||||
verbose_logger.debug(
|
||||
f"For model={model} - input_cost_per_second: {model_cost_ref[model].get('input_cost_per_second')}; response time: {response_time_ms}"
|
||||
)
|
||||
## COST PER SECOND ##
|
||||
prompt_tokens_cost_usd_dollar = (
|
||||
model_cost_ref[model]["input_cost_per_second"] * response_time_ms / 1000
|
||||
)
|
||||
completion_tokens_cost_usd_dollar = 0.0
|
||||
verbose_logger.debug(
|
||||
f"Returned custom cost for model={model} - prompt_tokens_cost_usd_dollar: {prompt_tokens_cost_usd_dollar}, completion_tokens_cost_usd_dollar: {completion_tokens_cost_usd_dollar}"
|
||||
)
|
||||
return prompt_tokens_cost_usd_dollar, completion_tokens_cost_usd_dollar
|
||||
elif model_with_provider in model_cost_ref:
|
||||
print_verbose(f"Looking up model={model_with_provider} in model_cost_map")
|
||||
|
@ -2938,6 +2977,10 @@ def completion_cost(
|
|||
completion_tokens = completion_response.get("usage", {}).get(
|
||||
"completion_tokens", 0
|
||||
)
|
||||
total_time = completion_response.get("_response_ms", 0)
|
||||
verbose_logger.debug(
|
||||
f"completion_response response ms: {completion_response.get('_response_ms')} "
|
||||
)
|
||||
model = (
|
||||
model or completion_response["model"]
|
||||
) # check if user passed an override for model, if it's none check completion_response['model']
|
||||
|
@ -2975,6 +3018,7 @@ def completion_cost(
|
|||
prompt_tokens=prompt_tokens,
|
||||
completion_tokens=completion_tokens,
|
||||
custom_llm_provider=custom_llm_provider,
|
||||
response_time_ms=total_time,
|
||||
)
|
||||
return prompt_tokens_cost_usd_dollar + completion_tokens_cost_usd_dollar
|
||||
except Exception as e:
|
||||
|
@ -3005,9 +3049,8 @@ def register_model(model_cost: Union[str, dict]):
|
|||
|
||||
for key, value in loaded_model_cost.items():
|
||||
## override / add new keys to the existing model cost dictionary
|
||||
if key in litellm.model_cost:
|
||||
for k, v in loaded_model_cost[key].items():
|
||||
litellm.model_cost[key][k] = v
|
||||
litellm.model_cost.setdefault(key, {}).update(value)
|
||||
verbose_logger.debug(f"{key} added to model cost map")
|
||||
# add new model names to provider lists
|
||||
if value.get("litellm_provider") == "openai":
|
||||
if key not in litellm.open_ai_chat_completion_models:
|
||||
|
@ -3300,11 +3343,13 @@ def get_optional_params(
|
|||
)
|
||||
|
||||
def _check_valid_arg(supported_params):
|
||||
print_verbose(
|
||||
verbose_logger.debug(
|
||||
f"\nLiteLLM completion() model= {model}; provider = {custom_llm_provider}"
|
||||
)
|
||||
print_verbose(f"\nLiteLLM: Params passed to completion() {passed_params}")
|
||||
print_verbose(
|
||||
verbose_logger.debug(
|
||||
f"\nLiteLLM: Params passed to completion() {passed_params}"
|
||||
)
|
||||
verbose_logger.debug(
|
||||
f"\nLiteLLM: Non-Default params passed to completion() {non_default_params}"
|
||||
)
|
||||
unsupported_params = {}
|
||||
|
@ -5150,6 +5195,8 @@ def convert_to_model_response_object(
|
|||
"completion", "embedding", "image_generation"
|
||||
] = "completion",
|
||||
stream=False,
|
||||
start_time=None,
|
||||
end_time=None,
|
||||
):
|
||||
try:
|
||||
if response_type == "completion" and (
|
||||
|
@ -5203,6 +5250,12 @@ def convert_to_model_response_object(
|
|||
|
||||
if "model" in response_object:
|
||||
model_response_object.model = response_object["model"]
|
||||
|
||||
if start_time is not None and end_time is not None:
|
||||
model_response_object._response_ms = ( # type: ignore
|
||||
end_time - start_time
|
||||
).total_seconds() * 1000
|
||||
|
||||
return model_response_object
|
||||
elif response_type == "embedding" and (
|
||||
model_response_object is None
|
||||
|
@ -5227,6 +5280,11 @@ def convert_to_model_response_object(
|
|||
model_response_object.usage.prompt_tokens = response_object["usage"].get("prompt_tokens", 0) # type: ignore
|
||||
model_response_object.usage.total_tokens = response_object["usage"].get("total_tokens", 0) # type: ignore
|
||||
|
||||
if start_time is not None and end_time is not None:
|
||||
model_response_object._response_ms = ( # type: ignore
|
||||
end_time - start_time
|
||||
).total_seconds() * 1000 # return response latency in ms like openai
|
||||
|
||||
return model_response_object
|
||||
elif response_type == "image_generation" and (
|
||||
model_response_object is None
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
[tool.poetry]
|
||||
name = "litellm"
|
||||
version = "1.18.8"
|
||||
version = "1.18.9"
|
||||
description = "Library to easily interface with LLM API providers"
|
||||
authors = ["BerriAI"]
|
||||
license = "MIT"
|
||||
|
@ -61,7 +61,7 @@ requires = ["poetry-core", "wheel"]
|
|||
build-backend = "poetry.core.masonry.api"
|
||||
|
||||
[tool.commitizen]
|
||||
version = "1.18.8"
|
||||
version = "1.18.9"
|
||||
version_files = [
|
||||
"pyproject.toml:^version"
|
||||
]
|
||||
|
|