Merge remote-tracking branch 'src/main'

Sébastien Campion 2024-01-27 19:24:35 +01:00
commit 4dd18b553a
29 changed files with 550 additions and 170 deletions


@@ -34,13 +34,6 @@ jobs:
       with:
         push: true
         tags: litellm/litellm:${{ github.event.inputs.tag || 'latest' }}
-    -
-      name: Build and push litellm-ui image
-      uses: docker/build-push-action@v5
-      with:
-        push: true
-        file: ui/Dockerfile
-        tags: litellm/litellm-ui:${{ github.event.inputs.tag || 'latest' }}
     -
       name: Build and push litellm-database image
       uses: docker/build-push-action@v5


@@ -13,8 +13,8 @@ response = embedding(model='text-embedding-ada-002', input=["good morning from l
 - `model`: *string* - ID of the model to use. `model='text-embedding-ada-002'`
-- `input`: *array* - Input text to embed, encoded as a string or array of tokens. To embed multiple inputs in a single request, pass an array of strings or array of token arrays. The input must not exceed the max input tokens for the model (8192 tokens for text-embedding-ada-002), cannot be an empty string, and any array must be 2048 dimensions or less.
+- `input`: *string or array* - Input text to embed, encoded as a string or array of tokens. To embed multiple inputs in a single request, pass an array of strings or array of token arrays. The input must not exceed the max input tokens for the model (8192 tokens for text-embedding-ada-002), cannot be an empty string, and any array must be 2048 dimensions or less.
-```
+```python
 input=["good morning from litellm"]
 ```
@@ -22,7 +22,11 @@ input=["good morning from litellm"]
 - `user`: *string (optional)* A unique identifier representing your end-user,
-- `timeout`: *integer* - The maximum time, in seconds, to wait for the API to respond. Defaults to 600 seconds (10 minutes).
+- `dimensions`: *integer (Optional)* The number of dimensions the resulting output embeddings should have. Only supported in OpenAI/Azure text-embedding-3 and later models.
+- `encoding_format`: *string (Optional)* The format to return the embeddings in. Can be either `"float"` or `"base64"`. Defaults to `encoding_format="float"`
+- `timeout`: *integer (Optional)* - The maximum time, in seconds, to wait for the API to respond. Defaults to 600 seconds (10 minutes).
 - `api_base`: *string (optional)* - The api endpoint you want to call the model with
@@ -66,7 +70,12 @@ input=["good morning from litellm"]
 from litellm import embedding
 import os
 os.environ['OPENAI_API_KEY'] = ""
-response = embedding('text-embedding-ada-002', input=["good morning from litellm"])
+response = embedding(
+    model="text-embedding-3-small",
+    input=["good morning from litellm", "this is another item"],
+    metadata={"anything": "good day"},
+    dimensions=5 # Only supported in text-embedding-3 and later models.
+)
 ```
 | Model Name | Function Call | Required OS Variables |
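For reference, a minimal call exercising both new parameters documented above (`dimensions` and `encoding_format`). This is illustrative only; the model name and API key are placeholders:

```python
import os
from litellm import embedding

os.environ["OPENAI_API_KEY"] = "sk-..."  # placeholder key

response = embedding(
    model="text-embedding-3-small",   # dimensions is only honored by text-embedding-3+ models
    input=["good morning from litellm"],
    dimensions=256,
    encoding_format="float",          # or "base64"
)
# each returned vector should have 256 entries (data entries are dicts in this litellm version)
print(len(response.data[0]["embedding"]))
```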


@@ -1,6 +1,13 @@
 # Slack Alerting
-Get alerts for failed db read/writes, hanging api calls, failed api calls.
+Get alerts for:
+- hanging LLM api calls
+- failed LLM api calls
+- slow LLM api calls
+- budget Tracking per key/user:
+    - When a User/Key crosses their Budget
+    - When a User/Key is 15% away from crossing their Budget
+- failed db read/writes
 ## Quick Start
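These alerts ultimately land in Slack as plain incoming-webhook messages. A generic sketch of that delivery step is below; this is not the proxy's internal code, and the `SLACK_WEBHOOK_URL` variable name is an assumption:

```python
import os
import requests

def send_slack_alert(message: str) -> None:
    # POST a JSON payload to a Slack incoming webhook (env var name assumed)
    webhook_url = os.environ["SLACK_WEBHOOK_URL"]
    resp = requests.post(webhook_url, json={"text": message})
    resp.raise_for_status()

send_slack_alert("Budget Alert: key `sk-...abcd` is 15% away from its budget")
```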


@@ -605,6 +605,49 @@ response = router.completion(model="gpt-3.5-turbo", messages=messages)
 print(f"response: {response}")
 ```
+
+## Custom Callbacks - Track API Key, API Endpoint, Model Used
+
+If you need to track the api_key, api endpoint, model, custom_llm_provider used for each completion call, you can setup a [custom callback](https://docs.litellm.ai/docs/observability/custom_callback)
+
+### Usage
+
+```python
+import litellm
+from litellm.integrations.custom_logger import CustomLogger
+
+class MyCustomHandler(CustomLogger):
+    def log_success_event(self, kwargs, response_obj, start_time, end_time):
+        print(f"On Success")
+        print("kwargs=", kwargs)
+        litellm_params = kwargs.get("litellm_params")
+        api_key = litellm_params.get("api_key")
+        api_base = litellm_params.get("api_base")
+        custom_llm_provider = litellm_params.get("custom_llm_provider")
+        response_cost = kwargs.get("response_cost")
+
+        # print the values
+        print("api_key=", api_key)
+        print("api_base=", api_base)
+        print("custom_llm_provider=", custom_llm_provider)
+        print("response_cost=", response_cost)
+
+    def log_failure_event(self, kwargs, response_obj, start_time, end_time):
+        print(f"On Failure")
+        print("kwargs=")
+
+customHandler = MyCustomHandler()
+
+litellm.callbacks = [customHandler]
+
+# Init Router
+router = Router(model_list=model_list, routing_strategy="simple-shuffle")
+
+# router completion call
+response = router.completion(
+    model="gpt-3.5-turbo",
+    messages=[{"role": "user", "content": "Hi who are you"}]
+)
+```
 ## Deploy Router


@@ -1,3 +1,12 @@
+# +-----------------------------------------------+
+# |                                               |
+# |          NOT PROXY BUDGET MANAGER             |
+# |   proxy budget manager is in proxy_server.py  |
+# |                                               |
+# +-----------------------------------------------+
+#
+#  Thank you users! We ❤️ you! - Krrish & Ishaan
+
 import os, json, time
 import litellm
 from litellm.utils import ModelResponse
@@ -16,7 +25,7 @@ class BudgetManager:
         self.client_type = client_type
         self.project_name = project_name
         self.api_base = api_base or "https://api.litellm.ai"
-        self.headers = headers or {'Content-Type': 'application/json'}
+        self.headers = headers or {"Content-Type": "application/json"}
         ## load the data or init the initial dictionaries
         self.load_data()
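For context, a minimal client-side sketch of using this class, following the upstream budget-manager docs; treat the method names as assumptions if your installed version differs:

```python
from litellm import BudgetManager, completion

budget_manager = BudgetManager(project_name="demo_project")  # local client_type by default
user = "user@example.com"

if not budget_manager.is_valid_user(user):
    budget_manager.create_budget(total_budget=10, user=user)  # $10 budget for this user

# only call the LLM while the user is under budget
if budget_manager.get_current_cost(user=user) <= budget_manager.get_total_budget(user):
    response = completion(
        model="gpt-3.5-turbo",
        messages=[{"role": "user", "content": "Hey, how's it going?"}],
    )
    budget_manager.update_cost(completion_obj=response, user=user)
else:
    response = "Sorry - no budget!"
```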


@@ -659,9 +659,16 @@ def completion(
         )
         ## CALCULATING USAGE - baseten charges on time, not tokens - have some mapping of cost here.
-        prompt_tokens = len(encoding.encode(prompt))
-        completion_tokens = len(
-            encoding.encode(model_response["choices"][0]["message"].get("content", ""))
+        prompt_tokens = response_metadata.get(
+            "x-amzn-bedrock-input-token-count", len(encoding.encode(prompt))
+        )
+        completion_tokens = response_metadata.get(
+            "x-amzn-bedrock-output-token-count",
+            len(
+                encoding.encode(
+                    model_response["choices"][0]["message"].get("content", "")
+                )
+            ),
         )

         model_response["created"] = int(time.time())
@@ -672,6 +679,8 @@ def completion(
             total_tokens=prompt_tokens + completion_tokens,
         )
         model_response.usage = usage
+        model_response._hidden_params["region_name"] = client.meta.region_name
+        print_verbose(f"model_response._hidden_params: {model_response._hidden_params}")
         return model_response
     except BedrockError as e:
         exception_mapping_worked = True
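The change above prefers Bedrock's own token counts (the `x-amzn-bedrock-*-token-count` response headers) and only falls back to local tokenization when they are absent. A standalone sketch of that fallback pattern, with illustrative names rather than the module's actual API:

```python
def count_tokens(response_metadata: dict, encoding, prompt: str, completion_text: str):
    """Prefer provider-reported token counts; fall back to counting with the tokenizer."""
    prompt_tokens = response_metadata.get(
        "x-amzn-bedrock-input-token-count",
        len(encoding.encode(prompt)),
    )
    completion_tokens = response_metadata.get(
        "x-amzn-bedrock-output-token-count",
        len(encoding.encode(completion_text)),
    )
    return prompt_tokens, completion_tokens
```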


@@ -718,8 +718,22 @@ class OpenAIChatCompletion(BaseLLM):
             return convert_to_model_response_object(response_object=response, model_response_object=model_response, response_type="image_generation")  # type: ignore
         except OpenAIError as e:
             exception_mapping_worked = True
+            ## LOGGING
+            logging_obj.post_call(
+                input=prompt,
+                api_key=api_key,
+                additional_args={"complete_input_dict": data},
+                original_response=str(e),
+            )
             raise e
         except Exception as e:
+            ## LOGGING
+            logging_obj.post_call(
+                input=prompt,
+                api_key=api_key,
+                additional_args={"complete_input_dict": data},
+                original_response=str(e),
+            )
             if hasattr(e, "status_code"):
                 raise OpenAIError(status_code=e.status_code, message=str(e))
             else:


@ -10,7 +10,6 @@
import os, openai, sys, json, inspect, uuid, datetime, threading import os, openai, sys, json, inspect, uuid, datetime, threading
from typing import Any, Literal, Union from typing import Any, Literal, Union
from functools import partial from functools import partial
import dotenv, traceback, random, asyncio, time, contextvars import dotenv, traceback, random, asyncio, time, contextvars
from copy import deepcopy from copy import deepcopy
import httpx import httpx
@ -586,6 +585,10 @@ def completion(
) )
if model_response is not None and hasattr(model_response, "_hidden_params"): if model_response is not None and hasattr(model_response, "_hidden_params"):
model_response._hidden_params["custom_llm_provider"] = custom_llm_provider model_response._hidden_params["custom_llm_provider"] = custom_llm_provider
model_response._hidden_params["region_name"] = kwargs.get(
"aws_region_name", None
) # support region-based pricing for bedrock
### REGISTER CUSTOM MODEL PRICING -- IF GIVEN ### ### REGISTER CUSTOM MODEL PRICING -- IF GIVEN ###
if input_cost_per_token is not None and output_cost_per_token is not None: if input_cost_per_token is not None and output_cost_per_token is not None:
litellm.register_model( litellm.register_model(
@ -2224,6 +2227,7 @@ def embedding(
model, model,
input=[], input=[],
# Optional params # Optional params
dimensions: Optional[int] = None,
timeout=600, # default to 10 minutes timeout=600, # default to 10 minutes
# set api_base, api_version, api_key # set api_base, api_version, api_key
api_base: Optional[str] = None, api_base: Optional[str] = None,
@ -2244,6 +2248,7 @@ def embedding(
Parameters: Parameters:
- model: The embedding model to use. - model: The embedding model to use.
- input: The input for which embeddings are to be generated. - input: The input for which embeddings are to be generated.
- dimensions: The number of dimensions the resulting output embeddings should have. Only supported in text-embedding-3 and later models.
- timeout: The timeout value for the API call, default 10 mins - timeout: The timeout value for the API call, default 10 mins
- litellm_call_id: The call ID for litellm logging. - litellm_call_id: The call ID for litellm logging.
- litellm_logging_obj: The litellm logging object. - litellm_logging_obj: The litellm logging object.
@ -2277,6 +2282,7 @@ def embedding(
output_cost_per_second = kwargs.get("output_cost_per_second", None) output_cost_per_second = kwargs.get("output_cost_per_second", None)
openai_params = [ openai_params = [
"user", "user",
"dimensions",
"request_timeout", "request_timeout",
"api_base", "api_base",
"api_version", "api_version",
@ -2345,7 +2351,9 @@ def embedding(
api_key=api_key, api_key=api_key,
) )
optional_params = get_optional_params_embeddings( optional_params = get_optional_params_embeddings(
model=model,
user=user, user=user,
dimensions=dimensions,
encoding_format=encoding_format, encoding_format=encoding_format,
custom_llm_provider=custom_llm_provider, custom_llm_provider=custom_llm_provider,
**non_default_params, **non_default_params,
@ -3067,7 +3075,7 @@ def image_generation(
custom_llm_provider=custom_llm_provider, custom_llm_provider=custom_llm_provider,
**non_default_params, **non_default_params,
) )
logging = litellm_logging_obj logging: Logging = litellm_logging_obj
logging.update_environment_variables( logging.update_environment_variables(
model=model, model=model,
user=user, user=user,


@@ -140,6 +140,7 @@ class GenerateRequestBase(LiteLLMBase):
 class GenerateKeyRequest(GenerateRequestBase):
+    key_alias: Optional[str] = None
     duration: Optional[str] = "1h"
     aliases: Optional[dict] = {}
     config: Optional[dict] = {}
@@ -304,6 +305,8 @@ class ConfigYAML(LiteLLMBase):
 class LiteLLM_VerificationToken(LiteLLMBase):
     token: str
+    key_name: Optional[str] = None
+    key_alias: Optional[str] = None
     spend: float = 0.0
     max_budget: Optional[float] = None
     expires: Union[str, None]
@@ -346,11 +349,12 @@ class LiteLLM_SpendLogs(LiteLLMBase):
     model: Optional[str] = ""
     call_type: str
     spend: Optional[float] = 0.0
+    total_tokens: Optional[int] = 0
+    prompt_tokens: Optional[int] = 0
+    completion_tokens: Optional[int] = 0
     startTime: Union[str, datetime, None]
     endTime: Union[str, datetime, None]
     user: Optional[str] = ""
-    modelParameters: Optional[Json] = {}
-    usage: Optional[Json] = {}
     metadata: Optional[Json] = {}
     cache_hit: Optional[str] = "False"
     cache_key: Optional[str] = None
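The three new flat token columns replace the old `modelParameters`/`usage` JSON blobs. A tiny illustrative sketch of how a spend-log row is flattened from a response's usage block (this mirrors the `get_logging_payload` change further down; the values are made up):

```python
usage = {"prompt_tokens": 9, "completion_tokens": 12, "total_tokens": 21}  # example values

spend_log_row = {
    "total_tokens": usage.get("total_tokens", 0),
    "prompt_tokens": usage.get("prompt_tokens", 0),
    "completion_tokens": usage.get("completion_tokens", 0),
}
```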


@ -5,6 +5,7 @@ from litellm.proxy._types import (
LiteLLM_Config, LiteLLM_Config,
LiteLLM_UserTable, LiteLLM_UserTable,
) )
from litellm.proxy.utils import hash_token
from litellm import get_secret from litellm import get_secret
from typing import Any, List, Literal, Optional, Union from typing import Any, List, Literal, Optional, Union
import json import json
@ -187,6 +188,8 @@ class DynamoDBWrapper(CustomDB):
table = client.table(self.database_arguments.spend_table_name) table = client.table(self.database_arguments.spend_table_name)
for k, v in value.items(): for k, v in value.items():
if k == "token" and value[k].startswith("sk-"):
value[k] = hash_token(token=v)
if isinstance(v, datetime): if isinstance(v, datetime):
value[k] = v.isoformat() value[k] = v.isoformat()
@ -229,6 +232,10 @@ class DynamoDBWrapper(CustomDB):
table = client.table(self.database_arguments.config_table_name) table = client.table(self.database_arguments.config_table_name)
key_name = "param_name" key_name = "param_name"
if key_name == "token" and key.startswith("sk-"):
# ensure it's hashed
key = hash_token(token=key)
response = await table.get_item({key_name: key}) response = await table.get_item({key_name: key})
new_response: Any = None new_response: Any = None
@ -308,6 +315,8 @@ class DynamoDBWrapper(CustomDB):
# Convert datetime object to ISO8601 string # Convert datetime object to ISO8601 string
if isinstance(v, datetime): if isinstance(v, datetime):
v = v.isoformat() v = v.isoformat()
if k == "token" and value[k].startswith("sk-"):
value[k] = hash_token(token=v)
# Accumulate updates # Accumulate updates
actions.append((F(k), Value(value=v))) actions.append((F(k), Value(value=v)))
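`hash_token` (imported from `litellm.proxy.utils` above) is what turns a raw `sk-...` key into the value stored and looked up here. Its body is not part of this diff; the sketch below simply assumes a SHA-256 hex digest for illustration:

```python
import hashlib

def hash_token_sketch(token: str) -> str:
    # Assumption: a plain SHA-256 hex digest of the raw key; see
    # litellm.proxy.utils.hash_token for the authoritative implementation.
    return hashlib.sha256(token.encode("utf-8")).hexdigest()

print(hash_token_sketch("sk-QWrxEynunsNpV1zT48HIrw"))
```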


@@ -11,6 +11,12 @@ model_list:
       output_cost_per_token: 0.00003
       max_tokens: 4096
       base_model: gpt-3.5-turbo
+  - model_name: gpt-4
+    litellm_params:
+      model: azure/chatgpt-v-2
+      api_base: https://openai-gpt-4-test-v-1.openai.azure.com/
+      api_version: "2023-05-15"
+      api_key: os.environ/AZURE_API_KEY # The `os.environ/` prefix tells litellm to read this from the env. See https://docs.litellm.ai/docs/simple_proxy#load-api-keys-from-vault
   - model_name: gpt-vision
     litellm_params:
       model: azure/gpt-4-vision
@@ -61,7 +67,7 @@ model_list:
 litellm_settings:
   fallbacks: [{"openai-gpt-3.5": ["azure-gpt-3.5"]}]
   success_callback: ['langfuse']
-  max_budget: 0.025 # global budget for proxy
+  max_budget: 10 # global budget for proxy
   budget_duration: 30d # global budget duration, will reset after 30d
   # cache: True
 # setting callback class


@ -75,6 +75,7 @@ from litellm.proxy.utils import (
send_email, send_email,
get_logging_payload, get_logging_payload,
reset_budget, reset_budget,
hash_token,
) )
from litellm.proxy.secret_managers.google_kms import load_google_kms from litellm.proxy.secret_managers.google_kms import load_google_kms
import pydantic import pydantic
@ -243,6 +244,8 @@ async def user_api_key_auth(
response = await user_custom_auth(request=request, api_key=api_key) response = await user_custom_auth(request=request, api_key=api_key)
return UserAPIKeyAuth.model_validate(response) return UserAPIKeyAuth.model_validate(response)
### LITELLM-DEFINED AUTH FUNCTION ### ### LITELLM-DEFINED AUTH FUNCTION ###
if isinstance(api_key, str):
assert api_key.startswith("sk-") # prevent token hashes from being used
if master_key is None: if master_key is None:
if isinstance(api_key, str): if isinstance(api_key, str):
return UserAPIKeyAuth(api_key=api_key) return UserAPIKeyAuth(api_key=api_key)
@ -288,8 +291,9 @@ async def user_api_key_auth(
raise Exception("No connected db.") raise Exception("No connected db.")
## check for cache hit (In-Memory Cache) ## check for cache hit (In-Memory Cache)
if api_key.startswith("sk-"):
api_key = hash_token(token=api_key)
valid_token = user_api_key_cache.get_cache(key=api_key) valid_token = user_api_key_cache.get_cache(key=api_key)
verbose_proxy_logger.debug(f"valid_token from cache: {valid_token}")
if valid_token is None: if valid_token is None:
## check db ## check db
verbose_proxy_logger.debug(f"api key: {api_key}") verbose_proxy_logger.debug(f"api key: {api_key}")
@ -482,10 +486,10 @@ async def user_api_key_auth(
) )
# Token passed all checks # Token passed all checks
# Add token to cache
user_api_key_cache.set_cache(key=api_key, value=valid_token, ttl=60)
api_key = valid_token.token api_key = valid_token.token
# Add hashed token to cache
user_api_key_cache.set_cache(key=api_key, value=valid_token, ttl=60)
valid_token_dict = _get_pydantic_json_dict(valid_token) valid_token_dict = _get_pydantic_json_dict(valid_token)
valid_token_dict.pop("token", None) valid_token_dict.pop("token", None)
""" """
@ -520,7 +524,10 @@ async def user_api_key_auth(
# check if user can access this route # check if user can access this route
query_params = request.query_params query_params = request.query_params
key = query_params.get("key") key = query_params.get("key")
if prisma_client.hash_token(token=key) != api_key: if (
key is not None
and prisma_client.hash_token(token=key) != api_key
):
raise HTTPException( raise HTTPException(
status_code=status.HTTP_403_FORBIDDEN, status_code=status.HTTP_403_FORBIDDEN,
detail="user not allowed to access this key's info", detail="user not allowed to access this key's info",
@ -748,6 +755,9 @@ async def update_database(
### UPDATE KEY SPEND ### ### UPDATE KEY SPEND ###
async def _update_key_db(): async def _update_key_db():
verbose_proxy_logger.debug(
f"adding spend to key db. Response cost: {response_cost}. Token: {token}."
)
if prisma_client is not None: if prisma_client is not None:
# Fetch the existing cost for the given token # Fetch the existing cost for the given token
existing_spend_obj = await prisma_client.get_data(token=token) existing_spend_obj = await prisma_client.get_data(token=token)
@ -1239,6 +1249,7 @@ async def generate_key_helper_fn(
rpm_limit: Optional[int] = None, rpm_limit: Optional[int] = None,
query_type: Literal["insert_data", "update_data"] = "insert_data", query_type: Literal["insert_data", "update_data"] = "insert_data",
update_key_values: Optional[dict] = None, update_key_values: Optional[dict] = None,
key_alias: Optional[str] = None,
): ):
global prisma_client, custom_db_client global prisma_client, custom_db_client
@ -1312,6 +1323,7 @@ async def generate_key_helper_fn(
} }
key_data = { key_data = {
"token": token, "token": token,
"key_alias": key_alias,
"expires": expires, "expires": expires,
"models": models, "models": models,
"aliases": aliases_json, "aliases": aliases_json,
@ -1327,6 +1339,8 @@ async def generate_key_helper_fn(
"budget_duration": key_budget_duration, "budget_duration": key_budget_duration,
"budget_reset_at": key_reset_at, "budget_reset_at": key_reset_at,
} }
if general_settings.get("allow_user_auth", False) == True:
key_data["key_name"] = f"sk-...{token[-4:]}"
if prisma_client is not None: if prisma_client is not None:
## CREATE USER (If necessary) ## CREATE USER (If necessary)
verbose_proxy_logger.debug(f"prisma_client: Creating User={user_data}") verbose_proxy_logger.debug(f"prisma_client: Creating User={user_data}")
@ -2451,10 +2465,10 @@ async def delete_key_fn(data: DeleteKeyRequest):
Delete a key from the key management system. Delete a key from the key management system.
Parameters:: Parameters::
- keys (List[str]): A list of keys to delete. Example {"keys": ["sk-QWrxEynunsNpV1zT48HIrw"]} - keys (List[str]): A list of keys or hashed keys to delete. Example {"keys": ["sk-QWrxEynunsNpV1zT48HIrw", "837e17519f44683334df5291321d97b8bf1098cd490e49e215f6fea935aa28be"]}
Returns: Returns:
- deleted_keys (List[str]): A list of deleted keys. Example {"deleted_keys": ["sk-QWrxEynunsNpV1zT48HIrw"]} - deleted_keys (List[str]): A list of deleted keys. Example {"deleted_keys": ["sk-QWrxEynunsNpV1zT48HIrw", "837e17519f44683334df5291321d97b8bf1098cd490e49e215f6fea935aa28be"]}
Raises: Raises:
@ -2491,14 +2505,39 @@ async def delete_key_fn(data: DeleteKeyRequest):
"/key/info", tags=["key management"], dependencies=[Depends(user_api_key_auth)] "/key/info", tags=["key management"], dependencies=[Depends(user_api_key_auth)]
) )
async def info_key_fn( async def info_key_fn(
key: str = fastapi.Query(..., description="Key in the request parameters"), key: Optional[str] = fastapi.Query(
default=None, description="Key in the request parameters"
),
user_api_key_dict: UserAPIKeyAuth = Depends(user_api_key_auth),
): ):
"""
Retrieve information about a key.
Parameters:
key: Optional[str] = Query parameter representing the key in the request
user_api_key_dict: UserAPIKeyAuth = Dependency representing the user's API key
Returns:
Dict containing the key and its associated information
Example Curl:
```
curl -X GET "http://0.0.0.0:8000/key/info?key=sk-02Wr4IAlN3NvPXvL5JVvDA" \
-H "Authorization: Bearer sk-1234"
```
Example Curl - if no key is passed, it will use the Key Passed in Authorization Header
```
curl -X GET "http://0.0.0.0:8000/key/info" \
-H "Authorization: Bearer sk-02Wr4IAlN3NvPXvL5JVvDA"
```
"""
global prisma_client global prisma_client
try: try:
if prisma_client is None: if prisma_client is None:
raise Exception( raise Exception(
f"Database not connected. Connect a database to your proxy - https://docs.litellm.ai/docs/simple_proxy#managing-auth---virtual-keys" f"Database not connected. Connect a database to your proxy - https://docs.litellm.ai/docs/simple_proxy#managing-auth---virtual-keys"
) )
if key == None:
key = user_api_key_dict.api_key
key_info = await prisma_client.get_data(token=key) key_info = await prisma_client.get_data(token=key)
## REMOVE HASHED TOKEN INFO BEFORE RETURNING ## ## REMOVE HASHED TOKEN INFO BEFORE RETURNING ##
try: try:


@@ -7,6 +7,7 @@ generator client {
   provider = "prisma-client-py"
 }

+// Track spend, rate limit, budget Users
 model LiteLLM_UserTable {
   user_id String @unique
   team_id String?
@@ -21,9 +22,11 @@ model LiteLLM_UserTable {
   budget_reset_at DateTime?
 }

-// required for token gen
+// Generate Tokens for Proxy
 model LiteLLM_VerificationToken {
   token String @unique
+  key_name String?
+  key_alias String?
   spend Float @default(0.0)
   expires DateTime?
   models String[]
@@ -40,22 +43,25 @@ model LiteLLM_VerificationToken {
   budget_reset_at DateTime?
 }

+// store proxy config.yaml
 model LiteLLM_Config {
   param_name String @id
   param_value Json?
 }

+// View spend, model, api_key per request
 model LiteLLM_SpendLogs {
   request_id String @unique
   call_type String
   api_key String @default ("")
   spend Float @default(0.0)
+  total_tokens Int @default(0)
+  prompt_tokens Int @default(0)
+  completion_tokens Int @default(0)
   startTime DateTime // Assuming start_time is a DateTime field
   endTime DateTime // Assuming end_time is a DateTime field
   model String @default("")
   user String @default("")
-  modelParameters Json @default("{}") // Assuming optional_params is a JSON field
-  usage Json @default("{}")
   metadata Json @default("{}")
   cache_hit String @default("")
   cache_key String @default("")


@ -198,7 +198,14 @@ class ProxyLogging:
max_budget = user_info["max_budget"] max_budget = user_info["max_budget"]
spend = user_info["spend"] spend = user_info["spend"]
user_email = user_info["user_email"] user_email = user_info["user_email"]
user_info = f"""\nUser ID: {user_id}\nMax Budget: {max_budget}\nSpend: {spend}\nUser Email: {user_email}""" user_info = f"""\nUser ID: {user_id}\nMax Budget: ${max_budget}\nSpend: ${spend}\nUser Email: {user_email}"""
elif type == "token_budget":
token_info = dict(user_info)
token = token_info["token"]
spend = token_info["spend"]
max_budget = token_info["max_budget"]
user_id = token_info["user_id"]
user_info = f"""\nToken: {token}\nSpend: ${spend}\nMax Budget: ${max_budget}\nUser ID: {user_id}"""
else: else:
user_info = str(user_info) user_info = str(user_info)
# percent of max_budget left to spend # percent of max_budget left to spend
@ -814,7 +821,13 @@ class PrismaClient:
Allow user to delete a key(s) Allow user to delete a key(s)
""" """
try: try:
hashed_tokens = [self.hash_token(token=token) for token in tokens] hashed_tokens = []
for token in tokens:
if isinstance(token, str) and token.startswith("sk-"):
hashed_token = self.hash_token(token=token)
else:
hashed_token = token
hashed_tokens.append(hashed_token)
await self.db.litellm_verificationtoken.delete_many( await self.db.litellm_verificationtoken.delete_many(
where={"token": {"in": hashed_tokens}} where={"token": {"in": hashed_tokens}}
) )
@ -1060,10 +1073,11 @@ def get_logging_payload(kwargs, response_obj, start_time, end_time):
metadata = ( metadata = (
litellm_params.get("metadata", {}) or {} litellm_params.get("metadata", {}) or {}
) # if litellm_params['metadata'] == None ) # if litellm_params['metadata'] == None
optional_params = kwargs.get("optional_params", {})
call_type = kwargs.get("call_type", "litellm.completion") call_type = kwargs.get("call_type", "litellm.completion")
cache_hit = kwargs.get("cache_hit", False) cache_hit = kwargs.get("cache_hit", False)
usage = response_obj["usage"] usage = response_obj["usage"]
if type(usage) == litellm.Usage:
usage = dict(usage)
id = response_obj.get("id", str(uuid.uuid4())) id = response_obj.get("id", str(uuid.uuid4()))
api_key = metadata.get("user_api_key", "") api_key = metadata.get("user_api_key", "")
if api_key is not None and isinstance(api_key, str) and api_key.startswith("sk-"): if api_key is not None and isinstance(api_key, str) and api_key.startswith("sk-"):
@ -1091,10 +1105,11 @@ def get_logging_payload(kwargs, response_obj, start_time, end_time):
"endTime": end_time, "endTime": end_time,
"model": kwargs.get("model", ""), "model": kwargs.get("model", ""),
"user": kwargs.get("user", ""), "user": kwargs.get("user", ""),
"modelParameters": optional_params,
"usage": usage,
"metadata": metadata, "metadata": metadata,
"cache_key": cache_key, "cache_key": cache_key,
"total_tokens": usage.get("total_tokens", 0),
"prompt_tokens": usage.get("prompt_tokens", 0),
"completion_tokens": usage.get("completion_tokens", 0),
} }
json_fields = [ json_fields = [
@ -1119,8 +1134,6 @@ def get_logging_payload(kwargs, response_obj, start_time, end_time):
payload[param] = payload[param].model_dump_json() payload[param] = payload[param].model_dump_json()
if type(payload[param]) == litellm.EmbeddingResponse: if type(payload[param]) == litellm.EmbeddingResponse:
payload[param] = payload[param].model_dump_json() payload[param] = payload[param].model_dump_json()
elif type(payload[param]) == litellm.Usage:
payload[param] = payload[param].model_dump_json()
else: else:
payload[param] = json.dumps(payload[param]) payload[param] = json.dumps(payload[param])


@@ -723,8 +723,8 @@ def test_cache_override():
     print(f"Embedding 2 response time: {end_time - start_time} seconds")

     assert (
-        end_time - start_time > 0.1
-    )  # ensure 2nd response comes in over 0.1s. This should not be cached.
+        end_time - start_time > 0.05
+    )  # ensure 2nd response comes in over 0.05s. This should not be cached.


 # test_cache_override()


@ -124,7 +124,7 @@ def test_cost_azure_gpt_35():
) )
test_cost_azure_gpt_35() # test_cost_azure_gpt_35()
def test_cost_azure_embedding(): def test_cost_azure_embedding():
@ -165,3 +165,71 @@ def test_cost_openai_image_gen():
model="dall-e-2", size="1024-x-1024", quality="standard", n=1 model="dall-e-2", size="1024-x-1024", quality="standard", n=1
) )
assert cost == 0.019922944 assert cost == 0.019922944
def test_cost_bedrock_pricing():
"""
- get pricing specific to region for a model
"""
from litellm import ModelResponse, Choices, Message
from litellm.utils import Usage
litellm.set_verbose = True
input_tokens = litellm.token_counter(
model="bedrock/anthropic.claude-instant-v1",
messages=[{"role": "user", "content": "Hey, how's it going?"}],
)
print(f"input_tokens: {input_tokens}")
output_tokens = litellm.token_counter(
model="bedrock/anthropic.claude-instant-v1",
text="It's all going well",
count_response_tokens=True,
)
print(f"output_tokens: {output_tokens}")
resp = ModelResponse(
id="chatcmpl-e41836bb-bb8b-4df2-8e70-8f3e160155ac",
choices=[
Choices(
finish_reason=None,
index=0,
message=Message(
content="It's all going well",
role="assistant",
),
)
],
created=1700775391,
model="anthropic.claude-instant-v1",
object="chat.completion",
system_fingerprint=None,
usage=Usage(
prompt_tokens=input_tokens,
completion_tokens=output_tokens,
total_tokens=input_tokens + output_tokens,
),
)
resp._hidden_params = {
"custom_llm_provider": "bedrock",
"region_name": "ap-northeast-1",
}
cost = litellm.completion_cost(
model="anthropic.claude-instant-v1",
completion_response=resp,
messages=[{"role": "user", "content": "Hey, how's it going?"}],
)
predicted_cost = input_tokens * 0.00000223 + 0.00000755 * output_tokens
assert cost == predicted_cost
def test_cost_bedrock_pricing_actual_calls():
litellm.set_verbose = True
model = "anthropic.claude-instant-v1"
messages = [{"role": "user", "content": "Hey, how's it going?"}]
response = litellm.completion(model=model, messages=messages)
assert response._hidden_params["region_name"] is not None
cost = litellm.completion_cost(
completion_response=response,
messages=[{"role": "user", "content": "Hey, how's it going?"}],
)
assert cost > 0


@ -53,9 +53,9 @@ model_list:
api_key: os.environ/AZURE_API_KEY api_key: os.environ/AZURE_API_KEY
api_version: 2023-07-01-preview api_version: 2023-07-01-preview
model: azure/azure-embedding-model model: azure/azure-embedding-model
model_name: azure-embedding-model
model_info: model_info:
mode: "embedding" mode: embedding
model_name: azure-embedding-model
- litellm_params: - litellm_params:
model: gpt-3.5-turbo model: gpt-3.5-turbo
model_info: model_info:
@ -80,43 +80,49 @@ model_list:
description: this is a test openai model description: this is a test openai model
id: 9b1ef341-322c-410a-8992-903987fef439 id: 9b1ef341-322c-410a-8992-903987fef439
model_name: test_openai_models model_name: test_openai_models
- model_name: amazon-embeddings - litellm_params:
litellm_params: model: bedrock/amazon.titan-embed-text-v1
model: "bedrock/amazon.titan-embed-text-v1"
model_info: model_info:
mode: embedding mode: embedding
- model_name: "GPT-J 6B - Sagemaker Text Embedding (Internal)" model_name: amazon-embeddings
litellm_params: - litellm_params:
model: "sagemaker/berri-benchmarking-gpt-j-6b-fp16" model: sagemaker/berri-benchmarking-gpt-j-6b-fp16
model_info: model_info:
mode: embedding mode: embedding
- model_name: dall-e-3 model_name: GPT-J 6B - Sagemaker Text Embedding (Internal)
litellm_params: - litellm_params:
model: dall-e-3 model: dall-e-3
model_info: model_info:
mode: image_generation mode: image_generation
- model_name: dall-e-3 model_name: dall-e-3
litellm_params: - litellm_params:
model: "azure/dall-e-3-test" api_base: os.environ/AZURE_SWEDEN_API_BASE
api_version: "2023-12-01-preview" api_key: os.environ/AZURE_SWEDEN_API_KEY
api_base: "os.environ/AZURE_SWEDEN_API_BASE" api_version: 2023-12-01-preview
api_key: "os.environ/AZURE_SWEDEN_API_KEY" model: azure/dall-e-3-test
model_info: model_info:
mode: image_generation mode: image_generation
- model_name: dall-e-2 model_name: dall-e-3
litellm_params: - litellm_params:
model: "azure/" api_base: os.environ/AZURE_API_BASE
api_version: "2023-06-01-preview" api_key: os.environ/AZURE_API_KEY
api_base: "os.environ/AZURE_API_BASE" api_version: 2023-06-01-preview
api_key: "os.environ/AZURE_API_KEY" model: azure/
model_info: model_info:
mode: image_generation mode: image_generation
- model_name: text-embedding-ada-002 model_name: dall-e-2
litellm_params: - litellm_params:
api_base: os.environ/AZURE_API_BASE
api_key: os.environ/AZURE_API_KEY
api_version: 2023-07-01-preview
model: azure/azure-embedding-model model: azure/azure-embedding-model
api_base: "os.environ/AZURE_API_BASE"
api_key: "os.environ/AZURE_API_KEY"
api_version: "2023-07-01-preview"
model_info: model_info:
base_model: text-embedding-ada-002
mode: embedding mode: embedding
base_model: text-embedding-ada-002 model_name: text-embedding-ada-002
- litellm_params:
model: gpt-3.5-turbo
model_info:
description: this is a test openai model
id: 34cb2419-7c63-44ae-a189-53f1d1ce5953
model_name: test_openai_models


@ -819,47 +819,49 @@ async def test_async_embedding_azure_caching():
# Image Generation # Image Generation
# ## Test OpenAI + Sync ## Test OpenAI + Sync
# def test_image_generation_openai(): def test_image_generation_openai():
# try: try:
# customHandler_success = CompletionCustomHandler() customHandler_success = CompletionCustomHandler()
# customHandler_failure = CompletionCustomHandler() customHandler_failure = CompletionCustomHandler()
# litellm.callbacks = [customHandler_success] # litellm.callbacks = [customHandler_success]
# litellm.set_verbose = True # litellm.set_verbose = True
# response = litellm.image_generation( # response = litellm.image_generation(
# prompt="A cute baby sea otter", model="dall-e-3" # prompt="A cute baby sea otter", model="dall-e-3"
# ) # )
# print(f"response: {response}") # print(f"response: {response}")
# assert len(response.data) > 0 # assert len(response.data) > 0
# print(f"customHandler_success.errors: {customHandler_success.errors}") # print(f"customHandler_success.errors: {customHandler_success.errors}")
# print(f"customHandler_success.states: {customHandler_success.states}") # print(f"customHandler_success.states: {customHandler_success.states}")
# assert len(customHandler_success.errors) == 0 # assert len(customHandler_success.errors) == 0
# assert len(customHandler_success.states) == 3 # pre, post, success # assert len(customHandler_success.states) == 3 # pre, post, success
# # test failure callback # test failure callback
# litellm.callbacks = [customHandler_failure] litellm.callbacks = [customHandler_failure]
# try: try:
# response = litellm.image_generation( response = litellm.image_generation(
# prompt="A cute baby sea otter", model="dall-e-4" prompt="A cute baby sea otter",
# ) model="dall-e-2",
# except: api_key="my-bad-api-key",
# pass )
# print(f"customHandler_failure.errors: {customHandler_failure.errors}") except:
# print(f"customHandler_failure.states: {customHandler_failure.states}") pass
# assert len(customHandler_failure.errors) == 0 print(f"customHandler_failure.errors: {customHandler_failure.errors}")
# assert len(customHandler_failure.states) == 3 # pre, post, failure print(f"customHandler_failure.states: {customHandler_failure.states}")
# except litellm.RateLimitError as e: assert len(customHandler_failure.errors) == 0
# pass assert len(customHandler_failure.states) == 3 # pre, post, failure
# except litellm.ContentPolicyViolationError: except litellm.RateLimitError as e:
# pass # OpenAI randomly raises these errors - skip when they occur pass
# except Exception as e: except litellm.ContentPolicyViolationError:
# pytest.fail(f"An exception occurred - {str(e)}") pass # OpenAI randomly raises these errors - skip when they occur
except Exception as e:
pytest.fail(f"An exception occurred - {str(e)}")
# test_image_generation_openai() test_image_generation_openai()
## Test OpenAI + Async ## Test OpenAI + Async
## Test Azure + Sync ## Test Azure + Sync


@@ -64,7 +64,9 @@ def test_openai_embedding_3():
             model="text-embedding-3-small",
             input=["good morning from litellm", "this is another item"],
             metadata={"anything": "good day"},
+            dimensions=5,
         )
+        print(f"response:", response)
         litellm_response = dict(response)
         litellm_response_keys = set(litellm_response.keys())
         litellm_response_keys.discard("_response_ms")
@@ -80,6 +82,7 @@ def test_openai_embedding_3():
         response = client.embeddings.create(
             model="text-embedding-3-small",
             input=["good morning from litellm", "this is another item"],
+            dimensions=5,
         )

         response = dict(response)


@ -33,7 +33,7 @@ from litellm.proxy.proxy_server import (
) )
from litellm.proxy._types import NewUserRequest, DynamoDBArgs, GenerateKeyRequest from litellm.proxy._types import NewUserRequest, DynamoDBArgs, GenerateKeyRequest
from litellm.proxy.utils import DBClient from litellm.proxy.utils import DBClient, hash_token
from starlette.datastructures import URL from starlette.datastructures import URL
@ -232,7 +232,7 @@ def test_call_with_user_over_budget(custom_db_client):
"stream": False, "stream": False,
"litellm_params": { "litellm_params": {
"metadata": { "metadata": {
"user_api_key": generated_key, "user_api_key": hash_token(generated_key),
"user_api_key_user_id": user_id, "user_api_key_user_id": user_id,
} }
}, },
@ -305,7 +305,7 @@ def test_call_with_user_over_budget_stream(custom_db_client):
"complete_streaming_response": resp, "complete_streaming_response": resp,
"litellm_params": { "litellm_params": {
"metadata": { "metadata": {
"user_api_key": generated_key, "user_api_key": hash_token(generated_key),
"user_api_key_user_id": user_id, "user_api_key_user_id": user_id,
} }
}, },
@ -376,7 +376,7 @@ def test_call_with_user_key_budget(custom_db_client):
"stream": False, "stream": False,
"litellm_params": { "litellm_params": {
"metadata": { "metadata": {
"user_api_key": generated_key, "user_api_key": hash_token(generated_key),
"user_api_key_user_id": user_id, "user_api_key_user_id": user_id,
} }
}, },
@ -449,7 +449,7 @@ def test_call_with_key_over_budget_stream(custom_db_client):
"complete_streaming_response": resp, "complete_streaming_response": resp,
"litellm_params": { "litellm_params": {
"metadata": { "metadata": {
"user_api_key": generated_key, "user_api_key": hash_token(generated_key),
"user_api_key_user_id": user_id, "user_api_key_user_id": user_id,
} }
}, },


@ -12,6 +12,8 @@
# 11. Generate a Key, cal key/info, call key/update, call key/info # 11. Generate a Key, cal key/info, call key/update, call key/info
# 12. Make a call with key over budget, expect to fail # 12. Make a call with key over budget, expect to fail
# 14. Make a streaming chat/completions call with key over budget, expect to fail # 14. Make a streaming chat/completions call with key over budget, expect to fail
# 15. Generate key, when `allow_user_auth`=False - check if `/key/info` returns key_name=null
# 16. Generate key, when `allow_user_auth`=True - check if `/key/info` returns key_name=sk...<last-4-digits>
# function to call to generate key - async def new_user(data: NewUserRequest): # function to call to generate key - async def new_user(data: NewUserRequest):
@ -46,7 +48,7 @@ from litellm.proxy.proxy_server import (
spend_key_fn, spend_key_fn,
view_spend_logs, view_spend_logs,
) )
from litellm.proxy.utils import PrismaClient, ProxyLogging from litellm.proxy.utils import PrismaClient, ProxyLogging, hash_token
from litellm._logging import verbose_proxy_logger from litellm._logging import verbose_proxy_logger
verbose_proxy_logger.setLevel(level=logging.DEBUG) verbose_proxy_logger.setLevel(level=logging.DEBUG)
@ -86,6 +88,7 @@ def prisma_client():
litellm.proxy.proxy_server.litellm_proxy_budget_name = ( litellm.proxy.proxy_server.litellm_proxy_budget_name = (
f"litellm-proxy-budget-{time.time()}" f"litellm-proxy-budget-{time.time()}"
) )
litellm.proxy.proxy_server.user_custom_key_generate = None
return prisma_client return prisma_client
@ -918,7 +921,7 @@ def test_call_with_key_over_budget(prisma_client):
"stream": False, "stream": False,
"litellm_params": { "litellm_params": {
"metadata": { "metadata": {
"user_api_key": generated_key, "user_api_key": hash_token(generated_key),
"user_api_key_user_id": user_id, "user_api_key_user_id": user_id,
} }
}, },
@ -1009,7 +1012,7 @@ async def test_call_with_key_never_over_budget(prisma_client):
"stream": False, "stream": False,
"litellm_params": { "litellm_params": {
"metadata": { "metadata": {
"user_api_key": generated_key, "user_api_key": hash_token(generated_key),
"user_api_key_user_id": user_id, "user_api_key_user_id": user_id,
} }
}, },
@ -1083,7 +1086,7 @@ async def test_call_with_key_over_budget_stream(prisma_client):
"complete_streaming_response": resp, "complete_streaming_response": resp,
"litellm_params": { "litellm_params": {
"metadata": { "metadata": {
"user_api_key": generated_key, "user_api_key": hash_token(generated_key),
"user_api_key_user_id": user_id, "user_api_key_user_id": user_id,
} }
}, },
@ -1140,3 +1143,48 @@ async def test_view_spend_per_key(prisma_client):
except Exception as e: except Exception as e:
print("Got Exception", e) print("Got Exception", e)
pytest.fail(f"Got exception {e}") pytest.fail(f"Got exception {e}")
@pytest.mark.asyncio()
async def test_key_name_null(prisma_client):
"""
- create key
- get key info
- assert key_name is null
"""
setattr(litellm.proxy.proxy_server, "prisma_client", prisma_client)
setattr(litellm.proxy.proxy_server, "master_key", "sk-1234")
await litellm.proxy.proxy_server.prisma_client.connect()
try:
request = GenerateKeyRequest()
key = await generate_key_fn(request)
generated_key = key.key
result = await info_key_fn(key=generated_key)
print("result from info_key_fn", result)
assert result["info"]["key_name"] is None
except Exception as e:
print("Got Exception", e)
pytest.fail(f"Got exception {e}")
@pytest.mark.asyncio()
async def test_key_name_set(prisma_client):
"""
- create key
- get key info
- assert key_name is not null
"""
setattr(litellm.proxy.proxy_server, "prisma_client", prisma_client)
setattr(litellm.proxy.proxy_server, "master_key", "sk-1234")
setattr(litellm.proxy.proxy_server, "general_settings", {"allow_user_auth": True})
await litellm.proxy.proxy_server.prisma_client.connect()
try:
request = GenerateKeyRequest()
key = await generate_key_fn(request)
generated_key = key.key
result = await info_key_fn(key=generated_key)
print("result from info_key_fn", result)
assert isinstance(result["info"]["key_name"], str)
except Exception as e:
print("Got Exception", e)
pytest.fail(f"Got exception {e}")


@@ -32,7 +32,7 @@ from litellm.proxy.proxy_server import (
 )  # Replace with the actual module where your FastAPI router is defined

 # Your bearer token
-token = ""
+token = "sk-1234"

 headers = {"Authorization": f"Bearer {token}"}


@@ -31,7 +31,7 @@ from litellm.proxy.proxy_server import (
 )  # Replace with the actual module where your FastAPI router is defined

 # Your bearer token
-token = ""
+token = "sk-1234"

 headers = {"Authorization": f"Bearer {token}"}


@@ -33,7 +33,7 @@ from litellm.proxy.proxy_server import (
 )  # Replace with the actual module where your FastAPI router is defined

 # Your bearer token
-token = ""
+token = "sk-1234"

 headers = {"Authorization": f"Bearer {token}"}


@ -714,6 +714,7 @@ class ImageResponse(OpenAIObject):
############################################################ ############################################################
def print_verbose(print_statement): def print_verbose(print_statement):
try: try:
verbose_logger.debug(print_statement)
if litellm.set_verbose: if litellm.set_verbose:
print(print_statement) # noqa print(print_statement) # noqa
except: except:
@ -2029,14 +2030,15 @@ def client(original_function):
start_time=start_time, start_time=start_time,
) )
## check if metadata is passed in ## check if metadata is passed in
litellm_params = {}
if "metadata" in kwargs: if "metadata" in kwargs:
litellm_params = {"metadata": kwargs["metadata"]} litellm_params["metadata"] = kwargs["metadata"]
logging_obj.update_environment_variables( logging_obj.update_environment_variables(
model=model, model=model,
user="", user="",
optional_params={}, optional_params={},
litellm_params=litellm_params, litellm_params=litellm_params,
) )
return logging_obj return logging_obj
except Exception as e: except Exception as e:
import logging import logging
@ -2900,6 +2902,7 @@ def cost_per_token(
completion_tokens=0, completion_tokens=0,
response_time_ms=None, response_time_ms=None,
custom_llm_provider=None, custom_llm_provider=None,
region_name=None,
): ):
""" """
Calculates the cost per token for a given model, prompt tokens, and completion tokens. Calculates the cost per token for a given model, prompt tokens, and completion tokens.
@ -2916,16 +2919,46 @@ def cost_per_token(
prompt_tokens_cost_usd_dollar = 0 prompt_tokens_cost_usd_dollar = 0
completion_tokens_cost_usd_dollar = 0 completion_tokens_cost_usd_dollar = 0
model_cost_ref = litellm.model_cost model_cost_ref = litellm.model_cost
model_with_provider = model
if custom_llm_provider is not None: if custom_llm_provider is not None:
model_with_provider = custom_llm_provider + "/" + model model_with_provider = custom_llm_provider + "/" + model
else: if region_name is not None:
model_with_provider = model model_with_provider_and_region = (
f"{custom_llm_provider}/{region_name}/{model}"
)
if (
model_with_provider_and_region in model_cost_ref
): # use region based pricing, if it's available
model_with_provider = model_with_provider_and_region
# see this https://learn.microsoft.com/en-us/azure/ai-services/openai/concepts/models # see this https://learn.microsoft.com/en-us/azure/ai-services/openai/concepts/models
verbose_logger.debug(f"Looking up model={model} in model_cost_map") print_verbose(f"Looking up model={model} in model_cost_map")
if model_with_provider in model_cost_ref:
print_verbose(
f"Success: model={model_with_provider} in model_cost_map - {model_cost_ref[model_with_provider]}"
)
print_verbose(
f"applying cost={model_cost_ref[model_with_provider].get('input_cost_per_token', None)} for prompt_tokens={prompt_tokens}"
)
prompt_tokens_cost_usd_dollar = (
model_cost_ref[model_with_provider]["input_cost_per_token"] * prompt_tokens
)
print_verbose(
f"calculated prompt_tokens_cost_usd_dollar: {prompt_tokens_cost_usd_dollar}"
)
print_verbose(
f"applying cost={model_cost_ref[model_with_provider].get('output_cost_per_token', None)} for completion_tokens={completion_tokens}"
)
completion_tokens_cost_usd_dollar = (
model_cost_ref[model_with_provider]["output_cost_per_token"]
* completion_tokens
)
print_verbose(
f"calculated completion_tokens_cost_usd_dollar: {completion_tokens_cost_usd_dollar}"
)
return prompt_tokens_cost_usd_dollar, completion_tokens_cost_usd_dollar
if model in model_cost_ref: if model in model_cost_ref:
verbose_logger.debug(f"Success: model={model} in model_cost_map") print_verbose(f"Success: model={model} in model_cost_map")
verbose_logger.debug( print_verbose(
f"prompt_tokens={prompt_tokens}; completion_tokens={completion_tokens}" f"prompt_tokens={prompt_tokens}; completion_tokens={completion_tokens}"
) )
if ( if (
@ -2943,7 +2976,7 @@ def cost_per_token(
model_cost_ref[model].get("input_cost_per_second", None) is not None model_cost_ref[model].get("input_cost_per_second", None) is not None
and response_time_ms is not None and response_time_ms is not None
): ):
verbose_logger.debug( print_verbose(
f"For model={model} - input_cost_per_second: {model_cost_ref[model].get('input_cost_per_second')}; response time: {response_time_ms}" f"For model={model} - input_cost_per_second: {model_cost_ref[model].get('input_cost_per_second')}; response time: {response_time_ms}"
) )
## COST PER SECOND ## ## COST PER SECOND ##
@ -2951,30 +2984,12 @@ def cost_per_token(
model_cost_ref[model]["input_cost_per_second"] * response_time_ms / 1000 model_cost_ref[model]["input_cost_per_second"] * response_time_ms / 1000
) )
completion_tokens_cost_usd_dollar = 0.0 completion_tokens_cost_usd_dollar = 0.0
verbose_logger.debug( print_verbose(
f"Returned custom cost for model={model} - prompt_tokens_cost_usd_dollar: {prompt_tokens_cost_usd_dollar}, completion_tokens_cost_usd_dollar: {completion_tokens_cost_usd_dollar}" f"Returned custom cost for model={model} - prompt_tokens_cost_usd_dollar: {prompt_tokens_cost_usd_dollar}, completion_tokens_cost_usd_dollar: {completion_tokens_cost_usd_dollar}"
) )
return prompt_tokens_cost_usd_dollar, completion_tokens_cost_usd_dollar return prompt_tokens_cost_usd_dollar, completion_tokens_cost_usd_dollar
elif model_with_provider in model_cost_ref:
verbose_logger.debug(
f"Looking up model={model_with_provider} in model_cost_map"
)
verbose_logger.debug(
f"applying cost={model_cost_ref[model_with_provider]['input_cost_per_token']} for prompt_tokens={prompt_tokens}"
)
prompt_tokens_cost_usd_dollar = (
model_cost_ref[model_with_provider]["input_cost_per_token"] * prompt_tokens
)
verbose_logger.debug(
f"applying cost={model_cost_ref[model_with_provider]['output_cost_per_token']} for completion_tokens={completion_tokens}"
)
completion_tokens_cost_usd_dollar = (
model_cost_ref[model_with_provider]["output_cost_per_token"]
* completion_tokens
)
return prompt_tokens_cost_usd_dollar, completion_tokens_cost_usd_dollar
elif "ft:gpt-3.5-turbo" in model: elif "ft:gpt-3.5-turbo" in model:
verbose_logger.debug(f"Cost Tracking: {model} is an OpenAI FinteTuned LLM") print_verbose(f"Cost Tracking: {model} is an OpenAI FinteTuned LLM")
# fuzzy match ft:gpt-3.5-turbo:abcd-id-cool-litellm # fuzzy match ft:gpt-3.5-turbo:abcd-id-cool-litellm
prompt_tokens_cost_usd_dollar = ( prompt_tokens_cost_usd_dollar = (
model_cost_ref["ft:gpt-3.5-turbo"]["input_cost_per_token"] * prompt_tokens model_cost_ref["ft:gpt-3.5-turbo"]["input_cost_per_token"] * prompt_tokens
@ -3031,7 +3046,10 @@ def completion_cost(
prompt="", prompt="",
messages: List = [], messages: List = [],
completion="", completion="",
total_time=0.0, # used for replicate total_time=0.0, # used for replicate, sagemaker
### REGION ###
custom_llm_provider=None,
region_name=None, # used for bedrock pricing
### IMAGE GEN ### ### IMAGE GEN ###
size=None, size=None,
quality=None, quality=None,
@ -3080,12 +3098,13 @@ def completion_cost(
model = ( model = (
model or completion_response["model"] model or completion_response["model"]
) # check if user passed an override for model, if it's none check completion_response['model'] ) # check if user passed an override for model, if it's none check completion_response['model']
if completion_response is not None and hasattr( if hasattr(completion_response, "_hidden_params"):
completion_response, "_hidden_params"
):
custom_llm_provider = completion_response._hidden_params.get( custom_llm_provider = completion_response._hidden_params.get(
"custom_llm_provider", "" "custom_llm_provider", ""
) )
region_name = completion_response._hidden_params.get(
"region_name", region_name
)
else: else:
if len(messages) > 0: if len(messages) > 0:
prompt_tokens = token_counter(model=model, messages=messages) prompt_tokens = token_counter(model=model, messages=messages)
@ -3146,8 +3165,13 @@ def completion_cost(
completion_tokens=completion_tokens, completion_tokens=completion_tokens,
custom_llm_provider=custom_llm_provider, custom_llm_provider=custom_llm_provider,
response_time_ms=total_time, response_time_ms=total_time,
region_name=region_name,
) )
return prompt_tokens_cost_usd_dollar + completion_tokens_cost_usd_dollar _final_cost = prompt_tokens_cost_usd_dollar + completion_tokens_cost_usd_dollar
print_verbose(
f"final cost: {_final_cost}; prompt_tokens_cost_usd_dollar: {prompt_tokens_cost_usd_dollar}; completion_tokens_cost_usd_dollar: {completion_tokens_cost_usd_dollar}"
)
return _final_cost
except Exception as e: except Exception as e:
raise e raise e
@ -3313,8 +3337,10 @@ def get_optional_params_image_gen(
def get_optional_params_embeddings( def get_optional_params_embeddings(
# 2 optional params # 2 optional params
model=None,
user=None, user=None,
encoding_format=None, encoding_format=None,
dimensions=None,
custom_llm_provider="", custom_llm_provider="",
**kwargs, **kwargs,
): ):
@ -3325,7 +3351,7 @@ def get_optional_params_embeddings(
for k, v in special_params.items(): for k, v in special_params.items():
passed_params[k] = v passed_params[k] = v
default_params = {"user": None, "encoding_format": None} default_params = {"user": None, "encoding_format": None, "dimensions": None}
non_default_params = { non_default_params = {
k: v k: v
@ -3333,6 +3359,19 @@ def get_optional_params_embeddings(
if (k in default_params and v != default_params[k]) if (k in default_params and v != default_params[k])
} }
## raise exception if non-default value passed for non-openai/azure embedding calls ## raise exception if non-default value passed for non-openai/azure embedding calls
if custom_llm_provider == "openai":
# 'dimensions` is only supported in `text-embedding-3` and later models
if (
model is not None
and "text-embedding-3" not in model
and "dimensions" in non_default_params.keys()
):
raise UnsupportedParamsError(
status_code=500,
message=f"Setting dimensions is not supported for OpenAI `text-embedding-3` and later models. To drop it from the call, set `litellm.drop_params = True`.",
)
if ( if (
custom_llm_provider != "openai" custom_llm_provider != "openai"
and custom_llm_provider != "azure" and custom_llm_provider != "azure"

poetry.lock generated

@@ -1158,13 +1158,13 @@ files = [

 [[package]]
 name = "openai"
-version = "1.8.0"
+version = "1.10.0"
 description = "The official Python library for the openai API"
 optional = false
 python-versions = ">=3.7.1"
 files = [
-    {file = "openai-1.8.0-py3-none-any.whl", hash = "sha256:0f8f53805826103fdd8adaf379ad3ec23f9d867e698cbc14caf34b778d150175"},
-    {file = "openai-1.8.0.tar.gz", hash = "sha256:93366be27802f517e89328801913d2a5ede45e3b86fdcab420385b8a1b88c767"},
+    {file = "openai-1.10.0-py3-none-any.whl", hash = "sha256:aa69e97d0223ace9835fbf9c997abe9ee95318f684fd2de6d02c870700c71ebc"},
+    {file = "openai-1.10.0.tar.gz", hash = "sha256:208886cb501b930dc63f48d51db9c15e5380380f80516d07332adad67c9f1053"},
 ]

 [package.dependencies]


@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "litellm"
-version = "1.19.4"
+version = "1.20.0"
 description = "Library to easily interface with LLM API providers"
 authors = ["BerriAI"]
 license = "MIT"
@@ -63,7 +63,7 @@ requires = ["poetry-core", "wheel"]
 build-backend = "poetry.core.masonry.api"

 [tool.commitizen]
-version = "1.19.4"
+version = "1.20.0"
 version_files = [
     "pyproject.toml:^version"
 ]


@@ -25,6 +25,8 @@ model LiteLLM_UserTable {
 // Generate Tokens for Proxy
 model LiteLLM_VerificationToken {
   token String @unique
+  key_name String?
+  key_alias String?
   spend Float @default(0.0)
   expires DateTime?
   models String[]
@@ -53,12 +55,13 @@ model LiteLLM_SpendLogs {
   call_type String
   api_key String @default ("")
   spend Float @default(0.0)
+  total_tokens Int @default(0)
+  prompt_tokens Int @default(0)
+  completion_tokens Int @default(0)
   startTime DateTime // Assuming start_time is a DateTime field
   endTime DateTime // Assuming end_time is a DateTime field
   model String @default("")
   user String @default("")
-  modelParameters Json @default("{}") // Assuming optional_params is a JSON field
-  usage Json @default("{}")
   metadata Json @default("{}")
   cache_hit String @default("")
   cache_key String @default("")


@ -115,7 +115,9 @@ async def chat_completion(session, key, model="gpt-4"):
print() print()
if status != 200: if status != 200:
raise Exception(f"Request did not return a 200 status code: {status}") raise Exception(
f"Request did not return a 200 status code: {status}. Response: {response_text}"
)
return await response.json() return await response.json()
@ -201,11 +203,14 @@ async def test_key_delete():
) )
async def get_key_info(session, get_key, call_key): async def get_key_info(session, call_key, get_key=None):
""" """
Make sure only models user has access to are returned Make sure only models user has access to are returned
""" """
url = f"http://0.0.0.0:4000/key/info?key={get_key}" if get_key is None:
url = "http://0.0.0.0:4000/key/info"
else:
url = f"http://0.0.0.0:4000/key/info?key={get_key}"
headers = { headers = {
"Authorization": f"Bearer {call_key}", "Authorization": f"Bearer {call_key}",
"Content-Type": "application/json", "Content-Type": "application/json",
@ -241,6 +246,9 @@ async def test_key_info():
await get_key_info(session=session, get_key=key, call_key="sk-1234") await get_key_info(session=session, get_key=key, call_key="sk-1234")
# as key itself # # as key itself #
await get_key_info(session=session, get_key=key, call_key=key) await get_key_info(session=session, get_key=key, call_key=key)
# as key itself, use the auth param, and no query key needed
await get_key_info(session=session, call_key=key)
# as random key # # as random key #
key_gen = await generate_key(session=session, i=0) key_gen = await generate_key(session=session, i=0)
random_key = key_gen["key"] random_key = key_gen["key"]
@ -281,14 +289,20 @@ async def test_key_info_spend_values():
await asyncio.sleep(5) await asyncio.sleep(5)
spend_logs = await get_spend_logs(session=session, request_id=response["id"]) spend_logs = await get_spend_logs(session=session, request_id=response["id"])
print(f"spend_logs: {spend_logs}") print(f"spend_logs: {spend_logs}")
usage = spend_logs[0]["usage"] completion_tokens = spend_logs[0]["completion_tokens"]
prompt_tokens = spend_logs[0]["prompt_tokens"]
print(f"prompt_tokens: {prompt_tokens}; completion_tokens: {completion_tokens}")
litellm.set_verbose = True
prompt_cost, completion_cost = litellm.cost_per_token( prompt_cost, completion_cost = litellm.cost_per_token(
model="gpt-35-turbo", model="gpt-35-turbo",
prompt_tokens=usage["prompt_tokens"], prompt_tokens=prompt_tokens,
completion_tokens=usage["completion_tokens"], completion_tokens=completion_tokens,
custom_llm_provider="azure", custom_llm_provider="azure",
) )
print("prompt_cost: ", prompt_cost, "completion_cost: ", completion_cost)
response_cost = prompt_cost + completion_cost response_cost = prompt_cost + completion_cost
print(f"response_cost: {response_cost}")
await asyncio.sleep(5) # allow db log to be updated await asyncio.sleep(5) # allow db log to be updated
key_info = await get_key_info(session=session, get_key=key, call_key=key) key_info = await get_key_info(session=session, get_key=key, call_key=key)
print( print(
@ -380,3 +394,31 @@ async def test_key_with_budgets():
key_info = await get_key_info(session=session, get_key=key, call_key=key) key_info = await get_key_info(session=session, get_key=key, call_key=key)
reset_at_new_value = key_info["info"]["budget_reset_at"] reset_at_new_value = key_info["info"]["budget_reset_at"]
assert reset_at_init_value != reset_at_new_value assert reset_at_init_value != reset_at_new_value
@pytest.mark.asyncio
async def test_key_crossing_budget():
"""
- Create key with budget with budget=0.00000001
- make a /chat/completions call
- wait 5s
- make a /chat/completions call - should fail with key crossed it's budget
- Check if value updated
"""
from litellm.proxy.utils import hash_token
async with aiohttp.ClientSession() as session:
key_gen = await generate_key(session=session, i=0, budget=0.0000001)
key = key_gen["key"]
hashed_token = hash_token(token=key)
print(f"hashed_token: {hashed_token}")
response = await chat_completion(session=session, key=key)
print("response 1: ", response)
await asyncio.sleep(2)
try:
response = await chat_completion(session=session, key=key)
pytest.fail("Should have failed - Key crossed it's budget")
except Exception as e:
assert "ExceededTokenBudget: Current spend for token:" in str(e)