From f5da95685a4f384a47b0af5631b2112283a3daa6 Mon Sep 17 00:00:00 2001 From: Krrish Dholakia Date: Fri, 26 Jan 2024 14:53:58 -0800 Subject: [PATCH] feat(utils.py): support region based pricing for bedrock + use bedrock's token counts if given --- litellm/budget_manager.py | 11 +++- litellm/llms/bedrock.py | 15 ++++- litellm/main.py | 4 ++ litellm/tests/test_completion_cost.py | 70 ++++++++++++++++++++- litellm/utils.py | 87 +++++++++++++++++---------- 5 files changed, 150 insertions(+), 37 deletions(-) diff --git a/litellm/budget_manager.py b/litellm/budget_manager.py index 036474197..841015753 100644 --- a/litellm/budget_manager.py +++ b/litellm/budget_manager.py @@ -1,3 +1,12 @@ +# +-----------------------------------------------+ +# | | +# | NOT PROXY BUDGET MANAGER | +# | proxy budget manager is in proxy_server.py | +# | | +# +-----------------------------------------------+ +# +# Thank you users! We ❤️ you! - Krrish & Ishaan + import os, json, time import litellm from litellm.utils import ModelResponse @@ -16,7 +25,7 @@ class BudgetManager: self.client_type = client_type self.project_name = project_name self.api_base = api_base or "https://api.litellm.ai" - self.headers = headers or {'Content-Type': 'application/json'} + self.headers = headers or {"Content-Type": "application/json"} ## load the data or init the initial dictionaries self.load_data() diff --git a/litellm/llms/bedrock.py b/litellm/llms/bedrock.py index 4c36137da..bcf35c3d1 100644 --- a/litellm/llms/bedrock.py +++ b/litellm/llms/bedrock.py @@ -659,9 +659,16 @@ def completion( ) ## CALCULATING USAGE - baseten charges on time, not tokens - have some mapping of cost here. - prompt_tokens = len(encoding.encode(prompt)) - completion_tokens = len( - encoding.encode(model_response["choices"][0]["message"].get("content", "")) + prompt_tokens = response_metadata.get( + "x-amzn-bedrock-input-token-count", len(encoding.encode(prompt)) + ) + completion_tokens = response_metadata.get( + "x-amzn-bedrock-output-token-count", + len( + encoding.encode( + model_response["choices"][0]["message"].get("content", "") + ) + ), ) model_response["created"] = int(time.time()) @@ -672,6 +679,8 @@ def completion( total_tokens=prompt_tokens + completion_tokens, ) model_response.usage = usage + model_response._hidden_params["region_name"] = client.meta.region_name + print_verbose(f"model_response._hidden_params: {model_response._hidden_params}") return model_response except BedrockError as e: exception_mapping_worked = True diff --git a/litellm/main.py b/litellm/main.py index f9f1139f6..01edd3ea7 100644 --- a/litellm/main.py +++ b/litellm/main.py @@ -586,6 +586,10 @@ def completion( ) if model_response is not None and hasattr(model_response, "_hidden_params"): model_response._hidden_params["custom_llm_provider"] = custom_llm_provider + model_response._hidden_params["region_name"] = kwargs.get( + "aws_region_name", None + ) # support region-based pricing for bedrock + ### REGISTER CUSTOM MODEL PRICING -- IF GIVEN ### if input_cost_per_token is not None and output_cost_per_token is not None: litellm.register_model( diff --git a/litellm/tests/test_completion_cost.py b/litellm/tests/test_completion_cost.py index 505f28981..b117223ab 100644 --- a/litellm/tests/test_completion_cost.py +++ b/litellm/tests/test_completion_cost.py @@ -124,7 +124,7 @@ def test_cost_azure_gpt_35(): ) -test_cost_azure_gpt_35() +# test_cost_azure_gpt_35() def test_cost_azure_embedding(): @@ -165,3 +165,71 @@ def test_cost_openai_image_gen(): model="dall-e-2", size="1024-x-1024", 
quality="standard", n=1 ) assert cost == 0.019922944 + + +def test_cost_bedrock_pricing(): + """ + - get pricing specific to region for a model + """ + from litellm import ModelResponse, Choices, Message + from litellm.utils import Usage + + litellm.set_verbose = True + input_tokens = litellm.token_counter( + model="bedrock/anthropic.claude-instant-v1", + messages=[{"role": "user", "content": "Hey, how's it going?"}], + ) + print(f"input_tokens: {input_tokens}") + output_tokens = litellm.token_counter( + model="bedrock/anthropic.claude-instant-v1", + text="It's all going well", + count_response_tokens=True, + ) + print(f"output_tokens: {output_tokens}") + resp = ModelResponse( + id="chatcmpl-e41836bb-bb8b-4df2-8e70-8f3e160155ac", + choices=[ + Choices( + finish_reason=None, + index=0, + message=Message( + content="It's all going well", + role="assistant", + ), + ) + ], + created=1700775391, + model="anthropic.claude-instant-v1", + object="chat.completion", + system_fingerprint=None, + usage=Usage( + prompt_tokens=input_tokens, + completion_tokens=output_tokens, + total_tokens=input_tokens + output_tokens, + ), + ) + resp._hidden_params = { + "custom_llm_provider": "bedrock", + "region_name": "ap-northeast-1", + } + + cost = litellm.completion_cost( + model="anthropic.claude-instant-v1", + completion_response=resp, + messages=[{"role": "user", "content": "Hey, how's it going?"}], + ) + predicted_cost = input_tokens * 0.00000223 + 0.00000755 * output_tokens + assert cost == predicted_cost + + +def test_cost_bedrock_pricing_actual_calls(): + litellm.set_verbose = True + model = "anthropic.claude-instant-v1" + messages = [{"role": "user", "content": "Hey, how's it going?"}] + response = litellm.completion(model=model, messages=messages) + assert response._hidden_params["region_name"] is not None + cost = litellm.completion_cost( + completion_response=response, + messages=[{"role": "user", "content": "Hey, how's it going?"}], + ) + assert cost > 0 diff --git a/litellm/utils.py b/litellm/utils.py index b0e48bbc6..91b3a0f0a 100644 --- a/litellm/utils.py +++ b/litellm/utils.py @@ -714,6 +714,7 @@ class ImageResponse(OpenAIObject): ############################################################ def print_verbose(print_statement): try: + verbose_logger.debug(print_statement) if litellm.set_verbose: print(print_statement) # noqa except: @@ -2900,6 +2901,7 @@ def cost_per_token( completion_tokens=0, response_time_ms=None, custom_llm_provider=None, + region_name=None, ): """ Calculates the cost per token for a given model, prompt tokens, and completion tokens. 
@@ -2916,16 +2918,46 @@ def cost_per_token( prompt_tokens_cost_usd_dollar = 0 completion_tokens_cost_usd_dollar = 0 model_cost_ref = litellm.model_cost + model_with_provider = model if custom_llm_provider is not None: model_with_provider = custom_llm_provider + "/" + model - else: - model_with_provider = model + if region_name is not None: + model_with_provider_and_region = ( + f"{custom_llm_provider}/{region_name}/{model}" + ) + if ( + model_with_provider_and_region in model_cost_ref + ): # use region based pricing, if it's available + model_with_provider = model_with_provider_and_region # see this https://learn.microsoft.com/en-us/azure/ai-services/openai/concepts/models - verbose_logger.debug(f"Looking up model={model} in model_cost_map") - + print_verbose(f"Looking up model={model} in model_cost_map") + if model_with_provider in model_cost_ref: + print_verbose( + f"Success: model={model_with_provider} in model_cost_map - {model_cost_ref[model_with_provider]}" + ) + print_verbose( + f"applying cost={model_cost_ref[model_with_provider]['input_cost_per_token']} for prompt_tokens={prompt_tokens}" + ) + prompt_tokens_cost_usd_dollar = ( + model_cost_ref[model_with_provider]["input_cost_per_token"] * prompt_tokens + ) + print_verbose( + f"calculated prompt_tokens_cost_usd_dollar: {prompt_tokens_cost_usd_dollar}" + ) + print_verbose( + f"applying cost={model_cost_ref[model_with_provider]['output_cost_per_token']} for completion_tokens={completion_tokens}" + ) + completion_tokens_cost_usd_dollar = ( + model_cost_ref[model_with_provider]["output_cost_per_token"] + * completion_tokens + ) + print_verbose( + f"calculated completion_tokens_cost_usd_dollar: {completion_tokens_cost_usd_dollar}" + ) + return prompt_tokens_cost_usd_dollar, completion_tokens_cost_usd_dollar if model in model_cost_ref: - verbose_logger.debug(f"Success: model={model} in model_cost_map") - verbose_logger.debug( + print_verbose(f"Success: model={model} in model_cost_map") + print_verbose( f"prompt_tokens={prompt_tokens}; completion_tokens={completion_tokens}" ) if ( @@ -2943,7 +2975,7 @@ def cost_per_token( model_cost_ref[model].get("input_cost_per_second", None) is not None and response_time_ms is not None ): - verbose_logger.debug( + print_verbose( f"For model={model} - input_cost_per_second: {model_cost_ref[model].get('input_cost_per_second')}; response time: {response_time_ms}" ) ## COST PER SECOND ## @@ -2951,30 +2983,12 @@ def cost_per_token( model_cost_ref[model]["input_cost_per_second"] * response_time_ms / 1000 ) completion_tokens_cost_usd_dollar = 0.0 - verbose_logger.debug( + print_verbose( f"Returned custom cost for model={model} - prompt_tokens_cost_usd_dollar: {prompt_tokens_cost_usd_dollar}, completion_tokens_cost_usd_dollar: {completion_tokens_cost_usd_dollar}" ) return prompt_tokens_cost_usd_dollar, completion_tokens_cost_usd_dollar - elif model_with_provider in model_cost_ref: - verbose_logger.debug( - f"Looking up model={model_with_provider} in model_cost_map" - ) - verbose_logger.debug( - f"applying cost={model_cost_ref[model_with_provider]['input_cost_per_token']} for prompt_tokens={prompt_tokens}" - ) - prompt_tokens_cost_usd_dollar = ( - model_cost_ref[model_with_provider]["input_cost_per_token"] * prompt_tokens - ) - verbose_logger.debug( - f"applying cost={model_cost_ref[model_with_provider]['output_cost_per_token']} for completion_tokens={completion_tokens}" - ) - completion_tokens_cost_usd_dollar = ( - model_cost_ref[model_with_provider]["output_cost_per_token"] - * completion_tokens - ) - return 
prompt_tokens_cost_usd_dollar, completion_tokens_cost_usd_dollar elif "ft:gpt-3.5-turbo" in model: - verbose_logger.debug(f"Cost Tracking: {model} is an OpenAI FinteTuned LLM") + print_verbose(f"Cost Tracking: {model} is an OpenAI FinteTuned LLM") # fuzzy match ft:gpt-3.5-turbo:abcd-id-cool-litellm prompt_tokens_cost_usd_dollar = ( model_cost_ref["ft:gpt-3.5-turbo"]["input_cost_per_token"] * prompt_tokens @@ -3031,7 +3045,10 @@ def completion_cost( prompt="", messages: List = [], completion="", - total_time=0.0, # used for replicate + total_time=0.0, # used for replicate, sagemaker + ### REGION ### + custom_llm_provider=None, + region_name=None, # used for bedrock pricing ### IMAGE GEN ### size=None, quality=None, @@ -3080,12 +3097,13 @@ def completion_cost( model = ( model or completion_response["model"] ) # check if user passed an override for model, if it's none check completion_response['model'] - if completion_response is not None and hasattr( - completion_response, "_hidden_params" - ): + if hasattr(completion_response, "_hidden_params"): custom_llm_provider = completion_response._hidden_params.get( "custom_llm_provider", "" ) + region_name = completion_response._hidden_params.get( + "region_name", region_name + ) else: if len(messages) > 0: prompt_tokens = token_counter(model=model, messages=messages) @@ -3146,8 +3164,13 @@ def completion_cost( completion_tokens=completion_tokens, custom_llm_provider=custom_llm_provider, response_time_ms=total_time, + region_name=region_name, ) - return prompt_tokens_cost_usd_dollar + completion_tokens_cost_usd_dollar + _final_cost = prompt_tokens_cost_usd_dollar + completion_tokens_cost_usd_dollar + print_verbose( + f"final cost: {_final_cost}; prompt_tokens_cost_usd_dollar: {prompt_tokens_cost_usd_dollar}; completion_tokens_cost_usd_dollar: {completion_tokens_cost_usd_dollar}" + ) + return _final_cost except Exception as e: raise e
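For reference, the lookup order this patch introduces in cost_per_token can be sketched as follows. This is an illustrative sketch only, not part of the patch: the model_cost entries below are stand-ins (the ap-northeast-1 prices mirror the values asserted in test_cost_bedrock_pricing; the base-model prices are placeholders), and real pricing lives in litellm's model_cost map.

# Illustrative sketch of the region-aware pricing lookup (not part of the patch).
model_cost = {
    "anthropic.claude-instant-v1": {
        "input_cost_per_token": 1.63e-06,   # placeholder base price
        "output_cost_per_token": 5.51e-06,  # placeholder base price
    },
    "bedrock/ap-northeast-1/anthropic.claude-instant-v1": {
        "input_cost_per_token": 2.23e-06,   # matches test_cost_bedrock_pricing
        "output_cost_per_token": 7.55e-06,  # matches test_cost_bedrock_pricing
    },
}


def resolve_pricing_key(model, custom_llm_provider=None, region_name=None):
    """Prefer provider/region/model, then provider/model, then the bare model name."""
    model_with_provider = model
    if custom_llm_provider is not None:
        model_with_provider = f"{custom_llm_provider}/{model}"
        if region_name is not None:
            candidate = f"{custom_llm_provider}/{region_name}/{model}"
            if candidate in model_cost:  # use region-based pricing, if it's available
                model_with_provider = candidate
    return model_with_provider if model_with_provider in model_cost else model


print(resolve_pricing_key("anthropic.claude-instant-v1", "bedrock", "ap-northeast-1"))
# -> "bedrock/ap-northeast-1/anthropic.claude-instant-v1" (region-specific pricing)
print(resolve_pricing_key("anthropic.claude-instant-v1", "bedrock", "us-east-1"))
# -> "anthropic.claude-instant-v1" (no region entry and no "bedrock/..." entry in this
#    stand-in map, so pricing falls back to the bare model name)

This precedence is what lets completion_cost pick up region_name from a response's _hidden_params (set in bedrock.py and main.py above) and charge region-specific rates without callers passing the region explicitly.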