From 3f965df68b03e431be99df82b00a9811d544ac8f Mon Sep 17 00:00:00 2001
From: Krrish Dholakia
Date: Thu, 11 Jul 2024 11:52:18 -0700
Subject: [PATCH] fix(llm_cost_calc/google.py): fix google embedding cost calculation

Fixes https://github.com/BerriAI/litellm/issues/4630
---
 litellm/cost_calculator.py                    | 30 +++++---
 litellm/integrations/slack_alerting.py        |  6 +-
 litellm/litellm_core_utils/litellm_logging.py |  2 +-
 .../llm_cost_calc/google.py                   | 28 ++++++-
 litellm/proxy/_new_secret_config.yaml         | 12 ++-
 litellm/tests/test_completion_cost.py         | 73 +++++++++++++++++++
 6 files changed, 133 insertions(+), 18 deletions(-)

diff --git a/litellm/cost_calculator.py b/litellm/cost_calculator.py
index 0bc65a7f1d..13a9e4bdc6 100644
--- a/litellm/cost_calculator.py
+++ b/litellm/cost_calculator.py
@@ -15,10 +15,12 @@ from litellm.litellm_core_utils.llm_cost_calc.google import (
 from litellm.litellm_core_utils.llm_cost_calc.google import (
     cost_per_token as google_cost_per_token,
 )
+from litellm.litellm_core_utils.llm_cost_calc.google import (
+    cost_router as google_cost_router,
+)
 from litellm.litellm_core_utils.llm_cost_calc.utils import _generic_cost_per_character
 from litellm.types.llms.openai import HttpxBinaryResponseContent
 from litellm.types.router import SPECIAL_MODEL_INFO_PARAMS
-
 from litellm.utils import (
     CallTypes,
     CostPerToken,
@@ -160,22 +162,32 @@ def cost_per_token(
     # see this https://learn.microsoft.com/en-us/azure/ai-services/openai/concepts/models
     print_verbose(f"Looking up model={model} in model_cost_map")
 
-    if custom_llm_provider == "vertex_ai" and "claude" in model:
-        return google_cost_per_token(
-            model=model_without_prefix,
-            custom_llm_provider=custom_llm_provider,
-            prompt_tokens=prompt_tokens,
-            completion_tokens=completion_tokens,
-        )
     if custom_llm_provider == "vertex_ai":
-        return google_cost_per_character(
+        cost_router = google_cost_router(
             model=model_without_prefix,
             custom_llm_provider=custom_llm_provider,
             prompt_characters=prompt_characters,
             completion_characters=completion_characters,
             prompt_tokens=prompt_tokens,
             completion_tokens=completion_tokens,
+            call_type=call_type,
         )
+        if cost_router == "cost_per_character":
+            return google_cost_per_character(
+                model=model_without_prefix,
+                custom_llm_provider=custom_llm_provider,
+                prompt_characters=prompt_characters,
+                completion_characters=completion_characters,
+                prompt_tokens=prompt_tokens,
+                completion_tokens=completion_tokens,
+            )
+        elif cost_router == "cost_per_token":
+            return google_cost_per_token(
+                model=model_without_prefix,
+                custom_llm_provider=custom_llm_provider,
+                prompt_tokens=prompt_tokens,
+                completion_tokens=completion_tokens,
+            )
     elif custom_llm_provider == "gemini":
         return google_cost_per_token(
             model=model_without_prefix,
diff --git a/litellm/integrations/slack_alerting.py b/litellm/integrations/slack_alerting.py
index 437e8ce135..b7b62b61f6 100644
--- a/litellm/integrations/slack_alerting.py
+++ b/litellm/integrations/slack_alerting.py
@@ -1530,9 +1530,9 @@ Model Info:
         """Log deployment latency"""
         try:
             if "daily_reports" in self.alert_types:
-                model_id = (
-                    kwargs.get("litellm_params", {}).get("model_info", {}).get("id", "")
-                )
+                litellm_params = kwargs.get("litellm_params", {}) or {}
+                model_info = litellm_params.get("model_info", {}) or {}
+                model_id = model_info.get("id", "") or ""
                 response_s: timedelta = end_time - start_time
 
                 final_value = response_s
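
For context: with the routing change above, Vertex AI requests now pass through google_cost_router before any cost math runs. A minimal sketch of exercising the new path through litellm's public cost_per_token helper, mirroring the unit test added later in this patch (assumes the patch is applied; the token count is a hypothetical value for a short input):

import os

import litellm
from litellm import cost_per_token

# Use the cost map bundled with the package instead of fetching it remotely.
os.environ["LITELLM_LOCAL_MODEL_COST_MAP"] = "True"
litellm.model_cost = litellm.get_model_cost_map(url="")

# Embedding call types now route to the per-token calculation rather than the
# per-character path that issue #4630 reports mis-handling embedding models.
prompt_cost, completion_cost = cost_per_token(
    model="textembedding-gecko",
    custom_llm_provider="vertex_ai",
    prompt_tokens=9,  # hypothetical token count
    call_type="aembedding",
)
print(prompt_cost, completion_cost)
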
diff --git a/litellm/litellm_core_utils/litellm_logging.py b/litellm/litellm_core_utils/litellm_logging.py
index 0271c57147..0edc90325d 100644
--- a/litellm/litellm_core_utils/litellm_logging.py
+++ b/litellm/litellm_core_utils/litellm_logging.py
@@ -1275,7 +1275,7 @@ class Logging:
                     f"Model={self.model}; cost={self.model_call_details['response_cost']}"
                 )
             except litellm.NotFoundError as e:
-                verbose_logger.error(
+                verbose_logger.warning(
                     f"Model={self.model} not found in completion cost map. Setting 'response_cost' to None"
                 )
                 self.model_call_details["response_cost"] = None
diff --git a/litellm/litellm_core_utils/llm_cost_calc/google.py b/litellm/litellm_core_utils/llm_cost_calc/google.py
index 2c958cf88a..76da0da51e 100644
--- a/litellm/litellm_core_utils/llm_cost_calc/google.py
+++ b/litellm/litellm_core_utils/llm_cost_calc/google.py
@@ -1,7 +1,7 @@
 # What is this?
 ## Cost calculation for Google AI Studio / Vertex AI models
 import traceback
-from typing import List, Literal, Optional, Tuple
+from typing import List, Literal, Optional, Tuple, Union
 
 import litellm
 from litellm import verbose_logger
@@ -29,6 +29,32 @@ def _is_above_128k(tokens: float) -> bool:
     return False
 
 
+def cost_router(
+    model: str,
+    custom_llm_provider: str,
+    prompt_tokens: float,
+    completion_tokens: float,
+    prompt_characters: float,
+    completion_characters: float,
+    call_type: Union[Literal["embedding", "aembedding"], str],
+) -> Literal["cost_per_character", "cost_per_token"]:
+    """
+    Route the cost calc to the right place, based on model/call_type/etc.
+
+    Returns
+        - str, the specific google cost calc function it should route to.
+    """
+    if custom_llm_provider == "vertex_ai" and "claude" in model:
+        return "cost_per_token"
+    elif custom_llm_provider == "gemini":
+        return "cost_per_token"
+    elif custom_llm_provider == "vertex_ai" and (
+        call_type == "embedding" or call_type == "aembedding"
+    ):
+        return "cost_per_token"
+    return "cost_per_character"
+
+
 def cost_per_character(
     model: str,
     custom_llm_provider: str,
diff --git a/litellm/proxy/_new_secret_config.yaml b/litellm/proxy/_new_secret_config.yaml
index 0f1f981d7a..a8c9e88233 100644
--- a/litellm/proxy/_new_secret_config.yaml
+++ b/litellm/proxy/_new_secret_config.yaml
@@ -1,10 +1,14 @@
 model_list:
-  - model_name: "*"
+  - model_name: azure-ai-mistral
     litellm_params:
-      model: "openai/*"
-  - model_name: claude-3-5-sonnet-20240620
+      api_base: os.environ/AZURE_AI_MISTRAL_API_BASE
+      api_key: os.environ/AZURE_AI_MISTRAL_API_KEY
+      model: azure_ai/Mistral-large-nmefg
+  - model_name: azure-ai-phi
     litellm_params:
-      model: gpt-3.5-turbo
+      api_base: os.environ/AZURE_AI_PHI_API_BASE
+      api_key: os.environ/AZURE_AI_PHI_API_KEY
+      model: azure_ai/Phi-3-medium-128k-instruct-fpmvj
 
 
 general_settings:
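
Since cost_router is a pure function, the routing table it implements can be checked in isolation. A small illustration of the branches above (model names and counts here are examples only):

from litellm.litellm_core_utils.llm_cost_calc.google import cost_router

# Vertex AI embedding (sync or async) call types are priced per token.
assert cost_router(
    model="textembedding-gecko",
    custom_llm_provider="vertex_ai",
    prompt_tokens=9,
    completion_tokens=0,
    prompt_characters=44,
    completion_characters=0,
    call_type="aembedding",
) == "cost_per_token"

# Ordinary Vertex AI completions still fall through to per-character pricing.
assert cost_router(
    model="gemini-1.5-pro",
    custom_llm_provider="vertex_ai",
    prompt_tokens=9,
    completion_tokens=5,
    prompt_characters=44,
    completion_characters=25,
    call_type="completion",
) == "cost_per_character"
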
diff --git a/litellm/tests/test_completion_cost.py b/litellm/tests/test_completion_cost.py
index 1b4df0ecc0..5d30ccb036 100644
--- a/litellm/tests/test_completion_cost.py
+++ b/litellm/tests/test_completion_cost.py
@@ -712,6 +712,79 @@ def test_vertex_ai_claude_completion_cost():
     assert cost == predicted_cost
 
 
+def test_vertex_ai_embedding_completion_cost(caplog):
+    """
+    Relevant issue - https://github.com/BerriAI/litellm/issues/4630
+    """
+    os.environ["LITELLM_LOCAL_MODEL_COST_MAP"] = "True"
+    litellm.model_cost = litellm.get_model_cost_map(url="")
+
+    text = "The quick brown fox jumps over the lazy dog."
+    input_tokens = litellm.token_counter(
+        model="vertex_ai/textembedding-gecko", text=text
+    )
+
+    model_info = litellm.get_model_info(model="vertex_ai/textembedding-gecko")
+
+    print("\nExpected model info:\n{}\n\n".format(model_info))
+
+    expected_input_cost = input_tokens * model_info["input_cost_per_token"]
+
+    ## CALCULATED COST
+    calculated_input_cost, calculated_output_cost = cost_per_token(
+        model="textembedding-gecko",
+        custom_llm_provider="vertex_ai",
+        prompt_tokens=input_tokens,
+        call_type="aembedding",
+    )
+
+    assert round(expected_input_cost, 6) == round(calculated_input_cost, 6)
+    print("expected_input_cost: {}".format(expected_input_cost))
+    print("calculated_input_cost: {}".format(calculated_input_cost))
+
+    captured_logs = [rec.message for rec in caplog.records]
+    for item in captured_logs:
+        print("\nitem:{}\n".format(item))
+        if (
+            "litellm.litellm_core_utils.llm_cost_calc.google.cost_per_character(): Exception occured "
+            in item
+        ):
+            raise Exception("Error log raised for calculating embedding cost")
+
+
+# def test_vertex_ai_embedding_completion_cost_e2e():
+#     """
+#     Relevant issue - https://github.com/BerriAI/litellm/issues/4630
+#     """
+#     from litellm.tests.test_amazing_vertex_completion import load_vertex_ai_credentials
+
+#     load_vertex_ai_credentials()
+#     os.environ["LITELLM_LOCAL_MODEL_COST_MAP"] = "True"
+#     litellm.model_cost = litellm.get_model_cost_map(url="")
+
+#     text = "The quick brown fox jumps over the lazy dog."
+#     input_tokens = litellm.token_counter(
+#         model="vertex_ai/textembedding-gecko", text=text
+#     )
+
+#     model_info = litellm.get_model_info(model="vertex_ai/textembedding-gecko")
+
+#     print("\nExpected model info:\n{}\n\n".format(model_info))
+
+#     expected_input_cost = input_tokens * model_info["input_cost_per_token"]
+
+#     ## CALCULATED COST
+#     resp = litellm.embedding(model="textembedding-gecko", input=[text])
+
+#     calculated_input_cost = resp._hidden_params["response_cost"]
+
+#     assert round(expected_input_cost, 6) == round(calculated_input_cost, 6)
+#     print("expected_input_cost: {}".format(expected_input_cost))
+#     print("calculated_input_cost: {}".format(calculated_input_cost))
+
+#     assert False
+
+
 @pytest.mark.parametrize("sync_mode", [True, False])
 @pytest.mark.asyncio
 async def test_completion_cost_hidden_params(sync_mode):
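
The commented-out e2e variant above shows where this cost ultimately surfaces at runtime. A condensed sketch, assuming working Vertex AI credentials in the environment (the provider-prefixed model name is used here so litellm routes to Vertex AI):

import litellm

# The embedding response carries the computed cost in its hidden params.
resp = litellm.embedding(
    model="vertex_ai/textembedding-gecko",
    input=["The quick brown fox jumps over the lazy dog."],
)
print(resp._hidden_params["response_cost"])
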