From f681f0f2b26ceea97f1d2fd9267d4ee92dc000e0 Mon Sep 17 00:00:00 2001
From: ishaan-jaff
Date: Fri, 5 Jan 2024 13:11:23 +0530
Subject: [PATCH] (feat) completion_cost - embeddings + raise Exception

---
 litellm/__init__.py             |  7 ++++-
 litellm/tests/test_embedding.py |  7 ++++-
 litellm/utils.py                | 47 ++++++++++++++++++++++-----------
 3 files changed, 43 insertions(+), 18 deletions(-)

diff --git a/litellm/__init__.py b/litellm/__init__.py
index 8668fe850..f848dd324 100644
--- a/litellm/__init__.py
+++ b/litellm/__init__.py
@@ -338,7 +338,8 @@ baseten_models: List = [
 ]  # FALCON 7B # WizardLM # Mosaic ML
 
-# used for token counting
+# used for Cost Tracking & Token counting
+# https://azure.microsoft.com/en-in/pricing/details/cognitive-services/openai-service/
 # Azure returns gpt-35-turbo in their responses, we need to map this to azure/gpt-3.5-turbo for token counting
 azure_llms = {
     "gpt-35-turbo": "azure/gpt-35-turbo",
     "gpt-35-turbo-16k": "azure/gpt-35-turbo-16k",
@@ -346,6 +347,10 @@ azure_llms = {
     "gpt-35-turbo-instruct": "azure/gpt-35-turbo-instruct",
 }
 
+azure_embedding_models = {
+    "ada": "azure/ada",
+}
+
 petals_models = [
     "petals-team/StableBeluga2",
 ]
diff --git a/litellm/tests/test_embedding.py b/litellm/tests/test_embedding.py
index 2a86f79d7..ae59424f6 100644
--- a/litellm/tests/test_embedding.py
+++ b/litellm/tests/test_embedding.py
@@ -59,6 +59,7 @@ def test_openai_embedding():
 
 def test_openai_azure_embedding_simple():
     try:
+        litellm.set_verbose = True
         response = embedding(
             model="azure/azure-embedding-model",
             input=["good morning from litellm"],
@@ -70,11 +71,15 @@ def test_openai_azure_embedding_simple():
             response_keys
         )  # assert litellm response has expected keys from OpenAI embedding response
 
+        request_cost = litellm.completion_cost(completion_response=response)
+
+        print("Calculated request cost=", request_cost)
+
     except Exception as e:
         pytest.fail(f"Error occurred: {e}")
 
 
-# test_openai_azure_embedding_simple()
+test_openai_azure_embedding_simple()
 
 
 def test_openai_azure_embedding_timeouts():
diff --git a/litellm/utils.py b/litellm/utils.py
index 42c9b4157..3f3978dd2 100644
--- a/litellm/utils.py
+++ b/litellm/utils.py
@@ -2740,6 +2740,8 @@ def cost_per_token(model="", prompt_tokens=0, completion_tokens=0):
     completion_tokens_cost_usd_dollar = 0
     model_cost_ref = litellm.model_cost
     # see this https://learn.microsoft.com/en-us/azure/ai-services/openai/concepts/models
+    print_verbose(f"Looking up model={model} in model_cost_map")
+
     if model in model_cost_ref:
         prompt_tokens_cost_usd_dollar = (
             model_cost_ref[model]["input_cost_per_token"] * prompt_tokens
@@ -2749,6 +2751,7 @@ def cost_per_token(model="", prompt_tokens=0, completion_tokens=0):
         )
         return prompt_tokens_cost_usd_dollar, completion_tokens_cost_usd_dollar
     elif "ft:gpt-3.5-turbo" in model:
+        print_verbose(f"Cost Tracking: {model} is an OpenAI Fine-Tuned LLM")
         # fuzzy match ft:gpt-3.5-turbo:abcd-id-cool-litellm
         prompt_tokens_cost_usd_dollar = (
             model_cost_ref["ft:gpt-3.5-turbo"]["input_cost_per_token"] * prompt_tokens
@@ -2759,6 +2762,7 @@ def cost_per_token(model="", prompt_tokens=0, completion_tokens=0):
         )
         return prompt_tokens_cost_usd_dollar, completion_tokens_cost_usd_dollar
     elif model in litellm.azure_llms:
+        print_verbose(f"Cost Tracking: {model} is an Azure LLM")
         model = litellm.azure_llms[model]
         prompt_tokens_cost_usd_dollar = (
             model_cost_ref[model]["input_cost_per_token"] * prompt_tokens
@@ -2767,19 +2771,29 @@ def cost_per_token(model="", prompt_tokens=0, completion_tokens=0):
             model_cost_ref[model]["output_cost_per_token"] * completion_tokens
         )
         return prompt_tokens_cost_usd_dollar, completion_tokens_cost_usd_dollar
-    else:
-        # calculate average input cost, azure/gpt-deployments can potentially go here if users don't specify, gpt-4, gpt-3.5-turbo. LLMs litellm knows
-        input_cost_sum = 0
-        output_cost_sum = 0
-        model_cost_ref = litellm.model_cost
-        for model in model_cost_ref:
-            input_cost_sum += model_cost_ref[model]["input_cost_per_token"]
-            output_cost_sum += model_cost_ref[model]["output_cost_per_token"]
-        avg_input_cost = input_cost_sum / len(model_cost_ref.keys())
-        avg_output_cost = output_cost_sum / len(model_cost_ref.keys())
-        prompt_tokens_cost_usd_dollar = avg_input_cost * prompt_tokens
-        completion_tokens_cost_usd_dollar = avg_output_cost * completion_tokens
+    elif model in litellm.azure_embedding_models:
+        print_verbose(f"Cost Tracking: {model} is an Azure Embedding Model")
+        model = litellm.azure_embedding_models[model]
+        prompt_tokens_cost_usd_dollar = (
+            model_cost_ref[model]["input_cost_per_token"] * prompt_tokens
+        )
+        completion_tokens_cost_usd_dollar = (
+            model_cost_ref[model]["output_cost_per_token"] * completion_tokens
+        )
         return prompt_tokens_cost_usd_dollar, completion_tokens_cost_usd_dollar
+    else:
+        # model is not in model_prices_and_context_window.json - raise an exception to let users know
+        error_str = f"Model not in model_prices_and_context_window.json. You passed model={model}\n"
+        raise litellm.exceptions.NotFoundError(  # type: ignore
+            message=error_str,
+            model=model,
+            response=httpx.Response(
+                status_code=404,
+                content=error_str,
+                request=httpx.Request(method="cost_per_token", url="https://github.com/BerriAI/litellm"),  # type: ignore
+            ),
+            llm_provider="",
+        )
 
 
 def completion_cost(
@@ -2821,8 +2835,10 @@ def completion_cost(
     completion_tokens = 0
     if completion_response is not None:
         # get input/output tokens from completion_response
-        prompt_tokens = completion_response["usage"]["prompt_tokens"]
-        completion_tokens = completion_response["usage"]["completion_tokens"]
+        prompt_tokens = completion_response.get("usage", {}).get("prompt_tokens", 0)
+        completion_tokens = completion_response.get("usage", {}).get(
+            "completion_tokens", 0
+        )
         model = (
             model or completion_response["model"]
         )  # check if user passed an override for model, if it's none check completion_response['model']
@@ -2852,8 +2868,7 @@ def completion_cost(
         )
         return prompt_tokens_cost_usd_dollar + completion_tokens_cost_usd_dollar
     except Exception as e:
-        print_verbose(f"LiteLLM: Excepton when cost calculating {str(e)}")
-        return 0.0  # this should not block a users execution path
+        raise e
 
 
 ####### HELPER FUNCTIONS ################
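
Usage sketch (not part of the patch): the new azure_embedding_models dict lets cost_per_token resolve the bare "ada" deployment name that Azure returns to the priced "azure/ada" entry, mirroring the existing azure_llms remapping for chat models. A minimal illustration, assuming this patch is applied and "azure/ada" has per-token prices in model_prices_and_context_window.json:

    from litellm.utils import cost_per_token

    # "ada" (as Azure reports it) is remapped to "azure/ada" before the
    # price lookup, the same way azure_llms remaps chat deployments
    prompt_usd, completion_usd = cost_per_token(
        model="ada", prompt_tokens=1000, completion_tokens=0
    )
    print(f"cost for 1000 embedding tokens: ${prompt_usd + completion_usd:.8f}")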
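
End to end, the updated test drives the same path through completion_cost(); the defensive completion_response.get("usage", {}).get(...) reads matter here because embedding responses may not populate completion_tokens. A sketch of that flow, assuming Azure OpenAI credentials are configured and an "azure-embedding-model" deployment exists (the deployment name comes from the test and is not guaranteed in your account):

    import litellm
    from litellm import embedding

    response = embedding(
        model="azure/azure-embedding-model",
        input=["good morning from litellm"],
    )
    # embedding responses may omit usage.completion_tokens; completion_cost
    # now defaults missing counts to 0 instead of raising a KeyError
    request_cost = litellm.completion_cost(completion_response=response)
    print("Calculated request cost=", request_cost)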
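
Behavioral note: for unrecognized model names, cost_per_token previously fell back to an input/output price averaged across every known model, and completion_cost swallowed errors and returned 0.0; after this patch both raise instead. A sketch of catching the new error (the model name is illustrative):

    import litellm
    from litellm.utils import cost_per_token

    try:
        cost_per_token(model="my-unknown-model", prompt_tokens=10, completion_tokens=5)
    except litellm.exceptions.NotFoundError as e:
        # the error wraps a synthetic 404 httpx.Response naming the missing model
        print(f"Cost lookup failed: {e}")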