(feat) add cost tracking for OpenAI prompt caching (#6055)
* add cache_read_input_token_cost for prompt caching models
* add prompt caching for latest models
* add openai cost calculator
* add openai prompt caching test
* fix lint check
* add note on how usage._cache_read_input_tokens is used
* fix cost calc whisper openai
* use output_cost_per_second
* add input_cost_per_second
parent 930606ad63
commit 3682f661d8
5 changed files with 202 additions and 32 deletions
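At a glance, the pricing change: cached prompt tokens are billed at a discounted cache_read_input_token_cost rather than the normal input rate. A minimal arithmetic sketch of the formula the new test asserts, using made-up prices (the real rates come from litellm's model cost map):

# Hypothetical per-token rates, for illustration only; real values live in litellm's model cost map.
input_cost_per_token = 0.15 / 1e6            # assumed price per non-cached prompt token
output_cost_per_token = 0.60 / 1e6           # assumed price per completion token
cache_read_input_token_cost = 0.075 / 1e6    # assumed discounted price per cached prompt token

prompt_tokens = 0        # non-cached prompt tokens (mirrors the cache-hit case in the new test)
cached_tokens = 14       # usage.prompt_tokens_details.cached_tokens
completion_tokens = 10

prompt_cost = prompt_tokens * input_cost_per_token + cached_tokens * cache_read_input_token_cost
completion_cost = completion_tokens * output_cost_per_token
print(prompt_cost + completion_cost)  # total request cost in USD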
@@ -34,6 +34,7 @@ from litellm.llms.databricks.cost_calculator import (
 from litellm.llms.fireworks_ai.cost_calculator import (
     cost_per_token as fireworks_ai_cost_per_token,
 )
+from litellm.llms.OpenAI.cost_calculation import cost_per_token as openai_cost_per_token
 from litellm.llms.together_ai.cost_calculator import get_model_params_and_category
 from litellm.rerank_api.types import RerankResponse
 from litellm.types.llms.openai import HttpxBinaryResponseContent
@@ -55,7 +56,7 @@ from litellm.utils import (
 def _cost_per_token_custom_pricing_helper(
     prompt_tokens: float = 0,
     completion_tokens: float = 0,
-    response_time_ms=None,
+    response_time_ms: Optional[float] = 0.0,
     ### CUSTOM PRICING ###
     custom_cost_per_token: Optional[CostPerToken] = None,
     custom_cost_per_second: Optional[float] = None,
@@ -79,7 +80,7 @@ def cost_per_token(
     model: str = "",
     prompt_tokens: int = 0,
     completion_tokens: int = 0,
-    response_time_ms=None,
+    response_time_ms: Optional[float] = 0.0,
     custom_llm_provider: Optional[str] = None,
     region_name=None,
     ### CHARACTER PRICING ###
@@ -198,7 +199,33 @@ def cost_per_token(
     # see this https://learn.microsoft.com/en-us/azure/ai-services/openai/concepts/models
     print_verbose(f"Looking up model={model} in model_cost_map")
-    if custom_llm_provider == "vertex_ai":
+    if call_type == "speech" or call_type == "aspeech":
+        prompt_cost, completion_cost = _generic_cost_per_character(
+            model=model_without_prefix,
+            custom_llm_provider=custom_llm_provider,
+            prompt_characters=prompt_characters,
+            completion_characters=completion_characters,
+            custom_prompt_cost=None,
+            custom_completion_cost=0,
+        )
+        if prompt_cost is None or completion_cost is None:
+            raise ValueError(
+                "cost for tts call is None. prompt_cost={}, completion_cost={}, model={}, custom_llm_provider={}, prompt_characters={}, completion_characters={}".format(
+                    prompt_cost,
+                    completion_cost,
+                    model_without_prefix,
+                    custom_llm_provider,
+                    prompt_characters,
+                    completion_characters,
+                )
+            )
+        return prompt_cost, completion_cost
+    elif call_type == "arerank" or call_type == "rerank":
+        return rerank_cost(
+            model=model,
+            custom_llm_provider=custom_llm_provider,
+        )
+    elif custom_llm_provider == "vertex_ai":
         cost_router = google_cost_router(
             model=model_without_prefix,
             custom_llm_provider=custom_llm_provider,
@@ -226,6 +253,10 @@ def cost_per_token(
         )
     elif custom_llm_provider == "anthropic":
         return anthropic_cost_per_token(model=model, usage=usage_block)
+    elif custom_llm_provider == "openai":
+        return openai_cost_per_token(
+            model=model, usage=usage_block, response_time_ms=response_time_ms
+        )
     elif custom_llm_provider == "databricks":
         return databricks_cost_per_token(model=model, usage=usage_block)
     elif custom_llm_provider == "fireworks_ai":
@@ -237,32 +268,6 @@ def cost_per_token(
             prompt_tokens=prompt_tokens,
             completion_tokens=completion_tokens,
         )
-    elif call_type == "speech" or call_type == "aspeech":
-        prompt_cost, completion_cost = _generic_cost_per_character(
-            model=model_without_prefix,
-            custom_llm_provider=custom_llm_provider,
-            prompt_characters=prompt_characters,
-            completion_characters=completion_characters,
-            custom_prompt_cost=None,
-            custom_completion_cost=0,
-        )
-        if prompt_cost is None or completion_cost is None:
-            raise ValueError(
-                "cost for tts call is None. prompt_cost={}, completion_cost={}, model={}, custom_llm_provider={}, prompt_characters={}, completion_characters={}".format(
-                    prompt_cost,
-                    completion_cost,
-                    model_without_prefix,
-                    custom_llm_provider,
-                    prompt_characters,
-                    completion_characters,
-                )
-            )
-        return prompt_cost, completion_cost
-    elif call_type == "arerank" or call_type == "rerank":
-        return rerank_cost(
-            model=model,
-            custom_llm_provider=custom_llm_provider,
-        )
     elif model in model_cost_ref:
         print_verbose(f"Success: model={model} in model_cost_map")
         print_verbose(
@@ -461,7 +466,7 @@ def completion_cost(
     prompt="",
     messages: List = [],
     completion="",
-    total_time=0.0, # used for replicate, sagemaker
+    total_time: Optional[float] = 0.0, # used for replicate, sagemaker
     call_type: Literal[
         "embedding",
         "aembedding",
@@ -501,7 +506,7 @@ def completion_cost(
         model (str): Optional. The name of the language model used in the completion calls
         prompt (str): Optional. The input prompt passed to the llm
         completion (str): Optional. The output completion text from the llm
-        total_time (float): Optional. (Only used for Replicate LLMs) The total time used for the request in seconds
+        total_time (float, int): Optional. (Only used for Replicate LLMs) The total time used for the request in seconds
         custom_cost_per_token: Optional[CostPerToken]: the cost per input + output token for the llm api call.
         custom_cost_per_second: Optional[float]: the cost per second for the llm api call.

@@ -562,6 +567,13 @@ def completion_cost(
             completion_tokens = _usage.get("completion_tokens", 0)
             cache_creation_input_tokens = _usage.get("cache_creation_input_tokens", 0)
             cache_read_input_tokens = _usage.get("cache_read_input_tokens", 0)
+            if (
+                "prompt_tokens_details" in _usage
+                and _usage["prompt_tokens_details"] != {}
+                and _usage["prompt_tokens_details"]
+            ):
+                prompt_tokens_details = _usage.get("prompt_tokens_details", {})
+                cache_read_input_tokens = prompt_tokens_details.get("cached_tokens", 0)

             total_time = getattr(completion_response, "_response_ms", 0)
             verbose_logger.debug(
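The hunk above is where completion_cost() standardizes OpenAI's usage shape onto litellm's internal cache fields. A stand-alone sketch of that mapping, using an illustrative usage dict rather than a real API response:

# Illustrative usage payload shaped like OpenAI's; not taken from a real response.
_usage = {
    "prompt_tokens": 14,
    "completion_tokens": 10,
    "prompt_tokens_details": {"cached_tokens": 14},
}

cache_read_input_tokens = _usage.get("cache_read_input_tokens", 0)
prompt_tokens_details = _usage.get("prompt_tokens_details") or {}
if prompt_tokens_details:
    # OpenAI reports cache hits under prompt_tokens_details.cached_tokens;
    # completion_cost() copies that onto cache_read_input_tokens before pricing.
    cache_read_input_tokens = prompt_tokens_details.get("cached_tokens", 0)
print(cache_read_input_tokens)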
litellm/llms/OpenAI/cost_calculation.py (new file, +68)
@@ -0,0 +1,68 @@
+"""
+Helper util for handling openai-specific cost calculation
+- e.g.: prompt caching
+"""
+
+from typing import Optional, Tuple
+
+from litellm._logging import verbose_logger
+from litellm.types.utils import Usage
+from litellm.utils import get_model_info
+
+
+def cost_per_token(
+    model: str, usage: Usage, response_time_ms: Optional[float] = 0.0
+) -> Tuple[float, float]:
+    """
+    Calculates the cost per token for a given model, prompt tokens, and completion tokens.
+
+    Input:
+        - model: str, the model name without provider prefix
+        - usage: LiteLLM Usage block, containing anthropic caching information
+
+    Returns:
+        Tuple[float, float] - prompt_cost_in_usd, completion_cost_in_usd
+    """
+    ## GET MODEL INFO
+    model_info = get_model_info(model=model, custom_llm_provider="openai")
+
+    ## CALCULATE INPUT COST
+    prompt_cost: float = usage["prompt_tokens"] * model_info["input_cost_per_token"]
+
+    ## CALCULATE OUTPUT COST
+    completion_cost: float = (
+        usage["completion_tokens"] * model_info["output_cost_per_token"]
+    )
+
+    ## Prompt Caching cost calculation
+    if model_info.get("cache_read_input_token_cost") is not None:
+        # Note: We read ._cache_read_input_tokens from the Usage - since cost_calculator.py standardizes the cache read tokens on usage._cache_read_input_tokens
+        prompt_cost += usage._cache_read_input_tokens * (
+            model_info.get("cache_read_input_token_cost", 0) or 0
+        )
+
+    ## Speech / Audio cost calculation
+    if (
+        "output_cost_per_second" in model_info
+        and model_info["output_cost_per_second"] is not None
+        and response_time_ms is not None
+    ):
+        verbose_logger.debug(
+            f"For model={model} - output_cost_per_second: {model_info.get('output_cost_per_second')}; response time: {response_time_ms}"
+        )
+        ## COST PER SECOND ##
+        prompt_cost = 0
+        completion_cost = model_info["output_cost_per_second"] * response_time_ms / 1000
+    elif (
+        "input_cost_per_second" in model_info
+        and model_info["input_cost_per_second"] is not None
+        and response_time_ms is not None
+    ):
+        verbose_logger.debug(
+            f"For model={model} - input_cost_per_second: {model_info.get('input_cost_per_second')}; response time: {response_time_ms}"
+        )
+        ## COST PER SECOND ##
+        prompt_cost = model_info["input_cost_per_second"] * response_time_ms / 1000
+        completion_cost = 0.0
+
+    return prompt_cost, completion_cost
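For orientation, a rough sketch of how this helper might be exercised directly. It assumes gpt-4o-mini-2024-07-18 is in the local model cost map with a cache_read_input_token_cost, and that the private _cache_read_input_tokens field can be set the way cost_calculator.py sets it before delegating here:

from litellm.llms.OpenAI.cost_calculation import cost_per_token
from litellm.types.utils import Usage

usage = Usage(prompt_tokens=14, completion_tokens=10, total_tokens=24)
# cost_calculator.py normally copies prompt_tokens_details.cached_tokens onto this
# field before calling the helper; set it by hand for the sketch (assumption: your
# litellm version exposes it as a settable private attribute, as the comment in the
# new file implies).
usage._cache_read_input_tokens = 14

prompt_cost, completion_cost = cost_per_token(
    model="gpt-4o-mini-2024-07-18", usage=usage
)
print(prompt_cost, completion_cost)  # prompt cost includes the discounted cache-read charge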
@@ -79,6 +79,8 @@ class ModelInfo(TypedDict, total=False):
     output_vector_size: Optional[int]
     output_cost_per_video_per_second: Optional[float] # only for vertex ai models
     output_cost_per_audio_per_second: Optional[float] # only for vertex ai models
+    output_cost_per_second: Optional[float] # for OpenAI Speech models

     litellm_provider: Required[str]
     mode: Required[
         Literal[
@@ -5092,6 +5092,7 @@ def get_model_info(model: str, custom_llm_provider: Optional[str] = None) -> Mod
             output_cost_per_character_above_128k_tokens=_model_info.get(
                 "output_cost_per_character_above_128k_tokens", None
             ),
+            output_cost_per_second=_model_info.get("output_cost_per_second", None),
             output_vector_size=_model_info.get("output_vector_size", None),
             litellm_provider=_model_info.get(
                 "litellm_provider", custom_llm_provider
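These pricing fields only matter downstream if get_model_info() carries them through from the cost map, which is what the one-line change above does for output_cost_per_second. A small hedged sketch of inspecting them; it assumes the named models exist in the local cost map, and fields the map does not define simply come back as None:

from litellm import get_model_info

chat_info = get_model_info(model="gpt-4o-mini-2024-07-18", custom_llm_provider="openai")
print(chat_info.get("cache_read_input_token_cost"))  # discounted rate for cached prompt tokens, if published

audio_info = get_model_info(model="whisper-1", custom_llm_provider="openai")
# whichever of these the cost map defines drives the per-second branch in cost_calculation.py
print(audio_info.get("input_cost_per_second"), audio_info.get("output_cost_per_second"))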
@@ -24,6 +24,7 @@ from litellm import (
     model_cost,
     open_ai_chat_completion_models,
 )
+from litellm.types.utils import PromptTokensDetails
 from litellm.litellm_core_utils.litellm_logging import CustomLogger


@@ -209,7 +210,9 @@ def test_cost_ft_gpt_35():
         usage=Usage(prompt_tokens=21, completion_tokens=17, total_tokens=38),
     )

-    cost = litellm.completion_cost(completion_response=resp)
+    cost = litellm.completion_cost(
+        completion_response=resp, custom_llm_provider="openai"
+    )
     print("\n Calculated Cost for ft:gpt-3.5", cost)
     input_cost = model_cost["ft:gpt-3.5-turbo"]["input_cost_per_token"]
     output_cost = model_cost["ft:gpt-3.5-turbo"]["output_cost_per_token"]
@@ -1330,6 +1333,90 @@ def test_completion_cost_vertex_llama3():
     assert cost == 0


+def test_cost_openai_prompt_caching():
+    from litellm.utils import Choices, Message, ModelResponse, Usage
+    from litellm import get_model_info
+
+    os.environ["LITELLM_LOCAL_MODEL_COST_MAP"] = "True"
+    litellm.model_cost = litellm.get_model_cost_map(url="")
+
+    model = "gpt-4o-mini-2024-07-18"
+
+    ## LLM API CALL ## (MORE EXPENSIVE)
+    response_1 = ModelResponse(
+        id="chatcmpl-3f427194-0840-4d08-b571-56bfe38a5424",
+        choices=[
+            Choices(
+                finish_reason="length",
+                index=0,
+                message=Message(
+                    content="Hello! I'm doing well, thank you for",
+                    role="assistant",
+                    tool_calls=None,
+                    function_call=None,
+                ),
+            )
+        ],
+        created=1725036547,
+        model=model,
+        object="chat.completion",
+        system_fingerprint=None,
+        usage=Usage(
+            completion_tokens=10,
+            prompt_tokens=14,
+            total_tokens=24,
+        ),
+    )
+
+    ## PROMPT CACHE HIT ## (LESS EXPENSIVE)
+    response_2 = ModelResponse(
+        id="chatcmpl-3f427194-0840-4d08-b571-56bfe38a5424",
+        choices=[
+            Choices(
+                finish_reason="length",
+                index=0,
+                message=Message(
+                    content="Hello! I'm doing well, thank you for",
+                    role="assistant",
+                    tool_calls=None,
+                    function_call=None,
+                ),
+            )
+        ],
+        created=1725036547,
+        model=model,
+        object="chat.completion",
+        system_fingerprint=None,
+        usage=Usage(
+            completion_tokens=10,
+            prompt_tokens=0,
+            total_tokens=10,
+            prompt_tokens_details=PromptTokensDetails(
+                cached_tokens=14,
+            ),
+        ),
+    )
+
+    cost_1 = completion_cost(model=model, completion_response=response_1)
+    cost_2 = completion_cost(model=model, completion_response=response_2)
+    assert cost_1 > cost_2
+
+    model_info = get_model_info(model=model, custom_llm_provider="openai")
+    usage = response_2.usage
+
+    _expected_cost2 = (
+        usage.prompt_tokens * model_info["input_cost_per_token"]
+        + usage.completion_tokens * model_info["output_cost_per_token"]
+        + usage.prompt_tokens_details.cached_tokens
+        * model_info["cache_read_input_token_cost"]
+    )
+
+    print("_expected_cost2", _expected_cost2)
+    print("cost_2", cost_2)
+
+    assert cost_2 == _expected_cost2
+
+
 @pytest.mark.parametrize(
     "model",
     [