(feat) add cost tracking for OpenAI prompt caching (#6055)

* add cache_read_input_token_cost for prompt caching models * add prompt caching for latest models * add openai cost calculator * add openai prompt caching test * fix lint check * add note on how usage._cache_read_input_tokens is used * fix cost calc whisper openai * use output_cost_per_second * add input_cost_per_second
2024-10-05 14:20:15 +05:30 · 2024-10-05 14:20:15 +05:30 · 3682f661d8
commit 3682f661d8
parent 930606ad63
5 changed files with 202 additions and 32 deletions
--- a/tests/local_testing/test_completion_cost.py
+++ b/tests/local_testing/test_completion_cost.py
@ -24,6 +24,7 @@ from litellm import (
    model_cost,
    open_ai_chat_completion_models,
 )
+from litellm.types.utils import PromptTokensDetails
 from litellm.litellm_core_utils.litellm_logging import CustomLogger


@ -209,7 +210,9 @@ def test_cost_ft_gpt_35():
            usage=Usage(prompt_tokens=21, completion_tokens=17, total_tokens=38),
        )

-        cost = litellm.completion_cost(completion_response=resp)
+        cost = litellm.completion_cost(
+            completion_response=resp, custom_llm_provider="openai"
+        )
        print("\n Calculated Cost for ft:gpt-3.5", cost)
        input_cost = model_cost["ft:gpt-3.5-turbo"]["input_cost_per_token"]
        output_cost = model_cost["ft:gpt-3.5-turbo"]["output_cost_per_token"]
@ -1330,6 +1333,90 @@ def test_completion_cost_vertex_llama3():
    assert cost == 0


+def test_cost_openai_prompt_caching():
+    """Check that cached prompt tokens are priced at the cache-read rate.
+
+    Builds two otherwise identical responses for the same model: one whose 14
+    input tokens are reported as regular prompt tokens, and one whose 14 input
+    tokens are reported only as ``prompt_tokens_details.cached_tokens``. The
+    test asserts that the second response is cheaper and that its cost equals
+    the sum of the per-token prices read from ``get_model_info``.
+    """
+    from litellm.utils import Choices, Message, ModelResponse, Usage
+    from litellm import get_model_info
+
+    # Reload the cost map after setting the env flag.
+    # NOTE(review): presumably this selects the cost map bundled with the
+    # package instead of a remote fetch -- confirm against get_model_cost_map.
+    # The env var is not restored afterwards, so it stays set for later tests.
+    os.environ["LITELLM_LOCAL_MODEL_COST_MAP"] = "True"
+    litellm.model_cost = litellm.get_model_cost_map(url="")
+
+    model = "gpt-4o-mini-2024-07-18"
+
+    ## LLM API CALL ## (MORE EXPENSIVE)
+    # No prompt_tokens_details: all 14 input tokens are plain prompt tokens.
+    response_1 = ModelResponse(
+        id="chatcmpl-3f427194-0840-4d08-b571-56bfe38a5424",
+        choices=[
+            Choices(
+                finish_reason="length",
+                index=0,
+                message=Message(
+                    content="Hello! I'm doing well, thank you for",
+                    role="assistant",
+                    tool_calls=None,
+                    function_call=None,
+                ),
+            )
+        ],
+        created=1725036547,
+        model=model,
+        object="chat.completion",
+        system_fingerprint=None,
+        usage=Usage(
+            completion_tokens=10,
+            prompt_tokens=14,
+            total_tokens=24,
+        ),
+    )
+
+    ## PROMPT CACHE HIT ## (LESS EXPENSIVE)
+    # Same completion, but the 14 input tokens appear only as cached tokens.
+    # NOTE(review): prompt_tokens is 0 here, so the cached tokens are priced
+    # solely at the cache-read rate; confirm this matches how the provider
+    # actually reports usage on a cache hit.
+    response_2 = ModelResponse(
+        id="chatcmpl-3f427194-0840-4d08-b571-56bfe38a5424",
+        choices=[
+            Choices(
+                finish_reason="length",
+                index=0,
+                message=Message(
+                    content="Hello! I'm doing well, thank you for",
+                    role="assistant",
+                    tool_calls=None,
+                    function_call=None,
+                ),
+            )
+        ],
+        created=1725036547,
+        model=model,
+        object="chat.completion",
+        system_fingerprint=None,
+        usage=Usage(
+            completion_tokens=10,
+            prompt_tokens=0,
+            total_tokens=10,
+            prompt_tokens_details=PromptTokensDetails(
+                cached_tokens=14,
+            ),
+        ),
+    )
+
+    # Relative check: the cache-hit response must cost less than the miss.
+    cost_1 = completion_cost(model=model, completion_response=response_1)
+    cost_2 = completion_cost(model=model, completion_response=response_2)
+    assert cost_1 > cost_2
+
+    model_info = get_model_info(model=model, custom_llm_provider="openai")
+    usage = response_2.usage
+
+    # Absolute check: rebuild the expected cost from the three per-token
+    # prices (input, output, cache read) exposed by the model info.
+    _expected_cost2 = (
+        usage.prompt_tokens * model_info["input_cost_per_token"]
+        + usage.completion_tokens * model_info["output_cost_per_token"]
+        + usage.prompt_tokens_details.cached_tokens
+        * model_info["cache_read_input_token_cost"]
+    )
+
+    print("_expected_cost2", _expected_cost2)
+    print("cost_2", cost_2)
+
+    # NOTE(review): exact float equality only holds if completion_cost performs
+    # the same arithmetic in the same order; a tolerance may be more robust.
+    assert cost_2 == _expected_cost2
+
+
@pytest.mark.parametrize(
    "model",
    [