(feat) add azure openai cost tracking for prompt caching (#6077)

* add azure o1 models to model cost map
* add azure o1 cost tracking
* fix azure cost calc
* add get llm provider test
2024-10-05 15:04:18 +05:30 · 2024-10-05 15:04:18 +05:30 · ab0b536143
commit ab0b536143
parent 7267852511
4 changed files with 160 additions and 0 deletions
--- a/litellm/cost_calculator.py
+++ b/litellm/cost_calculator.py
@ -25,6 +25,9 @@ from litellm.llms.anthropic.cost_calculation import (
 from litellm.llms.azure_ai.cost_calculator import (
    cost_per_query as azure_ai_rerank_cost_per_query,
 )
 from litellm.llms.AzureOpenAI.cost_calculation import (
    cost_per_token as azure_openai_cost_per_token,
 )
 from litellm.llms.cohere.cost_calculator import (
    cost_per_query as cohere_rerank_cost_per_query,
 )
@ -261,6 +264,10 @@ def cost_per_token(
        return databricks_cost_per_token(model=model, usage=usage_block)
    elif custom_llm_provider == "fireworks_ai":
        return fireworks_ai_cost_per_token(model=model, usage=usage_block)
    elif custom_llm_provider == "azure":
        return azure_openai_cost_per_token(
            model=model, usage=usage_block, response_time_ms=response_time_ms
        )
    elif custom_llm_provider == "gemini":
        return google_cost_per_token(
            model=model_without_prefix,
--- a/litellm/llms/AzureOpenAI/cost_calculation.py
+++ b/litellm/llms/AzureOpenAI/cost_calculation.py
@ -0,0 +1,57 @@
 """
 Helper util for handling azure openai-specific cost calculation
 - e.g.: prompt caching
 """
 from typing import Optional, Tuple
 from litellm._logging import verbose_logger
 from litellm.types.utils import Usage
 from litellm.utils import get_model_info
 def cost_per_token(
    model: str, usage: Usage, response_time_ms: Optional[float] = 0.0
 ) -> Tuple[float, float]:
    """
    Calculates the prompt cost and completion cost (in USD) for an Azure OpenAI call.
    Input:
        - model: str, the model name without provider prefix
        - usage: LiteLLM Usage block, containing azure openai prompt caching information
        - response_time_ms: Optional[float], response time in milliseconds - only used when the model is priced per second of output
    Returns:
        Tuple[float, float] - prompt_cost_in_usd, completion_cost_in_usd
    """
    ## GET MODEL INFO
    model_info = get_model_info(model=model, custom_llm_provider="azure")
    input_cost_per_token = model_info["input_cost_per_token"]
    prompt_tokens = usage["prompt_tokens"]
    ## CALCULATE INPUT COST
    cache_read_cost = model_info.get("cache_read_input_token_cost")
    if cache_read_cost is None:
        # No discounted cache rate for this model - every prompt token is billed at the input rate
        prompt_cost: float = prompt_tokens * input_cost_per_token
    else:
        ## Prompt Caching cost calculation
        # Note: We read ._cache_read_input_tokens from the Usage - since cost_calculator.py standardizes the cache read tokens on usage._cache_read_input_tokens
        cached_tokens = usage._cache_read_input_tokens or 0
        # Azure OpenAI reports cached tokens as part of prompt_tokens, so bill them only once:
        # cache hits at the cache-read rate, the remainder at the regular input rate.
        # Clamp at 0 for usage blocks where prompt_tokens already excludes the cached tokens.
        uncached_tokens = max(prompt_tokens - cached_tokens, 0)
        prompt_cost = (
            uncached_tokens * input_cost_per_token + cached_tokens * cache_read_cost
        )
    ## CALCULATE OUTPUT COST
    completion_cost: float = (
        usage["completion_tokens"] * model_info["output_cost_per_token"]
    )
    ## Speech / Audio cost calculation
    output_cost_per_second = model_info.get("output_cost_per_second")
    if output_cost_per_second is not None and response_time_ms is not None:
        verbose_logger.debug(
            f"For model={model} - output_cost_per_second: {model_info.get('output_cost_per_second')}; response time: {response_time_ms}"
        )
        ## COST PER SECOND ##
        # Per-second pricing replaces the token-based costs entirely
        prompt_cost = 0.0
        completion_cost = output_cost_per_second * response_time_ms / 1000
    return prompt_cost, completion_cost
--- a/tests/local_testing/test_completion_cost.py
+++ b/tests/local_testing/test_completion_cost.py
@ -1295,6 +1295,93 @@ def test_completion_cost_fireworks_ai(model):
    cost = completion_cost(completion_response=resp)
 def test_cost_azure_openai_prompt_caching():
    """
    A prompt-cache hit on an azure o1 model must cost less than the same call
    without a cache hit, and must match the price computed from the model cost map.
    """
    import math
    from litellm.utils import Choices, Message, ModelResponse, Usage
    from litellm.types.utils import PromptTokensDetails, CompletionTokensDetails
    from litellm import get_model_info
    # Use the model cost map bundled with the package instead of fetching it
    os.environ["LITELLM_LOCAL_MODEL_COST_MAP"] = "True"
    litellm.model_cost = litellm.get_model_cost_map(url="")
    model = "azure/o1-mini"
    def _build_response(usage: Usage) -> ModelResponse:
        # Both responses are identical apart from their usage block
        return ModelResponse(
            id="chatcmpl-3f427194-0840-4d08-b571-56bfe38a5424",
            choices=[
                Choices(
                    finish_reason="length",
                    index=0,
                    message=Message(
                        content="Hello! I'm doing well, thank you for",
                        role="assistant",
                        tool_calls=None,
                        function_call=None,
                    ),
                )
            ],
            created=1725036547,
            model=model,
            object="chat.completion",
            system_fingerprint=None,
            usage=usage,
        )
    ## LLM API CALL ## (MORE EXPENSIVE)
    response_1 = _build_response(
        Usage(
            completion_tokens=10,
            prompt_tokens=14,
            total_tokens=24,
            completion_tokens_details=CompletionTokensDetails(reasoning_tokens=2),
        )
    )
    ## PROMPT CACHE HIT ## (LESS EXPENSIVE)
    response_2 = _build_response(
        Usage(
            completion_tokens=10,
            prompt_tokens=0,
            total_tokens=10,
            prompt_tokens_details=PromptTokensDetails(
                cached_tokens=14,
            ),
            completion_tokens_details=CompletionTokensDetails(reasoning_tokens=2),
        )
    )
    cost_1 = completion_cost(model=model, completion_response=response_1)
    cost_2 = completion_cost(model=model, completion_response=response_2)
    assert cost_1 > cost_2
    model_info = get_model_info(model=model, custom_llm_provider="azure")
    usage = response_2.usage
    _expected_cost2 = (
        usage.prompt_tokens * model_info["input_cost_per_token"]
        + usage.completion_tokens * model_info["output_cost_per_token"]
        + usage.prompt_tokens_details.cached_tokens
        * model_info["cache_read_input_token_cost"]
    )
    print("_expected_cost2", _expected_cost2)
    print("cost_2", cost_2)
    # Costs are sums of float products; compare with a tolerance instead of exact equality
    assert math.isclose(cost_2, _expected_cost2, rel_tol=1e-9)
 def test_completion_cost_vertex_llama3():
    os.environ["LITELLM_LOCAL_MODEL_COST_MAP"] = "True"
    litellm.model_cost = litellm.get_model_cost_map(url="")
--- a/tests/local_testing/test_get_llm_provider.py
+++ b/tests/local_testing/test_get_llm_provider.py
@ -115,3 +115,12 @@ def test_get_llm_provider_cohere_chat_test2():
    print("api_base=", api_base)
    assert custom_llm_provider == "cohere_chat"
    assert model == "command-r-plus"
 def test_get_llm_provider_azure_o1():
    """An "azure/"-prefixed o1 model resolves to the azure provider with the prefix stripped."""
    resolved = litellm.get_llm_provider(model="azure/o1-mini")
    # The result unpacks as (model, provider, dynamic api key, api base); only the first two are checked
    model, custom_llm_provider, _dynamic_api_key, _api_base = resolved
    assert custom_llm_provider == "azure"
    assert model == "o1-mini"