From ab0b5361439cd3ea76cdee7278d1f27a5bfcf919 Mon Sep 17 00:00:00 2001
From: Ishaan Jaff
Date: Sat, 5 Oct 2024 15:04:18 +0530
Subject: [PATCH] (feat) add azure openai cost tracking for prompt caching (#6077)

* add azure o1 models to model cost map

* add azure o1 cost tracking

* fix azure cost calc

* add get llm provider test
---
 litellm/cost_calculator.py                   |  7 ++
 litellm/llms/AzureOpenAI/cost_calculation.py | 57 +++++++++++++
 tests/local_testing/test_completion_cost.py  | 87 ++++++++++++++++++++
 tests/local_testing/test_get_llm_provider.py |  9 ++
 4 files changed, 160 insertions(+)
 create mode 100644 litellm/llms/AzureOpenAI/cost_calculation.py

diff --git a/litellm/cost_calculator.py b/litellm/cost_calculator.py
index aa2d3ad76..ac7649e3c 100644
--- a/litellm/cost_calculator.py
+++ b/litellm/cost_calculator.py
@@ -25,6 +25,9 @@ from litellm.llms.anthropic.cost_calculation import (
 from litellm.llms.azure_ai.cost_calculator import (
     cost_per_query as azure_ai_rerank_cost_per_query,
 )
+from litellm.llms.AzureOpenAI.cost_calculation import (
+    cost_per_token as azure_openai_cost_per_token,
+)
 from litellm.llms.cohere.cost_calculator import (
     cost_per_query as cohere_rerank_cost_per_query,
 )
@@ -261,6 +264,10 @@ def cost_per_token(
         return databricks_cost_per_token(model=model, usage=usage_block)
     elif custom_llm_provider == "fireworks_ai":
         return fireworks_ai_cost_per_token(model=model, usage=usage_block)
+    elif custom_llm_provider == "azure":
+        return azure_openai_cost_per_token(
+            model=model, usage=usage_block, response_time_ms=response_time_ms
+        )
     elif custom_llm_provider == "gemini":
         return google_cost_per_token(
             model=model_without_prefix,
diff --git a/litellm/llms/AzureOpenAI/cost_calculation.py b/litellm/llms/AzureOpenAI/cost_calculation.py
new file mode 100644
index 000000000..5d619e994
--- /dev/null
+++ b/litellm/llms/AzureOpenAI/cost_calculation.py
@@ -0,0 +1,57 @@
+"""
+Helper util for handling azure openai-specific cost calculation
+- e.g.: prompt caching
+"""
+
+from typing import Optional, Tuple
+
+from litellm._logging import verbose_logger
+from litellm.types.utils import Usage
+from litellm.utils import get_model_info
+
+
+def cost_per_token(
+    model: str, usage: Usage, response_time_ms: Optional[float] = 0.0
+) -> Tuple[float, float]:
+    """
+    Calculates the cost per token for a given model, prompt tokens, and completion tokens.
+
+    Input:
+        - model: str, the model name without provider prefix
+        - usage: LiteLLM Usage block, containing prompt caching (cache read) information
+
+    Returns:
+        Tuple[float, float] - prompt_cost_in_usd, completion_cost_in_usd
+    """
+    ## GET MODEL INFO
+    model_info = get_model_info(model=model, custom_llm_provider="azure")
+
+    ## CALCULATE INPUT COST
+    prompt_cost: float = usage["prompt_tokens"] * model_info["input_cost_per_token"]
+
+    ## CALCULATE OUTPUT COST
+    completion_cost: float = (
+        usage["completion_tokens"] * model_info["output_cost_per_token"]
+    )
+
+    ## Prompt Caching cost calculation
+    if model_info.get("cache_read_input_token_cost") is not None:
+        # Note: We read ._cache_read_input_tokens from the Usage - since cost_calculator.py standardizes the cache read tokens on usage._cache_read_input_tokens
+        prompt_cost += usage._cache_read_input_tokens * (
+            model_info.get("cache_read_input_token_cost", 0) or 0
+        )
+
+    ## Speech / Audio cost calculation
+    if (
+        "output_cost_per_second" in model_info
+        and model_info["output_cost_per_second"] is not None
+        and response_time_ms is not None
+    ):
+        verbose_logger.debug(
+            f"For model={model} - output_cost_per_second: {model_info.get('output_cost_per_second')}; response time: {response_time_ms}"
+        )
+        ## COST PER SECOND ##
+        prompt_cost = 0
+        completion_cost = model_info["output_cost_per_second"] * response_time_ms / 1000
+
+    return prompt_cost, completion_cost
diff --git a/tests/local_testing/test_completion_cost.py b/tests/local_testing/test_completion_cost.py
index 02c87ed04..d12a90cc3 100644
--- a/tests/local_testing/test_completion_cost.py
+++ b/tests/local_testing/test_completion_cost.py
@@ -1295,6 +1295,93 @@ def test_completion_cost_fireworks_ai(model):
     cost = completion_cost(completion_response=resp)
 
 
+def test_cost_azure_openai_prompt_caching():
+    from litellm.utils import Choices, Message, ModelResponse, Usage
+    from litellm.types.utils import PromptTokensDetails, CompletionTokensDetails
+    from litellm import get_model_info
+
+    os.environ["LITELLM_LOCAL_MODEL_COST_MAP"] = "True"
+    litellm.model_cost = litellm.get_model_cost_map(url="")
+
+    model = "azure/o1-mini"
+
+    ## LLM API CALL ## (MORE EXPENSIVE)
+    response_1 = ModelResponse(
+        id="chatcmpl-3f427194-0840-4d08-b571-56bfe38a5424",
+        choices=[
+            Choices(
+                finish_reason="length",
+                index=0,
+                message=Message(
+                    content="Hello! I'm doing well, thank you for",
+                    role="assistant",
+                    tool_calls=None,
+                    function_call=None,
+                ),
+            )
+        ],
+        created=1725036547,
+        model=model,
+        object="chat.completion",
+        system_fingerprint=None,
+        usage=Usage(
+            completion_tokens=10,
+            prompt_tokens=14,
+            total_tokens=24,
+            completion_tokens_details=CompletionTokensDetails(reasoning_tokens=2),
+        ),
+    )
+
+    ## PROMPT CACHE HIT ## (LESS EXPENSIVE)
+    response_2 = ModelResponse(
+        id="chatcmpl-3f427194-0840-4d08-b571-56bfe38a5424",
+        choices=[
+            Choices(
+                finish_reason="length",
+                index=0,
+                message=Message(
+                    content="Hello! I'm doing well, thank you for",
+                    role="assistant",
+                    tool_calls=None,
+                    function_call=None,
+                ),
+            )
+        ],
+        created=1725036547,
+        model=model,
+        object="chat.completion",
+        system_fingerprint=None,
+        usage=Usage(
+            completion_tokens=10,
+            prompt_tokens=0,
+            total_tokens=10,
+            prompt_tokens_details=PromptTokensDetails(
+                cached_tokens=14,
+            ),
+            completion_tokens_details=CompletionTokensDetails(reasoning_tokens=2),
+        ),
+    )
+
+    cost_1 = completion_cost(model=model, completion_response=response_1)
+    cost_2 = completion_cost(model=model, completion_response=response_2)
+    assert cost_1 > cost_2
+
+    model_info = get_model_info(model=model, custom_llm_provider="azure")
+    usage = response_2.usage
+
+    _expected_cost2 = (
+        usage.prompt_tokens * model_info["input_cost_per_token"]
+        + usage.completion_tokens * model_info["output_cost_per_token"]
+        + usage.prompt_tokens_details.cached_tokens
+        * model_info["cache_read_input_token_cost"]
+    )
+
+    print("_expected_cost2", _expected_cost2)
+    print("cost_2", cost_2)
+
+    assert cost_2 == _expected_cost2
+
+
 def test_completion_cost_vertex_llama3():
     os.environ["LITELLM_LOCAL_MODEL_COST_MAP"] = "True"
     litellm.model_cost = litellm.get_model_cost_map(url="")
diff --git a/tests/local_testing/test_get_llm_provider.py b/tests/local_testing/test_get_llm_provider.py
index add551818..b80e3f2cb 100644
--- a/tests/local_testing/test_get_llm_provider.py
+++ b/tests/local_testing/test_get_llm_provider.py
@@ -115,3 +115,12 @@ def test_get_llm_provider_cohere_chat_test2():
     print("api_base=", api_base)
     assert custom_llm_provider == "cohere_chat"
    assert model == "command-r-plus"
+
+
+def test_get_llm_provider_azure_o1():
+
+    model, custom_llm_provider, dynamic_api_key, api_base = litellm.get_llm_provider(
+        model="azure/o1-mini",
+    )
+    assert custom_llm_provider == "azure"
+    assert model == "o1-mini"
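
Summary for reviewers: the new azure cost path prices uncached prompt tokens at input_cost_per_token, cache-read tokens at the cheaper cache_read_input_token_cost, and completion tokens at output_cost_per_token. The sketch below (not part of the patch) restates that rule as a standalone Python function; the function name, the plain dict of prices, and the numeric rates are illustrative placeholders, not litellm APIs or real Azure o1-mini pricing.

    # Minimal sketch of the cache-aware cost rule added in this patch.
    # All names and prices are illustrative placeholders.
    from typing import Dict, Tuple


    def sketch_azure_cost(
        uncached_prompt_tokens: int,
        completion_tokens: int,
        cached_tokens: int,
        model_info: Dict[str, float],
    ) -> Tuple[float, float]:
        # Base input/output cost, mirroring cost_per_token() in cost_calculation.py.
        prompt_cost = uncached_prompt_tokens * model_info["input_cost_per_token"]
        completion_cost = completion_tokens * model_info["output_cost_per_token"]
        # Cache-read tokens are billed at the cache-read rate when the model
        # config defines one.
        if model_info.get("cache_read_input_token_cost") is not None:
            prompt_cost += cached_tokens * model_info["cache_read_input_token_cost"]
        return prompt_cost, completion_cost


    if __name__ == "__main__":
        placeholder_model_info = {
            "input_cost_per_token": 3e-06,  # placeholder rate
            "output_cost_per_token": 1.2e-05,  # placeholder rate
            "cache_read_input_token_cost": 1.5e-06,  # placeholder rate
        }
        # Mirrors the cache-hit test case above: 0 uncached prompt tokens,
        # 10 completion tokens, 14 cached tokens.
        print(sketch_azure_cost(0, 10, 14, placeholder_model_info))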