(feat) add azure openai cost tracking for prompt caching (#6077)
* add azure o1 models to model cost map
* add azure o1 cost tracking
* fix azure cost calc
* add get llm provider test
This commit is contained in:
parent 7267852511
commit ab0b536143
4 changed files with 160 additions and 0 deletions
@@ -25,6 +25,9 @@ from litellm.llms.anthropic.cost_calculation import (
from litellm.llms.azure_ai.cost_calculator import (
    cost_per_query as azure_ai_rerank_cost_per_query,
)
from litellm.llms.AzureOpenAI.cost_calculation import (
    cost_per_token as azure_openai_cost_per_token,
)
from litellm.llms.cohere.cost_calculator import (
    cost_per_query as cohere_rerank_cost_per_query,
)

@@ -261,6 +264,10 @@ def cost_per_token(
        return databricks_cost_per_token(model=model, usage=usage_block)
    elif custom_llm_provider == "fireworks_ai":
        return fireworks_ai_cost_per_token(model=model, usage=usage_block)
    elif custom_llm_provider == "azure":
        return azure_openai_cost_per_token(
            model=model, usage=usage_block, response_time_ms=response_time_ms
        )
    elif custom_llm_provider == "gemini":
        return google_cost_per_token(
            model=model_without_prefix,
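
For context, a minimal sketch of how this branch is reached end to end. It assumes litellm's public cost_per_token helper is importable from the top-level litellm package and that the azure/o1-mini pricing entry exercised in the tests below is loaded; the "azure/" model prefix is what resolves custom_llm_provider to "azure" and delegates to azure_openai_cost_per_token:

# Sketch only: assumes `cost_per_token` is exported by the top-level litellm package
# and that "azure/o1-mini" is present in the loaded model cost map.
from litellm import cost_per_token

prompt_cost, completion_cost = cost_per_token(
    model="azure/o1-mini",  # the "azure/" prefix routes to the azure branch above
    prompt_tokens=14,
    completion_tokens=10,
)
print(prompt_cost, completion_cost)
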
57  litellm/llms/AzureOpenAI/cost_calculation.py  Normal file
@@ -0,0 +1,57 @@
"""
Helper util for handling azure openai-specific cost calculation
- e.g.: prompt caching
"""

from typing import Optional, Tuple

from litellm._logging import verbose_logger
from litellm.types.utils import Usage
from litellm.utils import get_model_info


def cost_per_token(
    model: str, usage: Usage, response_time_ms: Optional[float] = 0.0
) -> Tuple[float, float]:
    """
    Calculates the cost per token for a given model, prompt tokens, and completion tokens.

    Input:
        - model: str, the model name without provider prefix
        - usage: LiteLLM Usage block, containing anthropic caching information

    Returns:
        Tuple[float, float] - prompt_cost_in_usd, completion_cost_in_usd
    """
    ## GET MODEL INFO
    model_info = get_model_info(model=model, custom_llm_provider="azure")

    ## CALCULATE INPUT COST
    prompt_cost: float = usage["prompt_tokens"] * model_info["input_cost_per_token"]

    ## CALCULATE OUTPUT COST
    completion_cost: float = (
        usage["completion_tokens"] * model_info["output_cost_per_token"]
    )

    ## Prompt Caching cost calculation
    if model_info.get("cache_read_input_token_cost") is not None:
        # Note: We read ._cache_read_input_tokens from the Usage - since cost_calculator.py standardizes the cache read tokens on usage._cache_read_input_tokens
        prompt_cost += usage._cache_read_input_tokens * (
            model_info.get("cache_read_input_token_cost", 0) or 0
        )

    ## Speech / Audio cost calculation
    if (
        "output_cost_per_second" in model_info
        and model_info["output_cost_per_second"] is not None
        and response_time_ms is not None
    ):
        verbose_logger.debug(
            f"For model={model} - output_cost_per_second: {model_info.get('output_cost_per_second')}; response time: {response_time_ms}"
        )
        ## COST PER SECOND ##
        prompt_cost = 0
        completion_cost = model_info["output_cost_per_second"] * response_time_ms / 1000

    return prompt_cost, completion_cost

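For intuition, here is a small self-contained sketch of the caching arithmetic above. The per-token prices are hypothetical placeholders, not Azure's actual rates; in the real helper they come from get_model_info() and the model cost map:

# Hypothetical prices, for illustration only (real values come from the model cost map).
input_cost_per_token = 3e-06
output_cost_per_token = 12e-06
cache_read_input_token_cost = 1.5e-06

prompt_tokens = 100      # non-cached prompt tokens, billed at the full input rate
cached_tokens = 400      # cache-read tokens, billed at the discounted rate
completion_tokens = 50

prompt_cost = (
    prompt_tokens * input_cost_per_token
    + cached_tokens * cache_read_input_token_cost
)
completion_cost = completion_tokens * output_cost_per_token

print(prompt_cost, completion_cost)  # roughly 0.0009 and 0.0006

As in the helper, cache-read tokens are billed separately from regular prompt tokens, which is why a prompt-cache hit produces a cheaper request.
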
@@ -1295,6 +1295,93 @@ def test_completion_cost_fireworks_ai(model):
    cost = completion_cost(completion_response=resp)


def test_cost_azure_openai_prompt_caching():
    from litellm.utils import Choices, Message, ModelResponse, Usage
    from litellm.types.utils import PromptTokensDetails, CompletionTokensDetails
    from litellm import get_model_info

    os.environ["LITELLM_LOCAL_MODEL_COST_MAP"] = "True"
    litellm.model_cost = litellm.get_model_cost_map(url="")

    model = "azure/o1-mini"

    ## LLM API CALL ## (MORE EXPENSIVE)
    response_1 = ModelResponse(
        id="chatcmpl-3f427194-0840-4d08-b571-56bfe38a5424",
        choices=[
            Choices(
                finish_reason="length",
                index=0,
                message=Message(
                    content="Hello! I'm doing well, thank you for",
                    role="assistant",
                    tool_calls=None,
                    function_call=None,
                ),
            )
        ],
        created=1725036547,
        model=model,
        object="chat.completion",
        system_fingerprint=None,
        usage=Usage(
            completion_tokens=10,
            prompt_tokens=14,
            total_tokens=24,
            completion_tokens_details=CompletionTokensDetails(reasoning_tokens=2),
        ),
    )

    ## PROMPT CACHE HIT ## (LESS EXPENSIVE)
    response_2 = ModelResponse(
        id="chatcmpl-3f427194-0840-4d08-b571-56bfe38a5424",
        choices=[
            Choices(
                finish_reason="length",
                index=0,
                message=Message(
                    content="Hello! I'm doing well, thank you for",
                    role="assistant",
                    tool_calls=None,
                    function_call=None,
                ),
            )
        ],
        created=1725036547,
        model=model,
        object="chat.completion",
        system_fingerprint=None,
        usage=Usage(
            completion_tokens=10,
            prompt_tokens=0,
            total_tokens=10,
            prompt_tokens_details=PromptTokensDetails(
                cached_tokens=14,
            ),
            completion_tokens_details=CompletionTokensDetails(reasoning_tokens=2),
        ),
    )

    cost_1 = completion_cost(model=model, completion_response=response_1)
    cost_2 = completion_cost(model=model, completion_response=response_2)
    assert cost_1 > cost_2

    model_info = get_model_info(model=model, custom_llm_provider="azure")
    usage = response_2.usage

    _expected_cost2 = (
        usage.prompt_tokens * model_info["input_cost_per_token"]
        + usage.completion_tokens * model_info["output_cost_per_token"]
        + usage.prompt_tokens_details.cached_tokens
        * model_info["cache_read_input_token_cost"]
    )

    print("_expected_cost2", _expected_cost2)
    print("cost_2", cost_2)

    assert cost_2 == _expected_cost2


def test_completion_cost_vertex_llama3():
    os.environ["LITELLM_LOCAL_MODEL_COST_MAP"] = "True"
    litellm.model_cost = litellm.get_model_cost_map(url="")

@@ -115,3 +115,12 @@ def test_get_llm_provider_cohere_chat_test2():
    print("api_base=", api_base)
    assert custom_llm_provider == "cohere_chat"
    assert model == "command-r-plus"


def test_get_llm_provider_azure_o1():

    model, custom_llm_provider, dynamic_api_key, api_base = litellm.get_llm_provider(
        model="azure/o1-mini",
    )
    assert custom_llm_provider == "azure"
    assert model == "o1-mini"