(feat) add cost tracking for OpenAI prompt caching (#6055)
* add cache_read_input_token_cost for prompt caching models
* add prompt caching for latest models
* add openai cost calculator
* add openai prompt caching test
* fix lint check
* add note on how usage._cache_read_input_tokens is used
* fix cost calc whisper openai
* use output_cost_per_second
* add input_cost_per_second
parent 930606ad63
commit 3682f661d8
5 changed files with 202 additions and 32 deletions
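At a glance, the pricing change: cached prompt tokens are billed at a discounted cache_read_input_token_cost rather than the normal input rate. A minimal arithmetic sketch of the formula the new test asserts, using made-up prices (the real rates come from litellm's model cost map):

# Hypothetical per-token rates, for illustration only; real values live in litellm's model cost map.
input_cost_per_token = 0.15 / 1e6            # assumed price per non-cached prompt token
output_cost_per_token = 0.60 / 1e6           # assumed price per completion token
cache_read_input_token_cost = 0.075 / 1e6    # assumed discounted price per cached prompt token

prompt_tokens = 0        # non-cached prompt tokens (mirrors the cache-hit case in the new test)
cached_tokens = 14       # usage.prompt_tokens_details.cached_tokens
completion_tokens = 10

prompt_cost = prompt_tokens * input_cost_per_token + cached_tokens * cache_read_input_token_cost
completion_cost = completion_tokens * output_cost_per_token
print(prompt_cost + completion_cost)  # total request cost in USD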
@@ -34,6 +34,7 @@ from litellm.llms.databricks.cost_calculator import (
 from litellm.llms.fireworks_ai.cost_calculator import (
     cost_per_token as fireworks_ai_cost_per_token,
 )
+from litellm.llms.OpenAI.cost_calculation import cost_per_token as openai_cost_per_token
 from litellm.llms.together_ai.cost_calculator import get_model_params_and_category
 from litellm.rerank_api.types import RerankResponse
 from litellm.types.llms.openai import HttpxBinaryResponseContent
@@ -55,7 +56,7 @@ from litellm.utils import (
 def _cost_per_token_custom_pricing_helper(
     prompt_tokens: float = 0,
     completion_tokens: float = 0,
-    response_time_ms=None,
+    response_time_ms: Optional[float] = 0.0,
     ### CUSTOM PRICING ###
     custom_cost_per_token: Optional[CostPerToken] = None,
     custom_cost_per_second: Optional[float] = None,
@@ -79,7 +80,7 @@ def cost_per_token(
     model: str = "",
     prompt_tokens: int = 0,
     completion_tokens: int = 0,
-    response_time_ms=None,
+    response_time_ms: Optional[float] = 0.0,
     custom_llm_provider: Optional[str] = None,
     region_name=None,
     ### CHARACTER PRICING ###
@@ -198,7 +199,33 @@ def cost_per_token(
     # see this https://learn.microsoft.com/en-us/azure/ai-services/openai/concepts/models
     print_verbose(f"Looking up model={model} in model_cost_map")
-    if custom_llm_provider == "vertex_ai":
+    if call_type == "speech" or call_type == "aspeech":
+        prompt_cost, completion_cost = _generic_cost_per_character(
+            model=model_without_prefix,
+            custom_llm_provider=custom_llm_provider,
+            prompt_characters=prompt_characters,
+            completion_characters=completion_characters,
+            custom_prompt_cost=None,
+            custom_completion_cost=0,
+        )
+        if prompt_cost is None or completion_cost is None:
+            raise ValueError(
+                "cost for tts call is None. prompt_cost={}, completion_cost={}, model={}, custom_llm_provider={}, prompt_characters={}, completion_characters={}".format(
+                    prompt_cost,
+                    completion_cost,
+                    model_without_prefix,
+                    custom_llm_provider,
+                    prompt_characters,
+                    completion_characters,
+                )
+            )
+        return prompt_cost, completion_cost
+    elif call_type == "arerank" or call_type == "rerank":
+        return rerank_cost(
+            model=model,
+            custom_llm_provider=custom_llm_provider,
+        )
+    elif custom_llm_provider == "vertex_ai":
         cost_router = google_cost_router(
             model=model_without_prefix,
             custom_llm_provider=custom_llm_provider,
@@ -226,6 +253,10 @@ def cost_per_token(
         )
     elif custom_llm_provider == "anthropic":
         return anthropic_cost_per_token(model=model, usage=usage_block)
+    elif custom_llm_provider == "openai":
+        return openai_cost_per_token(
+            model=model, usage=usage_block, response_time_ms=response_time_ms
+        )
     elif custom_llm_provider == "databricks":
         return databricks_cost_per_token(model=model, usage=usage_block)
     elif custom_llm_provider == "fireworks_ai":
@@ -237,32 +268,6 @@ def cost_per_token(
             prompt_tokens=prompt_tokens,
             completion_tokens=completion_tokens,
         )
-    elif call_type == "speech" or call_type == "aspeech":
-        prompt_cost, completion_cost = _generic_cost_per_character(
-            model=model_without_prefix,
-            custom_llm_provider=custom_llm_provider,
-            prompt_characters=prompt_characters,
-            completion_characters=completion_characters,
-            custom_prompt_cost=None,
-            custom_completion_cost=0,
-        )
-        if prompt_cost is None or completion_cost is None:
-            raise ValueError(
-                "cost for tts call is None. prompt_cost={}, completion_cost={}, model={}, custom_llm_provider={}, prompt_characters={}, completion_characters={}".format(
-                    prompt_cost,
-                    completion_cost,
-                    model_without_prefix,
-                    custom_llm_provider,
-                    prompt_characters,
-                    completion_characters,
-                )
-            )
-        return prompt_cost, completion_cost
-    elif call_type == "arerank" or call_type == "rerank":
-        return rerank_cost(
-            model=model,
-            custom_llm_provider=custom_llm_provider,
-        )
     elif model in model_cost_ref:
         print_verbose(f"Success: model={model} in model_cost_map")
         print_verbose(
@@ -461,7 +466,7 @@ def completion_cost(
     prompt="",
     messages: List = [],
     completion="",
-    total_time=0.0, # used for replicate, sagemaker
+    total_time: Optional[float] = 0.0, # used for replicate, sagemaker
     call_type: Literal[
         "embedding",
         "aembedding",
@@ -501,7 +506,7 @@ def completion_cost(
         model (str): Optional. The name of the language model used in the completion calls
         prompt (str): Optional. The input prompt passed to the llm
         completion (str): Optional. The output completion text from the llm
-        total_time (float): Optional. (Only used for Replicate LLMs) The total time used for the request in seconds
+        total_time (float, int): Optional. (Only used for Replicate LLMs) The total time used for the request in seconds
         custom_cost_per_token: Optional[CostPerToken]: the cost per input + output token for the llm api call.
         custom_cost_per_second: Optional[float]: the cost per second for the llm api call.

@@ -562,6 +567,13 @@ def completion_cost(
             completion_tokens = _usage.get("completion_tokens", 0)
             cache_creation_input_tokens = _usage.get("cache_creation_input_tokens", 0)
             cache_read_input_tokens = _usage.get("cache_read_input_tokens", 0)
+            if (
+                "prompt_tokens_details" in _usage
+                and _usage["prompt_tokens_details"] != {}
+                and _usage["prompt_tokens_details"]
+            ):
+                prompt_tokens_details = _usage.get("prompt_tokens_details", {})
+                cache_read_input_tokens = prompt_tokens_details.get("cached_tokens", 0)

             total_time = getattr(completion_response, "_response_ms", 0)
             verbose_logger.debug(
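The hunk above is where completion_cost() standardizes OpenAI's usage shape onto litellm's internal cache fields. A stand-alone sketch of that mapping, using an illustrative usage dict rather than a real API response:

# Illustrative usage payload shaped like OpenAI's; not taken from a real response.
_usage = {
    "prompt_tokens": 14,
    "completion_tokens": 10,
    "prompt_tokens_details": {"cached_tokens": 14},
}

cache_read_input_tokens = _usage.get("cache_read_input_tokens", 0)
prompt_tokens_details = _usage.get("prompt_tokens_details") or {}
if prompt_tokens_details:
    # OpenAI reports cache hits under prompt_tokens_details.cached_tokens;
    # completion_cost() copies that onto cache_read_input_tokens before pricing.
    cache_read_input_tokens = prompt_tokens_details.get("cached_tokens", 0)
print(cache_read_input_tokens)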
litellm/llms/OpenAI/cost_calculation.py (new file, +68)
@@ -0,0 +1,68 @@
+"""
+Helper util for handling openai-specific cost calculation
+- e.g.: prompt caching
+"""
+
+from typing import Optional, Tuple
+
+from litellm._logging import verbose_logger
+from litellm.types.utils import Usage
+from litellm.utils import get_model_info
+
+
+def cost_per_token(
+    model: str, usage: Usage, response_time_ms: Optional[float] = 0.0
+) -> Tuple[float, float]:
+    """
+    Calculates the cost per token for a given model, prompt tokens, and completion tokens.
+
+    Input:
+        - model: str, the model name without provider prefix
+        - usage: LiteLLM Usage block, containing anthropic caching information
+
+    Returns:
+        Tuple[float, float] - prompt_cost_in_usd, completion_cost_in_usd
+    """
+    ## GET MODEL INFO
+    model_info = get_model_info(model=model, custom_llm_provider="openai")
+
+    ## CALCULATE INPUT COST
+    prompt_cost: float = usage["prompt_tokens"] * model_info["input_cost_per_token"]
+
+    ## CALCULATE OUTPUT COST
+    completion_cost: float = (
+        usage["completion_tokens"] * model_info["output_cost_per_token"]
+    )
+
+    ## Prompt Caching cost calculation
+    if model_info.get("cache_read_input_token_cost") is not None:
+        # Note: We read ._cache_read_input_tokens from the Usage - since cost_calculator.py standardizes the cache read tokens on usage._cache_read_input_tokens
+        prompt_cost += usage._cache_read_input_tokens * (
+            model_info.get("cache_read_input_token_cost", 0) or 0
+        )
+
+    ## Speech / Audio cost calculation
+    if (
+        "output_cost_per_second" in model_info
+        and model_info["output_cost_per_second"] is not None
+        and response_time_ms is not None
+    ):
+        verbose_logger.debug(
+            f"For model={model} - output_cost_per_second: {model_info.get('output_cost_per_second')}; response time: {response_time_ms}"
+        )
+        ## COST PER SECOND ##
+        prompt_cost = 0
+        completion_cost = model_info["output_cost_per_second"] * response_time_ms / 1000
+    elif (
+        "input_cost_per_second" in model_info
+        and model_info["input_cost_per_second"] is not None
+        and response_time_ms is not None
+    ):
+        verbose_logger.debug(
+            f"For model={model} - input_cost_per_second: {model_info.get('input_cost_per_second')}; response time: {response_time_ms}"
+        )
+        ## COST PER SECOND ##
+        prompt_cost = model_info["input_cost_per_second"] * response_time_ms / 1000
+        completion_cost = 0.0
+
+    return prompt_cost, completion_cost
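For orientation, a rough sketch of how this helper might be exercised directly. It assumes gpt-4o-mini-2024-07-18 is in the local model cost map with a cache_read_input_token_cost, and that the private _cache_read_input_tokens field can be set the way cost_calculator.py sets it before delegating here:

from litellm.llms.OpenAI.cost_calculation import cost_per_token
from litellm.types.utils import Usage

usage = Usage(prompt_tokens=14, completion_tokens=10, total_tokens=24)
# cost_calculator.py normally copies prompt_tokens_details.cached_tokens onto this
# field before calling the helper; set it by hand for the sketch (assumption: your
# litellm version exposes it as a settable private attribute, as the comment in the
# new file implies).
usage._cache_read_input_tokens = 14

prompt_cost, completion_cost = cost_per_token(
    model="gpt-4o-mini-2024-07-18", usage=usage
)
print(prompt_cost, completion_cost)  # prompt cost includes the discounted cache-read charge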
@@ -79,6 +79,8 @@ class ModelInfo(TypedDict, total=False):
     output_vector_size: Optional[int]
     output_cost_per_video_per_second: Optional[float] # only for vertex ai models
     output_cost_per_audio_per_second: Optional[float] # only for vertex ai models
+    output_cost_per_second: Optional[float] # for OpenAI Speech models

     litellm_provider: Required[str]
     mode: Required[
         Literal[
@@ -5092,6 +5092,7 @@ def get_model_info(model: str, custom_llm_provider: Optional[str] = None) -> Mod
             output_cost_per_character_above_128k_tokens=_model_info.get(
                 "output_cost_per_character_above_128k_tokens", None
             ),
+            output_cost_per_second=_model_info.get("output_cost_per_second", None),
             output_vector_size=_model_info.get("output_vector_size", None),
             litellm_provider=_model_info.get(
                 "litellm_provider", custom_llm_provider
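These pricing fields only matter downstream if get_model_info() carries them through from the cost map, which is what the one-line change above does for output_cost_per_second. A small hedged sketch of inspecting them; it assumes the named models exist in the local cost map, and fields the map does not define simply come back as None:

from litellm import get_model_info

chat_info = get_model_info(model="gpt-4o-mini-2024-07-18", custom_llm_provider="openai")
print(chat_info.get("cache_read_input_token_cost"))  # discounted rate for cached prompt tokens, if published

audio_info = get_model_info(model="whisper-1", custom_llm_provider="openai")
# whichever of these the cost map defines drives the per-second branch in cost_calculation.py
print(audio_info.get("input_cost_per_second"), audio_info.get("output_cost_per_second"))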
@@ -24,6 +24,7 @@ from litellm import (
     model_cost,
     open_ai_chat_completion_models,
 )
+from litellm.types.utils import PromptTokensDetails
 from litellm.litellm_core_utils.litellm_logging import CustomLogger


@@ -209,7 +210,9 @@ def test_cost_ft_gpt_35():
         usage=Usage(prompt_tokens=21, completion_tokens=17, total_tokens=38),
     )

-    cost = litellm.completion_cost(completion_response=resp)
+    cost = litellm.completion_cost(
+        completion_response=resp, custom_llm_provider="openai"
+    )
     print("\n Calculated Cost for ft:gpt-3.5", cost)
     input_cost = model_cost["ft:gpt-3.5-turbo"]["input_cost_per_token"]
     output_cost = model_cost["ft:gpt-3.5-turbo"]["output_cost_per_token"]
@@ -1330,6 +1333,90 @@ def test_completion_cost_vertex_llama3():
     assert cost == 0


+def test_cost_openai_prompt_caching():
+    from litellm.utils import Choices, Message, ModelResponse, Usage
+    from litellm import get_model_info
+
+    os.environ["LITELLM_LOCAL_MODEL_COST_MAP"] = "True"
+    litellm.model_cost = litellm.get_model_cost_map(url="")
+
+    model = "gpt-4o-mini-2024-07-18"
+
+    ## LLM API CALL ## (MORE EXPENSIVE)
+    response_1 = ModelResponse(
+        id="chatcmpl-3f427194-0840-4d08-b571-56bfe38a5424",
+        choices=[
+            Choices(
+                finish_reason="length",
+                index=0,
+                message=Message(
+                    content="Hello! I'm doing well, thank you for",
+                    role="assistant",
+                    tool_calls=None,
+                    function_call=None,
+                ),
+            )
+        ],
+        created=1725036547,
+        model=model,
+        object="chat.completion",
+        system_fingerprint=None,
+        usage=Usage(
+            completion_tokens=10,
+            prompt_tokens=14,
+            total_tokens=24,
+        ),
+    )
+
+    ## PROMPT CACHE HIT ## (LESS EXPENSIVE)
+    response_2 = ModelResponse(
+        id="chatcmpl-3f427194-0840-4d08-b571-56bfe38a5424",
+        choices=[
+            Choices(
+                finish_reason="length",
+                index=0,
+                message=Message(
+                    content="Hello! I'm doing well, thank you for",
+                    role="assistant",
+                    tool_calls=None,
+                    function_call=None,
+                ),
+            )
+        ],
+        created=1725036547,
+        model=model,
+        object="chat.completion",
+        system_fingerprint=None,
+        usage=Usage(
+            completion_tokens=10,
+            prompt_tokens=0,
+            total_tokens=10,
+            prompt_tokens_details=PromptTokensDetails(
+                cached_tokens=14,
+            ),
+        ),
+    )
+
+    cost_1 = completion_cost(model=model, completion_response=response_1)
+    cost_2 = completion_cost(model=model, completion_response=response_2)
+    assert cost_1 > cost_2
+
+    model_info = get_model_info(model=model, custom_llm_provider="openai")
+    usage = response_2.usage
+
+    _expected_cost2 = (
+        usage.prompt_tokens * model_info["input_cost_per_token"]
+        + usage.completion_tokens * model_info["output_cost_per_token"]
+        + usage.prompt_tokens_details.cached_tokens
+        * model_info["cache_read_input_token_cost"]
+    )
+
+    print("_expected_cost2", _expected_cost2)
+    print("cost_2", cost_2)
+
+    assert cost_2 == _expected_cost2
+
+
 @pytest.mark.parametrize(
     "model",
     [