diff --git a/litellm/cost_calculator.py b/litellm/cost_calculator.py
index 797606121..aa2d3ad76 100644
--- a/litellm/cost_calculator.py
+++ b/litellm/cost_calculator.py
@@ -34,6 +34,7 @@ from litellm.llms.databricks.cost_calculator import (
 from litellm.llms.fireworks_ai.cost_calculator import (
     cost_per_token as fireworks_ai_cost_per_token,
 )
+from litellm.llms.OpenAI.cost_calculation import cost_per_token as openai_cost_per_token
 from litellm.llms.together_ai.cost_calculator import get_model_params_and_category
 from litellm.rerank_api.types import RerankResponse
 from litellm.types.llms.openai import HttpxBinaryResponseContent
@@ -55,7 +56,7 @@ from litellm.utils import (
 def _cost_per_token_custom_pricing_helper(
     prompt_tokens: float = 0,
     completion_tokens: float = 0,
-    response_time_ms=None,
+    response_time_ms: Optional[float] = 0.0,
     ### CUSTOM PRICING ###
     custom_cost_per_token: Optional[CostPerToken] = None,
     custom_cost_per_second: Optional[float] = None,
@@ -79,7 +80,7 @@ def cost_per_token(
     model: str = "",
     prompt_tokens: int = 0,
     completion_tokens: int = 0,
-    response_time_ms=None,
+    response_time_ms: Optional[float] = 0.0,
     custom_llm_provider: Optional[str] = None,
     region_name=None,
     ### CHARACTER PRICING ###
@@ -198,7 +199,33 @@ def cost_per_token(
     # see this https://learn.microsoft.com/en-us/azure/ai-services/openai/concepts/models
     print_verbose(f"Looking up model={model} in model_cost_map")
 
-    if custom_llm_provider == "vertex_ai":
+    if call_type == "speech" or call_type == "aspeech":
+        prompt_cost, completion_cost = _generic_cost_per_character(
+            model=model_without_prefix,
+            custom_llm_provider=custom_llm_provider,
+            prompt_characters=prompt_characters,
+            completion_characters=completion_characters,
+            custom_prompt_cost=None,
+            custom_completion_cost=0,
+        )
+        if prompt_cost is None or completion_cost is None:
+            raise ValueError(
+                "cost for tts call is None. prompt_cost={}, completion_cost={}, model={}, custom_llm_provider={}, prompt_characters={}, completion_characters={}".format(
+                    prompt_cost,
+                    completion_cost,
+                    model_without_prefix,
+                    custom_llm_provider,
+                    prompt_characters,
+                    completion_characters,
+                )
+            )
+        return prompt_cost, completion_cost
+    elif call_type == "arerank" or call_type == "rerank":
+        return rerank_cost(
+            model=model,
+            custom_llm_provider=custom_llm_provider,
+        )
+    elif custom_llm_provider == "vertex_ai":
         cost_router = google_cost_router(
             model=model_without_prefix,
             custom_llm_provider=custom_llm_provider,
@@ -226,6 +253,10 @@ def cost_per_token(
         )
     elif custom_llm_provider == "anthropic":
         return anthropic_cost_per_token(model=model, usage=usage_block)
+    elif custom_llm_provider == "openai":
+        return openai_cost_per_token(
+            model=model, usage=usage_block, response_time_ms=response_time_ms
+        )
     elif custom_llm_provider == "databricks":
         return databricks_cost_per_token(model=model, usage=usage_block)
     elif custom_llm_provider == "fireworks_ai":
@@ -237,32 +268,6 @@ def cost_per_token(
             prompt_tokens=prompt_tokens,
             completion_tokens=completion_tokens,
         )
-    elif call_type == "speech" or call_type == "aspeech":
-        prompt_cost, completion_cost = _generic_cost_per_character(
-            model=model_without_prefix,
-            custom_llm_provider=custom_llm_provider,
-            prompt_characters=prompt_characters,
-            completion_characters=completion_characters,
-            custom_prompt_cost=None,
-            custom_completion_cost=0,
-        )
-        if prompt_cost is None or completion_cost is None:
-            raise ValueError(
-                "cost for tts call is None. prompt_cost={}, completion_cost={}, model={}, custom_llm_provider={}, prompt_characters={}, completion_characters={}".format(
-                    prompt_cost,
-                    completion_cost,
-                    model_without_prefix,
-                    custom_llm_provider,
-                    prompt_characters,
-                    completion_characters,
-                )
-            )
-        return prompt_cost, completion_cost
-    elif call_type == "arerank" or call_type == "rerank":
-        return rerank_cost(
-            model=model,
-            custom_llm_provider=custom_llm_provider,
-        )
     elif model in model_cost_ref:
         print_verbose(f"Success: model={model} in model_cost_map")
         print_verbose(
@@ -461,7 +466,7 @@ def completion_cost(
     prompt="",
     messages: List = [],
     completion="",
-    total_time=0.0,  # used for replicate, sagemaker
+    total_time: Optional[float] = 0.0,  # used for replicate, sagemaker
     call_type: Literal[
         "embedding",
         "aembedding",
@@ -501,7 +506,7 @@ def completion_cost(
         model (str): Optional. The name of the language model used in the completion calls
         prompt (str): Optional. The input prompt passed to the llm
         completion (str): Optional. The output completion text from the llm
-        total_time (float): Optional. (Only used for Replicate LLMs) The total time used for the request in seconds
+        total_time (float, int): Optional. (Only used for Replicate LLMs) The total time used for the request in seconds
         custom_cost_per_token: Optional[CostPerToken]: the cost per input + output token for the llm api call.
         custom_cost_per_second: Optional[float]: the cost per second for the llm api call.
@@ -562,6 +567,13 @@ def completion_cost(
             completion_tokens = _usage.get("completion_tokens", 0)
             cache_creation_input_tokens = _usage.get("cache_creation_input_tokens", 0)
             cache_read_input_tokens = _usage.get("cache_read_input_tokens", 0)
+            if (
+                "prompt_tokens_details" in _usage
+                and _usage["prompt_tokens_details"] != {}
+                and _usage["prompt_tokens_details"]
+            ):
+                prompt_tokens_details = _usage.get("prompt_tokens_details", {})
+                cache_read_input_tokens = prompt_tokens_details.get("cached_tokens", 0)
 
             total_time = getattr(completion_response, "_response_ms", 0)
             verbose_logger.debug(
diff --git a/litellm/llms/OpenAI/cost_calculation.py b/litellm/llms/OpenAI/cost_calculation.py
new file mode 100644
index 000000000..6b8e5f41e
--- /dev/null
+++ b/litellm/llms/OpenAI/cost_calculation.py
@@ -0,0 +1,68 @@
+"""
+Helper util for handling openai-specific cost calculation
+- e.g.: prompt caching
+"""
+
+from typing import Optional, Tuple
+
+from litellm._logging import verbose_logger
+from litellm.types.utils import Usage
+from litellm.utils import get_model_info
+
+
+def cost_per_token(
+    model: str, usage: Usage, response_time_ms: Optional[float] = 0.0
+) -> Tuple[float, float]:
+    """
+    Calculates the cost per token for a given model, prompt tokens, and completion tokens.
+
+    Input:
+    - model: str, the model name without provider prefix
+    - usage: LiteLLM Usage block, containing anthropic caching information
+
+    Returns:
+        Tuple[float, float] - prompt_cost_in_usd, completion_cost_in_usd
+    """
+    ## GET MODEL INFO
+    model_info = get_model_info(model=model, custom_llm_provider="openai")
+
+    ## CALCULATE INPUT COST
+    prompt_cost: float = usage["prompt_tokens"] * model_info["input_cost_per_token"]
+
+    ## CALCULATE OUTPUT COST
+    completion_cost: float = (
+        usage["completion_tokens"] * model_info["output_cost_per_token"]
+    )
+
+    ## Prompt Caching cost calculation
+    if model_info.get("cache_read_input_token_cost") is not None:
+        # Note: We read ._cache_read_input_tokens from the Usage - since cost_calculator.py standardizes the cache read tokens on usage._cache_read_input_tokens
+        prompt_cost += usage._cache_read_input_tokens * (
+            model_info.get("cache_read_input_token_cost", 0) or 0
+        )
+
+    ## Speech / Audio cost calculation
+    if (
+        "output_cost_per_second" in model_info
+        and model_info["output_cost_per_second"] is not None
+        and response_time_ms is not None
+    ):
+        verbose_logger.debug(
+            f"For model={model} - output_cost_per_second: {model_info.get('output_cost_per_second')}; response time: {response_time_ms}"
+        )
+        ## COST PER SECOND ##
+        prompt_cost = 0
+        completion_cost = model_info["output_cost_per_second"] * response_time_ms / 1000
+    elif (
+        "input_cost_per_second" in model_info
+        and model_info["input_cost_per_second"] is not None
+        and response_time_ms is not None
+    ):
+        verbose_logger.debug(
+            f"For model={model} - input_cost_per_second: {model_info.get('input_cost_per_second')}; response time: {response_time_ms}"
+        )
+        ## COST PER SECOND ##
+        prompt_cost = model_info["input_cost_per_second"] * response_time_ms / 1000
+        completion_cost = 0.0
+
+    return prompt_cost, completion_cost
diff --git a/litellm/types/utils.py b/litellm/types/utils.py
index 35d48d9ff..4cabf6834 100644
--- a/litellm/types/utils.py
+++ b/litellm/types/utils.py
@@ -79,6 +79,8 @@ class ModelInfo(TypedDict, total=False):
     output_vector_size: Optional[int]
     output_cost_per_video_per_second: Optional[float]  # only for vertex ai models
     output_cost_per_audio_per_second: Optional[float]  # only for vertex ai models
+    output_cost_per_second: Optional[float]  # for OpenAI Speech models
+
     litellm_provider: Required[str]
     mode: Required[
         Literal[
diff --git a/litellm/utils.py b/litellm/utils.py
index e8dea5759..57166fd2a 100644
--- a/litellm/utils.py
+++ b/litellm/utils.py
@@ -5092,6 +5092,7 @@ def get_model_info(model: str, custom_llm_provider: Optional[str] = None) -> Mod
             output_cost_per_character_above_128k_tokens=_model_info.get(
                 "output_cost_per_character_above_128k_tokens", None
             ),
+            output_cost_per_second=_model_info.get("output_cost_per_second", None),
             output_vector_size=_model_info.get("output_vector_size", None),
             litellm_provider=_model_info.get(
                 "litellm_provider", custom_llm_provider
diff --git a/tests/local_testing/test_completion_cost.py b/tests/local_testing/test_completion_cost.py
index 8408a5051..02c87ed04 100644
--- a/tests/local_testing/test_completion_cost.py
+++ b/tests/local_testing/test_completion_cost.py
@@ -24,6 +24,7 @@ from litellm import (
     model_cost,
     open_ai_chat_completion_models,
 )
+from litellm.types.utils import PromptTokensDetails
 from litellm.litellm_core_utils.litellm_logging import CustomLogger
 
 
@@ -209,7 +210,9 @@ def test_cost_ft_gpt_35():
         usage=Usage(prompt_tokens=21, completion_tokens=17, total_tokens=38),
     )
 
-    cost = litellm.completion_cost(completion_response=resp)
+    cost = litellm.completion_cost(
+        completion_response=resp, custom_llm_provider="openai"
+    )
     print("\n Calculated Cost for ft:gpt-3.5", cost)
     input_cost = model_cost["ft:gpt-3.5-turbo"]["input_cost_per_token"]
     output_cost = model_cost["ft:gpt-3.5-turbo"]["output_cost_per_token"]
@@ -1330,6 +1333,90 @@ def test_completion_cost_vertex_llama3():
     assert cost == 0
 
 
+def test_cost_openai_prompt_caching():
+    from litellm.utils import Choices, Message, ModelResponse, Usage
+    from litellm import get_model_info
+
+    os.environ["LITELLM_LOCAL_MODEL_COST_MAP"] = "True"
+    litellm.model_cost = litellm.get_model_cost_map(url="")
+
+    model = "gpt-4o-mini-2024-07-18"
+
+    ## LLM API CALL ## (MORE EXPENSIVE)
+    response_1 = ModelResponse(
+        id="chatcmpl-3f427194-0840-4d08-b571-56bfe38a5424",
+        choices=[
+            Choices(
+                finish_reason="length",
+                index=0,
+                message=Message(
+                    content="Hello! I'm doing well, thank you for",
+                    role="assistant",
+                    tool_calls=None,
+                    function_call=None,
+                ),
+            )
+        ],
+        created=1725036547,
+        model=model,
+        object="chat.completion",
+        system_fingerprint=None,
+        usage=Usage(
+            completion_tokens=10,
+            prompt_tokens=14,
+            total_tokens=24,
+        ),
+    )
+
+    ## PROMPT CACHE HIT ## (LESS EXPENSIVE)
+    response_2 = ModelResponse(
+        id="chatcmpl-3f427194-0840-4d08-b571-56bfe38a5424",
+        choices=[
+            Choices(
+                finish_reason="length",
+                index=0,
+                message=Message(
+                    content="Hello! I'm doing well, thank you for",
+                    role="assistant",
+                    tool_calls=None,
+                    function_call=None,
+                ),
+            )
+        ],
+        created=1725036547,
+        model=model,
+        object="chat.completion",
+        system_fingerprint=None,
+        usage=Usage(
+            completion_tokens=10,
+            prompt_tokens=0,
+            total_tokens=10,
+            prompt_tokens_details=PromptTokensDetails(
+                cached_tokens=14,
+            ),
+        ),
+    )
+
+    cost_1 = completion_cost(model=model, completion_response=response_1)
+    cost_2 = completion_cost(model=model, completion_response=response_2)
+    assert cost_1 > cost_2
+
+    model_info = get_model_info(model=model, custom_llm_provider="openai")
+    usage = response_2.usage
+
+    _expected_cost2 = (
+        usage.prompt_tokens * model_info["input_cost_per_token"]
+        + usage.completion_tokens * model_info["output_cost_per_token"]
+        + usage.prompt_tokens_details.cached_tokens
+        * model_info["cache_read_input_token_cost"]
+    )
+
+    print("_expected_cost2", _expected_cost2)
+    print("cost_2", cost_2)
+
+    assert cost_2 == _expected_cost2
+
+
 @pytest.mark.parametrize(
     "model",
     [