diff --git a/litellm/cost_calculator.py b/litellm/cost_calculator.py
index 9c980aa3a..789811b46 100644
--- a/litellm/cost_calculator.py
+++ b/litellm/cost_calculator.py
@@ -6,6 +6,9 @@ from typing import List, Literal, Optional, Tuple, Union
 import litellm
 import litellm._logging
 from litellm import verbose_logger
+from litellm.litellm_core_utils.llm_cost_calc.google import (
+    cost_per_character as google_cost_per_character,
+)
 from litellm.litellm_core_utils.llm_cost_calc.google import (
     cost_per_token as google_cost_per_token,
 )
@@ -23,8 +26,8 @@ from litellm.utils import (
 
 
 def _cost_per_token_custom_pricing_helper(
-    prompt_tokens=0,
-    completion_tokens=0,
+    prompt_tokens: float = 0,
+    completion_tokens: float = 0,
     response_time_ms=None,
     ### CUSTOM PRICING ###
     custom_cost_per_token: Optional[CostPerToken] = None,
@@ -52,6 +55,9 @@ def cost_per_token(
     response_time_ms=None,
     custom_llm_provider: Optional[str] = None,
     region_name=None,
+    ### CHARACTER PRICING ###
+    prompt_characters: float = 0,
+    completion_characters: float = 0,
     ### CUSTOM PRICING ###
     custom_cost_per_token: Optional[CostPerToken] = None,
     custom_cost_per_second: Optional[float] = None,
@@ -64,6 +70,8 @@ def cost_per_token(
         prompt_tokens (int): The number of tokens in the prompt.
         completion_tokens (int): The number of tokens in the completion.
         response_time (float): The amount of time, in milliseconds, it took the call to complete.
+        prompt_characters (float): The number of characters in the prompt. Used for vertex ai cost calculation.
+        completion_characters (float): The number of characters in the completion response. Used for vertex ai cost calculation.
         custom_llm_provider (str): The llm provider to whom the call was made (see init.py for full list)
         custom_cost_per_token: Optional[CostPerToken]: the cost per input + output token for the llm api call.
         custom_cost_per_second: Optional[float]: the cost per second for the llm api call.
@@ -127,7 +135,16 @@ def cost_per_token(
         # see this https://learn.microsoft.com/en-us/azure/ai-services/openai/concepts/models
         print_verbose(f"Looking up model={model} in model_cost_map")
 
-    if custom_llm_provider == "vertex_ai" or custom_llm_provider == "gemini":
+    if custom_llm_provider == "vertex_ai":
+        return google_cost_per_character(
+            model=model_without_prefix,
+            custom_llm_provider=custom_llm_provider,
+            prompt_characters=prompt_characters,
+            completion_characters=completion_characters,
+            prompt_tokens=prompt_tokens,
+            completion_tokens=completion_tokens,
+        )
+    elif custom_llm_provider == "gemini":
         return google_cost_per_token(
             model=model_without_prefix,
             custom_llm_provider=custom_llm_provider,
@@ -378,7 +395,9 @@ def completion_cost(
             model = "dall-e-2"  # for dall-e-2, azure expects an empty model name
         # Handle Inputs to completion_cost
         prompt_tokens = 0
+        prompt_characters = 0
         completion_tokens = 0
+        completion_characters = 0
         custom_llm_provider = None
         if completion_response is not None:
             # get input/output tokens from completion_response
@@ -495,6 +514,30 @@ def completion_cost(
                     f"Model is None and does not exist in passed completion_response. Passed completion_response={completion_response}, model={model}"
                 )
+
+        if (
+            custom_llm_provider is not None
+            and custom_llm_provider == "vertex_ai"
+            and completion_response is not None
+            and isinstance(completion_response, ModelResponse)
+        ):
+            # Calculate the prompt characters + response characters
+            if len(messages) > 0:
+                prompt_string = litellm.utils.get_formatted_prompt(
+                    data={"messages": messages}, call_type="completion"
+                )
+            else:
+                prompt_string = ""
+
+            prompt_characters = litellm.utils._count_characters(text=prompt_string)
+
+            completion_string = litellm.utils.get_response_string(
+                response_obj=completion_response
+            )
+
+            completion_characters = litellm.utils._count_characters(
+                text=completion_string
+            )
+
         (
             prompt_tokens_cost_usd_dollar,
             completion_tokens_cost_usd_dollar,
         ) = cost_per_token(
@@ -507,6 +550,8 @@ def completion_cost(
             region_name=region_name,
             custom_cost_per_second=custom_cost_per_second,
             custom_cost_per_token=custom_cost_per_token,
+            prompt_characters=prompt_characters,
+            completion_characters=completion_characters,
         )
         _final_cost = prompt_tokens_cost_usd_dollar + completion_tokens_cost_usd_dollar
         print_verbose(
diff --git a/litellm/litellm_core_utils/llm_cost_calc/google.py b/litellm/litellm_core_utils/llm_cost_calc/google.py
index 747860070..2c958cf88 100644
--- a/litellm/litellm_core_utils/llm_cost_calc/google.py
+++ b/litellm/litellm_core_utils/llm_cost_calc/google.py
@@ -1,8 +1,10 @@
 # What is this?
 ## Cost calculation for Google AI Studio / Vertex AI models
-from typing import Literal, Tuple
+import traceback
+from typing import List, Literal, Optional, Tuple
 
 import litellm
+from litellm import verbose_logger
 
 """
 Gemini pricing covers:
@@ -12,6 +14,12 @@ Gemini pricing covers:
 - video
 """
 
+"""
+Vertex AI -> character based pricing
+
+Google AI Studio -> token based pricing
+"""
+
 models_without_dynamic_pricing = ["gemini-1.0-pro", "gemini-pro"]
 
 
@@ -21,6 +29,124 @@ def _is_above_128k(tokens: float) -> bool:
     return False
 
 
+def cost_per_character(
+    model: str,
+    custom_llm_provider: str,
+    prompt_tokens: float,
+    completion_tokens: float,
+    prompt_characters: float,
+    completion_characters: float,
+) -> Tuple[float, float]:
+    """
+    Calculates the cost per character for a given VertexAI model, input messages, and response object.
+
+    Input:
+        - model: str, the model name without provider prefix
+        - custom_llm_provider: str, "vertex_ai-*"
+        - prompt_characters: float, the number of input characters
+        - completion_characters: float, the number of output characters
+
+    Returns:
+        Tuple[float, float] - prompt_cost_in_usd, completion_cost_in_usd
+
+    Raises:
+        Exception if model requires >128k pricing, but model cost not mapped
+    """
+    model_info = litellm.get_model_info(
+        model=model, custom_llm_provider=custom_llm_provider
+    )
+
+    ## GET MODEL INFO
+    model_info = litellm.get_model_info(
+        model=model, custom_llm_provider=custom_llm_provider
+    )
+
+    ## CALCULATE INPUT COST
+    try:
+        if (
+            _is_above_128k(tokens=prompt_characters * 4)  # 1 token = 4 char
+            and model not in models_without_dynamic_pricing
+        ):
+            ## check if character pricing, else default to token pricing
+            assert (
+                "input_cost_per_character_above_128k_tokens" in model_info
+                and model_info["input_cost_per_character_above_128k_tokens"] is not None
+            ), "model info for model={} does not have 'input_cost_per_character_above_128k_tokens'-pricing for > 128k tokens\nmodel_info={}".format(
+                model, model_info
+            )
+            prompt_cost = (
+                prompt_characters
+                * model_info["input_cost_per_character_above_128k_tokens"]
+            )
+        else:
+            assert (
+                "input_cost_per_character" in model_info
+                and model_info["input_cost_per_character"] is not None
+            ), "model info for model={} does not have 'input_cost_per_character'-pricing\nmodel_info={}".format(
+                model, model_info
+            )
+            prompt_cost = prompt_characters * model_info["input_cost_per_character"]
+    except Exception as e:
+        verbose_logger.error(
+            "litellm.litellm_core_utils.llm_cost_calc.google.cost_per_character(): Exception occurred - {}\n{}\n\
+                Defaulting to (cost_per_token * 4) calculation for prompt_cost".format(
+                str(e), traceback.format_exc()
+            )
+        )
+        initial_prompt_cost, _ = cost_per_token(
+            model=model,
+            custom_llm_provider=custom_llm_provider,
+            prompt_tokens=prompt_tokens,
+            completion_tokens=completion_tokens,
+        )
+
+        prompt_cost = initial_prompt_cost * 4
+
+    ## CALCULATE OUTPUT COST
+    try:
+        if (
+            _is_above_128k(tokens=completion_characters * 4)  # 1 token = 4 char
+            and model not in models_without_dynamic_pricing
+        ):
+            assert (
+                "output_cost_per_character_above_128k_tokens" in model_info
+                and model_info["output_cost_per_character_above_128k_tokens"]
+                is not None
+            ), "model info for model={} does not have 'output_cost_per_character_above_128k_tokens' pricing\nmodel_info={}".format(
+                model, model_info
+            )
+            completion_cost = (
+                completion_characters
+                * model_info["output_cost_per_character_above_128k_tokens"]
+            )
+        else:
+            assert (
+                "output_cost_per_character" in model_info
+                and model_info["output_cost_per_character"] is not None
+            ), "model info for model={} does not have 'output_cost_per_character'-pricing\nmodel_info={}".format(
+                model, model_info
+            )
+            completion_cost = (
+                completion_characters * model_info["output_cost_per_character"]
+            )
+    except Exception as e:
+        verbose_logger.error(
+            "litellm.litellm_core_utils.llm_cost_calc.google.cost_per_character(): Exception occurred - {}\n{}\n\
+                Defaulting to (cost_per_token * 4) calculation for completion_cost".format(
+                str(e), traceback.format_exc()
+            )
+        )
+        _, initial_completion_cost = cost_per_token(
+            model=model,
+            custom_llm_provider=custom_llm_provider,
+            prompt_tokens=prompt_tokens,
+            completion_tokens=completion_tokens,
+        )
+
+        completion_cost = initial_completion_cost * 4
+    return prompt_cost, completion_cost
+
+
 def cost_per_token(
     model: str,
     custom_llm_provider: str,
@@ -53,7 +179,8 @@ def cost_per_token(
         and model not in models_without_dynamic_pricing
     ):
         assert (
-            model_info["input_cost_per_token_above_128k_tokens"] is not None
+            "input_cost_per_token_above_128k_tokens" in model_info
+            and model_info["input_cost_per_token_above_128k_tokens"] is not None
         ), "model info for model={} does not have pricing for > 128k tokens\nmodel_info={}".format(
             model, model_info
         )
@@ -69,7 +196,8 @@ def cost_per_token(
         and model not in models_without_dynamic_pricing
     ):
         assert (
-            model_info["output_cost_per_token_above_128k_tokens"] is not None
+            "output_cost_per_token_above_128k_tokens" in model_info
+            and model_info["output_cost_per_token_above_128k_tokens"] is not None
         ), "model info for model={} does not have pricing for > 128k tokens\nmodel_info={}".format(
             model, model_info
         )
diff --git a/litellm/tests/test_completion_cost.py b/litellm/tests/test_completion_cost.py
index b7c85679d..42f59c5f5 100644
--- a/litellm/tests/test_completion_cost.py
+++ b/litellm/tests/test_completion_cost.py
@@ -576,7 +576,7 @@ def test_together_ai_qwen_completion_cost():
 
 
 @pytest.mark.parametrize("above_128k", [False, True])
-@pytest.mark.parametrize("provider", ["vertex_ai", "gemini"])
+@pytest.mark.parametrize("provider", ["gemini"])
 def test_gemini_completion_cost(above_128k, provider):
     """
     Check if cost correctly calculated for gemini models based on context window
@@ -628,3 +628,35 @@ def test_gemini_completion_cost(above_128k, provider):
 
     assert calculated_input_cost == input_cost
     assert calculated_output_cost == output_cost
+
+
+def _count_characters(text):
+    # Remove white spaces and count characters
+    filtered_text = "".join(char for char in text if not char.isspace())
+    return len(filtered_text)
+
+
+def test_vertex_ai_completion_cost():
+    os.environ["LITELLM_LOCAL_MODEL_COST_MAP"] = "True"
+    litellm.model_cost = litellm.get_model_cost_map(url="")
+
+    text = "The quick brown fox jumps over the lazy dog."
+    characters = _count_characters(text=text)
+
+    model_info = litellm.get_model_info(model="gemini-1.5-flash")
+
+    print("\nExpected model info:\n{}\n\n".format(model_info))
+
+    expected_input_cost = characters * model_info["input_cost_per_character"]
+
+    ## CALCULATED COST
+    calculated_input_cost, calculated_output_cost = cost_per_token(
+        model="gemini-1.5-flash",
+        custom_llm_provider="vertex_ai",
+        prompt_characters=characters,
+        completion_characters=0,
+    )
+
+    assert round(expected_input_cost, 6) == round(calculated_input_cost, 6)
+    print("expected_input_cost: {}".format(expected_input_cost))
+    print("calculated_input_cost: {}".format(calculated_input_cost))
diff --git a/litellm/types/utils.py b/litellm/types/utils.py
index 142eef300..da6da4dc9 100644
--- a/litellm/types/utils.py
+++ b/litellm/types/utils.py
@@ -44,15 +44,25 @@ class ModelInfo(TypedDict, total=False):
     max_input_tokens: Required[Optional[int]]
     max_output_tokens: Required[Optional[int]]
     input_cost_per_token: Required[float]
-    input_cost_per_token_above_128k_tokens: Optional[float]
-    input_cost_per_image: Optional[float]
-    input_cost_per_audio_per_second: Optional[float]
-    input_cost_per_video_per_second: Optional[float]
+    input_cost_per_character: Optional[float]  # only for vertex ai models
+    input_cost_per_token_above_128k_tokens: Optional[float]  # only for vertex ai models
+    input_cost_per_character_above_128k_tokens: Optional[
+        float
+    ]  # only for vertex ai models
+    input_cost_per_image: Optional[float]  # only for vertex ai models
+    input_cost_per_audio_per_second: Optional[float]  # only for vertex ai models
+    input_cost_per_video_per_second: Optional[float]  # only for vertex ai models
     output_cost_per_token: Required[float]
-    output_cost_per_token_above_128k_tokens: Optional[float]
+    output_cost_per_character: Optional[float]  # only for vertex ai models
+    output_cost_per_token_above_128k_tokens: Optional[
+        float
+    ]  # only for vertex ai models
+    output_cost_per_character_above_128k_tokens: Optional[
+        float
+    ]  # only for vertex ai models
     output_cost_per_image: Optional[float]
-    output_cost_per_video_per_second: Optional[float]
-    output_cost_per_audio_per_second: Optional[float]
+    output_cost_per_video_per_second: Optional[float]  # only for vertex ai models
+    output_cost_per_audio_per_second: Optional[float]  # only for vertex ai models
     litellm_provider: Required[str]
     mode: Required[
         Literal[
diff --git a/litellm/utils.py b/litellm/utils.py
index 0623e26b3..2cd19be54 100644
--- a/litellm/utils.py
+++ b/litellm/utils.py
@@ -3810,6 +3810,12 @@ def get_supported_openai_params(
         return None
 
 
+def _count_characters(text: str) -> int:
+    # Remove white spaces and count characters
+    filtered_text = "".join(char for char in text if not char.isspace())
+    return len(filtered_text)
+
+
 def get_formatted_prompt(
     data: dict,
     call_type: Literal[
@@ -3828,9 +3834,20 @@ def get_formatted_prompt(
     """
     prompt = ""
     if call_type == "completion":
-        for m in data["messages"]:
-            if "content" in m and isinstance(m["content"], str):
-                prompt += m["content"]
+        for message in data["messages"]:
+            if message.get("content", None) is not None:
+                content = message.get("content")
+                if isinstance(content, str):
+                    prompt += message["content"]
+                elif isinstance(content, List):
+                    for c in content:
+                        if c["type"] == "text":
+                            prompt += c["text"]
+            if "tool_calls" in message:
+                for tool_call in message["tool_calls"]:
+                    if "function" in tool_call:
+                        function_arguments = tool_call["function"]["arguments"]
+                        prompt += function_arguments
     elif call_type == "text_completion":
         prompt = data["prompt"]
     elif call_type == "embedding" or call_type == "moderation":
@@ -3847,6 +3864,17 @@ def get_formatted_prompt(
     return prompt
 
 
+def get_response_string(response_obj: ModelResponse) -> str:
+    _choices: List[Choices] = response_obj.choices  # type: ignore
+
+    response_str = ""
+    for choice in _choices:
+        if choice.message.content is not None:
+            response_str += choice.message.content
+
+    return response_str
+
+
 def _is_non_openai_azure_model(model: str) -> bool:
     try:
         model_name = model.split("/", 1)[1]
@@ -4392,13 +4420,22 @@ def get_model_info(model: str, custom_llm_provider: Optional[str] = None) -> Mod
                 max_input_tokens=_model_info.get("max_input_tokens", None),
                 max_output_tokens=_model_info.get("max_output_tokens", None),
                 input_cost_per_token=_model_info.get("input_cost_per_token", 0),
+                input_cost_per_character=_model_info.get(
+                    "input_cost_per_character", None
+                ),
                 input_cost_per_token_above_128k_tokens=_model_info.get(
                     "input_cost_per_token_above_128k_tokens", None
                 ),
                 output_cost_per_token=_model_info.get("output_cost_per_token", 0),
+                output_cost_per_character=_model_info.get(
+                    "output_cost_per_character", None
+                ),
                 output_cost_per_token_above_128k_tokens=_model_info.get(
                     "output_cost_per_token_above_128k_tokens", None
                 ),
+                output_cost_per_character_above_128k_tokens=_model_info.get(
+                    "output_cost_per_character_above_128k_tokens", None
+                ),
                 litellm_provider=_model_info.get(
                     "litellm_provider", custom_llm_provider
                 ),
@@ -4426,13 +4463,22 @@ def get_model_info(model: str, custom_llm_provider: Optional[str] = None) -> Mod
                 max_input_tokens=_model_info.get("max_input_tokens", None),
                 max_output_tokens=_model_info.get("max_output_tokens", None),
                 input_cost_per_token=_model_info.get("input_cost_per_token", 0),
+                input_cost_per_character=_model_info.get(
+                    "input_cost_per_character", None
+                ),
                 input_cost_per_token_above_128k_tokens=_model_info.get(
                     "input_cost_per_token_above_128k_tokens", None
                 ),
                 output_cost_per_token=_model_info.get("output_cost_per_token", 0),
+                output_cost_per_character=_model_info.get(
+                    "output_cost_per_character", None
+                ),
                 output_cost_per_token_above_128k_tokens=_model_info.get(
                     "output_cost_per_token_above_128k_tokens", None
                 ),
+                output_cost_per_character_above_128k_tokens=_model_info.get(
+                    "output_cost_per_character_above_128k_tokens", None
+                ),
                 litellm_provider=_model_info.get(
                     "litellm_provider", custom_llm_provider
                 ),
@@ -4460,13 +4506,22 @@ def get_model_info(model: str, custom_llm_provider: Optional[str] = None) -> Mod
                 max_input_tokens=_model_info.get("max_input_tokens", None),
                 max_output_tokens=_model_info.get("max_output_tokens", None),
                 input_cost_per_token=_model_info.get("input_cost_per_token", 0),
+                input_cost_per_character=_model_info.get(
+                    "input_cost_per_character", None
+                ),
                 input_cost_per_token_above_128k_tokens=_model_info.get(
                     "input_cost_per_token_above_128k_tokens", None
                 ),
                 output_cost_per_token=_model_info.get("output_cost_per_token", 0),
+                output_cost_per_character=_model_info.get(
+                    "output_cost_per_character", None
+                ),
                 output_cost_per_token_above_128k_tokens=_model_info.get(
                     "output_cost_per_token_above_128k_tokens", None
                 ),
+                output_cost_per_character_above_128k_tokens=_model_info.get(
+                    "output_cost_per_character_above_128k_tokens", None
+                ),
                 litellm_provider=_model_info.get(
                     "litellm_provider", custom_llm_provider
                 ),