diff --git a/litellm/cost_calculator.py b/litellm/cost_calculator.py index c84df53e8..cdbf69ff5 100644 --- a/litellm/cost_calculator.py +++ b/litellm/cost_calculator.py @@ -1,20 +1,24 @@ # What is this? ## File for 'response_cost' calculation in Logging -from typing import Optional, Union, Literal, List, Tuple +from typing import List, Literal, Optional, Tuple, Union + +import litellm import litellm._logging +from litellm import verbose_logger +from litellm.litellm_core_utils.llm_cost_calc.google import ( + cost_per_token as google_cost_per_token, +) from litellm.utils import ( - ModelResponse, + CallTypes, + CostPerToken, EmbeddingResponse, ImageResponse, - TranscriptionResponse, + ModelResponse, TextCompletionResponse, - CallTypes, + TranscriptionResponse, print_verbose, - CostPerToken, token_counter, ) -import litellm -from litellm import verbose_logger def _cost_per_token_custom_pricing_helper( @@ -42,10 +46,10 @@ def _cost_per_token_custom_pricing_helper( def cost_per_token( model: str = "", - prompt_tokens=0, - completion_tokens=0, + prompt_tokens: float = 0, + completion_tokens: float = 0, response_time_ms=None, - custom_llm_provider=None, + custom_llm_provider: Optional[str] = None, region_name=None, ### CUSTOM PRICING ### custom_cost_per_token: Optional[CostPerToken] = None, @@ -66,6 +70,7 @@ def cost_per_token( Returns: tuple: A tuple containing the cost in USD dollars for prompt tokens and completion tokens, respectively. """ + args = locals() if model is None: raise Exception("Invalid arg. Model cannot be none.") ## CUSTOM PRICING ## @@ -94,7 +99,8 @@ def cost_per_token( model_with_provider_and_region in model_cost_ref ): # use region based pricing, if it's available model_with_provider = model_with_provider_and_region - + else: + _, custom_llm_provider, _, _ = litellm.get_llm_provider(model=model) model_without_prefix = model model_parts = model.split("/") if len(model_parts) > 1: @@ -120,7 +126,14 @@ def cost_per_token( # see this https://learn.microsoft.com/en-us/azure/ai-services/openai/concepts/models print_verbose(f"Looking up model={model} in model_cost_map") - if model in model_cost_ref: + if custom_llm_provider == "vertex_ai" or custom_llm_provider == "gemini": + return google_cost_per_token( + model=model_without_prefix, + custom_llm_provider=custom_llm_provider, + prompt_tokens=prompt_tokens, + completion_tokens=completion_tokens, + ) + elif model in model_cost_ref: print_verbose(f"Success: model={model} in model_cost_map") print_verbose( f"prompt_tokens={prompt_tokens}; completion_tokens={completion_tokens}" diff --git a/litellm/litellm_core_utils/llm_cost_calc/google.py b/litellm/litellm_core_utils/llm_cost_calc/google.py new file mode 100644 index 000000000..747860070 --- /dev/null +++ b/litellm/litellm_core_utils/llm_cost_calc/google.py @@ -0,0 +1,82 @@ +# What is this? +## Cost calculation for Google AI Studio / Vertex AI models +from typing import Literal, Tuple + +import litellm + +""" +Gemini pricing covers: +- token +- image +- audio +- video +""" + +models_without_dynamic_pricing = ["gemini-1.0-pro", "gemini-pro"] + + +def _is_above_128k(tokens: float) -> bool: + if tokens > 128000: + return True + return False + + +def cost_per_token( + model: str, + custom_llm_provider: str, + prompt_tokens: float, + completion_tokens: float, +) -> Tuple[float, float]: + """ + Calculates the cost per token for a given model, prompt tokens, and completion tokens. 
+ + Input: + - model: str, the model name without provider prefix + - custom_llm_provider: str, either "vertex_ai-*" or "gemini" + - prompt_tokens: float, the number of input tokens + - completion_tokens: float, the number of output tokens + + Returns: + Tuple[float, float] - prompt_cost_in_usd, completion_cost_in_usd + + Raises: + Exception if model requires >128k pricing, but model cost not mapped + """ + ## GET MODEL INFO + model_info = litellm.get_model_info( + model=model, custom_llm_provider=custom_llm_provider + ) + + ## CALCULATE INPUT COST + if ( + _is_above_128k(tokens=prompt_tokens) + and model not in models_without_dynamic_pricing + ): + assert ( + model_info["input_cost_per_token_above_128k_tokens"] is not None + ), "model info for model={} does not have pricing for > 128k tokens\nmodel_info={}".format( + model, model_info + ) + prompt_cost = ( + prompt_tokens * model_info["input_cost_per_token_above_128k_tokens"] + ) + else: + prompt_cost = prompt_tokens * model_info["input_cost_per_token"] + + ## CALCULATE OUTPUT COST + if ( + _is_above_128k(tokens=completion_tokens) + and model not in models_without_dynamic_pricing + ): + assert ( + model_info["output_cost_per_token_above_128k_tokens"] is not None + ), "model info for model={} does not have pricing for > 128k tokens\nmodel_info={}".format( + model, model_info + ) + completion_cost = ( + completion_tokens * model_info["output_cost_per_token_above_128k_tokens"] + ) + else: + completion_cost = completion_tokens * model_info["output_cost_per_token"] + + return prompt_cost, completion_cost diff --git a/litellm/tests/test_completion_cost.py b/litellm/tests/test_completion_cost.py index 3f7288854..b7c85679d 100644 --- a/litellm/tests/test_completion_cost.py +++ b/litellm/tests/test_completion_cost.py @@ -1,20 +1,28 @@ -import sys, os +import os +import sys import traceback +import litellm.cost_calculator + sys.path.insert( 0, os.path.abspath("../..") ) # Adds the parent directory to the system path +import asyncio import time from typing import Optional + +import pytest + import litellm from litellm import ( + TranscriptionResponse, + completion_cost, + cost_per_token, get_max_tokens, model_cost, open_ai_chat_completion_models, - TranscriptionResponse, ) from litellm.litellm_core_utils.litellm_logging import CustomLogger -import pytest, asyncio class CustomLoggingHandler(CustomLogger): @@ -66,7 +74,7 @@ async def test_custom_pricing(sync_mode): def test_custom_pricing_as_completion_cost_param(): - from litellm import ModelResponse, Choices, Message + from litellm import Choices, Message, ModelResponse from litellm.utils import Usage resp = ModelResponse( @@ -134,7 +142,7 @@ def test_cost_ft_gpt_35(): try: # this tests if litellm.completion_cost can calculate cost for ft:gpt-3.5-turbo:my-org:custom_suffix:id # it needs to lookup ft:gpt-3.5-turbo in the litellm model_cost map to get the correct cost - from litellm import ModelResponse, Choices, Message + from litellm import Choices, Message, ModelResponse from litellm.utils import Usage resp = ModelResponse( @@ -179,7 +187,7 @@ def test_cost_azure_gpt_35(): try: # this tests if litellm.completion_cost can calculate cost for azure/chatgpt-deployment-2 which maps to azure/gpt-3.5-turbo # for this test we check if passing `model` to completion_cost overrides the completion cost - from litellm import ModelResponse, Choices, Message + from litellm import Choices, Message, ModelResponse from litellm.utils import Usage resp = ModelResponse( @@ -266,7 +274,7 @@ def 
test_cost_bedrock_pricing(): """ - get pricing specific to region for a model """ - from litellm import ModelResponse, Choices, Message + from litellm import Choices, Message, ModelResponse from litellm.utils import Usage litellm.set_verbose = True @@ -475,13 +483,13 @@ def test_replicate_llama3_cost_tracking(): @pytest.mark.parametrize("is_streaming", [True, False]) # def test_groq_response_cost_tracking(is_streaming): from litellm.utils import ( - ModelResponse, - Choices, - Message, - Usage, CallTypes, - StreamingChoices, + Choices, Delta, + Message, + ModelResponse, + StreamingChoices, + Usage, ) response = ModelResponse( @@ -565,3 +573,58 @@ def test_together_ai_qwen_completion_cost(): ) assert response == "together-ai-41.1b-80b" + + +@pytest.mark.parametrize("above_128k", [False, True]) +@pytest.mark.parametrize("provider", ["vertex_ai", "gemini"]) +def test_gemini_completion_cost(above_128k, provider): + """ + Check if cost correctly calculated for gemini models based on context window + """ + os.environ["LITELLM_LOCAL_MODEL_COST_MAP"] = "True" + litellm.model_cost = litellm.get_model_cost_map(url="") + if provider == "gemini": + model_name = "gemini-1.5-flash-latest" + else: + model_name = "gemini-1.5-flash-preview-0514" + if above_128k: + prompt_tokens = 128001.0 + output_tokens = 228001.0 + else: + prompt_tokens = 128.0 + output_tokens = 228.0 + ## GET MODEL FROM LITELLM.MODEL_INFO + model_info = litellm.get_model_info(model=model_name, custom_llm_provider=provider) + + ## EXPECTED COST + if above_128k: + assert ( + model_info["input_cost_per_token_above_128k_tokens"] is not None + ), "model info for model={} does not have pricing for > 128k tokens\nmodel_info={}".format( + model_name, model_info + ) + assert ( + model_info["output_cost_per_token_above_128k_tokens"] is not None + ), "model info for model={} does not have pricing for > 128k tokens\nmodel_info={}".format( + model_name, model_info + ) + input_cost = ( + prompt_tokens * model_info["input_cost_per_token_above_128k_tokens"] + ) + output_cost = ( + output_tokens * model_info["output_cost_per_token_above_128k_tokens"] + ) + else: + input_cost = prompt_tokens * model_info["input_cost_per_token"] + output_cost = output_tokens * model_info["output_cost_per_token"] + + ## CALCULATED COST + calculated_input_cost, calculated_output_cost = cost_per_token( + model=model_name, + prompt_tokens=prompt_tokens, + completion_tokens=output_tokens, + custom_llm_provider=provider, + ) + + assert calculated_input_cost == input_cost + assert calculated_output_cost == output_cost diff --git a/litellm/types/utils.py b/litellm/types/utils.py index 29d21143e..b7c0e318e 100644 --- a/litellm/types/utils.py +++ b/litellm/types/utils.py @@ -1,14 +1,15 @@ -from typing import List, Optional, Union, Dict, Tuple, Literal -from typing_extensions import TypedDict -from enum import Enum -from typing_extensions import override, Required, Dict -from .llms.openai import ChatCompletionUsageBlock, ChatCompletionToolCallChunk -from ..litellm_core_utils.core_helpers import map_finish_reason -from openai._models import BaseModel as OpenAIObject -from pydantic import ConfigDict -import uuid import json import time +import uuid +from enum import Enum +from typing import Dict, List, Literal, Optional, Tuple, Union + +from openai._models import BaseModel as OpenAIObject +from pydantic import ConfigDict +from typing_extensions import Dict, Required, TypedDict, override + +from ..litellm_core_utils.core_helpers import map_finish_reason +from .llms.openai import 
ChatCompletionToolCallChunk, ChatCompletionUsageBlock def _generate_id(): # private helper function @@ -34,21 +35,31 @@ class ProviderField(TypedDict): field_value: str -class ModelInfo(TypedDict): +class ModelInfo(TypedDict, total=False): """ Model info for a given model, this is information found in litellm.model_prices_and_context_window.json """ - max_tokens: Optional[int] - max_input_tokens: Optional[int] - max_output_tokens: Optional[int] - input_cost_per_token: float - output_cost_per_token: float - litellm_provider: str - mode: Literal[ - "completion", "embedding", "image_generation", "chat", "audio_transcription" + max_tokens: Required[Optional[int]] + max_input_tokens: Required[Optional[int]] + max_output_tokens: Required[Optional[int]] + input_cost_per_token: Required[float] + input_cost_per_token_above_128k_tokens: Optional[float] + input_cost_per_image: Optional[float] + input_cost_per_audio_per_second: Optional[float] + input_cost_per_video_per_second: Optional[float] + output_cost_per_token: Required[float] + output_cost_per_token_above_128k_tokens: Optional[float] + output_cost_per_image: Optional[float] + output_cost_per_video_per_second: Optional[float] + output_cost_per_audio_per_second: Optional[float] + litellm_provider: Required[str] + mode: Required[ + Literal[ + "completion", "embedding", "image_generation", "chat", "audio_transcription" + ] ] - supported_openai_params: Optional[List[str]] + supported_openai_params: Required[Optional[List[str]]] class GenericStreamingChunk(TypedDict): diff --git a/litellm/utils.py b/litellm/utils.py index 0b898165d..574380321 100644 --- a/litellm/utils.py +++ b/litellm/utils.py @@ -4286,8 +4286,10 @@ def get_model_info(model: str, custom_llm_provider: Optional[str] = None) -> Mod split_model, custom_llm_provider, _, _ = get_llm_provider(model=model) except: pass + combined_model_name = model else: split_model = model + combined_model_name = "{}/{}".format(custom_llm_provider, model) ######################### supported_openai_params = litellm.get_supported_openai_params( @@ -4305,33 +4307,58 @@ def get_model_info(model: str, custom_llm_provider: Optional[str] = None) -> Mod } else: """ - Check if: - 1. 'model' in litellm.model_cost. Checks "groq/llama3-8b-8192" in litellm.model_cost - 2. 'split_model' in litellm.model_cost. Checks "llama3-8b-8192" in litellm.model_cost + Check if: (in order of specificity) + 1. 'custom_llm_provider/model' in litellm.model_cost. Checks "groq/llama3-8b-8192" if model="llama3-8b-8192" and custom_llm_provider="groq" + 2. 'model' in litellm.model_cost. Checks "groq/llama3-8b-8192" in litellm.model_cost if model="groq/llama3-8b-8192" and custom_llm_provider=None + 3. 'split_model' in litellm.model_cost. 
Checks "llama3-8b-8192" in litellm.model_cost if model="groq/llama3-8b-8192" """ - if model in litellm.model_cost: + if combined_model_name in litellm.model_cost: + _model_info = litellm.model_cost[combined_model_name] + _model_info["supported_openai_params"] = supported_openai_params + if ( + "litellm_provider" in _model_info + and _model_info["litellm_provider"] != custom_llm_provider + ): + if custom_llm_provider == "vertex_ai" and _model_info[ + "litellm_provider" + ].startswith("vertex_ai"): + pass + else: + raise Exception + return _model_info + elif model in litellm.model_cost: _model_info = litellm.model_cost[model] _model_info["supported_openai_params"] = supported_openai_params if ( "litellm_provider" in _model_info and _model_info["litellm_provider"] != custom_llm_provider ): - raise Exception + if custom_llm_provider == "vertex_ai" and _model_info[ + "litellm_provider" + ].startswith("vertex_ai"): + pass + else: + raise Exception return _model_info - if split_model in litellm.model_cost: + elif split_model in litellm.model_cost: _model_info = litellm.model_cost[split_model] _model_info["supported_openai_params"] = supported_openai_params if ( "litellm_provider" in _model_info and _model_info["litellm_provider"] != custom_llm_provider ): - raise Exception + if custom_llm_provider == "vertex_ai" and _model_info[ + "litellm_provider" + ].startswith("vertex_ai"): + pass + else: + raise Exception return _model_info else: raise ValueError( "This model isn't mapped yet. Add it here - https://github.com/BerriAI/litellm/blob/main/model_prices_and_context_window.json" ) - except: + except Exception: raise Exception( "This model isn't mapped yet. Add it here - https://github.com/BerriAI/litellm/blob/main/model_prices_and_context_window.json" )