feat(cost_calculator.py): add cost calculation for dynamic context window (Vertex AI / Google AI Studio)

Krrish Dholakia 2024-06-17 12:38:10 -07:00
parent 577b90aad8
commit f597aa432b
5 changed files with 247 additions and 51 deletions
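Context for the change: Gemini / Vertex AI models are priced per token, but the per-token rate increases once a request crosses the 128k-token context threshold, so a single flat rate misprices long-context calls. A toy comparison with made-up rates (illustration only, not taken from a real price sheet) shows the gap the new calculator closes:

# Hypothetical rates, for illustration only (USD per token).
base_rate = 0.35e-6          # input_cost_per_token
long_ctx_rate = 0.70e-6      # input_cost_per_token_above_128k_tokens

short_prompt = 100_000.0     # below the 128k threshold -> flat rate
long_prompt = 200_000.0      # above the threshold -> whole prompt billed at the higher rate

print(short_prompt * base_rate)     # 0.035
print(long_prompt * long_ctx_rate)  # 0.14, vs 0.07 if the flat rate were (wrongly) applied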


@@ -1,20 +1,24 @@
 # What is this?
 ## File for 'response_cost' calculation in Logging
-from typing import Optional, Union, Literal, List, Tuple
+from typing import List, Literal, Optional, Tuple, Union
+
+import litellm
 import litellm._logging
+from litellm import verbose_logger
+from litellm.litellm_core_utils.llm_cost_calc.google import (
+    cost_per_token as google_cost_per_token,
+)
 from litellm.utils import (
-    ModelResponse,
+    CallTypes,
+    CostPerToken,
     EmbeddingResponse,
     ImageResponse,
-    TranscriptionResponse,
+    ModelResponse,
     TextCompletionResponse,
-    CallTypes,
+    TranscriptionResponse,
     print_verbose,
-    CostPerToken,
     token_counter,
 )
-import litellm
-from litellm import verbose_logger


 def _cost_per_token_custom_pricing_helper(
@@ -42,10 +46,10 @@ def _cost_per_token_custom_pricing_helper(

 def cost_per_token(
     model: str = "",
-    prompt_tokens=0,
-    completion_tokens=0,
+    prompt_tokens: float = 0,
+    completion_tokens: float = 0,
     response_time_ms=None,
-    custom_llm_provider=None,
+    custom_llm_provider: Optional[str] = None,
     region_name=None,
     ### CUSTOM PRICING ###
     custom_cost_per_token: Optional[CostPerToken] = None,
@@ -66,6 +70,7 @@ def cost_per_token(
     Returns:
         tuple: A tuple containing the cost in USD dollars for prompt tokens and completion tokens, respectively.
     """
+    args = locals()
     if model is None:
         raise Exception("Invalid arg. Model cannot be none.")
     ## CUSTOM PRICING ##
@@ -94,7 +99,8 @@ def cost_per_token(
                 model_with_provider_and_region in model_cost_ref
             ):  # use region based pricing, if it's available
                 model_with_provider = model_with_provider_and_region
+    else:
+        _, custom_llm_provider, _, _ = litellm.get_llm_provider(model=model)
     model_without_prefix = model
     model_parts = model.split("/")
     if len(model_parts) > 1:
@@ -120,7 +126,14 @@ def cost_per_token(
     # see this https://learn.microsoft.com/en-us/azure/ai-services/openai/concepts/models
     print_verbose(f"Looking up model={model} in model_cost_map")
-    if model in model_cost_ref:
+    if custom_llm_provider == "vertex_ai" or custom_llm_provider == "gemini":
+        return google_cost_per_token(
+            model=model_without_prefix,
+            custom_llm_provider=custom_llm_provider,
+            prompt_tokens=prompt_tokens,
+            completion_tokens=completion_tokens,
+        )
+    elif model in model_cost_ref:
         print_verbose(f"Success: model={model} in model_cost_map")
         print_verbose(
             f"prompt_tokens={prompt_tokens}; completion_tokens={completion_tokens}"


@@ -0,0 +1,82 @@
+# What is this?
+## Cost calculation for Google AI Studio / Vertex AI models
+from typing import Literal, Tuple
+
+import litellm
+
+"""
+Gemini pricing covers:
+- token
+- image
+- audio
+- video
+"""
+
+models_without_dynamic_pricing = ["gemini-1.0-pro", "gemini-pro"]
+
+
+def _is_above_128k(tokens: float) -> bool:
+    if tokens > 128000:
+        return True
+    return False
+
+
+def cost_per_token(
+    model: str,
+    custom_llm_provider: str,
+    prompt_tokens: float,
+    completion_tokens: float,
+) -> Tuple[float, float]:
+    """
+    Calculates the cost per token for a given model, prompt tokens, and completion tokens.
+
+    Input:
+        - model: str, the model name without provider prefix
+        - custom_llm_provider: str, either "vertex_ai-*" or "gemini"
+        - prompt_tokens: float, the number of input tokens
+        - completion_tokens: float, the number of output tokens
+
+    Returns:
+        Tuple[float, float] - prompt_cost_in_usd, completion_cost_in_usd
+
+    Raises:
+        Exception if model requires >128k pricing, but model cost not mapped
+    """
+    ## GET MODEL INFO
+    model_info = litellm.get_model_info(
+        model=model, custom_llm_provider=custom_llm_provider
+    )
+
+    ## CALCULATE INPUT COST
+    if (
+        _is_above_128k(tokens=prompt_tokens)
+        and model not in models_without_dynamic_pricing
+    ):
+        assert (
+            model_info["input_cost_per_token_above_128k_tokens"] is not None
+        ), "model info for model={} does not have pricing for > 128k tokens\nmodel_info={}".format(
+            model, model_info
+        )
+        prompt_cost = (
+            prompt_tokens * model_info["input_cost_per_token_above_128k_tokens"]
+        )
+    else:
+        prompt_cost = prompt_tokens * model_info["input_cost_per_token"]
+
+    ## CALCULATE OUTPUT COST
+    if (
+        _is_above_128k(tokens=completion_tokens)
+        and model not in models_without_dynamic_pricing
+    ):
+        assert (
+            model_info["output_cost_per_token_above_128k_tokens"] is not None
+        ), "model info for model={} does not have pricing for > 128k tokens\nmodel_info={}".format(
+            model, model_info
+        )
+        completion_cost = (
+            completion_tokens * model_info["output_cost_per_token_above_128k_tokens"]
+        )
+    else:
+        completion_cost = completion_tokens * model_info["output_cost_per_token"]
+
+    return prompt_cost, completion_cost
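The new helper can also be exercised directly via the import path added in the first file of this commit. A minimal sketch, assuming the loaded model-cost map carries the *_above_128k_tokens rates for the chosen model:

from litellm.litellm_core_utils.llm_cost_calc.google import (
    cost_per_token as google_cost_per_token,
)

prompt_cost, completion_cost = google_cost_per_token(
    model="gemini-1.5-flash-latest",  # model name without the provider prefix
    custom_llm_provider="gemini",
    prompt_tokens=130_000.0,          # > 128k, so the above-128k input rate is required
    completion_tokens=1_000.0,
)

Note the tier is all-or-nothing per request: once prompt_tokens exceeds 128k, every prompt token is billed at the higher rate, and the same rule is applied independently to completion tokens.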


@@ -1,20 +1,28 @@
-import sys, os
+import os
+import sys
 import traceback
+
+import litellm.cost_calculator
+
 sys.path.insert(
     0, os.path.abspath("../..")
 )  # Adds the parent directory to the system path
+import asyncio
 import time
 from typing import Optional

+import pytest
+
 import litellm
 from litellm import (
+    TranscriptionResponse,
+    completion_cost,
+    cost_per_token,
     get_max_tokens,
     model_cost,
     open_ai_chat_completion_models,
-    TranscriptionResponse,
 )
 from litellm.litellm_core_utils.litellm_logging import CustomLogger
-import pytest, asyncio


 class CustomLoggingHandler(CustomLogger):
@@ -66,7 +74,7 @@ async def test_custom_pricing(sync_mode):

 def test_custom_pricing_as_completion_cost_param():
-    from litellm import ModelResponse, Choices, Message
+    from litellm import Choices, Message, ModelResponse
     from litellm.utils import Usage

     resp = ModelResponse(
@@ -134,7 +142,7 @@ def test_cost_ft_gpt_35():
     try:
         # this tests if litellm.completion_cost can calculate cost for ft:gpt-3.5-turbo:my-org:custom_suffix:id
         # it needs to lookup ft:gpt-3.5-turbo in the litellm model_cost map to get the correct cost
-        from litellm import ModelResponse, Choices, Message
+        from litellm import Choices, Message, ModelResponse
         from litellm.utils import Usage

         resp = ModelResponse(
@@ -179,7 +187,7 @@ def test_cost_azure_gpt_35():
     try:
         # this tests if litellm.completion_cost can calculate cost for azure/chatgpt-deployment-2 which maps to azure/gpt-3.5-turbo
         # for this test we check if passing `model` to completion_cost overrides the completion cost
-        from litellm import ModelResponse, Choices, Message
+        from litellm import Choices, Message, ModelResponse
         from litellm.utils import Usage

         resp = ModelResponse(
@@ -266,7 +274,7 @@ def test_cost_bedrock_pricing():
     """
    - get pricing specific to region for a model
     """
-    from litellm import ModelResponse, Choices, Message
+    from litellm import Choices, Message, ModelResponse
     from litellm.utils import Usage

     litellm.set_verbose = True
@@ -475,13 +483,13 @@ def test_replicate_llama3_cost_tracking():
 @pytest.mark.parametrize("is_streaming", [True, False])  #
 def test_groq_response_cost_tracking(is_streaming):
     from litellm.utils import (
-        ModelResponse,
-        Choices,
-        Message,
-        Usage,
         CallTypes,
-        StreamingChoices,
+        Choices,
         Delta,
+        Message,
+        ModelResponse,
+        StreamingChoices,
+        Usage,
     )

     response = ModelResponse(
@@ -565,3 +573,58 @@ def test_together_ai_qwen_completion_cost():
     )

     assert response == "together-ai-41.1b-80b"
+
+
+@pytest.mark.parametrize("above_128k", [False, True])
+@pytest.mark.parametrize("provider", ["vertex_ai", "gemini"])
+def test_gemini_completion_cost(above_128k, provider):
+    """
+    Check if cost correctly calculated for gemini models based on context window
+    """
+    os.environ["LITELLM_LOCAL_MODEL_COST_MAP"] = "True"
+    litellm.model_cost = litellm.get_model_cost_map(url="")
+    if provider == "gemini":
+        model_name = "gemini-1.5-flash-latest"
+    else:
+        model_name = "gemini-1.5-flash-preview-0514"
+    if above_128k:
+        prompt_tokens = 128001.0
+        output_tokens = 228001.0
+    else:
+        prompt_tokens = 128.0
+        output_tokens = 228.0
+    ## GET MODEL FROM LITELLM.MODEL_INFO
+    model_info = litellm.get_model_info(model=model_name, custom_llm_provider=provider)
+
+    ## EXPECTED COST
+    if above_128k:
+        assert (
+            model_info["input_cost_per_token_above_128k_tokens"] is not None
+        ), "model info for model={} does not have pricing for > 128k tokens\nmodel_info={}".format(
+            model_name, model_info
+        )
+        assert (
+            model_info["output_cost_per_token_above_128k_tokens"] is not None
+        ), "model info for model={} does not have pricing for > 128k tokens\nmodel_info={}".format(
+            model_name, model_info
+        )
+        input_cost = (
+            prompt_tokens * model_info["input_cost_per_token_above_128k_tokens"]
+        )
+        output_cost = (
+            output_tokens * model_info["output_cost_per_token_above_128k_tokens"]
+        )
+    else:
+        input_cost = prompt_tokens * model_info["input_cost_per_token"]
+        output_cost = output_tokens * model_info["output_cost_per_token"]
+
+    ## CALCULATED COST
+    calculated_input_cost, calculated_output_cost = cost_per_token(
+        model=model_name,
+        prompt_tokens=prompt_tokens,
+        completion_tokens=output_tokens,
+        custom_llm_provider=provider,
+    )
+
+    assert calculated_input_cost == input_cost
+    assert calculated_output_cost == output_cost
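The test pins pricing to the bundled cost map via LITELLM_LOCAL_MODEL_COST_MAP (avoiding a remote map fetch, as I read that flag); the same setup is handy for a quick manual check of the new fields:

import os

import litellm

os.environ["LITELLM_LOCAL_MODEL_COST_MAP"] = "True"
litellm.model_cost = litellm.get_model_cost_map(url="")  # load the packaged map

info = litellm.get_model_info(model="gemini-1.5-flash-latest", custom_llm_provider="gemini")
print(info["input_cost_per_token"], info.get("input_cost_per_token_above_128k_tokens"))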


@@ -1,14 +1,15 @@
-from typing import List, Optional, Union, Dict, Tuple, Literal
-from typing_extensions import TypedDict
-from enum import Enum
-from typing_extensions import override, Required, Dict
-from .llms.openai import ChatCompletionUsageBlock, ChatCompletionToolCallChunk
-from ..litellm_core_utils.core_helpers import map_finish_reason
-from openai._models import BaseModel as OpenAIObject
-from pydantic import ConfigDict
-import uuid
 import json
 import time
+import uuid
+from enum import Enum
+from typing import Dict, List, Literal, Optional, Tuple, Union
+
+from openai._models import BaseModel as OpenAIObject
+from pydantic import ConfigDict
+from typing_extensions import Dict, Required, TypedDict, override
+
+from ..litellm_core_utils.core_helpers import map_finish_reason
+from .llms.openai import ChatCompletionToolCallChunk, ChatCompletionUsageBlock


 def _generate_id():  # private helper function
@@ -34,21 +35,31 @@ class ProviderField(TypedDict):
     field_value: str


-class ModelInfo(TypedDict):
+class ModelInfo(TypedDict, total=False):
     """
     Model info for a given model, this is information found in litellm.model_prices_and_context_window.json
     """

-    max_tokens: Optional[int]
-    max_input_tokens: Optional[int]
-    max_output_tokens: Optional[int]
-    input_cost_per_token: float
-    output_cost_per_token: float
-    litellm_provider: str
-    mode: Literal[
-        "completion", "embedding", "image_generation", "chat", "audio_transcription"
+    max_tokens: Required[Optional[int]]
+    max_input_tokens: Required[Optional[int]]
+    max_output_tokens: Required[Optional[int]]
+    input_cost_per_token: Required[float]
+    input_cost_per_token_above_128k_tokens: Optional[float]
+    input_cost_per_image: Optional[float]
+    input_cost_per_audio_per_second: Optional[float]
+    input_cost_per_video_per_second: Optional[float]
+    output_cost_per_token: Required[float]
+    output_cost_per_token_above_128k_tokens: Optional[float]
+    output_cost_per_image: Optional[float]
+    output_cost_per_video_per_second: Optional[float]
+    output_cost_per_audio_per_second: Optional[float]
+    litellm_provider: Required[str]
+    mode: Required[
+        Literal[
+            "completion", "embedding", "image_generation", "chat", "audio_transcription"
+        ]
     ]
-    supported_openai_params: Optional[List[str]]
+    supported_openai_params: Required[Optional[List[str]]]


 class GenericStreamingChunk(TypedDict):
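Because ModelInfo is now declared with total=False, only the Required[...] keys are guaranteed to be present; the new *_above_128k_tokens and per-image/audio/video keys may be missing (or None) for models without dynamic pricing. An illustrative reader, not part of this commit:

def describe_pricing(model_info: dict) -> str:
    # Required keys are always safe to index.
    flat_in = model_info["input_cost_per_token"]
    flat_out = model_info["output_cost_per_token"]
    # total=False keys may be absent entirely, so use .get().
    tiered_in = model_info.get("input_cost_per_token_above_128k_tokens")
    tiered_out = model_info.get("output_cost_per_token_above_128k_tokens")
    if tiered_in is None or tiered_out is None:
        return "flat pricing: {}/{} per input/output token".format(flat_in, flat_out)
    return "tiered pricing: {} -> {} (input), {} -> {} (output)".format(
        flat_in, tiered_in, flat_out, tiered_out
    )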


@@ -4286,8 +4286,10 @@ def get_model_info(model: str, custom_llm_provider: Optional[str] = None) -> Mod
             split_model, custom_llm_provider, _, _ = get_llm_provider(model=model)
         except:
             pass
+        combined_model_name = model
     else:
         split_model = model
+        combined_model_name = "{}/{}".format(custom_llm_provider, model)
     #########################

     supported_openai_params = litellm.get_supported_openai_params(
@@ -4305,33 +4307,58 @@ def get_model_info(model: str, custom_llm_provider: Optional[str] = None) -> Mod
             }
         else:
             """
-            Check if:
-            1. 'model' in litellm.model_cost. Checks "groq/llama3-8b-8192" in litellm.model_cost
-            2. 'split_model' in litellm.model_cost. Checks "llama3-8b-8192" in litellm.model_cost
+            Check if: (in order of specificity)
+            1. 'custom_llm_provider/model' in litellm.model_cost. Checks "groq/llama3-8b-8192" if model="llama3-8b-8192" and custom_llm_provider="groq"
+            2. 'model' in litellm.model_cost. Checks "groq/llama3-8b-8192" in litellm.model_cost if model="groq/llama3-8b-8192" and custom_llm_provider=None
+            3. 'split_model' in litellm.model_cost. Checks "llama3-8b-8192" in litellm.model_cost if model="groq/llama3-8b-8192"
             """
-            if model in litellm.model_cost:
+            if combined_model_name in litellm.model_cost:
+                _model_info = litellm.model_cost[combined_model_name]
+                _model_info["supported_openai_params"] = supported_openai_params
+                if (
+                    "litellm_provider" in _model_info
+                    and _model_info["litellm_provider"] != custom_llm_provider
+                ):
+                    if custom_llm_provider == "vertex_ai" and _model_info[
+                        "litellm_provider"
+                    ].startswith("vertex_ai"):
+                        pass
+                    else:
+                        raise Exception
+                return _model_info
+            elif model in litellm.model_cost:
                 _model_info = litellm.model_cost[model]
                 _model_info["supported_openai_params"] = supported_openai_params
                 if (
                     "litellm_provider" in _model_info
                     and _model_info["litellm_provider"] != custom_llm_provider
                 ):
-                    raise Exception
+                    if custom_llm_provider == "vertex_ai" and _model_info[
+                        "litellm_provider"
+                    ].startswith("vertex_ai"):
+                        pass
+                    else:
+                        raise Exception
                 return _model_info
-            if split_model in litellm.model_cost:
+            elif split_model in litellm.model_cost:
                 _model_info = litellm.model_cost[split_model]
                 _model_info["supported_openai_params"] = supported_openai_params
                 if (
                     "litellm_provider" in _model_info
                     and _model_info["litellm_provider"] != custom_llm_provider
                 ):
-                    raise Exception
+                    if custom_llm_provider == "vertex_ai" and _model_info[
+                        "litellm_provider"
+                    ].startswith("vertex_ai"):
+                        pass
+                    else:
+                        raise Exception
                 return _model_info
             else:
                 raise ValueError(
                     "This model isn't mapped yet. Add it here - https://github.com/BerriAI/litellm/blob/main/model_prices_and_context_window.json"
                 )
-    except:
+    except Exception:
         raise Exception(
             "This model isn't mapped yet. Add it here - https://github.com/BerriAI/litellm/blob/main/model_prices_and_context_window.json"
         )
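Net effect: get_model_info now resolves pricing in order of specificity (provider-qualified name, then the raw model string, then the prefix-stripped name) and tolerates vertex_ai-* provider labels in the cost map. A simplified sketch of that lookup order, not litellm's actual implementation, which also validates litellm_provider and attaches supported_openai_params:

from typing import Optional


def resolve_pricing(
    model_cost: dict, model: str, custom_llm_provider: Optional[str]
) -> dict:
    # Simplified lookup order mirroring the change above:
    # 1. "<provider>/<model>"  2. "<model>"  3. "<model>" with its prefix stripped.
    split_model = model.split("/", 1)[-1]
    combined = f"{custom_llm_provider}/{model}" if custom_llm_provider else model
    for key in (combined, model, split_model):
        if key in model_cost:
            return model_cost[key]
    raise ValueError(
        "This model isn't mapped yet. Add it here - https://github.com/BerriAI/litellm/blob/main/model_prices_and_context_window.json"
    )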