fix(utils.py): return 'response_cost' in completion call

Closes https://github.com/BerriAI/litellm/issues/4335
This commit is contained in:
Krrish Dholakia 2024-06-26 17:55:57 -07:00
parent 151d19960e
commit f533e1da09
4 changed files with 260 additions and 64 deletions

View file

@ -101,8 +101,12 @@ def cost_per_token(
if custom_llm_provider is not None:
model_with_provider = custom_llm_provider + "/" + model
if region_name is not None:
model_with_provider_and_region = f"{custom_llm_provider}/{region_name}/{model}"
if model_with_provider_and_region in model_cost_ref: # use region based pricing, if it's available
model_with_provider_and_region = (
f"{custom_llm_provider}/{region_name}/{model}"
)
if (
model_with_provider_and_region in model_cost_ref
): # use region based pricing, if it's available
model_with_provider = model_with_provider_and_region
else:
_, custom_llm_provider, _, _ = litellm.get_llm_provider(model=model)
@ -118,7 +122,9 @@ def cost_per_token(
Option2. model = "openai/gpt-4" - model = provider/model
Option3. model = "anthropic.claude-3" - model = model
"""
if model_with_provider in model_cost_ref: # Option 2. use model with provider, model = "openai/gpt-4"
if (
model_with_provider in model_cost_ref
): # Option 2. use model with provider, model = "openai/gpt-4"
model = model_with_provider
elif model in model_cost_ref: # Option 1. use model passed, model="gpt-4"
model = model
@ -154,29 +160,45 @@ def cost_per_token(
)
elif model in model_cost_ref:
print_verbose(f"Success: model={model} in model_cost_map")
print_verbose(f"prompt_tokens={prompt_tokens}; completion_tokens={completion_tokens}")
print_verbose(
f"prompt_tokens={prompt_tokens}; completion_tokens={completion_tokens}"
)
if (
model_cost_ref[model].get("input_cost_per_token", None) is not None
and model_cost_ref[model].get("output_cost_per_token", None) is not None
):
## COST PER TOKEN ##
prompt_tokens_cost_usd_dollar = model_cost_ref[model]["input_cost_per_token"] * prompt_tokens
completion_tokens_cost_usd_dollar = model_cost_ref[model]["output_cost_per_token"] * completion_tokens
elif model_cost_ref[model].get("output_cost_per_second", None) is not None and response_time_ms is not None:
prompt_tokens_cost_usd_dollar = (
model_cost_ref[model]["input_cost_per_token"] * prompt_tokens
)
completion_tokens_cost_usd_dollar = (
model_cost_ref[model]["output_cost_per_token"] * completion_tokens
)
elif (
model_cost_ref[model].get("output_cost_per_second", None) is not None
and response_time_ms is not None
):
print_verbose(
f"For model={model} - output_cost_per_second: {model_cost_ref[model].get('output_cost_per_second')}; response time: {response_time_ms}"
)
## COST PER SECOND ##
prompt_tokens_cost_usd_dollar = 0
completion_tokens_cost_usd_dollar = (
model_cost_ref[model]["output_cost_per_second"] * response_time_ms / 1000
model_cost_ref[model]["output_cost_per_second"]
* response_time_ms
/ 1000
)
elif model_cost_ref[model].get("input_cost_per_second", None) is not None and response_time_ms is not None:
elif (
model_cost_ref[model].get("input_cost_per_second", None) is not None
and response_time_ms is not None
):
print_verbose(
f"For model={model} - input_cost_per_second: {model_cost_ref[model].get('input_cost_per_second')}; response time: {response_time_ms}"
)
## COST PER SECOND ##
prompt_tokens_cost_usd_dollar = model_cost_ref[model]["input_cost_per_second"] * response_time_ms / 1000
prompt_tokens_cost_usd_dollar = (
model_cost_ref[model]["input_cost_per_second"] * response_time_ms / 1000
)
completion_tokens_cost_usd_dollar = 0.0
print_verbose(
f"Returned custom cost for model={model} - prompt_tokens_cost_usd_dollar: {prompt_tokens_cost_usd_dollar}, completion_tokens_cost_usd_dollar: {completion_tokens_cost_usd_dollar}"
@ -185,40 +207,57 @@ def cost_per_token(
elif "ft:gpt-3.5-turbo" in model:
print_verbose(f"Cost Tracking: {model} is an OpenAI FinteTuned LLM")
# fuzzy match ft:gpt-3.5-turbo:abcd-id-cool-litellm
prompt_tokens_cost_usd_dollar = model_cost_ref["ft:gpt-3.5-turbo"]["input_cost_per_token"] * prompt_tokens
prompt_tokens_cost_usd_dollar = (
model_cost_ref["ft:gpt-3.5-turbo"]["input_cost_per_token"] * prompt_tokens
)
completion_tokens_cost_usd_dollar = (
model_cost_ref["ft:gpt-3.5-turbo"]["output_cost_per_token"] * completion_tokens
model_cost_ref["ft:gpt-3.5-turbo"]["output_cost_per_token"]
* completion_tokens
)
return prompt_tokens_cost_usd_dollar, completion_tokens_cost_usd_dollar
elif "ft:gpt-4-0613" in model:
print_verbose(f"Cost Tracking: {model} is an OpenAI FinteTuned LLM")
# fuzzy match ft:gpt-4-0613:abcd-id-cool-litellm
prompt_tokens_cost_usd_dollar = model_cost_ref["ft:gpt-4-0613"]["input_cost_per_token"] * prompt_tokens
completion_tokens_cost_usd_dollar = model_cost_ref["ft:gpt-4-0613"]["output_cost_per_token"] * completion_tokens
prompt_tokens_cost_usd_dollar = (
model_cost_ref["ft:gpt-4-0613"]["input_cost_per_token"] * prompt_tokens
)
completion_tokens_cost_usd_dollar = (
model_cost_ref["ft:gpt-4-0613"]["output_cost_per_token"] * completion_tokens
)
return prompt_tokens_cost_usd_dollar, completion_tokens_cost_usd_dollar
elif "ft:gpt-4o-2024-05-13" in model:
print_verbose(f"Cost Tracking: {model} is an OpenAI FinteTuned LLM")
# fuzzy match ft:gpt-4o-2024-05-13:abcd-id-cool-litellm
prompt_tokens_cost_usd_dollar = model_cost_ref["ft:gpt-4o-2024-05-13"]["input_cost_per_token"] * prompt_tokens
prompt_tokens_cost_usd_dollar = (
model_cost_ref["ft:gpt-4o-2024-05-13"]["input_cost_per_token"]
* prompt_tokens
)
completion_tokens_cost_usd_dollar = (
model_cost_ref["ft:gpt-4o-2024-05-13"]["output_cost_per_token"] * completion_tokens
model_cost_ref["ft:gpt-4o-2024-05-13"]["output_cost_per_token"]
* completion_tokens
)
return prompt_tokens_cost_usd_dollar, completion_tokens_cost_usd_dollar
elif "ft:davinci-002" in model:
print_verbose(f"Cost Tracking: {model} is an OpenAI FinteTuned LLM")
# fuzzy match ft:davinci-002:abcd-id-cool-litellm
prompt_tokens_cost_usd_dollar = model_cost_ref["ft:davinci-002"]["input_cost_per_token"] * prompt_tokens
prompt_tokens_cost_usd_dollar = (
model_cost_ref["ft:davinci-002"]["input_cost_per_token"] * prompt_tokens
)
completion_tokens_cost_usd_dollar = (
model_cost_ref["ft:davinci-002"]["output_cost_per_token"] * completion_tokens
model_cost_ref["ft:davinci-002"]["output_cost_per_token"]
* completion_tokens
)
return prompt_tokens_cost_usd_dollar, completion_tokens_cost_usd_dollar
elif "ft:babbage-002" in model:
print_verbose(f"Cost Tracking: {model} is an OpenAI FinteTuned LLM")
# fuzzy match ft:babbage-002:abcd-id-cool-litellm
prompt_tokens_cost_usd_dollar = model_cost_ref["ft:babbage-002"]["input_cost_per_token"] * prompt_tokens
prompt_tokens_cost_usd_dollar = (
model_cost_ref["ft:babbage-002"]["input_cost_per_token"] * prompt_tokens
)
completion_tokens_cost_usd_dollar = (
model_cost_ref["ft:babbage-002"]["output_cost_per_token"] * completion_tokens
model_cost_ref["ft:babbage-002"]["output_cost_per_token"]
* completion_tokens
)
return prompt_tokens_cost_usd_dollar, completion_tokens_cost_usd_dollar
elif model in litellm.azure_llms:
@ -227,17 +266,25 @@ def cost_per_token(
verbose_logger.debug(
f"applying cost={model_cost_ref[model]['input_cost_per_token']} for prompt_tokens={prompt_tokens}"
)
prompt_tokens_cost_usd_dollar = model_cost_ref[model]["input_cost_per_token"] * prompt_tokens
prompt_tokens_cost_usd_dollar = (
model_cost_ref[model]["input_cost_per_token"] * prompt_tokens
)
verbose_logger.debug(
f"applying cost={model_cost_ref[model]['output_cost_per_token']} for completion_tokens={completion_tokens}"
)
completion_tokens_cost_usd_dollar = model_cost_ref[model]["output_cost_per_token"] * completion_tokens
completion_tokens_cost_usd_dollar = (
model_cost_ref[model]["output_cost_per_token"] * completion_tokens
)
return prompt_tokens_cost_usd_dollar, completion_tokens_cost_usd_dollar
elif model in litellm.azure_embedding_models:
verbose_logger.debug(f"Cost Tracking: {model} is an Azure Embedding Model")
model = litellm.azure_embedding_models[model]
prompt_tokens_cost_usd_dollar = model_cost_ref[model]["input_cost_per_token"] * prompt_tokens
completion_tokens_cost_usd_dollar = model_cost_ref[model]["output_cost_per_token"] * completion_tokens
prompt_tokens_cost_usd_dollar = (
model_cost_ref[model]["input_cost_per_token"] * prompt_tokens
)
completion_tokens_cost_usd_dollar = (
model_cost_ref[model]["output_cost_per_token"] * completion_tokens
)
return prompt_tokens_cost_usd_dollar, completion_tokens_cost_usd_dollar
else:
# if model is not in model_prices_and_context_window.json. Raise an exception-let users know
@ -261,7 +308,9 @@ def get_model_params_and_category(model_name) -> str:
import re
model_name = model_name.lower()
re_params_match = re.search(r"(\d+b)", model_name) # catch all decimals like 3b, 70b, etc
re_params_match = re.search(
r"(\d+b)", model_name
) # catch all decimals like 3b, 70b, etc
category = None
if re_params_match is not None:
params_match = str(re_params_match.group(1))
@ -292,7 +341,9 @@ def get_model_params_and_category(model_name) -> str:
def get_replicate_completion_pricing(completion_response=None, total_time=0.0):
# see https://replicate.com/pricing
# for all litellm currently supported LLMs, almost all requests go to a100_80gb
a100_80gb_price_per_second_public = 0.001400 # assume all calls sent to A100 80GB for now
a100_80gb_price_per_second_public = (
0.001400 # assume all calls sent to A100 80GB for now
)
if total_time == 0.0: # total time is in ms
start_time = completion_response["created"]
end_time = getattr(completion_response, "ended", time.time())
@ -381,9 +432,13 @@ def completion_cost(
if completion_response is not None:
# get input/output tokens from completion_response
prompt_tokens = completion_response.get("usage", {}).get("prompt_tokens", 0)
completion_tokens = completion_response.get("usage", {}).get("completion_tokens", 0)
completion_tokens = completion_response.get("usage", {}).get(
"completion_tokens", 0
)
total_time = completion_response.get("_response_ms", 0)
verbose_logger.debug(f"completion_response response ms: {completion_response.get('_response_ms')} ")
verbose_logger.debug(
f"completion_response response ms: {completion_response.get('_response_ms')} "
)
model = model or completion_response.get(
"model", None
) # check if user passed an override for model, if it's none check completion_response['model']
@ -393,15 +448,25 @@ def completion_cost(
and len(completion_response._hidden_params["model"]) > 0
):
model = completion_response._hidden_params.get("model", model)
custom_llm_provider = completion_response._hidden_params.get("custom_llm_provider", "")
region_name = completion_response._hidden_params.get("region_name", region_name)
size = completion_response._hidden_params.get("optional_params", {}).get(
custom_llm_provider = completion_response._hidden_params.get(
"custom_llm_provider", ""
)
region_name = completion_response._hidden_params.get(
"region_name", region_name
)
size = completion_response._hidden_params.get(
"optional_params", {}
).get(
"size", "1024-x-1024"
) # openai default
quality = completion_response._hidden_params.get("optional_params", {}).get(
quality = completion_response._hidden_params.get(
"optional_params", {}
).get(
"quality", "standard"
) # openai default
n = completion_response._hidden_params.get("optional_params", {}).get("n", 1) # openai default
n = completion_response._hidden_params.get("optional_params", {}).get(
"n", 1
) # openai default
else:
if len(messages) > 0:
prompt_tokens = token_counter(model=model, messages=messages)
@ -413,7 +478,10 @@ def completion_cost(
f"Model is None and does not exist in passed completion_response. Passed completion_response={completion_response}, model={model}"
)
if call_type == CallTypes.image_generation.value or call_type == CallTypes.aimage_generation.value:
if (
call_type == CallTypes.image_generation.value
or call_type == CallTypes.aimage_generation.value
):
### IMAGE GENERATION COST CALCULATION ###
if custom_llm_provider == "vertex_ai":
# https://cloud.google.com/vertex-ai/generative-ai/pricing
@ -431,23 +499,43 @@ def completion_cost(
height = int(size[0]) # if it's 1024-x-1024 vs. 1024x1024
width = int(size[1])
verbose_logger.debug(f"image_gen_model_name: {image_gen_model_name}")
verbose_logger.debug(f"image_gen_model_name_with_quality: {image_gen_model_name_with_quality}")
verbose_logger.debug(
f"image_gen_model_name_with_quality: {image_gen_model_name_with_quality}"
)
if image_gen_model_name in litellm.model_cost:
return litellm.model_cost[image_gen_model_name]["input_cost_per_pixel"] * height * width * n
return (
litellm.model_cost[image_gen_model_name]["input_cost_per_pixel"]
* height
* width
* n
)
elif image_gen_model_name_with_quality in litellm.model_cost:
return (
litellm.model_cost[image_gen_model_name_with_quality]["input_cost_per_pixel"] * height * width * n
litellm.model_cost[image_gen_model_name_with_quality][
"input_cost_per_pixel"
]
* height
* width
* n
)
else:
raise Exception(f"Model={image_gen_model_name} not found in completion cost model map")
raise Exception(
f"Model={image_gen_model_name} not found in completion cost model map"
)
# Calculate cost based on prompt_tokens, completion_tokens
if "togethercomputer" in model or "together_ai" in model or custom_llm_provider == "together_ai":
if (
"togethercomputer" in model
or "together_ai" in model
or custom_llm_provider == "together_ai"
):
# together ai prices based on size of llm
# get_model_params_and_category takes a model name and returns the category of LLM size it is in model_prices_and_context_window.json
model = get_model_params_and_category(model)
# replicate llms are calculate based on time for request running
# see https://replicate.com/pricing
elif (model in litellm.replicate_models or "replicate" in model) and model not in litellm.model_cost:
elif (
model in litellm.replicate_models or "replicate" in model
) and model not in litellm.model_cost:
# for unmapped replicate model, default to replicate's time tracking logic
return get_replicate_completion_pricing(completion_response, total_time)
@ -464,15 +552,21 @@ def completion_cost(
):
# Calculate the prompt characters + response characters
if len("messages") > 0:
prompt_string = litellm.utils.get_formatted_prompt(data={"messages": messages}, call_type="completion")
prompt_string = litellm.utils.get_formatted_prompt(
data={"messages": messages}, call_type="completion"
)
else:
prompt_string = ""
prompt_characters = litellm.utils._count_characters(text=prompt_string)
completion_string = litellm.utils.get_response_string(response_obj=completion_response)
completion_string = litellm.utils.get_response_string(
response_obj=completion_response
)
completion_characters = litellm.utils._count_characters(text=completion_string)
completion_characters = litellm.utils._count_characters(
text=completion_string
)
(
prompt_tokens_cost_usd_dollar,
@ -507,7 +601,7 @@ def response_cost_calculator(
TextCompletionResponse,
],
model: str,
custom_llm_provider: str,
custom_llm_provider: Optional[str],
call_type: Literal[
"embedding",
"aembedding",
@ -529,6 +623,10 @@ def response_cost_calculator(
base_model: Optional[str] = None,
custom_pricing: Optional[bool] = None,
) -> Optional[float]:
"""
Returns
- float or None: cost of response OR none if error.
"""
try:
response_cost: float = 0.0
if cache_hit is not None and cache_hit is True:
@ -544,7 +642,9 @@ def response_cost_calculator(
)
else:
if (
model in litellm.model_cost and custom_pricing is not None and custom_llm_provider is True
model in litellm.model_cost
and custom_pricing is not None
and custom_llm_provider is True
): # override defaults if custom pricing is set
base_model = model
# base_model defaults to None if not set on model_info
@ -556,5 +656,7 @@ def response_cost_calculator(
)
return response_cost
except litellm.NotFoundError as e:
print_verbose(f"Model={model} for LLM Provider={custom_llm_provider} not found in completion cost map.")
print_verbose(
f"Model={model} for LLM Provider={custom_llm_provider} not found in completion cost map."
)
return None

View file

@ -433,6 +433,7 @@ def get_custom_headers(
api_base: Optional[str] = None,
version: Optional[str] = None,
model_region: Optional[str] = None,
response_cost: Optional[Union[float, str]] = None,
fastest_response_batch_completion: Optional[bool] = None,
**kwargs,
) -> dict:
@ -443,6 +444,7 @@ def get_custom_headers(
"x-litellm-model-api-base": api_base,
"x-litellm-version": version,
"x-litellm-model-region": model_region,
"x-litellm-response-cost": str(response_cost),
"x-litellm-key-tpm-limit": str(user_api_key_dict.tpm_limit),
"x-litellm-key-rpm-limit": str(user_api_key_dict.rpm_limit),
"x-litellm-fastest_response_batch_completion": (
@ -3048,6 +3050,7 @@ async def chat_completion(
model_id = hidden_params.get("model_id", None) or ""
cache_key = hidden_params.get("cache_key", None) or ""
api_base = hidden_params.get("api_base", None) or ""
response_cost = hidden_params.get("response_cost", None) or ""
fastest_response_batch_completion = hidden_params.get(
"fastest_response_batch_completion", None
)
@ -3066,6 +3069,7 @@ async def chat_completion(
cache_key=cache_key,
api_base=api_base,
version=version,
response_cost=response_cost,
model_region=getattr(user_api_key_dict, "allowed_model_region", ""),
fastest_response_batch_completion=fastest_response_batch_completion,
)
@ -3095,6 +3099,7 @@ async def chat_completion(
cache_key=cache_key,
api_base=api_base,
version=version,
response_cost=response_cost,
model_region=getattr(user_api_key_dict, "allowed_model_region", ""),
fastest_response_batch_completion=fastest_response_batch_completion,
**additional_headers,
@ -3290,6 +3295,7 @@ async def completion(
model_id = hidden_params.get("model_id", None) or ""
cache_key = hidden_params.get("cache_key", None) or ""
api_base = hidden_params.get("api_base", None) or ""
response_cost = hidden_params.get("response_cost", None) or ""
### ALERTING ###
data["litellm_status"] = "success" # used for alerting
@ -3304,6 +3310,7 @@ async def completion(
cache_key=cache_key,
api_base=api_base,
version=version,
response_cost=response_cost,
)
selected_data_generator = select_data_generator(
response=response,
@ -3323,6 +3330,7 @@ async def completion(
cache_key=cache_key,
api_base=api_base,
version=version,
response_cost=response_cost,
)
)
@ -3527,6 +3535,7 @@ async def embeddings(
model_id = hidden_params.get("model_id", None) or ""
cache_key = hidden_params.get("cache_key", None) or ""
api_base = hidden_params.get("api_base", None) or ""
response_cost = hidden_params.get("response_cost", None) or ""
fastapi_response.headers.update(
get_custom_headers(
@ -3535,6 +3544,7 @@ async def embeddings(
cache_key=cache_key,
api_base=api_base,
version=version,
response_cost=response_cost,
model_region=getattr(user_api_key_dict, "allowed_model_region", ""),
)
)
@ -3676,6 +3686,7 @@ async def image_generation(
model_id = hidden_params.get("model_id", None) or ""
cache_key = hidden_params.get("cache_key", None) or ""
api_base = hidden_params.get("api_base", None) or ""
response_cost = hidden_params.get("response_cost", None) or ""
fastapi_response.headers.update(
get_custom_headers(
@ -3684,6 +3695,7 @@ async def image_generation(
cache_key=cache_key,
api_base=api_base,
version=version,
response_cost=response_cost,
model_region=getattr(user_api_key_dict, "allowed_model_region", ""),
)
)
@ -3812,6 +3824,7 @@ async def audio_speech(
model_id = hidden_params.get("model_id", None) or ""
cache_key = hidden_params.get("cache_key", None) or ""
api_base = hidden_params.get("api_base", None) or ""
response_cost = hidden_params.get("response_cost", None) or ""
# Printing each chunk size
async def generate(_response: HttpxBinaryResponseContent):
@ -3825,6 +3838,7 @@ async def audio_speech(
cache_key=cache_key,
api_base=api_base,
version=version,
response_cost=response_cost,
model_region=getattr(user_api_key_dict, "allowed_model_region", ""),
fastest_response_batch_completion=None,
)
@ -3976,6 +3990,7 @@ async def audio_transcriptions(
model_id = hidden_params.get("model_id", None) or ""
cache_key = hidden_params.get("cache_key", None) or ""
api_base = hidden_params.get("api_base", None) or ""
response_cost = hidden_params.get("response_cost", None) or ""
fastapi_response.headers.update(
get_custom_headers(
@ -3984,6 +3999,7 @@ async def audio_transcriptions(
cache_key=cache_key,
api_base=api_base,
version=version,
response_cost=response_cost,
model_region=getattr(user_api_key_dict, "allowed_model_region", ""),
)
)

View file

@ -4,7 +4,9 @@ import traceback
import litellm.cost_calculator
sys.path.insert(0, os.path.abspath("../..")) # Adds the parent directory to the system path
sys.path.insert(
0, os.path.abspath("../..")
) # Adds the parent directory to the system path
import asyncio
import time
from typing import Optional
@ -167,11 +169,15 @@ def test_cost_ft_gpt_35():
input_cost = model_cost["ft:gpt-3.5-turbo"]["input_cost_per_token"]
output_cost = model_cost["ft:gpt-3.5-turbo"]["output_cost_per_token"]
print(input_cost, output_cost)
expected_cost = (input_cost * resp.usage.prompt_tokens) + (output_cost * resp.usage.completion_tokens)
expected_cost = (input_cost * resp.usage.prompt_tokens) + (
output_cost * resp.usage.completion_tokens
)
print("\n Excpected cost", expected_cost)
assert cost == expected_cost
except Exception as e:
pytest.fail(f"Cost Calc failed for ft:gpt-3.5. Expected {expected_cost}, Calculated cost {cost}")
pytest.fail(
f"Cost Calc failed for ft:gpt-3.5. Expected {expected_cost}, Calculated cost {cost}"
)
# test_cost_ft_gpt_35()
@ -200,15 +206,21 @@ def test_cost_azure_gpt_35():
usage=Usage(prompt_tokens=21, completion_tokens=17, total_tokens=38),
)
cost = litellm.completion_cost(completion_response=resp, model="azure/gpt-35-turbo")
cost = litellm.completion_cost(
completion_response=resp, model="azure/gpt-35-turbo"
)
print("\n Calculated Cost for azure/gpt-3.5-turbo", cost)
input_cost = model_cost["azure/gpt-35-turbo"]["input_cost_per_token"]
output_cost = model_cost["azure/gpt-35-turbo"]["output_cost_per_token"]
expected_cost = (input_cost * resp.usage.prompt_tokens) + (output_cost * resp.usage.completion_tokens)
expected_cost = (input_cost * resp.usage.prompt_tokens) + (
output_cost * resp.usage.completion_tokens
)
print("\n Excpected cost", expected_cost)
assert cost == expected_cost
except Exception as e:
pytest.fail(f"Cost Calc failed for azure/gpt-3.5-turbo. Expected {expected_cost}, Calculated cost {cost}")
pytest.fail(
f"Cost Calc failed for azure/gpt-3.5-turbo. Expected {expected_cost}, Calculated cost {cost}"
)
# test_cost_azure_gpt_35()
@ -239,7 +251,9 @@ def test_cost_azure_embedding():
assert cost == expected_cost
except Exception as e:
pytest.fail(f"Cost Calc failed for azure/gpt-3.5-turbo. Expected {expected_cost}, Calculated cost {cost}")
pytest.fail(
f"Cost Calc failed for azure/gpt-3.5-turbo. Expected {expected_cost}, Calculated cost {cost}"
)
# test_cost_azure_embedding()
@ -315,7 +329,9 @@ def test_cost_bedrock_pricing_actual_calls():
litellm.set_verbose = True
model = "anthropic.claude-instant-v1"
messages = [{"role": "user", "content": "Hey, how's it going?"}]
response = litellm.completion(model=model, messages=messages, mock_response="hello cool one")
response = litellm.completion(
model=model, messages=messages, mock_response="hello cool one"
)
print("response", response)
cost = litellm.completion_cost(
@ -345,7 +361,8 @@ def test_whisper_openai():
print(f"cost: {cost}")
print(f"whisper dict: {litellm.model_cost['whisper-1']}")
expected_cost = round(
litellm.model_cost["whisper-1"]["output_cost_per_second"] * _total_time_in_seconds,
litellm.model_cost["whisper-1"]["output_cost_per_second"]
* _total_time_in_seconds,
5,
)
assert cost == expected_cost
@ -365,12 +382,15 @@ def test_whisper_azure():
_total_time_in_seconds = 3
transcription._response_ms = _total_time_in_seconds * 1000
cost = litellm.completion_cost(model="azure/azure-whisper", completion_response=transcription)
cost = litellm.completion_cost(
model="azure/azure-whisper", completion_response=transcription
)
print(f"cost: {cost}")
print(f"whisper dict: {litellm.model_cost['whisper-1']}")
expected_cost = round(
litellm.model_cost["whisper-1"]["output_cost_per_second"] * _total_time_in_seconds,
litellm.model_cost["whisper-1"]["output_cost_per_second"]
* _total_time_in_seconds,
5,
)
assert cost == expected_cost
@ -401,7 +421,9 @@ def test_dalle_3_azure_cost_tracking():
response.usage = {"prompt_tokens": 0, "completion_tokens": 0, "total_tokens": 0}
response._hidden_params = {"model": "dall-e-3", "model_id": None}
print(f"response hidden params: {response._hidden_params}")
cost = litellm.completion_cost(completion_response=response, call_type="image_generation")
cost = litellm.completion_cost(
completion_response=response, call_type="image_generation"
)
assert cost > 0
@ -433,7 +455,9 @@ def test_replicate_llama3_cost_tracking():
model="replicate/meta/meta-llama-3-8b-instruct",
object="chat.completion",
system_fingerprint=None,
usage=litellm.utils.Usage(prompt_tokens=48, completion_tokens=31, total_tokens=79),
usage=litellm.utils.Usage(
prompt_tokens=48, completion_tokens=31, total_tokens=79
),
)
cost = litellm.completion_cost(
completion_response=response,
@ -443,8 +467,14 @@ def test_replicate_llama3_cost_tracking():
print(f"cost: {cost}")
cost = round(cost, 5)
expected_cost = round(
litellm.model_cost["replicate/meta/meta-llama-3-8b-instruct"]["input_cost_per_token"] * 48
+ litellm.model_cost["replicate/meta/meta-llama-3-8b-instruct"]["output_cost_per_token"] * 31,
litellm.model_cost["replicate/meta/meta-llama-3-8b-instruct"][
"input_cost_per_token"
]
* 48
+ litellm.model_cost["replicate/meta/meta-llama-3-8b-instruct"][
"output_cost_per_token"
]
* 31,
5,
)
assert cost == expected_cost
@ -538,7 +568,9 @@ def test_together_ai_qwen_completion_cost():
"custom_cost_per_second": None,
}
response = litellm.cost_calculator.get_model_params_and_category(model_name="qwen/Qwen2-72B-Instruct")
response = litellm.cost_calculator.get_model_params_and_category(
model_name="qwen/Qwen2-72B-Instruct"
)
assert response == "together-ai-41.1b-80b"
@ -576,8 +608,12 @@ def test_gemini_completion_cost(above_128k, provider):
), "model info for model={} does not have pricing for > 128k tokens\nmodel_info={}".format(
model_name, model_info
)
input_cost = prompt_tokens * model_info["input_cost_per_token_above_128k_tokens"]
output_cost = output_tokens * model_info["output_cost_per_token_above_128k_tokens"]
input_cost = (
prompt_tokens * model_info["input_cost_per_token_above_128k_tokens"]
)
output_cost = (
output_tokens * model_info["output_cost_per_token_above_128k_tokens"]
)
else:
input_cost = prompt_tokens * model_info["input_cost_per_token"]
output_cost = output_tokens * model_info["output_cost_per_token"]
@ -674,3 +710,23 @@ def test_vertex_ai_claude_completion_cost():
)
predicted_cost = input_tokens * 0.000003 + 0.000015 * output_tokens
assert cost == predicted_cost
@pytest.mark.parametrize("sync_mode", [True, False])
@pytest.mark.asyncio
async def test_completion_cost_hidden_params(sync_mode):
    """Check that a completion call stores a float `response_cost` in `_hidden_params`.

    The test runs twice: once through `litellm.completion` (sync_mode=True)
    and once through `litellm.acompletion` (sync_mode=False). Both calls pass
    the same `mock_response` argument.
    """
    # Identical request arguments are shared by the sync and async entry points.
    request_kwargs = {
        "model": "gpt-3.5-turbo",
        "messages": [{"role": "user", "content": "Hey, how's it going?"}],
        "mock_response": "Hello world",
    }

    if not sync_mode:
        response = await litellm.acompletion(**request_kwargs)
    else:
        response = litellm.completion(**request_kwargs)

    hidden_params = response._hidden_params
    # The cost must be present and must be a float.
    assert "response_cost" in hidden_params
    assert isinstance(hidden_params["response_cost"], float)

View file

@ -899,6 +899,17 @@ def client(original_function):
model=model,
optional_params=getattr(logging_obj, "optional_params", {}),
)
result._hidden_params["response_cost"] = (
litellm.response_cost_calculator(
response_object=result,
model=getattr(logging_obj, "model", ""),
custom_llm_provider=getattr(
logging_obj, "custom_llm_provider", None
),
call_type=getattr(logging_obj, "call_type", "completion"),
optional_params=getattr(logging_obj, "optional_params", {}),
)
)
result._response_ms = (
end_time - start_time
).total_seconds() * 1000 # return response latency in ms like openai
@ -1292,6 +1303,17 @@ def client(original_function):
model=model,
optional_params=kwargs,
)
result._hidden_params["response_cost"] = (
litellm.response_cost_calculator(
response_object=result,
model=getattr(logging_obj, "model", ""),
custom_llm_provider=getattr(
logging_obj, "custom_llm_provider", None
),
call_type=getattr(logging_obj, "call_type", "completion"),
optional_params=getattr(logging_obj, "optional_params", {}),
)
)
if (
isinstance(result, ModelResponse)
or isinstance(result, EmbeddingResponse)