diff --git a/litellm/cost_calculator.py b/litellm/cost_calculator.py index d61e812d0..993344e5b 100644 --- a/litellm/cost_calculator.py +++ b/litellm/cost_calculator.py @@ -101,8 +101,12 @@ def cost_per_token( if custom_llm_provider is not None: model_with_provider = custom_llm_provider + "/" + model if region_name is not None: - model_with_provider_and_region = f"{custom_llm_provider}/{region_name}/{model}" - if model_with_provider_and_region in model_cost_ref: # use region based pricing, if it's available + model_with_provider_and_region = ( + f"{custom_llm_provider}/{region_name}/{model}" + ) + if ( + model_with_provider_and_region in model_cost_ref + ): # use region based pricing, if it's available model_with_provider = model_with_provider_and_region else: _, custom_llm_provider, _, _ = litellm.get_llm_provider(model=model) @@ -118,7 +122,9 @@ def cost_per_token( Option2. model = "openai/gpt-4" - model = provider/model Option3. model = "anthropic.claude-3" - model = model """ - if model_with_provider in model_cost_ref: # Option 2. use model with provider, model = "openai/gpt-4" + if ( + model_with_provider in model_cost_ref + ): # Option 2. use model with provider, model = "openai/gpt-4" model = model_with_provider elif model in model_cost_ref: # Option 1. use model passed, model="gpt-4" model = model @@ -154,29 +160,45 @@ def cost_per_token( ) elif model in model_cost_ref: print_verbose(f"Success: model={model} in model_cost_map") - print_verbose(f"prompt_tokens={prompt_tokens}; completion_tokens={completion_tokens}") + print_verbose( + f"prompt_tokens={prompt_tokens}; completion_tokens={completion_tokens}" + ) if ( model_cost_ref[model].get("input_cost_per_token", None) is not None and model_cost_ref[model].get("output_cost_per_token", None) is not None ): ## COST PER TOKEN ## - prompt_tokens_cost_usd_dollar = model_cost_ref[model]["input_cost_per_token"] * prompt_tokens - completion_tokens_cost_usd_dollar = model_cost_ref[model]["output_cost_per_token"] * completion_tokens - elif model_cost_ref[model].get("output_cost_per_second", None) is not None and response_time_ms is not None: + prompt_tokens_cost_usd_dollar = ( + model_cost_ref[model]["input_cost_per_token"] * prompt_tokens + ) + completion_tokens_cost_usd_dollar = ( + model_cost_ref[model]["output_cost_per_token"] * completion_tokens + ) + elif ( + model_cost_ref[model].get("output_cost_per_second", None) is not None + and response_time_ms is not None + ): print_verbose( f"For model={model} - output_cost_per_second: {model_cost_ref[model].get('output_cost_per_second')}; response time: {response_time_ms}" ) ## COST PER SECOND ## prompt_tokens_cost_usd_dollar = 0 completion_tokens_cost_usd_dollar = ( - model_cost_ref[model]["output_cost_per_second"] * response_time_ms / 1000 + model_cost_ref[model]["output_cost_per_second"] + * response_time_ms + / 1000 ) - elif model_cost_ref[model].get("input_cost_per_second", None) is not None and response_time_ms is not None: + elif ( + model_cost_ref[model].get("input_cost_per_second", None) is not None + and response_time_ms is not None + ): print_verbose( f"For model={model} - input_cost_per_second: {model_cost_ref[model].get('input_cost_per_second')}; response time: {response_time_ms}" ) ## COST PER SECOND ## - prompt_tokens_cost_usd_dollar = model_cost_ref[model]["input_cost_per_second"] * response_time_ms / 1000 + prompt_tokens_cost_usd_dollar = ( + model_cost_ref[model]["input_cost_per_second"] * response_time_ms / 1000 + ) completion_tokens_cost_usd_dollar = 0.0 print_verbose( 
f"Returned custom cost for model={model} - prompt_tokens_cost_usd_dollar: {prompt_tokens_cost_usd_dollar}, completion_tokens_cost_usd_dollar: {completion_tokens_cost_usd_dollar}" @@ -185,40 +207,57 @@ def cost_per_token( elif "ft:gpt-3.5-turbo" in model: print_verbose(f"Cost Tracking: {model} is an OpenAI FinteTuned LLM") # fuzzy match ft:gpt-3.5-turbo:abcd-id-cool-litellm - prompt_tokens_cost_usd_dollar = model_cost_ref["ft:gpt-3.5-turbo"]["input_cost_per_token"] * prompt_tokens + prompt_tokens_cost_usd_dollar = ( + model_cost_ref["ft:gpt-3.5-turbo"]["input_cost_per_token"] * prompt_tokens + ) completion_tokens_cost_usd_dollar = ( - model_cost_ref["ft:gpt-3.5-turbo"]["output_cost_per_token"] * completion_tokens + model_cost_ref["ft:gpt-3.5-turbo"]["output_cost_per_token"] + * completion_tokens ) return prompt_tokens_cost_usd_dollar, completion_tokens_cost_usd_dollar elif "ft:gpt-4-0613" in model: print_verbose(f"Cost Tracking: {model} is an OpenAI FinteTuned LLM") # fuzzy match ft:gpt-4-0613:abcd-id-cool-litellm - prompt_tokens_cost_usd_dollar = model_cost_ref["ft:gpt-4-0613"]["input_cost_per_token"] * prompt_tokens - completion_tokens_cost_usd_dollar = model_cost_ref["ft:gpt-4-0613"]["output_cost_per_token"] * completion_tokens + prompt_tokens_cost_usd_dollar = ( + model_cost_ref["ft:gpt-4-0613"]["input_cost_per_token"] * prompt_tokens + ) + completion_tokens_cost_usd_dollar = ( + model_cost_ref["ft:gpt-4-0613"]["output_cost_per_token"] * completion_tokens + ) return prompt_tokens_cost_usd_dollar, completion_tokens_cost_usd_dollar elif "ft:gpt-4o-2024-05-13" in model: print_verbose(f"Cost Tracking: {model} is an OpenAI FinteTuned LLM") # fuzzy match ft:gpt-4o-2024-05-13:abcd-id-cool-litellm - prompt_tokens_cost_usd_dollar = model_cost_ref["ft:gpt-4o-2024-05-13"]["input_cost_per_token"] * prompt_tokens + prompt_tokens_cost_usd_dollar = ( + model_cost_ref["ft:gpt-4o-2024-05-13"]["input_cost_per_token"] + * prompt_tokens + ) completion_tokens_cost_usd_dollar = ( - model_cost_ref["ft:gpt-4o-2024-05-13"]["output_cost_per_token"] * completion_tokens + model_cost_ref["ft:gpt-4o-2024-05-13"]["output_cost_per_token"] + * completion_tokens ) return prompt_tokens_cost_usd_dollar, completion_tokens_cost_usd_dollar elif "ft:davinci-002" in model: print_verbose(f"Cost Tracking: {model} is an OpenAI FinteTuned LLM") # fuzzy match ft:davinci-002:abcd-id-cool-litellm - prompt_tokens_cost_usd_dollar = model_cost_ref["ft:davinci-002"]["input_cost_per_token"] * prompt_tokens + prompt_tokens_cost_usd_dollar = ( + model_cost_ref["ft:davinci-002"]["input_cost_per_token"] * prompt_tokens + ) completion_tokens_cost_usd_dollar = ( - model_cost_ref["ft:davinci-002"]["output_cost_per_token"] * completion_tokens + model_cost_ref["ft:davinci-002"]["output_cost_per_token"] + * completion_tokens ) return prompt_tokens_cost_usd_dollar, completion_tokens_cost_usd_dollar elif "ft:babbage-002" in model: print_verbose(f"Cost Tracking: {model} is an OpenAI FinteTuned LLM") # fuzzy match ft:babbage-002:abcd-id-cool-litellm - prompt_tokens_cost_usd_dollar = model_cost_ref["ft:babbage-002"]["input_cost_per_token"] * prompt_tokens + prompt_tokens_cost_usd_dollar = ( + model_cost_ref["ft:babbage-002"]["input_cost_per_token"] * prompt_tokens + ) completion_tokens_cost_usd_dollar = ( - model_cost_ref["ft:babbage-002"]["output_cost_per_token"] * completion_tokens + model_cost_ref["ft:babbage-002"]["output_cost_per_token"] + * completion_tokens ) return prompt_tokens_cost_usd_dollar, completion_tokens_cost_usd_dollar elif model in 
litellm.azure_llms: @@ -227,17 +266,25 @@ def cost_per_token( verbose_logger.debug( f"applying cost={model_cost_ref[model]['input_cost_per_token']} for prompt_tokens={prompt_tokens}" ) - prompt_tokens_cost_usd_dollar = model_cost_ref[model]["input_cost_per_token"] * prompt_tokens + prompt_tokens_cost_usd_dollar = ( + model_cost_ref[model]["input_cost_per_token"] * prompt_tokens + ) verbose_logger.debug( f"applying cost={model_cost_ref[model]['output_cost_per_token']} for completion_tokens={completion_tokens}" ) - completion_tokens_cost_usd_dollar = model_cost_ref[model]["output_cost_per_token"] * completion_tokens + completion_tokens_cost_usd_dollar = ( + model_cost_ref[model]["output_cost_per_token"] * completion_tokens + ) return prompt_tokens_cost_usd_dollar, completion_tokens_cost_usd_dollar elif model in litellm.azure_embedding_models: verbose_logger.debug(f"Cost Tracking: {model} is an Azure Embedding Model") model = litellm.azure_embedding_models[model] - prompt_tokens_cost_usd_dollar = model_cost_ref[model]["input_cost_per_token"] * prompt_tokens - completion_tokens_cost_usd_dollar = model_cost_ref[model]["output_cost_per_token"] * completion_tokens + prompt_tokens_cost_usd_dollar = ( + model_cost_ref[model]["input_cost_per_token"] * prompt_tokens + ) + completion_tokens_cost_usd_dollar = ( + model_cost_ref[model]["output_cost_per_token"] * completion_tokens + ) return prompt_tokens_cost_usd_dollar, completion_tokens_cost_usd_dollar else: # if model is not in model_prices_and_context_window.json. Raise an exception-let users know @@ -261,7 +308,9 @@ def get_model_params_and_category(model_name) -> str: import re model_name = model_name.lower() - re_params_match = re.search(r"(\d+b)", model_name) # catch all decimals like 3b, 70b, etc + re_params_match = re.search( + r"(\d+b)", model_name + ) # catch all decimals like 3b, 70b, etc category = None if re_params_match is not None: params_match = str(re_params_match.group(1)) @@ -292,7 +341,9 @@ def get_model_params_and_category(model_name) -> str: def get_replicate_completion_pricing(completion_response=None, total_time=0.0): # see https://replicate.com/pricing # for all litellm currently supported LLMs, almost all requests go to a100_80gb - a100_80gb_price_per_second_public = 0.001400 # assume all calls sent to A100 80GB for now + a100_80gb_price_per_second_public = ( + 0.001400 # assume all calls sent to A100 80GB for now + ) if total_time == 0.0: # total time is in ms start_time = completion_response["created"] end_time = getattr(completion_response, "ended", time.time()) @@ -381,9 +432,13 @@ def completion_cost( if completion_response is not None: # get input/output tokens from completion_response prompt_tokens = completion_response.get("usage", {}).get("prompt_tokens", 0) - completion_tokens = completion_response.get("usage", {}).get("completion_tokens", 0) + completion_tokens = completion_response.get("usage", {}).get( + "completion_tokens", 0 + ) total_time = completion_response.get("_response_ms", 0) - verbose_logger.debug(f"completion_response response ms: {completion_response.get('_response_ms')} ") + verbose_logger.debug( + f"completion_response response ms: {completion_response.get('_response_ms')} " + ) model = model or completion_response.get( "model", None ) # check if user passed an override for model, if it's none check completion_response['model'] @@ -393,15 +448,25 @@ def completion_cost( and len(completion_response._hidden_params["model"]) > 0 ): model = completion_response._hidden_params.get("model", model) - 
custom_llm_provider = completion_response._hidden_params.get("custom_llm_provider", "") - region_name = completion_response._hidden_params.get("region_name", region_name) - size = completion_response._hidden_params.get("optional_params", {}).get( + custom_llm_provider = completion_response._hidden_params.get( + "custom_llm_provider", "" + ) + region_name = completion_response._hidden_params.get( + "region_name", region_name + ) + size = completion_response._hidden_params.get( + "optional_params", {} + ).get( "size", "1024-x-1024" ) # openai default - quality = completion_response._hidden_params.get("optional_params", {}).get( + quality = completion_response._hidden_params.get( + "optional_params", {} + ).get( "quality", "standard" ) # openai default - n = completion_response._hidden_params.get("optional_params", {}).get("n", 1) # openai default + n = completion_response._hidden_params.get("optional_params", {}).get( + "n", 1 + ) # openai default else: if len(messages) > 0: prompt_tokens = token_counter(model=model, messages=messages) @@ -413,7 +478,10 @@ def completion_cost( f"Model is None and does not exist in passed completion_response. Passed completion_response={completion_response}, model={model}" ) - if call_type == CallTypes.image_generation.value or call_type == CallTypes.aimage_generation.value: + if ( + call_type == CallTypes.image_generation.value + or call_type == CallTypes.aimage_generation.value + ): ### IMAGE GENERATION COST CALCULATION ### if custom_llm_provider == "vertex_ai": # https://cloud.google.com/vertex-ai/generative-ai/pricing @@ -431,23 +499,43 @@ def completion_cost( height = int(size[0]) # if it's 1024-x-1024 vs. 1024x1024 width = int(size[1]) verbose_logger.debug(f"image_gen_model_name: {image_gen_model_name}") - verbose_logger.debug(f"image_gen_model_name_with_quality: {image_gen_model_name_with_quality}") + verbose_logger.debug( + f"image_gen_model_name_with_quality: {image_gen_model_name_with_quality}" + ) if image_gen_model_name in litellm.model_cost: - return litellm.model_cost[image_gen_model_name]["input_cost_per_pixel"] * height * width * n + return ( + litellm.model_cost[image_gen_model_name]["input_cost_per_pixel"] + * height + * width + * n + ) elif image_gen_model_name_with_quality in litellm.model_cost: return ( - litellm.model_cost[image_gen_model_name_with_quality]["input_cost_per_pixel"] * height * width * n + litellm.model_cost[image_gen_model_name_with_quality][ + "input_cost_per_pixel" + ] + * height + * width + * n ) else: - raise Exception(f"Model={image_gen_model_name} not found in completion cost model map") + raise Exception( + f"Model={image_gen_model_name} not found in completion cost model map" + ) # Calculate cost based on prompt_tokens, completion_tokens - if "togethercomputer" in model or "together_ai" in model or custom_llm_provider == "together_ai": + if ( + "togethercomputer" in model + or "together_ai" in model + or custom_llm_provider == "together_ai" + ): # together ai prices based on size of llm # get_model_params_and_category takes a model name and returns the category of LLM size it is in model_prices_and_context_window.json model = get_model_params_and_category(model) # replicate llms are calculate based on time for request running # see https://replicate.com/pricing - elif (model in litellm.replicate_models or "replicate" in model) and model not in litellm.model_cost: + elif ( + model in litellm.replicate_models or "replicate" in model + ) and model not in litellm.model_cost: # for unmapped replicate model, default to 
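The image-generation branch above prices a request by pixel count: input_cost_per_pixel times height times width times n images, with height and width parsed from the "1024-x-1024" size string in optional_params. A worked example with a placeholder rate (not a real litellm price):

# Worked example of the pixel-based image pricing above; the rate is a placeholder.
input_cost_per_pixel = 0.000001  # hypothetical USD per pixel
height, width = 1024, 1024       # parsed from the "1024-x-1024" size string
n = 2                            # number of images requested
cost = input_cost_per_pixel * height * width * n
print(f"image generation cost: ${cost:.4f}")  # 2.0972 with these placeholder numbers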
replicate's time tracking logic return get_replicate_completion_pricing(completion_response, total_time) @@ -464,15 +552,21 @@ def completion_cost( ): # Calculate the prompt characters + response characters if len("messages") > 0: - prompt_string = litellm.utils.get_formatted_prompt(data={"messages": messages}, call_type="completion") + prompt_string = litellm.utils.get_formatted_prompt( + data={"messages": messages}, call_type="completion" + ) else: prompt_string = "" prompt_characters = litellm.utils._count_characters(text=prompt_string) - completion_string = litellm.utils.get_response_string(response_obj=completion_response) + completion_string = litellm.utils.get_response_string( + response_obj=completion_response + ) - completion_characters = litellm.utils._count_characters(text=completion_string) + completion_characters = litellm.utils._count_characters( + text=completion_string + ) ( prompt_tokens_cost_usd_dollar, @@ -507,7 +601,7 @@ def response_cost_calculator( TextCompletionResponse, ], model: str, - custom_llm_provider: str, + custom_llm_provider: Optional[str], call_type: Literal[ "embedding", "aembedding", @@ -529,6 +623,10 @@ def response_cost_calculator( base_model: Optional[str] = None, custom_pricing: Optional[bool] = None, ) -> Optional[float]: + """ + Returns + - float or None: cost of response OR none if error. + """ try: response_cost: float = 0.0 if cache_hit is not None and cache_hit is True: @@ -544,7 +642,9 @@ def response_cost_calculator( ) else: if ( - model in litellm.model_cost and custom_pricing is not None and custom_llm_provider is True + model in litellm.model_cost + and custom_pricing is not None + and custom_llm_provider is True ): # override defaults if custom pricing is set base_model = model # base_model defaults to None if not set on model_info @@ -556,5 +656,7 @@ def response_cost_calculator( ) return response_cost except litellm.NotFoundError as e: - print_verbose(f"Model={model} for LLM Provider={custom_llm_provider} not found in completion cost map.") + print_verbose( + f"Model={model} for LLM Provider={custom_llm_provider} not found in completion cost map." 
+ ) return None diff --git a/litellm/proxy/proxy_server.py b/litellm/proxy/proxy_server.py index c3b855c5f..8844dc54d 100644 --- a/litellm/proxy/proxy_server.py +++ b/litellm/proxy/proxy_server.py @@ -433,6 +433,7 @@ def get_custom_headers( api_base: Optional[str] = None, version: Optional[str] = None, model_region: Optional[str] = None, + response_cost: Optional[Union[float, str]] = None, fastest_response_batch_completion: Optional[bool] = None, **kwargs, ) -> dict: @@ -443,6 +444,7 @@ def get_custom_headers( "x-litellm-model-api-base": api_base, "x-litellm-version": version, "x-litellm-model-region": model_region, + "x-litellm-response-cost": str(response_cost), "x-litellm-key-tpm-limit": str(user_api_key_dict.tpm_limit), "x-litellm-key-rpm-limit": str(user_api_key_dict.rpm_limit), "x-litellm-fastest_response_batch_completion": ( @@ -3048,6 +3050,7 @@ async def chat_completion( model_id = hidden_params.get("model_id", None) or "" cache_key = hidden_params.get("cache_key", None) or "" api_base = hidden_params.get("api_base", None) or "" + response_cost = hidden_params.get("response_cost", None) or "" fastest_response_batch_completion = hidden_params.get( "fastest_response_batch_completion", None ) @@ -3066,6 +3069,7 @@ async def chat_completion( cache_key=cache_key, api_base=api_base, version=version, + response_cost=response_cost, model_region=getattr(user_api_key_dict, "allowed_model_region", ""), fastest_response_batch_completion=fastest_response_batch_completion, ) @@ -3095,6 +3099,7 @@ async def chat_completion( cache_key=cache_key, api_base=api_base, version=version, + response_cost=response_cost, model_region=getattr(user_api_key_dict, "allowed_model_region", ""), fastest_response_batch_completion=fastest_response_batch_completion, **additional_headers, @@ -3290,6 +3295,7 @@ async def completion( model_id = hidden_params.get("model_id", None) or "" cache_key = hidden_params.get("cache_key", None) or "" api_base = hidden_params.get("api_base", None) or "" + response_cost = hidden_params.get("response_cost", None) or "" ### ALERTING ### data["litellm_status"] = "success" # used for alerting @@ -3304,6 +3310,7 @@ async def completion( cache_key=cache_key, api_base=api_base, version=version, + response_cost=response_cost, ) selected_data_generator = select_data_generator( response=response, @@ -3323,6 +3330,7 @@ async def completion( cache_key=cache_key, api_base=api_base, version=version, + response_cost=response_cost, ) ) @@ -3527,6 +3535,7 @@ async def embeddings( model_id = hidden_params.get("model_id", None) or "" cache_key = hidden_params.get("cache_key", None) or "" api_base = hidden_params.get("api_base", None) or "" + response_cost = hidden_params.get("response_cost", None) or "" fastapi_response.headers.update( get_custom_headers( @@ -3535,6 +3544,7 @@ async def embeddings( cache_key=cache_key, api_base=api_base, version=version, + response_cost=response_cost, model_region=getattr(user_api_key_dict, "allowed_model_region", ""), ) ) @@ -3676,6 +3686,7 @@ async def image_generation( model_id = hidden_params.get("model_id", None) or "" cache_key = hidden_params.get("cache_key", None) or "" api_base = hidden_params.get("api_base", None) or "" + response_cost = hidden_params.get("response_cost", None) or "" fastapi_response.headers.update( get_custom_headers( @@ -3684,6 +3695,7 @@ async def image_generation( cache_key=cache_key, api_base=api_base, version=version, + response_cost=response_cost, model_region=getattr(user_api_key_dict, "allowed_model_region", ""), ) ) @@ -3812,6 
+3824,7 @@ async def audio_speech( model_id = hidden_params.get("model_id", None) or "" cache_key = hidden_params.get("cache_key", None) or "" api_base = hidden_params.get("api_base", None) or "" + response_cost = hidden_params.get("response_cost", None) or "" # Printing each chunk size async def generate(_response: HttpxBinaryResponseContent): @@ -3825,6 +3838,7 @@ async def audio_speech( cache_key=cache_key, api_base=api_base, version=version, + response_cost=response_cost, model_region=getattr(user_api_key_dict, "allowed_model_region", ""), fastest_response_batch_completion=None, ) @@ -3976,6 +3990,7 @@ async def audio_transcriptions( model_id = hidden_params.get("model_id", None) or "" cache_key = hidden_params.get("cache_key", None) or "" api_base = hidden_params.get("api_base", None) or "" + response_cost = hidden_params.get("response_cost", None) or "" fastapi_response.headers.update( get_custom_headers( @@ -3984,6 +3999,7 @@ async def audio_transcriptions( cache_key=cache_key, api_base=api_base, version=version, + response_cost=response_cost, model_region=getattr(user_api_key_dict, "allowed_model_region", ""), ) ) diff --git a/litellm/tests/test_completion_cost.py b/litellm/tests/test_completion_cost.py index e854345b3..017d3ef72 100644 --- a/litellm/tests/test_completion_cost.py +++ b/litellm/tests/test_completion_cost.py @@ -4,7 +4,9 @@ import traceback import litellm.cost_calculator -sys.path.insert(0, os.path.abspath("../..")) # Adds the parent directory to the system path +sys.path.insert( + 0, os.path.abspath("../..") +) # Adds the parent directory to the system path import asyncio import time from typing import Optional @@ -167,11 +169,15 @@ def test_cost_ft_gpt_35(): input_cost = model_cost["ft:gpt-3.5-turbo"]["input_cost_per_token"] output_cost = model_cost["ft:gpt-3.5-turbo"]["output_cost_per_token"] print(input_cost, output_cost) - expected_cost = (input_cost * resp.usage.prompt_tokens) + (output_cost * resp.usage.completion_tokens) + expected_cost = (input_cost * resp.usage.prompt_tokens) + ( + output_cost * resp.usage.completion_tokens + ) print("\n Excpected cost", expected_cost) assert cost == expected_cost except Exception as e: - pytest.fail(f"Cost Calc failed for ft:gpt-3.5. Expected {expected_cost}, Calculated cost {cost}") + pytest.fail( + f"Cost Calc failed for ft:gpt-3.5. Expected {expected_cost}, Calculated cost {cost}" + ) # test_cost_ft_gpt_35() @@ -200,15 +206,21 @@ def test_cost_azure_gpt_35(): usage=Usage(prompt_tokens=21, completion_tokens=17, total_tokens=38), ) - cost = litellm.completion_cost(completion_response=resp, model="azure/gpt-35-turbo") + cost = litellm.completion_cost( + completion_response=resp, model="azure/gpt-35-turbo" + ) print("\n Calculated Cost for azure/gpt-3.5-turbo", cost) input_cost = model_cost["azure/gpt-35-turbo"]["input_cost_per_token"] output_cost = model_cost["azure/gpt-35-turbo"]["output_cost_per_token"] - expected_cost = (input_cost * resp.usage.prompt_tokens) + (output_cost * resp.usage.completion_tokens) + expected_cost = (input_cost * resp.usage.prompt_tokens) + ( + output_cost * resp.usage.completion_tokens + ) print("\n Excpected cost", expected_cost) assert cost == expected_cost except Exception as e: - pytest.fail(f"Cost Calc failed for azure/gpt-3.5-turbo. Expected {expected_cost}, Calculated cost {cost}") + pytest.fail( + f"Cost Calc failed for azure/gpt-3.5-turbo. 
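The proxy_server.py hunks above thread the computed cost into get_custom_headers, so chat, completion, embedding, image, speech and transcription responses now carry an x-litellm-response-cost header. A minimal way to read it, assuming a LiteLLM proxy on its default local port and a placeholder key (the header value is stringified and may be empty when no cost could be computed):

# Read the new cost header off a proxy response; the URL and key below are assumptions.
import requests

resp = requests.post(
    "http://localhost:4000/v1/chat/completions",
    headers={"Authorization": "Bearer sk-1234"},  # placeholder proxy key
    json={
        "model": "gpt-3.5-turbo",
        "messages": [{"role": "user", "content": "Hey, how's it going?"}],
    },
)
print(resp.headers.get("x-litellm-response-cost"))  # stringified cost, e.g. "3.75e-05"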
Expected {expected_cost}, Calculated cost {cost}" + ) # test_cost_azure_gpt_35() @@ -239,7 +251,9 @@ def test_cost_azure_embedding(): assert cost == expected_cost except Exception as e: - pytest.fail(f"Cost Calc failed for azure/gpt-3.5-turbo. Expected {expected_cost}, Calculated cost {cost}") + pytest.fail( + f"Cost Calc failed for azure/gpt-3.5-turbo. Expected {expected_cost}, Calculated cost {cost}" + ) # test_cost_azure_embedding() @@ -315,7 +329,9 @@ def test_cost_bedrock_pricing_actual_calls(): litellm.set_verbose = True model = "anthropic.claude-instant-v1" messages = [{"role": "user", "content": "Hey, how's it going?"}] - response = litellm.completion(model=model, messages=messages, mock_response="hello cool one") + response = litellm.completion( + model=model, messages=messages, mock_response="hello cool one" + ) print("response", response) cost = litellm.completion_cost( @@ -345,7 +361,8 @@ def test_whisper_openai(): print(f"cost: {cost}") print(f"whisper dict: {litellm.model_cost['whisper-1']}") expected_cost = round( - litellm.model_cost["whisper-1"]["output_cost_per_second"] * _total_time_in_seconds, + litellm.model_cost["whisper-1"]["output_cost_per_second"] + * _total_time_in_seconds, 5, ) assert cost == expected_cost @@ -365,12 +382,15 @@ def test_whisper_azure(): _total_time_in_seconds = 3 transcription._response_ms = _total_time_in_seconds * 1000 - cost = litellm.completion_cost(model="azure/azure-whisper", completion_response=transcription) + cost = litellm.completion_cost( + model="azure/azure-whisper", completion_response=transcription + ) print(f"cost: {cost}") print(f"whisper dict: {litellm.model_cost['whisper-1']}") expected_cost = round( - litellm.model_cost["whisper-1"]["output_cost_per_second"] * _total_time_in_seconds, + litellm.model_cost["whisper-1"]["output_cost_per_second"] + * _total_time_in_seconds, 5, ) assert cost == expected_cost @@ -401,7 +421,9 @@ def test_dalle_3_azure_cost_tracking(): response.usage = {"prompt_tokens": 0, "completion_tokens": 0, "total_tokens": 0} response._hidden_params = {"model": "dall-e-3", "model_id": None} print(f"response hidden params: {response._hidden_params}") - cost = litellm.completion_cost(completion_response=response, call_type="image_generation") + cost = litellm.completion_cost( + completion_response=response, call_type="image_generation" + ) assert cost > 0 @@ -433,7 +455,9 @@ def test_replicate_llama3_cost_tracking(): model="replicate/meta/meta-llama-3-8b-instruct", object="chat.completion", system_fingerprint=None, - usage=litellm.utils.Usage(prompt_tokens=48, completion_tokens=31, total_tokens=79), + usage=litellm.utils.Usage( + prompt_tokens=48, completion_tokens=31, total_tokens=79 + ), ) cost = litellm.completion_cost( completion_response=response, @@ -443,8 +467,14 @@ def test_replicate_llama3_cost_tracking(): print(f"cost: {cost}") cost = round(cost, 5) expected_cost = round( - litellm.model_cost["replicate/meta/meta-llama-3-8b-instruct"]["input_cost_per_token"] * 48 - + litellm.model_cost["replicate/meta/meta-llama-3-8b-instruct"]["output_cost_per_token"] * 31, + litellm.model_cost["replicate/meta/meta-llama-3-8b-instruct"][ + "input_cost_per_token" + ] + * 48 + + litellm.model_cost["replicate/meta/meta-llama-3-8b-instruct"][ + "output_cost_per_token" + ] + * 31, 5, ) assert cost == expected_cost @@ -538,7 +568,9 @@ def test_together_ai_qwen_completion_cost(): "custom_cost_per_second": None, } - response = litellm.cost_calculator.get_model_params_and_category(model_name="qwen/Qwen2-72B-Instruct") + 
response = litellm.cost_calculator.get_model_params_and_category( + model_name="qwen/Qwen2-72B-Instruct" + ) assert response == "together-ai-41.1b-80b" @@ -576,8 +608,12 @@ def test_gemini_completion_cost(above_128k, provider): ), "model info for model={} does not have pricing for > 128k tokens\nmodel_info={}".format( model_name, model_info ) - input_cost = prompt_tokens * model_info["input_cost_per_token_above_128k_tokens"] - output_cost = output_tokens * model_info["output_cost_per_token_above_128k_tokens"] + input_cost = ( + prompt_tokens * model_info["input_cost_per_token_above_128k_tokens"] + ) + output_cost = ( + output_tokens * model_info["output_cost_per_token_above_128k_tokens"] + ) else: input_cost = prompt_tokens * model_info["input_cost_per_token"] output_cost = output_tokens * model_info["output_cost_per_token"] @@ -674,3 +710,23 @@ def test_vertex_ai_claude_completion_cost(): ) predicted_cost = input_tokens * 0.000003 + 0.000015 * output_tokens assert cost == predicted_cost + + +@pytest.mark.parametrize("sync_mode", [True, False]) +@pytest.mark.asyncio +async def test_completion_cost_hidden_params(sync_mode): + if sync_mode: + response = litellm.completion( + model="gpt-3.5-turbo", + messages=[{"role": "user", "content": "Hey, how's it going?"}], + mock_response="Hello world", + ) + else: + response = await litellm.acompletion( + model="gpt-3.5-turbo", + messages=[{"role": "user", "content": "Hey, how's it going?"}], + mock_response="Hello world", + ) + + assert "response_cost" in response._hidden_params + assert isinstance(response._hidden_params["response_cost"], float) diff --git a/litellm/utils.py b/litellm/utils.py index 76c93d589..0f5ff6863 100644 --- a/litellm/utils.py +++ b/litellm/utils.py @@ -899,6 +899,17 @@ def client(original_function): model=model, optional_params=getattr(logging_obj, "optional_params", {}), ) + result._hidden_params["response_cost"] = ( + litellm.response_cost_calculator( + response_object=result, + model=getattr(logging_obj, "model", ""), + custom_llm_provider=getattr( + logging_obj, "custom_llm_provider", None + ), + call_type=getattr(logging_obj, "call_type", "completion"), + optional_params=getattr(logging_obj, "optional_params", {}), + ) + ) result._response_ms = ( end_time - start_time ).total_seconds() * 1000 # return response latency in ms like openai @@ -1292,6 +1303,17 @@ def client(original_function): model=model, optional_params=kwargs, ) + result._hidden_params["response_cost"] = ( + litellm.response_cost_calculator( + response_object=result, + model=getattr(logging_obj, "model", ""), + custom_llm_provider=getattr( + logging_obj, "custom_llm_provider", None + ), + call_type=getattr(logging_obj, "call_type", "completion"), + optional_params=getattr(logging_obj, "optional_params", {}), + ) + ) if ( isinstance(result, ModelResponse) or isinstance(result, EmbeddingResponse)
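Together with the utils.py wrapper changes, the new test_completion_cost_hidden_params test pins down the SDK-side behaviour: every completion response now exposes its computed cost in _hidden_params. A quick check mirroring that test (mock_response avoids a real API call; real costs depend on the model's pricing entry):

# Mirrors the new test above: the response cost is attached to _hidden_params.
import litellm

response = litellm.completion(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "Hey, how's it going?"}],
    mock_response="Hello world",  # mocked response, still goes through cost tracking
)
print(response._hidden_params["response_cost"])  # float, USD cost of the call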