From f5da95685a4f384a47b0af5631b2112283a3daa6 Mon Sep 17 00:00:00 2001 From: Krrish Dholakia Date: Fri, 26 Jan 2024 14:53:58 -0800 Subject: [PATCH] feat(utils.py): support region based pricing for bedrock + use bedrock's token counts if given --- litellm/budget_manager.py | 11 +++- litellm/llms/bedrock.py | 15 ++++- litellm/main.py | 4 ++ litellm/tests/test_completion_cost.py | 70 ++++++++++++++++++++- litellm/utils.py | 87 +++++++++++++++++---------- 5 files changed, 150 insertions(+), 37 deletions(-) diff --git a/litellm/budget_manager.py b/litellm/budget_manager.py index 036474197..841015753 100644 --- a/litellm/budget_manager.py +++ b/litellm/budget_manager.py @@ -1,3 +1,12 @@ +# +-----------------------------------------------+ +# | | +# | NOT PROXY BUDGET MANAGER | +# | proxy budget manager is in proxy_server.py | +# | | +# +-----------------------------------------------+ +# +# Thank you users! We ❤️ you! - Krrish & Ishaan + import os, json, time import litellm from litellm.utils import ModelResponse @@ -16,7 +25,7 @@ class BudgetManager: self.client_type = client_type self.project_name = project_name self.api_base = api_base or "https://api.litellm.ai" - self.headers = headers or {'Content-Type': 'application/json'} + self.headers = headers or {"Content-Type": "application/json"} ## load the data or init the initial dictionaries self.load_data() diff --git a/litellm/llms/bedrock.py b/litellm/llms/bedrock.py index 4c36137da..bcf35c3d1 100644 --- a/litellm/llms/bedrock.py +++ b/litellm/llms/bedrock.py @@ -659,9 +659,16 @@ def completion( ) ## CALCULATING USAGE - baseten charges on time, not tokens - have some mapping of cost here. - prompt_tokens = len(encoding.encode(prompt)) - completion_tokens = len( - encoding.encode(model_response["choices"][0]["message"].get("content", "")) + prompt_tokens = response_metadata.get( + "x-amzn-bedrock-input-token-count", len(encoding.encode(prompt)) + ) + completion_tokens = response_metadata.get( + "x-amzn-bedrock-output-token-count", + len( + encoding.encode( + model_response["choices"][0]["message"].get("content", "") + ) + ), ) model_response["created"] = int(time.time()) @@ -672,6 +679,8 @@ def completion( total_tokens=prompt_tokens + completion_tokens, ) model_response.usage = usage + model_response._hidden_params["region_name"] = client.meta.region_name + print_verbose(f"model_response._hidden_params: {model_response._hidden_params}") return model_response except BedrockError as e: exception_mapping_worked = True diff --git a/litellm/main.py b/litellm/main.py index f9f1139f6..01edd3ea7 100644 --- a/litellm/main.py +++ b/litellm/main.py @@ -586,6 +586,10 @@ def completion( ) if model_response is not None and hasattr(model_response, "_hidden_params"): model_response._hidden_params["custom_llm_provider"] = custom_llm_provider + model_response._hidden_params["region_name"] = kwargs.get( + "aws_region_name", None + ) # support region-based pricing for bedrock + ### REGISTER CUSTOM MODEL PRICING -- IF GIVEN ### if input_cost_per_token is not None and output_cost_per_token is not None: litellm.register_model( diff --git a/litellm/tests/test_completion_cost.py b/litellm/tests/test_completion_cost.py index 505f28981..b117223ab 100644 --- a/litellm/tests/test_completion_cost.py +++ b/litellm/tests/test_completion_cost.py @@ -124,7 +124,7 @@ def test_cost_azure_gpt_35(): ) -test_cost_azure_gpt_35() +# test_cost_azure_gpt_35() def test_cost_azure_embedding(): @@ -165,3 +165,71 @@ def test_cost_openai_image_gen(): model="dall-e-2", size="1024-x-1024", 
quality="standard", n=1 ) assert cost == 0.019922944 + + +def test_cost_bedrock_pricing(): + """ + - get pricing specific to region for a model + """ + from litellm import ModelResponse, Choices, Message + from litellm.utils import Usage + + litellm.set_verbose = True + input_tokens = litellm.token_counter( + model="bedrock/anthropic.claude-instant-v1", + messages=[{"role": "user", "content": "Hey, how's it going?"}], + ) + print(f"input_tokens: {input_tokens}") + output_tokens = litellm.token_counter( + model="bedrock/anthropic.claude-instant-v1", + text="It's all going well", + count_response_tokens=True, + ) + print(f"output_tokens: {output_tokens}") + resp = ModelResponse( + id="chatcmpl-e41836bb-bb8b-4df2-8e70-8f3e160155ac", + choices=[ + Choices( + finish_reason=None, + index=0, + message=Message( + content="It's all going well", + role="assistant", + ), + ) + ], + created=1700775391, + model="anthropic.claude-instant-v1", + object="chat.completion", + system_fingerprint=None, + usage=Usage( + prompt_tokens=input_tokens, + completion_tokens=output_tokens, + total_tokens=input_tokens + output_tokens, + ), + ) + resp._hidden_params = { + "custom_llm_provider": "bedrock", + "region_name": "ap-northeast-1", + } + + cost = litellm.completion_cost( + model="anthropic.claude-instant-v1", + completion_response=resp, + messages=[{"role": "user", "content": "Hey, how's it going?"}], + ) + predicted_cost = input_tokens * 0.00000223 + 0.00000755 * output_tokens + assert cost == predicted_cost + + +def test_cost_bedrock_pricing_actual_calls(): + litellm.set_verbose = True + model = "anthropic.claude-instant-v1" + messages = [{"role": "user", "content": "Hey, how's it going?"}] + response = litellm.completion(model=model, messages=messages) + assert response._hidden_params["region_name"] is not None + cost = litellm.completion_cost( + completion_response=response, + messages=[{"role": "user", "content": "Hey, how's it going?"}], + ) + assert cost > 0 diff --git a/litellm/utils.py b/litellm/utils.py index b0e48bbc6..91b3a0f0a 100644 --- a/litellm/utils.py +++ b/litellm/utils.py @@ -714,6 +714,7 @@ class ImageResponse(OpenAIObject): ############################################################ def print_verbose(print_statement): try: + verbose_logger.debug(print_statement) if litellm.set_verbose: print(print_statement) # noqa except: @@ -2900,6 +2901,7 @@ def cost_per_token( completion_tokens=0, response_time_ms=None, custom_llm_provider=None, + region_name=None, ): """ Calculates the cost per token for a given model, prompt tokens, and completion tokens. 
@@ -2916,16 +2918,46 @@ def cost_per_token( prompt_tokens_cost_usd_dollar = 0 completion_tokens_cost_usd_dollar = 0 model_cost_ref = litellm.model_cost + model_with_provider = model if custom_llm_provider is not None: model_with_provider = custom_llm_provider + "/" + model - else: - model_with_provider = model + if region_name is not None: + model_with_provider_and_region = ( + f"{custom_llm_provider}/{region_name}/{model}" + ) + if ( + model_with_provider_and_region in model_cost_ref + ): # use region based pricing, if it's available + model_with_provider = model_with_provider_and_region # see this https://learn.microsoft.com/en-us/azure/ai-services/openai/concepts/models - verbose_logger.debug(f"Looking up model={model} in model_cost_map") - + print_verbose(f"Looking up model={model} in model_cost_map") + if model_with_provider in model_cost_ref: + print_verbose( + f"Success: model={model_with_provider} in model_cost_map - {model_cost_ref[model_with_provider]}" + ) + print_verbose( + f"applying cost={model_cost_ref[model_with_provider]['input_cost_per_token']} for prompt_tokens={prompt_tokens}" + ) + prompt_tokens_cost_usd_dollar = ( + model_cost_ref[model_with_provider]["input_cost_per_token"] * prompt_tokens + ) + print_verbose( + f"calculated prompt_tokens_cost_usd_dollar: {prompt_tokens_cost_usd_dollar}" + ) + print_verbose( + f"applying cost={model_cost_ref[model_with_provider]['output_cost_per_token']} for completion_tokens={completion_tokens}" + ) + completion_tokens_cost_usd_dollar = ( + model_cost_ref[model_with_provider]["output_cost_per_token"] + * completion_tokens + ) + print_verbose( + f"calculated completion_tokens_cost_usd_dollar: {completion_tokens_cost_usd_dollar}" + ) + return prompt_tokens_cost_usd_dollar, completion_tokens_cost_usd_dollar if model in model_cost_ref: - verbose_logger.debug(f"Success: model={model} in model_cost_map") - verbose_logger.debug( + print_verbose(f"Success: model={model} in model_cost_map") + print_verbose( f"prompt_tokens={prompt_tokens}; completion_tokens={completion_tokens}" ) if ( @@ -2943,7 +2975,7 @@ def cost_per_token( model_cost_ref[model].get("input_cost_per_second", None) is not None and response_time_ms is not None ): - verbose_logger.debug( + print_verbose( f"For model={model} - input_cost_per_second: {model_cost_ref[model].get('input_cost_per_second')}; response time: {response_time_ms}" ) ## COST PER SECOND ## @@ -2951,30 +2983,12 @@ def cost_per_token( model_cost_ref[model]["input_cost_per_second"] * response_time_ms / 1000 ) completion_tokens_cost_usd_dollar = 0.0 - verbose_logger.debug( + print_verbose( f"Returned custom cost for model={model} - prompt_tokens_cost_usd_dollar: {prompt_tokens_cost_usd_dollar}, completion_tokens_cost_usd_dollar: {completion_tokens_cost_usd_dollar}" ) return prompt_tokens_cost_usd_dollar, completion_tokens_cost_usd_dollar - elif model_with_provider in model_cost_ref: - verbose_logger.debug( - f"Looking up model={model_with_provider} in model_cost_map" - ) - verbose_logger.debug( - f"applying cost={model_cost_ref[model_with_provider]['input_cost_per_token']} for prompt_tokens={prompt_tokens}" - ) - prompt_tokens_cost_usd_dollar = ( - model_cost_ref[model_with_provider]["input_cost_per_token"] * prompt_tokens - ) - verbose_logger.debug( - f"applying cost={model_cost_ref[model_with_provider]['output_cost_per_token']} for completion_tokens={completion_tokens}" - ) - completion_tokens_cost_usd_dollar = ( - model_cost_ref[model_with_provider]["output_cost_per_token"] - * completion_tokens - ) - return 
prompt_tokens_cost_usd_dollar, completion_tokens_cost_usd_dollar elif "ft:gpt-3.5-turbo" in model: - verbose_logger.debug(f"Cost Tracking: {model} is an OpenAI FinteTuned LLM") + print_verbose(f"Cost Tracking: {model} is an OpenAI FinteTuned LLM") # fuzzy match ft:gpt-3.5-turbo:abcd-id-cool-litellm prompt_tokens_cost_usd_dollar = ( model_cost_ref["ft:gpt-3.5-turbo"]["input_cost_per_token"] * prompt_tokens @@ -3031,7 +3045,10 @@ def completion_cost( prompt="", messages: List = [], completion="", - total_time=0.0, # used for replicate + total_time=0.0, # used for replicate, sagemaker + ### REGION ### + custom_llm_provider=None, + region_name=None, # used for bedrock pricing ### IMAGE GEN ### size=None, quality=None, @@ -3080,12 +3097,13 @@ def completion_cost( model = ( model or completion_response["model"] ) # check if user passed an override for model, if it's none check completion_response['model'] - if completion_response is not None and hasattr( - completion_response, "_hidden_params" - ): + if hasattr(completion_response, "_hidden_params"): custom_llm_provider = completion_response._hidden_params.get( "custom_llm_provider", "" ) + region_name = completion_response._hidden_params.get( + "region_name", region_name + ) else: if len(messages) > 0: prompt_tokens = token_counter(model=model, messages=messages) @@ -3146,8 +3164,13 @@ def completion_cost( completion_tokens=completion_tokens, custom_llm_provider=custom_llm_provider, response_time_ms=total_time, + region_name=region_name, ) - return prompt_tokens_cost_usd_dollar + completion_tokens_cost_usd_dollar + _final_cost = prompt_tokens_cost_usd_dollar + completion_tokens_cost_usd_dollar + print_verbose( + f"final cost: {_final_cost}; prompt_tokens_cost_usd_dollar: {prompt_tokens_cost_usd_dollar}; completion_tokens_cost_usd_dollar: {completion_tokens_cost_usd_dollar}" + ) + return _final_cost except Exception as e: raise e
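For reference, the lookup order this patch introduces in cost_per_token can be sketched as follows. This is an illustrative sketch only, not part of the patch: the model_cost entries below are stand-ins (the ap-northeast-1 prices mirror the values asserted in test_cost_bedrock_pricing; the base-model prices are placeholders), and real pricing lives in litellm's model_cost map.

# Illustrative sketch of the region-aware pricing lookup (not part of the patch).
model_cost = {
    "anthropic.claude-instant-v1": {
        "input_cost_per_token": 1.63e-06,   # placeholder base price
        "output_cost_per_token": 5.51e-06,  # placeholder base price
    },
    "bedrock/ap-northeast-1/anthropic.claude-instant-v1": {
        "input_cost_per_token": 2.23e-06,   # matches test_cost_bedrock_pricing
        "output_cost_per_token": 7.55e-06,  # matches test_cost_bedrock_pricing
    },
}


def resolve_pricing_key(model, custom_llm_provider=None, region_name=None):
    """Prefer provider/region/model, then provider/model, then the bare model name."""
    model_with_provider = model
    if custom_llm_provider is not None:
        model_with_provider = f"{custom_llm_provider}/{model}"
        if region_name is not None:
            candidate = f"{custom_llm_provider}/{region_name}/{model}"
            if candidate in model_cost:  # use region-based pricing, if it's available
                model_with_provider = candidate
    return model_with_provider if model_with_provider in model_cost else model


print(resolve_pricing_key("anthropic.claude-instant-v1", "bedrock", "ap-northeast-1"))
# -> "bedrock/ap-northeast-1/anthropic.claude-instant-v1" (region-specific pricing)
print(resolve_pricing_key("anthropic.claude-instant-v1", "bedrock", "us-east-1"))
# -> "anthropic.claude-instant-v1" (no region entry and no "bedrock/..." entry in this
#    stand-in map, so pricing falls back to the bare model name)

This precedence is what lets completion_cost pick up region_name from a response's _hidden_params (set in bedrock.py and main.py above) and charge region-specific rates without callers passing the region explicitly.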