forked from phoenix/litellm-mirror

feat(utils.py): support region based pricing for bedrock + use bedrock's token counts if given

parent 511510a1cc
commit f5da95685a

5 changed files with 150 additions and 37 deletions
@@ -1,3 +1,12 @@
+# +-----------------------------------------------+
+# |                                               |
+# |           NOT PROXY BUDGET MANAGER            |
+# |  proxy budget manager is in proxy_server.py   |
+# |                                               |
+# +-----------------------------------------------+
+#
+#  Thank you users! We ❤️ you! - Krrish & Ishaan
+
 import os, json, time
 import litellm
 from litellm.utils import ModelResponse

@@ -16,7 +25,7 @@ class BudgetManager:
         self.client_type = client_type
         self.project_name = project_name
         self.api_base = api_base or "https://api.litellm.ai"
-        self.headers = headers or {'Content-Type': 'application/json'}
+        self.headers = headers or {"Content-Type": "application/json"}
         ## load the data or init the initial dictionaries
         self.load_data()
@@ -659,9 +659,16 @@ def completion(
         )

         ## CALCULATING USAGE - baseten charges on time, not tokens - have some mapping of cost here.
-        prompt_tokens = len(encoding.encode(prompt))
-        completion_tokens = len(
-            encoding.encode(model_response["choices"][0]["message"].get("content", ""))
-        )
+        prompt_tokens = response_metadata.get(
+            "x-amzn-bedrock-input-token-count", len(encoding.encode(prompt))
+        )
+        completion_tokens = response_metadata.get(
+            "x-amzn-bedrock-output-token-count",
+            len(
+                encoding.encode(
+                    model_response["choices"][0]["message"].get("content", "")
+                )
+            ),
+        )

         model_response["created"] = int(time.time())

@@ -672,6 +679,8 @@ def completion(
             total_tokens=prompt_tokens + completion_tokens,
         )
         model_response.usage = usage
+        model_response._hidden_params["region_name"] = client.meta.region_name
+        print_verbose(f"model_response._hidden_params: {model_response._hidden_params}")
         return model_response
     except BedrockError as e:
         exception_mapping_worked = True
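The two `x-amzn-bedrock-*` keys above are the token counts Bedrock itself reports; the `encoding.encode(...)` expressions only run as fallbacks when a key is absent. For context, `response_metadata` here presumably carries the HTTP response headers of the boto3 call. A minimal standalone sketch of reading those counts directly (the client setup and request body shape are illustrative assumptions, not part of this commit):

import json
import boto3

# Illustrative sketch: Bedrock reports token usage in response headers,
# which boto3 exposes under ResponseMetadata -> HTTPHeaders.
client = boto3.client("bedrock-runtime", region_name="us-east-1")
raw = client.invoke_model(
    modelId="anthropic.claude-instant-v1",
    body=json.dumps(
        {"prompt": "\n\nHuman: Hey, how's it going?\n\nAssistant:", "max_tokens_to_sample": 256}
    ),
)
headers = raw["ResponseMetadata"]["HTTPHeaders"]
# Prefer the server-reported counts; fall back to 0 here (the diff falls
# back to a local tokenizer estimate instead) when a header is missing.
prompt_tokens = int(headers.get("x-amzn-bedrock-input-token-count", 0))
completion_tokens = int(headers.get("x-amzn-bedrock-output-token-count", 0))
print(prompt_tokens, completion_tokens)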
@@ -586,6 +586,10 @@ def completion(
         )
     if model_response is not None and hasattr(model_response, "_hidden_params"):
         model_response._hidden_params["custom_llm_provider"] = custom_llm_provider
+        model_response._hidden_params["region_name"] = kwargs.get(
+            "aws_region_name", None
+        )  # support region-based pricing for bedrock
+
     ### REGISTER CUSTOM MODEL PRICING -- IF GIVEN ###
     if input_cost_per_token is not None and output_cost_per_token is not None:
         litellm.register_model(
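Because `aws_region_name` is read from `kwargs`, callers can pin the pricing region per request. A hedged usage sketch (the model name mirrors the new tests below):

import litellm

# aws_region_name flows through kwargs into _hidden_params["region_name"],
# which completion_cost later uses to pick a region-specific price.
response = litellm.completion(
    model="bedrock/anthropic.claude-instant-v1",
    messages=[{"role": "user", "content": "Hey, how's it going?"}],
    aws_region_name="ap-northeast-1",
)
print(response._hidden_params.get("region_name"))  # -> "ap-northeast-1"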
@@ -124,7 +124,7 @@ def test_cost_azure_gpt_35():
     )


-test_cost_azure_gpt_35()
+# test_cost_azure_gpt_35()


 def test_cost_azure_embedding():
@@ -165,3 +165,71 @@ def test_cost_openai_image_gen():
         model="dall-e-2", size="1024-x-1024", quality="standard", n=1
     )
     assert cost == 0.019922944
+
+
+def test_cost_bedrock_pricing():
+    """
+    - get pricing specific to region for a model
+    """
+    from litellm import ModelResponse, Choices, Message
+    from litellm.utils import Usage
+
+    litellm.set_verbose = True
+    input_tokens = litellm.token_counter(
+        model="bedrock/anthropic.claude-instant-v1",
+        messages=[{"role": "user", "content": "Hey, how's it going?"}],
+    )
+    print(f"input_tokens: {input_tokens}")
+    output_tokens = litellm.token_counter(
+        model="bedrock/anthropic.claude-instant-v1",
+        text="It's all going well",
+        count_response_tokens=True,
+    )
+    print(f"output_tokens: {output_tokens}")
+    resp = ModelResponse(
+        id="chatcmpl-e41836bb-bb8b-4df2-8e70-8f3e160155ac",
+        choices=[
+            Choices(
+                finish_reason=None,
+                index=0,
+                message=Message(
+                    content="It's all going well",
+                    role="assistant",
+                ),
+            )
+        ],
+        created=1700775391,
+        model="anthropic.claude-instant-v1",
+        object="chat.completion",
+        system_fingerprint=None,
+        usage=Usage(
+            prompt_tokens=input_tokens,
+            completion_tokens=output_tokens,
+            total_tokens=input_tokens + output_tokens,
+        ),
+    )
+    resp._hidden_params = {
+        "custom_llm_provider": "bedrock",
+        "region_name": "ap-northeast-1",
+    }
+
+    cost = litellm.completion_cost(
+        model="anthropic.claude-instant-v1",
+        completion_response=resp,
+        messages=[{"role": "user", "content": "Hey, how's it going?"}],
+    )
+    predicted_cost = input_tokens * 0.00000223 + 0.00000755 * output_tokens
+    assert cost == predicted_cost
+
+
+def test_cost_bedrock_pricing_actual_calls():
+    litellm.set_verbose = True
+    model = "anthropic.claude-instant-v1"
+    messages = [{"role": "user", "content": "Hey, how's it going?"}]
+    response = litellm.completion(model=model, messages=messages)
+    assert response._hidden_params["region_name"] is not None
+    cost = litellm.completion_cost(
+        completion_response=response,
+        messages=[{"role": "user", "content": "Hey, how's it going?"}],
+    )
+    assert cost > 0
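For the first test to pass, the cost map needs a region-scoped entry keyed `{custom_llm_provider}/{region_name}/{model}`. A sketch of what such a slice might look like (the key format comes from the `cost_per_token` change below; the ap-northeast-1 prices mirror the ones asserted above, while the fallback entry's prices are purely illustrative):

# Hypothetical slice of litellm.model_cost with a region-scoped entry.
model_cost = {
    "bedrock/ap-northeast-1/anthropic.claude-instant-v1": {
        "input_cost_per_token": 0.00000223,
        "output_cost_per_token": 0.00000755,
    },
    # Non-region default the lookup falls back to when no region key matches.
    "anthropic.claude-instant-v1": {
        "input_cost_per_token": 0.00000163,
        "output_cost_per_token": 0.00000551,
    },
}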
@@ -714,6 +714,7 @@ class ImageResponse(OpenAIObject):
 ############################################################
 def print_verbose(print_statement):
     try:
+        verbose_logger.debug(print_statement)
         if litellm.set_verbose:
             print(print_statement)  # noqa
     except:
@@ -2900,6 +2901,7 @@ def cost_per_token(
     completion_tokens=0,
     response_time_ms=None,
     custom_llm_provider=None,
+    region_name=None,
 ):
     """
     Calculates the cost per token for a given model, prompt tokens, and completion tokens.
@@ -2916,16 +2918,46 @@ def cost_per_token(
     prompt_tokens_cost_usd_dollar = 0
     completion_tokens_cost_usd_dollar = 0
     model_cost_ref = litellm.model_cost
+    model_with_provider = model
     if custom_llm_provider is not None:
         model_with_provider = custom_llm_provider + "/" + model
-    else:
-        model_with_provider = model
+        if region_name is not None:
+            model_with_provider_and_region = (
+                f"{custom_llm_provider}/{region_name}/{model}"
+            )
+            if (
+                model_with_provider_and_region in model_cost_ref
+            ):  # use region based pricing, if it's available
+                model_with_provider = model_with_provider_and_region
     # see this https://learn.microsoft.com/en-us/azure/ai-services/openai/concepts/models
-    verbose_logger.debug(f"Looking up model={model} in model_cost_map")
+    print_verbose(f"Looking up model={model} in model_cost_map")
+    if model_with_provider in model_cost_ref:
+        print_verbose(
+            f"Success: model={model_with_provider} in model_cost_map - {model_cost_ref[model_with_provider]}"
+        )
+        print_verbose(
+            f"applying cost={model_cost_ref[model_with_provider]['input_cost_per_token']} for prompt_tokens={prompt_tokens}"
+        )
+        prompt_tokens_cost_usd_dollar = (
+            model_cost_ref[model_with_provider]["input_cost_per_token"] * prompt_tokens
+        )
+        print_verbose(
+            f"calculated prompt_tokens_cost_usd_dollar: {prompt_tokens_cost_usd_dollar}"
+        )
+        print_verbose(
+            f"applying cost={model_cost_ref[model_with_provider]['output_cost_per_token']} for completion_tokens={completion_tokens}"
+        )
+        completion_tokens_cost_usd_dollar = (
+            model_cost_ref[model_with_provider]["output_cost_per_token"]
+            * completion_tokens
+        )
+        print_verbose(
+            f"calculated completion_tokens_cost_usd_dollar: {completion_tokens_cost_usd_dollar}"
+        )
+        return prompt_tokens_cost_usd_dollar, completion_tokens_cost_usd_dollar
     if model in model_cost_ref:
-        verbose_logger.debug(f"Success: model={model} in model_cost_map")
-        verbose_logger.debug(
+        print_verbose(f"Success: model={model} in model_cost_map")
+        print_verbose(
             f"prompt_tokens={prompt_tokens}; completion_tokens={completion_tokens}"
         )
         if (
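The net effect of this hunk is a three-step key resolution: `{provider}/{region}/{model}` when a region is known and priced, else `{provider}/{model}`, else the bare `model`. A minimal sketch of that cascade (the function name is mine, not from the commit):

# Sketch of the lookup order the hunk implements; returns the first key
# present in the cost map, with region-scoped keys winning over generic ones.
def resolve_pricing_key(model_cost_ref, model, custom_llm_provider=None, region_name=None):
    candidates = []
    if custom_llm_provider is not None:
        if region_name is not None:
            candidates.append(f"{custom_llm_provider}/{region_name}/{model}")
        candidates.append(f"{custom_llm_provider}/{model}")
    candidates.append(model)
    for key in candidates:
        if key in model_cost_ref:
            return key
    return None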
@@ -2943,7 +2975,7 @@ def cost_per_token(
         model_cost_ref[model].get("input_cost_per_second", None) is not None
         and response_time_ms is not None
     ):
-        verbose_logger.debug(
+        print_verbose(
             f"For model={model} - input_cost_per_second: {model_cost_ref[model].get('input_cost_per_second')}; response time: {response_time_ms}"
         )
         ## COST PER SECOND ##
@@ -2951,30 +2983,12 @@ def cost_per_token(
             model_cost_ref[model]["input_cost_per_second"] * response_time_ms / 1000
         )
         completion_tokens_cost_usd_dollar = 0.0
-        verbose_logger.debug(
+        print_verbose(
             f"Returned custom cost for model={model} - prompt_tokens_cost_usd_dollar: {prompt_tokens_cost_usd_dollar}, completion_tokens_cost_usd_dollar: {completion_tokens_cost_usd_dollar}"
         )
         return prompt_tokens_cost_usd_dollar, completion_tokens_cost_usd_dollar
-    elif model_with_provider in model_cost_ref:
-        verbose_logger.debug(
-            f"Looking up model={model_with_provider} in model_cost_map"
-        )
-        verbose_logger.debug(
-            f"applying cost={model_cost_ref[model_with_provider]['input_cost_per_token']} for prompt_tokens={prompt_tokens}"
-        )
-        prompt_tokens_cost_usd_dollar = (
-            model_cost_ref[model_with_provider]["input_cost_per_token"] * prompt_tokens
-        )
-        verbose_logger.debug(
-            f"applying cost={model_cost_ref[model_with_provider]['output_cost_per_token']} for completion_tokens={completion_tokens}"
-        )
-        completion_tokens_cost_usd_dollar = (
-            model_cost_ref[model_with_provider]["output_cost_per_token"]
-            * completion_tokens
-        )
-        return prompt_tokens_cost_usd_dollar, completion_tokens_cost_usd_dollar
     elif "ft:gpt-3.5-turbo" in model:
-        verbose_logger.debug(f"Cost Tracking: {model} is an OpenAI FinteTuned LLM")
+        print_verbose(f"Cost Tracking: {model} is an OpenAI FinteTuned LLM")
         # fuzzy match ft:gpt-3.5-turbo:abcd-id-cool-litellm
         prompt_tokens_cost_usd_dollar = (
             model_cost_ref["ft:gpt-3.5-turbo"]["input_cost_per_token"] * prompt_tokens
@@ -3031,7 +3045,10 @@ def completion_cost(
     prompt="",
     messages: List = [],
     completion="",
-    total_time=0.0,  # used for replicate
+    total_time=0.0,  # used for replicate, sagemaker
+    ### REGION ###
+    custom_llm_provider=None,
+    region_name=None,  # used for bedrock pricing
     ### IMAGE GEN ###
     size=None,
     quality=None,
|
||||||
model = (
|
model = (
|
||||||
model or completion_response["model"]
|
model or completion_response["model"]
|
||||||
) # check if user passed an override for model, if it's none check completion_response['model']
|
) # check if user passed an override for model, if it's none check completion_response['model']
|
||||||
if completion_response is not None and hasattr(
|
if hasattr(completion_response, "_hidden_params"):
|
||||||
completion_response, "_hidden_params"
|
|
||||||
):
|
|
||||||
custom_llm_provider = completion_response._hidden_params.get(
|
custom_llm_provider = completion_response._hidden_params.get(
|
||||||
"custom_llm_provider", ""
|
"custom_llm_provider", ""
|
||||||
)
|
)
|
||||||
|
region_name = completion_response._hidden_params.get(
|
||||||
|
"region_name", region_name
|
||||||
|
)
|
||||||
else:
|
else:
|
||||||
if len(messages) > 0:
|
if len(messages) > 0:
|
||||||
prompt_tokens = token_counter(model=model, messages=messages)
|
prompt_tokens = token_counter(model=model, messages=messages)
|
||||||
|
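Note the precedence in `.get("region_name", region_name)`: a region stored on the response wins, and the caller-supplied `region_name` argument only serves as the fallback. In isolation (values illustrative):

# Region recorded on the response takes precedence over the explicit argument.
_hidden_params = {"custom_llm_provider": "bedrock", "region_name": "ap-northeast-1"}
region_name = "us-east-1"  # caller-supplied value
region_name = _hidden_params.get("region_name", region_name)
print(region_name)  # -> "ap-northeast-1"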
@@ -3146,8 +3164,13 @@ def completion_cost(
             completion_tokens=completion_tokens,
             custom_llm_provider=custom_llm_provider,
             response_time_ms=total_time,
+            region_name=region_name,
         )
-        return prompt_tokens_cost_usd_dollar + completion_tokens_cost_usd_dollar
+        _final_cost = prompt_tokens_cost_usd_dollar + completion_tokens_cost_usd_dollar
+        print_verbose(
+            f"final cost: {_final_cost}; prompt_tokens_cost_usd_dollar: {prompt_tokens_cost_usd_dollar}; completion_tokens_cost_usd_dollar: {completion_tokens_cost_usd_dollar}"
+        )
+        return _final_cost
     except Exception as e:
         raise e