forked from phoenix/litellm-mirror
(feat) completion_cost - embeddings + raise Exception
This commit is contained in: parent db50a07318, commit f681f0f2b2
3 changed files with 43 additions and 18 deletions
@@ -338,7 +338,8 @@ baseten_models: List = [
 ]  # FALCON 7B # WizardLM # Mosaic ML


-# used for token counting
+# used for Cost Tracking & Token counting
+# https://azure.microsoft.com/en-in/pricing/details/cognitive-services/openai-service/
 # Azure returns gpt-35-turbo in their responses, we need to map this to azure/gpt-3.5-turbo for token counting
 azure_llms = {
     "gpt-35-turbo": "azure/gpt-35-turbo",
@@ -346,6 +347,10 @@ azure_llms = {
     "gpt-35-turbo-instruct": "azure/gpt-35-turbo-instruct",
 }

+azure_embedding_models = {
+    "ada": "azure/ada",
+}
+
 petals_models = [
     "petals-team/StableBeluga2",
 ]
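For context, the alias maps above are consulted before a price lookup in litellm.model_cost, because Azure responses carry bare deployment names ("ada", "gpt-35-turbo"). A minimal sketch of that normalization, not part of this commit and assuming litellm.model_cost contains an "azure/ada" entry:

```python
import litellm

# Azure returns bare model names; the alias maps translate them to the keys
# used in litellm.model_cost before looking up per-token prices.
model = "ada"
if model in litellm.azure_embedding_models:
    model = litellm.azure_embedding_models[model]  # -> "azure/ada"

pricing = litellm.model_cost[model]  # assumes an "azure/ada" entry exists in the price map
print(pricing["input_cost_per_token"], pricing["output_cost_per_token"])
```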
@@ -59,6 +59,7 @@ def test_openai_embedding():


 def test_openai_azure_embedding_simple():
     try:
+        litellm.set_verbose = True
         response = embedding(
             model="azure/azure-embedding-model",
             input=["good morning from litellm"],
@@ -70,11 +71,15 @@ def test_openai_azure_embedding_simple():
             response_keys
         )  # assert litellm response has expected keys from OpenAI embedding response

+        request_cost = litellm.completion_cost(completion_response=response)
+
+        print("Calculated request cost=", request_cost)
+
     except Exception as e:
         pytest.fail(f"Error occurred: {e}")


-# test_openai_azure_embedding_simple()
+test_openai_azure_embedding_simple()


 def test_openai_azure_embedding_timeouts():
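The new assertions exercise completion_cost on an embedding response, where only prompt tokens are billed. A rough back-of-the-envelope check of the printed value, using hypothetical numbers (the real token count comes from the response usage and the real price from model_prices_and_context_window.json):

```python
# Hypothetical numbers, for illustration only.
prompt_tokens = 6                     # e.g. usage["prompt_tokens"] from the embedding response
input_cost_per_token = 0.0001 / 1000  # assuming $0.0001 per 1K input tokens for ada

request_cost = prompt_tokens * input_cost_per_token  # embeddings bill no completion tokens
print("Calculated request cost=", request_cost)      # ~6e-07
```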
@@ -2740,6 +2740,8 @@ def cost_per_token(model="", prompt_tokens=0, completion_tokens=0):
     completion_tokens_cost_usd_dollar = 0
     model_cost_ref = litellm.model_cost
     # see this https://learn.microsoft.com/en-us/azure/ai-services/openai/concepts/models
+    print_verbose(f"Looking up model={model} in model_cost_map")
+
     if model in model_cost_ref:
         prompt_tokens_cost_usd_dollar = (
             model_cost_ref[model]["input_cost_per_token"] * prompt_tokens
@@ -2749,6 +2751,7 @@ def cost_per_token(model="", prompt_tokens=0, completion_tokens=0):
         )
         return prompt_tokens_cost_usd_dollar, completion_tokens_cost_usd_dollar
     elif "ft:gpt-3.5-turbo" in model:
+        print_verbose(f"Cost Tracking: {model} is an OpenAI FinteTuned LLM")
         # fuzzy match ft:gpt-3.5-turbo:abcd-id-cool-litellm
         prompt_tokens_cost_usd_dollar = (
             model_cost_ref["ft:gpt-3.5-turbo"]["input_cost_per_token"] * prompt_tokens
@@ -2759,6 +2762,7 @@ def cost_per_token(model="", prompt_tokens=0, completion_tokens=0):
         )
         return prompt_tokens_cost_usd_dollar, completion_tokens_cost_usd_dollar
     elif model in litellm.azure_llms:
+        print_verbose(f"Cost Tracking: {model} is an Azure LLM")
         model = litellm.azure_llms[model]
         prompt_tokens_cost_usd_dollar = (
             model_cost_ref[model]["input_cost_per_token"] * prompt_tokens
@@ -2767,19 +2771,29 @@ def cost_per_token(model="", prompt_tokens=0, completion_tokens=0):
             model_cost_ref[model]["output_cost_per_token"] * completion_tokens
         )
         return prompt_tokens_cost_usd_dollar, completion_tokens_cost_usd_dollar
-    else:
-        # calculate average input cost, azure/gpt-deployments can potentially go here if users don't specify, gpt-4, gpt-3.5-turbo. LLMs litellm knows
-        input_cost_sum = 0
-        output_cost_sum = 0
-        model_cost_ref = litellm.model_cost
-        for model in model_cost_ref:
-            input_cost_sum += model_cost_ref[model]["input_cost_per_token"]
-            output_cost_sum += model_cost_ref[model]["output_cost_per_token"]
-        avg_input_cost = input_cost_sum / len(model_cost_ref.keys())
-        avg_output_cost = output_cost_sum / len(model_cost_ref.keys())
-        prompt_tokens_cost_usd_dollar = avg_input_cost * prompt_tokens
-        completion_tokens_cost_usd_dollar = avg_output_cost * completion_tokens
-        return prompt_tokens_cost_usd_dollar, completion_tokens_cost_usd_dollar
+    elif model in litellm.azure_embedding_models:
+        print_verbose(f"Cost Tracking: {model} is an Azure Embedding Model")
+        model = litellm.azure_embedding_models[model]
+        prompt_tokens_cost_usd_dollar = (
+            model_cost_ref[model]["input_cost_per_token"] * prompt_tokens
+        )
+        completion_tokens_cost_usd_dollar = (
+            model_cost_ref[model]["output_cost_per_token"] * completion_tokens
+        )
+        return prompt_tokens_cost_usd_dollar, completion_tokens_cost_usd_dollar
+    else:
+        # if model is not in model_prices_and_context_window.json. Raise an exception-let users know
+        error_str = f"Model not in model_prices_and_context_window.json. You passed model={model}\n"
+        raise litellm.exceptions.NotFoundError(  # type: ignore
+            message=error_str,
+            model=model,
+            response=httpx.Response(
+                status_code=404,
+                content=error_str,
+                request=httpx.request(method="cost_per_token", url="https://github.com/BerriAI/litellm"),  # type: ignore
+            ),
+            llm_provider="",
+        )


 def completion_cost(
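With the new elif branch, a bare Azure embedding deployment name resolves through azure_embedding_models instead of the old fleet-wide price average, and an unrecognized model now raises instead of being averaged. A small sketch of the lookup, not part of this commit and assuming cost_per_token is re-exported at the package level (it is defined in the utils module shown here):

```python
import litellm

# "ada" -> "azure/ada" -> per-token prices from litellm.model_cost
prompt_usd, completion_usd = litellm.cost_per_token(
    model="ada", prompt_tokens=1000, completion_tokens=0
)
print(prompt_usd, completion_usd)
```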
@@ -2821,8 +2835,10 @@ def completion_cost(
         completion_tokens = 0
         if completion_response is not None:
             # get input/output tokens from completion_response
-            prompt_tokens = completion_response["usage"]["prompt_tokens"]
-            completion_tokens = completion_response["usage"]["completion_tokens"]
+            prompt_tokens = completion_response.get("usage", {}).get("prompt_tokens", 0)
+            completion_tokens = completion_response.get("usage", {}).get(
+                "completion_tokens", 0
+            )
             model = (
                 model or completion_response["model"]
             )  # check if user passed an override for model, if it's none check completion_response['model']
@@ -2852,8 +2868,7 @@ def completion_cost(
         )
         return prompt_tokens_cost_usd_dollar + completion_tokens_cost_usd_dollar
     except Exception as e:
-        print_verbose(f"LiteLLM: Excepton when cost calculating {str(e)}")
-        return 0.0  # this should not block a users execution path
+        raise e


 ####### HELPER FUNCTIONS ################
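Note the behavioral change in the last hunk: completion_cost previously swallowed errors and returned 0.0, but now re-raises, so a cost lookup for an unknown model surfaces to the caller as litellm.exceptions.NotFoundError. A hedged sketch of how downstream code could keep the old "never block execution" behavior; the payload below is hypothetical, and this assumes completion_response can be any mapping with usage and model keys, as the new .get()-based access implies:

```python
import litellm

# Hypothetical payload for a model with no entry in
# model_prices_and_context_window.json.
fake_response = {
    "model": "my-custom-embedding-model",
    "usage": {"prompt_tokens": 12, "completion_tokens": 0},
}

try:
    cost = litellm.completion_cost(completion_response=fake_response)
except litellm.exceptions.NotFoundError as e:
    cost = 0.0  # explicitly restore the old non-blocking fallback
    print(f"Cost tracking skipped: {e}")
```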