Merge pull request #3354 from BerriAI/litellm_replicate_cost_tracking

fix(utils.py): replicate now also has token based pricing for some models

commit ec2510029a
4 changed files with 294 additions and 4 deletions

@@ -1418,6 +1418,123 @@
         "litellm_provider": "replicate",
         "mode": "chat"
     },
+    "replicate/meta/llama-2-13b": {
+        "max_tokens": 4096,
+        "max_input_tokens": 4096,
+        "max_output_tokens": 4096,
+        "input_cost_per_token": 0.0000001,
+        "output_cost_per_token": 0.0000005,
+        "litellm_provider": "replicate",
+        "mode": "chat"
+    },
+    "replicate/meta/llama-2-13b-chat": {
+        "max_tokens": 4096,
+        "max_input_tokens": 4096,
+        "max_output_tokens": 4096,
+        "input_cost_per_token": 0.0000001,
+        "output_cost_per_token": 0.0000005,
+        "litellm_provider": "replicate",
+        "mode": "chat"
+    },
+    "replicate/meta/llama-2-70b": {
+        "max_tokens": 4096,
+        "max_input_tokens": 4096,
+        "max_output_tokens": 4096,
+        "input_cost_per_token": 0.00000065,
+        "output_cost_per_token": 0.00000275,
+        "litellm_provider": "replicate",
+        "mode": "chat"
+    },
+    "replicate/meta/llama-2-70b-chat": {
+        "max_tokens": 4096,
+        "max_input_tokens": 4096,
+        "max_output_tokens": 4096,
+        "input_cost_per_token": 0.00000065,
+        "output_cost_per_token": 0.00000275,
+        "litellm_provider": "replicate",
+        "mode": "chat"
+    },
+    "replicate/meta/llama-2-7b": {
+        "max_tokens": 4096,
+        "max_input_tokens": 4096,
+        "max_output_tokens": 4096,
+        "input_cost_per_token": 0.00000005,
+        "output_cost_per_token": 0.00000025,
+        "litellm_provider": "replicate",
+        "mode": "chat"
+    },
+    "replicate/meta/llama-2-7b-chat": {
+        "max_tokens": 4096,
+        "max_input_tokens": 4096,
+        "max_output_tokens": 4096,
+        "input_cost_per_token": 0.00000005,
+        "output_cost_per_token": 0.00000025,
+        "litellm_provider": "replicate",
+        "mode": "chat"
+    },
+    "replicate/meta/llama-3-70b": {
+        "max_tokens": 4096,
+        "max_input_tokens": 4096,
+        "max_output_tokens": 4096,
+        "input_cost_per_token": 0.00000065,
+        "output_cost_per_token": 0.00000275,
+        "litellm_provider": "replicate",
+        "mode": "chat"
+    },
+    "replicate/meta/llama-3-70b-instruct": {
+        "max_tokens": 4096,
+        "max_input_tokens": 4096,
+        "max_output_tokens": 4096,
+        "input_cost_per_token": 0.00000065,
+        "output_cost_per_token": 0.00000275,
+        "litellm_provider": "replicate",
+        "mode": "chat"
+    },
+    "replicate/meta/llama-3-8b": {
+        "max_tokens": 4096,
+        "max_input_tokens": 4096,
+        "max_output_tokens": 4096,
+        "input_cost_per_token": 0.00000005,
+        "output_cost_per_token": 0.00000025,
+        "litellm_provider": "replicate",
+        "mode": "chat"
+    },
+    "replicate/meta/llama-3-8b-instruct": {
+        "max_tokens": 4096,
+        "max_input_tokens": 4096,
+        "max_output_tokens": 4096,
+        "input_cost_per_token": 0.00000005,
+        "output_cost_per_token": 0.00000025,
+        "litellm_provider": "replicate",
+        "mode": "chat"
+    },
+    "replicate/mistralai/mistral-7b-v0.1": {
+        "max_tokens": 4096,
+        "max_input_tokens": 4096,
+        "max_output_tokens": 4096,
+        "input_cost_per_token": 0.00000005,
+        "output_cost_per_token": 0.00000025,
+        "litellm_provider": "replicate",
+        "mode": "chat"
+    },
+    "replicate/mistralai/mistral-7b-instruct-v0.2": {
+        "max_tokens": 4096,
+        "max_input_tokens": 4096,
+        "max_output_tokens": 4096,
+        "input_cost_per_token": 0.00000005,
+        "output_cost_per_token": 0.00000025,
+        "litellm_provider": "replicate",
+        "mode": "chat"
+    },
+    "replicate/mistralai/mixtral-8x7b-instruct-v0.1": {
+        "max_tokens": 4096,
+        "max_input_tokens": 4096,
+        "max_output_tokens": 4096,
+        "input_cost_per_token": 0.0000003,
+        "output_cost_per_token": 0.000001,
+        "litellm_provider": "replicate",
+        "mode": "chat"
+    },
     "openrouter/openai/gpt-3.5-turbo": {
         "max_tokens": 4095,
         "input_cost_per_token": 0.0000015,
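
As a quick worked example (not part of the diff): with the per-token entries above, a call to one of the mapped Replicate models is priced from its usage counts alone. The model choice and token counts below are illustrative.

# Illustrative only: prices copied from the "replicate/meta/llama-2-70b-chat"
# entry above; the token counts are made up for the example.
input_cost_per_token = 0.00000065
output_cost_per_token = 0.00000275

prompt_tokens = 1000
completion_tokens = 500

cost = prompt_tokens * input_cost_per_token + completion_tokens * output_cost_per_token
print(f"${cost:.6f}")  # $0.002025
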
@@ -328,3 +328,56 @@ def test_dalle_3_azure_cost_tracking():
         completion_response=response, call_type="image_generation"
     )
     assert cost > 0
+
+
+def test_replicate_llama3_cost_tracking():
+    litellm.set_verbose = True
+    model = "replicate/meta/meta-llama-3-8b-instruct"
+    litellm.register_model(
+        {
+            "replicate/meta/meta-llama-3-8b-instruct": {
+                "input_cost_per_token": 0.00000005,
+                "output_cost_per_token": 0.00000025,
+                "litellm_provider": "replicate",
+            }
+        }
+    )
+    response = litellm.ModelResponse(
+        id="chatcmpl-cad7282f-7f68-41e7-a5ab-9eb33ae301dc",
+        choices=[
+            litellm.utils.Choices(
+                finish_reason="stop",
+                index=0,
+                message=litellm.utils.Message(
+                    content="I'm doing well, thanks for asking! I'm here to help you with any questions or tasks you may have. How can I assist you today?",
+                    role="assistant",
+                ),
+            )
+        ],
+        created=1714401369,
+        model="replicate/meta/meta-llama-3-8b-instruct",
+        object="chat.completion",
+        system_fingerprint=None,
+        usage=litellm.utils.Usage(
+            prompt_tokens=48, completion_tokens=31, total_tokens=79
+        ),
+    )
+    cost = litellm.completion_cost(
+        completion_response=response,
+        messages=[{"role": "user", "content": "Hey, how's it going?"}],
+    )
+
+    print(f"cost: {cost}")
+    cost = round(cost, 5)
+    expected_cost = round(
+        litellm.model_cost["replicate/meta/meta-llama-3-8b-instruct"][
+            "input_cost_per_token"
+        ]
+        * 48
+        + litellm.model_cost["replicate/meta/meta-llama-3-8b-instruct"][
+            "output_cost_per_token"
+        ]
+        * 31,
+        5,
+    )
+    assert cost == expected_cost
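
A hand check of the assertion (not part of the test file): with the registered prices and the usage in the mocked response, the expected value works out as follows.

# Hand check of the test's expected_cost: 48 prompt tokens and
# 31 completion tokens at the per-token prices registered above.
expected = round(48 * 0.00000005 + 31 * 0.00000025, 5)
print(expected)  # 1e-05, i.e. $0.00001 after rounding to 5 decimal places
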
@@ -3641,12 +3641,12 @@ def get_replicate_completion_pricing(completion_response=None, total_time=0.0):
     a100_80gb_price_per_second_public = (
         0.001400  # assume all calls sent to A100 80GB for now
     )
-    if total_time == 0.0:
+    if total_time == 0.0:  # total time is in ms
        start_time = completion_response["created"]
        end_time = completion_response["ended"]
        total_time = end_time - start_time

-    return a100_80gb_price_per_second_public * total_time
+    return a100_80gb_price_per_second_public * total_time / 1000


 def _select_tokenizer(model: str):
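
A minimal standalone sketch of the corrected fallback (names are mine, not litellm's): Replicate reports request duration in milliseconds, so the per-second GPU price must be scaled down by 1000.

# Standalone sketch of the time-based fallback after the fix.
# Constant copied from the diff; the function name is illustrative.
A100_80GB_PRICE_PER_SECOND = 0.001400  # assume all calls sent to A100 80GB

def time_based_cost(total_time_ms: float) -> float:
    # total_time comes back in ms, so divide by 1000 to get seconds
    return A100_80GB_PRICE_PER_SECOND * total_time_ms / 1000

print(time_based_cost(5000))  # a 5 s run on an A100 80GB -> 0.007 (USD)
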
@@ -3668,7 +3668,7 @@ def _select_tokenizer(model: str):
        tokenizer = Tokenizer.from_str(json_str)
        return {"type": "huggingface_tokenizer", "tokenizer": tokenizer}
     # llama2
-    elif "llama-2" in model.lower():
+    elif "llama-2" in model.lower() or "replicate" in model.lower():
        tokenizer = Tokenizer.from_pretrained("hf-internal-testing/llama-tokenizer")
        return {"type": "huggingface_tokenizer", "tokenizer": tokenizer}
     # default - tiktoken
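
The widened condition means any Replicate model name now routes to the llama tokenizer rather than the tiktoken default. A small standalone check of just the branch predicate (model names are illustrative):

# Checks only the branch predicate from the diff, with sample names.
for model in ["llama-2-7b", "replicate/meta/meta-llama-3-8b-instruct", "gpt-4"]:
    llama_tok = "llama-2" in model.lower() or "replicate" in model.lower()
    print(model, "->", "llama tokenizer" if llama_tok else "tiktoken (default)")
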
@@ -4269,7 +4269,10 @@ def completion_cost(
             model = get_model_params_and_category(model)
         # replicate llms are calculate based on time for request running
         # see https://replicate.com/pricing
-        elif model in litellm.replicate_models or "replicate" in model:
+        elif (
+            model in litellm.replicate_models or "replicate" in model
+        ) and model not in litellm.model_cost:
+            # for unmapped replicate model, default to replicate's time tracking logic
             return get_replicate_completion_pricing(completion_response, total_time)

         (
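
Putting the pieces together, the routing decision now works roughly as sketched below (a simplification, not litellm's actual code; `model_cost` stands in for `litellm.model_cost` and the model names are illustrative):

# Simplified sketch of the new cost routing for Replicate models.
def pricing_route(model: str, model_cost: dict) -> str:
    if "replicate" in model and model not in model_cost:
        # unmapped replicate model: fall back to time-based pricing
        return "time-based (get_replicate_completion_pricing)"
    # mapped models are priced per token, like any other provider
    return "token-based (input/output cost per token)"

model_cost = {"replicate/meta/meta-llama-3-8b-instruct": {}}
print(pricing_route("replicate/meta/meta-llama-3-8b-instruct", model_cost))  # token-based
print(pricing_route("replicate/acme/unmapped-model", model_cost))            # time-based

The same 117 lines of per-token pricing entries are also added to the second copy of the model-cost map:
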
@@ -1418,6 +1418,123 @@
         "litellm_provider": "replicate",
         "mode": "chat"
     },
+    "replicate/meta/llama-2-13b": {
+        "max_tokens": 4096,
+        "max_input_tokens": 4096,
+        "max_output_tokens": 4096,
+        "input_cost_per_token": 0.0000001,
+        "output_cost_per_token": 0.0000005,
+        "litellm_provider": "replicate",
+        "mode": "chat"
+    },
+    "replicate/meta/llama-2-13b-chat": {
+        "max_tokens": 4096,
+        "max_input_tokens": 4096,
+        "max_output_tokens": 4096,
+        "input_cost_per_token": 0.0000001,
+        "output_cost_per_token": 0.0000005,
+        "litellm_provider": "replicate",
+        "mode": "chat"
+    },
+    "replicate/meta/llama-2-70b": {
+        "max_tokens": 4096,
+        "max_input_tokens": 4096,
+        "max_output_tokens": 4096,
+        "input_cost_per_token": 0.00000065,
+        "output_cost_per_token": 0.00000275,
+        "litellm_provider": "replicate",
+        "mode": "chat"
+    },
+    "replicate/meta/llama-2-70b-chat": {
+        "max_tokens": 4096,
+        "max_input_tokens": 4096,
+        "max_output_tokens": 4096,
+        "input_cost_per_token": 0.00000065,
+        "output_cost_per_token": 0.00000275,
+        "litellm_provider": "replicate",
+        "mode": "chat"
+    },
+    "replicate/meta/llama-2-7b": {
+        "max_tokens": 4096,
+        "max_input_tokens": 4096,
+        "max_output_tokens": 4096,
+        "input_cost_per_token": 0.00000005,
+        "output_cost_per_token": 0.00000025,
+        "litellm_provider": "replicate",
+        "mode": "chat"
+    },
+    "replicate/meta/llama-2-7b-chat": {
+        "max_tokens": 4096,
+        "max_input_tokens": 4096,
+        "max_output_tokens": 4096,
+        "input_cost_per_token": 0.00000005,
+        "output_cost_per_token": 0.00000025,
+        "litellm_provider": "replicate",
+        "mode": "chat"
+    },
+    "replicate/meta/llama-3-70b": {
+        "max_tokens": 4096,
+        "max_input_tokens": 4096,
+        "max_output_tokens": 4096,
+        "input_cost_per_token": 0.00000065,
+        "output_cost_per_token": 0.00000275,
+        "litellm_provider": "replicate",
+        "mode": "chat"
+    },
+    "replicate/meta/llama-3-70b-instruct": {
+        "max_tokens": 4096,
+        "max_input_tokens": 4096,
+        "max_output_tokens": 4096,
+        "input_cost_per_token": 0.00000065,
+        "output_cost_per_token": 0.00000275,
+        "litellm_provider": "replicate",
+        "mode": "chat"
+    },
+    "replicate/meta/llama-3-8b": {
+        "max_tokens": 4096,
+        "max_input_tokens": 4096,
+        "max_output_tokens": 4096,
+        "input_cost_per_token": 0.00000005,
+        "output_cost_per_token": 0.00000025,
+        "litellm_provider": "replicate",
+        "mode": "chat"
+    },
+    "replicate/meta/llama-3-8b-instruct": {
+        "max_tokens": 4096,
+        "max_input_tokens": 4096,
+        "max_output_tokens": 4096,
+        "input_cost_per_token": 0.00000005,
+        "output_cost_per_token": 0.00000025,
+        "litellm_provider": "replicate",
+        "mode": "chat"
+    },
+    "replicate/mistralai/mistral-7b-v0.1": {
+        "max_tokens": 4096,
+        "max_input_tokens": 4096,
+        "max_output_tokens": 4096,
+        "input_cost_per_token": 0.00000005,
+        "output_cost_per_token": 0.00000025,
+        "litellm_provider": "replicate",
+        "mode": "chat"
+    },
+    "replicate/mistralai/mistral-7b-instruct-v0.2": {
+        "max_tokens": 4096,
+        "max_input_tokens": 4096,
+        "max_output_tokens": 4096,
+        "input_cost_per_token": 0.00000005,
+        "output_cost_per_token": 0.00000025,
+        "litellm_provider": "replicate",
+        "mode": "chat"
+    },
+    "replicate/mistralai/mixtral-8x7b-instruct-v0.1": {
+        "max_tokens": 4096,
+        "max_input_tokens": 4096,
+        "max_output_tokens": 4096,
+        "input_cost_per_token": 0.0000003,
+        "output_cost_per_token": 0.000001,
+        "litellm_provider": "replicate",
+        "mode": "chat"
+    },
     "openrouter/openai/gpt-3.5-turbo": {
         "max_tokens": 4095,
         "input_cost_per_token": 0.0000015,