diff --git a/litellm/model_prices_and_context_window_backup.json b/litellm/model_prices_and_context_window_backup.json
index b12edc262..b695d8086 100644
--- a/litellm/model_prices_and_context_window_backup.json
+++ b/litellm/model_prices_and_context_window_backup.json
@@ -1418,6 +1418,123 @@
         "litellm_provider": "replicate",
         "mode": "chat"
     },
+    "replicate/meta/llama-2-13b": {
+        "max_tokens": 4096,
+        "max_input_tokens": 4096,
+        "max_output_tokens": 4096,
+        "input_cost_per_token": 0.0000001,
+        "output_cost_per_token": 0.0000005,
+        "litellm_provider": "replicate",
+        "mode": "chat"
+    },
+    "replicate/meta/llama-2-13b-chat": {
+        "max_tokens": 4096,
+        "max_input_tokens": 4096,
+        "max_output_tokens": 4096,
+        "input_cost_per_token": 0.0000001,
+        "output_cost_per_token": 0.0000005,
+        "litellm_provider": "replicate",
+        "mode": "chat"
+    },
+    "replicate/meta/llama-2-70b": {
+        "max_tokens": 4096,
+        "max_input_tokens": 4096,
+        "max_output_tokens": 4096,
+        "input_cost_per_token": 0.00000065,
+        "output_cost_per_token": 0.00000275,
+        "litellm_provider": "replicate",
+        "mode": "chat"
+    },
+    "replicate/meta/llama-2-70b-chat": {
+        "max_tokens": 4096,
+        "max_input_tokens": 4096,
+        "max_output_tokens": 4096,
+        "input_cost_per_token": 0.00000065,
+        "output_cost_per_token": 0.00000275,
+        "litellm_provider": "replicate",
+        "mode": "chat"
+    },
+    "replicate/meta/llama-2-7b": {
+        "max_tokens": 4096,
+        "max_input_tokens": 4096,
+        "max_output_tokens": 4096,
+        "input_cost_per_token": 0.00000005,
+        "output_cost_per_token": 0.00000025,
+        "litellm_provider": "replicate",
+        "mode": "chat"
+    },
+    "replicate/meta/llama-2-7b-chat": {
+        "max_tokens": 4096,
+        "max_input_tokens": 4096,
+        "max_output_tokens": 4096,
+        "input_cost_per_token": 0.00000005,
+        "output_cost_per_token": 0.00000025,
+        "litellm_provider": "replicate",
+        "mode": "chat"
+    },
+    "replicate/meta/llama-3-70b": {
+        "max_tokens": 4096,
+        "max_input_tokens": 4096,
+        "max_output_tokens": 4096,
+        "input_cost_per_token": 0.00000065,
+        "output_cost_per_token": 0.00000275,
+        "litellm_provider": "replicate",
+        "mode": "chat"
+    },
+    "replicate/meta/llama-3-70b-instruct": {
+        "max_tokens": 4096,
+        "max_input_tokens": 4096,
+        "max_output_tokens": 4096,
+        "input_cost_per_token": 0.00000065,
+        "output_cost_per_token": 0.00000275,
+        "litellm_provider": "replicate",
+        "mode": "chat"
+    },
+    "replicate/meta/llama-3-8b": {
+        "max_tokens": 4096,
+        "max_input_tokens": 4096,
+        "max_output_tokens": 4096,
+        "input_cost_per_token": 0.00000005,
+        "output_cost_per_token": 0.00000025,
+        "litellm_provider": "replicate",
+        "mode": "chat"
+    },
+    "replicate/meta/llama-3-8b-instruct": {
+        "max_tokens": 4096,
+        "max_input_tokens": 4096,
+        "max_output_tokens": 4096,
+        "input_cost_per_token": 0.00000005,
+        "output_cost_per_token": 0.00000025,
+        "litellm_provider": "replicate",
+        "mode": "chat"
+    },
+    "replicate/mistralai/mistral-7b-v0.1": {
+        "max_tokens": 4096,
+        "max_input_tokens": 4096,
+        "max_output_tokens": 4096,
+        "input_cost_per_token": 0.00000005,
+        "output_cost_per_token": 0.00000025,
+        "litellm_provider": "replicate",
+        "mode": "chat"
+    },
+    "replicate/mistralai/mistral-7b-instruct-v0.2": {
+        "max_tokens": 4096,
+        "max_input_tokens": 4096,
+        "max_output_tokens": 4096,
+        "input_cost_per_token": 0.00000005,
+        "output_cost_per_token": 0.00000025,
+        "litellm_provider": "replicate",
+        "mode": "chat"
+    },
+    "replicate/mistralai/mixtral-8x7b-instruct-v0.1": {
+        "max_tokens": 4096,
+        "max_input_tokens": 4096,
+        "max_output_tokens": 4096,
+        "input_cost_per_token": 0.0000003,
+        "output_cost_per_token": 0.000001,
+        "litellm_provider": "replicate",
+        "mode": "chat"
+    },
     "openrouter/openai/gpt-3.5-turbo": {
         "max_tokens": 4095,
         "input_cost_per_token": 0.0000015,
diff --git a/litellm/tests/test_completion_cost.py b/litellm/tests/test_completion_cost.py
index f17d5a464..fecd53e19 100644
--- a/litellm/tests/test_completion_cost.py
+++ b/litellm/tests/test_completion_cost.py
@@ -328,3 +328,56 @@ def test_dalle_3_azure_cost_tracking():
         completion_response=response, call_type="image_generation"
     )
     assert cost > 0
+
+
+def test_replicate_llama3_cost_tracking():
+    litellm.set_verbose = True
+    model = "replicate/meta/meta-llama-3-8b-instruct"
+    litellm.register_model(
+        {
+            "replicate/meta/meta-llama-3-8b-instruct": {
+                "input_cost_per_token": 0.00000005,
+                "output_cost_per_token": 0.00000025,
+                "litellm_provider": "replicate",
+            }
+        }
+    )
+    response = litellm.ModelResponse(
+        id="chatcmpl-cad7282f-7f68-41e7-a5ab-9eb33ae301dc",
+        choices=[
+            litellm.utils.Choices(
+                finish_reason="stop",
+                index=0,
+                message=litellm.utils.Message(
+                    content="I'm doing well, thanks for asking! I'm here to help you with any questions or tasks you may have. How can I assist you today?",
+                    role="assistant",
+                ),
+            )
+        ],
+        created=1714401369,
+        model="replicate/meta/meta-llama-3-8b-instruct",
+        object="chat.completion",
+        system_fingerprint=None,
+        usage=litellm.utils.Usage(
+            prompt_tokens=48, completion_tokens=31, total_tokens=79
+        ),
+    )
+    cost = litellm.completion_cost(
+        completion_response=response,
+        messages=[{"role": "user", "content": "Hey, how's it going?"}],
+    )
+
+    print(f"cost: {cost}")
+    cost = round(cost, 5)
+    expected_cost = round(
+        litellm.model_cost["replicate/meta/meta-llama-3-8b-instruct"][
+            "input_cost_per_token"
+        ]
+        * 48
+        + litellm.model_cost["replicate/meta/meta-llama-3-8b-instruct"][
+            "output_cost_per_token"
+        ]
+        * 31,
+        5,
+    )
+    assert cost == expected_cost
diff --git a/litellm/utils.py b/litellm/utils.py
index bb8df09b0..302d18e99 100644
--- a/litellm/utils.py
+++ b/litellm/utils.py
@@ -3641,12 +3641,12 @@ def get_replicate_completion_pricing(completion_response=None, total_time=0.0):
     a100_80gb_price_per_second_public = (
         0.001400  # assume all calls sent to A100 80GB for now
     )
-    if total_time == 0.0:
+    if total_time == 0.0:  # total time is in ms
         start_time = completion_response["created"]
         end_time = completion_response["ended"]
         total_time = end_time - start_time
 
-    return a100_80gb_price_per_second_public * total_time
+    return a100_80gb_price_per_second_public * total_time / 1000
 
 
 def _select_tokenizer(model: str):
@@ -3668,7 +3668,7 @@ def _select_tokenizer(model: str):
         tokenizer = Tokenizer.from_str(json_str)
         return {"type": "huggingface_tokenizer", "tokenizer": tokenizer}
     # llama2
-    elif "llama-2" in model.lower():
+    elif "llama-2" in model.lower() or "replicate" in model.lower():
         tokenizer = Tokenizer.from_pretrained("hf-internal-testing/llama-tokenizer")
         return {"type": "huggingface_tokenizer", "tokenizer": tokenizer}
     # default - tiktoken
@@ -4269,7 +4269,10 @@ def completion_cost(
         model = get_model_params_and_category(model)
     # replicate llms are calculate based on time for request running
     # see https://replicate.com/pricing
-    elif model in litellm.replicate_models or "replicate" in model:
+    elif (
+        model in litellm.replicate_models or "replicate" in model
+    ) and model not in litellm.model_cost:
+        # for unmapped replicate model, default to replicate's time tracking logic
         return get_replicate_completion_pricing(completion_response, total_time)
 
     (
diff --git a/model_prices_and_context_window.json b/model_prices_and_context_window.json
index b12edc262..b695d8086 100644
--- a/model_prices_and_context_window.json
+++ b/model_prices_and_context_window.json
@@ -1418,6 +1418,123 @@
         "litellm_provider": "replicate",
         "mode": "chat"
     },
+    "replicate/meta/llama-2-13b": {
+        "max_tokens": 4096,
+        "max_input_tokens": 4096,
+        "max_output_tokens": 4096,
+        "input_cost_per_token": 0.0000001,
+        "output_cost_per_token": 0.0000005,
+        "litellm_provider": "replicate",
+        "mode": "chat"
+    },
+    "replicate/meta/llama-2-13b-chat": {
+        "max_tokens": 4096,
+        "max_input_tokens": 4096,
+        "max_output_tokens": 4096,
+        "input_cost_per_token": 0.0000001,
+        "output_cost_per_token": 0.0000005,
+        "litellm_provider": "replicate",
+        "mode": "chat"
+    },
+    "replicate/meta/llama-2-70b": {
+        "max_tokens": 4096,
+        "max_input_tokens": 4096,
+        "max_output_tokens": 4096,
+        "input_cost_per_token": 0.00000065,
+        "output_cost_per_token": 0.00000275,
+        "litellm_provider": "replicate",
+        "mode": "chat"
+    },
+    "replicate/meta/llama-2-70b-chat": {
+        "max_tokens": 4096,
+        "max_input_tokens": 4096,
+        "max_output_tokens": 4096,
+        "input_cost_per_token": 0.00000065,
+        "output_cost_per_token": 0.00000275,
+        "litellm_provider": "replicate",
+        "mode": "chat"
+    },
+    "replicate/meta/llama-2-7b": {
+        "max_tokens": 4096,
+        "max_input_tokens": 4096,
+        "max_output_tokens": 4096,
+        "input_cost_per_token": 0.00000005,
+        "output_cost_per_token": 0.00000025,
+        "litellm_provider": "replicate",
+        "mode": "chat"
+    },
+    "replicate/meta/llama-2-7b-chat": {
+        "max_tokens": 4096,
+        "max_input_tokens": 4096,
+        "max_output_tokens": 4096,
+        "input_cost_per_token": 0.00000005,
+        "output_cost_per_token": 0.00000025,
+        "litellm_provider": "replicate",
+        "mode": "chat"
+    },
+    "replicate/meta/llama-3-70b": {
+        "max_tokens": 4096,
+        "max_input_tokens": 4096,
+        "max_output_tokens": 4096,
+        "input_cost_per_token": 0.00000065,
+        "output_cost_per_token": 0.00000275,
+        "litellm_provider": "replicate",
+        "mode": "chat"
+    },
+    "replicate/meta/llama-3-70b-instruct": {
+        "max_tokens": 4096,
+        "max_input_tokens": 4096,
+        "max_output_tokens": 4096,
+        "input_cost_per_token": 0.00000065,
+        "output_cost_per_token": 0.00000275,
+        "litellm_provider": "replicate",
+        "mode": "chat"
+    },
+    "replicate/meta/llama-3-8b": {
+        "max_tokens": 4096,
+        "max_input_tokens": 4096,
+        "max_output_tokens": 4096,
+        "input_cost_per_token": 0.00000005,
+        "output_cost_per_token": 0.00000025,
+        "litellm_provider": "replicate",
+        "mode": "chat"
+    },
+    "replicate/meta/llama-3-8b-instruct": {
+        "max_tokens": 4096,
+        "max_input_tokens": 4096,
+        "max_output_tokens": 4096,
+        "input_cost_per_token": 0.00000005,
+        "output_cost_per_token": 0.00000025,
+        "litellm_provider": "replicate",
+        "mode": "chat"
+    },
+    "replicate/mistralai/mistral-7b-v0.1": {
+        "max_tokens": 4096,
+        "max_input_tokens": 4096,
+        "max_output_tokens": 4096,
+        "input_cost_per_token": 0.00000005,
+        "output_cost_per_token": 0.00000025,
+        "litellm_provider": "replicate",
+        "mode": "chat"
+    },
+    "replicate/mistralai/mistral-7b-instruct-v0.2": {
+        "max_tokens": 4096,
+        "max_input_tokens": 4096,
+        "max_output_tokens": 4096,
+        "input_cost_per_token": 0.00000005,
+        "output_cost_per_token": 0.00000025,
+        "litellm_provider": "replicate",
+        "mode": "chat"
+    },
+    "replicate/mistralai/mixtral-8x7b-instruct-v0.1": {
+        "max_tokens": 4096,
+        "max_input_tokens": 4096,
+        "max_output_tokens": 4096,
+        "input_cost_per_token": 0.0000003,
+        "output_cost_per_token": 0.000001,
+        "litellm_provider": "replicate",
+        "mode": "chat"
+    },
     "openrouter/openai/gpt-3.5-turbo": {
         "max_tokens": 4095,
         "input_cost_per_token": 0.0000015,