Repository: https://github.com/BerriAI/litellm.git (mirror)

Commit: 8b3b682000 "add replicate pricing"
Parent: 4cfcabd919
4 changed files with 63 additions and 9 deletions
@@ -125,6 +125,7 @@ def completion(
     ## Step1: Start Prediction: gets a prediction url
     ## Step2: Poll prediction url for response
     ## Step2: is handled with and without streaming
+    model_response["created"] = time.time() # for pricing this must remain right before calling api
     prediction_url = start_prediction(version_id, input_data, api_key, logging_obj=logging_obj)
     print_verbose(prediction_url)
 
@@ -134,7 +135,7 @@ def completion(
         return handle_prediction_response_streaming(prediction_url, api_key, print_verbose)
     else:
         result, logs = handle_prediction_response(prediction_url, api_key, print_verbose)
+        model_response["ended"] = time.time() # for pricing this must remain right after calling api
         ## LOGGING
         logging_obj.post_call(
             input=prompt,
@@ -154,8 +155,7 @@ def completion(
         # Calculate usage
         prompt_tokens = len(encoding.encode(prompt))
         completion_tokens = len(encoding.encode(model_response["choices"][0]["message"]["content"]))
-        model_response["created"] = time.time()
-        model_response["model"] = model
+        model_response["model"] = "replicate/" + model
         model_response["usage"] = {
             "prompt_tokens": prompt_tokens,
             "completion_tokens": completion_tokens,
@@ -164,7 +164,6 @@ def completion(
         return model_response
 
 
-
 # # Example usage:
 # response = completion(
 #     api_key="",
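The two timestamps added above bracket the Replicate prediction call so that total run time can be recovered later for pricing. Stripped of the surrounding logic, the pattern is roughly the following (an illustrative sketch, not the literal file contents; call_replicate_api is a made-up stand-in for the real start_prediction and polling helpers):

    import time

    def call_replicate_api():
        # stand-in for start_prediction() + handle_prediction_response()
        time.sleep(0.1)
        return "model output"

    model_response = {}
    model_response["created"] = time.time()  # right before starting the prediction
    result = call_replicate_api()
    model_response["ended"] = time.time()    # right after the response is ready

    # the pricing helper introduced later in this commit uses ended - created as the billable run time
    run_time_in_seconds = model_response["ended"] - model_response["created"]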
@@ -35,4 +35,34 @@ def test_completion_togetherai_cost():
 
     except Exception as e:
         pytest.fail(f"Error occurred: {e}")
-test_completion_togetherai_cost()
+# test_completion_togetherai_cost()
+
+
+def test_completion_replicate_llama_2():
+    model_name = "replicate/llama-2-70b-chat:2796ee9483c3fd7aa2e171d38f4ca12251a30609463dcfd4cd76703f22e96cdf"
+    try:
+        response = completion(
+            model=model_name,
+            messages=messages,
+            max_tokens=20,
+            custom_llm_provider="replicate"
+        )
+        print(response)
+        # Add any assertions here to check the response
+        response_str = response["choices"][0]["message"]["content"]
+        print(response_str)
+
+        # Add any assertions here to check the response
+        print(response)
+        print("Completion Cost: for togethercomputer/llama-2-70b-chat")
+        cost = completion_cost(completion_response=response)
+        formatted_string = f"${float(cost):.10f}"
+        print(formatted_string)
+
+        if type(response_str) != str:
+            pytest.fail(f"Error occurred: {e}")
+    except Exception as e:
+        pytest.fail(f"Error occurred: {e}")
+
+
+# test_completion_replicate_llama_2()
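The new test references a module-level messages list that is defined elsewhere in the test file and not shown in this hunk. To run the test body in isolation, a minimal stand-in would be something like:

    # hypothetical stand-in; the real fixture is defined near the top of the test file
    messages = [{"role": "user", "content": "Hello, how's it going?"}]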
@@ -529,7 +529,7 @@ def client(original_function):
                 # TODO: Add to cache for streaming
                 return result
 
 
             # [OPTIONAL] ADD TO CACHE
             if litellm.caching or litellm.caching_with_models or litellm.cache != None: # user init a cache object
                 litellm.cache.add_cache(result, *args, **kwargs)
@@ -594,6 +594,21 @@ def get_model_params_and_category(model_name):
     return None
 
+def get_replicate_completion_pricing(completion_response=None, run_time_in_seconds=0.0):
+    # see https://replicate.com/pricing
+    a100_40gb_price_per_second_public = 0.001150
+
+    # for all litellm currently supported LLMs, almost all requests go to a100_80gb
+    a100_80gb_price_per_second_public = 0.001400
+
+    start_time = completion_response['created']
+    end_time = completion_response["ended"]
+    run_time_in_seconds = end_time - start_time
+
+    print("total_replicate_run_time", run_time_in_seconds)
+
+    return a100_80gb_price_per_second_public*run_time_in_seconds
+
 def token_counter(model, text):
     # use tiktoken or anthropic's tokenizer depending on the model
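For a sense of scale: at the A100 80GB rate of $0.001400 per second used above, a prediction that runs for 2.5 seconds costs 2.5 * 0.001400 = $0.0035. A minimal illustrative call, with fabricated timestamps rather than real API output:

    # "created"/"ended" are made-up epoch seconds, 2.5 s apart
    mock_response = {"created": 1692000000.0, "ended": 1692000002.5}
    cost = get_replicate_completion_pricing(mock_response)
    print(cost)  # ~0.0035 (2.5 s * $0.001400/s)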
@@ -647,6 +662,8 @@ def completion_cost(
         completion="",
         completion_response=None
     ):
+
+    # Handle Inputs to completion_cost
     prompt_tokens = 0
     completion_tokens = 0
     if completion_response != None:
@@ -657,10 +674,20 @@ def completion_cost(
     else:
         prompt_tokens = token_counter(model=model, text=prompt)
         completion_tokens = token_counter(model=model, text=completion)
+
+    # Calculate cost based on prompt_tokens, completion_tokens
     if "togethercomputer" in model:
         # together ai prices based on size of llm
         # get_model_params_and_category takes a model name and returns the category of LLM size it is in model_prices_and_context_window.json
         model = get_model_params_and_category(model)
+    # replicate llms are calculate based on time for request running
+    # see https://replicate.com/pricing
+    elif (
+        model in litellm.replicate_models or
+        "replicate" in model
+    ):
+        return get_replicate_completion_pricing(completion_response)
+
     prompt_tokens_cost_usd_dollar, completion_tokens_cost_usd_dollar = cost_per_token(
         model=model, prompt_tokens=prompt_tokens, completion_tokens=completion_tokens
     )
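Taken together with the new test, the intended end-to-end flow is roughly the following. This is a sketch under assumptions: the imports mirror what the test file presumably uses (its import block is not part of this diff), and the message content is a made-up example.

    from litellm import completion, completion_cost

    response = completion(
        model="replicate/llama-2-70b-chat:2796ee9483c3fd7aa2e171d38f4ca12251a30609463dcfd4cd76703f22e96cdf",
        messages=[{"role": "user", "content": "Hey, how's it going?"}],
        max_tokens=20,
        custom_llm_provider="replicate",
    )

    # for replicate models, completion_cost() now returns run time multiplied by the
    # A100 80GB per-second price instead of a per-token cost
    cost = completion_cost(completion_response=response)
    print(f"${float(cost):.10f}")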
@@ -65,9 +65,7 @@
         "output_cost_per_token": 0.000015
     },
     "replicate/llama-2-70b-chat:2c1608e18606fad2812020dc541930f2d0495ce32eee50074220b87300bc16e1": {
-        "max_tokens": 4096,
-        "input_cost_per_token": 0.00000608,
-        "output_cost_per_token": 0.00000608
+        "max_tokens": 4096
     },
     "together-ai-up-to-3b": {
         "input_cost_per_token": 0.0000001,
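For comparison of the two pricing schemes: the removed per-token entry would have billed a call with 100 prompt tokens and 100 completion tokens at (100 + 100) * 0.00000608 = roughly $0.0012, whereas with this commit the same call is billed by run time at $0.001400 per second of A100 80GB usage.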