diff --git a/litellm/__pycache__/main.cpython-311.pyc b/litellm/__pycache__/main.cpython-311.pyc
index 06cc9494e..e1b1f2e55 100644
Binary files a/litellm/__pycache__/main.cpython-311.pyc and b/litellm/__pycache__/main.cpython-311.pyc differ
diff --git a/litellm/__pycache__/utils.cpython-311.pyc b/litellm/__pycache__/utils.cpython-311.pyc
index 93c419413..5f984ca53 100644
Binary files a/litellm/__pycache__/utils.cpython-311.pyc and b/litellm/__pycache__/utils.cpython-311.pyc differ
diff --git a/litellm/integrations/__pycache__/aispend.cpython-311.pyc b/litellm/integrations/__pycache__/aispend.cpython-311.pyc
new file mode 100644
index 000000000..a8231afd7
Binary files /dev/null and b/litellm/integrations/__pycache__/aispend.cpython-311.pyc differ
diff --git a/litellm/integrations/__pycache__/berrispend.cpython-311.pyc b/litellm/integrations/__pycache__/berrispend.cpython-311.pyc
new file mode 100644
index 000000000..b89fd0a31
Binary files /dev/null and b/litellm/integrations/__pycache__/berrispend.cpython-311.pyc differ
diff --git a/litellm/integrations/__pycache__/helicone.cpython-311.pyc b/litellm/integrations/__pycache__/helicone.cpython-311.pyc
new file mode 100644
index 000000000..100603a85
Binary files /dev/null and b/litellm/integrations/__pycache__/helicone.cpython-311.pyc differ
diff --git a/litellm/integrations/aispend.py b/litellm/integrations/aispend.py
new file mode 100644
index 000000000..6723a6227
--- /dev/null
+++ b/litellm/integrations/aispend.py
@@ -0,0 +1,94 @@
+#### What this does ####
+# On success + failure, log events to aispend.io
+import dotenv, os
+import requests
+dotenv.load_dotenv() # Loading env variables using dotenv
+import traceback
+import datetime
+
+model_cost = {
+    "gpt-3.5-turbo": {"max_tokens": 4000, "input_cost_per_token": 0.0000015, "output_cost_per_token": 0.000002},
+    "gpt-35-turbo": {"max_tokens": 4000, "input_cost_per_token": 0.0000015, "output_cost_per_token": 0.000002}, # azure model name
+    "gpt-3.5-turbo-0613": {"max_tokens": 4000, "input_cost_per_token": 0.0000015, "output_cost_per_token": 0.000002},
+    "gpt-3.5-turbo-0301": {"max_tokens": 4000, "input_cost_per_token": 0.0000015, "output_cost_per_token": 0.000002},
+    "gpt-3.5-turbo-16k": {"max_tokens": 16000, "input_cost_per_token": 0.000003, "output_cost_per_token": 0.000004},
+    "gpt-35-turbo-16k": {"max_tokens": 16000, "input_cost_per_token": 0.000003, "output_cost_per_token": 0.000004}, # azure model name
+    "gpt-3.5-turbo-16k-0613": {"max_tokens": 16000, "input_cost_per_token": 0.000003, "output_cost_per_token": 0.000004},
+    "gpt-4": {"max_tokens": 8000, "input_cost_per_token": 0.000003, "output_cost_per_token": 0.00006},
+    "gpt-4-0613": {"max_tokens": 8000, "input_cost_per_token": 0.000003, "output_cost_per_token": 0.00006},
+    "gpt-4-32k": {"max_tokens": 8000, "input_cost_per_token": 0.00006, "output_cost_per_token": 0.00012},
+    "claude-instant-1": {"max_tokens": 100000, "input_cost_per_token": 0.00000163, "output_cost_per_token": 0.00000551},
+    "claude-2": {"max_tokens": 100000, "input_cost_per_token": 0.00001102, "output_cost_per_token": 0.00003268},
+    "text-bison-001": {"max_tokens": 8192, "input_cost_per_token": 0.000004, "output_cost_per_token": 0.000004},
+    "chat-bison-001": {"max_tokens": 4096, "input_cost_per_token": 0.000002, "output_cost_per_token": 0.000002},
+    "command-nightly": {"max_tokens": 4096, "input_cost_per_token": 0.000015, "output_cost_per_token": 0.000015},
+}
+
+class AISpendLogger:
+    # Class variables or attributes
+    def __init__(self):
+        # Instance variables
self.account_id = os.getenv("AISPEND_ACCOUNT_ID") + self.api_key = os.getenv("AISPEND_API_KEY") + + def price_calculator(self, model, response_obj, start_time, end_time): + # try and find if the model is in the model_cost map + # else default to the average of the costs + prompt_tokens_cost_usd_dollar = 0 + completion_tokens_cost_usd_dollar = 0 + if model in model_cost: + prompt_tokens_cost_usd_dollar = model_cost[model]["input_cost_per_token"] * response_obj["usage"]["prompt_tokens"] + completion_tokens_cost_usd_dollar = model_cost[model]["output_cost_per_token"] * response_obj["usage"]["completion_tokens"] + elif "replicate" in model: + # replicate models are charged based on time + # llama 2 runs on an nvidia a100 which costs $0.0032 per second - https://replicate.com/replicate/llama-2-70b-chat + model_run_time = end_time - start_time # assuming time in seconds + cost_usd_dollar = model_run_time * 0.0032 + prompt_tokens_cost_usd_dollar = cost_usd_dollar / 2 + completion_tokens_cost_usd_dollar = cost_usd_dollar / 2 + else: + # calculate average input cost + input_cost_sum = 0 + output_cost_sum = 0 + for model in model_cost: + input_cost_sum += model_cost[model]["input_cost_per_token"] + output_cost_sum += model_cost[model]["output_cost_per_token"] + avg_input_cost = input_cost_sum / len(model_cost.keys()) + avg_output_cost = output_cost_sum / len(model_cost.keys()) + prompt_tokens_cost_usd_dollar = model_cost[model]["input_cost_per_token"] * response_obj["usage"]["prompt_tokens"] + completion_tokens_cost_usd_dollar = model_cost[model]["output_cost_per_token"] * response_obj["usage"]["completion_tokens"] + return prompt_tokens_cost_usd_dollar, completion_tokens_cost_usd_dollar + + def log_event(self, model, response_obj, start_time, end_time, print_verbose): + # Method definition + try: + print_verbose(f"AISpend Logging - Enters logging function for model {model}") + + url = f"https://aispend.io/api/v1/accounts/{self.account_id}/data" + headers = { + 'Authorization': f'Bearer {self.api_key}', + 'Content-Type': 'application/json' + } + + response_timestamp = datetime.datetime.fromtimestamp(int(response_obj["created"])).strftime('%Y-%m-%d') + + prompt_tokens_cost_usd_dollar, completion_tokens_cost_usd_dollar = self.price_calculator(model, response_obj, start_time, end_time) + prompt_tokens_cost_usd_cent = prompt_tokens_cost_usd_dollar * 100 + completion_tokens_cost_usd_cent = completion_tokens_cost_usd_dollar * 100 + data = [{ + "requests": 1, + "requests_context": 1, + "context_tokens": response_obj["usage"]["prompt_tokens"], + "requests_generated": 1, + "generated_tokens": response_obj["usage"]["completion_tokens"], + "recorded_date": response_timestamp, + "model_id": response_obj["model"], + "generated_tokens_cost_usd_cent": prompt_tokens_cost_usd_cent, + "context_tokens_cost_usd_cent": completion_tokens_cost_usd_cent + }] + + print_verbose(f"AISpend Logging - final data object: {data}") + except: + # traceback.print_exc() + print_verbose(f"AISpend Logging Error - {traceback.format_exc()}") + pass diff --git a/litellm/integrations/berrispend.py b/litellm/integrations/berrispend.py new file mode 100644 index 000000000..1742bfed7 --- /dev/null +++ b/litellm/integrations/berrispend.py @@ -0,0 +1,99 @@ +#### What this does #### +# On success + failure, log events to aispend.io +import dotenv, os +import requests +dotenv.load_dotenv() # Loading env variables using dotenv +import traceback +import datetime + +model_cost = { + "gpt-3.5-turbo": {"max_tokens": 4000, "input_cost_per_token": 
0.0000015, "output_cost_per_token": 0.000002}, + "gpt-35-turbo": {"max_tokens": 4000, "input_cost_per_token": 0.0000015, "output_cost_per_token": 0.000002}, # azure model name + "gpt-3.5-turbo-0613": {"max_tokens": 4000, "input_cost_per_token": 0.0000015, "output_cost_per_token": 0.000002}, + "gpt-3.5-turbo-0301": {"max_tokens": 4000, "input_cost_per_token": 0.0000015, "output_cost_per_token": 0.000002}, + "gpt-3.5-turbo-16k": {"max_tokens": 16000, "input_cost_per_token": 0.000003, "output_cost_per_token": 0.000004}, + "gpt-35-turbo-16k": {"max_tokens": 16000, "input_cost_per_token": 0.000003, "output_cost_per_token": 0.000004}, # azure model name + "gpt-3.5-turbo-16k-0613": {"max_tokens": 16000, "input_cost_per_token": 0.000003, "output_cost_per_token": 0.000004}, + "gpt-4": {"max_tokens": 8000, "input_cost_per_token": 0.000003, "output_cost_per_token": 0.00006}, + "gpt-4-0613": {"max_tokens": 8000, "input_cost_per_token": 0.000003, "output_cost_per_token": 0.00006}, + "gpt-4-32k": {"max_tokens": 8000, "input_cost_per_token": 0.00006, "output_cost_per_token": 0.00012}, + "claude-instant-1": {"max_tokens": 100000, "input_cost_per_token": 0.00000163, "output_cost_per_token": 0.00000551}, + "claude-2": {"max_tokens": 100000, "input_cost_per_token": 0.00001102, "output_cost_per_token": 0.00003268}, + "text-bison-001": {"max_tokens": 8192, "input_cost_per_token": 0.000004, "output_cost_per_token": 0.000004}, + "chat-bison-001": {"max_tokens": 4096, "input_cost_per_token": 0.000002, "output_cost_per_token": 0.000002}, + "command-nightly": {"max_tokens": 4096, "input_cost_per_token": 0.000015, "output_cost_per_token": 0.000015}, +} + +class BerriSpendLogger: + # Class variables or attributes + def __init__(self): + # Instance variables + self.account_id = os.getenv("BERRISPEND_ACCOUNT_ID") + + def price_calculator(self, model, response_obj, start_time, end_time): + # try and find if the model is in the model_cost map + # else default to the average of the costs + prompt_tokens_cost_usd_dollar = 0 + completion_tokens_cost_usd_dollar = 0 + if model in model_cost: + prompt_tokens_cost_usd_dollar = model_cost[model]["input_cost_per_token"] * response_obj["usage"]["prompt_tokens"] + completion_tokens_cost_usd_dollar = model_cost[model]["output_cost_per_token"] * response_obj["usage"]["completion_tokens"] + elif "replicate" in model: + # replicate models are charged based on time + # llama 2 runs on an nvidia a100 which costs $0.0032 per second - https://replicate.com/replicate/llama-2-70b-chat + model_run_time = end_time - start_time # assuming time in seconds + cost_usd_dollar = model_run_time * 0.0032 + prompt_tokens_cost_usd_dollar = cost_usd_dollar / 2 + completion_tokens_cost_usd_dollar = cost_usd_dollar / 2 + else: + # calculate average input cost + input_cost_sum = 0 + output_cost_sum = 0 + for model in model_cost: + input_cost_sum += model_cost[model]["input_cost_per_token"] + output_cost_sum += model_cost[model]["output_cost_per_token"] + avg_input_cost = input_cost_sum / len(model_cost.keys()) + avg_output_cost = output_cost_sum / len(model_cost.keys()) + prompt_tokens_cost_usd_dollar = model_cost[model]["input_cost_per_token"] * response_obj["usage"]["prompt_tokens"] + completion_tokens_cost_usd_dollar = model_cost[model]["output_cost_per_token"] * response_obj["usage"]["completion_tokens"] + return prompt_tokens_cost_usd_dollar, completion_tokens_cost_usd_dollar + + def log_event(self, model, messages, response_obj, start_time, end_time, print_verbose): + # Method definition + try: + 
print_verbose(f"BerriSpend Logging - Enters logging function for model {model}") + + url = f"https://berrispend.berri.ai/spend" + headers = { + 'Content-Type': 'application/json' + } + + prompt_tokens_cost_usd_dollar, completion_tokens_cost_usd_dollar = self.price_calculator(model, response_obj, start_time, end_time) + total_cost = prompt_tokens_cost_usd_dollar + completion_tokens_cost_usd_dollar + + response_time = (end_time-start_time).total_seconds() + if "response" in response_obj: + data = [{ + "response_time": response_time, + "model_id": response_obj["model"], + "total_cost": total_cost, + "messages": messages, + "response": response_obj['choices'][0]['message']['content'], + "account_id": self.account_id + }] + elif "error" in response_obj: + data = [{ + "response_time": response_time, + "model_id": response_obj["model"], + "total_cost": total_cost, + "messages": messages, + "error": response_obj['error'], + "account_id": self.account_id + }] + + print_verbose(f"BerriSpend Logging - final data object: {data}") + response = requests.post(url, headers=headers, json=data) + except: + # traceback.print_exc() + print_verbose(f"BerriSpend Logging Error - {traceback.format_exc()}") + pass diff --git a/litellm/tests/test_berrispend_integration.py b/litellm/tests/test_berrispend_integration.py new file mode 100644 index 000000000..ac937e5fc --- /dev/null +++ b/litellm/tests/test_berrispend_integration.py @@ -0,0 +1,25 @@ +#### What this tests #### +# This tests if logging to the helicone integration actually works + +import sys, os +import traceback +import pytest + +sys.path.insert(0, os.path.abspath('../..')) # Adds the parent directory to the system path +import litellm +from litellm import embedding, completion + +litellm.success_callback = ["berrispend"] +litellm.failure_callback = ["berrispend"] + +litellm.set_verbose = True + +user_message = "Hello, how are you?" 
+messages = [{ "content": user_message,"role": "user"}] + + +#openai call +response = completion(model="gpt-3.5-turbo", messages=[{"role": "user", "content": "Hi 👋 - i'm openai"}]) + +#bad request call +response = completion(model="chatgpt-test", messages=[{"role": "user", "content": "Hi 👋 - i'm a bad request"}]) \ No newline at end of file diff --git a/litellm/utils.py b/litellm/utils.py index 4c9fe5463..076378b1d 100644 --- a/litellm/utils.py +++ b/litellm/utils.py @@ -2,7 +2,10 @@ import dotenv, json, traceback, threading import subprocess, os import litellm, openai import random, uuid, requests -import datetime +import datetime, time +from anthropic import Anthropic +import tiktoken +encoding = tiktoken.get_encoding("cl100k_base") from openai.error import AuthenticationError, InvalidRequestError, RateLimitError, ServiceUnavailableError, OpenAIError ####### ENVIRONMENT VARIABLES ################### dotenv.load_dotenv() # Loading env variables using dotenv @@ -13,6 +16,8 @@ posthog = None slack_app = None alerts_channel = None heliconeLogger = None +aispendLogger = None +berrispendLogger = None callback_list = [] user_logger_fn = None additional_details = {} @@ -89,6 +94,7 @@ def client(original_function): pass def wrapper(*args, **kwargs): + start_time = None try: function_setup(*args, **kwargs) ## MODEL CALL @@ -101,7 +107,8 @@ def client(original_function): return result except Exception as e: traceback_exception = traceback.format_exc() - my_thread = threading.Thread(target=handle_failure, args=(e, traceback_exception, args, kwargs)) # don't interrupt execution of main thread + end_time = datetime.datetime.now() + my_thread = threading.Thread(target=handle_failure, args=(e, traceback_exception, start_time, end_time, args, kwargs)) # don't interrupt execution of main thread my_thread.start() raise e return wrapper @@ -153,7 +160,7 @@ def get_optional_params( return optional_params def set_callbacks(callback_list): - global sentry_sdk_instance, capture_exception, add_breadcrumb, posthog, slack_app, alerts_channel, heliconeLogger + global sentry_sdk_instance, capture_exception, add_breadcrumb, posthog, slack_app, alerts_channel, heliconeLogger, aispendLogger, berrispendLogger try: for callback in callback_list: if callback == "sentry": @@ -193,14 +200,19 @@ def set_callbacks(callback_list): print_verbose(f"Initialized Slack App: {slack_app}") elif callback == "helicone": from .integrations.helicone import HeliconeLogger - heliconeLogger = HeliconeLogger() + elif callback == "aispend": + from .integrations.aispend import AISpendLogger + aispendLogger = AISpendLogger() + elif callback == "berrispend": + from .integrations.berrispend import BerriSpendLogger + berrispendLogger = BerriSpendLogger() except: pass -def handle_failure(exception, traceback_exception, args, kwargs): - global sentry_sdk_instance, capture_exception, add_breadcrumb, posthog, slack_app, alerts_channel +def handle_failure(exception, traceback_exception, start_time, end_time, args, kwargs): + global sentry_sdk_instance, capture_exception, add_breadcrumb, posthog, slack_app, alerts_channel, aispendLogger, berrispendLogger try: # print_verbose(f"handle_failure args: {args}") # print_verbose(f"handle_failure kwargs: {kwargs}") @@ -248,6 +260,33 @@ def handle_failure(exception, traceback_exception, args, kwargs): unique_id = str(uuid.uuid4()) posthog.capture(unique_id, event_name) print_verbose(f"successfully logged to PostHog!") + elif callback == "berrispend": + print_verbose("reaches berrispend for logging!") + model = 
+                    model = args[0] if len(args) > 0 else kwargs["model"]
+                    messages = args[1] if len(args) > 1 else kwargs["messages"]
+                    result = {
+                        "model": model,
+                        "created": time.time(),
+                        "error": traceback_exception,
+                        "usage": {
+                            "prompt_tokens": prompt_token_calculator(model, messages=messages),
+                            "completion_tokens": 0
+                        }
+                    }
+                    berrispendLogger.log_event(model=model, messages=messages, response_obj=result, start_time=start_time, end_time=end_time, print_verbose=print_verbose)
+                elif callback == "aispend":
+                    print_verbose("reaches aispend for logging!")
+                    model = args[0] if len(args) > 0 else kwargs["model"]
+                    messages = args[1] if len(args) > 1 else kwargs["messages"]
+                    result = {
+                        "model": model,
+                        "created": time.time(),
+                        "usage": {
+                            "prompt_tokens": prompt_token_calculator(model, messages=messages),
+                            "completion_tokens": 0
+                        }
+                    }
+                    aispendLogger.log_event(model=model, response_obj=result, start_time=start_time, end_time=end_time, print_verbose=print_verbose)
             except:
                 print_verbose(f"Error Occurred while logging failure: {traceback.format_exc()}")
                 pass
@@ -264,8 +303,21 @@ handle_failure(exception, traceback_exception, args, kwargs):
         logging(logger_fn=user_logger_fn, exception=e)
         pass
+def prompt_token_calculator(model, messages):
+    # use tiktoken or anthropic's tokenizer depending on the model
+    text = " ".join(message["content"] for message in messages)
+    num_tokens = 0
+    if "claude" in model:
+        anthropic = Anthropic()
+        num_tokens = anthropic.count_tokens(text)
+    else:
+        num_tokens = len(encoding.encode(text))
+    return num_tokens
+
+
+
 def handle_success(args, kwargs, result, start_time, end_time):
-    global heliconeLogger
+    global heliconeLogger, aispendLogger
     try:
         success_handler = additional_details.pop("success_handler", None)
         failure_handler = additional_details.pop("failure_handler", None)
@@ -293,8 +345,19 @@
                     model = args[0] if len(args) > 0 else kwargs["model"]
                     messages = args[1] if len(args) > 1 else kwargs["messages"]
                     heliconeLogger.log_success(model=model, messages=messages, response_obj=result, start_time=start_time, end_time=end_time, print_verbose=print_verbose)
-            except:
-                print_verbose(f"Success Callback Error - {traceback.format_exc()}")
+                elif callback == "aispend":
+                    print_verbose("reaches aispend for logging!")
+                    model = args[0] if len(args) > 0 else kwargs["model"]
+                    aispendLogger.log_event(model=model, response_obj=result, start_time=start_time, end_time=end_time, print_verbose=print_verbose)
+                elif callback == "berrispend":
+                    print_verbose("reaches berrispend for logging!")
+                    model = args[0] if len(args) > 0 else kwargs["model"]
+                    messages = args[1] if len(args) > 1 else kwargs["messages"]
+                    berrispendLogger.log_event(model=model, messages=messages, response_obj=result, start_time=start_time, end_time=end_time, print_verbose=print_verbose)
+            except Exception as e:
+                ## LOGGING
+                logging(logger_fn=user_logger_fn, exception=e)
+                print_verbose(f"[Non-Blocking] Success Callback Error - {traceback.format_exc()}")
                 pass

         if success_handler and callable(success_handler):
@@ -303,7 +366,7 @@ def handle_success(args, kwargs, result, start_time, end_time):
     except Exception as e:
         ## LOGGING
         logging(logger_fn=user_logger_fn, exception=e)
-        print_verbose(f"Success Callback Error - {traceback.format_exc()}")
+        print_verbose(f"[Non-Blocking] Success Callback Error - {traceback.format_exc()}")
         pass
diff --git a/pyproject.toml b/pyproject.toml
index 642da1ee7..c71bb1579 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "litellm"
-version = "0.1.343"
+version = "0.1.344"
 description = "Library to easily interface with LLM API providers"
 authors = ["BerriAI"]
 license = "MIT License"
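
Usage sketch: the new loggers are wired through the same callback mechanism as the existing helicone integration, so enabling them only requires listing them in litellm.success_callback / litellm.failure_callback, as test_berrispend_integration.py does above. A minimal, hypothetical example, assuming the AISPEND_ACCOUNT_ID, AISPEND_API_KEY, and BERRISPEND_ACCOUNT_ID environment variables are set and a valid OpenAI key is available:

import litellm
from litellm import completion

# enable the spend-tracking callbacks added in this change
litellm.success_callback = ["aispend", "berrispend"]
litellm.failure_callback = ["aispend", "berrispend"]

# any completion call now triggers handle_success / handle_failure in utils.py,
# which forward the response object (model, token usage, timing) to
# AISpendLogger.log_event / BerriSpendLogger.log_event
response = completion(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "Hello, how are you?"}],
)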