diff --git a/litellm/__init__.py b/litellm/__init__.py index 0fda1f351..1bff66e6c 100644 --- a/litellm/__init__.py +++ b/litellm/__init__.py @@ -1,16 +1,17 @@ import threading + success_callback = [] failure_callback = [] -set_verbose=False -telemetry=True -max_tokens = 256 # OpenAI Defaults +set_verbose = False +telemetry = True +max_tokens = 256 # OpenAI Defaults retry = True api_key = None -openai_key = None -azure_key = None -anthropic_key = None -replicate_key = None -cohere_key = None +openai_key = None +azure_key = None +anthropic_key = None +replicate_key = None +cohere_key = None openrouter_key = None huggingface_key = None vertex_project = None @@ -19,33 +20,99 @@ caching = False hugging_api_token = None togetherai_api_key = None model_cost = { - "gpt-3.5-turbo": {"max_tokens": 4000, "input_cost_per_token": 0.0000015, "output_cost_per_token": 0.000002}, - "gpt-35-turbo": {"max_tokens": 4000, "input_cost_per_token": 0.0000015, "output_cost_per_token": 0.000002}, # azure model name - "gpt-3.5-turbo-0613": {"max_tokens": 4000, "input_cost_per_token": 0.0000015, "output_cost_per_token": 0.000002}, - "gpt-3.5-turbo-0301": {"max_tokens": 4000, "input_cost_per_token": 0.0000015, "output_cost_per_token": 0.000002}, - "gpt-3.5-turbo-16k": {"max_tokens": 16000, "input_cost_per_token": 0.000003, "output_cost_per_token": 0.000004}, - "gpt-35-turbo-16k": {"max_tokens": 16000, "input_cost_per_token": 0.000003, "output_cost_per_token": 0.000004}, # azure model name - "gpt-3.5-turbo-16k-0613": {"max_tokens": 16000, "input_cost_per_token": 0.000003, "output_cost_per_token": 0.000004}, - "gpt-4": {"max_tokens": 8000, "input_cost_per_token": 0.000003, "output_cost_per_token": 0.00006}, - "gpt-4-0613": {"max_tokens": 8000, "input_cost_per_token": 0.000003, "output_cost_per_token": 0.00006}, - "gpt-4-32k": {"max_tokens": 8000, "input_cost_per_token": 0.00006, "output_cost_per_token": 0.00012}, - "claude-instant-1": {"max_tokens": 100000, "input_cost_per_token": 0.00000163, "output_cost_per_token": 0.00000551}, - "claude-2": {"max_tokens": 100000, "input_cost_per_token": 0.00001102, "output_cost_per_token": 0.00003268}, - "text-bison-001": {"max_tokens": 8192, "input_cost_per_token": 0.000004, "output_cost_per_token": 0.000004}, - "chat-bison-001": {"max_tokens": 4096, "input_cost_per_token": 0.000002, "output_cost_per_token": 0.000002}, - "command-nightly": {"max_tokens": 4096, "input_cost_per_token": 0.000015, "output_cost_per_token": 0.000015}, + "gpt-3.5-turbo": { + "max_tokens": 4000, + "input_cost_per_token": 0.0000015, + "output_cost_per_token": 0.000002, + }, + "gpt-35-turbo": { + "max_tokens": 4000, + "input_cost_per_token": 0.0000015, + "output_cost_per_token": 0.000002, + }, # azure model name + "gpt-3.5-turbo-0613": { + "max_tokens": 4000, + "input_cost_per_token": 0.0000015, + "output_cost_per_token": 0.000002, + }, + "gpt-3.5-turbo-0301": { + "max_tokens": 4000, + "input_cost_per_token": 0.0000015, + "output_cost_per_token": 0.000002, + }, + "gpt-3.5-turbo-16k": { + "max_tokens": 16000, + "input_cost_per_token": 0.000003, + "output_cost_per_token": 0.000004, + }, + "gpt-35-turbo-16k": { + "max_tokens": 16000, + "input_cost_per_token": 0.000003, + "output_cost_per_token": 0.000004, + }, # azure model name + "gpt-3.5-turbo-16k-0613": { + "max_tokens": 16000, + "input_cost_per_token": 0.000003, + "output_cost_per_token": 0.000004, + }, + "gpt-4": { + "max_tokens": 8000, + "input_cost_per_token": 0.000003, + "output_cost_per_token": 0.00006, + }, + "gpt-4-0613": { + "max_tokens": 8000, + 
"input_cost_per_token": 0.000003, + "output_cost_per_token": 0.00006, + }, + "gpt-4-32k": { + "max_tokens": 8000, + "input_cost_per_token": 0.00006, + "output_cost_per_token": 0.00012, + }, + "claude-instant-1": { + "max_tokens": 100000, + "input_cost_per_token": 0.00000163, + "output_cost_per_token": 0.00000551, + }, + "claude-2": { + "max_tokens": 100000, + "input_cost_per_token": 0.00001102, + "output_cost_per_token": 0.00003268, + }, + "text-bison-001": { + "max_tokens": 8192, + "input_cost_per_token": 0.000004, + "output_cost_per_token": 0.000004, + }, + "chat-bison-001": { + "max_tokens": 4096, + "input_cost_per_token": 0.000002, + "output_cost_per_token": 0.000002, + }, + "command-nightly": { + "max_tokens": 4096, + "input_cost_per_token": 0.000015, + "output_cost_per_token": 0.000015, + }, } + ####### THREAD-SPECIFIC DATA ################### class MyLocal(threading.local): def __init__(self): self.user = "Hello World" + _thread_context = MyLocal() + + def identify(event_details): # Store user in thread local data if "user" in event_details: _thread_context.user = event_details["user"] + + ####### ADDITIONAL PARAMS ################### configurable params if you use proxy models like Helicone, map spend to org id, etc. api_base = None headers = None @@ -56,60 +123,48 @@ config_path = None secret_manager_client = None ####### COMPLETION MODELS ################### open_ai_chat_completion_models = [ - "gpt-4", - "gpt-4-0613", - "gpt-4-32k", - "gpt-4-32k-0613", - ################# - "gpt-3.5-turbo", - "gpt-3.5-turbo-16k", - "gpt-3.5-turbo-0613", - "gpt-3.5-turbo-16k-0613", -] -open_ai_text_completion_models = [ - 'text-davinci-003' + "gpt-4", + "gpt-4-0613", + "gpt-4-32k", + "gpt-4-32k-0613", + ################# + "gpt-3.5-turbo", + "gpt-3.5-turbo-16k", + "gpt-3.5-turbo-0613", + "gpt-3.5-turbo-16k-0613", ] +open_ai_text_completion_models = ["text-davinci-003"] cohere_models = [ - 'command-nightly', - "command", - "command-light", - "command-medium-beta", - "command-xlarge-beta" + "command-nightly", + "command", + "command-light", + "command-medium-beta", + "command-xlarge-beta", ] -anthropic_models = [ - "claude-2", - "claude-instant-1", - "claude-instant-1.2" -] +anthropic_models = ["claude-2", "claude-instant-1", "claude-instant-1.2"] replicate_models = [ "replicate/" -] # placeholder, to make sure we accept any replicate model in our model_list +] # placeholder, to make sure we accept any replicate model in our model_list openrouter_models = [ - 'google/palm-2-codechat-bison', - 'google/palm-2-chat-bison', - 'openai/gpt-3.5-turbo', - 'openai/gpt-3.5-turbo-16k', - 'openai/gpt-4-32k', - 'anthropic/claude-2', - 'anthropic/claude-instant-v1', - 'meta-llama/llama-2-13b-chat', - 'meta-llama/llama-2-70b-chat' + "google/palm-2-codechat-bison", + "google/palm-2-chat-bison", + "openai/gpt-3.5-turbo", + "openai/gpt-3.5-turbo-16k", + "openai/gpt-4-32k", + "anthropic/claude-2", + "anthropic/claude-instant-v1", + "meta-llama/llama-2-13b-chat", + "meta-llama/llama-2-70b-chat", ] -vertex_chat_models = [ - "chat-bison", - "chat-bison@001" -] +vertex_chat_models = ["chat-bison", "chat-bison@001"] -vertex_text_models = [ - "text-bison", - "text-bison@001" -] +vertex_text_models = ["text-bison", "text-bison@001"] huggingface_models = [ "meta-llama/Llama-2-7b-hf", @@ -124,25 +179,56 @@ huggingface_models = [ "meta-llama/Llama-2-13b-chat", "meta-llama/Llama-2-70b", "meta-llama/Llama-2-70b-chat", -] # these have been tested on extensively. 
But by default all text2text-generation and text-generation models are supported by liteLLM. - https://docs.litellm.ai/docs/completion/supported +] # these have been tested on extensively. But by default all text2text-generation and text-generation models are supported by liteLLM. - https://docs.litellm.ai/docs/completion/supported -ai21_models = [ - "j2-ultra", - "j2-mid", - "j2-light" +ai21_models = ["j2-ultra", "j2-mid", "j2-light"] + +model_list = ( + open_ai_chat_completion_models + + open_ai_text_completion_models + + cohere_models + + anthropic_models + + replicate_models + + openrouter_models + + huggingface_models + + vertex_chat_models + + vertex_text_models + + ai21_models +) + +provider_list = [ + "openai", + "cohere", + "anthropic", + "replicate", + "huggingface", + "together_ai", + "openrouter", + "vertex_ai", + "ai21", ] - -model_list = open_ai_chat_completion_models + open_ai_text_completion_models + cohere_models + anthropic_models + replicate_models + openrouter_models + huggingface_models + vertex_chat_models + vertex_text_models + ai21_models - -provider_list = ["openai", "cohere", "anthropic", "replicate", "huggingface", "together_ai", "openrouter", "vertex_ai", "ai21"] ####### EMBEDDING MODELS ################### -open_ai_embedding_models = [ - 'text-embedding-ada-002' -] +open_ai_embedding_models = ["text-embedding-ada-002"] from .timeout import timeout from .testing import * -from .utils import client, logging, exception_type, get_optional_params, modify_integration, token_counter, cost_per_token, completion_cost, get_litellm_params +from .utils import ( + client, + logging, + exception_type, + get_optional_params, + modify_integration, + token_counter, + cost_per_token, + completion_cost, + get_litellm_params, +) from .main import * # Import all the symbols from main.py from .integrations import * -from openai.error import AuthenticationError, InvalidRequestError, RateLimitError, ServiceUnavailableError, OpenAIError \ No newline at end of file +from openai.error import ( + AuthenticationError, + InvalidRequestError, + RateLimitError, + ServiceUnavailableError, + OpenAIError, +) diff --git a/litellm/exceptions.py b/litellm/exceptions.py index 82f2f5165..51923f86e 100644 --- a/litellm/exceptions.py +++ b/litellm/exceptions.py @@ -1,12 +1,21 @@ ## LiteLLM versions of the OpenAI Exception Types -from openai.error import AuthenticationError, InvalidRequestError, RateLimitError, ServiceUnavailableError, OpenAIError +from openai.error import ( + AuthenticationError, + InvalidRequestError, + RateLimitError, + ServiceUnavailableError, + OpenAIError, +) + class AuthenticationError(AuthenticationError): def __init__(self, message, llm_provider): self.status_code = 401 self.message = message self.llm_provider = llm_provider - super().__init__(self.message) # Call the base class constructor with the parameters it needs + super().__init__( + self.message + ) # Call the base class constructor with the parameters it needs class InvalidRequestError(InvalidRequestError): @@ -15,7 +24,9 @@ class InvalidRequestError(InvalidRequestError): self.message = message self.model = model self.llm_provider = llm_provider - super().__init__(self.message, f"{self.model}") # Call the base class constructor with the parameters it needs + super().__init__( + self.message, f"{self.model}" + ) # Call the base class constructor with the parameters it needs class RateLimitError(RateLimitError): @@ -23,21 +34,29 @@ class RateLimitError(RateLimitError): self.status_code = 429 self.message = message 
self.llm_provider = llm_provider - super().__init__(self.message) # Call the base class constructor with the parameters it needs + super().__init__( + self.message + ) # Call the base class constructor with the parameters it needs + class ServiceUnavailableError(ServiceUnavailableError): def __init__(self, message, llm_provider): self.status_code = 500 self.message = message self.llm_provider = llm_provider - super().__init__(self.message) # Call the base class constructor with the parameters it needs + super().__init__( + self.message + ) # Call the base class constructor with the parameters it needs + class OpenAIError(OpenAIError): def __init__(self, original_exception): self.status_code = original_exception.http_status - super().__init__(http_body=original_exception.http_body, - http_status=original_exception.http_status, - json_body=original_exception.json_body, - headers=original_exception.headers, - code=original_exception.code) - self.llm_provider = "openai" \ No newline at end of file + super().__init__( + http_body=original_exception.http_body, + http_status=original_exception.http_status, + json_body=original_exception.json_body, + headers=original_exception.headers, + code=original_exception.code, + ) + self.llm_provider = "openai" diff --git a/litellm/integrations/__init__.py b/litellm/integrations/__init__.py index b9742821a..b6e690fd5 100644 --- a/litellm/integrations/__init__.py +++ b/litellm/integrations/__init__.py @@ -1 +1 @@ -from . import * \ No newline at end of file +from . import * diff --git a/litellm/integrations/aispend.py b/litellm/integrations/aispend.py index 6723a6227..2015d45dd 100644 --- a/litellm/integrations/aispend.py +++ b/litellm/integrations/aispend.py @@ -1,53 +1,121 @@ #### What this does #### -# On success + failure, log events to aispend.io +# On success + failure, log events to aispend.io import dotenv, os import requests -dotenv.load_dotenv() # Loading env variables using dotenv + +dotenv.load_dotenv() # Loading env variables using dotenv import traceback import datetime model_cost = { - "gpt-3.5-turbo": {"max_tokens": 4000, "input_cost_per_token": 0.0000015, "output_cost_per_token": 0.000002}, - "gpt-35-turbo": {"max_tokens": 4000, "input_cost_per_token": 0.0000015, "output_cost_per_token": 0.000002}, # azure model name - "gpt-3.5-turbo-0613": {"max_tokens": 4000, "input_cost_per_token": 0.0000015, "output_cost_per_token": 0.000002}, - "gpt-3.5-turbo-0301": {"max_tokens": 4000, "input_cost_per_token": 0.0000015, "output_cost_per_token": 0.000002}, - "gpt-3.5-turbo-16k": {"max_tokens": 16000, "input_cost_per_token": 0.000003, "output_cost_per_token": 0.000004}, - "gpt-35-turbo-16k": {"max_tokens": 16000, "input_cost_per_token": 0.000003, "output_cost_per_token": 0.000004}, # azure model name - "gpt-3.5-turbo-16k-0613": {"max_tokens": 16000, "input_cost_per_token": 0.000003, "output_cost_per_token": 0.000004}, - "gpt-4": {"max_tokens": 8000, "input_cost_per_token": 0.000003, "output_cost_per_token": 0.00006}, - "gpt-4-0613": {"max_tokens": 8000, "input_cost_per_token": 0.000003, "output_cost_per_token": 0.00006}, - "gpt-4-32k": {"max_tokens": 8000, "input_cost_per_token": 0.00006, "output_cost_per_token": 0.00012}, - "claude-instant-1": {"max_tokens": 100000, "input_cost_per_token": 0.00000163, "output_cost_per_token": 0.00000551}, - "claude-2": {"max_tokens": 100000, "input_cost_per_token": 0.00001102, "output_cost_per_token": 0.00003268}, - "text-bison-001": {"max_tokens": 8192, "input_cost_per_token": 0.000004, "output_cost_per_token": 
0.000004}, - "chat-bison-001": {"max_tokens": 4096, "input_cost_per_token": 0.000002, "output_cost_per_token": 0.000002}, - "command-nightly": {"max_tokens": 4096, "input_cost_per_token": 0.000015, "output_cost_per_token": 0.000015}, + "gpt-3.5-turbo": { + "max_tokens": 4000, + "input_cost_per_token": 0.0000015, + "output_cost_per_token": 0.000002, + }, + "gpt-35-turbo": { + "max_tokens": 4000, + "input_cost_per_token": 0.0000015, + "output_cost_per_token": 0.000002, + }, # azure model name + "gpt-3.5-turbo-0613": { + "max_tokens": 4000, + "input_cost_per_token": 0.0000015, + "output_cost_per_token": 0.000002, + }, + "gpt-3.5-turbo-0301": { + "max_tokens": 4000, + "input_cost_per_token": 0.0000015, + "output_cost_per_token": 0.000002, + }, + "gpt-3.5-turbo-16k": { + "max_tokens": 16000, + "input_cost_per_token": 0.000003, + "output_cost_per_token": 0.000004, + }, + "gpt-35-turbo-16k": { + "max_tokens": 16000, + "input_cost_per_token": 0.000003, + "output_cost_per_token": 0.000004, + }, # azure model name + "gpt-3.5-turbo-16k-0613": { + "max_tokens": 16000, + "input_cost_per_token": 0.000003, + "output_cost_per_token": 0.000004, + }, + "gpt-4": { + "max_tokens": 8000, + "input_cost_per_token": 0.000003, + "output_cost_per_token": 0.00006, + }, + "gpt-4-0613": { + "max_tokens": 8000, + "input_cost_per_token": 0.000003, + "output_cost_per_token": 0.00006, + }, + "gpt-4-32k": { + "max_tokens": 8000, + "input_cost_per_token": 0.00006, + "output_cost_per_token": 0.00012, + }, + "claude-instant-1": { + "max_tokens": 100000, + "input_cost_per_token": 0.00000163, + "output_cost_per_token": 0.00000551, + }, + "claude-2": { + "max_tokens": 100000, + "input_cost_per_token": 0.00001102, + "output_cost_per_token": 0.00003268, + }, + "text-bison-001": { + "max_tokens": 8192, + "input_cost_per_token": 0.000004, + "output_cost_per_token": 0.000004, + }, + "chat-bison-001": { + "max_tokens": 4096, + "input_cost_per_token": 0.000002, + "output_cost_per_token": 0.000002, + }, + "command-nightly": { + "max_tokens": 4096, + "input_cost_per_token": 0.000015, + "output_cost_per_token": 0.000015, + }, } + class AISpendLogger: # Class variables or attributes def __init__(self): # Instance variables self.account_id = os.getenv("AISPEND_ACCOUNT_ID") self.api_key = os.getenv("AISPEND_API_KEY") - + def price_calculator(self, model, response_obj, start_time, end_time): # try and find if the model is in the model_cost map # else default to the average of the costs prompt_tokens_cost_usd_dollar = 0 completion_tokens_cost_usd_dollar = 0 if model in model_cost: - prompt_tokens_cost_usd_dollar = model_cost[model]["input_cost_per_token"] * response_obj["usage"]["prompt_tokens"] - completion_tokens_cost_usd_dollar = model_cost[model]["output_cost_per_token"] * response_obj["usage"]["completion_tokens"] - elif "replicate" in model: + prompt_tokens_cost_usd_dollar = ( + model_cost[model]["input_cost_per_token"] + * response_obj["usage"]["prompt_tokens"] + ) + completion_tokens_cost_usd_dollar = ( + model_cost[model]["output_cost_per_token"] + * response_obj["usage"]["completion_tokens"] + ) + elif "replicate" in model: # replicate models are charged based on time # llama 2 runs on an nvidia a100 which costs $0.0032 per second - https://replicate.com/replicate/llama-2-70b-chat - model_run_time = end_time - start_time # assuming time in seconds + model_run_time = end_time - start_time # assuming time in seconds cost_usd_dollar = model_run_time * 0.0032 prompt_tokens_cost_usd_dollar = cost_usd_dollar / 2 
completion_tokens_cost_usd_dollar = cost_usd_dollar / 2 else: - # calculate average input cost + # calculate average input cost input_cost_sum = 0 output_cost_sum = 0 for model in model_cost: @@ -55,37 +123,52 @@ class AISpendLogger: output_cost_sum += model_cost[model]["output_cost_per_token"] avg_input_cost = input_cost_sum / len(model_cost.keys()) avg_output_cost = output_cost_sum / len(model_cost.keys()) - prompt_tokens_cost_usd_dollar = model_cost[model]["input_cost_per_token"] * response_obj["usage"]["prompt_tokens"] - completion_tokens_cost_usd_dollar = model_cost[model]["output_cost_per_token"] * response_obj["usage"]["completion_tokens"] + prompt_tokens_cost_usd_dollar = ( + model_cost[model]["input_cost_per_token"] + * response_obj["usage"]["prompt_tokens"] + ) + completion_tokens_cost_usd_dollar = ( + model_cost[model]["output_cost_per_token"] + * response_obj["usage"]["completion_tokens"] + ) return prompt_tokens_cost_usd_dollar, completion_tokens_cost_usd_dollar - + def log_event(self, model, response_obj, start_time, end_time, print_verbose): # Method definition try: - print_verbose(f"AISpend Logging - Enters logging function for model {model}") + print_verbose( + f"AISpend Logging - Enters logging function for model {model}" + ) url = f"https://aispend.io/api/v1/accounts/{self.account_id}/data" headers = { - 'Authorization': f'Bearer {self.api_key}', - 'Content-Type': 'application/json' + "Authorization": f"Bearer {self.api_key}", + "Content-Type": "application/json", } - response_timestamp = datetime.datetime.fromtimestamp(int(response_obj["created"])).strftime('%Y-%m-%d') + response_timestamp = datetime.datetime.fromtimestamp( + int(response_obj["created"]) + ).strftime("%Y-%m-%d") - prompt_tokens_cost_usd_dollar, completion_tokens_cost_usd_dollar = self.price_calculator(model, response_obj, start_time, end_time) + ( + prompt_tokens_cost_usd_dollar, + completion_tokens_cost_usd_dollar, + ) = self.price_calculator(model, response_obj, start_time, end_time) prompt_tokens_cost_usd_cent = prompt_tokens_cost_usd_dollar * 100 completion_tokens_cost_usd_cent = completion_tokens_cost_usd_dollar * 100 - data = [{ - "requests": 1, - "requests_context": 1, - "context_tokens": response_obj["usage"]["prompt_tokens"], - "requests_generated": 1, - "generated_tokens": response_obj["usage"]["completion_tokens"], - "recorded_date": response_timestamp, - "model_id": response_obj["model"], - "generated_tokens_cost_usd_cent": prompt_tokens_cost_usd_cent, - "context_tokens_cost_usd_cent": completion_tokens_cost_usd_cent - }] + data = [ + { + "requests": 1, + "requests_context": 1, + "context_tokens": response_obj["usage"]["prompt_tokens"], + "requests_generated": 1, + "generated_tokens": response_obj["usage"]["completion_tokens"], + "recorded_date": response_timestamp, + "model_id": response_obj["model"], + "generated_tokens_cost_usd_cent": prompt_tokens_cost_usd_cent, + "context_tokens_cost_usd_cent": completion_tokens_cost_usd_cent, + } + ] print_verbose(f"AISpend Logging - final data object: {data}") except: diff --git a/litellm/integrations/berrispend.py b/litellm/integrations/berrispend.py index 1742bfed7..7d91ffca7 100644 --- a/litellm/integrations/berrispend.py +++ b/litellm/integrations/berrispend.py @@ -1,52 +1,120 @@ #### What this does #### -# On success + failure, log events to aispend.io +# On success + failure, log events to aispend.io import dotenv, os import requests -dotenv.load_dotenv() # Loading env variables using dotenv + +dotenv.load_dotenv() # Loading env variables using 
dotenv import traceback import datetime model_cost = { - "gpt-3.5-turbo": {"max_tokens": 4000, "input_cost_per_token": 0.0000015, "output_cost_per_token": 0.000002}, - "gpt-35-turbo": {"max_tokens": 4000, "input_cost_per_token": 0.0000015, "output_cost_per_token": 0.000002}, # azure model name - "gpt-3.5-turbo-0613": {"max_tokens": 4000, "input_cost_per_token": 0.0000015, "output_cost_per_token": 0.000002}, - "gpt-3.5-turbo-0301": {"max_tokens": 4000, "input_cost_per_token": 0.0000015, "output_cost_per_token": 0.000002}, - "gpt-3.5-turbo-16k": {"max_tokens": 16000, "input_cost_per_token": 0.000003, "output_cost_per_token": 0.000004}, - "gpt-35-turbo-16k": {"max_tokens": 16000, "input_cost_per_token": 0.000003, "output_cost_per_token": 0.000004}, # azure model name - "gpt-3.5-turbo-16k-0613": {"max_tokens": 16000, "input_cost_per_token": 0.000003, "output_cost_per_token": 0.000004}, - "gpt-4": {"max_tokens": 8000, "input_cost_per_token": 0.000003, "output_cost_per_token": 0.00006}, - "gpt-4-0613": {"max_tokens": 8000, "input_cost_per_token": 0.000003, "output_cost_per_token": 0.00006}, - "gpt-4-32k": {"max_tokens": 8000, "input_cost_per_token": 0.00006, "output_cost_per_token": 0.00012}, - "claude-instant-1": {"max_tokens": 100000, "input_cost_per_token": 0.00000163, "output_cost_per_token": 0.00000551}, - "claude-2": {"max_tokens": 100000, "input_cost_per_token": 0.00001102, "output_cost_per_token": 0.00003268}, - "text-bison-001": {"max_tokens": 8192, "input_cost_per_token": 0.000004, "output_cost_per_token": 0.000004}, - "chat-bison-001": {"max_tokens": 4096, "input_cost_per_token": 0.000002, "output_cost_per_token": 0.000002}, - "command-nightly": {"max_tokens": 4096, "input_cost_per_token": 0.000015, "output_cost_per_token": 0.000015}, + "gpt-3.5-turbo": { + "max_tokens": 4000, + "input_cost_per_token": 0.0000015, + "output_cost_per_token": 0.000002, + }, + "gpt-35-turbo": { + "max_tokens": 4000, + "input_cost_per_token": 0.0000015, + "output_cost_per_token": 0.000002, + }, # azure model name + "gpt-3.5-turbo-0613": { + "max_tokens": 4000, + "input_cost_per_token": 0.0000015, + "output_cost_per_token": 0.000002, + }, + "gpt-3.5-turbo-0301": { + "max_tokens": 4000, + "input_cost_per_token": 0.0000015, + "output_cost_per_token": 0.000002, + }, + "gpt-3.5-turbo-16k": { + "max_tokens": 16000, + "input_cost_per_token": 0.000003, + "output_cost_per_token": 0.000004, + }, + "gpt-35-turbo-16k": { + "max_tokens": 16000, + "input_cost_per_token": 0.000003, + "output_cost_per_token": 0.000004, + }, # azure model name + "gpt-3.5-turbo-16k-0613": { + "max_tokens": 16000, + "input_cost_per_token": 0.000003, + "output_cost_per_token": 0.000004, + }, + "gpt-4": { + "max_tokens": 8000, + "input_cost_per_token": 0.000003, + "output_cost_per_token": 0.00006, + }, + "gpt-4-0613": { + "max_tokens": 8000, + "input_cost_per_token": 0.000003, + "output_cost_per_token": 0.00006, + }, + "gpt-4-32k": { + "max_tokens": 8000, + "input_cost_per_token": 0.00006, + "output_cost_per_token": 0.00012, + }, + "claude-instant-1": { + "max_tokens": 100000, + "input_cost_per_token": 0.00000163, + "output_cost_per_token": 0.00000551, + }, + "claude-2": { + "max_tokens": 100000, + "input_cost_per_token": 0.00001102, + "output_cost_per_token": 0.00003268, + }, + "text-bison-001": { + "max_tokens": 8192, + "input_cost_per_token": 0.000004, + "output_cost_per_token": 0.000004, + }, + "chat-bison-001": { + "max_tokens": 4096, + "input_cost_per_token": 0.000002, + "output_cost_per_token": 0.000002, + }, + "command-nightly": { + 
"max_tokens": 4096, + "input_cost_per_token": 0.000015, + "output_cost_per_token": 0.000015, + }, } + class BerriSpendLogger: # Class variables or attributes def __init__(self): # Instance variables self.account_id = os.getenv("BERRISPEND_ACCOUNT_ID") - + def price_calculator(self, model, response_obj, start_time, end_time): # try and find if the model is in the model_cost map # else default to the average of the costs prompt_tokens_cost_usd_dollar = 0 completion_tokens_cost_usd_dollar = 0 if model in model_cost: - prompt_tokens_cost_usd_dollar = model_cost[model]["input_cost_per_token"] * response_obj["usage"]["prompt_tokens"] - completion_tokens_cost_usd_dollar = model_cost[model]["output_cost_per_token"] * response_obj["usage"]["completion_tokens"] - elif "replicate" in model: + prompt_tokens_cost_usd_dollar = ( + model_cost[model]["input_cost_per_token"] + * response_obj["usage"]["prompt_tokens"] + ) + completion_tokens_cost_usd_dollar = ( + model_cost[model]["output_cost_per_token"] + * response_obj["usage"]["completion_tokens"] + ) + elif "replicate" in model: # replicate models are charged based on time # llama 2 runs on an nvidia a100 which costs $0.0032 per second - https://replicate.com/replicate/llama-2-70b-chat - model_run_time = end_time - start_time # assuming time in seconds + model_run_time = end_time - start_time # assuming time in seconds cost_usd_dollar = model_run_time * 0.0032 prompt_tokens_cost_usd_dollar = cost_usd_dollar / 2 completion_tokens_cost_usd_dollar = cost_usd_dollar / 2 else: - # calculate average input cost + # calculate average input cost input_cost_sum = 0 output_cost_sum = 0 for model in model_cost: @@ -54,42 +122,59 @@ class BerriSpendLogger: output_cost_sum += model_cost[model]["output_cost_per_token"] avg_input_cost = input_cost_sum / len(model_cost.keys()) avg_output_cost = output_cost_sum / len(model_cost.keys()) - prompt_tokens_cost_usd_dollar = model_cost[model]["input_cost_per_token"] * response_obj["usage"]["prompt_tokens"] - completion_tokens_cost_usd_dollar = model_cost[model]["output_cost_per_token"] * response_obj["usage"]["completion_tokens"] + prompt_tokens_cost_usd_dollar = ( + model_cost[model]["input_cost_per_token"] + * response_obj["usage"]["prompt_tokens"] + ) + completion_tokens_cost_usd_dollar = ( + model_cost[model]["output_cost_per_token"] + * response_obj["usage"]["completion_tokens"] + ) return prompt_tokens_cost_usd_dollar, completion_tokens_cost_usd_dollar - - def log_event(self, model, messages, response_obj, start_time, end_time, print_verbose): + + def log_event( + self, model, messages, response_obj, start_time, end_time, print_verbose + ): # Method definition try: - print_verbose(f"BerriSpend Logging - Enters logging function for model {model}") + print_verbose( + f"BerriSpend Logging - Enters logging function for model {model}" + ) url = f"https://berrispend.berri.ai/spend" - headers = { - 'Content-Type': 'application/json' - } + headers = {"Content-Type": "application/json"} - prompt_tokens_cost_usd_dollar, completion_tokens_cost_usd_dollar = self.price_calculator(model, response_obj, start_time, end_time) - total_cost = prompt_tokens_cost_usd_dollar + completion_tokens_cost_usd_dollar + ( + prompt_tokens_cost_usd_dollar, + completion_tokens_cost_usd_dollar, + ) = self.price_calculator(model, response_obj, start_time, end_time) + total_cost = ( + prompt_tokens_cost_usd_dollar + completion_tokens_cost_usd_dollar + ) - response_time = (end_time-start_time).total_seconds() + response_time = (end_time - 
start_time).total_seconds() if "response" in response_obj: - data = [{ - "response_time": response_time, - "model_id": response_obj["model"], - "total_cost": total_cost, - "messages": messages, - "response": response_obj['choices'][0]['message']['content'], - "account_id": self.account_id - }] + data = [ + { + "response_time": response_time, + "model_id": response_obj["model"], + "total_cost": total_cost, + "messages": messages, + "response": response_obj["choices"][0]["message"]["content"], + "account_id": self.account_id, + } + ] elif "error" in response_obj: - data = [{ - "response_time": response_time, - "model_id": response_obj["model"], - "total_cost": total_cost, - "messages": messages, - "error": response_obj['error'], - "account_id": self.account_id - }] + data = [ + { + "response_time": response_time, + "model_id": response_obj["model"], + "total_cost": total_cost, + "messages": messages, + "error": response_obj["error"], + "account_id": self.account_id, + } + ] print_verbose(f"BerriSpend Logging - final data object: {data}") response = requests.post(url, headers=headers, json=data) diff --git a/litellm/integrations/helicone.py b/litellm/integrations/helicone.py index 9e74b246f..f9dff85db 100644 --- a/litellm/integrations/helicone.py +++ b/litellm/integrations/helicone.py @@ -2,19 +2,24 @@ # On success, logs events to Helicone import dotenv, os import requests -dotenv.load_dotenv() # Loading env variables using dotenv + +dotenv.load_dotenv() # Loading env variables using dotenv import traceback + + class HeliconeLogger: # Class variables or attributes helicone_model_list = ["gpt", "claude"] + def __init__(self): # Instance variables self.provider_url = "https://api.openai.com/v1" - self.key = os.getenv('HELICONE_API_KEY') + self.key = os.getenv("HELICONE_API_KEY") def claude_mapping(self, model, messages, response_obj): from anthropic import HUMAN_PROMPT, AI_PROMPT - prompt = f"{HUMAN_PROMPT}" + + prompt = f"{HUMAN_PROMPT}" for message in messages: if "role" in message: if message["role"] == "user": @@ -26,48 +31,84 @@ class HeliconeLogger: prompt += f"{AI_PROMPT}" claude_provider_request = {"model": model, "prompt": prompt} - claude_response_obj = {"completion": response_obj['choices'][0]['message']['content'], "model": model, "stop_reason": "stop_sequence"} + claude_response_obj = { + "completion": response_obj["choices"][0]["message"]["content"], + "model": model, + "stop_reason": "stop_sequence", + } return claude_provider_request, claude_response_obj - - def log_success(self, model, messages, response_obj, start_time, end_time, print_verbose): + + def log_success( + self, model, messages, response_obj, start_time, end_time, print_verbose + ): # Method definition try: - print_verbose(f"Helicone Logging - Enters logging function for model {model}") - model = model if any(accepted_model in model for accepted_model in self.helicone_model_list) else "gpt-3.5-turbo" + print_verbose( + f"Helicone Logging - Enters logging function for model {model}" + ) + model = ( + model + if any( + accepted_model in model + for accepted_model in self.helicone_model_list + ) + else "gpt-3.5-turbo" + ) provider_request = {"model": model, "messages": messages} - if "claude" in model: - provider_request, response_obj = self.claude_mapping(model=model, messages=messages, response_obj=response_obj) + if "claude" in model: + provider_request, response_obj = self.claude_mapping( + model=model, messages=messages, response_obj=response_obj + ) providerResponse = { - "json": response_obj, - "headers": 
{"openai-version": "2020-10-01"}, - "status": 200 + "json": response_obj, + "headers": {"openai-version": "2020-10-01"}, + "status": 200, } # Code to be executed url = "https://api.hconeai.com/oai/v1/log" headers = { - 'Authorization': f'Bearer {self.key}', - 'Content-Type': 'application/json' + "Authorization": f"Bearer {self.key}", + "Content-Type": "application/json", } start_time_seconds = int(start_time.timestamp()) - start_time_milliseconds = int((start_time.timestamp() - start_time_seconds) * 1000) + start_time_milliseconds = int( + (start_time.timestamp() - start_time_seconds) * 1000 + ) end_time_seconds = int(end_time.timestamp()) - end_time_milliseconds = int((end_time.timestamp() - end_time_seconds) * 1000) + end_time_milliseconds = int( + (end_time.timestamp() - end_time_seconds) * 1000 + ) data = { - "providerRequest": {"url": self.provider_url, "json": provider_request, "meta": {"Helicone-Auth": f"Bearer {self.key}"}}, + "providerRequest": { + "url": self.provider_url, + "json": provider_request, + "meta": {"Helicone-Auth": f"Bearer {self.key}"}, + }, "providerResponse": providerResponse, - "timing": {"startTime": {"seconds": start_time_seconds, "milliseconds": start_time_milliseconds}, "endTime": {"seconds": end_time_seconds, "milliseconds": end_time_milliseconds}} # {"seconds": .., "milliseconds": ..} + "timing": { + "startTime": { + "seconds": start_time_seconds, + "milliseconds": start_time_milliseconds, + }, + "endTime": { + "seconds": end_time_seconds, + "milliseconds": end_time_milliseconds, + }, + }, # {"seconds": .., "milliseconds": ..} } response = requests.post(url, headers=headers, json=data) if response.status_code == 200: print_verbose("Helicone Logging - Success!") else: - print_verbose(f"Helicone Logging - Error Request was not successful. Status Code: {response.status_code}") + print_verbose( + f"Helicone Logging - Error Request was not successful. 
Status Code: {response.status_code}" + ) print_verbose(f"Helicone Logging - Error {response.text}") except: # traceback.print_exc() print_verbose(f"Helicone Logging Error - {traceback.format_exc()}") - pass \ No newline at end of file + pass diff --git a/litellm/integrations/supabase.py b/litellm/integrations/supabase.py index 1ac28763f..d27277589 100644 --- a/litellm/integrations/supabase.py +++ b/litellm/integrations/supabase.py @@ -3,31 +3,94 @@ import dotenv, os import requests -dotenv.load_dotenv() # Loading env variables using dotenv + +dotenv.load_dotenv() # Loading env variables using dotenv import traceback import datetime, subprocess, sys model_cost = { - "gpt-3.5-turbo": {"max_tokens": 4000, "input_cost_per_token": 0.0000015, "output_cost_per_token": 0.000002}, - "gpt-35-turbo": {"max_tokens": 4000, "input_cost_per_token": 0.0000015, "output_cost_per_token": 0.000002}, # azure model name - "gpt-3.5-turbo-0613": {"max_tokens": 4000, "input_cost_per_token": 0.0000015, "output_cost_per_token": 0.000002}, - "gpt-3.5-turbo-0301": {"max_tokens": 4000, "input_cost_per_token": 0.0000015, "output_cost_per_token": 0.000002}, - "gpt-3.5-turbo-16k": {"max_tokens": 16000, "input_cost_per_token": 0.000003, "output_cost_per_token": 0.000004}, - "gpt-35-turbo-16k": {"max_tokens": 16000, "input_cost_per_token": 0.000003, "output_cost_per_token": 0.000004}, # azure model name - "gpt-3.5-turbo-16k-0613": {"max_tokens": 16000, "input_cost_per_token": 0.000003, "output_cost_per_token": 0.000004}, - "gpt-4": {"max_tokens": 8000, "input_cost_per_token": 0.000003, "output_cost_per_token": 0.00006}, - "gpt-4-0613": {"max_tokens": 8000, "input_cost_per_token": 0.000003, "output_cost_per_token": 0.00006}, - "gpt-4-32k": {"max_tokens": 8000, "input_cost_per_token": 0.00006, "output_cost_per_token": 0.00012}, - "claude-instant-1": {"max_tokens": 100000, "input_cost_per_token": 0.00000163, "output_cost_per_token": 0.00000551}, - "claude-2": {"max_tokens": 100000, "input_cost_per_token": 0.00001102, "output_cost_per_token": 0.00003268}, - "text-bison-001": {"max_tokens": 8192, "input_cost_per_token": 0.000004, "output_cost_per_token": 0.000004}, - "chat-bison-001": {"max_tokens": 4096, "input_cost_per_token": 0.000002, "output_cost_per_token": 0.000002}, - "command-nightly": {"max_tokens": 4096, "input_cost_per_token": 0.000015, "output_cost_per_token": 0.000015}, + "gpt-3.5-turbo": { + "max_tokens": 4000, + "input_cost_per_token": 0.0000015, + "output_cost_per_token": 0.000002, + }, + "gpt-35-turbo": { + "max_tokens": 4000, + "input_cost_per_token": 0.0000015, + "output_cost_per_token": 0.000002, + }, # azure model name + "gpt-3.5-turbo-0613": { + "max_tokens": 4000, + "input_cost_per_token": 0.0000015, + "output_cost_per_token": 0.000002, + }, + "gpt-3.5-turbo-0301": { + "max_tokens": 4000, + "input_cost_per_token": 0.0000015, + "output_cost_per_token": 0.000002, + }, + "gpt-3.5-turbo-16k": { + "max_tokens": 16000, + "input_cost_per_token": 0.000003, + "output_cost_per_token": 0.000004, + }, + "gpt-35-turbo-16k": { + "max_tokens": 16000, + "input_cost_per_token": 0.000003, + "output_cost_per_token": 0.000004, + }, # azure model name + "gpt-3.5-turbo-16k-0613": { + "max_tokens": 16000, + "input_cost_per_token": 0.000003, + "output_cost_per_token": 0.000004, + }, + "gpt-4": { + "max_tokens": 8000, + "input_cost_per_token": 0.000003, + "output_cost_per_token": 0.00006, + }, + "gpt-4-0613": { + "max_tokens": 8000, + "input_cost_per_token": 0.000003, + "output_cost_per_token": 0.00006, + }, + "gpt-4-32k": { + 
"max_tokens": 8000, + "input_cost_per_token": 0.00006, + "output_cost_per_token": 0.00012, + }, + "claude-instant-1": { + "max_tokens": 100000, + "input_cost_per_token": 0.00000163, + "output_cost_per_token": 0.00000551, + }, + "claude-2": { + "max_tokens": 100000, + "input_cost_per_token": 0.00001102, + "output_cost_per_token": 0.00003268, + }, + "text-bison-001": { + "max_tokens": 8192, + "input_cost_per_token": 0.000004, + "output_cost_per_token": 0.000004, + }, + "chat-bison-001": { + "max_tokens": 4096, + "input_cost_per_token": 0.000002, + "output_cost_per_token": 0.000002, + }, + "command-nightly": { + "max_tokens": 4096, + "input_cost_per_token": 0.000015, + "output_cost_per_token": 0.000015, + }, } + class Supabase: # Class variables or attributes supabase_table_name = "request_logs" + def __init__(self): # Instance variables self.supabase_url = os.getenv("SUPABASE_URL") @@ -35,9 +98,11 @@ class Supabase: try: import supabase except ImportError: - subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'supabase']) + subprocess.check_call([sys.executable, "-m", "pip", "install", "supabase"]) import supabase - self.supabase_client = supabase.create_client(self.supabase_url, self.supabase_key) + self.supabase_client = supabase.create_client( + self.supabase_url, self.supabase_key + ) def price_calculator(self, model, response_obj, start_time, end_time): # try and find if the model is in the model_cost map @@ -45,17 +110,23 @@ class Supabase: prompt_tokens_cost_usd_dollar = 0 completion_tokens_cost_usd_dollar = 0 if model in model_cost: - prompt_tokens_cost_usd_dollar = model_cost[model]["input_cost_per_token"] * response_obj["usage"]["prompt_tokens"] - completion_tokens_cost_usd_dollar = model_cost[model]["output_cost_per_token"] * response_obj["usage"]["completion_tokens"] - elif "replicate" in model: + prompt_tokens_cost_usd_dollar = ( + model_cost[model]["input_cost_per_token"] + * response_obj["usage"]["prompt_tokens"] + ) + completion_tokens_cost_usd_dollar = ( + model_cost[model]["output_cost_per_token"] + * response_obj["usage"]["completion_tokens"] + ) + elif "replicate" in model: # replicate models are charged based on time # llama 2 runs on an nvidia a100 which costs $0.0032 per second - https://replicate.com/replicate/llama-2-70b-chat - model_run_time = end_time - start_time # assuming time in seconds + model_run_time = end_time - start_time # assuming time in seconds cost_usd_dollar = model_run_time * 0.0032 prompt_tokens_cost_usd_dollar = cost_usd_dollar / 2 completion_tokens_cost_usd_dollar = cost_usd_dollar / 2 else: - # calculate average input cost + # calculate average input cost input_cost_sum = 0 output_cost_sum = 0 for model in model_cost: @@ -63,41 +134,75 @@ class Supabase: output_cost_sum += model_cost[model]["output_cost_per_token"] avg_input_cost = input_cost_sum / len(model_cost.keys()) avg_output_cost = output_cost_sum / len(model_cost.keys()) - prompt_tokens_cost_usd_dollar = model_cost[model]["input_cost_per_token"] * response_obj["usage"]["prompt_tokens"] - completion_tokens_cost_usd_dollar = model_cost[model]["output_cost_per_token"] * response_obj["usage"]["completion_tokens"] + prompt_tokens_cost_usd_dollar = ( + model_cost[model]["input_cost_per_token"] + * response_obj["usage"]["prompt_tokens"] + ) + completion_tokens_cost_usd_dollar = ( + model_cost[model]["output_cost_per_token"] + * response_obj["usage"]["completion_tokens"] + ) return prompt_tokens_cost_usd_dollar, completion_tokens_cost_usd_dollar - - def log_event(self, model, 
messages, end_user, response_obj, start_time, end_time, print_verbose): + + def log_event( + self, + model, + messages, + end_user, + response_obj, + start_time, + end_time, + print_verbose, + ): try: - print_verbose(f"Supabase Logging - Enters logging function for model {model}, response_obj: {response_obj}") + print_verbose( + f"Supabase Logging - Enters logging function for model {model}, response_obj: {response_obj}" + ) - prompt_tokens_cost_usd_dollar, completion_tokens_cost_usd_dollar = self.price_calculator(model, response_obj, start_time, end_time) - total_cost = prompt_tokens_cost_usd_dollar + completion_tokens_cost_usd_dollar + ( + prompt_tokens_cost_usd_dollar, + completion_tokens_cost_usd_dollar, + ) = self.price_calculator(model, response_obj, start_time, end_time) + total_cost = ( + prompt_tokens_cost_usd_dollar + completion_tokens_cost_usd_dollar + ) - response_time = (end_time-start_time).total_seconds() + response_time = (end_time - start_time).total_seconds() if "choices" in response_obj: supabase_data_obj = { "response_time": response_time, "model": response_obj["model"], - "total_cost": total_cost, + "total_cost": total_cost, "messages": messages, - "response": response_obj['choices'][0]['message']['content'], - "end_user": end_user + "response": response_obj["choices"][0]["message"]["content"], + "end_user": end_user, } - print_verbose(f"Supabase Logging - final data object: {supabase_data_obj}") - data, count = self.supabase_client.table(self.supabase_table_name).insert(supabase_data_obj).execute() + print_verbose( + f"Supabase Logging - final data object: {supabase_data_obj}" + ) + data, count = ( + self.supabase_client.table(self.supabase_table_name) + .insert(supabase_data_obj) + .execute() + ) elif "error" in response_obj: supabase_data_obj = { "response_time": response_time, "model": response_obj["model"], - "total_cost": total_cost, + "total_cost": total_cost, "messages": messages, - "error": response_obj['error'], - "end_user": end_user + "error": response_obj["error"], + "end_user": end_user, } - print_verbose(f"Supabase Logging - final data object: {supabase_data_obj}") - data, count = self.supabase_client.table(self.supabase_table_name).insert(supabase_data_obj).execute() - + print_verbose( + f"Supabase Logging - final data object: {supabase_data_obj}" + ) + data, count = ( + self.supabase_client.table(self.supabase_table_name) + .insert(supabase_data_obj) + .execute() + ) + except: # traceback.print_exc() print_verbose(f"Supabase Logging Error - {traceback.format_exc()}") diff --git a/litellm/llms/__init__.py b/litellm/llms/__init__.py index b9742821a..b6e690fd5 100644 --- a/litellm/llms/__init__.py +++ b/litellm/llms/__init__.py @@ -1 +1 @@ -from . import * \ No newline at end of file +from . 
import * diff --git a/litellm/llms/anthropic.py b/litellm/llms/anthropic.py index 67666ee92..c90b61a11 100644 --- a/litellm/llms/anthropic.py +++ b/litellm/llms/anthropic.py @@ -2,54 +2,77 @@ import os, json from enum import Enum import requests from litellm import logging -import time +import time from typing import Callable from litellm.utils import ModelResponse + class AnthropicConstants(Enum): HUMAN_PROMPT = "\n\nHuman:" AI_PROMPT = "\n\nAssistant:" + class AnthropicError(Exception): def __init__(self, status_code, message): self.status_code = status_code self.message = message - super().__init__(self.message) # Call the base class constructor with the parameters it needs + super().__init__( + self.message + ) # Call the base class constructor with the parameters it needs -class AnthropicLLM: - + +class AnthropicLLM: def __init__(self, encoding, default_max_tokens_to_sample, api_key=None): self.encoding = encoding self.default_max_tokens_to_sample = default_max_tokens_to_sample self.completion_url = "https://api.anthropic.com/v1/complete" self.api_key = api_key self.validate_environment(api_key=api_key) - - def validate_environment(self, api_key): # set up the environment required to run the model + + def validate_environment( + self, api_key + ): # set up the environment required to run the model # set the api key if self.api_key == None: - raise ValueError("Missing Anthropic API Key - A call is being made to anthropic but no key is set either in the environment variables or via params") + raise ValueError( + "Missing Anthropic API Key - A call is being made to anthropic but no key is set either in the environment variables or via params" + ) self.api_key = api_key self.headers = { "accept": "application/json", "anthropic-version": "2023-06-01", "content-type": "application/json", - "x-api-key": self.api_key + "x-api-key": self.api_key, } - def completion(self, model: str, messages: list, model_response: ModelResponse, print_verbose: Callable, optional_params=None, litellm_params=None, logger_fn=None): # logic for parsing in - calling - parsing out model completion calls + def completion( + self, + model: str, + messages: list, + model_response: ModelResponse, + print_verbose: Callable, + optional_params=None, + litellm_params=None, + logger_fn=None, + ): # logic for parsing in - calling - parsing out model completion calls model = model prompt = f"{AnthropicConstants.HUMAN_PROMPT.value}" for message in messages: if "role" in message: if message["role"] == "user": - prompt += f"{AnthropicConstants.HUMAN_PROMPT.value}{message['content']}" + prompt += ( + f"{AnthropicConstants.HUMAN_PROMPT.value}{message['content']}" + ) else: - prompt += f"{AnthropicConstants.AI_PROMPT.value}{message['content']}" + prompt += ( + f"{AnthropicConstants.AI_PROMPT.value}{message['content']}" + ) else: prompt += f"{AnthropicConstants.HUMAN_PROMPT.value}{message['content']}" prompt += f"{AnthropicConstants.AI_PROMPT.value}" - if "max_tokens" in optional_params and optional_params["max_tokens"] != float('inf'): + if "max_tokens" in optional_params and optional_params["max_tokens"] != float( + "inf" + ): max_tokens = optional_params["max_tokens"] else: max_tokens = self.default_max_tokens_to_sample @@ -57,39 +80,66 @@ class AnthropicLLM: "model": model, "prompt": prompt, "max_tokens_to_sample": max_tokens, - **optional_params + **optional_params, } ## LOGGING - logging(model=model, input=prompt, additional_args={"litellm_params": litellm_params, "optional_params": optional_params}, logger_fn=logger_fn) + 
logging( + model=model, + input=prompt, + additional_args={ + "litellm_params": litellm_params, + "optional_params": optional_params, + }, + logger_fn=logger_fn, + ) ## COMPLETION CALL - response = requests.post(self.completion_url, headers=self.headers, data=json.dumps(data)) + response = requests.post( + self.completion_url, headers=self.headers, data=json.dumps(data) + ) if "stream" in optional_params and optional_params["stream"] == True: return response.iter_lines() else: ## LOGGING - logging(model=model, input=prompt, additional_args={"litellm_params": litellm_params, "optional_params": optional_params, "original_response": response.text}, logger_fn=logger_fn) + logging( + model=model, + input=prompt, + additional_args={ + "litellm_params": litellm_params, + "optional_params": optional_params, + "original_response": response.text, + }, + logger_fn=logger_fn, + ) print_verbose(f"raw model_response: {response.text}") ## RESPONSE OBJECT completion_response = response.json() if "error" in completion_response: - raise AnthropicError(message=completion_response["error"], status_code=response.status_code) + raise AnthropicError( + message=completion_response["error"], + status_code=response.status_code, + ) else: - model_response["choices"][0]["message"]["content"] = completion_response["completion"] - + model_response["choices"][0]["message"][ + "content" + ] = completion_response["completion"] + ## CALCULATING USAGE - prompt_tokens = len(self.encoding.encode(prompt)) ##[TODO] use the anthropic tokenizer here - completion_tokens = len(self.encoding.encode(model_response["choices"][0]["message"]["content"])) ##[TODO] use the anthropic tokenizer here - - + prompt_tokens = len( + self.encoding.encode(prompt) + ) ##[TODO] use the anthropic tokenizer here + completion_tokens = len( + self.encoding.encode(model_response["choices"][0]["message"]["content"]) + ) ##[TODO] use the anthropic tokenizer here + model_response["created"] = time.time() model_response["model"] = model model_response["usage"] = { "prompt_tokens": prompt_tokens, "completion_tokens": completion_tokens, - "total_tokens": prompt_tokens + completion_tokens - } + "total_tokens": prompt_tokens + completion_tokens, + } return model_response - - def embedding(): # logic for parsing in - calling - parsing out model embedding calls - pass \ No newline at end of file + + def embedding(): # logic for parsing in - calling - parsing out model embedding calls + pass diff --git a/litellm/llms/base.py b/litellm/llms/base.py index 368df9624..bde09f2fb 100644 --- a/litellm/llms/base.py +++ b/litellm/llms/base.py @@ -1,11 +1,12 @@ ## This is a template base class to be used for adding new LLM providers via API calls -class BaseLLM(): - def validate_environment(): # set up the environment required to run the model - pass - def completion(): # logic for parsing in - calling - parsing out model completion calls +class BaseLLM: + def validate_environment(): # set up the environment required to run the model pass - def embedding(): # logic for parsing in - calling - parsing out model embedding calls - pass \ No newline at end of file + def completion(): # logic for parsing in - calling - parsing out model completion calls + pass + + def embedding(): # logic for parsing in - calling - parsing out model embedding calls + pass diff --git a/litellm/llms/huggingface_restapi.py b/litellm/llms/huggingface_restapi.py index 30d67727f..974a6c049 100644 --- a/litellm/llms/huggingface_restapi.py +++ b/litellm/llms/huggingface_restapi.py @@ -3,31 +3,47 @@ import 
os, json from enum import Enum import requests from litellm import logging -import time +import time from typing import Callable from litellm.utils import ModelResponse + class HuggingfaceError(Exception): def __init__(self, status_code, message): self.status_code = status_code self.message = message - super().__init__(self.message) # Call the base class constructor with the parameters it needs + super().__init__( + self.message + ) # Call the base class constructor with the parameters it needs -class HuggingfaceRestAPILLM(): + +class HuggingfaceRestAPILLM: def __init__(self, encoding, api_key=None) -> None: self.encoding = encoding self.validate_environment(api_key=api_key) - def validate_environment(self, api_key): # set up the environment required to run the model + def validate_environment( + self, api_key + ): # set up the environment required to run the model self.headers = { "content-type": "application/json", } # get the api key if it exists in the environment or is passed in, but don't require it self.api_key = api_key if self.api_key != None: - self.headers["Authorization"] = f"Bearer {self.api_key}" + self.headers["Authorization"] = f"Bearer {self.api_key}" - def completion(self, model: str, messages: list, custom_api_base: str, model_response: ModelResponse, print_verbose: Callable, optional_params=None, litellm_params=None, logger_fn=None): # logic for parsing in - calling - parsing out model completion calls + def completion( + self, + model: str, + messages: list, + custom_api_base: str, + model_response: ModelResponse, + print_verbose: Callable, + optional_params=None, + litellm_params=None, + logger_fn=None, + ): # logic for parsing in - calling - parsing out model completion calls if custom_api_base: completion_url = custom_api_base elif "HF_API_BASE" in os.environ: @@ -35,7 +51,9 @@ class HuggingfaceRestAPILLM(): else: completion_url = f"https://api-inference.huggingface.co/models/{model}" prompt = "" - if "meta-llama" in model and "chat" in model: # use the required special tokens for meta-llama - https://huggingface.co/blog/llama2#how-to-prompt-llama-2 + if ( + "meta-llama" in model and "chat" in model + ): # use the required special tokens for meta-llama - https://huggingface.co/blog/llama2#how-to-prompt-llama-2 prompt = "" for message in messages: if message["role"] == "system": @@ -47,8 +65,8 @@ class HuggingfaceRestAPILLM(): else: for message in messages: prompt += f"{message['content']}" - ### MAP INPUT PARAMS - # max tokens + ### MAP INPUT PARAMS + # max tokens if "max_tokens" in optional_params: value = optional_params.pop("max_tokens") optional_params["max_new_tokens"] = value @@ -57,14 +75,33 @@ class HuggingfaceRestAPILLM(): # "parameters": optional_params } ## LOGGING - logging(model=model, input=prompt, additional_args={"litellm_params": litellm_params, "optional_params": optional_params}, logger_fn=logger_fn) + logging( + model=model, + input=prompt, + additional_args={ + "litellm_params": litellm_params, + "optional_params": optional_params, + }, + logger_fn=logger_fn, + ) ## COMPLETION CALL - response = requests.post(completion_url, headers=self.headers, data=json.dumps(data)) + response = requests.post( + completion_url, headers=self.headers, data=json.dumps(data) + ) if "stream" in optional_params and optional_params["stream"] == True: return response.iter_lines() else: ## LOGGING - logging(model=model, input=prompt, additional_args={"litellm_params": litellm_params, "optional_params": optional_params, "original_response": response.text}, 
logger_fn=logger_fn) + logging( + model=model, + input=prompt, + additional_args={ + "litellm_params": litellm_params, + "optional_params": optional_params, + "original_response": response.text, + }, + logger_fn=logger_fn, + ) print_verbose(f"raw model_response: {response.text}") ## RESPONSE OBJECT completion_response = response.json() @@ -72,24 +109,32 @@ class HuggingfaceRestAPILLM(): if isinstance(completion_response, dict) and "error" in completion_response: print_verbose(f"completion error: {completion_response['error']}") print_verbose(f"response.status_code: {response.status_code}") - raise HuggingfaceError(message=completion_response["error"], status_code=response.status_code) + raise HuggingfaceError( + message=completion_response["error"], + status_code=response.status_code, + ) else: - model_response["choices"][0]["message"]["content"] = completion_response[0]["generated_text"] - + model_response["choices"][0]["message"][ + "content" + ] = completion_response[0]["generated_text"] + ## CALCULATING USAGE - prompt_tokens = len(self.encoding.encode(prompt)) ##[TODO] use the llama2 tokenizer here - completion_tokens = len(self.encoding.encode(model_response["choices"][0]["message"]["content"])) ##[TODO] use the llama2 tokenizer here - - + prompt_tokens = len( + self.encoding.encode(prompt) + ) ##[TODO] use the llama2 tokenizer here + completion_tokens = len( + self.encoding.encode(model_response["choices"][0]["message"]["content"]) + ) ##[TODO] use the llama2 tokenizer here + model_response["created"] = time.time() model_response["model"] = model model_response["usage"] = { "prompt_tokens": prompt_tokens, "completion_tokens": completion_tokens, - "total_tokens": prompt_tokens + completion_tokens - } + "total_tokens": prompt_tokens + completion_tokens, + } return model_response pass - def embedding(): # logic for parsing in - calling - parsing out model embedding calls - pass \ No newline at end of file + def embedding(): # logic for parsing in - calling - parsing out model embedding calls + pass diff --git a/litellm/main.py b/litellm/main.py index 9a809b098..713a21ed6 100644 --- a/litellm/main.py +++ b/litellm/main.py @@ -4,533 +4,865 @@ from functools import partial import dotenv, traceback, random, asyncio, time from copy import deepcopy import litellm -from litellm import client, logging, exception_type, timeout, get_optional_params, get_litellm_params -from litellm.utils import get_secret, install_and_import, CustomStreamWrapper, read_config_args +from litellm import ( + client, + logging, + exception_type, + timeout, + get_optional_params, + get_litellm_params, +) +from litellm.utils import ( + get_secret, + install_and_import, + CustomStreamWrapper, + read_config_args, +) from .llms.anthropic import AnthropicLLM from .llms.huggingface_restapi import HuggingfaceRestAPILLM import tiktoken from concurrent.futures import ThreadPoolExecutor + encoding = tiktoken.get_encoding("cl100k_base") -from litellm.utils import get_secret, install_and_import, CustomStreamWrapper, ModelResponse, read_config_args -from litellm.utils import get_ollama_response_stream, stream_to_string, together_ai_completion_streaming +from litellm.utils import ( + get_secret, + install_and_import, + CustomStreamWrapper, + ModelResponse, + read_config_args, +) +from litellm.utils import ( + get_ollama_response_stream, + stream_to_string, + together_ai_completion_streaming, +) + ####### ENVIRONMENT VARIABLES ################### -dotenv.load_dotenv() # Loading env variables using dotenv +dotenv.load_dotenv() # 
Loading env variables using dotenv + + ####### COMPLETION ENDPOINTS ################ ############################################# async def acompletion(*args, **kwargs): - loop = asyncio.get_event_loop() - - # Use a partial function to pass your keyword arguments - func = partial(completion, *args, **kwargs) + loop = asyncio.get_event_loop() + + # Use a partial function to pass your keyword arguments + func = partial(completion, *args, **kwargs) + + # Call the synchronous function using run_in_executor + return await loop.run_in_executor(None, func) - # Call the synchronous function using run_in_executor - return await loop.run_in_executor(None, func) @client # @retry(wait=wait_random_exponential(min=1, max=60), stop=stop_after_attempt(2), reraise=True, retry_error_callback=lambda retry_state: setattr(retry_state.outcome, 'retry_variable', litellm.retry)) # retry call, turn this off by setting `litellm.retry = False` -@timeout(600) ## set timeouts, in case calls hang (e.g. Azure) - default is 60s, override with `force_timeout` +@timeout( + 600 +) ## set timeouts, in case calls hang (e.g. Azure) - default is 60s, override with `force_timeout` def completion( - model, messages,# required params + model, + messages, # required params # Optional OpenAI params: see https://platform.openai.com/docs/api-reference/chat/create - functions=[], function_call="", # optional params - temperature=1, top_p=1, n=1, stream=False, stop=None, max_tokens=float('inf'), - presence_penalty=0, frequency_penalty=0, logit_bias={}, user="", deployment_id=None, + functions=[], + function_call="", # optional params + temperature=1, + top_p=1, + n=1, + stream=False, + stop=None, + max_tokens=float("inf"), + presence_penalty=0, + frequency_penalty=0, + logit_bias={}, + user="", + deployment_id=None, # Optional liteLLM function params - *, return_async=False, api_key=None, force_timeout=600, logger_fn=None, verbose=False, azure=False, custom_llm_provider=None, custom_api_base=None, + *, + return_async=False, + api_key=None, + force_timeout=600, + logger_fn=None, + verbose=False, + azure=False, + custom_llm_provider=None, + custom_api_base=None, # model specific optional params # used by text-bison only - top_k=40, request_timeout=0, # unused var for old version of OpenAI API - ) -> ModelResponse: - try: - model_response = ModelResponse() - if azure: # this flag is deprecated, remove once notebooks are also updated. 
- custom_llm_provider="azure" - args = locals() - # check if user passed in any of the OpenAI optional params - optional_params = get_optional_params( - functions=functions, function_call=function_call, - temperature=temperature, top_p=top_p, n=n, stream=stream, stop=stop, max_tokens=max_tokens, - presence_penalty=presence_penalty, frequency_penalty=frequency_penalty, logit_bias=logit_bias, user=user, deployment_id=deployment_id, - # params to identify the model - model=model, custom_llm_provider=custom_llm_provider, top_k=top_k, - ) - # For logging - save the values of the litellm-specific params passed in - litellm_params = get_litellm_params( - return_async=return_async, api_key=api_key, force_timeout=force_timeout, - logger_fn=logger_fn, verbose=verbose, custom_llm_provider=custom_llm_provider, - custom_api_base=custom_api_base) - - if custom_llm_provider == "azure": - # azure configs - openai.api_type = "azure" - openai.api_base = litellm.api_base if litellm.api_base is not None else get_secret("AZURE_API_BASE") - openai.api_version = litellm.api_version if litellm.api_version is not None else get_secret("AZURE_API_VERSION") - # set key - openai.api_key = api_key or litellm.azure_key or get_secret("AZURE_API_KEY") - ## LOGGING - logging(model=model, input=messages, additional_args=optional_params, custom_llm_provider=custom_llm_provider, logger_fn=logger_fn) - ## COMPLETION CALL - if litellm.headers: - response = openai.ChatCompletion.create( - engine=model, - messages = messages, - headers = litellm.headers, - **optional_params, - ) - else: - response = openai.ChatCompletion.create( - model=model, - messages = messages, - **optional_params - ) - elif model in litellm.open_ai_chat_completion_models or custom_llm_provider == "custom_openai": # allow user to make an openai call with a custom base - openai.api_type = "openai" - # note: if a user sets a custom base - we should ensure this works - api_base = custom_api_base if custom_api_base is not None else litellm.api_base # allow for the setting of dynamic and stateful api-bases - openai.api_base = api_base if api_base is not None else "https://api.openai.com/v1" - openai.api_version = None - if litellm.organization: - openai.organization = litellm.organization - # set API KEY - openai.api_key = api_key or litellm.openai_key or get_secret("OPENAI_API_KEY") - - ## LOGGING - logging(model=model, input=messages, additional_args=args, custom_llm_provider=custom_llm_provider, logger_fn=logger_fn) - ## COMPLETION CALL - if litellm.headers: - response = openai.ChatCompletion.create( - model=model, - messages = messages, - headers = litellm.headers, - **optional_params - ) - else: - response = openai.ChatCompletion.create( - model=model, - messages = messages, - **optional_params - ) - elif model in litellm.open_ai_text_completion_models: - openai.api_type = "openai" - openai.api_base = litellm.api_base if litellm.api_base is not None else "https://api.openai.com/v1" - openai.api_version = None - openai.api_key = api_key or litellm.openai_key or get_secret("OPENAI_API_KEY") - if litellm.organization: - openai.organization = litellm.organization - prompt = " ".join([message["content"] for message in messages]) - ## LOGGING - logging(model=model, input=prompt, additional_args=optional_params, custom_llm_provider=custom_llm_provider, logger_fn=logger_fn) - ## COMPLETION CALL - if litellm.headers: - response = openai.Completion.create( - model=model, - prompt = prompt, - headers = litellm.headers, - ) - else: - response = openai.Completion.create( 
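As a usage note for the completion() branches reformatted in this hunk: the same entry point can be pointed at any OpenAI-compatible server via custom_llm_provider and custom_api_base. A minimal sketch, assuming an OpenAI key in the environment; the localhost URL is a placeholder and not part of this diff:

import litellm

# route an OpenAI-format call through a custom, OpenAI-compatible base URL
response = litellm.completion(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "Hello, how are you?"}],
    custom_llm_provider="custom_openai",
    custom_api_base="http://localhost:8000/v1",  # placeholder proxy endpoint
)
print(response["choices"][0]["message"]["content"])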
+ top_k=40, + request_timeout=0, # unused var for old version of OpenAI API +) -> ModelResponse: + try: + model_response = ModelResponse() + if azure: # this flag is deprecated, remove once notebooks are also updated. + custom_llm_provider = "azure" + args = locals() + # check if user passed in any of the OpenAI optional params + optional_params = get_optional_params( + functions=functions, + function_call=function_call, + temperature=temperature, + top_p=top_p, + n=n, + stream=stream, + stop=stop, + max_tokens=max_tokens, + presence_penalty=presence_penalty, + frequency_penalty=frequency_penalty, + logit_bias=logit_bias, + user=user, + deployment_id=deployment_id, + # params to identify the model model=model, - prompt = prompt + custom_llm_provider=custom_llm_provider, + top_k=top_k, ) - completion_response = response["choices"][0]["text"] - ## LOGGING - logging(model=model, input=prompt, custom_llm_provider=custom_llm_provider, additional_args={"max_tokens": max_tokens, "original_response": completion_response}, logger_fn=logger_fn) - ## RESPONSE OBJECT - model_response["choices"][0]["message"]["content"] = completion_response - model_response["created"] = response["created"] - model_response["model"] = model - model_response["usage"] = response["usage"] - response = model_response - elif "replicate" in model or custom_llm_provider == "replicate": - # import replicate/if it fails then pip install replicate - install_and_import("replicate") - import replicate - # Setting the relevant API KEY for replicate, replicate defaults to using os.environ.get("REPLICATE_API_TOKEN") - replicate_key = os.environ.get("REPLICATE_API_TOKEN") - if replicate_key == None: - # user did not set REPLICATE_API_TOKEN in .env - replicate_key = get_secret("REPLICATE_API_KEY") or get_secret("REPLICATE_API_TOKEN") or api_key or litellm.replicate_key - # set replicate kye - os.environ["REPLICATE_API_TOKEN"] = replicate_key - prompt = " ".join([message["content"] for message in messages]) - input = {"prompt": prompt} - if "max_tokens" in optional_params: - input["max_length"] = max_tokens # for t5 models - input["max_new_tokens"] = max_tokens # for llama2 models - ## LOGGING - logging(model=model, input=input, custom_llm_provider=custom_llm_provider, additional_args={"max_tokens": max_tokens}, logger_fn=logger_fn) - ## COMPLETION CALL - output = replicate.run( - model, - input=input) - if 'stream' in optional_params and optional_params['stream'] == True: - # don't try to access stream object, - # let the stream handler know this is replicate - response = CustomStreamWrapper(output, "replicate") - return response - response = "" - for item in output: - response += item - completion_response = response - ## LOGGING - logging(model=model, input=prompt, custom_llm_provider=custom_llm_provider, additional_args={"max_tokens": max_tokens, "original_response": completion_response}, logger_fn=logger_fn) - prompt_tokens = len(encoding.encode(prompt)) - completion_tokens = len(encoding.encode(completion_response)) - ## RESPONSE OBJECT - model_response["choices"][0]["message"]["content"] = completion_response - model_response["created"] = time.time() - model_response["model"] = model - model_response["usage"] = { - "prompt_tokens": prompt_tokens, - "completion_tokens": completion_tokens, - "total_tokens": prompt_tokens + completion_tokens - } - response = model_response - elif model in litellm.anthropic_models: - anthropic_key = api_key or litellm.anthropic_key or os.environ.get("ANTHROPIC_API_KEY") - anthropic_client = 
AnthropicLLM(encoding=encoding, default_max_tokens_to_sample=litellm.max_tokens, api_key=anthropic_key) - model_response = anthropic_client.completion(model=model, messages=messages, model_response=model_response, print_verbose=print_verbose, optional_params=optional_params, litellm_params=litellm_params, logger_fn=logger_fn) - if 'stream' in optional_params and optional_params['stream'] == True: - # don't try to access stream object, - response = CustomStreamWrapper(model_response, model) - return response - response = model_response - elif model in litellm.openrouter_models or custom_llm_provider == "openrouter": - openai.api_type = "openai" - # not sure if this will work after someone first uses another API - openai.api_base = litellm.api_base if litellm.api_base is not None else "https://openrouter.ai/api/v1" - openai.api_version = None - if litellm.organization: - openai.organization = litellm.organization - if api_key: - openai.api_key = api_key - elif litellm.openrouter_key: - openai.api_key = litellm.openrouter_key - else: - openai.api_key = get_secret("OPENROUTER_API_KEY") or get_secret("OR_API_KEY") - ## LOGGING - logging(model=model, input=messages, additional_args=optional_params, custom_llm_provider=custom_llm_provider, logger_fn=logger_fn) - ## COMPLETION CALL - if litellm.headers: - response = openai.ChatCompletion.create( - model=model, - messages = messages, - headers = litellm.headers, - **optional_params + # For logging - save the values of the litellm-specific params passed in + litellm_params = get_litellm_params( + return_async=return_async, + api_key=api_key, + force_timeout=force_timeout, + logger_fn=logger_fn, + verbose=verbose, + custom_llm_provider=custom_llm_provider, + custom_api_base=custom_api_base, ) - else: - openrouter_site_url = get_secret("OR_SITE_URL") - openrouter_app_name = get_secret("OR_APP_NAME") - # if openrouter_site_url is None, set it to https://litellm.ai - if openrouter_site_url is None: - openrouter_site_url = "https://litellm.ai" - # if openrouter_app_name is None, set it to liteLLM - if openrouter_app_name is None: - openrouter_app_name = "liteLLM" - response = openai.ChatCompletion.create( - model=model, - messages = messages, - headers = - { - "HTTP-Referer": openrouter_site_url, # To identify your site - "X-Title": openrouter_app_name # To identify your app - }, - **optional_params + + if custom_llm_provider == "azure": + # azure configs + openai.api_type = "azure" + openai.api_base = ( + litellm.api_base + if litellm.api_base is not None + else get_secret("AZURE_API_BASE") + ) + openai.api_version = ( + litellm.api_version + if litellm.api_version is not None + else get_secret("AZURE_API_VERSION") + ) + # set key + openai.api_key = api_key or litellm.azure_key or get_secret("AZURE_API_KEY") + ## LOGGING + logging( + model=model, + input=messages, + additional_args=optional_params, + custom_llm_provider=custom_llm_provider, + logger_fn=logger_fn, + ) + ## COMPLETION CALL + if litellm.headers: + response = openai.ChatCompletion.create( + engine=model, + messages=messages, + headers=litellm.headers, + **optional_params, + ) + else: + response = openai.ChatCompletion.create( + model=model, messages=messages, **optional_params + ) + elif ( + model in litellm.open_ai_chat_completion_models + or custom_llm_provider == "custom_openai" + ): # allow user to make an openai call with a custom base + openai.api_type = "openai" + # note: if a user sets a custom base - we should ensure this works + api_base = ( + custom_api_base if custom_api_base is 
not None else litellm.api_base + ) # allow for the setting of dynamic and stateful api-bases + openai.api_base = ( + api_base if api_base is not None else "https://api.openai.com/v1" + ) + openai.api_version = None + if litellm.organization: + openai.organization = litellm.organization + # set API KEY + openai.api_key = ( + api_key or litellm.openai_key or get_secret("OPENAI_API_KEY") + ) + + ## LOGGING + logging( + model=model, + input=messages, + additional_args=args, + custom_llm_provider=custom_llm_provider, + logger_fn=logger_fn, + ) + ## COMPLETION CALL + if litellm.headers: + response = openai.ChatCompletion.create( + model=model, + messages=messages, + headers=litellm.headers, + **optional_params, + ) + else: + response = openai.ChatCompletion.create( + model=model, messages=messages, **optional_params + ) + elif model in litellm.open_ai_text_completion_models: + openai.api_type = "openai" + openai.api_base = ( + litellm.api_base + if litellm.api_base is not None + else "https://api.openai.com/v1" + ) + openai.api_version = None + openai.api_key = ( + api_key or litellm.openai_key or get_secret("OPENAI_API_KEY") + ) + if litellm.organization: + openai.organization = litellm.organization + prompt = " ".join([message["content"] for message in messages]) + ## LOGGING + logging( + model=model, + input=prompt, + additional_args=optional_params, + custom_llm_provider=custom_llm_provider, + logger_fn=logger_fn, + ) + ## COMPLETION CALL + if litellm.headers: + response = openai.Completion.create( + model=model, + prompt=prompt, + headers=litellm.headers, + ) + else: + response = openai.Completion.create(model=model, prompt=prompt) + completion_response = response["choices"][0]["text"] + ## LOGGING + logging( + model=model, + input=prompt, + custom_llm_provider=custom_llm_provider, + additional_args={ + "max_tokens": max_tokens, + "original_response": completion_response, + }, + logger_fn=logger_fn, + ) + ## RESPONSE OBJECT + model_response["choices"][0]["message"]["content"] = completion_response + model_response["created"] = response["created"] + model_response["model"] = model + model_response["usage"] = response["usage"] + response = model_response + elif "replicate" in model or custom_llm_provider == "replicate": + # import replicate/if it fails then pip install replicate + install_and_import("replicate") + import replicate + + # Setting the relevant API KEY for replicate, replicate defaults to using os.environ.get("REPLICATE_API_TOKEN") + replicate_key = os.environ.get("REPLICATE_API_TOKEN") + if replicate_key == None: + # user did not set REPLICATE_API_TOKEN in .env + replicate_key = ( + get_secret("REPLICATE_API_KEY") + or get_secret("REPLICATE_API_TOKEN") + or api_key + or litellm.replicate_key + ) + # set replicate kye + os.environ["REPLICATE_API_TOKEN"] = replicate_key + prompt = " ".join([message["content"] for message in messages]) + input = {"prompt": prompt} + if "max_tokens" in optional_params: + input["max_length"] = max_tokens # for t5 models + input["max_new_tokens"] = max_tokens # for llama2 models + ## LOGGING + logging( + model=model, + input=input, + custom_llm_provider=custom_llm_provider, + additional_args={"max_tokens": max_tokens}, + logger_fn=logger_fn, + ) + ## COMPLETION CALL + output = replicate.run(model, input=input) + if "stream" in optional_params and optional_params["stream"] == True: + # don't try to access stream object, + # let the stream handler know this is replicate + response = CustomStreamWrapper(output, "replicate") + return response + response = 
"" + for item in output: + response += item + completion_response = response + ## LOGGING + logging( + model=model, + input=prompt, + custom_llm_provider=custom_llm_provider, + additional_args={ + "max_tokens": max_tokens, + "original_response": completion_response, + }, + logger_fn=logger_fn, + ) + prompt_tokens = len(encoding.encode(prompt)) + completion_tokens = len(encoding.encode(completion_response)) + ## RESPONSE OBJECT + model_response["choices"][0]["message"]["content"] = completion_response + model_response["created"] = time.time() + model_response["model"] = model + model_response["usage"] = { + "prompt_tokens": prompt_tokens, + "completion_tokens": completion_tokens, + "total_tokens": prompt_tokens + completion_tokens, + } + response = model_response + elif model in litellm.anthropic_models: + anthropic_key = ( + api_key or litellm.anthropic_key or os.environ.get("ANTHROPIC_API_KEY") + ) + anthropic_client = AnthropicLLM( + encoding=encoding, + default_max_tokens_to_sample=litellm.max_tokens, + api_key=anthropic_key, + ) + model_response = anthropic_client.completion( + model=model, + messages=messages, + model_response=model_response, + print_verbose=print_verbose, + optional_params=optional_params, + litellm_params=litellm_params, + logger_fn=logger_fn, + ) + if "stream" in optional_params and optional_params["stream"] == True: + # don't try to access stream object, + response = CustomStreamWrapper(model_response, model) + return response + response = model_response + elif model in litellm.openrouter_models or custom_llm_provider == "openrouter": + openai.api_type = "openai" + # not sure if this will work after someone first uses another API + openai.api_base = ( + litellm.api_base + if litellm.api_base is not None + else "https://openrouter.ai/api/v1" + ) + openai.api_version = None + if litellm.organization: + openai.organization = litellm.organization + if api_key: + openai.api_key = api_key + elif litellm.openrouter_key: + openai.api_key = litellm.openrouter_key + else: + openai.api_key = get_secret("OPENROUTER_API_KEY") or get_secret( + "OR_API_KEY" + ) + ## LOGGING + logging( + model=model, + input=messages, + additional_args=optional_params, + custom_llm_provider=custom_llm_provider, + logger_fn=logger_fn, + ) + ## COMPLETION CALL + if litellm.headers: + response = openai.ChatCompletion.create( + model=model, + messages=messages, + headers=litellm.headers, + **optional_params, + ) + else: + openrouter_site_url = get_secret("OR_SITE_URL") + openrouter_app_name = get_secret("OR_APP_NAME") + # if openrouter_site_url is None, set it to https://litellm.ai + if openrouter_site_url is None: + openrouter_site_url = "https://litellm.ai" + # if openrouter_app_name is None, set it to liteLLM + if openrouter_app_name is None: + openrouter_app_name = "liteLLM" + response = openai.ChatCompletion.create( + model=model, + messages=messages, + headers={ + "HTTP-Referer": openrouter_site_url, # To identify your site + "X-Title": openrouter_app_name, # To identify your app + }, + **optional_params, + ) + elif model in litellm.cohere_models: + # import cohere/if it fails then pip install cohere + install_and_import("cohere") + import cohere + + cohere_key = ( + api_key + or litellm.cohere_key + or get_secret("COHERE_API_KEY") + or get_secret("CO_API_KEY") + ) + co = cohere.Client(cohere_key) + prompt = " ".join([message["content"] for message in messages]) + ## LOGGING + logging( + model=model, + input=prompt, + custom_llm_provider=custom_llm_provider, + logger_fn=logger_fn, + ) + ## 
COMPLETION CALL + response = co.generate(model=model, prompt=prompt, **optional_params) + if "stream" in optional_params and optional_params["stream"] == True: + # don't try to access stream object, + response = CustomStreamWrapper(response, model) + return response + + completion_response = response[0].text + ## LOGGING + logging( + model=model, + input=prompt, + custom_llm_provider=custom_llm_provider, + additional_args={ + "max_tokens": max_tokens, + "original_response": completion_response, + }, + logger_fn=logger_fn, + ) + prompt_tokens = len(encoding.encode(prompt)) + completion_tokens = len(encoding.encode(completion_response)) + ## RESPONSE OBJECT + model_response["choices"][0]["message"]["content"] = completion_response + model_response["created"] = time.time() + model_response["model"] = model + model_response["usage"] = { + "prompt_tokens": prompt_tokens, + "completion_tokens": completion_tokens, + "total_tokens": prompt_tokens + completion_tokens, + } + response = model_response + elif ( + model in litellm.huggingface_models or custom_llm_provider == "huggingface" + ): + custom_llm_provider = "huggingface" + huggingface_key = ( + api_key + or litellm.huggingface_key + or os.environ.get("HF_TOKEN") + or os.environ.get("HUGGINGFACE_API_KEY") + ) + huggingface_client = HuggingfaceRestAPILLM( + encoding=encoding, api_key=huggingface_key + ) + model_response = huggingface_client.completion( + model=model, + messages=messages, + custom_api_base=custom_api_base, + model_response=model_response, + print_verbose=print_verbose, + optional_params=optional_params, + litellm_params=litellm_params, + logger_fn=logger_fn, + ) + if "stream" in optional_params and optional_params["stream"] == True: + # don't try to access stream object, + response = CustomStreamWrapper( + model_response, model, custom_llm_provider="huggingface" + ) + return response + response = model_response + elif custom_llm_provider == "together_ai" or ("togethercomputer" in model): + import requests + + TOGETHER_AI_TOKEN = ( + get_secret("TOGETHER_AI_TOKEN") + or get_secret("TOGETHERAI_API_KEY") + or api_key + or litellm.togetherai_api_key + ) + headers = {"Authorization": f"Bearer {TOGETHER_AI_TOKEN}"} + endpoint = "https://api.together.xyz/inference" + prompt = " ".join( + [message["content"] for message in messages] + ) # TODO: Add chat support for together AI + + ## LOGGING + logging( + model=model, + input=prompt, + custom_llm_provider=custom_llm_provider, + logger_fn=logger_fn, + ) + if stream == True: + return together_ai_completion_streaming( + { + "model": model, + "prompt": prompt, + "request_type": "language-model-inference", + **optional_params, + }, + headers=headers, + ) + res = requests.post( + endpoint, + json={ + "model": model, + "prompt": prompt, + "request_type": "language-model-inference", + **optional_params, + }, + headers=headers, + ) + ## LOGGING + logging( + model=model, + input=prompt, + custom_llm_provider=custom_llm_provider, + additional_args={ + "max_tokens": max_tokens, + "original_response": res.text, + }, + logger_fn=logger_fn, + ) + + # make this safe for reading, if output does not exist raise an error + json_response = res.json() + if "output" not in json_response: + raise Exception( + f"liteLLM: Error Making TogetherAI request, JSON Response {json_response}" + ) + completion_response = json_response["output"]["choices"][0]["text"] + prompt_tokens = len(encoding.encode(prompt)) + completion_tokens = len(encoding.encode(completion_response)) + ## RESPONSE OBJECT + 
model_response["choices"][0]["message"]["content"] = completion_response + model_response["created"] = time.time() + model_response["model"] = model + model_response["usage"] = { + "prompt_tokens": prompt_tokens, + "completion_tokens": completion_tokens, + "total_tokens": prompt_tokens + completion_tokens, + } + response = model_response + elif model in litellm.vertex_chat_models: + # import vertexai/if it fails then pip install vertexai# import cohere/if it fails then pip install cohere + install_and_import("vertexai") + import vertexai + from vertexai.preview.language_models import ChatModel, InputOutputTextPair + + vertexai.init( + project=litellm.vertex_project, location=litellm.vertex_location + ) + # vertexai does not use an API key, it looks for credentials.json in the environment + + prompt = " ".join([message["content"] for message in messages]) + ## LOGGING + logging( + model=model, + input=prompt, + custom_llm_provider=custom_llm_provider, + additional_args={ + "litellm_params": litellm_params, + "optional_params": optional_params, + }, + logger_fn=logger_fn, + ) + + chat_model = ChatModel.from_pretrained(model) + + chat = chat_model.start_chat() + completion_response = chat.send_message(prompt, **optional_params) + + ## LOGGING + logging( + model=model, + input=prompt, + custom_llm_provider=custom_llm_provider, + additional_args={ + "max_tokens": max_tokens, + "original_response": completion_response, + }, + logger_fn=logger_fn, + ) + + ## RESPONSE OBJECT + model_response["choices"][0]["message"]["content"] = completion_response + model_response["created"] = time.time() + model_response["model"] = model + elif model in litellm.vertex_text_models: + # import vertexai/if it fails then pip install vertexai# import cohere/if it fails then pip install cohere + install_and_import("vertexai") + import vertexai + from vertexai.language_models import TextGenerationModel + + vertexai.init( + project=litellm.vertex_project, location=litellm.vertex_location + ) + # vertexai does not use an API key, it looks for credentials.json in the environment + + prompt = " ".join([message["content"] for message in messages]) + ## LOGGING + logging( + model=model, + input=prompt, + custom_llm_provider=custom_llm_provider, + logger_fn=logger_fn, + ) + vertex_model = TextGenerationModel.from_pretrained(model) + completion_response = vertex_model.predict(prompt, **optional_params) + + ## LOGGING + logging( + model=model, + input=prompt, + custom_llm_provider=custom_llm_provider, + additional_args={ + "max_tokens": max_tokens, + "original_response": completion_response, + }, + logger_fn=logger_fn, + ) + + ## RESPONSE OBJECT + model_response["choices"][0]["message"]["content"] = completion_response + model_response["created"] = time.time() + model_response["model"] = model + response = model_response + elif model in litellm.ai21_models: + install_and_import("ai21") + import ai21 + + ai21.api_key = get_secret("AI21_API_KEY") + + prompt = " ".join([message["content"] for message in messages]) + ## LOGGING + logging( + model=model, + input=prompt, + custom_llm_provider=custom_llm_provider, + logger_fn=logger_fn, + ) + + ai21_response = ai21.Completion.execute( + model=model, + prompt=prompt, + ) + completion_response = ai21_response["completions"][0]["data"]["text"] + + ## LOGGING + logging( + model=model, + input=prompt, + custom_llm_provider=custom_llm_provider, + additional_args={ + "max_tokens": max_tokens, + "original_response": completion_response, + }, + logger_fn=logger_fn, + ) + + ## RESPONSE OBJECT + 
model_response["choices"][0]["message"]["content"] = completion_response + model_response["created"] = time.time() + model_response["model"] = model + response = model_response + elif custom_llm_provider == "ollama": + endpoint = ( + litellm.api_base if litellm.api_base is not None else custom_api_base + ) + prompt = " ".join([message["content"] for message in messages]) + + ## LOGGING + logging(model=model, input=prompt, azure=azure, logger_fn=logger_fn) + generator = get_ollama_response_stream(endpoint, model, prompt) + # assume all responses are streamed + return generator + elif ( + custom_llm_provider == "baseten" + or litellm.api_base == "https://app.baseten.co" + ): + import baseten + + base_ten_key = get_secret("BASETEN_API_KEY") + baseten.login(base_ten_key) + + prompt = " ".join([message["content"] for message in messages]) + ## LOGGING + logging( + model=model, + input=prompt, + custom_llm_provider=custom_llm_provider, + logger_fn=logger_fn, + ) + + base_ten__model = baseten.deployed_model_version_id(model) + + completion_response = base_ten__model.predict({"prompt": prompt}) + if type(completion_response) == dict: + completion_response = completion_response["data"] + if type(completion_response) == dict: + completion_response = completion_response["generated_text"] + + logging( + model=model, + input=prompt, + custom_llm_provider=custom_llm_provider, + additional_args={ + "max_tokens": max_tokens, + "original_response": completion_response, + }, + logger_fn=logger_fn, + ) + + ## RESPONSE OBJECT + model_response["choices"][0]["message"]["content"] = completion_response + model_response["created"] = time.time() + model_response["model"] = model + response = model_response + + elif custom_llm_provider == "petals" or ( + litellm.api_base and "chat.petals.dev" in litellm.api_base + ): + url = "https://chat.petals.dev/api/v1/generate" + import requests + + prompt = " ".join([message["content"] for message in messages]) + + ## LOGGING + logging( + model=model, + input=prompt, + custom_llm_provider=custom_llm_provider, + logger_fn=logger_fn, + ) + response = requests.post( + url, data={"inputs": prompt, "max_new_tokens": 100, "model": model} + ) + ## LOGGING + logging( + model=model, + input=prompt, + custom_llm_provider=custom_llm_provider, + additional_args={ + "max_tokens": max_tokens, + "original_response": response, + }, + logger_fn=logger_fn, + ) + completion_response = response.json()["outputs"] + + # RESPONSE OBJECT + model_response["choices"][0]["message"]["content"] = completion_response + model_response["created"] = time.time() + model_response["model"] = model + response = model_response + else: + ## LOGGING + logging( + model=model, + input=messages, + custom_llm_provider=custom_llm_provider, + logger_fn=logger_fn, + ) + args = locals() + raise ValueError( + f"Unable to map your input to a model. 
Check your input - {args}" + ) + return response + except Exception as e: + ## LOGGING + logging( + model=model, + input=messages, + custom_llm_provider=custom_llm_provider, + additional_args={"max_tokens": max_tokens}, + logger_fn=logger_fn, + exception=e, + ) + ## Map to OpenAI Exception + raise exception_type( + model=model, custom_llm_provider=custom_llm_provider, original_exception=e ) - elif model in litellm.cohere_models: - # import cohere/if it fails then pip install cohere - install_and_import("cohere") - import cohere - cohere_key = api_key or litellm.cohere_key or get_secret("COHERE_API_KEY") or get_secret("CO_API_KEY") - co = cohere.Client(cohere_key) - prompt = " ".join([message["content"] for message in messages]) - ## LOGGING - logging(model=model, input=prompt, custom_llm_provider=custom_llm_provider, logger_fn=logger_fn) - ## COMPLETION CALL - response = co.generate( - model=model, - prompt = prompt, - **optional_params - ) - if 'stream' in optional_params and optional_params['stream'] == True: - # don't try to access stream object, - response = CustomStreamWrapper(response, model) - return response - completion_response = response[0].text - ## LOGGING - logging(model=model, input=prompt, custom_llm_provider=custom_llm_provider, additional_args={"max_tokens": max_tokens, "original_response": completion_response}, logger_fn=logger_fn) - prompt_tokens = len(encoding.encode(prompt)) - completion_tokens = len(encoding.encode(completion_response)) - ## RESPONSE OBJECT - model_response["choices"][0]["message"]["content"] = completion_response - model_response["created"] = time.time() - model_response["model"] = model - model_response["usage"] = { - "prompt_tokens": prompt_tokens, - "completion_tokens": completion_tokens, - "total_tokens": prompt_tokens + completion_tokens - } - response = model_response - elif model in litellm.huggingface_models or custom_llm_provider == "huggingface": - custom_llm_provider = "huggingface" - huggingface_key = api_key or litellm.huggingface_key or os.environ.get("HF_TOKEN") or os.environ.get("HUGGINGFACE_API_KEY") - huggingface_client = HuggingfaceRestAPILLM(encoding=encoding, api_key=huggingface_key) - model_response = huggingface_client.completion(model=model, messages=messages, custom_api_base=custom_api_base, model_response=model_response, print_verbose=print_verbose, optional_params=optional_params, litellm_params=litellm_params, logger_fn=logger_fn) - if 'stream' in optional_params and optional_params['stream'] == True: - # don't try to access stream object, - response = CustomStreamWrapper(model_response, model, custom_llm_provider="huggingface") - return response - response = model_response - elif custom_llm_provider == "together_ai" or ("togethercomputer" in model): - import requests - TOGETHER_AI_TOKEN = get_secret("TOGETHER_AI_TOKEN") or get_secret("TOGETHERAI_API_KEY") or api_key or litellm.togetherai_api_key - headers = {"Authorization": f"Bearer {TOGETHER_AI_TOKEN}"} - endpoint = 'https://api.together.xyz/inference' - prompt = " ".join([message["content"] for message in messages]) # TODO: Add chat support for together AI - - ## LOGGING - logging(model=model, input=prompt, custom_llm_provider=custom_llm_provider, logger_fn=logger_fn) - if stream == True: - return together_ai_completion_streaming({ - "model": model, - "prompt": prompt, - "request_type": "language-model-inference", - **optional_params - }, - headers=headers) - res = requests.post(endpoint, json={ - "model": model, - "prompt": prompt, - "request_type": 
"language-model-inference", - **optional_params - }, - headers=headers - ) - ## LOGGING - logging(model=model, input=prompt, custom_llm_provider=custom_llm_provider, additional_args={"max_tokens": max_tokens, "original_response": res.text}, logger_fn=logger_fn) - - # make this safe for reading, if output does not exist raise an error - json_response = res.json() - if "output" not in json_response: - raise Exception(f"liteLLM: Error Making TogetherAI request, JSON Response {json_response}") - completion_response = json_response['output']['choices'][0]['text'] - prompt_tokens = len(encoding.encode(prompt)) - completion_tokens = len(encoding.encode(completion_response)) - ## RESPONSE OBJECT - model_response["choices"][0]["message"]["content"] = completion_response - model_response["created"] = time.time() - model_response["model"] = model - model_response["usage"] = { - "prompt_tokens": prompt_tokens, - "completion_tokens": completion_tokens, - "total_tokens": prompt_tokens + completion_tokens - } - response = model_response - elif model in litellm.vertex_chat_models: - # import vertexai/if it fails then pip install vertexai# import cohere/if it fails then pip install cohere - install_and_import("vertexai") - import vertexai - from vertexai.preview.language_models import ChatModel, InputOutputTextPair - vertexai.init(project=litellm.vertex_project, location=litellm.vertex_location) - # vertexai does not use an API key, it looks for credentials.json in the environment - - prompt = " ".join([message["content"] for message in messages]) - ## LOGGING - logging(model=model, input=prompt, custom_llm_provider=custom_llm_provider, additional_args={"litellm_params": litellm_params, "optional_params": optional_params}, logger_fn=logger_fn) - - chat_model = ChatModel.from_pretrained(model) - - - chat = chat_model.start_chat() - completion_response = chat.send_message(prompt, **optional_params) - - ## LOGGING - logging(model=model, input=prompt, custom_llm_provider=custom_llm_provider, additional_args={"max_tokens": max_tokens, "original_response": completion_response}, logger_fn=logger_fn) - - ## RESPONSE OBJECT - model_response["choices"][0]["message"]["content"] = completion_response - model_response["created"] = time.time() - model_response["model"] = model - elif model in litellm.vertex_text_models: - # import vertexai/if it fails then pip install vertexai# import cohere/if it fails then pip install cohere - install_and_import("vertexai") - import vertexai - from vertexai.language_models import TextGenerationModel - - vertexai.init(project=litellm.vertex_project, location=litellm.vertex_location) - # vertexai does not use an API key, it looks for credentials.json in the environment - - prompt = " ".join([message["content"] for message in messages]) - ## LOGGING - logging(model=model, input=prompt, custom_llm_provider=custom_llm_provider, logger_fn=logger_fn) - vertex_model = TextGenerationModel.from_pretrained(model) - completion_response= vertex_model.predict(prompt, **optional_params) - - ## LOGGING - logging(model=model, input=prompt, custom_llm_provider=custom_llm_provider, additional_args={"max_tokens": max_tokens, "original_response": completion_response}, logger_fn=logger_fn) - - ## RESPONSE OBJECT - model_response["choices"][0]["message"]["content"] = completion_response - model_response["created"] = time.time() - model_response["model"] = model - response = model_response - elif model in litellm.ai21_models: - install_and_import("ai21") - import ai21 - ai21.api_key = 
get_secret("AI21_API_KEY") - - prompt = " ".join([message["content"] for message in messages]) - ## LOGGING - logging(model=model, input=prompt, custom_llm_provider=custom_llm_provider, logger_fn=logger_fn) - - ai21_response = ai21.Completion.execute( - model=model, - prompt=prompt, - ) - completion_response = ai21_response['completions'][0]['data']['text'] - - ## LOGGING - logging(model=model, input=prompt, custom_llm_provider=custom_llm_provider, additional_args={"max_tokens": max_tokens, "original_response": completion_response}, logger_fn=logger_fn) - - ## RESPONSE OBJECT - model_response["choices"][0]["message"]["content"] = completion_response - model_response["created"] = time.time() - model_response["model"] = model - response = model_response - elif custom_llm_provider == "ollama": - endpoint = litellm.api_base if litellm.api_base is not None else custom_api_base - prompt = " ".join([message["content"] for message in messages]) - - ## LOGGING - logging(model=model, input=prompt, azure=azure, logger_fn=logger_fn) - generator = get_ollama_response_stream(endpoint, model, prompt) - # assume all responses are streamed - return generator - elif custom_llm_provider == "baseten" or litellm.api_base=="https://app.baseten.co": - import baseten - base_ten_key = get_secret('BASETEN_API_KEY') - baseten.login(base_ten_key) - - prompt = " ".join([message["content"] for message in messages]) - ## LOGGING - logging(model=model, input=prompt, custom_llm_provider=custom_llm_provider, logger_fn=logger_fn) - - base_ten__model = baseten.deployed_model_version_id(model) - - completion_response = base_ten__model.predict({"prompt": prompt}) - if type(completion_response) == dict: - completion_response = completion_response["data"] - if type(completion_response) == dict: - completion_response = completion_response["generated_text"] - - logging(model=model, input=prompt, custom_llm_provider=custom_llm_provider, additional_args={"max_tokens": max_tokens, "original_response": completion_response}, logger_fn=logger_fn) - - ## RESPONSE OBJECT - model_response["choices"][0]["message"]["content"] = completion_response - model_response["created"] = time.time() - model_response["model"] = model - response = model_response - - elif custom_llm_provider == "petals" or (litellm.api_base and "chat.petals.dev" in litellm.api_base): - url = "https://chat.petals.dev/api/v1/generate" - import requests - prompt = " ".join([message["content"] for message in messages]) - - ## LOGGING - logging(model=model, input=prompt, custom_llm_provider=custom_llm_provider, logger_fn=logger_fn) - response = requests.post(url, data={"inputs": prompt, "max_new_tokens": 100, "model": model}) - ## LOGGING - logging(model=model, input=prompt, custom_llm_provider=custom_llm_provider, additional_args={"max_tokens": max_tokens, "original_response": response}, logger_fn=logger_fn) - completion_response = response.json()["outputs"] - - # RESPONSE OBJECT - model_response["choices"][0]["message"]["content"] = completion_response - model_response["created"] = time.time() - model_response["model"] = model - response = model_response - else: - ## LOGGING - logging(model=model, input=messages, custom_llm_provider=custom_llm_provider, logger_fn=logger_fn) - args = locals() - raise ValueError(f"Unable to map your input to a model. 
Check your input - {args}") - return response - except Exception as e: - ## LOGGING - logging(model=model, input=messages, custom_llm_provider=custom_llm_provider, additional_args={"max_tokens": max_tokens}, logger_fn=logger_fn, exception=e) - ## Map to OpenAI Exception - raise exception_type(model=model, custom_llm_provider=custom_llm_provider, original_exception=e) def batch_completion(*args, **kwargs): - batch_messages = args[1] if len(args) > 1 else kwargs.get("messages") - completions = [] - with ThreadPoolExecutor() as executor: - for message_list in batch_messages: - if len(args) > 1: - args_modified = list(args) - args_modified[1] = message_list - future = executor.submit(completion, *args_modified) - else: - kwargs_modified = dict(kwargs) - kwargs_modified["messages"] = message_list - future = executor.submit(completion, *args, **kwargs_modified) - completions.append(future) - - # Retrieve the results from the futures - results = [future.result() for future in completions] - return results + batch_messages = args[1] if len(args) > 1 else kwargs.get("messages") + completions = [] + with ThreadPoolExecutor() as executor: + for message_list in batch_messages: + if len(args) > 1: + args_modified = list(args) + args_modified[1] = message_list + future = executor.submit(completion, *args_modified) + else: + kwargs_modified = dict(kwargs) + kwargs_modified["messages"] = message_list + future = executor.submit(completion, *args, **kwargs_modified) + completions.append(future) + + # Retrieve the results from the futures + results = [future.result() for future in completions] + return results + ### EMBEDDING ENDPOINTS #################### @client -@timeout(60) ## set timeouts, in case calls hang (e.g. Azure) - default is 60s, override with `force_timeout` +@timeout( + 60 +) ## set timeouts, in case calls hang (e.g. 
Azure) - default is 60s, override with `force_timeout` def embedding(model, input=[], azure=False, force_timeout=60, logger_fn=None): - try: - response = None - if azure == True: - # azure configs - openai.api_type = "azure" - openai.api_base = get_secret("AZURE_API_BASE") - openai.api_version = get_secret("AZURE_API_VERSION") - openai.api_key = get_secret("AZURE_API_KEY") - ## LOGGING - logging(model=model, input=input, azure=azure, logger_fn=logger_fn) - ## EMBEDDING CALL - response = openai.Embedding.create(input=input, engine=model) - print_verbose(f"response_value: {str(response)[:50]}") - elif model in litellm.open_ai_embedding_models: - openai.api_type = "openai" - openai.api_base = "https://api.openai.com/v1" - openai.api_version = None - openai.api_key = get_secret("OPENAI_API_KEY") - ## LOGGING - logging(model=model, input=input, azure=azure, logger_fn=logger_fn) - ## EMBEDDING CALL - response = openai.Embedding.create(input=input, model=model) - print_verbose(f"response_value: {str(response)[:50]}") - else: - logging(model=model, input=input, azure=azure, logger_fn=logger_fn) - args = locals() - raise ValueError(f"No valid embedding model args passed in - {args}") - - return response - except Exception as e: - # log the original exception - logging(model=model, input=input, azure=azure, logger_fn=logger_fn, exception=e) - ## Map to OpenAI Exception - raise exception_type(model=model, original_exception=e) - raise e + try: + response = None + if azure == True: + # azure configs + openai.api_type = "azure" + openai.api_base = get_secret("AZURE_API_BASE") + openai.api_version = get_secret("AZURE_API_VERSION") + openai.api_key = get_secret("AZURE_API_KEY") + ## LOGGING + logging(model=model, input=input, azure=azure, logger_fn=logger_fn) + ## EMBEDDING CALL + response = openai.Embedding.create(input=input, engine=model) + print_verbose(f"response_value: {str(response)[:50]}") + elif model in litellm.open_ai_embedding_models: + openai.api_type = "openai" + openai.api_base = "https://api.openai.com/v1" + openai.api_version = None + openai.api_key = get_secret("OPENAI_API_KEY") + ## LOGGING + logging(model=model, input=input, azure=azure, logger_fn=logger_fn) + ## EMBEDDING CALL + response = openai.Embedding.create(input=input, model=model) + print_verbose(f"response_value: {str(response)[:50]}") + else: + logging(model=model, input=input, azure=azure, logger_fn=logger_fn) + args = locals() + raise ValueError(f"No valid embedding model args passed in - {args}") + + return response + except Exception as e: + # log the original exception + logging(model=model, input=input, azure=azure, logger_fn=logger_fn, exception=e) + ## Map to OpenAI Exception + raise exception_type(model=model, original_exception=e) + raise e + + ####### HELPER FUNCTIONS ################ -## Set verbose to true -> ```litellm.set_verbose = True``` +## Set verbose to true -> ```litellm.set_verbose = True``` def print_verbose(print_statement): - if litellm.set_verbose: - print(f"LiteLLM: {print_statement}") - if random.random() <= 0.3: - print("Get help - https://discord.com/invite/wuPM9dRgDw") + if litellm.set_verbose: + print(f"LiteLLM: {print_statement}") + if random.random() <= 0.3: + print("Get help - https://discord.com/invite/wuPM9dRgDw") + def config_completion(**kwargs): - if litellm.config_path != None: - config_args = read_config_args(litellm.config_path) - # overwrite any args passed in with config args - return completion(**kwargs, **config_args) - else: - raise ValueError("No config path set, please set a 
config path using `litellm.config_path = 'path/to/config.json'`") \ No newline at end of file + if litellm.config_path != None: + config_args = read_config_args(litellm.config_path) + # overwrite any args passed in with config args + return completion(**kwargs, **config_args) + else: + raise ValueError( + "No config path set, please set a config path using `litellm.config_path = 'path/to/config.json'`" + ) diff --git a/litellm/testing.py b/litellm/testing.py index 2442bab7c..3e3ce286e 100644 --- a/litellm/testing.py +++ b/litellm/testing.py @@ -1,53 +1,82 @@ import litellm -import time +import time from concurrent.futures import ThreadPoolExecutor import traceback + def testing_batch_completion(*args, **kwargs): - try: - batch_models = args[0] if len(args) > 0 else kwargs.pop("models") ## expected input format- ["gpt-3.5-turbo", {"model": "qvv0xeq", "custom_llm_provider"="baseten"}...] - batch_messages = args[1] if len(args) > 1 else kwargs.pop("messages") - results = [] - completions = [] - exceptions = [] - times = [] - with ThreadPoolExecutor() as executor: - for model in batch_models: - kwargs_modified = dict(kwargs) - args_modified = list(args) - if len(args) > 0: - args_modified[0] = model["model"] - else: - kwargs_modified["model"] = model["model"] if isinstance(model, dict) and "model" in model else model # if model is a dictionary get it's value else assume it's a string - kwargs_modified["custom_llm_provider"] = model["custom_llm_provider"] if isinstance(model, dict) and "custom_llm_provider" in model else None - kwargs_modified["custom_api_base"] = model["custom_api_base"] if isinstance(model, dict) and "custom_api_base" in model else None - for message_list in batch_messages: - if len(args) > 1: - args_modified[1] = message_list - future = executor.submit(litellm.completion, *args_modified, **kwargs_modified) + try: + batch_models = ( + args[0] if len(args) > 0 else kwargs.pop("models") + ) ## expected input format- ["gpt-3.5-turbo", {"model": "qvv0xeq", "custom_llm_provider"="baseten"}...] 
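To make the expected input format noted above concrete, a small illustrative call; the model ids mirror the placeholders in that comment and are not real deployments, and the import path is inferred from the litellm/testing.py module location:

from litellm.testing import testing_batch_completion

models = ["gpt-3.5-turbo", {"model": "qvv0xeq", "custom_llm_provider": "baseten"}]
messages = [[{"role": "user", "content": "Hey, how's it going?"}]]  # one message list per prompt
results = testing_batch_completion(models=models, messages=messages)
# each result entry reports a status, the response or exception, and the elapsed time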
+ batch_messages = args[1] if len(args) > 1 else kwargs.pop("messages") + results = [] + completions = [] + exceptions = [] + times = [] + with ThreadPoolExecutor() as executor: + for model in batch_models: + kwargs_modified = dict(kwargs) + args_modified = list(args) + if len(args) > 0: + args_modified[0] = model["model"] else: - kwargs_modified["messages"] = message_list - future = executor.submit(litellm.completion, *args_modified, **kwargs_modified) - completions.append((future, message_list)) - - # Retrieve the results and calculate elapsed time for each completion call - for completion in completions: - future, message_list = completion - start_time = time.time() - try: - result = future.result() - end_time = time.time() - elapsed_time = end_time - start_time - result_dict = {"status": "succeeded", "response": future.result(), "prompt": message_list, "response_time": elapsed_time} - results.append(result_dict) - except Exception as e: - end_time = time.time() - elapsed_time = end_time - start_time - result_dict = {"status": "failed", "response": e, "response_time": elapsed_time} - results.append(result_dict) - return results - except: - traceback.print_exc() + kwargs_modified["model"] = ( + model["model"] + if isinstance(model, dict) and "model" in model + else model + ) # if model is a dictionary get it's value else assume it's a string + kwargs_modified["custom_llm_provider"] = ( + model["custom_llm_provider"] + if isinstance(model, dict) and "custom_llm_provider" in model + else None + ) + kwargs_modified["custom_api_base"] = ( + model["custom_api_base"] + if isinstance(model, dict) and "custom_api_base" in model + else None + ) + for message_list in batch_messages: + if len(args) > 1: + args_modified[1] = message_list + future = executor.submit( + litellm.completion, *args_modified, **kwargs_modified + ) + else: + kwargs_modified["messages"] = message_list + future = executor.submit( + litellm.completion, *args_modified, **kwargs_modified + ) + completions.append((future, message_list)) + + # Retrieve the results and calculate elapsed time for each completion call + for completion in completions: + future, message_list = completion + start_time = time.time() + try: + result = future.result() + end_time = time.time() + elapsed_time = end_time - start_time + result_dict = { + "status": "succeeded", + "response": future.result(), + "prompt": message_list, + "response_time": elapsed_time, + } + results.append(result_dict) + except Exception as e: + end_time = time.time() + elapsed_time = end_time - start_time + result_dict = { + "status": "failed", + "response": e, + "response_time": elapsed_time, + } + results.append(result_dict) + return results + except: + traceback.print_exc() + def duration_test_model(original_function): def wrapper_function(*args, **kwargs): @@ -70,22 +99,39 @@ def duration_test_model(original_function): # Return the wrapper function return wrapper_function + @duration_test_model def load_test_model(models: list, prompt: str = None, num_calls: int = None): - test_calls = 100 - if num_calls: - test_calls = num_calls - input_prompt = prompt if prompt else "Hey, how's it going?" 
- messages = [{"role": "user", "content": prompt}] if prompt else [{"role": "user", "content": input_prompt}] - full_message_list = [messages for _ in range(test_calls)] # call it as many times as set by user to load test models - start_time = time.time() - try: - results = testing_batch_completion(models=models, messages=full_message_list) - end_time = time.time() - response_time = end_time - start_time - return {"total_response_time": response_time, "calls_made": test_calls, "prompt": input_prompt, "results": results} - except Exception as e: - traceback.print_exc() - end_time = time.time() - response_time = end_time - start_time - return {"total_response_time": response_time, "calls_made": test_calls, "prompt": input_prompt, "exception": e} \ No newline at end of file + test_calls = 100 + if num_calls: + test_calls = num_calls + input_prompt = prompt if prompt else "Hey, how's it going?" + messages = ( + [{"role": "user", "content": prompt}] + if prompt + else [{"role": "user", "content": input_prompt}] + ) + full_message_list = [ + messages for _ in range(test_calls) + ] # call it as many times as set by user to load test models + start_time = time.time() + try: + results = testing_batch_completion(models=models, messages=full_message_list) + end_time = time.time() + response_time = end_time - start_time + return { + "total_response_time": response_time, + "calls_made": test_calls, + "prompt": input_prompt, + "results": results, + } + except Exception as e: + traceback.print_exc() + end_time = time.time() + response_time = end_time - start_time + return { + "total_response_time": response_time, + "calls_made": test_calls, + "prompt": input_prompt, + "exception": e, + } diff --git a/litellm/tests/test_api_key_param.py b/litellm/tests/test_api_key_param.py index 6213730f5..cebcb1a37 100644 --- a/litellm/tests/test_api_key_param.py +++ b/litellm/tests/test_api_key_param.py @@ -3,27 +3,37 @@ import sys, os import traceback -sys.path.insert(0, os.path.abspath('../..')) # Adds the parent directory to the system path + +sys.path.insert( + 0, os.path.abspath("../..") +) # Adds the parent directory to the system path import litellm from litellm import embedding, completion litellm.set_verbose = False + def logger_fn(model_call_object: dict): print(f"model call details: {model_call_object}") + user_message = "Hello, how are you?" 
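Before the dynamic-key tests below, a hedged sketch of the pattern they exercise: the key is passed per call instead of being read implicitly from the environment.

import os
import litellm

response = litellm.completion(
    model="claude-instant-1",
    messages=[{"content": "Hello, how are you?", "role": "user"}],
    api_key=os.environ.get("ANTHROPIC_API_KEY"),  # per-call key takes precedence over env lookup
)
print(response)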
-messages = [{ "content": user_message,"role": "user"}] +messages = [{"content": user_message, "role": "user"}] ## Test 1: Setting key dynamically temp_key = os.environ.get("ANTHROPIC_API_KEY") os.environ["ANTHROPIC_API_KEY"] = "bad-key" -# test on openai completion call +# test on openai completion call try: - response = completion(model="claude-instant-1", messages=messages, logger_fn=logger_fn, api_key=temp_key) + response = completion( + model="claude-instant-1", + messages=messages, + logger_fn=logger_fn, + api_key=temp_key, + ) print(f"response: {response}") except: - print(f"error occurred: {traceback.format_exc()}") + print(f"error occurred: {traceback.format_exc()}") pass os.environ["ANTHROPIC_API_KEY"] = temp_key @@ -31,11 +41,13 @@ os.environ["ANTHROPIC_API_KEY"] = temp_key ## Test 2: Setting key via __init__ params litellm.anthropic_key = os.environ.get("ANTHROPIC_API_KEY") os.environ.pop("ANTHROPIC_API_KEY") -# test on openai completion call +# test on openai completion call try: - response = completion(model="claude-instant-1", messages=messages, logger_fn=logger_fn) + response = completion( + model="claude-instant-1", messages=messages, logger_fn=logger_fn + ) print(f"response: {response}") except: - print(f"error occurred: {traceback.format_exc()}") + print(f"error occurred: {traceback.format_exc()}") pass os.environ["ANTHROPIC_API_KEY"] = temp_key diff --git a/litellm/tests/test_async_fn.py b/litellm/tests/test_async_fn.py index b0925c4b5..c20c5cde6 100644 --- a/litellm/tests/test_async_fn.py +++ b/litellm/tests/test_async_fn.py @@ -5,17 +5,22 @@ import sys, os import pytest import traceback import asyncio -sys.path.insert(0, os.path.abspath('../..')) # Adds the parent directory to the system path + +sys.path.insert( + 0, os.path.abspath("../..") +) # Adds the parent directory to the system path from litellm import acompletion + async def test_get_response(): user_message = "Hello, how are you?" - messages = [{ "content": user_message,"role": "user"}] + messages = [{"content": user_message, "role": "user"}] try: response = await acompletion(model="gpt-3.5-turbo", messages=messages) except Exception as e: pytest.fail(f"error occurred: {e}") return response + response = asyncio.run(test_get_response()) -print(response) \ No newline at end of file +print(response) diff --git a/litellm/tests/test_bad_params.py b/litellm/tests/test_bad_params.py index 0a2313c78..71cbffe56 100644 --- a/litellm/tests/test_bad_params.py +++ b/litellm/tests/test_bad_params.py @@ -1,16 +1,17 @@ #### What this tests #### # This tests chaos monkeys - if random parts of the system are broken / things aren't sent correctly - what happens. -# Expect to add more edge cases to this over time. +# Expect to add more edge cases to this over time. import sys, os import traceback from dotenv import load_dotenv + load_dotenv() # Get the current directory of the script current_dir = os.path.dirname(os.path.abspath(__file__)) # Get the parent directory by joining the current directory with '..' -parent_dir = os.path.join(current_dir, '../..') +parent_dir = os.path.join(current_dir, "../..") # Add the parent directory to the system path sys.path.append(parent_dir) @@ -26,7 +27,7 @@ litellm.failure_callback = ["slack", "sentry", "posthog"] user_message = "Hello, how are you?" 
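A short sketch of the failure path these chaos tests probe; passing api_key directly is an illustrative variant of the env-var swap used below, and the mapped exception is expected rather than a crash:

import litellm

try:
    litellm.completion(
        model="gpt-3.5-turbo",
        messages=[{"content": "Hello, how are you?", "role": "user"}],
        api_key="bad-key",  # deliberately invalid; expect a mapped provider exception
    )
except Exception as e:
    print(f"error occurred: {e}")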
-messages = [{ "content": user_message,"role": "user"}] +messages = [{"content": user_message, "role": "user"}] model_val = None @@ -35,18 +36,18 @@ def test_completion_with_empty_model(): try: response = completion(model=model_val, messages=messages) except Exception as e: - print(f"error occurred: {e}") + print(f"error occurred: {e}") pass -#bad key +# bad key temp_key = os.environ.get("OPENAI_API_KEY") os.environ["OPENAI_API_KEY"] = "bad-key" -# test on openai completion call +# test on openai completion call try: response = completion(model="gpt-3.5-turbo", messages=messages) print(f"response: {response}") except: - print(f"error occurred: {traceback.format_exc()}") + print(f"error occurred: {traceback.format_exc()}") pass -os.environ["OPENAI_API_KEY"] = temp_key \ No newline at end of file +os.environ["OPENAI_API_KEY"] = temp_key diff --git a/litellm/tests/test_batch_completions.py b/litellm/tests/test_batch_completions.py index d15628f56..a136351ba 100644 --- a/litellm/tests/test_batch_completions.py +++ b/litellm/tests/test_batch_completions.py @@ -3,7 +3,10 @@ import sys, os import traceback -sys.path.insert(0, os.path.abspath('../..')) # Adds the parent directory to the system path + +sys.path.insert( + 0, os.path.abspath("../..") +) # Adds the parent directory to the system path import litellm from litellm import batch_completion @@ -14,4 +17,4 @@ model = "gpt-3.5-turbo" result = batch_completion(model=model, messages=messages) print(result) -print(len(result)) \ No newline at end of file +print(len(result)) diff --git a/litellm/tests/test_berrispend_integration.py b/litellm/tests/test_berrispend_integration.py index 122c9201d..500285b85 100644 --- a/litellm/tests/test_berrispend_integration.py +++ b/litellm/tests/test_berrispend_integration.py @@ -19,7 +19,7 @@ # #openai call -# response = completion(model="gpt-3.5-turbo", messages=[{"role": "user", "content": "Hi 👋 - i'm openai"}]) +# response = completion(model="gpt-3.5-turbo", messages=[{"role": "user", "content": "Hi 👋 - i'm openai"}]) # #bad request call -# response = completion(model="chatgpt-test", messages=[{"role": "user", "content": "Hi 👋 - i'm a bad request"}]) \ No newline at end of file +# response = completion(model="chatgpt-test", messages=[{"role": "user", "content": "Hi 👋 - i'm a bad request"}]) diff --git a/litellm/tests/test_caching.py b/litellm/tests/test_caching.py index c6500c557..5d7e962cf 100644 --- a/litellm/tests/test_caching.py +++ b/litellm/tests/test_caching.py @@ -1,9 +1,13 @@ import sys, os import traceback from dotenv import load_dotenv + load_dotenv() import os -sys.path.insert(0, os.path.abspath('../..')) # Adds the parent directory to the system path + +sys.path.insert( + 0, os.path.abspath("../..") +) # Adds the parent directory to the system path import pytest import litellm from litellm import embedding, completion @@ -12,7 +16,6 @@ litellm.caching = True messages = [{"role": "user", "content": "who is ishaan Github? 
"}] - # test if response cached def test_caching(): try: @@ -27,9 +30,5 @@ def test_caching(): pytest.fail(f"Error occurred: {e}") except Exception as e: litellm.caching = False - print(f"error occurred: {traceback.format_exc()}") + print(f"error occurred: {traceback.format_exc()}") pytest.fail(f"Error occurred: {e}") - - - - diff --git a/litellm/tests/test_client.py b/litellm/tests/test_client.py index 3c591d4cd..f29ae5a94 100644 --- a/litellm/tests/test_client.py +++ b/litellm/tests/test_client.py @@ -5,7 +5,9 @@ import sys, os import traceback import pytest -sys.path.insert(0, os.path.abspath('../..')) # Adds the parent directory to the system path +sys.path.insert( + 0, os.path.abspath("../..") +) # Adds the parent directory to the system path import litellm from litellm import embedding, completion @@ -14,17 +16,22 @@ litellm.failure_callback = ["slack", "sentry", "posthog"] litellm.set_verbose = True + def logger_fn(model_call_object: dict): # print(f"model call details: {model_call_object}") pass + user_message = "Hello, how are you?" -messages = [{ "content": user_message,"role": "user"}] +messages = [{"content": user_message, "role": "user"}] + def test_completion_openai(): try: print("running query") - response = completion(model="gpt-3.5-turbo", messages=messages, logger_fn=logger_fn) + response = completion( + model="gpt-3.5-turbo", messages=messages, logger_fn=logger_fn + ) print(f"response: {response}") # Add any assertions here to check the response except Exception as e: @@ -34,33 +41,46 @@ def test_completion_openai(): def test_completion_claude(): try: - response = completion(model="claude-instant-1", messages=messages, logger_fn=logger_fn) + response = completion( + model="claude-instant-1", messages=messages, logger_fn=logger_fn + ) # Add any assertions here to check the response except Exception as e: pytest.fail(f"Error occurred: {e}") + def test_completion_non_openai(): try: - response = completion(model="command-nightly", messages=messages, logger_fn=logger_fn) + response = completion( + model="command-nightly", messages=messages, logger_fn=logger_fn + ) # Add any assertions here to check the response except Exception as e: pytest.fail(f"Error occurred: {e}") + def test_embedding_openai(): try: - response = embedding(model='text-embedding-ada-002', input=[user_message], logger_fn=logger_fn) + response = embedding( + model="text-embedding-ada-002", input=[user_message], logger_fn=logger_fn + ) # Add any assertions here to check the response print(f"response: {str(response)[:50]}") except Exception as e: pytest.fail(f"Error occurred: {e}") + def test_bad_azure_embedding(): try: - response = embedding(model='chatgpt-test', input=[user_message], logger_fn=logger_fn) + response = embedding( + model="chatgpt-test", input=[user_message], logger_fn=logger_fn + ) # Add any assertions here to check the response print(f"response: {str(response)[:50]}") except Exception as e: pass + + # def test_good_azure_embedding(): # try: # response = embedding(model='azure-embedding-model', input=[user_message], azure=True, logger_fn=logger_fn) @@ -68,4 +88,3 @@ def test_bad_azure_embedding(): # print(f"response: {str(response)[:50]}") # except Exception as e: # pytest.fail(f"Error occurred: {e}") - diff --git a/litellm/tests/test_completion.py b/litellm/tests/test_completion.py index 00054d6a8..370668afb 100644 --- a/litellm/tests/test_completion.py +++ b/litellm/tests/test_completion.py @@ -1,44 +1,58 @@ import sys, os import traceback from dotenv import load_dotenv + load_dotenv() 
import os -sys.path.insert(0, os.path.abspath('../..')) # Adds the parent directory to the system path + +sys.path.insert( + 0, os.path.abspath("../..") +) # Adds the parent directory to the system path import pytest import litellm from litellm import embedding, completion + # from infisical import InfisicalClient # litellm.set_verbose = True # litellm.secret_manager_client = InfisicalClient(token=os.environ["INFISICAL_TOKEN"]) user_message = "Hello, whats the weather in San Francisco??" -messages = [{ "content": user_message,"role": "user"}] +messages = [{"content": user_message, "role": "user"}] + def logger_fn(user_model_dict): print(f"user_model_dict: {user_model_dict}") + def test_completion_claude(): try: - response = completion(model="claude-instant-1", messages=messages, logger_fn=logger_fn) + response = completion( + model="claude-instant-1", messages=messages, logger_fn=logger_fn + ) # Add any assertions here to check the response print(response) except Exception as e: pytest.fail(f"Error occurred: {e}") + def test_completion_claude_stream(): try: messages = [ {"role": "system", "content": "You are a helpful assistant."}, - {"role": "user", "content": "how does a court case get to the Supreme Court?"} + { + "role": "user", + "content": "how does a court case get to the Supreme Court?", + }, ] response = completion(model="claude-2", messages=messages, stream=True) # Add any assertions here to check the response for chunk in response: - print(chunk['choices'][0]['delta']) # same as openai format + print(chunk["choices"][0]["delta"]) # same as openai format except Exception as e: pytest.fail(f"Error occurred: {e}") + # def test_completion_hf_api(): # try: # user_message = "write some code to find the sum of two numbers" @@ -62,10 +76,12 @@ def test_completion_claude_stream(): def test_completion_cohere(): try: - response = completion(model="command-nightly", messages=messages, max_tokens=100) + response = completion( + model="command-nightly", messages=messages, max_tokens=100 + ) # Add any assertions here to check the response print(response) - response_str = response['choices'][0]['message']['content'] + response_str = response["choices"][0]["message"]["content"] print(f"str response{response_str}") response_str_2 = response.choices[0].message.content if type(response_str) != str: @@ -75,24 +91,31 @@ def test_completion_cohere(): except Exception as e: pytest.fail(f"Error occurred: {e}") + def test_completion_cohere_stream(): try: messages = [ {"role": "system", "content": "You are a helpful assistant."}, - {"role": "user", "content": "how does a court case get to the Supreme Court?"} + { + "role": "user", + "content": "how does a court case get to the Supreme Court?", + }, ] - response = completion(model="command-nightly", messages=messages, stream=True, max_tokens=50) + response = completion( + model="command-nightly", messages=messages, stream=True, max_tokens=50 + ) # Add any assertions here to check the response for chunk in response: - print(chunk['choices'][0]['delta']) # same as openai format + print(chunk["choices"][0]["delta"]) # same as openai format except Exception as e: pytest.fail(f"Error occurred: {e}") + def test_completion_openai(): try: response = completion(model="gpt-3.5-turbo", messages=messages) - response_str = response['choices'][0]['message']['content'] + response_str = response["choices"][0]["message"]["content"] response_str_2 = response.choices[0].message.content assert response_str == response_str_2 assert type(response_str) == str @@ -100,6 +123,7 @@ 
def test_completion_openai(): except Exception as e: pytest.fail(f"Error occurred: {e}") + def test_completion_text_openai(): try: response = completion(model="text-davinci-003", messages=messages) @@ -108,17 +132,31 @@ def test_completion_text_openai(): except Exception as e: pytest.fail(f"Error occurred: {e}") + def test_completion_openai_with_optional_params(): try: - response = completion(model="gpt-3.5-turbo", messages=messages, temperature=0.5, top_p=0.1, user="ishaan_dev@berri.ai") + response = completion( + model="gpt-3.5-turbo", + messages=messages, + temperature=0.5, + top_p=0.1, + user="ishaan_dev@berri.ai", + ) # Add any assertions here to check the response print(response) except Exception as e: pytest.fail(f"Error occurred: {e}") + def test_completion_openrouter(): try: - response = completion(model="google/palm-2-chat-bison", messages=messages, temperature=0.5, top_p=0.1, user="ishaan_dev@berri.ai") + response = completion( + model="google/palm-2-chat-bison", + messages=messages, + temperature=0.5, + top_p=0.1, + user="ishaan_dev@berri.ai", + ) # Add any assertions here to check the response print(response) except Exception as e: @@ -127,12 +165,23 @@ def test_completion_openrouter(): def test_completion_openai_with_more_optional_params(): try: - response = completion(model="gpt-3.5-turbo", messages=messages, temperature=0.5, top_p=0.1, n=2, max_tokens=150, presence_penalty=0.5, frequency_penalty=-0.5, logit_bias={123: 5}, user="ishaan_dev@berri.ai") + response = completion( + model="gpt-3.5-turbo", + messages=messages, + temperature=0.5, + top_p=0.1, + n=2, + max_tokens=150, + presence_penalty=0.5, + frequency_penalty=-0.5, + logit_bias={123: 5}, + user="ishaan_dev@berri.ai", + ) # Add any assertions here to check the response print(response) - response_str = response['choices'][0]['message']['content'] + response_str = response["choices"][0]["message"]["content"] response_str_2 = response.choices[0].message.content - print(response['choices'][0]['message']['content']) + print(response["choices"][0]["message"]["content"]) print(response.choices[0].message.content) if type(response_str) != str: pytest.fail(f"Error occurred: {e}") @@ -141,14 +190,28 @@ def test_completion_openai_with_more_optional_params(): except Exception as e: pytest.fail(f"Error occurred: {e}") + def test_completion_openai_with_stream(): try: - response = completion(model="gpt-3.5-turbo", messages=messages, temperature=0.5, top_p=0.1, n=2, max_tokens=150, presence_penalty=0.5, stream=True, frequency_penalty=-0.5, logit_bias={27000: 5}, user="ishaan_dev@berri.ai") + response = completion( + model="gpt-3.5-turbo", + messages=messages, + temperature=0.5, + top_p=0.1, + n=2, + max_tokens=150, + presence_penalty=0.5, + stream=True, + frequency_penalty=-0.5, + logit_bias={27000: 5}, + user="ishaan_dev@berri.ai", + ) # Add any assertions here to check the response print(response) except Exception as e: pytest.fail(f"Error occurred: {e}") + def test_completion_openai_with_functions(): function1 = [ { @@ -159,33 +222,39 @@ def test_completion_openai_with_functions(): "properties": { "location": { "type": "string", - "description": "The city and state, e.g. San Francisco, CA" + "description": "The city and state, e.g. 
San Francisco, CA", }, - "unit": { - "type": "string", - "enum": ["celsius", "fahrenheit"] - } + "unit": {"type": "string", "enum": ["celsius", "fahrenheit"]}, }, - "required": ["location"] - } + "required": ["location"], + }, } ] try: - response = completion(model="gpt-3.5-turbo", messages=messages, functions=function1) + response = completion( + model="gpt-3.5-turbo", messages=messages, functions=function1 + ) # Add any assertions here to check the response print(response) except Exception as e: pytest.fail(f"Error occurred: {e}") + def test_completion_azure(): try: - response = completion(model="gpt-3.5-turbo", deployment_id="chatgpt-test", messages=messages, custom_llm_provider="azure") + response = completion( + model="gpt-3.5-turbo", + deployment_id="chatgpt-test", + messages=messages, + custom_llm_provider="azure", + ) # Add any assertions here to check the response print(response) except Exception as e: pytest.fail(f"Error occurred: {e}") -# Replicate API endpoints are unstable -> throw random CUDA errors -> this means our tests can fail even if our tests weren't incorrect. + +# Replicate API endpoints are unstable -> throw random CUDA errors -> this means our tests can fail even if our tests weren't incorrect. def test_completion_replicate_llama_stream(): model_name = "replicate/llama-2-70b-chat:2c1608e18606fad2812020dc541930f2d0495ce32eee50074220b87300bc16e1" try: @@ -197,23 +266,32 @@ def test_completion_replicate_llama_stream(): except Exception as e: pytest.fail(f"Error occurred: {e}") + def test_completion_replicate_stability_stream(): model_name = "stability-ai/stablelm-tuned-alpha-7b:c49dae362cbaecd2ceabb5bd34fdb68413c4ff775111fea065d259d577757beb" try: - response = completion(model=model_name, messages=messages, stream=True, custom_llm_provider="replicate") + response = completion( + model=model_name, + messages=messages, + stream=True, + custom_llm_provider="replicate", + ) # Add any assertions here to check the response for chunk in response: - print(chunk['choices'][0]['delta']) + print(chunk["choices"][0]["delta"]) print(response) except Exception as e: pytest.fail(f"Error occurred: {e}") + def test_completion_replicate_stability(): model_name = "stability-ai/stablelm-tuned-alpha-7b:c49dae362cbaecd2ceabb5bd34fdb68413c4ff775111fea065d259d577757beb" try: - response = completion(model=model_name, messages=messages, custom_llm_provider="replicate") + response = completion( + model=model_name, messages=messages, custom_llm_provider="replicate" + ) # Add any assertions here to check the response - response_str = response['choices'][0]['message']['content'] + response_str = response["choices"][0]["message"]["content"] response_str_2 = response.choices[0].message.content print(response_str) print(response_str_2) @@ -224,6 +302,7 @@ def test_completion_replicate_stability(): except Exception as e: pytest.fail(f"Error occurred: {e}") + ######## Test TogetherAI ######## def test_completion_together_ai(): model_name = "togethercomputer/llama-2-70b-chat" @@ -234,15 +313,22 @@ def test_completion_together_ai(): except Exception as e: pytest.fail(f"Error occurred: {e}") + def test_petals(): model_name = "stabilityai/StableBeluga2" try: - response = completion(model=model_name, messages=messages, custom_llm_provider="petals", force_timeout=120) + response = completion( + model=model_name, + messages=messages, + custom_llm_provider="petals", + force_timeout=120, + ) # Add any assertions here to check the response print(response) except Exception as e: pytest.fail(f"Error occurred: 
{e}") + # def test_baseten_falcon_7bcompletion(): # model_name = "qvv0xeq" # try: @@ -290,7 +376,6 @@ def test_petals(): # pytest.fail(f"Error occurred: {e}") - #### Test A121 ################### # def test_completion_ai21(): # model_name = "j2-light" @@ -301,7 +386,7 @@ def test_petals(): # except Exception as e: # pytest.fail(f"Error occurred: {e}") -# test config file with completion # +# test config file with completion # # def test_completion_openai_config(): # try: # litellm.config_path = "../config.json" @@ -333,4 +418,3 @@ def test_petals(): # return # test_completion_together_ai_stream() - diff --git a/litellm/tests/test_custom_api_base.py b/litellm/tests/test_custom_api_base.py index 966fff954..70a477eab 100644 --- a/litellm/tests/test_custom_api_base.py +++ b/litellm/tests/test_custom_api_base.py @@ -1,20 +1,33 @@ import sys, os import traceback from dotenv import load_dotenv + load_dotenv() import os -sys.path.insert(0, os.path.abspath('../..')) # Adds the parent directory to the system path -import litellm -from litellm import completion + +sys.path.insert( + 0, os.path.abspath("../..") +) # Adds the parent directory to the system path +import litellm +from litellm import completion + def logging_fn(model_call_dict): print(f"model call details: {model_call_dict}") + + models = ["gorilla-7b-hf-v1", "gpt-4"] custom_llm_provider = None messages = [{"role": "user", "content": "Hey, how's it going?"}] -for model in models: # iterate through list +for model in models: # iterate through list custom_api_base = None - if model == "gorilla-7b-hf-v1": + if model == "gorilla-7b-hf-v1": custom_llm_provider = "custom_openai" custom_api_base = "http://zanino.millennium.berkeley.edu:8000/v1" - completion(model=model, messages=messages, custom_llm_provider=custom_llm_provider, custom_api_base=custom_api_base, logger_fn=logging_fn) + completion( + model=model, + messages=messages, + custom_llm_provider=custom_llm_provider, + custom_api_base=custom_api_base, + logger_fn=logging_fn, + ) diff --git a/litellm/tests/test_embedding.py b/litellm/tests/test_embedding.py index a31d2a4fa..a9b3f2b79 100644 --- a/litellm/tests/test_embedding.py +++ b/litellm/tests/test_embedding.py @@ -1,9 +1,10 @@ - import sys, os import traceback import pytest -sys.path.insert(0, os.path.abspath('../..')) # Adds the parent directory to the system path +sys.path.insert( + 0, os.path.abspath("../..") +) # Adds the parent directory to the system path import litellm from litellm import embedding, completion from infisical import InfisicalClient @@ -11,10 +12,13 @@ from infisical import InfisicalClient # # litellm.set_verbose = True # litellm.secret_manager_client = InfisicalClient(token=os.environ["INFISICAL_TOKEN"]) + def test_openai_embedding(): try: - response = embedding(model='text-embedding-ada-002', input=["good morning from litellm"]) + response = embedding( + model="text-embedding-ada-002", input=["good morning from litellm"] + ) # Add any assertions here to check the response print(f"response: {str(response)}") except Exception as e: - pytest.fail(f"Error occurred: {e}") \ No newline at end of file + pytest.fail(f"Error occurred: {e}") diff --git a/litellm/tests/test_exceptions.py b/litellm/tests/test_exceptions.py index 76cff6bdb..6620eb2ae 100644 --- a/litellm/tests/test_exceptions.py +++ b/litellm/tests/test_exceptions.py @@ -1,10 +1,21 @@ # from openai.error import AuthenticationError, InvalidRequestError, RateLimitError, OpenAIError -import os +import os import sys import traceback -sys.path.insert(0, 
os.path.abspath('../..')) # Adds the parent directory to the system path + +sys.path.insert( + 0, os.path.abspath("../..") +) # Adds the parent directory to the system path import litellm -from litellm import embedding, completion, AuthenticationError, InvalidRequestError, RateLimitError, ServiceUnavailableError, OpenAIError +from litellm import ( + embedding, + completion, + AuthenticationError, + InvalidRequestError, + RateLimitError, + ServiceUnavailableError, + OpenAIError, +) from concurrent.futures import ThreadPoolExecutor import pytest @@ -23,8 +34,10 @@ litellm.failure_callback = ["sentry"] # models = ["gpt-3.5-turbo", "chatgpt-test", "claude-instant-1", "command-nightly"] test_model = "claude-instant-1" models = ["claude-instant-1"] + + def logging_fn(model_call_dict): - if "model" in model_call_dict: + if "model" in model_call_dict: print(f"model_call_dict: {model_call_dict['model']}") else: print(f"model_call_dict: {model_call_dict}") @@ -38,7 +51,12 @@ def test_context_window(model): try: model = "chatgpt-test" print(f"model: {model}") - response = completion(model=model, messages=messages, custom_llm_provider="azure", logger_fn=logging_fn) + response = completion( + model=model, + messages=messages, + custom_llm_provider="azure", + logger_fn=logging_fn, + ) print(f"response: {response}") except InvalidRequestError as e: print(f"InvalidRequestError: {e.llm_provider}") @@ -52,14 +70,17 @@ def test_context_window(model): print(f"Uncaught Exception - {e}") pytest.fail(f"Error occurred: {e}") return + + test_context_window(test_model) + # Test 2: InvalidAuth Errors @pytest.mark.parametrize("model", models) -def invalid_auth(model): # set the model key to an invalid key, depending on the model - messages = [{ "content": "Hello, how are you?","role": "user"}] +def invalid_auth(model): # set the model key to an invalid key, depending on the model + messages = [{"content": "Hello, how are you?", "role": "user"}] temporary_key = None - try: + try: custom_llm_provider = None if model == "gpt-3.5-turbo": temporary_key = os.environ["OPENAI_API_KEY"] @@ -74,22 +95,29 @@ def invalid_auth(model): # set the model key to an invalid key, depending on the elif model == "command-nightly": temporary_key = os.environ["COHERE_API_KEY"] os.environ["COHERE_API_KEY"] = "bad-key" - elif model == "replicate/llama-2-70b-chat:2c1608e18606fad2812020dc541930f2d0495ce32eee50074220b87300bc16e1": - temporary_key = os.environ["REPLICATE_API_KEY"] + elif ( + model + == "replicate/llama-2-70b-chat:2c1608e18606fad2812020dc541930f2d0495ce32eee50074220b87300bc16e1" + ): + temporary_key = os.environ["REPLICATE_API_KEY"] os.environ["REPLICATE_API_KEY"] = "bad-key" print(f"model: {model}") - response = completion(model=model, messages=messages, custom_llm_provider=custom_llm_provider) + response = completion( + model=model, messages=messages, custom_llm_provider=custom_llm_provider + ) print(f"response: {response}") except AuthenticationError as e: print(f"AuthenticationError Caught Exception - {e.llm_provider}") - except OpenAIError: # is at least an openai error -> in case of random model errors - e.g. overloaded server + except ( + OpenAIError + ): # is at least an openai error -> in case of random model errors - e.g. 
overloaded server print(f"OpenAIError Caught Exception - {e}") except Exception as e: print(type(e)) print(e.__class__.__name__) print(f"Uncaught Exception - {e}") pytest.fail(f"Error occurred: {e}") - if temporary_key != None: # reset the key + if temporary_key != None: # reset the key if model == "gpt-3.5-turbo": os.environ["OPENAI_API_KEY"] = temporary_key elif model == "chatgpt-test": @@ -99,13 +127,18 @@ def invalid_auth(model): # set the model key to an invalid key, depending on the os.environ["ANTHROPIC_API_KEY"] = temporary_key elif model == "command-nightly": os.environ["COHERE_API_KEY"] = temporary_key - elif model == "replicate/llama-2-70b-chat:2c1608e18606fad2812020dc541930f2d0495ce32eee50074220b87300bc16e1": + elif ( + model + == "replicate/llama-2-70b-chat:2c1608e18606fad2812020dc541930f2d0495ce32eee50074220b87300bc16e1" + ): os.environ["REPLICATE_API_KEY"] = temporary_key return + + invalid_auth(test_model) -# # Test 3: Rate Limit Errors +# # Test 3: Rate Limit Errors # def test_model(model): -# try: +# try: # sample_text = "how does a court case get to the Supreme Court?" * 50000 # messages = [{ "content": sample_text,"role": "user"}] # custom_llm_provider = None @@ -142,5 +175,3 @@ invalid_auth(test_model) # accuracy_score = counts[True]/(counts[True] + counts[False]) # print(f"accuracy_score: {accuracy_score}") - - diff --git a/litellm/tests/test_helicone_integration.py b/litellm/tests/test_helicone_integration.py index 0b1d6ce8a..66e375d17 100644 --- a/litellm/tests/test_helicone_integration.py +++ b/litellm/tests/test_helicone_integration.py @@ -5,7 +5,9 @@ import sys, os import traceback import pytest -sys.path.insert(0, os.path.abspath('../..')) # Adds the parent directory to the system path +sys.path.insert( + 0, os.path.abspath("../..") +) # Adds the parent directory to the system path import litellm from litellm import embedding, completion @@ -14,11 +16,15 @@ litellm.success_callback = ["helicone"] litellm.set_verbose = True user_message = "Hello, how are you?" 
-messages = [{ "content": user_message,"role": "user"}] +messages = [{"content": user_message, "role": "user"}] -#openai call -response = completion(model="gpt-3.5-turbo", messages=[{"role": "user", "content": "Hi 👋 - i'm openai"}]) +# openai call +response = completion( + model="gpt-3.5-turbo", messages=[{"role": "user", "content": "Hi 👋 - i'm openai"}] +) -#cohere call -response = completion(model="command-nightly", messages=[{"role": "user", "content": "Hi 👋 - i'm cohere"}]) \ No newline at end of file +# cohere call +response = completion( + model="command-nightly", messages=[{"role": "user", "content": "Hi 👋 - i'm cohere"}] +) diff --git a/litellm/tests/test_load_test_model.py b/litellm/tests/test_load_test_model.py index 8040dabe7..0820990c2 100644 --- a/litellm/tests/test_load_test_model.py +++ b/litellm/tests/test_load_test_model.py @@ -1,22 +1,37 @@ import sys, os import traceback -sys.path.insert(0, os.path.abspath('../..')) # Adds the parent directory to the system path + +sys.path.insert( + 0, os.path.abspath("../..") +) # Adds the parent directory to the system path import litellm from litellm import load_test_model, testing_batch_completion -# ## Load Test Model +# ## Load Test Model # model="gpt-3.5-turbo" # result = load_test_model(model=model, num_calls=5) # print(result) # print(len(result["results"])) -# ## Duration Test Model +# ## Duration Test Model # model="gpt-3.5-turbo" # result = load_test_model(model=model, num_calls=5, duration=15, interval=15) # duration test the model for 2 minutes, sending 5 calls every 15s # print(result) -## Quality Test across Model -models = ["gpt-3.5-turbo", "gpt-3.5-turbo-16k", "gpt-4", "claude-instant-1", {"model": "replicate/llama-2-70b-chat:58d078176e02c219e11eb4da5a02a7830a283b14cf8f94537af893ccff5ee781", "custom_llm_provider": "replicate"}] -messages = [[{"role": "user", "content": "What is your name?"}], [{"role": "user", "content": "Hey, how's it going?"}]] +## Quality Test across Model +models = [ + "gpt-3.5-turbo", + "gpt-3.5-turbo-16k", + "gpt-4", + "claude-instant-1", + { + "model": "replicate/llama-2-70b-chat:58d078176e02c219e11eb4da5a02a7830a283b14cf8f94537af893ccff5ee781", + "custom_llm_provider": "replicate", + }, +] +messages = [ + [{"role": "user", "content": "What is your name?"}], + [{"role": "user", "content": "Hey, how's it going?"}], +] result = testing_batch_completion(models=models, messages=messages) -print(result) \ No newline at end of file +print(result) diff --git a/litellm/tests/test_logging.py b/litellm/tests/test_logging.py index 3174083ef..37caeffa9 100644 --- a/litellm/tests/test_logging.py +++ b/litellm/tests/test_logging.py @@ -3,7 +3,10 @@ import sys, os import traceback -sys.path.insert(0, os.path.abspath('../..')) # Adds the parent directory to the system path + +sys.path.insert( + 0, os.path.abspath("../..") +) # Adds the parent directory to the system path import litellm from litellm import embedding, completion @@ -11,49 +14,53 @@ litellm.set_verbose = False score = 0 + def logger_fn(model_call_object: dict): print(f"model call details: {model_call_object}") -user_message = "Hello, how are you?" -messages = [{ "content": user_message,"role": "user"}] -# test on openai completion call +user_message = "Hello, how are you?" 
+messages = [{"content": user_message, "role": "user"}] + +# test on openai completion call try: response = completion(model="gpt-3.5-turbo", messages=messages, logger_fn=logger_fn) - score +=1 + score += 1 except: - print(f"error occurred: {traceback.format_exc()}") + print(f"error occurred: {traceback.format_exc()}") pass -# test on non-openai completion call +# test on non-openai completion call try: - response = completion(model="claude-instant-1", messages=messages, logger_fn=logger_fn) + response = completion( + model="claude-instant-1", messages=messages, logger_fn=logger_fn + ) print(f"claude response: {response}") - score +=1 + score += 1 except: - print(f"error occurred: {traceback.format_exc()}") + print(f"error occurred: {traceback.format_exc()}") pass -# # test on openai embedding call -# try: +# # test on openai embedding call +# try: # response = embedding(model='text-embedding-ada-002', input=[user_message], logger_fn=logger_fn) -# score +=1 +# score +=1 # except: # traceback.print_exc() # # test on bad azure openai embedding call -> missing azure flag and this isn't an embedding model -# try: +# try: # response = embedding(model='chatgpt-test', input=[user_message], logger_fn=logger_fn) # except: # score +=1 # expect this to fail # traceback.print_exc() -# # test on good azure openai embedding call -# try: +# # test on good azure openai embedding call +# try: # response = embedding(model='azure-embedding-model', input=[user_message], azure=True, logger_fn=logger_fn) -# score +=1 +# score +=1 # except: # traceback.print_exc() -# print(f"Score: {score}, Overall score: {score/5}") \ No newline at end of file +# print(f"Score: {score}, Overall score: {score/5}") diff --git a/litellm/tests/test_model_fallback.py b/litellm/tests/test_model_fallback.py index 69dc1f68d..82535f77a 100644 --- a/litellm/tests/test_model_fallback.py +++ b/litellm/tests/test_model_fallback.py @@ -3,7 +3,10 @@ import sys, os import traceback -sys.path.insert(0, os.path.abspath('../..')) # Adds the parent directory to the system path + +sys.path.insert( + 0, os.path.abspath("../..") +) # Adds the parent directory to the system path import litellm from litellm import embedding, completion @@ -15,11 +18,11 @@ litellm.set_verbose = True model_fallback_list = ["claude-instant-1", "gpt-3.5-turbo", "chatgpt-test"] user_message = "Hello, how are you?" 
-messages = [{ "content": user_message,"role": "user"}] +messages = [{"content": user_message, "role": "user"}] for model in model_fallback_list: try: response = embedding(model="text-embedding-ada-002", input=[user_message]) response = completion(model=model, messages=messages) except Exception as e: - print(f"error occurred: {traceback.format_exc()}") + print(f"error occurred: {traceback.format_exc()}") diff --git a/litellm/tests/test_model_response_typing/server.py b/litellm/tests/test_model_response_typing/server.py index 0399f0d91..80dbc33af 100644 --- a/litellm/tests/test_model_response_typing/server.py +++ b/litellm/tests/test_model_response_typing/server.py @@ -20,4 +20,4 @@ # if __name__ == '__main__': # from waitress import serve -# serve(app, host='localhost', port=8080, threads=10) \ No newline at end of file +# serve(app, host='localhost', port=8080, threads=10) diff --git a/litellm/tests/test_model_response_typing/test.py b/litellm/tests/test_model_response_typing/test.py index 12d2b259b..95d404809 100644 --- a/litellm/tests/test_model_response_typing/test.py +++ b/litellm/tests/test_model_response_typing/test.py @@ -1,4 +1,4 @@ -# import requests, json +# import requests, json # BASE_URL = 'http://localhost:8080' @@ -11,4 +11,4 @@ # print("Hello route test passed!") # if __name__ == '__main__': -# test_hello_route() \ No newline at end of file +# test_hello_route() diff --git a/litellm/tests/test_no_client.py b/litellm/tests/test_no_client.py index 79c47d0da..05badddb6 100644 --- a/litellm/tests/test_no_client.py +++ b/litellm/tests/test_no_client.py @@ -4,7 +4,10 @@ import sys, os import traceback -sys.path.insert(0, os.path.abspath('../..')) # Adds the parent directory to the system path + +sys.path.insert( + 0, os.path.abspath("../..") +) # Adds the parent directory to the system path import litellm from litellm import embedding, completion @@ -13,11 +16,11 @@ litellm.set_verbose = True model_fallback_list = ["claude-instant-1", "gpt-3.5-turbo", "chatgpt-test"] user_message = "Hello, how are you?" -messages = [{ "content": user_message,"role": "user"}] +messages = [{"content": user_message, "role": "user"}] for model in model_fallback_list: try: response = embedding(model="text-embedding-ada-002", input=[user_message]) response = completion(model=model, messages=messages) except Exception as e: - print(f"error occurred: {traceback.format_exc()}") + print(f"error occurred: {traceback.format_exc()}") diff --git a/litellm/tests/test_ollama.py b/litellm/tests/test_ollama.py index d95414560..8e0732a2c 100644 --- a/litellm/tests/test_ollama.py +++ b/litellm/tests/test_ollama.py @@ -53,7 +53,6 @@ # # # return this generator to the client for streaming requests - # # async def get_response(): # # global generator # # async for elem in generator: diff --git a/litellm/tests/test_ollama_local.py b/litellm/tests/test_ollama_local.py index 22544f4cf..a9431a932 100644 --- a/litellm/tests/test_ollama_local.py +++ b/litellm/tests/test_ollama_local.py @@ -12,7 +12,6 @@ # import asyncio - # user_message = "respond in 20 words. who are you?" 
# messages = [{ "content": user_message,"role": "user"}] @@ -45,8 +44,3 @@ # pytest.fail(f"Error occurred: {e}") # test_completion_ollama_stream() - - - - - diff --git a/litellm/tests/test_secrets.py b/litellm/tests/test_secrets.py index b262044c4..9b9757015 100644 --- a/litellm/tests/test_secrets.py +++ b/litellm/tests/test_secrets.py @@ -4,7 +4,10 @@ import sys, os import traceback -sys.path.insert(0, os.path.abspath('../..')) # Adds the parent directory to the system path + +sys.path.insert( + 0, os.path.abspath("../..") +) # Adds the parent directory to the system path import litellm from litellm import embedding, completion from infisical import InfisicalClient @@ -15,7 +18,7 @@ infisical_token = os.environ["INFISICAL_TOKEN"] litellm.secret_manager_client = InfisicalClient(token=infisical_token) user_message = "Hello, whats the weather in San Francisco??" -messages = [{ "content": user_message,"role": "user"}] +messages = [{"content": user_message, "role": "user"}] def test_completion_openai(): @@ -28,5 +31,5 @@ def test_completion_openai(): pytest.fail(f"Error occurred: {e}") litellm.secret_manager_client = None -test_completion_openai() +test_completion_openai() diff --git a/litellm/tests/test_streaming.py b/litellm/tests/test_streaming.py index 317dea904..ef2063828 100644 --- a/litellm/tests/test_streaming.py +++ b/litellm/tests/test_streaming.py @@ -3,7 +3,10 @@ import sys, os import traceback -sys.path.insert(0, os.path.abspath('../..')) # Adds the parent directory to the system path + +sys.path.insert( + 0, os.path.abspath("../..") +) # Adds the parent directory to the system path import litellm from litellm import completion @@ -11,29 +14,40 @@ litellm.set_verbose = False score = 0 + def logger_fn(model_call_object: dict): print(f"model call details: {model_call_object}") -user_message = "Hello, how are you?" -messages = [{ "content": user_message,"role": "user"}] -# test on anthropic completion call +user_message = "Hello, how are you?" 
+messages = [{"content": user_message, "role": "user"}] + +# test on anthropic completion call try: - response = completion(model="claude-instant-1", messages=messages, stream=True, logger_fn=logger_fn) + response = completion( + model="claude-instant-1", messages=messages, stream=True, logger_fn=logger_fn + ) for chunk in response: - print(chunk['choices'][0]['delta']) - score +=1 + print(chunk["choices"][0]["delta"]) + score += 1 except: - print(f"error occurred: {traceback.format_exc()}") + print(f"error occurred: {traceback.format_exc()}") pass -# test on anthropic completion call +# test on anthropic completion call try: - response = completion(model="meta-llama/Llama-2-7b-chat-hf", messages=messages, custom_llm_provider="huggingface", custom_api_base="https://s7c7gytn18vnu4tw.us-east-1.aws.endpoints.huggingface.cloud", stream=True, logger_fn=logger_fn) + response = completion( + model="meta-llama/Llama-2-7b-chat-hf", + messages=messages, + custom_llm_provider="huggingface", + custom_api_base="https://s7c7gytn18vnu4tw.us-east-1.aws.endpoints.huggingface.cloud", + stream=True, + logger_fn=logger_fn, + ) for chunk in response: - print(chunk['choices'][0]['delta']) - score +=1 + print(chunk["choices"][0]["delta"]) + score += 1 except: - print(f"error occurred: {traceback.format_exc()}") - pass \ No newline at end of file + print(f"error occurred: {traceback.format_exc()}") + pass diff --git a/litellm/tests/test_supabase_integration.py b/litellm/tests/test_supabase_integration.py index ac4e31b58..882d0bbc6 100644 --- a/litellm/tests/test_supabase_integration.py +++ b/litellm/tests/test_supabase_integration.py @@ -21,7 +21,7 @@ # #openai call -# response = completion(model="gpt-3.5-turbo", messages=[{"role": "user", "content": "Hi 👋 - i'm openai"}]) +# response = completion(model="gpt-3.5-turbo", messages=[{"role": "user", "content": "Hi 👋 - i'm openai"}]) # #bad request call -# response = completion(model="chatgpt-test", messages=[{"role": "user", "content": "Hi 👋 - i'm a bad request"}]) \ No newline at end of file +# response = completion(model="chatgpt-test", messages=[{"role": "user", "content": "Hi 👋 - i'm a bad request"}]) diff --git a/litellm/tests/test_timeout.py b/litellm/tests/test_timeout.py index 31f27e12b..b2bc43ed8 100644 --- a/litellm/tests/test_timeout.py +++ b/litellm/tests/test_timeout.py @@ -3,10 +3,14 @@ import sys, os import traceback -sys.path.insert(0, os.path.abspath('../..')) # Adds the parent directory to the system path + +sys.path.insert( + 0, os.path.abspath("../..") +) # Adds the parent directory to the system path import time from litellm import timeout + @timeout(10) def stop_after_10_s(force_timeout=60): print("Stopping after 10 seconds") @@ -14,14 +18,14 @@ def stop_after_10_s(force_timeout=60): return -start_time = time.time() +start_time = time.time() try: - stop_after_10_s(force_timeout=1) + stop_after_10_s(force_timeout=1) except Exception as e: - print(e) - pass + print(e) + pass end_time = time.time() -print(f"total time: {end_time-start_time}") \ No newline at end of file +print(f"total time: {end_time-start_time}") diff --git a/litellm/tests/test_vertex.py b/litellm/tests/test_vertex.py index 468ba8d32..01088ec89 100644 --- a/litellm/tests/test_vertex.py +++ b/litellm/tests/test_vertex.py @@ -49,4 +49,4 @@ # # chat = chat_model.start_chat() # # response = chat.send_message("who are u? 
write a sentence", **parameters) -# # print(f"Response from Model: {response.text}") \ No newline at end of file +# # print(f"Response from Model: {response.text}") diff --git a/litellm/timeout.py b/litellm/timeout.py index 81d99e7de..cca4b06e7 100644 --- a/litellm/timeout.py +++ b/litellm/timeout.py @@ -11,9 +11,7 @@ from threading import Thread from openai.error import Timeout -def timeout( - timeout_duration: float = None, exception_to_raise = Timeout -): +def timeout(timeout_duration: float = None, exception_to_raise=Timeout): """ Wraps a function to raise the specified exception if execution time is greater than the specified timeout. @@ -44,7 +42,9 @@ def timeout( result = future.result(timeout=local_timeout_duration) except futures.TimeoutError: thread.stop_loop() - raise exception_to_raise(f"A timeout error occurred. The function call took longer than {local_timeout_duration} second(s).") + raise exception_to_raise( + f"A timeout error occurred. The function call took longer than {local_timeout_duration} second(s)." + ) thread.stop_loop() return result @@ -59,7 +59,9 @@ def timeout( ) return value except asyncio.TimeoutError: - raise exception_to_raise(f"A timeout error occurred. The function call took longer than {local_timeout_duration} second(s).") + raise exception_to_raise( + f"A timeout error occurred. The function call took longer than {local_timeout_duration} second(s)." + ) if iscoroutinefunction(func): return async_wrapper @@ -80,4 +82,4 @@ class _LoopWrapper(Thread): def stop_loop(self): for task in asyncio.all_tasks(self.loop): task.cancel() - self.loop.call_soon_threadsafe(self.loop.stop) \ No newline at end of file + self.loop.call_soon_threadsafe(self.loop.stop) diff --git a/litellm/utils.py b/litellm/utils.py index b45418933..3190b56d6 100644 --- a/litellm/utils.py +++ b/litellm/utils.py @@ -1,10 +1,11 @@ import sys import dotenv, json, traceback, threading -import subprocess, os -import litellm, openai +import subprocess, os +import litellm, openai import random, uuid, requests import datetime, time import tiktoken + encoding = tiktoken.get_encoding("cl100k_base") import pkg_resources from .integrations.helicone import HeliconeLogger @@ -13,10 +14,17 @@ from .integrations.berrispend import BerriSpendLogger from .integrations.supabase import Supabase from openai.error import OpenAIError as OriginalError from openai.openai_object import OpenAIObject -from .exceptions import AuthenticationError, InvalidRequestError, RateLimitError, ServiceUnavailableError, OpenAIError +from .exceptions import ( + AuthenticationError, + InvalidRequestError, + RateLimitError, + ServiceUnavailableError, + OpenAIError, +) from typing import List, Dict, Union + ####### ENVIRONMENT VARIABLES ################### -dotenv.load_dotenv() # Loading env variables using dotenv +dotenv.load_dotenv() # Loading env variables using dotenv sentry_sdk_instance = None capture_exception = None add_breadcrumb = None @@ -51,12 +59,14 @@ local_cache = {} # 'usage': {'prompt_tokens': 18, 'completion_tokens': 23, 'total_tokens': 41} # } + class Message(OpenAIObject): def __init__(self, content="default", role="assistant", **params): super(Message, self).__init__(**params) self.content = content self.role = role + class Choices(OpenAIObject): def __init__(self, finish_reason="stop", index=0, message=Message(), **params): super(Choices, self).__init__(**params) @@ -64,38 +74,48 @@ class Choices(OpenAIObject): self.index = index self.message = message + class ModelResponse(OpenAIObject): def __init__(self, 
choices=None, created=None, model=None, usage=None, **params): super(ModelResponse, self).__init__(**params) self.choices = choices if choices else [Choices()] self.created = created self.model = model - self.usage = usage if usage else { - "prompt_tokens": None, - "completion_tokens": None, - "total_tokens": None - } + self.usage = ( + usage + if usage + else { + "prompt_tokens": None, + "completion_tokens": None, + "total_tokens": None, + } + ) def to_dict_recursive(self): d = super().to_dict_recursive() - d['choices'] = [choice.to_dict_recursive() for choice in self.choices] + d["choices"] = [choice.to_dict_recursive() for choice in self.choices] return d + + ############################################################ def print_verbose(print_statement): - if litellm.set_verbose: - print(f"LiteLLM: {print_statement}") - if random.random() <= 0.3: - print("Get help - https://discord.com/invite/wuPM9dRgDw") + if litellm.set_verbose: + print(f"LiteLLM: {print_statement}") + if random.random() <= 0.3: + print("Get help - https://discord.com/invite/wuPM9dRgDw") + ####### Package Import Handler ################### import importlib import subprocess + + def install_and_import(package: str): if package in globals().keys(): - print_verbose(f"{package} has already been imported.") - return + print_verbose(f"{package} has already been imported.") + return try: - # Import the module + # Import the module module = importlib.import_module(package) except ImportError: print_verbose(f"{package} is not installed. Installing...") @@ -108,200 +128,262 @@ def install_and_import(package: str): finally: if package not in globals().keys(): globals()[package] = importlib.import_module(package) + + ################################################## -####### LOGGING ################### -#Logging function -> log the exact model details + what's being sent | Non-Blocking -def logging(model=None, input=None, custom_llm_provider=None, azure=False, additional_args={}, logger_fn=None, exception=None): - try: - model_call_details = {} - if model: - model_call_details["model"] = model - if azure: - model_call_details["azure"] = azure - if custom_llm_provider: - model_call_details["custom_llm_provider"] = custom_llm_provider - if exception: - model_call_details["exception"] = exception - if input: - model_call_details["input"] = input - - if len(additional_args): - model_call_details["additional_args"] = additional_args - # log additional call details -> api key, etc. 
- if model: - if azure == True or model in litellm.open_ai_chat_completion_models or model in litellm.open_ai_chat_completion_models or model in litellm.open_ai_embedding_models: - model_call_details["api_type"] = openai.api_type - model_call_details["api_base"] = openai.api_base - model_call_details["api_version"] = openai.api_version - model_call_details["api_key"] = openai.api_key - elif "replicate" in model: - model_call_details["api_key"] = os.environ.get("REPLICATE_API_TOKEN") - elif model in litellm.anthropic_models: - model_call_details["api_key"] = os.environ.get("ANTHROPIC_API_KEY") - elif model in litellm.cohere_models: - model_call_details["api_key"] = os.environ.get("COHERE_API_KEY") - ## User Logging -> if you pass in a custom logging function or want to use sentry breadcrumbs - print_verbose(f"Logging Details: logger_fn - {logger_fn} | callable(logger_fn) - {callable(logger_fn)}") - if logger_fn and callable(logger_fn): - try: - logger_fn(model_call_details) # Expectation: any logger function passed in by the user should accept a dict object - except Exception as e: - print(f"LiteLLM.LoggingError: [Non-Blocking] Exception occurred while logging {traceback.format_exc()}") - except Exception as e: - print(f"LiteLLM.LoggingError: [Non-Blocking] Exception occurred while logging {traceback.format_exc()}") - pass -####### CLIENT ################### +####### LOGGING ################### +# Logging function -> log the exact model details + what's being sent | Non-Blocking +def logging( + model=None, + input=None, + custom_llm_provider=None, + azure=False, + additional_args={}, + logger_fn=None, + exception=None, +): + try: + model_call_details = {} + if model: + model_call_details["model"] = model + if azure: + model_call_details["azure"] = azure + if custom_llm_provider: + model_call_details["custom_llm_provider"] = custom_llm_provider + if exception: + model_call_details["exception"] = exception + if input: + model_call_details["input"] = input + + if len(additional_args): + model_call_details["additional_args"] = additional_args + # log additional call details -> api key, etc. 
+ if model: + if ( + azure == True + or model in litellm.open_ai_chat_completion_models + or model in litellm.open_ai_text_completion_models + or model in litellm.open_ai_embedding_models + ): + model_call_details["api_type"] = openai.api_type + model_call_details["api_base"] = openai.api_base + model_call_details["api_version"] = openai.api_version + model_call_details["api_key"] = openai.api_key + elif "replicate" in model: + model_call_details["api_key"] = os.environ.get("REPLICATE_API_TOKEN") + elif model in litellm.anthropic_models: + model_call_details["api_key"] = os.environ.get("ANTHROPIC_API_KEY") + elif model in litellm.cohere_models: + model_call_details["api_key"] = os.environ.get("COHERE_API_KEY") + ## User Logging -> if you pass in a custom logging function or want to use sentry breadcrumbs + print_verbose( + f"Logging Details: logger_fn - {logger_fn} | callable(logger_fn) - {callable(logger_fn)}" + ) + if logger_fn and callable(logger_fn): + try: + logger_fn( + model_call_details + ) # Expectation: any logger function passed in by the user should accept a dict object + except Exception as e: + print( + f"LiteLLM.LoggingError: [Non-Blocking] Exception occurred while logging {traceback.format_exc()}" + ) + except Exception as e: + print( + f"LiteLLM.LoggingError: [Non-Blocking] Exception occurred while logging {traceback.format_exc()}" + ) + pass + + +####### CLIENT ################### # make it easy to log if completion/embedding runs succeeded or failed + see what happened | Non-Blocking def client(original_function): - def function_setup(*args, **kwargs): #just run once to check if user wants to send their data anywhere - PostHog/Sentry/Slack/etc. + def function_setup( + *args, **kwargs + ): # just run once to check if user wants to send their data anywhere - PostHog/Sentry/Slack/etc.
+ try: + global callback_list, add_breadcrumb, user_logger_fn + if ( + len(litellm.success_callback) > 0 or len(litellm.failure_callback) > 0 + ) and len(callback_list) == 0: + callback_list = list( + set(litellm.success_callback + litellm.failure_callback) + ) + set_callbacks( + callback_list=callback_list, + ) + if add_breadcrumb: + add_breadcrumb( + category="litellm.llm_call", + message=f"Positional Args: {args}, Keyword Args: {kwargs}", + level="info", + ) + if "logger_fn" in kwargs: + user_logger_fn = kwargs["logger_fn"] + except: # DO NOT BLOCK running the function because of this + print_verbose(f"[Non-Blocking] {traceback.format_exc()}") + pass def crash_reporting(*args, **kwargs): - if litellm.telemetry: - try: - model = args[0] if len(args) > 0 else kwargs["model"] - exception = kwargs["exception"] if "exception" in kwargs else None - custom_llm_provider = kwargs["custom_llm_provider"] if "custom_llm_provider" in kwargs else None - safe_crash_reporting(model=model, exception=exception, custom_llm_provider=custom_llm_provider) # log usage-crash details. Do not log any user details. If you want to turn this off, set `litellm.telemetry=False`. - except: - #[Non-Blocking Error] - pass + if litellm.telemetry: + try: + model = args[0] if len(args) > 0 else kwargs["model"] + exception = kwargs["exception"] if "exception" in kwargs else None + custom_llm_provider = ( + kwargs["custom_llm_provider"] + if "custom_llm_provider" in kwargs + else None + ) + safe_crash_reporting( + model=model, + exception=exception, + custom_llm_provider=custom_llm_provider, + ) # log usage-crash details. Do not log any user details. If you want to turn this off, set `litellm.telemetry=False`. + except: + # [Non-Blocking Error] + pass def get_prompt(*args, **kwargs): - # make this safe checks, it should not throw any exceptions - if len(args) > 1: - messages = args[1] - prompt = " ".join(message["content"] for message in messages) - return prompt - if "messages" in kwargs: - messages = kwargs["messages"] - prompt = " ".join(message["content"] for message in messages) - return prompt - return None + # make this safe checks, it should not throw any exceptions + if len(args) > 1: + messages = args[1] + prompt = " ".join(message["content"] for message in messages) + return prompt + if "messages" in kwargs: + messages = kwargs["messages"] + prompt = " ".join(message["content"] for message in messages) + return prompt + return None def check_cache(*args, **kwargs): - try: # never block execution - prompt = get_prompt(*args, **kwargs) - if prompt != None and prompt in local_cache: # check if messages / prompt exists - result = local_cache[prompt] - return result - else: - return None - except: - return None - + try: # never block execution + prompt = get_prompt(*args, **kwargs) + if ( + prompt != None and prompt in local_cache + ): # check if messages / prompt exists + result = local_cache[prompt] + return result + else: + return None + except: + return None + def add_cache(result, *args, **kwargs): - try: # never block execution - prompt = get_prompt(*args, **kwargs) - local_cache[prompt] = result - except: - pass + try: # never block execution + prompt = get_prompt(*args, **kwargs) + local_cache[prompt] = result + except: + pass def wrapper(*args, **kwargs): start_time = None result = None try: - function_setup(*args, **kwargs) - ## MODEL CALL - start_time = datetime.datetime.now() - if litellm.caching and (cached_result := check_cache(*args, **kwargs)) is not None: - result = cached_result - else: - result = 
original_function(*args, **kwargs) - end_time = datetime.datetime.now() - ## Add response to CACHE - if litellm.caching: - add_cache(result, *args, **kwargs) - ## LOG SUCCESS - crash_reporting(*args, **kwargs) - my_thread = threading.Thread(target=handle_success, args=(args, kwargs, result, start_time, end_time)) # don't interrupt execution of main thread - my_thread.start() - return result + function_setup(*args, **kwargs) + ## MODEL CALL + start_time = datetime.datetime.now() + if ( + litellm.caching + and (cached_result := check_cache(*args, **kwargs)) is not None + ): + result = cached_result + else: + result = original_function(*args, **kwargs) + end_time = datetime.datetime.now() + ## Add response to CACHE + if litellm.caching: + add_cache(result, *args, **kwargs) + ## LOG SUCCESS + crash_reporting(*args, **kwargs) + my_thread = threading.Thread( + target=handle_success, args=(args, kwargs, result, start_time, end_time) + ) # don't interrupt execution of main thread + my_thread.start() + return result except Exception as e: - traceback_exception = traceback.format_exc() - crash_reporting(*args, **kwargs, exception=traceback_exception) - end_time = datetime.datetime.now() - my_thread = threading.Thread(target=handle_failure, args=(e, traceback_exception, start_time, end_time, args, kwargs)) # don't interrupt execution of main thread - my_thread.start() - raise e + traceback_exception = traceback.format_exc() + crash_reporting(*args, **kwargs, exception=traceback_exception) + end_time = datetime.datetime.now() + my_thread = threading.Thread( + target=handle_failure, + args=(e, traceback_exception, start_time, end_time, args, kwargs), + ) # don't interrupt execution of main thread + my_thread.start() + raise e + return wrapper + ####### USAGE CALCULATOR ################ + def token_counter(model, text): - # use tiktoken or anthropic's tokenizer depending on the model - num_tokens = 0 - if "claude" in model: - install_and_import('anthropic') - from anthropic import Anthropic, HUMAN_PROMPT, AI_PROMPT - anthropic = Anthropic() - num_tokens = anthropic.count_tokens(text) - else: - num_tokens = len(encoding.encode(text)) - return num_tokens + # use tiktoken or anthropic's tokenizer depending on the model + num_tokens = 0 + if "claude" in model: + install_and_import("anthropic") + from anthropic import Anthropic, HUMAN_PROMPT, AI_PROMPT + + anthropic = Anthropic() + num_tokens = anthropic.count_tokens(text) + else: + num_tokens = len(encoding.encode(text)) + return num_tokens -def cost_per_token(model="gpt-3.5-turbo", prompt_tokens = 0, completion_tokens = 0): - ## given - prompt_tokens_cost_usd_dollar = 0 - completion_tokens_cost_usd_dollar = 0 - model_cost_ref = litellm.model_cost - if model in model_cost_ref: - prompt_tokens_cost_usd_dollar = model_cost_ref[model]["input_cost_per_token"] * prompt_tokens - completion_tokens_cost_usd_dollar = model_cost_ref[model]["output_cost_per_token"] * completion_tokens - return prompt_tokens_cost_usd_dollar, completion_tokens_cost_usd_dollar - else: - # calculate average input cost - input_cost_sum = 0 - output_cost_sum = 0 +def cost_per_token(model="gpt-3.5-turbo", prompt_tokens=0, completion_tokens=0): + ## given + prompt_tokens_cost_usd_dollar = 0 + completion_tokens_cost_usd_dollar = 0 model_cost_ref = litellm.model_cost - for model in model_cost_ref: - input_cost_sum += model_cost_ref[model]["input_cost_per_token"] - output_cost_sum += model_cost_ref[model]["output_cost_per_token"] - avg_input_cost = input_cost_sum / len(model_cost_ref.keys()) - 
avg_output_cost = output_cost_sum / len(model_cost_ref.keys()) - prompt_tokens_cost_usd_dollar = avg_input_cost * prompt_tokens - completion_tokens_cost_usd_dollar = avg_output_cost * completion_tokens - return prompt_tokens_cost_usd_dollar, completion_tokens_cost_usd_dollar - + if model in model_cost_ref: + prompt_tokens_cost_usd_dollar = ( + model_cost_ref[model]["input_cost_per_token"] * prompt_tokens + ) + completion_tokens_cost_usd_dollar = ( + model_cost_ref[model]["output_cost_per_token"] * completion_tokens + ) + return prompt_tokens_cost_usd_dollar, completion_tokens_cost_usd_dollar + else: + # calculate average input cost + input_cost_sum = 0 + output_cost_sum = 0 + model_cost_ref = litellm.model_cost + for model in model_cost_ref: + input_cost_sum += model_cost_ref[model]["input_cost_per_token"] + output_cost_sum += model_cost_ref[model]["output_cost_per_token"] + avg_input_cost = input_cost_sum / len(model_cost_ref.keys()) + avg_output_cost = output_cost_sum / len(model_cost_ref.keys()) + prompt_tokens_cost_usd_dollar = avg_input_cost * prompt_tokens + completion_tokens_cost_usd_dollar = avg_output_cost * completion_tokens + return prompt_tokens_cost_usd_dollar, completion_tokens_cost_usd_dollar + def completion_cost(model="gpt-3.5-turbo", prompt="", completion=""): - prompt_tokens = token_counter(model=model, text=prompt) - completion_tokens = token_counter(model=model, text=completion) - prompt_tokens_cost_usd_dollar, completion_tokens_cost_usd_dollar = cost_per_token(model=model, prompt_tokens = prompt_tokens, completion_tokens = completion_tokens) - return prompt_tokens_cost_usd_dollar + completion_tokens_cost_usd_dollar + prompt_tokens = token_counter(model=model, text=prompt) + completion_tokens = token_counter(model=model, text=completion) + prompt_tokens_cost_usd_dollar, completion_tokens_cost_usd_dollar = cost_per_token( + model=model, prompt_tokens=prompt_tokens, completion_tokens=completion_tokens + ) + return prompt_tokens_cost_usd_dollar + completion_tokens_cost_usd_dollar + ####### HELPER FUNCTIONS ################ def get_litellm_params( return_async=False, - api_key=None, - force_timeout=600, - azure=False, - logger_fn=None, + api_key=None, + force_timeout=600, + azure=False, + logger_fn=None, verbose=False, - hugging_face=False, + hugging_face=False, replicate=False, - together_ai=False, - custom_llm_provider=None, - custom_api_base=None -): + together_ai=False, + custom_llm_provider=None, + custom_api_base=None, +): litellm_params = { "return_async": return_async, "api_key": api_key, @@ -309,463 +391,656 @@ def get_litellm_params( "logger_fn": logger_fn, "verbose": verbose, "custom_llm_provider": custom_llm_provider, - "custom_api_base": custom_api_base + "custom_api_base": custom_api_base, } - + return litellm_params def get_optional_params( # 12 optional params - functions = [], - function_call = "", - temperature = 1, - top_p = 1, - n = 1, - stream = False, - stop = None, - max_tokens = float('inf'), - presence_penalty = 0, - frequency_penalty = 0, - logit_bias = {}, - user = "", - deployment_id = None, - model = None, - custom_llm_provider = "", - top_k = 40, + functions=[], + function_call="", + temperature=1, + top_p=1, + n=1, + stream=False, + stop=None, + max_tokens=float("inf"), + presence_penalty=0, + frequency_penalty=0, + logit_bias={}, + user="", + deployment_id=None, + model=None, + custom_llm_provider="", + top_k=40, ): - optional_params = {} - if model in litellm.anthropic_models: - # handle anthropic params - if stream: - 
optional_params["stream"] = stream - if stop != None: - optional_params["stop_sequences"] = stop - if temperature != 1: + optional_params = {} + if model in litellm.anthropic_models: + # handle anthropic params + if stream: + optional_params["stream"] = stream + if stop != None: + optional_params["stop_sequences"] = stop + if temperature != 1: + optional_params["temperature"] = temperature + if top_p != 1: + optional_params["top_p"] = top_p + return optional_params + elif model in litellm.cohere_models: + # handle cohere params + if stream: + optional_params["stream"] = stream + if temperature != 1: + optional_params["temperature"] = temperature + if max_tokens != float("inf"): + optional_params["max_tokens"] = max_tokens + return optional_params + elif custom_llm_provider == "replicate": + # any replicate models + # TODO: handle translating remaining replicate params + if stream: + optional_params["stream"] = stream + return optional_params + elif custom_llm_provider == "together_ai" or ("togethercomputer" in model): + if stream: + optional_params["stream_tokens"] = stream + if temperature != 1: + optional_params["temperature"] = temperature + if top_p != 1: + optional_params["top_p"] = top_p + if max_tokens != float("inf"): + optional_params["max_tokens"] = max_tokens + if frequency_penalty != 0: + optional_params["frequency_penalty"] = frequency_penalty + elif ( + model == "chat-bison" + ): # chat-bison has diff args from chat-bison@001 ty Google + if temperature != 1: + optional_params["temperature"] = temperature + if top_p != 1: + optional_params["top_p"] = top_p + if max_tokens != float("inf"): + optional_params["max_output_tokens"] = max_tokens + elif model in litellm.vertex_text_models: + # required params for all text vertex calls + # temperature=0.2, top_p=0.1, top_k=20 + # always set temperature, top_p, top_k else, text bison fails optional_params["temperature"] = temperature - if top_p != 1: optional_params["top_p"] = top_p - return optional_params - elif model in litellm.cohere_models: - # handle cohere params - if stream: - optional_params["stream"] = stream - if temperature != 1: - optional_params["temperature"] = temperature - if max_tokens != float('inf'): - optional_params["max_tokens"] = max_tokens - return optional_params - elif custom_llm_provider == "replicate": - # any replicate models - # TODO: handle translating remaining replicate params - if stream: - optional_params["stream"] = stream - return optional_params - elif custom_llm_provider == "together_ai" or ("togethercomputer" in model): - if stream: - optional_params["stream_tokens"] = stream - if temperature != 1: - optional_params["temperature"] = temperature - if top_p != 1: - optional_params["top_p"] = top_p - if max_tokens != float('inf'): - optional_params["max_tokens"] = max_tokens - if frequency_penalty != 0: - optional_params["frequency_penalty"] = frequency_penalty - elif model == "chat-bison": # chat-bison has diff args from chat-bison@001 ty Google - if temperature != 1: - optional_params["temperature"] = temperature - if top_p != 1: - optional_params["top_p"] = top_p - if max_tokens != float('inf'): - optional_params["max_output_tokens"] = max_tokens - elif model in litellm.vertex_text_models: - # required params for all text vertex calls - # temperature=0.2, top_p=0.1, top_k=20 - # always set temperature, top_p, top_k else, text bison fails - optional_params["temperature"] = temperature - optional_params["top_p"] = top_p - optional_params["top_k"] = top_k + optional_params["top_k"] = top_k - else:# 
assume passing in params for openai/azure openai - if functions != []: - optional_params["functions"] = functions - if function_call != "": - optional_params["function_call"] = function_call - if temperature != 1: - optional_params["temperature"] = temperature - if top_p != 1: - optional_params["top_p"] = top_p - if n != 1: - optional_params["n"] = n - if stream: - optional_params["stream"] = stream - if stop != None: - optional_params["stop"] = stop - if max_tokens != float('inf'): - optional_params["max_tokens"] = max_tokens - if presence_penalty != 0: - optional_params["presence_penalty"] = presence_penalty - if frequency_penalty != 0: - optional_params["frequency_penalty"] = frequency_penalty - if logit_bias != {}: - optional_params["logit_bias"] = logit_bias - if user != "": - optional_params["user"] = user - if deployment_id != None: - optional_params["deployment_id"] = deployment_id + else: # assume passing in params for openai/azure openai + if functions != []: + optional_params["functions"] = functions + if function_call != "": + optional_params["function_call"] = function_call + if temperature != 1: + optional_params["temperature"] = temperature + if top_p != 1: + optional_params["top_p"] = top_p + if n != 1: + optional_params["n"] = n + if stream: + optional_params["stream"] = stream + if stop != None: + optional_params["stop"] = stop + if max_tokens != float("inf"): + optional_params["max_tokens"] = max_tokens + if presence_penalty != 0: + optional_params["presence_penalty"] = presence_penalty + if frequency_penalty != 0: + optional_params["frequency_penalty"] = frequency_penalty + if logit_bias != {}: + optional_params["logit_bias"] = logit_bias + if user != "": + optional_params["user"] = user + if deployment_id != None: + optional_params["deployment_id"] = deployment_id + return optional_params return optional_params - return optional_params -def load_test_model(model: str, custom_llm_provider: str = None, custom_api_base: str = None, prompt: str = None, num_calls: int = None, force_timeout: int = None): - test_prompt = "Hey, how's it going" - test_calls = 100 - if prompt: - test_prompt = prompt - if num_calls: - test_calls = num_calls - messages = [[{"role": "user", "content": test_prompt}] for _ in range(test_calls)] - start_time = time.time() - try: - litellm.batch_completion(model=model, messages=messages, custom_llm_provider=custom_llm_provider, custom_api_base = custom_api_base, force_timeout=force_timeout) - end_time = time.time() - response_time = end_time - start_time - return {"total_response_time": response_time, "calls_made": 100, "status": "success", "exception": None} - except Exception as e: - end_time = time.time() - response_time = end_time - start_time - return {"total_response_time": response_time, "calls_made": 100, "status": "failed", "exception": e} + +def load_test_model( + model: str, + custom_llm_provider: str = None, + custom_api_base: str = None, + prompt: str = None, + num_calls: int = None, + force_timeout: int = None, +): + test_prompt = "Hey, how's it going" + test_calls = 100 + if prompt: + test_prompt = prompt + if num_calls: + test_calls = num_calls + messages = [[{"role": "user", "content": test_prompt}] for _ in range(test_calls)] + start_time = time.time() + try: + litellm.batch_completion( + model=model, + messages=messages, + custom_llm_provider=custom_llm_provider, + custom_api_base=custom_api_base, + force_timeout=force_timeout, + ) + end_time = time.time() + response_time = end_time - start_time + return { + "total_response_time": 
response_time, + "calls_made": 100, + "status": "success", + "exception": None, + } + except Exception as e: + end_time = time.time() + response_time = end_time - start_time + return { + "total_response_time": response_time, + "calls_made": 100, + "status": "failed", + "exception": e, + } + def set_callbacks(callback_list): - global sentry_sdk_instance, capture_exception, add_breadcrumb, posthog, slack_app, alerts_channel, heliconeLogger, aispendLogger, berrispendLogger, supabaseClient - try: - for callback in callback_list: - if callback == "sentry" or "SENTRY_API_URL" in os.environ: - try: - import sentry_sdk - except ImportError: - print_verbose("Package 'sentry_sdk' is missing. Installing it...") - subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'sentry_sdk']) - import sentry_sdk - sentry_sdk_instance = sentry_sdk - sentry_trace_rate = os.environ.get("SENTRY_API_TRACE_RATE") if "SENTRY_API_TRACE_RATE" in os.environ else "1.0" - sentry_sdk_instance.init(dsn=os.environ.get("SENTRY_API_URL"), traces_sample_rate=float(sentry_trace_rate)) - capture_exception = sentry_sdk_instance.capture_exception - add_breadcrumb = sentry_sdk_instance.add_breadcrumb - elif callback == "posthog": - try: - from posthog import Posthog - except ImportError: - print_verbose("Package 'posthog' is missing. Installing it...") - subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'posthog']) - from posthog import Posthog - posthog = Posthog( - project_api_key=os.environ.get("POSTHOG_API_KEY"), - host=os.environ.get("POSTHOG_API_URL")) - elif callback == "slack": - try: - from slack_bolt import App - except ImportError: - print_verbose("Package 'slack_bolt' is missing. Installing it...") - subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'slack_bolt']) - from slack_bolt import App - slack_app = App( - token=os.environ.get("SLACK_API_TOKEN"), - signing_secret=os.environ.get("SLACK_API_SECRET") - ) - alerts_channel = os.environ["SLACK_API_CHANNEL"] - print_verbose(f"Initialized Slack App: {slack_app}") - elif callback == "helicone": - heliconeLogger = HeliconeLogger() - elif callback == "aispend": - aispendLogger = AISpendLogger() - elif callback == "berrispend": - berrispendLogger = BerriSpendLogger() - elif callback == "supabase": - supabaseClient = Supabase() - except Exception as e: - raise e + global sentry_sdk_instance, capture_exception, add_breadcrumb, posthog, slack_app, alerts_channel, heliconeLogger, aispendLogger, berrispendLogger, supabaseClient + try: + for callback in callback_list: + if callback == "sentry" or "SENTRY_API_URL" in os.environ: + try: + import sentry_sdk + except ImportError: + print_verbose("Package 'sentry_sdk' is missing. Installing it...") + subprocess.check_call( + [sys.executable, "-m", "pip", "install", "sentry_sdk"] + ) + import sentry_sdk + sentry_sdk_instance = sentry_sdk + sentry_trace_rate = ( + os.environ.get("SENTRY_API_TRACE_RATE") + if "SENTRY_API_TRACE_RATE" in os.environ + else "1.0" + ) + sentry_sdk_instance.init( + dsn=os.environ.get("SENTRY_API_URL"), + traces_sample_rate=float(sentry_trace_rate), + ) + capture_exception = sentry_sdk_instance.capture_exception + add_breadcrumb = sentry_sdk_instance.add_breadcrumb + elif callback == "posthog": + try: + from posthog import Posthog + except ImportError: + print_verbose("Package 'posthog' is missing. 
Installing it...") + subprocess.check_call( + [sys.executable, "-m", "pip", "install", "posthog"] + ) + from posthog import Posthog + posthog = Posthog( + project_api_key=os.environ.get("POSTHOG_API_KEY"), + host=os.environ.get("POSTHOG_API_URL"), + ) + elif callback == "slack": + try: + from slack_bolt import App + except ImportError: + print_verbose("Package 'slack_bolt' is missing. Installing it...") + subprocess.check_call( + [sys.executable, "-m", "pip", "install", "slack_bolt"] + ) + from slack_bolt import App + slack_app = App( + token=os.environ.get("SLACK_API_TOKEN"), + signing_secret=os.environ.get("SLACK_API_SECRET"), + ) + alerts_channel = os.environ["SLACK_API_CHANNEL"] + print_verbose(f"Initialized Slack App: {slack_app}") + elif callback == "helicone": + heliconeLogger = HeliconeLogger() + elif callback == "aispend": + aispendLogger = AISpendLogger() + elif callback == "berrispend": + berrispendLogger = BerriSpendLogger() + elif callback == "supabase": + supabaseClient = Supabase() + except Exception as e: + raise e def handle_failure(exception, traceback_exception, start_time, end_time, args, kwargs): global sentry_sdk_instance, capture_exception, add_breadcrumb, posthog, slack_app, alerts_channel, aispendLogger, berrispendLogger try: - # print_verbose(f"handle_failure args: {args}") - # print_verbose(f"handle_failure kwargs: {kwargs}") - - success_handler = additional_details.pop("success_handler", None) - failure_handler = additional_details.pop("failure_handler", None) - - additional_details["Event_Name"] = additional_details.pop("failed_event_name", "litellm.failed_query") - print_verbose(f"self.failure_callback: {litellm.failure_callback}") + # print_verbose(f"handle_failure args: {args}") + # print_verbose(f"handle_failure kwargs: {kwargs}") + success_handler = additional_details.pop("success_handler", None) + failure_handler = additional_details.pop("failure_handler", None) - # print_verbose(f"additional_details: {additional_details}") - for callback in litellm.failure_callback: - try: - if callback == "slack": - slack_msg = "" - if len(kwargs) > 0: - for key in kwargs: - slack_msg += f"{key}: {kwargs[key]}\n" - if len(args) > 0: - for i, arg in enumerate(args): - slack_msg += f"LiteLLM_Args_{str(i)}: {arg}" - for detail in additional_details: - slack_msg += f"{detail}: {additional_details[detail]}\n" - slack_msg += f"Traceback: {traceback_exception}" - slack_app.client.chat_postMessage(channel=alerts_channel, text=slack_msg) - elif callback == "sentry": - capture_exception(exception) - elif callback == "posthog": - print_verbose(f"inside posthog, additional_details: {len(additional_details.keys())}") - ph_obj = {} - if len(kwargs) > 0: - ph_obj = kwargs - if len(args) > 0: - for i, arg in enumerate(args): - ph_obj["litellm_args_" + str(i)] = arg - for detail in additional_details: - ph_obj[detail] = additional_details[detail] - event_name = additional_details["Event_Name"] - print_verbose(f"ph_obj: {ph_obj}") - print_verbose(f"PostHog Event Name: {event_name}") - if "user_id" in additional_details: - posthog.capture(additional_details["user_id"], event_name, ph_obj) - else: # PostHog calls require a unique id to identify a user - https://posthog.com/docs/libraries/python - unique_id = str(uuid.uuid4()) - posthog.capture(unique_id, event_name) - print_verbose(f"successfully logged to PostHog!") - elif callback == "berrispend": - print_verbose("reaches berrispend for logging!") - model = args[0] if len(args) > 0 else kwargs["model"] - messages = args[1] if len(args) > 
1 else kwargs["messages"] - result = { - "model": model, - "created": time.time(), - "error": traceback_exception, - "usage": { - "prompt_tokens": prompt_token_calculator(model, messages=messages), - "completion_tokens": 0 - } - } - berrispendLogger.log_event(model=model, messages=messages, response_obj=result, start_time=start_time, end_time=end_time, print_verbose=print_verbose) - elif callback == "aispend": - print_verbose("reaches aispend for logging!") - model = args[0] if len(args) > 0 else kwargs["model"] - messages = args[1] if len(args) > 1 else kwargs["messages"] - result = { - "model": model, - "created": time.time(), - "usage": { - "prompt_tokens": prompt_token_calculator(model, messages=messages), - "completion_tokens": 0 - } - } - aispendLogger.log_event(model=model, response_obj=result, start_time=start_time, end_time=end_time, print_verbose=print_verbose) - elif callback == "supabase": - print_verbose("reaches supabase for logging!") - model = args[0] if len(args) > 0 else kwargs["model"] - messages = args[1] if len(args) > 1 else kwargs["messages"] - result = { - "model": model, - "created": time.time(), - "error": traceback_exception, - "usage": { - "prompt_tokens": prompt_token_calculator(model, messages=messages), - "completion_tokens": 0 - } - } - print(f"litellm._thread_context: {litellm._thread_context}") - supabaseClient.log_event(model=model, messages=messages, end_user=litellm._thread_context.user, response_obj=result, start_time=start_time, end_time=end_time, print_verbose=print_verbose) + additional_details["Event_Name"] = additional_details.pop( + "failed_event_name", "litellm.failed_query" + ) + print_verbose(f"self.failure_callback: {litellm.failure_callback}") - except: - print_verbose(f"Error Occurred while logging failure: {traceback.format_exc()}") - pass - - if failure_handler and callable(failure_handler): - call_details = { - "exception": exception, - "additional_details": additional_details - } - failure_handler(call_details) - pass + # print_verbose(f"additional_details: {additional_details}") + for callback in litellm.failure_callback: + try: + if callback == "slack": + slack_msg = "" + if len(kwargs) > 0: + for key in kwargs: + slack_msg += f"{key}: {kwargs[key]}\n" + if len(args) > 0: + for i, arg in enumerate(args): + slack_msg += f"LiteLLM_Args_{str(i)}: {arg}" + for detail in additional_details: + slack_msg += f"{detail}: {additional_details[detail]}\n" + slack_msg += f"Traceback: {traceback_exception}" + slack_app.client.chat_postMessage( + channel=alerts_channel, text=slack_msg + ) + elif callback == "sentry": + capture_exception(exception) + elif callback == "posthog": + print_verbose( + f"inside posthog, additional_details: {len(additional_details.keys())}" + ) + ph_obj = {} + if len(kwargs) > 0: + ph_obj = kwargs + if len(args) > 0: + for i, arg in enumerate(args): + ph_obj["litellm_args_" + str(i)] = arg + for detail in additional_details: + ph_obj[detail] = additional_details[detail] + event_name = additional_details["Event_Name"] + print_verbose(f"ph_obj: {ph_obj}") + print_verbose(f"PostHog Event Name: {event_name}") + if "user_id" in additional_details: + posthog.capture( + additional_details["user_id"], event_name, ph_obj + ) + else: # PostHog calls require a unique id to identify a user - https://posthog.com/docs/libraries/python + unique_id = str(uuid.uuid4()) + posthog.capture(unique_id, event_name) + print_verbose(f"successfully logged to PostHog!") + elif callback == "berrispend": + print_verbose("reaches berrispend for 
logging!") + model = args[0] if len(args) > 0 else kwargs["model"] + messages = args[1] if len(args) > 1 else kwargs["messages"] + result = { + "model": model, + "created": time.time(), + "error": traceback_exception, + "usage": { + "prompt_tokens": prompt_token_calculator( + model, messages=messages + ), + "completion_tokens": 0, + }, + } + berrispendLogger.log_event( + model=model, + messages=messages, + response_obj=result, + start_time=start_time, + end_time=end_time, + print_verbose=print_verbose, + ) + elif callback == "aispend": + print_verbose("reaches aispend for logging!") + model = args[0] if len(args) > 0 else kwargs["model"] + messages = args[1] if len(args) > 1 else kwargs["messages"] + result = { + "model": model, + "created": time.time(), + "usage": { + "prompt_tokens": prompt_token_calculator( + model, messages=messages + ), + "completion_tokens": 0, + }, + } + aispendLogger.log_event( + model=model, + response_obj=result, + start_time=start_time, + end_time=end_time, + print_verbose=print_verbose, + ) + elif callback == "supabase": + print_verbose("reaches supabase for logging!") + model = args[0] if len(args) > 0 else kwargs["model"] + messages = args[1] if len(args) > 1 else kwargs["messages"] + result = { + "model": model, + "created": time.time(), + "error": traceback_exception, + "usage": { + "prompt_tokens": prompt_token_calculator( + model, messages=messages + ), + "completion_tokens": 0, + }, + } + print(f"litellm._thread_context: {litellm._thread_context}") + supabaseClient.log_event( + model=model, + messages=messages, + end_user=litellm._thread_context.user, + response_obj=result, + start_time=start_time, + end_time=end_time, + print_verbose=print_verbose, + ) + + except: + print_verbose( + f"Error Occurred while logging failure: {traceback.format_exc()}" + ) + pass + + if failure_handler and callable(failure_handler): + call_details = { + "exception": exception, + "additional_details": additional_details, + } + failure_handler(call_details) + pass except Exception as e: - ## LOGGING - logging(logger_fn=user_logger_fn, exception=e) - pass - -def handle_success(args, kwargs, result, start_time, end_time): - global heliconeLogger, aispendLogger - try: - success_handler = additional_details.pop("success_handler", None) - failure_handler = additional_details.pop("failure_handler", None) - additional_details["Event_Name"] = additional_details.pop("successful_event_name", "litellm.succes_query") - for callback in litellm.success_callback: - try: - if callback == "posthog": - ph_obj = {} - for detail in additional_details: - ph_obj[detail] = additional_details[detail] - event_name = additional_details["Event_Name"] - if "user_id" in additional_details: - posthog.capture(additional_details["user_id"], event_name, ph_obj) - else: # PostHog calls require a unique id to identify a user - https://posthog.com/docs/libraries/python - unique_id = str(uuid.uuid4()) - posthog.capture(unique_id, event_name, ph_obj) - pass - elif callback == "slack": - slack_msg = "" - for detail in additional_details: - slack_msg += f"{detail}: {additional_details[detail]}\n" - slack_app.client.chat_postMessage(channel=alerts_channel, text=slack_msg) - elif callback == "helicone": - print_verbose("reaches helicone for logging!") - model = args[0] if len(args) > 0 else kwargs["model"] - messages = args[1] if len(args) > 1 else kwargs["messages"] - heliconeLogger.log_success(model=model, messages=messages, response_obj=result, start_time=start_time, end_time=end_time, print_verbose=print_verbose) 
- elif callback == "aispend": - print_verbose("reaches aispend for logging!") - model = args[0] if len(args) > 0 else kwargs["model"] - aispendLogger.log_event(model=model, response_obj=result, start_time=start_time, end_time=end_time, print_verbose=print_verbose) - elif callback == "berrispend": - print_verbose("reaches berrispend for logging!") - model = args[0] if len(args) > 0 else kwargs["model"] - messages = args[1] if len(args) > 1 else kwargs["messages"] - berrispendLogger.log_event(model=model, messages=messages, response_obj=result, start_time=start_time, end_time=end_time, print_verbose=print_verbose) - elif callback == "supabase": - print_verbose("reaches supabase for logging!") - model = args[0] if len(args) > 0 else kwargs["model"] - messages = args[1] if len(args) > 1 else kwargs["messages"] - print(f"litellm._thread_context: {litellm._thread_context}") - supabaseClient.log_event(model=model, messages=messages, end_user=litellm._thread_context.user, response_obj=result, start_time=start_time, end_time=end_time, print_verbose=print_verbose) - except Exception as e: ## LOGGING logging(logger_fn=user_logger_fn, exception=e) - print_verbose(f"[Non-Blocking] Success Callback Error - {traceback.format_exc()}") pass - if success_handler and callable(success_handler): - success_handler(args, kwargs) - pass - except Exception as e: - ## LOGGING - logging(logger_fn=user_logger_fn, exception=e) - print_verbose(f"[Non-Blocking] Success Callback Error - {traceback.format_exc()}") - pass + +def handle_success(args, kwargs, result, start_time, end_time): + global heliconeLogger, aispendLogger + try: + success_handler = additional_details.pop("success_handler", None) + failure_handler = additional_details.pop("failure_handler", None) + additional_details["Event_Name"] = additional_details.pop( + "successful_event_name", "litellm.succes_query" + ) + for callback in litellm.success_callback: + try: + if callback == "posthog": + ph_obj = {} + for detail in additional_details: + ph_obj[detail] = additional_details[detail] + event_name = additional_details["Event_Name"] + if "user_id" in additional_details: + posthog.capture( + additional_details["user_id"], event_name, ph_obj + ) + else: # PostHog calls require a unique id to identify a user - https://posthog.com/docs/libraries/python + unique_id = str(uuid.uuid4()) + posthog.capture(unique_id, event_name, ph_obj) + pass + elif callback == "slack": + slack_msg = "" + for detail in additional_details: + slack_msg += f"{detail}: {additional_details[detail]}\n" + slack_app.client.chat_postMessage( + channel=alerts_channel, text=slack_msg + ) + elif callback == "helicone": + print_verbose("reaches helicone for logging!") + model = args[0] if len(args) > 0 else kwargs["model"] + messages = args[1] if len(args) > 1 else kwargs["messages"] + heliconeLogger.log_success( + model=model, + messages=messages, + response_obj=result, + start_time=start_time, + end_time=end_time, + print_verbose=print_verbose, + ) + elif callback == "aispend": + print_verbose("reaches aispend for logging!") + model = args[0] if len(args) > 0 else kwargs["model"] + aispendLogger.log_event( + model=model, + response_obj=result, + start_time=start_time, + end_time=end_time, + print_verbose=print_verbose, + ) + elif callback == "berrispend": + print_verbose("reaches berrispend for logging!") + model = args[0] if len(args) > 0 else kwargs["model"] + messages = args[1] if len(args) > 1 else kwargs["messages"] + berrispendLogger.log_event( + model=model, + messages=messages, + 
response_obj=result, + start_time=start_time, + end_time=end_time, + print_verbose=print_verbose, + ) + elif callback == "supabase": + print_verbose("reaches supabase for logging!") + model = args[0] if len(args) > 0 else kwargs["model"] + messages = args[1] if len(args) > 1 else kwargs["messages"] + print(f"litellm._thread_context: {litellm._thread_context}") + supabaseClient.log_event( + model=model, + messages=messages, + end_user=litellm._thread_context.user, + response_obj=result, + start_time=start_time, + end_time=end_time, + print_verbose=print_verbose, + ) + except Exception as e: + ## LOGGING + logging(logger_fn=user_logger_fn, exception=e) + print_verbose( + f"[Non-Blocking] Success Callback Error - {traceback.format_exc()}" + ) + pass + + if success_handler and callable(success_handler): + success_handler(args, kwargs) + pass + except Exception as e: + ## LOGGING + logging(logger_fn=user_logger_fn, exception=e) + print_verbose( + f"[Non-Blocking] Success Callback Error - {traceback.format_exc()}" + ) + pass + def prompt_token_calculator(model, messages): - # use tiktoken or anthropic's tokenizer depending on the model - text = " ".join(message["content"] for message in messages) - num_tokens = 0 - if "claude" in model: - install_and_import('anthropic') - from anthropic import Anthropic, HUMAN_PROMPT, AI_PROMPT - anthropic = Anthropic() - num_tokens = anthropic.count_tokens(text) - else: - num_tokens = len(encoding.encode(text)) - return num_tokens + # use tiktoken or anthropic's tokenizer depending on the model + text = " ".join(message["content"] for message in messages) + num_tokens = 0 + if "claude" in model: + install_and_import("anthropic") + from anthropic import Anthropic, HUMAN_PROMPT, AI_PROMPT -# integration helper function + anthropic = Anthropic() + num_tokens = anthropic.count_tokens(text) + else: + num_tokens = len(encoding.encode(text)) + return num_tokens + + +# integration helper function def modify_integration(integration_name, integration_params): - global supabaseClient - if integration_name == "supabase": - if "table_name" in integration_params: - Supabase.supabase_table_name = integration_params["table_name"] + global supabaseClient + if integration_name == "supabase": + if "table_name" in integration_params: + Supabase.supabase_table_name = integration_params["table_name"] + def exception_type(model, original_exception, custom_llm_provider): global user_logger_fn exception_mapping_worked = False try: - if isinstance(original_exception, OriginalError): - # Handle the OpenAIError - exception_mapping_worked = True - if custom_llm_provider == "azure": - original_exception.llm_provider = "azure" - else: - original_exception.llm_provider = "openai" - raise original_exception - elif model: - error_str = str(original_exception) - if isinstance(original_exception, BaseException): - exception_type = type(original_exception).__name__ + if isinstance(original_exception, OriginalError): + # Handle the OpenAIError + exception_mapping_worked = True + if custom_llm_provider == "azure": + original_exception.llm_provider = "azure" + else: + original_exception.llm_provider = "openai" + raise original_exception + elif model: + error_str = str(original_exception) + if isinstance(original_exception, BaseException): + exception_type = type(original_exception).__name__ + else: + exception_type = "" + logging( + model=model, + additional_args={ + "error_str": error_str, + "exception_type": exception_type, + "original_exception": original_exception, + }, + 
logger_fn=user_logger_fn, + ) + if "claude" in model: # one of the anthropics + if hasattr(original_exception, "status_code"): + print_verbose(f"status_code: {original_exception.status_code}") + if original_exception.status_code == 401: + exception_mapping_worked = True + raise AuthenticationError( + message=f"AnthropicException - {original_exception.message}", + llm_provider="anthropic", + ) + elif original_exception.status_code == 400: + exception_mapping_worked = True + raise InvalidRequestError( + message=f"AnthropicException - {original_exception.message}", + model=model, + llm_provider="anthropic", + ) + elif original_exception.status_code == 429: + exception_mapping_worked = True + raise RateLimitError( + message=f"AnthropicException - {original_exception.message}", + llm_provider="anthropic", + ) + elif ( + "Could not resolve authentication method. Expected either api_key or auth_token to be set." + in error_str + ): + exception_mapping_worked = True + raise AuthenticationError( + message=f"AnthropicException - {original_exception.message}", + llm_provider="anthropic", + ) + elif "replicate" in model: + if "Incorrect authentication token" in error_str: + exception_mapping_worked = True + raise AuthenticationError( + message=f"ReplicateException - {error_str}", + llm_provider="replicate", + ) + elif exception_type == "ModelError": + exception_mapping_worked = True + raise InvalidRequestError( + message=f"ReplicateException - {error_str}", + model=model, + llm_provider="replicate", + ) + elif "Request was throttled" in error_str: + exception_mapping_worked = True + raise RateLimitError( + message=f"ReplicateException - {error_str}", + llm_provider="replicate", + ) + elif ( + exception_type == "ReplicateError" + ): ## ReplicateError implies an error on Replicate server side, not user side + raise ServiceUnavailableError( + message=f"ReplicateException - {error_str}", + llm_provider="replicate", + ) + elif model == "command-nightly": # Cohere + if ( + "invalid api token" in error_str + or "No API key provided." 
in error_str + ): + exception_mapping_worked = True + raise AuthenticationError( + message=f"CohereException - {original_exception.message}", + llm_provider="cohere", + ) + elif "too many tokens" in error_str: + exception_mapping_worked = True + raise InvalidRequestError( + message=f"CohereException - {original_exception.message}", + model=model, + llm_provider="cohere", + ) + elif ( + "CohereConnectionError" in exception_type + ): # cohere seems to fire these errors when we load test it (1k+ messages / min) + exception_mapping_worked = True + raise RateLimitError( + message=f"CohereException - {original_exception.message}", + llm_provider="cohere", + ) + elif custom_llm_provider == "huggingface": + if hasattr(original_exception, "status_code"): + if original_exception.status_code == 401: + exception_mapping_worked = True + raise AuthenticationError( + message=f"HuggingfaceException - {original_exception.message}", + llm_provider="huggingface", + ) + elif original_exception.status_code == 400: + exception_mapping_worked = True + raise InvalidRequestError( + message=f"HuggingfaceException - {original_exception.message}", + model=model, + llm_provider="huggingface", + ) + elif original_exception.status_code == 429: + exception_mapping_worked = True + raise RateLimitError( + message=f"HuggingfaceException - {original_exception.message}", + llm_provider="huggingface", + ) + raise original_exception # base case - return the original exception else: - exception_type = "" - logging(model=model, additional_args={"error_str": error_str, "exception_type": exception_type, "original_exception": original_exception}, logger_fn=user_logger_fn) - if "claude" in model: #one of the anthropics - if hasattr(original_exception, "status_code"): - print_verbose(f"status_code: {original_exception.status_code}") - if original_exception.status_code == 401: - exception_mapping_worked = True - raise AuthenticationError(message=f"AnthropicException - {original_exception.message}", llm_provider="anthropic") - elif original_exception.status_code == 400: - exception_mapping_worked = True - raise InvalidRequestError(message=f"AnthropicException - {original_exception.message}", model=model, llm_provider="anthropic") - elif original_exception.status_code == 429: - exception_mapping_worked = True - raise RateLimitError(message=f"AnthropicException - {original_exception.message}", llm_provider="anthropic") - elif "Could not resolve authentication method. Expected either api_key or auth_token to be set." 
in error_str: - exception_mapping_worked = True - raise AuthenticationError(message=f"AnthropicException - {original_exception.message}", llm_provider="anthropic") - elif "replicate" in model: - if "Incorrect authentication token" in error_str: - exception_mapping_worked = True - raise AuthenticationError(message=f"ReplicateException - {error_str}", llm_provider="replicate") - elif exception_type == "ModelError": - exception_mapping_worked = True - raise InvalidRequestError(message=f"ReplicateException - {error_str}", model=model, llm_provider="replicate") - elif "Request was throttled" in error_str: - exception_mapping_worked = True - raise RateLimitError(message=f"ReplicateException - {error_str}", llm_provider="replicate") - elif exception_type == "ReplicateError": ## ReplicateError implies an error on Replicate server side, not user side - raise ServiceUnavailableError(message=f"ReplicateException - {error_str}", llm_provider="replicate") - elif model == "command-nightly": #Cohere - if "invalid api token" in error_str or "No API key provided." in error_str: - exception_mapping_worked = True - raise AuthenticationError(message=f"CohereException - {original_exception.message}", llm_provider="cohere") - elif "too many tokens" in error_str: - exception_mapping_worked = True - raise InvalidRequestError(message=f"CohereException - {original_exception.message}", model=model, llm_provider="cohere") - elif "CohereConnectionError" in exception_type: # cohere seems to fire these errors when we load test it (1k+ messages / min) - exception_mapping_worked = True - raise RateLimitError(message=f"CohereException - {original_exception.message}", llm_provider="cohere") - elif custom_llm_provider == "huggingface": - if hasattr(original_exception, "status_code"): - if original_exception.status_code == 401: - exception_mapping_worked = True - raise AuthenticationError(message=f"HuggingfaceException - {original_exception.message}", llm_provider="huggingface") - elif original_exception.status_code == 400: - exception_mapping_worked = True - raise InvalidRequestError(message=f"HuggingfaceException - {original_exception.message}", model=model, llm_provider="huggingface") - elif original_exception.status_code == 429: - exception_mapping_worked = True - raise RateLimitError(message=f"HuggingfaceException - {original_exception.message}", llm_provider="huggingface") - raise original_exception # base case - return the original exception - else: - raise original_exception + raise original_exception except Exception as e: - ## LOGGING - logging(logger_fn=user_logger_fn, additional_args={"exception_mapping_worked": exception_mapping_worked, "original_exception": original_exception}, exception=e) - if exception_mapping_worked: - raise e - else: # don't let an error with mapping interrupt the user from receiving an error from the llm api calls - raise original_exception + ## LOGGING + logging( + logger_fn=user_logger_fn, + additional_args={ + "exception_mapping_worked": exception_mapping_worked, + "original_exception": original_exception, + }, + exception=e, + ) + if exception_mapping_worked: + raise e + else: # don't let an error with mapping interrupt the user from receiving an error from the llm api calls + raise original_exception + def safe_crash_reporting(model=None, exception=None, custom_llm_provider=None): data = { - "model": model, - "exception": str(exception), - "custom_llm_provider": custom_llm_provider + "model": model, + "exception": str(exception), + "custom_llm_provider": custom_llm_provider, } 
threading.Thread(target=litellm_telemetry, args=(data,)).start() + def litellm_telemetry(data): # Load or generate the UUID - uuid_file = 'litellm_uuid.txt' + uuid_file = "litellm_uuid.txt" try: # Try to open the file and load the UUID - with open(uuid_file, 'r') as file: + with open(uuid_file, "r") as file: uuid_value = file.read() if uuid_value: uuid_value = uuid_value.strip() @@ -775,42 +1050,48 @@ def litellm_telemetry(data): # Generate a new UUID if the file doesn't exist or is empty new_uuid = uuid.uuid4() uuid_value = str(new_uuid) - with open(uuid_file, 'w') as file: + with open(uuid_file, "w") as file: file.write(uuid_value) - except: - # [Non-Blocking Error] - return - - try: - # Prepare the data to send to litellm logging api - payload = { - 'uuid': uuid_value, - 'data': data, - 'version': pkg_resources.get_distribution("litellm").version - } - # Make the POST request to litellm logging api - response = requests.post('https://litellm.berri.ai/logging', headers={"Content-Type": "application/json"}, json=payload) - response.raise_for_status() # Raise an exception for HTTP errors except: # [Non-Blocking Error] return + try: + # Prepare the data to send to litellm logging api + payload = { + "uuid": uuid_value, + "data": data, + "version": pkg_resources.get_distribution("litellm").version, + } + # Make the POST request to litellm logging api + response = requests.post( + "https://litellm.berri.ai/logging", + headers={"Content-Type": "application/json"}, + json=payload, + ) + response.raise_for_status() # Raise an exception for HTTP errors + except: + # [Non-Blocking Error] + return + + ######### Secret Manager ############################ # checks if user has passed in a secret manager client # if passed in then checks the secret there def get_secret(secret_name): - if litellm.secret_manager_client != None: - # TODO: check which secret manager is being used - # currently only supports Infisical - secret = litellm.secret_manager_client.get_secret(secret_name).secret_value - if secret != None: - return secret # if secret found in secret manager return it - else: - raise ValueError(f"Secret '{secret_name}' not found in secret manager") - elif litellm.api_key != None: # if users use litellm default key - return litellm.api_key - else: - return os.environ.get(secret_name) + if litellm.secret_manager_client != None: + # TODO: check which secret manager is being used + # currently only supports Infisical + secret = litellm.secret_manager_client.get_secret(secret_name).secret_value + if secret != None: + return secret # if secret found in secret manager return it + else: + raise ValueError(f"Secret '{secret_name}' not found in secret manager") + elif litellm.api_key != None: # if users use litellm default key + return litellm.api_key + else: + return os.environ.get(secret_name) + ######## Streaming Class ############################ # wraps the completion stream to return the correct format for the model @@ -820,73 +1101,73 @@ class CustomStreamWrapper: self.model = model self.custom_llm_provider = custom_llm_provider if model in litellm.cohere_models: - # cohere does not return an iterator, so we need to wrap it in one - self.completion_stream = iter(completion_stream) + # cohere does not return an iterator, so we need to wrap it in one + self.completion_stream = iter(completion_stream) elif model == "together_ai": self.completion_stream = iter(completion_stream) - else: - self.completion_stream = completion_stream + else: + self.completion_stream = completion_stream def __iter__(self): 
return self def handle_anthropic_chunk(self, chunk): - str_line = chunk.decode('utf-8') # Convert bytes to string - if str_line.startswith('data:'): - data_json = json.loads(str_line[5:]) - return data_json.get("completion", "") - return "" + str_line = chunk.decode("utf-8") # Convert bytes to string + if str_line.startswith("data:"): + data_json = json.loads(str_line[5:]) + return data_json.get("completion", "") + return "" - def handle_together_ai_chunk(self, chunk): - chunk = chunk.decode("utf-8") - text_index = chunk.find('"text":"') # this checks if text: exists - text_start = text_index + len('"text":"') - text_end = chunk.find('"}', text_start) - if text_index != -1 and text_end != -1: - extracted_text = chunk[text_start:text_end] - return extracted_text - else: - return "" - - def handle_huggingface_chunk(self, chunk): - chunk = chunk.decode("utf-8") - if chunk.startswith('data:'): - data_json = json.loads(chunk[5:]) - if "token" in data_json and "text" in data_json["token"]: - return data_json["token"]["text"] - else: - return "" - return "" + def handle_together_ai_chunk(self, chunk): + chunk = chunk.decode("utf-8") + text_index = chunk.find('"text":"') # this checks if text: exists + text_start = text_index + len('"text":"') + text_end = chunk.find('"}', text_start) + if text_index != -1 and text_end != -1: + extracted_text = chunk[text_start:text_end] + return extracted_text + else: + return "" + + def handle_huggingface_chunk(self, chunk): + chunk = chunk.decode("utf-8") + if chunk.startswith("data:"): + data_json = json.loads(chunk[5:]) + if "token" in data_json and "text" in data_json["token"]: + return data_json["token"]["text"] + else: + return "" + return "" def __next__(self): - completion_obj ={ "role": "assistant", "content": ""} + completion_obj = {"role": "assistant", "content": ""} if self.model in litellm.anthropic_models: - chunk = next(self.completion_stream) - completion_obj["content"] = self.handle_anthropic_chunk(chunk) + chunk = next(self.completion_stream) + completion_obj["content"] = self.handle_anthropic_chunk(chunk) elif self.model == "replicate": - chunk = next(self.completion_stream) - completion_obj["content"] = chunk + chunk = next(self.completion_stream) + completion_obj["content"] = chunk elif (self.model == "together_ai") or ("togethercomputer" in self.model): - chunk = next(self.completion_stream) - text_data = self.handle_together_ai_chunk(chunk) - if text_data == "": - return self.__next__() - completion_obj["content"] = text_data + chunk = next(self.completion_stream) + text_data = self.handle_together_ai_chunk(chunk) + if text_data == "": + return self.__next__() + completion_obj["content"] = text_data elif self.model in litellm.cohere_models: - chunk = next(self.completion_stream) - completion_obj["content"] = chunk.text + chunk = next(self.completion_stream) + completion_obj["content"] = chunk.text elif self.custom_llm_provider and self.custom_llm_provider == "huggingface": - chunk = next(self.completion_stream) - completion_obj["content"] = self.handle_huggingface_chunk(chunk) + chunk = next(self.completion_stream) + completion_obj["content"] = self.handle_huggingface_chunk(chunk) # return this for all models return {"choices": [{"delta": completion_obj}]} - ########## Reading Config File ############################ def read_config_args(config_path): try: import os + current_path = os.getcwd() with open(config_path, "r") as config_file: config = json.load(config_file) @@ -900,9 +1181,13 @@ def read_config_args(config_path): ########## 
ollama implementation ############################ import aiohttp -async def get_ollama_response_stream(api_base="http://localhost:11434", model="llama2", prompt="Why is the sky blue?"): + + +async def get_ollama_response_stream( + api_base="http://localhost:11434", model="llama2", prompt="Why is the sky blue?" +): session = aiohttp.ClientSession() - url = f'{api_base}/api/generate' + url = f"{api_base}/api/generate" data = { "model": model, "prompt": prompt, @@ -918,7 +1203,10 @@ async def get_ollama_response_stream(api_base="http://localhost:11434", model="l if chunk.strip() != "": j = json.loads(chunk) if "response" in j: - completion_obj ={ "role": "assistant", "content": ""} + completion_obj = { + "role": "assistant", + "content": "", + } completion_obj["content"] = j["response"] yield {"choices": [{"delta": completion_obj}]} # self.responses.append(j["response"]) @@ -930,16 +1218,16 @@ async def get_ollama_response_stream(api_base="http://localhost:11434", model="l async def stream_to_string(generator): - response = "" - async for chunk in generator: - response += chunk["content"] - return response + response = "" + async for chunk in generator: + response += chunk["content"] + return response ########## Together AI streaming ############################# async def together_ai_completion_streaming(json_data, headers): session = aiohttp.ClientSession() - url = 'https://api.together.xyz/inference' + url = "https://api.together.xyz/inference" # headers = { # 'Authorization': f'Bearer {together_ai_token}', # 'Content-Type': 'application/json' @@ -962,15 +1250,14 @@ async def together_ai_completion_streaming(json_data, headers): if line: try: json_chunk = line.decode("utf-8") - json_string = json_chunk.split('data: ')[1] + json_string = json_chunk.split("data: ")[1] # Convert the JSON string to a dictionary data_dict = json.loads(json_string) - completion_response = data_dict['choices'][0]['text'] - completion_obj ={ "role": "assistant", "content": ""} + completion_response = data_dict["choices"][0]["text"] + completion_obj = {"role": "assistant", "content": ""} completion_obj["content"] = completion_response yield {"choices": [{"delta": completion_obj}]} except: pass finally: await session.close() -
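
For reference, the cost helpers touched in this patch multiply token counts by the per-token prices in `litellm.model_cost`, and fall back to an average across all known models when the model is missing from the table. Below is a minimal standalone sketch of that arithmetic, not part of the patch itself: the price table is a trimmed, illustrative excerpt of the real one, and `cost_per_token_sketch` is a hypothetical stand-in that mirrors the logic of `cost_per_token()`/`completion_cost()` above.

# Minimal sketch of the pricing arithmetic in cost_per_token()/completion_cost().
# The table below is an illustrative excerpt; the full table lives in litellm.model_cost.
model_cost = {
    "gpt-3.5-turbo": {"input_cost_per_token": 0.0000015, "output_cost_per_token": 0.000002},
    "claude-2": {"input_cost_per_token": 0.00001102, "output_cost_per_token": 0.00003268},
}


def cost_per_token_sketch(model, prompt_tokens, completion_tokens):
    if model in model_cost:
        entry = model_cost[model]
    else:
        # Fallback mirroring the else-branch above: average the per-token
        # prices across every model in the table.
        entry = {
            "input_cost_per_token": sum(m["input_cost_per_token"] for m in model_cost.values()) / len(model_cost),
            "output_cost_per_token": sum(m["output_cost_per_token"] for m in model_cost.values()) / len(model_cost),
        }
    return (
        entry["input_cost_per_token"] * prompt_tokens,
        entry["output_cost_per_token"] * completion_tokens,
    )


if __name__ == "__main__":
    prompt_cost, completion_cost = cost_per_token_sketch("gpt-3.5-turbo", 1000, 200)
    print(f"total: ${prompt_cost + completion_cost:.6f}")  # total: $0.001900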