# +-----------------------------------------------+
# |                                               |
# |           Give Feedback / Get Help            |
# | https://github.com/BerriAI/litellm/issues/new |
# |                                               |
# +-----------------------------------------------+
#
#  Thank you users! We ❤️ you! - Krrish & Ishaan

import sys
import dotenv, json, traceback, threading
import subprocess, os
import litellm, openai
import itertools
import random, uuid, requests
import datetime, time
import tiktoken
import uuid
import aiohttp
from tokenizers import Tokenizer
import pkg_resources
encoding = tiktoken.get_encoding("cl100k_base")
import importlib.metadata
from .integrations.traceloop import TraceloopLogger
from .integrations.helicone import HeliconeLogger
from .integrations.aispend import AISpendLogger
from .integrations.berrispend import BerriSpendLogger
from .integrations.supabase import Supabase
from .integrations.llmonitor import LLMonitorLogger
from .integrations.prompt_layer import PromptLayerLogger
from .integrations.custom_logger import CustomLogger
from .integrations.langfuse import LangFuseLogger
from .integrations.litedebugger import LiteDebugger
from openai.error import OpenAIError as OriginalError
from openai.openai_object import OpenAIObject
from .exceptions import (
    AuthenticationError,
    InvalidRequestError,
    RateLimitError,
    ServiceUnavailableError,
    OpenAIError,
    ContextWindowExceededError,
    Timeout,
    APIConnectionError,
    APIError,
    BudgetExceededError
)
from typing import cast, List, Dict, Union, Optional
from .caching import Cache
from .llms.prompt_templates.factory import llama_2_special_tokens

####### ENVIRONMENT VARIABLES ####################
dotenv.load_dotenv()  # Loading env variables using dotenv
sentry_sdk_instance = None
capture_exception = None
add_breadcrumb = None
posthog = None
slack_app = None
alerts_channel = None
heliconeLogger = None
promptLayerLogger = None
customLogger = None
langFuseLogger = None
llmonitorLogger = None
aispendLogger = None
berrispendLogger = None
supabaseClient = None
liteDebuggerClient = None
callback_list: Optional[List[str]] = []
user_logger_fn = None
additional_details: Optional[Dict[str, str]] = {}
local_cache: Optional[Dict[str, str]] = {}
last_fetched_at = None
last_fetched_at_keys = None

######## Model Response #########################
# All LiteLLM model responses are returned in this format; it follows the OpenAI format.
# https://docs.litellm.ai/docs/completion/output
# {
#   'choices': [
#      {
#         'finish_reason': 'stop',
#         'index': 0,
#         'message': {
#            'role': 'assistant',
#            'content': " I'm doing well, thank you for asking. I am Claude, an AI assistant created by Anthropic."
#         }
#      }
#   ],
#   'created': 1691429984.3852863,
#   'model': 'claude-instant-1',
#   'usage': {'prompt_tokens': 18, 'completion_tokens': 23, 'total_tokens': 41}
# }

def _generate_id(): # private helper function
    return 'chatcmpl-' + str(uuid.uuid4())


class Message(OpenAIObject):
    def __init__(self, content="default", role="assistant", logprobs=None, **params):
        super(Message, self).__init__(**params)
        self.content = content
        self.role = role
        self.logprobs = logprobs


class Delta(OpenAIObject):
    def __init__(self, content=None, logprobs=None, role=None, **params):
        super(Delta, self).__init__(**params)
        if content is not None:
            self.content = content
        if role:
            self.role = role


class Choices(OpenAIObject):
    def __init__(self, finish_reason=None, index=0, message=None, **params):
        super(Choices, self).__init__(**params)
        if finish_reason:
            self.finish_reason = finish_reason
        else:
            self.finish_reason = "stop"
        self.index = index
        if message is None:
            self.message = Message(content=None)
        else:
            self.message = message


class StreamingChoices(OpenAIObject):
    def __init__(self, finish_reason=None, index=0, delta: Optional[Delta]=None, **params):
        super(StreamingChoices, self).__init__(**params)
        self.finish_reason = finish_reason
        self.index = index
        if delta:
            self.delta = delta
        else:
            self.delta = Delta()


class ModelResponse(OpenAIObject):
    def __init__(self, id=None, choices=None, created=None, model=None, usage=None, stream=False, response_ms=None, **params):
        if stream:
            self.object = "chat.completion.chunk"
            self.choices = [StreamingChoices()]
        else:
            if model in litellm.open_ai_embedding_models:
                self.object = "embedding"
            else:
                self.object = "chat.completion"
            self.choices = [Choices()]
        if id is None:
            self.id = _generate_id()
        else:
            self.id = id
        if created is None:
            self.created = int(time.time())
        else:
            self.created = created
        if response_ms:
            self._response_ms = response_ms
        else:
            self._response_ms = None
        self.model = model
        self.usage = (
            usage
            if usage
            else {
                "prompt_tokens": None,
                "completion_tokens": None,
                "total_tokens": None,
            }
        )
        super(ModelResponse, self).__init__(**params)

    def to_dict_recursive(self):
        d = super().to_dict_recursive()
        d["choices"] = [choice.to_dict_recursive() for choice in self.choices]
        return d

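# Illustrative sketch of how a ModelResponse built from the classes above can be read;
# not part of the public API, and the values here are made up. OpenAIObject supports both
# attribute-style and dict-style access, so either spelling works.
#
#   response = ModelResponse(model="claude-instant-1")
#   response.choices[0].message.content = "Hello!"
#   response.usage = {"prompt_tokens": 18, "completion_tokens": 23, "total_tokens": 41}
#   print(response["choices"][0]["message"]["content"])  # dict-style access
#   print(response.to_dict_recursive())                  # plain-dict view, including nested choices
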
############################################################
def print_verbose(print_statement):
    if litellm.set_verbose:
        print(f"LiteLLM: {print_statement}")

####### LOGGING ###################
from enum import Enum

class CallTypes(Enum):
    embedding = 'embedding'
    completion = 'completion'

# Logging function -> log the exact model details + what's being sent | Non-Blocking
class Logging:
    global supabaseClient, liteDebuggerClient, promptLayerLogger

    def __init__(self, model, messages, stream, call_type, start_time, litellm_call_id, function_id):
        if call_type not in [item.value for item in CallTypes]:
            allowed_values = ", ".join([item.value for item in CallTypes])
            raise ValueError(f"Invalid call_type {call_type}. Allowed values: {allowed_values}")
        self.model = model
        self.messages = messages
        self.stream = stream
        self.start_time = start_time # log the call start time
        self.call_type = call_type
        self.litellm_call_id = litellm_call_id
        self.function_id = function_id

    def update_environment_variables(self, model, user, optional_params, litellm_params):
        self.optional_params = optional_params
        self.model = model
        self.user = user
        self.litellm_params = litellm_params
        self.logger_fn = litellm_params["logger_fn"]
        print_verbose(f"self.optional_params: {self.optional_params}")
        self.model_call_details = {
            "model": self.model,
            "messages": self.messages,
            "optional_params": self.optional_params,
            "litellm_params": self.litellm_params,
        }

    def pre_call(self, input, api_key, model=None, additional_args={}):
        # Log the exact input to the LLM API
        print_verbose(f"Logging Details Pre-API Call for call id {self.litellm_call_id}")
        litellm.error_logs['PRE_CALL'] = locals()
        try:
            # print_verbose(f"logging pre call for model: {self.model} with call type: {self.call_type}")
            self.model_call_details["input"] = input
            self.model_call_details["api_key"] = api_key
            self.model_call_details["additional_args"] = additional_args

            if (
                model
            ): # if the model name was changed pre-call, overwrite the initial model name with the new one
                self.model_call_details["model"] = model

            # User Logging -> if you pass in a custom logging function
            print_verbose(f"model call details: {self.model_call_details}")
            if self.logger_fn and callable(self.logger_fn):
                try:
                    self.logger_fn(
                        self.model_call_details
                    ) # Expectation: any logger function passed in by the user should accept a dict object
                except Exception as e:
                    print_verbose(
                        f"LiteLLM.LoggingError: [Non-Blocking] Exception occurred while logging {traceback.format_exc()}"
                    )

            if litellm.max_budget and self.stream:
                start_time = self.start_time
                end_time = self.start_time # no time has passed as the call hasn't been made yet
                time_diff = (end_time - start_time).total_seconds()
                float_diff = float(time_diff)
                litellm._current_cost += litellm.completion_cost(model=self.model, prompt="".join(message["content"] for message in self.messages), completion="", total_time=float_diff)

            # Input Integration Logging -> If you want to log the fact that an attempt to call the model was made
            for callback in litellm.input_callback:
                try:
                    if callback == "supabase":
                        print_verbose("reaches supabase for logging!")
                        model = self.model_call_details["model"]
                        messages = self.model_call_details["input"]
                        print(f"supabaseClient: {supabaseClient}")
                        supabaseClient.input_log_event(
                            model=model,
                            messages=messages,
                            end_user=litellm._thread_context.user,
                            litellm_call_id=self.litellm_params["litellm_call_id"],
                            print_verbose=print_verbose,
                        )

                    elif callback == "lite_debugger":
                        print_verbose(f"reaches litedebugger for logging! - model_call_details {self.model_call_details}")
                        model = self.model_call_details["model"]
                        messages = self.model_call_details["input"]
                        print_verbose(f"liteDebuggerClient: {liteDebuggerClient}")
                        liteDebuggerClient.input_log_event(
                            model=model,
                            messages=messages,
                            end_user=litellm._thread_context.user,
                            litellm_call_id=self.litellm_params["litellm_call_id"],
                            litellm_params=self.model_call_details["litellm_params"],
                            optional_params=self.model_call_details["optional_params"],
                            print_verbose=print_verbose,
                            call_type=self.call_type
                        )
                except Exception as e:
                    print_verbose(
                        f"LiteLLM.LoggingError: [Non-Blocking] Exception occurred while input logging with integrations {traceback.format_exc()}"
                    )
                    print_verbose(
                        f"LiteLLM.Logging: is sentry capture exception initialized {capture_exception}"
                    )
                    if capture_exception: # log this error to sentry for debugging
                        capture_exception(e)
        except Exception as e:
            print_verbose(
                f"LiteLLM.LoggingError: [Non-Blocking] Exception occurred while logging {traceback.format_exc()}"
            )
            print_verbose(
                f"LiteLLM.Logging: is sentry capture exception initialized {capture_exception}"
            )
            if capture_exception: # log this error to sentry for debugging
                capture_exception(e)

    def post_call(self, original_response, input=None, api_key=None, additional_args={}):
        # Log the exact result from the LLM API, for streaming - log the type of response received
        litellm.error_logs['POST_CALL'] = locals()
        try:
            self.model_call_details["input"] = input
            self.model_call_details["api_key"] = api_key
            self.model_call_details["original_response"] = original_response
            self.model_call_details["additional_args"] = additional_args

            # User Logging -> if you pass in a custom logging function
            print_verbose(f"model call details: {self.model_call_details}")
            print_verbose(
                f"Logging Details Post-API Call: logger_fn - {self.logger_fn} | callable(logger_fn) - {callable(self.logger_fn)}"
            )
            if self.logger_fn and callable(self.logger_fn):
                try:
                    self.logger_fn(
                        self.model_call_details
                    ) # Expectation: any logger function passed in by the user should accept a dict object
                except Exception as e:
                    print_verbose(
                        f"LiteLLM.LoggingError: [Non-Blocking] Exception occurred while logging {traceback.format_exc()}"
                    )

            # Output Integration Logging -> If you want to log the raw response returned by the LLM API
            for callback in litellm.input_callback:
                try:
                    if callback == "lite_debugger":
                        print_verbose("reaches litedebugger for post-call logging!")
                        print_verbose(f"liteDebuggerClient: {liteDebuggerClient}")
                        liteDebuggerClient.post_call_log_event(
                            original_response=original_response,
                            litellm_call_id=self.litellm_params["litellm_call_id"],
                            print_verbose=print_verbose,
                            call_type = self.call_type,
                            stream = self.stream,
                        )
                except Exception as e:
                    print_verbose(
                        f"LiteLLM.LoggingError: [Non-Blocking] Exception occurred while post-call logging with integrations {traceback.format_exc()}"
                    )
                    print_verbose(
                        f"LiteLLM.Logging: is sentry capture exception initialized {capture_exception}"
                    )
                    if capture_exception: # log this error to sentry for debugging
                        capture_exception(e)
        except:
            print_verbose(
                f"LiteLLM.LoggingError: [Non-Blocking] Exception occurred while logging {traceback.format_exc()}"
            )
            pass

    def success_handler(self, result, start_time=None, end_time=None):
        print_verbose(
            f"Logging Details LiteLLM-Success Call"
        )
        try:
            if start_time is None:
                start_time = self.start_time
            if end_time is None:
                end_time = datetime.datetime.now()
            print_verbose(f"success callbacks: {litellm.success_callback}")

            if litellm.max_budget and self.stream:
                time_diff = (end_time - start_time).total_seconds()
                float_diff = float(time_diff)
                litellm._current_cost += litellm.completion_cost(model=self.model, prompt="", completion=result["content"], total_time=float_diff)

            for callback in litellm.success_callback:
                try:
                    if callback == "lite_debugger":
                        print_verbose("reaches lite_debugger for logging!")
                        print_verbose(f"liteDebuggerClient: {liteDebuggerClient}")
                        print_verbose(f"liteDebuggerClient details function {self.call_type} and stream set to {self.stream}")
                        liteDebuggerClient.log_event(
                            end_user=litellm._thread_context.user,
                            response_obj=result,
                            start_time=start_time,
                            end_time=end_time,
                            litellm_call_id=self.litellm_call_id,
                            print_verbose=print_verbose,
                            call_type = self.call_type,
                            stream = self.stream,
                        )
                    if callback == "api_manager":
                        print_verbose("reaches api manager for updating model cost")
                        litellm.apiManager.update_cost(completion_obj=result, user=self.user)
                    if callback == "cache":
                        # print("entering logger first time")
                        # print(self.litellm_params["stream_response"])
                        if litellm.cache != None and self.model_call_details.get('optional_params', {}).get('stream', False) == True:
                            litellm_call_id = self.litellm_params["litellm_call_id"]
                            if litellm_call_id in self.litellm_params["stream_response"]:
                                # append for the given call_id
                                if self.litellm_params["stream_response"][litellm_call_id]["choices"][0]["message"]["content"] == "default":
                                    self.litellm_params["stream_response"][litellm_call_id]["choices"][0]["message"]["content"] = result["content"] # handle first try
                                else:
                                    self.litellm_params["stream_response"][litellm_call_id]["choices"][0]["message"]["content"] += result["content"]
                            else: # init a streaming response for this call id
                                new_model_response = ModelResponse(choices=[Choices(message=Message(content="default"))])
                                # print("creating new model response")
                                # print(new_model_response)
                                self.litellm_params["stream_response"][litellm_call_id] = new_model_response
                            # print("adding to cache for", litellm_call_id)
                            litellm.cache.add_cache(self.litellm_params["stream_response"][litellm_call_id], **self.model_call_details)
                    if callback == "promptlayer":
                        print_verbose("reaches promptlayer for logging!")
                        promptLayerLogger.log_event(
                            kwargs=self.model_call_details,
                            response_obj=result,
                            start_time=start_time,
                            end_time=end_time,
                            print_verbose=print_verbose,
                        )

                except Exception as e:
                    print_verbose(
                        f"LiteLLM.LoggingError: [Non-Blocking] Exception occurred while success logging with integrations {traceback.format_exc()}"
                    )
                    print_verbose(
                        f"LiteLLM.Logging: is sentry capture exception initialized {capture_exception}"
                    )
                    if capture_exception: # log this error to sentry for debugging
                        capture_exception(e)
        except:
            print_verbose(
                f"LiteLLM.LoggingError: [Non-Blocking] Exception occurred while success logging {traceback.format_exc()}"
            )
            pass

    def failure_handler(self, exception, traceback_exception, start_time=None, end_time=None):
        print_verbose(
            f"Logging Details LiteLLM-Failure Call"
        )
        try:
            if start_time is None:
                start_time = self.start_time
            if end_time is None:
                end_time = datetime.datetime.now()

            for callback in litellm.failure_callback:
                try:
                    if callback == "lite_debugger":
                        print_verbose("reaches lite_debugger for logging!")
                        print_verbose(f"liteDebuggerClient: {liteDebuggerClient}")
                        result = {
                            "model": self.model,
                            "created": time.time(),
                            "error": traceback_exception,
                            "usage": {
                                "prompt_tokens": prompt_token_calculator(
                                    self.model, messages=self.messages
                                ),
                                "completion_tokens": 0,
                            },
                        }
                        liteDebuggerClient.log_event(
                            model=self.model,
                            messages=self.messages,
                            end_user=litellm._thread_context.user,
                            response_obj=result,
                            start_time=start_time,
                            end_time=end_time,
                            litellm_call_id=self.litellm_call_id,
                            print_verbose=print_verbose,
                            call_type = self.call_type,
                            stream = self.stream,
                        )
                except Exception as e:
                    print_verbose(
                        f"LiteLLM.LoggingError: [Non-Blocking] Exception occurred while failure logging with integrations {traceback.format_exc()}"
                    )
                    print_verbose(
                        f"LiteLLM.Logging: is sentry capture exception initialized {capture_exception}"
                    )
                    if capture_exception: # log this error to sentry for debugging
                        capture_exception(e)
        except:
            print_verbose(
                f"LiteLLM.LoggingError: [Non-Blocking] Exception occurred while failure logging {traceback.format_exc()}"
            )
            pass


def exception_logging(
    additional_args={},
    logger_fn=None,
    exception=None,
):
    try:
        model_call_details = {}
        if exception:
            model_call_details["exception"] = exception
        model_call_details["additional_args"] = additional_args
        # User Logging -> if you pass in a custom logging function or want to use sentry breadcrumbs
        print_verbose(
            f"Logging Details: logger_fn - {logger_fn} | callable(logger_fn) - {callable(logger_fn)}"
        )
        if logger_fn and callable(logger_fn):
            try:
                logger_fn(
                    model_call_details
                ) # Expectation: any logger function passed in by the user should accept a dict object
            except Exception as e:
                print(
                    f"LiteLLM.LoggingError: [Non-Blocking] Exception occurred while logging {traceback.format_exc()}"
                )
    except Exception as e:
        print(
            f"LiteLLM.LoggingError: [Non-Blocking] Exception occurred while logging {traceback.format_exc()}"
        )
        pass


####### CLIENT ###################
# make it easy to log if completion/embedding runs succeeded or failed + see what happened | Non-Blocking
def client(original_function):
    global liteDebuggerClient, get_all_keys

    def function_setup(
        start_time, *args, **kwargs
    ): # just run once to check if user wants to send their data anywhere - PostHog/Sentry/Slack/etc.
        try:
            global callback_list, add_breadcrumb, user_logger_fn, Logging
            function_id = kwargs["id"] if "id" in kwargs else None
            if litellm.use_client or ("use_client" in kwargs and kwargs["use_client"] == True):
                print_verbose(f"litedebugger initialized")
                if "lite_debugger" not in litellm.input_callback:
                    litellm.input_callback.append("lite_debugger")
                if "lite_debugger" not in litellm.success_callback:
                    litellm.success_callback.append("lite_debugger")
                if "lite_debugger" not in litellm.failure_callback:
                    litellm.failure_callback.append("lite_debugger")
            if (
                len(litellm.input_callback) > 0
                or len(litellm.success_callback) > 0
                or len(litellm.failure_callback) > 0
            ) and len(callback_list) == 0:
                callback_list = list(
                    set(
                        litellm.input_callback
                        + litellm.success_callback
                        + litellm.failure_callback
                    )
                )
                set_callbacks(
                    callback_list=callback_list,
                    function_id=function_id
                )
            if add_breadcrumb:
                add_breadcrumb(
                    category="litellm.llm_call",
                    message=f"Positional Args: {args}, Keyword Args: {kwargs}",
                    level="info",
                )
            if "logger_fn" in kwargs:
                user_logger_fn = kwargs["logger_fn"]
            # CRASH REPORTING TELEMETRY
            crash_reporting(*args, **kwargs)
            # INIT LOGGER - for user-specified integrations
            model = args[0] if len(args) > 0 else kwargs["model"]
            call_type = original_function.__name__
            if call_type == CallTypes.completion.value:
                messages = args[1] if len(args) > 1 else kwargs["messages"]
            elif call_type == CallTypes.embedding.value:
                messages = args[1] if len(args) > 1 else kwargs["input"]
            stream = True if "stream" in kwargs and kwargs["stream"] == True else False
            logging_obj = Logging(model=model, messages=messages, stream=stream, litellm_call_id=kwargs["litellm_call_id"], function_id=function_id, call_type=call_type, start_time=start_time)
            return logging_obj
        except Exception as e: # DO NOT BLOCK running the function because of this
            print_verbose(f"[Non-Blocking] {traceback.format_exc()}; args - {args}; kwargs - {kwargs}")
            print(e)
            pass

    def crash_reporting(*args, **kwargs):
        if litellm.telemetry:
            try:
                model = args[0] if len(args) > 0 else kwargs["model"]
                exception = kwargs["exception"] if "exception" in kwargs else None
                custom_llm_provider = (
                    kwargs["custom_llm_provider"]
                    if "custom_llm_provider" in kwargs
                    else None
                )
                safe_crash_reporting(
                    model=model,
                    exception=exception,
                    custom_llm_provider=custom_llm_provider,
                ) # log usage-crash details. Do not log any user details. If you want to turn this off, set `litellm.telemetry=False`.
            except:
                # [Non-Blocking Error]
                pass

    def wrapper(*args, **kwargs):
        start_time = datetime.datetime.now()
        result = None
        litellm_call_id = str(uuid.uuid4())
        kwargs["litellm_call_id"] = litellm_call_id
        try:
            model = args[0] if len(args) > 0 else kwargs["model"]
        except:
            raise ValueError("model param not passed in.")

        try:
            logging_obj = function_setup(start_time, *args, **kwargs)
            kwargs["litellm_logging_obj"] = logging_obj

            # [OPTIONAL] CHECK BUDGET
            if litellm.max_budget:
                if litellm._current_cost > litellm.max_budget:
                    raise BudgetExceededError(current_cost=litellm._current_cost, max_budget=litellm.max_budget)

            # [OPTIONAL] CHECK CACHE
            # remove this after deprecating litellm.caching
            if (litellm.caching or litellm.caching_with_models) and litellm.cache is None:
                litellm.cache = Cache()

            if kwargs.get("caching", False): # allow users to control returning cached responses from the completion function
                # checking cache
                if (litellm.cache != None or litellm.caching or litellm.caching_with_models):
                    print_verbose(f"LiteLLM: Checking Cache")
                    cached_result = litellm.cache.get_cache(*args, **kwargs)
                    if cached_result != None:
                        return cached_result

            # MODEL CALL
            result = original_function(*args, **kwargs)
            end_time = datetime.datetime.now()
            if "stream" in kwargs and kwargs["stream"] == True:
                # TODO: Add to cache for streaming
                return result

            # [OPTIONAL] ADD TO CACHE
            if litellm.caching or litellm.caching_with_models or litellm.cache != None: # user init a cache object
                litellm.cache.add_cache(result, *args, **kwargs)

            # [OPTIONAL] Return LiteLLM call_id
            if litellm.use_client == True:
                result['litellm_call_id'] = litellm_call_id

            # LOG SUCCESS - handle streaming success logging in the _next_ object, remove `handle_success` once it's deprecated
            logging_obj.success_handler(result, start_time, end_time)
            # threading.Thread(target=logging_obj.success_handler, args=(result, start_time, end_time)).start()
            my_thread = threading.Thread(
                target=handle_success, args=(args, kwargs, result, start_time, end_time)
            ) # don't interrupt execution of main thread
            my_thread.start()
            # RETURN RESULT
            result._response_ms = (end_time - start_time).total_seconds() * 1000 # return response latency in ms like openai
            return result
        except Exception as e:
            traceback_exception = traceback.format_exc()
            crash_reporting(*args, **kwargs, exception=traceback_exception)
            end_time = datetime.datetime.now()
            # LOG FAILURE - handle streaming failure logging in the _next_ object, remove `handle_failure` once it's deprecated
            threading.Thread(target=logging_obj.failure_handler, args=(e, traceback_exception, start_time, end_time)).start()
            my_thread = threading.Thread(
                target=handle_failure,
                args=(e, traceback_exception, start_time, end_time, args, kwargs),
            ) # don't interrupt execution of main thread
            my_thread.start()
            if hasattr(e, "message"):
                if (
                    liteDebuggerClient and liteDebuggerClient.dashboard_url != None
                ): # make it easy to get to the debugger logs if you've initialized it
                    e.message += f"\n Check the log in your dashboard - {liteDebuggerClient.dashboard_url}"
            raise e

    return wrapper


####### USAGE CALCULATOR ################


# Extract the number of billion parameters from the model name
# only used for together_computer LLMs
def get_model_params_and_category(model_name):
    import re
    params_match = re.search(r'(\d+b)', model_name) # catch sizes like 3b, 7b, 70b, etc.
    category = None
    if params_match != None:
        params_match = params_match.group(1)
        params_match = params_match.replace("b", "")
        params_billion = float(params_match)
        # Determine the category based on the number of parameters
        if params_billion <= 3.0:
            category = "together-ai-up-to-3b"
        elif params_billion <= 7.0:
            category = "together-ai-3.1b-7b"
        elif params_billion <= 20.0:
            category = "together-ai-7.1b-20b"
        elif params_billion <= 40.0:
            category = "together-ai-20.1b-40b"
        elif params_billion <= 70.0:
            category = "together-ai-40.1b-70b"
        return category

    return None

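# Illustrative example (model name is hypothetical): the regex above pulls "13b" out of
# "togethercomputer/llama-2-13b-chat", so 13.0 billion params maps to "together-ai-7.1b-20b".
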
def get_replicate_completion_pricing(completion_response=None, total_time=0.0):
    # see https://replicate.com/pricing
    a100_40gb_price_per_second_public = 0.001150
    # for all litellm currently supported LLMs, almost all requests go to a100_80gb
    a100_80gb_price_per_second_public = 0.001400 # assume all calls sent to A100 80GB for now
    if total_time == 0.0:
        start_time = completion_response['created']
        end_time = completion_response["ended"]
        total_time = end_time - start_time

    return a100_80gb_price_per_second_public * total_time

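# Illustrative example (hypothetical numbers): Replicate pricing here is time-based, not
# token-based, so a call that ran for 4.2 seconds prices at 0.001400 * 4.2 ≈ $0.00588.
#   get_replicate_completion_pricing(total_time=4.2)  # -> 0.00588
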
def token_counter(model="", text=None, messages: Optional[List] = None):
    # Args:
    # text: raw text string passed to model
    # messages: Optional, alternative to passing in text. List of Dicts passed to completion, messages = [{"role": "user", "content": "hello"}]
    # use tiktoken or anthropic's tokenizer depending on the model
    if text == None:
        if messages is not None:
            text = " ".join([message["content"] for message in messages])
        else:
            raise ValueError("text and messages cannot both be None")
    num_tokens = 0

    if model is not None:
        # cohere
        if model in litellm.cohere_models:
            tokenizer = Tokenizer.from_pretrained("Cohere/command-nightly")
            enc = tokenizer.encode(text)
            num_tokens = len(enc.ids)
        # anthropic
        elif model in litellm.anthropic_models:
            # Read the JSON file
            filename = pkg_resources.resource_filename(__name__, 'llms/tokenizers/anthropic_tokenizer.json')
            with open(filename, 'r') as f:
                json_data = json.load(f)
            # Decode the JSON data from utf-8
            json_data_decoded = json.dumps(json_data, ensure_ascii=False)
            # Convert to str
            json_str = str(json_data_decoded)
            # load tokenizer
            tokenizer = Tokenizer.from_str(json_str)
            enc = tokenizer.encode(text)
            num_tokens = len(enc.ids)
        # llama2
        elif "llama-2" in model.lower():
            tokenizer = Tokenizer.from_pretrained("hf-internal-testing/llama-tokenizer")
            enc = tokenizer.encode(text)
            num_tokens = len(enc.ids)
        # default - tiktoken
        else:
            num_tokens = len(encoding.encode(text))
    else:
        num_tokens = len(encoding.encode(text))
    return num_tokens

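# Illustrative usage (token counts are approximate and depend on which tokenizer is picked above):
#   token_counter(model="gpt-3.5-turbo", text="hello world")            # tiktoken (cl100k_base)
#   token_counter(model="claude-instant-1",
#                 messages=[{"role": "user", "content": "hello"}])      # anthropic tokenizer
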
def cost_per_token(model="gpt-3.5-turbo", prompt_tokens=0, completion_tokens=0):
    # given a model and token counts, return (prompt_cost, completion_cost) in USD
    prompt_tokens_cost_usd_dollar = 0
    completion_tokens_cost_usd_dollar = 0
    model_cost_ref = litellm.model_cost
    if model in model_cost_ref:
        prompt_tokens_cost_usd_dollar = (
            model_cost_ref[model]["input_cost_per_token"] * prompt_tokens
        )
        completion_tokens_cost_usd_dollar = (
            model_cost_ref[model]["output_cost_per_token"] * completion_tokens
        )
        return prompt_tokens_cost_usd_dollar, completion_tokens_cost_usd_dollar
    else:
        # model not in the cost map - fall back to the average input/output cost across all mapped models
        input_cost_sum = 0
        output_cost_sum = 0
        model_cost_ref = litellm.model_cost
        for model in model_cost_ref:
            input_cost_sum += model_cost_ref[model]["input_cost_per_token"]
            output_cost_sum += model_cost_ref[model]["output_cost_per_token"]
        avg_input_cost = input_cost_sum / len(model_cost_ref.keys())
        avg_output_cost = output_cost_sum / len(model_cost_ref.keys())
        prompt_tokens_cost_usd_dollar = avg_input_cost * prompt_tokens
        completion_tokens_cost_usd_dollar = avg_output_cost * completion_tokens
        return prompt_tokens_cost_usd_dollar, completion_tokens_cost_usd_dollar

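# Illustrative example (actual prices come from litellm.model_cost and may change):
#   prompt_cost, completion_cost = cost_per_token(
#       model="gpt-3.5-turbo", prompt_tokens=100, completion_tokens=50
#   )
#   total_usd = prompt_cost + completion_cost
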
def completion_cost(
    completion_response=None,
    model="gpt-3.5-turbo",
    prompt="",
    completion="",
    total_time=0.0, # used for replicate
):
    """
    Calculate the cost of a given completion call for GPT-3.5-turbo, llama2, or any other litellm-supported LLM.

    Parameters:
        completion_response (litellm.ModelResponses): [Required] The response received from a LiteLLM completion request.

        [OPTIONAL PARAMS]
        model (str): Optional. The name of the language model used in the completion calls
        prompt (str): Optional. The input prompt passed to the llm
        completion (str): Optional. The output completion text from the llm
        total_time (float): Optional. (Only used for Replicate LLMs) The total time used for the request in seconds

    Returns:
        float: The cost in USD dollars for the completion based on the provided parameters.

    Note:
        - If completion_response is provided, the function extracts token information and the model name from it.
        - If completion_response is not provided, the function calculates token counts based on the model and input text.
        - The cost is calculated based on the model, prompt tokens, and completion tokens.
        - For certain models containing "togethercomputer" in the name, prices are based on the model size.
        - For Replicate models, the cost is calculated based on the total time used for the request.

    Exceptions:
        - If an error occurs during execution, the function returns 0.0 without blocking the user's execution path.
    """
    try:
        # Handle Inputs to completion_cost
        prompt_tokens = 0
        completion_tokens = 0
        if completion_response != None:
            # get input/output tokens from completion_response
            prompt_tokens = completion_response['usage']['prompt_tokens']
            completion_tokens = completion_response['usage']['completion_tokens']
            model = completion_response['model'] # get model from completion_response
        else:
            prompt_tokens = token_counter(model=model, text=prompt)
            completion_tokens = token_counter(model=model, text=completion)

        # Calculate cost based on prompt_tokens, completion_tokens
        if "togethercomputer" in model:
            # together ai prices based on size of llm
            # get_model_params_and_category takes a model name and returns the category of LLM size it is in model_prices_and_context_window.json
            model = get_model_params_and_category(model)
        # replicate llms are priced based on the time the request spends running
        # see https://replicate.com/pricing
        elif (
            model in litellm.replicate_models or
            "replicate" in model
        ):
            return get_replicate_completion_pricing(completion_response, total_time)
        prompt_tokens_cost_usd_dollar, completion_tokens_cost_usd_dollar = cost_per_token(
            model=model, prompt_tokens=prompt_tokens, completion_tokens=completion_tokens
        )
        return prompt_tokens_cost_usd_dollar + completion_tokens_cost_usd_dollar
    except:
        return 0.0 # this should not block a user's execution path

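# Illustrative usage (the result depends on the live pricing map, so treat it as a sketch):
#   response = litellm.completion(model="gpt-3.5-turbo",
#                                 messages=[{"role": "user", "content": "Hey, how's it going?"}])
#   cost_usd = completion_cost(completion_response=response)
#   # or, without a response object, estimate from raw text:
#   cost_usd = completion_cost(model="gpt-3.5-turbo", prompt="Hey!", completion="Hello!")
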
####### HELPER FUNCTIONS ################
def get_litellm_params(
    return_async=False,
    api_key=None,
    force_timeout=600,
    azure=False,
    logger_fn=None,
    verbose=False,
    hugging_face=False,
    replicate=False,
    together_ai=False,
    custom_llm_provider=None,
    api_base=None,
    litellm_call_id=None,
    model_alias_map=None,
    completion_call_id=None,
    metadata=None
):
    litellm_params = {
        "return_async": return_async,
        "api_key": api_key,
        "force_timeout": force_timeout,
        "logger_fn": logger_fn,
        "verbose": verbose,
        "custom_llm_provider": custom_llm_provider,
        "api_base": api_base,
        "litellm_call_id": litellm_call_id,
        "model_alias_map": model_alias_map,
        "completion_call_id": completion_call_id,
        "metadata": metadata,
        "stream_response": {} # litellm_call_id: ModelResponse Dict
    }

    return litellm_params


def get_optional_params(  # use the openai defaults
    # 12 optional params
    functions=[],
    function_call="",
    temperature=1,
    top_p=1,
    n=1,
    stream=False,
    stop=None,
    max_tokens=float("inf"),
    presence_penalty=0,
    frequency_penalty=0,
    logit_bias={},
    num_beams=1,
    remove_input=False, # for nlp_cloud
    user="",
    deployment_id=None,
    model=None,
    custom_llm_provider="",
    top_k=40,
    return_full_text=False,
    task=None
):
    optional_params = {}
    if model in litellm.anthropic_models:
        # handle anthropic params
        if stream:
            optional_params["stream"] = stream
        if stop != None:
            optional_params["stop_sequences"] = stop
        if temperature != 1:
            optional_params["temperature"] = temperature
        if top_p != 1:
            optional_params["top_p"] = top_p
        if max_tokens != float("inf"):
            optional_params["max_tokens_to_sample"] = max_tokens
        return optional_params
    elif model in litellm.cohere_models:
        # handle cohere params
        if stream:
            optional_params["stream"] = stream
        if temperature != 1:
            optional_params["temperature"] = temperature
        if max_tokens != float("inf"):
            optional_params["max_tokens"] = max_tokens
        if logit_bias != {}:
            optional_params["logit_bias"] = logit_bias
        return optional_params
    elif custom_llm_provider == "replicate":
        if stream:
            optional_params["stream"] = stream
            return optional_params
        if max_tokens != float("inf"):
            if "vicuna" in model or "flan" in model:
                optional_params["max_length"] = max_tokens
            else:
                optional_params["max_new_tokens"] = max_tokens
        if temperature != 1:
            optional_params["temperature"] = temperature
        if top_p != 1:
            optional_params["top_p"] = top_p
        if top_k != 40:
            optional_params["top_k"] = top_k
        if stop != None:
            optional_params["stop_sequences"] = stop
    elif custom_llm_provider == "huggingface":
        if temperature != 1:
            optional_params["temperature"] = temperature
        if top_p != 1:
            optional_params["top_p"] = top_p
        if n != 1:
            optional_params["best_of"] = n
            optional_params["do_sample"] = True # need to sample if you want best of for hf inference endpoints
        if stream:
            optional_params["stream"] = stream
        if stop != None:
            optional_params["stop"] = stop
        if max_tokens != float("inf"):
            optional_params["max_new_tokens"] = max_tokens
        if presence_penalty != 0:
            optional_params["repetition_penalty"] = presence_penalty
        optional_params["return_full_text"] = return_full_text
        optional_params["details"] = True
        optional_params["task"] = task
    elif custom_llm_provider == "together_ai":
        if stream:
            optional_params["stream_tokens"] = stream
        if temperature != 1:
            optional_params["temperature"] = temperature
        if top_p != 1:
            optional_params["top_p"] = top_p
        if top_k != 40:
            optional_params["top_k"] = top_k
        if max_tokens != float("inf"):
            optional_params["max_tokens"] = max_tokens
        if frequency_penalty != 0:
            optional_params["frequency_penalty"] = frequency_penalty # TODO: Check if should be repetition penalty
        if stop != None:
            optional_params["stop"] = stop # TG AI expects a list, example ["\n\n\n\n","<|endoftext|>"]
    elif custom_llm_provider == "palm":
        if temperature != 1:
            optional_params["temperature"] = temperature
        if top_p != 1:
            optional_params["top_p"] = top_p
    elif (
        model in litellm.vertex_chat_models or model in litellm.vertex_code_chat_models
    ): # chat-bison has diff args from chat-bison@001, ty Google :)
        if temperature != 1:
            optional_params["temperature"] = temperature
        if top_p != 1:
            optional_params["top_p"] = top_p
        if max_tokens != float("inf"):
            optional_params["max_output_tokens"] = max_tokens
    elif model in litellm.vertex_text_models:
        # required params for all text vertex calls
        # temperature=0.2, top_p=0.1, top_k=20
        # always set temperature, top_p, top_k else, text bison fails
        optional_params["temperature"] = temperature
        optional_params["top_p"] = top_p
        optional_params["top_k"] = top_k
        if max_tokens != float("inf"):
            optional_params["max_output_tokens"] = max_tokens
    elif model in litellm.vertex_code_text_models:
        optional_params["temperature"] = temperature
        if max_tokens != float("inf"):
            optional_params["max_output_tokens"] = max_tokens
    elif custom_llm_provider == "baseten":
        optional_params["temperature"] = temperature
        optional_params["stream"] = stream
        if top_p != 1:
            optional_params["top_p"] = top_p
        optional_params["top_k"] = top_k
        optional_params["num_beams"] = num_beams
        if max_tokens != float("inf"):
            optional_params["max_new_tokens"] = max_tokens
    elif custom_llm_provider == "sagemaker":
        if "llama-2" in model:
            # llama-2 models on sagemaker support the following args
            """
            max_new_tokens: Model generates text until the output length (excluding the input context length) reaches max_new_tokens. If specified, it must be a positive integer.
            temperature: Controls the randomness in the output. Higher temperature results in output sequence with low-probability words and lower temperature results in output sequence with high-probability words. If temperature -> 0, it results in greedy decoding. If specified, it must be a positive float.
            top_p: In each step of text generation, sample from the smallest possible set of words with cumulative probability top_p. If specified, it must be a float between 0 and 1.
            return_full_text: If True, input text will be part of the output generated text. If specified, it must be boolean. The default value for it is False.
            """
            if max_tokens != float("inf"):
                optional_params["max_new_tokens"] = max_tokens
            if temperature != 1:
                optional_params["temperature"] = temperature
            if top_p != 1:
                optional_params["top_p"] = top_p
    elif custom_llm_provider == "bedrock":
        if "ai21" in model or "anthropic" in model:
            # params "maxTokens":200,"temperature":0,"topP":250,"stop_sequences":[],
            # https://us-west-2.console.aws.amazon.com/bedrock/home?region=us-west-2#/providers?model=j2-ultra
            if max_tokens != float("inf"):
                optional_params["maxTokens"] = max_tokens
            if temperature != 1:
                optional_params["temperature"] = temperature
            if stop != None:
                optional_params["stop_sequences"] = stop
            if top_p != 1:
                optional_params["topP"] = top_p

        elif "amazon" in model: # amazon titan llms
            # see https://us-west-2.console.aws.amazon.com/bedrock/home?region=us-west-2#/providers?model=titan-large
            if max_tokens != float("inf"):
                optional_params["maxTokenCount"] = max_tokens
            if temperature != 1:
                optional_params["temperature"] = temperature
            if stop != None:
                optional_params["stopSequences"] = stop
            if top_p != 1:
                optional_params["topP"] = top_p

    elif model in litellm.aleph_alpha_models:
        if max_tokens != float("inf"):
            optional_params["maximum_tokens"] = max_tokens
        if stream:
            optional_params["stream"] = stream
        if temperature != 1:
            optional_params["temperature"] = temperature
        if top_k != 40:
            optional_params["top_k"] = top_k
        if top_p != 1:
            optional_params["top_p"] = top_p
        if presence_penalty != 0:
            optional_params["presence_penalty"] = presence_penalty
        if frequency_penalty != 0:
            optional_params["frequency_penalty"] = frequency_penalty
        if n != 1:
            optional_params["n"] = n
        if stop != None:
            optional_params["stop_sequences"] = stop
    elif model in litellm.nlp_cloud_models or custom_llm_provider == "nlp_cloud":
        if max_tokens != float("inf"):
            optional_params["max_length"] = max_tokens
        if stream:
            optional_params["stream"] = stream
        if temperature != 1:
            optional_params["temperature"] = temperature
        if top_k != 40:
            optional_params["top_k"] = top_k
        if top_p != 1:
            optional_params["top_p"] = top_p
        if presence_penalty != 0:
            optional_params["presence_penalty"] = presence_penalty
        if frequency_penalty != 0:
            optional_params["frequency_penalty"] = frequency_penalty
        if num_beams != 1:
            optional_params["num_beams"] = num_beams
        if n != 1:
            optional_params["num_return_sequences"] = n
        if remove_input == True:
            optional_params["remove_input"] = True
        if stop != None:
            optional_params["stop_sequences"] = stop
    elif model in litellm.petals_models or custom_llm_provider == "petals":
        # max_new_tokens=1,temperature=0.9, top_p=0.6
        if max_tokens != float("inf"):
            optional_params["max_new_tokens"] = max_tokens
        if temperature != 1:
            optional_params["temperature"] = temperature
        if top_p != 1:
            optional_params["top_p"] = top_p
    else: # assume passing in params for openai/azure openai
        if functions != []:
            optional_params["functions"] = functions
        if function_call != "":
            optional_params["function_call"] = function_call
        if temperature != 1:
            optional_params["temperature"] = temperature
        if top_p != 1:
            optional_params["top_p"] = top_p
        if n != 1:
            optional_params["n"] = n
        if stream:
            optional_params["stream"] = stream
        if stop != None:
            optional_params["stop"] = stop
        if max_tokens != float("inf"):
            optional_params["max_tokens"] = max_tokens
        if presence_penalty != 0:
            optional_params["presence_penalty"] = presence_penalty
        if frequency_penalty != 0:
            optional_params["frequency_penalty"] = frequency_penalty
        if logit_bias != {}:
            optional_params["logit_bias"] = logit_bias
        if user != "":
            optional_params["user"] = user
        if deployment_id != None:
            optional_params["deployment_id"] = deployment_id
        return optional_params
    return optional_params

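# Illustrative example of the mapping implemented above: the same OpenAI-style arguments get
# translated to each provider's parameter names, e.g. for Anthropic max_tokens becomes
# max_tokens_to_sample and stop becomes stop_sequences.
#   get_optional_params(model="claude-instant-1", max_tokens=256, stop=["\n"], stream=True)
#   # -> {"stream": True, "stop_sequences": ["\n"], "max_tokens_to_sample": 256}
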
def get_llm_provider(model: str, custom_llm_provider: Optional[str] = None):
    try:
        # check if llm provider provided
        if custom_llm_provider:
            return model, custom_llm_provider

        # check if llm provider part of model name
        if model.split("/", 1)[0] in litellm.provider_list:
            custom_llm_provider = model.split("/", 1)[0]
            model = model.split("/", 1)[1]
            return model, custom_llm_provider

        # check if model in known model provider list
        ## openai - chatcompletion + text completion
        if model in litellm.open_ai_chat_completion_models:
            custom_llm_provider = "openai"
        elif model in litellm.open_ai_text_completion_models:
            custom_llm_provider = "text-completion-openai"
        ## anthropic
        elif model in litellm.anthropic_models:
            custom_llm_provider = "anthropic"
        ## cohere
        elif model in litellm.cohere_models:
            custom_llm_provider = "cohere"
        ## replicate
        elif model in litellm.replicate_models:
            custom_llm_provider = "replicate"
        ## openrouter
        elif model in litellm.openrouter_models:
            custom_llm_provider = "openrouter"
        ## vertex - text + chat models
        elif model in litellm.vertex_chat_models or model in litellm.vertex_text_models:
            custom_llm_provider = "vertex_ai"
        ## huggingface
        elif model in litellm.huggingface_models:
            custom_llm_provider = "huggingface"
        ## ai21
        elif model in litellm.ai21_models:
            custom_llm_provider = "ai21"
        ## together_ai
        elif model in litellm.together_ai_models:
            custom_llm_provider = "together_ai"
        ## aleph_alpha
        elif model in litellm.aleph_alpha_models:
            custom_llm_provider = "aleph_alpha"
        ## baseten
        elif model in litellm.baseten_models:
            custom_llm_provider = "baseten"
        ## nlp_cloud
        elif model in litellm.nlp_cloud_models:
            custom_llm_provider = "nlp_cloud"
        ## petals
        elif model in litellm.petals_models:
            custom_llm_provider = "petals"

        if custom_llm_provider is None or custom_llm_provider == "":
            raise ValueError(f"LLM Provider NOT provided. Pass in the LLM provider you are trying to call. E.g. For 'Huggingface' inference endpoints pass in `completion(model='huggingface/{model}',..)` Learn more: https://docs.litellm.ai/docs/providers")
        return model, custom_llm_provider
    except Exception as e:
        raise e

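# Illustrative example of the two resolution paths above (model names are examples):
#   get_llm_provider("huggingface/bigcode/starcoder")  # provider prefix wins -> ("bigcode/starcoder", "huggingface")
#   get_llm_provider("claude-instant-1")               # known-model lookup -> ("claude-instant-1", "anthropic")
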
def get_api_key(llm_provider: str, dynamic_api_key: Optional[str]):
    api_key = (dynamic_api_key or litellm.api_key)
    # openai
    if llm_provider == "openai" or llm_provider == "text-completion-openai":
        api_key = (
            api_key or
            litellm.openai_key or
            get_secret("OPENAI_API_KEY")
        )
    # anthropic
    elif llm_provider == "anthropic":
        api_key = (
            api_key or
            litellm.anthropic_key or
            get_secret("ANTHROPIC_API_KEY")
        )
    # ai21
    elif llm_provider == "ai21":
        api_key = (
            api_key or
            litellm.ai21_key or
            get_secret("AI21_API_KEY")
        )
    # aleph_alpha
    elif llm_provider == "aleph_alpha":
        api_key = (
            api_key or
            litellm.aleph_alpha_key or
            get_secret("ALEPH_ALPHA_API_KEY")
        )
    # baseten
    elif llm_provider == "baseten":
        api_key = (
            api_key or
            litellm.baseten_key or
            get_secret("BASETEN_API_KEY")
        )
    # cohere
    elif llm_provider == "cohere":
        api_key = (
            api_key or
            litellm.cohere_key or
            get_secret("COHERE_API_KEY")
        )
    # huggingface
    elif llm_provider == "huggingface":
        api_key = (
            api_key or
            litellm.huggingface_key or
            get_secret("HUGGINGFACE_API_KEY")
        )
    # nlp_cloud
    elif llm_provider == "nlp_cloud":
        api_key = (
            api_key or
            litellm.nlp_cloud_key or
            get_secret("NLP_CLOUD_API_KEY")
        )
    # replicate
    elif llm_provider == "replicate":
        api_key = (
            api_key or
            litellm.replicate_key or
            get_secret("REPLICATE_API_KEY")
        )
    # together_ai
    elif llm_provider == "together_ai":
        api_key = (
            api_key or
            litellm.togetherai_api_key or
            get_secret("TOGETHERAI_API_KEY") or
            get_secret("TOGETHER_AI_TOKEN")
        )
    return api_key

def get_max_tokens(model: str):
    try:
        return litellm.model_cost[model]
    except:
        raise Exception("This model isn't mapped yet. Add it here - https://github.com/BerriAI/litellm/blob/main/cookbook/community-resources/max_tokens.json")

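# Illustrative precedence (as coded above): an explicitly passed key wins, then litellm.api_key,
# then the provider-specific attribute, then the environment/secret store, e.g.
#   get_api_key(llm_provider="anthropic", dynamic_api_key=None)
#   # -> litellm.api_key or litellm.anthropic_key or get_secret("ANTHROPIC_API_KEY")
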
def load_test_model(
    model: str,
    custom_llm_provider: str = "",
    api_base: str = "",
    prompt: str = "",
    num_calls: int = 0,
    force_timeout: int = 0,
):
    test_prompt = "Hey, how's it going"
    test_calls = 100
    if prompt:
        test_prompt = prompt
    if num_calls:
        test_calls = num_calls
    messages = [[{"role": "user", "content": test_prompt}] for _ in range(test_calls)]
    start_time = time.time()
    try:
        litellm.batch_completion(
            model=model,
            messages=messages,
            custom_llm_provider=custom_llm_provider,
            api_base=api_base,
            force_timeout=force_timeout,
        )
        end_time = time.time()
        response_time = end_time - start_time
        return {
            "total_response_time": response_time,
            "calls_made": test_calls,
            "status": "success",
            "exception": None,
        }
    except Exception as e:
        end_time = time.time()
        response_time = end_time - start_time
        return {
            "total_response_time": response_time,
            "calls_made": test_calls,
            "status": "failed",
            "exception": e,
        }

def validate_environment(model: str) -> dict:
    keys_in_environment = False
    missing_keys = []
    ## EXTRACT LLM PROVIDER - if model name provided
    custom_llm_provider = None
    # check if llm provider part of model name
    if model.split("/", 1)[0] in litellm.provider_list:
        custom_llm_provider = model.split("/", 1)[0]
        model = model.split("/", 1)[1]
        custom_llm_provider_passed_in = True

    if custom_llm_provider:
        if custom_llm_provider == "openai":
            if "OPENAI_API_KEY" in os.environ:
                keys_in_environment = True
            else:
                missing_keys.append("OPENAI_API_KEY")
        elif custom_llm_provider == "azure":
            if ("AZURE_API_BASE" in os.environ
                and "AZURE_API_VERSION" in os.environ
                and "AZURE_API_KEY" in os.environ):
                keys_in_environment = True
            else:
                missing_keys.extend(["AZURE_API_BASE", "AZURE_API_VERSION", "AZURE_API_KEY"])
        elif custom_llm_provider == "anthropic":
            if "ANTHROPIC_API_KEY" in os.environ:
                keys_in_environment = True
            else:
                missing_keys.append("ANTHROPIC_API_KEY")
        elif custom_llm_provider == "cohere":
            if "COHERE_API_KEY" in os.environ:
                keys_in_environment = True
            else:
                missing_keys.append("COHERE_API_KEY")
        elif custom_llm_provider == "replicate":
            if "REPLICATE_API_KEY" in os.environ:
                keys_in_environment = True
            else:
                missing_keys.append("REPLICATE_API_KEY")
        elif custom_llm_provider == "openrouter":
            if "OPENROUTER_API_KEY" in os.environ:
                keys_in_environment = True
            else:
                missing_keys.append("OPENROUTER_API_KEY")
        elif custom_llm_provider == "vertex_ai":
            if ("VERTEXAI_PROJECT" in os.environ
                and "VERTEXAI_LOCATION" in os.environ):
                keys_in_environment = True
            else:
                missing_keys.extend(["VERTEXAI_PROJECT", "VERTEXAI_LOCATION"])
        elif custom_llm_provider == "huggingface":
            if "HUGGINGFACE_API_KEY" in os.environ:
                keys_in_environment = True
            else:
                missing_keys.append("HUGGINGFACE_API_KEY")
        elif custom_llm_provider == "ai21":
            if "AI21_API_KEY" in os.environ:
                keys_in_environment = True
            else:
                missing_keys.append("AI21_API_KEY")
        elif custom_llm_provider == "together_ai":
            if "TOGETHERAI_API_KEY" in os.environ:
                keys_in_environment = True
            else:
                missing_keys.append("TOGETHERAI_API_KEY")
        elif custom_llm_provider == "aleph_alpha":
            if "ALEPH_ALPHA_API_KEY" in os.environ:
                keys_in_environment = True
            else:
                missing_keys.append("ALEPH_ALPHA_API_KEY")
        elif custom_llm_provider == "baseten":
            if "BASETEN_API_KEY" in os.environ:
                keys_in_environment = True
            else:
                missing_keys.append("BASETEN_API_KEY")
        elif custom_llm_provider == "nlp_cloud":
            if "NLP_CLOUD_API_KEY" in os.environ:
                keys_in_environment = True
            else:
                missing_keys.append("NLP_CLOUD_API_KEY")
    else:
        ## openai - chatcompletion + text completion
        if model in litellm.open_ai_chat_completion_models or model in litellm.open_ai_text_completion_models:
            if "OPENAI_API_KEY" in os.environ:
                keys_in_environment = True
            else:
                missing_keys.append("OPENAI_API_KEY")
        ## anthropic
        elif model in litellm.anthropic_models:
            if "ANTHROPIC_API_KEY" in os.environ:
                keys_in_environment = True
            else:
                missing_keys.append("ANTHROPIC_API_KEY")
        ## cohere
        elif model in litellm.cohere_models:
            if "COHERE_API_KEY" in os.environ:
                keys_in_environment = True
            else:
                missing_keys.append("COHERE_API_KEY")
        ## replicate
        elif model in litellm.replicate_models:
            if "REPLICATE_API_KEY" in os.environ:
                keys_in_environment = True
            else:
                missing_keys.append("REPLICATE_API_KEY")
        ## openrouter
        elif model in litellm.openrouter_models:
            if "OPENROUTER_API_KEY" in os.environ:
                keys_in_environment = True
            else:
                missing_keys.append("OPENROUTER_API_KEY")
        ## vertex - text + chat models
        elif model in litellm.vertex_chat_models or model in litellm.vertex_text_models:
            if ("VERTEXAI_PROJECT" in os.environ
                and "VERTEXAI_LOCATION" in os.environ):
                keys_in_environment = True
            else:
                missing_keys.extend(["VERTEXAI_PROJECT", "VERTEXAI_LOCATION"])
        ## huggingface
        elif model in litellm.huggingface_models:
            if "HUGGINGFACE_API_KEY" in os.environ:
                keys_in_environment = True
            else:
                missing_keys.append("HUGGINGFACE_API_KEY")
        ## ai21
        elif model in litellm.ai21_models:
            if "AI21_API_KEY" in os.environ:
                keys_in_environment = True
            else:
                missing_keys.append("AI21_API_KEY")
        ## together_ai
        elif model in litellm.together_ai_models:
            if "TOGETHERAI_API_KEY" in os.environ:
                keys_in_environment = True
            else:
                missing_keys.append("TOGETHERAI_API_KEY")
        ## aleph_alpha
        elif model in litellm.aleph_alpha_models:
            if "ALEPH_ALPHA_API_KEY" in os.environ:
                keys_in_environment = True
            else:
                missing_keys.append("ALEPH_ALPHA_API_KEY")
        ## baseten
        elif model in litellm.baseten_models:
            if "BASETEN_API_KEY" in os.environ:
                keys_in_environment = True
            else:
                missing_keys.append("BASETEN_API_KEY")
        ## nlp_cloud
        elif model in litellm.nlp_cloud_models:
            if "NLP_CLOUD_API_KEY" in os.environ:
                keys_in_environment = True
            else:
                missing_keys.append("NLP_CLOUD_API_KEY")
    return {"keys_in_environment": keys_in_environment, "missing_keys": missing_keys}

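# Illustrative result shapes (exact values depend on your environment and model lists):
#   validate_environment("command-nightly")
#   # -> {"keys_in_environment": False, "missing_keys": ["COHERE_API_KEY"]}   # if the key isn't set
#   validate_environment("huggingface/bigcode/starcoder")
#   # -> {"keys_in_environment": True, "missing_keys": []}                    # if HUGGINGFACE_API_KEY is set
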
def set_callbacks(callback_list, function_id=None):
|
|
global sentry_sdk_instance, capture_exception, add_breadcrumb, posthog, slack_app, alerts_channel, traceloopLogger, heliconeLogger, aispendLogger, berrispendLogger, supabaseClient, liteDebuggerClient, llmonitorLogger, promptLayerLogger, langFuseLogger, customLogger
|
|
try:
|
|
for callback in callback_list:
|
|
print_verbose(f"callback: {callback}")
|
|
if callback == "sentry":
|
|
try:
|
|
import sentry_sdk
|
|
except ImportError:
|
|
print_verbose("Package 'sentry_sdk' is missing. Installing it...")
|
|
subprocess.check_call(
|
|
[sys.executable, "-m", "pip", "install", "sentry_sdk"]
|
|
)
|
|
import sentry_sdk
|
|
sentry_sdk_instance = sentry_sdk
|
|
sentry_trace_rate = (
|
|
os.environ.get("SENTRY_API_TRACE_RATE")
|
|
if "SENTRY_API_TRACE_RATE" in os.environ
|
|
else "1.0"
|
|
)
|
|
sentry_sdk_instance.init(
|
|
dsn=os.environ.get("SENTRY_API_URL"),
|
|
traces_sample_rate=float(sentry_trace_rate),
|
|
)
|
|
capture_exception = sentry_sdk_instance.capture_exception
|
|
add_breadcrumb = sentry_sdk_instance.add_breadcrumb
|
|
elif callback == "posthog":
|
|
try:
|
|
from posthog import Posthog
|
|
except ImportError:
|
|
print_verbose("Package 'posthog' is missing. Installing it...")
|
|
subprocess.check_call(
|
|
[sys.executable, "-m", "pip", "install", "posthog"]
|
|
)
|
|
from posthog import Posthog
|
|
posthog = Posthog(
|
|
project_api_key=os.environ.get("POSTHOG_API_KEY"),
|
|
host=os.environ.get("POSTHOG_API_URL"),
|
|
)
|
|
elif callback == "slack":
|
|
try:
|
|
from slack_bolt import App
|
|
except ImportError:
|
|
print_verbose("Package 'slack_bolt' is missing. Installing it...")
|
|
subprocess.check_call(
|
|
[sys.executable, "-m", "pip", "install", "slack_bolt"]
|
|
)
|
|
from slack_bolt import App
|
|
slack_app = App(
|
|
token=os.environ.get("SLACK_API_TOKEN"),
|
|
signing_secret=os.environ.get("SLACK_API_SECRET"),
|
|
)
|
|
alerts_channel = os.environ["SLACK_API_CHANNEL"]
|
|
print_verbose(f"Initialized Slack App: {slack_app}")
|
|
elif callback == "traceloop":
|
|
traceloopLogger = TraceloopLogger()
|
|
elif callback == "helicone":
|
|
heliconeLogger = HeliconeLogger()
|
|
elif callback == "llmonitor":
|
|
llmonitorLogger = LLMonitorLogger()
|
|
elif callback == "promptlayer":
|
|
promptLayerLogger = PromptLayerLogger()
|
|
elif callback == "langfuse":
|
|
langFuseLogger = LangFuseLogger()
|
|
elif callback == "aispend":
|
|
aispendLogger = AISpendLogger()
|
|
elif callback == "berrispend":
|
|
berrispendLogger = BerriSpendLogger()
|
|
elif callback == "supabase":
|
|
print_verbose(f"instantiating supabase")
|
|
supabaseClient = Supabase()
|
|
elif callback == "lite_debugger":
|
|
print_verbose(f"instantiating lite_debugger")
|
|
if function_id:
|
|
liteDebuggerClient = LiteDebugger(email=function_id)
|
|
elif litellm.token:
|
|
liteDebuggerClient = LiteDebugger(email=litellm.token)
|
|
elif litellm.email:
|
|
liteDebuggerClient = LiteDebugger(email=litellm.email)
|
|
else:
|
|
liteDebuggerClient = LiteDebugger(email=str(uuid.uuid4()))
|
|
elif callable(callback):
|
|
customLogger = CustomLogger(callback_func=callback)
|
|
except Exception as e:
|
|
raise e
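# Example (illustrative sketch, not part of the original module): registering a
# callable as a callback. set_callbacks() wraps any callable in CustomLogger, and
# handle_success() below routes it through customLogger.log_event(...). The exact
# signature the wrapped function receives depends on CustomLogger, so the callback
# here is a hypothetical placeholder.
#
#   def my_callback(*args, **kwargs):
#       pass
#
#   litellm.success_callback = [my_callback]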
def handle_failure(exception, traceback_exception, start_time, end_time, args, kwargs):
|
|
global sentry_sdk_instance, capture_exception, add_breadcrumb, posthog, slack_app, alerts_channel, aispendLogger, berrispendLogger, supabaseClient, liteDebuggerClient, llmonitorLogger
|
|
try:
|
|
# print_verbose(f"handle_failure args: {args}")
|
|
# print_verbose(f"handle_failure kwargs: {kwargs}")
|
|
|
|
success_handler = additional_details.pop("success_handler", None)
|
|
failure_handler = additional_details.pop("failure_handler", None)
|
|
|
|
additional_details["Event_Name"] = additional_details.pop(
|
|
"failed_event_name", "litellm.failed_query"
|
|
)
|
|
print_verbose(f"self.failure_callback: {litellm.failure_callback}")
|
|
for callback in litellm.failure_callback:
|
|
try:
|
|
if callback == "slack":
|
|
slack_msg = ""
|
|
if len(kwargs) > 0:
|
|
for key in kwargs:
|
|
slack_msg += f"{key}: {kwargs[key]}\n"
|
|
if len(args) > 0:
|
|
for i, arg in enumerate(args):
|
|
slack_msg += f"LiteLLM_Args_{str(i)}: {arg}"
|
|
for detail in additional_details:
|
|
slack_msg += f"{detail}: {additional_details[detail]}\n"
|
|
slack_msg += f"Traceback: {traceback_exception}"
|
|
slack_app.client.chat_postMessage(
|
|
channel=alerts_channel, text=slack_msg
|
|
)
|
|
elif callback == "sentry":
|
|
capture_exception(exception)
|
|
elif callback == "posthog":
|
|
print_verbose(
|
|
f"inside posthog, additional_details: {len(additional_details.keys())}"
|
|
)
|
|
ph_obj = {}
|
|
if len(kwargs) > 0:
|
|
ph_obj = kwargs
|
|
if len(args) > 0:
|
|
for i, arg in enumerate(args):
|
|
ph_obj["litellm_args_" + str(i)] = arg
|
|
for detail in additional_details:
|
|
ph_obj[detail] = additional_details[detail]
|
|
event_name = additional_details["Event_Name"]
|
|
print_verbose(f"ph_obj: {ph_obj}")
|
|
print_verbose(f"PostHog Event Name: {event_name}")
|
|
if "user_id" in additional_details:
|
|
posthog.capture(
|
|
additional_details["user_id"], event_name, ph_obj
|
|
)
|
|
else: # PostHog calls require a unique id to identify a user - https://posthog.com/docs/libraries/python
|
|
unique_id = str(uuid.uuid4())
|
|
                        posthog.capture(unique_id, event_name, ph_obj)
print_verbose(f"successfully logged to PostHog!")
|
|
elif callback == "berrispend":
|
|
print_verbose("reaches berrispend for logging!")
|
|
model = args[0] if len(args) > 0 else kwargs["model"]
|
|
messages = args[1] if len(args) > 1 else kwargs["messages"]
|
|
result = {
|
|
"model": model,
|
|
"created": time.time(),
|
|
"error": traceback_exception,
|
|
"usage": {
|
|
"prompt_tokens": prompt_token_calculator(
|
|
model, messages=messages
|
|
),
|
|
"completion_tokens": 0,
|
|
},
|
|
}
|
|
berrispendLogger.log_event(
|
|
model=model,
|
|
messages=messages,
|
|
response_obj=result,
|
|
start_time=start_time,
|
|
end_time=end_time,
|
|
print_verbose=print_verbose,
|
|
)
|
|
elif callback == "aispend":
|
|
print_verbose("reaches aispend for logging!")
|
|
model = args[0] if len(args) > 0 else kwargs["model"]
|
|
messages = args[1] if len(args) > 1 else kwargs["messages"]
|
|
result = {
|
|
"model": model,
|
|
"created": time.time(),
|
|
"usage": {
|
|
"prompt_tokens": prompt_token_calculator(
|
|
model, messages=messages
|
|
),
|
|
"completion_tokens": 0,
|
|
},
|
|
}
|
|
aispendLogger.log_event(
|
|
model=model,
|
|
response_obj=result,
|
|
start_time=start_time,
|
|
end_time=end_time,
|
|
print_verbose=print_verbose,
|
|
)
|
|
elif callback == "llmonitor":
|
|
print_verbose("reaches llmonitor for logging error!")
|
|
|
|
model = args[0] if len(args) > 0 else kwargs["model"]
|
|
|
|
input = (
|
|
args[1]
|
|
if len(args) > 1
|
|
else kwargs.get("messages", kwargs.get("input", None))
|
|
)
|
|
|
|
type = "embed" if "input" in kwargs else "llm"
|
|
|
|
llmonitorLogger.log_event(
|
|
type=type,
|
|
event="error",
|
|
user_id=litellm._thread_context.user,
|
|
model=model,
|
|
input=input,
|
|
error=traceback_exception,
|
|
run_id=kwargs["litellm_call_id"],
|
|
start_time=start_time,
|
|
end_time=end_time,
|
|
print_verbose=print_verbose,
|
|
)
|
|
elif callback == "supabase":
|
|
print_verbose("reaches supabase for logging!")
|
|
print_verbose(f"supabaseClient: {supabaseClient}")
|
|
model = args[0] if len(args) > 0 else kwargs["model"]
|
|
messages = args[1] if len(args) > 1 else kwargs["messages"]
|
|
result = {
|
|
"model": model,
|
|
"created": time.time(),
|
|
"error": traceback_exception,
|
|
"usage": {
|
|
"prompt_tokens": prompt_token_calculator(
|
|
model, messages=messages
|
|
),
|
|
"completion_tokens": 0,
|
|
},
|
|
}
|
|
supabaseClient.log_event(
|
|
model=model,
|
|
messages=messages,
|
|
end_user=litellm._thread_context.user,
|
|
response_obj=result,
|
|
start_time=start_time,
|
|
end_time=end_time,
|
|
litellm_call_id=kwargs["litellm_call_id"],
|
|
print_verbose=print_verbose,
|
|
)
|
|
except:
|
|
print_verbose(
|
|
f"Error Occurred while logging failure: {traceback.format_exc()}"
|
|
)
|
|
pass
|
|
|
|
if failure_handler and callable(failure_handler):
|
|
call_details = {
|
|
"exception": exception,
|
|
"additional_details": additional_details,
|
|
}
|
|
failure_handler(call_details)
|
|
pass
|
|
except Exception as e:
|
|
# LOGGING
|
|
exception_logging(logger_fn=user_logger_fn, exception=e)
|
|
pass
|
|
|
|
|
|
def handle_success(args, kwargs, result, start_time, end_time):
|
|
global heliconeLogger, aispendLogger, supabaseClient, liteDebuggerClient, llmonitorLogger
|
|
try:
|
|
model = args[0] if len(args) > 0 else kwargs["model"]
|
|
input = (
|
|
args[1]
|
|
if len(args) > 1
|
|
else kwargs.get("messages", kwargs.get("input", None))
|
|
)
|
|
success_handler = additional_details.pop("success_handler", None)
|
|
failure_handler = additional_details.pop("failure_handler", None)
|
|
additional_details["Event_Name"] = additional_details.pop(
|
|
"successful_event_name", "litellm.succes_query"
|
|
)
|
|
for callback in litellm.success_callback:
|
|
try:
|
|
if callback == "posthog":
|
|
ph_obj = {}
|
|
for detail in additional_details:
|
|
ph_obj[detail] = additional_details[detail]
|
|
event_name = additional_details["Event_Name"]
|
|
if "user_id" in additional_details:
|
|
posthog.capture(
|
|
additional_details["user_id"], event_name, ph_obj
|
|
)
|
|
else: # PostHog calls require a unique id to identify a user - https://posthog.com/docs/libraries/python
|
|
unique_id = str(uuid.uuid4())
|
|
posthog.capture(unique_id, event_name, ph_obj)
|
|
pass
|
|
elif callback == "slack":
|
|
slack_msg = ""
|
|
for detail in additional_details:
|
|
slack_msg += f"{detail}: {additional_details[detail]}\n"
|
|
slack_app.client.chat_postMessage(
|
|
channel=alerts_channel, text=slack_msg
|
|
)
|
|
elif callback == "helicone":
|
|
print_verbose("reaches helicone for logging!")
|
|
model = args[0] if len(args) > 0 else kwargs["model"]
|
|
messages = args[1] if len(args) > 1 else kwargs["messages"]
|
|
heliconeLogger.log_success(
|
|
model=model,
|
|
messages=messages,
|
|
response_obj=result,
|
|
start_time=start_time,
|
|
end_time=end_time,
|
|
print_verbose=print_verbose,
|
|
)
|
|
elif callback == "llmonitor":
|
|
print_verbose("reaches llmonitor for logging!")
|
|
model = args[0] if len(args) > 0 else kwargs["model"]
|
|
|
|
input = (
|
|
args[1]
|
|
if len(args) > 1
|
|
else kwargs.get("messages", kwargs.get("input", None))
|
|
)
|
|
|
|
# if contains input, it's 'embedding', otherwise 'llm'
|
|
type = "embed" if "input" in kwargs else "llm"
|
|
|
|
llmonitorLogger.log_event(
|
|
type=type,
|
|
event="end",
|
|
model=model,
|
|
input=input,
|
|
user_id=litellm._thread_context.user,
|
|
response_obj=result,
|
|
start_time=start_time,
|
|
end_time=end_time,
|
|
run_id=kwargs["litellm_call_id"],
|
|
print_verbose=print_verbose,
|
|
)
|
|
elif callback == "langfuse":
|
|
print_verbose("reaches langfuse for logging!")
|
|
langFuseLogger.log_event(
|
|
kwargs=kwargs,
|
|
response_obj=result,
|
|
start_time=start_time,
|
|
end_time=end_time,
|
|
print_verbose=print_verbose,
|
|
)
|
|
|
|
elif callback == "aispend":
|
|
print_verbose("reaches aispend for logging!")
|
|
model = args[0] if len(args) > 0 else kwargs["model"]
|
|
aispendLogger.log_event(
|
|
model=model,
|
|
response_obj=result,
|
|
start_time=start_time,
|
|
end_time=end_time,
|
|
print_verbose=print_verbose,
|
|
)
|
|
elif callback == "supabase":
|
|
print_verbose("reaches supabase for logging!")
|
|
model = args[0] if len(args) > 0 else kwargs["model"]
|
|
messages = (
|
|
args[1]
|
|
if len(args) > 1
|
|
else kwargs.get("messages", {"role": "user", "content": ""})
|
|
)
|
|
print(f"supabaseClient: {supabaseClient}")
|
|
supabaseClient.log_event(
|
|
model=model,
|
|
messages=messages,
|
|
end_user=litellm._thread_context.user,
|
|
response_obj=result,
|
|
start_time=start_time,
|
|
end_time=end_time,
|
|
litellm_call_id=kwargs["litellm_call_id"],
|
|
print_verbose=print_verbose,
|
|
)
|
|
elif callable(callback): # custom logger functions
|
|
customLogger.log_event(
|
|
kwargs=kwargs,
|
|
response_obj=result,
|
|
start_time=start_time,
|
|
end_time=end_time,
|
|
print_verbose=print_verbose,
|
|
)
|
|
except Exception as e:
|
|
# LOGGING
|
|
exception_logging(logger_fn=user_logger_fn, exception=e)
|
|
print_verbose(
|
|
f"[Non-Blocking] Success Callback Error - {traceback.format_exc()}"
|
|
)
|
|
pass
|
|
|
|
if success_handler and callable(success_handler):
|
|
success_handler(args, kwargs)
|
|
pass
|
|
except Exception as e:
|
|
# LOGGING
|
|
exception_logging(logger_fn=user_logger_fn, exception=e)
|
|
print_verbose(
|
|
f"[Non-Blocking] Success Callback Error - {traceback.format_exc()}"
|
|
)
|
|
pass
|
|
|
|
|
|
def acreate(*args, **kwargs): ## Thin client to handle the acreate langchain call
|
|
return litellm.acompletion(*args, **kwargs)
|
|
|
|
|
|
def prompt_token_calculator(model, messages):
|
|
# use tiktoken or anthropic's tokenizer depending on the model
|
|
text = " ".join(message["content"] for message in messages)
|
|
num_tokens = 0
|
|
if "claude" in model:
|
|
        try:
            import anthropic
        except ImportError:
            raise Exception("Anthropic import failed. Please run `pip install anthropic`.")
        from anthropic import Anthropic

        anthropic_client = Anthropic()
        num_tokens = anthropic_client.count_tokens(text)
else:
|
|
num_tokens = len(encoding.encode(text))
|
|
return num_tokens
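# Illustrative usage (sketch, not in the original file) - counting prompt tokens for
# a message list; the message content below is a placeholder:
#
#   msgs = [{"role": "user", "content": "Hello, how are you?"}]
#   n_tokens = prompt_token_calculator("claude-instant-1", messages=msgs)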
def valid_model(model):
|
|
try:
|
|
# for a given model name, check if the user has the right permissions to access the model
|
|
if (
|
|
model in litellm.open_ai_chat_completion_models
|
|
or model in litellm.open_ai_text_completion_models
|
|
):
|
|
openai.Model.retrieve(model)
|
|
else:
|
|
messages = [{"role": "user", "content": "Hello World"}]
|
|
litellm.completion(model=model, messages=messages)
|
|
    except:
        raise InvalidRequestError(message=f"The model `{model}` is not a valid model or you do not have access to it", model=model, llm_provider="")
# check valid api key
|
|
def check_valid_key(model: str, api_key: str):
|
|
# returns True if key is valid for the model
|
|
# returns False if key is invalid for the model
|
|
messages = [{"role": "user", "content": "Hey, how's it going?"}]
|
|
try:
|
|
litellm.completion(model=model, messages=messages, api_key=api_key, max_tokens=10)
|
|
return True
|
|
except AuthenticationError as e:
|
|
return False
|
|
except Exception as e:
|
|
return False
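# Illustrative usage (sketch, not in the original file) - verifying a key before
# making real calls; the key below is a placeholder:
#
#   if check_valid_key(model="gpt-3.5-turbo", api_key="sk-placeholder"):
#       print("key works for this model")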
# integration helper function
|
|
def modify_integration(integration_name, integration_params):
|
|
global supabaseClient
|
|
if integration_name == "supabase":
|
|
if "table_name" in integration_params:
|
|
Supabase.supabase_table_name = integration_params["table_name"]
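# Illustrative usage (sketch, not in the original file) - pointing the Supabase
# integration at a custom table; the table name below is hypothetical:
#
#   modify_integration("supabase", {"table_name": "my_request_logs"})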
# custom prompt helper function
|
|
def register_prompt_template(model: str, roles: dict, initial_prompt_value: str = "", final_prompt_value: str = ""):
|
|
"""
|
|
Example usage:
|
|
```
|
|
import litellm
|
|
litellm.register_prompt_template(
|
|
model="llama-2",
|
|
roles={
|
|
"system": {
|
|
"pre_message": "[INST] <<SYS>>\n",
|
|
"post_message": "\n<</SYS>>\n [/INST]\n"
|
|
},
|
|
"user": { # follow this format https://github.com/facebookresearch/llama/blob/77062717054710e352a99add63d160274ce670c6/llama/generation.py#L348
|
|
"pre_message": "[INST] ",
|
|
"post_message": " [/INST]\n"
|
|
},
|
|
"assistant": {
|
|
"post_message": "\n" # follows this - https://replicate.com/blog/how-to-prompt-llama
|
|
}
|
|
}
|
|
)
|
|
```
|
|
"""
|
|
litellm.custom_prompt_dict[model] = {
|
|
"roles": roles,
|
|
"initial_prompt_value": initial_prompt_value,
|
|
"final_prompt_value": final_prompt_value
|
|
}
|
|
return litellm.custom_prompt_dict
|
|
|
|
####### [BETA] HOSTED PRODUCT ################ - https://docs.litellm.ai/docs/debugging/hosted_debugging
|
|
|
|
|
|
def get_all_keys(llm_provider=None):
|
|
try:
|
|
global last_fetched_at_keys
|
|
# if user is using hosted product -> instantiate their env with their hosted api keys - refresh every 5 minutes
|
|
print_verbose(f"Reaches get all keys, llm_provider: {llm_provider}")
|
|
user_email = (
|
|
os.getenv("LITELLM_EMAIL")
|
|
or litellm.email
|
|
or litellm.token
|
|
or os.getenv("LITELLM_TOKEN")
|
|
)
|
|
if user_email:
|
|
time_delta = 0
|
|
            if last_fetched_at_keys is not None:
current_time = time.time()
|
|
time_delta = current_time - last_fetched_at_keys
|
|
            if (
                time_delta > 300 or last_fetched_at_keys is None or llm_provider
            ):  # if the llm provider is passed in, assume this is happening due to an AuthError for that provider
# make the api call
|
|
                last_fetched_at_keys = time.time()  # record when keys were last fetched (compared against the 5 minute window above)
                print_verbose(f"last_fetched_at_keys: {last_fetched_at_keys}")
response = requests.post(
|
|
url="http://api.litellm.ai/get_all_keys",
|
|
headers={"content-type": "application/json"},
|
|
data=json.dumps({"user_email": user_email}),
|
|
)
|
|
print_verbose(f"get model key response: {response.text}")
|
|
data = response.json()
|
|
# update model list
|
|
for key, value in data[
|
|
"model_keys"
|
|
].items(): # follows the LITELLM API KEY format - <UPPERCASE_PROVIDER_NAME>_API_KEY - e.g. HUGGINGFACE_API_KEY
|
|
os.environ[key] = value
|
|
# set model alias map
|
|
for model_alias, value in data["model_alias_map"].items():
|
|
litellm.model_alias_map[model_alias] = value
|
|
return "it worked!"
|
|
return None
|
|
return None
|
|
except:
|
|
print_verbose(
|
|
f"[Non-Blocking Error] get_all_keys error - {traceback.format_exc()}"
|
|
)
|
|
pass
|
|
|
|
|
|
def get_model_list():
|
|
global last_fetched_at
|
|
try:
|
|
# if user is using hosted product -> get their updated model list
|
|
user_email = (
|
|
os.getenv("LITELLM_EMAIL")
|
|
or litellm.email
|
|
or litellm.token
|
|
or os.getenv("LITELLM_TOKEN")
|
|
)
|
|
if user_email:
|
|
# make the api call
|
|
last_fetched_at = time.time()
|
|
print(f"last_fetched_at: {last_fetched_at}")
|
|
response = requests.post(
|
|
url="http://api.litellm.ai/get_model_list",
|
|
headers={"content-type": "application/json"},
|
|
data=json.dumps({"user_email": user_email}),
|
|
)
|
|
print_verbose(f"get_model_list response: {response.text}")
|
|
data = response.json()
|
|
# update model list
|
|
model_list = data["model_list"]
|
|
# # check if all model providers are in environment
|
|
# model_providers = data["model_providers"]
|
|
# missing_llm_provider = None
|
|
# for item in model_providers:
|
|
# if f"{item.upper()}_API_KEY" not in os.environ:
|
|
# missing_llm_provider = item
|
|
# break
|
|
# # update environment - if required
|
|
# threading.Thread(target=get_all_keys, args=(missing_llm_provider)).start()
|
|
return model_list
|
|
return [] # return empty list by default
|
|
    except:
        print_verbose(
            f"[Non-Blocking Error] get_model_list error - {traceback.format_exc()}"
        )
        return []  # also return an empty list if the fetch fails
####### EXCEPTION MAPPING ################
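# Illustrative sketch (not part of the original file): callers can catch the mapped
# exceptions raised by exception_type() below instead of provider-specific errors.
# The model and message values here are placeholders.
#
#   try:
#       litellm.completion(model="claude-instant-1", messages=[{"role": "user", "content": "hi"}])
#   except ContextWindowExceededError:
#       ...  # e.g. trim the prompt and retry
#   except RateLimitError:
#       ...  # e.g. back off and retry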
def exception_type(
|
|
model,
|
|
original_exception,
|
|
custom_llm_provider,
|
|
completion_kwargs={},
|
|
):
|
|
global user_logger_fn, liteDebuggerClient
|
|
exception_mapping_worked = False
|
|
print()
|
|
print("\033[1;31mGive Feedback / Get Help: https://github.com/BerriAI/litellm/issues/new\033[0m")
|
|
print()
|
|
print()
|
|
    if litellm.set_verbose:
litellm.error_logs['EXCEPTION'] = original_exception
|
|
litellm.error_logs['KWARGS'] = completion_kwargs
|
|
try:
|
|
# code to show users their litellm error dashboard
|
|
import urllib.parse
|
|
import json
|
|
for log_key in litellm.error_logs:
|
|
current_logs = litellm.error_logs[log_key]
|
|
if type(current_logs) == dict:
|
|
filtered_error_logs = {key: str(value) for key, value in current_logs.items()}
|
|
litellm.error_logs[log_key] = filtered_error_logs
|
|
else:
|
|
litellm.error_logs[log_key] = str(current_logs)
|
|
|
|
# Convert the filtered_error_logs dictionary to a JSON string
|
|
error_logs_json = json.dumps(litellm.error_logs)
|
|
# URL-encode the JSON data
|
|
encoded_data = urllib.parse.quote(error_logs_json)
|
|
|
|
print("👉 view error logs:")
|
|
print("\033[91m" + '\033[4m' + 'https://logs.litellm.ai/?data=' + str(encoded_data) + "\033[0m")
|
|
|
|
except:
|
|
pass
|
|
try:
|
|
if isinstance(original_exception, OriginalError):
|
|
# Handle the OpenAIError
|
|
exception_mapping_worked = True
|
|
if model in litellm.openrouter_models:
|
|
if original_exception.http_status == 413:
|
|
raise InvalidRequestError(
|
|
message=str(original_exception),
|
|
model=model,
|
|
llm_provider="openrouter"
|
|
)
|
|
original_exception.llm_provider = "openrouter"
|
|
elif custom_llm_provider == "azure":
|
|
original_exception.llm_provider = "azure"
|
|
else:
|
|
original_exception.llm_provider = "openai"
|
|
if "This model's maximum context length is" in original_exception._message:
|
|
raise ContextWindowExceededError(
|
|
message=str(original_exception),
|
|
model=model,
|
|
llm_provider=original_exception.llm_provider
|
|
)
|
|
raise original_exception
|
|
elif model:
|
|
error_str = str(original_exception)
|
|
if isinstance(original_exception, BaseException):
|
|
exception_type = type(original_exception).__name__
|
|
else:
|
|
exception_type = ""
|
|
if custom_llm_provider == "anthropic": # one of the anthropics
|
|
if hasattr(original_exception, "message"):
|
|
if "prompt is too long" in original_exception.message:
|
|
exception_mapping_worked = True
|
|
raise ContextWindowExceededError(
|
|
message=original_exception.message,
|
|
model=model,
|
|
llm_provider="anthropic"
|
|
)
|
|
if "Invalid API Key" in original_exception.message:
|
|
exception_mapping_worked = True
|
|
raise AuthenticationError(
|
|
message=original_exception.message,
|
|
model=model,
|
|
llm_provider="anthropic"
|
|
)
|
|
if hasattr(original_exception, "status_code"):
|
|
print_verbose(f"status_code: {original_exception.status_code}")
|
|
if original_exception.status_code == 401:
|
|
exception_mapping_worked = True
|
|
raise AuthenticationError(
|
|
message=f"AnthropicException - {original_exception.message}",
|
|
llm_provider="anthropic",
|
|
model=model
|
|
)
|
|
elif original_exception.status_code == 400:
|
|
exception_mapping_worked = True
|
|
raise InvalidRequestError(
|
|
message=f"AnthropicException - {original_exception.message}",
|
|
model=model,
|
|
llm_provider="anthropic",
|
|
)
|
|
elif original_exception.status_code == 408:
|
|
exception_mapping_worked = True
|
|
raise Timeout(
|
|
message=f"AnthropicException - {original_exception.message}",
|
|
model=model,
|
|
llm_provider="anthropic"
|
|
)
|
|
elif original_exception.status_code == 413:
|
|
exception_mapping_worked = True
|
|
raise InvalidRequestError(
|
|
message=f"AnthropicException - {original_exception.message}",
|
|
model=model,
|
|
llm_provider="anthropic",
|
|
)
|
|
elif original_exception.status_code == 429:
|
|
exception_mapping_worked = True
|
|
raise RateLimitError(
|
|
message=f"AnthropicException - {original_exception.message}",
|
|
llm_provider="anthropic",
|
|
model=model
|
|
)
|
|
elif original_exception.status_code == 500:
|
|
exception_mapping_worked = True
|
|
raise ServiceUnavailableError(
|
|
message=f"AnthropicException - {original_exception.message}",
|
|
llm_provider="anthropic",
|
|
model=model
|
|
)
|
|
else:
|
|
exception_mapping_worked = True
|
|
raise APIError(
|
|
status_code=original_exception.status_code,
|
|
message=f"AnthropicException - {original_exception.message}",
|
|
llm_provider="anthropic",
|
|
model=model
|
|
)
|
|
elif custom_llm_provider == "replicate":
|
|
if "Incorrect authentication token" in error_str:
|
|
exception_mapping_worked = True
|
|
raise AuthenticationError(
|
|
message=f"ReplicateException - {error_str}",
|
|
llm_provider="replicate",
|
|
model=model
|
|
)
|
|
elif "input is too long" in error_str:
|
|
exception_mapping_worked = True
|
|
raise ContextWindowExceededError(
|
|
message=f"ReplicateException - {error_str}",
|
|
model=model,
|
|
llm_provider="replicate",
|
|
)
|
|
elif exception_type == "ModelError":
|
|
exception_mapping_worked = True
|
|
raise InvalidRequestError(
|
|
message=f"ReplicateException - {error_str}",
|
|
model=model,
|
|
llm_provider="replicate",
|
|
)
|
|
elif "Request was throttled" in error_str:
|
|
exception_mapping_worked = True
|
|
raise RateLimitError(
|
|
message=f"ReplicateException - {error_str}",
|
|
llm_provider="replicate",
|
|
model=model
|
|
)
|
|
elif hasattr(original_exception, "status_code"):
|
|
if original_exception.status_code == 401:
|
|
exception_mapping_worked = True
|
|
raise AuthenticationError(
|
|
message=f"ReplicateException - {original_exception.message}",
|
|
llm_provider="replicate",
|
|
model=model
|
|
)
|
|
elif original_exception.status_code == 400 or original_exception.status_code == 422:
|
|
exception_mapping_worked = True
|
|
raise InvalidRequestError(
|
|
message=f"ReplicateException - {original_exception.message}",
|
|
model=model,
|
|
llm_provider="replicate",
|
|
)
|
|
elif original_exception.status_code == 408:
|
|
exception_mapping_worked = True
|
|
raise Timeout(
|
|
message=f"ReplicateException - {original_exception.message}",
|
|
model=model,
|
|
llm_provider="replicate"
|
|
)
|
|
elif original_exception.status_code == 413:
|
|
exception_mapping_worked = True
|
|
raise InvalidRequestError(
|
|
message=f"ReplicateException - {original_exception.message}",
|
|
model=model,
|
|
llm_provider="replicate",
|
|
)
|
|
elif original_exception.status_code == 429:
|
|
exception_mapping_worked = True
|
|
raise RateLimitError(
|
|
message=f"ReplicateException - {original_exception.message}",
|
|
llm_provider="replicate",
|
|
model=model
|
|
)
|
|
elif original_exception.status_code == 500:
|
|
exception_mapping_worked = True
|
|
raise ServiceUnavailableError(
|
|
message=f"ReplicateException - {original_exception.message}",
|
|
llm_provider="replicate",
|
|
model=model
|
|
)
|
|
exception_mapping_worked = True
|
|
raise APIError(
|
|
status_code=original_exception.status_code,
|
|
message=f"ReplicateException - {original_exception.message}",
|
|
llm_provider="replicate",
|
|
model=model
|
|
)
|
|
elif custom_llm_provider == "bedrock":
|
|
if "Unable to locate credentials" in error_str:
|
|
exception_mapping_worked = True
|
|
raise InvalidRequestError(
|
|
message=f"BedrockException - {error_str}",
|
|
model=model,
|
|
llm_provider="bedrock"
|
|
)
|
|
elif custom_llm_provider == "sagemaker":
|
|
if "Unable to locate credentials" in error_str:
|
|
exception_mapping_worked = True
|
|
raise InvalidRequestError(
|
|
message=f"SagemakerException - {error_str}",
|
|
model=model,
|
|
llm_provider="sagemaker"
|
|
)
|
|
elif custom_llm_provider == "vertex_ai":
|
|
if "Vertex AI API has not been used in project" in error_str or "Unable to find your project" in error_str:
|
|
exception_mapping_worked = True
|
|
raise InvalidRequestError(
|
|
message=f"VertexAIException - {error_str}",
|
|
model=model,
|
|
llm_provider="vertex_ai"
|
|
)
|
|
elif custom_llm_provider == "palm":
|
|
if "503 Getting metadata" in error_str:
|
|
# auth errors look like this
|
|
# 503 Getting metadata from plugin failed with error: Reauthentication is needed. Please run `gcloud auth application-default login` to reauthenticate.
|
|
exception_mapping_worked = True
|
|
raise InvalidRequestError(
|
|
message=f"PalmException - Invalid api key",
|
|
model=model,
|
|
llm_provider="palm"
|
|
)
|
|
if "400 Request payload size exceeds" in error_str:
|
|
exception_mapping_worked = True
|
|
raise ContextWindowExceededError(
|
|
message=f"PalmException - {error_str}",
|
|
model=model,
|
|
llm_provider="palm",
|
|
)
|
|
# Dailed: Error occurred: 400 Request payload size exceeds the limit: 20000 bytes
|
|
elif custom_llm_provider == "cohere": # Cohere
|
|
if (
|
|
"invalid api token" in error_str
|
|
or "No API key provided." in error_str
|
|
):
|
|
exception_mapping_worked = True
|
|
raise AuthenticationError(
|
|
message=f"CohereException - {original_exception.message}",
|
|
llm_provider="cohere",
|
|
model=model
|
|
)
|
|
elif "too many tokens" in error_str:
|
|
exception_mapping_worked = True
|
|
raise ContextWindowExceededError(
|
|
message=f"CohereException - {original_exception.message}",
|
|
model=model,
|
|
llm_provider="cohere",
|
|
)
|
|
elif hasattr(original_exception, "status_code"):
|
|
if original_exception.status_code == 400 or original_exception.status_code == 498:
|
|
exception_mapping_worked = True
|
|
raise InvalidRequestError(
|
|
message=f"CohereException - {original_exception.message}",
|
|
llm_provider="cohere",
|
|
model=model
|
|
)
|
|
elif original_exception.status_code == 500:
|
|
exception_mapping_worked = True
|
|
raise ServiceUnavailableError(
|
|
message=f"CohereException - {original_exception.message}",
|
|
llm_provider="cohere",
|
|
model=model
|
|
)
|
|
elif (
|
|
"CohereConnectionError" in exception_type
|
|
): # cohere seems to fire these errors when we load test it (1k+ messages / min)
|
|
exception_mapping_worked = True
|
|
raise RateLimitError(
|
|
message=f"CohereException - {original_exception.message}",
|
|
llm_provider="cohere",
|
|
model=model
|
|
)
|
|
elif "invalid type:" in error_str:
|
|
exception_mapping_worked = True
|
|
raise InvalidRequestError(
|
|
message=f"CohereException - {original_exception.message}",
|
|
llm_provider="cohere",
|
|
model=model
|
|
)
|
|
elif "Unexpected server error" in error_str:
|
|
exception_mapping_worked = True
|
|
raise ServiceUnavailableError(
|
|
message=f"CohereException - {original_exception.message}",
|
|
llm_provider="cohere",
|
|
model=model
|
|
)
|
|
else:
|
|
if hasattr(original_exception, "status_code"):
|
|
exception_mapping_worked = True
|
|
raise APIError(
|
|
status_code=original_exception.status_code,
|
|
message=f"CohereException - {original_exception.message}",
|
|
llm_provider="cohere",
|
|
model=model
|
|
)
|
|
raise original_exception
|
|
elif custom_llm_provider == "huggingface":
|
|
if "length limit exceeded" in error_str:
|
|
exception_mapping_worked = True
|
|
raise ContextWindowExceededError(
|
|
message=error_str,
|
|
model=model,
|
|
llm_provider="huggingface"
|
|
)
|
|
elif "A valid user token is required" in error_str:
|
|
exception_mapping_worked = True
|
|
raise InvalidRequestError(
|
|
message=error_str,
|
|
llm_provider="huggingface",
|
|
model=model
|
|
)
|
|
if hasattr(original_exception, "status_code"):
|
|
if original_exception.status_code == 401:
|
|
exception_mapping_worked = True
|
|
raise AuthenticationError(
|
|
message=f"HuggingfaceException - {original_exception.message}",
|
|
llm_provider="huggingface",
|
|
model=model
|
|
)
|
|
elif original_exception.status_code == 400:
|
|
exception_mapping_worked = True
|
|
raise InvalidRequestError(
|
|
message=f"HuggingfaceException - {original_exception.message}",
|
|
model=model,
|
|
llm_provider="huggingface",
|
|
)
|
|
elif original_exception.status_code == 408:
|
|
exception_mapping_worked = True
|
|
raise Timeout(
|
|
message=f"HuggingfaceException - {original_exception.message}",
|
|
model=model,
|
|
llm_provider="huggingface"
|
|
)
|
|
elif original_exception.status_code == 429:
|
|
exception_mapping_worked = True
|
|
raise RateLimitError(
|
|
message=f"HuggingfaceException - {original_exception.message}",
|
|
llm_provider="huggingface",
|
|
model=model
|
|
)
|
|
else:
|
|
exception_mapping_worked = True
|
|
raise APIError(
|
|
status_code=original_exception.status_code,
|
|
message=f"HuggingfaceException - {original_exception.message}",
|
|
llm_provider="huggingface",
|
|
model=model
|
|
)
|
|
exception_mapping_worked = True
|
|
raise APIError(status_code=500, message=error_str, model=model, llm_provider=custom_llm_provider)
|
|
elif custom_llm_provider == "ai21":
|
|
if hasattr(original_exception, "message"):
|
|
if "Prompt has too many tokens" in original_exception.message:
|
|
exception_mapping_worked = True
|
|
raise ContextWindowExceededError(
|
|
message=f"AI21Exception - {original_exception.message}",
|
|
model=model,
|
|
llm_provider="ai21"
|
|
)
|
|
if "Bad or missing API token." in original_exception.message:
|
|
exception_mapping_worked = True
|
|
raise InvalidRequestError(
|
|
message=f"AI21Exception - {original_exception.message}",
|
|
model=model,
|
|
llm_provider="ai21"
|
|
)
|
|
if hasattr(original_exception, "status_code"):
|
|
if original_exception.status_code == 401:
|
|
exception_mapping_worked = True
|
|
raise AuthenticationError(
|
|
message=f"AI21Exception - {original_exception.message}",
|
|
llm_provider="ai21",
|
|
model=model
|
|
)
|
|
elif original_exception.status_code == 408:
|
|
exception_mapping_worked = True
|
|
raise Timeout(
|
|
message=f"AI21Exception - {original_exception.message}",
|
|
model=model,
|
|
llm_provider="ai21"
|
|
)
|
|
                    elif original_exception.status_code == 422:
exception_mapping_worked = True
|
|
raise InvalidRequestError(
|
|
message=f"AI21Exception - {original_exception.message}",
|
|
model=model,
|
|
llm_provider="ai21",
|
|
)
|
|
elif original_exception.status_code == 429:
|
|
exception_mapping_worked = True
|
|
                        raise RateLimitError(
                            message=f"AI21Exception - {original_exception.message}",
                            llm_provider="ai21",
                            model=model
                        )
else:
|
|
exception_mapping_worked = True
|
|
raise APIError(
|
|
status_code=original_exception.status_code,
|
|
message=f"AI21Exception - {original_exception.message}",
|
|
llm_provider="ai21",
|
|
model=model
|
|
)
|
|
elif custom_llm_provider == "nlp_cloud":
|
|
if "detail" in error_str:
|
|
if "Input text length should not exceed" in error_str:
|
|
exception_mapping_worked = True
|
|
raise ContextWindowExceededError(
|
|
message=f"NLPCloudException - {error_str}",
|
|
model=model,
|
|
llm_provider="nlp_cloud"
|
|
)
|
|
elif "value is not a valid" in error_str:
|
|
exception_mapping_worked = True
|
|
raise InvalidRequestError(
|
|
message=f"NLPCloudException - {error_str}",
|
|
model=model,
|
|
llm_provider="nlp_cloud"
|
|
)
|
|
else:
|
|
exception_mapping_worked = True
|
|
raise APIError(
|
|
status_code=500,
|
|
message=f"NLPCloudException - {error_str}",
|
|
model=model,
|
|
llm_provider="nlp_cloud"
|
|
)
|
|
if hasattr(original_exception, "status_code"): # https://docs.nlpcloud.com/?shell#errors
|
|
if original_exception.status_code == 400 or original_exception.status_code == 406 or original_exception.status_code == 413 or original_exception.status_code == 422:
|
|
exception_mapping_worked = True
|
|
raise InvalidRequestError(
|
|
message=f"NLPCloudException - {original_exception.message}",
|
|
llm_provider="nlp_cloud",
|
|
model=model
|
|
)
|
|
elif original_exception.status_code == 401 or original_exception.status_code == 403:
|
|
exception_mapping_worked = True
|
|
raise AuthenticationError(
|
|
message=f"NLPCloudException - {original_exception.message}",
|
|
llm_provider="nlp_cloud",
|
|
model=model
|
|
)
|
|
elif original_exception.status_code == 522 or original_exception.status_code == 524:
|
|
exception_mapping_worked = True
|
|
raise Timeout(
|
|
message=f"NLPCloudException - {original_exception.message}",
|
|
model=model,
|
|
llm_provider="nlp_cloud"
|
|
)
|
|
elif original_exception.status_code == 429 or original_exception.status_code == 402:
|
|
exception_mapping_worked = True
|
|
                        raise RateLimitError(
                            message=f"NLPCloudException - {original_exception.message}",
                            llm_provider="nlp_cloud",
                            model=model
                        )
elif original_exception.status_code == 500 or original_exception.status_code == 503:
|
|
exception_mapping_worked = True
|
|
raise APIError(
|
|
status_code=original_exception.status_code,
|
|
message=f"NLPCloudException - {original_exception.message}",
|
|
llm_provider="nlp_cloud",
|
|
model=model
|
|
)
|
|
elif original_exception.status_code == 504 or original_exception.status_code == 520:
|
|
exception_mapping_worked = True
|
|
raise ServiceUnavailableError(
|
|
message=f"NLPCloudException - {original_exception.message}",
|
|
model=model,
|
|
llm_provider="nlp_cloud"
|
|
)
|
|
else:
|
|
exception_mapping_worked = True
|
|
raise APIError(
|
|
status_code=original_exception.status_code,
|
|
message=f"NLPCloudException - {original_exception.message}",
|
|
llm_provider="nlp_cloud",
|
|
model=model
|
|
)
|
|
elif custom_llm_provider == "together_ai":
|
|
import json
|
|
error_response = json.loads(error_str)
|
|
if "error" in error_response and "`inputs` tokens + `max_new_tokens` must be <=" in error_response["error"]:
|
|
exception_mapping_worked = True
|
|
raise ContextWindowExceededError(
|
|
message=f"TogetherAIException - {error_response['error']}",
|
|
model=model,
|
|
llm_provider="together_ai"
|
|
)
|
|
elif "error" in error_response and "invalid private key" in error_response["error"]:
|
|
exception_mapping_worked = True
|
|
raise AuthenticationError(
|
|
message=f"TogetherAIException - {error_response['error']}",
|
|
llm_provider="together_ai",
|
|
model=model
|
|
)
|
|
elif "error" in error_response and "INVALID_ARGUMENT" in error_response["error"]:
|
|
exception_mapping_worked = True
|
|
raise InvalidRequestError(
|
|
message=f"TogetherAIException - {error_response['error']}",
|
|
model=model,
|
|
llm_provider="together_ai"
|
|
)
|
|
elif "error" in error_response and "API key doesn't match expected format." in error_response["error"]:
|
|
exception_mapping_worked = True
|
|
raise InvalidRequestError(
|
|
message=f"TogetherAIException - {error_response['error']}",
|
|
model=model,
|
|
llm_provider="together_ai"
|
|
)
|
|
elif "error_type" in error_response and error_response["error_type"] == "validation":
|
|
exception_mapping_worked = True
|
|
raise InvalidRequestError(
|
|
message=f"TogetherAIException - {error_response['error']}",
|
|
model=model,
|
|
llm_provider="together_ai"
|
|
)
|
|
elif original_exception.status_code == 408:
|
|
exception_mapping_worked = True
|
|
raise Timeout(
|
|
message=f"TogetherAIException - {original_exception.message}",
|
|
model=model,
|
|
llm_provider="together_ai"
|
|
)
|
|
elif original_exception.status_code == 429:
|
|
exception_mapping_worked = True
|
|
raise RateLimitError(
|
|
message=f"TogetherAIException - {original_exception.message}",
|
|
llm_provider="together_ai",
|
|
model=model
|
|
)
|
|
else:
|
|
exception_mapping_worked = True
|
|
raise APIError(
|
|
status_code=original_exception.status_code,
|
|
message=f"TogetherAIException - {original_exception.message}",
|
|
llm_provider="together_ai",
|
|
model=model
|
|
)
|
|
elif custom_llm_provider == "aleph_alpha":
|
|
if "This is longer than the model's maximum context length" in error_str:
|
|
exception_mapping_worked = True
|
|
raise ContextWindowExceededError(
|
|
message=f"AlephAlphaException - {original_exception.message}",
|
|
llm_provider="aleph_alpha",
|
|
model=model
|
|
)
|
|
elif "InvalidToken" in error_str or "No token provided" in error_str:
|
|
exception_mapping_worked = True
|
|
raise InvalidRequestError(
|
|
message=f"AlephAlphaException - {original_exception.message}",
|
|
llm_provider="aleph_alpha",
|
|
model=model
|
|
)
|
|
elif hasattr(original_exception, "status_code"):
|
|
print(f"status code: {original_exception.status_code}")
|
|
if original_exception.status_code == 401:
|
|
exception_mapping_worked = True
|
|
raise AuthenticationError(
|
|
message=f"AlephAlphaException - {original_exception.message}",
|
|
llm_provider="aleph_alpha",
|
|
model=model
|
|
)
|
|
elif original_exception.status_code == 400:
|
|
exception_mapping_worked = True
|
|
raise InvalidRequestError(
|
|
message=f"AlephAlphaException - {original_exception.message}",
|
|
llm_provider="aleph_alpha",
|
|
model=model
|
|
)
|
|
elif original_exception.status_code == 429:
|
|
exception_mapping_worked = True
|
|
raise RateLimitError(
|
|
message=f"AlephAlphaException - {original_exception.message}",
|
|
llm_provider="aleph_alpha",
|
|
model=model
|
|
)
|
|
elif original_exception.status_code == 500:
|
|
exception_mapping_worked = True
|
|
raise ServiceUnavailableError(
|
|
message=f"AlephAlphaException - {original_exception.message}",
|
|
llm_provider="aleph_alpha",
|
|
model=model
|
|
)
|
|
raise original_exception
|
|
raise original_exception
|
|
elif custom_llm_provider == "vllm":
|
|
if hasattr(original_exception, "status_code"):
|
|
if original_exception.status_code == 0:
|
|
exception_mapping_worked = True
|
|
raise APIConnectionError(
|
|
message=f"VLLMException - {original_exception.message}",
|
|
llm_provider="vllm",
|
|
model=model
|
|
)
|
|
elif custom_llm_provider == "ollama":
|
|
if "no attribute 'async_get_ollama_response_stream" in error_str:
|
|
raise ImportError("Import error - trying to use async for ollama. import async_generator failed. Try 'pip install async_generator'")
|
|
exception_mapping_worked = True
|
|
raise APIError(status_code=500, message=str(original_exception), llm_provider=custom_llm_provider, model=model)
|
|
except Exception as e:
|
|
# LOGGING
|
|
exception_logging(
|
|
logger_fn=user_logger_fn,
|
|
additional_args={
|
|
"exception_mapping_worked": exception_mapping_worked,
|
|
"original_exception": original_exception,
|
|
},
|
|
exception=e,
|
|
)
|
|
## AUTH ERROR
|
|
if isinstance(e, AuthenticationError) and (
|
|
litellm.email or "LITELLM_EMAIL" in os.environ
|
|
):
|
|
threading.Thread(target=get_all_keys, args=(e.llm_provider,)).start()
|
|
# don't let an error with mapping interrupt the user from receiving an error from the llm api calls
|
|
if exception_mapping_worked:
|
|
raise e
|
|
else:
|
|
raise original_exception
|
|
|
|
|
|
####### CRASH REPORTING ################
|
|
def safe_crash_reporting(model=None, exception=None, custom_llm_provider=None):
|
|
data = {
|
|
"model": model,
|
|
"exception": str(exception),
|
|
"custom_llm_provider": custom_llm_provider,
|
|
}
|
|
threading.Thread(target=litellm_telemetry, args=(data,)).start()
|
|
|
|
def get_or_generate_uuid():
|
|
temp_dir = os.path.join(os.path.abspath(os.sep), "tmp")
|
|
uuid_file = os.path.join(temp_dir, "litellm_uuid.txt")
|
|
try:
|
|
# Try to open the file and load the UUID
|
|
with open(uuid_file, "r") as file:
|
|
uuid_value = file.read()
|
|
if uuid_value:
|
|
uuid_value = uuid_value.strip()
|
|
else:
|
|
raise FileNotFoundError
|
|
|
|
except FileNotFoundError:
|
|
# Generate a new UUID if the file doesn't exist or is empty
|
|
try:
|
|
new_uuid = uuid.uuid4()
|
|
uuid_value = str(new_uuid)
|
|
with open(uuid_file, "w") as file:
|
|
file.write(uuid_value)
|
|
except: # if writing to tmp/litellm_uuid.txt then retry writing to litellm_uuid.txt
|
|
try:
|
|
new_uuid = uuid.uuid4()
|
|
uuid_value = str(new_uuid)
|
|
with open("litellm_uuid.txt", "w") as file:
|
|
file.write(uuid_value)
|
|
except: # if this 3rd attempt fails just pass
|
|
# Good first issue for someone to improve this function :)
|
|
return
|
|
except:
|
|
# [Non-Blocking Error]
|
|
return
|
|
return uuid_value
|
|
|
|
|
|
def litellm_telemetry(data):
|
|
# Load or generate the UUID
|
|
uuid_value = ""
|
|
try:
|
|
uuid_value = get_or_generate_uuid()
|
|
except:
|
|
uuid_value = str(uuid.uuid4())
|
|
try:
|
|
# Prepare the data to send to litellm logging api
|
|
payload = {
|
|
"uuid": uuid_value,
|
|
"data": data,
|
|
"version:": importlib.metadata.version("litellm"),
|
|
}
|
|
# Make the POST request to litellm logging api
|
|
response = requests.post(
|
|
"https://litellm.berri.ai/logging",
|
|
headers={"Content-Type": "application/json"},
|
|
json=payload,
|
|
)
|
|
response.raise_for_status() # Raise an exception for HTTP errors
|
|
except:
|
|
# [Non-Blocking Error]
|
|
return
|
|
|
|
|
|
######### Secret Manager ############################
|
|
# checks if user has passed in a secret manager client
|
|
# if passed in then checks the secret there
|
|
def get_secret(secret_name):
|
|
if litellm.secret_manager_client != None:
|
|
# TODO: check which secret manager is being used
|
|
# currently only supports Infisical
|
|
try:
|
|
secret = litellm.secret_manager_client.get_secret(secret_name).secret_value
|
|
except:
|
|
secret = None
|
|
return secret
|
|
else:
|
|
return os.environ.get(secret_name)
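# Illustrative usage (sketch, not in the original file) - get_secret falls back to
# os.environ when no secret manager client is configured:
#
#   openai_key = get_secret("OPENAI_API_KEY")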
######## Streaming Class ############################
|
|
# wraps the completion stream to return the correct format for the model
|
|
# replicate/anthropic/cohere
|
|
class CustomStreamWrapper:
|
|
def __init__(self, completion_stream, model, custom_llm_provider=None, logging_obj=None):
|
|
self.model = model
|
|
self.custom_llm_provider = custom_llm_provider
|
|
self.logging_obj = logging_obj
|
|
self.completion_stream = completion_stream
|
|
self.sent_first_chunk = False
|
|
self.sent_last_chunk = False
|
|
if self.logging_obj:
|
|
# Log the type of the received item
|
|
self.logging_obj.post_call(str(type(completion_stream)))
|
|
|
|
def __iter__(self):
|
|
return self
|
|
|
|
def __aiter__(self):
|
|
return self
|
|
|
|
def logging(self, text):
|
|
if self.logging_obj:
|
|
self.logging_obj.post_call(text)
|
|
|
|
def handle_anthropic_chunk(self, chunk):
|
|
str_line = chunk.decode("utf-8") # Convert bytes to string
|
|
text = ""
|
|
is_finished = False
|
|
finish_reason = None
|
|
if str_line.startswith("data:"):
|
|
data_json = json.loads(str_line[5:])
|
|
text = data_json.get("completion", "")
|
|
if data_json.get("stop_reason", None):
|
|
is_finished = True
|
|
finish_reason = data_json["stop_reason"]
|
|
return {"text": text, "is_finished": is_finished, "finish_reason": finish_reason}
|
|
elif "error" in str_line:
|
|
raise ValueError(f"Unable to parse response. Original response: {str_line}")
|
|
else:
|
|
return {"text": text, "is_finished": is_finished, "finish_reason": finish_reason}
def handle_together_ai_chunk(self, chunk):
|
|
chunk = chunk.decode("utf-8")
|
|
text = ""
|
|
is_finished = False
|
|
finish_reason = None
|
|
if "text" in chunk:
|
|
text_index = chunk.find('"text":"') # this checks if text: exists
|
|
text_start = text_index + len('"text":"')
|
|
text_end = chunk.find('"}', text_start)
|
|
if text_index != -1 and text_end != -1:
|
|
extracted_text = chunk[text_start:text_end]
|
|
text = extracted_text
|
|
return {"text": text, "is_finished": is_finished, "finish_reason": finish_reason}
|
|
elif "[DONE]" in chunk:
|
|
return {"text": text, "is_finished": True, "finish_reason": "stop"}
|
|
elif "error" in chunk:
|
|
raise ValueError(chunk)
|
|
else:
|
|
return {"text": text, "is_finished": is_finished, "finish_reason": finish_reason}
|
|
|
|
def handle_huggingface_chunk(self, chunk):
|
|
chunk = chunk.decode("utf-8")
|
|
text = ""
|
|
is_finished = False
|
|
finish_reason = ""
|
|
if chunk.startswith("data:"):
|
|
data_json = json.loads(chunk[5:])
|
|
print(f"data json: {data_json}")
|
|
if "token" in data_json and "text" in data_json["token"]:
|
|
text = data_json["token"]["text"]
|
|
if "meta-llama/Llama-2" in self.model: #clean eos tokens like </s> from the returned output text
|
|
if any(token in text for token in llama_2_special_tokens):
|
|
text = text.replace("<s>", "").replace("</s>", "")
|
|
if data_json.get("details", False) and data_json["details"].get("finish_reason", False):
|
|
is_finished = True
|
|
finish_reason = data_json["details"]["finish_reason"]
|
|
return {"text": text, "is_finished": is_finished, "finish_reason": finish_reason}
|
|
elif "error" in chunk:
|
|
raise ValueError(chunk)
|
|
return {"text": text, "is_finished": is_finished, "finish_reason": finish_reason}
|
|
|
|
def handle_ai21_chunk(self, chunk): # fake streaming
|
|
chunk = chunk.decode("utf-8")
|
|
data_json = json.loads(chunk)
|
|
try:
|
|
text = data_json["completions"][0]["data"]["text"]
|
|
is_finished = True
|
|
finish_reason = "stop"
|
|
return {"text": text, "is_finished": is_finished, "finish_reason": finish_reason}
|
|
except:
|
|
raise ValueError(f"Unable to parse response. Original response: {chunk}")
|
|
|
|
def handle_nlp_cloud_chunk(self, chunk):
|
|
chunk = chunk.decode("utf-8")
|
|
data_json = json.loads(chunk)
|
|
try:
|
|
text = data_json["generated_text"]
|
|
is_finished = True
|
|
finish_reason = "stop"
|
|
return {"text": text, "is_finished": is_finished, "finish_reason": finish_reason}
|
|
except:
|
|
raise ValueError(f"Unable to parse response. Original response: {chunk}")
|
|
|
|
def handle_aleph_alpha_chunk(self, chunk):
|
|
chunk = chunk.decode("utf-8")
|
|
data_json = json.loads(chunk)
|
|
try:
|
|
text = data_json["completions"][0]["completion"]
|
|
is_finished = True
|
|
finish_reason = "stop"
|
|
return {"text": text, "is_finished": is_finished, "finish_reason": finish_reason}
|
|
except:
|
|
raise ValueError(f"Unable to parse response. Original response: {chunk}")
|
|
|
|
def handle_cohere_chunk(self, chunk):
|
|
chunk = chunk.decode("utf-8")
|
|
data_json = json.loads(chunk)
|
|
try:
|
|
text = ""
|
|
is_finished = False
|
|
finish_reason = ""
|
|
if "text" in data_json:
|
|
text = data_json["text"]
|
|
elif "is_finished" in data_json:
|
|
is_finished = data_json["is_finished"]
|
|
finish_reason = data_json["finish_reason"]
|
|
else:
|
|
raise Exception(data_json)
|
|
return {"text": text, "is_finished": is_finished, "finish_reason": finish_reason}
|
|
except:
|
|
raise ValueError(f"Unable to parse response. Original response: {chunk}")
|
|
|
|
def handle_replicate_chunk(self, chunk):
|
|
print(f"chunk: {chunk}")
|
|
try:
|
|
text = ""
|
|
is_finished = False
|
|
finish_reason = ""
|
|
if "output" in chunk:
|
|
text = chunk['output']
|
|
if "status" in chunk:
|
|
if chunk["status"] == "succeeded":
|
|
is_finished = True
|
|
finish_reason = "stop"
|
|
elif chunk.get("error", None):
|
|
raise Exception(chunk["error"])
|
|
return {"text": text, "is_finished": is_finished, "finish_reason": finish_reason}
|
|
except:
|
|
raise ValueError(f"Unable to parse response. Original response: {chunk}")
|
|
|
|
def handle_openai_text_completion_chunk(self, chunk):
|
|
try:
|
|
print(f"chunk: {chunk}")
|
|
return chunk["choices"][0]["text"]
|
|
except:
|
|
raise ValueError(f"Unable to parse response. Original response: {chunk}")
|
|
|
|
def handle_openai_chat_completion_chunk(self, chunk):
|
|
try:
|
|
return chunk["choices"][0]["delta"]["content"]
|
|
except:
|
|
return ""
|
|
|
|
def handle_baseten_chunk(self, chunk):
|
|
try:
|
|
chunk = chunk.decode("utf-8")
|
|
if len(chunk) > 0:
|
|
if chunk.startswith("data:"):
|
|
data_json = json.loads(chunk[5:])
|
|
if "token" in data_json and "text" in data_json["token"]:
|
|
return data_json["token"]["text"]
|
|
else:
|
|
return ""
|
|
data_json = json.loads(chunk)
|
|
if "model_output" in data_json:
|
|
if isinstance(data_json["model_output"], dict) and "data" in data_json["model_output"] and isinstance(data_json["model_output"]["data"], list):
|
|
return data_json["model_output"]["data"][0]
|
|
elif isinstance(data_json["model_output"], str):
|
|
return data_json["model_output"]
|
|
elif "completion" in data_json and isinstance(data_json["completion"], str):
|
|
return data_json["completion"]
|
|
else:
|
|
raise ValueError(f"Unable to parse response. Original response: {chunk}")
|
|
else:
|
|
return ""
|
|
else:
|
|
return ""
|
|
except:
|
|
traceback.print_exc()
|
|
return ""
|
|
|
|
def handle_bedrock_stream(self, chunk):
|
|
chunk = chunk.get('chunk')
|
|
if chunk:
|
|
chunk_data = json.loads(chunk.get('bytes').decode())
|
|
text = ""
|
|
is_finished = False
|
|
finish_reason = ""
|
|
if "outputText" in chunk_data:
|
|
text = chunk_data['outputText']
|
|
if chunk_data.get("completionReason", None):
|
|
is_finished = True
|
|
finish_reason = chunk_data["completionReason"]
|
|
elif chunk.get("error", None):
|
|
raise Exception(chunk["error"])
|
|
return {"text": text, "is_finished": is_finished, "finish_reason": finish_reason}
|
|
return ""
|
|
|
|
## needs to handle the empty string case (even starting chunk can be an empty string)
|
|
def __next__(self):
|
|
model_response = ModelResponse(stream=True, model=self.model)
|
|
try:
|
|
while True: # loop until a non-empty string is found
|
|
# return this for all models
|
|
completion_obj = {"content": ""}
|
|
if self.custom_llm_provider and self.custom_llm_provider == "anthropic":
|
|
chunk = next(self.completion_stream)
|
|
response_obj = self.handle_anthropic_chunk(chunk)
|
|
completion_obj["content"] = response_obj["text"]
|
|
if response_obj["is_finished"]:
|
|
model_response.choices[0].finish_reason = response_obj["finish_reason"]
|
|
elif self.model == "replicate" or self.custom_llm_provider == "replicate":
|
|
chunk = next(self.completion_stream)
|
|
response_obj = self.handle_replicate_chunk(chunk)
|
|
completion_obj["content"] = response_obj["text"]
|
|
if response_obj["is_finished"]:
|
|
model_response.choices[0].finish_reason = response_obj["finish_reason"]
|
|
elif (
|
|
self.custom_llm_provider and self.custom_llm_provider == "together_ai"):
|
|
chunk = next(self.completion_stream)
|
|
response_obj = self.handle_together_ai_chunk(chunk)
|
|
completion_obj["content"] = response_obj["text"]
|
|
if response_obj["is_finished"]:
|
|
model_response.choices[0].finish_reason = response_obj["finish_reason"]
|
|
elif self.custom_llm_provider and self.custom_llm_provider == "huggingface":
|
|
chunk = next(self.completion_stream)
|
|
response_obj = self.handle_huggingface_chunk(chunk)
|
|
completion_obj["content"] = response_obj["text"]
|
|
if response_obj["is_finished"]:
|
|
model_response.choices[0].finish_reason = response_obj["finish_reason"]
|
|
elif self.custom_llm_provider and self.custom_llm_provider == "baseten": # baseten doesn't provide streaming
|
|
chunk = next(self.completion_stream)
|
|
completion_obj["content"] = self.handle_baseten_chunk(chunk)
|
|
elif self.custom_llm_provider and self.custom_llm_provider == "ai21": #ai21 doesn't provide streaming
|
|
chunk = next(self.completion_stream)
|
|
response_obj = self.handle_ai21_chunk(chunk)
|
|
completion_obj["content"] = response_obj["text"]
|
|
if response_obj["is_finished"]:
|
|
model_response.choices[0].finish_reason = response_obj["finish_reason"]
|
|
elif self.custom_llm_provider and self.custom_llm_provider == "vllm":
|
|
chunk = next(self.completion_stream)
|
|
completion_obj["content"] = chunk[0].outputs[0].text
|
|
elif self.custom_llm_provider and self.custom_llm_provider == "aleph_alpha": #aleph alpha doesn't provide streaming
|
|
chunk = next(self.completion_stream)
|
|
response_obj = self.handle_aleph_alpha_chunk(chunk)
|
|
completion_obj["content"] = response_obj["text"]
|
|
if response_obj["is_finished"]:
|
|
model_response.choices[0].finish_reason = response_obj["finish_reason"]
|
|
elif self.custom_llm_provider and self.custom_llm_provider == "text-completion-openai":
|
|
chunk = next(self.completion_stream)
|
|
completion_obj["content"] = self.handle_openai_text_completion_chunk(chunk)
|
|
elif self.model in litellm.nlp_cloud_models or self.custom_llm_provider == "nlp_cloud":
|
|
try:
|
|
chunk = next(self.completion_stream)
|
|
response_obj = self.handle_nlp_cloud_chunk(chunk)
|
|
completion_obj["content"] = response_obj["text"]
|
|
if response_obj["is_finished"]:
|
|
model_response.choices[0].finish_reason = response_obj["finish_reason"]
|
|
except Exception as e:
|
|
if self.sent_last_chunk:
|
|
raise e
|
|
else:
|
|
if self.sent_first_chunk is False:
|
|
raise Exception("An unknown error occurred with the stream")
|
|
model_response.choices[0].finish_reason = "stop"
|
|
self.sent_last_chunk = True
|
|
elif self.custom_llm_provider and self.custom_llm_provider == "vertex_ai":
|
|
try:
|
|
chunk = next(self.completion_stream)
|
|
completion_obj["content"] = str(chunk)
|
|
except StopIteration as e:
|
|
if self.sent_last_chunk:
|
|
raise e
|
|
else:
|
|
model_response.choices[0].finish_reason = "stop"
|
|
self.sent_last_chunk = True
|
|
elif self.custom_llm_provider == "cohere":
|
|
chunk = next(self.completion_stream)
|
|
response_obj = self.handle_cohere_chunk(chunk)
|
|
completion_obj["content"] = response_obj["text"]
|
|
if response_obj["is_finished"]:
|
|
model_response.choices[0].finish_reason = response_obj["finish_reason"]
|
|
elif self.custom_llm_provider == "bedrock":
|
|
chunk = next(self.completion_stream)
|
|
response_obj = self.handle_bedrock_stream(chunk)
|
|
completion_obj["content"] = response_obj["text"]
|
|
if response_obj["is_finished"]:
|
|
model_response.choices[0].finish_reason = response_obj["finish_reason"]
|
|
elif self.custom_llm_provider == "sagemaker":
|
|
if len(self.completion_stream)==0:
|
|
if self.sent_last_chunk:
|
|
raise StopIteration
|
|
else:
|
|
model_response.choices[0].finish_reason = "stop"
|
|
self.sent_last_chunk = True
|
|
chunk_size = 30
|
|
new_chunk = self.completion_stream[:chunk_size]
|
|
completion_obj["content"] = new_chunk
|
|
self.completion_stream = self.completion_stream[chunk_size:]
|
|
time.sleep(0.05)
|
|
elif self.custom_llm_provider == "palm":
|
|
# fake streaming
|
|
if len(self.completion_stream)==0:
|
|
if self.sent_last_chunk:
|
|
raise StopIteration
|
|
else:
|
|
model_response.choices[0].finish_reason = "stop"
|
|
self.sent_last_chunk = True
|
|
chunk_size = 30
|
|
new_chunk = self.completion_stream[:chunk_size]
|
|
completion_obj["content"] = new_chunk
|
|
self.completion_stream = self.completion_stream[chunk_size:]
|
|
time.sleep(0.05)
|
|
else: # openai chat/azure models
|
|
chunk = next(self.completion_stream)
|
|
model_response = chunk
|
|
# LOGGING
|
|
threading.Thread(target=self.logging_obj.success_handler, args=(completion_obj,)).start()
|
|
return model_response
|
|
|
|
# LOGGING
|
|
threading.Thread(target=self.logging_obj.success_handler, args=(completion_obj,)).start()
|
|
model_response.model = self.model
|
|
if len(completion_obj["content"]) > 0: # cannot set content of an OpenAI Object to be an empty string
|
|
if self.sent_first_chunk == False:
|
|
completion_obj["role"] = "assistant"
|
|
self.sent_first_chunk = True
|
|
model_response.choices[0].delta = Delta(**completion_obj)
|
|
return model_response
|
|
elif model_response.choices[0].finish_reason:
|
|
return model_response
|
|
except StopIteration:
|
|
raise StopIteration
|
|
except Exception as e:
|
|
e.message = str(e)
|
|
return exception_type(model=self.model, custom_llm_provider=self.custom_llm_provider, original_exception=e)
|
|
|
|
async def __anext__(self):
|
|
try:
|
|
return next(self)
|
|
except StopIteration:
|
|
raise StopAsyncIteration
|
|
|
|
|
|
def mock_completion_streaming_obj(model_response, mock_response, model):
|
|
for i in range(0, len(mock_response), 3):
|
|
completion_obj = {"role": "assistant", "content": mock_response[i: i+3]}
|
|
model_response.choices[0].delta = completion_obj
|
|
yield model_response
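# Illustrative usage (sketch, not in the original file) - the generator above fakes a
# stream by yielding the mock response three characters at a time; the text below is
# a placeholder:
#
#   for chunk in mock_completion_streaming_obj(model_response, "Hello there", "gpt-3.5-turbo"):
#       ...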
|
|
|
|
########## Reading Config File ############################
|
|
def read_config_args(config_path) -> dict:
|
|
try:
|
|
import os
|
|
|
|
current_path = os.getcwd()
|
|
with open(config_path, "r") as config_file:
|
|
config = json.load(config_file)
|
|
|
|
# read keys/ values from config file and return them
|
|
return config
|
|
except Exception as e:
|
|
raise e
|
|
|
|
########## experimental completion variants ############################
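# Example config shape for completion_with_config below (illustrative sketch inferred
# from the keys it reads; model names and values are placeholders):
#
#   config = {
#       "default_fallback_models": ["gpt-3.5-turbo", "claude-instant-1"],
#       "available_models": ["gpt-3.5-turbo", "gpt-3.5-turbo-16k", "claude-instant-1"],
#       "adapt_to_prompt_size": True,
#       "trim_messages": True,
#       "model": {
#           "gpt-3.5-turbo": {
#               "needs_moderation": True,
#               "error_handling": {
#                   "ContextWindowExceededError": {"fallback_model": "gpt-3.5-turbo-16k"}
#               },
#           }
#       },
#   }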
def completion_with_config(config: Union[dict, str], **kwargs):
|
|
if config is not None:
|
|
if isinstance(config, str):
|
|
config = read_config_args(config)
|
|
elif isinstance(config, dict):
|
|
config = config
|
|
else:
|
|
raise Exception("Config path must be a string or a dictionary.")
|
|
else:
|
|
raise Exception("Config path not passed in.")
|
|
|
|
if config is None:
|
|
raise Exception("No completion config in the config file")
|
|
|
|
models_with_config = config["model"].keys()
|
|
model = kwargs["model"]
|
|
messages = kwargs["messages"]
|
|
|
|
## completion config
|
|
fallback_models = config.get("default_fallback_models", None)
|
|
available_models = config.get("available_models", None)
|
|
adapt_to_prompt_size = config.get("adapt_to_prompt_size", False)
|
|
trim_messages_flag = config.get("trim_messages", False)
|
|
prompt_larger_than_model = False
|
|
max_model = model
|
|
try:
|
|
max_tokens = litellm.get_max_tokens(model)["max_tokens"]
|
|
except:
|
|
max_tokens = 2048 # assume curr model's max window is 2048 tokens
|
|
if adapt_to_prompt_size:
|
|
## Pick model based on token window
|
|
prompt_tokens = litellm.token_counter(model="gpt-3.5-turbo", text="".join(message["content"] for message in messages))
|
|
try:
|
|
curr_max_tokens = litellm.get_max_tokens(model)["max_tokens"]
|
|
except:
|
|
curr_max_tokens = 2048
|
|
if curr_max_tokens < prompt_tokens:
|
|
prompt_larger_than_model = True
|
|
for available_model in available_models:
|
|
try:
|
|
curr_max_tokens = litellm.get_max_tokens(available_model)["max_tokens"]
|
|
if curr_max_tokens > max_tokens:
|
|
max_tokens = curr_max_tokens
|
|
max_model = available_model
|
|
if curr_max_tokens > prompt_tokens:
|
|
model = available_model
|
|
prompt_larger_than_model = False
|
|
except:
|
|
continue
|
|
if prompt_larger_than_model:
|
|
messages = trim_messages(messages=messages, model=max_model)
|
|
kwargs["messages"] = messages
|
|
|
|
kwargs["model"] = model
|
|
try:
|
|
if model in models_with_config:
|
|
## Moderation check
|
|
if config["model"][model].get("needs_moderation"):
|
|
input = " ".join(message["content"] for message in messages)
|
|
response = litellm.moderation(input=input)
|
|
flagged = response["results"][0]["flagged"]
|
|
if flagged:
|
|
raise Exception("This response was flagged as inappropriate")
|
|
|
|
## Model-specific Error Handling
|
|
error_handling = None
|
|
if config["model"][model].get("error_handling"):
|
|
error_handling = config["model"][model]["error_handling"]
|
|
|
|
try:
|
|
response = litellm.completion(**kwargs)
|
|
return response
|
|
except Exception as e:
|
|
exception_name = type(e).__name__
|
|
fallback_model = None
|
|
if error_handling and exception_name in error_handling:
|
|
error_handler = error_handling[exception_name]
|
|
# either switch model or api key
|
|
fallback_model = error_handler.get("fallback_model", None)
|
|
if fallback_model:
|
|
kwargs["model"] = fallback_model
|
|
return litellm.completion(**kwargs)
|
|
raise e
|
|
else:
|
|
return litellm.completion(**kwargs)
|
|
except Exception as e:
|
|
if fallback_models:
|
|
model = fallback_models.pop(0)
|
|
return completion_with_fallbacks(model=model, messages=messages, fallbacks=fallback_models)
|
|
raise e
|
|
|
|
|
|
|
|
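# Illustrative sketch of the config shape completion_with_config reads (comment
# only, not executed). Every key shown is one the function looks up above; the
# model names and values are hypothetical.
#
#   config = {
#       "default_fallback_models": ["gpt-3.5-turbo", "claude-instant-1"],
#       "available_models": ["gpt-3.5-turbo", "gpt-3.5-turbo-16k", "claude-2"],
#       "adapt_to_prompt_size": True,           # pick a model whose window fits the prompt
#       "trim_messages": True,
#       "model": {
#           "gpt-4": {
#               "needs_moderation": True,       # run litellm.moderation() before completing
#               "error_handling": {
#                   "ContextWindowExceededError": {"fallback_model": "gpt-3.5-turbo-16k"}
#               },
#           }
#       },
#   }
#   response = completion_with_config(config=config, model="gpt-4",
#                                     messages=[{"role": "user", "content": "Hey!"}])
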
def get_model_split_test(models, completion_call_id):
    global last_fetched_at
    try:
        # make the api call
        last_fetched_at = time.time()
        response = requests.post(
            #http://api.litellm.ai
            url="http://api.litellm.ai/get_model_split_test", # get the updated dict from table or update the table with the dict
            headers={"content-type": "application/json"},
            data=json.dumps({"completion_call_id": completion_call_id, "models": models}),
        )
        print_verbose(f"get_model_split_test response: {response.text}")
        data = response.json()
        # update model list
        split_test_models = data["split_test_models"]
        model_configs = data.get("model_configs", {})
        # update environment - if required
        threading.Thread(target=get_all_keys, args=()).start()
        return split_test_models, model_configs
    except:
        print_verbose(
            f"[Non-Blocking Error] get_model_split_test error - {traceback.format_exc()}"
        )

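# Illustrative sketch (comment only): the request body this helper POSTs and the
# response shape it parses, inferred from the code above. The weights format
# matches what completion_with_split_tests passes in; the exact server response
# is an assumption.
#
#   # request:  {"completion_call_id": "1234", "models": {"gpt-4": 0.7, "claude-instant-1": 0.3}}
#   # response: {"split_test_models": {"gpt-4": 0.5, "claude-instant-1": 0.5},
#   #            "model_configs": {"gpt-4": {"temperature": "0.2"}}}
#
#   split_test_models, model_configs = get_model_split_test(
#       models={"gpt-4": 0.7, "claude-instant-1": 0.3}, completion_call_id="1234"
#   )
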
def completion_with_split_tests(models={}, messages=[], use_client=False, override_client=False, **kwargs):
    """
    Example Usage:

    models = {
        "gpt-4": 0.7,
        "huggingface/wizard-coder": 0.3
    }
    messages = [{ "content": "Hello, how are you?","role": "user"}]
    completion_with_split_tests(models=models, messages=messages)
    """
    import random
    model_configs = {}
    if use_client and not override_client:
        if "id" not in kwargs or kwargs["id"] is None:
            kwargs["id"] = str(uuid.uuid4())
            #raise ValueError("Please tag this completion call, if you'd like to update it's split test values through the UI. - eg. `completion_with_split_tests(.., id=1234)`.")
        # get the most recent model split list from server
        models, model_configs = get_model_split_test(models=models, completion_call_id=kwargs["id"])

    try:
        selected_llm = random.choices(list(models.keys()), weights=list(models.values()))[0]
    except:
        traceback.print_exc()
        raise ValueError("""models does not follow the required format - {'model_name': 'split_percentage'}, e.g. {'gpt-4': 0.7, 'huggingface/wizard-coder': 0.3}""")

    # use dynamic model configs if users set
    if model_configs != {}:
        selected_model_configs = model_configs.get(selected_llm, {})
        if "prompt" in selected_model_configs: # special case, add this to messages as system prompt
            messages.append({"role": "system", "content": selected_model_configs["prompt"]})
            selected_model_configs.pop("prompt")
        for param_name in selected_model_configs:
            if param_name == "temperature":
                kwargs[param_name] = float(selected_model_configs[param_name])
            elif param_name == "max_tokens":
                kwargs[param_name] = int(selected_model_configs[param_name])
            else:
                kwargs[param_name] = selected_model_configs[param_name]

    return litellm.completion(model=selected_llm, messages=messages, use_client=use_client, **kwargs)

def completion_with_fallbacks(**kwargs):
    response = None
    rate_limited_models = set()
    model_expiration_times = {}
    start_time = time.time()
    original_model = kwargs["model"]
    fallbacks = [kwargs["model"]] + kwargs["fallbacks"]
    del kwargs["fallbacks"] # remove fallbacks so it's not recursive

    while response == None and time.time() - start_time < 45:
        for model in fallbacks:
            # loop thru all models
            try:
                # check if it's dict or new model string
                if isinstance(model, dict): # completion(model="gpt-4", fallbacks=[{"api_key": "", "api_base": ""}, {"api_key": "", "api_base": ""}])
                    kwargs["api_key"] = model.get("api_key", None)
                    kwargs["api_base"] = model.get("api_base", None)
                    model = original_model
                    print(f"switched api keys")
                elif (
                    model in rate_limited_models
                ): # check if model is currently cooling down
                    if (
                        model_expiration_times.get(model)
                        and time.time() >= model_expiration_times[model]
                    ):
                        rate_limited_models.remove(
                            model
                        ) # check if it's been 60s of cool down and remove model
                    else:
                        continue # skip model

                # delete model from kwargs if it exists
                if kwargs.get("model"):
                    del kwargs["model"]

                response = litellm.completion(**kwargs, model=model)

                if response != None:
                    return response

            except Exception as e:
                print(e)
                rate_limited_models.add(model)
                model_expiration_times[model] = (
                    time.time() + 60
                ) # cool down this selected model
                # print(f"rate_limited_models {rate_limited_models}")
                pass
    return response

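# Illustrative sketch (comment only, not executed): fallbacks can be other model
# names, or dicts carrying a different api_key/api_base for the original model,
# as the isinstance(model, dict) branch above shows. The keys, base URL, and
# model names below are hypothetical.
#
#   response = completion_with_fallbacks(
#       model="gpt-4",
#       messages=[{"role": "user", "content": "Hey, how's it going?"}],
#       fallbacks=["gpt-3.5-turbo", {"api_key": "sk-backup-...", "api_base": "https://my-proxy.example.com"}],
#   )
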
def process_system_message(system_message, max_tokens, model):
    system_message_event = {"role": "system", "content": system_message}
    system_message_tokens = get_token_count(system_message_event, model)

    if system_message_tokens > max_tokens:
        print_verbose("`tokentrimmer`: Warning, system message exceeds token limit. Trimming...")
        # shorten system message to fit within max_tokens
        new_system_message = shorten_message_to_fit_limit(system_message_event, max_tokens, model)
        system_message_tokens = get_token_count(new_system_message, model)

    return system_message_event, max_tokens - system_message_tokens

def process_messages(messages, max_tokens, model):
    # Process messages from most recent to oldest, so the newest messages are kept
    messages = messages[::-1]
    final_messages = []

    for message in messages:
        final_messages = attempt_message_addition(final_messages, message, max_tokens, model)

    return final_messages

def attempt_message_addition(final_messages, message, max_tokens, model):
    temp_messages = [message] + final_messages
    temp_message_tokens = get_token_count(messages=temp_messages, model=model)

    if temp_message_tokens <= max_tokens:
        return temp_messages

    # if temp_message_tokens > max_tokens, try shortening temp_messages
    elif "function_call" not in message:
        # fit updated_message to be within temp_message_tokens - max_tokens (aka the amount temp_message_tokens is greater than max_tokens)
        updated_message = shorten_message_to_fit_limit(message, temp_message_tokens - max_tokens, model)
        if can_add_message(updated_message, final_messages, max_tokens, model):
            return [updated_message] + final_messages

    return final_messages

def can_add_message(message, messages, max_tokens, model):
    if get_token_count(messages + [message], model) <= max_tokens:
        return True
    return False

def get_token_count(messages, model):
    return token_counter(model=model, messages=messages)

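# Illustrative sketch (comment only) of how the helpers above fit together:
# process_system_message returns the (possibly shortened) system message plus the
# token budget left over, and process_messages fills that budget newest-first.
# chat_history is a placeholder for an existing list of message dicts; the
# numbers are hypothetical.
#
#   system_event, remaining_budget = process_system_message(
#       system_message="You are a terse assistant.", max_tokens=1000, model="gpt-3.5-turbo"
#   )
#   kept = process_messages(messages=chat_history, max_tokens=remaining_budget, model="gpt-3.5-turbo")
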
def shorten_message_to_fit_limit(
    message,
    tokens_needed,
    model):
    """
    Shorten a message to fit within a token limit by removing characters from the middle.
    """
    content = message["content"]

    while True:
        total_tokens = get_token_count([message], model)

        if total_tokens <= tokens_needed:
            break

        ratio = (tokens_needed) / total_tokens

        new_length = int(len(content) * ratio)
        print_verbose(new_length)

        half_length = new_length // 2
        left_half = content[:half_length]
        right_half = content[-half_length:]

        trimmed_content = left_half + '..' + right_half
        message["content"] = trimmed_content
        content = trimmed_content

    return message

# LiteLLM token trimmer
# this code is borrowed from https://github.com/KillianLucas/tokentrim/blob/main/tokentrim/tokentrim.py
# Credits for this code go to Killian Lucas
def trim_messages(
    messages,
    model: Optional[str] = None,
    trim_ratio: float = 0.75,
    return_response_tokens: bool = False,
    max_tokens = None
    ):
    """
    Trim a list of messages to fit within a model's token limit.

    Args:
        messages: Input messages to be trimmed. Each message is a dictionary with 'role' and 'content'.
        model: The LiteLLM model being used (determines the token limit).
        trim_ratio: Target ratio of tokens to use after trimming. Default is 0.75, meaning it will trim messages so they use about 75% of the model's token limit.
        return_response_tokens: If True, also return the number of tokens left available for the response after trimming.
        max_tokens: Instead of specifying a model or trim_ratio, you can specify this directly.

    Returns:
        Trimmed messages and optionally the number of tokens available for response.
    """
    # Initialize max_tokens
    # if users pass in max tokens, trim to this amount
    try:
        if max_tokens == None:
            # Check if model is valid
            if model in litellm.model_cost:
                max_tokens_for_model = litellm.model_cost[model]['max_tokens']
                max_tokens = int(max_tokens_for_model * trim_ratio)
            else:
                # if user did not specify max tokens
                # or passed an llm litellm does not know
                # do nothing, just return the messages unchanged
                return messages

        system_message = ""
        for message in messages:
            if message["role"] == "system":
                system_message += message["content"]

        current_tokens = token_counter(model=model, messages=messages)

        # Do nothing if the current token count is under max_tokens
        if current_tokens < max_tokens:
            return messages

        #### Trimming messages if current_tokens > max_tokens
        print_verbose(f"Need to trim input messages: {messages}, current_tokens: {current_tokens}, max_tokens: {max_tokens}")
        if system_message:
            system_message_event, max_tokens = process_system_message(system_message=system_message, max_tokens=max_tokens, model=model)
            messages = messages + [system_message_event]

        final_messages = process_messages(messages=messages, max_tokens=max_tokens, model=model)

        if return_response_tokens: # if user wants token count with new trimmed messages
            response_tokens = max_tokens - get_token_count(final_messages, model)
            return final_messages, response_tokens

        return final_messages
    except: # [NON-Blocking] if an error occurs, just return the original messages
        return messages

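# Illustrative sketch (comment only, not executed): typical trim_messages usage.
# The oversized message content is hypothetical; with return_response_tokens=True
# the helper also reports how much of the window is left for the completion.
#
#   messages = [
#       {"role": "system", "content": "You are a helpful assistant."},
#       {"role": "user", "content": "..." * 5000},   # something far larger than the model's window
#   ]
#   trimmed = trim_messages(messages, model="gpt-3.5-turbo")
#   trimmed, response_tokens = trim_messages(messages, model="gpt-3.5-turbo", return_response_tokens=True)
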
# this helper reads the .env and returns a list of supported llms for user
def get_valid_models():
    try:
        # get keys set in .env
        environ_keys = os.environ.keys()
        valid_providers = []
        # for all valid providers, make a list of supported llms
        valid_models = []

        for provider in litellm.provider_list:
            # edge case: litellm has together_ai as a provider, it should be togetherai
            provider = provider.replace("_", "")

            # litellm standardizes expected provider keys to
            # PROVIDER_API_KEY. Example: OPENAI_API_KEY, COHERE_API_KEY
            expected_provider_key = f"{provider.upper()}_API_KEY"
            if expected_provider_key in environ_keys:
                # key is set
                valid_providers.append(provider)

        for provider in valid_providers:
            if provider == "azure":
                valid_models.append("Azure-LLM")
            else:
                models_for_provider = litellm.models_by_provider.get(provider, [])
                valid_models.extend(models_for_provider)
        return valid_models
    except:
        return [] # NON-Blocking
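
# Illustrative sketch (comment only): get_valid_models only reports providers
# whose PROVIDER_API_KEY is present in the environment. The key value below is
# a placeholder.
#
#   os.environ["OPENAI_API_KEY"] = "sk-..."
#   print(get_valid_models())   # e.g. ["gpt-3.5-turbo", "gpt-4", ...]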