add linting

ishaan-jaff 2023-08-18 11:05:05 -07:00
parent 8ef47524bf
commit 15b1da9dc8
40 changed files with 3110 additions and 1709 deletions


@@ -1,8 +1,9 @@
import threading
success_callback = []
failure_callback = []
set_verbose = False
telemetry = True
max_tokens = 256  # OpenAI Defaults
retry = True
api_key = None
@@ -19,33 +20,99 @@ caching = False
hugging_api_token = None
togetherai_api_key = None
model_cost = {
    "gpt-3.5-turbo": {
        "max_tokens": 4000,
        "input_cost_per_token": 0.0000015,
        "output_cost_per_token": 0.000002,
    },
    "gpt-35-turbo": {
        "max_tokens": 4000,
        "input_cost_per_token": 0.0000015,
        "output_cost_per_token": 0.000002,
    },  # azure model name
    "gpt-3.5-turbo-0613": {
        "max_tokens": 4000,
        "input_cost_per_token": 0.0000015,
        "output_cost_per_token": 0.000002,
    },
    "gpt-3.5-turbo-0301": {
        "max_tokens": 4000,
        "input_cost_per_token": 0.0000015,
        "output_cost_per_token": 0.000002,
    },
    "gpt-3.5-turbo-16k": {
        "max_tokens": 16000,
        "input_cost_per_token": 0.000003,
        "output_cost_per_token": 0.000004,
    },
    "gpt-35-turbo-16k": {
        "max_tokens": 16000,
        "input_cost_per_token": 0.000003,
        "output_cost_per_token": 0.000004,
    },  # azure model name
    "gpt-3.5-turbo-16k-0613": {
        "max_tokens": 16000,
        "input_cost_per_token": 0.000003,
        "output_cost_per_token": 0.000004,
    },
    "gpt-4": {
        "max_tokens": 8000,
        "input_cost_per_token": 0.000003,
        "output_cost_per_token": 0.00006,
    },
    "gpt-4-0613": {
        "max_tokens": 8000,
        "input_cost_per_token": 0.000003,
        "output_cost_per_token": 0.00006,
    },
    "gpt-4-32k": {
        "max_tokens": 8000,
        "input_cost_per_token": 0.00006,
        "output_cost_per_token": 0.00012,
    },
    "claude-instant-1": {
        "max_tokens": 100000,
        "input_cost_per_token": 0.00000163,
        "output_cost_per_token": 0.00000551,
    },
    "claude-2": {
        "max_tokens": 100000,
        "input_cost_per_token": 0.00001102,
        "output_cost_per_token": 0.00003268,
    },
    "text-bison-001": {
        "max_tokens": 8192,
        "input_cost_per_token": 0.000004,
        "output_cost_per_token": 0.000004,
    },
    "chat-bison-001": {
        "max_tokens": 4096,
        "input_cost_per_token": 0.000002,
        "output_cost_per_token": 0.000002,
    },
    "command-nightly": {
        "max_tokens": 4096,
        "input_cost_per_token": 0.000015,
        "output_cost_per_token": 0.000015,
    },
}
####### THREAD-SPECIFIC DATA ###################
class MyLocal(threading.local):
    def __init__(self):
        self.user = "Hello World"
_thread_context = MyLocal()
def identify(event_details):
    # Store user in thread local data
    if "user" in event_details:
        _thread_context.user = event_details["user"]
####### ADDITIONAL PARAMS ################### configurable params if you use proxy models like Helicone, map spend to org id, etc.
api_base = None
headers = None
@@ -66,50 +133,38 @@ open_ai_chat_completion_models = [
    "gpt-3.5-turbo-0613",
    "gpt-3.5-turbo-16k-0613",
]
open_ai_text_completion_models = ["text-davinci-003"]
cohere_models = [
    "command-nightly",
    "command",
    "command-light",
    "command-medium-beta",
    "command-xlarge-beta",
]
anthropic_models = ["claude-2", "claude-instant-1", "claude-instant-1.2"]
replicate_models = [
    "replicate/"
]  # placeholder, to make sure we accept any replicate model in our model_list
openrouter_models = [
    "google/palm-2-codechat-bison",
    "google/palm-2-chat-bison",
    "openai/gpt-3.5-turbo",
    "openai/gpt-3.5-turbo-16k",
    "openai/gpt-4-32k",
    "anthropic/claude-2",
    "anthropic/claude-instant-v1",
    "meta-llama/llama-2-13b-chat",
    "meta-llama/llama-2-70b-chat",
]
vertex_chat_models = ["chat-bison", "chat-bison@001"]
vertex_text_models = ["text-bison", "text-bison@001"]
huggingface_models = [
    "meta-llama/Llama-2-7b-hf",
@@ -126,23 +181,54 @@ huggingface_models = [
    "meta-llama/Llama-2-70b-chat",
]  # these have been tested on extensively. But by default all text2text-generation and text-generation models are supported by liteLLM. - https://docs.litellm.ai/docs/completion/supported
ai21_models = ["j2-ultra", "j2-mid", "j2-light"]
model_list = (
    open_ai_chat_completion_models
    + open_ai_text_completion_models
    + cohere_models
    + anthropic_models
    + replicate_models
    + openrouter_models
    + huggingface_models
    + vertex_chat_models
    + vertex_text_models
    + ai21_models
)
provider_list = [
    "openai",
    "cohere",
    "anthropic",
    "replicate",
    "huggingface",
    "together_ai",
    "openrouter",
    "vertex_ai",
    "ai21",
]
####### EMBEDDING MODELS ###################
open_ai_embedding_models = ["text-embedding-ada-002"]
from .timeout import timeout
from .testing import *
from .utils import (
    client,
    logging,
    exception_type,
    get_optional_params,
    modify_integration,
    token_counter,
    cost_per_token,
    completion_cost,
    get_litellm_params,
)
from .main import *  # Import all the symbols from main.py
from .integrations import *
from openai.error import (
    AuthenticationError,
    InvalidRequestError,
    RateLimitError,
    ServiceUnavailableError,
    OpenAIError,
)


@@ -1,12 +1,21 @@
## LiteLLM versions of the OpenAI Exception Types
from openai.error import (
    AuthenticationError,
    InvalidRequestError,
    RateLimitError,
    ServiceUnavailableError,
    OpenAIError,
)
class AuthenticationError(AuthenticationError):
    def __init__(self, message, llm_provider):
        self.status_code = 401
        self.message = message
        self.llm_provider = llm_provider
        super().__init__(
            self.message
        )  # Call the base class constructor with the parameters it needs
class InvalidRequestError(InvalidRequestError):
@@ -15,7 +24,9 @@ class InvalidRequestError(InvalidRequestError):
        self.message = message
        self.model = model
        self.llm_provider = llm_provider
        super().__init__(
            self.message, f"{self.model}"
        )  # Call the base class constructor with the parameters it needs
class RateLimitError(RateLimitError):
@@ -23,21 +34,29 @@ class RateLimitError(RateLimitError):
        self.status_code = 429
        self.message = message
        self.llm_provider = llm_provider
        super().__init__(
            self.message
        )  # Call the base class constructor with the parameters it needs
class ServiceUnavailableError(ServiceUnavailableError):
    def __init__(self, message, llm_provider):
        self.status_code = 500
        self.message = message
        self.llm_provider = llm_provider
        super().__init__(
            self.message
        )  # Call the base class constructor with the parameters it needs
class OpenAIError(OpenAIError):
    def __init__(self, original_exception):
        self.status_code = original_exception.http_status
        super().__init__(
            http_body=original_exception.http_body,
            http_status=original_exception.http_status,
            json_body=original_exception.json_body,
            headers=original_exception.headers,
            code=original_exception.code,
        )
        self.llm_provider = "openai"
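A brief usage sketch (not part of the commit): because each wrapper subclasses the corresponding openai.error type, callers can catch either the LiteLLM class or the original OpenAI one. The import path and the stand-in flaky_call function below are assumptions for illustration; adjust them to wherever these classes live in your checkout.

from litellm.exceptions import AuthenticationError, RateLimitError

def flaky_call():
    # Stand-in for a provider call; here we raise the wrapped error directly.
    raise AuthenticationError(message="bad key", llm_provider="anthropic")

try:
    flaky_call()
except AuthenticationError as e:
    # status_code (401) and llm_provider are set by the subclass constructor above
    print(f"auth failed for {e.llm_provider}: {e.message} (status {e.status_code})")
except RateLimitError as e:
    print(f"rate limited by {e.llm_provider}; back off and retry")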


@@ -2,28 +2,90 @@
# On success + failure, log events to aispend.io
import dotenv, os
import requests
dotenv.load_dotenv()  # Loading env variables using dotenv
import traceback
import datetime
model_cost = {
    "gpt-3.5-turbo": {
        "max_tokens": 4000,
        "input_cost_per_token": 0.0000015,
        "output_cost_per_token": 0.000002,
    },
    "gpt-35-turbo": {
        "max_tokens": 4000,
        "input_cost_per_token": 0.0000015,
        "output_cost_per_token": 0.000002,
    },  # azure model name
    "gpt-3.5-turbo-0613": {
        "max_tokens": 4000,
        "input_cost_per_token": 0.0000015,
        "output_cost_per_token": 0.000002,
    },
    "gpt-3.5-turbo-0301": {
        "max_tokens": 4000,
        "input_cost_per_token": 0.0000015,
        "output_cost_per_token": 0.000002,
    },
    "gpt-3.5-turbo-16k": {
        "max_tokens": 16000,
        "input_cost_per_token": 0.000003,
        "output_cost_per_token": 0.000004,
    },
    "gpt-35-turbo-16k": {
        "max_tokens": 16000,
        "input_cost_per_token": 0.000003,
        "output_cost_per_token": 0.000004,
    },  # azure model name
    "gpt-3.5-turbo-16k-0613": {
        "max_tokens": 16000,
        "input_cost_per_token": 0.000003,
        "output_cost_per_token": 0.000004,
    },
    "gpt-4": {
        "max_tokens": 8000,
        "input_cost_per_token": 0.000003,
        "output_cost_per_token": 0.00006,
    },
    "gpt-4-0613": {
        "max_tokens": 8000,
        "input_cost_per_token": 0.000003,
        "output_cost_per_token": 0.00006,
    },
    "gpt-4-32k": {
        "max_tokens": 8000,
        "input_cost_per_token": 0.00006,
        "output_cost_per_token": 0.00012,
    },
    "claude-instant-1": {
        "max_tokens": 100000,
        "input_cost_per_token": 0.00000163,
        "output_cost_per_token": 0.00000551,
    },
    "claude-2": {
        "max_tokens": 100000,
        "input_cost_per_token": 0.00001102,
        "output_cost_per_token": 0.00003268,
    },
    "text-bison-001": {
        "max_tokens": 8192,
        "input_cost_per_token": 0.000004,
        "output_cost_per_token": 0.000004,
    },
    "chat-bison-001": {
        "max_tokens": 4096,
        "input_cost_per_token": 0.000002,
        "output_cost_per_token": 0.000002,
    },
    "command-nightly": {
        "max_tokens": 4096,
        "input_cost_per_token": 0.000015,
        "output_cost_per_token": 0.000015,
    },
}
class AISpendLogger:
    # Class variables or attributes
    def __init__(self):
@@ -37,8 +99,14 @@ class AISpendLogger:
        prompt_tokens_cost_usd_dollar = 0
        completion_tokens_cost_usd_dollar = 0
        if model in model_cost:
            prompt_tokens_cost_usd_dollar = (
                model_cost[model]["input_cost_per_token"]
                * response_obj["usage"]["prompt_tokens"]
            )
            completion_tokens_cost_usd_dollar = (
                model_cost[model]["output_cost_per_token"]
                * response_obj["usage"]["completion_tokens"]
            )
        elif "replicate" in model:
            # replicate models are charged based on time
            # llama 2 runs on an nvidia a100 which costs $0.0032 per second - https://replicate.com/replicate/llama-2-70b-chat
@@ -55,27 +123,41 @@ class AISpendLogger:
                output_cost_sum += model_cost[model]["output_cost_per_token"]
            avg_input_cost = input_cost_sum / len(model_cost.keys())
            avg_output_cost = output_cost_sum / len(model_cost.keys())
            prompt_tokens_cost_usd_dollar = (
                model_cost[model]["input_cost_per_token"]
                * response_obj["usage"]["prompt_tokens"]
            )
            completion_tokens_cost_usd_dollar = (
                model_cost[model]["output_cost_per_token"]
                * response_obj["usage"]["completion_tokens"]
            )
        return prompt_tokens_cost_usd_dollar, completion_tokens_cost_usd_dollar
    def log_event(self, model, response_obj, start_time, end_time, print_verbose):
        # Method definition
        try:
            print_verbose(
                f"AISpend Logging - Enters logging function for model {model}"
            )
            url = f"https://aispend.io/api/v1/accounts/{self.account_id}/data"
            headers = {
                "Authorization": f"Bearer {self.api_key}",
                "Content-Type": "application/json",
            }
            response_timestamp = datetime.datetime.fromtimestamp(
                int(response_obj["created"])
            ).strftime("%Y-%m-%d")
            (
                prompt_tokens_cost_usd_dollar,
                completion_tokens_cost_usd_dollar,
            ) = self.price_calculator(model, response_obj, start_time, end_time)
            prompt_tokens_cost_usd_cent = prompt_tokens_cost_usd_dollar * 100
            completion_tokens_cost_usd_cent = completion_tokens_cost_usd_dollar * 100
            data = [
                {
                    "requests": 1,
                    "requests_context": 1,
                    "context_tokens": response_obj["usage"]["prompt_tokens"],
@@ -84,8 +166,9 @@ class AISpendLogger:
                    "recorded_date": response_timestamp,
                    "model_id": response_obj["model"],
                    "generated_tokens_cost_usd_cent": prompt_tokens_cost_usd_cent,
                    "context_tokens_cost_usd_cent": completion_tokens_cost_usd_cent,
                }
            ]
            print_verbose(f"AISpend Logging - final data object: {data}")
        except:


@@ -2,28 +2,90 @@
# On success + failure, log events to aispend.io
import dotenv, os
import requests
dotenv.load_dotenv()  # Loading env variables using dotenv
import traceback
import datetime
model_cost = {
    "gpt-3.5-turbo": {
        "max_tokens": 4000,
        "input_cost_per_token": 0.0000015,
        "output_cost_per_token": 0.000002,
    },
    "gpt-35-turbo": {
        "max_tokens": 4000,
        "input_cost_per_token": 0.0000015,
        "output_cost_per_token": 0.000002,
    },  # azure model name
    "gpt-3.5-turbo-0613": {
        "max_tokens": 4000,
        "input_cost_per_token": 0.0000015,
        "output_cost_per_token": 0.000002,
    },
    "gpt-3.5-turbo-0301": {
        "max_tokens": 4000,
        "input_cost_per_token": 0.0000015,
        "output_cost_per_token": 0.000002,
    },
    "gpt-3.5-turbo-16k": {
        "max_tokens": 16000,
        "input_cost_per_token": 0.000003,
        "output_cost_per_token": 0.000004,
    },
    "gpt-35-turbo-16k": {
        "max_tokens": 16000,
        "input_cost_per_token": 0.000003,
        "output_cost_per_token": 0.000004,
    },  # azure model name
    "gpt-3.5-turbo-16k-0613": {
        "max_tokens": 16000,
        "input_cost_per_token": 0.000003,
        "output_cost_per_token": 0.000004,
    },
    "gpt-4": {
        "max_tokens": 8000,
        "input_cost_per_token": 0.000003,
        "output_cost_per_token": 0.00006,
    },
    "gpt-4-0613": {
        "max_tokens": 8000,
        "input_cost_per_token": 0.000003,
        "output_cost_per_token": 0.00006,
    },
    "gpt-4-32k": {
        "max_tokens": 8000,
        "input_cost_per_token": 0.00006,
        "output_cost_per_token": 0.00012,
    },
    "claude-instant-1": {
        "max_tokens": 100000,
        "input_cost_per_token": 0.00000163,
        "output_cost_per_token": 0.00000551,
    },
    "claude-2": {
        "max_tokens": 100000,
        "input_cost_per_token": 0.00001102,
        "output_cost_per_token": 0.00003268,
    },
    "text-bison-001": {
        "max_tokens": 8192,
        "input_cost_per_token": 0.000004,
        "output_cost_per_token": 0.000004,
    },
    "chat-bison-001": {
        "max_tokens": 4096,
        "input_cost_per_token": 0.000002,
        "output_cost_per_token": 0.000002,
    },
    "command-nightly": {
        "max_tokens": 4096,
        "input_cost_per_token": 0.000015,
        "output_cost_per_token": 0.000015,
    },
}
class BerriSpendLogger:
    # Class variables or attributes
    def __init__(self):
@@ -36,8 +98,14 @@ class BerriSpendLogger:
        prompt_tokens_cost_usd_dollar = 0
        completion_tokens_cost_usd_dollar = 0
        if model in model_cost:
            prompt_tokens_cost_usd_dollar = (
                model_cost[model]["input_cost_per_token"]
                * response_obj["usage"]["prompt_tokens"]
            )
            completion_tokens_cost_usd_dollar = (
                model_cost[model]["output_cost_per_token"]
                * response_obj["usage"]["completion_tokens"]
            )
        elif "replicate" in model:
            # replicate models are charged based on time
            # llama 2 runs on an nvidia a100 which costs $0.0032 per second - https://replicate.com/replicate/llama-2-70b-chat
@@ -54,42 +122,59 @@ class BerriSpendLogger:
                output_cost_sum += model_cost[model]["output_cost_per_token"]
            avg_input_cost = input_cost_sum / len(model_cost.keys())
            avg_output_cost = output_cost_sum / len(model_cost.keys())
            prompt_tokens_cost_usd_dollar = (
                model_cost[model]["input_cost_per_token"]
                * response_obj["usage"]["prompt_tokens"]
            )
            completion_tokens_cost_usd_dollar = (
                model_cost[model]["output_cost_per_token"]
                * response_obj["usage"]["completion_tokens"]
            )
        return prompt_tokens_cost_usd_dollar, completion_tokens_cost_usd_dollar
    def log_event(
        self, model, messages, response_obj, start_time, end_time, print_verbose
    ):
        # Method definition
        try:
            print_verbose(
                f"BerriSpend Logging - Enters logging function for model {model}"
            )
            url = f"https://berrispend.berri.ai/spend"
            headers = {"Content-Type": "application/json"}
            (
                prompt_tokens_cost_usd_dollar,
                completion_tokens_cost_usd_dollar,
            ) = self.price_calculator(model, response_obj, start_time, end_time)
            total_cost = (
                prompt_tokens_cost_usd_dollar + completion_tokens_cost_usd_dollar
            )
            response_time = (end_time - start_time).total_seconds()
            if "response" in response_obj:
                data = [
                    {
                        "response_time": response_time,
                        "model_id": response_obj["model"],
                        "total_cost": total_cost,
                        "messages": messages,
                        "response": response_obj["choices"][0]["message"]["content"],
                        "account_id": self.account_id,
                    }
                ]
            elif "error" in response_obj:
                data = [
                    {
                        "response_time": response_time,
                        "model_id": response_obj["model"],
                        "total_cost": total_cost,
                        "messages": messages,
                        "error": response_obj["error"],
                        "account_id": self.account_id,
                    }
                ]
            print_verbose(f"BerriSpend Logging - final data object: {data}")
            response = requests.post(url, headers=headers, json=data)


@@ -2,18 +2,23 @@
# On success, logs events to Helicone
import dotenv, os
import requests
dotenv.load_dotenv()  # Loading env variables using dotenv
import traceback
class HeliconeLogger:
    # Class variables or attributes
    helicone_model_list = ["gpt", "claude"]
    def __init__(self):
        # Instance variables
        self.provider_url = "https://api.openai.com/v1"
        self.key = os.getenv("HELICONE_API_KEY")
    def claude_mapping(self, model, messages, response_obj):
        from anthropic import HUMAN_PROMPT, AI_PROMPT
        prompt = f"{HUMAN_PROMPT}"
        for message in messages:
            if "role" in message:
@@ -26,46 +31,82 @@ class HeliconeLogger:
        prompt += f"{AI_PROMPT}"
        claude_provider_request = {"model": model, "prompt": prompt}
        claude_response_obj = {
            "completion": response_obj["choices"][0]["message"]["content"],
            "model": model,
            "stop_reason": "stop_sequence",
        }
        return claude_provider_request, claude_response_obj
    def log_success(
        self, model, messages, response_obj, start_time, end_time, print_verbose
    ):
        # Method definition
        try:
            print_verbose(
                f"Helicone Logging - Enters logging function for model {model}"
            )
            model = (
                model
                if any(
                    accepted_model in model
                    for accepted_model in self.helicone_model_list
                )
                else "gpt-3.5-turbo"
            )
            provider_request = {"model": model, "messages": messages}
            if "claude" in model:
                provider_request, response_obj = self.claude_mapping(
                    model=model, messages=messages, response_obj=response_obj
                )
            providerResponse = {
                "json": response_obj,
                "headers": {"openai-version": "2020-10-01"},
                "status": 200,
            }
            # Code to be executed
            url = "https://api.hconeai.com/oai/v1/log"
            headers = {
                "Authorization": f"Bearer {self.key}",
                "Content-Type": "application/json",
            }
            start_time_seconds = int(start_time.timestamp())
            start_time_milliseconds = int(
                (start_time.timestamp() - start_time_seconds) * 1000
            )
            end_time_seconds = int(end_time.timestamp())
            end_time_milliseconds = int(
                (end_time.timestamp() - end_time_seconds) * 1000
            )
            data = {
                "providerRequest": {
                    "url": self.provider_url,
                    "json": provider_request,
                    "meta": {"Helicone-Auth": f"Bearer {self.key}"},
                },
                "providerResponse": providerResponse,
                "timing": {
                    "startTime": {
                        "seconds": start_time_seconds,
                        "milliseconds": start_time_milliseconds,
                    },
                    "endTime": {
                        "seconds": end_time_seconds,
                        "milliseconds": end_time_milliseconds,
                    },
                },  # {"seconds": .., "milliseconds": ..}
            }
            response = requests.post(url, headers=headers, json=data)
            if response.status_code == 200:
                print_verbose("Helicone Logging - Success!")
            else:
                print_verbose(
                    f"Helicone Logging - Error Request was not successful. Status Code: {response.status_code}"
                )
                print_verbose(f"Helicone Logging - Error {response.text}")
        except:
            # traceback.print_exc()


@@ -3,31 +3,94 @@
import dotenv, os
import requests
dotenv.load_dotenv()  # Loading env variables using dotenv
import traceback
import datetime, subprocess, sys
model_cost = {
    "gpt-3.5-turbo": {
        "max_tokens": 4000,
        "input_cost_per_token": 0.0000015,
        "output_cost_per_token": 0.000002,
    },
    "gpt-35-turbo": {
        "max_tokens": 4000,
        "input_cost_per_token": 0.0000015,
        "output_cost_per_token": 0.000002,
    },  # azure model name
    "gpt-3.5-turbo-0613": {
        "max_tokens": 4000,
        "input_cost_per_token": 0.0000015,
        "output_cost_per_token": 0.000002,
    },
    "gpt-3.5-turbo-0301": {
        "max_tokens": 4000,
        "input_cost_per_token": 0.0000015,
        "output_cost_per_token": 0.000002,
    },
    "gpt-3.5-turbo-16k": {
        "max_tokens": 16000,
        "input_cost_per_token": 0.000003,
        "output_cost_per_token": 0.000004,
    },
    "gpt-35-turbo-16k": {
        "max_tokens": 16000,
        "input_cost_per_token": 0.000003,
        "output_cost_per_token": 0.000004,
    },  # azure model name
    "gpt-3.5-turbo-16k-0613": {
        "max_tokens": 16000,
        "input_cost_per_token": 0.000003,
        "output_cost_per_token": 0.000004,
    },
    "gpt-4": {
        "max_tokens": 8000,
        "input_cost_per_token": 0.000003,
        "output_cost_per_token": 0.00006,
    },
    "gpt-4-0613": {
        "max_tokens": 8000,
        "input_cost_per_token": 0.000003,
        "output_cost_per_token": 0.00006,
    },
    "gpt-4-32k": {
        "max_tokens": 8000,
        "input_cost_per_token": 0.00006,
        "output_cost_per_token": 0.00012,
    },
    "claude-instant-1": {
        "max_tokens": 100000,
        "input_cost_per_token": 0.00000163,
        "output_cost_per_token": 0.00000551,
    },
    "claude-2": {
        "max_tokens": 100000,
        "input_cost_per_token": 0.00001102,
        "output_cost_per_token": 0.00003268,
    },
    "text-bison-001": {
        "max_tokens": 8192,
        "input_cost_per_token": 0.000004,
        "output_cost_per_token": 0.000004,
    },
    "chat-bison-001": {
        "max_tokens": 4096,
        "input_cost_per_token": 0.000002,
        "output_cost_per_token": 0.000002,
    },
    "command-nightly": {
        "max_tokens": 4096,
        "input_cost_per_token": 0.000015,
        "output_cost_per_token": 0.000015,
    },
}
class Supabase:
    # Class variables or attributes
    supabase_table_name = "request_logs"
    def __init__(self):
        # Instance variables
        self.supabase_url = os.getenv("SUPABASE_URL")
@@ -35,9 +98,11 @@ class Supabase:
        try:
            import supabase
        except ImportError:
            subprocess.check_call([sys.executable, "-m", "pip", "install", "supabase"])
            import supabase
        self.supabase_client = supabase.create_client(
            self.supabase_url, self.supabase_key
        )
    def price_calculator(self, model, response_obj, start_time, end_time):
        # try and find if the model is in the model_cost map
@@ -45,8 +110,14 @@ class Supabase:
        prompt_tokens_cost_usd_dollar = 0
        completion_tokens_cost_usd_dollar = 0
        if model in model_cost:
            prompt_tokens_cost_usd_dollar = (
                model_cost[model]["input_cost_per_token"]
                * response_obj["usage"]["prompt_tokens"]
            )
            completion_tokens_cost_usd_dollar = (
                model_cost[model]["output_cost_per_token"]
                * response_obj["usage"]["completion_tokens"]
            )
        elif "replicate" in model:
            # replicate models are charged based on time
            # llama 2 runs on an nvidia a100 which costs $0.0032 per second - https://replicate.com/replicate/llama-2-70b-chat
@@ -63,40 +134,74 @@ class Supabase:
                output_cost_sum += model_cost[model]["output_cost_per_token"]
            avg_input_cost = input_cost_sum / len(model_cost.keys())
            avg_output_cost = output_cost_sum / len(model_cost.keys())
            prompt_tokens_cost_usd_dollar = (
                model_cost[model]["input_cost_per_token"]
                * response_obj["usage"]["prompt_tokens"]
            )
            completion_tokens_cost_usd_dollar = (
                model_cost[model]["output_cost_per_token"]
                * response_obj["usage"]["completion_tokens"]
            )
        return prompt_tokens_cost_usd_dollar, completion_tokens_cost_usd_dollar
    def log_event(
        self,
        model,
        messages,
        end_user,
        response_obj,
        start_time,
        end_time,
        print_verbose,
    ):
        try:
            print_verbose(
                f"Supabase Logging - Enters logging function for model {model}, response_obj: {response_obj}"
            )
            (
                prompt_tokens_cost_usd_dollar,
                completion_tokens_cost_usd_dollar,
            ) = self.price_calculator(model, response_obj, start_time, end_time)
            total_cost = (
                prompt_tokens_cost_usd_dollar + completion_tokens_cost_usd_dollar
            )
            response_time = (end_time - start_time).total_seconds()
            if "choices" in response_obj:
                supabase_data_obj = {
                    "response_time": response_time,
                    "model": response_obj["model"],
                    "total_cost": total_cost,
                    "messages": messages,
                    "response": response_obj["choices"][0]["message"]["content"],
                    "end_user": end_user,
                }
                print_verbose(
                    f"Supabase Logging - final data object: {supabase_data_obj}"
                )
                data, count = (
                    self.supabase_client.table(self.supabase_table_name)
                    .insert(supabase_data_obj)
                    .execute()
                )
            elif "error" in response_obj:
                supabase_data_obj = {
                    "response_time": response_time,
                    "model": response_obj["model"],
                    "total_cost": total_cost,
                    "messages": messages,
                    "error": response_obj["error"],
                    "end_user": end_user,
                }
                print_verbose(
                    f"Supabase Logging - final data object: {supabase_data_obj}"
                )
                data, count = (
                    self.supabase_client.table(self.supabase_table_name)
                    .insert(supabase_data_obj)
                    .execute()
                )
        except:
            # traceback.print_exc()


@@ -6,18 +6,22 @@ import time
from typing import Callable
from litellm.utils import ModelResponse
class AnthropicConstants(Enum):
    HUMAN_PROMPT = "\n\nHuman:"
    AI_PROMPT = "\n\nAssistant:"
class AnthropicError(Exception):
    def __init__(self, status_code, message):
        self.status_code = status_code
        self.message = message
        super().__init__(
            self.message
        )  # Call the base class constructor with the parameters it needs
class AnthropicLLM:
    def __init__(self, encoding, default_max_tokens_to_sample, api_key=None):
        self.encoding = encoding
        self.default_max_tokens_to_sample = default_max_tokens_to_sample
@@ -25,31 +29,50 @@ class AnthropicLLM:
        self.api_key = api_key
        self.validate_environment(api_key=api_key)
    def validate_environment(
        self, api_key
    ):  # set up the environment required to run the model
        # set the api key
        if self.api_key == None:
            raise ValueError(
                "Missing Anthropic API Key - A call is being made to anthropic but no key is set either in the environment variables or via params"
            )
        self.api_key = api_key
        self.headers = {
            "accept": "application/json",
            "anthropic-version": "2023-06-01",
            "content-type": "application/json",
            "x-api-key": self.api_key,
        }
    def completion(
        self,
        model: str,
        messages: list,
        model_response: ModelResponse,
        print_verbose: Callable,
        optional_params=None,
        litellm_params=None,
        logger_fn=None,
    ):  # logic for parsing in - calling - parsing out model completion calls
        model = model
        prompt = f"{AnthropicConstants.HUMAN_PROMPT.value}"
        for message in messages:
            if "role" in message:
                if message["role"] == "user":
                    prompt += (
                        f"{AnthropicConstants.HUMAN_PROMPT.value}{message['content']}"
                    )
                else:
                    prompt += (
                        f"{AnthropicConstants.AI_PROMPT.value}{message['content']}"
                    )
            else:
                prompt += f"{AnthropicConstants.HUMAN_PROMPT.value}{message['content']}"
        prompt += f"{AnthropicConstants.AI_PROMPT.value}"
        if "max_tokens" in optional_params and optional_params["max_tokens"] != float(
            "inf"
        ):
            max_tokens = optional_params["max_tokens"]
        else:
            max_tokens = self.default_max_tokens_to_sample
@@ -57,37 +80,64 @@ class AnthropicLLM:
            "model": model,
            "prompt": prompt,
            "max_tokens_to_sample": max_tokens,
            **optional_params,
        }
        ## LOGGING
        logging(
            model=model,
            input=prompt,
            additional_args={
                "litellm_params": litellm_params,
                "optional_params": optional_params,
            },
            logger_fn=logger_fn,
        )
        ## COMPLETION CALL
        response = requests.post(
            self.completion_url, headers=self.headers, data=json.dumps(data)
        )
        if "stream" in optional_params and optional_params["stream"] == True:
            return response.iter_lines()
        else:
            ## LOGGING
            logging(
                model=model,
                input=prompt,
                additional_args={
                    "litellm_params": litellm_params,
                    "optional_params": optional_params,
                    "original_response": response.text,
                },
                logger_fn=logger_fn,
            )
            print_verbose(f"raw model_response: {response.text}")
            ## RESPONSE OBJECT
            completion_response = response.json()
            if "error" in completion_response:
                raise AnthropicError(
                    message=completion_response["error"],
                    status_code=response.status_code,
                )
            else:
                model_response["choices"][0]["message"][
                    "content"
                ] = completion_response["completion"]
            ## CALCULATING USAGE
            prompt_tokens = len(
                self.encoding.encode(prompt)
            )  ##[TODO] use the anthropic tokenizer here
            completion_tokens = len(
                self.encoding.encode(model_response["choices"][0]["message"]["content"])
            )  ##[TODO] use the anthropic tokenizer here
            model_response["created"] = time.time()
            model_response["model"] = model
            model_response["usage"] = {
                "prompt_tokens": prompt_tokens,
                "completion_tokens": completion_tokens,
                "total_tokens": prompt_tokens + completion_tokens,
            }
            return model_response


@@ -1,6 +1,7 @@
## This is a template base class to be used for adding new LLM providers via API calls
class BaseLLM:
    def validate_environment():  # set up the environment required to run the model
        pass


@@ -7,18 +7,24 @@ import time
from typing import Callable
from litellm.utils import ModelResponse
class HuggingfaceError(Exception):
    def __init__(self, status_code, message):
        self.status_code = status_code
        self.message = message
        super().__init__(
            self.message
        )  # Call the base class constructor with the parameters it needs
class HuggingfaceRestAPILLM:
    def __init__(self, encoding, api_key=None) -> None:
        self.encoding = encoding
        self.validate_environment(api_key=api_key)
    def validate_environment(
        self, api_key
    ):  # set up the environment required to run the model
        self.headers = {
            "content-type": "application/json",
        }
@@ -27,7 +33,17 @@ class HuggingfaceRestAPILLM():
        if self.api_key != None:
            self.headers["Authorization"] = f"Bearer {self.api_key}"
    def completion(
        self,
        model: str,
        messages: list,
        custom_api_base: str,
        model_response: ModelResponse,
        print_verbose: Callable,
        optional_params=None,
        litellm_params=None,
        logger_fn=None,
    ):  # logic for parsing in - calling - parsing out model completion calls
        if custom_api_base:
            completion_url = custom_api_base
        elif "HF_API_BASE" in os.environ:
@@ -35,7 +51,9 @@ class HuggingfaceRestAPILLM():
        else:
            completion_url = f"https://api-inference.huggingface.co/models/{model}"
        prompt = ""
        if (
            "meta-llama" in model and "chat" in model
        ):  # use the required special tokens for meta-llama - https://huggingface.co/blog/llama2#how-to-prompt-llama-2
            prompt = "<s>"
            for message in messages:
                if message["role"] == "system":
@@ -57,14 +75,33 @@ class HuggingfaceRestAPILLM():
            # "parameters": optional_params
        }
        ## LOGGING
        logging(
            model=model,
            input=prompt,
            additional_args={
                "litellm_params": litellm_params,
                "optional_params": optional_params,
            },
            logger_fn=logger_fn,
        )
        ## COMPLETION CALL
        response = requests.post(
            completion_url, headers=self.headers, data=json.dumps(data)
        )
        if "stream" in optional_params and optional_params["stream"] == True:
            return response.iter_lines()
        else:
            ## LOGGING
            logging(
                model=model,
                input=prompt,
                additional_args={
                    "litellm_params": litellm_params,
                    "optional_params": optional_params,
                    "original_response": response.text,
                },
                logger_fn=logger_fn,
            )
            print_verbose(f"raw model_response: {response.text}")
            ## RESPONSE OBJECT
            completion_response = response.json()
@@ -72,21 +109,29 @@ class HuggingfaceRestAPILLM():
            if isinstance(completion_response, dict) and "error" in completion_response:
                print_verbose(f"completion error: {completion_response['error']}")
                print_verbose(f"response.status_code: {response.status_code}")
                raise HuggingfaceError(
                    message=completion_response["error"],
                    status_code=response.status_code,
                )
            else:
                model_response["choices"][0]["message"][
                    "content"
                ] = completion_response[0]["generated_text"]
            ## CALCULATING USAGE
            prompt_tokens = len(
                self.encoding.encode(prompt)
            )  ##[TODO] use the llama2 tokenizer here
            completion_tokens = len(
                self.encoding.encode(model_response["choices"][0]["message"]["content"])
            )  ##[TODO] use the llama2 tokenizer here
            model_response["created"] = time.time()
            model_response["model"] = model
            model_response["usage"] = {
                "prompt_tokens": prompt_tokens,
                "completion_tokens": completion_tokens,
                "total_tokens": prompt_tokens + completion_tokens,
            }
            return model_response
        pass
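For orientation, here is a minimal standalone sketch of the request pattern the Hugging Face class above implements: build a Llama-2 style prompt (the diff shows the `<s>` special-token handling) and POST it to the Inference API. The HF_TOKEN variable, the simplified prompt template, and the "inputs" payload key are assumptions for illustration, not values taken verbatim from this commit.

import json
import os

import requests

# Illustrative model id and token source; adjust to your setup.
model = "meta-llama/Llama-2-7b-chat-hf"
completion_url = f"https://api-inference.huggingface.co/models/{model}"
headers = {"Authorization": f"Bearer {os.environ.get('HF_TOKEN', '')}"}

messages = [{"role": "user", "content": "Hello, how are you?"}]
# Simplified stand-in for the Llama-2 chat template assembled in the class above.
prompt = "<s>[INST] " + " ".join(m["content"] for m in messages) + " [/INST]"

data = {"inputs": prompt}  # assumed payload shape for the Inference API
response = requests.post(completion_url, headers=headers, data=json.dumps(data))
print(response.json())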
@@ -4,17 +4,43 @@ from functools import partial
import dotenv, traceback, random, asyncio, time import dotenv, traceback, random, asyncio, time
from copy import deepcopy from copy import deepcopy
import litellm import litellm
from litellm import client, logging, exception_type, timeout, get_optional_params, get_litellm_params from litellm import (
from litellm.utils import get_secret, install_and_import, CustomStreamWrapper, read_config_args client,
logging,
exception_type,
timeout,
get_optional_params,
get_litellm_params,
)
from litellm.utils import (
get_secret,
install_and_import,
CustomStreamWrapper,
read_config_args,
)
from .llms.anthropic import AnthropicLLM from .llms.anthropic import AnthropicLLM
from .llms.huggingface_restapi import HuggingfaceRestAPILLM from .llms.huggingface_restapi import HuggingfaceRestAPILLM
import tiktoken import tiktoken
from concurrent.futures import ThreadPoolExecutor from concurrent.futures import ThreadPoolExecutor
encoding = tiktoken.get_encoding("cl100k_base") encoding = tiktoken.get_encoding("cl100k_base")
from litellm.utils import get_secret, install_and_import, CustomStreamWrapper, ModelResponse, read_config_args from litellm.utils import (
from litellm.utils import get_ollama_response_stream, stream_to_string, together_ai_completion_streaming get_secret,
install_and_import,
CustomStreamWrapper,
ModelResponse,
read_config_args,
)
from litellm.utils import (
get_ollama_response_stream,
stream_to_string,
together_ai_completion_streaming,
)
####### ENVIRONMENT VARIABLES ################### ####### ENVIRONMENT VARIABLES ###################
dotenv.load_dotenv() # Loading env variables using dotenv dotenv.load_dotenv() # Loading env variables using dotenv
####### COMPLETION ENDPOINTS ################ ####### COMPLETION ENDPOINTS ################
############################################# #############################################
async def acompletion(*args, **kwargs): async def acompletion(*args, **kwargs):
@@ -26,115 +52,198 @@ async def acompletion(*args, **kwargs):
# Call the synchronous function using run_in_executor # Call the synchronous function using run_in_executor
return await loop.run_in_executor(None, func) return await loop.run_in_executor(None, func)
@client @client
# @retry(wait=wait_random_exponential(min=1, max=60), stop=stop_after_attempt(2), reraise=True, retry_error_callback=lambda retry_state: setattr(retry_state.outcome, 'retry_variable', litellm.retry)) # retry call, turn this off by setting `litellm.retry = False` # @retry(wait=wait_random_exponential(min=1, max=60), stop=stop_after_attempt(2), reraise=True, retry_error_callback=lambda retry_state: setattr(retry_state.outcome, 'retry_variable', litellm.retry)) # retry call, turn this off by setting `litellm.retry = False`
@timeout(600) ## set timeouts, in case calls hang (e.g. Azure) - default is 60s, override with `force_timeout` @timeout(
600
) ## set timeouts, in case calls hang (e.g. Azure) - default is 60s, override with `force_timeout`
def completion( def completion(
model, messages,# required params model,
messages, # required params
# Optional OpenAI params: see https://platform.openai.com/docs/api-reference/chat/create # Optional OpenAI params: see https://platform.openai.com/docs/api-reference/chat/create
functions=[], function_call="", # optional params functions=[],
temperature=1, top_p=1, n=1, stream=False, stop=None, max_tokens=float('inf'), function_call="", # optional params
presence_penalty=0, frequency_penalty=0, logit_bias={}, user="", deployment_id=None, temperature=1,
top_p=1,
n=1,
stream=False,
stop=None,
max_tokens=float("inf"),
presence_penalty=0,
frequency_penalty=0,
logit_bias={},
user="",
deployment_id=None,
# Optional liteLLM function params # Optional liteLLM function params
*, return_async=False, api_key=None, force_timeout=600, logger_fn=None, verbose=False, azure=False, custom_llm_provider=None, custom_api_base=None, *,
return_async=False,
api_key=None,
force_timeout=600,
logger_fn=None,
verbose=False,
azure=False,
custom_llm_provider=None,
custom_api_base=None,
# model specific optional params # model specific optional params
# used by text-bison only # used by text-bison only
top_k=40, request_timeout=0, # unused var for old version of OpenAI API top_k=40,
) -> ModelResponse: request_timeout=0, # unused var for old version of OpenAI API
) -> ModelResponse:
try: try:
model_response = ModelResponse() model_response = ModelResponse()
if azure: # this flag is deprecated, remove once notebooks are also updated. if azure: # this flag is deprecated, remove once notebooks are also updated.
custom_llm_provider="azure" custom_llm_provider = "azure"
args = locals() args = locals()
# check if user passed in any of the OpenAI optional params # check if user passed in any of the OpenAI optional params
optional_params = get_optional_params( optional_params = get_optional_params(
functions=functions, function_call=function_call, functions=functions,
temperature=temperature, top_p=top_p, n=n, stream=stream, stop=stop, max_tokens=max_tokens, function_call=function_call,
presence_penalty=presence_penalty, frequency_penalty=frequency_penalty, logit_bias=logit_bias, user=user, deployment_id=deployment_id, temperature=temperature,
top_p=top_p,
n=n,
stream=stream,
stop=stop,
max_tokens=max_tokens,
presence_penalty=presence_penalty,
frequency_penalty=frequency_penalty,
logit_bias=logit_bias,
user=user,
deployment_id=deployment_id,
# params to identify the model # params to identify the model
model=model, custom_llm_provider=custom_llm_provider, top_k=top_k, model=model,
custom_llm_provider=custom_llm_provider,
top_k=top_k,
) )
# For logging - save the values of the litellm-specific params passed in # For logging - save the values of the litellm-specific params passed in
litellm_params = get_litellm_params( litellm_params = get_litellm_params(
return_async=return_async, api_key=api_key, force_timeout=force_timeout, return_async=return_async,
logger_fn=logger_fn, verbose=verbose, custom_llm_provider=custom_llm_provider, api_key=api_key,
custom_api_base=custom_api_base) force_timeout=force_timeout,
logger_fn=logger_fn,
verbose=verbose,
custom_llm_provider=custom_llm_provider,
custom_api_base=custom_api_base,
)
if custom_llm_provider == "azure": if custom_llm_provider == "azure":
# azure configs # azure configs
openai.api_type = "azure" openai.api_type = "azure"
openai.api_base = litellm.api_base if litellm.api_base is not None else get_secret("AZURE_API_BASE") openai.api_base = (
openai.api_version = litellm.api_version if litellm.api_version is not None else get_secret("AZURE_API_VERSION") litellm.api_base
if litellm.api_base is not None
else get_secret("AZURE_API_BASE")
)
openai.api_version = (
litellm.api_version
if litellm.api_version is not None
else get_secret("AZURE_API_VERSION")
)
# set key # set key
openai.api_key = api_key or litellm.azure_key or get_secret("AZURE_API_KEY") openai.api_key = api_key or litellm.azure_key or get_secret("AZURE_API_KEY")
## LOGGING ## LOGGING
logging(model=model, input=messages, additional_args=optional_params, custom_llm_provider=custom_llm_provider, logger_fn=logger_fn) logging(
model=model,
input=messages,
additional_args=optional_params,
custom_llm_provider=custom_llm_provider,
logger_fn=logger_fn,
)
## COMPLETION CALL ## COMPLETION CALL
if litellm.headers: if litellm.headers:
response = openai.ChatCompletion.create( response = openai.ChatCompletion.create(
engine=model, engine=model,
messages = messages, messages=messages,
headers = litellm.headers, headers=litellm.headers,
**optional_params, **optional_params,
) )
else: else:
response = openai.ChatCompletion.create( response = openai.ChatCompletion.create(
model=model, model=model, messages=messages, **optional_params
messages = messages,
**optional_params
) )
elif model in litellm.open_ai_chat_completion_models or custom_llm_provider == "custom_openai": # allow user to make an openai call with a custom base elif (
model in litellm.open_ai_chat_completion_models
or custom_llm_provider == "custom_openai"
): # allow user to make an openai call with a custom base
openai.api_type = "openai" openai.api_type = "openai"
# note: if a user sets a custom base - we should ensure this works # note: if a user sets a custom base - we should ensure this works
api_base = custom_api_base if custom_api_base is not None else litellm.api_base # allow for the setting of dynamic and stateful api-bases api_base = (
openai.api_base = api_base if api_base is not None else "https://api.openai.com/v1" custom_api_base if custom_api_base is not None else litellm.api_base
) # allow for the setting of dynamic and stateful api-bases
openai.api_base = (
api_base if api_base is not None else "https://api.openai.com/v1"
)
openai.api_version = None openai.api_version = None
if litellm.organization: if litellm.organization:
openai.organization = litellm.organization openai.organization = litellm.organization
# set API KEY # set API KEY
openai.api_key = api_key or litellm.openai_key or get_secret("OPENAI_API_KEY") openai.api_key = (
api_key or litellm.openai_key or get_secret("OPENAI_API_KEY")
)
## LOGGING ## LOGGING
logging(model=model, input=messages, additional_args=args, custom_llm_provider=custom_llm_provider, logger_fn=logger_fn) logging(
model=model,
input=messages,
additional_args=args,
custom_llm_provider=custom_llm_provider,
logger_fn=logger_fn,
)
## COMPLETION CALL ## COMPLETION CALL
if litellm.headers: if litellm.headers:
response = openai.ChatCompletion.create( response = openai.ChatCompletion.create(
model=model, model=model,
messages = messages, messages=messages,
headers = litellm.headers, headers=litellm.headers,
**optional_params **optional_params,
) )
else: else:
response = openai.ChatCompletion.create( response = openai.ChatCompletion.create(
model=model, model=model, messages=messages, **optional_params
messages = messages,
**optional_params
) )
elif model in litellm.open_ai_text_completion_models: elif model in litellm.open_ai_text_completion_models:
openai.api_type = "openai" openai.api_type = "openai"
openai.api_base = litellm.api_base if litellm.api_base is not None else "https://api.openai.com/v1" openai.api_base = (
litellm.api_base
if litellm.api_base is not None
else "https://api.openai.com/v1"
)
openai.api_version = None openai.api_version = None
openai.api_key = api_key or litellm.openai_key or get_secret("OPENAI_API_KEY") openai.api_key = (
api_key or litellm.openai_key or get_secret("OPENAI_API_KEY")
)
if litellm.organization: if litellm.organization:
openai.organization = litellm.organization openai.organization = litellm.organization
prompt = " ".join([message["content"] for message in messages]) prompt = " ".join([message["content"] for message in messages])
## LOGGING ## LOGGING
logging(model=model, input=prompt, additional_args=optional_params, custom_llm_provider=custom_llm_provider, logger_fn=logger_fn) logging(
model=model,
input=prompt,
additional_args=optional_params,
custom_llm_provider=custom_llm_provider,
logger_fn=logger_fn,
)
## COMPLETION CALL ## COMPLETION CALL
if litellm.headers: if litellm.headers:
response = openai.Completion.create( response = openai.Completion.create(
model=model, model=model,
prompt = prompt, prompt=prompt,
headers = litellm.headers, headers=litellm.headers,
) )
else: else:
response = openai.Completion.create( response = openai.Completion.create(model=model, prompt=prompt)
model=model,
prompt = prompt
)
completion_response = response["choices"][0]["text"] completion_response = response["choices"][0]["text"]
## LOGGING ## LOGGING
logging(model=model, input=prompt, custom_llm_provider=custom_llm_provider, additional_args={"max_tokens": max_tokens, "original_response": completion_response}, logger_fn=logger_fn) logging(
model=model,
input=prompt,
custom_llm_provider=custom_llm_provider,
additional_args={
"max_tokens": max_tokens,
"original_response": completion_response,
},
logger_fn=logger_fn,
)
## RESPONSE OBJECT ## RESPONSE OBJECT
model_response["choices"][0]["message"]["content"] = completion_response model_response["choices"][0]["message"]["content"] = completion_response
model_response["created"] = response["created"] model_response["created"] = response["created"]
@@ -145,11 +254,17 @@ def completion(
# import replicate/if it fails then pip install replicate # import replicate/if it fails then pip install replicate
install_and_import("replicate") install_and_import("replicate")
import replicate import replicate
# Setting the relevant API KEY for replicate, replicate defaults to using os.environ.get("REPLICATE_API_TOKEN") # Setting the relevant API KEY for replicate, replicate defaults to using os.environ.get("REPLICATE_API_TOKEN")
replicate_key = os.environ.get("REPLICATE_API_TOKEN") replicate_key = os.environ.get("REPLICATE_API_TOKEN")
if replicate_key == None: if replicate_key == None:
# user did not set REPLICATE_API_TOKEN in .env # user did not set REPLICATE_API_TOKEN in .env
replicate_key = get_secret("REPLICATE_API_KEY") or get_secret("REPLICATE_API_TOKEN") or api_key or litellm.replicate_key replicate_key = (
get_secret("REPLICATE_API_KEY")
or get_secret("REPLICATE_API_TOKEN")
or api_key
or litellm.replicate_key
)
# set replicate key # set replicate key
os.environ["REPLICATE_API_TOKEN"] = replicate_key os.environ["REPLICATE_API_TOKEN"] = replicate_key
prompt = " ".join([message["content"] for message in messages]) prompt = " ".join([message["content"] for message in messages])
@@ -158,12 +273,16 @@ def completion(
input["max_length"] = max_tokens # for t5 models input["max_length"] = max_tokens # for t5 models
input["max_new_tokens"] = max_tokens # for llama2 models input["max_new_tokens"] = max_tokens # for llama2 models
## LOGGING ## LOGGING
logging(model=model, input=input, custom_llm_provider=custom_llm_provider, additional_args={"max_tokens": max_tokens}, logger_fn=logger_fn) logging(
model=model,
input=input,
custom_llm_provider=custom_llm_provider,
additional_args={"max_tokens": max_tokens},
logger_fn=logger_fn,
)
## COMPLETION CALL ## COMPLETION CALL
output = replicate.run( output = replicate.run(model, input=input)
model, if "stream" in optional_params and optional_params["stream"] == True:
input=input)
if 'stream' in optional_params and optional_params['stream'] == True:
# don't try to access stream object, # don't try to access stream object,
# let the stream handler know this is replicate # let the stream handler know this is replicate
response = CustomStreamWrapper(output, "replicate") response = CustomStreamWrapper(output, "replicate")
@@ -173,7 +292,16 @@ def completion(
response += item response += item
completion_response = response completion_response = response
## LOGGING ## LOGGING
logging(model=model, input=prompt, custom_llm_provider=custom_llm_provider, additional_args={"max_tokens": max_tokens, "original_response": completion_response}, logger_fn=logger_fn) logging(
model=model,
input=prompt,
custom_llm_provider=custom_llm_provider,
additional_args={
"max_tokens": max_tokens,
"original_response": completion_response,
},
logger_fn=logger_fn,
)
prompt_tokens = len(encoding.encode(prompt)) prompt_tokens = len(encoding.encode(prompt))
completion_tokens = len(encoding.encode(completion_response)) completion_tokens = len(encoding.encode(completion_response))
## RESPONSE OBJECT ## RESPONSE OBJECT
@@ -183,14 +311,28 @@ def completion(
model_response["usage"] = { model_response["usage"] = {
"prompt_tokens": prompt_tokens, "prompt_tokens": prompt_tokens,
"completion_tokens": completion_tokens, "completion_tokens": completion_tokens,
"total_tokens": prompt_tokens + completion_tokens "total_tokens": prompt_tokens + completion_tokens,
} }
response = model_response response = model_response
elif model in litellm.anthropic_models: elif model in litellm.anthropic_models:
anthropic_key = api_key or litellm.anthropic_key or os.environ.get("ANTHROPIC_API_KEY") anthropic_key = (
anthropic_client = AnthropicLLM(encoding=encoding, default_max_tokens_to_sample=litellm.max_tokens, api_key=anthropic_key) api_key or litellm.anthropic_key or os.environ.get("ANTHROPIC_API_KEY")
model_response = anthropic_client.completion(model=model, messages=messages, model_response=model_response, print_verbose=print_verbose, optional_params=optional_params, litellm_params=litellm_params, logger_fn=logger_fn) )
if 'stream' in optional_params and optional_params['stream'] == True: anthropic_client = AnthropicLLM(
encoding=encoding,
default_max_tokens_to_sample=litellm.max_tokens,
api_key=anthropic_key,
)
model_response = anthropic_client.completion(
model=model,
messages=messages,
model_response=model_response,
print_verbose=print_verbose,
optional_params=optional_params,
litellm_params=litellm_params,
logger_fn=logger_fn,
)
if "stream" in optional_params and optional_params["stream"] == True:
# don't try to access stream object, # don't try to access stream object,
response = CustomStreamWrapper(model_response, model) response = CustomStreamWrapper(model_response, model)
return response return response
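A usage sketch for the Anthropic branch above, including the streaming path wrapped by CustomStreamWrapper; it assumes ANTHROPIC_API_KEY is set in the environment.

from litellm import completion

messages = [{"role": "user", "content": "Hello, how are you?"}]

# Non-streaming call; the key is resolved from api_key, litellm.anthropic_key, or the env var.
response = completion(model="claude-instant-1", messages=messages)
print(response["choices"][0]["message"]["content"])

# Streaming returns an iterator of OpenAI-style chunks.
for chunk in completion(model="claude-2", messages=messages, stream=True):
    print(chunk["choices"][0]["delta"])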
@@ -198,7 +340,11 @@ def completion(
elif model in litellm.openrouter_models or custom_llm_provider == "openrouter": elif model in litellm.openrouter_models or custom_llm_provider == "openrouter":
openai.api_type = "openai" openai.api_type = "openai"
# not sure if this will work after someone first uses another API # not sure if this will work after someone first uses another API
openai.api_base = litellm.api_base if litellm.api_base is not None else "https://openrouter.ai/api/v1" openai.api_base = (
litellm.api_base
if litellm.api_base is not None
else "https://openrouter.ai/api/v1"
)
openai.api_version = None openai.api_version = None
if litellm.organization: if litellm.organization:
openai.organization = litellm.organization openai.organization = litellm.organization
@@ -207,16 +353,24 @@ def completion(
elif litellm.openrouter_key: elif litellm.openrouter_key:
openai.api_key = litellm.openrouter_key openai.api_key = litellm.openrouter_key
else: else:
openai.api_key = get_secret("OPENROUTER_API_KEY") or get_secret("OR_API_KEY") openai.api_key = get_secret("OPENROUTER_API_KEY") or get_secret(
"OR_API_KEY"
)
## LOGGING ## LOGGING
logging(model=model, input=messages, additional_args=optional_params, custom_llm_provider=custom_llm_provider, logger_fn=logger_fn) logging(
model=model,
input=messages,
additional_args=optional_params,
custom_llm_provider=custom_llm_provider,
logger_fn=logger_fn,
)
## COMPLETION CALL ## COMPLETION CALL
if litellm.headers: if litellm.headers:
response = openai.ChatCompletion.create( response = openai.ChatCompletion.create(
model=model, model=model,
messages = messages, messages=messages,
headers = litellm.headers, headers=litellm.headers,
**optional_params **optional_params,
) )
else: else:
openrouter_site_url = get_secret("OR_SITE_URL") openrouter_site_url = get_secret("OR_SITE_URL")
@@ -229,37 +383,52 @@ def completion(
openrouter_app_name = "liteLLM" openrouter_app_name = "liteLLM"
response = openai.ChatCompletion.create( response = openai.ChatCompletion.create(
model=model, model=model,
messages = messages, messages=messages,
headers = headers={
{
"HTTP-Referer": openrouter_site_url, # To identify your site "HTTP-Referer": openrouter_site_url, # To identify your site
"X-Title": openrouter_app_name # To identify your app "X-Title": openrouter_app_name, # To identify your app
}, },
**optional_params **optional_params,
) )
elif model in litellm.cohere_models: elif model in litellm.cohere_models:
# import cohere/if it fails then pip install cohere # import cohere/if it fails then pip install cohere
install_and_import("cohere") install_and_import("cohere")
import cohere import cohere
cohere_key = api_key or litellm.cohere_key or get_secret("COHERE_API_KEY") or get_secret("CO_API_KEY")
cohere_key = (
api_key
or litellm.cohere_key
or get_secret("COHERE_API_KEY")
or get_secret("CO_API_KEY")
)
co = cohere.Client(cohere_key) co = cohere.Client(cohere_key)
prompt = " ".join([message["content"] for message in messages]) prompt = " ".join([message["content"] for message in messages])
## LOGGING ## LOGGING
logging(model=model, input=prompt, custom_llm_provider=custom_llm_provider, logger_fn=logger_fn) logging(
## COMPLETION CALL
response = co.generate(
model=model, model=model,
prompt = prompt, input=prompt,
**optional_params custom_llm_provider=custom_llm_provider,
logger_fn=logger_fn,
) )
if 'stream' in optional_params and optional_params['stream'] == True: ## COMPLETION CALL
response = co.generate(model=model, prompt=prompt, **optional_params)
if "stream" in optional_params and optional_params["stream"] == True:
# don't try to access stream object, # don't try to access stream object,
response = CustomStreamWrapper(response, model) response = CustomStreamWrapper(response, model)
return response return response
completion_response = response[0].text completion_response = response[0].text
## LOGGING ## LOGGING
logging(model=model, input=prompt, custom_llm_provider=custom_llm_provider, additional_args={"max_tokens": max_tokens, "original_response": completion_response}, logger_fn=logger_fn) logging(
model=model,
input=prompt,
custom_llm_provider=custom_llm_provider,
additional_args={
"max_tokens": max_tokens,
"original_response": completion_response,
},
logger_fn=logger_fn,
)
prompt_tokens = len(encoding.encode(prompt)) prompt_tokens = len(encoding.encode(prompt))
completion_tokens = len(encoding.encode(completion_response)) completion_tokens = len(encoding.encode(completion_response))
## RESPONSE OBJECT ## RESPONSE OBJECT
@@ -269,52 +438,100 @@ def completion(
model_response["usage"] = { model_response["usage"] = {
"prompt_tokens": prompt_tokens, "prompt_tokens": prompt_tokens,
"completion_tokens": completion_tokens, "completion_tokens": completion_tokens,
"total_tokens": prompt_tokens + completion_tokens "total_tokens": prompt_tokens + completion_tokens,
} }
response = model_response response = model_response
elif model in litellm.huggingface_models or custom_llm_provider == "huggingface": elif (
model in litellm.huggingface_models or custom_llm_provider == "huggingface"
):
custom_llm_provider = "huggingface" custom_llm_provider = "huggingface"
huggingface_key = api_key or litellm.huggingface_key or os.environ.get("HF_TOKEN") or os.environ.get("HUGGINGFACE_API_KEY") huggingface_key = (
huggingface_client = HuggingfaceRestAPILLM(encoding=encoding, api_key=huggingface_key) api_key
model_response = huggingface_client.completion(model=model, messages=messages, custom_api_base=custom_api_base, model_response=model_response, print_verbose=print_verbose, optional_params=optional_params, litellm_params=litellm_params, logger_fn=logger_fn) or litellm.huggingface_key
if 'stream' in optional_params and optional_params['stream'] == True: or os.environ.get("HF_TOKEN")
or os.environ.get("HUGGINGFACE_API_KEY")
)
huggingface_client = HuggingfaceRestAPILLM(
encoding=encoding, api_key=huggingface_key
)
model_response = huggingface_client.completion(
model=model,
messages=messages,
custom_api_base=custom_api_base,
model_response=model_response,
print_verbose=print_verbose,
optional_params=optional_params,
litellm_params=litellm_params,
logger_fn=logger_fn,
)
if "stream" in optional_params and optional_params["stream"] == True:
# don't try to access stream object, # don't try to access stream object,
response = CustomStreamWrapper(model_response, model, custom_llm_provider="huggingface") response = CustomStreamWrapper(
model_response, model, custom_llm_provider="huggingface"
)
return response return response
response = model_response response = model_response
elif custom_llm_provider == "together_ai" or ("togethercomputer" in model): elif custom_llm_provider == "together_ai" or ("togethercomputer" in model):
import requests import requests
TOGETHER_AI_TOKEN = get_secret("TOGETHER_AI_TOKEN") or get_secret("TOGETHERAI_API_KEY") or api_key or litellm.togetherai_api_key
TOGETHER_AI_TOKEN = (
get_secret("TOGETHER_AI_TOKEN")
or get_secret("TOGETHERAI_API_KEY")
or api_key
or litellm.togetherai_api_key
)
headers = {"Authorization": f"Bearer {TOGETHER_AI_TOKEN}"} headers = {"Authorization": f"Bearer {TOGETHER_AI_TOKEN}"}
endpoint = 'https://api.together.xyz/inference' endpoint = "https://api.together.xyz/inference"
prompt = " ".join([message["content"] for message in messages]) # TODO: Add chat support for together AI prompt = " ".join(
[message["content"] for message in messages]
) # TODO: Add chat support for together AI
## LOGGING ## LOGGING
logging(model=model, input=prompt, custom_llm_provider=custom_llm_provider, logger_fn=logger_fn) logging(
model=model,
input=prompt,
custom_llm_provider=custom_llm_provider,
logger_fn=logger_fn,
)
if stream == True: if stream == True:
return together_ai_completion_streaming({ return together_ai_completion_streaming(
{
"model": model, "model": model,
"prompt": prompt, "prompt": prompt,
"request_type": "language-model-inference", "request_type": "language-model-inference",
**optional_params **optional_params,
}, },
headers=headers) headers=headers,
res = requests.post(endpoint, json={ )
res = requests.post(
endpoint,
json={
"model": model, "model": model,
"prompt": prompt, "prompt": prompt,
"request_type": "language-model-inference", "request_type": "language-model-inference",
**optional_params **optional_params,
}, },
headers=headers headers=headers,
) )
## LOGGING ## LOGGING
logging(model=model, input=prompt, custom_llm_provider=custom_llm_provider, additional_args={"max_tokens": max_tokens, "original_response": res.text}, logger_fn=logger_fn) logging(
model=model,
input=prompt,
custom_llm_provider=custom_llm_provider,
additional_args={
"max_tokens": max_tokens,
"original_response": res.text,
},
logger_fn=logger_fn,
)
# make this safe for reading, if output does not exist raise an error # make this safe for reading, if output does not exist raise an error
json_response = res.json() json_response = res.json()
if "output" not in json_response: if "output" not in json_response:
raise Exception(f"liteLLM: Error Making TogetherAI request, JSON Response {json_response}") raise Exception(
completion_response = json_response['output']['choices'][0]['text'] f"liteLLM: Error Making TogetherAI request, JSON Response {json_response}"
)
completion_response = json_response["output"]["choices"][0]["text"]
prompt_tokens = len(encoding.encode(prompt)) prompt_tokens = len(encoding.encode(prompt))
completion_tokens = len(encoding.encode(completion_response)) completion_tokens = len(encoding.encode(completion_response))
## RESPONSE OBJECT ## RESPONSE OBJECT
@@ -324,7 +541,7 @@ def completion(
model_response["usage"] = { model_response["usage"] = {
"prompt_tokens": prompt_tokens, "prompt_tokens": prompt_tokens,
"completion_tokens": completion_tokens, "completion_tokens": completion_tokens,
"total_tokens": prompt_tokens + completion_tokens "total_tokens": prompt_tokens + completion_tokens,
} }
response = model_response response = model_response
elif model in litellm.vertex_chat_models: elif model in litellm.vertex_chat_models:
@@ -332,21 +549,41 @@ def completion(
install_and_import("vertexai") install_and_import("vertexai")
import vertexai import vertexai
from vertexai.preview.language_models import ChatModel, InputOutputTextPair from vertexai.preview.language_models import ChatModel, InputOutputTextPair
vertexai.init(project=litellm.vertex_project, location=litellm.vertex_location)
vertexai.init(
project=litellm.vertex_project, location=litellm.vertex_location
)
# vertexai does not use an API key, it looks for credentials.json in the environment # vertexai does not use an API key, it looks for credentials.json in the environment
prompt = " ".join([message["content"] for message in messages]) prompt = " ".join([message["content"] for message in messages])
## LOGGING ## LOGGING
logging(model=model, input=prompt, custom_llm_provider=custom_llm_provider, additional_args={"litellm_params": litellm_params, "optional_params": optional_params}, logger_fn=logger_fn) logging(
model=model,
input=prompt,
custom_llm_provider=custom_llm_provider,
additional_args={
"litellm_params": litellm_params,
"optional_params": optional_params,
},
logger_fn=logger_fn,
)
chat_model = ChatModel.from_pretrained(model) chat_model = ChatModel.from_pretrained(model)
chat = chat_model.start_chat() chat = chat_model.start_chat()
completion_response = chat.send_message(prompt, **optional_params) completion_response = chat.send_message(prompt, **optional_params)
## LOGGING ## LOGGING
logging(model=model, input=prompt, custom_llm_provider=custom_llm_provider, additional_args={"max_tokens": max_tokens, "original_response": completion_response}, logger_fn=logger_fn) logging(
model=model,
input=prompt,
custom_llm_provider=custom_llm_provider,
additional_args={
"max_tokens": max_tokens,
"original_response": completion_response,
},
logger_fn=logger_fn,
)
## RESPONSE OBJECT ## RESPONSE OBJECT
model_response["choices"][0]["message"]["content"] = completion_response model_response["choices"][0]["message"]["content"] = completion_response
@@ -358,17 +595,33 @@ def completion(
import vertexai import vertexai
from vertexai.language_models import TextGenerationModel from vertexai.language_models import TextGenerationModel
vertexai.init(project=litellm.vertex_project, location=litellm.vertex_location) vertexai.init(
project=litellm.vertex_project, location=litellm.vertex_location
)
# vertexai does not use an API key, it looks for credentials.json in the environment # vertexai does not use an API key, it looks for credentials.json in the environment
prompt = " ".join([message["content"] for message in messages]) prompt = " ".join([message["content"] for message in messages])
## LOGGING ## LOGGING
logging(model=model, input=prompt, custom_llm_provider=custom_llm_provider, logger_fn=logger_fn) logging(
model=model,
input=prompt,
custom_llm_provider=custom_llm_provider,
logger_fn=logger_fn,
)
vertex_model = TextGenerationModel.from_pretrained(model) vertex_model = TextGenerationModel.from_pretrained(model)
completion_response= vertex_model.predict(prompt, **optional_params) completion_response = vertex_model.predict(prompt, **optional_params)
## LOGGING ## LOGGING
logging(model=model, input=prompt, custom_llm_provider=custom_llm_provider, additional_args={"max_tokens": max_tokens, "original_response": completion_response}, logger_fn=logger_fn) logging(
model=model,
input=prompt,
custom_llm_provider=custom_llm_provider,
additional_args={
"max_tokens": max_tokens,
"original_response": completion_response,
},
logger_fn=logger_fn,
)
## RESPONSE OBJECT ## RESPONSE OBJECT
model_response["choices"][0]["message"]["content"] = completion_response model_response["choices"][0]["message"]["content"] = completion_response
@@ -378,20 +631,35 @@ def completion(
elif model in litellm.ai21_models: elif model in litellm.ai21_models:
install_and_import("ai21") install_and_import("ai21")
import ai21 import ai21
ai21.api_key = get_secret("AI21_API_KEY") ai21.api_key = get_secret("AI21_API_KEY")
prompt = " ".join([message["content"] for message in messages]) prompt = " ".join([message["content"] for message in messages])
## LOGGING ## LOGGING
logging(model=model, input=prompt, custom_llm_provider=custom_llm_provider, logger_fn=logger_fn) logging(
model=model,
input=prompt,
custom_llm_provider=custom_llm_provider,
logger_fn=logger_fn,
)
ai21_response = ai21.Completion.execute( ai21_response = ai21.Completion.execute(
model=model, model=model,
prompt=prompt, prompt=prompt,
) )
completion_response = ai21_response['completions'][0]['data']['text'] completion_response = ai21_response["completions"][0]["data"]["text"]
## LOGGING ## LOGGING
logging(model=model, input=prompt, custom_llm_provider=custom_llm_provider, additional_args={"max_tokens": max_tokens, "original_response": completion_response}, logger_fn=logger_fn) logging(
model=model,
input=prompt,
custom_llm_provider=custom_llm_provider,
additional_args={
"max_tokens": max_tokens,
"original_response": completion_response,
},
logger_fn=logger_fn,
)
## RESPONSE OBJECT ## RESPONSE OBJECT
model_response["choices"][0]["message"]["content"] = completion_response model_response["choices"][0]["message"]["content"] = completion_response
@@ -399,7 +667,9 @@ def completion(
model_response["model"] = model model_response["model"] = model
response = model_response response = model_response
elif custom_llm_provider == "ollama": elif custom_llm_provider == "ollama":
endpoint = litellm.api_base if litellm.api_base is not None else custom_api_base endpoint = (
litellm.api_base if litellm.api_base is not None else custom_api_base
)
prompt = " ".join([message["content"] for message in messages]) prompt = " ".join([message["content"] for message in messages])
## LOGGING ## LOGGING
@@ -407,14 +677,23 @@ def completion(
generator = get_ollama_response_stream(endpoint, model, prompt) generator = get_ollama_response_stream(endpoint, model, prompt)
# assume all responses are streamed # assume all responses are streamed
return generator return generator
elif custom_llm_provider == "baseten" or litellm.api_base=="https://app.baseten.co": elif (
custom_llm_provider == "baseten"
or litellm.api_base == "https://app.baseten.co"
):
import baseten import baseten
base_ten_key = get_secret('BASETEN_API_KEY')
base_ten_key = get_secret("BASETEN_API_KEY")
baseten.login(base_ten_key) baseten.login(base_ten_key)
prompt = " ".join([message["content"] for message in messages]) prompt = " ".join([message["content"] for message in messages])
## LOGGING ## LOGGING
logging(model=model, input=prompt, custom_llm_provider=custom_llm_provider, logger_fn=logger_fn) logging(
model=model,
input=prompt,
custom_llm_provider=custom_llm_provider,
logger_fn=logger_fn,
)
base_ten__model = baseten.deployed_model_version_id(model) base_ten__model = baseten.deployed_model_version_id(model)
@@ -424,7 +703,16 @@ def completion(
if type(completion_response) == dict: if type(completion_response) == dict:
completion_response = completion_response["generated_text"] completion_response = completion_response["generated_text"]
logging(model=model, input=prompt, custom_llm_provider=custom_llm_provider, additional_args={"max_tokens": max_tokens, "original_response": completion_response}, logger_fn=logger_fn) logging(
model=model,
input=prompt,
custom_llm_provider=custom_llm_provider,
additional_args={
"max_tokens": max_tokens,
"original_response": completion_response,
},
logger_fn=logger_fn,
)
## RESPONSE OBJECT ## RESPONSE OBJECT
model_response["choices"][0]["message"]["content"] = completion_response model_response["choices"][0]["message"]["content"] = completion_response
@@ -432,16 +720,35 @@ def completion(
model_response["model"] = model model_response["model"] = model
response = model_response response = model_response
elif custom_llm_provider == "petals" or (litellm.api_base and "chat.petals.dev" in litellm.api_base): elif custom_llm_provider == "petals" or (
litellm.api_base and "chat.petals.dev" in litellm.api_base
):
url = "https://chat.petals.dev/api/v1/generate" url = "https://chat.petals.dev/api/v1/generate"
import requests import requests
prompt = " ".join([message["content"] for message in messages]) prompt = " ".join([message["content"] for message in messages])
## LOGGING ## LOGGING
logging(model=model, input=prompt, custom_llm_provider=custom_llm_provider, logger_fn=logger_fn) logging(
response = requests.post(url, data={"inputs": prompt, "max_new_tokens": 100, "model": model}) model=model,
input=prompt,
custom_llm_provider=custom_llm_provider,
logger_fn=logger_fn,
)
response = requests.post(
url, data={"inputs": prompt, "max_new_tokens": 100, "model": model}
)
## LOGGING ## LOGGING
logging(model=model, input=prompt, custom_llm_provider=custom_llm_provider, additional_args={"max_tokens": max_tokens, "original_response": response}, logger_fn=logger_fn) logging(
model=model,
input=prompt,
custom_llm_provider=custom_llm_provider,
additional_args={
"max_tokens": max_tokens,
"original_response": response,
},
logger_fn=logger_fn,
)
completion_response = response.json()["outputs"] completion_response = response.json()["outputs"]
# RESPONSE OBJECT # RESPONSE OBJECT
@@ -451,15 +758,32 @@ def completion(
response = model_response response = model_response
else: else:
## LOGGING ## LOGGING
logging(model=model, input=messages, custom_llm_provider=custom_llm_provider, logger_fn=logger_fn) logging(
model=model,
input=messages,
custom_llm_provider=custom_llm_provider,
logger_fn=logger_fn,
)
args = locals() args = locals()
raise ValueError(f"Unable to map your input to a model. Check your input - {args}") raise ValueError(
f"Unable to map your input to a model. Check your input - {args}"
)
return response return response
except Exception as e: except Exception as e:
## LOGGING ## LOGGING
logging(model=model, input=messages, custom_llm_provider=custom_llm_provider, additional_args={"max_tokens": max_tokens}, logger_fn=logger_fn, exception=e) logging(
model=model,
input=messages,
custom_llm_provider=custom_llm_provider,
additional_args={"max_tokens": max_tokens},
logger_fn=logger_fn,
exception=e,
)
## Map to OpenAI Exception ## Map to OpenAI Exception
raise exception_type(model=model, custom_llm_provider=custom_llm_provider, original_exception=e) raise exception_type(
model=model, custom_llm_provider=custom_llm_provider, original_exception=e
)
def batch_completion(*args, **kwargs): def batch_completion(*args, **kwargs):
batch_messages = args[1] if len(args) > 1 else kwargs.get("messages") batch_messages = args[1] if len(args) > 1 else kwargs.get("messages")
@@ -480,9 +804,12 @@ def batch_completion(*args, **kwargs):
results = [future.result() for future in completions] results = [future.result() for future in completions]
return results return results
### EMBEDDING ENDPOINTS #################### ### EMBEDDING ENDPOINTS ####################
@client @client
@timeout(60) ## set timeouts, in case calls hang (e.g. Azure) - default is 60s, override with `force_timeout` @timeout(
60
) ## set timeouts, in case calls hang (e.g. Azure) - default is 60s, override with `force_timeout`
def embedding(model, input=[], azure=False, force_timeout=60, logger_fn=None): def embedding(model, input=[], azure=False, force_timeout=60, logger_fn=None):
try: try:
response = None response = None
@@ -519,6 +846,8 @@ def embedding(model, input=[], azure=False, force_timeout=60, logger_fn=None):
## Map to OpenAI Exception ## Map to OpenAI Exception
raise exception_type(model=model, original_exception=e) raise exception_type(model=model, original_exception=e)
raise e raise e
####### HELPER FUNCTIONS ################ ####### HELPER FUNCTIONS ################
## Set verbose to true -> ```litellm.set_verbose = True``` ## Set verbose to true -> ```litellm.set_verbose = True```
def print_verbose(print_statement): def print_verbose(print_statement):
@@ -527,10 +856,13 @@ def print_verbose(print_statement):
if random.random() <= 0.3: if random.random() <= 0.3:
print("Get help - https://discord.com/invite/wuPM9dRgDw") print("Get help - https://discord.com/invite/wuPM9dRgDw")
def config_completion(**kwargs): def config_completion(**kwargs):
if litellm.config_path != None: if litellm.config_path != None:
config_args = read_config_args(litellm.config_path) config_args = read_config_args(litellm.config_path)
# overwrite any args passed in with config args # overwrite any args passed in with config args
return completion(**kwargs, **config_args) return completion(**kwargs, **config_args)
else: else:
raise ValueError("No config path set, please set a config path using `litellm.config_path = 'path/to/config.json'`") raise ValueError(
"No config path set, please set a config path using `litellm.config_path = 'path/to/config.json'`"
)
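Taken together, the branches above let one call shape cover every provider. A short sketch; model names beyond those appearing in this commit are illustrative, and each call assumes the matching API key is configured in the environment.

from litellm import completion

messages = [{"role": "user", "content": "Hello, how are you?"}]

# Routed by model name.
openai_response = completion(model="gpt-3.5-turbo", messages=messages)
cohere_response = completion(model="command-nightly", messages=messages, max_tokens=100)

# Or force a provider explicitly, e.g. the Hugging Face Inference API branch.
hf_response = completion(
    model="meta-llama/Llama-2-7b-chat-hf",  # illustrative model id
    messages=messages,
    custom_llm_provider="huggingface",
)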
@@ -3,9 +3,12 @@ import time
from concurrent.futures import ThreadPoolExecutor from concurrent.futures import ThreadPoolExecutor
import traceback import traceback
def testing_batch_completion(*args, **kwargs): def testing_batch_completion(*args, **kwargs):
try: try:
batch_models = args[0] if len(args) > 0 else kwargs.pop("models") ## expected input format- ["gpt-3.5-turbo", {"model": "qvv0xeq", "custom_llm_provider"="baseten"}...] batch_models = (
args[0] if len(args) > 0 else kwargs.pop("models")
) ## expected input format- ["gpt-3.5-turbo", {"model": "qvv0xeq", "custom_llm_provider"="baseten"}...]
batch_messages = args[1] if len(args) > 1 else kwargs.pop("messages") batch_messages = args[1] if len(args) > 1 else kwargs.pop("messages")
results = [] results = []
completions = [] completions = []
@@ -18,16 +21,32 @@ def testing_batch_completion(*args, **kwargs):
if len(args) > 0: if len(args) > 0:
args_modified[0] = model["model"] args_modified[0] = model["model"]
else: else:
kwargs_modified["model"] = model["model"] if isinstance(model, dict) and "model" in model else model # if model is a dictionary get it's value else assume it's a string kwargs_modified["model"] = (
kwargs_modified["custom_llm_provider"] = model["custom_llm_provider"] if isinstance(model, dict) and "custom_llm_provider" in model else None model["model"]
kwargs_modified["custom_api_base"] = model["custom_api_base"] if isinstance(model, dict) and "custom_api_base" in model else None if isinstance(model, dict) and "model" in model
else model
) # if model is a dictionary get its value else assume it's a string
kwargs_modified["custom_llm_provider"] = (
model["custom_llm_provider"]
if isinstance(model, dict) and "custom_llm_provider" in model
else None
)
kwargs_modified["custom_api_base"] = (
model["custom_api_base"]
if isinstance(model, dict) and "custom_api_base" in model
else None
)
for message_list in batch_messages: for message_list in batch_messages:
if len(args) > 1: if len(args) > 1:
args_modified[1] = message_list args_modified[1] = message_list
future = executor.submit(litellm.completion, *args_modified, **kwargs_modified) future = executor.submit(
litellm.completion, *args_modified, **kwargs_modified
)
else: else:
kwargs_modified["messages"] = message_list kwargs_modified["messages"] = message_list
future = executor.submit(litellm.completion, *args_modified, **kwargs_modified) future = executor.submit(
litellm.completion, *args_modified, **kwargs_modified
)
completions.append((future, message_list)) completions.append((future, message_list))
# Retrieve the results and calculate elapsed time for each completion call # Retrieve the results and calculate elapsed time for each completion call
@@ -38,17 +57,27 @@ def testing_batch_completion(*args, **kwargs):
result = future.result() result = future.result()
end_time = time.time() end_time = time.time()
elapsed_time = end_time - start_time elapsed_time = end_time - start_time
result_dict = {"status": "succeeded", "response": future.result(), "prompt": message_list, "response_time": elapsed_time} result_dict = {
"status": "succeeded",
"response": future.result(),
"prompt": message_list,
"response_time": elapsed_time,
}
results.append(result_dict) results.append(result_dict)
except Exception as e: except Exception as e:
end_time = time.time() end_time = time.time()
elapsed_time = end_time - start_time elapsed_time = end_time - start_time
result_dict = {"status": "failed", "response": e, "response_time": elapsed_time} result_dict = {
"status": "failed",
"response": e,
"response_time": elapsed_time,
}
results.append(result_dict) results.append(result_dict)
return results return results
except: except:
traceback.print_exc() traceback.print_exc()
def duration_test_model(original_function): def duration_test_model(original_function):
def wrapper_function(*args, **kwargs): def wrapper_function(*args, **kwargs):
# Code to be executed before the original function # Code to be executed before the original function
@@ -70,22 +99,39 @@ def duration_test_model(original_function):
# Return the wrapper function # Return the wrapper function
return wrapper_function return wrapper_function
@duration_test_model @duration_test_model
def load_test_model(models: list, prompt: str = None, num_calls: int = None): def load_test_model(models: list, prompt: str = None, num_calls: int = None):
test_calls = 100 test_calls = 100
if num_calls: if num_calls:
test_calls = num_calls test_calls = num_calls
input_prompt = prompt if prompt else "Hey, how's it going?" input_prompt = prompt if prompt else "Hey, how's it going?"
messages = [{"role": "user", "content": prompt}] if prompt else [{"role": "user", "content": input_prompt}] messages = (
full_message_list = [messages for _ in range(test_calls)] # call it as many times as set by user to load test models [{"role": "user", "content": prompt}]
if prompt
else [{"role": "user", "content": input_prompt}]
)
full_message_list = [
messages for _ in range(test_calls)
] # call it as many times as set by user to load test models
start_time = time.time() start_time = time.time()
try: try:
results = testing_batch_completion(models=models, messages=full_message_list) results = testing_batch_completion(models=models, messages=full_message_list)
end_time = time.time() end_time = time.time()
response_time = end_time - start_time response_time = end_time - start_time
return {"total_response_time": response_time, "calls_made": test_calls, "prompt": input_prompt, "results": results} return {
"total_response_time": response_time,
"calls_made": test_calls,
"prompt": input_prompt,
"results": results,
}
except Exception as e: except Exception as e:
traceback.print_exc() traceback.print_exc()
end_time = time.time() end_time = time.time()
response_time = end_time - start_time response_time = end_time - start_time
return {"total_response_time": response_time, "calls_made": test_calls, "prompt": input_prompt, "exception": e} return {
"total_response_time": response_time,
"calls_made": test_calls,
"prompt": input_prompt,
"exception": e,
}
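The helpers above (testing_batch_completion, duration_test_model, load_test_model) are meant for load-testing several models against several prompts. A sketch of the batch helper; the import path is an assumption for illustration.

# Import path assumed for illustration; the functions are defined in the file above.
from litellm.testing import testing_batch_completion

results = testing_batch_completion(
    models=["gpt-3.5-turbo", {"model": "command-nightly"}],
    messages=[[{"role": "user", "content": "Hey, how's it going?"}]],
)
for result in results:
    print(result["status"], result.get("response_time"))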
@@ -3,24 +3,34 @@
import sys, os import sys, os
import traceback import traceback
sys.path.insert(0, os.path.abspath('../..')) # Adds the parent directory to the system path
sys.path.insert(
0, os.path.abspath("../..")
) # Adds the parent directory to the system path
import litellm import litellm
from litellm import embedding, completion from litellm import embedding, completion
litellm.set_verbose = False litellm.set_verbose = False
def logger_fn(model_call_object: dict): def logger_fn(model_call_object: dict):
print(f"model call details: {model_call_object}") print(f"model call details: {model_call_object}")
user_message = "Hello, how are you?" user_message = "Hello, how are you?"
messages = [{ "content": user_message,"role": "user"}] messages = [{"content": user_message, "role": "user"}]
## Test 1: Setting key dynamically ## Test 1: Setting key dynamically
temp_key = os.environ.get("ANTHROPIC_API_KEY") temp_key = os.environ.get("ANTHROPIC_API_KEY")
os.environ["ANTHROPIC_API_KEY"] = "bad-key" os.environ["ANTHROPIC_API_KEY"] = "bad-key"
# test on openai completion call # test on openai completion call
try: try:
response = completion(model="claude-instant-1", messages=messages, logger_fn=logger_fn, api_key=temp_key) response = completion(
model="claude-instant-1",
messages=messages,
logger_fn=logger_fn,
api_key=temp_key,
)
print(f"response: {response}") print(f"response: {response}")
except: except:
print(f"error occurred: {traceback.format_exc()}") print(f"error occurred: {traceback.format_exc()}")
@@ -33,7 +43,9 @@ litellm.anthropic_key = os.environ.get("ANTHROPIC_API_KEY")
os.environ.pop("ANTHROPIC_API_KEY") os.environ.pop("ANTHROPIC_API_KEY")
# test on openai completion call # test on openai completion call
try: try:
response = completion(model="claude-instant-1", messages=messages, logger_fn=logger_fn) response = completion(
model="claude-instant-1", messages=messages, logger_fn=logger_fn
)
print(f"response: {response}") print(f"response: {response}")
except: except:
print(f"error occurred: {traceback.format_exc()}") print(f"error occurred: {traceback.format_exc()}")
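The test above exercises the key-resolution order used by the Anthropic branch (per-call api_key, then litellm.anthropic_key, then the environment variable). A condensed sketch:

import os

import litellm
from litellm import completion

messages = [{"content": "Hello, how are you?", "role": "user"}]

# 1. Per-call key, which takes precedence.
completion(model="claude-instant-1", messages=messages, api_key=os.environ.get("ANTHROPIC_API_KEY"))

# 2. Module-level attribute.
litellm.anthropic_key = os.environ.get("ANTHROPIC_API_KEY")
completion(model="claude-instant-1", messages=messages)

# 3. Environment variable alone (ANTHROPIC_API_KEY) when neither of the above is set.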
@@ -5,17 +5,22 @@ import sys, os
import pytest import pytest
import traceback import traceback
import asyncio import asyncio
sys.path.insert(0, os.path.abspath('../..')) # Adds the parent directory to the system path
sys.path.insert(
0, os.path.abspath("../..")
) # Adds the parent directory to the system path
from litellm import acompletion from litellm import acompletion
async def test_get_response(): async def test_get_response():
user_message = "Hello, how are you?" user_message = "Hello, how are you?"
messages = [{ "content": user_message,"role": "user"}] messages = [{"content": user_message, "role": "user"}]
try: try:
response = await acompletion(model="gpt-3.5-turbo", messages=messages) response = await acompletion(model="gpt-3.5-turbo", messages=messages)
except Exception as e: except Exception as e:
pytest.fail(f"error occurred: {e}") pytest.fail(f"error occurred: {e}")
return response return response
response = asyncio.run(test_get_response()) response = asyncio.run(test_get_response())
print(response) print(response)
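A condensed version of the async test above; acompletion simply schedules the synchronous completion() on an executor.

import asyncio

from litellm import acompletion

async def main():
    messages = [{"content": "Hello, how are you?", "role": "user"}]
    return await acompletion(model="gpt-3.5-turbo", messages=messages)

print(asyncio.run(main()))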
@@ -5,12 +5,13 @@
import sys, os import sys, os
import traceback import traceback
from dotenv import load_dotenv from dotenv import load_dotenv
load_dotenv() load_dotenv()
# Get the current directory of the script # Get the current directory of the script
current_dir = os.path.dirname(os.path.abspath(__file__)) current_dir = os.path.dirname(os.path.abspath(__file__))
# Get the parent directory by joining the current directory with '..' # Get the parent directory by joining the current directory with '..'
parent_dir = os.path.join(current_dir, '../..') parent_dir = os.path.join(current_dir, "../..")
# Add the parent directory to the system path # Add the parent directory to the system path
sys.path.append(parent_dir) sys.path.append(parent_dir)
@@ -26,7 +27,7 @@ litellm.failure_callback = ["slack", "sentry", "posthog"]
user_message = "Hello, how are you?" user_message = "Hello, how are you?"
messages = [{ "content": user_message,"role": "user"}] messages = [{"content": user_message, "role": "user"}]
model_val = None model_val = None
@@ -39,7 +40,7 @@ def test_completion_with_empty_model():
pass pass
#bad key # bad key
temp_key = os.environ.get("OPENAI_API_KEY") temp_key = os.environ.get("OPENAI_API_KEY")
os.environ["OPENAI_API_KEY"] = "bad-key" os.environ["OPENAI_API_KEY"] = "bad-key"
# test on openai completion call # test on openai completion call
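A sketch of the callback wiring the test above depends on; the handler names come from this commit, and each destination assumes its own credentials are configured in the environment.

import litellm
from litellm import completion

# Failure handlers named in the tests above; each needs its own credentials configured.
litellm.failure_callback = ["slack", "sentry", "posthog"]

try:
    # With a bad OPENAI_API_KEY this raises an OpenAI-style exception via exception_type().
    completion(model="gpt-3.5-turbo", messages=[{"role": "user", "content": "Hello"}])
except Exception as e:
    print(f"error occurred: {e}")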
@@ -3,7 +3,10 @@
import sys, os import sys, os
import traceback import traceback
sys.path.insert(0, os.path.abspath('../..')) # Adds the parent directory to the system path
sys.path.insert(
0, os.path.abspath("../..")
) # Adds the parent directory to the system path
import litellm import litellm
from litellm import batch_completion from litellm import batch_completion
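batch_completion, imported above, fans a list of conversations out over a thread pool (see its definition earlier in this commit). A sketch; passing the model as a keyword is an assumption about how the remaining kwargs are forwarded.

from litellm import batch_completion

# One entry per conversation; each entry is an OpenAI-style message list.
batch_messages = [
    [{"role": "user", "content": "Hello, how are you?"}],
    [{"role": "user", "content": "Tell me a joke."}],
]
responses = batch_completion(model="gpt-3.5-turbo", messages=batch_messages)
for r in responses:
    print(r["choices"][0]["message"]["content"])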
@@ -1,9 +1,13 @@
import sys, os import sys, os
import traceback import traceback
from dotenv import load_dotenv from dotenv import load_dotenv
load_dotenv() load_dotenv()
import os import os
sys.path.insert(0, os.path.abspath('../..')) # Adds the parent directory to the system path
sys.path.insert(
0, os.path.abspath("../..")
) # Adds the parent directory to the system path
import pytest import pytest
import litellm import litellm
from litellm import embedding, completion from litellm import embedding, completion
@@ -12,7 +16,6 @@ litellm.caching = True
messages = [{"role": "user", "content": "who is ishaan Github? "}] messages = [{"role": "user", "content": "who is ishaan Github? "}]
# test if response cached # test if response cached
def test_caching(): def test_caching():
try: try:
@@ -29,7 +32,3 @@ def test_caching():
litellm.caching = False litellm.caching = False
print(f"error occurred: {traceback.format_exc()}") print(f"error occurred: {traceback.format_exc()}")
pytest.fail(f"Error occurred: {e}") pytest.fail(f"Error occurred: {e}")
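A condensed version of the caching test above: enable the flag, repeat an identical call, then reset it.

import litellm
from litellm import completion

litellm.caching = True  # enable response caching, as in the test above

messages = [{"role": "user", "content": "who is ishaan Github? "}]
first = completion(model="gpt-3.5-turbo", messages=messages)
second = completion(model="gpt-3.5-turbo", messages=messages)  # may be served from the cache

litellm.caching = False  # reset, mirroring the test teardown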
@@ -5,7 +5,9 @@ import sys, os
import traceback import traceback
import pytest import pytest
sys.path.insert(0, os.path.abspath('../..')) # Adds the parent directory to the system path sys.path.insert(
0, os.path.abspath("../..")
) # Adds the parent directory to the system path
import litellm import litellm
from litellm import embedding, completion from litellm import embedding, completion
@@ -14,17 +16,22 @@ litellm.failure_callback = ["slack", "sentry", "posthog"]
litellm.set_verbose = True litellm.set_verbose = True
def logger_fn(model_call_object: dict): def logger_fn(model_call_object: dict):
# print(f"model call details: {model_call_object}") # print(f"model call details: {model_call_object}")
pass pass
user_message = "Hello, how are you?" user_message = "Hello, how are you?"
messages = [{ "content": user_message,"role": "user"}] messages = [{"content": user_message, "role": "user"}]
def test_completion_openai(): def test_completion_openai():
try: try:
print("running query") print("running query")
response = completion(model="gpt-3.5-turbo", messages=messages, logger_fn=logger_fn) response = completion(
model="gpt-3.5-turbo", messages=messages, logger_fn=logger_fn
)
print(f"response: {response}") print(f"response: {response}")
# Add any assertions here to check the response # Add any assertions here to check the response
except Exception as e: except Exception as e:
@@ -34,33 +41,46 @@ def test_completion_openai():
def test_completion_claude(): def test_completion_claude():
try: try:
response = completion(model="claude-instant-1", messages=messages, logger_fn=logger_fn) response = completion(
model="claude-instant-1", messages=messages, logger_fn=logger_fn
)
# Add any assertions here to check the response # Add any assertions here to check the response
except Exception as e: except Exception as e:
pytest.fail(f"Error occurred: {e}") pytest.fail(f"Error occurred: {e}")
def test_completion_non_openai(): def test_completion_non_openai():
try: try:
response = completion(model="command-nightly", messages=messages, logger_fn=logger_fn) response = completion(
model="command-nightly", messages=messages, logger_fn=logger_fn
)
# Add any assertions here to check the response # Add any assertions here to check the response
except Exception as e: except Exception as e:
pytest.fail(f"Error occurred: {e}") pytest.fail(f"Error occurred: {e}")
def test_embedding_openai(): def test_embedding_openai():
try: try:
response = embedding(model='text-embedding-ada-002', input=[user_message], logger_fn=logger_fn) response = embedding(
model="text-embedding-ada-002", input=[user_message], logger_fn=logger_fn
)
# Add any assertions here to check the response # Add any assertions here to check the response
print(f"response: {str(response)[:50]}") print(f"response: {str(response)[:50]}")
except Exception as e: except Exception as e:
pytest.fail(f"Error occurred: {e}") pytest.fail(f"Error occurred: {e}")
def test_bad_azure_embedding(): def test_bad_azure_embedding():
try: try:
response = embedding(model='chatgpt-test', input=[user_message], logger_fn=logger_fn) response = embedding(
model="chatgpt-test", input=[user_message], logger_fn=logger_fn
)
# Add any assertions here to check the response # Add any assertions here to check the response
print(f"response: {str(response)[:50]}") print(f"response: {str(response)[:50]}")
except Exception as e: except Exception as e:
pass pass
# def test_good_azure_embedding(): # def test_good_azure_embedding():
# try: # try:
# response = embedding(model='azure-embedding-model', input=[user_message], azure=True, logger_fn=logger_fn) # response = embedding(model='azure-embedding-model', input=[user_message], azure=True, logger_fn=logger_fn)
@@ -68,4 +88,3 @@ def test_bad_azure_embedding():
# print(f"response: {str(response)[:50]}") # print(f"response: {str(response)[:50]}")
# except Exception as e: # except Exception as e:
# pytest.fail(f"Error occurred: {e}") # pytest.fail(f"Error occurred: {e}")
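The embedding tests above go through the embedding() endpoint defined earlier in this commit; a minimal sketch, assuming OPENAI_API_KEY is set.

from litellm import embedding

response = embedding(model="text-embedding-ada-002", input=["Hello, how are you?"])
print(str(response)[:50])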
@@ -1,44 +1,58 @@
import sys, os import sys, os
import traceback import traceback
from dotenv import load_dotenv from dotenv import load_dotenv
load_dotenv() load_dotenv()
import os import os
sys.path.insert(0, os.path.abspath('../..')) # Adds the parent directory to the system path
sys.path.insert(
0, os.path.abspath("../..")
) # Adds the parent directory to the system path
import pytest import pytest
import litellm import litellm
from litellm import embedding, completion from litellm import embedding, completion
# from infisical import InfisicalClient # from infisical import InfisicalClient
# litellm.set_verbose = True # litellm.set_verbose = True
# litellm.secret_manager_client = InfisicalClient(token=os.environ["INFISICAL_TOKEN"]) # litellm.secret_manager_client = InfisicalClient(token=os.environ["INFISICAL_TOKEN"])
user_message = "Hello, whats the weather in San Francisco??" user_message = "Hello, whats the weather in San Francisco??"
messages = [{ "content": user_message,"role": "user"}] messages = [{"content": user_message, "role": "user"}]
def logger_fn(user_model_dict): def logger_fn(user_model_dict):
print(f"user_model_dict: {user_model_dict}") print(f"user_model_dict: {user_model_dict}")
def test_completion_claude(): def test_completion_claude():
try: try:
response = completion(model="claude-instant-1", messages=messages, logger_fn=logger_fn) response = completion(
model="claude-instant-1", messages=messages, logger_fn=logger_fn
)
# Add any assertions here to check the response # Add any assertions here to check the response
print(response) print(response)
except Exception as e: except Exception as e:
pytest.fail(f"Error occurred: {e}") pytest.fail(f"Error occurred: {e}")
def test_completion_claude_stream(): def test_completion_claude_stream():
try: try:
messages = [ messages = [
{"role": "system", "content": "You are a helpful assistant."}, {"role": "system", "content": "You are a helpful assistant."},
{"role": "user", "content": "how does a court case get to the Supreme Court?"} {
"role": "user",
"content": "how does a court case get to the Supreme Court?",
},
] ]
response = completion(model="claude-2", messages=messages, stream=True) response = completion(model="claude-2", messages=messages, stream=True)
# Add any assertions here to check the response # Add any assertions here to check the response
for chunk in response: for chunk in response:
print(chunk['choices'][0]['delta']) # same as openai format print(chunk["choices"][0]["delta"]) # same as openai format
except Exception as e: except Exception as e:
pytest.fail(f"Error occurred: {e}") pytest.fail(f"Error occurred: {e}")
# def test_completion_hf_api(): # def test_completion_hf_api():
# try: # try:
# user_message = "write some code to find the sum of two numbers" # user_message = "write some code to find the sum of two numbers"
@ -62,10 +76,12 @@ def test_completion_claude_stream():
def test_completion_cohere(): def test_completion_cohere():
try: try:
response = completion(model="command-nightly", messages=messages, max_tokens=100) response = completion(
model="command-nightly", messages=messages, max_tokens=100
)
# Add any assertions here to check the response # Add any assertions here to check the response
print(response) print(response)
response_str = response['choices'][0]['message']['content'] response_str = response["choices"][0]["message"]["content"]
print(f"str response{response_str}") print(f"str response{response_str}")
response_str_2 = response.choices[0].message.content response_str_2 = response.choices[0].message.content
if type(response_str) != str: if type(response_str) != str:
@ -75,24 +91,31 @@ def test_completion_cohere():
except Exception as e: except Exception as e:
pytest.fail(f"Error occurred: {e}") pytest.fail(f"Error occurred: {e}")
def test_completion_cohere_stream(): def test_completion_cohere_stream():
try: try:
messages = [ messages = [
{"role": "system", "content": "You are a helpful assistant."}, {"role": "system", "content": "You are a helpful assistant."},
{"role": "user", "content": "how does a court case get to the Supreme Court?"} {
"role": "user",
"content": "how does a court case get to the Supreme Court?",
},
] ]
response = completion(model="command-nightly", messages=messages, stream=True, max_tokens=50) response = completion(
model="command-nightly", messages=messages, stream=True, max_tokens=50
)
# Add any assertions here to check the response # Add any assertions here to check the response
for chunk in response: for chunk in response:
print(chunk['choices'][0]['delta']) # same as openai format print(chunk["choices"][0]["delta"]) # same as openai format
except Exception as e: except Exception as e:
pytest.fail(f"Error occurred: {e}") pytest.fail(f"Error occurred: {e}")
def test_completion_openai(): def test_completion_openai():
try: try:
response = completion(model="gpt-3.5-turbo", messages=messages) response = completion(model="gpt-3.5-turbo", messages=messages)
response_str = response['choices'][0]['message']['content'] response_str = response["choices"][0]["message"]["content"]
response_str_2 = response.choices[0].message.content response_str_2 = response.choices[0].message.content
assert response_str == response_str_2 assert response_str == response_str_2
assert type(response_str) == str assert type(response_str) == str
@ -100,6 +123,7 @@ def test_completion_openai():
except Exception as e: except Exception as e:
pytest.fail(f"Error occurred: {e}") pytest.fail(f"Error occurred: {e}")
def test_completion_text_openai(): def test_completion_text_openai():
try: try:
response = completion(model="text-davinci-003", messages=messages) response = completion(model="text-davinci-003", messages=messages)
@ -108,17 +132,31 @@ def test_completion_text_openai():
except Exception as e: except Exception as e:
pytest.fail(f"Error occurred: {e}") pytest.fail(f"Error occurred: {e}")
def test_completion_openai_with_optional_params(): def test_completion_openai_with_optional_params():
try: try:
response = completion(model="gpt-3.5-turbo", messages=messages, temperature=0.5, top_p=0.1, user="ishaan_dev@berri.ai") response = completion(
model="gpt-3.5-turbo",
messages=messages,
temperature=0.5,
top_p=0.1,
user="ishaan_dev@berri.ai",
)
# Add any assertions here to check the response # Add any assertions here to check the response
print(response) print(response)
except Exception as e: except Exception as e:
pytest.fail(f"Error occurred: {e}") pytest.fail(f"Error occurred: {e}")
def test_completion_openrouter(): def test_completion_openrouter():
try: try:
response = completion(model="google/palm-2-chat-bison", messages=messages, temperature=0.5, top_p=0.1, user="ishaan_dev@berri.ai") response = completion(
model="google/palm-2-chat-bison",
messages=messages,
temperature=0.5,
top_p=0.1,
user="ishaan_dev@berri.ai",
)
# Add any assertions here to check the response # Add any assertions here to check the response
print(response) print(response)
except Exception as e: except Exception as e:
@ -127,12 +165,23 @@ def test_completion_openrouter():
def test_completion_openai_with_more_optional_params(): def test_completion_openai_with_more_optional_params():
try: try:
response = completion(model="gpt-3.5-turbo", messages=messages, temperature=0.5, top_p=0.1, n=2, max_tokens=150, presence_penalty=0.5, frequency_penalty=-0.5, logit_bias={123: 5}, user="ishaan_dev@berri.ai") response = completion(
model="gpt-3.5-turbo",
messages=messages,
temperature=0.5,
top_p=0.1,
n=2,
max_tokens=150,
presence_penalty=0.5,
frequency_penalty=-0.5,
logit_bias={123: 5},
user="ishaan_dev@berri.ai",
)
# Add any assertions here to check the response # Add any assertions here to check the response
print(response) print(response)
response_str = response['choices'][0]['message']['content'] response_str = response["choices"][0]["message"]["content"]
response_str_2 = response.choices[0].message.content response_str_2 = response.choices[0].message.content
print(response['choices'][0]['message']['content']) print(response["choices"][0]["message"]["content"])
print(response.choices[0].message.content) print(response.choices[0].message.content)
if type(response_str) != str: if type(response_str) != str:
pytest.fail(f"Error occurred: {e}") pytest.fail(f"Error occurred: {e}")
@ -141,14 +190,28 @@ def test_completion_openai_with_more_optional_params():
except Exception as e: except Exception as e:
pytest.fail(f"Error occurred: {e}") pytest.fail(f"Error occurred: {e}")
def test_completion_openai_with_stream(): def test_completion_openai_with_stream():
try: try:
response = completion(model="gpt-3.5-turbo", messages=messages, temperature=0.5, top_p=0.1, n=2, max_tokens=150, presence_penalty=0.5, stream=True, frequency_penalty=-0.5, logit_bias={27000: 5}, user="ishaan_dev@berri.ai") response = completion(
model="gpt-3.5-turbo",
messages=messages,
temperature=0.5,
top_p=0.1,
n=2,
max_tokens=150,
presence_penalty=0.5,
stream=True,
frequency_penalty=-0.5,
logit_bias={27000: 5},
user="ishaan_dev@berri.ai",
)
# Add any assertions here to check the response # Add any assertions here to check the response
print(response) print(response)
except Exception as e: except Exception as e:
pytest.fail(f"Error occurred: {e}") pytest.fail(f"Error occurred: {e}")
def test_completion_openai_with_functions(): def test_completion_openai_with_functions():
function1 = [ function1 = [
{ {
@ -159,32 +222,38 @@ def test_completion_openai_with_functions():
"properties": { "properties": {
"location": { "location": {
"type": "string", "type": "string",
"description": "The city and state, e.g. San Francisco, CA" "description": "The city and state, e.g. San Francisco, CA",
}, },
"unit": { "unit": {"type": "string", "enum": ["celsius", "fahrenheit"]},
"type": "string", },
"enum": ["celsius", "fahrenheit"] "required": ["location"],
}
}, },
"required": ["location"]
}
} }
] ]
try: try:
response = completion(model="gpt-3.5-turbo", messages=messages, functions=function1) response = completion(
model="gpt-3.5-turbo", messages=messages, functions=function1
)
# Add any assertions here to check the response # Add any assertions here to check the response
print(response) print(response)
except Exception as e: except Exception as e:
pytest.fail(f"Error occurred: {e}") pytest.fail(f"Error occurred: {e}")
def test_completion_azure(): def test_completion_azure():
try: try:
response = completion(model="gpt-3.5-turbo", deployment_id="chatgpt-test", messages=messages, custom_llm_provider="azure") response = completion(
model="gpt-3.5-turbo",
deployment_id="chatgpt-test",
messages=messages,
custom_llm_provider="azure",
)
# Add any assertions here to check the response # Add any assertions here to check the response
print(response) print(response)
except Exception as e: except Exception as e:
pytest.fail(f"Error occurred: {e}") pytest.fail(f"Error occurred: {e}")
# Replicate API endpoints are unstable -> throw random CUDA errors -> this means our tests can fail even if our tests weren't incorrect. # Replicate API endpoints are unstable -> throw random CUDA errors -> this means our tests can fail even if our tests weren't incorrect.
def test_completion_replicate_llama_stream(): def test_completion_replicate_llama_stream():
model_name = "replicate/llama-2-70b-chat:2c1608e18606fad2812020dc541930f2d0495ce32eee50074220b87300bc16e1" model_name = "replicate/llama-2-70b-chat:2c1608e18606fad2812020dc541930f2d0495ce32eee50074220b87300bc16e1"
@ -197,23 +266,32 @@ def test_completion_replicate_llama_stream():
except Exception as e: except Exception as e:
pytest.fail(f"Error occurred: {e}") pytest.fail(f"Error occurred: {e}")
def test_completion_replicate_stability_stream(): def test_completion_replicate_stability_stream():
model_name = "stability-ai/stablelm-tuned-alpha-7b:c49dae362cbaecd2ceabb5bd34fdb68413c4ff775111fea065d259d577757beb" model_name = "stability-ai/stablelm-tuned-alpha-7b:c49dae362cbaecd2ceabb5bd34fdb68413c4ff775111fea065d259d577757beb"
try: try:
response = completion(model=model_name, messages=messages, stream=True, custom_llm_provider="replicate") response = completion(
model=model_name,
messages=messages,
stream=True,
custom_llm_provider="replicate",
)
# Add any assertions here to check the response # Add any assertions here to check the response
for chunk in response: for chunk in response:
print(chunk['choices'][0]['delta']) print(chunk["choices"][0]["delta"])
print(response) print(response)
except Exception as e: except Exception as e:
pytest.fail(f"Error occurred: {e}") pytest.fail(f"Error occurred: {e}")
def test_completion_replicate_stability(): def test_completion_replicate_stability():
model_name = "stability-ai/stablelm-tuned-alpha-7b:c49dae362cbaecd2ceabb5bd34fdb68413c4ff775111fea065d259d577757beb" model_name = "stability-ai/stablelm-tuned-alpha-7b:c49dae362cbaecd2ceabb5bd34fdb68413c4ff775111fea065d259d577757beb"
try: try:
response = completion(model=model_name, messages=messages, custom_llm_provider="replicate") response = completion(
model=model_name, messages=messages, custom_llm_provider="replicate"
)
# Add any assertions here to check the response # Add any assertions here to check the response
response_str = response['choices'][0]['message']['content'] response_str = response["choices"][0]["message"]["content"]
response_str_2 = response.choices[0].message.content response_str_2 = response.choices[0].message.content
print(response_str) print(response_str)
print(response_str_2) print(response_str_2)
@ -224,6 +302,7 @@ def test_completion_replicate_stability():
except Exception as e: except Exception as e:
pytest.fail(f"Error occurred: {e}") pytest.fail(f"Error occurred: {e}")
######## Test TogetherAI ######## ######## Test TogetherAI ########
def test_completion_together_ai(): def test_completion_together_ai():
model_name = "togethercomputer/llama-2-70b-chat" model_name = "togethercomputer/llama-2-70b-chat"
@ -234,15 +313,22 @@ def test_completion_together_ai():
except Exception as e: except Exception as e:
pytest.fail(f"Error occurred: {e}") pytest.fail(f"Error occurred: {e}")
def test_petals(): def test_petals():
model_name = "stabilityai/StableBeluga2" model_name = "stabilityai/StableBeluga2"
try: try:
response = completion(model=model_name, messages=messages, custom_llm_provider="petals", force_timeout=120) response = completion(
model=model_name,
messages=messages,
custom_llm_provider="petals",
force_timeout=120,
)
# Add any assertions here to check the response # Add any assertions here to check the response
print(response) print(response)
except Exception as e: except Exception as e:
pytest.fail(f"Error occurred: {e}") pytest.fail(f"Error occurred: {e}")
# def test_baseten_falcon_7bcompletion(): # def test_baseten_falcon_7bcompletion():
# model_name = "qvv0xeq" # model_name = "qvv0xeq"
# try: # try:
@ -290,7 +376,6 @@ def test_petals():
# pytest.fail(f"Error occurred: {e}") # pytest.fail(f"Error occurred: {e}")
#### Test A121 ################### #### Test A121 ###################
# def test_completion_ai21(): # def test_completion_ai21():
# model_name = "j2-light" # model_name = "j2-light"
@ -333,4 +418,3 @@ def test_petals():
# return # return
# test_completion_together_ai_stream() # test_completion_together_ai_stream()
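Several tests above assert that the dictionary-style and attribute-style views of a completion response agree. A minimal sketch of that pattern, using only the accessors already exercised in these tests, could look like this:

from litellm import completion

def get_reply_text(model: str, messages: list) -> str:
    response = completion(model=model, messages=messages)
    # Both access styles appear in the tests above and should return the same string.
    dict_style = response["choices"][0]["message"]["content"]
    attr_style = response.choices[0].message.content
    assert dict_style == attr_style
    return dict_style

# Example usage (requires the relevant provider key in the environment):
# print(get_reply_text("gpt-3.5-turbo", [{"content": "Hello, how are you?", "role": "user"}]))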


@@ -1,14 +1,21 @@
import sys, os
import traceback
from dotenv import load_dotenv

load_dotenv()
import os

sys.path.insert(
    0, os.path.abspath("../..")
)  # Adds the parent directory to the system path
import litellm
from litellm import completion

def logging_fn(model_call_dict):
    print(f"model call details: {model_call_dict}")

models = ["gorilla-7b-hf-v1", "gpt-4"]
custom_llm_provider = None
messages = [{"role": "user", "content": "Hey, how's it going?"}]
@@ -17,4 +24,10 @@ for model in models: # iterate through list
    if model == "gorilla-7b-hf-v1":
        custom_llm_provider = "custom_openai"
        custom_api_base = "http://zanino.millennium.berkeley.edu:8000/v1"
    completion(
        model=model,
        messages=messages,
        custom_llm_provider=custom_llm_provider,
        custom_api_base=custom_api_base,
        logger_fn=logging_fn,
    )


@@ -1,9 +1,10 @@
import sys, os
import traceback
import pytest

sys.path.insert(
    0, os.path.abspath("../..")
)  # Adds the parent directory to the system path
import litellm
from litellm import embedding, completion
from infisical import InfisicalClient
@@ -11,9 +12,12 @@ from infisical import InfisicalClient
# # litellm.set_verbose = True
# litellm.secret_manager_client = InfisicalClient(token=os.environ["INFISICAL_TOKEN"])

def test_openai_embedding():
    try:
        response = embedding(
            model="text-embedding-ada-002", input=["good morning from litellm"]
        )
        # Add any assertions here to check the response
        print(f"response: {str(response)}")
    except Exception as e:


@@ -2,9 +2,20 @@
import os
import sys
import traceback

sys.path.insert(
    0, os.path.abspath("../..")
)  # Adds the parent directory to the system path
import litellm
from litellm import (
    embedding,
    completion,
    AuthenticationError,
    InvalidRequestError,
    RateLimitError,
    ServiceUnavailableError,
    OpenAIError,
)
from concurrent.futures import ThreadPoolExecutor
import pytest
@@ -23,6 +34,8 @@ litellm.failure_callback = ["sentry"]
# models = ["gpt-3.5-turbo", "chatgpt-test", "claude-instant-1", "command-nightly"]
test_model = "claude-instant-1"
models = ["claude-instant-1"]

def logging_fn(model_call_dict):
    if "model" in model_call_dict:
        print(f"model_call_dict: {model_call_dict['model']}")
@@ -38,7 +51,12 @@ def test_context_window(model):
    try:
        model = "chatgpt-test"
        print(f"model: {model}")
        response = completion(
            model=model,
            messages=messages,
            custom_llm_provider="azure",
            logger_fn=logging_fn,
        )
        print(f"response: {response}")
    except InvalidRequestError as e:
        print(f"InvalidRequestError: {e.llm_provider}")
@@ -52,12 +70,15 @@ def test_context_window(model):
        print(f"Uncaught Exception - {e}")
        pytest.fail(f"Error occurred: {e}")
    return

test_context_window(test_model)

# Test 2: InvalidAuth Errors
@pytest.mark.parametrize("model", models)
def invalid_auth(model):  # set the model key to an invalid key, depending on the model
    messages = [{"content": "Hello, how are you?", "role": "user"}]
    temporary_key = None
    try:
        custom_llm_provider = None
@@ -74,15 +95,22 @@ def invalid_auth(model): # set the model key to an invalid key, depending on the
        elif model == "command-nightly":
            temporary_key = os.environ["COHERE_API_KEY"]
            os.environ["COHERE_API_KEY"] = "bad-key"
        elif (
            model
            == "replicate/llama-2-70b-chat:2c1608e18606fad2812020dc541930f2d0495ce32eee50074220b87300bc16e1"
        ):
            temporary_key = os.environ["REPLICATE_API_KEY"]
            os.environ["REPLICATE_API_KEY"] = "bad-key"
        print(f"model: {model}")
        response = completion(
            model=model, messages=messages, custom_llm_provider=custom_llm_provider
        )
        print(f"response: {response}")
    except AuthenticationError as e:
        print(f"AuthenticationError Caught Exception - {e.llm_provider}")
    except (
        OpenAIError
    ):  # is at least an openai error -> in case of random model errors - e.g. overloaded server
        print(f"OpenAIError Caught Exception - {e}")
    except Exception as e:
        print(type(e))
@@ -99,9 +127,14 @@ def invalid_auth(model): # set the model key to an invalid key, depending on the
            os.environ["ANTHROPIC_API_KEY"] = temporary_key
        elif model == "command-nightly":
            os.environ["COHERE_API_KEY"] = temporary_key
        elif (
            model
            == "replicate/llama-2-70b-chat:2c1608e18606fad2812020dc541930f2d0495ce32eee50074220b87300bc16e1"
        ):
            os.environ["REPLICATE_API_KEY"] = temporary_key
    return

invalid_auth(test_model)

# # Test 3: Rate Limit Errors
# def test_model(model):
@@ -142,5 +175,3 @@ invalid_auth(test_model)
#     accuracy_score = counts[True]/(counts[True] + counts[False])
#     print(f"accuracy_score: {accuracy_score}")
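The error-mapping tests above lean on litellm's typed exceptions and their llm_provider attribute. A hedged sketch of how a caller might branch on them, using only the classes imported at the top of this file, could look like this (the retry advice is an assumption, not part of this diff):

from litellm import (
    completion,
    AuthenticationError,
    InvalidRequestError,
    RateLimitError,
    ServiceUnavailableError,
    OpenAIError,
)

def safe_completion(model: str, messages: list):
    try:
        return completion(model=model, messages=messages)
    except AuthenticationError as e:
        print(f"bad credentials for provider: {e.llm_provider}")
    except InvalidRequestError as e:
        print(f"request rejected by provider: {e.llm_provider}")
    except (RateLimitError, ServiceUnavailableError) as e:
        print(f"transient provider issue, consider retrying: {e}")
    except OpenAIError as e:
        # Catch-all for other OpenAI-compatible errors, mirroring the tests above.
        print(f"provider error: {e}")
    return None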


@@ -5,7 +5,9 @@ import sys, os
import traceback
import pytest

sys.path.insert(
    0, os.path.abspath("../..")
)  # Adds the parent directory to the system path
import litellm
from litellm import embedding, completion
@@ -14,11 +16,15 @@ litellm.success_callback = ["helicone"]
litellm.set_verbose = True

user_message = "Hello, how are you?"
messages = [{"content": user_message, "role": "user"}]

# openai call
response = completion(
    model="gpt-3.5-turbo", messages=[{"role": "user", "content": "Hi 👋 - i'm openai"}]
)

# cohere call
response = completion(
    model="command-nightly", messages=[{"role": "user", "content": "Hi 👋 - i'm cohere"}]
)


@@ -1,6 +1,9 @@
import sys, os
import traceback

sys.path.insert(
    0, os.path.abspath("../..")
)  # Adds the parent directory to the system path
import litellm
from litellm import load_test_model, testing_batch_completion
@@ -16,7 +19,19 @@ from litellm import load_test_model, testing_batch_completion
# print(result)

## Quality Test across Model
models = [
    "gpt-3.5-turbo",
    "gpt-3.5-turbo-16k",
    "gpt-4",
    "claude-instant-1",
    {
        "model": "replicate/llama-2-70b-chat:58d078176e02c219e11eb4da5a02a7830a283b14cf8f94537af893ccff5ee781",
        "custom_llm_provider": "replicate",
    },
]
messages = [
    [{"role": "user", "content": "What is your name?"}],
    [{"role": "user", "content": "Hey, how's it going?"}],
]
result = testing_batch_completion(models=models, messages=messages)
print(result)


@@ -3,7 +3,10 @@
import sys, os
import traceback

sys.path.insert(
    0, os.path.abspath("../..")
)  # Adds the parent directory to the system path
import litellm
from litellm import embedding, completion
@@ -11,25 +14,29 @@ litellm.set_verbose = False
score = 0

def logger_fn(model_call_object: dict):
    print(f"model call details: {model_call_object}")

user_message = "Hello, how are you?"
messages = [{"content": user_message, "role": "user"}]

# test on openai completion call
try:
    response = completion(model="gpt-3.5-turbo", messages=messages, logger_fn=logger_fn)
    score += 1
except:
    print(f"error occurred: {traceback.format_exc()}")
    pass

# test on non-openai completion call
try:
    response = completion(
        model="claude-instant-1", messages=messages, logger_fn=logger_fn
    )
    print(f"claude response: {response}")
    score += 1
except:
    print(f"error occurred: {traceback.format_exc()}")
    pass


@@ -3,7 +3,10 @@
import sys, os
import traceback

sys.path.insert(
    0, os.path.abspath("../..")
)  # Adds the parent directory to the system path
import litellm
from litellm import embedding, completion
@@ -15,7 +18,7 @@ litellm.set_verbose = True
model_fallback_list = ["claude-instant-1", "gpt-3.5-turbo", "chatgpt-test"]

user_message = "Hello, how are you?"
messages = [{"content": user_message, "role": "user"}]

for model in model_fallback_list:
    try:


@@ -4,7 +4,10 @@
import sys, os
import traceback

sys.path.insert(
    0, os.path.abspath("../..")
)  # Adds the parent directory to the system path
import litellm
from litellm import embedding, completion
@@ -13,7 +16,7 @@ litellm.set_verbose = True
model_fallback_list = ["claude-instant-1", "gpt-3.5-turbo", "chatgpt-test"]

user_message = "Hello, how are you?"
messages = [{"content": user_message, "role": "user"}]

for model in model_fallback_list:
    try:
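Both fallback scripts above iterate over model_fallback_list and wrap each call in try/except, but the loop bodies are cut off by the hunk boundaries. A hypothetical sketch of such a loop is shown below; the break-on-first-success behaviour is an assumption for illustration, not something this diff shows.

from litellm import completion

model_fallback_list = ["claude-instant-1", "gpt-3.5-turbo", "chatgpt-test"]
messages = [{"content": "Hello, how are you?", "role": "user"}]

response = None
for model in model_fallback_list:
    try:
        response = completion(model=model, messages=messages)
        break  # assumption: stop at the first model that answers
    except Exception as e:
        print(f"{model} failed: {e}")

if response is not None:
    print(response["choices"][0]["message"]["content"])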


@@ -53,7 +53,6 @@
# # # return this generator to the client for streaming requests
# # async def get_response():
# # global generator
# # async for elem in generator:


@@ -12,7 +12,6 @@
# import asyncio
# user_message = "respond in 20 words. who are you?"
# messages = [{ "content": user_message,"role": "user"}]
@@ -45,8 +44,3 @@
#         pytest.fail(f"Error occurred: {e}")
# test_completion_ollama_stream()


@@ -4,7 +4,10 @@
import sys, os
import traceback

sys.path.insert(
    0, os.path.abspath("../..")
)  # Adds the parent directory to the system path
import litellm
from litellm import embedding, completion
from infisical import InfisicalClient
@@ -15,7 +18,7 @@ infisical_token = os.environ["INFISICAL_TOKEN"]
litellm.secret_manager_client = InfisicalClient(token=infisical_token)

user_message = "Hello, whats the weather in San Francisco??"
messages = [{"content": user_message, "role": "user"}]

def test_completion_openai():
@@ -28,5 +31,5 @@ def test_completion_openai():
        pytest.fail(f"Error occurred: {e}")
    litellm.secret_manager_client = None

test_completion_openai()


@@ -3,7 +3,10 @@
import sys, os
import traceback

sys.path.insert(
    0, os.path.abspath("../..")
)  # Adds the parent directory to the system path
import litellm
from litellm import completion
@@ -11,18 +14,22 @@ litellm.set_verbose = False
score = 0

def logger_fn(model_call_object: dict):
    print(f"model call details: {model_call_object}")

user_message = "Hello, how are you?"
messages = [{"content": user_message, "role": "user"}]

# test on anthropic completion call
try:
    response = completion(
        model="claude-instant-1", messages=messages, stream=True, logger_fn=logger_fn
    )
    for chunk in response:
        print(chunk["choices"][0]["delta"])
    score += 1
except:
    print(f"error occurred: {traceback.format_exc()}")
    pass
@@ -30,10 +37,17 @@ except:
# test on anthropic completion call
try:
    response = completion(
        model="meta-llama/Llama-2-7b-chat-hf",
        messages=messages,
        custom_llm_provider="huggingface",
        custom_api_base="https://s7c7gytn18vnu4tw.us-east-1.aws.endpoints.huggingface.cloud",
        stream=True,
        logger_fn=logger_fn,
    )
    for chunk in response:
        print(chunk["choices"][0]["delta"])
    score += 1
except:
    print(f"error occurred: {traceback.format_exc()}")
    pass
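Both streaming checks above only print each delta. A short sketch of accumulating the streamed chunks into a full reply is shown below; it assumes the OpenAI-style delta dict exposes an optional "content" field, while the tests themselves only show that chunk["choices"][0]["delta"] exists.

from litellm import completion

def stream_to_text(model: str, messages: list) -> str:
    response = completion(model=model, messages=messages, stream=True)
    pieces = []
    for chunk in response:
        delta = chunk["choices"][0]["delta"]
        # "content" may be missing on some chunks (e.g. role-only deltas); this is an assumption.
        if "content" in delta and delta["content"]:
            pieces.append(delta["content"])
    return "".join(pieces)

# Example usage:
# print(stream_to_text("claude-instant-1", [{"content": "Hello, how are you?", "role": "user"}]))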


@@ -3,10 +3,14 @@
import sys, os
import traceback

sys.path.insert(
    0, os.path.abspath("../..")
)  # Adds the parent directory to the system path
import time
from litellm import timeout

@timeout(10)
def stop_after_10_s(force_timeout=60):
    print("Stopping after 10 seconds")


@@ -11,9 +11,7 @@ from threading import Thread
from openai.error import Timeout

def timeout(timeout_duration: float = None, exception_to_raise=Timeout):
    """
    Wraps a function to raise the specified exception if execution time
    is greater than the specified timeout.
@@ -44,7 +42,9 @@ def timeout(
                result = future.result(timeout=local_timeout_duration)
            except futures.TimeoutError:
                thread.stop_loop()
                raise exception_to_raise(
                    f"A timeout error occurred. The function call took longer than {local_timeout_duration} second(s)."
                )
            thread.stop_loop()
            return result
@@ -59,7 +59,9 @@ def timeout(
                )
                return value
            except asyncio.TimeoutError:
                raise exception_to_raise(
                    f"A timeout error occurred. The function call took longer than {local_timeout_duration} second(s)."
                )

        if iscoroutinefunction(func):
            return async_wrapper
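The @timeout(10) usage in the test above, together with force_timeout arguments such as force_timeout=120 in test_petals, suggests the decorator's limit can also be overridden per call. A hedged usage sketch under that assumption:

import time

from openai.error import Timeout
from litellm import timeout

@timeout(10)  # raise openai.error.Timeout if execution exceeds 10 seconds
def slow_operation(force_timeout=60):
    time.sleep(2)
    return "done"

try:
    print(slow_operation())  # governed by the decorator's 10 second limit
    # Per-call override; assumed from stop_after_10_s(force_timeout=60) and test_petals(force_timeout=120).
    print(slow_operation(force_timeout=1))
except Timeout as e:
    print(f"timed out: {e}")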

File diff suppressed because it is too large.