forked from phoenix/litellm-mirror

add linting

commit 15b1da9dc8 (parent 8ef47524bf)
40 changed files with 3110 additions and 1709 deletions
@@ -1,16 +1,17 @@
import threading

success_callback = []
failure_callback = []
set_verbose=False
telemetry=True
max_tokens = 256  # OpenAI Defaults
set_verbose = False
telemetry = True
max_tokens = 256  # OpenAI Defaults
retry = True
api_key = None
openai_key = None
azure_key = None
anthropic_key = None
replicate_key = None
cohere_key = None
openai_key = None
azure_key = None
anthropic_key = None
replicate_key = None
cohere_key = None
openrouter_key = None
huggingface_key = None
vertex_project = None

@@ -19,33 +20,99 @@ caching = False
hugging_api_token = None
togetherai_api_key = None
model_cost = {
    "gpt-3.5-turbo": {"max_tokens": 4000, "input_cost_per_token": 0.0000015, "output_cost_per_token": 0.000002},
    "gpt-35-turbo": {"max_tokens": 4000, "input_cost_per_token": 0.0000015, "output_cost_per_token": 0.000002},  # azure model name
    "gpt-3.5-turbo-0613": {"max_tokens": 4000, "input_cost_per_token": 0.0000015, "output_cost_per_token": 0.000002},
    "gpt-3.5-turbo-0301": {"max_tokens": 4000, "input_cost_per_token": 0.0000015, "output_cost_per_token": 0.000002},
    "gpt-3.5-turbo-16k": {"max_tokens": 16000, "input_cost_per_token": 0.000003, "output_cost_per_token": 0.000004},
    "gpt-35-turbo-16k": {"max_tokens": 16000, "input_cost_per_token": 0.000003, "output_cost_per_token": 0.000004},  # azure model name
    "gpt-3.5-turbo-16k-0613": {"max_tokens": 16000, "input_cost_per_token": 0.000003, "output_cost_per_token": 0.000004},
    "gpt-4": {"max_tokens": 8000, "input_cost_per_token": 0.000003, "output_cost_per_token": 0.00006},
    "gpt-4-0613": {"max_tokens": 8000, "input_cost_per_token": 0.000003, "output_cost_per_token": 0.00006},
    "gpt-4-32k": {"max_tokens": 8000, "input_cost_per_token": 0.00006, "output_cost_per_token": 0.00012},
    "claude-instant-1": {"max_tokens": 100000, "input_cost_per_token": 0.00000163, "output_cost_per_token": 0.00000551},
    "claude-2": {"max_tokens": 100000, "input_cost_per_token": 0.00001102, "output_cost_per_token": 0.00003268},
    "text-bison-001": {"max_tokens": 8192, "input_cost_per_token": 0.000004, "output_cost_per_token": 0.000004},
    "chat-bison-001": {"max_tokens": 4096, "input_cost_per_token": 0.000002, "output_cost_per_token": 0.000002},
    "command-nightly": {"max_tokens": 4096, "input_cost_per_token": 0.000015, "output_cost_per_token": 0.000015},
    "gpt-3.5-turbo": {
        "max_tokens": 4000,
        "input_cost_per_token": 0.0000015,
        "output_cost_per_token": 0.000002,
    },
    "gpt-35-turbo": {
        "max_tokens": 4000,
        "input_cost_per_token": 0.0000015,
        "output_cost_per_token": 0.000002,
    },  # azure model name
    "gpt-3.5-turbo-0613": {
        "max_tokens": 4000,
        "input_cost_per_token": 0.0000015,
        "output_cost_per_token": 0.000002,
    },
    "gpt-3.5-turbo-0301": {
        "max_tokens": 4000,
        "input_cost_per_token": 0.0000015,
        "output_cost_per_token": 0.000002,
    },
    "gpt-3.5-turbo-16k": {
        "max_tokens": 16000,
        "input_cost_per_token": 0.000003,
        "output_cost_per_token": 0.000004,
    },
    "gpt-35-turbo-16k": {
        "max_tokens": 16000,
        "input_cost_per_token": 0.000003,
        "output_cost_per_token": 0.000004,
    },  # azure model name
    "gpt-3.5-turbo-16k-0613": {
        "max_tokens": 16000,
        "input_cost_per_token": 0.000003,
        "output_cost_per_token": 0.000004,
    },
    "gpt-4": {
        "max_tokens": 8000,
        "input_cost_per_token": 0.000003,
        "output_cost_per_token": 0.00006,
    },
    "gpt-4-0613": {
        "max_tokens": 8000,
        "input_cost_per_token": 0.000003,
        "output_cost_per_token": 0.00006,
    },
    "gpt-4-32k": {
        "max_tokens": 8000,
        "input_cost_per_token": 0.00006,
        "output_cost_per_token": 0.00012,
    },
    "claude-instant-1": {
        "max_tokens": 100000,
        "input_cost_per_token": 0.00000163,
        "output_cost_per_token": 0.00000551,
    },
    "claude-2": {
        "max_tokens": 100000,
        "input_cost_per_token": 0.00001102,
        "output_cost_per_token": 0.00003268,
    },
    "text-bison-001": {
        "max_tokens": 8192,
        "input_cost_per_token": 0.000004,
        "output_cost_per_token": 0.000004,
    },
    "chat-bison-001": {
        "max_tokens": 4096,
        "input_cost_per_token": 0.000002,
        "output_cost_per_token": 0.000002,
    },
    "command-nightly": {
        "max_tokens": 4096,
        "input_cost_per_token": 0.000015,
        "output_cost_per_token": 0.000015,
    },
}


####### THREAD-SPECIFIC DATA ###################
class MyLocal(threading.local):
    def __init__(self):
        self.user = "Hello World"


_thread_context = MyLocal()


def identify(event_details):
    # Store user in thread local data
    if "user" in event_details:
        _thread_context.user = event_details["user"]


####### ADDITIONAL PARAMS ################### configurable params if you use proxy models like Helicone, map spend to org id, etc.
api_base = None
headers = None

@@ -56,60 +123,48 @@ config_path = None
secret_manager_client = None
####### COMPLETION MODELS ###################
open_ai_chat_completion_models = [
    "gpt-4",
    "gpt-4-0613",
    "gpt-4-32k",
    "gpt-4-32k-0613",
    #################
    "gpt-3.5-turbo",
    "gpt-3.5-turbo-16k",
    "gpt-3.5-turbo-0613",
    "gpt-3.5-turbo-16k-0613",
]
open_ai_text_completion_models = [
    'text-davinci-003'
    "gpt-4",
    "gpt-4-0613",
    "gpt-4-32k",
    "gpt-4-32k-0613",
    #################
    "gpt-3.5-turbo",
    "gpt-3.5-turbo-16k",
    "gpt-3.5-turbo-0613",
    "gpt-3.5-turbo-16k-0613",
]
open_ai_text_completion_models = ["text-davinci-003"]

cohere_models = [
    'command-nightly',
    "command",
    "command-light",
    "command-medium-beta",
    "command-xlarge-beta"
    "command-nightly",
    "command",
    "command-light",
    "command-medium-beta",
    "command-xlarge-beta",
]

anthropic_models = [
    "claude-2",
    "claude-instant-1",
    "claude-instant-1.2"
]
anthropic_models = ["claude-2", "claude-instant-1", "claude-instant-1.2"]

replicate_models = [
    "replicate/"
]  # placeholder, to make sure we accept any replicate model in our model_list
]  # placeholder, to make sure we accept any replicate model in our model_list

openrouter_models = [
    'google/palm-2-codechat-bison',
    'google/palm-2-chat-bison',
    'openai/gpt-3.5-turbo',
    'openai/gpt-3.5-turbo-16k',
    'openai/gpt-4-32k',
    'anthropic/claude-2',
    'anthropic/claude-instant-v1',
    'meta-llama/llama-2-13b-chat',
    'meta-llama/llama-2-70b-chat'
    "google/palm-2-codechat-bison",
    "google/palm-2-chat-bison",
    "openai/gpt-3.5-turbo",
    "openai/gpt-3.5-turbo-16k",
    "openai/gpt-4-32k",
    "anthropic/claude-2",
    "anthropic/claude-instant-v1",
    "meta-llama/llama-2-13b-chat",
    "meta-llama/llama-2-70b-chat",
]

vertex_chat_models = [
    "chat-bison",
    "chat-bison@001"
]
vertex_chat_models = ["chat-bison", "chat-bison@001"]

vertex_text_models = [
    "text-bison",
    "text-bison@001"
]
vertex_text_models = ["text-bison", "text-bison@001"]

huggingface_models = [
    "meta-llama/Llama-2-7b-hf",

@@ -124,25 +179,56 @@ huggingface_models = [
    "meta-llama/Llama-2-13b-chat",
    "meta-llama/Llama-2-70b",
    "meta-llama/Llama-2-70b-chat",
]  # these have been tested on extensively. But by default all text2text-generation and text-generation models are supported by liteLLM. - https://docs.litellm.ai/docs/completion/supported
]  # these have been tested on extensively. But by default all text2text-generation and text-generation models are supported by liteLLM. - https://docs.litellm.ai/docs/completion/supported

ai21_models = [
    "j2-ultra",
    "j2-mid",
    "j2-light"
ai21_models = ["j2-ultra", "j2-mid", "j2-light"]

model_list = (
    open_ai_chat_completion_models
    + open_ai_text_completion_models
    + cohere_models
    + anthropic_models
    + replicate_models
    + openrouter_models
    + huggingface_models
    + vertex_chat_models
    + vertex_text_models
    + ai21_models
)

provider_list = [
    "openai",
    "cohere",
    "anthropic",
    "replicate",
    "huggingface",
    "together_ai",
    "openrouter",
    "vertex_ai",
    "ai21",
]

model_list = open_ai_chat_completion_models + open_ai_text_completion_models + cohere_models + anthropic_models + replicate_models + openrouter_models + huggingface_models + vertex_chat_models + vertex_text_models + ai21_models

provider_list = ["openai", "cohere", "anthropic", "replicate", "huggingface", "together_ai", "openrouter", "vertex_ai", "ai21"]
####### EMBEDDING MODELS ###################
open_ai_embedding_models = [
    'text-embedding-ada-002'
]
open_ai_embedding_models = ["text-embedding-ada-002"]

from .timeout import timeout
from .testing import *
from .utils import client, logging, exception_type, get_optional_params, modify_integration, token_counter, cost_per_token, completion_cost, get_litellm_params
from .utils import (
    client,
    logging,
    exception_type,
    get_optional_params,
    modify_integration,
    token_counter,
    cost_per_token,
    completion_cost,
    get_litellm_params,
)
from .main import *  # Import all the symbols from main.py
from .integrations import *
from openai.error import AuthenticationError, InvalidRequestError, RateLimitError, ServiceUnavailableError, OpenAIError
from openai.error import (
    AuthenticationError,
    InvalidRequestError,
    RateLimitError,
    ServiceUnavailableError,
    OpenAIError,
)
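Note: the model_cost map above is what drives cost tracking throughout the package. A minimal sketch of how a map like this can be used to price one completion from an OpenAI-style usage block; the estimate_cost helper is illustrative and not part of this diff:

model_cost = {"gpt-3.5-turbo": {"max_tokens": 4000, "input_cost_per_token": 0.0000015, "output_cost_per_token": 0.000002}}

def estimate_cost(model, usage):
    # usage is an OpenAI-style dict: {"prompt_tokens": ..., "completion_tokens": ...}
    rates = model_cost[model]
    prompt_cost = rates["input_cost_per_token"] * usage["prompt_tokens"]
    completion_cost = rates["output_cost_per_token"] * usage["completion_tokens"]
    return prompt_cost + completion_cost

print(estimate_cost("gpt-3.5-turbo", {"prompt_tokens": 100, "completion_tokens": 200}))  # 0.00055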
@@ -1,12 +1,21 @@
## LiteLLM versions of the OpenAI Exception Types
from openai.error import AuthenticationError, InvalidRequestError, RateLimitError, ServiceUnavailableError, OpenAIError
from openai.error import (
    AuthenticationError,
    InvalidRequestError,
    RateLimitError,
    ServiceUnavailableError,
    OpenAIError,
)


class AuthenticationError(AuthenticationError):
    def __init__(self, message, llm_provider):
        self.status_code = 401
        self.message = message
        self.llm_provider = llm_provider
        super().__init__(self.message)  # Call the base class constructor with the parameters it needs
        super().__init__(
            self.message
        )  # Call the base class constructor with the parameters it needs


class InvalidRequestError(InvalidRequestError):

@@ -15,7 +24,9 @@ class InvalidRequestError(InvalidRequestError):
        self.message = message
        self.model = model
        self.llm_provider = llm_provider
        super().__init__(self.message, f"{self.model}")  # Call the base class constructor with the parameters it needs
        super().__init__(
            self.message, f"{self.model}"
        )  # Call the base class constructor with the parameters it needs


class RateLimitError(RateLimitError):

@@ -23,21 +34,29 @@ class RateLimitError(RateLimitError):
        self.status_code = 429
        self.message = message
        self.llm_provider = llm_provider
        super().__init__(self.message)  # Call the base class constructor with the parameters it needs
        super().__init__(
            self.message
        )  # Call the base class constructor with the parameters it needs


class ServiceUnavailableError(ServiceUnavailableError):
    def __init__(self, message, llm_provider):
        self.status_code = 500
        self.message = message
        self.llm_provider = llm_provider
        super().__init__(self.message)  # Call the base class constructor with the parameters it needs
        super().__init__(
            self.message
        )  # Call the base class constructor with the parameters it needs


class OpenAIError(OpenAIError):
    def __init__(self, original_exception):
        self.status_code = original_exception.http_status
        super().__init__(http_body=original_exception.http_body,
            http_status=original_exception.http_status,
            json_body=original_exception.json_body,
            headers=original_exception.headers,
            code=original_exception.code)
        self.llm_provider = "openai"
        super().__init__(
            http_body=original_exception.http_body,
            http_status=original_exception.http_status,
            json_body=original_exception.json_body,
            headers=original_exception.headers,
            code=original_exception.code,
        )
        self.llm_provider = "openai"
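Note: a minimal sketch of how these wrapped exception types can be raised and caught; the failing call and its values are illustrative, not part of this diff:

def fake_provider_call():
    # stand-in for a provider call that rejects the API key
    raise AuthenticationError(message="invalid api key", llm_provider="openai")

try:
    fake_provider_call()
except AuthenticationError as e:
    print(e.status_code, e.llm_provider, e.message)  # 401 openai invalid api key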
@@ -1 +1 @@
from . import *
@@ -1,53 +1,121 @@
#### What this does ####
# On success + failure, log events to aispend.io
import dotenv, os
import requests

dotenv.load_dotenv()  # Loading env variables using dotenv
import traceback
import datetime

model_cost = {
    "gpt-3.5-turbo": {"max_tokens": 4000, "input_cost_per_token": 0.0000015, "output_cost_per_token": 0.000002},
    "gpt-35-turbo": {"max_tokens": 4000, "input_cost_per_token": 0.0000015, "output_cost_per_token": 0.000002},  # azure model name
    "gpt-3.5-turbo-0613": {"max_tokens": 4000, "input_cost_per_token": 0.0000015, "output_cost_per_token": 0.000002},
    "gpt-3.5-turbo-0301": {"max_tokens": 4000, "input_cost_per_token": 0.0000015, "output_cost_per_token": 0.000002},
    "gpt-3.5-turbo-16k": {"max_tokens": 16000, "input_cost_per_token": 0.000003, "output_cost_per_token": 0.000004},
    "gpt-35-turbo-16k": {"max_tokens": 16000, "input_cost_per_token": 0.000003, "output_cost_per_token": 0.000004},  # azure model name
    "gpt-3.5-turbo-16k-0613": {"max_tokens": 16000, "input_cost_per_token": 0.000003, "output_cost_per_token": 0.000004},
    "gpt-4": {"max_tokens": 8000, "input_cost_per_token": 0.000003, "output_cost_per_token": 0.00006},
    "gpt-4-0613": {"max_tokens": 8000, "input_cost_per_token": 0.000003, "output_cost_per_token": 0.00006},
    "gpt-4-32k": {"max_tokens": 8000, "input_cost_per_token": 0.00006, "output_cost_per_token": 0.00012},
    "claude-instant-1": {"max_tokens": 100000, "input_cost_per_token": 0.00000163, "output_cost_per_token": 0.00000551},
    "claude-2": {"max_tokens": 100000, "input_cost_per_token": 0.00001102, "output_cost_per_token": 0.00003268},
    "text-bison-001": {"max_tokens": 8192, "input_cost_per_token": 0.000004, "output_cost_per_token": 0.000004},
    "chat-bison-001": {"max_tokens": 4096, "input_cost_per_token": 0.000002, "output_cost_per_token": 0.000002},
    "command-nightly": {"max_tokens": 4096, "input_cost_per_token": 0.000015, "output_cost_per_token": 0.000015},
}


class AISpendLogger:
    # Class variables or attributes
    def __init__(self):
        # Instance variables
        self.account_id = os.getenv("AISPEND_ACCOUNT_ID")
        self.api_key = os.getenv("AISPEND_API_KEY")

    def price_calculator(self, model, response_obj, start_time, end_time):
        # try and find if the model is in the model_cost map
        # else default to the average of the costs
        prompt_tokens_cost_usd_dollar = 0
        completion_tokens_cost_usd_dollar = 0
        if model in model_cost:
            prompt_tokens_cost_usd_dollar = model_cost[model]["input_cost_per_token"] * response_obj["usage"]["prompt_tokens"]
            completion_tokens_cost_usd_dollar = model_cost[model]["output_cost_per_token"] * response_obj["usage"]["completion_tokens"]
            prompt_tokens_cost_usd_dollar = (
                model_cost[model]["input_cost_per_token"]
                * response_obj["usage"]["prompt_tokens"]
            )
            completion_tokens_cost_usd_dollar = (
                model_cost[model]["output_cost_per_token"]
                * response_obj["usage"]["completion_tokens"]
            )
        elif "replicate" in model:
            # replicate models are charged based on time
            # llama 2 runs on an nvidia a100 which costs $0.0032 per second - https://replicate.com/replicate/llama-2-70b-chat
            model_run_time = end_time - start_time  # assuming time in seconds
            cost_usd_dollar = model_run_time * 0.0032
            prompt_tokens_cost_usd_dollar = cost_usd_dollar / 2
            completion_tokens_cost_usd_dollar = cost_usd_dollar / 2
        else:
            # calculate average input cost
            input_cost_sum = 0
            output_cost_sum = 0
            for model in model_cost:

@@ -55,37 +123,52 @@ class AISpendLogger:
                output_cost_sum += model_cost[model]["output_cost_per_token"]
            avg_input_cost = input_cost_sum / len(model_cost.keys())
            avg_output_cost = output_cost_sum / len(model_cost.keys())
            prompt_tokens_cost_usd_dollar = model_cost[model]["input_cost_per_token"] * response_obj["usage"]["prompt_tokens"]
            completion_tokens_cost_usd_dollar = model_cost[model]["output_cost_per_token"] * response_obj["usage"]["completion_tokens"]
            prompt_tokens_cost_usd_dollar = (
                model_cost[model]["input_cost_per_token"]
                * response_obj["usage"]["prompt_tokens"]
            )
            completion_tokens_cost_usd_dollar = (
                model_cost[model]["output_cost_per_token"]
                * response_obj["usage"]["completion_tokens"]
            )
        return prompt_tokens_cost_usd_dollar, completion_tokens_cost_usd_dollar

    def log_event(self, model, response_obj, start_time, end_time, print_verbose):
        # Method definition
        try:
            print_verbose(f"AISpend Logging - Enters logging function for model {model}")
            print_verbose(
                f"AISpend Logging - Enters logging function for model {model}"
            )

            url = f"https://aispend.io/api/v1/accounts/{self.account_id}/data"
            headers = {
                'Authorization': f'Bearer {self.api_key}',
                'Content-Type': 'application/json'
                "Authorization": f"Bearer {self.api_key}",
                "Content-Type": "application/json",
            }

            response_timestamp = datetime.datetime.fromtimestamp(int(response_obj["created"])).strftime('%Y-%m-%d')
            response_timestamp = datetime.datetime.fromtimestamp(
                int(response_obj["created"])
            ).strftime("%Y-%m-%d")

            prompt_tokens_cost_usd_dollar, completion_tokens_cost_usd_dollar = self.price_calculator(model, response_obj, start_time, end_time)
            (
                prompt_tokens_cost_usd_dollar,
                completion_tokens_cost_usd_dollar,
            ) = self.price_calculator(model, response_obj, start_time, end_time)
            prompt_tokens_cost_usd_cent = prompt_tokens_cost_usd_dollar * 100
            completion_tokens_cost_usd_cent = completion_tokens_cost_usd_dollar * 100
            data = [{
                "requests": 1,
                "requests_context": 1,
                "context_tokens": response_obj["usage"]["prompt_tokens"],
                "requests_generated": 1,
                "generated_tokens": response_obj["usage"]["completion_tokens"],
                "recorded_date": response_timestamp,
                "model_id": response_obj["model"],
                "generated_tokens_cost_usd_cent": prompt_tokens_cost_usd_cent,
                "context_tokens_cost_usd_cent": completion_tokens_cost_usd_cent
            }]
            data = [
                {
                    "requests": 1,
                    "requests_context": 1,
                    "context_tokens": response_obj["usage"]["prompt_tokens"],
                    "requests_generated": 1,
                    "generated_tokens": response_obj["usage"]["completion_tokens"],
                    "recorded_date": response_timestamp,
                    "model_id": response_obj["model"],
                    "generated_tokens_cost_usd_cent": prompt_tokens_cost_usd_cent,
                    "context_tokens_cost_usd_cent": completion_tokens_cost_usd_cent,
                }
            ]

            print_verbose(f"AISpend Logging - final data object: {data}")
        except:
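Note: a standalone sketch of the average-cost fallback that price_calculator above applies when a model is missing from the cost map; the two-model map and the helper name are illustrative:

def average_rates(model_cost):
    # average the per-token rates across all known models
    n = len(model_cost)
    avg_input = sum(m["input_cost_per_token"] for m in model_cost.values()) / n
    avg_output = sum(m["output_cost_per_token"] for m in model_cost.values()) / n
    return avg_input, avg_output

avg_in, avg_out = average_rates({
    "a": {"input_cost_per_token": 0.00001, "output_cost_per_token": 0.00002},
    "b": {"input_cost_per_token": 0.00003, "output_cost_per_token": 0.00004},
})
print(avg_in, avg_out)  # 2e-05 3e-05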
@@ -1,52 +1,120 @@
#### What this does ####
# On success + failure, log events to aispend.io
import dotenv, os
import requests

dotenv.load_dotenv()  # Loading env variables using dotenv
import traceback
import datetime

model_cost = {
    "gpt-3.5-turbo": {"max_tokens": 4000, "input_cost_per_token": 0.0000015, "output_cost_per_token": 0.000002},
    "gpt-35-turbo": {"max_tokens": 4000, "input_cost_per_token": 0.0000015, "output_cost_per_token": 0.000002},  # azure model name
    "gpt-3.5-turbo-0613": {"max_tokens": 4000, "input_cost_per_token": 0.0000015, "output_cost_per_token": 0.000002},
    "gpt-3.5-turbo-0301": {"max_tokens": 4000, "input_cost_per_token": 0.0000015, "output_cost_per_token": 0.000002},
    "gpt-3.5-turbo-16k": {"max_tokens": 16000, "input_cost_per_token": 0.000003, "output_cost_per_token": 0.000004},
    "gpt-35-turbo-16k": {"max_tokens": 16000, "input_cost_per_token": 0.000003, "output_cost_per_token": 0.000004},  # azure model name
    "gpt-3.5-turbo-16k-0613": {"max_tokens": 16000, "input_cost_per_token": 0.000003, "output_cost_per_token": 0.000004},
    "gpt-4": {"max_tokens": 8000, "input_cost_per_token": 0.000003, "output_cost_per_token": 0.00006},
    "gpt-4-0613": {"max_tokens": 8000, "input_cost_per_token": 0.000003, "output_cost_per_token": 0.00006},
    "gpt-4-32k": {"max_tokens": 8000, "input_cost_per_token": 0.00006, "output_cost_per_token": 0.00012},
    "claude-instant-1": {"max_tokens": 100000, "input_cost_per_token": 0.00000163, "output_cost_per_token": 0.00000551},
    "claude-2": {"max_tokens": 100000, "input_cost_per_token": 0.00001102, "output_cost_per_token": 0.00003268},
    "text-bison-001": {"max_tokens": 8192, "input_cost_per_token": 0.000004, "output_cost_per_token": 0.000004},
    "chat-bison-001": {"max_tokens": 4096, "input_cost_per_token": 0.000002, "output_cost_per_token": 0.000002},
    "command-nightly": {"max_tokens": 4096, "input_cost_per_token": 0.000015, "output_cost_per_token": 0.000015},
}


class BerriSpendLogger:
    # Class variables or attributes
    def __init__(self):
        # Instance variables
        self.account_id = os.getenv("BERRISPEND_ACCOUNT_ID")

    def price_calculator(self, model, response_obj, start_time, end_time):
        # try and find if the model is in the model_cost map
        # else default to the average of the costs
        prompt_tokens_cost_usd_dollar = 0
        completion_tokens_cost_usd_dollar = 0
        if model in model_cost:
            prompt_tokens_cost_usd_dollar = model_cost[model]["input_cost_per_token"] * response_obj["usage"]["prompt_tokens"]
            completion_tokens_cost_usd_dollar = model_cost[model]["output_cost_per_token"] * response_obj["usage"]["completion_tokens"]
            prompt_tokens_cost_usd_dollar = (
                model_cost[model]["input_cost_per_token"]
                * response_obj["usage"]["prompt_tokens"]
            )
            completion_tokens_cost_usd_dollar = (
                model_cost[model]["output_cost_per_token"]
                * response_obj["usage"]["completion_tokens"]
            )
        elif "replicate" in model:
            # replicate models are charged based on time
            # llama 2 runs on an nvidia a100 which costs $0.0032 per second - https://replicate.com/replicate/llama-2-70b-chat
            model_run_time = end_time - start_time  # assuming time in seconds
            cost_usd_dollar = model_run_time * 0.0032
            prompt_tokens_cost_usd_dollar = cost_usd_dollar / 2
            completion_tokens_cost_usd_dollar = cost_usd_dollar / 2
        else:
            # calculate average input cost
            input_cost_sum = 0
            output_cost_sum = 0
            for model in model_cost:

@@ -54,42 +122,59 @@ class BerriSpendLogger:
                output_cost_sum += model_cost[model]["output_cost_per_token"]
            avg_input_cost = input_cost_sum / len(model_cost.keys())
            avg_output_cost = output_cost_sum / len(model_cost.keys())
            prompt_tokens_cost_usd_dollar = model_cost[model]["input_cost_per_token"] * response_obj["usage"]["prompt_tokens"]
            completion_tokens_cost_usd_dollar = model_cost[model]["output_cost_per_token"] * response_obj["usage"]["completion_tokens"]
            prompt_tokens_cost_usd_dollar = (
                model_cost[model]["input_cost_per_token"]
                * response_obj["usage"]["prompt_tokens"]
            )
            completion_tokens_cost_usd_dollar = (
                model_cost[model]["output_cost_per_token"]
                * response_obj["usage"]["completion_tokens"]
            )
        return prompt_tokens_cost_usd_dollar, completion_tokens_cost_usd_dollar

    def log_event(self, model, messages, response_obj, start_time, end_time, print_verbose):
    def log_event(
        self, model, messages, response_obj, start_time, end_time, print_verbose
    ):
        # Method definition
        try:
            print_verbose(f"BerriSpend Logging - Enters logging function for model {model}")
            print_verbose(
                f"BerriSpend Logging - Enters logging function for model {model}"
            )

            url = f"https://berrispend.berri.ai/spend"
            headers = {
                'Content-Type': 'application/json'
            }
            headers = {"Content-Type": "application/json"}

            prompt_tokens_cost_usd_dollar, completion_tokens_cost_usd_dollar = self.price_calculator(model, response_obj, start_time, end_time)
            total_cost = prompt_tokens_cost_usd_dollar + completion_tokens_cost_usd_dollar
            (
                prompt_tokens_cost_usd_dollar,
                completion_tokens_cost_usd_dollar,
            ) = self.price_calculator(model, response_obj, start_time, end_time)
            total_cost = (
                prompt_tokens_cost_usd_dollar + completion_tokens_cost_usd_dollar
            )

            response_time = (end_time-start_time).total_seconds()
            response_time = (end_time - start_time).total_seconds()
            if "response" in response_obj:
                data = [{
                    "response_time": response_time,
                    "model_id": response_obj["model"],
                    "total_cost": total_cost,
                    "messages": messages,
                    "response": response_obj['choices'][0]['message']['content'],
                    "account_id": self.account_id
                }]
                data = [
                    {
                        "response_time": response_time,
                        "model_id": response_obj["model"],
                        "total_cost": total_cost,
                        "messages": messages,
                        "response": response_obj["choices"][0]["message"]["content"],
                        "account_id": self.account_id,
                    }
                ]
            elif "error" in response_obj:
                data = [{
                    "response_time": response_time,
                    "model_id": response_obj["model"],
                    "total_cost": total_cost,
                    "messages": messages,
                    "error": response_obj['error'],
                    "account_id": self.account_id
                }]
                data = [
                    {
                        "response_time": response_time,
                        "model_id": response_obj["model"],
                        "total_cost": total_cost,
                        "messages": messages,
                        "error": response_obj["error"],
                        "account_id": self.account_id,
                    }
                ]

            print_verbose(f"BerriSpend Logging - final data object: {data}")
            response = requests.post(url, headers=headers, json=data)
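Note: response_time above is derived from two datetime objects; a minimal sketch of that arithmetic, with illustrative timestamps:

import datetime

start_time = datetime.datetime(2023, 8, 18, 12, 0, 0)
end_time = datetime.datetime(2023, 8, 18, 12, 0, 2, 500000)
response_time = (end_time - start_time).total_seconds()
print(response_time)  # 2.5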
@@ -2,19 +2,24 @@
# On success, logs events to Helicone
import dotenv, os
import requests

dotenv.load_dotenv()  # Loading env variables using dotenv
import traceback


class HeliconeLogger:
    # Class variables or attributes
    helicone_model_list = ["gpt", "claude"]

    def __init__(self):
        # Instance variables
        self.provider_url = "https://api.openai.com/v1"
        self.key = os.getenv('HELICONE_API_KEY')
        self.key = os.getenv("HELICONE_API_KEY")

    def claude_mapping(self, model, messages, response_obj):
        from anthropic import HUMAN_PROMPT, AI_PROMPT

        prompt = f"{HUMAN_PROMPT}"
        for message in messages:
            if "role" in message:
                if message["role"] == "user":

@@ -26,48 +31,84 @@ class HeliconeLogger:
        prompt += f"{AI_PROMPT}"
        claude_provider_request = {"model": model, "prompt": prompt}

        claude_response_obj = {"completion": response_obj['choices'][0]['message']['content'], "model": model, "stop_reason": "stop_sequence"}
        claude_response_obj = {
            "completion": response_obj["choices"][0]["message"]["content"],
            "model": model,
            "stop_reason": "stop_sequence",
        }

        return claude_provider_request, claude_response_obj

    def log_success(self, model, messages, response_obj, start_time, end_time, print_verbose):
    def log_success(
        self, model, messages, response_obj, start_time, end_time, print_verbose
    ):
        # Method definition
        try:
            print_verbose(f"Helicone Logging - Enters logging function for model {model}")
            model = model if any(accepted_model in model for accepted_model in self.helicone_model_list) else "gpt-3.5-turbo"
            print_verbose(
                f"Helicone Logging - Enters logging function for model {model}"
            )
            model = (
                model
                if any(
                    accepted_model in model
                    for accepted_model in self.helicone_model_list
                )
                else "gpt-3.5-turbo"
            )
            provider_request = {"model": model, "messages": messages}

            if "claude" in model:
                provider_request, response_obj = self.claude_mapping(model=model, messages=messages, response_obj=response_obj)
            if "claude" in model:
                provider_request, response_obj = self.claude_mapping(
                    model=model, messages=messages, response_obj=response_obj
                )

            providerResponse = {
                "json": response_obj,
                "headers": {"openai-version": "2020-10-01"},
                "status": 200
                "json": response_obj,
                "headers": {"openai-version": "2020-10-01"},
                "status": 200,
            }

            # Code to be executed
            url = "https://api.hconeai.com/oai/v1/log"
            headers = {
                'Authorization': f'Bearer {self.key}',
                'Content-Type': 'application/json'
                "Authorization": f"Bearer {self.key}",
                "Content-Type": "application/json",
            }
            start_time_seconds = int(start_time.timestamp())
            start_time_milliseconds = int((start_time.timestamp() - start_time_seconds) * 1000)
            start_time_milliseconds = int(
                (start_time.timestamp() - start_time_seconds) * 1000
            )
            end_time_seconds = int(end_time.timestamp())
            end_time_milliseconds = int((end_time.timestamp() - end_time_seconds) * 1000)
            end_time_milliseconds = int(
                (end_time.timestamp() - end_time_seconds) * 1000
            )
            data = {
                "providerRequest": {"url": self.provider_url, "json": provider_request, "meta": {"Helicone-Auth": f"Bearer {self.key}"}},
                "providerRequest": {
                    "url": self.provider_url,
                    "json": provider_request,
                    "meta": {"Helicone-Auth": f"Bearer {self.key}"},
                },
                "providerResponse": providerResponse,
                "timing": {"startTime": {"seconds": start_time_seconds, "milliseconds": start_time_milliseconds}, "endTime": {"seconds": end_time_seconds, "milliseconds": end_time_milliseconds}}  # {"seconds": .., "milliseconds": ..}
                "timing": {
                    "startTime": {
                        "seconds": start_time_seconds,
                        "milliseconds": start_time_milliseconds,
                    },
                    "endTime": {
                        "seconds": end_time_seconds,
                        "milliseconds": end_time_milliseconds,
                    },
                },  # {"seconds": .., "milliseconds": ..}
            }
            response = requests.post(url, headers=headers, json=data)
            if response.status_code == 200:
                print_verbose("Helicone Logging - Success!")
            else:
                print_verbose(f"Helicone Logging - Error Request was not successful. Status Code: {response.status_code}")
                print_verbose(
                    f"Helicone Logging - Error Request was not successful. Status Code: {response.status_code}"
                )
                print_verbose(f"Helicone Logging - Error {response.text}")
        except:
            # traceback.print_exc()
            print_verbose(f"Helicone Logging Error - {traceback.format_exc()}")
            pass
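Note: the Helicone timing block splits each datetime into whole seconds plus a millisecond remainder; a small sketch of that conversion (the timestamp is illustrative and the printed epoch value depends on the local timezone):

import datetime

start_time = datetime.datetime(2023, 8, 18, 12, 0, 1, 500000)
start_time_seconds = int(start_time.timestamp())
start_time_milliseconds = int((start_time.timestamp() - start_time_seconds) * 1000)
print(start_time_seconds, start_time_milliseconds)  # e.g. 1692360001 500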
@@ -3,31 +3,94 @@

import dotenv, os
import requests

dotenv.load_dotenv()  # Loading env variables using dotenv
import traceback
import datetime, subprocess, sys

model_cost = {
    "gpt-3.5-turbo": {"max_tokens": 4000, "input_cost_per_token": 0.0000015, "output_cost_per_token": 0.000002},
    "gpt-35-turbo": {"max_tokens": 4000, "input_cost_per_token": 0.0000015, "output_cost_per_token": 0.000002},  # azure model name
    "gpt-3.5-turbo-0613": {"max_tokens": 4000, "input_cost_per_token": 0.0000015, "output_cost_per_token": 0.000002},
    "gpt-3.5-turbo-0301": {"max_tokens": 4000, "input_cost_per_token": 0.0000015, "output_cost_per_token": 0.000002},
    "gpt-3.5-turbo-16k": {"max_tokens": 16000, "input_cost_per_token": 0.000003, "output_cost_per_token": 0.000004},
    "gpt-35-turbo-16k": {"max_tokens": 16000, "input_cost_per_token": 0.000003, "output_cost_per_token": 0.000004},  # azure model name
    "gpt-3.5-turbo-16k-0613": {"max_tokens": 16000, "input_cost_per_token": 0.000003, "output_cost_per_token": 0.000004},
    "gpt-4": {"max_tokens": 8000, "input_cost_per_token": 0.000003, "output_cost_per_token": 0.00006},
    "gpt-4-0613": {"max_tokens": 8000, "input_cost_per_token": 0.000003, "output_cost_per_token": 0.00006},
    "gpt-4-32k": {"max_tokens": 8000, "input_cost_per_token": 0.00006, "output_cost_per_token": 0.00012},
    "claude-instant-1": {"max_tokens": 100000, "input_cost_per_token": 0.00000163, "output_cost_per_token": 0.00000551},
    "claude-2": {"max_tokens": 100000, "input_cost_per_token": 0.00001102, "output_cost_per_token": 0.00003268},
    "text-bison-001": {"max_tokens": 8192, "input_cost_per_token": 0.000004, "output_cost_per_token": 0.000004},
    "chat-bison-001": {"max_tokens": 4096, "input_cost_per_token": 0.000002, "output_cost_per_token": 0.000002},
    "command-nightly": {"max_tokens": 4096, "input_cost_per_token": 0.000015, "output_cost_per_token": 0.000015},
}


class Supabase:
    # Class variables or attributes
    supabase_table_name = "request_logs"

    def __init__(self):
        # Instance variables
        self.supabase_url = os.getenv("SUPABASE_URL")

@@ -35,9 +98,11 @@ class Supabase:
        try:
            import supabase
        except ImportError:
            subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'supabase'])
            subprocess.check_call([sys.executable, "-m", "pip", "install", "supabase"])
            import supabase
        self.supabase_client = supabase.create_client(self.supabase_url, self.supabase_key)
        self.supabase_client = supabase.create_client(
            self.supabase_url, self.supabase_key
        )

    def price_calculator(self, model, response_obj, start_time, end_time):
        # try and find if the model is in the model_cost map

@@ -45,17 +110,23 @@ class Supabase:
        prompt_tokens_cost_usd_dollar = 0
        completion_tokens_cost_usd_dollar = 0
        if model in model_cost:
            prompt_tokens_cost_usd_dollar = model_cost[model]["input_cost_per_token"] * response_obj["usage"]["prompt_tokens"]
            completion_tokens_cost_usd_dollar = model_cost[model]["output_cost_per_token"] * response_obj["usage"]["completion_tokens"]
            prompt_tokens_cost_usd_dollar = (
                model_cost[model]["input_cost_per_token"]
                * response_obj["usage"]["prompt_tokens"]
            )
            completion_tokens_cost_usd_dollar = (
                model_cost[model]["output_cost_per_token"]
                * response_obj["usage"]["completion_tokens"]
            )
        elif "replicate" in model:
            # replicate models are charged based on time
            # llama 2 runs on an nvidia a100 which costs $0.0032 per second - https://replicate.com/replicate/llama-2-70b-chat
            model_run_time = end_time - start_time  # assuming time in seconds
            cost_usd_dollar = model_run_time * 0.0032
            prompt_tokens_cost_usd_dollar = cost_usd_dollar / 2
            completion_tokens_cost_usd_dollar = cost_usd_dollar / 2
        else:
            # calculate average input cost
            input_cost_sum = 0
            output_cost_sum = 0
            for model in model_cost:

@@ -63,41 +134,75 @@ class Supabase:
                output_cost_sum += model_cost[model]["output_cost_per_token"]
            avg_input_cost = input_cost_sum / len(model_cost.keys())
            avg_output_cost = output_cost_sum / len(model_cost.keys())
            prompt_tokens_cost_usd_dollar = model_cost[model]["input_cost_per_token"] * response_obj["usage"]["prompt_tokens"]
            completion_tokens_cost_usd_dollar = model_cost[model]["output_cost_per_token"] * response_obj["usage"]["completion_tokens"]
            prompt_tokens_cost_usd_dollar = (
                model_cost[model]["input_cost_per_token"]
                * response_obj["usage"]["prompt_tokens"]
            )
            completion_tokens_cost_usd_dollar = (
                model_cost[model]["output_cost_per_token"]
                * response_obj["usage"]["completion_tokens"]
            )
        return prompt_tokens_cost_usd_dollar, completion_tokens_cost_usd_dollar

    def log_event(self, model, messages, end_user, response_obj, start_time, end_time, print_verbose):
    def log_event(
        self,
        model,
        messages,
        end_user,
        response_obj,
        start_time,
        end_time,
        print_verbose,
    ):
        try:
            print_verbose(f"Supabase Logging - Enters logging function for model {model}, response_obj: {response_obj}")
            print_verbose(
                f"Supabase Logging - Enters logging function for model {model}, response_obj: {response_obj}"
            )

            prompt_tokens_cost_usd_dollar, completion_tokens_cost_usd_dollar = self.price_calculator(model, response_obj, start_time, end_time)
            total_cost = prompt_tokens_cost_usd_dollar + completion_tokens_cost_usd_dollar
            (
                prompt_tokens_cost_usd_dollar,
                completion_tokens_cost_usd_dollar,
            ) = self.price_calculator(model, response_obj, start_time, end_time)
            total_cost = (
                prompt_tokens_cost_usd_dollar + completion_tokens_cost_usd_dollar
            )

            response_time = (end_time-start_time).total_seconds()
            response_time = (end_time - start_time).total_seconds()
            if "choices" in response_obj:
                supabase_data_obj = {
                    "response_time": response_time,
                    "model": response_obj["model"],
                    "total_cost": total_cost,
                    "messages": messages,
                    "response": response_obj['choices'][0]['message']['content'],
                    "end_user": end_user
                    "response": response_obj["choices"][0]["message"]["content"],
                    "end_user": end_user,
                }
                print_verbose(f"Supabase Logging - final data object: {supabase_data_obj}")
                data, count = self.supabase_client.table(self.supabase_table_name).insert(supabase_data_obj).execute()
                print_verbose(
                    f"Supabase Logging - final data object: {supabase_data_obj}"
                )
                data, count = (
                    self.supabase_client.table(self.supabase_table_name)
                    .insert(supabase_data_obj)
                    .execute()
                )
            elif "error" in response_obj:
                supabase_data_obj = {
                    "response_time": response_time,
                    "model": response_obj["model"],
                    "total_cost": total_cost,
                    "messages": messages,
                    "error": response_obj['error'],
                    "end_user": end_user
                    "error": response_obj["error"],
                    "end_user": end_user,
                }
                print_verbose(f"Supabase Logging - final data object: {supabase_data_obj}")
                data, count = self.supabase_client.table(self.supabase_table_name).insert(supabase_data_obj).execute()
                print_verbose(
                    f"Supabase Logging - final data object: {supabase_data_obj}"
                )
                data, count = (
                    self.supabase_client.table(self.supabase_table_name)
                    .insert(supabase_data_obj)
                    .execute()
                )

        except:
            # traceback.print_exc()
            print_verbose(f"Supabase Logging Error - {traceback.format_exc()}")
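Note: a minimal sketch of the supabase-py calls used above, assuming SUPABASE_URL and SUPABASE_KEY are set in the environment; the row contents are illustrative and the table name mirrors the class attribute:

import os
import supabase

client = supabase.create_client(os.getenv("SUPABASE_URL"), os.getenv("SUPABASE_KEY"))
row = {
    "model": "gpt-3.5-turbo",
    "messages": [{"role": "user", "content": "hi"}],
    "response": "hello",
}
data, count = client.table("request_logs").insert(row).execute()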
@@ -1 +1 @@
from . import *
@@ -2,54 +2,77 @@ import os, json
from enum import Enum
import requests
from litellm import logging
import time
from typing import Callable
from litellm.utils import ModelResponse


class AnthropicConstants(Enum):
    HUMAN_PROMPT = "\n\nHuman:"
    AI_PROMPT = "\n\nAssistant:"


class AnthropicError(Exception):
    def __init__(self, status_code, message):
        self.status_code = status_code
        self.message = message
        super().__init__(self.message)  # Call the base class constructor with the parameters it needs
        super().__init__(
            self.message
        )  # Call the base class constructor with the parameters it needs


class AnthropicLLM:
    def __init__(self, encoding, default_max_tokens_to_sample, api_key=None):
        self.encoding = encoding
        self.default_max_tokens_to_sample = default_max_tokens_to_sample
        self.completion_url = "https://api.anthropic.com/v1/complete"
        self.api_key = api_key
        self.validate_environment(api_key=api_key)

    def validate_environment(self, api_key):  # set up the environment required to run the model
    def validate_environment(
        self, api_key
    ):  # set up the environment required to run the model
        # set the api key
        if self.api_key == None:
            raise ValueError("Missing Anthropic API Key - A call is being made to anthropic but no key is set either in the environment variables or via params")
            raise ValueError(
                "Missing Anthropic API Key - A call is being made to anthropic but no key is set either in the environment variables or via params"
            )
        self.api_key = api_key
        self.headers = {
            "accept": "application/json",
            "anthropic-version": "2023-06-01",
            "content-type": "application/json",
            "x-api-key": self.api_key
            "x-api-key": self.api_key,
        }

    def completion(self, model: str, messages: list, model_response: ModelResponse, print_verbose: Callable, optional_params=None, litellm_params=None, logger_fn=None):  # logic for parsing in - calling - parsing out model completion calls
    def completion(
        self,
        model: str,
        messages: list,
        model_response: ModelResponse,
        print_verbose: Callable,
        optional_params=None,
        litellm_params=None,
        logger_fn=None,
    ):  # logic for parsing in - calling - parsing out model completion calls
        model = model
        prompt = f"{AnthropicConstants.HUMAN_PROMPT.value}"
        for message in messages:
            if "role" in message:
                if message["role"] == "user":
                    prompt += f"{AnthropicConstants.HUMAN_PROMPT.value}{message['content']}"
                    prompt += (
                        f"{AnthropicConstants.HUMAN_PROMPT.value}{message['content']}"
                    )
                else:
                    prompt += f"{AnthropicConstants.AI_PROMPT.value}{message['content']}"
                    prompt += (
                        f"{AnthropicConstants.AI_PROMPT.value}{message['content']}"
                    )
            else:
                prompt += f"{AnthropicConstants.HUMAN_PROMPT.value}{message['content']}"
        prompt += f"{AnthropicConstants.AI_PROMPT.value}"
        if "max_tokens" in optional_params and optional_params["max_tokens"] != float('inf'):
        if "max_tokens" in optional_params and optional_params["max_tokens"] != float(
            "inf"
        ):
            max_tokens = optional_params["max_tokens"]
        else:
            max_tokens = self.default_max_tokens_to_sample

@@ -57,39 +80,66 @@ class AnthropicLLM:
            "model": model,
            "prompt": prompt,
            "max_tokens_to_sample": max_tokens,
            **optional_params
            **optional_params,
        }

        ## LOGGING
        logging(model=model, input=prompt, additional_args={"litellm_params": litellm_params, "optional_params": optional_params}, logger_fn=logger_fn)
        logging(
            model=model,
            input=prompt,
            additional_args={
                "litellm_params": litellm_params,
                "optional_params": optional_params,
            },
            logger_fn=logger_fn,
        )
        ## COMPLETION CALL
        response = requests.post(self.completion_url, headers=self.headers, data=json.dumps(data))
        response = requests.post(
            self.completion_url, headers=self.headers, data=json.dumps(data)
        )
        if "stream" in optional_params and optional_params["stream"] == True:
            return response.iter_lines()
        else:
            ## LOGGING
            logging(model=model, input=prompt, additional_args={"litellm_params": litellm_params, "optional_params": optional_params, "original_response": response.text}, logger_fn=logger_fn)
            logging(
                model=model,
                input=prompt,
                additional_args={
                    "litellm_params": litellm_params,
                    "optional_params": optional_params,
                    "original_response": response.text,
                },
                logger_fn=logger_fn,
            )
            print_verbose(f"raw model_response: {response.text}")
            ## RESPONSE OBJECT
            completion_response = response.json()
            if "error" in completion_response:
                raise AnthropicError(message=completion_response["error"], status_code=response.status_code)
                raise AnthropicError(
                    message=completion_response["error"],
                    status_code=response.status_code,
                )
            else:
                model_response["choices"][0]["message"]["content"] = completion_response["completion"]
                model_response["choices"][0]["message"][
                    "content"
                ] = completion_response["completion"]

            ## CALCULATING USAGE
            prompt_tokens = len(self.encoding.encode(prompt))  ##[TODO] use the anthropic tokenizer here
            completion_tokens = len(self.encoding.encode(model_response["choices"][0]["message"]["content"]))  ##[TODO] use the anthropic tokenizer here
            prompt_tokens = len(
                self.encoding.encode(prompt)
            )  ##[TODO] use the anthropic tokenizer here
            completion_tokens = len(
                self.encoding.encode(model_response["choices"][0]["message"]["content"])
            )  ##[TODO] use the anthropic tokenizer here

            model_response["created"] = time.time()
            model_response["model"] = model
            model_response["usage"] = {
                "prompt_tokens": prompt_tokens,
                "completion_tokens": completion_tokens,
                "total_tokens": prompt_tokens + completion_tokens
            }
                "total_tokens": prompt_tokens + completion_tokens,
            }
            return model_response

    def embedding():  # logic for parsing in - calling - parsing out model embedding calls
        pass
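Note: a standalone sketch of the Human/Assistant prompt format that completion() above builds before calling the Anthropic text-completion endpoint; the message list is illustrative:

HUMAN_PROMPT = "\n\nHuman:"
AI_PROMPT = "\n\nAssistant:"

messages = [
    {"role": "user", "content": "What is 2 + 2?"},
    {"role": "assistant", "content": "4"},
    {"role": "user", "content": "And times 3?"},
]
prompt = ""
for message in messages:
    if message["role"] == "user":
        prompt += f"{HUMAN_PROMPT} {message['content']}"
    else:
        prompt += f"{AI_PROMPT} {message['content']}"
prompt += AI_PROMPT  # the prompt must end with the Assistant turn so the model completes it
print(prompt)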
@@ -1,11 +1,12 @@
## This is a template base class to be used for adding new LLM providers via API calls

class BaseLLM():
    def validate_environment():  # set up the environment required to run the model
        pass

    def completion():  # logic for parsing in - calling - parsing out model completion calls
class BaseLLM:
    def validate_environment():  # set up the environment required to run the model
        pass

    def completion():  # logic for parsing in - calling - parsing out model completion calls
        pass

    def embedding():  # logic for parsing in - calling - parsing out model embedding calls
        pass
@@ -3,31 +3,47 @@ import os, json
from enum import Enum
import requests
from litellm import logging
import time
import time
from typing import Callable
from litellm.utils import ModelResponse


class HuggingfaceError(Exception):
def __init__(self, status_code, message):
self.status_code = status_code
self.message = message
super().__init__(self.message) # Call the base class constructor with the parameters it needs
super().__init__(
self.message
) # Call the base class constructor with the parameters it needs

class HuggingfaceRestAPILLM():

class HuggingfaceRestAPILLM:
def __init__(self, encoding, api_key=None) -> None:
self.encoding = encoding
self.validate_environment(api_key=api_key)

def validate_environment(self, api_key): # set up the environment required to run the model
def validate_environment(
self, api_key
): # set up the environment required to run the model
self.headers = {
"content-type": "application/json",
}
# get the api key if it exists in the environment or is passed in, but don't require it
self.api_key = api_key
if self.api_key != None:
self.headers["Authorization"] = f"Bearer {self.api_key}"
self.headers["Authorization"] = f"Bearer {self.api_key}"

def completion(self, model: str, messages: list, custom_api_base: str, model_response: ModelResponse, print_verbose: Callable, optional_params=None, litellm_params=None, logger_fn=None): # logic for parsing in - calling - parsing out model completion calls
def completion(
self,
model: str,
messages: list,
custom_api_base: str,
model_response: ModelResponse,
print_verbose: Callable,
optional_params=None,
litellm_params=None,
logger_fn=None,
): # logic for parsing in - calling - parsing out model completion calls
if custom_api_base:
completion_url = custom_api_base
elif "HF_API_BASE" in os.environ:
@@ -35,7 +51,9 @@ class HuggingfaceRestAPILLM():
else:
completion_url = f"https://api-inference.huggingface.co/models/{model}"
prompt = ""
if "meta-llama" in model and "chat" in model: # use the required special tokens for meta-llama - https://huggingface.co/blog/llama2#how-to-prompt-llama-2
if (
"meta-llama" in model and "chat" in model
): # use the required special tokens for meta-llama - https://huggingface.co/blog/llama2#how-to-prompt-llama-2
prompt = "<s>"
for message in messages:
if message["role"] == "system":
@@ -47,8 +65,8 @@ class HuggingfaceRestAPILLM():
else:
for message in messages:
prompt += f"{message['content']}"
### MAP INPUT PARAMS
# max tokens
### MAP INPUT PARAMS
# max tokens
if "max_tokens" in optional_params:
value = optional_params.pop("max_tokens")
optional_params["max_new_tokens"] = value
@@ -57,14 +75,33 @@ class HuggingfaceRestAPILLM():
# "parameters": optional_params
}
## LOGGING
logging(model=model, input=prompt, additional_args={"litellm_params": litellm_params, "optional_params": optional_params}, logger_fn=logger_fn)
logging(
model=model,
input=prompt,
additional_args={
"litellm_params": litellm_params,
"optional_params": optional_params,
},
logger_fn=logger_fn,
)
## COMPLETION CALL
response = requests.post(completion_url, headers=self.headers, data=json.dumps(data))
response = requests.post(
completion_url, headers=self.headers, data=json.dumps(data)
)
if "stream" in optional_params and optional_params["stream"] == True:
return response.iter_lines()
else:
## LOGGING
logging(model=model, input=prompt, additional_args={"litellm_params": litellm_params, "optional_params": optional_params, "original_response": response.text}, logger_fn=logger_fn)
logging(
model=model,
input=prompt,
additional_args={
"litellm_params": litellm_params,
"optional_params": optional_params,
"original_response": response.text,
},
logger_fn=logger_fn,
)
print_verbose(f"raw model_response: {response.text}")
## RESPONSE OBJECT
completion_response = response.json()
@@ -72,24 +109,32 @@ class HuggingfaceRestAPILLM():
if isinstance(completion_response, dict) and "error" in completion_response:
print_verbose(f"completion error: {completion_response['error']}")
print_verbose(f"response.status_code: {response.status_code}")
raise HuggingfaceError(message=completion_response["error"], status_code=response.status_code)
raise HuggingfaceError(
message=completion_response["error"],
status_code=response.status_code,
)
else:
model_response["choices"][0]["message"]["content"] = completion_response[0]["generated_text"]

model_response["choices"][0]["message"][
"content"
] = completion_response[0]["generated_text"]

## CALCULATING USAGE
prompt_tokens = len(self.encoding.encode(prompt)) ##[TODO] use the llama2 tokenizer here
completion_tokens = len(self.encoding.encode(model_response["choices"][0]["message"]["content"])) ##[TODO] use the llama2 tokenizer here


prompt_tokens = len(
self.encoding.encode(prompt)
) ##[TODO] use the llama2 tokenizer here
completion_tokens = len(
self.encoding.encode(model_response["choices"][0]["message"]["content"])
) ##[TODO] use the llama2 tokenizer here

model_response["created"] = time.time()
model_response["model"] = model
model_response["usage"] = {
"prompt_tokens": prompt_tokens,
"completion_tokens": completion_tokens,
"total_tokens": prompt_tokens + completion_tokens
}
"total_tokens": prompt_tokens + completion_tokens,
}
return model_response
pass

def embedding(): # logic for parsing in - calling - parsing out model embedding calls
pass
def embedding(): # logic for parsing in - calling - parsing out model embedding calls
pass
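A minimal sketch of how this Hugging Face handler is typically reached from litellm.completion, mirroring the streaming test later in this diff; the endpoint URL is a placeholder, and authentication is optional, as validate_environment above shows:

# Usage sketch for the Hugging Face REST handler (endpoint URL is a placeholder).
import litellm

messages = [{"content": "Hello, how are you?", "role": "user"}]
response = litellm.completion(
    model="meta-llama/Llama-2-7b-chat-hf",
    messages=messages,
    custom_llm_provider="huggingface",
    custom_api_base="https://<your-inference-endpoint>.endpoints.huggingface.cloud",  # placeholder
)
print(response["choices"][0]["message"]["content"])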
litellm/main.py: 1326 lines changed (file diff suppressed because it is too large)
@@ -1,53 +1,82 @@
import litellm
import time
import time
from concurrent.futures import ThreadPoolExecutor
import traceback


def testing_batch_completion(*args, **kwargs):
try:
batch_models = args[0] if len(args) > 0 else kwargs.pop("models") ## expected input format- ["gpt-3.5-turbo", {"model": "qvv0xeq", "custom_llm_provider"="baseten"}...]
batch_messages = args[1] if len(args) > 1 else kwargs.pop("messages")
results = []
completions = []
exceptions = []
times = []
with ThreadPoolExecutor() as executor:
for model in batch_models:
kwargs_modified = dict(kwargs)
args_modified = list(args)
if len(args) > 0:
args_modified[0] = model["model"]
else:
kwargs_modified["model"] = model["model"] if isinstance(model, dict) and "model" in model else model # if model is a dictionary get it's value else assume it's a string
kwargs_modified["custom_llm_provider"] = model["custom_llm_provider"] if isinstance(model, dict) and "custom_llm_provider" in model else None
kwargs_modified["custom_api_base"] = model["custom_api_base"] if isinstance(model, dict) and "custom_api_base" in model else None
for message_list in batch_messages:
if len(args) > 1:
args_modified[1] = message_list
future = executor.submit(litellm.completion, *args_modified, **kwargs_modified)
try:
batch_models = (
args[0] if len(args) > 0 else kwargs.pop("models")
) ## expected input format- ["gpt-3.5-turbo", {"model": "qvv0xeq", "custom_llm_provider"="baseten"}...]
batch_messages = args[1] if len(args) > 1 else kwargs.pop("messages")
results = []
completions = []
exceptions = []
times = []
with ThreadPoolExecutor() as executor:
for model in batch_models:
kwargs_modified = dict(kwargs)
args_modified = list(args)
if len(args) > 0:
args_modified[0] = model["model"]
else:
kwargs_modified["messages"] = message_list
future = executor.submit(litellm.completion, *args_modified, **kwargs_modified)
completions.append((future, message_list))

# Retrieve the results and calculate elapsed time for each completion call
for completion in completions:
future, message_list = completion
start_time = time.time()
try:
result = future.result()
end_time = time.time()
elapsed_time = end_time - start_time
result_dict = {"status": "succeeded", "response": future.result(), "prompt": message_list, "response_time": elapsed_time}
results.append(result_dict)
except Exception as e:
end_time = time.time()
elapsed_time = end_time - start_time
result_dict = {"status": "failed", "response": e, "response_time": elapsed_time}
results.append(result_dict)
return results
except:
traceback.print_exc()
kwargs_modified["model"] = (
model["model"]
if isinstance(model, dict) and "model" in model
else model
) # if model is a dictionary get it's value else assume it's a string
kwargs_modified["custom_llm_provider"] = (
model["custom_llm_provider"]
if isinstance(model, dict) and "custom_llm_provider" in model
else None
)
kwargs_modified["custom_api_base"] = (
model["custom_api_base"]
if isinstance(model, dict) and "custom_api_base" in model
else None
)
for message_list in batch_messages:
if len(args) > 1:
args_modified[1] = message_list
future = executor.submit(
litellm.completion, *args_modified, **kwargs_modified
)
else:
kwargs_modified["messages"] = message_list
future = executor.submit(
litellm.completion, *args_modified, **kwargs_modified
)
completions.append((future, message_list))

# Retrieve the results and calculate elapsed time for each completion call
for completion in completions:
future, message_list = completion
start_time = time.time()
try:
result = future.result()
end_time = time.time()
elapsed_time = end_time - start_time
result_dict = {
"status": "succeeded",
"response": future.result(),
"prompt": message_list,
"response_time": elapsed_time,
}
results.append(result_dict)
except Exception as e:
end_time = time.time()
elapsed_time = end_time - start_time
result_dict = {
"status": "failed",
"response": e,
"response_time": elapsed_time,
}
results.append(result_dict)
return results
except:
traceback.print_exc()


def duration_test_model(original_function):
def wrapper_function(*args, **kwargs):
@@ -70,22 +99,39 @@ def duration_test_model(original_function):
# Return the wrapper function
return wrapper_function


@duration_test_model
def load_test_model(models: list, prompt: str = None, num_calls: int = None):
test_calls = 100
if num_calls:
test_calls = num_calls
input_prompt = prompt if prompt else "Hey, how's it going?"
messages = [{"role": "user", "content": prompt}] if prompt else [{"role": "user", "content": input_prompt}]
full_message_list = [messages for _ in range(test_calls)] # call it as many times as set by user to load test models
start_time = time.time()
try:
results = testing_batch_completion(models=models, messages=full_message_list)
end_time = time.time()
response_time = end_time - start_time
return {"total_response_time": response_time, "calls_made": test_calls, "prompt": input_prompt, "results": results}
except Exception as e:
traceback.print_exc()
end_time = time.time()
response_time = end_time - start_time
return {"total_response_time": response_time, "calls_made": test_calls, "prompt": input_prompt, "exception": e}
test_calls = 100
if num_calls:
test_calls = num_calls
input_prompt = prompt if prompt else "Hey, how's it going?"
messages = (
[{"role": "user", "content": prompt}]
if prompt
else [{"role": "user", "content": input_prompt}]
)
full_message_list = [
messages for _ in range(test_calls)
] # call it as many times as set by user to load test models
start_time = time.time()
try:
results = testing_batch_completion(models=models, messages=full_message_list)
end_time = time.time()
response_time = end_time - start_time
return {
"total_response_time": response_time,
"calls_made": test_calls,
"prompt": input_prompt,
"results": results,
}
except Exception as e:
traceback.print_exc()
end_time = time.time()
response_time = end_time - start_time
return {
"total_response_time": response_time,
"calls_made": test_calls,
"prompt": input_prompt,
"exception": e,
}
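A short usage sketch for the helpers above, mirroring the quality test later in this diff; the model list and prompts are illustrative and assume the corresponding API keys are set:

# Sketch: batch-test several models against several prompts (keys assumed to be set).
from litellm import testing_batch_completion

models = ["gpt-3.5-turbo", "claude-instant-1"]
messages = [
    [{"role": "user", "content": "What is your name?"}],
    [{"role": "user", "content": "Hey, how's it going?"}],
]
results = testing_batch_completion(models=models, messages=messages)
for result in results:
    # each result_dict carries "status", "response" and "response_time" (see above)
    print(result["status"], result["response_time"])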
@@ -3,27 +3,37 @@

import sys, os
import traceback
sys.path.insert(0, os.path.abspath('../..')) # Adds the parent directory to the system path

sys.path.insert(
0, os.path.abspath("../..")
) # Adds the parent directory to the system path
import litellm
from litellm import embedding, completion

litellm.set_verbose = False


def logger_fn(model_call_object: dict):
print(f"model call details: {model_call_object}")


user_message = "Hello, how are you?"
messages = [{ "content": user_message,"role": "user"}]
messages = [{"content": user_message, "role": "user"}]

## Test 1: Setting key dynamically
temp_key = os.environ.get("ANTHROPIC_API_KEY")
os.environ["ANTHROPIC_API_KEY"] = "bad-key"
# test on openai completion call
# test on openai completion call
try:
response = completion(model="claude-instant-1", messages=messages, logger_fn=logger_fn, api_key=temp_key)
response = completion(
model="claude-instant-1",
messages=messages,
logger_fn=logger_fn,
api_key=temp_key,
)
print(f"response: {response}")
except:
print(f"error occurred: {traceback.format_exc()}")
print(f"error occurred: {traceback.format_exc()}")
pass
os.environ["ANTHROPIC_API_KEY"] = temp_key

@@ -31,11 +41,13 @@ os.environ["ANTHROPIC_API_KEY"] = temp_key
## Test 2: Setting key via __init__ params
litellm.anthropic_key = os.environ.get("ANTHROPIC_API_KEY")
os.environ.pop("ANTHROPIC_API_KEY")
# test on openai completion call
# test on openai completion call
try:
response = completion(model="claude-instant-1", messages=messages, logger_fn=logger_fn)
response = completion(
model="claude-instant-1", messages=messages, logger_fn=logger_fn
)
print(f"response: {response}")
except:
print(f"error occurred: {traceback.format_exc()}")
print(f"error occurred: {traceback.format_exc()}")
pass
os.environ["ANTHROPIC_API_KEY"] = temp_key
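In short, the test above covers the two ways a provider key can be supplied besides the environment variable; a compact sketch with placeholder key values:

# Sketch of the key-passing styles exercised above (key values are placeholders).
import litellm
from litellm import completion

messages = [{"content": "Hello, how are you?", "role": "user"}]

# Per-call: api_key overrides whatever is currently in the environment.
response = completion(model="claude-instant-1", messages=messages, api_key="sk-ant-placeholder")

# Module-level: picked up by subsequent Anthropic calls.
litellm.anthropic_key = "sk-ant-placeholder"
response = completion(model="claude-instant-1", messages=messages)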
@@ -5,17 +5,22 @@ import sys, os
import pytest
import traceback
import asyncio
sys.path.insert(0, os.path.abspath('../..')) # Adds the parent directory to the system path

sys.path.insert(
0, os.path.abspath("../..")
) # Adds the parent directory to the system path
from litellm import acompletion


async def test_get_response():
user_message = "Hello, how are you?"
messages = [{ "content": user_message,"role": "user"}]
messages = [{"content": user_message, "role": "user"}]
try:
response = await acompletion(model="gpt-3.5-turbo", messages=messages)
except Exception as e:
pytest.fail(f"error occurred: {e}")
return response


response = asyncio.run(test_get_response())
print(response)
print(response)
@@ -1,16 +1,17 @@
#### What this tests ####
# This tests chaos monkeys - if random parts of the system are broken / things aren't sent correctly - what happens.
# Expect to add more edge cases to this over time.
# Expect to add more edge cases to this over time.

import sys, os
import traceback
from dotenv import load_dotenv

load_dotenv()
# Get the current directory of the script
current_dir = os.path.dirname(os.path.abspath(__file__))

# Get the parent directory by joining the current directory with '..'
parent_dir = os.path.join(current_dir, '../..')
parent_dir = os.path.join(current_dir, "../..")

# Add the parent directory to the system path
sys.path.append(parent_dir)
@@ -26,7 +27,7 @@ litellm.failure_callback = ["slack", "sentry", "posthog"]


user_message = "Hello, how are you?"
messages = [{ "content": user_message,"role": "user"}]
messages = [{"content": user_message, "role": "user"}]
model_val = None


@@ -35,18 +36,18 @@ def test_completion_with_empty_model():
try:
response = completion(model=model_val, messages=messages)
except Exception as e:
print(f"error occurred: {e}")
print(f"error occurred: {e}")
pass


#bad key
# bad key
temp_key = os.environ.get("OPENAI_API_KEY")
os.environ["OPENAI_API_KEY"] = "bad-key"
# test on openai completion call
# test on openai completion call
try:
response = completion(model="gpt-3.5-turbo", messages=messages)
print(f"response: {response}")
except:
print(f"error occurred: {traceback.format_exc()}")
print(f"error occurred: {traceback.format_exc()}")
pass
os.environ["OPENAI_API_KEY"] = temp_key
os.environ["OPENAI_API_KEY"] = temp_key
@@ -3,7 +3,10 @@

import sys, os
import traceback
sys.path.insert(0, os.path.abspath('../..')) # Adds the parent directory to the system path

sys.path.insert(
0, os.path.abspath("../..")
) # Adds the parent directory to the system path
import litellm
from litellm import batch_completion

@@ -14,4 +17,4 @@ model = "gpt-3.5-turbo"

result = batch_completion(model=model, messages=messages)
print(result)
print(len(result))
print(len(result))
@ -19,7 +19,7 @@
|
|||
|
||||
|
||||
# #openai call
|
||||
# response = completion(model="gpt-3.5-turbo", messages=[{"role": "user", "content": "Hi 👋 - i'm openai"}])
|
||||
# response = completion(model="gpt-3.5-turbo", messages=[{"role": "user", "content": "Hi 👋 - i'm openai"}])
|
||||
|
||||
# #bad request call
|
||||
# response = completion(model="chatgpt-test", messages=[{"role": "user", "content": "Hi 👋 - i'm a bad request"}])
|
||||
# response = completion(model="chatgpt-test", messages=[{"role": "user", "content": "Hi 👋 - i'm a bad request"}])
|
||||
|
|
|
@ -1,9 +1,13 @@
|
|||
import sys, os
|
||||
import traceback
|
||||
from dotenv import load_dotenv
|
||||
|
||||
load_dotenv()
|
||||
import os
|
||||
sys.path.insert(0, os.path.abspath('../..')) # Adds the parent directory to the system path
|
||||
|
||||
sys.path.insert(
|
||||
0, os.path.abspath("../..")
|
||||
) # Adds the parent directory to the system path
|
||||
import pytest
|
||||
import litellm
|
||||
from litellm import embedding, completion
|
||||
|
@ -12,7 +16,6 @@ litellm.caching = True
|
|||
messages = [{"role": "user", "content": "who is ishaan Github? "}]
|
||||
|
||||
|
||||
|
||||
# test if response cached
|
||||
def test_caching():
|
||||
try:
|
||||
|
@ -27,9 +30,5 @@ def test_caching():
|
|||
pytest.fail(f"Error occurred: {e}")
|
||||
except Exception as e:
|
||||
litellm.caching = False
|
||||
print(f"error occurred: {traceback.format_exc()}")
|
||||
print(f"error occurred: {traceback.format_exc()}")
|
||||
pytest.fail(f"Error occurred: {e}")
|
||||
|
||||
|
||||
|
||||
|
||||
|
|
|
@ -5,7 +5,9 @@ import sys, os
|
|||
import traceback
|
||||
import pytest
|
||||
|
||||
sys.path.insert(0, os.path.abspath('../..')) # Adds the parent directory to the system path
|
||||
sys.path.insert(
|
||||
0, os.path.abspath("../..")
|
||||
) # Adds the parent directory to the system path
|
||||
import litellm
|
||||
from litellm import embedding, completion
|
||||
|
||||
|
@ -14,17 +16,22 @@ litellm.failure_callback = ["slack", "sentry", "posthog"]
|
|||
|
||||
litellm.set_verbose = True
|
||||
|
||||
|
||||
def logger_fn(model_call_object: dict):
|
||||
# print(f"model call details: {model_call_object}")
|
||||
pass
|
||||
|
||||
|
||||
user_message = "Hello, how are you?"
|
||||
messages = [{ "content": user_message,"role": "user"}]
|
||||
messages = [{"content": user_message, "role": "user"}]
|
||||
|
||||
|
||||
def test_completion_openai():
|
||||
try:
|
||||
print("running query")
|
||||
response = completion(model="gpt-3.5-turbo", messages=messages, logger_fn=logger_fn)
|
||||
response = completion(
|
||||
model="gpt-3.5-turbo", messages=messages, logger_fn=logger_fn
|
||||
)
|
||||
print(f"response: {response}")
|
||||
# Add any assertions here to check the response
|
||||
except Exception as e:
|
||||
|
@ -34,33 +41,46 @@ def test_completion_openai():
|
|||
|
||||
def test_completion_claude():
|
||||
try:
|
||||
response = completion(model="claude-instant-1", messages=messages, logger_fn=logger_fn)
|
||||
response = completion(
|
||||
model="claude-instant-1", messages=messages, logger_fn=logger_fn
|
||||
)
|
||||
# Add any assertions here to check the response
|
||||
except Exception as e:
|
||||
pytest.fail(f"Error occurred: {e}")
|
||||
|
||||
|
||||
def test_completion_non_openai():
|
||||
try:
|
||||
response = completion(model="command-nightly", messages=messages, logger_fn=logger_fn)
|
||||
response = completion(
|
||||
model="command-nightly", messages=messages, logger_fn=logger_fn
|
||||
)
|
||||
# Add any assertions here to check the response
|
||||
except Exception as e:
|
||||
pytest.fail(f"Error occurred: {e}")
|
||||
|
||||
|
||||
def test_embedding_openai():
|
||||
try:
|
||||
response = embedding(model='text-embedding-ada-002', input=[user_message], logger_fn=logger_fn)
|
||||
response = embedding(
|
||||
model="text-embedding-ada-002", input=[user_message], logger_fn=logger_fn
|
||||
)
|
||||
# Add any assertions here to check the response
|
||||
print(f"response: {str(response)[:50]}")
|
||||
except Exception as e:
|
||||
pytest.fail(f"Error occurred: {e}")
|
||||
|
||||
|
||||
def test_bad_azure_embedding():
|
||||
try:
|
||||
response = embedding(model='chatgpt-test', input=[user_message], logger_fn=logger_fn)
|
||||
response = embedding(
|
||||
model="chatgpt-test", input=[user_message], logger_fn=logger_fn
|
||||
)
|
||||
# Add any assertions here to check the response
|
||||
print(f"response: {str(response)[:50]}")
|
||||
except Exception as e:
|
||||
pass
|
||||
|
||||
|
||||
# def test_good_azure_embedding():
|
||||
# try:
|
||||
# response = embedding(model='azure-embedding-model', input=[user_message], azure=True, logger_fn=logger_fn)
|
||||
|
@ -68,4 +88,3 @@ def test_bad_azure_embedding():
|
|||
# print(f"response: {str(response)[:50]}")
|
||||
# except Exception as e:
|
||||
# pytest.fail(f"Error occurred: {e}")
|
||||
|
||||
|
|
|
@ -1,44 +1,58 @@
|
|||
import sys, os
|
||||
import traceback
|
||||
from dotenv import load_dotenv
|
||||
|
||||
load_dotenv()
|
||||
import os
|
||||
sys.path.insert(0, os.path.abspath('../..')) # Adds the parent directory to the system path
|
||||
|
||||
sys.path.insert(
|
||||
0, os.path.abspath("../..")
|
||||
) # Adds the parent directory to the system path
|
||||
import pytest
|
||||
import litellm
|
||||
from litellm import embedding, completion
|
||||
|
||||
# from infisical import InfisicalClient
|
||||
|
||||
# litellm.set_verbose = True
|
||||
# litellm.secret_manager_client = InfisicalClient(token=os.environ["INFISICAL_TOKEN"])
|
||||
|
||||
user_message = "Hello, whats the weather in San Francisco??"
|
||||
messages = [{ "content": user_message,"role": "user"}]
|
||||
messages = [{"content": user_message, "role": "user"}]
|
||||
|
||||
|
||||
def logger_fn(user_model_dict):
|
||||
print(f"user_model_dict: {user_model_dict}")
|
||||
|
||||
|
||||
def test_completion_claude():
|
||||
try:
|
||||
response = completion(model="claude-instant-1", messages=messages, logger_fn=logger_fn)
|
||||
response = completion(
|
||||
model="claude-instant-1", messages=messages, logger_fn=logger_fn
|
||||
)
|
||||
# Add any assertions here to check the response
|
||||
print(response)
|
||||
except Exception as e:
|
||||
pytest.fail(f"Error occurred: {e}")
|
||||
|
||||
|
||||
def test_completion_claude_stream():
|
||||
try:
|
||||
messages = [
|
||||
{"role": "system", "content": "You are a helpful assistant."},
|
||||
{"role": "user", "content": "how does a court case get to the Supreme Court?"}
|
||||
{
|
||||
"role": "user",
|
||||
"content": "how does a court case get to the Supreme Court?",
|
||||
},
|
||||
]
|
||||
response = completion(model="claude-2", messages=messages, stream=True)
|
||||
# Add any assertions here to check the response
|
||||
for chunk in response:
|
||||
print(chunk['choices'][0]['delta']) # same as openai format
|
||||
print(chunk["choices"][0]["delta"]) # same as openai format
|
||||
except Exception as e:
|
||||
pytest.fail(f"Error occurred: {e}")
|
||||
|
||||
|
||||
# def test_completion_hf_api():
|
||||
# try:
|
||||
# user_message = "write some code to find the sum of two numbers"
|
||||
|
@ -62,10 +76,12 @@ def test_completion_claude_stream():
|
|||
|
||||
def test_completion_cohere():
|
||||
try:
|
||||
response = completion(model="command-nightly", messages=messages, max_tokens=100)
|
||||
response = completion(
|
||||
model="command-nightly", messages=messages, max_tokens=100
|
||||
)
|
||||
# Add any assertions here to check the response
|
||||
print(response)
|
||||
response_str = response['choices'][0]['message']['content']
|
||||
response_str = response["choices"][0]["message"]["content"]
|
||||
print(f"str response{response_str}")
|
||||
response_str_2 = response.choices[0].message.content
|
||||
if type(response_str) != str:
|
||||
|
@ -75,24 +91,31 @@ def test_completion_cohere():
|
|||
except Exception as e:
|
||||
pytest.fail(f"Error occurred: {e}")
|
||||
|
||||
|
||||
def test_completion_cohere_stream():
|
||||
try:
|
||||
messages = [
|
||||
{"role": "system", "content": "You are a helpful assistant."},
|
||||
{"role": "user", "content": "how does a court case get to the Supreme Court?"}
|
||||
{
|
||||
"role": "user",
|
||||
"content": "how does a court case get to the Supreme Court?",
|
||||
},
|
||||
]
|
||||
response = completion(model="command-nightly", messages=messages, stream=True, max_tokens=50)
|
||||
response = completion(
|
||||
model="command-nightly", messages=messages, stream=True, max_tokens=50
|
||||
)
|
||||
# Add any assertions here to check the response
|
||||
for chunk in response:
|
||||
print(chunk['choices'][0]['delta']) # same as openai format
|
||||
print(chunk["choices"][0]["delta"]) # same as openai format
|
||||
except Exception as e:
|
||||
pytest.fail(f"Error occurred: {e}")
|
||||
|
||||
|
||||
def test_completion_openai():
|
||||
try:
|
||||
response = completion(model="gpt-3.5-turbo", messages=messages)
|
||||
|
||||
response_str = response['choices'][0]['message']['content']
|
||||
response_str = response["choices"][0]["message"]["content"]
|
||||
response_str_2 = response.choices[0].message.content
|
||||
assert response_str == response_str_2
|
||||
assert type(response_str) == str
|
||||
|
@ -100,6 +123,7 @@ def test_completion_openai():
|
|||
except Exception as e:
|
||||
pytest.fail(f"Error occurred: {e}")
|
||||
|
||||
|
||||
def test_completion_text_openai():
|
||||
try:
|
||||
response = completion(model="text-davinci-003", messages=messages)
|
||||
|
@ -108,17 +132,31 @@ def test_completion_text_openai():
|
|||
except Exception as e:
|
||||
pytest.fail(f"Error occurred: {e}")
|
||||
|
||||
|
||||
def test_completion_openai_with_optional_params():
|
||||
try:
|
||||
response = completion(model="gpt-3.5-turbo", messages=messages, temperature=0.5, top_p=0.1, user="ishaan_dev@berri.ai")
|
||||
response = completion(
|
||||
model="gpt-3.5-turbo",
|
||||
messages=messages,
|
||||
temperature=0.5,
|
||||
top_p=0.1,
|
||||
user="ishaan_dev@berri.ai",
|
||||
)
|
||||
# Add any assertions here to check the response
|
||||
print(response)
|
||||
except Exception as e:
|
||||
pytest.fail(f"Error occurred: {e}")
|
||||
|
||||
|
||||
def test_completion_openrouter():
|
||||
try:
|
||||
response = completion(model="google/palm-2-chat-bison", messages=messages, temperature=0.5, top_p=0.1, user="ishaan_dev@berri.ai")
|
||||
response = completion(
|
||||
model="google/palm-2-chat-bison",
|
||||
messages=messages,
|
||||
temperature=0.5,
|
||||
top_p=0.1,
|
||||
user="ishaan_dev@berri.ai",
|
||||
)
|
||||
# Add any assertions here to check the response
|
||||
print(response)
|
||||
except Exception as e:
|
||||
|
@ -127,12 +165,23 @@ def test_completion_openrouter():
|
|||
|
||||
def test_completion_openai_with_more_optional_params():
|
||||
try:
|
||||
response = completion(model="gpt-3.5-turbo", messages=messages, temperature=0.5, top_p=0.1, n=2, max_tokens=150, presence_penalty=0.5, frequency_penalty=-0.5, logit_bias={123: 5}, user="ishaan_dev@berri.ai")
|
||||
response = completion(
|
||||
model="gpt-3.5-turbo",
|
||||
messages=messages,
|
||||
temperature=0.5,
|
||||
top_p=0.1,
|
||||
n=2,
|
||||
max_tokens=150,
|
||||
presence_penalty=0.5,
|
||||
frequency_penalty=-0.5,
|
||||
logit_bias={123: 5},
|
||||
user="ishaan_dev@berri.ai",
|
||||
)
|
||||
# Add any assertions here to check the response
|
||||
print(response)
|
||||
response_str = response['choices'][0]['message']['content']
|
||||
response_str = response["choices"][0]["message"]["content"]
|
||||
response_str_2 = response.choices[0].message.content
|
||||
print(response['choices'][0]['message']['content'])
|
||||
print(response["choices"][0]["message"]["content"])
|
||||
print(response.choices[0].message.content)
|
||||
if type(response_str) != str:
|
||||
pytest.fail(f"Error occurred: {e}")
|
||||
|
@ -141,14 +190,28 @@ def test_completion_openai_with_more_optional_params():
|
|||
except Exception as e:
|
||||
pytest.fail(f"Error occurred: {e}")
|
||||
|
||||
|
||||
def test_completion_openai_with_stream():
|
||||
try:
|
||||
response = completion(model="gpt-3.5-turbo", messages=messages, temperature=0.5, top_p=0.1, n=2, max_tokens=150, presence_penalty=0.5, stream=True, frequency_penalty=-0.5, logit_bias={27000: 5}, user="ishaan_dev@berri.ai")
|
||||
response = completion(
|
||||
model="gpt-3.5-turbo",
|
||||
messages=messages,
|
||||
temperature=0.5,
|
||||
top_p=0.1,
|
||||
n=2,
|
||||
max_tokens=150,
|
||||
presence_penalty=0.5,
|
||||
stream=True,
|
||||
frequency_penalty=-0.5,
|
||||
logit_bias={27000: 5},
|
||||
user="ishaan_dev@berri.ai",
|
||||
)
|
||||
# Add any assertions here to check the response
|
||||
print(response)
|
||||
except Exception as e:
|
||||
pytest.fail(f"Error occurred: {e}")
|
||||
|
||||
|
||||
def test_completion_openai_with_functions():
|
||||
function1 = [
|
||||
{
|
||||
|
@ -159,33 +222,39 @@ def test_completion_openai_with_functions():
|
|||
"properties": {
|
||||
"location": {
|
||||
"type": "string",
|
||||
"description": "The city and state, e.g. San Francisco, CA"
|
||||
"description": "The city and state, e.g. San Francisco, CA",
|
||||
},
|
||||
"unit": {
|
||||
"type": "string",
|
||||
"enum": ["celsius", "fahrenheit"]
|
||||
}
|
||||
"unit": {"type": "string", "enum": ["celsius", "fahrenheit"]},
|
||||
},
|
||||
"required": ["location"]
|
||||
}
|
||||
"required": ["location"],
|
||||
},
|
||||
}
|
||||
]
|
||||
try:
|
||||
response = completion(model="gpt-3.5-turbo", messages=messages, functions=function1)
|
||||
response = completion(
|
||||
model="gpt-3.5-turbo", messages=messages, functions=function1
|
||||
)
|
||||
# Add any assertions here to check the response
|
||||
print(response)
|
||||
except Exception as e:
|
||||
pytest.fail(f"Error occurred: {e}")
|
||||
|
||||
|
||||
def test_completion_azure():
|
||||
try:
|
||||
response = completion(model="gpt-3.5-turbo", deployment_id="chatgpt-test", messages=messages, custom_llm_provider="azure")
|
||||
response = completion(
|
||||
model="gpt-3.5-turbo",
|
||||
deployment_id="chatgpt-test",
|
||||
messages=messages,
|
||||
custom_llm_provider="azure",
|
||||
)
|
||||
# Add any assertions here to check the response
|
||||
print(response)
|
||||
except Exception as e:
|
||||
pytest.fail(f"Error occurred: {e}")
|
||||
|
||||
# Replicate API endpoints are unstable -> throw random CUDA errors -> this means our tests can fail even if our tests weren't incorrect.
|
||||
|
||||
# Replicate API endpoints are unstable -> throw random CUDA errors -> this means our tests can fail even if our tests weren't incorrect.
|
||||
def test_completion_replicate_llama_stream():
|
||||
model_name = "replicate/llama-2-70b-chat:2c1608e18606fad2812020dc541930f2d0495ce32eee50074220b87300bc16e1"
|
||||
try:
|
||||
|
@ -197,23 +266,32 @@ def test_completion_replicate_llama_stream():
|
|||
except Exception as e:
|
||||
pytest.fail(f"Error occurred: {e}")
|
||||
|
||||
|
||||
def test_completion_replicate_stability_stream():
|
||||
model_name = "stability-ai/stablelm-tuned-alpha-7b:c49dae362cbaecd2ceabb5bd34fdb68413c4ff775111fea065d259d577757beb"
|
||||
try:
|
||||
response = completion(model=model_name, messages=messages, stream=True, custom_llm_provider="replicate")
|
||||
response = completion(
|
||||
model=model_name,
|
||||
messages=messages,
|
||||
stream=True,
|
||||
custom_llm_provider="replicate",
|
||||
)
|
||||
# Add any assertions here to check the response
|
||||
for chunk in response:
|
||||
print(chunk['choices'][0]['delta'])
|
||||
print(chunk["choices"][0]["delta"])
|
||||
print(response)
|
||||
except Exception as e:
|
||||
pytest.fail(f"Error occurred: {e}")
|
||||
|
||||
|
||||
def test_completion_replicate_stability():
|
||||
model_name = "stability-ai/stablelm-tuned-alpha-7b:c49dae362cbaecd2ceabb5bd34fdb68413c4ff775111fea065d259d577757beb"
|
||||
try:
|
||||
response = completion(model=model_name, messages=messages, custom_llm_provider="replicate")
|
||||
response = completion(
|
||||
model=model_name, messages=messages, custom_llm_provider="replicate"
|
||||
)
|
||||
# Add any assertions here to check the response
|
||||
response_str = response['choices'][0]['message']['content']
|
||||
response_str = response["choices"][0]["message"]["content"]
|
||||
response_str_2 = response.choices[0].message.content
|
||||
print(response_str)
|
||||
print(response_str_2)
|
||||
|
@ -224,6 +302,7 @@ def test_completion_replicate_stability():
|
|||
except Exception as e:
|
||||
pytest.fail(f"Error occurred: {e}")
|
||||
|
||||
|
||||
######## Test TogetherAI ########
|
||||
def test_completion_together_ai():
|
||||
model_name = "togethercomputer/llama-2-70b-chat"
|
||||
|
@ -234,15 +313,22 @@ def test_completion_together_ai():
|
|||
except Exception as e:
|
||||
pytest.fail(f"Error occurred: {e}")
|
||||
|
||||
|
||||
def test_petals():
|
||||
model_name = "stabilityai/StableBeluga2"
|
||||
try:
|
||||
response = completion(model=model_name, messages=messages, custom_llm_provider="petals", force_timeout=120)
|
||||
response = completion(
|
||||
model=model_name,
|
||||
messages=messages,
|
||||
custom_llm_provider="petals",
|
||||
force_timeout=120,
|
||||
)
|
||||
# Add any assertions here to check the response
|
||||
print(response)
|
||||
except Exception as e:
|
||||
pytest.fail(f"Error occurred: {e}")
|
||||
|
||||
|
||||
# def test_baseten_falcon_7bcompletion():
|
||||
# model_name = "qvv0xeq"
|
||||
# try:
|
||||
|
@ -290,7 +376,6 @@ def test_petals():
|
|||
# pytest.fail(f"Error occurred: {e}")
|
||||
|
||||
|
||||
|
||||
#### Test A121 ###################
|
||||
# def test_completion_ai21():
|
||||
# model_name = "j2-light"
|
||||
|
@ -301,7 +386,7 @@ def test_petals():
|
|||
# except Exception as e:
|
||||
# pytest.fail(f"Error occurred: {e}")
|
||||
|
||||
# test config file with completion #
|
||||
# test config file with completion #
|
||||
# def test_completion_openai_config():
|
||||
# try:
|
||||
# litellm.config_path = "../config.json"
|
||||
|
@ -333,4 +418,3 @@ def test_petals():
|
|||
# return
|
||||
|
||||
# test_completion_together_ai_stream()
|
||||
|
||||
|
|
|
@ -1,20 +1,33 @@
|
|||
import sys, os
|
||||
import traceback
|
||||
from dotenv import load_dotenv
|
||||
|
||||
load_dotenv()
|
||||
import os
|
||||
sys.path.insert(0, os.path.abspath('../..')) # Adds the parent directory to the system path
|
||||
import litellm
|
||||
from litellm import completion
|
||||
|
||||
sys.path.insert(
|
||||
0, os.path.abspath("../..")
|
||||
) # Adds the parent directory to the system path
|
||||
import litellm
|
||||
from litellm import completion
|
||||
|
||||
|
||||
def logging_fn(model_call_dict):
|
||||
print(f"model call details: {model_call_dict}")
|
||||
|
||||
|
||||
models = ["gorilla-7b-hf-v1", "gpt-4"]
|
||||
custom_llm_provider = None
|
||||
messages = [{"role": "user", "content": "Hey, how's it going?"}]
|
||||
for model in models: # iterate through list
|
||||
for model in models: # iterate through list
|
||||
custom_api_base = None
|
||||
if model == "gorilla-7b-hf-v1":
|
||||
if model == "gorilla-7b-hf-v1":
|
||||
custom_llm_provider = "custom_openai"
|
||||
custom_api_base = "http://zanino.millennium.berkeley.edu:8000/v1"
|
||||
completion(model=model, messages=messages, custom_llm_provider=custom_llm_provider, custom_api_base=custom_api_base, logger_fn=logging_fn)
|
||||
completion(
|
||||
model=model,
|
||||
messages=messages,
|
||||
custom_llm_provider=custom_llm_provider,
|
||||
custom_api_base=custom_api_base,
|
||||
logger_fn=logging_fn,
|
||||
)
|
||||
|
|
|
@ -1,9 +1,10 @@
|
|||
|
||||
import sys, os
|
||||
import traceback
|
||||
import pytest
|
||||
|
||||
sys.path.insert(0, os.path.abspath('../..')) # Adds the parent directory to the system path
|
||||
sys.path.insert(
|
||||
0, os.path.abspath("../..")
|
||||
) # Adds the parent directory to the system path
|
||||
import litellm
|
||||
from litellm import embedding, completion
|
||||
from infisical import InfisicalClient
|
||||
|
@ -11,10 +12,13 @@ from infisical import InfisicalClient
|
|||
# # litellm.set_verbose = True
|
||||
# litellm.secret_manager_client = InfisicalClient(token=os.environ["INFISICAL_TOKEN"])
|
||||
|
||||
|
||||
def test_openai_embedding():
|
||||
try:
|
||||
response = embedding(model='text-embedding-ada-002', input=["good morning from litellm"])
|
||||
response = embedding(
|
||||
model="text-embedding-ada-002", input=["good morning from litellm"]
|
||||
)
|
||||
# Add any assertions here to check the response
|
||||
print(f"response: {str(response)}")
|
||||
except Exception as e:
|
||||
pytest.fail(f"Error occurred: {e}")
|
||||
pytest.fail(f"Error occurred: {e}")
|
||||
|
|
|
@ -1,10 +1,21 @@
|
|||
# from openai.error import AuthenticationError, InvalidRequestError, RateLimitError, OpenAIError
|
||||
import os
|
||||
import os
|
||||
import sys
|
||||
import traceback
|
||||
sys.path.insert(0, os.path.abspath('../..')) # Adds the parent directory to the system path
|
||||
|
||||
sys.path.insert(
|
||||
0, os.path.abspath("../..")
|
||||
) # Adds the parent directory to the system path
|
||||
import litellm
|
||||
from litellm import embedding, completion, AuthenticationError, InvalidRequestError, RateLimitError, ServiceUnavailableError, OpenAIError
|
||||
from litellm import (
|
||||
embedding,
|
||||
completion,
|
||||
AuthenticationError,
|
||||
InvalidRequestError,
|
||||
RateLimitError,
|
||||
ServiceUnavailableError,
|
||||
OpenAIError,
|
||||
)
|
||||
from concurrent.futures import ThreadPoolExecutor
|
||||
import pytest
|
||||
|
||||
|
@ -23,8 +34,10 @@ litellm.failure_callback = ["sentry"]
|
|||
# models = ["gpt-3.5-turbo", "chatgpt-test", "claude-instant-1", "command-nightly"]
|
||||
test_model = "claude-instant-1"
|
||||
models = ["claude-instant-1"]
|
||||
|
||||
|
||||
def logging_fn(model_call_dict):
|
||||
if "model" in model_call_dict:
|
||||
if "model" in model_call_dict:
|
||||
print(f"model_call_dict: {model_call_dict['model']}")
|
||||
else:
|
||||
print(f"model_call_dict: {model_call_dict}")
|
||||
|
@ -38,7 +51,12 @@ def test_context_window(model):
|
|||
try:
|
||||
model = "chatgpt-test"
|
||||
print(f"model: {model}")
|
||||
response = completion(model=model, messages=messages, custom_llm_provider="azure", logger_fn=logging_fn)
|
||||
response = completion(
|
||||
model=model,
|
||||
messages=messages,
|
||||
custom_llm_provider="azure",
|
||||
logger_fn=logging_fn,
|
||||
)
|
||||
print(f"response: {response}")
|
||||
except InvalidRequestError as e:
|
||||
print(f"InvalidRequestError: {e.llm_provider}")
|
||||
|
@ -52,14 +70,17 @@ def test_context_window(model):
|
|||
print(f"Uncaught Exception - {e}")
|
||||
pytest.fail(f"Error occurred: {e}")
|
||||
return
|
||||
|
||||
|
||||
test_context_window(test_model)
|
||||
|
||||
|
||||
# Test 2: InvalidAuth Errors
|
||||
@pytest.mark.parametrize("model", models)
|
||||
def invalid_auth(model): # set the model key to an invalid key, depending on the model
|
||||
messages = [{ "content": "Hello, how are you?","role": "user"}]
|
||||
def invalid_auth(model): # set the model key to an invalid key, depending on the model
|
||||
messages = [{"content": "Hello, how are you?", "role": "user"}]
|
||||
temporary_key = None
|
||||
try:
|
||||
try:
|
||||
custom_llm_provider = None
|
||||
if model == "gpt-3.5-turbo":
|
||||
temporary_key = os.environ["OPENAI_API_KEY"]
|
||||
|
@ -74,22 +95,29 @@ def invalid_auth(model): # set the model key to an invalid key, depending on the
|
|||
elif model == "command-nightly":
|
||||
temporary_key = os.environ["COHERE_API_KEY"]
|
||||
os.environ["COHERE_API_KEY"] = "bad-key"
|
||||
elif model == "replicate/llama-2-70b-chat:2c1608e18606fad2812020dc541930f2d0495ce32eee50074220b87300bc16e1":
|
||||
temporary_key = os.environ["REPLICATE_API_KEY"]
|
||||
elif (
|
||||
model
|
||||
== "replicate/llama-2-70b-chat:2c1608e18606fad2812020dc541930f2d0495ce32eee50074220b87300bc16e1"
|
||||
):
|
||||
temporary_key = os.environ["REPLICATE_API_KEY"]
|
||||
os.environ["REPLICATE_API_KEY"] = "bad-key"
|
||||
print(f"model: {model}")
|
||||
response = completion(model=model, messages=messages, custom_llm_provider=custom_llm_provider)
|
||||
response = completion(
|
||||
model=model, messages=messages, custom_llm_provider=custom_llm_provider
|
||||
)
|
||||
print(f"response: {response}")
|
||||
except AuthenticationError as e:
|
||||
print(f"AuthenticationError Caught Exception - {e.llm_provider}")
|
||||
except OpenAIError: # is at least an openai error -> in case of random model errors - e.g. overloaded server
|
||||
except (
|
||||
OpenAIError
|
||||
): # is at least an openai error -> in case of random model errors - e.g. overloaded server
|
||||
print(f"OpenAIError Caught Exception - {e}")
|
||||
except Exception as e:
|
||||
print(type(e))
|
||||
print(e.__class__.__name__)
|
||||
print(f"Uncaught Exception - {e}")
|
||||
pytest.fail(f"Error occurred: {e}")
|
||||
if temporary_key != None: # reset the key
|
||||
if temporary_key != None: # reset the key
|
||||
if model == "gpt-3.5-turbo":
|
||||
os.environ["OPENAI_API_KEY"] = temporary_key
|
||||
elif model == "chatgpt-test":
|
||||
|
@ -99,13 +127,18 @@ def invalid_auth(model): # set the model key to an invalid key, depending on the
|
|||
os.environ["ANTHROPIC_API_KEY"] = temporary_key
|
||||
elif model == "command-nightly":
|
||||
os.environ["COHERE_API_KEY"] = temporary_key
|
||||
elif model == "replicate/llama-2-70b-chat:2c1608e18606fad2812020dc541930f2d0495ce32eee50074220b87300bc16e1":
|
||||
elif (
|
||||
model
|
||||
== "replicate/llama-2-70b-chat:2c1608e18606fad2812020dc541930f2d0495ce32eee50074220b87300bc16e1"
|
||||
):
|
||||
os.environ["REPLICATE_API_KEY"] = temporary_key
|
||||
return
|
||||
|
||||
|
||||
invalid_auth(test_model)
|
||||
# # Test 3: Rate Limit Errors
|
||||
# # Test 3: Rate Limit Errors
|
||||
# def test_model(model):
|
||||
# try:
|
||||
# try:
|
||||
# sample_text = "how does a court case get to the Supreme Court?" * 50000
|
||||
# messages = [{ "content": sample_text,"role": "user"}]
|
||||
# custom_llm_provider = None
|
||||
|
@ -142,5 +175,3 @@ invalid_auth(test_model)
|
|||
|
||||
# accuracy_score = counts[True]/(counts[True] + counts[False])
|
||||
# print(f"accuracy_score: {accuracy_score}")
|
||||
|
||||
|
||||
|
|
|
@ -5,7 +5,9 @@ import sys, os
|
|||
import traceback
|
||||
import pytest
|
||||
|
||||
sys.path.insert(0, os.path.abspath('../..')) # Adds the parent directory to the system path
|
||||
sys.path.insert(
|
||||
0, os.path.abspath("../..")
|
||||
) # Adds the parent directory to the system path
|
||||
import litellm
|
||||
from litellm import embedding, completion
|
||||
|
||||
|
@ -14,11 +16,15 @@ litellm.success_callback = ["helicone"]
|
|||
litellm.set_verbose = True
|
||||
|
||||
user_message = "Hello, how are you?"
|
||||
messages = [{ "content": user_message,"role": "user"}]
|
||||
messages = [{"content": user_message, "role": "user"}]
|
||||
|
||||
|
||||
#openai call
|
||||
response = completion(model="gpt-3.5-turbo", messages=[{"role": "user", "content": "Hi 👋 - i'm openai"}])
|
||||
# openai call
|
||||
response = completion(
|
||||
model="gpt-3.5-turbo", messages=[{"role": "user", "content": "Hi 👋 - i'm openai"}]
|
||||
)
|
||||
|
||||
#cohere call
|
||||
response = completion(model="command-nightly", messages=[{"role": "user", "content": "Hi 👋 - i'm cohere"}])
|
||||
# cohere call
|
||||
response = completion(
|
||||
model="command-nightly", messages=[{"role": "user", "content": "Hi 👋 - i'm cohere"}]
|
||||
)
|
||||
|
|
|
@ -1,22 +1,37 @@
|
|||
import sys, os
|
||||
import traceback
|
||||
sys.path.insert(0, os.path.abspath('../..')) # Adds the parent directory to the system path
|
||||
|
||||
sys.path.insert(
|
||||
0, os.path.abspath("../..")
|
||||
) # Adds the parent directory to the system path
|
||||
import litellm
|
||||
from litellm import load_test_model, testing_batch_completion
|
||||
|
||||
# ## Load Test Model
|
||||
# ## Load Test Model
|
||||
# model="gpt-3.5-turbo"
|
||||
# result = load_test_model(model=model, num_calls=5)
|
||||
# print(result)
|
||||
# print(len(result["results"]))
|
||||
|
||||
# ## Duration Test Model
|
||||
# ## Duration Test Model
|
||||
# model="gpt-3.5-turbo"
|
||||
# result = load_test_model(model=model, num_calls=5, duration=15, interval=15) # duration test the model for 2 minutes, sending 5 calls every 15s
|
||||
# print(result)
|
||||
|
||||
## Quality Test across Model
|
||||
models = ["gpt-3.5-turbo", "gpt-3.5-turbo-16k", "gpt-4", "claude-instant-1", {"model": "replicate/llama-2-70b-chat:58d078176e02c219e11eb4da5a02a7830a283b14cf8f94537af893ccff5ee781", "custom_llm_provider": "replicate"}]
|
||||
messages = [[{"role": "user", "content": "What is your name?"}], [{"role": "user", "content": "Hey, how's it going?"}]]
|
||||
## Quality Test across Model
|
||||
models = [
|
||||
"gpt-3.5-turbo",
|
||||
"gpt-3.5-turbo-16k",
|
||||
"gpt-4",
|
||||
"claude-instant-1",
|
||||
{
|
||||
"model": "replicate/llama-2-70b-chat:58d078176e02c219e11eb4da5a02a7830a283b14cf8f94537af893ccff5ee781",
|
||||
"custom_llm_provider": "replicate",
|
||||
},
|
||||
]
|
||||
messages = [
|
||||
[{"role": "user", "content": "What is your name?"}],
|
||||
[{"role": "user", "content": "Hey, how's it going?"}],
|
||||
]
|
||||
result = testing_batch_completion(models=models, messages=messages)
|
||||
print(result)
|
||||
print(result)
|
||||
|
|
|
@ -3,7 +3,10 @@
|
|||
|
||||
import sys, os
|
||||
import traceback
|
||||
sys.path.insert(0, os.path.abspath('../..')) # Adds the parent directory to the system path
|
||||
|
||||
sys.path.insert(
|
||||
0, os.path.abspath("../..")
|
||||
) # Adds the parent directory to the system path
|
||||
import litellm
|
||||
from litellm import embedding, completion
|
||||
|
||||
|
@ -11,49 +14,53 @@ litellm.set_verbose = False
|
|||
|
||||
score = 0
|
||||
|
||||
|
||||
def logger_fn(model_call_object: dict):
|
||||
print(f"model call details: {model_call_object}")
|
||||
|
||||
user_message = "Hello, how are you?"
|
||||
messages = [{ "content": user_message,"role": "user"}]
|
||||
|
||||
# test on openai completion call
|
||||
user_message = "Hello, how are you?"
|
||||
messages = [{"content": user_message, "role": "user"}]
|
||||
|
||||
# test on openai completion call
|
||||
try:
|
||||
response = completion(model="gpt-3.5-turbo", messages=messages, logger_fn=logger_fn)
|
||||
score +=1
|
||||
score += 1
|
||||
except:
|
||||
print(f"error occurred: {traceback.format_exc()}")
|
||||
print(f"error occurred: {traceback.format_exc()}")
|
||||
pass
|
||||
|
||||
# test on non-openai completion call
|
||||
# test on non-openai completion call
|
||||
try:
|
||||
response = completion(model="claude-instant-1", messages=messages, logger_fn=logger_fn)
|
||||
response = completion(
|
||||
model="claude-instant-1", messages=messages, logger_fn=logger_fn
|
||||
)
|
||||
print(f"claude response: {response}")
|
||||
score +=1
|
||||
score += 1
|
||||
except:
|
||||
print(f"error occurred: {traceback.format_exc()}")
|
||||
print(f"error occurred: {traceback.format_exc()}")
|
||||
pass
|
||||
|
||||
# # test on openai embedding call
|
||||
# try:
|
||||
# # test on openai embedding call
|
||||
# try:
|
||||
# response = embedding(model='text-embedding-ada-002', input=[user_message], logger_fn=logger_fn)
|
||||
# score +=1
|
||||
# score +=1
|
||||
# except:
|
||||
# traceback.print_exc()
|
||||
|
||||
# # test on bad azure openai embedding call -> missing azure flag and this isn't an embedding model
|
||||
# try:
|
||||
# try:
|
||||
# response = embedding(model='chatgpt-test', input=[user_message], logger_fn=logger_fn)
|
||||
# except:
|
||||
# score +=1 # expect this to fail
|
||||
# traceback.print_exc()
|
||||
|
||||
# # test on good azure openai embedding call
|
||||
# try:
|
||||
# # test on good azure openai embedding call
|
||||
# try:
|
||||
# response = embedding(model='azure-embedding-model', input=[user_message], azure=True, logger_fn=logger_fn)
|
||||
# score +=1
|
||||
# score +=1
|
||||
# except:
|
||||
# traceback.print_exc()
|
||||
|
||||
|
||||
# print(f"Score: {score}, Overall score: {score/5}")
|
||||
# print(f"Score: {score}, Overall score: {score/5}")
|
||||
|
|
|
@ -3,7 +3,10 @@
|
|||
|
||||
import sys, os
|
||||
import traceback
|
||||
sys.path.insert(0, os.path.abspath('../..')) # Adds the parent directory to the system path
|
||||
|
||||
sys.path.insert(
|
||||
0, os.path.abspath("../..")
|
||||
) # Adds the parent directory to the system path
|
||||
import litellm
|
||||
from litellm import embedding, completion
|
||||
|
||||
|
@ -15,11 +18,11 @@ litellm.set_verbose = True
|
|||
model_fallback_list = ["claude-instant-1", "gpt-3.5-turbo", "chatgpt-test"]
|
||||
|
||||
user_message = "Hello, how are you?"
|
||||
messages = [{ "content": user_message,"role": "user"}]
|
||||
messages = [{"content": user_message, "role": "user"}]
|
||||
|
||||
for model in model_fallback_list:
|
||||
try:
|
||||
response = embedding(model="text-embedding-ada-002", input=[user_message])
|
||||
response = completion(model=model, messages=messages)
|
||||
except Exception as e:
|
||||
print(f"error occurred: {traceback.format_exc()}")
|
||||
print(f"error occurred: {traceback.format_exc()}")
|
||||
|
|
|
@ -20,4 +20,4 @@
|
|||
|
||||
# if __name__ == '__main__':
|
||||
# from waitress import serve
|
||||
# serve(app, host='localhost', port=8080, threads=10)
|
||||
# serve(app, host='localhost', port=8080, threads=10)
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
# import requests, json
|
||||
# import requests, json
|
||||
|
||||
# BASE_URL = 'http://localhost:8080'
|
||||
|
||||
|
@ -11,4 +11,4 @@
|
|||
# print("Hello route test passed!")
|
||||
|
||||
# if __name__ == '__main__':
|
||||
# test_hello_route()
|
||||
# test_hello_route()
|
||||
|
|
|
@ -4,7 +4,10 @@
|
|||
|
||||
import sys, os
|
||||
import traceback
|
||||
sys.path.insert(0, os.path.abspath('../..')) # Adds the parent directory to the system path
|
||||
|
||||
sys.path.insert(
|
||||
0, os.path.abspath("../..")
|
||||
) # Adds the parent directory to the system path
|
||||
import litellm
|
||||
from litellm import embedding, completion
|
||||
|
||||
|
@ -13,11 +16,11 @@ litellm.set_verbose = True
|
|||
model_fallback_list = ["claude-instant-1", "gpt-3.5-turbo", "chatgpt-test"]
|
||||
|
||||
user_message = "Hello, how are you?"
|
||||
messages = [{ "content": user_message,"role": "user"}]
|
||||
messages = [{"content": user_message, "role": "user"}]
|
||||
|
||||
for model in model_fallback_list:
|
||||
try:
|
||||
response = embedding(model="text-embedding-ada-002", input=[user_message])
|
||||
response = completion(model=model, messages=messages)
|
||||
except Exception as e:
|
||||
print(f"error occurred: {traceback.format_exc()}")
|
||||
print(f"error occurred: {traceback.format_exc()}")
|
||||
|
|
|
@ -53,7 +53,6 @@
|
|||
# # # return this generator to the client for streaming requests
|
||||
|
||||
|
||||
|
||||
# # async def get_response():
|
||||
# # global generator
|
||||
# # async for elem in generator:
|
||||
|
|
|
@ -12,7 +12,6 @@
|
|||
# import asyncio
|
||||
|
||||
|
||||
|
||||
# user_message = "respond in 20 words. who are you?"
|
||||
# messages = [{ "content": user_message,"role": "user"}]
|
||||
|
||||
|
@ -45,8 +44,3 @@
|
|||
# pytest.fail(f"Error occurred: {e}")
|
||||
|
||||
# test_completion_ollama_stream()
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
|
|
@ -4,7 +4,10 @@
|
|||
|
||||
import sys, os
|
||||
import traceback
|
||||
sys.path.insert(0, os.path.abspath('../..')) # Adds the parent directory to the system path
|
||||
|
||||
sys.path.insert(
|
||||
0, os.path.abspath("../..")
|
||||
) # Adds the parent directory to the system path
|
||||
import litellm
|
||||
from litellm import embedding, completion
|
||||
from infisical import InfisicalClient
|
||||
|
@ -15,7 +18,7 @@ infisical_token = os.environ["INFISICAL_TOKEN"]
|
|||
litellm.secret_manager_client = InfisicalClient(token=infisical_token)
|
||||
|
||||
user_message = "Hello, whats the weather in San Francisco??"
|
||||
messages = [{ "content": user_message,"role": "user"}]
|
||||
messages = [{"content": user_message, "role": "user"}]
|
||||
|
||||
|
||||
def test_completion_openai():
|
||||
|
@ -28,5 +31,5 @@ def test_completion_openai():
|
|||
pytest.fail(f"Error occurred: {e}")
|
||||
litellm.secret_manager_client = None
|
||||
|
||||
test_completion_openai()
|
||||
|
||||
test_completion_openai()
|
||||
|
|
|
@ -3,7 +3,10 @@
|
|||
|
||||
import sys, os
|
||||
import traceback
|
||||
sys.path.insert(0, os.path.abspath('../..')) # Adds the parent directory to the system path
|
||||
|
||||
sys.path.insert(
|
||||
0, os.path.abspath("../..")
|
||||
) # Adds the parent directory to the system path
|
||||
import litellm
|
||||
from litellm import completion
|
||||
|
||||
|
@@ -11,29 +14,40 @@ litellm.set_verbose = False

score = 0


def logger_fn(model_call_object: dict):
    print(f"model call details: {model_call_object}")

-user_message = "Hello, how are you?"
-messages = [{ "content": user_message,"role": "user"}]
-
-# test on anthropic completion call
+
+user_message = "Hello, how are you?"
+messages = [{"content": user_message, "role": "user"}]
+
+# test on anthropic completion call
try:
-    response = completion(model="claude-instant-1", messages=messages, stream=True, logger_fn=logger_fn)
+    response = completion(
+        model="claude-instant-1", messages=messages, stream=True, logger_fn=logger_fn
+    )
    for chunk in response:
-        print(chunk['choices'][0]['delta'])
-        score +=1
+        print(chunk["choices"][0]["delta"])
+        score += 1
except:
-    print(f"error occurred: {traceback.format_exc()}")
+    print(f"error occurred: {traceback.format_exc()}")
    pass


-# test on anthropic completion call
+# test on anthropic completion call
try:
-    response = completion(model="meta-llama/Llama-2-7b-chat-hf", messages=messages, custom_llm_provider="huggingface", custom_api_base="https://s7c7gytn18vnu4tw.us-east-1.aws.endpoints.huggingface.cloud", stream=True, logger_fn=logger_fn)
+    response = completion(
+        model="meta-llama/Llama-2-7b-chat-hf",
+        messages=messages,
+        custom_llm_provider="huggingface",
+        custom_api_base="https://s7c7gytn18vnu4tw.us-east-1.aws.endpoints.huggingface.cloud",
+        stream=True,
+        logger_fn=logger_fn,
+    )
    for chunk in response:
-        print(chunk['choices'][0]['delta'])
-        score +=1
+        print(chunk["choices"][0]["delta"])
+        score += 1
except:
-    print(f"error occurred: {traceback.format_exc()}")
-    pass
+    print(f"error occurred: {traceback.format_exc()}")
+    pass

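Both calls above stream chunks and print each delta as it arrives. A short sketch of collecting those deltas into a single reply (not part of this diff; it assumes the chunks keep the OpenAI-style shape printed above, with text under a "content" key in the delta):

from litellm import completion

messages = [{"content": "Hello, how are you?", "role": "user"}]
response = completion(model="claude-instant-1", messages=messages, stream=True)

full_reply = ""
for chunk in response:
    delta = chunk["choices"][0]["delta"]
    # Guard against chunks without text content (assumed shape, not confirmed by the diff).
    if isinstance(delta, dict):
        full_reply += delta.get("content", "") or ""
print(full_reply)
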
@@ -21,7 +21,7 @@

# #openai call
-# response = completion(model="gpt-3.5-turbo", messages=[{"role": "user", "content": "Hi 👋 - i'm openai"}])
+# response = completion(model="gpt-3.5-turbo", messages=[{"role": "user", "content": "Hi 👋 - i'm openai"}])

# #bad request call
-# response = completion(model="chatgpt-test", messages=[{"role": "user", "content": "Hi 👋 - i'm a bad request"}])
+# response = completion(model="chatgpt-test", messages=[{"role": "user", "content": "Hi 👋 - i'm a bad request"}])

@@ -3,10 +3,14 @@
import sys, os
import traceback
-sys.path.insert(0, os.path.abspath('../..')) # Adds the parent directory to the system path
+
+sys.path.insert(
+    0, os.path.abspath("../..")
+)  # Adds the parent directory to the system path
import time
from litellm import timeout


@timeout(10)
def stop_after_10_s(force_timeout=60):
    print("Stopping after 10 seconds")

@@ -14,14 +18,14 @@ def stop_after_10_s(force_timeout=60):
    return


-start_time = time.time()
+start_time = time.time()

try:
-    stop_after_10_s(force_timeout=1)
+    stop_after_10_s(force_timeout=1)
except Exception as e:
-    print(e)
-    pass
+    print(e)
+    pass

end_time = time.time()

-print(f"total time: {end_time-start_time}")
+print(f"total time: {end_time-start_time}")

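The script above only times the call and prints any exception. The same behaviour can be checked with an assertion; a sketch (not part of this diff) that assumes, as the test implies, that passing force_timeout at call time overrides the decorator's duration and that the raised exception is openai's Timeout:

import time

import pytest
from openai.error import Timeout

from litellm import timeout


@timeout(10)
def slow_function(force_timeout=60):
    # Sleeps well past the 1-second override used below, so the decorator should raise.
    time.sleep(10)
    return "done"


def test_timeout_is_enforced():
    with pytest.raises(Timeout):
        slow_function(force_timeout=1)
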
@@ -49,4 +49,4 @@
# # chat = chat_model.start_chat()
# # response = chat.send_message("who are u? write a sentence", **parameters)
-# # print(f"Response from Model: {response.text}")
+# # print(f"Response from Model: {response.text}")

@@ -11,9 +11,7 @@ from threading import Thread
from openai.error import Timeout


-def timeout(
-    timeout_duration: float = None, exception_to_raise = Timeout
-):
+def timeout(timeout_duration: float = None, exception_to_raise=Timeout):
    """
    Wraps a function to raise the specified exception if execution time
    is greater than the specified timeout.

@@ -44,7 +42,9 @@ def timeout(
            result = future.result(timeout=local_timeout_duration)
        except futures.TimeoutError:
            thread.stop_loop()
-            raise exception_to_raise(f"A timeout error occurred. The function call took longer than {local_timeout_duration} second(s).")
+            raise exception_to_raise(
+                f"A timeout error occurred. The function call took longer than {local_timeout_duration} second(s)."
+            )
        thread.stop_loop()
        return result

@@ -59,7 +59,9 @@ def timeout(
            )
            return value
        except asyncio.TimeoutError:
-            raise exception_to_raise(f"A timeout error occurred. The function call took longer than {local_timeout_duration} second(s).")
+            raise exception_to_raise(
+                f"A timeout error occurred. The function call took longer than {local_timeout_duration} second(s)."
+            )

        if iscoroutinefunction(func):
            return async_wrapper

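Because the decorator returns async_wrapper when the wrapped function is a coroutine, it can guard async calls as well. A minimal sketch (not part of this diff; the coroutine below is illustrative):

import asyncio

from openai.error import Timeout

from litellm import timeout


@timeout(2)
async def slow_async_call():
    # Sleeps past the 2-second limit, so asyncio.wait_for should time out.
    await asyncio.sleep(10)
    return "done"


async def main():
    try:
        await slow_async_call()
    except Timeout as e:
        print(e)


asyncio.run(main())
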
@@ -80,4 +82,4 @@ class _LoopWrapper(Thread):
    def stop_loop(self):
        for task in asyncio.all_tasks(self.loop):
            task.cancel()
-        self.loop.call_soon_threadsafe(self.loop.stop)
+        self.loop.call_soon_threadsafe(self.loop.stop)

1625 litellm/utils.py
File diff suppressed because it is too large