Merge pull request #106 from BerriAI/multi-class-krrish

Move Anthropic to its own class + custom llm provider flag
This commit is contained in:
Krish Dholakia 2023-08-12 18:52:01 -07:00 committed by GitHub
commit a766a46c82
31 changed files with 263 additions and 741 deletions
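In practice, provider selection after this change looks roughly like the calls below; this is a hedged sketch based on the tests updated in this commit (keys and deployment names are placeholders):

```python
import os
from litellm import completion

messages = [{"content": "Hello, how are you?", "role": "user"}]

# Anthropic is picked automatically for known Anthropic model names;
# the call is now routed through the new AnthropicLLM class.
os.environ["ANTHROPIC_API_KEY"] = "sk-ant-..."  # placeholder key
response = completion(model="claude-instant-1", messages=messages)

# Providers that previously used boolean flags (azure=True, replicate=True,
# hugging_face=True, together_ai=True) are now selected via custom_llm_provider.
response = completion(
    model="gpt-3.5-turbo",
    deployment_id="chatgpt-test",   # example Azure deployment name from the tests
    messages=messages,
    custom_llm_provider="azure",
)
print(response["choices"][0]["message"]["content"])
```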

BIN
.DS_Store vendored

Binary file not shown.

View file

@@ -113,7 +113,7 @@ open_ai_embedding_models = [
 ]
 from .timeout import timeout
-from .utils import client, logging, exception_type, get_optional_params, modify_integration, token_counter, cost_per_token, completion_cost, load_test_model
+from .utils import client, logging, exception_type, get_optional_params, modify_integration, token_counter, cost_per_token, completion_cost, load_test_model, get_litellm_params
 from .main import * # Import all the symbols from main.py
 from .integrations import *
 from openai.error import AuthenticationError, InvalidRequestError, RateLimitError, ServiceUnavailableError, OpenAIError

1
litellm/llms/__init__.py Normal file
View file

@@ -0,0 +1 @@
from . import *

99
litellm/llms/anthropic.py Normal file
View file

@@ -0,0 +1,99 @@
import os, json
from enum import Enum
import requests
from litellm import logging
import time
from typing import Callable

class AnthropicConstants(Enum):
    HUMAN_PROMPT = "\n\nHuman:"
    AI_PROMPT = "\n\nAssistant:"

class AnthropicError(Exception):
    def __init__(self, status_code, message):
        self.status_code = status_code
        self.message = message

class AnthropicLLM:
    def __init__(self, encoding, default_max_tokens_to_sample, api_key=None):
        self.encoding = encoding
        self.default_max_tokens_to_sample = default_max_tokens_to_sample
        self.completion_url = "https://api.anthropic.com/v1/complete"
        self.validate_environment(api_key=api_key)

    def validate_environment(self, api_key): # set up the environment required to run the model
        # set the api key
        try:
            self.api_key = os.getenv("ANTHROPIC_API_KEY") if "ANTHROPIC_API_KEY" in os.environ else api_key
            if self.api_key == None:
                raise Exception
            self.headers = {
                "accept": "application/json",
                "anthropic-version": "2023-06-01",
                "content-type": "application/json",
                "x-api-key": self.api_key
            }
        except:
            raise ValueError("Missing Anthropic API Key - A call is being made to anthropic but no key is set either in the environment variables or via params")
        pass

    def completion(self, model: str, messages: list, model_response: dict, print_verbose: Callable, optional_params=None, litellm_params=None, logger_fn=None): # logic for parsing in - calling - parsing out model completion calls
        model = model
        prompt = f"{AnthropicConstants.HUMAN_PROMPT.value}"
        for message in messages:
            if "role" in message:
                if message["role"] == "user":
                    prompt += f"{AnthropicConstants.HUMAN_PROMPT.value}{message['content']}"
                else:
                    prompt += f"{AnthropicConstants.AI_PROMPT.value}{message['content']}"
            else:
                prompt += f"{AnthropicConstants.HUMAN_PROMPT.value}{message['content']}"
        prompt += f"{AnthropicConstants.AI_PROMPT.value}"
        if "max_tokens" in optional_params and optional_params["max_tokens"] != float('inf'):
            max_tokens = optional_params["max_tokens"]
        else:
            max_tokens = self.default_max_tokens_to_sample
        data = {
            "model": model,
            "prompt": prompt,
            "max_tokens_to_sample": max_tokens,
            **optional_params
        }
        ## LOGGING
        logging(model=model, input=prompt, additional_args={"litellm_params": litellm_params, "optional_params": optional_params}, logger_fn=logger_fn)
        ## COMPLETION CALL
        response = requests.post(self.completion_url, headers=self.headers, data=json.dumps(data))
        if "stream" in optional_params and optional_params["stream"] == True:
            return response.iter_lines()
        else:
            ## LOGGING
            logging(model=model, input=prompt, additional_args={"litellm_params": litellm_params, "optional_params": optional_params, "original_response": response.text}, logger_fn=logger_fn)
            print_verbose(f"raw model_response: {response.text}")
            ## RESPONSE OBJECT
            completion_response = response.json()
            print(f"completion_response: {completion_response}")
            if "error" in completion_response:
                raise AnthropicError(message=completion_response["error"], status_code=response.status_code)
            else:
                model_response["choices"][0]["message"]["content"] = completion_response["completion"]
            ## CALCULATING USAGE
            prompt_tokens = len(self.encoding.encode(prompt)) ##[TODO] use the anthropic tokenizer here
            completion_tokens = len(self.encoding.encode(model_response["choices"][0]["message"]["content"])) ##[TODO] use the anthropic tokenizer here
            model_response["created"] = time.time()
            model_response["model"] = model
            model_response["usage"] = {
                "prompt_tokens": prompt_tokens,
                "completion_tokens": completion_tokens,
                "total_tokens": prompt_tokens + completion_tokens
            }
            return model_response

    def embedding(): # logic for parsing in - calling - parsing out model embedding calls
        pass
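For context, the call site in litellm/main.py (diffed further down) now drives this class roughly as follows; this is a condensed restatement of that change, not extra code in the file above:

```python
# condensed from the litellm/main.py hunk in this commit
anthropic_key = api_key if api_key is not None else litellm.anthropic_key
anthropic_client = AnthropicLLM(
    encoding=encoding,                                # tiktoken encoding used for token counting
    default_max_tokens_to_sample=litellm.max_tokens,  # fallback when no max_tokens is passed
    api_key=anthropic_key,
)
model_response = anthropic_client.completion(
    model=model,
    messages=messages,
    model_response=model_response,
    print_verbose=print_verbose,
    optional_params=optional_params,
    litellm_params=litellm_params,
    logger_fn=logger_fn,
)
```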

11
litellm/llms/base.py Normal file
View file

@@ -0,0 +1,11 @@
## This is a template base class to be used for adding new LLM providers via API calls
class BaseLLM():
    def validate_environment(): # set up the environment required to run the model
        pass

    def completion(): # logic for parsing in - calling - parsing out model completion calls
        pass

    def embedding(): # logic for parsing in - calling - parsing out model embedding calls
        pass
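As a purely hypothetical illustration (not part of this commit), a provider added later would follow the same shape as AnthropicLLM above; the class name, URL, and env var below are invented:

```python
import os

class ExampleProviderLLM(BaseLLM):  # hypothetical provider class, for illustration only
    def __init__(self, api_key=None):
        self.completion_url = "https://api.example-provider.com/v1/complete"  # placeholder endpoint
        self.validate_environment(api_key=api_key)

    def validate_environment(self, api_key):
        # prefer an explicit key, fall back to a provider-specific env var
        self.api_key = api_key or os.getenv("EXAMPLE_PROVIDER_API_KEY")
        if self.api_key is None:
            raise ValueError("Missing API key for example provider")

    def completion(self, model, messages, model_response, print_verbose,
                   optional_params=None, litellm_params=None, logger_fn=None):
        # translate messages -> provider request, call the API,
        # then map the raw response back onto model_response
        raise NotImplementedError

    def embedding(self):
        raise NotImplementedError
```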

View file

@@ -4,7 +4,9 @@ from functools import partial
 import dotenv, traceback, random, asyncio, time
 from copy import deepcopy
 import litellm
-from litellm import client, logging, exception_type, timeout, get_optional_params
+from litellm import client, logging, exception_type, timeout, get_optional_params, get_litellm_params
+from litellm.utils import get_secret, install_and_import, CustomStreamWrapper, read_config_args
+from .llms.anthropic import AnthropicLLM
 import tiktoken
 from concurrent.futures import ThreadPoolExecutor
 encoding = tiktoken.get_encoding("cl100k_base")
@@ -39,17 +41,18 @@ async def acompletion(*args, **kwargs):
 # @retry(wait=wait_random_exponential(min=1, max=60), stop=stop_after_attempt(2), reraise=True, retry_error_callback=lambda retry_state: setattr(retry_state.outcome, 'retry_variable', litellm.retry)) # retry call, turn this off by setting `litellm.retry = False`
 @timeout(600) ## set timeouts, in case calls hang (e.g. Azure) - default is 60s, override with `force_timeout`
 def completion(
-    messages, model="gpt-3.5-turbo",# required params
+    model, messages,# required params
     # Optional OpenAI params: see https://platform.openai.com/docs/api-reference/chat/create
    functions=[], function_call="", # optional params
    temperature=1, top_p=1, n=1, stream=False, stop=None, max_tokens=float('inf'),
    presence_penalty=0, frequency_penalty=0, logit_bias={}, user="", deployment_id=None,
    # Optional liteLLM function params
-    *, return_async=False, api_key=None, force_timeout=600, azure=False, logger_fn=None, verbose=False,
-    hugging_face = False, replicate=False,together_ai = False, custom_llm_provider=None, custom_api_base=None
+    *, return_async=False, api_key=None, force_timeout=600, logger_fn=None, verbose=False, azure=False, custom_llm_provider=None, custom_api_base=None
 ):
     try:
         global new_response
+        if azure: # this flag is deprecated, remove once notebooks are also updated.
+            custom_llm_provider="azure"
         args = locals()
         model_response = deepcopy(new_response) # deep copy the default response format so we can mutate it and it's thread-safe.
         # check if user passed in any of the OpenAI optional params
@@ -58,9 +61,15 @@ def completion(
             temperature=temperature, top_p=top_p, n=n, stream=stream, stop=stop, max_tokens=max_tokens,
             presence_penalty=presence_penalty, frequency_penalty=frequency_penalty, logit_bias=logit_bias, user=user, deployment_id=deployment_id,
             # params to identify the model
-            model=model, replicate=replicate, hugging_face=hugging_face, together_ai=together_ai
+            model=model, custom_llm_provider=custom_llm_provider
         )
-        if azure == True or custom_llm_provider == "azure": # [TODO]: remove azure=True flag, move to 'custom_llm_provider' approach
+        # For logging - save the values of the litellm-specific params passed in
+        litellm_params = get_litellm_params(
+            return_async=return_async, api_key=api_key, force_timeout=force_timeout,
+            logger_fn=logger_fn, verbose=verbose, custom_llm_provider=custom_llm_provider,
+            custom_api_base=custom_api_base)
+        if custom_llm_provider == "azure":
             # azure configs
             openai.api_type = "azure"
             openai.api_base = litellm.api_base if litellm.api_base is not None else get_secret("AZURE_API_BASE")
@@ -73,7 +82,7 @@ def completion(
             else:
                 openai.api_key = get_secret("AZURE_API_KEY")
             ## LOGGING
-            logging(model=model, input=messages, additional_args=optional_params, azure=azure, logger_fn=logger_fn)
+            logging(model=model, input=messages, additional_args=optional_params, custom_llm_provider=custom_llm_provider, logger_fn=logger_fn)
             ## COMPLETION CALL
             if litellm.headers:
                 response = openai.ChatCompletion.create(
@@ -103,7 +112,7 @@ def completion(
             else:
                 openai.api_key = get_secret("OPENAI_API_KEY")
             ## LOGGING
-            logging(model=model, input=messages, additional_args=args, azure=azure, logger_fn=logger_fn)
+            logging(model=model, input=messages, additional_args=args, custom_llm_provider=custom_llm_provider, logger_fn=logger_fn)
             ## COMPLETION CALL
             if litellm.headers:
                 response = openai.ChatCompletion.create(
@@ -132,7 +141,7 @@ def completion(
             openai.organization = litellm.organization
             prompt = " ".join([message["content"] for message in messages])
             ## LOGGING
-            logging(model=model, input=prompt, additional_args=optional_params, azure=azure, logger_fn=logger_fn)
+            logging(model=model, input=prompt, additional_args=optional_params, custom_llm_provider=custom_llm_provider, logger_fn=logger_fn)
             ## COMPLETION CALL
             if litellm.headers:
                 response = openai.Completion.create(
@@ -147,14 +156,14 @@ def completion(
                 )
             completion_response = response["choices"]["text"]
             ## LOGGING
-            logging(model=model, input=prompt, azure=azure, additional_args={"max_tokens": max_tokens, "original_response": completion_response}, logger_fn=logger_fn)
+            logging(model=model, input=prompt, custom_llm_provider=custom_llm_provider, additional_args={"max_tokens": max_tokens, "original_response": completion_response}, logger_fn=logger_fn)
             ## RESPONSE OBJECT
             model_response["choices"][0]["message"]["content"] = completion_response
             model_response["created"] = response["created"]
             model_response["model"] = model
             model_response["usage"] = response["usage"]
             response = model_response
-        elif "replicate" in model or replicate == True or custom_llm_provider == "replicate":
+        elif "replicate" in model or custom_llm_provider == "replicate":
             # import replicate/if it fails then pip install replicate
             install_and_import("replicate")
             import replicate
@@ -169,11 +178,11 @@ def completion(
                 os.environ["REPLICATE_API_TOKEN"] = litellm.replicate_key
             prompt = " ".join([message["content"] for message in messages])
             input = {"prompt": prompt}
-            if max_tokens != float('inf'):
+            if "max_tokens" in optional_params:
                 input["max_length"] = max_tokens # for t5 models
                 input["max_new_tokens"] = max_tokens # for llama2 models
             ## LOGGING
-            logging(model=model, input=input, azure=azure, additional_args={"max_tokens": max_tokens}, logger_fn=logger_fn)
+            logging(model=model, input=input, custom_llm_provider=custom_llm_provider, additional_args={"max_tokens": max_tokens}, logger_fn=logger_fn)
             ## COMPLETION CALL
             output = replicate.run(
                 model,
@@ -188,7 +197,7 @@ def completion(
                 response += item
             completion_response = response
             ## LOGGING
-            logging(model=model, input=prompt, azure=azure, additional_args={"max_tokens": max_tokens, "original_response": completion_response}, logger_fn=logger_fn)
+            logging(model=model, input=prompt, custom_llm_provider=custom_llm_provider, additional_args={"max_tokens": max_tokens, "original_response": completion_response}, logger_fn=logger_fn)
             prompt_tokens = len(encoding.encode(prompt))
             completion_tokens = len(encoding.encode(completion_response))
             ## RESPONSE OBJECT
@@ -202,59 +211,13 @@ def completion(
             }
             response = model_response
         elif model in litellm.anthropic_models:
-            # import anthropic/if it fails then pip install anthropic
-            install_and_import("anthropic")
-            from anthropic import Anthropic, HUMAN_PROMPT, AI_PROMPT
-            #anthropic defaults to os.environ.get("ANTHROPIC_API_KEY")
-            if api_key:
-                os.environ["ANTHROPIC_API_KEY"] = api_key
-            elif litellm.anthropic_key:
-                os.environ["ANTHROPIC_API_KEY"] = litellm.anthropic_key
-            prompt = f"{HUMAN_PROMPT}"
-            for message in messages:
-                if "role" in message:
-                    if message["role"] == "user":
-                        prompt += f"{HUMAN_PROMPT}{message['content']}"
-                    else:
-                        prompt += f"{AI_PROMPT}{message['content']}"
-                else:
-                    prompt += f"{HUMAN_PROMPT}{message['content']}"
-            prompt += f"{AI_PROMPT}"
-            anthropic = Anthropic()
-            if max_tokens != float('inf'):
-                max_tokens_to_sample = max_tokens
-            else:
-                max_tokens_to_sample = litellm.max_tokens # default in Anthropic docs https://docs.anthropic.com/claude/reference/client-libraries
-            ## LOGGING
-            logging(model=model, input=prompt, azure=azure, additional_args={"max_tokens": max_tokens}, logger_fn=logger_fn)
-            ## COMPLETION CALL
-            completion = anthropic.completions.create(
-                model=model,
-                prompt=prompt,
-                max_tokens_to_sample=max_tokens_to_sample,
-                **optional_params
-            )
+            anthropic_key = api_key if api_key is not None else litellm.anthropic_key
+            anthropic_client = AnthropicLLM(encoding=encoding, default_max_tokens_to_sample=litellm.max_tokens, api_key=anthropic_key)
+            model_response = anthropic_client.completion(model=model, messages=messages, model_response=model_response, print_verbose=print_verbose, optional_params=optional_params, litellm_params=litellm_params, logger_fn=logger_fn)
             if 'stream' in optional_params and optional_params['stream'] == True:
                 # don't try to access stream object,
-                response = CustomStreamWrapper(completion, model)
+                response = CustomStreamWrapper(model_response, model)
                 return response
-            completion_response = completion.completion
-            ## LOGGING
-            logging(model=model, input=prompt, azure=azure, additional_args={"max_tokens": max_tokens, "original_response": completion_response}, logger_fn=logger_fn)
-            prompt_tokens = anthropic.count_tokens(prompt)
-            completion_tokens = anthropic.count_tokens(completion_response)
-            ## RESPONSE OBJECT
-            print_verbose(f"raw model_response: {model_response}")
-            model_response["choices"][0]["message"]["content"] = completion_response
-            model_response["created"] = time.time()
-            model_response["model"] = model
-            model_response["usage"] = {
-                "prompt_tokens": prompt_tokens,
-                "completion_tokens": completion_tokens,
-                "total_tokens": prompt_tokens + completion_tokens
-            }
             response = model_response
         elif model in litellm.openrouter_models or custom_llm_provider == "openrouter":
@@ -271,7 +234,7 @@ def completion(
             else:
                 openai.api_key = get_secret("OPENROUTER_API_KEY")
             ## LOGGING
-            logging(model=model, input=messages, additional_args=optional_params, azure=azure, logger_fn=logger_fn)
+            logging(model=model, input=messages, additional_args=optional_params, custom_llm_provider=custom_llm_provider, logger_fn=logger_fn)
             ## COMPLETION CALL
             if litellm.headers:
                 response = openai.ChatCompletion.create(
@@ -312,7 +275,7 @@ def completion(
             co = cohere.Client(cohere_key)
             prompt = " ".join([message["content"] for message in messages])
             ## LOGGING
-            logging(model=model, input=prompt, azure=azure, logger_fn=logger_fn)
+            logging(model=model, input=prompt, custom_llm_provider=custom_llm_provider, logger_fn=logger_fn)
             ## COMPLETION CALL
             response = co.generate(
                 model=model,
@@ -326,7 +289,7 @@ def completion(
             completion_response = response[0].text
             ## LOGGING
-            logging(model=model, input=prompt, azure=azure, additional_args={"max_tokens": max_tokens, "original_response": completion_response}, logger_fn=logger_fn)
+            logging(model=model, input=prompt, custom_llm_provider=custom_llm_provider, additional_args={"max_tokens": max_tokens, "original_response": completion_response}, logger_fn=logger_fn)
             prompt_tokens = len(encoding.encode(prompt))
             completion_tokens = len(encoding.encode(completion_response))
             ## RESPONSE OBJECT
@@ -339,7 +302,7 @@ def completion(
                 "total_tokens": prompt_tokens + completion_tokens
             }
             response = model_response
-        elif hugging_face == True or custom_llm_provider == "huggingface":
+        elif custom_llm_provider == "huggingface":
             import requests
             API_URL = f"https://api-inference.huggingface.co/models/{model}"
             HF_TOKEN = get_secret("HF_TOKEN")
@@ -347,11 +310,11 @@ def completion(
             prompt = " ".join([message["content"] for message in messages])
             ## LOGGING
-            logging(model=model, input=prompt, azure=azure, logger_fn=logger_fn)
+            logging(model=model, input=prompt, custom_llm_provider=custom_llm_provider, logger_fn=logger_fn)
             input_payload = {"inputs": prompt}
             response = requests.post(API_URL, headers=headers, json=input_payload)
             ## LOGGING
-            logging(model=model, input=prompt, azure=azure, additional_args={"max_tokens": max_tokens, "original_response": response.text}, logger_fn=logger_fn)
+            logging(model=model, input=prompt, custom_llm_provider=custom_llm_provider, additional_args={"max_tokens": max_tokens, "original_response": response.text}, logger_fn=logger_fn)
             completion_response = response.json()[0]['generated_text']
             prompt_tokens = len(encoding.encode(prompt))
             completion_tokens = len(encoding.encode(completion_response))
@@ -365,7 +328,7 @@ def completion(
                 "total_tokens": prompt_tokens + completion_tokens
             }
             response = model_response
-        elif together_ai == True or custom_llm_provider == "together_ai":
+        elif custom_llm_provider == "together_ai":
             import requests
             TOGETHER_AI_TOKEN = get_secret("TOGETHER_AI_TOKEN")
             headers = {"Authorization": f"Bearer {TOGETHER_AI_TOKEN}"}
@@ -373,7 +336,7 @@ def completion(
             prompt = " ".join([message["content"] for message in messages]) # TODO: Add chat support for together AI
             ## LOGGING
-            logging(model=model, input=prompt, azure=azure, logger_fn=logger_fn)
+            logging(model=model, input=prompt, custom_llm_provider=custom_llm_provider, logger_fn=logger_fn)
             res = requests.post(endpoint, json={
                 "model": model,
                 "prompt": prompt,
@@ -383,7 +346,7 @@ def completion(
                 headers=headers
             )
             ## LOGGING
-            logging(model=model, input=prompt, azure=azure, additional_args={"max_tokens": max_tokens, "original_response": res.text}, logger_fn=logger_fn)
+            logging(model=model, input=prompt, custom_llm_provider=custom_llm_provider, additional_args={"max_tokens": max_tokens, "original_response": res.text}, logger_fn=logger_fn)
             if stream == True:
                 response = CustomStreamWrapper(res, "together_ai")
                 return response
@@ -411,7 +374,7 @@ def completion(
             prompt = " ".join([message["content"] for message in messages])
             ## LOGGING
-            logging(model=model, input=prompt, azure=azure, logger_fn=logger_fn)
+            logging(model=model, input=prompt, custom_llm_provider=custom_llm_provider, logger_fn=logger_fn)
             chat_model = ChatModel.from_pretrained(model)
@@ -420,7 +383,7 @@ def completion(
             completion_response = chat.send_message(prompt, **optional_params)
             ## LOGGING
-            logging(model=model, input=prompt, azure=azure, additional_args={"max_tokens": max_tokens, "original_response": completion_response}, logger_fn=logger_fn)
+            logging(model=model, input=prompt, custom_llm_provider=custom_llm_provider, additional_args={"max_tokens": max_tokens, "original_response": completion_response}, logger_fn=logger_fn)
             ## RESPONSE OBJECT
             model_response["choices"][0]["message"]["content"] = completion_response
@@ -438,13 +401,13 @@ def completion(
             return generator
         else:
             ## LOGGING
-            logging(model=model, input=messages, azure=azure, logger_fn=logger_fn)
+            logging(model=model, input=messages, custom_llm_provider=custom_llm_provider, logger_fn=logger_fn)
             args = locals()
             raise ValueError(f"Invalid completion model args passed in. Check your input - {args}")
         return response
     except Exception as e:
         ## LOGGING
-        logging(model=model, input=messages, azure=azure, additional_args={"max_tokens": max_tokens}, logger_fn=logger_fn, exception=e)
+        logging(model=model, input=messages, custom_llm_provider=custom_llm_provider, additional_args={"max_tokens": max_tokens}, logger_fn=logger_fn, exception=e)
         ## Map to OpenAI Exception
         raise exception_type(model=model, original_exception=e)
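Note that backwards compatibility is kept for Azure only: azure=True is still accepted and rewritten to the new flag by the deprecation shim added above, while the replicate, hugging_face, and together_ai booleans are removed outright. A short sketch of the equivalence:

```python
# equivalent after this commit; azure=True is mapped to custom_llm_provider="azure"
completion(model="gpt-3.5-turbo", deployment_id="chatgpt-test", messages=messages, azure=True)
completion(model="gpt-3.5-turbo", deployment_id="chatgpt-test", messages=messages, custom_llm_provider="azure")
```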

View file

@@ -43,7 +43,7 @@ def test_completion_hf_api():
     try:
         user_message = "write some code to find the sum of two numbers"
         messages = [{ "content": user_message,"role": "user"}]
-        response = completion(model="stabilityai/stablecode-completion-alpha-3b-4k", messages=messages, hugging_face=True)
+        response = completion(model="stabilityai/stablecode-completion-alpha-3b-4k", messages=messages, custom_llm_provider="huggingface")
         # Add any assertions here to check the response
         print(response)
     except Exception as e:
@@ -141,7 +141,7 @@ def test_completion_openai_with_functions():
 def test_completion_azure():
     try:
-        response = completion(model="gpt-3.5-turbo", deployment_id="chatgpt-test", messages=messages, azure=True)
+        response = completion(model="gpt-3.5-turbo", deployment_id="chatgpt-test", messages=messages, custom_llm_provider="azure")
         # Add any assertions here to check the response
         print(response)
     except Exception as e:
@@ -162,7 +162,7 @@ def test_completion_replicate_llama_stream():
 def test_completion_replicate_stability_stream():
     model_name = "stability-ai/stablelm-tuned-alpha-7b:c49dae362cbaecd2ceabb5bd34fdb68413c4ff775111fea065d259d577757beb"
     try:
-        response = completion(model=model_name, messages=messages, stream=True, replicate=True)
+        response = completion(model=model_name, messages=messages, stream=True, custom_llm_provider="replicate")
         # Add any assertions here to check the response
         for chunk in response:
             print(chunk['choices'][0]['delta'])
@@ -170,27 +170,10 @@ def test_completion_replicate_stability_stream():
     except Exception as e:
         pytest.fail(f"Error occurred: {e}")
-# Replicate API endpoints are unstable -> throw random CUDA errors -> this means our tests can fail even if our tests weren't incorrect.
-# [TODO] improve our try-except block to handle for these
-# def test_completion_replicate_llama():
-#     model_name = "replicate/llama-2-70b-chat:2c1608e18606fad2812020dc541930f2d0495ce32eee50074220b87300bc16e1"
-#     try:
-#         response = completion(model=model_name, messages=messages, max_tokens=500)
-#         # Add any assertions here to check the response
-#         print(response)
-#     except Exception as e:
-#         print(f"in replicate llama, got error {e}")
-#         pass
-#         if e == "FunctionTimedOut":
-#             pass
-#         else:
-#             pytest.fail(f"Error occurred: {e}")
 def test_completion_replicate_stability():
     model_name = "stability-ai/stablelm-tuned-alpha-7b:c49dae362cbaecd2ceabb5bd34fdb68413c4ff775111fea065d259d577757beb"
     try:
-        response = completion(model=model_name, messages=messages, replicate=True)
+        response = completion(model=model_name, messages=messages, custom_llm_provider="replicate")
         # Add any assertions here to check the response
         for result in response:
             print(result)
@@ -202,7 +185,7 @@ def test_completion_replicate_stability():
 def test_completion_together_ai():
     model_name = "togethercomputer/llama-2-70b-chat"
     try:
-        response = completion(model=model_name, messages=messages, together_ai=True)
+        response = completion(model=model_name, messages=messages, custom_llm_provider="together_ai")
         # Add any assertions here to check the response
         print(response)
     except Exception as e:
@@ -211,7 +194,7 @@ def test_completion_together_ai():
 def test_completion_together_ai_stream():
     model_name = "togethercomputer/llama-2-70b-chat"
     try:
-        response = completion(model=model_name, messages=messages, together_ai=True, stream=True)
+        response = completion(model=model_name, messages=messages, custom_llm_provider="together_ai", stream=True)
         # Add any assertions here to check the response
         print(response)
         for chunk in response:

View file

@@ -21,7 +21,8 @@ litellm.failure_callback = ["sentry"]
 # Approach: Run each model through the test -> assert if the correct error (always the same one) is triggered
 # models = ["gpt-3.5-turbo", "chatgpt-test", "claude-instant-1", "command-nightly"]
-models = ["command-nightly"]
+test_model = "claude-instant-1"
+models = ["claude-instant-1"]
 def logging_fn(model_call_dict):
     if "model" in model_call_dict:
         print(f"model_call_dict: {model_call_dict['model']}")
@@ -35,9 +36,9 @@ def test_context_window(model):
     sample_text = "how does a court case get to the Supreme Court?" * 5000
     messages = [{"content": sample_text, "role": "user"}]
     try:
-        azure = model == "chatgpt-test"
+        model = "chatgpt-test"
         print(f"model: {model}")
-        response = completion(model=model, messages=messages, azure=azure, logger_fn=logging_fn)
+        response = completion(model=model, messages=messages, custom_llm_provider="azure", logger_fn=logging_fn)
         print(f"response: {response}")
     except InvalidRequestError:
         print("InvalidRequestError")
@@ -51,7 +52,7 @@ def test_context_window(model):
         print(f"Uncaught Exception - {e}")
         pytest.fail(f"Error occurred: {e}")
     return
-test_context_window("command-nightly")
+test_context_window(test_model)
 # Test 2: InvalidAuth Errors
 @pytest.mark.parametrize("model", models)
@@ -59,14 +60,14 @@ def invalid_auth(model): # set the model key to an invalid key, depending on the
     messages = [{ "content": "Hello, how are you?","role": "user"}]
     temporary_key = None
     try:
-        azure = False
+        custom_llm_provider = None
         if model == "gpt-3.5-turbo":
             temporary_key = os.environ["OPENAI_API_KEY"]
             os.environ["OPENAI_API_KEY"] = "bad-key"
         elif model == "chatgpt-test":
             temporary_key = os.environ["AZURE_API_KEY"]
             os.environ["AZURE_API_KEY"] = "bad-key"
-            azure = True
+            custom_llm_provider = "azure"
         elif model == "claude-instant-1":
             temporary_key = os.environ["ANTHROPIC_API_KEY"]
             os.environ["ANTHROPIC_API_KEY"] = "bad-key"
@@ -77,7 +78,7 @@ def invalid_auth(model): # set the model key to an invalid key, depending on the
             temporary_key = os.environ["REPLICATE_API_KEY"]
             os.environ["REPLICATE_API_KEY"] = "bad-key"
         print(f"model: {model}")
-        response = completion(model=model, messages=messages, azure=azure)
+        response = completion(model=model, messages=messages, custom_llm_provider=custom_llm_provider)
         print(f"response: {response}")
     except AuthenticationError as e:
         print(f"AuthenticationError Caught Exception - {e}")
@@ -101,17 +102,17 @@ def invalid_auth(model): # set the model key to an invalid key, depending on the
     elif model == "replicate/llama-2-70b-chat:2c1608e18606fad2812020dc541930f2d0495ce32eee50074220b87300bc16e1":
         os.environ["REPLICATE_API_KEY"] = temporary_key
     return
-invalid_auth("command-nightly")
+invalid_auth(test_model)
 # # Test 3: Rate Limit Errors
 # def test_model(model):
 #     try:
 #         sample_text = "how does a court case get to the Supreme Court?" * 50000
 #         messages = [{ "content": sample_text,"role": "user"}]
-#         azure = False
+#         custom_llm_provider = None
 #         if model == "chatgpt-test":
-#             azure = True
+#             custom_llm_provider = "azure"
 #         print(f"model: {model}")
-#         response = completion(model=model, messages=messages, azure=azure)
+#         response = completion(model=model, messages=messages, custom_llm_provider=custom_llm_provider)
 #     except RateLimitError:
 #         return True
 #     except OpenAIError: # is at least an openai error -> in case of random model errors - e.g. overloaded server

View file

@@ -28,31 +28,32 @@ except:
 # test on non-openai completion call
 try:
     response = completion(model="claude-instant-1", messages=messages, logger_fn=logger_fn)
+    print(f"claude response: {response}")
     score +=1
 except:
     print(f"error occurred: {traceback.format_exc()}")
     pass
-# test on openai embedding call
-try:
-    response = embedding(model='text-embedding-ada-002', input=[user_message], logger_fn=logger_fn)
-    score +=1
-except:
-    traceback.print_exc()
-# test on bad azure openai embedding call -> missing azure flag and this isn't an embedding model
-try:
-    response = embedding(model='chatgpt-test', input=[user_message], logger_fn=logger_fn)
-except:
-    score +=1 # expect this to fail
-    traceback.print_exc()
-# test on good azure openai embedding call
-try:
-    response = embedding(model='azure-embedding-model', input=[user_message], azure=True, logger_fn=logger_fn)
-    score +=1
-except:
-    traceback.print_exc()
-print(f"Score: {score}, Overall score: {score/5}")
+# # test on openai embedding call
+# try:
+#     response = embedding(model='text-embedding-ada-002', input=[user_message], logger_fn=logger_fn)
+#     score +=1
+# except:
+#     traceback.print_exc()
+# # test on bad azure openai embedding call -> missing azure flag and this isn't an embedding model
+# try:
+#     response = embedding(model='chatgpt-test', input=[user_message], logger_fn=logger_fn)
+# except:
+#     score +=1 # expect this to fail
+#     traceback.print_exc()
+# # test on good azure openai embedding call
+# try:
+#     response = embedding(model='azure-embedding-model', input=[user_message], azure=True, logger_fn=logger_fn)
+#     score +=1
+# except:
+#     traceback.print_exc()
+# print(f"Score: {score}, Overall score: {score/5}")

View file

@@ -19,7 +19,7 @@ messages = [{ "content": user_message,"role": "user"}]
 def test_completion_azure():
     try:
-        response = completion(model="gpt-3.5-turbo", deployment_id="chatgpt-test", messages=messages, azure=True)
+        response = completion(model="gpt-3.5-turbo", deployment_id="chatgpt-test", messages=messages, custom_llm_provider="azure")
         # Add any assertions here to check the response
         print(response)
     except Exception as e:

View file

@@ -0,0 +1,28 @@
#### What this tests ####
# This tests streaming for the completion endpoint

import sys, os
import traceback
sys.path.insert(0, os.path.abspath('../..'))  # Adds the parent directory to the system path
import litellm
from litellm import completion

litellm.set_verbose = False

score = 0

def logger_fn(model_call_object: dict):
    print(f"model call details: {model_call_object}")

user_message = "Hello, how are you?"
messages = [{ "content": user_message,"role": "user"}]

# test on anthropic completion call
try:
    response = completion(model="claude-instant-1", messages=messages, stream=True, logger_fn=logger_fn)
    for chunk in response:
        print(chunk['choices'][0]['delta'])
    score +=1
except:
    print(f"error occurred: {traceback.format_exc()}")
    pass

View file

@@ -64,19 +64,17 @@ def install_and_import(package: str):
 ####### LOGGING ###################
 #Logging function -> log the exact model details + what's being sent | Non-Blocking
-def logging(model=None, input=None, azure=False, additional_args={}, logger_fn=None, exception=None):
+def logging(model=None, input=None, custom_llm_provider=None, azure=False, additional_args={}, logger_fn=None, exception=None):
     try:
         model_call_details = {}
         if model:
             model_call_details["model"] = model
         if azure:
             model_call_details["azure"] = azure
+        if custom_llm_provider:
+            model_call_details["custom_llm_provider"] = custom_llm_provider
         if exception:
             model_call_details["exception"] = exception
-        # if litellm.telemetry:
-        #     safe_crash_reporting(model=model, exception=exception, azure=azure) # log usage-crash details. Do not log any user details. If you want to turn this off, set `litellm.telemetry=False`.
         if input:
             model_call_details["input"] = input
@@ -132,8 +130,8 @@ def client(original_function):
     try:
         model = args[0] if len(args) > 0 else kwargs["model"]
         exception = kwargs["exception"] if "exception" in kwargs else None
-        azure = kwargs["azure"] if "azure" in kwargs else None
-        safe_crash_reporting(model=model, exception=exception, azure=azure) # log usage-crash details. Do not log any user details. If you want to turn this off, set `litellm.telemetry=False`.
+        custom_llm_provider = kwargs["custom_llm_provider"] if "custom_llm_provider" in kwargs else None
+        safe_crash_reporting(model=model, exception=exception, custom_llm_provider=custom_llm_provider) # log usage-crash details. Do not log any user details. If you want to turn this off, set `litellm.telemetry=False`.
     except:
         #[Non-Blocking Error]
         pass
@@ -206,6 +204,32 @@ def completion_cost(model="gpt-3.5-turbo", prompt="", completion=""):
     return prompt_tokens_cost_usd_dollar + completion_tokens_cost_usd_dollar
 ####### HELPER FUNCTIONS ################
+def get_litellm_params(
+    return_async=False,
+    api_key=None,
+    force_timeout=600,
+    azure=False,
+    logger_fn=None,
+    verbose=False,
+    hugging_face=False,
+    replicate=False,
+    together_ai=False,
+    custom_llm_provider=None,
+    custom_api_base=None
+):
+    litellm_params = {
+        "return_async": return_async,
+        "api_key": api_key,
+        "force_timeout": force_timeout,
+        "logger_fn": logger_fn,
+        "verbose": verbose,
+        "custom_llm_provider": custom_llm_provider,
+        "custom_api_base": custom_api_base
+    }
+    return litellm_params
 def get_optional_params(
     # 12 optional params
     functions = [],
@@ -222,9 +246,7 @@ def get_optional_params(
     user = "",
     deployment_id = None,
     model = None,
-    replicate = False,
-    hugging_face = False,
-    together_ai = False,
+    custom_llm_provider = ""
 ):
     optional_params = {}
     if model in litellm.anthropic_models:
@@ -247,13 +269,13 @@ def get_optional_params(
         if max_tokens != float('inf'):
             optional_params["max_tokens"] = max_tokens
         return optional_params
-    elif replicate == True:
+    elif custom_llm_provider == "replicate":
         # any replicate models
         # TODO: handle translating remaining replicate params
         if stream:
             optional_params["stream"] = stream
         return optional_params
-    elif together_ai == True:
+    elif custom_llm_provider == "together_ai":
         if stream:
             optional_params["stream_tokens"] = stream
         if temperature != 1:
@@ -621,11 +643,11 @@ def exception_type(model, original_exception):
     else: # don't let an error with mapping interrupt the user from receiving an error from the llm api calls
         raise original_exception
-def safe_crash_reporting(model=None, exception=None, azure=None):
+def safe_crash_reporting(model=None, exception=None, custom_llm_provider=None):
     data = {
         "model": model,
         "exception": str(exception),
-        "azure": azure
+        "custom_llm_provider": custom_llm_provider
     }
     threading.Thread(target=litellm_telemetry, args=(data,)).start()
@@ -698,6 +720,13 @@ class CustomStreamWrapper:
     def __iter__(self):
         return self
+    def handle_anthropic_chunk(self, chunk):
+        str_line = chunk.decode('utf-8')  # Convert bytes to string
+        if str_line.startswith('data:'):
+            data_json = json.loads(str_line[5:])
+            return data_json.get("completion", "")
+        return ""
     def handle_together_ai_chunk(self, chunk):
         chunk = chunk.decode("utf-8")
         text_index = chunk.find('"text":"') # this checks if text: exists
@@ -713,7 +742,7 @@ class CustomStreamWrapper:
         completion_obj ={ "role": "assistant", "content": ""}
         if self.model in litellm.anthropic_models:
             chunk = next(self.completion_stream)
-            completion_obj["content"] = chunk.completion
+            completion_obj["content"] = self.handle_anthropic_chunk(chunk)
         elif self.model == "replicate":
             chunk = next(self.completion_stream)
             completion_obj["content"] = chunk

View file

@@ -1,29 +0,0 @@
# Advanced - Callbacks
## Use Callbacks to send Output Data to Posthog, Sentry etc
liteLLM provides `success_callbacks` and `failure_callbacks`, making it easy for you to send data to a particular provider depending on the status of your responses.
liteLLM supports:
- [Helicone](https://docs.helicone.ai/introduction)
- [Sentry](https://docs.sentry.io/platforms/python/)
- [PostHog](https://posthog.com/docs/libraries/python)
- [Slack](https://slack.dev/bolt-python/concepts)
### Quick Start
```python
from litellm import completion
# set callbacks
litellm.success_callback=["posthog", "helicone"]
litellm.failure_callback=["sentry"]
## set env variables
os.environ['SENTRY_API_URL'], os.environ['SENTRY_API_TRACE_RATE']= ""
os.environ['POSTHOG_API_KEY'], os.environ['POSTHOG_API_URL'] = "api-key", "api-url"
os.environ["HELICONE_API_KEY"] = ""
response = completion(model="gpt-3.5-turbo", messages=messages)
```

View file

@@ -1,34 +0,0 @@
# BerriSpend Tutorial
BerriSpend is a free dashboard to monitor your cost and logs across llm providers.
## Use BerriSpend to see total spend across all LLM Providers (OpenAI, Azure, Anthropic, Cohere, Replicate, PaLM)
liteLLM provides `success_callbacks` and `failure_callbacks`, making it easy for you to send data to a particular provider depending on the status of your responses.
In this case, we want to log requests to BerriSpend when a request succeeds.
### Use Callbacks
Use just 2 lines of code, to instantly see costs and log your responses **across all providers** with BerriSpend:
```
litellm.success_callback=["berrispend"]
litellm.failure_callback=["berrispend"]
```
Complete code
```python
from litellm import completion
## set env variables
os.environ["BERRISPEND_ACCOUNT_ID"] = "your-email-id"
os.environ["OPENAI_API_KEY"] = ""
# set callbacks
litellm.success_callback=["berrispend"]
litellm.failure_callback=["berrispend"]
#openai call
response = completion(model="gpt-3.5-turbo", messages=[{"role": "user", "content": "Hi 👋 - i'm openai"}])
#bad call
response = completion(model="chatgpt-test", messages=[{"role": "user", "content": "Hi 👋 - i'm a bad call to test error logging"}])
```

View file

@@ -1,12 +0,0 @@
# Data Logging Integrations
| Integration | Required OS Variables | How to Use with callbacks |
|-----------------|--------------------------------------------|-------------------------------------------|
| Sentry | `SENTRY_API_URL` | `litellm.success_callback=["sentry"], litellm.failure_callback=["sentry"]` |
| Posthog | `POSTHOG_API_KEY`,<br>`POSTHOG_API_URL` | `litellm.success_callback=["posthog"], litellm.failure_callback=["posthog"]` |
| Slack | `SLACK_API_TOKEN`,<br>`SLACK_API_SECRET`,<br>`SLACK_API_CHANNEL` | `litellm.success_callback=["slack"], litellm.failure_callback=["slack"]` |
| Helicone | `HELICONE_API_TOKEN` | `litellm.success_callback=["helicone"]` |

View file

@@ -1,6 +0,0 @@
# Contact Us
[![](https://dcbadge.vercel.app/api/server/wuPM9dRgDw)](https://discord.gg/wuPM9dRgDw)
* [Meet with us 👋](https://calendly.com/d/4mp-gd3-k5k/berriai-1-1-onboarding-litellm-hosted-version)
* Contact us at ishaan@berri.ai / krrish@berri.ai

View file

@@ -1,34 +0,0 @@
## Contributing to Documentation
Clone litellm
```
git clone https://github.com/BerriAI/litellm.git
```
### Local setup for locally running docs
#### Installation
```
pip install mkdocs
```
#### Locally Serving Docs
```
mkdocs serve
```
If you see `command not found: mkdocs` try running the following
```
python3 -m mkdocs serve
```
This command builds your Markdown files into HTML and starts a development server to browse your documentation. Open up [http://127.0.0.1:8000/](http://127.0.0.1:8000/) in your web browser to see your documentation. You can make changes to your Markdown files and your docs will automatically rebuild.
[Full tutorial here](https://docs.readthedocs.io/en/stable/intro/getting-started-with-mkdocs.html)
### Making changes to Docs
- All the docs are placed under the `docs` directory
- If you are adding a new `.md` file or editing the hierarchy edit `mkdocs.yml` in the root of the project
- After testing your changes, make a change to the `main` branch of [github.com/BerriAI/litellm](https://github.com/BerriAI/litellm)

View file

@@ -1,55 +0,0 @@
# Helicone Tutorial
[Helicone](https://helicone.ai/) is an open source observability platform that proxies your OpenAI traffic and provides you key insights into your spend, latency and usage.
## Use Helicone to log requests across all LLM Providers (OpenAI, Azure, Anthropic, Cohere, Replicate, PaLM)
liteLLM provides `success_callbacks` and `failure_callbacks`, making it easy for you to send data to a particular provider depending on the status of your responses.
In this case, we want to log requests to Helicone when a request succeeds.
### Approach 1: Use Callbacks
Use just 1 line of code, to instantly log your responses **across all providers** with helicone:
```
litellm.success_callback=["helicone"]
```
Complete code
```python
from litellm import completion
## set env variables
os.environ["HELICONE_API_KEY"] = "your-helicone-key"
os.environ["OPENAI_API_KEY"], os.environ["COHERE_API_KEY"] = "", ""
# set callbacks
litellm.success_callback=["helicone"]
#openai call
response = completion(model="gpt-3.5-turbo", messages=[{"role": "user", "content": "Hi 👋 - i'm openai"}])
#cohere call
response = completion(model="command-nightly", messages=[{"role": "user", "content": "Hi 👋 - i'm cohere"}])
```
### Approach 2: [OpenAI + Azure only] Use Helicone as a proxy
Helicone provides advanced functionality like caching, etc. Helicone currently supports this for Azure and OpenAI.
If you want to use Helicone to proxy your OpenAI/Azure requests, then you can -
- Set helicone as your base url via: `litellm.api_url`
- Pass in helicone request headers via: `litellm.headers`
Complete Code
```
import litellm
from litellm import completion
litellm.api_base = "https://oai.hconeai.com/v1"
litellm.headers = {"Helicone-Auth": f"Bearer {os.getenv('HELICONE_API_KEY')}"}
response = litellm.completion(
model="gpt-3.5-turbo",
messages=[{"role": "user", "content": "how does a court case get to the Supreme Court?"}]
)
print(response)
```

View file

@@ -1,43 +0,0 @@
# *🚅 litellm*
a light 100 line package to simplify calling OpenAI, Azure, Cohere, Anthropic APIs
###### litellm manages:
* Calling all LLM APIs using the OpenAI format - `completion(model, messages)`
* Consistent output for all LLM APIs, text responses will always be available at `['choices'][0]['message']['content']`
* Consistent Exceptions for all LLM APIs, we map RateLimit, Context Window, and Authentication Error exceptions across all providers to their OpenAI equivalents. [see Code](https://github.com/BerriAI/litellm/blob/ba1079ff6698ef238c5c7f771dd2b698ec76f8d9/litellm/utils.py#L250)
###### observability:
* Logging - see exactly what the raw model request/response is by plugging in your own function `completion(.., logger_fn=your_logging_fn)` and/or print statements from the package `litellm.set_verbose=True`
* Callbacks - automatically send your data to Helicone, Sentry, Posthog, Slack - `litellm.success_callbacks`, `litellm.failure_callbacks` [see Callbacks](https://litellm.readthedocs.io/en/latest/advanced/)
## Quick Start
Go directly to code: [Getting Started Notebook](https://colab.research.google.com/drive/1gR3pY-JzDZahzpVdbGBtrNGDBmzUNJaJ?usp=sharing)
### Installation
```
pip install litellm
```
### Usage
```python
from litellm import completion
## set ENV variables
os.environ["OPENAI_API_KEY"] = "openai key"
os.environ["COHERE_API_KEY"] = "cohere key"
messages = [{ "content": "Hello, how are you?","role": "user"}]
# openai call
response = completion(model="gpt-3.5-turbo", messages=messages)
# cohere call
response = completion("command-nightly", messages)
```
Need Help / Support : [see troubleshooting](https://litellm.readthedocs.io/en/latest/troubleshoot)
## Why did we build liteLLM
- **Need for simplicity**: Our code started to get extremely complicated managing & translating calls between Azure, OpenAI, Cohere
## Support
* [Meet with us 👋](https://calendly.com/d/4mp-gd3-k5k/berriai-1-1-onboarding-litellm-hosted-version)
* Contact us at ishaan@berri.ai / krrish@berri.ai

View file

@@ -1,172 +0,0 @@
# Completion Function - completion()
The Input params are **exactly the same** as the
<a href="https://platform.openai.com/docs/api-reference/chat/create" target="_blank" rel="noopener noreferrer">OpenAI Create chat completion</a>, and let you call **Azure OpenAI, Anthropic, Cohere, Replicate, OpenRouter** models in the same format.
In addition, liteLLM allows you to pass in the following **Optional** liteLLM args:<br>
`forceTimeout`, `azure`, `logger_fn`, `verbose`
<!-- TODO: Add info about the following params -->
## Input - Request Body
**`model`**
<span style="color:gray; font-size: 0.8em;">string</span> <span style="color:red; font-size: 0.8em;">Required</span><br>
ID of the model to use. See the <a href="https://litellm.readthedocs.io/en/latest/supported" target="_blank" rel="noopener noreferrer">model endpoint compatibility</a> table for details on which models work with the Chat API.
---
**`messages`**
<span style="color:gray; font-size: 0.8em;">array</span> <span style="color:red; font-size: 0.8em;">Required</span><br>
<a></a>
A list of messages comprising the conversation so far. <a href="https://github.com/openai/openai-cookbook/blob/main/examples/How_to_format_inputs_to_ChatGPT_models.ipynb" target="_blank" rel="noopener noreferrer">Example Python Code</a>
```python
from litellm import completion
messages=
[
{"role": "system", "content": "You are a helpful assistant."},
{"role": "user", "content": "Knock knock."},
{"role": "assistant", "content": "Who's there?"},
{"role": "user", "content": "Orange."},
]
# openai call
response = completion(model="gpt-3.5-turbo", messages=messages, temperature=0)
# cohere call
response = completion(model="command-nightly", messages=messages, temperature=0)
```
---
>> **`role`**
>> <span style="color:gray; font-size: 0.8em;">string</span> <span style="color:red; font-size: 0.8em;">Required</span><br>
>> The role of the messages author. One of system, user, assistant, or function.
>> <br>
>>
>> ---
>> **`content`**
>> <span style="color:gray; font-size: 0.8em;">string</span> <span style="color:red; font-size: 0.8em;">Required</span><br>
>> The contents of the message. content is required for all messages, and may be null for assistant messages with function calls.
>> <br>
>>
>> ---
>> **`name`**
>> <span style="color:gray; font-size: 0.8em;">string</span> <span style="color:gray; font-size: 0.8em;">Optional</span><br>
>> The name of the author of this message. name is required if role is function, and it should be the name of the function whose response is in the content. May contain a-z, A-Z, 0-9, and underscores, with a maximum length of 64 characters.
>> <br>
>>
>> ---
>> **`function_call`**
>> <span style="color:gray; font-size: 0.8em;">object</span> <span style="color:gray; font-size: 0.8em;">Optional</span><br>
>> The name and arguments of a function that should be called, as generated by the model.
>> <br>
>>
>> ---
**`functions`**
<span style="color:gray; font-size: 0.8em;">array</span> <span style="color:gray; font-size: 0.8em;">Optional</span><br>
A list of functions the model may generate JSON inputs for.
<br>
---
>> **`name`**
>> <span style="color:gray; font-size: 0.8em;">string</span> <span style="color:red; font-size: 0.8em;">Required</span><br>
>> The name of the function to be called. Must be a-z, A-Z, 0-9, or contain underscores and dashes, with a maximum length of 64.
>> <br>
>>
>> ---
>> **`description`**
>> <span style="color:gray; font-size: 0.8em;">string</span> <span style="color:gray; font-size: 0.8em;">Optional</span><br>
>> A description of what the function does, used by the model to choose when and how to call the function.
>> <br>
>>
>> ---
>> **`parameters`**
>> <span style="color:gray; font-size: 0.8em;">object</span> <span style="color:red; font-size: 0.8em;">Required</span><br>
>> The parameters the functions accept, described as a JSON Schema object.
>> To describe a function that accepts no parameters, provide the value `{"type": "object", "properties": {}}`.
>> <br>
>>
>> ---
**`function_call`**
<span style="color:gray; font-size: 0.8em;">string or object</span> <span style="color:gray; font-size: 0.8em;">Optional</span><br>
Controls how the model responds to function calls. `"none"` means the model does not call a function and responds to the end-user. `"auto"` means the model can pick between generating a message or calling a function. Specifying a particular function via `{"name": "my_function"}` forces the model to call that function. `"none"` is the default when no functions are present; `"auto"` is the default if functions are present.
<br>
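Putting `functions` and `function_call` together, a minimal sketch (the `get_current_weather` schema is hypothetical, and function calling is only honored by models that support it, e.g. OpenAI chat models):
```python
from litellm import completion

functions = [
    {
        "name": "get_current_weather",  # hypothetical example function
        "description": "Get the current weather in a given location",
        "parameters": {
            "type": "object",
            "properties": {
                "location": {
                    "type": "string",
                    "description": "The city and state, e.g. San Francisco, CA",
                },
                "unit": {"type": "string", "enum": ["celsius", "fahrenheit"]},
            },
            "required": ["location"],
        },
    }
]

response = completion(
    model="gpt-3.5-turbo-0613",
    messages=[{"role": "user", "content": "What's the weather in Boston?"}],
    functions=functions,
    function_call="auto",  # or {"name": "get_current_weather"} to force the call
)
```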
---
**`temperature`**
<span style="color:gray; font-size: 0.8em;">number</span> <span style="color:gray; font-size: 0.8em;">Optional, Defaults to 1</span><br>
What sampling temperature to use, between 0 and 2. Higher values like 0.8 will make the output more random, while lower values like 0.2 will make it more focused and deterministic. We generally recommend altering this or `top_p` but not both.
<br>
---
**`top_p`**
<span style="color:gray; font-size: 0.8em;">number</span> <span style="color:gray; font-size: 0.8em;">Optional, Defaults to 1</span><br>
An alternative to sampling with temperature, called nucleus sampling, where the model considers the results of the tokens with top_p probability mass. So 0.1 means only the tokens comprising the top 10% probability mass are considered. We generally recommend altering this or `temperature` but not both.
<br>
---
**`n`**
<span style="color:gray; font-size: 0.8em;">integer</span> <span style="color:gray; font-size: 0.8em;">Optional, Defaults to 1</span><br>
How many chat completion choices to generate for each input message.
<br>
---
**`stream`**
<span style="color:gray; font-size: 0.8em;">boolean</span> <span style="color:gray; font-size: 0.8em;">Optional, Defaults to false</span><br>
If set, partial message deltas will be sent, like in ChatGPT. Tokens will be sent as data-only server-sent events as they become available, with the stream terminated by a `data: [DONE]` message.
<br>
---
**`stop`**
<span style="color:gray; font-size: 0.8em;">string or array</span> <span style="color:gray; font-size: 0.8em;">Optional, Defaults to null</span><br>
Up to 4 sequences where the API will stop generating further tokens.
<br>
---
**`max_tokens`**
<span style="color:gray; font-size: 0.8em;">integer</span> <span style="color:gray; font-size: 0.8em;">Optional, Defaults to inf</span><br>
The maximum number of tokens to generate in the chat completion. The total length of input tokens and generated tokens is limited by the model's context length.
<br>
---
**`presence_penalty`**
<span style="color:gray; font-size: 0.8em;">number</span> <span style="color:gray; font-size: 0.8em;">Optional, Defaults to 0</span><br>
Number between -2.0 and 2.0. Positive values penalize new tokens based on whether they appear in the text so far, increasing the model's likelihood to talk about new topics.
<br>
---
**`frequency_penalty`**
<span style="color:gray; font-size: 0.8em;">number</span> <span style="color:gray; font-size: 0.8em;">Optional, Defaults to 0</span><br>
Number between -2.0 and 2.0. Positive values penalize new tokens based on their existing frequency in the text so far, decreasing the model's likelihood to repeat the same line verbatim.
<br>
---
**`logit_bias`**
<span style="color:gray; font-size: 0.8em;">map</span> <span style="color:gray; font-size: 0.8em;">Optional, Defaults to null</span><br>
Modify the likelihood of specified tokens appearing in the completion. Accepts a JSON object that maps tokens (specified by their token ID in the tokenizer) to an associated bias value from -100 to 100. Mathematically, the bias is added to the logits generated by the model prior to sampling. The exact effect will vary per model, but values between -1 and 1 should decrease or increase the likelihood of selection; values like -100 or 100 should result in a ban or exclusive selection of the relevant token.
<br>
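For example, a minimal sketch (the token ID is illustrative only - real IDs depend on the model's tokenizer, and whether the bias is honored depends on the provider):
```python
from litellm import completion

# strongly discourage one token from being sampled (token ID 2435 is illustrative)
response = completion(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "Say hello"}],
    logit_bias={2435: -100},
)
```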
---
**`user`**
<span style="color:gray; font-size: 0.8em;">string</span> <span style="color:gray; font-size: 0.8em;">Optional</span><br>
A unique identifier representing your end-user, which can help liteLLM to monitor and detect abuse.

View file

@ -1,12 +0,0 @@
# Completion Function - completion()
Here's the exact json output you can expect from a `litellm` completion call:
```python
{'choices': [{'finish_reason': 'stop',
'index': 0,
'message': {'role': 'assistant',
'content': " I'm doing well, thank you for asking. I am Claude, an AI assistant created by Anthropic."}}],
'created': 1691429984.3852863,
'model': 'claude-instant-1',
'usage': {'prompt_tokens': 18, 'completion_tokens': 23, 'total_tokens': 41}}
```

View file

@ -1,33 +0,0 @@
# Supported Secret Managers
liteLLM reads secrets from your secret manager or `.env` file
- [Infisical Secret Manager](#infisical-secret-manager)
- [.env Files](#env-files)
For the expected format of secrets, see [supported LLM models](https://litellm.readthedocs.io/en/latest/supported)
## Infisical Secret Manager
Integrates with [Infisical's Secret Manager](https://infisical.com/) for secure storage and retrieval of API keys and sensitive data.
### Usage
liteLLM manages reading in your LLM API secrets/env variables from Infisical for you
```python
import litellm
from infisical import InfisicalClient
litellm.secret_manager = InfisicalClient(token="your-token")
messages = [
{"role": "system", "content": "You are a helpful assistant."},
{"role": "user", "content": "What's the weather like today?"},
]
response = litellm.completion(model="gpt-3.5-turbo", messages=messages)
print(response)
```
## .env Files
If no secret manager client is specified, liteLLM automatically uses the `.env` file / environment variables to manage sensitive data.
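For example, a minimal sketch (the variable name follows the [supported models](https://litellm.readthedocs.io/en/latest/supported) docs; the key value is a placeholder):
```python
import os
from litellm import completion

# with no secret manager client set, keys are read from the environment / a loaded .env file
os.environ["OPENAI_API_KEY"] = "sk-..."  # placeholder - or put OPENAI_API_KEY=... in your .env

messages = [{"role": "user", "content": "Hello, how are you?"}]
response = completion(model="gpt-3.5-turbo", messages=messages)
```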

View file

@ -1,33 +0,0 @@
# Streaming Responses & Async Completion
- [Streaming Responses](#streaming-responses)
- [Async Completion](#async-completion)
## Streaming Responses
LiteLLM supports streaming the model response back by passing `stream=True` as an argument to the completion function.
### Usage
```python
response = completion(model="gpt-3.5-turbo", messages=messages, stream=True)
for chunk in response:
    print(chunk['choices'][0]['delta'])
```
## Async Completion
Asynchronous Completion with LiteLLM
LiteLLM provides an asynchronous version of the completion function called `acompletion`
### Usage
```python
from litellm import acompletion
import asyncio

async def test_get_response():
    user_message = "Hello, how are you?"
    messages = [{"content": user_message, "role": "user"}]
    response = await acompletion(model="gpt-3.5-turbo", messages=messages)
    return response

response = asyncio.run(test_get_response())
print(response)
```

View file

@ -1,72 +0,0 @@
## Generation/Completion/Chat Completion Models
### OpenAI Chat Completion Models
| Model Name | Function Call | Required OS Variables |
|------------------|----------------------------------------|--------------------------------------|
| gpt-3.5-turbo | `completion('gpt-3.5-turbo', messages)` | `os.environ['OPENAI_API_KEY']` |
| gpt-3.5-turbo-16k | `completion('gpt-3.5-turbo-16k', messages)` | `os.environ['OPENAI_API_KEY']` |
| gpt-3.5-turbo-16k-0613 | `completion('gpt-3.5-turbo-16k-0613', messages)` | `os.environ['OPENAI_API_KEY']` |
| gpt-4 | `completion('gpt-4', messages)` | `os.environ['OPENAI_API_KEY']` |
### Azure OpenAI Chat Completion Models
| Model Name | Function Call | Required OS Variables |
|------------------|-----------------------------------------|-------------------------------------------|
| gpt-3.5-turbo | `completion('gpt-3.5-turbo', messages, azure=True)` | `os.environ['AZURE_API_KEY']`,<br>`os.environ['AZURE_API_BASE']`,<br>`os.environ['AZURE_API_VERSION']` |
| gpt-4 | `completion('gpt-4', messages, azure=True)` | `os.environ['AZURE_API_KEY']`,<br>`os.environ['AZURE_API_BASE']`,<br>`os.environ['AZURE_API_VERSION']` |
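For example, a minimal sketch (env var names per the table above; values are placeholders):
```python
import os
from litellm import completion

os.environ["AZURE_API_KEY"] = ""      # placeholder
os.environ["AZURE_API_BASE"] = ""     # placeholder
os.environ["AZURE_API_VERSION"] = ""  # placeholder

messages = [{"role": "user", "content": "Hello, how are you?"}]
response = completion("gpt-3.5-turbo", messages, azure=True)
```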
### OpenAI Text Completion Models
| Model Name | Function Call | Required OS Variables |
|------------------|--------------------------------------------|--------------------------------------|
| text-davinci-003 | `completion('text-davinci-003', messages)` | `os.environ['OPENAI_API_KEY']` |
### Cohere Models
| Model Name | Function Call | Required OS Variables |
|------------------|--------------------------------------------|--------------------------------------|
| command-nightly | `completion('command-nightly', messages)` | `os.environ['COHERE_API_KEY']` |
### Anthropic Models
| Model Name | Function Call | Required OS Variables |
|------------------|--------------------------------------------|--------------------------------------|
| claude-instant-1 | `completion('claude-instant-1', messages)` | `os.environ['ANTHROPIC_API_KEY']` |
| claude-2 | `completion('claude-2', messages)` | `os.environ['ANTHROPIC_API_KEY']` |
### Hugging Face Inference API
All [`text2text-generation`](https://huggingface.co/models?library=transformers&pipeline_tag=text2text-generation&sort=downloads) and [`text-generation`](https://huggingface.co/models?library=transformers&pipeline_tag=text-generation&sort=downloads) models are supported by liteLLM. You can use any text model from Hugging Face with the following steps:
* Copy the `model repo` URL from Hugging Face and set it as the `model` parameter in the completion call.
* Set the `hugging_face` parameter to `True`.
* Make sure to set your Hugging Face API key as `HF_TOKEN` in your environment (see the example below the table).
Here are some examples of supported models:
**Note that the models mentioned in the table are examples, and you can use any text model available on Hugging Face by following the steps above.**
| Model Name | Function Call | Required OS Variables |
|------------------|-------------------------------------------------------------------------------------|--------------------------------------|
| [stabilityai/stablecode-completion-alpha-3b-4k](https://huggingface.co/stabilityai/stablecode-completion-alpha-3b-4k) | `completion(model="stabilityai/stablecode-completion-alpha-3b-4k", messages=messages, hugging_face=True)` | `os.environ['HF_TOKEN']` |
| [bigcode/starcoder](https://huggingface.co/bigcode/starcoder) | `completion(model="bigcode/starcoder", messages=messages, hugging_face=True)` | `os.environ['HF_TOKEN']` |
| [google/flan-t5-xxl](https://huggingface.co/google/flan-t5-xxl) | `completion(model="google/flan-t5-xxl", messages=messages, hugging_face=True)` | `os.environ['HF_TOKEN']` |
| [google/flan-t5-large](https://huggingface.co/google/flan-t5-large) | `completion(model="google/flan-t5-large", messages=messages, hugging_face=True)` | `os.environ['HF_TOKEN']` |
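For example, a minimal sketch of a Hugging Face Inference API call (model repo and env var per the table above; the token value is a placeholder):
```python
import os
from litellm import completion

os.environ["HF_TOKEN"] = "hf_..."  # placeholder - your Hugging Face API token

messages = [{"role": "user", "content": "def fibonacci(n):"}]
response = completion(
    model="bigcode/starcoder",
    messages=messages,
    hugging_face=True,  # route the call to the Hugging Face Inference API
)
```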
### OpenRouter Completion Models
All the text models from [OpenRouter](https://openrouter.ai/docs) are supported by liteLLM.
| Model Name | Function Call | Required OS Variables |
|------------------|--------------------------------------------|--------------------------------------|
| openai/gpt-3.5-turbo | `completion('openai/gpt-3.5-turbo', messages)` | `os.environ['OR_SITE_URL']`,<br>`os.environ['OR_APP_NAME']`,<br>`os.environ['OR_API_KEY']` |
| openai/gpt-3.5-turbo-16k | `completion('openai/gpt-3.5-turbo-16k', messages)` | `os.environ['OR_SITE_URL']`,<br>`os.environ['OR_APP_NAME']`,<br>`os.environ['OR_API_KEY']` |
| openai/gpt-4 | `completion('openai/gpt-4', messages)` | `os.environ['OR_SITE_URL']`,<br>`os.environ['OR_APP_NAME']`,<br>`os.environ['OR_API_KEY']` |
| openai/gpt-4-32k | `completion('openai/gpt-4-32k', messages)` | `os.environ['OR_SITE_URL']`,<br>`os.environ['OR_APP_NAME']`,<br>`os.environ['OR_API_KEY']` |
| anthropic/claude-2 | `completion('anthropic/claude-2', messages)` | `os.environ['OR_SITE_URL']`,<br>`os.environ['OR_APP_NAME']`,<br>`os.environ['OR_API_KEY']` |
| anthropic/claude-instant-v1 | `completion('anthropic/claude-instant-v1', messages)` | `os.environ['OR_SITE_URL']`,<br>`os.environ['OR_APP_NAME']`,<br>`os.environ['OR_API_KEY']` |
| google/palm-2-chat-bison | `completion('google/palm-2-chat-bison', messages)` | `os.environ['OR_SITE_URL']`,<br>`os.environ['OR_APP_NAME']`,<br>`os.environ['OR_API_KEY']` |
| google/palm-2-codechat-bison | `completion('google/palm-2-codechat-bison', messages)` | `os.environ['OR_SITE_URL']`,<br>`os.environ['OR_APP_NAME']`,<br>`os.environ['OR_API_KEY']` |
| meta-llama/llama-2-13b-chat | `completion('meta-llama/llama-2-13b-chat', messages)` | `os.environ['OR_SITE_URL']`,<br>`os.environ['OR_APP_NAME']`,<br>`os.environ['OR_API_KEY']` |
| meta-llama/llama-2-70b-chat | `completion('meta-llama/llama-2-70b-chat', messages)` | `os.environ['OR_SITE_URL']`,<br>`os.environ['OR_APP_NAME']`,<br>`os.environ['OR_API_KEY']` |
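For example, a minimal sketch of an OpenRouter call (env var names per the table above; values are placeholders):
```python
import os
from litellm import completion

os.environ["OR_SITE_URL"] = "https://example.com"  # placeholder - your site URL
os.environ["OR_APP_NAME"] = "my-app"               # placeholder - your app name
os.environ["OR_API_KEY"] = "sk-or-..."             # placeholder - your OpenRouter key

messages = [{"role": "user", "content": "Hello, how are you?"}]
response = completion("openai/gpt-3.5-turbo", messages)
```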

View file

@ -1,5 +0,0 @@
## Embedding Models
| Model Name | Function Call | Required OS Variables |
|----------------------|---------------------------------------------|--------------------------------------|
| text-embedding-ada-002 | `embedding('text-embedding-ada-002', input)` | `os.environ['OPENAI_API_KEY']` |
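For example, a minimal sketch of an embedding call (signature per the table above; passing `input` as a keyword is an assumption):
```python
import os
from litellm import embedding

os.environ["OPENAI_API_KEY"] = "sk-..."  # placeholder

response = embedding("text-embedding-ada-002", input=["good morning from litellm"])
print(response)
```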

View file

@ -1,45 +0,0 @@
# Token Usage
By default LiteLLM returns token usage in all completion requests ([See here](https://litellm.readthedocs.io/en/latest/output/))
However, we also expose 3 public helper functions to calculate token usage across providers:
- `token_counter`: This returns the number of tokens for a given input - it uses the tokenizer based on the model, and defaults to tiktoken if no model-specific tokenizer is available.
- `cost_per_token`: This returns the cost (in USD) for prompt (input) and completion (output) tokens. It utilizes our model_cost map which can be found in `__init__.py` and also as a [community resource](https://github.com/BerriAI/litellm/blob/main/cookbook/community-resources/max_tokens.json).
- `completion_cost`: This returns the overall cost (in USD) for a given LLM API Call. It combines `token_counter` and `cost_per_token` to return the cost for that query (counting both cost of input and output).
## Example Usage
1. `token_counter`
```python
from litellm import token_counter
messages = [{"user": "role", "content": "Hey, how's it going"}]
print(token_counter(model="gpt-3.5-turbo", messages=messages))
```
2. `cost_per_token`
```python
from litellm import cost_per_token
prompt_tokens = 5
completion_tokens = 10
prompt_tokens_cost_usd_dollar, completion_tokens_cost_usd_dollar = cost_per_token(model="gpt-3.5-turbo", prompt_tokens=prompt_tokens, completion_tokens=completion_tokens)
print(prompt_tokens_cost_usd_dollar, completion_tokens_cost_usd_dollar)
```
3. `completion_cost`
```python
from litellm import completion_cost
prompt = "Hey, how's it going"
completion = "Hi, I'm gpt - I am doing well"
cost_of_query = completion_cost(model="gpt-3.5-turbo", prompt=prompt, completion=completion)
print(cost_of_query)
```

View file

@ -1,9 +0,0 @@
## Stable Version
If you're running into problems with installation or usage, use the stable version of litellm:
```
pip install litellm==0.1.1
```