bump: version 0.8.4 → 0.8.5

Krrish Dholakia 2023-10-14 16:43:06 -07:00
parent 80c60e71c1
commit 7358d2e4ea
11 changed files with 228 additions and 7343 deletions

View file

@@ -6,7 +6,7 @@ import requests
 import time
 import litellm
 from typing import Callable
-from litellm.utils import ModelResponse, Choices, Message
+from litellm.utils import ModelResponse, Choices, Message, CustomStreamWrapper
 from typing import Optional
 from .prompt_templates.factory import prompt_factory, custom_prompt
@@ -65,12 +65,17 @@ class HuggingfaceConfig():
                 and not isinstance(v, (types.FunctionType, types.BuiltinFunctionType, classmethod, staticmethod))
                 and v is not None}
-def validate_environment(api_key):
-    headers = {
+def validate_environment(api_key, headers):
+    default_headers = {
         "content-type": "application/json",
     }
-    if api_key:
-        headers["Authorization"] = f"Bearer {api_key}"
+    if api_key and headers is None:
+        default_headers["Authorization"] = f"Bearer {api_key}" # Huggingface Inference Endpoint default is to accept bearer tokens
+        headers = default_headers
+    elif headers:
+        headers=headers
+    else:
+        headers = default_headers
     return headers
 tgi_models_cache = None
@@ -125,6 +130,7 @@ def completion(
     model: str,
     messages: list,
     api_base: Optional[str],
+    headers: Optional[dict],
     model_response: ModelResponse,
     print_verbose: Callable,
     encoding,
@@ -135,7 +141,8 @@ def completion(
     litellm_params=None,
     logger_fn=None,
 ):
-    headers = validate_environment(api_key)
+    print(f'headers inside hf rest api: {headers}')
+    headers = validate_environment(api_key, headers)
     task = get_hf_task_for_model(model)
     print_verbose(f"{model}, {task}")
     completion_url = ""
@@ -227,7 +234,7 @@ def completion(
         logging_obj.pre_call(
             input=input_text,
             api_key=api_key,
-            additional_args={"complete_input_dict": data, "task": task},
+            additional_args={"complete_input_dict": data, "task": task, "headers": headers},
         )
     ## COMPLETION CALL
     if "stream" in optional_params and optional_params["stream"] == True:
@@ -244,6 +251,29 @@ def completion(
             headers=headers,
             data=json.dumps(data)
         )
+        ## Some servers might return streaming responses even though stream was not set to true. (e.g. Baseten)
+        is_streamed = False
+        print(f"response keys: {response.__dict__.keys()}")
+        print(f"response keys: {response.__dict__['headers']}")
+        if response.__dict__['headers']["Content-Type"] == "text/event-stream":
+            is_streamed = True
+        # iterate over the complete streamed response, and return the final answer
+        if is_streamed:
+            streamed_response = CustomStreamWrapper(completion_stream=response.iter_lines(), model=model, custom_llm_provider="huggingface", logging_obj=logging_obj)
+            content = ""
+            for chunk in streamed_response:
+                content += chunk["choices"][0]["delta"]["content"]
+            completion_response = [{"generated_text": content}]
+            ## LOGGING
+            logging_obj.post_call(
+                input=input_text,
+                api_key=api_key,
+                original_response=completion_response,
+                additional_args={"complete_input_dict": data, "task": task},
+            )
+        else:
             ## LOGGING
             logging_obj.post_call(
                 input=input_text,

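A minimal sketch (not from this diff) of the header precedence that the validate_environment change above introduces, assuming the function lives in litellm.llms.huggingface_restapi and using placeholder tokens:

from litellm.llms.huggingface_restapi import validate_environment  # assumed import path

# No caller-supplied headers: default JSON headers plus a Bearer token built from api_key.
print(validate_environment(api_key="hf_xxx", headers=None))
# {'content-type': 'application/json', 'Authorization': 'Bearer hf_xxx'}

# Caller-supplied headers win and are returned as-is (the default content-type is not merged in).
print(validate_environment(api_key="hf_xxx", headers={"Authorization": "Bearer other-token"}))
# {'Authorization': 'Bearer other-token'}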
View file

@@ -22,7 +22,9 @@ def llama_2_chat_pt(messages):
                 "post_message": "\n" # follows this - https://replicate.com/blog/how-to-prompt-llama
             }
         },
-        messages=messages
+        messages=messages,
+        bos_token="<s>",
+        eos_token="</s>"
     )
     return prompt
@@ -218,14 +220,26 @@ def function_call_prompt(messages: list, functions: list):
 # Custom prompt template
-def custom_prompt(role_dict: dict, messages: list, initial_prompt_value: str="", final_prompt_value: str=""):
-    prompt = initial_prompt_value
+def custom_prompt(role_dict: dict, messages: list, initial_prompt_value: str="", final_prompt_value: str="", bos_token: str="", eos_token: str=""):
+    prompt = bos_token + initial_prompt_value
+    bos_open = True
+    ## a bos token is at the start of a system / human message
+    ## an eos token is at the end of the assistant response to the message
     for message in messages:
         role = message["role"]
+        if role in ["system", "human"] and not bos_open:
+            prompt += bos_token
+            bos_open = True
         pre_message_str = role_dict[role]["pre_message"] if role in role_dict and "pre_message" in role_dict[role] else ""
         post_message_str = role_dict[role]["post_message"] if role in role_dict and "post_message" in role_dict[role] else ""
         prompt += pre_message_str + message["content"] + post_message_str
+        if role == "assistant":
+            prompt += eos_token
+            bos_open = False
     prompt += final_prompt_value
     return prompt

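A hedged sketch (not from this diff) of what the updated custom_prompt produces once bos_token/eos_token are threaded through; the role_dict below is illustrative, not the library's Llama-2 template, and the import path is assumed:

from litellm.llms.prompt_templates.factory import custom_prompt  # assumed import path

role_dict = {
    "user": {"pre_message": "[INST] ", "post_message": " [/INST]"},
    "assistant": {"pre_message": " ", "post_message": ""},
}
messages = [
    {"role": "user", "content": "Hi"},
    {"role": "assistant", "content": "Hello!"},
    {"role": "user", "content": "How are you?"},
]
print(custom_prompt(role_dict, messages, bos_token="<s>", eos_token="</s>"))
# <s>[INST] Hi [/INST] Hello!</s>[INST] How are you? [/INST]
# Note: as written, a new bos_token is only re-opened for "system"/"human" roles,
# so the second "user" turn above does not get a fresh <s>.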
View file

@@ -230,9 +230,10 @@ def completion(
     id = kwargs.get('id', None)
     metadata = kwargs.get('metadata', None)
     fallbacks = kwargs.get('fallbacks', None)
+    headers = kwargs.get("headers", None)
     ######## end of unpacking kwargs ###########
     openai_params = ["functions", "function_call", "temperature", "temperature", "top_p", "n", "stream", "stop", "max_tokens", "presence_penalty", "frequency_penalty", "logit_bias", "user", "request_timeout", "api_base", "api_version", "api_key"]
-    litellm_params = ["metadata", "acompletion", "caching", "return_async", "mock_response", "api_key", "api_version", "api_base", "force_timeout", "logger_fn", "verbose", "custom_llm_provider", "litellm_logging_obj", "litellm_call_id", "use_client", "id", "metadata", "fallbacks", "azure"]
+    litellm_params = ["metadata", "acompletion", "caching", "return_async", "mock_response", "api_key", "api_version", "api_base", "force_timeout", "logger_fn", "verbose", "custom_llm_provider", "litellm_logging_obj", "litellm_call_id", "use_client", "id", "fallbacks", "azure", "headers"]
     default_params = openai_params + litellm_params
     non_default_params = {k: v for k,v in kwargs.items() if k not in default_params} # model-specific params - pass them straight to the model/provider
     if mock_response:
@@ -775,10 +776,16 @@ def completion(
             or os.environ.get("HUGGINGFACE_API_KEY")
             or litellm.api_key
         )
+        hf_headers = (
+            headers
+            or litellm.headers
+        )
+        print(f'headers before hf rest api: {hf_headers}')
         model_response = huggingface_restapi.completion(
             model=model,
             messages=messages,
             api_base=api_base, # type: ignore
+            headers=hf_headers,
             model_response=model_response,
             print_verbose=print_verbose,
             optional_params=optional_params,

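The two hunks above add a headers kwarg to completion() and forward it to the Hugging Face handler. A hedged end-to-end usage sketch (endpoint URL and token are placeholders, not from this diff):

import litellm

response = litellm.completion(
    model="huggingface/codellama/CodeLlama-7b-Instruct-hf",
    messages=[{"role": "user", "content": "Write a hello world in Python"}],
    api_base="https://my-endpoint.endpoints.huggingface.cloud",  # placeholder endpoint
    headers={"Authorization": "Bearer hf_xxx"},  # forwarded to the Hugging Face request
)
print(response["choices"][0]["message"]["content"])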
File diff suppressed because it is too large

View file

@@ -88,13 +88,15 @@ def is_port_in_use(port):
 @click.option('--port', default=8000, help='Port to bind the server to.')
 @click.option('--api_base', default=None, help='API base URL.')
 @click.option('--model', default=None, help='The model name to pass to litellm expects')
+@click.option('--alias', default=None, help='The alias for the model - use this to give a litellm model name (e.g. "huggingface/codellama/CodeLlama-7b-Instruct-hf") a more user-friendly name ("codellama")')
 @click.option('--add_key', default=None, help='The model name to pass to litellm expects')
+@click.option('--headers', default=None, help='headers for the API call')
 @click.option('--deploy', is_flag=True, type=bool, help='Get a deployed proxy endpoint - api.litellm.ai')
+@click.option('--save', is_flag=True, type=bool, help='Save the model-specific config')
 @click.option('--debug', default=False, is_flag=True, type=bool, help='To debug the input')
 @click.option('--temperature', default=None, type=float, help='Set temperature for the model')
 @click.option('--max_tokens', default=None, type=int, help='Set max tokens for the model')
 @click.option('--drop_params', is_flag=True, help='Drop any unmapped params')
-@click.option('--save', is_flag=True, help='Save params to config, to persist across restarts')
 @click.option('--create_proxy', is_flag=True, help='Creates a local OpenAI-compatible server template')
 @click.option('--add_function_to_prompt', is_flag=True, help='If function passed but unsupported, pass it as prompt')
 @click.option('--config', '-c', is_flag=True, help='Configure Litellm')
@@ -105,7 +107,7 @@ def is_port_in_use(port):
 @click.option('--test', flag_value=True, help='proxy chat completions url to make a test request to')
 @click.option('--local', is_flag=True, default=False, help='for local debugging')
 @click.option('--cost', is_flag=True, default=False, help='for viewing cost logs')
-def run_server(host, port, api_base, model, add_key, deploy, debug, temperature, max_tokens, drop_params, create_proxy, add_function_to_prompt, config, file, max_budget, telemetry, logs, test, local, cost, save):
+def run_server(host, port, api_base, model, alias, add_key, headers, deploy, save, debug, temperature, max_tokens, drop_params, create_proxy, add_function_to_prompt, config, file, max_budget, telemetry, logs, test, local, cost):
     global feature_telemetry
     args = locals()
     if local:
@@ -133,6 +135,7 @@ def run_server(host, port, api_base, model, add_key, deploy, debug, temperature,
     if logs is not None:
         if logs == 0: # default to 1
             logs = 1
+        try:
             with open('api_log.json') as f:
                 data = json.load(f)
@@ -146,6 +149,8 @@ def run_server(host, port, api_base, model, add_key, deploy, debug, temperature,
             recent_logs = {k.strftime("%Y%m%d%H%M%S%f"): v for k, v in sorted_times[:logs]}
             print(json.dumps(recent_logs, indent=4))
+        except:
+            print("LiteLLM: No logs saved!")
         return
     if add_key:
         key_name, key_value = add_key.split("=")
@@ -200,7 +205,9 @@ def run_server(host, port, api_base, model, add_key, deploy, debug, temperature,
                 click.echo(f'LiteLLM: streaming response from proxy {chunk}')
         return
     else:
-        initialize(model, api_base, debug, temperature, max_tokens, max_budget, telemetry, drop_params, add_function_to_prompt)
+        if headers:
+            headers = json.loads(headers)
+        initialize(model=model, alias=alias, api_base=api_base, debug=debug, temperature=temperature, max_tokens=max_tokens, max_budget=max_budget, telemetry=telemetry, drop_params=drop_params, add_function_to_prompt=add_function_to_prompt, headers=headers, save=save)
         try:
             import uvicorn
         except:

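Since run_server parses the --headers value with json.loads before calling initialize(), the flag expects a JSON object string. A small illustration (the token is a placeholder):

import json

raw = '{"Authorization": "Bearer hf_xxx"}'  # the string you would pass to --headers
headers = json.loads(raw)                   # -> {'Authorization': 'Bearer hf_xxx'}
# initialize(..., headers=headers, ...) then keeps it as user_headers for every proxied request.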
View file

@@ -11,15 +11,17 @@ try:
     import fastapi
     import tomli as tomllib
     import appdirs
+    import tomli_w
 except ImportError:
     import subprocess
     import sys
-    subprocess.check_call([sys.executable, "-m", "pip", "install", "uvicorn", "fastapi", "tomli", "appdirs"])
+    subprocess.check_call([sys.executable, "-m", "pip", "install", "uvicorn", "fastapi", "tomli", "appdirs", "tomli-w"])
     import uvicorn
     import fastapi
     import tomli as tomllib
     import appdirs
+    import tomli_w
 import random
@@ -88,6 +90,7 @@ user_max_tokens = None
 user_temperature = None
 user_telemetry = True
 user_config = None
+user_headers = None
 config_filename = "litellm.secrets.toml"
 config_dir = os.getcwd()
 config_dir = appdirs.user_config_dir("litellm")
@@ -120,11 +123,40 @@ def add_keys_to_config(key, value):
     config.setdefault('keys', {})[key] = value
     # Write config to file
-    with open(user_config_path, 'w') as f:
-        for section, data in config.items():
-            f.write('[%s]\n' % section)
-            for k, v in data.items():
-                f.write('%s = "%s"\n' % (k, v))
+    with open(user_config_path, 'wb') as f:
+        tomli_w.dump(config, f)
+
+def save_params_to_config(data: dict):
+    # Check if file exists
+    if os.path.exists(user_config_path):
+        # Load existing file
+        with open(user_config_path, "rb") as f:
+            config = tomllib.load(f)
+    else:
+        # File doesn't exist, create empty config
+        config = {}
+    config.setdefault('general', {})
+    ## general config
+    general_settings = data["general"]
+    for key, value in general_settings.items():
+        config["general"][key] = value
+    ## model-specific config
+    config.setdefault("model", {})
+    config["model"].setdefault(user_model, {})
+    user_model_config = data[user_model]
+    model_key = model_key = user_model_config.pop("alias", user_model)
+    config["model"].setdefault(model_key, {})
+    for key, value in user_model_config.items():
+        config["model"][model_key][key] = value
+    # Write config to file
+    with open(user_config_path, 'wb') as f:
+        tomli_w.dump(config, f)
 def load_config():
@@ -138,7 +170,6 @@ def load_config():
         if "keys" in user_config:
             for key in user_config["keys"]:
                 os.environ[key] = user_config["keys"][key] # litellm can read keys from the environment
-
         ## settings
         if "general" in user_config:
             litellm.add_function_to_prompt = user_config["general"].get("add_function_to_prompt", True) # by default add function to prompt if unsupported by provider
@@ -191,24 +222,42 @@ def load_config():
     except Exception as e:
         pass
-def initialize(model, api_base, debug, temperature, max_tokens, max_budget, telemetry, drop_params, add_function_to_prompt):
-    global user_model, user_api_base, user_debug, user_max_tokens, user_temperature, user_telemetry
+def initialize(model, alias, api_base, debug, temperature, max_tokens, max_budget, telemetry, drop_params, add_function_to_prompt, headers, save):
+    global user_model, user_api_base, user_debug, user_max_tokens, user_temperature, user_telemetry, user_headers
     user_model = model
     user_debug = debug
     load_config()
+    dynamic_config = {"general": {}, user_model: {}}
+    if headers: # model-specific param
+        user_headers = headers
+        dynamic_config[user_model]["headers"] = headers
+    if api_base: # model-specific param
         user_api_base = api_base
+        dynamic_config[user_model]["api_base"] = api_base
+    if max_tokens: # model-specific param
         user_max_tokens = max_tokens
+        dynamic_config[user_model]["max_tokens"] = max_tokens
+    if temperature: # model-specific param
         user_temperature = temperature
+        dynamic_config[user_model]["temperature"] = temperature
+    if alias: # model-specific param
+        dynamic_config[user_model]["alias"] = alias
+    if drop_params == True: # litellm-specific param
+        litellm.drop_params = True
+        dynamic_config["general"]["drop_params"] = True
+    if add_function_to_prompt == True: # litellm-specific param
+        litellm.add_function_to_prompt = True
+        dynamic_config["general"]["add_function_to_prompt"] = True
+    if max_budget: # litellm-specific param
+        litellm.max_budget = max_budget
+        dynamic_config["general"]["max_budget"] = max_budget
+    if save:
+        save_params_to_config(dynamic_config)
+        with open(user_config_path) as f:
+            print(f.read())
+        print("\033[1;32mDone successfully\033[0m")
     user_telemetry = telemetry
     usage_telemetry(feature="local_proxy_server")
-    if drop_params == True:
-        litellm.drop_params = True
-    if add_function_to_prompt == True:
-        litellm.add_function_to_prompt = True
-    if max_budget:
-        litellm.max_budget = max_budget
 def deploy_proxy(model, api_base, debug, temperature, max_tokens, telemetry, deploy):
     import requests
@@ -354,9 +403,12 @@ def logger(
             existing_data = {}
         existing_data.update(log_data)
+        def write_to_log():
             with open(log_file, 'w') as f:
                 json.dump(existing_data, f, indent=2)
+        thread = threading.Thread(target=write_to_log, daemon=True)
+        thread.start()
     elif log_event_type == 'post_api_call':
         if "stream" not in kwargs["optional_params"] or kwargs["optional_params"]["stream"] is False or kwargs.get("complete_streaming_response", False):
             inference_params = copy.deepcopy(kwargs)
@@ -368,8 +420,12 @@ def logger(
             existing_data[dt_key]['post_api_call'] = inference_params
+            def write_to_log():
                 with open(log_file, 'w') as f:
                     json.dump(existing_data, f, indent=2)
+            thread = threading.Thread(target=write_to_log, daemon=True)
+            thread.start()
     except:
         traceback.print_exc()
@@ -388,6 +444,8 @@ def litellm_completion(data, type):
         data["max_tokens"] = user_max_tokens
     if user_api_base:
         data["api_base"] = user_api_base
+    if user_headers:
+        data["headers"] = user_headers
     if type == "completion":
         response = litellm.text_completion(**data)
     elif type == "chat_completion":
@@ -397,6 +455,7 @@ def litellm_completion(data, type):
         print_verbose(f"response: {response}")
         return response
     except Exception as e:
+        traceback.print_exc()
         if "Invalid response object from API" in str(e):
             completion_call_details = {}
             if user_model:

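A hedged illustration (values are placeholders, not from this diff) of the dynamic_config dict that the new initialize() assembles and hands to save_params_to_config when --save is set:

dynamic_config = {
    "general": {"drop_params": True, "max_budget": 100},
    "huggingface/codellama/CodeLlama-7b-Instruct-hf": {
        "headers": {"Authorization": "Bearer hf_xxx"},
        "api_base": "https://my-endpoint.endpoints.huggingface.cloud",
        "max_tokens": 256,
        "temperature": 0.2,
        "alias": "codellama",
    },
}
# save_params_to_config pops "alias", writes the remaining model params under
# config["model"]["codellama"] and the general params under config["general"],
# then serializes the whole config to litellm.secrets.toml with tomli_w.dump.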
View file

@@ -3207,6 +3207,7 @@ class CustomStreamWrapper:
         return {"text": text, "is_finished": is_finished, "finish_reason": finish_reason}
     def handle_huggingface_chunk(self, chunk):
+        try:
             chunk = chunk.decode("utf-8")
             text = ""
             is_finished = False
@@ -3229,6 +3230,9 @@ class CustomStreamWrapper:
             elif "error" in chunk:
                 raise ValueError(chunk)
             return {"text": text, "is_finished": is_finished, "finish_reason": finish_reason}
+        except Exception as e:
+            traceback.print_exc()
+            # raise(e)
     def handle_ai21_chunk(self, chunk): # fake streaming
         chunk = chunk.decode("utf-8")

poetry.lock (generated)
View file

@@ -1,4 +1,4 @@
-# This file is automatically @generated by Poetry 1.6.1 and should not be changed by hand.
+# This file is automatically @generated by Poetry 1.5.1 and should not be changed by hand.
 
 [[package]]
 name = "aiohttp"
View file

@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "litellm"
-version = "0.8.4"
+version = "0.8.5"
 description = "Library to easily interface with LLM API providers"
 authors = ["BerriAI"]
 license = "MIT License"
@@ -26,7 +26,7 @@ requires = ["poetry-core"]
 build-backend = "poetry.core.masonry.api"
 [tool.commitizen]
-version = "0.8.4"
+version = "0.8.5"
 version_files = [
     "pyproject.toml:^version"
 ]