bump: version 0.8.4 → 0.8.5

Krrish Dholakia 2023-10-14 16:43:06 -07:00
parent 80c60e71c1
commit 7358d2e4ea
11 changed files with 228 additions and 7343 deletions

litellm/llms/huggingface_restapi.py

@@ -6,7 +6,7 @@ import requests
import time
import litellm
from typing import Callable
from litellm.utils import ModelResponse, Choices, Message
from litellm.utils import ModelResponse, Choices, Message, CustomStreamWrapper
from typing import Optional
from .prompt_templates.factory import prompt_factory, custom_prompt
@@ -65,12 +65,17 @@ class HuggingfaceConfig():
and not isinstance(v, (types.FunctionType, types.BuiltinFunctionType, classmethod, staticmethod))
and v is not None}
def validate_environment(api_key):
headers = {
def validate_environment(api_key, headers):
default_headers = {
"content-type": "application/json",
}
if api_key:
headers["Authorization"] = f"Bearer {api_key}"
if api_key and headers is None:
default_headers["Authorization"] = f"Bearer {api_key}" # Huggingface Inference Endpoint default is to accept bearer tokens
headers = default_headers
elif headers:
headers=headers
else:
headers = default_headers
return headers
tgi_models_cache = None
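
A minimal, self-contained sketch of the header-resolution order the new validate_environment implements; the helper name and token values below are illustrative, not part of the commit:

from typing import Optional

def resolve_headers(api_key: Optional[str], headers: Optional[dict]) -> dict:
    # Mirrors validate_environment above: caller-supplied headers win,
    # otherwise a bearer token is built from api_key, otherwise defaults.
    default_headers = {"content-type": "application/json"}
    if api_key and headers is None:
        default_headers["Authorization"] = f"Bearer {api_key}"
        return default_headers
    return headers if headers else default_headers

# caller-supplied headers take precedence over the api_key
assert resolve_headers("hf_xxx", {"Authorization": "Bearer custom"}) == {"Authorization": "Bearer custom"}
# with only an api_key, a bearer token is added to the defaults
assert resolve_headers("hf_xxx", None)["Authorization"] == "Bearer hf_xxx"
# with neither, only the default content-type header is sent
assert resolve_headers(None, None) == {"content-type": "application/json"}
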
@@ -125,6 +130,7 @@ def completion(
model: str,
messages: list,
api_base: Optional[str],
headers: Optional[dict],
model_response: ModelResponse,
print_verbose: Callable,
encoding,
@@ -135,7 +141,8 @@
litellm_params=None,
logger_fn=None,
):
headers = validate_environment(api_key)
print(f'headers inside hf rest api: {headers}')
headers = validate_environment(api_key, headers)
task = get_hf_task_for_model(model)
print_verbose(f"{model}, {task}")
completion_url = ""
@@ -227,7 +234,7 @@ def completion(
logging_obj.pre_call(
input=input_text,
api_key=api_key,
additional_args={"complete_input_dict": data, "task": task},
additional_args={"complete_input_dict": data, "task": task, "headers": headers},
)
## COMPLETION CALL
if "stream" in optional_params and optional_params["stream"] == True:
@@ -244,20 +251,43 @@
headers=headers,
data=json.dumps(data)
)
## LOGGING
logging_obj.post_call(
input=input_text,
api_key=api_key,
original_response=response.text,
additional_args={"complete_input_dict": data, "task": task},
)
## RESPONSE OBJECT
try:
completion_response = response.json()
except:
raise HuggingfaceError(
message=response.text, status_code=response.status_code
## Some servers might return streaming responses even though stream was not set to true. (e.g. Baseten)
is_streamed = False
print(f"response keys: {response.__dict__.keys()}")
print(f"response keys: {response.__dict__['headers']}")
if response.__dict__['headers']["Content-Type"] == "text/event-stream":
is_streamed = True
# iterate over the complete streamed response, and return the final answer
if is_streamed:
streamed_response = CustomStreamWrapper(completion_stream=response.iter_lines(), model=model, custom_llm_provider="huggingface", logging_obj=logging_obj)
content = ""
for chunk in streamed_response:
content += chunk["choices"][0]["delta"]["content"]
completion_response = [{"generated_text": content}]
## LOGGING
logging_obj.post_call(
input=input_text,
api_key=api_key,
original_response=completion_response,
additional_args={"complete_input_dict": data, "task": task},
)
else:
## LOGGING
logging_obj.post_call(
input=input_text,
api_key=api_key,
original_response=response.text,
additional_args={"complete_input_dict": data, "task": task},
)
## RESPONSE OBJECT
try:
completion_response = response.json()
except:
raise HuggingfaceError(
message=response.text, status_code=response.status_code
)
print_verbose(f"response: {completion_response}")
if isinstance(completion_response, dict) and "error" in completion_response:
print_verbose(f"completion error: {completion_response['error']}")

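A rough sketch of the streamed-response fallback above: when the endpoint replies with Content-Type text/event-stream even though stream was not requested, the per-token texts are concatenated and wrapped so the rest of the function can treat the result like a normal [{"generated_text": ...}] payload. The SSE lines below are illustrative and the plain loop stands in for CustomStreamWrapper:

import json

raw_lines = [  # made-up TGI-style SSE lines, not from the commit
    b'data: {"token": {"text": "Hello"}}',
    b'data: {"token": {"text": " world"}, "generated_text": "Hello world", "details": {"finish_reason": "stop"}}',
]

content = ""
for line in raw_lines:
    decoded = line.decode("utf-8")
    if decoded.startswith("data:"):
        data_json = json.loads(decoded[5:])
        if "token" in data_json and "text" in data_json["token"]:
            content += data_json["token"]["text"]  # accumulate delta text

completion_response = [{"generated_text": content}]
assert completion_response == [{"generated_text": "Hello world"}]
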
litellm/llms/prompt_templates/factory.py

@@ -22,7 +22,9 @@ def llama_2_chat_pt(messages):
"post_message": "\n" # follows this - https://replicate.com/blog/how-to-prompt-llama
}
},
messages=messages
messages=messages,
bos_token="<s>",
eos_token="</s>"
)
return prompt
@@ -218,14 +220,26 @@ def function_call_prompt(messages: list, functions: list):
# Custom prompt template
def custom_prompt(role_dict: dict, messages: list, initial_prompt_value: str="", final_prompt_value: str=""):
prompt = initial_prompt_value
def custom_prompt(role_dict: dict, messages: list, initial_prompt_value: str="", final_prompt_value: str="", bos_token: str="", eos_token: str=""):
prompt = bos_token + initial_prompt_value
bos_open = True
## a bos token is at the start of a system / human message
## an eos token is at the end of the assistant response to the message
for message in messages:
role = message["role"]
if role in ["system", "human"] and not bos_open:
prompt += bos_token
bos_open = True
pre_message_str = role_dict[role]["pre_message"] if role in role_dict and "pre_message" in role_dict[role] else ""
post_message_str = role_dict[role]["post_message"] if role in role_dict and "post_message" in role_dict[role] else ""
prompt += pre_message_str + message["content"] + post_message_str
if role == "assistant":
prompt += eos_token
bos_open = False
prompt += final_prompt_value
return prompt

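A usage sketch for the extended custom_prompt signature; the module path is assumed from the relative import in huggingface_restapi.py, and the role_dict and messages are illustrative:

from litellm.llms.prompt_templates.factory import custom_prompt  # assumed path

role_dict = {
    "system": {"pre_message": "[INST] <<SYS>>\n", "post_message": "\n<</SYS>>\n"},
    "user": {"pre_message": "", "post_message": " [/INST]\n"},
    "assistant": {"pre_message": "", "post_message": ""},
}
messages = [
    {"role": "system", "content": "You are a helpful assistant"},
    {"role": "user", "content": "Hi"},
    {"role": "assistant", "content": "Hello!"},
]

prompt = custom_prompt(role_dict, messages, bos_token="<s>", eos_token="</s>")
# The bos token opens the prompt and the eos token closes the assistant turn:
# "<s>[INST] <<SYS>>\nYou are a helpful assistant\n<</SYS>>\nHi [/INST]\nHello!</s>"
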
litellm/main.py

@@ -230,9 +230,10 @@ def completion(
id = kwargs.get('id', None)
metadata = kwargs.get('metadata', None)
fallbacks = kwargs.get('fallbacks', None)
headers = kwargs.get("headers", None)
######## end of unpacking kwargs ###########
openai_params = ["functions", "function_call", "temperature", "temperature", "top_p", "n", "stream", "stop", "max_tokens", "presence_penalty", "frequency_penalty", "logit_bias", "user", "request_timeout", "api_base", "api_version", "api_key"]
litellm_params = ["metadata", "acompletion", "caching", "return_async", "mock_response", "api_key", "api_version", "api_base", "force_timeout", "logger_fn", "verbose", "custom_llm_provider", "litellm_logging_obj", "litellm_call_id", "use_client", "id", "metadata", "fallbacks", "azure"]
litellm_params = ["metadata", "acompletion", "caching", "return_async", "mock_response", "api_key", "api_version", "api_base", "force_timeout", "logger_fn", "verbose", "custom_llm_provider", "litellm_logging_obj", "litellm_call_id", "use_client", "id", "fallbacks", "azure", "headers"]
default_params = openai_params + litellm_params
non_default_params = {k: v for k,v in kwargs.items() if k not in default_params} # model-specific params - pass them straight to the model/provider
if mock_response:
@@ -775,10 +776,16 @@ def completion(
or os.environ.get("HUGGINGFACE_API_KEY")
or litellm.api_key
)
hf_headers = (
headers
or litellm.headers
)
print(f'headers before hf rest api: {hf_headers}')
model_response = huggingface_restapi.completion(
model=model,
messages=messages,
api_base=api_base, # type: ignore
headers=hf_headers,
model_response=model_response,
print_verbose=print_verbose,
optional_params=optional_params,

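A caller-side sketch of the new headers kwarg flowing through litellm.completion to the Hugging Face handler; the endpoint URL and token are placeholders:

import litellm

response = litellm.completion(
    model="huggingface/meta-llama/Llama-2-7b-chat-hf",
    messages=[{"role": "user", "content": "Hello"}],
    api_base="https://my-endpoint.endpoints.huggingface.cloud",  # placeholder endpoint
    headers={"Authorization": "Bearer hf_xxx"},  # forwarded as-is to the HF request
)
print(response["choices"][0]["message"]["content"])
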
File diff suppressed because it is too large.

litellm/proxy/proxy_cli.py

@@ -88,13 +88,15 @@ def is_port_in_use(port):
@click.option('--port', default=8000, help='Port to bind the server to.')
@click.option('--api_base', default=None, help='API base URL.')
@click.option('--model', default=None, help='The model name to pass to litellm expects')
@click.option('--alias', default=None, help='The alias for the model - use this to give a litellm model name (e.g. "huggingface/codellama/CodeLlama-7b-Instruct-hf") a more user-friendly name ("codellama")')
@click.option('--add_key', default=None, help='The model name to pass to litellm expects')
@click.option('--headers', default=None, help='headers for the API call')
@click.option('--deploy', is_flag=True, type=bool, help='Get a deployed proxy endpoint - api.litellm.ai')
@click.option('--save', is_flag=True, type=bool, help='Save the model-specific config')
@click.option('--debug', default=False, is_flag=True, type=bool, help='To debug the input')
@click.option('--temperature', default=None, type=float, help='Set temperature for the model')
@click.option('--max_tokens', default=None, type=int, help='Set max tokens for the model')
@click.option('--drop_params', is_flag=True, help='Drop any unmapped params')
@click.option('--save', is_flag=True, help='Save params to config, to persist across restarts')
@click.option('--create_proxy', is_flag=True, help='Creates a local OpenAI-compatible server template')
@click.option('--add_function_to_prompt', is_flag=True, help='If function passed but unsupported, pass it as prompt')
@click.option('--config', '-c', is_flag=True, help='Configure Litellm')
@@ -105,7 +107,7 @@ def is_port_in_use(port):
@click.option('--test', flag_value=True, help='proxy chat completions url to make a test request to')
@click.option('--local', is_flag=True, default=False, help='for local debugging')
@click.option('--cost', is_flag=True, default=False, help='for viewing cost logs')
def run_server(host, port, api_base, model, add_key, deploy, debug, temperature, max_tokens, drop_params, create_proxy, add_function_to_prompt, config, file, max_budget, telemetry, logs, test, local, cost, save):
def run_server(host, port, api_base, model, alias, add_key, headers, deploy, save, debug, temperature, max_tokens, drop_params, create_proxy, add_function_to_prompt, config, file, max_budget, telemetry, logs, test, local, cost):
global feature_telemetry
args = locals()
if local:
@@ -133,19 +135,22 @@ def run_server(host, port, api_base, model, add_key, deploy, debug, temperature,
if logs is not None:
if logs == 0: # default to 1
logs = 1
with open('api_log.json') as f:
data = json.load(f)
try:
with open('api_log.json') as f:
data = json.load(f)
# convert keys to datetime objects
log_times = {datetime.strptime(k, "%Y%m%d%H%M%S%f"): v for k, v in data.items()}
# convert keys to datetime objects
log_times = {datetime.strptime(k, "%Y%m%d%H%M%S%f"): v for k, v in data.items()}
# sort by timestamp
sorted_times = sorted(log_times.items(), key=operator.itemgetter(0), reverse=True)
# sort by timestamp
sorted_times = sorted(log_times.items(), key=operator.itemgetter(0), reverse=True)
# get n recent logs
recent_logs = {k.strftime("%Y%m%d%H%M%S%f"): v for k, v in sorted_times[:logs]}
# get n recent logs
recent_logs = {k.strftime("%Y%m%d%H%M%S%f"): v for k, v in sorted_times[:logs]}
print(json.dumps(recent_logs, indent=4))
print(json.dumps(recent_logs, indent=4))
except:
print("LiteLLM: No logs saved!")
return
if add_key:
key_name, key_value = add_key.split("=")
@@ -200,7 +205,9 @@ def run_server(host, port, api_base, model, add_key, deploy, debug, temperature,
click.echo(f'LiteLLM: streaming response from proxy {chunk}')
return
else:
initialize(model, api_base, debug, temperature, max_tokens, max_budget, telemetry, drop_params, add_function_to_prompt)
if headers:
headers = json.loads(headers)
initialize(model=model, alias=alias, api_base=api_base, debug=debug, temperature=temperature, max_tokens=max_tokens, max_budget=max_budget, telemetry=telemetry, drop_params=drop_params, add_function_to_prompt=add_function_to_prompt, headers=headers, save=save)
try:
import uvicorn
except:

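The --headers flag expects a JSON object; a small sketch of the parsing the CLI does before handing the dict to initialize (token value is a placeholder):

import json

# e.g. litellm --model huggingface/my-model --headers '{"Authorization": "Bearer hf_xxx"}' --save
raw = '{"Authorization": "Bearer hf_xxx"}'
headers = json.loads(raw)  # passed on as initialize(..., headers=headers, save=True)
assert headers == {"Authorization": "Bearer hf_xxx"}
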
litellm/proxy/proxy_server.py

@@ -11,15 +11,17 @@ try:
import fastapi
import tomli as tomllib
import appdirs
import tomli_w
except ImportError:
import subprocess
import sys
subprocess.check_call([sys.executable, "-m", "pip", "install", "uvicorn", "fastapi", "tomli", "appdirs"])
subprocess.check_call([sys.executable, "-m", "pip", "install", "uvicorn", "fastapi", "tomli", "appdirs", "tomli-w"])
import uvicorn
import fastapi
import tomli as tomllib
import appdirs
import tomli_w
import random
@@ -88,6 +90,7 @@ user_max_tokens = None
user_temperature = None
user_telemetry = True
user_config = None
user_headers = None
config_filename = "litellm.secrets.toml"
config_dir = os.getcwd()
config_dir = appdirs.user_config_dir("litellm")
@@ -120,12 +123,41 @@ def add_keys_to_config(key, value):
config.setdefault('keys', {})[key] = value
# Write config to file
with open(user_config_path, 'w') as f:
for section, data in config.items():
f.write('[%s]\n' % section)
for k, v in data.items():
f.write('%s = "%s"\n' % (k, v))
with open(user_config_path, 'wb') as f:
tomli_w.dump(config, f)
def save_params_to_config(data: dict):
# Check if file exists
if os.path.exists(user_config_path):
# Load existing file
with open(user_config_path, "rb") as f:
config = tomllib.load(f)
else:
# File doesn't exist, create empty config
config = {}
config.setdefault('general', {})
## general config
general_settings = data["general"]
for key, value in general_settings.items():
config["general"][key] = value
## model-specific config
config.setdefault("model", {})
config["model"].setdefault(user_model, {})
user_model_config = data[user_model]
model_key = model_key = user_model_config.pop("alias", user_model)
config["model"].setdefault(model_key, {})
for key, value in user_model_config.items():
config["model"][model_key][key] = value
# Write config to file
with open(user_config_path, 'wb') as f:
tomli_w.dump(config, f)
def load_config():
try:
@@ -138,7 +170,6 @@ def load_config():
if "keys" in user_config:
for key in user_config["keys"]:
os.environ[key] = user_config["keys"][key] # litellm can read keys from the environment
## settings
if "general" in user_config:
litellm.add_function_to_prompt = user_config["general"].get("add_function_to_prompt", True) # by default add function to prompt if unsupported by provider
@@ -191,24 +222,42 @@ def load_config():
except Exception as e:
pass
def initialize(model, api_base, debug, temperature, max_tokens, max_budget, telemetry, drop_params, add_function_to_prompt):
global user_model, user_api_base, user_debug, user_max_tokens, user_temperature, user_telemetry
def initialize(model, alias, api_base, debug, temperature, max_tokens, max_budget, telemetry, drop_params, add_function_to_prompt, headers, save):
global user_model, user_api_base, user_debug, user_max_tokens, user_temperature, user_telemetry, user_headers
user_model = model
user_debug = debug
load_config()
user_api_base = api_base
user_max_tokens = max_tokens
user_temperature = temperature
dynamic_config = {"general": {}, user_model: {}}
if headers: # model-specific param
user_headers = headers
dynamic_config[user_model]["headers"] = headers
if api_base: # model-specific param
user_api_base = api_base
dynamic_config[user_model]["api_base"] = api_base
if max_tokens: # model-specific param
user_max_tokens = max_tokens
dynamic_config[user_model]["max_tokens"] = max_tokens
if temperature: # model-specific param
user_temperature = temperature
dynamic_config[user_model]["temperature"] = temperature
if alias: # model-specific param
dynamic_config[user_model]["alias"] = alias
if drop_params == True: # litellm-specific param
litellm.drop_params = True
dynamic_config["general"]["drop_params"] = True
if add_function_to_prompt == True: # litellm-specific param
litellm.add_function_to_prompt = True
dynamic_config["general"]["add_function_to_prompt"] = True
if max_budget: # litellm-specific param
litellm.max_budget = max_budget
dynamic_config["general"]["max_budget"] = max_budget
if save:
save_params_to_config(dynamic_config)
with open(user_config_path) as f:
print(f.read())
print("\033[1;32mDone successfully\033[0m")
user_telemetry = telemetry
usage_telemetry(feature="local_proxy_server")
if drop_params == True:
litellm.drop_params = True
if add_function_to_prompt == True:
litellm.add_function_to_prompt = True
if max_budget:
litellm.max_budget = max_budget
def deploy_proxy(model, api_base, debug, temperature, max_tokens, telemetry, deploy):
import requests
@@ -354,9 +403,12 @@ def logger(
existing_data = {}
existing_data.update(log_data)
with open(log_file, 'w') as f:
json.dump(existing_data, f, indent=2)
def write_to_log():
with open(log_file, 'w') as f:
json.dump(existing_data, f, indent=2)
thread = threading.Thread(target=write_to_log, daemon=True)
thread.start()
elif log_event_type == 'post_api_call':
if "stream" not in kwargs["optional_params"] or kwargs["optional_params"]["stream"] is False or kwargs.get("complete_streaming_response", False):
inference_params = copy.deepcopy(kwargs)
@@ -367,9 +419,13 @@ def logger(
existing_data = json.load(f)
existing_data[dt_key]['post_api_call'] = inference_params
with open(log_file, 'w') as f:
json.dump(existing_data, f, indent=2)
def write_to_log():
with open(log_file, 'w') as f:
json.dump(existing_data, f, indent=2)
thread = threading.Thread(target=write_to_log, daemon=True)
thread.start()
except:
traceback.print_exc()
@@ -388,6 +444,8 @@ def litellm_completion(data, type):
data["max_tokens"] = user_max_tokens
if user_api_base:
data["api_base"] = user_api_base
if user_headers:
data["headers"] = user_headers
if type == "completion":
response = litellm.text_completion(**data)
elif type == "chat_completion":
@@ -397,6 +455,7 @@ def litellm_completion(data, type):
print_verbose(f"response: {response}")
return response
except Exception as e:
traceback.print_exc()
if "Invalid response object from API" in str(e):
completion_call_details = {}
if user_model:

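A sketch of the dynamic_config that initialize builds and that --save persists through save_params_to_config and tomli_w; the model name, alias, endpoint, and token are illustrative:

import tomli_w

user_model = "huggingface/codellama/CodeLlama-7b-Instruct-hf"  # illustrative
dynamic_config = {
    "general": {"drop_params": True},
    user_model: {
        "alias": "codellama",
        "api_base": "https://my-endpoint.endpoints.huggingface.cloud",
        "max_tokens": 256,
        "headers": {"Authorization": "Bearer hf_xxx"},
    },
}

# save_params_to_config pops the alias and keys the model table by it
model_config = dict(dynamic_config[user_model])
model_key = model_config.pop("alias", user_model)
config = {"general": dynamic_config["general"], "model": {model_key: model_config}}
print(tomli_w.dumps(config))
# Roughly:
# [general]
# drop_params = true
#
# [model.codellama]
# api_base = "https://my-endpoint.endpoints.huggingface.cloud"
# max_tokens = 256
#
# [model.codellama.headers]
# Authorization = "Bearer hf_xxx"
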
litellm/utils.py

@@ -3207,28 +3207,32 @@ class CustomStreamWrapper:
return {"text": text, "is_finished": is_finished, "finish_reason": finish_reason}
def handle_huggingface_chunk(self, chunk):
chunk = chunk.decode("utf-8")
text = ""
is_finished = False
finish_reason = ""
print_verbose(f"chunk: {chunk}")
if chunk.startswith("data:"):
data_json = json.loads(chunk[5:])
print_verbose(f"data json: {data_json}")
if "token" in data_json and "text" in data_json["token"]:
text = data_json["token"]["text"]
if data_json.get("details", False) and data_json["details"].get("finish_reason", False):
is_finished = True
finish_reason = data_json["details"]["finish_reason"]
elif data_json.get("generated_text", False): # if full generated text exists, then stream is complete
text = "" # don't return the final bos token
is_finished = True
finish_reason = "stop"
try:
chunk = chunk.decode("utf-8")
text = ""
is_finished = False
finish_reason = ""
print_verbose(f"chunk: {chunk}")
if chunk.startswith("data:"):
data_json = json.loads(chunk[5:])
print_verbose(f"data json: {data_json}")
if "token" in data_json and "text" in data_json["token"]:
text = data_json["token"]["text"]
if data_json.get("details", False) and data_json["details"].get("finish_reason", False):
is_finished = True
finish_reason = data_json["details"]["finish_reason"]
elif data_json.get("generated_text", False): # if full generated text exists, then stream is complete
text = "" # don't return the final bos token
is_finished = True
finish_reason = "stop"
return {"text": text, "is_finished": is_finished, "finish_reason": finish_reason}
elif "error" in chunk:
raise ValueError(chunk)
return {"text": text, "is_finished": is_finished, "finish_reason": finish_reason}
elif "error" in chunk:
raise ValueError(chunk)
return {"text": text, "is_finished": is_finished, "finish_reason": finish_reason}
except Exception as e:
traceback.print_exc()
# raise(e)
def handle_ai21_chunk(self, chunk): # fake streaming
chunk = chunk.decode("utf-8")

poetry.lock (generated)

@@ -1,4 +1,4 @@
# This file is automatically @generated by Poetry 1.6.1 and should not be changed by hand.
# This file is automatically @generated by Poetry 1.5.1 and should not be changed by hand.
[[package]]
name = "aiohttp"

pyproject.toml

@@ -1,6 +1,6 @@
[tool.poetry]
name = "litellm"
version = "0.8.4"
version = "0.8.5"
description = "Library to easily interface with LLM API providers"
authors = ["BerriAI"]
license = "MIT License"
@@ -26,7 +26,7 @@ requires = ["poetry-core"]
build-backend = "poetry.core.masonry.api"
[tool.commitizen]
version = "0.8.4"
version = "0.8.5"
version_files = [
"pyproject.toml:^version"
]