bump: version 0.8.4 → 0.8.5
Parent: 80c60e71c1
Commit: 7358d2e4ea
11 changed files with 228 additions and 7343 deletions (2 binary files not shown)
@@ -6,7 +6,7 @@ import requests
 import time
 import litellm
 from typing import Callable
-from litellm.utils import ModelResponse, Choices, Message
+from litellm.utils import ModelResponse, Choices, Message, CustomStreamWrapper
 from typing import Optional
 from .prompt_templates.factory import prompt_factory, custom_prompt
@@ -65,12 +65,17 @@ class HuggingfaceConfig():
                 and not isinstance(v, (types.FunctionType, types.BuiltinFunctionType, classmethod, staticmethod))
                 and v is not None}
 
-def validate_environment(api_key):
-    headers = {
+def validate_environment(api_key, headers):
+    default_headers = {
         "content-type": "application/json",
     }
-    if api_key:
-        headers["Authorization"] = f"Bearer {api_key}"
+    if api_key and headers is None:
+        default_headers["Authorization"] = f"Bearer {api_key}" # Huggingface Inference Endpoint default is to accept bearer tokens
+        headers = default_headers
+    elif headers:
+        headers=headers
+    else:
+        headers = default_headers
     return headers
 
 tgi_models_cache = None
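For context, a minimal sketch (not part of the diff; the token value is a placeholder) of how the reworked helper resolves headers: caller-supplied headers are passed through untouched, otherwise a bearer token is built from the API key.

# Illustrative calls against the validate_environment shown above.
print(validate_environment(api_key="hf_xxx", headers=None))
# -> {"content-type": "application/json", "Authorization": "Bearer hf_xxx"}

print(validate_environment(api_key="hf_xxx", headers={"X-Custom": "1"}))
# -> {"X-Custom": "1"}  (explicit headers win; no Authorization header is added)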
@@ -125,6 +130,7 @@ def completion(
     model: str,
     messages: list,
     api_base: Optional[str],
+    headers: Optional[dict],
     model_response: ModelResponse,
     print_verbose: Callable,
     encoding,
@@ -135,7 +141,8 @@ def completion(
     litellm_params=None,
     logger_fn=None,
 ):
-    headers = validate_environment(api_key)
+    print(f'headers inside hf rest api: {headers}')
+    headers = validate_environment(api_key, headers)
     task = get_hf_task_for_model(model)
     print_verbose(f"{model}, {task}")
     completion_url = ""
@@ -227,7 +234,7 @@ def completion(
         logging_obj.pre_call(
             input=input_text,
             api_key=api_key,
-            additional_args={"complete_input_dict": data, "task": task},
+            additional_args={"complete_input_dict": data, "task": task, "headers": headers},
         )
         ## COMPLETION CALL
         if "stream" in optional_params and optional_params["stream"] == True:
@@ -244,20 +251,43 @@ def completion(
                 headers=headers,
                 data=json.dumps(data)
             )
-            ## LOGGING
-            logging_obj.post_call(
-                input=input_text,
-                api_key=api_key,
-                original_response=response.text,
-                additional_args={"complete_input_dict": data, "task": task},
-            )
-        ## RESPONSE OBJECT
-        try:
-            completion_response = response.json()
-        except:
-            raise HuggingfaceError(
-                message=response.text, status_code=response.status_code
-            )
+
+            ## Some servers might return streaming responses even though stream was not set to true. (e.g. Baseten)
+            is_streamed = False
+            print(f"response keys: {response.__dict__.keys()}")
+            print(f"response keys: {response.__dict__['headers']}")
+            if response.__dict__['headers']["Content-Type"] == "text/event-stream":
+                is_streamed = True
+
+            # iterate over the complete streamed response, and return the final answer
+            if is_streamed:
+                streamed_response = CustomStreamWrapper(completion_stream=response.iter_lines(), model=model, custom_llm_provider="huggingface", logging_obj=logging_obj)
+                content = ""
+                for chunk in streamed_response:
+                    content += chunk["choices"][0]["delta"]["content"]
+                completion_response = [{"generated_text": content}]
+                ## LOGGING
+                logging_obj.post_call(
+                    input=input_text,
+                    api_key=api_key,
+                    original_response=completion_response,
+                    additional_args={"complete_input_dict": data, "task": task},
+                )
+            else:
+                ## LOGGING
+                logging_obj.post_call(
+                    input=input_text,
+                    api_key=api_key,
+                    original_response=response.text,
+                    additional_args={"complete_input_dict": data, "task": task},
+                )
+                ## RESPONSE OBJECT
+                try:
+                    completion_response = response.json()
+                except:
+                    raise HuggingfaceError(
+                        message=response.text, status_code=response.status_code
+                    )
         print_verbose(f"response: {completion_response}")
         if isinstance(completion_response, dict) and "error" in completion_response:
             print_verbose(f"completion error: {completion_response['error']}")
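As a standalone illustration of the fallback added above (the endpoint URL and payload are placeholders, not from the diff): detect an event-stream response and stitch the SSE chunks back into one completion, without the litellm wrappers.

import json
import requests

def read_maybe_streamed(url, payload, headers):
    # Some servers stream even when "stream" was not requested (e.g. Baseten).
    response = requests.post(url, headers=headers, data=json.dumps(payload))
    if response.headers.get("Content-Type", "") == "text/event-stream":
        content = ""
        for line in response.iter_lines():
            line = line.decode("utf-8")
            if line.startswith("data:"):
                data_json = json.loads(line[5:])
                content += data_json.get("token", {}).get("text", "")
        return content
    # non-streamed text-generation responses come back as [{"generated_text": ...}]
    return response.json()[0]["generated_text"]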
@@ -22,7 +22,9 @@ def llama_2_chat_pt(messages):
                 "post_message": "\n" # follows this - https://replicate.com/blog/how-to-prompt-llama
             }
         },
-        messages=messages
+        messages=messages,
+        bos_token="<s>",
+        eos_token="</s>"
     )
     return prompt
@@ -218,14 +220,26 @@ def function_call_prompt(messages: list, functions: list):
 
 
 # Custom prompt template
-def custom_prompt(role_dict: dict, messages: list, initial_prompt_value: str="", final_prompt_value: str=""):
-    prompt = initial_prompt_value
+def custom_prompt(role_dict: dict, messages: list, initial_prompt_value: str="", final_prompt_value: str="", bos_token: str="", eos_token: str=""):
+    prompt = bos_token + initial_prompt_value
+    bos_open = True
+    ## a bos token is at the start of a system / human message
+    ## an eos token is at the end of the assistant response to the message
     for message in messages:
         role = message["role"]
+
+        if role in ["system", "human"] and not bos_open:
+            prompt += bos_token
+            bos_open = True
+
         pre_message_str = role_dict[role]["pre_message"] if role in role_dict and "pre_message" in role_dict[role] else ""
         post_message_str = role_dict[role]["post_message"] if role in role_dict and "post_message" in role_dict[role] else ""
         prompt += pre_message_str + message["content"] + post_message_str
+
+        if role == "assistant":
+            prompt += eos_token
+            bos_open = False
 
     prompt += final_prompt_value
     return prompt
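A hedged usage sketch of the extended custom_prompt (the role_dict and messages below are made up for illustration). Note the reopen check looks for a "human" role, so OpenAI-style "user" turns do not get a fresh bos token after an assistant eos.

role_dict = {
    "system": {"pre_message": "[INST] <<SYS>>\n", "post_message": "\n<</SYS>>\n [/INST]\n"},
    "user": {"pre_message": "[INST] ", "post_message": " [/INST]\n"},
    "assistant": {"post_message": "\n"},
}
messages = [
    {"role": "user", "content": "Hi"},
    {"role": "assistant", "content": "Hello!"},
    {"role": "user", "content": "How are you?"},
]
prompt = custom_prompt(role_dict, messages, bos_token="<s>", eos_token="</s>")
# -> "<s>[INST] Hi [/INST]\nHello!\n</s>[INST] How are you? [/INST]\n"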
@@ -230,9 +230,10 @@ def completion(
     id = kwargs.get('id', None)
     metadata = kwargs.get('metadata', None)
     fallbacks = kwargs.get('fallbacks', None)
+    headers = kwargs.get("headers", None)
     ######## end of unpacking kwargs ###########
     openai_params = ["functions", "function_call", "temperature", "temperature", "top_p", "n", "stream", "stop", "max_tokens", "presence_penalty", "frequency_penalty", "logit_bias", "user", "request_timeout", "api_base", "api_version", "api_key"]
-    litellm_params = ["metadata", "acompletion", "caching", "return_async", "mock_response", "api_key", "api_version", "api_base", "force_timeout", "logger_fn", "verbose", "custom_llm_provider", "litellm_logging_obj", "litellm_call_id", "use_client", "id", "metadata", "fallbacks", "azure"]
+    litellm_params = ["metadata", "acompletion", "caching", "return_async", "mock_response", "api_key", "api_version", "api_base", "force_timeout", "logger_fn", "verbose", "custom_llm_provider", "litellm_logging_obj", "litellm_call_id", "use_client", "id", "fallbacks", "azure", "headers"]
     default_params = openai_params + litellm_params
     non_default_params = {k: v for k,v in kwargs.items() if k not in default_params} # model-specific params - pass them straight to the model/provider
     if mock_response:
@@ -775,10 +776,16 @@ def completion(
             or os.environ.get("HUGGINGFACE_API_KEY")
             or litellm.api_key
         )
+        hf_headers = (
+            headers
+            or litellm.headers
+        )
+        print(f'headers before hf rest api: {hf_headers}')
         model_response = huggingface_restapi.completion(
             model=model,
             messages=messages,
             api_base=api_base, # type: ignore
+            headers=hf_headers,
             model_response=model_response,
             print_verbose=print_verbose,
             optional_params=optional_params,
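A hedged end-to-end sketch of the new kwarg from the caller's side (endpoint URL and token are placeholders; the model name is borrowed from the CLI help text later in this diff):

import litellm

response = litellm.completion(
    model="huggingface/codellama/CodeLlama-7b-Instruct-hf",
    messages=[{"role": "user", "content": "def fib(n):"}],
    api_base="https://my-endpoint.endpoints.huggingface.cloud",  # placeholder Inference Endpoint
    headers={"Authorization": "Bearer hf_xxx"},  # forwarded to the Huggingface request as hf_headers
)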
File diff suppressed because it is too large
@@ -88,13 +88,15 @@ def is_port_in_use(port):
 @click.option('--port', default=8000, help='Port to bind the server to.')
 @click.option('--api_base', default=None, help='API base URL.')
 @click.option('--model', default=None, help='The model name to pass to litellm expects')
+@click.option('--alias', default=None, help='The alias for the model - use this to give a litellm model name (e.g. "huggingface/codellama/CodeLlama-7b-Instruct-hf") a more user-friendly name ("codellama")')
 @click.option('--add_key', default=None, help='The model name to pass to litellm expects')
+@click.option('--headers', default=None, help='headers for the API call')
 @click.option('--deploy', is_flag=True, type=bool, help='Get a deployed proxy endpoint - api.litellm.ai')
+@click.option('--save', is_flag=True, type=bool, help='Save the model-specific config')
 @click.option('--debug', default=False, is_flag=True, type=bool, help='To debug the input')
 @click.option('--temperature', default=None, type=float, help='Set temperature for the model')
 @click.option('--max_tokens', default=None, type=int, help='Set max tokens for the model')
 @click.option('--drop_params', is_flag=True, help='Drop any unmapped params')
-@click.option('--save', is_flag=True, help='Save params to config, to persist across restarts')
 @click.option('--create_proxy', is_flag=True, help='Creates a local OpenAI-compatible server template')
 @click.option('--add_function_to_prompt', is_flag=True, help='If function passed but unsupported, pass it as prompt')
 @click.option('--config', '-c', is_flag=True, help='Configure Litellm')
@@ -105,7 +107,7 @@ def is_port_in_use(port):
 @click.option('--test', flag_value=True, help='proxy chat completions url to make a test request to')
 @click.option('--local', is_flag=True, default=False, help='for local debugging')
 @click.option('--cost', is_flag=True, default=False, help='for viewing cost logs')
-def run_server(host, port, api_base, model, add_key, deploy, debug, temperature, max_tokens, drop_params, create_proxy, add_function_to_prompt, config, file, max_budget, telemetry, logs, test, local, cost, save):
+def run_server(host, port, api_base, model, alias, add_key, headers, deploy, save, debug, temperature, max_tokens, drop_params, create_proxy, add_function_to_prompt, config, file, max_budget, telemetry, logs, test, local, cost):
     global feature_telemetry
     args = locals()
     if local:
@@ -133,19 +135,22 @@ def run_server(host, port, api_base, model, add_key, deploy, debug, temperature,
     if logs is not None:
         if logs == 0: # default to 1
             logs = 1
-        with open('api_log.json') as f:
-            data = json.load(f)
+        try:
+            with open('api_log.json') as f:
+                data = json.load(f)
 
-        # convert keys to datetime objects
-        log_times = {datetime.strptime(k, "%Y%m%d%H%M%S%f"): v for k, v in data.items()}
+            # convert keys to datetime objects
+            log_times = {datetime.strptime(k, "%Y%m%d%H%M%S%f"): v for k, v in data.items()}
 
-        # sort by timestamp
-        sorted_times = sorted(log_times.items(), key=operator.itemgetter(0), reverse=True)
+            # sort by timestamp
+            sorted_times = sorted(log_times.items(), key=operator.itemgetter(0), reverse=True)
 
-        # get n recent logs
-        recent_logs = {k.strftime("%Y%m%d%H%M%S%f"): v for k, v in sorted_times[:logs]}
+            # get n recent logs
+            recent_logs = {k.strftime("%Y%m%d%H%M%S%f"): v for k, v in sorted_times[:logs]}
 
-        print(json.dumps(recent_logs, indent=4))
+            print(json.dumps(recent_logs, indent=4))
+        except:
+            print("LiteLLM: No logs saved!")
         return
     if add_key:
         key_name, key_value = add_key.split("=")
@@ -200,7 +205,9 @@ def run_server(host, port, api_base, model, add_key, deploy, debug, temperature,
                 click.echo(f'LiteLLM: streaming response from proxy {chunk}')
         return
     else:
-        initialize(model, api_base, debug, temperature, max_tokens, max_budget, telemetry, drop_params, add_function_to_prompt)
+        if headers:
+            headers = json.loads(headers)
+        initialize(model=model, alias=alias, api_base=api_base, debug=debug, temperature=temperature, max_tokens=max_tokens, max_budget=max_budget, telemetry=telemetry, drop_params=drop_params, add_function_to_prompt=add_function_to_prompt, headers=headers, save=save)
         try:
             import uvicorn
         except:
@@ -11,15 +11,17 @@ try:
     import fastapi
     import tomli as tomllib
     import appdirs
+    import tomli_w
 except ImportError:
     import subprocess
     import sys
 
-    subprocess.check_call([sys.executable, "-m", "pip", "install", "uvicorn", "fastapi", "tomli", "appdirs"])
+    subprocess.check_call([sys.executable, "-m", "pip", "install", "uvicorn", "fastapi", "tomli", "appdirs", "tomli-w"])
     import uvicorn
     import fastapi
     import tomli as tomllib
     import appdirs
+    import tomli_w
 
 
 import random
@@ -88,6 +90,7 @@ user_max_tokens = None
 user_temperature = None
 user_telemetry = True
 user_config = None
+user_headers = None
 config_filename = "litellm.secrets.toml"
 config_dir = os.getcwd()
 config_dir = appdirs.user_config_dir("litellm")
@@ -120,12 +123,41 @@ def add_keys_to_config(key, value):
     config.setdefault('keys', {})[key] = value
 
     # Write config to file
-    with open(user_config_path, 'w') as f:
-        for section, data in config.items():
-            f.write('[%s]\n' % section)
-            for k, v in data.items():
-                f.write('%s = "%s"\n' % (k, v))
+    with open(user_config_path, 'wb') as f:
+        tomli_w.dump(config, f)
+
+def save_params_to_config(data: dict):
+    # Check if file exists
+    if os.path.exists(user_config_path):
+        # Load existing file
+        with open(user_config_path, "rb") as f:
+            config = tomllib.load(f)
+    else:
+        # File doesn't exist, create empty config
+        config = {}
+
+    config.setdefault('general', {})
+
+    ## general config
+    general_settings = data["general"]
+
+    for key, value in general_settings.items():
+        config["general"][key] = value
+
+    ## model-specific config
+    config.setdefault("model", {})
+    config["model"].setdefault(user_model, {})
+
+    user_model_config = data[user_model]
+    model_key = model_key = user_model_config.pop("alias", user_model)
+    config["model"].setdefault(model_key, {})
+    for key, value in user_model_config.items():
+        config["model"][model_key][key] = value
+
+    # Write config to file
+    with open(user_config_path, 'wb') as f:
+        tomli_w.dump(config, f)
 
 
 def load_config():
     try:
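For reference, a small sketch (not from the diff; the values are invented) of the TOML that tomli_w now writes for a config shaped like the one save_params_to_config builds:

import tomli_w

config = {
    "general": {"max_budget": 100},
    "model": {"codellama": {"api_base": "https://my-endpoint.example", "max_tokens": 256}},
}
print(tomli_w.dumps(config))
# roughly:
# [general]
# max_budget = 100
#
# [model.codellama]
# api_base = "https://my-endpoint.example"
# max_tokens = 256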
@@ -138,7 +170,6 @@ def load_config():
         if "keys" in user_config:
             for key in user_config["keys"]:
                 os.environ[key] = user_config["keys"][key] # litellm can read keys from the environment
-
         ## settings
         if "general" in user_config:
             litellm.add_function_to_prompt = user_config["general"].get("add_function_to_prompt", True) # by default add function to prompt if unsupported by provider
@@ -191,24 +222,42 @@ def load_config():
     except Exception as e:
         pass
 
-def initialize(model, api_base, debug, temperature, max_tokens, max_budget, telemetry, drop_params, add_function_to_prompt):
-    global user_model, user_api_base, user_debug, user_max_tokens, user_temperature, user_telemetry
+def initialize(model, alias, api_base, debug, temperature, max_tokens, max_budget, telemetry, drop_params, add_function_to_prompt, headers, save):
+    global user_model, user_api_base, user_debug, user_max_tokens, user_temperature, user_telemetry, user_headers
     user_model = model
     user_debug = debug
 
     load_config()
-    user_api_base = api_base
-    user_max_tokens = max_tokens
-    user_temperature = temperature
+    dynamic_config = {"general": {}, user_model: {}}
+    if headers: # model-specific param
+        user_headers = headers
+        dynamic_config[user_model]["headers"] = headers
+    if api_base: # model-specific param
+        user_api_base = api_base
+        dynamic_config[user_model]["api_base"] = api_base
+    if max_tokens: # model-specific param
+        user_max_tokens = max_tokens
+        dynamic_config[user_model]["max_tokens"] = max_tokens
+    if temperature: # model-specific param
+        user_temperature = temperature
+        dynamic_config[user_model]["temperature"] = temperature
+    if alias: # model-specific param
+        dynamic_config[user_model]["alias"] = alias
+    if drop_params == True: # litellm-specific param
+        litellm.drop_params = True
+        dynamic_config["general"]["drop_params"] = True
+    if add_function_to_prompt == True: # litellm-specific param
+        litellm.add_function_to_prompt = True
+        dynamic_config["general"]["add_function_to_prompt"] = True
+    if max_budget: # litellm-specific param
+        litellm.max_budget = max_budget
+        dynamic_config["general"]["max_budget"] = max_budget
+    if save:
+        save_params_to_config(dynamic_config)
+        with open(user_config_path) as f:
+            print(f.read())
+        print("\033[1;32mDone successfully\033[0m")
     user_telemetry = telemetry
     usage_telemetry(feature="local_proxy_server")
-    if drop_params == True:
-        litellm.drop_params = True
-    if add_function_to_prompt == True:
-        litellm.add_function_to_prompt = True
-    if max_budget:
-        litellm.max_budget = max_budget
 
 
 def deploy_proxy(model, api_base, debug, temperature, max_tokens, telemetry, deploy):
     import requests
@@ -354,9 +403,12 @@ def logger(
             existing_data = {}
 
         existing_data.update(log_data)
-
-        with open(log_file, 'w') as f:
-            json.dump(existing_data, f, indent=2)
+        def write_to_log():
+            with open(log_file, 'w') as f:
+                json.dump(existing_data, f, indent=2)
+
+        thread = threading.Thread(target=write_to_log, daemon=True)
+        thread.start()
     elif log_event_type == 'post_api_call':
         if "stream" not in kwargs["optional_params"] or kwargs["optional_params"]["stream"] is False or kwargs.get("complete_streaming_response", False):
             inference_params = copy.deepcopy(kwargs)
@@ -367,9 +419,13 @@ def logger(
                     existing_data = json.load(f)
 
                 existing_data[dt_key]['post_api_call'] = inference_params
 
-                with open(log_file, 'w') as f:
-                    json.dump(existing_data, f, indent=2)
 
+                def write_to_log():
+                    with open(log_file, 'w') as f:
+                        json.dump(existing_data, f, indent=2)
+
+                thread = threading.Thread(target=write_to_log, daemon=True)
+                thread.start()
     except:
         traceback.print_exc()
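The pattern used in both logger branches above, shown in isolation (the payload is made up): offload the blocking json.dump to a daemon thread so the request path is not held up. Note that with daemon=True a write still in flight when the interpreter exits may be dropped.

import json
import threading

def write_async(log_file, payload):
    def write_to_log():
        with open(log_file, 'w') as f:
            json.dump(payload, f, indent=2)
    thread = threading.Thread(target=write_to_log, daemon=True)
    thread.start()
    return thread

write_async("api_log.json", {"20231014000000000000": {"pre_api_call": {"model": "example-model"}}})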
@@ -388,6 +444,8 @@ def litellm_completion(data, type):
         data["max_tokens"] = user_max_tokens
     if user_api_base:
         data["api_base"] = user_api_base
+    if user_headers:
+        data["headers"] = user_headers
     if type == "completion":
         response = litellm.text_completion(**data)
     elif type == "chat_completion":
@@ -397,6 +455,7 @@ def litellm_completion(data, type):
         print_verbose(f"response: {response}")
         return response
     except Exception as e:
+        traceback.print_exc()
         if "Invalid response object from API" in str(e):
             completion_call_details = {}
             if user_model:
@@ -3207,28 +3207,32 @@ class CustomStreamWrapper:
         return {"text": text, "is_finished": is_finished, "finish_reason": finish_reason}
 
     def handle_huggingface_chunk(self, chunk):
-        chunk = chunk.decode("utf-8")
-        text = ""
-        is_finished = False
-        finish_reason = ""
-        print_verbose(f"chunk: {chunk}")
-        if chunk.startswith("data:"):
-            data_json = json.loads(chunk[5:])
-            print_verbose(f"data json: {data_json}")
-            if "token" in data_json and "text" in data_json["token"]:
-                text = data_json["token"]["text"]
-            if data_json.get("details", False) and data_json["details"].get("finish_reason", False):
-                is_finished = True
-                finish_reason = data_json["details"]["finish_reason"]
-            elif data_json.get("generated_text", False): # if full generated text exists, then stream is complete
-                text = "" # don't return the final bos token
-                is_finished = True
-                finish_reason = "stop"
+        try:
+            chunk = chunk.decode("utf-8")
+            text = ""
+            is_finished = False
+            finish_reason = ""
+            print_verbose(f"chunk: {chunk}")
+            if chunk.startswith("data:"):
+                data_json = json.loads(chunk[5:])
+                print_verbose(f"data json: {data_json}")
+                if "token" in data_json and "text" in data_json["token"]:
+                    text = data_json["token"]["text"]
+                if data_json.get("details", False) and data_json["details"].get("finish_reason", False):
+                    is_finished = True
+                    finish_reason = data_json["details"]["finish_reason"]
+                elif data_json.get("generated_text", False): # if full generated text exists, then stream is complete
+                    text = "" # don't return the final bos token
+                    is_finished = True
+                    finish_reason = "stop"
 
-            return {"text": text, "is_finished": is_finished, "finish_reason": finish_reason}
-        elif "error" in chunk:
-            raise ValueError(chunk)
-        return {"text": text, "is_finished": is_finished, "finish_reason": finish_reason}
+                return {"text": text, "is_finished": is_finished, "finish_reason": finish_reason}
+            elif "error" in chunk:
+                raise ValueError(chunk)
+            return {"text": text, "is_finished": is_finished, "finish_reason": finish_reason}
+        except Exception as e:
+            traceback.print_exc()
+            # raise(e)
 
     def handle_ai21_chunk(self, chunk): # fake streaming
         chunk = chunk.decode("utf-8")
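To make the wrapped parser's input concrete, a hedged sketch with fabricated TGI-style SSE lines shaped like the payloads handle_huggingface_chunk expects ("data:" prefix, token.text, details.finish_reason); this is a simplified illustration, not the handler itself.

import json

sample_chunks = [
    b'data:{"token": {"text": "Hello"}}',
    b'data:{"token": {"text": " world"}, "generated_text": "Hello world", "details": {"finish_reason": "length"}}',
]
for raw in sample_chunks:
    chunk = raw.decode("utf-8")
    data_json = json.loads(chunk[5:])  # strip the "data:" prefix, as above
    text = data_json.get("token", {}).get("text", "")
    finish_reason = data_json.get("details", {}).get("finish_reason", "")
    print(repr(text), finish_reason or "not finished")
# 'Hello' not finished
# ' world' length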
poetry.lock (generated): 2 lines changed
@@ -1,4 +1,4 @@
-# This file is automatically @generated by Poetry 1.6.1 and should not be changed by hand.
+# This file is automatically @generated by Poetry 1.5.1 and should not be changed by hand.
 
 [[package]]
 name = "aiohttp"
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "litellm"
-version = "0.8.4"
+version = "0.8.5"
 description = "Library to easily interface with LLM API providers"
 authors = ["BerriAI"]
 license = "MIT License"
@@ -26,7 +26,7 @@ requires = ["poetry-core"]
 build-backend = "poetry.core.masonry.api"
 
 [tool.commitizen]
-version = "0.8.4"
+version = "0.8.5"
 version_files = [
     "pyproject.toml:^version"
 ]