Mirror of https://github.com/BerriAI/litellm.git (synced 2025-04-24 18:24:20 +00:00)
adding support for meta-llama-2
This commit is contained in:
parent b5875cc4bd · commit 6aff47083b
12 changed files with 220 additions and 43 deletions
docs/my-website/docs/completion/huggingface_tutorial.md (new file, 45 lines)
@ -0,0 +1,45 @@

# Llama2 - Huggingface Tutorial

[Huggingface](https://huggingface.co/) is an open-source platform for deploying machine-learning models.

## Call Llama2 with Huggingface Inference Endpoints

LiteLLM makes it easy to call your public, private, or the default Huggingface endpoints.

In this tutorial, let's call 3 models:

- `deepset/deberta-v3-large-squad2`: calls the default Huggingface endpoint
- `meta-llama/Llama-2-7b-hf`: calls a public endpoint
- `meta-llama/Llama-2-7b-chat-hf`: calls your private endpoint

### Case 1: Call default huggingface endpoint

Here's the complete example:

```
from litellm import completion

model = "deepset/deberta-v3-large-squad2"
messages = [{"role": "user", "content": "Hey, how's it going?"}] # LiteLLM follows the OpenAI format

### CALLING ENDPOINT
completion(model=model, messages=messages, custom_llm_provider="huggingface")
```

What's happening?
- model - this is the name of the deployed model on Huggingface
- messages - this is the input. We accept the OpenAI chat format. For Huggingface, by default we iterate through the list and append each message["content"] to the prompt (see the short sketch below).
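
As a rough sketch of that default concatenation (illustrative only; the actual logic lives in the `huggingface_restapi.py` handler added in this commit):

```
# Minimal sketch: how the default prompt is built from OpenAI-style messages
messages = [
    {"role": "system", "content": "You are a helpful assistant. "},
    {"role": "user", "content": "Hey, how's it going?"},
]

prompt = ""
for message in messages:
    prompt += f"{message['content']}"

print(prompt)  # -> "You are a helpful assistant. Hey, how's it going?"
```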

### Case 2: Call Llama2 public endpoint

We've deployed `meta-llama/Llama-2-7b-hf` behind a public endpoint - `https://ag3dkq4zui5nu8g3.us-east-1.aws.endpoints.huggingface.cloud`.

Let's try it out:
```
from litellm import completion

model = "meta-llama/Llama-2-7b-hf"
messages = [{"role": "user", "content": "Hey, how's it going?"}] # LiteLLM follows the OpenAI format
custom_api_base = "https://ag3dkq4zui5nu8g3.us-east-1.aws.endpoints.huggingface.cloud"

### CALLING ENDPOINT
completion(model=model, messages=messages, custom_llm_provider="huggingface", custom_api_base=custom_api_base)
```
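
The tutorial's third bullet mentions calling `meta-llama/Llama-2-7b-chat-hf` on your private endpoint but stops before showing it. A minimal sketch of that call, assuming a placeholder endpoint URL and an `HF_TOKEN` in your environment (the new `huggingface_restapi.py` handler attaches it as a Bearer token when present):

```
import os
from litellm import completion

os.environ["HF_TOKEN"] = "hf_..."  # your Huggingface access token (placeholder)

model = "meta-llama/Llama-2-7b-chat-hf"
messages = [{"role": "user", "content": "Hey, how's it going?"}]
# placeholder URL - replace with your own private Inference Endpoint
custom_api_base = "https://<your-private-endpoint>.endpoints.huggingface.cloud"

### CALLING ENDPOINT
completion(model=model, messages=messages, custom_llm_provider="huggingface", custom_api_base=custom_api_base)
```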

@ -22,7 +22,7 @@ const sidebars = {
     {
       type: 'category',
       label: 'completion_function',
-      items: ['completion/input', 'completion/supported','completion/output'],
+      items: ['completion/input', 'completion/supported','completion/output', 'completion/huggingface_tutorial'],
     },
     {
       type: 'category',

@ -11,6 +11,7 @@ anthropic_key = None
 replicate_key = None
 cohere_key = None
 openrouter_key = None
+huggingface_key = None
 vertex_project = None
 vertex_location = None

@ -62,9 +63,6 @@ open_ai_chat_completion_models = [
     "gpt-3.5-turbo-16k",
     "gpt-3.5-turbo-0613",
     "gpt-3.5-turbo-16k-0613",
-    'gpt-3.5-turbo',
-    'gpt-3.5-turbo-16k-0613',
-    'gpt-3.5-turbo-16k'
 ]
 open_ai_text_completion_models = [
     'text-davinci-003'

@ -111,7 +109,22 @@ vertex_text_models = [
     "text-bison@001"
 ]
 
-model_list = open_ai_chat_completion_models + open_ai_text_completion_models + cohere_models + anthropic_models + replicate_models + openrouter_models + vertex_chat_models + vertex_text_models
+huggingface_models = [
+    "meta-llama/Llama-2-7b-hf",
+    "meta-llama/Llama-2-7b-chat-hf",
+    "meta-llama/Llama-2-13b-hf",
+    "meta-llama/Llama-2-13b-chat-hf",
+    "meta-llama/Llama-2-70b-hf",
+    "meta-llama/Llama-2-70b-chat-hf",
+    "meta-llama/Llama-2-7b",
+    "meta-llama/Llama-2-7b-chat",
+    "meta-llama/Llama-2-13b",
+    "meta-llama/Llama-2-13b-chat",
+    "meta-llama/Llama-2-70b",
+    "meta-llama/Llama-2-70b-chat",
+] # these have been tested extensively. But by default, all text2text-generation and text-generation models are supported by liteLLM. - https://docs.litellm.ai/docs/completion/supported
+
+model_list = open_ai_chat_completion_models + open_ai_text_completion_models + cohere_models + anthropic_models + replicate_models + openrouter_models + huggingface_models + vertex_chat_models + vertex_text_models
 
 ####### EMBEDDING MODELS ###################
 open_ai_embedding_models = [
Binary file not shown.
Binary file not shown.
Binary file not shown.

@ -13,6 +13,7 @@ class AnthropicError(Exception):
    def __init__(self, status_code, message):
        self.status_code = status_code
        self.message = message
        super().__init__(self.message) # Call the base class constructor with the parameters it needs

class AnthropicLLM:
@ -75,7 +76,6 @@ class AnthropicLLM:
        print_verbose(f"raw model_response: {response.text}")
        ## RESPONSE OBJECT
        completion_response = response.json()
        print(f"completion_response: {completion_response}")
        if "error" in completion_response:
            raise AnthropicError(message=completion_response["error"], status_code=response.status_code)
        else:

litellm/llms/huggingface_restapi.py (new file, 94 lines)
@ -0,0 +1,94 @@
## Uses the huggingface text generation inference API
import os, json
from enum import Enum
import requests
from litellm import logging
import time
from typing import Callable

class HuggingfaceError(Exception):
    def __init__(self, status_code, message):
        self.status_code = status_code
        self.message = message
        super().__init__(self.message) # Call the base class constructor with the parameters it needs

class HuggingfaceRestAPILLM():
    def __init__(self, encoding, api_key=None) -> None:
        self.encoding = encoding
        self.validate_environment(api_key=api_key)

    def validate_environment(self, api_key): # set up the environment required to run the model
        self.headers = {
            "content-type": "application/json",
        }
        # get the api key if it exists in the environment or is passed in, but don't require it
        self.api_key = os.getenv("HF_TOKEN") if "HF_TOKEN" in os.environ else api_key
        if self.api_key != None:
            self.headers["Authorization"] = f"Bearer {self.api_key}"

    def completion(self, model: str, messages: list, custom_api_base: str, model_response: dict, print_verbose: Callable, optional_params=None, litellm_params=None, logger_fn=None): # logic for parsing in - calling - parsing out model completion calls
        if custom_api_base:
            completion_url = custom_api_base
        elif "HF_API_BASE" in os.environ:
            completion_url = os.getenv("HF_API_BASE")
        else:
            completion_url = f"https://api-inference.huggingface.co/models/{model}"
        prompt = ""
        if "meta-llama" in model and "chat" in model: # use the required special tokens for meta-llama - https://huggingface.co/blog/llama2#how-to-prompt-llama-2
            prompt = "<s>"
            for message in messages:
                if message["role"] == "system":
                    prompt += "[INST] <<SYS>>" + message["content"]
                elif message["role"] == "assistant":
                    prompt += message["content"] + "</s><s>[INST]"
                elif message["role"] == "user":
                    prompt += message["content"] + "[/INST]"
        else:
            for message in messages:
                prompt += f"{message['content']}"
        ### MAP INPUT PARAMS
        # max tokens
        if "max_tokens" in optional_params:
            value = optional_params.pop("max_tokens")
            optional_params["max_new_tokens"] = value
        data = {
            "inputs": prompt,
            # "parameters": optional_params
        }
        ## LOGGING
        logging(model=model, input=prompt, additional_args={"litellm_params": litellm_params, "optional_params": optional_params}, logger_fn=logger_fn)
        ## COMPLETION CALL
        response = requests.post(completion_url, headers=self.headers, data=json.dumps(data))
        if "stream" in optional_params and optional_params["stream"] == True:
            return response.iter_lines()
        else:
            ## LOGGING
            logging(model=model, input=prompt, additional_args={"litellm_params": litellm_params, "optional_params": optional_params, "original_response": response.text}, logger_fn=logger_fn)
            print_verbose(f"raw model_response: {response.text}")
            ## RESPONSE OBJECT
            completion_response = response.json()
            print(f"response: {completion_response}")
            if isinstance(completion_response, dict) and "error" in completion_response:
                print(f"completion error: {completion_response['error']}")
                print(f"response.status_code: {response.status_code}")
                raise HuggingfaceError(message=completion_response["error"], status_code=response.status_code)
            else:
                model_response["choices"][0]["message"]["content"] = completion_response[0]["generated_text"]

            ## CALCULATING USAGE
            prompt_tokens = len(self.encoding.encode(prompt)) ##[TODO] use the llama2 tokenizer here
            completion_tokens = len(self.encoding.encode(model_response["choices"][0]["message"]["content"])) ##[TODO] use the llama2 tokenizer here

            model_response["created"] = time.time()
            model_response["model"] = model
            model_response["usage"] = {
                "prompt_tokens": prompt_tokens,
                "completion_tokens": completion_tokens,
                "total_tokens": prompt_tokens + completion_tokens
            }
            return model_response
        pass

    def embedding(): # logic for parsing in - calling - parsing out model embedding calls
        pass
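
For reference, a small standalone sketch (not part of the commit) that mirrors the meta-llama chat branch above and shows the prompt it builds for a short exchange, following the Llama-2 prompt format linked in the code comment:

```
# Standalone sketch mirroring the "meta-llama ... chat" branch above (illustrative only)
def build_llama2_chat_prompt(messages: list) -> str:
    prompt = "<s>"
    for message in messages:
        if message["role"] == "system":
            prompt += "[INST] <<SYS>>" + message["content"]
        elif message["role"] == "assistant":
            prompt += message["content"] + "</s><s>[INST]"
        elif message["role"] == "user":
            prompt += message["content"] + "[/INST]"
    return prompt

print(build_llama2_chat_prompt([
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Hey, how's it going?"},
]))
# -> <s>[INST] <<SYS>>You are a helpful assistant.Hey, how's it going?[/INST]
```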

@ -7,6 +7,7 @@ import litellm
 from litellm import client, logging, exception_type, timeout, get_optional_params, get_litellm_params
 from litellm.utils import get_secret, install_and_import, CustomStreamWrapper, read_config_args
 from .llms.anthropic import AnthropicLLM
+from .llms.huggingface_restapi import HuggingfaceRestAPILLM
 import tiktoken
 from concurrent.futures import ThreadPoolExecutor
 encoding = tiktoken.get_encoding("cl100k_base")
@ -222,7 +223,6 @@ def completion(
                response = CustomStreamWrapper(model_response, model)
                return response
            response = model_response

        elif model in litellm.openrouter_models or custom_llm_provider == "openrouter":
            openai.api_type = "openai"
            # not sure if this will work after someone first uses another API
@ -305,37 +305,15 @@ def completion(
                 "total_tokens": prompt_tokens + completion_tokens
             }
             response = model_response
-        elif custom_llm_provider == "huggingface":
-            import requests
-            API_URL = f"https://api-inference.huggingface.co/models/{model}"
-            HF_TOKEN = get_secret("HF_TOKEN")
-            headers = {"Authorization": f"Bearer {HF_TOKEN}"}
-
-            prompt = " ".join([message["content"] for message in messages])
-            ## LOGGING
-            logging(model=model, input=prompt, custom_llm_provider=custom_llm_provider, logger_fn=logger_fn)
-            input_payload = {"inputs": prompt}
-            response = requests.post(API_URL, headers=headers, json=input_payload)
-            ## LOGGING
-            logging(model=model, input=prompt, custom_llm_provider=custom_llm_provider, additional_args={"max_tokens": max_tokens, "original_response": response.text}, logger_fn=logger_fn)
-            if isinstance(response, dict) and "error" in response:
-                raise Exception(response["error"])
-            json_response = response.json()
-            if 'error' in json_response: # raise HF errors when they exist
-                raise Exception(json_response['error'])
-
-            completion_response = json_response[0]['generated_text']
-            prompt_tokens = len(encoding.encode(prompt))
-            completion_tokens = len(encoding.encode(completion_response))
-            ## RESPONSE OBJECT
-            model_response["choices"][0]["message"]["content"] = completion_response
-            model_response["created"] = time.time()
-            model_response["model"] = model
-            model_response["usage"] = {
-                "prompt_tokens": prompt_tokens,
-                "completion_tokens": completion_tokens,
-                "total_tokens": prompt_tokens + completion_tokens
-            }
+        elif model in litellm.huggingface_models or custom_llm_provider == "huggingface":
+            custom_llm_provider = "huggingface"
+            huggingface_key = api_key if api_key is not None else litellm.huggingface_key
+            huggingface_client = HuggingfaceRestAPILLM(encoding=encoding, api_key=huggingface_key)
+            model_response = huggingface_client.completion(model=model, messages=messages, custom_api_base=custom_api_base, model_response=model_response, print_verbose=print_verbose, optional_params=optional_params, litellm_params=litellm_params, logger_fn=logger_fn)
+            if 'stream' in optional_params and optional_params['stream'] == True:
+                # don't try to access stream object,
+                response = CustomStreamWrapper(model_response, model, custom_llm_provider="huggingface")
+                return response
+            response = model_response
         elif custom_llm_provider == "together_ai":
             import requests
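
Because routing now also keys off `litellm.huggingface_models` (not only `custom_llm_provider`), the listed Llama-2 model names reach the new handler by name alone. A minimal usage sketch, assuming a placeholder endpoint URL:

```
from litellm import completion

messages = [{"role": "user", "content": "Hey, how's it going?"}]
# "meta-llama/Llama-2-7b-chat-hf" is in litellm.huggingface_models, so no custom_llm_provider is needed;
# without custom_api_base, the handler falls back to the default Inference API endpoint
response = completion(model="meta-llama/Llama-2-7b-chat-hf", messages=messages,
                      custom_api_base="https://<your-endpoint>.endpoints.huggingface.cloud")
print(response)
```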
@ -383,7 +361,7 @@ def completion(
 
             prompt = " ".join([message["content"] for message in messages])
             ## LOGGING
-            logging(model=model, input=prompt, custom_llm_provider=custom_llm_provider, logger_fn=logger_fn)
+            logging(model=model, input=prompt, custom_llm_provider=custom_llm_provider, additional_args={"litellm_params": litellm_params, "optional_params": optional_params}, logger_fn=logger_fn)
 
             chat_model = ChatModel.from_pretrained(model)
@ -434,13 +412,13 @@ def completion(
             ## LOGGING
             logging(model=model, input=messages, custom_llm_provider=custom_llm_provider, logger_fn=logger_fn)
             args = locals()
-            raise ValueError(f"Invalid completion model args passed in. Check your input - {args}")
+            raise ValueError(f"Unable to map your input to a model. Check your input - {args}")
         return response
     except Exception as e:
         ## LOGGING
         logging(model=model, input=messages, custom_llm_provider=custom_llm_provider, additional_args={"max_tokens": max_tokens}, logger_fn=logger_fn, exception=e)
         ## Map to OpenAI Exception
-        raise exception_type(model=model, original_exception=e)
+        raise exception_type(model=model, custom_llm_provider=custom_llm_provider, original_exception=e)
 
 def batch_completion(*args, **kwargs):
     batch_messages = args[1] if len(args) > 1 else kwargs.get("messages")

@ -49,6 +49,17 @@ def test_completion_hf_api():
     except Exception as e:
         pytest.fail(f"Error occurred: {e}")
 
+def test_completion_hf_deployed_api():
+    try:
+        user_message = "There's a llama in my garden 😱 What should I do?"
+        messages = [{ "content": user_message,"role": "user"}]
+        response = completion(model="meta-llama/Llama-2-7b-chat-hf", messages=messages, custom_llm_provider="huggingface", custom_api_base="https://s7c7gytn18vnu4tw.us-east-1.aws.endpoints.huggingface.cloud", logger_fn=logger_fn)
+        # Add any assertions here to check the response
+        print(response)
+    except Exception as e:
+        pytest.fail(f"Error occurred: {e}")
+
+test_completion_hf_deployed_api()
 def test_completion_cohere():
     try:
         response = completion(model="command-nightly", messages=messages, max_tokens=500)

@ -23,6 +23,17 @@ try:
     for chunk in response:
         print(chunk['choices'][0]['delta'])
         score +=1
 except:
     print(f"error occurred: {traceback.format_exc()}")
     pass
+
+
+# test on anthropic completion call
+try:
+    response = completion(model="meta-llama/Llama-2-7b-chat-hf", messages=messages, custom_llm_provider="huggingface", custom_api_base="https://s7c7gytn18vnu4tw.us-east-1.aws.endpoints.huggingface.cloud", stream=True, logger_fn=logger_fn)
+    for chunk in response:
+        print(chunk['choices'][0]['delta'])
+        score +=1
+except:
+    print(f"error occurred: {traceback.format_exc()}")
+    pass

@ -589,7 +589,7 @@ def modify_integration(integration_name, integration_params):
         if "table_name" in integration_params:
             Supabase.supabase_table_name = integration_params["table_name"]
 
-def exception_type(model, original_exception):
+def exception_type(model, original_exception, custom_llm_provider):
     global user_logger_fn
     exception_mapping_worked = False
     try:
@ -640,6 +640,17 @@ def exception_type(model, original_exception):
             elif "CohereConnectionError" in exception_type: # cohere seems to fire these errors when we load test it (1k+ messages / min)
                 exception_mapping_worked = True
                 raise RateLimitError(f"CohereException - {original_exception.message}")
+            elif custom_llm_provider == "huggingface":
+                if hasattr(original_exception, "status_code"):
+                    if original_exception.status_code == 401:
+                        exception_mapping_worked = True
+                        raise AuthenticationError(f"HuggingfaceException - {original_exception.message}")
+                    elif original_exception.status_code == 400:
+                        exception_mapping_worked = True
+                        raise InvalidRequestError(f"HuggingfaceException - {original_exception.message}", f"{model}")
+                    elif original_exception.status_code == 429:
+                        exception_mapping_worked = True
+                        raise RateLimitError(f"HuggingfaceException - {original_exception.message}")
             raise original_exception # base case - return the original exception
         else:
             raise original_exception
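
With this mapping, Huggingface endpoint failures surface as the same OpenAI-style exceptions used elsewhere in utils.py. A minimal sketch of what a caller would see for a 401 (assumes a missing or invalid HF token):

```
from litellm import completion

messages = [{"role": "user", "content": "Hey, how's it going?"}]
try:
    # with a missing or invalid HF token against a gated/private model, the endpoint returns a 401 ...
    completion(model="meta-llama/Llama-2-7b-chat-hf", messages=messages, custom_llm_provider="huggingface")
except Exception as e:
    # ... which exception_type re-raises as an AuthenticationError ("HuggingfaceException - ...")
    print(type(e).__name__, e)
```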

@ -715,8 +726,9 @@ def get_secret(secret_name):
 # wraps the completion stream to return the correct format for the model
 # replicate/anthropic/cohere
 class CustomStreamWrapper:
-    def __init__(self, completion_stream, model):
+    def __init__(self, completion_stream, model, custom_llm_provider=None):
         self.model = model
+        self.custom_llm_provider = custom_llm_provider
         if model in litellm.cohere_models:
             # cohere does not return an iterator, so we need to wrap it in one
             self.completion_stream = iter(completion_stream)
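
Threading `custom_llm_provider` through the wrapper is what lets `stream=True` work for Huggingface. A minimal usage sketch, mirroring the new streaming test above:

```
from litellm import completion

messages = [{"role": "user", "content": "Hey, how's it going?"}]
# completion() returns a CustomStreamWrapper built with custom_llm_provider="huggingface"
response = completion(model="meta-llama/Llama-2-7b-chat-hf", messages=messages,
                      custom_llm_provider="huggingface", stream=True)
for chunk in response:
    print(chunk["choices"][0]["delta"])
```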

@ -745,6 +757,16 @@ class CustomStreamWrapper:
             return extracted_text
         else:
             return ""
 
+    def handle_huggingface_chunk(self, chunk):
+        chunk = chunk.decode("utf-8")
+        if chunk.startswith('data:'):
+            data_json = json.loads(chunk[5:])
+            if "token" in data_json and "text" in data_json["token"]:
+                return data_json["token"]["text"]
+            else:
+                return ""
+        return ""
+
     def __next__(self):
         completion_obj ={ "role": "assistant", "content": ""}

@ -763,6 +785,9 @@ class CustomStreamWrapper:
         elif self.model in litellm.cohere_models:
             chunk = next(self.completion_stream)
             completion_obj["content"] = chunk.text
+        elif self.custom_llm_provider and self.custom_llm_provider == "huggingface":
+            chunk = next(self.completion_stream)
+            completion_obj["content"] = self.handle_huggingface_chunk(chunk)
         # return this for all models
         return {"choices": [{"delta": completion_obj}]}
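
For context, a standalone sketch (not part of the commit) of the kind of server-sent-events line `handle_huggingface_chunk` expects; the payload shape is inferred from the parser above, and real text-generation-inference events carry more fields:

```
import json

# Example bytes shaped like what handle_huggingface_chunk above parses (illustrative only)
chunk = b'data:{"token": {"text": " Hello"}}'

decoded = chunk.decode("utf-8")
if decoded.startswith("data:"):
    data_json = json.loads(decoded[5:])
    print(data_json["token"]["text"])  # -> " Hello"
```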