refactor(huggingface_restapi.py): moving async completion + streaming to real async calls

Krrish Dholakia 2023-11-15 15:14:13 -08:00
parent 77394e7987
commit 1a705bfbcb
5 changed files with 464 additions and 365 deletions
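
With this change the Hugging Face path in acompletion() awaits a real httpx request (and streams over httpx) instead of running the synchronous requests-based call. A minimal usage sketch, mirroring the updated tests further down in this commit; it assumes a Hugging Face token is available to litellm (e.g. via the HUGGINGFACE_API_KEY environment variable), and the model id is simply the one the tests use:

import asyncio
from litellm import acompletion

async def main():
    messages = [{"content": "Hello, how are you?", "role": "user"}]

    # Non-streaming: the awaited coroutine resolves to a ModelResponse.
    response = await acompletion(
        model="huggingface/HuggingFaceH4/zephyr-7b-beta", messages=messages
    )
    print(response)

    # Streaming: iterate the returned object chunk by chunk, the way the
    # async streaming tests in this commit do.
    stream = await acompletion(
        model="huggingface/HuggingFaceH4/zephyr-7b-beta", messages=messages, stream=True
    )
    async for chunk in stream:
        print(chunk)

asyncio.run(main())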

@@ -19,7 +19,7 @@ telemetry = True
max_tokens = 256 # OpenAI Defaults
drop_params = False
retry = True
request_timeout: float = 6000
request_timeout: Optional[float] = None
api_key: Optional[str] = None
openai_key: Optional[str] = None
azure_key: Optional[str] = None

@@ -3,6 +3,7 @@ import os, copy, types
import json
from enum import Enum
import httpx, requests
from .base import BaseLLM
import time
import litellm
from typing import Callable, Dict, List, Any
@@ -67,19 +68,6 @@ class HuggingfaceConfig():
and not isinstance(v, (types.FunctionType, types.BuiltinFunctionType, classmethod, staticmethod))
and v is not None}
def validate_environment(api_key, headers):
default_headers = {
"content-type": "application/json",
}
if api_key and headers is None:
default_headers["Authorization"] = f"Bearer {api_key}" # Huggingface Inference Endpoint default is to accept bearer tokens
headers = default_headers
elif headers:
headers=headers
else:
headers = default_headers
return headers
def output_parser(generated_text: str):
"""
Parse the output text to remove any special characters. In our current approach we just check for ChatML tokens.
@@ -94,8 +82,6 @@ def output_parser(generated_text: str):
generated_text = generated_text[::-1].replace(token[::-1], "", 1)[::-1]
return generated_text
tgi_models_cache = None
conv_models_cache = None
def read_tgi_conv_models():
@@ -144,7 +130,106 @@ def get_hf_task_for_model(model):
else:
return "text-generation-inference" # default to tgi
def completion(
class Huggingface(BaseLLM):
_client_session: Optional[httpx.Client] = None
_aclient_session: Optional[httpx.AsyncClient] = None
def __init__(self) -> None:
super().__init__()
def validate_environment(self, api_key, headers):
default_headers = {
"content-type": "application/json",
}
if api_key and headers is None:
default_headers["Authorization"] = f"Bearer {api_key}" # Huggingface Inference Endpoint default is to accept bearer tokens
headers = default_headers
elif headers:
headers=headers
else:
headers = default_headers
return headers
def convert_to_model_response_object(self,
completion_response,
model_response,
task,
optional_params,
encoding,
input_text,
model):
if task == "conversational":
if len(completion_response["generated_text"]) > 0: # type: ignore
model_response["choices"][0]["message"][
"content"
] = completion_response["generated_text"] # type: ignore
elif task == "text-generation-inference":
if len(completion_response[0]["generated_text"]) > 0:
model_response["choices"][0]["message"][
"content"
] = output_parser(completion_response[0]["generated_text"])
## GETTING LOGPROBS + FINISH REASON
if "details" in completion_response[0] and "tokens" in completion_response[0]["details"]:
model_response.choices[0].finish_reason = completion_response[0]["details"]["finish_reason"]
sum_logprob = 0
for token in completion_response[0]["details"]["tokens"]:
if token["logprob"] != None:
sum_logprob += token["logprob"]
model_response["choices"][0]["message"]._logprob = sum_logprob
if "best_of" in optional_params and optional_params["best_of"] > 1:
if "details" in completion_response[0] and "best_of_sequences" in completion_response[0]["details"]:
choices_list = []
for idx, item in enumerate(completion_response[0]["details"]["best_of_sequences"]):
sum_logprob = 0
for token in item["tokens"]:
if token["logprob"] != None:
sum_logprob += token["logprob"]
if len(item["generated_text"]) > 0:
message_obj = Message(content=output_parser(item["generated_text"]), logprobs=sum_logprob)
else:
message_obj = Message(content=None)
choice_obj = Choices(finish_reason=item["finish_reason"], index=idx+1, message=message_obj)
choices_list.append(choice_obj)
model_response["choices"].extend(choices_list)
else:
if len(completion_response[0]["generated_text"]) > 0:
model_response["choices"][0]["message"][
"content"
] = output_parser(completion_response[0]["generated_text"])
## CALCULATING USAGE
prompt_tokens = 0
try:
prompt_tokens = len(
encoding.encode(input_text)
) ##[TODO] use the llama2 tokenizer here
except:
# this should remain non blocking we should not block a response returning if calculating usage fails
pass
output_text = model_response["choices"][0]["message"].get("content", "")
if output_text is not None and len(output_text) > 0:
completion_tokens = 0
try:
completion_tokens = len(
encoding.encode(model_response["choices"][0]["message"].get("content", ""))
) ##[TODO] use the llama2 tokenizer here
except:
# this should remain non blocking we should not block a response returning if calculating usage fails
pass
else:
completion_tokens = 0
model_response["created"] = time.time()
model_response["model"] = model
usage = Usage(
prompt_tokens=prompt_tokens,
completion_tokens=completion_tokens,
total_tokens=prompt_tokens + completion_tokens
)
model_response.usage = usage
model_response._hidden_params["original_response"] = completion_response
return model_response
def completion(self,
model: str,
messages: list,
api_base: Optional[str],
@@ -155,13 +240,15 @@ def completion(
api_key,
logging_obj,
custom_prompt_dict={},
acompletion: bool = False,
optional_params=None,
litellm_params=None,
logger_fn=None,
):
):
super().completion()
exception_mapping_worked = False
try:
headers = validate_environment(api_key, headers)
headers = self.validate_environment(api_key, headers)
task = get_hf_task_for_model(model)
print_verbose(f"{model}, {task}")
completion_url = ""
@@ -255,9 +342,17 @@ def completion(
logging_obj.pre_call(
input=input_text,
api_key=api_key,
additional_args={"complete_input_dict": data, "task": task, "headers": headers, "api_base": completion_url},
additional_args={"complete_input_dict": data, "task": task, "headers": headers, "api_base": completion_url, "acompletion": acompletion},
)
## COMPLETION CALL
if acompletion is True:
### ASYNC STREAMING
if optional_params.get("stream", False):
return self.async_streaming(logging_obj=logging_obj, api_base=completion_url, data=data, headers=headers, model_response=model_response, model=model)
else:
### ASYNC COMPLETION
return self.acompletion(api_base=completion_url, data=data, headers=headers, model_response=model_response, task=task, encoding=encoding, input_text=input_text, model=model, optional_params=optional_params)
### SYNC STREAMING
if "stream" in optional_params and optional_params["stream"] == True:
response = requests.post(
completion_url,
@@ -266,6 +361,7 @@ def completion(
stream=optional_params["stream"]
)
return response.iter_lines()
### SYNC COMPLETION
else:
response = requests.post(
completion_url,
@@ -273,7 +369,6 @@ def completion(
data=json.dumps(data)
)
## Some servers might return streaming responses even though stream was not set to true. (e.g. Baseten)
is_streamed = False
if response.__dict__['headers'].get("Content-Type", "") == "text/event-stream":
@@ -317,78 +412,16 @@ def completion(
message=completion_response["error"],
status_code=response.status_code,
)
else:
if task == "conversational":
if len(completion_response["generated_text"]) > 0: # type: ignore
model_response["choices"][0]["message"][
"content"
] = completion_response["generated_text"] # type: ignore
elif task == "text-generation-inference":
if len(completion_response[0]["generated_text"]) > 0:
model_response["choices"][0]["message"][
"content"
] = output_parser(completion_response[0]["generated_text"])
## GETTING LOGPROBS + FINISH REASON
if "details" in completion_response[0] and "tokens" in completion_response[0]["details"]:
model_response.choices[0].finish_reason = completion_response[0]["details"]["finish_reason"]
sum_logprob = 0
for token in completion_response[0]["details"]["tokens"]:
if token["logprob"] != None:
sum_logprob += token["logprob"]
model_response["choices"][0]["message"]._logprob = sum_logprob
if "best_of" in optional_params and optional_params["best_of"] > 1:
if "details" in completion_response[0] and "best_of_sequences" in completion_response[0]["details"]:
choices_list = []
for idx, item in enumerate(completion_response[0]["details"]["best_of_sequences"]):
sum_logprob = 0
for token in item["tokens"]:
if token["logprob"] != None:
sum_logprob += token["logprob"]
if len(item["generated_text"]) > 0:
message_obj = Message(content=output_parser(item["generated_text"]), logprobs=sum_logprob)
else:
message_obj = Message(content=None)
choice_obj = Choices(finish_reason=item["finish_reason"], index=idx+1, message=message_obj)
choices_list.append(choice_obj)
model_response["choices"].extend(choices_list)
else:
if len(completion_response[0]["generated_text"]) > 0:
model_response["choices"][0]["message"][
"content"
] = output_parser(completion_response[0]["generated_text"])
## CALCULATING USAGE
prompt_tokens = 0
try:
prompt_tokens = len(
encoding.encode(input_text)
) ##[TODO] use the llama2 tokenizer here
except:
# this should remain non blocking we should not block a response returning if calculating usage fails
pass
print_verbose(f'output: {model_response["choices"][0]["message"]}')
output_text = model_response["choices"][0]["message"].get("content", "")
if output_text is not None and len(output_text) > 0:
completion_tokens = 0
try:
completion_tokens = len(
encoding.encode(model_response["choices"][0]["message"].get("content", ""))
) ##[TODO] use the llama2 tokenizer here
except:
# this should remain non blocking we should not block a response returning if calculating usage fails
pass
else:
completion_tokens = 0
model_response["created"] = time.time()
model_response["model"] = model
usage = Usage(
prompt_tokens=prompt_tokens,
completion_tokens=completion_tokens,
total_tokens=prompt_tokens + completion_tokens
return self.convert_to_model_response_object(
completion_response=completion_response,
model_response=model_response,
task=task,
optional_params=optional_params,
encoding=encoding,
input_text=input_text,
model=model
)
model_response.usage = usage
model_response._hidden_params["original_response"] = completion_response
return model_response
except HuggingfaceError as e:
exception_mapping_worked = True
raise e
@@ -399,8 +432,65 @@ def completion(
import traceback
raise HuggingfaceError(status_code=500, message=traceback.format_exc())
async def acompletion(self,
api_base: str,
data: dict,
headers: dict,
model_response: ModelResponse,
task: str,
encoding: Any,
input_text: str,
model: str,
optional_params: dict):
if self._aclient_session is None:
self._aclient_session = self.create_aclient_session()
client = self._aclient_session
try:
response = await client.post(url=api_base, json=data, headers=headers)
response_json = response.json()
if response.status_code != 200:
raise HuggingfaceError(status_code=response.status_code, message=response.text, request=response.request, response=response)
def embedding(
## RESPONSE OBJECT
return self.convert_to_model_response_object(completion_response=response_json,
model_response=model_response,
task=task,
encoding=encoding,
input_text=input_text,
model=model,
optional_params=optional_params)
except Exception as e:
if isinstance(e,httpx.TimeoutException):
raise HuggingfaceError(status_code=500, message="Request Timeout Error")
elif response and hasattr(response, "text"):
raise HuggingfaceError(status_code=500, message=f"{str(e)}\n\nOriginal Response: {response.text}")
else:
raise HuggingfaceError(status_code=500, message=f"{str(e)}")
async def async_streaming(self,
logging_obj,
api_base: str,
data: dict,
headers: dict,
model_response: ModelResponse,
model: str):
if self._aclient_session is None:
self._aclient_session = self.create_aclient_session()
client = self._aclient_session
async with client.stream(
url=f"{api_base}",
json=data,
headers=headers,
method="POST"
) as response:
if response.status_code != 200:
raise HuggingfaceError(status_code=response.status_code, message="An error occurred while streaming")
streamwrapper = CustomStreamWrapper(completion_stream=response.aiter_lines(), model=model, custom_llm_provider="huggingface",logging_obj=logging_obj)
async for transformed_chunk in streamwrapper:
yield transformed_chunk
def embedding(self,
model: str,
input: list,
api_key: Optional[str] = None,
@@ -408,8 +498,9 @@ def embedding(
logging_obj=None,
model_response=None,
encoding=None,
):
headers = validate_environment(api_key, headers=None)
):
super().embedding()
headers = self.validate_environment(api_key, headers=None)
# print_verbose(f"{model}, {task}")
embed_url = ""
if "https" in model:

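Both the synchronous path and the new async acompletion() above now funnel the parsed JSON through the shared convert_to_model_response_object() helper. For orientation, a hedged example of the text-generation-inference response shape that helper expects; the field values here are illustrative, not taken from this commit:

# Illustrative TGI-style payload: a list with one entry per returned sequence.
completion_response = [
    {
        "generated_text": "I'm doing well, thank you!",
        "details": {
            "finish_reason": "eos_token",
            "tokens": [
                {"id": 306, "text": "I", "logprob": -0.11, "special": False},
                {"id": 29915, "text": "'m", "logprob": -0.05, "special": False},
                # ... one entry per generated token
            ],
            # "best_of_sequences": [...]  # only present when best_of > 1 is requested
        },
    }
]

# As in the helper above: finish_reason comes from details, and the per-token
# logprobs are summed into the message's _logprob field.
sum_logprob = sum(
    token["logprob"]
    for token in completion_response[0]["details"]["tokens"]
    if token["logprob"] is not None
)
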
@@ -53,6 +53,7 @@ from .llms import (
maritalk)
from .llms.openai import OpenAIChatCompletion, OpenAITextCompletion
from .llms.azure import AzureChatCompletion
from .llms.huggingface_restapi import Huggingface
from .llms.prompt_templates.factory import prompt_factory, custom_prompt, function_call_prompt
import tiktoken
from concurrent.futures import ThreadPoolExecutor
@@ -77,6 +78,7 @@ dotenv.load_dotenv() # Loading env variables using dotenv
openai_chat_completions = OpenAIChatCompletion()
openai_text_completions = OpenAITextCompletion()
azure_chat_completions = AzureChatCompletion()
huggingface = Huggingface()
####### COMPLETION ENDPOINTS ################
class LiteLLM:
@@ -165,7 +167,8 @@ async def acompletion(*args, **kwargs):
if (custom_llm_provider == "openai"
or custom_llm_provider == "azure"
or custom_llm_provider == "custom_openai"
or custom_llm_provider == "text-completion-openai"): # currently implemented aiohttp calls for just azure and openai, soon all.
or custom_llm_provider == "text-completion-openai"
or custom_llm_provider == "huggingface"): # currently implemented aiohttp calls for just azure and openai, soon all.
if kwargs.get("stream", False):
response = completion(*args, **kwargs)
else:
@@ -862,7 +865,7 @@ def completion(
custom_prompt_dict
or litellm.custom_prompt_dict
)
model_response = huggingface_restapi.completion(
model_response = huggingface.completion(
model=model,
messages=messages,
api_base=api_base, # type: ignore
@@ -874,10 +877,11 @@ def completion(
logger_fn=logger_fn,
encoding=encoding,
api_key=huggingface_key,
acompletion=acompletion,
logging_obj=logging,
custom_prompt_dict=custom_prompt_dict
)
if "stream" in optional_params and optional_params["stream"] == True:
if "stream" in optional_params and optional_params["stream"] == True and acompletion is False:
# don't try to access stream object,
response = CustomStreamWrapper(
model_response, model, custom_llm_provider="huggingface", logging_obj=logging

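Note how the provider class keeps completion() synchronous while still feeding the async path: self.acompletion(...) and self.async_streaming(...) are called without await, so a coroutine or async-generator object is handed straight back to main.acompletion, which awaits or iterates it; the new `acompletion is False` guard keeps the synchronous CustomStreamWrapper from wrapping that object. A minimal, standalone illustration of the pattern (not litellm code):

import asyncio

async def _stream_chunks():
    # Async generator: its body only runs once someone iterates it.
    for piece in ("Hel", "lo", "!"):
        yield piece

async def _single_response():
    return "Hello!"

def completion(stream: bool):
    # Synchronous dispatcher, mirroring the shape of Huggingface.completion():
    # hand back a coroutine / async generator without awaiting it here.
    if stream:
        return _stream_chunks()
    return _single_response()

async def main():
    print(await completion(stream=False))   # caller awaits the coroutine
    async for chunk in completion(stream=True):
        print(chunk, end="")
    print()

asyncio.run(main())
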
@@ -25,11 +25,12 @@ def test_sync_response():
def test_async_response():
import asyncio
litellm.set_verbose = True
async def test_get_response():
user_message = "Hello, how are you?"
messages = [{"content": user_message, "role": "user"}]
try:
response = await acompletion(model="command-nightly", messages=messages)
response = await acompletion(model="huggingface/HuggingFaceH4/zephyr-7b-beta", messages=messages)
print(f"response: {response}")
except Exception as e:
pytest.fail(f"An exception occurred: {e}")
@@ -44,7 +45,7 @@ def test_get_response_streaming():
messages = [{"content": user_message, "role": "user"}]
try:
litellm.set_verbose = True
response = await acompletion(model="command-nightly", messages=messages, stream=True)
response = await acompletion(model="gpt-3.5-turbo", messages=messages, stream=True)
print(type(response))
import inspect
@@ -67,15 +68,16 @@ def test_get_response_streaming():
asyncio.run(test_async_call())
test_get_response_streaming()
# test_get_response_streaming()
def test_get_response_non_openai_streaming():
import asyncio
litellm.set_verbose = True
async def test_async_call():
user_message = "Hello, how are you?"
messages = [{"content": user_message, "role": "user"}]
try:
response = await acompletion(model="command-nightly", messages=messages, stream=True)
response = await acompletion(model="huggingface/HuggingFaceH4/zephyr-7b-beta", messages=messages, stream=True)
print(type(response))
import inspect
@@ -98,4 +100,4 @@ def test_get_response_non_openai_streaming():
return response
asyncio.run(test_async_call())
# test_get_response_non_openai_streaming()
test_get_response_non_openai_streaming()

@@ -511,6 +511,8 @@ class Logging:
masked_headers = {k: v[:-40] + '*' * 40 if len(v) > 40 else v for k, v in headers.items()}
formatted_headers = " ".join([f"-H '{k}: {v}'" for k, v in masked_headers.items()])
print_verbose(f"PRE-API-CALL ADDITIONAL ARGS: {additional_args}")
curl_command = "\n\nPOST Request Sent from LiteLLM:\n"
curl_command += "curl -X POST \\\n"
curl_command += f"{api_base} \\\n"
@@ -4313,7 +4315,6 @@ class CustomStreamWrapper:
def handle_huggingface_chunk(self, chunk):
try:
chunk = chunk.decode("utf-8")
text = ""
is_finished = False
finish_reason = ""
@@ -4770,7 +4771,8 @@ class CustomStreamWrapper:
if (self.custom_llm_provider == "openai"
or self.custom_llm_provider == "azure"
or self.custom_llm_provider == "custom_openai"
or self.custom_llm_provider == "text-completion-openai"):
or self.custom_llm_provider == "text-completion-openai"
or self.custom_llm_provider == "huggingface"):
async for chunk in self.completion_stream:
if chunk == "None" or chunk is None:
raise Exception
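
The async branch of CustomStreamWrapper above now treats "huggingface" like the OpenAI-style providers and iterates the httpx aiter_lines() stream directly; each line handed to handle_huggingface_chunk is a TGI server-sent-event. A hedged sketch of what parsing one such line involves (field names follow the public TGI streaming format, the values are illustrative, and this is not the litellm implementation):

import json

def parse_tgi_line(raw: bytes):
    """Extract (text, is_finished, finish_reason) from one TGI SSE line."""
    line = raw.decode("utf-8")
    if not line.startswith("data:"):
        return "", False, ""          # keep-alive / empty lines carry no token
    payload = json.loads(line[len("data:"):])
    text = payload["token"]["text"]
    details = payload.get("details") or {}   # non-null only on the final chunk
    finish_reason = details.get("finish_reason", "")
    return text, bool(finish_reason), finish_reason

print(parse_tgi_line(
    b'data:{"token":{"id":2787,"text":" world","logprob":-0.31,"special":false},'
    b'"generated_text":null,"details":null}'
))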