diff --git a/docs/my-website/docs/providers/huggingface.md b/docs/my-website/docs/providers/huggingface.md index f8ebadfcf..35befd3e2 100644 --- a/docs/my-website/docs/providers/huggingface.md +++ b/docs/my-website/docs/providers/huggingface.md @@ -21,6 +21,11 @@ This is done by adding the "huggingface/" prefix to `model`, example `completion +By default, LiteLLM will assume a huggingface call follows the TGI format. + + + + ```python import os from litellm import completion @@ -40,9 +45,58 @@ response = completion( print(response) ``` + + + +1. Add models to your config.yaml + + ```yaml + model_list: + - model_name: wizard-coder + litellm_params: + model: huggingface/WizardLM/WizardCoder-Python-34B-V1.0 + api_key: os.environ/HUGGINGFACE_API_KEY + api_base: "https://my-endpoint.endpoints.huggingface.cloud" + ``` + + + +2. Start the proxy + + ```bash + $ litellm --config /path/to/config.yaml --debug + ``` + +3. Test it! + + ```shell + curl --location 'http://0.0.0.0:4000/chat/completions' \ + --header 'Authorization: Bearer sk-1234' \ + --header 'Content-Type: application/json' \ + --data '{ + "model": "wizard-coder", + "messages": [ + { + "role": "user", + "content": "I like you!" + } + ], + }' + ``` + + + + +Append `conversational` to the model name + +e.g. `huggingface/conversational/` + + + + ```python import os from litellm import completion @@ -54,7 +108,7 @@ messages = [{ "content": "There's a llama in my garden 😱 What should I do?"," # e.g. Call 'facebook/blenderbot-400M-distill' hosted on HF Inference endpoints response = completion( - model="huggingface/facebook/blenderbot-400M-distill", + model="huggingface/conversational/facebook/blenderbot-400M-distill", messages=messages, api_base="https://my-endpoint.huggingface.cloud" ) @@ -62,7 +116,123 @@ response = completion( print(response) ``` - + + +1. Add models to your config.yaml + + ```yaml + model_list: + - model_name: blenderbot + litellm_params: + model: huggingface/conversational/facebook/blenderbot-400M-distill + api_key: os.environ/HUGGINGFACE_API_KEY + api_base: "https://my-endpoint.endpoints.huggingface.cloud" + ``` + + + +2. Start the proxy + + ```bash + $ litellm --config /path/to/config.yaml --debug + ``` + +3. Test it! + + ```shell + curl --location 'http://0.0.0.0:4000/chat/completions' \ + --header 'Authorization: Bearer sk-1234' \ + --header 'Content-Type: application/json' \ + --data '{ + "model": "blenderbot", + "messages": [ + { + "role": "user", + "content": "I like you!" + } + ], + }' + ``` + + + + + + + +Append `text-classification` to the model name + +e.g. `huggingface/text-classification/` + + + + +```python +import os +from litellm import completion + +# [OPTIONAL] set env var +os.environ["HUGGINGFACE_API_KEY"] = "huggingface_api_key" + +messages = [{ "content": "I like you, I love you!","role": "user"}] + +# e.g. Call 'shahrukhx01/question-vs-statement-classifier' hosted on HF Inference endpoints +response = completion( + model="huggingface/text-classification/shahrukhx01/question-vs-statement-classifier", + messages=messages, + api_base="https://my-endpoint.endpoints.huggingface.cloud", +) + +print(response) +``` + + + +1. Add models to your config.yaml + + ```yaml + model_list: + - model_name: bert-classifier + litellm_params: + model: huggingface/text-classification/shahrukhx01/question-vs-statement-classifier + api_key: os.environ/HUGGINGFACE_API_KEY + api_base: "https://my-endpoint.endpoints.huggingface.cloud" + ``` + + + +2. Start the proxy + + ```bash + $ litellm --config /path/to/config.yaml --debug + ``` + +3. Test it! + + ```shell + curl --location 'http://0.0.0.0:4000/chat/completions' \ + --header 'Authorization: Bearer sk-1234' \ + --header 'Content-Type: application/json' \ + --data '{ + "model": "bert-classifier", + "messages": [ + { + "role": "user", + "content": "I like you!" + } + ], + }' + ``` + + + + + + + +Append `text-generation` to the model name + +e.g. `huggingface/text-generation/` ```python import os @@ -75,7 +245,7 @@ messages = [{ "content": "There's a llama in my garden 😱 What should I do?"," # e.g. Call 'roneneldan/TinyStories-3M' hosted on HF Inference endpoints response = completion( - model="huggingface/roneneldan/TinyStories-3M", + model="huggingface/text-generation/roneneldan/TinyStories-3M", messages=messages, api_base="https://p69xlsj6rpno5drq.us-east-1.aws.endpoints.huggingface.cloud", ) diff --git a/litellm/llms/huggingface_restapi.py b/litellm/llms/huggingface_restapi.py index a2c4457c2..ad3c570e7 100644 --- a/litellm/llms/huggingface_restapi.py +++ b/litellm/llms/huggingface_restapi.py @@ -6,10 +6,12 @@ import httpx, requests from .base import BaseLLM import time import litellm -from typing import Callable, Dict, List, Any +from typing import Callable, Dict, List, Any, Literal from litellm.utils import ModelResponse, Choices, Message, CustomStreamWrapper, Usage from typing import Optional from .prompt_templates.factory import prompt_factory, custom_prompt +from litellm.types.completion import ChatCompletionMessageToolCallParam +import enum class HuggingfaceError(Exception): @@ -39,11 +41,29 @@ class HuggingfaceError(Exception): ) # Call the base class constructor with the parameters it needs +hf_task_list = [ + "text-generation-inference", + "conversational", + "text-classification", + "text-generation", +] + +hf_tasks = Literal[ + "text-generation-inference", + "conversational", + "text-classification", + "text-generation", +] + + class HuggingfaceConfig: """ Reference: https://huggingface.github.io/text-generation-inference/#/Text%20Generation%20Inference/compat_generate """ + hf_task: Optional[hf_tasks] = ( + None # litellm-specific param, used to know the api spec to use when calling huggingface api + ) best_of: Optional[int] = None decoder_input_details: Optional[bool] = None details: Optional[bool] = True # enables returning logprobs + best of @@ -101,6 +121,51 @@ class HuggingfaceConfig: and v is not None } + def get_supported_openai_params(self): + return [ + "stream", + "temperature", + "max_tokens", + "top_p", + "stop", + "n", + "echo", + ] + + def map_openai_params( + self, non_default_params: dict, optional_params: dict + ) -> dict: + for param, value in non_default_params.items(): + # temperature, top_p, n, stream, stop, max_tokens, n, presence_penalty default to None + if param == "temperature": + if value == 0.0 or value == 0: + # hugging face exception raised when temp==0 + # Failed: Error occurred: HuggingfaceException - Input validation error: `temperature` must be strictly positive + value = 0.01 + optional_params["temperature"] = value + if param == "top_p": + optional_params["top_p"] = value + if param == "n": + optional_params["best_of"] = value + optional_params["do_sample"] = ( + True # Need to sample if you want best of for hf inference endpoints + ) + if param == "stream": + optional_params["stream"] = value + if param == "stop": + optional_params["stop"] = value + if param == "max_tokens": + # HF TGI raises the following exception when max_new_tokens==0 + # Failed: Error occurred: HuggingfaceException - Input validation error: `max_new_tokens` must be strictly positive + if value == 0: + value = 1 + optional_params["max_new_tokens"] = value + if param == "echo": + # https://huggingface.co/docs/huggingface_hub/main/en/package_reference/inference_client#huggingface_hub.InferenceClient.text_generation.decoder_input_details + # Return the decoder input token logprobs and ids. You must set details=True as well for it to be taken into account. Defaults to False + optional_params["decoder_input_details"] = True + return optional_params + def output_parser(generated_text: str): """ @@ -162,16 +227,18 @@ def read_tgi_conv_models(): return set(), set() -def get_hf_task_for_model(model): +def get_hf_task_for_model(model: str) -> hf_tasks: # read text file, cast it to set # read the file called "huggingface_llms_metadata/hf_text_generation_models.txt" + if model.split("/")[0] in hf_task_list: + return model.split("/")[0] # type: ignore tgi_models, conversational_models = read_tgi_conv_models() if model in tgi_models: return "text-generation-inference" elif model in conversational_models: return "conversational" elif "roneneldan/TinyStories" in model: - return None + return "text-generation" else: return "text-generation-inference" # default to tgi @@ -202,7 +269,7 @@ class Huggingface(BaseLLM): self, completion_response, model_response, - task, + task: hf_tasks, optional_params, encoding, input_text, @@ -270,6 +337,10 @@ class Huggingface(BaseLLM): ) choices_list.append(choice_obj) model_response["choices"].extend(choices_list) + elif task == "text-classification": + model_response["choices"][0]["message"]["content"] = json.dumps( + completion_response + ) else: if len(completion_response[0]["generated_text"]) > 0: model_response["choices"][0]["message"]["content"] = output_parser( @@ -333,6 +404,12 @@ class Huggingface(BaseLLM): try: headers = self.validate_environment(api_key, headers) task = get_hf_task_for_model(model) + ## VALIDATE API FORMAT + if task is None or not isinstance(task, str) or task not in hf_task_list: + raise Exception( + "Invalid hf task - {}. Valid formats - {}.".format(task, hf_tasks) + ) + print_verbose(f"{model}, {task}") completion_url = "" input_text = "" @@ -433,14 +510,15 @@ class Huggingface(BaseLLM): inference_params.pop("return_full_text") data = { "inputs": prompt, - "parameters": inference_params, - "stream": ( # type: ignore + } + if task == "text-generation-inference": + data["parameters"] = inference_params + data["stream"] = ( # type: ignore True if "stream" in optional_params and optional_params["stream"] == True else False - ), - } + ) input_text = prompt ## LOGGING logging_obj.pre_call( @@ -531,10 +609,10 @@ class Huggingface(BaseLLM): isinstance(completion_response, dict) and "error" in completion_response ): - print_verbose(f"completion error: {completion_response['error']}") + print_verbose(f"completion error: {completion_response['error']}") # type: ignore print_verbose(f"response.status_code: {response.status_code}") raise HuggingfaceError( - message=completion_response["error"], + message=completion_response["error"], # type: ignore status_code=response.status_code, ) return self.convert_to_model_response_object( @@ -563,7 +641,7 @@ class Huggingface(BaseLLM): data: dict, headers: dict, model_response: ModelResponse, - task: str, + task: hf_tasks, encoding: Any, input_text: str, model: str, diff --git a/litellm/tests/test_completion.py b/litellm/tests/test_completion.py index bb47404dc..f02dafade 100644 --- a/litellm/tests/test_completion.py +++ b/litellm/tests/test_completion.py @@ -13,6 +13,7 @@ import litellm from litellm import embedding, completion, completion_cost, Timeout from litellm import RateLimitError from litellm.llms.prompt_templates.factory import anthropic_messages_pt +from unittest.mock import patch, MagicMock # litellm.num_retries=3 litellm.cache = None @@ -1137,7 +1138,7 @@ def test_get_hf_task_for_model(): model = "roneneldan/TinyStories-3M" model_type = litellm.llms.huggingface_restapi.get_hf_task_for_model(model) print(f"model:{model}, model type: {model_type}") - assert model_type == None + assert model_type == "text-generation" # test_get_hf_task_for_model() @@ -1145,15 +1146,92 @@ def test_get_hf_task_for_model(): # ################### Hugging Face TGI models ######################## # # TGI model # # this is a TGI model https://huggingface.co/glaiveai/glaive-coder-7b -def hf_test_completion_tgi(): - # litellm.set_verbose=True +def tgi_mock_post(url, data=None, json=None, headers=None): + mock_response = MagicMock() + mock_response.status_code = 200 + mock_response.headers = {"Content-Type": "application/json"} + mock_response.json.return_value = [ + { + "generated_text": "<|assistant|>\nI'm", + "details": { + "finish_reason": "length", + "generated_tokens": 10, + "seed": None, + "prefill": [], + "tokens": [ + { + "id": 28789, + "text": "<", + "logprob": -0.025222778, + "special": False, + }, + { + "id": 28766, + "text": "|", + "logprob": -0.000003695488, + "special": False, + }, + { + "id": 489, + "text": "ass", + "logprob": -0.0000019073486, + "special": False, + }, + { + "id": 11143, + "text": "istant", + "logprob": -0.000002026558, + "special": False, + }, + { + "id": 28766, + "text": "|", + "logprob": -0.0000015497208, + "special": False, + }, + { + "id": 28767, + "text": ">", + "logprob": -0.0000011920929, + "special": False, + }, + { + "id": 13, + "text": "\n", + "logprob": -0.00009703636, + "special": False, + }, + {"id": 28737, "text": "I", "logprob": -0.1953125, "special": False}, + { + "id": 28742, + "text": "'", + "logprob": -0.88183594, + "special": False, + }, + { + "id": 28719, + "text": "m", + "logprob": -0.00032639503, + "special": False, + }, + ], + }, + } + ] + return mock_response + + +def test_hf_test_completion_tgi(): + litellm.set_verbose = True try: - response = completion( - model="huggingface/HuggingFaceH4/zephyr-7b-beta", - messages=[{"content": "Hello, how are you?", "role": "user"}], - ) - # Add any assertions here to check the response - print(response) + with patch("requests.post", side_effect=tgi_mock_post): + response = completion( + model="huggingface/HuggingFaceH4/zephyr-7b-beta", + messages=[{"content": "Hello, how are you?", "role": "user"}], + max_tokens=10, + ) + # Add any assertions here to check the response + print(response) except litellm.ServiceUnavailableError as e: pass except Exception as e: @@ -1191,6 +1269,40 @@ def hf_test_completion_tgi(): # except Exception as e: # pytest.fail(f"Error occurred: {e}") # hf_test_completion_none_task() + + +def mock_post(url, data=None, json=None, headers=None): + mock_response = MagicMock() + mock_response.status_code = 200 + mock_response.headers = {"Content-Type": "application/json"} + mock_response.json.return_value = [ + [ + {"label": "LABEL_0", "score": 0.9990691542625427}, + {"label": "LABEL_1", "score": 0.0009308889275416732}, + ] + ] + return mock_response + + +def test_hf_classifier_task(): + try: + with patch("requests.post", side_effect=mock_post): + litellm.set_verbose = True + user_message = "I like you. I love you" + messages = [{"content": user_message, "role": "user"}] + response = completion( + model="huggingface/text-classification/shahrukhx01/question-vs-statement-classifier", + messages=messages, + ) + print(f"response: {response}") + assert isinstance(response, litellm.ModelResponse) + assert isinstance(response.choices[0], litellm.Choices) + assert response.choices[0].message.content is not None + assert isinstance(response.choices[0].message.content, str) + except Exception as e: + pytest.fail(f"Error occurred: {str(e)}") + + ########################### End of Hugging Face Tests ############################################## # def test_completion_hf_api(): # # failing on circle ci commenting out diff --git a/litellm/tests/test_get_optional_params_functions_not_supported.py b/litellm/tests/test_get_optional_params_functions_not_supported.py index 2abfbc41f..c4580adba 100644 --- a/litellm/tests/test_get_optional_params_functions_not_supported.py +++ b/litellm/tests/test_get_optional_params_functions_not_supported.py @@ -3,7 +3,27 @@ from litellm import get_optional_params litellm.add_function_to_prompt = True optional_params = get_optional_params( - tools= [{'type': 'function', 'function': {'description': 'Get the current weather in a given location', 'name': 'get_current_weather', 'parameters': {'type': 'object', 'properties': {'location': {'type': 'string', 'description': 'The city and state, e.g. San Francisco, CA'}, 'unit': {'type': 'string', 'enum': ['celsius', 'fahrenheit']}}, 'required': ['location']}}}], - tool_choice= 'auto', + model="", + tools=[ + { + "type": "function", + "function": { + "description": "Get the current weather in a given location", + "name": "get_current_weather", + "parameters": { + "type": "object", + "properties": { + "location": { + "type": "string", + "description": "The city and state, e.g. San Francisco, CA", + }, + "unit": {"type": "string", "enum": ["celsius", "fahrenheit"]}, + }, + "required": ["location"], + }, + }, + } + ], + tool_choice="auto", ) -assert optional_params is not None \ No newline at end of file +assert optional_params is not None diff --git a/litellm/tests/test_optional_params.py b/litellm/tests/test_optional_params.py index fac63ed67..5c33cfa0e 100644 --- a/litellm/tests/test_optional_params.py +++ b/litellm/tests/test_optional_params.py @@ -86,6 +86,7 @@ def test_azure_optional_params_embeddings(): def test_azure_gpt_optional_params_gpt_vision(): # for OpenAI, Azure all extra params need to get passed as extra_body to OpenAI python. We assert we actually set extra_body here optional_params = litellm.utils.get_optional_params( + model="", user="John", custom_llm_provider="azure", max_tokens=10, @@ -125,6 +126,7 @@ def test_azure_gpt_optional_params_gpt_vision(): def test_azure_gpt_optional_params_gpt_vision_with_extra_body(): # if user passes extra_body, we should not over write it, we should pass it along to OpenAI python optional_params = litellm.utils.get_optional_params( + model="", user="John", custom_llm_provider="azure", max_tokens=10, @@ -167,6 +169,7 @@ def test_azure_gpt_optional_params_gpt_vision_with_extra_body(): def test_openai_extra_headers(): optional_params = litellm.utils.get_optional_params( + model="", user="John", custom_llm_provider="openai", max_tokens=10, diff --git a/litellm/utils.py b/litellm/utils.py index 0babe2f0b..09a7851dc 100644 --- a/litellm/utils.py +++ b/litellm/utils.py @@ -4871,6 +4871,7 @@ def get_optional_params_embeddings( def get_optional_params( # use the openai defaults # https://platform.openai.com/docs/api-reference/chat/create + model: str, functions=None, function_call=None, temperature=None, @@ -4884,7 +4885,6 @@ def get_optional_params( frequency_penalty=None, logit_bias=None, user=None, - model=None, custom_llm_provider="", response_format=None, seed=None, @@ -4913,7 +4913,7 @@ def get_optional_params( passed_params[k] = v - optional_params = {} + optional_params: Dict = {} common_auth_dict = litellm.common_cloud_provider_auth_params if custom_llm_provider in common_auth_dict["providers"]: @@ -5187,41 +5187,9 @@ def get_optional_params( model=model, custom_llm_provider=custom_llm_provider ) _check_valid_arg(supported_params=supported_params) - # temperature, top_p, n, stream, stop, max_tokens, n, presence_penalty default to None - if temperature is not None: - if temperature == 0.0 or temperature == 0: - # hugging face exception raised when temp==0 - # Failed: Error occurred: HuggingfaceException - Input validation error: `temperature` must be strictly positive - temperature = 0.01 - optional_params["temperature"] = temperature - if top_p is not None: - optional_params["top_p"] = top_p - if n is not None: - optional_params["best_of"] = n - optional_params["do_sample"] = ( - True # Need to sample if you want best of for hf inference endpoints - ) - if stream is not None: - optional_params["stream"] = stream - if stop is not None: - optional_params["stop"] = stop - if max_tokens is not None: - # HF TGI raises the following exception when max_new_tokens==0 - # Failed: Error occurred: HuggingfaceException - Input validation error: `max_new_tokens` must be strictly positive - if max_tokens == 0: - max_tokens = 1 - optional_params["max_new_tokens"] = max_tokens - if n is not None: - optional_params["best_of"] = n - if presence_penalty is not None: - optional_params["repetition_penalty"] = presence_penalty - if "echo" in passed_params: - # https://huggingface.co/docs/huggingface_hub/main/en/package_reference/inference_client#huggingface_hub.InferenceClient.text_generation.decoder_input_details - # Return the decoder input token logprobs and ids. You must set details=True as well for it to be taken into account. Defaults to False - optional_params["decoder_input_details"] = special_params["echo"] - passed_params.pop( - "echo", None - ) # since we handle translating echo, we should not send it to TGI request + optional_params = litellm.HuggingfaceConfig().map_openai_params( + non_default_params=non_default_params, optional_params=optional_params + ) elif custom_llm_provider == "together_ai": ## check if unsupported param passed in supported_params = get_supported_openai_params( @@ -6181,7 +6149,7 @@ def get_supported_openai_params(model: str, custom_llm_provider: str): "seed", ] elif custom_llm_provider == "huggingface": - return ["stream", "temperature", "max_tokens", "top_p", "stop", "n"] + return litellm.HuggingfaceConfig().get_supported_openai_params() elif custom_llm_provider == "together_ai": return [ "stream",