diff --git a/litellm/__pycache__/__init__.cpython-311.pyc b/litellm/__pycache__/__init__.cpython-311.pyc
index 709d54116f..f98964ab77 100644
Binary files a/litellm/__pycache__/__init__.cpython-311.pyc and b/litellm/__pycache__/__init__.cpython-311.pyc differ
diff --git a/litellm/__pycache__/utils.cpython-311.pyc b/litellm/__pycache__/utils.cpython-311.pyc
index bd50ebc532..8b3ac9683f 100644
Binary files a/litellm/__pycache__/utils.cpython-311.pyc and b/litellm/__pycache__/utils.cpython-311.pyc differ
diff --git a/litellm/llms/huggingface_restapi.py b/litellm/llms/huggingface_restapi.py
index 0d61a8ad01..cf49635e83 100644
--- a/litellm/llms/huggingface_restapi.py
+++ b/litellm/llms/huggingface_restapi.py
@@ -5,7 +5,7 @@ from enum import Enum
 import requests
 import time
 from typing import Callable
-from litellm.utils import ModelResponse
+from litellm.utils import ModelResponse, Choices, Message
 from typing import Optional
 from .prompt_templates.factory import prompt_factory, custom_prompt
 
@@ -173,16 +173,28 @@ def completion(
                     "content"
                 ] = completion_response["generated_text"]
             elif task == "text-generation-inference":
-                model_response["choices"][0]["message"][
-                    "content"
-                ] = completion_response[0]["generated_text"]
-                ## GETTING LOGPROBS + FINISH REASON
-                if "details" in completion_response[0] and "tokens" in completion_response[0]["details"]:
-                    model_response.choices[0].finish_reason = completion_response[0]["details"]["finish_reason"]
-                    sum_logprob = 0
-                    for token in completion_response[0]["details"]["tokens"]:
-                        sum_logprob += token["logprob"]
-                    model_response["choices"][0]["message"]["logprobs"] = sum_logprob
+                if "best_of" in optional_params and optional_params["best_of"] > 1:
+                    if "details" in completion_response[0] and "best_of_sequences" in completion_response[0]["details"]:
+                        choices_list = []
+                        for idx, item in enumerate(completion_response[0]["details"]["best_of_sequences"]):
+                            sum_logprob = 0
+                            for token in item["tokens"]:
+                                sum_logprob += token["logprob"]
+                            message_obj = Message(content=item["generated_text"], logprobs=sum_logprob)
+                            choice_obj = Choices(finish_reason=item["finish_reason"], index=idx, message=message_obj)
+                            choices_list.append(choice_obj)
+                        model_response["choices"] = choices_list
+                else:
+                    model_response["choices"][0]["message"][
+                        "content"
+                    ] = completion_response[0]["generated_text"]
+                    ## GETTING LOGPROBS + FINISH REASON
+                    if "details" in completion_response[0] and "tokens" in completion_response[0]["details"]:
+                        model_response.choices[0].finish_reason = completion_response[0]["details"]["finish_reason"]
+                        sum_logprob = 0
+                        for token in completion_response[0]["details"]["tokens"]:
+                            sum_logprob += token["logprob"]
+                        model_response["choices"][0]["message"]["logprobs"] = sum_logprob
             else:
                 model_response["choices"][0]["message"]["content"] = completion_response[0]["generated_text"]
         ## CALCULATING USAGE
diff --git a/litellm/tests/test_completion.py b/litellm/tests/test_completion.py
index 10b527dc05..0e995cd9c3 100644
--- a/litellm/tests/test_completion.py
+++ b/litellm/tests/test_completion.py
@@ -135,6 +135,22 @@ def test_completion_with_litellm_call_id():
 
 # test_completion_hf_api()
 
+# def test_completion_hf_api_best_of():
+#     # failing on circle ci commenting out
+#     try:
+#         user_message = "write some code to find the sum of two numbers"
+#         messages = [{ "content": user_message,"role": "user"}]
+#         api_base = "https://a8l9e3ucxinyl3oj.us-east-1.aws.endpoints.huggingface.cloud"
+#         response = completion(model="huggingface/meta-llama/Llama-2-7b-chat-hf", messages=messages, api_base=api_base, n=2)
+#         # Add any assertions here to check the response
+#         print(response)
+#     except Exception as e:
+#         if "loading" in str(e):
+#             pass
+#         pytest.fail(f"Error occurred: {e}")
+
+# test_completion_hf_api_best_of()
+
 # def test_completion_hf_deployed_api():
 #     try:
 #         user_message = "There's a llama in my garden 😱 What should I do?"
diff --git a/litellm/utils.py b/litellm/utils.py
index ec525f5367..17bf38f2de 100644
--- a/litellm/utils.py
+++ b/litellm/utils.py
@@ -902,7 +902,8 @@ def get_optional_params(  # use the openai defaults
         if top_p != 1:
            optional_params["top_p"] = top_p
         if n != 1:
-            optional_params["n"] = n
+            optional_params["best_of"] = n
+            optional_params["do_sample"] = True # need to sample if you want best of for hf inference endpoints
         if stream:
            optional_params["stream"] = stream
         if stop != None:
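
Usage sketch (not part of the patch): a minimal example of how the new `n` handling behaves end to end, assuming litellm is installed and a TGI-backed Hugging Face Inference Endpoint is reachable at a placeholder URL. `get_optional_params` translates `n` into `best_of` plus `do_sample=True`, and the handler in huggingface_restapi.py turns each sequence under `details.best_of_sequences` into its own choice, carrying a `finish_reason` and a summed-token `logprobs` value.

    import litellm

    # Placeholder endpoint; substitute your own TGI-backed Inference Endpoint URL.
    api_base = "https://<your-endpoint>.endpoints.huggingface.cloud"

    response = litellm.completion(
        model="huggingface/meta-llama/Llama-2-7b-chat-hf",
        messages=[{"role": "user", "content": "write some code to find the sum of two numbers"}],
        api_base=api_base,
        n=2,  # mapped to best_of=2 and do_sample=True for Hugging Face endpoints
    )

    # With best_of > 1 and TGI returning "details", each alternate sequence
    # shows up as its own entry in response["choices"].
    for choice in response["choices"]:
        print(choice.index, choice.finish_reason, choice["message"]["content"])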