diff --git a/litellm/__pycache__/main.cpython-311.pyc b/litellm/__pycache__/main.cpython-311.pyc
index 63d8dc73e..7d1b0da0d 100644
Binary files a/litellm/__pycache__/main.cpython-311.pyc and b/litellm/__pycache__/main.cpython-311.pyc differ
diff --git a/litellm/__pycache__/utils.cpython-311.pyc b/litellm/__pycache__/utils.cpython-311.pyc
index e12757bfe..199316272 100644
Binary files a/litellm/__pycache__/utils.cpython-311.pyc and b/litellm/__pycache__/utils.cpython-311.pyc differ
diff --git a/litellm/llms/ai21.py b/litellm/llms/ai21.py
index f3f4a4342..17d5c9bd9 100644
--- a/litellm/llms/ai21.py
+++ b/litellm/llms/ai21.py
@@ -90,7 +90,8 @@ def completion(
         else:
             try:
                 model_response["choices"][0]["message"]["content"] = completion_response["completions"][0]["data"]["text"]
-            except:
+                model_response.choices[0].finish_reason = completion_response["completions"][0]["finishReason"]["reason"]
+            except Exception as e:
                 raise AI21Error(message=json.dumps(completion_response), status_code=response.status_code)
 
         ## CALCULATING USAGE - baseten charges on time, not tokens - have some mapping of cost here.
diff --git a/litellm/llms/anthropic.py b/litellm/llms/anthropic.py
index 021ec4a73..e1634afe0 100644
--- a/litellm/llms/anthropic.py
+++ b/litellm/llms/anthropic.py
@@ -114,6 +114,7 @@ def completion(
             model_response["choices"][0]["message"]["content"] = completion_response[
                 "completion"
             ]
+            model_response.choices[0].finish_reason = completion_response["stop_reason"]
 
         ## CALCULATING USAGE
         prompt_tokens = len(
diff --git a/litellm/llms/huggingface_restapi.py b/litellm/llms/huggingface_restapi.py
index e2fccb569..1160e6d8d 100644
--- a/litellm/llms/huggingface_restapi.py
+++ b/litellm/llms/huggingface_restapi.py
@@ -153,9 +153,10 @@ def completion(
         elif task == "text-generation-inference":
             model_response["choices"][0]["message"][
                 "content"
-            ] = completion_response[0]["generated_text"]
-            ## GETTING LOGPROBS
+            ] = completion_response[0]["generated_text"]
+            ## GETTING LOGPROBS + FINISH REASON
             if "details" in completion_response[0] and "tokens" in completion_response[0]["details"]:
+                model_response.choices[0].finish_reason = completion_response[0]["details"]["finish_reason"]
                 sum_logprob = 0
                 for token in completion_response[0]["details"]["tokens"]:
                     sum_logprob += token["logprob"]
diff --git a/litellm/llms/together_ai.py b/litellm/llms/together_ai.py
index 4f75e6e43..47d6ab677 100644
--- a/litellm/llms/together_ai.py
+++ b/litellm/llms/together_ai.py
@@ -104,14 +104,17 @@ def completion(
                 message=json.dumps(completion_response["output"]), status_code=response.status_code
             )
-        completion_response = completion_response["output"]["choices"][0]["text"]
+        print(completion_response)
+        completion_text = completion_response["output"]["choices"][0]["text"]
 
         ## CALCULATING USAGE - baseten charges on time, not tokens - have some mapping of cost here.
         prompt_tokens = len(encoding.encode(prompt))
         completion_tokens = len(
-            encoding.encode(completion_response)
+            encoding.encode(completion_text)
         )
-        model_response["choices"][0]["message"]["content"] = completion_response
+        model_response["choices"][0]["message"]["content"] = completion_text
+        if "finish_reason" in completion_response["output"]["choices"][0]:
+            model_response.choices[0].finish_reason = completion_response["output"]["choices"][0]["finish_reason"]
         model_response["created"] = time.time()
         model_response["model"] = model
         model_response["usage"] = {
diff --git a/litellm/main.py b/litellm/main.py
index a7d9d627b..46129c7be 100644
--- a/litellm/main.py
+++ b/litellm/main.py
@@ -529,6 +529,8 @@ def completion(
             completion_tokens = len(encoding.encode(completion_response))
             ## RESPONSE OBJECT
             model_response["choices"][0]["message"]["content"] = completion_response
+            if response[0].finish_reason:
+                model_response.choices[0].finish_reason = response[0].finish_reason
             model_response["created"] = time.time()
             model_response["model"] = model
             model_response["usage"] = {
diff --git a/litellm/tests/test_completion.py b/litellm/tests/test_completion.py
index 6be7f24d3..934354c2c 100644
--- a/litellm/tests/test_completion.py
+++ b/litellm/tests/test_completion.py
@@ -49,7 +49,7 @@ def test_completion_claude():
         print(response)
     except Exception as e:
         pytest.fail(f"Error occurred: {e}")
-
+# test_completion_claude()
 # aleph alpha
 # def test_completion_aleph_alpha():
 #     try:
@@ -119,8 +119,8 @@ def test_completion_claude_stream():
 #     try:
 #         user_message = "write some code to find the sum of two numbers"
 #         messages = [{ "content": user_message,"role": "user"}]
-#         api_base = "https://wyh9bqfgj2r1klv5.us-east-1.aws.endpoints.huggingface.cloud"
-#         response = completion(model="facebook/blenderbot-400M-distill", messages=messages, custom_llm_provider="huggingface", task="conversational", api_base=api_base, logger_fn=logger_fn)
+#         api_base = "https://ecd4sb5n09bo4ei2.us-east-1.aws.endpoints.huggingface.cloud"
+#         response = completion(model="togethercomputer/LLaMA-2-7B-32K", messages=messages, custom_llm_provider="huggingface", api_base=api_base, logger_fn=logger_fn)
 #         # Add any assertions here to check the response
 #         print(response)
 #     except Exception as e:
@@ -141,26 +141,26 @@ def test_completion_claude_stream():
 #         pytest.fail(f"Error occurred: {e}")
 
 
-# def test_completion_cohere(): # commenting for now as the cohere endpoint is being flaky
-#     try:
-#         response = completion(
-#             model="command-nightly",
-#             messages=messages,
-#             max_tokens=100,
-#             logit_bias={40: 10},
-#         )
-#         # Add any assertions here to check the response
-#         print(response)
-#         response_str = response["choices"][0]["message"]["content"]
-#         print(f"str response{response_str}")
-#         response_str_2 = response.choices[0].message.content
-#         if type(response_str) != str:
-#             pytest.fail(f"Error occurred: {e}")
-#         if type(response_str_2) != str:
-#             pytest.fail(f"Error occurred: {e}")
-#     except Exception as e:
-#         pytest.fail(f"Error occurred: {e}")
-## 
+def test_completion_cohere(): # commenting for now as the cohere endpoint is being flaky
+    try:
+        response = completion(
+            model="command-nightly",
+            messages=messages,
+            max_tokens=100,
+            logit_bias={40: 10},
+            logger_fn=logger_fn
+        )
+        # Add any assertions here to check the response
+        print(response)
+        response_str = response["choices"][0]["message"]["content"]
+        print(f"str response{response_str}")
+        response_str_2 = response.choices[0].message.content
+        if type(response_str) != str:
+            pytest.fail(f"Error occurred: {e}")
+        if type(response_str_2) != str:
+            pytest.fail(f"Error occurred: {e}")
+    except Exception as e:
+        pytest.fail(f"Error occurred: {e}")
 
 def test_completion_cohere_stream():
     try:
@@ -750,15 +750,16 @@ def test_completion_with_fallbacks():
 
 #### Test A121 ###################
 
-# def test_completion_ai21():
-#     model_name = "j2-light"
-#     try:
-#         response = completion(model=model_name, messages=messages)
-#         # Add any assertions here to check the response
-#         print(response)
-#     except Exception as e:
-#         pytest.fail(f"Error occurred: {e}")
+def test_completion_ai21():
+    model_name = "j2-light"
+    try:
+        response = completion(model=model_name, messages=messages)
+        # Add any assertions here to check the response
+        print(response)
+    except Exception as e:
+        pytest.fail(f"Error occurred: {e}")
 
+# test_completion_ai21()
 # test config file with completion #
 # def test_completion_openai_config():
 #     try:
diff --git a/litellm/utils.py b/litellm/utils.py
index fe3efe06b..c5f35cfde 100644
--- a/litellm/utils.py
+++ b/litellm/utils.py
@@ -830,7 +830,23 @@ def get_optional_params(  # use the openai defaults
             optional_params["top_k"] = top_k
         if stop != None:
             optional_params["stop_sequences"] = stop
-
+    elif custom_llm_provider == "huggingface":
+        if temperature != 1:
+            optional_params["temperature"] = temperature
+        if top_p != 1:
+            optional_params["top_p"] = top_p
+        if n != 1:
+            optional_params["n"] = n
+        if stream:
+            optional_params["stream"] = stream
+        if stop != None:
+            optional_params["stop"] = stop
+        if max_tokens != float("inf"):
+            optional_params["max_new_tokens"] = max_tokens
+        if presence_penalty != 0:
+            optional_params["repetition_penalty"] = presence_penalty
+        optional_params["details"] = True
+        optional_params["task"] = task
     elif custom_llm_provider == "together_ai" or ("togethercomputer" in model):
         if stream:
             optional_params["stream_tokens"] = stream
@@ -867,23 +883,6 @@ def get_optional_params(  # use the openai defaults
             optional_params["num_beams"] = num_beams
         if max_tokens != float("inf"):
             optional_params["max_new_tokens"] = max_tokens
-    elif custom_llm_provider == "huggingface":
-        if temperature != 1:
-            optional_params["temperature"] = temperature
-        if top_p != 1:
-            optional_params["top_p"] = top_p
-        if n != 1:
-            optional_params["n"] = n
-        if stream:
-            optional_params["stream"] = stream
-        if stop != None:
-            optional_params["stop"] = stop
-        if max_tokens != float("inf"):
-            optional_params["max_new_tokens"] = max_tokens
-        if presence_penalty != 0:
-            optional_params["repetition_penalty"] = presence_penalty
-        optional_params["details"] = True
-        optional_params["task"] = task
     elif custom_llm_provider == "sagemaker":
         if "llama-2" in model:
             # llama-2 models on sagemaker support the following args
diff --git a/pyproject.toml b/pyproject.toml
index 5e247bbf8..6585ca5ba 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "litellm"
-version = "0.1.620"
+version = "0.1.621"
 description = "Library to easily interface with LLM API providers"
 authors = ["BerriAI"]
 license = "MIT License"