diff --git a/litellm/__pycache__/main.cpython-311.pyc b/litellm/__pycache__/main.cpython-311.pyc
index b78bc9b00..9a3c21972 100644
Binary files a/litellm/__pycache__/main.cpython-311.pyc and b/litellm/__pycache__/main.cpython-311.pyc differ
diff --git a/litellm/__pycache__/utils.cpython-311.pyc b/litellm/__pycache__/utils.cpython-311.pyc
index e95fb8744..f71c34263 100644
Binary files a/litellm/__pycache__/utils.cpython-311.pyc and b/litellm/__pycache__/utils.cpython-311.pyc differ
diff --git a/litellm/main.py b/litellm/main.py
index 06d938ac8..7d39afab4 100644
--- a/litellm/main.py
+++ b/litellm/main.py
@@ -132,6 +132,7 @@ def completion(
     # model specific optional params
     top_k=40,# used by text-bison only
     task: Optional[str]="text-generation-inference", # used by huggingface inference endpoints
+    return_full_text: bool = False, # used by huggingface TGI
     remove_input: bool = True, # used by nlp cloud models - prevents input text from being returned as part of output
     request_timeout=0, # unused var for old version of OpenAI API
     fallbacks=[],
@@ -181,7 +182,8 @@ def completion(
             custom_llm_provider=custom_llm_provider,
             top_k=top_k,
             task=task,
-            remove_input=remove_input
+            remove_input=remove_input,
+            return_full_text=return_full_text
         )
         # For logging - save the values of the litellm-specific params passed in
         litellm_params = get_litellm_params(
diff --git a/litellm/tests/test_completion.py b/litellm/tests/test_completion.py
index e46f6dbdf..26522a354 100644
--- a/litellm/tests/test_completion.py
+++ b/litellm/tests/test_completion.py
@@ -92,25 +92,6 @@ def test_completion_with_litellm_call_id():
         pytest.fail(f"Error occurred: {e}")
 
 
-def test_completion_claude_stream():
-    try:
-        messages = [
-            {"role": "system", "content": "You are a helpful assistant."},
-            {
-                "role": "user",
-                "content": "how does a court case get to the Supreme Court?",
-            },
-        ]
-        response = completion(model="claude-2", messages=messages, stream=True)
-        # Add any assertions here to check the response
-        for chunk in response:
-            print(chunk["choices"][0]["delta"]) # same as openai format
-            print(chunk["choices"][0]["finish_reason"])
-            print(chunk["choices"][0]["delta"]["content"])
-    except Exception as e:
-        pytest.fail(f"Error occurred: {e}")
-# test_completion_claude_stream()
-
 def test_completion_nlp_cloud():
     try:
         messages = [
@@ -125,26 +106,6 @@ def test_completion_nlp_cloud():
     except Exception as e:
         pytest.fail(f"Error occurred: {e}")
 
-def test_completion_nlp_cloud_streaming():
-    try:
-        messages = [
-            {"role": "system", "content": "You are a helpful assistant."},
-            {
-                "role": "user",
-                "content": "how does a court case get to the Supreme Court?",
-            },
-        ]
-        response = completion(model="dolphin", messages=messages, stream=True, logger_fn=logger_fn)
-        # Add any assertions here to check the response
-        for chunk in response:
-            print(chunk["choices"][0]["delta"]["content"]) # same as openai format
-            print(chunk["choices"][0]["finish_reason"])
-            print(chunk["choices"][0]["delta"]["content"])
-    except Exception as e:
-        pytest.fail(f"Error occurred: {e}")
-# test_completion_nlp_cloud_streaming()
-
-# test_completion_nlp_cloud_streaming()
 # def test_completion_hf_api():
 #     try:
 #         user_message = "write some code to find the sum of two numbers"
@@ -327,69 +288,6 @@ def test_completion_openai_with_more_optional_params():
         pytest.fail(f"Error occurred: {e}")
 
 
-def test_completion_openai_with_stream():
-    try:
-        response = completion(
-            model="gpt-3.5-turbo",
-            messages=messages,
-            temperature=0.5,
-            top_p=0.1,
-            n=2,
-            max_tokens=150,
-            presence_penalty=0.5,
-            stream=True,
-            frequency_penalty=-0.5,
-            logit_bias={27000: 5},
-            user="ishaan_dev@berri.ai",
-        )
-        # Add any assertions here to check the response
-        print(response)
-        for chunk in response:
-            print(chunk)
-            if chunk["choices"][0]["finish_reason"] == "stop" or chunk["choices"][0]["finish_reason"] == "length":
-                break
-            print(chunk["choices"][0]["finish_reason"])
-            print(chunk["choices"][0]["delta"]["content"])
-    except Exception as e:
-        pytest.fail(f"Error occurred: {e}")
-# test_completion_openai_with_stream()
-
-def test_completion_openai_with_functions():
-    function1 = [
-        {
-            "name": "get_current_weather",
-            "description": "Get the current weather in a given location",
-            "parameters": {
-                "type": "object",
-                "properties": {
-                    "location": {
-                        "type": "string",
-                        "description": "The city and state, e.g. San Francisco, CA",
-                    },
-                    "unit": {"type": "string", "enum": ["celsius", "fahrenheit"]},
-                },
-                "required": ["location"],
-            },
-        }
-    ]
-    try:
-        response = completion(
-            model="gpt-3.5-turbo", messages=messages, functions=function1, stream=True
-        )
-        # Add any assertions here to check the response
-        print(response)
-        for chunk in response:
-            print(chunk)
-            if chunk["choices"][0]["finish_reason"] == "stop":
-                break
-            print(chunk["choices"][0]["finish_reason"])
-            print(chunk["choices"][0]["delta"]["content"])
-
-    except Exception as e:
-        pytest.fail(f"Error occurred: {e}")
-# test_completion_openai_with_functions()
-
-
 # def test_completion_openai_azure_with_functions():
 #     function1 = [
 #         {
@@ -544,20 +442,6 @@ def test_completion_replicate_vicuna():
     except Exception as e:
         pytest.fail(f"Error occurred: {e}")
 
-# test_completion_replicate_vicuna()
-
-def test_completion_replicate_llama_stream():
-    model_name = "replicate/llama-2-70b-chat:2c1608e18606fad2812020dc541930f2d0495ce32eee50074220b87300bc16e1"
-    try:
-        response = completion(model=model_name, messages=messages, stream=True)
-        # Add any assertions here to check the response
-        for chunk in response:
-            print(chunk)
-            print(chunk["choices"][0]["delta"]["content"])
-    except Exception as e:
-        pytest.fail(f"Error occurred: {e}")
-# test_completion_replicate_llama_stream()
-
 # def test_completion_replicate_stability_stream():
 #     model_name = "stability-ai/stablelm-tuned-alpha-7b:c49dae362cbaecd2ceabb5bd34fdb68413c4ff775111fea065d259d577757beb"
 #     try:
@@ -653,26 +537,7 @@ def test_completion_bedrock_ai21():
     except Exception as e:
         pytest.fail(f"Error occurred: {e}")
 
-def test_completion_bedrock_ai21_stream():
-    try:
-        litellm.set_verbose = False
-        response = completion(
-            model="bedrock/amazon.titan-tg1-large",
-            messages=[{"role": "user", "content": "Be as verbose as possible and give as many details as possible, how does a court case get to the Supreme Court?"}],
-            temperature=1,
-            max_tokens=4096,
-            stream=True,
-        )
-        # Add any assertions here to check the response
-        print(response)
-        for chunk in response:
-            print(chunk)
-    except Exception as e:
-        pytest.fail(f"Error occurred: {e}")
-# test_completion_bedrock_ai21_stream()
-
-# test_completion_sagemaker()
 
 ######## Test VLLM ########
 # def test_completion_vllm():
 #     try:
diff --git a/litellm/tests/test_streaming.py b/litellm/tests/test_streaming.py
index f51efb55c..b7a82356c 100644
--- a/litellm/tests/test_streaming.py
+++ b/litellm/tests/test_streaming.py
@@ -213,7 +213,32 @@ def test_completion_cohere_stream():
         print(f"completion_response: {complete_response}")
     except Exception as e:
         pytest.fail(f"Error occurred: {e}")
-    
+
+
+def test_completion_bedrock_ai21_stream():
+    try:
+        litellm.set_verbose = False
+        response = completion(
+            model="bedrock/amazon.titan-tg1-large",
+            messages=[{"role": "user", "content": "Be as verbose as possible and give as many details as possible, how does a court case get to the Supreme Court?"}],
+            temperature=1,
+            max_tokens=4096,
+            stream=True,
+        )
+        # Add any assertions here to check the response
+        print(response)
+        complete_response = ""
+        for idx, chunk in enumerate(response):
+            chunk, finished = streaming_format_tests(idx, chunk)
+            if finished:
+                break
+            complete_response += chunk
+        if complete_response.strip() == "":
+            raise Exception("Empty response received")
+    except Exception as e:
+        pytest.fail(f"Error occurred: {e}")
+
+
 # test_completion_cohere_stream()
 
 # test on openai completion call
@@ -301,34 +326,67 @@ def test_together_ai_completion_call_starcoder():
     except:
         print(f"error occurred: {traceback.format_exc()}")
         pass
-# test_together_ai_completion_call_starcoder()
-# test on aleph alpha completion call - commented out as it's expensive to run this on circle ci for every build
-# def test_aleph_alpha_call():
-#     try:
-#         start_time = time.time()
-#         response = completion(
-#             model="luminous-base",
-#             messages=messages,
-#             logger_fn=logger_fn,
-#             stream=True,
-#         )
-#         complete_response = ""
-#         print(f"returned response object: {response}")
-#         for chunk in response:
-#             chunk_time = time.time()
-#             complete_response += (
-#                 chunk["choices"][0]["delta"]["content"]
-#                 if len(chunk["choices"][0]["delta"].keys()) > 0
-#                 else ""
-#             )
-#             if len(complete_response) > 0:
-#                 print(complete_response)
-#             if complete_response == "":
-#                 raise Exception("Empty response received")
-#     except:
-#         print(f"error occurred: {traceback.format_exc()}")
-#         pass
-#### Test Async streaming
+
+def test_completion_nlp_cloud_streaming():
+    try:
+        messages = [
+            {"role": "system", "content": "You are a helpful assistant."},
+            {
+                "role": "user",
+                "content": "how does a court case get to the Supreme Court?",
+            },
+        ]
+        response = completion(model="dolphin", messages=messages, stream=True, logger_fn=logger_fn)
+        # Add any assertions here to check the response
+        complete_response = ""
+        for idx, chunk in enumerate(response):
+            chunk, finished = streaming_format_tests(idx, chunk)
+            if finished:
+                break
+            complete_response += chunk
+        if complete_response == "":
+            raise Exception("Empty response received")
+    except Exception as e:
+        pytest.fail(f"Error occurred: {e}")
+
+
+#### Test Function calling + streaming ####
+
+def test_completion_openai_with_functions():
+    function1 = [
+        {
+            "name": "get_current_weather",
+            "description": "Get the current weather in a given location",
+            "parameters": {
+                "type": "object",
+                "properties": {
+                    "location": {
+                        "type": "string",
+                        "description": "The city and state, e.g. San Francisco, CA",
+                    },
+                    "unit": {"type": "string", "enum": ["celsius", "fahrenheit"]},
+                },
+                "required": ["location"],
+            },
+        }
+    ]
+    try:
+        response = completion(
+            model="gpt-3.5-turbo", messages=messages, functions=function1, stream=True
+        )
+        # Add any assertions here to check the response
+        print(response)
+        for chunk in response:
+            print(chunk)
+            if chunk["choices"][0]["finish_reason"] == "stop":
+                break
+            print(chunk["choices"][0]["finish_reason"])
+            print(chunk["choices"][0]["delta"]["content"])
+    except Exception as e:
+        pytest.fail(f"Error occurred: {e}")
+test_completion_openai_with_functions()
+
+#### Test Async streaming ####
 
 # # test on ai21 completion call
 async def ai21_async_completion_call():
diff --git a/litellm/utils.py b/litellm/utils.py
index c9a9a33f1..5865557da 100644
--- a/litellm/utils.py
+++ b/litellm/utils.py
@@ -828,6 +828,7 @@ def get_optional_params( # use the openai defaults
     model=None,
     custom_llm_provider="",
     top_k=40,
+    return_full_text=False,
     task=None
 ):
     optional_params = {}
@@ -885,6 +886,7 @@ def get_optional_params( # use the openai defaults
             optional_params["max_new_tokens"] = max_tokens
         if presence_penalty != 0:
             optional_params["repetition_penalty"] = presence_penalty
+        optional_params["return_full_text"] = return_full_text
         optional_params["details"] = True
         optional_params["task"] = task
     elif custom_llm_provider == "together_ai" or ("togethercomputer" in model):
@@ -2507,7 +2509,6 @@ class CustomStreamWrapper:
         model_response = ModelResponse(stream=True, model=self.model)
         try:
             # return this for all models
-            print_verbose(f"self.sent_first_chunk: {self.sent_first_chunk}")
             if self.sent_first_chunk == False:
                 model_response.choices[0].delta.role = "assistant"
                 self.sent_first_chunk = True
diff --git a/pyproject.toml b/pyproject.toml
index 32d4bec96..ffd51a9df 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "litellm"
-version = "0.1.677"
+version = "0.1.678"
 description = "Library to easily interface with LLM API providers"
 authors = ["BerriAI"]
 license = "MIT License"
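Usage sketch (illustrative only): the hunks above thread a new return_full_text flag from completion() through get_optional_params() into the Hugging Face text-generation-inference (TGI) request. The snippet below shows how a caller might exercise it. The model id, api_base URL, and prompt are placeholder assumptions, not values taken from this patch; the provider/endpoint arguments follow the usual litellm Hugging Face setup for this era and may differ by installed version. It assumes HUGGINGFACE_API_KEY is set in the environment.

    import litellm

    response = litellm.completion(
        model="bigcode/starcoder",                          # placeholder model id served by a TGI deployment
        messages=[{"role": "user", "content": "Write a one-line hello world in Python."}],
        custom_llm_provider="huggingface",                  # route through the Hugging Face/TGI code path
        api_base="https://my-tgi-endpoint.example.com",     # placeholder inference endpoint URL (assumption)
        task="text-generation-inference",                   # default task, as set in the diff
        return_full_text=False,                             # new flag: ask TGI not to echo the prompt back
    )
    print(response["choices"][0]["message"]["content"])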