diff --git a/litellm/__pycache__/utils.cpython-311.pyc b/litellm/__pycache__/utils.cpython-311.pyc
index 8734e5025..486063acb 100644
Binary files a/litellm/__pycache__/utils.cpython-311.pyc and b/litellm/__pycache__/utils.cpython-311.pyc differ
diff --git a/litellm/tests/test_config.py b/litellm/tests/test_config.py
index b2b48cfb3..73e719cad 100644
--- a/litellm/tests/test_config.py
+++ b/litellm/tests/test_config.py
@@ -88,4 +88,4 @@ def test_config_context_adapt_to_prompt():
         print(f"Exception: {e}")
         pytest.fail(f"An exception occurred: {e}")
 
-# test_config_context_adapt_to_prompt()
\ No newline at end of file
+test_config_context_adapt_to_prompt()
\ No newline at end of file
diff --git a/litellm/utils.py b/litellm/utils.py
index 4eff73973..5e96d332e 100644
--- a/litellm/utils.py
+++ b/litellm/utils.py
@@ -2802,7 +2802,13 @@ def completion_with_config(config: Union[dict, str], **kwargs):
     fallback_models = config.get("default_fallback_models", None)
     available_models = config.get("available_models", None)
     adapt_to_prompt_size = config.get("adapt_to_prompt_size", False)
-    start_time = time.time()
+    trim_messages_flag = config.get("trim_messages", False)
+    prompt_larger_than_model = False
+    max_model = model
+    try:
+        max_tokens = litellm.get_max_tokens(model)["max_tokens"]
+    except:
+        max_tokens = 2048 # assume curr model's max window is 2048 tokens
     if adapt_to_prompt_size:
         ## Pick model based on token window
         prompt_tokens = litellm.token_counter(model="gpt-3.5-turbo", text="".join(message["content"] for message in messages))
@@ -2811,14 +2817,22 @@ def completion_with_config(config: Union[dict, str], **kwargs):
         except:
             curr_max_tokens = 2048
         if curr_max_tokens < prompt_tokens:
+            prompt_larger_than_model = True
             for available_model in available_models:
                 try:
                     curr_max_tokens = litellm.get_max_tokens(available_model)["max_tokens"]
+                    if curr_max_tokens > max_tokens:
+                        max_tokens = curr_max_tokens
+                        max_model = available_model
                     if curr_max_tokens > prompt_tokens:
                         model = available_model
+                        prompt_larger_than_model = False
                 except:
                     continue
-    end_time = time.time()
+    if prompt_larger_than_model:
+        messages = trim_messages(messages=messages, model=max_model)
+        kwargs["messages"] = messages
+
     kwargs["model"] = model
     try:
         if model in models_with_config:
@@ -3052,8 +3066,7 @@ def shorten_message_to_fit_limit(
 # Credits for this code go to Killian Lucas
 def trim_messages(
     messages,
-    model = None,
-    system_message = None, # str of user system message
+    model: Optional[str] = None,
     trim_ratio: float = 0.75,
     return_response_tokens: bool = False,
     max_tokens = None
@@ -3086,6 +3099,11 @@ def trim_messages(
         # do nothing, just return messages
         return
 
+    system_message = ""
+    for message in messages:
+        if message["role"] == "system":
+            system_message += message["content"]
+
     current_tokens = token_counter(model=model, messages=messages)
 
     # Do nothing if current tokens under messages
diff --git a/pyproject.toml b/pyproject.toml
index fd729ed5d..eea1fc43e 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "litellm"
-version = "0.1.731"
+version = "0.1.732"
 description = "Library to easily interface with LLM API providers"
 authors = ["BerriAI"]
 license = "MIT License"
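
The sketch below shows how the new config keys are meant to be exercised end to end. It is not part of the diff: the model names, the message payload, and the assumption that `completion_with_config` is importable from the top-level `litellm` package (it is defined in `litellm/utils.py`) are illustrative, and running it requires valid provider API keys. The config keys themselves (`adapt_to_prompt_size`, `available_models`, `trim_messages`, `default_fallback_models`) are the ones read in the diff above.

```python
# Hedged usage sketch (not part of the diff). Exercises the new
# adapt_to_prompt_size / trim fallback path in completion_with_config.
# Model names and message content are illustrative assumptions.
import litellm  # assumes completion_with_config is re-exported; otherwise use litellm.utils

config = {
    "adapt_to_prompt_size": True,          # pick a model whose context window fits the prompt
    "available_models": ["gpt-3.5-turbo", "gpt-3.5-turbo-16k"],
    "trim_messages": True,                 # read into the new trim_messages_flag
    "default_fallback_models": ["gpt-3.5-turbo"],
}

long_messages = [
    {"role": "system", "content": "You are a concise assistant."},
    {"role": "user", "content": "some very long prompt " * 4000},
]

# If the prompt is larger than every window in available_models, the new code
# trims the messages against the largest model it saw (max_model) before sending.
response = litellm.completion_with_config(
    config=config,
    model="gpt-3.5-turbo",
    messages=long_messages,
)
print(response)
```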
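
The heart of the `completion_with_config` change is the fallback decision: while scanning `available_models`, remember the candidate with the largest context window (`max_model`), and if no candidate fits the prompt, trim against that largest model. Below is a distilled, standalone mirror of that decision for readability; `window_for` is a hypothetical stand-in for `litellm.get_max_tokens(m)["max_tokens"]`, and the function is a sketch, not part of litellm's API.

```python
from typing import Callable, List, Tuple

def pick_model_or_trim(
    model: str,
    prompt_tokens: int,
    available_models: List[str],
    window_for: Callable[[str], int],  # stand-in for litellm.get_max_tokens(m)["max_tokens"]
) -> Tuple[str, bool, str]:
    """Mirror of the diff's logic: returns (model_to_use, needs_trim, largest_model_seen)."""
    prompt_larger_than_model = False
    max_model = model
    try:
        max_tokens = window_for(model)
    except Exception:
        max_tokens = 2048  # same default window the diff assumes for unknown models

    try:
        curr_max_tokens = window_for(model)
    except Exception:
        curr_max_tokens = 2048

    if curr_max_tokens < prompt_tokens:
        prompt_larger_than_model = True
        for candidate in available_models:
            try:
                curr_max_tokens = window_for(candidate)
                if curr_max_tokens > max_tokens:
                    max_tokens = curr_max_tokens
                    max_model = candidate   # largest window seen so far
                if curr_max_tokens > prompt_tokens:
                    model = candidate       # a model that fits the prompt
                    prompt_larger_than_model = False
            except Exception:
                continue

    # When needs_trim is True, the diff calls trim_messages(messages=..., model=max_model)
    # and overwrites kwargs["messages"] before dispatching the request.
    return model, prompt_larger_than_model, max_model
```

Note that, as in the diff, the loop does not break on the first fitting candidate; the last model whose window exceeds the prompt wins.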
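
`trim_messages` also changes shape: the `system_message` parameter is dropped, and the system content is now gathered from the `messages` list itself by concatenating every `"system"` entry. A small sketch of what that means for callers follows; the message payloads are made up, and how the collected `system_message` is used further down in `trim_messages` is not shown in this hunk.

```python
# Sketch of the new system-message handling in trim_messages.
# All "system" entries are concatenated; callers no longer pass system_message=...
from litellm.utils import trim_messages  # defined in litellm/utils.py per the diff

messages = [
    {"role": "system", "content": "Be brief. "},
    {"role": "system", "content": "Answer in English."},
    {"role": "user", "content": "Summarize this document ... " * 3000},
]

# Equivalent of the lines added in the diff:
system_message = ""
for message in messages:
    if message["role"] == "system":
        system_message += message["content"]
print(system_message)  # "Be brief. Answer in English."

# New-style call: only the messages and an optional model name.
trimmed = trim_messages(messages, model="gpt-3.5-turbo")
```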