From 14a19fc6014a7d63e04c283aedd3fc8b12a901f3 Mon Sep 17 00:00:00 2001
From: xihajun
Date: Sun, 3 Mar 2024 23:43:03 +0000
Subject: [PATCH 01/14] Disable special token restriction for Claude AI

---
 litellm/llms/anthropic.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/litellm/llms/anthropic.py b/litellm/llms/anthropic.py
index 150ae0e07..6bfedc101 100644
--- a/litellm/llms/anthropic.py
+++ b/litellm/llms/anthropic.py
@@ -200,10 +200,10 @@ def completion(
 
         ## CALCULATING USAGE
         prompt_tokens = len(
-            encoding.encode(prompt)
+            encoding.encode(prompt, disallowed_special=())
         )  ##[TODO] use the anthropic tokenizer here
         completion_tokens = len(
-            encoding.encode(model_response["choices"][0]["message"].get("content", ""))
+            encoding.encode(model_response["choices"][0]["message"].get("content", ""), disallowed_special=())
         )  ##[TODO] use the anthropic tokenizer here
 
         model_response["created"] = int(time.time())

From 5c03109b6fe8d59602aa4616fffc67e6705eef98 Mon Sep 17 00:00:00 2001
From: Krrish Dholakia
Date: Tue, 5 Mar 2024 07:39:06 -0800
Subject: [PATCH 02/14] docs(configs.md): add load balancing to proxy config
 docs

---
 docs/my-website/docs/proxy/configs.md | 65 ++++++++++++++++-----------
 litellm/llms/aleph_alpha.py           | 11 +++--
 2 files changed, 47 insertions(+), 29 deletions(-)

diff --git a/docs/my-website/docs/proxy/configs.md b/docs/my-website/docs/proxy/configs.md
index 0a155828b..2b3edfadb 100644
--- a/docs/my-website/docs/proxy/configs.md
+++ b/docs/my-website/docs/proxy/configs.md
@@ -202,7 +202,7 @@ print(response)

## Save Model-specific params (API Base, Keys, Temperature, Max Tokens, Organization, Headers etc.)

You can use the config to save model-specific information like api_base, api_key, temperature, max_tokens, etc.

[**All input params**](https://docs.litellm.ai/docs/completion/input#input-params-1)

@@ -244,6 +244,45 @@ $ litellm --config /path/to/config.yaml
```

## Load Balancing

Use this to call multiple instances of the same model and configure things like [routing strategy](../routing.md#advanced).

```yaml
router_settings:
  routing_strategy: "latency-based-routing" # routes to the fastest deployment in the group

model_list:
  - model_name: zephyr-beta
    litellm_params:
      model: huggingface/HuggingFaceH4/zephyr-7b-beta
      api_base: http://0.0.0.0:8001
  - model_name: zephyr-beta
    litellm_params:
      model: huggingface/HuggingFaceH4/zephyr-7b-beta
      api_base: http://0.0.0.0:8002
  - model_name: zephyr-beta
    litellm_params:
      model: huggingface/HuggingFaceH4/zephyr-7b-beta
      api_base: http://0.0.0.0:8003
  - model_name: gpt-3.5-turbo
    litellm_params:
      model: gpt-3.5-turbo
      api_key: <my-openai-key>
  - model_name: gpt-3.5-turbo-16k
    litellm_params:
      model: gpt-3.5-turbo-16k
      api_key: <my-openai-key>

litellm_settings:
  num_retries: 3 # retry call 3 times on each model_name (e.g. zephyr-beta)
  request_timeout: 10 # raise Timeout error if call takes longer than 10s. Sets litellm.request_timeout
  fallbacks: [{"zephyr-beta": ["gpt-3.5-turbo"]}] # fallback to gpt-3.5-turbo if the call still fails after num_retries
  context_window_fallbacks: [{"zephyr-beta": ["gpt-3.5-turbo-16k"]}, {"gpt-3.5-turbo": ["gpt-3.5-turbo-16k"]}] # fallback to gpt-3.5-turbo-16k on a context window error
  allowed_fails: 3 # cooldown the deployment if it fails 3 calls in a minute
```
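With this config, every request to the shared `model_name` is spread across the three `zephyr-beta` deployments, and the retry/fallback settings apply transparently. A minimal client sketch, assuming the proxy is running locally on port 8000 and the `openai` Python package is installed:

```python
import openai

# Point the standard OpenAI client at the LiteLLM proxy.
client = openai.OpenAI(
    api_key="anything",  # provider keys live in the proxy config; any string works unless a master_key is set
    base_url="http://0.0.0.0:8000",
)

response = client.chat.completions.create(
    model="zephyr-beta",  # the shared model_name; the router picks the fastest deployment
    messages=[{"role": "user", "content": "Hello!"}],
)
print(response.choices[0].message.content)
```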

## Set Azure `base_model` for cost tracking

**Problem**: Azure returns `gpt-4` in the response when `azure/gpt-4-1106-preview` is used. This leads to inaccurate cost tracking.

@@ -512,30 +551,6 @@ curl --location 'http://0.0.0.0:8000/chat/completions' \
```

## Router Settings

Use this to configure things like routing strategy.

```yaml
router_settings:
  routing_strategy: "least-busy"

model_list: # will route requests to the least busy ollama model
  - model_name: ollama-models
    litellm_params:
      model: "ollama/mistral"
      api_base: "http://127.0.0.1:8001"
  - model_name: ollama-models
    litellm_params:
      model: "ollama/codellama"
      api_base: "http://127.0.0.1:8002"
  - model_name: ollama-models
    litellm_params:
      model: "ollama/llama2"
      api_base: "http://127.0.0.1:8003"
```

## Configure DB Pool Limits + Connection Timeouts

```yaml
diff --git a/litellm/llms/aleph_alpha.py b/litellm/llms/aleph_alpha.py
index 7168e7369..3c1bd5dde 100644
--- a/litellm/llms/aleph_alpha.py
+++ b/litellm/llms/aleph_alpha.py
@@ -77,9 +77,9 @@ class AlephAlphaConfig:
 
     - `control_log_additive` (boolean; default value: true): Method of applying control to attention scores.
     """
 
-    maximum_tokens: Optional[
-        int
-    ] = litellm.max_tokens  # aleph alpha requires max tokens
+    maximum_tokens: Optional[int] = (
+        litellm.max_tokens
+    )  # aleph alpha requires max tokens
     minimum_tokens: Optional[int] = None
     echo: Optional[bool] = None
     temperature: Optional[int] = None
@@ -285,7 +285,10 @@ def completion(
     ## CALCULATING USAGE - baseten charges on time, not tokens - have some mapping of cost here.
     prompt_tokens = len(encoding.encode(prompt))
     completion_tokens = len(
-        encoding.encode(model_response["choices"][0]["message"]["content"])
+        encoding.encode(
+            model_response["choices"][0]["message"]["content"],
+            disallowed_special=(),
+        )
     )
     model_response["created"] = int(time.time())

From 072500e3144f89cb28d1eddb4818c7f372406d3b Mon Sep 17 00:00:00 2001
From: Krrish Dholakia
Date: Tue, 5 Mar 2024 07:40:41 -0800
Subject: [PATCH 03/14] refactor(main.py): trigger new build

---
 litellm/main.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/litellm/main.py b/litellm/main.py
index 60effd96f..87ec7ad07 100644
--- a/litellm/main.py
+++ b/litellm/main.py
@@ -12,6 +12,7 @@ from typing import Any, Literal, Union
 from functools import partial
 import dotenv, traceback, random, asyncio, time, contextvars
 from copy import deepcopy
+
 import httpx
 import litellm
 from ._logging import verbose_logger

From 6b265bc144ec0e674d6ccf5421db6041cdbacd07 Mon Sep 17 00:00:00 2001
From: Krrish Dholakia
Date: Tue, 5 Mar 2024 07:40:54 -0800
Subject: [PATCH 04/14] bump: version 1.29.2 → 1.29.3
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 pyproject.toml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index 8de9a9f10..4e318afb2 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "litellm"
-version = "1.29.2"
+version = "1.29.3"
 description = "Library to easily interface with LLM API providers"
 authors = ["BerriAI"]
 license = "MIT"
@@ -74,7 +74,7 @@ requires = ["poetry-core", "wheel"]
 build-backend = "poetry.core.masonry.api"
 
 [tool.commitizen]
-version = "1.29.2"
+version = "1.29.3"
 version_files = [
     "pyproject.toml:^version"
 ]
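A note on the `disallowed_special=()` argument added in patches 01 and 02 above: tiktoken's `encode` raises a `ValueError` by default when the input contains text that matches a special token such as `<|endoftext|>`, so token counting could crash on arbitrary user or model content. A small sketch of the failure mode and the fix, assuming the `tiktoken` package (which backs litellm's default `encoding` here):

```python
import tiktoken

enc = tiktoken.get_encoding("cl100k_base")
text = "user message quoting the <|endoftext|> token verbatim"

try:
    enc.encode(text)  # default: raises ValueError on text that matches a special token
except ValueError as err:
    print(f"default encode rejected the input: {err}")

# disallowed_special=() turns the check off; the text is tokenized as ordinary text.
token_count = len(enc.encode(text, disallowed_special=()))
print(f"counted {token_count} tokens")
```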
From d362fc6eec9b34250906d12cc8936a927423abd2 Mon Sep 17 00:00:00 2001
From: Krrish Dholakia
Date: Tue, 5 Mar 2024 13:37:24 -0800
Subject: [PATCH 05/14] fix(utils.py): fix logging

---
 litellm/utils.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/litellm/utils.py b/litellm/utils.py
index d4d85cad1..5937c072d 100644
--- a/litellm/utils.py
+++ b/litellm/utils.py
@@ -772,10 +772,10 @@ class ImageResponse(OpenAIObject):
 ############################################################
 
 
-def print_verbose(print_statement):
+def print_verbose(print_statement, logger_only: bool = False):
     try:
         verbose_logger.debug(print_statement)
-        if litellm.set_verbose:
+        if litellm.set_verbose == True and logger_only == False:
             print(print_statement)  # noqa
     except:
         pass
@@ -1739,7 +1739,8 @@ class Logging:
                         )
                         if callable(callback):  # custom logger functions
                             print_verbose(
-                                f"Making async function logging call for {callback}, result={result} - {self.model_call_details}"
+                                f"Making async function logging call for {callback}, result={result} - {self.model_call_details}",
+                                logger_only=True,
                             )
                             if self.stream:
                                 if (

From 78d87a4fbd4d804c63c4577027b96820eaf1b7d1 Mon Sep 17 00:00:00 2001
From: Krrish Dholakia
Date: Tue, 5 Mar 2024 15:01:03 -0800
Subject: [PATCH 06/14] fix: clean up print verbose statements

---
 litellm/integrations/custom_logger.py | 2 --
 litellm/utils.py                      | 8 ++++----
 2 files changed, 4 insertions(+), 6 deletions(-)

diff --git a/litellm/integrations/custom_logger.py b/litellm/integrations/custom_logger.py
index 40242f5c0..0556ceebb 100644
--- a/litellm/integrations/custom_logger.py
+++ b/litellm/integrations/custom_logger.py
@@ -124,7 +124,6 @@ class CustomLogger:  # https://docs.litellm.ai/docs/observability/custom_callbac
                 start_time,
                 end_time,
             )
-            print_verbose(f"Custom Logger - final response object: {response_obj}")
         except:
             # traceback.print_exc()
             print_verbose(f"Custom Logger Error - {traceback.format_exc()}")
@@ -142,7 +141,6 @@ class CustomLogger:  # https://docs.litellm.ai/docs/observability/custom_callbac
                 start_time,
                 end_time,
             )
-            print_verbose(f"Custom Logger - final response object: {response_obj}")
         except:
             # traceback.print_exc()
             print_verbose(f"Custom Logger Error - {traceback.format_exc()}")
diff --git a/litellm/utils.py b/litellm/utils.py
index 5937c072d..fa3457143 100644
--- a/litellm/utils.py
+++ b/litellm/utils.py
@@ -1738,10 +1738,10 @@ class Logging:
                             end_time=end_time,
                         )
                         if callable(callback):  # custom logger functions
-                            print_verbose(
-                                f"Making async function logging call for {callback}, result={result} - {self.model_call_details}",
-                                logger_only=True,
-                            )
+                            # print_verbose(
+                            #     f"Making async function logging call for {callback}, result={result} - {self.model_call_details}",
+                            #     logger_only=True,
+                            # )
                             if self.stream:
                                 if (
                                     "async_complete_streaming_response"
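The `logger_only` flag introduced in patch 05 (whose call site patch 06 then comments out) separates two concerns: always recording at debug level versus also echoing to stdout when verbose mode is on. A self-contained sketch of the pattern with stand-in module globals, using the idiomatic `set_verbose and not logger_only` in place of the patch's explicit `== True` / `== False` comparisons:

```python
import logging

verbose_logger = logging.getLogger("litellm")  # stand-in for litellm's verbose_logger
set_verbose = False  # stand-in for litellm.set_verbose


def print_verbose(print_statement, logger_only: bool = False):
    try:
        # Always record at debug level so log handlers capture it regardless of verbosity.
        verbose_logger.debug(print_statement)
        # Echo to stdout only when verbose mode is on and the caller didn't opt out.
        if set_verbose and not logger_only:
            print(print_statement)  # noqa
    except Exception:
        pass


# Captured by logging handlers, never printed, even when set_verbose is True:
print_verbose("noisy per-callback detail", logger_only=True)
```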
"postgresql://:@:/" # [OPTIONAL] use for token-based auth to proxy environment_variables: From 2b595bfdc9eabf5a4af555f59252422254cfe3f1 Mon Sep 17 00:00:00 2001 From: Krrish Dholakia Date: Tue, 5 Mar 2024 15:42:19 -0800 Subject: [PATCH 08/14] test(test_keys.py): add more duration for test --- tests/test_keys.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_keys.py b/tests/test_keys.py index db21a2176..5a7b79e1c 100644 --- a/tests/test_keys.py +++ b/tests/test_keys.py @@ -469,7 +469,7 @@ async def test_key_with_budgets(): break except: i + 1 - await asyncio.sleep(5) + await asyncio.sleep(10) assert reset_at_init_value != reset_at_new_value From 387864662e8fe40b8b3f7aa8e6f279bd736d38a3 Mon Sep 17 00:00:00 2001 From: Krrish Dholakia Date: Tue, 5 Mar 2024 15:50:40 -0800 Subject: [PATCH 09/14] fix(main.py): trigger new build --- litellm/main.py | 1 - litellm/proxy/utils.py | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/litellm/main.py b/litellm/main.py index 87ec7ad07..60effd96f 100644 --- a/litellm/main.py +++ b/litellm/main.py @@ -12,7 +12,6 @@ from typing import Any, Literal, Union from functools import partial import dotenv, traceback, random, asyncio, time, contextvars from copy import deepcopy - import httpx import litellm from ._logging import verbose_logger diff --git a/litellm/proxy/utils.py b/litellm/proxy/utils.py index fc90c3b7b..d11e1e479 100644 --- a/litellm/proxy/utils.py +++ b/litellm/proxy/utils.py @@ -64,7 +64,7 @@ class ProxyLogging: litellm.callbacks.append(self.max_parallel_request_limiter) litellm.callbacks.append(self.max_budget_limiter) litellm.callbacks.append(self.cache_control_check) - litellm.callbacks.append(self.response_taking_too_long_callback) + litellm.success_callback.append(self.response_taking_too_long_callback) for callback in litellm.callbacks: if callback not in litellm.input_callback: litellm.input_callback.append(callback) From 512f6814d3257bdbb3cbde73ef7e233eb55762bb Mon Sep 17 00:00:00 2001 From: Krrish Dholakia Date: Tue, 5 Mar 2024 16:31:41 -0800 Subject: [PATCH 10/14] fix(factory.py): fix anthropic prompt template --- litellm/llms/prompt_templates/factory.py | 1 + litellm/tests/test_completion.py | 17 +++++++++++++++++ 2 files changed, 18 insertions(+) diff --git a/litellm/llms/prompt_templates/factory.py b/litellm/llms/prompt_templates/factory.py index dec87a61c..c78a71dba 100644 --- a/litellm/llms/prompt_templates/factory.py +++ b/litellm/llms/prompt_templates/factory.py @@ -575,6 +575,7 @@ def anthropic_messages_pt(messages: list): if messages[i]["role"] == "assistant": last_assistant_message_idx = i + new_messages.append(messages[-1]) if last_assistant_message_idx is not None: new_messages[last_assistant_message_idx]["content"] = new_messages[ last_assistant_message_idx diff --git a/litellm/tests/test_completion.py b/litellm/tests/test_completion.py index 57b0e436f..f5e145769 100644 --- a/litellm/tests/test_completion.py +++ b/litellm/tests/test_completion.py @@ -82,6 +82,23 @@ def test_completion_claude(): # test_completion_claude() +def test_completion_claude_3_empty_response(): + messages = [ + { + "role": "system", + "content": "You are 2twNLGfqk4GMOn3ffp4p.", + }, + {"role": "user", "content": "Hi gm!"}, + {"role": "assistant", "content": "Good morning! 
From 512f6814d3257bdbb3cbde73ef7e233eb55762bb Mon Sep 17 00:00:00 2001
From: Krrish Dholakia
Date: Tue, 5 Mar 2024 16:31:41 -0800
Subject: [PATCH 10/14] fix(factory.py): fix anthropic prompt template

---
 litellm/llms/prompt_templates/factory.py |  1 +
 litellm/tests/test_completion.py         | 17 +++++++++++++++++
 2 files changed, 18 insertions(+)

diff --git a/litellm/llms/prompt_templates/factory.py b/litellm/llms/prompt_templates/factory.py
index dec87a61c..c78a71dba 100644
--- a/litellm/llms/prompt_templates/factory.py
+++ b/litellm/llms/prompt_templates/factory.py
@@ -575,6 +575,7 @@ def anthropic_messages_pt(messages: list):
         if messages[i]["role"] == "assistant":
             last_assistant_message_idx = i
 
+    new_messages.append(messages[-1])
     if last_assistant_message_idx is not None:
         new_messages[last_assistant_message_idx]["content"] = new_messages[
             last_assistant_message_idx
diff --git a/litellm/tests/test_completion.py b/litellm/tests/test_completion.py
index 57b0e436f..f5e145769 100644
--- a/litellm/tests/test_completion.py
+++ b/litellm/tests/test_completion.py
@@ -82,6 +82,23 @@ def test_completion_claude():
 # test_completion_claude()
 
 
+def test_completion_claude_3_empty_response():
+    messages = [
+        {
+            "role": "system",
+            "content": "You are 2twNLGfqk4GMOn3ffp4p.",
+        },
+        {"role": "user", "content": "Hi gm!"},
+        {"role": "assistant", "content": "Good morning! How are you doing today?"},
+        {
+            "role": "user",
+            "content": "I was hoping we could chat a bit",
+        },
+    ]
+    response = litellm.completion(model="claude-3-opus-20240229", messages=messages)
+    print(response)
+
+
 def test_completion_claude_3():
     litellm.set_verbose = True
     messages = [{"role": "user", "content": "Hello, world"}]

From 6727b009a191771aa5d6c240c008ec8c8d0a37a5 Mon Sep 17 00:00:00 2001
From: Krrish Dholakia
Date: Tue, 5 Mar 2024 16:32:06 -0800
Subject: [PATCH 11/14] bump: version 1.29.3 → 1.29.4
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 pyproject.toml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index 4e318afb2..6d42edd23 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "litellm"
-version = "1.29.3"
+version = "1.29.4"
 description = "Library to easily interface with LLM API providers"
 authors = ["BerriAI"]
 license = "MIT"
@@ -74,7 +74,7 @@ requires = ["poetry-core", "wheel"]
 build-backend = "poetry.core.masonry.api"
 
 [tool.commitizen]
-version = "1.29.3"
+version = "1.29.4"
 version_files = [
     "pyproject.toml:^version"
 ]

From 3f7bf5c6b19e746a6609c48650d42cd74ea8ce04 Mon Sep 17 00:00:00 2001
From: ishaan-jaff
Date: Tue, 5 Mar 2024 16:46:58 -0800
Subject: [PATCH 12/14] (fix) fix batch update user db

---
 litellm/proxy/proxy_server.py | 28 +++++++++++++++++-----------
 1 file changed, 17 insertions(+), 11 deletions(-)

diff --git a/litellm/proxy/proxy_server.py b/litellm/proxy/proxy_server.py
index 47b8c1535..628f55852 100644
--- a/litellm/proxy/proxy_server.py
+++ b/litellm/proxy/proxy_server.py
@@ -1067,20 +1067,22 @@ async def update_database(
                         )
                     data_list.append(existing_spend_obj)
 
-                # Update the cost column for the given user id
-                if prisma_client is not None:
-                    await prisma_client.update_data(
-                        data_list=data_list,
-                        query_type="update_many",
-                        table_name="user",
-                    )
-                elif custom_db_client is not None and user_id is not None:
+                if custom_db_client is not None and user_id is not None:
                     new_spend = data_list[0].spend
                     await custom_db_client.update_data(
                         key=user_id, value={"spend": new_spend}, table_name="user"
                     )
+                # Update the cost column for the given user id
+                if prisma_client is not None:
+                    await prisma_client.update_data(
+                        data_list=data_list,
+                        query_type="update_many",
+                        table_name="user",
+                    )
             except Exception as e:
-                verbose_proxy_logger.info(f"Update User DB call failed to execute")
+                verbose_proxy_logger.info(
+                    f"Update User DB call failed to execute {str(e)}"
+                )
 
         ### UPDATE KEY SPEND ###
         async def _update_key_db():
@@ -1215,7 +1217,9 @@ async def update_database(
 
                 await custom_db_client.insert_data(payload, table_name="spend")
             except Exception as e:
-                verbose_proxy_logger.info(f"Update Spend Logs DB failed to execute")
+                verbose_proxy_logger.info(
+                    f"Update Spend Logs DB failed to execute - {str(e)}"
+                )
 
         ### UPDATE KEY SPEND ###
         async def _update_team_db():
@@ -1286,7 +1290,9 @@ async def update_database(
                     valid_token.spend = new_spend
                     user_api_key_cache.set_cache(key=token, value=valid_token)
             except Exception as e:
-                verbose_proxy_logger.info(f"Update Team DB failed to execute")
+                verbose_proxy_logger.info(
+                    f"Update Team DB failed to execute - {str(e)}"
+                )
 
         asyncio.create_task(_update_user_db())
         asyncio.create_task(_update_key_db())
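Background for the `anthropic_messages_pt` fix in patch 10 above: Anthropic's Messages API expects the conversation to open with a `user` turn and to alternate `user`/`assistant` roles, and litellm's template helper re-packs OpenAI-style message lists to satisfy that; the added `new_messages.append(messages[-1])` keeps the final turn from being dropped, which previously produced empty prompts for conversations ending on a user message. A rough validator sketch of the constraint (not litellm's actual implementation):

```python
def check_anthropic_messages(messages: list) -> None:
    # System prompts are passed separately to the Messages API, so skip them here.
    turns = [m for m in messages if m["role"] != "system"]
    if not turns:
        raise ValueError("at least one user message is required")
    if turns[0]["role"] != "user":
        raise ValueError("conversation must start with a user turn")
    for prev, curr in zip(turns, turns[1:]):
        if prev["role"] == curr["role"]:
            raise ValueError(f"consecutive {curr['role']} turns must be merged")


# The test case from patch 10 passes once the final user message is kept:
check_anthropic_messages(
    [
        {"role": "system", "content": "You are a helper."},
        {"role": "user", "content": "Hi gm!"},
        {"role": "assistant", "content": "Good morning! How are you doing today?"},
        {"role": "user", "content": "I was hoping we could chat a bit"},
    ]
)
```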
From 0eb67e50a174c3c52eeabab26bb7c16e3afb31ee Mon Sep 17 00:00:00 2001
From: Krrish Dholakia
Date: Tue, 5 Mar 2024 16:48:37 -0800
Subject: [PATCH 13/14] fix(utils.py): handle none in tool call for mistral
 tool calling

---
 litellm/utils.py | 24 ++++++++++++++++++++----
 1 file changed, 20 insertions(+), 4 deletions(-)

diff --git a/litellm/utils.py b/litellm/utils.py
index fa3457143..33f4f1c3d 100644
--- a/litellm/utils.py
+++ b/litellm/utils.py
@@ -225,9 +225,25 @@ class ChatCompletionDeltaToolCall(OpenAIObject):
 
 
 class ChatCompletionMessageToolCall(OpenAIObject):
-    id: str
-    function: Function
-    type: str
+    def __init__(
+        self,
+        function: Function,
+        id: Optional[str] = None,
+        type: Optional[str] = None,
+        **params,
+    ):
+        super(ChatCompletionMessageToolCall, self).__init__(**params)
+        self.function = function
+
+        if id is not None:
+            self.id = id
+        else:
+            self.id = f"{uuid.uuid4()}"
+
+        if type is not None:
+            self.type = type
+        else:
+            self.type = "function"
 
 
 class Message(OpenAIObject):
@@ -6232,7 +6248,7 @@ def convert_to_model_response_object(
 
         return model_response_object
     except Exception as e:
-        raise Exception(f"Invalid response object {e}")
+        raise Exception(f"Invalid response object {traceback.format_exc()}")

From f95458dad8ad5ad8709e711c2be45ccff324d695 Mon Sep 17 00:00:00 2001
From: Krrish Dholakia
Date: Tue, 5 Mar 2024 18:10:43 -0800
Subject: [PATCH 14/14] fix(utils.py): handle dict object for
 chatcompletionmessagetoolcall

---
 litellm/utils.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/litellm/utils.py b/litellm/utils.py
index 33f4f1c3d..68dc137af 100644
--- a/litellm/utils.py
+++ b/litellm/utils.py
@@ -227,13 +227,16 @@ class ChatCompletionDeltaToolCall(OpenAIObject):
 class ChatCompletionMessageToolCall(OpenAIObject):
     def __init__(
         self,
-        function: Function,
+        function: Union[Dict, Function],
         id: Optional[str] = None,
         type: Optional[str] = None,
         **params,
     ):
         super(ChatCompletionMessageToolCall, self).__init__(**params)
-        self.function = function
+        if isinstance(function, Dict):
+            self.function = Function(**function)
+        else:
+            self.function = function
 
         if id is not None:
             self.id = id
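Taken together, patches 13 and 14 let `ChatCompletionMessageToolCall` be constructed from partial provider output: `id` falls back to a generated UUID, `type` falls back to `"function"`, and a raw dict is coerced into a `Function`. A usage sketch with illustrative values:

```python
from litellm.utils import ChatCompletionMessageToolCall

# A Mistral-style tool call with no id/type and a plain dict for the function:
tool_call = ChatCompletionMessageToolCall(
    function={"name": "get_weather", "arguments": '{"city": "Paris"}'}
)
print(tool_call.id)             # auto-generated UUID string
print(tool_call.type)           # "function"
print(tool_call.function.name)  # "get_weather"
```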