From 14a19fc6014a7d63e04c283aedd3fc8b12a901f3 Mon Sep 17 00:00:00 2001
From: xihajun
Date: Sun, 3 Mar 2024 23:43:03 +0000
Subject: [PATCH 01/14] Disable special token restriction for Claude AI

---
 litellm/llms/anthropic.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/litellm/llms/anthropic.py b/litellm/llms/anthropic.py
index 150ae0e07..6bfedc101 100644
--- a/litellm/llms/anthropic.py
+++ b/litellm/llms/anthropic.py
@@ -200,10 +200,10 @@ def completion(
 
         ## CALCULATING USAGE
         prompt_tokens = len(
-            encoding.encode(prompt)
+            encoding.encode(prompt, disallowed_special=())
         )  ##[TODO] use the anthropic tokenizer here
         completion_tokens = len(
-            encoding.encode(model_response["choices"][0]["message"].get("content", ""))
+            encoding.encode(model_response["choices"][0]["message"].get("content", ""), disallowed_special=())
         )  ##[TODO] use the anthropic tokenizer here
 
         model_response["created"] = int(time.time())

From 5c03109b6fe8d59602aa4616fffc67e6705eef98 Mon Sep 17 00:00:00 2001
From: Krrish Dholakia
Date: Tue, 5 Mar 2024 07:39:06 -0800
Subject: [PATCH 02/14] docs(configs.md): add load balancing to proxy config
 docs

---
 docs/my-website/docs/proxy/configs.md | 65 ++++++++++++++++-----------
 litellm/llms/aleph_alpha.py           | 11 +++--
 2 files changed, 47 insertions(+), 29 deletions(-)

diff --git a/docs/my-website/docs/proxy/configs.md b/docs/my-website/docs/proxy/configs.md
index 0a155828b..2b3edfadb 100644
--- a/docs/my-website/docs/proxy/configs.md
+++ b/docs/my-website/docs/proxy/configs.md
@@ -202,7 +202,7 @@ print(response)

## Save Model-specific params (API Base, Keys, Temperature, Max Tokens, Organization, Headers etc.)

You can use the config to save model-specific information like api_base, api_key, temperature, max_tokens, etc.

[**All input params**](https://docs.litellm.ai/docs/completion/input#input-params-1)

@@ -244,6 +244,45 @@ $ litellm --config /path/to/config.yaml
```

## Load Balancing

Use this to call multiple instances of the same model and configure things like [routing strategy](../routing.md#advanced).

```yaml
router_settings:
  routing_strategy: "latency-based-routing" # routes to the fastest deployment in the group

model_list:
  - model_name: zephyr-beta
    litellm_params:
      model: huggingface/HuggingFaceH4/zephyr-7b-beta
      api_base: http://0.0.0.0:8001
  - model_name: zephyr-beta
    litellm_params:
      model: huggingface/HuggingFaceH4/zephyr-7b-beta
      api_base: http://0.0.0.0:8002
  - model_name: zephyr-beta
    litellm_params:
      model: huggingface/HuggingFaceH4/zephyr-7b-beta
      api_base: http://0.0.0.0:8003
  - model_name: gpt-3.5-turbo
    litellm_params:
      model: gpt-3.5-turbo
      api_key: <my-openai-key>
  - model_name: gpt-3.5-turbo-16k
    litellm_params:
      model: gpt-3.5-turbo-16k
      api_key: <my-openai-key>

litellm_settings:
  num_retries: 3 # retry call 3 times on each model_name (e.g. zephyr-beta)
  request_timeout: 10 # raise Timeout error if call takes longer than 10s. Sets litellm.request_timeout
  fallbacks: [{"zephyr-beta": ["gpt-3.5-turbo"]}] # fallback to gpt-3.5-turbo if the call still fails after num_retries
  context_window_fallbacks: [{"zephyr-beta": ["gpt-3.5-turbo-16k"]}, {"gpt-3.5-turbo": ["gpt-3.5-turbo-16k"]}] # fallback to gpt-3.5-turbo-16k on a context window error
  allowed_fails: 3 # cooldown the deployment if it fails 3 calls in a minute
```
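With this config, every request to the shared `model_name` is spread across the three `zephyr-beta` deployments, and the retry/fallback settings apply transparently. A minimal client sketch, assuming the proxy is running locally on port 8000 and the `openai` Python package is installed:

```python
import openai

# Point the standard OpenAI client at the LiteLLM proxy.
client = openai.OpenAI(
    api_key="anything",  # provider keys live in the proxy config; any string works unless a master_key is set
    base_url="http://0.0.0.0:8000",
)

response = client.chat.completions.create(
    model="zephyr-beta",  # the shared model_name; the router picks the fastest deployment
    messages=[{"role": "user", "content": "Hello!"}],
)
print(response.choices[0].message.content)
```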

## Set Azure `base_model` for cost tracking

**Problem**: Azure returns `gpt-4` in the response when `azure/gpt-4-1106-preview` is used. This leads to inaccurate cost tracking.

@@ -512,30 +551,6 @@ curl --location 'http://0.0.0.0:8000/chat/completions' \
```

## Router Settings

Use this to configure things like routing strategy.

```yaml
router_settings:
  routing_strategy: "least-busy"

model_list: # will route requests to the least busy ollama model
  - model_name: ollama-models
    litellm_params:
      model: "ollama/mistral"
      api_base: "http://127.0.0.1:8001"
  - model_name: ollama-models
    litellm_params:
      model: "ollama/codellama"
      api_base: "http://127.0.0.1:8002"
  - model_name: ollama-models
    litellm_params:
      model: "ollama/llama2"
      api_base: "http://127.0.0.1:8003"
```

## Configure DB Pool Limits + Connection Timeouts

```yaml
diff --git a/litellm/llms/aleph_alpha.py b/litellm/llms/aleph_alpha.py
index 7168e7369..3c1bd5dde 100644
--- a/litellm/llms/aleph_alpha.py
+++ b/litellm/llms/aleph_alpha.py
@@ -77,9 +77,9 @@ class AlephAlphaConfig:
 
     - `control_log_additive` (boolean; default value: true): Method of applying control to attention scores.
     """
 
-    maximum_tokens: Optional[
-        int
-    ] = litellm.max_tokens  # aleph alpha requires max tokens
+    maximum_tokens: Optional[int] = (
+        litellm.max_tokens
+    )  # aleph alpha requires max tokens
     minimum_tokens: Optional[int] = None
     echo: Optional[bool] = None
     temperature: Optional[int] = None
@@ -285,7 +285,10 @@ def completion(
     ## CALCULATING USAGE - baseten charges on time, not tokens - have some mapping of cost here.
     prompt_tokens = len(encoding.encode(prompt))
     completion_tokens = len(
-        encoding.encode(model_response["choices"][0]["message"]["content"])
+        encoding.encode(
+            model_response["choices"][0]["message"]["content"],
+            disallowed_special=(),
+        )
     )
     model_response["created"] = int(time.time())

From 072500e3144f89cb28d1eddb4818c7f372406d3b Mon Sep 17 00:00:00 2001
From: Krrish Dholakia
Date: Tue, 5 Mar 2024 07:40:41 -0800
Subject: [PATCH 03/14] refactor(main.py): trigger new build

---
 litellm/main.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/litellm/main.py b/litellm/main.py
index 60effd96f..87ec7ad07 100644
--- a/litellm/main.py
+++ b/litellm/main.py
@@ -12,6 +12,7 @@ from typing import Any, Literal, Union
 from functools import partial
 import dotenv, traceback, random, asyncio, time, contextvars
 from copy import deepcopy
+
 import httpx
 import litellm
 from ._logging import verbose_logger

From 6b265bc144ec0e674d6ccf5421db6041cdbacd07 Mon Sep 17 00:00:00 2001
From: Krrish Dholakia
Date: Tue, 5 Mar 2024 07:40:54 -0800
Subject: [PATCH 04/14] bump: version 1.29.2 → 1.29.3
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 pyproject.toml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index 8de9a9f10..4e318afb2 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "litellm"
-version = "1.29.2"
+version = "1.29.3"
 description = "Library to easily interface with LLM API providers"
 authors = ["BerriAI"]
 license = "MIT"
@@ -74,7 +74,7 @@ requires = ["poetry-core", "wheel"]
 build-backend = "poetry.core.masonry.api"
 
 [tool.commitizen]
-version = "1.29.2"
+version = "1.29.3"
 version_files = [
     "pyproject.toml:^version"
 ]
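A note on the `disallowed_special=()` argument added in patches 01 and 02 above: tiktoken's `encode` raises a `ValueError` by default when the input contains text that matches a special token such as `<|endoftext|>`, so token counting could crash on arbitrary user or model content. A small sketch of the failure mode and the fix, assuming the `tiktoken` package (which backs litellm's default `encoding` here):

```python
import tiktoken

enc = tiktoken.get_encoding("cl100k_base")
text = "user message quoting the <|endoftext|> token verbatim"

try:
    enc.encode(text)  # default: raises ValueError on text that matches a special token
except ValueError as err:
    print(f"default encode rejected the input: {err}")

# disallowed_special=() turns the check off; the text is tokenized as ordinary text.
token_count = len(enc.encode(text, disallowed_special=()))
print(f"counted {token_count} tokens")
```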
From d362fc6eec9b34250906d12cc8936a927423abd2 Mon Sep 17 00:00:00 2001
From: Krrish Dholakia
Date: Tue, 5 Mar 2024 13:37:24 -0800
Subject: [PATCH 05/14] fix(utils.py): fix logging

---
 litellm/utils.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/litellm/utils.py b/litellm/utils.py
index d4d85cad1..5937c072d 100644
--- a/litellm/utils.py
+++ b/litellm/utils.py
@@ -772,10 +772,10 @@ class ImageResponse(OpenAIObject):
 ############################################################
 
 
-def print_verbose(print_statement):
+def print_verbose(print_statement, logger_only: bool = False):
     try:
         verbose_logger.debug(print_statement)
-        if litellm.set_verbose:
+        if litellm.set_verbose == True and logger_only == False:
             print(print_statement)  # noqa
     except:
         pass
@@ -1739,7 +1739,8 @@ class Logging:
                         )
                         if callable(callback):  # custom logger functions
                             print_verbose(
-                                f"Making async function logging call for {callback}, result={result} - {self.model_call_details}"
+                                f"Making async function logging call for {callback}, result={result} - {self.model_call_details}",
+                                logger_only=True,
                             )
                             if self.stream:
                                 if (

From 78d87a4fbd4d804c63c4577027b96820eaf1b7d1 Mon Sep 17 00:00:00 2001
From: Krrish Dholakia
Date: Tue, 5 Mar 2024 15:01:03 -0800
Subject: [PATCH 06/14] fix: clean up print verbose statements

---
 litellm/integrations/custom_logger.py | 2 --
 litellm/utils.py                      | 8 ++++----
 2 files changed, 4 insertions(+), 6 deletions(-)

diff --git a/litellm/integrations/custom_logger.py b/litellm/integrations/custom_logger.py
index 40242f5c0..0556ceebb 100644
--- a/litellm/integrations/custom_logger.py
+++ b/litellm/integrations/custom_logger.py
@@ -124,7 +124,6 @@ class CustomLogger:  # https://docs.litellm.ai/docs/observability/custom_callbac
                 start_time,
                 end_time,
             )
-            print_verbose(f"Custom Logger - final response object: {response_obj}")
         except:
             # traceback.print_exc()
             print_verbose(f"Custom Logger Error - {traceback.format_exc()}")
@@ -142,7 +141,6 @@ class CustomLogger:  # https://docs.litellm.ai/docs/observability/custom_callbac
                 start_time,
                 end_time,
             )
-            print_verbose(f"Custom Logger - final response object: {response_obj}")
         except:
             # traceback.print_exc()
             print_verbose(f"Custom Logger Error - {traceback.format_exc()}")
diff --git a/litellm/utils.py b/litellm/utils.py
index 5937c072d..fa3457143 100644
--- a/litellm/utils.py
+++ b/litellm/utils.py
@@ -1738,10 +1738,10 @@ class Logging:
                             end_time=end_time,
                         )
                         if callable(callback):  # custom logger functions
-                            print_verbose(
-                                f"Making async function logging call for {callback}, result={result} - {self.model_call_details}",
-                                logger_only=True,
-                            )
+                            # print_verbose(
+                            #     f"Making async function logging call for {callback}, result={result} - {self.model_call_details}",
+                            #     logger_only=True,
+                            # )
                             if self.stream:
                                 if (
                                     "async_complete_streaming_response"
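The `logger_only` flag introduced in patch 05 (whose call site patch 06 then comments out) separates two concerns: always recording at debug level versus also echoing to stdout when verbose mode is on. A self-contained sketch of the pattern with stand-in module globals, using the idiomatic `set_verbose and not logger_only` in place of the patch's explicit `== True` / `== False` comparisons:

```python
import logging

verbose_logger = logging.getLogger("litellm")  # stand-in for litellm's verbose_logger
set_verbose = False  # stand-in for litellm.set_verbose


def print_verbose(print_statement, logger_only: bool = False):
    try:
        # Always record at debug level so log handlers capture it regardless of verbosity.
        verbose_logger.debug(print_statement)
        # Echo to stdout only when verbose mode is on and the caller didn't opt out.
        if set_verbose and not logger_only:
            print(print_statement)  # noqa
    except Exception:
        pass


# Captured by logging handlers, never printed, even when set_verbose is True:
print_verbose("noisy per-callback detail", logger_only=True)
```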
"postgresql://:@:/" # [OPTIONAL] use for token-based auth to proxy environment_variables: From 2b595bfdc9eabf5a4af555f59252422254cfe3f1 Mon Sep 17 00:00:00 2001 From: Krrish Dholakia Date: Tue, 5 Mar 2024 15:42:19 -0800 Subject: [PATCH 08/14] test(test_keys.py): add more duration for test --- tests/test_keys.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_keys.py b/tests/test_keys.py index db21a2176..5a7b79e1c 100644 --- a/tests/test_keys.py +++ b/tests/test_keys.py @@ -469,7 +469,7 @@ async def test_key_with_budgets(): break except: i + 1 - await asyncio.sleep(5) + await asyncio.sleep(10) assert reset_at_init_value != reset_at_new_value From 387864662e8fe40b8b3f7aa8e6f279bd736d38a3 Mon Sep 17 00:00:00 2001 From: Krrish Dholakia Date: Tue, 5 Mar 2024 15:50:40 -0800 Subject: [PATCH 09/14] fix(main.py): trigger new build --- litellm/main.py | 1 - litellm/proxy/utils.py | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/litellm/main.py b/litellm/main.py index 87ec7ad07..60effd96f 100644 --- a/litellm/main.py +++ b/litellm/main.py @@ -12,7 +12,6 @@ from typing import Any, Literal, Union from functools import partial import dotenv, traceback, random, asyncio, time, contextvars from copy import deepcopy - import httpx import litellm from ._logging import verbose_logger diff --git a/litellm/proxy/utils.py b/litellm/proxy/utils.py index fc90c3b7b..d11e1e479 100644 --- a/litellm/proxy/utils.py +++ b/litellm/proxy/utils.py @@ -64,7 +64,7 @@ class ProxyLogging: litellm.callbacks.append(self.max_parallel_request_limiter) litellm.callbacks.append(self.max_budget_limiter) litellm.callbacks.append(self.cache_control_check) - litellm.callbacks.append(self.response_taking_too_long_callback) + litellm.success_callback.append(self.response_taking_too_long_callback) for callback in litellm.callbacks: if callback not in litellm.input_callback: litellm.input_callback.append(callback) From 512f6814d3257bdbb3cbde73ef7e233eb55762bb Mon Sep 17 00:00:00 2001 From: Krrish Dholakia Date: Tue, 5 Mar 2024 16:31:41 -0800 Subject: [PATCH 10/14] fix(factory.py): fix anthropic prompt template --- litellm/llms/prompt_templates/factory.py | 1 + litellm/tests/test_completion.py | 17 +++++++++++++++++ 2 files changed, 18 insertions(+) diff --git a/litellm/llms/prompt_templates/factory.py b/litellm/llms/prompt_templates/factory.py index dec87a61c..c78a71dba 100644 --- a/litellm/llms/prompt_templates/factory.py +++ b/litellm/llms/prompt_templates/factory.py @@ -575,6 +575,7 @@ def anthropic_messages_pt(messages: list): if messages[i]["role"] == "assistant": last_assistant_message_idx = i + new_messages.append(messages[-1]) if last_assistant_message_idx is not None: new_messages[last_assistant_message_idx]["content"] = new_messages[ last_assistant_message_idx diff --git a/litellm/tests/test_completion.py b/litellm/tests/test_completion.py index 57b0e436f..f5e145769 100644 --- a/litellm/tests/test_completion.py +++ b/litellm/tests/test_completion.py @@ -82,6 +82,23 @@ def test_completion_claude(): # test_completion_claude() +def test_completion_claude_3_empty_response(): + messages = [ + { + "role": "system", + "content": "You are 2twNLGfqk4GMOn3ffp4p.", + }, + {"role": "user", "content": "Hi gm!"}, + {"role": "assistant", "content": "Good morning! 
From 512f6814d3257bdbb3cbde73ef7e233eb55762bb Mon Sep 17 00:00:00 2001
From: Krrish Dholakia
Date: Tue, 5 Mar 2024 16:31:41 -0800
Subject: [PATCH 10/14] fix(factory.py): fix anthropic prompt template

---
 litellm/llms/prompt_templates/factory.py |  1 +
 litellm/tests/test_completion.py         | 17 +++++++++++++++++
 2 files changed, 18 insertions(+)

diff --git a/litellm/llms/prompt_templates/factory.py b/litellm/llms/prompt_templates/factory.py
index dec87a61c..c78a71dba 100644
--- a/litellm/llms/prompt_templates/factory.py
+++ b/litellm/llms/prompt_templates/factory.py
@@ -575,6 +575,7 @@ def anthropic_messages_pt(messages: list):
         if messages[i]["role"] == "assistant":
             last_assistant_message_idx = i
 
+    new_messages.append(messages[-1])
     if last_assistant_message_idx is not None:
         new_messages[last_assistant_message_idx]["content"] = new_messages[
             last_assistant_message_idx
diff --git a/litellm/tests/test_completion.py b/litellm/tests/test_completion.py
index 57b0e436f..f5e145769 100644
--- a/litellm/tests/test_completion.py
+++ b/litellm/tests/test_completion.py
@@ -82,6 +82,23 @@ def test_completion_claude():
 # test_completion_claude()
 
 
+def test_completion_claude_3_empty_response():
+    messages = [
+        {
+            "role": "system",
+            "content": "You are 2twNLGfqk4GMOn3ffp4p.",
+        },
+        {"role": "user", "content": "Hi gm!"},
+        {"role": "assistant", "content": "Good morning! How are you doing today?"},
+        {
+            "role": "user",
+            "content": "I was hoping we could chat a bit",
+        },
+    ]
+    response = litellm.completion(model="claude-3-opus-20240229", messages=messages)
+    print(response)
+
+
 def test_completion_claude_3():
     litellm.set_verbose = True
     messages = [{"role": "user", "content": "Hello, world"}]

From 6727b009a191771aa5d6c240c008ec8c8d0a37a5 Mon Sep 17 00:00:00 2001
From: Krrish Dholakia
Date: Tue, 5 Mar 2024 16:32:06 -0800
Subject: [PATCH 11/14] bump: version 1.29.3 → 1.29.4
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 pyproject.toml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index 4e318afb2..6d42edd23 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "litellm"
-version = "1.29.3"
+version = "1.29.4"
 description = "Library to easily interface with LLM API providers"
 authors = ["BerriAI"]
 license = "MIT"
@@ -74,7 +74,7 @@ requires = ["poetry-core", "wheel"]
 build-backend = "poetry.core.masonry.api"
 
 [tool.commitizen]
-version = "1.29.3"
+version = "1.29.4"
 version_files = [
     "pyproject.toml:^version"
 ]

From 3f7bf5c6b19e746a6609c48650d42cd74ea8ce04 Mon Sep 17 00:00:00 2001
From: ishaan-jaff
Date: Tue, 5 Mar 2024 16:46:58 -0800
Subject: [PATCH 12/14] (fix) fix batch update user db

---
 litellm/proxy/proxy_server.py | 28 +++++++++++++++++-----------
 1 file changed, 17 insertions(+), 11 deletions(-)

diff --git a/litellm/proxy/proxy_server.py b/litellm/proxy/proxy_server.py
index 47b8c1535..628f55852 100644
--- a/litellm/proxy/proxy_server.py
+++ b/litellm/proxy/proxy_server.py
@@ -1067,20 +1067,22 @@ async def update_database(
                         )
                     data_list.append(existing_spend_obj)
 
-                # Update the cost column for the given user id
-                if prisma_client is not None:
-                    await prisma_client.update_data(
-                        data_list=data_list,
-                        query_type="update_many",
-                        table_name="user",
-                    )
-                elif custom_db_client is not None and user_id is not None:
+                if custom_db_client is not None and user_id is not None:
                     new_spend = data_list[0].spend
                     await custom_db_client.update_data(
                         key=user_id, value={"spend": new_spend}, table_name="user"
                     )
+                # Update the cost column for the given user id
+                if prisma_client is not None:
+                    await prisma_client.update_data(
+                        data_list=data_list,
+                        query_type="update_many",
+                        table_name="user",
+                    )
             except Exception as e:
-                verbose_proxy_logger.info(f"Update User DB call failed to execute")
+                verbose_proxy_logger.info(
+                    f"Update User DB call failed to execute {str(e)}"
+                )
 
         ### UPDATE KEY SPEND ###
         async def _update_key_db():
@@ -1215,7 +1217,9 @@ async def update_database(
 
                 await custom_db_client.insert_data(payload, table_name="spend")
             except Exception as e:
-                verbose_proxy_logger.info(f"Update Spend Logs DB failed to execute")
+                verbose_proxy_logger.info(
+                    f"Update Spend Logs DB failed to execute - {str(e)}"
+                )
 
         ### UPDATE KEY SPEND ###
         async def _update_team_db():
@@ -1286,7 +1290,9 @@ async def update_database(
                     valid_token.spend = new_spend
                     user_api_key_cache.set_cache(key=token, value=valid_token)
             except Exception as e:
-                verbose_proxy_logger.info(f"Update Team DB failed to execute")
+                verbose_proxy_logger.info(
+                    f"Update Team DB failed to execute - {str(e)}"
+                )
 
         asyncio.create_task(_update_user_db())
         asyncio.create_task(_update_key_db())
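Background for the `anthropic_messages_pt` fix in patch 10 above: Anthropic's Messages API expects the conversation to open with a `user` turn and to alternate `user`/`assistant` roles, and litellm's template helper re-packs OpenAI-style message lists to satisfy that; the added `new_messages.append(messages[-1])` keeps the final turn from being dropped, which previously produced empty prompts for conversations ending on a user message. A rough validator sketch of the constraint (not litellm's actual implementation):

```python
def check_anthropic_messages(messages: list) -> None:
    # System prompts are passed separately to the Messages API, so skip them here.
    turns = [m for m in messages if m["role"] != "system"]
    if not turns:
        raise ValueError("at least one user message is required")
    if turns[0]["role"] != "user":
        raise ValueError("conversation must start with a user turn")
    for prev, curr in zip(turns, turns[1:]):
        if prev["role"] == curr["role"]:
            raise ValueError(f"consecutive {curr['role']} turns must be merged")


# The test case from patch 10 passes once the final user message is kept:
check_anthropic_messages(
    [
        {"role": "system", "content": "You are a helper."},
        {"role": "user", "content": "Hi gm!"},
        {"role": "assistant", "content": "Good morning! How are you doing today?"},
        {"role": "user", "content": "I was hoping we could chat a bit"},
    ]
)
```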
From 0eb67e50a174c3c52eeabab26bb7c16e3afb31ee Mon Sep 17 00:00:00 2001
From: Krrish Dholakia
Date: Tue, 5 Mar 2024 16:48:37 -0800
Subject: [PATCH 13/14] fix(utils.py): handle none in tool call for mistral
 tool calling

---
 litellm/utils.py | 24 ++++++++++++++++++++----
 1 file changed, 20 insertions(+), 4 deletions(-)

diff --git a/litellm/utils.py b/litellm/utils.py
index fa3457143..33f4f1c3d 100644
--- a/litellm/utils.py
+++ b/litellm/utils.py
@@ -225,9 +225,25 @@ class ChatCompletionDeltaToolCall(OpenAIObject):
 
 
 class ChatCompletionMessageToolCall(OpenAIObject):
-    id: str
-    function: Function
-    type: str
+    def __init__(
+        self,
+        function: Function,
+        id: Optional[str] = None,
+        type: Optional[str] = None,
+        **params,
+    ):
+        super(ChatCompletionMessageToolCall, self).__init__(**params)
+        self.function = function
+
+        if id is not None:
+            self.id = id
+        else:
+            self.id = f"{uuid.uuid4()}"
+
+        if type is not None:
+            self.type = type
+        else:
+            self.type = "function"
 
 
 class Message(OpenAIObject):
@@ -6232,7 +6248,7 @@ def convert_to_model_response_object(
 
         return model_response_object
     except Exception as e:
-        raise Exception(f"Invalid response object {e}")
+        raise Exception(f"Invalid response object {traceback.format_exc()}")

From f95458dad8ad5ad8709e711c2be45ccff324d695 Mon Sep 17 00:00:00 2001
From: Krrish Dholakia
Date: Tue, 5 Mar 2024 18:10:43 -0800
Subject: [PATCH 14/14] fix(utils.py): handle dict object for
 chatcompletionmessagetoolcall

---
 litellm/utils.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/litellm/utils.py b/litellm/utils.py
index 33f4f1c3d..68dc137af 100644
--- a/litellm/utils.py
+++ b/litellm/utils.py
@@ -227,13 +227,16 @@ class ChatCompletionDeltaToolCall(OpenAIObject):
 class ChatCompletionMessageToolCall(OpenAIObject):
     def __init__(
         self,
-        function: Function,
+        function: Union[Dict, Function],
         id: Optional[str] = None,
         type: Optional[str] = None,
         **params,
     ):
         super(ChatCompletionMessageToolCall, self).__init__(**params)
-        self.function = function
+        if isinstance(function, Dict):
+            self.function = Function(**function)
+        else:
+            self.function = function
 
         if id is not None:
             self.id = id
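Taken together, patches 13 and 14 let `ChatCompletionMessageToolCall` be constructed from partial provider output: `id` falls back to a generated UUID, `type` falls back to `"function"`, and a raw dict is coerced into a `Function`. A usage sketch with illustrative values:

```python
from litellm.utils import ChatCompletionMessageToolCall

# A Mistral-style tool call with no id/type and a plain dict for the function:
tool_call = ChatCompletionMessageToolCall(
    function={"name": "get_weather", "arguments": '{"city": "Paris"}'}
)
print(tool_call.id)             # auto-generated UUID string
print(tool_call.type)           # "function"
print(tool_call.function.name)  # "get_weather"
```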