forked from phoenix/litellm-mirror
Merge branch 'main' into litellm_fixes_proxy_db
This commit is contained in:
commit 03fa322b38
11 changed files with 117 additions and 57 deletions

@@ -202,7 +202,7 @@ print(response)
</Tabs>

-## Save Model-specific params (API Base, API Keys, Temperature, Max Tokens, Seed, Organization, Headers etc.)
+## Save Model-specific params (API Base, Keys, Temperature, Max Tokens, Organization, Headers etc.)

You can use the config to save model-specific information like api_base, api_key, temperature, max_tokens, etc.

[**All input params**](https://docs.litellm.ai/docs/completion/input#input-params-1)

@@ -244,6 +244,45 @@ $ litellm --config /path/to/config.yaml
```

+## Load Balancing
+
+Use this to call multiple instances of the same model and configure things like [routing strategy](../routing.md#advanced).
+
+```yaml
+router_settings:
+  routing_strategy: "latency-based-routing" # routes to the fastest deployment in the group
+
+model_list:
+  - model_name: zephyr-beta
+    litellm_params:
+      model: huggingface/HuggingFaceH4/zephyr-7b-beta
+      api_base: http://0.0.0.0:8001
+  - model_name: zephyr-beta
+    litellm_params:
+      model: huggingface/HuggingFaceH4/zephyr-7b-beta
+      api_base: http://0.0.0.0:8002
+  - model_name: zephyr-beta
+    litellm_params:
+      model: huggingface/HuggingFaceH4/zephyr-7b-beta
+      api_base: http://0.0.0.0:8003
+  - model_name: gpt-3.5-turbo
+    litellm_params:
+      model: gpt-3.5-turbo
+      api_key: <my-openai-key>
+  - model_name: gpt-3.5-turbo-16k
+    litellm_params:
+      model: gpt-3.5-turbo-16k
+      api_key: <my-openai-key>
+
+litellm_settings:
+  num_retries: 3 # retry each model_name up to 3 times (e.g. zephyr-beta)
+  request_timeout: 10 # raise a Timeout error if a call takes longer than 10s; sets litellm.request_timeout
+  fallbacks: [{"zephyr-beta": ["gpt-3.5-turbo"]}] # fall back to gpt-3.5-turbo if the call still fails after num_retries
+  context_window_fallbacks: [{"zephyr-beta": ["gpt-3.5-turbo-16k"]}, {"gpt-3.5-turbo": ["gpt-3.5-turbo-16k"]}] # fall back to gpt-3.5-turbo-16k on context window errors
+  allowed_fails: 3 # cooldown a deployment if it fails more than 3 calls in a minute
+```

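For a sense of how the config above behaves from the client side, here is a minimal sketch. It assumes the proxy is running locally on port 8000 (as in the curl examples later in this doc) and uses the OpenAI v1 Python client:

```python
# Minimal sketch, not part of the diff: calling a proxy started with the
# config above. Requests to "zephyr-beta" are load-balanced across the three
# Hugging Face deployments; after num_retries failures the router falls back
# to gpt-3.5-turbo per the `fallbacks` setting.
import openai

client = openai.OpenAI(api_key="anything", base_url="http://0.0.0.0:8000")

response = client.chat.completions.create(
    model="zephyr-beta",
    messages=[{"role": "user", "content": "Hello!"}],
)
print(response.choices[0].message.content)
```
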
## Set Azure `base_model` for cost tracking

**Problem**: Azure returns `gpt-4` in the response when `azure/gpt-4-1106-preview` is used. This leads to inaccurate cost tracking.

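A rough illustration of the discrepancy, sketched with litellm's pricing lookup (assuming `completion_cost` accepts the `model`/`prompt`/`completion` form):

```python
# Sketch only: gpt-4 and gpt-4-1106-preview carry different per-token prices,
# so trusting the model name Azure echoes back would mis-cost the call.
import litellm

cost_reported = litellm.completion_cost(model="gpt-4", prompt="Hey!", completion="Hi!")
cost_actual = litellm.completion_cost(model="gpt-4-1106-preview", prompt="Hey!", completion="Hi!")
print(cost_reported, cost_actual)  # the two values differ
```
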
@@ -512,30 +551,6 @@ curl --location 'http://0.0.0.0:8000/chat/completions' \
```

-## Router Settings
-
-Use this to configure things like routing strategy.
-
-```yaml
-router_settings:
-  routing_strategy: "least-busy"
-
-model_list: # will route requests to the least busy ollama model
-  - model_name: ollama-models
-    litellm_params:
-      model: "ollama/mistral"
-      api_base: "http://127.0.0.1:8001"
-  - model_name: ollama-models
-    litellm_params:
-      model: "ollama/codellama"
-      api_base: "http://127.0.0.1:8002"
-  - model_name: ollama-models
-    litellm_params:
-      model: "ollama/llama2"
-      api_base: "http://127.0.0.1:8003"
-```

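For reference, the least-busy setup removed here maps directly onto litellm's Python `Router`; a sketch, assuming `Router` accepts `routing_strategy="least-busy"`:

```python
# Sketch: programmatic twin of the removed YAML. Requests to "ollama-models"
# go to whichever deployment currently has the fewest in-flight calls.
from litellm import Router

router = Router(
    model_list=[
        {"model_name": "ollama-models",
         "litellm_params": {"model": "ollama/mistral", "api_base": "http://127.0.0.1:8001"}},
        {"model_name": "ollama-models",
         "litellm_params": {"model": "ollama/codellama", "api_base": "http://127.0.0.1:8002"}},
        {"model_name": "ollama-models",
         "litellm_params": {"model": "ollama/llama2", "api_base": "http://127.0.0.1:8003"}},
    ],
    routing_strategy="least-busy",
)

response = router.completion(
    model="ollama-models",
    messages=[{"role": "user", "content": "Hey, how's it going?"}],
)
```
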
## Configure DB Pool Limits + Connection Timeouts

```yaml

@@ -124,7 +124,6 @@ class CustomLogger: # https://docs.litellm.ai/docs/observability/custom_callbac
                start_time,
                end_time,
            )
            print_verbose(f"Custom Logger - final response object: {response_obj}")
        except:
            # traceback.print_exc()
            print_verbose(f"Custom Logger Error - {traceback.format_exc()}")

@@ -142,7 +141,6 @@ class CustomLogger: # https://docs.litellm.ai/docs/observability/custom_callbac
                start_time,
                end_time,
            )
            print_verbose(f"Custom Logger - final response object: {response_obj}")
        except:
            # traceback.print_exc()
            print_verbose(f"Custom Logger Error - {traceback.format_exc()}")

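The snippets above come from litellm's `CustomLogger` callback interface. A minimal working subclass, sketched from the docs page linked in the hunk header:

```python
# Sketch of a custom callback handler, based on
# https://docs.litellm.ai/docs/observability/custom_callback
import litellm
from litellm.integrations.custom_logger import CustomLogger


class MyHandler(CustomLogger):
    def log_success_event(self, kwargs, response_obj, start_time, end_time):
        print(f"call succeeded in {end_time - start_time}")

    def log_failure_event(self, kwargs, response_obj, start_time, end_time):
        print("call failed")


litellm.callbacks = [MyHandler()]
response = litellm.completion(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "Hi"}],
)
```
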
@@ -77,9 +77,9 @@ class AlephAlphaConfig:
    - `control_log_additive` (boolean; default value: true): Method of applying control to attention scores.
    """

-    maximum_tokens: Optional[
-        int
-    ] = litellm.max_tokens  # aleph alpha requires max tokens
+    maximum_tokens: Optional[int] = (
+        litellm.max_tokens
+    )  # aleph alpha requires max tokens
    minimum_tokens: Optional[int] = None
    echo: Optional[bool] = None
    temperature: Optional[int] = None

@@ -285,7 +285,10 @@ def completion(
        ## CALCULATING USAGE - baseten charges on time, not tokens - have some mapping of cost here.
        prompt_tokens = len(encoding.encode(prompt))
        completion_tokens = len(
-            encoding.encode(model_response["choices"][0]["message"]["content"])
+            encoding.encode(
+                model_response["choices"][0]["message"]["content"],
+                disallowed_special=(),
+            )
        )

        model_response["created"] = int(time.time())

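The added `disallowed_special=()` matters when model output happens to contain tiktoken special-token text such as `<|endoftext|>`: by default `encode` raises a `ValueError` on those. A standalone sketch with tiktoken:

```python
import tiktoken

enc = tiktoken.get_encoding("cl100k_base")
text = "model output containing <|endoftext|> mid-string"

# enc.encode(text)  # raises ValueError: special tokens are disallowed by default
tokens = enc.encode(text, disallowed_special=())  # treat them as plain text
print(len(tokens))
```
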
@@ -575,6 +575,7 @@ def anthropic_messages_pt(messages: list):
        if messages[i]["role"] == "assistant":
            last_assistant_message_idx = i

    new_messages.append(messages[-1])
    if last_assistant_message_idx is not None:
        new_messages[last_assistant_message_idx]["content"] = new_messages[
            last_assistant_message_idx

@@ -1067,6 +1067,11 @@ async def update_database(
                    )
                    data_list.append(existing_spend_obj)

+                if custom_db_client is not None and user_id is not None:
+                    new_spend = data_list[0].spend
+                    await custom_db_client.update_data(
+                        key=user_id, value={"spend": new_spend}, table_name="user"
+                    )
                # Update the cost column for the given user id
                if prisma_client is not None:
                    await prisma_client.update_data(

@@ -1074,13 +1079,10 @@
                        query_type="update_many",
                        table_name="user",
                    )
-                elif custom_db_client is not None and user_id is not None:
-                    new_spend = data_list[0].spend
-                    await custom_db_client.update_data(
-                        key=user_id, value={"spend": new_spend}, table_name="user"
-                    )
            except Exception as e:
-                verbose_proxy_logger.info(f"Update User DB call failed to execute")
+                verbose_proxy_logger.info(
+                    f"Update User DB call failed to execute {str(e)}"
+                )

        ### UPDATE KEY SPEND ###
        async def _update_key_db():

@@ -1215,7 +1217,9 @@
                await custom_db_client.insert_data(payload, table_name="spend")

            except Exception as e:
-                verbose_proxy_logger.info(f"Update Spend Logs DB failed to execute")
+                verbose_proxy_logger.info(
+                    f"Update Spend Logs DB failed to execute - {str(e)}"
+                )

        ### UPDATE KEY SPEND ###
        async def _update_team_db():

@@ -1286,7 +1290,9 @@
                    valid_token.spend = new_spend
                    user_api_key_cache.set_cache(key=token, value=valid_token)
            except Exception as e:
-                verbose_proxy_logger.info(f"Update Team DB failed to execute")
+                verbose_proxy_logger.info(
+                    f"Update Team DB failed to execute - {str(e)}"
+                )

        asyncio.create_task(_update_user_db())
        asyncio.create_task(_update_key_db())

@@ -64,7 +64,7 @@ class ProxyLogging:
        litellm.callbacks.append(self.max_parallel_request_limiter)
        litellm.callbacks.append(self.max_budget_limiter)
        litellm.callbacks.append(self.cache_control_check)
-        # litellm.callbacks.append(self.response_taking_too_long_callback)
+        litellm.success_callback.append(self.response_taking_too_long_callback)
        for callback in litellm.callbacks:
            if callback not in litellm.input_callback:
                litellm.input_callback.append(callback)

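Context for the one-line swap above: `litellm.callbacks` holds `CustomLogger` objects, while `litellm.success_callback` also accepts plain functions that run after each successful call. A toy illustration of the latter (the handler name is hypothetical):

```python
import litellm


def track_latency(kwargs, completion_response, start_time, end_time):
    # runs after every successful litellm call
    print("latency:", end_time - start_time)


litellm.success_callback = [track_latency]
```
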
@@ -82,6 +82,23 @@ def test_completion_claude():
# test_completion_claude()


+def test_completion_claude_3_empty_response():
+    messages = [
+        {
+            "role": "system",
+            "content": "You are 2twNLGfqk4GMOn3ffp4p.",
+        },
+        {"role": "user", "content": "Hi gm!"},
+        {"role": "assistant", "content": "Good morning! How are you doing today?"},
+        {
+            "role": "user",
+            "content": "I was hoping we could chat a bit",
+        },
+    ]
+    response = litellm.completion(model="claude-3-opus-20240229", messages=messages)
+    print(response)
+
+
def test_completion_claude_3():
    litellm.set_verbose = True
    messages = [{"role": "user", "content": "Hello, world"}]

@@ -225,9 +225,28 @@ class ChatCompletionDeltaToolCall(OpenAIObject):


class ChatCompletionMessageToolCall(OpenAIObject):
    id: str
    function: Function
    type: str
+
+    def __init__(
+        self,
+        function: Union[Dict, Function],
+        id: Optional[str] = None,
+        type: Optional[str] = None,
+        **params,
+    ):
+        super(ChatCompletionMessageToolCall, self).__init__(**params)
+        if isinstance(function, Dict):
+            self.function = Function(**function)
+        else:
+            self.function = function
+
+        if id is not None:
+            self.id = id
+        else:
+            self.id = f"{uuid.uuid4()}"
+
+        if type is not None:
+            self.type = type
+        else:
+            self.type = "function"


class Message(OpenAIObject):

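A quick sketch of what the constructor added above enables, assuming the class is importable from `litellm.utils`: a tool call can now be built from a plain dict, with `id` and `type` auto-filled:

```python
# Sketch: `function` may be a dict; `id` defaults to a fresh uuid and
# `type` to "function" when omitted.
from litellm.utils import ChatCompletionMessageToolCall

tool_call = ChatCompletionMessageToolCall(
    function={"name": "get_weather", "arguments": '{"city": "SF"}'}
)
print(tool_call.id, tool_call.type)  # e.g. a uuid string and "function"
```
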
@@ -772,10 +791,10 @@ class ImageResponse(OpenAIObject):


############################################################
-def print_verbose(print_statement):
+def print_verbose(print_statement, logger_only: bool = False):
    try:
        verbose_logger.debug(print_statement)
-        if litellm.set_verbose:
+        if litellm.set_verbose == True and logger_only == False:
            print(print_statement)  # noqa
    except:
        pass

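With the new `logger_only` flag, a caller can push a message to the debug logger without echoing it to stdout even when `set_verbose` is on:

```python
print_verbose("noisy per-chunk detail", logger_only=True)  # debug log only, never printed
```
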
@@ -1738,9 +1757,10 @@ class Logging:
                            end_time=end_time,
                        )
                    if callable(callback):  # custom logger functions
-                        print_verbose(
-                            f"Making async function logging call for {callback}, result={result} - {self.model_call_details}"
-                        )
+                        # print_verbose(
+                        #     f"Making async function logging call for {callback}, result={result} - {self.model_call_details}",
+                        #     logger_only=True,
+                        # )
                        if self.stream:
                            if (
                                "async_complete_streaming_response"

@@ -6231,7 +6251,7 @@ def convert_to_model_response_object(

        return model_response_object
    except Exception as e:
-        raise Exception(f"Invalid response object {e}")
+        raise Exception(f"Invalid response object {traceback.format_exc()}")


def acreate(*args, **kwargs):  ## Thin client to handle the acreate langchain call

@@ -40,8 +40,8 @@ litellm_settings:
  budget_duration: 30d
general_settings:
  master_key: sk-1234 # [OPTIONAL] Only use this if you want to require all calls to contain this key (Authorization: Bearer sk-1234)
-  proxy_budget_rescheduler_min_time: 3
-  proxy_budget_rescheduler_max_time: 6
+  proxy_budget_rescheduler_min_time: 10
+  proxy_budget_rescheduler_max_time: 12
  # database_url: "postgresql://<user>:<password>@<host>:<port>/<dbname>" # [OPTIONAL] use for token-based auth to proxy

environment_variables:

@@ -1,6 +1,6 @@
[tool.poetry]
name = "litellm"
-version = "1.29.2"
+version = "1.29.4"
description = "Library to easily interface with LLM API providers"
authors = ["BerriAI"]
license = "MIT"

@@ -74,7 +74,7 @@ requires = ["poetry-core", "wheel"]
build-backend = "poetry.core.masonry.api"

[tool.commitizen]
-version = "1.29.2"
+version = "1.29.4"
version_files = [
    "pyproject.toml:^version"
]

@@ -469,7 +469,7 @@ async def test_key_with_budgets():
            break
        except:
            i + 1
-            await asyncio.sleep(5)
+            await asyncio.sleep(10)
    assert reset_at_init_value != reset_at_new_value