From db002315e30b13a0f30e90a4edd4ca55c58ecf15 Mon Sep 17 00:00:00 2001
From: ishaan-jaff
Date: Thu, 7 Mar 2024 18:33:09 -0800
Subject: [PATCH] (feat) print debug info per deployment

---
 litellm/proxy/proxy_config.yaml             | 59 ++-------------------
 litellm/proxy/tests/load_test_completion.py |  6 +--
 litellm/router.py                           | 45 ++++++++++++++++
 3 files changed, 51 insertions(+), 59 deletions(-)

diff --git a/litellm/proxy/proxy_config.yaml b/litellm/proxy/proxy_config.yaml
index 6b4b7a8f62..654a50b2f4 100644
--- a/litellm/proxy/proxy_config.yaml
+++ b/litellm/proxy/proxy_config.yaml
@@ -5,63 +5,12 @@ model_list:
       api_base: os.environ/AZURE_API_BASE
       api_key: os.environ/AZURE_API_KEY
       api_version: "2023-07-01-preview"
-    model_info:
-      mode: chat
-      max_tokens: 4096
-      base_model: azure/gpt-4-1106-preview
-      access_groups: ["public"]
-  - model_name: openai-gpt-3.5
+  - model_name: azure-gpt-3.5
     litellm_params:
       model: gpt-3.5-turbo
       api_key: os.environ/OPENAI_API_KEY
     model_info:
       access_groups: ["public"]
-  - model_name: anthropic-claude-v2.1
-    litellm_params:
-      model: bedrock/anthropic.claude-v2:1
-      timeout: 300 # sets a 5 minute timeout
-    model_info:
-      access_groups: ["private"]
-  - model_name: anthropic-claude-v2
-    litellm_params:
-      model: bedrock/anthropic.claude-v2
-  - model_name: bedrock-cohere
-    litellm_params:
-      model: bedrock/cohere.command-text-v14
-      timeout: 0.0001
-  - model_name: gpt-4
-    litellm_params:
-      model: azure/chatgpt-v-2
-      api_base: https://openai-gpt-4-test-v-1.openai.azure.com/
-      api_version: "2023-05-15"
-      api_key: os.environ/AZURE_API_KEY # The `os.environ/` prefix tells litellm to read this from the env. See https://docs.litellm.ai/docs/simple_proxy#load-api-keys-from-vault
-    model_info:
-      base_model: azure/gpt-4
-  - model_name: text-moderation-stable
-    litellm_params:
-      model: text-moderation-stable
-      api_key: os.environ/OPENAI_API_KEY
-litellm_settings:
-  fallbacks: [{"openai-gpt-3.5": ["azure-gpt-3.5"]}]
-  success_callback: ['langfuse']
-  # setting callback class
-  callbacks: custom_callbacks.proxy_handler_instance # sets litellm.callbacks = [proxy_handler_instance]
-
-general_settings:
-  master_key: sk-1234
-  alerting: ["slack"]
-  alerting_threshold: 10 # sends alerts if requests hang for 2 seconds
-  # database_type: "dynamo_db"
-  # database_args: { # 👈 all args - https://github.com/BerriAI/litellm/blob/befbcbb7ac8f59835ce47415c128decf37aac328/litellm/proxy/_types.py#L190
-  #   "billing_mode": "PAY_PER_REQUEST",
-  #   "region_name": "us-west-2",
-  #   "ssl_verify": False
-  # }
-
-
-
-
-
-environment_variables:
-  # otel: True # OpenTelemetry Logger
-  # master_key: sk-1234 # [OPTIONAL] Only use this if you to require all calls to contain this key (Authorization: Bearer sk-1234)
+router_settings:
+  set_verbose: True
+  debug_level: "DEBUG"
\ No newline at end of file
diff --git a/litellm/proxy/tests/load_test_completion.py b/litellm/proxy/tests/load_test_completion.py
index d708f30368..c6e5f480eb 100644
--- a/litellm/proxy/tests/load_test_completion.py
+++ b/litellm/proxy/tests/load_test_completion.py
@@ -4,9 +4,7 @@ import uuid
 import traceback
 
 
-litellm_client = AsyncOpenAI(
-    base_url="http://0.0.0.0:4000", api_key="sk-iNwH_oOtAQ6syi_2gkEOpQ"
-)
+litellm_client = AsyncOpenAI(base_url="http://0.0.0.0:4000", api_key="sk-1234")
 
 
 async def litellm_completion():
@@ -29,7 +27,7 @@
 async def main():
     for i in range(150):
         start = time.time()
-        n = 2000  # Number of concurrent tasks
+        n = 20  # Number of concurrent tasks
         tasks = [litellm_completion() for _ in range(n)]
 
         chat_completions = await asyncio.gather(*tasks)
diff --git a/litellm/router.py b/litellm/router.py
index 6f33d0b0d5..5c90ce8345 100644
--- a/litellm/router.py
+++ b/litellm/router.py
@@ -240,6 +240,21 @@ class Router:
                 {"caching_groups": caching_groups}
             )
 
+        self.deployment_stats: dict = {}  # used for debugging load balancing
+        """
+        deployment_stats = {
+            "122999-2828282-277":
+                {
+                    "model": "gpt-3",
+                    "api_base": "http://localhost:8000",
+                    "num_requests": 20,
+                    "avg_latency": 0.001,
+                    "num_failures": 0,
+                    "num_successes": 20
+                }
+        }
+        """
+
         ### ROUTING SETUP ###
         if routing_strategy == "least-busy":
             self.leastbusy_logger = LeastBusyLoggingHandler(
@@ -390,6 +405,10 @@
             messages=messages,
             specific_deployment=kwargs.pop("specific_deployment", None),
         )
+        if self.set_verbose == True and self.debug_level == "DEBUG":
+            # debug how often this deployment picked
+            self._print_deployment_metrics(deployment=deployment)
+
         kwargs.setdefault("metadata", {}).update(
             {
                 "deployment": deployment["litellm_params"]["model"],
@@ -2124,6 +2143,32 @@
             )
         return deployment
 
+    def _print_deployment_metrics(self, deployment):
+        litellm_params = deployment["litellm_params"]
+        api_base = litellm_params.get("api_base", "")
+        model = litellm_params.get("model", "")
+
+        model_id = deployment.get("model_info", {}).get("id", None)
+
+        # update self.deployment_stats
+        if model_id is not None:
+            if model_id in self.deployment_stats:
+                # only update num_requests
+                self.deployment_stats[model_id]["num_requests"] += 1
+            else:
+                self.deployment_stats[model_id] = {
+                    "api_base": api_base,
+                    "model": model,
+                    "num_requests": 1,
+                }
+        from pprint import pformat
+
+        # Assuming self.deployment_stats is your dictionary
+        formatted_stats = pformat(self.deployment_stats)
+
+        # Assuming verbose_router_logger is your logger
+        verbose_router_logger.info("self.deployment_stats: \n%s", formatted_stats)
+
     def flush_cache(self):
         litellm.cache = None
         self.cache.flush_cache()
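
Illustrative sketch, not part of the patch above: how the new router_settings block would reach the Router when used directly from Python. This assumes the proxy forwards router_settings entries as Router keyword arguments and that Router accepts set_verbose and debug_level parameters, as the added self.set_verbose / self.debug_level checks imply; the model_list entry mirrors the azure-gpt-3.5 deployment from the config diff.

from litellm import Router

router = Router(
    model_list=[
        {
            "model_name": "azure-gpt-3.5",
            "litellm_params": {
                "model": "gpt-3.5-turbo",
                # the os.environ/ prefix tells litellm to read the key from the environment
                "api_key": "os.environ/OPENAI_API_KEY",
            },
        }
    ],
    set_verbose=True,     # assumed kwarg, mirrors router_settings.set_verbose
    debug_level="DEBUG",  # assumed kwarg, mirrors router_settings.debug_level
)

With both flags set, each call routed through get_available_deployment would be expected to log the running self.deployment_stats table via the _print_deployment_metrics method added above.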