From db002315e30b13a0f30e90a4edd4ca55c58ecf15 Mon Sep 17 00:00:00 2001
From: ishaan-jaff
Date: Thu, 7 Mar 2024 18:33:09 -0800
Subject: [PATCH] (feat) print debug info per deployment

---
 litellm/proxy/proxy_config.yaml             | 59 ++-------------------
 litellm/proxy/tests/load_test_completion.py |  6 +--
 litellm/router.py                           | 45 ++++++++++++++++
 3 files changed, 51 insertions(+), 59 deletions(-)

diff --git a/litellm/proxy/proxy_config.yaml b/litellm/proxy/proxy_config.yaml
index 6b4b7a8f62..654a50b2f4 100644
--- a/litellm/proxy/proxy_config.yaml
+++ b/litellm/proxy/proxy_config.yaml
@@ -5,63 +5,12 @@ model_list:
       api_base: os.environ/AZURE_API_BASE
       api_key: os.environ/AZURE_API_KEY
       api_version: "2023-07-01-preview"
-    model_info:
-      mode: chat
-      max_tokens: 4096
-      base_model: azure/gpt-4-1106-preview
-      access_groups: ["public"]
-  - model_name: openai-gpt-3.5
+  - model_name: azure-gpt-3.5
     litellm_params:
       model: gpt-3.5-turbo
       api_key: os.environ/OPENAI_API_KEY
     model_info:
       access_groups: ["public"]
-  - model_name: anthropic-claude-v2.1
-    litellm_params:
-      model: bedrock/anthropic.claude-v2:1
-      timeout: 300 # sets a 5 minute timeout
-    model_info:
-      access_groups: ["private"]
-  - model_name: anthropic-claude-v2
-    litellm_params:
-      model: bedrock/anthropic.claude-v2
-  - model_name: bedrock-cohere
-    litellm_params:
-      model: bedrock/cohere.command-text-v14
-      timeout: 0.0001
-  - model_name: gpt-4
-    litellm_params:
-      model: azure/chatgpt-v-2
-      api_base: https://openai-gpt-4-test-v-1.openai.azure.com/
-      api_version: "2023-05-15"
-      api_key: os.environ/AZURE_API_KEY # The `os.environ/` prefix tells litellm to read this from the env. See https://docs.litellm.ai/docs/simple_proxy#load-api-keys-from-vault
-    model_info:
-      base_model: azure/gpt-4
-  - model_name: text-moderation-stable
-    litellm_params:
-      model: text-moderation-stable
-      api_key: os.environ/OPENAI_API_KEY
-litellm_settings:
-  fallbacks: [{"openai-gpt-3.5": ["azure-gpt-3.5"]}]
-  success_callback: ['langfuse']
-  # setting callback class
-  callbacks: custom_callbacks.proxy_handler_instance # sets litellm.callbacks = [proxy_handler_instance]
-
-general_settings:
-  master_key: sk-1234
-  alerting: ["slack"]
-  alerting_threshold: 10 # sends alerts if requests hang for 2 seconds
-  # database_type: "dynamo_db"
-  # database_args: { # 👈 all args - https://github.com/BerriAI/litellm/blob/befbcbb7ac8f59835ce47415c128decf37aac328/litellm/proxy/_types.py#L190
-  #   "billing_mode": "PAY_PER_REQUEST",
-  #   "region_name": "us-west-2",
-  #   "ssl_verify": False
-  # }
-
-
-
-
-
-environment_variables:
-  # otel: True # OpenTelemetry Logger
-  # master_key: sk-1234 # [OPTIONAL] Only use this if you to require all calls to contain this key (Authorization: Bearer sk-1234)
+router_settings:
+  set_verbose: True
+  debug_level: "DEBUG"
\ No newline at end of file
diff --git a/litellm/proxy/tests/load_test_completion.py b/litellm/proxy/tests/load_test_completion.py
index d708f30368..c6e5f480eb 100644
--- a/litellm/proxy/tests/load_test_completion.py
+++ b/litellm/proxy/tests/load_test_completion.py
@@ -4,9 +4,7 @@ import uuid
 import traceback
 
 
-litellm_client = AsyncOpenAI(
-    base_url="http://0.0.0.0:4000", api_key="sk-iNwH_oOtAQ6syi_2gkEOpQ"
-)
+litellm_client = AsyncOpenAI(base_url="http://0.0.0.0:4000", api_key="sk-1234")
 
 
 async def litellm_completion():
@@ -29,7 +27,7 @@
 async def main():
     for i in range(150):
         start = time.time()
-        n = 2000  # Number of concurrent tasks
+        n = 20  # Number of concurrent tasks
         tasks = [litellm_completion() for _ in range(n)]
 
         chat_completions = await asyncio.gather(*tasks)
diff --git a/litellm/router.py b/litellm/router.py
index 6f33d0b0d5..5c90ce8345 100644
--- a/litellm/router.py
+++ b/litellm/router.py
@@ -240,6 +240,21 @@ class Router:
                 {"caching_groups": caching_groups}
             )
 
+        self.deployment_stats: dict = {}  # used for debugging load balancing
+        """
+        deployment_stats = {
+            "122999-2828282-277":
+                {
+                    "model": "gpt-3",
+                    "api_base": "http://localhost:8000",
+                    "num_requests": 20,
+                    "avg_latency": 0.001,
+                    "num_failures": 0,
+                    "num_successes": 20
+                }
+        }
+        """
+
         ### ROUTING SETUP ###
         if routing_strategy == "least-busy":
             self.leastbusy_logger = LeastBusyLoggingHandler(
@@ -390,6 +405,10 @@
             messages=messages,
             specific_deployment=kwargs.pop("specific_deployment", None),
         )
+        if self.set_verbose == True and self.debug_level == "DEBUG":
+            # debug how often this deployment picked
+            self._print_deployment_metrics(deployment=deployment)
+
         kwargs.setdefault("metadata", {}).update(
             {
                 "deployment": deployment["litellm_params"]["model"],
@@ -2124,6 +2143,32 @@
             )
         return deployment
 
+    def _print_deployment_metrics(self, deployment):
+        litellm_params = deployment["litellm_params"]
+        api_base = litellm_params.get("api_base", "")
+        model = litellm_params.get("model", "")
+
+        model_id = deployment.get("model_info", {}).get("id", None)
+
+        # update self.deployment_stats
+        if model_id is not None:
+            if model_id in self.deployment_stats:
+                # only update num_requests
+                self.deployment_stats[model_id]["num_requests"] += 1
+            else:
+                self.deployment_stats[model_id] = {
+                    "api_base": api_base,
+                    "model": model,
+                    "num_requests": 1,
+                }
+        from pprint import pformat
+
+        # Assuming self.deployment_stats is your dictionary
+        formatted_stats = pformat(self.deployment_stats)
+
+        # Assuming verbose_router_logger is your logger
+        verbose_router_logger.info("self.deployment_stats: \n%s", formatted_stats)
+
     def flush_cache(self):
         litellm.cache = None
         self.cache.flush_cache()
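
Illustrative sketch, not part of the patch above: how the new router_settings block would reach the Router when used directly from Python. This assumes the proxy forwards router_settings entries as Router keyword arguments and that Router accepts set_verbose and debug_level parameters, as the added self.set_verbose / self.debug_level checks imply; the model_list entry mirrors the azure-gpt-3.5 deployment from the config diff.

from litellm import Router

router = Router(
    model_list=[
        {
            "model_name": "azure-gpt-3.5",
            "litellm_params": {
                "model": "gpt-3.5-turbo",
                # the os.environ/ prefix tells litellm to read the key from the environment
                "api_key": "os.environ/OPENAI_API_KEY",
            },
        }
    ],
    set_verbose=True,     # assumed kwarg, mirrors router_settings.set_verbose
    debug_level="DEBUG",  # assumed kwarg, mirrors router_settings.debug_level
)

With both flags set, each call routed through get_available_deployment would be expected to log the running self.deployment_stats table via the _print_deployment_metrics method added above.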