diff --git a/litellm/_logging.py b/litellm/_logging.py
index 2880fdba8a..363885c4c3 100644
--- a/litellm/_logging.py
+++ b/litellm/_logging.py
@@ -2,6 +2,14 @@ import logging
 
 set_verbose = False
 
+# Create a handler for the logger (you may need to adapt this based on your needs)
+handler = logging.StreamHandler()
+handler.setLevel(logging.DEBUG)
+
+# Create a formatter and set it for the handler
+formatter = logging.Formatter()
+handler.setFormatter(formatter)
+
 
 def print_verbose(print_statement):
     try:
@@ -14,3 +22,6 @@ def print_verbose(print_statement):
 verbose_proxy_logger = logging.getLogger("LiteLLM Proxy")
 verbose_router_logger = logging.getLogger("LiteLLM Router")
 verbose_logger = logging.getLogger("LiteLLM")
+
+# Add the handler to the logger
+verbose_router_logger.addHandler(handler)
diff --git a/litellm/proxy/proxy_server.py b/litellm/proxy/proxy_server.py
index b59b9887eb..d8e9efafd8 100644
--- a/litellm/proxy/proxy_server.py
+++ b/litellm/proxy/proxy_server.py
@@ -78,6 +78,7 @@ import pydantic
 from litellm.proxy._types import *
 from litellm.caching import DualCache
 from litellm.proxy.health_check import perform_health_check
+from litellm._logging import verbose_router_logger, verbose_proxy_logger
 
 litellm.suppress_debug_info = True
 from fastapi import (
@@ -155,20 +156,6 @@ async_result = None
 celery_app_conn = None
 celery_fn = None  # Redis Queue for handling requests
 ### logger ###
-# Configure the logging settings (you can customize this based on your needs)
-logging.basicConfig(level=logging.INFO)
-# Create a custom logger for "debug-proxy"
-debug_proxy_logger = logging.getLogger("LiteLLM Proxy")
-
-
-#### HELPER FUNCTIONS ####
-def print_verbose(print_statement):
-    try:
-        global user_debug
-        if user_debug:
-            print(print_statement)  # noqa
-    except:
-        pass
 
 
 def usage_telemetry(
@@ -251,24 +238,26 @@ async def user_api_key_auth(
         ## check for cache hit (In-Memory Cache)
         valid_token = user_api_key_cache.get_cache(key=api_key)
-        print_verbose(f"valid_token from cache: {valid_token}")
+        verbose_proxy_logger.debug(f"valid_token from cache: {valid_token}")
         if valid_token is None:
             ## check db
-            print_verbose(f"api key: {api_key}")
+            verbose_proxy_logger.debug(f"api key: {api_key}")
             valid_token = await prisma_client.get_data(
                 token=api_key, expires=datetime.utcnow().replace(tzinfo=timezone.utc)
             )
-            print_verbose(f"valid token from prisma: {valid_token}")
+            verbose_proxy_logger.debug(f"valid token from prisma: {valid_token}")
            user_api_key_cache.set_cache(key=api_key, value=valid_token, ttl=60)
         elif valid_token is not None:
-            print_verbose(f"API Key Cache Hit!")
+            verbose_proxy_logger.debug(f"API Key Cache Hit!")
         if valid_token:
             litellm.model_alias_map = valid_token.aliases
             config = valid_token.config
             if config != {}:
                 model_list = config.get("model_list", [])
                 llm_model_list = model_list
-                print_verbose(f"\n new llm router model list {llm_model_list}")
+                verbose_proxy_logger.debug(
+                    f"\n new llm router model list {llm_model_list}"
+                )
             if (
                 len(valid_token.models) == 0
             ):  # assume an empty model list means all models are allowed to be called
@@ -302,7 +291,7 @@ async def user_api_key_auth(
         else:
             raise Exception(f"Invalid token")
     except Exception as e:
-        # print_verbose(f"An exception occurred - {traceback.format_exc()}")
+        # verbose_proxy_logger.debug(f"An exception occurred - {traceback.format_exc()}")
         traceback.print_exc()
         if isinstance(e, HTTPException):
             raise e
@@ -322,7 +311,7 @@ def prisma_setup(database_url: Optional[str]):
                 database_url=database_url, proxy_logging_obj=proxy_logging_obj
             )
         except Exception as e:
-            print_verbose(
+            verbose_proxy_logger.debug(
                 f"Error when initializing prisma, Ensure you run pip install prisma {str(e)}"
             )
@@ -364,7 +353,7 @@ def load_from_azure_key_vault(use_azure_key_vault: bool = False):
                 f"Missing KVUri or client_id or client_secret or tenant_id from environment"
             )
     except Exception as e:
-        print_verbose(
+        verbose_proxy_logger.debug(
             "Error when loading keys from Azure Key Vault. Ensure you run `pip install azure-identity azure-keyvault-secrets`"
         )
@@ -373,7 +362,7 @@ def cost_tracking():
     global prisma_client
     if prisma_client is not None:
         if isinstance(litellm.success_callback, list):
-            print_verbose("setting litellm success callback to track cost")
+            verbose_proxy_logger.debug("setting litellm success callback to track cost")
             if (track_cost_callback) not in litellm.success_callback:  # type: ignore
                 litellm.success_callback.append(track_cost_callback)  # type: ignore
@@ -387,7 +376,7 @@ async def track_cost_callback(
     global prisma_client
     try:
         # check if it has collected an entire stream response
-        print_verbose(
+        verbose_proxy_logger.debug(
             f"kwargs stream: {kwargs.get('stream', None)} + complete streaming response: {kwargs.get('complete_streaming_response', None)}"
         )
         if "complete_streaming_response" in kwargs:
@@ -396,7 +385,7 @@ async def track_cost_callback(
             response_cost = litellm.completion_cost(
                 completion_response=completion_response
             )
-            print_verbose(f"streaming response_cost {response_cost}")
+            verbose_proxy_logger.debug(f"streaming response_cost {response_cost}")
             user_api_key = kwargs["litellm_params"]["metadata"].get(
                 "user_api_key", None
             )
@@ -422,12 +411,14 @@ async def track_cost_callback(
                 token=user_api_key, response_cost=response_cost, user_id=user_id
             )
     except Exception as e:
-        print_verbose(f"error in tracking cost callback - {str(e)}")
+        verbose_proxy_logger.debug(f"error in tracking cost callback - {str(e)}")
 
 
 async def update_prisma_database(token, response_cost, user_id=None):
     try:
-        print_verbose(f"Enters prisma db call, token: {token}; user_id: {user_id}")
+        verbose_proxy_logger.debug(
+            f"Enters prisma db call, token: {token}; user_id: {user_id}"
+        )
 
         ### UPDATE USER SPEND ###
         async def _update_user_db():
@@ -442,7 +433,7 @@ async def update_prisma_database(token, response_cost, user_id=None):
             # Calculate the new cost by adding the existing cost and response_cost
             new_spend = existing_spend + response_cost
-            print_verbose(f"new cost: {new_spend}")
+            verbose_proxy_logger.debug(f"new cost: {new_spend}")
 
             # Update the cost column for the given user id
             await prisma_client.update_data(user_id=user_id, data={"spend": new_spend})
@@ -450,7 +441,7 @@ async def update_prisma_database(token, response_cost, user_id=None):
         async def _update_key_db():
             # Fetch the existing cost for the given token
             existing_spend_obj = await prisma_client.get_data(token=token)
-            print_verbose(f"existing spend: {existing_spend_obj}")
+            verbose_proxy_logger.debug(f"existing spend: {existing_spend_obj}")
             if existing_spend_obj is None:
                 existing_spend = 0
             else:
@@ -458,7 +449,7 @@ async def update_prisma_database(token, response_cost, user_id=None):
             # Calculate the new cost by adding the existing cost and response_cost
             new_spend = existing_spend + response_cost
-            print_verbose(f"new cost: {new_spend}")
+            verbose_proxy_logger.debug(f"new cost: {new_spend}")
 
             # Update the cost column for the given token
             await prisma_client.update_data(token=token, data={"spend": new_spend})
@@ -467,7 +458,9 @@ async def update_prisma_database(token, response_cost, user_id=None):
         tasks.append(_update_key_db())
         await asyncio.gather(*tasks)
     except Exception as e:
-        print_verbose(f"Error updating Prisma database: {traceback.format_exc()}")
+        verbose_proxy_logger.debug(
+            f"Error updating Prisma database: {traceback.format_exc()}"
+        )
         pass
@@ -478,7 +471,7 @@ def run_ollama_serve():
         with open(os.devnull, "w") as devnull:
             process = subprocess.Popen(command, stdout=devnull, stderr=devnull)
     except Exception as e:
-        print_verbose(
+        verbose_proxy_logger.debug(
            f"""
            LiteLLM Warning: proxy started with `ollama` model\n`ollama serve` failed with Exception{e}. \nEnsure you run `ollama serve`
        """
        )
@@ -618,7 +611,7 @@ class ProxyConfig:
         printed_yaml = copy.deepcopy(config)
         printed_yaml.pop("environment_variables", None)
 
-        print_verbose(
+        verbose_proxy_logger.debug(
             f"Loaded config YAML (api_key and environment_variables are not shown):\n{json.dumps(printed_yaml, indent=2)}"
         )
@@ -649,7 +642,7 @@ class ProxyConfig:
 
                 cache_type = cache_params.get("type", "redis")
 
-                print_verbose(f"passed cache type={cache_type}")
+                verbose_proxy_logger.debug(f"passed cache type={cache_type}")
 
                 if cache_type == "redis":
                     cache_host = litellm.get_secret("REDIS_HOST", None)
@@ -693,14 +686,16 @@ class ProxyConfig:
                     litellm.callbacks = [
                         get_instance_fn(value=value, config_file_path=config_file_path)
                     ]
-                    print_verbose(
+                    verbose_proxy_logger.debug(
                         f"{blue_color_code} Initialized Callbacks - {litellm.callbacks} {reset_color_code}"
                     )
                 elif key == "post_call_rules":
                     litellm.post_call_rules = [
                         get_instance_fn(value=value, config_file_path=config_file_path)
                     ]
-                    print_verbose(f"litellm.post_call_rules: {litellm.post_call_rules}")
+                    verbose_proxy_logger.debug(
+                        f"litellm.post_call_rules: {litellm.post_call_rules}"
+                    )
                 elif key == "success_callback":
                     litellm.success_callback = []
@@ -714,7 +709,7 @@ class ProxyConfig:
                         # these are litellm callbacks - "langfuse", "sentry", "wandb"
                         else:
                             litellm.success_callback.append(callback)
-                    print_verbose(
+                    verbose_proxy_logger.debug(
                         f"{blue_color_code} Initialized Success Callbacks - {litellm.success_callback} {reset_color_code}"
                     )
                 elif key == "failure_callback":
@@ -730,7 +725,7 @@ class ProxyConfig:
                         # these are litellm callbacks - "langfuse", "sentry", "wandb"
                         else:
                             litellm.failure_callback.append(callback)
-                    print_verbose(
+                    verbose_proxy_logger.debug(
                         f"{blue_color_code} Initialized Success Callbacks - {litellm.failure_callback} {reset_color_code}"
                     )
                 elif key == "cache_params":
@@ -770,9 +765,9 @@ class ProxyConfig:
             ### CONNECT TO DATABASE ###
             database_url = general_settings.get("database_url", None)
             if database_url and database_url.startswith("os.environ/"):
-                print_verbose(f"GOING INTO LITELLM.GET_SECRET!")
+                verbose_proxy_logger.debug(f"GOING INTO LITELLM.GET_SECRET!")
                 database_url = litellm.get_secret(database_url)
-                print_verbose(f"RETRIEVED DB URL: {database_url}")
+                verbose_proxy_logger.debug(f"RETRIEVED DB URL: {database_url}")
             prisma_setup(database_url=database_url)
             ## COST TRACKING ##
             cost_tracking()
@@ -908,7 +903,7 @@ async def generate_key_helper_fn(
             "max_budget": max_budget,
             "user_email": user_email,
         }
-        print_verbose("PrismaClient: Before Insert Data")
+        verbose_proxy_logger.debug("PrismaClient: Before Insert Data")
         new_verification_token = await prisma_client.insert_data(
             data=verification_token_data
         )
@@ -966,7 +961,11 @@ async def initialize(
     user_model = model
     user_debug = debug
     if debug == True:  # this needs to be first, so users can see Router init debugg
-        litellm.set_verbose = True
+        from litellm._logging import verbose_router_logger, verbose_proxy_logger
+        import logging
+
+        verbose_router_logger.setLevel(level=logging.INFO)
+        verbose_router_logger.debug("initialized verbose router logger")
     dynamic_config = {"general": {}, user_model: {}}
     if config:
         (
@@ -1037,9 +1036,9 @@ async def initialize(
 
 # for streaming
 def data_generator(response):
-    print_verbose("inside generator")
+    verbose_proxy_logger.debug("inside generator")
     for chunk in response:
-        print_verbose(f"returned chunk: {chunk}")
+        verbose_proxy_logger.debug(f"returned chunk: {chunk}")
         try:
             yield f"data: {json.dumps(chunk.dict())}\n\n"
         except:
@@ -1047,11 +1046,11 @@ def data_generator(response):
 
 
 async def async_data_generator(response, user_api_key_dict):
-    print_verbose("inside generator")
+    verbose_proxy_logger.debug("inside generator")
     try:
         start_time = time.time()
         async for chunk in response:
-            print_verbose(f"returned chunk: {chunk}")
+            verbose_proxy_logger.debug(f"returned chunk: {chunk}")
             try:
                 yield f"data: {json.dumps(chunk.dict())}\n\n"
             except Exception as e:
@@ -1116,7 +1115,7 @@ async def startup_event():
 
     ### LOAD CONFIG ###
     worker_config = litellm.get_secret("WORKER_CONFIG")
-    print_verbose(f"worker_config: {worker_config}")
+    verbose_proxy_logger.debug(f"worker_config: {worker_config}")
     # check if it's a valid file path
     if os.path.isfile(worker_config):
         await initialize(**worker_config)
@@ -1131,7 +1130,7 @@ async def startup_event():
         _run_background_health_check()
     )  # start the background health check coroutine.
 
-    print_verbose(f"prisma client - {prisma_client}")
+    verbose_proxy_logger.debug(f"prisma client - {prisma_client}")
     if prisma_client:
         await prisma_client.connect()
@@ -1158,7 +1157,7 @@ def model_list():
         all_models = list(set(all_models + [m["model_name"] for m in llm_model_list]))
     if user_model is not None:
         all_models += [user_model]
-    print_verbose(f"all_models: {all_models}")
+    verbose_proxy_logger.debug(f"all_models: {all_models}")
     ### CHECK OLLAMA MODELS ###
     try:
         response = requests.get("http://0.0.0.0:11434/api/tags")
@@ -1277,7 +1276,7 @@ async def completion(
         else:
             model_id = ""
 
-        print_verbose(f"final response: {response}")
+        verbose_proxy_logger.debug(f"final response: {response}")
         if (
             "stream" in data and data["stream"] == True
         ):  # use generate_responses to stream responses
@@ -1302,8 +1301,8 @@ async def completion(
             fastapi_response.headers["x-litellm-model-id"] = model_id
         return response
     except Exception as e:
-        print_verbose(f"EXCEPTION RAISED IN PROXY MAIN.PY")
-        print_verbose(
+        verbose_proxy_logger.debug(f"EXCEPTION RAISED IN PROXY MAIN.PY")
+        verbose_proxy_logger.debug(
             f"\033[1;31mAn error occurred: {e}\n\n Debug this by setting `--debug`, e.g. `litellm --model gpt-3.5-turbo --debug`"
         )
         traceback.print_exc()
@@ -1358,13 +1357,13 @@ async def chat_completion(
 
         ## Cache Controls
         headers = request.headers
-        print_verbose(f"Request Headers: {headers}")
+        verbose_proxy_logger.debug(f"Request Headers: {headers}")
         cache_control_header = headers.get("Cache-Control", None)
         if cache_control_header:
             cache_dict = parse_cache_control(cache_control_header)
             data["ttl"] = cache_dict.get("s-maxage")
 
-        print_verbose(f"receiving data: {data}")
+        verbose_proxy_logger.debug(f"receiving data: {data}")
         data["model"] = (
             general_settings.get("completion_model", None)  # server default
             or user_model  # model name passed via cli args
@@ -1378,7 +1377,7 @@ async def chat_completion(
             data["user"] = user_api_key_dict.user_id
 
         if "metadata" in data:
-            print_verbose(f'received metadata: {data["metadata"]}')
+            verbose_proxy_logger.debug(f'received metadata: {data["metadata"]}')
             data["metadata"]["user_api_key"] = user_api_key_dict.api_key
             data["metadata"]["user_api_key_user_id"] = user_api_key_dict.user_id
             data["metadata"]["headers"] = dict(request.headers)
@@ -1469,7 +1468,7 @@ async def chat_completion(
         await proxy_logging_obj.post_call_failure_hook(
             user_api_key_dict=user_api_key_dict, original_exception=e
         )
-        print_verbose(
+        verbose_proxy_logger.debug(
             f"\033[1;31mAn error occurred: {e}\n\n Debug this by setting `--debug`, e.g. `litellm --model gpt-3.5-turbo --debug`"
         )
         router_model_names = (
@@ -1478,17 +1477,17 @@ async def chat_completion(
             else []
         )
         if llm_router is not None and data.get("model", "") in router_model_names:
-            print_verbose("Results from router")
-            print_verbose("\nRouter stats")
-            print_verbose("\nTotal Calls made")
+            verbose_proxy_logger.debug("Results from router")
+            verbose_proxy_logger.debug("\nRouter stats")
+            verbose_proxy_logger.debug("\nTotal Calls made")
             for key, value in llm_router.total_calls.items():
-                print_verbose(f"{key}: {value}")
-            print_verbose("\nSuccess Calls made")
+                verbose_proxy_logger.debug(f"{key}: {value}")
+            verbose_proxy_logger.debug("\nSuccess Calls made")
             for key, value in llm_router.success_calls.items():
-                print_verbose(f"{key}: {value}")
-            print_verbose("\nFail Calls made")
+                verbose_proxy_logger.debug(f"{key}: {value}")
+            verbose_proxy_logger.debug("\nFail Calls made")
             for key, value in llm_router.fail_calls.items():
-                print_verbose(f"{key}: {value}")
+                verbose_proxy_logger.debug(f"{key}: {value}")
         if user_debug:
             traceback.print_exc()
@@ -1791,7 +1790,7 @@ async def generate_key_fn(
     - expires: (datetime) Datetime object for when key expires.
    - user_id: (str) Unique user id - used for tracking spend across multiple keys for same user id.
    """
-    print_verbose("entered /key/generate")
+    verbose_proxy_logger.debug("entered /key/generate")
     data_json = data.json()  # type: ignore
     response = await generate_key_helper_fn(**data_json)
     return GenerateKeyResponse(
@@ -2007,9 +2006,9 @@ async def add_new_model(model_params: ModelParams):
         # Load existing config
         config = await proxy_config.get_config()
 
-        print_verbose(f"User config path: {user_config_file_path}")
+        verbose_proxy_logger.debug(f"User config path: {user_config_file_path}")
 
-        print_verbose(f"Loaded config: {config}")
+        verbose_proxy_logger.debug(f"Loaded config: {config}")
         # Add the new model to the config
         model_info = model_params.model_info.json()
         model_info = {k: v for k, v in model_info.items() if v is not None}
@@ -2021,7 +2020,7 @@ async def add_new_model(model_params: ModelParams):
             }
         )
 
-        print_verbose(f"updated model list: {config['model_list']}")
+        verbose_proxy_logger.debug(f"updated model list: {config['model_list']}")
 
         # Save new config
         await proxy_config.save_config(new_config=config)
@@ -2071,7 +2070,7 @@ async def model_info_v1(request: Request):
         # don't return the api key
         model["litellm_params"].pop("api_key", None)
 
-    print_verbose(f"all_models: {all_models}")
+    verbose_proxy_logger.debug(f"all_models: {all_models}")
     return {"data": all_models}
@@ -2136,7 +2135,7 @@ async def _litellm_chat_completions_worker(data, user_api_key_dict):
                 user_api_key_dict=user_api_key_dict, data=data, call_type="completion"
             )
 
-            print_verbose(f"_litellm_chat_completions_worker started")
+            verbose_proxy_logger.debug(f"_litellm_chat_completions_worker started")
             ### ROUTE THE REQUEST ###
             router_model_names = (
                 [m["model_name"] for m in llm_model_list]
@@ -2162,17 +2161,17 @@ async def _litellm_chat_completions_worker(data, user_api_key_dict):
             else:  # router is not set
                 response = await litellm.acompletion(**data)
 
-            print_verbose(f"final response: {response}")
+            verbose_proxy_logger.debug(f"final response: {response}")
             return response
         except HTTPException as e:
-            print_verbose(
+            verbose_proxy_logger.debug(
                 f"EXCEPTION RAISED IN _litellm_chat_completions_worker - {e.status_code}; {e.detail}"
             )
             if (
                 e.status_code == 429
                 and "Max parallel request limit reached" in e.detail
             ):
-                print_verbose(f"Max parallel request limit reached!")
+                verbose_proxy_logger.debug(f"Max parallel request limit reached!")
                 timeout = litellm._calculate_retry_after(
                     remaining_retries=3, max_retries=3, min_timeout=1
                 )
@@ -2212,7 +2211,7 @@ async def async_queue_request(
             "body": copy.copy(data),  # use copy instead of deepcopy
         }
 
-        print_verbose(f"receiving data: {data}")
+        verbose_proxy_logger.debug(f"receiving data: {data}")
         data["model"] = (
             general_settings.get("completion_model", None)  # server default
             or user_model  # model name passed via cli args
@@ -2226,7 +2225,7 @@ async def async_queue_request(
             data["user"] = user_api_key_dict.user_id
 
         if "metadata" in data:
-            print_verbose(f'received metadata: {data["metadata"]}')
+            verbose_proxy_logger.debug(f'received metadata: {data["metadata"]}')
             data["metadata"]["user_api_key"] = user_api_key_dict.api_key
             data["metadata"]["headers"] = dict(request.headers)
             data["metadata"]["user_api_key_user_id"] = user_api_key_dict.user_id
@@ -2300,7 +2299,7 @@ async def update_config(config_info: ConfigYAML):
         config = await proxy_config.get_config()
         backup_config = copy.deepcopy(config)
 
-        print_verbose(f"Loaded config: {config}")
+        verbose_proxy_logger.debug(f"Loaded config: {config}")
 
         # update the general settings
         if config_info.general_settings is not None:
@@ -2484,7 +2483,7 @@ async def get_routes():
 async def shutdown_event():
     global prisma_client, master_key, user_custom_auth
     if prisma_client:
-        print_verbose("Disconnecting from Prisma")
+        verbose_proxy_logger.debug("Disconnecting from Prisma")
         await prisma_client.disconnect()
 
     ## RESET CUSTOM VARIABLES ##
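
Usage note (not part of the patch): with this change the proxy emits its messages through the standard `logging` module under the logger names defined in litellm/_logging.py ("LiteLLM Proxy", "LiteLLM Router", "LiteLLM"), instead of the removed `print_verbose` helper. A minimal sketch of how a consumer might surface those debug records, using only the stdlib logging API; the handler and format string below are illustrative assumptions, not part of this diff:

    import logging

    # Logger names come from litellm/_logging.py in this patch.
    proxy_logger = logging.getLogger("LiteLLM Proxy")
    router_logger = logging.getLogger("LiteLLM Router")

    # Attach a console handler and lower the level so .debug() records become visible.
    handler = logging.StreamHandler()
    handler.setFormatter(logging.Formatter("%(asctime)s %(name)s %(levelname)s: %(message)s"))
    for logger in (proxy_logger, router_logger):
        logger.addHandler(handler)
        logger.setLevel(logging.DEBUG)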