diff --git a/docs/my-website/docs/proxy/prometheus.md b/docs/my-website/docs/proxy/prometheus.md
index 6ccf0e44e..a1b9feb40 100644
--- a/docs/my-website/docs/proxy/prometheus.md
+++ b/docs/my-website/docs/proxy/prometheus.md
@@ -58,17 +58,9 @@ http://localhost:4000/metrics
 
 ## 📈 Metrics Tracked
 
-### Error Metrics
+### Virtual Keys, Teams, Internal Users Metrics
 
-| Metric Name          | Description                          |
-|----------------------|--------------------------------------|
-| `litellm_error_code_metric_total` | Total number of errors by error code and model |
-
-This metric provides a count of errors encountered, categorized by error code and model. For example:
-
-
-
-### Proxy Requests / Spend Metrics
+Use this for tracking metrics per [user, key, team, etc.](virtual_keys)
 
 | Metric Name          | Description                          |
 |----------------------|--------------------------------------|
@@ -76,11 +68,32 @@ This metric provides a count of errors encountered, categorized by error code an
 | `litellm_spend_metric`         | Total Spend, per `"user", "key", "model", "team", "end-user"` |
 | `litellm_total_tokens`         | input + output tokens per `"user", "key", "model", "team", "end-user"` |
 
-### Error Monitoring Metrics
+
+
+### LLM API / Provider Metrics
+
+Use this for LLM API error monitoring and for tracking remaining rate limits and token limits
 
 | Metric Name          | Description                          |
-| `litellm_llm_api_failed_requests_metric`             | Number of failed LLM API requests per `"user", "key", "model", "team", "end-user"` |
-| `litellm_error_code_metric_total` | Total number of errors by error code and model |
+|----------------------|--------------------------------------|
+| `litellm_deployment_success_responses`              | Total number of successful LLM API calls for deployment |
+| `litellm_deployment_failure_responses`              | Total number of failed LLM API calls for a specific LLM deployment. exception_status is the status of the exception from the LLM API |
+| `litellm_deployment_total_requests`              | Total number of LLM API calls for deployment - success + failure |
+| `litellm_remaining_requests_metric`             | Track `x-ratelimit-remaining-requests` returned from LLM API Deployment |
+| `litellm_remaining_tokens`                 | Track `x-ratelimit-remaining-tokens` returned from LLM API Deployment |
+| `litellm_deployment_state`                 | The state of the deployment: 0 = healthy, 1 = partial outage, 2 = complete outage. |
+| `litellm_deployment_latency_per_output_token`       | Latency per output token for deployment |
+
+### Load Balancing, Fallback, Cooldown Metrics
+
+Use this for tracking [litellm router](../routing) load balancing metrics
+
+| Metric Name          | Description                          |
+|----------------------|--------------------------------------|
+| `litellm_deployment_cooled_down` | Number of times a deployment has been cooled down by LiteLLM load balancing logic. exception_status is the status of the exception that caused the deployment to be cooled down |
+| `litellm_deployment_successful_fallbacks` | Number of successful fallback requests from primary model -> fallback model |
+| `litellm_deployment_failed_fallbacks` | Number of failed fallback requests from primary model -> fallback model |
+
 
 
 ### Request Latency Metrics
@@ -90,24 +103,6 @@ This metric provides a count of errors encountered, categorized by error code an
 | `litellm_llm_api_latency_metric` | latency (seconds) for just the LLM API call - tracked for labels `litellm_call_id`, `model` |
 
-
-### LLM API / Provider Metrics
-
-| Metric Name          | Description                          |
-|----------------------|--------------------------------------|
-| `litellm_deployment_state`                 | The state of the deployment: 0 = healthy, 1 = partial outage, 2 = complete outage. |
-| `litellm_remaining_requests_metric`             | Track `x-ratelimit-remaining-requests` returned from LLM API Deployment |
-| `litellm_remaining_tokens`                 | Track `x-ratelimit-remaining-tokens` return from LLM API Deployment |
- `litellm_deployment_success_responses`              | Total number of successful LLM API calls for deployment |
-| `litellm_deployment_failure_responses`              | Total number of failed LLM API calls for deployment |
-| `litellm_deployment_total_requests`              | Total number of LLM API calls for deployment - success + failure |
-| `litellm_deployment_latency_per_output_token`       | Latency per output token for deployment |
-| `litellm_deployment_successful_fallbacks` | Number of successful fallback requests from primary model -> fallback model |
-| `litellm_deployment_failed_fallbacks` | Number of failed fallback requests from primary model -> fallback model |
-
-
-
 ### Budget Metrics
 | Metric Name          | Description                          |
 |----------------------|--------------------------------------|
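The tables above only document the metric names; a quick way to confirm they are actually being emitted is to scrape the `/metrics` endpoint mentioned earlier in this doc. A minimal sketch, assuming the proxy is running locally on port 4000 and the `requests` package is installed (both assumptions, not part of this PR):

```python
import requests

# Scrape the Prometheus endpoint exposed by the LiteLLM proxy (assumed to be on localhost:4000)
resp = requests.get("http://localhost:4000/metrics")
resp.raise_for_status()

# Print only the deployment / rate-limit series documented in the tables above
for line in resp.text.splitlines():
    if line.startswith("litellm_deployment_") or line.startswith("litellm_remaining_"):
        print(line)
```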
diff --git a/litellm/integrations/prometheus.py b/litellm/integrations/prometheus.py
index ed5035074..0bf7079d0 100644
--- a/litellm/integrations/prometheus.py
+++ b/litellm/integrations/prometheus.py
@@ -26,8 +26,6 @@ class PrometheusLogger(CustomLogger):
         try:
             from prometheus_client import Counter, Gauge, Histogram
 
-            from litellm.proxy.proxy_server import premium_user
-
             verbose_logger.warning(
                 "🚨🚨🚨 Prometheus Metrics will be moving to LiteLLM Enterprise on September 15th, 2024.\n🚨 Contact us here to get a license https://calendly.com/d/4mp-gd3-k5k/litellm-1-1-onboarding-chat \n🚨 Enterprise Pricing: https://www.litellm.ai/#pricing"
             )
@@ -145,83 +143,86 @@ class PrometheusLogger(CustomLogger):
                 labelnames=["error_code", "model"],
             )
 
-            # Litellm-Enterprise Metrics
-            if premium_user is True:
+            ########################################
+            # LLM API Deployment Metrics / analytics
+            ########################################
 
-                ########################################
-                # LLM API Deployment Metrics / analytics
-                ########################################
-
-                # Remaining Rate Limit for model
-                self.litellm_remaining_requests_metric = Gauge(
-                    "litellm_remaining_requests",
-                    "LLM Deployment Analytics - remaining requests for model, returned from LLM API Provider",
-                    labelnames=[
-                        "model_group",
-                        "api_provider",
-                        "api_base",
-                        "litellm_model_name",
-                    ],
-                )
-
-                self.litellm_remaining_tokens_metric = Gauge(
-                    "litellm_remaining_tokens",
-                    "remaining tokens for model, returned from LLM API Provider",
-                    labelnames=[
-                        "model_group",
-                        "api_provider",
-                        "api_base",
-                        "litellm_model_name",
-                    ],
-                )
-                # Get all keys
-                _logged_llm_labels = [
-                    "litellm_model_name",
-                    "model_id",
-                    "api_base",
+            # Remaining Rate Limit for model
+            self.litellm_remaining_requests_metric = Gauge(
+                "litellm_remaining_requests",
+                "LLM Deployment Analytics - remaining requests for model, returned from LLM API Provider",
+                labelnames=[
+                    "model_group",
                     "api_provider",
-                ]
+                    "api_base",
+                    "litellm_model_name",
+                ],
+            )
 
-                # Metric for deployment state
-                self.litellm_deployment_state = Gauge(
-                    "litellm_deployment_state",
-                    "LLM Deployment Analytics - The state of the deployment: 0 = healthy, 1 = partial outage, 2 = complete outage",
-                    labelnames=_logged_llm_labels,
-                )
+            self.litellm_remaining_tokens_metric = Gauge(
+                "litellm_remaining_tokens",
+                "remaining tokens for model, returned from LLM API Provider",
+                labelnames=[
+                    "model_group",
+                    "api_provider",
+                    "api_base",
+                    "litellm_model_name",
+                ],
+            )
+            # Get all keys
+            _logged_llm_labels = [
+                "litellm_model_name",
+                "model_id",
+                "api_base",
+                "api_provider",
+            ]
 
-                self.litellm_deployment_success_responses = Counter(
-                    name="litellm_deployment_success_responses",
-                    documentation="LLM Deployment Analytics - Total number of successful LLM API calls via litellm",
-                    labelnames=_logged_llm_labels,
-                )
-                self.litellm_deployment_failure_responses = Counter(
-                    name="litellm_deployment_failure_responses",
-                    documentation="LLM Deployment Analytics - Total number of failed LLM API calls via litellm",
-                    labelnames=_logged_llm_labels,
-                )
-                self.litellm_deployment_total_requests = Counter(
-                    name="litellm_deployment_total_requests",
-                    documentation="LLM Deployment Analytics - Total number of LLM API calls via litellm - success + failure",
-                    labelnames=_logged_llm_labels,
-                )
+            # Metric for deployment state
+            self.litellm_deployment_state = Gauge(
+                "litellm_deployment_state",
+                "LLM Deployment Analytics - The state of the deployment: 0 = healthy, 1 = partial outage, 2 = complete outage",
+                labelnames=_logged_llm_labels,
+            )
 
-                # Deployment Latency tracking
-                self.litellm_deployment_latency_per_output_token = Histogram(
-                    name="litellm_deployment_latency_per_output_token",
-                    documentation="LLM Deployment Analytics - Latency per output token",
-                    labelnames=_logged_llm_labels,
-                )
+            self.litellm_deployment_cooled_down = Counter(
+                "litellm_deployment_cooled_down",
+                "LLM Deployment Analytics - Number of times a deployment has been cooled down by LiteLLM load balancing logic. exception_status is the status of the exception that caused the deployment to be cooled down",
+                labelnames=_logged_llm_labels + ["exception_status"],
+            )
 
-                self.litellm_deployment_successful_fallbacks = Counter(
-                    "litellm_deployment_successful_fallbacks",
-                    "LLM Deployment Analytics - Number of successful fallback requests from primary model -> fallback model",
-                    ["primary_model", "fallback_model"],
-                )
-                self.litellm_deployment_failed_fallbacks = Counter(
-                    "litellm_deployment_failed_fallbacks",
-                    "LLM Deployment Analytics - Number of failed fallback requests from primary model -> fallback model",
-                    ["primary_model", "fallback_model"],
-                )
+            self.litellm_deployment_success_responses = Counter(
+                name="litellm_deployment_success_responses",
+                documentation="LLM Deployment Analytics - Total number of successful LLM API calls via litellm",
+                labelnames=_logged_llm_labels,
+            )
+            self.litellm_deployment_failure_responses = Counter(
+                name="litellm_deployment_failure_responses",
+                documentation="LLM Deployment Analytics - Total number of failed LLM API calls for a specific LLM deployment. exception_status is the status of the exception from the LLM API",
+                labelnames=_logged_llm_labels + ["exception_status"],
+            )
+            self.litellm_deployment_total_requests = Counter(
+                name="litellm_deployment_total_requests",
+                documentation="LLM Deployment Analytics - Total number of LLM API calls via litellm - success + failure",
+                labelnames=_logged_llm_labels,
+            )
+
+            # Deployment Latency tracking
+            self.litellm_deployment_latency_per_output_token = Histogram(
+                name="litellm_deployment_latency_per_output_token",
+                documentation="LLM Deployment Analytics - Latency per output token",
+                labelnames=_logged_llm_labels,
+            )
+
+            self.litellm_deployment_successful_fallbacks = Counter(
+                "litellm_deployment_successful_fallbacks",
+                "LLM Deployment Analytics - Number of successful fallback requests from primary model -> fallback model",
+                ["primary_model", "fallback_model"],
+            )
+            self.litellm_deployment_failed_fallbacks = Counter(
+                "litellm_deployment_failed_fallbacks",
+                "LLM Deployment Analytics - Number of failed fallback requests from primary model -> fallback model",
+                ["primary_model", "fallback_model"],
+            )
 
         except Exception as e:
             print_verbose(f"Got exception on init prometheus client {str(e)}")
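For readers unfamiliar with `prometheus_client`, the pattern the new code uses for `litellm_deployment_cooled_down` (a `Counter` whose label set is `_logged_llm_labels + ["exception_status"]`) can be reproduced in isolation. A minimal, LiteLLM-independent sketch with made-up label values:

```python
from prometheus_client import Counter, generate_latest

# Same label layout as litellm_deployment_cooled_down in the hunk above
_logged_llm_labels = ["litellm_model_name", "model_id", "api_base", "api_provider"]

cooldown_counter = Counter(
    "demo_deployment_cooled_down",  # demo name, to avoid clashing with the real metric
    "Number of times a deployment has been cooled down, by exception status",
    labelnames=_logged_llm_labels + ["exception_status"],
)

# Each unique label combination becomes its own time series
cooldown_counter.labels(
    litellm_model_name="gpt-3.5-turbo",  # made-up values, for illustration only
    model_id="deployment-1",
    api_base="https://api.openai.com",
    api_provider="openai",
    exception_status="429",
).inc()

# Exposition-format dump, the same text /metrics would return
print(generate_latest().decode())
```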
@@ -232,7 +233,6 @@ class PrometheusLogger(CustomLogger):
         from litellm.proxy.common_utils.callback_utils import (
             get_model_group_from_litellm_kwargs,
         )
-        from litellm.proxy.proxy_server import premium_user
 
         verbose_logger.debug(
             f"prometheus Logging - Enters success logging function for kwargs {kwargs}"
@@ -375,14 +375,12 @@ class PrometheusLogger(CustomLogger):
             )
 
             # set x-ratelimit headers
-            if premium_user is True:
-                self.set_llm_deployment_success_metrics(
-                    kwargs, start_time, end_time, output_tokens
-                )
+            self.set_llm_deployment_success_metrics(
+                kwargs, start_time, end_time, output_tokens
+            )
             pass
 
     async def async_log_failure_event(self, kwargs, response_obj, start_time, end_time):
-        from litellm.proxy.proxy_server import premium_user
 
         verbose_logger.debug(
             f"prometheus Logging - Enters failure logging function for kwargs {kwargs}"
@@ -404,6 +402,7 @@ class PrometheusLogger(CustomLogger):
         user_api_team_alias = litellm_params.get("metadata", {}).get(
             "user_api_key_team_alias", None
         )
+        exception = kwargs.get("exception", None)
 
         try:
             self.litellm_llm_api_failed_requests_metric.labels(
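The failure path above now pulls the raised exception out of `kwargs` so its status code can later be attached as a label. A minimal sketch of the same idea in a standalone `CustomLogger` subclass (the class name and the print format are illustrative, not part of this PR; assumes the `litellm` package is installed):

```python
from litellm.integrations.custom_logger import CustomLogger


class FailureStatusLogger(CustomLogger):
    """Illustrative only - records the HTTP status of failed LLM calls."""

    async def async_log_failure_event(self, kwargs, response_obj, start_time, end_time):
        # Same lookup PrometheusLogger now uses: the raised exception is passed in kwargs
        exception = kwargs.get("exception", None)
        # Not every exception carries a status_code, so getattr falls back to None
        exception_status = str(getattr(exception, "status_code", None))
        print(f"model={kwargs.get('model')} failed with status {exception_status}")
```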
@@ -441,8 +440,13 @@ class PrometheusLogger(CustomLogger):
         _metadata = _litellm_params.get("metadata", {})
         litellm_model_name = request_kwargs.get("model", None)
         api_base = _metadata.get("api_base", None)
+        if api_base is None:
+            api_base = _litellm_params.get("api_base", None)
         llm_provider = _litellm_params.get("custom_llm_provider", None)
-        model_id = _metadata.get("model_id")
+        _model_info = _metadata.get("model_info") or {}
+        model_id = _model_info.get("id", None)
+        exception = request_kwargs.get("exception", None)
+        exception_status_code: str = str(getattr(exception, "status_code", None))
 
         """
         log these labels
@@ -460,6 +464,7 @@ class PrometheusLogger(CustomLogger):
             model_id=model_id,
             api_base=api_base,
             api_provider=llm_provider,
+            exception_status=exception_status_code,
         ).inc()
 
         self.litellm_deployment_total_requests.labels(
@@ -488,8 +493,11 @@ class PrometheusLogger(CustomLogger):
             litellm_model_name = request_kwargs.get("model", None)
             model_group = _metadata.get("model_group", None)
             api_base = _metadata.get("api_base", None)
+            if api_base is None:
+                api_base = _litellm_params.get("api_base", None)
             llm_provider = _litellm_params.get("custom_llm_provider", None)
-            model_id = _metadata.get("model_id")
+            _model_info = _metadata.get("model_info") or {}
+            model_id = _model_info.get("id", None)
 
             remaining_requests = None
             remaining_tokens = None
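Both hunks above apply the same defensive lookup: `api_base` falls back from `metadata` to `litellm_params`, and `model_id` now comes from `metadata["model_info"]["id"]` with `or {}` guarding against a missing dict. A small sketch of that pattern on a hypothetical `request_kwargs` payload:

```python
# Hypothetical request kwargs, shaped like the ones the logger receives
request_kwargs = {
    "model": "gpt-3.5-turbo",
    "litellm_params": {
        "custom_llm_provider": "openai",
        "api_base": "https://api.openai.com",
        "metadata": {"model_info": {"id": "deployment-1"}},
    },
}

_litellm_params = request_kwargs.get("litellm_params", {}) or {}
_metadata = _litellm_params.get("metadata", {}) or {}

# Prefer the metadata value, fall back to litellm_params if it is missing
api_base = _metadata.get("api_base", None)
if api_base is None:
    api_base = _litellm_params.get("api_base", None)

# model_info may be absent entirely, so guard with `or {}` before .get()
_model_info = _metadata.get("model_info") or {}
model_id = _model_info.get("id", None)

print(api_base, model_id)  # -> https://api.openai.com deployment-1
```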
@@ -654,6 +662,21 @@ class PrometheusLogger(CustomLogger):
             2, litellm_model_name, model_id, api_base, api_provider
         )
 
+    def increment_deployment_cooled_down(
+        self,
+        litellm_model_name: str,
+        model_id: str,
+        api_base: str,
+        api_provider: str,
+        exception_status: str,
+    ):
+        """
+        increment metric when litellm.Router / load balancing logic places a deployment in cool down
+        """
+        self.litellm_deployment_cooled_down.labels(
+            litellm_model_name, model_id, api_base, api_provider, exception_status
+        ).inc()
+
 
 def safe_get_remaining_budget(
     max_budget: Optional[float], spend: Optional[float]
diff --git a/litellm/proxy/proxy_config.yaml b/litellm/proxy/proxy_config.yaml
index 8e8479e1d..b33bc35d1 100644
--- a/litellm/proxy/proxy_config.yaml
+++ b/litellm/proxy/proxy_config.yaml
@@ -24,5 +24,5 @@ general_settings:
   master_key: sk-1234
 
 litellm_settings:
-  success_callback: ["datadog"]
+  success_callback: ["prometheus"]
 
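The config change above turns the Prometheus callback on for the proxy. When using the Python SDK directly (no proxy), the equivalent is to register the logger yourself; a minimal sketch, assuming the prometheus dependencies are installed and an OpenAI key is configured - none of which this PR adds:

```python
import litellm
from litellm.integrations.prometheus import PrometheusLogger

# Register the same logger the proxy builds from `success_callback: ["prometheus"]`.
# router_cooldown_event_callback (below) finds it by scanning litellm.callbacks.
litellm.callbacks = [PrometheusLogger()]

response = litellm.completion(
    model="gpt-3.5-turbo",  # assumes OPENAI_API_KEY is set; any configured model works
    messages=[{"role": "user", "content": "ping"}],
)
print(response.choices[0].message.content)
```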
diff --git a/litellm/router.py b/litellm/router.py
index eb6bbf040..d31646203 100644
--- a/litellm/router.py
+++ b/litellm/router.py
@@ -53,7 +53,7 @@ from litellm.router_utils.client_initalization_utils import (
     should_initialize_sync_client,
 )
 from litellm.router_utils.cooldown_cache import CooldownCache
-from litellm.router_utils.cooldown_callbacks import router_cooldown_handler
+from litellm.router_utils.cooldown_callbacks import router_cooldown_event_callback
 from litellm.router_utils.cooldown_handlers import (
     DEFAULT_COOLDOWN_TIME_SECONDS,
     _async_get_cooldown_deployments,
diff --git a/litellm/router_utils/cooldown_callbacks.py b/litellm/router_utils/cooldown_callbacks.py
index 661075047..8324d3270 100644
--- a/litellm/router_utils/cooldown_callbacks.py
+++ b/litellm/router_utils/cooldown_callbacks.py
@@ -16,32 +16,39 @@ else:
     LitellmRouter = Any
 
 
-async def router_cooldown_handler(
+async def router_cooldown_event_callback(
     litellm_router_instance: LitellmRouter,
     deployment_id: str,
     exception_status: Union[str, int],
     cooldown_time: float,
 ):
+    """
+    Callback triggered when a deployment is put into cooldown by litellm
+
+    - Updates deployment state on Prometheus
+    - Increments cooldown metric for deployment on Prometheus
+    """
+    verbose_logger.debug("In router_cooldown_event_callback - updating prometheus")
     _deployment = litellm_router_instance.get_deployment(model_id=deployment_id)
     if _deployment is None:
         verbose_logger.warning(
-            f"in router_cooldown_handler but _deployment is None for deployment_id={deployment_id}. Doing nothing"
+            f"in router_cooldown_event_callback but _deployment is None for deployment_id={deployment_id}. Doing nothing"
         )
         return
     _litellm_params = _deployment["litellm_params"]
     temp_litellm_params = copy.deepcopy(_litellm_params)
     temp_litellm_params = dict(temp_litellm_params)
-    _model_name = _deployment.get("model_name", None)
-    _api_base = litellm.get_api_base(
-        model=_model_name, optional_params=temp_litellm_params
+    _model_name = _deployment.get("model_name", None) or ""
+    _api_base = (
+        litellm.get_api_base(model=_model_name, optional_params=temp_litellm_params)
+        or ""
     )
     model_info = _deployment["model_info"]
     model_id = model_info.id
 
-    litellm_model_name = temp_litellm_params.get("model")
+    litellm_model_name = temp_litellm_params.get("model") or ""
     llm_provider = ""
     try:
-
         _, llm_provider, _, _ = litellm.get_llm_provider(
             model=litellm_model_name,
             custom_llm_provider=temp_litellm_params.get("custom_llm_provider"),
@@ -50,13 +57,29 @@ async def router_cooldown_event_callback(
         pass
 
     # Trigger cooldown on Prometheus
-    from litellm.litellm_core_utils.litellm_logging import prometheusLogger
+    from litellm.integrations.prometheus import PrometheusLogger
+
+    prometheusLogger = None
+    for callback in litellm.callbacks:
+        if isinstance(callback, PrometheusLogger):
+            prometheusLogger = callback
 
     if prometheusLogger is not None:
-        prometheusLogger.set_deployment_complete_outage(
-            litellm_model_name=_model_name,
-            model_id=model_id,
-            api_base=_api_base,
-            api_provider=llm_provider,
-        )
+
+        if isinstance(prometheusLogger, PrometheusLogger):
+            prometheusLogger.set_deployment_complete_outage(
+                litellm_model_name=_model_name,
+                model_id=model_id,
+                api_base=_api_base,
+                api_provider=llm_provider,
+            )
+
+            prometheusLogger.increment_deployment_cooled_down(
+                litellm_model_name=_model_name,
+                model_id=model_id,
+                api_base=_api_base,
+                api_provider=llm_provider,
+                exception_status=str(exception_status),
+            )
+
     return
diff --git a/litellm/router_utils/cooldown_handlers.py b/litellm/router_utils/cooldown_handlers.py
index e062a2188..54d0694ee 100644
--- a/litellm/router_utils/cooldown_handlers.py
+++ b/litellm/router_utils/cooldown_handlers.py
@@ -11,7 +11,7 @@ from typing import TYPE_CHECKING, Any, List, Optional, Union
 
 import litellm
 from litellm._logging import verbose_router_logger
-from litellm.router_utils.cooldown_callbacks import router_cooldown_handler
+from litellm.router_utils.cooldown_callbacks import router_cooldown_event_callback
 from litellm.utils import get_utc_datetime
 
 from .router_callbacks.track_deployment_metrics import (
@@ -184,7 +184,7 @@ def _set_cooldown_deployments(
 
     # Trigger cooldown callback handler
     asyncio.create_task(
-        router_cooldown_handler(
+        router_cooldown_event_callback(
             litellm_router_instance=litellm_router_instance,
             deployment_id=deployment,
             exception_status=exception_status,
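The handler change above schedules the renamed callback with `asyncio.create_task`, so the Prometheus bookkeeping never blocks the routing path that placed the deployment in cooldown. A stripped-down sketch of that fire-and-forget pattern (function names are illustrative, not LiteLLM APIs):

```python
import asyncio


async def cooldown_event_callback(deployment_id: str, exception_status: str) -> None:
    # Stand-in for router_cooldown_event_callback: update metrics, log, etc.
    print(f"cooling down {deployment_id} (status {exception_status})")


async def set_cooldown(deployment_id: str) -> None:
    # Mirrors _set_cooldown_deployments: kick off the callback without awaiting it inline
    task = asyncio.create_task(cooldown_event_callback(deployment_id, "429"))
    # ... routing would continue immediately here ...
    await task  # awaited only so this demo exits cleanly


asyncio.run(set_cooldown("deployment-1"))
```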