Mirror of https://github.com/BerriAI/litellm.git (synced 2025-04-26 19:24:27 +00:00)

Merge branch 'main' into litellm_anthropic_streaming_tool_call_fix

Commit 78d72acc44: 13 changed files with 338 additions and 180 deletions
@@ -36,7 +36,8 @@ This covers:
 - ✅ [Tracking Spend for Custom Tags](./proxy/enterprise#tracking-spend-for-custom-tags)
 - ✅ [Exporting LLM Logs to GCS Bucket](./proxy/bucket#🪣-logging-gcs-s3-buckets)
 - ✅ [API Endpoints to get Spend Reports per Team, API Key, Customer](./proxy/cost_tracking.md#✨-enterprise-api-endpoints-to-get-spend)
-- **Advanced Metrics**
+- **Prometheus Metrics**
+- ✅ [Prometheus Metrics - Num Requests, failures, LLM Provider Outages](./proxy/prometheus)
 - ✅ [`x-ratelimit-remaining-requests`, `x-ratelimit-remaining-tokens` for LLM APIs on Prometheus](./proxy/prometheus#✨-enterprise-llm-remaining-requests-and-remaining-tokens)
 - **Guardrails, PII Masking, Content Moderation**
 - ✅ [Content Moderation with LLM Guard, LlamaGuard, Secret Detection, Google Text Moderations](./proxy/enterprise#content-moderation)

@@ -30,7 +30,8 @@ Features:
 - ✅ [Tracking Spend for Custom Tags](#tracking-spend-for-custom-tags)
 - ✅ [Exporting LLM Logs to GCS Bucket](./proxy/bucket#🪣-logging-gcs-s3-buckets)
 - ✅ [`/spend/report` API endpoint](cost_tracking.md#✨-enterprise-api-endpoints-to-get-spend)
-- **Advanced Metrics**
+- **Prometheus Metrics**
+- ✅ [Prometheus Metrics - Num Requests, failures, LLM Provider Outages](prometheus)
 - ✅ [`x-ratelimit-remaining-requests`, `x-ratelimit-remaining-tokens` for LLM APIs on Prometheus](prometheus#✨-enterprise-llm-remaining-requests-and-remaining-tokens)
 - **Guardrails, PII Masking, Content Moderation**
 - ✅ [Content Moderation with LLM Guard, LlamaGuard, Secret Detection, Google Text Moderations](#content-moderation)

@@ -1,7 +1,16 @@
 import Tabs from '@theme/Tabs';
 import TabItem from '@theme/TabItem';
 
-# 📈 Prometheus metrics [BETA]
+# 📈 Prometheus metrics
+
+:::info
+
+🚨 Prometheus Metrics will be moving to LiteLLM Enterprise on September 15th, 2024
+
+[Enterprise Pricing](https://www.litellm.ai/#pricing)
+
+[Contact us here to get a free trial](https://calendly.com/d/4mp-gd3-k5k/litellm-1-1-onboarding-chat)
+
+:::
 
 LiteLLM Exposes a `/metrics` endpoint for Prometheus to Poll
@@ -47,9 +56,11 @@ http://localhost:4000/metrics
 # <proxy_base_url>/metrics
 ```
 
-## Metrics Tracked
+## 📈 Metrics Tracked
 
+### Proxy Requests / Spend Metrics
+
 | Metric Name | Description |
 |----------------------|--------------------------------------|
 | `litellm_requests_metric` | Number of requests made, per `"user", "key", "model", "team", "end-user"` |
@@ -57,6 +68,19 @@ http://localhost:4000/metrics
 | `litellm_total_tokens` | input + output tokens per `"user", "key", "model", "team", "end-user"` |
 | `litellm_llm_api_failed_requests_metric` | Number of failed LLM API requests per `"user", "key", "model", "team", "end-user"` |
 
+### LLM API / Provider Metrics
+
+| Metric Name | Description |
+|----------------------|--------------------------------------|
+| `deployment_complete_outage` | Value is "1" when deployment is in cooldown and has had a complete outage. This metric tracks the state of the LLM API Deployment when it's completely unavailable. |
+| `deployment_partial_outage` | Value is "1" when deployment is experiencing a partial outage. This metric indicates when the LLM API Deployment is facing issues but is not completely down. |
+| `deployment_healthy` | Value is "1" when deployment is in a healthy state. This metric shows when the LLM API Deployment is functioning normally without any outages. |
+| `litellm_remaining_requests_metric` | Track `x-ratelimit-remaining-requests` returned from LLM API Deployment |
+| `litellm_remaining_tokens` | Track `x-ratelimit-remaining-tokens` return from LLM API Deployment |
+
 ### Budget Metrics
 | Metric Name | Description |
 |----------------------|--------------------------------------|
@@ -64,55 +88,6 @@ http://localhost:4000/metrics
 | `litellm_remaining_api_key_budget_metric` | Remaining Budget for API Key (A key Created on LiteLLM)|
 
-### ✨ (Enterprise) LLM Remaining Requests and Remaining Tokens
-Set this on your config.yaml to allow you to track how close you are to hitting your TPM / RPM limits on each model group
-
-```yaml
-litellm_settings:
-  success_callback: ["prometheus"]
-  failure_callback: ["prometheus"]
-  return_response_headers: true # ensures the LLM API calls track the response headers
-```
-
-| Metric Name | Description |
-|----------------------|--------------------------------------|
-| `litellm_remaining_requests_metric` | Track `x-ratelimit-remaining-requests` returned from LLM API Deployment |
-| `litellm_remaining_tokens` | Track `x-ratelimit-remaining-tokens` return from LLM API Deployment |
-
-Example Metric
-
-<Tabs>
-
-<TabItem value="Remaining Requests" label="Remaining Requests">
-
-```shell
-litellm_remaining_requests
-{
-  api_base="https://api.openai.com/v1",
-  api_provider="openai",
-  litellm_model_name="gpt-3.5-turbo",
-  model_group="gpt-3.5-turbo"
-}
-8998.0
-```
-
-</TabItem>
-
-<TabItem value="Requests" label="Remaining Tokens">
-
-```shell
-litellm_remaining_tokens
-{
-  api_base="https://api.openai.com/v1",
-  api_provider="openai",
-  litellm_model_name="gpt-3.5-turbo",
-  model_group="gpt-3.5-turbo"
-}
-999981.0
-```
-
-</TabItem>
-
-</Tabs>
-
 ## Monitor System Health

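Note: as a usage illustration of the `/metrics` endpoint documented above, the sketch below polls a running proxy and prints the deployment-state gauges from the new table. The base URL, port, and the master-key header are assumptions about a typical local setup, not part of this change.

```python
# Minimal sketch (assumptions: proxy on localhost:4000, master key "sk-1234").
# Polls the /metrics endpoint and prints the deployment-state gauges
# described in the tables above.
import requests

PROXY_BASE_URL = "http://localhost:4000"  # assumed local proxy address

resp = requests.get(
    f"{PROXY_BASE_URL}/metrics",
    headers={"Authorization": "Bearer sk-1234"},  # assumed master key
    timeout=10,
)
resp.raise_for_status()

tracked = ("deployment_healthy", "deployment_partial_outage", "deployment_complete_outage")
for line in resp.text.splitlines():
    # Prometheus exposition format: <metric_name>{<labels>} <value>
    if line.startswith(tracked):
        print(line)
```
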
@@ -73,6 +73,7 @@ class ServiceLogging(CustomLogger):
         )
         for callback in litellm.service_callback:
             if callback == "prometheus_system":
+                await self.init_prometheus_services_logger_if_none()
                 await self.prometheusServicesLogger.async_service_success_hook(
                     payload=payload
                 )
@@ -88,6 +89,11 @@ class ServiceLogging(CustomLogger):
             event_metadata=event_metadata,
         )
 
+    async def init_prometheus_services_logger_if_none(self):
+        if self.prometheusServicesLogger is None:
+            self.prometheusServicesLogger = self.prometheusServicesLogger()
+        return
+
     async def async_service_failure_hook(
         self,
         service: ServiceTypes,
@@ -120,8 +126,7 @@ class ServiceLogging(CustomLogger):
         )
         for callback in litellm.service_callback:
             if callback == "prometheus_system":
-                if self.prometheusServicesLogger is None:
-                    self.prometheusServicesLogger = self.prometheusServicesLogger()
+                await self.init_prometheus_services_logger_if_none()
                 await self.prometheusServicesLogger.async_service_failure_hook(
                     payload=payload
                 )

@@ -8,7 +8,7 @@ import subprocess
 import sys
 import traceback
 import uuid
-from typing import Optional, Union
+from typing import Optional, TypedDict, Union
 
 import dotenv
 import requests  # type: ignore
@@ -28,6 +28,10 @@ class PrometheusLogger:
 
             from litellm.proxy.proxy_server import premium_user
 
+            verbose_logger.warning(
+                "🚨🚨🚨 Prometheus Metrics will be moving to LiteLLM Enterprise on September 15th, 2024.\n🚨 Contact us here to get a license https://calendly.com/d/4mp-gd3-k5k/litellm-1-1-onboarding-chat \n🚨 Enterprise Pricing: https://www.litellm.ai/#pricing"
+            )
+
             self.litellm_llm_api_failed_requests_metric = Counter(
                 name="litellm_llm_api_failed_requests_metric",
                 documentation="Total number of failed LLM API calls via litellm",
@@ -124,6 +128,29 @@ class PrometheusLogger:
                     "litellm_model_name",
                 ],
             )
+            # Get all keys
+            _logged_llm_labels = [
+                "litellm_model_name",
+                "model_id",
+                "api_base",
+                "api_provider",
+            ]
+
+            self.deployment_complete_outage = Gauge(
+                "deployment_complete_outage",
+                'Value is "1" when deployment is in cooldown and has had a complete outage',
+                labelnames=_logged_llm_labels,
+            )
+            self.deployment_partial_outage = Gauge(
+                "deployment_partial_outage",
+                'Value is "1" when deployment is experiencing a partial outage',
+                labelnames=_logged_llm_labels,
+            )
+            self.deployment_healthy = Gauge(
+                "deployment_healthy",
+                'Value is "1" when deployment is in an healthy state',
+                labelnames=_logged_llm_labels,
+            )
 
         except Exception as e:
             print_verbose(f"Got exception on init prometheus client {str(e)}")
@@ -243,7 +270,7 @@
 
             # set x-ratelimit headers
             if premium_user is True:
-                self.set_remaining_tokens_requests_metric(kwargs)
+                self.set_llm_deployment_success_metrics(kwargs)
 
             ### FAILURE INCREMENT ###
             if "exception" in kwargs:
@@ -256,6 +283,8 @@
                     user_api_team_alias,
                     user_id,
                 ).inc()
+
+                self.set_llm_deployment_failure_metrics(kwargs)
         except Exception as e:
             verbose_logger.error(
                 "prometheus Layer Error(): Exception occured - {}".format(str(e))
@@ -263,7 +292,33 @@
             verbose_logger.debug(traceback.format_exc())
             pass
 
-    def set_remaining_tokens_requests_metric(self, request_kwargs: dict):
+    def set_llm_deployment_failure_metrics(self, request_kwargs: dict):
+        try:
+            verbose_logger.debug("setting remaining tokens requests metric")
+            _response_headers = request_kwargs.get("response_headers")
+            _litellm_params = request_kwargs.get("litellm_params", {}) or {}
+            _metadata = _litellm_params.get("metadata", {})
+            litellm_model_name = request_kwargs.get("model", None)
+            api_base = _metadata.get("api_base", None)
+            llm_provider = _litellm_params.get("custom_llm_provider", None)
+            model_id = _metadata.get("model_id")
+
+            """
+            log these labels
+            ["litellm_model_name", "model_id", "api_base", "api_provider"]
+            """
+            self.set_deployment_partial_outage(
+                litellm_model_name=litellm_model_name,
+                model_id=model_id,
+                api_base=api_base,
+                llm_provider=llm_provider,
+            )
+
+            pass
+        except:
+            pass
+
+    def set_llm_deployment_success_metrics(self, request_kwargs: dict):
         try:
             verbose_logger.debug("setting remaining tokens requests metric")
             _response_headers = request_kwargs.get("response_headers")
@@ -273,6 +328,7 @@
             model_group = _metadata.get("model_group", None)
             api_base = _metadata.get("api_base", None)
             llm_provider = _litellm_params.get("custom_llm_provider", None)
+            model_id = _metadata.get("model_id")
 
             remaining_requests = None
             remaining_tokens = None
@@ -307,14 +363,82 @@
                     model_group, llm_provider, api_base, litellm_model_name
                 ).set(remaining_tokens)
+
+            """
+            log these labels
+            ["litellm_model_name", "model_id", "api_base", "api_provider"]
+            """
+            self.set_deployment_healthy(
+                litellm_model_name=litellm_model_name,
+                model_id=model_id,
+                api_base=api_base,
+                llm_provider=llm_provider,
+            )
         except Exception as e:
             verbose_logger.error(
-                "Prometheus Error: set_remaining_tokens_requests_metric. Exception occured - {}".format(
+                "Prometheus Error: set_llm_deployment_success_metrics. Exception occured - {}".format(
                     str(e)
                 )
             )
             return
 
+    def set_deployment_healthy(
+        self,
+        litellm_model_name: str,
+        model_id: str,
+        api_base: str,
+        llm_provider: str,
+    ):
+        self.deployment_complete_outage.labels(
+            litellm_model_name, model_id, api_base, llm_provider
+        ).set(0)
+
+        self.deployment_partial_outage.labels(
+            litellm_model_name, model_id, api_base, llm_provider
+        ).set(0)
+
+        self.deployment_healthy.labels(
+            litellm_model_name, model_id, api_base, llm_provider
+        ).set(1)
+
+    def set_deployment_complete_outage(
+        self,
+        litellm_model_name: str,
+        model_id: str,
+        api_base: str,
+        llm_provider: str,
+    ):
+        verbose_logger.debug("setting llm outage metric")
+        self.deployment_complete_outage.labels(
+            litellm_model_name, model_id, api_base, llm_provider
+        ).set(1)
+
+        self.deployment_partial_outage.labels(
+            litellm_model_name, model_id, api_base, llm_provider
+        ).set(0)
+
+        self.deployment_healthy.labels(
+            litellm_model_name, model_id, api_base, llm_provider
+        ).set(0)
+
+    def set_deployment_partial_outage(
+        self,
+        litellm_model_name: str,
+        model_id: str,
+        api_base: str,
+        llm_provider: str,
+    ):
+        self.deployment_complete_outage.labels(
+            litellm_model_name, model_id, api_base, llm_provider
+        ).set(0)
+
+        self.deployment_partial_outage.labels(
+            litellm_model_name, model_id, api_base, llm_provider
+        ).set(1)
+
+        self.deployment_healthy.labels(
+            litellm_model_name, model_id, api_base, llm_provider
+        ).set(0)
+
+
 def safe_get_remaining_budget(
     max_budget: Optional[float], spend: Optional[float]

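Note: the three `set_deployment_*` helpers added above treat the gauges as mutually exclusive states (exactly one of them is 1 for a given label set). A standalone sketch of that pattern with `prometheus_client` follows; the metric and label names mirror the diff, while the registry, helper function, and sample values are illustrative.

```python
# Standalone sketch of the mutually exclusive gauge pattern used by the
# set_deployment_* helpers above. Metric and label names mirror the diff;
# the registry, helper function, and sample values are illustrative.
from prometheus_client import CollectorRegistry, Gauge, generate_latest

registry = CollectorRegistry()
_logged_llm_labels = ["litellm_model_name", "model_id", "api_base", "api_provider"]

deployment_healthy = Gauge(
    "deployment_healthy", "1 when the deployment is healthy", _logged_llm_labels, registry=registry
)
deployment_partial_outage = Gauge(
    "deployment_partial_outage", "1 when the deployment is partially down", _logged_llm_labels, registry=registry
)
deployment_complete_outage = Gauge(
    "deployment_complete_outage", "1 when the deployment is fully down", _logged_llm_labels, registry=registry
)

def mark_complete_outage(model_name: str, model_id: str, api_base: str, provider: str) -> None:
    # Exactly one of the three gauges is 1 for a given label set at any time.
    deployment_healthy.labels(model_name, model_id, api_base, provider).set(0)
    deployment_partial_outage.labels(model_name, model_id, api_base, provider).set(0)
    deployment_complete_outage.labels(model_name, model_id, api_base, provider).set(1)

mark_complete_outage("gpt-3.5-turbo", "model-123", "https://api.openai.com/v1", "openai")
print(generate_latest(registry).decode())
```
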
@@ -94,18 +94,14 @@ class VertexAILlama3Config:
         }
 
     def get_supported_openai_params(self):
-        return [
-            "max_tokens",
-            "stream",
-        ]
+        return litellm.OpenAIConfig().get_supported_openai_params(model="gpt-3.5-turbo")
 
     def map_openai_params(self, non_default_params: dict, optional_params: dict):
-        for param, value in non_default_params.items():
-            if param == "max_tokens":
-                optional_params["max_tokens"] = value
-            if param == "stream":
-                optional_params["stream"] = value
-        return optional_params
+        return litellm.OpenAIConfig().map_openai_params(
+            non_default_params=non_default_params,
+            optional_params=optional_params,
+            model="gpt-3.5-turbo",
+        )
 
 
 class VertexAIPartnerModels(BaseLLM):

@@ -1856,17 +1856,18 @@ def completion(
             )
 
             openrouter_site_url = get_secret("OR_SITE_URL") or "https://litellm.ai"
 
             openrouter_app_name = get_secret("OR_APP_NAME") or "liteLLM"
 
-            headers = (
-                headers
-                or litellm.headers
-                or {
-                    "HTTP-Referer": openrouter_site_url,
-                    "X-Title": openrouter_app_name,
-                }
-            )
+            openrouter_headers = {
+                "HTTP-Referer": openrouter_site_url,
+                "X-Title": openrouter_app_name,
+            }
+
+            _headers = headers or litellm.headers
+            if _headers:
+                openrouter_headers.update(_headers)
+
+            headers = openrouter_headers
 
             ## Load Config
             config = openrouter.OpenrouterConfig.get_config()

@@ -1,7 +1,14 @@
 model_list:
-  - model_name: "*"
+  - model_name: "gpt-3.5-turbo"
     litellm_params:
-      model: "*"
+      model: "gpt-3.5-turbo"
+  - model_name: "gpt-4"
+    litellm_params:
+      model: "gpt-4"
+      api_key: "bad_key"
+  - model_name: "gpt-4o"
+    litellm_params:
+      model: "gpt-4o"
 
 litellm_settings:
-  callbacks: ["lakera_prompt_injection"]
+  fallbacks: [{"gpt-3.5-turbo": ["gpt-4", "gpt-4o"]}]

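Note: the config above routes `gpt-3.5-turbo` traffic to `gpt-4` and then `gpt-4o` when the primary deployment fails. A rough in-code equivalent using `litellm.Router` is sketched below; the API keys are placeholders, and `mock_testing_fallbacks` is assumed (based on the router changes later in this diff) to force the mock fallback path for testing.

```python
# Rough in-code equivalent of the fallback config above, using litellm.Router.
# API keys are placeholders; mock_testing_fallbacks is assumed (based on the
# router changes later in this diff) to force the mock fallback exception.
from litellm import Router

router = Router(
    model_list=[
        {"model_name": "gpt-3.5-turbo", "litellm_params": {"model": "gpt-3.5-turbo", "api_key": "sk-placeholder"}},
        {"model_name": "gpt-4", "litellm_params": {"model": "gpt-4", "api_key": "bad_key"}},
        {"model_name": "gpt-4o", "litellm_params": {"model": "gpt-4o", "api_key": "sk-placeholder"}},
    ],
    fallbacks=[{"gpt-3.5-turbo": ["gpt-4", "gpt-4o"]}],
)

response = router.completion(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "Hello"}],
    mock_testing_fallbacks=True,  # assumption: triggers the mock fallback path
)
print(response)
```
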
@@ -3,7 +3,7 @@ model_list:
     litellm_params:
       model: openai/fake
       api_key: fake-key
-      api_base: https://exampleopenaiendpoint-production.up.railway.app/
+      api_base: https://exampleopenaiendpoint-production.up.railwaz.app/
   - model_name: fireworks-llama-v3-70b-instruct
     litellm_params:
       model: fireworks_ai/accounts/fireworks/models/llama-v3-70b-instruct
@@ -50,4 +50,6 @@ general_settings:
 
 
 litellm_settings:
   callbacks: ["otel"] # 👈 KEY CHANGE
+  success_callback: ["prometheus"]
+  failure_callback: ["prometheus"]

@@ -57,6 +57,7 @@ from litellm.router_utils.client_initalization_utils import (
     set_client,
     should_initialize_sync_client,
 )
+from litellm.router_utils.cooldown_callbacks import router_cooldown_handler
 from litellm.router_utils.handle_error import send_llm_exception_alert
 from litellm.scheduler import FlowItem, Scheduler
 from litellm.types.llms.openai import (
@@ -2316,8 +2317,10 @@
         )
         try:
             if mock_testing_fallbacks is not None and mock_testing_fallbacks is True:
-                raise Exception(
-                    f"This is a mock exception for model={model_group}, to trigger a fallback. Fallbacks={fallbacks}"
+                raise litellm.InternalServerError(
+                    model=model_group,
+                    llm_provider="",
+                    message=f"This is a mock exception for model={model_group}, to trigger a fallback. Fallbacks={fallbacks}",
                 )
             elif (
                 mock_testing_context_fallbacks is not None
@@ -2347,6 +2350,7 @@
             verbose_router_logger.debug(f"Traceback{traceback.format_exc()}")
             original_exception = e
             fallback_model_group = None
+            fallback_failure_exception_str = ""
             try:
                 verbose_router_logger.debug("Trying to fallback b/w models")
                 if (
@@ -2505,6 +2509,7 @@
                         await self._async_get_cooldown_deployments_with_debug_info(),
                     )
                 )
+                fallback_failure_exception_str = str(new_exception)
 
             if hasattr(original_exception, "message"):
                 # add the available fallbacks to the exception
@@ -2512,6 +2517,13 @@
                     model_group,
                     fallback_model_group,
                 )
+                if len(fallback_failure_exception_str) > 0:
+                    original_exception.message += (
+                        "\nError doing the fallback: {}".format(
+                            fallback_failure_exception_str
+                        )
+                    )
+
         raise original_exception
 
     async def async_function_with_retries(self, *args, **kwargs):
@@ -3294,10 +3306,14 @@
                     value=cached_value, key=cooldown_key, ttl=cooldown_time
                 )
 
-                self.send_deployment_cooldown_alert(
-                    deployment_id=deployment,
-                    exception_status=exception_status,
-                    cooldown_time=cooldown_time,
+                # Trigger cooldown handler
+                asyncio.create_task(
+                    router_cooldown_handler(
+                        litellm_router_instance=self,
+                        deployment_id=deployment,
+                        exception_status=exception_status,
+                        cooldown_time=cooldown_time,
+                    )
                 )
             else:
                 self.failed_calls.set_cache(
@@ -4948,42 +4964,6 @@
         )
         print("\033[94m\nInitialized Alerting for litellm.Router\033[0m\n")  # noqa
 
-    def send_deployment_cooldown_alert(
-        self,
-        deployment_id: str,
-        exception_status: Union[str, int],
-        cooldown_time: float,
-    ):
-        try:
-            from litellm.proxy.proxy_server import proxy_logging_obj
-
-            # trigger slack alert saying deployment is in cooldown
-            if (
-                proxy_logging_obj is not None
-                and proxy_logging_obj.alerting is not None
-                and "slack" in proxy_logging_obj.alerting
-            ):
-                _deployment = self.get_deployment(model_id=deployment_id)
-                if _deployment is None:
-                    return
-
-                _litellm_params = _deployment["litellm_params"]
-                temp_litellm_params = copy.deepcopy(_litellm_params)
-                temp_litellm_params = dict(temp_litellm_params)
-                _model_name = _deployment.get("model_name", None)
-                _api_base = litellm.get_api_base(
-                    model=_model_name, optional_params=temp_litellm_params
-                )
-                # asyncio.create_task(
-                #     proxy_logging_obj.slack_alerting_instance.send_alert(
-                #         message=f"Router: Cooling down Deployment:\nModel Name: `{_model_name}`\nAPI Base: `{_api_base}`\nCooldown Time: `{cooldown_time} seconds`\nException Status Code: `{str(exception_status)}`\n\nChange 'cooldown_time' + 'allowed_fails' under 'Router Settings' on proxy UI, or via config - https://docs.litellm.ai/docs/proxy/reliability#fallbacks--retries--timeouts--cooldowns",
-                #         alert_type="cooldown_deployment",
-                #         level="Low",
-                #     )
-                # )
-        except Exception as e:
-            pass
-
     def set_custom_routing_strategy(
         self, CustomRoutingStrategy: CustomRoutingStrategyBase
     ):

litellm/router_utils/cooldown_callbacks.py (new file, 51 lines)
@@ -0,0 +1,51 @@
+"""
+Callbacks triggered on cooling down deployments
+"""
+
+import copy
+from typing import TYPE_CHECKING, Any, Union
+
+import litellm
+from litellm._logging import verbose_logger
+
+if TYPE_CHECKING:
+    from litellm.router import Router as _Router
+
+    LitellmRouter = _Router
+else:
+    LitellmRouter = Any
+
+
+async def router_cooldown_handler(
+    litellm_router_instance: LitellmRouter,
+    deployment_id: str,
+    exception_status: Union[str, int],
+    cooldown_time: float,
+):
+    _deployment = litellm_router_instance.get_deployment(model_id=deployment_id)
+    if _deployment is None:
+        verbose_logger.warning(
+            f"in router_cooldown_handler but _deployment is None for deployment_id={deployment_id}. Doing nothing"
+        )
+        return
+    _litellm_params = _deployment["litellm_params"]
+    temp_litellm_params = copy.deepcopy(_litellm_params)
+    temp_litellm_params = dict(temp_litellm_params)
+    _model_name = _deployment.get("model_name", None)
+    _api_base = litellm.get_api_base(
+        model=_model_name, optional_params=temp_litellm_params
+    )
+    model_info = _deployment["model_info"]
+    model_id = model_info.id
+
+    # Trigger cooldown on Prometheus
+    from litellm.litellm_core_utils.litellm_logging import prometheusLogger
+
+    if prometheusLogger is not None:
+        prometheusLogger.set_deployment_complete_outage(
+            litellm_model_name=_model_name,
+            model_id=model_id,
+            api_base="",
+            llm_provider="",
+        )
+    pass
@@ -892,47 +892,51 @@ def test_completion_claude_3_base64():
     "model", ["gemini/gemini-1.5-flash"]  # "claude-3-sonnet-20240229",
 )
 def test_completion_function_plus_image(model):
-    litellm.set_verbose = True
-    image_content = [
-        {"type": "text", "text": "What’s in this image?"},
-        {
-            "type": "image_url",
-            "image_url": {
-                "url": "https://litellm-listing.s3.amazonaws.com/litellm_logo.png"
-            },
-        },
-    ]
-    image_message = {"role": "user", "content": image_content}
-
-    tools = [
-        {
-            "type": "function",
-            "function": {
-                "name": "get_current_weather",
-                "description": "Get the current weather in a given location",
-                "parameters": {
-                    "type": "object",
-                    "properties": {
-                        "location": {
-                            "type": "string",
-                            "description": "The city and state, e.g. San Francisco, CA",
-                        },
-                        "unit": {"type": "string", "enum": ["celsius", "fahrenheit"]},
-                    },
-                    "required": ["location"],
-                },
-            },
-        }
-    ]
-
-    tool_choice = {"type": "function", "function": {"name": "get_current_weather"}}
-    messages = [
-        {
-            "role": "user",
-            "content": "What's the weather like in Boston today in Fahrenheit?",
-        }
-    ]
+    try:
+        litellm.set_verbose = True
+
+        image_content = [
+            {"type": "text", "text": "What’s in this image?"},
+            {
+                "type": "image_url",
+                "image_url": {
+                    "url": "https://litellm-listing.s3.amazonaws.com/litellm_logo.png"
+                },
+            },
+        ]
+        image_message = {"role": "user", "content": image_content}
+
+        tools = [
+            {
+                "type": "function",
+                "function": {
+                    "name": "get_current_weather",
+                    "description": "Get the current weather in a given location",
+                    "parameters": {
+                        "type": "object",
+                        "properties": {
+                            "location": {
+                                "type": "string",
+                                "description": "The city and state, e.g. San Francisco, CA",
+                            },
+                            "unit": {
+                                "type": "string",
+                                "enum": ["celsius", "fahrenheit"],
+                            },
+                        },
+                        "required": ["location"],
+                    },
+                },
+            }
+        ]
+
+        tool_choice = {"type": "function", "function": {"name": "get_current_weather"}}
+        messages = [
+            {
+                "role": "user",
+                "content": "What's the weather like in Boston today in Fahrenheit?",
+            }
+        ]
 
         try:
             response = completion(
@@ -4088,9 +4092,28 @@ async def test_acompletion_gemini():
 def test_completion_deepseek():
     litellm.set_verbose = True
     model_name = "deepseek/deepseek-chat"
-    messages = [{"role": "user", "content": "Hey, how's it going?"}]
+    tools = [
+        {
+            "type": "function",
+            "function": {
+                "name": "get_weather",
+                "description": "Get weather of an location, the user shoud supply a location first",
+                "parameters": {
+                    "type": "object",
+                    "properties": {
+                        "location": {
+                            "type": "string",
+                            "description": "The city and state, e.g. San Francisco, CA",
+                        }
+                    },
+                    "required": ["location"],
+                },
+            },
+        },
+    ]
+    messages = [{"role": "user", "content": "How's the weather in Hangzhou?"}]
     try:
-        response = completion(model=model_name, messages=messages)
+        response = completion(model=model_name, messages=messages, tools=tools)
         # Add any assertions here to check the response
         print(response)
     except litellm.APIError as e:

@@ -3536,22 +3536,11 @@ def get_optional_params(
         )
         _check_valid_arg(supported_params=supported_params)
 
-        if frequency_penalty is not None:
-            optional_params["frequency_penalty"] = frequency_penalty
-        if max_tokens is not None:
-            optional_params["max_tokens"] = max_tokens
-        if presence_penalty is not None:
-            optional_params["presence_penalty"] = presence_penalty
-        if stop is not None:
-            optional_params["stop"] = stop
-        if stream is not None:
-            optional_params["stream"] = stream
-        if temperature is not None:
-            optional_params["temperature"] = temperature
-        if logprobs is not None:
-            optional_params["logprobs"] = logprobs
-        if top_logprobs is not None:
-            optional_params["top_logprobs"] = top_logprobs
+        optional_params = litellm.OpenAIConfig().map_openai_params(
+            non_default_params=non_default_params,
+            optional_params=optional_params,
+            model=model,
+        )
     elif custom_llm_provider == "openrouter":
         supported_params = get_supported_openai_params(
             model=model, custom_llm_provider=custom_llm_provider
@@ -4141,12 +4130,15 @@
             "frequency_penalty",
             "max_tokens",
             "presence_penalty",
+            "response_format",
             "stop",
             "stream",
             "temperature",
             "top_p",
             "logprobs",
             "top_logprobs",
+            "tools",
+            "tool_choice",
         ]
     elif custom_llm_provider == "cohere":
         return [
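Note: with `tools`, `tool_choice`, and `response_format` added to this supported-params list, a quick sanity check is sketched below. It assumes the edited branch is the deepseek one, which would be consistent with the deepseek tool-calling test added earlier in this diff.

```python
# Sketch: verify which OpenAI params litellm reports for a provider after
# this change. Assumption: the edited list is the deepseek branch (consistent
# with the deepseek tool-calling test added earlier in this diff).
import litellm

params = litellm.get_supported_openai_params(
    model="deepseek-chat",
    custom_llm_provider="deepseek",
)
print("tools" in params, "tool_choice" in params)
```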