diff --git a/docs/my-website/docs/projects/dbally.md b/docs/my-website/docs/projects/dbally.md
new file mode 100644
index 0000000000..688f1ab0ff
--- /dev/null
+++ b/docs/my-website/docs/projects/dbally.md
@@ -0,0 +1,3 @@
+An efficient, consistent, and secure library for querying structured data with natural language. Query any database with over 100 LLMs ❤️ 🚅.
+
+🔗 [GitHub](https://github.com/deepsense-ai/db-ally)
diff --git a/docs/my-website/docs/proxy/configs.md b/docs/my-website/docs/proxy/configs.md
index 19c1f7902d..a50b3f6460 100644
--- a/docs/my-website/docs/proxy/configs.md
+++ b/docs/my-website/docs/proxy/configs.md
@@ -727,6 +727,7 @@ general_settings:
   "completion_model": "string",
   "disable_spend_logs": "boolean", # turn off writing each transaction to the db
   "disable_master_key_return": "boolean", # turn off returning master key on UI (checked on '/user/info' endpoint)
+  "disable_retry_on_max_parallel_request_limit_error": "boolean", # turn off retries when the max parallel request limit is reached
   "disable_reset_budget": "boolean", # turn off reset budget scheduled task
   "disable_adding_master_key_hash_to_db": "boolean", # turn off storing master key hash in db, for spend tracking
   "enable_jwt_auth": "boolean", # allow proxy admin to auth in via jwt tokens with 'litellm_proxy_admin' in claims
@@ -751,7 +752,8 @@ general_settings:
   },
   "otel": true,
   "custom_auth": "string",
-  "max_parallel_requests": 0,
+  "max_parallel_requests": 0, # the max parallel requests allowed per deployment
+  "global_max_parallel_requests": 0, # the max parallel requests allowed across the proxy as a whole
   "infer_model_from_keys": true,
   "background_health_checks": true,
   "health_check_interval": 300,
diff --git a/docs/my-website/docs/proxy/prometheus.md b/docs/my-website/docs/proxy/prometheus.md
index 4b913d2e82..10e6456c21 100644
--- a/docs/my-website/docs/proxy/prometheus.md
+++ b/docs/my-website/docs/proxy/prometheus.md
@@ -68,6 +68,15 @@ http://localhost:4000/metrics
 | `litellm_total_tokens` | input + output tokens per `"user", "key", "model", "team", "end-user"` |
 | `litellm_llm_api_failed_requests_metric` | Number of failed LLM API requests per `"user", "key", "model", "team", "end-user"` |
 
+### Request Latency Metrics
+
+| Metric Name | Description |
+|----------------------|--------------------------------------|
+| `litellm_request_total_latency_metric` | Total latency (seconds) for a request to the LiteLLM Proxy Server - tracked for labels `litellm_call_id`, `model` |
+| `litellm_llm_api_latency_metric` | Latency (seconds) for just the LLM API call - tracked for labels `litellm_call_id`, `model` |
+
+
+
 ### LLM API / Provider Metrics
 
 | Metric Name | Description |
diff --git a/docs/my-website/sidebars.js b/docs/my-website/sidebars.js
index b907a11304..4bb4125e0e 100644
--- a/docs/my-website/sidebars.js
+++ b/docs/my-website/sidebars.js
@@ -292,6 +292,7 @@ const sidebars = {
       items: [
         "projects/Docq.AI",
         "projects/OpenInterpreter",
+        "projects/dbally",
        "projects/FastREPL",
         "projects/PROMPTMETHEUS",
         "projects/Codium PR Agent",
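For readers of the two doc changes above, a minimal sketch (not part of the patch) of how the newly documented latency metrics could be read back from a running proxy; it assumes the proxy is listening on the default http://localhost:4000 with the Prometheus callback enabled.

import requests

# Fetch the Prometheus exposition text from the proxy and keep only the new latency histograms.
resp = requests.get("http://localhost:4000/metrics", timeout=10)
resp.raise_for_status()

for line in resp.text.splitlines():
    if line.startswith(
        ("litellm_request_total_latency_metric", "litellm_llm_api_latency_metric")
    ):
        print(line)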
diff --git a/litellm/_redis.py b/litellm/_redis.py
index d72016dcd9..23f82ed1a7 100644
--- a/litellm/_redis.py
+++ b/litellm/_redis.py
@@ -7,13 +7,17 @@
 #
 #  Thank you users! We ❤️ you! - Krrish & Ishaan
+import inspect
+
 # s/o [@Frank Colson](https://www.linkedin.com/in/frank-colson-422b9b183/) for this redis implementation
 import os
-import inspect
-import redis, litellm  # type: ignore
-import redis.asyncio as async_redis  # type: ignore
 from typing import List, Optional
+
+import redis  # type: ignore
+import redis.asyncio as async_redis  # type: ignore
+
+import litellm
+
 
 def _get_redis_kwargs():
     arg_spec = inspect.getfullargspec(redis.Redis)
@@ -51,6 +55,19 @@ def _get_redis_url_kwargs(client=None):
 
     return available_args
 
 
+def _get_redis_cluster_kwargs(client=None):
+    if client is None:
+        client = redis.Redis.from_url
+    arg_spec = inspect.getfullargspec(redis.RedisCluster)
+
+    # Only allow primitive arguments
+    exclude_args = {"self", "connection_pool", "retry", "host", "port", "startup_nodes"}
+
+    available_args = [x for x in arg_spec.args if x not in exclude_args]
+
+    return available_args
+
+
 def _get_redis_env_kwarg_mapping():
     PREFIX = "REDIS_"
 
@@ -124,6 +141,22 @@ def get_redis_client(**env_overrides):
                 url_kwargs[arg] = redis_kwargs[arg]
 
         return redis.Redis.from_url(**url_kwargs)
+
+    if "startup_nodes" in redis_kwargs:
+        from redis.cluster import ClusterNode
+
+        args = _get_redis_cluster_kwargs()
+        cluster_kwargs = {}
+        for arg in redis_kwargs:
+            if arg in args:
+                cluster_kwargs[arg] = redis_kwargs[arg]
+
+        new_startup_nodes: List[ClusterNode] = []
+
+        for item in redis_kwargs["startup_nodes"]:
+            new_startup_nodes.append(ClusterNode(**item))
+        redis_kwargs.pop("startup_nodes")
+        return redis.RedisCluster(startup_nodes=new_startup_nodes, **cluster_kwargs)
 
     return redis.Redis(**redis_kwargs)
 
@@ -143,6 +176,24 @@ def get_redis_async_client(**env_overrides):
         )
         return async_redis.Redis.from_url(**url_kwargs)
 
+    if "startup_nodes" in redis_kwargs:
+        from redis.cluster import ClusterNode
+
+        args = _get_redis_cluster_kwargs()
+        cluster_kwargs = {}
+        for arg in redis_kwargs:
+            if arg in args:
+                cluster_kwargs[arg] = redis_kwargs[arg]
+
+        new_startup_nodes: List[ClusterNode] = []
+
+        for item in redis_kwargs["startup_nodes"]:
+            new_startup_nodes.append(ClusterNode(**item))
+        redis_kwargs.pop("startup_nodes")
+        return async_redis.RedisCluster(
+            startup_nodes=new_startup_nodes, **cluster_kwargs
+        )
+
     return async_redis.Redis(
         socket_timeout=5,
         **redis_kwargs,
@@ -160,4 +211,5 @@ def get_redis_connection_pool(**env_overrides):
         connection_class = async_redis.SSLConnection
     redis_kwargs.pop("ssl", None)
     redis_kwargs["connection_class"] = connection_class
+    redis_kwargs.pop("startup_nodes", None)
     return async_redis.BlockingConnectionPool(timeout=5, **redis_kwargs)
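The cluster branches above ultimately hand redis-py a list of ClusterNode objects. A standalone sketch of that pattern, assuming a Redis Cluster node is reachable at 127.0.0.1:7001 (host and port are illustrative):

from redis.cluster import ClusterNode, RedisCluster

# Build ClusterNode objects from plain dicts, the same shape accepted via startup_nodes above.
startup_nodes = [{"host": "127.0.0.1", "port": 7001}]
nodes = [ClusterNode(**n) for n in startup_nodes]

rc = RedisCluster(startup_nodes=nodes)
rc.set("litellm-cluster-smoke-test", "ok")
print(rc.get("litellm-cluster-smoke-test"))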
diff --git a/litellm/caching.py b/litellm/caching.py
index 1c72160295..1b19fdf3e5 100644
--- a/litellm/caching.py
+++ b/litellm/caching.py
@@ -203,6 +203,7 @@ class RedisCache(BaseCache):
         password=None,
         redis_flush_size=100,
         namespace: Optional[str] = None,
+        startup_nodes: Optional[List] = None,  # for redis-cluster
         **kwargs,
     ):
         import redis
@@ -218,7 +219,8 @@ class RedisCache(BaseCache):
             redis_kwargs["port"] = port
         if password is not None:
             redis_kwargs["password"] = password
-
+        if startup_nodes is not None:
+            redis_kwargs["startup_nodes"] = startup_nodes
         ### HEALTH MONITORING OBJECT ###
         if kwargs.get("service_logger_obj", None) is not None and isinstance(
             kwargs["service_logger_obj"], ServiceLogging
@@ -246,7 +248,7 @@ class RedisCache(BaseCache):
         ### ASYNC HEALTH PING ###
         try:
             # asyncio.get_running_loop().create_task(self.ping())
-            result = asyncio.get_running_loop().create_task(self.ping())
+            _ = asyncio.get_running_loop().create_task(self.ping())
         except Exception as e:
             if "no running event loop" in str(e):
                 verbose_logger.debug(
@@ -2123,6 +2125,7 @@ class Cache:
         redis_semantic_cache_use_async=False,
         redis_semantic_cache_embedding_model="text-embedding-ada-002",
         redis_flush_size=None,
+        redis_startup_nodes: Optional[List] = None,
         disk_cache_dir=None,
         qdrant_api_base: Optional[str] = None,
         qdrant_api_key: Optional[str] = None,
@@ -2155,7 +2158,12 @@ class Cache:
         """
         if type == "redis":
             self.cache: BaseCache = RedisCache(
-                host, port, password, redis_flush_size, **kwargs
+                host,
+                port,
+                password,
+                redis_flush_size,
+                startup_nodes=redis_startup_nodes,
+                **kwargs,
             )
         elif type == "redis-semantic":
             self.cache = RedisSemanticCache(
diff --git a/litellm/integrations/prometheus.py b/litellm/integrations/prometheus.py
index 1471f59b75..659e5b193c 100644
--- a/litellm/integrations/prometheus.py
+++ b/litellm/integrations/prometheus.py
@@ -60,6 +60,25 @@ class PrometheusLogger(CustomLogger):
             ],
         )
 
+        # request latency metrics
+        self.litellm_request_total_latency_metric = Histogram(
+            "litellm_request_total_latency_metric",
+            "Total latency (seconds) for a request to LiteLLM",
+            labelnames=[
+                "model",
+                "litellm_call_id",
+            ],
+        )
+
+        self.litellm_llm_api_latency_metric = Histogram(
+            "litellm_llm_api_latency_metric",
+            "Total latency (seconds) for a model's LLM API call",
+            labelnames=[
+                "model",
+                "litellm_call_id",
+            ],
+        )
+
         # Counter for spend
         self.litellm_spend_metric = Counter(
             "litellm_spend_metric",
@@ -329,6 +348,25 @@ class PrometheusLogger(CustomLogger):
                 user_api_key, user_api_key_alias, model_group
             ).set(remaining_tokens)
 
+            # latency metrics
+            total_time: timedelta = kwargs.get("end_time") - kwargs.get("start_time")
+            total_time_seconds = total_time.total_seconds()
+            api_call_total_time: timedelta = kwargs.get("end_time") - kwargs.get(
+                "api_call_start_time"
+            )
+
+            api_call_total_time_seconds = api_call_total_time.total_seconds()
+
+            litellm_call_id = kwargs.get("litellm_call_id")
+
+            self.litellm_request_total_latency_metric.labels(
+                model, litellm_call_id
+            ).observe(total_time_seconds)
+
+            self.litellm_llm_api_latency_metric.labels(model, litellm_call_id).observe(
+                api_call_total_time_seconds
+            )
+
             # set x-ratelimit headers
             if premium_user is True:
                 self.set_llm_deployment_success_metrics(
diff --git a/litellm/litellm_core_utils/litellm_logging.py b/litellm/litellm_core_utils/litellm_logging.py
index d59f985584..dbf2a7d3e5 100644
--- a/litellm/litellm_core_utils/litellm_logging.py
+++ b/litellm/litellm_core_utils/litellm_logging.py
@@ -354,6 +354,8 @@ class Logging:
                         str(e)
                     )
                 )
+
+        self.model_call_details["api_call_start_time"] = datetime.datetime.now()
         # Input Integration Logging -> If you want to log the fact that an attempt to call the model was made
         callbacks = litellm.input_callback + self.dynamic_input_callbacks
         for callback in callbacks:
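A self-contained sketch of the latency bookkeeping introduced above, using prometheus_client directly; the metric name carries an example_ prefix and the timings are synthetic, so nothing here is claimed to be LiteLLM's exact runtime behaviour.

import datetime
import time

from prometheus_client import Histogram

# Histogram with the same label shape as the new metrics in this change.
example_llm_api_latency = Histogram(
    "example_llm_api_latency_metric",
    "Latency (seconds) for just the LLM API call",
    labelnames=["model", "litellm_call_id"],
)

api_call_start_time = datetime.datetime.now()
time.sleep(0.05)  # stand-in for the provider call
end_time = datetime.datetime.now()

api_call_seconds = (end_time - api_call_start_time).total_seconds()
example_llm_api_latency.labels("gpt-3.5-turbo", "example-call-id").observe(api_call_seconds)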
diff --git a/litellm/proxy/_new_secret_config.yaml b/litellm/proxy/_new_secret_config.yaml
index 2c888a4f30..96a0242a8e 100644
--- a/litellm/proxy/_new_secret_config.yaml
+++ b/litellm/proxy/_new_secret_config.yaml
@@ -2,12 +2,3 @@ model_list:
   - model_name: "*"
     litellm_params:
       model: "*"
-
-litellm_settings:
-  success_callback: ["s3"]
-  cache: true
-  s3_callback_params:
-    s3_bucket_name: mytestbucketlitellm # AWS Bucket Name for S3
-    s3_region_name: us-west-2 # AWS Region Name for S3
-    s3_aws_access_key_id: os.environ/AWS_ACCESS_KEY_ID # us os.environ/ to pass environment variables. This is AWS Access Key ID for S3
-    s3_aws_secret_access_key: os.environ/AWS_SECRET_ACCESS_KEY # AWS Secret Access Key for S3
diff --git a/litellm/proxy/auth/auth_checks.py b/litellm/proxy/auth/auth_checks.py
index cf5065c2e1..0f1452651e 100644
--- a/litellm/proxy/auth/auth_checks.py
+++ b/litellm/proxy/auth/auth_checks.py
@@ -66,7 +66,7 @@ def common_checks(
         raise Exception(
             f"Team={team_object.team_id} is blocked. Update via `/team/unblock` if your admin."
         )
-    # 2. If user can call model
+    # 2. If team can call model
     if (
         _model is not None
         and team_object is not None
@@ -74,7 +74,11 @@ def common_checks(
         and _model not in team_object.models
     ):
         # this means the team has access to all models on the proxy
-        if "all-proxy-models" in team_object.models:
+        if (
+            "all-proxy-models" in team_object.models
+            or "*" in team_object.models
+            or "openai/*" in team_object.models
+        ):
             # this means the team has access to all models on the proxy
             pass
         # check if the team model is an access_group
diff --git a/litellm/proxy/proxy_config.yaml b/litellm/proxy/proxy_config.yaml
index 6b831876f0..5f4072434f 100644
--- a/litellm/proxy/proxy_config.yaml
+++ b/litellm/proxy/proxy_config.yaml
@@ -1,8 +1,9 @@
 model_list:
   - model_name: fake-openai-endpoint
     litellm_params:
-      model: openai/gpt-4
-      api_key: os.environ/OPENAI_API_KEY
+      model: openai/fake
+      api_key: fake-key
+      api_base: https://exampleopenaiendpoint-production.up.railway.app/
 
 guardrails:
   - guardrail_name: "bedrock-pre-guard"
diff --git a/litellm/proxy/proxy_server.py b/litellm/proxy/proxy_server.py
index 0a9abc09ad..c793ffbe3f 100644
--- a/litellm/proxy/proxy_server.py
+++ b/litellm/proxy/proxy_server.py
@@ -1588,7 +1588,7 @@ class ProxyConfig:
                     verbose_proxy_logger.debug(  # noqa
                         f"{blue_color_code}Set Cache on LiteLLM Proxy: {vars(litellm.cache.cache)}{reset_color_code}"
                     )
-                elif key == "cache" and value == False:
+                elif key == "cache" and value is False:
                     pass
                 elif key == "guardrails":
                     if premium_user is not True:
@@ -2672,6 +2672,13 @@ def giveup(e):
         and isinstance(e.message, str)
         and "Max parallel request limit reached" in e.message
     )
+
+    if (
+        general_settings.get("disable_retry_on_max_parallel_request_limit_error")
+        is True
+    ):
+        return True  # giveup if queuing max parallel request limits is disabled
+
     if result:
         verbose_proxy_logger.info(json.dumps({"event": "giveup", "exception": str(e)}))
     return result
diff --git a/litellm/router.py b/litellm/router.py
index e261c1743d..7a938f5c4e 100644
--- a/litellm/router.py
+++ b/litellm/router.py
@@ -277,7 +277,8 @@ class Router:
             "local"  # default to an in-memory cache
         )
         redis_cache = None
-        cache_config = {}
+        cache_config: Dict[str, Any] = {}
+
         self.client_ttl = client_ttl
         if redis_url is not None or (
             redis_host is not None
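For context before the tests, a usage sketch for the new redis_startup_nodes option at the SDK level; it mirrors the unit test below and assumes a reachable Redis Cluster node at 127.0.0.1:7001 plus a valid OPENAI_API_KEY in the environment.

import litellm
from litellm import completion

# Point LiteLLM's cache at a Redis Cluster instead of a single Redis node.
litellm.cache = litellm.Cache(
    type="redis",
    redis_startup_nodes=[{"host": "127.0.0.1", "port": "7001"}],
)

response = completion(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "hello"}],
    caching=True,
)
print(response.choices[0].message.content)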
diff --git a/litellm/tests/test_caching.py b/litellm/tests/test_caching.py
index 64196e5c56..e474dff2e4 100644
--- a/litellm/tests/test_caching.py
+++ b/litellm/tests/test_caching.py
@@ -804,6 +804,38 @@ def test_redis_cache_completion_stream():
 # test_redis_cache_completion_stream()
 
 
+@pytest.mark.skip(reason="Local test. Requires running redis cluster locally.")
+@pytest.mark.asyncio
+async def test_redis_cache_cluster_init_unit_test():
+    try:
+        from redis.asyncio import RedisCluster as AsyncRedisCluster
+        from redis.cluster import RedisCluster
+
+        from litellm.caching import RedisCache
+
+        litellm.set_verbose = True
+
+        # List of startup nodes
+        startup_nodes = [
+            {"host": "127.0.0.1", "port": "7001"},
+        ]
+
+        resp = RedisCache(startup_nodes=startup_nodes)
+
+        assert isinstance(resp.redis_client, RedisCluster)
+        assert isinstance(resp.init_async_client(), AsyncRedisCluster)
+
+        resp = litellm.Cache(type="redis", redis_startup_nodes=startup_nodes)
+
+        assert isinstance(resp.cache, RedisCache)
+        assert isinstance(resp.cache.redis_client, RedisCluster)
+        assert isinstance(resp.cache.init_async_client(), AsyncRedisCluster)
+
+    except Exception as e:
+        print(f"{str(e)}\n\n{traceback.format_exc()}")
+        raise e
+
+
 @pytest.mark.asyncio
 async def test_redis_cache_acompletion_stream():
     try: