From 33c9c16388fbcf013e7a4538b087cfe5f5c323c1 Mon Sep 17 00:00:00 2001 From: Krrish Dholakia Date: Wed, 21 Aug 2024 15:01:52 -0700 Subject: [PATCH 01/11] feat(caching.py): redis cluster support Closes https://github.com/BerriAI/litellm/issues/4358 --- litellm/_redis.py | 58 +++++++++++++++++++++++++-- litellm/caching.py | 14 +++++-- litellm/proxy/_new_secret_config.yaml | 7 ++++ litellm/proxy/proxy_server.py | 2 +- litellm/tests/test_caching.py | 32 +++++++++++++++ 5 files changed, 106 insertions(+), 7 deletions(-) diff --git a/litellm/_redis.py b/litellm/_redis.py index d72016dcd9..23f82ed1a7 100644 --- a/litellm/_redis.py +++ b/litellm/_redis.py @@ -7,13 +7,17 @@ # # Thank you users! We ❤️ you! - Krrish & Ishaan +import inspect + # s/o [@Frank Colson](https://www.linkedin.com/in/frank-colson-422b9b183/) for this redis implementation import os -import inspect -import redis, litellm # type: ignore -import redis.asyncio as async_redis # type: ignore from typing import List, Optional +import redis # type: ignore +import redis.asyncio as async_redis # type: ignore + +import litellm + def _get_redis_kwargs(): arg_spec = inspect.getfullargspec(redis.Redis) @@ -51,6 +55,19 @@ def _get_redis_url_kwargs(client=None): return available_args +def _get_redis_cluster_kwargs(client=None): + if client is None: + client = redis.Redis.from_url + arg_spec = inspect.getfullargspec(redis.RedisCluster) + + # Only allow primitive arguments + exclude_args = {"self", "connection_pool", "retry", "host", "port", "startup_nodes"} + + available_args = [x for x in arg_spec.args if x not in exclude_args] + + return available_args + + def _get_redis_env_kwarg_mapping(): PREFIX = "REDIS_" @@ -124,6 +141,22 @@ def get_redis_client(**env_overrides): url_kwargs[arg] = redis_kwargs[arg] return redis.Redis.from_url(**url_kwargs) + + if "startup_nodes" in redis_kwargs: + from redis.cluster import ClusterNode + + args = _get_redis_cluster_kwargs() + cluster_kwargs = {} + for arg in redis_kwargs: + if arg in args: + cluster_kwargs[arg] = redis_kwargs[arg] + + new_startup_nodes: List[ClusterNode] = [] + + for item in redis_kwargs["startup_nodes"]: + new_startup_nodes.append(ClusterNode(**item)) + redis_kwargs.pop("startup_nodes") + return redis.RedisCluster(startup_nodes=new_startup_nodes, **cluster_kwargs) return redis.Redis(**redis_kwargs) @@ -143,6 +176,24 @@ def get_redis_async_client(**env_overrides): ) return async_redis.Redis.from_url(**url_kwargs) + if "startup_nodes" in redis_kwargs: + from redis.cluster import ClusterNode + + args = _get_redis_cluster_kwargs() + cluster_kwargs = {} + for arg in redis_kwargs: + if arg in args: + cluster_kwargs[arg] = redis_kwargs[arg] + + new_startup_nodes: List[ClusterNode] = [] + + for item in redis_kwargs["startup_nodes"]: + new_startup_nodes.append(ClusterNode(**item)) + redis_kwargs.pop("startup_nodes") + return async_redis.RedisCluster( + startup_nodes=new_startup_nodes, **cluster_kwargs + ) + return async_redis.Redis( socket_timeout=5, **redis_kwargs, @@ -160,4 +211,5 @@ def get_redis_connection_pool(**env_overrides): connection_class = async_redis.SSLConnection redis_kwargs.pop("ssl", None) redis_kwargs["connection_class"] = connection_class + redis_kwargs.pop("startup_nodes", None) return async_redis.BlockingConnectionPool(timeout=5, **redis_kwargs) diff --git a/litellm/caching.py b/litellm/caching.py index 1c72160295..1b19fdf3e5 100644 --- a/litellm/caching.py +++ b/litellm/caching.py @@ -203,6 +203,7 @@ class RedisCache(BaseCache): password=None, redis_flush_size=100, 
namespace: Optional[str] = None, + startup_nodes: Optional[List] = None, # for redis-cluster **kwargs, ): import redis @@ -218,7 +219,8 @@ class RedisCache(BaseCache): redis_kwargs["port"] = port if password is not None: redis_kwargs["password"] = password - + if startup_nodes is not None: + redis_kwargs["startup_nodes"] = startup_nodes ### HEALTH MONITORING OBJECT ### if kwargs.get("service_logger_obj", None) is not None and isinstance( kwargs["service_logger_obj"], ServiceLogging @@ -246,7 +248,7 @@ class RedisCache(BaseCache): ### ASYNC HEALTH PING ### try: # asyncio.get_running_loop().create_task(self.ping()) - result = asyncio.get_running_loop().create_task(self.ping()) + _ = asyncio.get_running_loop().create_task(self.ping()) except Exception as e: if "no running event loop" in str(e): verbose_logger.debug( @@ -2123,6 +2125,7 @@ class Cache: redis_semantic_cache_use_async=False, redis_semantic_cache_embedding_model="text-embedding-ada-002", redis_flush_size=None, + redis_startup_nodes: Optional[List] = None, disk_cache_dir=None, qdrant_api_base: Optional[str] = None, qdrant_api_key: Optional[str] = None, @@ -2155,7 +2158,12 @@ class Cache: """ if type == "redis": self.cache: BaseCache = RedisCache( - host, port, password, redis_flush_size, **kwargs + host, + port, + password, + redis_flush_size, + startup_nodes=redis_startup_nodes, + **kwargs, ) elif type == "redis-semantic": self.cache = RedisSemanticCache( diff --git a/litellm/proxy/_new_secret_config.yaml b/litellm/proxy/_new_secret_config.yaml index 96a0242a8e..10d608ec89 100644 --- a/litellm/proxy/_new_secret_config.yaml +++ b/litellm/proxy/_new_secret_config.yaml @@ -2,3 +2,10 @@ model_list: - model_name: "*" litellm_params: model: "*" + + +litellm_settings: + cache: True + cache_params: + type: redis + redis_startup_nodes: [{"host": "127.0.0.1", "port": "7001"}] diff --git a/litellm/proxy/proxy_server.py b/litellm/proxy/proxy_server.py index a9d0325d80..8986b587b7 100644 --- a/litellm/proxy/proxy_server.py +++ b/litellm/proxy/proxy_server.py @@ -1583,7 +1583,7 @@ class ProxyConfig: verbose_proxy_logger.debug( # noqa f"{blue_color_code}Set Cache on LiteLLM Proxy: {vars(litellm.cache.cache)}{reset_color_code}" ) - elif key == "cache" and value == False: + elif key == "cache" and value is False: pass elif key == "guardrails": if premium_user is not True: diff --git a/litellm/tests/test_caching.py b/litellm/tests/test_caching.py index 64196e5c56..5da883f4ae 100644 --- a/litellm/tests/test_caching.py +++ b/litellm/tests/test_caching.py @@ -804,6 +804,38 @@ def test_redis_cache_completion_stream(): # test_redis_cache_completion_stream() +# @pytest.mark.skip(reason="Local test. 
Requires running redis cluster locally.") +@pytest.mark.asyncio +async def test_redis_cache_cluster_init_unit_test(): + try: + from redis.asyncio import RedisCluster as AsyncRedisCluster + from redis.cluster import RedisCluster + + from litellm.caching import RedisCache + + litellm.set_verbose = True + + # List of startup nodes + startup_nodes = [ + {"host": "127.0.0.1", "port": "7001"}, + ] + + resp = RedisCache(startup_nodes=startup_nodes) + + assert isinstance(resp.redis_client, RedisCluster) + assert isinstance(resp.init_async_client(), AsyncRedisCluster) + + resp = litellm.Cache(type="redis", redis_startup_nodes=startup_nodes) + + assert isinstance(resp.cache, RedisCache) + assert isinstance(resp.cache.redis_client, RedisCluster) + assert isinstance(resp.cache.init_async_client(), AsyncRedisCluster) + + except Exception as e: + print(f"{str(e)}\n\n{traceback.format_exc()}") + raise e + + @pytest.mark.asyncio async def test_redis_cache_acompletion_stream(): try: From ea7968e22eb5e7f7d176da5cecf5aa95ff2a64e9 Mon Sep 17 00:00:00 2001 From: Krrish Dholakia Date: Wed, 21 Aug 2024 15:05:18 -0700 Subject: [PATCH 02/11] test(test_caching.py): skip local test --- litellm/tests/test_caching.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/litellm/tests/test_caching.py b/litellm/tests/test_caching.py index 5da883f4ae..e474dff2e4 100644 --- a/litellm/tests/test_caching.py +++ b/litellm/tests/test_caching.py @@ -804,7 +804,7 @@ def test_redis_cache_completion_stream(): # test_redis_cache_completion_stream() -# @pytest.mark.skip(reason="Local test. Requires running redis cluster locally.") +@pytest.mark.skip(reason="Local test. Requires running redis cluster locally.") @pytest.mark.asyncio async def test_redis_cache_cluster_init_unit_test(): try: From 45048ee006eccd9b89d271e08355ddeebb5340ea Mon Sep 17 00:00:00 2001 From: Krrish Dholakia Date: Wed, 21 Aug 2024 15:35:10 -0700 Subject: [PATCH 03/11] fix(router.py): fix linting error --- litellm/router.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/litellm/router.py b/litellm/router.py index e261c1743d..7a938f5c4e 100644 --- a/litellm/router.py +++ b/litellm/router.py @@ -277,7 +277,8 @@ class Router: "local" # default to an in-memory cache ) redis_cache = None - cache_config = {} + cache_config: Dict[str, Any] = {} + self.client_ttl = client_ttl if redis_url is not None or ( redis_host is not None From 8162208a5cdcb66e8707421980ca6f59428e540a Mon Sep 17 00:00:00 2001 From: Ishaan Jaff Date: Thu, 22 Aug 2024 13:52:03 -0700 Subject: [PATCH 04/11] track api_call_start_time --- litellm/litellm_core_utils/litellm_logging.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/litellm/litellm_core_utils/litellm_logging.py b/litellm/litellm_core_utils/litellm_logging.py index d59f985584..dbf2a7d3e5 100644 --- a/litellm/litellm_core_utils/litellm_logging.py +++ b/litellm/litellm_core_utils/litellm_logging.py @@ -354,6 +354,8 @@ class Logging: str(e) ) ) + + self.model_call_details["api_call_start_time"] = datetime.datetime.now() # Input Integration Logging -> If you want to log the fact that an attempt to call the model was made callbacks = litellm.input_callback + self.dynamic_input_callbacks for callback in callbacks: From c719c375f70f850816cfeb1b781164da24ba5561 Mon Sep 17 00:00:00 2001 From: Ishaan Jaff Date: Thu, 22 Aug 2024 13:58:10 -0700 Subject: [PATCH 05/11] track litellm_request_latency_metric --- litellm/integrations/prometheus.py | 38 ++++++++++++++++++++++++++++++ litellm/proxy/proxy_config.yaml | 4 
+++- 2 files changed, 41 insertions(+), 1 deletion(-) diff --git a/litellm/integrations/prometheus.py b/litellm/integrations/prometheus.py index 1471f59b75..dadafa80e3 100644 --- a/litellm/integrations/prometheus.py +++ b/litellm/integrations/prometheus.py @@ -60,6 +60,25 @@ class PrometheusLogger(CustomLogger): ], ) + # request latency metrics + self.litellm_request_latency_metric = Histogram( + "litellm_request_latency_metric", + "Total latency (seconds) for a request to LiteLLM", + labelnames=[ + "model", + "litellm_call_id", + ], + ) + + self.litellm_deployment_latency_metric = Histogram( + "litellm_deployment_latency_metric", + "Total latency (seconds) for a models LLM API call", + labelnames=[ + "model", + "litellm_call_id", + ], + ) + # Counter for spend self.litellm_spend_metric = Counter( "litellm_spend_metric", @@ -329,6 +348,25 @@ class PrometheusLogger(CustomLogger): user_api_key, user_api_key_alias, model_group ).set(remaining_tokens) + # latency metrics + total_time: timedelta = kwargs.get("end_time") - kwargs.get("start_time") + total_time_seconds = total_time.total_seconds() + api_call_total_time: timedelta = kwargs.get("end_time") - kwargs.get( + "api_call_start_time" + ) + + api_call_total_time_seconds = api_call_total_time.total_seconds() + + litellm_call_id = kwargs.get("litellm_call_id") + + self.litellm_request_latency_metric.labels(model, litellm_call_id).observe( + total_time_seconds + ) + + self.litellm_deployment_latency_metric.labels(model, litellm_call_id).observe( + api_call_total_time_seconds + ) + # set x-ratelimit headers if premium_user is True: self.set_llm_deployment_success_metrics( diff --git a/litellm/proxy/proxy_config.yaml b/litellm/proxy/proxy_config.yaml index 65c7f70525..7c524eb18a 100644 --- a/litellm/proxy/proxy_config.yaml +++ b/litellm/proxy/proxy_config.yaml @@ -4,7 +4,9 @@ model_list: model: openai/fake api_key: fake-key api_base: https://exampleopenaiendpoint-production.up.railway.app/ - +litellm_settings: + success_callback: ["prometheus"] + failure_callback: ["prometheus"] guardrails: - guardrail_name: "lakera-pre-guard" litellm_params: From 9476582fb7091d418d8df1ed918d0db7e21c701c Mon Sep 17 00:00:00 2001 From: Ishaan Jaff Date: Thu, 22 Aug 2024 14:03:00 -0700 Subject: [PATCH 06/11] update promtheus metric names --- litellm/integrations/prometheus.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/litellm/integrations/prometheus.py b/litellm/integrations/prometheus.py index dadafa80e3..659e5b193c 100644 --- a/litellm/integrations/prometheus.py +++ b/litellm/integrations/prometheus.py @@ -61,8 +61,8 @@ class PrometheusLogger(CustomLogger): ) # request latency metrics - self.litellm_request_latency_metric = Histogram( - "litellm_request_latency_metric", + self.litellm_request_total_latency_metric = Histogram( + "litellm_request_total_latency_metric", "Total latency (seconds) for a request to LiteLLM", labelnames=[ "model", @@ -70,8 +70,8 @@ class PrometheusLogger(CustomLogger): ], ) - self.litellm_deployment_latency_metric = Histogram( - "litellm_deployment_latency_metric", + self.litellm_llm_api_latency_metric = Histogram( + "litellm_llm_api_latency_metric", "Total latency (seconds) for a models LLM API call", labelnames=[ "model", @@ -359,11 +359,11 @@ class PrometheusLogger(CustomLogger): litellm_call_id = kwargs.get("litellm_call_id") - self.litellm_request_latency_metric.labels(model, litellm_call_id).observe( - total_time_seconds - ) + self.litellm_request_total_latency_metric.labels( + model, 
litellm_call_id + ).observe(total_time_seconds) - self.litellm_deployment_latency_metric.labels(model, litellm_call_id).observe( + self.litellm_llm_api_latency_metric.labels(model, litellm_call_id).observe( api_call_total_time_seconds ) From 55d35c2ea12a4e82fd2a2fc416aec1bc3b669e34 Mon Sep 17 00:00:00 2001 From: Ishaan Jaff Date: Thu, 22 Aug 2024 14:06:14 -0700 Subject: [PATCH 07/11] add prom docs for Request Latency Metrics --- docs/my-website/docs/proxy/prometheus.md | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/docs/my-website/docs/proxy/prometheus.md b/docs/my-website/docs/proxy/prometheus.md index 4b913d2e82..10e6456c21 100644 --- a/docs/my-website/docs/proxy/prometheus.md +++ b/docs/my-website/docs/proxy/prometheus.md @@ -68,6 +68,15 @@ http://localhost:4000/metrics | `litellm_total_tokens` | input + output tokens per `"user", "key", "model", "team", "end-user"` | | `litellm_llm_api_failed_requests_metric` | Number of failed LLM API requests per `"user", "key", "model", "team", "end-user"` | +### Request Latency Metrics + +| Metric Name | Description | +|----------------------|--------------------------------------| +| `litellm_request_total_latency_metric` | Total latency (seconds) for a request to LiteLLM Proxy Server - tracked for labels `litellm_call_id`, `model` | +| `litellm_llm_api_latency_metric` | latency (seconds) for just the LLM API call - tracked for labels `litellm_call_id`, `model` | + + + ### LLM API / Provider Metrics | Metric Name | Description | From 2568b5536ddf047d89f2a24f561af0beae091824 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Pstr=C4=85g?= Date: Thu, 22 Aug 2024 23:21:40 +0200 Subject: [PATCH 08/11] add dbally project --- docs/my-website/docs/projects/dbally.md | 3 +++ docs/my-website/sidebars.js | 1 + 2 files changed, 4 insertions(+) create mode 100644 docs/my-website/docs/projects/dbally.md diff --git a/docs/my-website/docs/projects/dbally.md b/docs/my-website/docs/projects/dbally.md new file mode 100644 index 0000000000..688f1ab0ff --- /dev/null +++ b/docs/my-website/docs/projects/dbally.md @@ -0,0 +1,3 @@ +Efficient, consistent and secure library for querying structured data with natural language. Query any database with over 100 LLMs ❤️ 🚅. 
+ +🔗 [GitHub](https://github.com/deepsense-ai/db-ally) diff --git a/docs/my-website/sidebars.js b/docs/my-website/sidebars.js index 54df1f3e35..368609d4bf 100644 --- a/docs/my-website/sidebars.js +++ b/docs/my-website/sidebars.js @@ -261,6 +261,7 @@ const sidebars = { items: [ "projects/Docq.AI", "projects/OpenInterpreter", + "projects/dbally", "projects/FastREPL", "projects/PROMPTMETHEUS", "projects/Codium PR Agent", From 73a5921262d9749072fdb0e1b7b694bf3568ee66 Mon Sep 17 00:00:00 2001 From: Krrish Dholakia Date: Thu, 22 Aug 2024 16:37:46 -0700 Subject: [PATCH 09/11] feat(auth_checks.py): allow team to call all models, when explicitly set via /* --- litellm/proxy/_new_secret_config.yaml | 9 --------- litellm/proxy/auth/auth_checks.py | 8 ++++++-- 2 files changed, 6 insertions(+), 11 deletions(-) diff --git a/litellm/proxy/_new_secret_config.yaml b/litellm/proxy/_new_secret_config.yaml index 66530c7db0..96a0242a8e 100644 --- a/litellm/proxy/_new_secret_config.yaml +++ b/litellm/proxy/_new_secret_config.yaml @@ -2,12 +2,3 @@ model_list: - model_name: "*" litellm_params: model: "*" - -litellm_settings: - success_callback: ["s3"] - cache: true - s3_callback_params: - s3_bucket_name: mytestbucketlitellm # AWS Bucket Name for S3 - s3_region_name: us-west-2 # AWS Region Name for S3 - s3_aws_access_key_id: os.environ/AWS_ACCESS_KEY_ID # us os.environ/ to pass environment variables. This is AWS Access Key ID for S3 - s3_aws_secret_access_key: os.environ/AWS_SECRET_ACCESS_KEY # AWS Secret Access Key for S3 \ No newline at end of file diff --git a/litellm/proxy/auth/auth_checks.py b/litellm/proxy/auth/auth_checks.py index cf5065c2e1..0f1452651e 100644 --- a/litellm/proxy/auth/auth_checks.py +++ b/litellm/proxy/auth/auth_checks.py @@ -66,7 +66,7 @@ def common_checks( raise Exception( f"Team={team_object.team_id} is blocked. Update via `/team/unblock` if your admin." ) - # 2. If user can call model + # 2. 
If team can call model if ( _model is not None and team_object is not None @@ -74,7 +74,11 @@ def common_checks( and _model not in team_object.models ): # this means the team has access to all models on the proxy - if "all-proxy-models" in team_object.models: + if ( + "all-proxy-models" in team_object.models + or "*" in team_object.models + or "openai/*" in team_object.models + ): # this means the team has access to all models on the proxy pass # check if the team model is an access_group From b0706a6f8f4912191d6e28206c613dd8250fa228 Mon Sep 17 00:00:00 2001 From: Krrish Dholakia Date: Thu, 22 Aug 2024 16:49:52 -0700 Subject: [PATCH 10/11] fix(proxy_server.py): expose flag to disable retries when max parallel request limit is hit --- docs/my-website/docs/proxy/configs.md | 1 + litellm/proxy/proxy_server.py | 7 +++++++ 2 files changed, 8 insertions(+) diff --git a/docs/my-website/docs/proxy/configs.md b/docs/my-website/docs/proxy/configs.md index 19c1f7902d..d08d6324da 100644 --- a/docs/my-website/docs/proxy/configs.md +++ b/docs/my-website/docs/proxy/configs.md @@ -727,6 +727,7 @@ general_settings: "completion_model": "string", "disable_spend_logs": "boolean", # turn off writing each transaction to the db "disable_master_key_return": "boolean", # turn off returning master key on UI (checked on '/user/info' endpoint) + "disable_retry_on_max_parallel_request_limit_error": "boolean", # turn off retries when max parallel request limit is reached "disable_reset_budget": "boolean", # turn off reset budget scheduled task "disable_adding_master_key_hash_to_db": "boolean", # turn off storing master key hash in db, for spend tracking "enable_jwt_auth": "boolean", # allow proxy admin to auth in via jwt tokens with 'litellm_proxy_admin' in claims diff --git a/litellm/proxy/proxy_server.py b/litellm/proxy/proxy_server.py index 6bd528def8..c793ffbe3f 100644 --- a/litellm/proxy/proxy_server.py +++ b/litellm/proxy/proxy_server.py @@ -2672,6 +2672,13 @@ def giveup(e): and isinstance(e.message, str) and "Max parallel request limit reached" in e.message ) + + if ( + general_settings.get("disable_retry_on_max_parallel_request_limit_error") + is True + ): + return True # giveup if queuing max parallel request limits is disabled + if result: verbose_proxy_logger.info(json.dumps({"event": "giveup", "exception": str(e)})) return result From 849cfa9bdee82ad0c85633aa4be608fa973126f9 Mon Sep 17 00:00:00 2001 From: Krrish Dholakia Date: Thu, 22 Aug 2024 17:12:52 -0700 Subject: [PATCH 11/11] docs(configs.md): add global_max_parallel_requests to docs --- docs/my-website/docs/proxy/configs.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docs/my-website/docs/proxy/configs.md b/docs/my-website/docs/proxy/configs.md index d08d6324da..a50b3f6460 100644 --- a/docs/my-website/docs/proxy/configs.md +++ b/docs/my-website/docs/proxy/configs.md @@ -752,7 +752,8 @@ general_settings: }, "otel": true, "custom_auth": "string", - "max_parallel_requests": 0, + "max_parallel_requests": 0, # the max parallel requests allowed per deployment + "global_max_parallel_requests": 0, # the max parallel requests allowed on the proxy all up "infer_model_from_keys": true, "background_health_checks": true, "health_check_interval": 300,
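As a quick reference for the caching change that headlines this series (PATCH 01), the sketch below shows how the new Redis-cluster support could be exercised from the Python SDK. It mirrors the unit test added in `litellm/tests/test_caching.py`: the startup node address is the same placeholder the test uses (`127.0.0.1:7001`), while the model name and prompt are illustrative assumptions, not part of the patches.

```python
import litellm

# Point the global LiteLLM cache at a Redis cluster by passing startup nodes,
# as introduced in PATCH 01 (Cache -> RedisCache -> redis.RedisCluster).
litellm.cache = litellm.Cache(
    type="redis",
    redis_startup_nodes=[{"host": "127.0.0.1", "port": "7001"}],  # placeholder node, same as the unit test
)

# With litellm.cache set, completion calls can read/write through the cluster client.
response = litellm.completion(
    model="gpt-3.5-turbo",  # illustrative model name, not taken from the patches
    messages=[{"role": "user", "content": "Hello from a redis-cluster-backed cache"}],
    caching=True,
)
```

The proxy-side equivalent is already shown in PATCH 01's `_new_secret_config.yaml` hunk: enable `cache: True` and pass `redis_startup_nodes` under `cache_params`.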