Merge branch 'main' into litellm_add_bedrock_guardrails

Commit 70f9e41ed9 by Ishaan Jaff, 2024-08-22 17:28:49 -07:00 (committed via GitHub)
14 changed files with 173 additions and 22 deletions


@ -0,0 +1,3 @@
Efficient, consistent and secure library for querying structured data with natural language. Query any database with over 100 LLMs ❤️ 🚅.
🔗 [GitHub](https://github.com/deepsense-ai/db-ally)


@ -727,6 +727,7 @@ general_settings:
"completion_model": "string",
"disable_spend_logs": "boolean", # turn off writing each transaction to the db
"disable_master_key_return": "boolean", # turn off returning master key on UI (checked on '/user/info' endpoint)
"disable_retry_on_max_parallel_request_limit_error": "boolean", # turn off retries when max parallel request limit is reached
"disable_reset_budget": "boolean", # turn off reset budget scheduled task
"disable_adding_master_key_hash_to_db": "boolean", # turn off storing master key hash in db, for spend tracking
"enable_jwt_auth": "boolean", # allow proxy admin to auth in via jwt tokens with 'litellm_proxy_admin' in claims
@ -751,7 +752,8 @@ general_settings:
},
"otel": true,
"custom_auth": "string",
"max_parallel_requests": 0,
"max_parallel_requests": 0, # the max parallel requests allowed per deployment
"global_max_parallel_requests": 0, # the max parallel requests allowed on the proxy all up
"infer_model_from_keys": true,
"background_health_checks": true,
"health_check_interval": 300,


@ -68,6 +68,15 @@ http://localhost:4000/metrics
| `litellm_total_tokens` | input + output tokens per `"user", "key", "model", "team", "end-user"` |
| `litellm_llm_api_failed_requests_metric` | Number of failed LLM API requests per `"user", "key", "model", "team", "end-user"` |
### Request Latency Metrics
| Metric Name | Description |
|----------------------|--------------------------------------|
| `litellm_request_total_latency_metric` | Total latency (seconds) for a request to LiteLLM Proxy Server - tracked for labels `litellm_call_id`, `model` |
| `litellm_llm_api_latency_metric` | Latency (seconds) for just the LLM API call - tracked for labels `litellm_call_id`, `model` |
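
As a rough illustration of how these histograms are shaped, the sketch below defines and observes a metric with the same name and labels using `prometheus_client`; the model name, call id, and latency value are made up.

```python
from prometheus_client import Histogram

# Same metric name and labels as the table above; the values are made up.
litellm_request_total_latency_metric = Histogram(
    "litellm_request_total_latency_metric",
    "Total latency (seconds) for a request to LiteLLM Proxy Server",
    labelnames=["model", "litellm_call_id"],
)

# Record one request that took 1.42 seconds end-to-end.
litellm_request_total_latency_metric.labels(
    "gpt-3.5-turbo", "example-call-id"
).observe(1.42)
```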
### LLM API / Provider Metrics
| Metric Name | Description |


@ -292,6 +292,7 @@ const sidebars = {
items: [
"projects/Docq.AI",
"projects/OpenInterpreter",
"projects/dbally",
"projects/FastREPL",
"projects/PROMPTMETHEUS",
"projects/Codium PR Agent",


@ -7,13 +7,17 @@
#
# Thank you users! We ❤️ you! - Krrish & Ishaan
import inspect

# s/o [@Frank Colson](https://www.linkedin.com/in/frank-colson-422b9b183/) for this redis implementation
import os
from typing import List, Optional

import redis  # type: ignore
import redis.asyncio as async_redis  # type: ignore

import litellm
def _get_redis_kwargs():
    arg_spec = inspect.getfullargspec(redis.Redis)
@ -51,6 +55,19 @@ def _get_redis_url_kwargs(client=None):
    return available_args
def _get_redis_cluster_kwargs(client=None):
    if client is None:
        client = redis.Redis.from_url

    arg_spec = inspect.getfullargspec(redis.RedisCluster)

    # Only allow primitive arguments
    exclude_args = {"self", "connection_pool", "retry", "host", "port", "startup_nodes"}
    available_args = [x for x in arg_spec.args if x not in exclude_args]

    return available_args
def _get_redis_env_kwarg_mapping():
PREFIX = "REDIS_"
@ -124,6 +141,22 @@ def get_redis_client(**env_overrides):
                url_kwargs[arg] = redis_kwargs[arg]

        return redis.Redis.from_url(**url_kwargs)

    if "startup_nodes" in redis_kwargs:
        from redis.cluster import ClusterNode

        args = _get_redis_cluster_kwargs()
        cluster_kwargs = {}
        for arg in redis_kwargs:
            if arg in args:
                cluster_kwargs[arg] = redis_kwargs[arg]

        new_startup_nodes: List[ClusterNode] = []
        for item in redis_kwargs["startup_nodes"]:
            new_startup_nodes.append(ClusterNode(**item))

        redis_kwargs.pop("startup_nodes")
        return redis.RedisCluster(startup_nodes=new_startup_nodes, **cluster_kwargs)

    return redis.Redis(**redis_kwargs)
@ -143,6 +176,24 @@ def get_redis_async_client(**env_overrides):
                )

        return async_redis.Redis.from_url(**url_kwargs)

    if "startup_nodes" in redis_kwargs:
        from redis.cluster import ClusterNode

        args = _get_redis_cluster_kwargs()
        cluster_kwargs = {}
        for arg in redis_kwargs:
            if arg in args:
                cluster_kwargs[arg] = redis_kwargs[arg]

        new_startup_nodes: List[ClusterNode] = []
        for item in redis_kwargs["startup_nodes"]:
            new_startup_nodes.append(ClusterNode(**item))

        redis_kwargs.pop("startup_nodes")
        return async_redis.RedisCluster(
            startup_nodes=new_startup_nodes, **cluster_kwargs
        )

    return async_redis.Redis(
        socket_timeout=5,
        **redis_kwargs,
@ -160,4 +211,5 @@ def get_redis_connection_pool(**env_overrides):
        connection_class = async_redis.SSLConnection
        redis_kwargs.pop("ssl", None)
        redis_kwargs["connection_class"] = connection_class
    redis_kwargs.pop("startup_nodes", None)
    return async_redis.BlockingConnectionPool(timeout=5, **redis_kwargs)
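
A hedged usage sketch of the cluster branch added above: when `startup_nodes` is present in the resolved Redis kwargs, client creation should go through `redis.RedisCluster` with `ClusterNode` objects instead of a plain `redis.Redis`. The node addresses are placeholders.

```python
# Placeholder node addresses for a locally running Redis Cluster.
startup_nodes = [
    {"host": "127.0.0.1", "port": "7001"},
    {"host": "127.0.0.1", "port": "7002"},
]

# With the change above, this should return a redis.RedisCluster client.
client = get_redis_client(startup_nodes=startup_nodes)
```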


@ -203,6 +203,7 @@ class RedisCache(BaseCache):
        password=None,
        redis_flush_size=100,
        namespace: Optional[str] = None,
        startup_nodes: Optional[List] = None,  # for redis-cluster
        **kwargs,
    ):
        import redis
@ -218,7 +219,8 @@ class RedisCache(BaseCache):
redis_kwargs["port"] = port
if password is not None:
redis_kwargs["password"] = password
if startup_nodes is not None:
redis_kwargs["startup_nodes"] = startup_nodes
### HEALTH MONITORING OBJECT ###
if kwargs.get("service_logger_obj", None) is not None and isinstance(
kwargs["service_logger_obj"], ServiceLogging
@ -246,7 +248,7 @@ class RedisCache(BaseCache):
        ### ASYNC HEALTH PING ###
        try:
            # asyncio.get_running_loop().create_task(self.ping())
            _ = asyncio.get_running_loop().create_task(self.ping())
        except Exception as e:
            if "no running event loop" in str(e):
                verbose_logger.debug(
@ -2123,6 +2125,7 @@ class Cache:
        redis_semantic_cache_use_async=False,
        redis_semantic_cache_embedding_model="text-embedding-ada-002",
        redis_flush_size=None,
        redis_startup_nodes: Optional[List] = None,
        disk_cache_dir=None,
        qdrant_api_base: Optional[str] = None,
        qdrant_api_key: Optional[str] = None,
@ -2155,7 +2158,12 @@ class Cache:
"""
if type == "redis":
self.cache: BaseCache = RedisCache(
host, port, password, redis_flush_size, **kwargs
host,
port,
password,
redis_flush_size,
startup_nodes=redis_startup_nodes,
**kwargs,
)
elif type == "redis-semantic":
self.cache = RedisSemanticCache(
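
For context, a brief sketch of how the new `redis_startup_nodes` parameter might be used when enabling caching; it is forwarded to `RedisCache` as `startup_nodes`, and the node address is a placeholder. This mirrors the cluster unit test added at the bottom of this commit.

```python
import litellm

# Enable caching against a Redis Cluster; startup_nodes is forwarded to RedisCache.
litellm.cache = litellm.Cache(
    type="redis",
    redis_startup_nodes=[{"host": "127.0.0.1", "port": "7001"}],
)
```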


@ -60,6 +60,25 @@ class PrometheusLogger(CustomLogger):
            ],
        )

        # request latency metrics
        self.litellm_request_total_latency_metric = Histogram(
            "litellm_request_total_latency_metric",
            "Total latency (seconds) for a request to LiteLLM",
            labelnames=[
                "model",
                "litellm_call_id",
            ],
        )

        self.litellm_llm_api_latency_metric = Histogram(
            "litellm_llm_api_latency_metric",
            "Total latency (seconds) for a model's LLM API call",
            labelnames=[
                "model",
                "litellm_call_id",
            ],
        )

        # Counter for spend
        self.litellm_spend_metric = Counter(
            "litellm_spend_metric",
@ -329,6 +348,25 @@ class PrometheusLogger(CustomLogger):
            user_api_key, user_api_key_alias, model_group
        ).set(remaining_tokens)

        # latency metrics
        total_time: timedelta = kwargs.get("end_time") - kwargs.get("start_time")
        total_time_seconds = total_time.total_seconds()
        api_call_total_time: timedelta = kwargs.get("end_time") - kwargs.get(
            "api_call_start_time"
        )
        api_call_total_time_seconds = api_call_total_time.total_seconds()

        litellm_call_id = kwargs.get("litellm_call_id")

        self.litellm_request_total_latency_metric.labels(
            model, litellm_call_id
        ).observe(total_time_seconds)

        self.litellm_llm_api_latency_metric.labels(model, litellm_call_id).observe(
            api_call_total_time_seconds
        )

        # set x-ratelimit headers
        if premium_user is True:
            self.set_llm_deployment_success_metrics(
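
The latency values observed above come from subtracting timestamps carried in the logging kwargs; a small worked example with made-up timestamps shows what each histogram ends up recording.

```python
import datetime

# Made-up timestamps standing in for the values found in `kwargs`.
start_time = datetime.datetime(2024, 8, 22, 10, 0, 0)           # request received
api_call_start_time = datetime.datetime(2024, 8, 22, 10, 0, 1)  # LLM API call begins
end_time = datetime.datetime(2024, 8, 22, 10, 0, 3)             # response returned

total_time_seconds = (end_time - start_time).total_seconds()                    # 3.0
api_call_total_time_seconds = (end_time - api_call_start_time).total_seconds()  # 2.0
```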


@ -354,6 +354,8 @@ class Logging:
                            str(e)
                        )
                    )
        self.model_call_details["api_call_start_time"] = datetime.datetime.now()

        # Input Integration Logging -> If you want to log the fact that an attempt to call the model was made
        callbacks = litellm.input_callback + self.dynamic_input_callbacks
        for callback in callbacks:


@ -2,12 +2,3 @@ model_list:
- model_name: "*"
litellm_params:
model: "*"
litellm_settings:
success_callback: ["s3"]
cache: true
s3_callback_params:
s3_bucket_name: mytestbucketlitellm # AWS Bucket Name for S3
s3_region_name: us-west-2 # AWS Region Name for S3
s3_aws_access_key_id: os.environ/AWS_ACCESS_KEY_ID # us os.environ/<variable name> to pass environment variables. This is AWS Access Key ID for S3
s3_aws_secret_access_key: os.environ/AWS_SECRET_ACCESS_KEY # AWS Secret Access Key for S3


@ -66,7 +66,7 @@ def common_checks(
        raise Exception(
            f"Team={team_object.team_id} is blocked. Update via `/team/unblock` if your admin."
        )
    # 2. If team can call model
    if (
        _model is not None
        and team_object is not None
@ -74,7 +74,11 @@ def common_checks(
        and _model not in team_object.models
    ):
        # this means the team has access to all models on the proxy
        if (
            "all-proxy-models" in team_object.models
            or "*" in team_object.models
            or "openai/*" in team_object.models
        ):
            # this means the team has access to all models on the proxy
            pass
        # check if the team model is an access_group


@ -1,8 +1,9 @@
model_list:
  - model_name: fake-openai-endpoint
    litellm_params:
      model: openai/fake
      api_key: fake-key
      api_base: https://exampleopenaiendpoint-production.up.railway.app/

guardrails:
  - guardrail_name: "bedrock-pre-guard"


@ -1588,7 +1588,7 @@ class ProxyConfig:
                verbose_proxy_logger.debug(  # noqa
                    f"{blue_color_code}Set Cache on LiteLLM Proxy: {vars(litellm.cache.cache)}{reset_color_code}"
                )
            elif key == "cache" and value is False:
                pass
            elif key == "guardrails":
                if premium_user is not True:
@ -2672,6 +2672,13 @@ def giveup(e):
        and isinstance(e.message, str)
        and "Max parallel request limit reached" in e.message
    )

    if (
        general_settings.get("disable_retry_on_max_parallel_request_limit_error")
        is True
    ):
        return True  # giveup if queuing max parallel request limits is disabled

    if result:
        verbose_proxy_logger.info(json.dumps({"event": "giveup", "exception": str(e)}))

    return result


@ -277,7 +277,8 @@ class Router:
"local" # default to an in-memory cache
)
redis_cache = None
cache_config = {}
cache_config: Dict[str, Any] = {}
self.client_ttl = client_ttl
if redis_url is not None or (
redis_host is not None


@ -804,6 +804,38 @@ def test_redis_cache_completion_stream():
# test_redis_cache_completion_stream()


@pytest.mark.skip(reason="Local test. Requires running redis cluster locally.")
@pytest.mark.asyncio
async def test_redis_cache_cluster_init_unit_test():
    try:
        from redis.asyncio import RedisCluster as AsyncRedisCluster
        from redis.cluster import RedisCluster

        from litellm.caching import RedisCache

        litellm.set_verbose = True

        # List of startup nodes
        startup_nodes = [
            {"host": "127.0.0.1", "port": "7001"},
        ]

        resp = RedisCache(startup_nodes=startup_nodes)

        assert isinstance(resp.redis_client, RedisCluster)
        assert isinstance(resp.init_async_client(), AsyncRedisCluster)

        resp = litellm.Cache(type="redis", redis_startup_nodes=startup_nodes)

        assert isinstance(resp.cache, RedisCache)
        assert isinstance(resp.cache.redis_client, RedisCluster)
        assert isinstance(resp.cache.init_async_client(), AsyncRedisCluster)
    except Exception as e:
        print(f"{str(e)}\n\n{traceback.format_exc()}")
        raise e


@pytest.mark.asyncio
async def test_redis_cache_acompletion_stream():
    try: