fix(proxy_server.py): fix tpm/rpm limiting for jwt auth

fixes tpm/rpm limiting for jwt auth and adds unit tests covering it
Krrish Dholakia 2024-03-28 21:19:34 -07:00
parent c15ba368e7
commit 5a117490ec
3 changed files with 316 additions and 135 deletions

@@ -39,6 +39,11 @@ class _PROXY_MaxParallelRequestsHandler(CustomLogger):
             key=request_count_api_key
         )  # {"current_requests": 1, "current_tpm": 1, "current_rpm": 10}
         if current is None:
+            if max_parallel_requests == 0 or tpm_limit == 0 or rpm_limit == 0:
+                # base case
+                raise HTTPException(
+                    status_code=429, detail="Max parallel request limit reached."
+                )
             new_val = {
                 "current_requests": 1,
                 "current_tpm": 0,
@@ -81,9 +86,6 @@ class _PROXY_MaxParallelRequestsHandler(CustomLogger):
         if rpm_limit is None:
             rpm_limit = sys.maxsize
-        if api_key is None:
-            return
         self.user_api_key_cache = cache  # save the api key cache for updating the value
         # ------------
         # Setup values
@@ -94,9 +96,11 @@ class _PROXY_MaxParallelRequestsHandler(CustomLogger):
         current_minute = datetime.now().strftime("%M")
         precise_minute = f"{current_date}-{current_hour}-{current_minute}"
+        if api_key is not None:
             request_count_api_key = f"{api_key}::{precise_minute}::request_count"
             # CHECK IF REQUEST ALLOWED for key
             current = cache.get_cache(
                 key=request_count_api_key
             )  # {"current_requests": 1, "current_tpm": 1, "current_rpm": 10}
@@ -137,10 +141,13 @@ class _PROXY_MaxParallelRequestsHandler(CustomLogger):
         # check if REQUEST ALLOWED for user_id
         user_id = user_api_key_dict.user_id
+        if user_id is not None:
             _user_id_rate_limits = user_api_key_dict.user_id_rate_limits
             # get user tpm/rpm limits
-        if _user_id_rate_limits is not None and isinstance(_user_id_rate_limits, dict):
+            if _user_id_rate_limits is not None and isinstance(
+                _user_id_rate_limits, dict
+            ):
                 user_tpm_limit = _user_id_rate_limits.get("tpm_limit", None)
                 user_rpm_limit = _user_id_rate_limits.get("rpm_limit", None)
             if user_tpm_limit is None:
@@ -166,7 +173,9 @@ class _PROXY_MaxParallelRequestsHandler(CustomLogger):
         # TEAM RATE LIMITS
         ## get team tpm/rpm limits
         team_id = user_api_key_dict.team_id
+        if team_id is not None:
             team_tpm_limit = getattr(user_api_key_dict, "team_tpm_limit", sys.maxsize)
             if team_tpm_limit is None:
                 team_tpm_limit = sys.maxsize
             team_rpm_limit = getattr(user_api_key_dict, "team_rpm_limit", sys.maxsize)
@@ -187,11 +196,12 @@ class _PROXY_MaxParallelRequestsHandler(CustomLogger):
                 cache=cache,
                 data=data,
                 call_type=call_type,
-                max_parallel_requests=sys.maxsize,  # TODO: Support max parallel requests for a user
+                max_parallel_requests=sys.maxsize,  # TODO: Support max parallel requests for a team
                 request_count_api_key=request_count_api_key,
                 tpm_limit=team_tpm_limit,
                 rpm_limit=team_rpm_limit,
             )

         return

     async def async_log_success_event(self, kwargs, response_obj, start_time, end_time):
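
The key, user, and team checks above all share one counter layout in the cache: a per-entity, per-minute key mapping to a dict of in-flight requests, tokens, and requests for that minute. A small sketch of that scheme (hypothetical function name, exact date format assumed; the handler builds these keys inline):

from datetime import datetime


def request_count_cache_key(entity_id: str) -> str:
    """Per-minute counter key, e.g. "team123::2024-03-28-21-19::request_count".

    The cached value is a dict like:
    {"current_requests": 1, "current_tpm": 1, "current_rpm": 10}
    """
    now = datetime.now()
    precise_minute = f"{now.strftime('%Y-%m-%d')}-{now.strftime('%H')}-{now.strftime('%M')}"
    return f"{entity_id}::{precise_minute}::request_count"


# one counter dict per scope, each checked by the same limit-check helper above
api_key_counter = request_count_cache_key("sk-...")  # per API key
user_counter = request_count_cache_key("user123")    # per user_id
team_counter = request_count_cache_key("team123")    # per team_id
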
@@ -205,9 +215,6 @@ class _PROXY_MaxParallelRequestsHandler(CustomLogger):
                 "user_api_key_team_id", None
             )
-            if user_api_key is None:
-                return
             if self.user_api_key_cache is None:
                 return
@@ -225,18 +232,23 @@ class _PROXY_MaxParallelRequestsHandler(CustomLogger):
             if isinstance(response_obj, ModelResponse):
                 total_tokens = response_obj.usage.total_tokens
-            request_count_api_key = f"{user_api_key}::{precise_minute}::request_count"
-            current = self.user_api_key_cache.get_cache(key=request_count_api_key) or {
-                "current_requests": 1,
-                "current_tpm": total_tokens,
-                "current_rpm": 1,
-            }
-            # ------------
-            # Update usage - API Key
-            # ------------
+            # ------------
+            # Update usage - API Key
+            # ------------
+            if user_api_key is not None:
+                request_count_api_key = (
+                    f"{user_api_key}::{precise_minute}::request_count"
+                )
+                current = self.user_api_key_cache.get_cache(
+                    key=request_count_api_key
+                ) or {
+                    "current_requests": 1,
+                    "current_tpm": total_tokens,
+                    "current_rpm": 1,
+                }
                 new_val = {
                     "current_requests": max(current["current_requests"] - 1, 0),
                     "current_tpm": current["current_tpm"] + total_tokens,
@@ -287,9 +299,7 @@ class _PROXY_MaxParallelRequestsHandler(CustomLogger):
             # ------------
             # Update usage - Team
             # ------------
-            if user_api_key_team_id is None:
-                return
+            if user_api_key_team_id is not None:
                 total_tokens = 0
                 if isinstance(response_obj, ModelResponse):
@@ -299,7 +309,9 @@ class _PROXY_MaxParallelRequestsHandler(CustomLogger):
                     f"{user_api_key_team_id}::{precise_minute}::request_count"
                 )
-            current = self.user_api_key_cache.get_cache(key=request_count_api_key) or {
+                current = self.user_api_key_cache.get_cache(
+                    key=request_count_api_key
+                ) or {
                     "current_requests": 1,
                     "current_tpm": total_tokens,
                     "current_rpm": 1,

@@ -361,6 +361,7 @@ async def user_api_key_auth(
             valid_token = await jwt_handler.auth_jwt(token=api_key)
             # get scopes
             scopes = jwt_handler.get_scopes(token=valid_token)
+            # check if admin
             is_admin = jwt_handler.is_admin(scopes=scopes)
             # if admin return
@@ -453,9 +454,9 @@ async def user_api_key_auth(
                 return UserAPIKeyAuth(
                     api_key=None,
                     team_id=team_object.team_id,
-                    tpm_limit=team_object.tpm_limit,
-                    rpm_limit=team_object.rpm_limit,
-                    models=team_object.models,
+                    team_tpm_limit=team_object.tpm_limit,
+                    team_rpm_limit=team_object.rpm_limit,
+                    team_models=team_object.models,
                     user_role="app_owner",
                 )
             #### ELSE ####
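
This rename is what ties JWT team auth back to the rate limiter above: the limiter reads the team limits off the auth object via getattr(user_api_key_dict, "team_tpm_limit", ...) and getattr(user_api_key_dict, "team_rpm_limit", ...), so a team resolved from a JWT has to carry them in the team_* fields rather than in the key-scoped tpm_limit/rpm_limit. A sketch of that round trip (values illustrative, field names taken from this diff):

import sys

from litellm.proxy._types import UserAPIKeyAuth

# what user_api_key_auth now returns for a team-scoped JWT
team_auth = UserAPIKeyAuth(
    api_key=None,
    team_id="team-123",
    team_tpm_limit=100,
    team_rpm_limit=99,
    team_models=["gpt-3.5-turbo", "gpt-4"],
    user_role="app_owner",
)

# how the parallel request limiter picks the team limits back up
team_tpm_limit = getattr(team_auth, "team_tpm_limit", sys.maxsize)
if team_tpm_limit is None:
    team_tpm_limit = sys.maxsize
team_rpm_limit = getattr(team_auth, "team_rpm_limit", sys.maxsize)
if team_rpm_limit is None:
    team_rpm_limit = sys.maxsize
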
@@ -5759,7 +5760,7 @@ async def new_team(
                 },
             )
-    if data.models is not None:
+    if data.models is not None and len(user_api_key_dict.models) > 0:
         for m in data.models:
             if m not in user_api_key_dict.models:
                 raise HTTPException(
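
Before this change, new_team rejected any requested model that was not in the caller's own model list, which also blocked callers whose keys carry no model restriction at all (an empty list). The extra len(...) > 0 guard skips the subset check for unrestricted callers, which is what lets the JWT proxy-admin in the test below create a team with explicit models. Roughly (hypothetical variable names, status code and message illustrative):

from fastapi import HTTPException

requested_models = ["gpt-3.5-turbo", "gpt-4"]  # data.models
caller_models: list = []  # user_api_key_dict.models; empty list == unrestricted

if requested_models is not None and len(caller_models) > 0:
    for m in requested_models:
        if m not in caller_models:
            # the proxy's actual status code / message may differ
            raise HTTPException(status_code=400, detail=f"Model {m} not allowed")
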

@@ -177,3 +177,171 @@ async def test_valid_invalid_token():
         response = await jwt_handler.auth_jwt(token=token)
     except Exception as e:
         pytest.fail(f"An exception occurred - {str(e)}")
+
+
+@pytest.fixture
+def prisma_client():
+    import litellm
+    from litellm.proxy.utils import PrismaClient, ProxyLogging
+    from litellm.proxy.proxy_cli import append_query_params
+
+    proxy_logging_obj = ProxyLogging(user_api_key_cache=DualCache())
+
+    ### add connection pool + pool timeout args
+    params = {"connection_limit": 100, "pool_timeout": 60}
+    database_url = os.getenv("DATABASE_URL")
+    modified_url = append_query_params(database_url, params)
+    os.environ["DATABASE_URL"] = modified_url
+
+    # Assuming DBClient is a class that needs to be instantiated
+    prisma_client = PrismaClient(
+        database_url=os.environ["DATABASE_URL"], proxy_logging_obj=proxy_logging_obj
+    )
+
+    return prisma_client
+
+
+@pytest.mark.asyncio
+async def test_team_token_output(prisma_client):
+    import jwt, json
+    from cryptography.hazmat.primitives import serialization
+    from cryptography.hazmat.primitives.asymmetric import rsa
+    from cryptography.hazmat.backends import default_backend
+    from fastapi import Request
+    from starlette.datastructures import URL
+    from litellm.proxy.proxy_server import user_api_key_auth, new_team
+    from litellm.proxy._types import NewTeamRequest, UserAPIKeyAuth
+    import litellm
+    import uuid
+
+    setattr(litellm.proxy.proxy_server, "prisma_client", prisma_client)
+    await litellm.proxy.proxy_server.prisma_client.connect()
+
+    # Generate a private / public key pair using RSA algorithm
+    key = rsa.generate_private_key(
+        public_exponent=65537, key_size=2048, backend=default_backend()
+    )
+
+    # Get private key in PEM format
+    private_key = key.private_bytes(
+        encoding=serialization.Encoding.PEM,
+        format=serialization.PrivateFormat.PKCS8,
+        encryption_algorithm=serialization.NoEncryption(),
+    )
+
+    # Get public key in PEM format
+    public_key = key.public_key().public_bytes(
+        encoding=serialization.Encoding.PEM,
+        format=serialization.PublicFormat.SubjectPublicKeyInfo,
+    )
+
+    public_key_obj = serialization.load_pem_public_key(
+        public_key, backend=default_backend()
+    )
+
+    # Convert RSA public key object to JWK (JSON Web Key)
+    public_jwk = json.loads(jwt.algorithms.RSAAlgorithm.to_jwk(public_key_obj))
+
+    assert isinstance(public_jwk, dict)
+
+    # set cache
+    cache = DualCache()
+    await cache.async_set_cache(key="litellm_jwt_auth_keys", value=[public_jwk])
+
+    jwt_handler = JWTHandler()
+    jwt_handler.user_api_key_cache = cache
+    jwt_handler.litellm_jwtauth = LiteLLM_JWTAuth()
+
+    # VALID TOKEN
+    ## GENERATE A TOKEN
+    # Assuming the current time is in UTC
+    expiration_time = int((datetime.utcnow() + timedelta(minutes=10)).timestamp())
+
+    team_id = f"team123_{uuid.uuid4()}"
+    payload = {
+        "sub": "user123",
+        "exp": expiration_time,  # set the token to expire in 10 minutes
+        "scope": "litellm_team",
+        "client_id": team_id,
+    }
+
+    # Generate the JWT token
+    # But before, you should convert bytes to string
+    private_key_str = private_key.decode("utf-8")
+
+    ## team token
+    token = jwt.encode(payload, private_key_str, algorithm="RS256")
+
+    ## admin token
+    payload = {
+        "sub": "user123",
+        "exp": expiration_time,  # set the token to expire in 10 minutes
+        "scope": "litellm_proxy_admin",
+    }
+    admin_token = jwt.encode(payload, private_key_str, algorithm="RS256")
+
+    ## VERIFY IT WORKS
+    # verify token
+    response = await jwt_handler.auth_jwt(token=token)
+
+    ## RUN IT THROUGH USER API KEY AUTH
+    """
+    - 1. Initial call should fail -> team doesn't exist
+    - 2. Create team via admin token
+    - 3. 2nd call w/ same team -> call should succeed -> assert UserAPIKeyAuth object correctly formatted
+    """
+    bearer_token = "Bearer " + token
+
+    request = Request(scope={"type": "http"})
+    request._url = URL(url="/chat/completions")
+
+    ## 1. INITIAL TEAM CALL - should fail
+    # use generated key to auth in
+    setattr(litellm.proxy.proxy_server, "general_settings", {"enable_jwt_auth": True})
+    setattr(litellm.proxy.proxy_server, "jwt_handler", jwt_handler)
+    try:
+        result = await user_api_key_auth(request=request, api_key=bearer_token)
+        pytest.fail("Team doesn't exist. This should fail")
+    except Exception as e:
+        pass
+
+    ## 2. CREATE TEAM W/ ADMIN TOKEN - should succeed
+    try:
+        bearer_token = "Bearer " + admin_token
+
+        request._url = URL(url="/team/new")
+
+        result = await user_api_key_auth(request=request, api_key=bearer_token)
+        await new_team(
+            data=NewTeamRequest(
+                team_id=team_id,
+                tpm_limit=100,
+                rpm_limit=99,
+                models=["gpt-3.5-turbo", "gpt-4"],
+            ),
+            user_api_key_dict=result,
+        )
+    except Exception as e:
+        pytest.fail(f"This should not fail - {str(e)}")
+
+    ## 3. 2nd CALL W/ TEAM TOKEN - should succeed
+    bearer_token = "Bearer " + token
+    request._url = URL(url="/chat/completions")
+    try:
+        team_result: UserAPIKeyAuth = await user_api_key_auth(
+            request=request, api_key=bearer_token
+        )
+    except Exception as e:
+        pytest.fail(f"Team exists. This should not fail - {e}")
+
+    ## 4. ASSERT USER_API_KEY_AUTH format (used for tpm/rpm limiting in parallel_request_limiter.py)
+    assert team_result.team_tpm_limit == 100
+    assert team_result.team_rpm_limit == 99
+    assert team_result.team_models == ["gpt-3.5-turbo", "gpt-4"]