mirror of
https://github.com/meta-llama/llama-stack.git
synced 2025-12-27 17:11:59 +00:00
Unrestricted API usage can lead to runaway costs and fragmented client-side
throttling logic. This commit introduces a built-in quota mechanism at the
server level, enabling operators to centrally enforce per-client and anonymous
rate limits—without needing external proxies or client changes.
This helps contain compute costs, enforces fair usage, and simplifies deployment
and monitoring of Llama Stack services. Quotas are fully opt-in and have no
effect unless explicitly configured.
Currently, SQLite is the only supported KV store. If quotas are
configured but authentication is disabled, authenticated limits will
gracefully fall back to anonymous limits.
Highlights:
- Adds `QuotaMiddleware` to enforce request quotas:
  - Uses bearer token as client ID if present; otherwise falls back to IP address
  - Tracks requests in KV store with per-key TTL expiration
  - Returns HTTP 429 if a client exceeds their quota
- Extends `ServerConfig` with a `quota` section:
  - `kvstore`: configuration for the backend (currently only SQLite)
  - `anonymous_max_requests`: per-period cap for unauthenticated clients
  - `authenticated_max_requests`: per-period cap for authenticated clients
  - `period`: duration of the quota window (currently only `day` is supported)
- Adds full test coverage with FastAPI `TestClient` and custom middleware injection
Behavior changes:
- Quotas are disabled by default unless explicitly configured
- Anonymous users get a conservative default quota; authenticated clients can be given more generous limits
To enable per-client request quotas in `run.yaml`, add:
```yaml
server:
  port: 8321
  auth:
    provider_type: custom
    config:
      endpoint: https://auth.example.com/validate
  quota:
    kvstore:
      type: sqlite
      db_path: ./quotas.db
    anonymous_max_requests: 100
    authenticated_max_requests: 1000
    period: day
```
Signed-off-by: Wen Liang <wenliang@redhat.com>
127 lines
3.7 KiB
Python
127 lines
3.7 KiB
Python
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
|
# All rights reserved.
|
|
#
|
|
# This source code is licensed under the terms described in the LICENSE file in
|
|
# the root directory of this source tree.
|
|
|
|
import pytest
|
|
from fastapi import FastAPI, Request
|
|
from fastapi.testclient import TestClient
|
|
from starlette.middleware.base import BaseHTTPMiddleware
|
|
|
|
from llama_stack.distribution.datatypes import QuotaConfig, QuotaPeriod
|
|
from llama_stack.distribution.server.quota import QuotaMiddleware
|
|
from llama_stack.providers.utils.kvstore.config import SqliteKVStoreConfig
|
|
|
|
|
|
class InjectClientIDMiddleware(BaseHTTPMiddleware):
    """
    Test stand-in for AuthenticationMiddleware.

    Writes a fixed 'authenticated_client_id' into the scope of every request
    before handing the request to the wrapped application.
    """

    def __init__(self, app, client_id="client1"):
        super().__init__(app)
        # Identifier that dispatch() stamps onto each request's scope.
        self.client_id = client_id

    async def dispatch(self, request: Request, call_next):
        """Tag the request scope with the configured client ID, then forward it."""
        scope = request.scope
        scope["authenticated_client_id"] = self.client_id
        response = await call_next(request)
        return response
|
|
|
|
|
|
def build_quota_config(db_path) -> QuotaConfig:
    """
    Build the QuotaConfig used by the tests in this module.

    The config points a SQLite KV store at ``db_path`` (converted to ``str``),
    uses a daily period, and allows 1 anonymous and 2 authenticated requests.
    """
    sqlite_store = SqliteKVStoreConfig(db_path=str(db_path))
    config = QuotaConfig(
        kvstore=sqlite_store,
        period=QuotaPeriod.DAY,
        authenticated_max_requests=2,
        anonymous_max_requests=1,
    )
    return config
|
|
|
|
|
|
@pytest.fixture
def auth_app(tmp_path, request):
    """
    Provide an ASGI app for the authenticated-quota tests.

    A FastAPI app exposing GET /test is wrapped in QuotaMiddleware, which is in
    turn wrapped in InjectClientIDMiddleware. The SQLite file and the injected
    client ID are both named after the requesting test.
    """
    test_name = request.node.name
    inner_app = FastAPI()

    @inner_app.get("/test")
    async def test_endpoint():
        return {"message": "ok"}

    quota = build_quota_config(tmp_path / f"quota_{test_name}.db")

    quota_layer = QuotaMiddleware(
        inner_app,
        kv_config=quota.kvstore,
        anonymous_max_requests=quota.anonymous_max_requests,
        authenticated_max_requests=quota.authenticated_max_requests,
        window_seconds=86400,
    )
    # Outermost layer: sets the client ID before the quota layer sees the request.
    return InjectClientIDMiddleware(quota_layer, client_id=f"client_{test_name}")
|
|
|
|
|
|
def test_authenticated_quota_allows_up_to_limit(auth_app):
    """Two consecutive requests from the injected client must both return 200."""
    client = TestClient(auth_app)
    for _ in range(2):
        response = client.get("/test")
        assert response.status_code == 200
|
|
|
|
|
|
def test_authenticated_quota_blocks_after_limit(auth_app):
    """
    The third request from the injected client is rejected with HTTP 429.

    The first two responses are checked as well, so the test only passes when
    the rejection happens on the third request and not earlier.
    """
    client = TestClient(auth_app)

    # Use up the quota; each of these requests must still be served.
    for _ in range(2):
        assert client.get("/test").status_code == 200

    resp = client.get("/test")
    assert resp.status_code == 429
    assert resp.json()["error"]["message"] == "Quota exceeded"
|
|
|
|
|
|
def test_anonymous_quota_allows_up_to_limit(tmp_path, request):
    """A single request through QuotaMiddleware alone must return 200."""
    inner_app = FastAPI()

    @inner_app.get("/test")
    async def test_endpoint():
        return {"message": "ok"}

    quota = build_quota_config(tmp_path / f"quota_anon_{request.node.name}.db")

    # The app is not wrapped in InjectClientIDMiddleware, so nothing in this
    # test sets 'authenticated_client_id' on the request scope.
    limited_app = QuotaMiddleware(
        inner_app,
        kv_config=quota.kvstore,
        anonymous_max_requests=quota.anonymous_max_requests,
        authenticated_max_requests=quota.authenticated_max_requests,
        window_seconds=86400,
    )

    response = TestClient(limited_app).get("/test")
    assert response.status_code == 200
|
|
|
|
|
|
def test_anonymous_quota_blocks_after_limit(tmp_path, request):
    """
    The second request through QuotaMiddleware alone is rejected with HTTP 429.

    The first response is checked as well, so the test only passes when the
    rejection is caused by the quota being used up, not by an earlier failure.
    """
    inner_app = FastAPI()

    @inner_app.get("/test")
    async def test_endpoint():
        return {"message": "ok"}

    db_path = tmp_path / f"quota_anon_{request.node.name}.db"
    quota = build_quota_config(db_path)

    # The app is not wrapped in InjectClientIDMiddleware, so nothing in this
    # test sets 'authenticated_client_id' on the request scope.
    app = QuotaMiddleware(
        inner_app,
        kv_config=quota.kvstore,
        anonymous_max_requests=quota.anonymous_max_requests,
        authenticated_max_requests=quota.authenticated_max_requests,
        window_seconds=86400,
    )

    client = TestClient(app)

    # The first request uses up the quota and must still be served.
    assert client.get("/test").status_code == 200

    resp = client.get("/test")
    assert resp.status_code == 429
    assert resp.json()["error"]["message"] == "Quota exceeded"
|