Merge branch 'main' into litellm_add_bedrock_guardrails

Commit 70f9e41ed9 by Ishaan Jaff, 2024-08-22 17:28:49 -07:00 (committed via GitHub)
14 changed files with 173 additions and 22 deletions


@ -0,0 +1,3 @@
Efficient, consistent and secure library for querying structured data with natural language. Query any database with over 100 LLMs ❤️ 🚅.
🔗 [GitHub](https://github.com/deepsense-ai/db-ally)


@ -727,6 +727,7 @@ general_settings:
"completion_model": "string",
"disable_spend_logs": "boolean", # turn off writing each transaction to the db
"disable_master_key_return": "boolean", # turn off returning master key on UI (checked on '/user/info' endpoint)
"disable_retry_on_max_parallel_request_limit_error": "boolean", # turn off retries when max parallel request limit is reached
"disable_reset_budget": "boolean", # turn off reset budget scheduled task
"disable_adding_master_key_hash_to_db": "boolean", # turn off storing master key hash in db, for spend tracking
"enable_jwt_auth": "boolean", # allow proxy admin to auth in via jwt tokens with 'litellm_proxy_admin' in claims
@ -751,7 +752,8 @@ general_settings:
},
"otel": true,
"custom_auth": "string",
"max_parallel_requests": 0,
"max_parallel_requests": 0, # the max parallel requests allowed per deployment
"global_max_parallel_requests": 0, # the max parallel requests allowed on the proxy all up
"infer_model_from_keys": true,
"background_health_checks": true,
"health_check_interval": 300,


@ -68,6 +68,15 @@ http://localhost:4000/metrics
| `litellm_total_tokens` | input + output tokens per `"user", "key", "model", "team", "end-user"` |
| `litellm_llm_api_failed_requests_metric` | Number of failed LLM API requests per `"user", "key", "model", "team", "end-user"` |
### Request Latency Metrics
| Metric Name | Description |
|----------------------|--------------------------------------|
| `litellm_request_total_latency_metric` | Total latency (seconds) for a request to LiteLLM Proxy Server - tracked for labels `litellm_call_id`, `model` |
| `litellm_llm_api_latency_metric` | Latency (seconds) for just the LLM API call - tracked for labels `litellm_call_id`, `model` |
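
As a rough illustration of how these histograms are shaped, the sketch below defines and observes a metric with the same name and labels using `prometheus_client`; the model name, call id, and latency value are made up.

```python
from prometheus_client import Histogram

# Same metric name and labels as the table above; the values are made up.
litellm_request_total_latency_metric = Histogram(
    "litellm_request_total_latency_metric",
    "Total latency (seconds) for a request to LiteLLM Proxy Server",
    labelnames=["model", "litellm_call_id"],
)

# Record one request that took 1.42 seconds end-to-end.
litellm_request_total_latency_metric.labels(
    "gpt-3.5-turbo", "example-call-id"
).observe(1.42)
```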
### LLM API / Provider Metrics
| Metric Name | Description |


@ -292,6 +292,7 @@ const sidebars = {
items: [
"projects/Docq.AI",
"projects/OpenInterpreter",
"projects/dbally",
"projects/FastREPL",
"projects/PROMPTMETHEUS",
"projects/Codium PR Agent",


@ -7,13 +7,17 @@
#
# Thank you users! We ❤️ you! - Krrish & Ishaan
import inspect

# s/o [@Frank Colson](https://www.linkedin.com/in/frank-colson-422b9b183/) for this redis implementation
import os
from typing import List, Optional

import redis  # type: ignore
import redis.asyncio as async_redis  # type: ignore

import litellm
def _get_redis_kwargs():
    arg_spec = inspect.getfullargspec(redis.Redis)
@ -51,6 +55,19 @@ def _get_redis_url_kwargs(client=None):
    return available_args
def _get_redis_cluster_kwargs(client=None):
    if client is None:
        client = redis.Redis.from_url

    arg_spec = inspect.getfullargspec(redis.RedisCluster)

    # Only allow primitive arguments
    exclude_args = {"self", "connection_pool", "retry", "host", "port", "startup_nodes"}
    available_args = [x for x in arg_spec.args if x not in exclude_args]

    return available_args
def _get_redis_env_kwarg_mapping():
PREFIX = "REDIS_"
@ -124,6 +141,22 @@ def get_redis_client(**env_overrides):
                url_kwargs[arg] = redis_kwargs[arg]

        return redis.Redis.from_url(**url_kwargs)

    if "startup_nodes" in redis_kwargs:
        from redis.cluster import ClusterNode

        args = _get_redis_cluster_kwargs()
        cluster_kwargs = {}
        for arg in redis_kwargs:
            if arg in args:
                cluster_kwargs[arg] = redis_kwargs[arg]

        new_startup_nodes: List[ClusterNode] = []
        for item in redis_kwargs["startup_nodes"]:
            new_startup_nodes.append(ClusterNode(**item))

        redis_kwargs.pop("startup_nodes")
        return redis.RedisCluster(startup_nodes=new_startup_nodes, **cluster_kwargs)

    return redis.Redis(**redis_kwargs)
@ -143,6 +176,24 @@ def get_redis_async_client(**env_overrides):
                )

        return async_redis.Redis.from_url(**url_kwargs)

    if "startup_nodes" in redis_kwargs:
        from redis.cluster import ClusterNode

        args = _get_redis_cluster_kwargs()
        cluster_kwargs = {}
        for arg in redis_kwargs:
            if arg in args:
                cluster_kwargs[arg] = redis_kwargs[arg]

        new_startup_nodes: List[ClusterNode] = []
        for item in redis_kwargs["startup_nodes"]:
            new_startup_nodes.append(ClusterNode(**item))

        redis_kwargs.pop("startup_nodes")
        return async_redis.RedisCluster(
            startup_nodes=new_startup_nodes, **cluster_kwargs
        )

    return async_redis.Redis(
        socket_timeout=5,
        **redis_kwargs,
@ -160,4 +211,5 @@ def get_redis_connection_pool(**env_overrides):
        connection_class = async_redis.SSLConnection
        redis_kwargs.pop("ssl", None)
        redis_kwargs["connection_class"] = connection_class
    redis_kwargs.pop("startup_nodes", None)
    return async_redis.BlockingConnectionPool(timeout=5, **redis_kwargs)
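
A hedged usage sketch of the cluster branch added above: when `startup_nodes` is present in the resolved Redis kwargs, client creation should go through `redis.RedisCluster` with `ClusterNode` objects instead of a plain `redis.Redis`. The node addresses are placeholders.

```python
# Placeholder node addresses for a locally running Redis Cluster.
startup_nodes = [
    {"host": "127.0.0.1", "port": "7001"},
    {"host": "127.0.0.1", "port": "7002"},
]

# With the change above, this should return a redis.RedisCluster client.
client = get_redis_client(startup_nodes=startup_nodes)
```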


@ -203,6 +203,7 @@ class RedisCache(BaseCache):
        password=None,
        redis_flush_size=100,
        namespace: Optional[str] = None,
        startup_nodes: Optional[List] = None,  # for redis-cluster
        **kwargs,
    ):
        import redis
@ -218,7 +219,8 @@ class RedisCache(BaseCache):
redis_kwargs["port"] = port
if password is not None:
redis_kwargs["password"] = password
if startup_nodes is not None:
redis_kwargs["startup_nodes"] = startup_nodes
### HEALTH MONITORING OBJECT ###
if kwargs.get("service_logger_obj", None) is not None and isinstance(
kwargs["service_logger_obj"], ServiceLogging
@ -246,7 +248,7 @@ class RedisCache(BaseCache):
        ### ASYNC HEALTH PING ###
        try:
            # asyncio.get_running_loop().create_task(self.ping())
            _ = asyncio.get_running_loop().create_task(self.ping())
        except Exception as e:
            if "no running event loop" in str(e):
                verbose_logger.debug(
@ -2123,6 +2125,7 @@ class Cache:
        redis_semantic_cache_use_async=False,
        redis_semantic_cache_embedding_model="text-embedding-ada-002",
        redis_flush_size=None,
        redis_startup_nodes: Optional[List] = None,
        disk_cache_dir=None,
        qdrant_api_base: Optional[str] = None,
        qdrant_api_key: Optional[str] = None,
@ -2155,7 +2158,12 @@ class Cache:
"""
if type == "redis":
self.cache: BaseCache = RedisCache(
host, port, password, redis_flush_size, **kwargs
host,
port,
password,
redis_flush_size,
startup_nodes=redis_startup_nodes,
**kwargs,
)
elif type == "redis-semantic":
self.cache = RedisSemanticCache(
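
For context, a brief sketch of how the new `redis_startup_nodes` parameter might be used when enabling caching; it is forwarded to `RedisCache` as `startup_nodes`, and the node address is a placeholder. This mirrors the cluster unit test added at the bottom of this commit.

```python
import litellm

# Enable caching against a Redis Cluster; startup_nodes is forwarded to RedisCache.
litellm.cache = litellm.Cache(
    type="redis",
    redis_startup_nodes=[{"host": "127.0.0.1", "port": "7001"}],
)
```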


@ -60,6 +60,25 @@ class PrometheusLogger(CustomLogger):
            ],
        )

        # request latency metrics
        self.litellm_request_total_latency_metric = Histogram(
            "litellm_request_total_latency_metric",
            "Total latency (seconds) for a request to LiteLLM",
            labelnames=[
                "model",
                "litellm_call_id",
            ],
        )

        self.litellm_llm_api_latency_metric = Histogram(
            "litellm_llm_api_latency_metric",
            "Total latency (seconds) for a model's LLM API call",
            labelnames=[
                "model",
                "litellm_call_id",
            ],
        )

        # Counter for spend
        self.litellm_spend_metric = Counter(
            "litellm_spend_metric",
@ -329,6 +348,25 @@ class PrometheusLogger(CustomLogger):
            user_api_key, user_api_key_alias, model_group
        ).set(remaining_tokens)

        # latency metrics
        total_time: timedelta = kwargs.get("end_time") - kwargs.get("start_time")
        total_time_seconds = total_time.total_seconds()
        api_call_total_time: timedelta = kwargs.get("end_time") - kwargs.get(
            "api_call_start_time"
        )
        api_call_total_time_seconds = api_call_total_time.total_seconds()

        litellm_call_id = kwargs.get("litellm_call_id")

        self.litellm_request_total_latency_metric.labels(
            model, litellm_call_id
        ).observe(total_time_seconds)

        self.litellm_llm_api_latency_metric.labels(model, litellm_call_id).observe(
            api_call_total_time_seconds
        )

        # set x-ratelimit headers
        if premium_user is True:
            self.set_llm_deployment_success_metrics(
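
The latency values observed above come from subtracting timestamps carried in the logging kwargs; a small worked example with made-up timestamps shows what each histogram ends up recording.

```python
import datetime

# Made-up timestamps standing in for the values found in `kwargs`.
start_time = datetime.datetime(2024, 8, 22, 10, 0, 0)           # request received
api_call_start_time = datetime.datetime(2024, 8, 22, 10, 0, 1)  # LLM API call begins
end_time = datetime.datetime(2024, 8, 22, 10, 0, 3)             # response returned

total_time_seconds = (end_time - start_time).total_seconds()                    # 3.0
api_call_total_time_seconds = (end_time - api_call_start_time).total_seconds()  # 2.0
```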


@ -354,6 +354,8 @@ class Logging:
                            str(e)
                        )
                    )
        self.model_call_details["api_call_start_time"] = datetime.datetime.now()

        # Input Integration Logging -> If you want to log the fact that an attempt to call the model was made
        callbacks = litellm.input_callback + self.dynamic_input_callbacks
        for callback in callbacks:


@ -2,12 +2,3 @@ model_list:
- model_name: "*"
litellm_params:
model: "*"
litellm_settings:
success_callback: ["s3"]
cache: true
s3_callback_params:
s3_bucket_name: mytestbucketlitellm # AWS Bucket Name for S3
s3_region_name: us-west-2 # AWS Region Name for S3
s3_aws_access_key_id: os.environ/AWS_ACCESS_KEY_ID # us os.environ/<variable name> to pass environment variables. This is AWS Access Key ID for S3
s3_aws_secret_access_key: os.environ/AWS_SECRET_ACCESS_KEY # AWS Secret Access Key for S3


@ -66,7 +66,7 @@ def common_checks(
        raise Exception(
            f"Team={team_object.team_id} is blocked. Update via `/team/unblock` if your admin."
        )
    # 2. If team can call model
    if (
        _model is not None
        and team_object is not None
@ -74,7 +74,11 @@ def common_checks(
        and _model not in team_object.models
    ):
        # this means the team has access to all models on the proxy
        if (
            "all-proxy-models" in team_object.models
            or "*" in team_object.models
            or "openai/*" in team_object.models
        ):
            # this means the team has access to all models on the proxy
            pass
        # check if the team model is an access_group


@ -1,8 +1,9 @@
model_list:
  - model_name: fake-openai-endpoint
    litellm_params:
      model: openai/fake
      api_key: fake-key
      api_base: https://exampleopenaiendpoint-production.up.railway.app/

guardrails:
  - guardrail_name: "bedrock-pre-guard"


@ -1588,7 +1588,7 @@ class ProxyConfig:
                verbose_proxy_logger.debug(  # noqa
                    f"{blue_color_code}Set Cache on LiteLLM Proxy: {vars(litellm.cache.cache)}{reset_color_code}"
                )
            elif key == "cache" and value is False:
                pass
            elif key == "guardrails":
                if premium_user is not True:
@ -2672,6 +2672,13 @@ def giveup(e):
        and isinstance(e.message, str)
        and "Max parallel request limit reached" in e.message
    )

    if (
        general_settings.get("disable_retry_on_max_parallel_request_limit_error")
        is True
    ):
        return True  # giveup if queuing max parallel request limits is disabled

    if result:
        verbose_proxy_logger.info(json.dumps({"event": "giveup", "exception": str(e)}))

    return result


@ -277,7 +277,8 @@ class Router:
"local" # default to an in-memory cache
)
redis_cache = None
cache_config = {}
cache_config: Dict[str, Any] = {}
self.client_ttl = client_ttl
if redis_url is not None or (
redis_host is not None


@ -804,6 +804,38 @@ def test_redis_cache_completion_stream():
# test_redis_cache_completion_stream()


@pytest.mark.skip(reason="Local test. Requires running redis cluster locally.")
@pytest.mark.asyncio
async def test_redis_cache_cluster_init_unit_test():
    try:
        from redis.asyncio import RedisCluster as AsyncRedisCluster
        from redis.cluster import RedisCluster

        from litellm.caching import RedisCache

        litellm.set_verbose = True

        # List of startup nodes
        startup_nodes = [
            {"host": "127.0.0.1", "port": "7001"},
        ]

        resp = RedisCache(startup_nodes=startup_nodes)

        assert isinstance(resp.redis_client, RedisCluster)
        assert isinstance(resp.init_async_client(), AsyncRedisCluster)

        resp = litellm.Cache(type="redis", redis_startup_nodes=startup_nodes)

        assert isinstance(resp.cache, RedisCache)
        assert isinstance(resp.cache.redis_client, RedisCluster)
        assert isinstance(resp.cache.init_async_client(), AsyncRedisCluster)
    except Exception as e:
        print(f"{str(e)}\n\n{traceback.format_exc()}")
        raise e


@pytest.mark.asyncio
async def test_redis_cache_acompletion_stream():
    try: