forked from phoenix/litellm-mirror

Merge branch 'main' into litellm_add_bedrock_guardrails

Commit 70f9e41ed9 · 14 changed files with 173 additions and 22 deletions
docs/my-website/docs/projects/dbally.md (new file, 3 additions)
@@ -0,0 +1,3 @@
+Efficient, consistent and secure library for querying structured data with natural language. Query any database with over 100 LLMs ❤️ 🚅.
+
+🔗 [GitHub](https://github.com/deepsense-ai/db-ally)
@@ -727,6 +727,7 @@ general_settings:
 "completion_model": "string",
 "disable_spend_logs": "boolean", # turn off writing each transaction to the db
 "disable_master_key_return": "boolean", # turn off returning master key on UI (checked on '/user/info' endpoint)
+"disable_retry_on_max_parallel_request_limit_error": "boolean", # turn off retries when max parallel request limit is reached
 "disable_reset_budget": "boolean", # turn off reset budget scheduled task
 "disable_adding_master_key_hash_to_db": "boolean", # turn off storing master key hash in db, for spend tracking
 "enable_jwt_auth": "boolean", # allow proxy admin to auth in via jwt tokens with 'litellm_proxy_admin' in claims
@@ -751,7 +752,8 @@ general_settings:
 },
 "otel": true,
 "custom_auth": "string",
-"max_parallel_requests": 0,
+"max_parallel_requests": 0, # the max parallel requests allowed per deployment
+"global_max_parallel_requests": 0, # the max parallel requests allowed on the proxy all up
 "infer_model_from_keys": true,
 "background_health_checks": true,
 "health_check_interval": 300,
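The two settings above distinguish a per-deployment ceiling from a proxy-wide one. Purely as an illustration of that idea (not LiteLLM's implementation), a two-level limiter could be built from asyncio semaphores; `GLOBAL_MAX`, `PER_DEPLOYMENT_MAX`, and `call_deployment` are hypothetical names:

```python
# Illustration only (not LiteLLM's implementation): a proxy-wide ceiling
# plus a per-deployment ceiling, enforced with asyncio semaphores.
# GLOBAL_MAX / PER_DEPLOYMENT_MAX stand in for the settings documented above.
import asyncio

GLOBAL_MAX = 100         # global_max_parallel_requests
PER_DEPLOYMENT_MAX = 10  # max_parallel_requests

global_sem = asyncio.Semaphore(GLOBAL_MAX)
deployment_sems: dict[str, asyncio.Semaphore] = {}


async def call_deployment(deployment_id: str, make_request):
    sem = deployment_sems.setdefault(
        deployment_id, asyncio.Semaphore(PER_DEPLOYMENT_MAX)
    )
    # Take a proxy-wide slot first, then a per-deployment slot.
    async with global_sem, sem:
        return await make_request()
```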
@@ -68,6 +68,15 @@ http://localhost:4000/metrics
 | `litellm_total_tokens` | input + output tokens per `"user", "key", "model", "team", "end-user"` |
 | `litellm_llm_api_failed_requests_metric` | Number of failed LLM API requests per `"user", "key", "model", "team", "end-user"` |
+
+### Request Latency Metrics
+
+| Metric Name | Description |
+|----------------------|--------------------------------------|
+| `litellm_request_total_latency_metric` | Total latency (seconds) for a request to LiteLLM Proxy Server - tracked for labels `litellm_call_id`, `model` |
+| `litellm_llm_api_latency_metric` | latency (seconds) for just the LLM API call - tracked for labels `litellm_call_id`, `model` |
+
+
 ### LLM API / Provider Metrics

 | Metric Name | Description |
@@ -292,6 +292,7 @@ const sidebars = {
       items: [
         "projects/Docq.AI",
         "projects/OpenInterpreter",
+        "projects/dbally",
         "projects/FastREPL",
         "projects/PROMPTMETHEUS",
         "projects/Codium PR Agent",
@@ -7,13 +7,17 @@
 #
 #  Thank you users! We ❤️ you! - Krrish & Ishaan

+import inspect
+
 # s/o [@Frank Colson](https://www.linkedin.com/in/frank-colson-422b9b183/) for this redis implementation
 import os
-import inspect
-import redis, litellm  # type: ignore
-import redis.asyncio as async_redis  # type: ignore
 from typing import List, Optional
+
+import redis  # type: ignore
+import redis.asyncio as async_redis  # type: ignore
+
+import litellm


 def _get_redis_kwargs():
     arg_spec = inspect.getfullargspec(redis.Redis)
@@ -51,6 +55,19 @@ def _get_redis_url_kwargs(client=None):
     return available_args


+def _get_redis_cluster_kwargs(client=None):
+    if client is None:
+        client = redis.Redis.from_url
+    arg_spec = inspect.getfullargspec(redis.RedisCluster)
+
+    # Only allow primitive arguments
+    exclude_args = {"self", "connection_pool", "retry", "host", "port", "startup_nodes"}
+
+    available_args = [x for x in arg_spec.args if x not in exclude_args]
+
+    return available_args
+
+
 def _get_redis_env_kwarg_mapping():
     PREFIX = "REDIS_"
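The new `_get_redis_cluster_kwargs` helper above whitelists constructor arguments by introspecting `redis.RedisCluster`. A minimal, self-contained sketch of that technique (assuming redis-py with cluster support is installed; `filter_cluster_kwargs` is a made-up name):

```python
# Minimal sketch of the introspection used above: read RedisCluster's
# constructor signature and keep only kwargs it explicitly accepts.
import inspect

import redis


def filter_cluster_kwargs(raw_kwargs: dict) -> dict:
    arg_spec = inspect.getfullargspec(redis.RedisCluster)
    exclude = {"self", "connection_pool", "retry", "host", "port", "startup_nodes"}
    allowed = [a for a in arg_spec.args if a not in exclude]
    return {k: v for k, v in raw_kwargs.items() if k in allowed}


print(filter_cluster_kwargs({"made_up_option": 1}))  # {} — key not in the signature
```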
@@ -124,6 +141,22 @@ def get_redis_client(**env_overrides):
                 url_kwargs[arg] = redis_kwargs[arg]

         return redis.Redis.from_url(**url_kwargs)

+    if "startup_nodes" in redis_kwargs:
+        from redis.cluster import ClusterNode
+
+        args = _get_redis_cluster_kwargs()
+        cluster_kwargs = {}
+        for arg in redis_kwargs:
+            if arg in args:
+                cluster_kwargs[arg] = redis_kwargs[arg]
+
+        new_startup_nodes: List[ClusterNode] = []
+
+        for item in redis_kwargs["startup_nodes"]:
+            new_startup_nodes.append(ClusterNode(**item))
+        redis_kwargs.pop("startup_nodes")
+        return redis.RedisCluster(startup_nodes=new_startup_nodes, **cluster_kwargs)
+
     return redis.Redis(**redis_kwargs)
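For context, the sync branch above turns plain `{"host", "port"}` dicts into `ClusterNode` objects before handing them to `redis.RedisCluster`. A minimal usage sketch, assuming a local cluster node on 127.0.0.1:7001:

```python
# Minimal usage sketch of the sync cluster branch above,
# assuming a Redis Cluster node is reachable on 127.0.0.1:7001.
import redis
from redis.cluster import ClusterNode

startup_nodes = [{"host": "127.0.0.1", "port": 7001}]

# Plain dicts -> ClusterNode objects, as the new code does.
nodes = [ClusterNode(**node) for node in startup_nodes]

client = redis.RedisCluster(startup_nodes=nodes)
print(client.ping())  # True if the cluster is reachable
```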
@@ -143,6 +176,24 @@ def get_redis_async_client(**env_overrides):
             )
         return async_redis.Redis.from_url(**url_kwargs)

+    if "startup_nodes" in redis_kwargs:
+        from redis.cluster import ClusterNode
+
+        args = _get_redis_cluster_kwargs()
+        cluster_kwargs = {}
+        for arg in redis_kwargs:
+            if arg in args:
+                cluster_kwargs[arg] = redis_kwargs[arg]
+
+        new_startup_nodes: List[ClusterNode] = []
+
+        for item in redis_kwargs["startup_nodes"]:
+            new_startup_nodes.append(ClusterNode(**item))
+        redis_kwargs.pop("startup_nodes")
+        return async_redis.RedisCluster(
+            startup_nodes=new_startup_nodes, **cluster_kwargs
+        )
+
     return async_redis.Redis(
         socket_timeout=5,
         **redis_kwargs,
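The async path mirrors the sync one via `redis.asyncio`. A quick connectivity check, again assuming a local cluster node on 127.0.0.1:7001 (passing `host`/`port` of any one node is an alternative to `startup_nodes`):

```python
# Quick async connectivity check with redis.asyncio, assuming a local
# cluster node on 127.0.0.1:7001.
import asyncio

import redis.asyncio as async_redis


async def main():
    client = async_redis.RedisCluster(host="127.0.0.1", port=7001)
    print(await client.ping())  # True if the cluster is reachable
    await client.close()


asyncio.run(main())
```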
@@ -160,4 +211,5 @@ def get_redis_connection_pool(**env_overrides):
         connection_class = async_redis.SSLConnection
         redis_kwargs.pop("ssl", None)
         redis_kwargs["connection_class"] = connection_class
+    redis_kwargs.pop("startup_nodes", None)
     return async_redis.BlockingConnectionPool(timeout=5, **redis_kwargs)
@@ -203,6 +203,7 @@ class RedisCache(BaseCache):
         password=None,
         redis_flush_size=100,
         namespace: Optional[str] = None,
+        startup_nodes: Optional[List] = None,  # for redis-cluster
         **kwargs,
     ):
         import redis
@@ -218,7 +219,8 @@ class RedisCache(BaseCache):
             redis_kwargs["port"] = port
         if password is not None:
             redis_kwargs["password"] = password
-
+        if startup_nodes is not None:
+            redis_kwargs["startup_nodes"] = startup_nodes
         ### HEALTH MONITORING OBJECT ###
         if kwargs.get("service_logger_obj", None) is not None and isinstance(
             kwargs["service_logger_obj"], ServiceLogging
@@ -246,7 +248,7 @@ class RedisCache(BaseCache):
         ### ASYNC HEALTH PING ###
         try:
             # asyncio.get_running_loop().create_task(self.ping())
-            result = asyncio.get_running_loop().create_task(self.ping())
+            _ = asyncio.get_running_loop().create_task(self.ping())
         except Exception as e:
             if "no running event loop" in str(e):
                 verbose_logger.debug(
@@ -2123,6 +2125,7 @@ class Cache:
         redis_semantic_cache_use_async=False,
         redis_semantic_cache_embedding_model="text-embedding-ada-002",
         redis_flush_size=None,
+        redis_startup_nodes: Optional[List] = None,
         disk_cache_dir=None,
         qdrant_api_base: Optional[str] = None,
         qdrant_api_key: Optional[str] = None,
@@ -2155,7 +2158,12 @@ class Cache:
         """
         if type == "redis":
             self.cache: BaseCache = RedisCache(
-                host, port, password, redis_flush_size, **kwargs
+                host,
+                port,
+                password,
+                redis_flush_size,
+                startup_nodes=redis_startup_nodes,
+                **kwargs,
             )
         elif type == "redis-semantic":
             self.cache = RedisSemanticCache(
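With the `Cache` wiring above, cluster mode is opted into via `redis_startup_nodes`. This mirrors the unit test added further down; the host/port values are placeholders for a local cluster node:

```python
# Mirrors the unit test added below: opt the LiteLLM Redis cache into
# cluster mode via redis_startup_nodes.
import litellm

litellm.cache = litellm.Cache(
    type="redis",
    redis_startup_nodes=[{"host": "127.0.0.1", "port": "7001"}],
)
```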
@@ -60,6 +60,25 @@ class PrometheusLogger(CustomLogger):
             ],
         )

+        # request latency metrics
+        self.litellm_request_total_latency_metric = Histogram(
+            "litellm_request_total_latency_metric",
+            "Total latency (seconds) for a request to LiteLLM",
+            labelnames=[
+                "model",
+                "litellm_call_id",
+            ],
+        )
+
+        self.litellm_llm_api_latency_metric = Histogram(
+            "litellm_llm_api_latency_metric",
+            "Total latency (seconds) for a models LLM API call",
+            labelnames=[
+                "model",
+                "litellm_call_id",
+            ],
+        )
+
         # Counter for spend
         self.litellm_spend_metric = Counter(
             "litellm_spend_metric",
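The two metrics above are plain `prometheus_client` Histograms labeled by `model` and `litellm_call_id`. A standalone sketch of that pattern (the label values here are made up):

```python
# Standalone prometheus_client sketch of the pattern above: a Histogram
# labeled by model and call id, observed in seconds.
from prometheus_client import Histogram, generate_latest

request_latency = Histogram(
    "litellm_request_total_latency_metric",
    "Total latency (seconds) for a request to LiteLLM",
    labelnames=["model", "litellm_call_id"],
)

request_latency.labels("gpt-3.5-turbo", "example-call-id").observe(1.42)
print(generate_latest().decode())  # exposition-format text, as served on /metrics
```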
@@ -329,6 +348,25 @@ class PrometheusLogger(CustomLogger):
                 user_api_key, user_api_key_alias, model_group
             ).set(remaining_tokens)

+            # latency metrics
+            total_time: timedelta = kwargs.get("end_time") - kwargs.get("start_time")
+            total_time_seconds = total_time.total_seconds()
+            api_call_total_time: timedelta = kwargs.get("end_time") - kwargs.get(
+                "api_call_start_time"
+            )
+
+            api_call_total_time_seconds = api_call_total_time.total_seconds()
+
+            litellm_call_id = kwargs.get("litellm_call_id")
+
+            self.litellm_request_total_latency_metric.labels(
+                model, litellm_call_id
+            ).observe(total_time_seconds)
+
+            self.litellm_llm_api_latency_metric.labels(model, litellm_call_id).observe(
+                api_call_total_time_seconds
+            )
+
             # set x-ratelimit headers
             if premium_user is True:
                 self.set_llm_deployment_success_metrics(
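The observation code above is just datetime subtraction followed by `timedelta.total_seconds()`. In isolation, with made-up timestamps:

```python
# The latency math above, in isolation: subtract datetimes to get a timedelta,
# then convert to float seconds for the histogram. Timestamps are made up.
import datetime

start_time = datetime.datetime(2024, 8, 10, 12, 0, 0)
api_call_start_time = datetime.datetime(2024, 8, 10, 12, 0, 1)
end_time = datetime.datetime(2024, 8, 10, 12, 0, 3)

total_time_seconds = (end_time - start_time).total_seconds()                     # 3.0
api_call_total_time_seconds = (end_time - api_call_start_time).total_seconds()   # 2.0
print(total_time_seconds, api_call_total_time_seconds)
```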
@@ -354,6 +354,8 @@ class Logging:
                         str(e)
                     )
                 )
+
+        self.model_call_details["api_call_start_time"] = datetime.datetime.now()
         # Input Integration Logging -> If you want to log the fact that an attempt to call the model was made
         callbacks = litellm.input_callback + self.dynamic_input_callbacks
         for callback in callbacks:
@@ -2,12 +2,3 @@ model_list:
   - model_name: "*"
     litellm_params:
       model: "*"
-
-litellm_settings:
-  success_callback: ["s3"]
-  cache: true
-  s3_callback_params:
-    s3_bucket_name: mytestbucketlitellm # AWS Bucket Name for S3
-    s3_region_name: us-west-2 # AWS Region Name for S3
-    s3_aws_access_key_id: os.environ/AWS_ACCESS_KEY_ID # us os.environ/<variable name> to pass environment variables. This is AWS Access Key ID for S3
-    s3_aws_secret_access_key: os.environ/AWS_SECRET_ACCESS_KEY # AWS Secret Access Key for S3
@@ -66,7 +66,7 @@ def common_checks(
         raise Exception(
             f"Team={team_object.team_id} is blocked. Update via `/team/unblock` if your admin."
         )
-    # 2. If user can call model
+    # 2. If team can call model
     if (
         _model is not None
         and team_object is not None
@@ -74,7 +74,11 @@ def common_checks(
         and _model not in team_object.models
     ):
-        # this means the team has access to all models on the proxy
-        if "all-proxy-models" in team_object.models:
+        if (
+            "all-proxy-models" in team_object.models
+            or "*" in team_object.models
+            or "openai/*" in team_object.models
+        ):
+            # this means the team has access to all models on the proxy
             pass
         # check if the team model is an access_group
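The broadened condition above treats `all-proxy-models`, `*`, or `openai/*` in a team's model list as access to every model on the proxy. A hypothetical standalone helper expressing the same rule (`team_has_model_access` is not a LiteLLM API):

```python
# Hypothetical helper (not a LiteLLM API) expressing the broadened check:
# "all-proxy-models", "*", or "openai/*" in the team's model list grants
# access to every model on the proxy.
from typing import List


def team_has_model_access(model: str, team_models: List[str]) -> bool:
    if model in team_models:
        return True
    return (
        "all-proxy-models" in team_models
        or "*" in team_models
        or "openai/*" in team_models
    )


assert team_has_model_access("claude-3-haiku", ["openai/*"])
assert not team_has_model_access("claude-3-haiku", ["gpt-4o"])
```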
@@ -1,8 +1,9 @@
 model_list:
   - model_name: fake-openai-endpoint
     litellm_params:
-      model: openai/gpt-4
-      api_key: os.environ/OPENAI_API_KEY
+      model: openai/fake
+      api_key: fake-key
+      api_base: https://exampleopenaiendpoint-production.up.railway.app/

 guardrails:
   - guardrail_name: "bedrock-pre-guard"
@@ -1588,7 +1588,7 @@ class ProxyConfig:
                     verbose_proxy_logger.debug(  # noqa
                         f"{blue_color_code}Set Cache on LiteLLM Proxy: {vars(litellm.cache.cache)}{reset_color_code}"
                     )
-                elif key == "cache" and value == False:
+                elif key == "cache" and value is False:
                     pass
                 elif key == "guardrails":
                     if premium_user is not True:
@@ -2672,6 +2672,13 @@ def giveup(e):
         and isinstance(e.message, str)
         and "Max parallel request limit reached" in e.message
     )
+
+    if (
+        general_settings.get("disable_retry_on_max_parallel_request_limit_error")
+        is True
+    ):
+        return True  # giveup if queuing max parallel request limits is disabled
+
     if result:
         verbose_proxy_logger.info(json.dumps({"event": "giveup", "exception": str(e)}))
     return result
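The new early-return above makes the `giveup` predicate stop retries outright when the setting is enabled; predicates like this are typically fed to a retry decorator such as `backoff.on_exception`. An illustrative sketch of that pattern (not the proxy's actual wiring):

```python
# Illustrative wiring of a giveup predicate into backoff.on_exception
# (not the proxy's actual setup): when the new setting is on, retries
# stop immediately and the error is surfaced to the caller.
import backoff

general_settings = {"disable_retry_on_max_parallel_request_limit_error": True}


def giveup(e: Exception) -> bool:
    limit_hit = "Max parallel request limit reached" in str(e)
    disabled = (
        general_settings.get("disable_retry_on_max_parallel_request_limit_error")
        is True
    )
    return limit_hit and disabled


@backoff.on_exception(backoff.expo, Exception, max_tries=3, giveup=giveup)
def call_proxy():
    raise Exception("Max parallel request limit reached")


try:
    call_proxy()
except Exception as e:
    print(f"gave up without retrying: {e}")
```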
@@ -277,7 +277,8 @@ class Router:
                 "local"  # default to an in-memory cache
             )
             redis_cache = None
-            cache_config = {}
+            cache_config: Dict[str, Any] = {}
+
             self.client_ttl = client_ttl
             if redis_url is not None or (
                 redis_host is not None
@@ -804,6 +804,38 @@ def test_redis_cache_completion_stream():
 # test_redis_cache_completion_stream()


+@pytest.mark.skip(reason="Local test. Requires running redis cluster locally.")
+@pytest.mark.asyncio
+async def test_redis_cache_cluster_init_unit_test():
+    try:
+        from redis.asyncio import RedisCluster as AsyncRedisCluster
+        from redis.cluster import RedisCluster
+
+        from litellm.caching import RedisCache
+
+        litellm.set_verbose = True
+
+        # List of startup nodes
+        startup_nodes = [
+            {"host": "127.0.0.1", "port": "7001"},
+        ]
+
+        resp = RedisCache(startup_nodes=startup_nodes)
+
+        assert isinstance(resp.redis_client, RedisCluster)
+        assert isinstance(resp.init_async_client(), AsyncRedisCluster)
+
+        resp = litellm.Cache(type="redis", redis_startup_nodes=startup_nodes)
+
+        assert isinstance(resp.cache, RedisCache)
+        assert isinstance(resp.cache.redis_client, RedisCluster)
+        assert isinstance(resp.cache.init_async_client(), AsyncRedisCluster)
+
+    except Exception as e:
+        print(f"{str(e)}\n\n{traceback.format_exc()}")
+        raise e
+
+
 @pytest.mark.asyncio
 async def test_redis_cache_acompletion_stream():
     try: