Mirror of https://github.com/BerriAI/litellm.git (synced 2025-04-26 03:04:13 +00:00)

Merge branch 'main' into litellm_add_bedrock_guardrails
commit c23cf18a70
14 changed files with 173 additions and 22 deletions
docs/my-website/docs/projects/dbally.md (new file, 3 lines)

@@ -0,0 +1,3 @@
+ Efficient, consistent and secure library for querying structured data with natural language. Query any database with over 100 LLMs ❤️ 🚅.
+
+ 🔗 [GitHub](https://github.com/deepsense-ai/db-ally)
@@ -727,6 +727,7 @@ general_settings:
      "completion_model": "string",
      "disable_spend_logs": "boolean", # turn off writing each transaction to the db
      "disable_master_key_return": "boolean", # turn off returning master key on UI (checked on '/user/info' endpoint)
+     "disable_retry_on_max_parallel_request_limit_error": "boolean", # turn off retries when max parallel request limit is reached
      "disable_reset_budget": "boolean", # turn off reset budget scheduled task
      "disable_adding_master_key_hash_to_db": "boolean", # turn off storing master key hash in db, for spend tracking
      "enable_jwt_auth": "boolean", # allow proxy admin to auth in via jwt tokens with 'litellm_proxy_admin' in claims

@@ -751,7 +752,8 @@ general_settings:
      },
      "otel": true,
      "custom_auth": "string",
-     "max_parallel_requests": 0,
+     "max_parallel_requests": 0, # the max parallel requests allowed per deployment
+     "global_max_parallel_requests": 0, # the max parallel requests allowed on the proxy all up
      "infer_model_from_keys": true,
      "background_health_checks": true,
      "health_check_interval": 300,
@@ -68,6 +68,15 @@ http://localhost:4000/metrics
  | `litellm_total_tokens` | input + output tokens per `"user", "key", "model", "team", "end-user"` |
  | `litellm_llm_api_failed_requests_metric` | Number of failed LLM API requests per `"user", "key", "model", "team", "end-user"` |

+ ### Request Latency Metrics
+
+ | Metric Name | Description |
+ |----------------------|--------------------------------------|
+ | `litellm_request_total_latency_metric` | Total latency (seconds) for a request to LiteLLM Proxy Server - tracked for labels `litellm_call_id`, `model` |
+ | `litellm_llm_api_latency_metric` | latency (seconds) for just the LLM API call - tracked for labels `litellm_call_id`, `model` |
+
  ### LLM API / Provider Metrics

  | Metric Name | Description |
@@ -292,6 +292,7 @@ const sidebars = {
        items: [
          "projects/Docq.AI",
          "projects/OpenInterpreter",
+         "projects/dbally",
          "projects/FastREPL",
          "projects/PROMPTMETHEUS",
          "projects/Codium PR Agent",
@@ -7,13 +7,17 @@
  #
  # Thank you users! We ❤️ you! - Krrish & Ishaan

+ import inspect
+
  # s/o [@Frank Colson](https://www.linkedin.com/in/frank-colson-422b9b183/) for this redis implementation
  import os
- import inspect
- import redis, litellm # type: ignore
- import redis.asyncio as async_redis # type: ignore
  from typing import List, Optional

+ import redis # type: ignore
+ import redis.asyncio as async_redis # type: ignore
+
+ import litellm


  def _get_redis_kwargs():
      arg_spec = inspect.getfullargspec(redis.Redis)
@@ -51,6 +55,19 @@ def _get_redis_url_kwargs(client=None):
      return available_args


+ def _get_redis_cluster_kwargs(client=None):
+     if client is None:
+         client = redis.Redis.from_url
+     arg_spec = inspect.getfullargspec(redis.RedisCluster)
+
+     # Only allow primitive arguments
+     exclude_args = {"self", "connection_pool", "retry", "host", "port", "startup_nodes"}
+
+     available_args = [x for x in arg_spec.args if x not in exclude_args]
+
+     return available_args
+
+
  def _get_redis_env_kwarg_mapping():
      PREFIX = "REDIS_"
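For readers unfamiliar with this introspection trick, here is a small standalone sketch (not part of the diff) of what the new `_get_redis_cluster_kwargs()` helper computes: `inspect.getfullargspec` lists the constructor arguments of `redis.RedisCluster`, and the exclude set drops `self` plus the connection- and cluster-specific arguments that LiteLLM manages itself. It assumes a redis-py version that ships `redis.RedisCluster` (4.x or newer).

    import inspect

    import redis  # assumes redis-py >= 4.x, which provides redis.RedisCluster

    # Collect RedisCluster's constructor argument names via introspection.
    arg_spec = inspect.getfullargspec(redis.RedisCluster)

    # Drop arguments that should not be forwarded straight through.
    exclude_args = {"self", "connection_pool", "retry", "host", "port", "startup_nodes"}
    allowed = [arg for arg in arg_spec.args if arg not in exclude_args]

    print(allowed)  # the remaining constructor kwargs that are safe to forward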
@@ -124,6 +141,22 @@ def get_redis_client(**env_overrides):
              url_kwargs[arg] = redis_kwargs[arg]

          return redis.Redis.from_url(**url_kwargs)

+     if "startup_nodes" in redis_kwargs:
+         from redis.cluster import ClusterNode
+
+         args = _get_redis_cluster_kwargs()
+         cluster_kwargs = {}
+         for arg in redis_kwargs:
+             if arg in args:
+                 cluster_kwargs[arg] = redis_kwargs[arg]
+
+         new_startup_nodes: List[ClusterNode] = []
+
+         for item in redis_kwargs["startup_nodes"]:
+             new_startup_nodes.append(ClusterNode(**item))
+         redis_kwargs.pop("startup_nodes")
+         return redis.RedisCluster(startup_nodes=new_startup_nodes, **cluster_kwargs)
      return redis.Redis(**redis_kwargs)
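To make the new cluster branch concrete, here is a hypothetical usage sketch (the host/port values are placeholders, not from the diff): the plain startup-node dicts are converted into `redis.cluster.ClusterNode` objects before the cluster client is built, exactly as the loop above does, and any other filtered kwargs would be passed alongside them.

    import redis
    from redis.cluster import ClusterNode

    # Placeholder addresses for a locally running redis cluster.
    startup_nodes = [
        {"host": "127.0.0.1", "port": 7001},
        {"host": "127.0.0.1", "port": 7002},
    ]

    # Each plain dict becomes a ClusterNode, mirroring the conversion in the diff.
    nodes = [ClusterNode(**node) for node in startup_nodes]

    # Remaining kwargs (password, ssl, ...) would be forwarded next to startup_nodes.
    client = redis.RedisCluster(startup_nodes=nodes)
    client.ping()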
@@ -143,6 +176,24 @@ def get_redis_async_client(**env_overrides):
          )
          return async_redis.Redis.from_url(**url_kwargs)

+     if "startup_nodes" in redis_kwargs:
+         from redis.cluster import ClusterNode
+
+         args = _get_redis_cluster_kwargs()
+         cluster_kwargs = {}
+         for arg in redis_kwargs:
+             if arg in args:
+                 cluster_kwargs[arg] = redis_kwargs[arg]
+
+         new_startup_nodes: List[ClusterNode] = []
+
+         for item in redis_kwargs["startup_nodes"]:
+             new_startup_nodes.append(ClusterNode(**item))
+         redis_kwargs.pop("startup_nodes")
+         return async_redis.RedisCluster(
+             startup_nodes=new_startup_nodes, **cluster_kwargs
+         )
+
      return async_redis.Redis(
          socket_timeout=5,
          **redis_kwargs,
@@ -160,4 +211,5 @@ def get_redis_connection_pool(**env_overrides):
          connection_class = async_redis.SSLConnection
          redis_kwargs.pop("ssl", None)
          redis_kwargs["connection_class"] = connection_class
+     redis_kwargs.pop("startup_nodes", None)
      return async_redis.BlockingConnectionPool(timeout=5, **redis_kwargs)
@@ -203,6 +203,7 @@ class RedisCache(BaseCache):
          password=None,
          redis_flush_size=100,
          namespace: Optional[str] = None,
+         startup_nodes: Optional[List] = None, # for redis-cluster
          **kwargs,
      ):
          import redis
@@ -218,7 +219,8 @@ class RedisCache(BaseCache):
              redis_kwargs["port"] = port
          if password is not None:
              redis_kwargs["password"] = password
+         if startup_nodes is not None:
+             redis_kwargs["startup_nodes"] = startup_nodes
          ### HEALTH MONITORING OBJECT ###
          if kwargs.get("service_logger_obj", None) is not None and isinstance(
              kwargs["service_logger_obj"], ServiceLogging
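A short usage sketch of the new parameter, mirroring the unit test added at the end of this PR (the node address is a placeholder for a locally running cluster): once `startup_nodes` is supplied it lands in `redis_kwargs`, and `get_redis_client()` then returns a `RedisCluster` client instead of a plain `Redis` client.

    from litellm.caching import RedisCache

    # Placeholder node for a local redis cluster; real deployments would list several nodes.
    cache = RedisCache(startup_nodes=[{"host": "127.0.0.1", "port": "7001"}])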
@@ -246,7 +248,7 @@ class RedisCache(BaseCache):
          ### ASYNC HEALTH PING ###
          try:
              # asyncio.get_running_loop().create_task(self.ping())
-             result = asyncio.get_running_loop().create_task(self.ping())
+             _ = asyncio.get_running_loop().create_task(self.ping())
          except Exception as e:
              if "no running event loop" in str(e):
                  verbose_logger.debug(
@@ -2123,6 +2125,7 @@ class Cache:
          redis_semantic_cache_use_async=False,
          redis_semantic_cache_embedding_model="text-embedding-ada-002",
          redis_flush_size=None,
+         redis_startup_nodes: Optional[List] = None,
          disk_cache_dir=None,
          qdrant_api_base: Optional[str] = None,
          qdrant_api_key: Optional[str] = None,
@@ -2155,7 +2158,12 @@ class Cache:
          """
          if type == "redis":
              self.cache: BaseCache = RedisCache(
-                 host, port, password, redis_flush_size, **kwargs
+                 host,
+                 port,
+                 password,
+                 redis_flush_size,
+                 startup_nodes=redis_startup_nodes,
+                 **kwargs,
              )
          elif type == "redis-semantic":
              self.cache = RedisSemanticCache(
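The same capability through the public `litellm.Cache` wrapper, again mirroring the new unit test: `redis_startup_nodes` is simply forwarded to `RedisCache` as `startup_nodes`. The node address is a placeholder.

    import litellm

    litellm.cache = litellm.Cache(
        type="redis",
        redis_startup_nodes=[{"host": "127.0.0.1", "port": "7001"}],
    )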
@@ -60,6 +60,25 @@ class PrometheusLogger(CustomLogger):
              ],
          )

+         # request latency metrics
+         self.litellm_request_total_latency_metric = Histogram(
+             "litellm_request_total_latency_metric",
+             "Total latency (seconds) for a request to LiteLLM",
+             labelnames=[
+                 "model",
+                 "litellm_call_id",
+             ],
+         )
+
+         self.litellm_llm_api_latency_metric = Histogram(
+             "litellm_llm_api_latency_metric",
+             "Total latency (seconds) for a models LLM API call",
+             labelnames=[
+                 "model",
+                 "litellm_call_id",
+             ],
+         )
+
          # Counter for spend
          self.litellm_spend_metric = Counter(
              "litellm_spend_metric",
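For context, a minimal standalone `prometheus_client` sketch (metric name and label values below are illustrative, not LiteLLM's) showing how a labeled `Histogram` like the two above is declared and fed: `.labels(...)` selects a child time series and `.observe(...)` records one latency sample into its buckets.

    from prometheus_client import Histogram

    example_latency = Histogram(
        "example_request_latency_seconds",           # hypothetical metric name
        "Total latency (seconds) for one request",   # help text
        labelnames=["model", "litellm_call_id"],
    )

    # Record a 0.42 s sample for one (model, call id) pair.
    example_latency.labels("gpt-3.5-turbo", "call-123").observe(0.42)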
@@ -329,6 +348,25 @@ class PrometheusLogger(CustomLogger):
                  user_api_key, user_api_key_alias, model_group
              ).set(remaining_tokens)

+             # latency metrics
+             total_time: timedelta = kwargs.get("end_time") - kwargs.get("start_time")
+             total_time_seconds = total_time.total_seconds()
+             api_call_total_time: timedelta = kwargs.get("end_time") - kwargs.get(
+                 "api_call_start_time"
+             )
+
+             api_call_total_time_seconds = api_call_total_time.total_seconds()
+
+             litellm_call_id = kwargs.get("litellm_call_id")
+
+             self.litellm_request_total_latency_metric.labels(
+                 model, litellm_call_id
+             ).observe(total_time_seconds)
+
+             self.litellm_llm_api_latency_metric.labels(model, litellm_call_id).observe(
+                 api_call_total_time_seconds
+             )
+
              # set x-ratelimit headers
              if premium_user is True:
                  self.set_llm_deployment_success_metrics(
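A minimal sketch of the arithmetic above (the timestamps are made up): the logger receives `datetime` objects in `kwargs`, subtracts them to get `timedelta`s, and converts those to float seconds before calling `.observe()` on the histograms defined earlier.

    import datetime

    start_time = datetime.datetime(2024, 8, 1, 12, 0, 0)            # request received
    api_call_start_time = datetime.datetime(2024, 8, 1, 12, 0, 1)   # LLM API call begins
    end_time = datetime.datetime(2024, 8, 1, 12, 0, 3)              # response returned

    # Feeds litellm_request_total_latency_metric -> 3.0 seconds
    total_time_seconds = (end_time - start_time).total_seconds()

    # Feeds litellm_llm_api_latency_metric -> 2.0 seconds
    api_call_total_time_seconds = (end_time - api_call_start_time).total_seconds()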
@@ -354,6 +354,8 @@ class Logging:
                          str(e)
                      )
                  )
+
+         self.model_call_details["api_call_start_time"] = datetime.datetime.now()
          # Input Integration Logging -> If you want to log the fact that an attempt to call the model was made
          callbacks = litellm.input_callback + self.dynamic_input_callbacks
          for callback in callbacks:
@@ -2,12 +2,3 @@ model_list:
    - model_name: "*"
      litellm_params:
        model: "*"
-
- litellm_settings:
-   success_callback: ["s3"]
-   cache: true
-   s3_callback_params:
-     s3_bucket_name: mytestbucketlitellm # AWS Bucket Name for S3
-     s3_region_name: us-west-2 # AWS Region Name for S3
-     s3_aws_access_key_id: os.environ/AWS_ACCESS_KEY_ID # us os.environ/<variable name> to pass environment variables. This is AWS Access Key ID for S3
-     s3_aws_secret_access_key: os.environ/AWS_SECRET_ACCESS_KEY # AWS Secret Access Key for S3
@@ -66,7 +66,7 @@ def common_checks(
          raise Exception(
              f"Team={team_object.team_id} is blocked. Update via `/team/unblock` if your admin."
          )
-     # 2. If user can call model
+     # 2. If team can call model
      if (
          _model is not None
          and team_object is not None

@@ -74,7 +74,11 @@ def common_checks(
          and _model not in team_object.models
      ):
          # this means the team has access to all models on the proxy
-         if "all-proxy-models" in team_object.models:
+         if (
+             "all-proxy-models" in team_object.models
+             or "*" in team_object.models
+             or "openai/*" in team_object.models
+         ):
              # this means the team has access to all models on the proxy
              pass
          # check if the team model is an access_group
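A toy illustration (not from the diff) of the widened check: any one of these sentinel entries in a team's model list now counts as access to every model on the proxy.

    def team_has_all_model_access(team_models: list) -> bool:
        # Sentinel values that grant blanket access, per the diff above.
        return (
            "all-proxy-models" in team_models
            or "*" in team_models
            or "openai/*" in team_models
        )

    assert team_has_all_model_access(["*"]) is True
    assert team_has_all_model_access(["gpt-4"]) is False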
@@ -1,8 +1,9 @@
  model_list:
    - model_name: fake-openai-endpoint
      litellm_params:
-       model: openai/gpt-4
-       api_key: os.environ/OPENAI_API_KEY
+       model: openai/fake
+       api_key: fake-key
+       api_base: https://exampleopenaiendpoint-production.up.railway.app/

  guardrails:
    - guardrail_name: "bedrock-pre-guard"
@@ -1588,7 +1588,7 @@ class ProxyConfig:
                      verbose_proxy_logger.debug(  # noqa
                          f"{blue_color_code}Set Cache on LiteLLM Proxy: {vars(litellm.cache.cache)}{reset_color_code}"
                      )
-                 elif key == "cache" and value == False:
+                 elif key == "cache" and value is False:
                      pass
                  elif key == "guardrails":
                      if premium_user is not True:
@@ -2672,6 +2672,13 @@ def giveup(e):
          and isinstance(e.message, str)
          and "Max parallel request limit reached" in e.message
      )

+     if (
+         general_settings.get("disable_retry_on_max_parallel_request_limit_error")
+         is True
+     ):
+         return True  # giveup if queuing max parallel request limits is disabled
+
      if result:
          verbose_proxy_logger.info(json.dumps({"event": "giveup", "exception": str(e)}))
      return result
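For orientation, a hedged sketch of how a `giveup` predicate like this is typically consumed, assuming it is handed to the `backoff` library's retry decorator (the decorated function and settings below are illustrative, not LiteLLM's): returning `True` stops further retries for that exception.

    import backoff


    def giveup(e: Exception) -> bool:
        # e.g. True when retries on "max parallel request" errors are disabled
        return "Max parallel request limit reached" in str(e)


    @backoff.on_exception(backoff.expo, Exception, max_tries=3, giveup=giveup)
    def call_downstream():  # hypothetical function protected by retries
        ...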
@@ -277,7 +277,8 @@ class Router:
              "local"  # default to an in-memory cache
          )
          redis_cache = None
-         cache_config = {}
+         cache_config: Dict[str, Any] = {}
+
          self.client_ttl = client_ttl
          if redis_url is not None or (
              redis_host is not None
@@ -804,6 +804,38 @@ def test_redis_cache_completion_stream():
  # test_redis_cache_completion_stream()


+ @pytest.mark.skip(reason="Local test. Requires running redis cluster locally.")
+ @pytest.mark.asyncio
+ async def test_redis_cache_cluster_init_unit_test():
+     try:
+         from redis.asyncio import RedisCluster as AsyncRedisCluster
+         from redis.cluster import RedisCluster
+
+         from litellm.caching import RedisCache
+
+         litellm.set_verbose = True
+
+         # List of startup nodes
+         startup_nodes = [
+             {"host": "127.0.0.1", "port": "7001"},
+         ]
+
+         resp = RedisCache(startup_nodes=startup_nodes)
+
+         assert isinstance(resp.redis_client, RedisCluster)
+         assert isinstance(resp.init_async_client(), AsyncRedisCluster)
+
+         resp = litellm.Cache(type="redis", redis_startup_nodes=startup_nodes)
+
+         assert isinstance(resp.cache, RedisCache)
+         assert isinstance(resp.cache.redis_client, RedisCluster)
+         assert isinstance(resp.cache.init_async_client(), AsyncRedisCluster)
+
+     except Exception as e:
+         print(f"{str(e)}\n\n{traceback.format_exc()}")
+         raise e
+
+
  @pytest.mark.asyncio
  async def test_redis_cache_acompletion_stream():
      try: