forked from phoenix/litellm-mirror
perf: remove 'always_read_redis' - adding +830ms on each llm call (#6414)
* perf: remove 'always_read_redis' - adding +830ms on each llm call
* test: cleanup codestral tests - backend api unavailable
commit d59f8f952d
parent 0f0470f574
5 changed files with 49 additions and 57 deletions
@@ -160,9 +160,6 @@ enable_caching_on_provider_specific_optional_params: bool = (
 caching: bool = (
     False # Not used anymore, will be removed in next MAJOR release - https://github.com/BerriAI/litellm/discussions/648
 )
-always_read_redis: bool = (
-    True # always use redis for rate limiting logic on litellm proxy
-)
 caching_with_models: bool = (
     False # # Not used anymore, will be removed in next MAJOR release - https://github.com/BerriAI/litellm/discussions/648
 )

@@ -32,7 +32,6 @@ class DualCache(BaseCache):
         redis_cache: Optional[RedisCache] = None,
         default_in_memory_ttl: Optional[float] = None,
         default_redis_ttl: Optional[float] = None,
-        always_read_redis: Optional[bool] = True,
     ) -> None:
         super().__init__()
         # If in_memory_cache is not provided, use the default InMemoryCache

@@ -44,7 +43,6 @@ class DualCache(BaseCache):
             default_in_memory_ttl or litellm.default_in_memory_ttl
         )
         self.default_redis_ttl = default_redis_ttl or litellm.default_redis_ttl
-        self.always_read_redis = always_read_redis
 
     def update_cache_ttl(
         self, default_in_memory_ttl: Optional[float], default_redis_ttl: Optional[float]

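With the parameter gone, `DualCache` is constructed without `always_read_redis`. A minimal sketch of post-change construction; the Redis connection details and TTL values below are illustrative assumptions, not values from this commit:

```python
# Sketch only: building a DualCache after this change.
# Host/port/TTL values are assumptions for illustration.
from litellm.caching.caching import DualCache, RedisCache

redis_cache = RedisCache(host="localhost", port=6379, password=None)

dual_cache = DualCache(
    redis_cache=redis_cache,
    default_in_memory_ttl=60,   # seconds
    default_redis_ttl=3600,     # seconds
    # always_read_redis=True,   # removed in this commit; passing it now raises TypeError
)
```
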
@@ -102,12 +100,8 @@ class DualCache(BaseCache):
                 if in_memory_result is not None:
                     result = in_memory_result
 
-            if (
-                (self.always_read_redis is True)
-                and self.redis_cache is not None
-                and local_only is False
-            ):
-                # If not found in in-memory cache or always_read_redis is True, try fetching from Redis
+            if result is None and self.redis_cache is not None and local_only is False:
+                # If not found in in-memory cache, try fetching from Redis
                 redis_result = self.redis_cache.get_cache(key, **kwargs)
 
                 if redis_result is not None:

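This hunk is where the latency is bought back: with `always_read_redis` enabled (its default), every `get_cache` call hit Redis even after an in-memory hit, which the commit message pegs at roughly +830ms per LLM call; now Redis is consulted only on an in-memory miss. A simplified, self-contained sketch of the new read path (not the actual litellm method; names are illustrative):

```python
from typing import Any, Callable, Dict, Optional


def dual_cache_get(
    key: str,
    in_memory: Dict[str, Any],
    redis_get: Optional[Callable[[str], Optional[Any]]] = None,
    local_only: bool = False,
) -> Optional[Any]:
    """Illustrative read path after this commit: Redis is a fallback, not a default."""
    result = in_memory.get(key)

    # Pre-commit behavior queried Redis even on an in-memory hit, paying a
    # network round-trip on every call; post-commit the block only runs on a miss.
    if result is None and redis_get is not None and local_only is False:
        redis_result = redis_get(key)
        if redis_result is not None:
            # Warm the in-memory layer so the next read stays local.
            in_memory[key] = redis_result
            result = redis_result

    return result


# Example: a local hit never touches redis_get.
cache = {"user:123": {"rpm": 10}}
assert dual_cache_get("user:123", cache, redis_get=lambda k: {"rpm": 99}) == {"rpm": 10}
```
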
@@ -171,7 +171,7 @@ class OpenTelemetry(CustomLogger):
                 try:
                     value = str(value)
                 except Exception:
-                    value = "litllm logging error - could_not_json_serialize"
+                    value = "litellm logging error - could_not_json_serialize"
                 self.safe_set_attribute(
                     span=service_logging_span,
                     key=key,

@@ -1,10 +1,51 @@
 model_list:
   - model_name: gpt-4o
     litellm_params:
-      model: azure/gpt-4o-realtime-preview
-      api_key: os.environ/AZURE_SWEDEN_API_KEY
-      api_base: os.environ/AZURE_SWEDEN_API_BASE
+      model: openai/fake
+      api_key: fake-key
+      api_base: https://exampleopenaiendpoint-production.up.railway.app/
 
 litellm_settings:
-  success_callback: ["langfuse"]
-  # logged_real_time_event_types: "*"
+  callbacks: ["prometheus", "otel"]
+
+general_settings:
+  user_api_key_cache_ttl: 3600
+
+router_settings:
+  routing_strategy: latency-based-routing
+  routing_strategy_args:
+    # only assign 40% of traffic to the fastest deployment to avoid overloading it
+    lowest_latency_buffer: 0.4
+
+    # consider last five minutes of calls for latency calculation
+    ttl: 300
+
+  # model_group_alias:
+  #   gpt-4o: gpt-4o-128k-2024-05-13
+  #   gpt-4o-mini: gpt-4o-mini-128k-2024-07-18
+
+  enable_tag_filtering: True
+
+  # retry call 3 times on each model_name (we don't use fallbacks, so this would be 3 times total)
+  num_retries: 3
+
+  # -- cooldown settings --
+  # see https://github.com/BerriAI/litellm/blob/main/litellm/router_utils/cooldown_handlers.py#L265
+
+  # cooldown model if it fails > n calls in a minute.
+  allowed_fails: 2
+
+  # (in seconds) how long to cooldown model if fails/min > allowed_fails
+  cooldown_time: 60
+
+  allowed_fails_policy:
+    InternalServerErrorAllowedFails: 1
+    RateLimitErrorAllowedFails: 2
+    TimeoutErrorAllowedFails: 3
+  # -- end cooldown settings --
+
+  # see https://docs.litellm.ai/docs/proxy/prod#3-use-redis-porthost-password-not-redis_url
+  redis_host: os.environ/REDIS_HOST
+  redis_port: os.environ/REDIS_PORT
+  redis_password: os.environ/REDIS_PASSWORD

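As a usage sketch: once the proxy is started with this config, the `gpt-4o` model group resolves to the fake OpenAI-compatible endpoint above, so it can be exercised with any OpenAI-style client. The base URL, port, and key below are assumptions about a local deployment, not values from this commit:

```python
# Hypothetical smoke test against a locally running litellm proxy
# loaded with the config above. base_url and api_key are assumptions.
from openai import OpenAI

client = OpenAI(
    api_key="sk-anything",             # whatever key the local proxy accepts (assumed)
    base_url="http://localhost:4000",  # litellm proxy's default port is 4000 (assumed here)
)

resp = client.chat.completions.create(
    model="gpt-4o",  # routed by the proxy to the openai/fake deployment in model_list
    messages=[{"role": "user", "content": "ping"}],
)
print(resp.choices[0].message.content)
```
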
@@ -2066,46 +2066,6 @@ async def test_cache_default_off_acompletion():
     assert response3.id == response4.id
 
 
-@pytest.mark.asyncio()
-@pytest.mark.skip(reason="dual caching should first prioritze local cache")
-async def test_dual_cache_uses_redis():
-    """
-
-    - Store diff values in redis and in memory cache
-    - call get cache
-    - Assert that value from redis is used
-    """
-    litellm.set_verbose = True
-    from litellm.caching.caching import DualCache, RedisCache
-
-    current_usage = uuid.uuid4()
-
-    _cache_obj = DualCache(redis_cache=RedisCache(), always_read_redis=True)
-
-    # set cache
-    await _cache_obj.async_set_cache(key=f"current_usage: {current_usage}", value=10)
-
-    # modify value of in memory cache
-    _cache_obj.in_memory_cache.cache_dict[f"current_usage: {current_usage}"] = 1
-
-    # get cache
-    value = await _cache_obj.async_get_cache(key=f"current_usage: {current_usage}")
-    print("value from dual cache", value)
-    assert value == 10
-
-
-@pytest.mark.asyncio()
-async def test_proxy_logging_setup():
-    """
-    Assert always_read_redis is True when used by internal usage cache
-    """
-    from litellm.caching.caching import DualCache
-    from litellm.proxy.utils import ProxyLogging
-
-    pl_obj = ProxyLogging(user_api_key_cache=DualCache())
-    assert pl_obj.internal_usage_cache.dual_cache.always_read_redis is True
-
-
 @pytest.mark.skip(reason="local test. Requires sentinel setup.")
 @pytest.mark.asyncio
 async def test_redis_sentinel_caching():

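The deleted `test_dual_cache_uses_redis` asserted the old precedence (the Redis value wins); a replacement test for the new precedence would flip the final assertion. A sketch of what that could look like, reusing the same helpers the deleted test used and, like it, assuming a reachable Redis instance:

```python
# Sketch of a possible replacement test (not part of this commit). Like the
# deleted test, it assumes a reachable Redis instance for RedisCache().
import uuid

import pytest

import litellm
from litellm.caching.caching import DualCache, RedisCache


@pytest.mark.asyncio()
async def test_dual_cache_prefers_in_memory():
    """
    - Store different values in redis and in memory cache
    - call get cache
    - Assert that the in-memory value is used (redis is only a miss fallback now)
    """
    litellm.set_verbose = True
    current_usage = uuid.uuid4()

    _cache_obj = DualCache(redis_cache=RedisCache())

    # set cache - writes to both layers
    await _cache_obj.async_set_cache(key=f"current_usage: {current_usage}", value=10)

    # modify value of in memory cache only
    _cache_obj.in_memory_cache.cache_dict[f"current_usage: {current_usage}"] = 1

    # get cache - the in-memory hit short-circuits the redis read
    value = await _cache_obj.async_get_cache(key=f"current_usage: {current_usage}")
    assert value == 1
```
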