diff --git a/litellm/__init__.py b/litellm/__init__.py
index 2aa89a03c..11b34f504 100644
--- a/litellm/__init__.py
+++ b/litellm/__init__.py
@@ -160,9 +160,6 @@ enable_caching_on_provider_specific_optional_params: bool = (
 caching: bool = (
     False  # Not used anymore, will be removed in next MAJOR release - https://github.com/BerriAI/litellm/discussions/648
 )
-always_read_redis: bool = (
-    True  # always use redis for rate limiting logic on litellm proxy
-)
 caching_with_models: bool = (
     False  # # Not used anymore, will be removed in next MAJOR release - https://github.com/BerriAI/litellm/discussions/648
 )
diff --git a/litellm/caching/dual_cache.py b/litellm/caching/dual_cache.py
index 720da9ad6..ec6a6c163 100644
--- a/litellm/caching/dual_cache.py
+++ b/litellm/caching/dual_cache.py
@@ -32,7 +32,6 @@ class DualCache(BaseCache):
         redis_cache: Optional[RedisCache] = None,
         default_in_memory_ttl: Optional[float] = None,
         default_redis_ttl: Optional[float] = None,
-        always_read_redis: Optional[bool] = True,
     ) -> None:
         super().__init__()
         # If in_memory_cache is not provided, use the default InMemoryCache
@@ -44,7 +43,6 @@ class DualCache(BaseCache):
             default_in_memory_ttl or litellm.default_in_memory_ttl
         )
         self.default_redis_ttl = default_redis_ttl or litellm.default_redis_ttl
-        self.always_read_redis = always_read_redis
 
     def update_cache_ttl(
         self, default_in_memory_ttl: Optional[float], default_redis_ttl: Optional[float]
@@ -102,12 +100,8 @@ class DualCache(BaseCache):
                 if in_memory_result is not None:
                     result = in_memory_result
 
-            if (
-                (self.always_read_redis is True)
-                and self.redis_cache is not None
-                and local_only is False
-            ):
-                # If not found in in-memory cache or always_read_redis is True, try fetching from Redis
+            if result is None and self.redis_cache is not None and local_only is False:
+                # If not found in in-memory cache, try fetching from Redis
                 redis_result = self.redis_cache.get_cache(key, **kwargs)
 
                 if redis_result is not None:
diff --git a/litellm/integrations/opentelemetry.py b/litellm/integrations/opentelemetry.py
index 8ba871acc..f1b7abbbb 100644
--- a/litellm/integrations/opentelemetry.py
+++ b/litellm/integrations/opentelemetry.py
@@ -171,7 +171,7 @@ class OpenTelemetry(CustomLogger):
                 try:
                     value = str(value)
                 except Exception:
-                    value = "litllm logging error - could_not_json_serialize"
+                    value = "litellm logging error - could_not_json_serialize"
                 self.safe_set_attribute(
                     span=service_logging_span,
                     key=key,
diff --git a/litellm/proxy/_new_secret_config.yaml b/litellm/proxy/_new_secret_config.yaml
index 00f4da8d9..2cdf35b70 100644
--- a/litellm/proxy/_new_secret_config.yaml
+++ b/litellm/proxy/_new_secret_config.yaml
@@ -1,10 +1,51 @@
 model_list:
   - model_name: gpt-4o
     litellm_params:
-      model: azure/gpt-4o-realtime-preview
-      api_key: os.environ/AZURE_SWEDEN_API_KEY
-      api_base: os.environ/AZURE_SWEDEN_API_BASE
+      model: openai/fake
+      api_key: fake-key
+      api_base: https://exampleopenaiendpoint-production.up.railway.app/
 
 litellm_settings:
-  success_callback: ["langfuse"]
-  # logged_real_time_event_types: "*"
\ No newline at end of file
+  callbacks: ["prometheus", "otel"]
+
+general_settings:
+  user_api_key_cache_ttl: 3600
+
+router_settings:
+  routing_strategy: latency-based-routing
+  routing_strategy_args:
+    # only assign 40% of traffic to the fastest deployment to avoid overloading it
+    lowest_latency_buffer: 0.4
+
+    # consider last five minutes of calls for latency calculation
+    ttl: 300
+
+  # model_group_alias:
+  #   gpt-4o: gpt-4o-128k-2024-05-13
+  #   gpt-4o-mini: gpt-4o-mini-128k-2024-07-18
+
+  enable_tag_filtering: True
+
+  # retry call 3 times on each model_name (we don't use fallbacks, so this would be 3 times total)
+  num_retries: 3
+
+  # -- cooldown settings --
+  # see https://github.com/BerriAI/litellm/blob/main/litellm/router_utils/cooldown_handlers.py#L265
+
+  # cooldown model if it fails > n calls in a minute.
+  allowed_fails: 2
+
+  # (in seconds) how long to cooldown model if fails/min > allowed_fails
+  cooldown_time: 60
+
+  allowed_fails_policy:
+    InternalServerErrorAllowedFails: 1
+    RateLimitErrorAllowedFails: 2
+    TimeoutErrorAllowedFails: 3
+  # -- end cooldown settings --
+
+  # see https://docs.litellm.ai/docs/proxy/prod#3-use-redis-porthost-password-not-redis_url
+  redis_host: os.environ/REDIS_HOST
+  redis_port: os.environ/REDIS_PORT
+  redis_password: os.environ/REDIS_PASSWORD
+
diff --git a/tests/local_testing/test_caching.py b/tests/local_testing/test_caching.py
index dfadf11bb..f56079aa7 100644
--- a/tests/local_testing/test_caching.py
+++ b/tests/local_testing/test_caching.py
@@ -2066,46 +2066,6 @@ async def test_cache_default_off_acompletion():
     assert response3.id == response4.id
 
 
-@pytest.mark.asyncio()
-@pytest.mark.skip(reason="dual caching should first prioritze local cache")
-async def test_dual_cache_uses_redis():
-    """
-
-    - Store diff values in redis and in memory cache
-    - call get cache
-    - Assert that value from redis is used
-    """
-    litellm.set_verbose = True
-    from litellm.caching.caching import DualCache, RedisCache
-
-    current_usage = uuid.uuid4()
-
-    _cache_obj = DualCache(redis_cache=RedisCache(), always_read_redis=True)
-
-    # set cache
-    await _cache_obj.async_set_cache(key=f"current_usage: {current_usage}", value=10)
-
-    # modify value of in memory cache
-    _cache_obj.in_memory_cache.cache_dict[f"current_usage: {current_usage}"] = 1
-
-    # get cache
-    value = await _cache_obj.async_get_cache(key=f"current_usage: {current_usage}")
-    print("value from dual cache", value)
-    assert value == 10
-
-
-@pytest.mark.asyncio()
-async def test_proxy_logging_setup():
-    """
-    Assert always_read_redis is True when used by internal usage cache
-    """
-    from litellm.caching.caching import DualCache
-    from litellm.proxy.utils import ProxyLogging
-
-    pl_obj = ProxyLogging(user_api_key_cache=DualCache())
-    assert pl_obj.internal_usage_cache.dual_cache.always_read_redis is True
-
-
 @pytest.mark.skip(reason="local test. Requires sentinel setup.")
 @pytest.mark.asyncio
 async def test_redis_sentinel_caching():
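
Note (not part of the patch): a minimal sketch of the read path after this change, adapted from the `DualCache` usage in the removed test. With `always_read_redis` gone, Redis is only consulted when the in-memory layer misses. `RedisCache()` with no arguments reads REDIS_HOST/REDIS_PORT/REDIS_PASSWORD from the environment (as in the removed test); the `spend:team-a` key and values are hypothetical, for illustration only.

```python
# Sketch only: assumes a reachable Redis configured via environment variables.
from litellm.caching.caching import DualCache, RedisCache

cache = DualCache(redis_cache=RedisCache())

# Writes still go to both layers (in-memory and Redis).
cache.set_cache(key="spend:team-a", value=10)

# Simulate the two layers drifting apart by editing the in-memory dict directly.
cache.in_memory_cache.cache_dict["spend:team-a"] = 1

# The in-memory value now wins; Redis is read only when the local lookup misses.
assert cache.get_cache(key="spend:team-a") == 1
```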