diff --git a/litellm/model_prices_and_context_window_backup.json b/litellm/model_prices_and_context_window_backup.json
index 6804d677e..2831f1a5c 100644
--- a/litellm/model_prices_and_context_window_backup.json
+++ b/litellm/model_prices_and_context_window_backup.json
@@ -2454,6 +2454,17 @@
         "mode": "chat",
         "source": "https://cloud.google.com/vertex-ai/generative-ai/pricing#partner-models"
     },
+    "vertex_ai/meta/llama-3.2-90b-vision-instruct-maas": {
+        "max_tokens": 8192,
+        "max_input_tokens": 128000,
+        "max_output_tokens": 8192,
+        "input_cost_per_token": 0.0,
+        "output_cost_per_token": 0.0,
+        "litellm_provider": "vertex_ai-llama_models",
+        "mode": "chat",
+        "supports_system_messages": true,
+        "source": "https://console.cloud.google.com/vertex-ai/publishers/meta/model-garden/llama-3.2-90b-vision-instruct-maas"
+    },
     "vertex_ai/mistral-large@latest": {
         "max_tokens": 8191,
         "max_input_tokens": 128000,
diff --git a/litellm/proxy/utils.py b/litellm/proxy/utils.py
index 80eea79fb..61448f4bb 100644
--- a/litellm/proxy/utils.py
+++ b/litellm/proxy/utils.py
@@ -70,6 +70,7 @@ from litellm.proxy.hooks.max_budget_limiter import _PROXY_MaxBudgetLimiter
 from litellm.proxy.hooks.parallel_request_limiter import (
     _PROXY_MaxParallelRequestsHandler,
 )
+from litellm.proxy.proxy_server import UserAPIKeyCacheTTLEnum
 from litellm.types.utils import CallTypes, LoggedLiteLLMParams
 
 if TYPE_CHECKING:
@@ -301,7 +302,9 @@ class ProxyLogging:
         self.call_details: dict = {}
         self.call_details["user_api_key_cache"] = user_api_key_cache
         self.internal_usage_cache: InternalUsageCache = InternalUsageCache(
-            dual_cache=DualCache(default_in_memory_ttl=1)  # ping redis cache every 1s
+            dual_cache=DualCache(
+                default_in_memory_ttl=UserAPIKeyCacheTTLEnum.in_memory_cache_ttl.value
+            )  # ping redis cache every 1s
         )
         self.max_parallel_request_limiter = _PROXY_MaxParallelRequestsHandler(
             self.internal_usage_cache
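
For readers of the second hunk: `UserAPIKeyCacheTTLEnum` is defined in `litellm/proxy/proxy_server.py`, not in this diff, so the sketch below only illustrates the assumed shape of that enum and why `.value` is needed when passing it to `DualCache`. The concrete TTL number used here (60) is a placeholder for illustration, not taken from the patch.

```python
import enum


class UserAPIKeyCacheTTLEnum(enum.Enum):
    # Assumed shape for illustration: a single member holding the in-memory
    # cache TTL in seconds. The real value lives in litellm/proxy/proxy_server.py.
    in_memory_cache_ttl = 60


# Enum members are not plain numbers, so the patch passes `.value`:
#   DualCache(default_in_memory_ttl=UserAPIKeyCacheTTLEnum.in_memory_cache_ttl.value)
ttl = UserAPIKeyCacheTTLEnum.in_memory_cache_ttl.value
print(ttl)  # -> 60 with the placeholder value above
```

Presumably the point of the change is to read the internal usage cache's in-memory TTL from the same shared constant as the user-API-key cache, instead of hard-coding 1 second in `ProxyLogging.__init__`.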