LiteLLM Minor Fixes & Improvements (09/21/2024) (#5819)

* fix(router.py): fix error message

* Litellm disable keys (#5814)

* build(schema.prisma): allow blocking/unblocking keys

Fixes https://github.com/BerriAI/litellm/issues/5328

* fix(key_management_endpoints.py): fix pop

* feat(auth_checks.py): allow admin to enable/disable virtual keys

Closes https://github.com/BerriAI/litellm/issues/5328
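
For reference, a rough sketch of how an admin could exercise the new block/unblock behaviour against a running proxy. The endpoint paths (/key/block, /key/unblock), the payload shape, and the local URL/admin key below are illustrative assumptions, not something this commit pins down.

import httpx

PROXY_BASE_URL = "http://localhost:4000"  # placeholder proxy address
ADMIN_KEY = "sk-admin-1234"               # placeholder admin virtual key

def set_key_blocked(key: str, blocked: bool) -> dict:
    # Hit the (assumed) admin endpoint that flips the blocked state of a virtual key.
    path = "/key/block" if blocked else "/key/unblock"
    resp = httpx.post(
        f"{PROXY_BASE_URL}{path}",
        headers={"Authorization": f"Bearer {ADMIN_KEY}"},
        json={"key": key},
    )
    resp.raise_for_status()
    return resp.json()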

* docs(vertex.md): add auth section for vertex ai

Addresses - https://github.com/BerriAI/litellm/issues/5768#issuecomment-2365284223
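
As a pointer to what the new docs section covers, here is a minimal sketch of passing service-account credentials to a Vertex AI call. The parameter names (vertex_project, vertex_location, vertex_credentials), the model id, and the project/path values are illustrative assumptions, not taken from this commit.

import litellm

response = litellm.completion(
    model="vertex_ai/gemini-1.5-pro",
    messages=[{"role": "user", "content": "hello"}],
    vertex_project="my-gcp-project",                      # placeholder GCP project
    vertex_location="us-central1",
    vertex_credentials="/path/to/service_account.json",   # path to a service-account JSON file
)
print(response.choices[0].message.content)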

* build(model_prices_and_context_window.json): show which models support prompt_caching

Closes https://github.com/BerriAI/litellm/issues/5776
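
A small sketch of reading the new flag out of litellm's model cost map; the field name (supports_prompt_caching) and the example model id are assumptions based on the commit title.

import litellm

model = "claude-3-5-sonnet-20240620"  # example model id
model_info = litellm.model_cost.get(model, {})
print(model_info.get("supports_prompt_caching", False))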

* fix(router.py): allow setting default priority for requests
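
A sketch of the new option, matching the default_priority parameter added in the router diff below: requests go through the scheduler at this priority unless the caller passes priority explicitly. The model entry is a placeholder.

from litellm import Router

router = Router(
    model_list=[
        {
            "model_name": "gpt-4o",
            "litellm_params": {"model": "openai/gpt-4o"},
        }
    ],
    default_priority=0,  # used when a call does not set `priority` itself
)

# A call can still pass its own priority; see the updated _acompletion hunk below:
# response = await router.acompletion(
#     model="gpt-4o",
#     messages=[{"role": "user", "content": "hi"}],
#     priority=1,
# )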

* fix(router.py): add 'retry-after' header for concurrent request limit errors

Fixes https://github.com/BerriAI/litellm/issues/5783
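
On the caller's side, the new header can be honoured before retrying. The sketch below assumes a locally running proxy, a placeholder key, and that the concurrent-request limit surfaces as an HTTP 429 carrying a retry-after header.

import time
import httpx

resp = httpx.post(
    "http://localhost:4000/v1/chat/completions",     # placeholder proxy URL
    headers={"Authorization": "Bearer sk-1234"},      # placeholder virtual key
    json={"model": "gpt-4o", "messages": [{"role": "user", "content": "hi"}]},
)
if resp.status_code == 429:
    retry_after = float(resp.headers.get("retry-after", "1"))
    time.sleep(retry_after)  # back off for the server-suggested duration, then retry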

* fix(router.py): correctly raise and use retry-after header from azure+openai

Fixes https://github.com/BerriAI/litellm/issues/5783

* fix(user_api_key_auth.py): fix valid token being none

* fix(auth_checks.py): fix model dump for cache management object

* fix(user_api_key_auth.py): pass prisma_client to obj

* test(test_otel.py): update test for new key check

* test: fix test

Author: Krish Dholakia
Date: 2024-09-21 18:51:53 -07:00 (committed by GitHub)
Commit: f3fa2160a0 (parent: f0543a6f9d)
25 changed files with 1006 additions and 182 deletions

@@ -154,6 +154,7 @@ class Router:
client_ttl: int = 3600, # ttl for cached clients - will re-initialize after this time in seconds
## SCHEDULER ##
polling_interval: Optional[float] = None,
default_priority: Optional[int] = None,
## RELIABILITY ##
num_retries: Optional[int] = None,
timeout: Optional[float] = None,
@@ -220,6 +221,7 @@ class Router:
caching_groups (Optional[List[tuple]]): List of model groups for caching across model groups. Defaults to None.
client_ttl (int): Time-to-live for cached clients in seconds. Defaults to 3600.
polling_interval: (Optional[float]): frequency of polling queue. Only for '.scheduler_acompletion()'. Default is 3ms.
default_priority: (Optional[int]): the default priority for a request. Only for '.scheduler_acompletion()'. Default is None.
num_retries (Optional[int]): Number of retries for failed requests. Defaults to 2.
timeout (Optional[float]): Timeout for requests. Defaults to None.
default_litellm_params (dict): Default parameters for Router.chat.completion.create. Defaults to {}.
@@ -336,6 +338,7 @@ class Router:
self.scheduler = Scheduler(
polling_interval=polling_interval, redis_cache=redis_cache
)
self.default_priority = default_priority
self.default_deployment = None # use this to track the users default deployment, when they want to use model = *
self.default_max_parallel_requests = default_max_parallel_requests
self.provider_default_deployments: Dict[str, List] = {}
@@ -712,12 +715,11 @@ class Router:
kwargs["original_function"] = self._acompletion
kwargs["num_retries"] = kwargs.get("num_retries", self.num_retries)
timeout = kwargs.get("request_timeout", self.timeout)
kwargs.setdefault("metadata", {}).update({"model_group": model})
if kwargs.get("priority", None) is not None and isinstance(
kwargs.get("priority"), int
):
request_priority = kwargs.get("priority") or self.default_priority
if request_priority is not None and isinstance(request_priority, int):
response = await self.schedule_acompletion(**kwargs)
else:
response = await self.async_function_with_fallbacks(**kwargs)
@@ -3085,9 +3087,9 @@ class Router:
except Exception as e:
current_attempt = None
original_exception = e
"""
Retry Logic
"""
_healthy_deployments, _all_deployments = (
await self._async_get_healthy_deployments(
@@ -3105,16 +3107,6 @@ class Router:
content_policy_fallbacks=content_policy_fallbacks,
)
# decides how long to sleep before retry
_timeout = self._time_to_sleep_before_retry(
e=original_exception,
remaining_retries=num_retries,
num_retries=num_retries,
healthy_deployments=_healthy_deployments,
)
# sleeps for the length of the timeout
await asyncio.sleep(_timeout)
if (
self.retry_policy is not None
or self.model_group_retry_policy is not None
@@ -3128,11 +3120,19 @@ class Router:
## LOGGING
if num_retries > 0:
kwargs = self.log_retry(kwargs=kwargs, e=original_exception)
else:
raise
# decides how long to sleep before retry
_timeout = self._time_to_sleep_before_retry(
e=original_exception,
remaining_retries=num_retries,
num_retries=num_retries,
healthy_deployments=_healthy_deployments,
)
# sleeps for the length of the timeout
await asyncio.sleep(_timeout)
for current_attempt in range(num_retries):
verbose_router_logger.debug(
f"retrying request. Current attempt - {current_attempt}; num retries: {num_retries}"
)
try:
# if the function call is successful, no exception will be raised and we'll break out of the loop
response = await original_function(*args, **kwargs)
@@ -3370,14 +3370,14 @@ class Router:
if (
healthy_deployments is not None
and isinstance(healthy_deployments, list)
and len(healthy_deployments) > 0
and len(healthy_deployments) > 1
):
return 0
response_headers: Optional[httpx.Headers] = None
if hasattr(e, "response") and hasattr(e.response, "headers"): # type: ignore
response_headers = e.response.headers # type: ignore
elif hasattr(e, "litellm_response_headers"):
if hasattr(e, "litellm_response_headers"):
response_headers = e.litellm_response_headers # type: ignore
if response_headers is not None:
@@ -3561,7 +3561,7 @@ class Router:
except Exception as e:
verbose_router_logger.exception(
"litellm.proxy.hooks.prompt_injection_detection.py::async_pre_call_hook(): Exception occured - {}".format(
"litellm.router.Router::deployment_callback_on_success(): Exception occured - {}".format(
str(e)
)
)
@@ -5324,7 +5324,6 @@ class Router:
return deployment
except Exception as e:
traceback_exception = traceback.format_exc()
# if router rejects call -> log to langfuse/otel/etc.
if request_kwargs is not None: