redis otel tracing + async support for latency routing (#6452)

* docs(exception_mapping.md): add missing exception types

Fixes https://github.com/Aider-AI/aider/issues/2120#issuecomment-2438971183
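For context, these mapped types are what callers (like aider) catch instead of provider-specific errors. A minimal sketch using a few of the documented top-level exception classes (the model name is a placeholder):

```python
import litellm

try:
    litellm.completion(
        model="gpt-4o",
        messages=[{"role": "user", "content": "hello"}],
    )
except litellm.ContextWindowExceededError:
    # mapped from provider-specific "prompt too long" errors
    pass
except litellm.RateLimitError:
    # mapped from provider-specific 429 responses
    pass
except litellm.APIConnectionError:
    # network-level failures, regardless of provider
    pass
```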

* fix(main.py): register custom model pricing with specific key

Ensure custom model pricing is registered to the specific model+provider key combination
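A minimal sketch of the intended behavior, using the documented litellm.register_model() API (the deployment name and prices below are made up):

```python
import litellm

# keyed on the full "provider/model" string, so the custom price can't
# collide with a same-named model served by a different provider
litellm.register_model(
    {
        "azure/my-gpt-4o-deployment": {
            "input_cost_per_token": 2e-06,
            "output_cost_per_token": 6e-06,
            "litellm_provider": "azure",
            "mode": "chat",
        }
    }
)
```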

* test: make testing more robust for custom pricing

* fix(redis_cache.py): instrument otel logging for sync redis calls

ensures complete otel coverage for all redis cache calls, sync and async
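The async path was already instrumented; the sync path now follows the same pattern. A generic sketch of that pattern (the wrapper and the sync hook signature are illustrative, not the exact internal API):

```python
import time
from typing import Any, Optional


def timed_redis_get(
    redis_client, key: str, service_logger, parent_otel_span: Optional[Any] = None
):
    start_time = time.time()
    result = redis_client.get(key)  # the blocking redis call being traced
    end_time = time.time()
    # report the duration against the request's parent span, so redis
    # latency shows up inside the request's trace
    service_logger.service_success_hook(
        service="redis",
        duration=end_time - start_time,
        call_type="get",
        start_time=start_time,
        end_time=end_time,
        parent_otel_span=parent_otel_span,
    )
    return result
```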

* refactor: pass parent_otel_span for redis caching calls in router

gives more observability into which calls are causing latency issues
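The pattern, condensed from the router.py diff below (`_get_parent_otel_span_from_kwargs` and the cache's `parent_otel_span` parameter are part of this change; the helper function itself is illustrative):

```python
from litellm.litellm_core_utils.core_helpers import _get_parent_otel_span_from_kwargs


def read_rpm_counter(router, rpm_key: str, **kwargs):
    # recover the request's otel span once from kwargs...
    parent_otel_span = _get_parent_otel_span_from_kwargs(kwargs)
    # ...then forward it on each cache lookup, so any redis round-trip is
    # recorded as a child span of the originating request
    return router.cache.get_cache(
        key=rpm_key,
        local_only=True,
        parent_otel_span=parent_otel_span,
    )
```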

* test: update tests with new params

* refactor: ensure e2e otel tracing for router

* refactor(router.py): add more otel tracing across router

ensures latency issues anywhere in the router request path are caught
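Each traced step follows the same shape; this is condensed from the acompletion changes in the diff (ServiceTypes.ROUTER and async_service_success_hook appear there verbatim; the standalone wrapper is illustrative):

```python
import asyncio
import time

from litellm.types.services import ServiceTypes


async def traced_step(router, call_type: str, run, parent_otel_span=None):
    start_time = time.time()
    result = await run()  # e.g. async_get_available_deployment(...)
    end_time = time.time()
    # fire-and-forget task: emitting the span must not add latency
    # to the request path itself
    asyncio.create_task(
        router.service_logger_obj.async_service_success_hook(
            service=ServiceTypes.ROUTER,
            duration=end_time - start_time,
            call_type=call_type,
            start_time=start_time,
            end_time=end_time,
            parent_otel_span=parent_otel_span,
        )
    )
    return result
```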

* fix: fix linting error

* fix(router.py): fix linting error

* fix: fix test

* test: fix tests

* fix(dual_cache.py): pass ttl to redis cache
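Before this fix the TTL stopped at the in-memory layer. A sketch of the expected behavior, assuming DualCache's async_set_cache(key, value, **kwargs) shape:

```python
from litellm.caching.caching import DualCache

cache = DualCache()  # in-memory layer, plus redis when configured


async def write_rpm_counter():
    # ttl is now forwarded to the redis layer instead of being dropped
    # after the in-memory write, so the key actually expires server-side
    await cache.async_set_cache("gpt-4o:rpm:09-15", 42, ttl=60)
```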

* fix: fix param
Krish Dholakia, 2024-10-28 21:52:12 -07:00 (committed by GitHub)
parent d9e7818e6b
commit 4f8a3fd4cf
25 changed files with 559 additions and 147 deletions

@@ -25,6 +25,7 @@ import uuid
from collections import defaultdict
from datetime import datetime
from typing import (
TYPE_CHECKING,
Any,
Callable,
Dict,
@@ -50,6 +51,7 @@ from litellm._logging import verbose_router_logger
from litellm.assistants.main import AssistantDeleted
from litellm.caching.caching import DualCache, InMemoryCache, RedisCache
from litellm.integrations.custom_logger import CustomLogger
from litellm.litellm_core_utils.core_helpers import _get_parent_otel_span_from_kwargs
from litellm.litellm_core_utils.litellm_logging import Logging as LiteLLMLogging
from litellm.llms.AzureOpenAI.azure import get_azure_ad_token_from_oidc
from litellm.router_strategy.least_busy import LeastBusyLoggingHandler
@@ -124,6 +126,7 @@ from litellm.types.router import (
updateDeployment,
updateLiteLLMParams,
)
from litellm.types.services import ServiceLoggerPayload, ServiceTypes
from litellm.types.utils import OPENAI_RESPONSE_HEADERS
from litellm.types.utils import ModelInfo as ModelMapInfo
from litellm.utils import (
@@ -140,6 +143,13 @@ from litellm.utils import (
from .router_utils.pattern_match_deployments import PatternMatchRouter
if TYPE_CHECKING:
from opentelemetry.trace import Span as _Span
Span = _Span
else:
Span = Any
class RoutingArgs(enum.Enum):
ttl = 60 # 1min (RPM/TPM expire key)
@@ -293,6 +303,8 @@ class Router:
```
"""
from litellm._service_logger import ServiceLogging
if semaphore:
self.semaphore = semaphore
self.set_verbose = set_verbose
@@ -494,7 +506,7 @@
f"Routing context window fallbacks: {self.context_window_fallbacks}\n\n"
f"Router Redis Caching={self.cache.redis_cache}\n"
)
self.service_logger_obj = ServiceLogging()
self.routing_strategy_args = routing_strategy_args
self.retry_policy: Optional[RetryPolicy] = None
if retry_policy is not None:
@@ -762,10 +774,23 @@
request_priority = kwargs.get("priority") or self.default_priority
start_time = time.time()
if request_priority is not None and isinstance(request_priority, int):
response = await self.schedule_acompletion(**kwargs)
else:
response = await self.async_function_with_fallbacks(**kwargs)
end_time = time.time()
_duration = end_time - start_time
asyncio.create_task(
self.service_logger_obj.async_service_success_hook(
service=ServiceTypes.ROUTER,
duration=_duration,
call_type="acompletion",
start_time=start_time,
end_time=end_time,
parent_otel_span=_get_parent_otel_span_from_kwargs(kwargs),
)
)
return response
except Exception as e:
@@ -793,15 +818,32 @@
verbose_router_logger.debug(
f"Inside _acompletion()- model: {model}; kwargs: {kwargs}"
)
parent_otel_span = _get_parent_otel_span_from_kwargs(kwargs)
start_time = time.time()
deployment = await self.async_get_available_deployment(
model=model,
messages=messages,
specific_deployment=kwargs.pop("specific_deployment", None),
request_kwargs=kwargs,
)
end_time = time.time()
_duration = end_time - start_time
asyncio.create_task(
self.service_logger_obj.async_service_success_hook(
service=ServiceTypes.ROUTER,
duration=_duration,
call_type="async_get_available_deployment",
start_time=start_time,
end_time=end_time,
parent_otel_span=_get_parent_otel_span_from_kwargs(kwargs),
)
)
# debug how often this deployment is picked
self._track_deployment_metrics(deployment=deployment)
self._track_deployment_metrics(
deployment=deployment, parent_otel_span=parent_otel_span
)
self._update_kwargs_with_deployment(deployment=deployment, kwargs=kwargs)
data = deployment["litellm_params"].copy()
@@ -846,12 +888,16 @@
- If allowed, increment the rpm limit (allows global value to be updated, concurrency-safe)
"""
await self.async_routing_strategy_pre_call_checks(
deployment=deployment, logging_obj=logging_obj
deployment=deployment,
logging_obj=logging_obj,
parent_otel_span=parent_otel_span,
)
response = await _response
else:
await self.async_routing_strategy_pre_call_checks(
deployment=deployment, logging_obj=logging_obj
deployment=deployment,
logging_obj=logging_obj,
parent_otel_span=parent_otel_span,
)
response = await _response
@@ -872,7 +918,11 @@
f"litellm.acompletion(model={model_name})\033[32m 200 OK\033[0m"
)
# debug how often this deployment is picked
self._track_deployment_metrics(deployment=deployment, response=response)
self._track_deployment_metrics(
deployment=deployment,
response=response,
parent_otel_span=parent_otel_span,
)
return response
except Exception as e:
@@ -1212,6 +1262,7 @@
stream=False,
**kwargs,
):
parent_otel_span = _get_parent_otel_span_from_kwargs(kwargs)
### FLOW ITEM ###
_request_id = str(uuid.uuid4())
item = FlowItem(
@@ -1232,7 +1283,7 @@
while curr_time < end_time:
_healthy_deployments, _ = await self._async_get_healthy_deployments(
model=model
model=model, parent_otel_span=parent_otel_span
)
make_request = await self.scheduler.poll( ## POLL QUEUE ## - returns 'True' if there are healthy deployments OR if the request is at the top of the queue
id=item.request_id,
@@ -1353,6 +1404,7 @@
verbose_router_logger.debug(
f"Inside _image_generation()- model: {model}; kwargs: {kwargs}"
)
parent_otel_span = _get_parent_otel_span_from_kwargs(kwargs)
deployment = await self.async_get_available_deployment(
model=model,
messages=[{"role": "user", "content": "prompt"}],
@@ -1395,11 +1447,13 @@
- If allowed, increment the rpm limit (allows global value to be updated, concurrency-safe)
"""
await self.async_routing_strategy_pre_call_checks(
deployment=deployment
deployment=deployment, parent_otel_span=parent_otel_span
)
response = await response
else:
await self.async_routing_strategy_pre_call_checks(deployment=deployment)
await self.async_routing_strategy_pre_call_checks(
deployment=deployment, parent_otel_span=parent_otel_span
)
response = await response
self.success_calls[model_name] += 1
@@ -1465,6 +1519,7 @@
verbose_router_logger.debug(
f"Inside _atranscription()- model: {model}; kwargs: {kwargs}"
)
parent_otel_span = _get_parent_otel_span_from_kwargs(kwargs)
deployment = await self.async_get_available_deployment(
model=model,
messages=[{"role": "user", "content": "prompt"}],
@@ -1505,11 +1560,13 @@
- If allowed, increment the rpm limit (allows global value to be updated, concurrency-safe)
"""
await self.async_routing_strategy_pre_call_checks(
deployment=deployment
deployment=deployment, parent_otel_span=parent_otel_span
)
response = await response
else:
await self.async_routing_strategy_pre_call_checks(deployment=deployment)
await self.async_routing_strategy_pre_call_checks(
deployment=deployment, parent_otel_span=parent_otel_span
)
response = await response
self.success_calls[model_name] += 1
@@ -1861,6 +1918,7 @@
verbose_router_logger.debug(
f"Inside _atext_completion()- model: {model}; kwargs: {kwargs}"
)
parent_otel_span = _get_parent_otel_span_from_kwargs(kwargs)
deployment = await self.async_get_available_deployment(
model=model,
messages=[{"role": "user", "content": prompt}],
@@ -1903,11 +1961,13 @@
- If allowed, increment the rpm limit (allows global value to be updated, concurrency-safe)
"""
await self.async_routing_strategy_pre_call_checks(
deployment=deployment
deployment=deployment, parent_otel_span=parent_otel_span
)
response = await response
else:
await self.async_routing_strategy_pre_call_checks(deployment=deployment)
await self.async_routing_strategy_pre_call_checks(
deployment=deployment, parent_otel_span=parent_otel_span
)
response = await response
self.success_calls[model_name] += 1
@@ -1958,6 +2018,7 @@
verbose_router_logger.debug(
f"Inside _aadapter_completion()- model: {model}; kwargs: {kwargs}"
)
parent_otel_span = _get_parent_otel_span_from_kwargs(kwargs)
deployment = await self.async_get_available_deployment(
model=model,
messages=[{"role": "user", "content": "default text"}],
@@ -2000,11 +2061,13 @@
- If allowed, increment the rpm limit (allows global value to be updated, concurrency-safe)
"""
await self.async_routing_strategy_pre_call_checks(
deployment=deployment
deployment=deployment, parent_otel_span=parent_otel_span
)
response = await response # type: ignore
else:
await self.async_routing_strategy_pre_call_checks(deployment=deployment)
await self.async_routing_strategy_pre_call_checks(
deployment=deployment, parent_otel_span=parent_otel_span
)
response = await response # type: ignore
self.success_calls[model_name] += 1
@@ -2128,6 +2191,7 @@
verbose_router_logger.debug(
f"Inside _aembedding()- model: {model}; kwargs: {kwargs}"
)
parent_otel_span = _get_parent_otel_span_from_kwargs(kwargs)
deployment = await self.async_get_available_deployment(
model=model,
input=input,
@@ -2168,11 +2232,13 @@
- If allowed, increment the rpm limit (allows global value to be updated, concurrency-safe)
"""
await self.async_routing_strategy_pre_call_checks(
deployment=deployment
deployment=deployment, parent_otel_span=parent_otel_span
)
response = await response
else:
await self.async_routing_strategy_pre_call_checks(deployment=deployment)
await self.async_routing_strategy_pre_call_checks(
deployment=deployment, parent_otel_span=parent_otel_span
)
response = await response
self.success_calls[model_name] += 1
@@ -2223,6 +2289,7 @@
verbose_router_logger.debug(
f"Inside _atext_completion()- model: {model}; kwargs: {kwargs}"
)
parent_otel_span = _get_parent_otel_span_from_kwargs(kwargs)
deployment = await self.async_get_available_deployment(
model=model,
messages=[{"role": "user", "content": "files-api-fake-text"}],
@@ -2273,11 +2340,13 @@
- If allowed, increment the rpm limit (allows global value to be updated, concurrency-safe)
"""
await self.async_routing_strategy_pre_call_checks(
deployment=deployment
deployment=deployment, parent_otel_span=parent_otel_span
)
response = await response # type: ignore
else:
await self.async_routing_strategy_pre_call_checks(deployment=deployment)
await self.async_routing_strategy_pre_call_checks(
deployment=deployment, parent_otel_span=parent_otel_span
)
response = await response # type: ignore
self.success_calls[model_name] += 1
@@ -2327,6 +2396,7 @@
verbose_router_logger.debug(
f"Inside _acreate_batch()- model: {model}; kwargs: {kwargs}"
)
parent_otel_span = _get_parent_otel_span_from_kwargs(kwargs)
deployment = await self.async_get_available_deployment(
model=model,
messages=[{"role": "user", "content": "files-api-fake-text"}],
@@ -2389,11 +2459,13 @@
- If allowed, increment the rpm limit (allows global value to be updated, concurrency-safe)
"""
await self.async_routing_strategy_pre_call_checks(
deployment=deployment
deployment=deployment, parent_otel_span=parent_otel_span
)
response = await response # type: ignore
else:
await self.async_routing_strategy_pre_call_checks(deployment=deployment)
await self.async_routing_strategy_pre_call_checks(
deployment=deployment, parent_otel_span=parent_otel_span
)
response = await response # type: ignore
self.success_calls[model_name] += 1
@@ -2702,12 +2774,14 @@
)
return response
except Exception as new_exception:
parent_otel_span = _get_parent_otel_span_from_kwargs(kwargs)
verbose_router_logger.error(
"litellm.router.py::async_function_with_fallbacks() - Error occurred while trying to do fallbacks - {}\n{}\n\nDebug Information:\nCooldown Deployments={}".format(
str(new_exception),
traceback.format_exc(),
await _async_get_cooldown_deployments_with_debug_info(
litellm_router_instance=self
litellm_router_instance=self,
parent_otel_span=parent_otel_span,
),
)
)
@@ -2779,12 +2853,13 @@
Content_Policy_Fallbacks={content_policy_fallbacks}",
)
async def async_function_with_retries(self, *args, **kwargs):
async def async_function_with_retries(self, *args, **kwargs): # noqa: PLR0915
verbose_router_logger.debug(
f"Inside async function with retries: args - {args}; kwargs - {kwargs}"
)
original_function = kwargs.pop("original_function")
fallbacks = kwargs.pop("fallbacks", self.fallbacks)
parent_otel_span = _get_parent_otel_span_from_kwargs(kwargs)
context_window_fallbacks = kwargs.pop(
"context_window_fallbacks", self.context_window_fallbacks
)
@@ -2822,6 +2897,7 @@
_healthy_deployments, _all_deployments = (
await self._async_get_healthy_deployments(
model=kwargs.get("model") or "",
parent_otel_span=parent_otel_span,
)
)
@@ -2879,6 +2955,7 @@
_healthy_deployments, _ = (
await self._async_get_healthy_deployments(
model=_model,
parent_otel_span=parent_otel_span,
)
)
else:
@@ -3217,8 +3294,10 @@
if _model is None:
raise e # re-raise error, if model can't be determined for loadbalancing
### CHECK IF RATE LIMIT / CONTEXT WINDOW ERROR
parent_otel_span = _get_parent_otel_span_from_kwargs(kwargs)
_healthy_deployments, _all_deployments = self._get_healthy_deployments(
model=_model,
parent_otel_span=parent_otel_span,
)
# raises an exception if this error should not be retried
@@ -3260,8 +3339,10 @@
if _model is None:
raise e # re-raise error, if model can't be determined for loadbalancing
parent_otel_span = _get_parent_otel_span_from_kwargs(kwargs)
_healthy_deployments, _ = self._get_healthy_deployments(
model=_model,
parent_otel_span=parent_otel_span,
)
remaining_retries = num_retries - current_attempt
_timeout = self._time_to_sleep_before_retry(
@@ -3323,9 +3404,13 @@
# ------------
# update cache
parent_otel_span = _get_parent_otel_span_from_kwargs(kwargs)
## TPM
await self.cache.async_increment_cache(
key=tpm_key, value=total_tokens, ttl=RoutingArgs.ttl.value
key=tpm_key,
value=total_tokens,
parent_otel_span=parent_otel_span,
ttl=RoutingArgs.ttl.value,
)
increment_deployment_successes_for_current_minute(
@@ -3474,7 +3559,9 @@
except Exception as e:
raise e
def _update_usage(self, deployment_id: str) -> int:
def _update_usage(
self, deployment_id: str, parent_otel_span: Optional[Span]
) -> int:
"""
Update deployment rpm for that minute
@@ -3483,7 +3570,9 @@
"""
rpm_key = deployment_id
request_count = self.cache.get_cache(key=rpm_key, local_only=True)
request_count = self.cache.get_cache(
key=rpm_key, parent_otel_span=parent_otel_span, local_only=True
)
if request_count is None:
request_count = 1
self.cache.set_cache(
@@ -3591,7 +3680,7 @@
)
return False
def _get_healthy_deployments(self, model: str):
def _get_healthy_deployments(self, model: str, parent_otel_span: Optional[Span]):
_all_deployments: list = []
try:
_, _all_deployments = self._common_checks_available_deployment( # type: ignore
@@ -3602,7 +3691,9 @@
except Exception:
pass
unhealthy_deployments = _get_cooldown_deployments(litellm_router_instance=self)
unhealthy_deployments = _get_cooldown_deployments(
litellm_router_instance=self, parent_otel_span=parent_otel_span
)
healthy_deployments: list = []
for deployment in _all_deployments:
if deployment["model_info"]["id"] in unhealthy_deployments:
@@ -3613,7 +3704,7 @@
return healthy_deployments, _all_deployments
async def _async_get_healthy_deployments(
self, model: str
self, model: str, parent_otel_span: Optional[Span]
) -> Tuple[List[Dict], List[Dict]]:
"""
Returns Tuple of:
@@ -3632,7 +3723,7 @@
pass
unhealthy_deployments = await _async_get_cooldown_deployments(
litellm_router_instance=self
litellm_router_instance=self, parent_otel_span=parent_otel_span
)
healthy_deployments: list = []
for deployment in _all_deployments:
@@ -3659,7 +3750,10 @@
_callback.pre_call_check(deployment)
async def async_routing_strategy_pre_call_checks(
self, deployment: dict, logging_obj: Optional[LiteLLMLogging] = None
self,
deployment: dict,
parent_otel_span: Optional[Span],
logging_obj: Optional[LiteLLMLogging] = None,
):
"""
For usage-based-routing-v2, enables running rpm checks before the call is made, inside the semaphore.
@@ -3675,7 +3769,7 @@
for _callback in litellm.callbacks:
if isinstance(_callback, CustomLogger):
try:
await _callback.async_pre_call_check(deployment)
await _callback.async_pre_call_check(deployment, parent_otel_span)
except litellm.RateLimitError as e:
## LOG FAILURE EVENT
if logging_obj is not None:
@@ -4646,14 +4740,19 @@
The appropriate client based on the given client_type and kwargs.
"""
model_id = deployment["model_info"]["id"]
parent_otel_span: Optional[Span] = _get_parent_otel_span_from_kwargs(kwargs)
if client_type == "max_parallel_requests":
cache_key = "{}_max_parallel_requests_client".format(model_id)
client = self.cache.get_cache(key=cache_key, local_only=True)
client = self.cache.get_cache(
key=cache_key, local_only=True, parent_otel_span=parent_otel_span
)
return client
elif client_type == "async":
if kwargs.get("stream") is True:
cache_key = f"{model_id}_stream_async_client"
client = self.cache.get_cache(key=cache_key, local_only=True)
client = self.cache.get_cache(
key=cache_key, local_only=True, parent_otel_span=parent_otel_span
)
if client is None:
"""
Re-initialize the client
@@ -4661,11 +4760,17 @@
InitalizeOpenAISDKClient.set_client(
litellm_router_instance=self, model=deployment
)
client = self.cache.get_cache(key=cache_key, local_only=True)
client = self.cache.get_cache(
key=cache_key,
local_only=True,
parent_otel_span=parent_otel_span,
)
return client
else:
cache_key = f"{model_id}_async_client"
client = self.cache.get_cache(key=cache_key, local_only=True)
client = self.cache.get_cache(
key=cache_key, local_only=True, parent_otel_span=parent_otel_span
)
if client is None:
"""
Re-initialize the client
@@ -4673,12 +4778,18 @@
InitalizeOpenAISDKClient.set_client(
litellm_router_instance=self, model=deployment
)
client = self.cache.get_cache(key=cache_key, local_only=True)
client = self.cache.get_cache(
key=cache_key,
local_only=True,
parent_otel_span=parent_otel_span,
)
return client
else:
if kwargs.get("stream") is True:
cache_key = f"{model_id}_stream_client"
client = self.cache.get_cache(key=cache_key)
client = self.cache.get_cache(
key=cache_key, parent_otel_span=parent_otel_span
)
if client is None:
"""
Re-initialize the client
@@ -4686,11 +4797,15 @@
InitalizeOpenAISDKClient.set_client(
litellm_router_instance=self, model=deployment
)
client = self.cache.get_cache(key=cache_key)
client = self.cache.get_cache(
key=cache_key, parent_otel_span=parent_otel_span
)
return client
else:
cache_key = f"{model_id}_client"
client = self.cache.get_cache(key=cache_key)
client = self.cache.get_cache(
key=cache_key, parent_otel_span=parent_otel_span
)
if client is None:
"""
Re-initialize the client
@@ -4698,7 +4813,9 @@
InitalizeOpenAISDKClient.set_client(
litellm_router_instance=self, model=deployment
)
client = self.cache.get_cache(key=cache_key)
client = self.cache.get_cache(
key=cache_key, parent_otel_span=parent_otel_span
)
return client
def _pre_call_checks( # noqa: PLR0915
@@ -4738,13 +4855,17 @@
_context_window_error = False
_potential_error_str = ""
_rate_limit_error = False
parent_otel_span = _get_parent_otel_span_from_kwargs(request_kwargs)
## get model group RPM ##
dt = get_utc_datetime()
current_minute = dt.strftime("%H-%M")
rpm_key = f"{model}:rpm:{current_minute}"
model_group_cache = (
self.cache.get_cache(key=rpm_key, local_only=True) or {}
self.cache.get_cache(
key=rpm_key, local_only=True, parent_otel_span=parent_otel_span
)
or {}
) # check the in-memory cache used by lowest_latency and usage-based routing. Only check the local cache.
for idx, deployment in enumerate(_returned_deployments):
# see if we have the info for this model
@@ -4783,7 +4904,10 @@
## RPM CHECK ##
### get local router cache ###
current_request_cache_local = (
self.cache.get_cache(key=model_id, local_only=True) or 0
self.cache.get_cache(
key=model_id, local_only=True, parent_otel_span=parent_otel_span
)
or 0
)
### get usage based cache ###
if (
@@ -5002,6 +5126,7 @@
self.routing_strategy != "usage-based-routing-v2"
and self.routing_strategy != "simple-shuffle"
and self.routing_strategy != "cost-based-routing"
and self.routing_strategy != "latency-based-routing"
): # prevent regressions for other routing strategies, that don't have async get available deployments implemented.
return self.get_available_deployment(
model=model,
@@ -5011,6 +5136,7 @@
request_kwargs=request_kwargs,
)
try:
parent_otel_span = _get_parent_otel_span_from_kwargs(request_kwargs)
model, healthy_deployments = self._common_checks_available_deployment(
model=model,
messages=messages,
@@ -5021,7 +5147,7 @@
return healthy_deployments
cooldown_deployments = await _async_get_cooldown_deployments(
litellm_router_instance=self
litellm_router_instance=self, parent_otel_span=parent_otel_span
)
verbose_router_logger.debug(
f"async cooldown deployments: {cooldown_deployments}"
@@ -5059,16 +5185,18 @@
_allowed_model_region = "n/a"
model_ids = self.get_model_ids(model_name=model)
_cooldown_time = self.cooldown_cache.get_min_cooldown(
model_ids=model_ids
model_ids=model_ids, parent_otel_span=parent_otel_span
)
_cooldown_list = _get_cooldown_deployments(
litellm_router_instance=self, parent_otel_span=parent_otel_span
)
_cooldown_list = _get_cooldown_deployments(litellm_router_instance=self)
raise RouterRateLimitError(
model=model,
cooldown_time=_cooldown_time,
enable_pre_call_checks=self.enable_pre_call_checks,
cooldown_list=_cooldown_list,
)
start_time = time.time()
if (
self.routing_strategy == "usage-based-routing-v2"
and self.lowesttpm_logger_v2 is not None
@@ -5093,6 +5221,19 @@
input=input,
)
)
elif (
self.routing_strategy == "latency-based-routing"
and self.lowestlatency_logger is not None
):
deployment = (
await self.lowestlatency_logger.async_get_available_deployments(
model_group=model,
healthy_deployments=healthy_deployments, # type: ignore
messages=messages,
input=input,
request_kwargs=request_kwargs,
)
)
elif self.routing_strategy == "simple-shuffle":
return simple_shuffle(
llm_router_instance=self,
@@ -5107,9 +5248,11 @@
)
model_ids = self.get_model_ids(model_name=model)
_cooldown_time = self.cooldown_cache.get_min_cooldown(
model_ids=model_ids
model_ids=model_ids, parent_otel_span=parent_otel_span
)
_cooldown_list = _get_cooldown_deployments(
litellm_router_instance=self, parent_otel_span=parent_otel_span
)
_cooldown_list = _get_cooldown_deployments(litellm_router_instance=self)
raise RouterRateLimitError(
model=model,
cooldown_time=_cooldown_time,
@@ -5120,6 +5263,19 @@
f"get_available_deployment for model: {model}, Selected deployment: {self.print_deployment(deployment)} for model: {model}"
)
end_time = time.time()
_duration = end_time - start_time
asyncio.create_task(
self.service_logger_obj.async_service_success_hook(
service=ServiceTypes.ROUTER,
duration=_duration,
call_type="<routing_strategy>.async_get_available_deployments",
parent_otel_span=parent_otel_span,
start_time=start_time,
end_time=end_time,
)
)
return deployment
except Exception as e:
traceback_exception = traceback.format_exc()
@@ -5163,7 +5319,12 @@
if isinstance(healthy_deployments, dict):
return healthy_deployments
cooldown_deployments = _get_cooldown_deployments(litellm_router_instance=self)
parent_otel_span: Optional[Span] = _get_parent_otel_span_from_kwargs(
request_kwargs
)
cooldown_deployments = _get_cooldown_deployments(
litellm_router_instance=self, parent_otel_span=parent_otel_span
)
healthy_deployments = self._filter_cooldown_deployments(
healthy_deployments=healthy_deployments,
cooldown_deployments=cooldown_deployments,
@@ -5180,8 +5341,12 @@
if len(healthy_deployments) == 0:
model_ids = self.get_model_ids(model_name=model)
_cooldown_time = self.cooldown_cache.get_min_cooldown(model_ids=model_ids)
_cooldown_list = _get_cooldown_deployments(litellm_router_instance=self)
_cooldown_time = self.cooldown_cache.get_min_cooldown(
model_ids=model_ids, parent_otel_span=parent_otel_span
)
_cooldown_list = _get_cooldown_deployments(
litellm_router_instance=self, parent_otel_span=parent_otel_span
)
raise RouterRateLimitError(
model=model,
cooldown_time=_cooldown_time,
@@ -5238,8 +5403,12 @@
f"get_available_deployment for model: {model}, No deployment available"
)
model_ids = self.get_model_ids(model_name=model)
_cooldown_time = self.cooldown_cache.get_min_cooldown(model_ids=model_ids)
_cooldown_list = _get_cooldown_deployments(litellm_router_instance=self)
_cooldown_time = self.cooldown_cache.get_min_cooldown(
model_ids=model_ids, parent_otel_span=parent_otel_span
)
_cooldown_list = _get_cooldown_deployments(
litellm_router_instance=self, parent_otel_span=parent_otel_span
)
raise RouterRateLimitError(
model=model,
cooldown_time=_cooldown_time,
@@ -5278,7 +5447,9 @@
healthy_deployments.remove(deployment)
return healthy_deployments
def _track_deployment_metrics(self, deployment, response=None):
def _track_deployment_metrics(
self, deployment, parent_otel_span: Optional[Span], response=None
):
"""
Tracks successful requests rpm usage.
"""
@@ -5288,7 +5459,9 @@
# update self.deployment_stats
if model_id is not None:
self._update_usage(model_id) # update in-memory cache for tracking
self._update_usage(
model_id, parent_otel_span
) # update in-memory cache for tracking
except Exception as e:
verbose_router_logger.error(f"Error in _track_deployment_metrics: {str(e)}")