Mirror of https://github.com/BerriAI/litellm.git (synced 2025-04-25 10:44:24 +00:00)
redis otel tracing + async support for latency routing (#6452)
* docs(exception_mapping.md): add missing exception types. Fixes https://github.com/Aider-AI/aider/issues/2120#issuecomment-2438971183
* fix(main.py): register custom model pricing with specific key. Ensure custom model pricing is registered to the specific model+provider key combination.
* test: make testing more robust for custom pricing
* fix(redis_cache.py): instrument otel logging for sync redis calls. Ensures complete coverage for all redis cache calls.
* refactor: pass parent_otel_span for redis caching calls in router. Allows for more observability into which calls are causing latency issues.
* test: update tests with new params
* refactor: ensure e2e otel tracing for router
* refactor(router.py): add more otel tracing across the router, to catch all latency issues for router requests
* fix: fix linting error
* fix(router.py): fix linting error
* fix: fix test
* test: fix tests
* fix(dual_cache.py): pass ttl to redis cache
* fix: fix param
Parent: d9e7818e6b · Commit: 4f8a3fd4cf
25 changed files with 559 additions and 147 deletions
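Most of the router changes in this commit repeat one pattern: resolve the parent OpenTelemetry span from the request kwargs once, then hand it to every cache, cooldown, and pre-call helper so their latency is recorded under the request's trace. A minimal sketch of that threading, where get_parent_span_from_kwargs and TracedCache are hypothetical stand-ins for litellm's _get_parent_otel_span_from_kwargs and DualCache:

from typing import Any, Dict, Optional

try:
    from opentelemetry import trace
    from opentelemetry.trace import Span
except ImportError:  # tracing stays optional at runtime
    trace = None
    Span = Any


def get_parent_span_from_kwargs(kwargs: Dict[str, Any]) -> Optional[Span]:
    # hypothetical stand-in; the real helper is
    # litellm.litellm_core_utils.core_helpers._get_parent_otel_span_from_kwargs
    metadata = kwargs.get("metadata") or {}
    return metadata.get("litellm_parent_otel_span")


class TracedCache:
    """Toy cache whose reads show up as child spans of the request."""

    def __init__(self) -> None:
        self._store: Dict[str, Any] = {}
        self._tracer = trace.get_tracer(__name__) if trace else None

    def get_cache(self, key: str, parent_otel_span: Optional[Span] = None):
        if self._tracer is not None and parent_otel_span is not None:
            ctx = trace.set_span_in_context(parent_otel_span)
            with self._tracer.start_as_current_span("cache.get", context=ctx):
                return self._store.get(key)
        return self._store.get(key)


def handle_request(**kwargs):
    parent_otel_span = get_parent_span_from_kwargs(kwargs)
    cache = TracedCache()
    # the same span object is passed to every helper the request touches
    return cache.get_cache(key="model_group:rpm", parent_otel_span=parent_otel_span)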
@@ -25,6 +25,7 @@ import uuid
from collections import defaultdict
from datetime import datetime
from typing import (
TYPE_CHECKING,
Any,
Callable,
Dict,
@@ -50,6 +51,7 @@ from litellm._logging import verbose_router_logger
from litellm.assistants.main import AssistantDeleted
from litellm.caching.caching import DualCache, InMemoryCache, RedisCache
from litellm.integrations.custom_logger import CustomLogger
from litellm.litellm_core_utils.core_helpers import _get_parent_otel_span_from_kwargs
from litellm.litellm_core_utils.litellm_logging import Logging as LiteLLMLogging
from litellm.llms.AzureOpenAI.azure import get_azure_ad_token_from_oidc
from litellm.router_strategy.least_busy import LeastBusyLoggingHandler
@@ -124,6 +126,7 @@ from litellm.types.router import (
updateDeployment,
updateLiteLLMParams,
)
from litellm.types.services import ServiceLoggerPayload, ServiceTypes
from litellm.types.utils import OPENAI_RESPONSE_HEADERS
from litellm.types.utils import ModelInfo as ModelMapInfo
from litellm.utils import (
@@ -140,6 +143,13 @@ from litellm.utils import (

from .router_utils.pattern_match_deployments import PatternMatchRouter

if TYPE_CHECKING:
from opentelemetry.trace import Span as _Span

Span = _Span
else:
Span = Any


class RoutingArgs(enum.Enum):
ttl = 60 # 1min (RPM/TPM expire key)
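The if TYPE_CHECKING: block added above is the usual trick for typing against an optional dependency: Span is imported only for the type checker, and at runtime the name falls back to Any, so opentelemetry never becomes a hard import. A self-contained sketch of the same pattern:

from typing import TYPE_CHECKING, Any, Optional

if TYPE_CHECKING:
    # evaluated only by type checkers (mypy, pyright), never at runtime
    from opentelemetry.trace import Span as _Span

    Span = _Span
else:
    Span = Any  # runtime fallback, no opentelemetry import required


def record_latency(duration: float, parent_otel_span: Optional[Span] = None) -> None:
    # the annotation stays precise for the checker and harmless at runtime
    print(f"took {duration:.3f}s", parent_otel_span)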
@@ -293,6 +303,8 @@ class Router:
```
"""

from litellm._service_logger import ServiceLogging

if semaphore:
self.semaphore = semaphore
self.set_verbose = set_verbose
@@ -494,7 +506,7 @@ class Router:
f"Routing context window fallbacks: {self.context_window_fallbacks}\n\n"
f"Router Redis Caching={self.cache.redis_cache}\n"
)

self.service_logger_obj = ServiceLogging()
self.routing_strategy_args = routing_strategy_args
self.retry_policy: Optional[RetryPolicy] = None
if retry_policy is not None:
@@ -762,10 +774,23 @@ class Router:

request_priority = kwargs.get("priority") or self.default_priority

start_time = time.time()
if request_priority is not None and isinstance(request_priority, int):
response = await self.schedule_acompletion(**kwargs)
else:
response = await self.async_function_with_fallbacks(**kwargs)
end_time = time.time()
_duration = end_time - start_time
asyncio.create_task(
self.service_logger_obj.async_service_success_hook(
service=ServiceTypes.ROUTER,
duration=_duration,
call_type="acompletion",
start_time=start_time,
end_time=end_time,
parent_otel_span=_get_parent_otel_span_from_kwargs(kwargs),
)
)

return response
except Exception as e:
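The timing added to acompletion brackets the whole fallback path with time.time() and reports the duration through the service logger inside asyncio.create_task, so the metric is emitted without blocking the response. A rough sketch of that fire-and-forget pattern, with ServiceLogger as a hypothetical stand-in for litellm's ServiceLogging:

import asyncio
import time


class ServiceLogger:
    # hypothetical stand-in for litellm._service_logger.ServiceLogging
    async def async_service_success_hook(
        self, *, service, duration, call_type, start_time, end_time, parent_otel_span=None
    ):
        print(f"{service}.{call_type} took {duration:.3f}s")


async def acompletion_with_timing(do_call, service_logger: ServiceLogger, **kwargs):
    start_time = time.time()
    response = await do_call(**kwargs)  # fallback / scheduling path goes here
    end_time = time.time()
    # emit the latency measurement off the request path
    asyncio.create_task(
        service_logger.async_service_success_hook(
            service="router",
            duration=end_time - start_time,
            call_type="acompletion",
            start_time=start_time,
            end_time=end_time,
        )
    )
    return response


async def _demo():
    async def fake_llm_call(**_):
        await asyncio.sleep(0.05)
        return {"choices": []}

    await acompletion_with_timing(fake_llm_call, ServiceLogger())
    await asyncio.sleep(0)  # let the logging task run before the loop closes


if __name__ == "__main__":
    asyncio.run(_demo())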
@@ -793,15 +818,32 @@ class Router:
verbose_router_logger.debug(
f"Inside _acompletion()- model: {model}; kwargs: {kwargs}"
)
parent_otel_span = _get_parent_otel_span_from_kwargs(kwargs)
start_time = time.time()
deployment = await self.async_get_available_deployment(
model=model,
messages=messages,
specific_deployment=kwargs.pop("specific_deployment", None),
request_kwargs=kwargs,
)
end_time = time.time()
_duration = end_time - start_time
asyncio.create_task(
self.service_logger_obj.async_service_success_hook(
service=ServiceTypes.ROUTER,
duration=_duration,
call_type="async_get_available_deployment",
start_time=start_time,
end_time=end_time,
parent_otel_span=_get_parent_otel_span_from_kwargs(kwargs),
)
)

# debug how often this deployment picked
self._track_deployment_metrics(deployment=deployment)

self._track_deployment_metrics(
deployment=deployment, parent_otel_span=parent_otel_span
)
self._update_kwargs_with_deployment(deployment=deployment, kwargs=kwargs)

data = deployment["litellm_params"].copy()
@@ -846,12 +888,16 @@ class Router:
- If allowed, increment the rpm limit (allows global value to be updated, concurrency-safe)
"""
await self.async_routing_strategy_pre_call_checks(
deployment=deployment, logging_obj=logging_obj
deployment=deployment,
logging_obj=logging_obj,
parent_otel_span=parent_otel_span,
)
response = await _response
else:
await self.async_routing_strategy_pre_call_checks(
deployment=deployment, logging_obj=logging_obj
deployment=deployment,
logging_obj=logging_obj,
parent_otel_span=parent_otel_span,
)
response = await _response
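async_routing_strategy_pre_call_checks now takes parent_otel_span as a required argument, which is why every call site in this diff grows the extra keyword. A hedged, self-contained sketch of the new call shape (the real method iterates litellm.callbacks and runs each CustomLogger's pre-call check):

from typing import Any, Optional

Span = Any  # runtime stand-in for opentelemetry.trace.Span


async def async_routing_strategy_pre_call_checks(
    deployment: dict,
    parent_otel_span: Optional[Span],
    logging_obj: Optional[Any] = None,
) -> None:
    # simplified body: run per-strategy admission checks (e.g. RPM limits)
    # before the LLM call, with the span available for traced cache reads
    print("pre-call checks for", deployment.get("model_name"))


async def call_site(deployment: dict, parent_otel_span: Optional[Span]) -> None:
    # the updated call shape used throughout the diff
    await async_routing_strategy_pre_call_checks(
        deployment=deployment,
        logging_obj=None,
        parent_otel_span=parent_otel_span,
    )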
@@ -872,7 +918,11 @@ class Router:
f"litellm.acompletion(model={model_name})\033[32m 200 OK\033[0m"
)
# debug how often this deployment picked
self._track_deployment_metrics(deployment=deployment, response=response)
self._track_deployment_metrics(
deployment=deployment,
response=response,
parent_otel_span=parent_otel_span,
)

return response
except Exception as e:
@@ -1212,6 +1262,7 @@ class Router:
stream=False,
**kwargs,
):
parent_otel_span = _get_parent_otel_span_from_kwargs(kwargs)
### FLOW ITEM ###
_request_id = str(uuid.uuid4())
item = FlowItem(
@@ -1232,7 +1283,7 @@ class Router:

while curr_time < end_time:
_healthy_deployments, _ = await self._async_get_healthy_deployments(
model=model
model=model, parent_otel_span=parent_otel_span
)
make_request = await self.scheduler.poll( ## POLL QUEUE ## - returns 'True' if there's healthy deployments OR if request is at top of queue
id=item.request_id,
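schedule_acompletion admits prioritized requests by polling: each loop iteration refreshes the healthy deployment list (now with the parent span attached) and asks the scheduler whether this request may run yet. A toy sketch of such a poll loop, with SimpleScheduler as an assumed stand-in for litellm's priority scheduler:

import asyncio
import time
import uuid
from typing import List


class SimpleScheduler:
    """FIFO-only stand-in; the real scheduler also weighs request priority."""

    def __init__(self) -> None:
        self._queue: List[str] = []

    def add(self, request_id: str) -> None:
        self._queue.append(request_id)

    async def poll(self, id: str, healthy_deployments: List[dict]) -> bool:
        # admit the request once capacity exists and it is first in line
        if healthy_deployments and self._queue and self._queue[0] == id:
            self._queue.pop(0)
            return True
        return False


async def schedule_call(scheduler: SimpleScheduler, get_healthy, timeout: float = 5.0):
    request_id = str(uuid.uuid4())
    scheduler.add(request_id)
    end_time = time.time() + timeout
    while time.time() < end_time:
        healthy_deployments = await get_healthy()
        if await scheduler.poll(id=request_id, healthy_deployments=healthy_deployments):
            return f"dispatching {request_id}"
        await asyncio.sleep(0.01)  # back off before polling again
    raise TimeoutError("request timed out waiting in the queue")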
@@ -1353,6 +1404,7 @@ class Router:
verbose_router_logger.debug(
f"Inside _image_generation()- model: {model}; kwargs: {kwargs}"
)
parent_otel_span = _get_parent_otel_span_from_kwargs(kwargs)
deployment = await self.async_get_available_deployment(
model=model,
messages=[{"role": "user", "content": "prompt"}],
@@ -1395,11 +1447,13 @@ class Router:
- If allowed, increment the rpm limit (allows global value to be updated, concurrency-safe)
"""
await self.async_routing_strategy_pre_call_checks(
deployment=deployment
deployment=deployment, parent_otel_span=parent_otel_span
)
response = await response
else:
await self.async_routing_strategy_pre_call_checks(deployment=deployment)
await self.async_routing_strategy_pre_call_checks(
deployment=deployment, parent_otel_span=parent_otel_span
)
response = await response

self.success_calls[model_name] += 1
@@ -1465,6 +1519,7 @@ class Router:
verbose_router_logger.debug(
f"Inside _atranscription()- model: {model}; kwargs: {kwargs}"
)
parent_otel_span = _get_parent_otel_span_from_kwargs(kwargs)
deployment = await self.async_get_available_deployment(
model=model,
messages=[{"role": "user", "content": "prompt"}],
@@ -1505,11 +1560,13 @@ class Router:
- If allowed, increment the rpm limit (allows global value to be updated, concurrency-safe)
"""
await self.async_routing_strategy_pre_call_checks(
deployment=deployment
deployment=deployment, parent_otel_span=parent_otel_span
)
response = await response
else:
await self.async_routing_strategy_pre_call_checks(deployment=deployment)
await self.async_routing_strategy_pre_call_checks(
deployment=deployment, parent_otel_span=parent_otel_span
)
response = await response

self.success_calls[model_name] += 1
@@ -1861,6 +1918,7 @@ class Router:
verbose_router_logger.debug(
f"Inside _atext_completion()- model: {model}; kwargs: {kwargs}"
)
parent_otel_span = _get_parent_otel_span_from_kwargs(kwargs)
deployment = await self.async_get_available_deployment(
model=model,
messages=[{"role": "user", "content": prompt}],
@@ -1903,11 +1961,13 @@ class Router:
- If allowed, increment the rpm limit (allows global value to be updated, concurrency-safe)
"""
await self.async_routing_strategy_pre_call_checks(
deployment=deployment
deployment=deployment, parent_otel_span=parent_otel_span
)
response = await response
else:
await self.async_routing_strategy_pre_call_checks(deployment=deployment)
await self.async_routing_strategy_pre_call_checks(
deployment=deployment, parent_otel_span=parent_otel_span
)
response = await response

self.success_calls[model_name] += 1
@@ -1958,6 +2018,7 @@ class Router:
verbose_router_logger.debug(
f"Inside _aadapter_completion()- model: {model}; kwargs: {kwargs}"
)
parent_otel_span = _get_parent_otel_span_from_kwargs(kwargs)
deployment = await self.async_get_available_deployment(
model=model,
messages=[{"role": "user", "content": "default text"}],
@@ -2000,11 +2061,13 @@ class Router:
- If allowed, increment the rpm limit (allows global value to be updated, concurrency-safe)
"""
await self.async_routing_strategy_pre_call_checks(
deployment=deployment
deployment=deployment, parent_otel_span=parent_otel_span
)
response = await response # type: ignore
else:
await self.async_routing_strategy_pre_call_checks(deployment=deployment)
await self.async_routing_strategy_pre_call_checks(
deployment=deployment, parent_otel_span=parent_otel_span
)
response = await response # type: ignore

self.success_calls[model_name] += 1
@@ -2128,6 +2191,7 @@ class Router:
verbose_router_logger.debug(
f"Inside _aembedding()- model: {model}; kwargs: {kwargs}"
)
parent_otel_span = _get_parent_otel_span_from_kwargs(kwargs)
deployment = await self.async_get_available_deployment(
model=model,
input=input,
@@ -2168,11 +2232,13 @@ class Router:
- If allowed, increment the rpm limit (allows global value to be updated, concurrency-safe)
"""
await self.async_routing_strategy_pre_call_checks(
deployment=deployment
deployment=deployment, parent_otel_span=parent_otel_span
)
response = await response
else:
await self.async_routing_strategy_pre_call_checks(deployment=deployment)
await self.async_routing_strategy_pre_call_checks(
deployment=deployment, parent_otel_span=parent_otel_span
)
response = await response

self.success_calls[model_name] += 1
@@ -2223,6 +2289,7 @@ class Router:
verbose_router_logger.debug(
f"Inside _atext_completion()- model: {model}; kwargs: {kwargs}"
)
parent_otel_span = _get_parent_otel_span_from_kwargs(kwargs)
deployment = await self.async_get_available_deployment(
model=model,
messages=[{"role": "user", "content": "files-api-fake-text"}],
@@ -2273,11 +2340,13 @@ class Router:
- If allowed, increment the rpm limit (allows global value to be updated, concurrency-safe)
"""
await self.async_routing_strategy_pre_call_checks(
deployment=deployment
deployment=deployment, parent_otel_span=parent_otel_span
)
response = await response # type: ignore
else:
await self.async_routing_strategy_pre_call_checks(deployment=deployment)
await self.async_routing_strategy_pre_call_checks(
deployment=deployment, parent_otel_span=parent_otel_span
)
response = await response # type: ignore

self.success_calls[model_name] += 1
@@ -2327,6 +2396,7 @@ class Router:
verbose_router_logger.debug(
f"Inside _acreate_batch()- model: {model}; kwargs: {kwargs}"
)
parent_otel_span = _get_parent_otel_span_from_kwargs(kwargs)
deployment = await self.async_get_available_deployment(
model=model,
messages=[{"role": "user", "content": "files-api-fake-text"}],
@@ -2389,11 +2459,13 @@ class Router:
- If allowed, increment the rpm limit (allows global value to be updated, concurrency-safe)
"""
await self.async_routing_strategy_pre_call_checks(
deployment=deployment
deployment=deployment, parent_otel_span=parent_otel_span
)
response = await response # type: ignore
else:
await self.async_routing_strategy_pre_call_checks(deployment=deployment)
await self.async_routing_strategy_pre_call_checks(
deployment=deployment, parent_otel_span=parent_otel_span
)
response = await response # type: ignore

self.success_calls[model_name] += 1
@@ -2702,12 +2774,14 @@ class Router:
)
return response
except Exception as new_exception:
parent_otel_span = _get_parent_otel_span_from_kwargs(kwargs)
verbose_router_logger.error(
"litellm.router.py::async_function_with_fallbacks() - Error occurred while trying to do fallbacks - {}\n{}\n\nDebug Information:\nCooldown Deployments={}".format(
str(new_exception),
traceback.format_exc(),
await _async_get_cooldown_deployments_with_debug_info(
litellm_router_instance=self
litellm_router_instance=self,
parent_otel_span=parent_otel_span,
),
)
)
@@ -2779,12 +2853,13 @@ class Router:
Context_Policy_Fallbacks={content_policy_fallbacks}",
)

async def async_function_with_retries(self, *args, **kwargs):
async def async_function_with_retries(self, *args, **kwargs): # noqa: PLR0915
verbose_router_logger.debug(
f"Inside async function with retries: args - {args}; kwargs - {kwargs}"
)
original_function = kwargs.pop("original_function")
fallbacks = kwargs.pop("fallbacks", self.fallbacks)
parent_otel_span = _get_parent_otel_span_from_kwargs(kwargs)
context_window_fallbacks = kwargs.pop(
"context_window_fallbacks", self.context_window_fallbacks
)
@@ -2822,6 +2897,7 @@ class Router:
_healthy_deployments, _all_deployments = (
await self._async_get_healthy_deployments(
model=kwargs.get("model") or "",
parent_otel_span=parent_otel_span,
)
)

@@ -2879,6 +2955,7 @@ class Router:
_healthy_deployments, _ = (
await self._async_get_healthy_deployments(
model=_model,
parent_otel_span=parent_otel_span,
)
)
else:
@@ -3217,8 +3294,10 @@ class Router:
if _model is None:
raise e # re-raise error, if model can't be determined for loadbalancing
### CHECK IF RATE LIMIT / CONTEXT WINDOW ERROR
parent_otel_span = _get_parent_otel_span_from_kwargs(kwargs)
_healthy_deployments, _all_deployments = self._get_healthy_deployments(
model=_model,
parent_otel_span=parent_otel_span,
)

# raises an exception if this error should not be retries
@@ -3260,8 +3339,10 @@ class Router:

if _model is None:
raise e # re-raise error, if model can't be determined for loadbalancing
parent_otel_span = _get_parent_otel_span_from_kwargs(kwargs)
_healthy_deployments, _ = self._get_healthy_deployments(
model=_model,
parent_otel_span=parent_otel_span,
)
remaining_retries = num_retries - current_attempt
_timeout = self._time_to_sleep_before_retry(
@@ -3323,9 +3404,13 @@ class Router:
# ------------
# update cache

parent_otel_span = _get_parent_otel_span_from_kwargs(kwargs)
## TPM
await self.cache.async_increment_cache(
key=tpm_key, value=total_tokens, ttl=RoutingArgs.ttl.value
key=tpm_key,
value=total_tokens,
parent_otel_span=parent_otel_span,
ttl=RoutingArgs.ttl.value,
)

increment_deployment_successes_for_current_minute(
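The TPM bookkeeping now forwards both the parent span and the TTL (RoutingArgs.ttl, 60s) to the cache, matching the "pass ttl to redis cache" note in the commit message. A small sketch of an async counter increment that honours a TTL, written against redis.asyncio directly rather than litellm's DualCache:

import asyncio
from typing import Optional

import redis.asyncio as redis  # pip install redis


async def async_increment_cache(
    client: redis.Redis, key: str, value: int, ttl: Optional[int] = None
) -> int:
    # bump the per-minute counter and make sure the key expires with the window
    new_total = await client.incrby(key, value)
    if ttl is not None:
        await client.expire(key, ttl)
    return new_total


async def _demo():
    client = redis.Redis()  # assumes a local Redis on the default port
    total = await async_increment_cache(client, "gpt-4o:tpm:10-42", value=250, ttl=60)
    print("tokens this minute:", total)
    await client.aclose()  # redis-py >= 5


if __name__ == "__main__":
    asyncio.run(_demo())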
@@ -3474,7 +3559,9 @@ class Router:
except Exception as e:
raise e

def _update_usage(self, deployment_id: str) -> int:
def _update_usage(
self, deployment_id: str, parent_otel_span: Optional[Span]
) -> int:
"""
Update deployment rpm for that minute

@@ -3483,7 +3570,9 @@ class Router:
"""
rpm_key = deployment_id

request_count = self.cache.get_cache(key=rpm_key, local_only=True)
request_count = self.cache.get_cache(
key=rpm_key, parent_otel_span=parent_otel_span, local_only=True
)
if request_count is None:
request_count = 1
self.cache.set_cache(
@@ -3591,7 +3680,7 @@ class Router:
)
return False

def _get_healthy_deployments(self, model: str):
def _get_healthy_deployments(self, model: str, parent_otel_span: Optional[Span]):
_all_deployments: list = []
try:
_, _all_deployments = self._common_checks_available_deployment( # type: ignore
@@ -3602,7 +3691,9 @@ class Router:
except Exception:
pass

unhealthy_deployments = _get_cooldown_deployments(litellm_router_instance=self)
unhealthy_deployments = _get_cooldown_deployments(
litellm_router_instance=self, parent_otel_span=parent_otel_span
)
healthy_deployments: list = []
for deployment in _all_deployments:
if deployment["model_info"]["id"] in unhealthy_deployments:
@@ -3613,7 +3704,7 @@ class Router:
return healthy_deployments, _all_deployments

async def _async_get_healthy_deployments(
self, model: str
self, model: str, parent_otel_span: Optional[Span]
) -> Tuple[List[Dict], List[Dict]]:
"""
Returns Tuple of:
@@ -3632,7 +3723,7 @@ class Router:
pass

unhealthy_deployments = await _async_get_cooldown_deployments(
litellm_router_instance=self
litellm_router_instance=self, parent_otel_span=parent_otel_span
)
healthy_deployments: list = []
for deployment in _all_deployments:
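_async_get_healthy_deployments now requires the parent span and, per the new annotation, returns a tuple of (healthy deployments, all deployments). A simplified sketch of that filtering, with get_cooldown_ids standing in for _async_get_cooldown_deployments:

import asyncio
from typing import Any, Dict, List, Optional, Tuple

Span = Any  # runtime stand-in for opentelemetry.trace.Span


async def get_cooldown_ids(parent_otel_span: Optional[Span]) -> List[str]:
    # stand-in for _async_get_cooldown_deployments: the real helper reads
    # cooldown keys from the cache, tracing the lookup under the given span
    return ["deployment-on-cooldown"]


async def async_get_healthy_deployments(
    all_deployments: List[Dict], parent_otel_span: Optional[Span]
) -> Tuple[List[Dict], List[Dict]]:
    unhealthy = await get_cooldown_ids(parent_otel_span)
    healthy = [d for d in all_deployments if d["model_info"]["id"] not in unhealthy]
    return healthy, all_deployments


if __name__ == "__main__":
    deployments = [{"model_info": {"id": "a"}}, {"model_info": {"id": "deployment-on-cooldown"}}]
    print(asyncio.run(async_get_healthy_deployments(deployments, None)))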
@@ -3659,7 +3750,10 @@ class Router:
_callback.pre_call_check(deployment)

async def async_routing_strategy_pre_call_checks(
self, deployment: dict, logging_obj: Optional[LiteLLMLogging] = None
self,
deployment: dict,
parent_otel_span: Optional[Span],
logging_obj: Optional[LiteLLMLogging] = None,
):
"""
For usage-based-routing-v2, enables running rpm checks before the call is made, inside the semaphore.
@@ -3675,7 +3769,7 @@ class Router:
for _callback in litellm.callbacks:
if isinstance(_callback, CustomLogger):
try:
await _callback.async_pre_call_check(deployment)
await _callback.async_pre_call_check(deployment, parent_otel_span)
except litellm.RateLimitError as e:
## LOG FAILURE EVENT
if logging_obj is not None:
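The hook on the callback side changes too: async_pre_call_check is now invoked with the deployment and the parent span, so a strategy logger can trace its own admission checks (typically Redis reads) under the request. A hedged sketch of a callback written against that two-argument hook:

from typing import Any, Dict, Optional

Span = Any  # runtime stand-in for opentelemetry.trace.Span


class RateLimitError(Exception):
    """Raised when a deployment is already at its request limit."""


class RpmLimiterCallback:
    """Sketch of a CustomLogger-style callback using the new hook signature."""

    def __init__(self, rpm_limit: int) -> None:
        self.rpm_limit = rpm_limit
        self._counts: Dict[str, int] = {}

    async def async_pre_call_check(
        self, deployment: dict, parent_otel_span: Optional[Span]
    ) -> None:
        # a real implementation would increment the counter in Redis and pass
        # parent_otel_span along so that round-trip appears in the trace
        model_id = deployment["model_info"]["id"]
        self._counts[model_id] = self._counts.get(model_id, 0) + 1
        if self._counts[model_id] > self.rpm_limit:
            raise RateLimitError(f"{model_id} is over its RPM limit")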
@@ -4646,14 +4740,19 @@ class Router:
The appropriate client based on the given client_type and kwargs.
"""
model_id = deployment["model_info"]["id"]
parent_otel_span: Optional[Span] = _get_parent_otel_span_from_kwargs(kwargs)
if client_type == "max_parallel_requests":
cache_key = "{}_max_parallel_requests_client".format(model_id)
client = self.cache.get_cache(key=cache_key, local_only=True)
client = self.cache.get_cache(
key=cache_key, local_only=True, parent_otel_span=parent_otel_span
)
return client
elif client_type == "async":
if kwargs.get("stream") is True:
cache_key = f"{model_id}_stream_async_client"
client = self.cache.get_cache(key=cache_key, local_only=True)
client = self.cache.get_cache(
key=cache_key, local_only=True, parent_otel_span=parent_otel_span
)
if client is None:
"""
Re-initialize the client
@@ -4661,11 +4760,17 @@ class Router:
InitalizeOpenAISDKClient.set_client(
litellm_router_instance=self, model=deployment
)
client = self.cache.get_cache(key=cache_key, local_only=True)
client = self.cache.get_cache(
key=cache_key,
local_only=True,
parent_otel_span=parent_otel_span,
)
return client
else:
cache_key = f"{model_id}_async_client"
client = self.cache.get_cache(key=cache_key, local_only=True)
client = self.cache.get_cache(
key=cache_key, local_only=True, parent_otel_span=parent_otel_span
)
if client is None:
"""
Re-initialize the client
@@ -4673,12 +4778,18 @@ class Router:
InitalizeOpenAISDKClient.set_client(
litellm_router_instance=self, model=deployment
)
client = self.cache.get_cache(key=cache_key, local_only=True)
client = self.cache.get_cache(
key=cache_key,
local_only=True,
parent_otel_span=parent_otel_span,
)
return client
else:
if kwargs.get("stream") is True:
cache_key = f"{model_id}_stream_client"
client = self.cache.get_cache(key=cache_key)
client = self.cache.get_cache(
key=cache_key, parent_otel_span=parent_otel_span
)
if client is None:
"""
Re-initialize the client
@@ -4686,11 +4797,15 @@ class Router:
InitalizeOpenAISDKClient.set_client(
litellm_router_instance=self, model=deployment
)
client = self.cache.get_cache(key=cache_key)
client = self.cache.get_cache(
key=cache_key, parent_otel_span=parent_otel_span
)
return client
else:
cache_key = f"{model_id}_client"
client = self.cache.get_cache(key=cache_key)
client = self.cache.get_cache(
key=cache_key, parent_otel_span=parent_otel_span
)
if client is None:
"""
Re-initialize the client
@@ -4698,7 +4813,9 @@ class Router:
InitalizeOpenAISDKClient.set_client(
litellm_router_instance=self, model=deployment
)
client = self.cache.get_cache(key=cache_key)
client = self.cache.get_cache(
key=cache_key, parent_otel_span=parent_otel_span
)
return client

def _pre_call_checks( # noqa: PLR0915
@@ -4738,13 +4855,17 @@ class Router:
_context_window_error = False
_potential_error_str = ""
_rate_limit_error = False
parent_otel_span = _get_parent_otel_span_from_kwargs(request_kwargs)

## get model group RPM ##
dt = get_utc_datetime()
current_minute = dt.strftime("%H-%M")
rpm_key = f"{model}:rpm:{current_minute}"
model_group_cache = (
self.cache.get_cache(key=rpm_key, local_only=True) or {}
self.cache.get_cache(
key=rpm_key, local_only=True, parent_otel_span=parent_otel_span
)
or {}
) # check the in-memory cache used by lowest_latency and usage-based routing. Only check the local cache.
for idx, deployment in enumerate(_returned_deployments):
# see if we have the info for this model
@@ -4783,7 +4904,10 @@ class Router:
## RPM CHECK ##
### get local router cache ###
current_request_cache_local = (
self.cache.get_cache(key=model_id, local_only=True) or 0
self.cache.get_cache(
key=model_id, local_only=True, parent_otel_span=parent_otel_span
)
or 0
)
### get usage based cache ###
if (
@@ -5002,6 +5126,7 @@ class Router:
self.routing_strategy != "usage-based-routing-v2"
and self.routing_strategy != "simple-shuffle"
and self.routing_strategy != "cost-based-routing"
and self.routing_strategy != "latency-based-routing"
): # prevent regressions for other routing strategies, that don't have async get available deployments implemented.
return self.get_available_deployment(
model=model,
@@ -5011,6 +5136,7 @@ class Router:
request_kwargs=request_kwargs,
)
try:
parent_otel_span = _get_parent_otel_span_from_kwargs(request_kwargs)
model, healthy_deployments = self._common_checks_available_deployment(
model=model,
messages=messages,
@@ -5021,7 +5147,7 @@ class Router:
return healthy_deployments

cooldown_deployments = await _async_get_cooldown_deployments(
litellm_router_instance=self
litellm_router_instance=self, parent_otel_span=parent_otel_span
)
verbose_router_logger.debug(
f"async cooldown deployments: {cooldown_deployments}"
@@ -5059,16 +5185,18 @@ class Router:
_allowed_model_region = "n/a"
model_ids = self.get_model_ids(model_name=model)
_cooldown_time = self.cooldown_cache.get_min_cooldown(
model_ids=model_ids
model_ids=model_ids, parent_otel_span=parent_otel_span
)
_cooldown_list = _get_cooldown_deployments(
litellm_router_instance=self, parent_otel_span=parent_otel_span
)
_cooldown_list = _get_cooldown_deployments(litellm_router_instance=self)
raise RouterRateLimitError(
model=model,
cooldown_time=_cooldown_time,
enable_pre_call_checks=self.enable_pre_call_checks,
cooldown_list=_cooldown_list,
)

start_time = time.time()
if (
self.routing_strategy == "usage-based-routing-v2"
and self.lowesttpm_logger_v2 is not None
@@ -5093,6 +5221,19 @@ class Router:
input=input,
)
)
elif (
self.routing_strategy == "latency-based-routing"
and self.lowestlatency_logger is not None
):
deployment = (
await self.lowestlatency_logger.async_get_available_deployments(
model_group=model,
healthy_deployments=healthy_deployments, # type: ignore
messages=messages,
input=input,
request_kwargs=request_kwargs,
)
)
elif self.routing_strategy == "simple-shuffle":
return simple_shuffle(
llm_router_instance=self,
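This is the hunk behind the "async support for latency routing" part of the title: latency-based-routing is removed from the early-return guard and gets its own branch that awaits lowestlatency_logger.async_get_available_deployments(...). A toy sketch of picking the deployment with the lowest recent average latency, under the assumption that per-deployment latency samples are tracked elsewhere:

from statistics import mean
from typing import Dict, List, Optional


async def async_get_available_deployments(
    healthy_deployments: List[Dict],
    latency_history: Dict[str, List[float]],
) -> Optional[Dict]:
    """Pick the healthy deployment with the lowest mean recorded latency."""
    best, best_latency = None, float("inf")
    for deployment in healthy_deployments:
        samples = latency_history.get(deployment["model_info"]["id"], [])
        avg = mean(samples) if samples else 0.0  # unseen deployments get priority
        if avg < best_latency:
            best, best_latency = deployment, avg
    return best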
@@ -5107,9 +5248,11 @@ class Router:
)
model_ids = self.get_model_ids(model_name=model)
_cooldown_time = self.cooldown_cache.get_min_cooldown(
model_ids=model_ids
model_ids=model_ids, parent_otel_span=parent_otel_span
)
_cooldown_list = _get_cooldown_deployments(
litellm_router_instance=self, parent_otel_span=parent_otel_span
)
_cooldown_list = _get_cooldown_deployments(litellm_router_instance=self)
raise RouterRateLimitError(
model=model,
cooldown_time=_cooldown_time,
@@ -5120,6 +5263,19 @@ class Router:
f"get_available_deployment for model: {model}, Selected deployment: {self.print_deployment(deployment)} for model: {model}"
)

end_time = time.time()
_duration = end_time - start_time
asyncio.create_task(
self.service_logger_obj.async_service_success_hook(
service=ServiceTypes.ROUTER,
duration=_duration,
call_type="<routing_strategy>.async_get_available_deployments",
parent_otel_span=parent_otel_span,
start_time=start_time,
end_time=end_time,
)
)

return deployment
except Exception as e:
traceback_exception = traceback.format_exc()
@@ -5163,7 +5319,12 @@ class Router:
if isinstance(healthy_deployments, dict):
return healthy_deployments

cooldown_deployments = _get_cooldown_deployments(litellm_router_instance=self)
parent_otel_span: Optional[Span] = _get_parent_otel_span_from_kwargs(
request_kwargs
)
cooldown_deployments = _get_cooldown_deployments(
litellm_router_instance=self, parent_otel_span=parent_otel_span
)
healthy_deployments = self._filter_cooldown_deployments(
healthy_deployments=healthy_deployments,
cooldown_deployments=cooldown_deployments,
@@ -5180,8 +5341,12 @@ class Router:

if len(healthy_deployments) == 0:
model_ids = self.get_model_ids(model_name=model)
_cooldown_time = self.cooldown_cache.get_min_cooldown(model_ids=model_ids)
_cooldown_list = _get_cooldown_deployments(litellm_router_instance=self)
_cooldown_time = self.cooldown_cache.get_min_cooldown(
model_ids=model_ids, parent_otel_span=parent_otel_span
)
_cooldown_list = _get_cooldown_deployments(
litellm_router_instance=self, parent_otel_span=parent_otel_span
)
raise RouterRateLimitError(
model=model,
cooldown_time=_cooldown_time,
@@ -5238,8 +5403,12 @@ class Router:
f"get_available_deployment for model: {model}, No deployment available"
)
model_ids = self.get_model_ids(model_name=model)
_cooldown_time = self.cooldown_cache.get_min_cooldown(model_ids=model_ids)
_cooldown_list = _get_cooldown_deployments(litellm_router_instance=self)
_cooldown_time = self.cooldown_cache.get_min_cooldown(
model_ids=model_ids, parent_otel_span=parent_otel_span
)
_cooldown_list = _get_cooldown_deployments(
litellm_router_instance=self, parent_otel_span=parent_otel_span
)
raise RouterRateLimitError(
model=model,
cooldown_time=_cooldown_time,
@@ -5278,7 +5447,9 @@ class Router:
healthy_deployments.remove(deployment)
return healthy_deployments

def _track_deployment_metrics(self, deployment, response=None):
def _track_deployment_metrics(
self, deployment, parent_otel_span: Optional[Span], response=None
):
"""
Tracks successful requests rpm usage.
"""
@@ -5288,7 +5459,9 @@ class Router:

# update self.deployment_stats
if model_id is not None:
self._update_usage(model_id) # update in-memory cache for tracking
self._update_usage(
model_id, parent_otel_span
) # update in-memory cache for tracking
except Exception as e:
verbose_router_logger.error(f"Error in _track_deployment_metrics: {str(e)}")