redis otel tracing + async support for latency routing (#6452)

* docs(exception_mapping.md): add missing exception types

Fixes https://github.com/Aider-AI/aider/issues/2120#issuecomment-2438971183

* fix(main.py): register custom model pricing with specific key

Ensure custom model pricing is registered to the specific model+provider key combination
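
A minimal sketch of provider-qualified registration using litellm's public `register_model` API; the model name and prices below are placeholders, not values from this commit:

```python
import litellm

# Register custom pricing under the provider-qualified key
# ("azure/my-gpt-4o", not bare "my-gpt-4o") so cost lookups resolve
# to the exact model+provider combination. Values are illustrative.
litellm.register_model(
    {
        "azure/my-gpt-4o": {
            "input_cost_per_token": 5e-06,
            "output_cost_per_token": 1.5e-05,
            "litellm_provider": "azure",
            "mode": "chat",
        }
    }
)
```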

* test: make testing more robust for custom pricing

* fix(redis_cache.py): instrument otel logging for sync redis calls

ensures complete coverage for all redis cache calls
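
A rough sketch of the instrumentation pattern, assuming the standard opentelemetry-api; the class, method, and attribute names are illustrative, not litellm's actual internals:

```python
from opentelemetry import trace

tracer = trace.get_tracer(__name__)

class RedisCache:
    def __init__(self, redis_client):
        self.redis_client = redis_client

    def get_cache(self, key: str):
        # Wrap the blocking redis call in a span so sync reads show up
        # in traces alongside the async ones (illustrative sketch).
        with tracer.start_as_current_span("redis.get_cache") as span:
            span.set_attribute("redis.key", key)
            return self.redis_client.get(key)
```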

* refactor: pass parent_otel_span for redis caching calls in router

allows more observability into which calls are causing latency issues
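
The idea, sketched with a hypothetical helper (the actual change threads `parent_otel_span` through litellm's cache methods): parent the cache span on the request's span so slow redis round-trips are attributed to the request that triggered them.

```python
from typing import Optional

from opentelemetry import trace
from opentelemetry.trace import Span

tracer = trace.get_tracer(__name__)

def traced_cache_read(cache, key: str, parent_otel_span: Optional[Span]):
    # Hypothetical helper: start the cache span as a child of the router's
    # request span; fall back to the ambient context when no span is given.
    ctx = (
        trace.set_span_in_context(parent_otel_span)
        if parent_otel_span is not None
        else None
    )
    with tracer.start_as_current_span("redis.cache_read", context=ctx):
        return cache.get_cache(key)
```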

* test: update tests with new params

* refactor: ensure e2e otel tracing for router

* refactor(router.py): add more otel tracing across router

catch all latency issues for router requests

* fix: fix linting error

* fix(router.py): fix linting error

* fix: fix test

* test: fix tests

* fix(dual_cache.py): pass ttl to redis cache
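
A sketch of the fix's shape, assuming a simplified two-tier cache (not litellm's actual `DualCache`): the TTL has to be forwarded to the redis layer, otherwise keys written through the dual cache never expire in redis.

```python
from typing import Any, Optional

class SimpleDualCache:
    # Simplified stand-in for a two-tier cache (in-memory + redis).
    def __init__(self, in_memory_cache, redis_cache):
        self.in_memory_cache = in_memory_cache
        self.redis_cache = redis_cache

    async def async_set_cache(self, key: str, value: Any, ttl: Optional[float] = None):
        await self.in_memory_cache.async_set_cache(key, value, ttl=ttl)
        if self.redis_cache is not None:
            # The bug class this commit fixes: `ttl` was not passed through here.
            await self.redis_cache.async_set_cache(key, value, ttl=ttl)
```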

* fix: fix param
Krish Dholakia, 2024-10-28 21:52:12 -07:00, committed by GitHub
commit 4f8a3fd4cf (parent d9e7818e6b)
25 changed files with 559 additions and 147 deletions

@@ -20,12 +20,15 @@ from .router_callbacks.track_deployment_metrics import (
 )
 
 if TYPE_CHECKING:
+    from opentelemetry.trace import Span as _Span
     from litellm.router import Router as _Router
 
     LitellmRouter = _Router
+    Span = _Span
 else:
     LitellmRouter = Any
+    Span = Any
 
 DEFAULT_FAILURE_THRESHOLD_PERCENT = (
     0.5  # default cooldown a deployment if 50% of requests fail in a given minute
 )
@@ -207,6 +210,7 @@ def _set_cooldown_deployments(
 
 async def _async_get_cooldown_deployments(
     litellm_router_instance: LitellmRouter,
+    parent_otel_span: Optional[Span],
 ) -> List[str]:
     """
     Async implementation of '_get_cooldown_deployments'
@@ -214,7 +218,8 @@ async def _async_get_cooldown_deployments(
     model_ids = litellm_router_instance.get_model_ids()
     cooldown_models = (
         await litellm_router_instance.cooldown_cache.async_get_active_cooldowns(
-            model_ids=model_ids
+            model_ids=model_ids,
+            parent_otel_span=parent_otel_span,
         )
     )
@@ -233,6 +238,7 @@ async def _async_get_cooldown_deployments(
 
 async def _async_get_cooldown_deployments_with_debug_info(
     litellm_router_instance: LitellmRouter,
+    parent_otel_span: Optional[Span],
 ) -> List[tuple]:
     """
     Async implementation of '_get_cooldown_deployments'
@@ -240,7 +246,7 @@ async def _async_get_cooldown_deployments_with_debug_info(
     model_ids = litellm_router_instance.get_model_ids()
     cooldown_models = (
         await litellm_router_instance.cooldown_cache.async_get_active_cooldowns(
-            model_ids=model_ids
+            model_ids=model_ids, parent_otel_span=parent_otel_span
        )
     )
@@ -248,7 +254,9 @@ async def _async_get_cooldown_deployments_with_debug_info(
     return cooldown_models
 
 
-def _get_cooldown_deployments(litellm_router_instance: LitellmRouter) -> List[str]:
+def _get_cooldown_deployments(
+    litellm_router_instance: LitellmRouter, parent_otel_span: Optional[Span]
+) -> List[str]:
     """
     Get the list of models being cooled down for this minute
     """
@@ -258,8 +266,9 @@ def _get_cooldown_deployments(litellm_router_instance: LitellmRouter) -> List[str]:
     # Return cooldown models
     # ----------------------
     model_ids = litellm_router_instance.get_model_ids()
     cooldown_models = litellm_router_instance.cooldown_cache.get_active_cooldowns(
-        model_ids=model_ids
+        model_ids=model_ids, parent_otel_span=parent_otel_span
     )
 
     cached_value_deployment_ids = []
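
After this change, call sites have to thread the request's span through. A minimal sketch of a call site, assuming `router` is an initialized litellm Router and the current span stands in for the request's parent span:

```python
from opentelemetry import trace

parent_otel_span = trace.get_current_span()
cooldown_deployments = _get_cooldown_deployments(
    litellm_router_instance=router,
    parent_otel_span=parent_otel_span,
)
```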