(fix) latency fix - revert prompt caching check on litellm router (#7211)

* attempt to fix latency issue

* fix latency issues for router prompt caching
Author: Ishaan Jaff · 2024-12-12 20:50:16 -08:00 (committed by GitHub)
Parent: 3de32f4106
Commit: 7ff9a905d2
2 changed files with 6 additions and 44 deletions
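For context on the latency concern: the reverted code awaited a prompt-cache lookup on every request before a deployment was selected, so even traffic that never used the beta feature paid an extra async round trip on the routing hot path. Below is a minimal, self-contained sketch (hypothetical names, not litellm's actual code) that illustrates the cost of an always-on awaited lookup before routing.

import asyncio
import time


async def cache_lookup(messages):
    # stand-in for an awaited cache read (e.g. Redis); assume ~5 ms per round trip
    await asyncio.sleep(0.005)
    return None  # the common case: no cached deployment for this prompt


async def pick_deployment(messages, prompt_caching_check: bool) -> str:
    if prompt_caching_check:
        # the reverted behavior: every request awaits the lookup before routing
        cached = await cache_lookup(messages)
        if cached is not None:
            return cached
    return "default-deployment"


async def main() -> None:
    messages = [{"role": "user", "content": "hello"}]
    for enabled in (True, False):
        start = time.perf_counter()
        for _ in range(100):
            await pick_deployment(messages, prompt_caching_check=enabled)
        elapsed_ms = (time.perf_counter() - start) * 1000
        print(f"prompt_caching_check={enabled}: {elapsed_ms:.1f} ms for 100 requests")


if __name__ == "__main__":
    asyncio.run(main())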

Changed file 1 of 2 — litellm Router:

@@ -151,7 +151,6 @@ from litellm.utils import (
    get_llm_provider,
    get_secret,
    get_utc_datetime,
-    is_prompt_caching_valid_prompt,
    is_region_allowed,
 )
@@ -3383,30 +3382,6 @@ class Router:
                litellm_router_instance=self,
                deployment_id=id,
            )
-            ## PROMPT CACHING
-            prompt_cache = PromptCachingCache(
-                cache=self.cache,
-            )
-            if (
-                standard_logging_object["messages"] is not None
-                and isinstance(standard_logging_object["messages"], list)
-                and deployment_name is not None
-                and isinstance(deployment_name, str)
-            ):
-                valid_prompt = is_prompt_caching_valid_prompt(
-                    messages=standard_logging_object["messages"],  # type: ignore
-                    tools=None,
-                    model=deployment_name,
-                    custom_llm_provider=None,
-                )
-                if valid_prompt:
-                    await prompt_cache.async_add_model_id(
-                        model_id=id,
-                        messages=standard_logging_object["messages"],  # type: ignore
-                        tools=None,
-                    )
            return tpm_key
        except Exception as e:
@@ -5339,25 +5314,6 @@ class Router:
                messages=messages,
                request_kwargs=request_kwargs,
            )
-            if messages is not None and is_prompt_caching_valid_prompt(
-                messages=cast(List[AllMessageValues], messages),
-                model=model,
-                custom_llm_provider=None,
-            ):
-                prompt_cache = PromptCachingCache(
-                    cache=self.cache,
-                )
-                healthy_deployment = (
-                    await prompt_cache.async_get_prompt_caching_deployment(
-                        router=self,
-                        messages=cast(List[AllMessageValues], messages),
-                        tools=None,
-                    )
-                )
-                if healthy_deployment is not None:
-                    return healthy_deployment
            # check if user wants to do tag based routing
            healthy_deployments = await get_deployments_for_tag(  # type: ignore
                llm_router_instance=self,
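For readers unfamiliar with the removed beta feature, here is a rough, self-contained sketch (assumed helper names, not litellm's implementation) of what the two deleted blocks did together: the success path recorded which deployment served a cache-eligible prompt, and the routing path looked that mapping up so follow-up requests with the same prompt could land on the deployment that already holds the provider-side prompt cache.

import hashlib
import json
from typing import Dict, List, Optional


def _prompt_key(messages: List[dict]) -> str:
    # assumed keying scheme: a stable hash of the message list
    return hashlib.sha256(json.dumps(messages, sort_keys=True).encode()).hexdigest()


class PromptCacheSketch:
    """Toy stand-in for the removed PromptCachingCache usage."""

    def __init__(self) -> None:
        self._store: Dict[str, str] = {}

    def add_model_id(self, model_id: str, messages: List[dict]) -> None:
        # success-callback side: remember which deployment served this prompt
        self._store[_prompt_key(messages)] = model_id

    def get_deployment(self, messages: List[dict]) -> Optional[str]:
        # routing side: prefer the deployment that already holds the cached prompt
        return self._store.get(_prompt_key(messages))


if __name__ == "__main__":
    cache = PromptCacheSketch()
    msgs = [{"role": "user", "content": "a long, cache-eligible prompt prefix..."}]
    cache.add_model_id("deployment-a", msgs)
    print(cache.get_deployment(msgs))  # -> "deployment-a"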

Changed file 2 of 2 — router prompt caching tests:

@@ -603,6 +603,9 @@ def test_is_prompt_caching_enabled(anthropic_messages):
    [("anthropic_messages", True), ("normal_messages", False)],
 )
 @pytest.mark.asyncio()
+@pytest.mark.skip(
+    reason="BETA FEATURE - skipping since this led to a latency impact, beta feature that is not used as yet"
+)
 async def test_router_prompt_caching_model_stored(
    messages, expected_model_id, anthropic_messages
 ):
@@ -650,6 +653,9 @@ async def test_router_prompt_caching_model_stored(
 @pytest.mark.asyncio()
+@pytest.mark.skip(
+    reason="BETA FEATURE - skipping since this led to a latency impact, beta feature that is not used as yet"
+)
 async def test_router_with_prompt_caching(anthropic_messages):
    """
    if prompt caching supported model called with prompt caching valid prompt,