(fix) latency fix - revert prompt caching check on litellm router (#7211)

* attempt to fix latency issue
* fix latency issues for router prompt caching
2025-04-25 10:44:24 +00:00 · 2024-12-12 20:50:16 -08:00 · 2024-12-12 20:50:16 -08:00 · 7ff9a905d2
commit 7ff9a905d2
parent 3de32f4106
2 changed files with 6 additions and 44 deletions
--- a/litellm/router.py
+++ b/litellm/router.py
@@ -151,7 +151,6 @@ from litellm.utils import (
    get_llm_provider,
    get_secret,
    get_utc_datetime,
-    is_prompt_caching_valid_prompt,
    is_region_allowed,
 )

@@ -3383,30 +3382,6 @@ class Router:
                    litellm_router_instance=self,
                    deployment_id=id,
                )
-
-                ## PROMPT CACHING
-                prompt_cache = PromptCachingCache(
-                    cache=self.cache,
-                )
-                if (
-                    standard_logging_object["messages"] is not None
-                    and isinstance(standard_logging_object["messages"], list)
-                    and deployment_name is not None
-                    and isinstance(deployment_name, str)
-                ):
-                    valid_prompt = is_prompt_caching_valid_prompt(
-                        messages=standard_logging_object["messages"],  # type: ignore
-                        tools=None,
-                        model=deployment_name,
-                        custom_llm_provider=None,
-                    )
-                    if valid_prompt:
-                        await prompt_cache.async_add_model_id(
-                            model_id=id,
-                            messages=standard_logging_object["messages"],  # type: ignore
-                            tools=None,
-                        )
-
                return tpm_key

        except Exception as e:
@@ -5339,25 +5314,6 @@ class Router:
                    messages=messages,
                    request_kwargs=request_kwargs,
                )
-
-            if messages is not None and is_prompt_caching_valid_prompt(
-                messages=cast(List[AllMessageValues], messages),
-                model=model,
-                custom_llm_provider=None,
-            ):
-                prompt_cache = PromptCachingCache(
-                    cache=self.cache,
-                )
-                healthy_deployment = (
-                    await prompt_cache.async_get_prompt_caching_deployment(
-                        router=self,
-                        messages=cast(List[AllMessageValues], messages),
-                        tools=None,
-                    )
-                )
-                if healthy_deployment is not None:
-                    return healthy_deployment
-
            # check if user wants to do tag based routing
            healthy_deployments = await get_deployments_for_tag(  # type: ignore
                llm_router_instance=self,
--- a/tests/local_testing/test_anthropic_prompt_caching.py
+++ b/tests/local_testing/test_anthropic_prompt_caching.py
@@ -603,6 +603,9 @@ def test_is_prompt_caching_enabled(anthropic_messages):
    [("anthropic_messages", True), ("normal_messages", False)],
 )
@pytest.mark.asyncio()
+@pytest.mark.skip(
+    reason="BETA FEATURE - skipping since this led to a latency impact, beta feature that is not used as yet"
+)
 async def test_router_prompt_caching_model_stored(
    messages, expected_model_id, anthropic_messages
 ):
@@ -650,6 +653,9 @@ async def test_router_prompt_caching_model_stored(


@pytest.mark.asyncio()
+@pytest.mark.skip(
+    reason="BETA FEATURE - skipping since this led to a latency impact, beta feature that is not used as yet"
+)
 async def test_router_with_prompt_caching(anthropic_messages):
    """
    if prompt caching supported model called with prompt caching valid prompt,