(fix) latency fix - revert prompt caching check on litellm router (#7211)

* attempt to fix latency issue

* fix latency issues for router prompt caching
Author: Ishaan Jaff · 2024-12-12 20:50:16 -08:00 (committed by GitHub)
Parent: 3de32f4106
Commit: 7ff9a905d2
2 changed files with 6 additions and 44 deletions
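For context on the latency concern: the reverted code awaited a prompt-cache lookup on every request before a deployment was selected, so even traffic that never used the beta feature paid an extra async round trip on the routing hot path. Below is a minimal, self-contained sketch (hypothetical names, not litellm's actual code) that illustrates the cost of an always-on awaited lookup before routing.

import asyncio
import time


async def cache_lookup(messages):
    # stand-in for an awaited cache read (e.g. Redis); assume ~5 ms per round trip
    await asyncio.sleep(0.005)
    return None  # the common case: no cached deployment for this prompt


async def pick_deployment(messages, prompt_caching_check: bool) -> str:
    if prompt_caching_check:
        # the reverted behavior: every request awaits the lookup before routing
        cached = await cache_lookup(messages)
        if cached is not None:
            return cached
    return "default-deployment"


async def main() -> None:
    messages = [{"role": "user", "content": "hello"}]
    for enabled in (True, False):
        start = time.perf_counter()
        for _ in range(100):
            await pick_deployment(messages, prompt_caching_check=enabled)
        elapsed_ms = (time.perf_counter() - start) * 1000
        print(f"prompt_caching_check={enabled}: {elapsed_ms:.1f} ms for 100 requests")


if __name__ == "__main__":
    asyncio.run(main())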

Changed file 1 of 2 — litellm Router:

@@ -151,7 +151,6 @@ from litellm.utils import (
    get_llm_provider,
    get_secret,
    get_utc_datetime,
-    is_prompt_caching_valid_prompt,
    is_region_allowed,
 )
@@ -3383,30 +3382,6 @@ class Router:
                litellm_router_instance=self,
                deployment_id=id,
            )
-            ## PROMPT CACHING
-            prompt_cache = PromptCachingCache(
-                cache=self.cache,
-            )
-            if (
-                standard_logging_object["messages"] is not None
-                and isinstance(standard_logging_object["messages"], list)
-                and deployment_name is not None
-                and isinstance(deployment_name, str)
-            ):
-                valid_prompt = is_prompt_caching_valid_prompt(
-                    messages=standard_logging_object["messages"],  # type: ignore
-                    tools=None,
-                    model=deployment_name,
-                    custom_llm_provider=None,
-                )
-                if valid_prompt:
-                    await prompt_cache.async_add_model_id(
-                        model_id=id,
-                        messages=standard_logging_object["messages"],  # type: ignore
-                        tools=None,
-                    )
            return tpm_key
        except Exception as e:
@@ -5339,25 +5314,6 @@ class Router:
                messages=messages,
                request_kwargs=request_kwargs,
            )
-            if messages is not None and is_prompt_caching_valid_prompt(
-                messages=cast(List[AllMessageValues], messages),
-                model=model,
-                custom_llm_provider=None,
-            ):
-                prompt_cache = PromptCachingCache(
-                    cache=self.cache,
-                )
-                healthy_deployment = (
-                    await prompt_cache.async_get_prompt_caching_deployment(
-                        router=self,
-                        messages=cast(List[AllMessageValues], messages),
-                        tools=None,
-                    )
-                )
-                if healthy_deployment is not None:
-                    return healthy_deployment
            # check if user wants to do tag based routing
            healthy_deployments = await get_deployments_for_tag(  # type: ignore
                llm_router_instance=self,
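For readers unfamiliar with the removed beta feature, here is a rough, self-contained sketch (assumed helper names, not litellm's implementation) of what the two deleted blocks did together: the success path recorded which deployment served a cache-eligible prompt, and the routing path looked that mapping up so follow-up requests with the same prompt could land on the deployment that already holds the provider-side prompt cache.

import hashlib
import json
from typing import Dict, List, Optional


def _prompt_key(messages: List[dict]) -> str:
    # assumed keying scheme: a stable hash of the message list
    return hashlib.sha256(json.dumps(messages, sort_keys=True).encode()).hexdigest()


class PromptCacheSketch:
    """Toy stand-in for the removed PromptCachingCache usage."""

    def __init__(self) -> None:
        self._store: Dict[str, str] = {}

    def add_model_id(self, model_id: str, messages: List[dict]) -> None:
        # success-callback side: remember which deployment served this prompt
        self._store[_prompt_key(messages)] = model_id

    def get_deployment(self, messages: List[dict]) -> Optional[str]:
        # routing side: prefer the deployment that already holds the cached prompt
        return self._store.get(_prompt_key(messages))


if __name__ == "__main__":
    cache = PromptCacheSketch()
    msgs = [{"role": "user", "content": "a long, cache-eligible prompt prefix..."}]
    cache.add_model_id("deployment-a", msgs)
    print(cache.get_deployment(msgs))  # -> "deployment-a"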

Changed file 2 of 2 — router prompt caching tests:

@@ -603,6 +603,9 @@ def test_is_prompt_caching_enabled(anthropic_messages):
    [("anthropic_messages", True), ("normal_messages", False)],
 )
 @pytest.mark.asyncio()
+@pytest.mark.skip(
+    reason="BETA FEATURE - skipping since this led to a latency impact, beta feature that is not used as yet"
+)
 async def test_router_prompt_caching_model_stored(
    messages, expected_model_id, anthropic_messages
 ):
@@ -650,6 +653,9 @@ async def test_router_prompt_caching_model_stored(
 @pytest.mark.asyncio()
+@pytest.mark.skip(
+    reason="BETA FEATURE - skipping since this led to a latency impact, beta feature that is not used as yet"
+)
 async def test_router_with_prompt_caching(anthropic_messages):
    """
    if prompt caching supported model called with prompt caching valid prompt,