Mirror of https://github.com/BerriAI/litellm.git (synced 2025-04-25 10:44:24 +00:00)
(fix) latency fix - revert prompt caching check on litellm router (#7211)

* attempt to fix latency issue
* fix latency issues for router prompt caching

parent 3de32f4106
commit 7ff9a905d2

2 changed files with 6 additions and 44 deletions
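What the reverted check did, in outline: after every successful call, the router's success callback recorded which deployment had just served a prompt-caching-valid prompt (via PromptCachingCache.async_add_model_id), and deployment selection awaited a cache lookup (async_get_prompt_caching_deployment) to prefer that deployment on follow-up requests. Below is a minimal sketch of that pattern, not the litellm implementation: a plain dict stands in for PromptCachingCache, and the helper names are hypothetical.

# Minimal sketch of the routing behavior this commit reverts (hypothetical names;
# a plain dict stands in for litellm's PromptCachingCache).
import hashlib
import json
from typing import Any, Dict, List, Optional

_prompt_to_deployment: Dict[str, str] = {}  # prompt fingerprint -> deployment id


def _fingerprint(messages: List[Dict[str, Any]]) -> str:
    # Stable hash of the message list, analogous to keying the cache on the prompt.
    return hashlib.sha256(json.dumps(messages, sort_keys=True).encode()).hexdigest()


def record_deployment_for_prompt(messages: List[Dict[str, Any]], deployment_id: str) -> None:
    # Write side (success callback): remember which deployment served this prompt
    # so follow-up requests can reuse that provider's warm prompt cache.
    _prompt_to_deployment[_fingerprint(messages)] = deployment_id


def pick_cached_deployment(messages: List[Dict[str, Any]]) -> Optional[str]:
    # Read side (routing path): prefer the deployment that already cached this
    # prompt, falling back to normal routing when there is no hit.
    return _prompt_to_deployment.get(_fingerprint(messages))

Both halves run on the request hot path, which is the per-request overhead this commit removes.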
@@ -151,7 +151,6 @@ from litellm.utils import (
     get_llm_provider,
     get_secret,
     get_utc_datetime,
-    is_prompt_caching_valid_prompt,
     is_region_allowed,
 )
@@ -3383,30 +3382,6 @@ class Router:
                 litellm_router_instance=self,
                 deployment_id=id,
             )
-
-            ## PROMPT CACHING
-            prompt_cache = PromptCachingCache(
-                cache=self.cache,
-            )
-            if (
-                standard_logging_object["messages"] is not None
-                and isinstance(standard_logging_object["messages"], list)
-                and deployment_name is not None
-                and isinstance(deployment_name, str)
-            ):
-                valid_prompt = is_prompt_caching_valid_prompt(
-                    messages=standard_logging_object["messages"],  # type: ignore
-                    tools=None,
-                    model=deployment_name,
-                    custom_llm_provider=None,
-                )
-                if valid_prompt:
-                    await prompt_cache.async_add_model_id(
-                        model_id=id,
-                        messages=standard_logging_object["messages"],  # type: ignore
-                        tools=None,
-                    )
-
             return tpm_key

         except Exception as e:
@@ -5339,25 +5314,6 @@ class Router:
                 messages=messages,
                 request_kwargs=request_kwargs,
             )
-
-            if messages is not None and is_prompt_caching_valid_prompt(
-                messages=cast(List[AllMessageValues], messages),
-                model=model,
-                custom_llm_provider=None,
-            ):
-                prompt_cache = PromptCachingCache(
-                    cache=self.cache,
-                )
-                healthy_deployment = (
-                    await prompt_cache.async_get_prompt_caching_deployment(
-                        router=self,
-                        messages=cast(List[AllMessageValues], messages),
-                        tools=None,
-                    )
-                )
-                if healthy_deployment is not None:
-                    return healthy_deployment
-
             # check if user wants to do tag based routing
             healthy_deployments = await get_deployments_for_tag(  # type: ignore
                 llm_router_instance=self,
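This second router hunk is the read side of the feature: deployment selection awaited a cache lookup before falling through to tag-based routing, so every request paid an extra round trip even when prompt caching was never used. A rough, self-contained illustration of that cost follows; the 5 ms sleep is a made-up stand-in for a remote cache lookup, not a measured litellm number.

# Illustrative only: why an awaited cache round-trip per routing decision adds latency.
import asyncio
import time


async def fake_cache_lookup() -> None:
    await asyncio.sleep(0.005)  # pretend network round-trip to a remote cache


async def route(with_cache_check: bool) -> None:
    if with_cache_check:
        await fake_cache_lookup()  # extra await before picking a deployment
    # ... pick a deployment as usual ...


async def main() -> None:
    for flag in (True, False):
        start = time.perf_counter()
        for _ in range(100):
            await route(flag)
        elapsed_ms = (time.perf_counter() - start) * 1000
        print(f"with_cache_check={flag}: {elapsed_ms:.1f} ms for 100 routing calls")


asyncio.run(main())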
@@ -603,6 +603,9 @@ def test_is_prompt_caching_enabled(anthropic_messages):
     [("anthropic_messages", True), ("normal_messages", False)],
 )
 @pytest.mark.asyncio()
+@pytest.mark.skip(
+    reason="BETA FEATURE - skipping since this led to a latency impact, beta feature that is not used as yet"
+)
 async def test_router_prompt_caching_model_stored(
     messages, expected_model_id, anthropic_messages
 ):

@@ -650,6 +653,9 @@ async def test_router_prompt_caching_model_stored(
 
 
 @pytest.mark.asyncio()
+@pytest.mark.skip(
+    reason="BETA FEATURE - skipping since this led to a latency impact, beta feature that is not used as yet"
+)
 async def test_router_with_prompt_caching(anthropic_messages):
     """
     if prompt caching supported model called with prompt caching valid prompt,
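The tests themselves are kept but skipped unconditionally with the BETA FEATURE reason shown above. Not part of this commit, but one way such beta tests could stay runnable on demand is an environment-gated pytest.mark.skipif; the sketch below uses a hypothetical flag and test name.

# Hypothetical alternative to the unconditional skip: gate the beta prompt-caching
# tests behind an environment flag so they can still be run explicitly.
import os

import pytest

RUN_BETA_PROMPT_CACHING_TESTS = os.getenv("RUN_BETA_PROMPT_CACHING_TESTS") == "1"


@pytest.mark.asyncio()
@pytest.mark.skipif(
    not RUN_BETA_PROMPT_CACHING_TESTS,
    reason="BETA FEATURE - set RUN_BETA_PROMPT_CACHING_TESTS=1 to run",
)
async def test_router_with_prompt_caching_beta():
    ...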