fix(router.py): check if azure returns 'content_filter' response + fallback available -> fallback

Maps Azure content-filter responses to exceptions so that configured content-policy fallbacks can handle them.
Krrish Dholakia 2024-06-22 19:10:15 -07:00
parent f814f24d9d
commit 2c7a80d08d
8 changed files with 100 additions and 70 deletions
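A minimal sketch of the router-level behavior this commit adds, mirroring the test added in this diff; the model names, keys, and mocked responses are illustrative, and it assumes Router accepts content_policy_fallbacks (as the proxy config below does):

import litellm
from litellm import Router

# Deployment whose mocked response comes back with finish_reason="content_filter",
# i.e. what Azure returns when its content filter blocks the output.
router = Router(
    model_list=[
        {
            "model_name": "gpt-4o",
            "litellm_params": {
                "model": "azure/gpt-4o",
                "api_key": "",
                "mock_response": litellm.ModelResponse(
                    choices=[litellm.Choices(finish_reason="content_filter")]
                ),
            },
        },
        {
            "model_name": "my-fake-model",
            "litellm_params": {
                "model": "openai/my-fake-model",
                "api_key": "",
                "mock_response": "This works!",
            },
        },
    ],
    content_policy_fallbacks=[{"gpt-4o": ["my-fake-model"]}],
)

# The filtered response is raised as ContentPolicyViolationError (only because a
# fallback group exists for "gpt-4o"), which routes the call to "my-fake-model".
response = router.completion(
    model="gpt-4o",
    messages=[{"role": "user", "content": "Hey, how's it going?"}],
)
print(response.model)  # "my-fake-model"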


@@ -477,6 +477,9 @@ def mock_completion(
        if time_delay is not None:
            time.sleep(time_delay)

        if isinstance(mock_response, dict):
            return ModelResponse(**mock_response)

        model_response = ModelResponse(stream=stream)
        if stream is True:
            # don't try to access stream object,
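For reference, a hedged sketch of what the mock_completion change above allows: passing a plain dict as mock_response and having it unpacked via ModelResponse(**mock_response); the field values here are illustrative.

import litellm

resp = litellm.completion(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "hi"}],
    mock_response={
        "choices": [
            {
                "finish_reason": "content_filter",
                "index": 0,
                "message": {"role": "assistant", "content": None},
            }
        ]
    },
)
print(resp.choices[0].finish_reason)  # "content_filter"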

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long


@@ -1,61 +1,14 @@
environment_variables:
  LANGFUSE_PUBLIC_KEY: Q6K8MQN6L7sPYSJiFKM9eNrETOx6V/FxVPup4FqdKsZK1hyR4gyanlQ2KHLg5D5afng99uIt0JCEQ2jiKF9UxFvtnb4BbJ4qpeceH+iK8v/bdg==
  LANGFUSE_SECRET_KEY: 5xQ7KMa6YMLsm+H/Pf1VmlqWq1NON5IoCxABhkUBeSck7ftsj2CmpkL2ZwrxwrktgiTUBH+3gJYBX+XBk7lqOOUpvmiLjol/E5lCqq0M1CqLWA==
  SLACK_WEBHOOK_URL: RJjhS0Hhz0/s07sCIf1OTXmTGodpK9L2K9p953Z+fOX0l2SkPFT6mB9+yIrLufmlwEaku5NNEBKy//+AG01yOd+7wV1GhK65vfj3B/gTN8t5cuVnR4vFxKY5Rx4eSGLtzyAs+aIBTp4GoNXDIjroCqfCjPkItEZWCg==
general_settings:
  alerting:
  - slack
  alerting_threshold: 300
  database_connection_pool_limit: 100
  database_connection_timeout: 60
  disable_master_key_return: true
  health_check_interval: 300
  proxy_batch_write_at: 60
  ui_access_mode: all
  # master_key: sk-1234
litellm_settings:
  allowed_fails: 3
  failure_callback:
  - prometheus
  num_retries: 3
  service_callback:
  - prometheus_system
  success_callback:
  - langfuse
  - prometheus
  - langsmith
model_list:
- litellm_params:
  - model_name: my-fake-model
    litellm_params:
    model: gpt-3.5-turbo
  model_name: gpt-3.5-turbo
- litellm_params:
    api_base: https://openai-function-calling-workers.tasslexyz.workers.dev/
    api_key: my-fake-key
    model: openai/my-fake-model
    stream_timeout: 0.001
  model_name: fake-openai-endpoint
- litellm_params:
    api_base: https://openai-function-calling-workers.tasslexyz.workers.dev/
    api_key: my-fake-key
    model: openai/my-fake-model-2
    stream_timeout: 0.001
  model_name: fake-openai-endpoint
- litellm_params:
    api_base: os.environ/AZURE_API_BASE
    api_key: os.environ/AZURE_API_KEY
    api_version: 2023-07-01-preview
    model: azure/chatgpt-v-2
    stream_timeout: 0.001
  model_name: azure-gpt-3.5
- litellm_params:
    api_key: os.environ/OPENAI_API_KEY
    model: text-embedding-ada-002
  model_name: text-embedding-ada-002
- litellm_params:
    model: text-completion-openai/gpt-3.5-turbo-instruct
  model_name: gpt-instruct
router_settings:
  enable_pre_call_checks: true
  redis_host: os.environ/REDIS_HOST
  redis_password: os.environ/REDIS_PASSWORD
  redis_port: os.environ/REDIS_PORT
      mock_response: hello-world
  - model_name: gpt-4o
    litellm_params:
      model: azure/gpt-4o
      api_base: https://litellm8397336933.openai.azure.com/
      api_key: 610f806211ab47f2a694493000045858
litellm_settings:
  content_policy_fallbacks: [{"gpt-4o": ["my-fake-model"]}]
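Assuming the proxy is started with this config and listens on the default local port, a request against the gpt-4o deployment should transparently fall back to my-fake-model when Azure's content filter trips; the base_url and API key below are placeholders:

import openai

client = openai.OpenAI(api_key="sk-1234", base_url="http://0.0.0.0:4000")

response = client.chat.completions.create(
    model="gpt-4o",
    messages=[{"role": "user", "content": "Hey, how's it going?"}],
)
print(response.model)  # expected to be served by the "my-fake-model" fallback when gpt-4o is filtered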


@@ -562,6 +562,18 @@ class Router:
            verbose_router_logger.info(
                f"litellm.completion(model={model_name})\033[32m 200 OK\033[0m"
            )

            ## CHECK CONTENT FILTER ERROR ##
            if isinstance(response, ModelResponse):
                _should_raise = self._should_raise_content_policy_error(
                    model=model, response=response, kwargs=kwargs
                )
                if _should_raise:
                    raise litellm.ContentPolicyViolationError(
                        message="Response output was blocked.",
                        model=model,
                        llm_provider="",
                    )

            return response
        except Exception as e:
            verbose_router_logger.info(
@@ -721,6 +733,18 @@
            await self.async_routing_strategy_pre_call_checks(deployment=deployment)
            response = await _response

            ## CHECK CONTENT FILTER ERROR ##
            if isinstance(response, ModelResponse):
                _should_raise = self._should_raise_content_policy_error(
                    model=model, response=response, kwargs=kwargs
                )
                if _should_raise:
                    raise litellm.ContentPolicyViolationError(
                        message="Response output was blocked.",
                        model=model,
                        llm_provider="",
                    )

            self.success_calls[model_name] += 1
            verbose_router_logger.info(
                f"litellm.acompletion(model={model_name})\033[32m 200 OK\033[0m"
@@ -2801,6 +2825,40 @@ class Router:
            # Catch all - if any exceptions default to cooling down
            return True

    def _should_raise_content_policy_error(
        self, model: str, response: ModelResponse, kwargs: dict
    ) -> bool:
        """
        Determines if a content policy error should be raised.

        Only raised if a fallback is available.

        Else, original response is returned.
        """
        if response.choices[0].finish_reason != "content_filter":
            return False

        content_policy_fallbacks = kwargs.get(
            "content_policy_fallbacks", self.content_policy_fallbacks
        )
        ### ONLY RAISE ERROR IF CP FALLBACK AVAILABLE ###
        if content_policy_fallbacks is not None:
            fallback_model_group = None
            for item in content_policy_fallbacks:  # [{"gpt-3.5-turbo": ["gpt-4"]}]
                if list(item.keys())[0] == model:
                    fallback_model_group = item[model]
                    break

            if fallback_model_group is not None:
                return True

        verbose_router_logger.info(
            "Content Policy Error occurred. No available fallbacks. Returning original response. model={}, content_policy_fallbacks={}".format(
                model, content_policy_fallbacks
            )
        )
        return False

    def _set_cooldown_deployments(
        self,
        original_exception: Any,
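Because _should_raise_content_policy_error reads content_policy_fallbacks from kwargs before falling back to the router-level setting, the mapping can presumably also be supplied per request; a small sketch, reusing the router from the example near the top of this commit:

response = router.completion(
    model="gpt-4o",
    messages=[{"role": "user", "content": "Hey, how's it going?"}],
    # per-request override; only models with a fallback group get converted to an error
    content_policy_fallbacks=[{"gpt-4o": ["my-fake-model"]}],
)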


@@ -1,8 +1,12 @@
#### What this tests ####
# This tests calling router with fallback models

import sys, os, time
import traceback, asyncio
import asyncio
import os
import sys
import time
import traceback

import pytest

sys.path.insert(
@@ -762,9 +766,11 @@ def test_ausage_based_routing_fallbacks():
    # The Request should fail azure/gpt-4-fast. Then fallback -> "azure/gpt-4-basic" -> "openai-gpt-4"
    # It should work with "openai-gpt-4"
    import os

    from dotenv import load_dotenv
    import litellm
    from litellm import Router
    from dotenv import load_dotenv

    load_dotenv()
@@ -1112,9 +1118,19 @@ async def test_client_side_fallbacks_list(sync_mode):
@pytest.mark.parametrize("sync_mode", [True, False])
@pytest.mark.parametrize("content_filter_response_exception", [True, False])
@pytest.mark.asyncio
async def test_router_content_policy_fallbacks(sync_mode):
async def test_router_content_policy_fallbacks(
    sync_mode, content_filter_response_exception
):
    os.environ["LITELLM_LOG"] = "DEBUG"

    if content_filter_response_exception:
        mock_response = Exception("content filtering policy")
    else:
        mock_response = litellm.ModelResponse(
            choices=[litellm.Choices(finish_reason="content_filter")]
        )
    router = Router(
        model_list=[
            {
@@ -1122,13 +1138,13 @@ async def test_router_content_policy_fallbacks(sync_mode):
                "litellm_params": {
                    "model": "claude-2",
                    "api_key": "",
                    "mock_response": Exception("content filtering policy"),
                    "mock_response": mock_response,
                },
            },
            {
                "model_name": "my-fallback-model",
                "litellm_params": {
                    "model": "claude-2",
                    "model": "openai/my-fake-model",
                    "api_key": "",
                    "mock_response": "This works!",
                },
@@ -1165,3 +1181,5 @@ async def test_router_content_policy_fallbacks(sync_mode):
            model="claude-2",
            messages=[{"role": "user", "content": "Hey, how's it going?"}],
        )

    assert response.model == "my-fake-model"


@@ -12,6 +12,7 @@ from pydantic import BaseModel, ConfigDict, Field
from .completion import CompletionRequest
from .embedding import EmbeddingRequest
from .utils import ModelResponse


class ModelConfig(BaseModel):
@@ -315,7 +316,7 @@ class LiteLLMParamsTypedDict(TypedDict, total=False):
    input_cost_per_second: Optional[float]
    output_cost_per_second: Optional[float]
    ## MOCK RESPONSES ##
    mock_response: Optional[str]
    mock_response: Optional[Union[str, ModelResponse, Exception]]


class DeploymentTypedDict(TypedDict):
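With the widened mock_response type, a model_list entry can carry a full ModelResponse (or an Exception) for testing; a short sketch with illustrative names, assuming the typed dicts are importable from litellm.types.router:

import litellm
from litellm.types.router import DeploymentTypedDict

deployment: DeploymentTypedDict = {
    "model_name": "azure-content-filtered",
    "litellm_params": {
        "model": "azure/gpt-4o",
        "api_key": "",
        # a mocked Azure content-filter response, now accepted by the type
        "mock_response": litellm.ModelResponse(
            choices=[litellm.Choices(finish_reason="content_filter")]
        ),
    },
}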