Mirror of https://github.com/BerriAI/litellm.git
Litellm dev 01 27 2025 p3 (#8047)
* docs(reliability.md): add doc on disabling fallbacks per request
* feat(litellm_pre_call_utils.py): support reading request timeout from request headers - new `x-litellm-timeout` param. Allows setting dynamic model timeouts from vercel's AI sdk
* test(test_proxy_server.py): add simple unit test for reading request timeout
* test(test_fallbacks.py): add e2e test to confirm timeout passed in request headers is correctly read
* feat(main.py): support passing metadata to openai in preview. Resolves https://github.com/BerriAI/litellm/issues/6022#issuecomment-2616119371
* fix(main.py): fix passing openai metadata
* docs(request_headers.md): document new request headers
* build: Merge branch 'main' into litellm_dev_01_27_2025_p3
* test: loosen test
Parent: 9c20c69915
Commit: d9eb8f42ff
11 changed files with 187 additions and 3 deletions
@@ -1007,7 +1007,34 @@ curl -L -X POST 'http://0.0.0.0:4000/v1/chat/completions' \
 }'
 ```
 
-### Disable Fallbacks per key
+### Disable Fallbacks (Per Request/Key)
 
+
+<Tabs>
+
+<TabItem value="request" label="Per Request">
+
+You can disable fallbacks for a specific request by setting `disable_fallbacks: true` in your request body.
+
+```bash
+curl -L -X POST 'http://0.0.0.0:4000/v1/chat/completions' \
+-H 'Content-Type: application/json' \
+-H 'Authorization: Bearer sk-1234' \
+-d '{
+    "messages": [
+        {
+            "role": "user",
+            "content": "List 5 important events in the XIX century"
+        }
+    ],
+    "model": "gpt-3.5-turbo",
+    "disable_fallbacks": true # 👈 DISABLE FALLBACKS
+}'
+```
+
+</TabItem>
+
+<TabItem value="key" label="Per Key">
+
 You can disable fallbacks per key by setting `disable_fallbacks: true` in your key metadata.
 
@@ -1020,4 +1047,7 @@ curl -L -X POST 'http://0.0.0.0:4000/key/generate' \
         "disable_fallbacks": true
     }
 }'
 ```
+
+</TabItem>
+</Tabs>
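
For readers following the docs above, here is a minimal Python sketch of both options. It is not part of the commit: it assumes a LiteLLM proxy running at `http://0.0.0.0:4000` with the `sk-1234` master key used throughout these docs; the endpoints and the `disable_fallbacks` field come straight from the curl examples above.

```python
# Minimal sketch (not from the commit): exercising both disable-fallbacks options
# against a local LiteLLM proxy. Assumes http://0.0.0.0:4000 and master key sk-1234.
import requests

BASE_URL = "http://0.0.0.0:4000"
HEADERS = {"Authorization": "Bearer sk-1234", "Content-Type": "application/json"}

# Per request: pass disable_fallbacks directly in the request body.
chat = requests.post(
    f"{BASE_URL}/v1/chat/completions",
    headers=HEADERS,
    json={
        "model": "gpt-3.5-turbo",
        "messages": [{"role": "user", "content": "List 5 important events in the XIX century"}],
        "disable_fallbacks": True,
    },
)
print(chat.json())

# Per key: bake disable_fallbacks into the key's metadata at /key/generate time.
key = requests.post(
    f"{BASE_URL}/key/generate",
    headers=HEADERS,
    json={"metadata": {"disable_fallbacks": True}},
)
print(key.json())
```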
docs/my-website/docs/proxy/request_headers.md (new file, +12)
@@ -0,0 +1,12 @@
+# Request Headers
+
+Special headers that are supported by LiteLLM.
+
+## LiteLLM Headers
+
+`x-litellm-timeout` Optional[float]: The timeout for the request in seconds.
+
+## Anthropic Headers
+
+`anthropic-version` Optional[str]: The version of the Anthropic API to use.
+`anthropic-beta` Optional[str]: The beta version of the Anthropic API to use.
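
A minimal sketch of setting the new `x-litellm-timeout` header from Python (not part of the commit; it assumes the same local proxy at `http://0.0.0.0:4000` and key `sk-1234` used elsewhere in these docs, while the header name and semantics come from the file above):

```python
# Minimal sketch (not from the commit): send the new x-litellm-timeout header
# with a proxy request; the proxy reads it and uses it as the request timeout.
import requests

resp = requests.post(
    "http://0.0.0.0:4000/v1/chat/completions",
    headers={
        "Authorization": "Bearer sk-1234",
        "Content-Type": "application/json",
        "x-litellm-timeout": "90",  # timeout for this request, in seconds
    },
    json={
        "model": "gpt-3.5-turbo",
        "messages": [{"role": "user", "content": "Hello"}],
    },
)
print(resp.status_code)
```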
@@ -66,6 +66,7 @@ const sidebars = {
             "proxy/user_keys",
             "proxy/clientside_auth",
             "proxy/response_headers",
+            "proxy/request_headers",
           ],
         },
         {
@@ -75,6 +75,7 @@ from litellm.utils import (
     CustomStreamWrapper,
     ProviderConfigManager,
     Usage,
+    add_openai_metadata,
     async_mock_completion_streaming_obj,
     convert_to_model_response_object,
     create_pretrained_tokenizer,
@@ -1617,6 +1618,11 @@ def completion(  # type: ignore  # noqa: PLR0915
         if extra_headers is not None:
             optional_params["extra_headers"] = extra_headers
 
+        if (
+            litellm.enable_preview_features and metadata is not None
+        ):  # [PREVIEW] allow metadata to be passed to OPENAI
+            optional_params["metadata"] = add_openai_metadata(metadata)
+
         ## LOAD CONFIG - if set
         config = litellm.OpenAIConfig.get_config()
         for k, v in config.items():
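
The hunk above only forwards `metadata` when `litellm.enable_preview_features` is on. A minimal usage sketch (not part of the commit; it mirrors the `test_completion_openai_metadata` test added later in this diff, with illustrative model and metadata values):

```python
# Minimal sketch (not from the commit): opt in to preview features so that
# `metadata` is forwarded to OpenAI via add_openai_metadata().
import litellm

litellm.enable_preview_features = True  # [PREVIEW] off by default

response = litellm.completion(
    model="openai/gpt-3.5-turbo",
    messages=[{"role": "user", "content": "Hello world"}],
    metadata={"my-test-key": "my-test-value"},  # "hidden_params" would be stripped
)
print(response)
```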
@@ -13,4 +13,4 @@ model_list:
   - model_name: deepseek/*
     litellm_params:
       model: deepseek/*
       api_key: os.environ/DEEPSEEK_API_KEY
@@ -2204,6 +2204,7 @@ class SpecialHeaders(enum.Enum):
 class LitellmDataForBackendLLMCall(TypedDict, total=False):
     headers: dict
     organization: str
+    timeout: Optional[float]
 
 
 class JWTKeyItem(TypedDict, total=False):
@@ -181,6 +181,31 @@ def clean_headers(
 
 
 class LiteLLMProxyRequestSetup:
+    @staticmethod
+    def _get_timeout_from_request(headers: dict) -> Optional[float]:
+        """
+        Workaround for client request from Vercel's AI SDK.
+
+        Allows the user to set a timeout in the request headers.
+
+        Example:
+
+        ```js
+        const openaiProvider = createOpenAI({
+            baseURL: liteLLM.baseURL,
+            apiKey: liteLLM.apiKey,
+            compatibility: "compatible",
+            headers: {
+                "x-litellm-timeout": "90"
+            },
+        });
+        ```
+        """
+        timeout_header = headers.get("x-litellm-timeout", None)
+        if timeout_header is not None:
+            return float(timeout_header)
+        return None
+
     @staticmethod
     def _get_forwardable_headers(
         headers: Union[Headers, dict],
@@ -267,6 +292,11 @@ class LiteLLMProxyRequestSetup:
         )
         if _organization is not None:
             data["organization"] = _organization
+
+        timeout = LiteLLMProxyRequestSetup._get_timeout_from_request(headers)
+        if timeout is not None:
+            data["timeout"] = timeout
+
         return data
 
     @staticmethod
@@ -6206,3 +6206,21 @@ def get_non_default_completion_params(kwargs: dict) -> dict:
         k: v for k, v in kwargs.items() if k not in default_params
     }  # model-specific params - pass them straight to the model/provider
     return non_default_params
+
+
+def add_openai_metadata(metadata: dict) -> dict:
+    """
+    Add metadata to openai optional parameters, excluding hidden params
+
+    Args:
+        params (dict): Dictionary of API parameters
+        metadata (dict, optional): Metadata to include in the request
+
+    Returns:
+        dict: Updated parameters dictionary with visible metadata only
+    """
+    if metadata is None:
+        return None
+    # Only include non-hidden parameters
+    visible_metadata = {k: v for k, v in metadata.items() if k != "hidden_params"}
+    return visible_metadata.copy()
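
A quick illustration of what the helper above does (hypothetical values; the import path matches the `from litellm.utils import (... add_openai_metadata ...)` change earlier in this commit):

```python
# Illustration (not from the commit): add_openai_metadata drops the internal
# "hidden_params" key and returns a copy of the remaining, user-visible metadata.
from litellm.utils import add_openai_metadata

meta = {"my-test-key": "my-test-value", "hidden_params": {"internal": "do-not-send"}}
print(add_openai_metadata(meta))  # {'my-test-key': 'my-test-value'}
```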
@@ -4582,3 +4582,37 @@ def test_provider_specific_header(custom_llm_provider, expected_result):
     mock_post.assert_called_once()
     print(mock_post.call_args.kwargs["headers"])
     assert "anthropic-beta" in mock_post.call_args.kwargs["headers"]
+
+
+@pytest.mark.parametrize(
+    "enable_preview_features",
+    [True, False],
+)
+def test_completion_openai_metadata(monkeypatch, enable_preview_features):
+    from openai import OpenAI
+
+    client = OpenAI()
+
+    litellm.set_verbose = True
+
+    monkeypatch.setattr(litellm, "enable_preview_features", enable_preview_features)
+    with patch.object(
+        client.chat.completions.with_raw_response, "create", return_value=MagicMock()
+    ) as mock_completion:
+        try:
+            resp = litellm.completion(
+                model="openai/gpt-3.5-turbo",
+                messages=[{"role": "user", "content": "Hello world"}],
+                metadata={"my-test-key": "my-test-value"},
+                client=client,
+            )
+        except Exception as e:
+            print(f"Error: {e}")
+
+        mock_completion.assert_called_once()
+        if enable_preview_features:
+            assert mock_completion.call_args.kwargs["metadata"] == {
+                "my-test-key": "my-test-value"
+            }
+        else:
+            assert "metadata" not in mock_completion.call_args.kwargs
@@ -2190,3 +2190,19 @@ async def test_get_ui_settings_spend_logs_threshold():
 
     # Clean up
     proxy_state.set_proxy_state_variable("spend_logs_row_count", 0)
+
+
+def test_get_timeout_from_request():
+    from litellm.proxy.litellm_pre_call_utils import LiteLLMProxyRequestSetup
+
+    headers = {
+        "x-litellm-timeout": "90",
+    }
+    timeout = LiteLLMProxyRequestSetup._get_timeout_from_request(headers)
+    assert timeout == 90
+
+    headers = {
+        "x-litellm-timeout": "90.5",
+    }
+    timeout = LiteLLMProxyRequestSetup._get_timeout_from_request(headers)
+    assert timeout == 90.5
@@ -5,6 +5,7 @@ import asyncio
 import aiohttp
 from large_text import text
 import time
+from typing import Optional
 
 
 async def generate_key(
@@ -44,6 +45,7 @@ async def chat_completion(
     model: str,
     messages: list,
     return_headers: bool = False,
+    extra_headers: Optional[dict] = None,
     **kwargs,
 ):
     url = "http://0.0.0.0:4000/chat/completions"
@@ -51,6 +53,8 @@ async def chat_completion(
         "Authorization": f"Bearer {key}",
         "Content-Type": "application/json",
     }
+    if extra_headers is not None:
+        headers.update(extra_headers)
     data = {"model": model, "messages": messages, **kwargs}
 
     async with session.post(url, headers=headers, json=data) as response:
@@ -180,6 +184,38 @@ async def test_chat_completion_with_timeout():
     )  # assert model-specific timeout used
 
 
+@pytest.mark.asyncio
+async def test_chat_completion_with_timeout_from_request():
+    """
+    Make a chat completion call with a low timeout and `mock_timeout: true`. Expect it to fail and the correct timeout to be set in the response headers.
+    """
+    async with aiohttp.ClientSession() as session:
+        model = "fake-openai-endpoint-5"
+        messages = [
+            {"role": "system", "content": text},
+            {"role": "user", "content": "Who was Alexander?"},
+        ]
+        extra_headers = {
+            "x-litellm-timeout": "0.001",
+        }
+        start_time = time.time()
+        response, headers = await chat_completion(
+            session=session,
+            key="sk-1234",
+            model=model,
+            messages=messages,
+            num_retries=0,
+            mock_timeout=True,
+            extra_headers=extra_headers,
+            return_headers=True,
+        )
+        end_time = time.time()
+        print(f"headers: {headers}")
+        assert (
+            headers["x-litellm-timeout"] == "0.001"
+        )  # assert model-specific timeout used
+
+
 @pytest.mark.parametrize("has_access", [True, False])
 @pytest.mark.asyncio
 async def test_chat_completion_client_fallbacks_with_custom_message(has_access):