Mirror of https://github.com/BerriAI/litellm.git
Litellm dev 01 27 2025 p3 (#8047)
* docs(reliability.md): add doc on disabling fallbacks per request
* feat(litellm_pre_call_utils.py): support reading request timeout from request headers - new `x-litellm-timeout` param. Allows setting dynamic model timeouts from vercel's AI sdk
* test(test_proxy_server.py): add simple unit test for reading request timeout
* test(test_fallbacks.py): add e2e test to confirm timeout passed in request headers is correctly read
* feat(main.py): support passing metadata to openai in preview. Resolves https://github.com/BerriAI/litellm/issues/6022#issuecomment-2616119371
* fix(main.py): fix passing openai metadata
* docs(request_headers.md): document new request headers
* build: Merge branch 'main' into litellm_dev_01_27_2025_p3
* test: loosen test
Parent: 9c20c69915
Commit: d9eb8f42ff
11 changed files with 187 additions and 3 deletions
@@ -1007,7 +1007,34 @@ curl -L -X POST 'http://0.0.0.0:4000/v1/chat/completions' \
 }'
 ```
 
-### Disable Fallbacks per key
+### Disable Fallbacks (Per Request/Key)
 
+
+<Tabs>
+
+<TabItem value="request" label="Per Request">
+
+You can disable fallbacks for a specific request by setting `disable_fallbacks: true` in your request body.
+
+```bash
+curl -L -X POST 'http://0.0.0.0:4000/v1/chat/completions' \
+-H 'Content-Type: application/json' \
+-H 'Authorization: Bearer sk-1234' \
+-d '{
+    "messages": [
+        {
+            "role": "user",
+            "content": "List 5 important events in the XIX century"
+        }
+    ],
+    "model": "gpt-3.5-turbo",
+    "disable_fallbacks": true # 👈 DISABLE FALLBACKS
+}'
+```
+
+</TabItem>
+
+<TabItem value="key" label="Per Key">
+
 You can disable fallbacks per key by setting `disable_fallbacks: true` in your key metadata.
 
@@ -1020,4 +1047,7 @@ curl -L -X POST 'http://0.0.0.0:4000/key/generate' \
         "disable_fallbacks": true
     }
 }'
 ```
+
+</TabItem>
+</Tabs>
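
For readers following the docs above, here is a minimal Python sketch of both options. It is not part of the commit: it assumes a LiteLLM proxy running at `http://0.0.0.0:4000` with the `sk-1234` master key used throughout these docs; the endpoints and the `disable_fallbacks` field come straight from the curl examples above.

```python
# Minimal sketch (not from the commit): exercising both disable-fallbacks options
# against a local LiteLLM proxy. Assumes http://0.0.0.0:4000 and master key sk-1234.
import requests

BASE_URL = "http://0.0.0.0:4000"
HEADERS = {"Authorization": "Bearer sk-1234", "Content-Type": "application/json"}

# Per request: pass disable_fallbacks directly in the request body.
chat = requests.post(
    f"{BASE_URL}/v1/chat/completions",
    headers=HEADERS,
    json={
        "model": "gpt-3.5-turbo",
        "messages": [{"role": "user", "content": "List 5 important events in the XIX century"}],
        "disable_fallbacks": True,
    },
)
print(chat.json())

# Per key: bake disable_fallbacks into the key's metadata at /key/generate time.
key = requests.post(
    f"{BASE_URL}/key/generate",
    headers=HEADERS,
    json={"metadata": {"disable_fallbacks": True}},
)
print(key.json())
```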
docs/my-website/docs/proxy/request_headers.md (new file, +12)
@@ -0,0 +1,12 @@
+# Request Headers
+
+Special headers that are supported by LiteLLM.
+
+## LiteLLM Headers
+
+`x-litellm-timeout` Optional[float]: The timeout for the request in seconds.
+
+## Anthropic Headers
+
+`anthropic-version` Optional[str]: The version of the Anthropic API to use.
+`anthropic-beta` Optional[str]: The beta version of the Anthropic API to use.
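
A minimal sketch of setting the new `x-litellm-timeout` header from Python (not part of the commit; it assumes the same local proxy at `http://0.0.0.0:4000` and key `sk-1234` used elsewhere in these docs, while the header name and semantics come from the file above):

```python
# Minimal sketch (not from the commit): send the new x-litellm-timeout header
# with a proxy request; the proxy reads it and uses it as the request timeout.
import requests

resp = requests.post(
    "http://0.0.0.0:4000/v1/chat/completions",
    headers={
        "Authorization": "Bearer sk-1234",
        "Content-Type": "application/json",
        "x-litellm-timeout": "90",  # timeout for this request, in seconds
    },
    json={
        "model": "gpt-3.5-turbo",
        "messages": [{"role": "user", "content": "Hello"}],
    },
)
print(resp.status_code)
```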
@@ -66,6 +66,7 @@ const sidebars = {
             "proxy/user_keys",
             "proxy/clientside_auth",
             "proxy/response_headers",
+            "proxy/request_headers",
           ],
         },
         {
@@ -75,6 +75,7 @@ from litellm.utils import (
     CustomStreamWrapper,
     ProviderConfigManager,
     Usage,
+    add_openai_metadata,
     async_mock_completion_streaming_obj,
     convert_to_model_response_object,
     create_pretrained_tokenizer,
@@ -1617,6 +1618,11 @@ def completion(  # type: ignore  # noqa: PLR0915
         if extra_headers is not None:
             optional_params["extra_headers"] = extra_headers
 
+        if (
+            litellm.enable_preview_features and metadata is not None
+        ):  # [PREVIEW] allow metadata to be passed to OPENAI
+            optional_params["metadata"] = add_openai_metadata(metadata)
+
         ## LOAD CONFIG - if set
         config = litellm.OpenAIConfig.get_config()
         for k, v in config.items():
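
The hunk above only forwards `metadata` when `litellm.enable_preview_features` is on. A minimal usage sketch (not part of the commit; it mirrors the `test_completion_openai_metadata` test added later in this diff, with illustrative model and metadata values):

```python
# Minimal sketch (not from the commit): opt in to preview features so that
# `metadata` is forwarded to OpenAI via add_openai_metadata().
import litellm

litellm.enable_preview_features = True  # [PREVIEW] off by default

response = litellm.completion(
    model="openai/gpt-3.5-turbo",
    messages=[{"role": "user", "content": "Hello world"}],
    metadata={"my-test-key": "my-test-value"},  # "hidden_params" would be stripped
)
print(response)
```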
@@ -13,4 +13,4 @@ model_list:
   - model_name: deepseek/*
     litellm_params:
       model: deepseek/*
       api_key: os.environ/DEEPSEEK_API_KEY
@@ -2204,6 +2204,7 @@ class SpecialHeaders(enum.Enum):
 class LitellmDataForBackendLLMCall(TypedDict, total=False):
     headers: dict
     organization: str
+    timeout: Optional[float]
 
 
 class JWTKeyItem(TypedDict, total=False):
@@ -181,6 +181,31 @@ def clean_headers(
 
 
 class LiteLLMProxyRequestSetup:
+    @staticmethod
+    def _get_timeout_from_request(headers: dict) -> Optional[float]:
+        """
+        Workaround for client request from Vercel's AI SDK.
+
+        Allows the user to set a timeout in the request headers.
+
+        Example:
+
+        ```js
+        const openaiProvider = createOpenAI({
+            baseURL: liteLLM.baseURL,
+            apiKey: liteLLM.apiKey,
+            compatibility: "compatible",
+            headers: {
+                "x-litellm-timeout": "90"
+            },
+        });
+        ```
+        """
+        timeout_header = headers.get("x-litellm-timeout", None)
+        if timeout_header is not None:
+            return float(timeout_header)
+        return None
+
     @staticmethod
     def _get_forwardable_headers(
         headers: Union[Headers, dict],
@@ -267,6 +292,11 @@ class LiteLLMProxyRequestSetup:
         )
         if _organization is not None:
             data["organization"] = _organization
+
+        timeout = LiteLLMProxyRequestSetup._get_timeout_from_request(headers)
+        if timeout is not None:
+            data["timeout"] = timeout
+
         return data
 
     @staticmethod
@@ -6206,3 +6206,21 @@ def get_non_default_completion_params(kwargs: dict) -> dict:
         k: v for k, v in kwargs.items() if k not in default_params
     }  # model-specific params - pass them straight to the model/provider
     return non_default_params
+
+
+def add_openai_metadata(metadata: dict) -> dict:
+    """
+    Add metadata to openai optional parameters, excluding hidden params
+
+    Args:
+        params (dict): Dictionary of API parameters
+        metadata (dict, optional): Metadata to include in the request
+
+    Returns:
+        dict: Updated parameters dictionary with visible metadata only
+    """
+    if metadata is None:
+        return None
+    # Only include non-hidden parameters
+    visible_metadata = {k: v for k, v in metadata.items() if k != "hidden_params"}
+    return visible_metadata.copy()
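
A quick illustration of what the helper above does (hypothetical values; the import path matches the `from litellm.utils import (... add_openai_metadata ...)` change earlier in this commit):

```python
# Illustration (not from the commit): add_openai_metadata drops the internal
# "hidden_params" key and returns a copy of the remaining, user-visible metadata.
from litellm.utils import add_openai_metadata

meta = {"my-test-key": "my-test-value", "hidden_params": {"internal": "do-not-send"}}
print(add_openai_metadata(meta))  # {'my-test-key': 'my-test-value'}
```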
@@ -4582,3 +4582,37 @@ def test_provider_specific_header(custom_llm_provider, expected_result):
     mock_post.assert_called_once()
     print(mock_post.call_args.kwargs["headers"])
     assert "anthropic-beta" in mock_post.call_args.kwargs["headers"]
+
+
+@pytest.mark.parametrize(
+    "enable_preview_features",
+    [True, False],
+)
+def test_completion_openai_metadata(monkeypatch, enable_preview_features):
+    from openai import OpenAI
+
+    client = OpenAI()
+
+    litellm.set_verbose = True
+
+    monkeypatch.setattr(litellm, "enable_preview_features", enable_preview_features)
+    with patch.object(
+        client.chat.completions.with_raw_response, "create", return_value=MagicMock()
+    ) as mock_completion:
+        try:
+            resp = litellm.completion(
+                model="openai/gpt-3.5-turbo",
+                messages=[{"role": "user", "content": "Hello world"}],
+                metadata={"my-test-key": "my-test-value"},
+                client=client,
+            )
+        except Exception as e:
+            print(f"Error: {e}")
+
+        mock_completion.assert_called_once()
+        if enable_preview_features:
+            assert mock_completion.call_args.kwargs["metadata"] == {
+                "my-test-key": "my-test-value"
+            }
+        else:
+            assert "metadata" not in mock_completion.call_args.kwargs
@@ -2190,3 +2190,19 @@ async def test_get_ui_settings_spend_logs_threshold():
 
     # Clean up
     proxy_state.set_proxy_state_variable("spend_logs_row_count", 0)
+
+
+def test_get_timeout_from_request():
+    from litellm.proxy.litellm_pre_call_utils import LiteLLMProxyRequestSetup
+
+    headers = {
+        "x-litellm-timeout": "90",
+    }
+    timeout = LiteLLMProxyRequestSetup._get_timeout_from_request(headers)
+    assert timeout == 90
+
+    headers = {
+        "x-litellm-timeout": "90.5",
+    }
+    timeout = LiteLLMProxyRequestSetup._get_timeout_from_request(headers)
+    assert timeout == 90.5
@@ -5,6 +5,7 @@ import asyncio
 import aiohttp
 from large_text import text
 import time
+from typing import Optional
 
 
 async def generate_key(
@@ -44,6 +45,7 @@ async def chat_completion(
     model: str,
     messages: list,
     return_headers: bool = False,
+    extra_headers: Optional[dict] = None,
     **kwargs,
 ):
     url = "http://0.0.0.0:4000/chat/completions"
@@ -51,6 +53,8 @@ async def chat_completion(
         "Authorization": f"Bearer {key}",
         "Content-Type": "application/json",
     }
+    if extra_headers is not None:
+        headers.update(extra_headers)
     data = {"model": model, "messages": messages, **kwargs}
 
     async with session.post(url, headers=headers, json=data) as response:
@@ -180,6 +184,38 @@ async def test_chat_completion_with_timeout():
     )  # assert model-specific timeout used
 
 
+@pytest.mark.asyncio
+async def test_chat_completion_with_timeout_from_request():
+    """
+    Make a chat completion call with a low timeout and `mock_timeout: true`. Expect it to fail and the correct timeout to be set in the response headers.
+    """
+    async with aiohttp.ClientSession() as session:
+        model = "fake-openai-endpoint-5"
+        messages = [
+            {"role": "system", "content": text},
+            {"role": "user", "content": "Who was Alexander?"},
+        ]
+        extra_headers = {
+            "x-litellm-timeout": "0.001",
+        }
+        start_time = time.time()
+        response, headers = await chat_completion(
+            session=session,
+            key="sk-1234",
+            model=model,
+            messages=messages,
+            num_retries=0,
+            mock_timeout=True,
+            extra_headers=extra_headers,
+            return_headers=True,
+        )
+        end_time = time.time()
+        print(f"headers: {headers}")
+        assert (
+            headers["x-litellm-timeout"] == "0.001"
+        )  # assert model-specific timeout used
+
+
 @pytest.mark.parametrize("has_access", [True, False])
 @pytest.mark.asyncio
 async def test_chat_completion_client_fallbacks_with_custom_message(has_access):