diff --git a/docs/my-website/docs/proxy/reliability.md b/docs/my-website/docs/proxy/reliability.md
index 489f4e2ef1..654c2618c2 100644
--- a/docs/my-website/docs/proxy/reliability.md
+++ b/docs/my-website/docs/proxy/reliability.md
@@ -1007,7 +1007,34 @@ curl -L -X POST 'http://0.0.0.0:4000/v1/chat/completions' \
 }'
 ```
-### Disable Fallbacks per key
+### Disable Fallbacks (Per Request/Key)
+
+<Tabs>
+
+<TabItem value="request" label="Per Request">
+
+You can disable fallbacks per request by setting `disable_fallbacks: true` in your request body.
+
+```bash
+curl -L -X POST 'http://0.0.0.0:4000/v1/chat/completions' \
+-H 'Content-Type: application/json' \
+-H 'Authorization: Bearer sk-1234' \
+-d '{
+    "messages": [
+        {
+            "role": "user",
+            "content": "List 5 important events in the XIX century"
+        }
+    ],
+    "model": "gpt-3.5-turbo",
+    "disable_fallbacks": true # 👈 DISABLE FALLBACKS
+}'
+```
+
+</TabItem>
+
+<TabItem value="key" label="Per Key">
 
 You can disable fallbacks per key by setting `disable_fallbacks: true` in your key metadata.
 
@@ -1020,4 +1047,7 @@ curl -L -X POST 'http://0.0.0.0:4000/key/generate' \
 -H 'Authorization: Bearer sk-1234' \
 -H 'Content-Type: application/json' \
 -d '{
     "key_alias": "ishaan-disable-fallbacks-key",
     "metadata": {
         "disable_fallbacks": true
     }
 }'
-```
\ No newline at end of file
+```
+
+</TabItem>
+</Tabs>
\ No newline at end of file
diff --git a/docs/my-website/docs/proxy/request_headers.md b/docs/my-website/docs/proxy/request_headers.md
new file mode 100644
index 0000000000..d3ccb54435
--- /dev/null
+++ b/docs/my-website/docs/proxy/request_headers.md
@@ -0,0 +1,12 @@
+# Request Headers
+
+Special headers that are supported by LiteLLM.
+
+## LiteLLM Headers
+
+`x-litellm-timeout` Optional[float]: The timeout for the request in seconds.
+
+## Anthropic Headers
+
+`anthropic-version` Optional[str]: The version of the Anthropic API to use.
+`anthropic-beta` Optional[str]: The beta version of the Anthropic API to use.
\ No newline at end of file
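
The new `x-litellm-timeout` header documented above can be set from any OpenAI-compatible client. A minimal sketch using the openai Python SDK — the proxy URL and key are placeholder assumptions, not part of this patch:

```python
from openai import OpenAI

client = OpenAI(
    base_url="http://0.0.0.0:4000",  # assumed LiteLLM proxy address
    api_key="sk-1234",               # assumed virtual key
)

response = client.chat.completions.create(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "Hello"}],
    # Per-request proxy timeout in seconds; the proxy parses this as a float.
    extra_headers={"x-litellm-timeout": "30"},
)
print(response.choices[0].message.content)
```
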
diff --git a/docs/my-website/sidebars.js b/docs/my-website/sidebars.js
index d20f2a73e4..b4c9b13b31 100644
--- a/docs/my-website/sidebars.js
+++ b/docs/my-website/sidebars.js
@@ -66,6 +66,7 @@ const sidebars = {
         "proxy/user_keys",
         "proxy/clientside_auth",
         "proxy/response_headers",
+        "proxy/request_headers",
       ],
     },
     {
diff --git a/litellm/main.py b/litellm/main.py
index 0056f4751d..ec4e43fd94 100644
--- a/litellm/main.py
+++ b/litellm/main.py
@@ -75,6 +75,7 @@ from litellm.utils import (
     CustomStreamWrapper,
     ProviderConfigManager,
     Usage,
+    add_openai_metadata,
     async_mock_completion_streaming_obj,
     convert_to_model_response_object,
     create_pretrained_tokenizer,
@@ -1617,6 +1618,11 @@ def completion(  # type: ignore # noqa: PLR0915
             if extra_headers is not None:
                 optional_params["extra_headers"] = extra_headers
 
+            if (
+                litellm.enable_preview_features and metadata is not None
+            ):  # [PREVIEW] allow metadata to be passed to OPENAI
+                optional_params["metadata"] = add_openai_metadata(metadata)
+
             ## LOAD CONFIG - if set
             config = litellm.OpenAIConfig.get_config()
             for k, v in config.items():
diff --git a/litellm/proxy/_new_secret_config.yaml b/litellm/proxy/_new_secret_config.yaml
index 423032ac86..321e8b676f 100644
--- a/litellm/proxy/_new_secret_config.yaml
+++ b/litellm/proxy/_new_secret_config.yaml
@@ -13,4 +13,4 @@ model_list:
   - model_name: deepseek/*
     litellm_params:
       model: deepseek/*
-      api_key: os.environ/DEEPSEEK_API_KEY
+      api_key: os.environ/DEEPSEEK_API_KEY
\ No newline at end of file
diff --git a/litellm/proxy/_types.py b/litellm/proxy/_types.py
index bf3f6b6543..bf13d178d4 100644
--- a/litellm/proxy/_types.py
+++ b/litellm/proxy/_types.py
@@ -2204,6 +2204,7 @@ class SpecialHeaders(enum.Enum):
 class LitellmDataForBackendLLMCall(TypedDict, total=False):
     headers: dict
     organization: str
+    timeout: Optional[float]
 
 
 class JWTKeyItem(TypedDict, total=False):
diff --git a/litellm/proxy/litellm_pre_call_utils.py b/litellm/proxy/litellm_pre_call_utils.py
index 1933bfb016..b913c238db 100644
--- a/litellm/proxy/litellm_pre_call_utils.py
+++ b/litellm/proxy/litellm_pre_call_utils.py
@@ -181,6 +181,31 @@ def clean_headers(
 
 
 class LiteLLMProxyRequestSetup:
+    @staticmethod
+    def _get_timeout_from_request(headers: dict) -> Optional[float]:
+        """
+        Workaround for client requests from Vercel's AI SDK.
+
+        Allows the user to set a timeout in the request headers.
+
+        Example:
+
+        ```js
+        const openaiProvider = createOpenAI({
+            baseURL: liteLLM.baseURL,
+            apiKey: liteLLM.apiKey,
+            compatibility: "compatible",
+            headers: {
+                "x-litellm-timeout": "90"
+            },
+        });
+        ```
+        """
+        timeout_header = headers.get("x-litellm-timeout", None)
+        if timeout_header is not None:
+            return float(timeout_header)
+        return None
+
     @staticmethod
     def _get_forwardable_headers(
         headers: Union[Headers, dict],
@@ -267,6 +292,11 @@ class LiteLLMProxyRequestSetup:
         )
         if _organization is not None:
             data["organization"] = _organization
+
+        timeout = LiteLLMProxyRequestSetup._get_timeout_from_request(headers)
+        if timeout is not None:
+            data["timeout"] = timeout
+
         return data
 
     @staticmethod
diff --git a/litellm/utils.py b/litellm/utils.py
index 92d6dc37db..c9a3b77cc2 100644
--- a/litellm/utils.py
+++ b/litellm/utils.py
@@ -6206,3 +6206,21 @@ def get_non_default_completion_params(kwargs: dict) -> dict:
         k: v for k, v in kwargs.items() if k not in default_params
     }  # model-specific params - pass them straight to the model/provider
     return non_default_params
+
+
+def add_openai_metadata(metadata: dict) -> Optional[dict]:
+    """
+    Add metadata to openai optional parameters, excluding hidden params.
+
+    Args:
+        metadata (dict): Metadata to include in the request;
+            the internal "hidden_params" key is stripped out
+
+    Returns:
+        Optional[dict]: Copy of the metadata with visible keys only, or None
+    """
+    if metadata is None:
+        return None
+    # Only include non-hidden parameters
+    visible_metadata = {k: v for k, v in metadata.items() if k != "hidden_params"}
+    return visible_metadata.copy()
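
Taken together, the `main.py` and `utils.py` changes above gate metadata forwarding behind `litellm.enable_preview_features` and strip LiteLLM's internal `hidden_params` key before anything reaches OpenAI. A small sketch of the intended behavior — the `trace_id` key and the `hidden_params` contents are illustrative placeholders:

```python
import litellm
from litellm.utils import add_openai_metadata

# Internal bookkeeping under "hidden_params" is stripped; other keys pass through.
metadata = {"trace_id": "abc-123", "hidden_params": {"model_id": "internal"}}
assert add_openai_metadata(metadata) == {"trace_id": "abc-123"}

# completion() only forwards metadata to OpenAI when preview features are enabled.
litellm.enable_preview_features = True
# litellm.completion(model="openai/gpt-3.5-turbo", messages=[...], metadata=metadata)
# would now send {"trace_id": "abc-123"} as the OpenAI `metadata` parameter.
```
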
diff --git a/tests/local_testing/test_completion.py b/tests/local_testing/test_completion.py
index b1aeeb98a2..dbee6e5194 100644
--- a/tests/local_testing/test_completion.py
+++ b/tests/local_testing/test_completion.py
@@ -4582,3 +4582,37 @@ def test_provider_specific_header(custom_llm_provider, expected_result):
     mock_post.assert_called_once()
     print(mock_post.call_args.kwargs["headers"])
     assert "anthropic-beta" in mock_post.call_args.kwargs["headers"]
+
+
+@pytest.mark.parametrize(
+    "enable_preview_features",
+    [True, False],
+)
+def test_completion_openai_metadata(monkeypatch, enable_preview_features):
+    from openai import OpenAI
+
+    client = OpenAI()
+
+    litellm.set_verbose = True
+
+    monkeypatch.setattr(litellm, "enable_preview_features", enable_preview_features)
+    with patch.object(
+        client.chat.completions.with_raw_response, "create", return_value=MagicMock()
+    ) as mock_completion:
+        try:
+            resp = litellm.completion(
+                model="openai/gpt-3.5-turbo",
+                messages=[{"role": "user", "content": "Hello world"}],
+                metadata={"my-test-key": "my-test-value"},
+                client=client,
+            )
+        except Exception as e:
+            print(f"Error: {e}")
+
+        mock_completion.assert_called_once()
+        if enable_preview_features:
+            assert mock_completion.call_args.kwargs["metadata"] == {
+                "my-test-key": "my-test-value"
+            }
+        else:
+            assert "metadata" not in mock_completion.call_args.kwargs
diff --git a/tests/proxy_unit_tests/test_proxy_server.py b/tests/proxy_unit_tests/test_proxy_server.py
index 4a9320c2ad..7f9d3b9081 100644
--- a/tests/proxy_unit_tests/test_proxy_server.py
+++ b/tests/proxy_unit_tests/test_proxy_server.py
@@ -2190,3 +2190,19 @@ async def test_get_ui_settings_spend_logs_threshold():
 
     # Clean up
     proxy_state.set_proxy_state_variable("spend_logs_row_count", 0)
+
+
+def test_get_timeout_from_request():
+    from litellm.proxy.litellm_pre_call_utils import LiteLLMProxyRequestSetup
+
+    headers = {
+        "x-litellm-timeout": "90",
+    }
+    timeout = LiteLLMProxyRequestSetup._get_timeout_from_request(headers)
+    assert timeout == 90
+
+    headers = {
+        "x-litellm-timeout": "90.5",
+    }
+    timeout = LiteLLMProxyRequestSetup._get_timeout_from_request(headers)
+    assert timeout == 90.5
diff --git a/tests/test_fallbacks.py b/tests/test_fallbacks.py
index 91c90448b3..b891eb3062 100644
--- a/tests/test_fallbacks.py
+++ b/tests/test_fallbacks.py
@@ -5,6 +5,7 @@ import asyncio
 import aiohttp
 from large_text import text
 import time
+from typing import Optional
 
 
 async def generate_key(
@@ -44,6 +45,7 @@ async def chat_completion(
     model: str,
     messages: list,
     return_headers: bool = False,
+    extra_headers: Optional[dict] = None,
     **kwargs,
 ):
     url = "http://0.0.0.0:4000/chat/completions"
@@ -51,6 +53,8 @@ async def chat_completion(
         "Authorization": f"Bearer {key}",
         "Content-Type": "application/json",
     }
+    if extra_headers is not None:
+        headers.update(extra_headers)
     data = {"model": model, "messages": messages, **kwargs}
 
     async with session.post(url, headers=headers, json=data) as response:
@@ -180,6 +184,38 @@ async def test_chat_completion_with_timeout():
     )  # assert model-specific timeout used
 
 
+@pytest.mark.asyncio
+async def test_chat_completion_with_timeout_from_request():
+    """
+    Make a chat completion call with a low `x-litellm-timeout` request header and `mock_timeout: true`. Expect it to fail and the request-level timeout to be set in the response headers.
+    """
+    async with aiohttp.ClientSession() as session:
+        model = "fake-openai-endpoint-5"
+        messages = [
+            {"role": "system", "content": text},
+            {"role": "user", "content": "Who was Alexander?"},
+        ]
+        extra_headers = {
+            "x-litellm-timeout": "0.001",
+        }
+        start_time = time.time()
+        response, headers = await chat_completion(
+            session=session,
+            key="sk-1234",
+            model=model,
+            messages=messages,
+            num_retries=0,
+            mock_timeout=True,
+            extra_headers=extra_headers,
+            return_headers=True,
+        )
+        end_time = time.time()
+        print(f"headers: {headers}")
+        assert (
+            headers["x-litellm-timeout"] == "0.001"
+        )  # assert request-level timeout used
+
+
 @pytest.mark.parametrize("has_access", [True, False])
 @pytest.mark.asyncio
 async def test_chat_completion_client_fallbacks_with_custom_message(has_access):
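
For completeness, the per-request `disable_fallbacks` flag from the reliability.md change above can also be exercised from the openai Python SDK via `extra_body`, which the proxy reads out of the JSON request body. A sketch under the same placeholder proxy URL and key assumptions as earlier:

```python
from openai import OpenAI

# Placeholder proxy address and virtual key, as in the docs examples above.
client = OpenAI(base_url="http://0.0.0.0:4000", api_key="sk-1234")

response = client.chat.completions.create(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "List 5 important events in the XIX century"}],
    # Sent as a top-level body field; the proxy treats it as disable_fallbacks.
    extra_body={"disable_fallbacks": True},
)
```
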