diff --git a/litellm/main.py b/litellm/main.py
index df1166116..dd4312f0c 100644
--- a/litellm/main.py
+++ b/litellm/main.py
@@ -609,6 +609,7 @@ def completion(
         "cache",
         "no-log",
         "base_model",
+        "stream_timeout",
     ]
     default_params = openai_params + litellm_params
     non_default_params = {
diff --git a/litellm/proxy/_new_secret_config.yaml b/litellm/proxy/_new_secret_config.yaml
index 9049d78e4..00b783952 100644
--- a/litellm/proxy/_new_secret_config.yaml
+++ b/litellm/proxy/_new_secret_config.yaml
@@ -4,6 +4,14 @@ model_list:
     model: openai/my-fake-model
     api_key: my-fake-key
     api_base: https://openai-function-calling-workers.tasslexyz.workers.dev/
+    stream_timeout: 0.001
+- litellm_params:
+    model: azure/chatgpt-v-2
+    api_base: os.environ/AZURE_API_BASE
+    api_key: os.environ/AZURE_API_KEY
+    api_version: "2023-07-01-preview"
+    stream_timeout: 0.001
+  model_name: azure-gpt-3.5
 - model_name: gpt-instruct
   litellm_params:
     model: gpt-3.5-turbo-instruct
diff --git a/litellm/tests/test_router_init.py b/litellm/tests/test_router_init.py
index 5fa142053..4fdceaf36 100644
--- a/litellm/tests/test_router_init.py
+++ b/litellm/tests/test_router_init.py
@@ -252,24 +252,31 @@ def test_stream_timeouts_router():
                     "api_version": os.getenv("AZURE_API_VERSION"),
                     "api_base": os.getenv("AZURE_API_BASE"),
                     "timeout": 200,  # regular calls will not timeout, stream calls will
-                    "stream_timeout": 0.000_001,
+                    "stream_timeout": 10,
                 },
             },
         ]
         router = Router(model_list=model_list)
 
         print("PASSED !")
+        data = {
+            "model": "gpt-3.5-turbo",
+            "messages": [{"role": "user", "content": "hello, write a 20 pg essay"}],
+            "stream": True,
+        }
         selected_client = router._get_client(
             deployment=router.model_list[0],
-            kwargs={
-                "model": "gpt-3.5-turbo",
-                "messages": [{"role": "user", "content": "hello, write a 20 pg essay"}],
-                "stream": True,
-            },
+            kwargs=data,
             client_type=None,
         )
         print("Select client timeout", selected_client.timeout)
-        assert selected_client.timeout == 0.000_001
+        assert selected_client.timeout == 10
+
+        # make actual call
+        response = router.completion(**data)
+
+        for chunk in response:
+            print(f"chunk: {chunk}")
     except openai.APITimeoutError as e:
         print(
             "Passed: Raised correct exception. Got openai.APITimeoutError\nGood Job", e
diff --git a/proxy_server_config.yaml b/proxy_server_config.yaml
index c723bd31d..d1d06eb58 100644
--- a/proxy_server_config.yaml
+++ b/proxy_server_config.yaml
@@ -9,12 +9,18 @@ model_list:
   litellm_params:
     model: "gpt-3.5-turbo-1106"
     api_key: os.environ/OPENAI_API_KEY
+    rpm: 480
+    timeout: 300
+    stream_timeout: 60
 - model_name: gpt-4
   litellm_params:
     model: azure/chatgpt-v-2
     api_base: https://openai-gpt-4-test-v-1.openai.azure.com/
     api_version: "2023-05-15"
     api_key: os.environ/AZURE_API_KEY # The `os.environ/` prefix tells litellm to read this from the env. See https://docs.litellm.ai/docs/simple_proxy#load-api-keys-from-vault
+    rpm: 480
+    timeout: 300
+    stream_timeout: 60
 - model_name: sagemaker-completion-model
   litellm_params:
     model: sagemaker/berri-benchmarking-Llama-2-70b-chat-hf-4
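
Taken together, these changes register "stream_timeout" as a recognized litellm param in completion() and let it be set per deployment in the router/proxy config, so streaming calls can use a tighter timeout than regular calls. The following is a minimal usage sketch mirroring the updated test; it assumes the AZURE_API_* environment variables are set, and the deployment values are illustrative, not prescribed by this diff.

import os

import openai
from litellm import Router

# One deployment with a long regular timeout and a short stream timeout.
model_list = [
    {
        "model_name": "gpt-3.5-turbo",
        "litellm_params": {
            "model": "azure/chatgpt-v-2",
            "api_key": os.getenv("AZURE_API_KEY"),
            "api_version": os.getenv("AZURE_API_VERSION"),
            "api_base": os.getenv("AZURE_API_BASE"),
            "timeout": 200,  # applies to non-streaming calls
            "stream_timeout": 0.001,  # applies only when stream=True
        },
    },
]

router = Router(model_list=model_list)

try:
    # With stream=True the router selects the client built with stream_timeout,
    # so this aggressive 0.001s value should raise openai.APITimeoutError.
    response = router.completion(
        model="gpt-3.5-turbo",
        messages=[{"role": "user", "content": "hello, write a 20 pg essay"}],
        stream=True,
    )
    for chunk in response:
        print(f"chunk: {chunk}")
except openai.APITimeoutError as e:
    print("Stream call timed out as expected:", e)

A non-streaming call through the same router would instead be governed by the 200s timeout, which is the behavior the updated test_stream_timeouts_router exercises.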