forked from phoenix/litellm-mirror
docs(routing.md): add timeouts per model
This commit is contained in:
parent
9988a39169
commit
265f5ef6da
1 changed files with 31 additions and 0 deletions
|
@@ -302,6 +302,7 @@ asyncio.run(router_acompletion())
|
||||||
|
|
||||||
The timeout set in router is for the entire length of the call, and is passed down to the completion() call level as well.
|
The timeout set in router is for the entire length of the call, and is passed down to the completion() call level as well.
|
||||||
|
|
||||||
|
**Global Timeouts**
|
||||||
```python
|
```python
|
||||||
from litellm import Router
|
from litellm import Router
|
||||||
|
|
||||||
|
@@ -313,6 +314,36 @@ router = Router(model_list=model_list,
|
||||||
print(response)
|
print(response)
|
||||||
```
|
```
|
||||||
|
|
||||||
|
**Timeouts per model**
|
||||||
|
|
||||||
|
```python
|
||||||
|
from litellm import Router
|
||||||
|
import asyncio
import os
|
||||||
|
|
||||||
|
model_list = [{
|
||||||
|
"model_name": "gpt-3.5-turbo",
|
||||||
|
"litellm_params": {
|
||||||
|
"model": "azure/chatgpt-v-2",
|
||||||
|
"api_key": os.getenv("AZURE_API_KEY"),
|
||||||
|
"api_version": os.getenv("AZURE_API_VERSION"),
|
||||||
|
"api_base": os.getenv("AZURE_API_BASE"),
|
||||||
|
"timeout": 300,  # sets a 5 minute timeout
|
||||||
|
"stream_timeout": 30 # sets a 30s timeout for streaming calls
|
||||||
|
}
|
||||||
|
}]
|
||||||
|
|
||||||
|
# init router
|
||||||
|
router = Router(model_list=model_list, routing_strategy="least-busy")
|
||||||
|
async def router_acompletion():
|
||||||
|
response = await router.acompletion(
|
||||||
|
model="gpt-3.5-turbo",
|
||||||
|
messages=[{"role": "user", "content": "Hey, how's it going?"}]
|
||||||
|
)
|
||||||
|
print(response)
|
||||||
|
return response
|
||||||
|
|
||||||
|
asyncio.run(router_acompletion())
|
||||||
|
```
|
||||||
### Cooldowns
|
### Cooldowns
|
||||||
|
|
||||||
Set the limit for how many calls a model is allowed to fail in a minute, before being cooled down for a minute.
|
Set the limit for how many calls a model is allowed to fail in a minute, before being cooled down for a minute.
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue