docs(routing.md): adding latency-based routing to docs
parent d71f89aac3
commit d0d08b4dce
1 changed file with 108 additions and 4 deletions
````diff
@@ -65,13 +65,16 @@ print(response)
 - `router.completion()` - chat completions endpoint to call 100+ LLMs
 - `router.acompletion()` - async chat completion calls
 - `router.embeddings()` - embedding endpoint for Azure, OpenAI, Huggingface endpoints
-- `router.aembeddings()` - async embeddings endpoint
+- `router.aembeddings()` - async embeddings calls
 - `router.text_completion()` - completion calls in the old OpenAI `/v1/completions` endpoint format
+- `router.atext_completion()` - async text completion calls
+- `router.image_generation()` - completion calls in OpenAI `/v1/images/generations` endpoint format
+- `router.aimage_generation()` - async image generation calls

 ### Advanced
-#### Routing Strategies - Weighted Pick, Rate Limit Aware
+#### Routing Strategies - Weighted Pick, Rate Limit Aware, Least Busy, Latency Based

-Router provides 2 strategies for routing your calls across multiple deployments:
+Router provides 4 strategies for routing your calls across multiple deployments:

 <Tabs>
 <TabItem value="simple-shuffle" label="Weighted Pick">
````
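The endpoint bullets this hunk extends correspond to paired sync/async methods on `Router`. A minimal sketch of calling two of them (the single OpenAI deployment and environment-based key here are illustrative assumptions, not part of the diff):

```python
import asyncio
import os

from litellm import Router

# one OpenAI deployment behind the "gpt-3.5-turbo" alias (illustrative)
model_list = [{
    "model_name": "gpt-3.5-turbo",
    "litellm_params": {
        "model": "gpt-3.5-turbo",
        "api_key": os.getenv("OPENAI_API_KEY"),
    },
}]

router = Router(model_list=model_list)

# sync chat completion
response = router.completion(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "Hello!"}],
)
print(response)

# async chat completion
async def main():
    return await router.acompletion(
        model="gpt-3.5-turbo",
        messages=[{"role": "user", "content": "Hello!"}],
    )

print(asyncio.run(main()))
```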
````diff
@@ -172,7 +175,7 @@ router = Router(model_list=model_list,
              redis_host=os.environ["REDIS_HOST"],
              redis_password=os.environ["REDIS_PASSWORD"],
              redis_port=os.environ["REDIS_PORT"],
-             routing_strategy="simple-shuffle")
+             routing_strategy="usage-based-routing")


 response = await router.acompletion(model="gpt-3.5-turbo",
````
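Assembled, the constructor this hunk edits would look roughly like the sketch below. The Redis parameters come straight from the hunk; `model_list` is stubbed here, since the full multi-deployment list appears earlier in the doc:

```python
import os

from litellm import Router

# stub; the doc defines a multi-deployment list earlier
model_list = [{
    "model_name": "gpt-3.5-turbo",
    "litellm_params": {
        "model": "gpt-3.5-turbo",
        "api_key": os.getenv("OPENAI_API_KEY"),
    },
}]

# redis params as shown in the diff above; used to share usage state
router = Router(
    model_list=model_list,
    redis_host=os.environ["REDIS_HOST"],
    redis_password=os.environ["REDIS_PASSWORD"],
    redis_port=os.environ["REDIS_PORT"],
    routing_strategy="usage-based-routing",
)
```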
````diff
@@ -182,6 +185,107 @@ print(response)
 ```

+</TabItem>
+<TabItem value="least-busy" label="Least-Busy">
+
+Picks the deployment with the least number of ongoing calls it's currently handling.
+
+[**How to test**](https://github.com/BerriAI/litellm/blob/main/litellm/tests/test_least_busy_routing.py)
+
+```python
+from litellm import Router
+import asyncio
+import os
+
+model_list = [{ # list of model deployments
+    "model_name": "gpt-3.5-turbo", # model alias
+    "litellm_params": { # params for litellm completion/embedding call
+        "model": "azure/chatgpt-v-2", # actual model name
+        "api_key": os.getenv("AZURE_API_KEY"),
+        "api_version": os.getenv("AZURE_API_VERSION"),
+        "api_base": os.getenv("AZURE_API_BASE"),
+    }
+}, {
+    "model_name": "gpt-3.5-turbo",
+    "litellm_params": { # params for litellm completion/embedding call
+        "model": "azure/chatgpt-functioncalling",
+        "api_key": os.getenv("AZURE_API_KEY"),
+        "api_version": os.getenv("AZURE_API_VERSION"),
+        "api_base": os.getenv("AZURE_API_BASE"),
+    }
+}, {
+    "model_name": "gpt-3.5-turbo",
+    "litellm_params": { # params for litellm completion/embedding call
+        "model": "gpt-3.5-turbo",
+        "api_key": os.getenv("OPENAI_API_KEY"),
+    }
+}]
+
+# init router
+router = Router(model_list=model_list, routing_strategy="least-busy")
+
+async def router_acompletion():
+    response = await router.acompletion(
+        model="gpt-3.5-turbo",
+        messages=[{"role": "user", "content": "Hey, how's it going?"}]
+    )
+    print(response)
+    return response
+
+asyncio.run(router_acompletion())
+```
+
+</TabItem>
+<TabItem value="latency-based" label="Latency-Based">
+
+Picks the deployment with the lowest response time.
+
+It caches and updates response times for each deployment, based on when a request was sent to and received back from that deployment.
+
+[**How to test**](https://github.com/BerriAI/litellm/blob/main/litellm/tests/test_lowest_latency_routing.py)
+
+```python
+from litellm import Router
+import asyncio
+import os
+
+model_list = [{ # list of model deployments
+    "model_name": "gpt-3.5-turbo", # model alias
+    "litellm_params": { # params for litellm completion/embedding call
+        "model": "azure/chatgpt-v-2", # actual model name
+        "api_key": os.getenv("AZURE_API_KEY"),
+        "api_version": os.getenv("AZURE_API_VERSION"),
+        "api_base": os.getenv("AZURE_API_BASE"),
+    }
+}, {
+    "model_name": "gpt-3.5-turbo",
+    "litellm_params": { # params for litellm completion/embedding call
+        "model": "azure/chatgpt-functioncalling",
+        "api_key": os.getenv("AZURE_API_KEY"),
+        "api_version": os.getenv("AZURE_API_VERSION"),
+        "api_base": os.getenv("AZURE_API_BASE"),
+    }
+}, {
+    "model_name": "gpt-3.5-turbo",
+    "litellm_params": { # params for litellm completion/embedding call
+        "model": "gpt-3.5-turbo",
+        "api_key": os.getenv("OPENAI_API_KEY"),
+    }
+}]
+
+# init router
+router = Router(model_list=model_list, routing_strategy="latency-based-routing")
+
+async def router_acompletion():
+    response = await router.acompletion(
+        model="gpt-3.5-turbo",
+        messages=[{"role": "user", "content": "Hey, how's it going?"}]
+    )
+    print(response)
+    return response
+
+asyncio.run(router_acompletion())
+```
+
 </TabItem>
 </Tabs>
````
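The "Least-Busy" tab added above states its selection rule only informally. A toy illustration of that rule, not litellm's actual implementation (deployment names and counts here are made up):

```python
# in-flight request counts per deployment (made-up numbers)
ongoing_calls = {
    "azure/chatgpt-v-2": 3,
    "azure/chatgpt-functioncalling": 1,
    "gpt-3.5-turbo": 2,
}

def pick_least_busy(counts: dict) -> str:
    # route to the deployment currently handling the fewest calls
    return min(counts, key=counts.get)

print(pick_least_busy(ongoing_calls))  # azure/chatgpt-functioncalling
```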
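Likewise, a toy sketch of the latency-based rule described in the new tab: cache a response time per deployment as requests complete, then route to the fastest. This illustrates the idea only, not litellm's internals (timings are made up):

```python
import time

latencies: dict = {}  # deployment -> last observed response time (seconds)

def record(deployment: str, sent_at: float, received_at: float) -> None:
    # a production router would average or window these samples
    latencies[deployment] = received_at - sent_at

def pick_fastest() -> str:
    return min(latencies, key=latencies.get)

now = time.time()
record("azure/chatgpt-v-2", now, now + 0.8)  # made-up timings
record("gpt-3.5-turbo", now, now + 0.3)
print(pick_fastest())  # gpt-3.5-turbo
```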