forked from phoenix/litellm-mirror
ci(config.yml): run prisma generate before testing
parent 7f269e92c5
commit e44d3e51aa
2 changed files with 67 additions and 53 deletions
@@ -45,6 +45,13 @@ jobs:
           paths:
             - ./venv
           key: v1-dependencies-{{ checksum ".circleci/requirements.txt" }}
+      - run:
+          name: Run prisma ./entrypoint.sh
+          command: |
+            set +e
+            chmod +x entrypoint.sh
+            ./entrypoint.sh
+            set -e
       - run:
           name: Black Formatting
           command: |
@@ -77,7 +77,65 @@ print(response)
 Router provides 4 strategies for routing your calls across multiple deployments:
 
 <Tabs>
-<TabItem value="simple-shuffle" label="Weighted Pick">
+<TabItem value="latency-based" label="Latency-Based">
+
+Picks the deployment with the lowest response time.
+
+It caches, and updates the response times for deployments based on when a request was sent and received from a deployment.
+
+[**How to test**](https://github.com/BerriAI/litellm/blob/main/litellm/tests/test_lowest_latency_routing.py)
+
+```python
+from litellm import Router
+import asyncio
+
+model_list = [{ ... }]
+
+# init router
+router = Router(model_list=model_list, routing_strategy="latency-based-routing") # 👈 set routing strategy
+
+## CALL 1+2
+tasks = []
+response = None
+final_response = None
+for _ in range(2):
+    tasks.append(router.acompletion(model=model, messages=messages))
+response = await asyncio.gather(*tasks)
+
+if response is not None:
+    ## CALL 3
+    await asyncio.sleep(1) # let the cache update happen
+    picked_deployment = router.lowestlatency_logger.get_available_deployments(
+        model_group=model, healthy_deployments=router.healthy_deployments
+    )
+    final_response = await router.acompletion(model=model, messages=messages)
+    print(f"min deployment id: {picked_deployment}")
+    print(f"model id: {final_response._hidden_params['model_id']}")
+    assert (
+        final_response._hidden_params["model_id"]
+        == picked_deployment["model_info"]["id"]
+    )
+```
+
+### Set Time Window
+
+Set time window for how far back to consider when averaging latency for a deployment.
+
+**In Router**
+```python
+router = Router(..., routing_strategy_args={"ttl": 10})
+```
+
+**In Proxy**
+
+```yaml
+router_settings:
+  routing_strategy_args: {"ttl": 10}
+```
+
+</TabItem>
+<TabItem value="simple-shuffle" label="(Default) Weighted Pick">
+
 **Default** Picks a deployment based on the provided **Requests per minute (rpm) or Tokens per minute (tpm)**
 
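The default weighted pick distributes requests in proportion to the `rpm` / `tpm` declared for each deployment. A minimal sketch of that configuration, assuming `rpm` is set inside each deployment's `litellm_params` (the deployments and rpm values below are illustrative, not taken from this commit):

```python
import os

from litellm import Router

# Illustrative deployments; the rpm values are assumptions for this sketch,
# not values from the commit being reviewed.
model_list = [{
    "model_name": "gpt-3.5-turbo",
    "litellm_params": {
        "model": "azure/chatgpt-v-2",
        "api_key": os.getenv("AZURE_API_KEY"),
        "api_version": os.getenv("AZURE_API_VERSION"),
        "api_base": os.getenv("AZURE_API_BASE"),
        "rpm": 900,  # weighted ~6x heavier than the deployment below
    },
}, {
    "model_name": "gpt-3.5-turbo",
    "litellm_params": {
        "model": "gpt-3.5-turbo",
        "api_key": os.getenv("OPENAI_API_KEY"),
        "rpm": 150,
    },
}]

# "simple-shuffle" is the default, so routing_strategy could also be omitted
router = Router(model_list=model_list, routing_strategy="simple-shuffle")

response = router.completion(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "Hey, how's it going?"}],
)
print(response)
```

With those weights the Azure deployment should receive roughly six times as many requests as the OpenAI one.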
@@ -235,58 +293,7 @@ asyncio.run(router_acompletion())
 ```
 
 
 </TabItem>
-<TabItem value="latency-based" label="Latency-Based">
-
-Picks the deployment with the lowest response time.
-
-It caches, and updates the response times for deployments based on when a request was sent and received from a deployment.
-
-[**How to test**](https://github.com/BerriAI/litellm/blob/main/litellm/tests/test_lowest_latency_routing.py)
-
-```python
-from litellm import Router
-import asyncio
-
-model_list = [{ # list of model deployments
-    "model_name": "gpt-3.5-turbo", # model alias
-    "litellm_params": { # params for litellm completion/embedding call
-        "model": "azure/chatgpt-v-2", # actual model name
-        "api_key": os.getenv("AZURE_API_KEY"),
-        "api_version": os.getenv("AZURE_API_VERSION"),
-        "api_base": os.getenv("AZURE_API_BASE"),
-    }
-}, {
-    "model_name": "gpt-3.5-turbo",
-    "litellm_params": { # params for litellm completion/embedding call
-        "model": "azure/chatgpt-functioncalling",
-        "api_key": os.getenv("AZURE_API_KEY"),
-        "api_version": os.getenv("AZURE_API_VERSION"),
-        "api_base": os.getenv("AZURE_API_BASE"),
-    }
-}, {
-    "model_name": "gpt-3.5-turbo",
-    "litellm_params": { # params for litellm completion/embedding call
-        "model": "gpt-3.5-turbo",
-        "api_key": os.getenv("OPENAI_API_KEY"),
-    }
-}]
-
-# init router
-router = Router(model_list=model_list, routing_strategy="latency-based-routing")
-async def router_acompletion():
-    response = await router.acompletion(
-        model="gpt-3.5-turbo",
-        messages=[{"role": "user", "content": "Hey, how's it going?"}]
-    )
-    print(response)
-    return response
-
-asyncio.run(router_acompletion())
-```
-
-
-</TabItem>
 </Tabs>
 
 ## Basic Reliability
@@ -608,4 +615,4 @@ def __init__(
             "latency-based-routing",
         ] = "simple-shuffle",
     ):
 ```