diff --git a/.circleci/config.yml b/.circleci/config.yml
index 2f2b58198..3ea6b7fca 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -45,6 +45,13 @@ jobs:
           paths:
             - ./venv
           key: v1-dependencies-{{ checksum ".circleci/requirements.txt" }}
+      - run:
+          name: Run prisma ./entrypoint.sh
+          command: |
+            set +e
+            chmod +x entrypoint.sh
+            ./entrypoint.sh
+            set -e
       - run:
           name: Black Formatting
           command: |
diff --git a/docs/my-website/docs/routing.md b/docs/my-website/docs/routing.md
index ce2f491b2..2115e2802 100644
--- a/docs/my-website/docs/routing.md
+++ b/docs/my-website/docs/routing.md
@@ -77,7 +77,65 @@ print(response)
 
 Router provides 4 strategies for routing your calls across multiple deployments:
 
-
+
+
+
+Picks the deployment with the lowest response time.
+
+It caches and updates response times for deployments based on when a request was sent to and received from a deployment.
+
+[**How to test**](https://github.com/BerriAI/litellm/blob/main/litellm/tests/test_lowest_latency_routing.py)
+
+```python
+from litellm import Router
+import asyncio
+
+model_list = [{ ... }]
+model = "gpt-3.5-turbo"  # model group name from model_list
+messages = [{"role": "user", "content": "Hey, how's it going?"}]
+
+# init router
+router = Router(model_list=model_list, routing_strategy="latency-based-routing") # 👈 set routing strategy
+
+## CALL 1+2
+tasks = []
+response = None
+final_response = None
+for _ in range(2):
+    tasks.append(router.acompletion(model=model, messages=messages))
+response = await asyncio.gather(*tasks)
+
+if response is not None:
+    ## CALL 3
+    await asyncio.sleep(1)  # let the cache update happen
+    picked_deployment = router.lowestlatency_logger.get_available_deployments(
+        model_group=model, healthy_deployments=router.healthy_deployments
+    )
+    final_response = await router.acompletion(model=model, messages=messages)
+    print(f"min deployment id: {picked_deployment}")
+    print(f"model id: {final_response._hidden_params['model_id']}")
+    assert (
+        final_response._hidden_params["model_id"]
+        == picked_deployment["model_info"]["id"]
+    )
+```
+
+### Set Time Window
+
+Set the time window for how far back to consider when averaging latency for a deployment.
+
+**In Router**
+```python
+router = Router(..., routing_strategy_args={"ttl": 10})
+```
+
+**In Proxy**
+
+```yaml
+router_settings:
+  routing_strategy_args: {"ttl": 10}
+```
+
+
+
+
 **Default** Picks a deployment based on the provided **Requests per minute (rpm) or Tokens per minute (tpm)**
 
@@ -235,58 +293,7 @@ asyncio.run(router_acompletion())
 ```
 
-
-
-Picks the deployment with the lowest response time.
-
-It caches, and updates the response times for deployments based on when a request was sent and received from a deployment.
-
-[**How to test**](https://github.com/BerriAI/litellm/blob/main/litellm/tests/test_lowest_latency_routing.py)
-
-```python
-from litellm import Router
-import asyncio
-
-model_list = [{ # list of model deployments
-    "model_name": "gpt-3.5-turbo", # model alias
-    "litellm_params": { # params for litellm completion/embedding call
-        "model": "azure/chatgpt-v-2", # actual model name
-        "api_key": os.getenv("AZURE_API_KEY"),
-        "api_version": os.getenv("AZURE_API_VERSION"),
-        "api_base": os.getenv("AZURE_API_BASE"),
-    }
-}, {
-    "model_name": "gpt-3.5-turbo",
-    "litellm_params": { # params for litellm completion/embedding call
-        "model": "azure/chatgpt-functioncalling",
-        "api_key": os.getenv("AZURE_API_KEY"),
-        "api_version": os.getenv("AZURE_API_VERSION"),
-        "api_base": os.getenv("AZURE_API_BASE"),
-    }
-}, {
-    "model_name": "gpt-3.5-turbo",
-    "litellm_params": { # params for litellm completion/embedding call
-        "model": "gpt-3.5-turbo",
-        "api_key": os.getenv("OPENAI_API_KEY"),
-    }
-}]
-
-# init router
-router = Router(model_list=model_list, routing_strategy="latency-based-routing")
-async def router_acompletion():
-    response = await router.acompletion(
-        model="gpt-3.5-turbo",
-        messages=[{"role": "user", "content": "Hey, how's it going?"}]
-    )
-    print(response)
-    return response
-
-asyncio.run(router_acompletion())
-```
-
-
-
 
 ## Basic Reliability
 
@@ -608,4 +615,4 @@ def __init__(
         "latency-based-routing",
     ] = "simple-shuffle",
 ):
-```
+```
\ No newline at end of file
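
For reference (not part of the patch itself), here is a minimal, self-contained sketch of the usage the new routing.md section describes: a `Router` configured with `routing_strategy="latency-based-routing"` plus the `routing_strategy_args={"ttl": 10}` time window. The two-deployment `model_list`, the environment variables, and the prompt are illustrative placeholders reused from the existing examples in the diff above.

```python
import asyncio
import os

from litellm import Router

# Two deployments in the same "gpt-3.5-turbo" group, reusing the placeholders from the docs above.
model_list = [
    {
        "model_name": "gpt-3.5-turbo",  # model group / alias
        "litellm_params": {  # params for the underlying litellm completion call
            "model": "azure/chatgpt-v-2",
            "api_key": os.getenv("AZURE_API_KEY"),
            "api_version": os.getenv("AZURE_API_VERSION"),
            "api_base": os.getenv("AZURE_API_BASE"),
        },
    },
    {
        "model_name": "gpt-3.5-turbo",
        "litellm_params": {
            "model": "gpt-3.5-turbo",
            "api_key": os.getenv("OPENAI_API_KEY"),
        },
    },
]

# Latency-based routing, averaging only latencies recorded within the ttl window
# (see the "Set Time Window" section added by this diff).
router = Router(
    model_list=model_list,
    routing_strategy="latency-based-routing",
    routing_strategy_args={"ttl": 10},
)

async def main():
    response = await router.acompletion(
        model="gpt-3.5-turbo",
        messages=[{"role": "user", "content": "Hey, how's it going?"}],
    )
    print(response)

asyncio.run(main())
```

Per the added docs, the `ttl` argument only bounds how far back recorded response times are considered when averaging latency per deployment; routing otherwise behaves as described in the "Picks the deployment with the lowest response time" section.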