From 886c859519c8bd8eddcdeb51c9946036a4042043 Mon Sep 17 00:00:00 2001 From: CLARKBENHAM Date: Mon, 8 Apr 2024 21:20:59 -0700 Subject: [PATCH 1/3] doc pre_call_check: enables router rate limits for concurrent calls --- docs/my-website/docs/routing.md | 327 ++++++++++++++++---------------- 1 file changed, 168 insertions(+), 159 deletions(-) diff --git a/docs/my-website/docs/routing.md b/docs/my-website/docs/routing.md index 404c72e44..3fda19094 100644 --- a/docs/my-website/docs/routing.md +++ b/docs/my-website/docs/routing.md @@ -28,40 +28,40 @@ If you want a server to load balance across different LLM APIs, use our [OpenAI ```python from litellm import Router -model_list = [{ # list of model deployments +model_list = [{ # list of model deployments "model_name": "gpt-3.5-turbo", # model alias -> loadbalance between models with same `model_name` - "litellm_params": { # params for litellm completion/embedding call + "litellm_params": { # params for litellm completion/embedding call "model": "azure/chatgpt-v-2", # actual model name "api_key": os.getenv("AZURE_API_KEY"), "api_version": os.getenv("AZURE_API_VERSION"), "api_base": os.getenv("AZURE_API_BASE") } }, { - "model_name": "gpt-3.5-turbo", - "litellm_params": { # params for litellm completion/embedding call - "model": "azure/chatgpt-functioncalling", + "model_name": "gpt-3.5-turbo", + "litellm_params": { # params for litellm completion/embedding call + "model": "azure/chatgpt-functioncalling", "api_key": os.getenv("AZURE_API_KEY"), "api_version": os.getenv("AZURE_API_VERSION"), "api_base": os.getenv("AZURE_API_BASE") } }, { - "model_name": "gpt-3.5-turbo", - "litellm_params": { # params for litellm completion/embedding call - "model": "gpt-3.5-turbo", + "model_name": "gpt-3.5-turbo", + "litellm_params": { # params for litellm completion/embedding call + "model": "gpt-3.5-turbo", "api_key": os.getenv("OPENAI_API_KEY"), } }, { - "model_name": "gpt-4", - "litellm_params": { # params for litellm completion/embedding call - "model": "azure/gpt-4", + "model_name": "gpt-4", + "litellm_params": { # params for litellm completion/embedding call + "model": "azure/gpt-4", "api_key": os.getenv("AZURE_API_KEY"), "api_base": os.getenv("AZURE_API_BASE"), "api_version": os.getenv("AZURE_API_VERSION"), } }, { - "model_name": "gpt-4", - "litellm_params": { # params for litellm completion/embedding call - "model": "gpt-4", + "model_name": "gpt-4", + "litellm_params": { # params for litellm completion/embedding call + "model": "gpt-4", "api_key": os.getenv("OPENAI_API_KEY"), } }, @@ -72,14 +72,14 @@ router = Router(model_list=model_list) # openai.ChatCompletion.create replacement # requests with model="gpt-3.5-turbo" will pick a deployment where model_name="gpt-3.5-turbo" -response = await router.acompletion(model="gpt-3.5-turbo", +response = await router.acompletion(model="gpt-3.5-turbo", messages=[{"role": "user", "content": "Hey, how's it going?"}]) print(response) # openai.ChatCompletion.create replacement # requests with model="gpt-4" will pick a deployment where model_name="gpt-4" -response = await router.acompletion(model="gpt-4", +response = await router.acompletion(model="gpt-4", messages=[{"role": "user", "content": "Hey, how's it going?"}]) print(response) @@ -98,7 +98,7 @@ print(response) ### Advanced #### Routing Strategies - Weighted Pick, Rate Limit Aware, Least Busy, Latency Based -Router provides 4 strategies for routing your calls across multiple deployments: +Router provides 4 strategies for routing your calls across multiple deployments: 
@@ -111,13 +111,16 @@ It caches, and updates the response times for deployments based on when a reques [**How to test**](https://github.com/BerriAI/litellm/blob/main/litellm/tests/test_lowest_latency_routing.py) ```python -from litellm import Router +from litellm import Router import asyncio model_list = [{ ... }] # init router -router = Router(model_list=model_list, routing_strategy="latency-based-routing") # 👈 set routing strategy +router = Router(model_list=model_list, + routing_strategy="latency-based-routing",# 👈 set routing strategy + enable_pre_call_check=True, # enables router rate limits for concurrent calls + ) ## CALL 1+2 tasks = [] @@ -128,7 +131,7 @@ for _ in range(2): response = await asyncio.gather(*tasks) if response is not None: - ## CALL 3 + ## CALL 3 await asyncio.sleep(1) # let the cache update happen picked_deployment = router.lowestlatency_logger.get_available_deployments( model_group=model, healthy_deployments=router.healthy_deployments @@ -142,12 +145,12 @@ if response is not None: ) ``` -### Set Time Window +### Set Time Window -Set time window for how far back to consider when averaging latency for a deployment. +Set time window for how far back to consider when averaging latency for a deployment. **In Router** -```python +```python router = Router(..., routing_strategy_args={"ttl": 10}) ``` @@ -166,12 +169,12 @@ router_settings: If `rpm` or `tpm` is not provided, it randomly picks a deployment ```python -from litellm import Router +from litellm import Router import asyncio -model_list = [{ # list of model deployments - "model_name": "gpt-3.5-turbo", # model alias - "litellm_params": { # params for litellm completion/embedding call +model_list = [{ # list of model deployments + "model_name": "gpt-3.5-turbo", # model alias + "litellm_params": { # params for litellm completion/embedding call "model": "azure/chatgpt-v-2", # actual model name "api_key": os.getenv("AZURE_API_KEY"), "api_version": os.getenv("AZURE_API_VERSION"), @@ -179,18 +182,18 @@ model_list = [{ # list of model deployments "rpm": 900, # requests per minute for this API } }, { - "model_name": "gpt-3.5-turbo", - "litellm_params": { # params for litellm completion/embedding call - "model": "azure/chatgpt-functioncalling", + "model_name": "gpt-3.5-turbo", + "litellm_params": { # params for litellm completion/embedding call + "model": "azure/chatgpt-functioncalling", "api_key": os.getenv("AZURE_API_KEY"), "api_version": os.getenv("AZURE_API_VERSION"), "api_base": os.getenv("AZURE_API_BASE"), "rpm": 10, } }, { - "model_name": "gpt-3.5-turbo", - "litellm_params": { # params for litellm completion/embedding call - "model": "gpt-3.5-turbo", + "model_name": "gpt-3.5-turbo", + "litellm_params": { # params for litellm completion/embedding call + "model": "gpt-3.5-turbo", "api_key": os.getenv("OPENAI_API_KEY"), "rpm": 10, } @@ -200,7 +203,7 @@ model_list = [{ # list of model deployments router = Router(model_list=model_list, routing_strategy="simple-shuffle") async def router_acompletion(): response = await router.acompletion( - model="gpt-3.5-turbo", + model="gpt-3.5-turbo", messages=[{"role": "user", "content": "Hey, how's it going?"}] ) print(response) @@ -211,33 +214,33 @@ asyncio.run(router_acompletion()) -This will route to the deployment with the lowest TPM usage for that minute. +This will route to the deployment with the lowest TPM usage for that minute. -In production, we use Redis to track usage (TPM/RPM) across multiple deployments. 
+In production, we use Redis to track usage (TPM/RPM) across multiple deployments. -If you pass in the deployment's tpm/rpm limits, this will also check against that, and filter out any who's limits would be exceeded. +If you pass in the deployment's tpm/rpm limits, this will also check against that, and filter out any who's limits would be exceeded. -For Azure, your RPM = TPM/6. +For Azure, your RPM = TPM/6. ```python -from litellm import Router +from litellm import Router -model_list = [{ # list of model deployments - "model_name": "gpt-3.5-turbo", # model alias - "litellm_params": { # params for litellm completion/embedding call +model_list = [{ # list of model deployments + "model_name": "gpt-3.5-turbo", # model alias + "litellm_params": { # params for litellm completion/embedding call "model": "azure/chatgpt-v-2", # actual model name "api_key": os.getenv("AZURE_API_KEY"), "api_version": os.getenv("AZURE_API_VERSION"), "api_base": os.getenv("AZURE_API_BASE") - }, + }, "tpm": 100000, "rpm": 10000, }, { - "model_name": "gpt-3.5-turbo", - "litellm_params": { # params for litellm completion/embedding call - "model": "azure/chatgpt-functioncalling", + "model_name": "gpt-3.5-turbo", + "litellm_params": { # params for litellm completion/embedding call + "model": "azure/chatgpt-functioncalling", "api_key": os.getenv("AZURE_API_KEY"), "api_version": os.getenv("AZURE_API_VERSION"), "api_base": os.getenv("AZURE_API_BASE") @@ -245,22 +248,24 @@ model_list = [{ # list of model deployments "tpm": 100000, "rpm": 1000, }, { - "model_name": "gpt-3.5-turbo", - "litellm_params": { # params for litellm completion/embedding call - "model": "gpt-3.5-turbo", + "model_name": "gpt-3.5-turbo", + "litellm_params": { # params for litellm completion/embedding call + "model": "gpt-3.5-turbo", "api_key": os.getenv("OPENAI_API_KEY"), }, "tpm": 100000, "rpm": 1000, }] -router = Router(model_list=model_list, - redis_host=os.environ["REDIS_HOST"], - redis_password=os.environ["REDIS_PASSWORD"], - redis_port=os.environ["REDIS_PORT"], - routing_strategy="usage-based-routing") +router = Router(model_list=model_list, + redis_host=os.environ["REDIS_HOST"], + redis_password=os.environ["REDIS_PASSWORD"], + redis_port=os.environ["REDIS_PORT"], + routing_strategy="usage-based-routing", + enable_pre_call_check=True, # enables router rate limits for concurrent calls + ) -response = await router.acompletion(model="gpt-3.5-turbo", +response = await router.acompletion(model="gpt-3.5-turbo", messages=[{"role": "user", "content": "Hey, how's it going?"}] print(response) @@ -276,29 +281,29 @@ Picks a deployment with the least number of ongoing calls, it's handling. 
[**How to test**](https://github.com/BerriAI/litellm/blob/main/litellm/tests/test_least_busy_routing.py) ```python -from litellm import Router +from litellm import Router import asyncio -model_list = [{ # list of model deployments - "model_name": "gpt-3.5-turbo", # model alias - "litellm_params": { # params for litellm completion/embedding call +model_list = [{ # list of model deployments + "model_name": "gpt-3.5-turbo", # model alias + "litellm_params": { # params for litellm completion/embedding call "model": "azure/chatgpt-v-2", # actual model name "api_key": os.getenv("AZURE_API_KEY"), "api_version": os.getenv("AZURE_API_VERSION"), "api_base": os.getenv("AZURE_API_BASE"), } }, { - "model_name": "gpt-3.5-turbo", - "litellm_params": { # params for litellm completion/embedding call - "model": "azure/chatgpt-functioncalling", + "model_name": "gpt-3.5-turbo", + "litellm_params": { # params for litellm completion/embedding call + "model": "azure/chatgpt-functioncalling", "api_key": os.getenv("AZURE_API_KEY"), "api_version": os.getenv("AZURE_API_VERSION"), "api_base": os.getenv("AZURE_API_BASE"), } }, { - "model_name": "gpt-3.5-turbo", - "litellm_params": { # params for litellm completion/embedding call - "model": "gpt-3.5-turbo", + "model_name": "gpt-3.5-turbo", + "litellm_params": { # params for litellm completion/embedding call + "model": "gpt-3.5-turbo", "api_key": os.getenv("OPENAI_API_KEY"), } }] @@ -307,7 +312,7 @@ model_list = [{ # list of model deployments router = Router(model_list=model_list, routing_strategy="least-busy") async def router_acompletion(): response = await router.acompletion( - model="gpt-3.5-turbo", + model="gpt-3.5-turbo", messages=[{"role": "user", "content": "Hey, how's it going?"}] ) print(response) @@ -322,18 +327,18 @@ asyncio.run(router_acompletion()) ## Basic Reliability -### Timeouts +### Timeouts -The timeout set in router is for the entire length of the call, and is passed down to the completion() call level as well. +The timeout set in router is for the entire length of the call, and is passed down to the completion() call level as well. **Global Timeouts** ```python -from litellm import Router +from litellm import Router model_list = [{...}] -router = Router(model_list=model_list, - timeout=30) # raise timeout error if call takes > 30s +router = Router(model_list=model_list, + timeout=30) # raise timeout error if call takes > 30s print(response) ``` @@ -341,7 +346,7 @@ print(response) **Timeouts per model** ```python -from litellm import Router +from litellm import Router import asyncio model_list = [{ @@ -360,7 +365,7 @@ model_list = [{ router = Router(model_list=model_list, routing_strategy="least-busy") async def router_acompletion(): response = await router.acompletion( - model="gpt-3.5-turbo", + model="gpt-3.5-turbo", messages=[{"role": "user", "content": "Hey, how's it going?"}] ) print(response) @@ -370,20 +375,20 @@ asyncio.run(router_acompletion()) ``` ### Cooldowns -Set the limit for how many calls a model is allowed to fail in a minute, before being cooled down for a minute. +Set the limit for how many calls a model is allowed to fail in a minute, before being cooled down for a minute. ```python from litellm import Router model_list = [{...}] -router = Router(model_list=model_list, - allowed_fails=1) # cooldown model if it fails > 1 call in a minute. +router = Router(model_list=model_list, + allowed_fails=1) # cooldown model if it fails > 1 call in a minute. user_message = "Hello, whats the weather in San Francisco??" 
messages = [{"content": user_message, "role": "user"}] -# normal call +# normal call response = router.completion(model="gpt-3.5-turbo", messages=messages) print(f"response: {response}") @@ -392,55 +397,55 @@ print(f"response: {response}") ### Retries -For both async + sync functions, we support retrying failed requests. +For both async + sync functions, we support retrying failed requests. -For RateLimitError we implement exponential backoffs +For RateLimitError we implement exponential backoffs -For generic errors, we retry immediately +For generic errors, we retry immediately -Here's a quick look at how we can set `num_retries = 3`: +Here's a quick look at how we can set `num_retries = 3`: -```python +```python from litellm import Router model_list = [{...}] -router = Router(model_list=model_list, +router = Router(model_list=model_list, num_retries=3) user_message = "Hello, whats the weather in San Francisco??" messages = [{"content": user_message, "role": "user"}] -# normal call +# normal call response = router.completion(model="gpt-3.5-turbo", messages=messages) print(f"response: {response}") ``` -We also support setting minimum time to wait before retrying a failed request. This is via the `retry_after` param. +We also support setting minimum time to wait before retrying a failed request. This is via the `retry_after` param. -```python +```python from litellm import Router model_list = [{...}] -router = Router(model_list=model_list, +router = Router(model_list=model_list, num_retries=3, retry_after=5) # waits min 5s before retrying request user_message = "Hello, whats the weather in San Francisco??" messages = [{"content": user_message, "role": "user"}] -# normal call +# normal call response = router.completion(model="gpt-3.5-turbo", messages=messages) print(f"response: {response}") ``` -### Fallbacks +### Fallbacks -If a call fails after num_retries, fall back to another model group. +If a call fails after num_retries, fall back to another model group. -If the error is a context window exceeded error, fall back to a larger model group (if given). +If the error is a context window exceeded error, fall back to a larger model group (if given). Fallbacks are done in-order - ["gpt-3.5-turbo, "gpt-4", "gpt-4-32k"], will do 'gpt-3.5-turbo' first, then 'gpt-4', etc. 
@@ -448,52 +453,52 @@ Fallbacks are done in-order - ["gpt-3.5-turbo, "gpt-4", "gpt-4-32k"], will do 'g from litellm import Router model_list = [ - { # list of model deployments - "model_name": "azure/gpt-3.5-turbo", # openai model name - "litellm_params": { # params for litellm completion/embedding call - "model": "azure/chatgpt-v-2", + { # list of model deployments + "model_name": "azure/gpt-3.5-turbo", # openai model name + "litellm_params": { # params for litellm completion/embedding call + "model": "azure/chatgpt-v-2", "api_key": "bad-key", "api_version": os.getenv("AZURE_API_VERSION"), "api_base": os.getenv("AZURE_API_BASE") }, "tpm": 240000, "rpm": 1800 - }, - { # list of model deployments - "model_name": "azure/gpt-3.5-turbo-context-fallback", # openai model name - "litellm_params": { # params for litellm completion/embedding call - "model": "azure/chatgpt-v-2", + }, + { # list of model deployments + "model_name": "azure/gpt-3.5-turbo-context-fallback", # openai model name + "litellm_params": { # params for litellm completion/embedding call + "model": "azure/chatgpt-v-2", "api_key": "bad-key", "api_version": os.getenv("AZURE_API_VERSION"), "api_base": os.getenv("AZURE_API_BASE") }, "tpm": 240000, "rpm": 1800 - }, + }, { - "model_name": "azure/gpt-3.5-turbo", # openai model name - "litellm_params": { # params for litellm completion/embedding call - "model": "azure/chatgpt-functioncalling", + "model_name": "azure/gpt-3.5-turbo", # openai model name + "litellm_params": { # params for litellm completion/embedding call + "model": "azure/chatgpt-functioncalling", "api_key": "bad-key", "api_version": os.getenv("AZURE_API_VERSION"), "api_base": os.getenv("AZURE_API_BASE") }, "tpm": 240000, "rpm": 1800 - }, + }, { - "model_name": "gpt-3.5-turbo", # openai model name - "litellm_params": { # params for litellm completion/embedding call - "model": "gpt-3.5-turbo", + "model_name": "gpt-3.5-turbo", # openai model name + "litellm_params": { # params for litellm completion/embedding call + "model": "gpt-3.5-turbo", "api_key": os.getenv("OPENAI_API_KEY"), }, "tpm": 1000000, "rpm": 9000 }, { - "model_name": "gpt-3.5-turbo-16k", # openai model name - "litellm_params": { # params for litellm completion/embedding call - "model": "gpt-3.5-turbo-16k", + "model_name": "gpt-3.5-turbo-16k", # openai model name + "litellm_params": { # params for litellm completion/embedding call + "model": "gpt-3.5-turbo-16k", "api_key": os.getenv("OPENAI_API_KEY"), }, "tpm": 1000000, @@ -502,8 +507,8 @@ model_list = [ ] -router = Router(model_list=model_list, - fallbacks=[{"azure/gpt-3.5-turbo": ["gpt-3.5-turbo"]}], +router = Router(model_list=model_list, + fallbacks=[{"azure/gpt-3.5-turbo": ["gpt-3.5-turbo"]}], context_window_fallbacks=[{"azure/gpt-3.5-turbo-context-fallback": ["gpt-3.5-turbo-16k"]}, {"gpt-3.5-turbo": ["gpt-3.5-turbo-16k"]}], set_verbose=True) @@ -511,7 +516,7 @@ router = Router(model_list=model_list, user_message = "Hello, whats the weather in San Francisco??" messages = [{"content": user_message, "role": "user"}] -# normal fallback call +# normal fallback call response = router.completion(model="azure/gpt-3.5-turbo", messages=messages) # context window fallback call @@ -522,12 +527,12 @@ print(f"response: {response}") ### Caching -In production, we recommend using a Redis cache. For quickly testing things locally, we also support simple in-memory caching. +In production, we recommend using a Redis cache. For quickly testing things locally, we also support simple in-memory caching. 
**In-memory Cache** ```python -router = Router(model_list=model_list, +router = Router(model_list=model_list, cache_responses=True) print(response) @@ -535,19 +540,19 @@ print(response) **Redis Cache** ```python -router = Router(model_list=model_list, - redis_host=os.getenv("REDIS_HOST"), - redis_password=os.getenv("REDIS_PASSWORD"), +router = Router(model_list=model_list, + redis_host=os.getenv("REDIS_HOST"), + redis_password=os.getenv("REDIS_PASSWORD"), redis_port=os.getenv("REDIS_PORT"), cache_responses=True) print(response) ``` -**Pass in Redis URL, additional kwargs** -```python +**Pass in Redis URL, additional kwargs** +```python router = Router(model_list: Optional[list] = None, - ## CACHING ## + ## CACHING ## redis_url=os.getenv("REDIS_URL")", cache_kwargs= {}, # additional kwargs to pass to RedisCache (see caching.py) cache_responses=True) @@ -555,14 +560,18 @@ router = Router(model_list: Optional[list] = None, ## Pre-Call Checks (Context Window) -Enable pre-call checks to filter out deployments with context window limit < messages for a call. +Enable pre-call checks to filter out: +1. deployments with context window limit < messages for a call. +2. deployments that have exceeded rate limits when making concurrent calls. (eg. `asyncio.gather(*[ + router.acompletion(model="gpt-3.5-turbo", messages=m) for m in list_of_messages + ])`) **1. Enable pre-call checks** -```python -from litellm import Router +```python +from litellm import Router # ... router = Router(model_list=model_list, enable_pre_call_checks=True) # 👈 Set to True ``` @@ -570,7 +579,7 @@ router = Router(model_list=model_list, enable_pre_call_checks=True) # 👈 Set t **2. Set Model List** -For azure deployments, set the base model. Pick the base model from [this list](https://github.com/BerriAI/litellm/blob/main/model_prices_and_context_window.json), all the azure models start with `azure/`. +For azure deployments, set the base model. Pick the base model from [this list](https://github.com/BerriAI/litellm/blob/main/model_prices_and_context_window.json), all the azure models start with `azure/`. @@ -598,7 +607,7 @@ model_list = [ }, ] -router = Router(model_list=model_list, enable_pre_call_checks=True) +router = Router(model_list=model_list, enable_pre_call_checks=True) ``` @@ -627,7 +636,7 @@ model_list = [ }, }, { - "model_name": "claude-opus", + "model_name": "claude-opus", "litellm_params": { call "model": "claude-3-opus-20240229", "api_key": os.getenv("ANTHROPIC_API_KEY"), @@ -635,7 +644,7 @@ model_list = [ }, ] -router = Router(model_list=model_list, enable_pre_call_checks=True, context_window_fallbacks=[{"gpt-3.5-turbo-small": ["gpt-3.5-turbo-large", "claude-opus"]}]) +router = Router(model_list=model_list, enable_pre_call_checks=True, context_window_fallbacks=[{"gpt-3.5-turbo-small": ["gpt-3.5-turbo-large", "claude-opus"]}]) ``` @@ -664,7 +673,7 @@ model_list = [ "api_base": os.getenv("AZURE_API_BASE"), }, "model_info": { - "base_model": "azure/gpt-35-turbo", + "base_model": "azure/gpt-35-turbo", } }, { @@ -676,7 +685,7 @@ model_list = [ }, ] -router = Router(model_list=model_list, enable_pre_call_checks=True) +router = Router(model_list=model_list, enable_pre_call_checks=True) text = "What is the meaning of 42?" * 5000 @@ -701,11 +710,11 @@ Go [here](./proxy/reliability.md#advanced---context-window-fallbacks) for how to ## Caching across model groups -If you want to cache across 2 different model groups (e.g. azure deployments, and openai), use caching groups. 
+If you want to cache across 2 different model groups (e.g. azure deployments, and openai), use caching groups. ```python import litellm, asyncio, time -from litellm import Router +from litellm import Router # set os env os.environ["OPENAI_API_KEY"] = "" @@ -713,8 +722,8 @@ os.environ["AZURE_API_KEY"] = "" os.environ["AZURE_API_BASE"] = "" os.environ["AZURE_API_VERSION"] = "" -async def test_acompletion_caching_on_router_caching_groups(): - # tests acompletion + caching on router +async def test_acompletion_caching_on_router_caching_groups(): + # tests acompletion + caching on router try: litellm.set_verbose = True model_list = [ @@ -740,8 +749,8 @@ async def test_acompletion_caching_on_router_caching_groups(): {"role": "user", "content": f"write a one sentence poem {time.time()}?"} ] start_time = time.time() - router = Router(model_list=model_list, - cache_responses=True, + router = Router(model_list=model_list, + cache_responses=True, caching_groups=[("openai-gpt-3.5-turbo", "azure-gpt-3.5-turbo")]) response1 = await router.acompletion(model="openai-gpt-3.5-turbo", messages=messages, temperature=1) print(f"response1: {response1}") @@ -768,9 +777,9 @@ Step 1. Router Setup from litellm import Router model_list = [ - { # list of model deployments - "model_name": "gpt-4-preview", # model alias - "litellm_params": { # params for litellm completion/embedding call + { # list of model deployments + "model_name": "gpt-4-preview", # model alias + "litellm_params": { # params for litellm completion/embedding call "model": "azure/chatgpt-v-2", # actual model name "api_key": os.getenv("AZURE_API_KEY"), "api_version": os.getenv("AZURE_API_VERSION"), @@ -779,11 +788,11 @@ model_list = [ "model_info": { "base_model": "azure/gpt-4-1106-preview" # azure/gpt-4-1106-preview will be used for cost tracking, ensure this exists in litellm model_prices_and_context_window.json } - }, + }, { - "model_name": "gpt-4-32k", - "litellm_params": { # params for litellm completion/embedding call - "model": "azure/chatgpt-functioncalling", + "model_name": "gpt-4-32k", + "litellm_params": { # params for litellm completion/embedding call + "model": "azure/chatgpt-functioncalling", "api_key": os.getenv("AZURE_API_KEY"), "api_version": os.getenv("AZURE_API_VERSION"), "api_base": os.getenv("AZURE_API_BASE") @@ -804,8 +813,8 @@ Step 2. Access `response_cost` in the custom callback, **litellm calculates the import litellm from litellm.integrations.custom_logger import CustomLogger -class MyCustomHandler(CustomLogger): - def log_success_event(self, kwargs, response_obj, start_time, end_time): +class MyCustomHandler(CustomLogger): + def log_success_event(self, kwargs, response_obj, start_time, end_time): print(f"On Success") response_cost = kwargs.get("response_cost") print("response_cost=", response_cost) @@ -815,7 +824,7 @@ litellm.callbacks = [customHandler] # router completion call response = router.completion( - model="gpt-4-32k", + model="gpt-4-32k", messages=[{ "role": "user", "content": "Hi who are you"}] ) ``` @@ -823,28 +832,28 @@ response = router.completion( #### Default litellm.completion/embedding params -You can also set default params for litellm completion/embedding calls. Here's how to do that: +You can also set default params for litellm completion/embedding calls. 
Here's how to do that: -```python +```python from litellm import Router fallback_dict = {"gpt-3.5-turbo": "gpt-3.5-turbo-16k"} -router = Router(model_list=model_list, +router = Router(model_list=model_list, default_litellm_params={"context_window_fallback_dict": fallback_dict}) user_message = "Hello, whats the weather in San Francisco??" messages = [{"content": user_message, "role": "user"}] -# normal call +# normal call response = router.completion(model="gpt-3.5-turbo", messages=messages) print(f"response: {response}") ``` -## Custom Callbacks - Track API Key, API Endpoint, Model Used +## Custom Callbacks - Track API Key, API Endpoint, Model Used -If you need to track the api_key, api endpoint, model, custom_llm_provider used for each completion call, you can setup a [custom callback](https://docs.litellm.ai/docs/observability/custom_callback) +If you need to track the api_key, api endpoint, model, custom_llm_provider used for each completion call, you can setup a [custom callback](https://docs.litellm.ai/docs/observability/custom_callback) ### Usage @@ -852,8 +861,8 @@ If you need to track the api_key, api endpoint, model, custom_llm_provider used import litellm from litellm.integrations.custom_logger import CustomLogger -class MyCustomHandler(CustomLogger): - def log_success_event(self, kwargs, response_obj, start_time, end_time): +class MyCustomHandler(CustomLogger): + def log_success_event(self, kwargs, response_obj, start_time, end_time): print(f"On Success") print("kwargs=", kwargs) litellm_params= kwargs.get("litellm_params") @@ -868,7 +877,7 @@ class MyCustomHandler(CustomLogger): print("custom_llm_provider=", custom_llm_provider) print("response_cost=", response_cost) - def log_failure_event(self, kwargs, response_obj, start_time, end_time): + def log_failure_event(self, kwargs, response_obj, start_time, end_time): print(f"On Failure") print("kwargs=") @@ -881,12 +890,12 @@ router = Router(model_list=model_list, routing_strategy="simple-shuffle") # router completion call response = router.completion( - model="gpt-3.5-turbo", + model="gpt-3.5-turbo", messages=[{ "role": "user", "content": "Hi who are you"}] ) ``` -## Deploy Router +## Deploy Router If you want a server to load balance across different LLM APIs, use our [OpenAI Proxy Server](./simple_proxy#load-balancing---multiple-instances-of-1-model) @@ -896,7 +905,7 @@ If you want a server to load balance across different LLM APIs, use our [OpenAI ```python def __init__( model_list: Optional[list] = None, - + ## CACHING ## redis_url: Optional[str] = None, redis_host: Optional[str] = None, From 6e20bb13b26089728dfd164c5840a38658851ce0 Mon Sep 17 00:00:00 2001 From: CLARKBENHAM Date: Mon, 8 Apr 2024 21:27:38 -0700 Subject: [PATCH 2/3] Revert "doc pre_call_check: enables router rate limits for concurrent calls" This reverts commit 886c859519c8bd8eddcdeb51c9946036a4042043. 
--- docs/my-website/docs/routing.md | 327 ++++++++++++++++---------------- 1 file changed, 159 insertions(+), 168 deletions(-) diff --git a/docs/my-website/docs/routing.md b/docs/my-website/docs/routing.md index 3fda19094..404c72e44 100644 --- a/docs/my-website/docs/routing.md +++ b/docs/my-website/docs/routing.md @@ -28,40 +28,40 @@ If you want a server to load balance across different LLM APIs, use our [OpenAI ```python from litellm import Router -model_list = [{ # list of model deployments +model_list = [{ # list of model deployments "model_name": "gpt-3.5-turbo", # model alias -> loadbalance between models with same `model_name` - "litellm_params": { # params for litellm completion/embedding call + "litellm_params": { # params for litellm completion/embedding call "model": "azure/chatgpt-v-2", # actual model name "api_key": os.getenv("AZURE_API_KEY"), "api_version": os.getenv("AZURE_API_VERSION"), "api_base": os.getenv("AZURE_API_BASE") } }, { - "model_name": "gpt-3.5-turbo", - "litellm_params": { # params for litellm completion/embedding call - "model": "azure/chatgpt-functioncalling", + "model_name": "gpt-3.5-turbo", + "litellm_params": { # params for litellm completion/embedding call + "model": "azure/chatgpt-functioncalling", "api_key": os.getenv("AZURE_API_KEY"), "api_version": os.getenv("AZURE_API_VERSION"), "api_base": os.getenv("AZURE_API_BASE") } }, { - "model_name": "gpt-3.5-turbo", - "litellm_params": { # params for litellm completion/embedding call - "model": "gpt-3.5-turbo", + "model_name": "gpt-3.5-turbo", + "litellm_params": { # params for litellm completion/embedding call + "model": "gpt-3.5-turbo", "api_key": os.getenv("OPENAI_API_KEY"), } }, { - "model_name": "gpt-4", - "litellm_params": { # params for litellm completion/embedding call - "model": "azure/gpt-4", + "model_name": "gpt-4", + "litellm_params": { # params for litellm completion/embedding call + "model": "azure/gpt-4", "api_key": os.getenv("AZURE_API_KEY"), "api_base": os.getenv("AZURE_API_BASE"), "api_version": os.getenv("AZURE_API_VERSION"), } }, { - "model_name": "gpt-4", - "litellm_params": { # params for litellm completion/embedding call - "model": "gpt-4", + "model_name": "gpt-4", + "litellm_params": { # params for litellm completion/embedding call + "model": "gpt-4", "api_key": os.getenv("OPENAI_API_KEY"), } }, @@ -72,14 +72,14 @@ router = Router(model_list=model_list) # openai.ChatCompletion.create replacement # requests with model="gpt-3.5-turbo" will pick a deployment where model_name="gpt-3.5-turbo" -response = await router.acompletion(model="gpt-3.5-turbo", +response = await router.acompletion(model="gpt-3.5-turbo", messages=[{"role": "user", "content": "Hey, how's it going?"}]) print(response) # openai.ChatCompletion.create replacement # requests with model="gpt-4" will pick a deployment where model_name="gpt-4" -response = await router.acompletion(model="gpt-4", +response = await router.acompletion(model="gpt-4", messages=[{"role": "user", "content": "Hey, how's it going?"}]) print(response) @@ -98,7 +98,7 @@ print(response) ### Advanced #### Routing Strategies - Weighted Pick, Rate Limit Aware, Least Busy, Latency Based -Router provides 4 strategies for routing your calls across multiple deployments: +Router provides 4 strategies for routing your calls across multiple deployments: @@ -111,16 +111,13 @@ It caches, and updates the response times for deployments based on when a reques [**How to test**](https://github.com/BerriAI/litellm/blob/main/litellm/tests/test_lowest_latency_routing.py) 
```python -from litellm import Router +from litellm import Router import asyncio model_list = [{ ... }] # init router -router = Router(model_list=model_list, - routing_strategy="latency-based-routing",# 👈 set routing strategy - enable_pre_call_check=True, # enables router rate limits for concurrent calls - ) +router = Router(model_list=model_list, routing_strategy="latency-based-routing") # 👈 set routing strategy ## CALL 1+2 tasks = [] @@ -131,7 +128,7 @@ for _ in range(2): response = await asyncio.gather(*tasks) if response is not None: - ## CALL 3 + ## CALL 3 await asyncio.sleep(1) # let the cache update happen picked_deployment = router.lowestlatency_logger.get_available_deployments( model_group=model, healthy_deployments=router.healthy_deployments @@ -145,12 +142,12 @@ if response is not None: ) ``` -### Set Time Window +### Set Time Window -Set time window for how far back to consider when averaging latency for a deployment. +Set time window for how far back to consider when averaging latency for a deployment. **In Router** -```python +```python router = Router(..., routing_strategy_args={"ttl": 10}) ``` @@ -169,12 +166,12 @@ router_settings: If `rpm` or `tpm` is not provided, it randomly picks a deployment ```python -from litellm import Router +from litellm import Router import asyncio -model_list = [{ # list of model deployments - "model_name": "gpt-3.5-turbo", # model alias - "litellm_params": { # params for litellm completion/embedding call +model_list = [{ # list of model deployments + "model_name": "gpt-3.5-turbo", # model alias + "litellm_params": { # params for litellm completion/embedding call "model": "azure/chatgpt-v-2", # actual model name "api_key": os.getenv("AZURE_API_KEY"), "api_version": os.getenv("AZURE_API_VERSION"), @@ -182,18 +179,18 @@ model_list = [{ # list of model deployments "rpm": 900, # requests per minute for this API } }, { - "model_name": "gpt-3.5-turbo", - "litellm_params": { # params for litellm completion/embedding call - "model": "azure/chatgpt-functioncalling", + "model_name": "gpt-3.5-turbo", + "litellm_params": { # params for litellm completion/embedding call + "model": "azure/chatgpt-functioncalling", "api_key": os.getenv("AZURE_API_KEY"), "api_version": os.getenv("AZURE_API_VERSION"), "api_base": os.getenv("AZURE_API_BASE"), "rpm": 10, } }, { - "model_name": "gpt-3.5-turbo", - "litellm_params": { # params for litellm completion/embedding call - "model": "gpt-3.5-turbo", + "model_name": "gpt-3.5-turbo", + "litellm_params": { # params for litellm completion/embedding call + "model": "gpt-3.5-turbo", "api_key": os.getenv("OPENAI_API_KEY"), "rpm": 10, } @@ -203,7 +200,7 @@ model_list = [{ # list of model deployments router = Router(model_list=model_list, routing_strategy="simple-shuffle") async def router_acompletion(): response = await router.acompletion( - model="gpt-3.5-turbo", + model="gpt-3.5-turbo", messages=[{"role": "user", "content": "Hey, how's it going?"}] ) print(response) @@ -214,33 +211,33 @@ asyncio.run(router_acompletion()) -This will route to the deployment with the lowest TPM usage for that minute. +This will route to the deployment with the lowest TPM usage for that minute. -In production, we use Redis to track usage (TPM/RPM) across multiple deployments. +In production, we use Redis to track usage (TPM/RPM) across multiple deployments. -If you pass in the deployment's tpm/rpm limits, this will also check against that, and filter out any who's limits would be exceeded. 
+If you pass in the deployment's tpm/rpm limits, this will also check against that, and filter out any who's limits would be exceeded. -For Azure, your RPM = TPM/6. +For Azure, your RPM = TPM/6. ```python -from litellm import Router +from litellm import Router -model_list = [{ # list of model deployments - "model_name": "gpt-3.5-turbo", # model alias - "litellm_params": { # params for litellm completion/embedding call +model_list = [{ # list of model deployments + "model_name": "gpt-3.5-turbo", # model alias + "litellm_params": { # params for litellm completion/embedding call "model": "azure/chatgpt-v-2", # actual model name "api_key": os.getenv("AZURE_API_KEY"), "api_version": os.getenv("AZURE_API_VERSION"), "api_base": os.getenv("AZURE_API_BASE") - }, + }, "tpm": 100000, "rpm": 10000, }, { - "model_name": "gpt-3.5-turbo", - "litellm_params": { # params for litellm completion/embedding call - "model": "azure/chatgpt-functioncalling", + "model_name": "gpt-3.5-turbo", + "litellm_params": { # params for litellm completion/embedding call + "model": "azure/chatgpt-functioncalling", "api_key": os.getenv("AZURE_API_KEY"), "api_version": os.getenv("AZURE_API_VERSION"), "api_base": os.getenv("AZURE_API_BASE") @@ -248,24 +245,22 @@ model_list = [{ # list of model deployments "tpm": 100000, "rpm": 1000, }, { - "model_name": "gpt-3.5-turbo", - "litellm_params": { # params for litellm completion/embedding call - "model": "gpt-3.5-turbo", + "model_name": "gpt-3.5-turbo", + "litellm_params": { # params for litellm completion/embedding call + "model": "gpt-3.5-turbo", "api_key": os.getenv("OPENAI_API_KEY"), }, "tpm": 100000, "rpm": 1000, }] -router = Router(model_list=model_list, - redis_host=os.environ["REDIS_HOST"], - redis_password=os.environ["REDIS_PASSWORD"], - redis_port=os.environ["REDIS_PORT"], - routing_strategy="usage-based-routing", - enable_pre_call_check=True, # enables router rate limits for concurrent calls - ) +router = Router(model_list=model_list, + redis_host=os.environ["REDIS_HOST"], + redis_password=os.environ["REDIS_PASSWORD"], + redis_port=os.environ["REDIS_PORT"], + routing_strategy="usage-based-routing") -response = await router.acompletion(model="gpt-3.5-turbo", +response = await router.acompletion(model="gpt-3.5-turbo", messages=[{"role": "user", "content": "Hey, how's it going?"}] print(response) @@ -281,29 +276,29 @@ Picks a deployment with the least number of ongoing calls, it's handling. 
[**How to test**](https://github.com/BerriAI/litellm/blob/main/litellm/tests/test_least_busy_routing.py) ```python -from litellm import Router +from litellm import Router import asyncio -model_list = [{ # list of model deployments - "model_name": "gpt-3.5-turbo", # model alias - "litellm_params": { # params for litellm completion/embedding call +model_list = [{ # list of model deployments + "model_name": "gpt-3.5-turbo", # model alias + "litellm_params": { # params for litellm completion/embedding call "model": "azure/chatgpt-v-2", # actual model name "api_key": os.getenv("AZURE_API_KEY"), "api_version": os.getenv("AZURE_API_VERSION"), "api_base": os.getenv("AZURE_API_BASE"), } }, { - "model_name": "gpt-3.5-turbo", - "litellm_params": { # params for litellm completion/embedding call - "model": "azure/chatgpt-functioncalling", + "model_name": "gpt-3.5-turbo", + "litellm_params": { # params for litellm completion/embedding call + "model": "azure/chatgpt-functioncalling", "api_key": os.getenv("AZURE_API_KEY"), "api_version": os.getenv("AZURE_API_VERSION"), "api_base": os.getenv("AZURE_API_BASE"), } }, { - "model_name": "gpt-3.5-turbo", - "litellm_params": { # params for litellm completion/embedding call - "model": "gpt-3.5-turbo", + "model_name": "gpt-3.5-turbo", + "litellm_params": { # params for litellm completion/embedding call + "model": "gpt-3.5-turbo", "api_key": os.getenv("OPENAI_API_KEY"), } }] @@ -312,7 +307,7 @@ model_list = [{ # list of model deployments router = Router(model_list=model_list, routing_strategy="least-busy") async def router_acompletion(): response = await router.acompletion( - model="gpt-3.5-turbo", + model="gpt-3.5-turbo", messages=[{"role": "user", "content": "Hey, how's it going?"}] ) print(response) @@ -327,18 +322,18 @@ asyncio.run(router_acompletion()) ## Basic Reliability -### Timeouts +### Timeouts -The timeout set in router is for the entire length of the call, and is passed down to the completion() call level as well. +The timeout set in router is for the entire length of the call, and is passed down to the completion() call level as well. **Global Timeouts** ```python -from litellm import Router +from litellm import Router model_list = [{...}] -router = Router(model_list=model_list, - timeout=30) # raise timeout error if call takes > 30s +router = Router(model_list=model_list, + timeout=30) # raise timeout error if call takes > 30s print(response) ``` @@ -346,7 +341,7 @@ print(response) **Timeouts per model** ```python -from litellm import Router +from litellm import Router import asyncio model_list = [{ @@ -365,7 +360,7 @@ model_list = [{ router = Router(model_list=model_list, routing_strategy="least-busy") async def router_acompletion(): response = await router.acompletion( - model="gpt-3.5-turbo", + model="gpt-3.5-turbo", messages=[{"role": "user", "content": "Hey, how's it going?"}] ) print(response) @@ -375,20 +370,20 @@ asyncio.run(router_acompletion()) ``` ### Cooldowns -Set the limit for how many calls a model is allowed to fail in a minute, before being cooled down for a minute. +Set the limit for how many calls a model is allowed to fail in a minute, before being cooled down for a minute. ```python from litellm import Router model_list = [{...}] -router = Router(model_list=model_list, - allowed_fails=1) # cooldown model if it fails > 1 call in a minute. +router = Router(model_list=model_list, + allowed_fails=1) # cooldown model if it fails > 1 call in a minute. user_message = "Hello, whats the weather in San Francisco??" 
messages = [{"content": user_message, "role": "user"}] -# normal call +# normal call response = router.completion(model="gpt-3.5-turbo", messages=messages) print(f"response: {response}") @@ -397,55 +392,55 @@ print(f"response: {response}") ### Retries -For both async + sync functions, we support retrying failed requests. +For both async + sync functions, we support retrying failed requests. -For RateLimitError we implement exponential backoffs +For RateLimitError we implement exponential backoffs -For generic errors, we retry immediately +For generic errors, we retry immediately -Here's a quick look at how we can set `num_retries = 3`: +Here's a quick look at how we can set `num_retries = 3`: -```python +```python from litellm import Router model_list = [{...}] -router = Router(model_list=model_list, +router = Router(model_list=model_list, num_retries=3) user_message = "Hello, whats the weather in San Francisco??" messages = [{"content": user_message, "role": "user"}] -# normal call +# normal call response = router.completion(model="gpt-3.5-turbo", messages=messages) print(f"response: {response}") ``` -We also support setting minimum time to wait before retrying a failed request. This is via the `retry_after` param. +We also support setting minimum time to wait before retrying a failed request. This is via the `retry_after` param. -```python +```python from litellm import Router model_list = [{...}] -router = Router(model_list=model_list, +router = Router(model_list=model_list, num_retries=3, retry_after=5) # waits min 5s before retrying request user_message = "Hello, whats the weather in San Francisco??" messages = [{"content": user_message, "role": "user"}] -# normal call +# normal call response = router.completion(model="gpt-3.5-turbo", messages=messages) print(f"response: {response}") ``` -### Fallbacks +### Fallbacks -If a call fails after num_retries, fall back to another model group. +If a call fails after num_retries, fall back to another model group. -If the error is a context window exceeded error, fall back to a larger model group (if given). +If the error is a context window exceeded error, fall back to a larger model group (if given). Fallbacks are done in-order - ["gpt-3.5-turbo, "gpt-4", "gpt-4-32k"], will do 'gpt-3.5-turbo' first, then 'gpt-4', etc. 
@@ -453,52 +448,52 @@ Fallbacks are done in-order - ["gpt-3.5-turbo, "gpt-4", "gpt-4-32k"], will do 'g from litellm import Router model_list = [ - { # list of model deployments - "model_name": "azure/gpt-3.5-turbo", # openai model name - "litellm_params": { # params for litellm completion/embedding call - "model": "azure/chatgpt-v-2", + { # list of model deployments + "model_name": "azure/gpt-3.5-turbo", # openai model name + "litellm_params": { # params for litellm completion/embedding call + "model": "azure/chatgpt-v-2", "api_key": "bad-key", "api_version": os.getenv("AZURE_API_VERSION"), "api_base": os.getenv("AZURE_API_BASE") }, "tpm": 240000, "rpm": 1800 - }, - { # list of model deployments - "model_name": "azure/gpt-3.5-turbo-context-fallback", # openai model name - "litellm_params": { # params for litellm completion/embedding call - "model": "azure/chatgpt-v-2", + }, + { # list of model deployments + "model_name": "azure/gpt-3.5-turbo-context-fallback", # openai model name + "litellm_params": { # params for litellm completion/embedding call + "model": "azure/chatgpt-v-2", "api_key": "bad-key", "api_version": os.getenv("AZURE_API_VERSION"), "api_base": os.getenv("AZURE_API_BASE") }, "tpm": 240000, "rpm": 1800 - }, + }, { - "model_name": "azure/gpt-3.5-turbo", # openai model name - "litellm_params": { # params for litellm completion/embedding call - "model": "azure/chatgpt-functioncalling", + "model_name": "azure/gpt-3.5-turbo", # openai model name + "litellm_params": { # params for litellm completion/embedding call + "model": "azure/chatgpt-functioncalling", "api_key": "bad-key", "api_version": os.getenv("AZURE_API_VERSION"), "api_base": os.getenv("AZURE_API_BASE") }, "tpm": 240000, "rpm": 1800 - }, + }, { - "model_name": "gpt-3.5-turbo", # openai model name - "litellm_params": { # params for litellm completion/embedding call - "model": "gpt-3.5-turbo", + "model_name": "gpt-3.5-turbo", # openai model name + "litellm_params": { # params for litellm completion/embedding call + "model": "gpt-3.5-turbo", "api_key": os.getenv("OPENAI_API_KEY"), }, "tpm": 1000000, "rpm": 9000 }, { - "model_name": "gpt-3.5-turbo-16k", # openai model name - "litellm_params": { # params for litellm completion/embedding call - "model": "gpt-3.5-turbo-16k", + "model_name": "gpt-3.5-turbo-16k", # openai model name + "litellm_params": { # params for litellm completion/embedding call + "model": "gpt-3.5-turbo-16k", "api_key": os.getenv("OPENAI_API_KEY"), }, "tpm": 1000000, @@ -507,8 +502,8 @@ model_list = [ ] -router = Router(model_list=model_list, - fallbacks=[{"azure/gpt-3.5-turbo": ["gpt-3.5-turbo"]}], +router = Router(model_list=model_list, + fallbacks=[{"azure/gpt-3.5-turbo": ["gpt-3.5-turbo"]}], context_window_fallbacks=[{"azure/gpt-3.5-turbo-context-fallback": ["gpt-3.5-turbo-16k"]}, {"gpt-3.5-turbo": ["gpt-3.5-turbo-16k"]}], set_verbose=True) @@ -516,7 +511,7 @@ router = Router(model_list=model_list, user_message = "Hello, whats the weather in San Francisco??" messages = [{"content": user_message, "role": "user"}] -# normal fallback call +# normal fallback call response = router.completion(model="azure/gpt-3.5-turbo", messages=messages) # context window fallback call @@ -527,12 +522,12 @@ print(f"response: {response}") ### Caching -In production, we recommend using a Redis cache. For quickly testing things locally, we also support simple in-memory caching. +In production, we recommend using a Redis cache. For quickly testing things locally, we also support simple in-memory caching. 
**In-memory Cache** ```python -router = Router(model_list=model_list, +router = Router(model_list=model_list, cache_responses=True) print(response) @@ -540,19 +535,19 @@ print(response) **Redis Cache** ```python -router = Router(model_list=model_list, - redis_host=os.getenv("REDIS_HOST"), - redis_password=os.getenv("REDIS_PASSWORD"), +router = Router(model_list=model_list, + redis_host=os.getenv("REDIS_HOST"), + redis_password=os.getenv("REDIS_PASSWORD"), redis_port=os.getenv("REDIS_PORT"), cache_responses=True) print(response) ``` -**Pass in Redis URL, additional kwargs** -```python +**Pass in Redis URL, additional kwargs** +```python router = Router(model_list: Optional[list] = None, - ## CACHING ## + ## CACHING ## redis_url=os.getenv("REDIS_URL")", cache_kwargs= {}, # additional kwargs to pass to RedisCache (see caching.py) cache_responses=True) @@ -560,18 +555,14 @@ router = Router(model_list: Optional[list] = None, ## Pre-Call Checks (Context Window) -Enable pre-call checks to filter out: -1. deployments with context window limit < messages for a call. -2. deployments that have exceeded rate limits when making concurrent calls. (eg. `asyncio.gather(*[ - router.acompletion(model="gpt-3.5-turbo", messages=m) for m in list_of_messages - ])`) +Enable pre-call checks to filter out deployments with context window limit < messages for a call. **1. Enable pre-call checks** -```python -from litellm import Router +```python +from litellm import Router # ... router = Router(model_list=model_list, enable_pre_call_checks=True) # 👈 Set to True ``` @@ -579,7 +570,7 @@ router = Router(model_list=model_list, enable_pre_call_checks=True) # 👈 Set t **2. Set Model List** -For azure deployments, set the base model. Pick the base model from [this list](https://github.com/BerriAI/litellm/blob/main/model_prices_and_context_window.json), all the azure models start with `azure/`. +For azure deployments, set the base model. Pick the base model from [this list](https://github.com/BerriAI/litellm/blob/main/model_prices_and_context_window.json), all the azure models start with `azure/`. @@ -607,7 +598,7 @@ model_list = [ }, ] -router = Router(model_list=model_list, enable_pre_call_checks=True) +router = Router(model_list=model_list, enable_pre_call_checks=True) ``` @@ -636,7 +627,7 @@ model_list = [ }, }, { - "model_name": "claude-opus", + "model_name": "claude-opus", "litellm_params": { call "model": "claude-3-opus-20240229", "api_key": os.getenv("ANTHROPIC_API_KEY"), @@ -644,7 +635,7 @@ model_list = [ }, ] -router = Router(model_list=model_list, enable_pre_call_checks=True, context_window_fallbacks=[{"gpt-3.5-turbo-small": ["gpt-3.5-turbo-large", "claude-opus"]}]) +router = Router(model_list=model_list, enable_pre_call_checks=True, context_window_fallbacks=[{"gpt-3.5-turbo-small": ["gpt-3.5-turbo-large", "claude-opus"]}]) ``` @@ -673,7 +664,7 @@ model_list = [ "api_base": os.getenv("AZURE_API_BASE"), }, "model_info": { - "base_model": "azure/gpt-35-turbo", + "base_model": "azure/gpt-35-turbo", } }, { @@ -685,7 +676,7 @@ model_list = [ }, ] -router = Router(model_list=model_list, enable_pre_call_checks=True) +router = Router(model_list=model_list, enable_pre_call_checks=True) text = "What is the meaning of 42?" * 5000 @@ -710,11 +701,11 @@ Go [here](./proxy/reliability.md#advanced---context-window-fallbacks) for how to ## Caching across model groups -If you want to cache across 2 different model groups (e.g. azure deployments, and openai), use caching groups. 
+If you want to cache across 2 different model groups (e.g. azure deployments, and openai), use caching groups. ```python import litellm, asyncio, time -from litellm import Router +from litellm import Router # set os env os.environ["OPENAI_API_KEY"] = "" @@ -722,8 +713,8 @@ os.environ["AZURE_API_KEY"] = "" os.environ["AZURE_API_BASE"] = "" os.environ["AZURE_API_VERSION"] = "" -async def test_acompletion_caching_on_router_caching_groups(): - # tests acompletion + caching on router +async def test_acompletion_caching_on_router_caching_groups(): + # tests acompletion + caching on router try: litellm.set_verbose = True model_list = [ @@ -749,8 +740,8 @@ async def test_acompletion_caching_on_router_caching_groups(): {"role": "user", "content": f"write a one sentence poem {time.time()}?"} ] start_time = time.time() - router = Router(model_list=model_list, - cache_responses=True, + router = Router(model_list=model_list, + cache_responses=True, caching_groups=[("openai-gpt-3.5-turbo", "azure-gpt-3.5-turbo")]) response1 = await router.acompletion(model="openai-gpt-3.5-turbo", messages=messages, temperature=1) print(f"response1: {response1}") @@ -777,9 +768,9 @@ Step 1. Router Setup from litellm import Router model_list = [ - { # list of model deployments - "model_name": "gpt-4-preview", # model alias - "litellm_params": { # params for litellm completion/embedding call + { # list of model deployments + "model_name": "gpt-4-preview", # model alias + "litellm_params": { # params for litellm completion/embedding call "model": "azure/chatgpt-v-2", # actual model name "api_key": os.getenv("AZURE_API_KEY"), "api_version": os.getenv("AZURE_API_VERSION"), @@ -788,11 +779,11 @@ model_list = [ "model_info": { "base_model": "azure/gpt-4-1106-preview" # azure/gpt-4-1106-preview will be used for cost tracking, ensure this exists in litellm model_prices_and_context_window.json } - }, + }, { - "model_name": "gpt-4-32k", - "litellm_params": { # params for litellm completion/embedding call - "model": "azure/chatgpt-functioncalling", + "model_name": "gpt-4-32k", + "litellm_params": { # params for litellm completion/embedding call + "model": "azure/chatgpt-functioncalling", "api_key": os.getenv("AZURE_API_KEY"), "api_version": os.getenv("AZURE_API_VERSION"), "api_base": os.getenv("AZURE_API_BASE") @@ -813,8 +804,8 @@ Step 2. Access `response_cost` in the custom callback, **litellm calculates the import litellm from litellm.integrations.custom_logger import CustomLogger -class MyCustomHandler(CustomLogger): - def log_success_event(self, kwargs, response_obj, start_time, end_time): +class MyCustomHandler(CustomLogger): + def log_success_event(self, kwargs, response_obj, start_time, end_time): print(f"On Success") response_cost = kwargs.get("response_cost") print("response_cost=", response_cost) @@ -824,7 +815,7 @@ litellm.callbacks = [customHandler] # router completion call response = router.completion( - model="gpt-4-32k", + model="gpt-4-32k", messages=[{ "role": "user", "content": "Hi who are you"}] ) ``` @@ -832,28 +823,28 @@ response = router.completion( #### Default litellm.completion/embedding params -You can also set default params for litellm completion/embedding calls. Here's how to do that: +You can also set default params for litellm completion/embedding calls. 
Here's how to do that: -```python +```python from litellm import Router fallback_dict = {"gpt-3.5-turbo": "gpt-3.5-turbo-16k"} -router = Router(model_list=model_list, +router = Router(model_list=model_list, default_litellm_params={"context_window_fallback_dict": fallback_dict}) user_message = "Hello, whats the weather in San Francisco??" messages = [{"content": user_message, "role": "user"}] -# normal call +# normal call response = router.completion(model="gpt-3.5-turbo", messages=messages) print(f"response: {response}") ``` -## Custom Callbacks - Track API Key, API Endpoint, Model Used +## Custom Callbacks - Track API Key, API Endpoint, Model Used -If you need to track the api_key, api endpoint, model, custom_llm_provider used for each completion call, you can setup a [custom callback](https://docs.litellm.ai/docs/observability/custom_callback) +If you need to track the api_key, api endpoint, model, custom_llm_provider used for each completion call, you can setup a [custom callback](https://docs.litellm.ai/docs/observability/custom_callback) ### Usage @@ -861,8 +852,8 @@ If you need to track the api_key, api endpoint, model, custom_llm_provider used import litellm from litellm.integrations.custom_logger import CustomLogger -class MyCustomHandler(CustomLogger): - def log_success_event(self, kwargs, response_obj, start_time, end_time): +class MyCustomHandler(CustomLogger): + def log_success_event(self, kwargs, response_obj, start_time, end_time): print(f"On Success") print("kwargs=", kwargs) litellm_params= kwargs.get("litellm_params") @@ -877,7 +868,7 @@ class MyCustomHandler(CustomLogger): print("custom_llm_provider=", custom_llm_provider) print("response_cost=", response_cost) - def log_failure_event(self, kwargs, response_obj, start_time, end_time): + def log_failure_event(self, kwargs, response_obj, start_time, end_time): print(f"On Failure") print("kwargs=") @@ -890,12 +881,12 @@ router = Router(model_list=model_list, routing_strategy="simple-shuffle") # router completion call response = router.completion( - model="gpt-3.5-turbo", + model="gpt-3.5-turbo", messages=[{ "role": "user", "content": "Hi who are you"}] ) ``` -## Deploy Router +## Deploy Router If you want a server to load balance across different LLM APIs, use our [OpenAI Proxy Server](./simple_proxy#load-balancing---multiple-instances-of-1-model) @@ -905,7 +896,7 @@ If you want a server to load balance across different LLM APIs, use our [OpenAI ```python def __init__( model_list: Optional[list] = None, - + ## CACHING ## redis_url: Optional[str] = None, redis_host: Optional[str] = None, From e96d97d9e58dc26568d2b53afbe2abaa97a2913d Mon Sep 17 00:00:00 2001 From: CLARKBENHAM Date: Mon, 8 Apr 2024 21:31:21 -0700 Subject: [PATCH 3/3] remove formating changes --- docs/my-website/docs/routing.md | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/docs/my-website/docs/routing.md b/docs/my-website/docs/routing.md index 404c72e44..85649eb6b 100644 --- a/docs/my-website/docs/routing.md +++ b/docs/my-website/docs/routing.md @@ -117,7 +117,10 @@ import asyncio model_list = [{ ... 
 }] # init router
-router = Router(model_list=model_list, routing_strategy="latency-based-routing") # 👈 set routing strategy
+router = Router(model_list=model_list, 
+                routing_strategy="latency-based-routing", # 👈 set routing strategy
+                enable_pre_call_checks=True, # enables router rate limits for concurrent calls
+                )
 
 ## CALL 1+2
 tasks = []
@@ -257,8 +260,9 @@ router = Router(model_list=model_list, 
                redis_host=os.environ["REDIS_HOST"], 
                redis_password=os.environ["REDIS_PASSWORD"], 
                redis_port=os.environ["REDIS_PORT"], 
-               routing_strategy="usage-based-routing")
-                
+               routing_strategy="usage-based-routing",
+               enable_pre_call_checks=True, # enables router rate limits for concurrent calls
+               )
 
 response = await router.acompletion(model="gpt-3.5-turbo", 
 					messages=[{"role": "user", "content": "Hey, how's it going?"}]
@@ -555,7 +559,33 @@ router = Router(model_list: Optional[list] = None, 
 
 ## Pre-Call Checks (Context Window)
 
-Enable pre-call checks to filter out deployments with context window limit < messages for a call.
+Enable pre-call checks to filter out:
+1. deployments whose context window is too small for the messages in a call.
+2. deployments that have exceeded their rate limits when making concurrent calls (e.g. `asyncio.gather(*[
+        router.acompletion(model="gpt-3.5-turbo", messages=m) for m in list_of_messages
+    ])`) - see the example below.
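+
+For example, here's a rough sketch of fanning out concurrent calls once pre-call checks are enabled (set up `model_list` as shown in the steps below):
+
+```python
+import asyncio
+from litellm import Router
+
+model_list = [{...}] # same format as the model_list examples below
+router = Router(model_list=model_list, enable_pre_call_checks=True) # 👈 Set to True
+
+async def main():
+    list_of_messages = [
+        [{"role": "user", "content": f"Hey, how's it going? ({i})"}] for i in range(3)
+    ]
+    # deployments over their tpm/rpm limits are filtered out before each call
+    responses = await asyncio.gather(*[
+        router.acompletion(model="gpt-3.5-turbo", messages=m) for m in list_of_messages
+    ])
+    print(responses)
+
+asyncio.run(main())
+```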