diff --git a/docs/my-website/docs/routing.md b/docs/my-website/docs/routing.md
index 391b20b2f6..6e7a076bd5 100644
--- a/docs/my-website/docs/routing.md
+++ b/docs/my-website/docs/routing.md
@@ -1,5 +1,10 @@
+import Image from '@theme/IdealImage';
+
 # Reliability - Fallbacks, Azure Deployments, etc.
+
+<Image img={require('../img/multiple_deployment.png')} alt="HF_Dashboard"/>
+
+# Reliability
 
 LiteLLM helps prevent failed requests in 3 ways:
@@ -14,6 +19,99 @@ LiteLLM supports the following functions for reliability:
 * `completion()` with fallbacks: switch between models/keys/api bases in case of errors.
 * `router()`: An abstraction on top of completion + embeddings to route the request to a deployment with capacity (available tpm/rpm).
 
+## Manage Multiple Deployments
+
+Use this if you're trying to load-balance across multiple deployments (e.g. Azure/OpenAI).
+
+`Router` prevents failed requests by picking the deployment that is below its rate limit and has used the fewest tokens.
+
+In production, [Router connects to a Redis Cache](#redis-queue) to track usage across multiple deployments.
+
+### Quick Start
+
+```python
+import os
+from litellm import Router
+
+model_list = [{ # list of model deployments
+    "model_name": "gpt-3.5-turbo", # openai model name
+    "litellm_params": { # params for litellm completion/embedding call
+        "model": "azure/chatgpt-v-2",
+        "api_key": os.getenv("AZURE_API_KEY"),
+        "api_version": os.getenv("AZURE_API_VERSION"),
+        "api_base": os.getenv("AZURE_API_BASE")
+    }
+}, {
+    "model_name": "gpt-3.5-turbo", # openai model name
+    "litellm_params": { # params for litellm completion/embedding call
+        "model": "azure/chatgpt-functioncalling",
+        "api_key": os.getenv("AZURE_API_KEY"),
+        "api_version": os.getenv("AZURE_API_VERSION"),
+        "api_base": os.getenv("AZURE_API_BASE")
+    }
+}, {
+    "model_name": "gpt-3.5-turbo", # openai model name
+    "litellm_params": { # params for litellm completion/embedding call
+        "model": "gpt-3.5-turbo",
+        "api_key": os.getenv("OPENAI_API_KEY"),
+    }
+}]
+
+router = Router(model_list=model_list)
+
+# openai.ChatCompletion.create replacement
+response = router.completion(model="gpt-3.5-turbo",
+                             messages=[{"role": "user", "content": "Hey, how's it going?"}])
+
+print(response)
+```
+
+### Redis Queue
+
+In production, we use Redis to track usage across multiple Azure deployments.
+
+```python
+router = Router(model_list=model_list,
+                redis_host=os.getenv("REDIS_HOST"),
+                redis_password=os.getenv("REDIS_PASSWORD"),
+                redis_port=os.getenv("REDIS_PORT"))
+
+response = router.completion(model="gpt-3.5-turbo",
+                             messages=[{"role": "user", "content": "Hey, how's it going?"}])
+
+print(response)
+```
+
+### Deploy Router
+
+1. Clone repo
+```shell
+ git clone https://github.com/BerriAI/litellm
+```
+
+2. Create + Modify router_config.yaml (save your azure/openai/etc. deployment info)
+
+```shell
+cp ./router_config_template.yaml ./router_config.yaml
+```
+
+3. Build + Run docker image
+
+```shell
+docker build -t litellm-proxy . --build-arg CONFIG_FILE=./router_config.yaml
+```
+
+```shell
+docker run --name litellm-proxy -e PORT=8000 -p 8000:8000 litellm-proxy
+```
+
+### Test
+
+```shell
+curl 'http://0.0.0.0:8000/router/completions' \
+--header 'Content-Type: application/json' \
+--data '{
+    "model": "gpt-3.5-turbo",
+    "messages": [{"role": "user", "content": "Hey"}]
+}'
+```
+
 ## Retry failed requests
 
 Call it in completion like this `completion(..num_retries=2)`.
@@ -73,106 +171,6 @@ response = completion(model="azure/gpt-4", messages=messages, api_key=api_key,
 [Check out this section for implementation details](#fallbacks-1)
 
-## Manage Multiple Deployments
-
-Use this if you're trying to load-balance across multiple deployments (e.g. 
-Azure/OpenAI).
-
-`Router` prevents failed requests, by picking the deployment which is below rate-limit and has the least amount of tokens used.
-
-In production, [Router connects to a Redis Cache](#redis-queue) to track usage across multiple deployments.
-
-### Quick Start
-
-```python
-from litellm import Router
-
-model_list = [{ # list of model deployments
-    "model_name": "gpt-3.5-turbo", # openai model name
-    "litellm_params": { # params for litellm completion/embedding call
-        "model": "azure/chatgpt-v-2",
-        "api_key": os.getenv("AZURE_API_KEY"),
-        "api_version": os.getenv("AZURE_API_VERSION"),
-        "api_base": os.getenv("AZURE_API_BASE")
-    },
-    "tpm": 240000,
-    "rpm": 1800
-}, {
-    "model_name": "gpt-3.5-turbo", # openai model name
-    "litellm_params": { # params for litellm completion/embedding call
-        "model": "azure/chatgpt-functioncalling",
-        "api_key": os.getenv("AZURE_API_KEY"),
-        "api_version": os.getenv("AZURE_API_VERSION"),
-        "api_base": os.getenv("AZURE_API_BASE")
-    },
-    "tpm": 240000,
-    "rpm": 1800
-}, {
-    "model_name": "gpt-3.5-turbo", # openai model name
-    "litellm_params": { # params for litellm completion/embedding call
-        "model": "gpt-3.5-turbo",
-        "api_key": os.getenv("OPENAI_API_KEY"),
-    },
-    "tpm": 1000000,
-    "rpm": 9000
-}]
-
-router = Router(model_list=model_list)
-
-# openai.ChatCompletion.create replacement
-response = router.completion(model="gpt-3.5-turbo",
-                messages=[{"role": "user", "content": "Hey, how's it going?"}]
-
-print(response)
-```
-
-### Redis Queue
-
-In production, we use Redis to track usage across multiple Azure deployments.
-
-```python
-router = Router(model_list=model_list,
-                redis_host=os.getenv("REDIS_HOST"),
-                redis_password=os.getenv("REDIS_PASSWORD"),
-                redis_port=os.getenv("REDIS_PORT"))
-
-print(response)
-```
-
-### Deploy Router
-
-1. Clone repo
-```shell
- git clone https://github.com/BerriAI/litellm
-```
-
-2. Create + Modify router_config.yaml (save your azure/openai/etc. deployment info)
-
-```shell
-cp ./router_config_template.yaml ./router_config.yaml
-```
-
-3. Build + Run docker image
-
-```shell
-docker build -t litellm-proxy . --build-arg CONFIG_FILE=./router_config.yaml
-```
-
-```shell
-docker run --name litellm-proxy -e PORT=8000 -p 8000:8000 litellm-proxy
-```
-
-### Test
-
-```curl
-curl 'http://0.0.0.0:8000/router/completions' \
---header 'Content-Type: application/json' \
---data '{
-    "model": "gpt-3.5-turbo",
-    "messages": [{"role": "user", "content": "Hey"}]
-}'
-```
-
-
 ## Implementation Details
 
 ### Fallbacks
diff --git a/docs/my-website/img/multiple_deployment.png b/docs/my-website/img/multiple_deployment.png
new file mode 100644
index 0000000000..5f0c309427
Binary files /dev/null and b/docs/my-website/img/multiple_deployment.png differ
diff --git a/litellm/router.py b/litellm/router.py
index 33efb6dd07..90e8b37b67 100644
--- a/litellm/router.py
+++ b/litellm/router.py
@@ -32,7 +32,7 @@ class Router:
                  cache_responses: bool = False) -> None:
         if model_list:
             self.set_model_list(model_list)
-        self.healthy_deployments = []
+        self.healthy_deployments: List = []
         ### HEALTH CHECK THREAD ### - commenting out as further testing required
         self._start_health_check_thread()
@@ -168,7 +168,7 @@ class Router:
         data = deployment["litellm_params"]
         # call via litellm.completion()
-        return litellm.text_completion(**{**data, "prompt": prompt, "caching": self.cache_responses, **kwargs})
+        return litellm.text_completion(**{**data, "prompt": prompt, "caching": self.cache_responses, **kwargs}) # type: ignore
 
     def embedding(self, model: str,
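
Below is a minimal usage sketch, separate from the patch itself, of the `completion()`-level reliability knobs the docs above point to (`num_retries` and fallbacks). It assumes the Azure/OpenAI API keys are set as environment variables and that `fallbacks` accepts a list of fallback model names, as described in the Fallbacks section of routing.md; neither detail comes from this diff.

```python
# Sketch only: per-call retries + fallbacks, per the routing.md docs above.
# Assumes AZURE_API_KEY / AZURE_API_BASE / AZURE_API_VERSION and OPENAI_API_KEY
# are set in the environment.
from litellm import completion

messages = [{"role": "user", "content": "Hey, how's it going?"}]

# retry the same model up to 2 times on transient errors
response = completion(model="gpt-3.5-turbo", messages=messages, num_retries=2)

# if the Azure deployment errors out, fall back to OpenAI's gpt-3.5-turbo
# (the `fallbacks` kwarg is documented in the Fallbacks section, not in this diff)
response = completion(
    model="azure/chatgpt-v-2",
    messages=messages,
    fallbacks=["gpt-3.5-turbo"],
)

print(response)
```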