diff --git a/docs/my-website/docs/routing.md b/docs/my-website/docs/routing.md
index b3a3298dea..cb3722c229 100644
--- a/docs/my-website/docs/routing.md
+++ b/docs/my-website/docs/routing.md
@@ -64,37 +64,7 @@ print(response)
 
 ### Deploy Router
 
-1. Clone repo
-```shell
- git clone https://github.com/BerriAI/litellm
-```
-
-2. Create + Modify router_config.yaml (save your azure/openai/etc. deployment info)
-
-```shell
-cp ./router_config_template.yaml ./router_config.yaml
-```
-
-3. Build + Run docker image
-
-```shell
-docker build -t litellm-proxy . --build-arg CONFIG_FILE=./router_config.yaml
-```
-
-```shell
-docker run --name litellm-proxy -e PORT=8000 -p 8000:8000 litellm-proxy
-```
-
-### Test
-
-```curl
-curl 'http://0.0.0.0:8000/router/completions' \
---header 'Content-Type: application/json' \
---data '{
-    "model": "gpt-3.5-turbo",
-    "messages": [{"role": "user", "content": "Hey"}]
-}'
-```
+If you want a server that just routes requests to different LLM APIs, use our [OpenAI Proxy Server](./simple_proxy.md)
 
 ## Retry failed requests
diff --git a/docs/my-website/docs/simple_proxy.md b/docs/my-website/docs/simple_proxy.md
index 4b3ac7e30d..8040af8741 100644
--- a/docs/my-website/docs/simple_proxy.md
+++ b/docs/my-website/docs/simple_proxy.md
@@ -8,6 +8,9 @@ LiteLLM Server manages:
 
 * Calling 100+ LLMs [Huggingface/Bedrock/TogetherAI/etc.](#other-supported-models) in the OpenAI `ChatCompletions` & `Completions` format
 * Set custom prompt templates + model-specific configs (`temperature`, `max_tokens`, etc.)
+* Routing between [Multiple Models](#multiple-models---quick-start) + [Deployments of the same model](#multiple-instances-of-1-model)
+
+[**See code**](https://github.com/BerriAI/litellm/tree/main/litellm/proxy)
 
 ## Quick Start
 View all the supported args for the Proxy CLI [here](https://docs.litellm.ai/docs/simple_proxy#proxy-cli-arguments)
@@ -593,8 +596,11 @@ model_list:
       api_key: sk-claude
 ```
 
-#### Default Model - Config:
+:::info
+ The proxy uses the first model in the config as the default model - in this config the default model is `zephyr-alpha`
+:::
+
 
 #### Step 2: Start Proxy with config
@@ -602,11 +608,7 @@ The proxy uses the first model in the conf
 ```shell
 $ litellm --config /path/to/config.yaml
 ```
 
-#### Step 3: Start Proxy with config
-
-If you're repo let's you set model name, you can call the specific model by just passing in that model's name
-
-
-#### Step 4: Use proxy
+#### Step 3: Use proxy
 Curl Command
 ```shell
 curl --location 'http://0.0.0.0:8000/chat/completions' \
 --header 'Content-Type: application/json' \
 --data ' {
       "model": "gpt-3.5-turbo",
       "messages": [
         {
           "role": "user",
           "content": "what llm are you"
         }
       ],
     }
 '
 ```
@@ -703,6 +705,28 @@ model_list:
       api_base: http://0.0.0.0:8003
 ```
 
+#### Step 2: Start Proxy with config
+
+```shell
+$ litellm --config /path/to/config.yaml
+```
+
+#### Step 3: Use proxy
+Curl Command
+```shell
+curl --location 'http://0.0.0.0:8000/chat/completions' \
+--header 'Content-Type: application/json' \
+--data ' {
+      "model": "zephyr-beta",
+      "messages": [
+        {
+          "role": "user",
+          "content": "what llm are you"
+        }
+      ]
+    }
+'
+```
 
 ### Set Custom Prompt Templates
diff --git a/litellm/router.py b/litellm/router.py
index ecb73ece1b..ca5b5511b9 100644
--- a/litellm/router.py
+++ b/litellm/router.py
@@ -153,7 +153,10 @@ class Router:
         for current_attempt in range(self.num_retries):
             try:
                 # if the function call is successful, no exception will be raised and we'll break out of the loop
-                return await original_function(*args, **kwargs)
+                response = await original_function(*args, **kwargs)
+                if asyncio.iscoroutine(response): # async errors are often returned as coroutines
+                    response = await response
+                return response
             except openai.RateLimitError as e:
                 # on RateLimitError we'll wait for an exponential time before trying again
@@ -231,6 +234,8 @@ class Router:
             deployment = self.get_available_deployment(model=model, messages=messages)
             data = deployment["litellm_params"]
             response = await litellm.acompletion(**{**data, "messages": messages, "caching": self.cache_responses, **kwargs})
+            if asyncio.iscoroutine(response): # async errors are often returned as coroutines
+                response = await response
             return response
         except Exception as e:
             kwargs["model"] = model
diff --git a/litellm/utils.py b/litellm/utils.py
index 682d6dec3e..4a3f4db9b7 100644
--- a/litellm/utils.py
+++ b/litellm/utils.py
@@ -1057,12 +1057,12 @@ def client(original_function):
         if litellm.client_session is None:
             litellm.client_session = httpx.Client(
                 limits=httpx.Limits(max_connections=100, max_keepalive_connections=20),
-                timeout = httpx.Timeout(timeout=600.0, connect=5.0)
+                timeout = None
             )
         if litellm.aclient_session is None:
             litellm.aclient_session = httpx.AsyncClient(
                 limits=httpx.Limits(max_connections=100, max_keepalive_connections=20),
-                timeout = httpx.Timeout(timeout=600.0, connect=5.0)
+                timeout = None
             )
         if litellm.use_client or ("use_client" in kwargs and kwargs["use_client"] == True):
            print_verbose(f"litedebugger initialized")
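
The retry change in `litellm/router.py` handles the case where the awaited call itself hands back an un-awaited coroutine (as the inline comment notes, async errors are often returned that way), awaiting it once more before returning. A minimal standalone sketch of that pattern, assuming a hypothetical `call_llm` helper with a `defer` flag that is not part of litellm:

```python
import asyncio

# Hypothetical stand-in for an async LLM call: sometimes it resolves to a value,
# sometimes it resolves to another (un-awaited) coroutine, mimicking the case the
# router guards against.
async def call_llm(prompt: str, defer: bool = False):
    async def _inner():
        return f"response to: {prompt}"
    if defer:
        return _inner()      # caller receives a coroutine object, not the result
    return await _inner()

async def call_with_retries(prompt: str, num_retries: int = 3):
    for attempt in range(num_retries):
        try:
            response = await call_llm(prompt, defer=(attempt == 0))
            if asyncio.iscoroutine(response):  # result may itself be a coroutine
                response = await response      # await it to get the real value
            return response
        except Exception:
            await asyncio.sleep(2 ** attempt)  # exponential backoff between retries
    raise RuntimeError("all retries failed")

if __name__ == "__main__":
    print(asyncio.run(call_with_retries("hey")))  # -> "response to: hey"
```

The same check-and-await guard appears in both the retry wrapper and the `acompletion` path above, so callers always receive a resolved response rather than a coroutine.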
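The `litellm/utils.py` change swaps the explicit `httpx.Timeout(timeout=600.0, connect=5.0)` for `timeout=None`, which turns off httpx's client-side timeout entirely (httpx otherwise defaults to 5 seconds) rather than capping requests at 600 seconds. A small sketch of the two configurations using plain httpx, with nothing litellm-specific assumed:

```python
import httpx

# Old behavior: total request time capped at 600s, connection setup at 5s.
bounded = httpx.Client(
    limits=httpx.Limits(max_connections=100, max_keepalive_connections=20),
    timeout=httpx.Timeout(timeout=600.0, connect=5.0),
)

# New behavior: no client-side timeout at all, so long-running LLM responses
# are never cut off by httpx (the default would otherwise be 5 seconds).
unbounded = httpx.Client(
    limits=httpx.Limits(max_connections=100, max_keepalive_connections=20),
    timeout=None,
)
```

The trade-off is that slow or hung upstream LLM calls are no longer cut off by the HTTP client, so any timeout has to be enforced elsewhere.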