Mirror of https://github.com/BerriAI/litellm.git
docs(routing.md): update tutorial on deploying router
Commit d9123ea2e8 (parent c4f51594d5), 4 changed files with 39 additions and 40 deletions
@@ -64,37 +64,7 @@ print(response)
 ### Deploy Router
 
-1. Clone repo
-
-```shell
-git clone https://github.com/BerriAI/litellm
-```
-
-2. Create + Modify router_config.yaml (save your azure/openai/etc. deployment info)
-
-```shell
-cp ./router_config_template.yaml ./router_config.yaml
-```
-
-3. Build + Run docker image
-
-```shell
-docker build -t litellm-proxy . --build-arg CONFIG_FILE=./router_config.yaml
-```
-
-```shell
-docker run --name litellm-proxy -e PORT=8000 -p 8000:8000 litellm-proxy
-```
-
-### Test
-
-```curl
-curl 'http://0.0.0.0:8000/router/completions' \
---header 'Content-Type: application/json' \
---data '{
-  "model": "gpt-3.5-turbo",
-  "messages": [{"role": "user", "content": "Hey"}]
-}'
-```
+If you want a server to just route requests to different LLM APIs, use our [OpenAI Proxy Server](./simple_proxy.md)
+
 
 ## Retry failed requests
 
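As context for the routing.md change above, here is a minimal sketch of using the in-process Router directly, covering both routing across a `model_list` and the retries discussed in the "Retry failed requests" section. The deployment parameters (keys, endpoints, model names) are placeholders, not values from this commit; only the `Router(model_list=..., num_retries=...)` and `router.completion(...)` call shape is taken from litellm.

```python
# Placeholder credentials/endpoints; only the Router call shape comes from litellm.
from litellm import Router

model_list = [
    {
        "model_name": "gpt-3.5-turbo",  # public name callers use
        "litellm_params": {             # what actually gets called
            "model": "azure/chatgpt-v-2",
            "api_key": "<your-azure-key>",
            "api_base": "<your-azure-endpoint>",
        },
    },
    {
        "model_name": "gpt-3.5-turbo",  # second deployment of the same public name
        "litellm_params": {"model": "gpt-3.5-turbo", "api_key": "<your-openai-key>"},
    },
]

router = Router(model_list=model_list, num_retries=2)  # retries: see the section above

response = router.completion(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "Hey"}],
)
print(response)
```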
@@ -8,6 +8,9 @@ LiteLLM Server manages:
 * Calling 100+ LLMs [Huggingface/Bedrock/TogetherAI/etc.](#other-supported-models) in the OpenAI `ChatCompletions` & `Completions` format
 * Set custom prompt templates + model-specific configs (`temperature`, `max_tokens`, etc.)
+* Routing between [Multiple Models](#multiple-models---quick-start) + [Deployments of the same model](#multiple-instances-of-1-model)
+
+[**See code**](https://github.com/BerriAI/litellm/tree/main/litellm/proxy)
 
 ## Quick Start
 
 View all the supported args for the Proxy CLI [here](https://docs.litellm.ai/docs/simple_proxy#proxy-cli-arguments)
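Because the server exposes the OpenAI `ChatCompletions` format, any OpenAI-compatible client can be pointed at it. A hedged sketch, assuming the proxy from this doc is running locally on port 8000 and serving the `zephyr-alpha` model from the config below (openai>=1.0 client style; the API key is a placeholder and whether it is checked depends on the proxy's auth settings):

```python
from openai import OpenAI

# Point the standard OpenAI client at the locally running LiteLLM server.
client = OpenAI(base_url="http://0.0.0.0:8000", api_key="sk-placeholder")

response = client.chat.completions.create(
    model="zephyr-alpha",
    messages=[{"role": "user", "content": "what llm are you"}],
)
print(response.choices[0].message.content)
```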
@@ -593,8 +596,11 @@ model_list:
       api_key: sk-claude
 ```
 
-#### Default Model - Config:
+:::info
+
 The proxy uses the first model in the config as the default model - in this config the default model is `zephyr-alpha`
+
+:::
 
 #### Step 2: Start Proxy with config
 
@@ -602,11 +608,7 @@ The proxy uses the first model in the config as the default model - in this config the default model is `zephyr-alpha`
 $ litellm --config /path/to/config.yaml
 ```
 
-#### Step 3: Start Proxy with config
-
-If your repo lets you set the model name, you can call a specific model by just passing in that model's name -
-
-#### Step 4: Use proxy
+#### Step 3: Use proxy
 Curl Command
 ```shell
 curl --location 'http://0.0.0.0:8000/chat/completions' \
@@ -703,6 +705,28 @@ model_list:
       api_base: http://0.0.0.0:8003
 ```
 
+#### Step 2: Start Proxy with config
+
+```shell
+$ litellm --config /path/to/config.yaml
+```
+
+#### Step 3: Use proxy
+Curl Command
+```shell
+curl --location 'http://0.0.0.0:8000/chat/completions' \
+--header 'Content-Type: application/json' \
+--data ' {
+      "model": "zephyr-beta",
+      "messages": [
+        {
+          "role": "user",
+          "content": "what llm are you"
+        }
+      ]
+    }
+    '
+```
 
 ### Set Custom Prompt Templates
 
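For reference only (not part of the commit), the same request as the curl command added above, sent from Python with the `requests` library:

```python
import requests

# POST the same JSON body as the curl example to the proxy's /chat/completions route.
resp = requests.post(
    "http://0.0.0.0:8000/chat/completions",
    json={
        "model": "zephyr-beta",
        "messages": [{"role": "user", "content": "what llm are you"}],
    },
    timeout=60,
)
print(resp.json())
```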
@@ -153,7 +153,10 @@ class Router:
         for current_attempt in range(self.num_retries):
             try:
                 # if the function call is successful, no exception will be raised and we'll break out of the loop
-                return await original_function(*args, **kwargs)
+                response = await original_function(*args, **kwargs)
+                if isinstance(response, asyncio.coroutines.Coroutine): # async errors are often returned as coroutines
+                    response = await response
+                return response
 
             except openai.RateLimitError as e:
                 # on RateLimitError we'll wait for an exponential time before trying again
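The change in this hunk stops returning `original_function(...)` directly; if the awaited result is itself a coroutine (which is how some async error paths surface), it is awaited once more inside the `try`, so failures are caught by the retry logic instead of escaping to the caller. Below is a standalone sketch of that pattern, with illustrative names rather than litellm's; the next hunk applies the same check after `litellm.acompletion`.

```python
import asyncio

async def call_with_retries(fn, num_retries, *args, **kwargs):
    """Illustrative retry wrapper mirroring the pattern in the hunk above."""
    last_exc = None
    for current_attempt in range(num_retries):
        try:
            response = await fn(*args, **kwargs)
            # Some async call paths hand back a coroutine instead of a finished
            # result; awaiting it here surfaces errors inside the retry loop.
            # (asyncio.iscoroutine is the public spelling of the isinstance check above.)
            if asyncio.iscoroutine(response):
                response = await response
            return response
        except Exception as exc:  # the real code narrows this to specific error types
            last_exc = exc
            await asyncio.sleep(2 ** current_attempt)  # simple exponential backoff
    raise last_exc if last_exc else RuntimeError("num_retries must be >= 1")
```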
@@ -231,6 +234,8 @@ class Router:
             deployment = self.get_available_deployment(model=model, messages=messages)
             data = deployment["litellm_params"]
             response = await litellm.acompletion(**{**data, "messages": messages, "caching": self.cache_responses, **kwargs})
+            if isinstance(response, asyncio.coroutines.Coroutine): # async errors are often returned as coroutines
+                response = await response
             return response
         except Exception as e:
             kwargs["model"] = model
@@ -1057,12 +1057,12 @@ def client(original_function):
         if litellm.client_session is None:
             litellm.client_session = httpx.Client(
                 limits=httpx.Limits(max_connections=100, max_keepalive_connections=20),
-                timeout = httpx.Timeout(timeout=600.0, connect=5.0)
+                timeout = None
             )
         if litellm.aclient_session is None:
             litellm.aclient_session = httpx.AsyncClient(
                 limits=httpx.Limits(max_connections=100, max_keepalive_connections=20),
-                timeout = httpx.Timeout(timeout=600.0, connect=5.0)
+                timeout = None
             )
         if litellm.use_client or ("use_client" in kwargs and kwargs["use_client"] == True):
             print_verbose(f"litedebugger initialized")
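The last hunk swaps a fixed `httpx.Timeout(timeout=600.0, connect=5.0)` (5 s to connect, 600 s for read/write/pool operations) for `timeout=None`, which in httpx disables client-side timeouts on the shared session entirely; callers can still set a per-request deadline. A small sketch contrasting the two configurations, using only public httpx APIs (illustrative, not litellm code):

```python
import httpx

limits = httpx.Limits(max_connections=100, max_keepalive_connections=20)

# Before: connect attempts fail after 5s, other operations (read/write/pool) after 600s.
bounded = httpx.Client(
    limits=limits,
    timeout=httpx.Timeout(timeout=600.0, connect=5.0),
)

# After: no client-side deadline at all; a deadline can still be passed per request.
unbounded = httpx.Client(limits=limits, timeout=None)
r = unbounded.get("https://example.com", timeout=10.0)  # per-request override
print(r.status_code)
```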