mirror of https://github.com/BerriAI/litellm.git
synced 2025-04-25 18:54:30 +00:00

fix(proxy_server.py): accept config.yaml

This commit is contained in:
parent e09b4cb01a
commit 6b3671b593

3 changed files with 603 additions and 607 deletions
@@ -9,8 +9,8 @@ LiteLLM Server, is a simple, fast, and lightweight **OpenAI-compatible server**
LiteLLM Server supports:

* LLM API Calls in the OpenAI ChatCompletions format
* Set custom prompt templates
* Caching + Logging capabilities (Redis and Langfuse, respectively)

[**See Code**](https://github.com/BerriAI/litellm/tree/main/litellm_server)

@@ -19,17 +19,16 @@ We want to learn how we can make the server better! Meet the [founders](https://
join our [discord](https://discord.gg/wuPM9dRgDw)
:::

## Quick Start

```shell
$ litellm --model huggingface/bigcode/starcoder
```
OpenAI Proxy running on http://0.0.0.0:8000

```shell
curl http://0.0.0.0:8000/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
    "model": "gpt-3.5-turbo",
    "messages": [{"role": "user", "content": "Say this is a test!"}],
@@ -37,192 +36,483 @@ curl http://0.0.0.0:8000/v1/chat/completions \
  }'
```

This will now automatically route any requests for gpt-3.5-turbo to bigcode starcoder, hosted on huggingface inference endpoints.

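For reference, here is a rough Python equivalent of the curl call above, a minimal sketch using the `requests` library with the same local address and payload (not part of the original docs):

```python
import requests

# Send an OpenAI-format chat completion request to the local LiteLLM proxy.
# A "gpt-3.5-turbo" request is routed to whatever model the proxy was started
# with (huggingface/bigcode/starcoder in the Quick Start above).
response = requests.post(
    "http://0.0.0.0:8000/v1/chat/completions",
    json={
        "model": "gpt-3.5-turbo",
        "messages": [{"role": "user", "content": "Say this is a test!"}],
        "temperature": 0.7,
    },
    timeout=60,
)
print(response.json())
```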
#### Other supported models:
<Tabs>
<TabItem value="vllm-local" label="VLLM">
Assuming you're running vllm locally

```shell
$ litellm --model vllm/facebook/opt-125m
```

</TabItem>
<TabItem value="openai-proxy" label="OpenAI Compatible Server">

```shell
$ litellm --model openai/<model_name> --api_base <your-api-base>
```
</TabItem>
<TabItem value="huggingface" label="Huggingface (TGI)">

```shell
$ export HUGGINGFACE_API_KEY=my-api-key # [OPTIONAL]
$ litellm --model huggingface/<huggingface-model-name> --api_base https://<your-hf-endpoint> # e.g. huggingface/mistralai/Mistral-7B-v0.1
```

</TabItem>
<TabItem value="anthropic" label="Anthropic">
|
<TabItem value="anthropic" label="Anthropic">
|
||||||
|
|
||||||
**Set API Keys in .env**
|
|
||||||
```shell
|
```shell
|
||||||
$ docker run -e PORT=8000 -e ANTHROPIC_API_KEY=<your-api-key> -p 8000:8000 ghcr.io/berriai/litellm:latest
|
$ export ANTHROPIC_API_KEY=my-api-key
|
||||||
```
|
$ litellm --model claude-instant-1
|
||||||
|
|
||||||
**Set API Keys in request headers**
|
|
||||||
```shell
|
|
||||||
curl http://0.0.0.0:8000/v1/chat/completions \
|
|
||||||
-H "Content-Type: application/json" \
|
|
||||||
-H "Authorization: Bearer $ANTHROPIC_API_KEY"
|
|
||||||
-d '{
|
|
||||||
"model": "claude-2",
|
|
||||||
"messages": [{"role": "user", "content": "Say this is a test!"}],
|
|
||||||
"temperature": 0.7
|
|
||||||
}'
|
|
||||||
```
|
|
||||||
|
|
||||||
|
|
||||||
</TabItem>
|
|
||||||
|
|
||||||
<TabItem value="ollama" label="Ollama">
|
|
||||||
|
|
||||||
```shell
|
|
||||||
$ docker run -e PORT=8000 -e OLLAMA_API_BASE=<your-ollama-api-base> -p 8000:8000 ghcr.io/berriai/litellm:latest
|
|
||||||
```
|
```
|
||||||
|
|
||||||
</TabItem>
|
</TabItem>
|
||||||
|
|
||||||
|
|
||||||
<TabItem value="together_ai" label="TogetherAI">
|
<TabItem value="together_ai" label="TogetherAI">
|
||||||
|
|
||||||
**Set API Keys in .env**
|
|
||||||
```shell
|
```shell
|
||||||
$ docker run -e PORT=8000 -e TOGETHERAI_API_KEY=<your-api-key> -p 8000:8000 ghcr.io/berriai/litellm:latest
|
$ export TOGETHERAI_API_KEY=my-api-key
|
||||||
```
|
$ litellm --model together_ai/lmsys/vicuna-13b-v1.5-16k
|
||||||
|
|
||||||
**Set API Keys in request headers**
|
|
||||||
```shell
|
|
||||||
curl http://0.0.0.0:8000/v1/chat/completions \
|
|
||||||
-H "Content-Type: application/json" \
|
|
||||||
-H "Authorization: Bearer $TOGETHERAI_API_KEY"
|
|
||||||
-d '{
|
|
||||||
"model": "together_ai/togethercomputer/llama-2-70b-chat",
|
|
||||||
"messages": [{"role": "user", "content": "Say this is a test!"}],
|
|
||||||
"temperature": 0.7
|
|
||||||
}'
|
|
||||||
```
|
```
|
||||||
|
|
||||||
</TabItem>
|
</TabItem>
|
||||||
|
|
||||||
<TabItem value="replicate" label="Replicate">
|
<TabItem value="replicate" label="Replicate">
|
||||||
|
|
||||||
**Set API Keys in .env**
|
|
||||||
```shell
|
```shell
|
||||||
$ docker run -e PORT=8000 -e REPLICATE_API_KEY=<your-api-key> -p 8000:8000 ghcr.io/berriai/litellm:latest
|
$ export REPLICATE_API_KEY=my-api-key
|
||||||
|
$ litellm \
|
||||||
|
--model replicate/meta/llama-2-70b-chat:02e509c789964a7ea8736978a43525956ef40397be9033abf9fd2badfe68c9e3
|
||||||
```
|
```
|
||||||
|
|
||||||
**Set API Keys in request headers**
|
</TabItem>
|
||||||
```shell
|
|
||||||
curl http://0.0.0.0:8000/v1/chat/completions \
|
|
||||||
-H "Content-Type: application/json" \
|
|
||||||
-H "Authorization: Bearer $REPLICATE_API_KEY"
|
|
||||||
-d '{
|
|
||||||
"model": "replicate/llama-2-70b-chat:2796ee9483c3fd7aa2e171d38f4ca12251a30609463dcfd4cd76703f22e96cdf",
|
|
||||||
"messages": [{"role": "user", "content": "Say this is a test!"}],
|
|
||||||
"temperature": 0.7
|
|
||||||
}'
|
|
||||||
```
|
|
||||||
|
|
||||||
|
<TabItem value="petals" label="Petals">
|
||||||
|
|
||||||
|
```shell
|
||||||
|
$ litellm --model petals/meta-llama/Llama-2-70b-chat-hf
|
||||||
|
```
|
||||||
|
|
||||||
</TabItem>
|
</TabItem>
|
||||||
|
|
||||||
<TabItem value="palm" label="Palm">
|
<TabItem value="palm" label="Palm">
|
||||||
|
|
||||||
**Set API Keys in .env**
|
|
||||||
```shell
|
```shell
|
||||||
$ docker run -e PORT=8000 -e PALM_API_KEY=<your-api-key> -p 8000:8000 ghcr.io/berriai/litellm:latest
|
$ export PALM_API_KEY=my-palm-key
|
||||||
```
|
$ litellm --model palm/chat-bison
|
||||||
|
|
||||||
**Set API Keys in request headers**
|
|
||||||
```shell
|
|
||||||
curl http://0.0.0.0:8000/v1/chat/completions \
|
|
||||||
-H "Content-Type: application/json" \
|
|
||||||
-H "Authorization: Bearer $PALM_API_KEY"
|
|
||||||
-d '{
|
|
||||||
"model": "palm/chat-bison",
|
|
||||||
"messages": [{"role": "user", "content": "Say this is a test!"}],
|
|
||||||
"temperature": 0.7
|
|
||||||
}'
|
|
||||||
```
|
```
|
||||||
|
|
||||||
</TabItem>
|
</TabItem>
|
||||||
|
|
||||||
<TabItem value="azure" label="Azure OpenAI">
|
<TabItem value="azure" label="Azure OpenAI">
|
||||||
|
|
||||||
**Set API Keys in .env**
|
|
||||||
```shell
|
```shell
|
||||||
$ docker run -e PORT=8000 -e AZURE_API_KEY=<your-api-key> -e AZURE_API_BASE=<your-api-base> -p 8000:8000 ghcr.io/berriai/litellm:latest
|
$ export AZURE_API_KEY=my-api-key
|
||||||
|
$ export AZURE_API_BASE=my-api-base
|
||||||
|
|
||||||
|
$ litellm --model azure/my-deployment-name
|
||||||
```
|
```
|
||||||
|
|
||||||
</TabItem>
|
</TabItem>
|
||||||
|
|
||||||
<TabItem value="ai21" label="AI21">
|
<TabItem value="ai21" label="AI21">
|
||||||
|
|
||||||
**Set API Keys in .env**
|
|
||||||
```shell
|
```shell
|
||||||
$ docker run -e PORT=8000 -e AI21_API_KEY=<your-api-key> -p 8000:8000 ghcr.io/berriai/litellm:latest
|
$ export AI21_API_KEY=my-api-key
|
||||||
```
|
$ litellm --model j2-light
|
||||||
|
|
||||||
**Set API Keys in request headers**
|
|
||||||
```shell
|
|
||||||
curl http://0.0.0.0:8000/v1/chat/completions \
|
|
||||||
-H "Content-Type: application/json" \
|
|
||||||
-H "Authorization: Bearer $AI21_API_KEY"
|
|
||||||
-d '{
|
|
||||||
"model": "j2-mid",
|
|
||||||
"messages": [{"role": "user", "content": "Say this is a test!"}],
|
|
||||||
"temperature": 0.7
|
|
||||||
}'
|
|
||||||
```
|
```
|
||||||
|
|
||||||
</TabItem>
|
</TabItem>
|
||||||
|
|
||||||
<TabItem value="cohere" label="Cohere">
|
<TabItem value="cohere" label="Cohere">
|
||||||
|
|
||||||
**Set API Keys in .env**
|
|
||||||
|
|
||||||
```shell
|
```shell
|
||||||
$ docker run -e PORT=8000 -e COHERE_API_KEY=<your-api-key> -p 8000:8000 ghcr.io/berriai/litellm:latest
|
$ export COHERE_API_KEY=my-api-key
|
||||||
|
$ litellm --model command-nightly
|
||||||
```
|
```
|
||||||
|
|
||||||
**Set API Keys in request headers**
|
</TabItem>
|
||||||
|
|
||||||
|
</Tabs>
|
||||||
|
|
||||||
|
[**Jump to Code**](https://github.com/BerriAI/litellm/blob/fef4146396d5d87006259e00095a62e3900d6bb4/litellm/proxy.py#L36)
|
||||||
|
|
||||||
|
# LM-Evaluation Harness with TGI

Evaluate LLMs 20x faster with TGI via the LiteLLM proxy's `/completions` endpoint.

This tutorial assumes you're using [lm-evaluation-harness](https://github.com/EleutherAI/lm-evaluation-harness)

**Step 1: Start the local proxy**
```shell
$ litellm --model huggingface/bigcode/starcoder
```

OpenAI Compatible Endpoint at http://0.0.0.0:8000

**Step 2: Set OpenAI API Base**
```shell
$ export OPENAI_API_BASE="http://0.0.0.0:8000"
```

**Step 3: Run LM-Eval-Harness**

```shell
$ python3 main.py \
  --model gpt3 \
  --model_args engine=huggingface/bigcode/starcoder \
  --tasks hellaswag
```


## Endpoints:
- `/chat/completions` - chat completions endpoint to call 100+ LLMs
- `/embeddings` - embedding endpoint for Azure, OpenAI, Huggingface endpoints
- `/models` - available models on server (see the sketch below)

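As a quick sanity check of the endpoints listed above, a minimal Python sketch (assuming the default local address from the Quick Start) can query `/models` to see what the server will route:

```python
import requests

# Ask the running proxy which models it currently serves.
resp = requests.get("http://0.0.0.0:8000/models", timeout=30)
print(resp.json())
```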
## Set Custom Prompt Templates

LiteLLM by default checks if a model has a [prompt template and applies it](./completion/prompt_formatting.md) (e.g. if a huggingface model has a saved chat template in its tokenizer_config.json). However, you can also set a custom prompt template on your proxy in the `config.yaml`:

**Step 1**: Save your prompt template in a `config.yaml`
```yaml
# Model-specific parameters
model_list:
  - model_name: mistral-7b # model alias
    litellm_params: # actual params for litellm.completion()
      model: "huggingface/mistralai/Mistral-7B-Instruct-v0.1"
      api_base: "<your-api-base>"
      api_key: "<your-api-key>" # [OPTIONAL] for hf inference endpoints
      initial_prompt_value: "\n"
      roles: {"system":{"pre_message":"<|im_start|>system\n", "post_message":"<|im_end|>"}, "assistant":{"pre_message":"<|im_start|>assistant\n","post_message":"<|im_end|>"}, "user":{"pre_message":"<|im_start|>user\n","post_message":"<|im_end|>"}}
      final_prompt_value: "\n"
      bos_token: "<s>"
      eos_token: "</s>"
      max_tokens: 4096
```

**Step 2**: Start server with config

```shell
$ litellm --config /path/to/config.yaml
```

## Save Model-specific params (API Base, API Keys, Temperature, etc.)
Use the [router_config_template.yaml](https://github.com/BerriAI/litellm/blob/main/router_config_template.yaml) to save model-specific information like api_base, api_key, temperature, max_tokens, etc.

**Step 1**: Create a `config.yaml` file
```yaml
model_list:
  - model_name: gpt-3.5-turbo
    litellm_params: # params for litellm.completion() - https://docs.litellm.ai/docs/completion/input#input---request-body
      model: azure/chatgpt-v-2 # azure/<your-deployment-name>
      api_key: your_azure_api_key
      api_version: your_azure_api_version
      api_base: your_azure_api_base
  - model_name: mistral-7b
    litellm_params:
      model: ollama/mistral
      api_base: your_ollama_api_base
```

**Step 2**: Start server with config

```shell
$ litellm --config /path/to/config.yaml
```
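Under the hood, the proxy parses this file with a YAML loader and builds a router from the `model_list` entries (see the `load_router_config` changes in the code portion of this diff). A minimal sketch of that parsing step, assuming PyYAML is installed and the file is saved locally as `config.yaml`:

```python
import yaml

# Load the proxy config and inspect the model aliases it defines.
with open("config.yaml", "r") as f:
    config = yaml.safe_load(f)

for entry in config.get("model_list", []):
    alias = entry["model_name"]        # user-facing name, e.g. "gpt-3.5-turbo"
    params = entry["litellm_params"]   # params forwarded to litellm.completion()
    print(f"{alias} -> {params['model']}")
```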
## Model Alias

Set a model alias for your deployments.

In the `config.yaml` the model_name parameter is the user-facing name to use for your deployment.

E.g.: If we want to save a Huggingface TGI Mistral-7b deployment as 'mistral-7b' for our users, we might save it as:

```yaml
model_list:
  - model_name: mistral-7b # ALIAS
    litellm_params:
      model: huggingface/mistralai/Mistral-7B-Instruct-v0.1 # ACTUAL NAME
      api_key: your_huggingface_api_key # [OPTIONAL] if deployed on huggingface inference endpoints
      api_base: your_api_base # url where model is deployed
```

## Caching

Add Redis Caching to your server via environment variables

```env
### REDIS
REDIS_HOST = ""
REDIS_PORT = ""
REDIS_PASSWORD = ""
```

Docker command:

```shell
docker run -e REDIS_HOST=<your-redis-host> -e REDIS_PORT=<your-redis-port> -e REDIS_PASSWORD=<your-redis-password> -e PORT=8000 -p 8000:8000 ghcr.io/berriai/litellm:latest
```

## Logging

1. Debug Logs
Print the input/output params by setting `SET_VERBOSE = "True"`.

Docker command:

```shell
docker run -e SET_VERBOSE="True" -e PORT=8000 -p 8000:8000 ghcr.io/berriai/litellm:latest
```
2. Add Langfuse Logging to your server via environment variables

```env
### LANGFUSE
LANGFUSE_PUBLIC_KEY = ""
LANGFUSE_SECRET_KEY = ""
# Optional, defaults to https://cloud.langfuse.com
LANGFUSE_HOST = "" # optional
```

Docker command:

```shell
docker run -e LANGFUSE_PUBLIC_KEY=<your-public-key> -e LANGFUSE_SECRET_KEY=<your-secret-key> -e LANGFUSE_HOST=<your-langfuse-host> -e PORT=8000 -p 8000:8000 ghcr.io/berriai/litellm:latest
```

## Local Usage

```shell
$ git clone https://github.com/BerriAI/litellm.git
```
```shell
$ cd ./litellm/litellm_server
```

```shell
$ uvicorn main:app --host 0.0.0.0 --port 8000
```

## Setting LLM API keys
This server allows two ways of passing API keys to litellm:
- Environment Variables - This server by default assumes the LLM API Keys are stored in the environment variables
- Dynamic Variables passed to `/chat/completions`
  - Set `AUTH_STRATEGY=DYNAMIC` in the Environment
  - Pass required auth params `api_key`, `api_base`, `api_version` with the request params (see the sketch below)

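A minimal Python sketch of the dynamic-auth path, assuming the server was started with `AUTH_STRATEGY=DYNAMIC` and that the key and base values below are placeholders:

```python
import requests

# With AUTH_STRATEGY=DYNAMIC, provider auth params ride along with each request
# instead of being read from the server's environment variables.
payload = {
    "model": "gpt-3.5-turbo",
    "messages": [{"role": "user", "content": "Say this is a test!"}],
    "api_key": "sk-...",                      # placeholder: your provider key
    "api_base": "https://api.openai.com/v1",  # placeholder: provider base URL (api_version is also accepted, e.g. for Azure)
}
response = requests.post("http://0.0.0.0:8000/chat/completions", json=payload, timeout=60)
print(response.json())
```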
<Tabs>
<TabItem value="gcp-run" label="Google Cloud Run">

#### Deploy on Google Cloud Run
**Click the button** to deploy to Google Cloud Run

[](https://l.linklyhq.com/l/1uHtX)

On a successful deploy your Cloud Run Shell will have this output
<Image img={require('../img/cloud_run0.png')} />

### Testing your deployed server
**Assuming the required keys are set as Environment Variables**

https://litellm-7yjrj3ha2q-uc.a.run.app is our example server, substitute it with your deployed cloud run app

<Tabs>
<TabItem value="openai" label="OpenAI">

```shell
curl https://litellm-7yjrj3ha2q-uc.a.run.app/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
    "model": "gpt-3.5-turbo",
    "messages": [{"role": "user", "content": "Say this is a test!"}],
    "temperature": 0.7
  }'
```

</TabItem>
<TabItem value="azure" label="Azure">

```shell
curl https://litellm-7yjrj3ha2q-uc.a.run.app/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
    "model": "azure/<your-deployment-name>",
    "messages": [{"role": "user", "content": "Say this is a test!"}],
    "temperature": 0.7
  }'
```

</TabItem>

<TabItem value="anthropic" label="Anthropic">

```shell
curl https://litellm-7yjrj3ha2q-uc.a.run.app/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
    "model": "claude-2",
    "messages": [{"role": "user", "content": "Say this is a test!"}],
    "temperature": 0.7
  }'
```
</TabItem>

</Tabs>

### Set LLM API Keys
#### Environment Variables
More info [here](https://cloud.google.com/run/docs/configuring/services/environment-variables#console)

1. In the Google Cloud console, go to Cloud Run: [Go to Cloud Run](https://console.cloud.google.com/run)

2. Click on the **litellm** service
<Image img={require('../img/cloud_run1.png')} />

3. Click **Edit and Deploy New Revision**
<Image img={require('../img/cloud_run2.png')} />

4. Enter your Environment Variables
Example `OPENAI_API_KEY`, `ANTHROPIC_API_KEY`
<Image img={require('../img/cloud_run3.png')} />

</TabItem>
<TabItem value="render" label="Render">

#### Deploy on Render
**Click the button** to deploy to Render

[](https://l.linklyhq.com/l/1uHsr)

On a successful deploy https://dashboard.render.com/ should display the following
<Image img={require('../img/render1.png')} />

<Image img={require('../img/render2.png')} />
</TabItem>
<TabItem value="aws-apprunner" label="AWS Apprunner">

#### Deploy on AWS Apprunner
1. Fork LiteLLM https://github.com/BerriAI/litellm
2. Navigate to App Runner on AWS Console: https://console.aws.amazon.com/apprunner/home#/services
3. Follow the steps in the video below
<iframe width="800" height="450" src="https://www.loom.com/embed/5fccced4dde8461a8caeee97addb2231?sid=eac60660-073e-455e-a737-b3d05a5a756a" frameborder="0" webkitallowfullscreen mozallowfullscreen allowfullscreen></iframe>

4. Testing your deployed endpoint

**Assuming the required keys are set as Environment Variables** Example: `OPENAI_API_KEY`

https://b2w6emmkzp.us-east-1.awsapprunner.com is our example server, substitute it with your deployed apprunner endpoint

<Tabs>
<TabItem value="openai" label="OpenAI">

```shell
curl https://b2w6emmkzp.us-east-1.awsapprunner.com/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
    "model": "gpt-3.5-turbo",
    "messages": [{"role": "user", "content": "Say this is a test!"}],
    "temperature": 0.7
  }'
```

</TabItem>
<TabItem value="azure" label="Azure">

```shell
curl https://b2w6emmkzp.us-east-1.awsapprunner.com/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
    "model": "azure/<your-deployment-name>",
    "messages": [{"role": "user", "content": "Say this is a test!"}],
    "temperature": 0.7
  }'
```

</TabItem>

<TabItem value="anthropic" label="Anthropic">

```shell
curl https://b2w6emmkzp.us-east-1.awsapprunner.com/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
    "model": "claude-2",
    "messages": [{"role": "user", "content": "Say this is a test!"}],
    "temperature": 0.7
  }'
```
</TabItem>

</Tabs>

</TabItem>

</Tabs>

## Advanced
### Caching - Completion() and Embedding() Responses

Enable caching by adding the following credentials to your server environment

```
REDIS_HOST = "" # REDIS_HOST='redis-18841.c274.us-east-1-3.ec2.cloud.redislabs.com'
REDIS_PORT = "" # REDIS_PORT='18841'
REDIS_PASSWORD = "" # REDIS_PASSWORD='liteLlmIsAmazing'
```

#### Test Caching
Send the same request twice:
```shell
curl http://0.0.0.0:8000/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
    "model": "gpt-3.5-turbo",
    "messages": [{"role": "user", "content": "write a poem about litellm!"}],
    "temperature": 0.7
  }'

curl http://0.0.0.0:8000/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
    "model": "gpt-3.5-turbo",
    "messages": [{"role": "user", "content": "write a poem about litellm!"}],
    "temperature": 0.7
  }'
```

#### Control caching per completion request
Caching can be switched on/off per /chat/completions request; a Python sketch of this follows the examples below.
- Caching on for completion - pass `caching=True`:
```shell
curl http://0.0.0.0:8000/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
    "model": "gpt-3.5-turbo",
    "messages": [{"role": "user", "content": "write a poem about litellm!"}],
    "temperature": 0.7,
    "caching": true
  }'
```
- Caching off for completion - pass `caching=False`:
```shell
curl http://0.0.0.0:8000/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
    "model": "gpt-3.5-turbo",
    "messages": [{"role": "user", "content": "write a poem about litellm!"}],
    "temperature": 0.7,
    "caching": false
  }'
```

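To watch the per-request cache flag in action, a small Python sketch (using `requests`; the exact speedup will vary) sends the same request twice with `"caching": true` and compares latencies:

```python
import time
import requests

payload = {
    "model": "gpt-3.5-turbo",
    "messages": [{"role": "user", "content": "write a poem about litellm!"}],
    "temperature": 0.7,
    "caching": True,  # ask the proxy to cache / reuse this completion
}

for attempt in range(2):
    start = time.perf_counter()
    r = requests.post("http://0.0.0.0:8000/v1/chat/completions", json=payload, timeout=120)
    elapsed = time.perf_counter() - start
    # The second call should return noticeably faster if the cache was hit.
    print(f"attempt {attempt + 1}: {elapsed:.2f}s")
```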
## Tutorials (Chat-UI, NeMO-Guardrails, PromptTools, Phoenix ArizeAI, Langchain, ragas, LlamaIndex, etc.)

**Start server:**
@@ -440,323 +730,3 @@ print(response)
</TabItem>
</Tabs>
@@ -6,7 +6,7 @@ from datetime import datetime
from dotenv import load_dotenv
import operator

config_filename = "litellm.secrets"
# Using appdirs to determine user-specific config path
config_dir = appdirs.user_config_dir("litellm")
user_config_path = os.getenv("LITELLM_CONFIG_PATH", os.path.join(config_dir, config_filename))
@@ -22,39 +22,6 @@ def run_ollama_serve():
    with open(os.devnull, 'w') as devnull:
        process = subprocess.Popen(command, stdout=devnull, stderr=devnull)


def clone_subfolder(repo_url, subfolder, destination):
    # Clone the full repo
    repo_name = repo_url.split('/')[-1]
@@ -99,7 +66,7 @@ def is_port_in_use(port):
@click.option('--drop_params', is_flag=True, help='Drop any unmapped params')
@click.option('--create_proxy', is_flag=True, help='Creates a local OpenAI-compatible server template')
@click.option('--add_function_to_prompt', is_flag=True, help='If function passed but unsupported, pass it as prompt')
@click.option('--config', '-c', help='Configure Litellm')
@click.option('--file', '-f', help='Path to config file')
@click.option('--max_budget', default=None, type=float, help='Set max budget for API calls - works for hosted models like OpenAI, TogetherAI, Anthropic, etc.')
@click.option('--telemetry', default=True, type=bool, help='Helps us know if people are using this feature. Turn this off by doing `--telemetry False`')
@@ -126,12 +93,6 @@ def run_server(host, port, api_base, api_version, model, alias, add_key, headers

        clone_subfolder(repo_url, subfolder, destination)
        return
    if logs is not None:
        if logs == 0: # default to 1
            logs = 1
@@ -202,7 +163,7 @@ def run_server(host, port, api_base, api_version, model, alias, add_key, headers
    else:
        if headers:
            headers = json.loads(headers)
        initialize(model=model, alias=alias, api_base=api_base, api_version=api_version, debug=debug, temperature=temperature, max_tokens=max_tokens, request_timeout=request_timeout, max_budget=max_budget, telemetry=telemetry, drop_params=drop_params, add_function_to_prompt=add_function_to_prompt, headers=headers, save=save, config=config)
        try:
            import uvicorn
        except:
@@ -1,7 +1,7 @@
import sys, os, platform, time, copy
import threading, ast
import shutil, random, traceback, requests
from typing import Optional
messages: list = []
sys.path.insert(
    0, os.path.abspath("../..")
@@ -14,6 +14,7 @@ try:
    import appdirs
    import tomli_w
    import backoff
    import yaml
except ImportError:
    import subprocess
    import sys
@@ -38,11 +39,6 @@ except ImportError:
    import appdirs
    import tomli_w

import random

list_of_messages = [
@@ -120,13 +116,16 @@ user_telemetry = True
user_config = None
user_headers = None
local_logging = True # writes logs to a local api_log.json file for debugging
config_filename = "litellm.secrets.toml"
config_dir = os.getcwd()
config_dir = appdirs.user_config_dir("litellm")
user_config_path = os.getenv(
    "LITELLM_CONFIG_PATH", os.path.join(config_dir, config_filename)
)
#### GLOBAL VARIABLES ####
llm_router: Optional[litellm.Router] = None
llm_model_list: Optional[list] = None
server_settings: Optional[dict] = None
log_file = "api_log.json"
@@ -137,13 +136,6 @@ def print_verbose(print_statement):
        print(print_statement)


def usage_telemetry(
    feature: str,
): # helps us know if people are using this feature. Set `litellm --telemetry False` to your cli call to turn this off
@@ -205,102 +197,141 @@ def save_params_to_config(data: dict):
        tomli_w.dump(config, f)


def load_router_config(router: Optional[litellm.Router], config_file_path: Optional[str]):
    config = {}
    server_settings = {}
    try:
        if os.path.exists(config_file_path):
            with open(config_file_path, 'r') as file:
                config = yaml.safe_load(file)
        else:
            pass
    except:
        pass

    ## SERVER SETTINGS (e.g. default completion model = 'ollama/mistral')
    server_settings = config.get("server_settings", None)
    if server_settings:
        server_settings = server_settings

    ## LITELLM MODULE SETTINGS (e.g. litellm.drop_params=True,..)
    litellm_settings = config.get('litellm_settings', None)
    if litellm_settings:
        for key, value in litellm_settings.items():
            setattr(litellm, key, value)

    ## MODEL LIST
    model_list = config.get('model_list', None)
    if model_list:
        router = litellm.Router(model_list=model_list)

    ## ENVIRONMENT VARIABLES
    environment_variables = config.get('environment_variables', None)
    if environment_variables:
        for key, value in environment_variables.items():
            os.environ[key] = value

    return router, model_list, server_settings


def load_config():
    #### DEPRECATED ####
    try:
        global user_config, user_api_base, user_max_tokens, user_temperature, user_model, local_logging, llm_model_list, llm_router, server_settings
        # Get the file extension
        file_extension = os.path.splitext(user_config_path)[1]
        if file_extension.lower() == ".toml":
            # As the .env file is typically much simpler in structure, we use load_dotenv here directly
            with open(user_config_path, "rb") as f:
                user_config = tomllib.load(f)

            ## load keys
            if "keys" in user_config:
                for key in user_config["keys"]:
                    os.environ[key] = user_config["keys"][
                        key
                    ] # litellm can read keys from the environment
            ## settings
            if "general" in user_config:
                litellm.add_function_to_prompt = user_config["general"].get(
                    "add_function_to_prompt", True
                ) # by default add function to prompt if unsupported by provider
                litellm.drop_params = user_config["general"].get(
                    "drop_params", True
                ) # by default drop params if unsupported by provider
                litellm.model_fallbacks = user_config["general"].get(
                    "fallbacks", None
                ) # fallback models in case initial completion call fails
                default_model = user_config["general"].get(
                    "default_model", None
                ) # route all requests to this model.

                local_logging = user_config["general"].get("local_logging", True)

                if user_model is None: # `litellm --model <model-name>` > default_model.
                    user_model = default_model

            ## load model config - to set this run `litellm --config`
            model_config = None
            if "model" in user_config:
                if user_model in user_config["model"]:
                    model_config = user_config["model"][user_model]
                model_list = []
                for model in user_config["model"]:
                    if "model_list" in user_config["model"][model]:
                        model_list.extend(user_config["model"][model]["model_list"])

            print_verbose(f"user_config: {user_config}")
            print_verbose(f"model_config: {model_config}")
            print_verbose(f"user_model: {user_model}")
            if model_config is None:
                return

            user_max_tokens = model_config.get("max_tokens", None)
            user_temperature = model_config.get("temperature", None)
            user_api_base = model_config.get("api_base", None)

            ## custom prompt template
            if "prompt_template" in model_config:
                model_prompt_template = model_config["prompt_template"]
                if (
                    len(model_prompt_template.keys()) > 0
                ): # if user has initialized this at all
                    litellm.register_prompt_template(
                        model=user_model,
                        initial_prompt_value=model_prompt_template.get(
                            "MODEL_PRE_PROMPT", ""
                        ),
                        roles={
                            "system": {
                                "pre_message": model_prompt_template.get(
                                    "MODEL_SYSTEM_MESSAGE_START_TOKEN", ""
                                ),
                                "post_message": model_prompt_template.get(
                                    "MODEL_SYSTEM_MESSAGE_END_TOKEN", ""
                                ),
                            },
                            "user": {
                                "pre_message": model_prompt_template.get(
                                    "MODEL_USER_MESSAGE_START_TOKEN", ""
                                ),
                                "post_message": model_prompt_template.get(
                                    "MODEL_USER_MESSAGE_END_TOKEN", ""
                                ),
                            },
                            "assistant": {
                                "pre_message": model_prompt_template.get(
                                    "MODEL_ASSISTANT_MESSAGE_START_TOKEN", ""
                                ),
                                "post_message": model_prompt_template.get(
                                    "MODEL_ASSISTANT_MESSAGE_END_TOKEN", ""
                                ),
                            },
                        },
                        final_prompt_value=model_prompt_template.get(
                            "MODEL_POST_PROMPT", ""
                        ),
                    )
    except:
        pass
@@ -320,12 +351,14 @@ def initialize(
    add_function_to_prompt,
    headers,
    save,
    config
):
    global user_model, user_api_base, user_debug, user_max_tokens, user_request_timeout, user_temperature, user_telemetry, user_headers, llm_model_list, llm_router, server_settings
    user_model = model
    user_debug = debug
    dynamic_config = {"general": {}, user_model: {}}
    if config:
        llm_router, llm_model_list, server_settings = load_router_config(router=llm_router, config_file_path=config)
    if headers: # model-specific param
        user_headers = headers
        dynamic_config[user_model]["headers"] = headers
@@ -470,17 +503,50 @@ litellm.input_callback = [logger]
litellm.success_callback = [logger]
litellm.failure_callback = [logger]

# for streaming
def data_generator(response):
    print_verbose("inside generator")
    for chunk in response:
        print_verbose(f"returned chunk: {chunk}")
        yield f"data: {json.dumps(chunk)}\n\n"


def litellm_completion(*args, **kwargs):
    global user_temperature, user_request_timeout, user_max_tokens, user_api_base
    call_type = kwargs.pop("call_type")
    # override with user settings
    if user_temperature:
        kwargs["temperature"] = user_temperature
    if user_request_timeout:
        kwargs["request_timeout"] = user_request_timeout
    if user_max_tokens:
        kwargs["max_tokens"] = user_max_tokens
    if user_api_base:
        kwargs["api_base"] = user_api_base
    ## CHECK CONFIG ##
    if llm_model_list and kwargs["model"] in [m["model_name"] for m in llm_model_list]:
        for m in llm_model_list:
            if kwargs["model"] == m["model_name"]:
                for key, value in m["litellm_params"].items():
                    kwargs[key] = value
                break
    print(f"call going to litellm: {kwargs}")
    if call_type == "chat_completion":
        response = litellm.completion(*args, **kwargs)
    elif call_type == "text_completion":
        response = litellm.text_completion(*args, **kwargs)
    if 'stream' in kwargs and kwargs['stream'] == True: # use generate_responses to stream responses
        return StreamingResponse(data_generator(response), media_type='text/event-stream')
    return response

#### API ENDPOINTS ####
@router.get("/v1/models")
@router.get("/models") # if project requires model list
def model_list():
    global llm_model_list
    all_models = litellm.utils.get_valid_models()
    if llm_model_list:
        all_models += llm_model_list
    if user_model is not None:
        all_models += user_model
    ### CHECK OLLAMA MODELS ###
@@ -508,36 +574,35 @@ def model_list():
@router.post("/completions")
@router.post("/engines/{model:path}/completions")
async def completion(request: Request):
    body = await request.body()
    body_str = body.decode()
    try:
        data = ast.literal_eval(body_str)
    except:
        data = json.loads(body_str)
    if user_model:
        data["model"] = user_model
    data["call_type"] = "text_completion"
    return litellm_completion(
        **data
    )

@router.post("/v1/chat/completions")
@router.post("/chat/completions")
async def chat_completion(request: Request):
    body = await request.body()
    body_str = body.decode()
    try:
        data = ast.literal_eval(body_str)
    except:
        data = json.loads(body_str)
    if user_model:
        data["model"] = user_model
    data["call_type"] = "chat_completion"
    return litellm_completion(
        **data
    )

def print_cost_logs():
    with open("costs.json", "r") as f: