From 6b3671b5937e3dde8b6ee6e86ffbb9e886e4f991 Mon Sep 17 00:00:00 2001 From: Krrish Dholakia Date: Fri, 3 Nov 2023 12:48:47 -0700 Subject: [PATCH] fix(proxy_server.py): accept config.yaml --- docs/my-website/docs/simple_proxy.md | 838 +++++++++++++-------------- litellm/proxy/proxy_cli.py | 45 +- litellm/proxy/proxy_server.py | 327 ++++++----- 3 files changed, 603 insertions(+), 607 deletions(-) diff --git a/docs/my-website/docs/simple_proxy.md b/docs/my-website/docs/simple_proxy.md index fd6df78295..0a2b5cfd1a 100644 --- a/docs/my-website/docs/simple_proxy.md +++ b/docs/my-website/docs/simple_proxy.md @@ -9,8 +9,8 @@ LiteLLM Server, is a simple, fast, and lightweight **OpenAI-compatible server** LiteLLM Server supports: * LLM API Calls in the OpenAI ChatCompletions format +* Set custom prompt templates * Caching + Logging capabilities (Redis and Langfuse, respectively) -* Setting API keys in the request headers or in the .env [**See Code**](https://github.com/BerriAI/litellm/tree/main/litellm_server) @@ -19,17 +19,16 @@ We want to learn how we can make the server better! Meet the [founders](https:// join our [discord](https://discord.gg/wuPM9dRgDw) ::: -## Usage +## Quick Start ```shell -docker run -e PORT=8000 -p 8000:8000 ghcr.io/berriai/litellm:latest +$ litellm --model huggingface/bigcode/starcoder ``` OpenAI Proxy running on http://0.0.0.0:8000 ```shell curl http://0.0.0.0:8000/v1/chat/completions \ -H "Content-Type: application/json" \ - -H "Authorization: Bearer $YOUR_API_KEY" -d '{ "model": "gpt-3.5-turbo", "messages": [{"role": "user", "content": "Say this is a test!"}], @@ -37,192 +36,483 @@ curl http://0.0.0.0:8000/v1/chat/completions \ }' ``` +This will now automatically route any requests for gpt-3.5-turbo to bigcode starcoder, hosted on huggingface inference endpoints. + + #### Other supported models: - + +Assuming you're running vllm locally ```shell -$ docker run -e PORT=8000 -e AWS_ACCESS_KEY_ID= -e AWS_SECRET_ACCESS_KEY= -p 8000:8000 ghcr.io/berriai/litellm:latest +$ litellm --model vllm/facebook/opt-125m ``` - - + -**Set API Keys in .env** -If, you're calling it via Huggingface Inference Endpoints. ```shell -$ docker run -e PORT=8000 -e HUGGINGFACE_API_KEY= -p 8000:8000 ghcr.io/berriai/litellm:latest +$ litellm --model openai/ --api_base ``` + + -Else, ```shell -$ docker run -e PORT=8000 -p 8000:8000 ghcr.io/berriai/litellm:latest +$ export HUGGINGFACE_API_KEY=my-api-key #[OPTIONAL] +$ litellm --model huggingface/ --api_base https://# e.g. 
huggingface/mistralai/Mistral-7B-v0.1 ``` -**Set API Keys in request headers** -```shell -curl http://0.0.0.0:8000/v1/chat/completions \ - -H "Content-Type: application/json" \ - -H "Authorization: Bearer $HUGGINGFACE_API_KEY" - -d '{ - "model": "huggingface/bigcoder/starcoder", - "messages": [{"role": "user", "content": "Say this is a test!"}], - "temperature": 0.7 - }' -``` - - -**Set API Keys in .env** ```shell -$ docker run -e PORT=8000 -e ANTHROPIC_API_KEY= -p 8000:8000 ghcr.io/berriai/litellm:latest -``` - -**Set API Keys in request headers** -```shell -curl http://0.0.0.0:8000/v1/chat/completions \ - -H "Content-Type: application/json" \ - -H "Authorization: Bearer $ANTHROPIC_API_KEY" - -d '{ - "model": "claude-2", - "messages": [{"role": "user", "content": "Say this is a test!"}], - "temperature": 0.7 - }' -``` - - - - - - -```shell -$ docker run -e PORT=8000 -e OLLAMA_API_BASE= -p 8000:8000 ghcr.io/berriai/litellm:latest +$ export ANTHROPIC_API_KEY=my-api-key +$ litellm --model claude-instant-1 ``` - -**Set API Keys in .env** ```shell -$ docker run -e PORT=8000 -e TOGETHERAI_API_KEY= -p 8000:8000 ghcr.io/berriai/litellm:latest -``` - -**Set API Keys in request headers** -```shell -curl http://0.0.0.0:8000/v1/chat/completions \ - -H "Content-Type: application/json" \ - -H "Authorization: Bearer $TOGETHERAI_API_KEY" - -d '{ - "model": "together_ai/togethercomputer/llama-2-70b-chat", - "messages": [{"role": "user", "content": "Say this is a test!"}], - "temperature": 0.7 - }' +$ export TOGETHERAI_API_KEY=my-api-key +$ litellm --model together_ai/lmsys/vicuna-13b-v1.5-16k ``` -**Set API Keys in .env** ```shell -$ docker run -e PORT=8000 -e REPLICATE_API_KEY= -p 8000:8000 ghcr.io/berriai/litellm:latest +$ export REPLICATE_API_KEY=my-api-key +$ litellm \ + --model replicate/meta/llama-2-70b-chat:02e509c789964a7ea8736978a43525956ef40397be9033abf9fd2badfe68c9e3 ``` -**Set API Keys in request headers** -```shell -curl http://0.0.0.0:8000/v1/chat/completions \ - -H "Content-Type: application/json" \ - -H "Authorization: Bearer $REPLICATE_API_KEY" - -d '{ - "model": "replicate/llama-2-70b-chat:2796ee9483c3fd7aa2e171d38f4ca12251a30609463dcfd4cd76703f22e96cdf", - "messages": [{"role": "user", "content": "Say this is a test!"}], - "temperature": 0.7 - }' -``` + + + +```shell +$ litellm --model petals/meta-llama/Llama-2-70b-chat-hf +``` -**Set API Keys in .env** ```shell -$ docker run -e PORT=8000 -e PALM_API_KEY= -p 8000:8000 ghcr.io/berriai/litellm:latest -``` - -**Set API Keys in request headers** -```shell -curl http://0.0.0.0:8000/v1/chat/completions \ - -H "Content-Type: application/json" \ - -H "Authorization: Bearer $PALM_API_KEY" - -d '{ - "model": "palm/chat-bison", - "messages": [{"role": "user", "content": "Say this is a test!"}], - "temperature": 0.7 - }' +$ export PALM_API_KEY=my-palm-key +$ litellm --model palm/chat-bison ``` -**Set API Keys in .env** ```shell -$ docker run -e PORT=8000 -e AZURE_API_KEY= -e AZURE_API_BASE= -p 8000:8000 ghcr.io/berriai/litellm:latest +$ export AZURE_API_KEY=my-api-key +$ export AZURE_API_BASE=my-api-base + +$ litellm --model azure/my-deployment-name ``` -**Set API Keys in .env** ```shell -$ docker run -e PORT=8000 -e AI21_API_KEY= -p 8000:8000 ghcr.io/berriai/litellm:latest -``` - -**Set API Keys in request headers** -```shell -curl http://0.0.0.0:8000/v1/chat/completions \ - -H "Content-Type: application/json" \ - -H "Authorization: Bearer $AI21_API_KEY" - -d '{ - "model": "j2-mid", - "messages": [{"role": "user", "content": "Say this is a 
test!"}], - "temperature": 0.7 - }' +$ export AI21_API_KEY=my-api-key +$ litellm --model j2-light ``` -**Set API Keys in .env** - ```shell -$ docker run -e PORT=8000 -e COHERE_API_KEY= -p 8000:8000 ghcr.io/berriai/litellm:latest +$ export COHERE_API_KEY=my-api-key +$ litellm --model command-nightly ``` -**Set API Keys in request headers** + + + + +[**Jump to Code**](https://github.com/BerriAI/litellm/blob/fef4146396d5d87006259e00095a62e3900d6bb4/litellm/proxy.py#L36) + +# LM-Evaluation Harness with TGI + +Evaluate LLMs 20x faster with TGI via litellm proxy's `/completions` endpoint. + +This tutorial assumes you're using [lm-evaluation-harness](https://github.com/EleutherAI/lm-evaluation-harness) + +**Step 1: Start the local proxy** ```shell -curl http://0.0.0.0:8000/v1/chat/completions \ +$ litellm --model huggingface/bigcode/starcoder +``` + +OpenAI Compatible Endpoint at http://0.0.0.0:8000 + +**Step 2: Set OpenAI API Base** +```shell +$ export OPENAI_API_BASE="http://0.0.0.0:8000" +``` + +**Step 3: Run LM-Eval-Harness** + +```shell +$ python3 main.py \ + --model gpt3 \ + --model_args engine=huggingface/bigcode/starcoder \ + --tasks hellaswag +``` + + +## Endpoints: +- `/chat/completions` - chat completions endpoint to call 100+ LLMs +- `/embeddings` - embedding endpoint for Azure, OpenAI, Huggingface endpoints +- `/models` - available models on server + +## Set Custom Prompt Templates + +LiteLLM by default checks if a model has a [prompt template and applies it](./completion/prompt_formatting.md) (e.g. if a huggingface model has a saved chat template in it's tokenizer_config.json). However, you can also set a custom prompt template on your proxy in the `config.yaml`: + +**Step 1**: Save your prompt template in a `config.yaml` +```yaml +# Model-specific parameters +model_list: + - model_name: mistral-7b # model alias + litellm_params: # actual params for litellm.completion() + model: "huggingface/mistralai/Mistral-7B-Instruct-v0.1" + api_base: "" + api_key: "" # [OPTIONAL] for hf inference endpoints + initial_prompt_value: "\n" + roles: {"system":{"pre_message":"<|im_start|>system\n", "post_message":"<|im_end|>"}, "assistant":{"pre_message":"<|im_start|>assistant\n","post_message":"<|im_end|>"}, "user":{"pre_message":"<|im_start|>user\n","post_message":"<|im_end|>"}} + final_prompt_value: "\n" + bos_token: "" + eos_token: "" + max_tokens: 4096 +``` + +**Step 2**: Start server with config + +```shell +$ litellm --config /path/to/config.yaml +``` + +## Save Model-specific params (API Base, API Keys, Temperature, etc.) +Use the [router_config_template.yaml](https://github.com/BerriAI/litellm/blob/main/router_config_template.yaml) to save model-specific information like api_base, api_key, temperature, max_tokens, etc. + +**Step 1**: Create a `config.yaml` file +```shell +model_list: + - model_name: gpt-3.5-turbo + litellm_params: # params for litellm.completion() - https://docs.litellm.ai/docs/completion/input#input---request-body + model: azure/chatgpt-v-2 # azure/ + api_key: your_azure_api_key + api_version: your_azure_api_version + api_base: your_azure_api_base + - model_name: mistral-7b + litellm_params: + model: ollama/mistral + api_base: your_ollama_api_base +``` + +**Step 2**: Start server with config + +```shell +$ litellm --config /path/to/config.yaml +``` +## Model Alias + +Set a model alias for your deployments. + +In the `config.yaml` the model_name parameter is the user-facing name to use for your deployment. 
E.g.: if we want to save a Huggingface TGI Mistral-7b deployment as 'mistral-7b' for our users, we might save it as:

```yaml
model_list:
  - model_name: mistral-7b # ALIAS
    litellm_params:
      model: huggingface/mistralai/Mistral-7B-Instruct-v0.1 # ACTUAL NAME
      api_key: your_huggingface_api_key # [OPTIONAL] if deployed on huggingface inference endpoints
      api_base: your_api_base # url where model is deployed
```

## Caching

Add Redis Caching to your server via environment variables

```env
### REDIS
REDIS_HOST = ""
REDIS_PORT = ""
REDIS_PASSWORD = ""
```

Docker command:

```shell
docker run -e REDIS_HOST= -e REDIS_PORT= -e REDIS_PASSWORD= -e PORT=8000 -p 8000:8000 ghcr.io/berriai/litellm:latest
```

## Logging

1. Debug Logs
Print the input/output params by setting `SET_VERBOSE = "True"`.

Docker command:

```shell
docker run -e SET_VERBOSE="True" -e PORT=8000 -p 8000:8000 ghcr.io/berriai/litellm:latest
```
2. Add Langfuse Logging to your server via environment variables

```env
### LANGFUSE
LANGFUSE_PUBLIC_KEY = ""
LANGFUSE_SECRET_KEY = ""
# Optional, defaults to https://cloud.langfuse.com
LANGFUSE_HOST = "" # optional
```

Docker command:

```shell
docker run -e LANGFUSE_PUBLIC_KEY= -e LANGFUSE_SECRET_KEY= -e LANGFUSE_HOST= -e PORT=8000 -p 8000:8000 ghcr.io/berriai/litellm:latest
```

## Local Usage

```shell
$ git clone https://github.com/BerriAI/litellm.git
```
```shell
$ cd ./litellm/litellm_server
```

```shell
$ uvicorn main:app --host 0.0.0.0 --port 8000
```

## Setting LLM API keys
This server allows two ways of passing API keys to litellm
- Environment Variables - This server by default assumes the LLM API Keys are stored in the environment variables
- Dynamic Variables passed to `/chat/completions`
  - Set `AUTH_STRATEGY=DYNAMIC` in the Environment
  - Pass required auth params `api_key`, `api_base`, `api_version` with the request params

#### Deploy on Google Cloud Run
**Click the button** to deploy to Google Cloud Run

[![Deploy](https://deploy.cloud.run/button.svg)](https://l.linklyhq.com/l/1uHtX)

On a successful deploy your Cloud Run Shell will have this output

### Testing your deployed server
**Assuming the required keys are set as Environment Variables**

https://litellm-7yjrj3ha2q-uc.a.run.app is our example server, substitute it with your deployed cloud run app

```shell
curl https://litellm-7yjrj3ha2q-uc.a.run.app/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
     "model": "gpt-3.5-turbo",
     "messages": [{"role": "user", "content": "Say this is a test!"}],
     "temperature": 0.7
   }'
```

```shell
curl https://litellm-7yjrj3ha2q-uc.a.run.app/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
     "model": "azure/",
     "messages": [{"role": "user", "content": "Say this is a test!"}],
     "temperature": 0.7
   }'
```

```shell
curl https://litellm-7yjrj3ha2q-uc.a.run.app/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
     "model": "claude-2",
     "messages": [{"role": "user", "content": "Say this is a test!"}],
     "temperature": 0.7
   }'
```

### Set LLM API Keys
#### Environment Variables
More info [here](https://cloud.google.com/run/docs/configuring/services/environment-variables#console)

1. In the Google Cloud console, go to Cloud Run: [Go to Cloud Run](https://console.cloud.google.com/run)

2. Click on the **litellm** service

3. Click **Edit and Deploy New Revision**

4. Enter your Environment Variables
Example `OPENAI_API_KEY`, `ANTHROPIC_API_KEY`

#### Deploy on Render
**Click the button** to deploy to Render

[![Deploy](https://render.com/images/deploy-to-render-button.svg)](https://l.linklyhq.com/l/1uHsr)

On a successful deploy https://dashboard.render.com/ should display the following

#### Deploy on AWS Apprunner
1. Fork LiteLLM https://github.com/BerriAI/litellm
2. Navigate to App Runner on AWS Console: https://console.aws.amazon.com/apprunner/home#/services
3. Follow the steps in the video below

4. Testing your deployed endpoint

   **Assuming the required keys are set as Environment Variables** Example: `OPENAI_API_KEY`

   https://b2w6emmkzp.us-east-1.awsapprunner.com is our example server, substitute it with your deployed apprunner endpoint

   ```shell
   curl https://b2w6emmkzp.us-east-1.awsapprunner.com/v1/chat/completions \
   -H "Content-Type: application/json" \
   -d '{
       "model": "gpt-3.5-turbo",
       "messages": [{"role": "user", "content": "Say this is a test!"}],
       "temperature": 0.7
     }'
   ```

   ```shell
   curl https://b2w6emmkzp.us-east-1.awsapprunner.com/v1/chat/completions \
   -H "Content-Type: application/json" \
   -d '{
       "model": "azure/",
       "messages": [{"role": "user", "content": "Say this is a test!"}],
       "temperature": 0.7
     }'
   ```

   ```shell
   curl https://b2w6emmkzp.us-east-1.awsapprunner.com/v1/chat/completions \
   -H "Content-Type: application/json" \
   -d '{
       "model": "claude-2",
       "messages": [{"role": "user", "content": "Say this is a test!"}],
       "temperature": 0.7
     }'
   ```

## Advanced
### Caching - Completion() and Embedding() Responses

Enable caching by adding the following credentials to your server environment

  ```
  REDIS_HOST = "" # REDIS_HOST='redis-18841.c274.us-east-1-3.ec2.cloud.redislabs.com'
  REDIS_PORT = "" # REDIS_PORT='18841'
  REDIS_PASSWORD = "" # REDIS_PASSWORD='liteLlmIsAmazing'
  ```

#### Test Caching
Send the same request twice:
```shell
curl http://0.0.0.0:8000/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
     "model": "gpt-3.5-turbo",
     "messages": [{"role": "user", "content": "write a poem about litellm!"}],
     "temperature": 0.7
   }'

curl http://0.0.0.0:8000/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
     "model": "gpt-3.5-turbo",
     "messages": [{"role": "user", "content": "write a poem about litellm!"}],
     "temperature": 0.7
   }'
```

#### Control caching per completion request
Caching can be switched on/off per /chat/completions request
- Caching on for completion - pass `caching=True`:
  ```shell
  curl http://0.0.0.0:8000/v1/chat/completions \
    -H "Content-Type: application/json" \
    -d '{
       "model": "gpt-3.5-turbo",
       "messages": [{"role": "user", "content": "write a poem about litellm!"}],
       "temperature": 0.7,
       "caching": true
     }'
  ```
- Caching off for completion - pass `caching=False`:
  ```shell
  curl http://0.0.0.0:8000/v1/chat/completions \
    -H "Content-Type: application/json" \
    -d '{
       "model": "gpt-3.5-turbo",
       "messages": [{"role": "user", "content": "write a poem about litellm!"}],
       "temperature": 0.7,
       "caching": false
     }'
  ```

## Tutorials (Chat-UI, NeMO-Guardrails, PromptTools, Phoenix ArizeAI, Langchain, ragas, LlamaIndex, etc.)
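Each integration below follows the same basic pattern as the LM-Eval example above: start the proxy, then point the tool's OpenAI-compatible base URL at it. A rough sketch (the exact variable or setting depends on the tool; `OPENAI_API_BASE` here is an assumption carried over from the LM-Eval steps):

```shell
# start the proxy for any supported model
$ litellm --model huggingface/bigcode/starcoder

# point the OpenAI-compatible tool at the proxy
$ export OPENAI_API_BASE="http://0.0.0.0:8000"
```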
**Start server:** @@ -440,323 +730,3 @@ print(response) - -## Endpoints: -- `/chat/completions` - chat completions endpoint to call 100+ LLMs -- `/embeddings` - embedding endpoint for Azure, OpenAI, Huggingface endpoints -- `/models` - available models on server - - -## Save Model-specific params (API Base, API Keys, Temperature, etc.) -Use the [router_config_template.yaml](https://github.com/BerriAI/litellm/blob/main/router_config_template.yaml) to save model-specific information like api_base, api_key, temperature, max_tokens, etc. - -1. Create a `config.yaml` file -```shell -model_list: - - model_name: gpt-3.5-turbo - litellm_params: # params for litellm.completion() - https://docs.litellm.ai/docs/completion/input#input---request-body - model: azure/chatgpt-v-2 # azure/ - api_key: your_azure_api_key - api_version: your_azure_api_version - api_base: your_azure_api_base - - model_name: mistral-7b - litellm_params: - model: ollama/mistral - api_base: your_ollama_api_base -``` - -2. Start the server - -```shell -docker run -e PORT=8000 -p 8000:8000 -v $(pwd)/config.yaml:/app/config.yaml ghcr.io/berriai/litellm:latest -``` -## Model Alias - -Set a model alias for your deployments. - -In the `config.yaml` the model_name parameter is the user-facing name to use for your deployment. - -E.g.: If we want to save a Huggingface TGI Mistral-7b deployment, as 'mistral-7b' for our users, we might save it as: - -```yaml -model_list: - - model_name: mistral-7b # ALIAS - litellm_params: - model: huggingface/mistralai/Mistral-7B-Instruct-v0.1 # ACTUAL NAME - api_key: your_huggingface_api_key # [OPTIONAL] if deployed on huggingface inference endpoints - api_base: your_api_base # url where model is deployed -``` - -## Caching - -Add Redis Caching to your server via environment variables - -```env -### REDIS -REDIS_HOST = "" -REDIS_PORT = "" -REDIS_PASSWORD = "" -``` - -Docker command: - -```shell -docker run -e REDIST_HOST= -e REDIS_PORT= -e REDIS_PASSWORD= -e PORT=8000 -p 8000:8000 ghcr.io/berriai/litellm:latest -``` - -## Logging - -1. Debug Logs -Print the input/output params by setting `SET_VERBOSE = "True"`. - -Docker command: - -```shell -docker run -e SET_VERBOSE="True" -e PORT=8000 -p 8000:8000 ghcr.io/berriai/litellm:latest -``` -2. 
Add Langfuse Logging to your server via environment variables - -```env -### LANGFUSE -LANGFUSE_PUBLIC_KEY = "" -LANGFUSE_SECRET_KEY = "" -# Optional, defaults to https://cloud.langfuse.com -LANGFUSE_HOST = "" # optional -``` - -Docker command: - -```shell -docker run -e LANGFUSE_PUBLIC_KEY= -e LANGFUSE_SECRET_KEY= -e LANGFUSE_HOST= -e PORT=8000 -p 8000:8000 ghcr.io/berriai/litellm:latest -``` - -## Local Usage - -```shell -$ git clone https://github.com/BerriAI/litellm.git -``` -```shell -$ cd ./litellm/litellm_server -``` - -```shell -$ uvicorn main:app --host 0.0.0.0 --port 8000 -``` - -## Setting LLM API keys -This server allows two ways of passing API keys to litellm -- Environment Variables - This server by default assumes the LLM API Keys are stored in the environment variables -- Dynamic Variables passed to `/chat/completions` - - Set `AUTH_STRATEGY=DYNAMIC` in the Environment - - Pass required auth params `api_key`,`api_base`, `api_version` with the request params - - - - - -## Deploy on Google Cloud Run -**Click the button** to deploy to Google Cloud Run - -[![Deploy](https://deploy.cloud.run/button.svg)](https://l.linklyhq.com/l/1uHtX) - -On a successfull deploy your Cloud Run Shell will have this output - - -### Testing your deployed server -**Assuming the required keys are set as Environment Variables** - -https://litellm-7yjrj3ha2q-uc.a.run.app is our example server, substitute it with your deployed cloud run app - - - - -```shell -curl https://litellm-7yjrj3ha2q-uc.a.run.app/v1/chat/completions \ - -H "Content-Type: application/json" \ - -d '{ - "model": "gpt-3.5-turbo", - "messages": [{"role": "user", "content": "Say this is a test!"}], - "temperature": 0.7 - }' -``` - - - - -```shell -curl https://litellm-7yjrj3ha2q-uc.a.run.app/v1/chat/completions \ - -H "Content-Type: application/json" \ - -d '{ - "model": "azure/", - "messages": [{"role": "user", "content": "Say this is a test!"}], - "temperature": 0.7 - }' -``` - - - - - -```shell -curl https://litellm-7yjrj3ha2q-uc.a.run.app/v1/chat/completions \ - -H "Content-Type: application/json" \ - -d '{ - "model": "claude-2", - "messages": [{"role": "user", "content": "Say this is a test!"}], - "temperature": 0.7, - }' -``` - - - - -### Set LLM API Keys -#### Environment Variables -More info [here](https://cloud.google.com/run/docs/configuring/services/environment-variables#console) - -1. In the Google Cloud console, go to Cloud Run: [Go to Cloud Run](https://console.cloud.google.com/run) - -2. Click on the **litellm** service - - -3. Click **Edit and Deploy New Revision** - - -4. Enter your Environment Variables -Example `OPENAI_API_KEY`, `ANTHROPIC_API_KEY` - - - - - -## Deploy on Render -**Click the button** to deploy to Render - -[![Deploy](https://render.com/images/deploy-to-render-button.svg)](https://l.linklyhq.com/l/1uHsr) - -On a successfull deploy https://dashboard.render.com/ should display the following - - - - - - -## Deploy on AWS Apprunner -1. Fork LiteLLM https://github.com/BerriAI/litellm -2. Navigate to to App Runner on AWS Console: https://console.aws.amazon.com/apprunner/home#/services -3. Follow the steps in the video below - - -4. 
Testing your deployed endpoint - - **Assuming the required keys are set as Environment Variables** Example: `OPENAI_API_KEY` - - https://b2w6emmkzp.us-east-1.awsapprunner.com is our example server, substitute it with your deployed apprunner endpoint - - - - - ```shell - curl https://b2w6emmkzp.us-east-1.awsapprunner.com/v1/chat/completions \ - -H "Content-Type: application/json" \ - -d '{ - "model": "gpt-3.5-turbo", - "messages": [{"role": "user", "content": "Say this is a test!"}], - "temperature": 0.7 - }' - ``` - - - - - ```shell - curl https://b2w6emmkzp.us-east-1.awsapprunner.com/v1/chat/completions \ - -H "Content-Type: application/json" \ - -d '{ - "model": "azure/", - "messages": [{"role": "user", "content": "Say this is a test!"}], - "temperature": 0.7 - }' - ``` - - - - - - ```shell - curl https://b2w6emmkzp.us-east-1.awsapprunner.com/v1/chat/completions \ - -H "Content-Type: application/json" \ - -d '{ - "model": "claude-2", - "messages": [{"role": "user", "content": "Say this is a test!"}], - "temperature": 0.7, - }' - ``` - - - - - - - -## Advanced -### Caching - Completion() and Embedding() Responses - -Enable caching by adding the following credentials to your server environment - - ``` - REDIS_HOST = "" # REDIS_HOST='redis-18841.c274.us-east-1-3.ec2.cloud.redislabs.com' - REDIS_PORT = "" # REDIS_PORT='18841' - REDIS_PASSWORD = "" # REDIS_PASSWORD='liteLlmIsAmazing' - ``` - -#### Test Caching -Send the same request twice: -```shell -curl http://0.0.0.0:8000/v1/chat/completions \ - -H "Content-Type: application/json" \ - -d '{ - "model": "gpt-3.5-turbo", - "messages": [{"role": "user", "content": "write a poem about litellm!"}], - "temperature": 0.7 - }' - -curl http://0.0.0.0:8000/v1/chat/completions \ - -H "Content-Type: application/json" \ - -d '{ - "model": "gpt-3.5-turbo", - "messages": [{"role": "user", "content": "write a poem about litellm!"}], - "temperature": 0.7 - }' -``` - -#### Control caching per completion request -Caching can be switched on/off per /chat/completions request -- Caching on for completion - pass `caching=True`: - ```shell - curl http://0.0.0.0:8000/v1/chat/completions \ - -H "Content-Type: application/json" \ - -d '{ - "model": "gpt-3.5-turbo", - "messages": [{"role": "user", "content": "write a poem about litellm!"}], - "temperature": 0.7, - "caching": true - }' - ``` -- Caching off for completion - pass `caching=False`: - ```shell - curl http://0.0.0.0:8000/v1/chat/completions \ - -H "Content-Type: application/json" \ - -d '{ - "model": "gpt-3.5-turbo", - "messages": [{"role": "user", "content": "write a poem about litellm!"}], - "temperature": 0.7, - "caching": false - }' - ``` - - - - - - diff --git a/litellm/proxy/proxy_cli.py b/litellm/proxy/proxy_cli.py index d65adce16c..bce43b25e1 100644 --- a/litellm/proxy/proxy_cli.py +++ b/litellm/proxy/proxy_cli.py @@ -6,7 +6,7 @@ from datetime import datetime from dotenv import load_dotenv import operator -config_filename = "litellm.secrets.toml" +config_filename = "litellm.secrets" # Using appdirs to determine user-specific config path config_dir = appdirs.user_config_dir("litellm") user_config_path = os.getenv("LITELLM_CONFIG_PATH", os.path.join(config_dir, config_filename)) @@ -22,39 +22,6 @@ def run_ollama_serve(): with open(os.devnull, 'w') as devnull: process = subprocess.Popen(command, stdout=devnull, stderr=devnull) -def open_config(file_path=None): - # Create the .env file if it doesn't exist - if file_path: - # Ensure the user-specific directory exists - os.makedirs(config_dir, 
exist_ok=True) - # Copying the file using shutil.copy - try: - shutil.copy(file_path, user_config_path) - with open(file_path) as f: - print(f"Source file: {file_path}") - print(f.read()) - - with open(user_config_path) as f: - print(f"Dest file: {user_config_path}") - print(f.read()) - print("\033[1;32mDone successfully\033[0m") - except Exception as e: - print(f"Failed to copy {file_path}: {e}") - else: - if os.path.exists(user_config_path): - if os.path.getsize(user_config_path) == 0: - print(f"{user_config_path} exists but is empty") - print(f"To create a config (save keys, modify model prompt), copy the template located here: https://docs.litellm.ai/docs/proxy_server") - else: - with open(user_config_path) as f: - print(f"Saved Config file: {user_config_path}") - print(f.read()) - else: - print(f"{user_config_path} hasn't been created yet.") - print(f"To create a config (save keys, modify model prompt), copy the template located here: https://docs.litellm.ai/docs/proxy_server") - print(f"LiteLLM: config location - {user_config_path}") - - def clone_subfolder(repo_url, subfolder, destination): # Clone the full repo repo_name = repo_url.split('/')[-1] @@ -99,7 +66,7 @@ def is_port_in_use(port): @click.option('--drop_params', is_flag=True, help='Drop any unmapped params') @click.option('--create_proxy', is_flag=True, help='Creates a local OpenAI-compatible server template') @click.option('--add_function_to_prompt', is_flag=True, help='If function passed but unsupported, pass it as prompt') -@click.option('--config', '-c', is_flag=True, help='Configure Litellm') +@click.option('--config', '-c', help='Configure Litellm') @click.option('--file', '-f', help='Path to config file') @click.option('--max_budget', default=None, type=float, help='Set max budget for API calls - works for hosted models like OpenAI, TogetherAI, Anthropic, etc.`') @click.option('--telemetry', default=True, type=bool, help='Helps us know if people are using this feature. 
Turn this off by doing `--telemetry False`') @@ -126,12 +93,6 @@ def run_server(host, port, api_base, api_version, model, alias, add_key, headers clone_subfolder(repo_url, subfolder, destination) return - if config: - if file: - open_config(file_path=file) - else: - open_config() - return if logs is not None: if logs == 0: # default to 1 logs = 1 @@ -202,7 +163,7 @@ def run_server(host, port, api_base, api_version, model, alias, add_key, headers else: if headers: headers = json.loads(headers) - initialize(model=model, alias=alias, api_base=api_base, api_version=api_version, debug=debug, temperature=temperature, max_tokens=max_tokens, request_timeout=request_timeout, max_budget=max_budget, telemetry=telemetry, drop_params=drop_params, add_function_to_prompt=add_function_to_prompt, headers=headers, save=save) + initialize(model=model, alias=alias, api_base=api_base, api_version=api_version, debug=debug, temperature=temperature, max_tokens=max_tokens, request_timeout=request_timeout, max_budget=max_budget, telemetry=telemetry, drop_params=drop_params, add_function_to_prompt=add_function_to_prompt, headers=headers, save=save, config=config) try: import uvicorn except: diff --git a/litellm/proxy/proxy_server.py b/litellm/proxy/proxy_server.py index 72fa14ec2c..65b5dd3dc7 100644 --- a/litellm/proxy/proxy_server.py +++ b/litellm/proxy/proxy_server.py @@ -1,7 +1,7 @@ import sys, os, platform, time, copy import threading, ast import shutil, random, traceback, requests - +from typing import Optional messages: list = [] sys.path.insert( 0, os.path.abspath("../..") @@ -14,6 +14,7 @@ try: import appdirs import tomli_w import backoff + import yaml except ImportError: import subprocess import sys @@ -38,11 +39,6 @@ except ImportError: import appdirs import tomli_w -try: - from .llm import litellm_completion -except ImportError as e: - from llm import litellm_completion # type: ignore - import random list_of_messages = [ @@ -120,13 +116,16 @@ user_telemetry = True user_config = None user_headers = None local_logging = True # writes logs to a local api_log.json file for debugging -model_router = litellm.Router() config_filename = "litellm.secrets.toml" config_dir = os.getcwd() config_dir = appdirs.user_config_dir("litellm") user_config_path = os.getenv( "LITELLM_CONFIG_PATH", os.path.join(config_dir, config_filename) ) +#### GLOBAL VARIABLES #### +llm_router: Optional[litellm.Router] = None +llm_model_list: Optional[list] = None +server_settings: Optional[dict] = None log_file = "api_log.json" @@ -137,13 +136,6 @@ def print_verbose(print_statement): print(print_statement) -def find_avatar_url(role): - role = role.replace(" ", "%20") - avatar_filename = f"avatars/{role}.png" - avatar_url = f"/static/{avatar_filename}" - return avatar_url - - def usage_telemetry( feature: str, ): # helps us know if people are using this feature. Set `litellm --telemetry False` to your cli call to turn this off @@ -205,102 +197,141 @@ def save_params_to_config(data: dict): tomli_w.dump(config, f) +def load_router_config(router: Optional[litellm.Router], config_file_path: Optional[str]): + config = {} + server_settings = {} + try: + if os.path.exists(config_file_path): + with open(config_file_path, 'r') as file: + config = yaml.safe_load(file) + else: + pass + except: + pass + + ## SERVER SETTINGS (e.g. default completion model = 'ollama/mistral') + server_settings = config.get("server_settings", None) + if server_settings: + server_settings = server_settings + + ## LITELLM MODULE SETTINGS (e.g. litellm.drop_params=True,..) 
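+    # illustrative only (assumption): a config.yaml might carry e.g.
+    #   litellm_settings:
+    #     drop_params: true
+    #     set_verbose: true
+    # each key/value below is set directly on the litellm module via setattr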
+ litellm_settings = config.get('litellm_settings', None) + if litellm_settings: + for key, value in litellm_settings.items(): + setattr(litellm, key, value) + + ## MODEL LIST + model_list = config.get('model_list', None) + if model_list: + router = litellm.Router(model_list=model_list) + + ## ENVIRONMENT VARIABLES + environment_variables = config.get('environment_variables', None) + if environment_variables: + for key, value in environment_variables.items(): + os.environ[key] = value + + return router, model_list, server_settings + def load_config(): + #### DEPRECATED #### try: - global user_config, user_api_base, user_max_tokens, user_temperature, user_model, local_logging - # As the .env file is typically much simpler in structure, we use load_dotenv here directly - with open(user_config_path, "rb") as f: - user_config = tomllib.load(f) + global user_config, user_api_base, user_max_tokens, user_temperature, user_model, local_logging, llm_model_list, llm_router, server_settings + + # Get the file extension + file_extension = os.path.splitext(user_config_path)[1] + if file_extension.lower() == ".toml": + # As the .env file is typically much simpler in structure, we use load_dotenv here directly + with open(user_config_path, "rb") as f: + user_config = tomllib.load(f) - ## load keys - if "keys" in user_config: - for key in user_config["keys"]: - os.environ[key] = user_config["keys"][ - key - ] # litellm can read keys from the environment - ## settings - if "general" in user_config: - litellm.add_function_to_prompt = user_config["general"].get( - "add_function_to_prompt", True - ) # by default add function to prompt if unsupported by provider - litellm.drop_params = user_config["general"].get( - "drop_params", True - ) # by default drop params if unsupported by provider - litellm.model_fallbacks = user_config["general"].get( - "fallbacks", None - ) # fallback models in case initial completion call fails - default_model = user_config["general"].get( - "default_model", None - ) # route all requests to this model. + ## load keys + if "keys" in user_config: + for key in user_config["keys"]: + os.environ[key] = user_config["keys"][ + key + ] # litellm can read keys from the environment + ## settings + if "general" in user_config: + litellm.add_function_to_prompt = user_config["general"].get( + "add_function_to_prompt", True + ) # by default add function to prompt if unsupported by provider + litellm.drop_params = user_config["general"].get( + "drop_params", True + ) # by default drop params if unsupported by provider + litellm.model_fallbacks = user_config["general"].get( + "fallbacks", None + ) # fallback models in case initial completion call fails + default_model = user_config["general"].get( + "default_model", None + ) # route all requests to this model. - local_logging = user_config["general"].get("local_logging", True) + local_logging = user_config["general"].get("local_logging", True) - if user_model is None: # `litellm --model `` > default_model. - user_model = default_model + if user_model is None: # `litellm --model `` > default_model. 
+ user_model = default_model - ## load model config - to set this run `litellm --config` - model_config = None - if "model" in user_config: - if user_model in user_config["model"]: - model_config = user_config["model"][user_model] - model_list = [] - for model in user_config["model"]: - if "model_list" in user_config["model"][model]: - model_list.extend(user_config["model"][model]["model_list"]) - if len(model_list) > 0: - model_router.set_model_list(model_list=model_list) + ## load model config - to set this run `litellm --config` + model_config = None + if "model" in user_config: + if user_model in user_config["model"]: + model_config = user_config["model"][user_model] + model_list = [] + for model in user_config["model"]: + if "model_list" in user_config["model"][model]: + model_list.extend(user_config["model"][model]["model_list"]) - print_verbose(f"user_config: {user_config}") - print_verbose(f"model_config: {model_config}") - print_verbose(f"user_model: {user_model}") - if model_config is None: - return + print_verbose(f"user_config: {user_config}") + print_verbose(f"model_config: {model_config}") + print_verbose(f"user_model: {user_model}") + if model_config is None: + return - user_max_tokens = model_config.get("max_tokens", None) - user_temperature = model_config.get("temperature", None) - user_api_base = model_config.get("api_base", None) + user_max_tokens = model_config.get("max_tokens", None) + user_temperature = model_config.get("temperature", None) + user_api_base = model_config.get("api_base", None) - ## custom prompt template - if "prompt_template" in model_config: - model_prompt_template = model_config["prompt_template"] - if ( - len(model_prompt_template.keys()) > 0 - ): # if user has initialized this at all - litellm.register_prompt_template( - model=user_model, - initial_prompt_value=model_prompt_template.get( - "MODEL_PRE_PROMPT", "" - ), - roles={ - "system": { - "pre_message": model_prompt_template.get( - "MODEL_SYSTEM_MESSAGE_START_TOKEN", "" - ), - "post_message": model_prompt_template.get( - "MODEL_SYSTEM_MESSAGE_END_TOKEN", "" - ), + ## custom prompt template + if "prompt_template" in model_config: + model_prompt_template = model_config["prompt_template"] + if ( + len(model_prompt_template.keys()) > 0 + ): # if user has initialized this at all + litellm.register_prompt_template( + model=user_model, + initial_prompt_value=model_prompt_template.get( + "MODEL_PRE_PROMPT", "" + ), + roles={ + "system": { + "pre_message": model_prompt_template.get( + "MODEL_SYSTEM_MESSAGE_START_TOKEN", "" + ), + "post_message": model_prompt_template.get( + "MODEL_SYSTEM_MESSAGE_END_TOKEN", "" + ), + }, + "user": { + "pre_message": model_prompt_template.get( + "MODEL_USER_MESSAGE_START_TOKEN", "" + ), + "post_message": model_prompt_template.get( + "MODEL_USER_MESSAGE_END_TOKEN", "" + ), + }, + "assistant": { + "pre_message": model_prompt_template.get( + "MODEL_ASSISTANT_MESSAGE_START_TOKEN", "" + ), + "post_message": model_prompt_template.get( + "MODEL_ASSISTANT_MESSAGE_END_TOKEN", "" + ), + }, }, - "user": { - "pre_message": model_prompt_template.get( - "MODEL_USER_MESSAGE_START_TOKEN", "" - ), - "post_message": model_prompt_template.get( - "MODEL_USER_MESSAGE_END_TOKEN", "" - ), - }, - "assistant": { - "pre_message": model_prompt_template.get( - "MODEL_ASSISTANT_MESSAGE_START_TOKEN", "" - ), - "post_message": model_prompt_template.get( - "MODEL_ASSISTANT_MESSAGE_END_TOKEN", "" - ), - }, - }, - final_prompt_value=model_prompt_template.get( - "MODEL_POST_PROMPT", "" - ), - ) + 
final_prompt_value=model_prompt_template.get( + "MODEL_POST_PROMPT", "" + ), + ) except: pass @@ -320,12 +351,14 @@ def initialize( add_function_to_prompt, headers, save, + config ): - global user_model, user_api_base, user_debug, user_max_tokens, user_request_timeout, user_temperature, user_telemetry, user_headers + global user_model, user_api_base, user_debug, user_max_tokens, user_request_timeout, user_temperature, user_telemetry, user_headers, llm_model_list, llm_router, server_settings user_model = model user_debug = debug - load_config() dynamic_config = {"general": {}, user_model: {}} + if config: + llm_router, llm_model_list, server_settings = load_router_config(router=llm_router, config_file_path=config) if headers: # model-specific param user_headers = headers dynamic_config[user_model]["headers"] = headers @@ -470,17 +503,50 @@ litellm.input_callback = [logger] litellm.success_callback = [logger] litellm.failure_callback = [logger] +# for streaming +def data_generator(response): + print_verbose("inside generator") + for chunk in response: + print_verbose(f"returned chunk: {chunk}") + yield f"data: {json.dumps(chunk)}\n\n" + + +def litellm_completion(*args, **kwargs): + global user_temperature, user_request_timeout, user_max_tokens, user_api_base + call_type = kwargs.pop("call_type") + # override with user settings + if user_temperature: + kwargs["temperature"] = user_temperature + if user_request_timeout: + kwargs["request_timeout"] = user_request_timeout + if user_max_tokens: + kwargs["max_tokens"] = user_max_tokens + if user_api_base: + kwargs["api_base"] = user_api_base + ## CHECK CONFIG ## + if llm_model_list and kwargs["model"] in [m["model_name"] for m in llm_model_list]: + for m in llm_model_list: + if kwargs["model"] == m["model_name"]: + for key, value in m["litellm_params"].items(): + kwargs[key] = value + break + print(f"call going to litellm: {kwargs}") + if call_type == "chat_completion": + response = litellm.completion(*args, **kwargs) + elif call_type == "text_completion": + response = litellm.text_completion(*args, **kwargs) + if 'stream' in kwargs and kwargs['stream'] == True: # use generate_responses to stream responses + return StreamingResponse(data_generator(response), media_type='text/event-stream') + return response #### API ENDPOINTS #### @router.get("/v1/models") @router.get("/models") # if project requires model list def model_list(): - # all_models = litellm.utils.get_valid_models() - # if llm_model_list: - # all_models += llm_model_list - - + global llm_model_list all_models = litellm.utils.get_valid_models() + if llm_model_list: + all_models += llm_model_list if user_model is not None: all_models += user_model ### CHECK OLLAMA MODELS ### @@ -508,36 +574,35 @@ def model_list(): @router.post("/completions") @router.post("/engines/{model:path}/completions") async def completion(request: Request): + body = await request.body() + body_str = body.decode() try: - body = await request.body() - body_str = body.decode() - try: - data = ast.literal_eval(body_str) - except: - data = json.loads(body_str) - return litellm_completion(data=data, type="completion", user_model=user_model, user_temperature=user_temperature, - user_max_tokens=user_max_tokens, user_api_base=user_api_base, user_headers=user_headers, - user_debug=user_debug, model_router=model_router, user_request_timeout=user_request_timeout) - except Exception as e: - print(e) - return + data = ast.literal_eval(body_str) + except: + data = json.loads(body_str) + if user_model: + data["model"] = 
user_model + data["call_type"] = "text_completion" + return litellm_completion( + **data + ) + @router.post("/v1/chat/completions") @router.post("/chat/completions") async def chat_completion(request: Request): + body = await request.body() + body_str = body.decode() try: - body = await request.body() - body_str = body.decode() - try: - data = ast.literal_eval(body_str) - except: - data = json.loads(body_str) - return litellm_completion(data, type="chat_completion", user_model=user_model, - user_temperature=user_temperature, user_max_tokens=user_max_tokens, - user_api_base=user_api_base, user_headers=user_headers, user_debug=user_debug, model_router=model_router, user_request_timeout=user_request_timeout) - except Exception as e: - print(e) - return + data = ast.literal_eval(body_str) + except: + data = json.loads(body_str) + if user_model: + data["model"] = user_model + data["call_type"] = "chat_completion" + return litellm_completion( + **data + ) def print_cost_logs(): with open("costs.json", "r") as f:
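A quick way to exercise the new config path end-to-end (a sketch: the `--config` flag and the `model_list` format come from this patch, port 8000 is the proxy default from the docs above, the `mistral-7b` alias is assumed to be defined in the config, and the request body is illustrative):

```shell
# start the proxy against a config.yaml containing a model_list
$ litellm --config /path/to/config.yaml

# /completions now routes through litellm_completion with call_type="text_completion"
curl http://0.0.0.0:8000/completions \
  -H "Content-Type: application/json" \
  -d '{
     "model": "mistral-7b",
     "prompt": "Say this is a test"
   }'
```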