From afe14c8a96df7c0ce6cf2d726e9e10ce9f660e14 Mon Sep 17 00:00:00 2001
From: Krrish Dholakia
Date: Fri, 27 Oct 2023 16:00:28 -0700
Subject: [PATCH] fix(utils.py/completion_with_fallbacks): accept azure deployment name in rotations
---
 docs/my-website/docs/simple_proxy.md | 276 +++++++++++++++++++++------
 litellm/main.py                      |   1 +
 litellm/utils.py                     |   3 +-
 3 files changed, 224 insertions(+), 56 deletions(-)

diff --git a/docs/my-website/docs/simple_proxy.md b/docs/my-website/docs/simple_proxy.md
index becf87e98..b8501c8df 100644
--- a/docs/my-website/docs/simple_proxy.md
+++ b/docs/my-website/docs/simple_proxy.md
@@ -6,19 +6,224 @@ import TabItem from '@theme/TabItem';

A simple, fast, and lightweight **OpenAI-compatible server** to call 100+ LLM APIs in the OpenAI Input/Output format

-## Endpoints:
-- `/chat/completions` - chat completions endpoint to call 100+ LLMs
-- `/models` - available models on server
-
-[![Deploy](https://deploy.cloud.run/button.svg)](https://l.linklyhq.com/l/1uHtX)
-[![Deploy](https://render.com/images/deploy-to-render-button.svg)](https://l.linklyhq.com/l/1uHsr)
-[![Deploy](../img/deploy-to-aws.png)](https://docs.litellm.ai/docs/simple_proxy#deploy-on-aws-apprunner)
+[**See Code**](https://github.com/BerriAI/litellm/tree/main/litellm_server)

:::info
We want to learn how we can make the server better! Meet the [founders](https://calendly.com/d/4mp-gd3-k5k/berriai-1-1-onboarding-litellm-hosted-version) or join our [discord](https://discord.gg/wuPM9dRgDw)
:::

## Usage

```shell
docker run -e PORT=8000 -e OPENAI_API_KEY= -p 8000:8000 ghcr.io/berriai/litellm:latest

# UVICORN: OpenAI Proxy running on http://0.0.0.0:8000
```

```shell
curl http://0.0.0.0:8000/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
    "model": "gpt-3.5-turbo",
    "messages": [{"role": "user", "content": "Say this is a test!"}],
    "temperature": 0.7
  }'
```

#### Other supported models:

```shell
$ docker run -e PORT=8000 -e AWS_ACCESS_KEY_ID= -e AWS_SECRET_ACCESS_KEY= -p 8000:8000 ghcr.io/berriai/litellm:latest
```

If you're calling it via Huggingface Inference Endpoints:
```shell
$ docker run -e PORT=8000 -e HUGGINGFACE_API_KEY= -p 8000:8000 ghcr.io/berriai/litellm:latest
```

Otherwise:
```shell
$ docker run -e PORT=8000 -p 8000:8000 ghcr.io/berriai/litellm:latest
```

```shell
$ docker run -e PORT=8000 -e ANTHROPIC_API_KEY= -p 8000:8000 ghcr.io/berriai/litellm:latest
```

```shell
$ docker run -e PORT=8000 -e OLLAMA_API_BASE= -p 8000:8000 ghcr.io/berriai/litellm:latest
```

```shell
$ docker run -e PORT=8000 -e TOGETHERAI_API_KEY= -p 8000:8000 ghcr.io/berriai/litellm:latest
```

```shell
$ docker run -e PORT=8000 -e REPLICATE_API_KEY= -p 8000:8000 ghcr.io/berriai/litellm:latest
```

```shell
$ docker run -e PORT=8000 -e PALM_API_KEY= -p 8000:8000 ghcr.io/berriai/litellm:latest
```

```shell
$ docker run -e PORT=8000 -e AZURE_API_KEY= -e AZURE_API_BASE= -p 8000:8000 ghcr.io/berriai/litellm:latest
```

```shell
$ docker run -e PORT=8000 -e AI21_API_KEY= -p 8000:8000 ghcr.io/berriai/litellm:latest
```

```shell
$ docker run -e PORT=8000 -e COHERE_API_KEY= -p 8000:8000 ghcr.io/berriai/litellm:latest
```

## Endpoints:
- `/chat/completions` - chat completions endpoint to call 100+ LLMs
- `/embeddings` - embedding endpoint for Azure, OpenAI, Huggingface endpoints
- `/models` - available models on server

## Save Model-specific params (API Base, API Keys, Temperature, etc.)
Use the [router_config_template.yaml](https://github.com/BerriAI/litellm/blob/main/router_config_template.yaml) to save model-specific information like api_base, api_key, temperature, max_tokens, etc.

1. Create a `config.yaml` file
```yaml
model_list:
  - model_name: gpt-3.5-turbo
    litellm_params: # params for litellm.completion() - https://docs.litellm.ai/docs/completion/input#input---request-body
      model: azure/chatgpt-v-2 # azure/
      api_key: your_azure_api_key
      api_version: your_azure_api_version
      api_base: your_azure_api_base
  - model_name: mistral-7b
    litellm_params:
      model: ollama/mistral
      api_base: your_ollama_api_base
```

2. Start the server

```shell
docker run -e PORT=8000 -p 8000:8000 -v $(pwd)/config.yaml:/app/config.yaml ghcr.io/berriai/litellm:latest
```

## Caching

Add Redis Caching to your server via environment variables

```env
### REDIS
REDIS_HOST = ""
REDIS_PORT = ""
REDIS_PASSWORD = ""
```

Docker command:

```shell
docker run -e REDIS_HOST= -e REDIS_PORT= -e REDIS_PASSWORD= -e PORT=8000 -p 8000:8000 ghcr.io/berriai/litellm:latest
```

## Logging

1. Debug Logs
Print the input/output params by setting `SET_VERBOSE = "True"`.

Docker command:

```shell
docker run -e SET_VERBOSE="True" -e PORT=8000 -p 8000:8000 ghcr.io/berriai/litellm:latest
```

2. Add Langfuse Logging to your server via environment variables

```env
### LANGFUSE
LANGFUSE_PUBLIC_KEY = ""
LANGFUSE_SECRET_KEY = ""
# Optional, defaults to https://cloud.langfuse.com
LANGFUSE_HOST = "" # optional
```

Docker command:

```shell
docker run -e LANGFUSE_PUBLIC_KEY= -e LANGFUSE_SECRET_KEY= -e LANGFUSE_HOST= -e PORT=8000 -p 8000:8000 ghcr.io/berriai/litellm:latest
```

## Tutorials

Here's the `docker-compose.yml` for running LiteLLM Server with Mckay Wrigley's Chatbot UI:
```yaml
version: '3'
services:
  container1:
    image: ghcr.io/berriai/litellm:latest
    ports:
      - '8000:8000'
    environment:
      - PORT=8000
      - OPENAI_API_KEY=your_openai_api_key

  container2:
    image: ghcr.io/mckaywrigley/chatbot-ui:main
    ports:
      - '3000:3000'
    environment:
      - OPENAI_API_KEY=my-fake-key
      - OPENAI_API_HOST=http://container1:8000
```

Run this via:
```shell
docker-compose up
```

## Local Usage

```shell
$ git clone https://github.com/BerriAI/litellm.git
```
@@ -33,53 +238,6 @@ $ cd ./litellm/litellm_server
$ uvicorn main:app --host 0.0.0.0 --port 8000
```

-### Test Request
-Ensure your API keys are set in the Environment for these requests
-
-```shell
-curl http://0.0.0.0:8000/v1/chat/completions \
-  -H "Content-Type: application/json" \
-  -d '{
-    "model": "gpt-3.5-turbo",
-    "messages": [{"role": "user", "content": "Say this is a test!"}],
-    "temperature": 0.7
-  }'
-```
-
-```shell
-curl http://0.0.0.0:8000/v1/chat/completions \
-  -H "Content-Type: application/json" \
-  -d '{
-    "model": "azure/",
-    "messages": [{"role": "user", "content": "Say this is a test!"}],
-    "temperature": 0.7
-  }'
-```
-
-```shell
-curl http://0.0.0.0:8000/v1/chat/completions \
-  -H "Content-Type: application/json" \
-  -d '{
-    "model": "claude-2",
-    "messages": [{"role": "user", "content": "Say this is a test!"}],
-    "temperature": 0.7,
-  }'
-```
-
## Setting LLM API keys
This server allows two ways of passing API keys to litellm
- Environment Variables - This server by default assumes the LLM API Keys are stored in the environment variables
@@ -87,6 +245,10 @@ This server allows two ways of passing API keys to litellm
- Dynamic Variables passed to `/chat/completions`
  - Set `AUTH_STRATEGY=DYNAMIC` in the Environment
  - Pass required auth params `api_key`, `api_base`, `api_version` with the request params

## Deploy on Google Cloud Run
**Click the button** to deploy to Google Cloud Run

@@ -159,6 +321,8 @@ More info [here](https://cloud.google.com/run/docs/configuring/services/environm

Example `OPENAI_API_KEY`, `ANTHROPIC_API_KEY`

## Deploy on Render
**Click the button** to deploy to Render

[![Deploy](https://render.com/images/deploy-to-render-button.svg)](https://l.linklyhq.com/l/1uHsr)

@@ -169,6 +333,8 @@ On a successfull deploy https://dashboard.render.com/ should display the followi

## Deploy on AWS Apprunner
1. Fork LiteLLM https://github.com/BerriAI/litellm
@@ -225,6 +391,8 @@ On a successfull deploy https://dashboard.render.com/ should display the followi

## Advanced
### Caching - Completion() and Embedding() Responses

diff --git a/litellm/main.py b/litellm/main.py
index 7a7571583..7667c9079 100644
--- a/litellm/main.py
+++ b/litellm/main.py
@@ -1087,6 +1087,7 @@ def completion(
             api_base = (
                 litellm.api_base or
                 api_base or
+                get_secret("OLLAMA_API_BASE") or
                 "http://localhost:11434"
             )

diff --git a/litellm/utils.py b/litellm/utils.py
index b99f2999c..8c9911484 100644
--- a/litellm/utils.py
+++ b/litellm/utils.py
@@ -4079,8 +4079,7 @@ def completion_with_fallbacks(**kwargs):
             if isinstance(model, dict): # completion(model="gpt-4", fallbacks=[{"api_key": "", "api_base": ""}, {"api_key": "", "api_base": ""}])
                 kwargs["api_key"] = model.get("api_key", None)
                 kwargs["api_base"] = model.get("api_base", None)
-                model = original_model
-                print(f"switched api keys")
+                model = model.get("model", original_model)
             elif (
                 model in rate_limited_models
             ): # check if model is currently cooling down
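
For reviewers, here is a minimal sketch of how the `utils.py` change is meant to be exercised (the deployment names and credentials below are placeholders, not part of this patch): a fallback entry passed as a dict can now carry its own `model`, so rotating to a different Azure deployment no longer silently reuses the original model name.

```python
# Hypothetical usage sketch for the completion_with_fallbacks fix.
# All keys, API bases, and deployment names below are placeholders.
import litellm

response = litellm.completion(
    model="azure/chatgpt-v-2",  # primary Azure deployment (placeholder)
    messages=[{"role": "user", "content": "Say this is a test!"}],
    fallbacks=[
        {
            # picked up via model.get("model", original_model) after this patch
            "model": "azure/your-second-deployment",
            "api_key": "your_azure_api_key",
            "api_base": "your_azure_api_base",
        },
        "gpt-3.5-turbo",  # plain string fallbacks keep their existing behavior
    ],
)
print(response)
```

Only dict entries are affected; string entries in `fallbacks` still fall through to the unchanged rotation logic.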