diff --git a/.circleci/config.yml b/.circleci/config.yml
index d563b8c17..61734d78a 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -36,6 +36,7 @@ jobs:
pip install appdirs
pip install langchain
pip install numpydoc
+ pip install traceloop-sdk==0.0.69
- save_cache:
paths:
- ./venv
diff --git a/.flake8 b/.flake8
new file mode 100644
index 000000000..b51cc0045
--- /dev/null
+++ b/.flake8
@@ -0,0 +1,2 @@
+[flake8]
+ignore = E,F,W,B,B9,C,D,I,N,S,W503,W504,E203, TCE,TCA,EXE999,E999,TD
\ No newline at end of file
diff --git a/.gitignore b/.gitignore
index 313241e4c..e3e1bee69 100644
--- a/.gitignore
+++ b/.gitignore
@@ -13,3 +13,4 @@ litellm/proxy/litellm_secrets.toml
litellm/proxy/api_log.json
.idea/
router_config.yaml
+litellm_server/config.yaml
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
new file mode 100644
index 000000000..8bda916bc
--- /dev/null
+++ b/.pre-commit-config.yaml
@@ -0,0 +1,8 @@
+repos:
+- repo: https://github.com/pycqa/flake8
+ rev: 3.8.4 # The version of flake8 to use
+ hooks:
+ - id: flake8
+ exclude: ^litellm/tests/|^litellm/proxy/|^litellm/integrations/
+ additional_dependencies: [flake8-print]
+ files: litellm/.*\.py
\ No newline at end of file
diff --git a/Dockerfile b/Dockerfile
index 30d78eb18..179629c9a 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,11 +1,5 @@
FROM python:3.10
-# Define a build argument for the config file path
-ARG CONFIG_FILE
-
-# Copy the custom config file (if provided) into the Docker image
-COPY $CONFIG_FILE /app/config.yaml
-
COPY . /app
WORKDIR /app
RUN pip install -r requirements.txt
diff --git a/README.md b/README.md
index 31c7d85b0..db51b85d6 100644
--- a/README.md
+++ b/README.md
@@ -5,22 +5,7 @@
Call all LLM APIs using the OpenAI format [Bedrock, Huggingface, Cohere, TogetherAI, Azure, OpenAI, etc.]
-
-
-
+
+
+LiteLLM Server supports:
+- LLM API Calls in the OpenAI ChatCompletions format
+- Caching + Logging capabilities (Redis and Langfuse, respectively)
+- Setting API keys in the request headers or in the .env
## Usage
```shell
-docker run -e PORT=8000 -p 8000:8000 ghcr.io/berriai/litellm:latest
-
-# UVICORN: OpenAI Proxy running on http://0.0.0.0:8000
+docker run -e PORT=8000 -e OPENAI_API_KEY= -p 8000:8000 ghcr.io/berriai/litellm:latest
```
+Your OpenAI-compatible proxy is now running on http://0.0.0.0:8000. Test it with a request:
-## Endpoints:
-- `/chat/completions` - chat completions endpoint to call 100+ LLMs
-- `/router/completions` - for multiple deployments of the same model (e.g. Azure OpenAI), uses the least used deployment. [Learn more](https://docs.litellm.ai/docs/routing)
-- `/models` - available models on server
-
-## Making Requests to Proxy
-### Curl
-
-**Call OpenAI**
```shell
curl http://0.0.0.0:8000/v1/chat/completions \
-H "Content-Type: application/json" \
@@ -37,18 +38,80 @@ curl http://0.0.0.0:8000/v1/chat/completions \
"temperature": 0.7
}'
```
-**Call Bedrock**
+
+[**See how to call Huggingface, Bedrock, TogetherAI, Anthropic, etc.**](https://docs.litellm.ai/docs/providers)
+## Endpoints:
+- `/chat/completions` - chat completions endpoint to call 100+ LLMs
+- `/models` - available models on server
+
+## Save Model-specific params (API Base, API Keys, Temperature, etc.)
+Use the [router_config_template.yaml](https://github.com/BerriAI/litellm/blob/main/router_config_template.yaml) to save model-specific information like api_base, api_key, temperature, max_tokens, etc.
+
+1. Create a `config.yaml` file
```shell
-curl http://0.0.0.0:8000/v1/chat/completions \
- -H "Content-Type: application/json" \
- -d '{
- "model": "bedrock/anthropic.claude-instant-v1",
- "messages": [{"role": "user", "content": "Say this is a test!"}],
- "temperature": 0.7
- }'
+model_list:
+ - model_name: gpt-3.5-turbo # set model alias
+ litellm_params: # params for litellm.completion() - https://docs.litellm.ai/docs/completion/input#input---request-body
+      model: azure/chatgpt-v-2 # azure/<your-deployment-name> <- actual name used for litellm.completion()
+ api_key: your_azure_api_key
+ api_version: your_azure_api_version
+ api_base: your_azure_api_base
+ - model_name: mistral-7b
+ litellm_params:
+ model: ollama/mistral
+ api_base: your_ollama_api_base
```
-### Running Locally
+2. Start the server
+
+```shell
+docker run -e PORT=8000 -p 8000:8000 -v $(pwd)/config.yaml:/app/config.yaml ghcr.io/berriai/litellm:latest
+```
+## Caching
+
+Add Redis Caching to your server via environment variables
+
+```env
+### REDIS
+REDIS_HOST = ""
+REDIS_PORT = ""
+REDIS_PASSWORD = ""
+```
+
+Docker command:
+
+```shell
+docker run -e REDIS_HOST= -e REDIS_PORT= -e REDIS_PASSWORD= -e PORT=8000 -p 8000:8000 ghcr.io/berriai/litellm:latest
+```
+
+## Logging
+
+### Debug Logs
+Print the input/output params by setting `SET_VERBOSE = "True"`.
+
+Docker command:
+
+```shell
+docker run -e SET_VERBOSE="True" -e PORT=8000 -p 8000:8000 ghcr.io/berriai/litellm:latest
+```
+
+### Langfuse Logging
+Add Langfuse logging to your server via environment variables:
+
+```env
+### LANGFUSE
+LANGFUSE_PUBLIC_KEY = ""
+LANGFUSE_SECRET_KEY = ""
+LANGFUSE_HOST = "" # optional, defaults to https://cloud.langfuse.com
+```
+
+Docker command:
+
+```shell
+docker run -e LANGFUSE_PUBLIC_KEY= -e LANGFUSE_SECRET_KEY= -e LANGFUSE_HOST= -e PORT=8000 -p 8000:8000 ghcr.io/berriai/litellm:latest
+```
+
+## Running Locally
```shell
$ git clone https://github.com/BerriAI/litellm.git
```
@@ -59,5 +122,16 @@ $ cd ./litellm/litellm_server
```shell
$ uvicorn main:app --host 0.0.0.0 --port 8000
```
-
-[**See how to call Huggingface,Bedrock,TogetherAI,Anthropic, etc.**](https://docs.litellm.ai/docs/simple_proxy)
+### Custom Config
+1. Create + Modify [router_config.yaml](https://github.com/BerriAI/litellm/blob/main/router_config_template.yaml) (save your azure/openai/etc. deployment info)
+```shell
+cp ./router_config_template.yaml ./router_config.yaml
+```
+2. Build Docker Image
+```shell
+docker build -t litellm_server .
+```
+3. Run Docker Image (mount your config at `/app/config.yaml`, the default path the server reads)
+```shell
+docker run --name litellm_server -v $(pwd)/router_config.yaml:/app/config.yaml -e PORT=8000 -p 8000:8000 litellm_server
+```
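For reference, a minimal Python equivalent of the curl example in the README above — a sketch only: it assumes the container is running locally on port 8000 and that `gpt-3.5-turbo` is reachable via your env keys or `config.yaml`, and it uses the `requests` library rather than any LiteLLM client.

```python
# Hedged sketch (not part of this diff): call the server's OpenAI-format endpoint.
import requests

resp = requests.post(
    "http://0.0.0.0:8000/chat/completions",
    json={
        "model": "gpt-3.5-turbo",  # or any model_name from your config.yaml
        "messages": [{"role": "user", "content": "Say this is a test!"}],
        "temperature": 0.7,
    },
    timeout=60,
)
resp.raise_for_status()
print(resp.json()["choices"][0]["message"]["content"])
```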
diff --git a/litellm_server/config b/litellm_server/config
deleted file mode 100644
index e69de29bb..000000000
diff --git a/litellm_server/main.py b/litellm_server/main.py
index 4f2586b7a..c7b26b685 100644
--- a/litellm_server/main.py
+++ b/litellm_server/main.py
@@ -1,15 +1,19 @@
-import litellm, os, traceback
+import os, traceback
from fastapi import FastAPI, Request, HTTPException
from fastapi.routing import APIRouter
from fastapi.responses import StreamingResponse, FileResponse
from fastapi.middleware.cors import CORSMiddleware
-import json
-import os
+import json, sys
from typing import Optional
+sys.path.insert(
+ 0, os.path.abspath("../")
+) # Adds the parent directory to the system path - for litellm local dev
+import litellm
+print(f"litellm: {litellm}")
try:
- from utils import set_callbacks, load_router_config
+ from utils import set_callbacks, load_router_config, print_verbose
except ImportError:
- from litellm_server.utils import set_callbacks, load_router_config
+ from litellm_server.utils import set_callbacks, load_router_config, print_verbose
import dotenv
dotenv.load_dotenv() # load env variables
@@ -26,14 +30,23 @@ app.add_middleware(
)
#### GLOBAL VARIABLES ####
llm_router: Optional[litellm.Router] = None
+llm_model_list: Optional[list] = None
+server_settings: Optional[dict] = None
set_callbacks() # sets litellm callbacks for logging if they exist in the environment
-llm_router = load_router_config(router=llm_router)
+
+if "CONFIG_FILE_PATH" in os.environ:
+    print("CONFIG FILE DETECTED")
+ llm_router, llm_model_list, server_settings = load_router_config(router=llm_router, config_file_path=os.getenv("CONFIG_FILE_PATH"))
+else:
+ llm_router, llm_model_list, server_settings = load_router_config(router=llm_router)
#### API ENDPOINTS ####
-@router.post("/v1/models")
+@router.get("/v1/models")
@router.get("/models") # if project requires model list
def model_list():
all_models = litellm.utils.get_valid_models()
+ if llm_model_list:
+ all_models += llm_model_list
return dict(
data=[
{
@@ -72,7 +85,7 @@ async def embedding(request: Request):
# default to always using the "ENV" variables, only if AUTH_STRATEGY==DYNAMIC then reads headers
if os.getenv("AUTH_STRATEGY", None) == "DYNAMIC" and "authorization" in request.headers: # if users pass LLM api keys as part of header
api_key = request.headers.get("authorization")
- api_key = api_key.replace("Bearer", "").strip()
+ api_key = api_key.replace("Bearer", "").strip()
if len(api_key.strip()) > 0:
api_key = api_key
data["api_key"] = api_key
@@ -87,27 +100,53 @@ async def embedding(request: Request):
@router.post("/v1/chat/completions")
@router.post("/chat/completions")
-async def chat_completion(request: Request):
+@router.post("/openai/deployments/{model:path}/chat/completions") # azure compatible endpoint
+async def chat_completion(request: Request, model: Optional[str] = None):
+ global llm_model_list, server_settings
try:
data = await request.json()
+ print(f"data: {data}")
+        data["model"] = (
+            (server_settings or {}).get("completion_model", None) # server-wide default, if set
+            or model # model passed in the url path
+            or data["model"] # model passed in the request body
+        )
+ ## CHECK KEYS ##
# default to always using the "ENV" variables, only if AUTH_STRATEGY==DYNAMIC then reads headers
- if os.getenv("AUTH_STRATEGY", None) == "DYNAMIC" and "authorization" in request.headers: # if users pass LLM api keys as part of header
- api_key = request.headers.get("authorization")
- api_key = api_key.replace("Bearer", "").strip()
- if len(api_key.strip()) > 0:
- api_key = api_key
- data["api_key"] = api_key
+ # env_validation = litellm.validate_environment(model=data["model"])
+ # if (env_validation['keys_in_environment'] is False or os.getenv("AUTH_STRATEGY", None) == "DYNAMIC") and ("authorization" in request.headers or "api-key" in request.headers): # if users pass LLM api keys as part of header
+ # if "authorization" in request.headers:
+ # api_key = request.headers.get("authorization")
+ # elif "api-key" in request.headers:
+ # api_key = request.headers.get("api-key")
+ # print(f"api_key in headers: {api_key}")
+ # if " " in api_key:
+ # api_key = api_key.split(" ")[1]
+ # print(f"api_key split: {api_key}")
+ # if len(api_key) > 0:
+ # api_key = api_key
+ # data["api_key"] = api_key
+ # print(f"api_key in data: {api_key}")
+ ## CHECK CONFIG ##
+ if llm_model_list and data["model"] in [m["model_name"] for m in llm_model_list]:
+ for m in llm_model_list:
+ if data["model"] == m["model_name"]:
+ for key, value in m["litellm_params"].items():
+ data[key] = value
+ break
response = litellm.completion(
**data
)
if 'stream' in data and data['stream'] == True: # use generate_responses to stream responses
return StreamingResponse(data_generator(response), media_type='text/event-stream')
+ print(f"response: {response}")
return response
except Exception as e:
error_traceback = traceback.format_exc()
+ print(f"{error_traceback}")
error_msg = f"{str(e)}\n\n{error_traceback}"
- return {"error": error_msg}
- # raise HTTPException(status_code=500, detail=error_msg)
+ # return {"error": error_msg}
+ raise HTTPException(status_code=500, detail=error_msg)
@router.post("/router/completions")
async def router_completion(request: Request):
@@ -157,4 +196,4 @@ async def home(request: Request):
return "LiteLLM: RUNNING"
-app.include_router(router)
+app.include_router(router)
\ No newline at end of file
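To make the new request flow in `chat_completion` easier to follow, here is a self-contained sketch of the same model-resolution and config-merge logic. The `resolve_request` helper and the sample values (the `mistral-7b` alias, the Ollama `api_base`) are illustrative only, not part of the server.

```python
# Sketch of the precedence implemented above: server default > URL model > request body,
# then litellm_params from the matching config entry are copied onto the request.
from typing import Optional


def resolve_request(data: dict, url_model: Optional[str],
                    server_settings: Optional[dict],
                    llm_model_list: Optional[list]) -> dict:
    data["model"] = (
        (server_settings or {}).get("completion_model")  # server-wide default
        or url_model                                      # /openai/deployments/{model}/...
        or data["model"]                                  # model in the request body
    )
    for m in llm_model_list or []:
        if m["model_name"] == data["model"]:
            data.update(m["litellm_params"])  # api_base, api_key, real model name, ...
            break
    return data


# Example: an Azure-style call routed through a config-defined "mistral-7b" alias
payload = resolve_request(
    data={"model": "gpt-3.5-turbo", "messages": [{"role": "user", "content": "hi"}]},
    url_model="mistral-7b",
    server_settings=None,
    llm_model_list=[{"model_name": "mistral-7b",
                     "litellm_params": {"model": "ollama/mistral",
                                        "api_base": "http://localhost:11434"}}],
)
print(payload["model"])  # -> ollama/mistral
```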
diff --git a/litellm_server/utils.py b/litellm_server/utils.py
index 5cb1bd06a..ffaa64c91 100644
--- a/litellm_server/utils.py
+++ b/litellm_server/utils.py
@@ -1,5 +1,12 @@
import os, litellm
import pkg_resources
+import dotenv
+dotenv.load_dotenv() # load env variables
+
+def print_verbose(print_statement):
+    print(f"SET_VERBOSE value: {os.getenv('SET_VERBOSE')}") # getenv avoids a KeyError when unset
+    if os.getenv("SET_VERBOSE") == "True":
+ print(print_statement)
def get_package_version(package_name):
try:
@@ -36,26 +43,37 @@ def set_callbacks():
## CACHING
### REDIS
- if len(os.getenv("REDIS_HOST", "")) > 0 and len(os.getenv("REDIS_PORT", "")) > 0 and len(os.getenv("REDIS_PASSWORD", "")) > 0:
- from litellm.caching import Cache
- litellm.cache = Cache(type="redis", host=os.getenv("REDIS_HOST"), port=os.getenv("REDIS_PORT"), password=os.getenv("REDIS_PASSWORD"))
- print("\033[92mLiteLLM: Switched on Redis caching\033[0m")
+ # if len(os.getenv("REDIS_HOST", "")) > 0 and len(os.getenv("REDIS_PORT", "")) > 0 and len(os.getenv("REDIS_PASSWORD", "")) > 0:
+ # print(f"redis host: {os.getenv('REDIS_HOST')}; redis port: {os.getenv('REDIS_PORT')}; password: {os.getenv('REDIS_PASSWORD')}")
+ # from litellm.caching import Cache
+ # litellm.cache = Cache(type="redis", host=os.getenv("REDIS_HOST"), port=os.getenv("REDIS_PORT"), password=os.getenv("REDIS_PASSWORD"))
+ # print("\033[92mLiteLLM: Switched on Redis caching\033[0m")
-def load_router_config(router: Optional[litellm.Router]):
+def load_router_config(router: Optional[litellm.Router], config_file_path: Optional[str]='/app/config.yaml'):
config = {}
- config_file = '/app/config.yaml'
-
+ server_settings = {}
try:
- if os.path.exists(config_file):
- with open(config_file, 'r') as file:
+ if os.path.exists(config_file_path):
+ with open(config_file_path, 'r') as file:
config = yaml.safe_load(file)
else:
pass
except:
pass
+ ## SERVER SETTINGS (e.g. default completion model = 'ollama/mistral')
+    server_settings = config.get("server_settings", {}) or {} # always a dict, even when the key is absent
+
+ ## LITELLM MODULE SETTINGS (e.g. litellm.drop_params=True,..)
+ litellm_settings = config.get('litellm_settings', None)
+ if litellm_settings:
+ for key, value in litellm_settings.items():
+ setattr(litellm, key, value)
+
## MODEL LIST
model_list = config.get('model_list', None)
if model_list:
@@ -67,4 +85,4 @@ def load_router_config(router: Optional[litellm.Router]):
for key, value in environment_variables.items():
os.environ[key] = value
- return router
+ return router, model_list, server_settings
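For reference, a hedged sketch of the config shape that the updated `load_router_config` consumes. The inline YAML mirrors the keys handled above (`server_settings`, `litellm_settings`, `model_list`, `environment_variables`); the values are placeholders, not a recommended configuration.

```python
# Illustrative only: parse an inline config the same way load_router_config does.
import yaml

RAW_CONFIG = """
server_settings:
  completion_model: ollama/mistral   # server-wide default model
litellm_settings:
  drop_params: True                  # applied via setattr(litellm, key, value)
model_list:
  - model_name: mistral-7b
    litellm_params:
      model: ollama/mistral
      api_base: your_ollama_api_base
environment_variables:
  REDIS_HOST: your_redis_host        # exported into os.environ
"""

config = yaml.safe_load(RAW_CONFIG)
server_settings = config.get("server_settings", {}) or {}
model_list = config.get("model_list") or []

print(server_settings.get("completion_model"))  # ollama/mistral
print([m["model_name"] for m in model_list])    # ['mistral-7b']
```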
diff --git a/model_prices_and_context_window.json b/model_prices_and_context_window.json
index 84fc0b890..fe66b9f92 100644
--- a/model_prices_and_context_window.json
+++ b/model_prices_and_context_window.json
@@ -233,6 +233,48 @@
"litellm_provider": "vertex_ai-code-chat-models",
"mode": "chat"
},
+ "palm/chat-bison": {
+ "max_tokens": 4096,
+ "input_cost_per_token": 0.000000125,
+ "output_cost_per_token": 0.000000125,
+ "litellm_provider": "palm",
+ "mode": "chat"
+ },
+ "palm/chat-bison-001": {
+ "max_tokens": 4096,
+ "input_cost_per_token": 0.000000125,
+ "output_cost_per_token": 0.000000125,
+ "litellm_provider": "palm",
+ "mode": "chat"
+ },
+ "palm/text-bison": {
+ "max_tokens": 8196,
+ "input_cost_per_token": 0.000000125,
+ "output_cost_per_token": 0.000000125,
+ "litellm_provider": "palm",
+ "mode": "completion"
+ },
+ "palm/text-bison-001": {
+ "max_tokens": 8196,
+ "input_cost_per_token": 0.000000125,
+ "output_cost_per_token": 0.000000125,
+ "litellm_provider": "palm",
+ "mode": "completion"
+ },
+ "palm/text-bison-safety-off": {
+ "max_tokens": 8196,
+ "input_cost_per_token": 0.000000125,
+ "output_cost_per_token": 0.000000125,
+ "litellm_provider": "palm",
+ "mode": "completion"
+ },
+ "palm/text-bison-safety-recitation-off": {
+ "max_tokens": 8196,
+ "input_cost_per_token": 0.000000125,
+ "output_cost_per_token": 0.000000125,
+ "litellm_provider": "palm",
+ "mode": "completion"
+ },
"command-nightly": {
"max_tokens": 4096,
"input_cost_per_token": 0.000015,
@@ -544,12 +586,12 @@
"output_cost_per_token": 0.0000004
},
"together-ai-20.1b-40b": {
- "input_cost_per_token": 0.000001,
- "output_cost_per_token": 0.000001
+ "input_cost_per_token": 0.0000008,
+ "output_cost_per_token": 0.0000008
},
"together-ai-40.1b-70b": {
- "input_cost_per_token": 0.000003,
- "output_cost_per_token": 0.000003
+ "input_cost_per_token": 0.000001,
+ "output_cost_per_token": 0.000001
},
"ollama/llama2": {
"max_tokens": 4096,
@@ -579,6 +621,34 @@
"litellm_provider": "ollama",
"mode": "completion"
},
+ "ollama/mistral": {
+ "max_tokens": 8192,
+ "input_cost_per_token": 0.0,
+ "output_cost_per_token": 0.0,
+ "litellm_provider": "ollama",
+ "mode": "completion"
+ },
+ "ollama/codellama": {
+ "max_tokens": 4096,
+ "input_cost_per_token": 0.0,
+ "output_cost_per_token": 0.0,
+ "litellm_provider": "ollama",
+ "mode": "completion"
+ },
+ "ollama/orca-mini": {
+ "max_tokens": 4096,
+ "input_cost_per_token": 0.0,
+ "output_cost_per_token": 0.0,
+ "litellm_provider": "ollama",
+ "mode": "completion"
+ },
+ "ollama/vicuna": {
+ "max_tokens": 2048,
+ "input_cost_per_token": 0.0,
+ "output_cost_per_token": 0.0,
+ "litellm_provider": "ollama",
+ "mode": "completion"
+ },
"deepinfra/meta-llama/Llama-2-70b-chat-hf": {
"max_tokens": 4096,
"input_cost_per_token": 0.000000700,
diff --git a/pyproject.toml b/pyproject.toml
index 3f53f9496..0c10050a0 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
[tool.poetry]
name = "litellm"
-version = "0.12.5"
+version = "0.13.1"
description = "Library to easily interface with LLM API providers"
authors = ["BerriAI"]
license = "MIT License"
@@ -26,7 +26,7 @@ requires = ["poetry-core"]
build-backend = "poetry.core.masonry.api"
[tool.commitizen]
-version = "0.12.5"
+version = "0.13.1"
version_files = [
"pyproject.toml:^version"
]
diff --git a/router_config_template.yaml b/router_config_template.yaml
index e548f9829..b6a8612a4 100644
--- a/router_config_template.yaml
+++ b/router_config_template.yaml
@@ -1,26 +1,28 @@
+# Global settings for the litellm module
+litellm_settings:
+ drop_params: True
+ # failure_callbacks: ["sentry"]
+
+# Model-specific settings
model_list: # refer to https://docs.litellm.ai/docs/routing
- model_name: gpt-3.5-turbo
- litellm_params:
+ litellm_params: # parameters for litellm.completion()
      model: azure/chatgpt-v-2 # azure/<your-deployment-name>
api_key: your_azure_api_key
api_version: your_azure_api_version
api_base: your_azure_api_base
- tpm: 240000 # REPLACE with your azure deployment tpm
- rpm: 1800 # REPLACE with your azure deployment rpm
- - model_name: gpt-3.5-turbo
+ tpm: 240000 # [OPTIONAL] To load balance between multiple deployments
+ rpm: 1800 # [OPTIONAL] To load balance between multiple deployments
+ - model_name: mistral
litellm_params:
- model: azure/chatgpt-functioncalling
- api_key: your_azure_api_key
- api_version: your_azure_api_version
- api_base: your_azure_api_base
- tpm: 240000
- rpm: 1800
+ model: ollama/mistral
+ api_base: my_ollama_api_base
- model_name: gpt-3.5-turbo
litellm_params:
model: gpt-3.5-turbo
api_key: your_openai_api_key
- tpm: 1000000 # REPLACE with your openai tpm
- rpm: 9000 # REPLACE with your openai rpm
+ tpm: 1000000 # [OPTIONAL] REPLACE with your openai tpm
+ rpm: 9000 # [OPTIONAL] REPLACE with your openai rpm
environment_variables:
REDIS_HOST: your_redis_host
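Finally, a sketch of how the `model_list` in this template feeds LiteLLM's Router, assuming a `router_config.yaml` created from the template and the `Router` interface described at https://docs.litellm.ai/docs/routing (linked above); the call below is illustrative, not the server's own startup code.

```python
# Hedged sketch: load the template-derived config and route a request through litellm.Router.
import yaml
import litellm

with open("router_config.yaml") as f:
    config = yaml.safe_load(f)

# One alias (e.g. gpt-3.5-turbo) can map to several deployments; the router load-balances them.
router = litellm.Router(model_list=config["model_list"])
response = router.completion(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "Hello from the router"}],
)
print(response)
```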