refactor(openai_proxy -> litellm_server): renaming project for simplicity

This commit is contained in:
Krrish Dholakia 2023-10-25 14:14:32 -07:00
parent f6be642f2f
commit 16f39ec840
15 changed files with 1 addition and 1 deletion

View file

@@ -0,0 +1,40 @@
# Set AUTH STRATEGY for LLM APIs - defaults to using environment variables
AUTH_STRATEGY="ENV" # "ENV" or "DYNAMIC"; ENV always reads LLM API keys from environment variables, DYNAMIC reads request headers to set them (see the client sketch after this file)
OPENAI_API_KEY=""
HUGGINGFACE_API_KEY=""
TOGETHERAI_API_KEY=""
REPLICATE_API_KEY=""
## bedrock / sagemaker
AWS_ACCESS_KEY_ID=""
AWS_SECRET_ACCESS_KEY=""
## azure
AZURE_API_KEY=""
AZURE_API_BASE=""
AZURE_API_VERSION=""
ANTHROPIC_API_KEY=""
COHERE_API_KEY=""

## LOGGING ##
SET_VERBOSE="False" # set to "True" to see detailed input/output logs

### LANGFUSE
LANGFUSE_PUBLIC_KEY=""
LANGFUSE_SECRET_KEY=""
LANGFUSE_HOST="" # optional, defaults to https://cloud.langfuse.com

## CACHING ##
### REDIS
REDIS_HOST=""
REDIS_PORT=""
REDIS_PASSWORD=""
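For illustration, a minimal client sketch of the DYNAMIC strategy described above, using the pre-1.0 `openai` SDK in the same way as the test scripts later in this commit (the server URL and the Anthropic key are assumptions): with `AUTH_STRATEGY="DYNAMIC"`, `main.py` reads the key from the request's Authorization header and forwards it to the underlying LLM provider.

```python
import openai, os

openai.api_base = "http://0.0.0.0:8000"  # assumed: a locally running litellm_server instance
# With AUTH_STRATEGY="DYNAMIC", main.py strips "Bearer " from the Authorization header
# and passes the remaining value to litellm as api_key.
openai.api_key = os.getenv("ANTHROPIC_API_KEY")  # provider key supplied per request

response = openai.ChatCompletion.create(
    model="claude-instant-1",
    messages=[{"role": "user", "content": "Say this is a test!"}],
)
print(response)
```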

10
litellm_server/Dockerfile Normal file
View file

@@ -0,0 +1,10 @@
FROM python:3.10
ENV LITELLM_CONFIG_PATH="/litellm.secrets.toml"
COPY . /app
WORKDIR /app
RUN pip install -r requirements.txt
EXPOSE $PORT
CMD exec uvicorn main:app --host 0.0.0.0 --port $PORT

63
litellm_server/README.md Normal file
View file

@@ -0,0 +1,63 @@
# litellm-server
A simple, fast, and lightweight **OpenAI-compatible server** to call 100+ LLM APIs.
<p align="center" style="margin: 2%">
<a href="https://l.linklyhq.com/l/1uHsr" target="_blank">
<img src="https://render.com/images/deploy-to-render-button.svg" width="173"/>
</a>
<a href="https://l.linklyhq.com/l/1uHtX" target="_blank">
<img src="https://deploy.cloud.run/button.svg" width="200"/>
</a>
</p>
## Usage
```shell
docker run -e PORT=8000 -p 8000:8000 ghcr.io/berriai/litellm:latest
# UVICORN: OpenAI Proxy running on http://0.0.0.0:8000
```
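Beyond curl, any OpenAI-compatible client can point at the proxy. A minimal Python sketch using the pre-1.0 `openai` SDK, mirroring the test scripts added in this commit (with the default `AUTH_STRATEGY="ENV"`, provider keys come from the server's environment, so the client key is a placeholder):
```python
import openai

openai.api_base = "http://0.0.0.0:8000"  # the proxy started above
openai.api_key = "anything"              # ignored by the proxy under AUTH_STRATEGY="ENV"

response = openai.ChatCompletion.create(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "Say this is a test!"}],
)
print(response)
```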
## Endpoints:
- `/chat/completions` - chat completions endpoint to call 100+ LLMs
- `/router/completions` - for multiple deployments of the same model (e.g. Azure OpenAI); routes to the least-used deployment. [Learn more](https://docs.litellm.ai/docs/routing) - see the Python sketch after this list
- `/models` - available models on server
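For illustration, a hedged sketch of a `/router/completions` request with an inline `model_list` (the payload shape mirrors the router test added in this commit; the `requests` package and the Azure/OpenAI values are assumptions, and any HTTP client works):
```python
import os
import requests  # assumption: not pinned in requirements.txt

payload = {
    "model": "gpt-3.5-turbo",
    "messages": [{"role": "user", "content": "Hey, how's it going?"}],
    "model_list": [  # two deployments of the same model; the router picks the least-used one
        {
            "model_name": "gpt-3.5-turbo",
            "litellm_params": {
                "model": "azure/chatgpt-v-2",
                "api_key": os.getenv("AZURE_API_KEY"),
                "api_version": os.getenv("AZURE_API_VERSION"),
                "api_base": os.getenv("AZURE_API_BASE"),
            },
            "tpm": 240000,
            "rpm": 1800,
        },
        {
            "model_name": "gpt-3.5-turbo",
            "litellm_params": {"model": "gpt-3.5-turbo", "api_key": os.getenv("OPENAI_API_KEY")},
            "tpm": 1000000,
            "rpm": 9000,
        },
    ],
}

response = requests.post("http://0.0.0.0:8000/router/completions", json=payload)
print(response.json())
```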
## Making Requests to Proxy
### Curl
**Call OpenAI**
```shell
curl http://0.0.0.0:8000/v1/chat/completions \
-H "Content-Type: application/json" \
-d '{
"model": "gpt-3.5-turbo",
"messages": [{"role": "user", "content": "Say this is a test!"}],
"temperature": 0.7
}'
```
**Call Bedrock**
```shell
curl http://0.0.0.0:8000/v1/chat/completions \
-H "Content-Type: application/json" \
-d '{
"model": "bedrock/anthropic.claude-instant-v1",
"messages": [{"role": "user", "content": "Say this is a test!"}],
"temperature": 0.7
}'
```
### Running Locally
```shell
$ git clone https://github.com/BerriAI/litellm.git
$ cd ./litellm/litellm_server
$ uvicorn main:app --host 0.0.0.0 --port 8000
```
[**See how to call Huggingface, Bedrock, TogetherAI, Anthropic, etc.**](https://docs.litellm.ai/docs/simple_proxy)

View file

@@ -0,0 +1,2 @@
from .main import *
from .utils import *

0
litellm_server/config Normal file
View file

160
litellm_server/main.py Normal file
View file

@@ -0,0 +1,160 @@
import json
import os
import traceback
from typing import Optional

import dotenv
import litellm
from fastapi import FastAPI, Request, HTTPException
from fastapi.routing import APIRouter
from fastapi.responses import StreamingResponse, FileResponse
from fastapi.middleware.cors import CORSMiddleware

try:
    from utils import set_callbacks, load_router_config
except ImportError:
    from litellm_server.utils import set_callbacks, load_router_config

dotenv.load_dotenv() # load env variables
app = FastAPI(docs_url="/", title="LiteLLM API")
router = APIRouter()
origins = ["*"]
app.add_middleware(
CORSMiddleware,
allow_origins=origins,
allow_credentials=True,
allow_methods=["*"],
allow_headers=["*"],
)
#### GLOBAL VARIABLES ####
llm_router: Optional[litellm.Router] = None
set_callbacks() # sets litellm callbacks for logging if they exist in the environment
llm_router = load_router_config(router=llm_router)
#### API ENDPOINTS ####
@router.post("/v1/models")
@router.get("/models") # if project requires model list
def model_list():
all_models = litellm.utils.get_valid_models()
return dict(
data=[
{
"id": model,
"object": "model",
"created": 1677610602,
"owned_by": "openai",
}
for model in all_models
],
object="list",
)
# for streaming
def data_generator(response):
print("inside generator")
for chunk in response:
print(f"returned chunk: {chunk}")
yield f"data: {json.dumps(chunk)}\n\n"
@router.post("/v1/completions")
@router.post("/completions")
async def completion(request: Request):
data = await request.json()
response = litellm.completion(
**data
)
if 'stream' in data and data['stream'] == True: # use generate_responses to stream responses
return StreamingResponse(data_generator(response), media_type='text/event-stream')
return response
@router.post("/v1/embeddings")
@router.post("/embeddings")
async def embedding(request: Request):
try:
data = await request.json()
        # default to the "ENV" auth strategy; only read headers when AUTH_STRATEGY == "DYNAMIC"
        if os.getenv("AUTH_STRATEGY", None) == "DYNAMIC" and "authorization" in request.headers: # users can pass their LLM API key in the Authorization header
            api_key = request.headers.get("authorization")
            api_key = api_key.replace("Bearer", "").strip()
            if len(api_key) > 0:
                data["api_key"] = api_key
response = litellm.embedding(
**data
)
return response
except Exception as e:
error_traceback = traceback.format_exc()
error_msg = f"{str(e)}\n\n{error_traceback}"
return {"error": error_msg}
@router.post("/v1/chat/completions")
@router.post("/chat/completions")
async def chat_completion(request: Request):
try:
data = await request.json()
        # default to the "ENV" auth strategy; only read headers when AUTH_STRATEGY == "DYNAMIC"
        if os.getenv("AUTH_STRATEGY", None) == "DYNAMIC" and "authorization" in request.headers: # users can pass their LLM API key in the Authorization header
            api_key = request.headers.get("authorization")
            api_key = api_key.replace("Bearer", "").strip()
            if len(api_key) > 0:
                data["api_key"] = api_key
response = litellm.completion(
**data
)
if 'stream' in data and data['stream'] == True: # use generate_responses to stream responses
return StreamingResponse(data_generator(response), media_type='text/event-stream')
return response
except Exception as e:
error_traceback = traceback.format_exc()
error_msg = f"{str(e)}\n\n{error_traceback}"
return {"error": error_msg}
# raise HTTPException(status_code=500, detail=error_msg)
@router.post("/router/completions")
async def router_completion(request: Request):
global llm_router
try:
data = await request.json()
if "model_list" in data:
llm_router = litellm.Router(model_list=data.pop("model_list"))
if llm_router is None:
raise Exception("Save model list via config.yaml. Eg.: ` docker build -t myapp --build-arg CONFIG_FILE=myconfig.yaml .` or pass it in as model_list=[..] as part of the request body")
        # openai.ChatCompletion.create replacement - forward the request body (model, messages, etc.) to the router
        response = await llm_router.acompletion(**data)
if 'stream' in data and data['stream'] == True: # use generate_responses to stream responses
return StreamingResponse(data_generator(response), media_type='text/event-stream')
return response
except Exception as e:
error_traceback = traceback.format_exc()
error_msg = f"{str(e)}\n\n{error_traceback}"
return {"error": error_msg}
@router.post("/router/embedding")
async def router_embedding(request: Request):
global llm_router
try:
data = await request.json()
if "model_list" in data:
llm_router = litellm.Router(model_list=data.pop("model_list"))
if llm_router is None:
raise Exception("Save model list via config.yaml. Eg.: ` docker build -t myapp --build-arg CONFIG_FILE=myconfig.yaml .` or pass it in as model_list=[..] as part of the request body")
        # forward the request body to the router's embedding call
        response = await llm_router.aembedding(**data)
if 'stream' in data and data['stream'] == True: # use generate_responses to stream responses
return StreamingResponse(data_generator(response), media_type='text/event-stream')
return response
except Exception as e:
error_traceback = traceback.format_exc()
error_msg = f"{str(e)}\n\n{error_traceback}"
return {"error": error_msg}
@router.get("/")
async def home(request: Request):
return "LiteLLM: RUNNING"
app.include_router(router)

242
litellm_server/openapi.json Normal file
View file

@@ -0,0 +1,242 @@
{
"openapi": "3.0.0",
"info": {
"version": "1.0.0",
"title": "LiteLLM API",
"description": "API for LiteLLM"
},
"paths": {
"/chat/completions": {
"post": {
"summary": "Create chat completion for 100+ LLM APIs",
"requestBody": {
"description": "Input parameters for chat completions",
"required": true,
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/ChatCompletionsRequest"
},
"example": {
"model": "gpt-3.5-turbo",
"messages": [
{
"role": "system",
"content": "You are a helpful assistant."
},
{
"role": "user",
"content": "this is a test message from litellm proxy, can you ack"
}
],
"frequency_penalty": 0.0,
"max_tokens": 500,
"n": 1,
"presence_penalty": 0.0,
"stop": "###",
"stream": false,
"temperature": 0.7,
"top_p": 0.8,
"user": "test-litellm"
}
}
}
},
"responses": {
"200": {
"description": "Successful operation",
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/ChatCompletionsResponse"
},
"example": {
"object": "chat.completion",
"id": "chatcmpl-92861fad-b36c-41a1-88db-139344819276",
"choices": [
{
"finish_reason": "stop_sequence",
"index": 0,
"message": {
"content": "I'm a large language model trained by OpenAI, ACK receiving this message",
"role": "assistant"
}
}
],
"created": 1698253693.169062,
"model": "gpt-3.5-turbo",
"usage": {
"prompt_tokens": 14,
"completion_tokens": 102,
"total_tokens": 116
}
}
}
}
},
"500": {
"description": "Server error"
}
}
}
},
"/models": {
"get": {
"summary": "Get models",
"responses": {
"200": {
"description": "Successful operation"
}
}
}
},
"/": {
"get": {
"summary": "Swagger docs",
"responses": {
"200": {
"description": "Successful operation"
}
}
}
}
},
"components": {
"schemas": {
"ChatCompletionsRequest": {
"type": "object",
"properties": {
"messages": {
"type": "array",
"items": {
"type": "object",
"properties": {
"role": {
"type": "string"
},
"content": {
"type": "string"
}
},
"required": ["role", "content"]
}
},
"model": {
"type": "string"
},
"frequency_penalty": {
"type": "number"
},
"function_call": {
"type": ["string", "object"]
},
"functions": {
"type": "array"
},
"logit_bias": {
"type": "object"
},
"max_tokens": {
"type": "integer"
},
"n": {
"type": "integer"
},
"presence_penalty": {
"type": "number"
},
"stop": {
"oneOf": [
{
"type": "string"
},
{
"type": "array",
"items": {
"type": "string"
}
}
]
},
"stream": {
"type": "boolean"
},
"temperature": {
"type": "number"
},
"top_p": {
"type": "number"
},
"user": {
"type": "string"
}
},
"required": ["messages", "model"]
},
"ChatCompletionsResponse": {
"type": "object",
"properties": {
"object": {
"type": "string"
},
"choices": {
"type": "array",
"items": {
"type": "object",
"properties": {
"finish_reason": {
"type": "string"
},
"index": {
"type": "integer"
},
"message": {
"type": "object",
"properties": {
"content": {
"type": "string"
},
"role": {
"type": "string"
}
},
"required": ["content", "role"]
                }
              },
              "required": ["finish_reason", "index", "message"]
            }
          },
          "usage": {
            "type": "object",
            "properties": {
              "prompt_tokens": {
                "type": "integer"
              },
              "completion_tokens": {
                "type": "integer"
              },
              "total_tokens": {
                "type": "integer"
              }
            },
            "required": ["prompt_tokens", "completion_tokens", "total_tokens"]
          },
          "id": {
            "type": "string"
          },
          "created": {
            "type": "number"
          },
          "model": {
            "type": "string"
          }
        },
        "required": ["object", "choices", "id", "created", "model", "usage"]
}
}
}
}

View file

@@ -0,0 +1,7 @@
openai
fastapi
uvicorn
boto3
litellm
python-dotenv
redis

View file

@@ -0,0 +1,39 @@
import openai
openai.api_base = "http://0.0.0.0:8000"
print("making request")
openai.api_key = "anything" # this gets passed as a header
response = openai.ChatCompletion.create(
model = "bedrock/anthropic.claude-instant-v1",
messages = [
{
"role": "user",
"content": "this is a test message, what model / llm are you"
}
],
aws_access_key_id="",
aws_secret_access_key="",
aws_region_name="us-west-2",
max_tokens = 10,
)
print(response)
# response = openai.ChatCompletion.create(
# model = "gpt-3.5-turbo",
# messages = [
# {
# "role": "user",
# "content": "this is a test message, what model / llm are you"
# }
# ],
# max_tokens = 10,
# stream=True
# )
# for chunk in response:
# print(chunk)

View file

@@ -0,0 +1,80 @@
import openai, os, dotenv, traceback, time
openai.api_base = "http://0.0.0.0:8000"
dotenv.load_dotenv()
openai.api_key = os.getenv("ANTHROPIC_API_KEY") # this gets passed as a header
response1 = openai.ChatCompletion.create(
model = "claude-instant-1",
messages = [
{
"role": "user",
"content": "write a short poem about litellm"
}
],
)
try:
print(f"response: {response1['choices'][0]['message']['content']}")
except:
print(f"response: {response1}")
time.sleep(1) # allow time for request to be stored
response2 = openai.ChatCompletion.create(
model = "claude-instant-1",
messages = [
{
"role": "user",
"content": "write a short poem about litellm"
}
],
)
try:
print(f"response: {response2['choices'][0]['message']['content']}")
except:
print(f"response: {response2}")
openai.api_key = os.getenv("OPENAI_API_KEY")
try:
response3 = openai.ChatCompletion.create(
model = "gpt-3.5-turbo",
messages = [
{
"role": "user",
"content": "write a short poem about litellm"
}
],
)
except Exception as e:
traceback.print_exc()
try:
print(f"response: {response3['choices'][0]['message']['content']}")
except:
print(f"response: {response3}")
openai.api_key = os.getenv("ANTHROPIC_API_KEY") # this gets passed as a header
# switch caching off using cache flag
response4 = openai.ChatCompletion.create(
model = "claude-instant-1",
messages = [
{
"role": "user",
"content": "write a short poem about litellm"
}
],
caching = False,
)
try:
print(f"response: {response4['choices'][0]['message']['content']}")
except:
print(f"response: {response4}")
assert response1["choices"][0]["message"]["content"] == response2["choices"][0]["message"]["content"]
assert response1["choices"][0]["message"]["content"] != response4["choices"][0]["message"]["content"]
assert response1["choices"][0]["message"]["content"] != response3["choices"][0]["message"]["content"]

View file

@@ -0,0 +1,39 @@
import openai
openai.api_base = "http://127.0.0.1:8000"
openai.api_key = "this can be anything"
print("making request")
api_key = ""
response = openai.ChatCompletion.create(
model = "gpt-3.5-turbo",
messages = [
{
"role": "user",
"content": "this is a test message, what model / llm are you"
}
],
api_key=api_key,
max_tokens = 10,
)
print(response)
response = openai.ChatCompletion.create(
model = "gpt-3.5-turbo",
messages = [
{
"role": "user",
"content": "this is a test message, what model / llm are you"
}
],
api_key=api_key,
max_tokens = 10,
stream=True
)
for chunk in response:
print(chunk)

View file

@@ -0,0 +1,38 @@
import openai
openai.api_base = "http://0.0.0.0:8000"
openai.api_key = "this can be anything"
print("making request")
api_key = ""
response = openai.ChatCompletion.create(
model = "openrouter/google/palm-2-chat-bison",
messages = [
{
"role": "user",
"content": "this is a test message, what model / llm are you"
}
],
api_key=api_key,
max_tokens = 10,
)
print(response)
response = openai.ChatCompletion.create(
model = "openrouter/google/palm-2-chat-bison",
messages = [
{
"role": "user",
"content": "this is a test message, what model / llm are you"
}
],
api_key=api_key,
max_tokens = 10,
stream=True
)
for chunk in response:
print(chunk)

View file

@@ -0,0 +1,59 @@
#### What this tests ####
# This tests calling batch_completions by running 100 messages together
import sys, os
import traceback, asyncio
import pytest
from fastapi.testclient import TestClient
from fastapi import Request
sys.path.insert(
0, os.path.abspath("../..")
) # Adds the parent directory to the system path
from litellm_server import app
def test_router_completion():
client = TestClient(app)
data = {
"model": "gpt-3.5-turbo",
"messages": [{"role": "user", "content": "Hey, how's it going?"}],
"model_list": [{ # list of model deployments
"model_name": "gpt-3.5-turbo", # openai model name
"litellm_params": { # params for litellm completion/embedding call
"model": "azure/chatgpt-v-2",
"api_key": os.getenv("AZURE_API_KEY"),
"api_version": os.getenv("AZURE_API_VERSION"),
"api_base": os.getenv("AZURE_API_BASE")
},
"tpm": 240000,
"rpm": 1800
}, {
"model_name": "gpt-3.5-turbo", # openai model name
"litellm_params": { # params for litellm completion/embedding call
"model": "azure/chatgpt-functioncalling",
"api_key": os.getenv("AZURE_API_KEY"),
"api_version": os.getenv("AZURE_API_VERSION"),
"api_base": os.getenv("AZURE_API_BASE")
},
"tpm": 240000,
"rpm": 1800
}, {
"model_name": "gpt-3.5-turbo", # openai model name
"litellm_params": { # params for litellm completion/embedding call
"model": "gpt-3.5-turbo",
"api_key": os.getenv("OPENAI_API_KEY"),
},
"tpm": 1000000,
"rpm": 9000
}]
}
response = client.post("/router/completions", json=data)
print(f"response: {response.text}")
assert response.status_code == 200
response_data = response.json()
# Perform assertions on the response data
assert isinstance(response_data['choices'][0]['message']['content'], str)
test_router_completion()

70
litellm_server/utils.py Normal file
View file

@@ -0,0 +1,70 @@
import os, litellm
import pkg_resources
import yaml
import dotenv
from typing import Optional

dotenv.load_dotenv() # load env variables

def get_package_version(package_name):
    try:
        package = pkg_resources.get_distribution(package_name)
        return package.version
    except pkg_resources.DistributionNotFound:
        return None

# log the installed litellm version at startup
package_name = "litellm"
version = get_package_version(package_name)
if version:
    print(f"The version of {package_name} is {version}")
else:
    print(f"{package_name} is not installed")

def set_callbacks():
## LOGGING
if len(os.getenv("SET_VERBOSE", "")) > 0:
if os.getenv("SET_VERBOSE") == "True":
litellm.set_verbose = True
print("\033[92mLiteLLM: Switched on verbose logging\033[0m")
else:
litellm.set_verbose = False
### LANGFUSE
    if (len(os.getenv("LANGFUSE_PUBLIC_KEY", "")) > 0 and len(os.getenv("LANGFUSE_SECRET_KEY", "")) > 0) or len(os.getenv("LANGFUSE_HOST", "")) > 0:
litellm.success_callback = ["langfuse"]
print("\033[92mLiteLLM: Switched on Langfuse feature\033[0m")
## CACHING
### REDIS
if len(os.getenv("REDIS_HOST", "")) > 0 and len(os.getenv("REDIS_PORT", "")) > 0 and len(os.getenv("REDIS_PASSWORD", "")) > 0:
from litellm.caching import Cache
litellm.cache = Cache(type="redis", host=os.getenv("REDIS_HOST"), port=os.getenv("REDIS_PORT"), password=os.getenv("REDIS_PASSWORD"))
print("\033[92mLiteLLM: Switched on Redis caching\033[0m")
def load_router_config(router: Optional[litellm.Router]):
    # reads an optional /app/config.yaml with "model_list" and "environment_variables" keys (see the example sketch below)
    config = {}
    config_file = '/app/config.yaml'
    try:
        if os.path.exists(config_file):
            with open(config_file, 'r') as file:
                config = yaml.safe_load(file)
        else:
            print(f"Config file '{config_file}' not found.")
    except Exception as e:
        print(f"Error loading config file '{config_file}': {e}")
## MODEL LIST
model_list = config.get('model_list', None)
if model_list:
router = litellm.Router(model_list=model_list)
## ENVIRONMENT VARIABLES
environment_variables = config.get('environment_variables', None)
if environment_variables:
for key, value in environment_variables.items():
os.environ[key] = value
return router
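For illustration, a hedged sketch of the config shape `load_router_config` consumes from `/app/config.yaml`, written here as Python around `yaml.safe_load` (the two top-level keys come from the function above; all model names, keys, and hosts are placeholders):

```python
import os
import yaml
import litellm

# Hypothetical /app/config.yaml contents, shown inline. The only top-level keys
# load_router_config reads are "model_list" and "environment_variables".
raw_config = """
model_list:
  - model_name: gpt-3.5-turbo            # alias clients send as "model"
    litellm_params:
      model: azure/chatgpt-v-2           # placeholder deployment
      api_key: my-azure-key              # placeholder
      api_base: https://example.openai.azure.com
environment_variables:
  REDIS_HOST: my-redis-host              # placeholder; exported to os.environ
"""

config = yaml.safe_load(raw_config)
for key, value in config.get("environment_variables", {}).items():
    os.environ[key] = value
router = litellm.Router(model_list=config["model_list"])  # same call load_router_config makes
```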