forked from phoenix/litellm-mirror
moving proxy server to the top of repo
This commit is contained in:
parent 8543d89418
commit 8ef47524bf
10 changed files with 0 additions and 0 deletions
BIN
cookbook/proxy-server/.DS_Store
vendored
Binary file not shown.
@@ -1,22 +0,0 @@
# Use a recent version of Python as the base image
FROM python:3.8-slim-buster

# Set the working directory to /app
WORKDIR /app

# Copy the requirements.txt file to the image
COPY requirements.txt .

# Install the required packages

# Install the required Python packages using pip
RUN pip install --no-cache-dir -r requirements.txt

# Copy the rest of the application files to the image
COPY . .

# Expose port 5000 for the Flask app to listen on
EXPOSE 5000

# Run the main.py file when the container is started
CMD ["python", "main.py"]
@@ -1,21 +0,0 @@
MIT License

Copyright (c) 2023 Berri AI

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
@@ -1,86 +0,0 @@
from flask import Flask, request, jsonify, abort, Response
from flask_cors import CORS
import traceback
import litellm

from litellm import completion
import openai
from utils import handle_error, get_cache, add_cache
import os, dotenv
import logging
import json
dotenv.load_dotenv()

# TODO: set your keys in .env or here:
# os.environ["OPENAI_API_KEY"] = "" # set your openai key here
# see supported models / keys here: https://litellm.readthedocs.io/en/latest/supported/

######### LOGGING ###################
# log your data to slack, supabase
litellm.success_callback=["slack", "supabase"] # set .env SLACK_API_TOKEN, SLACK_API_SECRET, SLACK_API_CHANNEL, SUPABASE

######### ERROR MONITORING ##########
# log errors to slack, sentry, supabase
litellm.failure_callback=["slack", "sentry", "supabase"] # .env SENTRY_API_URL

app = Flask(__name__)
CORS(app)

@app.route('/')
def index():
    return 'received!', 200

def data_generator(response):
    for chunk in response:
        yield f"data: {json.dumps(chunk)}\n\n"

@app.route('/chat/completions', methods=["POST"])
def api_completion():
    data = request.json
    if data.get('stream') == "True":
        data['stream'] = True # convert to boolean
    try:
        # pass in data to completion function, unpack data
        response = completion(**data)
        if 'stream' in data and data['stream'] == True: # use generate_responses to stream responses
            return Response(data_generator(response), mimetype='text/event-stream')
    except Exception as e:
        # call handle_error function
        print(f"got error{e}")
        return handle_error(data)
    return response, 200 # non streaming responses

@app.route('/get_models', methods=["POST"])
def get_models():
    try:
        return litellm.model_list
    except Exception as e:
        traceback.print_exc()
        response = {"error": str(e)}
    return response, 200

if __name__ == "__main__":
    from waitress import serve
    serve(app, host="0.0.0.0", port=5000, threads=500)

############### Advanced ##########################

############ Caching ###################################
# make a new endpoint with caching
# This Cache is built using ChromaDB
# it has two functions add_cache() and get_cache()
@app.route('/chat/completions_with_cache', methods=["POST"])
def api_completion_with_cache():
    data = request.json
    try:
        cache_response = get_cache(data['messages'])
        if cache_response!=None:
            return cache_response
        # pass in data to completion function, unpack data
        response = completion(**data)

        # add to cache
    except Exception as e:
        # call handle_error function
        return handle_error(data)
    return response, 200
@@ -1,18 +0,0 @@

{
    "gpt-3.5-turbo": {"max_tokens": 4000, "input_cost_per_token": 0.0000015, "output_cost_per_token": 0.000002},
    "gpt-35-turbo": {"max_tokens": 4000, "input_cost_per_token": 0.0000015, "output_cost_per_token": 0.000002},
    "gpt-3.5-turbo-0613": {"max_tokens": 4000, "input_cost_per_token": 0.0000015, "output_cost_per_token": 0.000002},
    "gpt-3.5-turbo-0301": {"max_tokens": 4000, "input_cost_per_token": 0.0000015, "output_cost_per_token": 0.000002},
    "gpt-3.5-turbo-16k": {"max_tokens": 16000, "input_cost_per_token": 0.000003, "output_cost_per_token": 0.000004},
    "gpt-35-turbo-16k": {"max_tokens": 16000, "input_cost_per_token": 0.000003, "output_cost_per_token": 0.000004},
    "gpt-3.5-turbo-16k-0613": {"max_tokens": 16000, "input_cost_per_token": 0.000003, "output_cost_per_token": 0.000004},
    "gpt-4": {"max_tokens": 8000, "input_cost_per_token": 0.000003, "output_cost_per_token": 0.00006},
    "gpt-4-0613": {"max_tokens": 8000, "input_cost_per_token": 0.000003, "output_cost_per_token": 0.00006},
    "gpt-4-32k": {"max_tokens": 8000, "input_cost_per_token": 0.00006, "output_cost_per_token": 0.00012},
    "claude-instant-1": {"max_tokens": 100000, "input_cost_per_token": 0.00000163, "output_cost_per_token": 0.00000551},
    "claude-2": {"max_tokens": 100000, "input_cost_per_token": 0.00001102, "output_cost_per_token": 0.00003268},
    "text-bison-001": {"max_tokens": 8192, "input_cost_per_token": 0.000004, "output_cost_per_token": 0.000004},
    "chat-bison-001": {"max_tokens": 4096, "input_cost_per_token": 0.000002, "output_cost_per_token": 0.000002},
    "command-nightly": {"max_tokens": 4096, "input_cost_per_token": 0.000015, "output_cost_per_token": 0.000015}
}
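For reference, these per-token rates turn into a spend estimate by multiplying them with a response's `usage` counts. A minimal sketch (the trimmed-down price dict and the helper name are illustrative, not part of this file):

```python
# Minimal sketch: estimate request cost from the price map above and a
# /chat/completions response's "usage" block. The subset of prices and the
# sample token counts are copied from this commit's files; the helper itself
# is illustrative only.
model_prices = {
    "gpt-3.5-turbo": {
        "max_tokens": 4000,
        "input_cost_per_token": 0.0000015,
        "output_cost_per_token": 0.000002,
    },
}

def estimate_cost(model: str, prompt_tokens: int, completion_tokens: int) -> float:
    """Multiply token counts by the model's per-token rates."""
    rates = model_prices[model]
    return (prompt_tokens * rates["input_cost_per_token"]
            + completion_tokens * rates["output_cost_per_token"])

# Example with the usage numbers from the README's sample response (16 / 41 tokens)
print(estimate_cost("gpt-3.5-turbo", prompt_tokens=16, completion_tokens=41))
```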
@@ -1,168 +0,0 @@

# liteLLM Proxy Server: 50+ LLM Models, Error Handling, Caching
### Azure, Llama2, OpenAI, Claude, Hugging Face, Replicate Models
[](https://pypi.org/project/litellm/)
[](https://pypi.org/project/litellm/0.1.1/)

[](https://github.com/BerriAI/litellm)

[](https://railway.app/template/DYqQAW?referralCode=t3ukrU)


## What does liteLLM proxy do
- Make `/chat/completions` requests for 50+ LLM models: **Azure, OpenAI, Replicate, Anthropic, Hugging Face**

  Example: for `model` use `claude-2`, `gpt-3.5`, `gpt-4`, `command-nightly`, `stabilityai/stablecode-completion-alpha-3b-4k`
  ```json
  {
    "model": "replicate/llama-2-70b-chat:2c1608e18606fad2812020dc541930f2d0495ce32eee50074220b87300bc16e1",
    "messages": [
      {
        "content": "Hello, whats the weather in San Francisco??",
        "role": "user"
      }
    ]
  }
  ```
- **Consistent Input/Output Format**
  - Call all models using the OpenAI format - `completion(model, messages)`
  - Text responses will always be available at `['choices'][0]['message']['content']`
- **Error Handling** using model fallbacks (if `GPT-4` fails, try `llama2`)
- **Logging** - log requests, responses and errors to `Supabase`, `Posthog`, `Mixpanel`, `Sentry`, `Helicone` (any of the supported providers here: https://litellm.readthedocs.io/en/latest/advanced/)

  **Example: Logs sent to Supabase**
  <img width="1015" alt="Screenshot 2023-08-11 at 4 02 46 PM" src="https://github.com/ishaan-jaff/proxy-server/assets/29436595/237557b8-ba09-4917-982c-8f3e1b2c8d08">

- **Token Usage & Spend** - track input + completion tokens used and spend per model
- **Caching** - implementation of semantic caching
- **Streaming & Async Support** - return generators to stream text responses


## API Endpoints

### `/chat/completions` (POST)

This endpoint is used to generate chat completions for 50+ supported LLM API models. Use llama2, GPT-4, Claude2, etc.

#### Input
This API endpoint accepts all inputs in raw JSON and expects the following inputs:
- `model` (string, required): ID of the model to use for chat completions. See all supported models [here](https://litellm.readthedocs.io/en/latest/supported/):
  eg `gpt-3.5-turbo`, `gpt-4`, `claude-2`, `command-nightly`, `stabilityai/stablecode-completion-alpha-3b-4k`
- `messages` (array, required): A list of messages representing the conversation context. Each message should have a `role` (system, user, assistant, or function), `content` (message text), and `name` (for the function role).
- Additional optional parameters: `temperature`, `functions`, `function_call`, `top_p`, `n`, `stream`. See the full list of supported inputs here: https://litellm.readthedocs.io/en/latest/input/


#### Example JSON body
For claude-2:
```json
{
  "model": "claude-2",
  "messages": [
    {
      "content": "Hello, whats the weather in San Francisco??",
      "role": "user"
    }
  ]
}
```

### Making an API request to the Proxy Server
```python
import requests
import json

# TODO: use your URL
url = "http://localhost:5000/chat/completions"

payload = json.dumps({
  "model": "gpt-3.5-turbo",
  "messages": [
    {
      "content": "Hello, whats the weather in San Francisco??",
      "role": "user"
    }
  ]
})
headers = {
  'Content-Type': 'application/json'
}
response = requests.request("POST", url, headers=headers, data=payload)
print(response.text)
```

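When `stream` is set, the proxy responds with a `text/event-stream` whose chunks are emitted by `data_generator` in main.py as `data: {...}` lines. A minimal client sketch for consuming that stream (assuming the same localhost URL as above):

```python
import json
import requests

# Minimal sketch: stream chunks from the proxy. main.py's data_generator emits
# server-sent-event style lines of the form "data: {...}".
url = "http://localhost:5000/chat/completions"
payload = {
    "model": "gpt-3.5-turbo",
    "messages": [{"content": "Hello, whats the weather in San Francisco??", "role": "user"}],
    "stream": True
}

with requests.post(url, json=payload, stream=True) as response:
    for line in response.iter_lines():
        if not line:
            continue  # skip the blank lines that separate events
        decoded = line.decode("utf-8")
        if decoded.startswith("data: "):
            chunk = json.loads(decoded[len("data: "):])
            print(chunk)
```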
### Output [Response Format]
Responses from the server are returned in the following format for all LLM models. More info on output here: https://litellm.readthedocs.io/en/latest/output/
```json
{
  "choices": [
    {
      "finish_reason": "stop",
      "index": 0,
      "message": {
        "content": "I'm sorry, but I don't have the capability to provide real-time weather information. However, you can easily check the weather in San Francisco by searching online or using a weather app on your phone.",
        "role": "assistant"
      }
    }
  ],
  "created": 1691790381,
  "id": "chatcmpl-7mUFZlOEgdohHRDx2UpYPRTejirzb",
  "model": "gpt-3.5-turbo-0613",
  "object": "chat.completion",
  "usage": {
    "completion_tokens": 41,
    "prompt_tokens": 16,
    "total_tokens": 57
  }
}
```

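Because the output format is the same for every model, the assistant text and token usage can be read with plain dictionary access. A small sketch, reusing `response` from the request example above:

```python
# Small sketch: parse the proxy response shown above and pull out the fields
# most callers need - the assistant text and the token usage block.
data = response.json()  # `response` from the requests example above

assistant_text = data['choices'][0]['message']['content']
usage = data['usage']  # {"completion_tokens": ..., "prompt_tokens": ..., "total_tokens": ...}

print(assistant_text)
print(f"prompt tokens: {usage['prompt_tokens']}, completion tokens: {usage['completion_tokens']}")
```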
## Installation & Usage
### Running Locally
1. Clone the liteLLM repository to your local machine:
   ```
   git clone https://github.com/BerriAI/liteLLM-proxy
   ```
2. Install the required dependencies using pip:
   ```
   pip install -r requirements.txt
   ```
3. Set your LLM API keys:
   ```
   os.environ['OPENAI_API_KEY'] = "YOUR_API_KEY"
   or
   set OPENAI_API_KEY in your .env file
   ```
4. Run the server:
   ```
   python main.py
   ```


## Deploying
1. Quick Start: Deploy on Railway

   [](https://railway.app/template/DYqQAW?referralCode=t3ukrU)

2. `GCP`, `AWS`, `Azure`
   This project includes a `Dockerfile`, allowing you to build and deploy a Docker image on your provider of choice.

# Support / Talk with founders
- [Our calendar 👋](https://calendly.com/d/4mp-gd3-k5k/berriai-1-1-onboarding-litellm-hosted-version)
- [Community Discord 💭](https://discord.gg/wuPM9dRgDw)
- Our numbers 📞 +1 (770) 8783-106 / +1 (412) 618-6238
- Our emails ✉️ ishaan@berri.ai / krrish@berri.ai


## Roadmap
- [ ] Support hosted db (e.g. Supabase)
- [ ] Easily send data to places like Posthog and Sentry
- [ ] Add a hot-cache for project spend logs - enables fast checks for user + project limits
- [ ] Implement user-based rate-limiting
- [ ] Spending controls per project - expose key creation endpoint
- [ ] Need to store a keys db -> mapping created keys to their alias (i.e. project name)
- [ ] Easily add new models as backups / as the entry-point (add this to the available model list)
@@ -1,4 +0,0 @@
flask
flask_cors
waitress
litellm==0.1.381
@@ -1,3 +0,0 @@

# tests to call the proxy app.py using gpt-3.5-turbo, gpt-4, and claude-instant-1
@@ -1,21 +0,0 @@
# import openai
# import os

# os.environ["OPENAI_API_KEY"] = ""

# openai.api_key = os.environ["OPENAI_API_KEY"]
# openai.api_base ="http://localhost:5000"

# messages = [
#     {
#         "role": "user",
#         "content": "write a 1 pg essay in liteLLM"
#     }
# ]

# response = openai.ChatCompletion.create(model="gpt-3.5-turbo", messages=messages, stream=True)
# print("got response", response)
# # response is a generator

# for chunk in response:
#     print(chunk)
@@ -1,107 +0,0 @@

from litellm import completion
import os, dotenv
import json
dotenv.load_dotenv()
############### Advanced ##########################

########### streaming ############################
def generate_responses(response):
    for chunk in response:
        yield json.dumps({"response": chunk}) + "\n"

################ ERROR HANDLING #####################
# implement model fallbacks, cooldowns, and retries
# if a model fails assume it was rate limited and let it cooldown for 60s
def handle_error(data):
    import time
    # retry completion() request with fallback models
    response = None
    start_time = time.time()
    rate_limited_models = set()
    model_expiration_times = {}
    fallback_strategy=['gpt-3.5-turbo', 'command-nightly', 'claude-2']
    while response == None and time.time() - start_time < 45: # retry for 45s
        for model in fallback_strategy:
            try:
                if model in rate_limited_models: # check if model is currently cooling down
                    if model_expiration_times.get(model) and time.time() >= model_expiration_times[model]:
                        rate_limited_models.remove(model) # check if it's been 60s of cool down and remove model
                    else:
                        continue # skip model
                print(f"calling model {model}")
                response = completion(**data)
                if response != None:
                    return response
            except Exception as e:
                rate_limited_models.add(model)
                model_expiration_times[model] = time.time() + 60 # cool down this selected model
                pass
    return response


########### Pricing is tracked in Supabase ############



import uuid
cache_collection = None
# Add a response to the cache
def add_cache(messages, model_response):
    global cache_collection
    if cache_collection is None:
        make_collection()

    user_question = message_to_user_question(messages)

    # Add the user question and model response to the cache
    cache_collection.add(
        documents=[user_question],
        metadatas=[{"model_response": str(model_response)}],
        ids=[str(uuid.uuid4())]
    )
    return

# Retrieve a response from the cache if similarity is above the threshold
def get_cache(messages, similarity_threshold):
    try:
        global cache_collection
        if cache_collection is None:
            make_collection()

        user_question = message_to_user_question(messages)

        # Query the cache for the user question
        results = cache_collection.query(
            query_texts=[user_question],
            n_results=1
        )

        if len(results['distances'][0]) == 0:
            return None # Cache is empty

        distance = results['distances'][0][0]
        sim = (1 - distance)

        if sim >= similarity_threshold:
            return results['metadatas'][0][0]["model_response"] # Return cached response
        else:
            return None # No cache hit
    except Exception as e:
        print("Error in get cache", e)
        raise e

# Initialize the cache collection
def make_collection():
    import chromadb
    global cache_collection
    client = chromadb.Client()
    cache_collection = client.create_collection("llm_responses")

# HELPER: Extract user's question from messages
def message_to_user_question(messages):
    user_question = ""
    for message in messages:
        if message['role'] == 'user':
            user_question += message["content"]
    return user_question
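For context on how these helpers fit together, a minimal sketch of a cached completion call in the style of main.py's `/chat/completions_with_cache` endpoint is shown below; the similarity threshold value is an assumption, not something this file defines.

```python
# Minimal sketch (illustrative): wire get_cache()/add_cache() around a
# completion call, roughly mirroring the cached endpoint in main.py.
from litellm import completion
from utils import add_cache, get_cache

SIMILARITY_THRESHOLD = 0.8  # assumed value; treat >= 0.8 similarity as a cache hit

def cached_completion(data):
    # return a cached response when a semantically similar question was seen before
    cache_response = get_cache(data['messages'], SIMILARITY_THRESHOLD)
    if cache_response is not None:
        return cache_response
    response = completion(**data)
    # store the new response so the next similar question hits the cache
    add_cache(data['messages'], response)
    return response
```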