add latest version of proxy

ishaan-jaff 2023-08-11 16:45:45 -07:00
parent 5d0f9fd749
commit 2ccd5848b0
6 changed files with 278 additions and 86 deletions


@@ -0,0 +1,22 @@
# Use a recent version of Python as the base image
FROM python:3.8-slim-buster
# Set the working directory to /app
WORKDIR /app
# Copy the requirements.txt file to the image
COPY requirements.txt .
# Install the required Python packages using pip
RUN pip install --no-cache-dir -r requirements.txt
# Copy the rest of the application files to the image
COPY . .
# Expose port 5000 for the Flask app to listen on
EXPOSE 5000
# Run the main.py file when the container is started
CMD ["python", "main.py"]
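# Typical usage of this image (the "litellm-proxy" tag name is just an example):
#   docker build -t litellm-proxy .
#   docker run -p 5000:5000 litellm-proxy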


@@ -0,0 +1,21 @@
MIT License
Copyright (c) 2023 Berri AI
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.


@@ -9,7 +9,7 @@ dotenv.load_dotenv()
######### LOGGING ###################
# log your data to slack, supabase
litellm.success_callback=["slack", "supabase"] # set .env SLACK_API_TOKEN, SLACK_API_SECRET, SLACK_API_CHANNEL, SUPABASE
######### ERROR MONITORING ##########
# log errors to slack, sentry, supabase
@@ -27,15 +27,14 @@ def api_completion():
    data = request.json
    try:
        # pass in data to completion function, unpack data
        response = completion(**data)
    except Exception as e:
        traceback.print_exc()
        response = {"error": str(e)}
        # call handle_error function
        return handle_error(data)
    return response, 200

@app.route('/get_models', methods=["POST"])
def get_models():
    data = request.json
    try:
        return litellm.model_list
    except Exception as e:
@@ -47,6 +46,120 @@ if __name__ == "__main__":
    from waitress import serve
    serve(app, host="0.0.0.0", port=5000, threads=500)
############### Advanced ##########################
################ ERROR HANDLING #####################
# implement model fallbacks, cooldowns, and retries
# if a model fails assume it was rate limited and let it cool down for 60s
def handle_error(data):
    import time
    # retry completion() request with fallback models
    response = None
    start_time = time.time()
    rate_limited_models = set()
    model_expiration_times = {}
    fallback_strategy=['gpt-3.5-turbo', 'command-nightly', 'claude-2']
    while response == None and time.time() - start_time < 45: # retry for 45s
        for model in fallback_strategy:
            try:
                if model in rate_limited_models: # check if model is currently cooling down
                    if model_expiration_times.get(model) and time.time() >= model_expiration_times[model]:
                        rate_limited_models.remove(model) # cool down is over, make the model available again
                    else:
                        continue # skip model
                print(f"calling model {model}")
                data["model"] = model # use the current fallback model for this attempt
                response = completion(**data)
                if response != None:
                    return response
            except Exception as e:
                rate_limited_models.add(model)
                model_expiration_times[model] = time.time() + 60 # cool down this selected model for 60s
                pass
    return response
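# Example (illustrative): handle_error() takes the same JSON payload that /chat/completions
# receives and walks fallback_strategy until a model responds or 45s have passed, e.g.:
# fallback_response = handle_error({"model": "gpt-3.5-turbo", "messages": [{"role": "user", "content": "Hi"}]})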
########### Pricing is tracked in Supabase ############

############ Caching ###################################
# make a new endpoint with caching
# This Cache is built using ChromaDB
# it has two functions add_cache() and get_cache()
@app.route('/chat/completions', methods=["POST"])
def api_completion_with_cache():
    data = request.json
    try:
        # the similarity_threshold value here is illustrative; get_cache() requires one
        cache_response = get_cache(data['messages'], similarity_threshold=0.9)
        if cache_response!=None:
            return cache_response
        # pass in data to completion function, unpack data
        response = completion(**data)
        # add the new response to the cache
        add_cache(data['messages'], response)
    except Exception as e:
        # call handle_error function
        return handle_error(data)
    return response, 200
import uuid
cache_collection = None

# Add a response to the cache
def add_cache(messages, model_response):
    global cache_collection
    if cache_collection is None:
        make_collection()
    user_question = message_to_user_question(messages)
    # Add the user question and model response to the cache
    cache_collection.add(
        documents=[user_question],
        metadatas=[{"model_response": str(model_response)}],
        ids=[str(uuid.uuid4())]
    )
    return

# Retrieve a response from the cache if similarity is above the threshold
def get_cache(messages, similarity_threshold):
    try:
        global cache_collection
        if cache_collection is None:
            make_collection()
        user_question = message_to_user_question(messages)
        # Query the cache for the user question
        results = cache_collection.query(
            query_texts=[user_question],
            n_results=1
        )
        if len(results['distances'][0]) == 0:
            return None # Cache is empty
        distance = results['distances'][0][0]
        sim = (1 - distance)
        if sim >= similarity_threshold:
            return results['metadatas'][0][0]["model_response"] # Return cached response
        else:
            return None # No cache hit
    except Exception as e:
        print("Error in get cache", e)
        raise e

# Initialize the cache collection
def make_collection():
    import chromadb
    global cache_collection
    client = chromadb.Client()
    cache_collection = client.create_collection("llm_responses")

# HELPER: Extract user's question from messages
def message_to_user_question(messages):
    user_question = ""
    for message in messages:
        if message['role'] == 'user':
            user_question += message["content"]
    return user_question
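# Example (illustrative): the semantic-cache round trip used by api_completion_with_cache():
# messages = [{"role": "user", "content": "What's the weather in San Francisco?"}]
# add_cache(messages, model_response)                     # store a completion for this question
# cached = get_cache(messages, similarity_threshold=0.9)  # returns the stored response string when similar enough (threshold value is an example)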


@@ -0,0 +1,18 @@
{
"gpt-3.5-turbo": {"max_tokens": 4000, "input_cost_per_token": 0.0000015, "output_cost_per_token": 0.000002},
"gpt-35-turbo": {"max_tokens": 4000, "input_cost_per_token": 0.0000015, "output_cost_per_token": 0.000002},
"gpt-3.5-turbo-0613": {"max_tokens": 4000, "input_cost_per_token": 0.0000015, "output_cost_per_token": 0.000002},
"gpt-3.5-turbo-0301": {"max_tokens": 4000, "input_cost_per_token": 0.0000015, "output_cost_per_token": 0.000002},
"gpt-3.5-turbo-16k": {"max_tokens": 16000, "input_cost_per_token": 0.000003, "output_cost_per_token": 0.000004},
"gpt-35-turbo-16k": {"max_tokens": 16000, "input_cost_per_token": 0.000003, "output_cost_per_token": 0.000004},
"gpt-3.5-turbo-16k-0613": {"max_tokens": 16000, "input_cost_per_token": 0.000003, "output_cost_per_token": 0.000004},
"gpt-4": {"max_tokens": 8000, "input_cost_per_token": 0.000003, "output_cost_per_token": 0.00006},
"gpt-4-0613": {"max_tokens": 8000, "input_cost_per_token": 0.000003, "output_cost_per_token": 0.00006},
"gpt-4-32k": {"max_tokens": 8000, "input_cost_per_token": 0.00006, "output_cost_per_token": 0.00012},
"claude-instant-1": {"max_tokens": 100000, "input_cost_per_token": 0.00000163, "output_cost_per_token": 0.00000551},
"claude-2": {"max_tokens": 100000, "input_cost_per_token": 0.00001102, "output_cost_per_token": 0.00003268},
"text-bison-001": {"max_tokens": 8192, "input_cost_per_token": 0.000004, "output_cost_per_token": 0.000004},
"chat-bison-001": {"max_tokens": 4096, "input_cost_per_token": 0.000002, "output_cost_per_token": 0.000002},
"command-nightly": {"max_tokens": 4096, "input_cost_per_token": 0.000015, "output_cost_per_token": 0.000015}
}
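These per-token rates can be combined with the `usage` block that the proxy returns to estimate spend per request. A minimal sketch (the `model_prices.json` filename is assumed for illustration; the token counts come from the sample response in the README below):

```python
import json

# Load the pricing table shown above (filename assumed for this sketch)
with open("model_prices.json") as f:
    prices = json.load(f)

def estimate_cost(model: str, prompt_tokens: int, completion_tokens: int) -> float:
    """Multiply token counts by the model's per-token rates."""
    rates = prices[model]
    return (prompt_tokens * rates["input_cost_per_token"]
            + completion_tokens * rates["output_cost_per_token"])

# e.g. 16 prompt + 41 completion tokens on gpt-3.5-turbo, as in the sample response
print(estimate_cost("gpt-3.5-turbo", 16, 41))  # ~0.000106
```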


@@ -1,106 +1,124 @@
# liteLLM Proxy Server: 50+ LLM Models, Error Handling, Caching
### Azure, Llama2, OpenAI, Claude, Hugging Face, Replicate Models
[![PyPI Version](https://img.shields.io/pypi/v/litellm.svg)](https://pypi.org/project/litellm/)
[![PyPI Version](https://img.shields.io/badge/stable%20version-v0.1.345-blue?color=green&link=https://pypi.org/project/litellm/0.1.1/)](https://pypi.org/project/litellm/0.1.1/)
![Downloads](https://img.shields.io/pypi/dm/litellm)
[![litellm](https://img.shields.io/badge/%20%F0%9F%9A%85%20liteLLM-OpenAI%7CAzure%7CAnthropic%7CPalm%7CCohere%7CReplicate%7CHugging%20Face-blue?color=green)](https://github.com/BerriAI/litellm)
This repository contains a proxy server that interacts with OpenAI's Chat API and other similar APIs to facilitate chat-based language models. The server allows you to easily integrate chat completion capabilities into your applications. The server is built using Python and the Flask framework.
[![Deploy on Railway](https://railway.app/button.svg)](https://railway.app/template/_YF4Qj?referralCode=t3ukrU)
# What does liteLLM proxy do
- Make `/chat/completions` requests for 50+ LLM models **Azure, OpenAI, Replicate, Anthropic, Hugging Face**
Example: for `model` use `claude-2`, `gpt-3.5`, `gpt-4`, `command-nightly`, `stabilityai/stablecode-completion-alpha-3b-4k`
```json
{
  "model": "replicate/llama-2-70b-chat:2c1608e18606fad2812020dc541930f2d0495ce32eee50074220b87300bc16e1",
  "messages": [
    {
      "content": "Hello, whats the weather in San Francisco??",
      "role": "user"
    }
  ]
}
```
- **Consistent Input/Output** Format
  - Call all models using the OpenAI format - `completion(model, messages)`
  - Text responses will always be available at `['choices'][0]['message']['content']`
- **Error Handling** Using Model Fallbacks (if `GPT-4` fails, try `llama2`)
- **Logging** - Log Requests, Responses and Errors to `Supabase`, `Posthog`, `Mixpanel`, `Sentry`, `Helicone` (any of the supported providers here: https://litellm.readthedocs.io/en/latest/advanced/)
- **Token Usage & Spend** - Track Input + Completion tokens used + Spend/model
- **Caching** - Implementation of Semantic Caching
- **Streaming & Async Support** - Return generators to stream text responses
Example: Logs sent to Supabase
<img width="1015" alt="Screenshot 2023-08-11 at 4 02 46 PM" src="https://github.com/ishaan-jaff/proxy-server/assets/29436595/237557b8-ba09-4917-982c-8f3e1b2c8d08">
## Installation
To set up and run the proxy server locally, follow these steps:
1. Clone this repository to your local machine:
2. Install the required dependencies using pip:
`pip install -r requirements.txt`
3. Configure the server settings, such as API keys and model endpoints, in the configuration file (`config.py`).
4. Run the server:
`python app.py`
## API Endpoints
### `/chat/completions` (POST)
This endpoint is used to generate chat completions for 50+ supported LLM API models. Use llama2, GPT-4, Claude2, etc.
#### Input
This API endpoint accepts all inputs in raw JSON and expects the following inputs
- `model` (string, required): ID of the model to use for chat completions. See all supported models [here](https://litellm.readthedocs.io/en/latest/supported/):
  eg: `gpt-3.5-turbo`, `gpt-4`, `claude-2`, `command-nightly`, `stabilityai/stablecode-completion-alpha-3b-4k`
- `messages` (array, required): A list of messages representing the conversation context. Each message should have a `role` (system, user, assistant, or function), `content` (message text), and `name` (for the function role).
- Additional optional parameters: `temperature`, `functions`, `function_call`, `top_p`, `n`, `stream`. See the full list of supported inputs here: https://litellm.readthedocs.io/en/latest/input/
#### Example JSON body
For `claude-2`:
```json
{
  "model": "claude-2",
  "messages": [
    {
      "content": "Hello, whats the weather in San Francisco??",
      "role": "user"
    }
  ]
}
```
### Making an API request to the Proxy Server
```python
import requests
import json
# TODO: use your URL
url = "http://localhost:5000/chat/completions"
payload = json.dumps({
  "model": "gpt-3.5-turbo",
  "messages": [
    {
      "content": "Hello, whats the weather in San Francisco??",
      "role": "user"
    }
  ]
})
headers = {
  'Content-Type': 'application/json'
}
response = requests.request("POST", url, headers=headers, data=payload)
print(response.text)
```
### Output [Response Format]
All responses from the server are returned in the following format (for all LLM models). More info on output here: https://litellm.readthedocs.io/en/latest/output/
```json
{
  "choices": [
    {
      "finish_reason": "stop",
      "index": 0,
      "message": {
        "content": "I'm sorry, but I don't have the capability to provide real-time weather information. However, you can easily check the weather in San Francisco by searching online or using a weather app on your phone.",
        "role": "assistant"
      }
    }
  ],
  "created": 1691790381,
  "id": "chatcmpl-7mUFZlOEgdohHRDx2UpYPRTejirzb",
  "model": "gpt-3.5-turbo-0613",
  "object": "chat.completion",
  "usage": {
    "completion_tokens": 41,
    "prompt_tokens": 16,
    "total_tokens": 57
  }
}
```
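Because every model is normalized to this OpenAI-style shape, the completion text can always be read from `['choices'][0]['message']['content']`. Continuing the `requests` example above:
```python
# `response` is the requests.Response returned by the POST in the previous snippet
result = response.json()
print(result["choices"][0]["message"]["content"])  # the model's reply
print(result["usage"]["total_tokens"])             # tokens billed for this request
```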
## Input Parameters
- `model`: ID of the language model to use.
- `messages`: An array of messages representing the conversation context.
  - `role`: The role of the message author (system, user, assistant, or function).
  - `content`: The content of the message.
  - `name`: The name of the author (required for the function role).
- `function_call`: The name and arguments of a function to call.
- `functions`: A list of functions the model may generate JSON inputs for.
- Various other parameters for controlling completion behavior.
## Supported Models
The proxy server supports the following models:

OpenAI Chat Completion Models:
- gpt-4
- gpt-4-0613
- gpt-4-32k
- ...

OpenAI Text Completion Models:
- text-davinci-003

Cohere Models:
- command-nightly
- command
- ...

Anthropic Models:
- claude-2
- claude-instant-1
- ...

Replicate Models:
- replicate/

OpenRouter Models:
- google/palm-2-codechat-bison
- google/palm-2-chat-bison
- ...

Vertex Models:
- chat-bison
- chat-bison@001
Refer to the model endpoint compatibility table for more details.
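To check which models your running proxy actually exposes, you can also call the `/get_models` endpoint defined in `main.py`. A sketch, assuming the server is running locally on port 5000 and that the installed Flask version serializes the returned list to JSON:
```python
import requests

# Ask the proxy which models it serves (endpoint defined in main.py)
models = requests.post("http://localhost:5000/get_models", json={})
print(models.json())
```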