forked from phoenix/litellm-mirror
add latest version of proxy
This commit is contained in:
parent 5d0f9fd749
commit 2ccd5848b0
6 changed files with 278 additions and 86 deletions
22  cookbook/proxy-server/Dockerfile  Normal file
@@ -0,0 +1,22 @@

# Use a recent version of Python as the base image
FROM python:3.8-slim-buster

# Set the working directory to /app
WORKDIR /app

# Copy the requirements.txt file to the image
COPY requirements.txt .

# Install the required Python packages using pip
RUN pip install --no-cache-dir -r requirements.txt

# Copy the rest of the application files to the image
COPY . .

# Expose port 5000 for the Flask app to listen on
EXPOSE 5000

# Run the main.py file when the container is started
CMD ["python", "main.py"]
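For local testing, a typical (illustrative) build-and-run sequence for this image would be `docker build -t litellm-proxy .` followed by `docker run -p 5000:5000 litellm-proxy`; the `litellm-proxy` tag is just an example name.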
21  cookbook/proxy-server/LICENSE  Normal file
@@ -0,0 +1,21 @@

MIT License

Copyright (c) 2023 Berri AI

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
@@ -9,7 +9,7 @@ dotenv.load_dotenv()
 
 ######### LOGGING ###################
 # log your data to slack, supabase
-litellm.success_callback=["slack", "supabase"] # .env SLACK_API_TOKEN, SLACK_API_SECRET, SLACK_API_CHANNEL, SUPABASE
+litellm.success_callback=["slack", "supabase"] # set .env SLACK_API_TOKEN, SLACK_API_SECRET, SLACK_API_CHANNEL, SUPABASE
 
 ######### ERROR MONITORING ##########
 # log errors to slack, sentry, supabase
@@ -27,15 +27,14 @@ def api_completion():
     data = request.json
     try:
         # pass in data to completion function, unpack data
         response = completion(**data)
     except Exception as e:
-        traceback.print_exc()
-        response = {"error": str(e)}
+        # call handle_error function
+        return handle_error(data)
     return response, 200
 
 @app.route('/get_models', methods=["POST"])
 def get_models():
-    data = request.json
     try:
         return litellm.model_list
     except Exception as e:
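A minimal sketch of calling this endpoint (assuming the server is running locally on port 5000, as configured below):

```python
import requests

# No request body is needed; the route only accepts POST.
models = requests.post("http://localhost:5000/get_models").json()
print(models)  # the entries from litellm.model_list
```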
@@ -47,6 +46,120 @@ if __name__ == "__main__":
    from waitress import serve
    serve(app, host="0.0.0.0", port=5000, threads=500)

############### Advanced ##########################

################ ERROR HANDLING #####################
# implement model fallbacks, cooldowns, and retries
# if a model fails, assume it was rate limited and let it cool down for 60s
def handle_error(data):
    import time
    # retry the completion() request with fallback models
    response = None
    start_time = time.time()
    rate_limited_models = set()
    model_expiration_times = {}
    fallback_strategy = ['gpt-3.5-turbo', 'command-nightly', 'claude-2']
    while response is None and time.time() - start_time < 45:  # retry for 45s
        for model in fallback_strategy:
            try:
                if model in rate_limited_models:  # check if model is currently cooling down
                    if model_expiration_times.get(model) and time.time() >= model_expiration_times[model]:
                        rate_limited_models.remove(model)  # cooldown window has passed, put the model back in rotation
                    else:
                        continue  # skip model
                print(f"calling model {model}")
                response = completion(**data)
                if response is not None:
                    return response
            except Exception as e:
                rate_limited_models.add(model)
                model_expiration_times[model] = time.time() + 60  # cool down this selected model
    return response


########### Pricing is tracked in Supabase ############


############ Caching ###################################
# make a new endpoint with caching
# this cache is built using ChromaDB
# it has two functions: add_cache() and get_cache()
@app.route('/chat/completions', methods=["POST"])
def api_completion_with_cache():
    data = request.json
    try:
        cache_response = get_cache(data['messages'])
        if cache_response is not None:
            return cache_response
        # pass in data to completion function, unpack data
        response = completion(**data)

        # add the response to the cache
        add_cache(data['messages'], response)
    except Exception as e:
        # call handle_error function
        return handle_error(data)
    return response, 200

import uuid
cache_collection = None

# Add a response to the cache
def add_cache(messages, model_response):
    global cache_collection
    if cache_collection is None:
        make_collection()

    user_question = message_to_user_question(messages)

    # Add the user question and model response to the cache
    cache_collection.add(
        documents=[user_question],
        metadatas=[{"model_response": str(model_response)}],
        ids=[str(uuid.uuid4())]
    )
    return

# Retrieve a response from the cache if similarity is above the threshold
def get_cache(messages, similarity_threshold=0.7):  # default threshold so the lookup above works; tune as needed
    try:
        global cache_collection
        if cache_collection is None:
            make_collection()

        user_question = message_to_user_question(messages)

        # Query the cache for the user question
        results = cache_collection.query(
            query_texts=[user_question],
            n_results=1
        )

        if len(results['distances'][0]) == 0:
            return None  # Cache is empty

        distance = results['distances'][0][0]
        sim = (1 - distance)

        if sim >= similarity_threshold:
            return results['metadatas'][0][0]["model_response"]  # Return cached response
        else:
            return None  # No cache hit
    except Exception as e:
        print("Error in get cache", e)
        raise e

# Initialize the cache collection
def make_collection():
    import chromadb
    global cache_collection
    client = chromadb.Client()
    cache_collection = client.create_collection("llm_responses")

# HELPER: Extract user's question from messages
def message_to_user_question(messages):
    user_question = ""
    for message in messages:
        if message['role'] == 'user':
            user_question += message["content"]
    return user_question
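For reference, a minimal sketch of exercising these cache helpers directly (it assumes `chromadb` is installed and that the proxy file is `main.py`, as in the Dockerfile):

```python
from main import add_cache, get_cache  # assumption: the proxy module is main.py and importable

messages = [{"role": "user", "content": "Hello, whats the weather in San Francisco??"}]

# Nothing cached yet, so the first lookup returns None.
print(get_cache(messages))

# Store a (question, response) pair; an identical question is then served from ChromaDB.
add_cache(messages, {"choices": [{"message": {"content": "I can't check live weather."}}]})
print(get_cache(messages))  # stringified cached response once similarity >= threshold
```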
18  cookbook/proxy-server/models_info.json  Normal file
@@ -0,0 +1,18 @@

{
    "gpt-3.5-turbo": {"max_tokens": 4000, "input_cost_per_token": 0.0000015, "output_cost_per_token": 0.000002},
    "gpt-35-turbo": {"max_tokens": 4000, "input_cost_per_token": 0.0000015, "output_cost_per_token": 0.000002},
    "gpt-3.5-turbo-0613": {"max_tokens": 4000, "input_cost_per_token": 0.0000015, "output_cost_per_token": 0.000002},
    "gpt-3.5-turbo-0301": {"max_tokens": 4000, "input_cost_per_token": 0.0000015, "output_cost_per_token": 0.000002},
    "gpt-3.5-turbo-16k": {"max_tokens": 16000, "input_cost_per_token": 0.000003, "output_cost_per_token": 0.000004},
    "gpt-35-turbo-16k": {"max_tokens": 16000, "input_cost_per_token": 0.000003, "output_cost_per_token": 0.000004},
    "gpt-3.5-turbo-16k-0613": {"max_tokens": 16000, "input_cost_per_token": 0.000003, "output_cost_per_token": 0.000004},
    "gpt-4": {"max_tokens": 8000, "input_cost_per_token": 0.000003, "output_cost_per_token": 0.00006},
    "gpt-4-0613": {"max_tokens": 8000, "input_cost_per_token": 0.000003, "output_cost_per_token": 0.00006},
    "gpt-4-32k": {"max_tokens": 8000, "input_cost_per_token": 0.00006, "output_cost_per_token": 0.00012},
    "claude-instant-1": {"max_tokens": 100000, "input_cost_per_token": 0.00000163, "output_cost_per_token": 0.00000551},
    "claude-2": {"max_tokens": 100000, "input_cost_per_token": 0.00001102, "output_cost_per_token": 0.00003268},
    "text-bison-001": {"max_tokens": 8192, "input_cost_per_token": 0.000004, "output_cost_per_token": 0.000004},
    "chat-bison-001": {"max_tokens": 4096, "input_cost_per_token": 0.000002, "output_cost_per_token": 0.000002},
    "command-nightly": {"max_tokens": 4096, "input_cost_per_token": 0.000015, "output_cost_per_token": 0.000015}
}
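To make the per-token pricing fields concrete, here is a rough sketch of how spend could be computed from this file and the `usage` block the proxy returns (the helper name is illustrative; run from the repository root so the path resolves):

```python
import json

# Illustrative helper: compute spend for one response using models_info.json.
def compute_spend(model, prompt_tokens, completion_tokens,
                  path="cookbook/proxy-server/models_info.json"):
    with open(path) as f:
        models_info = json.load(f)
    info = models_info[model]
    return (prompt_tokens * info["input_cost_per_token"]
            + completion_tokens * info["output_cost_per_token"])

# Example: the sample response later in this diff reports 16 prompt and 41 completion tokens.
print(compute_spend("gpt-3.5-turbo", prompt_tokens=16, completion_tokens=41))
# -> 0.000106 (16 * 0.0000015 + 41 * 0.000002)
```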
@@ -1,106 +1,124 @@

# liteLLM Proxy Server: 50+ LLM Models, Error Handling, Caching

### Azure, Llama2, OpenAI, Claude, Hugging Face, Replicate Models

[](https://pypi.org/project/litellm/)
[](https://pypi.org/project/litellm/0.1.1/)

[](https://github.com/BerriAI/litellm)

[](https://railway.app/template/_YF4Qj?referralCode=t3ukrU)

This repository contains a proxy server that interacts with OpenAI's Chat API and other similar APIs to facilitate chat-based language models. The server allows you to easily integrate chat completion capabilities into your applications. The server is built using Python and the Flask framework.

# What does liteLLM proxy do

- Make `/chat/completions` requests for 50+ LLM models: **Azure, OpenAI, Replicate, Anthropic, Hugging Face**

  Example: for `model`, use `claude-2`, `gpt-3.5`, `gpt-4`, `command-nightly`, `stabilityai/stablecode-completion-alpha-3b-4k`

  ```json
  {
    "model": "replicate/llama-2-70b-chat:2c1608e18606fad2812020dc541930f2d0495ce32eee50074220b87300bc16e1",
    "messages": [
      {
        "content": "Hello, whats the weather in San Francisco??",
        "role": "user"
      }
    ]
  }
  ```

- **Consistent Input/Output Format**
  - Call all models using the OpenAI format - `completion(model, messages)`
  - Text responses will always be available at `['choices'][0]['message']['content']` (see the snippet below)
- **Error Handling** using model fallbacks (if `GPT-4` fails, try `llama2`)
- **Logging** - Log requests, responses and errors to `Supabase`, `Posthog`, `Mixpanel`, `Sentry`, `Helicone` (any of the supported providers here: https://litellm.readthedocs.io/en/latest/advanced/)

  Example: Logs sent to Supabase
  <img width="1015" alt="Screenshot 2023-08-11 at 4 02 46 PM" src="https://github.com/ishaan-jaff/proxy-server/assets/29436595/237557b8-ba09-4917-982c-8f3e1b2c8d08">

- **Token Usage & Spend** - Track input + completion tokens used, plus spend per model
- **Caching** - Implementation of semantic caching
- **Streaming & Async Support** - Return generators to stream text responses
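To make the consistent output format concrete, here is a minimal sketch (assuming the proxy is running locally on port 5000 and `requests` is installed):

```python
import requests

# Whatever model is used, the reply text is always at ['choices'][0]['message']['content'].
resp = requests.post(
    "http://localhost:5000/chat/completions",
    json={"model": "claude-2", "messages": [{"role": "user", "content": "Hello!"}]},
).json()
print(resp["choices"][0]["message"]["content"])
```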
## Installation

To set up and run the proxy server locally, follow these steps:

1. Clone this repository to your local machine.

2. Install the required dependencies using pip:

   `pip install -r requirements.txt`

3. Configure the server settings, such as API keys and model endpoints, in the configuration file (`config.py`).

4. Run the server:

   `python app.py`
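Note: the concrete keys depend on which providers and callbacks you enable. litellm typically reads provider credentials from the environment or a `.env` file (for example `OPENAI_API_KEY` or `ANTHROPIC_API_KEY`), and the logging/monitoring callbacks in `main.py` expect the `SLACK_API_TOKEN`, `SLACK_API_SECRET`, `SLACK_API_CHANNEL`, and `SUPABASE` values referenced there.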
## API Endpoints

### `/chat/completions` (POST)

This endpoint is used to generate chat completions for 50+ supported LLM API models. Use llama2, GPT-4, Claude2, etc.

#### Input

This API endpoint accepts all inputs in raw JSON and expects the following inputs:

- `model` (string, required): ID of the model to use for chat completions. See all supported models [here](https://litellm.readthedocs.io/en/latest/supported/), e.g. `gpt-3.5-turbo`, `gpt-4`, `claude-2`, `command-nightly`, `stabilityai/stablecode-completion-alpha-3b-4k`
- `messages` (array, required): A list of messages representing the conversation context. Each message should have a `role` (system, user, assistant, or function), `content` (message text), and `name` (for the function role).
- Additional optional parameters: `temperature`, `functions`, `function_call`, `top_p`, `n`, `stream`. See the full list of supported inputs here: https://litellm.readthedocs.io/en/latest/input/

#### Example JSON body

For `claude-2`:

```json
{
  "model": "claude-2",
  "messages": [
    {
      "content": "Hello, whats the weather in San Francisco??",
      "role": "user"
    }
  ]
}
```
### Making an API request to the Proxy Server

```python
import requests
import json

# TODO: use your URL
url = "http://localhost:5000/chat/completions"

payload = json.dumps({
  "model": "gpt-3.5-turbo",
  "messages": [
    {
      "content": "Hello, whats the weather in San Francisco??",
      "role": "user"
    }
  ]
})
headers = {
  'Content-Type': 'application/json'
}
response = requests.request("POST", url, headers=headers, data=payload)
print(response.text)
```
### Output [Response Format]

All responses from the server are returned in the following format (for all LLM models). More info on output here: https://litellm.readthedocs.io/en/latest/output/

```json
{
  "choices": [
    {
      "finish_reason": "stop",
      "index": 0,
      "message": {
        "content": "I'm sorry, but I don't have the capability to provide real-time weather information. However, you can easily check the weather in San Francisco by searching online or using a weather app on your phone.",
        "role": "assistant"
      }
    }
  ],
  "created": 1691790381,
  "id": "chatcmpl-7mUFZlOEgdohHRDx2UpYPRTejirzb",
  "model": "gpt-3.5-turbo-0613",
  "object": "chat.completion",
  "usage": {
    "completion_tokens": 41,
    "prompt_tokens": 16,
    "total_tokens": 57
  }
}
```
## Input Parameters

- `model`: ID of the language model to use.
- `messages`: An array of messages representing the conversation context.
  - `role`: The role of the message author (system, user, assistant, or function).
  - `content`: The content of the message.
  - `name`: The name of the author (required for the function role).
- `function_call`: The name and arguments of a function to call.
- `functions`: A list of functions the model may generate JSON inputs for (see the sketch after this list).
- Various other parameters for controlling completion behavior.
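As a sketch of how `functions` could be passed through the proxy (assuming the upstream model supports function calling; the weather function schema below is invented for the example):

```python
import requests

# Illustrative only: the get_current_weather schema is made up for this example.
payload = {
    "model": "gpt-3.5-turbo-0613",
    "messages": [{"role": "user", "content": "What's the weather in San Francisco?"}],
    "functions": [
        {
            "name": "get_current_weather",
            "description": "Get the current weather for a city",
            "parameters": {
                "type": "object",
                "properties": {"city": {"type": "string"}},
                "required": ["city"],
            },
        }
    ],
}

response = requests.post("http://localhost:5000/chat/completions", json=payload)
print(response.json())
```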
## Supported Models

The proxy server supports the following models:

- OpenAI Chat Completion Models: `gpt-4`, `gpt-4-0613`, `gpt-4-32k`, ...
- OpenAI Text Completion Models: `text-davinci-003`
- Cohere Models: `command-nightly`, `command`, ...
- Anthropic Models: `claude-2`, `claude-instant-1`, ...
- Replicate Models: `replicate/`
- OpenRouter Models: `google/palm-2-codechat-bison`, `google/palm-2-chat-bison`, ...
- Vertex Models: `chat-bison`, `chat-bison@001`

Refer to the model endpoint compatibility table for more details.