forked from phoenix/litellm-mirror
add latest version of proxy
This commit is contained in:
parent 5d0f9fd749
commit 2ccd5848b0
6 changed files with 278 additions and 86 deletions
22  cookbook/proxy-server/Dockerfile  Normal file
@@ -0,0 +1,22 @@
# Use a recent version of Python as the base image
FROM python:3.8-slim-buster

# Set the working directory to /app
WORKDIR /app

# Copy the requirements.txt file to the image
COPY requirements.txt .

# Install the required packages

# Install the required Python packages using pip
RUN pip install --no-cache-dir -r requirements.txt

# Copy the rest of the application files to the image
COPY . .

# Expose port 5000 for the Flask app to listen on
EXPOSE 5000

# Run the main.py file when the container is started
CMD ["python", "main.py"]
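A quick way to smoke-test the container once it is built and running (the `docker build`/`docker run -p 5000:5000` step and image name are assumptions, not part of this commit); it calls the `/get_models` route defined in the proxy:

```python
import requests

# Assumes the image was built and started with something like:
#   docker build -t litellm-proxy . && docker run -p 5000:5000 litellm-proxy
# /get_models is the POST route added in this commit.
resp = requests.post("http://localhost:5000/get_models")
print(resp.status_code, resp.text)
```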
21  cookbook/proxy-server/LICENSE  Normal file
@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2023 Berri AI

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
@@ -9,7 +9,7 @@ dotenv.load_dotenv()

######### LOGGING ###################
# log your data to slack, supabase
litellm.success_callback=["slack", "supabase"] # .env SLACK_API_TOKEN, SLACK_API_SECRET, SLACK_API_CHANNEL, SUPABASE
litellm.success_callback=["slack", "supabase"] # set .env SLACK_API_TOKEN, SLACK_API_SECRET, SLACK_API_CHANNEL, SUPABASE

######### ERROR MONITORING ##########
# log errors to slack, sentry, supabase
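The callback configuration above relies on the environment variables named in the comment; a minimal sketch of wiring them up before the app starts (values are placeholders, and the exact Supabase variable names depend on your `.env`):

```python
import dotenv
import litellm

# Load the .env referenced above; it should define SLACK_API_TOKEN, SLACK_API_SECRET,
# SLACK_API_CHANNEL and your Supabase credentials (placeholder names - check your setup).
dotenv.load_dotenv()

# Every successful completion is then logged to both providers.
litellm.success_callback = ["slack", "supabase"]
```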
@@ -27,15 +27,14 @@ def api_completion():
    data = request.json
    try:
        # pass in data to completion function, unpack data
        response = completion(**data)
        response = completion(**data)
    except Exception as e:
        traceback.print_exc()
        response = {"error": str(e)}
        # call handle_error function
        return handle_error(data)
    return response, 200

@app.route('/get_models', methods=["POST"])
def get_models():
    data = request.json
    try:
        return litellm.model_list
    except Exception as e:
@@ -47,6 +46,120 @@ if __name__ == "__main__":
    from waitress import serve
    serve(app, host="0.0.0.0", port=5000, threads=500)

############### Advanced ##########################

################ ERROR HANDLING #####################
# implement model fallbacks, cooldowns, and retries
# if a model fails assume it was rate limited and let it cooldown for 60s
def handle_error(data):
    import time
    # retry completion() request with fallback models
    response = None
    start_time = time.time()
    rate_limited_models = set()
    model_expiration_times = {}
    fallback_strategy = ['gpt-3.5-turbo', 'command-nightly', 'claude-2']
    while response == None and time.time() - start_time < 45: # retry for 45s
        for model in fallback_strategy:
            try:
                if model in rate_limited_models: # check if model is currently cooling down
                    if model_expiration_times.get(model) and time.time() >= model_expiration_times[model]:
                        rate_limited_models.remove(model) # check if it's been 60s of cool down and remove model
                    else:
                        continue # skip model
                print(f"calling model {model}")
                response = completion(**data)
                if response != None:
                    return response
            except Exception as e:
                rate_limited_models.add(model)
                model_expiration_times[model] = time.time() + 60 # cool down this selected model
                pass
    return response

########### Pricing is tracked in Supabase ############



############ Caching ###################################
# make a new endpoint with caching
# This Cache is built using ChromaDB
# it has two functions add_cache() and get_cache()
@app.route('/chat/completions', methods=["POST"])
def api_completion_with_cache():
    data = request.json
    try:
        cache_response = get_cache(data['messages'], similarity_threshold=0.5) # get_cache() requires a threshold; 0.5 is an assumed value
        if cache_response != None:
            return cache_response
        # pass in data to completion function, unpack data
        response = completion(**data)

        # add to cache
        add_cache(data['messages'], response) # store the new response so later, similar questions can hit the cache
    except Exception as e:
        # call handle_error function
        return handle_error(data)
    return response, 200

import uuid
cache_collection = None
# Add a response to the cache
def add_cache(messages, model_response):
    global cache_collection
    if cache_collection is None:
        make_collection()

    user_question = message_to_user_question(messages)

    # Add the user question and model response to the cache
    cache_collection.add(
        documents=[user_question],
        metadatas=[{"model_response": str(model_response)}],
        ids=[str(uuid.uuid4())]
    )
    return

# Retrieve a response from the cache if similarity is above the threshold
def get_cache(messages, similarity_threshold):
    try:
        global cache_collection
        if cache_collection is None:
            make_collection()

        user_question = message_to_user_question(messages)

        # Query the cache for the user question
        results = cache_collection.query(
            query_texts=[user_question],
            n_results=1
        )

        if len(results['distances'][0]) == 0:
            return None # Cache is empty

        distance = results['distances'][0][0]
        sim = (1 - distance)

        if sim >= similarity_threshold:
            return results['metadatas'][0][0]["model_response"] # Return cached response
        else:
            return None # No cache hit
    except Exception as e:
        print("Error in get cache", e)
        raise e

# Initialize the cache collection
def make_collection():
    import chromadb
    global cache_collection
    client = chromadb.Client()
    cache_collection = client.create_collection("llm_responses")

# HELPER: Extract user's question from messages
def message_to_user_question(messages):
    user_question = ""
    for message in messages:
        if message['role'] == 'user':
            user_question += message["content"]
    return user_question
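To make the flow of the cache helpers above concrete, here is a minimal round-trip sketch run outside the Flask routes (the sample messages, cached response, and 0.5 threshold are illustrative; chromadb must be installed):

```python
# Illustrative use of add_cache() / get_cache() defined above.
messages = [{"role": "user", "content": "Hello, whats the weather in San Francisco??"}]
add_cache(messages, {"choices": [{"message": {"content": "I can't check live weather."}}]})

# A sufficiently similar question should now return the stored response.
cached = get_cache(
    [{"role": "user", "content": "What's the weather like in San Francisco?"}],
    similarity_threshold=0.5,  # assumed value; the commit does not pin a default
)
print(cached)  # stringified model_response on a hit, None on a miss
```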
18  cookbook/proxy-server/models_info.json  Normal file
@@ -0,0 +1,18 @@

{
    "gpt-3.5-turbo": {"max_tokens": 4000, "input_cost_per_token": 0.0000015, "output_cost_per_token": 0.000002},
    "gpt-35-turbo": {"max_tokens": 4000, "input_cost_per_token": 0.0000015, "output_cost_per_token": 0.000002},
    "gpt-3.5-turbo-0613": {"max_tokens": 4000, "input_cost_per_token": 0.0000015, "output_cost_per_token": 0.000002},
    "gpt-3.5-turbo-0301": {"max_tokens": 4000, "input_cost_per_token": 0.0000015, "output_cost_per_token": 0.000002},
    "gpt-3.5-turbo-16k": {"max_tokens": 16000, "input_cost_per_token": 0.000003, "output_cost_per_token": 0.000004},
    "gpt-35-turbo-16k": {"max_tokens": 16000, "input_cost_per_token": 0.000003, "output_cost_per_token": 0.000004},
    "gpt-3.5-turbo-16k-0613": {"max_tokens": 16000, "input_cost_per_token": 0.000003, "output_cost_per_token": 0.000004},
    "gpt-4": {"max_tokens": 8000, "input_cost_per_token": 0.000003, "output_cost_per_token": 0.00006},
    "gpt-4-0613": {"max_tokens": 8000, "input_cost_per_token": 0.000003, "output_cost_per_token": 0.00006},
    "gpt-4-32k": {"max_tokens": 8000, "input_cost_per_token": 0.00006, "output_cost_per_token": 0.00012},
    "claude-instant-1": {"max_tokens": 100000, "input_cost_per_token": 0.00000163, "output_cost_per_token": 0.00000551},
    "claude-2": {"max_tokens": 100000, "input_cost_per_token": 0.00001102, "output_cost_per_token": 0.00003268},
    "text-bison-001": {"max_tokens": 8192, "input_cost_per_token": 0.000004, "output_cost_per_token": 0.000004},
    "chat-bison-001": {"max_tokens": 4096, "input_cost_per_token": 0.000002, "output_cost_per_token": 0.000002},
    "command-nightly": {"max_tokens": 4096, "input_cost_per_token": 0.000015, "output_cost_per_token": 0.000015}
}
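As a worked example of how these per-token prices translate into request cost, the sketch below prices the sample `gpt-3.5-turbo` response shown later in the README (16 prompt tokens, 41 completion tokens); the file path is relative to the repo root:

```python
import json

# Illustrative cost calculation using the pricing data above.
with open("cookbook/proxy-server/models_info.json") as f:
    models_info = json.load(f)

model = "gpt-3.5-turbo"
prompt_tokens, completion_tokens = 16, 41  # from the sample response's usage block

info = models_info[model]
cost = (prompt_tokens * info["input_cost_per_token"]
        + completion_tokens * info["output_cost_per_token"])
print(f"{model}: ${cost:.6f} for this request")  # ~$0.000106
```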
@@ -1,106 +1,124 @@
<<<<<<< HEAD
# Proxy Server for Azure, Llama2, OpenAI, Claude, Hugging Face, Replicate Models

# liteLLM Proxy Server: 50+ LLM Models, Error Handling, Caching
### Azure, Llama2, OpenAI, Claude, Hugging Face, Replicate Models
[](https://pypi.org/project/litellm/)
[](https://pypi.org/project/litellm/0.1.1/)

[](https://github.com/BerriAI/litellm)
=======
# Proxy Server for Chat API
>>>>>>> d1ff082 (new v litellm for render)

This repository contains a proxy server that interacts with OpenAI's Chat API and other similar APIs to facilitate chat-based language models. The server allows you to easily integrate chat completion capabilities into your applications. The server is built using Python and the Flask framework.
[](https://railway.app/template/_YF4Qj?referralCode=t3ukrU)

<<<<<<< HEAD
# Proxy Server for Chat API
# What does liteLLM proxy do
- Make `/chat/completions` requests for 50+ LLM models **Azure, OpenAI, Replicate, Anthropic, Hugging Face**

  Example: for `model` use `claude-2`, `gpt-3.5`, `gpt-4`, `command-nightly`, `stabilityai/stablecode-completion-alpha-3b-4k`
  ```json
  {
    "model": "replicate/llama-2-70b-chat:2c1608e18606fad2812020dc541930f2d0495ce32eee50074220b87300bc16e1",
    "messages": [
      {
        "content": "Hello, whats the weather in San Francisco??",
        "role": "user"
      }
    ]
  }
  ```
- **Consistent Input/Output** Format
  - Call all models using the OpenAI format - `completion(model, messages)`
  - Text responses will always be available at `['choices'][0]['message']['content']` (see the sketch below)
- **Error Handling** Using Model Fallbacks (if `GPT-4` fails, try `llama2`)
- **Logging** - Log Requests, Responses and Errors to `Supabase`, `Posthog`, `Mixpanel`, `Sentry`, `Helicone` (Any of the supported providers here: https://litellm.readthedocs.io/en/latest/advanced/)

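Because every provider is called through the same OpenAI-style body, switching models is just a change to the `model` field. A minimal sketch against a locally running proxy (the URL and prompt are reused from the request example further down):

```python
import requests

url = "http://localhost:5000/chat/completions"  # TODO: use your URL
messages = [{"role": "user", "content": "Hello, whats the weather in San Francisco??"}]

# Same request shape for every provider - only the model name changes.
for model in ["gpt-3.5-turbo", "claude-2", "command-nightly"]:
    resp = requests.post(url, json={"model": model, "messages": messages})
    print(model, resp.json()["choices"][0]["message"]["content"])
```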
This repository contains a proxy server that interacts with OpenAI's Chat API and other similar APIs to facilitate chat-based language models. The server allows you to easily integrate chat completion capabilities into your applications. The server is built using Python and the Flask framework.
Example: Logs sent to Supabase
<img width="1015" alt="Screenshot 2023-08-11 at 4 02 46 PM" src="https://github.com/ishaan-jaff/proxy-server/assets/29436595/237557b8-ba09-4917-982c-8f3e1b2c8d08">

## Installation

=======
## Installation

>>>>>>> d1ff082 (new v litellm for render)
To set up and run the proxy server locally, follow these steps:

1. Clone this repository to your local machine:


2. Install the required dependencies using pip:

   `pip install -r requirements.txt`

3. Configure the server settings, such as API keys and model endpoints, in the configuration file (`config.py`).

4. Run the server:

   `python app.py`
- **Token Usage & Spend** - Track Input + Completion tokens used + Spend/model
- **Caching** - Implementation of Semantic Caching
- **Streaming & Async Support** - Return generators to stream text responses


## API Endpoints

### `/chat/completions` (POST)

This endpoint is used to generate chat completions. It takes in JSON data with the following parameters:
This endpoint is used to generate chat completions for 50+ supported LLM API models. Use llama2, GPT-4, Claude2 etc.

- `model` (string, required): ID of the model to use for chat completions. Refer to the model endpoint compatibility table for supported models.
#### Input
This API endpoint accepts all inputs in raw JSON and expects the following inputs
- `model` (string, required): ID of the model to use for chat completions. See all supported models [here](https://litellm.readthedocs.io/en/latest/supported/):
  eg `gpt-3.5-turbo`, `gpt-4`, `claude-2`, `command-nightly`, `stabilityai/stablecode-completion-alpha-3b-4k`
- `messages` (array, required): A list of messages representing the conversation context. Each message should have a `role` (system, user, assistant, or function), `content` (message text), and `name` (for function role).
- Additional parameters for controlling completions, such as `temperature`, `top_p`, `n`, etc.
- Additional Optional parameters: `temperature`, `functions`, `function_call`, `top_p`, `n`, `stream`. See the full list of supported inputs here: https://litellm.readthedocs.io/en/latest/input/

Example JSON payload:

#### Example JSON body
For claude-2
```json
{
  "model": "gpt-3.5-turbo",
  "messages": [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Knock knock."},
    {"role": "assistant", "content": "Who's there?"},
    {"role": "user", "content": "Orange."}
  ],
  "temperature": 0.8
  "model": "claude-2",
  "messages": [
    {
      "content": "Hello, whats the weather in San Francisco??",
      "role": "user"
    }
  ]

}
```

### Making an API request to the Proxy Server
```python
import requests
import json

# TODO: use your URL
url = "http://localhost:5000/chat/completions"

payload = json.dumps({
  "model": "gpt-3.5-turbo",
  "messages": [
    {
      "content": "Hello, whats the weather in San Francisco??",
      "role": "user"
    }
  ]
})
headers = {
  'Content-Type': 'application/json'
}
response = requests.request("POST", url, headers=headers, data=payload)
print(response.text)

```

### Output [Response Format]
Responses from the server are given in the following format.
All responses from the server are returned in the following format (for all LLM models). More info on output here: https://litellm.readthedocs.io/en/latest/output/
```json
{
  "choices": [
    {
      "finish_reason": "stop",
      "index": 0,
      "message": {
        "content": "I'm sorry, but I don't have the capability to provide real-time weather information. However, you can easily check the weather in San Francisco by searching online or using a weather app on your phone.",
        "role": "assistant"
      }
    }
  ],
  "created": 1691790381,
  "id": "chatcmpl-7mUFZlOEgdohHRDx2UpYPRTejirzb",
  "model": "gpt-3.5-turbo-0613",
  "object": "chat.completion",
  "usage": {
    "completion_tokens": 41,
    "prompt_tokens": 16,
    "total_tokens": 57
  }
}
```

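Because the shape above is identical for every model, client code can read it uniformly. A short sketch, assuming `response` is the `requests` response from the example request above:

```python
result = response.json()

# Text is always at choices[0].message.content, as documented above.
answer = result["choices"][0]["message"]["content"]
usage = result["usage"]

print(answer)
print(f"prompt_tokens={usage['prompt_tokens']}, "
      f"completion_tokens={usage['completion_tokens']}, total={usage['total_tokens']}")
```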
## Input Parameters
- `model`: ID of the language model to use.
- `messages`: An array of messages representing the conversation context.
  - `role`: The role of the message author (system, user, assistant, or function).
  - `content`: The content of the message.
  - `name`: The name of the author (required for function role).
- `function_call`: The name and arguments of a function to call.
- `functions`: A list of functions the model may generate JSON inputs for.
- Various other parameters for controlling completion behavior.
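For reference, a request body that exercises several of the optional parameters above (the values are illustrative, not recommendations):

```python
payload = {
    "model": "gpt-3.5-turbo",
    "messages": [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "Knock knock."}
    ],
    # Optional parameters from the list above; values are illustrative.
    "temperature": 0.8,
    "top_p": 1,
    "n": 1,
    "stream": False
}
```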
Supported Models
The proxy server supports the following models:

OpenAI Chat Completion Models:
gpt-4
gpt-4-0613
gpt-4-32k
...
OpenAI Text Completion Models:
text-davinci-003
Cohere Models:
command-nightly
command
...
Anthropic Models:
claude-2
claude-instant-1
...
Replicate Models:
replicate/
OpenRouter Models:
google/palm-2-codechat-bison
google/palm-2-chat-bison
...
Vertex Models:
chat-bison
chat-bison@001
<<<<<<< HEAD
Refer to the model endpoint compatibility table for more details.
=======
Refer to the model endpoint compatibility table for more details.
>>>>>>> d1ff082 (new v litellm for render)