forked from phoenix/litellm-mirror
add latest version of proxy
This commit is contained in:
parent 5d0f9fd749
commit 2ccd5848b0
6 changed files with 278 additions and 86 deletions
22  cookbook/proxy-server/Dockerfile  Normal file
@@ -0,0 +1,22 @@
# Use a recent version of Python as the base image
FROM python:3.8-slim-buster

# Set the working directory to /app
WORKDIR /app

# Copy the requirements.txt file to the image
COPY requirements.txt .

# Install the required packages

# Install the required Python packages using pip
RUN pip install --no-cache-dir -r requirements.txt

# Copy the rest of the application files to the image
COPY . .

# Expose port 5000 for the Flask app to listen on
EXPOSE 5000

# Run the main.py file when the container is started
CMD ["python", "main.py"]
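A quick way to smoke-test the container once it is built and running (the `docker build`/`docker run -p 5000:5000` step and image name are assumptions, not part of this commit); it calls the `/get_models` route defined in the proxy:

```python
import requests

# Assumes the image was built and started with something like:
#   docker build -t litellm-proxy . && docker run -p 5000:5000 litellm-proxy
# /get_models is the POST route added in this commit.
resp = requests.post("http://localhost:5000/get_models")
print(resp.status_code, resp.text)
```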
21  cookbook/proxy-server/LICENSE  Normal file
@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2023 Berri AI

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
@@ -9,7 +9,7 @@ dotenv.load_dotenv()

######### LOGGING ###################
# log your data to slack, supabase
litellm.success_callback=["slack", "supabase"] # .env SLACK_API_TOKEN, SLACK_API_SECRET, SLACK_API_CHANNEL, SUPABASE
litellm.success_callback=["slack", "supabase"] # set .env SLACK_API_TOKEN, SLACK_API_SECRET, SLACK_API_CHANNEL, SUPABASE

######### ERROR MONITORING ##########
# log errors to slack, sentry, supabase
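The callback configuration above relies on the environment variables named in the comment; a minimal sketch of wiring them up before the app starts (values are placeholders, and the exact Supabase variable names depend on your `.env`):

```python
import dotenv
import litellm

# Load the .env referenced above; it should define SLACK_API_TOKEN, SLACK_API_SECRET,
# SLACK_API_CHANNEL and your Supabase credentials (placeholder names - check your setup).
dotenv.load_dotenv()

# Every successful completion is then logged to both providers.
litellm.success_callback = ["slack", "supabase"]
```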
@@ -27,15 +27,14 @@ def api_completion():
    data = request.json
    try:
        # pass in data to completion function, unpack data
        response = completion(**data)
        response = completion(**data)
    except Exception as e:
        traceback.print_exc()
        response = {"error": str(e)}
        # call handle_error function
        return handle_error(data)
    return response, 200

@app.route('/get_models', methods=["POST"])
def get_models():
    data = request.json
    try:
        return litellm.model_list
    except Exception as e:
@@ -47,6 +46,120 @@ if __name__ == "__main__":
    from waitress import serve
    serve(app, host="0.0.0.0", port=5000, threads=500)

############### Advanced ##########################

################ ERROR HANDLING #####################
# implement model fallbacks, cooldowns, and retries
# if a model fails assume it was rate limited and let it cooldown for 60s
def handle_error(data):
    import time
    # retry completion() request with fallback models
    response = None
    start_time = time.time()
    rate_limited_models = set()
    model_expiration_times = {}
    fallback_strategy = ['gpt-3.5-turbo', 'command-nightly', 'claude-2']
    while response == None and time.time() - start_time < 45: # retry for 45s
        for model in fallback_strategy:
            try:
                if model in rate_limited_models: # check if model is currently cooling down
                    if model_expiration_times.get(model) and time.time() >= model_expiration_times[model]:
                        rate_limited_models.remove(model) # check if it's been 60s of cool down and remove model
                    else:
                        continue # skip model
                print(f"calling model {model}")
                response = completion(**data)
                if response != None:
                    return response
            except Exception as e:
                rate_limited_models.add(model)
                model_expiration_times[model] = time.time() + 60 # cool down this selected model
                pass
    return response

########### Pricing is tracked in Supabase ############



############ Caching ###################################
# make a new endpoint with caching
# This Cache is built using ChromaDB
# it has two functions add_cache() and get_cache()
@app.route('/chat/completions', methods=["POST"])
def api_completion_with_cache():
    data = request.json
    try:
        cache_response = get_cache(data['messages'], similarity_threshold=0.5) # get_cache() requires a threshold; 0.5 is an assumed value
        if cache_response != None:
            return cache_response
        # pass in data to completion function, unpack data
        response = completion(**data)

        # add to cache
        add_cache(data['messages'], response) # store the new response so later, similar questions can hit the cache
    except Exception as e:
        # call handle_error function
        return handle_error(data)
    return response, 200

import uuid
cache_collection = None
# Add a response to the cache
def add_cache(messages, model_response):
    global cache_collection
    if cache_collection is None:
        make_collection()

    user_question = message_to_user_question(messages)

    # Add the user question and model response to the cache
    cache_collection.add(
        documents=[user_question],
        metadatas=[{"model_response": str(model_response)}],
        ids=[str(uuid.uuid4())]
    )
    return

# Retrieve a response from the cache if similarity is above the threshold
def get_cache(messages, similarity_threshold):
    try:
        global cache_collection
        if cache_collection is None:
            make_collection()

        user_question = message_to_user_question(messages)

        # Query the cache for the user question
        results = cache_collection.query(
            query_texts=[user_question],
            n_results=1
        )

        if len(results['distances'][0]) == 0:
            return None # Cache is empty

        distance = results['distances'][0][0]
        sim = (1 - distance)

        if sim >= similarity_threshold:
            return results['metadatas'][0][0]["model_response"] # Return cached response
        else:
            return None # No cache hit
    except Exception as e:
        print("Error in get cache", e)
        raise e

# Initialize the cache collection
def make_collection():
    import chromadb
    global cache_collection
    client = chromadb.Client()
    cache_collection = client.create_collection("llm_responses")

# HELPER: Extract user's question from messages
def message_to_user_question(messages):
    user_question = ""
    for message in messages:
        if message['role'] == 'user':
            user_question += message["content"]
    return user_question
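To make the flow of the cache helpers above concrete, here is a minimal round-trip sketch run outside the Flask routes (the sample messages, cached response, and 0.5 threshold are illustrative; chromadb must be installed):

```python
# Illustrative use of add_cache() / get_cache() defined above.
messages = [{"role": "user", "content": "Hello, whats the weather in San Francisco??"}]
add_cache(messages, {"choices": [{"message": {"content": "I can't check live weather."}}]})

# A sufficiently similar question should now return the stored response.
cached = get_cache(
    [{"role": "user", "content": "What's the weather like in San Francisco?"}],
    similarity_threshold=0.5,  # assumed value; the commit does not pin a default
)
print(cached)  # stringified model_response on a hit, None on a miss
```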
18  cookbook/proxy-server/models_info.json  Normal file
@@ -0,0 +1,18 @@

{
    "gpt-3.5-turbo": {"max_tokens": 4000, "input_cost_per_token": 0.0000015, "output_cost_per_token": 0.000002},
    "gpt-35-turbo": {"max_tokens": 4000, "input_cost_per_token": 0.0000015, "output_cost_per_token": 0.000002},
    "gpt-3.5-turbo-0613": {"max_tokens": 4000, "input_cost_per_token": 0.0000015, "output_cost_per_token": 0.000002},
    "gpt-3.5-turbo-0301": {"max_tokens": 4000, "input_cost_per_token": 0.0000015, "output_cost_per_token": 0.000002},
    "gpt-3.5-turbo-16k": {"max_tokens": 16000, "input_cost_per_token": 0.000003, "output_cost_per_token": 0.000004},
    "gpt-35-turbo-16k": {"max_tokens": 16000, "input_cost_per_token": 0.000003, "output_cost_per_token": 0.000004},
    "gpt-3.5-turbo-16k-0613": {"max_tokens": 16000, "input_cost_per_token": 0.000003, "output_cost_per_token": 0.000004},
    "gpt-4": {"max_tokens": 8000, "input_cost_per_token": 0.000003, "output_cost_per_token": 0.00006},
    "gpt-4-0613": {"max_tokens": 8000, "input_cost_per_token": 0.000003, "output_cost_per_token": 0.00006},
    "gpt-4-32k": {"max_tokens": 8000, "input_cost_per_token": 0.00006, "output_cost_per_token": 0.00012},
    "claude-instant-1": {"max_tokens": 100000, "input_cost_per_token": 0.00000163, "output_cost_per_token": 0.00000551},
    "claude-2": {"max_tokens": 100000, "input_cost_per_token": 0.00001102, "output_cost_per_token": 0.00003268},
    "text-bison-001": {"max_tokens": 8192, "input_cost_per_token": 0.000004, "output_cost_per_token": 0.000004},
    "chat-bison-001": {"max_tokens": 4096, "input_cost_per_token": 0.000002, "output_cost_per_token": 0.000002},
    "command-nightly": {"max_tokens": 4096, "input_cost_per_token": 0.000015, "output_cost_per_token": 0.000015}
}
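As a worked example of how these per-token prices translate into request cost, the sketch below prices the sample `gpt-3.5-turbo` response shown later in the README (16 prompt tokens, 41 completion tokens); the file path is relative to the repo root:

```python
import json

# Illustrative cost calculation using the pricing data above.
with open("cookbook/proxy-server/models_info.json") as f:
    models_info = json.load(f)

model = "gpt-3.5-turbo"
prompt_tokens, completion_tokens = 16, 41  # from the sample response's usage block

info = models_info[model]
cost = (prompt_tokens * info["input_cost_per_token"]
        + completion_tokens * info["output_cost_per_token"])
print(f"{model}: ${cost:.6f} for this request")  # ~$0.000106
```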
@@ -1,106 +1,124 @@
<<<<<<< HEAD
# Proxy Server for Azure, Llama2, OpenAI, Claude, Hugging Face, Replicate Models

# liteLLM Proxy Server: 50+ LLM Models, Error Handling, Caching
### Azure, Llama2, OpenAI, Claude, Hugging Face, Replicate Models
[](https://pypi.org/project/litellm/)
[](https://pypi.org/project/litellm/0.1.1/)

[](https://github.com/BerriAI/litellm)
=======
# Proxy Server for Chat API
>>>>>>> d1ff082 (new v litellm for render)

This repository contains a proxy server that interacts with OpenAI's Chat API and other similar APIs to facilitate chat-based language models. The server allows you to easily integrate chat completion capabilities into your applications. The server is built using Python and the Flask framework.
[](https://railway.app/template/_YF4Qj?referralCode=t3ukrU)

<<<<<<< HEAD
# Proxy Server for Chat API
# What does liteLLM proxy do
- Make `/chat/completions` requests for 50+ LLM models **Azure, OpenAI, Replicate, Anthropic, Hugging Face**

  Example: for `model` use `claude-2`, `gpt-3.5`, `gpt-4`, `command-nightly`, `stabilityai/stablecode-completion-alpha-3b-4k`
  ```json
  {
    "model": "replicate/llama-2-70b-chat:2c1608e18606fad2812020dc541930f2d0495ce32eee50074220b87300bc16e1",
    "messages": [
      {
        "content": "Hello, whats the weather in San Francisco??",
        "role": "user"
      }
    ]
  }
  ```
- **Consistent Input/Output** Format
  - Call all models using the OpenAI format - `completion(model, messages)`
  - Text responses will always be available at `['choices'][0]['message']['content']` (see the sketch below)
- **Error Handling** Using Model Fallbacks (if `GPT-4` fails, try `llama2`)
- **Logging** - Log Requests, Responses and Errors to `Supabase`, `Posthog`, `Mixpanel`, `Sentry`, `Helicone` (Any of the supported providers here: https://litellm.readthedocs.io/en/latest/advanced/)

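Because every provider is called through the same OpenAI-style body, switching models is just a change to the `model` field. A minimal sketch against a locally running proxy (the URL and prompt are reused from the request example further down):

```python
import requests

url = "http://localhost:5000/chat/completions"  # TODO: use your URL
messages = [{"role": "user", "content": "Hello, whats the weather in San Francisco??"}]

# Same request shape for every provider - only the model name changes.
for model in ["gpt-3.5-turbo", "claude-2", "command-nightly"]:
    resp = requests.post(url, json={"model": model, "messages": messages})
    print(model, resp.json()["choices"][0]["message"]["content"])
```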
This repository contains a proxy server that interacts with OpenAI's Chat API and other similar APIs to facilitate chat-based language models. The server allows you to easily integrate chat completion capabilities into your applications. The server is built using Python and the Flask framework.
Example: Logs sent to Supabase
<img width="1015" alt="Screenshot 2023-08-11 at 4 02 46 PM" src="https://github.com/ishaan-jaff/proxy-server/assets/29436595/237557b8-ba09-4917-982c-8f3e1b2c8d08">

## Installation

=======
## Installation

>>>>>>> d1ff082 (new v litellm for render)
To set up and run the proxy server locally, follow these steps:

1. Clone this repository to your local machine:


2. Install the required dependencies using pip:

   `pip install -r requirements.txt`

3. Configure the server settings, such as API keys and model endpoints, in the configuration file (`config.py`).

4. Run the server:

   `python app.py`
- **Token Usage & Spend** - Track Input + Completion tokens used + Spend/model
- **Caching** - Implementation of Semantic Caching
- **Streaming & Async Support** - Return generators to stream text responses


## API Endpoints

### `/chat/completions` (POST)

This endpoint is used to generate chat completions. It takes in JSON data with the following parameters:
This endpoint is used to generate chat completions for 50+ supported LLM API models. Use llama2, GPT-4, Claude2 etc.

- `model` (string, required): ID of the model to use for chat completions. Refer to the model endpoint compatibility table for supported models.
#### Input
This API endpoint accepts all inputs in raw JSON and expects the following inputs
- `model` (string, required): ID of the model to use for chat completions. See all supported models [here](https://litellm.readthedocs.io/en/latest/supported/):
  eg `gpt-3.5-turbo`, `gpt-4`, `claude-2`, `command-nightly`, `stabilityai/stablecode-completion-alpha-3b-4k`
- `messages` (array, required): A list of messages representing the conversation context. Each message should have a `role` (system, user, assistant, or function), `content` (message text), and `name` (for function role).
- Additional parameters for controlling completions, such as `temperature`, `top_p`, `n`, etc.
- Additional Optional parameters: `temperature`, `functions`, `function_call`, `top_p`, `n`, `stream`. See the full list of supported inputs here: https://litellm.readthedocs.io/en/latest/input/

Example JSON payload:

#### Example JSON body
For claude-2
```json
{
  "model": "gpt-3.5-turbo",
  "messages": [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Knock knock."},
    {"role": "assistant", "content": "Who's there?"},
    {"role": "user", "content": "Orange."}
  ],
  "temperature": 0.8
  "model": "claude-2",
  "messages": [
    {
      "content": "Hello, whats the weather in San Francisco??",
      "role": "user"
    }
  ]

}
```

### Making an API request to the Proxy Server
```python
import requests
import json

# TODO: use your URL
url = "http://localhost:5000/chat/completions"

payload = json.dumps({
  "model": "gpt-3.5-turbo",
  "messages": [
    {
      "content": "Hello, whats the weather in San Francisco??",
      "role": "user"
    }
  ]
})
headers = {
  'Content-Type': 'application/json'
}
response = requests.request("POST", url, headers=headers, data=payload)
print(response.text)

```

### Output [Response Format]
Responses from the server are given in the following format.
All responses from the server are returned in the following format (for all LLM models). More info on output here: https://litellm.readthedocs.io/en/latest/output/
```json
{
  "choices": [
    {
      "finish_reason": "stop",
      "index": 0,
      "message": {
        "content": "I'm sorry, but I don't have the capability to provide real-time weather information. However, you can easily check the weather in San Francisco by searching online or using a weather app on your phone.",
        "role": "assistant"
      }
    }
  ],
  "created": 1691790381,
  "id": "chatcmpl-7mUFZlOEgdohHRDx2UpYPRTejirzb",
  "model": "gpt-3.5-turbo-0613",
  "object": "chat.completion",
  "usage": {
    "completion_tokens": 41,
    "prompt_tokens": 16,
    "total_tokens": 57
  }
}
```

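Because the shape above is identical for every model, client code can read it uniformly. A short sketch, assuming `response` is the `requests` response from the example request above:

```python
result = response.json()

# Text is always at choices[0].message.content, as documented above.
answer = result["choices"][0]["message"]["content"]
usage = result["usage"]

print(answer)
print(f"prompt_tokens={usage['prompt_tokens']}, "
      f"completion_tokens={usage['completion_tokens']}, total={usage['total_tokens']}")
```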
## Input Parameters
- `model`: ID of the language model to use.
- `messages`: An array of messages representing the conversation context.
  - `role`: The role of the message author (system, user, assistant, or function).
  - `content`: The content of the message.
  - `name`: The name of the author (required for function role).
- `function_call`: The name and arguments of a function to call.
- `functions`: A list of functions the model may generate JSON inputs for.
- Various other parameters for controlling completion behavior.
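For reference, a request body that exercises several of the optional parameters above (the values are illustrative, not recommendations):

```python
payload = {
    "model": "gpt-3.5-turbo",
    "messages": [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "Knock knock."}
    ],
    # Optional parameters from the list above; values are illustrative.
    "temperature": 0.8,
    "top_p": 1,
    "n": 1,
    "stream": False
}
```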
Supported Models
The proxy server supports the following models:

OpenAI Chat Completion Models:
gpt-4
gpt-4-0613
gpt-4-32k
...
OpenAI Text Completion Models:
text-davinci-003
Cohere Models:
command-nightly
command
...
Anthropic Models:
claude-2
claude-instant-1
...
Replicate Models:
replicate/
OpenRouter Models:
google/palm-2-codechat-bison
google/palm-2-chat-bison
...
Vertex Models:
chat-bison
chat-bison@001
<<<<<<< HEAD
Refer to the model endpoint compatibility table for more details.
=======
Refer to the model endpoint compatibility table for more details.
>>>>>>> d1ff082 (new v litellm for render)