forked from phoenix/litellm-mirror
add latest version of proxy
This commit is contained in:
parent 5d0f9fd749
commit 2ccd5848b0
6 changed files with 278 additions and 86 deletions
22  cookbook/proxy-server/Dockerfile  Normal file
@@ -0,0 +1,22 @@

# Use a recent version of Python as the base image
FROM python:3.8-slim-buster

# Set the working directory to /app
WORKDIR /app

# Copy the requirements.txt file to the image
COPY requirements.txt .

# Install the required Python packages using pip
RUN pip install --no-cache-dir -r requirements.txt

# Copy the rest of the application files to the image
COPY . .

# Expose port 5000 for the Flask app to listen on
EXPOSE 5000

# Run the main.py file when the container is started
CMD ["python", "main.py"]
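For local testing, a typical (illustrative) build-and-run sequence for this image would be `docker build -t litellm-proxy .` followed by `docker run -p 5000:5000 litellm-proxy`; the `litellm-proxy` tag is just an example name.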
21  cookbook/proxy-server/LICENSE  Normal file
@@ -0,0 +1,21 @@

MIT License

Copyright (c) 2023 Berri AI

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
@@ -9,7 +9,7 @@ dotenv.load_dotenv()
 
 ######### LOGGING ###################
 # log your data to slack, supabase
-litellm.success_callback=["slack", "supabase"] # .env SLACK_API_TOKEN, SLACK_API_SECRET, SLACK_API_CHANNEL, SUPABASE
+litellm.success_callback=["slack", "supabase"] # set .env SLACK_API_TOKEN, SLACK_API_SECRET, SLACK_API_CHANNEL, SUPABASE
 
 ######### ERROR MONITORING ##########
 # log errors to slack, sentry, supabase
@@ -27,15 +27,14 @@ def api_completion():
     data = request.json
     try:
         # pass in data to completion function, unpack data
         response = completion(**data)
     except Exception as e:
-        traceback.print_exc()
-        response = {"error": str(e)}
+        # call handle_error function
+        return handle_error(data)
     return response, 200
 
 @app.route('/get_models', methods=["POST"])
 def get_models():
-    data = request.json
     try:
         return litellm.model_list
     except Exception as e:
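A minimal sketch of calling this endpoint (assuming the server is running locally on port 5000, as configured below):

```python
import requests

# No request body is needed; the route only accepts POST.
models = requests.post("http://localhost:5000/get_models").json()
print(models)  # the entries from litellm.model_list
```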
@@ -47,6 +46,120 @@ if __name__ == "__main__":
    from waitress import serve
    serve(app, host="0.0.0.0", port=5000, threads=500)

############### Advanced ##########################

################ ERROR HANDLING #####################
# implement model fallbacks, cooldowns, and retries
# if a model fails, assume it was rate limited and let it cool down for 60s
def handle_error(data):
    import time
    # retry the completion() request with fallback models
    response = None
    start_time = time.time()
    rate_limited_models = set()
    model_expiration_times = {}
    fallback_strategy = ['gpt-3.5-turbo', 'command-nightly', 'claude-2']
    while response is None and time.time() - start_time < 45:  # retry for 45s
        for model in fallback_strategy:
            try:
                if model in rate_limited_models:  # check if model is currently cooling down
                    if model_expiration_times.get(model) and time.time() >= model_expiration_times[model]:
                        rate_limited_models.remove(model)  # cooldown window has passed, put the model back in rotation
                    else:
                        continue  # skip model
                print(f"calling model {model}")
                response = completion(**data)
                if response is not None:
                    return response
            except Exception as e:
                rate_limited_models.add(model)
                model_expiration_times[model] = time.time() + 60  # cool down this selected model
    return response


########### Pricing is tracked in Supabase ############


############ Caching ###################################
# make a new endpoint with caching
# this cache is built using ChromaDB
# it has two functions: add_cache() and get_cache()
@app.route('/chat/completions', methods=["POST"])
def api_completion_with_cache():
    data = request.json
    try:
        cache_response = get_cache(data['messages'])
        if cache_response is not None:
            return cache_response
        # pass in data to completion function, unpack data
        response = completion(**data)

        # add the response to the cache
        add_cache(data['messages'], response)
    except Exception as e:
        # call handle_error function
        return handle_error(data)
    return response, 200

import uuid
cache_collection = None

# Add a response to the cache
def add_cache(messages, model_response):
    global cache_collection
    if cache_collection is None:
        make_collection()

    user_question = message_to_user_question(messages)

    # Add the user question and model response to the cache
    cache_collection.add(
        documents=[user_question],
        metadatas=[{"model_response": str(model_response)}],
        ids=[str(uuid.uuid4())]
    )
    return

# Retrieve a response from the cache if similarity is above the threshold
def get_cache(messages, similarity_threshold=0.7):  # default threshold so the lookup above works; tune as needed
    try:
        global cache_collection
        if cache_collection is None:
            make_collection()

        user_question = message_to_user_question(messages)

        # Query the cache for the user question
        results = cache_collection.query(
            query_texts=[user_question],
            n_results=1
        )

        if len(results['distances'][0]) == 0:
            return None  # Cache is empty

        distance = results['distances'][0][0]
        sim = (1 - distance)

        if sim >= similarity_threshold:
            return results['metadatas'][0][0]["model_response"]  # Return cached response
        else:
            return None  # No cache hit
    except Exception as e:
        print("Error in get cache", e)
        raise e

# Initialize the cache collection
def make_collection():
    import chromadb
    global cache_collection
    client = chromadb.Client()
    cache_collection = client.create_collection("llm_responses")

# HELPER: Extract user's question from messages
def message_to_user_question(messages):
    user_question = ""
    for message in messages:
        if message['role'] == 'user':
            user_question += message["content"]
    return user_question
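For reference, a minimal sketch of exercising these cache helpers directly (it assumes `chromadb` is installed and that the proxy file is `main.py`, as in the Dockerfile):

```python
from main import add_cache, get_cache  # assumption: the proxy module is main.py and importable

messages = [{"role": "user", "content": "Hello, whats the weather in San Francisco??"}]

# Nothing cached yet, so the first lookup returns None.
print(get_cache(messages))

# Store a (question, response) pair; an identical question is then served from ChromaDB.
add_cache(messages, {"choices": [{"message": {"content": "I can't check live weather."}}]})
print(get_cache(messages))  # stringified cached response once similarity >= threshold
```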
18  cookbook/proxy-server/models_info.json  Normal file
@@ -0,0 +1,18 @@

{
    "gpt-3.5-turbo": {"max_tokens": 4000, "input_cost_per_token": 0.0000015, "output_cost_per_token": 0.000002},
    "gpt-35-turbo": {"max_tokens": 4000, "input_cost_per_token": 0.0000015, "output_cost_per_token": 0.000002},
    "gpt-3.5-turbo-0613": {"max_tokens": 4000, "input_cost_per_token": 0.0000015, "output_cost_per_token": 0.000002},
    "gpt-3.5-turbo-0301": {"max_tokens": 4000, "input_cost_per_token": 0.0000015, "output_cost_per_token": 0.000002},
    "gpt-3.5-turbo-16k": {"max_tokens": 16000, "input_cost_per_token": 0.000003, "output_cost_per_token": 0.000004},
    "gpt-35-turbo-16k": {"max_tokens": 16000, "input_cost_per_token": 0.000003, "output_cost_per_token": 0.000004},
    "gpt-3.5-turbo-16k-0613": {"max_tokens": 16000, "input_cost_per_token": 0.000003, "output_cost_per_token": 0.000004},
    "gpt-4": {"max_tokens": 8000, "input_cost_per_token": 0.000003, "output_cost_per_token": 0.00006},
    "gpt-4-0613": {"max_tokens": 8000, "input_cost_per_token": 0.000003, "output_cost_per_token": 0.00006},
    "gpt-4-32k": {"max_tokens": 8000, "input_cost_per_token": 0.00006, "output_cost_per_token": 0.00012},
    "claude-instant-1": {"max_tokens": 100000, "input_cost_per_token": 0.00000163, "output_cost_per_token": 0.00000551},
    "claude-2": {"max_tokens": 100000, "input_cost_per_token": 0.00001102, "output_cost_per_token": 0.00003268},
    "text-bison-001": {"max_tokens": 8192, "input_cost_per_token": 0.000004, "output_cost_per_token": 0.000004},
    "chat-bison-001": {"max_tokens": 4096, "input_cost_per_token": 0.000002, "output_cost_per_token": 0.000002},
    "command-nightly": {"max_tokens": 4096, "input_cost_per_token": 0.000015, "output_cost_per_token": 0.000015}
}
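To make the per-token pricing fields concrete, here is a rough sketch of how spend could be computed from this file and the `usage` block the proxy returns (the helper name is illustrative; run from the repository root so the path resolves):

```python
import json

# Illustrative helper: compute spend for one response using models_info.json.
def compute_spend(model, prompt_tokens, completion_tokens,
                  path="cookbook/proxy-server/models_info.json"):
    with open(path) as f:
        models_info = json.load(f)
    info = models_info[model]
    return (prompt_tokens * info["input_cost_per_token"]
            + completion_tokens * info["output_cost_per_token"])

# Example: the sample response later in this diff reports 16 prompt and 41 completion tokens.
print(compute_spend("gpt-3.5-turbo", prompt_tokens=16, completion_tokens=41))
# -> 0.000106 (16 * 0.0000015 + 41 * 0.000002)
```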
@@ -1,106 +1,124 @@

# liteLLM Proxy Server: 50+ LLM Models, Error Handling, Caching

### Azure, Llama2, OpenAI, Claude, Hugging Face, Replicate Models

[](https://pypi.org/project/litellm/)
[](https://pypi.org/project/litellm/0.1.1/)

[](https://github.com/BerriAI/litellm)

[](https://railway.app/template/_YF4Qj?referralCode=t3ukrU)

This repository contains a proxy server that interacts with OpenAI's Chat API and other similar APIs to facilitate chat-based language models. The server allows you to easily integrate chat completion capabilities into your applications. The server is built using Python and the Flask framework.

# What does liteLLM proxy do

- Make `/chat/completions` requests for 50+ LLM models: **Azure, OpenAI, Replicate, Anthropic, Hugging Face**

  Example: for `model`, use `claude-2`, `gpt-3.5`, `gpt-4`, `command-nightly`, `stabilityai/stablecode-completion-alpha-3b-4k`

  ```json
  {
    "model": "replicate/llama-2-70b-chat:2c1608e18606fad2812020dc541930f2d0495ce32eee50074220b87300bc16e1",
    "messages": [
      {
        "content": "Hello, whats the weather in San Francisco??",
        "role": "user"
      }
    ]
  }
  ```

- **Consistent Input/Output Format**
  - Call all models using the OpenAI format - `completion(model, messages)`
  - Text responses will always be available at `['choices'][0]['message']['content']` (see the snippet below)
- **Error Handling** using model fallbacks (if `GPT-4` fails, try `llama2`)
- **Logging** - Log requests, responses and errors to `Supabase`, `Posthog`, `Mixpanel`, `Sentry`, `Helicone` (any of the supported providers here: https://litellm.readthedocs.io/en/latest/advanced/)

  Example: Logs sent to Supabase
  <img width="1015" alt="Screenshot 2023-08-11 at 4 02 46 PM" src="https://github.com/ishaan-jaff/proxy-server/assets/29436595/237557b8-ba09-4917-982c-8f3e1b2c8d08">

- **Token Usage & Spend** - Track input + completion tokens used, plus spend per model
- **Caching** - Implementation of semantic caching
- **Streaming & Async Support** - Return generators to stream text responses
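To make the consistent output format concrete, here is a minimal sketch (assuming the proxy is running locally on port 5000 and `requests` is installed):

```python
import requests

# Whatever model is used, the reply text is always at ['choices'][0]['message']['content'].
resp = requests.post(
    "http://localhost:5000/chat/completions",
    json={"model": "claude-2", "messages": [{"role": "user", "content": "Hello!"}]},
).json()
print(resp["choices"][0]["message"]["content"])
```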
## Installation

To set up and run the proxy server locally, follow these steps:

1. Clone this repository to your local machine.

2. Install the required dependencies using pip:

   `pip install -r requirements.txt`

3. Configure the server settings, such as API keys and model endpoints, in the configuration file (`config.py`).

4. Run the server:

   `python app.py`
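Note: the concrete keys depend on which providers and callbacks you enable. litellm typically reads provider credentials from the environment or a `.env` file (for example `OPENAI_API_KEY` or `ANTHROPIC_API_KEY`), and the logging/monitoring callbacks in `main.py` expect the `SLACK_API_TOKEN`, `SLACK_API_SECRET`, `SLACK_API_CHANNEL`, and `SUPABASE` values referenced there.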
## API Endpoints

### `/chat/completions` (POST)

This endpoint is used to generate chat completions for 50+ supported LLM API models. Use llama2, GPT-4, Claude2, etc.

#### Input

This API endpoint accepts all inputs in raw JSON and expects the following inputs:

- `model` (string, required): ID of the model to use for chat completions. See all supported models [here](https://litellm.readthedocs.io/en/latest/supported/), e.g. `gpt-3.5-turbo`, `gpt-4`, `claude-2`, `command-nightly`, `stabilityai/stablecode-completion-alpha-3b-4k`
- `messages` (array, required): A list of messages representing the conversation context. Each message should have a `role` (system, user, assistant, or function), `content` (message text), and `name` (for the function role).
- Additional optional parameters: `temperature`, `functions`, `function_call`, `top_p`, `n`, `stream`. See the full list of supported inputs here: https://litellm.readthedocs.io/en/latest/input/

#### Example JSON body

For `claude-2`:

```json
{
  "model": "claude-2",
  "messages": [
    {
      "content": "Hello, whats the weather in San Francisco??",
      "role": "user"
    }
  ]
}
```
### Making an API request to the Proxy Server

```python
import requests
import json

# TODO: use your URL
url = "http://localhost:5000/chat/completions"

payload = json.dumps({
  "model": "gpt-3.5-turbo",
  "messages": [
    {
      "content": "Hello, whats the weather in San Francisco??",
      "role": "user"
    }
  ]
})
headers = {
  'Content-Type': 'application/json'
}
response = requests.request("POST", url, headers=headers, data=payload)
print(response.text)
```
### Output [Response Format]

All responses from the server are returned in the following format (for all LLM models). More info on output here: https://litellm.readthedocs.io/en/latest/output/

```json
{
  "choices": [
    {
      "finish_reason": "stop",
      "index": 0,
      "message": {
        "content": "I'm sorry, but I don't have the capability to provide real-time weather information. However, you can easily check the weather in San Francisco by searching online or using a weather app on your phone.",
        "role": "assistant"
      }
    }
  ],
  "created": 1691790381,
  "id": "chatcmpl-7mUFZlOEgdohHRDx2UpYPRTejirzb",
  "model": "gpt-3.5-turbo-0613",
  "object": "chat.completion",
  "usage": {
    "completion_tokens": 41,
    "prompt_tokens": 16,
    "total_tokens": 57
  }
}
```
## Input Parameters

- `model`: ID of the language model to use.
- `messages`: An array of messages representing the conversation context.
  - `role`: The role of the message author (system, user, assistant, or function).
  - `content`: The content of the message.
  - `name`: The name of the author (required for the function role).
- `function_call`: The name and arguments of a function to call.
- `functions`: A list of functions the model may generate JSON inputs for (see the sketch after this list).
- Various other parameters for controlling completion behavior.
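As a sketch of how `functions` could be passed through the proxy (assuming the upstream model supports function calling; the weather function schema below is invented for the example):

```python
import requests

# Illustrative only: the get_current_weather schema is made up for this example.
payload = {
    "model": "gpt-3.5-turbo-0613",
    "messages": [{"role": "user", "content": "What's the weather in San Francisco?"}],
    "functions": [
        {
            "name": "get_current_weather",
            "description": "Get the current weather for a city",
            "parameters": {
                "type": "object",
                "properties": {"city": {"type": "string"}},
                "required": ["city"],
            },
        }
    ],
}

response = requests.post("http://localhost:5000/chat/completions", json=payload)
print(response.json())
```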
## Supported Models

The proxy server supports the following models:

- OpenAI Chat Completion Models: `gpt-4`, `gpt-4-0613`, `gpt-4-32k`, ...
- OpenAI Text Completion Models: `text-davinci-003`
- Cohere Models: `command-nightly`, `command`, ...
- Anthropic Models: `claude-2`, `claude-instant-1`, ...
- Replicate Models: `replicate/`
- OpenRouter Models: `google/palm-2-codechat-bison`, `google/palm-2-chat-bison`, ...
- Vertex Models: `chat-bison`, `chat-bison@001`

Refer to the model endpoint compatibility table for more details.