From acb3b22dfc5be2a3dc24cc3f1d725298dbf88cec Mon Sep 17 00:00:00 2001 From: Krrish Dholakia Date: Tue, 19 Sep 2023 13:22:01 -0700 Subject: [PATCH] Removed proxy-server from Git index --- proxy-server/.DS_Store | Bin 6148 -> 0 bytes proxy-server/Dockerfile | 22 ---- proxy-server/LICENSE | 21 ---- proxy-server/main.py | 86 --------------- proxy-server/models_info.json | 18 --- proxy-server/readme.md | 178 ------------------------------ proxy-server/requirements.txt | 4 - proxy-server/test_proxy.py | 3 - proxy-server/test_proxy_stream.py | 21 ---- proxy-server/utils.py | 107 ------------------ 10 files changed, 460 deletions(-) delete mode 100644 proxy-server/.DS_Store delete mode 100644 proxy-server/Dockerfile delete mode 100644 proxy-server/LICENSE delete mode 100644 proxy-server/main.py delete mode 100644 proxy-server/models_info.json delete mode 100644 proxy-server/readme.md delete mode 100644 proxy-server/requirements.txt delete mode 100644 proxy-server/test_proxy.py delete mode 100644 proxy-server/test_proxy_stream.py delete mode 100644 proxy-server/utils.py diff --git a/proxy-server/.DS_Store b/proxy-server/.DS_Store deleted file mode 100644 index 7a42e831cf559a3cc2fb2cf7b66ff3c47d2f89da..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 6148 zcmeHKK~BRk5FD3EZ9zymazV-qBJqPzl0zlLr4Inwq!N%e1zL&Ta_0|xf%kEOS=*|b zq+Bb6cBS>$>m55DM{x|m4A*4drbPpTh{TB9?{q_a=c)M2NcL#*&6tZ z3dr6aBSDENp15n<{TmhKbW)TP_RGt0Nsqp*u?F;TjS;Sp;s#@+jqJ}>_8j+oFBXWf z#HOBY-W%Bg=6s%vPgCF(x0W|!B*G)xB^C|-SA##`Dss-3ah#m2IkE=hoQ#j1)dZID zj5X&HuJ9IEakhx%8&-9`Rb!Wb%B-B(D?aU#Bb4N9_N^77%r28nHB~?rPz4UI0MBf( z{)s~^RRL8%71$~u--nPc7<;T7+D``ydj%kN8Mel{{4NS7^cZ`r9I}UI9F^#(#$Pdv zqca}*yx3#q(9vQ1<-_=yjlZE7J3Ge@Z8}WsP)k)n6{st4o+ zuM{xdWSk7SB!9NH7AI$INWY_tNnYh}OJT(x#mtqX_>69i@lYzn*kk385t{x8SQ)fX I1rDmf4_q{IQ2+n{ diff --git a/proxy-server/Dockerfile b/proxy-server/Dockerfile deleted file mode 100644 index 9eb7132ee..000000000 --- a/proxy-server/Dockerfile +++ /dev/null @@ -1,22 +0,0 @@ -# Use a recent version of Python as the base image -FROM python:3.8-slim-buster - -# Set the working directory to /app -WORKDIR /app - -# Copy the requirements.txt file to the image -COPY requirements.txt . - -# Install the required packages - -# Install the required Python packages using pip -RUN pip install --no-cache-dir -r requirements.txt - -# Copy the rest of the application files to the image -COPY . . - -# Expose port 5000 for the Flask app to listen on -EXPOSE 5000 - -# Run the main.py file when the container is started -CMD ["python", "main.py"] \ No newline at end of file diff --git a/proxy-server/LICENSE b/proxy-server/LICENSE deleted file mode 100644 index dd11dc523..000000000 --- a/proxy-server/LICENSE +++ /dev/null @@ -1,21 +0,0 @@ -MIT License - -Copyright (c) 2023 Berri AI - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. 
- -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. diff --git a/proxy-server/main.py b/proxy-server/main.py deleted file mode 100644 index 71b5b6a95..000000000 --- a/proxy-server/main.py +++ /dev/null @@ -1,86 +0,0 @@ -from flask import Flask, request, jsonify, abort, Response -from flask_cors import CORS -import traceback -import litellm - -from litellm import completion -import openai -from utils import handle_error, get_cache, add_cache -import os, dotenv -import logging -import json -dotenv.load_dotenv() - -# TODO: set your keys in .env or here: -# os.environ["OPENAI_API_KEY"] = "" # set your openai key here -# see supported models / keys here: https://litellm.readthedocs.io/en/latest/supported/ - -######### LOGGING ################### -# log your data to slack, supabase -litellm.success_callback=["slack", "supabase"] # set .env SLACK_API_TOKEN, SLACK_API_SECRET, SLACK_API_CHANNEL, SUPABASE - -######### ERROR MONITORING ########## -# log errors to slack, sentry, supabase -litellm.failure_callback=["slack", "sentry", "supabase"] # .env SENTRY_API_URL - -app = Flask(__name__) -CORS(app) - -@app.route('/') -def index(): - return 'received!', 200 - -def data_generator(response): - for chunk in response: - yield f"data: {json.dumps(chunk)}\n\n" - -@app.route('/chat/completions', methods=["POST"]) -def api_completion(): - data = request.json - if data.get('stream') == "True": - data['stream'] = True # convert to boolean - try: - # pass in data to completion function, unpack data - response = completion(**data) - if 'stream' in data and data['stream'] == True: # use generate_responses to stream responses - return Response(data_generator(response), mimetype='text/event-stream') - except Exception as e: - # call handle_error function - print(f"got error{e}") - return handle_error(data) - return response, 200 # non streaming responses - -@app.route('/get_models', methods=["POST"]) -def get_models(): - try: - return litellm.model_list - except Exception as e: - traceback.print_exc() - response = {"error": str(e)} - return response, 200 - -if __name__ == "__main__": - from waitress import serve - serve(app, host="0.0.0.0", port=os.environ.get("PORT", 5000), threads=500) - -############### Advanced ########################## - -############ Caching ################################### -# make a new endpoint with caching -# This Cache is built using ChromaDB -# it has two functions add_cache() and get_cache() -@app.route('/chat/completions_with_cache', methods=["POST"]) -def api_completion_with_cache(): - data = request.json - try: - cache_response = get_cache(data['messages']) - if cache_response!=None: - return cache_response - # pass in data to completion function, unpack data - response = completion(**data) - - # add to cache - except Exception as e: - # call handle_error function - return handle_error(data) - return response, 200 \ No newline at end of file diff --git a/proxy-server/models_info.json b/proxy-server/models_info.json deleted file mode 100644 index 7bc9aafae..000000000 --- a/proxy-server/models_info.json +++ /dev/null @@ -1,18 +0,0 @@ - -{ - "gpt-3.5-turbo": 
{"max_tokens": 4000, "input_cost_per_token": 0.0000015, "output_cost_per_token": 0.000002}, - "gpt-35-turbo": {"max_tokens": 4000, "input_cost_per_token": 0.0000015, "output_cost_per_token": 0.000002}, - "gpt-3.5-turbo-0613": {"max_tokens": 4000, "input_cost_per_token": 0.0000015, "output_cost_per_token": 0.000002}, - "gpt-3.5-turbo-0301": {"max_tokens": 4000, "input_cost_per_token": 0.0000015, "output_cost_per_token": 0.000002}, - "gpt-3.5-turbo-16k": {"max_tokens": 16000, "input_cost_per_token": 0.000003, "output_cost_per_token": 0.000004}, - "gpt-35-turbo-16k": {"max_tokens": 16000, "input_cost_per_token": 0.000003, "output_cost_per_token": 0.000004}, - "gpt-3.5-turbo-16k-0613": {"max_tokens": 16000, "input_cost_per_token": 0.000003, "output_cost_per_token": 0.000004}, - "gpt-4": {"max_tokens": 8000, "input_cost_per_token": 0.000003, "output_cost_per_token": 0.00006}, - "gpt-4-0613": {"max_tokens": 8000, "input_cost_per_token": 0.000003, "output_cost_per_token": 0.00006}, - "gpt-4-32k": {"max_tokens": 8000, "input_cost_per_token": 0.00006, "output_cost_per_token": 0.00012}, - "claude-instant-1": {"max_tokens": 100000, "input_cost_per_token": 0.00000163, "output_cost_per_token": 0.00000551}, - "claude-2": {"max_tokens": 100000, "input_cost_per_token": 0.00001102, "output_cost_per_token": 0.00003268}, - "text-bison-001": {"max_tokens": 8192, "input_cost_per_token": 0.000004, "output_cost_per_token": 0.000004}, - "chat-bison-001": {"max_tokens": 4096, "input_cost_per_token": 0.000002, "output_cost_per_token": 0.000002}, - "command-nightly": {"max_tokens": 4096, "input_cost_per_token": 0.000015, "output_cost_per_token": 0.000015} -} \ No newline at end of file diff --git a/proxy-server/readme.md b/proxy-server/readme.md deleted file mode 100644 index 0bd45a3b5..000000000 --- a/proxy-server/readme.md +++ /dev/null @@ -1,178 +0,0 @@ -# liteLLM Proxy Server: 50+ LLM Models, Error Handling, Caching - -### Azure, Llama2, OpenAI, Claude, Hugging Face, Replicate Models - -[![PyPI Version](https://img.shields.io/pypi/v/litellm.svg)](https://pypi.org/project/litellm/) -[![PyPI Version](https://img.shields.io/badge/stable%20version-v0.1.345-blue?color=green&link=https://pypi.org/project/litellm/0.1.1/)](https://pypi.org/project/litellm/0.1.1/) -![Downloads](https://img.shields.io/pypi/dm/litellm) -[![litellm](https://img.shields.io/badge/%20%F0%9F%9A%85%20liteLLM-OpenAI%7CAzure%7CAnthropic%7CPalm%7CCohere%7CReplicate%7CHugging%20Face-blue?color=green)](https://github.com/BerriAI/litellm) - -[![Deploy on Railway](https://railway.app/button.svg)](https://railway.app/template/DYqQAW?referralCode=t3ukrU) - -![4BC6491E-86D0-4833-B061-9F54524B2579](https://github.com/BerriAI/litellm/assets/17561003/f5dd237b-db5e-42e1-b1ac-f05683b1d724) - -## What does liteLLM proxy do - -- Make `/chat/completions` requests for 50+ LLM models **Azure, OpenAI, Replicate, Anthropic, Hugging Face** - - Example: for `model` use `claude-2`, `gpt-3.5`, `gpt-4`, `command-nightly`, `stabilityai/stablecode-completion-alpha-3b-4k` - - ```json - { - "model": "replicate/llama-2-70b-chat:2c1608e18606fad2812020dc541930f2d0495ce32eee50074220b87300bc16e1", - "messages": [ - { - "content": "Hello, whats the weather in San Francisco??", - "role": "user" - } - ] - } - ``` - -- **Consistent Input/Output** Format - - Call all models using the OpenAI format - `completion(model, messages)` - - Text responses will always be available at `['choices'][0]['message']['content']` -- **Error Handling** Using Model Fallbacks (if `GPT-4` fails, try 
`llama2`) -- **Logging** - Log Requests, Responses and Errors to `Supabase`, `Posthog`, `Mixpanel`, `Sentry`, `LLMonitor`, `Traceloop`, `Helicone` (Any of the supported providers here: https://litellm.readthedocs.io/en/latest/advanced/ - - **Example: Logs sent to Supabase** - Screenshot 2023-08-11 at 4 02 46 PM - -- **Token Usage & Spend** - Track Input + Completion tokens used + Spend/model -- **Caching** - Implementation of Semantic Caching -- **Streaming & Async Support** - Return generators to stream text responses - -## API Endpoints - -### `/chat/completions` (POST) - -This endpoint is used to generate chat completions for 50+ support LLM API Models. Use llama2, GPT-4, Claude2 etc - -#### Input - -This API endpoint accepts all inputs in raw JSON and expects the following inputs - -- `model` (string, required): ID of the model to use for chat completions. See all supported models [here]: (https://litellm.readthedocs.io/en/latest/supported/): - eg `gpt-3.5-turbo`, `gpt-4`, `claude-2`, `command-nightly`, `stabilityai/stablecode-completion-alpha-3b-4k` -- `messages` (array, required): A list of messages representing the conversation context. Each message should have a `role` (system, user, assistant, or function), `content` (message text), and `name` (for function role). -- Additional Optional parameters: `temperature`, `functions`, `function_call`, `top_p`, `n`, `stream`. See the full list of supported inputs here: https://litellm.readthedocs.io/en/latest/input/ - -#### Example JSON body - -For claude-2 - -```json -{ - "model": "claude-2", - "messages": [ - { - "content": "Hello, whats the weather in San Francisco??", - "role": "user" - } - ] -} -``` - -### Making an API request to the Proxy Server - -```python -import requests -import json - -# TODO: use your URL -url = "http://localhost:5000/chat/completions" - -payload = json.dumps({ - "model": "gpt-3.5-turbo", - "messages": [ - { - "content": "Hello, whats the weather in San Francisco??", - "role": "user" - } - ] -}) -headers = { - 'Content-Type': 'application/json' -} -response = requests.request("POST", url, headers=headers, data=payload) -print(response.text) - -``` - -### Output [Response Format] - -Responses from the server are given in the following format. -All responses from the server are returned in the following format (for all LLM models). More info on output here: https://litellm.readthedocs.io/en/latest/output/ - -```json -{ - "choices": [ - { - "finish_reason": "stop", - "index": 0, - "message": { - "content": "I'm sorry, but I don't have the capability to provide real-time weather information. However, you can easily check the weather in San Francisco by searching online or using a weather app on your phone.", - "role": "assistant" - } - } - ], - "created": 1691790381, - "id": "chatcmpl-7mUFZlOEgdohHRDx2UpYPRTejirzb", - "model": "gpt-3.5-turbo-0613", - "object": "chat.completion", - "usage": { - "completion_tokens": 41, - "prompt_tokens": 16, - "total_tokens": 57 - } -} -``` - -## Installation & Usage - -### Running Locally - -1. Clone liteLLM repository to your local machine: - ``` - git clone https://github.com/BerriAI/liteLLM-proxy - ``` -2. Install the required dependencies using pip - ``` - pip install requirements.txt - ``` -3. Set your LLM API keys - ``` - os.environ['OPENAI_API_KEY]` = "YOUR_API_KEY" - or - set OPENAI_API_KEY in your .env file - ``` -4. Run the server: - ``` - python main.py - ``` - -## Deploying - -1. 
Quick Start: Deploy on Railway - - [![Deploy on Railway](https://railway.app/button.svg)](https://railway.app/template/DYqQAW?referralCode=t3ukrU) - -2. `GCP`, `AWS`, `Azure` - This project includes a `Dockerfile` allowing you to build and deploy a Docker Project on your providers - -# Support / Talk with founders - -- [Our calendar 👋](https://calendly.com/d/4mp-gd3-k5k/berriai-1-1-onboarding-litellm-hosted-version) -- [Community Discord 💭](https://discord.gg/wuPM9dRgDw) -- Our numbers 📞 +1 (770) 8783-106 / +1 (412) 618-6238 -- Our emails ✉️ ishaan@berri.ai / krrish@berri.ai - -## Roadmap - -- [ ] Support hosted db (e.g. Supabase) -- [ ] Easily send data to places like posthog and sentry. -- [ ] Add a hot-cache for project spend logs - enables fast checks for user + project limitings -- [ ] Implement user-based rate-limiting -- [ ] Spending controls per project - expose key creation endpoint -- [ ] Need to store a keys db -> mapping created keys to their alias (i.e. project name) -- [ ] Easily add new models as backups / as the entry-point (add this to the available model list) diff --git a/proxy-server/requirements.txt b/proxy-server/requirements.txt deleted file mode 100644 index 45e197fc4..000000000 --- a/proxy-server/requirements.txt +++ /dev/null @@ -1,4 +0,0 @@ -flask -flask_cors -waitress -litellm==0.1.591 diff --git a/proxy-server/test_proxy.py b/proxy-server/test_proxy.py deleted file mode 100644 index f1d30ee31..000000000 --- a/proxy-server/test_proxy.py +++ /dev/null @@ -1,3 +0,0 @@ - -# tests to call the proxy app.py using gpt-3.5-turbo, gpt-4, and claude-instant-1 - diff --git a/proxy-server/test_proxy_stream.py b/proxy-server/test_proxy_stream.py deleted file mode 100644 index 8b358f058..000000000 --- a/proxy-server/test_proxy_stream.py +++ /dev/null @@ -1,21 +0,0 @@ -# import openai -# import os - -# os.environ["OPENAI_API_KEY"] = "" - -# openai.api_key = os.environ["OPENAI_API_KEY"] -# openai.api_base ="http://localhost:5000" - -# messages = [ -# { -# "role": "user", -# "content": "write a 1 pg essay in liteLLM" -# } -# ] - -# response = openai.ChatCompletion.create(model="gpt-3.5-turbo", messages=messages, stream=True) -# print("got response", response) -# # response is a generator - -# for chunk in response: -# print(chunk) diff --git a/proxy-server/utils.py b/proxy-server/utils.py deleted file mode 100644 index f6cd94942..000000000 --- a/proxy-server/utils.py +++ /dev/null @@ -1,107 +0,0 @@ - -from litellm import completion -import os, dotenv -import json -dotenv.load_dotenv() -############### Advanced ########################## - -########### streaming ############################ -def generate_responses(response): - for chunk in response: - yield json.dumps({"response": chunk}) + "\n" - -################ ERROR HANDLING ##################### -# implement model fallbacks, cooldowns, and retries -# if a model fails assume it was rate limited and let it cooldown for 60s -def handle_error(data): - import time - # retry completion() request with fallback models - response = None - start_time = time.time() - rate_limited_models = set() - model_expiration_times = {} - fallback_strategy=['gpt-3.5-turbo', 'command-nightly', 'claude-2'] - while response == None and time.time() - start_time < 45: # retry for 45s - for model in fallback_strategy: - try: - if model in rate_limited_models: # check if model is currently cooling down - if model_expiration_times.get(model) and time.time() >= model_expiration_times[model]: - rate_limited_models.remove(model) # check if it's been 60s of 
cool down and remove model - else: - continue # skip model - print(f"calling model {model}") - response = completion(**data) - if response != None: - return response - except Exception as e: - rate_limited_models.add(model) - model_expiration_times[model] = time.time() + 60 # cool down this selected model - pass - return response - - -########### Pricing is tracked in Supabase ############ - - - -import uuid -cache_collection = None -# Add a response to the cache -def add_cache(messages, model_response): - global cache_collection - if cache_collection is None: - make_collection() - - user_question = message_to_user_question(messages) - - # Add the user question and model response to the cache - cache_collection.add( - documents=[user_question], - metadatas=[{"model_response": str(model_response)}], - ids=[str(uuid.uuid4())] - ) - return - -# Retrieve a response from the cache if similarity is above the threshold -def get_cache(messages, similarity_threshold): - try: - global cache_collection - if cache_collection is None: - make_collection() - - user_question = message_to_user_question(messages) - - # Query the cache for the user question - results = cache_collection.query( - query_texts=[user_question], - n_results=1 - ) - - if len(results['distances'][0]) == 0: - return None # Cache is empty - - distance = results['distances'][0][0] - sim = (1 - distance) - - if sim >= similarity_threshold: - return results['metadatas'][0][0]["model_response"] # Return cached response - else: - return None # No cache hit - except Exception as e: - print("Error in get cache", e) - raise e - -# Initialize the cache collection -def make_collection(): - import chromadb - global cache_collection - client = chromadb.Client() - cache_collection = client.create_collection("llm_responses") - -# HELPER: Extract user's question from messages -def message_to_user_question(messages): - user_question = "" - for message in messages: - if message['role'] == 'user': - user_question += message["content"] - return user_question \ No newline at end of file
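
Note, for reference only (not part of the applied patch): the removed proxy-server/utils.py implemented model fallbacks with a 60-second cooldown per rate-limited model, retried inside a 45-second window. Below is a minimal, hypothetical sketch of that pattern; the function and parameter names are illustrative, and unlike the deleted handle_error() it explicitly swaps the fallback model into the request (the original called completion(**data) without overriding data['model']).

```python
# Standalone sketch of the fallback/cooldown retry pattern from the removed utils.py.
# Assumes litellm is installed and provider API keys are set in the environment;
# the fallback list and the 45 s / 60 s windows mirror the deleted code.
import time
from litellm import completion

def completion_with_fallbacks(data,
                              fallback_models=("gpt-3.5-turbo", "command-nightly", "claude-2"),
                              retry_window=45, cooldown=60):
    cooling_until = {}                      # model -> timestamp when it may be retried
    deadline = time.time() + retry_window
    while time.time() < deadline:
        for model in fallback_models:
            if cooling_until.get(model, 0) > time.time():
                continue                    # model is still cooling down, skip it
            try:
                # explicitly override the model for this attempt
                return completion(**{**data, "model": model})
            except Exception:
                # assume a rate limit / provider error; cool this model down
                cooling_until[model] = time.time() + cooldown
        time.sleep(1)                       # avoid a tight loop while everything cools down
    return None                             # every fallback failed within the retry window
```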