From acb3b22dfc5be2a3dc24cc3f1d725298dbf88cec Mon Sep 17 00:00:00 2001 From: Krrish Dholakia Date: Tue, 19 Sep 2023 13:22:01 -0700 Subject: [PATCH] Removed proxy-server from Git index --- proxy-server/.DS_Store | Bin 6148 -> 0 bytes proxy-server/Dockerfile | 22 ---- proxy-server/LICENSE | 21 ---- proxy-server/main.py | 86 --------------- proxy-server/models_info.json | 18 --- proxy-server/readme.md | 178 ------------------------------ proxy-server/requirements.txt | 4 - proxy-server/test_proxy.py | 3 - proxy-server/test_proxy_stream.py | 21 ---- proxy-server/utils.py | 107 ------------------ 10 files changed, 460 deletions(-) delete mode 100644 proxy-server/.DS_Store delete mode 100644 proxy-server/Dockerfile delete mode 100644 proxy-server/LICENSE delete mode 100644 proxy-server/main.py delete mode 100644 proxy-server/models_info.json delete mode 100644 proxy-server/readme.md delete mode 100644 proxy-server/requirements.txt delete mode 100644 proxy-server/test_proxy.py delete mode 100644 proxy-server/test_proxy_stream.py delete mode 100644 proxy-server/utils.py diff --git a/proxy-server/.DS_Store b/proxy-server/.DS_Store deleted file mode 100644 index 7a42e831cf559a3cc2fb2cf7b66ff3c47d2f89da..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 6148 zcmeHKK~BRk5FD3EZ9zymazV-qBJqPzl0zlLr4Inwq!N%e1zL&Ta_0|xf%kEOS=*|b zq+Bb6cBS>$>m55DM{x|m4A*4drbPpTh{TB9?{q_a=c)M2NcL#*&6tZ z3dr6aBSDENp15n<{TmhKbW)TP_RGt0Nsqp*u?F;TjS;Sp;s#@+jqJ}>_8j+oFBXWf z#HOBY-W%Bg=6s%vPgCF(x0W|!B*G)xB^C|-SA##`Dss-3ah#m2IkE=hoQ#j1)dZID zj5X&HuJ9IEakhx%8&-9`Rb!Wb%B-B(D?aU#Bb4N9_N^77%r28nHB~?rPz4UI0MBf( z{)s~^RRL8%71$~u--nPc7<;T7+D``ydj%kN8Mel{{4NS7^cZ`r9I}UI9F^#(#$Pdv zqca}*yx3#q(9vQ1<-_=yjlZE7J3Ge@Z8}WsP)k)n6{st4o+ zuM{xdWSk7SB!9NH7AI$INWY_tNnYh}OJT(x#mtqX_>69i@lYzn*kk385t{x8SQ)fX I1rDmf4_q{IQ2+n{ diff --git a/proxy-server/Dockerfile b/proxy-server/Dockerfile deleted file mode 100644 index 9eb7132ee..000000000 --- a/proxy-server/Dockerfile +++ /dev/null @@ -1,22 +0,0 @@ -# Use a recent version of Python as the base image -FROM python:3.8-slim-buster - -# Set the working directory to /app -WORKDIR /app - -# Copy the requirements.txt file to the image -COPY requirements.txt . - -# Install the required packages - -# Install the required Python packages using pip -RUN pip install --no-cache-dir -r requirements.txt - -# Copy the rest of the application files to the image -COPY . . - -# Expose port 5000 for the Flask app to listen on -EXPOSE 5000 - -# Run the main.py file when the container is started -CMD ["python", "main.py"] \ No newline at end of file diff --git a/proxy-server/LICENSE b/proxy-server/LICENSE deleted file mode 100644 index dd11dc523..000000000 --- a/proxy-server/LICENSE +++ /dev/null @@ -1,21 +0,0 @@ -MIT License - -Copyright (c) 2023 Berri AI - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. 
- -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. diff --git a/proxy-server/main.py b/proxy-server/main.py deleted file mode 100644 index 71b5b6a95..000000000 --- a/proxy-server/main.py +++ /dev/null @@ -1,86 +0,0 @@ -from flask import Flask, request, jsonify, abort, Response -from flask_cors import CORS -import traceback -import litellm - -from litellm import completion -import openai -from utils import handle_error, get_cache, add_cache -import os, dotenv -import logging -import json -dotenv.load_dotenv() - -# TODO: set your keys in .env or here: -# os.environ["OPENAI_API_KEY"] = "" # set your openai key here -# see supported models / keys here: https://litellm.readthedocs.io/en/latest/supported/ - -######### LOGGING ################### -# log your data to slack, supabase -litellm.success_callback=["slack", "supabase"] # set .env SLACK_API_TOKEN, SLACK_API_SECRET, SLACK_API_CHANNEL, SUPABASE - -######### ERROR MONITORING ########## -# log errors to slack, sentry, supabase -litellm.failure_callback=["slack", "sentry", "supabase"] # .env SENTRY_API_URL - -app = Flask(__name__) -CORS(app) - -@app.route('/') -def index(): - return 'received!', 200 - -def data_generator(response): - for chunk in response: - yield f"data: {json.dumps(chunk)}\n\n" - -@app.route('/chat/completions', methods=["POST"]) -def api_completion(): - data = request.json - if data.get('stream') == "True": - data['stream'] = True # convert to boolean - try: - # pass in data to completion function, unpack data - response = completion(**data) - if 'stream' in data and data['stream'] == True: # use generate_responses to stream responses - return Response(data_generator(response), mimetype='text/event-stream') - except Exception as e: - # call handle_error function - print(f"got error{e}") - return handle_error(data) - return response, 200 # non streaming responses - -@app.route('/get_models', methods=["POST"]) -def get_models(): - try: - return litellm.model_list - except Exception as e: - traceback.print_exc() - response = {"error": str(e)} - return response, 200 - -if __name__ == "__main__": - from waitress import serve - serve(app, host="0.0.0.0", port=os.environ.get("PORT", 5000), threads=500) - -############### Advanced ########################## - -############ Caching ################################### -# make a new endpoint with caching -# This Cache is built using ChromaDB -# it has two functions add_cache() and get_cache() -@app.route('/chat/completions_with_cache', methods=["POST"]) -def api_completion_with_cache(): - data = request.json - try: - cache_response = get_cache(data['messages']) - if cache_response!=None: - return cache_response - # pass in data to completion function, unpack data - response = completion(**data) - - # add to cache - except Exception as e: - # call handle_error function - return handle_error(data) - return response, 200 \ No newline at end of file diff --git a/proxy-server/models_info.json b/proxy-server/models_info.json deleted file mode 100644 index 7bc9aafae..000000000 --- a/proxy-server/models_info.json +++ /dev/null @@ -1,18 +0,0 @@ - -{ - "gpt-3.5-turbo": 
{"max_tokens": 4000, "input_cost_per_token": 0.0000015, "output_cost_per_token": 0.000002}, - "gpt-35-turbo": {"max_tokens": 4000, "input_cost_per_token": 0.0000015, "output_cost_per_token": 0.000002}, - "gpt-3.5-turbo-0613": {"max_tokens": 4000, "input_cost_per_token": 0.0000015, "output_cost_per_token": 0.000002}, - "gpt-3.5-turbo-0301": {"max_tokens": 4000, "input_cost_per_token": 0.0000015, "output_cost_per_token": 0.000002}, - "gpt-3.5-turbo-16k": {"max_tokens": 16000, "input_cost_per_token": 0.000003, "output_cost_per_token": 0.000004}, - "gpt-35-turbo-16k": {"max_tokens": 16000, "input_cost_per_token": 0.000003, "output_cost_per_token": 0.000004}, - "gpt-3.5-turbo-16k-0613": {"max_tokens": 16000, "input_cost_per_token": 0.000003, "output_cost_per_token": 0.000004}, - "gpt-4": {"max_tokens": 8000, "input_cost_per_token": 0.000003, "output_cost_per_token": 0.00006}, - "gpt-4-0613": {"max_tokens": 8000, "input_cost_per_token": 0.000003, "output_cost_per_token": 0.00006}, - "gpt-4-32k": {"max_tokens": 8000, "input_cost_per_token": 0.00006, "output_cost_per_token": 0.00012}, - "claude-instant-1": {"max_tokens": 100000, "input_cost_per_token": 0.00000163, "output_cost_per_token": 0.00000551}, - "claude-2": {"max_tokens": 100000, "input_cost_per_token": 0.00001102, "output_cost_per_token": 0.00003268}, - "text-bison-001": {"max_tokens": 8192, "input_cost_per_token": 0.000004, "output_cost_per_token": 0.000004}, - "chat-bison-001": {"max_tokens": 4096, "input_cost_per_token": 0.000002, "output_cost_per_token": 0.000002}, - "command-nightly": {"max_tokens": 4096, "input_cost_per_token": 0.000015, "output_cost_per_token": 0.000015} -} \ No newline at end of file diff --git a/proxy-server/readme.md b/proxy-server/readme.md deleted file mode 100644 index 0bd45a3b5..000000000 --- a/proxy-server/readme.md +++ /dev/null @@ -1,178 +0,0 @@ -# liteLLM Proxy Server: 50+ LLM Models, Error Handling, Caching - -### Azure, Llama2, OpenAI, Claude, Hugging Face, Replicate Models - -[![PyPI Version](https://img.shields.io/pypi/v/litellm.svg)](https://pypi.org/project/litellm/) -[![PyPI Version](https://img.shields.io/badge/stable%20version-v0.1.345-blue?color=green&link=https://pypi.org/project/litellm/0.1.1/)](https://pypi.org/project/litellm/0.1.1/) -![Downloads](https://img.shields.io/pypi/dm/litellm) -[![litellm](https://img.shields.io/badge/%20%F0%9F%9A%85%20liteLLM-OpenAI%7CAzure%7CAnthropic%7CPalm%7CCohere%7CReplicate%7CHugging%20Face-blue?color=green)](https://github.com/BerriAI/litellm) - -[![Deploy on Railway](https://railway.app/button.svg)](https://railway.app/template/DYqQAW?referralCode=t3ukrU) - -![4BC6491E-86D0-4833-B061-9F54524B2579](https://github.com/BerriAI/litellm/assets/17561003/f5dd237b-db5e-42e1-b1ac-f05683b1d724) - -## What does liteLLM proxy do - -- Make `/chat/completions` requests for 50+ LLM models **Azure, OpenAI, Replicate, Anthropic, Hugging Face** - - Example: for `model` use `claude-2`, `gpt-3.5`, `gpt-4`, `command-nightly`, `stabilityai/stablecode-completion-alpha-3b-4k` - - ```json - { - "model": "replicate/llama-2-70b-chat:2c1608e18606fad2812020dc541930f2d0495ce32eee50074220b87300bc16e1", - "messages": [ - { - "content": "Hello, whats the weather in San Francisco??", - "role": "user" - } - ] - } - ``` - -- **Consistent Input/Output** Format - - Call all models using the OpenAI format - `completion(model, messages)` - - Text responses will always be available at `['choices'][0]['message']['content']` -- **Error Handling** Using Model Fallbacks (if `GPT-4` fails, try 
`llama2`) -- **Logging** - Log Requests, Responses and Errors to `Supabase`, `Posthog`, `Mixpanel`, `Sentry`, `LLMonitor`, `Traceloop`, `Helicone` (Any of the supported providers here: https://litellm.readthedocs.io/en/latest/advanced/ - - **Example: Logs sent to Supabase** - Screenshot 2023-08-11 at 4 02 46 PM - -- **Token Usage & Spend** - Track Input + Completion tokens used + Spend/model -- **Caching** - Implementation of Semantic Caching -- **Streaming & Async Support** - Return generators to stream text responses - -## API Endpoints - -### `/chat/completions` (POST) - -This endpoint is used to generate chat completions for 50+ support LLM API Models. Use llama2, GPT-4, Claude2 etc - -#### Input - -This API endpoint accepts all inputs in raw JSON and expects the following inputs - -- `model` (string, required): ID of the model to use for chat completions. See all supported models [here]: (https://litellm.readthedocs.io/en/latest/supported/): - eg `gpt-3.5-turbo`, `gpt-4`, `claude-2`, `command-nightly`, `stabilityai/stablecode-completion-alpha-3b-4k` -- `messages` (array, required): A list of messages representing the conversation context. Each message should have a `role` (system, user, assistant, or function), `content` (message text), and `name` (for function role). -- Additional Optional parameters: `temperature`, `functions`, `function_call`, `top_p`, `n`, `stream`. See the full list of supported inputs here: https://litellm.readthedocs.io/en/latest/input/ - -#### Example JSON body - -For claude-2 - -```json -{ - "model": "claude-2", - "messages": [ - { - "content": "Hello, whats the weather in San Francisco??", - "role": "user" - } - ] -} -``` - -### Making an API request to the Proxy Server - -```python -import requests -import json - -# TODO: use your URL -url = "http://localhost:5000/chat/completions" - -payload = json.dumps({ - "model": "gpt-3.5-turbo", - "messages": [ - { - "content": "Hello, whats the weather in San Francisco??", - "role": "user" - } - ] -}) -headers = { - 'Content-Type': 'application/json' -} -response = requests.request("POST", url, headers=headers, data=payload) -print(response.text) - -``` - -### Output [Response Format] - -Responses from the server are given in the following format. -All responses from the server are returned in the following format (for all LLM models). More info on output here: https://litellm.readthedocs.io/en/latest/output/ - -```json -{ - "choices": [ - { - "finish_reason": "stop", - "index": 0, - "message": { - "content": "I'm sorry, but I don't have the capability to provide real-time weather information. However, you can easily check the weather in San Francisco by searching online or using a weather app on your phone.", - "role": "assistant" - } - } - ], - "created": 1691790381, - "id": "chatcmpl-7mUFZlOEgdohHRDx2UpYPRTejirzb", - "model": "gpt-3.5-turbo-0613", - "object": "chat.completion", - "usage": { - "completion_tokens": 41, - "prompt_tokens": 16, - "total_tokens": 57 - } -} -``` - -## Installation & Usage - -### Running Locally - -1. Clone liteLLM repository to your local machine: - ``` - git clone https://github.com/BerriAI/liteLLM-proxy - ``` -2. Install the required dependencies using pip - ``` - pip install requirements.txt - ``` -3. Set your LLM API keys - ``` - os.environ['OPENAI_API_KEY]` = "YOUR_API_KEY" - or - set OPENAI_API_KEY in your .env file - ``` -4. Run the server: - ``` - python main.py - ``` - -## Deploying - -1. 
Quick Start: Deploy on Railway - - [![Deploy on Railway](https://railway.app/button.svg)](https://railway.app/template/DYqQAW?referralCode=t3ukrU) - -2. `GCP`, `AWS`, `Azure` - This project includes a `Dockerfile` allowing you to build and deploy a Docker Project on your providers - -# Support / Talk with founders - -- [Our calendar 👋](https://calendly.com/d/4mp-gd3-k5k/berriai-1-1-onboarding-litellm-hosted-version) -- [Community Discord 💭](https://discord.gg/wuPM9dRgDw) -- Our numbers 📞 +1 (770) 8783-106 / +1 (412) 618-6238 -- Our emails ✉️ ishaan@berri.ai / krrish@berri.ai - -## Roadmap - -- [ ] Support hosted db (e.g. Supabase) -- [ ] Easily send data to places like posthog and sentry. -- [ ] Add a hot-cache for project spend logs - enables fast checks for user + project limitings -- [ ] Implement user-based rate-limiting -- [ ] Spending controls per project - expose key creation endpoint -- [ ] Need to store a keys db -> mapping created keys to their alias (i.e. project name) -- [ ] Easily add new models as backups / as the entry-point (add this to the available model list) diff --git a/proxy-server/requirements.txt b/proxy-server/requirements.txt deleted file mode 100644 index 45e197fc4..000000000 --- a/proxy-server/requirements.txt +++ /dev/null @@ -1,4 +0,0 @@ -flask -flask_cors -waitress -litellm==0.1.591 diff --git a/proxy-server/test_proxy.py b/proxy-server/test_proxy.py deleted file mode 100644 index f1d30ee31..000000000 --- a/proxy-server/test_proxy.py +++ /dev/null @@ -1,3 +0,0 @@ - -# tests to call the proxy app.py using gpt-3.5-turbo, gpt-4, and claude-instant-1 - diff --git a/proxy-server/test_proxy_stream.py b/proxy-server/test_proxy_stream.py deleted file mode 100644 index 8b358f058..000000000 --- a/proxy-server/test_proxy_stream.py +++ /dev/null @@ -1,21 +0,0 @@ -# import openai -# import os - -# os.environ["OPENAI_API_KEY"] = "" - -# openai.api_key = os.environ["OPENAI_API_KEY"] -# openai.api_base ="http://localhost:5000" - -# messages = [ -# { -# "role": "user", -# "content": "write a 1 pg essay in liteLLM" -# } -# ] - -# response = openai.ChatCompletion.create(model="gpt-3.5-turbo", messages=messages, stream=True) -# print("got response", response) -# # response is a generator - -# for chunk in response: -# print(chunk) diff --git a/proxy-server/utils.py b/proxy-server/utils.py deleted file mode 100644 index f6cd94942..000000000 --- a/proxy-server/utils.py +++ /dev/null @@ -1,107 +0,0 @@ - -from litellm import completion -import os, dotenv -import json -dotenv.load_dotenv() -############### Advanced ########################## - -########### streaming ############################ -def generate_responses(response): - for chunk in response: - yield json.dumps({"response": chunk}) + "\n" - -################ ERROR HANDLING ##################### -# implement model fallbacks, cooldowns, and retries -# if a model fails assume it was rate limited and let it cooldown for 60s -def handle_error(data): - import time - # retry completion() request with fallback models - response = None - start_time = time.time() - rate_limited_models = set() - model_expiration_times = {} - fallback_strategy=['gpt-3.5-turbo', 'command-nightly', 'claude-2'] - while response == None and time.time() - start_time < 45: # retry for 45s - for model in fallback_strategy: - try: - if model in rate_limited_models: # check if model is currently cooling down - if model_expiration_times.get(model) and time.time() >= model_expiration_times[model]: - rate_limited_models.remove(model) # check if it's been 60s of 
cool down and remove model - else: - continue # skip model - print(f"calling model {model}") - response = completion(**data) - if response != None: - return response - except Exception as e: - rate_limited_models.add(model) - model_expiration_times[model] = time.time() + 60 # cool down this selected model - pass - return response - - -########### Pricing is tracked in Supabase ############ - - - -import uuid -cache_collection = None -# Add a response to the cache -def add_cache(messages, model_response): - global cache_collection - if cache_collection is None: - make_collection() - - user_question = message_to_user_question(messages) - - # Add the user question and model response to the cache - cache_collection.add( - documents=[user_question], - metadatas=[{"model_response": str(model_response)}], - ids=[str(uuid.uuid4())] - ) - return - -# Retrieve a response from the cache if similarity is above the threshold -def get_cache(messages, similarity_threshold): - try: - global cache_collection - if cache_collection is None: - make_collection() - - user_question = message_to_user_question(messages) - - # Query the cache for the user question - results = cache_collection.query( - query_texts=[user_question], - n_results=1 - ) - - if len(results['distances'][0]) == 0: - return None # Cache is empty - - distance = results['distances'][0][0] - sim = (1 - distance) - - if sim >= similarity_threshold: - return results['metadatas'][0][0]["model_response"] # Return cached response - else: - return None # No cache hit - except Exception as e: - print("Error in get cache", e) - raise e - -# Initialize the cache collection -def make_collection(): - import chromadb - global cache_collection - client = chromadb.Client() - cache_collection = client.create_collection("llm_responses") - -# HELPER: Extract user's question from messages -def message_to_user_question(messages): - user_question = "" - for message in messages: - if message['role'] == 'user': - user_question += message["content"] - return user_question \ No newline at end of file
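
Note, for reference only (not part of the applied patch): the removed proxy-server/utils.py implemented model fallbacks with a 60-second cooldown per rate-limited model, retried inside a 45-second window. Below is a minimal, hypothetical sketch of that pattern; the function and parameter names are illustrative, and unlike the deleted handle_error() it explicitly swaps the fallback model into the request (the original called completion(**data) without overriding data['model']).

```python
# Standalone sketch of the fallback/cooldown retry pattern from the removed utils.py.
# Assumes litellm is installed and provider API keys are set in the environment;
# the fallback list and the 45 s / 60 s windows mirror the deleted code.
import time
from litellm import completion

def completion_with_fallbacks(data,
                              fallback_models=("gpt-3.5-turbo", "command-nightly", "claude-2"),
                              retry_window=45, cooldown=60):
    cooling_until = {}                      # model -> timestamp when it may be retried
    deadline = time.time() + retry_window
    while time.time() < deadline:
        for model in fallback_models:
            if cooling_until.get(model, 0) > time.time():
                continue                    # model is still cooling down, skip it
            try:
                # explicitly override the model for this attempt
                return completion(**{**data, "model": model})
            except Exception:
                # assume a rate limit / provider error; cool this model down
                cooling_until[model] = time.time() + cooldown
        time.sleep(1)                       # avoid a tight loop while everything cools down
    return None                             # every fallback failed within the retry window
```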