moving proxy server to the top of repo

This commit is contained in:
Krrish Dholakia 2023-08-18 09:53:49 -07:00
parent 8543d89418
commit 8ef47524bf
10 changed files with 0 additions and 0 deletions

Binary file not shown.

View file

@@ -1,22 +0,0 @@
# Use a recent version of Python as the base image
FROM python:3.8-slim-buster
# Set the working directory to /app
WORKDIR /app
# Copy the requirements.txt file to the image
COPY requirements.txt .
# Install the required Python packages using pip
RUN pip install --no-cache-dir -r requirements.txt
# Copy the rest of the application files to the image
COPY . .
# Expose port 5000 for the Flask app to listen on
EXPOSE 5000
# Run the main.py file when the container is started
CMD ["python", "main.py"]

View file

@@ -1,21 +0,0 @@
MIT License
Copyright (c) 2023 Berri AI
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

View file

@@ -1,86 +0,0 @@
from flask import Flask, request, jsonify, abort, Response
from flask_cors import CORS
import traceback
import litellm
from litellm import completion
import openai
from utils import handle_error, get_cache, add_cache
import os, dotenv
import logging
import json

dotenv.load_dotenv()

# TODO: set your keys in .env or here:
# os.environ["OPENAI_API_KEY"] = "" # set your openai key here
# see supported models / keys here: https://litellm.readthedocs.io/en/latest/supported/

######### LOGGING ###################
# log your data to slack, supabase
litellm.success_callback = ["slack", "supabase"]  # set .env SLACK_API_TOKEN, SLACK_API_SECRET, SLACK_API_CHANNEL, SUPABASE

######### ERROR MONITORING ##########
# log errors to slack, sentry, supabase
litellm.failure_callback = ["slack", "sentry", "supabase"]  # .env SENTRY_API_URL

app = Flask(__name__)
CORS(app)


@app.route('/')
def index():
    return 'received!', 200


def data_generator(response):
    # stream each chunk as a Server-Sent-Event style "data: ..." line
    for chunk in response:
        yield f"data: {json.dumps(chunk)}\n\n"


@app.route('/chat/completions', methods=["POST"])
def api_completion():
    data = request.json
    if data.get('stream') == "True":
        data['stream'] = True  # convert to boolean
    try:
        # pass in data to completion function, unpack data
        response = completion(**data)
        if 'stream' in data and data['stream'] == True:  # use data_generator to stream responses
            return Response(data_generator(response), mimetype='text/event-stream')
    except Exception as e:
        # call handle_error function
        print(f"got error {e}")
        return handle_error(data)
    return response, 200  # non-streaming responses


@app.route('/get_models', methods=["POST"])
def get_models():
    try:
        return litellm.model_list
    except Exception as e:
        traceback.print_exc()
        response = {"error": str(e)}
        return response, 200


############### Advanced ##########################
############ Caching ###################################
# a new endpoint with caching
# the cache is built using ChromaDB
# it has two functions: add_cache() and get_cache()
@app.route('/chat/completions_with_cache', methods=["POST"])
def api_completion_with_cache():
    data = request.json
    try:
        cache_response = get_cache(data['messages'])
        if cache_response is not None:
            return cache_response
        # pass in data to completion function, unpack data
        response = completion(**data)
        # add the new response to the cache
        add_cache(data['messages'], response)
    except Exception as e:
        # call handle_error function
        return handle_error(data)
    return response, 200


if __name__ == "__main__":
    from waitress import serve
    serve(app, host="0.0.0.0", port=5000, threads=500)
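
For reference, a minimal client sketch for the two endpoints not shown in the README (`/get_models` and `/chat/completions_with_cache`); it assumes the proxy above is running locally on port 5000:

```python
import requests

# Assumes the proxy above is running locally: python main.py
base_url = "http://localhost:5000"

# List the models the proxy knows about
print(requests.post(f"{base_url}/get_models").text)

payload = {
    "model": "gpt-3.5-turbo",
    "messages": [{"role": "user", "content": "Hello, whats the weather in San Francisco??"}],
}

# The first call goes to the LLM; a sufficiently similar follow-up question
# can be answered straight from the ChromaDB cache.
print(requests.post(f"{base_url}/chat/completions_with_cache", json=payload).text)
```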

View file

@@ -1,18 +0,0 @@
{
"gpt-3.5-turbo": {"max_tokens": 4000, "input_cost_per_token": 0.0000015, "output_cost_per_token": 0.000002},
"gpt-35-turbo": {"max_tokens": 4000, "input_cost_per_token": 0.0000015, "output_cost_per_token": 0.000002},
"gpt-3.5-turbo-0613": {"max_tokens": 4000, "input_cost_per_token": 0.0000015, "output_cost_per_token": 0.000002},
"gpt-3.5-turbo-0301": {"max_tokens": 4000, "input_cost_per_token": 0.0000015, "output_cost_per_token": 0.000002},
"gpt-3.5-turbo-16k": {"max_tokens": 16000, "input_cost_per_token": 0.000003, "output_cost_per_token": 0.000004},
"gpt-35-turbo-16k": {"max_tokens": 16000, "input_cost_per_token": 0.000003, "output_cost_per_token": 0.000004},
"gpt-3.5-turbo-16k-0613": {"max_tokens": 16000, "input_cost_per_token": 0.000003, "output_cost_per_token": 0.000004},
"gpt-4": {"max_tokens": 8000, "input_cost_per_token": 0.000003, "output_cost_per_token": 0.00006},
"gpt-4-0613": {"max_tokens": 8000, "input_cost_per_token": 0.000003, "output_cost_per_token": 0.00006},
"gpt-4-32k": {"max_tokens": 8000, "input_cost_per_token": 0.00006, "output_cost_per_token": 0.00012},
"claude-instant-1": {"max_tokens": 100000, "input_cost_per_token": 0.00000163, "output_cost_per_token": 0.00000551},
"claude-2": {"max_tokens": 100000, "input_cost_per_token": 0.00001102, "output_cost_per_token": 0.00003268},
"text-bison-001": {"max_tokens": 8192, "input_cost_per_token": 0.000004, "output_cost_per_token": 0.000004},
"chat-bison-001": {"max_tokens": 4096, "input_cost_per_token": 0.000002, "output_cost_per_token": 0.000002},
"command-nightly": {"max_tokens": 4096, "input_cost_per_token": 0.000015, "output_cost_per_token": 0.000015}
}
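
Taken together with the `usage` block the proxy returns, this price map can be used to track spend per request. A minimal sketch, assuming the JSON above is saved as `model_prices.json` (the actual filename is not shown in this diff):

```python
import json

# Load the price map above (assumed filename: model_prices.json)
with open("model_prices.json") as f:
    model_prices = json.load(f)

def request_cost(model, prompt_tokens, completion_tokens):
    prices = model_prices[model]
    return (prompt_tokens * prices["input_cost_per_token"]
            + completion_tokens * prices["output_cost_per_token"])

# Using the usage block from the sample response in the README:
# 16 prompt tokens + 41 completion tokens on gpt-3.5-turbo
print(request_cost("gpt-3.5-turbo", 16, 41))  # ≈ 0.000106 USD
```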

View file

@@ -1,168 +0,0 @@
# liteLLM Proxy Server: 50+ LLM Models, Error Handling, Caching
### Azure, Llama2, OpenAI, Claude, Hugging Face, Replicate Models
[![PyPI Version](https://img.shields.io/pypi/v/litellm.svg)](https://pypi.org/project/litellm/)
[![PyPI Version](https://img.shields.io/badge/stable%20version-v0.1.345-blue?color=green&link=https://pypi.org/project/litellm/0.1.1/)](https://pypi.org/project/litellm/0.1.1/)
![Downloads](https://img.shields.io/pypi/dm/litellm)
[![litellm](https://img.shields.io/badge/%20%F0%9F%9A%85%20liteLLM-OpenAI%7CAzure%7CAnthropic%7CPalm%7CCohere%7CReplicate%7CHugging%20Face-blue?color=green)](https://github.com/BerriAI/litellm)
[![Deploy on Railway](https://railway.app/button.svg)](https://railway.app/template/DYqQAW?referralCode=t3ukrU)
![4BC6491E-86D0-4833-B061-9F54524B2579](https://github.com/BerriAI/litellm/assets/17561003/f5dd237b-db5e-42e1-b1ac-f05683b1d724)
## What does liteLLM proxy do?
- Make `/chat/completions` requests for 50+ LLM models **Azure, OpenAI, Replicate, Anthropic, Hugging Face**
Example: for `model` use `claude-2`, `gpt-3.5`, `gpt-4`, `command-nightly`, `stabilityai/stablecode-completion-alpha-3b-4k`
```json
{
"model": "replicate/llama-2-70b-chat:2c1608e18606fad2812020dc541930f2d0495ce32eee50074220b87300bc16e1",
"messages": [
{
"content": "Hello, whats the weather in San Francisco??",
"role": "user"
}
]
}
```
- **Consistent Input/Output** Format
- Call all models using the OpenAI format - `completion(model, messages)`
- Text responses will always be available at `['choices'][0]['message']['content']`
- **Error Handling** Using Model Fallbacks (if `GPT-4` fails, try `llama2`)
- **Logging** - Log Requests, Responses and Errors to `Supabase`, `Posthog`, `Mixpanel`, `Sentry`, `Helicone` (any of the supported providers here: https://litellm.readthedocs.io/en/latest/advanced/)
**Example: Logs sent to Supabase**
<img width="1015" alt="Screenshot 2023-08-11 at 4 02 46 PM" src="https://github.com/ishaan-jaff/proxy-server/assets/29436595/237557b8-ba09-4917-982c-8f3e1b2c8d08">
- **Token Usage & Spend** - Track Input + Completion tokens used + Spend/model
- **Caching** - Implementation of Semantic Caching
- **Streaming & Async Support** - Return generators to stream text responses
## API Endpoints
### `/chat/completions` (POST)
This endpoint is used to generate chat completions for 50+ supported LLM API models. Use Llama2, GPT-4, Claude 2, etc.
#### Input
This API endpoint accepts all inputs in raw JSON and expects the following inputs
- `model` (string, required): ID of the model to use for chat completions. See all supported models [here](https://litellm.readthedocs.io/en/latest/supported/), e.g. `gpt-3.5-turbo`, `gpt-4`, `claude-2`, `command-nightly`, `stabilityai/stablecode-completion-alpha-3b-4k`
- `messages` (array, required): A list of messages representing the conversation context. Each message should have a `role` (system, user, assistant, or function), `content` (message text), and `name` (for function role).
- Additional Optional parameters: `temperature`, `functions`, `function_call`, `top_p`, `n`, `stream`. See the full list of supported inputs here: https://litellm.readthedocs.io/en/latest/input/
#### Example JSON body
For claude-2
```json
{
"model": "claude-2",
"messages": [
{
"content": "Hello, whats the weather in San Francisco??",
"role": "user"
}
]
}
```
### Making an API request to the Proxy Server
```python
import requests
import json
# TODO: use your URL
url = "http://localhost:5000/chat/completions"
payload = json.dumps({
"model": "gpt-3.5-turbo",
"messages": [
{
"content": "Hello, whats the weather in San Francisco??",
"role": "user"
}
]
})
headers = {
'Content-Type': 'application/json'
}
response = requests.request("POST", url, headers=headers, data=payload)
print(response.text)
```
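### Streaming responses
For streamed responses, the proxy returns `text/event-stream` chunks, each prefixed with `data: ` (see `data_generator` in `main.py`). A minimal streaming client sketch, again assuming the proxy runs locally on port 5000:

```python
import requests
import json

url = "http://localhost:5000/chat/completions"
payload = {
    "model": "gpt-3.5-turbo",
    "stream": True,
    "messages": [{"role": "user", "content": "write a short poem about the fog in San Francisco"}],
}

with requests.post(url, json=payload, stream=True) as response:
    for line in response.iter_lines():
        if not line:
            continue  # skip the blank lines between events
        decoded = line.decode("utf-8")
        if decoded.startswith("data: "):
            chunk = json.loads(decoded[len("data: "):])
            print(chunk)
```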
### Output [Response Format]
All responses from the server are returned in the following format (for all LLM models). More info on output here: https://litellm.readthedocs.io/en/latest/output/
```json
{
"choices": [
{
"finish_reason": "stop",
"index": 0,
"message": {
"content": "I'm sorry, but I don't have the capability to provide real-time weather information. However, you can easily check the weather in San Francisco by searching online or using a weather app on your phone.",
"role": "assistant"
}
}
],
"created": 1691790381,
"id": "chatcmpl-7mUFZlOEgdohHRDx2UpYPRTejirzb",
"model": "gpt-3.5-turbo-0613",
"object": "chat.completion",
"usage": {
"completion_tokens": 41,
"prompt_tokens": 16,
"total_tokens": 57
}
}
```
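Because the output format is consistent across models, the reply text can be pulled out the same way for every model; continuing the `requests` example above:

```python
# The reply text always lives at ['choices'][0]['message']['content'],
# whichever underlying model answered the request.
result = response.json()
print(result["choices"][0]["message"]["content"])
```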
## Installation & Usage
### Running Locally
1. Clone liteLLM repository to your local machine:
```
git clone https://github.com/BerriAI/liteLLM-proxy
```
2. Install the required dependencies using pip
```
pip install -r requirements.txt
```
3. Set your LLM API keys
```
os.environ['OPENAI_API_KEY'] = "YOUR_API_KEY"
# or set OPENAI_API_KEY in your .env file
```
4. Run the server:
```
python main.py
```
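Once the server is up, a quick sanity check against the index route (which simply replies `received!`) might look like this, assuming the default host and port from `main.py`:

```python
import requests

# Hit the index route of the locally running proxy
print(requests.get("http://localhost:5000/").text)  # expected output: received!
```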
## Deploying
1. Quick Start: Deploy on Railway
[![Deploy on Railway](https://railway.app/button.svg)](https://railway.app/template/DYqQAW?referralCode=t3ukrU)
2. `GCP`, `AWS`, `Azure`
This project includes a `Dockerfile` so you can build a Docker image and deploy it on the cloud provider of your choice.
# Support / Talk with founders
- [Our calendar 👋](https://calendly.com/d/4mp-gd3-k5k/berriai-1-1-onboarding-litellm-hosted-version)
- [Community Discord 💭](https://discord.gg/wuPM9dRgDw)
- Our numbers 📞 +1 (770) 8783-106 / +1 (412) 618-6238
- Our emails ✉️ ishaan@berri.ai / krrish@berri.ai
## Roadmap
- [ ] Support hosted db (e.g. Supabase)
- [ ] Easily send data to places like PostHog and Sentry.
- [ ] Add a hot-cache for project spend logs - enables fast checks for user + project limits
- [ ] Implement user-based rate-limiting
- [ ] Spending controls per project - expose key creation endpoint
- [ ] Need to store a keys db -> mapping created keys to their alias (i.e. project name)
- [ ] Easily add new models as backups / as the entry-point (add this to the available model list)

View file

@@ -1,4 +0,0 @@
flask
flask_cors
waitress
litellm==0.1.381

View file

@@ -1,3 +0,0 @@
# tests to call the proxy app.py using gpt-3.5-turbo, gpt-4, and claude-instant-1
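
A hypothetical smoke test along these lines, assuming the proxy is running on localhost:5000 and the relevant provider keys are set:

```python
import requests

# Call the proxy's /chat/completions endpoint once per model mentioned above
url = "http://localhost:5000/chat/completions"
for model in ["gpt-3.5-turbo", "gpt-4", "claude-instant-1"]:
    payload = {
        "model": model,
        "messages": [{"role": "user", "content": "write a 1 pg essay on liteLLM"}],
    }
    response = requests.post(url, json=payload)
    print(model, response.status_code)
```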

View file

@@ -1,21 +0,0 @@
# import openai
# import os
# os.environ["OPENAI_API_KEY"] = ""
# openai.api_key = os.environ["OPENAI_API_KEY"]
# openai.api_base ="http://localhost:5000"
# messages = [
# {
# "role": "user",
# "content": "write a 1 pg essay in liteLLM"
# }
# ]
# response = openai.ChatCompletion.create(model="gpt-3.5-turbo", messages=messages, stream=True)
# print("got response", response)
# # response is a generator
# for chunk in response:
# print(chunk)

View file

@@ -1,107 +0,0 @@
from litellm import completion
import os, dotenv
import json

dotenv.load_dotenv()

############### Advanced ##########################
########### streaming ############################
def generate_responses(response):
    for chunk in response:
        yield json.dumps({"response": chunk}) + "\n"


################ ERROR HANDLING #####################
# implement model fallbacks, cooldowns, and retries
# if a model fails, assume it was rate limited and let it cool down for 60s
def handle_error(data):
    import time
    # retry the completion() request with fallback models
    response = None
    start_time = time.time()
    rate_limited_models = set()
    model_expiration_times = {}
    fallback_strategy = ['gpt-3.5-turbo', 'command-nightly', 'claude-2']
    while response == None and time.time() - start_time < 45:  # retry for 45s
        for model in fallback_strategy:
            try:
                if model in rate_limited_models:  # check if model is currently cooling down
                    if model_expiration_times.get(model) and time.time() >= model_expiration_times[model]:
                        rate_limited_models.remove(model)  # it's been 60s of cooldown, remove the model
                    else:
                        continue  # skip model
                print(f"calling model {model}")
                data['model'] = model  # swap in the fallback model before retrying
                response = completion(**data)
                if response != None:
                    return response
            except Exception as e:
                rate_limited_models.add(model)
                model_expiration_times[model] = time.time() + 60  # cool down this selected model
                pass
    return response


########### Pricing is tracked in Supabase ############

########### Caching: semantic cache built on ChromaDB ############
import uuid

cache_collection = None

# Add a response to the cache
def add_cache(messages, model_response):
    global cache_collection
    if cache_collection is None:
        make_collection()
    user_question = message_to_user_question(messages)
    # Add the user question and model response to the cache
    cache_collection.add(
        documents=[user_question],
        metadatas=[{"model_response": str(model_response)}],
        ids=[str(uuid.uuid4())]
    )
    return

# Retrieve a response from the cache if similarity is above the threshold
def get_cache(messages, similarity_threshold=0.7):  # default threshold so callers may omit it
    try:
        global cache_collection
        if cache_collection is None:
            make_collection()
        user_question = message_to_user_question(messages)
        # Query the cache for the user question
        results = cache_collection.query(
            query_texts=[user_question],
            n_results=1
        )
        if len(results['distances'][0]) == 0:
            return None  # Cache is empty
        distance = results['distances'][0][0]
        sim = (1 - distance)
        if sim >= similarity_threshold:
            return results['metadatas'][0][0]["model_response"]  # Return cached response
        else:
            return None  # No cache hit
    except Exception as e:
        print("Error in get_cache", e)
        raise e

# Initialize the cache collection
def make_collection():
    import chromadb
    global cache_collection
    client = chromadb.Client()
    cache_collection = client.create_collection("llm_responses")

# HELPER: Extract the user's question from messages
def message_to_user_question(messages):
    user_question = ""
    for message in messages:
        if message['role'] == 'user':
            user_question += message["content"]
    return user_question
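
A hypothetical direct usage of the cache helpers above (outside the Flask app); it requires `chromadb` to be installed and assumes this file is importable as `utils`:

```python
from utils import add_cache, get_cache

messages = [{"role": "user", "content": "What is the capital of France?"}]

# Store a model response for this question
add_cache(messages, {"choices": [{"message": {"content": "Paris."}}]})

# A semantically similar question can then be served from the cache;
# get_cache returns None when nothing close enough is stored.
cached = get_cache([{"role": "user", "content": "what's France's capital city?"}],
                   similarity_threshold=0.5)
print(cached)
```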