diff --git a/cookbook/proxy-server/Dockerfile b/cookbook/proxy-server/Dockerfile
new file mode 100644
index 000000000..9eb7132ee
--- /dev/null
+++ b/cookbook/proxy-server/Dockerfile
@@ -0,0 +1,22 @@
+# Use a recent version of Python as the base image
+FROM python:3.8-slim-buster
+
+# Set the working directory to /app
+WORKDIR /app
+
+# Copy the requirements.txt file to the image
+COPY requirements.txt .
+
+# Install the required packages
+
+# Install the required Python packages using pip
+RUN pip install --no-cache-dir -r requirements.txt
+
+# Copy the rest of the application files to the image
+COPY . .
+
+# Expose port 5000 for the Flask app to listen on
+EXPOSE 5000
+
+# Run the main.py file when the container is started
+CMD ["python", "main.py"]
\ No newline at end of file
diff --git a/cookbook/proxy-server/LICENSE b/cookbook/proxy-server/LICENSE
new file mode 100644
index 000000000..dd11dc523
--- /dev/null
+++ b/cookbook/proxy-server/LICENSE
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2023 Berri AI
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/cookbook/proxy-server/main.py b/cookbook/proxy-server/main.py
index 005bd81d1..be770afb7 100644
--- a/cookbook/proxy-server/main.py
+++ b/cookbook/proxy-server/main.py
@@ -9,7 +9,7 @@ dotenv.load_dotenv()
 ######### LOGGING ###################
 # log your data to slack, supabase
-litellm.success_callback=["slack", "supabase"] # .env SLACK_API_TOKEN, SLACK_API_SECRET, SLACK_API_CHANNEL, SUPABASE
+litellm.success_callback=["slack", "supabase"] # set .env SLACK_API_TOKEN, SLACK_API_SECRET, SLACK_API_CHANNEL, SUPABASE
 
 ######### ERROR MONITORING ##########
 # log errors to slack, sentry, supabase
 
@@ -27,15 +27,14 @@ def api_completion():
     data = request.json
     try:
         # pass in data to completion function, unpack data
-        response = completion(**data)
+        response = completion(**data)
     except Exception as e:
-        traceback.print_exc()
-        response = {"error": str(e)}
+        # call handle_error function
+        return handle_error(data)
     return response, 200
 
 @app.route('/get_models', methods=["POST"])
 def get_models():
-    data = request.json
     try:
         return litellm.model_list
     except Exception as e:
@@ -47,6 +46,120 @@ if __name__ == "__main__":
     from waitress import serve
     serve(app, host="0.0.0.0", port=5000, threads=500)
 
+############### Advanced ##########################
+
+################ ERROR HANDLING #####################
+# implement model fallbacks, cooldowns, and retries
+# if a model fails assume it was rate limited and let it cooldown for 60s
+def handle_error(data):
+    import time
+    # retry completion() request with fallback models
+    response = None
+    start_time = time.time()
+    rate_limited_models = set()
+    model_expiration_times = {}
+    fallback_strategy = ['gpt-3.5-turbo', 'command-nightly', 'claude-2']
+    while response is None and time.time() - start_time < 45: # retry for 45s
+        for model in fallback_strategy:
+            try:
+                if model in rate_limited_models: # check if model is currently cooling down
+                    if model_expiration_times.get(model) and time.time() >= model_expiration_times[model]:
+                        rate_limited_models.remove(model) # it's been 60s of cool down, put the model back in rotation
+                    else:
+                        continue # skip model
+                print(f"calling model {model}")
+                data['model'] = model # switch the request to the current fallback model
+                response = completion(**data)
+                if response is not None:
+                    return response
+            except Exception as e:
+                rate_limited_models.add(model)
+                model_expiration_times[model] = time.time() + 60 # cool down this selected model
+    return response
+
+
+########### Pricing is tracked in Supabase ############
+############ Caching ###################################
+# make a new endpoint with caching
+# This Cache is built using ChromaDB
+# it has two functions add_cache() and get_cache()
+# NOTE: registered on the same route as api_completion above for illustration; in practice only one of the two should be registered
+@app.route('/chat/completions', methods=["POST"])
+def api_completion_with_cache():
+    data = request.json
+    try:
+        # get_cache() requires a similarity threshold; 0.9 here is an illustrative, fairly strict value
+        cache_response = get_cache(data['messages'], similarity_threshold=0.9)
+        if cache_response is not None:
+            return cache_response
+        # pass in data to completion function, unpack data
+        response = completion(**data)
+
+        # add the new (question, response) pair to the cache
+        add_cache(data['messages'], response)
+    except Exception as e:
+        # call handle_error function
+        return handle_error(data)
+    return response, 200
+
+import uuid
+cache_collection = None
+# Add a response to the cache
+def add_cache(messages, model_response):
+    global cache_collection
+    if cache_collection is None:
+        make_collection()
+
+    user_question = message_to_user_question(messages)
+
+    # Add the user question and model response to the cache
+    cache_collection.add(
+        documents=[user_question],
+        metadatas=[{"model_response": str(model_response)}],
+        ids=[str(uuid.uuid4())]
+    )
+    return
+
+# Retrieve a response from the cache if similarity is above the threshold
+def get_cache(messages, similarity_threshold):
+    try:
+        global cache_collection
+        if cache_collection is None:
+            make_collection()
+
+        user_question = message_to_user_question(messages)
+
+        # Query the cache for the user question
+        results = cache_collection.query(
+            query_texts=[user_question],
+            n_results=1
+        )
+
+        if len(results['distances'][0]) == 0:
+            return None # Cache is empty
+
+        distance = results['distances'][0][0]
+        sim = (1 - distance)
+
+        if sim >= similarity_threshold:
+            return results['metadatas'][0][0]["model_response"] # Return cached response
+        else:
+            return None # No cache hit
+    except Exception as e:
+        print("Error in get cache", e)
+        raise e
+
+# Initialize the cache collection
+def make_collection():
+    import chromadb
+    global cache_collection
+    client = chromadb.Client()
+    cache_collection = client.create_collection("llm_responses")
+
+# HELPER: Extract user's question from messages
+def message_to_user_question(messages):
+    user_question = ""
+    for message in messages:
+        if message['role'] == 'user':
+            user_question += message["content"]
+    return user_question
\ No newline at end of file
diff --git a/cookbook/proxy-server/models.json b/cookbook/proxy-server/models.json
deleted file mode 100644
index e69de29bb..000000000
diff --git a/cookbook/proxy-server/models_info.json b/cookbook/proxy-server/models_info.json
new file mode 100644
index 000000000..7bc9aafae
--- /dev/null
+++ b/cookbook/proxy-server/models_info.json
@@ -0,0 +1,18 @@
+
+{
+    "gpt-3.5-turbo": {"max_tokens": 4000, "input_cost_per_token": 0.0000015, "output_cost_per_token": 0.000002},
+    "gpt-35-turbo": {"max_tokens": 4000, "input_cost_per_token": 0.0000015, "output_cost_per_token": 0.000002},
+    "gpt-3.5-turbo-0613": {"max_tokens": 4000, "input_cost_per_token": 0.0000015, "output_cost_per_token": 0.000002},
+    "gpt-3.5-turbo-0301": {"max_tokens": 4000, "input_cost_per_token": 0.0000015, "output_cost_per_token": 0.000002},
+    "gpt-3.5-turbo-16k": {"max_tokens": 16000, "input_cost_per_token": 0.000003, "output_cost_per_token": 0.000004},
+    "gpt-35-turbo-16k": {"max_tokens": 16000, "input_cost_per_token": 0.000003, "output_cost_per_token": 0.000004},
+    "gpt-3.5-turbo-16k-0613": {"max_tokens": 16000, "input_cost_per_token": 0.000003, "output_cost_per_token": 0.000004},
+    "gpt-4": {"max_tokens": 8000, "input_cost_per_token": 0.00003, "output_cost_per_token": 0.00006},
+    "gpt-4-0613": {"max_tokens": 8000, "input_cost_per_token": 0.00003, "output_cost_per_token": 0.00006},
+    "gpt-4-32k": {"max_tokens": 32000, "input_cost_per_token": 0.00006, "output_cost_per_token": 0.00012},
+    "claude-instant-1": {"max_tokens": 100000, "input_cost_per_token": 0.00000163, "output_cost_per_token": 0.00000551},
+    "claude-2": {"max_tokens": 100000, "input_cost_per_token": 0.00001102, "output_cost_per_token": 0.00003268},
+    "text-bison-001": {"max_tokens": 8192, "input_cost_per_token": 0.000004, "output_cost_per_token": 0.000004},
+    "chat-bison-001": {"max_tokens": 4096, "input_cost_per_token": 0.000002, "output_cost_per_token": 0.000002},
+    "command-nightly": {"max_tokens": 4096, "input_cost_per_token": 0.000015, "output_cost_per_token": 0.000015}
+}
\ No newline at end of file
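The per-token rates above can be combined with the `usage` block that the proxy returns to estimate spend per request. A minimal sketch, assuming the OpenAI-style response format shown later in the readme; the helper name and file path are illustrative, not part of the cookbook:

```python
import json

# Illustrative helper (not part of the cookbook): estimate the dollar cost of
# one response using the per-token rates defined in models_info.json.
def estimate_cost(response, models_info_path="models_info.json"):
    with open(models_info_path) as f:
        models_info = json.load(f)
    rates = models_info[response["model"]]  # e.g. "gpt-3.5-turbo-0613"
    usage = response["usage"]
    return (usage["prompt_tokens"] * rates["input_cost_per_token"]
            + usage["completion_tokens"] * rates["output_cost_per_token"])

# With the sample response shown in the readme below (16 prompt + 41 completion
# tokens on gpt-3.5-turbo-0613): 16 * 0.0000015 + 41 * 0.000002 = $0.000106
```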
diff --git a/cookbook/proxy-server/readme.md b/cookbook/proxy-server/readme.md
index 7866b7b63..12f3c65bd 100644
--- a/cookbook/proxy-server/readme.md
+++ b/cookbook/proxy-server/readme.md
@@ -1,106 +1,124 @@
-<<<<<<< HEAD
-# Proxy Server for Azure, Llama2, OpenAI, Claude, Hugging Face, Replicate Models
+
+# liteLLM Proxy Server: 50+ LLM Models, Error Handling, Caching
+### Azure, Llama2, OpenAI, Claude, Hugging Face, Replicate Models
 [![PyPI Version](https://img.shields.io/pypi/v/litellm.svg)](https://pypi.org/project/litellm/)
 [![PyPI Version](https://img.shields.io/badge/stable%20version-v0.1.345-blue?color=green&link=https://pypi.org/project/litellm/0.1.1/)](https://pypi.org/project/litellm/0.1.1/)
 ![Downloads](https://img.shields.io/pypi/dm/litellm)
 [![litellm](https://img.shields.io/badge/%20%F0%9F%9A%85%20liteLLM-OpenAI%7CAzure%7CAnthropic%7CPalm%7CCohere%7CReplicate%7CHugging%20Face-blue?color=green)](https://github.com/BerriAI/litellm)
-=======
-# Proxy Server for Chat API
->>>>>>> d1ff082 (new v litellm for render)
-This repository contains a proxy server that interacts with OpenAI's Chat API and other similar APIs to facilitate chat-based language models. The server allows you to easily integrate chat completion capabilities into your applications. The server is built using Python and the Flask framework.
+[![Deploy on Railway](https://railway.app/button.svg)](https://railway.app/template/_YF4Qj?referralCode=t3ukrU)
 
-<<<<<<< HEAD
-# Proxy Server for Chat API
+# What does liteLLM proxy do
+- Make `/chat/completions` requests for 50+ LLM models **Azure, OpenAI, Replicate, Anthropic, Hugging Face**
+
+  Example: for `model` use `claude-2`, `gpt-3.5`, `gpt-4`, `command-nightly`, `stabilityai/stablecode-completion-alpha-3b-4k`
+  ```json
+  {
+    "model": "replicate/llama-2-70b-chat:2c1608e18606fad2812020dc541930f2d0495ce32eee50074220b87300bc16e1",
+    "messages": [
+      {
+        "content": "Hello, whats the weather in San Francisco??",
+        "role": "user"
+      }
+    ]
+  }
+  ```
+- **Consistent Input/Output** Format
+  - Call all models using the OpenAI format - `completion(model, messages)`
+  - Text responses will always be available at `['choices'][0]['message']['content']`
+- **Error Handling** Using Model Fallbacks (if `GPT-4` fails, try `llama2`)
+- **Logging** - Log Requests, Responses and Errors to `Supabase`, `Posthog`, `Mixpanel`, `Sentry`, `Helicone` (any of the supported providers here: https://litellm.readthedocs.io/en/latest/advanced/)
-This repository contains a proxy server that interacts with OpenAI's Chat API and other similar APIs to facilitate chat-based language models. The server allows you to easily integrate chat completion capabilities into your applications. The server is built using Python and the Flask framework.
+
+  Example: Logs sent to Supabase
+  Screenshot 2023-08-11 at 4 02 46 PM
-## Installation
-
-=======
-## Installation
-
->>>>>>> d1ff082 (new v litellm for render)
-To set up and run the proxy server locally, follow these steps:
-
-1. Clone this repository to your local machine:
-
-
-2. Install the required dependencies using pip:
-
-`pip install -r requirements.txt`
-
-3. Configure the server settings, such as API keys and model endpoints, in the configuration file (`config.py`).
-
-4. Run the server:
-
-`python app.py`
+- **Token Usage & Spend** - Track Input + Completion tokens used + Spend/model
+- **Caching** - Implementation of Semantic Caching
+- **Streaming & Async Support** - Return generators to stream text responses
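Alongside `/chat/completions` (documented in the next section), `main.py` also registers a `/get_models` route that returns `litellm.model_list`. A minimal sketch of calling it, assuming the proxy is running locally on the port exposed in the Dockerfile:

```python
import requests

# Assumes the proxy is running locally on the port exposed in the Dockerfile.
url = "http://localhost:5000/get_models"

# The route is registered for POST in main.py and takes no request body.
response = requests.post(url)
print(response.text)  # expected to contain litellm's model list
```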
 
 ## API Endpoints
 
 ### `/chat/completions` (POST)
-This endpoint is used to generate chat completions. It takes in JSON data with the following parameters:
+This endpoint is used to generate chat completions for 50+ supported LLM API models. Use llama2, GPT-4, Claude2, etc.
 
-- `model` (string, required): ID of the model to use for chat completions. Refer to the model endpoint compatibility table for supported models.
+#### Input
+This API endpoint accepts all inputs in raw JSON and expects the following inputs:
+- `model` (string, required): ID of the model to use for chat completions. See all supported models [here](https://litellm.readthedocs.io/en/latest/supported/),
+  e.g. `gpt-3.5-turbo`, `gpt-4`, `claude-2`, `command-nightly`, `stabilityai/stablecode-completion-alpha-3b-4k`
 - `messages` (array, required): A list of messages representing the conversation context. Each message should have a `role` (system, user, assistant, or function), `content` (message text), and `name` (for function role).
-- Additional parameters for controlling completions, such as `temperature`, `top_p`, `n`, etc.
+- Additional Optional parameters: `temperature`, `functions`, `function_call`, `top_p`, `n`, `stream`. See the full list of supported inputs here: https://litellm.readthedocs.io/en/latest/input/
 
-Example JSON payload:
+#### Example JSON body
+For claude-2
 ```json
 {
-"model": "gpt-3.5-turbo",
-"messages": [
-    {"role": "system", "content": "You are a helpful assistant."},
-    {"role": "user", "content": "Knock knock."},
-    {"role": "assistant", "content": "Who's there?"},
-    {"role": "user", "content": "Orange."}
-],
-"temperature": 0.8
+  "model": "claude-2",
+  "messages": [
+    {
+      "content": "Hello, whats the weather in San Francisco??",
+      "role": "user"
+    }
+  ]
+
 }
 ```
+
+### Making an API request to the Proxy Server
+```python
+import requests
+import json
+
+# TODO: use your URL
+url = "http://localhost:5000/chat/completions"
+
+payload = json.dumps({
+  "model": "gpt-3.5-turbo",
+  "messages": [
+    {
+      "content": "Hello, whats the weather in San Francisco??",
+      "role": "user"
+    }
+  ]
+})
+headers = {
+  'Content-Type': 'application/json'
+}
+response = requests.request("POST", url, headers=headers, data=payload)
+print(response.text)
+
+```
+
+### Output [Response Format]
+All responses from the server are returned in the following format (for all LLM models). More info on output here: https://litellm.readthedocs.io/en/latest/output/
+```json
+{
+  "choices": [
+    {
+      "finish_reason": "stop",
+      "index": 0,
+      "message": {
+        "content": "I'm sorry, but I don't have the capability to provide real-time weather information. However, you can easily check the weather in San Francisco by searching online or using a weather app on your phone.",
+        "role": "assistant"
+      }
+    }
+  ],
+  "created": 1691790381,
+  "id": "chatcmpl-7mUFZlOEgdohHRDx2UpYPRTejirzb",
+  "model": "gpt-3.5-turbo-0613",
+  "object": "chat.completion",
+  "usage": {
+    "completion_tokens": 41,
+    "prompt_tokens": 16,
+    "total_tokens": 57
+  }
+}
+```
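Because every model comes back in this same OpenAI-style shape, the reply text can be read the same way for any backend. A small sketch continuing the `requests` example above:

```python
# Continuing the requests example above: the reply text lives at the same path
# for every model the proxy serves.
reply = response.json()["choices"][0]["message"]["content"]
print(reply)
```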
-## Input Parameters
-model: ID of the language model to use.
-messages: An array of messages representing the conversation context.
-role: The role of the message author (system, user, assistant, or function).
-content: The content of the message.
-name: The name of the author (required for function role).
-function_call: The name and arguments of a function to call.
-functions: A list of functions the model may generate JSON inputs for.
-Various other parameters for controlling completion behavior.
-Supported Models
-The proxy server supports the following models:
-OpenAI Chat Completion Models:
-gpt-4
-gpt-4-0613
-gpt-4-32k
-...
-OpenAI Text Completion Models:
-text-davinci-003
-Cohere Models:
-command-nightly
-command
-...
-Anthropic Models:
-claude-2
-claude-instant-1
-...
-Replicate Models:
-replicate/
-OpenRouter Models:
-google/palm-2-codechat-bison
-google/palm-2-chat-bison
-...
-Vertex Models:
-chat-bison
-chat-bison@001
-<<<<<<< HEAD
-Refer to the model endpoint compatibility table for more details.
-=======
-Refer to the model endpoint compatibility table for more details.
->>>>>>> d1ff082 (new v litellm for render)
+
+
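For reference, a minimal sketch of exercising the ChromaDB-backed cache helpers defined in `main.py`. It assumes `chromadb` is installed, that `main.py` imports cleanly in your environment, and that the 0.5 threshold is just an illustrative value:

```python
# Minimal sketch: store one response in the semantic cache, then look it up.
# Assumes add_cache/get_cache from main.py are importable and chromadb is installed.
from main import add_cache, get_cache

messages = [{"role": "user", "content": "Hello, whats the weather in San Francisco??"}]

# Cache a (user question, model response) pair
add_cache(messages, "I can't check live weather, but San Francisco is often foggy.")

# The same (or a semantically similar) question should now hit the cache
cached = get_cache(messages, similarity_threshold=0.5)  # 0.5 is an illustrative threshold
print(cached)  # cached response string, or None on a cache miss
```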