diff --git a/cookbook/codellama-server/README.MD b/cookbook/codellama-server/README.MD
new file mode 100644
index 000000000..e410426ca
--- /dev/null
+++ b/cookbook/codellama-server/README.MD
@@ -0,0 +1,154 @@
+# CodeLlama Server: Streaming, Caching, Model Fallbacks (OpenAI + Anthropic), Prompt-tracking
+
+Works with: Anthropic, Huggingface, Cohere, TogetherAI, Azure, OpenAI, etc.
+
+[![PyPI Version](https://img.shields.io/pypi/v/litellm.svg)](https://pypi.org/project/litellm/)
+[![PyPI Version](https://img.shields.io/badge/stable%20version-v0.1.345-blue?color=green&link=https://pypi.org/project/litellm/0.1.1/)](https://pypi.org/project/litellm/0.1.1/)
+![Downloads](https://img.shields.io/pypi/dm/litellm)
+
+[![Deploy on Railway](https://railway.app/button.svg)](https://railway.app/template/HuDPw-?referralCode=jch2ME)
+
+**LIVE DEMO** - https://litellm.ai/playground
+
+## What does the CodeLlama Server do?
+
+- Uses Together AI's CodeLlama to answer coding questions, with GPT-4 + Claude-2 as backups (you can easily switch this to any model from Huggingface, Replicate, Cohere, AI21, Azure, OpenAI, etc.)
+- Sets a default system prompt for guardrails: `system_prompt = "Only respond to questions about code. Say 'I don't know' to anything outside of that."`
+- Integrates with PromptLayer for model + prompt tracking
+- Example output
+
+  ![Code Output](imgs/code-output.png)
+
+- **Consistent Input/Output Format**
+  - Call all models using the OpenAI format - `completion(model, messages)`
+  - Text responses will always be available at `['choices'][0]['message']['content']`
+  - Stream responses will always be available at `['choices'][0]['delta']['content']`
+- **Error Handling** - Uses model fallbacks (if `CodeLlama` fails, try `GPT-4`) with cooldowns and retries
+- **Prompt Logging** - Logs successful completions to PromptLayer for testing + iterating on your prompts in production! (Learn more: https://litellm.readthedocs.io/en/latest/advanced/)
+
+  **Example: Logs sent to PromptLayer**
+
+  ![Prompt Logging](imgs/promptlayer_logging.png)
+
+- **Token Usage & Spend** - Tracks input + completion tokens used and spend per model - https://docs.litellm.ai/docs/token_usage
+- **Caching** - Provides an in-memory cache plus GPT-Cache integration for more advanced usage - https://docs.litellm.ai/docs/caching/gpt_cache
+- **Streaming & Async Support** - Returns generators to stream text responses - TEST IT 👉 https://litellm.ai/
+
+## API Endpoints
+
+### `/chat/completions` (POST)
+
+This endpoint generates chat completions for 50+ supported LLM APIs. Use Llama2, GPT-4, Claude-2, etc.
+
+#### Input
+
+This endpoint accepts its input as raw JSON and expects the following fields:
+
+- `prompt` (string, required): The user's coding-related question
+- Additional optional parameters: `temperature`, `functions`, `function_call`, `top_p`, `n`, `stream`. See the full list of supported inputs here: https://litellm.readthedocs.io/en/latest/input/
+
+#### Example JSON body
+
+```json
+{
+  "prompt": "write me a function to print hello world"
+}
+```
+
+### Making an API request to the Code-Gen Server
+
+```python
+import requests
+import json
+
+url = "http://localhost:4000/chat/completions"
+
+payload = json.dumps({
+    "prompt": "write me a function to print hello world"
+})
+headers = {
+    'Content-Type': 'application/json'
+}
+
+response = requests.request("POST", url, headers=headers, data=payload)
+
+print(response.text)
+```
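+
+The server also accepts the optional `stream` parameter. Below is a minimal client sketch for consuming a streamed response; it assumes the proxy emits server-sent events whose chunks follow the format described in this README, with partial text under `['choices'][0]['delta']['content']` (the exact chunk shape can vary by model/provider).
+
+```python
+import requests
+import json
+
+url = "http://localhost:4000/chat/completions"
+
+payload = json.dumps({
+    "prompt": "write me a function to print hello world",
+    "stream": True
+})
+headers = {
+    'Content-Type': 'application/json'
+}
+
+# consume the server-sent events as they arrive
+with requests.post(url, headers=headers, data=payload, stream=True) as response:
+    for line in response.iter_lines():
+        if not line:
+            continue  # skip the blank lines between events
+        chunk = json.loads(line.decode("utf-8").removeprefix("data: "))  # Python 3.9+
+        delta = chunk["choices"][0].get("delta", {})
+        print(delta.get("content", ""), end="", flush=True)
+```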
+
+### Output [Response Format]
+
+All responses from the server are returned in the following format (for all LLM models). More info on output here: https://litellm.readthedocs.io/en/latest/output/
+
+```json
+{
+  "choices": [
+    {
+      "finish_reason": "stop",
+      "index": 0,
+      "message": {
+        "content": ".\n\n```\ndef print_hello_world():\n print(\"hello world\")\n",
+        "role": "assistant"
+      }
+    }
+  ],
+  "created": 1693279694.6474009,
+  "model": "togethercomputer/CodeLlama-34b-Instruct",
+  "usage": {
+    "completion_tokens": 14,
+    "prompt_tokens": 28,
+    "total_tokens": 42
+  }
+}
+```
+
+## Installation & Usage
+
+### Running Locally
+
+1. Clone the liteLLM repository to your local machine:
+   ```
+   git clone https://github.com/BerriAI/litellm-CodeGen-proxy
+   ```
+2. Install the required dependencies using pip:
+   ```
+   pip install -r requirements.txt
+   ```
+3. Set your LLM API keys, either in code:
+   ```
+   os.environ['OPENAI_API_KEY'] = "YOUR_API_KEY"
+   ```
+   or by setting `OPENAI_API_KEY` in your .env file
+4. Run the server:
+   ```
+   python main.py
+   ```
+
+## Deploying
+
+1. Quick Start: Deploy on Railway
+
+   [![Deploy on Railway](https://railway.app/button.svg)](https://railway.app/template/HuDPw-?referralCode=jch2ME)
+
+2. `GCP`, `AWS`, `Azure`
+   This project includes a `Dockerfile`, so you can build a Docker image and deploy it to any of these providers.
+
+# Support / Talk with founders
+
+- [Our calendar 👋](https://calendly.com/d/4mp-gd3-k5k/berriai-1-1-onboarding-litellm-hosted-version)
+- [Community Discord 💭](https://discord.gg/wuPM9dRgDw)
+- Our numbers 📞 +1 (770) 8783-106 / +1 (412) 618-6238
+- Our emails ✉️ ishaan@berri.ai / krrish@berri.ai
+
+## Roadmap
+
+- [ ] Implement user-based rate-limiting
+- [ ] Spending controls per project - expose a key creation endpoint
+- [ ] Store a keys DB mapping created keys to their alias (i.e. project name)
+- [ ] Easily add new models as backups / as the entry point (add this to the available model list)
diff --git a/cookbook/codellama-server/imgs/code-output.png b/cookbook/codellama-server/imgs/code-output.png
new file mode 100644
index 000000000..67e298bd3
Binary files /dev/null and b/cookbook/codellama-server/imgs/code-output.png differ
diff --git a/cookbook/codellama-server/imgs/promptlayer_logging.png b/cookbook/codellama-server/imgs/promptlayer_logging.png
new file mode 100644
index 000000000..26b046ac4
Binary files /dev/null and b/cookbook/codellama-server/imgs/promptlayer_logging.png differ
diff --git a/cookbook/codellama-server/main.py b/cookbook/codellama-server/main.py
new file mode 100644
index 000000000..51627ceca
--- /dev/null
+++ b/cookbook/codellama-server/main.py
@@ -0,0 +1,84 @@
+import traceback
+from flask import Flask, request, Response
+from flask_cors import CORS
+import litellm
+from util import handle_error
+from litellm import completion
+import os, dotenv, time
+import json
+
+dotenv.load_dotenv()
+
+# TODO: set your keys in .env or here:
+# os.environ["OPENAI_API_KEY"] = ""  # set your openai key here
+# os.environ["ANTHROPIC_API_KEY"] = ""  # set your anthropic key here
+# os.environ["TOGETHER_AI_API_KEY"] = ""  # set your together ai key here
+# see supported models / keys here: https://litellm.readthedocs.io/en/latest/supported/
+
+######### ENVIRONMENT VARIABLES ##########
+verbose = True
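+
+############ MODEL FALLBACKS (illustrative sketch) ############
+# The README advertises model fallbacks (CodeLlama -> GPT-4 -> Claude-2) with
+# cooldowns and retries. The helper below is a minimal, hand-rolled sketch of
+# the fallback idea only (no cooldown/retry bookkeeping); it is not called
+# anywhere and relies solely on litellm's `completion(model, messages)` call.
+fallback_models = ["togethercomputer/CodeLlama-34b-Instruct", "gpt-4", "claude-2"]
+
+def completion_with_model_fallbacks(messages, models=fallback_models, **kwargs):
+    last_error = ValueError("no models provided")
+    for model in models:
+        try:
+            return completion(model=model, messages=messages, **kwargs)
+        except Exception as e:  # on any failure, fall through to the next model
+            last_error = e
+    raise last_error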
+
+# CACHING: uncomment to enable caching; keys in the cache are messages + model
+# litellm.caching_with_models = True
+# to learn more: https://docs.litellm.ai/docs/caching/
+
+######### PROMPT LOGGING ##########
+os.environ["PROMPTLAYER_API_KEY"] = ""  # set your promptlayer key here - https://promptlayer.com/
+
+# set callbacks
+litellm.success_callback = ["promptlayer"]
+
+############ HELPER FUNCTIONS ###################################
+
+def print_verbose(print_statement):
+    if verbose:
+        print(print_statement)
+
+app = Flask(__name__)
+CORS(app)
+
+@app.route('/')
+def index():
+    return 'received!', 200
+
+def data_generator(response):
+    # stream each chunk back as a server-sent event
+    for chunk in response:
+        yield f"data: {json.dumps(chunk)}\n\n"
+
+@app.route('/chat/completions', methods=["POST"])
+def api_completion():
+    data = request.json
+    start_time = time.time()
+    if isinstance(data.get('stream'), str):
+        data['stream'] = data['stream'].lower() == "true"  # convert "True"/"true" to a boolean
+    try:
+        if "prompt" not in data:
+            raise ValueError("data needs to have prompt")
+        data["model"] = "togethercomputer/CodeLlama-34b-Instruct"  # by default use Together AI's CodeLlama model - https://api.together.xyz/playground/chat?model=togethercomputer%2FCodeLlama-34b-Instruct
+        # COMPLETION CALL
+        system_prompt = "Only respond to questions about code. Say 'I don't know' to anything outside of that."
+        messages = [{"role": "system", "content": system_prompt}, {"role": "user", "content": data.pop("prompt")}]
+        data["messages"] = messages
+        print_verbose(f"data: {data}")
+        response = completion(**data)
+        ## LOG SUCCESS
+        end_time = time.time()
+        if 'stream' in data and data['stream'] == True:  # stream the response back as server-sent events
+            return Response(data_generator(response), mimetype='text/event-stream')
+    except Exception as e:
+        # call the handle_error function from util.py
+        print_verbose(f"Got Error api_completion(): {traceback.format_exc()}")
+        ## LOG FAILURE
+        end_time = time.time()
+        return handle_error(data=data)
+    return response
+
+@app.route('/get_models', methods=["POST"])
+def get_models():
+    try:
+        return litellm.model_list
+    except Exception as e:
+        traceback.print_exc()
+        response = {"error": str(e)}
+        return response, 200
+
+if __name__ == "__main__":
+    from waitress import serve
+    serve(app, host="0.0.0.0", port=4000, threads=500)
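+
+############ QUICK SMOKE TEST (illustrative) ############
+# Not part of the server itself. With the server running locally on port 4000,
+# the documented response format can be checked like this:
+#
+#   import requests
+#   resp = requests.post(
+#       "http://localhost:4000/chat/completions",
+#       json={"prompt": "write me a function to print hello world"},
+#   )
+#   print(resp.json()["choices"][0]["message"]["content"])
+#   print(resp.json()["usage"])  # prompt/completion/total token counts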