# litellm/cookbook/codellama-server/main.py

import traceback
from flask import Flask, request, jsonify, abort, Response
from flask_cors import CORS
import traceback
import litellm
from util import handle_error
from litellm import completion
import os, dotenv, time
import json

# Load API keys and other settings from a local .env file into os.environ.
dotenv.load_dotenv()

# TODO: set your keys in .env or here:
# os.environ["OPENAI_API_KEY"] = "" # set your openai key here
# os.environ["ANTHROPIC_API_KEY"] = "" # set your anthropic key here
# os.environ["TOGETHER_AI_API_KEY"] = "" # set your together ai key here
# see supported models / keys here: https://litellm.readthedocs.io/en/latest/supported/
######### ENVIRONMENT VARIABLES ##########
verbose = True  # when True, print_verbose() writes its argument to stdout

# litellm.caching_with_models = True # CACHING: caching_with_models Keys in the cache are messages + model. - to learn more: https://docs.litellm.ai/docs/caching/
######### PROMPT LOGGING ##########
# Use setdefault rather than plain assignment so that a PROMPTLAYER_API_KEY
# already present in the environment (exported, or loaded from .env just
# above) is not clobbered by the empty placeholder.
os.environ.setdefault(
    "PROMPTLAYER_API_KEY", ""
)  # set your promptlayer key here - https://promptlayer.com/

# set callbacks
litellm.success_callback = ["promptlayer"]
############ HELPER FUNCTIONS ###################################


def print_verbose(print_statement):
    """Print ``print_statement`` only when the module-level ``verbose`` flag is truthy."""
    if not verbose:
        return
    print(print_statement)


# Flask application object; the @app.route decorators in this file register
# their handlers on it, and it is what gets served in the __main__ block.
app = Flask(__name__)
# Wrap the app with flask_cors using its default settings.
CORS(app)


@app.route("/")
def index():
    """Root endpoint: always replies with a fixed body and HTTP 200."""
    body, status = "received!", 200
    return body, status


def data_generator(response):
    """Lazily turn each item of ``response`` into a server-sent-event frame.

    Every item is JSON-encoded and emitted as ``data: <json>`` followed by a
    blank line, which is the framing used for ``text/event-stream`` bodies.
    """
    for piece in response:
        payload = json.dumps(piece)
        yield "data: " + payload + "\n\n"


@app.route("/chat/completions", methods=["POST"])
def api_completion():
    """Answer a coding prompt with Together AI's CodeLlama model.

    Reads the JSON request body, which must contain a ``prompt`` key. The
    prompt is wrapped in a system + user message pair, and every remaining key
    of the body is forwarded to ``completion`` as a keyword argument. The
    ``model`` key is always overwritten, so callers cannot pick another model.

    Returns:
        A ``text/event-stream`` response when ``stream`` is true (the string
        ``"True"`` is accepted as well), otherwise whatever ``completion``
        returned. If anything inside the ``try`` raises, the result of
        ``handle_error(data=data)`` is returned instead.
    """
    data = request.json
    if data.get("stream") == "True":
        data["stream"] = True  # convert to boolean
    try:
        if "prompt" not in data:
            raise ValueError("data needs to have prompt")
        # Always use Together AI's CodeLlama model, regardless of what the
        # caller sent - https://api.together.xyz/playground/chat?model=togethercomputer%2FCodeLlama-34b-Instruct
        data["model"] = "togethercomputer/CodeLlama-34b-Instruct"
        # COMPLETION CALL
        system_prompt = "Only respond to questions about code. Say 'I don't know' to anything outside of that."
        data["messages"] = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": data.pop("prompt")},
        ]
        print_verbose(f"data: {data}")
        response = completion(**data)
        # `==` (not `is`) is kept so a JSON `1` is still treated as streaming.
        if data.get("stream") == True:  # noqa: E712
            # Stream the chunks back as server-sent events.
            return Response(data_generator(response), mimetype="text/event-stream")
        return response
    except Exception:
        # Log the traceback once, then let the shared handler build the reply.
        print_verbose(f"Got Error api_completion(): {traceback.format_exc()}")
        return handle_error(data=data)


@app.route("/get_models", methods=["POST"])
def get_models():
    """Return ``litellm.model_list`` as a JSON response.

    The list is passed through ``jsonify`` instead of being returned bare,
    because Flask only accepts a plain list as a view return value from
    version 2.2 onwards.

    Returns:
        ``(json_response, 200)`` on success, or a JSON ``{"error": message}``
        body with HTTP 500 if reading or serializing the list raises.
    """
    try:
        return jsonify(litellm.model_list), 200
    except Exception as e:
        traceback.print_exc()
        # Report the failure with an error status rather than a misleading 200.
        return jsonify({"error": str(e)}), 500


if __name__ == "__main__":
    # Imported here so waitress is only needed when the file is run as a
    # script, not when `app` is imported by another module.
    import waitress

    # Listen on all interfaces, port 4000, with a pool of 500 worker threads.
    waitress.serve(app, host="0.0.0.0", port=4000, threads=500)