diff --git a/cookbook/codellama-server/README.MD b/cookbook/codellama-server/README.MD
new file mode 100644
index 000000000..e410426ca
--- /dev/null
+++ b/cookbook/codellama-server/README.MD
@@ -0,0 +1,154 @@
+# CodeLlama Server: Streaming, Caching, Model Fallbacks (OpenAI + Anthropic), Prompt-tracking
+
+Works with: Anthropic, Huggingface, Cohere, TogetherAI, Azure, OpenAI, etc.
+
+[litellm on PyPI](https://pypi.org/project/litellm/)
+[litellm 0.1.1 on PyPI](https://pypi.org/project/litellm/0.1.1/)
+
+
+[Deploy on Railway](https://railway.app/template/HuDPw-?referralCode=jch2ME)
+
+**LIVE DEMO** - https://litellm.ai/playground
+
+## What does the CodeLlama Server do?
+
+- Uses Together AI's CodeLlama to answer coding questions, with GPT-4 + Claude-2 as backups (you can easily switch this to any model from Huggingface, Replicate, Cohere, AI21, Azure, OpenAI, etc.)
+- Sets default system prompt for guardrails `system_prompt = "Only respond to questions about code. Say 'I don't know' to anything outside of that."`
+- Integrates with PromptLayer for model + prompt tracking
+- Example output
+
+![Example output](imgs/code-output.png)
+
+- **Consistent Input/Output** Format
+ - Call all models using the OpenAI format - `completion(model, messages)`
+ - Text responses will always be available at `['choices'][0]['message']['content']`
+ - Stream responses will always be available at `['choices'][0]['delta']['content']`
+- **Error Handling** - Uses model fallbacks (if `CodeLlama` fails, try `GPT-4`) with cooldowns and retries - see the sketch after this list
+- **Prompt Logging** - Log successful completions to PromptLayer for testing + iterating on your prompts in production! (Learn more: https://litellm.readthedocs.io/en/latest/advanced/)
+
+ **Example: Logs sent to PromptLayer**
+
+  ![Logs in PromptLayer](imgs/promptlayer_logging.png)
+
+
+- **Token Usage & Spend** - Track Input + Completion tokens used + Spend/model - https://docs.litellm.ai/docs/token_usage
+- **Caching** - Provides in-memory cache + GPT-Cache integration for more advanced usage - https://docs.litellm.ai/docs/caching/gpt_cache
+
+- **Streaming & Async Support** - Return generators to stream text responses - TEST IT 👉 https://litellm.ai/
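+
+As a minimal sketch (not this server's exact code), the fallback + consistent-output pattern above can be wired up with `litellm.completion`; the helper name `completion_with_fallbacks` and the retry policy here are illustrative:
+
+```python
+import litellm
+from litellm import completion
+
+# litellm.caching_with_models = True  # optional: cache responses keyed on messages + model
+
+fallback_models = [
+    "togethercomputer/CodeLlama-34b-Instruct",  # primary
+    "gpt-4",                                    # backup 1
+    "claude-2",                                 # backup 2
+]
+
+messages = [
+    {"role": "system", "content": "Only respond to questions about code. Say 'I don't know' to anything outside of that."},
+    {"role": "user", "content": "write me a function to print hello world"},
+]
+
+def completion_with_fallbacks(messages):
+    last_error = None
+    for model in fallback_models:
+        try:
+            response = completion(model=model, messages=messages)
+            # same OpenAI-style shape for every provider
+            return response["choices"][0]["message"]["content"]
+        except Exception as e:  # add cooldowns / retries as needed
+            last_error = e
+    raise last_error
+
+print(completion_with_fallbacks(messages))
+```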
+
+## API Endpoints
+
+### `/chat/completions` (POST)
+
+This endpoint generates chat completions for 50+ supported LLM API models. Use Llama2, GPT-4, Claude-2, etc.
+
+#### Input
+
+This endpoint accepts a raw JSON body with the following fields:
+
+- `prompt` (string, required): The user's coding-related question
+- Additional optional parameters: `temperature`, `functions`, `function_call`, `top_p`, `n`, `stream`. See the full list of supported inputs here: https://litellm.readthedocs.io/en/latest/input/
+
+#### Example JSON body
+
+A minimal request body only needs a `prompt`; the server sets the model (CodeLlama by default):
+
+```json
+{
+ "prompt": "write me a function to print hello world"
+}
+```
+
+### Making an API request to the Code-Gen Server
+
+```python
+import requests
+import json
+
+url = "localhost:4000/chat/completions"
+
+payload = json.dumps({
+ "prompt": "write me a function to print hello world"
+})
+headers = {
+ 'Content-Type': 'application/json'
+}
+
+response = requests.request("POST", url, headers=headers, data=payload)
+
+print(response.text)
+
+```
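+
+Optional parameters such as `stream` and `temperature` go in the same payload. Below is a rough sketch of consuming a streaming response; the server emits Server-Sent Events (one JSON chunk per `data:` line), and the exact chunk fields depend on the underlying model:
+
+```python
+import requests
+import json
+
+url = "http://localhost:4000/chat/completions"
+
+payload = json.dumps({
+    "prompt": "write me a function to print hello world",
+    "stream": True,
+    "temperature": 0.3
+})
+headers = {'Content-Type': 'application/json'}
+
+with requests.post(url, headers=headers, data=payload, stream=True) as response:
+    for line in response.iter_lines():
+        if line and line.startswith(b"data: "):
+            chunk = json.loads(line[len(b"data: "):])
+            # streamed text arrives at ['choices'][0]['delta']['content']
+            print(chunk["choices"][0].get("delta", {}).get("content", ""), end="")
+```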
+
+### Output [Response Format]
+
+All responses from the server are returned in the following format (for all LLM models). More info on output here: https://litellm.readthedocs.io/en/latest/output/
+
+```json
+{
+ "choices": [
+ {
+ "finish_reason": "stop",
+ "index": 0,
+ "message": {
+ "content": ".\n\n```\ndef print_hello_world():\n print(\"hello world\")\n",
+ "role": "assistant"
+ }
+ }
+ ],
+ "created": 1693279694.6474009,
+ "model": "togethercomputer/CodeLlama-34b-Instruct",
+ "usage": {
+ "completion_tokens": 14,
+ "prompt_tokens": 28,
+ "total_tokens": 42
+ }
+}
+```
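+
+For example, with the `requests` call above, the completion text can be read straight out of this JSON:
+
+```python
+completion_text = response.json()["choices"][0]["message"]["content"]
+print(completion_text)
+```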
+
+## Installation & Usage
+
+### Running Locally
+
+1. Clone the liteLLM CodeGen proxy repository to your local machine:
+ ```
+ git clone https://github.com/BerriAI/litellm-CodeGen-proxy
+ ```
+2. Install the required dependencies using pip
+ ```
+   pip install -r requirements.txt
+ ```
+3. Set your LLM API keys
+ ```
+   os.environ['OPENAI_API_KEY'] = "YOUR_API_KEY"
+ or
+ set OPENAI_API_KEY in your .env file
+ ```
+4. Run the server:
+ ```
+ python main.py
+ ```
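+
+Once the server is running, a quick sanity check (assuming the default host/port from `main.py`):
+
+```python
+import requests
+
+print(requests.get("http://localhost:4000/").text)  # should print "received!"
+```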
+
+## Deploying
+
+1. Quick Start: Deploy on Railway
+
+   [Deploy on Railway](https://railway.app/template/HuDPw-?referralCode=jch2ME)
+
+2. `GCP`, `AWS`, `Azure`
+   This project includes a `Dockerfile`, so you can build a Docker image and deploy it on the cloud provider of your choice.
+
+# Support / Talk with founders
+
+- [Our calendar 👋](https://calendly.com/d/4mp-gd3-k5k/berriai-1-1-onboarding-litellm-hosted-version)
+- [Community Discord 💭](https://discord.gg/wuPM9dRgDw)
+- Our numbers 📞 +1 (770) 8783-106 / +1 (412) 618-6238
+- Our emails ✉️ ishaan@berri.ai / krrish@berri.ai
+
+## Roadmap
+
+- [ ] Implement user-based rate-limiting
+- [ ] Spending controls per project - expose key creation endpoint
+- [ ] Store keys in a DB, mapping each created key to its alias (i.e. project name)
+- [ ] Easily add new models as backups / as the entry-point (add this to the available model list)
diff --git a/cookbook/codellama-server/imgs/code-output.png b/cookbook/codellama-server/imgs/code-output.png
new file mode 100644
index 000000000..67e298bd3
Binary files /dev/null and b/cookbook/codellama-server/imgs/code-output.png differ
diff --git a/cookbook/codellama-server/imgs/promptlayer_logging.png b/cookbook/codellama-server/imgs/promptlayer_logging.png
new file mode 100644
index 000000000..26b046ac4
Binary files /dev/null and b/cookbook/codellama-server/imgs/promptlayer_logging.png differ
diff --git a/cookbook/codellama-server/main.py b/cookbook/codellama-server/main.py
new file mode 100644
index 000000000..51627ceca
--- /dev/null
+++ b/cookbook/codellama-server/main.py
@@ -0,0 +1,84 @@
+import traceback
+from flask import Flask, request, jsonify, abort, Response
+from flask_cors import CORS
+import litellm
+from util import handle_error
+from litellm import completion
+import os, dotenv, time
+import json
+dotenv.load_dotenv()
+
+# TODO: set your keys in .env or here:
+# os.environ["OPENAI_API_KEY"] = "" # set your openai key here
+# os.environ["ANTHROPIC_API_KEY"] = "" # set your anthropic key here
+# os.environ["TOGETHER_AI_API_KEY"] = "" # set your together ai key here
+# see supported models / keys here: https://litellm.readthedocs.io/en/latest/supported/
+######### ENVIRONMENT VARIABLES ##########
+verbose = True
+
+# litellm.caching_with_models = True # CACHING: caching_with_models Keys in the cache are messages + model. - to learn more: https://docs.litellm.ai/docs/caching/
+######### PROMPT LOGGING ##########
+os.environ["PROMPTLAYER_API_KEY"] = "" # set your promptlayer key here - https://promptlayer.com/
+
+# set callbacks
+litellm.success_callback = ["promptlayer"]
+############ HELPER FUNCTIONS ###################################
+
+def print_verbose(print_statement):
+ if verbose:
+ print(print_statement)
+
+app = Flask(__name__)
+CORS(app)
+
+@app.route('/')
+def index():
+ return 'received!', 200
+
+def data_generator(response):
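+    # stream each chunk to the client as a Server-Sent Events (SSE) message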
+ for chunk in response:
+ yield f"data: {json.dumps(chunk)}\n\n"
+
+@app.route('/chat/completions', methods=["POST"])
+def api_completion():
+ data = request.json
+ start_time = time.time()
+ if data.get('stream') == "True":
+ data['stream'] = True # convert to boolean
+ try:
+ if "prompt" not in data:
+ raise ValueError("data needs to have prompt")
+ data["model"] = "togethercomputer/CodeLlama-34b-Instruct" # by default use Together AI's CodeLlama model - https://api.together.xyz/playground/chat?model=togethercomputer%2FCodeLlama-34b-Instruct
+ # COMPLETION CALL
+ system_prompt = "Only respond to questions about code. Say 'I don't know' to anything outside of that."
+ messages = [{"role": "system", "content": system_prompt}, {"role": "user", "content": data.pop("prompt")}]
+ data["messages"] = messages
+ print(f"data: {data}")
+ response = completion(**data)
+ ## LOG SUCCESS
+ end_time = time.time()
+ if 'stream' in data and data['stream'] == True: # use generate_responses to stream responses
+ return Response(data_generator(response), mimetype='text/event-stream')
+ except Exception as e:
+ # call handle_error function
+ print_verbose(f"Got Error api_completion(): {traceback.format_exc()}")
+ ## LOG FAILURE
+ end_time = time.time()
+ traceback_exception = traceback.format_exc()
+ return handle_error(data=data)
+ return response
+
+@app.route('/get_models', methods=["POST"])
+def get_models():
+ try:
+        return jsonify(litellm.model_list)
+ except Exception as e:
+ traceback.print_exc()
+ response = {"error": str(e)}
+ return response, 200
+
+if __name__ == "__main__":
+ from waitress import serve
+ serve(app, host="0.0.0.0", port=4000, threads=500)
+