diff --git a/.circleci/config.yml b/.circleci/config.yml index d563b8c17..61734d78a 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -36,6 +36,7 @@ jobs: pip install appdirs pip install langchain pip install numpydoc + pip install traceloop-sdk==0.0.69 - save_cache: paths: - ./venv diff --git a/.flake8 b/.flake8 new file mode 100644 index 000000000..b51cc0045 --- /dev/null +++ b/.flake8 @@ -0,0 +1,2 @@ +[flake8] +ignore = E,F,W,B,B9,C,D,I,N,S,W503,W504,E203, TCE,TCA,EXE999,E999,TD \ No newline at end of file diff --git a/.gitignore b/.gitignore index 313241e4c..e3e1bee69 100644 --- a/.gitignore +++ b/.gitignore @@ -13,3 +13,4 @@ litellm/proxy/litellm_secrets.toml litellm/proxy/api_log.json .idea/ router_config.yaml +litellm_server/config.yaml diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 000000000..8bda916bc --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,8 @@ +repos: +- repo: https://github.com/pycqa/flake8 + rev: 3.8.4 # The version of flake8 to use + hooks: + - id: flake8 + exclude: ^litellm/tests/|^litellm/proxy/|^litellm/integrations/ + additional_dependencies: [flake8-print] + files: litellm/.*\.py \ No newline at end of file diff --git a/Dockerfile b/Dockerfile index 30d78eb18..179629c9a 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,11 +1,5 @@ FROM python:3.10 -# Define a build argument for the config file path -ARG CONFIG_FILE - -# Copy the custom config file (if provided) into the Docker image -COPY $CONFIG_FILE /app/config.yaml - COPY . /app WORKDIR /app RUN pip install -r requirements.txt diff --git a/README.md b/README.md index 31c7d85b0..db51b85d6 100644 --- a/README.md +++ b/README.md @@ -5,22 +5,7 @@

Call all LLM APIs using the OpenAI format [Bedrock, Huggingface, Cohere, TogetherAI, Azure, OpenAI, etc.]

-LiteLLM Server
+Evaluate LLMs → OpenAI-Compatible Server

PyPI Version @@ -79,6 +64,7 @@ print(response) liteLLM supports streaming the model response back, pass `stream=True` to get a streaming iterator in response. Streaming is supported for all models (Bedrock, Huggingface, TogetherAI, Azure, OpenAI, etc.) ```python +from litellm import completion response = completion(model="gpt-3.5-turbo", messages=messages, stream=True) for chunk in response: print(chunk['choices'][0]['delta']) @@ -89,31 +75,18 @@ for chunk in result: print(chunk['choices'][0]['delta']) ``` -## Supported Provider ([Docs](https://docs.litellm.ai/docs/providers)) -| Provider | [Completion](https://docs.litellm.ai/docs/#basic-usage) | [Streaming](https://docs.litellm.ai/docs/completion/stream#streaming-responses) | [Async Completion](https://docs.litellm.ai/docs/completion/stream#async-completion) | [Async Streaming](https://docs.litellm.ai/docs/completion/stream#async-streaming) | -| ------------- | ------------- | ------------- | ------------- | ------------- | -| [openai](https://docs.litellm.ai/docs/providers/openai) | ✅ | ✅ | ✅ | ✅ | -| [cohere](https://docs.litellm.ai/docs/providers/cohere) | ✅ | ✅ | ✅ | ✅ | -| [anthropic](https://docs.litellm.ai/docs/providers/anthropic) | ✅ | ✅ | ✅ | ✅ | -| [replicate](https://docs.litellm.ai/docs/providers/replicate) | ✅ | ✅ | ✅ | ✅ | -| [huggingface](https://docs.litellm.ai/docs/providers/huggingface) | ✅ | ✅ | ✅ | ✅ | -| [together_ai](https://docs.litellm.ai/docs/providers/togetherai) | ✅ | ✅ | ✅ | ✅ | -| [openrouter](https://docs.litellm.ai/docs/providers/openrouter) | ✅ | ✅ | ✅ | ✅ | -| [vertex_ai](https://docs.litellm.ai/docs/providers/vertex) | ✅ | ✅ | ✅ | ✅ | -| [palm](https://docs.litellm.ai/docs/providers/palm) | ✅ | ✅ | ✅ | ✅ | -| [ai21](https://docs.litellm.ai/docs/providers/ai21) | ✅ | ✅ | ✅ | ✅ | -| [baseten](https://docs.litellm.ai/docs/providers/baseten) | ✅ | ✅ | ✅ | ✅ | -| [azure](https://docs.litellm.ai/docs/providers/azure) | ✅ | ✅ | ✅ | ✅ | -| [sagemaker](https://docs.litellm.ai/docs/providers/aws_sagemaker) | ✅ | ✅ | ✅ | ✅ | -| [bedrock](https://docs.litellm.ai/docs/providers/bedrock) | ✅ | ✅ | ✅ | ✅ | -| [vllm](https://docs.litellm.ai/docs/providers/vllm) | ✅ | ✅ | ✅ | ✅ | -| [nlp_cloud](https://docs.litellm.ai/docs/providers/nlp_cloud) | ✅ | ✅ | ✅ | ✅ | -| [aleph alpha](https://docs.litellm.ai/docs/providers/aleph_alpha) | ✅ | ✅ | ✅ | ✅ | -| [petals](https://docs.litellm.ai/docs/providers/petals) | ✅ | ✅ | ✅ | ✅ | -| [ollama](https://docs.litellm.ai/docs/providers/ollama) | ✅ | ✅ | ✅ | ✅ | -| [deepinfra](https://docs.litellm.ai/docs/providers/deepinfra) | ✅ | ✅ | ✅ | ✅ | +## Reliability - Fallback LLMs +Never fail a request using LiteLLM + +```python +from litellm import completion +# if gpt-4 fails, retry the request with gpt-3.5-turbo->command-nightly->claude-instant-1 +response = completion(model="gpt-4",messages=messages, fallbacks=["gpt-3.5-turbo", "command-nightly", "claude-instant-1"]) + +# if azure/gpt-4 fails, retry the request with fallback api_keys/api_base +response = completion(model="azure/gpt-4", messages=messages, api_key=api_key, fallbacks=[{"api_key": "good-key-1"}, {"api_key": "good-key-2", "api_base": "good-api-base-2"}]) +``` -[**Read the Docs**](https://docs.litellm.ai/docs/) ## Logging Observability - Log LLM Input/Output ([Docs](https://docs.litellm.ai/docs/observability/callbacks)) LiteLLM exposes pre defined callbacks to send data to LLMonitor, Langfuse, Helicone, Promptlayer, Traceloop, Slack ```python @@ -132,6 +105,35 @@ litellm.success_callback = ["promptlayer", "llmonitor"] # log 
input/output to pr response = completion(model="gpt-3.5-turbo", messages=[{"role": "user", "content": "Hi 👋 - i'm openai"}]) ``` + +## Supported Provider ([Docs](https://docs.litellm.ai/docs/providers)) +| Provider | [Completion](https://docs.litellm.ai/docs/#basic-usage) | [Streaming](https://docs.litellm.ai/docs/completion/stream#streaming-responses) | [Async Completion](https://docs.litellm.ai/docs/completion/stream#async-completion) | [Async Streaming](https://docs.litellm.ai/docs/completion/stream#async-streaming) | +| ------------- | ------------- | ------------- | ------------- | ------------- | +| [openai](https://docs.litellm.ai/docs/providers/openai) | ✅ | ✅ | ✅ | ✅ | +| [azure](https://docs.litellm.ai/docs/providers/azure) | ✅ | ✅ | ✅ | ✅ | +| [aws - sagemaker](https://docs.litellm.ai/docs/providers/aws_sagemaker) | ✅ | ✅ | ✅ | ✅ | +| [aws - bedrock](https://docs.litellm.ai/docs/providers/bedrock) | ✅ | ✅ | ✅ | ✅ | +| [cohere](https://docs.litellm.ai/docs/providers/cohere) | ✅ | ✅ | ✅ | ✅ | +| [anthropic](https://docs.litellm.ai/docs/providers/anthropic) | ✅ | ✅ | ✅ | ✅ | +| [huggingface](https://docs.litellm.ai/docs/providers/huggingface) | ✅ | ✅ | ✅ | ✅ | +| [replicate](https://docs.litellm.ai/docs/providers/replicate) | ✅ | ✅ | ✅ | ✅ | +| [together_ai](https://docs.litellm.ai/docs/providers/togetherai) | ✅ | ✅ | ✅ | ✅ | +| [openrouter](https://docs.litellm.ai/docs/providers/openrouter) | ✅ | ✅ | ✅ | ✅ | +| [google - vertex_ai](https://docs.litellm.ai/docs/providers/vertex) | ✅ | ✅ | ✅ | ✅ | +| [google - palm](https://docs.litellm.ai/docs/providers/palm) | ✅ | ✅ | ✅ | ✅ | +| [ai21](https://docs.litellm.ai/docs/providers/ai21) | ✅ | ✅ | ✅ | ✅ | +| [baseten](https://docs.litellm.ai/docs/providers/baseten) | ✅ | ✅ | ✅ | ✅ | +| [vllm](https://docs.litellm.ai/docs/providers/vllm) | ✅ | ✅ | ✅ | ✅ | +| [nlp_cloud](https://docs.litellm.ai/docs/providers/nlp_cloud) | ✅ | ✅ | ✅ | ✅ | +| [aleph alpha](https://docs.litellm.ai/docs/providers/aleph_alpha) | ✅ | ✅ | ✅ | ✅ | +| [petals](https://docs.litellm.ai/docs/providers/petals) | ✅ | ✅ | ✅ | ✅ | +| [ollama](https://docs.litellm.ai/docs/providers/ollama) | ✅ | ✅ | ✅ | ✅ | +| [deepinfra](https://docs.litellm.ai/docs/providers/deepinfra) | ✅ | ✅ | ✅ | ✅ | +| [perplexity-ai](https://docs.litellm.ai/docs/providers/perplexity) | ✅ | ✅ | ✅ | ✅ | +| [anyscale](https://docs.litellm.ai/docs/providers/anyscale) | ✅ | ✅ | ✅ | ✅ | + +[**Read the Docs**](https://docs.litellm.ai/docs/) + ## Contributing To contribute: Clone the repo locally -> Make a change -> Submit a PR with the change. 
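The new "Reliability - Fallback LLMs" README section above introduces the `fallbacks` parameter with a bare snippet. Below is a minimal, self-contained sketch of how it might be exercised; the API keys are placeholders, and the in-order retry behavior described in the comments is inferred from the README comments in this diff rather than verified against the implementation.

```python
import os
from litellm import completion

# Placeholder credentials -- set real keys before running.
os.environ["OPENAI_API_KEY"] = "sk-..."
os.environ["ANTHROPIC_API_KEY"] = "sk-ant-..."

messages = [{"role": "user", "content": "Hey, how's it going?"}]

# If gpt-4 errors out (rate limit, auth failure, outage), litellm retries the
# same request against each fallback model in order and returns the first
# successful response.
response = completion(
    model="gpt-4",
    messages=messages,
    fallbacks=["gpt-3.5-turbo", "claude-instant-1"],
)
print(response["choices"][0]["message"]["content"])
```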
diff --git a/cookbook/LiteLLM_AB_TestLLMs.ipynb b/cookbook/LiteLLM_AB_TestLLMs.ipynb deleted file mode 100644 index 690427be8..000000000 --- a/cookbook/LiteLLM_AB_TestLLMs.ipynb +++ /dev/null @@ -1,198 +0,0 @@ -{ - "nbformat": 4, - "nbformat_minor": 0, - "metadata": { - "colab": { - "provenance": [] - }, - "kernelspec": { - "name": "python3", - "display_name": "Python 3" - }, - "language_info": { - "name": "python" - } - }, - "cells": [ - { - "cell_type": "markdown", - "source": [ - "# LiteLLM A/B Testing LLMs in production\n", - "\n", - "* LiteLLM allows you to use 100+ LLMs as a drop in replacement for `gpt-3.5-turbo`\n", - "\n", - "This tutorial walks through how to use LiteLLM to easily A/B Test LLMs in production" - ], - "metadata": { - "id": "ODpmJQ5u4rXI" - } - }, - { - "cell_type": "markdown", - "source": [ - "# Example 1: A/B Test GPT-4 & GPT-3.5\n", - "\n", - "# Step 1\n", - "👉 Get your `id` from here: https://admin.litellm.ai/" - ], - "metadata": { - "id": "YamUetcC5Ke7" - } - }, - { - "cell_type": "code", - "source": [ - "from litellm import completion_with_split_tests\n", - "import os\n", - "\n", - "## set ENV variables\n", - "os.environ[\"OPENAI_API_KEY\"] = \"\"\n", - "\n", - "\n", - "# define a dict of model id and % of requests for model\n", - "# see models here: https://docs.litellm.ai/docs/providers\n", - "split_per_model = {\n", - "\t\"gpt-4\": 0.3,\n", - "\t\"gpt-3.5-turbo\": 0.7\n", - "}\n", - "\n", - "messages = [{ \"content\": \"Hello, how are you?\",\"role\": \"user\"}]\n", - "\n", - "completion_with_split_tests(messages=messages, use_client=True,\n", - " id=\"91fad14a-8c0f-4e99-8eaa-68245435aa80\") # [Optional Set your own ID]" - ], - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "7XGfv0958k70", - "outputId": "91a069a5-c7d4-4fb0-e345-5ebf383edbbc" - }, - "execution_count": 4, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "last_fetched_at: 1693624804.2941535\n" - ] - }, - { - "output_type": "execute_result", - "data": { - "text/plain": [ - " JSON: {\n", - " \"id\": \"chatcmpl-7uBT4QHc8BAoZKkU7JoH4ahmXvu0M\",\n", - " \"object\": \"chat.completion\",\n", - " \"created\": 1693624806,\n", - " \"model\": \"gpt-3.5-turbo-0613\",\n", - " \"choices\": [\n", - " {\n", - " \"index\": 0,\n", - " \"message\": {\n", - " \"role\": \"assistant\",\n", - " \"content\": \"Hello! I'm an AI, so I don't have emotions, but I'm here to assist you. 
How can I help you today?\"\n", - " },\n", - " \"finish_reason\": \"stop\"\n", - " }\n", - " ],\n", - " \"usage\": {\n", - " \"prompt_tokens\": 13,\n", - " \"completion_tokens\": 29,\n", - " \"total_tokens\": 42\n", - " }\n", - "}" - ] - }, - "metadata": {}, - "execution_count": 4 - } - ] - }, - { - "cell_type": "markdown", - "source": [ - "## A/B Test GPT-4 and Claude-2" - ], - "metadata": { - "id": "Y12cxhZt58v8" - } - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "0k6rshtR8i9q", - "outputId": "31ac9d73-9e35-4697-d1ff-5d51048566f8" - }, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "last_fetched_at: 1693624809.3467667\n" - ] - }, - { - "output_type": "execute_result", - "data": { - "text/plain": [ - " JSON: {\n", - " \"id\": \"chatcmpl-7uBTA6gotsTksvCU7GffJ64ybfHUw\",\n", - " \"object\": \"chat.completion\",\n", - " \"created\": 1693624812,\n", - " \"model\": \"gpt-4-0613\",\n", - " \"choices\": [\n", - " {\n", - " \"index\": 0,\n", - " \"message\": {\n", - " \"role\": \"assistant\",\n", - " \"content\": \"As an AI, I don't have feelings, but I'm here and ready to assist you. How can I help you today?\"\n", - " },\n", - " \"finish_reason\": \"stop\"\n", - " }\n", - " ],\n", - " \"usage\": {\n", - " \"prompt_tokens\": 13,\n", - " \"completion_tokens\": 27,\n", - " \"total_tokens\": 40\n", - " }\n", - "}" - ] - }, - "metadata": {}, - "execution_count": 5 - } - ], - "source": [ - "from litellm import completion_with_split_tests\n", - "import os\n", - "\n", - "## set ENV variables\n", - "os.environ[\"ANTHROPIC_API_KEY\"] = \"\"\n", - "\n", - "# define a dict of model id and % of requests for model\n", - "split_per_model = {\n", - "\t\"gpt-4\": 0.3,\n", - "\t\"claude-2\": 0.7\n", - "}\n", - "\n", - "messages = [{ \"content\": \"Hello, how are you?\",\"role\": \"user\"}]\n", - "\n", - "\n", - "completion_with_split_tests(messages=messages, use_client=True,\n", - " id=\"91fad14a-8c0f-4e99-8eaa-68245435aa80\") # [Optional Set your own ID]" - ] - }, - { - "cell_type": "markdown", - "source": [], - "metadata": { - "id": "hzzbsAIp4pnr" - } - } - ] -} \ No newline at end of file diff --git a/cookbook/LiteLLM_Azure_OpenAI.ipynb b/cookbook/LiteLLM_Azure_OpenAI.ipynb deleted file mode 100644 index f6c460df3..000000000 --- a/cookbook/LiteLLM_Azure_OpenAI.ipynb +++ /dev/null @@ -1,259 +0,0 @@ -{ - "nbformat": 4, - "nbformat_minor": 0, - "metadata": { - "colab": { - "provenance": [] - }, - "kernelspec": { - "name": "python3", - "display_name": "Python 3" - }, - "language_info": { - "name": "python" - } - }, - "cells": [ - { - "cell_type": "markdown", - "source": [ - "## Use Azure OpenAI with LiteLLM" - ], - "metadata": { - "id": "oTA-1bG_wBVw" - } - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "RreFKTyKv2nt" - }, - "outputs": [], - "source": [ - "!pip install litellm" - ] - }, - { - "cell_type": "markdown", - "source": [ - "Pass API_BASE, API_VERSION, API_KEY in COMPLETION()" - ], - "metadata": { - "id": "kSOo9lbKv_7H" - } - }, - { - "cell_type": "code", - "source": [ - "import litellm\n", - "response = litellm.completion(\n", - " model = \"azure/chatgpt-v-2\", # model = azure/\n", - " api_base = \"https://openai-gpt-4-test-v-1.openai.azure.com/\", # azure api base\n", - " api_version = \"2023-05-15\", # azure api version\n", - " api_key = \"\", # azure api key\n", - " messages = [{\"role\": \"user\", \"content\": \"good morning\"}],\n", - " 
max_tokens=10,\n", - ")\n", - "print(response)" - ], - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "gWIsjrHMv_DM", - "outputId": "732e9daa-8dca-4bc1-bb8a-aee90ee14c8d" - }, - "execution_count": 4, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "{\n", - " \"id\": \"chatcmpl-877x4J2JUSReOuxVGE3THLjcmdrI8\",\n", - " \"object\": \"chat.completion\",\n", - " \"created\": 1696709554,\n", - " \"model\": \"gpt-35-turbo\",\n", - " \"choices\": [\n", - " {\n", - " \"index\": 0,\n", - " \"finish_reason\": \"length\",\n", - " \"message\": {\n", - " \"role\": \"assistant\",\n", - " \"content\": \"Good morning! How can I assist you today?\"\n", - " }\n", - " }\n", - " ],\n", - " \"usage\": {\n", - " \"completion_tokens\": 10,\n", - " \"prompt_tokens\": 10,\n", - " \"total_tokens\": 20\n", - " }\n", - "}\n" - ] - } - ] - }, - { - "cell_type": "markdown", - "source": [], - "metadata": { - "id": "PR5uhvVHxe-C" - } - }, - { - "cell_type": "markdown", - "source": [ - "## Set .env variables with Azure / LiteLLM" - ], - "metadata": { - "id": "1P2hprlLxfDc" - } - }, - { - "cell_type": "code", - "source": [ - "import litellm\n", - "import os\n", - "\n", - "os.environ['AZURE_API_KEY'] = \"\"\n", - "os.environ['AZURE_API_BASE'] = \"\"\n", - "os.environ['AZURE_API_VERSION'] = \"\"\n", - "\n", - "response = litellm.completion(\n", - " model = \"azure/chatgpt-v-2\", # model = azure/\n", - " messages = [{\"role\": \"user\", \"content\": \"good morning\"}],\n", - " max_tokens=10,\n", - ")\n", - "print(response)" - ], - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "mGi9Gae1wMjK", - "outputId": "29f2a9cf-f6ee-416b-9b24-02588d96fe59" - }, - "execution_count": 5, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "{\n", - " \"id\": \"chatcmpl-877zB0GWZl4zswopLt12yQEzEfYWy\",\n", - " \"object\": \"chat.completion\",\n", - " \"created\": 1696709685,\n", - " \"model\": \"gpt-35-turbo\",\n", - " \"choices\": [\n", - " {\n", - " \"index\": 0,\n", - " \"finish_reason\": \"length\",\n", - " \"message\": {\n", - " \"role\": \"assistant\",\n", - " \"content\": \"Good morning! 
How can I assist you today?\"\n", - " }\n", - " }\n", - " ],\n", - " \"usage\": {\n", - " \"completion_tokens\": 10,\n", - " \"prompt_tokens\": 10,\n", - " \"total_tokens\": 20\n", - " }\n", - "}\n" - ] - } - ] - }, - { - "cell_type": "markdown", - "source": [ - "## With Streaming" - ], - "metadata": { - "id": "uIhyvSVNx4hX" - } - }, - { - "cell_type": "code", - "source": [ - "response = litellm.completion(\n", - " model = \"azure/chatgpt-v-2\",\n", - " messages = [{\"role\": \"user\", \"content\": \"good morning\"}],\n", - " max_tokens=10,\n", - " stream=True\n", - ")\n", - "\n", - "for chunk in response:\n", - " print(chunk)" - ], - "metadata": { - "id": "R4KYKLOHxy9r" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "source": [ - "## With Rate Limit Handler" - ], - "metadata": { - "id": "hB8jLz94ybTC" - } - }, - { - "cell_type": "code", - "source": [ - "from litellm import RateLimitManager\n", - "\n", - "handler = RateLimitManager(max_requests_per_minute=10, max_tokens_per_minute=200)\n", - "\n", - "response = await handler.acompletion(\n", - " model = \"azure/chatgpt-v-2\",\n", - " messages = [{\"role\": \"user\", \"content\": \"good morning\"}],\n", - " max_tokens=10,\n", - ")\n", - "print(response)" - ], - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "CQECDwpix7Hl", - "outputId": "18dc4041-3262-4ab7-a451-34ceaf70ca31" - }, - "execution_count": 8, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "{\n", - " \"id\": \"chatcmpl-8781gvDKwPbp44CliumABgAuIDnSf\",\n", - " \"object\": \"chat.completion\",\n", - " \"created\": 1696709840,\n", - " \"model\": \"gpt-35-turbo\",\n", - " \"choices\": [\n", - " {\n", - " \"index\": 0,\n", - " \"finish_reason\": \"length\",\n", - " \"message\": {\n", - " \"role\": \"assistant\",\n", - " \"content\": \"Good morning! 
How can I assist you today?\"\n", - " }\n", - " }\n", - " ],\n", - " \"usage\": {\n", - " \"completion_tokens\": 10,\n", - " \"prompt_tokens\": 10,\n", - " \"total_tokens\": 20\n", - " }\n", - "}\n" - ] - } - ] - } - ] -} \ No newline at end of file diff --git a/cookbook/LiteLLM_Caching.ipynb b/cookbook/LiteLLM_Caching.ipynb deleted file mode 100644 index 1d025e4df..000000000 --- a/cookbook/LiteLLM_Caching.ipynb +++ /dev/null @@ -1,123 +0,0 @@ -{ - "nbformat": 4, - "nbformat_minor": 0, - "metadata": { - "colab": { - "provenance": [] - }, - "kernelspec": { - "name": "python3", - "display_name": "Python 3" - }, - "language_info": { - "name": "python" - } - }, - "cells": [ - { - "cell_type": "markdown", - "source": [ - "## LiteLLM Caching Tutorial\n", - "Link to using Caching in Docs:\n", - "https://docs.litellm.ai/docs/caching/" - ], - "metadata": { - "id": "Lvj-GI3YQfQx" - } - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "eKSBuuKn99Jm" - }, - "outputs": [], - "source": [ - "!pip install litellm==0.1.492" - ] - }, - { - "cell_type": "markdown", - "source": [ - "## Set `caching_with_models` to True\n", - "Enables caching on a per-model basis.\n", - "Keys are the input messages + model and values stored in the cache is the corresponding response" - ], - "metadata": { - "id": "sFXj4UUnQpyt" - } - }, - { - "cell_type": "code", - "source": [ - "import os, time, litellm\n", - "from litellm import completion\n", - "litellm.caching_with_models = True # set caching for each model to True\n" - ], - "metadata": { - "id": "xCea1EjR99rU" - }, - "execution_count": 8, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "os.environ['OPENAI_API_KEY'] = \"\"" - ], - "metadata": { - "id": "VK3kXGXI-dtC" - }, - "execution_count": 9, - "outputs": [] - }, - { - "cell_type": "markdown", - "source": [ - "## Use LiteLLM Cache" - ], - "metadata": { - "id": "U_CDCcnjQ7c6" - } - }, - { - "cell_type": "code", - "source": [ - "question = \"write 1 page about what's LiteLLM\"\n", - "for _ in range(2):\n", - " start_time = time.time()\n", - " response = completion(\n", - " model='gpt-3.5-turbo',\n", - " messages=[\n", - " {\n", - " 'role': 'user',\n", - " 'content': question\n", - " }\n", - " ],\n", - " )\n", - " print(f'Question: {question}')\n", - " print(\"Time consuming: {:.2f}s\".format(time.time() - start_time))" - ], - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "Efli-J-t-bJH", - "outputId": "cfdb1e14-96b0-48ee-c504-7f567e84c349" - }, - "execution_count": 10, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "Question: write 1 page about what's LiteLLM\n", - "Time consuming: 13.53s\n", - "Question: write 1 page about what's LiteLLM\n", - "Time consuming: 0.00s\n" - ] - } - ] - } - ] -} \ No newline at end of file diff --git a/cookbook/LiteLLM_GPTCache.ipynb b/cookbook/LiteLLM_GPTCache.ipynb deleted file mode 100644 index e8b8a52db..000000000 --- a/cookbook/LiteLLM_GPTCache.ipynb +++ /dev/null @@ -1,336 +0,0 @@ -{ - "nbformat": 4, - "nbformat_minor": 0, - "metadata": { - "colab": { - "provenance": [] - }, - "kernelspec": { - "name": "python3", - "display_name": "Python 3" - }, - "language_info": { - "name": "python" - } - }, - "cells": [ - { - "cell_type": "markdown", - "source": [ - "# Using GPT Cache with LiteLLM\n", - "GPT Cache allows you to slash Your LLM API Costs by 10x 💰, Boost Speed by 100x ⚡\n", - "\n", - "In this tutorial we demo how to use LiteLLM with GPTCache\n", - "* Quick Start Usage\n", - 
"* Advanced Usaged\n", - "* Setting custom cache keys\n", - "\n" - ], - "metadata": { - "id": "2BUxu9L2mPbX" - } - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "SRbVXJUGk6HC" - }, - "outputs": [], - "source": [ - "# installation\n", - "!pip install litellm gptcache" - ] - }, - { - "cell_type": "markdown", - "source": [ - "# Set ENV variables\n" - ], - "metadata": { - "id": "UuZX3OSBlIDt" - } - }, - { - "cell_type": "code", - "source": [ - "import os\n", - "os.environ['OPENAI_API_KEY'] = \"\"\n", - "os.environ['COHERE_API_KEY'] = \"\"" - ], - "metadata": { - "id": "E4jn-bPWlBZs" - }, - "execution_count": 12, - "outputs": [] - }, - { - "cell_type": "markdown", - "source": [ - "# Quick Start Usage\n", - "By default GPT Cache uses the content in `messages` as the cache key\n", - " Import GPT Cache" - ], - "metadata": { - "id": "Tswo-058lcid" - } - }, - { - "cell_type": "code", - "source": [ - "import litellm\n", - "from litellm.gpt_cache import completion\n", - "\n", - "### using / setting up gpt cache\n", - "from gptcache import cache\n", - "cache.init()\n", - "cache.set_openai_key()\n", - "#########################" - ], - "metadata": { - "id": "9oOV8gRtk_la" - }, - "execution_count": 4, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "## two completion calls\n", - "import time\n", - "question = \"why should i use LiteLLM\"\n", - "for _ in range(2):\n", - " start_time = time.time()\n", - " response = completion(\n", - " model='gpt-3.5-turbo',\n", - " messages=[\n", - " {\n", - " 'role': 'user',\n", - " 'content': question\n", - " }\n", - " ],\n", - " )\n", - " print(f'Question: {question}, Response {response}')\n", - " print(\"Time consuming: {:.2f}s\".format(time.time() - start_time))" - ], - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "Em1kgIOOm1Vo", - "outputId": "d8e57747-a851-4675-f936-d65e5570d95a" - }, - "execution_count": 7, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "Question: why should i use LiteLLM, Response {\n", - " \"id\": \"chatcmpl-7tJozrtW5UzVHNUcxX6cfzRS4nbxd\",\n", - " \"object\": \"chat.completion\",\n", - " \"created\": 1693418589,\n", - " \"model\": \"gpt-3.5-turbo-0613\",\n", - " \"choices\": [\n", - " {\n", - " \"index\": 0,\n", - " \"message\": {\n", - " \"role\": \"assistant\",\n", - " \"content\": \"There are several reasons why you might consider using LiteLLM:\\n\\n1. Simplified document management: LiteLLM offers a user-friendly interface that makes it easy to manage and organize your legal documents. You can track versions, organize files into folders, and quickly find what you need.\\n\\n2. Collaboration and accessibility: LiteLLM allows multiple users to work on documents simultaneously, making it easier for teams to collaborate and exchange feedback. It also provides flexible accessibility, allowing you to access your documents from anywhere, anytime, as long as you have an internet connection.\\n\\n3. Time-saving features: The platform offers various time-saving features, such as automated document generation, customizable templates, and integration with other tools like Microsoft Word. This can significantly reduce the time and effort required to prepare legal documents.\\n\\n4. Enhanced security: LiteLLM prioritizes the security of your data. It provides robust encryption, secure data storage, and role-based access controls. This ensures that your sensitive legal documents are protected from unauthorized access.\\n\\n5. 
Cost-effective solution: LiteLLM offers a cost-effective solution compared to traditional legal document management systems. With its cloud-based approach, you don't need to invest in expensive hardware or software installations. Instead, you pay for a subscription-based model, which can be more affordable for small firms or individual practitioners.\\n\\nUltimately, the decision to use LiteLLM depends on your specific needs and preferences. It's important to consider factors such as the size of your practice, the volume of legal documents you handle, and your budget before making a decision.\"\n", - " },\n", - " \"finish_reason\": \"stop\"\n", - " }\n", - " ],\n", - " \"usage\": {\n", - " \"prompt_tokens\": 14,\n", - " \"completion_tokens\": 312,\n", - " \"total_tokens\": 326\n", - " }\n", - "}\n", - "Time consuming: 9.48s\n", - "Question: why should i use LiteLLM, Response {'gptcache': True, 'saved_token': [14, 312], 'choices': [{'message': {'role': 'assistant', 'content': \"There are several reasons why you might consider using LiteLLM:\\n\\n1. Simplified document management: LiteLLM offers a user-friendly interface that makes it easy to manage and organize your legal documents. You can track versions, organize files into folders, and quickly find what you need.\\n\\n2. Collaboration and accessibility: LiteLLM allows multiple users to work on documents simultaneously, making it easier for teams to collaborate and exchange feedback. It also provides flexible accessibility, allowing you to access your documents from anywhere, anytime, as long as you have an internet connection.\\n\\n3. Time-saving features: The platform offers various time-saving features, such as automated document generation, customizable templates, and integration with other tools like Microsoft Word. This can significantly reduce the time and effort required to prepare legal documents.\\n\\n4. Enhanced security: LiteLLM prioritizes the security of your data. It provides robust encryption, secure data storage, and role-based access controls. This ensures that your sensitive legal documents are protected from unauthorized access.\\n\\n5. Cost-effective solution: LiteLLM offers a cost-effective solution compared to traditional legal document management systems. With its cloud-based approach, you don't need to invest in expensive hardware or software installations. Instead, you pay for a subscription-based model, which can be more affordable for small firms or individual practitioners.\\n\\nUltimately, the decision to use LiteLLM depends on your specific needs and preferences. 
It's important to consider factors such as the size of your practice, the volume of legal documents you handle, and your budget before making a decision.\"}, 'finish_reason': 'stop', 'index': 0}], 'created': 1693418598, 'usage': {'completion_tokens': 0, 'prompt_tokens': 0, 'total_tokens': 0}, 'object': 'chat.completion'}\n", - "Time consuming: 0.00s\n" - ] - } - ] - }, - { - "cell_type": "markdown", - "source": [ - "# Advanced Usage - Setting custom keys for Cache\n", - "By default gptcache uses the `messages` as the cache key\n", - "\n", - "GPTCache allows you to set custom cache keys by setting\n", - "```python\n", - "cache.init(pre_func=pre_cache_func)\n", - "```\n", - "\n", - "In this code snippet below we define a `pre_func` that returns message content + model as key" - ], - "metadata": { - "id": "R6hywKu8nXXW" - } - }, - { - "cell_type": "markdown", - "source": [ - "## Defining a `pre_func` for GPTCache\n" - ], - "metadata": { - "id": "6nx1X-2Hn3ak" - } - }, - { - "cell_type": "code", - "source": [ - "### using / setting up gpt cache\n", - "from gptcache import cache\n", - "from gptcache.processor.pre import last_content_without_prompt\n", - "from typing import Dict, Any\n", - "\n", - "# use this function to set your cache keys -> gptcache\n", - "# data are all the args passed to your completion call\n", - "def pre_cache_func(data: Dict[str, Any], **params: Dict[str, Any]) -> Any:\n", - " # use this to set cache key\n", - " print(\"in pre_cache_func\")\n", - " last_content_without_prompt_val = last_content_without_prompt(data, **params)\n", - " print(\"last content without prompt\", last_content_without_prompt_val)\n", - " print(\"model\", data[\"model\"])\n", - " cache_key = last_content_without_prompt_val + data[\"model\"]\n", - " print(\"cache_key\", cache_key)\n", - " return cache_key # using this as cache_key\n", - "" - ], - "metadata": { - "id": "jJQsTyrZlvDY" - }, - "execution_count": 9, - "outputs": [] - }, - { - "cell_type": "markdown", - "source": [ - "## Init Cache with `pre_func` to set custom keys" - ], - "metadata": { - "id": "Tjv1e0hqn-dX" - } - }, - { - "cell_type": "code", - "source": [ - "# init GPT Cache with custom pre_func\n", - "cache.init(pre_func=pre_cache_func)\n", - "cache.set_openai_key()" - ], - "metadata": { - "id": "Ua8UhEp6n9yR" - }, - "execution_count": 10, - "outputs": [] - }, - { - "cell_type": "markdown", - "source": [ - "## Using Cache\n", - "* Cache key is `message` + `model`\n", - "\n", - "We make 3 LLM API calls\n", - "* 2 to OpenAI\n", - "* 1 to Cohere command nightly" - ], - "metadata": { - "id": "jHqWdfC4sTHf" - } - }, - { - "cell_type": "code", - "source": [ - "messages = [{\"role\": \"user\", \"content\": \"why should I use LiteLLM for completions()\"}]\n", - "response1 = completion(model=\"gpt-3.5-turbo\", messages=messages)\n", - "response2 = completion(model=\"gpt-3.5-turbo\", messages=messages)\n", - "response3 = completion(model=\"command-nightly\", messages=messages) # calling cohere command nightly\n", - "\n", - "if response1[\"choices\"] != response2[\"choices\"]: # same models should cache\n", - " print(f\"Error occurred: Caching for same model+prompt failed\")\n", - "\n", - "if response3[\"choices\"] == response2[\"choices\"]: # different models, don't cache\n", - " # if models are different, it should not return cached response\n", - " print(f\"Error occurred: Caching for different model+prompt failed\")\n", - "\n", - "print(\"response1\", response1)\n", - "print(\"response2\", response2)\n", - "print(\"response3\", response3)" 
- ], - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "ftrKpB2GsPMi", - "outputId": "1ee49273-bd62-49b4-a177-d40e33a51785" - }, - "execution_count": 14, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "in pre_cache_func\n", - "last content without prompt why should I use LiteLLM for completions()\n", - "model gpt-3.5-turbo\n", - "cache_key why should I use LiteLLM for completions()gpt-3.5-turbo\n", - "in pre_cache_func\n", - "last content without prompt why should I use LiteLLM for completions()\n", - "model gpt-3.5-turbo\n", - "cache_key why should I use LiteLLM for completions()gpt-3.5-turbo\n", - "in pre_cache_func\n", - "last content without prompt why should I use LiteLLM for completions()\n", - "model command-nightly\n", - "cache_key why should I use LiteLLM for completions()command-nightly\n", - "response1 {\n", - " \"id\": \"chatcmpl-7tKE21PEe43sR6RvZ7pcUmanFwZLf\",\n", - " \"object\": \"chat.completion\",\n", - " \"created\": 1693420142,\n", - " \"model\": \"gpt-3.5-turbo-0613\",\n", - " \"choices\": [\n", - " {\n", - " \"index\": 0,\n", - " \"message\": {\n", - " \"role\": \"assistant\",\n", - " \"content\": \"There are several reasons why you should use LiteLLM for completions() in your code:\\n\\n1. Fast and efficient: LiteLLM is implemented in a lightweight manner, making it highly performant. It provides quick and accurate completions, ensuring a smooth development experience.\\n\\n2. Customizable completion options: LiteLLM allows you to customize the completion options based on your specific needs. You can specify the maximum number of completions to retrieve, the desired timeout, and more.\\n\\n3. Language-agnostic: LiteLLM supports multiple programming languages, including Python, JavaScript, Java, C++, and many others. So, regardless of the language you are using, LiteLLM can help you with intelligent code completions.\\n\\n4. Learning capabilities: LiteLLM has the ability to learn from the provided code and context, improving the accuracy of the completions over time. This means that as you continue to use it, the suggested completions will become increasingly tailored to your specific coding style and patterns.\\n\\n5. Ease of integration: LiteLLM is designed to be easily integrated into existing code editors or IDEs. It provides a simple API that allows you to integrate it seamlessly into your development workflow.\\n\\n6. Supported by OpenAI: LiteLLM is developed and maintained by OpenAI, a well-known and reputable organization in the field of artificial intelligence. This ensures ongoing support and updates to enhance the functionality and performance of LiteLLM.\\n\\nOverall, using LiteLLM for completions() can greatly improve your coding productivity by providing accurate and context-aware code completion suggestions, regardless of the programming language you are working with.\"\n", - " },\n", - " \"finish_reason\": \"stop\"\n", - " }\n", - " ],\n", - " \"usage\": {\n", - " \"prompt_tokens\": 18,\n", - " \"completion_tokens\": 326,\n", - " \"total_tokens\": 344\n", - " }\n", - "}\n", - "response2 {'gptcache': True, 'saved_token': [18, 326], 'choices': [{'message': {'role': 'assistant', 'content': 'There are several reasons why you should use LiteLLM for completions() in your code:\\n\\n1. Fast and efficient: LiteLLM is implemented in a lightweight manner, making it highly performant. It provides quick and accurate completions, ensuring a smooth development experience.\\n\\n2. 
Customizable completion options: LiteLLM allows you to customize the completion options based on your specific needs. You can specify the maximum number of completions to retrieve, the desired timeout, and more.\\n\\n3. Language-agnostic: LiteLLM supports multiple programming languages, including Python, JavaScript, Java, C++, and many others. So, regardless of the language you are using, LiteLLM can help you with intelligent code completions.\\n\\n4. Learning capabilities: LiteLLM has the ability to learn from the provided code and context, improving the accuracy of the completions over time. This means that as you continue to use it, the suggested completions will become increasingly tailored to your specific coding style and patterns.\\n\\n5. Ease of integration: LiteLLM is designed to be easily integrated into existing code editors or IDEs. It provides a simple API that allows you to integrate it seamlessly into your development workflow.\\n\\n6. Supported by OpenAI: LiteLLM is developed and maintained by OpenAI, a well-known and reputable organization in the field of artificial intelligence. This ensures ongoing support and updates to enhance the functionality and performance of LiteLLM.\\n\\nOverall, using LiteLLM for completions() can greatly improve your coding productivity by providing accurate and context-aware code completion suggestions, regardless of the programming language you are working with.'}, 'finish_reason': 'stop', 'index': 0}], 'created': 1693420152, 'usage': {'completion_tokens': 0, 'prompt_tokens': 0, 'total_tokens': 0}, 'object': 'chat.completion'}\n", - "response3 {\n", - " \"choices\": [\n", - " {\n", - " \"finish_reason\": \"stop\",\n", - " \"index\": 0,\n", - " \"message\": {\n", - " \"content\": \" LiteLLM is a state-of-the-art, privacy-preserving LLM trained\",\n", - " \"role\": \"assistant\"\n", - " }\n", - " }\n", - " ],\n", - " \"created\": 1693420153.8769038,\n", - " \"model\": \"command-nightly\",\n", - " \"usage\": {\n", - " \"prompt_tokens\": 11,\n", - " \"completion_tokens\": 16,\n", - " \"total_tokens\": 27\n", - " }\n", - "}\n" - ] - } - ] - } - ] -} \ No newline at end of file diff --git a/cookbook/Using_Nemo_Guardrails_with_LiteLLM_Server.ipynb b/cookbook/Using_Nemo_Guardrails_with_LiteLLM_Server.ipynb new file mode 100644 index 000000000..da5908324 --- /dev/null +++ b/cookbook/Using_Nemo_Guardrails_with_LiteLLM_Server.ipynb @@ -0,0 +1,159 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + }, + "language_info": { + "name": "python" + } + }, + "cells": [ + { + "cell_type": "markdown", + "source": [ + "# Using Nemo-Guardrails with LiteLLM Server\n", + "\n", + "[Call Bedrock, TogetherAI, Huggingface, etc. 
on the server](https://docs.litellm.ai/docs/providers)" + ], + "metadata": { + "id": "eKXncoQbU_2j" + } + }, + { + "cell_type": "markdown", + "source": [ + "## Using with Bedrock\n", + "\n", + "`docker run -e PORT=8000 -e AWS_ACCESS_KEY_ID= -e AWS_SECRET_ACCESS_KEY= -p 8000:8000 ghcr.io/berriai/litellm:latest`" + ], + "metadata": { + "id": "ZciYaLwvuFbu" + } + }, + { + "cell_type": "code", + "source": [ + "pip install nemoguardrails langchain" + ], + "metadata": { + "id": "vOUwGSJ2Vsy3" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "xXEJNxe7U0IN" + }, + "outputs": [], + "source": [ + "import openai\n", + "from langchain.chat_models import ChatOpenAI\n", + "\n", + "llm = ChatOpenAI(model_name=\"anthropic.claude-v2\", openai_api_base=\"http://0.0.0.0:8000\", openai_api_key=\"my-fake-key\")\n", + "\n", + "from nemoguardrails import LLMRails, RailsConfig\n", + "\n", + "config = RailsConfig.from_path(\"./config.yml\")\n", + "app = LLMRails(config, llm=llm)\n", + "\n", + "new_message = app.generate(messages=[{\n", + " \"role\": \"user\",\n", + " \"content\": \"Hello! What can you do for me?\"\n", + "}])" + ] + }, + { + "cell_type": "markdown", + "source": [ + "## Using with TogetherAI\n", + "\n", + "1. You can either set this in the server environment:\n", + "`docker run -e PORT=8000 -e TOGETHERAI_API_KEY= -p 8000:8000 ghcr.io/berriai/litellm:latest`\n", + "\n", + "2. **Or** Pass this in as the api key `(...openai_api_key=\"\")`" + ], + "metadata": { + "id": "vz5n00qyuKjp" + } + }, + { + "cell_type": "code", + "source": [ + "import openai\n", + "from langchain.chat_models import ChatOpenAI\n", + "\n", + "llm = ChatOpenAI(model_name=\"together_ai/togethercomputer/CodeLlama-13b-Instruct\", openai_api_base=\"http://0.0.0.0:8000\", openai_api_key=\"my-together-ai-api-key\")\n", + "\n", + "from nemoguardrails import LLMRails, RailsConfig\n", + "\n", + "config = RailsConfig.from_path(\"./config.yml\")\n", + "app = LLMRails(config, llm=llm)\n", + "\n", + "new_message = app.generate(messages=[{\n", + " \"role\": \"user\",\n", + " \"content\": \"Hello! What can you do for me?\"\n", + "}])" + ], + "metadata": { + "id": "XK1sk-McuhpE" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "### CONFIG.YML\n", + "\n", + "save this example `config.yml` in your current directory" + ], + "metadata": { + "id": "8A1KWKnzuxAS" + } + }, + { + "cell_type": "code", + "source": [ + "# instructions:\n", + "# - type: general\n", + "# content: |\n", + "# Below is a conversation between a bot and a user about the recent job reports.\n", + "# The bot is factual and concise. If the bot does not know the answer to a\n", + "# question, it truthfully says it does not know.\n", + "\n", + "# sample_conversation: |\n", + "# user \"Hello there!\"\n", + "# express greeting\n", + "# bot express greeting\n", + "# \"Hello! How can I assist you today?\"\n", + "# user \"What can you do for me?\"\n", + "# ask about capabilities\n", + "# bot respond about capabilities\n", + "# \"I am an AI assistant that helps answer mathematical questions. 
My core mathematical skills are powered by wolfram alpha.\"\n", + "# user \"What's 2+2?\"\n", + "# ask math question\n", + "# bot responds to math question\n", + "# \"2+2 is equal to 4.\"\n", + "\n", + "# models:\n", + "# - type: main\n", + "# engine: openai\n", + "# model: claude-instant-1" + ], + "metadata": { + "id": "NKN1GmSvu0Cx" + }, + "execution_count": null, + "outputs": [] + } + ] +} \ No newline at end of file diff --git a/cookbook/get_hf_models.py b/cookbook/community-resources/get_hf_models.py similarity index 100% rename from cookbook/get_hf_models.py rename to cookbook/community-resources/get_hf_models.py diff --git a/cookbook/liteLLM_ChromaDB_Cache.ipynb b/cookbook/liteLLM_ChromaDB_Cache.ipynb deleted file mode 100644 index 67306cac4..000000000 --- a/cookbook/liteLLM_ChromaDB_Cache.ipynb +++ /dev/null @@ -1,346 +0,0 @@ -{ - "cells": [ - { - "attachments": {}, - "cell_type": "markdown", - "metadata": { - "id": "fqqYwS3jzN_t" - }, - "source": [ - "## This is a tutorial on how to build a Cache for GPT-4, chatGPT, Claude, Palm, Llama2\n", - "\n", - "In this notebook we:\n", - "- use chromaDB to define add_cache(), get_cache(). We cache responses from the LLM\n", - "- use liteLLM for calling `completion()` with GPT-4, chatGPT, Claude, llama2" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": { - "id": "yQWPyKaEvl7c" - }, - "outputs": [], - "source": [ - "!pip install litellm\n", - "!pip install -Uq chromadb" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": { - "id": "oUVLNjt0pNUw" - }, - "source": [ - "## Init ChromaDB collection\n" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": { - "id": "iyrAj4tjpMph" - }, - "outputs": [], - "source": [ - "import chromadb\n", - "# Global cache collection instance\n", - "cache_collection = None\n", - "\n", - "# Initialize the cache collection\n", - "def make_collection():\n", - " global cache_collection\n", - " client = chromadb.EphemeralClient()\n", - " cache_collection = client.create_collection(\"llm_responses\")" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": { - "id": "mr8ArGpWpZqi" - }, - "source": [ - "## Add to Cache Function\n", - "We extract the user question and use chromaDB to embed it. 
For each question we store the model response as `metadata`\n", - "\n", - "`add_cache()` args\n", - "* `messages` - Expect this to be in the chatGPT messages format\n", - "* `model_response` - Response from LLM\n" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": { - "id": "9Yr9jrPspTl8" - }, - "outputs": [], - "source": [ - "import uuid\n", - "\n", - "# Add a response to the cache\n", - "def add_cache(messages, model_response):\n", - " global cache_collection\n", - " if cache_collection is None:\n", - " make_collection()\n", - "\n", - " user_question = message_to_user_question(messages)\n", - "\n", - " # Add the user question and model response to the cache\n", - " cache_collection.add(\n", - " documents=[user_question],\n", - " metadatas=[{\"model_response\": str(model_response)}],\n", - " ids=[str(uuid.uuid4())]\n", - " )\n", - " return\n", - "\n", - "\n", - "# HELPER: Extract user's question from messages\n", - "def message_to_user_question(messages):\n", - " user_question = \"\"\n", - " for message in messages:\n", - " if message['role'] == 'user':\n", - " user_question += message[\"content\"]\n", - " return user_question" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": { - "id": "vpPjoHpNpxd0" - }, - "source": [ - "## Get Cache Function\n", - "Given a user question, we check chromaDB for any embeddings with\n", - "`similarity > similarity_threshold`. Return the corresponding model_response if there is a match i.e `cache_hit`\n", - "\n", - "`get_cache()` args\n", - "* `messages` - Expect this to be in the chatGPT messages format\n", - "* `similarity_threshold` - Define a similarity_threshold on a scale of 0-1\n", - "0 -> everything is cache hit, 0.5 (50% similar), 1-> only return cache hits" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": { - "id": "SJaz-Mpnj7jd" - }, - "outputs": [], - "source": [ - "# Retrieve a response from the cache if similarity is above the threshold\n", - "def get_cache(messages, similarity_threshold):\n", - " try:\n", - " global cache_collection\n", - " if cache_collection is None:\n", - " make_collection()\n", - "\n", - " user_question = message_to_user_question(messages)\n", - "\n", - " # Query the cache for the user question\n", - " results = cache_collection.query(\n", - " query_texts=[user_question],\n", - " n_results=1\n", - " )\n", - "\n", - " if len(results['distances'][0]) == 0:\n", - " return None # Cache is empty\n", - "\n", - " distance = results['distances'][0][0]\n", - " sim = (1 - distance)\n", - "\n", - " if sim >= similarity_threshold:\n", - " return results['metadatas'][0][0][\"model_response\"] # Return cached response\n", - " else:\n", - " return None # No cache hit\n", - " except Exception as e:\n", - " print(\"Error in get cache\", e)\n", - " raise e\n" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": { - "id": "8bM5GI9hqYPK" - }, - "source": [ - "## Using liteLLM completion()\n", - "We use liteLLM completion to call our LLM APIs. LiteLLM allows the same Input/Output format for Azure OpenAI, chatGPT,\n", - "* Basic usage - `litellm.completion(model, messages)`.\n", - "\n", - "Use OpenAI, Claude, Anthropic, Replicate models. 
See supported models here: https://litellm.readthedocs.io/en/latest/supported/\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": { - "id": "r3hW2whOkAEj" - }, - "outputs": [], - "source": [ - "import litellm, os, random\n", - "os.environ[\"OPENAI_API_KEY\"] = \"\" # @param\n", - "os.environ[\"REPLICATE_API_TOKEN\"] = \"\" #@param\n", - "\n", - "models = [\"gpt-4\", \"replicate/llama-2-70b-chat:2c1608e18606fad2812020dc541930f2d0495ce32eee50074220b87300bc16e1\"]\n", - "\n", - "def completion_with_cache(messages, similarity_threshold):\n", - " # check cache before calling model, return if there is a hit\n", - " cache_result = get_cache(messages, similarity_threshold)\n", - "\n", - " if cache_result != None:\n", - " return cache_result\n", - "\n", - " # randomly pick llama2, GPT-4\n", - " random_model_idx = random.randint(0, 1)\n", - " model = models[random_model_idx]\n", - " # use litellm to make completion request\n", - " print(f\"using model {model}\")\n", - " model_response = litellm.completion(model, messages)\n", - "\n", - " # add the user question + model response to cache\n", - " add_cache(messages, model_response)\n", - "\n", - " return model_response" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": { - "id": "tTkYOpo0rbJO" - }, - "source": [ - "## Testing + Running Cache" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 747 - }, - "id": "i650yqJfkokZ", - "outputId": "efd14d6f-500e-4e52-969f-974a2a2ac15a" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "using model gpt-4\n", - "got response for Hello, what's the weather in San Francisco??\n", - "got response for what's the weather in San Francisco??Can you tell me about the latest news?\n", - "using model gpt-4\n", - "got response for What's the capital of France?\n", - "using model gpt-4\n", - "got response for How does photosynthesis work?\n", - "got response for capital of france?\n", - "using model replicate/llama-2-70b-chat:2c1608e18606fad2812020dc541930f2d0495ce32eee50074220b87300bc16e1\n", - "got response for tell me a joke\n", - "using model replicate/llama-2-70b-chat:2c1608e18606fad2812020dc541930f2d0495ce32eee50074220b87300bc16e1\n", - "got response for tell me a joke right nowHow do I bake a chocolate cake?\n", - "using model gpt-4\n", - "got response for What are the benefits of exercise?\n", - "got response for Tell me a joke!\n" - ] - }, - { - "data": { - "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAioAAAGzCAYAAAABsTylAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjcuMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/bCgiHAAAACXBIWXMAAA9hAAAPYQGoP6dpAAA7S0lEQVR4nO3deVhU5f//8dfIMiKyqOGWCCqa4oJpWohrapZL9rFcW5C0LCnc0qQ+pbaIWpof07DNpdTcW6zUXDOX3HHFBZWyzNBkERdUOL8//DHfJjQZGpyTPB/XNdfVuc8993mfwYkX97nPjMUwDEMAAAAmVMzVBQAAAFwPQQUAAJgWQQUAAJgWQQUAAJgWQQUAAJgWQQUAAJgWQQUAAJgWQQUAAJgWQQUAAJgWQQX4/1q2bKk6derclGNZLBaNHDnyphzLjIKDg9W7d+8CPbeov3ZAUUNQgSkdOXJE/fr1U9WqVVW8eHH5+voqIiJC//vf/3ThwgVXl3dLWLt2rSwWiywWi2bNmnXNPhEREbJYLDctwBWGxMREWSwWFS9eXGlpadfs07JlS9trYbFYVLp0aTVq1EjTpk1TTk6OrV/v3r1VsmTJGx5z5MiRduN5eHgoODhYMTEx160BwLW5u7oA4K+++eYbde3aVVarVU888YTq1KmjS5cuaf369Ro6dKj27dunDz74wNVl3jKKFy+uOXPm6LHHHrNrT05O1saNG1W8eHEXVeYcs2bNUvny5ZWamqqFCxeqb9++1+xXqVIlxcXFSZJOnTqlTz75RH369NGhQ4c0ZsyYAh07Pj5eJUuW1Llz57Rq1Sq9++672rFjh9avX1/g8wGKGoIKTOXYsWPq0aOHgoKCtHr1alWoUMG2Lzo6WklJSfrmm29cWOGtp3379vrqq690+vRp3Xbbbbb2OXPmqFy5cqpevbpSU1NdWGHBGYahOXPmqFevXjp27Jhmz5593aDi5+dnF9b69eunO+64Q5MnT9brr78uDw8Ph4//yCOP2F7Tfv36qUePHpo3b562bNmixo0bF+ykgCKGSz8wlXHjxikzM1Mff/yxXUjJFRISogEDBti2p0+frnvvvVdly5aV1WpVaGio4uPjrzn20qVL1aJFC/n4+MjX11eNGjXSnDlz8vTbv3+/WrVqpRIlSuj222/XuHHj8vTJysrSiBEjFBISIqvVqsDAQA0bNkxZWVl5+g0aNEgBAQHy8fHRgw8+qF9++eWGr8Pvv/8ud3d3jRo1Ks++gwcPymKxaPLkyZKky5cva9SoUapevbqKFy+uMmXKqGnTplqxYsUNjyNJnTt3ltVq1YIFC+za58yZo27dusnNzS3Pc65cuaLXX39d1apVk9VqVXBwsF566aU8528Yht544w1VqlRJJUqUUKtWrbRv375r1pGWlqaBAwcqMDBQVqtVISEhGjt2rN2lF0dt2LBBycnJ6tGjh3r06KF169bl6/WXpBIlSuiee+7RuXPndOrUqQLX8GfNmjWTdPXS5p9t3rxZ999/v/z8/FSiRAm1aNFCGzZssOtz9uxZDRw4UMHBwbJarSpbtqzatm2rHTt22PrkrrPavn27mjRpIi8vL1WpUkVTp07NU0tKSor69OmjcuXKqXjx4goLC9PMmTPt+iQnJ8tisejtt9/WBx98YPt5N2rUSFu3brXre/LkSUVFRalSpUqyWq2qUKGCOnfurOTkZLt+S5cuVbNmzeTt7S0fHx916NAhz7+J/I6FooEZFZjKkiVLVLVqVTVp0iRf/ePj41W7dm09+OCDcnd315IlS9S/f3/l5OQoOjra1m/GjBl68sknVbt2bcXGxsrf3187d+7UsmXL1KtXL1u/1NRU3X///erSpYu6deumhQsX6sUXX1TdunX1wAMPSJJycnL04IMPav369Xr66adVq1Yt7dmzR++8844OHTqkL774wjZe3759NWvWLPXq1UtNmjTR6tWr1aFDhxueV7ly5dSiRQvNnz9fI0aMsNs3b948ubm5qWvXrpKuroeIi4tT37591bhxY2VkZGjbtm3asWOH2rZte8NjlShRQp07d9Znn32mZ599VpK0a9cu7du3Tx999JF2796d5zl9+/bVzJkz9cgjj2jIkCHavHmz4uLilJiYqM8//9zW79VXX9Ubb7yh9u3bq3379tqxY4fuu+8+Xbp0yW688+fPq0WLFvr111/Vr18/Va5cWRs3blRsbKx+++03TZw48YbncS2zZ89WtWrV1KhRI9WpU0clSpTQZ599pqFDh+br+UePHpWbm5v8/f0LdPy/yv1FW6pUKVvb6tWr9cADD6hhw4YaMWKEihUrZgvgP/zwg23m5ZlnntHChQv13HPPKTQ0VH/88YfWr1+vxMRENWjQwDZeamqq2rdvr27duqlnz56aP3++nn32WXl6eurJJ5+UJF24cEEtW7ZUUlKSnnvuOVWpUkULFixQ7969lZaWZvfHgHQ1tJ49e1b9+vWTxWLRuHHj1KVLFx09etQ20/Twww9r3759ev755xUcHKyUlBStWLFCP//8s4KDgyVJn376qSIjI9WuXTuNHTtW58+fV3x8vJo2baqdO3fa+uVnLBQhBmAS6enphiSjc+fO+X7O+fPn87S1a9fOqFq1qm07LS3N8PHxMe6++27jwoULdn1zcnJs/92iRQtDkvHJJ5/Y2rKysozy5csbDz/8sK3t008/NYoVK2b88MMPdmNNnTrVkGRs2LDBMAzDSEhIMCQZ/fv3t+vXq1cvQ5IxYsSIvz23999/35Bk7Nmzx649NDTUuPfee23bYWFhRocOHf52rGtZs2aNIclYsGCB8fXXXxsWi8X4+eefDcMwjKFDh9pewxYtWhi1a9e2PS/3vPr27Ws33gsvvGBIMlavXm0YhmGkpKQYnp6eRocOHexe55deesmQZERGRtraXn/9dcPb29s4dOiQ3ZjDhw833NzcbHUZhpGv184wDOPSpUtGmTJljJdfftnW1qtXLyMsLCxP3xYtWhg1a9Y0Tp06ZZw6dcpITEw0YmJiDElGp06dbP0iIyMNb2/vGx57xIgRhiTj4MGDxqlTp4zk5GRj2rRphpeXlxEQEGCcO3fOMIyr//6qV69utGvXzu41On/+vFGlShWjbdu2tjY/Pz8jOjr6b4+b+294/PjxtrasrCyjfv36RtmyZY1Lly4ZhmEYEydONCQZs2bNsnu9wsPDjZIlSxoZGRmGYRjGsWPHDElGmTJljDNnztj6fvnll4YkY8mSJYZhGEZqaqohyXjrrbeuW9vZs2cNf39/46mnnrJrP3nypOHn52drz89YKFq49APTyMjIkCT5+Pjk+zleXl62/05PT9fp06fVokULHT16VOnp6ZKkFStW6OzZsxo+fHiehaEWi8Vuu2TJknbrFDw9PdW4cWMdPXrU1rZgwQLVqlVLNWvW1OnTp22Pe++9V5K0Zs0aSdK3334rSYqJibE7xsCBA/N1bl26dJG7u7vmzZtna9u7d6/279+v7t2729r8/f21b98+HT58OF/jXst9992n0qVLa+7cuTIMQ3Pnzl
XPnj2v2Tf3vAYPHmzXPmTIEEmyrSFauXKlLl26pOeff97udb7W+S9YsEDNmjVTqVKl7F7TNm3aKDs7W+vWrXP4nJYuXao//vjD7jx69uxpmy36qwMHDiggIEABAQGqVauW3n33XXXo0EHTpk1z+Ni57rjjDgUEBCg4OFhPPvmkQkJCtHTpUpUoUUKSlJCQoMOHD6tXr176448/bOd97tw5tW7dWuvWrbNd+vL399fmzZt14sSJvz2mu7u7+vXrZ9v29PRUv379lJKSou3bt0u6+jMsX7683Wvj4eGhmJgYZWZm6vvvv7cbs3v37nazQLmXsHLfF15eXvL09NTatWuvu55pxYoVSktLU8+ePe1+xm5ubrr77rtt75v8jIWihUs/MA1fX19JV6/F59eGDRs0YsQIbdq0SefPn7fbl56eLj8/P9t6gPzcYlupUqU84aVUqVJ2lz8OHz6sxMREBQQEXHOMlJQUSdJPP/2kYsWKqVq1anb777jjjhufmKTbbrtNrVu31vz58/X6669LunrZx93dXV26dLH1e+2119S5c2fVqFFDderU0f3336/HH39c9erVy9dxpKu/pLp27ao5c+aocePGOn78uN0lsT/LPa+QkBC79vLly8vf318//fSTrZ8kVa9e3a5fQECA3S896eprunv37hu+po6YNWuWqlSpIqvVqqSkJElStWrVVKJECc2ePVujR4+26x8cHKwPP/zQditz9erVVbZsWYeP+2eLFi2Sr6+vTp06pUmTJunYsWN24To3XEZGRl53jPT0dJUqVUrjxo1TZGSkAgMD1bBhQ7Vv315PPPGEqlatate/YsWK8vb2tmurUaOGpKuXnu655x799NNPql69uooVs/9btVatWpL+72eXq3LlynbbuT+/3CBhtVo1duxYDRkyROXKldM999yjjh076oknnlD58uXtzjU30P9V7vs/P2OhaCGowDR8fX1VsWJF7d27N1/9jxw5otatW6tmzZqaMGGCAgMD5enpqW+//VbvvPNOgRZhXmvhqHR1UWiunJwc1a1bVxMmTLhm38DAQIePez09evRQVFSUEhISVL9+fc2fP1+tW7e2uzunefPmOnLkiL788kt99913+uijj/TOO+9o6tSp173D5Vp69eqlqVOnauTIkQoLC1NoaOjf9v9roPsncnJy1LZtWw0bNuya+3N/0eZXRkaGlixZoosXL+YJStLVNRdvvvmm3Tl4e3urTZs2jhV+A82bN7f9rDp16qS6devq0Ucf1fbt21WsWDHbv9G33npL9evXv+YYuZ/b0q1bNzVr1kyff/65vvvuO7311lsaO3asFi9ebFs/VVjy874YOHCgOnXqpC+++ELLly/XK6+8ori4OK1evVp33nmn7Vw//fTTawYOd3f3fI+FooWgAlPp2LGjPvjgA23atEnh4eF/23fJkiXKysrSV199ZfcXX+4Ucq7cGY29e/fmmQUoiGrVqmnXrl1q3br13/6yDgoKUk5Ojo4cOWI3i3Lw4MF8H+uhhx5Sv379bJd/Dh06pNjY2Dz9SpcuraioKEVFRSkzM1PNmzfXyJEjHQoqTZs2VeXKlbV27VqNHTv2hud1+PBh21/g0tU7ldLS0hQUFGTrJ139S/rPf/WfOnUqz5R+tWrVlJmZ6bSgsHjxYl28eFHx8fF2oU66+vr/97//1YYNG9S0aVOnHC8/SpYsqREjRigqKkrz589Xjx49bP82fX1983XuFSpUUP/+/dW/f3+lpKSoQYMGevPNN+2CyokTJ3Tu3Dm7WZVDhw5Jkm0halBQkHbv3q2cnBy7WZUDBw7Y9hdEtWrVNGTIEA0ZMkSHDx9W/fr1NX78eM2aNct2rmXLls3Xuf7dWChaWKMCUxk2bJi8vb3Vt29f/f7773n2HzlyRP/73/8k/d9feX/+qy49PV3Tp0+3e859990nHx8fxcXF6eLFi3b7/vzc/OrWrZt+/fVXffjhh3n2XbhwQefOnZMk2y+PSZMm2fVx5A4Wf39/tWvXTvPnz9fcuXPl6emphx56yK7PH3/8YbddsmRJhYSE5LlV+EYsFosmTZqkESNG6PHHH79uv/bt20vKex65M0y5dzW1adNGHh4eevfdd+1e52udf7du3bRp0yYtX748z760tDRduXLFoXOZNWuWqlatqmeeeUaPPPKI3eOFF15QyZIlNXv2bIfGdIZHH31UlSpVsgXBhg0bqlq1anr77beVmZmZp3/ubdHZ2dm2NVe5ypYtq4oVK+b5OV+5ckXvv/++bfvSpUt6//33FRAQoIYNG0q6+jM8efKk3fqnK1eu6N1331XJkiXVokULh87r/Pnzed5b1apVk4+Pj62+du3aydfXV6NHj9bly5eve675GQtFCzMqMJVq1appzpw56t69u2rVqmX3ybQbN2603UIpXQ0gnp6e6tSpk/r166fMzEx9+OGHKlu2rH777TfbmL6+vnrnnXfUt29fNWrUSL169VKpUqW0a9cunT9/Ps9nR9zI448/rvnz5+uZZ57RmjVrFBERoezsbB04cEDz58/X8uXLddddd6l+/frq2bOn3nvvPaWnp6tJkyZatWqVbb1EfnXv3l2PPfaY3nvvPbVr1y7PrbKhoaFq2bKlGjZsqNKlS2vbtm2221gd1blzZ3Xu3Plv+4SFhSkyMlIffPCB0tLS1KJFC23ZskUzZ87UQw89pFatWkm6uhblhRdeUFxcnDp27Kj27dtr586dWrp0aZ5ZjqFDh+qrr75Sx44d1bt3bzVs2FDnzp3Tnj17tHDhQiUnJ+d5zvWcOHFCa9asybOIOZfValW7du20YMECTZo0qUAf5FZQHh4eGjBggIYOHaply5bp/vvv10cffaQHHnhAtWvXVlRUlG6//Xb9+uuvWrNmjXx9fbVkyRKdPXtWlSpV0iOPPKKwsDCVLFlSK1eu1NatWzV+/Hi7Y1SsWFFjx45VcnKyatSooXnz5ikhIUEffPCB7Vyffvppvf/+++rdu7e2b9+u4OBgLVy4UBs2bNDEiRMdWtAuXZ2xad26tbp166bQ0FC5u7vr888/1++//64ePXpIuvo+jI+P1+OPP64GDRqoR48eCggI0M8//6xvvvlGERERmjx5cr7GQhHjyluOgOs5dOiQ8dRTTxnBwcGGp6en4ePjY0RERBjvvvuucfHiRVu/r776yqhXr55RvHhxIzg42Bg7dqwxbdo0Q5Jx7NgxuzG/+uoro0mTJoaXl5fh6+trNG7c2Pjss89s+/96G26uyMhIIygoyK7t0qVLxtixY43atWsbVqvVKFWqlNGwYUNj1KhRRnp6uq3fhQsXjJiYGKNMmTKGt7e30alTJ+P48eP5vsXWMAwjIyPD8PLyynM7aa433njDaNy4seHv7294eXkZNWvWNN58803brajX8+fbk//OtV6Xy5cvG6NGjTKqVKlieHh4GIGBgUZsbKzdz8YwDCM7O9sYNWqUUaFCBcPLy8to2bKlsXfvXiMoKMju9mTDuHr7amxsrBESEmJ4enoat912m9GkSRPj7bfftjuXG7124
8ePNyQZq1atum6fGTNmGJKML7/88rrneC2O3p586tSpPPvS09MNPz8/o0WLFra2nTt3Gl26dDHKlCljWK1WIygoyOjWrZvtHLKysoyhQ4caYWFhho+Pj+Ht7W2EhYUZ7733nt3Yueexbds2Izw83ChevLgRFBRkTJ48OU8dv//+uxEVFWXcdttthqenp1G3bl1j+vTpdn1yb0++1q3Cf/45nD592oiOjjZq1qxpeHt7G35+fsbdd99tzJ8/P8/z1qxZY7Rr187w8/MzihcvblSrVs3o3bu3sW3bNofHQtFgMYwCzH0DAEynZcuWOn36dL4XpAP/BqxRAQAApkVQAQAApkVQAQAApsUaFQAAYFrMqAAAANMiqAAAANP6V3/gW05Ojk6cOCEfHx+nfu8IAAAoPIZh6OzZs6pYsWKeL8f8q391UDlx4oRTvwAOAADcPMePH1elSpX+ts+/Oqjkfszz8ePHbV8RDgAAzC0jI0OBgYH5+rqGf3VQyb3c4+vrS1ABAOBfJj/LNlhMCwAATIugAgAATIugAgAATIugAgAATIugAgAATIugAgAATIugAgAATIugAgAATIugAgAATIugAgAATMvlQeXXX3/VY489pjJlysjLy0t169bVtm3bXF0WAAAwAZd+109qaqoiIiLUqlUrLV26VAEBATp8+LBKlSrlyrIAAIBJuDSojB07VoGBgZo+fbqtrUqVKi6sCAAAmIlLL/189dVXuuuuu9S1a1eVLVtWd955pz788MPr9s/KylJGRobdAwAA3LpcOqNy9OhRxcfHa/DgwXrppZe0detWxcTEyNPTU5GRkXn6x8XFadSoUS6oFMCtKnj4N64uATC15DEdXHp8i2EYhqsO7unpqbvuuksbN260tcXExGjr1q3atGlTnv5ZWVnKysqybWdkZCgwMFDp6eny9fW9KTUDuLUQVIC/VxhBJSMjQ35+fvn6/e3SSz8VKlRQaGioXVutWrX0888/X7O/1WqVr6+v3QMAANy6XBpUIiIidPDgQbu2Q4cOKSgoyEUVAQAAM3FpUBk0aJB+/PFHjR49WklJSZozZ44++OADRUdHu7IsAABgEi4NKo0aNdLnn3+uzz77THXq1NHrr7+uiRMn6tFHH3VlWQAAwCRcetePJHXs2FEdO3Z0dRkAAMCEXP4R+gAAANdDUAEAAKZFUAEAAKZFUAEAAKZFUAEAAKZFUAEAAKZFUAEAAKZFUAEAAKZFUAEAAKZFUAEAAKZFUAEAAKZFUAEAAKZFUAEAAKZFUAEAAKZFUAEAAKZFUAEAAKZFUAEAAKZFUAEAAKZFUAEAAKZFUAEAAKZFUAEAAKZFUAEAAKZFUAEAAKZFUAEAAKZFUAEAAKZFUAEAAKZFUAEAAKZFUAEAAKZFUAEAAKZFUAEAAKZFUAEAAKZFUAEAAKZFUAEAAKZFUAEAAKZFUAEAAKZFUAEAAKZFUAEAAKZFUAEAAKZFUAEAAKZFUAEAAKZFUAEAAKZFUAEAAKZFUAEAAKZFUAEAAKZFUAEAAKZFUAEAAKZFUAEAAKZFUAEAAKbl0qAycuRIWSwWu0fNmjVdWRIAADARd1cXULt2ba1cudK27e7u8pIAAIBJuDwVuLu7q3z58q4uAwAAmJDL16gcPnxYFStWVNWqVfXoo4/q559/vm7frKwsZWRk2D0AAMCty6VB5e6779aMGTO0bNkyxcfH69ixY2rWrJnOnj17zf5xcXHy8/OzPQIDA29yxQAA4GayGIZhuLqIXGlpaQoKCtKECRPUp0+fPPuzsrKUlZVl287IyFBgYKDS09Pl6+t7M0sFcIsIHv6Nq0sATC15TAenj5mRkSE/P798/f52+RqVP/P391eNGjWUlJR0zf1Wq1VWq/UmVwUAAFzF5WtU/iwzM1NHjhxRhQoVXF0KAAAwAZcGlRdeeEHff/+9kpOTtXHjRv3nP/+Rm5ubevbs6cqyAACASbj00s8vv/yinj176o8//lBAQICaNm2qH3/8UQEBAa4sCwAAmIRLg8rcuXNdeXgAAGByplqjAgAA8GcEFQAAYFoEFQAAYFoEFQAAYFoEFQAAYFoEFQAAYFoEFQAAYFoEFQAAYFoEFQAAYFoEFQAAYFoEFQAAYFoEFQAAYFoEFQAAYFoEFQAAYFoEFQAAYFoEFQAAYFoEFQAAYFoEFQAAYFoEFQAAYFoEFQAAYFoEFQAAYFoEFQAAYFoEFQAAYFoEFQAAYFoEFQAAYFoEFQAAYFoEFQAAYFoEFQAAYFoOB5Vly5Zp/fr1tu0pU6aofv366tWrl1JTU51aHAAAKNocDipDhw5VRkaGJGnPnj0aMmSI2rdvr2PHjmnw4MFOLxAAABRd7o4+4dixYwoNDZUkLVq0SB07dtTo0aO1Y8cOtW/f3ukFAgCAosvhGRVPT0+dP39ekrRy5Urdd999kqTSpUvbZloAAACcweEZlaZNm2rw4MGKiIjQli1bNG/ePEnSoUOHVKlSJacXCAAAii6HZ1QmT54sd3d3LVy4UPHx8br99tslSUuXLtX999/v9AIBAEDR5fCMSuXKlfX111/naX/nnXecUhAAAECuAn2OypEjR/Tf//5XPXv2VEpKiqSrMyr79u1zanEAAKBocziofP/996pbt642b96sxYsXKzMzU5K0a9cujRgxwukFAgCAosvhoDJ8+HC98cYbWrFihTw9PW3t9957r3788UenFgcAAIo2h4PKnj179J///CdPe9myZXX69GmnFAUAACAVIKj4+/vrt99+y9O+c+dO2x1AAAAAzuBwUOnRo4defPFFnTx5UhaLRTk5OdqwYYNeeOEFPfHEE4VRIwAAKKIcDiqjR49WzZo1FRgYqMzMTIWGhqp58+Zq0qSJ/vvf/xZGjQAAoIhy+HNUPD099eGHH+rVV1/Vnj17lJmZqTvvvFPVq1cvjPoAAEAR5nBQyRUYGKjAwEBlZ2drz549Sk1NValSpZxZGwAAKOIcvvQzcOBAffzxx5Kk7OxstWjRQg0aNFBgYKDWrl3r7PoAAEAR5nBQWbhwocLCwiRJS5Ys0dGjR3XgwAENGjRIL7/8stMLBAAARZfDQeX06dMqX768JOnbb79Vt27dVKNGDT355JPas2eP0wsEAABFl8NBpVy5ctq/f7+ys7O1bNkytW3bVpJ0/vx5ubm5Ob1AAABQdDkcVKKiotStWzfVqVNHFotFbdq0kSRt3rxZNWvWLHAhY8aMkcVi0cCBAws8BgAAuLU4fNfPyJEjVadOHR0/flxdu3aV1WqVJLm5uWn48OEFKmLr1q16//33Va9evQI9HwAA3JoKdHvyI488kqctMjKyQAVkZmbq0Ucf1Ycffqg33nijQGMAAIBbU4GCyqpVq7Rq1SqlpKQoJyfHbt+0adMcGis6OlodOnRQmzZtbhhUsrKylJWVZdvOyMhw6FgAAODfxeGgMmrUKL322mu66667VKFCBVkslgIffO7cudqxY4e2bt2ar/5xcXEaNWpUgY8HAAD+XRwOKlOnTtWMGTP0+OOP/6MDHz9+XAMGDNCKFStUvHjx
fD0nNjZWgwcPtm1nZGQoMDDwH9UBAADMy+GgcunSJTVp0uQfH3j79u1KSUlRgwYNbG3Z2dlat26dJk+erKysrDy3O1utVtviXQAAcOtz+Pbkvn37as6cOf/4wK1bt9aePXuUkJBge9x111169NFHlZCQwGeyAAAAx2dULl68qA8++EArV65UvXr15OHhYbd/woQJ+RrHx8dHderUsWvz9vZWmTJl8rQDAICiyeGgsnv3btWvX1+StHfvXrt9/2RhLQAAwF85HFTWrFlTGHVIEt++DAAA7Di8RuXPfvnlF/3yyy/OqgUAAMCOw0ElJydHr732mvz8/BQUFKSgoCD5+/vr9ddfz/PhbwAAAP+Ew5d+Xn75ZX388ccaM2aMIiIiJEnr16/XyJEjdfHiRb355ptOLxIAABRNDgeVmTNn6qOPPtKDDz5oa6tXr55uv/129e/fn6ACAACcxuFLP2fOnFHNmjXztNesWVNnzpxxSlEAAABSAYJKWFiYJk+enKd98uTJCgsLc0pRAAAAUgEu/YwbN04dOnTQypUrFR4eLknatGmTjh8/rm+//dbpBQIAgKLL4RmVFi1a6NChQ/rPf/6jtLQ0paWlqUuXLjp48KCaNWtWGDUCAIAiyuEZFUmqWLEii2YBAEChK1BQSU1N1ccff6zExERJUmhoqKKiolS6dGmnFgcAAIo2hy/9rFu3TsHBwZo0aZJSU1OVmpqqSZMmqUqVKlq3bl1h1AgAAIooh2dUoqOj1b17d8XHx8vNzU2SlJ2drf79+ys6Olp79uxxepEAAKBocnhGJSkpSUOGDLGFFElyc3PT4MGDlZSU5NTiAABA0eZwUGnQoIFtbcqfJSYm8jkqAADAqRy+9BMTE6MBAwYoKSlJ99xzjyTpxx9/1JQpUzRmzBjt3r3b1rdevXrOqxQAABQ5FsMwDEeeUKzY30/CWCwWGYYhi8Wi7Ozsf1TcjWRkZMjPz0/p6eny9fUt1GMBuDUFD//G1SUAppY8poPTx3Tk97fDMyrHjh0rcGEAAACOcDioBAUFFUYdAAAAeTi8mHbmzJn65pv/myodNmyY/P391aRJE/30009OLQ4AABRtDgeV0aNHy8vLS9LVLyOcPHmyxo0bp9tuu02DBg1yeoEAAKDocvjSz/HjxxUSEiJJ+uKLL/TII4/o6aefVkREhFq2bOns+gAAQBHm8IxKyZIl9ccff0iSvvvuO7Vt21aSVLx4cV24cMG51QEAgCLN4RmVtm3bqm/fvrrzzjt16NAhtW/fXpK0b98+BQcHO7s+AABQhDk8ozJlyhSFh4fr1KlTWrRokcqUKSNJ2r59u3r27On0AgEAQNHl8IyKv7+/Jk+enKd91KhRTikIAAAgl8MzKpL0ww8/6LHHHlOTJk3066+/SpI+/fRTrV+/3qnFAQCAos3hoLJo0SK1a9dOXl5e2rFjh7KysiRJ6enpGj16tNMLBAAARZfDQeWNN97Q1KlT9eGHH8rDw8PWHhERoR07dji1OAAAULQ5HFQOHjyo5s2b52n38/NTWlqaM2oCAACQVICgUr58eSUlJeVpX79+vapWreqUogAAAKQCBJWnnnpKAwYM0ObNm2WxWHTixAnNnj1bL7zwgp599tnCqBEAABRRDt+ePHz4cOXk5Kh169Y6f/68mjdvLqvVqhdeeEHPP/98YdQIAACKKIeDisVi0csvv6yhQ4cqKSlJmZmZCg0NVcmSJXXhwgXbFxYCAAD8UwX6HBVJ8vT0VGhoqBo3biwPDw9NmDBBVapUcWZtAACgiMt3UMnKylJsbKzuuusuNWnSRF988YUkafr06apSpYreeecdDRo0qLDqBAAARVC+L/28+uqrev/999WmTRtt3LhRXbt2VVRUlH788UdNmDBBXbt2lZubW2HWCgAAiph8B5UFCxbok08+0YMPPqi9e/eqXr16unLlinbt2iWLxVKYNQIAgCIq35d+fvnlFzVs2FCSVKdOHVmtVg0aNIiQAgAACk2+g0p2drY8PT1t2+7u7ipZsmShFAUAACA5cOnHMAz17t1bVqtVknTx4kU988wz8vb2tuu3ePFi51YIAACKrHwHlcjISLvtxx57zOnFAAAA/Fm+g8r06dMLsw4AAIA8CvyBbwAAAIWNoAIAAEyLoAIAAEyLoAIAAEwrX0GlQYMGSk1NlSS99tprOn/+fKEWBQAAIOUzqCQmJurcuXOSpFGjRikzM9MpB4+Pj1e9evXk6+srX19fhYeHa+nSpU4ZGwAA/Pvl6/bk+vXrKyoqSk2bNpVhGHr77bev+6m0r776ar4PXqlSJY0ZM0bVq1eXYRiaOXOmOnfurJ07d6p27dr5HgcAANyaLIZhGDfqdPDgQY0YMUJHjhzRjh07FBoaKnf3vBnHYrFox44d/6ig0qVL66233lKfPn1u2DcjI0N+fn5KT0+Xr6/vPzougKIpePg3ri4BMLXkMR2cPqYjv7/zNaNyxx13aO7cuZKkYsWKadWqVSpbtuw/r/RPsrOztWDBAp07d07h4eHX7JOVlaWsrCzbdkZGhlNrAAAA5pLvT6bNlZOT49QC9uzZo/DwcF28eFElS5bU559/rtDQ0Gv2jYuL06hRo5x6/L/DX1rA9RXGX1kA8FcFuj35yJEjev7559WmTRu1adNGMTExOnLkSIEKuOOOO5SQkKDNmzfr2WefVWRkpPbv33/NvrGxsUpPT7c9jh8/XqBjAgCAfweHg8ry5csVGhqqLVu2qF69eqpXr542b96s2rVra8WKFQ4X4OnpqZCQEDVs2FBxcXEKCwvT//73v2v2tVqttjuEch8AAODW5fCln+HDh2vQoEEaM2ZMnvYXX3xRbdu2/UcF5eTk2K1DAQAARZfDQSUxMVHz58/P0/7kk09q4sSJDo0VGxurBx54QJUrV9bZs2c1Z84crV27VsuXL3e0LAAAcAtyOKgEBAQoISFB1atXt2tPSEhw+E6glJQUPfHEE/rtt9/k5+enevXqafny5f94VgYAANwaHA4qTz31lJ5++mkdPXpUTZo0kSRt2LBBY8eO1eDBgx0a6+OPP3b08AAAoAhxOKi88sor8vHx0fjx4xUbGytJqlixokaOHKmYmBinFwgAAIouh4OKxWLRoEGDNGjQIJ09e1aS5OPj4/TCAAAAHA4qf0ZAAQAAhalAH/gGAABwMxBUAACAaRFUAACAaTkUVC5fvqzWrVvr8OHDhVUPAACAjUNBxcPDQ7t37y6sWgAAAOw4fOnnscce44PaAADATeHw7clXrlzRtGnTtHLlSjVs2FDe3t52+ydMmOC04gAAQNHmcFDZu3evGjRoIEk6dOiQ3T6LxeKcqgAAAFSAoLJmzZrCqAMAACCPAt+enJSUpOXLl+vChQuSJMMwnFYUAACAVICg8scff6h169aqUaOG2rdvr99++02S1KdPHw0ZMsTpBQIAgKLL4aAyaNAgeXh46Oeff1aJEiVs7d27d9eyZcucWhwAACjaHF6j8t1332n58uWqVKmSXXv16tX1008/Oa0wAAAAh2dUzp07ZzeTkuvMmTO
yWq1OKQoAAEAqQFBp1qyZPvnkE9u2xWJRTk6Oxo0bp1atWjm1OAAAULQ5fOln3Lhxat26tbZt26ZLly5p2LBh2rdvn86cOaMNGzYURo0AAKCIcnhGpU6dOjp06JCaNm2qzp0769y5c+rSpYt27typatWqFUaNAACgiHJ4RkWS/Pz89PLLLzu7FgAAADsFCiqpqan6+OOPlZiYKEkKDQ1VVFSUSpcu7dTiAABA0ebwpZ9169YpODhYkyZNUmpqqlJTUzVp0iRVqVJF69atK4waAQBAEeXwjEp0dLS6d++u+Ph4ubm5SZKys7PVv39/RUdHa8+ePU4vEgAAFE0Oz6gkJSVpyJAhtpAiSW5ubho8eLCSkpKcWhwAACjaHA4qDRo0sK1N+bPExESFhYU5pSgAAAApn5d+du/ebfvvmJgYDRgwQElJSbrnnnskST/++KOmTJmiMWPGFE6VAACgSMpXUKlfv74sFosMw7C1DRs2LE+/Xr16qXv37s6rDgAAFGn5CirHjh0r7DoAAADyyFdQCQoKKuw6AAAA8ijQB76dOHFC69evV0pKinJycuz2xcTEOKUwAAAAh4PKjBkz1K9fP3l6eqpMmTKyWCy2fRaLhaACAACcxuGg8sorr+jVV19VbGysihVz+O5mAACAfHM4aZw/f149evQgpAAAgELncNro06ePFixYUBi1AAAA2HH40k9cXJw6duyoZcuWqW7duvLw8LDbP2HCBKcVBwAAirYCBZXly5frjjvukKQ8i2kBAACcxeGgMn78eE2bNk29e/cuhHIAAAD+j8NrVKxWqyIiIgqjFgAAADsOB5UBAwbo3XffLYxaAAAA7Dh86WfLli1avXq1vv76a9WuXTvPYtrFixc7rTgAAFC0ORxU/P391aVLl8KoBQAAwI7DQWX69OmFUQcAAEAefLwsAAAwLYdnVKpUqfK3n5dy9OjRf1QQAABALoeDysCBA+22L1++rJ07d2rZsmUaOnSos+oCAABwPKgMGDDgmu1TpkzRtm3b/nFBAAAAuZy2RuWBBx7QokWLnDUcAACA84LKwoULVbp0aYeeExcXp0aNGsnHx0dly5bVQw89pIMHDzqrJAAA8C/n8KWfO++8024xrWEYOnnypE6dOqX33nvPobG+//57RUdHq1GjRrpy5Ypeeukl3Xfffdq/f7+8vb0dLQ0AANxiHA4qDz30kN12sWLFFBAQoJYtW6pmzZoOjbVs2TK77RkzZqhs2bLavn27mjdv7mhpAADgFuNwUBkxYkRh1CFJSk9Pl6TrXkLKyspSVlaWbTsjI6PQagEAAK5nmg98y8nJ0cCBAxUREaE6depcs09cXJz8/Pxsj8DAwJtcJQAAuJnyHVSKFSsmNze3v324uzs8QWMTHR2tvXv3au7cudftExsbq/T0dNvj+PHjBT4eAAAwv3wni88///y6+zZt2qRJkyYpJyenQEU899xz+vrrr7Vu3TpVqlTpuv2sVqusVmuBjgEAAP598h1UOnfunKft4MGDGj58uJYsWaJHH31Ur732mkMHNwxDzz//vD7//HOtXbtWVapUcej5AADg1lagNSonTpzQU089pbp16+rKlStKSEjQzJkzFRQU5NA40dHRmjVrlubMmSMfHx+dPHlSJ0+e1IULFwpSFgAAuMU4FFTS09P14osvKiQkRPv27dOqVau0ZMmS6y5+vZH4+Hilp6erZcuWqlChgu0xb968Ao0HAABuLfm+9DNu3DiNHTtW5cuX12effXbNS0GOMgzjH48BAABuXfkOKsOHD5eXl5dCQkI0c+ZMzZw585r9Fi9e7LTiAABA0ZbvoPLEE0/YfXQ+AABAYct3UJkxY0YhlgEAAJCXaT6ZFgAA4K8IKgAAwLQIKgAAwLQIKgAAwLQIKgAAwLQIKgAAwLQIKgAAwLQIKgAAwLQIKgAAwLQIKgAAwLQIKgAAwLQIKgAAwLQIKgAAwLQIKgAAwLQIKgAAwLQIKgAAwLQIKgAAwLQIKgAAwLQIKgAAwLQIKgAAwLQIKgAAwLQIKgAAwLQIKgAAwLQIKgAAwLQIKgAAwLQIKgAAwLQIKgAAwLQIKgAAwLQIKgAAwLQIKgAAwLQIKgAAwLQIKgAAwLQIKgAAwLQIKgAAwLQIKgAAwLQIKgAAwLQIKgAAwLQIKgAAwLQIKgAAwLQIKgAAwLQIKgAAwLQIKgAAwLQIKgAAwLQIKgAAwLQIKgAAwLRcGlTWrVunTp06qWLFirJYLPriiy9cWQ4AADAZlwaVc+fOKSwsTFOmTHFlGQAAwKTcXXnwBx54QA888IArSwAAACbm0qDiqKysLGVlZdm2MzIyXFgNAAAobP+qxbRxcXHy8/OzPQIDA11dEgAAKET/qqASGxur9PR02+P48eOuLgkAABSif9WlH6vVKqvV6uoyAADATfKvmlEBAABFi0tnVDIzM5WUlGTbPnbsmBISElS6dGlVrlzZhZUBAAAzcGlQ2bZtm1q1amXbHjx4sCQpMjJSM2bMcFFVAADALFwaVFq2bCnDMFxZAgAAMDHWqAAAANMiqAAAANMiqAAAANMiqAAAANMiqAAAANMiqAAAANMiqAAAANMiqAAAANMiqAAAANMiqAAAANMiqAAAANMiqAAAANMiqAAAANMiqAAAANMiqAAAANMiqAAAANMiqAAAANMiqAAAANMiqAAAANMiqAAAANMiqAAAANMiqAAAANMiqAAAANMiqAAAANMiqAAAANMiqAAAANMiqAAAANMiqAAAANMiqAAAANMiqAAAANMiqAAAANMiqAAAANMiqAAAANMiqAAAANMiqAAAANMiqAAAANMiqAAAANMiqAAAANMiqAAAANMiqAAAANMiqAAAANMiqAAAANMiqAAAANMiqAAAANMiqAAAANMiqAAAANMiqAAAANMyRVCZMmWKgoODVbx4cd19993asmWLq0sCAAAm4PKgMm/ePA0ePFgjRozQjh07FBYWpnbt2iklJcXVpQEAABdzeVCZMGGCnnrqKUVFRSk0NFRTp05ViRIlNG3aNFeXBgAAXMzdlQe/dOmStm/frtjYWFtbsWLF1KZNG23atClP/6ysLGVlZdm209PTJUkZGRmFUl9O1vlCGRe4FRTW++5m430O/L3CeK/njmkYxg37ujSonD59WtnZ2SpXrpxde7ly5XTgwIE8/ePi4jRq1Kg87YGBgYVWI4Br85vo6goA3AyF+V4/e/as/Pz8/raPS4OKo2JjYzV48GDbdk5Ojs6cOaMyZcrIYrG4sDIUtoyMDAUGBur48ePy9fV1dTkACgHv86LDMAydPXtWFStWvGFflwaV2267TW5ubvr999/t2n///XeVL18+T3+r1Sqr1WrX5u/vX5glwmR8fX35Hxhwi+N9XjTcaCYll0sX03p6eqphw4ZatWqVrS0nJ0erVq1SeHi4CysDAABm4PJLP4MHD1ZkZKTuuusuNW7cWBMnTtS5c+cUFRXl6tIAAICLuTyodO/eXadOndKrr76qkydPqn79+lq2bFmeBbYo2qxWq0aMGJHn0h+AWwfvc1
yLxcjPvUEAAAAu4PIPfAMAALgeggoAADAtggoAADAtggoAADAtggr+VVq2bKmBAwc6fdyRI0eqfv36Th8XwD+3du1aWSwWpaWl5fs5wcHBmjhxYqHVhJuHoAKnOnnypJ5//nlVrVpVVqtVgYGB6tSpk92H+gG4tfTu3VsWi0XPPPNMnn3R0dGyWCzq3bv3zS8MtwSCCpwmOTlZDRs21OrVq/XWW29pz549WrZsmVq1aqXo6GhXlwegEAUGBmru3Lm6cOGCre3ixYuaM2eOKleu7MLK8G9HUIHT9O/fXxaLRVu2bNHDDz+sGjVqqHbt2ho8eLB+/PFHSdKECRNUt25deXt7KzAwUP3791dmZqbdOBs2bFDLli1VokQJlSpVSu3atVNqaqptf05OjoYNG6bSpUurfPnyGjlypN3z09LS1LdvXwUEBMjX11f33nuvdu3aZddnzJgxKleunHx8fNSnTx9dvHixcF4UoIho0KCBAgMDtXjxYlvb4sWLVblyZd155522tqysLMXExKhs2bIqXry4mjZtqq1bt9qN9e2336pGjRry8vJSq1atlJycnOd469evV7NmzeTl5aXAwEDFxMTo3LlzhXZ+cB2CCpzizJkzWrZsmaKjo+Xt7Z1nf+6XRxYrVkyTJk3Svn37NHPmTK1evVrDhg2z9UtISFDr1q0VGhqqTZs2af369erUqZOys7NtfWbOnClvb29t3rxZ48aN02uvvaYVK1bY9nft2lUpKSlaunSptm/frgYNGqh169Y6c+aMJGn+/PkaOXKkRo8erW3btqlChQp67733CumVAYqOJ598UtOnT7dtT5s2Lc/XoQwbNkyLFi3SzJkztWPHDoWEhKhdu3a29+fx48fVpUsXderUSQkJCerbt6+GDx9uN8aRI0d0//336+GHH9bu3bs1b948rV+/Xs8991zhnyRuPgNwgs2bNxuSjMWLFzv0vAULFhhlypSxbffs2dOIiIi4bv8WLVoYTZs2tWtr1KiR8eKLLxqGYRg//PCD4evra1y8eNGuT7Vq1Yz333/fMAzDCA8PN/r372+3/+677zbCwsIcqh3AVZGRkUbnzp2NlJQUw2q1GsnJyUZycrJRvHhx49SpU0bnzp2NyMhIIzMz0/Dw8DBmz55te+6lS5eMihUrGuPGjTMMwzBiY2ON0NBQu/FffPFFQ5KRmppqGIZh9OnTx3j66aft+vzwww9GsWLFjAsXLhiGYRhBQUHGO++8U3gnjZvG5d/1g1uDkc9vYli5cqXi4uJ04MABZWRk6MqVK7p48aLOnz+vEiVKKCEhQV27dv3bMerVq2e3XaFCBaWkpEiSdu3apczMTJUpU8auz4ULF3TkyBFJUmJiYp5Ff+Hh4VqzZk2+zgHAtQUEBKhDhw6aMWOGDMNQhw4ddNttt9n2HzlyRJcvX1ZERIStzcPDQ40bN1ZiYqKkq+/Pu+++227c8PBwu+1du3Zp9+7dmj17tq3NMAzl5OTo2LFjqlWrVmGcHlyEoAKnqF69uiwWiw4cOHDdPsnJyerYsaOeffZZvfnmmypdurTWr1+vPn366NKlSypRooS8vLxueCwPDw+7bYvFopycHElSZmamKlSooLVr1+Z5Xu7lJwCF58knn7RdgpkyZUqhHCMzM1P9+vVTTExMnn0s3L31sEYFTlG6dGm1a9dOU6ZMueaCtrS0NG3fvl05OTkaP3687rnnHtWoUUMnTpyw61evXr1/dCtzgwYNdPLkSbm7uyskJMTukfuXXa1atbR582a75+Uu9gXwz9x///26dOmSLl++rHbt2tntq1atmjw9PbVhwwZb2+XLl7V161aFhoZKuvr+3LJli93z/vr+bNCggfbv35/nPR4SEiJPT89COjO4CkEFTjNlyhRlZ2ercePGWrRokQ4fPqzExERNmjRJ4eHhCgkJ0eXLl/Xuu+/q6NGj+vTTTzV16lS7MWJjY7V161b1799fu3fv1oEDBxQfH6/Tp0/nq4Y2bdooPDxcDz30kL777jslJydr48aNevnll7Vt2zZJ0oABAzRt2jRNnz5dhw4d0ogRI7Rv3z6nvx5AUeTm5qbExETt379fbm5udvu8vb317LPPaujQoVq2bJn279+vp556SufPn1efPn0kSc8884wOHz6soUOH6uDBg5ozZ45mzJhhN86LL76ojRs36rnnnlNCQoIOHz6sL7/8ksW0tyiCCpymatWq2rFjh1q1aqUhQ4aoTp06atu2rVatWqX4+HiFhYVpwoQJGjt2rOrUqaPZs2crLi7ObowaNWrou+++065du9S4cWOFh4fryy+/lLt7/q5SWiwWffvtt2revLmioqJUo0YN9ejRQz/99JPKlSsnSerevbteeeUVDRs2TA0bNtRPP/2kZ5991umvB1BU+fr6ytfX95r7xowZo4cffliPP/64GjRooKSkJC1fvlylSpWSdPXSzaJFi/TFF18oLCxMU6dO1ejRo+3GqFevnr7//nsdOnRIzZo105133qlXX31VFStWLPRzw81nMfK7ChIAAOAmY0YFAACYFkEFAACYFkEFAACYFkEFAACYFkEFAACYFkEFAACYFkEFAACYFkEFAACYFkEFAACYFkEFAACYFkEFAACY1v8D2kVLzIqfIssAAAAASUVORK5CYII=", - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Cached Responses: 3\n", - "Model Responses: 6\n" - ] - } - ], - "source": [ - "import os\n", - "import time\n", - "import matplotlib.pyplot as plt\n", - "\n", - "\n", - "# List of example user messages\n", - "user_messages = [\n", - " \"Hello, what's the weather in San Francisco??\",\n", - " \"what's the weather in San Francisco??\"\n", - " \"Can you tell me about the latest news?\",\n", - " \"What's the capital of France?\",\n", - " \"How does photosynthesis work?\",\n", - " \"capital of france?\",\n", - " \"tell me a joke\",\n", - " \"tell me a joke right now\"\n", - " \"How do I bake a chocolate cake?\",\n", - " \"What are the benefits of exercise?\",\n", - " \"Tell me a joke!\",\n", - " # Add more questions here\n", - "]\n", - "\n", - "similarity_threshold = 0.5 # Adjust as needed\n", - "\n", - "### Testing / Measuring\n", - "cached_responses = 0\n", - "model_responses = 0\n", - "\n", - "for user_message in user_messages:\n", - " messages = [{\"content\": user_message, \"role\": \"user\"}]\n", - "\n", - " start = time.time()\n", - " response = completion_with_cache(messages=messages, similarity_threshold=similarity_threshold)\n", - " end = time.time()\n", - " response_time = end - start\n", - "\n", - " if response_time < 1: # Assuming cached responses come in less than 1s\n", - " cached_responses += 1\n", - " else:\n", - " model_responses += 1\n", - " print(f\"got response for {user_message}\")\n", - "\n", - "# Plotting\n", - "response_types = [\"Cached\", \"Model\"]\n", - "response_counts = [cached_responses, model_responses]\n", - "\n", - "fig, ax = plt.subplots()\n", - "ax.bar(response_types, response_counts)\n", - "ax.set_ylabel(\"Number of Responses\")\n", - "ax.set_title(\"Cached vs Model API Responses\")\n", - "plt.show()\n", - "\n", - "print(f\"Cached Responses: {cached_responses}\")\n", - "print(f\"Model Responses: {model_responses}\")" - ] - } - ], - "metadata": { - "colab": { - "provenance": [] - }, - "kernelspec": { - "display_name": "Python 3", - "name": "python3" - }, - "language_info": { - "name": "python" - } - }, - "nbformat": 4, - "nbformat_minor": 0 -} diff --git a/dist/litellm-0.13.1.dev1-py3-none-any.whl b/dist/litellm-0.13.1.dev1-py3-none-any.whl new file mode 100644 index 000000000..826c660f9 Binary files /dev/null and b/dist/litellm-0.13.1.dev1-py3-none-any.whl differ diff --git a/dist/litellm-0.13.1.dev1.tar.gz b/dist/litellm-0.13.1.dev1.tar.gz new file mode 100644 index 000000000..82c2431c2 Binary files /dev/null and b/dist/litellm-0.13.1.dev1.tar.gz differ diff --git a/dist/litellm-0.13.1.dev2-py3-none-any.whl b/dist/litellm-0.13.1.dev2-py3-none-any.whl new file mode 100644 index 000000000..09854d802 Binary files /dev/null and b/dist/litellm-0.13.1.dev2-py3-none-any.whl differ diff --git a/dist/litellm-0.13.1.dev2.tar.gz b/dist/litellm-0.13.1.dev2.tar.gz new file mode 100644 index 000000000..db14e1297 Binary files /dev/null and b/dist/litellm-0.13.1.dev2.tar.gz differ diff --git a/dist/litellm-0.13.1.dev3-py3-none-any.whl b/dist/litellm-0.13.1.dev3-py3-none-any.whl new file mode 100644 index 000000000..9b941bd4a Binary files /dev/null and b/dist/litellm-0.13.1.dev3-py3-none-any.whl differ diff --git a/dist/litellm-0.13.1.dev3.tar.gz b/dist/litellm-0.13.1.dev3.tar.gz new file mode 100644 index 000000000..64a730d0a Binary files /dev/null and b/dist/litellm-0.13.1.dev3.tar.gz differ diff --git 
a/docs/my-website/docs/caching/local_caching.md b/docs/my-website/docs/caching/local_caching.md index 56425337b..d0e26e4bf 100644 --- a/docs/my-website/docs/caching/local_caching.md +++ b/docs/my-website/docs/caching/local_caching.md @@ -6,7 +6,6 @@ liteLLM implements exact match caching and supports the following Caching: * In-Memory Caching [Default] * Redis Caching Local * Redis Caching Hosted -* GPTCache ## Quick Start Usage - Completion Caching - cache diff --git a/docs/my-website/docs/completion/function_call.md b/docs/my-website/docs/completion/function_call.md index 76d8d0193..969441a03 100644 --- a/docs/my-website/docs/completion/function_call.md +++ b/docs/my-website/docs/completion/function_call.md @@ -1,7 +1,7 @@ # Function Calling -LiteLLM only supports: OpenAI gpt-4-0613 and gpt-3.5-turbo-0613 for function calling + ## Quick Start -This is exactly how OpenAI supports function calling for gpt-4-0613 and gpt-3.5-turbo-0613 + ```python import os, litellm from litellm import completion @@ -128,7 +128,6 @@ print(response) ``` ## Function calling for Non-OpenAI LLMs -**For Non OpenAI LLMs - LiteLLM raises an exception if you try using it for function calling** ### Adding Function to prompt For Non OpenAI LLMs LiteLLM allows you to add the function to the prompt set: `litellm.add_function_to_prompt = True` diff --git a/docs/my-website/docs/completion/input.md b/docs/my-website/docs/completion/input.md index 88b2f0f3e..6f6cfb1e2 100644 --- a/docs/my-website/docs/completion/input.md +++ b/docs/my-website/docs/completion/input.md @@ -58,6 +58,108 @@ To drop the param instead, set `litellm.drop_params = True`. Add to prompt for non-openai models, set: `litellm.add_function_to_prompt = True`. ::: +## Input Params + +```python +def completion( + model: str, + messages: List = [], + # Optional OpenAI params + functions: List = [], + function_call: str = "", # optional params + temperature: Optional[float] = None, + top_p: Optional[float] = None, + n: Optional[int] = None, + stream: Optional[bool] = None, + stop=None, + max_tokens: Optional[float] = None, + presence_penalty: Optional[float] = None, + frequency_penalty: Optional[float]=None, + logit_bias: dict = {}, + user: str = "", + deployment_id = None, + request_timeout: Optional[int] = None, + + # Optional LiteLLM params + api_base: Optional[str] = None, + api_version: Optional[str] = None, + api_key: Optional[str] = None, + num_retries: Optional[int] = None, # set to retry a model if an APIError, TimeoutError, or ServiceUnavailableError occurs + context_window_fallback_dict: Optional[dict] = None, # mapping of model to use if call fails due to context window error + fallbacks: Optional[list] = None, # pass in a list of api_base,keys, etc. + metadata: Optional[dict] = None # additional call metadata, passed to logging integrations / custom callbacks + + + **kwargs, +) -> ModelResponse: +``` +### Required Fields + +- `model`: *string* - ID of the model to use. Refer to the model endpoint compatibility table for details on which models work with the Chat API. + +- `messages`: *array* - A list of messages comprising the conversation so far. + +#### Properties of `messages` +*Note* - Each message in the array contains the following properties: + +- `role`: *string* - The role of the message's author. Roles can be: system, user, assistant, or function. + +- `content`: *string or null* - The contents of the message. It is required for all messages, but may be null for assistant messages with function calls. 
+ +- `name`: *string (optional)* - The name of the author of the message. It is required if the role is "function". The name should match the name of the function represented in the content. It can contain characters (a-z, A-Z, 0-9), and underscores, with a maximum length of 64 characters. + +- `function_call`: *object (optional)* - The name and arguments of a function that should be called, as generated by the model. + + + +### Optional Fields + +- `functions`: *array* - A list of functions that the model may use to generate JSON inputs. Each function should have the following properties: + + - `name`: *string* - The name of the function to be called. It should contain a-z, A-Z, 0-9, underscores and dashes, with a maximum length of 64 characters. + + - `description`: *string (optional)* - A description explaining what the function does. It helps the model to decide when and how to call the function. + + - `parameters`: *object* - The parameters that the function accepts, described as a JSON Schema object. + + - `function_call`: *string or object (optional)* - Controls how the model responds to function calls. + +- `temperature`: *number or null (optional)* - The sampling temperature to be used, between 0 and 2. Higher values like 0.8 produce more random outputs, while lower values like 0.2 make outputs more focused and deterministic. + +- `top_p`: *number or null (optional)* - An alternative to sampling with temperature. It instructs the model to consider the results of the tokens with top_p probability. For example, 0.1 means only the tokens comprising the top 10% probability mass are considered. + +- `n`: *integer or null (optional)* - The number of chat completion choices to generate for each input message. + +- `stream`: *boolean or null (optional)* - If set to true, it sends partial message deltas. Tokens will be sent as they become available, with the stream terminated by a [DONE] message. + +- `stop`: *string/ array/ null (optional)* - Up to 4 sequences where the API will stop generating further tokens. + +- `max_tokens`: *integer (optional)* - The maximum number of tokens to generate in the chat completion. + +- `presence_penalty`: *number or null (optional)* - It is used to penalize new tokens based on their existence in the text so far. + +- `frequency_penalty`: *number or null (optional)* - It is used to penalize new tokens based on their frequency in the text so far. + +- `logit_bias`: *map (optional)* - Used to modify the probability of specific tokens appearing in the completion. + +- `user`: *string (optional)* - A unique identifier representing your end-user. This can help OpenAI to monitor and detect abuse. + +- `request_timeout`: *int (optional)* - Timeout in seconds for completion requests (Defaults to 600 seconds) + +#### litellm-specific params + +- `api_base`: *string (optional)* - The api endpoint you want to call the model with + +- `api_version`: *string (optional)* - (Azure-specific) the api version for the call + +- `num_retries`: *int (optional)* - The number of times to retry the API call if an APIError, TimeoutError or ServiceUnavailableError occurs + +- `context_window_fallback_dict`: *dict (optional)* - A mapping of model to use if call fails due to context window error + +- `fallbacks`: *list (optional)* - A list of model names + params to be used, in case the initial call fails + +- `metadata`: *dict (optional)* - Any additional data you want to be logged when the call is made (sent to logging integrations, eg. 
promptlayer and accessible via custom callback function) + ## Provider-specific Params Providers might offer params not supported by OpenAI (e.g. top_k). You can pass those in 2 ways: - via completion(): We'll pass the non-openai param, straight to the provider as part of the request body. @@ -453,59 +555,3 @@ assert len(response_2_text) > len(response_1_text) [**Check out the tutorial!**](../tutorials/provider_specific_params.md) - -## Input - Request Body -# Request Body - -### Required Fields - -- `model`: *string* - ID of the model to use. Refer to the model endpoint compatibility table for details on which models work with the Chat API. - -- `messages`: *array* - A list of messages comprising the conversation so far. - -#### Properties of `messages` -*Note* - Each message in the array contains the following properties: - -- `role`: *string* - The role of the message's author. Roles can be: system, user, assistant, or function. - -- `content`: *string or null* - The contents of the message. It is required for all messages, but may be null for assistant messages with function calls. - -- `name`: *string (optional)* - The name of the author of the message. It is required if the role is "function". The name should match the name of the function represented in the content. It can contain characters (a-z, A-Z, 0-9), and underscores, with a maximum length of 64 characters. - -- `function_call`: *object (optional)* - The name and arguments of a function that should be called, as generated by the model. - - - -### Optional Fields - -- `functions`: *array* - A list of functions that the model may use to generate JSON inputs. Each function should have the following properties: - - - `name`: *string* - The name of the function to be called. It should contain a-z, A-Z, 0-9, underscores and dashes, with a maximum length of 64 characters. - - - `description`: *string (optional)* - A description explaining what the function does. It helps the model to decide when and how to call the function. - - - `parameters`: *object* - The parameters that the function accepts, described as a JSON Schema object. - - - `function_call`: *string or object (optional)* - Controls how the model responds to function calls. - -- `temperature`: *number or null (optional)* - The sampling temperature to be used, between 0 and 2. Higher values like 0.8 produce more random outputs, while lower values like 0.2 make outputs more focused and deterministic. - -- `top_p`: *number or null (optional)* - An alternative to sampling with temperature. It instructs the model to consider the results of the tokens with top_p probability. For example, 0.1 means only the tokens comprising the top 10% probability mass are considered. - -- `n`: *integer or null (optional)* - The number of chat completion choices to generate for each input message. - -- `stream`: *boolean or null (optional)* - If set to true, it sends partial message deltas. Tokens will be sent as they become available, with the stream terminated by a [DONE] message. - -- `stop`: *string/ array/ null (optional)* - Up to 4 sequences where the API will stop generating further tokens. - -- `max_tokens`: *integer (optional)* - The maximum number of tokens to generate in the chat completion. - -- `presence_penalty`: *number or null (optional)* - It is used to penalize new tokens based on their existence in the text so far. - -- `frequency_penalty`: *number or null (optional)* - It is used to penalize new tokens based on their frequency in the text so far. 
- -- `logit_bias`: *map (optional)* - Used to modify the probability of specific tokens appearing in the completion. - -- `user`: *string (optional)* - A unique identifier representing your end-user. This can help OpenAI to monitor and detect abuse. - -- `request_timeout`: *int (optional)* - Timeout in seconds for completion requests (Defaults to 600 seconds) \ No newline at end of file diff --git a/docs/my-website/docs/completion/message_trimming.md b/docs/my-website/docs/completion/message_trimming.md index e3f0204d8..abb203095 100644 --- a/docs/my-website/docs/completion/message_trimming.md +++ b/docs/my-website/docs/completion/message_trimming.md @@ -31,8 +31,6 @@ The function uses the following parameters: - `model`:[Optional] This is the LiteLLM model being used. This parameter is optional, as you can alternatively specify the `max_tokens` parameter. -- `system_message`:[Optional] This is a string containing an optional system message that will be preserved at the beginning of the conversation. This parameter is optional and set to `None` by default. - - `max_tokens`:[Optional] This is an int, manually set upper limit on messages - `trim_ratio`:[Optional] This represents the target ratio of tokens to use following trimming. It's default value is 0.75, which implies that messages will be trimmed to utilise about 75% \ No newline at end of file diff --git a/docs/my-website/docs/completion/model_alias.md b/docs/my-website/docs/completion/model_alias.md index 5f910a6fe..5fa832649 100644 --- a/docs/my-website/docs/completion/model_alias.md +++ b/docs/my-website/docs/completion/model_alias.md @@ -45,14 +45,9 @@ litellm.model_alias_map = model_alias_map messages = [{ "content": "Hello, how are you?","role": "user"}] -# openai call +# call "gpt-3.5-turbo-16k" response = completion(model="GPT-3.5", messages=messages) -# replicate call +# call replicate/llama-2-70b-chat:2796ee9483c3fd7aa2e171d38f4ca1... response = completion("llama2", messages) ``` - - -# no-code - -If you use litellm client, you can also do this without going into code. [Learn more]("https://docs.litellm.ai/docs/debugging/hosted_debugging") \ No newline at end of file diff --git a/docs/my-website/docs/completion/reliable_completions.md b/docs/my-website/docs/completion/reliable_completions.md index 2b340a5d0..eb64eaf91 100644 --- a/docs/my-website/docs/completion/reliable_completions.md +++ b/docs/my-website/docs/completion/reliable_completions.md @@ -1,62 +1,53 @@ # Reliability + +LiteLLM helps prevent failed requests in 2 ways: +- Retries +- Fallbacks: Context Window + General + ## Helper utils LiteLLM supports the following functions for reliability: * `litellm.longer_context_model_fallback_dict`: Dictionary which has a mapping for those models which have larger equivalents -* `completion_with_retries`: use tenacity retries +* `num_retries`: use tenacity retries * `completion()` with fallbacks: switch between models/keys/api bases in case of errors. -## Context Window Errors - -```python -from litellm import longer_context_model_fallback_dict, ContextWindowExceededError - -sample_text = "how does a court case get to the Supreme Court?" 
* 1000 -messages = [{"content": user_message, "role": "user"}] -model = "gpt-3.5-turbo" -try: - # try the original model - response = completion(model=model, messages=messages) -# catch the context window error -except ContextWindowExceededError as e: - if model in longer_context_model_fallback_dict: - # switch to the equivalent larger model -> gpt.3.5-turbo-16k - new_model = longer_context_model_fallback_dict[model] - response = completion(new_model, messages) - -print(response) -``` - - ## Retry failed requests -You can use this as a drop-in replacement for the `completion()` function to use tenacity retries - by default we retry the call 3 times. +Call it in completion like this `completion(..num_retries=2)`. + Here's a quick look at how you can use it: ```python -from litellm import completion_with_retries +from litellm import completion user_message = "Hello, whats the weather in San Francisco??" messages = [{"content": user_message, "role": "user"}] # normal call -def test_completion_custom_provider_model_name(): - try: - response = completion_with_retries( +response = completion( model="gpt-3.5-turbo", messages=messages, + num_retries=2 ) - # Add any assertions here to check the response - print(response) - except Exception as e: - printf"Error occurred: {e}") ``` -## Switch Models/API Keys/API Bases +## Fallbacks + +### Context Window Fallbacks +```python +from litellm import completion + +fallback_dict = {"gpt-3.5-turbo": "gpt-3.5-turbo-16k"} +messages = [{"content": "how does a court case get to the Supreme Court?" * 500, "role": "user"}] + +completion(model="gpt-3.5-turbo", messages=messages, context_window_fallback_dict=ctx_window_fallback_dict) +``` + +### Fallbacks - Switch Models/API Keys/API Bases LLM APIs can be unstable, completion() with fallbacks ensures you'll always get a response from your calls -### Usage +#### Usage To use fallback models with `completion()`, specify a list of models in the `fallbacks` parameter. The `fallbacks` list should include the primary model you want to use, followed by additional models that can be used as backups in case the primary model fails to provide a response. @@ -76,6 +67,11 @@ response = completion(model="azure/gpt-4", messages=messages, api_key=api_key, fallbacks=[{"api_key": "good-key-1"}, {"api_key": "good-key-2", "api_base": "good-api-base-2"}]) ``` +[Check out this section for implementation details](#fallbacks-1) + +## Implementation Details + +### Fallbacks #### Output from calls ``` Completion with 'bad-model': got exception Unable to map your input to a model. Check your input - {'model': 'bad-model' @@ -112,7 +108,7 @@ completion call gpt-3.5-turbo When you pass `fallbacks` to `completion`, it makes the first `completion` call using the primary model specified as `model` in `completion(model=model)`. If the primary model fails or encounters an error, it automatically tries the `fallbacks` models in the specified order. This ensures a response even if the primary model is unavailable. 
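To make the fallback flow concrete, here is a rough sketch of that loop in plain Python. This is illustrative only; the helper name is made up and litellm already does this internally when you pass `fallbacks`, but it mirrors the model-switching behaviour described above (it does not cover the api_key/api_base fallback dicts):

```python
from litellm import completion

def sketch_completion_with_fallbacks(messages, primary_model, fallback_models):
    # try the primary model first, then each fallback model in the order given
    for model in [primary_model] + fallback_models:
        try:
            return completion(model=model, messages=messages)
        except Exception as e:
            print(f"{model} failed with {e}, trying the next fallback")
    raise Exception("all models in the fallback list failed")

# e.g. sketch_completion_with_fallbacks(messages, "gpt-4", ["gpt-3.5-turbo", "command-nightly"])
```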
-### Key components of Model Fallbacks implementation: +#### Key components of Model Fallbacks implementation: * Looping through `fallbacks` * Cool-Downs for rate-limited models diff --git a/docs/my-website/docs/completion/stream.md b/docs/my-website/docs/completion/stream.md index 6a1afb91c..413076dc9 100644 --- a/docs/my-website/docs/completion/stream.md +++ b/docs/my-website/docs/completion/stream.md @@ -2,11 +2,13 @@ - [Streaming Responses](#streaming-responses) - [Async Completion](#async-completion) +- [Async + Streaming Completion](#async-streaming) ## Streaming Responses LiteLLM supports streaming the model response back by passing `stream=True` as an argument to the completion function ### Usage ```python +from litellm import completion response = completion(model="gpt-3.5-turbo", messages=messages, stream=True) for chunk in response: print(chunk['choices'][0]['delta']) @@ -35,36 +37,22 @@ print(response) We've implemented an `__anext__()` function in the streaming object returned. This enables async iteration over the streaming object. ### Usage -Here's an example of using it with openai. But this +Here's an example of using it with openai. ```python -from litellm import completion -import asyncio, os, traceback, time - -os.environ["OPENAI_API_KEY"] = "your-api-key" - -def logger_fn(model_call_object: dict): - print(f"LOGGER FUNCTION: {model_call_object}") - - -user_message = "Hello, how are you?" -messages = [{"content": user_message, "role": "user"}] +from litellm import acompletion +import asyncio, os, traceback async def completion_call(): try: - response = completion( - model="gpt-3.5-turbo", messages=messages, stream=True, logger_fn=logger_fn + print("test acompletion + streaming") + response = await acompletion( + model="gpt-3.5-turbo", + messages=[{"content": "Hello, how are you?", "role": "user"}], + stream=True ) print(f"response: {response}") - complete_response = "" - start_time = time.time() - # Change for loop to async for loop async for chunk in response: - chunk_time = time.time() - print(f"time since initial request: {chunk_time - start_time:.5f}") - print(chunk["choices"][0]["delta"]) - complete_response += chunk["choices"][0]["delta"].get("content", "") - if complete_response == "": - raise Exception("Empty response received") + print(chunk) except: print(f"error occurred: {traceback.format_exc()}") pass diff --git a/docs/my-website/docs/completion/token_usage.md b/docs/my-website/docs/completion/token_usage.md index 8b1140bad..e3058d791 100644 --- a/docs/my-website/docs/completion/token_usage.md +++ b/docs/my-website/docs/completion/token_usage.md @@ -31,21 +31,10 @@ Encoding has model-specific tokenizers for anthropic, cohere, llama2 and openai. ```python from litellm import encode, decode - -def test_encoding_and_decoding(): - try: - sample_text = "Hellö World, this is my input string!" - - # openai tokenizer - openai_tokens = token_counter(model="gpt-3.5-turbo", text=sample_text) - - openai_text = decode(model="gpt-3.5-turbo", tokens=openai_tokens) - - assert openai_text == sample_text - except: - pass - -test_encoding_and_decoding() +sample_text = "Hellö World, this is my input string!" +# openai encoding + decoding +openai_tokens = encode(model="gpt-3.5-turbo", text=sample_text) +print(openai_tokens) ``` ### 2. `decode` @@ -55,21 +44,11 @@ Decoding is supported for anthropic, cohere, llama2 and openai. ```python from litellm import encode, decode - -def test_encoding_and_decoding(): - try: - sample_text = "Hellö World, this is my input string!" 
- - # openai tokenizer - openai_tokens = token_counter(model="gpt-3.5-turbo", text=sample_text) - - openai_text = decode(model="gpt-3.5-turbo", tokens=openai_tokens) - - assert openai_text == sample_text - except: - pass - -test_encoding_and_decoding() +sample_text = "Hellö World, this is my input string!" +# openai encoding + decoding +openai_tokens = encode(model="gpt-3.5-turbo", text=sample_text) +openai_text = decode(model="gpt-3.5-turbo", tokens=openai_tokens) +print(openai_text) ``` ### 3. `token_counter` diff --git a/docs/my-website/docs/embedding/supported_embedding.md b/docs/my-website/docs/embedding/supported_embedding.md index 2ee71fbd2..47690184a 100644 --- a/docs/my-website/docs/embedding/supported_embedding.md +++ b/docs/my-website/docs/embedding/supported_embedding.md @@ -52,22 +52,57 @@ print(response) h/t to [Mikko](https://www.linkedin.com/in/mikkolehtimaki/) for this integration + +## Bedrock Embedding + +### API keys +This can be set as env variables or passed as **params to litellm.embedding()** +```python +import os +os.environ["AWS_ACCESS_KEY_ID"] = "" # Access key +os.environ["AWS_SECRET_ACCESS_KEY"] = "" # Secret access key +os.environ["AWS_REGION_NAME"] = "" # us-east-1, us-east-2, us-west-1, us-west-2 +``` + +### Usage +```python +from litellm import embedding +response = embedding( + model="amazon.titan-embed-text-v1", + input=["good morning from litellm"], +) +print(response) +``` + +| Model Name | Function Call | +|----------------------|---------------------------------------------| +| Titan Embeddings - G1 | `embedding(model="amazon.titan-embed-text-v1", input=input)` | + + ## Cohere Embedding Models https://docs.cohere.com/reference/embed ### Usage ```python from litellm import embedding -import os -os.environ['COHERE_API_KEY'] = "" -response = embedding('embed-english-v2.0', input=["good morning from litellm"]) -``` +os.environ["COHERE_API_KEY"] = "cohere key" -| Model Name | Function Call | Required OS Variables | -|-----------------------|--------------------------------------------------------------|-------------------------------------------------| -| embed-english-v2.0 | `embedding('embed-english-v2.0', input=input)` | `os.environ['COHERE_API_KEY']` | -| embed-english-light-v2.0 | `embedding('embed-english-light-v2.0', input=input)` | `os.environ['COHERE_API_KEY']` | -| embed-multilingual-v2.0 | `embedding('embed-multilingual-v2.0', input=input)` | `os.environ['COHERE_API_KEY']` | +# cohere call +response = embedding( + model="embed-english-v3.0", + input=["good morning from litellm", "this is another item"], + input_type="search_document" # optional param for v3 llms +) +``` +| Model Name | Function Call | +|--------------------------|--------------------------------------------------------------| +| embed-english-v3.0 | `embedding(model="embed-english-v3.0", input=["good morning from litellm", "this is another item"])` | +| embed-english-light-v3.0 | `embedding(model="embed-english-light-v3.0", input=["good morning from litellm", "this is another item"])` | +| embed-multilingual-v3.0 | `embedding(model="embed-multilingual-v3.0", input=["good morning from litellm", "this is another item"])` | +| embed-multilingual-light-v3.0 | `embedding(model="embed-multilingual-light-v3.0", input=["good morning from litellm", "this is another item"])` | +| embed-english-v2.0 | `embedding(model="embed-english-v2.0", input=["good morning from litellm", "this is another item"])` | +| embed-english-light-v2.0 | `embedding(model="embed-english-light-v2.0", input=["good 
morning from litellm", "this is another item"])` | +| embed-multilingual-v2.0 | `embedding(model="embed-multilingual-v2.0", input=["good morning from litellm", "this is another item"])` | ## HuggingFace Embedding Models LiteLLM supports all Feature-Extraction Embedding models: https://huggingface.co/models?pipeline_tag=feature-extraction diff --git a/docs/my-website/docs/extras/contributing.md b/docs/my-website/docs/extras/contributing.md index 6f1e2d01a..f470515e3 100644 --- a/docs/my-website/docs/extras/contributing.md +++ b/docs/my-website/docs/extras/contributing.md @@ -1,5 +1,7 @@ # Contributing to Documentation +This website is built using [Docusaurus 2](https://docusaurus.io/), a modern static website generator. + Clone litellm ``` git clone https://github.com/BerriAI/litellm.git @@ -9,16 +11,28 @@ git clone https://github.com/BerriAI/litellm.git #### Installation ``` -pip install mkdocs +npm install --global yarn ``` -#### Locally Serving Docs + +### Local Development + ``` -mkdocs serve +cd docs/my-website ``` -If you see `command not found: mkdocs` try running the following + +Let's Install requirement + ``` -python3 -m mkdocs serve +yarn +``` +Run website + +``` +yarn start +``` +Open docs here: [http://localhost:3000/](http://localhost:3000/) + ``` This command builds your Markdown files into HTML and starts a development server to browse your documentation. Open up [http://127.0.0.1:8000/](http://127.0.0.1:8000/) in your web browser to see your documentation. You can make changes to your Markdown files and your docs will automatically rebuild. diff --git a/docs/my-website/docs/index.md b/docs/my-website/docs/index.md index 7d3a9f093..4c39a46bb 100644 --- a/docs/my-website/docs/index.md +++ b/docs/my-website/docs/index.md @@ -395,9 +395,6 @@ response = completion( ) ``` -Need a dedicated key? Email us @ krrish@berri.ai - - ## More details * [exception mapping](./exception_mapping.md) * [retries + model fallbacks for completion()](./completion/reliable_completions.md) diff --git a/docs/my-website/docs/observability/callbacks.md b/docs/my-website/docs/observability/callbacks.md index af0425975..892be9322 100644 --- a/docs/my-website/docs/observability/callbacks.md +++ b/docs/my-website/docs/observability/callbacks.md @@ -25,7 +25,7 @@ litellm.success_callback=["posthog", "helicone", "llmonitor"] litellm.failure_callback=["sentry", "llmonitor"] ## set env variables -os.environ['SENTRY_API_URL'], os.environ['SENTRY_API_TRACE_RATE']= "" +os.environ['SENTRY_DSN'], os.environ['SENTRY_API_TRACE_RATE']= "" os.environ['POSTHOG_API_KEY'], os.environ['POSTHOG_API_URL'] = "api-key", "api-url" os.environ["HELICONE_API_KEY"] = "" os.environ["TRACELOOP_API_KEY"] = "" diff --git a/docs/my-website/docs/observability/custom_callback.md b/docs/my-website/docs/observability/custom_callback.md index 32ec8e3d3..580bd819a 100644 --- a/docs/my-website/docs/observability/custom_callback.md +++ b/docs/my-website/docs/observability/custom_callback.md @@ -1,4 +1,39 @@ -# Custom Callback Functions for Completion() +# Custom Callbacks + +## Callback Class +You can create a custom callback class to precisely log events as they occur in litellm. 
+ +```python +from litellm.integrations.custom_logger import CustomLogger + +class MyCustomHandler(CustomLogger): + def log_pre_api_call(self, model, messages, kwargs): + print(f"Pre-API Call") + + def log_post_api_call(self, kwargs, response_obj, start_time, end_time): + print(f"Post-API Call") + + def log_stream_event(self, kwargs, response_obj, start_time, end_time): + print(f"On Stream") + + def log_success_event(self, kwargs, response_obj, start_time, end_time): + print(f"On Success") + + def log_failure_event(self, kwargs, response_obj, start_time, end_time): + print(f"On Failure") + +customHandler = MyCustomHandler() + +litellm.callbacks = [customHandler] +response = completion(model="gpt-3.5-turbo", messages=[{ "role": "user", "content": "Hi 👋 - i'm openai"}], + stream=True) +for chunk in response: + continue +``` + +## Callback Functions +If you just want to log on a specific event (e.g. on input) - you can use callback functions. + You can set custom callbacks to trigger for: - `litellm.input_callback` - Track inputs/transformed inputs before making the LLM API call - `litellm.success_callback` - Track inputs/outputs after making LLM API call diff --git a/docs/my-website/docs/observability/sentry.md b/docs/my-website/docs/observability/sentry.md index 732146bbc..255dd55cf 100644 --- a/docs/my-website/docs/observability/sentry.md +++ b/docs/my-website/docs/observability/sentry.md @@ -1,20 +1,36 @@ -# Sentry Tutorial +import Image from '@theme/IdealImage'; + +# Sentry - Log LLM Exceptions [Sentry](https://sentry.io/) provides error monitoring for production. LiteLLM can add breadcrumbs and send exceptions to Sentry with this integration -This works on normal, async and streaming completion calls +Track exceptions for: +- litellm.completion() - completion()for 100+ LLMs +- litellm.acompletion() - async completion() +- Streaming completion() & acompletion() calls -### usage + + +## Usage + +### Set SENTRY_DSN & callback + +```python +import litellm, os +os.environ["SENTRY_DSN"] = "your-sentry-url" +litellm.failure_callback=["sentry"] +``` + +### Sentry callback with completion ```python import litellm from litellm import completion -litellm.set_verbose = True litellm.input_callback=["sentry"] # adds sentry breadcrumbing litellm.failure_callback=["sentry"] # [OPTIONAL] if you want litellm to capture -> send exception to sentry import os -os.environ["SENTRY_API_URL"] = "your-sentry-url" +os.environ["SENTRY_DSN"] = "your-sentry-url" os.environ["OPENAI_API_KEY"] = "your-openai-key" # set bad key to trigger error diff --git a/docs/my-website/docs/providers/bedrock.md b/docs/my-website/docs/providers/bedrock.md index 5a41e7a3a..4f28b9b80 100644 --- a/docs/my-website/docs/providers/bedrock.md +++ b/docs/my-website/docs/providers/bedrock.md @@ -29,11 +29,53 @@ os.environ["AWS_SECRET_ACCESS_KEY"] = "" os.environ["AWS_REGION_NAME"] = "" response = completion( - model="anthropic.claude-instant-v1", - messages=[{ "content": "Hello, how are you?","role": "user"}] + model="anthropic.claude-instant-v1", + messages=[{ "content": "Hello, how are you?","role": "user"}] ) ``` +## Usage - Streaming +```python +import os +from litellm import completion + +os.environ["AWS_ACCESS_KEY_ID"] = "" +os.environ["AWS_SECRET_ACCESS_KEY"] = "" +os.environ["AWS_REGION_NAME"] = "" + +response = completion( + model="anthropic.claude-instant-v1", + messages=[{ "content": "Hello, how are you?","role": "user"}], + stream=True +) +for chunk in response: + print(chunk) +``` + +#### Example Streaming Output Chunk +```json 
+{ + "choices": [ + { + "finish_reason": null, + "index": 0, + "delta": { + "content": "ase can appeal the case to a higher federal court. If a higher federal court rules in a way that conflicts with a ruling from a lower federal court or conflicts with a ruling from a higher state court, the parties involved in the case can appeal the case to the Supreme Court. In order to appeal a case to the Sup" + } + } + ], + "created": null, + "model": "anthropic.claude-instant-v1", + "usage": { + "prompt_tokens": null, + "completion_tokens": null, + "total_tokens": null + } +} +``` + +## Boto3 - Authentication + ### Passing credentials as parameters - Completion() Pass AWS credentials as parameters to litellm.completion ```python @@ -93,8 +135,8 @@ response = completion( ## Supported AWS Bedrock Models Here's an example of using a bedrock model with LiteLLM -| Model Name | Command | Environment Variables | -|--------------------------|------------------------------------------------------------------|---------------------------------------------------------------------| +| Model Name | Command | +|--------------------------|------------------------------------------------------------------| | Anthropic Claude-V2 | `completion(model='anthropic.claude-v2', messages=messages)` | `os.environ['ANTHROPIC_ACCESS_KEY_ID']`, `os.environ['ANTHROPIC_SECRET_ACCESS_KEY']` | | Anthropic Claude-Instant V1 | `completion(model='anthropic.claude-instant-v1', messages=messages)` | `os.environ['ANTHROPIC_ACCESS_KEY_ID']`, `os.environ['ANTHROPIC_SECRET_ACCESS_KEY']` | | Anthropic Claude-V1 | `completion(model='anthropic.claude-v1', messages=messages)` | `os.environ['ANTHROPIC_ACCESS_KEY_ID']`, `os.environ['ANTHROPIC_SECRET_ACCESS_KEY']` | @@ -104,45 +146,29 @@ Here's an example of using a bedrock model with LiteLLM | AI21 J2-Mid | `completion(model='ai21.j2-mid-v1', messages=messages)` | `os.environ['AWS_ACCESS_KEY_ID']`, `os.environ['AWS_SECRET_ACCESS_KEY']`, `os.environ['AWS_REGION_NAME']` | | AI21 J2-Ultra | `completion(model='ai21.j2-ultra-v1', messages=messages)` | `os.environ['AWS_ACCESS_KEY_ID']`, `os.environ['AWS_SECRET_ACCESS_KEY']`, `os.environ['AWS_REGION_NAME']` | +## Bedrock Embedding -## Streaming - +### API keys +This can be set as env variables or passed as **params to litellm.embedding()** ```python -import os -from litellm import completion +import os +os.environ["AWS_ACCESS_KEY_ID"] = "" # Access key +os.environ["AWS_SECRET_ACCESS_KEY"] = "" # Secret access key +os.environ["AWS_REGION_NAME"] = "" # us-east-1, us-east-2, us-west-1, us-west-2 +``` -os.environ["AWS_ACCESS_KEY_ID"] = "" -os.environ["AWS_SECRET_ACCESS_KEY"] = "" -os.environ["AWS_REGION_NAME"] = "" - -response = completion( - model="bedrock/anthropic.claude-instant-v1", - messages=[{ "content": "Hello, how are you?","role": "user"}], - stream=True +### Usage +```python +from litellm import embedding +response = embedding( + model="amazon.titan-embed-text-v1", + input=["good morning from litellm"], ) - -for chunk in response: - print(chunk) +print(response) ``` -### Example Streaming Output Chunk -```json -{ - "choices": [ - { - "finish_reason": null, - "index": 0, - "delta": { - "content": "ase can appeal the case to a higher federal court. If a higher federal court rules in a way that conflicts with a ruling from a lower federal court or conflicts with a ruling from a higher state court, the parties involved in the case can appeal the case to the Supreme Court. 
In order to appeal a case to the Sup" - } - } - ], - "created": null, - "model": "amazon.titan-tg1-large", - "usage": { - "prompt_tokens": null, - "completion_tokens": null, - "total_tokens": null - } -} -``` +## Supported AWS Bedrock Embedding Models + +| Model Name | Function Call | +|----------------------|---------------------------------------------| +| Titan Embeddings - G1 | `embedding(model="amazon.titan-embed-text-v1", input=input)` | diff --git a/docs/my-website/docs/providers/cohere.md b/docs/my-website/docs/providers/cohere.md index d31964657..980143770 100644 --- a/docs/my-website/docs/providers/cohere.md +++ b/docs/my-website/docs/providers/cohere.md @@ -1,27 +1,90 @@ # Cohere -LiteLLM supports 'command', 'command-light', 'command-medium', 'command-medium-beta', 'command-xlarge-beta', 'command-nightly' models from [Cohere](https://cohere.com/). - -Like AI21, these models are available without a waitlist. - -### API KEYS +## API KEYS ```python import os os.environ["COHERE_API_KEY"] = "" ``` -### Example Usage +## Usage ```python - from litellm import completion ## set ENV variables os.environ["COHERE_API_KEY"] = "cohere key" -messages = [{ "content": "Hello, how are you?","role": "user"}] +# cohere call +response = completion( + model="command-nightly", + messages = [{ "content": "Hello, how are you?","role": "user"}] +) +``` + +## Usage - Streaming + +```python +from litellm import completion + +## set ENV variables +os.environ["COHERE_API_KEY"] = "cohere key" # cohere call -response = completion("command-nightly", messages) -``` \ No newline at end of file +response = completion( + model="command-nightly", + messages = [{ "content": "Hello, how are you?","role": "user"}], + stream=True +) + +for chunk in response: + print(chunk) +``` + +LiteLLM supports 'command', 'command-light', 'command-medium', 'command-medium-beta', 'command-xlarge-beta', 'command-nightly' models from [Cohere](https://cohere.com/). 
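Async calls work the same way through `acompletion`. A minimal sketch, assuming `COHERE_API_KEY` is already set in the environment as above:

```python
import asyncio
from litellm import acompletion

async def cohere_call():
    # async cohere call - same OpenAI-format arguments as completion()
    response = await acompletion(
        model="command-nightly",
        messages=[{"content": "Hello, how are you?", "role": "user"}],
    )
    print(response)

asyncio.run(cohere_call())
```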
+ +## Embedding + +```python +from litellm import embedding +os.environ["COHERE_API_KEY"] = "cohere key" + +# cohere call +response = embedding( + model="embed-english-v3.0", + input=["good morning from litellm", "this is another item"], +) +``` + +### Setting - Input Type for v3 models +v3 Models have a required parameter: `input_type`, it can be one of the following four values: + +- `input_type="search_document"`: (default) Use this for texts (documents) you want to store in your vector database +- `input_type="search_query"`: Use this for search queries to find the most relevant documents in your vector database +- `input_type="classification"`: Use this if you use the embeddings as an input for a classification system +- `input_type="clustering"`: Use this if you use the embeddings for text clustering + +https://txt.cohere.com/introducing-embed-v3/ +```python +from litellm import embedding +os.environ["COHERE_API_KEY"] = "cohere key" + +# cohere call +response = embedding( + model="embed-english-v3.0", + input=["good morning from litellm", "this is another item"], + input_type="search_document" +) +``` + +### Supported Embedding Models +| Model Name | Function Call | +|--------------------------|--------------------------------------------------------------| +| embed-english-v3.0 | `embedding(model="embed-english-v3.0", input=["good morning from litellm", "this is another item"])` | +| embed-english-light-v3.0 | `embedding(model="embed-english-light-v3.0", input=["good morning from litellm", "this is another item"])` | +| embed-multilingual-v3.0 | `embedding(model="embed-multilingual-v3.0", input=["good morning from litellm", "this is another item"])` | +| embed-multilingual-light-v3.0 | `embedding(model="embed-multilingual-light-v3.0", input=["good morning from litellm", "this is another item"])` | +| embed-english-v2.0 | `embedding(model="embed-english-v2.0", input=["good morning from litellm", "this is another item"])` | +| embed-english-light-v2.0 | `embedding(model="embed-english-light-v2.0", input=["good morning from litellm", "this is another item"])` | +| embed-multilingual-v2.0 | `embedding(model="embed-multilingual-v2.0", input=["good morning from litellm", "this is another item"])` | + diff --git a/docs/my-website/docs/providers/huggingface.md b/docs/my-website/docs/providers/huggingface.md index fabfcfcbb..f8ebadfcf 100644 --- a/docs/my-website/docs/providers/huggingface.md +++ b/docs/my-website/docs/providers/huggingface.md @@ -128,7 +128,9 @@ response = embedding( ) ``` -### [OPTIONAL] API KEYS + API BASE +## Advanced + +### Setting API KEYS + API BASE If required, you can set the api key + api base, set it in your os environment. 
[Code for how it's sent](https://github.com/BerriAI/litellm/blob/0100ab2382a0e720c7978fbf662cc6e6920e7e03/litellm/llms/huggingface_restapi.py#L25) ```python @@ -137,6 +139,72 @@ os.environ["HUGGINGFACE_API_KEY"] = "" os.environ["HUGGINGFACE_API_BASE"] = "" ``` +### Viewing Log probs + +#### Using `decoder_input_details` - OpenAI `echo` +The `echo` param is supported by OpenAI Completions - Use `litellm.text_completion()` for this +```python +from litellm import text_completion +response = text_completion( + model="huggingface/bigcode/starcoder", + prompt="good morning", + max_tokens=10, logprobs=10, + echo=True +) + ``` + +#### Output + ```json +{ + "id":"chatcmpl-3fc71792-c442-4ba1-a611-19dd0ac371ad", + "object":"text_completion", + "created":1698801125.936519, + "model":"bigcode/starcoder", + "choices":[ + { + "text":", I'm going to make you a sand", + "index":0, + "logprobs":{ + "tokens":[ + "good", + " morning", + ",", + " I", + "'m", + " going", + " to", + " make", + " you", + " a", + " s", + "and" + ], + "token_logprobs":[ + "None", + -14.96875, + -2.2285156, + -2.734375, + -2.0957031, + -2.0917969, + -0.09429932, + -3.1132812, + -1.3203125, + -1.2304688, + -1.6201172, + -0.010292053 + ] + }, + "finish_reason":"length" + } + ], + "usage":{ + "completion_tokens":9, + "prompt_tokens":2, + "total_tokens":11 + } +} +``` + ### Models with Prompt Formatting For models with special prompt templates (e.g. Llama2), we format the prompt to fit their template. @@ -198,7 +266,7 @@ test_huggingface_custom_model() [Implementation Code](https://github.com/BerriAI/litellm/blob/c0b3da2c14c791a0b755f0b1e5a9ef065951ecbf/litellm/llms/huggingface_restapi.py#L52) -## deploying a model on huggingface +### Deploying a model on huggingface You can use any chat/text model from Hugging Face with the following steps: * Copy your model id/url from Huggingface Inference Endpoints diff --git a/docs/my-website/docs/providers/ollama.md b/docs/my-website/docs/providers/ollama.md index 6bccfedd4..88927aba7 100644 --- a/docs/my-website/docs/providers/ollama.md +++ b/docs/my-website/docs/providers/ollama.md @@ -89,8 +89,8 @@ For Ollama LiteLLM Provides a Docker Image for an OpenAI API compatible server f ### Quick Start: Docker Hub: -https://hub.docker.com/repository/docker/litellm/ollama/general - +For ARM Processors: https://hub.docker.com/repository/docker/litellm/ollama/general +For Intel/AMD Processors: to be added ```shell docker pull litellm/ollama ``` diff --git a/docs/my-website/docs/providers/openai.md b/docs/my-website/docs/providers/openai.md index 6f29b1136..05f5e2b60 100644 --- a/docs/my-website/docs/providers/openai.md +++ b/docs/my-website/docs/providers/openai.md @@ -1,17 +1,12 @@ # OpenAI LiteLLM supports OpenAI Chat + Text completion and embedding calls. 
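Embedding calls follow the same pattern as completion. A minimal sketch using OpenAI's `text-embedding-ada-002`, assuming `OPENAI_API_KEY` is set:

```python
import os
from litellm import embedding

os.environ["OPENAI_API_KEY"] = "your-api-key"

# openai embedding call - returns an OpenAI-style embedding response
response = embedding(
    model="text-embedding-ada-002",
    input=["good morning from litellm"],
)
print(response)
```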
-### API Keys +### Required API Keys ```python import os - os.environ["OPENAI_API_KEY"] = "your-api-key" ``` -**Need a dedicated key?** -Email us @ krrish@berri.ai - -[**See all supported models by the litellm api key**](../proxy_api.md#supported-models-for-litellm-key) ### Usage ```python @@ -20,44 +15,70 @@ from litellm import completion os.environ["OPENAI_API_KEY"] = "your-api-key" - -messages = [{ "content": "Hello, how are you?","role": "user"}] - # openai call -response = completion("gpt-3.5-turbo", messages) +response = completion( + model = "gpt-3.5-turbo", + messages=[{ "content": "Hello, how are you?","role": "user"}] +) +``` + +### Optional Keys - OpenAI Organization, OpenAI API Base + +```python +import os +os.environ["OPENAI_ORGANIZATION"] = "your-org-id" # OPTIONAL +os.environ["OPENAI_API_BASE"] = "openaiai-api-base" # OPTIONAL ``` ### OpenAI Chat Completion Models -| Model Name | Function Call | Required OS Variables | -|------------------|----------------------------------------|--------------------------------------| -| gpt-3.5-turbo | `completion('gpt-3.5-turbo', messages)` | `os.environ['OPENAI_API_KEY']` | -| gpt-3.5-turbo-0301 | `completion('gpt-3.5-turbo-0301', messages)` | `os.environ['OPENAI_API_KEY']` | -| gpt-3.5-turbo-0613 | `completion('gpt-3.5-turbo-0613', messages)` | `os.environ['OPENAI_API_KEY']` | -| gpt-3.5-turbo-16k | `completion('gpt-3.5-turbo-16k', messages)` | `os.environ['OPENAI_API_KEY']` | -| gpt-3.5-turbo-16k-0613 | `completion('gpt-3.5-turbo-16k-0613', messages)` | `os.environ['OPENAI_API_KEY']` | -| gpt-4 | `completion('gpt-4', messages)` | `os.environ['OPENAI_API_KEY']` | -| gpt-4-0314 | `completion('gpt-4-0314', messages)` | `os.environ['OPENAI_API_KEY']` | -| gpt-4-0613 | `completion('gpt-4-0613', messages)` | `os.environ['OPENAI_API_KEY']` | -| gpt-4-32k | `completion('gpt-4-32k', messages)` | `os.environ['OPENAI_API_KEY']` | -| gpt-4-32k-0314 | `completion('gpt-4-32k-0314', messages)` | `os.environ['OPENAI_API_KEY']` | -| gpt-4-32k-0613 | `completion('gpt-4-32k-0613', messages)` | `os.environ['OPENAI_API_KEY']` | +| Model Name | Function Call | +|-----------------------|-----------------------------------------------------------------| +| gpt-3.5-turbo | `response = completion(model="gpt-3.5-turbo", messages=messages)` | +| gpt-3.5-turbo-0301 | `response = completion(model="gpt-3.5-turbo-0301", messages=messages)` | +| gpt-3.5-turbo-0613 | `response = completion(model="gpt-3.5-turbo-0613", messages=messages)` | +| gpt-3.5-turbo-16k | `response = completion(model="gpt-3.5-turbo-16k", messages=messages)` | +| gpt-3.5-turbo-16k-0613| `response = completion(model="gpt-3.5-turbo-16k-0613", messages=messages)` | +| gpt-4 | `response = completion(model="gpt-4", messages=messages)` | +| gpt-4-0314 | `response = completion(model="gpt-4-0314", messages=messages)` | +| gpt-4-0613 | `response = completion(model="gpt-4-0613", messages=messages)` | +| gpt-4-32k | `response = completion(model="gpt-4-32k", messages=messages)` | +| gpt-4-32k-0314 | `response = completion(model="gpt-4-32k-0314", messages=messages)` | +| gpt-4-32k-0613 | `response = completion(model="gpt-4-32k-0613", messages=messages)` | + These also support the `OPENAI_API_BASE` environment variable, which can be used to specify a custom API endpoint. 
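For example, to route these calls through an OpenAI-compatible endpoint (the URL below is a placeholder), either set the environment variable or pass `api_base` per call:

```python
import os
from litellm import completion

os.environ["OPENAI_API_KEY"] = "your-api-key"
os.environ["OPENAI_API_BASE"] = "https://my-openai-proxy.example.com/v1"  # placeholder URL

messages = [{"content": "Hello, how are you?", "role": "user"}]

# uses OPENAI_API_BASE from the environment
response = completion(model="gpt-3.5-turbo", messages=messages)

# or override the endpoint per call with the api_base param
response = completion(
    model="gpt-3.5-turbo",
    messages=messages,
    api_base="https://my-openai-proxy.example.com/v1",
)
print(response)
```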
### OpenAI Text Completion Models / Instruct Models -| Model Name | Function Call | Required OS Variables | -|------------------|--------------------------------------------|--------------------------------------| -| gpt-3.5-turbo-instruct | `completion('gpt-3.5-turbo-instruct', messages)` | `os.environ['OPENAI_API_KEY'` | -| text-davinci-003 | `completion('text-davinci-003', messages)` | `os.environ['OPENAI_API_KEY']` | -| ada-001 | `completion('ada-001', messages)` | `os.environ['OPENAI_API_KEY']` | -| curie-001 | `completion('curie-001', messages)` | `os.environ['OPENAI_API_KEY']` | -| babbage-001 | `completion('babbage-001', messages)` | `os.environ['OPENAI_API_KEY']` | -| babbage-002 | `completion('ada-001', messages)` | `os.environ['OPENAI_API_KEY']` | -| davinci-002 | `completion('davinci-002', messages)` | `os.environ['OPENAI_API_KEY']` | +| Model Name | Function Call | +|---------------------|----------------------------------------------------| +| gpt-3.5-turbo-instruct | `response = completion(model="gpt-3.5-turbo-instruct", messages=messages)` | +| text-davinci-003 | `response = completion(model="text-davinci-003", messages=messages)` | +| ada-001 | `response = completion(model="ada-001", messages=messages)` | +| curie-001 | `response = completion(model="curie-001", messages=messages)` | +| babbage-001 | `response = completion(model="babbage-001", messages=messages)` | +| babbage-002 | `response = completion(model="babbage-002", messages=messages)` | +| davinci-002 | `response = completion(model="davinci-002", messages=messages)` | +### Setting Organization-ID for completion calls +This can be set in one of the following ways: +- Environment Variable `OPENAI_ORGANIZATION` +- Params to `litellm.completion(model=model, organization="your-organization-id")` +- Set as `litellm.organization="your-organization-id"` +```python +import os +from litellm import completion + +os.environ["OPENAI_API_KEY"] = "your-api-key" +os.environ["OPENAI_ORGANIZATION"] = "your-org-id" # OPTIONAL + +response = completion( + model = "gpt-3.5-turbo", + messages=[{ "content": "Hello, how are you?","role": "user"}] +) +``` ### Using Helicone Proxy with LiteLLM ```python import os diff --git a/docs/my-website/docs/proxy_server.md b/docs/my-website/docs/proxy_server.md index f9aec2f29..0737e761f 100644 --- a/docs/my-website/docs/proxy_server.md +++ b/docs/my-website/docs/proxy_server.md @@ -6,8 +6,10 @@ import TabItem from '@theme/TabItem'; A fast, and lightweight OpenAI-compatible server to call 100+ LLM APIs. :::info -This is deprecated. Support for the CLI tool will be removed in our next MAJOR release - https://github.com/BerriAI/litellm/discussions/648. -::: + +Docs outdated. New docs 👉 [here](./simple_proxy.md) + +::: ## Usage ```shell diff --git a/docs/my-website/docs/routing.md b/docs/my-website/docs/routing.md index e11f0485f..391b20b2f 100644 --- a/docs/my-website/docs/routing.md +++ b/docs/my-website/docs/routing.md @@ -1,4 +1,79 @@ -# Manage Multiple Deployments +# Reliability - Fallbacks, Azure Deployments, etc. + +# Reliability + +LiteLLM helps prevent failed requests in 3 ways: +- Retries +- Fallbacks: Context Window + General +- RateLimitManager + +## Helper utils +LiteLLM supports the following functions for reliability: +* `litellm.longer_context_model_fallback_dict`: Dictionary which has a mapping for those models which have larger equivalents +* `num_retries`: use tenacity retries +* `completion()` with fallbacks: switch between models/keys/api bases in case of errors. 
+* `router()`: An abstraction on top of completion + embeddings to route the request to a deployment with capacity (available tpm/rpm). + +## Retry failed requests + +Call it in completion like this `completion(..num_retries=2)`. + + +Here's a quick look at how you can use it: + +```python +from litellm import completion + +user_message = "Hello, whats the weather in San Francisco??" +messages = [{"content": user_message, "role": "user"}] + +# normal call +response = completion( + model="gpt-3.5-turbo", + messages=messages, + num_retries=2 + ) +``` + +## Fallbacks + +### Context Window Fallbacks +```python +from litellm import completion + +fallback_dict = {"gpt-3.5-turbo": "gpt-3.5-turbo-16k"} +messages = [{"content": "how does a court case get to the Supreme Court?" * 500, "role": "user"}] + +completion(model="gpt-3.5-turbo", messages=messages, context_window_fallback_dict=ctx_window_fallback_dict) +``` + +### Fallbacks - Switch Models/API Keys/API Bases + +LLM APIs can be unstable, completion() with fallbacks ensures you'll always get a response from your calls + +#### Usage +To use fallback models with `completion()`, specify a list of models in the `fallbacks` parameter. + +The `fallbacks` list should include the primary model you want to use, followed by additional models that can be used as backups in case the primary model fails to provide a response. + +#### switch models +```python +response = completion(model="bad-model", messages=messages, + fallbacks=["gpt-3.5-turbo" "command-nightly"]) +``` + +#### switch api keys/bases (E.g. azure deployment) +Switch between different keys for the same azure deployment, or use another deployment as well. + +```python +api_key="bad-key" +response = completion(model="azure/gpt-4", messages=messages, api_key=api_key, + fallbacks=[{"api_key": "good-key-1"}, {"api_key": "good-key-2", "api_base": "good-api-base-2"}]) +``` + +[Check out this section for implementation details](#fallbacks-1) + +## Manage Multiple Deployments Use this if you're trying to load-balance across multiple deployments (e.g. Azure/OpenAI). @@ -6,11 +81,7 @@ Use this if you're trying to load-balance across multiple deployments (e.g. Azur In production, [Router connects to a Redis Cache](#redis-queue) to track usage across multiple deployments. -## Quick Start - -```python -pip install litellm -``` +### Quick Start ```python from litellm import Router @@ -54,7 +125,7 @@ response = router.completion(model="gpt-3.5-turbo", print(response) ``` -## Redis Queue +### Redis Queue In production, we use Redis to track usage across multiple Azure deployments. @@ -67,7 +138,7 @@ router = Router(model_list=model_list, print(response) ``` -## Deploy Router +### Deploy Router 1. Clone repo ```shell @@ -99,4 +170,131 @@ curl 'http://0.0.0.0:8000/router/completions' \ "model": "gpt-3.5-turbo", "messages": [{"role": "user", "content": "Hey"}] }' +``` + + +## Implementation Details + +### Fallbacks +#### Output from calls +``` +Completion with 'bad-model': got exception Unable to map your input to a model. Check your input - {'model': 'bad-model' + + + +completion call gpt-3.5-turbo +{ + "id": "chatcmpl-7qTmVRuO3m3gIBg4aTmAumV1TmQhB", + "object": "chat.completion", + "created": 1692741891, + "model": "gpt-3.5-turbo-0613", + "choices": [ + { + "index": 0, + "message": { + "role": "assistant", + "content": "I apologize, but as an AI, I do not have the capability to provide real-time weather updates. 
However, you can easily check the current weather in San Francisco by using a search engine or checking a weather website or app." + }, + "finish_reason": "stop" + } + ], + "usage": { + "prompt_tokens": 16, + "completion_tokens": 46, + "total_tokens": 62 + } +} + +``` + +#### How does fallbacks work + +When you pass `fallbacks` to `completion`, it makes the first `completion` call using the primary model specified as `model` in `completion(model=model)`. If the primary model fails or encounters an error, it automatically tries the `fallbacks` models in the specified order. This ensures a response even if the primary model is unavailable. + + +#### Key components of Model Fallbacks implementation: +* Looping through `fallbacks` +* Cool-Downs for rate-limited models + +#### Looping through `fallbacks` +Allow `45seconds` for each request. In the 45s this function tries calling the primary model set as `model`. If model fails it loops through the backup `fallbacks` models and attempts to get a response in the allocated `45s` time set here: +```python +while response == None and time.time() - start_time < 45: + for model in fallbacks: +``` + +#### Cool-Downs for rate-limited models +If a model API call leads to an error - allow it to cooldown for `60s` +```python +except Exception as e: + print(f"got exception {e} for model {model}") + rate_limited_models.add(model) + model_expiration_times[model] = ( + time.time() + 60 + ) # cool down this selected model + pass +``` + +Before making an LLM API call we check if the selected model is in `rate_limited_models`, if so skip making the API call +```python +if ( + model in rate_limited_models +): # check if model is currently cooling down + if ( + model_expiration_times.get(model) + and time.time() >= model_expiration_times[model] + ): + rate_limited_models.remove( + model + ) # check if it's been 60s of cool down and remove model + else: + continue # skip model + +``` + +#### Full code of completion with fallbacks() +```python + + response = None + rate_limited_models = set() + model_expiration_times = {} + start_time = time.time() + fallbacks = [kwargs["model"]] + kwargs["fallbacks"] + del kwargs["fallbacks"] # remove fallbacks so it's not recursive + + while response == None and time.time() - start_time < 45: + for model in fallbacks: + # loop thru all models + try: + if ( + model in rate_limited_models + ): # check if model is currently cooling down + if ( + model_expiration_times.get(model) + and time.time() >= model_expiration_times[model] + ): + rate_limited_models.remove( + model + ) # check if it's been 60s of cool down and remove model + else: + continue # skip model + + # delete model from kwargs if it exists + if kwargs.get("model"): + del kwargs["model"] + + print("making completion call", model) + response = litellm.completion(**kwargs, model=model) + + if response != None: + return response + + except Exception as e: + print(f"got exception {e} for model {model}") + rate_limited_models.add(model) + model_expiration_times[model] = ( + time.time() + 60 + ) # cool down this selected model + pass + return response ``` \ No newline at end of file diff --git a/docs/my-website/docs/set_keys.md b/docs/my-website/docs/set_keys.md index e8524da22..4c8cc42fe 100644 --- a/docs/my-website/docs/set_keys.md +++ b/docs/my-website/docs/set_keys.md @@ -78,6 +78,14 @@ litellm.api_base = "https://hosted-llm-api.co" response = litellm.completion(messages=messages, model="gpt-3.5-turbo") ``` +### litellm.api_version + +```python +import litellm 
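+# api_version is mainly used for Azure OpenAI deployments, which require an api-version on each request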
+litellm.api_version = "2023-05-15" +response = litellm.completion(messages=messages, model="gpt-3.5-turbo") +``` + ### litellm.organization ```python import litellm @@ -124,7 +132,7 @@ response = completion("command-nightly", messages, api_version="2023-02-15") Check if a user submitted a valid key for the model they're trying to call. -``` +```python key = "bad-key" response = check_valid_key(model="gpt-3.5-turbo", api_key=key) assert(response == False) @@ -134,7 +142,7 @@ assert(response == False) This helper reads the .env and returns a list of supported llms for user -``` +```python old_environ = os.environ os.environ = {'OPENAI_API_KEY': 'temp'} # mock set only openai key in environ diff --git a/docs/my-website/docs/simple_proxy.md b/docs/my-website/docs/simple_proxy.md index becf87e98..e4577e4ad 100644 --- a/docs/my-website/docs/simple_proxy.md +++ b/docs/my-website/docs/simple_proxy.md @@ -2,23 +2,337 @@ import Image from '@theme/IdealImage'; import Tabs from '@theme/Tabs'; import TabItem from '@theme/TabItem'; -# 💥 LiteLLM Server - Deploy LiteLLM +# 💥 Evaluate LLMs - OpenAI Proxy Server -A simple, fast, and lightweight **OpenAI-compatible server** to call 100+ LLM APIs in the OpenAI Input/Output format +A simple, fast, and lightweight **OpenAI-compatible server** to call 100+ LLM APIs. -## Endpoints: -- `/chat/completions` - chat completions endpoint to call 100+ LLMs -- `/models` - available models on server +LiteLLM Server supports: -[![Deploy](https://deploy.cloud.run/button.svg)](https://l.linklyhq.com/l/1uHtX) -[![Deploy](https://render.com/images/deploy-to-render-button.svg)](https://l.linklyhq.com/l/1uHsr) -[![Deploy](../img/deploy-to-aws.png)](https://docs.litellm.ai/docs/simple_proxy#deploy-on-aws-apprunner) +* Call [Huggingface/Bedrock/TogetherAI/etc.](#other-supported-models) in the OpenAI ChatCompletions format +* Set custom prompt templates + model-specific configs (temperature, max_tokens, etc.) +* Caching (In-memory + Redis) + +[**See Code**](https://github.com/BerriAI/litellm/tree/main/litellm_server) :::info We want to learn how we can make the server better! Meet the [founders](https://calendly.com/d/4mp-gd3-k5k/berriai-1-1-onboarding-litellm-hosted-version) or join our [discord](https://discord.gg/wuPM9dRgDw) ::: +## Quick Start + +```shell +$ litellm --model huggingface/bigcode/starcoder +``` +OpenAI Proxy running on http://0.0.0.0:8000 + +```shell +curl http://0.0.0.0:8000/v1/chat/completions \ + -H "Content-Type: application/json" \ + -d '{ + "model": "gpt-3.5-turbo", + "messages": [{"role": "user", "content": "Say this is a test!"}], + "temperature": 0.7 + }' +``` + +This will now automatically route any requests for gpt-3.5-turbo to bigcode starcoder, hosted on huggingface inference endpoints. + + +#### Other supported models: + + + +```shell +$ export AWS_ACCESS_KEY_ID="" +$ export AWS_REGION_NAME="" # e.g. us-west-2 +$ export AWS_SECRET_ACCESS_KEY="" +$ litellm --model bedrock/anthropic.claude-v2 +``` + + +Assuming you're running vllm locally + +```shell +$ litellm --model vllm/facebook/opt-125m +``` + + + +```shell +$ litellm --model openai/ --api_base +``` + + + +```shell +$ export HUGGINGFACE_API_KEY=my-api-key #[OPTIONAL] +$ litellm --model huggingface/ --api_base https://# e.g. 
huggingface/mistralai/Mistral-7B-v0.1 +``` + + + + +```shell +$ export ANTHROPIC_API_KEY=my-api-key +$ litellm --model claude-instant-1 +``` + + + + + +```shell +$ export TOGETHERAI_API_KEY=my-api-key +$ litellm --model together_ai/lmsys/vicuna-13b-v1.5-16k +``` + + + + + +```shell +$ export REPLICATE_API_KEY=my-api-key +$ litellm \ + --model replicate/meta/llama-2-70b-chat:02e509c789964a7ea8736978a43525956ef40397be9033abf9fd2badfe68c9e3 +``` + + + + + +```shell +$ litellm --model petals/meta-llama/Llama-2-70b-chat-hf +``` + + + + + +```shell +$ export PALM_API_KEY=my-palm-key +$ litellm --model palm/chat-bison +``` + + + + + +```shell +$ export AZURE_API_KEY=my-api-key +$ export AZURE_API_BASE=my-api-base + +$ litellm --model azure/my-deployment-name +``` + + + + + +```shell +$ export AI21_API_KEY=my-api-key +$ litellm --model j2-light +``` + + + + + +```shell +$ export COHERE_API_KEY=my-api-key +$ litellm --model command-nightly +``` + + + + + +[**Jump to Code**](https://github.com/BerriAI/litellm/blob/fef4146396d5d87006259e00095a62e3900d6bb4/litellm/proxy.py#L36) + +# [TUTORIAL] LM-Evaluation Harness with TGI + +Evaluate LLMs 20x faster with TGI via litellm proxy's `/completions` endpoint. + +This tutorial assumes you're using [lm-evaluation-harness](https://github.com/EleutherAI/lm-evaluation-harness) + +**Step 1: Start the local proxy** +```shell +$ litellm --model huggingface/bigcode/starcoder +``` + +OpenAI Compatible Endpoint at http://0.0.0.0:8000 + +**Step 2: Set OpenAI API Base** +```shell +$ export OPENAI_API_BASE="http://0.0.0.0:8000" +``` + +**Step 3: Run LM-Eval-Harness** + +```shell +$ python3 main.py \ + --model gpt3 \ + --model_args engine=huggingface/bigcode/starcoder \ + --tasks hellaswag +``` + + +## Endpoints: +- `/chat/completions` - chat completions endpoint to call 100+ LLMs +- `/embeddings` - embedding endpoint for Azure, OpenAI, Huggingface endpoints +- `/models` - available models on server + +## Set Custom Prompt Templates + +LiteLLM by default checks if a model has a [prompt template and applies it](./completion/prompt_formatting.md) (e.g. if a huggingface model has a saved chat template in it's tokenizer_config.json). However, you can also set a custom prompt template on your proxy in the `config.yaml`: + +**Step 1**: Save your prompt template in a `config.yaml` +```yaml +# Model-specific parameters +model_list: + - model_name: mistral-7b # model alias + litellm_params: # actual params for litellm.completion() + model: "huggingface/mistralai/Mistral-7B-Instruct-v0.1" + api_base: "" + api_key: "" # [OPTIONAL] for hf inference endpoints + initial_prompt_value: "\n" + roles: {"system":{"pre_message":"<|im_start|>system\n", "post_message":"<|im_end|>"}, "assistant":{"pre_message":"<|im_start|>assistant\n","post_message":"<|im_end|>"}, "user":{"pre_message":"<|im_start|>user\n","post_message":"<|im_end|>"}} + final_prompt_value: "\n" + bos_token: "" + eos_token: "" + max_tokens: 4096 +``` + +**Step 2**: Start server with config + +```shell +$ litellm --config /path/to/config.yaml +``` + +## Multiple Models + +If you have 1 model running on a local GPU and another that's hosted (e.g. on Runpod), you can call both via the same litellm server by listing them in your `config.yaml`. 
+ +```yaml +model_list: + - model_name: zephyr-alpha + litellm_params: # params for litellm.completion() - https://docs.litellm.ai/docs/completion/input#input---request-body + model: huggingface/HuggingFaceH4/zephyr-7b-alpha + api_base: http://0.0.0.0:8001 + - model_name: zephyr-beta + litellm_params: + model: huggingface/HuggingFaceH4/zephyr-7b-beta + api_base: https:// +``` + +```shell +$ litellm --config /path/to/config.yaml +``` + +### Evaluate model + +If you're repo let's you set model name, you can call the specific model by just passing in that model's name - + +```python +import openai +openai.api_base = "http://0.0.0.0:8000" + +completion = openai.ChatCompletion.create(model="zephyr-alpha", messages=[{"role": "user", "content": "Hello world"}]) +print(completion.choices[0].message.content) +``` + +If you're repo only let's you specify api base, then you can add the model name to the api base passed in - + +```python +import openai +openai.api_base = "http://0.0.0.0:8000/openai/deployments/zephyr-alpha/chat/completions" # zephyr-alpha will be used + +completion = openai.ChatCompletion.create(model="gpt-3.5-turbo", messages=[{"role": "user", "content": "Hello world"}]) +print(completion.choices[0].message.content) +``` + +## Save Model-specific params (API Base, API Keys, Temperature, etc.) +Use the [router_config_template.yaml](https://github.com/BerriAI/litellm/blob/main/router_config_template.yaml) to save model-specific information like api_base, api_key, temperature, max_tokens, etc. + +**Step 1**: Create a `config.yaml` file +```shell +model_list: + - model_name: gpt-3.5-turbo + litellm_params: # params for litellm.completion() - https://docs.litellm.ai/docs/completion/input#input---request-body + model: azure/chatgpt-v-2 # azure/ + api_key: your_azure_api_key + api_version: your_azure_api_version + api_base: your_azure_api_base + - model_name: mistral-7b + litellm_params: + model: ollama/mistral + api_base: your_ollama_api_base +``` + +**Step 2**: Start server with config + +```shell +$ litellm --config /path/to/config.yaml +``` +## Model Alias + +Set a model alias for your deployments. + +In the `config.yaml` the model_name parameter is the user-facing name to use for your deployment. + +E.g.: If we want to save a Huggingface TGI Mistral-7b deployment, as 'mistral-7b' for our users, we might save it as: + +```yaml +model_list: + - model_name: mistral-7b # ALIAS + litellm_params: + model: huggingface/mistralai/Mistral-7B-Instruct-v0.1 # ACTUAL NAME + api_key: your_huggingface_api_key # [OPTIONAL] if deployed on huggingface inference endpoints + api_base: your_api_base # url where model is deployed +``` + +## Caching + +Add Redis Caching to your server via environment variables + +```env +### REDIS +REDIS_HOST = "" +REDIS_PORT = "" +REDIS_PASSWORD = "" +``` + +Docker command: + +```shell +docker run -e REDIST_HOST= -e REDIS_PORT= -e REDIS_PASSWORD= -e PORT=8000 -p 8000:8000 ghcr.io/berriai/litellm:latest +``` + +## Logging + +1. Debug Logs +Print the input/output params by setting `SET_VERBOSE = "True"`. + +Docker command: + +```shell +docker run -e SET_VERBOSE="True" -e PORT=8000 -p 8000:8000 ghcr.io/berriai/litellm:latest +``` +2. 
Add Langfuse Logging to your server via environment variables + +```env +### LANGFUSE +LANGFUSE_PUBLIC_KEY = "" +LANGFUSE_SECRET_KEY = "" +# Optional, defaults to https://cloud.langfuse.com +LANGFUSE_HOST = "" # optional +``` + +Docker command: + +```shell +docker run -e LANGFUSE_PUBLIC_KEY= -e LANGFUSE_SECRET_KEY= -e LANGFUSE_HOST= -e PORT=8000 -p 8000:8000 ghcr.io/berriai/litellm:latest +``` ## Local Usage @@ -33,53 +347,6 @@ $ cd ./litellm/litellm_server $ uvicorn main:app --host 0.0.0.0 --port 8000 ``` -### Test Request -Ensure your API keys are set in the Environment for these requests - - - - -```shell -curl http://0.0.0.0:8000/v1/chat/completions \ - -H "Content-Type: application/json" \ - -d '{ - "model": "gpt-3.5-turbo", - "messages": [{"role": "user", "content": "Say this is a test!"}], - "temperature": 0.7 - }' -``` - - - - -```shell -curl http://0.0.0.0:8000/v1/chat/completions \ - -H "Content-Type: application/json" \ - -d '{ - "model": "azure/", - "messages": [{"role": "user", "content": "Say this is a test!"}], - "temperature": 0.7 - }' -``` - - - - - -```shell -curl http://0.0.0.0:8000/v1/chat/completions \ - -H "Content-Type: application/json" \ - -d '{ - "model": "claude-2", - "messages": [{"role": "user", "content": "Say this is a test!"}], - "temperature": 0.7, - }' -``` - - - - - ## Setting LLM API keys This server allows two ways of passing API keys to litellm - Environment Variables - This server by default assumes the LLM API Keys are stored in the environment variables @@ -87,7 +354,11 @@ This server allows two ways of passing API keys to litellm - Set `AUTH_STRATEGY=DYNAMIC` in the Environment - Pass required auth params `api_key`,`api_base`, `api_version` with the request params -## Deploy on Google Cloud Run + + + + +#### Deploy on Google Cloud Run **Click the button** to deploy to Google Cloud Run [![Deploy](https://deploy.cloud.run/button.svg)](https://l.linklyhq.com/l/1uHtX) @@ -159,8 +430,10 @@ More info [here](https://cloud.google.com/run/docs/configuring/services/environm Example `OPENAI_API_KEY`, `ANTHROPIC_API_KEY` + + -## Deploy on Render +#### Deploy on Render **Click the button** to deploy to Render [![Deploy](https://render.com/images/deploy-to-render-button.svg)](https://l.linklyhq.com/l/1uHsr) @@ -169,8 +442,10 @@ On a successfull deploy https://dashboard.render.com/ should display the followi + + -## Deploy on AWS Apprunner +#### Deploy on AWS Apprunner 1. Fork LiteLLM https://github.com/BerriAI/litellm 2. Navigate to to App Runner on AWS Console: https://console.aws.amazon.com/apprunner/home#/services 3. Follow the steps in the video below @@ -225,6 +500,8 @@ On a successfull deploy https://dashboard.render.com/ should display the followi + + ## Advanced ### Caching - Completion() and Embedding() Responses @@ -287,3 +564,220 @@ Caching can be switched on/off per /chat/completions request +## Tutorials (Chat-UI, NeMO-Guardrails, PromptTools, Phoenix ArizeAI, Langchain, ragas, LlamaIndex, etc.) 
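Each tutorial below follows the same pattern: start the LiteLLM server, then point the client library's OpenAI base URL at it. As a reference, here is that pattern with the plain `openai` SDK (pre-1.0 interface, matching the other examples on this page); the key shown is a placeholder (depending on your auth setup, the server may instead read provider keys from its own environment):

```python
import openai

openai.api_base = "http://0.0.0.0:8000"  # LiteLLM server
openai.api_key = "my-provider-key"       # placeholder - forwarded to the underlying LLM API

completion = openai.ChatCompletion.create(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "Hello from the proxy"}]
)
print(completion.choices[0].message.content)
```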
+ +**Start server:** +```shell +`docker run -e PORT=8000 -p 8000:8000 ghcr.io/berriai/litellm:latest` +``` +The server is now live on http://0.0.0.0:8000 + + + + +Here's the `docker-compose.yml` for running LiteLLM Server with Mckay Wrigley's Chat-UI: +```yaml +version: '3' +services: + container1: + image: ghcr.io/berriai/litellm:latest + ports: + - '8000:8000' + environment: + - PORT=8000 + - OPENAI_API_KEY= + + container2: + image: ghcr.io/mckaywrigley/chatbot-ui:main + ports: + - '3000:3000' + environment: + - OPENAI_API_KEY=my-fake-key + - OPENAI_API_HOST=http://container1:8000 +``` + +Run this via: +```shell +docker-compose up +``` + + + +#### Adding NeMO-Guardrails to Bedrock + +1. Start server +```shell +`docker run -e PORT=8000 -e AWS_ACCESS_KEY_ID= -e AWS_SECRET_ACCESS_KEY= -p 8000:8000 ghcr.io/berriai/litellm:latest` +``` + +2. Install dependencies +```shell +pip install nemoguardrails langchain +``` + +3. Run script +```python +import openai +from langchain.chat_models import ChatOpenAI + +llm = ChatOpenAI(model_name="bedrock/anthropic.claude-v2", openai_api_base="http://0.0.0.0:8000", openai_api_key="my-fake-key") + +from nemoguardrails import LLMRails, RailsConfig + +config = RailsConfig.from_path("./config.yml") +app = LLMRails(config, llm=llm) + +new_message = app.generate(messages=[{ + "role": "user", + "content": "Hello! What can you do for me?" +}]) +``` + + + +Use [PromptTools](https://github.com/hegelai/prompttools) for evaluating different LLMs + +1. Start server +```shell +`docker run -e PORT=8000 -p 8000:8000 ghcr.io/berriai/litellm:latest` +``` + +2. Install dependencies +```python +pip install prompttools +``` + +3. Run script +```python +import os +os.environ['DEBUG']="" # Set this to "" to call OpenAI's API +os.environ['AZURE_OPENAI_KEY'] = "my-api-key" # Insert your key here + +from typing import Dict, List +from prompttools.experiment import OpenAIChatExperiment + +models = ["gpt-3.5-turbo", "gpt-3.5-turbo-0613"] +messages = [ + [ + {"role": "system", "content": "You are a helpful assistant."}, + {"role": "user", "content": "Who was the first president?"}, + ] +] +temperatures = [0.0, 1.0] +# You can add more parameters that you'd like to test here. + +experiment = OpenAIChatExperiment(models, messages, temperature=temperatures, azure_openai_service_configs={"AZURE_OPENAI_ENDPOINT": "http://0.0.0.0:8000", "API_TYPE": "azure", "API_VERSION": "2023-05-15"}) +``` + + + +Use [Arize AI's LLM Evals](https://github.com/Arize-ai/phoenix#llm-evals) to evaluate different LLMs + +1. Start server +```shell +`docker run -e PORT=8000 -p 8000:8000 ghcr.io/berriai/litellm:latest` +``` + +2. Use this LLM Evals Quickstart colab +[![Open in Colab](https://img.shields.io/static/v1?message=Open%20in%20Colab&logo=googlecolab&labelColor=grey&color=blue&logoColor=orange&label=%20)](https://colab.research.google.com/github/Arize-ai/phoenix/blob/main/tutorials/evals/evaluate_relevance_classifications.ipynb) + +3. 
Call the model +```python +import openai + +## SET API BASE + PROVIDER KEY +openai.api_base = "http://0.0.0.0:8000 +openai.api_key = "my-anthropic-key" + +## CALL MODEL +model = OpenAIModel( + model_name="claude-2", + temperature=0.0, +) +``` + + + +```python +from langchain.chat_models import ChatOpenAI +from langchain.prompts.chat import ( + ChatPromptTemplate, + SystemMessagePromptTemplate, + AIMessagePromptTemplate, + HumanMessagePromptTemplate, +) +from langchain.schema import AIMessage, HumanMessage, SystemMessage + +chat = ChatOpenAI(model_name="claude-instant-1", openai_api_key="my-anthropic-key", openai_api_base="http://0.0.0.0:8000") + +messages = [ + SystemMessage( + content="You are a helpful assistant that translates English to French." + ), + HumanMessage( + content="Translate this sentence from English to French. I love programming." + ), +] +chat(messages) +``` + + + +#### Evaluating with Open-Source LLMs + +Use [Ragas](https://github.com/explodinggradients/ragas/blob/7b123533df80d0ada33a2cb2dd2fdedf36807f33/docs/howtos/customisations/llms.ipynb#L247) to evaluate LLMs for RAG-scenarios. +```python +from langchain.chat_models import ChatOpenAI + +inference_server_url = "http://localhost:8080/v1" + +chat = ChatOpenAI( + model="bedrock/anthropic.claude-v2", + openai_api_key="no-key", + openai_api_base=inference_server_url, + max_tokens=5, + temperature=0, +) + +from ragas.metrics import ( + context_precision, + answer_relevancy, + faithfulness, + context_recall, +) +from ragas.metrics.critique import harmfulness + +# change the LLM + +faithfulness.llm.langchain_llm = chat +answer_relevancy.llm.langchain_llm = chat +context_precision.llm.langchain_llm = chat +context_recall.llm.langchain_llm = chat +harmfulness.llm.langchain_llm = chat + + +# evaluate +from ragas import evaluate + +result = evaluate( + fiqa_eval["baseline"].select(range(5)), # showing only 5 for demonstration + metrics=[faithfulness], +) + +result +``` + + + +```python +!pip install llama-index +``` +```python +from llama_index.llms import OpenAI + +response = OpenAI(model="claude-2", api_key="your-anthropic-key",api_base="http://0.0.0.0:8000").complete('Paul Graham is ') +print(response) +``` + + + diff --git a/docs/my-website/docs/tutorials/finetuned_chat_gpt.md b/docs/my-website/docs/tutorials/finetuned_chat_gpt.md index 0650bdf32..641c45b5f 100644 --- a/docs/my-website/docs/tutorials/finetuned_chat_gpt.md +++ b/docs/my-website/docs/tutorials/finetuned_chat_gpt.md @@ -2,13 +2,41 @@ LiteLLM allows you to call `completion` with your fine-tuned gpt-3.5-turbo models If you're trying to create your custom finetuned gpt-3.5-turbo model following along on this tutorial: https://platform.openai.com/docs/guides/fine-tuning/preparing-your-dataset -Once you've created your fine tuned model, you can call it with `completion()` +Once you've created your fine tuned model, you can call it with `litellm.completion()` ## Usage ```python import os from litellm import completion -# set your OPENAI key in your .env as "OPENAI_API_KEY" + +# LiteLLM reads from your .env +os.environ["OPENAI_API_KEY"] = "your-api-key" + +response = completion( + model="ft:gpt-3.5-turbo:my-org:custom_suffix:id", + messages=[ + {"role": "system", "content": "You are a helpful assistant."}, + {"role": "user", "content": "Hello!"} + ] +) + +print(response.choices[0].message) +``` + +## Usage - Setting OpenAI Organization ID +LiteLLM allows you to specify your OpenAI Organization when calling OpenAI LLMs. 
More details here: +[setting Organization ID](https://docs.litellm.ai/docs/providers/openai#setting-organization-id-for-completion-calls) +This can be set in one of the following ways: +- Environment Variable `OPENAI_ORGANIZATION` +- Params to `litellm.completion(model=model, organization="your-organization-id")` +- Set as `litellm.organization="your-organization-id"` +```python +import os +from litellm import completion + +# LiteLLM reads from your .env +os.environ["OPENAI_API_KEY"] = "your-api-key" +os.environ["OPENAI_ORGANIZATION"] = "your-org-id" # Optional response = completion( model="ft:gpt-3.5-turbo:my-org:custom_suffix:id", diff --git a/docs/my-website/docs/tutorials/first_playground.md b/docs/my-website/docs/tutorials/first_playground.md index 7af08915c..bc34e89b6 100644 --- a/docs/my-website/docs/tutorials/first_playground.md +++ b/docs/my-website/docs/tutorials/first_playground.md @@ -184,6 +184,4 @@ This is what you should see: You've created your first LLM Playground - with the ability to call 50+ LLM APIs. Next Steps: -* [Check out the full list of LLM Providers you can now add](../completion/supported) -* [Deploy your server using Render](https://render.com/docs/deploy-flask) -* [Deploy your playground using Streamlit](https://docs.streamlit.io/streamlit-community-cloud/deploy-your-app) \ No newline at end of file +* [Check out the full list of LLM Providers you can now add](https://docs.litellm.ai/docs/providers) \ No newline at end of file diff --git a/docs/my-website/docs/tutorials/lm_evaluation_harness.md b/docs/my-website/docs/tutorials/lm_evaluation_harness.md new file mode 100644 index 000000000..3cb63d008 --- /dev/null +++ b/docs/my-website/docs/tutorials/lm_evaluation_harness.md @@ -0,0 +1,50 @@ +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; + +# LM-Evaluation Harness with TGI + +Evaluate LLMs 20x faster with TGI via litellm proxy's `/completions` endpoint. 
+ +This tutorial assumes you're using the `big-refactor` branch of [lm-evaluation-harness](https://github.com/EleutherAI/lm-evaluation-harness/tree/big-refactor) + +**Step 1: Start the local proxy** +```shell +$ litellm --model huggingface/bigcode/starcoder +``` + +Using a custom api base + +```shell +$ export HUGGINGFACE_API_KEY=my-api-key #[OPTIONAL] +$ litellm --model huggingface/tinyllama --api_base https://k58ory32yinf1ly0.us-east-1.aws.endpoints.huggingface.cloud +``` + +OpenAI Compatible Endpoint at http://0.0.0.0:8000 + +**Step 2: Set OpenAI API Base & Key** +```shell +$ export OPENAI_API_BASE=http://0.0.0.0:8000 +``` + +LM Harness requires you to set an OpenAI API key `OPENAI_API_SECRET_KEY` for running benchmarks +```shell +export OPENAI_API_SECRET_KEY=anything +``` + +**Step 3: Run LM-Eval-Harness** + +```shell +python3 -m lm_eval \ + --model openai-completions \ + --model_args engine=davinci \ + --task crows_pairs_english_age + +``` + +## Debugging + +### Making a test request to your proxy +This command makes a test Completion, ChatCompletion request to your proxy server +```shell +litellm --test +``` \ No newline at end of file diff --git a/docs/my-website/img/sentry.png b/docs/my-website/img/sentry.png new file mode 100644 index 000000000..8851aef50 Binary files /dev/null and b/docs/my-website/img/sentry.png differ diff --git a/docs/my-website/sidebars.js b/docs/my-website/sidebars.js index 7401ea7a0..759e5554a 100644 --- a/docs/my-website/sidebars.js +++ b/docs/my-website/sidebars.js @@ -36,8 +36,6 @@ const sidebars = { "completion/message_trimming", "completion/function_call", "completion/model_alias", - "completion/reliable_completions", - "completion/multiple_deployments", "completion/config", "completion/batching", "completion/mock_requests", @@ -97,10 +95,10 @@ const sidebars = { label: 'Tutorials', items: [ 'tutorials/azure_openai', - 'tutorials/ab_test_llms', 'tutorials/oobabooga', "tutorials/gradio_integration", "tutorials/model_config_proxy", + "tutorials/lm_evaluation_harness", 'tutorials/huggingface_codellama', 'tutorials/huggingface_tutorial', 'tutorials/TogetherAI_liteLLM', diff --git a/docs/my-website/src/pages/index.md b/docs/my-website/src/pages/index.md index 1abd77567..0a7da6551 100644 --- a/docs/my-website/src/pages/index.md +++ b/docs/my-website/src/pages/index.md @@ -5,6 +5,7 @@ import TabItem from '@theme/TabItem'; https://github.com/BerriAI/litellm + ## **Call 100+ LLMs using the same Input/Output Format** ## Basic usage diff --git a/docs/my-website/src/pages/observability/callbacks.md b/docs/my-website/src/pages/observability/callbacks.md index 323d73580..be27d76da 100644 --- a/docs/my-website/src/pages/observability/callbacks.md +++ b/docs/my-website/src/pages/observability/callbacks.md @@ -22,7 +22,7 @@ litellm.success_callback=["posthog", "helicone", "llmonitor"] litellm.failure_callback=["sentry", "llmonitor"] ## set env variables -os.environ['SENTRY_API_URL'], os.environ['SENTRY_API_TRACE_RATE']= "" +os.environ['SENTRY_DSN'], os.environ['SENTRY_API_TRACE_RATE']= "" os.environ['POSTHOG_API_KEY'], os.environ['POSTHOG_API_URL'] = "api-key", "api-url" os.environ["HELICONE_API_KEY"] = "" diff --git a/litellm/__init__.py b/litellm/__init__.py index cb2b64e7b..fcf64f817 100644 --- a/litellm/__init__.py +++ b/litellm/__init__.py @@ -6,6 +6,7 @@ from litellm.caching import Cache input_callback: List[Union[str, Callable]] = [] success_callback: List[Union[str, Callable]] = [] failure_callback: List[Union[str, Callable]] = [] +callbacks: List[Callable] = 
[] set_verbose = False email: Optional[ str @@ -23,6 +24,7 @@ azure_key: Optional[str] = None anthropic_key: Optional[str] = None replicate_key: Optional[str] = None cohere_key: Optional[str] = None +maritalk_key: Optional[str] = None ai21_key: Optional[str] = None openrouter_key: Optional[str] = None huggingface_key: Optional[str] = None @@ -45,6 +47,8 @@ add_function_to_prompt: bool = False # if function calling not supported by api, client_session: Optional[requests.Session] = None model_fallbacks: Optional[List] = None model_cost_map_url: str = "https://raw.githubusercontent.com/BerriAI/litellm/main/model_prices_and_context_window.json" +num_retries: Optional[int] = None +suppress_debug_info = False ############################################# def get_model_cost_map(url: str): @@ -218,6 +222,10 @@ ollama_models = [ "llama2" ] +maritalk_models = [ + "maritalk" +] + model_list = ( open_ai_chat_completion_models + open_ai_text_completion_models @@ -237,6 +245,7 @@ model_list = ( + bedrock_models + deepinfra_models + perplexity_models + + maritalk_models ) provider_list: List = [ @@ -263,6 +272,7 @@ provider_list: List = [ "deepinfra", "perplexity", "anyscale", + "maritalk", "custom", # custom apis ] @@ -282,6 +292,7 @@ models_by_provider: dict = { "ollama": ollama_models, "deepinfra": deepinfra_models, "perplexity": perplexity_models, + "maritalk": maritalk_models } # mapping for those models which have larger equivalents @@ -308,7 +319,15 @@ longer_context_model_fallback_dict: dict = { ####### EMBEDDING MODELS ################### open_ai_embedding_models: List = ["text-embedding-ada-002"] -cohere_embedding_models: List = ["embed-english-v2.0", "embed-english-light-v2.0", "embed-multilingual-v2.0"] +cohere_embedding_models: List = [ + "embed-english-v3.0", + "embed-english-light-v3.0", + "embed-multilingual-v3.0", + "embed-english-v2.0", + "embed-english-light-v2.0", + "embed-multilingual-v2.0", +] +bedrock_embedding_models: List = ["amazon.titan-embed-text-v1"] from .timeout import timeout from .testing import * @@ -324,7 +343,6 @@ from .utils import ( Logging, acreate, get_model_list, - completion_with_split_tests, get_max_tokens, register_prompt_template, validate_environment, @@ -348,6 +366,7 @@ from .llms.petals import PetalsConfig from .llms.vertex_ai import VertexAIConfig from .llms.sagemaker import SagemakerConfig from .llms.ollama import OllamaConfig +from .llms.maritalk import MaritTalkConfig from .llms.bedrock import AmazonTitanConfig, AmazonAI21Config, AmazonAnthropicConfig, AmazonCohereConfig from .llms.openai import OpenAIConfig, OpenAITextCompletionConfig, AzureOpenAIConfig from .main import * # type: ignore @@ -359,10 +378,9 @@ from .exceptions import ( ServiceUnavailableError, OpenAIError, ContextWindowExceededError, - BudgetExceededError - + BudgetExceededError, ) from .budget_manager import BudgetManager from .proxy.proxy_cli import run_server from .router import Router - +from .proxy.proxy_server import app diff --git a/litellm/budget_manager.py b/litellm/budget_manager.py index 77a1e51f2..6a9d1e520 100644 --- a/litellm/budget_manager.py +++ b/litellm/budget_manager.py @@ -14,7 +14,8 @@ class BudgetManager: def print_verbose(self, print_statement): if litellm.set_verbose: - print(print_statement) + import logging + logging.info(print_statement) def load_data(self): if self.client_type == "local": @@ -149,8 +150,6 @@ class BudgetManager: 'project_name' : self.project_name, "user_dict": self.user_dict } - print(f"data: {data}") response = requests.post(url, 
headers=headers, json=data) - print(f"response: {response.text}") response = response.json() return response \ No newline at end of file diff --git a/litellm/caching.py b/litellm/caching.py index 0e508e37e..9632a6b03 100644 --- a/litellm/caching.py +++ b/litellm/caching.py @@ -8,8 +8,9 @@ # Thank you users! We ❤️ you! - Krrish & Ishaan import litellm -import time -import json +import time, logging +import json, traceback + def get_prompt(*args, **kwargs): # make this safe checks, it should not throw any exceptions @@ -23,81 +24,105 @@ def get_prompt(*args, **kwargs): return prompt return None -class RedisCache(): + +class BaseCache: + def set_cache(self, key, value, **kwargs): + raise NotImplementedError + + def get_cache(self, key, **kwargs): + raise NotImplementedError + + +class RedisCache(BaseCache): def __init__(self, host, port, password): import redis # if users don't provider one, use the default litellm cache self.redis_client = redis.Redis(host=host, port=port, password=password) - def set_cache(self, key, value): + def set_cache(self, key, value, **kwargs): + ttl = kwargs.get("ttl", None) try: - self.redis_client.set(key, str(value)) + self.redis_client.set(name=key, value=str(value), ex=ttl) except Exception as e: # NON blocking - notify users Redis is throwing an exception - print("LiteLLM Caching: Got exception from REDIS: ", e) + logging.debug("LiteLLM Caching: set() - Got exception from REDIS : ", e) - def get_cache(self, key): + def get_cache(self, key, **kwargs): try: # TODO convert this to a ModelResponse object cached_response = self.redis_client.get(key) - if cached_response!=None: + if cached_response != None: # cached_response is in `b{} convert it to ModelResponse cached_response = cached_response.decode("utf-8") # Convert bytes to string cached_response = json.loads(cached_response) # Convert string to dictionary - cached_response['cache'] = True # set cache-hit flag to True + cached_response['cache'] = True # set cache-hit flag to True return cached_response except Exception as e: # NON blocking - notify users Redis is throwing an exception - print("LiteLLM Caching: Got exception from REDIS: ", e) + traceback.print_exc() + logging.debug("LiteLLM Caching: get() - Got exception from REDIS: ", e) -class HostedCache(): - def set_cache(self, key, value): + +class HostedCache(BaseCache): + def set_cache(self, key, value, **kwargs): + if "ttl" in kwargs: + logging.debug("LiteLLM Caching: TTL is not supported for hosted cache!") # make a post request to api.litellm.ai/set_cache import requests url = f"https://api.litellm.ai/set_cache?key={key}&value={str(value)}" - requests.request("POST", url) # post request to set this in the hosted litellm cache + requests.request("POST", url) # post request to set this in the hosted litellm cache - def get_cache(self, key): + def get_cache(self, key, **kwargs): import requests url = f"https://api.litellm.ai/get_cache?key={key}" cached_response = requests.request("GET", url) cached_response = cached_response.text - if cached_response == "NONE": # api.litellm.ai returns "NONE" if it's not a cache hit - return None - if cached_response!=None: + if cached_response == "NONE": # api.litellm.ai returns "NONE" if it's not a cache hit + return None + if cached_response != None: try: cached_response = json.loads(cached_response) # Convert string to dictionary - cached_response['cache'] = True # set cache-hit flag to True + cached_response['cache'] = True # set cache-hit flag to True return cached_response except: return cached_response -class 
InMemoryCache(): + +class InMemoryCache(BaseCache): def __init__(self): # if users don't provider one, use the default litellm cache self.cache_dict = {} + self.ttl_dict = {} - def set_cache(self, key, value): - #print("in set cache for inmem") + def set_cache(self, key, value, **kwargs): self.cache_dict[key] = value - #print(self.cache_dict) + if "ttl" in kwargs: + self.ttl_dict[key] = time.time() + kwargs["ttl"] - def get_cache(self, key): - #print("in get cache for inmem") + def get_cache(self, key, **kwargs): if key in self.cache_dict: - #print("got a cache hit") - return self.cache_dict[key] - #print("got a cache miss") + if key in self.ttl_dict: + if time.time() > self.ttl_dict[key]: + self.cache_dict.pop(key, None) + return None + original_cached_response = self.cache_dict[key] + try: + cached_response = json.loads(original_cached_response) + except: + cached_response = original_cached_response + cached_response['cache'] = True # set cache-hit flag to True + return cached_response return None -class Cache(): + +class Cache: def __init__( - self, - type = "local", - host = None, - port = None, - password = None - ): + self, + type="local", + host=None, + port=None, + password=None + ): """ Initializes the cache based on the given type. @@ -151,9 +176,9 @@ class Cache(): def generate_streaming_content(self, content): chunk_size = 5 # Adjust the chunk size as needed for i in range(0, len(content), chunk_size): - yield {'choices': [{'delta': {'role': 'assistant', 'content': content[i:i+chunk_size]}}]} + yield {'choices': [{'delta': {'role': 'assistant', 'content': content[i:i + chunk_size]}}]} time.sleep(0.02) - + def get_cache(self, *args, **kwargs): """ Retrieves the cached result for the given arguments. @@ -166,19 +191,18 @@ class Cache(): The cached result if it exists, otherwise None. 
""" try: # never block execution - if "cache_key" in kwargs: + if "cache_key" in kwargs: cache_key = kwargs["cache_key"] - else: + else: cache_key = self.get_cache_key(*args, **kwargs) if cache_key is not None: cached_result = self.cache.get_cache(cache_key) if cached_result != None and 'stream' in kwargs and kwargs['stream'] == True: # if streaming is true and we got a cache hit, return a generator - #print("cache hit and stream=True") - #print(cached_result) return self.generate_streaming_content(cached_result["choices"][0]['message']['content']) return cached_result - except: + except Exception as e: + logging.debug(f"An exception occurred: {traceback.format_exc()}") return None def add_cache(self, result, *args, **kwargs): @@ -193,20 +217,11 @@ class Cache(): None """ try: - if "cache_key" in kwargs: + if "cache_key" in kwargs: cache_key = kwargs["cache_key"] - else: + else: cache_key = self.get_cache_key(*args, **kwargs) - # print("adding to cache", cache_key, result) - # print(cache_key) if cache_key is not None: - # print("adding to cache", cache_key, result) - self.cache.set_cache(cache_key, result) + self.cache.set_cache(cache_key, result, **kwargs) except: pass - - - - - - diff --git a/litellm/gpt_cache.py b/litellm/gpt_cache.py index e2c9d33fc..6d45e1545 100644 --- a/litellm/gpt_cache.py +++ b/litellm/gpt_cache.py @@ -1,4 +1,5 @@ ###### LiteLLM Integration with GPT Cache ######### +# will be deprecated soon https://github.com/BerriAI/litellm/discussions/648#discussioncomment-7461510 import gptcache # openai.ChatCompletion._llm_handler = litellm.completion diff --git a/litellm/integrations/custom_logger.py b/litellm/integrations/custom_logger.py index d79b01cfe..66dd57eb2 100644 --- a/litellm/integrations/custom_logger.py +++ b/litellm/integrations/custom_logger.py @@ -12,7 +12,25 @@ class CustomLogger: # Class variables or attributes def __init__(self): pass + + def log_pre_api_call(self, model, messages, kwargs): + pass + + def log_post_api_call(self, kwargs, response_obj, start_time, end_time): + pass + def log_stream_event(self, kwargs, response_obj, start_time, end_time): + pass + + def log_success_event(self, kwargs, response_obj, start_time, end_time): + pass + + def log_failure_event(self, kwargs, response_obj, start_time, end_time): + pass + + + #### DEPRECATED #### + def log_input_event(self, model, messages, kwargs, print_verbose, callback_func): try: print_verbose( diff --git a/litellm/integrations/langfuse.py b/litellm/integrations/langfuse.py index 75826ddfb..e6f48a5bd 100644 --- a/litellm/integrations/langfuse.py +++ b/litellm/integrations/langfuse.py @@ -13,8 +13,8 @@ class LangFuseLogger: def __init__(self): try: from langfuse import Langfuse - except: - raise Exception("\033[91mLangfuse not installed, try running 'pip install langfuse' to fix this error\033[0m") + except Exception as e: + raise Exception("\033[91mLangfuse not installed, try running 'pip install langfuse' to fix this error\033[0m", e) # Instance variables self.secret_key = os.getenv("LANGFUSE_SECRET_KEY") self.public_key = os.getenv("LANGFUSE_PUBLIC_KEY") @@ -36,10 +36,6 @@ class LangFuseLogger: print_verbose( f"Langfuse Logging - Enters logging function for model {kwargs}" ) - # print(response_obj) - # print(response_obj['choices'][0]['message']['content']) - # print(response_obj['usage']['prompt_tokens']) - # print(response_obj['usage']['completion_tokens']) metadata = kwargs.get("metadata", {}) prompt = [kwargs['messages']] diff --git a/litellm/integrations/prompt_layer.py 
b/litellm/integrations/prompt_layer.py index cf0b7fd57..4167ea60f 100644 --- a/litellm/integrations/prompt_layer.py +++ b/litellm/integrations/prompt_layer.py @@ -17,18 +17,25 @@ class PromptLayerLogger: def log_event(self, kwargs, response_obj, start_time, end_time, print_verbose): # Method definition try: - if 'litellm_logging_obj' in kwargs: - kwargs.pop('litellm_logging_obj') + new_kwargs = {} + new_kwargs['model'] = kwargs['model'] + new_kwargs['messages'] = kwargs['messages'] + + # add kwargs["optional_params"] to new_kwargs + for optional_param in kwargs["optional_params"]: + new_kwargs[optional_param] = kwargs["optional_params"][optional_param] + print_verbose( - f"Prompt Layer Logging - Enters logging function for model kwargs: {kwargs}\n, response: {response_obj}" + f"Prompt Layer Logging - Enters logging function for model kwargs: {new_kwargs}\n, response: {response_obj}" ) + request_response = requests.post( "https://api.promptlayer.com/rest/track-request", json={ "function_name": "openai.ChatCompletion.create", - "kwargs": kwargs, + "kwargs": new_kwargs, "tags": ["hello", "world"], "request_response": dict(response_obj), "request_start_time": int(start_time.timestamp()), diff --git a/litellm/llms/aleph_alpha.py b/litellm/llms/aleph_alpha.py index 0e83b76a7..090262461 100644 --- a/litellm/llms/aleph_alpha.py +++ b/litellm/llms/aleph_alpha.py @@ -262,11 +262,9 @@ def completion( model_response["created"] = time.time() model_response["model"] = model - model_response["usage"] = { - "prompt_tokens": prompt_tokens, - "completion_tokens": completion_tokens, - "total_tokens": prompt_tokens + completion_tokens, - } + model_response.usage.completion_tokens = completion_tokens + model_response.usage.prompt_tokens = prompt_tokens + model_response.usage.total_tokens = prompt_tokens + completion_tokens return model_response def embedding(): diff --git a/litellm/llms/anthropic.py b/litellm/llms/anthropic.py index 7a2b3d8d8..8cce80826 100644 --- a/litellm/llms/anthropic.py +++ b/litellm/llms/anthropic.py @@ -164,11 +164,9 @@ def completion( model_response["created"] = time.time() model_response["model"] = model - model_response["usage"] = { - "prompt_tokens": prompt_tokens, - "completion_tokens": completion_tokens, - "total_tokens": prompt_tokens + completion_tokens, - } + model_response.usage.completion_tokens = completion_tokens + model_response.usage.prompt_tokens = prompt_tokens + model_response.usage.total_tokens = prompt_tokens + completion_tokens return model_response def embedding(): diff --git a/litellm/llms/baseten.py b/litellm/llms/baseten.py index aecacd84f..05abb0005 100644 --- a/litellm/llms/baseten.py +++ b/litellm/llms/baseten.py @@ -136,11 +136,9 @@ def completion( model_response["created"] = time.time() model_response["model"] = model - model_response["usage"] = { - "prompt_tokens": prompt_tokens, - "completion_tokens": completion_tokens, - "total_tokens": prompt_tokens + completion_tokens, - } + model_response.usage.completion_tokens = completion_tokens + model_response.usage.prompt_tokens = prompt_tokens + model_response.usage.total_tokens = prompt_tokens + completion_tokens return model_response def embedding(): diff --git a/litellm/llms/bedrock.py b/litellm/llms/bedrock.py index 7014ebc42..18f67526f 100644 --- a/litellm/llms/bedrock.py +++ b/litellm/llms/bedrock.py @@ -1,4 +1,5 @@ import json, copy, types +import os from enum import Enum import time from typing import Callable, Optional @@ -174,8 +175,32 @@ def init_bedrock_client( aws_access_key_id = None, 
aws_secret_access_key = None, aws_region_name=None, + aws_bedrock_runtime_endpoint=None, ): + # check for custom AWS_REGION_NAME and use it if not passed to init_bedrock_client + litellm_aws_region_name = get_secret("AWS_REGION_NAME") + standard_aws_region_name = get_secret("AWS_REGION") + if region_name: + pass + elif aws_region_name: + region_name = aws_region_name + elif litellm_aws_region_name: + region_name = litellm_aws_region_name + elif standard_aws_region_name: + region_name = standard_aws_region_name + else: + raise BedrockError(message="AWS region not set: set AWS_REGION_NAME or AWS_REGION env variable or in .env file", status_code=401) + + # check for custom AWS_BEDROCK_RUNTIME_ENDPOINT and use it if not passed to init_bedrock_client + env_aws_bedrock_runtime_endpoint = get_secret("AWS_BEDROCK_RUNTIME_ENDPOINT") + if aws_bedrock_runtime_endpoint: + endpoint_url = aws_bedrock_runtime_endpoint + elif env_aws_bedrock_runtime_endpoint: + endpoint_url = env_aws_bedrock_runtime_endpoint + else: + endpoint_url = f'https://bedrock-runtime.{region_name}.amazonaws.com' + import boto3 if aws_access_key_id != None: # uses auth params passed to completion @@ -185,23 +210,17 @@ def init_bedrock_client( service_name="bedrock-runtime", aws_access_key_id=aws_access_key_id, aws_secret_access_key=aws_secret_access_key, - region_name=aws_region_name, - endpoint_url=f'https://bedrock-runtime.{aws_region_name}.amazonaws.com' + region_name=region_name, + endpoint_url=endpoint_url, ) else: # aws_access_key_id is None, assume user is trying to auth using env variables - # boto3 automaticaly reads env variables + # boto3 automatically reads env variables - # we need to read region name from env - # I assume majority of users use .env for auth - region_name = ( - get_secret("AWS_REGION_NAME") or - "us-west-2" # default to us-west-2 if user not specified - ) client = boto3.client( service_name="bedrock-runtime", region_name=region_name, - endpoint_url=f'https://bedrock-runtime.{region_name}.amazonaws.com' + endpoint_url=endpoint_url, ) return client @@ -259,6 +278,174 @@ def completion( litellm_params=None, logger_fn=None, ): + exception_mapping_worked = False + try: + # pop aws_secret_access_key, aws_access_key_id, aws_region_name from kwargs, since completion calls fail with them + aws_secret_access_key = optional_params.pop("aws_secret_access_key", None) + aws_access_key_id = optional_params.pop("aws_access_key_id", None) + aws_region_name = optional_params.pop("aws_region_name", None) + + # use passed in BedrockRuntime.Client if provided, otherwise create a new one + client = optional_params.pop( + "aws_bedrock_client", + # only pass variables that are not None + init_bedrock_client( + aws_access_key_id=aws_access_key_id, + aws_secret_access_key=aws_secret_access_key, + aws_region_name=aws_region_name, + ), + ) + + model = model + provider = model.split(".")[0] + prompt = convert_messages_to_prompt(model, messages, provider, custom_prompt_dict) + inference_params = copy.deepcopy(optional_params) + stream = inference_params.pop("stream", False) + if provider == "anthropic": + ## LOAD CONFIG + config = litellm.AmazonAnthropicConfig.get_config() + for k, v in config.items(): + if k not in inference_params: # completion(top_k=3) > anthropic_config(top_k=3) <- allows for dynamic variables to be passed in + inference_params[k] = v + data = json.dumps({ + "prompt": prompt, + **inference_params + }) + elif provider == "ai21": + ## LOAD CONFIG + config = litellm.AmazonAI21Config.get_config() + for k, v in 
config.items(): + if k not in inference_params: # completion(top_k=3) > anthropic_config(top_k=3) <- allows for dynamic variables to be passed in + inference_params[k] = v + + data = json.dumps({ + "prompt": prompt, + **inference_params + }) + elif provider == "cohere": + ## LOAD CONFIG + config = litellm.AmazonCohereConfig.get_config() + for k, v in config.items(): + if k not in inference_params: # completion(top_k=3) > anthropic_config(top_k=3) <- allows for dynamic variables to be passed in + inference_params[k] = v + if optional_params.get("stream", False) == True: + inference_params["stream"] = True # cohere requires stream = True in inference params + data = json.dumps({ + "prompt": prompt, + **inference_params + }) + elif provider == "amazon": # amazon titan + ## LOAD CONFIG + config = litellm.AmazonTitanConfig.get_config() + for k, v in config.items(): + if k not in inference_params: # completion(top_k=3) > amazon_config(top_k=3) <- allows for dynamic variables to be passed in + inference_params[k] = v + + data = json.dumps({ + "inputText": prompt, + "textGenerationConfig": inference_params, + }) + + ## LOGGING + logging_obj.pre_call( + input=prompt, + api_key="", + additional_args={"complete_input_dict": data}, + ) + + ## COMPLETION CALL + accept = 'application/json' + contentType = 'application/json' + if stream == True: + response = client.invoke_model_with_response_stream( + body=data, + modelId=model, + accept=accept, + contentType=contentType + ) + response = response.get('body') + return response + + try: + response = client.invoke_model( + body=data, + modelId=model, + accept=accept, + contentType=contentType + ) + except Exception as e: + raise BedrockError(status_code=500, message=str(e)) + + response_body = json.loads(response.get('body').read()) + + ## LOGGING + logging_obj.post_call( + input=prompt, + api_key="", + original_response=response_body, + additional_args={"complete_input_dict": data}, + ) + print_verbose(f"raw model_response: {response}") + ## RESPONSE OBJECT + outputText = "default" + if provider == "ai21": + outputText = response_body.get('completions')[0].get('data').get('text') + elif provider == "anthropic": + outputText = response_body['completion'] + model_response["finish_reason"] = response_body["stop_reason"] + elif provider == "cohere": + outputText = response_body["generations"][0]["text"] + else: # amazon titan + outputText = response_body.get('results')[0].get('outputText') + + response_metadata = response.get("ResponseMetadata", {}) + if response_metadata.get("HTTPStatusCode", 500) >= 400: + raise BedrockError( + message=outputText, + status_code=response_metadata.get("HTTPStatusCode", 500), + ) + else: + try: + if len(outputText) > 0: + model_response["choices"][0]["message"]["content"] = outputText + except: + raise BedrockError(message=json.dumps(outputText), status_code=response_metadata.get("HTTPStatusCode", 500)) + + ## CALCULATING USAGE - baseten charges on time, not tokens - have some mapping of cost here. 
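+        # approximate token usage by encoding the prompt and the generated text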
+ prompt_tokens = len( + encoding.encode(prompt) + ) + completion_tokens = len( + encoding.encode(model_response["choices"][0]["message"].get("content", "")) + ) + + model_response["created"] = time.time() + model_response["model"] = model + model_response.usage.completion_tokens = completion_tokens + model_response.usage.prompt_tokens = prompt_tokens + model_response.usage.total_tokens = prompt_tokens + completion_tokens + return model_response + except BedrockError as e: + exception_mapping_worked = True + raise e + except Exception as e: + if exception_mapping_worked: + raise e + else: + import traceback + raise BedrockError(status_code=500, message=traceback.format_exc()) + + + +def embedding( + model: str, + input: list, + logging_obj=None, + model_response=None, + optional_params=None, + encoding=None, +): + # logic for parsing in - calling - parsing out model embedding calls # pop aws_secret_access_key, aws_access_key_id, aws_region_name from kwargs, since completion calls fail with them aws_secret_access_key = optional_params.pop("aws_secret_access_key", None) aws_access_key_id = optional_params.pop("aws_access_key_id", None) @@ -274,132 +461,39 @@ def completion( aws_region_name=aws_region_name, ), ) - - model = model - provider = model.split(".")[0] - prompt = convert_messages_to_prompt(model, messages, provider, custom_prompt_dict) - inference_params = copy.deepcopy(optional_params) - stream = inference_params.pop("stream", False) - if provider == "anthropic": - ## LOAD CONFIG - config = litellm.AmazonAnthropicConfig.get_config() - for k, v in config.items(): - if k not in inference_params: # completion(top_k=3) > anthropic_config(top_k=3) <- allows for dynamic variables to be passed in - inference_params[k] = v - data = json.dumps({ - "prompt": prompt, - **inference_params - }) - elif provider == "ai21": - ## LOAD CONFIG - config = litellm.AmazonAI21Config.get_config() - for k, v in config.items(): - if k not in inference_params: # completion(top_k=3) > anthropic_config(top_k=3) <- allows for dynamic variables to be passed in - inference_params[k] = v - - data = json.dumps({ - "prompt": prompt, - **inference_params - }) - elif provider == "cohere": - ## LOAD CONFIG - config = litellm.AmazonCohereConfig.get_config() - for k, v in config.items(): - if k not in inference_params: # completion(top_k=3) > anthropic_config(top_k=3) <- allows for dynamic variables to be passed in - inference_params[k] = v - data = json.dumps({ - "prompt": prompt, - **inference_params - }) - elif provider == "amazon": # amazon titan - ## LOAD CONFIG - config = litellm.AmazonTitanConfig.get_config() - for k, v in config.items(): - if k not in inference_params: # completion(top_k=3) > amazon_config(top_k=3) <- allows for dynamic variables to be passed in - inference_params[k] = v - - data = json.dumps({ - "inputText": prompt, - "textGenerationConfig": inference_params, - }) - ## LOGGING - logging_obj.pre_call( - input=prompt, - api_key="", - additional_args={"complete_input_dict": data}, - ) - - ## COMPLETION CALL - accept = 'application/json' - contentType = 'application/json' - if stream == True: - response = client.invoke_model_with_response_stream( - body=data, - modelId=model, - accept=accept, - contentType=contentType - ) - response = response.get('body') - return response + # translate to bedrock + # bedrock only accepts (str) for inputText + if type(input) == list: + if len(input) > 1: # input is a list with more than 1 elem, raise Exception, Bedrock only supports one element + raise 
BedrockError(message="Bedrock cannot embed() more than one string - len(input) must always == 1, input = ['hi from litellm']", status_code=400) + input_str = "".join(input) response = client.invoke_model( - body=data, + body=json.dumps({ + "inputText": input_str + }), modelId=model, - accept=accept, - contentType=contentType + accept="*/*", + contentType="application/json" ) + response_body = json.loads(response.get('body').read()) - ## LOGGING - logging_obj.post_call( - input=prompt, - api_key="", - original_response=response_body, - additional_args={"complete_input_dict": data}, - ) - print_verbose(f"raw model_response: {response}") - ## RESPONSE OBJECT - outputText = "default" - if provider == "ai21": - outputText = response_body.get('completions')[0].get('data').get('text') - elif provider == "anthropic": - outputText = response_body['completion'] - model_response["finish_reason"] = response_body["stop_reason"] - elif provider == "cohere": - outputText = response_body["generations"][0]["text"] - else: # amazon titan - outputText = response_body.get('results')[0].get('outputText') - if "error" in outputText: - raise BedrockError( - message=outputText, - status_code=response.status_code, - ) - else: - try: - if len(outputText) > 0: - model_response["choices"][0]["message"]["content"] = outputText - except: - raise BedrockError(message=json.dumps(outputText), status_code=response.status_code) + embedding_response = response_body["embedding"] - ## CALCULATING USAGE - baseten charges on time, not tokens - have some mapping of cost here. - prompt_tokens = len( - encoding.encode(prompt) - ) - completion_tokens = len( - encoding.encode(model_response["choices"][0]["message"].get("content", "")) - ) - - model_response["created"] = time.time() + model_response["object"] = "list" + model_response["data"] = embedding_response model_response["model"] = model - model_response["usage"] = { - "prompt_tokens": prompt_tokens, - "completion_tokens": completion_tokens, - "total_tokens": prompt_tokens + completion_tokens, + input_tokens = 0 + + input_tokens+=len(encoding.encode(input_str)) + + model_response["usage"] = { + "prompt_tokens": input_tokens, + "total_tokens": input_tokens, } + + + return model_response - - -def embedding(): - # logic for parsing in - calling - parsing out model embedding calls - pass diff --git a/litellm/llms/cohere.py b/litellm/llms/cohere.py index cd6032c56..123391c32 100644 --- a/litellm/llms/cohere.py +++ b/litellm/llms/cohere.py @@ -137,6 +137,10 @@ def completion( response = requests.post( completion_url, headers=headers, data=json.dumps(data), stream=optional_params["stream"] if "stream" in optional_params else False ) + ## error handling for cohere calls + if response.status_code!=200: + raise CohereError(message=response.text, status_code=response.status_code) + if "stream" in optional_params and optional_params["stream"] == True: return response.iter_lines() else: @@ -179,11 +183,9 @@ def completion( model_response["created"] = time.time() model_response["model"] = model - model_response["usage"] = { - "prompt_tokens": prompt_tokens, - "completion_tokens": completion_tokens, - "total_tokens": prompt_tokens + completion_tokens, - } + model_response.usage.completion_tokens = completion_tokens + model_response.usage.prompt_tokens = prompt_tokens + model_response.usage.total_tokens = prompt_tokens + completion_tokens return model_response def embedding( @@ -193,6 +195,7 @@ def embedding( logging_obj=None, model_response=None, encoding=None, + optional_params=None, ): 
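
The new Bedrock `embedding()` above accepts exactly one string (a one-element list) and stores the raw Titan embedding on `model_response["data"]`. A minimal sketch of calling it through the top-level `litellm.embedding` entrypoint, assuming `bedrock/` models are routed to this handler and AWS credentials are available in the environment (the model id is a placeholder):

```python
from litellm import embedding

# Bedrock only embeds a single string per call - len(input) must be exactly 1.
response = embedding(
    model="bedrock/amazon.titan-embed-text-v1",   # example Titan embedding model
    input=["good morning from litellm"],
)
print(response["usage"])   # {'prompt_tokens': ..., 'total_tokens': ...}
```
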
headers = validate_environment(api_key) embed_url = "https://api.cohere.ai/v1/embed" @@ -200,8 +203,13 @@ def embedding( data = { "model": model, "texts": input, + **optional_params } + if "3" in model and "input_type" not in data: + # cohere v3 embedding models require input_type, if no input_type is provided, default to "search_document" + data["input_type"] = "search_document" + ## LOGGING logging_obj.pre_call( input=input, @@ -212,7 +220,6 @@ def embedding( response = requests.post( embed_url, headers=headers, data=json.dumps(data) ) - ## LOGGING logging_obj.post_call( input=input, @@ -220,7 +227,6 @@ def embedding( additional_args={"complete_input_dict": data}, original_response=response, ) - # print(response.json()) """ response { @@ -232,6 +238,8 @@ def embedding( 'usage' } """ + if response.status_code!=200: + raise CohereError(message=response.text, status_code=response.status_code) embeddings = response.json()['embeddings'] output_data = [] for idx, embedding in enumerate(embeddings): diff --git a/litellm/llms/huggingface_restapi.py b/litellm/llms/huggingface_restapi.py index b3c3e5e38..0c9d2432a 100644 --- a/litellm/llms/huggingface_restapi.py +++ b/litellm/llms/huggingface_restapi.py @@ -141,216 +141,239 @@ def completion( litellm_params=None, logger_fn=None, ): - headers = validate_environment(api_key, headers) - task = get_hf_task_for_model(model) - print_verbose(f"{model}, {task}") - completion_url = "" - input_text = None - if "https" in model: - completion_url = model - elif api_base: - completion_url = api_base - elif "HF_API_BASE" in os.environ: - completion_url = os.getenv("HF_API_BASE", "") - elif "HUGGINGFACE_API_BASE" in os.environ: - completion_url = os.getenv("HUGGINGFACE_API_BASE", "") - else: - completion_url = f"https://api-inference.huggingface.co/models/{model}" - - ## Load Config - config=litellm.HuggingfaceConfig.get_config() - for k, v in config.items(): - if k not in optional_params: # completion(top_k=3) > huggingfaceConfig(top_k=3) <- allows for dynamic variables to be passed in - optional_params[k] = v - - ### MAP INPUT PARAMS - if task == "conversational": - inference_params = copy.deepcopy(optional_params) - inference_params.pop("details") - inference_params.pop("return_full_text") - past_user_inputs = [] - generated_responses = [] - text = "" - for message in messages: - if message["role"] == "user": - if text != "": - past_user_inputs.append(text) - text = message["content"] - elif message["role"] == "assistant" or message["role"] == "system": - generated_responses.append(message["content"]) - data = { - "inputs": { - "text": text, - "past_user_inputs": past_user_inputs, - "generated_responses": generated_responses - }, - "parameters": inference_params - } - input_text = "".join(message["content"] for message in messages) - elif task == "text-generation-inference": - # always send "details" and "return_full_text" as params - if model in custom_prompt_dict: - # check if the model has a registered custom prompt - model_prompt_details = custom_prompt_dict[model] - prompt = custom_prompt( - role_dict=model_prompt_details["roles"], - initial_prompt_value=model_prompt_details["initial_prompt_value"], - final_prompt_value=model_prompt_details["final_prompt_value"], - messages=messages - ) + exception_mapping_worked = False + try: + headers = validate_environment(api_key, headers) + task = get_hf_task_for_model(model) + print_verbose(f"{model}, {task}") + completion_url = "" + input_text = None + if "https" in model: + completion_url = model + elif 
api_base: + completion_url = api_base + elif "HF_API_BASE" in os.environ: + completion_url = os.getenv("HF_API_BASE", "") + elif "HUGGINGFACE_API_BASE" in os.environ: + completion_url = os.getenv("HUGGINGFACE_API_BASE", "") else: - prompt = prompt_factory(model=model, messages=messages) - data = { - "inputs": prompt, - "parameters": optional_params, - "stream": True if "stream" in optional_params and optional_params["stream"] == True else False, - } - input_text = prompt - else: - # Non TGI and Conversational llms - # We need this branch, it removes 'details' and 'return_full_text' from params - if model in custom_prompt_dict: - # check if the model has a registered custom prompt - model_prompt_details = custom_prompt_dict[model] - prompt = custom_prompt( - role_dict=model_prompt_details["roles"], - initial_prompt_value=model_prompt_details["initial_prompt_value"], - final_prompt_value=model_prompt_details["final_prompt_value"], - messages=messages - ) - else: - prompt = prompt_factory(model=model, messages=messages) - inference_params = copy.deepcopy(optional_params) - inference_params.pop("details") - inference_params.pop("return_full_text") - data = { - "inputs": prompt, - "parameters": inference_params, - "stream": True if "stream" in optional_params and optional_params["stream"] == True else False, - } - input_text = prompt - ## LOGGING - logging_obj.pre_call( - input=input_text, - api_key=api_key, - additional_args={"complete_input_dict": data, "task": task, "headers": headers}, - ) - ## COMPLETION CALL - if "stream" in optional_params and optional_params["stream"] == True: - response = requests.post( - completion_url, - headers=headers, - data=json.dumps(data), - stream=optional_params["stream"] - ) - return response.iter_lines() - else: - response = requests.post( - completion_url, - headers=headers, - data=json.dumps(data) - ) + completion_url = f"https://api-inference.huggingface.co/models/{model}" - ## Some servers might return streaming responses even though stream was not set to true. (e.g. 
Baseten) - is_streamed = False - if response.__dict__['headers']["Content-Type"] == "text/event-stream": - is_streamed = True - - # iterate over the complete streamed response, and return the final answer - if is_streamed: - streamed_response = CustomStreamWrapper(completion_stream=response.iter_lines(), model=model, custom_llm_provider="huggingface", logging_obj=logging_obj) - content = "" - for chunk in streamed_response: - content += chunk["choices"][0]["delta"]["content"] - completion_response: List[Dict[str, Any]] = [{"generated_text": content}] - ## LOGGING - logging_obj.post_call( - input=input_text, - api_key=api_key, - original_response=completion_response, - additional_args={"complete_input_dict": data, "task": task}, - ) - else: - ## LOGGING - logging_obj.post_call( - input=input_text, - api_key=api_key, - original_response=response.text, - additional_args={"complete_input_dict": data, "task": task}, - ) - ## RESPONSE OBJECT - try: - completion_response = response.json() - except: - raise HuggingfaceError( - message=response.text, status_code=response.status_code + ## Load Config + config=litellm.HuggingfaceConfig.get_config() + for k, v in config.items(): + if k not in optional_params: # completion(top_k=3) > huggingfaceConfig(top_k=3) <- allows for dynamic variables to be passed in + optional_params[k] = v + + ### MAP INPUT PARAMS + if task == "conversational": + inference_params = copy.deepcopy(optional_params) + inference_params.pop("details") + inference_params.pop("return_full_text") + past_user_inputs = [] + generated_responses = [] + text = "" + for message in messages: + if message["role"] == "user": + if text != "": + past_user_inputs.append(text) + text = message["content"] + elif message["role"] == "assistant" or message["role"] == "system": + generated_responses.append(message["content"]) + data = { + "inputs": { + "text": text, + "past_user_inputs": past_user_inputs, + "generated_responses": generated_responses + }, + "parameters": inference_params + } + input_text = "".join(message["content"] for message in messages) + elif task == "text-generation-inference": + # always send "details" and "return_full_text" as params + if model in custom_prompt_dict: + # check if the model has a registered custom prompt + model_prompt_details = custom_prompt_dict[model] + prompt = custom_prompt( + role_dict=model_prompt_details.get("roles", None), + initial_prompt_value=model_prompt_details.get("initial_prompt_value", ""), + final_prompt_value=model_prompt_details.get("final_prompt_value", ""), + messages=messages ) - print_verbose(f"response: {completion_response}") - if isinstance(completion_response, dict) and "error" in completion_response: - print_verbose(f"completion error: {completion_response['error']}") - print_verbose(f"response.status_code: {response.status_code}") - raise HuggingfaceError( - message=completion_response["error"], - status_code=response.status_code, - ) - else: - if task == "conversational": - if len(completion_response["generated_text"]) > 0: # type: ignore - model_response["choices"][0]["message"][ - "content" - ] = completion_response["generated_text"] # type: ignore - elif task == "text-generation-inference": - if len(completion_response[0]["generated_text"]) > 0: - model_response["choices"][0]["message"][ - "content" - ] = completion_response[0]["generated_text"] - ## GETTING LOGPROBS + FINISH REASON - if "details" in completion_response[0] and "tokens" in completion_response[0]["details"]: - model_response.choices[0].finish_reason = 
completion_response[0]["details"]["finish_reason"] - sum_logprob = 0 - for token in completion_response[0]["details"]["tokens"]: - sum_logprob += token["logprob"] - model_response["choices"][0]["message"]._logprob = sum_logprob - if "best_of" in optional_params and optional_params["best_of"] > 1: - if "details" in completion_response[0] and "best_of_sequences" in completion_response[0]["details"]: - choices_list = [] - for idx, item in enumerate(completion_response[0]["details"]["best_of_sequences"]): - sum_logprob = 0 - for token in item["tokens"]: - sum_logprob += token["logprob"] - if len(item["generated_text"]) > 0: - message_obj = Message(content=item["generated_text"], logprobs=sum_logprob) - else: - message_obj = Message(content=None) - choice_obj = Choices(finish_reason=item["finish_reason"], index=idx+1, message=message_obj) - choices_list.append(choice_obj) - model_response["choices"].extend(choices_list) else: - if len(completion_response[0]["generated_text"]) > 0: - model_response["choices"][0]["message"][ - "content" - ] = completion_response[0]["generated_text"] - ## CALCULATING USAGE - prompt_tokens = len( - encoding.encode(input_text) - ) ##[TODO] use the llama2 tokenizer here - print_verbose(f'output: {model_response["choices"][0]["message"]}') - output_text = model_response["choices"][0]["message"].get("content", "") - if output_text is not None and len(output_text) > 0: - completion_tokens = len( - encoding.encode(model_response["choices"][0]["message"].get("content", "")) - ) ##[TODO] use the llama2 tokenizer here - else: - completion_tokens = 0 + prompt = prompt_factory(model=model, messages=messages) + data = { + "inputs": prompt, + "parameters": optional_params, + "stream": True if "stream" in optional_params and optional_params["stream"] == True else False, + } + input_text = prompt + else: + # Non TGI and Conversational llms + # We need this branch, it removes 'details' and 'return_full_text' from params + if model in custom_prompt_dict: + # check if the model has a registered custom prompt + model_prompt_details = custom_prompt_dict[model] + prompt = custom_prompt( + role_dict=model_prompt_details.get("roles", {}), + initial_prompt_value=model_prompt_details.get("initial_prompt_value", ""), + final_prompt_value=model_prompt_details.get("final_prompt_value", ""), + bos_token=model_prompt_details.get("bos_token", ""), + eos_token=model_prompt_details.get("eos_token", ""), + messages=messages, + ) + else: + prompt = prompt_factory(model=model, messages=messages) + inference_params = copy.deepcopy(optional_params) + inference_params.pop("details") + inference_params.pop("return_full_text") + data = { + "inputs": prompt, + "parameters": inference_params, + "stream": True if "stream" in optional_params and optional_params["stream"] == True else False, + } + input_text = prompt + ## LOGGING + logging_obj.pre_call( + input=input_text, + api_key=api_key, + additional_args={"complete_input_dict": data, "task": task, "headers": headers}, + ) + ## COMPLETION CALL + if "stream" in optional_params and optional_params["stream"] == True: + response = requests.post( + completion_url, + headers=headers, + data=json.dumps(data), + stream=optional_params["stream"] + ) + return response.iter_lines() + else: + response = requests.post( + completion_url, + headers=headers, + data=json.dumps(data) + ) - model_response["created"] = time.time() - model_response["model"] = model - model_response["usage"] = { - "prompt_tokens": prompt_tokens, - "completion_tokens": completion_tokens, - 
"total_tokens": prompt_tokens + completion_tokens, - } - return model_response + ## Some servers might return streaming responses even though stream was not set to true. (e.g. Baseten) + is_streamed = False + if response.__dict__['headers'].get("Content-Type", "") == "text/event-stream": + is_streamed = True + + # iterate over the complete streamed response, and return the final answer + if is_streamed: + streamed_response = CustomStreamWrapper(completion_stream=response.iter_lines(), model=model, custom_llm_provider="huggingface", logging_obj=logging_obj) + content = "" + for chunk in streamed_response: + content += chunk["choices"][0]["delta"]["content"] + completion_response: List[Dict[str, Any]] = [{"generated_text": content}] + ## LOGGING + logging_obj.post_call( + input=input_text, + api_key=api_key, + original_response=completion_response, + additional_args={"complete_input_dict": data, "task": task}, + ) + else: + ## LOGGING + logging_obj.post_call( + input=input_text, + api_key=api_key, + original_response=response.text, + additional_args={"complete_input_dict": data, "task": task}, + ) + ## RESPONSE OBJECT + try: + completion_response = response.json() + except: + import traceback + raise HuggingfaceError( + message=f"Original Response received: {response.text}; Stacktrace: {traceback.format_exc()}", status_code=response.status_code + ) + print_verbose(f"response: {completion_response}") + if isinstance(completion_response, dict) and "error" in completion_response: + print_verbose(f"completion error: {completion_response['error']}") + print_verbose(f"response.status_code: {response.status_code}") + raise HuggingfaceError( + message=completion_response["error"], + status_code=response.status_code, + ) + else: + if task == "conversational": + if len(completion_response["generated_text"]) > 0: # type: ignore + model_response["choices"][0]["message"][ + "content" + ] = completion_response["generated_text"] # type: ignore + elif task == "text-generation-inference": + if len(completion_response[0]["generated_text"]) > 0: + model_response["choices"][0]["message"][ + "content" + ] = completion_response[0]["generated_text"] + ## GETTING LOGPROBS + FINISH REASON + if "details" in completion_response[0] and "tokens" in completion_response[0]["details"]: + model_response.choices[0].finish_reason = completion_response[0]["details"]["finish_reason"] + sum_logprob = 0 + for token in completion_response[0]["details"]["tokens"]: + sum_logprob += token["logprob"] + model_response["choices"][0]["message"]._logprob = sum_logprob + if "best_of" in optional_params and optional_params["best_of"] > 1: + if "details" in completion_response[0] and "best_of_sequences" in completion_response[0]["details"]: + choices_list = [] + for idx, item in enumerate(completion_response[0]["details"]["best_of_sequences"]): + sum_logprob = 0 + for token in item["tokens"]: + sum_logprob += token["logprob"] + if len(item["generated_text"]) > 0: + message_obj = Message(content=item["generated_text"], logprobs=sum_logprob) + else: + message_obj = Message(content=None) + choice_obj = Choices(finish_reason=item["finish_reason"], index=idx+1, message=message_obj) + choices_list.append(choice_obj) + model_response["choices"].extend(choices_list) + else: + if len(completion_response[0]["generated_text"]) > 0: + model_response["choices"][0]["message"][ + "content" + ] = completion_response[0]["generated_text"] + ## CALCULATING USAGE + prompt_tokens = 0 + try: + prompt_tokens = len( + encoding.encode(input_text) + ) ##[TODO] use 
the llama2 tokenizer here + except: + # this should remain non blocking we should not block a response returning if calculating usage fails + pass + print_verbose(f'output: {model_response["choices"][0]["message"]}') + output_text = model_response["choices"][0]["message"].get("content", "") + if output_text is not None and len(output_text) > 0: + completion_tokens = 0 + try: + completion_tokens = len( + encoding.encode(model_response["choices"][0]["message"].get("content", "")) + ) ##[TODO] use the llama2 tokenizer here + except: + # this should remain non blocking we should not block a response returning if calculating usage fails + pass + else: + completion_tokens = 0 + + model_response["created"] = time.time() + model_response["model"] = model + model_response.usage.completion_tokens = completion_tokens + model_response.usage.prompt_tokens = prompt_tokens + model_response.usage.total_tokens = prompt_tokens + completion_tokens + model_response._hidden_params["original_response"] = completion_response + return model_response + except HuggingfaceError as e: + exception_mapping_worked = True + raise e + except Exception as e: + if exception_mapping_worked: + raise e + else: + import traceback + raise HuggingfaceError(status_code=500, message=traceback.format_exc()) def embedding( @@ -376,9 +399,19 @@ def embedding( else: embed_url = f"https://api-inference.huggingface.co/models/{model}" - data = { - "inputs": input - } + if "sentence-transformers" in model: + if len(input) == 0: + raise HuggingfaceError(status_code=400, message="sentence transformers requires 2+ sentences") + data = { + "inputs": { + "source_sentence": input[0], + "sentences": [ "That is a happy dog", "That is a very happy person", "Today is a sunny day" ] + } + } + else: + data = { + "inputs": input # type: ignore + } ## LOGGING logging_obj.pre_call( @@ -403,15 +436,37 @@ def embedding( embeddings = response.json() + if "error" in embeddings: + raise HuggingfaceError(status_code=500, message=embeddings['error']) + output_data = [] - for idx, embedding in enumerate(embeddings): - output_data.append( + if "similarities" in embeddings: + for idx, embedding in embeddings["similarities"]: + output_data.append( { "object": "embedding", "index": idx, - "embedding": embedding[0][0] # flatten list returned from hf + "embedding": embedding # flatten list returned from hf } ) + else: + for idx, embedding in enumerate(embeddings): + if isinstance(embedding, float): + output_data.append( + { + "object": "embedding", + "index": idx, + "embedding": embedding # flatten list returned from hf + } + ) + else: + output_data.append( + { + "object": "embedding", + "index": idx, + "embedding": embedding[0][0] # flatten list returned from hf + } + ) model_response["object"] = "list" model_response["data"] = output_data model_response["model"] = model diff --git a/litellm/llms/maritalk.py b/litellm/llms/maritalk.py new file mode 100644 index 000000000..10f39aa09 --- /dev/null +++ b/litellm/llms/maritalk.py @@ -0,0 +1,161 @@ +import os, types +import json +from enum import Enum +import requests +import time, traceback +from typing import Callable, Optional, List +from litellm.utils import ModelResponse, Choices, Message +import litellm + +class MaritalkError(Exception): + def __init__(self, status_code, message): + self.status_code = status_code + self.message = message + super().__init__( + self.message + ) # Call the base class constructor with the parameters it needs + +class MaritTalkConfig(): + """ + The class `MaritTalkConfig` provides 
configuration for the MaritTalk's API interface. Here are the parameters: + + - `max_tokens` (integer): Maximum number of tokens the model will generate as part of the response. Default is 1. + + - `model` (string): The model used for conversation. Default is 'maritalk'. + + - `do_sample` (boolean): If set to True, the API will generate a response using sampling. Default is True. + + - `temperature` (number): A non-negative float controlling the randomness in generation. Lower temperatures result in less random generations. Default is 0.7. + + - `top_p` (number): Selection threshold for token inclusion based on cumulative probability. Default is 0.95. + + - `repetition_penalty` (number): Penalty for repetition in the generated conversation. Default is 1. + + - `stopping_tokens` (list of string): List of tokens where the conversation can be stopped/stopped. + """ + max_tokens: Optional[int] = None + model: Optional[str] = None + do_sample: Optional[bool] = None + temperature: Optional[float] = None + top_p: Optional[float] = None + repetition_penalty: Optional[float] = None + stopping_tokens: Optional[List[str]] = None + + def __init__(self, + max_tokens: Optional[int]=None, + model: Optional[str] = None, + do_sample: Optional[bool] = None, + temperature: Optional[float] = None, + top_p: Optional[float] = None, + repetition_penalty: Optional[float] = None, + stopping_tokens: Optional[List[str]] = None) -> None: + + locals_ = locals() + for key, value in locals_.items(): + if key != 'self' and value is not None: + setattr(self.__class__, key, value) + + @classmethod + def get_config(cls): + return {k: v for k, v in cls.__dict__.items() + if not k.startswith('__') + and not isinstance(v, (types.FunctionType, types.BuiltinFunctionType, classmethod, staticmethod)) + and v is not None} + +def validate_environment(api_key): + headers = { + "accept": "application/json", + "content-type": "application/json", + } + if api_key: + headers["Authorization"] = f"Key {api_key}" + return headers + +def completion( + model: str, + messages: list, + api_base: str, + model_response: ModelResponse, + print_verbose: Callable, + encoding, + api_key, + logging_obj, + optional_params=None, + litellm_params=None, + logger_fn=None, +): + headers = validate_environment(api_key) + completion_url = api_base + model = model + + ## Load Config + config=litellm.MaritTalkConfig.get_config() + for k, v in config.items(): + if k not in optional_params: # completion(top_k=3) > maritalk_config(top_k=3) <- allows for dynamic variables to be passed in + optional_params[k] = v + + data = { + "messages": messages, + **optional_params, + } + + ## LOGGING + logging_obj.pre_call( + input=messages, + api_key=api_key, + additional_args={"complete_input_dict": data}, + ) + ## COMPLETION CALL + response = requests.post( + completion_url, headers=headers, data=json.dumps(data), stream=optional_params["stream"] if "stream" in optional_params else False + ) + if "stream" in optional_params and optional_params["stream"] == True: + return response.iter_lines() + else: + ## LOGGING + logging_obj.post_call( + input=messages, + api_key=api_key, + original_response=response.text, + additional_args={"complete_input_dict": data}, + ) + print_verbose(f"raw model_response: {response.text}") + ## RESPONSE OBJECT + completion_response = response.json() + if "error" in completion_response: + raise MaritalkError( + message=completion_response["error"], + status_code=response.status_code, + ) + else: + try: + if len(completion_response["answer"]) > 0: + 
model_response["choices"][0]["message"]["content"] = completion_response["answer"] + except Exception as e: + raise MaritalkError(message=response.text, status_code=response.status_code) + + ## CALCULATING USAGE + prompt = "".join(m["content"] for m in messages) + prompt_tokens = len( + encoding.encode(prompt) + ) + completion_tokens = len( + encoding.encode(model_response["choices"][0]["message"].get("content", "")) + ) + + model_response["created"] = time.time() + model_response["model"] = model + model_response.usage.completion_tokens = completion_tokens + model_response.usage.prompt_tokens = prompt_tokens + model_response.usage.total_tokens = prompt_tokens + completion_tokens + return model_response + +def embedding( + model: str, + input: list, + api_key: Optional[str] = None, + logging_obj=None, + model_response=None, + encoding=None, +): + pass \ No newline at end of file diff --git a/litellm/llms/nlp_cloud.py b/litellm/llms/nlp_cloud.py index b12c23ff5..a4647bc08 100644 --- a/litellm/llms/nlp_cloud.py +++ b/litellm/llms/nlp_cloud.py @@ -171,11 +171,9 @@ def completion( model_response["created"] = time.time() model_response["model"] = model - model_response["usage"] = { - "prompt_tokens": prompt_tokens, - "completion_tokens": completion_tokens, - "total_tokens": prompt_tokens + completion_tokens, - } + model_response.usage.completion_tokens = completion_tokens + model_response.usage.prompt_tokens = prompt_tokens + model_response.usage.total_tokens = prompt_tokens + completion_tokens return model_response def embedding(): diff --git a/litellm/llms/ollama.py b/litellm/llms/ollama.py index 3a0530803..add9c8d7f 100644 --- a/litellm/llms/ollama.py +++ b/litellm/llms/ollama.py @@ -147,7 +147,6 @@ def get_ollama_response_stream( yield completion_obj except Exception as e: traceback.print_exc() - print(f"Error decoding JSON: {e}") session.close() if async_generator_imported: @@ -198,5 +197,6 @@ if async_generator_imported: completion_obj["content"] = j["response"] await yield_({"choices": [{"delta": completion_obj}]}) except Exception as e: - print(f"Error decoding JSON: {e}") + import logging + logging.debug(f"Error decoding JSON: {e}") session.close() \ No newline at end of file diff --git a/litellm/llms/oobabooga.py b/litellm/llms/oobabooga.py index e49eba422..74f3957be 100644 --- a/litellm/llms/oobabooga.py +++ b/litellm/llms/oobabooga.py @@ -111,11 +111,9 @@ def completion( model_response["created"] = time.time() model_response["model"] = model - model_response["usage"] = { - "prompt_tokens": prompt_tokens, - "completion_tokens": completion_tokens, - "total_tokens": prompt_tokens + completion_tokens, - } + model_response.usage.completion_tokens = completion_tokens + model_response.usage.prompt_tokens = prompt_tokens + model_response.usage.total_tokens = prompt_tokens + completion_tokens return model_response def embedding(): diff --git a/litellm/llms/palm.py b/litellm/llms/palm.py index b4160b63b..79a913649 100644 --- a/litellm/llms/palm.py +++ b/litellm/llms/palm.py @@ -157,11 +157,9 @@ def completion( model_response["created"] = time.time() model_response["model"] = "palm/" + model - model_response["usage"] = { - "prompt_tokens": prompt_tokens, - "completion_tokens": completion_tokens, - "total_tokens": prompt_tokens + completion_tokens, - } + model_response.usage.completion_tokens = completion_tokens + model_response.usage.prompt_tokens = prompt_tokens + model_response.usage.total_tokens = prompt_tokens + completion_tokens return model_response def embedding(): diff --git 
a/litellm/llms/petals.py b/litellm/llms/petals.py index a3127eade..5834129c1 100644 --- a/litellm/llms/petals.py +++ b/litellm/llms/petals.py @@ -176,11 +176,9 @@ def completion( model_response["created"] = time.time() model_response["model"] = model - model_response["usage"] = { - "prompt_tokens": prompt_tokens, - "completion_tokens": completion_tokens, - "total_tokens": prompt_tokens + completion_tokens, - } + model_response.usage.completion_tokens = completion_tokens + model_response.usage.prompt_tokens = prompt_tokens + model_response.usage.total_tokens = prompt_tokens + completion_tokens return model_response def embedding(): diff --git a/litellm/llms/replicate.py b/litellm/llms/replicate.py index 0912af5c0..afa56d978 100644 --- a/litellm/llms/replicate.py +++ b/litellm/llms/replicate.py @@ -240,11 +240,9 @@ def completion( prompt_tokens = len(encoding.encode(prompt)) completion_tokens = len(encoding.encode(model_response["choices"][0]["message"].get("content", ""))) model_response["model"] = "replicate/" + model - model_response["usage"] = { - "prompt_tokens": prompt_tokens, - "completion_tokens": completion_tokens, - "total_tokens": prompt_tokens + completion_tokens, - } + model_response.usage.completion_tokens = completion_tokens + model_response.usage.prompt_tokens = prompt_tokens + model_response.usage.total_tokens = prompt_tokens + completion_tokens return model_response diff --git a/litellm/llms/sagemaker.py b/litellm/llms/sagemaker.py index 962a2fc0e..8c999af63 100644 --- a/litellm/llms/sagemaker.py +++ b/litellm/llms/sagemaker.py @@ -169,11 +169,9 @@ def completion( model_response["created"] = time.time() model_response["model"] = model - model_response["usage"] = { - "prompt_tokens": prompt_tokens, - "completion_tokens": completion_tokens, - "total_tokens": prompt_tokens + completion_tokens, - } + model_response.usage.completion_tokens = completion_tokens + model_response.usage.prompt_tokens = prompt_tokens + model_response.usage.total_tokens = prompt_tokens + completion_tokens return model_response def embedding(): diff --git a/litellm/llms/together_ai.py b/litellm/llms/together_ai.py index 9fc48b4f6..daddd472b 100644 --- a/litellm/llms/together_ai.py +++ b/litellm/llms/together_ai.py @@ -99,15 +99,18 @@ def completion( if k not in optional_params: # completion(top_k=3) > togetherai_config(top_k=3) <- allows for dynamic variables to be passed in optional_params[k] = v + print_verbose(f"CUSTOM PROMPT DICT: {custom_prompt_dict}; model: {model}") if model in custom_prompt_dict: # check if the model has a registered custom prompt model_prompt_details = custom_prompt_dict[model] prompt = custom_prompt( - role_dict=model_prompt_details["roles"], - initial_prompt_value=model_prompt_details["initial_prompt_value"], - final_prompt_value=model_prompt_details["final_prompt_value"], - messages=messages - ) + role_dict=model_prompt_details.get("roles", {}), + initial_prompt_value=model_prompt_details.get("initial_prompt_value", ""), + final_prompt_value=model_prompt_details.get("final_prompt_value", ""), + bos_token=model_prompt_details.get("bos_token", ""), + eos_token=model_prompt_details.get("eos_token", ""), + messages=messages, + ) else: prompt = prompt_factory(model=model, messages=messages) @@ -175,11 +178,9 @@ def completion( model_response.choices[0].finish_reason = completion_response["output"]["choices"][0]["finish_reason"] model_response["created"] = time.time() model_response["model"] = model - model_response["usage"] = { - "prompt_tokens": prompt_tokens, - 
"completion_tokens": completion_tokens, - "total_tokens": prompt_tokens + completion_tokens, - } + model_response.usage.completion_tokens = completion_tokens + model_response.usage.prompt_tokens = prompt_tokens + model_response.usage.total_tokens = prompt_tokens + completion_tokens return model_response def embedding(): diff --git a/litellm/llms/vertex_ai.py b/litellm/llms/vertex_ai.py index f124a088e..1e48fbbb0 100644 --- a/litellm/llms/vertex_ai.py +++ b/litellm/llms/vertex_ai.py @@ -109,7 +109,12 @@ def completion( logging_obj.pre_call(input=prompt, api_key=None, additional_args={"complete_input_dict": optional_params}) if "stream" in optional_params and optional_params["stream"] == True: + # NOTE: VertexAI does not accept stream=True as a param and raises an error, + # we handle this by removing 'stream' from optional params and sending the request + # after we get the response we add optional_params["stream"] = True, since main.py needs to know it's a streaming response to then transform it for the OpenAI format + optional_params.pop("stream", None) # vertex ai raises an error when passing stream in optional params model_response = chat.send_message_streaming(prompt, **optional_params) + optional_params["stream"] = True return model_response completion_response = chat.send_message(prompt, **optional_params).text @@ -118,7 +123,9 @@ def completion( logging_obj.pre_call(input=prompt, api_key=None) if "stream" in optional_params and optional_params["stream"] == True: + optional_params.pop("stream", None) # See note above on handling streaming for vertex ai model_response = text_model.predict_streaming(prompt, **optional_params) + optional_params["stream"] = True return model_response completion_response = text_model.predict(prompt, **optional_params).text @@ -144,11 +151,9 @@ def completion( encoding.encode(model_response["choices"][0]["message"].get("content", "")) ) - model_response["usage"] = { - "prompt_tokens": prompt_tokens, - "completion_tokens": completion_tokens, - "total_tokens": prompt_tokens + completion_tokens, - } + model_response.usage.completion_tokens = completion_tokens + model_response.usage.prompt_tokens = prompt_tokens + model_response.usage.total_tokens = prompt_tokens + completion_tokens return model_response diff --git a/litellm/llms/vllm.py b/litellm/llms/vllm.py index 379d54ae8..7519c381f 100644 --- a/litellm/llms/vllm.py +++ b/litellm/llms/vllm.py @@ -90,11 +90,9 @@ def completion( model_response["created"] = time.time() model_response["model"] = model - model_response["usage"] = { - "prompt_tokens": prompt_tokens, - "completion_tokens": completion_tokens, - "total_tokens": prompt_tokens + completion_tokens, - } + model_response.usage.completion_tokens = completion_tokens + model_response.usage.prompt_tokens = prompt_tokens + model_response.usage.total_tokens = prompt_tokens + completion_tokens return model_response def batch_completions( @@ -172,11 +170,9 @@ def batch_completions( model_response["created"] = time.time() model_response["model"] = model - model_response["usage"] = { - "prompt_tokens": prompt_tokens, - "completion_tokens": completion_tokens, - "total_tokens": prompt_tokens + completion_tokens, - } + model_response.usage.completion_tokens = completion_tokens + model_response.usage.prompt_tokens = prompt_tokens + model_response.usage.total_tokens = prompt_tokens + completion_tokens final_outputs.append(model_response) return final_outputs diff --git a/litellm/main.py b/litellm/main.py index 7a7571583..dc2310ae8 100644 --- a/litellm/main.py +++ 
b/litellm/main.py @@ -47,7 +47,8 @@ from .llms import ( petals, oobabooga, palm, - vertex_ai) + vertex_ai, + maritalk) from .llms.openai import OpenAIChatCompletion from .llms.prompt_templates.factory import prompt_factory, custom_prompt, function_call_prompt import tiktoken @@ -59,9 +60,10 @@ from litellm.utils import ( get_secret, CustomStreamWrapper, ModelResponse, + TextCompletionResponse, + TextChoices, EmbeddingResponse, read_config_args, - RateLimitManager, Choices, Message ) @@ -73,21 +75,42 @@ openai_proxy_chat_completions = OpenAIChatCompletion() async def acompletion(*args, **kwargs): """ - Asynchronously perform a completion() using the any LiteLLM model (ex gpt-3.5-turbo, claude-2) - - This function takes the same arguments as the 'completion' function and is used for asynchronous completion requests. + Asynchronously executes a litellm.completion() call for any of litellm supported llms (example gpt-4, gpt-3.5-turbo, claude-2, command-nightly) Parameters: - *args: Positional arguments to pass to the 'litellm.completion' function. - **kwargs: Keyword arguments to pass to the 'litellm.completion' function. + model (str): The name of the language model to use for text completion. see all supported LLMs: https://docs.litellm.ai/docs/providers/ + messages (List): A list of message objects representing the conversation context (default is an empty list). + OPTIONAL PARAMS + functions (List, optional): A list of functions to apply to the conversation messages (default is an empty list). + function_call (str, optional): The name of the function to call within the conversation (default is an empty string). + temperature (float, optional): The temperature parameter for controlling the randomness of the output (default is 1.0). + top_p (float, optional): The top-p parameter for nucleus sampling (default is 1.0). + n (int, optional): The number of completions to generate (default is 1). + stream (bool, optional): If True, return a streaming response (default is False). + stop(string/list, optional): - Up to 4 sequences where the LLM API will stop generating further tokens. + max_tokens (integer, optional): The maximum number of tokens in the generated completion (default is infinity). + presence_penalty (float, optional): It is used to penalize new tokens based on their existence in the text so far. + frequency_penalty: It is used to penalize new tokens based on their frequency in the text so far. + logit_bias (dict, optional): Used to modify the probability of specific tokens appearing in the completion. + user (str, optional): A unique identifier representing your end-user. This can help the LLM provider to monitor and detect abuse. + metadata (dict, optional): Pass in additional metadata to tag your completion calls - eg. prompt version, details, etc. + api_base (str, optional): Base URL for the API (default is None). + api_version (str, optional): API version (default is None). + api_key (str, optional): API key (default is None). + model_list (list, optional): List of api base, version, keys + + LITELLM Specific Params + mock_response (str, optional): If provided, return a mock completion response for testing or debugging purposes (default is None). + force_timeout (int, optional): The maximum execution time in seconds for the completion request (default is 600). 
+ custom_llm_provider (str, optional): Used for Non-OpenAI LLMs, Example usage for bedrock, set model="amazon.titan-tg1-large" and custom_llm_provider="bedrock" Returns: - The completion response, either as a litellm.ModelResponse Object or an async generator if 'stream' is set to True. + ModelResponse: A response object containing the generated completion and associated metadata. - Note: - - This function uses asynchronous programming to perform completions. - - It leverages the 'loop.run_in_executor' method to execute the synchronous 'completion' function. - - If 'stream' is set to True in kwargs, the function returns an async generator. + Notes: + - This function is an asynchronous version of the `completion` function. + - The `completion` function is called using `run_in_executor` to execute synchronously in the event loop. + - If `stream` is True, the function returns an async generator that yields completion lines. """ loop = asyncio.get_event_loop() @@ -212,6 +235,7 @@ def completion( mock_response (str, optional): If provided, return a mock completion response for testing or debugging purposes (default is None). force_timeout (int, optional): The maximum execution time in seconds for the completion request (default is 600). custom_llm_provider (str, optional): Used for Non-OpenAI LLMs, Example usage for bedrock, set model="amazon.titan-tg1-large" and custom_llm_provider="bedrock" + num_retries (int, optional): The number of retries to attempt (default is 0). Returns: ModelResponse: A response object containing the generated completion and associated metadata. @@ -233,13 +257,22 @@ def completion( metadata = kwargs.get('metadata', None) fallbacks = kwargs.get('fallbacks', None) headers = kwargs.get("headers", None) + num_retries = kwargs.get("num_retries", None) + context_window_fallback_dict = kwargs.get("context_window_fallback_dict", None) + ### CUSTOM PROMPT TEMPLATE ### + initial_prompt_value = kwargs.get("intial_prompt_value", None) + roles = kwargs.get("roles", None) + final_prompt_value = kwargs.get("final_prompt_value", None) + bos_token = kwargs.get("bos_token", None) + eos_token = kwargs.get("eos_token", None) ######## end of unpacking kwargs ########### openai_params = ["functions", "function_call", "temperature", "temperature", "top_p", "n", "stream", "stop", "max_tokens", "presence_penalty", "frequency_penalty", "logit_bias", "user", "request_timeout", "api_base", "api_version", "api_key"] - litellm_params = ["metadata", "acompletion", "caching", "return_async", "mock_response", "api_key", "api_version", "api_base", "force_timeout", "logger_fn", "verbose", "custom_llm_provider", "litellm_logging_obj", "litellm_call_id", "use_client", "id", "fallbacks", "azure", "headers", "model_list"] + litellm_params = ["metadata", "acompletion", "caching", "return_async", "mock_response", "api_key", "api_version", "api_base", "force_timeout", "logger_fn", "verbose", "custom_llm_provider", "litellm_logging_obj", "litellm_call_id", "use_client", "id", "fallbacks", "azure", "headers", "model_list", "num_retries", "context_window_fallback_dict", "roles", "final_prompt_value", "bos_token", "eos_token"] default_params = openai_params + litellm_params non_default_params = {k: v for k,v in kwargs.items() if k not in default_params} # model-specific params - pass them straight to the model/provider if mock_response: return mock_completion(model, messages, stream=stream, mock_response=mock_response) + try: logging = litellm_logging_obj fallbacks = ( @@ -256,6 +289,7 @@ def completion( model = 
litellm.model_alias_map[ model ] # update the model to the actual value if an alias has been passed in + model_response = ModelResponse() if kwargs.get('azure', False) == True: # don't remove flag check, to remain backwards compatible for repos like Codium @@ -264,6 +298,19 @@ def completion( model=deployment_id custom_llm_provider="azure" model, custom_llm_provider, dynamic_api_key, api_base = get_llm_provider(model=model, custom_llm_provider=custom_llm_provider, api_base=api_base) + custom_prompt_dict = {} # type: ignore + if initial_prompt_value or roles or final_prompt_value or bos_token or eos_token: + custom_prompt_dict = {model: {}} + if initial_prompt_value: + custom_prompt_dict[model]["initial_prompt_value"] = initial_prompt_value + if roles: + custom_prompt_dict[model]["roles"] = roles + if final_prompt_value: + custom_prompt_dict[model]["final_prompt_value"] = final_prompt_value + if bos_token: + custom_prompt_dict[model]["bos_token"] = bos_token + if eos_token: + custom_prompt_dict[model]["eos_token"] = eos_token model_api_key = get_api_key(llm_provider=custom_llm_provider, dynamic_api_key=api_key) # get the api key from the environment if required for the model if model_api_key and "sk-litellm" in model_api_key: api_base = "https://proxy.litellm.ai" @@ -334,6 +381,11 @@ def completion( get_secret("AZURE_API_KEY") ) + headers = ( + headers or + litellm.headers + ) + ## LOAD CONFIG - if set config=litellm.AzureOpenAIConfig.get_config() for k, v in config.items(): @@ -345,7 +397,7 @@ def completion( input=messages, api_key=api_key, additional_args={ - "headers": litellm.headers, + "headers": headers, "api_version": api_version, "api_base": api_base, }, @@ -354,7 +406,7 @@ def completion( response = openai.ChatCompletion.create( engine=model, messages=messages, - headers=litellm.headers, + headers=headers, api_key=api_key, api_base=api_base, api_version=api_version, @@ -370,7 +422,7 @@ def completion( api_key=api_key, original_response=response, additional_args={ - "headers": litellm.headers, + "headers": headers, "api_version": api_version, "api_base": api_base, }, @@ -403,6 +455,11 @@ def completion( get_secret("OPENAI_API_KEY") ) + headers = ( + headers or + litellm.headers + ) + ## LOAD CONFIG - if set config=litellm.OpenAIConfig.get_config() for k, v in config.items(): @@ -413,7 +470,7 @@ def completion( logging.pre_call( input=messages, api_key=api_key, - additional_args={"headers": litellm.headers, "api_base": api_base}, + additional_args={"headers": headers, "api_base": api_base}, ) ## COMPLETION CALL try: @@ -434,7 +491,7 @@ def completion( response = openai.ChatCompletion.create( model=model, messages=messages, - headers=litellm.headers, # None by default + headers=headers, # None by default api_base=api_base, # thread safe setting base, key, api_version api_key=api_key, api_type="openai", @@ -447,7 +504,7 @@ def completion( input=messages, api_key=api_key, original_response=str(e), - additional_args={"headers": litellm.headers}, + additional_args={"headers": headers}, ) raise e @@ -459,10 +516,11 @@ def completion( input=messages, api_key=api_key, original_response=response, - additional_args={"headers": litellm.headers}, + additional_args={"headers": headers}, ) elif ( - model in litellm.open_ai_text_completion_models + custom_llm_provider == "text-completion-openai" + or model in litellm.open_ai_text_completion_models or "ft:babbage-002" in model or "ft:davinci-002" in model # support for finetuned completion models # NOTE: Do NOT add custom_llm_provider == "openai". 
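
This hunk adds per-call custom prompt templates: `completion()` now reads `roles`, `final_prompt_value`, `bos_token`, and `eos_token` from kwargs and builds a `custom_prompt_dict` entry for the model (the initial value is read from the misspelled key `intial_prompt_value`, so a correctly spelled `initial_prompt_value` kwarg may not be picked up as written). A hedged sketch of a call using these kwargs, with a hypothetical Llama-2-style template and a placeholder TogetherAI model id:

```python
from litellm import completion

# Illustrative template only - role markers and model id are assumptions, not from the patch.
response = completion(
    model="together_ai/togethercomputer/llama-2-70b-chat",   # placeholder model id
    messages=[{"role": "user", "content": "Write one line about the ocean."}],
    roles={
        "system": {"pre_message": "[INST] <<SYS>>\n", "post_message": "\n<</SYS>>\n [/INST]\n"},
        "user": {"pre_message": "[INST] ", "post_message": " [/INST]\n"},
        "assistant": {"pre_message": "", "post_message": "\n"},
    },
    final_prompt_value="\nAssistant:",
    bos_token="<s>",
    eos_token="</s>",
)
print(response["choices"][0]["message"]["content"])
```
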
@@ -491,23 +549,32 @@ def completion( get_secret("OPENAI_API_KEY") ) + headers = ( + headers or + litellm.headers + ) + ## LOAD CONFIG - if set config=litellm.OpenAITextCompletionConfig.get_config() for k, v in config.items(): if k not in optional_params: # completion(top_k=3) > openai_text_config(top_k=3) <- allows for dynamic variables to be passed in optional_params[k] = v - - if litellm.organization: openai.organization = litellm.organization - prompt = " ".join([message["content"] for message in messages]) + + if len(messages)>0 and "content" in messages[0] and type(messages[0]["content"]) == list: + # text-davinci-003 can accept a string or array, if it's an array, assume the array is set in messages[0]['content'] + # https://platform.openai.com/docs/api-reference/completions/create + prompt = messages[0]["content"] + else: + prompt = " ".join([message["content"] for message in messages]) # type: ignore ## LOGGING logging.pre_call( input=prompt, api_key=api_key, additional_args={ "openai_organization": litellm.organization, - "headers": litellm.headers, + "headers": headers, "api_base": api_base, "api_type": openai.api_type, }, @@ -516,7 +583,7 @@ def completion( response = openai.Completion.create( model=model, prompt=prompt, - headers=litellm.headers, + headers=headers, api_key = api_key, api_base=api_base, **optional_params @@ -531,12 +598,13 @@ def completion( original_response=response, additional_args={ "openai_organization": litellm.organization, - "headers": litellm.headers, + "headers": headers, "api_base": openai.api_base, "api_type": openai.api_type, }, ) ## RESPONSE OBJECT + model_response._hidden_params["original_response"] = response # track original response, if users make a litellm.text_completion() request, we can return the original response choices_list = [] for idx, item in enumerate(response["choices"]): if len(item["text"]) > 0: @@ -601,6 +669,10 @@ def completion( or get_secret("ANTHROPIC_API_BASE") or "https://api.anthropic.com/v1/complete" ) + custom_prompt_dict = ( + custom_prompt_dict + or litellm.custom_prompt_dict + ) model_response = anthropic.completion( model=model, messages=messages, @@ -683,7 +755,7 @@ def completion( response = CustomStreamWrapper(model_response, model, custom_llm_provider="aleph_alpha", logging_obj=logging) return response response = model_response - elif model in litellm.cohere_models: + elif custom_llm_provider == "cohere": cohere_key = ( api_key or litellm.cohere_key @@ -718,6 +790,40 @@ def completion( response = CustomStreamWrapper(model_response, model, custom_llm_provider="cohere", logging_obj=logging) return response response = model_response + elif custom_llm_provider == "maritalk": + maritalk_key = ( + api_key + or litellm.maritalk_key + or get_secret("MARITALK_API_KEY") + or litellm.api_key + ) + + api_base = ( + api_base + or litellm.api_base + or get_secret("MARITALK_API_BASE") + or "https://chat.maritaca.ai/api/chat/inference" + ) + + model_response = maritalk.completion( + model=model, + messages=messages, + api_base=api_base, + model_response=model_response, + print_verbose=print_verbose, + optional_params=optional_params, + litellm_params=litellm_params, + logger_fn=logger_fn, + encoding=encoding, + api_key=maritalk_key, + logging_obj=logging + ) + + if "stream" in optional_params and optional_params["stream"] == True: + # don't try to access stream object, + response = CustomStreamWrapper(model_response, model, custom_llm_provider="maritalk", logging_obj=logging) + return response + response = model_response elif 
custom_llm_provider == "deepinfra": # for now this NEEDS to be above Hugging Face otherwise all calls to meta-llama/Llama-2-70b-chat-hf go to hf, we need this to go to deep infra if user sets provider to deep infra # this can be called with the openai python package api_key = ( @@ -734,6 +840,11 @@ def completion( or "https://api.deepinfra.com/v1/openai" ) + headers = ( + headers or + litellm.headers + ) + ## LOGGING logging.pre_call( input=messages, @@ -766,7 +877,7 @@ def completion( input=messages, api_key=api_key, original_response=response, - additional_args={"headers": litellm.headers}, + additional_args={"headers": headers}, ) elif ( custom_llm_provider == "huggingface" @@ -783,6 +894,11 @@ def completion( headers or litellm.headers ) + + custom_prompt_dict = ( + custom_prompt_dict + or litellm.custom_prompt_dict + ) model_response = huggingface_restapi.completion( model=model, messages=messages, @@ -796,7 +912,7 @@ def completion( encoding=encoding, api_key=huggingface_key, logging_obj=logging, - custom_prompt_dict=litellm.custom_prompt_dict + custom_prompt_dict=custom_prompt_dict ) if "stream" in optional_params and optional_params["stream"] == True: # don't try to access stream object, @@ -846,15 +962,24 @@ def completion( openai.api_key = get_secret("OPENROUTER_API_KEY") or get_secret( "OR_API_KEY" ) or litellm.api_key + + headers = ( + headers or + litellm.headers + ) + + data = { + "model": model, + "messages": messages, + **optional_params + } ## LOGGING - logging.pre_call(input=messages, api_key=openai.api_key) + logging.pre_call(input=messages, api_key=openai.api_key, additional_args={"complete_input_dict": data, "headers": headers}) ## COMPLETION CALL - if litellm.headers: + if headers: response = openai.ChatCompletion.create( - model=model, - messages=messages, - headers=litellm.headers, - **optional_params, + headers=headers, + **data, ) else: openrouter_site_url = get_secret("OR_SITE_URL") @@ -866,13 +991,11 @@ def completion( if openrouter_app_name is None: openrouter_app_name = "liteLLM" response = openai.ChatCompletion.create( - model=model, - messages=messages, headers={ "HTTP-Referer": openrouter_site_url, # To identify your site "X-Title": openrouter_app_name, # To identify your app }, - **optional_params, + **data, ) ## LOGGING logging.post_call( @@ -894,6 +1017,11 @@ def completion( or get_secret("TOGETHERAI_API_BASE") or "https://api.together.xyz/inference" ) + + custom_prompt_dict = ( + custom_prompt_dict + or litellm.custom_prompt_dict + ) model_response = together_ai.completion( model=model, @@ -906,7 +1034,8 @@ def completion( logger_fn=logger_fn, encoding=encoding, api_key=together_ai_key, - logging_obj=logging + logging_obj=logging, + custom_prompt_dict=custom_prompt_dict ) if "stream_tokens" in optional_params and optional_params["stream_tokens"] == True: # don't try to access stream object, @@ -1038,6 +1167,10 @@ def completion( response = model_response elif custom_llm_provider == "bedrock": # boto3 reads keys from .env + custom_prompt_dict = ( + custom_prompt_dict + or litellm.custom_prompt_dict + ) model_response = bedrock.completion( model=model, messages=messages, @@ -1087,12 +1220,17 @@ def completion( api_base = ( litellm.api_base or api_base or + get_secret("OLLAMA_API_BASE") or "http://localhost:11434" ) - if model in litellm.custom_prompt_dict: + custom_prompt_dict = ( + custom_prompt_dict + or litellm.custom_prompt_dict + ) + if model in custom_prompt_dict: # check if the model has a registered custom prompt - model_prompt_details = 
litellm.custom_prompt_dict[model] + model_prompt_details = custom_prompt_dict[model] prompt = custom_prompt( role_dict=model_prompt_details["roles"], initial_prompt_value=model_prompt_details["initial_prompt_value"], @@ -1104,7 +1242,7 @@ def completion( ## LOGGING logging.pre_call( - input=prompt, api_key=None, additional_args={"api_base": api_base, "custom_prompt_dict": litellm.custom_prompt_dict} + input=prompt, api_key=None, additional_args={"api_base": api_base, "custom_prompt_dict": custom_prompt_dict} ) if kwargs.get('acompletion', False) == True: if optional_params.get("stream", False) == True: @@ -1128,7 +1266,7 @@ def completion( model_response["choices"][0]["message"]["content"] = response_string model_response["created"] = time.time() model_response["model"] = "ollama/" + model - prompt_tokens = len(encoding.encode(prompt)) + prompt_tokens = len(encoding.encode(prompt)) # type: ignore completion_tokens = len(encoding.encode(response_string)) model_response["usage"] = { "prompt_tokens": prompt_tokens, @@ -1224,7 +1362,7 @@ def completion( ) """ - prompt = " ".join([message["content"] for message in messages]) + prompt = " ".join([message["content"] for message in messages]) # type: ignore resp = requests.post(url, json={ 'model': model, 'params': { @@ -1263,17 +1401,21 @@ def completion( except Exception as e: ## Map to OpenAI Exception raise exception_type( - model=model, custom_llm_provider=custom_llm_provider, original_exception=e, completion_kwargs=args, - ) + model=model, custom_llm_provider=custom_llm_provider, original_exception=e, completion_kwargs=args, + ) def completion_with_retries(*args, **kwargs): + """ + Executes a litellm.completion() with 3 retries + """ try: import tenacity except: raise Exception("tenacity import failed please run `pip install tenacity`") - - retryer = tenacity.Retrying(stop=tenacity.stop_after_attempt(3), reraise=True) + + num_retries = kwargs.pop("num_retries", 3) + retryer = tenacity.Retrying(stop=tenacity.stop_after_attempt(num_retries), reraise=True) return retryer(completion, *args, **kwargs) @@ -1297,6 +1439,30 @@ def batch_completion( request_timeout: Optional[int] = None, # Optional liteLLM function params **kwargs): + """ + Batch litellm.completion function for a given model. + + Args: + model (str): The model to use for generating completions. + messages (List, optional): List of messages to use as input for generating completions. Defaults to []. + functions (List, optional): List of functions to use as input for generating completions. Defaults to []. + function_call (str, optional): The function call to use as input for generating completions. Defaults to "". + temperature (float, optional): The temperature parameter for generating completions. Defaults to None. + top_p (float, optional): The top-p parameter for generating completions. Defaults to None. + n (int, optional): The number of completions to generate. Defaults to None. + stream (bool, optional): Whether to stream completions or not. Defaults to None. + stop (optional): The stop parameter for generating completions. Defaults to None. + max_tokens (float, optional): The maximum number of tokens to generate. Defaults to None. + presence_penalty (float, optional): The presence penalty for generating completions. Defaults to None. + frequency_penalty (float, optional): The frequency penalty for generating completions. Defaults to None. + logit_bias (dict, optional): The logit bias for generating completions. Defaults to {}. 
+ user (str, optional): The user string for generating completions. Defaults to "". + deployment_id (optional): The deployment ID for generating completions. Defaults to None. + request_timeout (int, optional): The request timeout for generating completions. Defaults to None. + + Returns: + list: A list of completion results. + """ args = locals() batch_messages = messages completions = [] @@ -1393,10 +1559,33 @@ def batch_completion_models(*args, **kwargs): kwargs = {**deployment, **nested_kwargs} futures[deployment["model"]] = executor.submit(completion, **kwargs) - done, not_done = concurrent.futures.wait(futures.values(), return_when=concurrent.futures.FIRST_COMPLETED) + while futures: + # wait for the first returned future + print_verbose("\n\n waiting for next result\n\n") + done, _ = concurrent.futures.wait(futures.values(), return_when=concurrent.futures.FIRST_COMPLETED) + print_verbose(f"done list\n{done}") + for future in done: + try: + result = future.result() + return result + except Exception as e: + # if model 1 fails, continue with response from model 2, model3 + print_verbose(f"\n\ngot an exception, ignoring, removing from futures") + print_verbose(futures) + new_futures = {} + for key, value in futures.items(): + if future == value: + print_verbose(f"removing key{key}") + continue + else: + new_futures[key] = value + futures = new_futures + print_verbose(f"new futures{futures}") + continue - for future in done: - return future.result() + + print_verbose("\n\ndone looping through futures\n\n") + print_verbose(futures) return None # If no response is received from any model @@ -1435,19 +1624,25 @@ def batch_completion_models_all_responses(*args, **kwargs): with concurrent.futures.ThreadPoolExecutor(max_workers=len(models)) as executor: for idx, model in enumerate(models): - print(f"{GREEN}LiteLLM: Making request to model: {model}{RESET}") future = executor.submit(completion, *args, model=model, **kwargs) if future.result() is not None: responses.append(future.result()) - print(f"{GREEN}LiteLLM: Model {model} returned response{RESET}") - else: - print(f"{RED}LiteLLM: Model {model } did not return a response{RESET}") return responses ### EMBEDDING ENDPOINTS #################### async def aembedding(*args, **kwargs): + """ + Asynchronously calls the `embedding` function with the given arguments and keyword arguments. + + Parameters: + - `args` (tuple): Positional arguments to be passed to the `embedding` function. + - `kwargs` (dict): Keyword arguments to be passed to the `embedding` function. + + Returns: + - `response` (Any): The response returned by the `embedding` function. + """ loop = asyncio.get_event_loop() # Use a partial function to pass your keyword arguments @@ -1481,6 +1676,7 @@ def embedding( api_type: Optional[str] = None, caching: bool=False, custom_llm_provider=None, + **kwargs ): """ Embedding function that calls an API to generate embeddings for the given input. 
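A minimal usage sketch for the `completion_with_retries` helper updated above, where the retry count is now configurable via the new `num_retries` kwarg (default 3). The model name and prompt are placeholders, `tenacity` must be installed (as the helper itself notes), and this assumes the function is exported at the package level alongside `completion`:

```python
# Hedged example: model and prompt are placeholders; requires `pip install tenacity`
# and a valid OPENAI_API_KEY in the environment.
from litellm import completion_with_retries

messages = [{"role": "user", "content": "Hey, how's it going?"}]

# retry the completion up to 5 times (instead of the default 3) before
# re-raising the last exception (tenacity.Retrying(..., reraise=True))
response = completion_with_retries(
    model="gpt-3.5-turbo",
    messages=messages,
    num_retries=5,
)
print(response["choices"][0]["message"]["content"])
```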
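A short sketch of the async embedding wrapper documented above. The model name is a placeholder, an `OPENAI_API_KEY` is assumed to be set, and any extra keyword arguments are simply forwarded to `embedding()` through the new `**kwargs` parameter:

```python
# Hedged example: assumes OPENAI_API_KEY is set; the model name is a placeholder.
import asyncio
from litellm import aembedding

async def main():
    # aembedding() runs the synchronous embedding() call in an executor
    response = await aembedding(
        model="text-embedding-ada-002",
        input=["good morning from litellm"],
    )
    # EmbeddingResponse supports dict-style access, as used in the tests below
    print(len(response["data"][0]["embedding"]))

asyncio.run(main())
```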
@@ -1610,6 +1806,7 @@ def embedding( response = cohere.embedding( model=model, input=input, + optional_params=kwargs, encoding=encoding, api_key=cohere_key, logging_obj=logging, @@ -1632,6 +1829,15 @@ def embedding( logging_obj=logging, model_response= EmbeddingResponse() ) + elif custom_llm_provider == "bedrock": + response = bedrock.embedding( + model=model, + input=input, + encoding=encoding, + logging_obj=logging, + optional_params=kwargs, + model_response= EmbeddingResponse() + ) else: args = locals() raise ValueError(f"No valid embedding model args passed in - {args}") @@ -1653,32 +1859,87 @@ def embedding( ###### Text Completion ################ def text_completion(*args, **kwargs): + global print_verbose + import copy """ This maps to the Openai.Completion.create format, which has a different I/O (accepts prompt, returning ["choices"]["text"]. """ - if "prompt" in kwargs: + if "engine" in kwargs: + kwargs["model"] = kwargs["engine"] + kwargs.pop("engine") + + # input validation + if "prompt" not in kwargs: + raise ValueError("please pass prompt into the `text_completion` endpoint - `text_completion(model, prompt='hello world')`") + + text_completion_response = TextCompletionResponse() + model = kwargs["model"] + prompt = kwargs["prompt"] + # get custom_llm_provider + _, custom_llm_provider, dynamic_api_key, api_base = get_llm_provider(model=model) + + if custom_llm_provider == "text-completion-openai": + # text-davinci-003 and openai text completion models messages = [{"role": "system", "content": kwargs["prompt"]}] kwargs["messages"] = messages kwargs.pop("prompt") response = completion(*args, **kwargs) # assume the response is the openai response object - formatted_response_obj = { - "id": response["id"], - "object": "text_completion", - "created": response["created"], - "model": response["model"], - "choices": [ - { - "text": response["choices"][0]["message"]["content"], - "index": response["choices"][0]["index"], - "logprobs": None, - "finish_reason": response["choices"][0]["finish_reason"] - } - ], - "usage": response["usage"] - } - return formatted_response_obj + # return raw response from openai + return response._hidden_params.get("original_response", None) + + elif custom_llm_provider == "huggingface": + # if echo == True, for TGI llms we need to set top_n_tokens to 3 + if kwargs.get("echo", False) == True: + # for tgi llms + if "top_n_tokens" not in kwargs: + kwargs["top_n_tokens"] = 3 + + # processing prompt - users can pass raw tokens to OpenAI Completion() + if type(prompt) == list: + tokenizer = tiktoken.encoding_for_model("text-davinci-003") + ## if it's a 2d list - each element in the list is a text_completion() request + if len(prompt) > 0 and type(prompt[0]) == list: + responses = [None for x in prompt] # init responses + for i, request in enumerate(prompt): + decoded_prompt = tokenizer.decode(request) + new_kwargs = copy.deepcopy(kwargs) + new_kwargs["prompt"] = decoded_prompt + response = text_completion(**new_kwargs) + responses[i] = response["choices"][0] + + text_completion_response["id"] = response["id"] + text_completion_response["object"] = "text_completion" + text_completion_response["created"] = response["created"] + text_completion_response["model"] = response["model"] + text_completion_response["choices"] = responses + text_completion_response["usage"] = response["usage"] + + return text_completion_response else: - raise ValueError("please pass prompt into the `text_completion` endpoint - `text_completion(model, prompt='hello world')`") + messages = 
[{"role": "system", "content": kwargs["prompt"]}] + kwargs["messages"] = messages + kwargs.pop("prompt") + response = completion(*args, **kwargs) # assume the response is the openai response object + + transformed_logprobs = None + # only supported for TGI models + try: + raw_response = response._hidden_params.get("original_response", None) + transformed_logprobs = litellm.utils.transform_logprobs(raw_response) + except Exception as e: + print_verbose(f"LiteLLM non blocking exception: {e}") + text_completion_response["id"] = response["id"] + text_completion_response["object"] = "text_completion" + text_completion_response["created"] = response["created"] + text_completion_response["model"] = response["model"] + text_choices = TextChoices() + text_choices["text"] = response["choices"][0]["message"]["content"] + text_choices["index"] = response["choices"][0]["index"] + text_choices["logprobs"] = transformed_logprobs + text_choices["finish_reason"] = response["choices"][0]["finish_reason"] + text_completion_response["choices"] = [text_choices] + text_completion_response["usage"] = response["usage"] + return text_completion_response ##### Moderation ####################### def moderation(input: str, api_key: Optional[str]=None): @@ -1700,7 +1961,7 @@ def moderation(input: str, api_key: Optional[str]=None): ## Set verbose to true -> ```litellm.set_verbose = True``` def print_verbose(print_statement): if litellm.set_verbose: - print(f"LiteLLM: {print_statement}") + print(print_statement) # noqa def config_completion(**kwargs): if litellm.config_path != None: @@ -1736,15 +1997,16 @@ def stream_chunk_builder(chunks: list): "finish_reason": finish_reason, } ], - # "usage": { - # "prompt_tokens": 0, # Modify as needed - # "completion_tokens": 0, # Modify as needed - # "total_tokens": 0 # Modify as needed - # } + "usage": { + "prompt_tokens": 0, # Modify as needed + "completion_tokens": 0, # Modify as needed + "total_tokens": 0 # Modify as needed + } } # Extract the "content" strings from the nested dictionaries within "choices" content_list = [] + combined_content = "" if "function_call" in chunks[0]["choices"][0]["delta"]: argument_list = [] @@ -1787,6 +2049,5 @@ def stream_chunk_builder(chunks: list): # # Update usage information if needed - # response["usage"]["completion_tokens"] = token - + response["usage"]["completion_tokens"] = litellm.utils.token_counter(model=model, text=combined_content) return response diff --git a/litellm/proxy/config.yaml b/litellm/proxy/config.yaml new file mode 100644 index 000000000..48c2e7594 --- /dev/null +++ b/litellm/proxy/config.yaml @@ -0,0 +1,9 @@ +model_list: + - model_name: zephyr-alpha + litellm_params: # params for litellm.completion() - https://docs.litellm.ai/docs/completion/input#input---request-body + model: huggingface/HuggingFaceH4/zephyr-7b-alpha + api_base: http://0.0.0.0:8001 + - model_name: zephyr-beta + litellm_params: + model: huggingface/HuggingFaceH4/zephyr-7b-beta + api_base: https:// \ No newline at end of file diff --git a/litellm/proxy/llm.py b/litellm/proxy/llm.py deleted file mode 100644 index 7e467c4d2..000000000 --- a/litellm/proxy/llm.py +++ /dev/null @@ -1,152 +0,0 @@ -from typing import Dict, Optional -from collections import defaultdict -import threading -import os, subprocess, traceback, json -from fastapi import HTTPException -from fastapi.responses import StreamingResponse - -import backoff -import openai.error - -import litellm -from litellm.utils import trim_messages -from litellm.exceptions import ServiceUnavailableError, 
InvalidRequestError - -cost_dict: Dict[str, Dict[str, float]] = defaultdict(dict) -cost_dict_lock = threading.Lock() - -debug = False -##### HELPER FUNCTIONS ##### -def print_verbose(print_statement): - global debug - if debug: - print(print_statement) - -# for streaming -def data_generator(response): - print_verbose("inside generator") - for chunk in response: - print_verbose(f"returned chunk: {chunk}") - yield f"data: {json.dumps(chunk)}\n\n" - -def run_ollama_serve(): - command = ['ollama', 'serve'] - - with open(os.devnull, 'w') as devnull: - process = subprocess.Popen(command, stdout=devnull, stderr=devnull) - -##### ERROR HANDLING ##### -class RetryConstantError(Exception): - pass - - -class RetryExpoError(Exception): - pass - - -class UnknownLLMError(Exception): - pass - - -def handle_llm_exception(e: Exception, user_api_base: Optional[str]=None): - print(f"\033[1;31mLiteLLM.Exception: {str(e)}\033[0m") - if isinstance(e, ServiceUnavailableError) and e.llm_provider == "ollama": # type: ignore - run_ollama_serve() - if isinstance(e, InvalidRequestError) and e.llm_provider == "ollama": # type: ignore - completion_call_details = {} - completion_call_details["model"] = e.model # type: ignore - if user_api_base: - completion_call_details["api_base"] = user_api_base - else: - completion_call_details["api_base"] = None - print(f"\033[1;31mLiteLLM.Exception: Invalid API Call. Call details: Model: \033[1;37m{e.model}\033[1;31m; LLM Provider: \033[1;37m{e.llm_provider}\033[1;31m; Custom API Base - \033[1;37m{completion_call_details['api_base']}\033[1;31m\033[0m") # type: ignore - if completion_call_details["api_base"] == "http://localhost:11434": - print() - print("Trying to call ollama? Try `litellm --model ollama/llama2 --api_base http://localhost:11434`") - print() - if isinstance( - e, - ( - openai.error.APIError, - openai.error.TryAgain, - openai.error.Timeout, - openai.error.ServiceUnavailableError, - ), - ): - raise RetryConstantError from e - elif isinstance(e, openai.error.RateLimitError): - raise RetryExpoError from e - elif isinstance( - e, - ( - openai.error.APIConnectionError, - openai.error.InvalidRequestError, - openai.error.AuthenticationError, - openai.error.PermissionError, - openai.error.InvalidAPIType, - openai.error.SignatureVerificationError, - ), - ): - raise e - else: - raise UnknownLLMError from e - - -@backoff.on_exception( - wait_gen=backoff.constant, - exception=RetryConstantError, - max_tries=3, - interval=3, -) -@backoff.on_exception( - wait_gen=backoff.expo, - exception=RetryExpoError, - jitter=backoff.full_jitter, - max_value=100, - factor=1.5, -) - -def litellm_completion(data: Dict, - type: str, - user_model: Optional[str], - user_temperature: Optional[str], - user_max_tokens: Optional[int], - user_request_timeout: Optional[int], - user_api_base: Optional[str], - user_headers: Optional[dict], - user_debug: bool, - model_router: Optional[litellm.Router]): - try: - global debug - debug = user_debug - if user_model: - data["model"] = user_model - # override with user settings - if user_temperature: - data["temperature"] = user_temperature - if user_request_timeout: - data["request_timeout"] = user_request_timeout - if user_max_tokens: - data["max_tokens"] = user_max_tokens - if user_api_base: - data["api_base"] = user_api_base - if user_headers: - data["headers"] = user_headers - if type == "completion": - if model_router and data["model"] in model_router.get_model_names(): - model_router.text_completion(**data) - else: - response = 
litellm.text_completion(**data) - elif type == "chat_completion": - if model_router and data["model"] in model_router.get_model_names(): - model_router.completion(**data) - else: - response = litellm.completion(**data) - if 'stream' in data and data['stream'] == True: # use generate_responses to stream responses - return StreamingResponse(data_generator(response), media_type='text/event-stream') - print_verbose(f"response: {response}") - return response - except Exception as e: - print(e) - handle_llm_exception(e=e, user_api_base=user_api_base) - return {"message": "An error occurred"}, 500 diff --git a/litellm/proxy/proxy_cli.py b/litellm/proxy/proxy_cli.py index 2e2359b13..59303a9fc 100644 --- a/litellm/proxy/proxy_cli.py +++ b/litellm/proxy/proxy_cli.py @@ -5,8 +5,9 @@ import random, appdirs from datetime import datetime from dotenv import load_dotenv import operator +sys.path.append(os.getcwd()) -config_filename = "litellm.secrets.toml" +config_filename = "litellm.secrets" # Using appdirs to determine user-specific config path config_dir = appdirs.user_config_dir("litellm") user_config_path = os.getenv("LITELLM_CONFIG_PATH", os.path.join(config_dir, config_filename)) @@ -22,39 +23,6 @@ def run_ollama_serve(): with open(os.devnull, 'w') as devnull: process = subprocess.Popen(command, stdout=devnull, stderr=devnull) -def open_config(file_path=None): - # Create the .env file if it doesn't exist - if file_path: - # Ensure the user-specific directory exists - os.makedirs(config_dir, exist_ok=True) - # Copying the file using shutil.copy - try: - shutil.copy(file_path, user_config_path) - with open(file_path) as f: - print(f"Source file: {file_path}") - print(f.read()) - - with open(user_config_path) as f: - print(f"Dest file: {user_config_path}") - print(f.read()) - print("\033[1;32mDone successfully\033[0m") - except Exception as e: - print(f"Failed to copy {file_path}: {e}") - else: - if os.path.exists(user_config_path): - if os.path.getsize(user_config_path) == 0: - print(f"{user_config_path} exists but is empty") - print(f"To create a config (save keys, modify model prompt), copy the template located here: https://docs.litellm.ai/docs/proxy_server") - else: - with open(user_config_path) as f: - print(f"Saved Config file: {user_config_path}") - print(f.read()) - else: - print(f"{user_config_path} hasn't been created yet.") - print(f"To create a config (save keys, modify model prompt), copy the template located here: https://docs.litellm.ai/docs/proxy_server") - print(f"LiteLLM: config location - {user_config_path}") - - def clone_subfolder(repo_url, subfolder, destination): # Clone the full repo repo_name = repo_url.split('/')[-1] @@ -85,6 +53,7 @@ def is_port_in_use(port): @click.command() @click.option('--host', default='0.0.0.0', help='Host for the server to listen on.') @click.option('--port', default=8000, help='Port to bind the server to.') +@click.option('--num_workers', default=1, help='Number of uvicorn workers to spin up') @click.option('--api_base', default=None, help='API base URL.') @click.option('--api_version', default="2023-07-01-preview", help='For azure - pass in the api version.') @click.option('--model', '-m', default=None, help='The model name to pass to litellm expects') @@ -99,7 +68,7 @@ def is_port_in_use(port): @click.option('--drop_params', is_flag=True, help='Drop any unmapped params') @click.option('--create_proxy', is_flag=True, help='Creates a local OpenAI-compatible server template') @click.option('--add_function_to_prompt', is_flag=True, help='If function 
passed but unsupported, pass it as prompt') -@click.option('--config', '-c', is_flag=True, help='Configure Litellm') +@click.option('--config', '-c', help='Configure Litellm') @click.option('--file', '-f', help='Path to config file') @click.option('--max_budget', default=None, type=float, help='Set max budget for API calls - works for hosted models like OpenAI, TogetherAI, Anthropic, etc.`') @click.option('--telemetry', default=True, type=bool, help='Helps us know if people are using this feature. Turn this off by doing `--telemetry False`') @@ -107,17 +76,17 @@ def is_port_in_use(port): @click.option('--test', flag_value=True, help='proxy chat completions url to make a test request to') @click.option('--local', is_flag=True, default=False, help='for local debugging') @click.option('--cost', is_flag=True, default=False, help='for viewing cost logs') -def run_server(host, port, api_base, api_version, model, alias, add_key, headers, save, debug, temperature, max_tokens, request_timeout, drop_params, create_proxy, add_function_to_prompt, config, file, max_budget, telemetry, logs, test, local, cost): +def run_server(host, port, api_base, api_version, model, alias, add_key, headers, save, debug, temperature, max_tokens, request_timeout, drop_params, create_proxy, add_function_to_prompt, config, file, max_budget, telemetry, logs, test, local, cost, num_workers): global feature_telemetry args = locals() if local: - from proxy_server import app, initialize, print_cost_logs, usage_telemetry, add_keys_to_config + from proxy_server import app, save_worker_config, print_cost_logs, usage_telemetry, add_keys_to_config debug = True else: try: - from .proxy_server import app, initialize, print_cost_logs, usage_telemetry, add_keys_to_config + from .proxy_server import app, save_worker_config, print_cost_logs, usage_telemetry, add_keys_to_config except ImportError as e: - from proxy_server import app, initialize, print_cost_logs, usage_telemetry, add_keys_to_config + from proxy_server import app, save_worker_config, print_cost_logs, usage_telemetry, add_keys_to_config feature_telemetry = usage_telemetry if create_proxy == True: repo_url = 'https://github.com/BerriAI/litellm' @@ -126,12 +95,6 @@ def run_server(host, port, api_base, api_version, model, alias, add_key, headers clone_subfolder(repo_url, subfolder, destination) return - if config: - if file: - open_config(file_path=file) - else: - open_config() - return if logs is not None: if logs == 0: # default to 1 logs = 1 @@ -176,10 +139,13 @@ def run_server(host, port, api_base, api_version, model, alias, add_key, headers openai.api_key = "temp-key" print(openai.api_base) + response = openai.Completion.create(model="gpt-3.5-turbo", prompt='this is a test request, write a short poem') + print(response) + response = openai.ChatCompletion.create(model="gpt-3.5-turbo", messages = [ { "role": "user", - "content": "this is a test request, acknowledge that you got it" + "content": "this is a test request, write a short poem" } ]) click.echo(f'LiteLLM: response from proxy {response}') @@ -188,7 +154,7 @@ def run_server(host, port, api_base, api_version, model, alias, add_key, headers response = openai.ChatCompletion.create(model="gpt-3.5-turbo", messages = [ { "role": "user", - "content": "this is a test request, acknowledge that you got it" + "content": "this is a test request, write a short poem" } ], stream=True, @@ -199,7 +165,7 @@ def run_server(host, port, api_base, api_version, model, alias, add_key, headers else: if headers: headers = json.loads(headers) 
- initialize(model=model, alias=alias, api_base=api_base, api_version=api_version, debug=debug, temperature=temperature, max_tokens=max_tokens, request_timeout=request_timeout, max_budget=max_budget, telemetry=telemetry, drop_params=drop_params, add_function_to_prompt=add_function_to_prompt, headers=headers, save=save) + save_worker_config(model=model, alias=alias, api_base=api_base, api_version=api_version, debug=debug, temperature=temperature, max_tokens=max_tokens, request_timeout=request_timeout, max_budget=max_budget, telemetry=telemetry, drop_params=drop_params, add_function_to_prompt=add_function_to_prompt, headers=headers, save=save, config=config) try: import uvicorn except: @@ -210,7 +176,8 @@ def run_server(host, port, api_base, api_version, model, alias, add_key, headers if port == 8000 and is_port_in_use(port): port = random.randint(1024, 49152) - uvicorn.run(app, host=host, port=port) + print(os.listdir(os.getcwd())) + uvicorn.run("litellm:app", host=host, port=port, workers=num_workers) if __name__ == "__main__": diff --git a/litellm/proxy/proxy_server.py b/litellm/proxy/proxy_server.py index 854e17bed..462eed034 100644 --- a/litellm/proxy/proxy_server.py +++ b/litellm/proxy/proxy_server.py @@ -1,7 +1,7 @@ import sys, os, platform, time, copy -import threading -import shutil, random, traceback - +import threading, ast +import shutil, random, traceback, requests +from typing import Optional messages: list = [] sys.path.insert( 0, os.path.abspath("../..") @@ -14,6 +14,7 @@ try: import appdirs import tomli_w import backoff + import yaml except ImportError: import subprocess import sys @@ -30,6 +31,7 @@ except ImportError: "appdirs", "tomli-w", "backoff", + "pyyaml" ] ) import uvicorn @@ -38,11 +40,6 @@ except ImportError: import appdirs import tomli_w -try: - from .llm import litellm_completion -except ImportError as e: - from llm import litellm_completion # type: ignore - import random list_of_messages = [ @@ -90,6 +87,7 @@ print("\033[1;34mDocs: https://docs.litellm.ai/docs/proxy_server\033[0m") print() import litellm +litellm.suppress_debug_info = True from fastapi import FastAPI, Request from fastapi.routing import APIRouter from fastapi.encoders import jsonable_encoder @@ -120,30 +118,27 @@ user_telemetry = True user_config = None user_headers = None local_logging = True # writes logs to a local api_log.json file for debugging -model_router = litellm.Router() config_filename = "litellm.secrets.toml" config_dir = os.getcwd() config_dir = appdirs.user_config_dir("litellm") user_config_path = os.getenv( "LITELLM_CONFIG_PATH", os.path.join(config_dir, config_filename) ) +#### GLOBAL VARIABLES #### +llm_router: Optional[litellm.Router] = None +llm_model_list: Optional[list] = None +server_settings: dict = {} log_file = "api_log.json" - +worker_config = None #### HELPER FUNCTIONS #### def print_verbose(print_statement): global user_debug + print(f"user debug value: {user_debug}") if user_debug: print(print_statement) -def find_avatar_url(role): - role = role.replace(" ", "%20") - avatar_filename = f"avatars/{role}.png" - avatar_url = f"/static/{avatar_filename}" - return avatar_url - - def usage_telemetry( feature: str, ): # helps us know if people are using this feature. 
Set `litellm --telemetry False` to your cli call to turn this off @@ -205,105 +200,147 @@ def save_params_to_config(data: dict): tomli_w.dump(config, f) -def load_config(): - try: - global user_config, user_api_base, user_max_tokens, user_temperature, user_model, local_logging - # As the .env file is typically much simpler in structure, we use load_dotenv here directly - with open(user_config_path, "rb") as f: - user_config = tomllib.load(f) - - ## load keys - if "keys" in user_config: - for key in user_config["keys"]: - os.environ[key] = user_config["keys"][ - key - ] # litellm can read keys from the environment - ## settings - if "general" in user_config: - litellm.add_function_to_prompt = user_config["general"].get( - "add_function_to_prompt", True - ) # by default add function to prompt if unsupported by provider - litellm.drop_params = user_config["general"].get( - "drop_params", True - ) # by default drop params if unsupported by provider - litellm.model_fallbacks = user_config["general"].get( - "fallbacks", None - ) # fallback models in case initial completion call fails - default_model = user_config["general"].get( - "default_model", None - ) # route all requests to this model. - - local_logging = user_config["general"].get("local_logging", True) - - if user_model is None: # `litellm --model `` > default_model. - user_model = default_model - - ## load model config - to set this run `litellm --config` - model_config = None - if "model" in user_config: - if user_model in user_config["model"]: - model_config = user_config["model"][user_model] - model_list = [] - for model in user_config["model"]: - if "model_list" in user_config["model"][model]: - model_list.extend(user_config["model"][model]["model_list"]) - if len(model_list) > 0: - model_router.set_model_list(model_list=model_list) - - print_verbose(f"user_config: {user_config}") - print_verbose(f"model_config: {model_config}") - print_verbose(f"user_model: {user_model}") - if model_config is None: - return - - user_max_tokens = model_config.get("max_tokens", None) - user_temperature = model_config.get("temperature", None) - user_api_base = model_config.get("api_base", None) - - ## custom prompt template - if "prompt_template" in model_config: - model_prompt_template = model_config["prompt_template"] - if ( - len(model_prompt_template.keys()) > 0 - ): # if user has initialized this at all - litellm.register_prompt_template( - model=user_model, - initial_prompt_value=model_prompt_template.get( - "MODEL_PRE_PROMPT", "" - ), - roles={ - "system": { - "pre_message": model_prompt_template.get( - "MODEL_SYSTEM_MESSAGE_START_TOKEN", "" - ), - "post_message": model_prompt_template.get( - "MODEL_SYSTEM_MESSAGE_END_TOKEN", "" - ), - }, - "user": { - "pre_message": model_prompt_template.get( - "MODEL_USER_MESSAGE_START_TOKEN", "" - ), - "post_message": model_prompt_template.get( - "MODEL_USER_MESSAGE_END_TOKEN", "" - ), - }, - "assistant": { - "pre_message": model_prompt_template.get( - "MODEL_ASSISTANT_MESSAGE_START_TOKEN", "" - ), - "post_message": model_prompt_template.get( - "MODEL_ASSISTANT_MESSAGE_END_TOKEN", "" - ), - }, - }, - final_prompt_value=model_prompt_template.get( - "MODEL_POST_PROMPT", "" - ), - ) +def load_router_config(router: Optional[litellm.Router], config_file_path: str): + config = {} + server_settings = {} + try: + if os.path.exists(config_file_path): + with open(config_file_path, 'r') as file: + config = yaml.safe_load(file) + else: + pass except: pass + ## SERVER SETTINGS (e.g. 
default completion model = 'ollama/mistral') + _server_settings = config.get("server_settings", None) + if _server_settings: + server_settings = _server_settings + + ## LITELLM MODULE SETTINGS (e.g. litellm.drop_params=True,..) + litellm_settings = config.get('litellm_settings', None) + if litellm_settings: + for key, value in litellm_settings.items(): + setattr(litellm, key, value) + + ## MODEL LIST + model_list = config.get('model_list', None) + if model_list: + router = litellm.Router(model_list=model_list) + + ## ENVIRONMENT VARIABLES + environment_variables = config.get('environment_variables', None) + if environment_variables: + for key, value in environment_variables.items(): + os.environ[key] = value + + return router, model_list, server_settings + +def load_config(): + #### DEPRECATED #### + try: + global user_config, user_api_base, user_max_tokens, user_temperature, user_model, local_logging, llm_model_list, llm_router, server_settings + + # Get the file extension + file_extension = os.path.splitext(user_config_path)[1] + if file_extension.lower() == ".toml": + # As the .env file is typically much simpler in structure, we use load_dotenv here directly + with open(user_config_path, "rb") as f: + user_config = tomllib.load(f) + + ## load keys + if "keys" in user_config: + for key in user_config["keys"]: + os.environ[key] = user_config["keys"][ + key + ] # litellm can read keys from the environment + ## settings + if "general" in user_config: + litellm.add_function_to_prompt = user_config["general"].get( + "add_function_to_prompt", True + ) # by default add function to prompt if unsupported by provider + litellm.drop_params = user_config["general"].get( + "drop_params", True + ) # by default drop params if unsupported by provider + litellm.model_fallbacks = user_config["general"].get( + "fallbacks", None + ) # fallback models in case initial completion call fails + default_model = user_config["general"].get( + "default_model", None + ) # route all requests to this model. + + local_logging = user_config["general"].get("local_logging", True) + + if user_model is None: # `litellm --model `` > default_model. 
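A rough sketch of what `load_router_config` does once the YAML is parsed, with entries modeled on the sample `litellm/proxy/config.yaml` added in this PR; every key and value here is illustrative:

```python
# Hedged example: mirrors load_router_config() above with a hand-built config dict;
# model names, api_base values and the env var are placeholders.
import os
import litellm

config = {
    "litellm_settings": {"drop_params": True},  # applied via setattr on litellm
    "environment_variables": {"HUGGINGFACE_API_KEY": "hf-placeholder"},
    "model_list": [
        {
            "model_name": "zephyr-alpha",
            "litellm_params": {
                "model": "huggingface/HuggingFaceH4/zephyr-7b-alpha",
                "api_base": "http://0.0.0.0:8001",
            },
        }
    ],
}

# litellm module-level settings
for key, value in config.get("litellm_settings", {}).items():
    setattr(litellm, key, value)

# environment variables for downstream provider calls
for key, value in config.get("environment_variables", {}).items():
    os.environ[key] = value

# the model list becomes a litellm.Router the proxy routes through
router = litellm.Router(model_list=config["model_list"])
print(router.get_model_names())  # ['zephyr-alpha']
```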
+ user_model = default_model + + ## load model config - to set this run `litellm --config` + model_config = None + if "model" in user_config: + if user_model in user_config["model"]: + model_config = user_config["model"][user_model] + model_list = [] + for model in user_config["model"]: + if "model_list" in user_config["model"][model]: + model_list.extend(user_config["model"][model]["model_list"]) + + print_verbose(f"user_config: {user_config}") + print_verbose(f"model_config: {model_config}") + print_verbose(f"user_model: {user_model}") + if model_config is None: + return + + user_max_tokens = model_config.get("max_tokens", None) + user_temperature = model_config.get("temperature", None) + user_api_base = model_config.get("api_base", None) + + ## custom prompt template + if "prompt_template" in model_config: + model_prompt_template = model_config["prompt_template"] + if ( + len(model_prompt_template.keys()) > 0 + ): # if user has initialized this at all + litellm.register_prompt_template( + model=user_model, + initial_prompt_value=model_prompt_template.get( + "MODEL_PRE_PROMPT", "" + ), + roles={ + "system": { + "pre_message": model_prompt_template.get( + "MODEL_SYSTEM_MESSAGE_START_TOKEN", "" + ), + "post_message": model_prompt_template.get( + "MODEL_SYSTEM_MESSAGE_END_TOKEN", "" + ), + }, + "user": { + "pre_message": model_prompt_template.get( + "MODEL_USER_MESSAGE_START_TOKEN", "" + ), + "post_message": model_prompt_template.get( + "MODEL_USER_MESSAGE_END_TOKEN", "" + ), + }, + "assistant": { + "pre_message": model_prompt_template.get( + "MODEL_ASSISTANT_MESSAGE_START_TOKEN", "" + ), + "post_message": model_prompt_template.get( + "MODEL_ASSISTANT_MESSAGE_END_TOKEN", "" + ), + }, + }, + final_prompt_value=model_prompt_template.get( + "MODEL_POST_PROMPT", "" + ), + ) + except: + pass + +def save_worker_config(**data): + import json + os.environ["WORKER_CONFIG"] = json.dumps(data) def initialize( model, @@ -320,12 +357,14 @@ def initialize( add_function_to_prompt, headers, save, + config ): - global user_model, user_api_base, user_debug, user_max_tokens, user_request_timeout, user_temperature, user_telemetry, user_headers + global user_model, user_api_base, user_debug, user_max_tokens, user_request_timeout, user_temperature, user_telemetry, user_headers, llm_model_list, llm_router, server_settings user_model = model user_debug = debug - load_config() dynamic_config = {"general": {}, user_model: {}} + if config: + llm_router, llm_model_list, server_settings = load_router_config(router=llm_router, config_file_path=config) if headers: # model-specific param user_headers = headers dynamic_config[user_model]["headers"] = headers @@ -470,57 +509,139 @@ litellm.input_callback = [logger] litellm.success_callback = [logger] litellm.failure_callback = [logger] +# for streaming +def data_generator(response): + print_verbose("inside generator") + for chunk in response: + print_verbose(f"returned chunk: {chunk}") + yield f"data: {json.dumps(chunk)}\n\n" + + +def litellm_completion(*args, **kwargs): + global user_temperature, user_request_timeout, user_max_tokens, user_api_base + call_type = kwargs.pop("call_type") + # override with user settings + if user_temperature: + kwargs["temperature"] = user_temperature + if user_request_timeout: + kwargs["request_timeout"] = user_request_timeout + if user_max_tokens: + kwargs["max_tokens"] = user_max_tokens + if user_api_base: + kwargs["api_base"] = user_api_base + ## CHECK CONFIG ## + if llm_model_list and kwargs["model"] in [m["model_name"] for m in 
llm_model_list]: + for m in llm_model_list: + if kwargs["model"] == m["model_name"]: + for key, value in m["litellm_params"].items(): + kwargs[key] = value + break + print(f"litellm set verbose pre-call: {litellm.set_verbose}") + if call_type == "chat_completion": + response = litellm.completion(*args, **kwargs) + elif call_type == "text_completion": + response = litellm.text_completion(*args, **kwargs) + if 'stream' in kwargs and kwargs['stream'] == True: # use generate_responses to stream responses + return StreamingResponse(data_generator(response), media_type='text/event-stream') + return response + + +@app.on_event("startup") +def startup_event(): + import json + worker_config = json.loads(os.getenv("WORKER_CONFIG")) + initialize(**worker_config) + print(f"\033[32mWorker Initialized\033[0m\n") #### API ENDPOINTS #### -@router.post("/v1/models") +@router.get("/v1/models") @router.get("/models") # if project requires model list def model_list(): - if user_model != None: - return dict( - data=[ - { - "id": user_model, - "object": "model", - "created": 1677610602, - "owned_by": "openai", - } - ], - object="list", - ) - else: + global llm_model_list, server_settings + all_models = [] + if server_settings.get("infer_model_from_keys", False): all_models = litellm.utils.get_valid_models() - return dict( - data=[ - { - "id": model, - "object": "model", - "created": 1677610602, - "owned_by": "openai", - } - for model in all_models - ], - object="list", - ) - + if llm_model_list: + all_models += llm_model_list + if user_model is not None: + all_models += user_model + ### CHECK OLLAMA MODELS ### + try: + response = requests.get("http://0.0.0.0:11434/api/tags") + models = response.json()["models"] + ollama_models = [m["name"].replace(":latest", "") for m in models] + all_models.extend(ollama_models) + except Exception as e: + traceback.print_exc() + return dict( + data=[ + { + "id": model, + "object": "model", + "created": 1677610602, + "owned_by": "openai", + } + for model in all_models + ], + object="list", + ) @router.post("/v1/completions") @router.post("/completions") -async def completion(request: Request): - data = await request.json() - return litellm_completion(data=data, type="completion", user_model=user_model, user_temperature=user_temperature, - user_max_tokens=user_max_tokens, user_api_base=user_api_base, user_headers=user_headers, - user_debug=user_debug, model_router=model_router, user_request_timeout=user_request_timeout) - +@router.post("/engines/{model:path}/completions") +async def completion(request: Request, model: Optional[str] = None): + try: + body = await request.body() + body_str = body.decode() + try: + data = ast.literal_eval(body_str) + except: + data = json.loads(body_str) + data["model"] = ( + server_settings.get("completion_model", None) # server default + or user_model # model name passed via cli args + or model # for azure deployments + or data["model"] # default passed in http request + ) + if user_model: + data["model"] = user_model + data["call_type"] = "text_completion" + return litellm_completion( + **data + ) + except Exception as e: + error_traceback = traceback.format_exc() + error_msg = f"{str(e)}\n\n{error_traceback}" + return {"error": error_msg} + @router.post("/v1/chat/completions") @router.post("/chat/completions") -async def chat_completion(request: Request): - data = await request.json() - print_verbose(f"data passed in: {data}") - return litellm_completion(data, type="chat_completion", user_model=user_model, - user_temperature=user_temperature, 
user_max_tokens=user_max_tokens, - user_api_base=user_api_base, user_headers=user_headers, user_debug=user_debug, model_router=model_router, user_request_timeout=user_request_timeout) - +@router.post("/openai/deployments/{model:path}/chat/completions") # azure compatible endpoint +async def chat_completion(request: Request, model: Optional[str] = None): + global server_settings + try: + body = await request.body() + body_str = body.decode() + try: + data = ast.literal_eval(body_str) + except: + data = json.loads(body_str) + data["model"] = ( + server_settings.get("completion_model", None) # server default + or user_model # model name passed via cli args + or model # for azure deployments + or data["model"] # default passed in http request + ) + data["call_type"] = "chat_completion" + return litellm_completion( + **data + ) + except Exception as e: + print(f"\033[1;31mAn error occurred: {e}\n\n Debug this by setting `--debug`, e.g. `litellm --model gpt-3.5-turbo --debug`") + error_traceback = traceback.format_exc() + error_msg = f"{str(e)}\n\n{error_traceback}" + return {"error": error_msg} def print_cost_logs(): with open("costs.json", "r") as f: diff --git a/litellm/router.py b/litellm/router.py index e8eb12b24..14b89e5b8 100644 --- a/litellm/router.py +++ b/litellm/router.py @@ -1,16 +1,17 @@ -from typing import Union, List, Dict, Optional from datetime import datetime +from typing import Dict, List, Optional, Union + import litellm -class Router: +class Router: """ Example usage: from litellm import Router model_list = [{ - "model_name": "gpt-3.5-turbo", # openai model name - "litellm_params": { # params for litellm completion/embedding call - "model": "azure/", + "model_name": "gpt-3.5-turbo", # openai model name + "litellm_params": { # params for litellm completion/embedding call + "model": "azure/", "api_key": , "api_version": , "api_base": @@ -23,16 +24,17 @@ class Router: """ model_names: List = [] cache_responses: bool = False - def __init__(self, - model_list: Optional[list]=None, + default_cache_time_seconds: int = 1 * 60 * 60 # 1 hour + + def __init__(self, + model_list: Optional[list] = None, redis_host: Optional[str] = None, redis_port: Optional[int] = None, - redis_password: Optional[str] = None, + redis_password: Optional[str] = None, cache_responses: bool = False) -> None: if model_list: - self.model_list = model_list - self.model_names = [m["model_name"] for m in model_list] - if redis_host is not None and redis_port is not None and redis_password is not None: + self.set_model_list(model_list) + if redis_host is not None and redis_port is not None and redis_password is not None: cache_config = { 'type': 'redis', 'host': redis_host, @@ -45,61 +47,55 @@ class Router: } self.cache = litellm.Cache(cache_config) # use Redis for tracking load balancing if cache_responses: - litellm.cache = litellm.Cache(**cache_config) # use Redis for caching completion requests + litellm.cache = litellm.Cache(**cache_config) # use Redis for caching completion requests self.cache_responses = cache_responses litellm.success_callback = [self.deployment_callback] - + def completion(self, model: str, messages: List[Dict[str, str]], is_retry: Optional[bool] = False, is_fallback: Optional[bool] = False, - **kwargs): + **kwargs): """ - Example usage: + Example usage: response = router.completion(model="gpt-3.5-turbo", messages=[{"role": "user", "content": "Hey, how's it going?"}] """ # pick the one that is available (lowest TPM/RPM) deployment = self.get_available_deployment(model=model, 
messages=messages) data = deployment["litellm_params"] - data["messages"] = messages - data["caching"] = self.cache_responses - # call via litellm.completion() - return litellm.completion(**{**data, **kwargs}) + # call via litellm.completion() + return litellm.completion(**{**data, "messages": messages, "caching": self.cache_responses, **kwargs}) - async def acompletion(self, - model: str, - messages: List[Dict[str, str]], + async def acompletion(self, + model: str, + messages: List[Dict[str, str]], is_retry: Optional[bool] = False, is_fallback: Optional[bool] = False, **kwargs): # pick the one that is available (lowest TPM/RPM) deployment = self.get_available_deployment(model=model, messages=messages) data = deployment["litellm_params"] - data["messages"] = messages - data["caching"] = self.cache_responses - return await litellm.acompletion(**{**data, **kwargs}) - - def text_completion(self, - model: str, - prompt: str, + return await litellm.acompletion(**{**data, "messages": messages, "caching": self.cache_responses, **kwargs}) + + def text_completion(self, + model: str, + prompt: str, is_retry: Optional[bool] = False, is_fallback: Optional[bool] = False, is_async: Optional[bool] = False, **kwargs): - + messages=[{"role": "user", "content": prompt}] # pick the one that is available (lowest TPM/RPM) deployment = self.get_available_deployment(model=model, messages=messages) data = deployment["litellm_params"] - data["prompt"] = prompt - data["caching"] = self.cache_responses - # call via litellm.completion() - return litellm.text_completion(**{**data, **kwargs}) + # call via litellm.completion() + return litellm.text_completion(**{**data, "prompt": prompt, "caching": self.cache_responses, **kwargs}) - def embedding(self, + def embedding(self, model: str, input: Union[str, List], is_async: Optional[bool] = False, @@ -108,10 +104,8 @@ class Router: deployment = self.get_available_deployment(model=model, input=input) data = deployment["litellm_params"] - data["input"] = input - data["caching"] = self.cache_responses - # call via litellm.embedding() - return litellm.embedding(**{**data, **kwargs}) + # call via litellm.embedding() + return litellm.embedding(**{**data, "input": input, "caching": self.cache_responses, **kwargs}) async def aembedding(self, model: str, @@ -122,14 +116,13 @@ class Router: deployment = self.get_available_deployment(model=model, input=input) data = deployment["litellm_params"] - data["input"] = input - data["caching"] = self.cache_responses - return await litellm.aembedding(**{**data, **kwargs}) + return await litellm.aembedding(**{**data, "input": input, "caching": self.cache_responses, **kwargs}) def set_model_list(self, model_list: list): self.model_list = model_list + self.model_names = [m["model_name"] for m in model_list] - def get_model_names(self): + def get_model_names(self): return self.model_names def deployment_callback( @@ -142,69 +135,63 @@ class Router: Function LiteLLM submits a callback to after a successful completion. Purpose of this is ti update TPM/RPM usage per model """ - model_name = kwargs.get('model', None) # i.e. azure/gpt35turbo + model_name = kwargs.get('model', None) # i.e. gpt35turbo + custom_llm_provider = kwargs.get("litellm_params", {}).get('custom_llm_provider', None) # i.e. 
azure + if custom_llm_provider: + model_name = f"{custom_llm_provider}/{model_name}" total_tokens = completion_response['usage']['total_tokens'] self._set_deployment_usage(model_name, total_tokens) - def get_available_deployment(self, - model: str, - messages: Optional[List[Dict[str, str]]]=None, - input: Optional[Union[str, List]]=None): + def get_available_deployment(self, + model: str, + messages: Optional[List[Dict[str, str]]] = None, + input: Optional[Union[str, List]] = None): """ Returns a deployment with the lowest TPM/RPM usage. """ - # get list of potential deployments - potential_deployments = [] - for item in self.model_list: - if item["model_name"] == model: + # get list of potential deployments + potential_deployments = [] + for item in self.model_list: + if item["model_name"] == model: potential_deployments.append(item) - - # set first model as current model - deployment = potential_deployments[0] + # set first model as current model to calculate token count + deployment = potential_deployments[0] - # get model tpm, rpm limits - tpm = deployment["tpm"] - rpm = deployment["rpm"] - - # get deployment current usage - current_tpm, current_rpm = self._get_deployment_usage(deployment_name=deployment["litellm_params"]["model"]) - - # get encoding - if messages: + # get encoding + token_count = 0 + if messages is not None: token_count = litellm.token_counter(model=deployment["model_name"], messages=messages) - elif input: + elif input is not None: if isinstance(input, List): input_text = "".join(text for text in input) else: input_text = input token_count = litellm.token_counter(model=deployment["model_name"], text=input_text) - - # if at model limit, return lowest used - if current_tpm + token_count > tpm or current_rpm + 1 >= rpm: - # ----------------------- - # Find lowest used model - # ---------------------- - lowest_tpm = float('inf') - deployment = None - # Go through all the models to get tpm, rpm - for item in potential_deployments: - item_tpm, item_rpm = self._get_deployment_usage(deployment_name=item["litellm_params"]["model"]) + # ----------------------- + # Find lowest used model + # ---------------------- + lowest_tpm = float("inf") + deployment = None - if item_tpm == 0: - return item - elif item_tpm + token_count > item["tpm"] or item_rpm + 1 >= item["rpm"]: - continue - elif item_tpm < lowest_tpm: - lowest_tpm = item_tpm - deployment = item - - # if none, raise exception - if deployment is None: - raise ValueError(f"No models available.") + # Go through all the models to get tpm, rpm + for item in potential_deployments: + item_tpm, item_rpm = self._get_deployment_usage(deployment_name=item["litellm_params"]["model"]) - # return model + if item_tpm == 0: + return item + elif item_tpm + token_count > item["tpm"] or item_rpm + 1 >= item["rpm"]: + continue + elif item_tpm < lowest_tpm: + lowest_tpm = item_tpm + deployment = item + + # if none, raise exception + if deployment is None: + raise ValueError("No models available.") + + # return model return deployment def _get_deployment_usage( @@ -221,27 +208,22 @@ class Router: # ------------ # Return usage # ------------ - tpm = self.cache.get_cache(tpm_key) - rpm = self.cache.get_cache(rpm_key) - - if tpm is None: - tpm = 0 - if rpm is None: - rpm = 0 + tpm = self.cache.get_cache(cache_key=tpm_key) or 0 + rpm = self.cache.get_cache(cache_key=rpm_key) or 0 return int(tpm), int(rpm) - - def increment(self, key: str, increment_value: int): - # get value - cached_value = self.cache.get_cache(key) - # update value + + def 
increment(self, key: str, increment_value: int): + # get value + cached_value = self.cache.get_cache(cache_key=key) + # update value try: cached_value = cached_value + increment_value - except: + except: cached_value = increment_value # save updated value - self.cache.add_cache(result=cached_value, cache_key=key) - + self.cache.add_cache(result=cached_value, cache_key=key, ttl=self.default_cache_time_seconds) + def _set_deployment_usage( self, model_name: str, diff --git a/litellm/tests/test_add_function_to_prompt.py b/litellm/tests/test_add_function_to_prompt.py index 33d2ac2a9..a5ec53062 100644 --- a/litellm/tests/test_add_function_to_prompt.py +++ b/litellm/tests/test_add_function_to_prompt.py @@ -37,6 +37,7 @@ def test_function_call_non_openai_model(): response = litellm.completion(model=model, messages=messages, functions=functions) pytest.fail(f'An error occurred') except Exception as e: + print(e) pass test_function_call_non_openai_model() diff --git a/litellm/tests/test_api_key_param.py b/litellm/tests/test_api_key_param.py index 40f7a12b0..9745cf83a 100644 --- a/litellm/tests/test_api_key_param.py +++ b/litellm/tests/test_api_key_param.py @@ -1,53 +1,53 @@ -#### What this tests #### -# This tests the ability to set api key's via the params instead of as environment variables +# #### What this tests #### +# # This tests the ability to set api key's via the params instead of as environment variables -import sys, os -import traceback +# import sys, os +# import traceback -sys.path.insert( - 0, os.path.abspath("../..") -) # Adds the parent directory to the system path -import litellm -from litellm import embedding, completion +# sys.path.insert( +# 0, os.path.abspath("../..") +# ) # Adds the parent directory to the system path +# import litellm +# from litellm import embedding, completion -litellm.set_verbose = False +# litellm.set_verbose = False -def logger_fn(model_call_object: dict): - print(f"model call details: {model_call_object}") +# def logger_fn(model_call_object: dict): +# print(f"model call details: {model_call_object}") -user_message = "Hello, how are you?" -messages = [{"content": user_message, "role": "user"}] +# user_message = "Hello, how are you?" 
+# messages = [{"content": user_message, "role": "user"}] -## Test 1: Setting key dynamically -temp_key = os.environ.get("ANTHROPIC_API_KEY", "") -os.environ["ANTHROPIC_API_KEY"] = "bad-key" -# test on openai completion call -try: - response = completion( - model="claude-instant-1", - messages=messages, - logger_fn=logger_fn, - api_key=temp_key, - ) - print(f"response: {response}") -except: - print(f"error occurred: {traceback.format_exc()}") - pass -os.environ["ANTHROPIC_API_KEY"] = temp_key +# ## Test 1: Setting key dynamically +# temp_key = os.environ.get("ANTHROPIC_API_KEY", "") +# os.environ["ANTHROPIC_API_KEY"] = "bad-key" +# # test on openai completion call +# try: +# response = completion( +# model="claude-instant-1", +# messages=messages, +# logger_fn=logger_fn, +# api_key=temp_key, +# ) +# print(f"response: {response}") +# except: +# print(f"error occurred: {traceback.format_exc()}") +# pass +# os.environ["ANTHROPIC_API_KEY"] = temp_key -## Test 2: Setting key via __init__ params -litellm.anthropic_key = os.environ.get("ANTHROPIC_API_KEY", "") -os.environ.pop("ANTHROPIC_API_KEY") -# test on openai completion call -try: - response = completion( - model="claude-instant-1", messages=messages, logger_fn=logger_fn - ) - print(f"response: {response}") -except: - print(f"error occurred: {traceback.format_exc()}") - pass -os.environ["ANTHROPIC_API_KEY"] = temp_key +# ## Test 2: Setting key via __init__ params +# litellm.anthropic_key = os.environ.get("ANTHROPIC_API_KEY", "") +# os.environ.pop("ANTHROPIC_API_KEY") +# # test on openai completion call +# try: +# response = completion( +# model="claude-instant-1", messages=messages, logger_fn=logger_fn +# ) +# print(f"response: {response}") +# except: +# print(f"error occurred: {traceback.format_exc()}") +# pass +# os.environ["ANTHROPIC_API_KEY"] = temp_key diff --git a/litellm/tests/test_async_fn.py b/litellm/tests/test_async_fn.py index 72e6fc3a7..991c2006e 100644 --- a/litellm/tests/test_async_fn.py +++ b/litellm/tests/test_async_fn.py @@ -9,17 +9,29 @@ import asyncio sys.path.insert( 0, os.path.abspath("../..") ) # Adds the parent directory to the system path -from litellm import acompletion, acreate +import litellm +from litellm import completion, acompletion, acreate + +def test_sync_response(): + litellm.set_verbose = True + user_message = "Hello, how are you?" + messages = [{"content": user_message, "role": "user"}] + try: + response = completion(model="gpt-3.5-turbo", messages=messages, api_key=os.environ["OPENAI_API_KEY"]) + except Exception as e: + pytest.fail(f"An exception occurred: {e}") + def test_async_response(): import asyncio async def test_get_response(): + litellm.set_verbose = True user_message = "Hello, how are you?" messages = [{"content": user_message, "role": "user"}] try: response = await acompletion(model="gpt-3.5-turbo", messages=messages) except Exception as e: - pass + pytest.fail(f"An exception occurred: {e}") response = asyncio.run(test_get_response()) # print(response) @@ -51,7 +63,7 @@ def test_get_response_streaming(): assert len(output) > 0, "Length of output needs to be greater than 0." 
except Exception as e: - pass + pytest.fail(f"An exception occurred: {e}") return response asyncio.run(test_async_call()) diff --git a/litellm/tests/test_batch_completions.py b/litellm/tests/test_batch_completions.py index f5ac6f325..679181684 100644 --- a/litellm/tests/test_batch_completions.py +++ b/litellm/tests/test_batch_completions.py @@ -14,18 +14,20 @@ from litellm import batch_completion, batch_completion_models, completion, batch def test_batch_completions(): messages = [[{"role": "user", "content": "write a short poem"}] for _ in range(3)] - model = "gpt-3.5-turbo" + model = "j2-mid" try: result = batch_completion( model=model, messages=messages, max_tokens=10, - temperature=0.2 + temperature=0.2, + request_timeout=1 ) print(result) print(len(result)) assert(len(result)==3) except Timeout as e: + print(f"IN TIMEOUT") pass except Exception as e: pytest.fail(f"An error occurred: {e}") @@ -38,18 +40,25 @@ def test_batch_completions_models(): messages=[{"role": "user", "content": "Hey, how's it going"}] ) print(result) + except Timeout as e: + pass except Exception as e: pytest.fail(f"An error occurred: {e}") # test_batch_completions_models() def test_batch_completion_models_all_responses(): - responses = batch_completion_models_all_responses( - models=["j2-light", "claude-instant-1.2", "command-nightly"], - messages=[{"role": "user", "content": "write a poem"}], - max_tokens=500 - ) - print(responses) - assert(len(responses) == 3) + try: + responses = batch_completion_models_all_responses( + models=["j2-light", "claude-instant-1.2", "command-nightly"], + messages=[{"role": "user", "content": "write a poem"}], + max_tokens=500 + ) + print(responses) + assert(len(responses) == 3) + except Timeout as e: + pass + except Exception as e: + pytest.fail(f"An error occurred: {e}") # test_batch_completion_models_all_responses() # def test_batch_completions(): diff --git a/litellm/tests/test_caching.py b/litellm/tests/test_caching.py index 78c3c86a7..1ba43195f 100644 --- a/litellm/tests/test_caching.py +++ b/litellm/tests/test_caching.py @@ -1,4 +1,5 @@ import sys, os +import time import traceback from dotenv import load_dotenv @@ -12,7 +13,7 @@ import pytest import litellm from litellm import embedding, completion from litellm.caching import Cache -litellm.set_verbose=True +# litellm.set_verbose=True messages = [{"role": "user", "content": "who is ishaan Github? 
"}] # comment @@ -36,7 +37,7 @@ def test_gpt_cache(): cache_key = last_content_without_prompt_val + data["model"] print("cache_key", cache_key) return cache_key - + cache.init(pre_func=pre_cache_func) cache.set_openai_key() @@ -46,12 +47,12 @@ def test_gpt_cache(): response2 = completion(model="gpt-3.5-turbo", messages=messages) response3 = completion(model="command-nightly", messages=messages) - if response1["choices"] != response2["choices"]: # same models should cache + if response1["choices"] != response2["choices"]: # same models should cache print(f"response1: {response1}") print(f"response2: {response2}") pytest.fail(f"Error occurred:") - if response3["choices"] == response2["choices"]: # different models, don't cache + if response3["choices"] == response2["choices"]: # different models, don't cache # if models are different, it should not return cached response print(f"response2: {response2}") print(f"response3: {response3}") @@ -124,9 +125,9 @@ def test_embedding_caching(): embedding2 = embedding(model="text-embedding-ada-002", input=text_to_embed, caching=True) end_time = time.time() print(f"Embedding 2 response time: {end_time - start_time} seconds") - + litellm.cache = None - assert end_time - start_time <= 0.1 # ensure 2nd response comes in in under 0.1 s + assert end_time - start_time <= 0.1 # ensure 2nd response comes in in under 0.1 s if embedding2['data'][0]['embedding'] != embedding1['data'][0]['embedding']: print(f"embedding1: {embedding1}") print(f"embedding2: {embedding2}") @@ -178,14 +179,14 @@ def test_embedding_caching_azure(): ) end_time = time.time() print(f"Embedding 2 response time: {end_time - start_time} seconds") - + litellm.cache = None - assert end_time - start_time <= 0.1 # ensure 2nd response comes in in under 0.1 s + assert end_time - start_time <= 0.1 # ensure 2nd response comes in in under 0.1 s if embedding2['data'][0]['embedding'] != embedding1['data'][0]['embedding']: print(f"embedding1: {embedding1}") print(f"embedding2: {embedding2}") pytest.fail("Error occurred: Embedding caching failed") - + os.environ['AZURE_API_VERSION'] = api_version os.environ['AZURE_API_BASE'] = api_base os.environ['AZURE_API_KEY'] = api_key @@ -270,30 +271,13 @@ def test_embedding_caching_azure(): def test_redis_cache_completion(): + litellm.set_verbose = True messages = [{"role": "user", "content": "who is ishaan CTO of litellm from litellm 2023"}] litellm.cache = Cache(type="redis", host=os.environ['REDIS_HOST'], port=os.environ['REDIS_PORT'], password=os.environ['REDIS_PASSWORD']) print("test2 for caching") - - # patch this redis test - local_cache = {} - - def set_cache(key, value): - local_cache[key] = value - - def get_cache(key): - if key in local_cache: - return local_cache[key] - - litellm.cache.cache.set_cache = set_cache - litellm.cache.cache.get_cache = get_cache - - response1 = completion(model="gpt-3.5-turbo", messages=messages, caching=True) response2 = completion(model="gpt-3.5-turbo", messages=messages, caching=True) response3 = completion(model="command-nightly", messages=messages, caching=True) - print(f"response1: {response1}") - print(f"response2: {response2}") - print(f"response3: {response3}") litellm.cache = None if response3['choices'][0]['message']['content'] == response2['choices'][0]['message']['content']: # if models are different, it should not return cached response @@ -322,29 +306,29 @@ def test_custom_redis_cache_with_key(): def set_cache(key, value): local_cache[key] = value - + def get_cache(key): if key in local_cache: return 
local_cache[key] - + litellm.cache.cache.set_cache = set_cache litellm.cache.cache.get_cache = get_cache # patch this redis cache get and set call - response1 = completion(model="gpt-3.5-turbo", messages=messages, temperature=1, caching=True) - response2 = completion(model="gpt-3.5-turbo", messages=messages, temperature=1, caching=True) - response3 = completion(model="gpt-3.5-turbo", messages=messages, temperature=1, caching=False) - + response1 = completion(model="gpt-3.5-turbo", messages=messages, temperature=1, caching=True, num_retries=3) + response2 = completion(model="gpt-3.5-turbo", messages=messages, temperature=1, caching=True, num_retries=3) + response3 = completion(model="gpt-3.5-turbo", messages=messages, temperature=1, caching=False, num_retries=3) + print(f"response1: {response1}") print(f"response2: {response2}") print(f"response3: {response3}") if response3['choices'][0]['message']['content'] == response2['choices'][0]['message']['content']: - pytest.fail(f"Error occurred:") + pytest.fail(f"Error occurred:") litellm.cache = None -test_custom_redis_cache_with_key() +# test_custom_redis_cache_with_key() def test_hosted_cache(): litellm.cache = Cache(type="hosted") # use api.litellm.ai for caching @@ -364,3 +348,99 @@ def test_hosted_cache(): # test_hosted_cache() + +def test_redis_cache_with_ttl(): + cache = Cache(type="redis", host=os.environ['REDIS_HOST'], port=os.environ['REDIS_PORT'], password=os.environ['REDIS_PASSWORD']) + sample_model_response_object_str = """{ + "choices": [ + { + "finish_reason": "stop", + "index": 0, + "message": { + "role": "assistant", + "content": "I'm doing well, thank you for asking. I am Claude, an AI assistant created by Anthropic." + } + } + ], + "created": 1691429984.3852863, + "model": "claude-instant-1", + "usage": { + "prompt_tokens": 18, + "completion_tokens": 23, + "total_tokens": 41 + } +}""" + sample_model_response_object = { + "choices": [ + { + "finish_reason": "stop", + "index": 0, + "message": { + "role": "assistant", + "content": "I'm doing well, thank you for asking. I am Claude, an AI assistant created by Anthropic." + } + } + ], + "created": 1691429984.3852863, + "model": "claude-instant-1", + "usage": { + "prompt_tokens": 18, + "completion_tokens": 23, + "total_tokens": 41 + } +} + cache.add_cache(cache_key="test_key", result=sample_model_response_object_str, ttl=1) + cached_value = cache.get_cache(cache_key="test_key") + print(f"cached-value: {cached_value}") + assert cached_value['choices'][0]['message']['content'] == sample_model_response_object['choices'][0]['message']['content'] + time.sleep(2) + assert cache.get_cache(cache_key="test_key") is None + +# test_redis_cache_with_ttl() + +def test_in_memory_cache_with_ttl(): + cache = Cache(type="local") + sample_model_response_object_str = """{ + "choices": [ + { + "finish_reason": "stop", + "index": 0, + "message": { + "role": "assistant", + "content": "I'm doing well, thank you for asking. I am Claude, an AI assistant created by Anthropic." + } + } + ], + "created": 1691429984.3852863, + "model": "claude-instant-1", + "usage": { + "prompt_tokens": 18, + "completion_tokens": 23, + "total_tokens": 41 + } +}""" + sample_model_response_object = { + "choices": [ + { + "finish_reason": "stop", + "index": 0, + "message": { + "role": "assistant", + "content": "I'm doing well, thank you for asking. I am Claude, an AI assistant created by Anthropic." 
+ } + } + ], + "created": 1691429984.3852863, + "model": "claude-instant-1", + "usage": { + "prompt_tokens": 18, + "completion_tokens": 23, + "total_tokens": 41 + } +} + cache.add_cache(cache_key="test_key", result=sample_model_response_object_str, ttl=1) + cached_value = cache.get_cache(cache_key="test_key") + assert cached_value['choices'][0]['message']['content'] == sample_model_response_object['choices'][0]['message']['content'] + time.sleep(2) + assert cache.get_cache(cache_key="test_key") is None +# test_in_memory_cache_with_ttl() \ No newline at end of file diff --git a/litellm/tests/test_completion.py b/litellm/tests/test_completion.py index 728055571..340a3b0ae 100644 --- a/litellm/tests/test_completion.py +++ b/litellm/tests/test_completion.py @@ -9,9 +9,11 @@ sys.path.insert( 0, os.path.abspath("../..") ) # Adds the parent directory to the system path import pytest +from openai.error import Timeout import litellm -from litellm import embedding, completion, text_completion, completion_cost +from litellm import embedding, completion, completion_cost from litellm import RateLimitError +litellm.num_retries = 3 user_message = "Write a short poem about the sky" messages = [{"content": user_message, "role": "user"}] @@ -38,7 +40,7 @@ def test_completion_custom_provider_model_name(): def test_completion_claude(): - litellm.set_verbose = True + litellm.set_verbose = False litellm.AnthropicConfig(max_tokens_to_sample=200, metadata={"user_id": "1224"}) try: # test without max tokens @@ -48,6 +50,11 @@ def test_completion_claude(): # Add any assertions here to check the response print(response) print(response.response_ms) + print(response.usage) + print(response.usage.completion_tokens) + print(response["usage"]["completion_tokens"]) + # print("new cost tracking") + print(response.cost()) except Exception as e: pytest.fail(f"Error occurred: {e}") @@ -96,17 +103,12 @@ def test_completion_with_litellm_call_id(): print(response) if 'litellm_call_id' in response: pytest.fail(f"Error occurred: litellm_call_id in response objects") + print(response.usage) + print(response.usage.completion_tokens) - litellm.use_client = True - response2 = completion( - model="gpt-3.5-turbo", messages=messages) - - if 'litellm_call_id' not in response2: - pytest.fail(f"Error occurred: litellm_call_id not in response object when use_client = True") - # Add any assertions here to check the response - print(response2) except Exception as e: pytest.fail(f"Error occurred: {e}") +# test_completion_with_litellm_call_id() def test_completion_perplexity_api(): try: @@ -220,13 +222,12 @@ def test_get_hf_task_for_model(): # # TGI model # # this is a TGI model https://huggingface.co/glaiveai/glaive-coder-7b # def hf_test_completion_tgi(): -# litellm.huggingface_config(return_full_text=True) # litellm.set_verbose=True # try: # response = litellm.completion( # model="huggingface/mistralai/Mistral-7B-Instruct-v0.1", # messages=[{ "content": "Hello, how are you?","role": "user"}], -# api_base="https://n9ox93a8sv5ihsow.us-east-1.aws.endpoints.huggingface.cloud", +# api_base="https://3kk3h56912qga4-80.proxy.runpod.net", # ) # # Add any assertions here to check the response # print(response) @@ -387,33 +388,13 @@ def test_completion_openai(): pytest.fail(f"Error occurred: {e}") # test_completion_openai() - -def test_completion_openai_prompt(): - try: - response = text_completion( - model="gpt-3.5-turbo", prompt="What's the weather in SF?" 
- ) - response_str = response["choices"][0]["text"] - except Exception as e: - pytest.fail(f"Error occurred: {e}") - - def test_completion_text_openai(): - try: - # litellm.set_verbose=True - response = completion(model="text-davinci-003", messages=messages) - # Add any assertions here to check the response - print(response) - except Exception as e: - pytest.fail(f"Error occurred: {e}") - -def test_completion_gpt_instruct(): try: response = completion(model="gpt-3.5-turbo-instruct", messages=messages) print(response) except Exception as e: pytest.fail(f"Error occurred: {e}") -# test_completion_gpt_instruct() +# test_completion_text_openai() def test_completion_openai_with_optional_params(): try: @@ -426,10 +407,11 @@ def test_completion_openai_with_optional_params(): ) # Add any assertions here to check the response print(response) + except Timeout as e: + pass except Exception as e: pytest.fail(f"Error occurred: {e}") - def test_completion_openai_litellm_key(): try: litellm.api_key = os.environ['OPENAI_API_KEY'] @@ -648,6 +630,38 @@ def test_completion_azure2(): # test_completion_azure2() +def test_completion_azure3(): + # test if we can pass api_base, api_version and api_key in completion() + try: + print("azure gpt-3.5 test\n\n") + litellm.set_verbose=True + litellm.api_base = os.environ["AZURE_API_BASE"] + litellm.api_key = os.environ["AZURE_API_KEY"] + litellm.api_version = os.environ["AZURE_API_VERSION"] + + os.environ["AZURE_API_BASE"] = "" + os.environ["AZURE_API_VERSION"] = "" + os.environ["AZURE_API_KEY"] = "" + + + ## Test azure call + response = completion( + model="azure/chatgpt-v-2", + messages=messages, + max_tokens=10, + ) + + # Add any assertions here to check the response + print(response) + + os.environ["AZURE_API_BASE"] = litellm.api_base + os.environ["AZURE_API_VERSION"] = litellm.api_version + os.environ["AZURE_API_KEY"] = litellm.api_key + + except Exception as e: + pytest.fail(f"Error occurred: {e}") +# test_completion_azure3() + # new azure test for using litellm.
vars, # use the following vars in this test and make an azure_api_call # litellm.api_type = self.azure_api_type @@ -787,106 +801,124 @@ def test_completion_together_ai(): pytest.fail(f"Error occurred: {e}") # test_completion_together_ai() -# def test_customprompt_together_ai(): -# try: -# litellm.register_prompt_template( -# model="OpenAssistant/llama2-70b-oasst-sft-v10", -# roles={"system":"<|im_start|>system", "assistant":"<|im_start|>assistant", "user":"<|im_start|>user"}, # tell LiteLLM how you want to map the openai messages to this model -# pre_message_sep= "\n", -# post_message_sep= "\n" -# ) -# response = completion(model="together_ai/OpenAssistant/llama2-70b-oasst-sft-v10", messages=messages) -# print(response) -# except Exception as e: -# pytest.fail(f"Error occurred: {e}") +def test_customprompt_together_ai(): + try: + litellm.set_verbose = True + response = completion(model="together_ai/OpenAssistant/llama2-70b-oasst-sft-v10", messages=messages, + roles={"system":{"pre_message":"<|im_start|>system\n", "post_message":"<|im_end|>"}, "assistant":{"pre_message":"<|im_start|>assistant\n","post_message":"<|im_end|>"}, "user":{"pre_message":"<|im_start|>user\n","post_message":"<|im_end|>"}}) + print(response) + except Exception as e: + pytest.fail(f"Error occurred: {e}") -# def test_completion_sagemaker(): -# try: -# response = completion( -# model="sagemaker/jumpstart-dft-meta-textgeneration-llama-2-7b", -# messages=messages, -# temperature=0.2, -# max_tokens=80, -# logger_fn=logger_fn -# ) -# # Add any assertions here to check the response -# print(response) -# except Exception as e: -# pytest.fail(f"Error occurred: {e}") +# test_customprompt_together_ai() + +def test_completion_sagemaker(): + try: + response = completion( + model="sagemaker/jumpstart-dft-meta-textgeneration-llama-2-7b", + messages=messages, + temperature=0.2, + max_tokens=80, + logger_fn=logger_fn + ) + # Add any assertions here to check the response + print(response) + except Exception as e: + pytest.fail(f"Error occurred: {e}") # test_completion_sagemaker() -# def test_completion_bedrock_titan(): -# try: -# response = completion( -# model="bedrock/amazon.titan-tg1-large", -# messages=messages, -# temperature=0.2, -# max_tokens=200, -# top_p=0.8, -# logger_fn=logger_fn -# ) -# # Add any assertions here to check the response -# print(response) -# except RateLimitError: -# pass -# except Exception as e: -# pytest.fail(f"Error occurred: {e}") +def test_completion_bedrock_titan(): + try: + response = completion( + model="bedrock/amazon.titan-tg1-large", + messages=messages, + temperature=0.2, + max_tokens=200, + top_p=0.8, + logger_fn=logger_fn + ) + # Add any assertions here to check the response + print(response) + except RateLimitError: + pass + except Exception as e: + pytest.fail(f"Error occurred: {e}") # test_completion_bedrock_titan() -# def test_completion_bedrock_claude(): -# print("calling claude") -# try: -# response = completion( -# model="anthropic.claude-instant-v1", -# messages=messages, -# max_tokens=10, -# temperature=0.1, -# logger_fn=logger_fn -# ) -# # Add any assertions here to check the response -# print(response) -# except RateLimitError: -# pass -# except Exception as e: -# pytest.fail(f"Error occurred: {e}") +def test_completion_bedrock_claude(): + print("calling claude") + try: + response = completion( + model="anthropic.claude-instant-v1", + messages=messages, + max_tokens=10, + temperature=0.1, + logger_fn=logger_fn + ) + # Add any assertions here to check the response + print(response) + 
except RateLimitError: + pass + except Exception as e: + pytest.fail(f"Error occurred: {e}") # test_completion_bedrock_claude() - -# def test_completion_bedrock_claude_completion_auth(): -# print("calling bedrock claude completion params auth") -# import os - -# aws_access_key_id = os.environ["AWS_ACCESS_KEY_ID"] -# aws_secret_access_key = os.environ["AWS_SECRET_ACCESS_KEY"] -# aws_region_name = os.environ["AWS_REGION_NAME"] - -# os.environ["AWS_ACCESS_KEY_ID"] = "" -# os.environ["AWS_SECRET_ACCESS_KEY"] = "" -# os.environ["AWS_REGION_NAME"] = "" +def test_completion_bedrock_cohere(): + print("calling bedrock cohere") + try: + response = completion( + model="bedrock/cohere.command-text-v14", + messages=[{"role": "user", "content": "hi"}], + temperature=0.1, + max_tokens=10, + stream=True + ) + # Add any assertions here to check the response + print(response) + for chunk in response: + print(chunk) + except RateLimitError: + pass + except Exception as e: + pytest.fail(f"Error occurred: {e}") +# test_completion_bedrock_cohere() -# try: -# response = completion( -# model="bedrock/anthropic.claude-instant-v1", -# messages=messages, -# max_tokens=10, -# temperature=0.1, -# logger_fn=logger_fn, -# aws_access_key_id=aws_access_key_id, -# aws_secret_access_key=aws_secret_access_key, -# aws_region_name=aws_region_name, -# ) -# # Add any assertions here to check the response -# print(response) +def test_completion_bedrock_claude_completion_auth(): + print("calling bedrock claude completion params auth") + import os -# os.environ["AWS_ACCESS_KEY_ID"] = aws_access_key_id -# os.environ["AWS_SECRET_ACCESS_KEY"] = aws_secret_access_key -# os.environ["AWS_REGION_NAME"] = aws_region_name -# except RateLimitError: -# pass -# except Exception as e: -# pytest.fail(f"Error occurred: {e}") + aws_access_key_id = os.environ["AWS_ACCESS_KEY_ID"] + aws_secret_access_key = os.environ["AWS_SECRET_ACCESS_KEY"] + aws_region_name = os.environ["AWS_REGION_NAME"] + + os.environ["AWS_ACCESS_KEY_ID"] = "" + os.environ["AWS_SECRET_ACCESS_KEY"] = "" + os.environ["AWS_REGION_NAME"] = "" + + + try: + response = completion( + model="bedrock/anthropic.claude-instant-v1", + messages=messages, + max_tokens=10, + temperature=0.1, + logger_fn=logger_fn, + aws_access_key_id=aws_access_key_id, + aws_secret_access_key=aws_secret_access_key, + aws_region_name=aws_region_name, + ) + # Add any assertions here to check the response + print(response) + + os.environ["AWS_ACCESS_KEY_ID"] = aws_access_key_id + os.environ["AWS_SECRET_ACCESS_KEY"] = aws_secret_access_key + os.environ["AWS_REGION_NAME"] = aws_region_name + except RateLimitError: + pass + except Exception as e: + pytest.fail(f"Error occurred: {e}") # test_completion_bedrock_claude_completion_auth() # def test_completion_bedrock_claude_external_client_auth(): @@ -1026,27 +1058,34 @@ def test_completion_together_ai(): # test_completion_custom_api_base() # def test_vertex_ai(): -# # test_models = litellm.vertex_chat_models + litellm.vertex_code_chat_models + litellm.vertex_text_models + litellm.vertex_code_text_models -# test_models = ["chat-bison"] +# test_models = ["codechat-bison"] + litellm.vertex_chat_models + litellm.vertex_code_chat_models + litellm.vertex_text_models + litellm.vertex_code_text_models +# # test_models = ["chat-bison"] # for model in test_models: # try: +# if model in ["code-gecko@001", "code-gecko@latest"]: +# # our account does not have access to this model +# continue # print("making request", model) -# response = 
completion(model="vertex_ai/codechat-bison-32k", messages=[{'role': 'user', 'content': 'hi'}]) +# response = completion(model=model, messages=[{'role': 'user', 'content': 'hi'}]) # print(response) + +# print(response.usage.completion_tokens) +# print(response['usage']['completion_tokens']) # assert type(response.choices[0].message.content) == str # except Exception as e: # pytest.fail(f"Error occurred: {e}") # test_vertex_ai() # def test_vertex_ai_stream(): -# litellm.vertex_project = "hardy-device-386718" -# litellm.vertex_location = "us-central1" +# litellm.set_verbose=False # test_models = litellm.vertex_chat_models + litellm.vertex_code_chat_models + litellm.vertex_text_models + litellm.vertex_code_text_models # for model in test_models: # try: +# if model in ["code-gecko@001", "code-gecko@latest"]: +# # our account does not have access to this model +# continue # print("making request", model) -# response = completion(model=model, messages=[{"role": "user", "content": "write code for saying hi"}], stream=True) -# print(response) +# response = completion(model=model, messages=[{"role": "user", "content": "write 100 line code code for saying hi"}], stream=True) # for chunk in response: # print(chunk) # # pass @@ -1110,7 +1149,19 @@ def test_completion_anyscale_2(): print(response) except Exception as e: pytest.fail(f"Error occurred: {e}") -test_completion_anyscale_2() + +def test_mistral_anyscale_stream(): + litellm.set_verbose=False + response = completion( + model = 'anyscale/mistralai/Mistral-7B-Instruct-v0.1', + messages = [{ "content": "hello, good morning","role": "user"}], + stream=True, + ) + for chunk in response: + # print(chunk) + print(chunk["choices"][0]["delta"].get("content", ""), end="") +# test_mistral_anyscale_stream() +# test_completion_anyscale_2() # def test_completion_with_fallbacks_multiple_keys(): # print(f"backup key 1: {os.getenv('BACKUP_OPENAI_API_KEY_1')}") # print(f"backup key 2: {os.getenv('BACKUP_OPENAI_API_KEY_2')}") @@ -1247,7 +1298,7 @@ def test_completion_palm(): # litellm.set_verbose = True model_name = "palm/chat-bison" try: - response = completion(model=model_name, messages=messages) + response = completion(model=model_name, messages=messages, stop=["stop"]) # Add any assertions here to check the response print(response) print(response.response_ms) @@ -1255,6 +1306,25 @@ def test_completion_palm(): pytest.fail(f"Error occurred: {e}") # test_completion_palm() +# test palm with streaming +def test_completion_palm_stream(): + # litellm.set_verbose = True + model_name = "palm/chat-bison" + try: + response = completion( + model=model_name, + messages=messages, + stop=["stop"], + stream=True, + max_tokens=20 + ) + # Add any assertions here to check the response + for chunk in response: + print(chunk) + except Exception as e: + pytest.fail(f"Error occurred: {e}") +# test_completion_palm_stream() + # test_completion_deep_infra() # test_completion_ai21() # test config file with completion # @@ -1270,6 +1340,14 @@ def test_completion_palm(): # pytest.fail(f"Error occurred: {e}") +# def test_maritalk(): +# messages = [{"role": "user", "content": "Hey"}] +# try: +# response = completion("maritalk", messages=messages) +# print(f"response: {response}") +# except Exception as e: +# pytest.fail(f"Error occurred: {e}") +# test_maritalk() def test_completion_together_ai_stream(): user_message = "Write 1pg about YC & litellm" diff --git a/litellm/tests/test_completion_with_retries.py b/litellm/tests/test_completion_with_retries.py index bfc077b1d..30ec9ec8c 100644 --- 
a/litellm/tests/test_completion_with_retries.py +++ b/litellm/tests/test_completion_with_retries.py @@ -1,86 +1,53 @@ -# import sys, os -# import traceback -# from dotenv import load_dotenv +import sys, os +import traceback +from dotenv import load_dotenv -# load_dotenv() -# import os +load_dotenv() +import os -# sys.path.insert( -# 0, os.path.abspath("../..") -# ) # Adds the parent directory to the system path -# import pytest -# import litellm -# from litellm import completion_with_retries -# from litellm import ( -# AuthenticationError, -# InvalidRequestError, -# RateLimitError, -# ServiceUnavailableError, -# OpenAIError, -# ) +sys.path.insert( + 0, os.path.abspath("../..") +) # Adds the parent directory to the system path +import pytest +import openai +import litellm +from litellm import completion_with_retries, completion +from litellm import ( + AuthenticationError, + InvalidRequestError, + RateLimitError, + ServiceUnavailableError, + OpenAIError, +) -# user_message = "Hello, whats the weather in San Francisco??" -# messages = [{"content": user_message, "role": "user"}] +user_message = "Hello, whats the weather in San Francisco??" +messages = [{"content": user_message, "role": "user"}] -# def logger_fn(user_model_dict): -# # print(f"user_model_dict: {user_model_dict}") -# pass +def logger_fn(user_model_dict): + # print(f"user_model_dict: {user_model_dict}") + pass -# # normal call -# def test_completion_custom_provider_model_name(): -# try: -# response = completion_with_retries( -# model="together_ai/togethercomputer/llama-2-70b-chat", -# messages=messages, -# logger_fn=logger_fn, -# ) -# # Add any assertions here to check the response -# print(response) -# except Exception as e: -# pytest.fail(f"Error occurred: {e}") +# normal call +def test_completion_custom_provider_model_name(): + try: + response = completion_with_retries( + model="together_ai/togethercomputer/llama-2-70b-chat", + messages=messages, + logger_fn=logger_fn, + ) + # Add any assertions here to check the response + print(response) + except Exception as e: + pytest.fail(f"Error occurred: {e}") +# completion with num retries + impact on exception mapping +def test_completion_with_num_retries(): + try: + response = completion(model="j2-ultra", messages=[{"messages": "vibe", "bad": "message"}], num_retries=2) + except openai.APIError as e: + pass + except Exception as e: + pytest.fail(f"Unmapped exception occurred") -# # bad call -# # def test_completion_custom_provider_model_name(): -# # try: -# # response = completion_with_retries( -# # model="bad-model", -# # messages=messages, -# # logger_fn=logger_fn, -# # ) -# # # Add any assertions here to check the response -# # print(response) -# # except Exception as e: -# # pytest.fail(f"Error occurred: {e}") - -# # impact on exception mapping -# def test_context_window(): -# sample_text = "how does a court case get to the Supreme Court?" 
* 5000 -# messages = [{"content": sample_text, "role": "user"}] -# try: -# model = "chatgpt-test" -# response = completion_with_retries( -# model=model, -# messages=messages, -# custom_llm_provider="azure", -# logger_fn=logger_fn, -# ) -# print(f"response: {response}") -# except InvalidRequestError as e: -# print(f"InvalidRequestError: {e.llm_provider}") -# return -# except OpenAIError as e: -# print(f"OpenAIError: {e.llm_provider}") -# return -# except Exception as e: -# print("Uncaught Error in test_context_window") -# print(f"Error Type: {type(e).__name__}") -# print(f"Uncaught Exception - {e}") -# pytest.fail(f"Error occurred: {e}") -# return - - -# test_context_window() - -# test_completion_custom_provider_model_name() +# test_completion_with_num_retries() \ No newline at end of file diff --git a/litellm/tests/test_custom_logger.py b/litellm/tests/test_custom_logger.py index 2ba378aad..dc79eb3ce 100644 --- a/litellm/tests/test_custom_logger.py +++ b/litellm/tests/test_custom_logger.py @@ -6,81 +6,44 @@ sys.path.insert(0, os.path.abspath('../..')) from litellm import completion, embedding import litellm +from litellm.integrations.custom_logger import CustomLogger -def custom_callback( - kwargs, - completion_response, - start_time, - end_time, -): - print( - "in custom callback func" - ) - print("kwargs", kwargs) - print(completion_response) - print(start_time) - print(end_time) -def send_slack_alert( - kwargs, - completion_response, - start_time, - end_time, -): - print( - "in custom slack callback func" - ) - import requests - import json +class MyCustomHandler(CustomLogger): + def log_pre_api_call(self, model, messages, kwargs): + print(f"Pre-API Call") + + def log_post_api_call(self, kwargs, response_obj, start_time, end_time): + print(f"Post-API Call") + + def log_stream_event(self, kwargs, response_obj, start_time, end_time): + print(f"On Stream") + + def log_success_event(self, kwargs, response_obj, start_time, end_time): + print(f"On Success") - # Define the Slack webhook URL - slack_webhook_url = os.environ['SLACK_WEBHOOK_URL'] # "https://hooks.slack.com/services/<>/<>/<>" - - # Define the text payload, send data available in litellm custom_callbacks - text_payload = f"""LiteLLM Logging: kwargs: {str(kwargs)}\n\n, response: {str(completion_response)}\n\n, start time{str(start_time)} end time: {str(end_time)} - """ - payload = { - "text": text_payload - } - - # Set the headers - headers = { - "Content-type": "application/json" - } - - # Make the POST request - response = requests.post(slack_webhook_url, json=payload, headers=headers) - - # Check the response status - if response.status_code == 200: - print("Message sent successfully to Slack!") - else: - print(f"Failed to send message to Slack. 
Status code: {response.status_code}") - print(response.json()) - -def get_transformed_inputs( - kwargs, -): - params_to_model = kwargs["additional_args"]["complete_input_dict"] - print("params to model", params_to_model) - -litellm.success_callback = [custom_callback, send_slack_alert] -litellm.failure_callback = [send_slack_alert] - - -litellm.set_verbose = True - -litellm.input_callback = [get_transformed_inputs] + def log_failure_event(self, kwargs, response_obj, start_time, end_time): + print(f"On Failure") +customHandler = MyCustomHandler() def test_chat_openai(): try: - response = completion(model="gpt-2", + litellm.callbacks = [customHandler] + response = completion(model="gpt-3.5-turbo", + messages=[{ + "role": "user", + "content": "Hi 👋 - i'm openai" + }], + stream=True) + for chunk in response: + # print(chunk) + continue + response = completion(model="gpt-3.5-turbo", messages=[{ "role": "user", "content": "Hi 👋 - i'm openai" }]) - - print(response) + # print(response) except Exception as e: print(e) @@ -88,3 +51,77 @@ def test_chat_openai(): test_chat_openai() + + + + + +# def custom_callback( +# kwargs, +# completion_response, +# start_time, +# end_time, +# ): +# print( +# "in custom callback func" +# ) +# print("kwargs", kwargs) +# print(completion_response) +# print(start_time) +# print(end_time) +# if "complete_streaming_response" in kwargs: +# print("\n\n complete response\n\n") +# complete_streaming_response = kwargs["complete_streaming_response"] +# print(kwargs["complete_streaming_response"]) +# usage = complete_streaming_response["usage"] +# print("usage", usage) +# def send_slack_alert( +# kwargs, +# completion_response, +# start_time, +# end_time, +# ): +# print( +# "in custom slack callback func" +# ) +# import requests +# import json + +# # Define the Slack webhook URL +# slack_webhook_url = os.environ['SLACK_WEBHOOK_URL'] # "https://hooks.slack.com/services/<>/<>/<>" + +# # Define the text payload, send data available in litellm custom_callbacks +# text_payload = f"""LiteLLM Logging: kwargs: {str(kwargs)}\n\n, response: {str(completion_response)}\n\n, start time{str(start_time)} end time: {str(end_time)} +# """ +# payload = { +# "text": text_payload +# } + +# # Set the headers +# headers = { +# "Content-type": "application/json" +# } + +# # Make the POST request +# response = requests.post(slack_webhook_url, json=payload, headers=headers) + +# # Check the response status +# if response.status_code == 200: +# print("Message sent successfully to Slack!") +# else: +# print(f"Failed to send message to Slack. 
Status code: {response.status_code}") +# print(response.json()) + +# def get_transformed_inputs( +# kwargs, +# ): +# params_to_model = kwargs["additional_args"]["complete_input_dict"] +# print("params to model", params_to_model) + +# litellm.success_callback = [custom_callback, send_slack_alert] +# litellm.failure_callback = [send_slack_alert] + + +# litellm.set_verbose = False + +# # litellm.input_callback = [get_transformed_inputs] diff --git a/litellm/tests/test_embedding.py b/litellm/tests/test_embedding.py index 03d613055..d533b7c42 100644 --- a/litellm/tests/test_embedding.py +++ b/litellm/tests/test_embedding.py @@ -69,6 +69,7 @@ def test_openai_azure_embedding(): def test_cohere_embedding(): try: + # litellm.set_verbose=True response = embedding( model="embed-english-v2.0", input=["good morning from litellm", "this is another item"] ) @@ -78,17 +79,40 @@ def test_cohere_embedding(): # test_cohere_embedding() +def test_cohere_embedding3(): + try: + litellm.set_verbose=True + response = embedding( + model="embed-english-v3.0", + input=["good morning from litellm", "this is another item"], + ) + print(f"response:", response) + except Exception as e: + pytest.fail(f"Error occurred: {e}") + +# test_cohere_embedding3() + +def test_bedrock_embedding(): + try: + response = embedding( + model="amazon.titan-embed-text-v1", input=["good morning from litellm, attempting to embed data"] + ) + print(f"response:", response) + except Exception as e: + pytest.fail(f"Error occurred: {e}") +# test_bedrock_embedding() + # comment out hf tests - since hf endpoints are unstable -# def test_hf_embedding(): -# try: -# # huggingface/microsoft/codebert-base -# # huggingface/facebook/bart-large -# response = embedding( -# model="huggingface/BAAI/bge-large-zh", input=["good morning from litellm", "this is another item"] -# ) -# print(f"response:", response) -# except Exception as e: -# pytest.fail(f"Error occurred: {e}") +def test_hf_embedding(): + try: + # huggingface/microsoft/codebert-base + # huggingface/facebook/bart-large + response = embedding( + model="huggingface/sentence-transformers/all-MiniLM-L6-v2", input=["good morning from litellm", "this is another item"] + ) + print(f"response:", response) + except Exception as e: + pytest.fail(f"Error occurred: {e}") # test_hf_embedding() # test async embeddings diff --git a/litellm/tests/test_exceptions.py b/litellm/tests/test_exceptions.py index 4c37732a1..b5fdd6fc9 100644 --- a/litellm/tests/test_exceptions.py +++ b/litellm/tests/test_exceptions.py @@ -38,13 +38,32 @@ models = ["command-nightly"] # Test 1: Context Window Errors @pytest.mark.parametrize("model", models) def test_context_window(model): + sample_text = "Say error 50 times" * 100000 + messages = [{"content": sample_text, "role": "user"}] + print(f"model: {model}") + try: + completion(model=model, messages=messages) + pytest.fail(f"An exception occurred") + except ContextWindowExceededError: + pass + except RateLimitError: + pass + except Exception as e: + print(f"{e}") + pytest.fail(f"An error occurred - {e}") + +@pytest.mark.parametrize("model", models) +def test_context_window_with_fallbacks(model): + ctx_window_fallback_dict = {"command-nightly": "claude-2"} sample_text = "how does a court case get to the Supreme Court?" 
* 1000 messages = [{"content": sample_text, "role": "user"}] - with pytest.raises(ContextWindowExceededError): - completion(model=model, messages=messages) + completion(model=model, messages=messages, context_window_fallback_dict=ctx_window_fallback_dict) +# for model in litellm.models_by_provider["bedrock"]: +# test_context_window(model=model) # test_context_window(model="command-nightly") +# test_context_window_with_fallbacks(model="command-nightly") # Test 2: InvalidAuth Errors @pytest.mark.parametrize("model", models) def invalid_auth(model): # set the model key to an invalid key, depending on the model @@ -54,6 +73,13 @@ def invalid_auth(model): # set the model key to an invalid key, depending on th if model == "gpt-3.5-turbo": temporary_key = os.environ["OPENAI_API_KEY"] os.environ["OPENAI_API_KEY"] = "bad-key" + elif model == "bedrock/anthropic.claude-v2": + temporary_aws_access_key = os.environ["AWS_ACCESS_KEY_ID"] + os.environ["AWS_ACCESS_KEY_ID"] = "bad-key" + temporary_aws_region_name = os.environ["AWS_REGION_NAME"] + os.environ["AWS_REGION_NAME"] = "bad-key" + temporary_secret_key = os.environ["AWS_SECRET_ACCESS_KEY"] + os.environ["AWS_SECRET_ACCESS_KEY"] = "bad-key" elif model == "chatgpt-test": temporary_key = os.environ["AZURE_API_KEY"] os.environ["AZURE_API_KEY"] = "bad-key" @@ -90,10 +116,10 @@ def invalid_auth(model): # set the model key to an invalid key, depending on th ) print(f"response: {response}") except AuthenticationError as e: - print(f"AuthenticationError Caught Exception - {e.llm_provider}") + print(f"AuthenticationError Caught Exception - {str(e)}") except ( OpenAIError - ): # is at least an openai error -> in case of random model errors - e.g. overloaded server + ) as e: # is at least an openai error -> in case of random model errors - e.g. 
overloaded server print(f"OpenAIError Caught Exception - {e}") except Exception as e: print(type(e)) @@ -124,8 +150,15 @@ def invalid_auth(model): # set the model key to an invalid key, depending on th os.environ["ALEPH_ALPHA_API_KEY"] = temporary_key elif model in litellm.nlp_cloud_models: os.environ["NLP_CLOUD_API_KEY"] = temporary_key + elif "bedrock" in model: + os.environ["AWS_ACCESS_KEY_ID"] = temporary_aws_access_key + os.environ["AWS_REGION_NAME"] = temporary_aws_region_name + os.environ["AWS_SECRET_ACCESS_KEY"] = temporary_secret_key return +for model in litellm.models_by_provider["bedrock"]: + invalid_auth(model=model) + # Test 3: Invalid Request Error @pytest.mark.parametrize("model", models) def test_invalid_request_error(model): diff --git a/litellm/tests/test_get_model_cost_map.py b/litellm/tests/test_get_model_cost_map.py index e006f49f4..8a86cc51f 100644 --- a/litellm/tests/test_get_model_cost_map.py +++ b/litellm/tests/test_get_model_cost_map.py @@ -9,5 +9,15 @@ from litellm import get_max_tokens, model_cost, open_ai_chat_completion_models print(get_max_tokens("gpt-3.5-turbo")) -print(model_cost) -print(open_ai_chat_completion_models) +def test_get_gpt3_tokens(): + max_tokens = get_max_tokens("gpt-3.5-turbo") + results = max_tokens['max_tokens'] + print(results) +# test_get_gpt3_tokens() + +def test_get_palm_tokens(): + # # 🦄🦄🦄🦄🦄🦄🦄🦄 + max_tokens = get_max_tokens("palm/chat-bison") + results = max_tokens['max_tokens'] + print(results) +# test_get_palm_tokens() diff --git a/litellm/tests/test_helicone_integration.py b/litellm/tests/test_helicone_integration.py index 66e375d17..82669d092 100644 --- a/litellm/tests/test_helicone_integration.py +++ b/litellm/tests/test_helicone_integration.py @@ -1,30 +1,30 @@ -#### What this tests #### -# This tests if logging to the helicone integration actually works +# #### What this tests #### +# # This tests if logging to the helicone integration actually works -import sys, os -import traceback -import pytest +# import sys, os +# import traceback +# import pytest -sys.path.insert( - 0, os.path.abspath("../..") -) # Adds the parent directory to the system path -import litellm -from litellm import embedding, completion +# sys.path.insert( +# 0, os.path.abspath("../..") +# ) # Adds the parent directory to the system path +# import litellm +# from litellm import embedding, completion -litellm.success_callback = ["helicone"] +# litellm.success_callback = ["helicone"] -litellm.set_verbose = True +# litellm.set_verbose = True -user_message = "Hello, how are you?" -messages = [{"content": user_message, "role": "user"}] +# user_message = "Hello, how are you?" 
+# messages = [{"content": user_message, "role": "user"}] -# openai call -response = completion( - model="gpt-3.5-turbo", messages=[{"role": "user", "content": "Hi 👋 - i'm openai"}] -) +# # openai call +# response = completion( +# model="gpt-3.5-turbo", messages=[{"role": "user", "content": "Hi 👋 - i'm openai"}] +# ) -# cohere call -response = completion( - model="command-nightly", messages=[{"role": "user", "content": "Hi 👋 - i'm cohere"}] -) +# # cohere call +# response = completion( +# model="command-nightly", messages=[{"role": "user", "content": "Hi 👋 - i'm cohere"}] +# ) diff --git a/litellm/tests/test_litedebugger_integration.py b/litellm/tests/test_litedebugger_integration.py deleted file mode 100644 index 49ea9a7e3..000000000 --- a/litellm/tests/test_litedebugger_integration.py +++ /dev/null @@ -1,112 +0,0 @@ -#### What this tests #### -# This tests if logging to the litedebugger integration actually works - -# Test Scenarios (test across normal completion, streaming) -## 1: Pre-API-Call -## 2: Post-API-Call -## 3: On LiteLLM Call success -## 4: On LiteLLM Call failure - - -import sys, os, io -import traceback, logging -import pytest -import dotenv -dotenv.load_dotenv() - -# Create logger -logger = logging.getLogger(__name__) -logger.setLevel(logging.DEBUG) - -# Create a stream handler -stream_handler = logging.StreamHandler(sys.stdout) -logger.addHandler(stream_handler) - -# Create a function to log information -def logger_fn(message): - logger.info(message) - -sys.path.insert( - 0, os.path.abspath("../..") -) # Adds the parent directory to the system path -import litellm -from litellm import completion -from openai.error import AuthenticationError -litellm.set_verbose = True - -score = 0 -split_per_model = { - "gpt-4": 0, - "claude-instant-1.2": 1 -} - - -user_message = "Hello, how are you?" 
-messages = [{"content": user_message, "role": "user"}] - -# # #Test 1: On completion call - without setting client to true -> ensure litedebugger is not initialized -# try: -# # Redirect stdout -# old_stdout = sys.stdout -# sys.stdout = new_stdout = io.StringIO() - -# response = completion(model="gpt-3.5-turbo", messages=messages) - -# # Restore stdout -# sys.stdout = old_stdout -# output = new_stdout.getvalue().strip() - -# if "LiteLLMDebugger" in output: -# raise Exception("LiteLLM Debugger should not be called!") -# score += 1 -# except Exception as e: -# pytest.fail(f"Error occurred: {e}") - - -# # Test 2: On normal completion call - setting client to true -# litellm.use_client=True -# def test_completion_with_client(): -# try: -# # Redirect stdout -# old_stdout = sys.stdout -# sys.stdout = new_stdout = io.StringIO() -# litellm.token = "1e6795ea-a75e-4231-8110-dcc721dcffc3" # generate one here - https://www.uuidgenerator.net/version4 - -# completion(model="gpt-3.5-turbo", messages=messages) -# completion(model="claude-instant-1", messages=messages) - -# # Restore stdout -# sys.stdout = old_stdout -# output = new_stdout.getvalue().strip() -# print(output) -# if "LiteDebugger: Pre-API Call Logging" not in output: -# raise Exception(f"LiteLLMDebugger: pre-api call not logged!") -# if "LiteDebugger: Post-API Call Logging" not in output: -# raise Exception("LiteLLMDebugger: post-api call not logged!") -# if "LiteDebugger: Success/Failure Call Logging" not in output: -# raise Exception("LiteLLMDebugger: success/failure call not logged!") -# except Exception as e: -# pytest.fail(f"Error occurred: {e}") -# test_completion_with_client() -# # Test 3: On streaming completion call - setting client to true -# try: -# # Redirect stdout -# old_stdout = sys.stdout -# sys.stdout = new_stdout = io.StringIO() - -# response = completion_with_split_tests(models=split_per_model, messages=messages, stream=True, use_client=True, override_client=True, id="6d383c99-488d-481d-aa1b-1f94935cec44") -# for data in response: -# continue -# # Restore stdout -# sys.stdout = old_stdout -# output = new_stdout.getvalue().strip() - -# if "LiteDebugger: Pre-API Call Logging" not in output: -# raise Exception("LiteLLMDebugger: pre-api call not logged!") -# if "LiteDebugger: Post-API Call Logging" not in output: -# raise Exception("LiteLLMDebugger: post-api call not logged!") -# if "LiteDebugger: Success/Failure Call Logging" not in output: -# raise Exception("LiteLLMDebugger: success/failure call not logged!") -# except Exception as e: -# pytest.fail(f"Error occurred: {e}") - diff --git a/litellm/tests/test_promptlayer_integration.py b/litellm/tests/test_promptlayer_integration.py index 6d3bbe105..8c919bb1e 100644 --- a/litellm/tests/test_promptlayer_integration.py +++ b/litellm/tests/test_promptlayer_integration.py @@ -46,12 +46,13 @@ def test_promptlayer_logging_with_metadata(): old_stdout = sys.stdout sys.stdout = new_stdout = io.StringIO() - - response = completion(model="j2-light", + response = completion(model="gpt-3.5-turbo", messages=[{ "role": "user", "content": "Hi 👋 - i'm ai21" }], + temperature=0.2, + max_tokens=20, metadata={"model": "ai21"}) # Restore stdout @@ -65,7 +66,7 @@ def test_promptlayer_logging_with_metadata(): except Exception as e: print(e) -# test_promptlayer_logging_with_metadata() +test_promptlayer_logging_with_metadata() diff --git a/litellm/tests/test_rate_limit_manager.py b/litellm/tests/test_rate_limit_manager.py deleted file mode 100644 index 4eb8b86c9..000000000 --- 
a/litellm/tests/test_rate_limit_manager.py +++ /dev/null @@ -1,81 +0,0 @@ -#### What this tests #### -# This tests calling batch_completions by running 100 messages together - -import sys, os -import traceback -import pytest -sys.path.insert( - 0, os.path.abspath("../..") -) # Adds the parent directory to the system path -from openai.error import Timeout -import litellm -from litellm import batch_completion, batch_completion_models, completion, batch_completion_models_all_responses -# litellm.set_verbose=True - -@pytest.mark.asyncio -async def test_rate_limit_handler(): - import asyncio - ##### USAGE ################ - - from litellm import RateLimitManager - - handler = RateLimitManager( - max_requests_per_minute = 60, - max_tokens_per_minute = 200 - ) - - - async def send_request(): - response = await handler.acompletion( - model="gpt-3.5-turbo", - messages=[{ - "content": "Please provide a summary of the latest scientific discoveries."*10, - "role": "user" - }] - ) - print("got a response", response) - return response - - - tasks = [] - - for _ in range(4): - tasks.append(send_request()) - - responses = await asyncio.gather(*tasks) - - for response in responses: - print(response) - -# import asyncio -# asyncio.run( -# test_rate_limit_handler() -# ) - - -@pytest.mark.asyncio -async def test_rate_limit_handler_batch(): - ##### USAGE ################ - - jobs = [ - {"model": "gpt-3.5-turbo-16k", "messages": [{"content": "Please provide a summary of the latest scientific discoveries.", "role": "user"}]}, - {"model": "gpt-3.5-turbo-16k", "messages": [{"content": "Please provide a summary of the latest scientific discoveries.", "role": "user"}]}, - ] - - from litellm import RateLimitManager - - handler = RateLimitManager( - max_requests_per_minute = 60, - max_tokens_per_minute = 20000 - ) - - try: - handler.batch_completion( - jobs = jobs, - api_key=os.environ['OPENAI_API_KEY'], - ) - except Exception as e: - print(e) - - -test_rate_limit_handler_batch() \ No newline at end of file diff --git a/litellm/tests/test_stream_chunk_builder.py b/litellm/tests/test_stream_chunk_builder.py index a4527521f..3a4d25b58 100644 --- a/litellm/tests/test_stream_chunk_builder.py +++ b/litellm/tests/test_stream_chunk_builder.py @@ -35,6 +35,7 @@ def test_stream_chunk_builder(): chunks = [] for chunk in response: + print(chunk) chunks.append(chunk) try: @@ -51,8 +52,9 @@ def test_stream_chunk_builder(): message = choices["message"] role = message["role"] content = message["content"] - finnish_reason = choices["finish_reason"] - except: - raise Exception("stream_chunk_builder failed to rebuild response") -# test_stream_chunk_builder() + finish_reason = choices["finish_reason"] + print(role, content, finish_reason) + except Exception as e: + raise Exception("stream_chunk_builder failed to rebuild response", e) +test_stream_chunk_builder() diff --git a/litellm/tests/test_streaming.py b/litellm/tests/test_streaming.py index ca6748dc8..c0f969ab1 100644 --- a/litellm/tests/test_streaming.py +++ b/litellm/tests/test_streaming.py @@ -695,35 +695,52 @@ def test_completion_replicate_stream_bad_key(): # test_completion_bedrock_claude_stream() -def test_completion_sagemaker_stream(): - try: - response = completion( - model="sagemaker/jumpstart-dft-meta-textgeneration-llama-2-7b", - messages=messages, - temperature=0.2, - max_tokens=80, - stream=True, - ) - complete_response = "" - has_finish_reason = False - # Add any assertions here to check the response - for idx, chunk in enumerate(response): - chunk, finished = 
streaming_format_tests(idx, chunk) - has_finish_reason = finished - if finished: - break - complete_response += chunk - if has_finish_reason is False: - raise Exception("finish reason not set for last chunk") - if complete_response.strip() == "": - raise Exception("Empty response received") - except InvalidRequestError as e: - pass - except Exception as e: - pytest.fail(f"Error occurred: {e}") +# def test_completion_sagemaker_stream(): +# try: +# response = completion( +# model="sagemaker/jumpstart-dft-meta-textgeneration-llama-2-7b", +# messages=messages, +# temperature=0.2, +# max_tokens=80, +# stream=True, +# ) +# complete_response = "" +# has_finish_reason = False +# # Add any assertions here to check the response +# for idx, chunk in enumerate(response): +# chunk, finished = streaming_format_tests(idx, chunk) +# has_finish_reason = finished +# if finished: +# break +# complete_response += chunk +# if has_finish_reason is False: +# raise Exception("finish reason not set for last chunk") +# if complete_response.strip() == "": +# raise Exception("Empty response received") +# except InvalidRequestError as e: +# pass +# except Exception as e: +# pytest.fail(f"Error occurred: {e}") # test_completion_sagemaker_stream() + +# def test_maritalk_streaming(): +# messages = [{"role": "user", "content": "Hey"}] +# try: +# response = completion("maritalk", messages=messages, stream=True) +# complete_response = "" +# start_time = time.time() +# for idx, chunk in enumerate(response): +# chunk, finished = streaming_format_tests(idx, chunk) +# complete_response += chunk +# if finished: +# break +# if complete_response.strip() == "": +# raise Exception("Empty response received") +# except: +# pytest.fail(f"error occurred: {traceback.format_exc()}") +# test_maritalk_streaming() # test on openai completion call def test_openai_text_completion_call(): try: diff --git a/litellm/tests/test_text_completion.py b/litellm/tests/test_text_completion.py new file mode 100644 index 000000000..17502c194 --- /dev/null +++ b/litellm/tests/test_text_completion.py @@ -0,0 +1,83 @@ +import sys, os +import traceback +from dotenv import load_dotenv + +load_dotenv() +import os, io + +sys.path.insert( + 0, os.path.abspath("../..") +) # Adds the parent directory to the system path +import pytest +import litellm +from litellm import embedding, completion, text_completion, completion_cost +from litellm import RateLimitError + + +token_prompt = [[32, 2043, 32, 329, 4585, 262, 1644, 14, 34, 3705, 319, 616, 47551, 30, 930, 19219, 284, 1949, 284, 787, 428, 355, 1790, 355, 1744, 981, 1390, 3307, 2622, 13, 220, 198, 198, 40, 423, 587, 351, 616, 41668, 32682, 329, 718, 812, 13, 376, 666, 32682, 468, 281, 4697, 6621, 11, 356, 1183, 869, 607, 25737, 11, 508, 318, 2579, 290, 468, 257, 642, 614, 1468, 1200, 13, 314, 373, 612, 262, 1110, 25737, 373, 287, 4827, 290, 14801, 373, 4642, 11, 673, 318, 616, 41803, 13, 2399, 2104, 1641, 468, 6412, 284, 502, 355, 465, 38074, 494, 1201, 1110, 352, 13, 314, 716, 407, 2910, 475, 356, 389, 1641, 11, 673, 3848, 502, 38074, 494, 290, 356, 423, 3993, 13801, 11, 26626, 11864, 11, 3503, 13, 220, 198, 198, 17, 812, 2084, 25737, 373, 287, 14321, 422, 2563, 13230, 13, 21051, 11, 2356, 25542, 11, 290, 47482, 897, 547, 607, 1517, 13, 1375, 550, 257, 5110, 14608, 290, 262, 1641, 7723, 1637, 284, 3758, 607, 284, 14321, 290, 477, 8389, 257, 7269, 284, 1011, 1337, 286, 14801, 13, 383, 5156, 338, 9955, 11, 25737, 338, 13850, 11, 468, 257, 47973, 14, 9979, 2762, 1693, 290, 373, 503, 286, 3240, 329, 362, 1933, 523, 339, 
2492, 470, 612, 329, 477, 286, 428, 13, 220, 198, 198, 3347, 10667, 5223, 503, 706, 513, 1528, 11, 23630, 673, 373, 366, 38125, 290, 655, 2622, 257, 3338, 8399, 1911, 314, 2298, 607, 510, 11, 1011, 607, 284, 607, 2156, 11, 290, 673, 3393, 2925, 284, 7523, 20349, 290, 4144, 257, 6099, 13, 314, 836, 470, 892, 20349, 318, 257, 2563, 290, 716, 845, 386, 12, 66, 1236, 571, 292, 3584, 314, 836, 470, 7523, 11, 475, 326, 373, 407, 5035, 6402, 314, 655, 6497, 607, 510, 422, 14321, 13, 220, 198, 198, 32, 1285, 1568, 673, 373, 6294, 329, 3013, 24707, 287, 262, 12436, 1539, 819, 5722, 329, 852, 604, 1933, 2739, 11, 39398, 607, 1097, 5059, 981, 1029, 290, 318, 852, 16334, 329, 720, 1120, 74, 422, 15228, 278, 656, 257, 2156, 11, 290, 373, 12165, 503, 286, 376, 666, 32682, 338, 584, 6621, 338, 2156, 329, 32012, 262, 14595, 373, 30601, 510, 290, 2491, 357, 7091, 373, 1029, 8, 290, 262, 2104, 34624, 373, 46432, 1268, 1961, 422, 1660, 2465, 780, 8168, 2073, 1625, 1363, 329, 807, 2250, 13, 720, 1238, 11, 830, 286, 2465, 290, 5875, 5770, 511, 2156, 5096, 5017, 340, 13, 220, 198, 198, 2504, 373, 477, 938, 614, 13, 1119, 1053, 587, 287, 511, 649, 2156, 319, 511, 898, 329, 546, 718, 1933, 13, 554, 3389, 673, 1444, 34020, 290, 531, 511, 8744, 373, 4423, 572, 780, 673, 1422, 470, 423, 262, 1637, 780, 41646, 338, 37751, 1392, 32621, 510, 290, 1422, 470, 467, 832, 13, 679, 3432, 511, 2739, 8744, 9024, 492, 257, 2472, 286, 720, 4059, 13, 314, 1807, 340, 373, 13678, 306, 5789, 475, 4030, 616, 5422, 4423, 13, 1439, 468, 587, 5897, 1201, 13, 220, 198, 198, 7571, 2745, 2084, 11, 673, 1965, 502, 284, 8804, 617, 1637, 284, 651, 38464, 329, 399, 8535, 13, 3226, 1781, 314, 1101, 407, 1016, 284, 1309, 616, 41803, 393, 6621, 467, 14720, 11, 645, 2300, 644, 318, 1016, 319, 4306, 11, 523, 314, 910, 314, 1183, 307, 625, 379, 642, 13, 314, 1392, 572, 670, 1903, 290, 651, 612, 379, 362, 25, 2231, 13, 314, 1282, 287, 1262, 616, 13952, 1994, 11, 2513, 287, 11, 766, 399, 8535, 2712, 351, 36062, 287, 262, 5228, 11, 25737, 3804, 503, 319, 262, 18507, 11, 290, 16914, 319, 262, 6891, 3084, 13, 8989, 2406, 422, 257, 1641, 47655, 351, 13230, 11, 314, 760, 644, 16914, 3073, 588, 13, 314, 836, 470, 760, 703, 881, 340, 373, 11, 475, 314, 714, 423, 23529, 276, 340, 510, 290, 5901, 616, 18057, 351, 340, 13, 314, 6810, 19772, 2024, 8347, 287, 262, 2166, 2119, 290, 399, 8535, 373, 287, 3294, 11685, 286, 8242, 290, 607, 7374, 15224, 13, 383, 4894, 373, 572, 13, 383, 2156, 373, 3863, 2319, 37, 532, 340, 373, 1542, 2354, 13, 220, 198, 198, 40, 1718, 399, 8535, 284, 616, 1097, 11, 290, 1444, 16679, 329, 281, 22536, 355, 314, 373, 12008, 25737, 373, 14904, 2752, 13, 220, 314, 1422, 470, 765, 284, 10436, 290, 22601, 503, 399, 8535, 523, 314, 9658, 287, 262, 1097, 290, 1309, 607, 711, 319, 616, 3072, 1566, 262, 22536, 5284, 13, 3226, 1781, 1644, 290, 32084, 3751, 510, 355, 880, 13, 314, 4893, 262, 3074, 290, 780, 399, 8535, 338, 9955, 318, 503, 286, 3240, 1762, 11, 34020, 14, 44, 4146, 547, 1444, 13, 1649, 484, 5284, 484, 547, 5897, 290, 4692, 11, 1422, 470, 1107, 1561, 11, 1718, 399, 8535, 11, 290, 1297, 502, 284, 467, 1363, 13, 220, 198, 198, 2025, 1711, 1568, 314, 651, 1363, 290, 41668, 32682, 7893, 502, 644, 314, 1053, 1760, 13, 314, 4893, 2279, 284, 683, 290, 477, 339, 550, 373, 8993, 329, 502, 13, 18626, 262, 2104, 1641, 1541, 2993, 290, 547, 28674, 379, 502, 329, 644, 314, 550, 1760, 13, 18626, 314, 373, 366, 448, 286, 1627, 290, 8531, 1, 780, 314, 1444, 16679, 878, 4379, 611, 673, 373, 1682, 31245, 6, 278, 780, 340, 2900, 503, 673, 373, 655, 
47583, 503, 422, 262, 16914, 13, 775, 8350, 329, 2250, 290, 314, 1364, 290, 3377, 262, 1755, 379, 616, 1266, 1545, 338, 2156, 290, 16896, 477, 1755, 13, 314, 3521, 470, 5412, 340, 477, 523, 314, 2900, 616, 3072, 572, 290, 3088, 284, 8960, 290, 655, 9480, 866, 13, 2011, 1266, 1545, 373, 510, 477, 1755, 351, 502, 11, 5149, 502, 314, 750, 2147, 2642, 11, 290, 314, 1101, 8788, 13, 220, 198, 198, 40, 1210, 616, 3072, 319, 290, 314, 550, 6135, 13399, 14, 37348, 1095, 13, 31515, 11, 34020, 11, 47551, 11, 41668, 32682, 11, 290, 511, 7083, 1641, 1866, 24630, 502, 13, 1119, 389, 2282, 314, 20484, 607, 1204, 11, 20484, 399, 8535, 338, 1204, 11, 925, 2279, 517, 8253, 621, 340, 2622, 284, 307, 11, 925, 340, 1171, 618, 340, 373, 257, 366, 17989, 14669, 1600, 290, 20484, 25737, 338, 8395, 286, 1683, 1972, 20750, 393, 1719, 10804, 286, 607, 1200, 757, 11, 4844, 286, 606, 1683, 765, 284, 766, 502, 757, 290, 314, 481, 1239, 766, 616, 41803, 757, 11, 290, 484, 765, 502, 284, 1414, 329, 25737, 338, 7356, 6314, 290, 20889, 502, 329, 262, 32084, 1339, 290, 7016, 12616, 13, 198, 198, 40, 716, 635, 783, 2060, 13, 1406, 319, 1353, 286, 6078, 616, 1266, 1545, 286, 838, 812, 357, 69, 666, 32682, 828, 314, 481, 4425, 616, 7962, 314, 550, 351, 683, 11, 644, 314, 3177, 616, 1641, 11, 290, 616, 399, 8535, 13, 198, 198, 40, 4988, 1254, 12361, 13, 314, 423, 12361, 9751, 284, 262, 966, 810, 314, 1101, 7960, 2130, 318, 1016, 284, 1282, 651, 366, 260, 18674, 1, 319, 502, 329, 644, 314, 750, 13, 314, 460, 470, 4483, 13, 314, 423, 2626, 767, 8059, 422, 340, 13, 314, 1101, 407, 11029, 329, 7510, 13, 314, 423, 11668, 739, 616, 2951, 13, 314, 1053, 550, 807, 50082, 12, 12545, 287, 734, 2745, 13, 1629, 717, 314, 2936, 523, 6563, 287, 616, 2551, 475, 355, 262, 1528, 467, 416, 314, 1101, 3612, 3863, 484, 547, 826, 290, 314, 815, 423, 10667, 319, 607, 878, 4585, 16679, 290, 852, 5306, 3019, 992, 13, 314, 836, 470, 1337, 546, 25737, 7471, 11, 475, 314, 750, 18344, 257, 642, 614, 1468, 1200, 1497, 422, 607, 3397, 290, 314, 1254, 12361, 546, 340, 13, 314, 760, 2130, 287, 262, 1641, 481, 1011, 607, 287, 11, 475, 340, 338, 1239, 588, 852, 351, 534, 3397, 13, 1375, 481, 1663, 510, 20315, 278, 502, 329, 340, 290, 477, 314, 1053, 1683, 1760, 318, 1842, 607, 355, 616, 898, 13, 220, 198, 198, 22367, 11, 317, 2043, 32, 30, 4222, 1037, 502, 13, 383, 14934, 318, 6600, 502, 6776, 13, 220, 198, 24361, 25, 1148, 428, 2642, 30, 198, 33706, 25, 645], [32, 2043, 32, 329, 4585, 262, 1644, 14, 34, 3705, 319, 616, 47551, 30, 930, 19219, 284, 1949, 284, 787, 428, 355, 1790, 355, 1744, 981, 1390, 3307, 2622, 13, 220, 198, 198, 40, 423, 587, 351, 616, 41668, 32682, 329, 718, 812, 13, 376, 666, 32682, 468, 281, 4697, 6621, 11, 356, 1183, 869, 607, 25737, 11, 508, 318, 2579, 290, 468, 257, 642, 614, 1468, 1200, 13, 314, 373, 612, 262, 1110, 25737, 373, 287, 4827, 290, 14801, 373, 4642, 11, 673, 318, 616, 41803, 13, 2399, 2104, 1641, 468, 6412, 284, 502, 355, 465, 38074, 494, 1201, 1110, 352, 13, 314, 716, 407, 2910, 475, 356, 389, 1641, 11, 673, 3848, 502, 38074, 494, 290, 356, 423, 3993, 13801, 11, 26626, 11864, 11, 3503, 13, 220, 198, 198, 17, 812, 2084, 25737, 373, 287, 14321, 422, 2563, 13230, 13, 21051, 11, 2356, 25542, 11, 290, 47482, 897, 547, 607, 1517, 13, 1375, 550, 257, 5110, 14608, 290, 262, 1641, 7723, 1637, 284, 3758, 607, 284, 14321, 290, 477, 8389, 257, 7269, 284, 1011, 1337, 286, 14801, 13, 383, 5156, 338, 9955, 11, 25737, 338, 13850, 11, 468, 257, 47973, 14, 9979, 2762, 1693, 290, 373, 503, 286, 3240, 329, 362, 1933, 523, 339, 2492, 470, 612, 
329, 477, 286, 428, 13, 220, 198, 198, 3347, 10667, 5223, 503, 706, 513, 1528, 11, 23630, 673, 373, 366, 38125, 290, 655, 2622, 257, 3338, 8399, 1911, 314, 2298, 607, 510, 11, 1011, 607, 284, 607, 2156, 11, 290, 673, 3393, 2925, 284, 7523, 20349, 290, 4144, 257, 6099, 13, 314, 836, 470, 892, 20349, 318, 257, 2563, 290, 716, 845, 386, 12, 66, 1236, 571, 292, 3584, 314, 836, 470, 7523, 11, 475, 326, 373, 407, 5035, 6402, 314, 655, 6497, 607, 510, 422, 14321, 13, 220, 198, 198, 32, 1285, 1568, 673, 373, 6294, 329, 3013, 24707, 287, 262, 12436, 1539, 819, 5722, 329, 852, 604, 1933, 2739, 11, 39398, 607, 1097, 5059, 981, 1029, 290, 318, 852, 16334, 329, 720, 1120, 74, 422, 15228, 278, 656, 257, 2156, 11, 290, 373, 12165, 503, 286, 376, 666, 32682, 338, 584, 6621, 338, 2156, 329, 32012, 262, 14595, 373, 30601, 510, 290, 2491, 357, 7091, 373, 1029, 8, 290, 262, 2104, 34624, 373, 46432, 1268, 1961, 422, 1660, 2465, 780, 8168, 2073, 1625, 1363, 329, 807, 2250, 13, 720, 1238, 11, 830, 286, 2465, 290, 5875, 5770, 511, 2156, 5096, 5017, 340, 13, 220, 198, 198, 2504, 373, 477, 938, 614, 13, 1119, 1053, 587, 287, 511, 649, 2156, 319, 511, 898, 329, 546, 718, 1933, 13, 554, 3389, 673, 1444, 34020, 290, 531, 511, 8744, 373, 4423, 572, 780, 673, 1422, 470, 423, 262, 1637, 780, 41646, 338, 37751, 1392, 32621, 510, 290, 1422, 470, 467, 832, 13, 679, 3432, 511, 2739, 8744, 9024, 492, 257, 2472, 286, 720, 4059, 13, 314, 1807, 340, 373, 13678, 306, 5789, 475, 4030, 616, 5422, 4423, 13, 1439, 468, 587, 5897, 1201, 13, 220, 198, 198, 7571, 2745, 2084, 11, 673, 1965, 502, 284, 8804, 617, 1637, 284, 651, 38464, 329, 399, 8535, 13, 3226, 1781, 314, 1101, 407, 1016, 284, 1309, 616, 41803, 393, 6621, 467, 14720, 11, 645, 2300, 644, 318, 1016, 319, 4306, 11, 523, 314, 910, 314, 1183, 307, 625, 379, 642, 13, 314, 1392, 572, 670, 1903, 290, 651, 612, 379, 362, 25, 2231, 13, 314, 1282, 287, 1262, 616, 13952, 1994, 11, 2513, 287, 11, 766, 399, 8535, 2712, 351, 36062, 287, 262, 5228, 11, 25737, 3804, 503, 319, 262, 18507, 11, 290, 16914, 319, 262, 6891, 3084, 13, 8989, 2406, 422, 257, 1641, 47655, 351, 13230, 11, 314, 760, 644, 16914, 3073, 588, 13, 314, 836, 470, 760, 703, 881, 340, 373, 11, 475, 314, 714, 423, 23529, 276, 340, 510, 290, 5901, 616, 18057, 351, 340, 13, 314, 6810, 19772, 2024, 8347, 287, 262, 2166, 2119, 290, 399, 8535, 373, 287, 3294, 11685, 286, 8242, 290, 607, 7374, 15224, 13, 383, 4894, 373, 572, 13, 383, 2156, 373, 3863, 2319, 37, 532, 340, 373, 1542, 2354, 13, 220, 198, 198, 40, 1718, 399, 8535, 284, 616, 1097, 11, 290, 1444, 16679, 329, 281, 22536, 355, 314, 373, 12008, 25737, 373, 14904, 2752, 13, 220, 314, 1422, 470, 765, 284, 10436, 290, 22601, 503, 399, 8535, 523, 314, 9658, 287, 262, 1097, 290, 1309, 607, 711, 319, 616, 3072, 1566, 262, 22536, 5284, 13, 3226, 1781, 1644, 290, 32084, 3751, 510, 355, 880, 13, 314, 4893, 262, 3074, 290, 780, 399, 8535, 338, 9955, 318, 503, 286, 3240, 1762, 11, 34020, 14, 44, 4146, 547, 1444, 13, 1649, 484, 5284, 484, 547, 5897, 290, 4692, 11, 1422, 470, 1107, 1561, 11, 1718, 399, 8535, 11, 290, 1297, 502, 284, 467, 1363, 13, 220, 198, 198, 2025, 1711, 1568, 314, 651, 1363, 290, 41668, 32682, 7893, 502, 644, 314, 1053, 1760, 13, 314, 4893, 2279, 284, 683, 290, 477, 339, 550, 373, 8993, 329, 502, 13, 18626, 262, 2104, 1641, 1541, 2993, 290, 547, 28674, 379, 502, 329, 644, 314, 550, 1760, 13, 18626, 314, 373, 366, 448, 286, 1627, 290, 8531, 1, 780, 314, 1444, 16679, 878, 4379, 611, 673, 373, 1682, 31245, 6, 278, 780, 340, 2900, 503, 673, 373, 655, 47583, 503, 422, 
262, 16914, 13, 775, 8350, 329, 2250, 290, 314, 1364, 290, 3377, 262, 1755, 379, 616, 1266, 1545, 338, 2156, 290, 16896, 477, 1755, 13, 314, 3521, 470, 5412, 340, 477, 523, 314, 2900, 616, 3072, 572, 290, 3088, 284, 8960, 290, 655, 9480, 866, 13, 2011, 1266, 1545, 373, 510, 477, 1755, 351, 502, 11, 5149, 502, 314, 750, 2147, 2642, 11, 290, 314, 1101, 8788, 13, 220, 198, 198, 40, 1210, 616, 3072, 319, 290, 314, 550, 6135, 13399, 14, 37348, 1095, 13, 31515, 11, 34020, 11, 47551, 11, 41668, 32682, 11, 290, 511, 7083, 1641, 1866, 24630, 502, 13, 1119, 389, 2282, 314, 20484, 607, 1204, 11, 20484, 399, 8535, 338, 1204, 11, 925, 2279, 517, 8253, 621, 340, 2622, 284, 307, 11, 925, 340, 1171, 618, 340, 373, 257, 366, 17989, 14669, 1600, 290, 20484, 25737, 338, 8395, 286, 1683, 1972, 20750, 393, 1719, 10804, 286, 607, 1200, 757, 11, 4844, 286, 606, 1683, 765, 284, 766, 502, 757, 290, 314, 481, 1239, 766, 616, 41803, 757, 11, 290, 484, 765, 502, 284, 1414, 329, 25737, 338, 7356, 6314, 290, 20889, 502, 329, 262, 32084, 1339, 290, 7016, 12616, 13, 198, 198, 40, 716, 635, 783, 2060, 13, 1406, 319, 1353, 286, 6078, 616, 1266, 1545, 286, 838, 812, 357, 69, 666, 32682, 828, 314, 481, 4425, 616, 7962, 314, 550, 351, 683, 11, 644, 314, 3177, 616, 1641, 11, 290, 616, 399, 8535, 13, 198, 198, 40, 4988, 1254, 12361, 13, 314, 423, 12361, 9751, 284, 262, 966, 810, 314, 1101, 7960, 2130, 318, 1016, 284, 1282, 651, 366, 260, 18674, 1, 319, 502, 329, 644, 314, 750, 13, 314, 460, 470, 4483, 13, 314, 423, 2626, 767, 8059, 422, 340, 13, 314, 1101, 407, 11029, 329, 7510, 13, 314, 423, 11668, 739, 616, 2951, 13, 314, 1053, 550, 807, 50082, 12, 12545, 287, 734, 2745, 13, 1629, 717, 314, 2936, 523, 6563, 287, 616, 2551, 475, 355, 262, 1528, 467, 416, 314, 1101, 3612, 3863, 484, 547, 826, 290, 314, 815, 423, 10667, 319, 607, 878, 4585, 16679, 290, 852, 5306, 3019, 992, 13, 314, 836, 470, 1337, 546, 25737, 7471, 11, 475, 314, 750, 18344, 257, 642, 614, 1468, 1200, 1497, 422, 607, 3397, 290, 314, 1254, 12361, 546, 340, 13, 314, 760, 2130, 287, 262, 1641, 481, 1011, 607, 287, 11, 475, 340, 338, 1239, 588, 852, 351, 534, 3397, 13, 1375, 481, 1663, 510, 20315, 278, 502, 329, 340, 290, 477, 314, 1053, 1683, 1760, 318, 1842, 607, 355, 616, 898, 13, 220, 198, 198, 22367, 11, 317, 2043, 32, 30, 4222, 1037, 502, 13, 383, 14934, 318, 6600, 502, 6776, 13, 220, 198, 24361, 25, 1148, 428, 2642, 30, 198, 33706, 25, 3763]] + + + + +def test_completion_openai_prompt(): + try: + response = text_completion( + model="gpt-3.5-turbo", prompt="What's the weather in SF?" 
+ ) + print(response) + response_str = response["choices"][0]["text"] + print(response.choices) + print(response.choices[0]) + #print(response.choices[0].text) + except Exception as e: + pytest.fail(f"Error occurred: {e}") +test_completion_openai_prompt() + + +def test_completion_openai_prompt_array(): + try: + litellm.set_verbose=False + response = text_completion( + model="text-davinci-003", prompt="good morning", max_tokens=10, logprobs=10, echo=True + ) + print(response) + print(response.choices) + print(response.choices[0]) + #print(response.choices[0].text) + response_str = response["choices"][0]["text"] + except Exception as e: + pytest.fail(f"Error occurred: {e}") +test_completion_openai_prompt_array() + +# def test_completion_hf_prompt_array(): +# try: +# litellm.set_verbose=False +# response = text_completion( +# model="huggingface/mistralai/Mistral-7B-v0.1", +# prompt=token_prompt, # token prompt is a 2d list +# ) +# print("\n\n response") + +# print(response) +# print(response.choices) +# assert(len(response.choices)==2) +# # response_str = response["choices"][0]["text"] +# except Exception as e: +# pytest.fail(f"Error occurred: {e}") +# test_completion_hf_prompt_array() + + +def test_completion_text_003_prompt_array(): + try: + litellm.set_verbose=False + response = text_completion( + model="text-davinci-003", + prompt=token_prompt, # token prompt is a 2d list + ) + print("\n\n response") + + print(response) + # response_str = response["choices"][0]["text"] + except Exception as e: + pytest.fail(f"Error occurred: {e}") +test_completion_text_003_prompt_array() \ No newline at end of file diff --git a/litellm/tests/test_traceloop.py b/litellm/tests/test_traceloop.py index 96b6b13e3..ed9d4e9f4 100644 --- a/litellm/tests/test_traceloop.py +++ b/litellm/tests/test_traceloop.py @@ -1,57 +1,62 @@ -import litellm -from litellm import completion -from traceloop.sdk import Traceloop - -Traceloop.init(app_name="test_traceloop", disable_batch=True) -litellm.success_callback = ["traceloop"] +# import sys +# import os +# import io +# # +# sys.path.insert(0, os.path.abspath('../..')) +# import litellm +# from litellm import completion +# from traceloop.sdk import Traceloop +# Traceloop.init(app_name="test_traceloop", disable_batch=True, traceloop_sync_enabled=False) +# litellm.success_callback = ["traceloop"] -def test_traceloop_logging(): - try: - response = completion( - model="claude-instant-1.2", - messages=[ - {"role": "user", "content": "Tell me a joke about OpenTelemetry"} - ], - max_tokens=10, - temperature=0.2, - ) - print(response) - except Exception as e: - print(e) +# def test_traceloop_logging(): +# try: +# print('making completion call') +# response = completion( +# model="claude-instant-1.2", +# messages=[ +# {"role": "user", "content": "Tell me a joke about OpenTelemetry"} +# ], +# max_tokens=10, +# temperature=0.2, +# ) +# print(response) +# except Exception as e: +# print(e) -test_traceloop_logging() +# # test_traceloop_logging() -def test_traceloop_tracing_function_calling(): - function1 = [ - { - "name": "get_current_weather", - "description": "Get the current weather in a given location", - "parameters": { - "type": "object", - "properties": { - "location": { - "type": "string", - "description": "The city and state, e.g. 
San Francisco, CA", - }, - "unit": {"type": "string", "enum": ["celsius", "fahrenheit"]}, - }, - "required": ["location"], - }, - } - ] - try: - response = completion( - model="gpt-3.5-turbo", - messages=[{"role": "user", "content": "what's the weather in boston"}], - temperature=0.1, - functions=function1, - ) - print(response) - except Exception as e: - print(e) +# def test_traceloop_tracing_function_calling(): +# function1 = [ +# { +# "name": "get_current_weather", +# "description": "Get the current weather in a given location", +# "parameters": { +# "type": "object", +# "properties": { +# "location": { +# "type": "string", +# "description": "The city and state, e.g. San Francisco, CA", +# }, +# "unit": {"type": "string", "enum": ["celsius", "fahrenheit"]}, +# }, +# "required": ["location"], +# }, +# } +# ] +# try: +# response = completion( +# model="gpt-3.5-turbo", +# messages=[{"role": "user", "content": "what's the weather in boston"}], +# temperature=0.1, +# functions=function1, +# ) +# print(response) +# except Exception as e: +# print(e) -test_traceloop_tracing_function_calling() +# # test_traceloop_tracing_function_calling() diff --git a/litellm/utils.py b/litellm/utils.py index 4f6c9c3aa..c08517ae6 100644 --- a/litellm/utils.py +++ b/litellm/utils.py @@ -146,6 +146,16 @@ class Choices(OpenAIObject): else: self.message = message +class Usage(OpenAIObject): + def __init__(self, prompt_tokens=None, completion_tokens=None, total_tokens=None, **params): + super(Usage, self).__init__(**params) + if prompt_tokens: + self.prompt_tokens = prompt_tokens + if completion_tokens: + self.completion_tokens = completion_tokens + if total_tokens: + self.total_tokens = total_tokens + class StreamingChoices(OpenAIObject): def __init__(self, finish_reason=None, index=0, delta: Optional[Delta]=None, **params): super(StreamingChoices, self).__init__(**params) @@ -180,15 +190,11 @@ class ModelResponse(OpenAIObject): else: self._response_ms = None self.model = model - self.usage = ( - usage - if usage - else { - "prompt_tokens": None, - "completion_tokens": None, - "total_tokens": None, - } - ) + if usage: + self.usage = usage + else: + self.usage = Usage() + self._hidden_params = {} # used in case users want to access the original model response super(ModelResponse, self).__init__(**params) def to_dict_recursive(self): @@ -196,6 +202,10 @@ class ModelResponse(OpenAIObject): d["choices"] = [choice.to_dict_recursive() for choice in self.choices] return d + def cost(self): + # for non streaming responses + return completion_cost(completion_response=self) + class EmbeddingResponse(OpenAIObject): def __init__(self, id=None, choices=None, created=None, model=None, usage=None, stream=False, response_ms=None, **params): self.object = "list" @@ -210,10 +220,72 @@ class EmbeddingResponse(OpenAIObject): d = super().to_dict_recursive() return d +class TextChoices(OpenAIObject): + def __init__(self, finish_reason=None, index=0, text=None, logprobs=None, **params): + super(TextChoices, self).__init__(**params) + if finish_reason: + self.finish_reason = map_finish_reason(finish_reason) + else: + self.finish_reason = "stop" + self.index = index + if text: + self.text = text + else: + self.text = None + if logprobs: + self.logprobs = [] + else: + self.logprobs = logprobs + +class TextCompletionResponse(OpenAIObject): + """ + { + "id": response["id"], + "object": "text_completion", + "created": response["created"], + "model": response["model"], + "choices": [ + { + "text": response["choices"][0]["message"]["content"], 
+ "index": response["choices"][0]["index"], + "logprobs": transformed_logprobs, + "finish_reason": response["choices"][0]["finish_reason"] + } + ], + "usage": response["usage"] + } + """ + def __init__(self, id=None, choices=None, created=None, model=None, usage=None, stream=False, response_ms=None, **params): + if stream: + self.object = "text_completion.chunk" + self.choices = [StreamingChoices()] + else: + self.object = "text_completion" + self.choices = [TextChoices()] + if id is None: + self.id = _generate_id() + else: + self.id = id + if created is None: + self.created = int(time.time()) + else: + self.created = created + if response_ms: + self._response_ms = response_ms + else: + self._response_ms = None + self.model = model + if usage: + self.usage = usage + else: + self.usage = Usage() + self._hidden_params = {} # used in case users want to access the original model response + super(TextCompletionResponse, self).__init__(**params) + ############################################################ def print_verbose(print_statement): if litellm.set_verbose: - print(f"LiteLLM: {print_statement}") + print(print_statement) # noqa ####### LOGGING ################### from enum import Enum @@ -270,7 +342,7 @@ class Logging: self.model_call_details["model"] = model # User Logging -> if you pass in a custom logging function - print_verbose(f"model call details: {self.model_call_details}") + print_verbose(f"MODEL CALL INPUT: {self.model_call_details}\n\n") if self.logger_fn and callable(self.logger_fn): try: self.logger_fn( @@ -326,6 +398,12 @@ class Logging: message=f"Model Call Details pre-call: {self.model_call_details}", level="info", ) + elif isinstance(callback, CustomLogger): # custom logger class + callback.log_pre_api_call( + model=self.model, + messages=self.messages, + kwargs=self.model_call_details, + ) elif callable(callback): # custom logger functions customLogger.log_input_event( model=self.model, @@ -365,7 +443,7 @@ class Logging: self.model_call_details["log_event_type"] = "post_api_call" # User Logging -> if you pass in a custom logging function - print_verbose(f"model call details: {self.model_call_details}") + print_verbose(f"RAW RESPONSE: {self.model_call_details}\n\n") print_verbose( f"Logging Details Post-API Call: logger_fn - {self.logger_fn} | callable(logger_fn) - {callable(self.logger_fn)}" ) @@ -399,6 +477,12 @@ class Logging: message=f"Model Call Details post-call: {self.model_call_details}", level="info", ) + elif isinstance(callback, CustomLogger): # custom logger class + callback.log_post_api_call( + model=self.model, + messages=self.messages, + kwargs=self.model_call_details, + ) except: print_verbose( f"LiteLLM.LoggingError: [Non-Blocking] Exception occurred while post-call logging with integrations {traceback.format_exc()}" @@ -466,8 +550,6 @@ class Logging: print_verbose("reaches api manager for updating model cost") litellm.apiManager.update_cost(completion_obj=result, user=self.user) if callback == "cache": - # print("entering logger first time") - # print(self.litellm_params["stream_response"]) if litellm.cache != None and self.model_call_details.get('optional_params', {}).get('stream', False) == True: litellm_call_id = self.litellm_params["litellm_call_id"] if litellm_call_id in self.litellm_params["stream_response"]: @@ -478,10 +560,7 @@ class Logging: self.litellm_params["stream_response"][litellm_call_id]["choices"][0]["message"]["content"] += result["content"] else: # init a streaming response for this call id new_model_response = 
ModelResponse(choices=[Choices(message=Message(content="default"))]) - #print("creating new model response") - #print(new_model_response) self.litellm_params["stream_response"][litellm_call_id] = new_model_response - #print("adding to cache for", litellm_call_id) litellm.cache.add_cache(self.litellm_params["stream_response"][litellm_call_id], **self.model_call_details) if callback == "promptlayer": print_verbose("reaches promptlayer for logging!") @@ -504,7 +583,6 @@ class Logging: print_verbose("reaches supabase for streaming logging!") result = kwargs["complete_streaming_response"] - # print(kwargs) model = kwargs["model"] messages = kwargs["messages"] optional_params = kwargs.get("optional_params", {}) @@ -537,6 +615,23 @@ class Logging: end_time=end_time, print_verbose=print_verbose, ) + if isinstance(callback, CustomLogger): # custom logger class + if self.stream and complete_streaming_response is None: + callback.log_stream_event( + kwargs=self.model_call_details, + response_obj=result, + start_time=start_time, + end_time=end_time + ) + else: + if self.stream and complete_streaming_response: + self.model_call_details["complete_response"] = self.model_call_details.pop("complete_streaming_response", complete_streaming_response) + callback.log_success_event( + kwargs=self.model_call_details, + response_obj=result, + start_time=start_time, + end_time=end_time, + ) if callable(callback): # custom logger functions customLogger.log_event( kwargs=self.model_call_details, @@ -624,6 +719,12 @@ class Logging: print_verbose=print_verbose, callback_func=callback ) + elif isinstance(callback, CustomLogger): # custom logger class + callback.log_failure_event( + model=self.model, + messages=self.messages, + kwargs=self.model_call_details, + ) except Exception as e: print_verbose( f"LiteLLM.LoggingError: [Non-Blocking] Exception occurred while failure logging with integrations {traceback.format_exc()}" @@ -660,11 +761,11 @@ def exception_logging( model_call_details ) # Expectation: any logger function passed in by the user should accept a dict object except Exception as e: - print( + print_verbose( f"LiteLLM.LoggingError: [Non-Blocking] Exception occurred while logging {traceback.format_exc()}" ) except Exception as e: - print( + print_verbose( f"LiteLLM.LoggingError: [Non-Blocking] Exception occurred while logging {traceback.format_exc()}" ) pass @@ -689,6 +790,14 @@ def client(original_function): litellm.success_callback.append("lite_debugger") if "lite_debugger" not in litellm.failure_callback: litellm.failure_callback.append("lite_debugger") + if len(litellm.callbacks) > 0: + for callback in litellm.callbacks: + if callback not in litellm.input_callback: + litellm.input_callback.append(callback) + if callback not in litellm.success_callback: + litellm.success_callback.append(callback) + if callback not in litellm.failure_callback: + litellm.failure_callback.append(callback) if ( len(litellm.input_callback) > 0 or len(litellm.success_callback) > 0 @@ -727,7 +836,6 @@ def client(original_function): return logging_obj except Exception as e: # DO NOT BLOCK running the function because of this print_verbose(f"[Non-Blocking] {traceback.format_exc()}; args - {args}; kwargs - {kwargs}") - print(e) pass def crash_reporting(*args, **kwargs): @@ -752,8 +860,10 @@ def client(original_function): def wrapper(*args, **kwargs): start_time = datetime.datetime.now() result = None - litellm_call_id = str(uuid.uuid4()) - kwargs["litellm_call_id"] = litellm_call_id + + # only set litellm_call_id if its not in kwargs 
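+        # e.g. completion_with_fallbacks pre-sets a shared litellm_call_id so every fallback attempt is tracked under the same call id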
+ if "litellm_call_id" not in kwargs: + kwargs["litellm_call_id"] = str(uuid.uuid4()) try: model = args[0] if len(args) > 0 else kwargs["model"] except: @@ -812,6 +922,28 @@ def client(original_function): result._response_ms = (end_time - start_time).total_seconds() * 1000 # return response latency in ms like openai return result except Exception as e: + call_type = original_function.__name__ + if call_type == CallTypes.completion.value: + num_retries = ( + kwargs.get("num_retries", None) + or litellm.num_retries + or None + ) + litellm.num_retries = None # set retries to None to prevent infinite loops + context_window_fallback_dict = kwargs.get("context_window_fallback_dict", {}) + + if num_retries: + if (isinstance(e, openai.error.APIError) + or isinstance(e, openai.error.Timeout) + or isinstance(e, openai.error.ServiceUnavailableError)): + kwargs["num_retries"] = num_retries + return litellm.completion_with_retries(*args, **kwargs) + elif isinstance(e, litellm.exceptions.ContextWindowExceededError) and context_window_fallback_dict and model in context_window_fallback_dict: + if len(args) > 0: + args[0] = context_window_fallback_dict[model] + else: + kwargs["model"] = context_window_fallback_dict[model] + return original_function(*args, **kwargs) traceback_exception = traceback.format_exc() crash_reporting(*args, **kwargs, exception=traceback_exception) end_time = datetime.datetime.now() @@ -900,6 +1032,16 @@ def _select_tokenizer(model: str): return {"type": "openai_tokenizer", "tokenizer": encoding} def encode(model: str, text: str): + """ + Encodes the given text using the specified model. + + Args: + model (str): The name of the model to use for tokenization. + text (str): The text to be encoded. + + Returns: + enc: The encoded text. + """ tokenizer_json = _select_tokenizer(model=model) enc = tokenizer_json["tokenizer"].encode(text) return enc @@ -1264,8 +1406,25 @@ def get_optional_params( # use the openai defaults optional_params["presence_penalty"] = presence_penalty if stop: optional_params["stop_sequences"] = stop - elif custom_llm_provider == "perplexity": - optional_params[""] + elif custom_llm_provider == "maritalk": + ## check if unsupported param passed in + supported_params = ["stream", "temperature", "max_tokens", "top_p", "presence_penalty", "stop"] + _check_valid_arg(supported_params=supported_params) + # handle cohere params + if stream: + optional_params["stream"] = stream + if temperature: + optional_params["temperature"] = temperature + if max_tokens: + optional_params["max_tokens"] = max_tokens + if logit_bias != {}: + optional_params["logit_bias"] = logit_bias + if top_p: + optional_params["p"] = top_p + if presence_penalty: + optional_params["repetition_penalty"] = presence_penalty + if stop: + optional_params["stopping_tokens"] = stop elif custom_llm_provider == "replicate": ## check if unsupported param passed in supported_params = ["stream", "temperature", "max_tokens", "top_p", "stop", "seed"] @@ -1309,6 +1468,10 @@ def get_optional_params( # use the openai defaults optional_params["best_of"] = n if presence_penalty: optional_params["repetition_penalty"] = presence_penalty + if "echo" in special_params: + # https://huggingface.co/docs/huggingface_hub/main/en/package_reference/inference_client#huggingface_hub.InferenceClient.text_generation.decoder_input_details + # Return the decoder input token logprobs and ids. You must set details=True as well for it to be taken into account. 
Defaults to False + optional_params["decoder_input_details"] = special_params["echo"] elif custom_llm_provider == "together_ai": ## check if unsupported param passed in supported_params = ["stream", "temperature", "max_tokens", "top_p", "stop", "frequency_penalty"] @@ -1361,7 +1524,7 @@ def get_optional_params( # use the openai defaults if n: optional_params["candidate_count"] = n if stop: - optional_params["stopSequences"] = stop + optional_params["stop_sequences"] = stop if max_tokens: optional_params["max_output_tokens"] = max_tokens elif ( @@ -1564,7 +1727,7 @@ def get_llm_provider(model: str, custom_llm_provider: Optional[str] = None, api_ return model, custom_llm_provider, dynamic_api_key, api_base # check if llm provider part of model name - if model.split("/",1)[0] in litellm.provider_list: + if model.split("/",1)[0] in litellm.provider_list and model.split("/",1)[0] not in litellm.model_list: custom_llm_provider = model.split("/", 1)[0] model = model.split("/", 1)[1] if custom_llm_provider == "perplexity": @@ -1610,8 +1773,16 @@ def get_llm_provider(model: str, custom_llm_provider: Optional[str] = None, api_ ## openrouter elif model in litellm.openrouter_models: custom_llm_provider = "openrouter" + ## openrouter + elif model in litellm.maritalk_models: + custom_llm_provider = "maritalk" ## vertex - text + chat models - elif model in litellm.vertex_chat_models or model in litellm.vertex_text_models: + elif( + model in litellm.vertex_chat_models or + model in litellm.vertex_code_chat_models or + model in litellm.vertex_text_models or + model in litellm.vertex_code_text_models + ): custom_llm_provider = "vertex_ai" ## ai21 elif model in litellm.ai21_models: @@ -1637,11 +1808,13 @@ def get_llm_provider(model: str, custom_llm_provider: Optional[str] = None, api_ # cohere embeddings elif model in litellm.cohere_embedding_models: custom_llm_provider = "cohere" + elif model in litellm.bedrock_embedding_models: + custom_llm_provider = "bedrock" if custom_llm_provider is None or custom_llm_provider=="": - print() - print("\033[1;31mProvider List: https://docs.litellm.ai/docs/providers\033[0m") - print() + print() # noqa + print("\033[1;31mProvider List: https://docs.litellm.ai/docs/providers\033[0m") # noqa + print() # noqa raise ValueError(f"LLM Provider NOT provided. Pass in the LLM provider you are trying to call. E.g. For 'Huggingface' inference endpoints pass in `completion(model='huggingface/{model}',..)` Learn more: https://docs.litellm.ai/docs/providers") return model, custom_llm_provider, dynamic_api_key, api_base except Exception as e: @@ -1915,18 +2088,32 @@ def load_test_model( } def validate_environment(model: Optional[str]=None) -> dict: + """ + Checks if the environment variables are valid for the given model. + + Args: + model (Optional[str]): The name of the model. Defaults to None. + + Returns: + dict: A dictionary containing the following keys: + - keys_in_environment (bool): True if all the required keys are present in the environment, False otherwise. + - missing_keys (List[str]): A list of missing keys in the environment. 
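+     Example (illustrative sketch — the intended result per the description above, assuming OPENAI_API_KEY is not set in the environment):
+         >>> validate_environment(model="gpt-3.5-turbo")
+         {'keys_in_environment': False, 'missing_keys': ['OPENAI_API_KEY']}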
+ """ keys_in_environment = False missing_keys: List[str] = [] if model is None: return {"keys_in_environment": keys_in_environment, "missing_keys": missing_keys} ## EXTRACT LLM PROVIDER - if model name provided - custom_llm_provider = None - # check if llm provider part of model name - if model.split("/",1)[0] in litellm.provider_list: - custom_llm_provider = model.split("/", 1)[0] - model = model.split("/", 1)[1] - custom_llm_provider_passed_in = True + try: + custom_llm_provider = get_llm_provider(model=model) + except: + custom_llm_provider = None + # # check if llm provider part of model name + # if model.split("/",1)[0] in litellm.provider_list: + # custom_llm_provider = model.split("/", 1)[0] + # model = model.split("/", 1)[1] + # custom_llm_provider_passed_in = True if custom_llm_provider: if custom_llm_provider == "openai": @@ -1997,6 +2184,12 @@ def validate_environment(model: Optional[str]=None) -> dict: keys_in_environment = True else: missing_keys.append("NLP_CLOUD_API_KEY") + elif custom_llm_provider == "bedrock": + if "AWS_ACCESS_KEY_ID" in os.environ and "AWS_SECRET_ACCESS_KEY" in os.environ: + keys_in_environment = True + else: + missing_keys.append("AWS_ACCESS_KEY_ID") + missing_keys.append("AWS_SECRET_ACCESS_KEY") else: ## openai - chatcompletion + text completion if model in litellm.open_ai_chat_completion_models or litellm.open_ai_text_completion_models: @@ -2094,7 +2287,7 @@ def set_callbacks(callback_list, function_id=None): else "1.0" ) sentry_sdk_instance.init( - dsn=os.environ.get("SENTRY_API_URL"), + dsn=os.environ.get("SENTRY_DSN"), traces_sample_rate=float(sentry_trace_rate), ) capture_exception = sentry_sdk_instance.capture_exception @@ -2491,10 +2684,17 @@ def valid_model(model): except: raise InvalidRequestError(message="", model=model, llm_provider="") -# check valid api key def check_valid_key(model: str, api_key: str): - # returns True if key is valid for the model - # returns False if key is invalid for the model + """ + Checks if a given API key is valid for a specific model by making a litellm.completion call with max_tokens=10 + + Args: + model (str): The name of the model to check the API key against. + api_key (str): The API key to be checked. + + Returns: + bool: True if the API key is valid for the model, False otherwise. 
+ """ messages = [{"role": "user", "content": "Hey, how's it going?"}] try: litellm.completion(model=model, messages=messages, api_key=api_key, max_tokens=10) @@ -2608,7 +2808,7 @@ def get_all_keys(llm_provider=None): def get_model_list(): - global last_fetched_at + global last_fetched_at, print_verbose try: # if user is using hosted product -> get their updated model list user_email = ( @@ -2620,7 +2820,7 @@ def get_model_list(): if user_email: # make the api call last_fetched_at = time.time() - print(f"last_fetched_at: {last_fetched_at}") + print_verbose(f"last_fetched_at: {last_fetched_at}") response = requests.post( url="http://api.litellm.ai/get_model_list", headers={"content-type": "application/json"}, @@ -2655,10 +2855,11 @@ def exception_type( ): global user_logger_fn, liteDebuggerClient exception_mapping_worked = False - print() - print("\033[1;31mGive Feedback / Get Help: https://github.com/BerriAI/litellm/issues/new\033[0m") - print("LiteLLM.Info: If you need to debug this error, use `litellm.set_verbose=True'.") - print() + if litellm.suppress_debug_info is False: + print() # noqa + print("\033[1;31mGive Feedback / Get Help: https://github.com/BerriAI/litellm/issues/new\033[0m") # noqa + print("LiteLLM.Info: If you need to debug this error, use `litellm.set_verbose=True'.") # noqa + print() # noqa try: if isinstance(original_exception, OriginalError): # Handle the OpenAIError @@ -2836,27 +3037,49 @@ def exception_type( model=model ) elif custom_llm_provider == "bedrock": - if "Unable to locate credentials" in error_str: + if "too many tokens" in error_str or "expected maxLength:" in error_str or "Input is too long" in error_str or "Too many input tokens" in error_str: + exception_mapping_worked = True + raise ContextWindowExceededError( + message=f"BedrockException: Context Window Error - {error_str}", + model=model, + llm_provider="bedrock" + ) + if "Malformed input request" in error_str: exception_mapping_worked = True raise InvalidRequestError( message=f"BedrockException - {error_str}", model=model, llm_provider="bedrock" ) - if "The security token included in the request is invalid" in error_str: + if "Unable to locate credentials" in error_str or "The security token included in the request is invalid" in error_str: exception_mapping_worked = True raise AuthenticationError( message=f"BedrockException Invalid Authentication - {error_str}", model=model, llm_provider="bedrock" ) - if "throttlingException" in error_str: + if "throttlingException" in error_str or "ThrottlingException" in error_str: exception_mapping_worked = True raise RateLimitError( message=f"BedrockException: Rate Limit Error - {error_str}", model=model, llm_provider="bedrock" ) + if hasattr(original_exception, "status_code"): + if original_exception.status_code == 500: + exception_mapping_worked = True + raise ServiceUnavailableError( + message=f"BedrockException - {original_exception.message}", + llm_provider="bedrock", + model=model + ) + elif original_exception.status_code == 401: + exception_mapping_worked = True + raise AuthenticationError( + message=f"BedrockException - {original_exception.message}", + llm_provider="bedrock", + model=model + ) elif custom_llm_provider == "sagemaker": if "Unable to locate credentials" in error_str: exception_mapping_worked = True @@ -3214,7 +3437,7 @@ def exception_type( model=model ) elif hasattr(original_exception, "status_code"): - print(f"status code: {original_exception.status_code}") + print_verbose(f"status code: {original_exception.status_code}") if 
original_exception.status_code == 401: exception_mapping_worked = True raise AuthenticationError( @@ -3283,7 +3506,7 @@ def exception_type( elif custom_llm_provider == "ollama": if "no attribute 'async_get_ollama_response_stream" in error_str: raise ImportError("Import error - trying to use async for ollama. import async_generator failed. Try 'pip install async_generator'") - elif custom_llm_provider == "custom_openai": + elif custom_llm_provider == "custom_openai" or custom_llm_provider == "maritalk": if hasattr(original_exception, "status_code"): exception_mapping_worked = True if original_exception.status_code == 401: @@ -3418,7 +3641,7 @@ def litellm_telemetry(data): } # Make the POST request to litellm logging api response = requests.post( - "https://litellm.berri.ai/logging", + "https://litellm-logging.onrender.com/logging", headers={"Content-Type": "application/json"}, json=payload, ) @@ -3545,6 +3768,17 @@ class CustomStreamWrapper: except: raise ValueError(f"Unable to parse response. Original response: {chunk}") + def handle_maritalk_chunk(self, chunk): # fake streaming + chunk = chunk.decode("utf-8") + data_json = json.loads(chunk) + try: + text = data_json["answer"] + is_finished = True + finish_reason = "stop" + return {"text": text, "is_finished": is_finished, "finish_reason": finish_reason} + except: + raise ValueError(f"Unable to parse response. Original response: {chunk}") + def handle_nlp_cloud_chunk(self, chunk): chunk = chunk.decode("utf-8") data_json = json.loads(chunk) @@ -3608,9 +3842,15 @@ class CustomStreamWrapper: text = "" is_finished = False finish_reason = None - if str_line.startswith("data:"): + if str_line == "data: [DONE]": + # anyscale returns a [DONE] special char for streaming, this cannot be json loaded. This is the end of stream + text = "" + is_finished = True + finish_reason = "stop" + return {"text": text, "is_finished": is_finished, "finish_reason": finish_reason} + elif str_line.startswith("data:"): data_json = json.loads(str_line[5:]) - print(f"delta content: {data_json['choices'][0]['delta']}") + print_verbose(f"delta content: {data_json['choices'][0]['delta']}") text = data_json["choices"][0]["delta"].get("content", "") if data_json["choices"][0].get("finish_reason", None): is_finished = True @@ -3682,6 +3922,14 @@ class CustomStreamWrapper: if stop_reason != None: is_finished = True finish_reason = stop_reason + ######## bedrock.cohere mappings ############### + # cohere mapping + elif "text" in chunk_data: + text = chunk_data["text"] # bedrock.cohere + # cohere mapping for finish reason + elif "finish_reason" in chunk_data: + finish_reason = chunk_data["finish_reason"] + is_finished = True elif chunk_data.get("completionReason", None): is_finished = True finish_reason = chunk_data["completionReason"] @@ -3731,6 +3979,12 @@ class CustomStreamWrapper: completion_obj["content"] = response_obj["text"] if response_obj["is_finished"]: model_response.choices[0].finish_reason = response_obj["finish_reason"] + elif self.custom_llm_provider and self.custom_llm_provider == "maritalk": + chunk = next(self.completion_stream) + response_obj = self.handle_maritalk_chunk(chunk) + completion_obj["content"] = response_obj["text"] + if response_obj["is_finished"]: + model_response.choices[0].finish_reason = response_obj["finish_reason"] elif self.custom_llm_provider and self.custom_llm_provider == "vllm": chunk = next(self.completion_stream) completion_obj["content"] = chunk[0].outputs[0].text @@ -3826,7 +4080,7 @@ class CustomStreamWrapper: chunk = 
next(self.completion_stream) response_obj = self.handle_custom_openai_chat_completion_chunk(chunk) completion_obj["content"] = response_obj["text"] - print(f"completion obj content: {completion_obj['content']}") + print_verbose(f"completion obj content: {completion_obj['content']}") if response_obj["is_finished"]: model_response.choices[0].finish_reason = response_obj["finish_reason"] else: # openai chat/azure models @@ -3889,6 +4143,34 @@ def read_config_args(config_path) -> dict: ########## experimental completion variants ############################ def completion_with_config(config: Union[dict, str], **kwargs): + """ + Generate a litellm.completion() using a config dict and all supported completion args + + Example config; + config = { + "default_fallback_models": # [Optional] List of model names to try if a call fails + "available_models": # [Optional] List of all possible models you could call + "adapt_to_prompt_size": # [Optional] True/False - if you want to select model based on prompt size (will pick from available_models) + "model": { + "model-name": { + "needs_moderation": # [Optional] True/False - if you want to call openai moderations endpoint before making completion call. Will raise exception, if flagged. + "error_handling": { + "error-type": { # One of the errors listed here - https://docs.litellm.ai/docs/exception_mapping#custom-mapping-list + "fallback_model": "" # str, name of the model it should try instead, when that error occurs + } + } + } + } + } + + Parameters: + config (Union[dict, str]): A configuration for litellm + **kwargs: Additional keyword arguments for litellm.completion + + Returns: + litellm.ModelResponse: A ModelResponse with the generated completion + + """ if config is not None: if isinstance(config, str): config = read_config_args(config) @@ -3979,77 +4261,7 @@ def completion_with_config(config: Union[dict, str], **kwargs): return completion_with_fallbacks(model=model, messages=messages, fallbacks=fallback_models) raise e - - -def get_model_split_test(models, completion_call_id): - global last_fetched_at - try: - # make the api call - last_fetched_at = time.time() - response = requests.post( - #http://api.litellm.ai - url="http://api.litellm.ai/get_model_split_test", # get the updated dict from table or update the table with the dict - headers={"content-type": "application/json"}, - data=json.dumps({"completion_call_id": completion_call_id, "models": models}), - ) - print_verbose(f"get_model_list response: {response.text}") - data = response.json() - # update model list - split_test_models = data["split_test_models"] - model_configs = data.get("model_configs", {}) - # update environment - if required - threading.Thread(target=get_all_keys, args=()).start() - return split_test_models, model_configs - except: - print_verbose( - f"[Non-Blocking Error] get_all_keys error - {traceback.format_exc()}" - ) - - -def completion_with_split_tests(models={}, messages=[], use_client=False, override_client=False, **kwargs): - """ - Example Usage: - - models = { - "gpt-4": 0.7, - "huggingface/wizard-coder": 0.3 - } - messages = [{ "content": "Hello, how are you?","role": "user"}] - completion_with_split_tests(models=models, messages=messages) - """ - import random - model_configs = {} - if use_client and not override_client: - if "id" not in kwargs or kwargs["id"] is None: - kwargs["id"] = str(uuid.uuid4()) - #raise ValueError("Please tag this completion call, if you'd like to update it's split test values through the UI. - eg. 
`completion_with_split_tests(.., id=1234)`.") - # get the most recent model split list from server - models, model_configs = get_model_split_test(models=models, completion_call_id=kwargs["id"]) - - try: - selected_llm = random.choices(list(models.keys()), weights=list(models.values()))[0] - except: - traceback.print_exc() - raise ValueError("""models does not follow the required format - {'model_name': 'split_percentage'}, e.g. {'gpt-4': 0.7, 'huggingface/wizard-coder': 0.3}""") - - # use dynamic model configs if users set - if model_configs!={}: - selected_model_configs = model_configs.get(selected_llm, {}) - if "prompt" in selected_model_configs: # special case, add this to messages as system prompt - messages.append({"role": "system", "content": selected_model_configs["prompt"]}) - selected_model_configs.pop("prompt") - for param_name in selected_model_configs: - if param_name == "temperature": - kwargs[param_name] = float(selected_model_configs[param_name]) - elif param_name == "max_tokens": - kwargs[param_name] = int(selected_model_configs[param_name]) - else: - kwargs[param_name] = selected_model_configs[param_name] - - return litellm.completion(model=selected_llm, messages=messages, use_client=use_client, **kwargs) - def completion_with_fallbacks(**kwargs): - print(f"kwargs inside completion_with_fallbacks: {kwargs}") nested_kwargs = kwargs.pop("kwargs", {}) response = None rate_limited_models = set() @@ -4059,7 +4271,9 @@ def completion_with_fallbacks(**kwargs): fallbacks = [kwargs["model"]] + nested_kwargs.get("fallbacks", []) if "fallbacks" in nested_kwargs: del nested_kwargs["fallbacks"] # remove fallbacks so it's not recursive + litellm_call_id = str(uuid.uuid4()) + # max time to process a request with fallbacks: default 45s while response == None and time.time() - start_time < 45: for model in fallbacks: # loop thru all models @@ -4068,8 +4282,7 @@ def completion_with_fallbacks(**kwargs): if isinstance(model, dict): # completion(model="gpt-4", fallbacks=[{"api_key": "", "api_base": ""}, {"api_key": "", "api_base": ""}]) kwargs["api_key"] = model.get("api_key", None) kwargs["api_base"] = model.get("api_base", None) - model = original_model - print(f"switched api keys") + model = model.get("model", original_model) elif ( model in rate_limited_models ): # check if model is currently cooling down @@ -4087,20 +4300,20 @@ def completion_with_fallbacks(**kwargs): if kwargs.get("model"): del kwargs["model"] - print(f"trying to make completion call with model: {model}") + print_verbose(f"trying to make completion call with model: {model}") + kwargs["litellm_call_id"] = litellm_call_id kwargs = {**kwargs, **nested_kwargs} # combine the openai + litellm params at the same level response = litellm.completion(**kwargs, model=model) - print(f"response: {response}") + print_verbose(f"response: {response}") if response != None: return response except Exception as e: - print(e) + print_verbose(e) rate_limited_models.add(model) model_expiration_times[model] = ( time.time() + 60 ) # cool down this selected model - # print(f"rate_limited_models {rate_limited_models}") pass return response @@ -4197,7 +4410,6 @@ def trim_messages( Args: messages: Input messages to be trimmed. Each message is a dictionary with 'role' and 'content'. model: The LiteLLM model being used (determines the token limit). - system_message: Optional system message to preserve at the start of the conversation. trim_ratio: Target ratio of tokens to use after trimming. 
Default is 0.75, meaning it will trim messages so they use about 75% of the model's token limit. return_response_tokens: If True, also return the number of tokens left available for the response after trimming. max_tokens: Instead of specifying a model or trim_ratio, you can specify this directly. @@ -4246,11 +4458,19 @@ def trim_messages( return final_messages except Exception as e: # [NON-Blocking, if error occurs just return final_messages - print("Got exception while token trimming", e) + print_verbose(f"Got exception while token trimming{e}") return messages -# this helper reads the .env and returns a list of supported llms for user def get_valid_models(): + """ + Returns a list of valid LLMs based on the set environment variables + + Args: + None + + Returns: + A list of valid LLMs + """ try: # get keys set in .env environ_keys = os.environ.keys() @@ -4279,435 +4499,70 @@ def get_valid_models(): except: return [] # NON-Blocking +# used for litellm.text_completion() to transform HF logprobs to OpenAI.Completion() format +def transform_logprobs(hf_response): + # Initialize an empty list for the transformed logprobs + transformed_logprobs = [] -############################# BATCH COMPLETION with Rate Limit Throttling ####################### -@dataclass -class StatusTracker: - """Stores metadata about the script's progress. Only one instance is created.""" + # For each Hugging Face response, transform the logprobs + for response in hf_response: + # Extract the relevant information from the response + response_details = response['details'] + top_tokens = response_details.get("top_tokens", {}) - num_tasks_started: int = 0 - num_tasks_in_progress: int = 0 # script ends when this reaches 0 - num_tasks_succeeded: int = 0 - num_tasks_failed: int = 0 - num_rate_limit_errors: int = 0 - num_api_errors: int = 0 # excluding rate limit errors, counted above - num_other_errors: int = 0 - time_of_last_rate_limit_error: int = 0 # used to cool off after hitting rate limits + # Initialize an empty list for the token information + token_info = { + 'tokens': [], + 'token_logprobs': [], + 'text_offset': [], + 'top_logprobs': [], + } + for i, token in enumerate(response_details['prefill']): + # Extract the text of the token + token_text = token['text'] -@dataclass -class APIRequest: - """Stores an API request's inputs, outputs, and other metadata. 
Contains a method to make an API call.""" + # Extract the logprob of the token + token_logprob = token['logprob'] - task_id: int - request_json: dict - token_consumption: int - attempts_left: int - metadata: dict - result: list = field(default_factory=list) + # Add the token information to the 'token_info' list + token_info['tokens'].append(token_text) + token_info['token_logprobs'].append(token_logprob) - async def call_api( - self, - request_header: dict, - retry_queue: asyncio.Queue, - status_tracker: StatusTracker, - save_filepath: str = "", - ): - """Calls the OpenAI API and saves results.""" - logging.info(f"Making API Call for request #{self.task_id} {self.request_json}") - error = None - try: - response = await litellm.acompletion( - **self.request_json - ) - logging.info(f"Completed request #{self.task_id}") - if save_filepath == "": # return respons - return response - # else this gets written to save_filepath - except Exception as e: - logging.warning( - f"Request {self.task_id} failed with error {e}" - ) - status_tracker.num_api_errors += 1 - error = e - print(f"got exception {e}") - if "Rate limit" in str(e): - status_tracker.time_of_last_rate_limit_error = int(time.time()) - status_tracker.num_rate_limit_errors += 1 - status_tracker.num_api_errors -= ( - 1 # rate limit errors are counted separately - ) + # stub this to work with llm eval harness + top_alt_tokens = { "": -1, "": -2, "": -3 } + token_info['top_logprobs'].append(top_alt_tokens) - if error: - self.result.append(error) - if self.attempts_left: - retry_queue.put_nowait(self) - else: - logging.error( - f"Request {self.request_json} failed after all attempts. Saving errors: {self.result}" - ) - data = ( - [self.request_json, [str(e) for e in self.result], self.metadata] - if self.metadata - else [self.request_json, [str(e) for e in self.result]] - ) - self.append_to_jsonl(data, save_filepath) - status_tracker.num_tasks_in_progress -= 1 - status_tracker.num_tasks_failed += 1 - else: - data = ( - [self.request_json, response, self.metadata] - if self.metadata - else [self.request_json, response] - ) - self.append_to_jsonl(data, save_filepath) - status_tracker.num_tasks_in_progress -= 1 - status_tracker.num_tasks_succeeded += 1 - logging.debug(f"Request {self.task_id} saved to {save_filepath}") - + # For each element in the 'tokens' list, extract the relevant information + for i, token in enumerate(response_details['tokens']): - def append_to_jsonl(self, data, filename: str) -> None: - """Append a json payload to the end of a jsonl file.""" - json_string = json.dumps(data) - with open(filename, "a") as f: - f.write(json_string + "\n") + # Extract the text of the token + token_text = token['text'] + # Extract the logprob of the token + token_logprob = token['logprob'] -class RateLimitManager(): - import uuid - def __init__(self, max_tokens_per_minute, max_requests_per_minute): - self.max_tokens_per_minute = max_tokens_per_minute - self.max_requests_per_minute = max_requests_per_minute - # print("init rate limit handler") - self.status_tracker = StatusTracker() - self.last_update_time = time.time() - self.available_request_capacity = max_requests_per_minute - self.available_token_capacity = max_tokens_per_minute - self.queue_of_requests_to_retry = asyncio.Queue() # type: ignore - self.task = 0 # for tracking ids for tasks - self.cooldown_time = 10 # time to cooldown between retries in seconds + top_alt_tokens = {} + temp_top_logprobs = [] + if top_tokens != {}: + temp_top_logprobs = top_tokens[i] - async def 
acompletion(self, max_attempts=5, **kwargs): - # Initialize logging - logging.basicConfig(level=logging.INFO) + # top_alt_tokens should look like this: { "alternative_1": -1, "alternative_2": -2, "alternative_3": -3 } + for elem in temp_top_logprobs: + text = elem["text"] + logprob = elem["logprob"] + top_alt_tokens[text] = logprob - # Initialize request - logging.info(f"Initializing API request for request id:{self.task}") - request = APIRequest( - task_id=self.task, - request_json=kwargs, - token_consumption=self.num_tokens_consumed_from_request(request_json=kwargs, token_encoding_name="cl100k_base"), - attempts_left=max_attempts, - metadata=kwargs.pop("metadata", None), - ) - self.task+=1 # added a new task to execute + # Add the token information to the 'token_info' list + token_info['tokens'].append(token_text) + token_info['token_logprobs'].append(token_logprob) + token_info['top_logprobs'].append(top_alt_tokens) - # Check and update current capacity for model - current_time = time.time() - seconds_since_update = current_time - self.last_update_time - - self.available_request_capacity = min( - self.available_request_capacity + self.max_requests_per_minute * seconds_since_update / 60.0, - self.max_requests_per_minute, - ) + # Add the text offset of the token + # This is computed as the sum of the lengths of all previous tokens + token_info['text_offset'].append(sum(len(t['text']) for t in response_details['tokens'][:i])) - self.available_token_capacity = min( - self.available_token_capacity + self.max_tokens_per_minute * seconds_since_update / 60.0, - self.max_tokens_per_minute, - ) + # Add the 'token_info' list to the 'transformed_logprobs' list + transformed_logprobs = token_info - self.last_update_time = current_time - - request_tokens = request.token_consumption - logging.debug("Request tokens: " + str(request_tokens)) - - queue_of_requests_to_retry = asyncio.Queue() - - if (self.available_request_capacity >= 1 and self.available_token_capacity >= request_tokens): - - # Update counters - self.available_request_capacity -= 1 - self.available_token_capacity -= request_tokens - request.attempts_left -= 1 - - # Call API and log final status - logging.info(f"""Running Request {request.task_id}, using tokens: {request.token_consumption}. Remaining available tokens: {self.available_token_capacity}""") - - result = await request.call_api( - request_header={}, - retry_queue=queue_of_requests_to_retry, - save_filepath="", - status_tracker=self.status_tracker, - ) - return result - else: - logging.info(f"OVER CAPACITY for {request.task_id}. retrying {request.attempts_left} times") - while request.attempts_left >= 0: - # Sleep for a minute to allow for capacity - logging.info(f"OVER CAPACITY for {request.task_id}. 
Cooling down for 60 seconds, retrying {request.attempts_left} times") - await asyncio.sleep(self.cooldown_time) - - # Check capacity - current_time = time.time() - seconds_since_update = current_time - self.last_update_time - - self.available_request_capacity = min( - self.available_request_capacity + self.max_requests_per_minute * seconds_since_update / 60.0, - self.max_requests_per_minute, - ) - - self.available_token_capacity = min( - self.available_token_capacity + self.max_tokens_per_minute * seconds_since_update / 60.0, - self.max_tokens_per_minute, - ) - - self.last_update_time = current_time - - request_tokens = request.token_consumption - - if self.available_request_capacity >= 1 and self.available_token_capacity >= request_tokens: - logging.info("Available token capacity available.") - - # Update counters - self.available_request_capacity -= 1 - self.available_token_capacity -= request_tokens - request.attempts_left -= 1 - - # Call API and log final status - logging.info(f"""Running Request {request.task_id}, using tokens: {request.token_consumption}. Remaining available tokens: {self.available_token_capacity}""") - - result = await request.call_api( - request_header={}, - retry_queue=queue_of_requests_to_retry, - save_filepath="", - status_tracker=self.status_tracker, - ) - return result - - logging.warning(f"Request {request.task_id} is still over capacity. Number of retry attempts left: {request.attempts_left}") - request.attempts_left -=1 - - async def batch_completion( - self, - requests_filepath: str = "", - jobs: list = [], - save_filepath: Optional[str] = None, - api_key: Optional[str] = os.getenv("OPENAI_API_KEY"), - max_requests_per_minute: float = 3_000 * 0.5, - max_tokens_per_minute: float = 250_000 * 0.5, - token_encoding_name: str = "cl100k_base", - max_attempts: int = 5, - logging_level: int = logging.INFO, - ): - - if save_filepath == None: - save_filepath = "litellm_results.jsonl" - print("running batch completion") - - # constants - seconds_to_pause_after_rate_limit_error = 15 - seconds_to_sleep_each_loop = ( - 0.001 # 1 ms limits max throughput to 1,000 requests per second - ) - - # initialize logging - logging.basicConfig(level=logging_level) - logging.debug(f"Logging initialized at level {logging_level}") - - # infer API endpoint and construct request header - - request_header = {"Authorization": f"Bearer {api_key}"} - - # initialize trackers - queue_of_requests_to_retry = asyncio.Queue() # type: ignore - task_id_generator = ( - self.task_id_generator_function() - ) # generates integer IDs of 1, 2, 3, ... 
- status_tracker = ( - StatusTracker() - ) # single instance to track a collection of variables - next_request = None # variable to hold the next request to call - - # initialize available capacity counts - available_request_capacity = max_requests_per_minute - available_token_capacity = max_tokens_per_minute - last_update_time = time.time() - - # initialize flags - file_not_finished = True # after file is empty, we'll skip reading it - logging.debug(f"Initialization complete.") - - requests = iter(jobs) - - while True: - # get next request (if one is not already waiting for capacity) - if next_request is None: - if not queue_of_requests_to_retry.empty(): - next_request = queue_of_requests_to_retry.get_nowait() - logging.debug( - f"Retrying request {next_request.task_id}: {next_request}" - ) - elif file_not_finished: - try: - # get new request - request_json = next(requests) - if "api_key" not in request_json: - request_json["api_key"] = api_key - # print("CREATING API REQUEST") - next_request = APIRequest( - task_id=next(task_id_generator), - request_json=request_json, - token_consumption=self.num_tokens_consumed_from_request( - request_json, token_encoding_name - ), - attempts_left=max_attempts, - metadata=request_json.pop("metadata", None), - ) - # print("AFTER INIT API REQUEST") - status_tracker.num_tasks_started += 1 - status_tracker.num_tasks_in_progress += 1 - logging.debug( - f"Reading request {next_request.task_id}: {next_request}" - ) - except: - logging.debug("Jobs finished") - file_not_finished = False - - # update available capacity - current_time = time.time() - seconds_since_update = current_time - last_update_time - available_request_capacity = min( - available_request_capacity - + max_requests_per_minute * seconds_since_update / 60.0, - max_requests_per_minute, - ) - available_token_capacity = min( - available_token_capacity - + max_tokens_per_minute * seconds_since_update / 60.0, - max_tokens_per_minute, - ) - last_update_time = current_time - - # if enough capacity available, call API - if next_request: - next_request_tokens = next_request.token_consumption - if ( - available_request_capacity >= 1 - and available_token_capacity >= next_request_tokens - ): - # update counters - available_request_capacity -= 1 - available_token_capacity -= next_request_tokens - next_request.attempts_left -= 1 - - # call API - # after finishing, log final status - logging.info( - f"""Running Request {next_request.task_id}, using tokens: {next_request.token_consumption} remaining available tokens: {available_token_capacity}""" - ) - next_request.task_id - - asyncio.create_task( - next_request.call_api( - request_header=request_header, - retry_queue=queue_of_requests_to_retry, - save_filepath=save_filepath, - status_tracker=status_tracker, - ) - ) - next_request = None # reset next_request to empty - - # if all tasks are finished, break - if status_tracker.num_tasks_in_progress == 0: - break - - # main loop sleeps briefly so concurrent tasks can run - await asyncio.sleep(seconds_to_sleep_each_loop) - - # if a rate limit error was hit recently, pause to cool down - seconds_since_rate_limit_error = ( - time.time() - status_tracker.time_of_last_rate_limit_error - ) - if ( - seconds_since_rate_limit_error - < seconds_to_pause_after_rate_limit_error - ): - remaining_seconds_to_pause = ( - seconds_to_pause_after_rate_limit_error - - seconds_since_rate_limit_error - ) - await asyncio.sleep(remaining_seconds_to_pause) - # ^e.g., if pause is 15 seconds and final limit was hit 5 seconds ago - 
logging.warn( - f"Pausing to cool down until {time.ctime(status_tracker.time_of_last_rate_limit_error + seconds_to_pause_after_rate_limit_error)}" - ) - - # after finishing, log final status - logging.info( - f"""Parallel processing complete. Results saved to {save_filepath}""" - ) - if status_tracker.num_tasks_failed > 0: - logging.warning( - f"{status_tracker.num_tasks_failed} / {status_tracker.num_tasks_started} requests failed. Errors logged to {save_filepath}." - ) - if status_tracker.num_rate_limit_errors > 0: - logging.warning( - f"{status_tracker.num_rate_limit_errors} rate limit errors received. Consider running at a lower rate." - ) - - - # dataclasses - - - - - - def num_tokens_consumed_from_request( - self, - request_json: dict, - token_encoding_name: str, - ): - """Count the number of tokens in the request. Only supports completion and embedding requests.""" - encoding = tiktoken.get_encoding(token_encoding_name) - # if completions request, tokens = prompt + n * max_tokens - - max_tokens = request_json.get("max_tokens", 15) - n = request_json.get("n", 1) - completion_tokens = n * max_tokens - - - num_tokens = 0 - for message in request_json["messages"]: - num_tokens += 4 # every message follows {role/name}\n{content}\n - for key, value in message.items(): - num_tokens += len(encoding.encode(value)) - if key == "name": # if there's a name, the role is omitted - num_tokens -= 1 # role is always required and always 1 token - num_tokens += 2 # every reply is primed with assistant - return num_tokens + completion_tokens - - def task_id_generator_function(self): - """Generate integers 0, 1, 2, and so on.""" - task_id = 0 - while True: - yield task_id - task_id += 1 - - -###### USAGE ################ -# jobs = [ -# {"model": "gpt-4", "messages": [{"content": "Please provide a summary of the latest scientific discoveries."*500, "role": "user"}]}, -# {"model": "gpt-4", "messages": [{"content": "Please provide a summary of the latest scientific discoveries."*800, "role": "user"}]}, -# {"model": "gpt-4", "messages": [{"content": "Please provide a summary of the latest scientific discoveries."*900, "role": "user"}]}, -# {"model": "gpt-4", "messages": [{"content": "Please provide a summary of the latest scientific discoveries."*900, "role": "user"}]}, -# {"model": "gpt-4", "messages": [{"content": "Please provide a summary of the latest scientific discoveries."*900, "role": "user"}]} -# ] - -# asyncio.run( -# batch_completion_rate_limits( -# jobs = jobs, -# api_key="", -# max_requests_per_minute=60, -# max_tokens_per_minute=40000 -# ) -# ) \ No newline at end of file + return transformed_logprobs \ No newline at end of file diff --git a/litellm_server/.env.template b/litellm_server/.env.template index 280c38912..a87ae9cf3 100644 --- a/litellm_server/.env.template +++ b/litellm_server/.env.template @@ -21,6 +21,9 @@ ANTHROPIC_API_KEY = "" COHERE_API_KEY = "" +## CONFIG FILE ## +# CONFIG_FILE_PATH = "" # uncomment to point to config file + ## LOGGING ## SET_VERBOSE = "False" # set to 'True' to see detailed input/output logs diff --git a/litellm_server/Dockerfile b/litellm_server/Dockerfile index 70d12a253..7be7ba4c9 100644 --- a/litellm_server/Dockerfile +++ b/litellm_server/Dockerfile @@ -7,4 +7,4 @@ RUN pip install -r requirements.txt EXPOSE $PORT -CMD exec uvicorn main:app --host 0.0.0.0 --port $PORT \ No newline at end of file +CMD exec uvicorn main:app --host 0.0.0.0 --port $PORT --workers 10 \ No newline at end of file diff --git a/litellm_server/README.md b/litellm_server/README.md 
index 2eb586d22..881b7de9f 100644 --- a/litellm_server/README.md +++ b/litellm_server/README.md @@ -2,32 +2,33 @@ A simple, fast, and lightweight **OpenAI-compatible server** to call 100+ LLM APIs. -


+
+LiteLLM Server supports:
+- LLM API Calls in the OpenAI ChatCompletions format
+- Caching + Logging capabilities (Redis and Langfuse, respectively)
+- Setting API keys in the request headers or in the `.env` file
## Usage
```shell
-docker run -e PORT=8000 -p 8000:8000 ghcr.io/berriai/litellm:latest
-
-# UVICORN: OpenAI Proxy running on http://0.0.0.0:8000
+docker run -e PORT=8000 -e OPENAI_API_KEY= -p 8000:8000 ghcr.io/berriai/litellm:latest
```
+OpenAI Proxy running on http://0.0.0.0:8000
-## Endpoints:
-- `/chat/completions` - chat completions endpoint to call 100+ LLMs
-- `/router/completions` - for multiple deployments of the same model (e.g. Azure OpenAI), uses the least used deployment. [Learn more](https://docs.litellm.ai/docs/routing)
-- `/models` - available models on server
-
-## Making Requests to Proxy
-### Curl
-
-**Call OpenAI**
```shell
curl http://0.0.0.0:8000/v1/chat/completions \
  -H "Content-Type: application/json" \
@@ -37,18 +38,80 @@ curl http://0.0.0.0:8000/v1/chat/completions \
    "temperature": 0.7
  }'
```
-**Call Bedrock**
+
+[**See how to call Huggingface, Bedrock, TogetherAI, Anthropic, etc.**](https://docs.litellm.ai/docs/providers)
+## Endpoints:
+- `/chat/completions` - chat completions endpoint to call 100+ LLMs
+- `/models` - available models on server
+
+## Save Model-specific params (API Base, API Keys, Temperature, etc.)
+Use the [router_config_template.yaml](https://github.com/BerriAI/litellm/blob/main/router_config_template.yaml) to save model-specific information like api_base, api_key, temperature, max_tokens, etc.
+
+1. Create a `config.yaml` file
```shell
-curl http://0.0.0.0:8000/v1/chat/completions \
-  -H "Content-Type: application/json" \
-  -d '{
-  "model": "bedrock/anthropic.claude-instant-v1",
-  "messages": [{"role": "user", "content": "Say this is a test!"}],
-  "temperature": 0.7
-  }'
+model_list:
+  - model_name: gpt-3.5-turbo # set model alias
+    litellm_params: # params for litellm.completion() - https://docs.litellm.ai/docs/completion/input#input---request-body
+      model: azure/chatgpt-v-2 # azure/ <- actual name used for litellm.completion()
+      api_key: your_azure_api_key
+      api_version: your_azure_api_version
+      api_base: your_azure_api_base
+  - model_name: mistral-7b
+    litellm_params:
+      model: ollama/mistral
+      api_base: your_ollama_api_base
```
-### Running Locally
+2. Start the server
+
+```shell
+docker run -e PORT=8000 -p 8000:8000 -v $(pwd)/config.yaml:/app/config.yaml ghcr.io/berriai/litellm:latest
+```
+## Caching
+
+Add Redis Caching to your server via environment variables
+
+```env
+### REDIS
+REDIS_HOST = ""
+REDIS_PORT = ""
+REDIS_PASSWORD = ""
+```
+
+Docker command:
+
+```shell
+docker run -e REDIS_HOST= -e REDIS_PORT= -e REDIS_PASSWORD= -e PORT=8000 -p 8000:8000 ghcr.io/berriai/litellm:latest
+```
+
+## Logging
+
+1. Debug Logs
+Print the input/output params by setting `SET_VERBOSE = "True"`.
+ +Docker command: + +```shell +docker run -e SET_VERBOSE="True" -e PORT=8000 -p 8000:8000 ghcr.io/berriai/litellm:latest +``` + +Add Langfuse Logging to your server via environment variables + +```env +### LANGFUSE +LANGFUSE_PUBLIC_KEY = "" +LANGFUSE_SECRET_KEY = "" +# Optional, defaults to https://cloud.langfuse.com +LANGFUSE_HOST = "" # optional +``` + +Docker command: + +```shell +docker run -e LANGFUSE_PUBLIC_KEY= -e LANGFUSE_SECRET_KEY= -e LANGFUSE_HOST= -e PORT=8000 -p 8000:8000 ghcr.io/berriai/litellm:latest +``` + +## Running Locally ```shell $ git clone https://github.com/BerriAI/litellm.git ``` @@ -59,5 +122,16 @@ $ cd ./litellm/litellm_server ```shell $ uvicorn main:app --host 0.0.0.0 --port 8000 ``` - -[**See how to call Huggingface,Bedrock,TogetherAI,Anthropic, etc.**](https://docs.litellm.ai/docs/simple_proxy) +### Custom Config +1. Create + Modify [router_config.yaml](https://github.com/BerriAI/litellm/blob/main/router_config_template.yaml) (save your azure/openai/etc. deployment info) +```shell +cp ./router_config_template.yaml ./router_config.yaml +``` +2. Build Docker Image +```shell +docker build -t litellm_server . --build-arg CONFIG_FILE=./router_config.yaml +``` +3. Run Docker Image +```shell +docker run --name litellm_server -e PORT=8000 -p 8000:8000 litellm_server +``` diff --git a/litellm_server/config b/litellm_server/config deleted file mode 100644 index e69de29bb..000000000 diff --git a/litellm_server/main.py b/litellm_server/main.py index 4f2586b7a..c7b26b685 100644 --- a/litellm_server/main.py +++ b/litellm_server/main.py @@ -1,15 +1,19 @@ -import litellm, os, traceback +import os, traceback from fastapi import FastAPI, Request, HTTPException from fastapi.routing import APIRouter from fastapi.responses import StreamingResponse, FileResponse from fastapi.middleware.cors import CORSMiddleware -import json -import os +import json, sys from typing import Optional +sys.path.insert( + 0, os.path.abspath("../") +) # Adds the parent directory to the system path - for litellm local dev +import litellm +print(f"litellm: {litellm}") try: - from utils import set_callbacks, load_router_config + from utils import set_callbacks, load_router_config, print_verbose except ImportError: - from litellm_server.utils import set_callbacks, load_router_config + from litellm_server.utils import set_callbacks, load_router_config, print_verbose import dotenv dotenv.load_dotenv() # load env variables @@ -26,14 +30,23 @@ app.add_middleware( ) #### GLOBAL VARIABLES #### llm_router: Optional[litellm.Router] = None +llm_model_list: Optional[list] = None +server_settings: Optional[dict] = None set_callbacks() # sets litellm callbacks for logging if they exist in the environment -llm_router = load_router_config(router=llm_router) + +if "CONFIG_FILE_PATH" in os.environ: + print(f"CONFIG FILE DETECTED") + llm_router, llm_model_list, server_settings = load_router_config(router=llm_router, config_file_path=os.getenv("CONFIG_FILE_PATH")) +else: + llm_router, llm_model_list, server_settings = load_router_config(router=llm_router) #### API ENDPOINTS #### -@router.post("/v1/models") +@router.get("/v1/models") @router.get("/models") # if project requires model list def model_list(): all_models = litellm.utils.get_valid_models() + if llm_model_list: + all_models += llm_model_list return dict( data=[ { @@ -72,7 +85,7 @@ async def embedding(request: Request): # default to always using the "ENV" variables, only if AUTH_STRATEGY==DYNAMIC then reads headers if os.getenv("AUTH_STRATEGY", None) == "DYNAMIC" 
and "authorization" in request.headers: # if users pass LLM api keys as part of header api_key = request.headers.get("authorization") - api_key = api_key.replace("Bearer", "").strip() + api_key = api_key.replace("Bearer", "").strip() if len(api_key.strip()) > 0: api_key = api_key data["api_key"] = api_key @@ -87,27 +100,53 @@ async def embedding(request: Request): @router.post("/v1/chat/completions") @router.post("/chat/completions") -async def chat_completion(request: Request): +@router.post("/openai/deployments/{model:path}/chat/completions") # azure compatible endpoint +async def chat_completion(request: Request, model: Optional[str] = None): + global llm_model_list, server_settings try: data = await request.json() + print(f"data: {data}") + data["model"] = ( + server_settings.get("completion_model", None) # server default + or model # model passed in url + or data["model"] # default passed in + ) + ## CHECK KEYS ## # default to always using the "ENV" variables, only if AUTH_STRATEGY==DYNAMIC then reads headers - if os.getenv("AUTH_STRATEGY", None) == "DYNAMIC" and "authorization" in request.headers: # if users pass LLM api keys as part of header - api_key = request.headers.get("authorization") - api_key = api_key.replace("Bearer", "").strip() - if len(api_key.strip()) > 0: - api_key = api_key - data["api_key"] = api_key + # env_validation = litellm.validate_environment(model=data["model"]) + # if (env_validation['keys_in_environment'] is False or os.getenv("AUTH_STRATEGY", None) == "DYNAMIC") and ("authorization" in request.headers or "api-key" in request.headers): # if users pass LLM api keys as part of header + # if "authorization" in request.headers: + # api_key = request.headers.get("authorization") + # elif "api-key" in request.headers: + # api_key = request.headers.get("api-key") + # print(f"api_key in headers: {api_key}") + # if " " in api_key: + # api_key = api_key.split(" ")[1] + # print(f"api_key split: {api_key}") + # if len(api_key) > 0: + # api_key = api_key + # data["api_key"] = api_key + # print(f"api_key in data: {api_key}") + ## CHECK CONFIG ## + if llm_model_list and data["model"] in [m["model_name"] for m in llm_model_list]: + for m in llm_model_list: + if data["model"] == m["model_name"]: + for key, value in m["litellm_params"].items(): + data[key] = value + break response = litellm.completion( **data ) if 'stream' in data and data['stream'] == True: # use generate_responses to stream responses return StreamingResponse(data_generator(response), media_type='text/event-stream') + print(f"response: {response}") return response except Exception as e: error_traceback = traceback.format_exc() + print(f"{error_traceback}") error_msg = f"{str(e)}\n\n{error_traceback}" - return {"error": error_msg} - # raise HTTPException(status_code=500, detail=error_msg) + # return {"error": error_msg} + raise HTTPException(status_code=500, detail=error_msg) @router.post("/router/completions") async def router_completion(request: Request): @@ -157,4 +196,4 @@ async def home(request: Request): return "LiteLLM: RUNNING" -app.include_router(router) +app.include_router(router) \ No newline at end of file diff --git a/litellm_server/utils.py b/litellm_server/utils.py index 5cb1bd06a..ffaa64c91 100644 --- a/litellm_server/utils.py +++ b/litellm_server/utils.py @@ -1,5 +1,12 @@ import os, litellm import pkg_resources +import dotenv +dotenv.load_dotenv() # load env variables + +def print_verbose(print_statement): + print(f"SET_VERBOSE value: {os.environ['SET_VERBOSE']}") + if 
os.environ["SET_VERBOSE"] == "True": + print(print_statement) def get_package_version(package_name): try: @@ -36,26 +43,37 @@ def set_callbacks(): ## CACHING ### REDIS - if len(os.getenv("REDIS_HOST", "")) > 0 and len(os.getenv("REDIS_PORT", "")) > 0 and len(os.getenv("REDIS_PASSWORD", "")) > 0: - from litellm.caching import Cache - litellm.cache = Cache(type="redis", host=os.getenv("REDIS_HOST"), port=os.getenv("REDIS_PORT"), password=os.getenv("REDIS_PASSWORD")) - print("\033[92mLiteLLM: Switched on Redis caching\033[0m") + # if len(os.getenv("REDIS_HOST", "")) > 0 and len(os.getenv("REDIS_PORT", "")) > 0 and len(os.getenv("REDIS_PASSWORD", "")) > 0: + # print(f"redis host: {os.getenv('REDIS_HOST')}; redis port: {os.getenv('REDIS_PORT')}; password: {os.getenv('REDIS_PASSWORD')}") + # from litellm.caching import Cache + # litellm.cache = Cache(type="redis", host=os.getenv("REDIS_HOST"), port=os.getenv("REDIS_PORT"), password=os.getenv("REDIS_PASSWORD")) + # print("\033[92mLiteLLM: Switched on Redis caching\033[0m") -def load_router_config(router: Optional[litellm.Router]): +def load_router_config(router: Optional[litellm.Router], config_file_path: Optional[str]='/app/config.yaml'): config = {} - config_file = '/app/config.yaml' - + server_settings = {} try: - if os.path.exists(config_file): - with open(config_file, 'r') as file: + if os.path.exists(config_file_path): + with open(config_file_path, 'r') as file: config = yaml.safe_load(file) else: pass except: pass + ## SERVER SETTINGS (e.g. default completion model = 'ollama/mistral') + server_settings = config.get("server_settings", None) + if server_settings: + server_settings = server_settings + + ## LITELLM MODULE SETTINGS (e.g. litellm.drop_params=True,..) + litellm_settings = config.get('litellm_settings', None) + if litellm_settings: + for key, value in litellm_settings.items(): + setattr(litellm, key, value) + ## MODEL LIST model_list = config.get('model_list', None) if model_list: @@ -67,4 +85,4 @@ def load_router_config(router: Optional[litellm.Router]): for key, value in environment_variables.items(): os.environ[key] = value - return router + return router, model_list, server_settings diff --git a/model_prices_and_context_window.json b/model_prices_and_context_window.json index 84fc0b890..fe66b9f92 100644 --- a/model_prices_and_context_window.json +++ b/model_prices_and_context_window.json @@ -233,6 +233,48 @@ "litellm_provider": "vertex_ai-code-chat-models", "mode": "chat" }, + "palm/chat-bison": { + "max_tokens": 4096, + "input_cost_per_token": 0.000000125, + "output_cost_per_token": 0.000000125, + "litellm_provider": "palm", + "mode": "chat" + }, + "palm/chat-bison-001": { + "max_tokens": 4096, + "input_cost_per_token": 0.000000125, + "output_cost_per_token": 0.000000125, + "litellm_provider": "palm", + "mode": "chat" + }, + "palm/text-bison": { + "max_tokens": 8196, + "input_cost_per_token": 0.000000125, + "output_cost_per_token": 0.000000125, + "litellm_provider": "palm", + "mode": "completion" + }, + "palm/text-bison-001": { + "max_tokens": 8196, + "input_cost_per_token": 0.000000125, + "output_cost_per_token": 0.000000125, + "litellm_provider": "palm", + "mode": "completion" + }, + "palm/text-bison-safety-off": { + "max_tokens": 8196, + "input_cost_per_token": 0.000000125, + "output_cost_per_token": 0.000000125, + "litellm_provider": "palm", + "mode": "completion" + }, + "palm/text-bison-safety-recitation-off": { + "max_tokens": 8196, + "input_cost_per_token": 0.000000125, + "output_cost_per_token": 0.000000125, + 
"litellm_provider": "palm", + "mode": "completion" + }, "command-nightly": { "max_tokens": 4096, "input_cost_per_token": 0.000015, @@ -544,12 +586,12 @@ "output_cost_per_token": 0.0000004 }, "together-ai-20.1b-40b": { - "input_cost_per_token": 0.000001, - "output_cost_per_token": 0.000001 + "input_cost_per_token": 0.0000008, + "output_cost_per_token": 0.0000008 }, "together-ai-40.1b-70b": { - "input_cost_per_token": 0.000003, - "output_cost_per_token": 0.000003 + "input_cost_per_token": 0.000001, + "output_cost_per_token": 0.000001 }, "ollama/llama2": { "max_tokens": 4096, @@ -579,6 +621,34 @@ "litellm_provider": "ollama", "mode": "completion" }, + "ollama/mistral": { + "max_tokens": 8192, + "input_cost_per_token": 0.0, + "output_cost_per_token": 0.0, + "litellm_provider": "ollama", + "mode": "completion" + }, + "ollama/codellama": { + "max_tokens": 4096, + "input_cost_per_token": 0.0, + "output_cost_per_token": 0.0, + "litellm_provider": "ollama", + "mode": "completion" + }, + "ollama/orca-mini": { + "max_tokens": 4096, + "input_cost_per_token": 0.0, + "output_cost_per_token": 0.0, + "litellm_provider": "ollama", + "mode": "completion" + }, + "ollama/vicuna": { + "max_tokens": 2048, + "input_cost_per_token": 0.0, + "output_cost_per_token": 0.0, + "litellm_provider": "ollama", + "mode": "completion" + }, "deepinfra/meta-llama/Llama-2-70b-chat-hf": { "max_tokens": 4096, "input_cost_per_token": 0.000000700, diff --git a/pyproject.toml b/pyproject.toml index 3f53f9496..0c10050a0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "litellm" -version = "0.12.5" +version = "0.13.1" description = "Library to easily interface with LLM API providers" authors = ["BerriAI"] license = "MIT License" @@ -26,7 +26,7 @@ requires = ["poetry-core"] build-backend = "poetry.core.masonry.api" [tool.commitizen] -version = "0.12.5" +version = "0.13.1" version_files = [ "pyproject.toml:^version" ] diff --git a/router_config_template.yaml b/router_config_template.yaml index e548f9829..b6a8612a4 100644 --- a/router_config_template.yaml +++ b/router_config_template.yaml @@ -1,26 +1,28 @@ +# Global settings for the litellm module +litellm_settings: + drop_params: True + # failure_callbacks: ["sentry"] + +# Model-specific settings model_list: # refer to https://docs.litellm.ai/docs/routing - model_name: gpt-3.5-turbo - litellm_params: + litellm_params: # parameters for litellm.completion() model: azure/chatgpt-v-2 # azure/ api_key: your_azure_api_key api_version: your_azure_api_version api_base: your_azure_api_base - tpm: 240000 # REPLACE with your azure deployment tpm - rpm: 1800 # REPLACE with your azure deployment rpm - - model_name: gpt-3.5-turbo + tpm: 240000 # [OPTIONAL] To load balance between multiple deployments + rpm: 1800 # [OPTIONAL] To load balance between multiple deployments + - model_name: mistral litellm_params: - model: azure/chatgpt-functioncalling - api_key: your_azure_api_key - api_version: your_azure_api_version - api_base: your_azure_api_base - tpm: 240000 - rpm: 1800 + model: ollama/mistral + api_base: my_ollama_api_base - model_name: gpt-3.5-turbo litellm_params: model: gpt-3.5-turbo api_key: your_openai_api_key - tpm: 1000000 # REPLACE with your openai tpm - rpm: 9000 # REPLACE with your openai rpm + tpm: 1000000 # [OPTIONAL] REPLACE with your openai tpm + rpm: 9000 # [OPTIONAL] REPLACE with your openai rpm environment_variables: REDIS_HOST: your_redis_host