forked from phoenix/litellm-mirror

commit 067e5c3d91
Merge branch 'main' of github.com:kylehh/litellm into main

108 changed files with 4717 additions and 3989 deletions
@@ -36,6 +36,7 @@ jobs:
            pip install appdirs
            pip install langchain
            pip install numpydoc
            pip install traceloop-sdk==0.0.69
      - save_cache:
          paths:
            - ./venv
2  .flake8  Normal file
@@ -0,0 +1,2 @@
[flake8]
ignore = E,F,W,B,B9,C,D,I,N,S,W503,W504,E203, TCE,TCA,EXE999,E999,TD
1  .gitignore  vendored
@@ -13,3 +13,4 @@ litellm/proxy/litellm_secrets.toml
litellm/proxy/api_log.json
.idea/
router_config.yaml
litellm_server/config.yaml
8  .pre-commit-config.yaml  Normal file
@@ -0,0 +1,8 @@
repos:
-   repo: https://github.com/pycqa/flake8
    rev: 3.8.4  # The version of flake8 to use
    hooks:
    -   id: flake8
        exclude: ^litellm/tests/|^litellm/proxy/|^litellm/integrations/
        additional_dependencies: [flake8-print]
        files: litellm/.*\.py
@ -1,11 +1,5 @@
|
|||
FROM python:3.10
|
||||
|
||||
# Define a build argument for the config file path
|
||||
ARG CONFIG_FILE
|
||||
|
||||
# Copy the custom config file (if provided) into the Docker image
|
||||
COPY $CONFIG_FILE /app/config.yaml
|
||||
|
||||
COPY . /app
|
||||
WORKDIR /app
|
||||
RUN pip install -r requirements.txt
|
||||
|
|
82  README.md
@@ -5,22 +5,7 @@
<p align="center">Call all LLM APIs using the OpenAI format [Bedrock, Huggingface, Cohere, TogetherAI, Azure, OpenAI, etc.]
    <br>
</p>

<h4 align="center">
    <a href="https://l.linklyhq.com/l/1uHsr" target="_blank" >
        <img src="https://render.com/images/deploy-to-render-button.svg" width=200/>
    </a>
    <a href="https://railway.app/template/YTHiYS?referralCode=t3ukrU" target="_blank">
        <img src="https://railway.app/button.svg" width=200 />
    </a>
    <a href="https://l.linklyhq.com/l/1uHtX" target="_blank">
        <img src="https://deploy.cloud.run/button.svg" width=200 height=50/>
    </a>
    <a href="https://docs.litellm.ai/docs/simple_proxy#deploy-on-aws-apprunner" target="_blank">
        <img src=".github/deploy-to-aws.png" height=40/>
    </a>
</h4>
<h4 align="center"><a href="https://github.com/BerriAI/litellm/tree/main/litellm_server" target="_blank">LiteLLM Server</a></h4>
<h4 align="center"><a href="https://github.com/BerriAI/litellm/tree/main/litellm_server" target="_blank">Evaluate LLMs → OpenAI-Compatible Server</a></h4>
<h4 align="center">
    <a href="https://pypi.org/project/litellm/" target="_blank">
        <img src="https://img.shields.io/pypi/v/litellm.svg" alt="PyPI Version">
@@ -79,6 +64,7 @@ print(response)
liteLLM supports streaming the model response back, pass `stream=True` to get a streaming iterator in response.
Streaming is supported for all models (Bedrock, Huggingface, TogetherAI, Azure, OpenAI, etc.)
```python
from litellm import completion
response = completion(model="gpt-3.5-turbo", messages=messages, stream=True)
for chunk in response:
    print(chunk['choices'][0]['delta'])
@@ -89,31 +75,18 @@ for chunk in result:
    print(chunk['choices'][0]['delta'])
```
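The hunk above only shows the synchronous streaming iterator; the table below also advertises async completion and async streaming. As a hedged sketch that is not part of this diff (it assumes `litellm.acompletion` is available in the installed version and that `OPENAI_API_KEY` is set), the async variants can be consumed like this:

```python
# Hedged sketch: async streaming via litellm.acompletion (assumed API, not from this diff)
import asyncio
from litellm import acompletion

messages = [{"role": "user", "content": "Hey, how's it going?"}]

async def main():
    # stream=True is expected to return an async iterator of OpenAI-style delta chunks
    response = await acompletion(model="gpt-3.5-turbo", messages=messages, stream=True)
    async for chunk in response:
        print(chunk['choices'][0]['delta'])

asyncio.run(main())
```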
## Supported Provider ([Docs](https://docs.litellm.ai/docs/providers))
| Provider | [Completion](https://docs.litellm.ai/docs/#basic-usage) | [Streaming](https://docs.litellm.ai/docs/completion/stream#streaming-responses) | [Async Completion](https://docs.litellm.ai/docs/completion/stream#async-completion) | [Async Streaming](https://docs.litellm.ai/docs/completion/stream#async-streaming) |
| ------------- | ------------- | ------------- | ------------- | ------------- |
| [openai](https://docs.litellm.ai/docs/providers/openai) | ✅ | ✅ | ✅ | ✅ |
| [cohere](https://docs.litellm.ai/docs/providers/cohere) | ✅ | ✅ | ✅ | ✅ |
| [anthropic](https://docs.litellm.ai/docs/providers/anthropic) | ✅ | ✅ | ✅ | ✅ |
| [replicate](https://docs.litellm.ai/docs/providers/replicate) | ✅ | ✅ | ✅ | ✅ |
| [huggingface](https://docs.litellm.ai/docs/providers/huggingface) | ✅ | ✅ | ✅ | ✅ |
| [together_ai](https://docs.litellm.ai/docs/providers/togetherai) | ✅ | ✅ | ✅ | ✅ |
| [openrouter](https://docs.litellm.ai/docs/providers/openrouter) | ✅ | ✅ | ✅ | ✅ |
| [vertex_ai](https://docs.litellm.ai/docs/providers/vertex) | ✅ | ✅ | ✅ | ✅ |
| [palm](https://docs.litellm.ai/docs/providers/palm) | ✅ | ✅ | ✅ | ✅ |
| [ai21](https://docs.litellm.ai/docs/providers/ai21) | ✅ | ✅ | ✅ | ✅ |
| [baseten](https://docs.litellm.ai/docs/providers/baseten) | ✅ | ✅ | ✅ | ✅ |
| [azure](https://docs.litellm.ai/docs/providers/azure) | ✅ | ✅ | ✅ | ✅ |
| [sagemaker](https://docs.litellm.ai/docs/providers/aws_sagemaker) | ✅ | ✅ | ✅ | ✅ |
| [bedrock](https://docs.litellm.ai/docs/providers/bedrock) | ✅ | ✅ | ✅ | ✅ |
| [vllm](https://docs.litellm.ai/docs/providers/vllm) | ✅ | ✅ | ✅ | ✅ |
| [nlp_cloud](https://docs.litellm.ai/docs/providers/nlp_cloud) | ✅ | ✅ | ✅ | ✅ |
| [aleph alpha](https://docs.litellm.ai/docs/providers/aleph_alpha) | ✅ | ✅ | ✅ | ✅ |
| [petals](https://docs.litellm.ai/docs/providers/petals) | ✅ | ✅ | ✅ | ✅ |
| [ollama](https://docs.litellm.ai/docs/providers/ollama) | ✅ | ✅ | ✅ | ✅ |
| [deepinfra](https://docs.litellm.ai/docs/providers/deepinfra) | ✅ | ✅ | ✅ | ✅ |
## Reliability - Fallback LLMs
Never fail a request using LiteLLM

```python
from litellm import completion
# if gpt-4 fails, retry the request with gpt-3.5-turbo->command-nightly->claude-instant-1
response = completion(model="gpt-4",messages=messages, fallbacks=["gpt-3.5-turbo", "command-nightly", "claude-instant-1"])

# if azure/gpt-4 fails, retry the request with fallback api_keys/api_base
response = completion(model="azure/gpt-4", messages=messages, api_key=api_key, fallbacks=[{"api_key": "good-key-1"}, {"api_key": "good-key-2", "api_base": "good-api-base-2"}])
```

[**Read the Docs**](https://docs.litellm.ai/docs/)
## Logging Observability - Log LLM Input/Output ([Docs](https://docs.litellm.ai/docs/observability/callbacks))
LiteLLM exposes pre defined callbacks to send data to LLMonitor, Langfuse, Helicone, Promptlayer, Traceloop, Slack
```python
@@ -132,6 +105,35 @@ litellm.success_callback = ["promptlayer", "llmonitor"] # log input/output to pr
response = completion(model="gpt-3.5-turbo", messages=[{"role": "user", "content": "Hi 👋 - i'm openai"}])
```

## Supported Provider ([Docs](https://docs.litellm.ai/docs/providers))
| Provider | [Completion](https://docs.litellm.ai/docs/#basic-usage) | [Streaming](https://docs.litellm.ai/docs/completion/stream#streaming-responses) | [Async Completion](https://docs.litellm.ai/docs/completion/stream#async-completion) | [Async Streaming](https://docs.litellm.ai/docs/completion/stream#async-streaming) |
| ------------- | ------------- | ------------- | ------------- | ------------- |
| [openai](https://docs.litellm.ai/docs/providers/openai) | ✅ | ✅ | ✅ | ✅ |
| [azure](https://docs.litellm.ai/docs/providers/azure) | ✅ | ✅ | ✅ | ✅ |
| [aws - sagemaker](https://docs.litellm.ai/docs/providers/aws_sagemaker) | ✅ | ✅ | ✅ | ✅ |
| [aws - bedrock](https://docs.litellm.ai/docs/providers/bedrock) | ✅ | ✅ | ✅ | ✅ |
| [cohere](https://docs.litellm.ai/docs/providers/cohere) | ✅ | ✅ | ✅ | ✅ |
| [anthropic](https://docs.litellm.ai/docs/providers/anthropic) | ✅ | ✅ | ✅ | ✅ |
| [huggingface](https://docs.litellm.ai/docs/providers/huggingface) | ✅ | ✅ | ✅ | ✅ |
| [replicate](https://docs.litellm.ai/docs/providers/replicate) | ✅ | ✅ | ✅ | ✅ |
| [together_ai](https://docs.litellm.ai/docs/providers/togetherai) | ✅ | ✅ | ✅ | ✅ |
| [openrouter](https://docs.litellm.ai/docs/providers/openrouter) | ✅ | ✅ | ✅ | ✅ |
| [google - vertex_ai](https://docs.litellm.ai/docs/providers/vertex) | ✅ | ✅ | ✅ | ✅ |
| [google - palm](https://docs.litellm.ai/docs/providers/palm) | ✅ | ✅ | ✅ | ✅ |
| [ai21](https://docs.litellm.ai/docs/providers/ai21) | ✅ | ✅ | ✅ | ✅ |
| [baseten](https://docs.litellm.ai/docs/providers/baseten) | ✅ | ✅ | ✅ | ✅ |
| [vllm](https://docs.litellm.ai/docs/providers/vllm) | ✅ | ✅ | ✅ | ✅ |
| [nlp_cloud](https://docs.litellm.ai/docs/providers/nlp_cloud) | ✅ | ✅ | ✅ | ✅ |
| [aleph alpha](https://docs.litellm.ai/docs/providers/aleph_alpha) | ✅ | ✅ | ✅ | ✅ |
| [petals](https://docs.litellm.ai/docs/providers/petals) | ✅ | ✅ | ✅ | ✅ |
| [ollama](https://docs.litellm.ai/docs/providers/ollama) | ✅ | ✅ | ✅ | ✅ |
| [deepinfra](https://docs.litellm.ai/docs/providers/deepinfra) | ✅ | ✅ | ✅ | ✅ |
| [perplexity-ai](https://docs.litellm.ai/docs/providers/perplexity) | ✅ | ✅ | ✅ | ✅ |
| [anyscale](https://docs.litellm.ai/docs/providers/anyscale) | ✅ | ✅ | ✅ | ✅ |

[**Read the Docs**](https://docs.litellm.ai/docs/)

## Contributing
To contribute: Clone the repo locally -> Make a change -> Submit a PR with the change.
198  cookbook/LiteLLM_AB_TestLLMs.ipynb  vendored
@@ -1,198 +0,0 @@
{
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 0,
|
||||
"metadata": {
|
||||
"colab": {
|
||||
"provenance": []
|
||||
},
|
||||
"kernelspec": {
|
||||
"name": "python3",
|
||||
"display_name": "Python 3"
|
||||
},
|
||||
"language_info": {
|
||||
"name": "python"
|
||||
}
|
||||
},
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"source": [
|
||||
"# LiteLLM A/B Testing LLMs in production\n",
|
||||
"\n",
|
||||
"* LiteLLM allows you to use 100+ LLMs as a drop in replacement for `gpt-3.5-turbo`\n",
|
||||
"\n",
|
||||
"This tutorial walks through how to use LiteLLM to easily A/B Test LLMs in production"
|
||||
],
|
||||
"metadata": {
|
||||
"id": "ODpmJQ5u4rXI"
|
||||
}
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"source": [
|
||||
"# Example 1: A/B Test GPT-4 & GPT-3.5\n",
|
||||
"\n",
|
||||
"# Step 1\n",
|
||||
"👉 Get your `id` from here: https://admin.litellm.ai/"
|
||||
],
|
||||
"metadata": {
|
||||
"id": "YamUetcC5Ke7"
|
||||
}
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"source": [
|
||||
"from litellm import completion_with_split_tests\n",
|
||||
"import os\n",
|
||||
"\n",
|
||||
"## set ENV variables\n",
|
||||
"os.environ[\"OPENAI_API_KEY\"] = \"\"\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"# define a dict of model id and % of requests for model\n",
|
||||
"# see models here: https://docs.litellm.ai/docs/providers\n",
|
||||
"split_per_model = {\n",
|
||||
"\t\"gpt-4\": 0.3,\n",
|
||||
"\t\"gpt-3.5-turbo\": 0.7\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"messages = [{ \"content\": \"Hello, how are you?\",\"role\": \"user\"}]\n",
|
||||
"\n",
|
||||
"completion_with_split_tests(messages=messages, use_client=True,\n",
|
||||
" id=\"91fad14a-8c0f-4e99-8eaa-68245435aa80\") # [Optional Set your own ID]"
|
||||
],
|
||||
"metadata": {
|
||||
"colab": {
|
||||
"base_uri": "https://localhost:8080/"
|
||||
},
|
||||
"id": "7XGfv0958k70",
|
||||
"outputId": "91a069a5-c7d4-4fb0-e345-5ebf383edbbc"
|
||||
},
|
||||
"execution_count": 4,
|
||||
"outputs": [
|
||||
{
|
||||
"output_type": "stream",
|
||||
"name": "stdout",
|
||||
"text": [
|
||||
"last_fetched_at: 1693624804.2941535\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"output_type": "execute_result",
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"<OpenAIObject chat.completion id=chatcmpl-7uBT4QHc8BAoZKkU7JoH4ahmXvu0M at 0x7c2895c9e890> JSON: {\n",
|
||||
" \"id\": \"chatcmpl-7uBT4QHc8BAoZKkU7JoH4ahmXvu0M\",\n",
|
||||
" \"object\": \"chat.completion\",\n",
|
||||
" \"created\": 1693624806,\n",
|
||||
" \"model\": \"gpt-3.5-turbo-0613\",\n",
|
||||
" \"choices\": [\n",
|
||||
" {\n",
|
||||
" \"index\": 0,\n",
|
||||
" \"message\": {\n",
|
||||
" \"role\": \"assistant\",\n",
|
||||
" \"content\": \"Hello! I'm an AI, so I don't have emotions, but I'm here to assist you. How can I help you today?\"\n",
|
||||
" },\n",
|
||||
" \"finish_reason\": \"stop\"\n",
|
||||
" }\n",
|
||||
" ],\n",
|
||||
" \"usage\": {\n",
|
||||
" \"prompt_tokens\": 13,\n",
|
||||
" \"completion_tokens\": 29,\n",
|
||||
" \"total_tokens\": 42\n",
|
||||
" }\n",
|
||||
"}"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"execution_count": 4
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"source": [
|
||||
"## A/B Test GPT-4 and Claude-2"
|
||||
],
|
||||
"metadata": {
|
||||
"id": "Y12cxhZt58v8"
|
||||
}
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"metadata": {
|
||||
"colab": {
|
||||
"base_uri": "https://localhost:8080/"
|
||||
},
|
||||
"id": "0k6rshtR8i9q",
|
||||
"outputId": "31ac9d73-9e35-4697-d1ff-5d51048566f8"
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"output_type": "stream",
|
||||
"name": "stdout",
|
||||
"text": [
|
||||
"last_fetched_at: 1693624809.3467667\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"output_type": "execute_result",
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"<OpenAIObject chat.completion id=chatcmpl-7uBTA6gotsTksvCU7GffJ64ybfHUw at 0x7c28aa288630> JSON: {\n",
|
||||
" \"id\": \"chatcmpl-7uBTA6gotsTksvCU7GffJ64ybfHUw\",\n",
|
||||
" \"object\": \"chat.completion\",\n",
|
||||
" \"created\": 1693624812,\n",
|
||||
" \"model\": \"gpt-4-0613\",\n",
|
||||
" \"choices\": [\n",
|
||||
" {\n",
|
||||
" \"index\": 0,\n",
|
||||
" \"message\": {\n",
|
||||
" \"role\": \"assistant\",\n",
|
||||
" \"content\": \"As an AI, I don't have feelings, but I'm here and ready to assist you. How can I help you today?\"\n",
|
||||
" },\n",
|
||||
" \"finish_reason\": \"stop\"\n",
|
||||
" }\n",
|
||||
" ],\n",
|
||||
" \"usage\": {\n",
|
||||
" \"prompt_tokens\": 13,\n",
|
||||
" \"completion_tokens\": 27,\n",
|
||||
" \"total_tokens\": 40\n",
|
||||
" }\n",
|
||||
"}"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"execution_count": 5
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"from litellm import completion_with_split_tests\n",
|
||||
"import os\n",
|
||||
"\n",
|
||||
"## set ENV variables\n",
|
||||
"os.environ[\"ANTHROPIC_API_KEY\"] = \"\"\n",
|
||||
"\n",
|
||||
"# define a dict of model id and % of requests for model\n",
|
||||
"split_per_model = {\n",
|
||||
"\t\"gpt-4\": 0.3,\n",
|
||||
"\t\"claude-2\": 0.7\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"messages = [{ \"content\": \"Hello, how are you?\",\"role\": \"user\"}]\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"completion_with_split_tests(messages=messages, use_client=True,\n",
|
||||
" id=\"91fad14a-8c0f-4e99-8eaa-68245435aa80\") # [Optional Set your own ID]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"source": [],
|
||||
"metadata": {
|
||||
"id": "hzzbsAIp4pnr"
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
259  cookbook/LiteLLM_Azure_OpenAI.ipynb  vendored
@@ -1,259 +0,0 @@
{
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 0,
|
||||
"metadata": {
|
||||
"colab": {
|
||||
"provenance": []
|
||||
},
|
||||
"kernelspec": {
|
||||
"name": "python3",
|
||||
"display_name": "Python 3"
|
||||
},
|
||||
"language_info": {
|
||||
"name": "python"
|
||||
}
|
||||
},
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"source": [
|
||||
"## Use Azure OpenAI with LiteLLM"
|
||||
],
|
||||
"metadata": {
|
||||
"id": "oTA-1bG_wBVw"
|
||||
}
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"id": "RreFKTyKv2nt"
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"!pip install litellm"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"source": [
|
||||
"Pass API_BASE, API_VERSION, API_KEY in COMPLETION()"
|
||||
],
|
||||
"metadata": {
|
||||
"id": "kSOo9lbKv_7H"
|
||||
}
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"source": [
|
||||
"import litellm\n",
|
||||
"response = litellm.completion(\n",
|
||||
" model = \"azure/chatgpt-v-2\", # model = azure/<your deployment name>\n",
|
||||
" api_base = \"https://openai-gpt-4-test-v-1.openai.azure.com/\", # azure api base\n",
|
||||
" api_version = \"2023-05-15\", # azure api version\n",
|
||||
" api_key = \"\", # azure api key\n",
|
||||
" messages = [{\"role\": \"user\", \"content\": \"good morning\"}],\n",
|
||||
" max_tokens=10,\n",
|
||||
")\n",
|
||||
"print(response)"
|
||||
],
|
||||
"metadata": {
|
||||
"colab": {
|
||||
"base_uri": "https://localhost:8080/"
|
||||
},
|
||||
"id": "gWIsjrHMv_DM",
|
||||
"outputId": "732e9daa-8dca-4bc1-bb8a-aee90ee14c8d"
|
||||
},
|
||||
"execution_count": 4,
|
||||
"outputs": [
|
||||
{
|
||||
"output_type": "stream",
|
||||
"name": "stdout",
|
||||
"text": [
|
||||
"{\n",
|
||||
" \"id\": \"chatcmpl-877x4J2JUSReOuxVGE3THLjcmdrI8\",\n",
|
||||
" \"object\": \"chat.completion\",\n",
|
||||
" \"created\": 1696709554,\n",
|
||||
" \"model\": \"gpt-35-turbo\",\n",
|
||||
" \"choices\": [\n",
|
||||
" {\n",
|
||||
" \"index\": 0,\n",
|
||||
" \"finish_reason\": \"length\",\n",
|
||||
" \"message\": {\n",
|
||||
" \"role\": \"assistant\",\n",
|
||||
" \"content\": \"Good morning! How can I assist you today?\"\n",
|
||||
" }\n",
|
||||
" }\n",
|
||||
" ],\n",
|
||||
" \"usage\": {\n",
|
||||
" \"completion_tokens\": 10,\n",
|
||||
" \"prompt_tokens\": 10,\n",
|
||||
" \"total_tokens\": 20\n",
|
||||
" }\n",
|
||||
"}\n"
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"source": [],
|
||||
"metadata": {
|
||||
"id": "PR5uhvVHxe-C"
|
||||
}
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"source": [
|
||||
"## Set .env variables with Azure / LiteLLM"
|
||||
],
|
||||
"metadata": {
|
||||
"id": "1P2hprlLxfDc"
|
||||
}
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"source": [
|
||||
"import litellm\n",
|
||||
"import os\n",
|
||||
"\n",
|
||||
"os.environ['AZURE_API_KEY'] = \"\"\n",
|
||||
"os.environ['AZURE_API_BASE'] = \"\"\n",
|
||||
"os.environ['AZURE_API_VERSION'] = \"\"\n",
|
||||
"\n",
|
||||
"response = litellm.completion(\n",
|
||||
" model = \"azure/chatgpt-v-2\", # model = azure/<your deployment name>\n",
|
||||
" messages = [{\"role\": \"user\", \"content\": \"good morning\"}],\n",
|
||||
" max_tokens=10,\n",
|
||||
")\n",
|
||||
"print(response)"
|
||||
],
|
||||
"metadata": {
|
||||
"colab": {
|
||||
"base_uri": "https://localhost:8080/"
|
||||
},
|
||||
"id": "mGi9Gae1wMjK",
|
||||
"outputId": "29f2a9cf-f6ee-416b-9b24-02588d96fe59"
|
||||
},
|
||||
"execution_count": 5,
|
||||
"outputs": [
|
||||
{
|
||||
"output_type": "stream",
|
||||
"name": "stdout",
|
||||
"text": [
|
||||
"{\n",
|
||||
" \"id\": \"chatcmpl-877zB0GWZl4zswopLt12yQEzEfYWy\",\n",
|
||||
" \"object\": \"chat.completion\",\n",
|
||||
" \"created\": 1696709685,\n",
|
||||
" \"model\": \"gpt-35-turbo\",\n",
|
||||
" \"choices\": [\n",
|
||||
" {\n",
|
||||
" \"index\": 0,\n",
|
||||
" \"finish_reason\": \"length\",\n",
|
||||
" \"message\": {\n",
|
||||
" \"role\": \"assistant\",\n",
|
||||
" \"content\": \"Good morning! How can I assist you today?\"\n",
|
||||
" }\n",
|
||||
" }\n",
|
||||
" ],\n",
|
||||
" \"usage\": {\n",
|
||||
" \"completion_tokens\": 10,\n",
|
||||
" \"prompt_tokens\": 10,\n",
|
||||
" \"total_tokens\": 20\n",
|
||||
" }\n",
|
||||
"}\n"
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"source": [
|
||||
"## With Streaming"
|
||||
],
|
||||
"metadata": {
|
||||
"id": "uIhyvSVNx4hX"
|
||||
}
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"source": [
|
||||
"response = litellm.completion(\n",
|
||||
" model = \"azure/chatgpt-v-2\",\n",
|
||||
" messages = [{\"role\": \"user\", \"content\": \"good morning\"}],\n",
|
||||
" max_tokens=10,\n",
|
||||
" stream=True\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"for chunk in response:\n",
|
||||
" print(chunk)"
|
||||
],
|
||||
"metadata": {
|
||||
"id": "R4KYKLOHxy9r"
|
||||
},
|
||||
"execution_count": null,
|
||||
"outputs": []
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"source": [
|
||||
"## With Rate Limit Handler"
|
||||
],
|
||||
"metadata": {
|
||||
"id": "hB8jLz94ybTC"
|
||||
}
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"source": [
|
||||
"from litellm import RateLimitManager\n",
|
||||
"\n",
|
||||
"handler = RateLimitManager(max_requests_per_minute=10, max_tokens_per_minute=200)\n",
|
||||
"\n",
|
||||
"response = await handler.acompletion(\n",
|
||||
" model = \"azure/chatgpt-v-2\",\n",
|
||||
" messages = [{\"role\": \"user\", \"content\": \"good morning\"}],\n",
|
||||
" max_tokens=10,\n",
|
||||
")\n",
|
||||
"print(response)"
|
||||
],
|
||||
"metadata": {
|
||||
"colab": {
|
||||
"base_uri": "https://localhost:8080/"
|
||||
},
|
||||
"id": "CQECDwpix7Hl",
|
||||
"outputId": "18dc4041-3262-4ab7-a451-34ceaf70ca31"
|
||||
},
|
||||
"execution_count": 8,
|
||||
"outputs": [
|
||||
{
|
||||
"output_type": "stream",
|
||||
"name": "stdout",
|
||||
"text": [
|
||||
"{\n",
|
||||
" \"id\": \"chatcmpl-8781gvDKwPbp44CliumABgAuIDnSf\",\n",
|
||||
" \"object\": \"chat.completion\",\n",
|
||||
" \"created\": 1696709840,\n",
|
||||
" \"model\": \"gpt-35-turbo\",\n",
|
||||
" \"choices\": [\n",
|
||||
" {\n",
|
||||
" \"index\": 0,\n",
|
||||
" \"finish_reason\": \"length\",\n",
|
||||
" \"message\": {\n",
|
||||
" \"role\": \"assistant\",\n",
|
||||
" \"content\": \"Good morning! How can I assist you today?\"\n",
|
||||
" }\n",
|
||||
" }\n",
|
||||
" ],\n",
|
||||
" \"usage\": {\n",
|
||||
" \"completion_tokens\": 10,\n",
|
||||
" \"prompt_tokens\": 10,\n",
|
||||
" \"total_tokens\": 20\n",
|
||||
" }\n",
|
||||
"}\n"
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
123  cookbook/LiteLLM_Caching.ipynb  vendored
@@ -1,123 +0,0 @@
{
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 0,
|
||||
"metadata": {
|
||||
"colab": {
|
||||
"provenance": []
|
||||
},
|
||||
"kernelspec": {
|
||||
"name": "python3",
|
||||
"display_name": "Python 3"
|
||||
},
|
||||
"language_info": {
|
||||
"name": "python"
|
||||
}
|
||||
},
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"source": [
|
||||
"## LiteLLM Caching Tutorial\n",
|
||||
"Link to using Caching in Docs:\n",
|
||||
"https://docs.litellm.ai/docs/caching/"
|
||||
],
|
||||
"metadata": {
|
||||
"id": "Lvj-GI3YQfQx"
|
||||
}
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"id": "eKSBuuKn99Jm"
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"!pip install litellm==0.1.492"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"source": [
|
||||
"## Set `caching_with_models` to True\n",
|
||||
"Enables caching on a per-model basis.\n",
|
||||
"Keys are the input messages + model and values stored in the cache is the corresponding response"
|
||||
],
|
||||
"metadata": {
|
||||
"id": "sFXj4UUnQpyt"
|
||||
}
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"source": [
|
||||
"import os, time, litellm\n",
|
||||
"from litellm import completion\n",
|
||||
"litellm.caching_with_models = True # set caching for each model to True\n"
|
||||
],
|
||||
"metadata": {
|
||||
"id": "xCea1EjR99rU"
|
||||
},
|
||||
"execution_count": 8,
|
||||
"outputs": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"source": [
|
||||
"os.environ['OPENAI_API_KEY'] = \"\""
|
||||
],
|
||||
"metadata": {
|
||||
"id": "VK3kXGXI-dtC"
|
||||
},
|
||||
"execution_count": 9,
|
||||
"outputs": []
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"source": [
|
||||
"## Use LiteLLM Cache"
|
||||
],
|
||||
"metadata": {
|
||||
"id": "U_CDCcnjQ7c6"
|
||||
}
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"source": [
|
||||
"question = \"write 1 page about what's LiteLLM\"\n",
|
||||
"for _ in range(2):\n",
|
||||
" start_time = time.time()\n",
|
||||
" response = completion(\n",
|
||||
" model='gpt-3.5-turbo',\n",
|
||||
" messages=[\n",
|
||||
" {\n",
|
||||
" 'role': 'user',\n",
|
||||
" 'content': question\n",
|
||||
" }\n",
|
||||
" ],\n",
|
||||
" )\n",
|
||||
" print(f'Question: {question}')\n",
|
||||
" print(\"Time consuming: {:.2f}s\".format(time.time() - start_time))"
|
||||
],
|
||||
"metadata": {
|
||||
"colab": {
|
||||
"base_uri": "https://localhost:8080/"
|
||||
},
|
||||
"id": "Efli-J-t-bJH",
|
||||
"outputId": "cfdb1e14-96b0-48ee-c504-7f567e84c349"
|
||||
},
|
||||
"execution_count": 10,
|
||||
"outputs": [
|
||||
{
|
||||
"output_type": "stream",
|
||||
"name": "stdout",
|
||||
"text": [
|
||||
"Question: write 1 page about what's LiteLLM\n",
|
||||
"Time consuming: 13.53s\n",
|
||||
"Question: write 1 page about what's LiteLLM\n",
|
||||
"Time consuming: 0.00s\n"
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
336
cookbook/LiteLLM_GPTCache.ipynb
vendored
336
cookbook/LiteLLM_GPTCache.ipynb
vendored
|
@ -1,336 +0,0 @@
|
|||
{
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 0,
|
||||
"metadata": {
|
||||
"colab": {
|
||||
"provenance": []
|
||||
},
|
||||
"kernelspec": {
|
||||
"name": "python3",
|
||||
"display_name": "Python 3"
|
||||
},
|
||||
"language_info": {
|
||||
"name": "python"
|
||||
}
|
||||
},
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"source": [
|
||||
"# Using GPT Cache with LiteLLM\n",
|
||||
"GPT Cache allows you to slash Your LLM API Costs by 10x 💰, Boost Speed by 100x ⚡\n",
|
||||
"\n",
|
||||
"In this tutorial we demo how to use LiteLLM with GPTCache\n",
|
||||
"* Quick Start Usage\n",
|
||||
"* Advanced Usaged\n",
|
||||
"* Setting custom cache keys\n",
|
||||
"\n"
|
||||
],
|
||||
"metadata": {
|
||||
"id": "2BUxu9L2mPbX"
|
||||
}
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"id": "SRbVXJUGk6HC"
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# installation\n",
|
||||
"!pip install litellm gptcache"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"source": [
|
||||
"# Set ENV variables\n"
|
||||
],
|
||||
"metadata": {
|
||||
"id": "UuZX3OSBlIDt"
|
||||
}
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"source": [
|
||||
"import os\n",
|
||||
"os.environ['OPENAI_API_KEY'] = \"\"\n",
|
||||
"os.environ['COHERE_API_KEY'] = \"\""
|
||||
],
|
||||
"metadata": {
|
||||
"id": "E4jn-bPWlBZs"
|
||||
},
|
||||
"execution_count": 12,
|
||||
"outputs": []
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"source": [
|
||||
"# Quick Start Usage\n",
|
||||
"By default GPT Cache uses the content in `messages` as the cache key\n",
|
||||
" Import GPT Cache"
|
||||
],
|
||||
"metadata": {
|
||||
"id": "Tswo-058lcid"
|
||||
}
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"source": [
|
||||
"import litellm\n",
|
||||
"from litellm.gpt_cache import completion\n",
|
||||
"\n",
|
||||
"### using / setting up gpt cache\n",
|
||||
"from gptcache import cache\n",
|
||||
"cache.init()\n",
|
||||
"cache.set_openai_key()\n",
|
||||
"#########################"
|
||||
],
|
||||
"metadata": {
|
||||
"id": "9oOV8gRtk_la"
|
||||
},
|
||||
"execution_count": 4,
|
||||
"outputs": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"source": [
|
||||
"## two completion calls\n",
|
||||
"import time\n",
|
||||
"question = \"why should i use LiteLLM\"\n",
|
||||
"for _ in range(2):\n",
|
||||
" start_time = time.time()\n",
|
||||
" response = completion(\n",
|
||||
" model='gpt-3.5-turbo',\n",
|
||||
" messages=[\n",
|
||||
" {\n",
|
||||
" 'role': 'user',\n",
|
||||
" 'content': question\n",
|
||||
" }\n",
|
||||
" ],\n",
|
||||
" )\n",
|
||||
" print(f'Question: {question}, Response {response}')\n",
|
||||
" print(\"Time consuming: {:.2f}s\".format(time.time() - start_time))"
|
||||
],
|
||||
"metadata": {
|
||||
"colab": {
|
||||
"base_uri": "https://localhost:8080/"
|
||||
},
|
||||
"id": "Em1kgIOOm1Vo",
|
||||
"outputId": "d8e57747-a851-4675-f936-d65e5570d95a"
|
||||
},
|
||||
"execution_count": 7,
|
||||
"outputs": [
|
||||
{
|
||||
"output_type": "stream",
|
||||
"name": "stdout",
|
||||
"text": [
|
||||
"Question: why should i use LiteLLM, Response {\n",
|
||||
" \"id\": \"chatcmpl-7tJozrtW5UzVHNUcxX6cfzRS4nbxd\",\n",
|
||||
" \"object\": \"chat.completion\",\n",
|
||||
" \"created\": 1693418589,\n",
|
||||
" \"model\": \"gpt-3.5-turbo-0613\",\n",
|
||||
" \"choices\": [\n",
|
||||
" {\n",
|
||||
" \"index\": 0,\n",
|
||||
" \"message\": {\n",
|
||||
" \"role\": \"assistant\",\n",
|
||||
" \"content\": \"There are several reasons why you might consider using LiteLLM:\\n\\n1. Simplified document management: LiteLLM offers a user-friendly interface that makes it easy to manage and organize your legal documents. You can track versions, organize files into folders, and quickly find what you need.\\n\\n2. Collaboration and accessibility: LiteLLM allows multiple users to work on documents simultaneously, making it easier for teams to collaborate and exchange feedback. It also provides flexible accessibility, allowing you to access your documents from anywhere, anytime, as long as you have an internet connection.\\n\\n3. Time-saving features: The platform offers various time-saving features, such as automated document generation, customizable templates, and integration with other tools like Microsoft Word. This can significantly reduce the time and effort required to prepare legal documents.\\n\\n4. Enhanced security: LiteLLM prioritizes the security of your data. It provides robust encryption, secure data storage, and role-based access controls. This ensures that your sensitive legal documents are protected from unauthorized access.\\n\\n5. Cost-effective solution: LiteLLM offers a cost-effective solution compared to traditional legal document management systems. With its cloud-based approach, you don't need to invest in expensive hardware or software installations. Instead, you pay for a subscription-based model, which can be more affordable for small firms or individual practitioners.\\n\\nUltimately, the decision to use LiteLLM depends on your specific needs and preferences. It's important to consider factors such as the size of your practice, the volume of legal documents you handle, and your budget before making a decision.\"\n",
|
||||
" },\n",
|
||||
" \"finish_reason\": \"stop\"\n",
|
||||
" }\n",
|
||||
" ],\n",
|
||||
" \"usage\": {\n",
|
||||
" \"prompt_tokens\": 14,\n",
|
||||
" \"completion_tokens\": 312,\n",
|
||||
" \"total_tokens\": 326\n",
|
||||
" }\n",
|
||||
"}\n",
|
||||
"Time consuming: 9.48s\n",
|
||||
"Question: why should i use LiteLLM, Response {'gptcache': True, 'saved_token': [14, 312], 'choices': [{'message': {'role': 'assistant', 'content': \"There are several reasons why you might consider using LiteLLM:\\n\\n1. Simplified document management: LiteLLM offers a user-friendly interface that makes it easy to manage and organize your legal documents. You can track versions, organize files into folders, and quickly find what you need.\\n\\n2. Collaboration and accessibility: LiteLLM allows multiple users to work on documents simultaneously, making it easier for teams to collaborate and exchange feedback. It also provides flexible accessibility, allowing you to access your documents from anywhere, anytime, as long as you have an internet connection.\\n\\n3. Time-saving features: The platform offers various time-saving features, such as automated document generation, customizable templates, and integration with other tools like Microsoft Word. This can significantly reduce the time and effort required to prepare legal documents.\\n\\n4. Enhanced security: LiteLLM prioritizes the security of your data. It provides robust encryption, secure data storage, and role-based access controls. This ensures that your sensitive legal documents are protected from unauthorized access.\\n\\n5. Cost-effective solution: LiteLLM offers a cost-effective solution compared to traditional legal document management systems. With its cloud-based approach, you don't need to invest in expensive hardware or software installations. Instead, you pay for a subscription-based model, which can be more affordable for small firms or individual practitioners.\\n\\nUltimately, the decision to use LiteLLM depends on your specific needs and preferences. It's important to consider factors such as the size of your practice, the volume of legal documents you handle, and your budget before making a decision.\"}, 'finish_reason': 'stop', 'index': 0}], 'created': 1693418598, 'usage': {'completion_tokens': 0, 'prompt_tokens': 0, 'total_tokens': 0}, 'object': 'chat.completion'}\n",
|
||||
"Time consuming: 0.00s\n"
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"source": [
|
||||
"# Advanced Usage - Setting custom keys for Cache\n",
|
||||
"By default gptcache uses the `messages` as the cache key\n",
|
||||
"\n",
|
||||
"GPTCache allows you to set custom cache keys by setting\n",
|
||||
"```python\n",
|
||||
"cache.init(pre_func=pre_cache_func)\n",
|
||||
"```\n",
|
||||
"\n",
|
||||
"In this code snippet below we define a `pre_func` that returns message content + model as key"
|
||||
],
|
||||
"metadata": {
|
||||
"id": "R6hywKu8nXXW"
|
||||
}
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"source": [
|
||||
"## Defining a `pre_func` for GPTCache\n"
|
||||
],
|
||||
"metadata": {
|
||||
"id": "6nx1X-2Hn3ak"
|
||||
}
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"source": [
|
||||
"### using / setting up gpt cache\n",
|
||||
"from gptcache import cache\n",
|
||||
"from gptcache.processor.pre import last_content_without_prompt\n",
|
||||
"from typing import Dict, Any\n",
|
||||
"\n",
|
||||
"# use this function to set your cache keys -> gptcache\n",
|
||||
"# data are all the args passed to your completion call\n",
|
||||
"def pre_cache_func(data: Dict[str, Any], **params: Dict[str, Any]) -> Any:\n",
|
||||
" # use this to set cache key\n",
|
||||
" print(\"in pre_cache_func\")\n",
|
||||
" last_content_without_prompt_val = last_content_without_prompt(data, **params)\n",
|
||||
" print(\"last content without prompt\", last_content_without_prompt_val)\n",
|
||||
" print(\"model\", data[\"model\"])\n",
|
||||
" cache_key = last_content_without_prompt_val + data[\"model\"]\n",
|
||||
" print(\"cache_key\", cache_key)\n",
|
||||
" return cache_key # using this as cache_key\n",
|
||||
""
|
||||
],
|
||||
"metadata": {
|
||||
"id": "jJQsTyrZlvDY"
|
||||
},
|
||||
"execution_count": 9,
|
||||
"outputs": []
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"source": [
|
||||
"## Init Cache with `pre_func` to set custom keys"
|
||||
],
|
||||
"metadata": {
|
||||
"id": "Tjv1e0hqn-dX"
|
||||
}
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"source": [
|
||||
"# init GPT Cache with custom pre_func\n",
|
||||
"cache.init(pre_func=pre_cache_func)\n",
|
||||
"cache.set_openai_key()"
|
||||
],
|
||||
"metadata": {
|
||||
"id": "Ua8UhEp6n9yR"
|
||||
},
|
||||
"execution_count": 10,
|
||||
"outputs": []
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"source": [
|
||||
"## Using Cache\n",
|
||||
"* Cache key is `message` + `model`\n",
|
||||
"\n",
|
||||
"We make 3 LLM API calls\n",
|
||||
"* 2 to OpenAI\n",
|
||||
"* 1 to Cohere command nightly"
|
||||
],
|
||||
"metadata": {
|
||||
"id": "jHqWdfC4sTHf"
|
||||
}
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"source": [
|
||||
"messages = [{\"role\": \"user\", \"content\": \"why should I use LiteLLM for completions()\"}]\n",
|
||||
"response1 = completion(model=\"gpt-3.5-turbo\", messages=messages)\n",
|
||||
"response2 = completion(model=\"gpt-3.5-turbo\", messages=messages)\n",
|
||||
"response3 = completion(model=\"command-nightly\", messages=messages) # calling cohere command nightly\n",
|
||||
"\n",
|
||||
"if response1[\"choices\"] != response2[\"choices\"]: # same models should cache\n",
|
||||
" print(f\"Error occurred: Caching for same model+prompt failed\")\n",
|
||||
"\n",
|
||||
"if response3[\"choices\"] == response2[\"choices\"]: # different models, don't cache\n",
|
||||
" # if models are different, it should not return cached response\n",
|
||||
" print(f\"Error occurred: Caching for different model+prompt failed\")\n",
|
||||
"\n",
|
||||
"print(\"response1\", response1)\n",
|
||||
"print(\"response2\", response2)\n",
|
||||
"print(\"response3\", response3)"
|
||||
],
|
||||
"metadata": {
|
||||
"colab": {
|
||||
"base_uri": "https://localhost:8080/"
|
||||
},
|
||||
"id": "ftrKpB2GsPMi",
|
||||
"outputId": "1ee49273-bd62-49b4-a177-d40e33a51785"
|
||||
},
|
||||
"execution_count": 14,
|
||||
"outputs": [
|
||||
{
|
||||
"output_type": "stream",
|
||||
"name": "stdout",
|
||||
"text": [
|
||||
"in pre_cache_func\n",
|
||||
"last content without prompt why should I use LiteLLM for completions()\n",
|
||||
"model gpt-3.5-turbo\n",
|
||||
"cache_key why should I use LiteLLM for completions()gpt-3.5-turbo\n",
|
||||
"in pre_cache_func\n",
|
||||
"last content without prompt why should I use LiteLLM for completions()\n",
|
||||
"model gpt-3.5-turbo\n",
|
||||
"cache_key why should I use LiteLLM for completions()gpt-3.5-turbo\n",
|
||||
"in pre_cache_func\n",
|
||||
"last content without prompt why should I use LiteLLM for completions()\n",
|
||||
"model command-nightly\n",
|
||||
"cache_key why should I use LiteLLM for completions()command-nightly\n",
|
||||
"response1 {\n",
|
||||
" \"id\": \"chatcmpl-7tKE21PEe43sR6RvZ7pcUmanFwZLf\",\n",
|
||||
" \"object\": \"chat.completion\",\n",
|
||||
" \"created\": 1693420142,\n",
|
||||
" \"model\": \"gpt-3.5-turbo-0613\",\n",
|
||||
" \"choices\": [\n",
|
||||
" {\n",
|
||||
" \"index\": 0,\n",
|
||||
" \"message\": {\n",
|
||||
" \"role\": \"assistant\",\n",
|
||||
" \"content\": \"There are several reasons why you should use LiteLLM for completions() in your code:\\n\\n1. Fast and efficient: LiteLLM is implemented in a lightweight manner, making it highly performant. It provides quick and accurate completions, ensuring a smooth development experience.\\n\\n2. Customizable completion options: LiteLLM allows you to customize the completion options based on your specific needs. You can specify the maximum number of completions to retrieve, the desired timeout, and more.\\n\\n3. Language-agnostic: LiteLLM supports multiple programming languages, including Python, JavaScript, Java, C++, and many others. So, regardless of the language you are using, LiteLLM can help you with intelligent code completions.\\n\\n4. Learning capabilities: LiteLLM has the ability to learn from the provided code and context, improving the accuracy of the completions over time. This means that as you continue to use it, the suggested completions will become increasingly tailored to your specific coding style and patterns.\\n\\n5. Ease of integration: LiteLLM is designed to be easily integrated into existing code editors or IDEs. It provides a simple API that allows you to integrate it seamlessly into your development workflow.\\n\\n6. Supported by OpenAI: LiteLLM is developed and maintained by OpenAI, a well-known and reputable organization in the field of artificial intelligence. This ensures ongoing support and updates to enhance the functionality and performance of LiteLLM.\\n\\nOverall, using LiteLLM for completions() can greatly improve your coding productivity by providing accurate and context-aware code completion suggestions, regardless of the programming language you are working with.\"\n",
|
||||
" },\n",
|
||||
" \"finish_reason\": \"stop\"\n",
|
||||
" }\n",
|
||||
" ],\n",
|
||||
" \"usage\": {\n",
|
||||
" \"prompt_tokens\": 18,\n",
|
||||
" \"completion_tokens\": 326,\n",
|
||||
" \"total_tokens\": 344\n",
|
||||
" }\n",
|
||||
"}\n",
|
||||
"response2 {'gptcache': True, 'saved_token': [18, 326], 'choices': [{'message': {'role': 'assistant', 'content': 'There are several reasons why you should use LiteLLM for completions() in your code:\\n\\n1. Fast and efficient: LiteLLM is implemented in a lightweight manner, making it highly performant. It provides quick and accurate completions, ensuring a smooth development experience.\\n\\n2. Customizable completion options: LiteLLM allows you to customize the completion options based on your specific needs. You can specify the maximum number of completions to retrieve, the desired timeout, and more.\\n\\n3. Language-agnostic: LiteLLM supports multiple programming languages, including Python, JavaScript, Java, C++, and many others. So, regardless of the language you are using, LiteLLM can help you with intelligent code completions.\\n\\n4. Learning capabilities: LiteLLM has the ability to learn from the provided code and context, improving the accuracy of the completions over time. This means that as you continue to use it, the suggested completions will become increasingly tailored to your specific coding style and patterns.\\n\\n5. Ease of integration: LiteLLM is designed to be easily integrated into existing code editors or IDEs. It provides a simple API that allows you to integrate it seamlessly into your development workflow.\\n\\n6. Supported by OpenAI: LiteLLM is developed and maintained by OpenAI, a well-known and reputable organization in the field of artificial intelligence. This ensures ongoing support and updates to enhance the functionality and performance of LiteLLM.\\n\\nOverall, using LiteLLM for completions() can greatly improve your coding productivity by providing accurate and context-aware code completion suggestions, regardless of the programming language you are working with.'}, 'finish_reason': 'stop', 'index': 0}], 'created': 1693420152, 'usage': {'completion_tokens': 0, 'prompt_tokens': 0, 'total_tokens': 0}, 'object': 'chat.completion'}\n",
|
||||
"response3 {\n",
|
||||
" \"choices\": [\n",
|
||||
" {\n",
|
||||
" \"finish_reason\": \"stop\",\n",
|
||||
" \"index\": 0,\n",
|
||||
" \"message\": {\n",
|
||||
" \"content\": \" LiteLLM is a state-of-the-art, privacy-preserving LLM trained\",\n",
|
||||
" \"role\": \"assistant\"\n",
|
||||
" }\n",
|
||||
" }\n",
|
||||
" ],\n",
|
||||
" \"created\": 1693420153.8769038,\n",
|
||||
" \"model\": \"command-nightly\",\n",
|
||||
" \"usage\": {\n",
|
||||
" \"prompt_tokens\": 11,\n",
|
||||
" \"completion_tokens\": 16,\n",
|
||||
" \"total_tokens\": 27\n",
|
||||
" }\n",
|
||||
"}\n"
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
159
cookbook/Using_Nemo_Guardrails_with_LiteLLM_Server.ipynb
vendored
Normal file
159
cookbook/Using_Nemo_Guardrails_with_LiteLLM_Server.ipynb
vendored
Normal file
|
@ -0,0 +1,159 @@
|
|||
{
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 0,
|
||||
"metadata": {
|
||||
"colab": {
|
||||
"provenance": []
|
||||
},
|
||||
"kernelspec": {
|
||||
"name": "python3",
|
||||
"display_name": "Python 3"
|
||||
},
|
||||
"language_info": {
|
||||
"name": "python"
|
||||
}
|
||||
},
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"source": [
|
||||
"# Using Nemo-Guardrails with LiteLLM Server\n",
|
||||
"\n",
|
||||
"[Call Bedrock, TogetherAI, Huggingface, etc. on the server](https://docs.litellm.ai/docs/providers)"
|
||||
],
|
||||
"metadata": {
|
||||
"id": "eKXncoQbU_2j"
|
||||
}
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"source": [
|
||||
"## Using with Bedrock\n",
|
||||
"\n",
|
||||
"`docker run -e PORT=8000 -e AWS_ACCESS_KEY_ID=<your-aws-access-key> -e AWS_SECRET_ACCESS_KEY=<your-aws-secret-key> -p 8000:8000 ghcr.io/berriai/litellm:latest`"
|
||||
],
|
||||
"metadata": {
|
||||
"id": "ZciYaLwvuFbu"
|
||||
}
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"source": [
|
||||
"pip install nemoguardrails langchain"
|
||||
],
|
||||
"metadata": {
|
||||
"id": "vOUwGSJ2Vsy3"
|
||||
},
|
||||
"execution_count": null,
|
||||
"outputs": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"id": "xXEJNxe7U0IN"
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import openai\n",
|
||||
"from langchain.chat_models import ChatOpenAI\n",
|
||||
"\n",
|
||||
"llm = ChatOpenAI(model_name=\"anthropic.claude-v2\", openai_api_base=\"http://0.0.0.0:8000\", openai_api_key=\"my-fake-key\")\n",
|
||||
"\n",
|
||||
"from nemoguardrails import LLMRails, RailsConfig\n",
|
||||
"\n",
|
||||
"config = RailsConfig.from_path(\"./config.yml\")\n",
|
||||
"app = LLMRails(config, llm=llm)\n",
|
||||
"\n",
|
||||
"new_message = app.generate(messages=[{\n",
|
||||
" \"role\": \"user\",\n",
|
||||
" \"content\": \"Hello! What can you do for me?\"\n",
|
||||
"}])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"source": [
|
||||
"## Using with TogetherAI\n",
|
||||
"\n",
|
||||
"1. You can either set this in the server environment:\n",
|
||||
"`docker run -e PORT=8000 -e TOGETHERAI_API_KEY=<your-together-ai-api-key> -p 8000:8000 ghcr.io/berriai/litellm:latest`\n",
|
||||
"\n",
|
||||
"2. **Or** Pass this in as the api key `(...openai_api_key=\"<your-together-ai-api-key>\")`"
|
||||
],
|
||||
"metadata": {
|
||||
"id": "vz5n00qyuKjp"
|
||||
}
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"source": [
|
||||
"import openai\n",
|
||||
"from langchain.chat_models import ChatOpenAI\n",
|
||||
"\n",
|
||||
"llm = ChatOpenAI(model_name=\"together_ai/togethercomputer/CodeLlama-13b-Instruct\", openai_api_base=\"http://0.0.0.0:8000\", openai_api_key=\"my-together-ai-api-key\")\n",
|
||||
"\n",
|
||||
"from nemoguardrails import LLMRails, RailsConfig\n",
|
||||
"\n",
|
||||
"config = RailsConfig.from_path(\"./config.yml\")\n",
|
||||
"app = LLMRails(config, llm=llm)\n",
|
||||
"\n",
|
||||
"new_message = app.generate(messages=[{\n",
|
||||
" \"role\": \"user\",\n",
|
||||
" \"content\": \"Hello! What can you do for me?\"\n",
|
||||
"}])"
|
||||
],
|
||||
"metadata": {
|
||||
"id": "XK1sk-McuhpE"
|
||||
},
|
||||
"execution_count": null,
|
||||
"outputs": []
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"source": [
|
||||
"### CONFIG.YML\n",
|
||||
"\n",
|
||||
"save this example `config.yml` in your current directory"
|
||||
],
|
||||
"metadata": {
|
||||
"id": "8A1KWKnzuxAS"
|
||||
}
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"source": [
|
||||
"# instructions:\n",
|
||||
"# - type: general\n",
|
||||
"# content: |\n",
|
||||
"# Below is a conversation between a bot and a user about the recent job reports.\n",
|
||||
"# The bot is factual and concise. If the bot does not know the answer to a\n",
|
||||
"# question, it truthfully says it does not know.\n",
|
||||
"\n",
|
||||
"# sample_conversation: |\n",
|
||||
"# user \"Hello there!\"\n",
|
||||
"# express greeting\n",
|
||||
"# bot express greeting\n",
|
||||
"# \"Hello! How can I assist you today?\"\n",
|
||||
"# user \"What can you do for me?\"\n",
|
||||
"# ask about capabilities\n",
|
||||
"# bot respond about capabilities\n",
|
||||
"# \"I am an AI assistant that helps answer mathematical questions. My core mathematical skills are powered by wolfram alpha.\"\n",
|
||||
"# user \"What's 2+2?\"\n",
|
||||
"# ask math question\n",
|
||||
"# bot responds to math question\n",
|
||||
"# \"2+2 is equal to 4.\"\n",
|
||||
"\n",
|
||||
"# models:\n",
|
||||
"# - type: main\n",
|
||||
"# engine: openai\n",
|
||||
"# model: claude-instant-1"
|
||||
],
|
||||
"metadata": {
|
||||
"id": "NKN1GmSvu0Cx"
|
||||
},
|
||||
"execution_count": null,
|
||||
"outputs": []
|
||||
}
|
||||
]
|
||||
}
|
346  cookbook/liteLLM_ChromaDB_Cache.ipynb  vendored
File diff suppressed because one or more lines are too long

BIN  dist/litellm-0.13.1.dev1-py3-none-any.whl  vendored  Normal file
Binary file not shown.

BIN  dist/litellm-0.13.1.dev1.tar.gz  vendored  Normal file
Binary file not shown.

BIN  dist/litellm-0.13.1.dev2-py3-none-any.whl  vendored  Normal file
Binary file not shown.

BIN  dist/litellm-0.13.1.dev2.tar.gz  vendored  Normal file
Binary file not shown.

BIN  dist/litellm-0.13.1.dev3-py3-none-any.whl  vendored  Normal file
Binary file not shown.

BIN  dist/litellm-0.13.1.dev3.tar.gz  vendored  Normal file
Binary file not shown.
@@ -6,7 +6,6 @@ liteLLM implements exact match caching and supports the following Caching:
* In-Memory Caching [Default]
* Redis Caching Local
* Redis Caching Hosted
* GPTCache

## Quick Start Usage - Completion
Caching - cache
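The quick-start code itself is truncated by this hunk. As a hedged sketch of exact-match in-memory caching (the `litellm.caching.Cache` import path and the per-call `caching=True` flag are assumptions about the installed litellm version, not content from this diff):

```python
# Hedged sketch: exact-match in-memory caching. The Cache import path and the
# per-call caching=True flag are assumptions about the litellm version in use.
import os
import litellm
from litellm import completion
from litellm.caching import Cache

os.environ["OPENAI_API_KEY"] = ""  # set your key

litellm.cache = Cache()  # in-memory by default; a Redis variant would take host/port/password

messages = [{"role": "user", "content": "Hello, how are you?"}]

response1 = completion(model="gpt-3.5-turbo", messages=messages, caching=True)
response2 = completion(model="gpt-3.5-turbo", messages=messages, caching=True)  # expected cache hit
print(response1["choices"][0]["message"]["content"])
```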
@@ -1,7 +1,7 @@
# Function Calling
LiteLLM only supports: OpenAI gpt-4-0613 and gpt-3.5-turbo-0613 for function calling

## Quick Start
This is exactly how OpenAI supports function calling for gpt-4-0613 and gpt-3.5-turbo-0613

```python
import os, litellm
from litellm import completion
@@ -128,7 +128,6 @@ print(response)
```

## Function calling for Non-OpenAI LLMs
**For non-OpenAI LLMs, LiteLLM raises an exception if you try using it for function calling**

### Adding Function to prompt
For non-OpenAI LLMs, LiteLLM allows you to add the function to the prompt instead; set `litellm.add_function_to_prompt = True`
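As a hedged sketch of that setting (the function schema and the `claude-instant-1` model name are illustrative; the pass-through behavior is inferred from the description above, not from code in this diff):

```python
# Hedged sketch: inject the function definition into the prompt for a non-OpenAI model.
# The function schema below is illustrative.
import litellm
from litellm import completion

litellm.add_function_to_prompt = True  # append function definitions to the prompt text

functions = [
    {
        "name": "get_current_weather",
        "description": "Get the current weather in a given location",
        "parameters": {
            "type": "object",
            "properties": {"location": {"type": "string"}},
            "required": ["location"],
        },
    }
]

messages = [{"role": "user", "content": "What's the weather like in Boston?"}]

# claude-instant-1 is a non-OpenAI model; without add_function_to_prompt this would raise an exception
response = completion(model="claude-instant-1", messages=messages, functions=functions)
print(response)
```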
@@ -58,6 +58,108 @@ To drop the param instead, set `litellm.drop_params = True`.
Add to prompt for non-openai models, set: `litellm.add_function_to_prompt = True`.
:::

## Input Params

```python
def completion(
    model: str,
    messages: List = [],
    # Optional OpenAI params
    functions: List = [],
    function_call: str = "",  # optional params
    temperature: Optional[float] = None,
    top_p: Optional[float] = None,
    n: Optional[int] = None,
    stream: Optional[bool] = None,
    stop=None,
    max_tokens: Optional[float] = None,
    presence_penalty: Optional[float] = None,
    frequency_penalty: Optional[float] = None,
    logit_bias: dict = {},
    user: str = "",
    deployment_id = None,
    request_timeout: Optional[int] = None,

    # Optional LiteLLM params
    api_base: Optional[str] = None,
    api_version: Optional[str] = None,
    api_key: Optional[str] = None,
    num_retries: Optional[int] = None,  # set to retry a model if an APIError, TimeoutError, or ServiceUnavailableError occurs
    context_window_fallback_dict: Optional[dict] = None,  # mapping of model to use if call fails due to context window error
    fallbacks: Optional[list] = None,  # pass in a list of api_base, keys, etc.
    metadata: Optional[dict] = None,  # additional call metadata, passed to logging integrations / custom callbacks

    **kwargs,
) -> ModelResponse:
```
### Required Fields

- `model`: *string* - ID of the model to use. Refer to the model endpoint compatibility table for details on which models work with the Chat API.

- `messages`: *array* - A list of messages comprising the conversation so far.

#### Properties of `messages`
*Note* - Each message in the array contains the following properties:

- `role`: *string* - The role of the message's author. Roles can be: system, user, assistant, or function.

- `content`: *string or null* - The contents of the message. It is required for all messages, but may be null for assistant messages with function calls.

- `name`: *string (optional)* - The name of the author of the message. It is required if the role is "function". The name should match the name of the function represented in the content. It can contain characters (a-z, A-Z, 0-9), and underscores, with a maximum length of 64 characters.

- `function_call`: *object (optional)* - The name and arguments of a function that should be called, as generated by the model.

### Optional Fields

- `functions`: *array* - A list of functions that the model may use to generate JSON inputs. Each function should have the following properties:

  - `name`: *string* - The name of the function to be called. It should contain a-z, A-Z, 0-9, underscores and dashes, with a maximum length of 64 characters.

  - `description`: *string (optional)* - A description explaining what the function does. It helps the model to decide when and how to call the function.

  - `parameters`: *object* - The parameters that the function accepts, described as a JSON Schema object.

  - `function_call`: *string or object (optional)* - Controls how the model responds to function calls.

- `temperature`: *number or null (optional)* - The sampling temperature to be used, between 0 and 2. Higher values like 0.8 produce more random outputs, while lower values like 0.2 make outputs more focused and deterministic.

- `top_p`: *number or null (optional)* - An alternative to sampling with temperature. It instructs the model to consider the results of the tokens with top_p probability. For example, 0.1 means only the tokens comprising the top 10% probability mass are considered.

- `n`: *integer or null (optional)* - The number of chat completion choices to generate for each input message.

- `stream`: *boolean or null (optional)* - If set to true, it sends partial message deltas. Tokens will be sent as they become available, with the stream terminated by a [DONE] message.

- `stop`: *string/ array/ null (optional)* - Up to 4 sequences where the API will stop generating further tokens.

- `max_tokens`: *integer (optional)* - The maximum number of tokens to generate in the chat completion.

- `presence_penalty`: *number or null (optional)* - It is used to penalize new tokens based on their existence in the text so far.

- `frequency_penalty`: *number or null (optional)* - It is used to penalize new tokens based on their frequency in the text so far.

- `logit_bias`: *map (optional)* - Used to modify the probability of specific tokens appearing in the completion.

- `user`: *string (optional)* - A unique identifier representing your end-user. This can help OpenAI to monitor and detect abuse.

- `request_timeout`: *int (optional)* - Timeout in seconds for completion requests (Defaults to 600 seconds)

#### litellm-specific params

- `api_base`: *string (optional)* - The api endpoint you want to call the model with

- `api_version`: *string (optional)* - (Azure-specific) the api version for the call

- `num_retries`: *int (optional)* - The number of times to retry the API call if an APIError, TimeoutError or ServiceUnavailableError occurs

- `context_window_fallback_dict`: *dict (optional)* - A mapping of model to use if call fails due to context window error

- `fallbacks`: *list (optional)* - A list of model names + params to be used, in case the initial call fails

- `metadata`: *dict (optional)* - Any additional data you want to be logged when the call is made (sent to logging integrations, eg. promptlayer and accessible via custom callback function)
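To tie the litellm-specific params together, here is a hedged usage sketch (the model names, fallback values, and metadata are placeholders, not content from this diff):

```python
# Hedged sketch combining the litellm-specific params documented above.
# Model names and fallback values are placeholders.
from litellm import completion

messages = [{"role": "user", "content": "Summarize LiteLLM in one sentence."}]

response = completion(
    model="gpt-3.5-turbo",
    messages=messages,
    num_retries=2,  # retry on APIError / TimeoutError / ServiceUnavailableError
    context_window_fallback_dict={"gpt-3.5-turbo": "gpt-3.5-turbo-16k"},  # fall back on context window errors
    fallbacks=["gpt-4", "claude-instant-1"],  # models to try if the initial call fails
    metadata={"request_source": "docs-example"},  # passed to logging integrations / custom callbacks
)
print(response)
```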
## Provider-specific Params
Providers might offer params not supported by OpenAI (e.g. top_k). You can pass those in 2 ways:
- via completion(): We'll pass the non-openai param straight to the provider as part of the request body.
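A hedged sketch of that first route (the model name and `top_k` value are illustrative; the pass-through behavior is as described above):

```python
# Hedged sketch: pass a provider-specific param (top_k) straight through completion().
# litellm is expected to forward non-OpenAI params to the provider's request body.
from litellm import completion

messages = [{"role": "user", "content": "Write a haiku about fallbacks."}]

# claude-instant-1 (Anthropic) accepts top_k; OpenAI models do not.
response = completion(model="claude-instant-1", messages=messages, top_k=1)
print(response)
```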
@ -453,59 +555,3 @@ assert len(response_2_text) > len(response_1_text)
|
|||
|
||||
|
||||
[**Check out the tutorial!**](../tutorials/provider_specific_params.md)
|
||||
|
||||
## Input - Request Body
|
||||
# Request Body
|
||||
|
||||
### Required Fields
|
||||
|
||||
- `model`: *string* - ID of the model to use. Refer to the model endpoint compatibility table for details on which models work with the Chat API.
|
||||
|
||||
- `messages`: *array* - A list of messages comprising the conversation so far.
|
||||
|
||||
#### Properties of `messages`
|
||||
*Note* - Each message in the array contains the following properties:
|
||||
|
||||
- `role`: *string* - The role of the message's author. Roles can be: system, user, assistant, or function.
|
||||
|
||||
- `content`: *string or null* - The contents of the message. It is required for all messages, but may be null for assistant messages with function calls.
|
||||
|
||||
- `name`: *string (optional)* - The name of the author of the message. It is required if the role is "function". The name should match the name of the function represented in the content. It can contain letters (a-z, A-Z), digits (0-9), and underscores, with a maximum length of 64 characters.
|
||||
|
||||
- `function_call`: *object (optional)* - The name and arguments of a function that should be called, as generated by the model.
|
||||
|
||||
|
||||
|
||||
### Optional Fields
|
||||
|
||||
- `functions`: *array* - A list of functions that the model may use to generate JSON inputs. Each function should have the following properties:
|
||||
|
||||
- `name`: *string* - The name of the function to be called. It should contain a-z, A-Z, 0-9, underscores and dashes, with a maximum length of 64 characters.
|
||||
|
||||
- `description`: *string (optional)* - A description explaining what the function does. It helps the model to decide when and how to call the function.
|
||||
|
||||
- `parameters`: *object* - The parameters that the function accepts, described as a JSON Schema object.
|
||||
|
||||
- `function_call`: *string or object (optional)* - Controls how the model responds to function calls.
|
||||
|
||||
- `temperature`: *number or null (optional)* - The sampling temperature to be used, between 0 and 2. Higher values like 0.8 produce more random outputs, while lower values like 0.2 make outputs more focused and deterministic.
|
||||
|
||||
- `top_p`: *number or null (optional)* - An alternative to sampling with temperature. It instructs the model to consider the results of the tokens with top_p probability. For example, 0.1 means only the tokens comprising the top 10% probability mass are considered.
|
||||
|
||||
- `n`: *integer or null (optional)* - The number of chat completion choices to generate for each input message.
|
||||
|
||||
- `stream`: *boolean or null (optional)* - If set to true, it sends partial message deltas. Tokens will be sent as they become available, with the stream terminated by a [DONE] message.
|
||||
|
||||
- `stop`: *string/ array/ null (optional)* - Up to 4 sequences where the API will stop generating further tokens.
|
||||
|
||||
- `max_tokens`: *integer (optional)* - The maximum number of tokens to generate in the chat completion.
|
||||
|
||||
- `presence_penalty`: *number or null (optional)* - It is used to penalize new tokens based on their existence in the text so far.
|
||||
|
||||
- `frequency_penalty`: *number or null (optional)* - It is used to penalize new tokens based on their frequency in the text so far.
|
||||
|
||||
- `logit_bias`: *map (optional)* - Used to modify the probability of specific tokens appearing in the completion.
|
||||
|
||||
- `user`: *string (optional)* - A unique identifier representing your end-user. This can help OpenAI to monitor and detect abuse.
|
||||
|
||||
- `request_timeout`: *int (optional)* - Timeout in seconds for completion requests (Defaults to 600 seconds)
|
|
@ -31,8 +31,6 @@ The function uses the following parameters:
|
|||
|
||||
- `model`:[Optional] This is the LiteLLM model being used. This parameter is optional, as you can alternatively specify the `max_tokens` parameter.
|
||||
|
||||
- `system_message`:[Optional] This is a string containing an optional system message that will be preserved at the beginning of the conversation. This parameter is optional and set to `None` by default.
|
||||
|
||||
- `max_tokens`:[Optional] This is an int, a manually set upper limit on the number of tokens allowed in the messages
|
||||
|
||||
- `trim_ratio`:[Optional] This represents the target ratio of tokens to use following trimming. Its default value is 0.75, which implies that messages will be trimmed to utilise about 75% of the model's token limit (see the sketch below)
|
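As a hedged sketch of how these parameters fit together (assuming `trim_messages` is importable from `litellm.utils`):

```python
from litellm.utils import trim_messages  # assumption: exported here

messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Summarise this very long document. " * 500},
]

# trim the conversation to roughly 75% of gpt-3.5-turbo's context window
trimmed_messages = trim_messages(messages, model="gpt-3.5-turbo", trim_ratio=0.75)
print(len(trimmed_messages))
```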
|
@ -45,14 +45,9 @@ litellm.model_alias_map = model_alias_map
|
|||
|
||||
messages = [{ "content": "Hello, how are you?","role": "user"}]
|
||||
|
||||
# openai call
|
||||
# call "gpt-3.5-turbo-16k"
|
||||
response = completion(model="GPT-3.5", messages=messages)
|
||||
|
||||
# replicate call
|
||||
# call replicate/llama-2-70b-chat:2796ee9483c3fd7aa2e171d38f4ca1...
|
||||
response = completion("llama2", messages)
|
||||
```
|
||||
|
||||
|
||||
# no-code
|
||||
|
||||
If you use the litellm client, you can also do this without going into code. [Learn more](https://docs.litellm.ai/docs/debugging/hosted_debugging)
|
|
@ -1,62 +1,53 @@
|
|||
# Reliability
|
||||
|
||||
LiteLLM helps prevent failed requests in 2 ways:
|
||||
- Retries
|
||||
- Fallbacks: Context Window + General
|
||||
|
||||
## Helper utils
|
||||
LiteLLM supports the following functions for reliability:
|
||||
* `litellm.longer_context_model_fallback_dict`: Dictionary which has a mapping for those models which have larger equivalents
|
||||
* `completion_with_retries`: use tenacity retries
|
||||
* `num_retries`: use tenacity retries
|
||||
* `completion()` with fallbacks: switch between models/keys/api bases in case of errors.
|
||||
|
||||
## Context Window Errors
|
||||
|
||||
```python
|
||||
from litellm import completion, longer_context_model_fallback_dict, ContextWindowExceededError
|
||||
|
||||
sample_text = "how does a court case get to the Supreme Court?" * 1000
|
||||
messages = [{"content": user_message, "role": "user"}]
|
||||
model = "gpt-3.5-turbo"
|
||||
try:
|
||||
# try the original model
|
||||
response = completion(model=model, messages=messages)
|
||||
# catch the context window error
|
||||
except ContextWindowExceededError as e:
|
||||
if model in longer_context_model_fallback_dict:
|
||||
# switch to the equivalent larger model -> gpt-3.5-turbo-16k
|
||||
new_model = longer_context_model_fallback_dict[model]
|
||||
response = completion(new_model, messages)
|
||||
|
||||
print(response)
|
||||
```
|
||||
|
||||
|
||||
## Retry failed requests
|
||||
|
||||
You can use this as a drop-in replacement for the `completion()` function to use tenacity retries - by default we retry the call 3 times.
|
||||
Call it in completion like this `completion(..num_retries=2)`.
|
||||
|
||||
|
||||
Here's a quick look at how you can use it:
|
||||
|
||||
```python
|
||||
from litellm import completion_with_retries
|
||||
from litellm import completion
|
||||
|
||||
user_message = "Hello, whats the weather in San Francisco??"
|
||||
messages = [{"content": user_message, "role": "user"}]
|
||||
|
||||
# normal call
|
||||
def test_completion_custom_provider_model_name():
|
||||
try:
|
||||
response = completion_with_retries(
|
||||
response = completion(
|
||||
model="gpt-3.5-turbo",
|
||||
messages=messages,
|
||||
num_retries=2
|
||||
)
|
||||
# Add any assertions here to check the response
|
||||
print(response)
|
||||
except Exception as e:
|
||||
printf"Error occurred: {e}")
|
||||
```
|
||||
|
||||
## Switch Models/API Keys/API Bases
|
||||
## Fallbacks
|
||||
|
||||
### Context Window Fallbacks
|
||||
```python
|
||||
from litellm import completion
|
||||
|
||||
fallback_dict = {"gpt-3.5-turbo": "gpt-3.5-turbo-16k"}
|
||||
messages = [{"content": "how does a court case get to the Supreme Court?" * 500, "role": "user"}]
|
||||
|
||||
completion(model="gpt-3.5-turbo", messages=messages, context_window_fallback_dict=ctx_window_fallback_dict)
|
||||
```
|
||||
|
||||
### Fallbacks - Switch Models/API Keys/API Bases
|
||||
|
||||
LLM APIs can be unstable. completion() with fallbacks ensures you'll always get a response from your calls
|
||||
|
||||
### Usage
|
||||
#### Usage
|
||||
To use fallback models with `completion()`, specify a list of models in the `fallbacks` parameter.
|
||||
|
||||
The `fallbacks` list should include the primary model you want to use, followed by additional models that can be used as backups in case the primary model fails to provide a response.
|
||||
|
@ -76,6 +67,11 @@ response = completion(model="azure/gpt-4", messages=messages, api_key=api_key,
|
|||
fallbacks=[{"api_key": "good-key-1"}, {"api_key": "good-key-2", "api_base": "good-api-base-2"}])
|
||||
```
|
||||
|
||||
[Check out this section for implementation details](#fallbacks-1)
|
||||
|
||||
## Implementation Details
|
||||
|
||||
### Fallbacks
|
||||
#### Output from calls
|
||||
```
|
||||
Completion with 'bad-model': got exception Unable to map your input to a model. Check your input - {'model': 'bad-model'
|
||||
|
@ -112,7 +108,7 @@ completion call gpt-3.5-turbo
|
|||
When you pass `fallbacks` to `completion`, it makes the first `completion` call using the primary model specified as `model` in `completion(model=model)`. If the primary model fails or encounters an error, it automatically tries the `fallbacks` models in the specified order. This ensures a response even if the primary model is unavailable.
|
||||
|
||||
|
||||
### Key components of Model Fallbacks implementation:
|
||||
#### Key components of Model Fallbacks implementation:
|
||||
* Looping through `fallbacks`
|
||||
* Cool-Downs for rate-limited models
|
||||
|
||||
|
|
|
@ -2,11 +2,13 @@
|
|||
|
||||
- [Streaming Responses](#streaming-responses)
|
||||
- [Async Completion](#async-completion)
|
||||
- [Async + Streaming Completion](#async-streaming)
|
||||
|
||||
## Streaming Responses
|
||||
LiteLLM supports streaming the model response back by passing `stream=True` as an argument to the completion function
|
||||
### Usage
|
||||
```python
|
||||
from litellm import completion
|
||||
response = completion(model="gpt-3.5-turbo", messages=messages, stream=True)
|
||||
for chunk in response:
|
||||
print(chunk['choices'][0]['delta'])
|
||||
|
@ -35,36 +37,22 @@ print(response)
|
|||
We've implemented an `__anext__()` function in the streaming object returned. This enables async iteration over the streaming object.
|
||||
|
||||
### Usage
|
||||
Here's an example of using it with openai. But this
|
||||
Here's an example of using it with openai.
|
||||
```python
|
||||
from litellm import completion
|
||||
import asyncio, os, traceback, time
|
||||
|
||||
os.environ["OPENAI_API_KEY"] = "your-api-key"
|
||||
|
||||
def logger_fn(model_call_object: dict):
|
||||
print(f"LOGGER FUNCTION: {model_call_object}")
|
||||
|
||||
|
||||
user_message = "Hello, how are you?"
|
||||
messages = [{"content": user_message, "role": "user"}]
|
||||
from litellm import acompletion
|
||||
import asyncio, os, traceback
|
||||
|
||||
async def completion_call():
|
||||
try:
|
||||
response = completion(
|
||||
model="gpt-3.5-turbo", messages=messages, stream=True, logger_fn=logger_fn
|
||||
print("test acompletion + streaming")
|
||||
response = await acompletion(
|
||||
model="gpt-3.5-turbo",
|
||||
messages=[{"content": "Hello, how are you?", "role": "user"}],
|
||||
stream=True
|
||||
)
|
||||
print(f"response: {response}")
|
||||
complete_response = ""
|
||||
start_time = time.time()
|
||||
# Change for loop to async for loop
|
||||
async for chunk in response:
|
||||
chunk_time = time.time()
|
||||
print(f"time since initial request: {chunk_time - start_time:.5f}")
|
||||
print(chunk["choices"][0]["delta"])
|
||||
complete_response += chunk["choices"][0]["delta"].get("content", "")
|
||||
if complete_response == "":
|
||||
raise Exception("Empty response received")
|
||||
print(chunk)
|
||||
except:
|
||||
print(f"error occurred: {traceback.format_exc()}")
|
||||
pass
|
||||
|
|
|
@ -31,21 +31,10 @@ Encoding has model-specific tokenizers for anthropic, cohere, llama2 and openai.
|
|||
```python
|
||||
from litellm import encode, decode
|
||||
|
||||
|
||||
def test_encoding_and_decoding():
|
||||
try:
|
||||
sample_text = "Hellö World, this is my input string!"
|
||||
|
||||
# openai tokenizer
|
||||
openai_tokens = token_counter(model="gpt-3.5-turbo", text=sample_text)
|
||||
|
||||
openai_text = decode(model="gpt-3.5-turbo", tokens=openai_tokens)
|
||||
|
||||
assert openai_text == sample_text
|
||||
except:
|
||||
pass
|
||||
|
||||
test_encoding_and_decoding()
|
||||
sample_text = "Hellö World, this is my input string!"
|
||||
# openai encoding + decoding
|
||||
openai_tokens = encode(model="gpt-3.5-turbo", text=sample_text)
|
||||
print(openai_tokens)
|
||||
```
|
||||
|
||||
### 2. `decode`
|
||||
|
@ -55,21 +44,11 @@ Decoding is supported for anthropic, cohere, llama2 and openai.
|
|||
```python
|
||||
from litellm import encode, decode
|
||||
|
||||
|
||||
def test_encoding_and_decoding():
|
||||
try:
|
||||
sample_text = "Hellö World, this is my input string!"
|
||||
|
||||
# openai tokenizer
|
||||
openai_tokens = token_counter(model="gpt-3.5-turbo", text=sample_text)
|
||||
|
||||
openai_text = decode(model="gpt-3.5-turbo", tokens=openai_tokens)
|
||||
|
||||
assert openai_text == sample_text
|
||||
except:
|
||||
pass
|
||||
|
||||
test_encoding_and_decoding()
|
||||
sample_text = "Hellö World, this is my input string!"
|
||||
# openai encoding + decoding
|
||||
openai_tokens = encode(model="gpt-3.5-turbo", text=sample_text)
|
||||
openai_text = decode(model="gpt-3.5-turbo", tokens=openai_tokens)
|
||||
print(openai_text)
|
||||
```
|
||||
|
||||
### 3. `token_counter`
|
||||
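Before the full reference below, here's a minimal sketch that reuses the `model` + `text` call pattern shown in the `encode`/`decode` examples above:

```python
from litellm import token_counter

sample_text = "Hellö World, this is my input string!"

# count tokens with the model-specific tokenizer
num_tokens = token_counter(model="gpt-3.5-turbo", text=sample_text)
print(num_tokens)
```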
|
|
|
@ -52,22 +52,57 @@ print(response)
|
|||
|
||||
h/t to [Mikko](https://www.linkedin.com/in/mikkolehtimaki/) for this integration
|
||||
|
||||
|
||||
## Bedrock Embedding
|
||||
|
||||
### API keys
|
||||
This can be set as env variables or passed as **params to litellm.embedding()**
|
||||
```python
|
||||
import os
|
||||
os.environ["AWS_ACCESS_KEY_ID"] = "" # Access key
|
||||
os.environ["AWS_SECRET_ACCESS_KEY"] = "" # Secret access key
|
||||
os.environ["AWS_REGION_NAME"] = "" # us-east-1, us-east-2, us-west-1, us-west-2
|
||||
```
|
||||
|
||||
### Usage
|
||||
```python
|
||||
from litellm import embedding
|
||||
response = embedding(
|
||||
model="amazon.titan-embed-text-v1",
|
||||
input=["good morning from litellm"],
|
||||
)
|
||||
print(response)
|
||||
```
|
||||
|
||||
| Model Name | Function Call |
|
||||
|----------------------|---------------------------------------------|
|
||||
| Titan Embeddings - G1 | `embedding(model="amazon.titan-embed-text-v1", input=input)` |
|
||||
|
||||
|
||||
## Cohere Embedding Models
|
||||
https://docs.cohere.com/reference/embed
|
||||
|
||||
### Usage
|
||||
```python
|
||||
from litellm import embedding
|
||||
import os
|
||||
os.environ['COHERE_API_KEY'] = ""
|
||||
response = embedding('embed-english-v2.0', input=["good morning from litellm"])
|
||||
```
|
||||
os.environ["COHERE_API_KEY"] = "cohere key"
|
||||
|
||||
| Model Name | Function Call | Required OS Variables |
|
||||
|-----------------------|--------------------------------------------------------------|-------------------------------------------------|
|
||||
| embed-english-v2.0 | `embedding('embed-english-v2.0', input=input)` | `os.environ['COHERE_API_KEY']` |
|
||||
| embed-english-light-v2.0 | `embedding('embed-english-light-v2.0', input=input)` | `os.environ['COHERE_API_KEY']` |
|
||||
| embed-multilingual-v2.0 | `embedding('embed-multilingual-v2.0', input=input)` | `os.environ['COHERE_API_KEY']` |
|
||||
# cohere call
|
||||
response = embedding(
|
||||
model="embed-english-v3.0",
|
||||
input=["good morning from litellm", "this is another item"],
|
||||
input_type="search_document" # optional param for v3 llms
|
||||
)
|
||||
```
|
||||
| Model Name | Function Call |
|
||||
|--------------------------|--------------------------------------------------------------|
|
||||
| embed-english-v3.0 | `embedding(model="embed-english-v3.0", input=["good morning from litellm", "this is another item"])` |
|
||||
| embed-english-light-v3.0 | `embedding(model="embed-english-light-v3.0", input=["good morning from litellm", "this is another item"])` |
|
||||
| embed-multilingual-v3.0 | `embedding(model="embed-multilingual-v3.0", input=["good morning from litellm", "this is another item"])` |
|
||||
| embed-multilingual-light-v3.0 | `embedding(model="embed-multilingual-light-v3.0", input=["good morning from litellm", "this is another item"])` |
|
||||
| embed-english-v2.0 | `embedding(model="embed-english-v2.0", input=["good morning from litellm", "this is another item"])` |
|
||||
| embed-english-light-v2.0 | `embedding(model="embed-english-light-v2.0", input=["good morning from litellm", "this is another item"])` |
|
||||
| embed-multilingual-v2.0 | `embedding(model="embed-multilingual-v2.0", input=["good morning from litellm", "this is another item"])` |
|
||||
|
||||
## HuggingFace Embedding Models
|
||||
LiteLLM supports all Feature-Extraction Embedding models: https://huggingface.co/models?pipeline_tag=feature-extraction
|
||||
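As a hedged sketch (the model id and endpoint are placeholders; an `api_base` is only needed if you're hitting your own Inference Endpoint):

```python
import os
from litellm import embedding

os.environ["HUGGINGFACE_API_KEY"] = ""  # optional, for gated models / Inference Endpoints

response = embedding(
    model="huggingface/sentence-transformers/all-MiniLM-L6-v2",  # placeholder feature-extraction model
    input=["good morning from litellm"],
    # api_base="https://<your-hf-endpoint>",  # optional: point at your own endpoint
)
print(response)
```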
|
|
|
@ -1,5 +1,7 @@
|
|||
# Contributing to Documentation
|
||||
|
||||
This website is built using [Docusaurus 2](https://docusaurus.io/), a modern static website generator.
|
||||
|
||||
Clone litellm
|
||||
```
|
||||
git clone https://github.com/BerriAI/litellm.git
|
||||
|
@ -9,16 +11,28 @@ git clone https://github.com/BerriAI/litellm.git
|
|||
|
||||
#### Installation
|
||||
```
|
||||
pip install mkdocs
|
||||
npm install --global yarn
|
||||
```
|
||||
|
||||
#### Locally Serving Docs
|
||||
|
||||
### Local Development
|
||||
|
||||
```
|
||||
mkdocs serve
|
||||
cd docs/my-website
|
||||
```
|
||||
If you see `command not found: mkdocs` try running the following
|
||||
|
||||
Let's install the requirements
|
||||
|
||||
```
|
||||
python3 -m mkdocs serve
|
||||
yarn
|
||||
```
|
||||
Run website
|
||||
|
||||
```
|
||||
yarn start
|
||||
```
|
||||
Open docs here: [http://localhost:3000/](http://localhost:3000/)
|
||||
|
||||
```
|
||||
|
||||
This command builds your Markdown files into HTML and starts a development server to browse your documentation. Open up [http://127.0.0.1:8000/](http://127.0.0.1:8000/) in your web browser to see your documentation. You can make changes to your Markdown files and your docs will automatically rebuild.
|
||||
|
|
|
@ -395,9 +395,6 @@ response = completion(
|
|||
)
|
||||
```
|
||||
|
||||
Need a dedicated key? Email us @ krrish@berri.ai
|
||||
|
||||
|
||||
## More details
|
||||
* [exception mapping](./exception_mapping.md)
|
||||
* [retries + model fallbacks for completion()](./completion/reliable_completions.md)
|
||||
|
|
|
@ -25,7 +25,7 @@ litellm.success_callback=["posthog", "helicone", "llmonitor"]
|
|||
litellm.failure_callback=["sentry", "llmonitor"]
|
||||
|
||||
## set env variables
|
||||
os.environ['SENTRY_API_URL'], os.environ['SENTRY_API_TRACE_RATE']= ""
|
||||
os.environ['SENTRY_DSN'], os.environ['SENTRY_API_TRACE_RATE']= ""
|
||||
os.environ['POSTHOG_API_KEY'], os.environ['POSTHOG_API_URL'] = "api-key", "api-url"
|
||||
os.environ["HELICONE_API_KEY"] = ""
|
||||
os.environ["TRACELOOP_API_KEY"] = ""
|
||||
|
|
|
@ -1,4 +1,39 @@
|
|||
# Custom Callback Functions for Completion()
|
||||
# Custom Callbacks
|
||||
|
||||
## Callback Class
|
||||
You can create a custom callback class to precisely log events as they occur in litellm.
|
||||
|
||||
```python
|
||||
import litellm
from litellm import completion
from litellm.integrations.custom_logger import CustomLogger
|
||||
|
||||
class MyCustomHandler(CustomLogger):
|
||||
def log_pre_api_call(self, model, messages, kwargs):
|
||||
print(f"Pre-API Call")
|
||||
|
||||
def log_post_api_call(self, kwargs, response_obj, start_time, end_time):
|
||||
print(f"Post-API Call")
|
||||
|
||||
def log_stream_event(self, kwargs, response_obj, start_time, end_time):
|
||||
print(f"On Stream")
|
||||
|
||||
def log_success_event(self, kwargs, response_obj, start_time, end_time):
|
||||
print(f"On Success")
|
||||
|
||||
def log_failure_event(self, kwargs, response_obj, start_time, end_time):
|
||||
print(f"On Failure")
|
||||
|
||||
customHandler = MyCustomHandler()
|
||||
|
||||
litellm.callbacks = [customHandler]
|
||||
response = completion(model="gpt-3.5-turbo", messages=[{ "role": "user", "content": "Hi 👋 - i'm openai"}],
|
||||
stream=True)
|
||||
for chunk in response:
|
||||
continue
|
||||
```
|
||||
|
||||
## Callback Functions
|
||||
If you just want to log on a specific event (e.g. on input) - you can use callback functions.
|
||||
|
||||
You can set custom callbacks to trigger for:
|
||||
- `litellm.input_callback` - Track inputs/transformed inputs before making the LLM API call
|
||||
- `litellm.success_callback` - Track inputs/outputs after making LLM API call
|
||||
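As a hedged sketch (this assumes the `(kwargs, completion_response, start_time, end_time)` signature for success callbacks; check your installed version if it differs):

```python
import litellm
from litellm import completion

def log_success(kwargs, completion_response, start_time, end_time):
    # runs after every successful LLM API call
    print("model:", kwargs.get("model"))
    print("response:", completion_response)

litellm.success_callback = [log_success]

response = completion(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "Hi 👋 - i'm openai"}],
)
```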
|
|
|
@ -1,20 +1,36 @@
|
|||
# Sentry Tutorial
|
||||
import Image from '@theme/IdealImage';
|
||||
|
||||
# Sentry - Log LLM Exceptions
|
||||
[Sentry](https://sentry.io/) provides error monitoring for production. LiteLLM can add breadcrumbs and send exceptions to Sentry with this integration
|
||||
|
||||
This works on normal, async and streaming completion calls
|
||||
Track exceptions for:
|
||||
- litellm.completion() - completion() for 100+ LLMs
|
||||
- litellm.acompletion() - async completion()
|
||||
- Streaming completion() & acompletion() calls
|
||||
|
||||
### usage
|
||||
<Image img={require('../../img/sentry.png')} />
|
||||
|
||||
|
||||
## Usage
|
||||
|
||||
### Set SENTRY_DSN & callback
|
||||
|
||||
```python
|
||||
import litellm, os
|
||||
os.environ["SENTRY_DSN"] = "your-sentry-url"
|
||||
litellm.failure_callback=["sentry"]
|
||||
```
|
||||
|
||||
### Sentry callback with completion
|
||||
```python
|
||||
import litellm
|
||||
from litellm import completion
|
||||
litellm.set_verbose = True
|
||||
|
||||
litellm.input_callback=["sentry"] # adds sentry breadcrumbing
|
||||
litellm.failure_callback=["sentry"] # [OPTIONAL] if you want litellm to capture -> send exception to sentry
|
||||
|
||||
import os
|
||||
os.environ["SENTRY_API_URL"] = "your-sentry-url"
|
||||
os.environ["SENTRY_DSN"] = "your-sentry-url"
|
||||
os.environ["OPENAI_API_KEY"] = "your-openai-key"
|
||||
|
||||
# set bad key to trigger error
|
||||
|
|
|
@ -29,11 +29,53 @@ os.environ["AWS_SECRET_ACCESS_KEY"] = ""
|
|||
os.environ["AWS_REGION_NAME"] = ""
|
||||
|
||||
response = completion(
|
||||
model="anthropic.claude-instant-v1",
|
||||
messages=[{ "content": "Hello, how are you?","role": "user"}]
|
||||
model="anthropic.claude-instant-v1",
|
||||
messages=[{ "content": "Hello, how are you?","role": "user"}]
|
||||
)
|
||||
```
|
||||
|
||||
## Usage - Streaming
|
||||
```python
|
||||
import os
|
||||
from litellm import completion
|
||||
|
||||
os.environ["AWS_ACCESS_KEY_ID"] = ""
|
||||
os.environ["AWS_SECRET_ACCESS_KEY"] = ""
|
||||
os.environ["AWS_REGION_NAME"] = ""
|
||||
|
||||
response = completion(
|
||||
model="anthropic.claude-instant-v1",
|
||||
messages=[{ "content": "Hello, how are you?","role": "user"}],
|
||||
stream=True
|
||||
)
|
||||
for chunk in response:
|
||||
print(chunk)
|
||||
```
|
||||
|
||||
#### Example Streaming Output Chunk
|
||||
```json
|
||||
{
|
||||
"choices": [
|
||||
{
|
||||
"finish_reason": null,
|
||||
"index": 0,
|
||||
"delta": {
|
||||
"content": "ase can appeal the case to a higher federal court. If a higher federal court rules in a way that conflicts with a ruling from a lower federal court or conflicts with a ruling from a higher state court, the parties involved in the case can appeal the case to the Supreme Court. In order to appeal a case to the Sup"
|
||||
}
|
||||
}
|
||||
],
|
||||
"created": null,
|
||||
"model": "anthropic.claude-instant-v1",
|
||||
"usage": {
|
||||
"prompt_tokens": null,
|
||||
"completion_tokens": null,
|
||||
"total_tokens": null
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
## Boto3 - Authentication
|
||||
|
||||
### Passing credentials as parameters - Completion()
|
||||
Pass AWS credentials as parameters to litellm.completion
|
||||
```python
|
||||
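# Hedged sketch (the original snippet is elided by this diff hunk): litellm's bedrock
# integration accepts AWS credentials as completion() params. The parameter names below
# mirror the env-var names documented above and are assumptions, not guarantees.
from litellm import completion

response = completion(
    model="anthropic.claude-instant-v1",
    messages=[{"content": "Hello, how are you?", "role": "user"}],
    aws_access_key_id="",       # assumed param name
    aws_secret_access_key="",   # assumed param name
    aws_region_name="us-east-1",
)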
|
@ -93,8 +135,8 @@ response = completion(
|
|||
## Supported AWS Bedrock Models
|
||||
Here's an example of using a bedrock model with LiteLLM
|
||||
|
||||
| Model Name | Command | Environment Variables |
|
||||
|--------------------------|------------------------------------------------------------------|---------------------------------------------------------------------|
|
||||
| Model Name | Command |
|
||||
|--------------------------|------------------------------------------------------------------|
|
||||
| Anthropic Claude-V2 | `completion(model='anthropic.claude-v2', messages=messages)` | `os.environ['ANTHROPIC_ACCESS_KEY_ID']`, `os.environ['ANTHROPIC_SECRET_ACCESS_KEY']` |
|
||||
| Anthropic Claude-Instant V1 | `completion(model='anthropic.claude-instant-v1', messages=messages)` | `os.environ['ANTHROPIC_ACCESS_KEY_ID']`, `os.environ['ANTHROPIC_SECRET_ACCESS_KEY']` |
|
||||
| Anthropic Claude-V1 | `completion(model='anthropic.claude-v1', messages=messages)` | `os.environ['ANTHROPIC_ACCESS_KEY_ID']`, `os.environ['ANTHROPIC_SECRET_ACCESS_KEY']` |
|
||||
|
@ -104,45 +146,29 @@ Here's an example of using a bedrock model with LiteLLM
|
|||
| AI21 J2-Mid | `completion(model='ai21.j2-mid-v1', messages=messages)` | `os.environ['AWS_ACCESS_KEY_ID']`, `os.environ['AWS_SECRET_ACCESS_KEY']`, `os.environ['AWS_REGION_NAME']` |
|
||||
| AI21 J2-Ultra | `completion(model='ai21.j2-ultra-v1', messages=messages)` | `os.environ['AWS_ACCESS_KEY_ID']`, `os.environ['AWS_SECRET_ACCESS_KEY']`, `os.environ['AWS_REGION_NAME']` |
|
||||
|
||||
## Bedrock Embedding
|
||||
|
||||
## Streaming
|
||||
|
||||
### API keys
|
||||
This can be set as env variables or passed as **params to litellm.embedding()**
|
||||
```python
|
||||
import os
|
||||
from litellm import completion
|
||||
import os
|
||||
os.environ["AWS_ACCESS_KEY_ID"] = "" # Access key
|
||||
os.environ["AWS_SECRET_ACCESS_KEY"] = "" # Secret access key
|
||||
os.environ["AWS_REGION_NAME"] = "" # us-east-1, us-east-2, us-west-1, us-west-2
|
||||
```
|
||||
|
||||
os.environ["AWS_ACCESS_KEY_ID"] = ""
|
||||
os.environ["AWS_SECRET_ACCESS_KEY"] = ""
|
||||
os.environ["AWS_REGION_NAME"] = ""
|
||||
|
||||
response = completion(
|
||||
model="bedrock/anthropic.claude-instant-v1",
|
||||
messages=[{ "content": "Hello, how are you?","role": "user"}],
|
||||
stream=True
|
||||
### Usage
|
||||
```python
|
||||
from litellm import embedding
|
||||
response = embedding(
|
||||
model="amazon.titan-embed-text-v1",
|
||||
input=["good morning from litellm"],
|
||||
)
|
||||
|
||||
for chunk in response:
|
||||
print(chunk)
|
||||
print(response)
|
||||
```
|
||||
|
||||
### Example Streaming Output Chunk
|
||||
```json
|
||||
{
|
||||
"choices": [
|
||||
{
|
||||
"finish_reason": null,
|
||||
"index": 0,
|
||||
"delta": {
|
||||
"content": "ase can appeal the case to a higher federal court. If a higher federal court rules in a way that conflicts with a ruling from a lower federal court or conflicts with a ruling from a higher state court, the parties involved in the case can appeal the case to the Supreme Court. In order to appeal a case to the Sup"
|
||||
}
|
||||
}
|
||||
],
|
||||
"created": null,
|
||||
"model": "amazon.titan-tg1-large",
|
||||
"usage": {
|
||||
"prompt_tokens": null,
|
||||
"completion_tokens": null,
|
||||
"total_tokens": null
|
||||
}
|
||||
}
|
||||
```
|
||||
## Supported AWS Bedrock Embedding Models
|
||||
|
||||
| Model Name | Function Call |
|
||||
|----------------------|---------------------------------------------|
|
||||
| Titan Embeddings - G1 | `embedding(model="amazon.titan-embed-text-v1", input=input)` |
|
||||
|
|
|
@ -1,27 +1,90 @@
|
|||
# Cohere
|
||||
|
||||
LiteLLM supports 'command', 'command-light', 'command-medium', 'command-medium-beta', 'command-xlarge-beta', 'command-nightly' models from [Cohere](https://cohere.com/).
|
||||
|
||||
Like AI21, these models are available without a waitlist.
|
||||
|
||||
### API KEYS
|
||||
## API KEYS
|
||||
|
||||
```python
|
||||
import os
|
||||
os.environ["COHERE_API_KEY"] = ""
|
||||
```
|
||||
|
||||
### Example Usage
|
||||
## Usage
|
||||
|
||||
```python
|
||||
|
||||
from litellm import completion
|
||||
|
||||
## set ENV variables
|
||||
os.environ["COHERE_API_KEY"] = "cohere key"
|
||||
|
||||
messages = [{ "content": "Hello, how are you?","role": "user"}]
|
||||
# cohere call
|
||||
response = completion(
|
||||
model="command-nightly",
|
||||
messages = [{ "content": "Hello, how are you?","role": "user"}]
|
||||
)
|
||||
```
|
||||
|
||||
## Usage - Streaming
|
||||
|
||||
```python
|
||||
from litellm import completion
|
||||
|
||||
## set ENV variables
|
||||
os.environ["COHERE_API_KEY"] = "cohere key"
|
||||
|
||||
# cohere call
|
||||
response = completion("command-nightly", messages)
|
||||
```
|
||||
response = completion(
|
||||
model="command-nightly",
|
||||
messages = [{ "content": "Hello, how are you?","role": "user"}],
|
||||
stream=True
|
||||
)
|
||||
|
||||
for chunk in response:
|
||||
print(chunk)
|
||||
```
|
||||
|
||||
LiteLLM supports 'command', 'command-light', 'command-medium', 'command-medium-beta', 'command-xlarge-beta', 'command-nightly' models from [Cohere](https://cohere.com/).
|
||||
|
||||
## Embedding
|
||||
|
||||
```python
|
||||
from litellm import embedding
|
||||
os.environ["COHERE_API_KEY"] = "cohere key"
|
||||
|
||||
# cohere call
|
||||
response = embedding(
|
||||
model="embed-english-v3.0",
|
||||
input=["good morning from litellm", "this is another item"],
|
||||
)
|
||||
```
|
||||
|
||||
### Setting - Input Type for v3 models
|
||||
v3 models have a required parameter, `input_type`, which can be one of the following four values:
|
||||
|
||||
- `input_type="search_document"`: (default) Use this for texts (documents) you want to store in your vector database
|
||||
- `input_type="search_query"`: Use this for search queries to find the most relevant documents in your vector database
|
||||
- `input_type="classification"`: Use this if you use the embeddings as an input for a classification system
|
||||
- `input_type="clustering"`: Use this if you use the embeddings for text clustering
|
||||
|
||||
https://txt.cohere.com/introducing-embed-v3/
|
||||
```python
|
||||
from litellm import embedding
|
||||
os.environ["COHERE_API_KEY"] = "cohere key"
|
||||
|
||||
# cohere call
|
||||
response = embedding(
|
||||
model="embed-english-v3.0",
|
||||
input=["good morning from litellm", "this is another item"],
|
||||
input_type="search_document"
|
||||
)
|
||||
```
|
||||
|
||||
### Supported Embedding Models
|
||||
| Model Name | Function Call |
|
||||
|--------------------------|--------------------------------------------------------------|
|
||||
| embed-english-v3.0 | `embedding(model="embed-english-v3.0", input=["good morning from litellm", "this is another item"])` |
|
||||
| embed-english-light-v3.0 | `embedding(model="embed-english-light-v3.0", input=["good morning from litellm", "this is another item"])` |
|
||||
| embed-multilingual-v3.0 | `embedding(model="embed-multilingual-v3.0", input=["good morning from litellm", "this is another item"])` |
|
||||
| embed-multilingual-light-v3.0 | `embedding(model="embed-multilingual-light-v3.0", input=["good morning from litellm", "this is another item"])` |
|
||||
| embed-english-v2.0 | `embedding(model="embed-english-v2.0", input=["good morning from litellm", "this is another item"])` |
|
||||
| embed-english-light-v2.0 | `embedding(model="embed-english-light-v2.0", input=["good morning from litellm", "this is another item"])` |
|
||||
| embed-multilingual-v2.0 | `embedding(model="embed-multilingual-v2.0", input=["good morning from litellm", "this is another item"])` |
|
||||
|
||||
|
|
|
@ -128,7 +128,9 @@ response = embedding(
|
|||
)
|
||||
```
|
||||
|
||||
### [OPTIONAL] API KEYS + API BASE
|
||||
## Advanced
|
||||
|
||||
### Setting API KEYS + API BASE
|
||||
If required, you can set the api key + api base in your os environment. [Code for how it's sent](https://github.com/BerriAI/litellm/blob/0100ab2382a0e720c7978fbf662cc6e6920e7e03/litellm/llms/huggingface_restapi.py#L25)
|
||||
|
||||
```python
|
||||
|
@ -137,6 +139,72 @@ os.environ["HUGGINGFACE_API_KEY"] = ""
|
|||
os.environ["HUGGINGFACE_API_BASE"] = ""
|
||||
```
|
||||
|
||||
### Viewing Log probs
|
||||
|
||||
#### Using `decoder_input_details` - OpenAI `echo`
|
||||
The `echo` param is supported by OpenAI Completions - Use `litellm.text_completion()` for this
|
||||
```python
|
||||
from litellm import text_completion
|
||||
response = text_completion(
|
||||
model="huggingface/bigcode/starcoder",
|
||||
prompt="good morning",
|
||||
max_tokens=10, logprobs=10,
|
||||
echo=True
|
||||
)
|
||||
```
|
||||
|
||||
#### Output
|
||||
```json
|
||||
{
|
||||
"id":"chatcmpl-3fc71792-c442-4ba1-a611-19dd0ac371ad",
|
||||
"object":"text_completion",
|
||||
"created":1698801125.936519,
|
||||
"model":"bigcode/starcoder",
|
||||
"choices":[
|
||||
{
|
||||
"text":", I'm going to make you a sand",
|
||||
"index":0,
|
||||
"logprobs":{
|
||||
"tokens":[
|
||||
"good",
|
||||
" morning",
|
||||
",",
|
||||
" I",
|
||||
"'m",
|
||||
" going",
|
||||
" to",
|
||||
" make",
|
||||
" you",
|
||||
" a",
|
||||
" s",
|
||||
"and"
|
||||
],
|
||||
"token_logprobs":[
|
||||
"None",
|
||||
-14.96875,
|
||||
-2.2285156,
|
||||
-2.734375,
|
||||
-2.0957031,
|
||||
-2.0917969,
|
||||
-0.09429932,
|
||||
-3.1132812,
|
||||
-1.3203125,
|
||||
-1.2304688,
|
||||
-1.6201172,
|
||||
-0.010292053
|
||||
]
|
||||
},
|
||||
"finish_reason":"length"
|
||||
}
|
||||
],
|
||||
"usage":{
|
||||
"completion_tokens":9,
|
||||
"prompt_tokens":2,
|
||||
"total_tokens":11
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
### Models with Prompt Formatting
|
||||
For models with special prompt templates (e.g. Llama2), we format the prompt to fit their template.
|
||||
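As a hedged sketch (model id and endpoint are placeholders), calling a Llama2 chat model this way lets litellm wrap your messages in the model's own prompt template:

```python
import os
from litellm import completion

os.environ["HUGGINGFACE_API_KEY"] = ""

response = completion(
    model="huggingface/meta-llama/Llama-2-7b-chat-hf",  # placeholder llama-2 chat model
    messages=[{"role": "user", "content": "Hello, how are you?"}],
    api_base="https://<your-hf-endpoint>",  # your deployed endpoint
)
```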
|
||||
|
@ -198,7 +266,7 @@ test_huggingface_custom_model()
|
|||
|
||||
[Implementation Code](https://github.com/BerriAI/litellm/blob/c0b3da2c14c791a0b755f0b1e5a9ef065951ecbf/litellm/llms/huggingface_restapi.py#L52)
|
||||
|
||||
## deploying a model on huggingface
|
||||
### Deploying a model on huggingface
|
||||
You can use any chat/text model from Hugging Face with the following steps:
|
||||
|
||||
* Copy your model id/url from Huggingface Inference Endpoints
|
||||
|
|
|
@ -89,8 +89,8 @@ For Ollama LiteLLM Provides a Docker Image for an OpenAI API compatible server f
|
|||
|
||||
### Quick Start:
|
||||
Docker Hub:
|
||||
https://hub.docker.com/repository/docker/litellm/ollama/general
|
||||
|
||||
For ARM Processors: https://hub.docker.com/repository/docker/litellm/ollama/general
|
||||
For Intel/AMD Processors: to be added
|
||||
```shell
|
||||
docker pull litellm/ollama
|
||||
```
|
||||
|
|
|
@ -1,17 +1,12 @@
|
|||
# OpenAI
|
||||
LiteLLM supports OpenAI Chat + Text completion and embedding calls.
|
||||
|
||||
### API Keys
|
||||
### Required API Keys
|
||||
|
||||
```python
|
||||
import os
|
||||
|
||||
os.environ["OPENAI_API_KEY"] = "your-api-key"
|
||||
```
|
||||
**Need a dedicated key?**
|
||||
Email us @ krrish@berri.ai
|
||||
|
||||
[**See all supported models by the litellm api key**](../proxy_api.md#supported-models-for-litellm-key)
|
||||
|
||||
### Usage
|
||||
```python
|
||||
|
@ -20,44 +15,70 @@ from litellm import completion
|
|||
|
||||
os.environ["OPENAI_API_KEY"] = "your-api-key"
|
||||
|
||||
|
||||
messages = [{ "content": "Hello, how are you?","role": "user"}]
|
||||
|
||||
# openai call
|
||||
response = completion("gpt-3.5-turbo", messages)
|
||||
response = completion(
|
||||
model = "gpt-3.5-turbo",
|
||||
messages=[{ "content": "Hello, how are you?","role": "user"}]
|
||||
)
|
||||
```
|
||||
|
||||
### Optional Keys - OpenAI Organization, OpenAI API Base
|
||||
|
||||
```python
|
||||
import os
|
||||
os.environ["OPENAI_ORGANIZATION"] = "your-org-id" # OPTIONAL
|
||||
os.environ["OPENAI_API_BASE"] = "openaiai-api-base" # OPTIONAL
|
||||
```
|
||||
|
||||
### OpenAI Chat Completion Models
|
||||
|
||||
| Model Name | Function Call | Required OS Variables |
|
||||
|------------------|----------------------------------------|--------------------------------------|
|
||||
| gpt-3.5-turbo | `completion('gpt-3.5-turbo', messages)` | `os.environ['OPENAI_API_KEY']` |
|
||||
| gpt-3.5-turbo-0301 | `completion('gpt-3.5-turbo-0301', messages)` | `os.environ['OPENAI_API_KEY']` |
|
||||
| gpt-3.5-turbo-0613 | `completion('gpt-3.5-turbo-0613', messages)` | `os.environ['OPENAI_API_KEY']` |
|
||||
| gpt-3.5-turbo-16k | `completion('gpt-3.5-turbo-16k', messages)` | `os.environ['OPENAI_API_KEY']` |
|
||||
| gpt-3.5-turbo-16k-0613 | `completion('gpt-3.5-turbo-16k-0613', messages)` | `os.environ['OPENAI_API_KEY']` |
|
||||
| gpt-4 | `completion('gpt-4', messages)` | `os.environ['OPENAI_API_KEY']` |
|
||||
| gpt-4-0314 | `completion('gpt-4-0314', messages)` | `os.environ['OPENAI_API_KEY']` |
|
||||
| gpt-4-0613 | `completion('gpt-4-0613', messages)` | `os.environ['OPENAI_API_KEY']` |
|
||||
| gpt-4-32k | `completion('gpt-4-32k', messages)` | `os.environ['OPENAI_API_KEY']` |
|
||||
| gpt-4-32k-0314 | `completion('gpt-4-32k-0314', messages)` | `os.environ['OPENAI_API_KEY']` |
|
||||
| gpt-4-32k-0613 | `completion('gpt-4-32k-0613', messages)` | `os.environ['OPENAI_API_KEY']` |
|
||||
| Model Name | Function Call |
|
||||
|-----------------------|-----------------------------------------------------------------|
|
||||
| gpt-3.5-turbo | `response = completion(model="gpt-3.5-turbo", messages=messages)` |
|
||||
| gpt-3.5-turbo-0301 | `response = completion(model="gpt-3.5-turbo-0301", messages=messages)` |
|
||||
| gpt-3.5-turbo-0613 | `response = completion(model="gpt-3.5-turbo-0613", messages=messages)` |
|
||||
| gpt-3.5-turbo-16k | `response = completion(model="gpt-3.5-turbo-16k", messages=messages)` |
|
||||
| gpt-3.5-turbo-16k-0613| `response = completion(model="gpt-3.5-turbo-16k-0613", messages=messages)` |
|
||||
| gpt-4 | `response = completion(model="gpt-4", messages=messages)` |
|
||||
| gpt-4-0314 | `response = completion(model="gpt-4-0314", messages=messages)` |
|
||||
| gpt-4-0613 | `response = completion(model="gpt-4-0613", messages=messages)` |
|
||||
| gpt-4-32k | `response = completion(model="gpt-4-32k", messages=messages)` |
|
||||
| gpt-4-32k-0314 | `response = completion(model="gpt-4-32k-0314", messages=messages)` |
|
||||
| gpt-4-32k-0613 | `response = completion(model="gpt-4-32k-0613", messages=messages)` |
|
||||
|
||||
|
||||
These also support the `OPENAI_API_BASE` environment variable, which can be used to specify a custom API endpoint.
|
||||
|
||||
### OpenAI Text Completion Models / Instruct Models
|
||||
|
||||
| Model Name | Function Call | Required OS Variables |
|
||||
|------------------|--------------------------------------------|--------------------------------------|
|
||||
| gpt-3.5-turbo-instruct | `completion('gpt-3.5-turbo-instruct', messages)` | `os.environ['OPENAI_API_KEY']` |
|
||||
| text-davinci-003 | `completion('text-davinci-003', messages)` | `os.environ['OPENAI_API_KEY']` |
|
||||
| ada-001 | `completion('ada-001', messages)` | `os.environ['OPENAI_API_KEY']` |
|
||||
| curie-001 | `completion('curie-001', messages)` | `os.environ['OPENAI_API_KEY']` |
|
||||
| babbage-001 | `completion('babbage-001', messages)` | `os.environ['OPENAI_API_KEY']` |
|
||||
| babbage-002 | `completion('babbage-002', messages)` | `os.environ['OPENAI_API_KEY']` |
|
||||
| davinci-002 | `completion('davinci-002', messages)` | `os.environ['OPENAI_API_KEY']` |
|
||||
| Model Name | Function Call |
|
||||
|---------------------|----------------------------------------------------|
|
||||
| gpt-3.5-turbo-instruct | `response = completion(model="gpt-3.5-turbo-instruct", messages=messages)` |
|
||||
| text-davinci-003 | `response = completion(model="text-davinci-003", messages=messages)` |
|
||||
| ada-001 | `response = completion(model="ada-001", messages=messages)` |
|
||||
| curie-001 | `response = completion(model="curie-001", messages=messages)` |
|
||||
| babbage-001 | `response = completion(model="babbage-001", messages=messages)` |
|
||||
| babbage-002 | `response = completion(model="babbage-002", messages=messages)` |
|
||||
| davinci-002 | `response = completion(model="davinci-002", messages=messages)` |
|
||||
|
||||
|
||||
### Setting Organization-ID for completion calls
|
||||
This can be set in one of the following ways:
|
||||
- Environment Variable `OPENAI_ORGANIZATION`
|
||||
- Params to `litellm.completion(model=model, organization="your-organization-id")`
|
||||
- Set as `litellm.organization="your-organization-id"`
|
||||
```python
|
||||
import os
|
||||
from litellm import completion
|
||||
|
||||
os.environ["OPENAI_API_KEY"] = "your-api-key"
|
||||
os.environ["OPENAI_ORGANIZATION"] = "your-org-id" # OPTIONAL
|
||||
|
||||
response = completion(
|
||||
model = "gpt-3.5-turbo",
|
||||
messages=[{ "content": "Hello, how are you?","role": "user"}]
|
||||
)
|
||||
```
|
||||
### Using Helicone Proxy with LiteLLM
|
||||
```python
|
||||
import os
|
||||
|
|
|
@ -6,8 +6,10 @@ import TabItem from '@theme/TabItem';
|
|||
A fast, and lightweight OpenAI-compatible server to call 100+ LLM APIs.
|
||||
|
||||
:::info
|
||||
This is deprecated. Support for the CLI tool will be removed in our next MAJOR release - https://github.com/BerriAI/litellm/discussions/648.
|
||||
:::
|
||||
|
||||
Docs outdated. New docs 👉 [here](./simple_proxy.md)
|
||||
|
||||
:::
|
||||
|
||||
## Usage
|
||||
```shell
|
||||
|
|
|
@ -1,4 +1,79 @@
|
|||
# Manage Multiple Deployments
|
||||
# Reliability - Fallbacks, Azure Deployments, etc.
|
||||
|
||||
# Reliability
|
||||
|
||||
LiteLLM helps prevent failed requests in 3 ways:
|
||||
- Retries
|
||||
- Fallbacks: Context Window + General
|
||||
- RateLimitManager
|
||||
|
||||
## Helper utils
|
||||
LiteLLM supports the following functions for reliability:
|
||||
* `litellm.longer_context_model_fallback_dict`: Dictionary which has a mapping for those models which have larger equivalents
|
||||
* `num_retries`: use tenacity retries
|
||||
* `completion()` with fallbacks: switch between models/keys/api bases in case of errors.
|
||||
* `router()`: An abstraction on top of completion + embeddings to route the request to a deployment with capacity (available tpm/rpm).
|
||||
|
||||
## Retry failed requests
|
||||
|
||||
Call it in completion like this `completion(..num_retries=2)`.
|
||||
|
||||
|
||||
Here's a quick look at how you can use it:
|
||||
|
||||
```python
|
||||
from litellm import completion
|
||||
|
||||
user_message = "Hello, whats the weather in San Francisco??"
|
||||
messages = [{"content": user_message, "role": "user"}]
|
||||
|
||||
# normal call
|
||||
response = completion(
|
||||
model="gpt-3.5-turbo",
|
||||
messages=messages,
|
||||
num_retries=2
|
||||
)
|
||||
```
|
||||
|
||||
## Fallbacks
|
||||
|
||||
### Context Window Fallbacks
|
||||
```python
|
||||
from litellm import completion
|
||||
|
||||
fallback_dict = {"gpt-3.5-turbo": "gpt-3.5-turbo-16k"}
|
||||
messages = [{"content": "how does a court case get to the Supreme Court?" * 500, "role": "user"}]
|
||||
|
||||
completion(model="gpt-3.5-turbo", messages=messages, context_window_fallback_dict=ctx_window_fallback_dict)
|
||||
```
|
||||
|
||||
### Fallbacks - Switch Models/API Keys/API Bases
|
||||
|
||||
LLM APIs can be unstable. completion() with fallbacks ensures you'll always get a response from your calls
|
||||
|
||||
#### Usage
|
||||
To use fallback models with `completion()`, specify a list of models in the `fallbacks` parameter.
|
||||
|
||||
The `fallbacks` list should include the primary model you want to use, followed by additional models that can be used as backups in case the primary model fails to provide a response.
|
||||
|
||||
#### switch models
|
||||
```python
|
||||
response = completion(model="bad-model", messages=messages,
|
||||
fallbacks=["gpt-3.5-turbo" "command-nightly"])
|
||||
```
|
||||
|
||||
#### switch api keys/bases (E.g. azure deployment)
|
||||
Switch between different keys for the same azure deployment, or use another deployment as well.
|
||||
|
||||
```python
|
||||
api_key="bad-key"
|
||||
response = completion(model="azure/gpt-4", messages=messages, api_key=api_key,
|
||||
fallbacks=[{"api_key": "good-key-1"}, {"api_key": "good-key-2", "api_base": "good-api-base-2"}])
|
||||
```
|
||||
|
||||
[Check out this section for implementation details](#fallbacks-1)
|
||||
|
||||
## Manage Multiple Deployments
|
||||
|
||||
Use this if you're trying to load-balance across multiple deployments (e.g. Azure/OpenAI).
|
||||
|
||||
|
@ -6,11 +81,7 @@ Use this if you're trying to load-balance across multiple deployments (e.g. Azur
|
|||
|
||||
In production, [Router connects to a Redis Cache](#redis-queue) to track usage across multiple deployments.
|
||||
|
||||
## Quick Start
|
||||
|
||||
```python
|
||||
pip install litellm
|
||||
```
|
||||
### Quick Start
|
||||
|
||||
```python
|
||||
from litellm import Router
|
||||
|
@ -54,7 +125,7 @@ response = router.completion(model="gpt-3.5-turbo",
|
|||
print(response)
|
||||
```
|
||||
|
||||
## Redis Queue
|
||||
### Redis Queue
|
||||
|
||||
In production, we use Redis to track usage across multiple Azure deployments.
|
||||
|
||||
|
@ -67,7 +138,7 @@ router = Router(model_list=model_list,
|
|||
print(response)
|
||||
```
|
||||
|
||||
## Deploy Router
|
||||
### Deploy Router
|
||||
|
||||
1. Clone repo
|
||||
```shell
|
||||
|
@ -99,4 +170,131 @@ curl 'http://0.0.0.0:8000/router/completions' \
|
|||
"model": "gpt-3.5-turbo",
|
||||
"messages": [{"role": "user", "content": "Hey"}]
|
||||
}'
|
||||
```
|
||||
|
||||
|
||||
## Implementation Details
|
||||
|
||||
### Fallbacks
|
||||
#### Output from calls
|
||||
```
|
||||
Completion with 'bad-model': got exception Unable to map your input to a model. Check your input - {'model': 'bad-model'
|
||||
|
||||
|
||||
|
||||
completion call gpt-3.5-turbo
|
||||
{
|
||||
"id": "chatcmpl-7qTmVRuO3m3gIBg4aTmAumV1TmQhB",
|
||||
"object": "chat.completion",
|
||||
"created": 1692741891,
|
||||
"model": "gpt-3.5-turbo-0613",
|
||||
"choices": [
|
||||
{
|
||||
"index": 0,
|
||||
"message": {
|
||||
"role": "assistant",
|
||||
"content": "I apologize, but as an AI, I do not have the capability to provide real-time weather updates. However, you can easily check the current weather in San Francisco by using a search engine or checking a weather website or app."
|
||||
},
|
||||
"finish_reason": "stop"
|
||||
}
|
||||
],
|
||||
"usage": {
|
||||
"prompt_tokens": 16,
|
||||
"completion_tokens": 46,
|
||||
"total_tokens": 62
|
||||
}
|
||||
}
|
||||
|
||||
```
|
||||
|
||||
#### How does fallbacks work
|
||||
|
||||
When you pass `fallbacks` to `completion`, it makes the first `completion` call using the primary model specified as `model` in `completion(model=model)`. If the primary model fails or encounters an error, it automatically tries the `fallbacks` models in the specified order. This ensures a response even if the primary model is unavailable.
|
||||
|
||||
|
||||
#### Key components of Model Fallbacks implementation:
|
||||
* Looping through `fallbacks`
|
||||
* Cool-Downs for rate-limited models
|
||||
|
||||
#### Looping through `fallbacks`
|
||||
Allow `45 seconds` for each request. Within that window, this function first tries calling the primary model set as `model`; if it fails, it loops through the backup `fallbacks` models and attempts to get a response within the allocated `45s`, set here:
|
||||
```python
|
||||
while response == None and time.time() - start_time < 45:
|
||||
for model in fallbacks:
|
||||
```
|
||||
|
||||
#### Cool-Downs for rate-limited models
|
||||
If a model API call leads to an error, allow it to cool down for `60s`:
|
||||
```python
|
||||
except Exception as e:
|
||||
print(f"got exception {e} for model {model}")
|
||||
rate_limited_models.add(model)
|
||||
model_expiration_times[model] = (
|
||||
time.time() + 60
|
||||
) # cool down this selected model
|
||||
pass
|
||||
```
|
||||
|
||||
Before making an LLM API call, we check if the selected model is in `rate_limited_models`; if so, we skip the API call:
|
||||
```python
|
||||
if (
|
||||
model in rate_limited_models
|
||||
): # check if model is currently cooling down
|
||||
if (
|
||||
model_expiration_times.get(model)
|
||||
and time.time() >= model_expiration_times[model]
|
||||
):
|
||||
rate_limited_models.remove(
|
||||
model
|
||||
) # check if it's been 60s of cool down and remove model
|
||||
else:
|
||||
continue # skip model
|
||||
|
||||
```
|
||||
|
||||
#### Full code of completion with fallbacks()
|
||||
```python
|
||||
|
||||
response = None
|
||||
rate_limited_models = set()
|
||||
model_expiration_times = {}
|
||||
start_time = time.time()
|
||||
fallbacks = [kwargs["model"]] + kwargs["fallbacks"]
|
||||
del kwargs["fallbacks"] # remove fallbacks so it's not recursive
|
||||
|
||||
while response == None and time.time() - start_time < 45:
|
||||
for model in fallbacks:
|
||||
# loop thru all models
|
||||
try:
|
||||
if (
|
||||
model in rate_limited_models
|
||||
): # check if model is currently cooling down
|
||||
if (
|
||||
model_expiration_times.get(model)
|
||||
and time.time() >= model_expiration_times[model]
|
||||
):
|
||||
rate_limited_models.remove(
|
||||
model
|
||||
) # check if it's been 60s of cool down and remove model
|
||||
else:
|
||||
continue # skip model
|
||||
|
||||
# delete model from kwargs if it exists
|
||||
if kwargs.get("model"):
|
||||
del kwargs["model"]
|
||||
|
||||
print("making completion call", model)
|
||||
response = litellm.completion(**kwargs, model=model)
|
||||
|
||||
if response != None:
|
||||
return response
|
||||
|
||||
except Exception as e:
|
||||
print(f"got exception {e} for model {model}")
|
||||
rate_limited_models.add(model)
|
||||
model_expiration_times[model] = (
|
||||
time.time() + 60
|
||||
) # cool down this selected model
|
||||
pass
|
||||
return response
|
||||
```
|
|
@ -78,6 +78,14 @@ litellm.api_base = "https://hosted-llm-api.co"
|
|||
response = litellm.completion(messages=messages, model="gpt-3.5-turbo")
|
||||
```
|
||||
|
||||
### litellm.api_version
|
||||
|
||||
```python
|
||||
import litellm
|
||||
litellm.api_version = "2023-05-15"
|
||||
response = litellm.completion(messages=messages, model="gpt-3.5-turbo")
|
||||
```
|
||||
|
||||
### litellm.organization
|
||||
```python
|
||||
import litellm
|
||||
|
@ -124,7 +132,7 @@ response = completion("command-nightly", messages, api_version="2023-02-15")
|
|||
|
||||
Check if a user submitted a valid key for the model they're trying to call.
|
||||
|
||||
```
|
||||
```python
|
||||
key = "bad-key"
|
||||
response = check_valid_key(model="gpt-3.5-turbo", api_key=key)
|
||||
assert(response == False)
|
||||
|
@ -134,7 +142,7 @@ assert(response == False)
|
|||
|
||||
This helper reads the .env and returns a list of supported LLMs for the user
|
||||
|
||||
```
|
||||
```python
|
||||
old_environ = os.environ
|
||||
os.environ = {'OPENAI_API_KEY': 'temp'} # mock set only openai key in environ
|
||||
|
||||
|
|
|
@ -2,23 +2,337 @@ import Image from '@theme/IdealImage';
|
|||
import Tabs from '@theme/Tabs';
|
||||
import TabItem from '@theme/TabItem';
|
||||
|
||||
# 💥 LiteLLM Server - Deploy LiteLLM
|
||||
# 💥 Evaluate LLMs - OpenAI Proxy Server
|
||||
|
||||
A simple, fast, and lightweight **OpenAI-compatible server** to call 100+ LLM APIs in the OpenAI Input/Output format
|
||||
A simple, fast, and lightweight **OpenAI-compatible server** to call 100+ LLM APIs.
|
||||
|
||||
## Endpoints:
|
||||
- `/chat/completions` - chat completions endpoint to call 100+ LLMs
|
||||
- `/models` - available models on server
|
||||
LiteLLM Server supports:
|
||||
|
||||
[](https://l.linklyhq.com/l/1uHtX)
|
||||
[](https://l.linklyhq.com/l/1uHsr)
|
||||
[](https://docs.litellm.ai/docs/simple_proxy#deploy-on-aws-apprunner)
|
||||
* Call [Huggingface/Bedrock/TogetherAI/etc.](#other-supported-models) in the OpenAI ChatCompletions format
|
||||
* Set custom prompt templates + model-specific configs (temperature, max_tokens, etc.)
|
||||
* Caching (In-memory + Redis)
|
||||
|
||||
[**See Code**](https://github.com/BerriAI/litellm/tree/main/litellm_server)
|
||||
|
||||
:::info
|
||||
We want to learn how we can make the server better! Meet the [founders](https://calendly.com/d/4mp-gd3-k5k/berriai-1-1-onboarding-litellm-hosted-version) or
|
||||
join our [discord](https://discord.gg/wuPM9dRgDw)
|
||||
:::
|
||||
|
||||
## Quick Start
|
||||
|
||||
```shell
|
||||
$ litellm --model huggingface/bigcode/starcoder
|
||||
```
|
||||
OpenAI Proxy running on http://0.0.0.0:8000
|
||||
|
||||
```shell
|
||||
curl http://0.0.0.0:8000/v1/chat/completions \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{
|
||||
"model": "gpt-3.5-turbo",
|
||||
"messages": [{"role": "user", "content": "Say this is a test!"}],
|
||||
"temperature": 0.7
|
||||
}'
|
||||
```
|
||||
|
||||
This will now automatically route any requests for gpt-3.5-turbo to bigcode starcoder, hosted on huggingface inference endpoints.
|
||||
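You can sanity-check this from Python by pointing the `openai` client at the proxy (a hedged sketch using the same v0-style `openai.ChatCompletion` interface as the examples later on this page):

```python
import openai

openai.api_base = "http://0.0.0.0:8000"  # the litellm proxy
openai.api_key = "anything"              # placeholder; set a real key if your deployment needs one

completion = openai.ChatCompletion.create(
    model="gpt-3.5-turbo",  # routed to huggingface/bigcode/starcoder by the proxy
    messages=[{"role": "user", "content": "Say this is a test!"}],
)
print(completion.choices[0].message.content)
```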
|
||||
|
||||
#### Other supported models:
|
||||
<Tabs>
|
||||
<TabItem value="bedrock" label="Bedrock">
|
||||
|
||||
```shell
|
||||
$ export AWS_ACCESS_KEY_ID=""
|
||||
$ export AWS_REGION_NAME="" # e.g. us-west-2
|
||||
$ export AWS_SECRET_ACCESS_KEY=""
|
||||
$ litellm --model bedrock/anthropic.claude-v2
|
||||
```
|
||||
</TabItem>
|
||||
<TabItem value="vllm-local" label="VLLM">
|
||||
Assuming you're running vllm locally
|
||||
|
||||
```shell
|
||||
$ litellm --model vllm/facebook/opt-125m
|
||||
```
|
||||
</TabItem>
|
||||
<TabItem value="openai-proxy" label="OpenAI Compatible Server">
|
||||
|
||||
```shell
|
||||
$ litellm --model openai/<model_name> --api_base <your-api-base>
|
||||
```
|
||||
</TabItem>
|
||||
<TabItem value="huggingface" label="Huggingface (TGI)">
|
||||
|
||||
```shell
|
||||
$ export HUGGINGFACE_API_KEY=my-api-key #[OPTIONAL]
|
||||
$ litellm --model huggingface/<huggingface-model-name> --api_base https://<your-hf-endpoint> # e.g. huggingface/mistralai/Mistral-7B-v0.1
|
||||
```
|
||||
|
||||
</TabItem>
|
||||
<TabItem value="anthropic" label="Anthropic">
|
||||
|
||||
```shell
|
||||
$ export ANTHROPIC_API_KEY=my-api-key
|
||||
$ litellm --model claude-instant-1
|
||||
```
|
||||
|
||||
</TabItem>
|
||||
|
||||
<TabItem value="together_ai" label="TogetherAI">
|
||||
|
||||
```shell
|
||||
$ export TOGETHERAI_API_KEY=my-api-key
|
||||
$ litellm --model together_ai/lmsys/vicuna-13b-v1.5-16k
|
||||
```
|
||||
|
||||
</TabItem>
|
||||
|
||||
<TabItem value="replicate" label="Replicate">
|
||||
|
||||
```shell
|
||||
$ export REPLICATE_API_KEY=my-api-key
|
||||
$ litellm \
|
||||
--model replicate/meta/llama-2-70b-chat:02e509c789964a7ea8736978a43525956ef40397be9033abf9fd2badfe68c9e3
|
||||
```
|
||||
|
||||
</TabItem>
|
||||
|
||||
<TabItem value="petals" label="Petals">
|
||||
|
||||
```shell
|
||||
$ litellm --model petals/meta-llama/Llama-2-70b-chat-hf
|
||||
```
|
||||
|
||||
</TabItem>
|
||||
|
||||
<TabItem value="palm" label="Palm">
|
||||
|
||||
```shell
|
||||
$ export PALM_API_KEY=my-palm-key
|
||||
$ litellm --model palm/chat-bison
|
||||
```
|
||||
|
||||
</TabItem>
|
||||
|
||||
<TabItem value="azure" label="Azure OpenAI">
|
||||
|
||||
```shell
|
||||
$ export AZURE_API_KEY=my-api-key
|
||||
$ export AZURE_API_BASE=my-api-base
|
||||
|
||||
$ litellm --model azure/my-deployment-name
|
||||
```
|
||||
|
||||
</TabItem>
|
||||
|
||||
<TabItem value="ai21" label="AI21">
|
||||
|
||||
```shell
|
||||
$ export AI21_API_KEY=my-api-key
|
||||
$ litellm --model j2-light
|
||||
```
|
||||
|
||||
</TabItem>
|
||||
|
||||
<TabItem value="cohere" label="Cohere">
|
||||
|
||||
```shell
|
||||
$ export COHERE_API_KEY=my-api-key
|
||||
$ litellm --model command-nightly
|
||||
```
|
||||
|
||||
</TabItem>
|
||||
|
||||
</Tabs>
|
||||
|
||||
[**Jump to Code**](https://github.com/BerriAI/litellm/blob/fef4146396d5d87006259e00095a62e3900d6bb4/litellm/proxy.py#L36)
|
||||
|
||||
# [TUTORIAL] LM-Evaluation Harness with TGI
|
||||
|
||||
Evaluate LLMs 20x faster with TGI via litellm proxy's `/completions` endpoint.
|
||||
|
||||
This tutorial assumes you're using [lm-evaluation-harness](https://github.com/EleutherAI/lm-evaluation-harness)
|
||||
|
||||
**Step 1: Start the local proxy**
|
||||
```shell
|
||||
$ litellm --model huggingface/bigcode/starcoder
|
||||
```
|
||||
|
||||
OpenAI Compatible Endpoint at http://0.0.0.0:8000
|
||||
|
||||
**Step 2: Set OpenAI API Base**
|
||||
```shell
|
||||
$ export OPENAI_API_BASE="http://0.0.0.0:8000"
|
||||
```
|
||||
|
||||
**Step 3: Run LM-Eval-Harness**
|
||||
|
||||
```shell
|
||||
$ python3 main.py \
|
||||
--model gpt3 \
|
||||
--model_args engine=huggingface/bigcode/starcoder \
|
||||
--tasks hellaswag
|
||||
```
|
||||
|
||||
|
||||
## Endpoints:
|
||||
- `/chat/completions` - chat completions endpoint to call 100+ LLMs
|
||||
- `/embeddings` - embedding endpoint for Azure, OpenAI, Huggingface endpoints
|
||||
- `/models` - available models on server
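For example, once the server is running you can hit these endpoints directly with curl (the embedding model name below is just an example - use one your server is configured for):

```shell
# list the models available on the server
curl http://0.0.0.0:8000/models

# get embeddings (example model name - swap in one your server supports)
curl http://0.0.0.0:8000/embeddings \
  -H "Content-Type: application/json" \
  -d '{"model": "text-embedding-ada-002", "input": ["hello from litellm"]}'
```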
|
||||
|
||||
## Set Custom Prompt Templates
|
||||
|
||||
LiteLLM by default checks if a model has a [prompt template and applies it](./completion/prompt_formatting.md) (e.g. if a huggingface model has a saved chat template in its tokenizer_config.json). However, you can also set a custom prompt template on your proxy in the `config.yaml`:
|
||||
|
||||
**Step 1**: Save your prompt template in a `config.yaml`
|
||||
```yaml
|
||||
# Model-specific parameters
|
||||
model_list:
|
||||
- model_name: mistral-7b # model alias
|
||||
litellm_params: # actual params for litellm.completion()
|
||||
model: "huggingface/mistralai/Mistral-7B-Instruct-v0.1"
|
||||
api_base: "<your-api-base>"
|
||||
api_key: "<your-api-key>" # [OPTIONAL] for hf inference endpoints
|
||||
initial_prompt_value: "\n"
|
||||
roles: {"system":{"pre_message":"<|im_start|>system\n", "post_message":"<|im_end|>"}, "assistant":{"pre_message":"<|im_start|>assistant\n","post_message":"<|im_end|>"}, "user":{"pre_message":"<|im_start|>user\n","post_message":"<|im_end|>"}}
|
||||
final_prompt_value: "\n"
|
||||
bos_token: "<s>"
|
||||
eos_token: "</s>"
|
||||
max_tokens: 4096
|
||||
```
|
||||
|
||||
**Step 2**: Start server with config
|
||||
|
||||
```shell
|
||||
$ litellm --config /path/to/config.yaml
|
||||
```
|
||||
|
||||
## Multiple Models
|
||||
|
||||
If you have 1 model running on a local GPU and another that's hosted (e.g. on Runpod), you can call both via the same litellm server by listing them in your `config.yaml`.
|
||||
|
||||
```yaml
|
||||
model_list:
|
||||
- model_name: zephyr-alpha
|
||||
litellm_params: # params for litellm.completion() - https://docs.litellm.ai/docs/completion/input#input---request-body
|
||||
model: huggingface/HuggingFaceH4/zephyr-7b-alpha
|
||||
api_base: http://0.0.0.0:8001
|
||||
- model_name: zephyr-beta
|
||||
litellm_params:
|
||||
model: huggingface/HuggingFaceH4/zephyr-7b-beta
|
||||
api_base: https://<my-hosted-endpoint>
|
||||
```
|
||||
|
||||
```shell
|
||||
$ litellm --config /path/to/config.yaml
|
||||
```
|
||||
|
||||
### Evaluate model
|
||||
|
||||
If your repo lets you set the model name, you can call a specific model by passing in that model's name -
|
||||
|
||||
```python
|
||||
import openai
|
||||
openai.api_base = "http://0.0.0.0:8000"
|
||||
|
||||
completion = openai.ChatCompletion.create(model="zephyr-alpha", messages=[{"role": "user", "content": "Hello world"}])
|
||||
print(completion.choices[0].message.content)
|
||||
```
|
||||
|
||||
If your repo only lets you specify the api base, you can append the model name to the api base you pass in -
|
||||
|
||||
```python
|
||||
import openai
|
||||
openai.api_base = "http://0.0.0.0:8000/openai/deployments/zephyr-alpha/chat/completions" # zephyr-alpha will be used
|
||||
|
||||
completion = openai.ChatCompletion.create(model="gpt-3.5-turbo", messages=[{"role": "user", "content": "Hello world"}])
|
||||
print(completion.choices[0].message.content)
|
||||
```
|
||||
|
||||
## Save Model-specific params (API Base, API Keys, Temperature, etc.)
|
||||
Use the [router_config_template.yaml](https://github.com/BerriAI/litellm/blob/main/router_config_template.yaml) to save model-specific information like api_base, api_key, temperature, max_tokens, etc.
|
||||
|
||||
**Step 1**: Create a `config.yaml` file
|
||||
```yaml
|
||||
model_list:
|
||||
- model_name: gpt-3.5-turbo
|
||||
litellm_params: # params for litellm.completion() - https://docs.litellm.ai/docs/completion/input#input---request-body
|
||||
model: azure/chatgpt-v-2 # azure/<your-deployment-name>
|
||||
api_key: your_azure_api_key
|
||||
api_version: your_azure_api_version
|
||||
api_base: your_azure_api_base
|
||||
- model_name: mistral-7b
|
||||
litellm_params:
|
||||
model: ollama/mistral
|
||||
api_base: your_ollama_api_base
|
||||
```
|
||||
|
||||
**Step 2**: Start server with config
|
||||
|
||||
```shell
|
||||
$ litellm --config /path/to/config.yaml
|
||||
```
|
||||
## Model Alias
|
||||
|
||||
Set a model alias for your deployments.
|
||||
|
||||
In the `config.yaml` the model_name parameter is the user-facing name to use for your deployment.
|
||||
|
||||
E.g.: If we want to expose a Huggingface TGI Mistral-7b deployment as 'mistral-7b' to our users, we might save it as:
|
||||
|
||||
```yaml
|
||||
model_list:
|
||||
- model_name: mistral-7b # ALIAS
|
||||
litellm_params:
|
||||
model: huggingface/mistralai/Mistral-7B-Instruct-v0.1 # ACTUAL NAME
|
||||
api_key: your_huggingface_api_key # [OPTIONAL] if deployed on huggingface inference endpoints
|
||||
api_base: your_api_base # url where model is deployed
|
||||
```
|
||||
|
||||
## Caching
|
||||
|
||||
Add Redis Caching to your server via environment variables
|
||||
|
||||
```env
|
||||
### REDIS
|
||||
REDIS_HOST = ""
|
||||
REDIS_PORT = ""
|
||||
REDIS_PASSWORD = ""
|
||||
```
|
||||
|
||||
Docker command:
|
||||
|
||||
```shell
|
||||
docker run -e REDIS_HOST=<your-redis-host> -e REDIS_PORT=<your-redis-port> -e REDIS_PASSWORD=<your-redis-password> -e PORT=8000 -p 8000:8000 ghcr.io/berriai/litellm:latest
|
||||
```
|
||||
|
||||
## Logging
|
||||
|
||||
1. Debug Logs
|
||||
Print the input/output params by setting the environment variable `SET_VERBOSE="True"`.
|
||||
|
||||
Docker command:
|
||||
|
||||
```shell
|
||||
docker run -e SET_VERBOSE="True" -e PORT=8000 -p 8000:8000 ghcr.io/berriai/litellm:latest
|
||||
```
|
||||
2. Add Langfuse Logging to your server via environment variables
|
||||
|
||||
```env
|
||||
### LANGFUSE
|
||||
LANGFUSE_PUBLIC_KEY = ""
|
||||
LANGFUSE_SECRET_KEY = ""
|
||||
# Optional, defaults to https://cloud.langfuse.com
|
||||
LANGFUSE_HOST = "" # optional
|
||||
```
|
||||
|
||||
Docker command:
|
||||
|
||||
```shell
|
||||
docker run -e LANGFUSE_PUBLIC_KEY=<your-public-key> -e LANGFUSE_SECRET_KEY=<your-secret-key> -e LANGFUSE_HOST=<your-langfuse-host> -e PORT=8000 -p 8000:8000 ghcr.io/berriai/litellm:latest
|
||||
```
|
||||
|
||||
## Local Usage
|
||||
|
||||
|
@ -33,53 +347,6 @@ $ cd ./litellm/litellm_server
|
|||
$ uvicorn main:app --host 0.0.0.0 --port 8000
|
||||
```
|
||||
|
||||
### Test Request
|
||||
Ensure your API keys are set in the Environment for these requests
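For example (only the key(s) for the provider you're calling are needed - these are placeholder values):

```shell
export OPENAI_API_KEY=sk-...            # for the OpenAI example
export AZURE_API_KEY=...                # plus AZURE_API_BASE for the Azure example
export ANTHROPIC_API_KEY=sk-ant-...     # for the Anthropic example
```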
|
||||
|
||||
<Tabs>
|
||||
<TabItem value="openai" label="OpenAI">
|
||||
|
||||
```shell
|
||||
curl http://0.0.0.0:8000/v1/chat/completions \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{
|
||||
"model": "gpt-3.5-turbo",
|
||||
"messages": [{"role": "user", "content": "Say this is a test!"}],
|
||||
"temperature": 0.7
|
||||
}'
|
||||
```
|
||||
|
||||
</TabItem>
|
||||
<TabItem value="azure" label="Azure">
|
||||
|
||||
```shell
|
||||
curl http://0.0.0.0:8000/v1/chat/completions \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{
|
||||
"model": "azure/<your-deployment-name>",
|
||||
"messages": [{"role": "user", "content": "Say this is a test!"}],
|
||||
"temperature": 0.7
|
||||
}'
|
||||
```
|
||||
|
||||
</TabItem>
|
||||
|
||||
<TabItem value="anthropic" label="Anthropic">
|
||||
|
||||
```shell
|
||||
curl http://0.0.0.0:8000/v1/chat/completions \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{
|
||||
"model": "claude-2",
|
||||
"messages": [{"role": "user", "content": "Say this is a test!"}],
|
||||
"temperature": 0.7,
|
||||
}'
|
||||
```
|
||||
</TabItem>
|
||||
|
||||
</Tabs>
|
||||
|
||||
|
||||
## Setting LLM API keys
|
||||
This server allows two ways of passing API keys to litellm
|
||||
- Environment Variables - By default, this server assumes the LLM API keys are stored in environment variables
|
||||
|
@ -87,7 +354,11 @@ This server allows two ways of passing API keys to litellm
|
|||
- Set `AUTH_STRATEGY=DYNAMIC` in the Environment
|
||||
- Pass required auth params `api_key`,`api_base`, `api_version` with the request params
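For example, with `AUTH_STRATEGY=DYNAMIC` set, a request might pass the auth params alongside the usual fields (a sketch - the deployment name, api_base and api_version values below are placeholders):

```shell
curl http://0.0.0.0:8000/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
    "model": "azure/my-deployment-name",
    "messages": [{"role": "user", "content": "Hello"}],
    "api_key": "my-azure-key",
    "api_base": "https://my-endpoint.openai.azure.com",
    "api_version": "2023-07-01-preview"
  }'
```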
|
||||
|
||||
## Deploy on Google Cloud Run
|
||||
|
||||
<Tabs>
|
||||
<TabItem value="gcp-run" label="Google Cloud Run">
|
||||
|
||||
#### Deploy on Google Cloud Run
|
||||
**Click the button** to deploy to Google Cloud Run
|
||||
|
||||
[](https://l.linklyhq.com/l/1uHtX)
|
||||
|
@ -159,8 +430,10 @@ More info [here](https://cloud.google.com/run/docs/configuring/services/environm
|
|||
Example `OPENAI_API_KEY`, `ANTHROPIC_API_KEY`
|
||||
<Image img={require('../img/cloud_run3.png')} />
|
||||
|
||||
</TabItem>
|
||||
<TabItem value="render" label="Render">
|
||||
|
||||
## Deploy on Render
|
||||
#### Deploy on Render
|
||||
**Click the button** to deploy to Render
|
||||
|
||||
[](https://l.linklyhq.com/l/1uHsr)
|
||||
|
@ -169,8 +442,10 @@ On a successfull deploy https://dashboard.render.com/ should display the followi
|
|||
<Image img={require('../img/render1.png')} />
|
||||
|
||||
<Image img={require('../img/render2.png')} />
|
||||
</TabItem>
|
||||
<TabItem value="aws-apprunner" label="AWS Apprunner">
|
||||
|
||||
## Deploy on AWS Apprunner
|
||||
#### Deploy on AWS Apprunner
|
||||
1. Fork LiteLLM https://github.com/BerriAI/litellm
|
||||
2. Navigate to App Runner on the AWS Console: https://console.aws.amazon.com/apprunner/home#/services
|
||||
3. Follow the steps in the video below
|
||||
|
@ -225,6 +500,8 @@ On a successfull deploy https://dashboard.render.com/ should display the followi
|
|||
|
||||
</Tabs>
|
||||
|
||||
</TabItem>
|
||||
</Tabs>
|
||||
|
||||
## Advanced
|
||||
### Caching - Completion() and Embedding() Responses
|
||||
|
@ -287,3 +564,220 @@ Caching can be switched on/off per /chat/completions request
|
|||
|
||||
|
||||
|
||||
## Tutorials (Chat-UI, NeMO-Guardrails, PromptTools, Phoenix ArizeAI, Langchain, ragas, LlamaIndex, etc.)
|
||||
|
||||
**Start server:**
|
||||
```shell
|
||||
docker run -e PORT=8000 -p 8000:8000 ghcr.io/berriai/litellm:latest
|
||||
```
|
||||
The server is now live on http://0.0.0.0:8000
|
||||
|
||||
<Tabs>
|
||||
<TabItem value="chat-ui" label="Chat UI">
|
||||
|
||||
Here's the `docker-compose.yml` for running LiteLLM Server with Mckay Wrigley's Chat-UI:
|
||||
```yaml
|
||||
version: '3'
|
||||
services:
|
||||
container1:
|
||||
image: ghcr.io/berriai/litellm:latest
|
||||
ports:
|
||||
- '8000:8000'
|
||||
environment:
|
||||
- PORT=8000
|
||||
- OPENAI_API_KEY=<your-openai-key>
|
||||
|
||||
container2:
|
||||
image: ghcr.io/mckaywrigley/chatbot-ui:main
|
||||
ports:
|
||||
- '3000:3000'
|
||||
environment:
|
||||
- OPENAI_API_KEY=my-fake-key
|
||||
- OPENAI_API_HOST=http://container1:8000
|
||||
```
|
||||
|
||||
Run this via:
|
||||
```shell
|
||||
docker-compose up
|
||||
```
|
||||
</TabItem>
|
||||
<TabItem value="nemo-guardrails" label="NeMO-Guardrails">
|
||||
|
||||
#### Adding NeMO-Guardrails to Bedrock
|
||||
|
||||
1. Start server
|
||||
```shell
|
||||
docker run -e PORT=8000 -e AWS_ACCESS_KEY_ID=<your-aws-access-key> -e AWS_SECRET_ACCESS_KEY=<your-aws-secret-key> -p 8000:8000 ghcr.io/berriai/litellm:latest
|
||||
```
|
||||
|
||||
2. Install dependencies
|
||||
```shell
|
||||
pip install nemoguardrails langchain
|
||||
```
|
||||
|
||||
3. Run script
|
||||
```python
|
||||
import openai
|
||||
from langchain.chat_models import ChatOpenAI
|
||||
|
||||
llm = ChatOpenAI(model_name="bedrock/anthropic.claude-v2", openai_api_base="http://0.0.0.0:8000", openai_api_key="my-fake-key")
|
||||
|
||||
from nemoguardrails import LLMRails, RailsConfig
|
||||
|
||||
config = RailsConfig.from_path("./config.yml")
|
||||
app = LLMRails(config, llm=llm)
|
||||
|
||||
new_message = app.generate(messages=[{
|
||||
"role": "user",
|
||||
"content": "Hello! What can you do for me?"
|
||||
}])
|
||||
```
|
||||
</TabItem>
|
||||
<TabItem value="prompttools" label="PromptTools">
|
||||
|
||||
Use [PromptTools](https://github.com/hegelai/prompttools) for evaluating different LLMs
|
||||
|
||||
1. Start server
|
||||
```shell
|
||||
docker run -e PORT=8000 -p 8000:8000 ghcr.io/berriai/litellm:latest
|
||||
```
|
||||
|
||||
2. Install dependencies
|
||||
```shell
|
||||
pip install prompttools
|
||||
```
|
||||
|
||||
3. Run script
|
||||
```python
|
||||
import os
|
||||
os.environ['DEBUG']="" # Set this to "" to call OpenAI's API
|
||||
os.environ['AZURE_OPENAI_KEY'] = "my-api-key" # Insert your key here
|
||||
|
||||
from typing import Dict, List
|
||||
from prompttools.experiment import OpenAIChatExperiment
|
||||
|
||||
models = ["gpt-3.5-turbo", "gpt-3.5-turbo-0613"]
|
||||
messages = [
|
||||
[
|
||||
{"role": "system", "content": "You are a helpful assistant."},
|
||||
{"role": "user", "content": "Who was the first president?"},
|
||||
]
|
||||
]
|
||||
temperatures = [0.0, 1.0]
|
||||
# You can add more parameters that you'd like to test here.
|
||||
|
||||
experiment = OpenAIChatExperiment(models, messages, temperature=temperatures, azure_openai_service_configs={"AZURE_OPENAI_ENDPOINT": "http://0.0.0.0:8000", "API_TYPE": "azure", "API_VERSION": "2023-05-15"})
|
||||
```
|
||||
</TabItem>
|
||||
<TabItem value="phoenix-arizeai" label="ArizeAI">
|
||||
|
||||
Use [Arize AI's LLM Evals](https://github.com/Arize-ai/phoenix#llm-evals) to evaluate different LLMs
|
||||
|
||||
1. Start server
|
||||
```shell
|
||||
docker run -e PORT=8000 -p 8000:8000 ghcr.io/berriai/litellm:latest
|
||||
```
|
||||
|
||||
2. Use this LLM Evals Quickstart colab
|
||||
[](https://colab.research.google.com/github/Arize-ai/phoenix/blob/main/tutorials/evals/evaluate_relevance_classifications.ipynb)
|
||||
|
||||
3. Call the model
|
||||
```python
|
||||
import openai
|
||||
|
||||
## SET API BASE + PROVIDER KEY
|
||||
openai.api_base = "http://0.0.0.0:8000
|
||||
openai.api_key = "my-anthropic-key"
|
||||
|
||||
## CALL MODEL
|
||||
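# Note: OpenAIModel is assumed to come from Phoenix's evals module (import not shown here) - check the Phoenix docs for the exact import path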
model = OpenAIModel(
|
||||
model_name="claude-2",
|
||||
temperature=0.0,
|
||||
)
|
||||
```
|
||||
</TabItem>
|
||||
<TabItem value="langchain" label="Langchain">
|
||||
|
||||
```python
|
||||
from langchain.chat_models import ChatOpenAI
|
||||
from langchain.prompts.chat import (
|
||||
ChatPromptTemplate,
|
||||
SystemMessagePromptTemplate,
|
||||
AIMessagePromptTemplate,
|
||||
HumanMessagePromptTemplate,
|
||||
)
|
||||
from langchain.schema import AIMessage, HumanMessage, SystemMessage
|
||||
|
||||
chat = ChatOpenAI(model_name="claude-instant-1", openai_api_key="my-anthropic-key", openai_api_base="http://0.0.0.0:8000")
|
||||
|
||||
messages = [
|
||||
SystemMessage(
|
||||
content="You are a helpful assistant that translates English to French."
|
||||
),
|
||||
HumanMessage(
|
||||
content="Translate this sentence from English to French. I love programming."
|
||||
),
|
||||
]
|
||||
chat(messages)
|
||||
```
|
||||
</TabItem>
|
||||
<TabItem value="ragas" label="ragas">
|
||||
|
||||
#### Evaluating with Open-Source LLMs
|
||||
|
||||
Use [Ragas](https://github.com/explodinggradients/ragas/blob/7b123533df80d0ada33a2cb2dd2fdedf36807f33/docs/howtos/customisations/llms.ipynb#L247) to evaluate LLMs for RAG-scenarios.
|
||||
```python
|
||||
from langchain.chat_models import ChatOpenAI
|
||||
|
||||
inference_server_url = "http://localhost:8080/v1"
|
||||
|
||||
chat = ChatOpenAI(
|
||||
model="bedrock/anthropic.claude-v2",
|
||||
openai_api_key="no-key",
|
||||
openai_api_base=inference_server_url,
|
||||
max_tokens=5,
|
||||
temperature=0,
|
||||
)
|
||||
|
||||
from ragas.metrics import (
|
||||
context_precision,
|
||||
answer_relevancy,
|
||||
faithfulness,
|
||||
context_recall,
|
||||
)
|
||||
from ragas.metrics.critique import harmfulness
|
||||
|
||||
# change the LLM
|
||||
|
||||
faithfulness.llm.langchain_llm = chat
|
||||
answer_relevancy.llm.langchain_llm = chat
|
||||
context_precision.llm.langchain_llm = chat
|
||||
context_recall.llm.langchain_llm = chat
|
||||
harmfulness.llm.langchain_llm = chat
|
||||
|
||||
|
||||
# evaluate
|
||||
from ragas import evaluate
|
||||
|
||||
result = evaluate(
|
||||
fiqa_eval["baseline"].select(range(5)), # showing only 5 for demonstration
|
||||
metrics=[faithfulness],
|
||||
)
|
||||
|
||||
result
|
||||
```
|
||||
</TabItem>
|
||||
<TabItem value="llama_index" label="Llama Index">
|
||||
|
||||
```python
|
||||
!pip install llama-index
|
||||
```
|
||||
```python
|
||||
from llama_index.llms import OpenAI
|
||||
|
||||
response = OpenAI(model="claude-2", api_key="your-anthropic-key",api_base="http://0.0.0.0:8000").complete('Paul Graham is ')
|
||||
print(response)
|
||||
```
|
||||
</TabItem>
|
||||
</Tabs>
|
||||
|
||||
|
|
|
@ -2,13 +2,41 @@
|
|||
LiteLLM allows you to call `completion` with your fine-tuned gpt-3.5-turbo models
|
||||
If you're trying to create your own fine-tuned gpt-3.5-turbo model, follow along with this tutorial: https://platform.openai.com/docs/guides/fine-tuning/preparing-your-dataset
|
||||
|
||||
Once you've created your fine tuned model, you can call it with `completion()`
|
||||
Once you've created your fine tuned model, you can call it with `litellm.completion()`
|
||||
|
||||
## Usage
|
||||
```python
|
||||
import os
|
||||
from litellm import completion
|
||||
# set your OPENAI key in your .env as "OPENAI_API_KEY"
|
||||
|
||||
# LiteLLM reads from your .env
|
||||
os.environ["OPENAI_API_KEY"] = "your-api-key"
|
||||
|
||||
response = completion(
|
||||
model="ft:gpt-3.5-turbo:my-org:custom_suffix:id",
|
||||
messages=[
|
||||
{"role": "system", "content": "You are a helpful assistant."},
|
||||
{"role": "user", "content": "Hello!"}
|
||||
]
|
||||
)
|
||||
|
||||
print(response.choices[0].message)
|
||||
```
|
||||
|
||||
## Usage - Setting OpenAI Organization ID
|
||||
LiteLLM allows you to specify your OpenAI Organization when calling OpenAI LLMs. More details here:
|
||||
[setting Organization ID](https://docs.litellm.ai/docs/providers/openai#setting-organization-id-for-completion-calls)
|
||||
This can be set in one of the following ways:
|
||||
- Environment Variable `OPENAI_ORGANIZATION`
|
||||
- Params to `litellm.completion(model=model, organization="your-organization-id")`
|
||||
- Set as `litellm.organization="your-organization-id"`
|
||||
```python
|
||||
import os
|
||||
from litellm import completion
|
||||
|
||||
# LiteLLM reads from your .env
|
||||
os.environ["OPENAI_API_KEY"] = "your-api-key"
|
||||
os.environ["OPENAI_ORGANIZATION"] = "your-org-id" # Optional
|
||||
|
||||
response = completion(
|
||||
model="ft:gpt-3.5-turbo:my-org:custom_suffix:id",
|
||||
|
|
|
@ -184,6 +184,4 @@ This is what you should see:
|
|||
You've created your first LLM Playground - with the ability to call 50+ LLM APIs.
|
||||
|
||||
Next Steps:
|
||||
* [Check out the full list of LLM Providers you can now add](../completion/supported)
|
||||
* [Deploy your server using Render](https://render.com/docs/deploy-flask)
|
||||
* [Deploy your playground using Streamlit](https://docs.streamlit.io/streamlit-community-cloud/deploy-your-app)
|
||||
* [Check out the full list of LLM Providers you can now add](https://docs.litellm.ai/docs/providers)
|
50
docs/my-website/docs/tutorials/lm_evaluation_harness.md
Normal file
50
docs/my-website/docs/tutorials/lm_evaluation_harness.md
Normal file
|
@ -0,0 +1,50 @@
|
|||
import Tabs from '@theme/Tabs';
|
||||
import TabItem from '@theme/TabItem';
|
||||
|
||||
# LM-Evaluation Harness with TGI
|
||||
|
||||
Evaluate LLMs 20x faster with TGI via litellm proxy's `/completions` endpoint.
|
||||
|
||||
This tutorial assumes you're using the `big-refactor` branch of [lm-evaluation-harness](https://github.com/EleutherAI/lm-evaluation-harness/tree/big-refactor)
|
||||
|
||||
**Step 1: Start the local proxy**
|
||||
```shell
|
||||
$ litellm --model huggingface/bigcode/starcoder
|
||||
```
|
||||
|
||||
Using a custom api base
|
||||
|
||||
```shell
|
||||
$ export HUGGINGFACE_API_KEY=my-api-key #[OPTIONAL]
|
||||
$ litellm --model huggingface/tinyllama --api_base https://k58ory32yinf1ly0.us-east-1.aws.endpoints.huggingface.cloud
|
||||
```
|
||||
|
||||
OpenAI Compatible Endpoint at http://0.0.0.0:8000
|
||||
|
||||
**Step 2: Set OpenAI API Base & Key**
|
||||
```shell
|
||||
$ export OPENAI_API_BASE=http://0.0.0.0:8000
|
||||
```
|
||||
|
||||
LM Harness requires you to set an OpenAI API key `OPENAI_API_SECRET_KEY` for running benchmarks
|
||||
```shell
|
||||
export OPENAI_API_SECRET_KEY=anything
|
||||
```
|
||||
|
||||
**Step 3: Run LM-Eval-Harness**
|
||||
|
||||
```shell
|
||||
python3 -m lm_eval \
|
||||
--model openai-completions \
|
||||
--model_args engine=davinci \
|
||||
--task crows_pairs_english_age
|
||||
|
||||
```
|
||||
|
||||
## Debugging
|
||||
|
||||
### Making a test request to your proxy
|
||||
This command makes test Completion and ChatCompletion requests to your proxy server
|
||||
```shell
|
||||
litellm --test
|
||||
```
|
BIN
docs/my-website/img/sentry.png
Normal file
BIN
docs/my-website/img/sentry.png
Normal file
Binary file not shown.
After Width: | Height: | Size: 471 KiB |
|
@ -36,8 +36,6 @@ const sidebars = {
|
|||
"completion/message_trimming",
|
||||
"completion/function_call",
|
||||
"completion/model_alias",
|
||||
"completion/reliable_completions",
|
||||
"completion/multiple_deployments",
|
||||
"completion/config",
|
||||
"completion/batching",
|
||||
"completion/mock_requests",
|
||||
|
@ -97,10 +95,10 @@ const sidebars = {
|
|||
label: 'Tutorials',
|
||||
items: [
|
||||
'tutorials/azure_openai',
|
||||
'tutorials/ab_test_llms',
|
||||
'tutorials/oobabooga',
|
||||
"tutorials/gradio_integration",
|
||||
"tutorials/model_config_proxy",
|
||||
"tutorials/lm_evaluation_harness",
|
||||
'tutorials/huggingface_codellama',
|
||||
'tutorials/huggingface_tutorial',
|
||||
'tutorials/TogetherAI_liteLLM',
|
||||
|
|
|
@ -5,6 +5,7 @@ import TabItem from '@theme/TabItem';
|
|||
|
||||
https://github.com/BerriAI/litellm
|
||||
|
||||
|
||||
## **Call 100+ LLMs using the same Input/Output Format**
|
||||
|
||||
## Basic usage
|
||||
|
|
|
@ -22,7 +22,7 @@ litellm.success_callback=["posthog", "helicone", "llmonitor"]
|
|||
litellm.failure_callback=["sentry", "llmonitor"]
|
||||
|
||||
## set env variables
|
||||
os.environ['SENTRY_API_URL'], os.environ['SENTRY_API_TRACE_RATE']= ""
|
||||
os.environ['SENTRY_DSN'], os.environ['SENTRY_API_TRACE_RATE']= ""
|
||||
os.environ['POSTHOG_API_KEY'], os.environ['POSTHOG_API_URL'] = "api-key", "api-url"
|
||||
os.environ["HELICONE_API_KEY"] = ""
|
||||
|
||||
|
|
|
@ -6,6 +6,7 @@ from litellm.caching import Cache
|
|||
input_callback: List[Union[str, Callable]] = []
|
||||
success_callback: List[Union[str, Callable]] = []
|
||||
failure_callback: List[Union[str, Callable]] = []
|
||||
callbacks: List[Callable] = []
|
||||
set_verbose = False
|
||||
email: Optional[
|
||||
str
|
||||
|
@ -23,6 +24,7 @@ azure_key: Optional[str] = None
|
|||
anthropic_key: Optional[str] = None
|
||||
replicate_key: Optional[str] = None
|
||||
cohere_key: Optional[str] = None
|
||||
maritalk_key: Optional[str] = None
|
||||
ai21_key: Optional[str] = None
|
||||
openrouter_key: Optional[str] = None
|
||||
huggingface_key: Optional[str] = None
|
||||
|
@ -45,6 +47,8 @@ add_function_to_prompt: bool = False # if function calling not supported by api,
|
|||
client_session: Optional[requests.Session] = None
|
||||
model_fallbacks: Optional[List] = None
|
||||
model_cost_map_url: str = "https://raw.githubusercontent.com/BerriAI/litellm/main/model_prices_and_context_window.json"
|
||||
num_retries: Optional[int] = None
|
||||
suppress_debug_info = False
|
||||
#############################################
|
||||
|
||||
def get_model_cost_map(url: str):
|
||||
|
@ -218,6 +222,10 @@ ollama_models = [
|
|||
"llama2"
|
||||
]
|
||||
|
||||
maritalk_models = [
|
||||
"maritalk"
|
||||
]
|
||||
|
||||
model_list = (
|
||||
open_ai_chat_completion_models
|
||||
+ open_ai_text_completion_models
|
||||
|
@ -237,6 +245,7 @@ model_list = (
|
|||
+ bedrock_models
|
||||
+ deepinfra_models
|
||||
+ perplexity_models
|
||||
+ maritalk_models
|
||||
)
|
||||
|
||||
provider_list: List = [
|
||||
|
@ -263,6 +272,7 @@ provider_list: List = [
|
|||
"deepinfra",
|
||||
"perplexity",
|
||||
"anyscale",
|
||||
"maritalk",
|
||||
"custom", # custom apis
|
||||
]
|
||||
|
||||
|
@ -282,6 +292,7 @@ models_by_provider: dict = {
|
|||
"ollama": ollama_models,
|
||||
"deepinfra": deepinfra_models,
|
||||
"perplexity": perplexity_models,
|
||||
"maritalk": maritalk_models
|
||||
}
|
||||
|
||||
# mapping for those models which have larger equivalents
|
||||
|
@ -308,7 +319,15 @@ longer_context_model_fallback_dict: dict = {
|
|||
|
||||
####### EMBEDDING MODELS ###################
|
||||
open_ai_embedding_models: List = ["text-embedding-ada-002"]
|
||||
cohere_embedding_models: List = ["embed-english-v2.0", "embed-english-light-v2.0", "embed-multilingual-v2.0"]
|
||||
cohere_embedding_models: List = [
|
||||
"embed-english-v3.0",
|
||||
"embed-english-light-v3.0",
|
||||
"embed-multilingual-v3.0",
|
||||
"embed-english-v2.0",
|
||||
"embed-english-light-v2.0",
|
||||
"embed-multilingual-v2.0",
|
||||
]
|
||||
bedrock_embedding_models: List = ["amazon.titan-embed-text-v1"]
|
||||
|
||||
from .timeout import timeout
|
||||
from .testing import *
|
||||
|
@ -324,7 +343,6 @@ from .utils import (
|
|||
Logging,
|
||||
acreate,
|
||||
get_model_list,
|
||||
completion_with_split_tests,
|
||||
get_max_tokens,
|
||||
register_prompt_template,
|
||||
validate_environment,
|
||||
|
@ -348,6 +366,7 @@ from .llms.petals import PetalsConfig
|
|||
from .llms.vertex_ai import VertexAIConfig
|
||||
from .llms.sagemaker import SagemakerConfig
|
||||
from .llms.ollama import OllamaConfig
|
||||
from .llms.maritalk import MaritTalkConfig
|
||||
from .llms.bedrock import AmazonTitanConfig, AmazonAI21Config, AmazonAnthropicConfig, AmazonCohereConfig
|
||||
from .llms.openai import OpenAIConfig, OpenAITextCompletionConfig, AzureOpenAIConfig
|
||||
from .main import * # type: ignore
|
||||
|
@ -359,10 +378,9 @@ from .exceptions import (
|
|||
ServiceUnavailableError,
|
||||
OpenAIError,
|
||||
ContextWindowExceededError,
|
||||
BudgetExceededError
|
||||
|
||||
BudgetExceededError,
|
||||
)
|
||||
from .budget_manager import BudgetManager
|
||||
from .proxy.proxy_cli import run_server
|
||||
from .router import Router
|
||||
|
||||
from .proxy.proxy_server import app
|
||||
|
|
|
@ -14,7 +14,8 @@ class BudgetManager:
|
|||
|
||||
def print_verbose(self, print_statement):
|
||||
if litellm.set_verbose:
|
||||
print(print_statement)
|
||||
import logging
|
||||
logging.info(print_statement)
|
||||
|
||||
def load_data(self):
|
||||
if self.client_type == "local":
|
||||
|
@ -149,8 +150,6 @@ class BudgetManager:
|
|||
'project_name' : self.project_name,
|
||||
"user_dict": self.user_dict
|
||||
}
|
||||
print(f"data: {data}")
|
||||
response = requests.post(url, headers=headers, json=data)
|
||||
print(f"response: {response.text}")
|
||||
response = response.json()
|
||||
return response
|
|
@ -8,8 +8,9 @@
|
|||
# Thank you users! We ❤️ you! - Krrish & Ishaan
|
||||
|
||||
import litellm
|
||||
import time
|
||||
import json
|
||||
import time, logging
|
||||
import json, traceback
|
||||
|
||||
|
||||
def get_prompt(*args, **kwargs):
|
||||
# make this safe checks, it should not throw any exceptions
|
||||
|
@ -23,81 +24,105 @@ def get_prompt(*args, **kwargs):
|
|||
return prompt
|
||||
return None
|
||||
|
||||
class RedisCache():
|
||||
|
||||
class BaseCache:
|
||||
def set_cache(self, key, value, **kwargs):
|
||||
raise NotImplementedError
|
||||
|
||||
def get_cache(self, key, **kwargs):
|
||||
raise NotImplementedError
|
||||
|
||||
|
||||
class RedisCache(BaseCache):
|
||||
def __init__(self, host, port, password):
|
||||
import redis
|
||||
# if users don't provider one, use the default litellm cache
|
||||
self.redis_client = redis.Redis(host=host, port=port, password=password)
|
||||
|
||||
def set_cache(self, key, value):
|
||||
def set_cache(self, key, value, **kwargs):
|
||||
ttl = kwargs.get("ttl", None)
|
||||
try:
|
||||
self.redis_client.set(key, str(value))
|
||||
self.redis_client.set(name=key, value=str(value), ex=ttl)
|
||||
except Exception as e:
|
||||
# NON blocking - notify users Redis is throwing an exception
|
||||
print("LiteLLM Caching: Got exception from REDIS: ", e)
|
||||
logging.debug("LiteLLM Caching: set() - Got exception from REDIS : ", e)
|
||||
|
||||
def get_cache(self, key):
|
||||
def get_cache(self, key, **kwargs):
|
||||
try:
|
||||
# TODO convert this to a ModelResponse object
|
||||
cached_response = self.redis_client.get(key)
|
||||
if cached_response!=None:
|
||||
if cached_response != None:
|
||||
# cached_response is in `b{} convert it to ModelResponse
|
||||
cached_response = cached_response.decode("utf-8") # Convert bytes to string
|
||||
cached_response = json.loads(cached_response) # Convert string to dictionary
|
||||
cached_response['cache'] = True # set cache-hit flag to True
|
||||
cached_response['cache'] = True # set cache-hit flag to True
|
||||
return cached_response
|
||||
except Exception as e:
|
||||
# NON blocking - notify users Redis is throwing an exception
|
||||
print("LiteLLM Caching: Got exception from REDIS: ", e)
|
||||
traceback.print_exc()
|
||||
logging.debug("LiteLLM Caching: get() - Got exception from REDIS: ", e)
|
||||
|
||||
class HostedCache():
|
||||
def set_cache(self, key, value):
|
||||
|
||||
class HostedCache(BaseCache):
|
||||
def set_cache(self, key, value, **kwargs):
|
||||
if "ttl" in kwargs:
|
||||
logging.debug("LiteLLM Caching: TTL is not supported for hosted cache!")
|
||||
# make a post request to api.litellm.ai/set_cache
|
||||
import requests
|
||||
url = f"https://api.litellm.ai/set_cache?key={key}&value={str(value)}"
|
||||
requests.request("POST", url) # post request to set this in the hosted litellm cache
|
||||
requests.request("POST", url) # post request to set this in the hosted litellm cache
|
||||
|
||||
def get_cache(self, key):
|
||||
def get_cache(self, key, **kwargs):
|
||||
import requests
|
||||
url = f"https://api.litellm.ai/get_cache?key={key}"
|
||||
cached_response = requests.request("GET", url)
|
||||
cached_response = cached_response.text
|
||||
if cached_response == "NONE": # api.litellm.ai returns "NONE" if it's not a cache hit
|
||||
return None
|
||||
if cached_response!=None:
|
||||
if cached_response == "NONE": # api.litellm.ai returns "NONE" if it's not a cache hit
|
||||
return None
|
||||
if cached_response != None:
|
||||
try:
|
||||
cached_response = json.loads(cached_response) # Convert string to dictionary
|
||||
cached_response['cache'] = True # set cache-hit flag to True
|
||||
cached_response['cache'] = True # set cache-hit flag to True
|
||||
return cached_response
|
||||
except:
|
||||
return cached_response
|
||||
|
||||
class InMemoryCache():
|
||||
|
||||
class InMemoryCache(BaseCache):
|
||||
def __init__(self):
|
||||
# if users don't provider one, use the default litellm cache
|
||||
self.cache_dict = {}
|
||||
self.ttl_dict = {}
|
||||
|
||||
def set_cache(self, key, value):
|
||||
#print("in set cache for inmem")
|
||||
def set_cache(self, key, value, **kwargs):
|
||||
self.cache_dict[key] = value
|
||||
#print(self.cache_dict)
|
||||
if "ttl" in kwargs:
|
||||
self.ttl_dict[key] = time.time() + kwargs["ttl"]
|
||||
|
||||
def get_cache(self, key):
|
||||
#print("in get cache for inmem")
|
||||
def get_cache(self, key, **kwargs):
|
||||
if key in self.cache_dict:
|
||||
#print("got a cache hit")
|
||||
return self.cache_dict[key]
|
||||
#print("got a cache miss")
|
||||
if key in self.ttl_dict:
|
||||
if time.time() > self.ttl_dict[key]:
|
||||
self.cache_dict.pop(key, None)
|
||||
return None
|
||||
original_cached_response = self.cache_dict[key]
|
||||
try:
|
||||
cached_response = json.loads(original_cached_response)
|
||||
except:
|
||||
cached_response = original_cached_response
|
||||
cached_response['cache'] = True # set cache-hit flag to True
|
||||
return cached_response
|
||||
return None
|
||||
|
||||
class Cache():
|
||||
|
||||
class Cache:
|
||||
def __init__(
|
||||
self,
|
||||
type = "local",
|
||||
host = None,
|
||||
port = None,
|
||||
password = None
|
||||
):
|
||||
self,
|
||||
type="local",
|
||||
host=None,
|
||||
port=None,
|
||||
password=None
|
||||
):
|
||||
"""
|
||||
Initializes the cache based on the given type.
|
||||
|
||||
|
@ -151,9 +176,9 @@ class Cache():
|
|||
def generate_streaming_content(self, content):
|
||||
chunk_size = 5 # Adjust the chunk size as needed
|
||||
for i in range(0, len(content), chunk_size):
|
||||
yield {'choices': [{'delta': {'role': 'assistant', 'content': content[i:i+chunk_size]}}]}
|
||||
yield {'choices': [{'delta': {'role': 'assistant', 'content': content[i:i + chunk_size]}}]}
|
||||
time.sleep(0.02)
|
||||
|
||||
|
||||
def get_cache(self, *args, **kwargs):
|
||||
"""
|
||||
Retrieves the cached result for the given arguments.
|
||||
|
@ -166,19 +191,18 @@ class Cache():
|
|||
The cached result if it exists, otherwise None.
|
||||
"""
|
||||
try: # never block execution
|
||||
if "cache_key" in kwargs:
|
||||
if "cache_key" in kwargs:
|
||||
cache_key = kwargs["cache_key"]
|
||||
else:
|
||||
else:
|
||||
cache_key = self.get_cache_key(*args, **kwargs)
|
||||
if cache_key is not None:
|
||||
cached_result = self.cache.get_cache(cache_key)
|
||||
if cached_result != None and 'stream' in kwargs and kwargs['stream'] == True:
|
||||
# if streaming is true and we got a cache hit, return a generator
|
||||
#print("cache hit and stream=True")
|
||||
#print(cached_result)
|
||||
return self.generate_streaming_content(cached_result["choices"][0]['message']['content'])
|
||||
return cached_result
|
||||
except:
|
||||
except Exception as e:
|
||||
logging.debug(f"An exception occurred: {traceback.format_exc()}")
|
||||
return None
|
||||
|
||||
def add_cache(self, result, *args, **kwargs):
|
||||
|
@ -193,20 +217,11 @@ class Cache():
|
|||
None
|
||||
"""
|
||||
try:
|
||||
if "cache_key" in kwargs:
|
||||
if "cache_key" in kwargs:
|
||||
cache_key = kwargs["cache_key"]
|
||||
else:
|
||||
else:
|
||||
cache_key = self.get_cache_key(*args, **kwargs)
|
||||
# print("adding to cache", cache_key, result)
|
||||
# print(cache_key)
|
||||
if cache_key is not None:
|
||||
# print("adding to cache", cache_key, result)
|
||||
self.cache.set_cache(cache_key, result)
|
||||
self.cache.set_cache(cache_key, result, **kwargs)
|
||||
except:
|
||||
pass
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
|
|
@ -1,4 +1,5 @@
|
|||
###### LiteLLM Integration with GPT Cache #########
|
||||
# will be deprecated soon https://github.com/BerriAI/litellm/discussions/648#discussioncomment-7461510
|
||||
import gptcache
|
||||
|
||||
# openai.ChatCompletion._llm_handler = litellm.completion
|
||||
|
|
|
@ -12,7 +12,25 @@ class CustomLogger:
|
|||
# Class variables or attributes
|
||||
def __init__(self):
|
||||
pass
|
||||
|
||||
def log_pre_api_call(self, model, messages, kwargs):
|
||||
pass
|
||||
|
||||
def log_post_api_call(self, kwargs, response_obj, start_time, end_time):
|
||||
pass
|
||||
|
||||
def log_stream_event(self, kwargs, response_obj, start_time, end_time):
|
||||
pass
|
||||
|
||||
def log_success_event(self, kwargs, response_obj, start_time, end_time):
|
||||
pass
|
||||
|
||||
def log_failure_event(self, kwargs, response_obj, start_time, end_time):
|
||||
pass
|
||||
|
||||
|
||||
#### DEPRECATED ####
|
||||
|
||||
def log_input_event(self, model, messages, kwargs, print_verbose, callback_func):
|
||||
try:
|
||||
print_verbose(
|
||||
|
|
|
@ -13,8 +13,8 @@ class LangFuseLogger:
|
|||
def __init__(self):
|
||||
try:
|
||||
from langfuse import Langfuse
|
||||
except:
|
||||
raise Exception("\033[91mLangfuse not installed, try running 'pip install langfuse' to fix this error\033[0m")
|
||||
except Exception as e:
|
||||
raise Exception("\033[91mLangfuse not installed, try running 'pip install langfuse' to fix this error\033[0m", e)
|
||||
# Instance variables
|
||||
self.secret_key = os.getenv("LANGFUSE_SECRET_KEY")
|
||||
self.public_key = os.getenv("LANGFUSE_PUBLIC_KEY")
|
||||
|
@ -36,10 +36,6 @@ class LangFuseLogger:
|
|||
print_verbose(
|
||||
f"Langfuse Logging - Enters logging function for model {kwargs}"
|
||||
)
|
||||
# print(response_obj)
|
||||
# print(response_obj['choices'][0]['message']['content'])
|
||||
# print(response_obj['usage']['prompt_tokens'])
|
||||
# print(response_obj['usage']['completion_tokens'])
|
||||
metadata = kwargs.get("metadata", {})
|
||||
prompt = [kwargs['messages']]
|
||||
|
||||
|
|
|
@ -17,18 +17,25 @@ class PromptLayerLogger:
|
|||
def log_event(self, kwargs, response_obj, start_time, end_time, print_verbose):
|
||||
# Method definition
|
||||
try:
|
||||
if 'litellm_logging_obj' in kwargs:
|
||||
kwargs.pop('litellm_logging_obj')
|
||||
new_kwargs = {}
|
||||
new_kwargs['model'] = kwargs['model']
|
||||
new_kwargs['messages'] = kwargs['messages']
|
||||
|
||||
# add kwargs["optional_params"] to new_kwargs
|
||||
for optional_param in kwargs["optional_params"]:
|
||||
new_kwargs[optional_param] = kwargs["optional_params"][optional_param]
|
||||
|
||||
|
||||
print_verbose(
|
||||
f"Prompt Layer Logging - Enters logging function for model kwargs: {kwargs}\n, response: {response_obj}"
|
||||
f"Prompt Layer Logging - Enters logging function for model kwargs: {new_kwargs}\n, response: {response_obj}"
|
||||
)
|
||||
|
||||
|
||||
request_response = requests.post(
|
||||
"https://api.promptlayer.com/rest/track-request",
|
||||
json={
|
||||
"function_name": "openai.ChatCompletion.create",
|
||||
"kwargs": kwargs,
|
||||
"kwargs": new_kwargs,
|
||||
"tags": ["hello", "world"],
|
||||
"request_response": dict(response_obj),
|
||||
"request_start_time": int(start_time.timestamp()),
|
||||
|
|
|
@ -262,11 +262,9 @@ def completion(
|
|||
|
||||
model_response["created"] = time.time()
|
||||
model_response["model"] = model
|
||||
model_response["usage"] = {
|
||||
"prompt_tokens": prompt_tokens,
|
||||
"completion_tokens": completion_tokens,
|
||||
"total_tokens": prompt_tokens + completion_tokens,
|
||||
}
|
||||
model_response.usage.completion_tokens = completion_tokens
|
||||
model_response.usage.prompt_tokens = prompt_tokens
|
||||
model_response.usage.total_tokens = prompt_tokens + completion_tokens
|
||||
return model_response
|
||||
|
||||
def embedding():
|
||||
|
|
|
@ -164,11 +164,9 @@ def completion(
|
|||
|
||||
model_response["created"] = time.time()
|
||||
model_response["model"] = model
|
||||
model_response["usage"] = {
|
||||
"prompt_tokens": prompt_tokens,
|
||||
"completion_tokens": completion_tokens,
|
||||
"total_tokens": prompt_tokens + completion_tokens,
|
||||
}
|
||||
model_response.usage.completion_tokens = completion_tokens
|
||||
model_response.usage.prompt_tokens = prompt_tokens
|
||||
model_response.usage.total_tokens = prompt_tokens + completion_tokens
|
||||
return model_response
|
||||
|
||||
def embedding():
|
||||
|
|
|
@ -136,11 +136,9 @@ def completion(
|
|||
|
||||
model_response["created"] = time.time()
|
||||
model_response["model"] = model
|
||||
model_response["usage"] = {
|
||||
"prompt_tokens": prompt_tokens,
|
||||
"completion_tokens": completion_tokens,
|
||||
"total_tokens": prompt_tokens + completion_tokens,
|
||||
}
|
||||
model_response.usage.completion_tokens = completion_tokens
|
||||
model_response.usage.prompt_tokens = prompt_tokens
|
||||
model_response.usage.total_tokens = prompt_tokens + completion_tokens
|
||||
return model_response
|
||||
|
||||
def embedding():
|
||||
|
|
|
@ -1,4 +1,5 @@
|
|||
import json, copy, types
|
||||
import os
|
||||
from enum import Enum
|
||||
import time
|
||||
from typing import Callable, Optional
|
||||
|
@ -174,8 +175,32 @@ def init_bedrock_client(
|
|||
aws_access_key_id = None,
|
||||
aws_secret_access_key = None,
|
||||
aws_region_name=None,
|
||||
aws_bedrock_runtime_endpoint=None,
|
||||
):
|
||||
|
||||
# check for custom AWS_REGION_NAME and use it if not passed to init_bedrock_client
|
||||
litellm_aws_region_name = get_secret("AWS_REGION_NAME")
|
||||
standard_aws_region_name = get_secret("AWS_REGION")
|
||||
if region_name:
|
||||
pass
|
||||
elif aws_region_name:
|
||||
region_name = aws_region_name
|
||||
elif litellm_aws_region_name:
|
||||
region_name = litellm_aws_region_name
|
||||
elif standard_aws_region_name:
|
||||
region_name = standard_aws_region_name
|
||||
else:
|
||||
raise BedrockError(message="AWS region not set: set AWS_REGION_NAME or AWS_REGION env variable or in .env file", status_code=401)
|
||||
|
||||
# check for custom AWS_BEDROCK_RUNTIME_ENDPOINT and use it if not passed to init_bedrock_client
|
||||
env_aws_bedrock_runtime_endpoint = get_secret("AWS_BEDROCK_RUNTIME_ENDPOINT")
|
||||
if aws_bedrock_runtime_endpoint:
|
||||
endpoint_url = aws_bedrock_runtime_endpoint
|
||||
elif env_aws_bedrock_runtime_endpoint:
|
||||
endpoint_url = env_aws_bedrock_runtime_endpoint
|
||||
else:
|
||||
endpoint_url = f'https://bedrock-runtime.{region_name}.amazonaws.com'
|
||||
|
||||
import boto3
|
||||
if aws_access_key_id != None:
|
||||
# uses auth params passed to completion
|
||||
|
@ -185,23 +210,17 @@ def init_bedrock_client(
|
|||
service_name="bedrock-runtime",
|
||||
aws_access_key_id=aws_access_key_id,
|
||||
aws_secret_access_key=aws_secret_access_key,
|
||||
region_name=aws_region_name,
|
||||
endpoint_url=f'https://bedrock-runtime.{aws_region_name}.amazonaws.com'
|
||||
region_name=region_name,
|
||||
endpoint_url=endpoint_url,
|
||||
)
|
||||
else:
|
||||
# aws_access_key_id is None, assume user is trying to auth using env variables
|
||||
# boto3 automaticaly reads env variables
|
||||
# boto3 automatically reads env variables
|
||||
|
||||
# we need to read region name from env
|
||||
# I assume majority of users use .env for auth
|
||||
region_name = (
|
||||
get_secret("AWS_REGION_NAME") or
|
||||
"us-west-2" # default to us-west-2 if user not specified
|
||||
)
|
||||
client = boto3.client(
|
||||
service_name="bedrock-runtime",
|
||||
region_name=region_name,
|
||||
endpoint_url=f'https://bedrock-runtime.{region_name}.amazonaws.com'
|
||||
endpoint_url=endpoint_url,
|
||||
)
|
||||
|
||||
return client
|
||||
|
@ -259,6 +278,174 @@ def completion(
|
|||
litellm_params=None,
|
||||
logger_fn=None,
|
||||
):
|
||||
exception_mapping_worked = False
|
||||
try:
|
||||
# pop aws_secret_access_key, aws_access_key_id, aws_region_name from kwargs, since completion calls fail with them
|
||||
aws_secret_access_key = optional_params.pop("aws_secret_access_key", None)
|
||||
aws_access_key_id = optional_params.pop("aws_access_key_id", None)
|
||||
aws_region_name = optional_params.pop("aws_region_name", None)
|
||||
|
||||
# use passed in BedrockRuntime.Client if provided, otherwise create a new one
|
||||
client = optional_params.pop(
|
||||
"aws_bedrock_client",
|
||||
# only pass variables that are not None
|
||||
init_bedrock_client(
|
||||
aws_access_key_id=aws_access_key_id,
|
||||
aws_secret_access_key=aws_secret_access_key,
|
||||
aws_region_name=aws_region_name,
|
||||
),
|
||||
)
|
||||
|
||||
model = model
|
||||
provider = model.split(".")[0]
|
||||
prompt = convert_messages_to_prompt(model, messages, provider, custom_prompt_dict)
|
||||
inference_params = copy.deepcopy(optional_params)
|
||||
stream = inference_params.pop("stream", False)
|
||||
if provider == "anthropic":
|
||||
## LOAD CONFIG
|
||||
config = litellm.AmazonAnthropicConfig.get_config()
|
||||
for k, v in config.items():
|
||||
if k not in inference_params: # completion(top_k=3) > anthropic_config(top_k=3) <- allows for dynamic variables to be passed in
|
||||
inference_params[k] = v
|
||||
data = json.dumps({
|
||||
"prompt": prompt,
|
||||
**inference_params
|
||||
})
|
||||
elif provider == "ai21":
|
||||
## LOAD CONFIG
|
||||
config = litellm.AmazonAI21Config.get_config()
|
||||
for k, v in config.items():
|
||||
if k not in inference_params: # completion(top_k=3) > anthropic_config(top_k=3) <- allows for dynamic variables to be passed in
|
||||
inference_params[k] = v
|
||||
|
||||
data = json.dumps({
|
||||
"prompt": prompt,
|
||||
**inference_params
|
||||
})
|
||||
elif provider == "cohere":
|
||||
## LOAD CONFIG
|
||||
config = litellm.AmazonCohereConfig.get_config()
|
||||
for k, v in config.items():
|
||||
if k not in inference_params: # completion(top_k=3) > anthropic_config(top_k=3) <- allows for dynamic variables to be passed in
|
||||
inference_params[k] = v
|
||||
if optional_params.get("stream", False) == True:
|
||||
inference_params["stream"] = True # cohere requires stream = True in inference params
|
||||
data = json.dumps({
|
||||
"prompt": prompt,
|
||||
**inference_params
|
||||
})
|
||||
elif provider == "amazon": # amazon titan
|
||||
## LOAD CONFIG
|
||||
config = litellm.AmazonTitanConfig.get_config()
|
||||
for k, v in config.items():
|
||||
if k not in inference_params: # completion(top_k=3) > amazon_config(top_k=3) <- allows for dynamic variables to be passed in
|
||||
inference_params[k] = v
|
||||
|
||||
data = json.dumps({
|
||||
"inputText": prompt,
|
||||
"textGenerationConfig": inference_params,
|
||||
})
|
||||
|
||||
## LOGGING
|
||||
logging_obj.pre_call(
|
||||
input=prompt,
|
||||
api_key="",
|
||||
additional_args={"complete_input_dict": data},
|
||||
)
|
||||
|
||||
## COMPLETION CALL
|
||||
accept = 'application/json'
|
||||
contentType = 'application/json'
|
||||
if stream == True:
|
||||
response = client.invoke_model_with_response_stream(
|
||||
body=data,
|
||||
modelId=model,
|
||||
accept=accept,
|
||||
contentType=contentType
|
||||
)
|
||||
response = response.get('body')
|
||||
return response
|
||||
|
||||
try:
|
||||
response = client.invoke_model(
|
||||
body=data,
|
||||
modelId=model,
|
||||
accept=accept,
|
||||
contentType=contentType
|
||||
)
|
||||
except Exception as e:
|
||||
raise BedrockError(status_code=500, message=str(e))
|
||||
|
||||
response_body = json.loads(response.get('body').read())
|
||||
|
||||
## LOGGING
|
||||
logging_obj.post_call(
|
||||
input=prompt,
|
||||
api_key="",
|
||||
original_response=response_body,
|
||||
additional_args={"complete_input_dict": data},
|
||||
)
|
||||
print_verbose(f"raw model_response: {response}")
|
||||
## RESPONSE OBJECT
|
||||
outputText = "default"
|
||||
if provider == "ai21":
|
||||
outputText = response_body.get('completions')[0].get('data').get('text')
|
||||
elif provider == "anthropic":
|
||||
outputText = response_body['completion']
|
||||
model_response["finish_reason"] = response_body["stop_reason"]
|
||||
elif provider == "cohere":
|
||||
outputText = response_body["generations"][0]["text"]
|
||||
else: # amazon titan
|
||||
outputText = response_body.get('results')[0].get('outputText')
|
||||
|
||||
response_metadata = response.get("ResponseMetadata", {})
|
||||
if response_metadata.get("HTTPStatusCode", 500) >= 400:
|
||||
raise BedrockError(
|
||||
message=outputText,
|
||||
status_code=response_metadata.get("HTTPStatusCode", 500),
|
||||
)
|
||||
else:
|
||||
try:
|
||||
if len(outputText) > 0:
|
||||
model_response["choices"][0]["message"]["content"] = outputText
|
||||
except:
|
||||
raise BedrockError(message=json.dumps(outputText), status_code=response_metadata.get("HTTPStatusCode", 500))
|
||||
|
||||
## CALCULATING USAGE - baseten charges on time, not tokens - have some mapping of cost here.
|
||||
prompt_tokens = len(
|
||||
encoding.encode(prompt)
|
||||
)
|
||||
completion_tokens = len(
|
||||
encoding.encode(model_response["choices"][0]["message"].get("content", ""))
|
||||
)
|
||||
|
||||
model_response["created"] = time.time()
|
||||
model_response["model"] = model
|
||||
model_response.usage.completion_tokens = completion_tokens
|
||||
model_response.usage.prompt_tokens = prompt_tokens
|
||||
model_response.usage.total_tokens = prompt_tokens + completion_tokens
|
||||
return model_response
|
||||
except BedrockError as e:
|
||||
exception_mapping_worked = True
|
||||
raise e
|
||||
except Exception as e:
|
||||
if exception_mapping_worked:
|
||||
raise e
|
||||
else:
|
||||
import traceback
|
||||
raise BedrockError(status_code=500, message=traceback.format_exc())
|
||||
|
||||
|
||||
|
||||
def embedding(
|
||||
model: str,
|
||||
input: list,
|
||||
logging_obj=None,
|
||||
model_response=None,
|
||||
optional_params=None,
|
||||
encoding=None,
|
||||
):
|
||||
# logic for parsing in - calling - parsing out model embedding calls
|
||||
# pop aws_secret_access_key, aws_access_key_id, aws_region_name from kwargs, since completion calls fail with them
|
||||
aws_secret_access_key = optional_params.pop("aws_secret_access_key", None)
|
||||
aws_access_key_id = optional_params.pop("aws_access_key_id", None)
|
||||
|
@ -274,132 +461,39 @@ def completion(
|
|||
aws_region_name=aws_region_name,
|
||||
),
|
||||
)
|
||||
|
||||
model = model
|
||||
provider = model.split(".")[0]
|
||||
prompt = convert_messages_to_prompt(model, messages, provider, custom_prompt_dict)
|
||||
inference_params = copy.deepcopy(optional_params)
|
||||
stream = inference_params.pop("stream", False)
|
||||
if provider == "anthropic":
|
||||
## LOAD CONFIG
|
||||
config = litellm.AmazonAnthropicConfig.get_config()
|
||||
for k, v in config.items():
|
||||
if k not in inference_params: # completion(top_k=3) > anthropic_config(top_k=3) <- allows for dynamic variables to be passed in
|
||||
inference_params[k] = v
|
||||
data = json.dumps({
|
||||
"prompt": prompt,
|
||||
**inference_params
|
||||
})
|
||||
elif provider == "ai21":
|
||||
## LOAD CONFIG
|
||||
config = litellm.AmazonAI21Config.get_config()
|
||||
for k, v in config.items():
|
||||
if k not in inference_params: # completion(top_k=3) > anthropic_config(top_k=3) <- allows for dynamic variables to be passed in
|
||||
inference_params[k] = v
|
||||
|
||||
data = json.dumps({
|
||||
"prompt": prompt,
|
||||
**inference_params
|
||||
})
|
||||
elif provider == "cohere":
|
||||
## LOAD CONFIG
|
||||
config = litellm.AmazonCohereConfig.get_config()
|
||||
for k, v in config.items():
|
||||
if k not in inference_params: # completion(top_k=3) > anthropic_config(top_k=3) <- allows for dynamic variables to be passed in
|
||||
inference_params[k] = v
|
||||
data = json.dumps({
|
||||
"prompt": prompt,
|
||||
**inference_params
|
||||
})
|
||||
elif provider == "amazon": # amazon titan
|
||||
## LOAD CONFIG
|
||||
config = litellm.AmazonTitanConfig.get_config()
|
||||
for k, v in config.items():
|
||||
if k not in inference_params: # completion(top_k=3) > amazon_config(top_k=3) <- allows for dynamic variables to be passed in
|
||||
inference_params[k] = v
|
||||
|
||||
data = json.dumps({
|
||||
"inputText": prompt,
|
||||
"textGenerationConfig": inference_params,
|
||||
})
|
||||
|
||||
## LOGGING
|
||||
logging_obj.pre_call(
|
||||
input=prompt,
|
||||
api_key="",
|
||||
additional_args={"complete_input_dict": data},
|
||||
)
|
||||
|
||||
## COMPLETION CALL
|
||||
accept = 'application/json'
|
||||
contentType = 'application/json'
|
||||
if stream == True:
|
||||
response = client.invoke_model_with_response_stream(
|
||||
body=data,
|
||||
modelId=model,
|
||||
accept=accept,
|
||||
contentType=contentType
|
||||
)
|
||||
response = response.get('body')
|
||||
return response
|
||||
# translate to bedrock
|
||||
# bedrock only accepts (str) for inputText
|
||||
if type(input) == list:
|
||||
if len(input) > 1: # input is a list with more than 1 elem, raise Exception, Bedrock only supports one element
|
||||
raise BedrockError(message="Bedrock cannot embed() more than one string - len(input) must always == 1, input = ['hi from litellm']", status_code=400)
|
||||
input_str = "".join(input)
|
||||
|
||||
response = client.invoke_model(
|
||||
body=data,
|
||||
body=json.dumps({
|
||||
"inputText": input_str
|
||||
}),
|
||||
modelId=model,
|
||||
accept=accept,
|
||||
contentType=contentType
|
||||
accept="*/*",
|
||||
contentType="application/json"
|
||||
)
|
||||
|
||||
response_body = json.loads(response.get('body').read())
|
||||
|
||||
## LOGGING
|
||||
logging_obj.post_call(
|
||||
input=prompt,
|
||||
api_key="",
|
||||
original_response=response_body,
|
||||
additional_args={"complete_input_dict": data},
|
||||
)
|
||||
print_verbose(f"raw model_response: {response}")
|
||||
## RESPONSE OBJECT
|
||||
outputText = "default"
|
||||
if provider == "ai21":
|
||||
outputText = response_body.get('completions')[0].get('data').get('text')
|
||||
elif provider == "anthropic":
|
||||
outputText = response_body['completion']
|
||||
model_response["finish_reason"] = response_body["stop_reason"]
|
||||
elif provider == "cohere":
|
||||
outputText = response_body["generations"][0]["text"]
|
||||
else: # amazon titan
|
||||
outputText = response_body.get('results')[0].get('outputText')
|
||||
if "error" in outputText:
|
||||
raise BedrockError(
|
||||
message=outputText,
|
||||
status_code=response.status_code,
|
||||
)
|
||||
else:
|
||||
try:
|
||||
if len(outputText) > 0:
|
||||
model_response["choices"][0]["message"]["content"] = outputText
|
||||
except:
|
||||
raise BedrockError(message=json.dumps(outputText), status_code=response.status_code)
|
||||
embedding_response = response_body["embedding"]
|
||||
|
||||
## CALCULATING USAGE - baseten charges on time, not tokens - have some mapping of cost here.
|
||||
prompt_tokens = len(
|
||||
encoding.encode(prompt)
|
||||
)
|
||||
completion_tokens = len(
|
||||
encoding.encode(model_response["choices"][0]["message"].get("content", ""))
|
||||
)
|
||||
|
||||
model_response["created"] = time.time()
|
||||
model_response["object"] = "list"
|
||||
model_response["data"] = embedding_response
|
||||
model_response["model"] = model
|
||||
model_response["usage"] = {
|
||||
"prompt_tokens": prompt_tokens,
|
||||
"completion_tokens": completion_tokens,
|
||||
"total_tokens": prompt_tokens + completion_tokens,
|
||||
input_tokens = 0
|
||||
|
||||
input_tokens+=len(encoding.encode(input_str))
|
||||
|
||||
model_response["usage"] = {
|
||||
"prompt_tokens": input_tokens,
|
||||
"total_tokens": input_tokens,
|
||||
}
|
||||
|
||||
|
||||
|
||||
return model_response
|
||||
|
||||
|
||||
def embedding():
|
||||
# logic for parsing in - calling - parsing out model embedding calls
|
||||
pass
|
||||
|
|
|
@ -137,6 +137,10 @@ def completion(
|
|||
response = requests.post(
|
||||
completion_url, headers=headers, data=json.dumps(data), stream=optional_params["stream"] if "stream" in optional_params else False
|
||||
)
|
||||
## error handling for cohere calls
|
||||
if response.status_code!=200:
|
||||
raise CohereError(message=response.text, status_code=response.status_code)
|
||||
|
||||
if "stream" in optional_params and optional_params["stream"] == True:
|
||||
return response.iter_lines()
|
||||
else:
|
||||
|
@ -179,11 +183,9 @@ def completion(
|
|||
|
||||
model_response["created"] = time.time()
|
||||
model_response["model"] = model
|
||||
model_response["usage"] = {
|
||||
"prompt_tokens": prompt_tokens,
|
||||
"completion_tokens": completion_tokens,
|
||||
"total_tokens": prompt_tokens + completion_tokens,
|
||||
}
|
||||
model_response.usage.completion_tokens = completion_tokens
|
||||
model_response.usage.prompt_tokens = prompt_tokens
|
||||
model_response.usage.total_tokens = prompt_tokens + completion_tokens
|
||||
return model_response
|
||||
|
||||
def embedding(
|
||||
|
@ -193,6 +195,7 @@ def embedding(
|
|||
logging_obj=None,
|
||||
model_response=None,
|
||||
encoding=None,
|
||||
optional_params=None,
|
||||
):
|
||||
headers = validate_environment(api_key)
|
||||
embed_url = "https://api.cohere.ai/v1/embed"
|
||||
|
@ -200,8 +203,13 @@ def embedding(
|
|||
data = {
|
||||
"model": model,
|
||||
"texts": input,
|
||||
**optional_params
|
||||
}
|
||||
|
||||
if "3" in model and "input_type" not in data:
|
||||
# cohere v3 embedding models require input_type, if no input_type is provided, default to "search_document"
|
||||
data["input_type"] = "search_document"
|
||||
|
||||
## LOGGING
|
||||
logging_obj.pre_call(
|
||||
input=input,
|
||||
|
@ -212,7 +220,6 @@ def embedding(
|
|||
response = requests.post(
|
||||
embed_url, headers=headers, data=json.dumps(data)
|
||||
)
|
||||
|
||||
## LOGGING
|
||||
logging_obj.post_call(
|
||||
input=input,
|
||||
|
@ -220,7 +227,6 @@ def embedding(
|
|||
additional_args={"complete_input_dict": data},
|
||||
original_response=response,
|
||||
)
|
||||
# print(response.json())
|
||||
"""
|
||||
response
|
||||
{
|
||||
|
@ -232,6 +238,8 @@ def embedding(
|
|||
'usage'
|
||||
}
|
||||
"""
|
||||
if response.status_code!=200:
|
||||
raise CohereError(message=response.text, status_code=response.status_code)
|
||||
embeddings = response.json()['embeddings']
|
||||
output_data = []
|
||||
for idx, embedding in enumerate(embeddings):
|
||||
|
|
|
@ -141,216 +141,239 @@ def completion(
|
|||
litellm_params=None,
|
||||
logger_fn=None,
|
||||
):
|
||||
headers = validate_environment(api_key, headers)
|
||||
task = get_hf_task_for_model(model)
|
||||
print_verbose(f"{model}, {task}")
|
||||
completion_url = ""
|
||||
input_text = None
|
||||
if "https" in model:
|
||||
completion_url = model
|
||||
elif api_base:
|
||||
completion_url = api_base
|
||||
elif "HF_API_BASE" in os.environ:
|
||||
completion_url = os.getenv("HF_API_BASE", "")
|
||||
elif "HUGGINGFACE_API_BASE" in os.environ:
|
||||
completion_url = os.getenv("HUGGINGFACE_API_BASE", "")
|
||||
else:
|
||||
completion_url = f"https://api-inference.huggingface.co/models/{model}"
|
||||
|
||||
## Load Config
|
||||
config=litellm.HuggingfaceConfig.get_config()
|
||||
for k, v in config.items():
|
||||
if k not in optional_params: # completion(top_k=3) > huggingfaceConfig(top_k=3) <- allows for dynamic variables to be passed in
|
||||
optional_params[k] = v
|
||||
|
||||
### MAP INPUT PARAMS
|
||||
if task == "conversational":
|
||||
inference_params = copy.deepcopy(optional_params)
|
||||
inference_params.pop("details")
|
||||
inference_params.pop("return_full_text")
|
||||
past_user_inputs = []
|
||||
generated_responses = []
|
||||
text = ""
|
||||
for message in messages:
|
||||
if message["role"] == "user":
|
||||
if text != "":
|
||||
past_user_inputs.append(text)
|
||||
text = message["content"]
|
||||
elif message["role"] == "assistant" or message["role"] == "system":
|
||||
generated_responses.append(message["content"])
|
||||
data = {
|
||||
"inputs": {
|
||||
"text": text,
|
||||
"past_user_inputs": past_user_inputs,
|
||||
"generated_responses": generated_responses
|
||||
},
|
||||
"parameters": inference_params
|
||||
}
|
||||
input_text = "".join(message["content"] for message in messages)
|
||||
elif task == "text-generation-inference":
|
||||
# always send "details" and "return_full_text" as params
|
||||
if model in custom_prompt_dict:
|
||||
# check if the model has a registered custom prompt
|
||||
model_prompt_details = custom_prompt_dict[model]
|
||||
prompt = custom_prompt(
|
||||
role_dict=model_prompt_details["roles"],
|
||||
initial_prompt_value=model_prompt_details["initial_prompt_value"],
|
||||
final_prompt_value=model_prompt_details["final_prompt_value"],
|
||||
messages=messages
|
||||
)
|
||||
exception_mapping_worked = False
|
||||
try:
|
||||
headers = validate_environment(api_key, headers)
|
||||
task = get_hf_task_for_model(model)
|
||||
print_verbose(f"{model}, {task}")
|
||||
completion_url = ""
|
||||
input_text = None
|
||||
if "https" in model:
|
||||
completion_url = model
|
||||
elif api_base:
|
||||
completion_url = api_base
|
||||
elif "HF_API_BASE" in os.environ:
|
||||
completion_url = os.getenv("HF_API_BASE", "")
|
||||
elif "HUGGINGFACE_API_BASE" in os.environ:
|
||||
completion_url = os.getenv("HUGGINGFACE_API_BASE", "")
|
||||
else:
|
||||
prompt = prompt_factory(model=model, messages=messages)
|
||||
data = {
|
||||
"inputs": prompt,
|
||||
"parameters": optional_params,
|
||||
"stream": True if "stream" in optional_params and optional_params["stream"] == True else False,
|
||||
}
|
||||
input_text = prompt
|
||||
else:
|
||||
# Non TGI and Conversational llms
|
||||
# We need this branch because it removes 'details' and 'return_full_text' from params
|
||||
if model in custom_prompt_dict:
|
||||
# check if the model has a registered custom prompt
|
||||
model_prompt_details = custom_prompt_dict[model]
|
||||
prompt = custom_prompt(
|
||||
role_dict=model_prompt_details["roles"],
|
||||
initial_prompt_value=model_prompt_details["initial_prompt_value"],
|
||||
final_prompt_value=model_prompt_details["final_prompt_value"],
|
||||
messages=messages
|
||||
)
|
||||
else:
|
||||
prompt = prompt_factory(model=model, messages=messages)
|
||||
inference_params = copy.deepcopy(optional_params)
|
||||
inference_params.pop("details")
|
||||
inference_params.pop("return_full_text")
|
||||
data = {
|
||||
"inputs": prompt,
|
||||
"parameters": inference_params,
|
||||
"stream": True if "stream" in optional_params and optional_params["stream"] == True else False,
|
||||
}
|
||||
input_text = prompt
|
||||
## LOGGING
|
||||
logging_obj.pre_call(
|
||||
input=input_text,
|
||||
api_key=api_key,
|
||||
additional_args={"complete_input_dict": data, "task": task, "headers": headers},
|
||||
)
|
||||
## COMPLETION CALL
|
||||
if "stream" in optional_params and optional_params["stream"] == True:
|
||||
response = requests.post(
|
||||
completion_url,
|
||||
headers=headers,
|
||||
data=json.dumps(data),
|
||||
stream=optional_params["stream"]
|
||||
)
|
||||
return response.iter_lines()
|
||||
else:
|
||||
response = requests.post(
|
||||
completion_url,
|
||||
headers=headers,
|
||||
data=json.dumps(data)
|
||||
)
|
||||
completion_url = f"https://api-inference.huggingface.co/models/{model}"
|
||||
|
||||
## Some servers might return streaming responses even though stream was not set to true. (e.g. Baseten)
|
||||
is_streamed = False
|
||||
if response.__dict__['headers']["Content-Type"] == "text/event-stream":
|
||||
is_streamed = True
|
||||
|
||||
# iterate over the complete streamed response, and return the final answer
|
||||
if is_streamed:
|
||||
streamed_response = CustomStreamWrapper(completion_stream=response.iter_lines(), model=model, custom_llm_provider="huggingface", logging_obj=logging_obj)
|
||||
content = ""
|
||||
for chunk in streamed_response:
|
||||
content += chunk["choices"][0]["delta"]["content"]
|
||||
completion_response: List[Dict[str, Any]] = [{"generated_text": content}]
|
||||
## LOGGING
|
||||
logging_obj.post_call(
|
||||
input=input_text,
|
||||
api_key=api_key,
|
||||
original_response=completion_response,
|
||||
additional_args={"complete_input_dict": data, "task": task},
|
||||
)
|
||||
else:
|
||||
## LOGGING
|
||||
logging_obj.post_call(
|
||||
input=input_text,
|
||||
api_key=api_key,
|
||||
original_response=response.text,
|
||||
additional_args={"complete_input_dict": data, "task": task},
|
||||
)
|
||||
## RESPONSE OBJECT
|
||||
try:
|
||||
completion_response = response.json()
|
||||
except:
|
||||
raise HuggingfaceError(
|
||||
message=response.text, status_code=response.status_code
|
||||
## Load Config
|
||||
config=litellm.HuggingfaceConfig.get_config()
|
||||
for k, v in config.items():
|
||||
if k not in optional_params: # completion(top_k=3) > huggingfaceConfig(top_k=3) <- allows for dynamic variables to be passed in
|
||||
optional_params[k] = v
|
||||
|
||||
### MAP INPUT PARAMS
|
||||
if task == "conversational":
|
||||
inference_params = copy.deepcopy(optional_params)
|
||||
inference_params.pop("details")
|
||||
inference_params.pop("return_full_text")
|
||||
past_user_inputs = []
|
||||
generated_responses = []
|
||||
text = ""
|
||||
for message in messages:
|
||||
if message["role"] == "user":
|
||||
if text != "":
|
||||
past_user_inputs.append(text)
|
||||
text = message["content"]
|
||||
elif message["role"] == "assistant" or message["role"] == "system":
|
||||
generated_responses.append(message["content"])
|
||||
data = {
|
||||
"inputs": {
|
||||
"text": text,
|
||||
"past_user_inputs": past_user_inputs,
|
||||
"generated_responses": generated_responses
|
||||
},
|
||||
"parameters": inference_params
|
||||
}
|
||||
input_text = "".join(message["content"] for message in messages)
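As an illustration of the mapping above (conversation contents are made up): the latest user turn becomes `text`, earlier user turns become `past_user_inputs`, and assistant/system turns become `generated_responses`.

```python
messages = [
    {"role": "user", "content": "Hi there"},
    {"role": "assistant", "content": "Hello! How can I help?"},
    {"role": "user", "content": "Tell me a joke"},
]

# The loop above would produce this payload for the Hugging Face "conversational" task:
data = {
    "inputs": {
        "text": "Tell me a joke",                        # most recent user message
        "past_user_inputs": ["Hi there"],                # earlier user messages
        "generated_responses": ["Hello! How can I help?"],
    },
    "parameters": {},  # inference_params minus 'details' and 'return_full_text'
}
```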
|
||||
elif task == "text-generation-inference":
|
||||
# always send "details" and "return_full_text" as params
|
||||
if model in custom_prompt_dict:
|
||||
# check if the model has a registered custom prompt
|
||||
model_prompt_details = custom_prompt_dict[model]
|
||||
prompt = custom_prompt(
|
||||
role_dict=model_prompt_details.get("roles", None),
|
||||
initial_prompt_value=model_prompt_details.get("initial_prompt_value", ""),
|
||||
final_prompt_value=model_prompt_details.get("final_prompt_value", ""),
|
||||
messages=messages
|
||||
)
|
||||
print_verbose(f"response: {completion_response}")
|
||||
if isinstance(completion_response, dict) and "error" in completion_response:
|
||||
print_verbose(f"completion error: {completion_response['error']}")
|
||||
print_verbose(f"response.status_code: {response.status_code}")
|
||||
raise HuggingfaceError(
|
||||
message=completion_response["error"],
|
||||
status_code=response.status_code,
|
||||
)
|
||||
else:
|
||||
if task == "conversational":
|
||||
if len(completion_response["generated_text"]) > 0: # type: ignore
|
||||
model_response["choices"][0]["message"][
|
||||
"content"
|
||||
] = completion_response["generated_text"] # type: ignore
|
||||
elif task == "text-generation-inference":
|
||||
if len(completion_response[0]["generated_text"]) > 0:
|
||||
model_response["choices"][0]["message"][
|
||||
"content"
|
||||
] = completion_response[0]["generated_text"]
|
||||
## GETTING LOGPROBS + FINISH REASON
|
||||
if "details" in completion_response[0] and "tokens" in completion_response[0]["details"]:
|
||||
model_response.choices[0].finish_reason = completion_response[0]["details"]["finish_reason"]
|
||||
sum_logprob = 0
|
||||
for token in completion_response[0]["details"]["tokens"]:
|
||||
sum_logprob += token["logprob"]
|
||||
model_response["choices"][0]["message"]._logprob = sum_logprob
|
||||
if "best_of" in optional_params and optional_params["best_of"] > 1:
|
||||
if "details" in completion_response[0] and "best_of_sequences" in completion_response[0]["details"]:
|
||||
choices_list = []
|
||||
for idx, item in enumerate(completion_response[0]["details"]["best_of_sequences"]):
|
||||
sum_logprob = 0
|
||||
for token in item["tokens"]:
|
||||
sum_logprob += token["logprob"]
|
||||
if len(item["generated_text"]) > 0:
|
||||
message_obj = Message(content=item["generated_text"], logprobs=sum_logprob)
|
||||
else:
|
||||
message_obj = Message(content=None)
|
||||
choice_obj = Choices(finish_reason=item["finish_reason"], index=idx+1, message=message_obj)
|
||||
choices_list.append(choice_obj)
|
||||
model_response["choices"].extend(choices_list)
|
||||
else:
|
||||
if len(completion_response[0]["generated_text"]) > 0:
|
||||
model_response["choices"][0]["message"][
|
||||
"content"
|
||||
] = completion_response[0]["generated_text"]
|
||||
## CALCULATING USAGE
|
||||
prompt_tokens = len(
|
||||
encoding.encode(input_text)
|
||||
) ##[TODO] use the llama2 tokenizer here
|
||||
print_verbose(f'output: {model_response["choices"][0]["message"]}')
|
||||
output_text = model_response["choices"][0]["message"].get("content", "")
|
||||
if output_text is not None and len(output_text) > 0:
|
||||
completion_tokens = len(
|
||||
encoding.encode(model_response["choices"][0]["message"].get("content", ""))
|
||||
) ##[TODO] use the llama2 tokenizer here
|
||||
else:
|
||||
completion_tokens = 0
|
||||
prompt = prompt_factory(model=model, messages=messages)
|
||||
data = {
|
||||
"inputs": prompt,
|
||||
"parameters": optional_params,
|
||||
"stream": True if "stream" in optional_params and optional_params["stream"] == True else False,
|
||||
}
|
||||
input_text = prompt
|
||||
else:
|
||||
# Non TGI and Conversational llms
|
||||
# We need this branch because it removes 'details' and 'return_full_text' from params
|
||||
if model in custom_prompt_dict:
|
||||
# check if the model has a registered custom prompt
|
||||
model_prompt_details = custom_prompt_dict[model]
|
||||
prompt = custom_prompt(
|
||||
role_dict=model_prompt_details.get("roles", {}),
|
||||
initial_prompt_value=model_prompt_details.get("initial_prompt_value", ""),
|
||||
final_prompt_value=model_prompt_details.get("final_prompt_value", ""),
|
||||
bos_token=model_prompt_details.get("bos_token", ""),
|
||||
eos_token=model_prompt_details.get("eos_token", ""),
|
||||
messages=messages,
|
||||
)
|
||||
else:
|
||||
prompt = prompt_factory(model=model, messages=messages)
|
||||
inference_params = copy.deepcopy(optional_params)
|
||||
inference_params.pop("details")
|
||||
inference_params.pop("return_full_text")
|
||||
data = {
|
||||
"inputs": prompt,
|
||||
"parameters": inference_params,
|
||||
"stream": True if "stream" in optional_params and optional_params["stream"] == True else False,
|
||||
}
|
||||
input_text = prompt
|
||||
## LOGGING
|
||||
logging_obj.pre_call(
|
||||
input=input_text,
|
||||
api_key=api_key,
|
||||
additional_args={"complete_input_dict": data, "task": task, "headers": headers},
|
||||
)
|
||||
## COMPLETION CALL
|
||||
if "stream" in optional_params and optional_params["stream"] == True:
|
||||
response = requests.post(
|
||||
completion_url,
|
||||
headers=headers,
|
||||
data=json.dumps(data),
|
||||
stream=optional_params["stream"]
|
||||
)
|
||||
return response.iter_lines()
|
||||
else:
|
||||
response = requests.post(
|
||||
completion_url,
|
||||
headers=headers,
|
||||
data=json.dumps(data)
|
||||
)
|
||||
|
||||
model_response["created"] = time.time()
|
||||
model_response["model"] = model
|
||||
model_response["usage"] = {
|
||||
"prompt_tokens": prompt_tokens,
|
||||
"completion_tokens": completion_tokens,
|
||||
"total_tokens": prompt_tokens + completion_tokens,
|
||||
}
|
||||
return model_response
|
||||
## Some servers might return streaming responses even though stream was not set to true. (e.g. Baseten)
|
||||
is_streamed = False
|
||||
if response.__dict__['headers'].get("Content-Type", "") == "text/event-stream":
|
||||
is_streamed = True
|
||||
|
||||
# iterate over the complete streamed response, and return the final answer
|
||||
if is_streamed:
|
||||
streamed_response = CustomStreamWrapper(completion_stream=response.iter_lines(), model=model, custom_llm_provider="huggingface", logging_obj=logging_obj)
|
||||
content = ""
|
||||
for chunk in streamed_response:
|
||||
content += chunk["choices"][0]["delta"]["content"]
|
||||
completion_response: List[Dict[str, Any]] = [{"generated_text": content}]
|
||||
## LOGGING
|
||||
logging_obj.post_call(
|
||||
input=input_text,
|
||||
api_key=api_key,
|
||||
original_response=completion_response,
|
||||
additional_args={"complete_input_dict": data, "task": task},
|
||||
)
|
||||
else:
|
||||
## LOGGING
|
||||
logging_obj.post_call(
|
||||
input=input_text,
|
||||
api_key=api_key,
|
||||
original_response=response.text,
|
||||
additional_args={"complete_input_dict": data, "task": task},
|
||||
)
|
||||
## RESPONSE OBJECT
|
||||
try:
|
||||
completion_response = response.json()
|
||||
except:
|
||||
import traceback
|
||||
raise HuggingfaceError(
|
||||
message=f"Original Response received: {response.text}; Stacktrace: {traceback.format_exc()}", status_code=response.status_code
|
||||
)
|
||||
print_verbose(f"response: {completion_response}")
|
||||
if isinstance(completion_response, dict) and "error" in completion_response:
|
||||
print_verbose(f"completion error: {completion_response['error']}")
|
||||
print_verbose(f"response.status_code: {response.status_code}")
|
||||
raise HuggingfaceError(
|
||||
message=completion_response["error"],
|
||||
status_code=response.status_code,
|
||||
)
|
||||
else:
|
||||
if task == "conversational":
|
||||
if len(completion_response["generated_text"]) > 0: # type: ignore
|
||||
model_response["choices"][0]["message"][
|
||||
"content"
|
||||
] = completion_response["generated_text"] # type: ignore
|
||||
elif task == "text-generation-inference":
|
||||
if len(completion_response[0]["generated_text"]) > 0:
|
||||
model_response["choices"][0]["message"][
|
||||
"content"
|
||||
] = completion_response[0]["generated_text"]
|
||||
## GETTING LOGPROBS + FINISH REASON
|
||||
if "details" in completion_response[0] and "tokens" in completion_response[0]["details"]:
|
||||
model_response.choices[0].finish_reason = completion_response[0]["details"]["finish_reason"]
|
||||
sum_logprob = 0
|
||||
for token in completion_response[0]["details"]["tokens"]:
|
||||
sum_logprob += token["logprob"]
|
||||
model_response["choices"][0]["message"]._logprob = sum_logprob
|
||||
if "best_of" in optional_params and optional_params["best_of"] > 1:
|
||||
if "details" in completion_response[0] and "best_of_sequences" in completion_response[0]["details"]:
|
||||
choices_list = []
|
||||
for idx, item in enumerate(completion_response[0]["details"]["best_of_sequences"]):
|
||||
sum_logprob = 0
|
||||
for token in item["tokens"]:
|
||||
sum_logprob += token["logprob"]
|
||||
if len(item["generated_text"]) > 0:
|
||||
message_obj = Message(content=item["generated_text"], logprobs=sum_logprob)
|
||||
else:
|
||||
message_obj = Message(content=None)
|
||||
choice_obj = Choices(finish_reason=item["finish_reason"], index=idx+1, message=message_obj)
|
||||
choices_list.append(choice_obj)
|
||||
model_response["choices"].extend(choices_list)
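A hedged usage sketch of the `best_of` path above (the model id is an assumption; any text-generation-inference hosted model behaves the same way): extra sequences come back as additional entries in `choices` with index 1..n.

```python
import litellm

response = litellm.completion(
    model="huggingface/HuggingFaceH4/zephyr-7b-beta",    # assumed TGI-hosted model id
    messages=[{"role": "user", "content": "Write a haiku about the sea."}],
    best_of=2,                                           # forwarded to text-generation-inference
)
for choice in response["choices"]:                       # index 0 plus one best_of_sequence
    print(choice["index"], choice["message"]["content"])
```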
|
||||
else:
|
||||
if len(completion_response[0]["generated_text"]) > 0:
|
||||
model_response["choices"][0]["message"][
|
||||
"content"
|
||||
] = completion_response[0]["generated_text"]
|
||||
## CALCULATING USAGE
|
||||
prompt_tokens = 0
|
||||
try:
|
||||
prompt_tokens = len(
|
||||
encoding.encode(input_text)
|
||||
) ##[TODO] use the llama2 tokenizer here
|
||||
except:
|
||||
# this should remain non-blocking; we should not block the response from returning if calculating usage fails
|
||||
pass
|
||||
print_verbose(f'output: {model_response["choices"][0]["message"]}')
|
||||
output_text = model_response["choices"][0]["message"].get("content", "")
|
||||
if output_text is not None and len(output_text) > 0:
|
||||
completion_tokens = 0
|
||||
try:
|
||||
completion_tokens = len(
|
||||
encoding.encode(model_response["choices"][0]["message"].get("content", ""))
|
||||
) ##[TODO] use the llama2 tokenizer here
|
||||
except:
|
||||
# this should remain non-blocking; we should not block the response from returning if calculating usage fails
|
||||
pass
|
||||
else:
|
||||
completion_tokens = 0
|
||||
|
||||
model_response["created"] = time.time()
|
||||
model_response["model"] = model
|
||||
model_response.usage.completion_tokens = completion_tokens
|
||||
model_response.usage.prompt_tokens = prompt_tokens
|
||||
model_response.usage.total_tokens = prompt_tokens + completion_tokens
|
||||
model_response._hidden_params["original_response"] = completion_response
|
||||
return model_response
|
||||
except HuggingfaceError as e:
|
||||
exception_mapping_worked = True
|
||||
raise e
|
||||
except Exception as e:
|
||||
if exception_mapping_worked:
|
||||
raise e
|
||||
else:
|
||||
import traceback
|
||||
raise HuggingfaceError(status_code=500, message=traceback.format_exc())
|
||||
|
||||
|
||||
def embedding(
|
||||
|
@ -376,9 +399,19 @@ def embedding(
|
|||
else:
|
||||
embed_url = f"https://api-inference.huggingface.co/models/{model}"
|
||||
|
||||
data = {
|
||||
"inputs": input
|
||||
}
|
||||
if "sentence-transformers" in model:
|
||||
if len(input) == 0:
|
||||
raise HuggingfaceError(status_code=400, message="sentence transformers requires 2+ sentences")
|
||||
data = {
|
||||
"inputs": {
|
||||
"source_sentence": input[0],
|
||||
"sentences": [ "That is a happy dog", "That is a very happy person", "Today is a sunny day" ]
|
||||
}
|
||||
}
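A hedged usage sketch for this sentence-similarity branch (the model id is an assumption): the first string is treated as the source sentence and the remaining strings are scored against it.

```python
import litellm

response = litellm.embedding(
    model="huggingface/sentence-transformers/all-MiniLM-L6-v2",  # assumed sentence-transformers model
    input=[
        "That is a happy person",     # source_sentence
        "That is a happy dog",        # compared against the source sentence
        "Today is a sunny day",
    ],
)
print(response["data"])               # one similarity score per compared sentence
```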
|
||||
else:
|
||||
data = {
|
||||
"inputs": input # type: ignore
|
||||
}
|
||||
|
||||
## LOGGING
|
||||
logging_obj.pre_call(
|
||||
|
@ -403,15 +436,37 @@ def embedding(
|
|||
|
||||
embeddings = response.json()
|
||||
|
||||
if "error" in embeddings:
|
||||
raise HuggingfaceError(status_code=500, message=embeddings['error'])
|
||||
|
||||
output_data = []
|
||||
for idx, embedding in enumerate(embeddings):
|
||||
output_data.append(
|
||||
if "similarities" in embeddings:
|
||||
for idx, embedding in embeddings["similarities"]:
|
||||
output_data.append(
|
||||
{
|
||||
"object": "embedding",
|
||||
"index": idx,
|
||||
"embedding": embedding[0][0] # flatten list returned from hf
|
||||
"embedding": embedding # flatten list returned from hf
|
||||
}
|
||||
)
|
||||
else:
|
||||
for idx, embedding in enumerate(embeddings):
|
||||
if isinstance(embedding, float):
|
||||
output_data.append(
|
||||
{
|
||||
"object": "embedding",
|
||||
"index": idx,
|
||||
"embedding": embedding # flatten list returned from hf
|
||||
}
|
||||
)
|
||||
else:
|
||||
output_data.append(
|
||||
{
|
||||
"object": "embedding",
|
||||
"index": idx,
|
||||
"embedding": embedding[0][0] # flatten list returned from hf
|
||||
}
|
||||
)
|
||||
model_response["object"] = "list"
|
||||
model_response["data"] = output_data
|
||||
model_response["model"] = model
|
||||
|
|
161 litellm/llms/maritalk.py Normal file
|
@ -0,0 +1,161 @@
|
|||
import os, types
|
||||
import json
|
||||
from enum import Enum
|
||||
import requests
|
||||
import time, traceback
|
||||
from typing import Callable, Optional, List
|
||||
from litellm.utils import ModelResponse, Choices, Message
|
||||
import litellm
|
||||
|
||||
class MaritalkError(Exception):
|
||||
def __init__(self, status_code, message):
|
||||
self.status_code = status_code
|
||||
self.message = message
|
||||
super().__init__(
|
||||
self.message
|
||||
) # Call the base class constructor with the parameters it needs
|
||||
|
||||
class MaritTalkConfig():
    """
    The class `MaritTalkConfig` provides configuration for the MaritTalk API interface. Here are the parameters:

    - `max_tokens` (integer): Maximum number of tokens the model will generate as part of the response. Default is 1.

    - `model` (string): The model used for conversation. Default is 'maritalk'.

    - `do_sample` (boolean): If set to True, the API will generate a response using sampling. Default is True.

    - `temperature` (number): A non-negative float controlling the randomness in generation. Lower temperatures result in less random generations. Default is 0.7.

    - `top_p` (number): Selection threshold for token inclusion based on cumulative probability. Default is 0.95.

    - `repetition_penalty` (number): Penalty for repetition in the generated conversation. Default is 1.

    - `stopping_tokens` (list of string): List of tokens at which generation can be stopped.
    """
|
||||
max_tokens: Optional[int] = None
|
||||
model: Optional[str] = None
|
||||
do_sample: Optional[bool] = None
|
||||
temperature: Optional[float] = None
|
||||
top_p: Optional[float] = None
|
||||
repetition_penalty: Optional[float] = None
|
||||
stopping_tokens: Optional[List[str]] = None
|
||||
|
||||
def __init__(self,
|
||||
max_tokens: Optional[int]=None,
|
||||
model: Optional[str] = None,
|
||||
do_sample: Optional[bool] = None,
|
||||
temperature: Optional[float] = None,
|
||||
top_p: Optional[float] = None,
|
||||
repetition_penalty: Optional[float] = None,
|
||||
stopping_tokens: Optional[List[str]] = None) -> None:
|
||||
|
||||
locals_ = locals()
|
||||
for key, value in locals_.items():
|
||||
if key != 'self' and value is not None:
|
||||
setattr(self.__class__, key, value)
|
||||
|
||||
@classmethod
|
||||
def get_config(cls):
|
||||
return {k: v for k, v in cls.__dict__.items()
|
||||
if not k.startswith('__')
|
||||
and not isinstance(v, (types.FunctionType, types.BuiltinFunctionType, classmethod, staticmethod))
|
||||
and v is not None}
|
||||
|
||||
def validate_environment(api_key):
|
||||
headers = {
|
||||
"accept": "application/json",
|
||||
"content-type": "application/json",
|
||||
}
|
||||
if api_key:
|
||||
headers["Authorization"] = f"Key {api_key}"
|
||||
return headers
|
||||
|
||||
def completion(
|
||||
model: str,
|
||||
messages: list,
|
||||
api_base: str,
|
||||
model_response: ModelResponse,
|
||||
print_verbose: Callable,
|
||||
encoding,
|
||||
api_key,
|
||||
logging_obj,
|
||||
optional_params=None,
|
||||
litellm_params=None,
|
||||
logger_fn=None,
|
||||
):
|
||||
headers = validate_environment(api_key)
|
||||
completion_url = api_base
|
||||
model = model
|
||||
|
||||
## Load Config
|
||||
config=litellm.MaritTalkConfig.get_config()
|
||||
for k, v in config.items():
|
||||
if k not in optional_params: # completion(top_k=3) > maritalk_config(top_k=3) <- allows for dynamic variables to be passed in
|
||||
optional_params[k] = v
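The comment above describes the precedence rule used throughout these provider modules; a small sketch of what it means in practice (parameter values are illustrative):

```python
import litellm

# Provider-level defaults: stored on the MaritTalkConfig class by its __init__.
litellm.MaritTalkConfig(max_tokens=200, temperature=0.7)

# Per-call kwargs win: temperature=0.2 is already in optional_params, so the loop above
# only copies max_tokens=200 from the config; temperature stays 0.2 for this request.
response = litellm.completion(
    model="maritalk",                 # assumes this alias resolves to the MariTalk provider
    messages=[{"role": "user", "content": "Olá, tudo bem?"}],
    temperature=0.2,
)
```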
|
||||
|
||||
data = {
|
||||
"messages": messages,
|
||||
**optional_params,
|
||||
}
|
||||
|
||||
## LOGGING
|
||||
logging_obj.pre_call(
|
||||
input=messages,
|
||||
api_key=api_key,
|
||||
additional_args={"complete_input_dict": data},
|
||||
)
|
||||
## COMPLETION CALL
|
||||
response = requests.post(
|
||||
completion_url, headers=headers, data=json.dumps(data), stream=optional_params["stream"] if "stream" in optional_params else False
|
||||
)
|
||||
if "stream" in optional_params and optional_params["stream"] == True:
|
||||
return response.iter_lines()
|
||||
else:
|
||||
## LOGGING
|
||||
logging_obj.post_call(
|
||||
input=messages,
|
||||
api_key=api_key,
|
||||
original_response=response.text,
|
||||
additional_args={"complete_input_dict": data},
|
||||
)
|
||||
print_verbose(f"raw model_response: {response.text}")
|
||||
## RESPONSE OBJECT
|
||||
completion_response = response.json()
|
||||
if "error" in completion_response:
|
||||
raise MaritalkError(
|
||||
message=completion_response["error"],
|
||||
status_code=response.status_code,
|
||||
)
|
||||
else:
|
||||
try:
|
||||
if len(completion_response["answer"]) > 0:
|
||||
model_response["choices"][0]["message"]["content"] = completion_response["answer"]
|
||||
except Exception as e:
|
||||
raise MaritalkError(message=response.text, status_code=response.status_code)
|
||||
|
||||
## CALCULATING USAGE
|
||||
prompt = "".join(m["content"] for m in messages)
|
||||
prompt_tokens = len(
|
||||
encoding.encode(prompt)
|
||||
)
|
||||
completion_tokens = len(
|
||||
encoding.encode(model_response["choices"][0]["message"].get("content", ""))
|
||||
)
|
||||
|
||||
model_response["created"] = time.time()
|
||||
model_response["model"] = model
|
||||
model_response.usage.completion_tokens = completion_tokens
|
||||
model_response.usage.prompt_tokens = prompt_tokens
|
||||
model_response.usage.total_tokens = prompt_tokens + completion_tokens
|
||||
return model_response
|
||||
|
||||
def embedding(
|
||||
model: str,
|
||||
input: list,
|
||||
api_key: Optional[str] = None,
|
||||
logging_obj=None,
|
||||
model_response=None,
|
||||
encoding=None,
|
||||
):
|
||||
pass
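An end-to-end usage sketch for this new provider module (API key is a placeholder, and the `maritalk` alias is assumed to route to this provider): the request is posted as `{"messages": [...], **optional_params}` to https://chat.maritaca.ai/api/chat/inference.

```python
import os
import litellm

os.environ["MARITALK_API_KEY"] = "sk-..."   # placeholder key

response = litellm.completion(
    model="maritalk",
    messages=[{"role": "user", "content": "Quem descobriu o Brasil?"}],
    max_tokens=100,
)
print(response["choices"][0]["message"]["content"])
```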
|
|
@ -171,11 +171,9 @@ def completion(
|
|||
|
||||
model_response["created"] = time.time()
|
||||
model_response["model"] = model
|
||||
model_response["usage"] = {
|
||||
"prompt_tokens": prompt_tokens,
|
||||
"completion_tokens": completion_tokens,
|
||||
"total_tokens": prompt_tokens + completion_tokens,
|
||||
}
|
||||
model_response.usage.completion_tokens = completion_tokens
|
||||
model_response.usage.prompt_tokens = prompt_tokens
|
||||
model_response.usage.total_tokens = prompt_tokens + completion_tokens
|
||||
return model_response
|
||||
|
||||
def embedding():
|
||||
|
|
|
@ -147,7 +147,6 @@ def get_ollama_response_stream(
|
|||
yield completion_obj
|
||||
except Exception as e:
|
||||
traceback.print_exc()
|
||||
print(f"Error decoding JSON: {e}")
|
||||
session.close()
|
||||
|
||||
if async_generator_imported:
|
||||
|
@ -198,5 +197,6 @@ if async_generator_imported:
|
|||
completion_obj["content"] = j["response"]
|
||||
await yield_({"choices": [{"delta": completion_obj}]})
|
||||
except Exception as e:
|
||||
print(f"Error decoding JSON: {e}")
|
||||
import logging
|
||||
logging.debug(f"Error decoding JSON: {e}")
|
||||
session.close()
|
|
@ -111,11 +111,9 @@ def completion(
|
|||
|
||||
model_response["created"] = time.time()
|
||||
model_response["model"] = model
|
||||
model_response["usage"] = {
|
||||
"prompt_tokens": prompt_tokens,
|
||||
"completion_tokens": completion_tokens,
|
||||
"total_tokens": prompt_tokens + completion_tokens,
|
||||
}
|
||||
model_response.usage.completion_tokens = completion_tokens
|
||||
model_response.usage.prompt_tokens = prompt_tokens
|
||||
model_response.usage.total_tokens = prompt_tokens + completion_tokens
|
||||
return model_response
|
||||
|
||||
def embedding():
|
||||
|
|
|
@ -157,11 +157,9 @@ def completion(
|
|||
|
||||
model_response["created"] = time.time()
|
||||
model_response["model"] = "palm/" + model
|
||||
model_response["usage"] = {
|
||||
"prompt_tokens": prompt_tokens,
|
||||
"completion_tokens": completion_tokens,
|
||||
"total_tokens": prompt_tokens + completion_tokens,
|
||||
}
|
||||
model_response.usage.completion_tokens = completion_tokens
|
||||
model_response.usage.prompt_tokens = prompt_tokens
|
||||
model_response.usage.total_tokens = prompt_tokens + completion_tokens
|
||||
return model_response
|
||||
|
||||
def embedding():
|
||||
|
|
|
@ -176,11 +176,9 @@ def completion(
|
|||
|
||||
model_response["created"] = time.time()
|
||||
model_response["model"] = model
|
||||
model_response["usage"] = {
|
||||
"prompt_tokens": prompt_tokens,
|
||||
"completion_tokens": completion_tokens,
|
||||
"total_tokens": prompt_tokens + completion_tokens,
|
||||
}
|
||||
model_response.usage.completion_tokens = completion_tokens
|
||||
model_response.usage.prompt_tokens = prompt_tokens
|
||||
model_response.usage.total_tokens = prompt_tokens + completion_tokens
|
||||
return model_response
|
||||
|
||||
def embedding():
|
||||
|
|
|
@ -240,11 +240,9 @@ def completion(
|
|||
prompt_tokens = len(encoding.encode(prompt))
|
||||
completion_tokens = len(encoding.encode(model_response["choices"][0]["message"].get("content", "")))
|
||||
model_response["model"] = "replicate/" + model
|
||||
model_response["usage"] = {
|
||||
"prompt_tokens": prompt_tokens,
|
||||
"completion_tokens": completion_tokens,
|
||||
"total_tokens": prompt_tokens + completion_tokens,
|
||||
}
|
||||
model_response.usage.completion_tokens = completion_tokens
|
||||
model_response.usage.prompt_tokens = prompt_tokens
|
||||
model_response.usage.total_tokens = prompt_tokens + completion_tokens
|
||||
return model_response
|
||||
|
||||
|
||||
|
|
|
@ -169,11 +169,9 @@ def completion(
|
|||
|
||||
model_response["created"] = time.time()
|
||||
model_response["model"] = model
|
||||
model_response["usage"] = {
|
||||
"prompt_tokens": prompt_tokens,
|
||||
"completion_tokens": completion_tokens,
|
||||
"total_tokens": prompt_tokens + completion_tokens,
|
||||
}
|
||||
model_response.usage.completion_tokens = completion_tokens
|
||||
model_response.usage.prompt_tokens = prompt_tokens
|
||||
model_response.usage.total_tokens = prompt_tokens + completion_tokens
|
||||
return model_response
|
||||
|
||||
def embedding():
|
||||
|
|
|
@ -99,15 +99,18 @@ def completion(
|
|||
if k not in optional_params: # completion(top_k=3) > togetherai_config(top_k=3) <- allows for dynamic variables to be passed in
|
||||
optional_params[k] = v
|
||||
|
||||
print_verbose(f"CUSTOM PROMPT DICT: {custom_prompt_dict}; model: {model}")
|
||||
if model in custom_prompt_dict:
|
||||
# check if the model has a registered custom prompt
|
||||
model_prompt_details = custom_prompt_dict[model]
|
||||
prompt = custom_prompt(
|
||||
role_dict=model_prompt_details["roles"],
|
||||
initial_prompt_value=model_prompt_details["initial_prompt_value"],
|
||||
final_prompt_value=model_prompt_details["final_prompt_value"],
|
||||
messages=messages
|
||||
)
|
||||
role_dict=model_prompt_details.get("roles", {}),
|
||||
initial_prompt_value=model_prompt_details.get("initial_prompt_value", ""),
|
||||
final_prompt_value=model_prompt_details.get("final_prompt_value", ""),
|
||||
bos_token=model_prompt_details.get("bos_token", ""),
|
||||
eos_token=model_prompt_details.get("eos_token", ""),
|
||||
messages=messages,
|
||||
)
|
||||
else:
|
||||
prompt = prompt_factory(model=model, messages=messages)
|
||||
|
||||
|
@ -175,11 +178,9 @@ def completion(
|
|||
model_response.choices[0].finish_reason = completion_response["output"]["choices"][0]["finish_reason"]
|
||||
model_response["created"] = time.time()
|
||||
model_response["model"] = model
|
||||
model_response["usage"] = {
|
||||
"prompt_tokens": prompt_tokens,
|
||||
"completion_tokens": completion_tokens,
|
||||
"total_tokens": prompt_tokens + completion_tokens,
|
||||
}
|
||||
model_response.usage.completion_tokens = completion_tokens
|
||||
model_response.usage.prompt_tokens = prompt_tokens
|
||||
model_response.usage.total_tokens = prompt_tokens + completion_tokens
|
||||
return model_response
|
||||
|
||||
def embedding():
|
||||
|
|
|
@ -109,7 +109,12 @@ def completion(
|
|||
logging_obj.pre_call(input=prompt, api_key=None, additional_args={"complete_input_dict": optional_params})
|
||||
|
||||
if "stream" in optional_params and optional_params["stream"] == True:
|
||||
# NOTE: VertexAI does not accept stream=True as a param and raises an error;
# we handle this by removing 'stream' from optional_params and sending the request.
# After we get the response, we set optional_params["stream"] = True again, since main.py needs to know it's a streaming response to then transform it for the OpenAI format
|
||||
optional_params.pop("stream", None) # vertex ai raises an error when passing stream in optional params
|
||||
model_response = chat.send_message_streaming(prompt, **optional_params)
|
||||
optional_params["stream"] = True
|
||||
return model_response
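A minimal sketch of the pop-then-restore pattern used here, assuming `chat` is an already-initialized Vertex AI chat session; the helper name is illustrative.

```python
def send_vertex_streaming(chat, prompt, optional_params):
    # Vertex AI raises an error if 'stream' is passed as a kwarg, so drop it first.
    optional_params.pop("stream", None)
    iterator = chat.send_message_streaming(prompt, **optional_params)
    # Put the flag back so the caller knows to wrap this as a streaming response.
    optional_params["stream"] = True
    return iterator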
|
||||
|
||||
completion_response = chat.send_message(prompt, **optional_params).text
|
||||
|
@ -118,7 +123,9 @@ def completion(
|
|||
logging_obj.pre_call(input=prompt, api_key=None)
|
||||
|
||||
if "stream" in optional_params and optional_params["stream"] == True:
|
||||
optional_params.pop("stream", None) # See note above on handling streaming for vertex ai
|
||||
model_response = text_model.predict_streaming(prompt, **optional_params)
|
||||
optional_params["stream"] = True
|
||||
return model_response
|
||||
|
||||
completion_response = text_model.predict(prompt, **optional_params).text
|
||||
|
@ -144,11 +151,9 @@ def completion(
|
|||
encoding.encode(model_response["choices"][0]["message"].get("content", ""))
|
||||
)
|
||||
|
||||
model_response["usage"] = {
|
||||
"prompt_tokens": prompt_tokens,
|
||||
"completion_tokens": completion_tokens,
|
||||
"total_tokens": prompt_tokens + completion_tokens,
|
||||
}
|
||||
model_response.usage.completion_tokens = completion_tokens
|
||||
model_response.usage.prompt_tokens = prompt_tokens
|
||||
model_response.usage.total_tokens = prompt_tokens + completion_tokens
|
||||
return model_response
|
||||
|
||||
|
||||
|
|
|
@ -90,11 +90,9 @@ def completion(
|
|||
|
||||
model_response["created"] = time.time()
|
||||
model_response["model"] = model
|
||||
model_response["usage"] = {
|
||||
"prompt_tokens": prompt_tokens,
|
||||
"completion_tokens": completion_tokens,
|
||||
"total_tokens": prompt_tokens + completion_tokens,
|
||||
}
|
||||
model_response.usage.completion_tokens = completion_tokens
|
||||
model_response.usage.prompt_tokens = prompt_tokens
|
||||
model_response.usage.total_tokens = prompt_tokens + completion_tokens
|
||||
return model_response
|
||||
|
||||
def batch_completions(
|
||||
|
@ -172,11 +170,9 @@ def batch_completions(
|
|||
|
||||
model_response["created"] = time.time()
|
||||
model_response["model"] = model
|
||||
model_response["usage"] = {
|
||||
"prompt_tokens": prompt_tokens,
|
||||
"completion_tokens": completion_tokens,
|
||||
"total_tokens": prompt_tokens + completion_tokens,
|
||||
}
|
||||
model_response.usage.completion_tokens = completion_tokens
|
||||
model_response.usage.prompt_tokens = prompt_tokens
|
||||
model_response.usage.total_tokens = prompt_tokens + completion_tokens
|
||||
final_outputs.append(model_response)
|
||||
return final_outputs
|
||||
|
||||
|
|
425 litellm/main.py
|
@ -47,7 +47,8 @@ from .llms import (
|
|||
petals,
|
||||
oobabooga,
|
||||
palm,
|
||||
vertex_ai)
|
||||
vertex_ai,
|
||||
maritalk)
|
||||
from .llms.openai import OpenAIChatCompletion
|
||||
from .llms.prompt_templates.factory import prompt_factory, custom_prompt, function_call_prompt
|
||||
import tiktoken
|
||||
|
@ -59,9 +60,10 @@ from litellm.utils import (
|
|||
get_secret,
|
||||
CustomStreamWrapper,
|
||||
ModelResponse,
|
||||
TextCompletionResponse,
|
||||
TextChoices,
|
||||
EmbeddingResponse,
|
||||
read_config_args,
|
||||
RateLimitManager,
|
||||
Choices,
|
||||
Message
|
||||
)
|
||||
|
@ -73,21 +75,42 @@ openai_proxy_chat_completions = OpenAIChatCompletion()
|
|||
|
||||
async def acompletion(*args, **kwargs):
|
||||
"""
|
||||
Asynchronously perform a completion() using the any LiteLLM model (ex gpt-3.5-turbo, claude-2)
|
||||
|
||||
This function takes the same arguments as the 'completion' function and is used for asynchronous completion requests.
|
||||
Asynchronously executes a litellm.completion() call for any of litellm's supported LLMs (e.g. gpt-4, gpt-3.5-turbo, claude-2, command-nightly)
|
||||
|
||||
Parameters:
|
||||
*args: Positional arguments to pass to the 'litellm.completion' function.
|
||||
**kwargs: Keyword arguments to pass to the 'litellm.completion' function.
|
||||
model (str): The name of the language model to use for text completion. see all supported LLMs: https://docs.litellm.ai/docs/providers/
|
||||
messages (List): A list of message objects representing the conversation context (default is an empty list).
|
||||
|
||||
OPTIONAL PARAMS
|
||||
functions (List, optional): A list of functions to apply to the conversation messages (default is an empty list).
|
||||
function_call (str, optional): The name of the function to call within the conversation (default is an empty string).
|
||||
temperature (float, optional): The temperature parameter for controlling the randomness of the output (default is 1.0).
|
||||
top_p (float, optional): The top-p parameter for nucleus sampling (default is 1.0).
|
||||
n (int, optional): The number of completions to generate (default is 1).
|
||||
stream (bool, optional): If True, return a streaming response (default is False).
|
||||
stop (string/list, optional): Up to 4 sequences where the LLM API will stop generating further tokens.
|
||||
max_tokens (integer, optional): The maximum number of tokens in the generated completion (default is infinity).
|
||||
presence_penalty (float, optional): It is used to penalize new tokens based on their existence in the text so far.
|
||||
frequency_penalty (float, optional): Used to penalize new tokens based on their frequency in the text so far.
|
||||
logit_bias (dict, optional): Used to modify the probability of specific tokens appearing in the completion.
|
||||
user (str, optional): A unique identifier representing your end-user. This can help the LLM provider to monitor and detect abuse.
|
||||
metadata (dict, optional): Pass in additional metadata to tag your completion calls - eg. prompt version, details, etc.
|
||||
api_base (str, optional): Base URL for the API (default is None).
|
||||
api_version (str, optional): API version (default is None).
|
||||
api_key (str, optional): API key (default is None).
|
||||
model_list (list, optional): List of api base, version, keys
|
||||
|
||||
LITELLM Specific Params
|
||||
mock_response (str, optional): If provided, return a mock completion response for testing or debugging purposes (default is None).
|
||||
force_timeout (int, optional): The maximum execution time in seconds for the completion request (default is 600).
|
||||
custom_llm_provider (str, optional): Used for Non-OpenAI LLMs, Example usage for bedrock, set model="amazon.titan-tg1-large" and custom_llm_provider="bedrock"
|
||||
Returns:
|
||||
The completion response, either as a litellm.ModelResponse Object or an async generator if 'stream' is set to True.
|
||||
ModelResponse: A response object containing the generated completion and associated metadata.
|
||||
|
||||
Note:
|
||||
- This function uses asynchronous programming to perform completions.
|
||||
- It leverages the 'loop.run_in_executor' method to execute the synchronous 'completion' function.
|
||||
- If 'stream' is set to True in kwargs, the function returns an async generator.
|
||||
Notes:
|
||||
- This function is an asynchronous version of the `completion` function.
|
||||
- The `completion` function is called using `run_in_executor` to execute synchronously in the event loop.
|
||||
- If `stream` is True, the function returns an async generator that yields completion lines.
|
||||
"""
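A short usage sketch for `acompletion` (the model and prompt are placeholders):

```python
import asyncio
import litellm

async def main():
    response = await litellm.acompletion(
        model="gpt-3.5-turbo",
        messages=[{"role": "user", "content": "Hey, how's it going?"}],
    )
    print(response["choices"][0]["message"]["content"])

asyncio.run(main())
```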
|
||||
loop = asyncio.get_event_loop()
|
||||
|
||||
|
@ -212,6 +235,7 @@ def completion(
|
|||
mock_response (str, optional): If provided, return a mock completion response for testing or debugging purposes (default is None).
|
||||
force_timeout (int, optional): The maximum execution time in seconds for the completion request (default is 600).
|
||||
custom_llm_provider (str, optional): Used for Non-OpenAI LLMs, Example usage for bedrock, set model="amazon.titan-tg1-large" and custom_llm_provider="bedrock"
|
||||
num_retries (int, optional): The number of retries to attempt (default is 0).
|
||||
Returns:
|
||||
ModelResponse: A response object containing the generated completion and associated metadata.
|
||||
|
||||
|
@ -233,13 +257,22 @@ def completion(
|
|||
metadata = kwargs.get('metadata', None)
|
||||
fallbacks = kwargs.get('fallbacks', None)
|
||||
headers = kwargs.get("headers", None)
|
||||
num_retries = kwargs.get("num_retries", None)
|
||||
context_window_fallback_dict = kwargs.get("context_window_fallback_dict", None)
|
||||
### CUSTOM PROMPT TEMPLATE ###
|
||||
initial_prompt_value = kwargs.get("intial_prompt_value", None)
|
||||
roles = kwargs.get("roles", None)
|
||||
final_prompt_value = kwargs.get("final_prompt_value", None)
|
||||
bos_token = kwargs.get("bos_token", None)
|
||||
eos_token = kwargs.get("eos_token", None)
|
||||
######## end of unpacking kwargs ###########
|
||||
openai_params = ["functions", "function_call", "temperature", "temperature", "top_p", "n", "stream", "stop", "max_tokens", "presence_penalty", "frequency_penalty", "logit_bias", "user", "request_timeout", "api_base", "api_version", "api_key"]
|
||||
litellm_params = ["metadata", "acompletion", "caching", "return_async", "mock_response", "api_key", "api_version", "api_base", "force_timeout", "logger_fn", "verbose", "custom_llm_provider", "litellm_logging_obj", "litellm_call_id", "use_client", "id", "fallbacks", "azure", "headers", "model_list"]
|
||||
litellm_params = ["metadata", "acompletion", "caching", "return_async", "mock_response", "api_key", "api_version", "api_base", "force_timeout", "logger_fn", "verbose", "custom_llm_provider", "litellm_logging_obj", "litellm_call_id", "use_client", "id", "fallbacks", "azure", "headers", "model_list", "num_retries", "context_window_fallback_dict", "roles", "final_prompt_value", "bos_token", "eos_token"]
|
||||
default_params = openai_params + litellm_params
|
||||
non_default_params = {k: v for k,v in kwargs.items() if k not in default_params} # model-specific params - pass them straight to the model/provider
|
||||
if mock_response:
|
||||
return mock_completion(model, messages, stream=stream, mock_response=mock_response)
|
||||
|
||||
try:
|
||||
logging = litellm_logging_obj
|
||||
fallbacks = (
|
||||
|
@ -256,6 +289,7 @@ def completion(
|
|||
model = litellm.model_alias_map[
|
||||
model
|
||||
] # update the model to the actual value if an alias has been passed in
|
||||
|
||||
model_response = ModelResponse()
|
||||
|
||||
if kwargs.get('azure', False) == True: # don't remove flag check, to remain backwards compatible for repos like Codium
|
||||
|
@ -264,6 +298,19 @@ def completion(
|
|||
model=deployment_id
|
||||
custom_llm_provider="azure"
|
||||
model, custom_llm_provider, dynamic_api_key, api_base = get_llm_provider(model=model, custom_llm_provider=custom_llm_provider, api_base=api_base)
|
||||
custom_prompt_dict = {} # type: ignore
|
||||
if initial_prompt_value or roles or final_prompt_value or bos_token or eos_token:
|
||||
custom_prompt_dict = {model: {}}
|
||||
if initial_prompt_value:
|
||||
custom_prompt_dict[model]["initial_prompt_value"] = initial_prompt_value
|
||||
if roles:
|
||||
custom_prompt_dict[model]["roles"] = roles
|
||||
if final_prompt_value:
|
||||
custom_prompt_dict[model]["final_prompt_value"] = final_prompt_value
|
||||
if bos_token:
|
||||
custom_prompt_dict[model]["bos_token"] = bos_token
|
||||
if eos_token:
|
||||
custom_prompt_dict[model]["eos_token"] = eos_token
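A hedged sketch of passing these prompt-template kwargs straight into `completion()`; the Llama-2 role markers below are an assumption about the target model's expected format, and the model id is illustrative.

```python
import litellm

response = litellm.completion(
    model="huggingface/meta-llama/Llama-2-7b-chat-hf",   # assumed HF-hosted model
    messages=[{"role": "user", "content": "Hello"}],
    roles={
        "system": {"pre_message": "[INST] <<SYS>>\n", "post_message": "\n<</SYS>>\n [/INST]\n"},
        "user": {"pre_message": "[INST] ", "post_message": " [/INST]\n"},
        "assistant": {"post_message": "\n"},
    },
    initial_prompt_value="<s>",
    final_prompt_value="</s>",
)
```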
|
||||
model_api_key = get_api_key(llm_provider=custom_llm_provider, dynamic_api_key=api_key) # get the api key from the environment if required for the model
|
||||
if model_api_key and "sk-litellm" in model_api_key:
|
||||
api_base = "https://proxy.litellm.ai"
|
||||
|
@ -334,6 +381,11 @@ def completion(
|
|||
get_secret("AZURE_API_KEY")
|
||||
)
|
||||
|
||||
headers = (
|
||||
headers or
|
||||
litellm.headers
|
||||
)
|
||||
|
||||
## LOAD CONFIG - if set
|
||||
config=litellm.AzureOpenAIConfig.get_config()
|
||||
for k, v in config.items():
|
||||
|
@ -345,7 +397,7 @@ def completion(
|
|||
input=messages,
|
||||
api_key=api_key,
|
||||
additional_args={
|
||||
"headers": litellm.headers,
|
||||
"headers": headers,
|
||||
"api_version": api_version,
|
||||
"api_base": api_base,
|
||||
},
|
||||
|
@ -354,7 +406,7 @@ def completion(
|
|||
response = openai.ChatCompletion.create(
|
||||
engine=model,
|
||||
messages=messages,
|
||||
headers=litellm.headers,
|
||||
headers=headers,
|
||||
api_key=api_key,
|
||||
api_base=api_base,
|
||||
api_version=api_version,
|
||||
|
@ -370,7 +422,7 @@ def completion(
|
|||
api_key=api_key,
|
||||
original_response=response,
|
||||
additional_args={
|
||||
"headers": litellm.headers,
|
||||
"headers": headers,
|
||||
"api_version": api_version,
|
||||
"api_base": api_base,
|
||||
},
|
||||
|
@ -403,6 +455,11 @@ def completion(
|
|||
get_secret("OPENAI_API_KEY")
|
||||
)
|
||||
|
||||
headers = (
|
||||
headers or
|
||||
litellm.headers
|
||||
)
|
||||
|
||||
## LOAD CONFIG - if set
|
||||
config=litellm.OpenAIConfig.get_config()
|
||||
for k, v in config.items():
|
||||
|
@ -413,7 +470,7 @@ def completion(
|
|||
logging.pre_call(
|
||||
input=messages,
|
||||
api_key=api_key,
|
||||
additional_args={"headers": litellm.headers, "api_base": api_base},
|
||||
additional_args={"headers": headers, "api_base": api_base},
|
||||
)
|
||||
## COMPLETION CALL
|
||||
try:
|
||||
|
@ -434,7 +491,7 @@ def completion(
|
|||
response = openai.ChatCompletion.create(
|
||||
model=model,
|
||||
messages=messages,
|
||||
headers=litellm.headers, # None by default
|
||||
headers=headers, # None by default
|
||||
api_base=api_base, # thread safe setting base, key, api_version
|
||||
api_key=api_key,
|
||||
api_type="openai",
|
||||
|
@ -447,7 +504,7 @@ def completion(
|
|||
input=messages,
|
||||
api_key=api_key,
|
||||
original_response=str(e),
|
||||
additional_args={"headers": litellm.headers},
|
||||
additional_args={"headers": headers},
|
||||
)
|
||||
raise e
|
||||
|
||||
|
@ -459,10 +516,11 @@ def completion(
|
|||
input=messages,
|
||||
api_key=api_key,
|
||||
original_response=response,
|
||||
additional_args={"headers": litellm.headers},
|
||||
additional_args={"headers": headers},
|
||||
)
|
||||
elif (
|
||||
model in litellm.open_ai_text_completion_models
|
||||
custom_llm_provider == "text-completion-openai"
|
||||
or model in litellm.open_ai_text_completion_models
|
||||
or "ft:babbage-002" in model
|
||||
or "ft:davinci-002" in model # support for finetuned completion models
|
||||
# NOTE: Do NOT add custom_llm_provider == "openai".
|
||||
|
@ -491,23 +549,32 @@ def completion(
|
|||
get_secret("OPENAI_API_KEY")
|
||||
)
|
||||
|
||||
headers = (
|
||||
headers or
|
||||
litellm.headers
|
||||
)
|
||||
|
||||
## LOAD CONFIG - if set
|
||||
config=litellm.OpenAITextCompletionConfig.get_config()
|
||||
for k, v in config.items():
|
||||
if k not in optional_params: # completion(top_k=3) > openai_text_config(top_k=3) <- allows for dynamic variables to be passed in
|
||||
optional_params[k] = v
|
||||
|
||||
|
||||
if litellm.organization:
|
||||
openai.organization = litellm.organization
|
||||
prompt = " ".join([message["content"] for message in messages])
|
||||
|
||||
if len(messages)>0 and "content" in messages[0] and type(messages[0]["content"]) == list:
|
||||
# text-davinci-003 can accept a string or array, if it's an array, assume the array is set in messages[0]['content']
|
||||
# https://platform.openai.com/docs/api-reference/completions/create
|
||||
prompt = messages[0]["content"]
|
||||
else:
|
||||
prompt = " ".join([message["content"] for message in messages]) # type: ignore
|
||||
## LOGGING
|
||||
logging.pre_call(
|
||||
input=prompt,
|
||||
api_key=api_key,
|
||||
additional_args={
|
||||
"openai_organization": litellm.organization,
|
||||
"headers": litellm.headers,
|
||||
"headers": headers,
|
||||
"api_base": api_base,
|
||||
"api_type": openai.api_type,
|
||||
},
|
||||
|
@ -516,7 +583,7 @@ def completion(
|
|||
response = openai.Completion.create(
|
||||
model=model,
|
||||
prompt=prompt,
|
||||
headers=litellm.headers,
|
||||
headers=headers,
|
||||
api_key = api_key,
|
||||
api_base=api_base,
|
||||
**optional_params
|
||||
|
@ -531,12 +598,13 @@ def completion(
|
|||
original_response=response,
|
||||
additional_args={
|
||||
"openai_organization": litellm.organization,
|
||||
"headers": litellm.headers,
|
||||
"headers": headers,
|
||||
"api_base": openai.api_base,
|
||||
"api_type": openai.api_type,
|
||||
},
|
||||
)
|
||||
## RESPONSE OBJECT
|
||||
model_response._hidden_params["original_response"] = response # track original response, if users make a litellm.text_completion() request, we can return the original response
|
||||
choices_list = []
|
||||
for idx, item in enumerate(response["choices"]):
|
||||
if len(item["text"]) > 0:
|
||||
|
@ -601,6 +669,10 @@ def completion(
|
|||
or get_secret("ANTHROPIC_API_BASE")
|
||||
or "https://api.anthropic.com/v1/complete"
|
||||
)
|
||||
custom_prompt_dict = (
|
||||
custom_prompt_dict
|
||||
or litellm.custom_prompt_dict
|
||||
)
|
||||
model_response = anthropic.completion(
|
||||
model=model,
|
||||
messages=messages,
|
||||
|
@ -683,7 +755,7 @@ def completion(
|
|||
response = CustomStreamWrapper(model_response, model, custom_llm_provider="aleph_alpha", logging_obj=logging)
|
||||
return response
|
||||
response = model_response
|
||||
elif model in litellm.cohere_models:
|
||||
elif custom_llm_provider == "cohere":
|
||||
cohere_key = (
|
||||
api_key
|
||||
or litellm.cohere_key
|
||||
|
@ -718,6 +790,40 @@ def completion(
|
|||
response = CustomStreamWrapper(model_response, model, custom_llm_provider="cohere", logging_obj=logging)
|
||||
return response
|
||||
response = model_response
|
||||
elif custom_llm_provider == "maritalk":
|
||||
maritalk_key = (
|
||||
api_key
|
||||
or litellm.maritalk_key
|
||||
or get_secret("MARITALK_API_KEY")
|
||||
or litellm.api_key
|
||||
)
|
||||
|
||||
api_base = (
|
||||
api_base
|
||||
or litellm.api_base
|
||||
or get_secret("MARITALK_API_BASE")
|
||||
or "https://chat.maritaca.ai/api/chat/inference"
|
||||
)
|
||||
|
||||
model_response = maritalk.completion(
|
||||
model=model,
|
||||
messages=messages,
|
||||
api_base=api_base,
|
||||
model_response=model_response,
|
||||
print_verbose=print_verbose,
|
||||
optional_params=optional_params,
|
||||
litellm_params=litellm_params,
|
||||
logger_fn=logger_fn,
|
||||
encoding=encoding,
|
||||
api_key=maritalk_key,
|
||||
logging_obj=logging
|
||||
)
|
||||
|
||||
if "stream" in optional_params and optional_params["stream"] == True:
|
||||
# don't try to access stream object,
|
||||
response = CustomStreamWrapper(model_response, model, custom_llm_provider="maritalk", logging_obj=logging)
|
||||
return response
|
||||
response = model_response
|
||||
elif custom_llm_provider == "deepinfra": # for now this NEEDS to be above Hugging Face otherwise all calls to meta-llama/Llama-2-70b-chat-hf go to hf, we need this to go to deep infra if user sets provider to deep infra
|
||||
# this can be called with the openai python package
|
||||
api_key = (
|
||||
|
@ -734,6 +840,11 @@ def completion(
|
|||
or "https://api.deepinfra.com/v1/openai"
|
||||
)
|
||||
|
||||
headers = (
|
||||
headers or
|
||||
litellm.headers
|
||||
)
|
||||
|
||||
## LOGGING
|
||||
logging.pre_call(
|
||||
input=messages,
|
||||
|
@ -766,7 +877,7 @@ def completion(
|
|||
input=messages,
|
||||
api_key=api_key,
|
||||
original_response=response,
|
||||
additional_args={"headers": litellm.headers},
|
||||
additional_args={"headers": headers},
|
||||
)
|
||||
elif (
|
||||
custom_llm_provider == "huggingface"
|
||||
|
@ -783,6 +894,11 @@ def completion(
|
|||
headers
|
||||
or litellm.headers
|
||||
)
|
||||
|
||||
custom_prompt_dict = (
|
||||
custom_prompt_dict
|
||||
or litellm.custom_prompt_dict
|
||||
)
|
||||
model_response = huggingface_restapi.completion(
|
||||
model=model,
|
||||
messages=messages,
|
||||
|
@ -796,7 +912,7 @@ def completion(
|
|||
encoding=encoding,
|
||||
api_key=huggingface_key,
|
||||
logging_obj=logging,
|
||||
custom_prompt_dict=litellm.custom_prompt_dict
|
||||
custom_prompt_dict=custom_prompt_dict
|
||||
)
|
||||
if "stream" in optional_params and optional_params["stream"] == True:
|
||||
# don't try to access stream object,
|
||||
|
@ -846,15 +962,24 @@ def completion(
|
|||
openai.api_key = get_secret("OPENROUTER_API_KEY") or get_secret(
|
||||
"OR_API_KEY"
|
||||
) or litellm.api_key
|
||||
|
||||
headers = (
|
||||
headers or
|
||||
litellm.headers
|
||||
)
|
||||
|
||||
data = {
|
||||
"model": model,
|
||||
"messages": messages,
|
||||
**optional_params
|
||||
}
|
||||
## LOGGING
|
||||
logging.pre_call(input=messages, api_key=openai.api_key)
|
||||
logging.pre_call(input=messages, api_key=openai.api_key, additional_args={"complete_input_dict": data, "headers": headers})
|
||||
## COMPLETION CALL
|
||||
if litellm.headers:
|
||||
if headers:
|
||||
response = openai.ChatCompletion.create(
|
||||
model=model,
|
||||
messages=messages,
|
||||
headers=litellm.headers,
|
||||
**optional_params,
|
||||
headers=headers,
|
||||
**data,
|
||||
)
|
||||
else:
|
||||
openrouter_site_url = get_secret("OR_SITE_URL")
|
||||
|
@ -866,13 +991,11 @@ def completion(
|
|||
if openrouter_app_name is None:
|
||||
openrouter_app_name = "liteLLM"
|
||||
response = openai.ChatCompletion.create(
|
||||
model=model,
|
||||
messages=messages,
|
||||
headers={
|
||||
"HTTP-Referer": openrouter_site_url, # To identify your site
|
||||
"X-Title": openrouter_app_name, # To identify your app
|
||||
},
|
||||
**optional_params,
|
||||
**data,
|
||||
)
|
||||
## LOGGING
|
||||
logging.post_call(
|
||||
|
@ -894,6 +1017,11 @@ def completion(
|
|||
or get_secret("TOGETHERAI_API_BASE")
|
||||
or "https://api.together.xyz/inference"
|
||||
)
|
||||
|
||||
custom_prompt_dict = (
|
||||
custom_prompt_dict
|
||||
or litellm.custom_prompt_dict
|
||||
)
|
||||
|
||||
model_response = together_ai.completion(
|
||||
model=model,
|
||||
|
@ -906,7 +1034,8 @@ def completion(
|
|||
logger_fn=logger_fn,
|
||||
encoding=encoding,
|
||||
api_key=together_ai_key,
|
||||
logging_obj=logging
|
||||
logging_obj=logging,
|
||||
custom_prompt_dict=custom_prompt_dict
|
||||
)
|
||||
if "stream_tokens" in optional_params and optional_params["stream_tokens"] == True:
|
||||
# don't try to access stream object,
|
||||
|
@ -1038,6 +1167,10 @@ def completion(
|
|||
response = model_response
|
||||
elif custom_llm_provider == "bedrock":
|
||||
# boto3 reads keys from .env
|
||||
custom_prompt_dict = (
|
||||
custom_prompt_dict
|
||||
or litellm.custom_prompt_dict
|
||||
)
|
||||
model_response = bedrock.completion(
|
||||
model=model,
|
||||
messages=messages,
|
||||
|
@ -1087,12 +1220,17 @@ def completion(
|
|||
api_base = (
|
||||
litellm.api_base or
|
||||
api_base or
|
||||
get_secret("OLLAMA_API_BASE") or
|
||||
"http://localhost:11434"
|
||||
|
||||
)
|
||||
if model in litellm.custom_prompt_dict:
|
||||
custom_prompt_dict = (
|
||||
custom_prompt_dict
|
||||
or litellm.custom_prompt_dict
|
||||
)
|
||||
if model in custom_prompt_dict:
|
||||
# check if the model has a registered custom prompt
|
||||
model_prompt_details = litellm.custom_prompt_dict[model]
|
||||
model_prompt_details = custom_prompt_dict[model]
|
||||
prompt = custom_prompt(
|
||||
role_dict=model_prompt_details["roles"],
|
||||
initial_prompt_value=model_prompt_details["initial_prompt_value"],
|
||||
|
@ -1104,7 +1242,7 @@ def completion(
|
|||
|
||||
## LOGGING
|
||||
logging.pre_call(
|
||||
input=prompt, api_key=None, additional_args={"api_base": api_base, "custom_prompt_dict": litellm.custom_prompt_dict}
|
||||
input=prompt, api_key=None, additional_args={"api_base": api_base, "custom_prompt_dict": custom_prompt_dict}
|
||||
)
|
||||
if kwargs.get('acompletion', False) == True:
|
||||
if optional_params.get("stream", False) == True:
|
||||
|
@ -1128,7 +1266,7 @@ def completion(
|
|||
model_response["choices"][0]["message"]["content"] = response_string
|
||||
model_response["created"] = time.time()
|
||||
model_response["model"] = "ollama/" + model
|
||||
prompt_tokens = len(encoding.encode(prompt))
|
||||
prompt_tokens = len(encoding.encode(prompt)) # type: ignore
|
||||
completion_tokens = len(encoding.encode(response_string))
|
||||
model_response["usage"] = {
|
||||
"prompt_tokens": prompt_tokens,
|
||||
|
@ -1224,7 +1362,7 @@ def completion(
|
|||
)
|
||||
|
||||
"""
|
||||
prompt = " ".join([message["content"] for message in messages])
|
||||
prompt = " ".join([message["content"] for message in messages]) # type: ignore
|
||||
resp = requests.post(url, json={
|
||||
'model': model,
|
||||
'params': {
|
||||
|
@ -1263,17 +1401,21 @@ def completion(
|
|||
except Exception as e:
|
||||
## Map to OpenAI Exception
|
||||
raise exception_type(
|
||||
model=model, custom_llm_provider=custom_llm_provider, original_exception=e, completion_kwargs=args,
|
||||
)
|
||||
model=model, custom_llm_provider=custom_llm_provider, original_exception=e, completion_kwargs=args,
|
||||
)
|
||||
|
||||
|
||||
def completion_with_retries(*args, **kwargs):
|
||||
"""
|
||||
Executes a litellm.completion() call, retrying on failure (default: 3 retries, configurable via num_retries)
|
||||
"""
|
||||
try:
|
||||
import tenacity
|
||||
except:
|
||||
raise Exception("tenacity import failed please run `pip install tenacity`")
|
||||
|
||||
retryer = tenacity.Retrying(stop=tenacity.stop_after_attempt(3), reraise=True)
|
||||
|
||||
num_retries = kwargs.pop("num_retries", 3)
|
||||
retryer = tenacity.Retrying(stop=tenacity.stop_after_attempt(num_retries), reraise=True)
|
||||
return retryer(completion, *args, **kwargs)
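# Illustrative usage sketch for completion_with_retries (not part of this diff; the
# model name and prompt are placeholders, and a provider API key is assumed to be set):
import litellm

response = litellm.completion_with_retries(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "Hey, how's it going?"}],
    num_retries=5,  # popped above; tenacity retries the call up to this many times
)
print(response["choices"][0]["message"]["content"])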
|
||||
|
||||
|
||||
|
@ -1297,6 +1439,30 @@ def batch_completion(
|
|||
request_timeout: Optional[int] = None,
|
||||
# Optional liteLLM function params
|
||||
**kwargs):
|
||||
"""
|
||||
Batch litellm.completion function for a given model.
|
||||
|
||||
Args:
|
||||
model (str): The model to use for generating completions.
|
||||
messages (List, optional): List of messages to use as input for generating completions. Defaults to [].
|
||||
functions (List, optional): List of functions to use as input for generating completions. Defaults to [].
|
||||
function_call (str, optional): The function call to use as input for generating completions. Defaults to "".
|
||||
temperature (float, optional): The temperature parameter for generating completions. Defaults to None.
|
||||
top_p (float, optional): The top-p parameter for generating completions. Defaults to None.
|
||||
n (int, optional): The number of completions to generate. Defaults to None.
|
||||
stream (bool, optional): Whether to stream completions or not. Defaults to None.
|
||||
stop (optional): The stop parameter for generating completions. Defaults to None.
|
||||
max_tokens (float, optional): The maximum number of tokens to generate. Defaults to None.
|
||||
presence_penalty (float, optional): The presence penalty for generating completions. Defaults to None.
|
||||
frequency_penalty (float, optional): The frequency penalty for generating completions. Defaults to None.
|
||||
logit_bias (dict, optional): The logit bias for generating completions. Defaults to {}.
|
||||
user (str, optional): The user string for generating completions. Defaults to "".
|
||||
deployment_id (optional): The deployment ID for generating completions. Defaults to None.
|
||||
request_timeout (int, optional): The request timeout for generating completions. Defaults to None.
|
||||
|
||||
Returns:
|
||||
list: A list of completion results.
|
||||
"""
|
||||
args = locals()
|
||||
batch_messages = messages
|
||||
completions = []
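# A hedged sketch of calling batch_completion: `messages` is a list of message lists,
# and one completion result comes back per entry (model and prompts are placeholders).
import litellm

responses = litellm.batch_completion(
    model="gpt-3.5-turbo",
    messages=[
        [{"role": "user", "content": "Write a haiku about the sea"}],
        [{"role": "user", "content": "Write a haiku about the mountains"}],
    ],
    max_tokens=64,
)
for r in responses:
    print(r["choices"][0]["message"]["content"])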
|
||||
|
@ -1393,10 +1559,33 @@ def batch_completion_models(*args, **kwargs):
|
|||
kwargs = {**deployment, **nested_kwargs}
|
||||
futures[deployment["model"]] = executor.submit(completion, **kwargs)
|
||||
|
||||
done, not_done = concurrent.futures.wait(futures.values(), return_when=concurrent.futures.FIRST_COMPLETED)
|
||||
while futures:
|
||||
# wait for the first returned future
|
||||
print_verbose("\n\n waiting for next result\n\n")
|
||||
done, _ = concurrent.futures.wait(futures.values(), return_when=concurrent.futures.FIRST_COMPLETED)
|
||||
print_verbose(f"done list\n{done}")
|
||||
for future in done:
|
||||
try:
|
||||
result = future.result()
|
||||
return result
|
||||
except Exception as e:
|
||||
# if model 1 fails, continue with response from model 2, model3
|
||||
print_verbose(f"\n\ngot an exception, ignoring, removing from futures")
|
||||
print_verbose(futures)
|
||||
new_futures = {}
|
||||
for key, value in futures.items():
|
||||
if future == value:
|
||||
print_verbose(f"removing key{key}")
|
||||
continue
|
||||
else:
|
||||
new_futures[key] = value
|
||||
futures = new_futures
|
||||
print_verbose(f"new futures{futures}")
|
||||
continue
|
||||
|
||||
for future in done:
|
||||
return future.result()
|
||||
|
||||
print_verbose("\n\ndone looping through futures\n\n")
|
||||
print_verbose(futures)
|
||||
|
||||
return None # If no response is received from any model
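# Illustrative call to batch_completion_models (model names are placeholders; this
# assumes the `models` keyword handled earlier in the function): the same prompt is
# fanned out to every model and the first successful response wins, with failed
# futures dropped from the pool as in the loop above.
import litellm

response = litellm.batch_completion_models(
    models=["gpt-3.5-turbo", "claude-instant-1", "command-nightly"],
    messages=[{"role": "user", "content": "Hey, how's it going?"}],
)
print(response)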
|
||||
|
||||
|
@ -1435,19 +1624,25 @@ def batch_completion_models_all_responses(*args, **kwargs):
|
|||
|
||||
with concurrent.futures.ThreadPoolExecutor(max_workers=len(models)) as executor:
|
||||
for idx, model in enumerate(models):
|
||||
print(f"{GREEN}LiteLLM: Making request to model: {model}{RESET}")
|
||||
future = executor.submit(completion, *args, model=model, **kwargs)
|
||||
if future.result() is not None:
|
||||
responses.append(future.result())
|
||||
print(f"{GREEN}LiteLLM: Model {model} returned response{RESET}")
|
||||
else:
|
||||
print(f"{RED}LiteLLM: Model {model } did not return a response{RESET}")
|
||||
|
||||
return responses
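# Companion sketch for batch_completion_models_all_responses, which waits for every
# model and returns the full list of responses (model names are placeholders and the
# `models` keyword is assumed, as above).
import litellm

all_responses = litellm.batch_completion_models_all_responses(
    models=["gpt-3.5-turbo", "claude-instant-1"],
    messages=[{"role": "user", "content": "Write a one-line poem"}],
)
print(f"received {len(all_responses)} responses")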
|
||||
|
||||
### EMBEDDING ENDPOINTS ####################
|
||||
|
||||
async def aembedding(*args, **kwargs):
|
||||
"""
|
||||
Asynchronously calls the `embedding` function with the given arguments and keyword arguments.
|
||||
|
||||
Parameters:
|
||||
- `args` (tuple): Positional arguments to be passed to the `embedding` function.
|
||||
- `kwargs` (dict): Keyword arguments to be passed to the `embedding` function.
|
||||
|
||||
Returns:
|
||||
- `response` (Any): The response returned by the `embedding` function.
|
||||
"""
|
||||
loop = asyncio.get_event_loop()
|
||||
|
||||
# Use a partial function to pass your keyword arguments
|
||||
|
@ -1481,6 +1676,7 @@ def embedding(
|
|||
api_type: Optional[str] = None,
|
||||
caching: bool=False,
|
||||
custom_llm_provider=None,
|
||||
**kwargs
|
||||
):
|
||||
"""
|
||||
Embedding function that calls an API to generate embeddings for the given input.
|
||||
|
@ -1610,6 +1806,7 @@ def embedding(
|
|||
response = cohere.embedding(
|
||||
model=model,
|
||||
input=input,
|
||||
optional_params=kwargs,
|
||||
encoding=encoding,
|
||||
api_key=cohere_key,
|
||||
logging_obj=logging,
|
||||
|
@ -1632,6 +1829,15 @@ def embedding(
|
|||
logging_obj=logging,
|
||||
model_response= EmbeddingResponse()
|
||||
)
|
||||
elif custom_llm_provider == "bedrock":
|
||||
response = bedrock.embedding(
|
||||
model=model,
|
||||
input=input,
|
||||
encoding=encoding,
|
||||
logging_obj=logging,
|
||||
optional_params=kwargs,
|
||||
model_response= EmbeddingResponse()
|
||||
)
|
||||
else:
|
||||
args = locals()
|
||||
raise ValueError(f"No valid embedding model args passed in - {args}")
|
||||
|
@ -1653,32 +1859,87 @@ def embedding(
|
|||
|
||||
###### Text Completion ################
|
||||
def text_completion(*args, **kwargs):
|
||||
global print_verbose
|
||||
import copy
|
||||
"""
|
||||
This maps to the openai.Completion.create format, which has a different I/O (accepts a prompt, returns ["choices"]["text"]).
|
||||
"""
|
||||
if "prompt" in kwargs:
|
||||
if "engine" in kwargs:
|
||||
kwargs["model"] = kwargs["engine"]
|
||||
kwargs.pop("engine")
|
||||
|
||||
# input validation
|
||||
if "prompt" not in kwargs:
|
||||
raise ValueError("please pass prompt into the `text_completion` endpoint - `text_completion(model, prompt='hello world')`")
|
||||
|
||||
text_completion_response = TextCompletionResponse()
|
||||
model = kwargs["model"]
|
||||
prompt = kwargs["prompt"]
|
||||
# get custom_llm_provider
|
||||
_, custom_llm_provider, dynamic_api_key, api_base = get_llm_provider(model=model)
|
||||
|
||||
if custom_llm_provider == "text-completion-openai":
|
||||
# text-davinci-003 and openai text completion models
|
||||
messages = [{"role": "system", "content": kwargs["prompt"]}]
|
||||
kwargs["messages"] = messages
|
||||
kwargs.pop("prompt")
|
||||
response = completion(*args, **kwargs) # assume the response is the openai response object
|
||||
formatted_response_obj = {
|
||||
"id": response["id"],
|
||||
"object": "text_completion",
|
||||
"created": response["created"],
|
||||
"model": response["model"],
|
||||
"choices": [
|
||||
{
|
||||
"text": response["choices"][0]["message"]["content"],
|
||||
"index": response["choices"][0]["index"],
|
||||
"logprobs": None,
|
||||
"finish_reason": response["choices"][0]["finish_reason"]
|
||||
}
|
||||
],
|
||||
"usage": response["usage"]
|
||||
}
|
||||
return formatted_response_obj
|
||||
# return raw response from openai
|
||||
return response._hidden_params.get("original_response", None)
|
||||
|
||||
elif custom_llm_provider == "huggingface":
|
||||
# if echo == True, for TGI llms we need to set top_n_tokens to 3
|
||||
if kwargs.get("echo", False) == True:
|
||||
# for tgi llms
|
||||
if "top_n_tokens" not in kwargs:
|
||||
kwargs["top_n_tokens"] = 3
|
||||
|
||||
# processing prompt - users can pass raw tokens to OpenAI Completion()
|
||||
if type(prompt) == list:
|
||||
tokenizer = tiktoken.encoding_for_model("text-davinci-003")
|
||||
## if it's a 2d list - each element in the list is a text_completion() request
|
||||
if len(prompt) > 0 and type(prompt[0]) == list:
|
||||
responses = [None for x in prompt] # init responses
|
||||
for i, request in enumerate(prompt):
|
||||
decoded_prompt = tokenizer.decode(request)
|
||||
new_kwargs = copy.deepcopy(kwargs)
|
||||
new_kwargs["prompt"] = decoded_prompt
|
||||
response = text_completion(**new_kwargs)
|
||||
responses[i] = response["choices"][0]
|
||||
|
||||
text_completion_response["id"] = response["id"]
|
||||
text_completion_response["object"] = "text_completion"
|
||||
text_completion_response["created"] = response["created"]
|
||||
text_completion_response["model"] = response["model"]
|
||||
text_completion_response["choices"] = responses
|
||||
text_completion_response["usage"] = response["usage"]
|
||||
|
||||
return text_completion_response
|
||||
else:
|
||||
raise ValueError("please pass prompt into the `text_completion` endpoint - `text_completion(model, prompt='hello world')`")
|
||||
messages = [{"role": "system", "content": kwargs["prompt"]}]
|
||||
kwargs["messages"] = messages
|
||||
kwargs.pop("prompt")
|
||||
response = completion(*args, **kwargs) # assume the response is the openai response object
|
||||
|
||||
transformed_logprobs = None
|
||||
# only supported for TGI models
|
||||
try:
|
||||
raw_response = response._hidden_params.get("original_response", None)
|
||||
transformed_logprobs = litellm.utils.transform_logprobs(raw_response)
|
||||
except Exception as e:
|
||||
print_verbose(f"LiteLLM non blocking exception: {e}")
|
||||
text_completion_response["id"] = response["id"]
|
||||
text_completion_response["object"] = "text_completion"
|
||||
text_completion_response["created"] = response["created"]
|
||||
text_completion_response["model"] = response["model"]
|
||||
text_choices = TextChoices()
|
||||
text_choices["text"] = response["choices"][0]["message"]["content"]
|
||||
text_choices["index"] = response["choices"][0]["index"]
|
||||
text_choices["logprobs"] = transformed_logprobs
|
||||
text_choices["finish_reason"] = response["choices"][0]["finish_reason"]
|
||||
text_completion_response["choices"] = [text_choices]
|
||||
text_completion_response["usage"] = response["usage"]
|
||||
return text_completion_response
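# Brief usage sketch for text_completion (model and prompt are placeholders). It
# mirrors openai.Completion.create, and as handled above the prompt may also be a
# token list or a batch (2d list) of token lists.
import litellm

response = litellm.text_completion(
    model="gpt-3.5-turbo",
    prompt="Say this is a test",
    max_tokens=16,
)
print(response["choices"][0]["text"])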
|
||||
|
||||
##### Moderation #######################
|
||||
def moderation(input: str, api_key: Optional[str]=None):
|
||||
|
@ -1700,7 +1961,7 @@ def moderation(input: str, api_key: Optional[str]=None):
|
|||
## Set verbose to true -> ```litellm.set_verbose = True```
|
||||
def print_verbose(print_statement):
|
||||
if litellm.set_verbose:
|
||||
print(f"LiteLLM: {print_statement}")
|
||||
print(print_statement) # noqa
|
||||
|
||||
def config_completion(**kwargs):
|
||||
if litellm.config_path != None:
|
||||
|
@ -1736,15 +1997,16 @@ def stream_chunk_builder(chunks: list):
|
|||
"finish_reason": finish_reason,
|
||||
}
|
||||
],
|
||||
# "usage": {
|
||||
# "prompt_tokens": 0, # Modify as needed
|
||||
# "completion_tokens": 0, # Modify as needed
|
||||
# "total_tokens": 0 # Modify as needed
|
||||
# }
|
||||
"usage": {
|
||||
"prompt_tokens": 0, # Modify as needed
|
||||
"completion_tokens": 0, # Modify as needed
|
||||
"total_tokens": 0 # Modify as needed
|
||||
}
|
||||
}
|
||||
|
||||
# Extract the "content" strings from the nested dictionaries within "choices"
|
||||
content_list = []
|
||||
combined_content = ""
|
||||
|
||||
if "function_call" in chunks[0]["choices"][0]["delta"]:
|
||||
argument_list = []
|
||||
|
@ -1787,6 +2049,5 @@ def stream_chunk_builder(chunks: list):
|
|||
|
||||
|
||||
# # Update usage information if needed
|
||||
# response["usage"]["completion_tokens"] = token
|
||||
|
||||
response["usage"]["completion_tokens"] = litellm.utils.token_counter(model=model, text=combined_content)
|
||||
return response
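# Illustrative round trip for stream_chunk_builder: collect the streamed chunks from
# a completion call and rebuild a single response object, with usage recomputed via
# token_counter as shown above (model name is a placeholder).
import litellm

chunks = []
for chunk in litellm.completion(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "Tell me a short joke"}],
    stream=True,
):
    chunks.append(chunk)

rebuilt = litellm.stream_chunk_builder(chunks)
print(rebuilt["choices"][0]["message"]["content"])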
|
||||
|
|
9
litellm/proxy/config.yaml
Normal file
|
@ -0,0 +1,9 @@
|
|||
model_list:
|
||||
- model_name: zephyr-alpha
|
||||
litellm_params: # params for litellm.completion() - https://docs.litellm.ai/docs/completion/input#input---request-body
|
||||
model: huggingface/HuggingFaceH4/zephyr-7b-alpha
|
||||
api_base: http://0.0.0.0:8001
|
||||
- model_name: zephyr-beta
|
||||
litellm_params:
|
||||
model: huggingface/HuggingFaceH4/zephyr-7b-beta
|
||||
api_base: https://<my-hosted-endpoint>
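# A hedged sketch of consuming a config like the one above from Python: load the YAML
# and hand model_list to a litellm.Router (the path is the file committed in this diff;
# this mirrors what load_router_config in litellm/proxy/proxy_server.py does further down).
import yaml
import litellm

with open("litellm/proxy/config.yaml") as f:
    config = yaml.safe_load(f)

router = litellm.Router(model_list=config["model_list"])
print(router.get_model_names())  # ['zephyr-alpha', 'zephyr-beta']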
|
|
@ -1,152 +0,0 @@
|
|||
from typing import Dict, Optional
|
||||
from collections import defaultdict
|
||||
import threading
|
||||
import os, subprocess, traceback, json
|
||||
from fastapi import HTTPException
|
||||
from fastapi.responses import StreamingResponse
|
||||
|
||||
import backoff
|
||||
import openai.error
|
||||
|
||||
import litellm
|
||||
from litellm.utils import trim_messages
|
||||
from litellm.exceptions import ServiceUnavailableError, InvalidRequestError
|
||||
|
||||
cost_dict: Dict[str, Dict[str, float]] = defaultdict(dict)
|
||||
cost_dict_lock = threading.Lock()
|
||||
|
||||
debug = False
|
||||
##### HELPER FUNCTIONS #####
|
||||
def print_verbose(print_statement):
|
||||
global debug
|
||||
if debug:
|
||||
print(print_statement)
|
||||
|
||||
# for streaming
|
||||
def data_generator(response):
|
||||
print_verbose("inside generator")
|
||||
for chunk in response:
|
||||
print_verbose(f"returned chunk: {chunk}")
|
||||
yield f"data: {json.dumps(chunk)}\n\n"
|
||||
|
||||
def run_ollama_serve():
|
||||
command = ['ollama', 'serve']
|
||||
|
||||
with open(os.devnull, 'w') as devnull:
|
||||
process = subprocess.Popen(command, stdout=devnull, stderr=devnull)
|
||||
|
||||
##### ERROR HANDLING #####
|
||||
class RetryConstantError(Exception):
|
||||
pass
|
||||
|
||||
|
||||
class RetryExpoError(Exception):
|
||||
pass
|
||||
|
||||
|
||||
class UnknownLLMError(Exception):
|
||||
pass
|
||||
|
||||
|
||||
def handle_llm_exception(e: Exception, user_api_base: Optional[str]=None):
|
||||
print(f"\033[1;31mLiteLLM.Exception: {str(e)}\033[0m")
|
||||
if isinstance(e, ServiceUnavailableError) and e.llm_provider == "ollama": # type: ignore
|
||||
run_ollama_serve()
|
||||
if isinstance(e, InvalidRequestError) and e.llm_provider == "ollama": # type: ignore
|
||||
completion_call_details = {}
|
||||
completion_call_details["model"] = e.model # type: ignore
|
||||
if user_api_base:
|
||||
completion_call_details["api_base"] = user_api_base
|
||||
else:
|
||||
completion_call_details["api_base"] = None
|
||||
print(f"\033[1;31mLiteLLM.Exception: Invalid API Call. Call details: Model: \033[1;37m{e.model}\033[1;31m; LLM Provider: \033[1;37m{e.llm_provider}\033[1;31m; Custom API Base - \033[1;37m{completion_call_details['api_base']}\033[1;31m\033[0m") # type: ignore
|
||||
if completion_call_details["api_base"] == "http://localhost:11434":
|
||||
print()
|
||||
print("Trying to call ollama? Try `litellm --model ollama/llama2 --api_base http://localhost:11434`")
|
||||
print()
|
||||
if isinstance(
|
||||
e,
|
||||
(
|
||||
openai.error.APIError,
|
||||
openai.error.TryAgain,
|
||||
openai.error.Timeout,
|
||||
openai.error.ServiceUnavailableError,
|
||||
),
|
||||
):
|
||||
raise RetryConstantError from e
|
||||
elif isinstance(e, openai.error.RateLimitError):
|
||||
raise RetryExpoError from e
|
||||
elif isinstance(
|
||||
e,
|
||||
(
|
||||
openai.error.APIConnectionError,
|
||||
openai.error.InvalidRequestError,
|
||||
openai.error.AuthenticationError,
|
||||
openai.error.PermissionError,
|
||||
openai.error.InvalidAPIType,
|
||||
openai.error.SignatureVerificationError,
|
||||
),
|
||||
):
|
||||
raise e
|
||||
else:
|
||||
raise UnknownLLMError from e
|
||||
|
||||
|
||||
@backoff.on_exception(
|
||||
wait_gen=backoff.constant,
|
||||
exception=RetryConstantError,
|
||||
max_tries=3,
|
||||
interval=3,
|
||||
)
|
||||
@backoff.on_exception(
|
||||
wait_gen=backoff.expo,
|
||||
exception=RetryExpoError,
|
||||
jitter=backoff.full_jitter,
|
||||
max_value=100,
|
||||
factor=1.5,
|
||||
)
|
||||
|
||||
def litellm_completion(data: Dict,
|
||||
type: str,
|
||||
user_model: Optional[str],
|
||||
user_temperature: Optional[str],
|
||||
user_max_tokens: Optional[int],
|
||||
user_request_timeout: Optional[int],
|
||||
user_api_base: Optional[str],
|
||||
user_headers: Optional[dict],
|
||||
user_debug: bool,
|
||||
model_router: Optional[litellm.Router]):
|
||||
try:
|
||||
global debug
|
||||
debug = user_debug
|
||||
if user_model:
|
||||
data["model"] = user_model
|
||||
# override with user settings
|
||||
if user_temperature:
|
||||
data["temperature"] = user_temperature
|
||||
if user_request_timeout:
|
||||
data["request_timeout"] = user_request_timeout
|
||||
if user_max_tokens:
|
||||
data["max_tokens"] = user_max_tokens
|
||||
if user_api_base:
|
||||
data["api_base"] = user_api_base
|
||||
if user_headers:
|
||||
data["headers"] = user_headers
|
||||
if type == "completion":
|
||||
if model_router and data["model"] in model_router.get_model_names():
|
||||
model_router.text_completion(**data)
|
||||
else:
|
||||
response = litellm.text_completion(**data)
|
||||
elif type == "chat_completion":
|
||||
if model_router and data["model"] in model_router.get_model_names():
|
||||
model_router.completion(**data)
|
||||
else:
|
||||
response = litellm.completion(**data)
|
||||
if 'stream' in data and data['stream'] == True: # use generate_responses to stream responses
|
||||
return StreamingResponse(data_generator(response), media_type='text/event-stream')
|
||||
print_verbose(f"response: {response}")
|
||||
return response
|
||||
except Exception as e:
|
||||
print(e)
|
||||
handle_llm_exception(e=e, user_api_base=user_api_base)
|
||||
return {"message": "An error occurred"}, 500
|
|
@ -5,8 +5,9 @@ import random, appdirs
|
|||
from datetime import datetime
|
||||
from dotenv import load_dotenv
|
||||
import operator
|
||||
sys.path.append(os.getcwd())
|
||||
|
||||
config_filename = "litellm.secrets.toml"
|
||||
config_filename = "litellm.secrets"
|
||||
# Using appdirs to determine user-specific config path
|
||||
config_dir = appdirs.user_config_dir("litellm")
|
||||
user_config_path = os.getenv("LITELLM_CONFIG_PATH", os.path.join(config_dir, config_filename))
|
||||
|
@ -22,39 +23,6 @@ def run_ollama_serve():
|
|||
with open(os.devnull, 'w') as devnull:
|
||||
process = subprocess.Popen(command, stdout=devnull, stderr=devnull)
|
||||
|
||||
def open_config(file_path=None):
|
||||
# Create the .env file if it doesn't exist
|
||||
if file_path:
|
||||
# Ensure the user-specific directory exists
|
||||
os.makedirs(config_dir, exist_ok=True)
|
||||
# Copying the file using shutil.copy
|
||||
try:
|
||||
shutil.copy(file_path, user_config_path)
|
||||
with open(file_path) as f:
|
||||
print(f"Source file: {file_path}")
|
||||
print(f.read())
|
||||
|
||||
with open(user_config_path) as f:
|
||||
print(f"Dest file: {user_config_path}")
|
||||
print(f.read())
|
||||
print("\033[1;32mDone successfully\033[0m")
|
||||
except Exception as e:
|
||||
print(f"Failed to copy {file_path}: {e}")
|
||||
else:
|
||||
if os.path.exists(user_config_path):
|
||||
if os.path.getsize(user_config_path) == 0:
|
||||
print(f"{user_config_path} exists but is empty")
|
||||
print(f"To create a config (save keys, modify model prompt), copy the template located here: https://docs.litellm.ai/docs/proxy_server")
|
||||
else:
|
||||
with open(user_config_path) as f:
|
||||
print(f"Saved Config file: {user_config_path}")
|
||||
print(f.read())
|
||||
else:
|
||||
print(f"{user_config_path} hasn't been created yet.")
|
||||
print(f"To create a config (save keys, modify model prompt), copy the template located here: https://docs.litellm.ai/docs/proxy_server")
|
||||
print(f"LiteLLM: config location - {user_config_path}")
|
||||
|
||||
|
||||
def clone_subfolder(repo_url, subfolder, destination):
|
||||
# Clone the full repo
|
||||
repo_name = repo_url.split('/')[-1]
|
||||
|
@ -85,6 +53,7 @@ def is_port_in_use(port):
|
|||
@click.command()
|
||||
@click.option('--host', default='0.0.0.0', help='Host for the server to listen on.')
|
||||
@click.option('--port', default=8000, help='Port to bind the server to.')
|
||||
@click.option('--num_workers', default=1, help='Number of uvicorn workers to spin up')
|
||||
@click.option('--api_base', default=None, help='API base URL.')
|
||||
@click.option('--api_version', default="2023-07-01-preview", help='For azure - pass in the api version.')
|
||||
@click.option('--model', '-m', default=None, help='The model name to pass to litellm expects')
|
||||
|
@ -99,7 +68,7 @@ def is_port_in_use(port):
|
|||
@click.option('--drop_params', is_flag=True, help='Drop any unmapped params')
|
||||
@click.option('--create_proxy', is_flag=True, help='Creates a local OpenAI-compatible server template')
|
||||
@click.option('--add_function_to_prompt', is_flag=True, help='If function passed but unsupported, pass it as prompt')
|
||||
@click.option('--config', '-c', is_flag=True, help='Configure Litellm')
|
||||
@click.option('--config', '-c', help='Configure Litellm')
|
||||
@click.option('--file', '-f', help='Path to config file')
|
||||
@click.option('--max_budget', default=None, type=float, help='Set max budget for API calls - works for hosted models like OpenAI, TogetherAI, Anthropic, etc.')
|
||||
@click.option('--telemetry', default=True, type=bool, help='Helps us know if people are using this feature. Turn this off by doing `--telemetry False`')
|
||||
|
@ -107,17 +76,17 @@ def is_port_in_use(port):
|
|||
@click.option('--test', flag_value=True, help='proxy chat completions url to make a test request to')
|
||||
@click.option('--local', is_flag=True, default=False, help='for local debugging')
|
||||
@click.option('--cost', is_flag=True, default=False, help='for viewing cost logs')
|
||||
def run_server(host, port, api_base, api_version, model, alias, add_key, headers, save, debug, temperature, max_tokens, request_timeout, drop_params, create_proxy, add_function_to_prompt, config, file, max_budget, telemetry, logs, test, local, cost):
|
||||
def run_server(host, port, api_base, api_version, model, alias, add_key, headers, save, debug, temperature, max_tokens, request_timeout, drop_params, create_proxy, add_function_to_prompt, config, file, max_budget, telemetry, logs, test, local, cost, num_workers):
|
||||
global feature_telemetry
|
||||
args = locals()
|
||||
if local:
|
||||
from proxy_server import app, initialize, print_cost_logs, usage_telemetry, add_keys_to_config
|
||||
from proxy_server import app, save_worker_config, print_cost_logs, usage_telemetry, add_keys_to_config
|
||||
debug = True
|
||||
else:
|
||||
try:
|
||||
from .proxy_server import app, initialize, print_cost_logs, usage_telemetry, add_keys_to_config
|
||||
from .proxy_server import app, save_worker_config, print_cost_logs, usage_telemetry, add_keys_to_config
|
||||
except ImportError as e:
|
||||
from proxy_server import app, initialize, print_cost_logs, usage_telemetry, add_keys_to_config
|
||||
from proxy_server import app, save_worker_config, print_cost_logs, usage_telemetry, add_keys_to_config
|
||||
feature_telemetry = usage_telemetry
|
||||
if create_proxy == True:
|
||||
repo_url = 'https://github.com/BerriAI/litellm'
|
||||
|
@ -126,12 +95,6 @@ def run_server(host, port, api_base, api_version, model, alias, add_key, headers
|
|||
|
||||
clone_subfolder(repo_url, subfolder, destination)
|
||||
return
|
||||
if config:
|
||||
if file:
|
||||
open_config(file_path=file)
|
||||
else:
|
||||
open_config()
|
||||
return
|
||||
if logs is not None:
|
||||
if logs == 0: # default to 1
|
||||
logs = 1
|
||||
|
@ -176,10 +139,13 @@ def run_server(host, port, api_base, api_version, model, alias, add_key, headers
|
|||
openai.api_key = "temp-key"
|
||||
print(openai.api_base)
|
||||
|
||||
response = openai.Completion.create(model="gpt-3.5-turbo", prompt='this is a test request, write a short poem')
|
||||
print(response)
|
||||
|
||||
response = openai.ChatCompletion.create(model="gpt-3.5-turbo", messages = [
|
||||
{
|
||||
"role": "user",
|
||||
"content": "this is a test request, acknowledge that you got it"
|
||||
"content": "this is a test request, write a short poem"
|
||||
}
|
||||
])
|
||||
click.echo(f'LiteLLM: response from proxy {response}')
|
||||
|
@ -188,7 +154,7 @@ def run_server(host, port, api_base, api_version, model, alias, add_key, headers
|
|||
response = openai.ChatCompletion.create(model="gpt-3.5-turbo", messages = [
|
||||
{
|
||||
"role": "user",
|
||||
"content": "this is a test request, acknowledge that you got it"
|
||||
"content": "this is a test request, write a short poem"
|
||||
}
|
||||
],
|
||||
stream=True,
|
||||
|
@ -199,7 +165,7 @@ def run_server(host, port, api_base, api_version, model, alias, add_key, headers
|
|||
else:
|
||||
if headers:
|
||||
headers = json.loads(headers)
|
||||
initialize(model=model, alias=alias, api_base=api_base, api_version=api_version, debug=debug, temperature=temperature, max_tokens=max_tokens, request_timeout=request_timeout, max_budget=max_budget, telemetry=telemetry, drop_params=drop_params, add_function_to_prompt=add_function_to_prompt, headers=headers, save=save)
|
||||
save_worker_config(model=model, alias=alias, api_base=api_base, api_version=api_version, debug=debug, temperature=temperature, max_tokens=max_tokens, request_timeout=request_timeout, max_budget=max_budget, telemetry=telemetry, drop_params=drop_params, add_function_to_prompt=add_function_to_prompt, headers=headers, save=save, config=config)
|
||||
try:
|
||||
import uvicorn
|
||||
except:
|
||||
|
@ -210,7 +176,8 @@ def run_server(host, port, api_base, api_version, model, alias, add_key, headers
|
|||
|
||||
if port == 8000 and is_port_in_use(port):
|
||||
port = random.randint(1024, 49152)
|
||||
uvicorn.run(app, host=host, port=port)
|
||||
print(os.listdir(os.getcwd()))
|
||||
uvicorn.run("litellm:app", host=host, port=port, workers=num_workers)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
import sys, os, platform, time, copy
|
||||
import threading
|
||||
import shutil, random, traceback
|
||||
|
||||
import threading, ast
|
||||
import shutil, random, traceback, requests
|
||||
from typing import Optional
|
||||
messages: list = []
|
||||
sys.path.insert(
|
||||
0, os.path.abspath("../..")
|
||||
|
@ -14,6 +14,7 @@ try:
|
|||
import appdirs
|
||||
import tomli_w
|
||||
import backoff
|
||||
import yaml
|
||||
except ImportError:
|
||||
import subprocess
|
||||
import sys
|
||||
|
@ -30,6 +31,7 @@ except ImportError:
|
|||
"appdirs",
|
||||
"tomli-w",
|
||||
"backoff",
|
||||
"pyyaml"
|
||||
]
|
||||
)
|
||||
import uvicorn
|
||||
|
@ -38,11 +40,6 @@ except ImportError:
|
|||
import appdirs
|
||||
import tomli_w
|
||||
|
||||
try:
|
||||
from .llm import litellm_completion
|
||||
except ImportError as e:
|
||||
from llm import litellm_completion # type: ignore
|
||||
|
||||
import random
|
||||
|
||||
list_of_messages = [
|
||||
|
@ -90,6 +87,7 @@ print("\033[1;34mDocs: https://docs.litellm.ai/docs/proxy_server\033[0m")
|
|||
print()
|
||||
|
||||
import litellm
|
||||
litellm.suppress_debug_info = True
|
||||
from fastapi import FastAPI, Request
|
||||
from fastapi.routing import APIRouter
|
||||
from fastapi.encoders import jsonable_encoder
|
||||
|
@ -120,30 +118,27 @@ user_telemetry = True
|
|||
user_config = None
|
||||
user_headers = None
|
||||
local_logging = True # writes logs to a local api_log.json file for debugging
|
||||
model_router = litellm.Router()
|
||||
config_filename = "litellm.secrets.toml"
|
||||
config_dir = os.getcwd()
|
||||
config_dir = appdirs.user_config_dir("litellm")
|
||||
user_config_path = os.getenv(
|
||||
"LITELLM_CONFIG_PATH", os.path.join(config_dir, config_filename)
|
||||
)
|
||||
#### GLOBAL VARIABLES ####
|
||||
llm_router: Optional[litellm.Router] = None
|
||||
llm_model_list: Optional[list] = None
|
||||
server_settings: dict = {}
|
||||
log_file = "api_log.json"
|
||||
|
||||
worker_config = None
|
||||
|
||||
#### HELPER FUNCTIONS ####
|
||||
def print_verbose(print_statement):
|
||||
global user_debug
|
||||
print(f"user debug value: {user_debug}")
|
||||
if user_debug:
|
||||
print(print_statement)
|
||||
|
||||
|
||||
def find_avatar_url(role):
|
||||
role = role.replace(" ", "%20")
|
||||
avatar_filename = f"avatars/{role}.png"
|
||||
avatar_url = f"/static/{avatar_filename}"
|
||||
return avatar_url
|
||||
|
||||
|
||||
def usage_telemetry(
|
||||
feature: str,
|
||||
): # helps us know if people are using this feature. Set `litellm --telemetry False` to your cli call to turn this off
|
||||
|
@ -205,105 +200,147 @@ def save_params_to_config(data: dict):
|
|||
tomli_w.dump(config, f)
|
||||
|
||||
|
||||
def load_config():
|
||||
try:
|
||||
global user_config, user_api_base, user_max_tokens, user_temperature, user_model, local_logging
|
||||
# As the .env file is typically much simpler in structure, we use load_dotenv here directly
|
||||
with open(user_config_path, "rb") as f:
|
||||
user_config = tomllib.load(f)
|
||||
|
||||
## load keys
|
||||
if "keys" in user_config:
|
||||
for key in user_config["keys"]:
|
||||
os.environ[key] = user_config["keys"][
|
||||
key
|
||||
] # litellm can read keys from the environment
|
||||
## settings
|
||||
if "general" in user_config:
|
||||
litellm.add_function_to_prompt = user_config["general"].get(
|
||||
"add_function_to_prompt", True
|
||||
) # by default add function to prompt if unsupported by provider
|
||||
litellm.drop_params = user_config["general"].get(
|
||||
"drop_params", True
|
||||
) # by default drop params if unsupported by provider
|
||||
litellm.model_fallbacks = user_config["general"].get(
|
||||
"fallbacks", None
|
||||
) # fallback models in case initial completion call fails
|
||||
default_model = user_config["general"].get(
|
||||
"default_model", None
|
||||
) # route all requests to this model.
|
||||
|
||||
local_logging = user_config["general"].get("local_logging", True)
|
||||
|
||||
if user_model is None: # `litellm --model <model-name>`` > default_model.
|
||||
user_model = default_model
|
||||
|
||||
## load model config - to set this run `litellm --config`
|
||||
model_config = None
|
||||
if "model" in user_config:
|
||||
if user_model in user_config["model"]:
|
||||
model_config = user_config["model"][user_model]
|
||||
model_list = []
|
||||
for model in user_config["model"]:
|
||||
if "model_list" in user_config["model"][model]:
|
||||
model_list.extend(user_config["model"][model]["model_list"])
|
||||
if len(model_list) > 0:
|
||||
model_router.set_model_list(model_list=model_list)
|
||||
|
||||
print_verbose(f"user_config: {user_config}")
|
||||
print_verbose(f"model_config: {model_config}")
|
||||
print_verbose(f"user_model: {user_model}")
|
||||
if model_config is None:
|
||||
return
|
||||
|
||||
user_max_tokens = model_config.get("max_tokens", None)
|
||||
user_temperature = model_config.get("temperature", None)
|
||||
user_api_base = model_config.get("api_base", None)
|
||||
|
||||
## custom prompt template
|
||||
if "prompt_template" in model_config:
|
||||
model_prompt_template = model_config["prompt_template"]
|
||||
if (
|
||||
len(model_prompt_template.keys()) > 0
|
||||
): # if user has initialized this at all
|
||||
litellm.register_prompt_template(
|
||||
model=user_model,
|
||||
initial_prompt_value=model_prompt_template.get(
|
||||
"MODEL_PRE_PROMPT", ""
|
||||
),
|
||||
roles={
|
||||
"system": {
|
||||
"pre_message": model_prompt_template.get(
|
||||
"MODEL_SYSTEM_MESSAGE_START_TOKEN", ""
|
||||
),
|
||||
"post_message": model_prompt_template.get(
|
||||
"MODEL_SYSTEM_MESSAGE_END_TOKEN", ""
|
||||
),
|
||||
},
|
||||
"user": {
|
||||
"pre_message": model_prompt_template.get(
|
||||
"MODEL_USER_MESSAGE_START_TOKEN", ""
|
||||
),
|
||||
"post_message": model_prompt_template.get(
|
||||
"MODEL_USER_MESSAGE_END_TOKEN", ""
|
||||
),
|
||||
},
|
||||
"assistant": {
|
||||
"pre_message": model_prompt_template.get(
|
||||
"MODEL_ASSISTANT_MESSAGE_START_TOKEN", ""
|
||||
),
|
||||
"post_message": model_prompt_template.get(
|
||||
"MODEL_ASSISTANT_MESSAGE_END_TOKEN", ""
|
||||
),
|
||||
},
|
||||
},
|
||||
final_prompt_value=model_prompt_template.get(
|
||||
"MODEL_POST_PROMPT", ""
|
||||
),
|
||||
)
|
||||
def load_router_config(router: Optional[litellm.Router], config_file_path: str):
|
||||
config = {}
|
||||
server_settings = {}
|
||||
try:
|
||||
if os.path.exists(config_file_path):
|
||||
with open(config_file_path, 'r') as file:
|
||||
config = yaml.safe_load(file)
|
||||
else:
|
||||
pass
|
||||
except:
|
||||
pass
|
||||
|
||||
## SERVER SETTINGS (e.g. default completion model = 'ollama/mistral')
|
||||
_server_settings = config.get("server_settings", None)
|
||||
if _server_settings:
|
||||
server_settings = _server_settings
|
||||
|
||||
## LITELLM MODULE SETTINGS (e.g. litellm.drop_params=True,..)
|
||||
litellm_settings = config.get('litellm_settings', None)
|
||||
if litellm_settings:
|
||||
for key, value in litellm_settings.items():
|
||||
setattr(litellm, key, value)
|
||||
|
||||
## MODEL LIST
|
||||
model_list = config.get('model_list', None)
|
||||
if model_list:
|
||||
router = litellm.Router(model_list=model_list)
|
||||
|
||||
## ENVIRONMENT VARIABLES
|
||||
environment_variables = config.get('environment_variables', None)
|
||||
if environment_variables:
|
||||
for key, value in environment_variables.items():
|
||||
os.environ[key] = value
|
||||
|
||||
return router, model_list, server_settings
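# For illustration, a config carrying every section load_router_config reads above
# (all values are placeholders; written inline only to keep the sketch self-contained):
import yaml

sample_config = yaml.safe_load("""
server_settings:
  completion_model: ollama/mistral        # server-wide default model
litellm_settings:
  drop_params: true                       # becomes litellm.drop_params = True
model_list:
  - model_name: zephyr-beta
    litellm_params:
      model: huggingface/HuggingFaceH4/zephyr-7b-beta
      api_base: https://<my-hosted-endpoint>
environment_variables:
  HUGGINGFACE_API_KEY: hf-placeholder     # exported into os.environ
""")
print(sample_config["model_list"][0]["model_name"])  # zephyr-beta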
|
||||
|
||||
def load_config():
|
||||
#### DEPRECATED ####
|
||||
try:
|
||||
global user_config, user_api_base, user_max_tokens, user_temperature, user_model, local_logging, llm_model_list, llm_router, server_settings
|
||||
|
||||
# Get the file extension
|
||||
file_extension = os.path.splitext(user_config_path)[1]
|
||||
if file_extension.lower() == ".toml":
|
||||
# Parse the user config (TOML) and load keys/settings from it
|
||||
with open(user_config_path, "rb") as f:
|
||||
user_config = tomllib.load(f)
|
||||
|
||||
## load keys
|
||||
if "keys" in user_config:
|
||||
for key in user_config["keys"]:
|
||||
os.environ[key] = user_config["keys"][
|
||||
key
|
||||
] # litellm can read keys from the environment
|
||||
## settings
|
||||
if "general" in user_config:
|
||||
litellm.add_function_to_prompt = user_config["general"].get(
|
||||
"add_function_to_prompt", True
|
||||
) # by default add function to prompt if unsupported by provider
|
||||
litellm.drop_params = user_config["general"].get(
|
||||
"drop_params", True
|
||||
) # by default drop params if unsupported by provider
|
||||
litellm.model_fallbacks = user_config["general"].get(
|
||||
"fallbacks", None
|
||||
) # fallback models in case initial completion call fails
|
||||
default_model = user_config["general"].get(
|
||||
"default_model", None
|
||||
) # route all requests to this model.
|
||||
|
||||
local_logging = user_config["general"].get("local_logging", True)
|
||||
|
||||
if user_model is None: # `litellm --model <model-name>`` > default_model.
|
||||
user_model = default_model
|
||||
|
||||
## load model config - to set this run `litellm --config`
|
||||
model_config = None
|
||||
if "model" in user_config:
|
||||
if user_model in user_config["model"]:
|
||||
model_config = user_config["model"][user_model]
|
||||
model_list = []
|
||||
for model in user_config["model"]:
|
||||
if "model_list" in user_config["model"][model]:
|
||||
model_list.extend(user_config["model"][model]["model_list"])
|
||||
|
||||
print_verbose(f"user_config: {user_config}")
|
||||
print_verbose(f"model_config: {model_config}")
|
||||
print_verbose(f"user_model: {user_model}")
|
||||
if model_config is None:
|
||||
return
|
||||
|
||||
user_max_tokens = model_config.get("max_tokens", None)
|
||||
user_temperature = model_config.get("temperature", None)
|
||||
user_api_base = model_config.get("api_base", None)
|
||||
|
||||
## custom prompt template
|
||||
if "prompt_template" in model_config:
|
||||
model_prompt_template = model_config["prompt_template"]
|
||||
if (
|
||||
len(model_prompt_template.keys()) > 0
|
||||
): # if user has initialized this at all
|
||||
litellm.register_prompt_template(
|
||||
model=user_model,
|
||||
initial_prompt_value=model_prompt_template.get(
|
||||
"MODEL_PRE_PROMPT", ""
|
||||
),
|
||||
roles={
|
||||
"system": {
|
||||
"pre_message": model_prompt_template.get(
|
||||
"MODEL_SYSTEM_MESSAGE_START_TOKEN", ""
|
||||
),
|
||||
"post_message": model_prompt_template.get(
|
||||
"MODEL_SYSTEM_MESSAGE_END_TOKEN", ""
|
||||
),
|
||||
},
|
||||
"user": {
|
||||
"pre_message": model_prompt_template.get(
|
||||
"MODEL_USER_MESSAGE_START_TOKEN", ""
|
||||
),
|
||||
"post_message": model_prompt_template.get(
|
||||
"MODEL_USER_MESSAGE_END_TOKEN", ""
|
||||
),
|
||||
},
|
||||
"assistant": {
|
||||
"pre_message": model_prompt_template.get(
|
||||
"MODEL_ASSISTANT_MESSAGE_START_TOKEN", ""
|
||||
),
|
||||
"post_message": model_prompt_template.get(
|
||||
"MODEL_ASSISTANT_MESSAGE_END_TOKEN", ""
|
||||
),
|
||||
},
|
||||
},
|
||||
final_prompt_value=model_prompt_template.get(
|
||||
"MODEL_POST_PROMPT", ""
|
||||
),
|
||||
)
|
||||
except:
|
||||
pass
|
||||
|
||||
def save_worker_config(**data):
|
||||
import json
|
||||
os.environ["WORKER_CONFIG"] = json.dumps(data)
|
||||
|
||||
def initialize(
|
||||
model,
|
||||
|
@ -320,12 +357,14 @@ def initialize(
|
|||
add_function_to_prompt,
|
||||
headers,
|
||||
save,
|
||||
config
|
||||
):
|
||||
global user_model, user_api_base, user_debug, user_max_tokens, user_request_timeout, user_temperature, user_telemetry, user_headers
|
||||
global user_model, user_api_base, user_debug, user_max_tokens, user_request_timeout, user_temperature, user_telemetry, user_headers, llm_model_list, llm_router, server_settings
|
||||
user_model = model
|
||||
user_debug = debug
|
||||
load_config()
|
||||
dynamic_config = {"general": {}, user_model: {}}
|
||||
if config:
|
||||
llm_router, llm_model_list, server_settings = load_router_config(router=llm_router, config_file_path=config)
|
||||
if headers: # model-specific param
|
||||
user_headers = headers
|
||||
dynamic_config[user_model]["headers"] = headers
|
||||
|
@ -470,57 +509,139 @@ litellm.input_callback = [logger]
|
|||
litellm.success_callback = [logger]
|
||||
litellm.failure_callback = [logger]
|
||||
|
||||
# for streaming
|
||||
def data_generator(response):
|
||||
print_verbose("inside generator")
|
||||
for chunk in response:
|
||||
print_verbose(f"returned chunk: {chunk}")
|
||||
yield f"data: {json.dumps(chunk)}\n\n"
|
||||
|
||||
|
||||
def litellm_completion(*args, **kwargs):
|
||||
global user_temperature, user_request_timeout, user_max_tokens, user_api_base
|
||||
call_type = kwargs.pop("call_type")
|
||||
# override with user settings
|
||||
if user_temperature:
|
||||
kwargs["temperature"] = user_temperature
|
||||
if user_request_timeout:
|
||||
kwargs["request_timeout"] = user_request_timeout
|
||||
if user_max_tokens:
|
||||
kwargs["max_tokens"] = user_max_tokens
|
||||
if user_api_base:
|
||||
kwargs["api_base"] = user_api_base
|
||||
## CHECK CONFIG ##
|
||||
if llm_model_list and kwargs["model"] in [m["model_name"] for m in llm_model_list]:
|
||||
for m in llm_model_list:
|
||||
if kwargs["model"] == m["model_name"]:
|
||||
for key, value in m["litellm_params"].items():
|
||||
kwargs[key] = value
|
||||
break
|
||||
print(f"litellm set verbose pre-call: {litellm.set_verbose}")
|
||||
if call_type == "chat_completion":
|
||||
response = litellm.completion(*args, **kwargs)
|
||||
elif call_type == "text_completion":
|
||||
response = litellm.text_completion(*args, **kwargs)
|
||||
if 'stream' in kwargs and kwargs['stream'] == True: # use generate_responses to stream responses
|
||||
return StreamingResponse(data_generator(response), media_type='text/event-stream')
|
||||
return response
|
||||
|
||||
|
||||
@app.on_event("startup")
|
||||
def startup_event():
|
||||
import json
|
||||
worker_config = json.loads(os.getenv("WORKER_CONFIG"))
|
||||
initialize(**worker_config)
|
||||
print(f"\033[32mWorker Initialized\033[0m\n")
|
||||
|
||||
#### API ENDPOINTS ####
|
||||
@router.post("/v1/models")
|
||||
@router.get("/v1/models")
|
||||
@router.get("/models") # if project requires model list
|
||||
def model_list():
|
||||
if user_model != None:
|
||||
return dict(
|
||||
data=[
|
||||
{
|
||||
"id": user_model,
|
||||
"object": "model",
|
||||
"created": 1677610602,
|
||||
"owned_by": "openai",
|
||||
}
|
||||
],
|
||||
object="list",
|
||||
)
|
||||
else:
|
||||
global llm_model_list, server_settings
|
||||
all_models = []
|
||||
if server_settings.get("infer_model_from_keys", False):
|
||||
all_models = litellm.utils.get_valid_models()
|
||||
return dict(
|
||||
data=[
|
||||
{
|
||||
"id": model,
|
||||
"object": "model",
|
||||
"created": 1677610602,
|
||||
"owned_by": "openai",
|
||||
}
|
||||
for model in all_models
|
||||
],
|
||||
object="list",
|
||||
)
|
||||
|
||||
if llm_model_list:
|
||||
all_models += llm_model_list
|
||||
if user_model is not None:
|
||||
all_models += user_model
|
||||
### CHECK OLLAMA MODELS ###
|
||||
try:
|
||||
response = requests.get("http://0.0.0.0:11434/api/tags")
|
||||
models = response.json()["models"]
|
||||
ollama_models = [m["name"].replace(":latest", "") for m in models]
|
||||
all_models.extend(ollama_models)
|
||||
except Exception as e:
|
||||
traceback.print_exc()
|
||||
return dict(
|
||||
data=[
|
||||
{
|
||||
"id": model,
|
||||
"object": "model",
|
||||
"created": 1677610602,
|
||||
"owned_by": "openai",
|
||||
}
|
||||
for model in all_models
|
||||
],
|
||||
object="list",
|
||||
)
|
||||
|
||||
@router.post("/v1/completions")
|
||||
@router.post("/completions")
|
||||
async def completion(request: Request):
|
||||
data = await request.json()
|
||||
return litellm_completion(data=data, type="completion", user_model=user_model, user_temperature=user_temperature,
|
||||
user_max_tokens=user_max_tokens, user_api_base=user_api_base, user_headers=user_headers,
|
||||
user_debug=user_debug, model_router=model_router, user_request_timeout=user_request_timeout)
|
||||
|
||||
@router.post("/engines/{model:path}/completions")
|
||||
async def completion(request: Request, model: Optional[str] = None):
|
||||
try:
|
||||
body = await request.body()
|
||||
body_str = body.decode()
|
||||
try:
|
||||
data = ast.literal_eval(body_str)
|
||||
except:
|
||||
data = json.loads(body_str)
|
||||
data["model"] = (
|
||||
server_settings.get("completion_model", None) # server default
|
||||
or user_model # model name passed via cli args
|
||||
or model # for azure deployments
|
||||
or data["model"] # default passed in http request
|
||||
)
|
||||
if user_model:
|
||||
data["model"] = user_model
|
||||
data["call_type"] = "text_completion"
|
||||
return litellm_completion(
|
||||
**data
|
||||
)
|
||||
except Exception as e:
|
||||
error_traceback = traceback.format_exc()
|
||||
error_msg = f"{str(e)}\n\n{error_traceback}"
|
||||
return {"error": error_msg}
|
||||
|
||||
|
||||
@router.post("/v1/chat/completions")
|
||||
@router.post("/chat/completions")
|
||||
async def chat_completion(request: Request):
|
||||
data = await request.json()
|
||||
print_verbose(f"data passed in: {data}")
|
||||
return litellm_completion(data, type="chat_completion", user_model=user_model,
|
||||
user_temperature=user_temperature, user_max_tokens=user_max_tokens,
|
||||
user_api_base=user_api_base, user_headers=user_headers, user_debug=user_debug, model_router=model_router, user_request_timeout=user_request_timeout)
|
||||
|
||||
@router.post("/openai/deployments/{model:path}/chat/completions") # azure compatible endpoint
|
||||
async def chat_completion(request: Request, model: Optional[str] = None):
|
||||
global server_settings
|
||||
try:
|
||||
body = await request.body()
|
||||
body_str = body.decode()
|
||||
try:
|
||||
data = ast.literal_eval(body_str)
|
||||
except:
|
||||
data = json.loads(body_str)
|
||||
data["model"] = (
|
||||
server_settings.get("completion_model", None) # server default
|
||||
or user_model # model name passed via cli args
|
||||
or model # for azure deployments
|
||||
or data["model"] # default passed in http request
|
||||
)
|
||||
data["call_type"] = "chat_completion"
|
||||
return litellm_completion(
|
||||
**data
|
||||
)
|
||||
except Exception as e:
|
||||
print(f"\033[1;31mAn error occurred: {e}\n\n Debug this by setting `--debug`, e.g. `litellm --model gpt-3.5-turbo --debug`")
|
||||
error_traceback = traceback.format_exc()
|
||||
error_msg = f"{str(e)}\n\n{error_traceback}"
|
||||
return {"error": error_msg}
|
||||
|
||||
def print_cost_logs():
|
||||
with open("costs.json", "r") as f:
|
||||
|
|
|
@ -1,16 +1,17 @@
|
|||
from typing import Union, List, Dict, Optional
|
||||
from datetime import datetime
|
||||
from typing import Dict, List, Optional, Union
|
||||
|
||||
import litellm
|
||||
|
||||
|
||||
class Router:
|
||||
class Router:
|
||||
"""
|
||||
Example usage:
|
||||
from litellm import Router
|
||||
model_list = [{
|
||||
"model_name": "gpt-3.5-turbo", # openai model name
|
||||
"litellm_params": { # params for litellm completion/embedding call
|
||||
"model": "azure/<your-deployment-name>",
|
||||
"model_name": "gpt-3.5-turbo", # openai model name
|
||||
"litellm_params": { # params for litellm completion/embedding call
|
||||
"model": "azure/<your-deployment-name>",
|
||||
"api_key": <your-api-key>,
|
||||
"api_version": <your-api-version>,
|
||||
"api_base": <your-api-base>
|
||||
|
@ -23,16 +24,17 @@ class Router:
|
|||
"""
|
||||
model_names: List = []
|
||||
cache_responses: bool = False
|
||||
def __init__(self,
|
||||
model_list: Optional[list]=None,
|
||||
default_cache_time_seconds: int = 1 * 60 * 60 # 1 hour
|
||||
|
||||
def __init__(self,
|
||||
model_list: Optional[list] = None,
|
||||
redis_host: Optional[str] = None,
|
||||
redis_port: Optional[int] = None,
|
||||
redis_password: Optional[str] = None,
|
||||
redis_password: Optional[str] = None,
|
||||
cache_responses: bool = False) -> None:
|
||||
if model_list:
|
||||
self.model_list = model_list
|
||||
self.model_names = [m["model_name"] for m in model_list]
|
||||
if redis_host is not None and redis_port is not None and redis_password is not None:
|
||||
self.set_model_list(model_list)
|
||||
if redis_host is not None and redis_port is not None and redis_password is not None:
|
||||
cache_config = {
|
||||
'type': 'redis',
|
||||
'host': redis_host,
|
||||
|
@ -45,61 +47,55 @@ class Router:
|
|||
}
|
||||
self.cache = litellm.Cache(cache_config) # use Redis for tracking load balancing
|
||||
if cache_responses:
|
||||
litellm.cache = litellm.Cache(**cache_config) # use Redis for caching completion requests
|
||||
litellm.cache = litellm.Cache(**cache_config) # use Redis for caching completion requests
|
||||
self.cache_responses = cache_responses
|
||||
litellm.success_callback = [self.deployment_callback]
|
||||
|
||||
|
||||
def completion(self,
|
||||
model: str,
|
||||
messages: List[Dict[str, str]],
|
||||
is_retry: Optional[bool] = False,
|
||||
is_fallback: Optional[bool] = False,
|
||||
**kwargs):
|
||||
**kwargs):
|
||||
"""
|
||||
Example usage:
|
||||
Example usage:
|
||||
response = router.completion(model="gpt-3.5-turbo", messages=[{"role": "user", "content": "Hey, how's it going?"}]
|
||||
"""
|
||||
|
||||
# pick the one that is available (lowest TPM/RPM)
|
||||
deployment = self.get_available_deployment(model=model, messages=messages)
|
||||
data = deployment["litellm_params"]
|
||||
data["messages"] = messages
|
||||
data["caching"] = self.cache_responses
|
||||
# call via litellm.completion()
|
||||
return litellm.completion(**{**data, **kwargs})
|
||||
# call via litellm.completion()
|
||||
return litellm.completion(**{**data, "messages": messages, "caching": self.cache_responses, **kwargs})
|
||||
|
||||
async def acompletion(self,
|
||||
model: str,
|
||||
messages: List[Dict[str, str]],
|
||||
async def acompletion(self,
|
||||
model: str,
|
||||
messages: List[Dict[str, str]],
|
||||
is_retry: Optional[bool] = False,
|
||||
is_fallback: Optional[bool] = False,
|
||||
**kwargs):
|
||||
# pick the one that is available (lowest TPM/RPM)
|
||||
deployment = self.get_available_deployment(model=model, messages=messages)
|
||||
data = deployment["litellm_params"]
|
||||
data["messages"] = messages
|
||||
data["caching"] = self.cache_responses
|
||||
return await litellm.acompletion(**{**data, **kwargs})
|
||||
|
||||
def text_completion(self,
|
||||
model: str,
|
||||
prompt: str,
|
||||
return await litellm.acompletion(**{**data, "messages": messages, "caching": self.cache_responses, **kwargs})
|
||||
|
||||
def text_completion(self,
|
||||
model: str,
|
||||
prompt: str,
|
||||
is_retry: Optional[bool] = False,
|
||||
is_fallback: Optional[bool] = False,
|
||||
is_async: Optional[bool] = False,
|
||||
**kwargs):
|
||||
|
||||
|
||||
messages=[{"role": "user", "content": prompt}]
|
||||
# pick the one that is available (lowest TPM/RPM)
|
||||
deployment = self.get_available_deployment(model=model, messages=messages)
|
||||
|
||||
data = deployment["litellm_params"]
|
||||
data["prompt"] = prompt
|
||||
data["caching"] = self.cache_responses
|
||||
# call via litellm.completion()
|
||||
return litellm.text_completion(**{**data, **kwargs})
|
||||
# call via litellm.completion()
|
||||
return litellm.text_completion(**{**data, "prompt": prompt, "caching": self.cache_responses, **kwargs})
|
||||
|
||||
def embedding(self,
|
||||
def embedding(self,
|
||||
model: str,
|
||||
input: Union[str, List],
|
||||
is_async: Optional[bool] = False,
|
||||
|
@ -108,10 +104,8 @@ class Router:
|
|||
deployment = self.get_available_deployment(model=model, input=input)
|
||||
|
||||
data = deployment["litellm_params"]
|
||||
data["input"] = input
|
||||
data["caching"] = self.cache_responses
|
||||
# call via litellm.embedding()
|
||||
return litellm.embedding(**{**data, **kwargs})
|
||||
# call via litellm.embedding()
|
||||
return litellm.embedding(**{**data, "input": input, "caching": self.cache_responses, **kwargs})
|
||||
|
||||
async def aembedding(self,
|
||||
model: str,
|
||||
|
@ -122,14 +116,13 @@ class Router:
|
|||
deployment = self.get_available_deployment(model=model, input=input)
|
||||
|
||||
data = deployment["litellm_params"]
|
||||
data["input"] = input
|
||||
data["caching"] = self.cache_responses
|
||||
return await litellm.aembedding(**{**data, **kwargs})
|
||||
return await litellm.aembedding(**{**data, "input": input, "caching": self.cache_responses, **kwargs})
|
||||
|
||||
def set_model_list(self, model_list: list):
|
||||
self.model_list = model_list
|
||||
self.model_names = [m["model_name"] for m in model_list]
|
||||
|
||||
def get_model_names(self):
|
||||
def get_model_names(self):
|
||||
return self.model_names
|
||||
|
||||
def deployment_callback(
|
||||
|
@ -142,69 +135,63 @@ class Router:
|
|||
Callback that LiteLLM invokes after a successful
|
||||
completion. Its purpose is to update TPM/RPM usage per model
|
||||
"""
|
||||
model_name = kwargs.get('model', None) # i.e. azure/gpt35turbo
|
||||
model_name = kwargs.get('model', None) # i.e. gpt35turbo
|
||||
custom_llm_provider = kwargs.get("litellm_params", {}).get('custom_llm_provider', None) # i.e. azure
|
||||
if custom_llm_provider:
|
||||
model_name = f"{custom_llm_provider}/{model_name}"
|
||||
total_tokens = completion_response['usage']['total_tokens']
|
||||
self._set_deployment_usage(model_name, total_tokens)
|
||||
|
||||
def get_available_deployment(self,
|
||||
model: str,
|
||||
messages: Optional[List[Dict[str, str]]]=None,
|
||||
input: Optional[Union[str, List]]=None):
|
||||
def get_available_deployment(self,
|
||||
model: str,
|
||||
messages: Optional[List[Dict[str, str]]] = None,
|
||||
input: Optional[Union[str, List]] = None):
|
||||
"""
|
||||
Returns a deployment with the lowest TPM/RPM usage.
|
||||
"""
|
||||
# get list of potential deployments
|
||||
potential_deployments = []
|
||||
for item in self.model_list:
|
||||
if item["model_name"] == model:
|
||||
# get list of potential deployments
|
||||
potential_deployments = []
|
||||
for item in self.model_list:
|
||||
if item["model_name"] == model:
|
||||
potential_deployments.append(item)
|
||||
|
||||
# set first model as current model
|
||||
deployment = potential_deployments[0]
|
||||
|
||||
# set first model as current model to calculate token count
|
||||
deployment = potential_deployments[0]
|
||||
|
||||
# get model tpm, rpm limits
|
||||
tpm = deployment["tpm"]
|
||||
rpm = deployment["rpm"]
|
||||
|
||||
# get deployment current usage
|
||||
current_tpm, current_rpm = self._get_deployment_usage(deployment_name=deployment["litellm_params"]["model"])
|
||||
|
||||
# get encoding
|
||||
if messages:
|
||||
# get encoding
|
||||
token_count = 0
|
||||
if messages is not None:
|
||||
token_count = litellm.token_counter(model=deployment["model_name"], messages=messages)
|
||||
elif input:
|
||||
elif input is not None:
|
||||
if isinstance(input, List):
|
||||
input_text = "".join(text for text in input)
|
||||
else:
|
||||
input_text = input
|
||||
token_count = litellm.token_counter(model=deployment["model_name"], text=input_text)
|
||||
|
||||
# if at model limit, return lowest used
|
||||
if current_tpm + token_count > tpm or current_rpm + 1 >= rpm:
|
||||
# -----------------------
|
||||
# Find lowest used model
|
||||
# ----------------------
|
||||
lowest_tpm = float('inf')
|
||||
deployment = None
|
||||
|
||||
# Go through all the models to get tpm, rpm
|
||||
for item in potential_deployments:
|
||||
item_tpm, item_rpm = self._get_deployment_usage(deployment_name=item["litellm_params"]["model"])
|
||||
# -----------------------
|
||||
# Find lowest used model
|
||||
# ----------------------
|
||||
lowest_tpm = float("inf")
|
||||
deployment = None
|
||||
|
||||
if item_tpm == 0:
|
||||
return item
|
||||
elif item_tpm + token_count > item["tpm"] or item_rpm + 1 >= item["rpm"]:
|
||||
continue
|
||||
elif item_tpm < lowest_tpm:
|
||||
lowest_tpm = item_tpm
|
||||
deployment = item
|
||||
|
||||
# if none, raise exception
|
||||
if deployment is None:
|
||||
raise ValueError(f"No models available.")
|
||||
# Go through all the models to get tpm, rpm
|
||||
for item in potential_deployments:
|
||||
item_tpm, item_rpm = self._get_deployment_usage(deployment_name=item["litellm_params"]["model"])
|
||||
|
||||
# return model
|
||||
if item_tpm == 0:
|
||||
return item
|
||||
elif item_tpm + token_count > item["tpm"] or item_rpm + 1 >= item["rpm"]:
|
||||
continue
|
||||
elif item_tpm < lowest_tpm:
|
||||
lowest_tpm = item_tpm
|
||||
deployment = item
|
||||
|
||||
# if none, raise exception
|
||||
if deployment is None:
|
||||
raise ValueError("No models available.")
|
||||
|
||||
# return model
|
||||
return deployment
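# A hedged example of the model_list shape this routing logic expects: each deployment
# carries tpm/rpm budgets alongside its litellm_params, and get_available_deployment
# picks the least-loaded one (all values below are placeholders).
import litellm

model_list = [
    {
        "model_name": "gpt-3.5-turbo",               # alias callers use
        "litellm_params": {                           # forwarded to litellm.completion()
            "model": "azure/<your-deployment-name>",
            "api_key": "<your-api-key>",
            "api_base": "<your-api-base>",
            "api_version": "<your-api-version>",
        },
        "tpm": 240000,  # tokens-per-minute budget checked above
        "rpm": 1800,    # requests-per-minute budget checked above
    },
]
router = litellm.Router(model_list=model_list)
# router.completion(model="gpt-3.5-turbo", messages=[...]) would then route to the
# deployment with the lowest current TPM/RPM usage.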
|
||||
|
||||
def _get_deployment_usage(
|
||||
|
@@ -221,27 +208,22 @@ class Router:
# ------------
# Return usage
# ------------
tpm = self.cache.get_cache(tpm_key)
rpm = self.cache.get_cache(rpm_key)

if tpm is None:
tpm = 0
if rpm is None:
rpm = 0
tpm = self.cache.get_cache(cache_key=tpm_key) or 0
rpm = self.cache.get_cache(cache_key=rpm_key) or 0

return int(tpm), int(rpm)

def increment(self, key: str, increment_value: int):
# get value
cached_value = self.cache.get_cache(key)
# update value

def increment(self, key: str, increment_value: int):
# get value
cached_value = self.cache.get_cache(cache_key=key)
# update value
try:
cached_value = cached_value + increment_value
except:
except:
cached_value = increment_value
# save updated value
self.cache.add_cache(result=cached_value, cache_key=key)

self.cache.add_cache(result=cached_value, cache_key=key, ttl=self.default_cache_time_seconds)

def _set_deployment_usage(
self,
model_name: str,
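# Rough sketch of the usage bookkeeping pattern above, using a plain dict with
# expiry timestamps in place of Router.cache. Keys expire after `ttl` seconds so
# per-minute TPM/RPM counters reset on their own; this is an illustration, not
# the actual litellm cache implementation, and the key name below is arbitrary.
import time

class TTLCounter:
    def __init__(self, ttl: int = 60):
        self.ttl = ttl
        self.store = {}  # key -> (value, expiry_timestamp)

    def get(self, key: str) -> int:
        value, expires_at = self.store.get(key, (0, 0))
        return value if time.time() < expires_at else 0

    def increment(self, key: str, increment_value: int) -> None:
        # read current value (0 if missing/expired), add, save with a fresh TTL
        new_value = self.get(key) + increment_value
        self.store[key] = (new_value, time.time() + self.ttl)

counter = TTLCounter(ttl=60)
counter.increment("azure/chatgpt-v-2:tpm", 41)
print(counter.get("azure/chatgpt-v-2:tpm"))  # -> 41 inside the TTL window, 0 after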
@@ -37,6 +37,7 @@ def test_function_call_non_openai_model():
response = litellm.completion(model=model, messages=messages, functions=functions)
pytest.fail(f'An error occurred')
except Exception as e:
print(e)
pass

test_function_call_non_openai_model()
@@ -1,53 +1,53 @@
|
|||
#### What this tests ####
# This tests the ability to set API keys via the params instead of as environment variables
# #### What this tests ####
# # This tests the ability to set API keys via the params instead of as environment variables
|
||||
|
||||
import sys, os
|
||||
import traceback
|
||||
# import sys, os
|
||||
# import traceback
|
||||
|
||||
sys.path.insert(
|
||||
0, os.path.abspath("../..")
|
||||
) # Adds the parent directory to the system path
|
||||
import litellm
|
||||
from litellm import embedding, completion
|
||||
# sys.path.insert(
|
||||
# 0, os.path.abspath("../..")
|
||||
# ) # Adds the parent directory to the system path
|
||||
# import litellm
|
||||
# from litellm import embedding, completion
|
||||
|
||||
litellm.set_verbose = False
|
||||
# litellm.set_verbose = False
|
||||
|
||||
|
||||
def logger_fn(model_call_object: dict):
|
||||
print(f"model call details: {model_call_object}")
|
||||
# def logger_fn(model_call_object: dict):
|
||||
# print(f"model call details: {model_call_object}")
|
||||
|
||||
|
||||
user_message = "Hello, how are you?"
|
||||
messages = [{"content": user_message, "role": "user"}]
|
||||
# user_message = "Hello, how are you?"
|
||||
# messages = [{"content": user_message, "role": "user"}]
|
||||
|
||||
## Test 1: Setting key dynamically
|
||||
temp_key = os.environ.get("ANTHROPIC_API_KEY", "")
|
||||
os.environ["ANTHROPIC_API_KEY"] = "bad-key"
|
||||
# test on openai completion call
|
||||
try:
|
||||
response = completion(
|
||||
model="claude-instant-1",
|
||||
messages=messages,
|
||||
logger_fn=logger_fn,
|
||||
api_key=temp_key,
|
||||
)
|
||||
print(f"response: {response}")
|
||||
except:
|
||||
print(f"error occurred: {traceback.format_exc()}")
|
||||
pass
|
||||
os.environ["ANTHROPIC_API_KEY"] = temp_key
|
||||
# ## Test 1: Setting key dynamically
|
||||
# temp_key = os.environ.get("ANTHROPIC_API_KEY", "")
|
||||
# os.environ["ANTHROPIC_API_KEY"] = "bad-key"
|
||||
# # test on openai completion call
|
||||
# try:
|
||||
# response = completion(
|
||||
# model="claude-instant-1",
|
||||
# messages=messages,
|
||||
# logger_fn=logger_fn,
|
||||
# api_key=temp_key,
|
||||
# )
|
||||
# print(f"response: {response}")
|
||||
# except:
|
||||
# print(f"error occurred: {traceback.format_exc()}")
|
||||
# pass
|
||||
# os.environ["ANTHROPIC_API_KEY"] = temp_key
|
||||
|
||||
|
||||
## Test 2: Setting key via __init__ params
|
||||
litellm.anthropic_key = os.environ.get("ANTHROPIC_API_KEY", "")
|
||||
os.environ.pop("ANTHROPIC_API_KEY")
|
||||
# test on openai completion call
|
||||
try:
|
||||
response = completion(
|
||||
model="claude-instant-1", messages=messages, logger_fn=logger_fn
|
||||
)
|
||||
print(f"response: {response}")
|
||||
except:
|
||||
print(f"error occurred: {traceback.format_exc()}")
|
||||
pass
|
||||
os.environ["ANTHROPIC_API_KEY"] = temp_key
|
||||
# ## Test 2: Setting key via __init__ params
|
||||
# litellm.anthropic_key = os.environ.get("ANTHROPIC_API_KEY", "")
|
||||
# os.environ.pop("ANTHROPIC_API_KEY")
|
||||
# # test on openai completion call
|
||||
# try:
|
||||
# response = completion(
|
||||
# model="claude-instant-1", messages=messages, logger_fn=logger_fn
|
||||
# )
|
||||
# print(f"response: {response}")
|
||||
# except:
|
||||
# print(f"error occurred: {traceback.format_exc()}")
|
||||
# pass
|
||||
# os.environ["ANTHROPIC_API_KEY"] = temp_key
|
||||
|
|
|
@@ -9,17 +9,29 @@ import asyncio
sys.path.insert(
0, os.path.abspath("../..")
) # Adds the parent directory to the system path
from litellm import acompletion, acreate
import litellm
from litellm import completion, acompletion, acreate

def test_sync_response():
litellm.set_verbose = True
user_message = "Hello, how are you?"
messages = [{"content": user_message, "role": "user"}]
try:
response = completion(model="gpt-3.5-turbo", messages=messages, api_key=os.environ["OPENAI_API_KEY"])
except Exception as e:
pytest.fail(f"An exception occurred: {e}")


def test_async_response():
import asyncio
async def test_get_response():
litellm.set_verbose = True
user_message = "Hello, how are you?"
messages = [{"content": user_message, "role": "user"}]
try:
response = await acompletion(model="gpt-3.5-turbo", messages=messages)
except Exception as e:
pass
pytest.fail(f"An exception occurred: {e}")

response = asyncio.run(test_get_response())
# print(response)
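# Hedged usage sketch for the async path exercised above: fire several
# acompletion() calls concurrently with asyncio.gather. Assumes OPENAI_API_KEY
# is set in the environment; the model name is just an example.
import asyncio
from litellm import acompletion

async def ask(prompt: str):
    return await acompletion(
        model="gpt-3.5-turbo",
        messages=[{"role": "user", "content": prompt}],
    )

async def main():
    prompts = ["Hello, how are you?", "Name three prime numbers."]
    responses = await asyncio.gather(*(ask(p) for p in prompts))
    for r in responses:
        print(r["choices"][0]["message"]["content"])

# asyncio.run(main())  # uncomment to run against a live key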
@ -51,7 +63,7 @@ def test_get_response_streaming():
|
|||
assert len(output) > 0, "Length of output needs to be greater than 0."
|
||||
|
||||
except Exception as e:
|
||||
pass
|
||||
pytest.fail(f"An exception occurred: {e}")
|
||||
return response
|
||||
asyncio.run(test_async_call())
|
||||
|
||||
|
|
|
@@ -14,18 +14,20 @@ from litellm import batch_completion, batch_completion_models, completion, batch
|
|||
|
||||
def test_batch_completions():
|
||||
messages = [[{"role": "user", "content": "write a short poem"}] for _ in range(3)]
|
||||
model = "gpt-3.5-turbo"
|
||||
model = "j2-mid"
|
||||
try:
|
||||
result = batch_completion(
|
||||
model=model,
|
||||
messages=messages,
|
||||
max_tokens=10,
|
||||
temperature=0.2
|
||||
temperature=0.2,
|
||||
request_timeout=1
|
||||
)
|
||||
print(result)
|
||||
print(len(result))
|
||||
assert(len(result)==3)
|
||||
except Timeout as e:
|
||||
print(f"IN TIMEOUT")
|
||||
pass
|
||||
except Exception as e:
|
||||
pytest.fail(f"An error occurred: {e}")
|
||||
|
@ -38,18 +40,25 @@ def test_batch_completions_models():
|
|||
messages=[{"role": "user", "content": "Hey, how's it going"}]
|
||||
)
|
||||
print(result)
|
||||
except Timeout as e:
|
||||
pass
|
||||
except Exception as e:
|
||||
pytest.fail(f"An error occurred: {e}")
|
||||
# test_batch_completions_models()
|
||||
|
||||
def test_batch_completion_models_all_responses():
|
||||
responses = batch_completion_models_all_responses(
|
||||
models=["j2-light", "claude-instant-1.2", "command-nightly"],
|
||||
messages=[{"role": "user", "content": "write a poem"}],
|
||||
max_tokens=500
|
||||
)
|
||||
print(responses)
|
||||
assert(len(responses) == 3)
|
||||
try:
|
||||
responses = batch_completion_models_all_responses(
|
||||
models=["j2-light", "claude-instant-1.2", "command-nightly"],
|
||||
messages=[{"role": "user", "content": "write a poem"}],
|
||||
max_tokens=500
|
||||
)
|
||||
print(responses)
|
||||
assert(len(responses) == 3)
|
||||
except Timeout as e:
|
||||
pass
|
||||
except Exception as e:
|
||||
pytest.fail(f"An error occurred: {e}")
|
||||
# test_batch_completion_models_all_responses()
|
||||
|
||||
# def test_batch_completions():
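# Usage sketch for the batch helpers tested above: batch_completion sends one
# request per inner message list against a single model, while
# batch_completion_models_all_responses sends the same messages to several
# models and returns every response. Model names are illustrative and the
# calls need the corresponding provider keys.
from litellm import batch_completion, batch_completion_models_all_responses

poems = batch_completion(
    model="gpt-3.5-turbo",
    messages=[[{"role": "user", "content": "write a short poem"}] for _ in range(3)],
    max_tokens=10,
    temperature=0.2,
)
print(len(poems))  # one response per inner message list -> 3

side_by_side = batch_completion_models_all_responses(
    models=["j2-light", "claude-instant-1.2", "command-nightly"],
    messages=[{"role": "user", "content": "write a poem"}],
    max_tokens=100,
)
print(len(side_by_side))  # one response per model -> 3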
|
||||
|
|
|
@@ -1,4 +1,5 @@
|
|||
import sys, os
|
||||
import time
|
||||
import traceback
|
||||
from dotenv import load_dotenv
|
||||
|
||||
|
@ -12,7 +13,7 @@ import pytest
|
|||
import litellm
|
||||
from litellm import embedding, completion
|
||||
from litellm.caching import Cache
|
||||
litellm.set_verbose=True
|
||||
# litellm.set_verbose=True
|
||||
|
||||
messages = [{"role": "user", "content": "who is ishaan Github? "}]
|
||||
# comment
|
||||
|
@ -36,7 +37,7 @@ def test_gpt_cache():
|
|||
cache_key = last_content_without_prompt_val + data["model"]
|
||||
print("cache_key", cache_key)
|
||||
return cache_key
|
||||
|
||||
|
||||
|
||||
cache.init(pre_func=pre_cache_func)
|
||||
cache.set_openai_key()
|
||||
|
@ -46,12 +47,12 @@ def test_gpt_cache():
|
|||
response2 = completion(model="gpt-3.5-turbo", messages=messages)
|
||||
response3 = completion(model="command-nightly", messages=messages)
|
||||
|
||||
if response1["choices"] != response2["choices"]: # same models should cache
|
||||
if response1["choices"] != response2["choices"]: # same models should cache
|
||||
print(f"response1: {response1}")
|
||||
print(f"response2: {response2}")
|
||||
pytest.fail(f"Error occurred:")
|
||||
|
||||
if response3["choices"] == response2["choices"]: # different models, don't cache
|
||||
if response3["choices"] == response2["choices"]: # different models, don't cache
|
||||
# if models are different, it should not return cached response
|
||||
print(f"response2: {response2}")
|
||||
print(f"response3: {response3}")
|
||||
|
@ -124,9 +125,9 @@ def test_embedding_caching():
|
|||
embedding2 = embedding(model="text-embedding-ada-002", input=text_to_embed, caching=True)
|
||||
end_time = time.time()
|
||||
print(f"Embedding 2 response time: {end_time - start_time} seconds")
|
||||
|
||||
|
||||
litellm.cache = None
|
||||
assert end_time - start_time <= 0.1 # ensure 2nd response comes in in under 0.1 s
|
||||
assert end_time - start_time <= 0.1 # ensure 2nd response comes in in under 0.1 s
|
||||
if embedding2['data'][0]['embedding'] != embedding1['data'][0]['embedding']:
|
||||
print(f"embedding1: {embedding1}")
|
||||
print(f"embedding2: {embedding2}")
|
||||
|
@ -178,14 +179,14 @@ def test_embedding_caching_azure():
|
|||
)
|
||||
end_time = time.time()
|
||||
print(f"Embedding 2 response time: {end_time - start_time} seconds")
|
||||
|
||||
|
||||
litellm.cache = None
|
||||
assert end_time - start_time <= 0.1 # ensure 2nd response comes in in under 0.1 s
|
||||
assert end_time - start_time <= 0.1 # ensure 2nd response comes in in under 0.1 s
|
||||
if embedding2['data'][0]['embedding'] != embedding1['data'][0]['embedding']:
|
||||
print(f"embedding1: {embedding1}")
|
||||
print(f"embedding2: {embedding2}")
|
||||
pytest.fail("Error occurred: Embedding caching failed")
|
||||
|
||||
|
||||
os.environ['AZURE_API_VERSION'] = api_version
|
||||
os.environ['AZURE_API_BASE'] = api_base
|
||||
os.environ['AZURE_API_KEY'] = api_key
|
||||
|
@ -270,30 +271,13 @@ def test_embedding_caching_azure():
|
|||
|
||||
|
||||
def test_redis_cache_completion():
|
||||
litellm.set_verbose = True
|
||||
messages = [{"role": "user", "content": "who is ishaan CTO of litellm from litellm 2023"}]
|
||||
litellm.cache = Cache(type="redis", host=os.environ['REDIS_HOST'], port=os.environ['REDIS_PORT'], password=os.environ['REDIS_PASSWORD'])
|
||||
print("test2 for caching")
|
||||
|
||||
# patch this redis test
|
||||
local_cache = {}
|
||||
|
||||
def set_cache(key, value):
|
||||
local_cache[key] = value
|
||||
|
||||
def get_cache(key):
|
||||
if key in local_cache:
|
||||
return local_cache[key]
|
||||
|
||||
litellm.cache.cache.set_cache = set_cache
|
||||
litellm.cache.cache.get_cache = get_cache
|
||||
|
||||
|
||||
response1 = completion(model="gpt-3.5-turbo", messages=messages, caching=True)
|
||||
response2 = completion(model="gpt-3.5-turbo", messages=messages, caching=True)
|
||||
response3 = completion(model="command-nightly", messages=messages, caching=True)
|
||||
print(f"response1: {response1}")
|
||||
print(f"response2: {response2}")
|
||||
print(f"response3: {response3}")
|
||||
litellm.cache = None
|
||||
if response3['choices'][0]['message']['content'] == response2['choices'][0]['message']['content']:
|
||||
# if models are different, it should not return cached response
|
||||
|
@ -322,29 +306,29 @@ def test_custom_redis_cache_with_key():
|
|||
|
||||
def set_cache(key, value):
|
||||
local_cache[key] = value
|
||||
|
||||
|
||||
def get_cache(key):
|
||||
if key in local_cache:
|
||||
return local_cache[key]
|
||||
|
||||
|
||||
litellm.cache.cache.set_cache = set_cache
|
||||
litellm.cache.cache.get_cache = get_cache
|
||||
|
||||
# patch this redis cache get and set call
|
||||
|
||||
response1 = completion(model="gpt-3.5-turbo", messages=messages, temperature=1, caching=True)
|
||||
response2 = completion(model="gpt-3.5-turbo", messages=messages, temperature=1, caching=True)
|
||||
response3 = completion(model="gpt-3.5-turbo", messages=messages, temperature=1, caching=False)
|
||||
|
||||
response1 = completion(model="gpt-3.5-turbo", messages=messages, temperature=1, caching=True, num_retries=3)
|
||||
response2 = completion(model="gpt-3.5-turbo", messages=messages, temperature=1, caching=True, num_retries=3)
|
||||
response3 = completion(model="gpt-3.5-turbo", messages=messages, temperature=1, caching=False, num_retries=3)
|
||||
|
||||
print(f"response1: {response1}")
|
||||
print(f"response2: {response2}")
|
||||
print(f"response3: {response3}")
|
||||
|
||||
if response3['choices'][0]['message']['content'] == response2['choices'][0]['message']['content']:
|
||||
pytest.fail(f"Error occurred:")
|
||||
pytest.fail(f"Error occurred:")
|
||||
litellm.cache = None
|
||||
|
||||
test_custom_redis_cache_with_key()
|
||||
# test_custom_redis_cache_with_key()
|
||||
|
||||
def test_hosted_cache():
|
||||
litellm.cache = Cache(type="hosted") # use api.litellm.ai for caching
|
||||
|
@ -364,3 +348,99 @@ def test_hosted_cache():
|
|||
|
||||
# test_hosted_cache()
|
||||
|
||||
|
||||
def test_redis_cache_with_ttl():
|
||||
cache = Cache(type="redis", host=os.environ['REDIS_HOST'], port=os.environ['REDIS_PORT'], password=os.environ['REDIS_PASSWORD'])
|
||||
sample_model_response_object_str = """{
|
||||
"choices": [
|
||||
{
|
||||
"finish_reason": "stop",
|
||||
"index": 0,
|
||||
"message": {
|
||||
"role": "assistant",
|
||||
"content": "I'm doing well, thank you for asking. I am Claude, an AI assistant created by Anthropic."
|
||||
}
|
||||
}
|
||||
],
|
||||
"created": 1691429984.3852863,
|
||||
"model": "claude-instant-1",
|
||||
"usage": {
|
||||
"prompt_tokens": 18,
|
||||
"completion_tokens": 23,
|
||||
"total_tokens": 41
|
||||
}
|
||||
}"""
|
||||
sample_model_response_object = {
|
||||
"choices": [
|
||||
{
|
||||
"finish_reason": "stop",
|
||||
"index": 0,
|
||||
"message": {
|
||||
"role": "assistant",
|
||||
"content": "I'm doing well, thank you for asking. I am Claude, an AI assistant created by Anthropic."
|
||||
}
|
||||
}
|
||||
],
|
||||
"created": 1691429984.3852863,
|
||||
"model": "claude-instant-1",
|
||||
"usage": {
|
||||
"prompt_tokens": 18,
|
||||
"completion_tokens": 23,
|
||||
"total_tokens": 41
|
||||
}
|
||||
}
|
||||
cache.add_cache(cache_key="test_key", result=sample_model_response_object_str, ttl=1)
|
||||
cached_value = cache.get_cache(cache_key="test_key")
|
||||
print(f"cached-value: {cached_value}")
|
||||
assert cached_value['choices'][0]['message']['content'] == sample_model_response_object['choices'][0]['message']['content']
|
||||
time.sleep(2)
|
||||
assert cache.get_cache(cache_key="test_key") is None
|
||||
|
||||
# test_redis_cache_with_ttl()
|
||||
|
||||
def test_in_memory_cache_with_ttl():
|
||||
cache = Cache(type="local")
|
||||
sample_model_response_object_str = """{
|
||||
"choices": [
|
||||
{
|
||||
"finish_reason": "stop",
|
||||
"index": 0,
|
||||
"message": {
|
||||
"role": "assistant",
|
||||
"content": "I'm doing well, thank you for asking. I am Claude, an AI assistant created by Anthropic."
|
||||
}
|
||||
}
|
||||
],
|
||||
"created": 1691429984.3852863,
|
||||
"model": "claude-instant-1",
|
||||
"usage": {
|
||||
"prompt_tokens": 18,
|
||||
"completion_tokens": 23,
|
||||
"total_tokens": 41
|
||||
}
|
||||
}"""
|
||||
sample_model_response_object = {
|
||||
"choices": [
|
||||
{
|
||||
"finish_reason": "stop",
|
||||
"index": 0,
|
||||
"message": {
|
||||
"role": "assistant",
|
||||
"content": "I'm doing well, thank you for asking. I am Claude, an AI assistant created by Anthropic."
|
||||
}
|
||||
}
|
||||
],
|
||||
"created": 1691429984.3852863,
|
||||
"model": "claude-instant-1",
|
||||
"usage": {
|
||||
"prompt_tokens": 18,
|
||||
"completion_tokens": 23,
|
||||
"total_tokens": 41
|
||||
}
|
||||
}
|
||||
cache.add_cache(cache_key="test_key", result=sample_model_response_object_str, ttl=1)
|
||||
cached_value = cache.get_cache(cache_key="test_key")
|
||||
assert cached_value['choices'][0]['message']['content'] == sample_model_response_object['choices'][0]['message']['content']
|
||||
time.sleep(2)
|
||||
assert cache.get_cache(cache_key="test_key") is None
|
||||
# test_in_memory_cache_with_ttl()
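# Small usage sketch of the TTL-backed caching exercised above: an entry added
# with ttl=1 disappears roughly a second later. Cache(type="local") is the
# in-memory backend; the redis variant takes host/port/password as shown above.
import time
from litellm.caching import Cache

cache = Cache(type="local")
cache.add_cache(cache_key="greeting", result='{"text": "hello"}', ttl=1)
print(cache.get_cache(cache_key="greeting"))  # entry is returned within the TTL
time.sleep(2)
print(cache.get_cache(cache_key="greeting"))  # None once the TTL has expired

# To cache whole completion() calls, the tests set the module-level cache and
# pass caching=True:
#   litellm.cache = cache
#   litellm.completion(model="gpt-3.5-turbo", messages=messages, caching=True)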
|
|
@@ -9,9 +9,11 @@ sys.path.insert(
|
|||
0, os.path.abspath("../..")
|
||||
) # Adds the parent directory to the system path
|
||||
import pytest
|
||||
from openai.error import Timeout
|
||||
import litellm
|
||||
from litellm import embedding, completion, text_completion, completion_cost
|
||||
from litellm import embedding, completion, completion_cost
|
||||
from litellm import RateLimitError
|
||||
litellm.num_retries = 3
|
||||
|
||||
user_message = "Write a short poem about the sky"
|
||||
messages = [{"content": user_message, "role": "user"}]
|
||||
|
@ -38,7 +40,7 @@ def test_completion_custom_provider_model_name():
|
|||
|
||||
|
||||
def test_completion_claude():
|
||||
litellm.set_verbose = True
|
||||
litellm.set_verbose = False
|
||||
litellm.AnthropicConfig(max_tokens_to_sample=200, metadata={"user_id": "1224"})
|
||||
try:
|
||||
# test without max tokens
|
||||
|
@ -48,6 +50,11 @@ def test_completion_claude():
|
|||
# Add any assertions here to check the response
|
||||
print(response)
|
||||
print(response.response_ms)
|
||||
print(response.usage)
|
||||
print(response.usage.completion_tokens)
|
||||
print(response["usage"]["completion_tokens"])
|
||||
# print("new cost tracking")
|
||||
print(response.cost())
|
||||
except Exception as e:
|
||||
pytest.fail(f"Error occurred: {e}")
|
||||
|
||||
|
@ -96,17 +103,12 @@ def test_completion_with_litellm_call_id():
|
|||
print(response)
|
||||
if 'litellm_call_id' in response:
|
||||
pytest.fail(f"Error occurred: litellm_call_id in response objects")
|
||||
print(response.usage)
|
||||
print(response.usage.completion_tokens)
|
||||
|
||||
litellm.use_client = True
|
||||
response2 = completion(
|
||||
model="gpt-3.5-turbo", messages=messages)
|
||||
|
||||
if 'litellm_call_id' not in response2:
|
||||
pytest.fail(f"Error occurred: litellm_call_id not in response object when use_client = True")
|
||||
# Add any assertions here to check the response
|
||||
print(response2)
|
||||
except Exception as e:
|
||||
pytest.fail(f"Error occurred: {e}")
|
||||
# test_completion_with_litellm_call_id()
|
||||
|
||||
def test_completion_perplexity_api():
|
||||
try:
|
||||
|
@ -220,13 +222,12 @@ def test_get_hf_task_for_model():
|
|||
# # TGI model
|
||||
# # this is a TGI model https://huggingface.co/glaiveai/glaive-coder-7b
|
||||
# def hf_test_completion_tgi():
|
||||
# litellm.huggingface_config(return_full_text=True)
|
||||
# litellm.set_verbose=True
|
||||
# try:
|
||||
# response = litellm.completion(
|
||||
# model="huggingface/mistralai/Mistral-7B-Instruct-v0.1",
|
||||
# messages=[{ "content": "Hello, how are you?","role": "user"}],
|
||||
# api_base="https://n9ox93a8sv5ihsow.us-east-1.aws.endpoints.huggingface.cloud",
|
||||
# api_base="https://3kk3h56912qga4-80.proxy.runpod.net",
|
||||
# )
|
||||
# # Add any assertions here to check the response
|
||||
# print(response)
|
||||
|
@ -387,33 +388,13 @@ def test_completion_openai():
|
|||
pytest.fail(f"Error occurred: {e}")
|
||||
# test_completion_openai()
|
||||
|
||||
|
||||
def test_completion_openai_prompt():
|
||||
try:
|
||||
response = text_completion(
|
||||
model="gpt-3.5-turbo", prompt="What's the weather in SF?"
|
||||
)
|
||||
response_str = response["choices"][0]["text"]
|
||||
except Exception as e:
|
||||
pytest.fail(f"Error occurred: {e}")
|
||||
|
||||
|
||||
def test_completion_text_openai():
|
||||
try:
|
||||
# litellm.set_verbose=True
|
||||
response = completion(model="text-davinci-003", messages=messages)
|
||||
# Add any assertions here to check the response
|
||||
print(response)
|
||||
except Exception as e:
|
||||
pytest.fail(f"Error occurred: {e}")
|
||||
|
||||
def test_completion_gpt_instruct():
|
||||
try:
|
||||
response = completion(model="gpt-3.5-turbo-instruct", messages=messages)
|
||||
print(response)
|
||||
except Exception as e:
|
||||
pytest.fail(f"Error occurred: {e}")
|
||||
# test_completion_gpt_instruct()
|
||||
# test_completion_text_openai()
|
||||
|
||||
def test_completion_openai_with_optional_params():
|
||||
try:
|
||||
|
@ -426,10 +407,11 @@ def test_completion_openai_with_optional_params():
|
|||
)
|
||||
# Add any assertions here to check the response
|
||||
print(response)
|
||||
except Timeout as e:
|
||||
pass
|
||||
except Exception as e:
|
||||
pytest.fail(f"Error occurred: {e}")
|
||||
|
||||
|
||||
def test_completion_openai_litellm_key():
|
||||
try:
|
||||
litellm.api_key = os.environ['OPENAI_API_KEY']
|
||||
|
@ -648,6 +630,38 @@ def test_completion_azure2():
|
|||
|
||||
# test_completion_azure2()
|
||||
|
||||
def test_completion_azure3():
|
||||
# test if we can pass api_base, api_version and api_key in completion()
|
||||
try:
|
||||
print("azure gpt-3.5 test\n\n")
|
||||
litellm.set_verbose=True
|
||||
litellm.api_base = os.environ["AZURE_API_BASE"]
|
||||
litellm.api_key = os.environ["AZURE_API_KEY"]
|
||||
litellm.api_version = os.environ["AZURE_API_VERSION"]
|
||||
|
||||
os.environ["AZURE_API_BASE"] = ""
|
||||
os.environ["AZURE_API_VERSION"] = ""
|
||||
os.environ["AZURE_API_KEY"] = ""
|
||||
|
||||
|
||||
## Test azure call
|
||||
response = completion(
|
||||
model="azure/chatgpt-v-2",
|
||||
messages=messages,
|
||||
max_tokens=10,
|
||||
)
|
||||
|
||||
# Add any assertions here to check the response
|
||||
print(response)
|
||||
|
||||
os.environ["AZURE_API_BASE"] = litellm.api_base
|
||||
os.environ["AZURE_API_VERSION"] = litellm.api_version
|
||||
os.environ["AZURE_API_KEY"] = litellm.api_key
|
||||
|
||||
except Exception as e:
|
||||
pytest.fail(f"Error occurred: {e}")
|
||||
# test_completion_azure3()
|
||||
|
||||
# new azure test for using litellm. vars,
|
||||
# use the following vars in this test and make an azure_api_call
|
||||
# litellm.api_type = self.azure_api_type
|
||||
|
@ -787,106 +801,124 @@ def test_completion_together_ai():
|
|||
pytest.fail(f"Error occurred: {e}")
|
||||
|
||||
# test_completion_together_ai()
|
||||
# def test_customprompt_together_ai():
|
||||
# try:
|
||||
# litellm.register_prompt_template(
|
||||
# model="OpenAssistant/llama2-70b-oasst-sft-v10",
|
||||
# roles={"system":"<|im_start|>system", "assistant":"<|im_start|>assistant", "user":"<|im_start|>user"}, # tell LiteLLM how you want to map the openai messages to this model
|
||||
# pre_message_sep= "\n",
|
||||
# post_message_sep= "\n"
|
||||
# )
|
||||
# response = completion(model="together_ai/OpenAssistant/llama2-70b-oasst-sft-v10", messages=messages)
|
||||
# print(response)
|
||||
# except Exception as e:
|
||||
# pytest.fail(f"Error occurred: {e}")
|
||||
def test_customprompt_together_ai():
|
||||
try:
|
||||
litellm.set_verbose = True
|
||||
response = completion(model="together_ai/OpenAssistant/llama2-70b-oasst-sft-v10", messages=messages,
|
||||
roles={"system":{"pre_message":"<|im_start|>system\n", "post_message":"<|im_end|>"}, "assistant":{"pre_message":"<|im_start|>assistant\n","post_message":"<|im_end|>"}, "user":{"pre_message":"<|im_start|>user\n","post_message":"<|im_end|>"}})
|
||||
print(response)
|
||||
except Exception as e:
|
||||
pytest.fail(f"Error occurred: {e}")
|
||||
|
||||
# def test_completion_sagemaker():
|
||||
# try:
|
||||
# response = completion(
|
||||
# model="sagemaker/jumpstart-dft-meta-textgeneration-llama-2-7b",
|
||||
# messages=messages,
|
||||
# temperature=0.2,
|
||||
# max_tokens=80,
|
||||
# logger_fn=logger_fn
|
||||
# )
|
||||
# # Add any assertions here to check the response
|
||||
# print(response)
|
||||
# except Exception as e:
|
||||
# pytest.fail(f"Error occurred: {e}")
|
||||
# test_customprompt_together_ai()
|
||||
|
||||
def test_completion_sagemaker():
|
||||
try:
|
||||
response = completion(
|
||||
model="sagemaker/jumpstart-dft-meta-textgeneration-llama-2-7b",
|
||||
messages=messages,
|
||||
temperature=0.2,
|
||||
max_tokens=80,
|
||||
logger_fn=logger_fn
|
||||
)
|
||||
# Add any assertions here to check the response
|
||||
print(response)
|
||||
except Exception as e:
|
||||
pytest.fail(f"Error occurred: {e}")
|
||||
|
||||
# test_completion_sagemaker()
|
||||
|
||||
# def test_completion_bedrock_titan():
|
||||
# try:
|
||||
# response = completion(
|
||||
# model="bedrock/amazon.titan-tg1-large",
|
||||
# messages=messages,
|
||||
# temperature=0.2,
|
||||
# max_tokens=200,
|
||||
# top_p=0.8,
|
||||
# logger_fn=logger_fn
|
||||
# )
|
||||
# # Add any assertions here to check the response
|
||||
# print(response)
|
||||
# except RateLimitError:
|
||||
# pass
|
||||
# except Exception as e:
|
||||
# pytest.fail(f"Error occurred: {e}")
|
||||
def test_completion_bedrock_titan():
|
||||
try:
|
||||
response = completion(
|
||||
model="bedrock/amazon.titan-tg1-large",
|
||||
messages=messages,
|
||||
temperature=0.2,
|
||||
max_tokens=200,
|
||||
top_p=0.8,
|
||||
logger_fn=logger_fn
|
||||
)
|
||||
# Add any assertions here to check the response
|
||||
print(response)
|
||||
except RateLimitError:
|
||||
pass
|
||||
except Exception as e:
|
||||
pytest.fail(f"Error occurred: {e}")
|
||||
# test_completion_bedrock_titan()
|
||||
|
||||
# def test_completion_bedrock_claude():
|
||||
# print("calling claude")
|
||||
# try:
|
||||
# response = completion(
|
||||
# model="anthropic.claude-instant-v1",
|
||||
# messages=messages,
|
||||
# max_tokens=10,
|
||||
# temperature=0.1,
|
||||
# logger_fn=logger_fn
|
||||
# )
|
||||
# # Add any assertions here to check the response
|
||||
# print(response)
|
||||
# except RateLimitError:
|
||||
# pass
|
||||
# except Exception as e:
|
||||
# pytest.fail(f"Error occurred: {e}")
|
||||
def test_completion_bedrock_claude():
|
||||
print("calling claude")
|
||||
try:
|
||||
response = completion(
|
||||
model="anthropic.claude-instant-v1",
|
||||
messages=messages,
|
||||
max_tokens=10,
|
||||
temperature=0.1,
|
||||
logger_fn=logger_fn
|
||||
)
|
||||
# Add any assertions here to check the response
|
||||
print(response)
|
||||
except RateLimitError:
|
||||
pass
|
||||
except Exception as e:
|
||||
pytest.fail(f"Error occurred: {e}")
|
||||
# test_completion_bedrock_claude()
|
||||
|
||||
|
||||
# def test_completion_bedrock_claude_completion_auth():
|
||||
# print("calling bedrock claude completion params auth")
|
||||
# import os
|
||||
|
||||
# aws_access_key_id = os.environ["AWS_ACCESS_KEY_ID"]
|
||||
# aws_secret_access_key = os.environ["AWS_SECRET_ACCESS_KEY"]
|
||||
# aws_region_name = os.environ["AWS_REGION_NAME"]
|
||||
|
||||
# os.environ["AWS_ACCESS_KEY_ID"] = ""
|
||||
# os.environ["AWS_SECRET_ACCESS_KEY"] = ""
|
||||
# os.environ["AWS_REGION_NAME"] = ""
|
||||
def test_completion_bedrock_cohere():
|
||||
print("calling bedrock cohere")
|
||||
try:
|
||||
response = completion(
|
||||
model="bedrock/cohere.command-text-v14",
|
||||
messages=[{"role": "user", "content": "hi"}],
|
||||
temperature=0.1,
|
||||
max_tokens=10,
|
||||
stream=True
|
||||
)
|
||||
# Add any assertions here to check the response
|
||||
print(response)
|
||||
for chunk in response:
|
||||
print(chunk)
|
||||
except RateLimitError:
|
||||
pass
|
||||
except Exception as e:
|
||||
pytest.fail(f"Error occurred: {e}")
|
||||
# test_completion_bedrock_cohere()
|
||||
|
||||
|
||||
# try:
|
||||
# response = completion(
|
||||
# model="bedrock/anthropic.claude-instant-v1",
|
||||
# messages=messages,
|
||||
# max_tokens=10,
|
||||
# temperature=0.1,
|
||||
# logger_fn=logger_fn,
|
||||
# aws_access_key_id=aws_access_key_id,
|
||||
# aws_secret_access_key=aws_secret_access_key,
|
||||
# aws_region_name=aws_region_name,
|
||||
# )
|
||||
# # Add any assertions here to check the response
|
||||
# print(response)
|
||||
def test_completion_bedrock_claude_completion_auth():
|
||||
print("calling bedrock claude completion params auth")
|
||||
import os
|
||||
|
||||
# os.environ["AWS_ACCESS_KEY_ID"] = aws_access_key_id
|
||||
# os.environ["AWS_SECRET_ACCESS_KEY"] = aws_secret_access_key
|
||||
# os.environ["AWS_REGION_NAME"] = aws_region_name
|
||||
# except RateLimitError:
|
||||
# pass
|
||||
# except Exception as e:
|
||||
# pytest.fail(f"Error occurred: {e}")
|
||||
aws_access_key_id = os.environ["AWS_ACCESS_KEY_ID"]
|
||||
aws_secret_access_key = os.environ["AWS_SECRET_ACCESS_KEY"]
|
||||
aws_region_name = os.environ["AWS_REGION_NAME"]
|
||||
|
||||
os.environ["AWS_ACCESS_KEY_ID"] = ""
|
||||
os.environ["AWS_SECRET_ACCESS_KEY"] = ""
|
||||
os.environ["AWS_REGION_NAME"] = ""
|
||||
|
||||
|
||||
try:
|
||||
response = completion(
|
||||
model="bedrock/anthropic.claude-instant-v1",
|
||||
messages=messages,
|
||||
max_tokens=10,
|
||||
temperature=0.1,
|
||||
logger_fn=logger_fn,
|
||||
aws_access_key_id=aws_access_key_id,
|
||||
aws_secret_access_key=aws_secret_access_key,
|
||||
aws_region_name=aws_region_name,
|
||||
)
|
||||
# Add any assertions here to check the response
|
||||
print(response)
|
||||
|
||||
os.environ["AWS_ACCESS_KEY_ID"] = aws_access_key_id
|
||||
os.environ["AWS_SECRET_ACCESS_KEY"] = aws_secret_access_key
|
||||
os.environ["AWS_REGION_NAME"] = aws_region_name
|
||||
except RateLimitError:
|
||||
pass
|
||||
except Exception as e:
|
||||
pytest.fail(f"Error occurred: {e}")
|
||||
# test_completion_bedrock_claude_completion_auth()
|
||||
|
||||
# def test_completion_bedrock_claude_external_client_auth():
|
||||
|
@ -1026,27 +1058,34 @@ def test_completion_together_ai():
|
|||
# test_completion_custom_api_base()
|
||||
|
||||
# def test_vertex_ai():
|
||||
# # test_models = litellm.vertex_chat_models + litellm.vertex_code_chat_models + litellm.vertex_text_models + litellm.vertex_code_text_models
|
||||
# test_models = ["chat-bison"]
|
||||
# test_models = ["codechat-bison"] + litellm.vertex_chat_models + litellm.vertex_code_chat_models + litellm.vertex_text_models + litellm.vertex_code_text_models
|
||||
# # test_models = ["chat-bison"]
|
||||
# for model in test_models:
|
||||
# try:
|
||||
# if model in ["code-gecko@001", "code-gecko@latest"]:
|
||||
# # our account does not have access to this model
|
||||
# continue
|
||||
# print("making request", model)
|
||||
# response = completion(model="vertex_ai/codechat-bison-32k", messages=[{'role': 'user', 'content': 'hi'}])
|
||||
# response = completion(model=model, messages=[{'role': 'user', 'content': 'hi'}])
|
||||
# print(response)
|
||||
|
||||
# print(response.usage.completion_tokens)
|
||||
# print(response['usage']['completion_tokens'])
|
||||
# assert type(response.choices[0].message.content) == str
|
||||
# except Exception as e:
|
||||
# pytest.fail(f"Error occurred: {e}")
|
||||
# test_vertex_ai()
|
||||
|
||||
# def test_vertex_ai_stream():
|
||||
# litellm.vertex_project = "hardy-device-386718"
|
||||
# litellm.vertex_location = "us-central1"
|
||||
# litellm.set_verbose=False
|
||||
# test_models = litellm.vertex_chat_models + litellm.vertex_code_chat_models + litellm.vertex_text_models + litellm.vertex_code_text_models
|
||||
# for model in test_models:
|
||||
# try:
|
||||
# if model in ["code-gecko@001", "code-gecko@latest"]:
|
||||
# # our account does not have access to this model
|
||||
# continue
|
||||
# print("making request", model)
|
||||
# response = completion(model=model, messages=[{"role": "user", "content": "write code for saying hi"}], stream=True)
|
||||
# print(response)
|
||||
# response = completion(model=model, messages=[{"role": "user", "content": "write 100 line code code for saying hi"}], stream=True)
|
||||
# for chunk in response:
|
||||
# print(chunk)
|
||||
# # pass
|
||||
|
@ -1110,7 +1149,19 @@ def test_completion_anyscale_2():
|
|||
print(response)
|
||||
except Exception as e:
|
||||
pytest.fail(f"Error occurred: {e}")
|
||||
test_completion_anyscale_2()
|
||||
|
||||
def test_mistral_anyscale_stream():
|
||||
litellm.set_verbose=False
|
||||
response = completion(
|
||||
model = 'anyscale/mistralai/Mistral-7B-Instruct-v0.1',
|
||||
messages = [{ "content": "hello, good morning","role": "user"}],
|
||||
stream=True,
|
||||
)
|
||||
for chunk in response:
|
||||
# print(chunk)
|
||||
print(chunk["choices"][0]["delta"].get("content", ""), end="")
|
||||
# test_mistral_anyscale_stream()
|
||||
# test_completion_anyscale_2()
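# Sketch of collecting a streamed response into one string, following the chunk
# shape used in the streaming test above: each chunk carries choices[0]["delta"]
# with an optional "content" field. The model name is illustrative; any
# stream=True completion call works the same way.
from litellm import completion

def stream_to_text(model, messages):
    response = completion(model=model, messages=messages, stream=True)
    pieces = []
    for chunk in response:
        pieces.append(chunk["choices"][0]["delta"].get("content", "") or "")
    return "".join(pieces)

# text = stream_to_text("gpt-3.5-turbo", [{"role": "user", "content": "hello"}])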
|
||||
# def test_completion_with_fallbacks_multiple_keys():
|
||||
# print(f"backup key 1: {os.getenv('BACKUP_OPENAI_API_KEY_1')}")
|
||||
# print(f"backup key 2: {os.getenv('BACKUP_OPENAI_API_KEY_2')}")
|
||||
|
@ -1247,7 +1298,7 @@ def test_completion_palm():
|
|||
# litellm.set_verbose = True
|
||||
model_name = "palm/chat-bison"
|
||||
try:
|
||||
response = completion(model=model_name, messages=messages)
|
||||
response = completion(model=model_name, messages=messages, stop=["stop"])
|
||||
# Add any assertions here to check the response
|
||||
print(response)
|
||||
print(response.response_ms)
|
||||
|
@ -1255,6 +1306,25 @@ def test_completion_palm():
|
|||
pytest.fail(f"Error occurred: {e}")
|
||||
# test_completion_palm()
|
||||
|
||||
# test palm with streaming
|
||||
def test_completion_palm_stream():
|
||||
# litellm.set_verbose = True
|
||||
model_name = "palm/chat-bison"
|
||||
try:
|
||||
response = completion(
|
||||
model=model_name,
|
||||
messages=messages,
|
||||
stop=["stop"],
|
||||
stream=True,
|
||||
max_tokens=20
|
||||
)
|
||||
# Add any assertions here to check the response
|
||||
for chunk in response:
|
||||
print(chunk)
|
||||
except Exception as e:
|
||||
pytest.fail(f"Error occurred: {e}")
|
||||
# test_completion_palm_stream()
|
||||
|
||||
# test_completion_deep_infra()
|
||||
# test_completion_ai21()
|
||||
# test config file with completion #
|
||||
|
@ -1270,6 +1340,14 @@ def test_completion_palm():
|
|||
# pytest.fail(f"Error occurred: {e}")
|
||||
|
||||
|
||||
# def test_maritalk():
|
||||
# messages = [{"role": "user", "content": "Hey"}]
|
||||
# try:
|
||||
# response = completion("maritalk", messages=messages)
|
||||
# print(f"response: {response}")
|
||||
# except Exception as e:
|
||||
# pytest.fail(f"Error occurred: {e}")
|
||||
# test_maritalk()
|
||||
|
||||
def test_completion_together_ai_stream():
|
||||
user_message = "Write 1pg about YC & litellm"
|
||||
|
|
|
@@ -1,86 +1,53 @@
|
|||
# import sys, os
|
||||
# import traceback
|
||||
# from dotenv import load_dotenv
|
||||
import sys, os
|
||||
import traceback
|
||||
from dotenv import load_dotenv
|
||||
|
||||
# load_dotenv()
|
||||
# import os
|
||||
load_dotenv()
|
||||
import os
|
||||
|
||||
# sys.path.insert(
|
||||
# 0, os.path.abspath("../..")
|
||||
# ) # Adds the parent directory to the system path
|
||||
# import pytest
|
||||
# import litellm
|
||||
# from litellm import completion_with_retries
|
||||
# from litellm import (
|
||||
# AuthenticationError,
|
||||
# InvalidRequestError,
|
||||
# RateLimitError,
|
||||
# ServiceUnavailableError,
|
||||
# OpenAIError,
|
||||
# )
|
||||
sys.path.insert(
|
||||
0, os.path.abspath("../..")
|
||||
) # Adds the parent directory to the system path
|
||||
import pytest
|
||||
import openai
|
||||
import litellm
|
||||
from litellm import completion_with_retries, completion
|
||||
from litellm import (
|
||||
AuthenticationError,
|
||||
InvalidRequestError,
|
||||
RateLimitError,
|
||||
ServiceUnavailableError,
|
||||
OpenAIError,
|
||||
)
|
||||
|
||||
# user_message = "Hello, whats the weather in San Francisco??"
|
||||
# messages = [{"content": user_message, "role": "user"}]
|
||||
user_message = "Hello, whats the weather in San Francisco??"
|
||||
messages = [{"content": user_message, "role": "user"}]
|
||||
|
||||
|
||||
# def logger_fn(user_model_dict):
|
||||
# # print(f"user_model_dict: {user_model_dict}")
|
||||
# pass
|
||||
def logger_fn(user_model_dict):
|
||||
# print(f"user_model_dict: {user_model_dict}")
|
||||
pass
|
||||
|
||||
# # normal call
|
||||
# def test_completion_custom_provider_model_name():
|
||||
# try:
|
||||
# response = completion_with_retries(
|
||||
# model="together_ai/togethercomputer/llama-2-70b-chat",
|
||||
# messages=messages,
|
||||
# logger_fn=logger_fn,
|
||||
# )
|
||||
# # Add any assertions here to check the response
|
||||
# print(response)
|
||||
# except Exception as e:
|
||||
# pytest.fail(f"Error occurred: {e}")
|
||||
# normal call
|
||||
def test_completion_custom_provider_model_name():
|
||||
try:
|
||||
response = completion_with_retries(
|
||||
model="together_ai/togethercomputer/llama-2-70b-chat",
|
||||
messages=messages,
|
||||
logger_fn=logger_fn,
|
||||
)
|
||||
# Add any assertions here to check the response
|
||||
print(response)
|
||||
except Exception as e:
|
||||
pytest.fail(f"Error occurred: {e}")
|
||||
|
||||
# completion with num retries + impact on exception mapping
|
||||
def test_completion_with_num_retries():
|
||||
try:
|
||||
response = completion(model="j2-ultra", messages=[{"messages": "vibe", "bad": "message"}], num_retries=2)
|
||||
except openai.APIError as e:
|
||||
pass
|
||||
except Exception as e:
|
||||
pytest.fail(f"Unmapped exception occurred")
|
||||
|
||||
# # bad call
|
||||
# # def test_completion_custom_provider_model_name():
|
||||
# # try:
|
||||
# # response = completion_with_retries(
|
||||
# # model="bad-model",
|
||||
# # messages=messages,
|
||||
# # logger_fn=logger_fn,
|
||||
# # )
|
||||
# # # Add any assertions here to check the response
|
||||
# # print(response)
|
||||
# # except Exception as e:
|
||||
# # pytest.fail(f"Error occurred: {e}")
|
||||
|
||||
# # impact on exception mapping
|
||||
# def test_context_window():
|
||||
# sample_text = "how does a court case get to the Supreme Court?" * 5000
|
||||
# messages = [{"content": sample_text, "role": "user"}]
|
||||
# try:
|
||||
# model = "chatgpt-test"
|
||||
# response = completion_with_retries(
|
||||
# model=model,
|
||||
# messages=messages,
|
||||
# custom_llm_provider="azure",
|
||||
# logger_fn=logger_fn,
|
||||
# )
|
||||
# print(f"response: {response}")
|
||||
# except InvalidRequestError as e:
|
||||
# print(f"InvalidRequestError: {e.llm_provider}")
|
||||
# return
|
||||
# except OpenAIError as e:
|
||||
# print(f"OpenAIError: {e.llm_provider}")
|
||||
# return
|
||||
# except Exception as e:
|
||||
# print("Uncaught Error in test_context_window")
|
||||
# print(f"Error Type: {type(e).__name__}")
|
||||
# print(f"Uncaught Exception - {e}")
|
||||
# pytest.fail(f"Error occurred: {e}")
|
||||
# return
|
||||
|
||||
|
||||
# test_context_window()
|
||||
|
||||
# test_completion_custom_provider_model_name()
|
||||
# test_completion_with_num_retries()
|
|
@@ -6,81 +6,44 @@ sys.path.insert(0, os.path.abspath('../..'))
|
|||
|
||||
from litellm import completion, embedding
|
||||
import litellm
|
||||
from litellm.integrations.custom_logger import CustomLogger
|
||||
|
||||
def custom_callback(
|
||||
kwargs,
|
||||
completion_response,
|
||||
start_time,
|
||||
end_time,
|
||||
):
|
||||
print(
|
||||
"in custom callback func"
|
||||
)
|
||||
print("kwargs", kwargs)
|
||||
print(completion_response)
|
||||
print(start_time)
|
||||
print(end_time)
|
||||
def send_slack_alert(
|
||||
kwargs,
|
||||
completion_response,
|
||||
start_time,
|
||||
end_time,
|
||||
):
|
||||
print(
|
||||
"in custom slack callback func"
|
||||
)
|
||||
import requests
|
||||
import json
|
||||
class MyCustomHandler(CustomLogger):
|
||||
def log_pre_api_call(self, model, messages, kwargs):
|
||||
print(f"Pre-API Call")
|
||||
|
||||
def log_post_api_call(self, kwargs, response_obj, start_time, end_time):
|
||||
print(f"Post-API Call")
|
||||
|
||||
def log_stream_event(self, kwargs, response_obj, start_time, end_time):
|
||||
print(f"On Stream")
|
||||
|
||||
def log_success_event(self, kwargs, response_obj, start_time, end_time):
|
||||
print(f"On Success")
|
||||
|
||||
# Define the Slack webhook URL
|
||||
slack_webhook_url = os.environ['SLACK_WEBHOOK_URL'] # "https://hooks.slack.com/services/<>/<>/<>"
|
||||
|
||||
# Define the text payload, send data available in litellm custom_callbacks
|
||||
text_payload = f"""LiteLLM Logging: kwargs: {str(kwargs)}\n\n, response: {str(completion_response)}\n\n, start time{str(start_time)} end time: {str(end_time)}
|
||||
"""
|
||||
payload = {
|
||||
"text": text_payload
|
||||
}
|
||||
|
||||
# Set the headers
|
||||
headers = {
|
||||
"Content-type": "application/json"
|
||||
}
|
||||
|
||||
# Make the POST request
|
||||
response = requests.post(slack_webhook_url, json=payload, headers=headers)
|
||||
|
||||
# Check the response status
|
||||
if response.status_code == 200:
|
||||
print("Message sent successfully to Slack!")
|
||||
else:
|
||||
print(f"Failed to send message to Slack. Status code: {response.status_code}")
|
||||
print(response.json())
|
||||
|
||||
def get_transformed_inputs(
|
||||
kwargs,
|
||||
):
|
||||
params_to_model = kwargs["additional_args"]["complete_input_dict"]
|
||||
print("params to model", params_to_model)
|
||||
|
||||
litellm.success_callback = [custom_callback, send_slack_alert]
|
||||
litellm.failure_callback = [send_slack_alert]
|
||||
|
||||
|
||||
litellm.set_verbose = True
|
||||
|
||||
litellm.input_callback = [get_transformed_inputs]
|
||||
def log_failure_event(self, kwargs, response_obj, start_time, end_time):
|
||||
print(f"On Failure")
|
||||
|
||||
customHandler = MyCustomHandler()
|
||||
|
||||
def test_chat_openai():
|
||||
try:
|
||||
response = completion(model="gpt-2",
|
||||
litellm.callbacks = [customHandler]
|
||||
response = completion(model="gpt-3.5-turbo",
|
||||
messages=[{
|
||||
"role": "user",
|
||||
"content": "Hi 👋 - i'm openai"
|
||||
}],
|
||||
stream=True)
|
||||
for chunk in response:
|
||||
# print(chunk)
|
||||
continue
|
||||
response = completion(model="gpt-3.5-turbo",
|
||||
messages=[{
|
||||
"role": "user",
|
||||
"content": "Hi 👋 - i'm openai"
|
||||
}])
|
||||
|
||||
print(response)
|
||||
# print(response)
|
||||
|
||||
except Exception as e:
|
||||
print(e)
|
||||
|
@ -88,3 +51,77 @@ def test_chat_openai():
|
|||
|
||||
|
||||
test_chat_openai()
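# A slightly fuller sketch of the CustomLogger hooks used above: a handler that
# tallies token usage on success and counts failures. The usage-attribute access
# assumes the OpenAI-style usage block seen in the other tests; adapt as needed.
import litellm
from litellm.integrations.custom_logger import CustomLogger

class UsageTracker(CustomLogger):
    def __init__(self):
        self.total_tokens = 0
        self.failures = 0

    def log_success_event(self, kwargs, response_obj, start_time, end_time):
        usage = getattr(response_obj, "usage", None)
        if usage is not None:
            self.total_tokens += getattr(usage, "total_tokens", 0) or 0
        print(f"On Success - total tokens so far: {self.total_tokens}")

    def log_failure_event(self, kwargs, response_obj, start_time, end_time):
        self.failures += 1
        print(f"On Failure - failure count: {self.failures}")

litellm.callbacks = [UsageTracker()]  # registered the same way as customHandler above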
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
# def custom_callback(
|
||||
# kwargs,
|
||||
# completion_response,
|
||||
# start_time,
|
||||
# end_time,
|
||||
# ):
|
||||
# print(
|
||||
# "in custom callback func"
|
||||
# )
|
||||
# print("kwargs", kwargs)
|
||||
# print(completion_response)
|
||||
# print(start_time)
|
||||
# print(end_time)
|
||||
# if "complete_streaming_response" in kwargs:
|
||||
# print("\n\n complete response\n\n")
|
||||
# complete_streaming_response = kwargs["complete_streaming_response"]
|
||||
# print(kwargs["complete_streaming_response"])
|
||||
# usage = complete_streaming_response["usage"]
|
||||
# print("usage", usage)
|
||||
# def send_slack_alert(
|
||||
# kwargs,
|
||||
# completion_response,
|
||||
# start_time,
|
||||
# end_time,
|
||||
# ):
|
||||
# print(
|
||||
# "in custom slack callback func"
|
||||
# )
|
||||
# import requests
|
||||
# import json
|
||||
|
||||
# # Define the Slack webhook URL
|
||||
# slack_webhook_url = os.environ['SLACK_WEBHOOK_URL'] # "https://hooks.slack.com/services/<>/<>/<>"
|
||||
|
||||
# # Define the text payload, send data available in litellm custom_callbacks
|
||||
# text_payload = f"""LiteLLM Logging: kwargs: {str(kwargs)}\n\n, response: {str(completion_response)}\n\n, start time{str(start_time)} end time: {str(end_time)}
|
||||
# """
|
||||
# payload = {
|
||||
# "text": text_payload
|
||||
# }
|
||||
|
||||
# # Set the headers
|
||||
# headers = {
|
||||
# "Content-type": "application/json"
|
||||
# }
|
||||
|
||||
# # Make the POST request
|
||||
# response = requests.post(slack_webhook_url, json=payload, headers=headers)
|
||||
|
||||
# # Check the response status
|
||||
# if response.status_code == 200:
|
||||
# print("Message sent successfully to Slack!")
|
||||
# else:
|
||||
# print(f"Failed to send message to Slack. Status code: {response.status_code}")
|
||||
# print(response.json())
|
||||
|
||||
# def get_transformed_inputs(
|
||||
# kwargs,
|
||||
# ):
|
||||
# params_to_model = kwargs["additional_args"]["complete_input_dict"]
|
||||
# print("params to model", params_to_model)
|
||||
|
||||
# litellm.success_callback = [custom_callback, send_slack_alert]
|
||||
# litellm.failure_callback = [send_slack_alert]
|
||||
|
||||
|
||||
# litellm.set_verbose = False
|
||||
|
||||
# # litellm.input_callback = [get_transformed_inputs]
|
||||
|
|
|
@@ -69,6 +69,7 @@ def test_openai_azure_embedding():
|
|||
|
||||
def test_cohere_embedding():
|
||||
try:
|
||||
# litellm.set_verbose=True
|
||||
response = embedding(
|
||||
model="embed-english-v2.0", input=["good morning from litellm", "this is another item"]
|
||||
)
|
||||
|
@ -78,17 +79,40 @@ def test_cohere_embedding():
|
|||
|
||||
# test_cohere_embedding()
|
||||
|
||||
def test_cohere_embedding3():
|
||||
try:
|
||||
litellm.set_verbose=True
|
||||
response = embedding(
|
||||
model="embed-english-v3.0",
|
||||
input=["good morning from litellm", "this is another item"],
|
||||
)
|
||||
print(f"response:", response)
|
||||
except Exception as e:
|
||||
pytest.fail(f"Error occurred: {e}")
|
||||
|
||||
# test_cohere_embedding3()
|
||||
|
||||
def test_bedrock_embedding():
|
||||
try:
|
||||
response = embedding(
|
||||
model="amazon.titan-embed-text-v1", input=["good morning from litellm, attempting to embed data"]
|
||||
)
|
||||
print(f"response:", response)
|
||||
except Exception as e:
|
||||
pytest.fail(f"Error occurred: {e}")
|
||||
# test_bedrock_embedding()
|
||||
|
||||
# comment out hf tests - since hf endpoints are unstable
|
||||
# def test_hf_embedding():
|
||||
# try:
|
||||
# # huggingface/microsoft/codebert-base
|
||||
# # huggingface/facebook/bart-large
|
||||
# response = embedding(
|
||||
# model="huggingface/BAAI/bge-large-zh", input=["good morning from litellm", "this is another item"]
|
||||
# )
|
||||
# print(f"response:", response)
|
||||
# except Exception as e:
|
||||
# pytest.fail(f"Error occurred: {e}")
|
||||
def test_hf_embedding():
|
||||
try:
|
||||
# huggingface/microsoft/codebert-base
|
||||
# huggingface/facebook/bart-large
|
||||
response = embedding(
|
||||
model="huggingface/sentence-transformers/all-MiniLM-L6-v2", input=["good morning from litellm", "this is another item"]
|
||||
)
|
||||
print(f"response:", response)
|
||||
except Exception as e:
|
||||
pytest.fail(f"Error occurred: {e}")
|
||||
# test_hf_embedding()
|
||||
|
||||
# test async embeddings
|
||||
|
|
|
@@ -38,13 +38,32 @@ models = ["command-nightly"]
|
|||
# Test 1: Context Window Errors
|
||||
@pytest.mark.parametrize("model", models)
|
||||
def test_context_window(model):
|
||||
sample_text = "Say error 50 times" * 100000
|
||||
messages = [{"content": sample_text, "role": "user"}]
|
||||
print(f"model: {model}")
|
||||
try:
|
||||
completion(model=model, messages=messages)
|
||||
pytest.fail(f"An exception occurred")
|
||||
except ContextWindowExceededError:
|
||||
pass
|
||||
except RateLimitError:
|
||||
pass
|
||||
except Exception as e:
|
||||
print(f"{e}")
|
||||
pytest.fail(f"An error occcurred - {e}")
|
||||
|
||||
@pytest.mark.parametrize("model", models)
|
||||
def test_context_window_with_fallbacks(model):
|
||||
ctx_window_fallback_dict = {"command-nightly": "claude-2"}
|
||||
sample_text = "how does a court case get to the Supreme Court?" * 1000
|
||||
messages = [{"content": sample_text, "role": "user"}]
|
||||
|
||||
with pytest.raises(ContextWindowExceededError):
|
||||
completion(model=model, messages=messages)
|
||||
completion(model=model, messages=messages, context_window_fallback_dict=ctx_window_fallback_dict)
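# Sketch of what context_window_fallback_dict above expresses: if a model rejects
# the prompt for being too long, retry once on its designated fallback model.
# This hand-rolled wrapper only illustrates the idea; completion() accepts the
# dict directly and handles the fallback itself.
from litellm import completion, ContextWindowExceededError

def completion_with_ctx_fallback(model, messages, fallbacks):
    try:
        return completion(model=model, messages=messages)
    except ContextWindowExceededError:
        fallback_model = fallbacks.get(model)
        if fallback_model is None:
            raise
        return completion(model=fallback_model, messages=messages)

# usage mirrors the test above:
# completion_with_ctx_fallback("command-nightly", messages, {"command-nightly": "claude-2"})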
|
||||
|
||||
# for model in litellm.models_by_provider["bedrock"]:
|
||||
# test_context_window(model=model)
|
||||
# test_context_window(model="command-nightly")
|
||||
# test_context_window_with_fallbacks(model="command-nightly")
|
||||
# Test 2: InvalidAuth Errors
|
||||
@pytest.mark.parametrize("model", models)
|
||||
def invalid_auth(model): # set the model key to an invalid key, depending on the model
|
||||
|
@ -54,6 +73,13 @@ def invalid_auth(model): # set the model key to an invalid key, depending on th
|
|||
if model == "gpt-3.5-turbo":
|
||||
temporary_key = os.environ["OPENAI_API_KEY"]
|
||||
os.environ["OPENAI_API_KEY"] = "bad-key"
|
||||
elif model == "bedrock/anthropic.claude-v2":
|
||||
temporary_aws_access_key = os.environ["AWS_ACCESS_KEY_ID"]
|
||||
os.environ["AWS_ACCESS_KEY_ID"] = "bad-key"
|
||||
temporary_aws_region_name = os.environ["AWS_REGION_NAME"]
|
||||
os.environ["AWS_REGION_NAME"] = "bad-key"
|
||||
temporary_secret_key = os.environ["AWS_SECRET_ACCESS_KEY"]
|
||||
os.environ["AWS_SECRET_ACCESS_KEY"] = "bad-key"
|
||||
elif model == "chatgpt-test":
|
||||
temporary_key = os.environ["AZURE_API_KEY"]
|
||||
os.environ["AZURE_API_KEY"] = "bad-key"
|
||||
|
@ -90,10 +116,10 @@ def invalid_auth(model): # set the model key to an invalid key, depending on th
|
|||
)
|
||||
print(f"response: {response}")
|
||||
except AuthenticationError as e:
|
||||
print(f"AuthenticationError Caught Exception - {e.llm_provider}")
|
||||
print(f"AuthenticationError Caught Exception - {str(e)}")
|
||||
except (
|
||||
OpenAIError
|
||||
): # is at least an openai error -> in case of random model errors - e.g. overloaded server
|
||||
) as e: # is at least an openai error -> in case of random model errors - e.g. overloaded server
|
||||
print(f"OpenAIError Caught Exception - {e}")
|
||||
except Exception as e:
|
||||
print(type(e))
|
||||
|
@ -124,8 +150,15 @@ def invalid_auth(model): # set the model key to an invalid key, depending on th
|
|||
os.environ["ALEPH_ALPHA_API_KEY"] = temporary_key
|
||||
elif model in litellm.nlp_cloud_models:
|
||||
os.environ["NLP_CLOUD_API_KEY"] = temporary_key
|
||||
elif "bedrock" in model:
|
||||
os.environ["AWS_ACCESS_KEY_ID"] = temporary_aws_access_key
|
||||
os.environ["AWS_REGION_NAME"] = temporary_aws_region_name
|
||||
os.environ["AWS_SECRET_ACCESS_KEY"] = temporary_secret_key
|
||||
return
|
||||
|
||||
for model in litellm.models_by_provider["bedrock"]:
|
||||
invalid_auth(model=model)
|
||||
|
||||
# Test 3: Invalid Request Error
|
||||
@pytest.mark.parametrize("model", models)
|
||||
def test_invalid_request_error(model):
|
||||
|
|
|
@@ -9,5 +9,15 @@ from litellm import get_max_tokens, model_cost, open_ai_chat_completion_models

print(get_max_tokens("gpt-3.5-turbo"))

print(model_cost)
print(open_ai_chat_completion_models)
def test_get_gpt3_tokens():
max_tokens = get_max_tokens("gpt-3.5-turbo")
results = max_tokens['max_tokens']
print(results)
# test_get_gpt3_tokens()

def test_get_palm_tokens():
# # 🦄🦄🦄🦄🦄🦄🦄🦄
max_tokens = get_max_tokens("palm/chat-bison")
results = max_tokens['max_tokens']
print(results)
# test_get_palm_tokens()
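# Sketch combining the two helpers exercised above: look up a model's context
# window with get_max_tokens() and compare it against the prompt size from
# token_counter() before sending a request. A minimal check, not a guarantee
# that the provider will accept the prompt.
import litellm
from litellm import get_max_tokens

def fits_in_context(model: str, messages: list) -> bool:
    limit = get_max_tokens(model)["max_tokens"]
    prompt_tokens = litellm.token_counter(model=model, messages=messages)
    return prompt_tokens < limit

print(fits_in_context("gpt-3.5-turbo", [{"role": "user", "content": "Hello!"}]))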
@@ -1,30 +1,30 @@
|
|||
#### What this tests ####
|
||||
# This tests if logging to the helicone integration actually works
|
||||
# #### What this tests ####
|
||||
# # This tests if logging to the helicone integration actually works
|
||||
|
||||
import sys, os
|
||||
import traceback
|
||||
import pytest
|
||||
# import sys, os
|
||||
# import traceback
|
||||
# import pytest
|
||||
|
||||
sys.path.insert(
|
||||
0, os.path.abspath("../..")
|
||||
) # Adds the parent directory to the system path
|
||||
import litellm
|
||||
from litellm import embedding, completion
|
||||
# sys.path.insert(
|
||||
# 0, os.path.abspath("../..")
|
||||
# ) # Adds the parent directory to the system path
|
||||
# import litellm
|
||||
# from litellm import embedding, completion
|
||||
|
||||
litellm.success_callback = ["helicone"]
|
||||
# litellm.success_callback = ["helicone"]
|
||||
|
||||
litellm.set_verbose = True
|
||||
# litellm.set_verbose = True
|
||||
|
||||
user_message = "Hello, how are you?"
|
||||
messages = [{"content": user_message, "role": "user"}]
|
||||
# user_message = "Hello, how are you?"
|
||||
# messages = [{"content": user_message, "role": "user"}]
|
||||
|
||||
|
||||
# openai call
|
||||
response = completion(
|
||||
model="gpt-3.5-turbo", messages=[{"role": "user", "content": "Hi 👋 - i'm openai"}]
|
||||
)
|
||||
# # openai call
|
||||
# response = completion(
|
||||
# model="gpt-3.5-turbo", messages=[{"role": "user", "content": "Hi 👋 - i'm openai"}]
|
||||
# )
|
||||
|
||||
# cohere call
|
||||
response = completion(
|
||||
model="command-nightly", messages=[{"role": "user", "content": "Hi 👋 - i'm cohere"}]
|
||||
)
|
||||
# # cohere call
|
||||
# response = completion(
|
||||
# model="command-nightly", messages=[{"role": "user", "content": "Hi 👋 - i'm cohere"}]
|
||||
# )
|
||||
|
|
|
@@ -1,112 +0,0 @@
|
|||
#### What this tests ####
|
||||
# This tests if logging to the litedebugger integration actually works
|
||||
|
||||
# Test Scenarios (test across normal completion, streaming)
|
||||
## 1: Pre-API-Call
|
||||
## 2: Post-API-Call
|
||||
## 3: On LiteLLM Call success
|
||||
## 4: On LiteLLM Call failure
|
||||
|
||||
|
||||
import sys, os, io
|
||||
import traceback, logging
|
||||
import pytest
|
||||
import dotenv
|
||||
dotenv.load_dotenv()
|
||||
|
||||
# Create logger
|
||||
logger = logging.getLogger(__name__)
|
||||
logger.setLevel(logging.DEBUG)
|
||||
|
||||
# Create a stream handler
|
||||
stream_handler = logging.StreamHandler(sys.stdout)
|
||||
logger.addHandler(stream_handler)
|
||||
|
||||
# Create a function to log information
|
||||
def logger_fn(message):
|
||||
logger.info(message)
|
||||
|
||||
sys.path.insert(
|
||||
0, os.path.abspath("../..")
|
||||
) # Adds the parent directory to the system path
|
||||
import litellm
|
||||
from litellm import completion
|
||||
from openai.error import AuthenticationError
|
||||
litellm.set_verbose = True
|
||||
|
||||
score = 0
|
||||
split_per_model = {
|
||||
"gpt-4": 0,
|
||||
"claude-instant-1.2": 1
|
||||
}
|
||||
|
||||
|
||||
user_message = "Hello, how are you?"
|
||||
messages = [{"content": user_message, "role": "user"}]
|
||||
|
||||
# # #Test 1: On completion call - without setting client to true -> ensure litedebugger is not initialized
|
||||
# try:
|
||||
# # Redirect stdout
|
||||
# old_stdout = sys.stdout
|
||||
# sys.stdout = new_stdout = io.StringIO()
|
||||
|
||||
# response = completion(model="gpt-3.5-turbo", messages=messages)
|
||||
|
||||
# # Restore stdout
|
||||
# sys.stdout = old_stdout
|
||||
# output = new_stdout.getvalue().strip()
|
||||
|
||||
# if "LiteLLMDebugger" in output:
|
||||
# raise Exception("LiteLLM Debugger should not be called!")
|
||||
# score += 1
|
||||
# except Exception as e:
|
||||
# pytest.fail(f"Error occurred: {e}")
|
||||
|
||||
|
||||
# # Test 2: On normal completion call - setting client to true
|
||||
# litellm.use_client=True
|
||||
# def test_completion_with_client():
|
||||
# try:
|
||||
# # Redirect stdout
|
||||
# old_stdout = sys.stdout
|
||||
# sys.stdout = new_stdout = io.StringIO()
|
||||
# litellm.token = "1e6795ea-a75e-4231-8110-dcc721dcffc3" # generate one here - https://www.uuidgenerator.net/version4
|
||||
|
||||
# completion(model="gpt-3.5-turbo", messages=messages)
|
||||
# completion(model="claude-instant-1", messages=messages)
|
||||
|
||||
# # Restore stdout
|
||||
# sys.stdout = old_stdout
|
||||
# output = new_stdout.getvalue().strip()
|
||||
# print(output)
|
||||
# if "LiteDebugger: Pre-API Call Logging" not in output:
|
||||
# raise Exception(f"LiteLLMDebugger: pre-api call not logged!")
|
||||
# if "LiteDebugger: Post-API Call Logging" not in output:
|
||||
# raise Exception("LiteLLMDebugger: post-api call not logged!")
|
||||
# if "LiteDebugger: Success/Failure Call Logging" not in output:
|
||||
# raise Exception("LiteLLMDebugger: success/failure call not logged!")
|
||||
# except Exception as e:
|
||||
# pytest.fail(f"Error occurred: {e}")
|
||||
# test_completion_with_client()
|
||||
# # Test 3: On streaming completion call - setting client to true
|
||||
# try:
|
||||
# # Redirect stdout
|
||||
# old_stdout = sys.stdout
|
||||
# sys.stdout = new_stdout = io.StringIO()
|
||||
|
||||
# response = completion_with_split_tests(models=split_per_model, messages=messages, stream=True, use_client=True, override_client=True, id="6d383c99-488d-481d-aa1b-1f94935cec44")
|
||||
# for data in response:
|
||||
# continue
|
||||
# # Restore stdout
|
||||
# sys.stdout = old_stdout
|
||||
# output = new_stdout.getvalue().strip()
|
||||
|
||||
# if "LiteDebugger: Pre-API Call Logging" not in output:
|
||||
# raise Exception("LiteLLMDebugger: pre-api call not logged!")
|
||||
# if "LiteDebugger: Post-API Call Logging" not in output:
|
||||
# raise Exception("LiteLLMDebugger: post-api call not logged!")
|
||||
# if "LiteDebugger: Success/Failure Call Logging" not in output:
|
||||
# raise Exception("LiteLLMDebugger: success/failure call not logged!")
|
||||
# except Exception as e:
|
||||
# pytest.fail(f"Error occurred: {e}")
|
||||
|
|
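For context on the deleted debugger test above: the same pre-/post-call and success/failure events can be observed with a plain custom callback. This is a minimal sketch, not part of this PR; it assumes litellm's custom-callback interface, where functions in `litellm.success_callback` receive `(kwargs, completion_response, start_time, end_time)` with datetime timestamps, and the callback name `log_success` is illustrative.

```python
import litellm
from litellm import completion

def log_success(kwargs, completion_response, start_time, end_time):
    # kwargs holds the original call parameters; start/end times are datetimes.
    print("model:", kwargs.get("model"))
    print("latency (s):", (end_time - start_time).total_seconds())

litellm.success_callback = [log_success]

response = completion(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "Hello, how are you?"}],
)
```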
@@ -46,12 +46,13 @@ def test_promptlayer_logging_with_metadata():
        old_stdout = sys.stdout
        sys.stdout = new_stdout = io.StringIO()


        response = completion(model="j2-light",
        response = completion(model="gpt-3.5-turbo",
                              messages=[{
                                  "role": "user",
                                  "content": "Hi 👋 - i'm ai21"
                              }],
                              temperature=0.2,
                              max_tokens=20,
                              metadata={"model": "ai21"})

        # Restore stdout
@@ -65,7 +66,7 @@ def test_promptlayer_logging_with_metadata():
    except Exception as e:
        print(e)

# test_promptlayer_logging_with_metadata()
test_promptlayer_logging_with_metadata()
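For context on the PromptLayer test above, a minimal sketch of how that logging path is typically wired up. Only the `"promptlayer"` success callback and the `metadata` kwarg (both exercised by the test) are assumed; the API key value is a placeholder.

```python
import os
import litellm
from litellm import completion

os.environ["PROMPTLAYER_API_KEY"] = "pl-..."  # placeholder key

# Log successful calls to PromptLayer, as the test above does.
litellm.success_callback = ["promptlayer"]

response = completion(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "Hi 👋 - i'm ai21"}],
    temperature=0.2,
    max_tokens=20,
    metadata={"model": "ai21"},  # surfaced as request metadata in PromptLayer
)
```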
@@ -1,81 +0,0 @@
#### What this tests ####
# This tests calling batch_completions by running 100 messages together

import sys, os
import traceback
import pytest
sys.path.insert(
    0, os.path.abspath("../..")
)  # Adds the parent directory to the system path
from openai.error import Timeout
import litellm
from litellm import batch_completion, batch_completion_models, completion, batch_completion_models_all_responses
# litellm.set_verbose=True

@pytest.mark.asyncio
async def test_rate_limit_handler():
    import asyncio
    ##### USAGE ################

    from litellm import RateLimitManager

    handler = RateLimitManager(
        max_requests_per_minute = 60,
        max_tokens_per_minute = 200
    )


    async def send_request():
        response = await handler.acompletion(
            model="gpt-3.5-turbo",
            messages=[{
                "content": "Please provide a summary of the latest scientific discoveries."*10,
                "role": "user"
            }]
        )
        print("got a response", response)
        return response


    tasks = []

    for _ in range(4):
        tasks.append(send_request())

    responses = await asyncio.gather(*tasks)

    for response in responses:
        print(response)

# import asyncio
# asyncio.run(
#     test_rate_limit_handler()
# )


@pytest.mark.asyncio
async def test_rate_limit_handler_batch():
    ##### USAGE ################

    jobs = [
        {"model": "gpt-3.5-turbo-16k", "messages": [{"content": "Please provide a summary of the latest scientific discoveries.", "role": "user"}]},
        {"model": "gpt-3.5-turbo-16k", "messages": [{"content": "Please provide a summary of the latest scientific discoveries.", "role": "user"}]},
    ]

    from litellm import RateLimitManager

    handler = RateLimitManager(
        max_requests_per_minute = 60,
        max_tokens_per_minute = 20000
    )

    try:
        handler.batch_completion(
            jobs = jobs,
            api_key=os.environ['OPENAI_API_KEY'],
        )
    except Exception as e:
        print(e)


test_rate_limit_handler_batch()
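The removed file above exercised the `RateLimitManager` helper. As a rough sketch of the underlying pattern it relied on, requests can still be fanned out concurrently with `asyncio` and `litellm.acompletion`; this is illustrative only and assumes `acompletion` mirrors `completion`'s `model`/`messages` signature.

```python
import asyncio
import litellm

async def send_request(prompt: str):
    # Async counterpart of completion().
    return await litellm.acompletion(
        model="gpt-3.5-turbo",
        messages=[{"role": "user", "content": prompt}],
    )

async def main():
    # Fan out a handful of requests concurrently, as the removed test did.
    tasks = [
        send_request("Please provide a summary of the latest scientific discoveries.")
        for _ in range(4)
    ]
    for response in await asyncio.gather(*tasks):
        print(response)

# asyncio.run(main())
```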
@@ -35,6 +35,7 @@ def test_stream_chunk_builder():
    chunks = []

    for chunk in response:
        print(chunk)
        chunks.append(chunk)

    try:
@@ -51,8 +52,9 @@ def test_stream_chunk_builder():
        message = choices["message"]
        role = message["role"]
        content = message["content"]
        finnish_reason = choices["finish_reason"]
    except:
        raise Exception("stream_chunk_builder failed to rebuild response")
# test_stream_chunk_builder()
        finish_reason = choices["finish_reason"]
        print(role, content, finish_reason)
    except Exception as e:
        raise Exception("stream_chunk_builder failed to rebuild response", e)
test_stream_chunk_builder()
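For context on the hunks above, a minimal sketch of how `stream_chunk_builder` is typically used to rebuild one response from streamed chunks; it mirrors the test and assumes only the helper's `(chunks, messages=...)` signature and dict-style access on the rebuilt response.

```python
from litellm import completion, stream_chunk_builder

messages = [{"role": "user", "content": "Hello, how are you?"}]
response = completion(model="gpt-3.5-turbo", messages=messages, stream=True)

# Collect every streamed chunk, then rebuild a single response object from them.
chunks = [chunk for chunk in response]
rebuilt = stream_chunk_builder(chunks, messages=messages)

choice = rebuilt["choices"][0]
print(choice["message"]["role"], choice["message"]["content"], choice["finish_reason"])
```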
@@ -695,35 +695,52 @@ def test_completion_replicate_stream_bad_key():
# test_completion_bedrock_claude_stream()


def test_completion_sagemaker_stream():
    try:
        response = completion(
            model="sagemaker/jumpstart-dft-meta-textgeneration-llama-2-7b",
            messages=messages,
            temperature=0.2,
            max_tokens=80,
            stream=True,
        )
        complete_response = ""
        has_finish_reason = False
        # Add any assertions here to check the response
        for idx, chunk in enumerate(response):
            chunk, finished = streaming_format_tests(idx, chunk)
            has_finish_reason = finished
            if finished:
                break
            complete_response += chunk
        if has_finish_reason is False:
            raise Exception("finish reason not set for last chunk")
        if complete_response.strip() == "":
            raise Exception("Empty response received")
    except InvalidRequestError as e:
        pass
    except Exception as e:
        pytest.fail(f"Error occurred: {e}")
# def test_completion_sagemaker_stream():
#     try:
#         response = completion(
#             model="sagemaker/jumpstart-dft-meta-textgeneration-llama-2-7b",
#             messages=messages,
#             temperature=0.2,
#             max_tokens=80,
#             stream=True,
#         )
#         complete_response = ""
#         has_finish_reason = False
#         # Add any assertions here to check the response
#         for idx, chunk in enumerate(response):
#             chunk, finished = streaming_format_tests(idx, chunk)
#             has_finish_reason = finished
#             if finished:
#                 break
#             complete_response += chunk
#         if has_finish_reason is False:
#             raise Exception("finish reason not set for last chunk")
#         if complete_response.strip() == "":
#             raise Exception("Empty response received")
#     except InvalidRequestError as e:
#         pass
#     except Exception as e:
#         pytest.fail(f"Error occurred: {e}")

# test_completion_sagemaker_stream()


# def test_maritalk_streaming():
#     messages = [{"role": "user", "content": "Hey"}]
#     try:
#         response = completion("maritalk", messages=messages, stream=True)
#         complete_response = ""
#         start_time = time.time()
#         for idx, chunk in enumerate(response):
#             chunk, finished = streaming_format_tests(idx, chunk)
#             complete_response += chunk
#             if finished:
#                 break
#         if complete_response.strip() == "":
#             raise Exception("Empty response received")
#     except:
#         pytest.fail(f"error occurred: {traceback.format_exc()}")
# test_maritalk_streaming()
# test on openai completion call
def test_openai_text_completion_call():
    try:
83
litellm/tests/test_text_completion.py
Normal file
File diff suppressed because one or more lines are too long
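The diff for litellm/tests/test_text_completion.py is suppressed above. As a rough sketch of the interface that file exercises, litellm also exposes a `text_completion` helper that takes a `prompt` instead of a `messages` list; the model name below is only an example.

```python
from litellm import text_completion

response = text_completion(
    model="gpt-3.5-turbo-instruct",  # example model; any text-completion model works
    prompt="Why is the sky blue?",
    max_tokens=20,
)
print(response)
```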
@@ -1,57 +1,62 @@
import litellm
from litellm import completion
from traceloop.sdk import Traceloop

Traceloop.init(app_name="test_traceloop", disable_batch=True)
litellm.success_callback = ["traceloop"]
# import sys
# import os
# import io
# #
# sys.path.insert(0, os.path.abspath('../..'))
# import litellm
# from litellm import completion
# from traceloop.sdk import Traceloop
# Traceloop.init(app_name="test_traceloop", disable_batch=True, traceloop_sync_enabled=False)
# litellm.success_callback = ["traceloop"]


def test_traceloop_logging():
    try:
        response = completion(
            model="claude-instant-1.2",
            messages=[
                {"role": "user", "content": "Tell me a joke about OpenTelemetry"}
            ],
            max_tokens=10,
            temperature=0.2,
        )
        print(response)
    except Exception as e:
        print(e)
# def test_traceloop_logging():
#     try:
#         print('making completion call')
#         response = completion(
#             model="claude-instant-1.2",
#             messages=[
#                 {"role": "user", "content": "Tell me a joke about OpenTelemetry"}
#             ],
#             max_tokens=10,
#             temperature=0.2,
#         )
#         print(response)
#     except Exception as e:
#         print(e)


test_traceloop_logging()
# # test_traceloop_logging()


def test_traceloop_tracing_function_calling():
    function1 = [
        {
            "name": "get_current_weather",
            "description": "Get the current weather in a given location",
            "parameters": {
                "type": "object",
                "properties": {
                    "location": {
                        "type": "string",
                        "description": "The city and state, e.g. San Francisco, CA",
                    },
                    "unit": {"type": "string", "enum": ["celsius", "fahrenheit"]},
                },
                "required": ["location"],
            },
        }
    ]
    try:
        response = completion(
            model="gpt-3.5-turbo",
            messages=[{"role": "user", "content": "what's the weather in boston"}],
            temperature=0.1,
            functions=function1,
        )
        print(response)
    except Exception as e:
        print(e)
# def test_traceloop_tracing_function_calling():
#     function1 = [
#         {
#             "name": "get_current_weather",
#             "description": "Get the current weather in a given location",
#             "parameters": {
#                 "type": "object",
#                 "properties": {
#                     "location": {
#                         "type": "string",
#                         "description": "The city and state, e.g. San Francisco, CA",
#                     },
#                     "unit": {"type": "string", "enum": ["celsius", "fahrenheit"]},
#                 },
#                 "required": ["location"],
#             },
#         }
#     ]
#     try:
#         response = completion(
#             model="gpt-3.5-turbo",
#             messages=[{"role": "user", "content": "what's the weather in boston"}],
#             temperature=0.1,
#             functions=function1,
#         )
#         print(response)
#     except Exception as e:
#         print(e)


test_traceloop_tracing_function_calling()
# # test_traceloop_tracing_function_calling()
959
litellm/utils.py
File diff suppressed because it is too large
@@ -21,6 +21,9 @@ ANTHROPIC_API_KEY = ""

COHERE_API_KEY = ""

## CONFIG FILE ##
# CONFIG_FILE_PATH = "" # uncomment to point to config file

## LOGGING ##

SET_VERBOSE = "False" # set to 'True' to see detailed input/output logs
Some files were not shown because too many files have changed in this diff