Merge branch 'main' into main

This commit is contained in:
Krish Dholakia 2023-08-09 11:00:40 -07:00 committed by GitHub
commit 4278b183d0
18 changed files with 1000 additions and 102 deletions

.DS_Store (binary file, not shown)

View file

@ -13,6 +13,9 @@ jobs:
command: |
python -m pip install --upgrade pip
python -m pip install -r requirements.txt
pip install infisical
pip install pytest
pip install openai[datalib]
# Run pytest and generate JUnit XML report
- run:

View file

@ -3,18 +3,16 @@
[![PyPI Version](https://img.shields.io/badge/stable%20version-v0.1.345-blue?color=green&link=https://pypi.org/project/litellm/0.1.1/)](https://pypi.org/project/litellm/0.1.1/)
[![CircleCI](https://dl.circleci.com/status-badge/img/gh/BerriAI/litellm/tree/main.svg?style=svg)](https://dl.circleci.com/status-badge/redirect/gh/BerriAI/litellm/tree/main)
![Downloads](https://img.shields.io/pypi/dm/litellm)
[![litellm](https://img.shields.io/badge/%20%F0%9F%9A%85%20liteLLM-OpenAI%7CAzure%7CAnthropic%7CPalm%7CCohere-blue?color=green)](https://github.com/BerriAI/litellm)
[![litellm](https://img.shields.io/badge/%20%F0%9F%9A%85%20liteLLM-OpenAI%7CAzure%7CAnthropic%7CPalm%7CCohere%7CReplicate%7CHugging%20Face-blue?color=green)](https://github.com/BerriAI/litellm)
Get Support / Join the community 👉 [![](https://dcbadge.vercel.app/api/server/wuPM9dRgDw)](https://discord.gg/wuPM9dRgDw)
[![](https://dcbadge.vercel.app/api/server/wuPM9dRgDw)](https://discord.gg/wuPM9dRgDw)
a simple & light package to call OpenAI, Azure, Cohere, Anthropic API Endpoints
litellm manages:
- translating inputs to completion and embedding endpoints
- guarantees consistent output, text responses will always be available at `['choices'][0]['message']['content']`
a light package to simplify calling OpenAI, Azure, Cohere, Anthropic, and Hugging Face API endpoints. It manages:
- translating inputs to the provider's completion and embedding endpoints
- guarantees [consistent output](https://litellm.readthedocs.io/en/latest/output/): text responses are always available at `['choices'][0]['message']['content']`
- exception mapping - common exceptions across providers are mapped to the [OpenAI exception types](https://help.openai.com/en/articles/6897213-openai-library-error-types-guidance)
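As a quick, hedged illustration of what this mapping enables (assuming the `openai.error` classes re-exported by `litellm/__init__.py` later in this commit; the model name is just an example):

```python
from openai.error import AuthenticationError, RateLimitError, OpenAIError
from litellm import completion

messages = [{ "content": "Hello, how are you?","role": "user"}]
try:
    # provider-specific failures (OpenAI, Azure, Anthropic, Cohere, ...) surface as OpenAI-style exceptions
    response = completion("claude-instant-1", messages)
except (AuthenticationError, RateLimitError) as e:
    print(f"mapped provider error: {e}")
except OpenAIError as e:
    print(f"other mapped error: {e}")
```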
# usage
Demo - https://litellm.ai/ \
Read the docs - https://litellm.readthedocs.io/en/latest/
## quick start
@ -25,11 +23,6 @@ pip install litellm
```python
from litellm import completion
## set ENV variables
# ENV variables can be set in .env file, too. Example in .env.example
os.environ["OPENAI_API_KEY"] = "openai key"
os.environ["COHERE_API_KEY"] = "cohere key"
messages = [{ "content": "Hello, how are you?","role": "user"}]
# openai call
@ -41,6 +34,9 @@ response = completion("command-nightly", messages)
# azure openai call
response = completion("chatgpt-test", messages, azure=True)
# hugging face call
response = completion(model="stabilityai/stablecode-completion-alpha-3b-4k", messages=messages, hugging_face=True)
# openrouter call
response = completion("google/palm-2-codechat-bison", messages)
```
@ -53,17 +49,23 @@ pip install litellm==0.1.345
## Streaming Queries
liteLLM supports streaming the model response back; pass `stream=True` to get a streaming iterator in response.
Streaming is supported for OpenAI, Azure, Anthropic models
```python
response = completion(model="gpt-3.5-turbo", messages=messages, stream=True)
for chunk in response:
print(chunk['choices'][0]['delta'])
# claude 2
result = completion('claude-2', messages, stream=True)
for chunk in result:
print(chunk['choices'][0]['delta'])
```
# hosted version
# support / talk with founders
- [Grab time if you want access 👋](https://calendly.com/d/4mp-gd3-k5k/berriai-1-1-onboarding-litellm-hosted-version)
- [Our calendar 👋](https://calendly.com/d/4mp-gd3-k5k/berriai-1-1-onboarding-litellm-hosted-version)
- [Community Discord 💭](https://discord.gg/wuPM9dRgDw)
- Our numbers 📞 +1 (770) 8783-106 / +1 (412) 618-6238
- Our emails ✉️ ishaan@berri.ai / krrish@berri.ai
# why did we build this
- **Need for simplicity**: Our code started to get extremely complicated managing & translating calls between Azure, OpenAI, Cohere
# Support
Contact us at ishaan@berri.ai / krrish@berri.ai

View file

@ -0,0 +1,406 @@
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"provenance": []
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"language_info": {
"name": "python"
}
},
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "ZwuaylskLxFu",
"outputId": "d684d6a3-32fe-4beb-c378-c39134bcf8cc"
},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Collecting litellm==0.1.363\n",
" Downloading litellm-0.1.363-py3-none-any.whl (34 kB)\n",
"Requirement already satisfied: openai<0.28.0,>=0.27.8 in /usr/local/lib/python3.10/dist-packages (from litellm==0.1.363) (0.27.8)\n",
"Requirement already satisfied: python-dotenv<2.0.0,>=1.0.0 in /usr/local/lib/python3.10/dist-packages (from litellm==0.1.363) (1.0.0)\n",
"Requirement already satisfied: tiktoken<0.5.0,>=0.4.0 in /usr/local/lib/python3.10/dist-packages (from litellm==0.1.363) (0.4.0)\n",
"Requirement already satisfied: requests>=2.20 in /usr/local/lib/python3.10/dist-packages (from openai<0.28.0,>=0.27.8->litellm==0.1.363) (2.31.0)\n",
"Requirement already satisfied: tqdm in /usr/local/lib/python3.10/dist-packages (from openai<0.28.0,>=0.27.8->litellm==0.1.363) (4.65.0)\n",
"Requirement already satisfied: aiohttp in /usr/local/lib/python3.10/dist-packages (from openai<0.28.0,>=0.27.8->litellm==0.1.363) (3.8.5)\n",
"Requirement already satisfied: regex>=2022.1.18 in /usr/local/lib/python3.10/dist-packages (from tiktoken<0.5.0,>=0.4.0->litellm==0.1.363) (2022.10.31)\n",
"Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.10/dist-packages (from requests>=2.20->openai<0.28.0,>=0.27.8->litellm==0.1.363) (3.2.0)\n",
"Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests>=2.20->openai<0.28.0,>=0.27.8->litellm==0.1.363) (3.4)\n",
"Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests>=2.20->openai<0.28.0,>=0.27.8->litellm==0.1.363) (1.26.16)\n",
"Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests>=2.20->openai<0.28.0,>=0.27.8->litellm==0.1.363) (2023.7.22)\n",
"Requirement already satisfied: attrs>=17.3.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->openai<0.28.0,>=0.27.8->litellm==0.1.363) (23.1.0)\n",
"Requirement already satisfied: multidict<7.0,>=4.5 in /usr/local/lib/python3.10/dist-packages (from aiohttp->openai<0.28.0,>=0.27.8->litellm==0.1.363) (6.0.4)\n",
"Requirement already satisfied: async-timeout<5.0,>=4.0.0a3 in /usr/local/lib/python3.10/dist-packages (from aiohttp->openai<0.28.0,>=0.27.8->litellm==0.1.363) (4.0.2)\n",
"Requirement already satisfied: yarl<2.0,>=1.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->openai<0.28.0,>=0.27.8->litellm==0.1.363) (1.9.2)\n",
"Requirement already satisfied: frozenlist>=1.1.1 in /usr/local/lib/python3.10/dist-packages (from aiohttp->openai<0.28.0,>=0.27.8->litellm==0.1.363) (1.4.0)\n",
"Requirement already satisfied: aiosignal>=1.1.2 in /usr/local/lib/python3.10/dist-packages (from aiohttp->openai<0.28.0,>=0.27.8->litellm==0.1.363) (1.3.1)\n",
"Installing collected packages: litellm\n",
" Attempting uninstall: litellm\n",
" Found existing installation: litellm 0.1.362\n",
" Uninstalling litellm-0.1.362:\n",
" Successfully uninstalled litellm-0.1.362\n",
"Successfully installed litellm-0.1.363\n"
]
}
],
"source": [
"!pip install litellm==\"0.1.363\""
]
},
{
"cell_type": "code",
"source": [
"# @title Import litellm & Set env variables\n",
"import litellm\n",
"import os\n",
"\n",
"os.environ[\"ANTHROPIC_API_KEY\"] = \" \" #@param"
],
"metadata": {
"id": "W216G__XL19Q"
},
"execution_count": 2,
"outputs": []
},
{
"cell_type": "code",
"source": [
"# @title Request Claude Instant-1 and Claude-2\n",
"messages = [\n",
" {\"role\": \"system\", \"content\": \"You are a helpful assistant.\"},\n",
" {\"role\": \"user\", \"content\": \"Who won the world series in 2020?\"}\n",
" ]\n",
"\n",
"result = litellm.completion('claude-instant-1', messages)\n",
"print(\"\\n\\n Result from claude-instant-1\", result)\n",
"result = litellm.completion('claude-2', messages, max_tokens=5, temperature=0.2)\n",
"print(\"\\n\\n Result from claude-2\", result)"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "ff1lKwUMMLJj",
"outputId": "bfddf6f8-36d4-45e5-92dc-349083fa41b8"
},
"execution_count": 9,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"\n",
"\n",
" Result from claude-instant-1 {'choices': [{'finish_reason': 'stop', 'index': 0, 'message': {'role': 'assistant', 'content': \" The Los Angeles Dodgers won the 2020 World Series, defeating the Tampa Bay Rays 4-2. It was the Dodgers' first World Series title since 1988.\"}}], 'created': 1691536677.2676156, 'model': 'claude-instant-1', 'usage': {'prompt_tokens': 30, 'completion_tokens': 32, 'total_tokens': 62}}\n",
"\n",
"\n",
" Result from claude-2 {'choices': [{'finish_reason': 'stop', 'index': 0, 'message': {'role': 'assistant', 'content': ' The Los Angeles Dodgers won'}}], 'created': 1691536677.944753, 'model': 'claude-2', 'usage': {'prompt_tokens': 30, 'completion_tokens': 5, 'total_tokens': 35}}\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"# @title Streaming Example: Request Claude-2\n",
"messages = [\n",
" {\"role\": \"system\", \"content\": \"You are a helpful assistant.\"},\n",
" {\"role\": \"user\", \"content\": \"how does a court case get to the Supreme Court?\"}\n",
" ]\n",
"\n",
"result = litellm.completion('claude-2', messages, stream=True)\n",
"for chunk in result:\n",
" print(chunk['choices'][0]['delta'])\n",
"\n"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "06hWKnNQMrV-",
"outputId": "7fdec0eb-d4a9-4882-f9c4-987ff9a31114"
},
"execution_count": 7,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
" Here\n",
"'s\n",
" a\n",
" quick\n",
" overview\n",
" of\n",
" how\n",
" a\n",
" court\n",
" case\n",
" can\n",
" reach\n",
" the\n",
" U\n",
".\n",
"S\n",
".\n",
" Supreme\n",
" Court\n",
":\n",
"\n",
"\n",
"-\n",
" The\n",
" case\n",
" must\n",
" first\n",
" be\n",
" heard\n",
" in\n",
" a\n",
" lower\n",
" trial\n",
" court\n",
" (\n",
"either\n",
" a\n",
" state\n",
" court\n",
" or\n",
" federal\n",
" district\n",
" court\n",
").\n",
" The\n",
" trial\n",
" court\n",
" makes\n",
" initial\n",
" r\n",
"ulings\n",
" and\n",
" produces\n",
" a\n",
" record\n",
" of\n",
" the\n",
" case\n",
".\n",
"\n",
"\n",
"-\n",
" The\n",
" losing\n",
" party\n",
" can\n",
" appeal\n",
" the\n",
" decision\n",
" to\n",
" an\n",
" appeals\n",
" court\n",
" (\n",
"a\n",
" state\n",
" appeals\n",
" court\n",
" for\n",
" state\n",
" cases\n",
",\n",
" or\n",
" a\n",
" federal\n",
" circuit\n",
" court\n",
" for\n",
" federal\n",
" cases\n",
").\n",
" The\n",
" appeals\n",
" court\n",
" reviews\n",
" the\n",
" trial\n",
" court\n",
"'s\n",
" r\n",
"ulings\n",
" and\n",
" can\n",
" affirm\n",
",\n",
" reverse\n",
",\n",
" or\n",
" modify\n",
" the\n",
" decision\n",
".\n",
"\n",
"\n",
"-\n",
" If\n",
" a\n",
" party\n",
" is\n",
" still\n",
" unsat\n",
"isf\n",
"ied\n",
" after\n",
" the\n",
" appeals\n",
" court\n",
" rules\n",
",\n",
" they\n",
" can\n",
" petition\n",
" the\n",
" Supreme\n",
" Court\n",
" to\n",
" hear\n",
" the\n",
" case\n",
" through\n",
" a\n",
" writ\n",
" of\n",
" cert\n",
"ior\n",
"ari\n",
".\n",
" \n",
"\n",
"\n",
"-\n",
" The\n",
" Supreme\n",
" Court\n",
" gets\n",
" thousands\n",
" of\n",
" cert\n",
" petitions\n",
" every\n",
" year\n",
" but\n",
" usually\n",
" only\n",
" agrees\n",
" to\n",
" hear\n",
" about\n",
" 100\n",
"-\n",
"150\n",
" of\n",
" cases\n",
" that\n",
" have\n",
" significant\n",
" national\n",
" importance\n",
" or\n",
" where\n",
" lower\n",
" courts\n",
" disagree\n",
" on\n",
" federal\n",
" law\n",
".\n",
" \n",
"\n",
"\n",
"-\n",
" If\n",
" 4\n",
" out\n",
" of\n",
" the\n",
" 9\n",
" Just\n",
"ices\n",
" vote\n",
" to\n",
" grant\n",
" cert\n",
" (\n",
"agree\n",
" to\n",
" hear\n",
" the\n",
" case\n",
"),\n",
" it\n",
" goes\n",
" on\n",
" the\n",
" Supreme\n",
" Court\n",
"'s\n",
" do\n",
"cket\n",
" for\n",
" arguments\n",
".\n",
"\n",
"\n",
"-\n",
" The\n",
" Supreme\n",
" Court\n",
" then\n",
" hears\n",
" oral\n",
" arguments\n",
",\n",
" considers\n",
" written\n",
" brief\n",
"s\n",
",\n",
" examines\n",
" the\n",
" lower\n",
" court\n",
" records\n",
",\n",
" and\n",
" issues\n",
" a\n",
" final\n",
" ruling\n",
" on\n",
" the\n",
" case\n",
",\n",
" which\n",
" serves\n",
" as\n",
" binding\n",
" precedent\n"
]
}
]
}
]
}

View file

@ -0,0 +1,153 @@
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"provenance": []
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"language_info": {
"name": "python"
}
},
"cells": [
{
"cell_type": "markdown",
"source": [
"## Install liteLLM https://github.com/BerriAI/litellm\n",
"liteLLM provides one interface to call gpt 3.5, hugging face inference endpoints"
],
"metadata": {
"id": "IGQZtR61AZSd"
}
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "x_4jcmmXcdm-",
"outputId": "c89e7817-561d-4867-904b-aa1634565cbb"
},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Requirement already satisfied: litellm==0.1.362 in /usr/local/lib/python3.10/dist-packages (0.1.362)\n",
"Requirement already satisfied: openai<0.28.0,>=0.27.8 in /usr/local/lib/python3.10/dist-packages (from litellm==0.1.362) (0.27.8)\n",
"Requirement already satisfied: python-dotenv<2.0.0,>=1.0.0 in /usr/local/lib/python3.10/dist-packages (from litellm==0.1.362) (1.0.0)\n",
"Requirement already satisfied: tiktoken<0.5.0,>=0.4.0 in /usr/local/lib/python3.10/dist-packages (from litellm==0.1.362) (0.4.0)\n",
"Requirement already satisfied: requests>=2.20 in /usr/local/lib/python3.10/dist-packages (from openai<0.28.0,>=0.27.8->litellm==0.1.362) (2.28.2)\n",
"Requirement already satisfied: tqdm in /usr/local/lib/python3.10/dist-packages (from openai<0.28.0,>=0.27.8->litellm==0.1.362) (4.65.0)\n",
"Requirement already satisfied: aiohttp in /usr/local/lib/python3.10/dist-packages (from openai<0.28.0,>=0.27.8->litellm==0.1.362) (3.8.5)\n",
"Requirement already satisfied: regex>=2022.1.18 in /usr/local/lib/python3.10/dist-packages (from tiktoken<0.5.0,>=0.4.0->litellm==0.1.362) (2022.10.31)\n",
"Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.10/dist-packages (from requests>=2.20->openai<0.28.0,>=0.27.8->litellm==0.1.362) (3.2.0)\n",
"Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests>=2.20->openai<0.28.0,>=0.27.8->litellm==0.1.362) (3.4)\n",
"Requirement already satisfied: urllib3<1.27,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests>=2.20->openai<0.28.0,>=0.27.8->litellm==0.1.362) (1.26.16)\n",
"Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests>=2.20->openai<0.28.0,>=0.27.8->litellm==0.1.362) (2023.7.22)\n",
"Requirement already satisfied: attrs>=17.3.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->openai<0.28.0,>=0.27.8->litellm==0.1.362) (23.1.0)\n",
"Requirement already satisfied: multidict<7.0,>=4.5 in /usr/local/lib/python3.10/dist-packages (from aiohttp->openai<0.28.0,>=0.27.8->litellm==0.1.362) (6.0.4)\n",
"Requirement already satisfied: async-timeout<5.0,>=4.0.0a3 in /usr/local/lib/python3.10/dist-packages (from aiohttp->openai<0.28.0,>=0.27.8->litellm==0.1.362) (4.0.2)\n",
"Requirement already satisfied: yarl<2.0,>=1.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->openai<0.28.0,>=0.27.8->litellm==0.1.362) (1.9.2)\n",
"Requirement already satisfied: frozenlist>=1.1.1 in /usr/local/lib/python3.10/dist-packages (from aiohttp->openai<0.28.0,>=0.27.8->litellm==0.1.362) (1.4.0)\n",
"Requirement already satisfied: aiosignal>=1.1.2 in /usr/local/lib/python3.10/dist-packages (from aiohttp->openai<0.28.0,>=0.27.8->litellm==0.1.362) (1.3.1)\n"
]
}
],
"source": [
"!pip install litellm==\"0.1.362\""
]
},
{
"cell_type": "code",
"source": [
"from litellm import completion\n",
"import os\n",
"user_message = \"Hello, whats the weather in San Francisco??\"\n",
"messages = [{ \"content\": user_message,\"role\": \"user\"}]\n",
"\n",
"os.environ['HF_TOKEN'] = \"\"#@param\n",
"# get your hugging face token from here:\n",
"# https://huggingface.co/settings/tokens\n",
"\n",
"# Optional if you want to run OpenAI TOO\n",
"os.environ['OPENAI_API_KEY'] = \"\" #@param\n",
"\n",
"response = completion(\"stabilityai/stablecode-completion-alpha-3b-4k\", messages=messages, hugging_face=True)\n",
"print(\"Response from stabilityai/stablecode-completion-alpha-3b-4k\")\n",
"print(response['choices'][0]['message']['content'])\n",
"print(\"\\n\\n\")\n",
"\n",
"response = completion(\"bigcode/starcoder\", messages=messages, hugging_face=True)\n",
"print(\"Response from bigcode/starcoder\")\n",
"print(response['choices'][0]['message']['content'])\n",
"print(\"\\n\\n\")\n",
"\n",
"response = completion(\"google/flan-t5-xxl\", messages=messages, hugging_face=True)\n",
"print(\"Response from google/flan-t5-xxl\")\n",
"print(response['choices'][0]['message']['content'])\n",
"print(\"\\n\\n\")\n",
"\n",
"response = completion(\"google/flan-t5-large\", messages=messages, hugging_face=True)\n",
"print(\"Response from google/flan-t5-large\")\n",
"print(response['choices'][0]['message']['content'])\n",
"print(\"\\n\\n\")\n",
"\n",
"response = completion(model=\"gpt-3.5-turbo\", messages=messages)\n",
"print(response['choices'][0]['message']['content'])\n",
"print(response)\n"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "vC54VW3jvLnN",
"outputId": "e6616221-12c9-4313-dd03-fd94fa095e8e"
},
"execution_count": 5,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Response from stabilityai/stablecode-completion-alpha-3b-4k\n",
"Hello, whats the weather in San Francisco??\",\n",
" \"id\": 1,\n",
" \"\n",
"\n",
"\n",
"\n",
"Response from bigcode/starcoder\n",
"Hello, whats the weather in San Francisco??\")\n",
"\n",
"# print(response)\n",
"\n",
"# print(response.text)\n",
"\n",
"#\n",
"\n",
"\n",
"\n",
"Response from google/flan-t5-xxl\n",
"a little cold\n",
"\n",
"\n",
"\n",
"Response from google/flan-t5-large\n",
"cool\n",
"\n",
"\n",
"\n",
"I'm sorry, but I am an AI language model and do not have real-time data. However, you can check the weather in San Francisco by searching for \"San Francisco weather\" on a search engine or checking a reliable weather website or app.\n"
]
}
]
}
]
}

View file

@ -34,5 +34,26 @@
| Model Name | Function Call | Required OS Variables |
|------------------|--------------------------------------------|--------------------------------------|
| claude-instant-1 | `completion('claude-instant-1', messages)` | `os.environ['ANTHROPIC_API_KEY']` |
| claude-v2 | `completion('claude-v2', messages)` | `os.environ['ANTHROPIC_API_KEY']` |
| claude-2 | `completion('claude-2', messages)` | `os.environ['ANTHROPIC_API_KEY']` |
### Hugging Face Inference API
All [`text2text-generation`](https://huggingface.co/models?library=transformers&pipeline_tag=text2text-generation&sort=downloads) and [`text-generation`](https://huggingface.co/models?library=transformers&pipeline_tag=text-generation&sort=downloads) models are supported by liteLLM. You can use any text model from Hugging Face with the following steps:
* Copy the `model repo` URL from Hugging Face and set it as the `model` parameter in the completion call.
* Set the `hugging_face` parameter to `True`.
* Make sure to set your Hugging Face API key (a short end-to-end sketch follows the table below).
Here are some examples of supported models:
**Note that the models mentioned in the table are examples, and you can use any text model available on Hugging Face by following the steps above.**
| Model Name | Function Call | Required OS Variables |
|------------------|-------------------------------------------------------------------------------------|--------------------------------------|
| [stabilityai/stablecode-completion-alpha-3b-4k](https://huggingface.co/stabilityai/stablecode-completion-alpha-3b-4k) | `completion(model="stabilityai/stablecode-completion-alpha-3b-4k", messages=messages, hugging_face=True)` | `os.environ['HF_TOKEN']` |
| [bigcode/starcoder](https://huggingface.co/bigcode/starcoder) | `completion(model="bigcode/starcoder", messages=messages, hugging_face=True)` | `os.environ['HF_TOKEN']` |
| [google/flan-t5-xxl](https://huggingface.co/google/flan-t5-xxl) | `completion(model="google/flan-t5-xxl", messages=messages, hugging_face=True)` | `os.environ['HF_TOKEN']` |
| [google/flan-t5-large](https://huggingface.co/google/flan-t5-large) | `completion(model="google/flan-t5-large", messages=messages, hugging_face=True)` | `os.environ['HF_TOKEN']` |
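For convenience, a minimal end-to-end sketch combining the steps above (the model, prompt, and placeholder token are only examples):

```python
import os
from litellm import completion

# get your Hugging Face API token from https://huggingface.co/settings/tokens
os.environ["HF_TOKEN"] = "your-hf-token"

messages = [{ "content": "write some code to find the sum of two numbers","role": "user"}]
response = completion(model="stabilityai/stablecode-completion-alpha-3b-4k", messages=messages, hugging_face=True)
print(response['choices'][0]['message']['content'])
```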

docs/token_usage.md (new file, 45 lines)
View file

@ -0,0 +1,45 @@
# Token Usage
By default LiteLLM returns token usage in all completion requests ([See here](https://litellm.readthedocs.io/en/latest/output/))
However, we also expose 3 public helper functions to calculate token usage across providers:
- `token_counter`: This returns the number of tokens for a given input - it uses the tokenizer based on the model, and defaults to tiktoken if no model-specific tokenizer is available.
- `cost_per_token`: This returns the cost (in USD) for prompt (input) and completion (output) tokens. It utilizes our model_cost map which can be found in `__init__.py` and also as a [community resource](https://github.com/BerriAI/litellm/blob/main/cookbook/community-resources/max_tokens.json).
- `completion_cost`: This returns the overall cost (in USD) for a given LLM API Call. It combines `token_counter` and `cost_per_token` to return the cost for that query (counting both cost of input and output).
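As a rough worked example (an illustration, not from the docs) of what these helpers compute, using the per-token prices for `gpt-3.5-turbo` in this commit's `model_cost` map ($0.0000015 per input token, $0.000002 per output token):

```python
# hypothetical token counts, priced with the gpt-3.5-turbo entries from litellm.model_cost
prompt_tokens = 100
completion_tokens = 50

prompt_cost = prompt_tokens * 0.0000015             # 0.00015 USD
completion_cost_usd = completion_tokens * 0.000002  # 0.0001 USD
print(round(prompt_cost + completion_cost_usd, 6))  # 0.00025 USD total
```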
## Example Usage
1. `token_counter`
```python
from litellm import token_counter
messages = [{"user": "role", "content": "Hey, how's it going"}]
print(token_counter(model="gpt-3.5-turbo", messages=messages))
```
2. `cost_per_token`
```python
from litellm import cost_per_token
prompt_tokens = 5
completion_tokens = 10
prompt_tokens_cost_usd_dollar, completion_tokens_cost_usd_dollar = cost_per_token(model="gpt-3.5-turbo", prompt_tokens=prompt_tokens, completion_tokens=completion_tokens)
print(prompt_tokens_cost_usd_dollar, completion_tokens_cost_usd_dollar)
```
3. `completion_cost`
```python
from litellm import completion_cost
prompt = "Hey, how's it going"
completion = "Hi, I'm gpt - I am doing well"
cost_of_query = completion_cost(model="gpt-3.5-turbo", prompt=prompt, completion=completion)
print(cost_of_query)
```

View file

@ -4,13 +4,34 @@ failure_callback = []
set_verbose=False
telemetry=True
max_tokens = 256 # OpenAI Defaults
retry = True # control tenacity retries.
retry = True
openai_key = None
azure_key = None
anthropic_key = None
replicate_key = None
cohere_key = None
openrouter_key = None
hugging_api_token = None
model_cost = {
"gpt-3.5-turbo": {"max_tokens": 4000, "input_cost_per_token": 0.0000015, "output_cost_per_token": 0.000002},
"gpt-35-turbo": {"max_tokens": 4000, "input_cost_per_token": 0.0000015, "output_cost_per_token": 0.000002}, # azure model name
"gpt-3.5-turbo-0613": {"max_tokens": 4000, "input_cost_per_token": 0.0000015, "output_cost_per_token": 0.000002},
"gpt-3.5-turbo-0301": {"max_tokens": 4000, "input_cost_per_token": 0.0000015, "output_cost_per_token": 0.000002},
"gpt-3.5-turbo-16k": {"max_tokens": 16000, "input_cost_per_token": 0.000003, "output_cost_per_token": 0.000004},
"gpt-35-turbo-16k": {"max_tokens": 16000, "input_cost_per_token": 0.000003, "output_cost_per_token": 0.000004}, # azure model name
"gpt-3.5-turbo-16k-0613": {"max_tokens": 16000, "input_cost_per_token": 0.000003, "output_cost_per_token": 0.000004},
"gpt-4": {"max_tokens": 8000, "input_cost_per_token": 0.000003, "output_cost_per_token": 0.00006},
"gpt-4-0613": {"max_tokens": 8000, "input_cost_per_token": 0.000003, "output_cost_per_token": 0.00006},
"gpt-4-32k": {"max_tokens": 8000, "input_cost_per_token": 0.00006, "output_cost_per_token": 0.00012},
"claude-instant-1": {"max_tokens": 100000, "input_cost_per_token": 0.00000163, "output_cost_per_token": 0.00000551},
"claude-2": {"max_tokens": 100000, "input_cost_per_token": 0.00001102, "output_cost_per_token": 0.00003268},
"text-bison-001": {"max_tokens": 8192, "input_cost_per_token": 0.000004, "output_cost_per_token": 0.000004},
"chat-bison-001": {"max_tokens": 4096, "input_cost_per_token": 0.000002, "output_cost_per_token": 0.000002},
"command-nightly": {"max_tokens": 4096, "input_cost_per_token": 0.000015, "output_cost_per_token": 0.000015},
}
####### THREAD-SPECIFIC DATA ###################
class MyLocal(threading.local):
def __init__(self):
@ -83,7 +104,7 @@ open_ai_embedding_models = [
'text-embedding-ada-002'
]
from .timeout import timeout
from .utils import client, logging, exception_type, get_optional_params, modify_integration
from .utils import client, logging, exception_type, get_optional_params, modify_integration, token_counter, cost_per_token, completion_cost
from .main import * # Import all the symbols from main.py
from .integrations import *
from openai.error import AuthenticationError, InvalidRequestError, RateLimitError, ServiceUnavailableError, OpenAIError

View file

@ -2,7 +2,6 @@
# On success, logs events to Helicone
import dotenv, os
import requests
from anthropic import HUMAN_PROMPT, AI_PROMPT
dotenv.load_dotenv() # Loading env variables using dotenv
import traceback
class HeliconeLogger:
@ -14,6 +13,7 @@ class HeliconeLogger:
self.key = os.getenv('HELICONE_API_KEY')
def claude_mapping(self, model, messages, response_obj):
from anthropic import HUMAN_PROMPT, AI_PROMPT
prompt = f"{HUMAN_PROMPT}"
for message in messages:
if "role" in message:

View file

@ -1,6 +1,5 @@
import os, openai, cohere, replicate, sys
import os, openai, sys
from typing import Any
from anthropic import Anthropic, HUMAN_PROMPT, AI_PROMPT
from functools import partial
import dotenv, traceback, random, asyncio, time
from copy import deepcopy
@ -8,15 +7,9 @@ import litellm
from litellm import client, logging, exception_type, timeout, get_optional_params
import tiktoken
encoding = tiktoken.get_encoding("cl100k_base")
from tenacity import (
retry,
stop_after_attempt,
wait_random_exponential,
) # for exponential backoff
from litellm.utils import get_secret
from litellm.utils import get_secret, install_and_import, CustomStreamWrapper
####### ENVIRONMENT VARIABLES ###################
dotenv.load_dotenv() # Loading env variables using dotenv
new_response = {
"choices": [
{
@ -28,9 +21,7 @@ new_response = {
}
]
}
# TODO move this to utils.py
# TODO add translations
# TODO see if this worked - model_name == krrish
####### COMPLETION ENDPOINTS ################
#############################################
async def acompletion(*args, **kwargs):
@ -52,7 +43,8 @@ def completion(
temperature=1, top_p=1, n=1, stream=False, stop=None, max_tokens=float('inf'),
presence_penalty=0, frequency_penalty=0, logit_bias={}, user="", deployment_id=None,
# Optional liteLLM function params
*, return_async=False, api_key=None, force_timeout=60, azure=False, logger_fn=None, verbose=False,
hugging_face = False, replicate=False,
):
try:
global new_response
@ -61,13 +53,16 @@ def completion(
optional_params = get_optional_params(
functions=functions, function_call=function_call,
temperature=temperature, top_p=top_p, n=n, stream=stream, stop=stop, max_tokens=max_tokens,
presence_penalty=presence_penalty, frequency_penalty=frequency_penalty, logit_bias=logit_bias, user=user, deployment_id=deployment_id,
# params to identify the model
model=model, replicate=replicate, hugging_face=hugging_face
) )
if azure == True:
# azure configs
openai.api_type = "azure"
openai.api_base = litellm.api_base if litellm.api_base is not None else get_secret("AZURE_API_BASE")
openai.api_version = litellm.api_version if litellm.api_version is not None else get_secret("AZURE_API_VERSION")
# set key
if api_key:
openai.api_key = api_key
elif litellm.azure_key:
@ -92,6 +87,7 @@ def completion(
)
elif model in litellm.open_ai_chat_completion_models:
openai.api_type = "openai"
# note: if a user sets a custom base - we should ensure this works
openai.api_base = litellm.api_base if litellm.api_base is not None else "https://api.openai.com/v1"
openai.api_version = None
if litellm.organization:
@ -154,7 +150,10 @@ def completion(
model_response["model"] = model model_response["model"] = model
model_response["usage"] = response["usage"] model_response["usage"] = response["usage"]
response = model_response response = model_response
elif "replicate" in model: elif "replicate" in model or replicate == True:
# import replicate/if it fails then pip install replicate
install_and_import("replicate")
import replicate
# replicate defaults to os.environ.get("REPLICATE_API_TOKEN")
# checking in case user set it to REPLICATE_API_KEY instead
if not get_secret("REPLICATE_API_TOKEN") and get_secret("REPLICATE_API_KEY"):
@ -175,6 +174,11 @@ def completion(
output = replicate.run(
model,
input=input)
if 'stream' in optional_params and optional_params['stream'] == True:
# don't try to access stream object,
# let the stream handler know this is replicate
response = CustomStreamWrapper(output, "replicate")
return response
response = "" response = ""
for item in output: for item in output:
response += item response += item
@ -194,6 +198,10 @@ def completion(
}
response = model_response
elif model in litellm.anthropic_models:
# import anthropic/if it fails then pip install anthropic
install_and_import("anthropic")
from anthropic import Anthropic, HUMAN_PROMPT, AI_PROMPT
#anthropic defaults to os.environ.get("ANTHROPIC_API_KEY")
if api_key:
os.environ["ANTHROPIC_API_KEY"] = api_key
@ -220,8 +228,14 @@ def completion(
completion = anthropic.completions.create(
model=model,
prompt=prompt,
max_tokens_to_sample=max_tokens_to_sample,
**optional_params
)
if 'stream' in optional_params and optional_params['stream'] == True:
# don't try to access stream object,
response = CustomStreamWrapper(completion, model)
return response
completion_response = completion.completion
## LOGGING
logging(model=model, input=prompt, azure=azure, additional_args={"max_tokens": max_tokens, "original_response": completion_response}, logger_fn=logger_fn)
@ -274,6 +288,9 @@ def completion(
**optional_params
)
elif model in litellm.cohere_models:
# import cohere/if it fails then pip install cohere
install_and_import("cohere")
import cohere
if api_key:
cohere_key = api_key
elif litellm.cohere_key:
@ -287,8 +304,14 @@ def completion(
## COMPLETION CALL
response = co.generate(
model=model,
prompt = prompt,
**optional_params
)
if 'stream' in optional_params and optional_params['stream'] == True:
# don't try to access stream object,
response = CustomStreamWrapper(response, model)
return response
completion_response = response[0].text
## LOGGING
logging(model=model, input=prompt, azure=azure, additional_args={"max_tokens": max_tokens, "original_response": completion_response}, logger_fn=logger_fn)
@ -304,6 +327,33 @@ def completion(
"total_tokens": prompt_tokens + completion_tokens "total_tokens": prompt_tokens + completion_tokens
} }
response = model_response response = model_response
elif hugging_face == True:
import requests
API_URL = f"https://api-inference.huggingface.co/models/{model}"
HF_TOKEN = get_secret("HF_TOKEN")
headers = {"Authorization": f"Bearer {HF_TOKEN}"}
prompt = " ".join([message["content"] for message in messages])
## LOGGING
logging(model=model, input=prompt, azure=azure, logger_fn=logger_fn)
input_payload = {"inputs": prompt}
response = requests.post(API_URL, headers=headers, json=input_payload)
completion_response = response.json()[0]['generated_text']
## LOGGING
logging(model=model, input=prompt, azure=azure, additional_args={"max_tokens": max_tokens, "original_response": completion_response}, logger_fn=logger_fn)
prompt_tokens = len(encoding.encode(prompt))
completion_tokens = len(encoding.encode(completion_response))
## RESPONSE OBJECT
model_response["choices"][0]["message"]["content"] = completion_response
model_response["created"] = time.time()
model_response["model"] = model
model_response["usage"] = {
"prompt_tokens": prompt_tokens,
"completion_tokens": completion_tokens,
"total_tokens": prompt_tokens + completion_tokens
}
response = model_response
else:
## LOGGING
logging(model=model, input=messages, azure=azure, logger_fn=logger_fn)

View file

@ -1 +0,0 @@
test 1

View file

@ -7,8 +7,10 @@ sys.path.insert(0, os.path.abspath('../..')) # Adds the parent directory to the
import pytest
import litellm
from litellm import embedding, completion
from infisical import InfisicalClient
# litellm.set_verbose = True
litellm.secret_manager_client = InfisicalClient(token=os.environ["INFISICAL_TOKEN"])
user_message = "Hello, whats the weather in San Francisco??"
messages = [{ "content": user_message,"role": "user"}]
@ -16,6 +18,59 @@ messages = [{ "content": user_message,"role": "user"}]
def logger_fn(user_model_dict):
print(f"user_model_dict: {user_model_dict}")
def test_completion_claude():
try:
response = completion(model="claude-instant-1", messages=messages, logger_fn=logger_fn)
# Add any assertions here to check the response
print(response)
except Exception as e:
pytest.fail(f"Error occurred: {e}")
def test_completion_claude_stream():
try:
messages = [
{"role": "system", "content": "You are a helpful assistant."},
{"role": "user", "content": "how does a court case get to the Supreme Court?"}
]
response = completion(model="claude-2", messages=messages, stream=True)
# Add any assertions here to check the response
for chunk in response:
print(chunk['choices'][0]['delta']) # same as openai format
except Exception as e:
pytest.fail(f"Error occurred: {e}")
def test_completion_hf_api():
try:
user_message = "write some code to find the sum of two numbers"
messages = [{ "content": user_message,"role": "user"}]
response = completion(model="stabilityai/stablecode-completion-alpha-3b-4k", messages=messages, hugging_face=True)
# Add any assertions here to check the response
print(response)
except Exception as e:
pytest.fail(f"Error occurred: {e}")
def test_completion_cohere():
try:
response = completion(model="command-nightly", messages=messages, max_tokens=500)
# Add any assertions here to check the response
print(response)
except Exception as e:
pytest.fail(f"Error occurred: {e}")
def test_completion_cohere_stream():
try:
messages = [
{"role": "system", "content": "You are a helpful assistant."},
{"role": "user", "content": "how does a court case get to the Supreme Court?"}
]
response = completion(model="command-nightly", messages=messages, stream=True, max_tokens=50)
# Add any assertions here to check the response
for chunk in response:
print(chunk['choices'][0]['delta']) # same as openai format
except Exception as e:
pytest.fail(f"Error occurred: {e}")
def test_completion_openai():
try:
response = completion(model="gpt-3.5-turbo", messages=messages)
@ -92,18 +147,25 @@ def test_completion_azure():
except Exception as e:
pytest.fail(f"Error occurred: {e}")
def test_completion_claude():
# Replicate API endpoints are unstable -> throw random CUDA errors -> this means our tests can fail even if our tests weren't incorrect.
def test_completion_replicate_llama_stream():
model_name = "replicate/llama-2-70b-chat:2c1608e18606fad2812020dc541930f2d0495ce32eee50074220b87300bc16e1"
try:
response = completion(model="claude-instant-1", messages=messages, logger_fn=logger_fn)
response = completion(model=model_name, messages=messages, stream=True)
# Add any assertions here to check the response
for result in response:
print(result)
print(response)
except Exception as e:
pytest.fail(f"Error occurred: {e}")
def test_completion_cohere():
def test_completion_replicate_stability_stream():
model_name = "stability-ai/stablelm-tuned-alpha-7b:c49dae362cbaecd2ceabb5bd34fdb68413c4ff775111fea065d259d577757beb"
try:
response = completion(model="command-nightly", messages=messages, max_tokens=500)
response = completion(model=model_name, messages=messages, stream=True, replicate=True)
# Add any assertions here to check the response
for result in response:
print(result)
print(response)
except Exception as e:
pytest.fail(f"Error occurred: {e}")
@ -124,3 +186,14 @@ def test_completion_cohere():
# pass
# else:
# pytest.fail(f"Error occurred: {e}")
def test_completion_replicate_stability():
model_name = "stability-ai/stablelm-tuned-alpha-7b:c49dae362cbaecd2ceabb5bd34fdb68413c4ff775111fea065d259d577757beb"
try:
response = completion(model=model_name, messages=messages, replicate=True)
# Add any assertions here to check the response
for result in response:
print(result)
print(response)
except Exception as e:
pytest.fail(f"Error occurred: {e}")

View file

@ -0,0 +1,20 @@
import sys, os
import traceback
import pytest
sys.path.insert(0, os.path.abspath('../..')) # Adds the parent directory to the system path
import litellm
from litellm import embedding, completion
from infisical import InfisicalClient
# litellm.set_verbose = True
litellm.secret_manager_client = InfisicalClient(token=os.environ["INFISICAL_TOKEN"])
def test_openai_embedding():
try:
response = embedding(model='text-embedding-ada-002', input=["good morning from litellm"])
# Add any assertions here to check the response
print(f"response: {str(response)}")
except Exception as e:
pytest.fail(f"Error occurred: {e}")

View file

@ -4,7 +4,6 @@ import subprocess, os
import litellm, openai
import random, uuid, requests
import datetime, time
from anthropic import Anthropic
import tiktoken
encoding = tiktoken.get_encoding("cl100k_base")
from .integrations.helicone import HeliconeLogger
@ -34,6 +33,19 @@ def print_verbose(print_statement):
if random.random() <= 0.3:
print("Get help - https://discord.com/invite/wuPM9dRgDw")
####### Package Import Handler ###################
import importlib
import subprocess
def install_and_import(package):
try:
importlib.import_module(package)
except ImportError:
print(f"{package} is not installed. Installing...")
subprocess.call([sys.executable, '-m', 'pip', 'install', package])
finally:
globals()[package] = importlib.import_module(package)
##################################################
####### LOGGING ###################
#Logging function -> log the exact model details + what's being sent | Non-Blocking
def logging(model=None, input=None, azure=False, additional_args={}, logger_fn=None, exception=None):
@ -119,6 +131,51 @@ def client(original_function):
raise e
return wrapper
####### USAGE CALCULATOR ################
def token_counter(model, text):
# use tiktoken or anthropic's tokenizer depending on the model
num_tokens = 0
if "claude" in model:
install_and_import('anthropic')
from anthropic import Anthropic, HUMAN_PROMPT, AI_PROMPT
anthropic = Anthropic()
num_tokens = anthropic.count_tokens(text)
else:
num_tokens = len(encoding.encode(text))
return num_tokens
def cost_per_token(model="gpt-3.5-turbo", prompt_tokens = 0, completion_tokens = 0):
## given
prompt_tokens_cost_usd_dollar = 0
completion_tokens_cost_usd_dollar = 0
model_cost_ref = litellm.model_cost
if model in model_cost_ref:
prompt_tokens_cost_usd_dollar = model_cost_ref[model]["input_cost_per_token"] * prompt_tokens
completion_tokens_cost_usd_dollar = model_cost_ref[model]["output_cost_per_token"] * completion_tokens
return prompt_tokens_cost_usd_dollar, completion_tokens_cost_usd_dollar
else:
# calculate average input cost
input_cost_sum = 0
output_cost_sum = 0
model_cost_ref = litellm.model_cost
for model in model_cost_ref:
input_cost_sum += model_cost_ref[model]["input_cost_per_token"]
output_cost_sum += model_cost_ref[model]["output_cost_per_token"]
avg_input_cost = input_cost_sum / len(model_cost_ref.keys())
avg_output_cost = output_cost_sum / len(model_cost_ref.keys())
prompt_tokens_cost_usd_dollar = avg_input_cost * prompt_tokens
completion_tokens_cost_usd_dollar = avg_output_cost * completion_tokens
return prompt_tokens_cost_usd_dollar, completion_tokens_cost_usd_dollar
def completion_cost(model="gpt-3.5-turbo", prompt="", completion=""):
prompt_tokens = token_counter(model=model, text=prompt)
completion_tokens = token_counter(model=model, text=completion)
prompt_tokens_cost_usd_dollar, completion_tokens_cost_usd_dollar = cost_per_token(model=model, prompt_tokens = prompt_tokens, completion_tokens = completion_tokens)
return prompt_tokens_cost_usd_dollar + completion_tokens_cost_usd_dollar
####### HELPER FUNCTIONS ################
def get_optional_params(
# 12 optional params
@ -134,35 +191,66 @@ def get_optional_params(
frequency_penalty = 0,
logit_bias = {},
user = "",
deployment_id = None,
model = None,
replicate = False,
hugging_face = False,
):
optional_params = {}
if functions != []:
optional_params["functions"] = functions
if function_call != "":
optional_params["function_call"] = function_call
if temperature != 1:
optional_params["temperature"] = temperature
if top_p != 1:
optional_params["top_p"] = top_p
if n != 1:
optional_params["n"] = n
if stream:
optional_params["stream"] = stream
if stop != None:
optional_params["stop"] = stop
if max_tokens != float('inf'):
optional_params["max_tokens"] = max_tokens
if presence_penalty != 0:
optional_params["presence_penalty"] = presence_penalty
if frequency_penalty != 0:
optional_params["frequency_penalty"] = frequency_penalty
if logit_bias != {}:
optional_params["logit_bias"] = logit_bias
if user != "":
optional_params["user"] = user
if deployment_id != None:
optional_params["deployment_id"] = deployment_id
if model in litellm.anthropic_models:
# handle anthropic params
if stream:
optional_params["stream"] = stream
if stop != None:
optional_params["stop_sequences"] = stop
if temperature != 1:
optional_params["temperature"] = temperature
if top_p != 1:
optional_params["top_p"] = top_p
return optional_params
elif model in litellm.cohere_models:
# handle cohere params
if stream:
optional_params["stream"] = stream
if temperature != 1:
optional_params["temperature"] = temperature
if max_tokens != float('inf'):
optional_params["max_tokens"] = max_tokens
return optional_params
elif replicate == True:
# any replicate models
# TODO: handle translating remaining replicate params
if stream:
optional_params["stream"] = stream
return optional_params
else:# assume passing in params for openai/azure openai
if functions != []:
optional_params["functions"] = functions
if function_call != "":
optional_params["function_call"] = function_call
if temperature != 1:
optional_params["temperature"] = temperature
if top_p != 1:
optional_params["top_p"] = top_p
if n != 1:
optional_params["n"] = n
if stream:
optional_params["stream"] = stream
if stop != None:
optional_params["stop"] = stop
if max_tokens != float('inf'):
optional_params["max_tokens"] = max_tokens
if presence_penalty != 0:
optional_params["presence_penalty"] = presence_penalty
if frequency_penalty != 0:
optional_params["frequency_penalty"] = frequency_penalty
if logit_bias != {}:
optional_params["logit_bias"] = logit_bias
if user != "":
optional_params["user"] = user
if deployment_id != None:
optional_params["deployment_id"] = deployment_id
return optional_params
return optional_params
def set_callbacks(callback_list):
@ -324,19 +412,6 @@ def handle_failure(exception, traceback_exception, start_time, end_time, args, k
logging(logger_fn=user_logger_fn, exception=e)
pass
def prompt_token_calculator(model, messages):
# use tiktoken or anthropic's tokenizer depending on the model
text = " ".join(message["content"] for message in messages)
num_tokens = 0
if "claude" in model:
anthropic = Anthropic()
num_tokens = anthropic.count_tokens(text)
else:
num_tokens = len(encoding.encode(text))
return num_tokens
def handle_success(args, kwargs, result, start_time, end_time):
global heliconeLogger, aispendLogger
try:
@ -396,6 +471,19 @@ def handle_success(args, kwargs, result, start_time, end_time):
print_verbose(f"[Non-Blocking] Success Callback Error - {traceback.format_exc()}") print_verbose(f"[Non-Blocking] Success Callback Error - {traceback.format_exc()}")
pass pass
def prompt_token_calculator(model, messages):
# use tiktoken or anthropic's tokenizer depending on the model
text = " ".join(message["content"] for message in messages)
num_tokens = 0
if "claude" in model:
install_and_import('anthropic')
from anthropic import Anthropic, HUMAN_PROMPT, AI_PROMPT
anthropic = Anthropic()
num_tokens = anthropic.count_tokens(text)
else:
num_tokens = len(encoding.encode(text))
return num_tokens
# integration helper function
def modify_integration(integration_name, integration_params):
global supabaseClient
@ -520,3 +608,30 @@ def get_secret(secret_name):
return os.environ.get(secret_name)
else:
return os.environ.get(secret_name)
######## Streaming Class ############################
# wraps the completion stream to return the correct format for the model
# replicate/anthropic/cohere
class CustomStreamWrapper:
def __init__(self, completion_stream, model):
self.model = model
if model in litellm.cohere_models:
# cohere does not return an iterator, so we need to wrap it in one
self.completion_stream = iter(completion_stream)
else:
self.completion_stream = completion_stream
def __iter__(self):
return self
def __next__(self):
if self.model in litellm.anthropic_models:
chunk = next(self.completion_stream)
return {"choices": [{"delta": chunk.completion}]}
elif self.model == "replicate":
chunk = next(self.completion_stream)
return {"choices": [{"delta": chunk}]}
elif self.model in litellm.cohere_models:
chunk = next(self.completion_stream)
return {"choices": [{"delta": chunk.text}]}

View file

@ -6,6 +6,8 @@ nav:
- Input - Request Body: input.md
- Output - Response Object: output.md
- Streaming & Async Calls: stream.md
- token usage:
- Helper Functions: token_usage.md
- 🤖 Supported LLM APIs:
- Supported Completion & Chat APIs: supported.md
- Supported Embedding APIs: supported_embedding.md

View file

@ -1,6 +1,6 @@
[tool.poetry]
name = "litellm"
version = "0.1.356"
version = "0.1.367"
description = "Library to easily interface with LLM API providers"
authors = ["BerriAI"]
license = "MIT License"
@ -8,14 +8,8 @@ readme = "README.md"
[tool.poetry.dependencies]
python = "^3.8"
openai = {extras = ["datalib"], version = "^0.27.8"}
openai = "^0.27.8"
cohere = "^4.18.0"
pytest = "^7.4.0"
pydantic = "^2.1.1"
anthropic = "^0.3.7"
replicate = "^0.10.0"
python-dotenv = "^1.0.0" python-dotenv = "^1.0.0"
tenacity = "^8.0.1"
tiktoken = "^0.4.0" tiktoken = "^0.4.0"
[build-system] [build-system]

View file

@ -1,11 +1,5 @@
pydantic
# used by CI/CD testing
openai
cohere
anthropic
replicate
pytest
python-dotenv
openai[datalib]
openai
tenacity
tiktoken
infisical