Merge branch 'main' into main

This commit is contained in:
Krish Dholakia 2023-08-09 11:00:40 -07:00 committed by GitHub
commit 4278b183d0
18 changed files with 1000 additions and 102 deletions

.DS_Store (binary file, not shown)

View file

@ -13,6 +13,9 @@ jobs:
command: |
python -m pip install --upgrade pip
python -m pip install -r requirements.txt
pip install infisical
pip install pytest
pip install openai[datalib]
# Run pytest and generate JUnit XML report
- run:

View file

@ -3,18 +3,16 @@
[![PyPI Version](https://img.shields.io/badge/stable%20version-v0.1.345-blue?color=green&link=https://pypi.org/project/litellm/0.1.1/)](https://pypi.org/project/litellm/0.1.1/)
[![CircleCI](https://dl.circleci.com/status-badge/img/gh/BerriAI/litellm/tree/main.svg?style=svg)](https://dl.circleci.com/status-badge/redirect/gh/BerriAI/litellm/tree/main)
![Downloads](https://img.shields.io/pypi/dm/litellm)
[![litellm](https://img.shields.io/badge/%20%F0%9F%9A%85%20liteLLM-OpenAI%7CAzure%7CAnthropic%7CPalm%7CCohere-blue?color=green)](https://github.com/BerriAI/litellm)
[![litellm](https://img.shields.io/badge/%20%F0%9F%9A%85%20liteLLM-OpenAI%7CAzure%7CAnthropic%7CPalm%7CCohere%7CReplicate%7CHugging%20Face-blue?color=green)](https://github.com/BerriAI/litellm)
Get Support / Join the community 👉 [![](https://dcbadge.vercel.app/api/server/wuPM9dRgDw)](https://discord.gg/wuPM9dRgDw)
[![](https://dcbadge.vercel.app/api/server/wuPM9dRgDw)](https://discord.gg/wuPM9dRgDw)
a simple & light package to call OpenAI, Azure, Cohere, Anthropic API Endpoints
litellm manages:
- translating inputs to completion and embedding endpoints
- guarantees consistent output, text responses will always be available at `['choices'][0]['message']['content']`
a light package to simplify calling OpenAI, Azure, Cohere, Anthropic, and Hugging Face API endpoints. It manages:
- translating inputs to the provider's completion and embedding endpoints
- guarantees [consistent output](https://litellm.readthedocs.io/en/latest/output/): text responses are always available at `['choices'][0]['message']['content']`
- exception mapping - common exceptions across providers are mapped to the [OpenAI exception types](https://help.openai.com/en/articles/6897213-openai-library-error-types-guidance)
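As a quick, hedged illustration of what this mapping enables (assuming the `openai.error` classes re-exported by `litellm/__init__.py` later in this commit; the model name is just an example):

```python
from openai.error import AuthenticationError, RateLimitError, OpenAIError
from litellm import completion

messages = [{ "content": "Hello, how are you?","role": "user"}]
try:
    # provider-specific failures (OpenAI, Azure, Anthropic, Cohere, ...) surface as OpenAI-style exceptions
    response = completion("claude-instant-1", messages)
except (AuthenticationError, RateLimitError) as e:
    print(f"mapped provider error: {e}")
except OpenAIError as e:
    print(f"other mapped error: {e}")
```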
# usage
Demo - https://litellm.ai/ \
Read the docs - https://litellm.readthedocs.io/en/latest/
## quick start
@ -25,11 +23,6 @@ pip install litellm
```python
from litellm import completion
## set ENV variables
# ENV variables can be set in .env file, too. Example in .env.example
os.environ["OPENAI_API_KEY"] = "openai key"
os.environ["COHERE_API_KEY"] = "cohere key"
messages = [{ "content": "Hello, how are you?","role": "user"}]
# openai call
@ -41,6 +34,9 @@ response = completion("command-nightly", messages)
# azure openai call
response = completion("chatgpt-test", messages, azure=True)
# hugging face call
response = completion(model="stabilityai/stablecode-completion-alpha-3b-4k", messages=messages, hugging_face=True)
# openrouter call
response = completion("google/palm-2-codechat-bison", messages)
```
@ -53,17 +49,23 @@ pip install litellm==0.1.345
## Streaming Queries
liteLLM supports streaming the model response back; pass `stream=True` to get a streaming iterator in response.
Streaming is supported for OpenAI, Azure, Anthropic models
```python
response = completion(model="gpt-3.5-turbo", messages=messages, stream=True)
for chunk in response:
print(chunk['choices'][0]['delta'])
# claude 2
result = completion('claude-2', messages, stream=True)
for chunk in result:
print(chunk['choices'][0]['delta'])
```
# hosted version
# support / talk with founders
- [Grab time if you want access 👋](https://calendly.com/d/4mp-gd3-k5k/berriai-1-1-onboarding-litellm-hosted-version)
- [Our calendar 👋](https://calendly.com/d/4mp-gd3-k5k/berriai-1-1-onboarding-litellm-hosted-version)
- [Community Discord 💭](https://discord.gg/wuPM9dRgDw)
- Our numbers 📞 +1 (770) 8783-106 / +1 (412) 618-6238
- Our emails ✉️ ishaan@berri.ai / krrish@berri.ai
# why did we build this
- **Need for simplicity**: Our code started to get extremely complicated managing & translating calls between Azure, OpenAI, Cohere
# Support
Contact us at ishaan@berri.ai / krrish@berri.ai

View file

@ -0,0 +1,406 @@
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"provenance": []
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"language_info": {
"name": "python"
}
},
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "ZwuaylskLxFu",
"outputId": "d684d6a3-32fe-4beb-c378-c39134bcf8cc"
},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Collecting litellm==0.1.363\n",
" Downloading litellm-0.1.363-py3-none-any.whl (34 kB)\n",
"Requirement already satisfied: openai<0.28.0,>=0.27.8 in /usr/local/lib/python3.10/dist-packages (from litellm==0.1.363) (0.27.8)\n",
"Requirement already satisfied: python-dotenv<2.0.0,>=1.0.0 in /usr/local/lib/python3.10/dist-packages (from litellm==0.1.363) (1.0.0)\n",
"Requirement already satisfied: tiktoken<0.5.0,>=0.4.0 in /usr/local/lib/python3.10/dist-packages (from litellm==0.1.363) (0.4.0)\n",
"Requirement already satisfied: requests>=2.20 in /usr/local/lib/python3.10/dist-packages (from openai<0.28.0,>=0.27.8->litellm==0.1.363) (2.31.0)\n",
"Requirement already satisfied: tqdm in /usr/local/lib/python3.10/dist-packages (from openai<0.28.0,>=0.27.8->litellm==0.1.363) (4.65.0)\n",
"Requirement already satisfied: aiohttp in /usr/local/lib/python3.10/dist-packages (from openai<0.28.0,>=0.27.8->litellm==0.1.363) (3.8.5)\n",
"Requirement already satisfied: regex>=2022.1.18 in /usr/local/lib/python3.10/dist-packages (from tiktoken<0.5.0,>=0.4.0->litellm==0.1.363) (2022.10.31)\n",
"Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.10/dist-packages (from requests>=2.20->openai<0.28.0,>=0.27.8->litellm==0.1.363) (3.2.0)\n",
"Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests>=2.20->openai<0.28.0,>=0.27.8->litellm==0.1.363) (3.4)\n",
"Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests>=2.20->openai<0.28.0,>=0.27.8->litellm==0.1.363) (1.26.16)\n",
"Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests>=2.20->openai<0.28.0,>=0.27.8->litellm==0.1.363) (2023.7.22)\n",
"Requirement already satisfied: attrs>=17.3.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->openai<0.28.0,>=0.27.8->litellm==0.1.363) (23.1.0)\n",
"Requirement already satisfied: multidict<7.0,>=4.5 in /usr/local/lib/python3.10/dist-packages (from aiohttp->openai<0.28.0,>=0.27.8->litellm==0.1.363) (6.0.4)\n",
"Requirement already satisfied: async-timeout<5.0,>=4.0.0a3 in /usr/local/lib/python3.10/dist-packages (from aiohttp->openai<0.28.0,>=0.27.8->litellm==0.1.363) (4.0.2)\n",
"Requirement already satisfied: yarl<2.0,>=1.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->openai<0.28.0,>=0.27.8->litellm==0.1.363) (1.9.2)\n",
"Requirement already satisfied: frozenlist>=1.1.1 in /usr/local/lib/python3.10/dist-packages (from aiohttp->openai<0.28.0,>=0.27.8->litellm==0.1.363) (1.4.0)\n",
"Requirement already satisfied: aiosignal>=1.1.2 in /usr/local/lib/python3.10/dist-packages (from aiohttp->openai<0.28.0,>=0.27.8->litellm==0.1.363) (1.3.1)\n",
"Installing collected packages: litellm\n",
" Attempting uninstall: litellm\n",
" Found existing installation: litellm 0.1.362\n",
" Uninstalling litellm-0.1.362:\n",
" Successfully uninstalled litellm-0.1.362\n",
"Successfully installed litellm-0.1.363\n"
]
}
],
"source": [
"!pip install litellm==\"0.1.363\""
]
},
{
"cell_type": "code",
"source": [
"# @title Import litellm & Set env variables\n",
"import litellm\n",
"import os\n",
"\n",
"os.environ[\"ANTHROPIC_API_KEY\"] = \" \" #@param"
],
"metadata": {
"id": "W216G__XL19Q"
},
"execution_count": 2,
"outputs": []
},
{
"cell_type": "code",
"source": [
"# @title Request Claude Instant-1 and Claude-2\n",
"messages = [\n",
" {\"role\": \"system\", \"content\": \"You are a helpful assistant.\"},\n",
" {\"role\": \"user\", \"content\": \"Who won the world series in 2020?\"}\n",
" ]\n",
"\n",
"result = litellm.completion('claude-instant-1', messages)\n",
"print(\"\\n\\n Result from claude-instant-1\", result)\n",
"result = litellm.completion('claude-2', messages, max_tokens=5, temperature=0.2)\n",
"print(\"\\n\\n Result from claude-2\", result)"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "ff1lKwUMMLJj",
"outputId": "bfddf6f8-36d4-45e5-92dc-349083fa41b8"
},
"execution_count": 9,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"\n",
"\n",
" Result from claude-instant-1 {'choices': [{'finish_reason': 'stop', 'index': 0, 'message': {'role': 'assistant', 'content': \" The Los Angeles Dodgers won the 2020 World Series, defeating the Tampa Bay Rays 4-2. It was the Dodgers' first World Series title since 1988.\"}}], 'created': 1691536677.2676156, 'model': 'claude-instant-1', 'usage': {'prompt_tokens': 30, 'completion_tokens': 32, 'total_tokens': 62}}\n",
"\n",
"\n",
" Result from claude-2 {'choices': [{'finish_reason': 'stop', 'index': 0, 'message': {'role': 'assistant', 'content': ' The Los Angeles Dodgers won'}}], 'created': 1691536677.944753, 'model': 'claude-2', 'usage': {'prompt_tokens': 30, 'completion_tokens': 5, 'total_tokens': 35}}\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"# @title Streaming Example: Request Claude-2\n",
"messages = [\n",
" {\"role\": \"system\", \"content\": \"You are a helpful assistant.\"},\n",
" {\"role\": \"user\", \"content\": \"how does a court case get to the Supreme Court?\"}\n",
" ]\n",
"\n",
"result = litellm.completion('claude-2', messages, stream=True)\n",
"for chunk in result:\n",
" print(chunk['choices'][0]['delta'])\n",
"\n"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "06hWKnNQMrV-",
"outputId": "7fdec0eb-d4a9-4882-f9c4-987ff9a31114"
},
"execution_count": 7,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
" Here\n",
"'s\n",
" a\n",
" quick\n",
" overview\n",
" of\n",
" how\n",
" a\n",
" court\n",
" case\n",
" can\n",
" reach\n",
" the\n",
" U\n",
".\n",
"S\n",
".\n",
" Supreme\n",
" Court\n",
":\n",
"\n",
"\n",
"-\n",
" The\n",
" case\n",
" must\n",
" first\n",
" be\n",
" heard\n",
" in\n",
" a\n",
" lower\n",
" trial\n",
" court\n",
" (\n",
"either\n",
" a\n",
" state\n",
" court\n",
" or\n",
" federal\n",
" district\n",
" court\n",
").\n",
" The\n",
" trial\n",
" court\n",
" makes\n",
" initial\n",
" r\n",
"ulings\n",
" and\n",
" produces\n",
" a\n",
" record\n",
" of\n",
" the\n",
" case\n",
".\n",
"\n",
"\n",
"-\n",
" The\n",
" losing\n",
" party\n",
" can\n",
" appeal\n",
" the\n",
" decision\n",
" to\n",
" an\n",
" appeals\n",
" court\n",
" (\n",
"a\n",
" state\n",
" appeals\n",
" court\n",
" for\n",
" state\n",
" cases\n",
",\n",
" or\n",
" a\n",
" federal\n",
" circuit\n",
" court\n",
" for\n",
" federal\n",
" cases\n",
").\n",
" The\n",
" appeals\n",
" court\n",
" reviews\n",
" the\n",
" trial\n",
" court\n",
"'s\n",
" r\n",
"ulings\n",
" and\n",
" can\n",
" affirm\n",
",\n",
" reverse\n",
",\n",
" or\n",
" modify\n",
" the\n",
" decision\n",
".\n",
"\n",
"\n",
"-\n",
" If\n",
" a\n",
" party\n",
" is\n",
" still\n",
" unsat\n",
"isf\n",
"ied\n",
" after\n",
" the\n",
" appeals\n",
" court\n",
" rules\n",
",\n",
" they\n",
" can\n",
" petition\n",
" the\n",
" Supreme\n",
" Court\n",
" to\n",
" hear\n",
" the\n",
" case\n",
" through\n",
" a\n",
" writ\n",
" of\n",
" cert\n",
"ior\n",
"ari\n",
".\n",
" \n",
"\n",
"\n",
"-\n",
" The\n",
" Supreme\n",
" Court\n",
" gets\n",
" thousands\n",
" of\n",
" cert\n",
" petitions\n",
" every\n",
" year\n",
" but\n",
" usually\n",
" only\n",
" agrees\n",
" to\n",
" hear\n",
" about\n",
" 100\n",
"-\n",
"150\n",
" of\n",
" cases\n",
" that\n",
" have\n",
" significant\n",
" national\n",
" importance\n",
" or\n",
" where\n",
" lower\n",
" courts\n",
" disagree\n",
" on\n",
" federal\n",
" law\n",
".\n",
" \n",
"\n",
"\n",
"-\n",
" If\n",
" 4\n",
" out\n",
" of\n",
" the\n",
" 9\n",
" Just\n",
"ices\n",
" vote\n",
" to\n",
" grant\n",
" cert\n",
" (\n",
"agree\n",
" to\n",
" hear\n",
" the\n",
" case\n",
"),\n",
" it\n",
" goes\n",
" on\n",
" the\n",
" Supreme\n",
" Court\n",
"'s\n",
" do\n",
"cket\n",
" for\n",
" arguments\n",
".\n",
"\n",
"\n",
"-\n",
" The\n",
" Supreme\n",
" Court\n",
" then\n",
" hears\n",
" oral\n",
" arguments\n",
",\n",
" considers\n",
" written\n",
" brief\n",
"s\n",
",\n",
" examines\n",
" the\n",
" lower\n",
" court\n",
" records\n",
",\n",
" and\n",
" issues\n",
" a\n",
" final\n",
" ruling\n",
" on\n",
" the\n",
" case\n",
",\n",
" which\n",
" serves\n",
" as\n",
" binding\n",
" precedent\n"
]
}
]
}
]
}

View file

@ -0,0 +1,153 @@
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"provenance": []
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"language_info": {
"name": "python"
}
},
"cells": [
{
"cell_type": "markdown",
"source": [
"## Install liteLLM https://github.com/BerriAI/litellm\n",
"liteLLM provides one interface to call gpt 3.5, hugging face inference endpoints"
],
"metadata": {
"id": "IGQZtR61AZSd"
}
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "x_4jcmmXcdm-",
"outputId": "c89e7817-561d-4867-904b-aa1634565cbb"
},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Requirement already satisfied: litellm==0.1.362 in /usr/local/lib/python3.10/dist-packages (0.1.362)\n",
"Requirement already satisfied: openai<0.28.0,>=0.27.8 in /usr/local/lib/python3.10/dist-packages (from litellm==0.1.362) (0.27.8)\n",
"Requirement already satisfied: python-dotenv<2.0.0,>=1.0.0 in /usr/local/lib/python3.10/dist-packages (from litellm==0.1.362) (1.0.0)\n",
"Requirement already satisfied: tiktoken<0.5.0,>=0.4.0 in /usr/local/lib/python3.10/dist-packages (from litellm==0.1.362) (0.4.0)\n",
"Requirement already satisfied: requests>=2.20 in /usr/local/lib/python3.10/dist-packages (from openai<0.28.0,>=0.27.8->litellm==0.1.362) (2.28.2)\n",
"Requirement already satisfied: tqdm in /usr/local/lib/python3.10/dist-packages (from openai<0.28.0,>=0.27.8->litellm==0.1.362) (4.65.0)\n",
"Requirement already satisfied: aiohttp in /usr/local/lib/python3.10/dist-packages (from openai<0.28.0,>=0.27.8->litellm==0.1.362) (3.8.5)\n",
"Requirement already satisfied: regex>=2022.1.18 in /usr/local/lib/python3.10/dist-packages (from tiktoken<0.5.0,>=0.4.0->litellm==0.1.362) (2022.10.31)\n",
"Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.10/dist-packages (from requests>=2.20->openai<0.28.0,>=0.27.8->litellm==0.1.362) (3.2.0)\n",
"Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests>=2.20->openai<0.28.0,>=0.27.8->litellm==0.1.362) (3.4)\n",
"Requirement already satisfied: urllib3<1.27,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests>=2.20->openai<0.28.0,>=0.27.8->litellm==0.1.362) (1.26.16)\n",
"Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests>=2.20->openai<0.28.0,>=0.27.8->litellm==0.1.362) (2023.7.22)\n",
"Requirement already satisfied: attrs>=17.3.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->openai<0.28.0,>=0.27.8->litellm==0.1.362) (23.1.0)\n",
"Requirement already satisfied: multidict<7.0,>=4.5 in /usr/local/lib/python3.10/dist-packages (from aiohttp->openai<0.28.0,>=0.27.8->litellm==0.1.362) (6.0.4)\n",
"Requirement already satisfied: async-timeout<5.0,>=4.0.0a3 in /usr/local/lib/python3.10/dist-packages (from aiohttp->openai<0.28.0,>=0.27.8->litellm==0.1.362) (4.0.2)\n",
"Requirement already satisfied: yarl<2.0,>=1.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->openai<0.28.0,>=0.27.8->litellm==0.1.362) (1.9.2)\n",
"Requirement already satisfied: frozenlist>=1.1.1 in /usr/local/lib/python3.10/dist-packages (from aiohttp->openai<0.28.0,>=0.27.8->litellm==0.1.362) (1.4.0)\n",
"Requirement already satisfied: aiosignal>=1.1.2 in /usr/local/lib/python3.10/dist-packages (from aiohttp->openai<0.28.0,>=0.27.8->litellm==0.1.362) (1.3.1)\n"
]
}
],
"source": [
"!pip install litellm==\"0.1.362\""
]
},
{
"cell_type": "code",
"source": [
"from litellm import completion\n",
"import os\n",
"user_message = \"Hello, whats the weather in San Francisco??\"\n",
"messages = [{ \"content\": user_message,\"role\": \"user\"}]\n",
"\n",
"os.environ['HF_TOKEN'] = \"\"#@param\n",
"# get your hugging face token from here:\n",
"# https://huggingface.co/settings/tokens\n",
"\n",
"# Optional if you want to run OpenAI TOO\n",
"os.environ['OPENAI_API_KEY'] = \"\" #@param\n",
"\n",
"response = completion(\"stabilityai/stablecode-completion-alpha-3b-4k\", messages=messages, hugging_face=True)\n",
"print(\"Response from stabilityai/stablecode-completion-alpha-3b-4k\")\n",
"print(response['choices'][0]['message']['content'])\n",
"print(\"\\n\\n\")\n",
"\n",
"response = completion(\"bigcode/starcoder\", messages=messages, hugging_face=True)\n",
"print(\"Response from bigcode/starcoder\")\n",
"print(response['choices'][0]['message']['content'])\n",
"print(\"\\n\\n\")\n",
"\n",
"response = completion(\"google/flan-t5-xxl\", messages=messages, hugging_face=True)\n",
"print(\"Response from google/flan-t5-xxl\")\n",
"print(response['choices'][0]['message']['content'])\n",
"print(\"\\n\\n\")\n",
"\n",
"response = completion(\"google/flan-t5-large\", messages=messages, hugging_face=True)\n",
"print(\"Response from google/flan-t5-large\")\n",
"print(response['choices'][0]['message']['content'])\n",
"print(\"\\n\\n\")\n",
"\n",
"response = completion(model=\"gpt-3.5-turbo\", messages=messages)\n",
"print(response['choices'][0]['message']['content'])\n",
"print(response)\n"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "vC54VW3jvLnN",
"outputId": "e6616221-12c9-4313-dd03-fd94fa095e8e"
},
"execution_count": 5,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Response from stabilityai/stablecode-completion-alpha-3b-4k\n",
"Hello, whats the weather in San Francisco??\",\n",
" \"id\": 1,\n",
" \"\n",
"\n",
"\n",
"\n",
"Response from bigcode/starcoder\n",
"Hello, whats the weather in San Francisco??\")\n",
"\n",
"# print(response)\n",
"\n",
"# print(response.text)\n",
"\n",
"#\n",
"\n",
"\n",
"\n",
"Response from google/flan-t5-xxl\n",
"a little cold\n",
"\n",
"\n",
"\n",
"Response from google/flan-t5-large\n",
"cool\n",
"\n",
"\n",
"\n",
"I'm sorry, but I am an AI language model and do not have real-time data. However, you can check the weather in San Francisco by searching for \"San Francisco weather\" on a search engine or checking a reliable weather website or app.\n"
]
}
]
}
]
}

View file

@ -34,5 +34,26 @@
| Model Name | Function Call | Required OS Variables |
|------------------|--------------------------------------------|--------------------------------------|
| claude-instant-1 | `completion('claude-instant-1', messages)` | `os.environ['ANTHROPIC_API_KEY']` |
| claude-v2 | `completion('claude-v2', messages)` | `os.environ['ANTHROPIC_API_KEY']` |
| claude-2 | `completion('claude-2', messages)` | `os.environ['ANTHROPIC_API_KEY']` |
### Hugging Face Inference API
All [`text2text-generation`](https://huggingface.co/models?library=transformers&pipeline_tag=text2text-generation&sort=downloads) and [`text-generation`](https://huggingface.co/models?library=transformers&pipeline_tag=text-generation&sort=downloads) models are supported by liteLLM. You can use any text model from Hugging Face with the following steps:
* Copy the `model repo` URL from Hugging Face and set it as the `model` parameter in the completion call.
* Set the `hugging_face` parameter to `True`.
* Make sure to set your Hugging Face API key (a short end-to-end sketch follows the table below).
Here are some examples of supported models:
**Note that the models mentioned in the table are examples, and you can use any text model available on Hugging Face by following the steps above.**
| Model Name | Function Call | Required OS Variables |
|------------------|-------------------------------------------------------------------------------------|--------------------------------------|
| [stabilityai/stablecode-completion-alpha-3b-4k](https://huggingface.co/stabilityai/stablecode-completion-alpha-3b-4k) | `completion(model="stabilityai/stablecode-completion-alpha-3b-4k", messages=messages, hugging_face=True)` | `os.environ['HF_TOKEN']` |
| [bigcode/starcoder](https://huggingface.co/bigcode/starcoder) | `completion(model="bigcode/starcoder", messages=messages, hugging_face=True)` | `os.environ['HF_TOKEN']` |
| [google/flan-t5-xxl](https://huggingface.co/google/flan-t5-xxl) | `completion(model="google/flan-t5-xxl", messages=messages, hugging_face=True)` | `os.environ['HF_TOKEN']` |
| [google/flan-t5-large](https://huggingface.co/google/flan-t5-large) | `completion(model="google/flan-t5-large", messages=messages, hugging_face=True)` | `os.environ['HF_TOKEN']` |
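For convenience, a minimal end-to-end sketch combining the steps above (the model, prompt, and placeholder token are only examples):

```python
import os
from litellm import completion

# get your Hugging Face API token from https://huggingface.co/settings/tokens
os.environ["HF_TOKEN"] = "your-hf-token"

messages = [{ "content": "write some code to find the sum of two numbers","role": "user"}]
response = completion(model="stabilityai/stablecode-completion-alpha-3b-4k", messages=messages, hugging_face=True)
print(response['choices'][0]['message']['content'])
```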

docs/token_usage.md (new file, 45 lines)
View file

@ -0,0 +1,45 @@
# Token Usage
By default LiteLLM returns token usage in all completion requests ([See here](https://litellm.readthedocs.io/en/latest/output/))
However, we also expose 3 public helper functions to calculate token usage across providers:
- `token_counter`: This returns the number of tokens for a given input - it uses the tokenizer based on the model, and defaults to tiktoken if no model-specific tokenizer is available.
- `cost_per_token`: This returns the cost (in USD) for prompt (input) and completion (output) tokens. It utilizes our model_cost map which can be found in `__init__.py` and also as a [community resource](https://github.com/BerriAI/litellm/blob/main/cookbook/community-resources/max_tokens.json).
- `completion_cost`: This returns the overall cost (in USD) for a given LLM API Call. It combines `token_counter` and `cost_per_token` to return the cost for that query (counting both cost of input and output).
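As a rough worked example (an illustration, not from the docs) of what these helpers compute, using the per-token prices for `gpt-3.5-turbo` in this commit's `model_cost` map ($0.0000015 per input token, $0.000002 per output token):

```python
# hypothetical token counts, priced with the gpt-3.5-turbo entries from litellm.model_cost
prompt_tokens = 100
completion_tokens = 50

prompt_cost = prompt_tokens * 0.0000015             # 0.00015 USD
completion_cost_usd = completion_tokens * 0.000002  # 0.0001 USD
print(round(prompt_cost + completion_cost_usd, 6))  # 0.00025 USD total
```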
## Example Usage
1. `token_counter`
```python
from litellm import token_counter
messages = [{"user": "role", "content": "Hey, how's it going"}]
print(token_counter(model="gpt-3.5-turbo", messages=messages))
```
2. `cost_per_token`
```python
from litellm import cost_per_token
prompt_tokens = 5
completion_tokens = 10
prompt_tokens_cost_usd_dollar, completion_tokens_cost_usd_dollar = cost_per_token(model="gpt-3.5-turbo", prompt_tokens=prompt_tokens, completion_tokens=completion_tokens)
print(prompt_tokens_cost_usd_dollar, completion_tokens_cost_usd_dollar)
```
3. `completion_cost`
```python
from litellm import completion_cost
prompt = "Hey, how's it going"
completion = "Hi, I'm gpt - I am doing well"
cost_of_query = completion_cost(model="gpt-3.5-turbo", prompt=prompt, completion=completion)
print(cost_of_query)
```

View file

@ -4,13 +4,34 @@ failure_callback = []
set_verbose=False
telemetry=True
max_tokens = 256 # OpenAI Defaults
retry = True # control tenacity retries.
retry = True
openai_key = None
azure_key = None
anthropic_key = None
replicate_key = None
cohere_key = None
openrouter_key = None
hugging_api_token = None
model_cost = {
"gpt-3.5-turbo": {"max_tokens": 4000, "input_cost_per_token": 0.0000015, "output_cost_per_token": 0.000002},
"gpt-35-turbo": {"max_tokens": 4000, "input_cost_per_token": 0.0000015, "output_cost_per_token": 0.000002}, # azure model name
"gpt-3.5-turbo-0613": {"max_tokens": 4000, "input_cost_per_token": 0.0000015, "output_cost_per_token": 0.000002},
"gpt-3.5-turbo-0301": {"max_tokens": 4000, "input_cost_per_token": 0.0000015, "output_cost_per_token": 0.000002},
"gpt-3.5-turbo-16k": {"max_tokens": 16000, "input_cost_per_token": 0.000003, "output_cost_per_token": 0.000004},
"gpt-35-turbo-16k": {"max_tokens": 16000, "input_cost_per_token": 0.000003, "output_cost_per_token": 0.000004}, # azure model name
"gpt-3.5-turbo-16k-0613": {"max_tokens": 16000, "input_cost_per_token": 0.000003, "output_cost_per_token": 0.000004},
"gpt-4": {"max_tokens": 8000, "input_cost_per_token": 0.000003, "output_cost_per_token": 0.00006},
"gpt-4-0613": {"max_tokens": 8000, "input_cost_per_token": 0.000003, "output_cost_per_token": 0.00006},
"gpt-4-32k": {"max_tokens": 8000, "input_cost_per_token": 0.00006, "output_cost_per_token": 0.00012},
"claude-instant-1": {"max_tokens": 100000, "input_cost_per_token": 0.00000163, "output_cost_per_token": 0.00000551},
"claude-2": {"max_tokens": 100000, "input_cost_per_token": 0.00001102, "output_cost_per_token": 0.00003268},
"text-bison-001": {"max_tokens": 8192, "input_cost_per_token": 0.000004, "output_cost_per_token": 0.000004},
"chat-bison-001": {"max_tokens": 4096, "input_cost_per_token": 0.000002, "output_cost_per_token": 0.000002},
"command-nightly": {"max_tokens": 4096, "input_cost_per_token": 0.000015, "output_cost_per_token": 0.000015},
}
####### THREAD-SPECIFIC DATA ###################
class MyLocal(threading.local):
def __init__(self):
@ -83,7 +104,7 @@ open_ai_embedding_models = [
'text-embedding-ada-002'
]
from .timeout import timeout
from .utils import client, logging, exception_type, get_optional_params, modify_integration
from .utils import client, logging, exception_type, get_optional_params, modify_integration, token_counter, cost_per_token, completion_cost
from .main import * # Import all the symbols from main.py
from .integrations import *
from openai.error import AuthenticationError, InvalidRequestError, RateLimitError, ServiceUnavailableError, OpenAIError

View file

@ -2,7 +2,6 @@
# On success, logs events to Helicone
import dotenv, os
import requests
from anthropic import HUMAN_PROMPT, AI_PROMPT
dotenv.load_dotenv() # Loading env variables using dotenv
import traceback
class HeliconeLogger:
@ -14,6 +13,7 @@ class HeliconeLogger:
self.key = os.getenv('HELICONE_API_KEY')
def claude_mapping(self, model, messages, response_obj):
from anthropic import HUMAN_PROMPT, AI_PROMPT
prompt = f"{HUMAN_PROMPT}"
for message in messages:
if "role" in message:

View file

@ -1,6 +1,5 @@
import os, openai, cohere, replicate, sys
import os, openai, sys
from typing import Any
from anthropic import Anthropic, HUMAN_PROMPT, AI_PROMPT
from functools import partial
import dotenv, traceback, random, asyncio, time
from copy import deepcopy
@ -8,15 +7,9 @@ import litellm
from litellm import client, logging, exception_type, timeout, get_optional_params
import tiktoken
encoding = tiktoken.get_encoding("cl100k_base")
from tenacity import (
retry,
stop_after_attempt,
wait_random_exponential,
) # for exponential backoff
from litellm.utils import get_secret
from litellm.utils import get_secret, install_and_import, CustomStreamWrapper
####### ENVIRONMENT VARIABLES ###################
dotenv.load_dotenv() # Loading env variables using dotenv
new_response = {
"choices": [
{
@ -28,9 +21,7 @@ new_response = {
}
]
}
# TODO move this to utils.py
# TODO add translations
# TODO see if this worked - model_name == krrish
####### COMPLETION ENDPOINTS ################
#############################################
async def acompletion(*args, **kwargs):
@ -52,7 +43,8 @@ def completion(
temperature=1, top_p=1, n=1, stream=False, stop=None, max_tokens=float('inf'),
presence_penalty=0, frequency_penalty=0, logit_bias={}, user="", deployment_id=None,
# Optional liteLLM function params
*, return_async=False, api_key=None, force_timeout=60, azure=False, logger_fn=None, verbose=False,
hugging_face = False, replicate=False,
):
try:
global new_response
@ -61,13 +53,16 @@ def completion(
optional_params = get_optional_params(
functions=functions, function_call=function_call,
temperature=temperature, top_p=top_p, n=n, stream=stream, stop=stop, max_tokens=max_tokens,
presence_penalty=presence_penalty, frequency_penalty=frequency_penalty, logit_bias=logit_bias, user=user, deployment_id=deployment_id,
# params to identify the model
model=model, replicate=replicate, hugging_face=hugging_face
) )
if azure == True:
# azure configs
openai.api_type = "azure"
openai.api_base = litellm.api_base if litellm.api_base is not None else get_secret("AZURE_API_BASE")
openai.api_version = litellm.api_version if litellm.api_version is not None else get_secret("AZURE_API_VERSION")
# set key
if api_key:
openai.api_key = api_key
elif litellm.azure_key:
@ -92,6 +87,7 @@ def completion(
)
elif model in litellm.open_ai_chat_completion_models:
openai.api_type = "openai"
# note: if a user sets a custom base - we should ensure this works
openai.api_base = litellm.api_base if litellm.api_base is not None else "https://api.openai.com/v1"
openai.api_version = None
if litellm.organization:
@ -154,7 +150,10 @@ def completion(
model_response["model"] = model model_response["model"] = model
model_response["usage"] = response["usage"] model_response["usage"] = response["usage"]
response = model_response response = model_response
elif "replicate" in model: elif "replicate" in model or replicate == True:
# import replicate/if it fails then pip install replicate
install_and_import("replicate")
import replicate
# replicate defaults to os.environ.get("REPLICATE_API_TOKEN")
# checking in case user set it to REPLICATE_API_KEY instead
if not get_secret("REPLICATE_API_TOKEN") and get_secret("REPLICATE_API_KEY"):
@ -175,6 +174,11 @@ def completion(
output = replicate.run(
model,
input=input)
if 'stream' in optional_params and optional_params['stream'] == True:
# don't try to access stream object,
# let the stream handler know this is replicate
response = CustomStreamWrapper(output, "replicate")
return response
response = "" response = ""
for item in output: for item in output:
response += item response += item
@ -194,6 +198,10 @@ def completion(
}
response = model_response
elif model in litellm.anthropic_models:
# import anthropic/if it fails then pip install anthropic
install_and_import("anthropic")
from anthropic import Anthropic, HUMAN_PROMPT, AI_PROMPT
#anthropic defaults to os.environ.get("ANTHROPIC_API_KEY")
if api_key:
os.environ["ANTHROPIC_API_KEY"] = api_key
@ -220,8 +228,14 @@ def completion(
completion = anthropic.completions.create(
model=model,
prompt=prompt,
max_tokens_to_sample=max_tokens_to_sample,
**optional_params
)
if 'stream' in optional_params and optional_params['stream'] == True:
# don't try to access stream object,
response = CustomStreamWrapper(completion, model)
return response
completion_response = completion.completion
## LOGGING
logging(model=model, input=prompt, azure=azure, additional_args={"max_tokens": max_tokens, "original_response": completion_response}, logger_fn=logger_fn)
@ -274,6 +288,9 @@ def completion(
**optional_params
)
elif model in litellm.cohere_models:
# import cohere/if it fails then pip install cohere
install_and_import("cohere")
import cohere
if api_key:
cohere_key = api_key
elif litellm.cohere_key:
@ -287,8 +304,14 @@ def completion(
## COMPLETION CALL
response = co.generate(
model=model,
prompt = prompt,
**optional_params
)
if 'stream' in optional_params and optional_params['stream'] == True:
# don't try to access stream object,
response = CustomStreamWrapper(response, model)
return response
completion_response = response[0].text
## LOGGING
logging(model=model, input=prompt, azure=azure, additional_args={"max_tokens": max_tokens, "original_response": completion_response}, logger_fn=logger_fn)
@ -304,6 +327,33 @@ def completion(
"total_tokens": prompt_tokens + completion_tokens "total_tokens": prompt_tokens + completion_tokens
} }
response = model_response response = model_response
elif hugging_face == True:
import requests
API_URL = f"https://api-inference.huggingface.co/models/{model}"
HF_TOKEN = get_secret("HF_TOKEN")
headers = {"Authorization": f"Bearer {HF_TOKEN}"}
prompt = " ".join([message["content"] for message in messages])
## LOGGING
logging(model=model, input=prompt, azure=azure, logger_fn=logger_fn)
input_payload = {"inputs": prompt}
response = requests.post(API_URL, headers=headers, json=input_payload)
completion_response = response.json()[0]['generated_text']
## LOGGING
logging(model=model, input=prompt, azure=azure, additional_args={"max_tokens": max_tokens, "original_response": completion_response}, logger_fn=logger_fn)
prompt_tokens = len(encoding.encode(prompt))
completion_tokens = len(encoding.encode(completion_response))
## RESPONSE OBJECT
model_response["choices"][0]["message"]["content"] = completion_response
model_response["created"] = time.time()
model_response["model"] = model
model_response["usage"] = {
"prompt_tokens": prompt_tokens,
"completion_tokens": completion_tokens,
"total_tokens": prompt_tokens + completion_tokens
}
response = model_response
else:
## LOGGING
logging(model=model, input=messages, azure=azure, logger_fn=logger_fn)

View file

@ -1 +0,0 @@
test 1

View file

@ -7,8 +7,10 @@ sys.path.insert(0, os.path.abspath('../..')) # Adds the parent directory to the
import pytest
import litellm
from litellm import embedding, completion
from infisical import InfisicalClient
# litellm.set_verbose = True
litellm.secret_manager_client = InfisicalClient(token=os.environ["INFISICAL_TOKEN"])
user_message = "Hello, whats the weather in San Francisco??"
messages = [{ "content": user_message,"role": "user"}]
@ -16,6 +18,59 @@ messages = [{ "content": user_message,"role": "user"}]
def logger_fn(user_model_dict):
print(f"user_model_dict: {user_model_dict}")
def test_completion_claude():
try:
response = completion(model="claude-instant-1", messages=messages, logger_fn=logger_fn)
# Add any assertions here to check the response
print(response)
except Exception as e:
pytest.fail(f"Error occurred: {e}")
def test_completion_claude_stream():
try:
messages = [
{"role": "system", "content": "You are a helpful assistant."},
{"role": "user", "content": "how does a court case get to the Supreme Court?"}
]
response = completion(model="claude-2", messages=messages, stream=True)
# Add any assertions here to check the response
for chunk in response:
print(chunk['choices'][0]['delta']) # same as openai format
except Exception as e:
pytest.fail(f"Error occurred: {e}")
def test_completion_hf_api():
try:
user_message = "write some code to find the sum of two numbers"
messages = [{ "content": user_message,"role": "user"}]
response = completion(model="stabilityai/stablecode-completion-alpha-3b-4k", messages=messages, hugging_face=True)
# Add any assertions here to check the response
print(response)
except Exception as e:
pytest.fail(f"Error occurred: {e}")
def test_completion_cohere():
try:
response = completion(model="command-nightly", messages=messages, max_tokens=500)
# Add any assertions here to check the response
print(response)
except Exception as e:
pytest.fail(f"Error occurred: {e}")
def test_completion_cohere_stream():
try:
messages = [
{"role": "system", "content": "You are a helpful assistant."},
{"role": "user", "content": "how does a court case get to the Supreme Court?"}
]
response = completion(model="command-nightly", messages=messages, stream=True, max_tokens=50)
# Add any assertions here to check the response
for chunk in response:
print(chunk['choices'][0]['delta']) # same as openai format
except Exception as e:
pytest.fail(f"Error occurred: {e}")
def test_completion_openai():
try:
response = completion(model="gpt-3.5-turbo", messages=messages)
@ -92,18 +147,25 @@ def test_completion_azure():
except Exception as e:
pytest.fail(f"Error occurred: {e}")
def test_completion_claude():
# Replicate API endpoints are unstable -> throw random CUDA errors -> this means our tests can fail even if our tests weren't incorrect.
def test_completion_replicate_llama_stream():
model_name = "replicate/llama-2-70b-chat:2c1608e18606fad2812020dc541930f2d0495ce32eee50074220b87300bc16e1"
try:
response = completion(model="claude-instant-1", messages=messages, logger_fn=logger_fn)
response = completion(model=model_name, messages=messages, stream=True)
# Add any assertions here to check the response
for result in response:
print(result)
print(response)
except Exception as e:
pytest.fail(f"Error occurred: {e}")
def test_completion_cohere():
def test_completion_replicate_stability_stream():
model_name = "stability-ai/stablelm-tuned-alpha-7b:c49dae362cbaecd2ceabb5bd34fdb68413c4ff775111fea065d259d577757beb"
try:
response = completion(model="command-nightly", messages=messages, max_tokens=500)
response = completion(model=model_name, messages=messages, stream=True, replicate=True)
# Add any assertions here to check the response
for result in response:
print(result)
print(response)
except Exception as e:
pytest.fail(f"Error occurred: {e}")
@ -124,3 +186,14 @@ def test_completion_cohere():
# pass
# else:
# pytest.fail(f"Error occurred: {e}")
def test_completion_replicate_stability():
model_name = "stability-ai/stablelm-tuned-alpha-7b:c49dae362cbaecd2ceabb5bd34fdb68413c4ff775111fea065d259d577757beb"
try:
response = completion(model=model_name, messages=messages, replicate=True)
# Add any assertions here to check the response
for result in response:
print(result)
print(response)
except Exception as e:
pytest.fail(f"Error occurred: {e}")

View file

@ -0,0 +1,20 @@
import sys, os
import traceback
import pytest
sys.path.insert(0, os.path.abspath('../..')) # Adds the parent directory to the system path
import litellm
from litellm import embedding, completion
from infisical import InfisicalClient
# litellm.set_verbose = True
litellm.secret_manager_client = InfisicalClient(token=os.environ["INFISICAL_TOKEN"])
def test_openai_embedding():
try:
response = embedding(model='text-embedding-ada-002', input=["good morning from litellm"])
# Add any assertions here to check the response
print(f"response: {str(response)}")
except Exception as e:
pytest.fail(f"Error occurred: {e}")

View file

@ -4,7 +4,6 @@ import subprocess, os
import litellm, openai
import random, uuid, requests
import datetime, time
from anthropic import Anthropic
import tiktoken
encoding = tiktoken.get_encoding("cl100k_base")
from .integrations.helicone import HeliconeLogger
@ -34,6 +33,19 @@ def print_verbose(print_statement):
if random.random() <= 0.3:
print("Get help - https://discord.com/invite/wuPM9dRgDw")
####### Package Import Handler ###################
import importlib
import subprocess
def install_and_import(package):
try:
importlib.import_module(package)
except ImportError:
print(f"{package} is not installed. Installing...")
subprocess.call([sys.executable, '-m', 'pip', 'install', package])
finally:
globals()[package] = importlib.import_module(package)
##################################################
####### LOGGING ###################
#Logging function -> log the exact model details + what's being sent | Non-Blocking
def logging(model=None, input=None, azure=False, additional_args={}, logger_fn=None, exception=None):
@ -119,6 +131,51 @@ def client(original_function):
raise e
return wrapper
####### USAGE CALCULATOR ################
def token_counter(model, text):
# use tiktoken or anthropic's tokenizer depending on the model
num_tokens = 0
if "claude" in model:
install_and_import('anthropic')
from anthropic import Anthropic, HUMAN_PROMPT, AI_PROMPT
anthropic = Anthropic()
num_tokens = anthropic.count_tokens(text)
else:
num_tokens = len(encoding.encode(text))
return num_tokens
def cost_per_token(model="gpt-3.5-turbo", prompt_tokens = 0, completion_tokens = 0):
## given
prompt_tokens_cost_usd_dollar = 0
completion_tokens_cost_usd_dollar = 0
model_cost_ref = litellm.model_cost
if model in model_cost_ref:
prompt_tokens_cost_usd_dollar = model_cost_ref[model]["input_cost_per_token"] * prompt_tokens
completion_tokens_cost_usd_dollar = model_cost_ref[model]["output_cost_per_token"] * completion_tokens
return prompt_tokens_cost_usd_dollar, completion_tokens_cost_usd_dollar
else:
# calculate average input cost
input_cost_sum = 0
output_cost_sum = 0
model_cost_ref = litellm.model_cost
for model in model_cost_ref:
input_cost_sum += model_cost_ref[model]["input_cost_per_token"]
output_cost_sum += model_cost_ref[model]["output_cost_per_token"]
avg_input_cost = input_cost_sum / len(model_cost_ref.keys())
avg_output_cost = output_cost_sum / len(model_cost_ref.keys())
prompt_tokens_cost_usd_dollar = avg_input_cost * prompt_tokens
completion_tokens_cost_usd_dollar = avg_output_cost * completion_tokens
return prompt_tokens_cost_usd_dollar, completion_tokens_cost_usd_dollar
def completion_cost(model="gpt-3.5-turbo", prompt="", completion=""):
prompt_tokens = token_counter(model=model, text=prompt)
completion_tokens = token_counter(model=model, text=completion)
prompt_tokens_cost_usd_dollar, completion_tokens_cost_usd_dollar = cost_per_token(model=model, prompt_tokens = prompt_tokens, completion_tokens = completion_tokens)
return prompt_tokens_cost_usd_dollar + completion_tokens_cost_usd_dollar
####### HELPER FUNCTIONS ################
def get_optional_params(
# 12 optional params
@ -134,35 +191,66 @@ def get_optional_params(
frequency_penalty = 0,
logit_bias = {},
user = "",
deployment_id = None,
model = None,
replicate = False,
hugging_face = False,
):
optional_params = {}
if functions != []:
optional_params["functions"] = functions
if function_call != "":
optional_params["function_call"] = function_call
if temperature != 1:
optional_params["temperature"] = temperature
if top_p != 1:
optional_params["top_p"] = top_p
if n != 1:
optional_params["n"] = n
if stream:
optional_params["stream"] = stream
if stop != None:
optional_params["stop"] = stop
if max_tokens != float('inf'):
optional_params["max_tokens"] = max_tokens
if presence_penalty != 0:
optional_params["presence_penalty"] = presence_penalty
if frequency_penalty != 0:
optional_params["frequency_penalty"] = frequency_penalty
if logit_bias != {}:
optional_params["logit_bias"] = logit_bias
if user != "":
optional_params["user"] = user
if deployment_id != None:
optional_params["deployment_id"] = deployment_id
if model in litellm.anthropic_models:
# handle anthropic params
if stream:
optional_params["stream"] = stream
if stop != None:
optional_params["stop_sequences"] = stop
if temperature != 1:
optional_params["temperature"] = temperature
if top_p != 1:
optional_params["top_p"] = top_p
return optional_params
elif model in litellm.cohere_models:
# handle cohere params
if stream:
optional_params["stream"] = stream
if temperature != 1:
optional_params["temperature"] = temperature
if max_tokens != float('inf'):
optional_params["max_tokens"] = max_tokens
return optional_params
elif replicate == True:
# any replicate models
# TODO: handle translating remaining replicate params
if stream:
optional_params["stream"] = stream
return optional_params
else:# assume passing in params for openai/azure openai
if functions != []:
optional_params["functions"] = functions
if function_call != "":
optional_params["function_call"] = function_call
if temperature != 1:
optional_params["temperature"] = temperature
if top_p != 1:
optional_params["top_p"] = top_p
if n != 1:
optional_params["n"] = n
if stream:
optional_params["stream"] = stream
if stop != None:
optional_params["stop"] = stop
if max_tokens != float('inf'):
optional_params["max_tokens"] = max_tokens
if presence_penalty != 0:
optional_params["presence_penalty"] = presence_penalty
if frequency_penalty != 0:
optional_params["frequency_penalty"] = frequency_penalty
if logit_bias != {}:
optional_params["logit_bias"] = logit_bias
if user != "":
optional_params["user"] = user
if deployment_id != None:
optional_params["deployment_id"] = deployment_id
return optional_params
return optional_params
def set_callbacks(callback_list):
@ -324,19 +412,6 @@ def handle_failure(exception, traceback_exception, start_time, end_time, args, k
logging(logger_fn=user_logger_fn, exception=e)
pass
def prompt_token_calculator(model, messages):
# use tiktoken or anthropic's tokenizer depending on the model
text = " ".join(message["content"] for message in messages)
num_tokens = 0
if "claude" in model:
anthropic = Anthropic()
num_tokens = anthropic.count_tokens(text)
else:
num_tokens = len(encoding.encode(text))
return num_tokens
def handle_success(args, kwargs, result, start_time, end_time):
global heliconeLogger, aispendLogger
try:
@ -396,6 +471,19 @@ def handle_success(args, kwargs, result, start_time, end_time):
print_verbose(f"[Non-Blocking] Success Callback Error - {traceback.format_exc()}") print_verbose(f"[Non-Blocking] Success Callback Error - {traceback.format_exc()}")
pass pass
def prompt_token_calculator(model, messages):
# use tiktoken or anthropic's tokenizer depending on the model
text = " ".join(message["content"] for message in messages)
num_tokens = 0
if "claude" in model:
install_and_import('anthropic')
from anthropic import Anthropic, HUMAN_PROMPT, AI_PROMPT
anthropic = Anthropic()
num_tokens = anthropic.count_tokens(text)
else:
num_tokens = len(encoding.encode(text))
return num_tokens
# integration helper function
def modify_integration(integration_name, integration_params):
global supabaseClient
@ -520,3 +608,30 @@ def get_secret(secret_name):
return os.environ.get(secret_name)
else:
return os.environ.get(secret_name)
######## Streaming Class ############################
# wraps the completion stream to return the correct format for the model
# replicate/anthropic/cohere
class CustomStreamWrapper:
def __init__(self, completion_stream, model):
self.model = model
if model in litellm.cohere_models:
# cohere does not return an iterator, so we need to wrap it in one
self.completion_stream = iter(completion_stream)
else:
self.completion_stream = completion_stream
def __iter__(self):
return self
def __next__(self):
if self.model in litellm.anthropic_models:
chunk = next(self.completion_stream)
return {"choices": [{"delta": chunk.completion}]}
elif self.model == "replicate":
chunk = next(self.completion_stream)
return {"choices": [{"delta": chunk}]}
elif self.model in litellm.cohere_models:
chunk = next(self.completion_stream)
return {"choices": [{"delta": chunk.text}]}

View file

@ -6,6 +6,8 @@ nav:
- Input - Request Body: input.md
- Output - Response Object: output.md
- Streaming & Async Calls: stream.md
- token usage:
- Helper Functions: token_usage.md
- 🤖 Supported LLM APIs:
- Supported Completion & Chat APIs: supported.md
- Supported Embedding APIs: supported_embedding.md

View file

@ -1,6 +1,6 @@
[tool.poetry]
name = "litellm"
version = "0.1.356"
version = "0.1.367"
description = "Library to easily interface with LLM API providers"
authors = ["BerriAI"]
license = "MIT License"
@ -8,14 +8,8 @@ readme = "README.md"
[tool.poetry.dependencies]
python = "^3.8"
openai = {extras = ["datalib"], version = "^0.27.8"}
openai = "^0.27.8"
cohere = "^4.18.0"
pytest = "^7.4.0"
pydantic = "^2.1.1"
anthropic = "^0.3.7"
replicate = "^0.10.0"
python-dotenv = "^1.0.0" python-dotenv = "^1.0.0"
tenacity = "^8.0.1"
tiktoken = "^0.4.0" tiktoken = "^0.4.0"
[build-system] [build-system]

View file

@ -1,11 +1,5 @@
pydantic
# used by CI/CD testing
openai
cohere
anthropic
replicate
pytest
python-dotenv
openai[datalib]
openai
tenacity
tiktoken
infisical