Merge branch 'main' into patch-1

This commit is contained in:
Mikkel Gravgaard 2024-05-17 10:26:14 +02:00 committed by GitHub
commit 80ef0f86d1
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
332 changed files with 43212 additions and 8160 deletions

View file

@ -1,4 +1,4 @@
version: 2.1
version: 4.3.4
jobs:
local_testing:
docker:
@ -40,7 +40,7 @@ jobs:
pip install "aioboto3==12.3.0"
pip install langchain
pip install lunary==0.2.5
pip install "langfuse==2.7.3"
pip install "langfuse==2.27.1"
pip install numpydoc
pip install traceloop-sdk==0.0.69
pip install openai
@ -57,6 +57,9 @@ jobs:
pip install "pytest-mock==3.12.0"
pip install python-multipart
pip install google-cloud-aiplatform
pip install prometheus-client==0.20.0
pip install "pydantic==2.7.1"
pip install "diskcache==5.6.1"
- save_cache:
paths:
- ./venv
@ -187,22 +190,28 @@ jobs:
command: |
docker run -d \
-p 4000:4000 \
-e DATABASE_URL=$PROXY_DOCKER_DB_URL \
-e DATABASE_URL=$PROXY_DATABASE_URL \
-e AZURE_API_KEY=$AZURE_API_KEY \
-e REDIS_HOST=$REDIS_HOST \
-e REDIS_PASSWORD=$REDIS_PASSWORD \
-e REDIS_PORT=$REDIS_PORT \
-e AZURE_FRANCE_API_KEY=$AZURE_FRANCE_API_KEY \
-e AZURE_EUROPE_API_KEY=$AZURE_EUROPE_API_KEY \
-e AWS_ACCESS_KEY_ID=$AWS_ACCESS_KEY_ID \
-e AWS_SECRET_ACCESS_KEY=$AWS_SECRET_ACCESS_KEY \
-e AWS_REGION_NAME=$AWS_REGION_NAME \
-e AUTO_INFER_REGION=True \
-e OPENAI_API_KEY=$OPENAI_API_KEY \
-e LANGFUSE_PROJECT1_PUBLIC=$LANGFUSE_PROJECT1_PUBLIC \
-e LANGFUSE_PROJECT2_PUBLIC=$LANGFUSE_PROJECT2_PUBLIC \
-e LANGFUSE_PROJECT1_SECRET=$LANGFUSE_PROJECT1_SECRET \
-e LANGFUSE_PROJECT2_SECRET=$LANGFUSE_PROJECT2_SECRET \
--name my-app \
-v $(pwd)/proxy_server_config.yaml:/app/config.yaml \
my-app:latest \
--config /app/config.yaml \
--port 4000 \
--num_workers 8 \
--detailed_debug \
--run_gunicorn \
- run:
name: Install curl and dockerize
command: |
@ -217,7 +226,7 @@ jobs:
background: true
- run:
name: Wait for app to be ready
command: dockerize -wait http://localhost:4000 -timeout 1m
command: dockerize -wait http://localhost:4000 -timeout 5m
- run:
name: Run tests
command: |

View file

@ -0,0 +1,51 @@
{
"name": "Python 3.11",
// Or use a Dockerfile or Docker Compose file. More info: https://containers.dev/guide/dockerfile
"image": "mcr.microsoft.com/devcontainers/python:3.11-bookworm",
// https://github.com/devcontainers/images/tree/main/src/python
// https://mcr.microsoft.com/en-us/product/devcontainers/python/tags
// "build": {
// "dockerfile": "Dockerfile",
// "context": ".."
// },
// Features to add to the dev container. More info: https://containers.dev/features.
// "features": {},
// Configure tool-specific properties.
"customizations": {
// Configure properties specific to VS Code.
"vscode": {
"settings": {},
"extensions": [
"ms-python.python",
"ms-python.vscode-pylance",
"GitHub.copilot",
"GitHub.copilot-chat"
]
}
},
// Use 'forwardPorts' to make a list of ports inside the container available locally.
"forwardPorts": [4000],
"containerEnv": {
"LITELLM_LOG": "DEBUG"
},
// Use 'portsAttributes' to set default properties for specific forwarded ports.
// More info: https://containers.dev/implementors/json_reference/#port-attributes
"portsAttributes": {
"4000": {
"label": "LiteLLM Server",
"onAutoForward": "notify"
}
},
// More info: https://aka.ms/dev-containers-non-root.
// "remoteUser": "litellm",
// Use 'postCreateCommand' to run commands after the container is created.
"postCreateCommand": "pipx install poetry && poetry install -E extra_proxy -E proxy"
}

View file

@ -1,5 +1,5 @@
/docs
/cookbook
/.circleci
/.github
/tests
docs
cookbook
.circleci
.github
tests

10
.git-blame-ignore-revs Normal file
View file

@ -0,0 +1,10 @@
# Add the commit hash of any commit you want to ignore in `git blame` here.
# One commit hash per line.
#
# The GitHub Blame UI will use this file automatically!
#
# Run this command to always ignore formatting commits in `git blame`
# git config blame.ignoreRevsFile .git-blame-ignore-revs
# Update pydantic code to fix warnings (GH-3600)
876840e9957bc7e9f7d6a2b58c4d7c53dad16481

29
.github/pull_request_template.md vendored Normal file
View file

@ -0,0 +1,29 @@
## Title
<!-- e.g. "Implement user authentication feature" -->
## Relevant issues
<!-- e.g. "Fixes #000" -->
## Type
<!-- Select the type of Pull Request -->
<!-- Keep only the necessary ones -->
🆕 New Feature
🐛 Bug Fix
🧹 Refactoring
📖 Documentation
🚄 Infrastructure
✅ Test
## Changes
<!-- List of changes -->
## [REQUIRED] Testing - Attach a screenshot of any new tests passing locall
If UI changes, send a screenshot/GIF of working UI fixes
<!-- Test procedure -->

View file

@ -64,6 +64,11 @@ if __name__ == "__main__":
) # Replace with your repository's username and name
latest_release = repo.get_latest_release()
print("got latest release: ", latest_release)
print(latest_release.title)
print(latest_release.tag_name)
release_version = latest_release.title
print("latest release body: ", latest_release.body)
print("markdown table: ", markdown_table)
@ -74,8 +79,25 @@ if __name__ == "__main__":
start_index = latest_release.body.find("Load Test LiteLLM Proxy Results")
existing_release_body = latest_release.body[:start_index]
docker_run_command = f"""
\n\n
## Docker Run LiteLLM Proxy
```
docker run \\
-e STORE_MODEL_IN_DB=True \\
-p 4000:4000 \\
ghcr.io/berriai/litellm:main-{release_version}
```
"""
print("docker run command: ", docker_run_command)
new_release_body = (
existing_release_body
+ docker_run_command
+ "\n\n"
+ "### Don't want to maintain your internal proxy? get in touch 🎉"
+ "\nHosted Proxy Alpha: https://calendly.com/d/4mp-gd3-k5k/litellm-1-1-onboarding-chat"
+ "\n\n"
+ "## Load Test LiteLLM Proxy Results"
+ "\n\n"

6
.gitignore vendored
View file

@ -1,5 +1,6 @@
.venv
.env
litellm/proxy/myenv/*
litellm_uuid.txt
__pycache__/
*.pyc
@ -50,3 +51,8 @@ kub.yaml
loadtest_kub.yaml
litellm/proxy/_new_secret_config.yaml
litellm/proxy/_new_secret_config.yaml
litellm/proxy/_super_secret_config.yaml
litellm/proxy/_super_secret_config.yaml
litellm/proxy/myenv/bin/activate
litellm/proxy/myenv/bin/Activate.ps1
myenv/*

View file

@ -7,7 +7,7 @@ repos:
rev: 7.0.0 # The version of flake8 to use
hooks:
- id: flake8
exclude: ^litellm/tests/|^litellm/proxy/proxy_cli.py|^litellm/integrations/|^litellm/proxy/tests/
exclude: ^litellm/tests/|^litellm/proxy/proxy_cli.py|^litellm/proxy/tests/
additional_dependencies: [flake8-print]
files: litellm/.*\.py
- repo: local

View file

@ -5,7 +5,7 @@
<p align="center">Call all LLM APIs using the OpenAI format [Bedrock, Huggingface, VertexAI, TogetherAI, Azure, OpenAI, etc.]
<br>
</p>
<h4 align="center"><a href="https://docs.litellm.ai/docs/simple_proxy" target="_blank">OpenAI Proxy Server</a> | <a href="https://docs.litellm.ai/docs/enterprise"target="_blank">Enterprise Tier</a></h4>
<h4 align="center"><a href="https://docs.litellm.ai/docs/simple_proxy" target="_blank">OpenAI Proxy Server</a> | <a href="https://docs.litellm.ai/docs/hosted" target="_blank"> Hosted Proxy (Preview)</a> | <a href="https://docs.litellm.ai/docs/enterprise"target="_blank">Enterprise Tier</a></h4>
<h4 align="center">
<a href="https://pypi.org/project/litellm/" target="_blank">
<img src="https://img.shields.io/pypi/v/litellm.svg" alt="PyPI Version">
@ -128,7 +128,9 @@ response = completion(model="gpt-3.5-turbo", messages=[{"role": "user", "content
# OpenAI Proxy - ([Docs](https://docs.litellm.ai/docs/simple_proxy))
Set Budgets & Rate limits across multiple projects
Track spend + Load Balance across multiple projects
[Hosted Proxy (Preview)](https://docs.litellm.ai/docs/hosted)
The proxy provides:
@ -224,7 +226,9 @@ curl 'http://0.0.0.0:4000/key/generate' \
| [deepinfra](https://docs.litellm.ai/docs/providers/deepinfra) | ✅ | ✅ | ✅ | ✅ |
| [perplexity-ai](https://docs.litellm.ai/docs/providers/perplexity) | ✅ | ✅ | ✅ | ✅ |
| [Groq AI](https://docs.litellm.ai/docs/providers/groq) | ✅ | ✅ | ✅ | ✅ |
| [Deepseek](https://docs.litellm.ai/docs/providers/deepseek) | ✅ | ✅ | ✅ | ✅ |
| [anyscale](https://docs.litellm.ai/docs/providers/anyscale) | ✅ | ✅ | ✅ | ✅ |
| [IBM - watsonx.ai](https://docs.litellm.ai/docs/providers/watsonx) | ✅ | ✅ | ✅ | ✅ | ✅
| [voyage ai](https://docs.litellm.ai/docs/providers/voyage) | | | | | ✅ |
| [xinference [Xorbits Inference]](https://docs.litellm.ai/docs/providers/xinference) | | | | | ✅ |
@ -245,7 +249,7 @@ Step 2: Navigate into the project, and install dependencies:
```
cd litellm
poetry install
poetry install -E extra_proxy -E proxy
```
Step 3: Test your change:

204
cookbook/Proxy_Batch_Users.ipynb vendored Normal file
View file

@ -0,0 +1,204 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "680oRk1af-xJ"
},
"source": [
"# Environment Setup"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "X7TgJFn8f88p"
},
"outputs": [],
"source": [
"import csv\n",
"from typing import Optional\n",
"import httpx, json\n",
"import asyncio\n",
"\n",
"proxy_base_url = \"http://0.0.0.0:4000\" # 👈 SET TO PROXY URL\n",
"master_key = \"sk-1234\" # 👈 SET TO PROXY MASTER KEY"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "rauw8EOhgBz5"
},
"outputs": [],
"source": [
"## GLOBAL HTTP CLIENT ## - faster http calls\n",
"class HTTPHandler:\n",
" def __init__(self, concurrent_limit=1000):\n",
" # Create a client with a connection pool\n",
" self.client = httpx.AsyncClient(\n",
" limits=httpx.Limits(\n",
" max_connections=concurrent_limit,\n",
" max_keepalive_connections=concurrent_limit,\n",
" )\n",
" )\n",
"\n",
" async def close(self):\n",
" # Close the client when you're done with it\n",
" await self.client.aclose()\n",
"\n",
" async def get(\n",
" self, url: str, params: Optional[dict] = None, headers: Optional[dict] = None\n",
" ):\n",
" response = await self.client.get(url, params=params, headers=headers)\n",
" return response\n",
"\n",
" async def post(\n",
" self,\n",
" url: str,\n",
" data: Optional[dict] = None,\n",
" params: Optional[dict] = None,\n",
" headers: Optional[dict] = None,\n",
" ):\n",
" try:\n",
" response = await self.client.post(\n",
" url, data=data, params=params, headers=headers\n",
" )\n",
" return response\n",
" except Exception as e:\n",
" raise e\n"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "7LXN8zaLgOie"
},
"source": [
"# Import Sheet\n",
"\n",
"\n",
"Format: | ID | Name | Max Budget |"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "oiED0usegPGf"
},
"outputs": [],
"source": [
"async def import_sheet():\n",
" tasks = []\n",
" http_client = HTTPHandler()\n",
" with open('my-batch-sheet.csv', 'r') as file:\n",
" csv_reader = csv.DictReader(file)\n",
" for row in csv_reader:\n",
" task = create_user(client=http_client, user_id=row['ID'], max_budget=row['Max Budget'], user_name=row['Name'])\n",
" tasks.append(task)\n",
" # print(f\"ID: {row['ID']}, Name: {row['Name']}, Max Budget: {row['Max Budget']}\")\n",
"\n",
" keys = await asyncio.gather(*tasks)\n",
"\n",
" with open('my-batch-sheet_new.csv', 'w', newline='') as new_file:\n",
" fieldnames = ['ID', 'Name', 'Max Budget', 'keys']\n",
" csv_writer = csv.DictWriter(new_file, fieldnames=fieldnames)\n",
" csv_writer.writeheader()\n",
"\n",
" with open('my-batch-sheet.csv', 'r') as file:\n",
" csv_reader = csv.DictReader(file)\n",
" for i, row in enumerate(csv_reader):\n",
" row['keys'] = keys[i] # Add the 'keys' value from the corresponding task result\n",
" csv_writer.writerow(row)\n",
"\n",
" await http_client.close()\n",
"\n",
"asyncio.run(import_sheet())"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "E7M0Li_UgJeZ"
},
"source": [
"# Create Users + Keys\n",
"\n",
"- Creates a user\n",
"- Creates a key with max budget"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "NZudRFujf7j-"
},
"outputs": [],
"source": [
"\n",
"async def create_key_with_alias(client: HTTPHandler, user_id: str, max_budget: float):\n",
" global proxy_base_url\n",
" if not proxy_base_url.endswith(\"/\"):\n",
" proxy_base_url += \"/\"\n",
" url = proxy_base_url + \"key/generate\"\n",
"\n",
" # call /key/generate\n",
" print(\"CALLING /KEY/GENERATE\")\n",
" response = await client.post(\n",
" url=url,\n",
" headers={\"Authorization\": f\"Bearer {master_key}\"},\n",
" data=json.dumps({\n",
" \"user_id\": user_id,\n",
" \"key_alias\": f\"{user_id}-key\",\n",
" \"max_budget\": max_budget # 👈 KEY CHANGE: SETS MAX BUDGET PER KEY\n",
" })\n",
" )\n",
" print(f\"response: {response.text}\")\n",
" return response.json()[\"key\"]\n",
"\n",
"async def create_user(client: HTTPHandler, user_id: str, max_budget: float, user_name: str):\n",
" \"\"\"\n",
" - call /user/new\n",
" - create key for user\n",
" \"\"\"\n",
" global proxy_base_url\n",
" if not proxy_base_url.endswith(\"/\"):\n",
" proxy_base_url += \"/\"\n",
" url = proxy_base_url + \"user/new\"\n",
"\n",
" # call /user/new\n",
" await client.post(\n",
" url=url,\n",
" headers={\"Authorization\": f\"Bearer {master_key}\"},\n",
" data=json.dumps({\n",
" \"user_id\": user_id,\n",
" \"user_alias\": user_name,\n",
" \"auto_create_key\": False,\n",
" # \"max_budget\": max_budget # 👈 [OPTIONAL] Sets max budget per user (if you want to set a max budget across keys)\n",
" })\n",
" )\n",
"\n",
" # create key for user\n",
" return await create_key_with_alias(client=client, user_id=user_id, max_budget=max_budget)\n"
]
}
],
"metadata": {
"colab": {
"provenance": []
},
"kernelspec": {
"display_name": "Python 3",
"name": "python3"
},
"language_info": {
"name": "python"
}
},
"nbformat": 4,
"nbformat_minor": 0
}

300
cookbook/liteLLM_IBM_Watsonx.ipynb vendored Normal file

File diff suppressed because one or more lines are too long

187
cookbook/liteLLM_clarifai_Demo.ipynb vendored Normal file
View file

@ -0,0 +1,187 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# LiteLLM Clarifai \n",
"This notebook walks you through on how to use liteLLM integration of Clarifai and call LLM model from clarifai with response in openAI output format."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Pre-Requisites"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#install necessary packages\n",
"!pip install litellm\n",
"!pip install clarifai"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"To obtain Clarifai Personal Access Token follow the steps mentioned in the [link](https://docs.clarifai.com/clarifai-basics/authentication/personal-access-tokens/)"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"## Set Clarifai Credentials\n",
"import os\n",
"os.environ[\"CLARIFAI_API_KEY\"]= \"YOUR_CLARIFAI_PAT\" # Clarifai PAT"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Mistral-large"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"import litellm\n",
"\n",
"litellm.set_verbose=False"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Mistral large response : ModelResponse(id='chatcmpl-6eed494d-7ae2-4870-b9c2-6a64d50a6151', choices=[Choices(finish_reason='stop', index=1, message=Message(content=\"In the grand tapestry of time, where tales unfold,\\nLies the chronicle of ages, a sight to behold.\\nA tale of empires rising, and kings of old,\\nOf civilizations lost, and stories untold.\\n\\nOnce upon a yesterday, in a time so vast,\\nHumans took their first steps, casting shadows in the past.\\nFrom the cradle of mankind, a journey they embarked,\\nThrough stone and bronze and iron, their skills they sharpened and marked.\\n\\nEgyptians built pyramids, reaching for the skies,\\nWhile Greeks sought wisdom, truth, in philosophies that lie.\\nRoman legions marched, their empire to expand,\\nAnd in the East, the Silk Road joined the world, hand in hand.\\n\\nThe Middle Ages came, with knights in shining armor,\\nFeudal lords and serfs, a time of both clamor and calm order.\\nThen Renaissance bloomed, like a flower in the sun,\\nA rebirth of art and science, a new age had begun.\\n\\nAcross the vast oceans, explorers sailed with courage bold,\\nDiscovering new lands, stories of adventure, untold.\\nIndustrial Revolution churned, progress in its wake,\\nMachines and factories, a whole new world to make.\\n\\nTwo World Wars raged, a testament to man's strife,\\nYet from the ashes rose hope, a renewed will for life.\\nInto the modern era, technology took flight,\\nConnecting every corner, bathed in digital light.\\n\\nHistory, a symphony, a melody of time,\\nA testament to human will, resilience so sublime.\\nIn every page, a lesson, in every tale, a guide,\\nFor understanding our past, shapes our future's tide.\", role='assistant'))], created=1713896412, model='https://api.clarifai.com/v2/users/mistralai/apps/completion/models/mistral-large/outputs', object='chat.completion', system_fingerprint=None, usage=Usage(prompt_tokens=13, completion_tokens=338, total_tokens=351))\n"
]
}
],
"source": [
"from litellm import completion\n",
"\n",
"messages = [{\"role\": \"user\",\"content\": \"\"\"Write a poem about history?\"\"\"}]\n",
"response=completion(\n",
" model=\"clarifai/mistralai.completion.mistral-large\",\n",
" messages=messages,\n",
" )\n",
"\n",
"print(f\"Mistral large response : {response}\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Claude-2.1 "
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Claude-2.1 response : ModelResponse(id='chatcmpl-d126c919-4db4-4aa3-ac8f-7edea41e0b93', choices=[Choices(finish_reason='stop', index=1, message=Message(content=\" Here's a poem I wrote about history:\\n\\nThe Tides of Time\\n\\nThe tides of time ebb and flow,\\nCarrying stories of long ago.\\nFigures and events come into light,\\nShaping the future with all their might.\\n\\nKingdoms rise, empires fall, \\nLeaving traces that echo down every hall.\\nRevolutions bring change with a fiery glow,\\nToppling structures from long ago.\\n\\nExplorers traverse each ocean and land,\\nSeeking treasures they don't understand.\\nWhile artists and writers try to make their mark,\\nHoping their works shine bright in the dark.\\n\\nThe cycle repeats again and again,\\nAs humanity struggles to learn from its pain.\\nThough the players may change on history's stage,\\nThe themes stay the same from age to age.\\n\\nWar and peace, life and death,\\nLove and strife with every breath.\\nThe tides of time continue their dance,\\nAs we join in, by luck or by chance.\\n\\nSo we study the past to light the way forward, \\nHeeding warnings from stories told and heard.\\nThe future unfolds from this unending flow -\\nWhere the tides of time ultimately go.\", role='assistant'))], created=1713896579, model='https://api.clarifai.com/v2/users/anthropic/apps/completion/models/claude-2_1/outputs', object='chat.completion', system_fingerprint=None, usage=Usage(prompt_tokens=12, completion_tokens=232, total_tokens=244))\n"
]
}
],
"source": [
"from litellm import completion\n",
"\n",
"messages = [{\"role\": \"user\",\"content\": \"\"\"Write a poem about history?\"\"\"}]\n",
"response=completion(\n",
" model=\"clarifai/anthropic.completion.claude-2_1\",\n",
" messages=messages,\n",
" )\n",
"\n",
"print(f\"Claude-2.1 response : {response}\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### OpenAI GPT-4 (Streaming)\n",
"Though clarifai doesn't support streaming, still you can call stream and get the response in standard StreamResponse format of liteLLM"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"ModelResponse(id='chatcmpl-40ae19af-3bf0-4eb4-99f2-33aec3ba84af', choices=[StreamingChoices(finish_reason=None, index=0, delta=Delta(content=\"In the quiet corners of time's grand hall,\\nLies the tale of rise and fall.\\nFrom ancient ruins to modern sprawl,\\nHistory, the greatest story of them all.\\n\\nEmpires have risen, empires have decayed,\\nThrough the eons, memories have stayed.\\nIn the book of time, history is laid,\\nA tapestry of events, meticulously displayed.\\n\\nThe pyramids of Egypt, standing tall,\\nThe Roman Empire's mighty sprawl.\\nFrom Alexander's conquest, to the Berlin Wall,\\nHistory, a silent witness to it all.\\n\\nIn the shadow of the past we tread,\\nWhere once kings and prophets led.\\nTheir stories in our hearts are spread,\\nEchoes of their words, in our minds are read.\\n\\nBattles fought and victories won,\\nActs of courage under the sun.\\nTales of love, of deeds done,\\nIn history's grand book, they all run.\\n\\nHeroes born, legends made,\\nIn the annals of time, they'll never fade.\\nTheir triumphs and failures all displayed,\\nIn the eternal march of history's parade.\\n\\nThe ink of the past is forever dry,\\nBut its lessons, we cannot deny.\\nIn its stories, truths lie,\\nIn its wisdom, we rely.\\n\\nHistory, a mirror to our past,\\nA guide for the future vast.\\nThrough its lens, we're ever cast,\\nIn the drama of life, forever vast.\", role='assistant', function_call=None, tool_calls=None), logprobs=None)], created=1714744515, model='https://api.clarifai.com/v2/users/openai/apps/chat-completion/models/GPT-4/outputs', object='chat.completion.chunk', system_fingerprint=None)\n",
"ModelResponse(id='chatcmpl-40ae19af-3bf0-4eb4-99f2-33aec3ba84af', choices=[StreamingChoices(finish_reason='stop', index=0, delta=Delta(content=None, role=None, function_call=None, tool_calls=None), logprobs=None)], created=1714744515, model='https://api.clarifai.com/v2/users/openai/apps/chat-completion/models/GPT-4/outputs', object='chat.completion.chunk', system_fingerprint=None)\n"
]
}
],
"source": [
"from litellm import completion\n",
"\n",
"messages = [{\"role\": \"user\",\"content\": \"\"\"Write a poem about history?\"\"\"}]\n",
"response = completion(\n",
" model=\"clarifai/openai.chat-completion.GPT-4\",\n",
" messages=messages,\n",
" stream=True,\n",
" api_key = \"c75cc032415e45368be331fdd2c06db0\")\n",
"\n",
"for chunk in response:\n",
" print(chunk)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.13"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

Binary file not shown.

View file

@ -0,0 +1,15 @@
{
"$schema": "https://schema.management.azure.com/schemas/0.1.2-preview/CreateUIDefinition.MultiVm.json#",
"handler": "Microsoft.Azure.CreateUIDef",
"version": "0.1.2-preview",
"parameters": {
"config": {
"isWizard": false,
"basics": { }
},
"basics": [ ],
"steps": [ ],
"outputs": { },
"resourceTypes": [ ]
}
}

View file

@ -0,0 +1,63 @@
{
"$schema": "https://schema.management.azure.com/schemas/2019-04-01/deploymentTemplate.json#",
"contentVersion": "1.0.0.0",
"parameters": {
"imageName": {
"type": "string",
"defaultValue": "ghcr.io/berriai/litellm:main-latest"
},
"containerName": {
"type": "string",
"defaultValue": "litellm-container"
},
"dnsLabelName": {
"type": "string",
"defaultValue": "litellm"
},
"portNumber": {
"type": "int",
"defaultValue": 4000
}
},
"resources": [
{
"type": "Microsoft.ContainerInstance/containerGroups",
"apiVersion": "2021-03-01",
"name": "[parameters('containerName')]",
"location": "[resourceGroup().location]",
"properties": {
"containers": [
{
"name": "[parameters('containerName')]",
"properties": {
"image": "[parameters('imageName')]",
"resources": {
"requests": {
"cpu": 1,
"memoryInGB": 2
}
},
"ports": [
{
"port": "[parameters('portNumber')]"
}
]
}
}
],
"osType": "Linux",
"restartPolicy": "Always",
"ipAddress": {
"type": "Public",
"ports": [
{
"protocol": "tcp",
"port": "[parameters('portNumber')]"
}
],
"dnsNameLabel": "[parameters('dnsLabelName')]"
}
}
}
]
}

View file

@ -0,0 +1,42 @@
param imageName string = 'ghcr.io/berriai/litellm:main-latest'
param containerName string = 'litellm-container'
param dnsLabelName string = 'litellm'
param portNumber int = 4000
resource containerGroupName 'Microsoft.ContainerInstance/containerGroups@2021-03-01' = {
name: containerName
location: resourceGroup().location
properties: {
containers: [
{
name: containerName
properties: {
image: imageName
resources: {
requests: {
cpu: 1
memoryInGB: 2
}
}
ports: [
{
port: portNumber
}
]
}
}
]
osType: 'Linux'
restartPolicy: 'Always'
ipAddress: {
type: 'Public'
ports: [
{
protocol: 'tcp'
port: portNumber
}
]
dnsNameLabel: dnsLabelName
}
}
}

View file

@ -24,7 +24,7 @@ version: 0.2.0
# incremented each time you make changes to the application. Versions are not expected to
# follow Semantic Versioning. They should reflect the version the application is using.
# It is recommended to use it with quotes.
appVersion: v1.24.5
appVersion: v1.35.38
dependencies:
- name: "postgresql"

View file

@ -1,7 +1,7 @@
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
# Caching - In-Memory, Redis, s3, Redis Semantic Cache
# Caching - In-Memory, Redis, s3, Redis Semantic Cache, Disk
[**See Code**](https://github.com/BerriAI/litellm/blob/main/litellm/caching.py)
@ -11,7 +11,7 @@ Need to use Caching on LiteLLM Proxy Server? Doc here: [Caching Proxy Server](ht
:::
## Initialize Cache - In Memory, Redis, s3 Bucket, Redis Semantic Cache
## Initialize Cache - In Memory, Redis, s3 Bucket, Redis Semantic, Disk Cache
<Tabs>
@ -159,7 +159,7 @@ litellm.cache = Cache()
# Make completion calls
response1 = completion(
model="gpt-3.5-turbo",
messages=[{"role": "user", "content": "Tell me a joke."}]
messages=[{"role": "user", "content": "Tell me a joke."}],
caching=True
)
response2 = completion(
@ -174,6 +174,43 @@ response2 = completion(
</TabItem>
<TabItem value="disk" label="disk cache">
### Quick Start
Install diskcache:
```shell
pip install diskcache
```
Then you can use the disk cache as follows.
```python
import litellm
from litellm import completion
from litellm.caching import Cache
litellm.cache = Cache(type="disk")
# Make completion calls
response1 = completion(
model="gpt-3.5-turbo",
messages=[{"role": "user", "content": "Tell me a joke."}],
caching=True
)
response2 = completion(
model="gpt-3.5-turbo",
messages=[{"role": "user", "content": "Tell me a joke."}],
caching=True
)
# response1 == response2, response 1 is cached
```
If you run the code two times, response1 will use the cache from the first run that was stored in a cache file.
</TabItem>
</Tabs>
@ -191,13 +228,13 @@ Advanced Params
```python
litellm.enable_cache(
type: Optional[Literal["local", "redis"]] = "local",
type: Optional[Literal["local", "redis", "s3", "disk"]] = "local",
host: Optional[str] = None,
port: Optional[str] = None,
password: Optional[str] = None,
supported_call_types: Optional[
List[Literal["completion", "acompletion", "embedding", "aembedding"]]
] = ["completion", "acompletion", "embedding", "aembedding"],
List[Literal["completion", "acompletion", "embedding", "aembedding", "atranscription", "transcription"]]
] = ["completion", "acompletion", "embedding", "aembedding", "atranscription", "transcription"],
**kwargs,
)
```
@ -215,13 +252,13 @@ Update the Cache params
```python
litellm.update_cache(
type: Optional[Literal["local", "redis"]] = "local",
type: Optional[Literal["local", "redis", "s3", "disk"]] = "local",
host: Optional[str] = None,
port: Optional[str] = None,
password: Optional[str] = None,
supported_call_types: Optional[
List[Literal["completion", "acompletion", "embedding", "aembedding"]]
] = ["completion", "acompletion", "embedding", "aembedding"],
List[Literal["completion", "acompletion", "embedding", "aembedding", "atranscription", "transcription"]]
] = ["completion", "acompletion", "embedding", "aembedding", "atranscription", "transcription"],
**kwargs,
)
```
@ -276,22 +313,29 @@ cache.get_cache = get_cache
```python
def __init__(
self,
type: Optional[Literal["local", "redis", "s3"]] = "local",
type: Optional[Literal["local", "redis", "redis-semantic", "s3", "disk"]] = "local",
supported_call_types: Optional[
List[Literal["completion", "acompletion", "embedding", "aembedding"]]
] = ["completion", "acompletion", "embedding", "aembedding"], # A list of litellm call types to cache for. Defaults to caching for all litellm call types.
List[Literal["completion", "acompletion", "embedding", "aembedding", "atranscription", "transcription"]]
] = ["completion", "acompletion", "embedding", "aembedding", "atranscription", "transcription"],
ttl: Optional[float] = None,
default_in_memory_ttl: Optional[float] = None,
# redis cache params
host: Optional[str] = None,
port: Optional[str] = None,
password: Optional[str] = None,
namespace: Optional[str] = None,
default_in_redis_ttl: Optional[float] = None,
similarity_threshold: Optional[float] = None,
redis_semantic_cache_use_async=False,
redis_semantic_cache_embedding_model="text-embedding-ada-002",
redis_flush_size=None,
# s3 Bucket, boto3 configuration
s3_bucket_name: Optional[str] = None,
s3_region_name: Optional[str] = None,
s3_api_version: Optional[str] = None,
s3_path: Optional[str] = None, # if you wish to save to a spefic path
s3_path: Optional[str] = None, # if you wish to save to a specific path
s3_use_ssl: Optional[bool] = True,
s3_verify: Optional[Union[bool, str]] = None,
s3_endpoint_url: Optional[str] = None,
@ -299,7 +343,11 @@ def __init__(
s3_aws_secret_access_key: Optional[str] = None,
s3_aws_session_token: Optional[str] = None,
s3_config: Optional[Any] = None,
**kwargs,
# disk cache params
disk_cache_dir=None,
**kwargs
):
```

View file

@ -40,7 +40,7 @@ cache = Cache()
cache.add_cache(cache_key="test-key", result="1234")
cache.get_cache(cache_key="test-key)
cache.get_cache(cache_key="test-key")
```
## Caching with Streaming

View file

@ -4,6 +4,12 @@ LiteLLM allows you to:
* Send 1 completion call to many models: Return Fastest Response
* Send 1 completion call to many models: Return All Responses
:::info
Trying to do batch completion on LiteLLM Proxy ? Go here: https://docs.litellm.ai/docs/proxy/user_keys#beta-batch-completions---pass-model-as-list
:::
## Send multiple completion calls to 1 model
In the batch_completion method, you provide a list of `messages` where each sub-list of messages is passed to `litellm.completion()`, allowing you to process multiple prompts efficiently in a single API call.

View file

@ -37,11 +37,12 @@ print(response) # ["max_tokens", "tools", "tool_choice", "stream"]
This is a list of openai params we translate across providers.
This list is constantly being updated.
Use `litellm.get_supported_openai_params()` for an updated list of params for each model + provider
| Provider | temperature | max_tokens | top_p | stream | stop | n | presence_penalty | frequency_penalty | functions | function_call | logit_bias | user | response_format | seed | tools | tool_choice | logprobs | top_logprobs | extra_headers |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|--|
|Anthropic| ✅ | ✅ | ✅ | ✅ | ✅ | | | | | |
|Anthropic| ✅ | ✅ | ✅ | ✅ | ✅ | | | | | | | | | | ✅ | ✅ |
|Anthropic| ✅ | ✅ | ✅ | ✅ | ✅ | | | | | | | | ✅ | ✅ | ✅ | ✅ |
|OpenAI| ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |✅ | ✅ | ✅ | ✅ |✅ | ✅ | ✅ | ✅ | ✅ |
|Azure OpenAI| ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |✅ | ✅ | ✅ | ✅ |✅ | ✅ | | | ✅ |
|Replicate | ✅ | ✅ | ✅ | ✅ | ✅ | | | | | |
@ -83,8 +84,9 @@ def completion(
top_p: Optional[float] = None,
n: Optional[int] = None,
stream: Optional[bool] = None,
stream_options: Optional[dict] = None,
stop=None,
max_tokens: Optional[float] = None,
max_tokens: Optional[int] = None,
presence_penalty: Optional[float] = None,
frequency_penalty: Optional[float] = None,
logit_bias: Optional[dict] = None,
@ -139,6 +141,10 @@ def completion(
- `stream`: *boolean or null (optional)* - If set to true, it sends partial message deltas. Tokens will be sent as they become available, with the stream terminated by a [DONE] message.
- `stream_options` *dict or null (optional)* - Options for streaming response. Only set this when you set `stream: true`
- `include_usage` *boolean (optional)* - If set, an additional chunk will be streamed before the data: [DONE] message. The usage field on this chunk shows the token usage statistics for the entire request, and the choices field will always be an empty array. All other chunks will also include a usage field, but with a null value.
- `stop`: *string/ array/ null (optional)* - Up to 4 sequences where the API will stop generating further tokens.
- `max_tokens`: *integer (optional)* - The maximum number of tokens to generate in the chat completion.

View file

@ -1,7 +1,7 @@
# Completion Token Usage & Cost
By default LiteLLM returns token usage in all completion requests ([See here](https://litellm.readthedocs.io/en/latest/output/))
However, we also expose 5 helper functions + **[NEW]** an API to calculate token usage across providers:
However, we also expose some helper functions + **[NEW]** an API to calculate token usage across providers:
- `encode`: This encodes the text passed in, using the model-specific tokenizer. [**Jump to code**](#1-encode)
@ -9,17 +9,19 @@ However, we also expose 5 helper functions + **[NEW]** an API to calculate token
- `token_counter`: This returns the number of tokens for a given input - it uses the tokenizer based on the model, and defaults to tiktoken if no model-specific tokenizer is available. [**Jump to code**](#3-token_counter)
- `cost_per_token`: This returns the cost (in USD) for prompt (input) and completion (output) tokens. Uses the live list from `api.litellm.ai`. [**Jump to code**](#4-cost_per_token)
- `create_pretrained_tokenizer` and `create_tokenizer`: LiteLLM provides default tokenizer support for OpenAI, Cohere, Anthropic, Llama2, and Llama3 models. If you are using a different model, you can create a custom tokenizer and pass it as `custom_tokenizer` to the `encode`, `decode`, and `token_counter` methods. [**Jump to code**](#4-create_pretrained_tokenizer-and-create_tokenizer)
- `completion_cost`: This returns the overall cost (in USD) for a given LLM API Call. It combines `token_counter` and `cost_per_token` to return the cost for that query (counting both cost of input and output). [**Jump to code**](#5-completion_cost)
- `cost_per_token`: This returns the cost (in USD) for prompt (input) and completion (output) tokens. Uses the live list from `api.litellm.ai`. [**Jump to code**](#5-cost_per_token)
- `get_max_tokens`: This returns the maximum number of tokens allowed for the given model. [**Jump to code**](#6-get_max_tokens)
- `completion_cost`: This returns the overall cost (in USD) for a given LLM API Call. It combines `token_counter` and `cost_per_token` to return the cost for that query (counting both cost of input and output). [**Jump to code**](#6-completion_cost)
- `model_cost`: This returns a dictionary for all models, with their max_tokens, input_cost_per_token and output_cost_per_token. It uses the `api.litellm.ai` call shown below. [**Jump to code**](#7-model_cost)
- `get_max_tokens`: This returns the maximum number of tokens allowed for the given model. [**Jump to code**](#7-get_max_tokens)
- `register_model`: This registers new / overrides existing models (and their pricing details) in the model cost dictionary. [**Jump to code**](#8-register_model)
- `model_cost`: This returns a dictionary for all models, with their max_tokens, input_cost_per_token and output_cost_per_token. It uses the `api.litellm.ai` call shown below. [**Jump to code**](#8-model_cost)
- `api.litellm.ai`: Live token + price count across [all supported models](https://github.com/BerriAI/litellm/blob/main/model_prices_and_context_window.json). [**Jump to code**](#9-apilitellmai)
- `register_model`: This registers new / overrides existing models (and their pricing details) in the model cost dictionary. [**Jump to code**](#9-register_model)
- `api.litellm.ai`: Live token + price count across [all supported models](https://github.com/BerriAI/litellm/blob/main/model_prices_and_context_window.json). [**Jump to code**](#10-apilitellmai)
📣 This is a community maintained list. Contributions are welcome! ❤️
@ -60,7 +62,24 @@ messages = [{"user": "role", "content": "Hey, how's it going"}]
print(token_counter(model="gpt-3.5-turbo", messages=messages))
```
### 4. `cost_per_token`
### 4. `create_pretrained_tokenizer` and `create_tokenizer`
```python
from litellm import create_pretrained_tokenizer, create_tokenizer
# get tokenizer from huggingface repo
custom_tokenizer_1 = create_pretrained_tokenizer("Xenova/llama-3-tokenizer")
# use tokenizer from json file
with open("tokenizer.json") as f:
json_data = json.load(f)
json_str = json.dumps(json_data)
custom_tokenizer_2 = create_tokenizer(json_str)
```
### 5. `cost_per_token`
```python
from litellm import cost_per_token
@ -72,7 +91,7 @@ prompt_tokens_cost_usd_dollar, completion_tokens_cost_usd_dollar = cost_per_toke
print(prompt_tokens_cost_usd_dollar, completion_tokens_cost_usd_dollar)
```
### 5. `completion_cost`
### 6. `completion_cost`
* Input: Accepts a `litellm.completion()` response **OR** prompt + completion strings
* Output: Returns a `float` of cost for the `completion` call
@ -99,7 +118,7 @@ cost = completion_cost(model="bedrock/anthropic.claude-v2", prompt="Hey!", compl
formatted_string = f"${float(cost):.10f}"
print(formatted_string)
```
### 6. `get_max_tokens`
### 7. `get_max_tokens`
Input: Accepts a model name - e.g., gpt-3.5-turbo (to get a complete list, call litellm.model_list).
Output: Returns the maximum number of tokens allowed for the given model
@ -112,7 +131,7 @@ model = "gpt-3.5-turbo"
print(get_max_tokens(model)) # Output: 4097
```
### 7. `model_cost`
### 8. `model_cost`
* Output: Returns a dict object containing the max_tokens, input_cost_per_token, output_cost_per_token for all models on [community-maintained list](https://github.com/BerriAI/litellm/blob/main/model_prices_and_context_window.json)
@ -122,7 +141,7 @@ from litellm import model_cost
print(model_cost) # {'gpt-3.5-turbo': {'max_tokens': 4000, 'input_cost_per_token': 1.5e-06, 'output_cost_per_token': 2e-06}, ...}
```
### 8. `register_model`
### 9. `register_model`
* Input: Provide EITHER a model cost dictionary or a url to a hosted json blob
* Output: Returns updated model_cost dictionary + updates litellm.model_cost with model details.
@ -157,5 +176,3 @@ export LITELLM_LOCAL_MODEL_COST_MAP="True"
```
Note: this means you will need to upgrade to get updated pricing, and newer models.

View file

@ -0,0 +1,45 @@
# Using Vision Models
## Quick Start
Example passing images to a model
```python
import os
from litellm import completion
os.environ["OPENAI_API_KEY"] = "your-api-key"
# openai call
response = completion(
model = "gpt-4-vision-preview",
messages=[
{
"role": "user",
"content": [
{
"type": "text",
"text": "Whats in this image?"
},
{
"type": "image_url",
"image_url": {
"url": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"
}
}
]
}
],
)
```
## Checking if a model supports `vision`
Use `litellm.supports_vision(model="")` -> returns `True` if model supports `vision` and `False` if not
```python
assert litellm.supports_vision(model="gpt-4-vision-preview") == True
assert litellm.supports_vision(model="gemini-1.0-pro-visionn") == True
assert litellm.supports_vision(model="gpt-3.5-turbo") == False
```

View file

@ -23,6 +23,14 @@ response = completion(model="gpt-3.5-turbo", messages=messages)
response = completion("command-nightly", messages)
```
## JSON Logs
If you need to store the logs as JSON, just set the `litellm.json_logs = True`.
We currently just log the raw POST request from litellm as a JSON - [**See Code**].
[Share feedback here](https://github.com/BerriAI/litellm/issues)
## Logger Function
But sometimes all you care about is seeing exactly what's getting sent to your api call and what's being returned - e.g. if the api call is failing, why is that happening? what are the exact params being set?

View file

@ -320,8 +320,6 @@ from litellm import embedding
litellm.vertex_project = "hardy-device-38811" # Your Project ID
litellm.vertex_location = "us-central1" # proj location
os.environ['VOYAGE_API_KEY'] = ""
response = embedding(
model="vertex_ai/textembedding-gecko",
input=["good morning from litellm"],
@ -339,6 +337,8 @@ All models listed [here](https://github.com/BerriAI/litellm/blob/57f37f743886a02
| textembedding-gecko-multilingual@001 | `embedding(model="vertex_ai/textembedding-gecko-multilingual@001", input)` |
| textembedding-gecko@001 | `embedding(model="vertex_ai/textembedding-gecko@001", input)` |
| textembedding-gecko@003 | `embedding(model="vertex_ai/textembedding-gecko@003", input)` |
| text-embedding-preview-0409 | `embedding(model="vertex_ai/text-embedding-preview-0409", input)` |
| text-multilingual-embedding-preview-0409 | `embedding(model="vertex_ai/text-multilingual-embedding-preview-0409", input)` |
## Voyage AI Embedding Models

View file

@ -8,14 +8,23 @@ For companies that need SSO, user management and professional support for LiteLL
:::
This covers:
- ✅ **Features under the [LiteLLM Commercial License](https://docs.litellm.ai/docs/proxy/enterprise):**
- ✅ **Features under the [LiteLLM Commercial License (Content Mod, Custom Tags, etc.)](https://docs.litellm.ai/docs/proxy/enterprise)**
- ✅ **Feature Prioritization**
- ✅ **Custom Integrations**
- ✅ **Professional Support - Dedicated discord + slack**
- ✅ **Custom SLAs**
- ✅ **Secure access with Single Sign-On**
- ✅ [**Secure UI access with Single Sign-On**](../docs/proxy/ui.md#setup-ssoauth-for-ui)
- ✅ [**JWT-Auth**](../docs/proxy/token_auth.md)
## [COMING SOON] AWS Marketplace Support
Deploy managed LiteLLM Proxy within your VPC.
Includes all enterprise features.
[**Get early access**](https://calendly.com/d/4mp-gd3-k5k/litellm-1-1-onboarding-chat)
## Frequently Asked Questions
### What topics does Professional support cover and what SLAs do you offer?

View file

@ -13,7 +13,7 @@ LiteLLM maps exceptions across all providers to their OpenAI counterparts.
| >=500 | InternalServerError |
| N/A | ContextWindowExceededError|
| 400 | ContentPolicyViolationError|
| N/A | APIConnectionError |
| 500 | APIConnectionError |
Base case we return APIConnectionError
@ -74,6 +74,28 @@ except Exception as e:
```
## Usage - Should you retry exception?
```
import litellm
import openai
try:
response = litellm.completion(
model="gpt-4",
messages=[
{
"role": "user",
"content": "hello, write a 20 pageg essay"
}
],
timeout=0.01, # this will raise a timeout exception
)
except openai.APITimeoutError as e:
should_retry = litellm._should_retry(e.status_code)
print(f"should_retry: {should_retry}")
```
## Details
To see how it's implemented - [check out the code](https://github.com/BerriAI/litellm/blob/a42c197e5a6de56ea576c73715e6c7c6b19fa249/litellm/utils.py#L1217)
@ -84,23 +106,37 @@ To see how it's implemented - [check out the code](https://github.com/BerriAI/li
## Custom mapping list
Base case - we return the original exception.
Base case - we return `litellm.APIConnectionError` exception (inherits from openai's APIConnectionError exception).
| | ContextWindowExceededError | AuthenticationError | InvalidRequestError | RateLimitError | ServiceUnavailableError |
|---------------|----------------------------|---------------------|---------------------|---------------|-------------------------|
| Anthropic | ✅ | ✅ | ✅ | ✅ | |
| OpenAI | ✅ | ✅ |✅ |✅ |✅|
| Azure OpenAI | ✅ | ✅ |✅ |✅ |✅|
| Replicate | ✅ | ✅ | ✅ | ✅ | ✅ |
| Cohere | ✅ | ✅ | ✅ | ✅ | ✅ |
| Huggingface | ✅ | ✅ | ✅ | ✅ | |
| Openrouter | ✅ | ✅ | ✅ | ✅ | |
| AI21 | ✅ | ✅ | ✅ | ✅ | |
| VertexAI | | |✅ | | |
| Bedrock | | |✅ | | |
| Sagemaker | | |✅ | | |
| TogetherAI | ✅ | ✅ | ✅ | ✅ | |
| AlephAlpha | ✅ | ✅ | ✅ | ✅ | ✅ |
| custom_llm_provider | Timeout | ContextWindowExceededError | BadRequestError | NotFoundError | ContentPolicyViolationError | AuthenticationError | APIError | RateLimitError | ServiceUnavailableError | PermissionDeniedError | UnprocessableEntityError |
|----------------------------|---------|----------------------------|------------------|---------------|-----------------------------|---------------------|----------|----------------|-------------------------|-----------------------|-------------------------|
| openai | ✓ | ✓ | ✓ | | ✓ | ✓ | | | | | |
| watsonx | | | | | | | |✓| | | |
| text-completion-openai | ✓ | ✓ | ✓ | | ✓ | ✓ | | | | | |
| custom_openai | ✓ | ✓ | ✓ | | ✓ | ✓ | | | | | |
| openai_compatible_providers| ✓ | ✓ | ✓ | | ✓ | ✓ | | | | | |
| anthropic | ✓ | ✓ | ✓ | ✓ | | ✓ | | | ✓ | ✓ | |
| replicate | ✓ | ✓ | ✓ | ✓ | | ✓ | | ✓ | ✓ | | |
| bedrock | ✓ | ✓ | ✓ | ✓ | | ✓ | | ✓ | ✓ | ✓ | |
| sagemaker | | ✓ | ✓ | | | | | | | | |
| vertex_ai | ✓ | | ✓ | | | | ✓ | | | | ✓ |
| palm | ✓ | ✓ | | | | | ✓ | | | | |
| gemini | ✓ | ✓ | | | | | ✓ | | | | |
| cloudflare | | | ✓ | | | ✓ | | | | | |
| cohere | | ✓ | ✓ | | | ✓ | | | ✓ | | |
| cohere_chat | | ✓ | ✓ | | | ✓ | | | ✓ | | |
| huggingface | ✓ | ✓ | ✓ | | | ✓ | | ✓ | ✓ | | |
| ai21 | ✓ | ✓ | ✓ | ✓ | | ✓ | | ✓ | | | |
| nlp_cloud | ✓ | ✓ | ✓ | | | ✓ | ✓ | ✓ | ✓ | | |
| together_ai | ✓ | ✓ | ✓ | | | ✓ | | | | | |
| aleph_alpha | | | ✓ | | | ✓ | | | | | |
| ollama | ✓ | | ✓ | | | | | | ✓ | | |
| ollama_chat | ✓ | | ✓ | | | | | | ✓ | | |
| vllm | | | | | | ✓ | ✓ | | | | |
| azure | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | | | ✓ | | |
- "✓" indicates that the specified `custom_llm_provider` can raise the corresponding exception.
- Empty cells indicate the lack of association or that the provider does not raise that particular exception type as indicated by the function.
> For a deeper understanding of these exceptions, you can check out [this](https://github.com/BerriAI/litellm/blob/d7e58d13bf9ba9edbab2ab2f096f3de7547f35fa/litellm/utils.py#L1544) implementation for additional insights.

View file

@ -0,0 +1,58 @@
import Image from '@theme/IdealImage';
# Hosted LiteLLM Proxy
LiteLLM maintains the proxy, so you can focus on your core products.
## [**Get Onboarded**](https://calendly.com/d/4mp-gd3-k5k/litellm-1-1-onboarding-chat)
This is in alpha. Schedule a call with us, and we'll give you a hosted proxy within 30 minutes.
[**🚨 Schedule Call**](https://calendly.com/d/4mp-gd3-k5k/litellm-1-1-onboarding-chat)
### **Status**: Alpha
Our proxy is already used in production by customers.
See our status page for [**live reliability**](https://status.litellm.ai/)
### **Benefits**
- **No Maintenance, No Infra**: We'll maintain the proxy, and spin up any additional infrastructure (e.g.: separate server for spend logs) to make sure you can load balance + track spend across multiple LLM projects.
- **Reliable**: Our hosted proxy is tested on 1k requests per second, making it reliable for high load.
- **Secure**: LiteLLM is currently undergoing SOC-2 compliance, to make sure your data is as secure as possible.
### Pricing
Pricing is based on usage. We can figure out a price that works for your team, on the call.
[**🚨 Schedule Call**](https://calendly.com/d/4mp-gd3-k5k/litellm-1-1-onboarding-chat)
## **Screenshots**
### 1. Create keys
<Image img={require('../img/litellm_hosted_ui_create_key.png')} />
### 2. Add Models
<Image img={require('../img/litellm_hosted_ui_add_models.png')}/>
### 3. Track spend
<Image img={require('../img/litellm_hosted_usage_dashboard.png')} />
### 4. Configure load balancing
<Image img={require('../img/litellm_hosted_ui_router.png')} />
#### [**🚨 Schedule Call**](https://calendly.com/d/4mp-gd3-k5k/litellm-1-1-onboarding-chat)
## Feature List
- Easy way to add/remove models
- 100% uptime even when models are added/removed
- custom callback webhooks
- your domain name with HTTPS
- Ability to create/delete User API keys
- Reasonable set monthly cost

View file

@ -14,14 +14,14 @@ import TabItem from '@theme/TabItem';
```python
import os
from langchain.chat_models import ChatLiteLLM
from langchain.prompts.chat import (
from langchain_community.chat_models import ChatLiteLLM
from langchain_core.prompts import (
ChatPromptTemplate,
SystemMessagePromptTemplate,
AIMessagePromptTemplate,
HumanMessagePromptTemplate,
)
from langchain.schema import AIMessage, HumanMessage, SystemMessage
from langchain_core.messages import AIMessage, HumanMessage, SystemMessage
os.environ['OPENAI_API_KEY'] = ""
chat = ChatLiteLLM(model="gpt-3.5-turbo")
@ -30,7 +30,7 @@ messages = [
content="what model are you"
)
]
chat(messages)
chat.invoke(messages)
```
</TabItem>
@ -39,14 +39,14 @@ chat(messages)
```python
import os
from langchain.chat_models import ChatLiteLLM
from langchain.prompts.chat import (
from langchain_community.chat_models import ChatLiteLLM
from langchain_core.prompts import (
ChatPromptTemplate,
SystemMessagePromptTemplate,
AIMessagePromptTemplate,
HumanMessagePromptTemplate,
)
from langchain.schema import AIMessage, HumanMessage, SystemMessage
from langchain_core.messages import AIMessage, HumanMessage, SystemMessage
os.environ['ANTHROPIC_API_KEY'] = ""
chat = ChatLiteLLM(model="claude-2", temperature=0.3)
@ -55,7 +55,7 @@ messages = [
content="what model are you"
)
]
chat(messages)
chat.invoke(messages)
```
</TabItem>
@ -64,14 +64,14 @@ chat(messages)
```python
import os
from langchain.chat_models import ChatLiteLLM
from langchain.prompts.chat import (
from langchain_community.chat_models import ChatLiteLLM
from langchain_core.prompts.chat import (
ChatPromptTemplate,
SystemMessagePromptTemplate,
AIMessagePromptTemplate,
HumanMessagePromptTemplate,
)
from langchain.schema import AIMessage, HumanMessage, SystemMessage
from langchain_core.messages import AIMessage, HumanMessage, SystemMessage
os.environ['REPLICATE_API_TOKEN'] = ""
chat = ChatLiteLLM(model="replicate/llama-2-70b-chat:2c1608e18606fad2812020dc541930f2d0495ce32eee50074220b87300bc16e1")
@ -80,7 +80,7 @@ messages = [
content="what model are you?"
)
]
chat(messages)
chat.invoke(messages)
```
</TabItem>
@ -89,14 +89,14 @@ chat(messages)
```python
import os
from langchain.chat_models import ChatLiteLLM
from langchain.prompts.chat import (
from langchain_community.chat_models import ChatLiteLLM
from langchain_core.prompts import (
ChatPromptTemplate,
SystemMessagePromptTemplate,
AIMessagePromptTemplate,
HumanMessagePromptTemplate,
)
from langchain.schema import AIMessage, HumanMessage, SystemMessage
from langchain_core.messages import AIMessage, HumanMessage, SystemMessage
os.environ['COHERE_API_KEY'] = ""
chat = ChatLiteLLM(model="command-nightly")
@ -105,32 +105,9 @@ messages = [
content="what model are you?"
)
]
chat(messages)
chat.invoke(messages)
```
</TabItem>
<TabItem value="palm" label="PaLM - Google">
```python
import os
from langchain.chat_models import ChatLiteLLM
from langchain.prompts.chat import (
ChatPromptTemplate,
SystemMessagePromptTemplate,
AIMessagePromptTemplate,
HumanMessagePromptTemplate,
)
from langchain.schema import AIMessage, HumanMessage, SystemMessage
os.environ['PALM_API_KEY'] = ""
chat = ChatLiteLLM(model="palm/chat-bison")
messages = [
HumanMessage(
content="what model are you?"
)
]
chat(messages)
```
</TabItem>
</Tabs>

View file

@ -213,3 +213,349 @@ asyncio.run(loadtest_fn())
```
## Multi-Instance TPM/RPM Load Test (Router)
Test if your defined tpm/rpm limits are respected across multiple instances of the Router object.
In our test:
- Max RPM per deployment is = 100 requests per minute
- Max Throughput / min on router = 200 requests per minute (2 deployments)
- Load we'll send through router = 600 requests per minute
:::info
If you don't want to call a real LLM API endpoint, you can setup a fake openai server. [See code](#extra---setup-fake-openai-server)
:::
### Code
Let's hit the router with 600 requests per minute.
Copy this script 👇. Save it as `test_loadtest_router.py` AND run it with `python3 test_loadtest_router.py`
```python
from litellm import Router
import litellm
litellm.suppress_debug_info = True
litellm.set_verbose = False
import logging
logging.basicConfig(level=logging.CRITICAL)
import os, random, uuid, time, asyncio
# Model list for OpenAI and Anthropic models
model_list = [
{
"model_name": "fake-openai-endpoint",
"litellm_params": {
"model": "gpt-3.5-turbo",
"api_key": "my-fake-key",
"api_base": "http://0.0.0.0:8080",
"rpm": 100
},
},
{
"model_name": "fake-openai-endpoint",
"litellm_params": {
"model": "gpt-3.5-turbo",
"api_key": "my-fake-key",
"api_base": "http://0.0.0.0:8081",
"rpm": 100
},
},
]
router_1 = Router(model_list=model_list, num_retries=0, enable_pre_call_checks=True, routing_strategy="usage-based-routing-v2", redis_host=os.getenv("REDIS_HOST"), redis_port=os.getenv("REDIS_PORT"), redis_password=os.getenv("REDIS_PASSWORD"))
router_2 = Router(model_list=model_list, num_retries=0, routing_strategy="usage-based-routing-v2", enable_pre_call_checks=True, redis_host=os.getenv("REDIS_HOST"), redis_port=os.getenv("REDIS_PORT"), redis_password=os.getenv("REDIS_PASSWORD"))
async def router_completion_non_streaming():
try:
client: Router = random.sample([router_1, router_2], 1)[0] # randomly pick b/w clients
# print(f"client={client}")
response = await client.acompletion(
model="fake-openai-endpoint", # [CHANGE THIS] (if you call it something else on your proxy)
messages=[{"role": "user", "content": f"This is a test: {uuid.uuid4()}"}],
)
return response
except Exception as e:
# print(e)
return None
async def loadtest_fn():
start = time.time()
n = 600 # Number of concurrent tasks
tasks = [router_completion_non_streaming() for _ in range(n)]
chat_completions = await asyncio.gather(*tasks)
successful_completions = [c for c in chat_completions if c is not None]
print(n, time.time() - start, len(successful_completions))
def get_utc_datetime():
import datetime as dt
from datetime import datetime
if hasattr(dt, "UTC"):
return datetime.now(dt.UTC) # type: ignore
else:
return datetime.utcnow() # type: ignore
# Run the event loop to execute the async function
async def parent_fn():
for _ in range(10):
dt = get_utc_datetime()
current_minute = dt.strftime("%H-%M")
print(f"triggered new batch - {current_minute}")
await loadtest_fn()
await asyncio.sleep(10)
asyncio.run(parent_fn())
```
## Multi-Instance TPM/RPM Load Test (Proxy)
Test if your defined tpm/rpm limits are respected across multiple instances.
The quickest way to do this is by testing the [proxy](./proxy/quick_start.md). The proxy uses the [router](./routing.md) under the hood, so if you're using either of them, this test should work for you.
In our test:
- Max RPM per deployment is = 100 requests per minute
- Max Throughput / min on proxy = 200 requests per minute (2 deployments)
- Load we'll send to proxy = 600 requests per minute
So we'll send 600 requests per minute, but expect only 200 requests per minute to succeed.
:::info
If you don't want to call a real LLM API endpoint, you can setup a fake openai server. [See code](#extra---setup-fake-openai-server)
:::
### 1. Setup config
```yaml
model_list:
- litellm_params:
api_base: http://0.0.0.0:8080
api_key: my-fake-key
model: openai/my-fake-model
rpm: 100
model_name: fake-openai-endpoint
- litellm_params:
api_base: http://0.0.0.0:8081
api_key: my-fake-key
model: openai/my-fake-model-2
rpm: 100
model_name: fake-openai-endpoint
router_settings:
num_retries: 0
enable_pre_call_checks: true
redis_host: os.environ/REDIS_HOST ## 👈 IMPORTANT! Setup the proxy w/ redis
redis_password: os.environ/REDIS_PASSWORD
redis_port: os.environ/REDIS_PORT
routing_strategy: usage-based-routing-v2
```
### 2. Start proxy 2 instances
**Instance 1**
```bash
litellm --config /path/to/config.yaml --port 4000
## RUNNING on http://0.0.0.0:4000
```
**Instance 2**
```bash
litellm --config /path/to/config.yaml --port 4001
## RUNNING on http://0.0.0.0:4001
```
### 3. Run Test
Let's hit the proxy with 600 requests per minute.
Copy this script 👇. Save it as `test_loadtest_proxy.py` AND run it with `python3 test_loadtest_proxy.py`
```python
from openai import AsyncOpenAI, AsyncAzureOpenAI
import random, uuid
import time, asyncio, litellm
# import logging
# logging.basicConfig(level=logging.DEBUG)
#### LITELLM PROXY ####
litellm_client = AsyncOpenAI(
api_key="sk-1234", # [CHANGE THIS]
base_url="http://0.0.0.0:4000"
)
litellm_client_2 = AsyncOpenAI(
api_key="sk-1234", # [CHANGE THIS]
base_url="http://0.0.0.0:4001"
)
async def proxy_completion_non_streaming():
try:
client = random.sample([litellm_client, litellm_client_2], 1)[0] # randomly pick b/w clients
# print(f"client={client}")
response = await client.chat.completions.create(
model="fake-openai-endpoint", # [CHANGE THIS] (if you call it something else on your proxy)
messages=[{"role": "user", "content": f"This is a test: {uuid.uuid4()}"}],
)
return response
except Exception as e:
# print(e)
return None
async def loadtest_fn():
start = time.time()
n = 600 # Number of concurrent tasks
tasks = [proxy_completion_non_streaming() for _ in range(n)]
chat_completions = await asyncio.gather(*tasks)
successful_completions = [c for c in chat_completions if c is not None]
print(n, time.time() - start, len(successful_completions))
def get_utc_datetime():
import datetime as dt
from datetime import datetime
if hasattr(dt, "UTC"):
return datetime.now(dt.UTC) # type: ignore
else:
return datetime.utcnow() # type: ignore
# Run the event loop to execute the async function
async def parent_fn():
for _ in range(10):
dt = get_utc_datetime()
current_minute = dt.strftime("%H-%M")
print(f"triggered new batch - {current_minute}")
await loadtest_fn()
await asyncio.sleep(10)
asyncio.run(parent_fn())
```
### Extra - Setup Fake OpenAI Server
Let's setup a fake openai server with a RPM limit of 100.
Let's call our file `fake_openai_server.py`.
```
# import sys, os
# sys.path.insert(
# 0, os.path.abspath("../")
# ) # Adds the parent directory to the system path
from fastapi import FastAPI, Request, status, HTTPException, Depends
from fastapi.responses import StreamingResponse
from fastapi.security import OAuth2PasswordBearer
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import JSONResponse
from fastapi import FastAPI, Request, HTTPException, UploadFile, File
import httpx, os, json
from openai import AsyncOpenAI
from typing import Optional
from slowapi import Limiter
from slowapi.util import get_remote_address
from slowapi.errors import RateLimitExceeded
from fastapi import FastAPI, Request, HTTPException
from fastapi.responses import PlainTextResponse
class ProxyException(Exception):
# NOTE: DO NOT MODIFY THIS
# This is used to map exactly to OPENAI Exceptions
def __init__(
self,
message: str,
type: str,
param: Optional[str],
code: Optional[int],
):
self.message = message
self.type = type
self.param = param
self.code = code
def to_dict(self) -> dict:
"""Converts the ProxyException instance to a dictionary."""
return {
"message": self.message,
"type": self.type,
"param": self.param,
"code": self.code,
}
limiter = Limiter(key_func=get_remote_address)
app = FastAPI()
app.state.limiter = limiter
@app.exception_handler(RateLimitExceeded)
async def _rate_limit_exceeded_handler(request: Request, exc: RateLimitExceeded):
return JSONResponse(status_code=429,
content={"detail": "Rate Limited!"})
app.add_exception_handler(RateLimitExceeded, _rate_limit_exceeded_handler)
app.add_middleware(
CORSMiddleware,
allow_origins=["*"],
allow_credentials=True,
allow_methods=["*"],
allow_headers=["*"],
)
# for completion
@app.post("/chat/completions")
@app.post("/v1/chat/completions")
@limiter.limit("100/minute")
async def completion(request: Request):
# raise HTTPException(status_code=429, detail="Rate Limited!")
return {
"id": "chatcmpl-123",
"object": "chat.completion",
"created": 1677652288,
"model": None,
"system_fingerprint": "fp_44709d6fcb",
"choices": [{
"index": 0,
"message": {
"role": "assistant",
"content": "\n\nHello there, how may I assist you today?",
},
"logprobs": None,
"finish_reason": "stop"
}],
"usage": {
"prompt_tokens": 9,
"completion_tokens": 12,
"total_tokens": 21
}
}
if __name__ == "__main__":
import socket
import uvicorn
port = 8080
while True:
sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
result = sock.connect_ex(('0.0.0.0', port))
if result != 0:
print(f"Port {port} is available, starting server...")
break
else:
port += 1
uvicorn.run(app, host="0.0.0.0", port=port)
```
```bash
python3 fake_openai_server.py
```

View file

@ -8,6 +8,7 @@ liteLLM supports:
- [Custom Callback Functions](https://docs.litellm.ai/docs/observability/custom_callback)
- [Lunary](https://lunary.ai/docs)
- [Langfuse](https://langfuse.com/docs)
- [Helicone](https://docs.helicone.ai/introduction)
- [Traceloop](https://traceloop.com/docs)
- [Athina](https://docs.athina.ai/)
@ -22,8 +23,8 @@ from litellm import completion
# set callbacks
litellm.input_callback=["sentry"] # for sentry breadcrumbing - logs the input being sent to the api
litellm.success_callback=["posthog", "helicone", "lunary", "athina"]
litellm.failure_callback=["sentry", "lunary"]
litellm.success_callback=["posthog", "helicone", "langfuse", "lunary", "athina"]
litellm.failure_callback=["sentry", "lunary", "langfuse"]
## set env variables
os.environ['SENTRY_DSN'], os.environ['SENTRY_API_TRACE_RATE']= ""
@ -32,6 +33,9 @@ os.environ["HELICONE_API_KEY"] = ""
os.environ["TRACELOOP_API_KEY"] = ""
os.environ["LUNARY_PUBLIC_KEY"] = ""
os.environ["ATHINA_API_KEY"] = ""
os.environ["LANGFUSE_PUBLIC_KEY"] = ""
os.environ["LANGFUSE_SECRET_KEY"] = ""
os.environ["LANGFUSE_HOST"] = ""
response = completion(model="gpt-3.5-turbo", messages=messages)
```

View file

@ -331,49 +331,25 @@ response = litellm.completion(model="gpt-3.5-turbo", messages=messages, metadata
## Examples
### Custom Callback to track costs for Streaming + Non-Streaming
By default, the response cost is accessible in the logging object via `kwargs["response_cost"]` on success (sync + async)
```python
# Step 1. Write your custom callback function
def track_cost_callback(
kwargs, # kwargs to completion
completion_response, # response from completion
start_time, end_time # start/end time
):
try:
# init logging config
logging.basicConfig(
filename='cost.log',
level=logging.INFO,
format='%(asctime)s - %(message)s',
datefmt='%Y-%m-%d %H:%M:%S'
)
# check if it has collected an entire stream response
if "complete_streaming_response" in kwargs:
# for tracking streaming cost we pass the "messages" and the output_text to litellm.completion_cost
completion_response=kwargs["complete_streaming_response"]
input_text = kwargs["messages"]
output_text = completion_response["choices"][0]["message"]["content"]
response_cost = litellm.completion_cost(
model = kwargs["model"],
messages = input_text,
completion=output_text
)
print("streaming response_cost", response_cost)
logging.info(f"Model {kwargs['model']} Cost: ${response_cost:.8f}")
# for non streaming responses
else:
# we pass the completion_response obj
if kwargs["stream"] != True:
response_cost = litellm.completion_cost(completion_response=completion_response)
print("regular response_cost", response_cost)
logging.info(f"Model {completion_response.model} Cost: ${response_cost:.8f}")
response_cost = kwargs["response_cost"] # litellm calculates response cost for you
print("regular response_cost", response_cost)
except:
pass
# Assign the custom callback function
# Step 2. Assign the custom callback function
litellm.success_callback = [track_cost_callback]
# Step 3. Make litellm.completion call
response = completion(
model="gpt-3.5-turbo",
messages=[

View file

@ -0,0 +1,68 @@
# Greenscale - Track LLM Spend and Responsible Usage
[Greenscale](https://greenscale.ai/) is a production monitoring platform for your LLM-powered app that provides you granular key insights into your GenAI spending and responsible usage. Greenscale only captures metadata to minimize the exposure risk of personally identifiable information (PII).
## Getting Started
Use Greenscale to log requests across all LLM Providers
liteLLM provides `callbacks`, making it easy for you to log data depending on the status of your responses.
## Using Callbacks
First, email `hello@greenscale.ai` to get an API_KEY.
Use just 1 line of code, to instantly log your responses **across all providers** with Greenscale:
```python
litellm.success_callback = ["greenscale"]
```
### Complete code
```python
from litellm import completion
## set env variables
os.environ['GREENSCALE_API_KEY'] = 'your-greenscale-api-key'
os.environ['GREENSCALE_ENDPOINT'] = 'greenscale-endpoint'
os.environ["OPENAI_API_KEY"]= ""
# set callback
litellm.success_callback = ["greenscale"]
#openai call
response = completion(
model="gpt-3.5-turbo",
messages=[{"role": "user", "content": "Hi 👋 - i'm openai"}]
metadata={
"greenscale_project": "acme-project",
"greenscale_application": "acme-application"
}
)
```
## Additional information in metadata
You can send any additional information to Greenscale by using the `metadata` field in completion and `greenscale_` prefix. This can be useful for sending metadata about the request, such as the project and application name, customer_id, enviornment, or any other information you want to track usage. `greenscale_project` and `greenscale_application` are required fields.
```python
#openai call with additional metadata
response = completion(
model="gpt-3.5-turbo",
messages=[
{"role": "user", "content": "Hi 👋 - i'm openai"}
],
metadata={
"greenscale_project": "acme-project",
"greenscale_application": "acme-application",
"greenscale_customer_id": "customer-123"
}
)
```
## Support & Talk with Greenscale Team
- [Schedule Demo 👋](https://calendly.com/nandesh/greenscale)
- [Website 💻](https://greenscale.ai)
- Our email ✉️ `hello@greenscale.ai`

View file

@ -0,0 +1,173 @@
import Image from '@theme/IdealImage';
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
# Lago - Usage Based Billing
[Lago](https://www.getlago.com/) offers a self-hosted and cloud, metering and usage-based billing solution.
<Image img={require('../../img/lago.jpeg')} />
## Quick Start
Use just 1 lines of code, to instantly log your responses **across all providers** with Lago
Get your Lago [API Key](https://docs.getlago.com/guide/self-hosted/docker#find-your-api-key)
```python
litellm.callbacks = ["lago"] # logs cost + usage of successful calls to lago
```
<Tabs>
<TabItem value="sdk" label="SDK">
```python
# pip install lago
import litellm
import os
os.environ["LAGO_API_BASE"] = "" # http://0.0.0.0:3000
os.environ["LAGO_API_KEY"] = ""
os.environ["LAGO_API_EVENT_CODE"] = "" # The billable metric's code - https://docs.getlago.com/guide/events/ingesting-usage#define-a-billable-metric
# LLM API Keys
os.environ['OPENAI_API_KEY']=""
# set lago as a callback, litellm will send the data to lago
litellm.success_callback = ["lago"]
# openai call
response = litellm.completion(
model="gpt-3.5-turbo",
messages=[
{"role": "user", "content": "Hi 👋 - i'm openai"}
],
user="your_customer_id" # 👈 SET YOUR CUSTOMER ID HERE
)
```
</TabItem>
<TabItem value="proxy" label="PROXY">
1. Add to Config.yaml
```yaml
model_list:
- litellm_params:
api_base: https://openai-function-calling-workers.tasslexyz.workers.dev/
api_key: my-fake-key
model: openai/my-fake-model
model_name: fake-openai-endpoint
litellm_settings:
callbacks: ["lago"] # 👈 KEY CHANGE
```
2. Start Proxy
```
litellm --config /path/to/config.yaml
```
3. Test it!
<Tabs>
<TabItem value="curl" label="Curl">
```bash
curl --location 'http://0.0.0.0:4000/chat/completions' \
--header 'Content-Type: application/json' \
--data ' {
"model": "fake-openai-endpoint",
"messages": [
{
"role": "user",
"content": "what llm are you"
}
],
"user": "your-customer-id" # 👈 SET YOUR CUSTOMER ID
}
'
```
</TabItem>
<TabItem value="openai_python" label="OpenAI Python SDK">
```python
import openai
client = openai.OpenAI(
api_key="anything",
base_url="http://0.0.0.0:4000"
)
# request sent to model set on litellm proxy, `litellm --model`
response = client.chat.completions.create(model="gpt-3.5-turbo", messages = [
{
"role": "user",
"content": "this is a test request, write a short poem"
}
], user="my_customer_id") # 👈 whatever your customer id is
print(response)
```
</TabItem>
<TabItem value="langchain" label="Langchain">
```python
from langchain.chat_models import ChatOpenAI
from langchain.prompts.chat import (
ChatPromptTemplate,
HumanMessagePromptTemplate,
SystemMessagePromptTemplate,
)
from langchain.schema import HumanMessage, SystemMessage
import os
os.environ["OPENAI_API_KEY"] = "anything"
chat = ChatOpenAI(
openai_api_base="http://0.0.0.0:4000",
model = "gpt-3.5-turbo",
temperature=0.1,
extra_body={
"user": "my_customer_id" # 👈 whatever your customer id is
}
)
messages = [
SystemMessage(
content="You are a helpful assistant that im using to make a test request to."
),
HumanMessage(
content="test from litellm. tell me why it's amazing in 1 sentence"
),
]
response = chat(messages)
print(response)
```
</TabItem>
</Tabs>
</TabItem>
</Tabs>
<Image img={require('../../img/lago_2.png')} />
## Advanced - Lagos Logging object
This is what LiteLLM will log to Lagos
```
{
"event": {
"transaction_id": "<generated_unique_id>",
"external_customer_id": <litellm_end_user_id>, # passed via `user` param in /chat/completion call - https://platform.openai.com/docs/api-reference/chat/create
"code": os.getenv("LAGO_API_EVENT_CODE"),
"properties": {
"input_tokens": <number>,
"output_tokens": <number>,
"model": <string>,
"response_cost": <number>, # 👈 LITELLM CALCULATED RESPONSE COST - https://github.com/BerriAI/litellm/blob/d43f75150a65f91f60dc2c0c9462ce3ffc713c1f/litellm/utils.py#L1473
}
}
}
```

View file

@ -94,9 +94,10 @@ print(response)
```
### Set Custom Trace ID, Trace User ID and Tags
### Set Custom Trace ID, Trace User ID, Trace Metadata, Trace Version, Trace Release and Tags
Pass `trace_id`, `trace_user_id`, `trace_metadata`, `trace_version`, `trace_release`, `tags` in `metadata`
Pass `trace_id`, `trace_user_id` in `metadata`
```python
import litellm
@ -121,10 +122,21 @@ response = completion(
metadata={
"generation_name": "ishaan-test-generation", # set langfuse Generation Name
"generation_id": "gen-id22", # set langfuse Generation ID
"trace_id": "trace-id22", # set langfuse Trace ID
"version": "test-generation-version" # set langfuse Generation Version
"trace_user_id": "user-id2", # set langfuse Trace User ID
"session_id": "session-1", # set langfuse Session ID
"tags": ["tag1", "tag2"] # set langfuse Tags
"tags": ["tag1", "tag2"], # set langfuse Tags
"trace_id": "trace-id22", # set langfuse Trace ID
"trace_metadata": {"key": "value"}, # set langfuse Trace Metadata
"trace_version": "test-trace-version", # set langfuse Trace Version (if not set, defaults to Generation Version)
"trace_release": "test-trace-release", # set langfuse Trace Release
### OR ###
"existing_trace_id": "trace-id22", # if generation is continuation of past trace. This prevents default behaviour of setting a trace name
### OR enforce that certain fields are trace overwritten in the trace during the continuation ###
"existing_trace_id": "trace-id22",
"trace_metadata": {"key": "updated_trace_value"}, # The new value to use for the langfuse Trace Metadata
"update_trace_keys": ["input", "output", "trace_metadata"], # Updates the trace input & output to be this generations input & output also updates the Trace Metadata to match the passed in value
"debug_langfuse": True, # Will log the exact metadata sent to litellm for the trace/generation as `metadata_passed_to_litellm`
},
)
@ -132,6 +144,38 @@ print(response)
```
### Trace & Generation Parameters
#### Trace Specific Parameters
* `trace_id` - Identifier for the trace, must use `existing_trace_id` instead of `trace_id` if this is an existing trace, auto-generated by default
* `trace_name` - Name of the trace, auto-generated by default
* `session_id` - Session identifier for the trace, defaults to `None`
* `trace_version` - Version for the trace, defaults to value for `version`
* `trace_release` - Release for the trace, defaults to `None`
* `trace_metadata` - Metadata for the trace, defaults to `None`
* `trace_user_id` - User identifier for the trace, defaults to completion argument `user`
* `tags` - Tags for the trace, defeaults to `None`
##### Updatable Parameters on Continuation
The following parameters can be updated on a continuation of a trace by passing in the following values into the `update_trace_keys` in the metadata of the completion.
* `input` - Will set the traces input to be the input of this latest generation
* `output` - Will set the traces output to be the output of this generation
* `trace_version` - Will set the trace version to be the provided value (To use the latest generations version instead, use `version`)
* `trace_release` - Will set the trace release to be the provided value
* `trace_metadata` - Will set the trace metadata to the provided value
* `trace_user_id` - Will set the trace user id to the provided value
#### Generation Specific Parameters
* `generation_id` - Identifier for the generation, auto-generated by default
* `generation_name` - Identifier for the generation, auto-generated by default
* `prompt` - Langfuse prompt object used for the generation, defaults to None
Any other key value pairs passed into the metadata not listed in the above spec for a `litellm` completion will be added as a metadata key value pair for the generation.
### Use LangChain ChatLiteLLM + Langfuse
Pass `trace_user_id`, `session_id` in model_kwargs
```python
@ -167,6 +211,21 @@ messages = [
chat(messages)
```
## Redacting Messages, Response Content from Langfuse Logging
### Redact Messages and Responses from all Langfuse Logging
Set `litellm.turn_off_message_logging=True` This will prevent the messages and responses from being logged to langfuse, but request metadata will still be logged.
### Redact Messages and Responses from specific Langfuse Logging
In the metadata typically passed for text completion or embedding calls you can set specific keys to mask the messages and responses for this call.
Setting `mask_input` to `True` will mask the input from being logged for this call
Setting `mask_output` to `True` will make the output from being logged for this call.
Be aware that if you are continuing an existing trace, and you set `update_trace_keys` to include either `input` or `output` and you set the corresponding `mask_input` or `mask_output`, then that trace will have its existing input and/or output replaced with a redacted message.
## Troubleshooting & Errors
### Data not getting logged to Langfuse ?

View file

@ -57,7 +57,7 @@ os.environ["LANGSMITH_API_KEY"] = ""
os.environ['OPENAI_API_KEY']=""
# set langfuse as a callback, litellm will send the data to langfuse
litellm.success_callback = ["langfuse"]
litellm.success_callback = ["langsmith"]
response = litellm.completion(
model="gpt-3.5-turbo",
@ -71,6 +71,23 @@ response = litellm.completion(
)
print(response)
```
### Make LiteLLM Proxy use Custom `LANGSMITH_BASE_URL`
If you're using a custom LangSmith instance, you can set the
`LANGSMITH_BASE_URL` environment variable to point to your instance.
For example, you can make LiteLLM Proxy log to a local LangSmith instance with
this config:
```yaml
litellm_settings:
success_callback: ["langsmith"]
environment_variables:
LANGSMITH_BASE_URL: "http://localhost:1984"
LANGSMITH_PROJECT: "litellm-proxy"
```
## Support & Talk to Founders
- [Schedule Demo 👋](https://calendly.com/d/4mp-gd3-k5k/berriai-1-1-onboarding-litellm-hosted-version)

View file

@ -0,0 +1,97 @@
import Image from '@theme/IdealImage';
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
# OpenMeter - Usage-Based Billing
[OpenMeter](https://openmeter.io/) is an Open Source Usage-Based Billing solution for AI/Cloud applications. It integrates with Stripe for easy billing.
<Image img={require('../../img/openmeter.png')} />
:::info
We want to learn how we can make the callbacks better! Meet the LiteLLM [founders](https://calendly.com/d/4mp-gd3-k5k/berriai-1-1-onboarding-litellm-hosted-version) or
join our [discord](https://discord.gg/wuPM9dRgDw)
:::
## Quick Start
Use just 2 lines of code, to instantly log your responses **across all providers** with OpenMeter
Get your OpenMeter API Key from https://openmeter.cloud/meters
```python
litellm.callbacks = ["openmeter"] # logs cost + usage of successful calls to openmeter
```
<Tabs>
<TabItem value="sdk" label="SDK">
```python
# pip install openmeter
import litellm
import os
# from https://openmeter.cloud
os.environ["OPENMETER_API_ENDPOINT"] = ""
os.environ["OPENMETER_API_KEY"] = ""
# LLM API Keys
os.environ['OPENAI_API_KEY']=""
# set openmeter as a callback, litellm will send the data to openmeter
litellm.callbacks = ["openmeter"]
# openai call
response = litellm.completion(
model="gpt-3.5-turbo",
messages=[
{"role": "user", "content": "Hi 👋 - i'm openai"}
]
)
```
</TabItem>
<TabItem value="proxy" label="PROXY">
1. Add to Config.yaml
```yaml
model_list:
- litellm_params:
api_base: https://openai-function-calling-workers.tasslexyz.workers.dev/
api_key: my-fake-key
model: openai/my-fake-model
model_name: fake-openai-endpoint
litellm_settings:
callbacks: ["openmeter"] # 👈 KEY CHANGE
```
2. Start Proxy
```
litellm --config /path/to/config.yaml
```
3. Test it!
```bash
curl --location 'http://0.0.0.0:4000/chat/completions' \
--header 'Content-Type: application/json' \
--data ' {
"model": "fake-openai-endpoint",
"messages": [
{
"role": "user",
"content": "what llm are you"
}
],
}
'
```
</TabItem>
</Tabs>
<Image img={require('../../img/openmeter_img_2.png')} />

View file

@ -40,5 +40,9 @@ response = completion(model="gpt-3.5-turbo", messages=[{"role": "user", "content
print(response)
```
## Redacting Messages, Response Content from Sentry Logging
Set `litellm.turn_off_message_logging=True` This will prevent the messages and responses from being logged to sentry, but request metadata will still be logged.
[Let us know](https://github.com/BerriAI/litellm/issues/new?assignees=&labels=enhancement&projects=&template=feature_request.yml&title=%5BFeature%5D%3A+) if you need any additional options from Sentry.

View file

@ -223,6 +223,117 @@ assert isinstance(
```
### Setting `anthropic-beta` Header in Requests
Pass the the `extra_headers` param to litellm, All headers will be forwarded to Anthropic API
```python
response = completion(
model="anthropic/claude-3-opus-20240229",
messages=messages,
tools=tools,
)
```
### Forcing Anthropic Tool Use
If you want Claude to use a specific tool to answer the users question
You can do this by specifying the tool in the `tool_choice` field like so:
```python
response = completion(
model="anthropic/claude-3-opus-20240229",
messages=messages,
tools=tools,
tool_choice={"type": "tool", "name": "get_weather"},
)
```
### Parallel Function Calling
Here's how to pass the result of a function call back to an anthropic model:
```python
from litellm import completion
import os
os.environ["ANTHROPIC_API_KEY"] = "sk-ant.."
litellm.set_verbose = True
### 1ST FUNCTION CALL ###
tools = [
{
"type": "function",
"function": {
"name": "get_current_weather",
"description": "Get the current weather in a given location",
"parameters": {
"type": "object",
"properties": {
"location": {
"type": "string",
"description": "The city and state, e.g. San Francisco, CA",
},
"unit": {"type": "string", "enum": ["celsius", "fahrenheit"]},
},
"required": ["location"],
},
},
}
]
messages = [
{
"role": "user",
"content": "What's the weather like in Boston today in Fahrenheit?",
}
]
try:
# test without max tokens
response = completion(
model="anthropic/claude-3-opus-20240229",
messages=messages,
tools=tools,
tool_choice="auto",
)
# Add any assertions, here to check response args
print(response)
assert isinstance(response.choices[0].message.tool_calls[0].function.name, str)
assert isinstance(
response.choices[0].message.tool_calls[0].function.arguments, str
)
messages.append(
response.choices[0].message.model_dump()
) # Add assistant tool invokes
tool_result = (
'{"location": "Boston", "temperature": "72", "unit": "fahrenheit"}'
)
# Add user submitted tool results in the OpenAI format
messages.append(
{
"tool_call_id": response.choices[0].message.tool_calls[0].id,
"role": "tool",
"name": response.choices[0].message.tool_calls[0].function.name,
"content": tool_result,
}
)
### 2ND FUNCTION CALL ###
# In the second response, Claude should deduce answer from tool results
second_response = completion(
model="anthropic/claude-3-opus-20240229",
messages=messages,
tools=tools,
tool_choice="auto",
)
print(second_response)
except Exception as e:
print(f"An error occurred - {str(e)}")
```
s/o @[Shekhar Patnaik](https://www.linkedin.com/in/patnaikshekhar) for requesting this!
## Usage - Vision

View file

@ -3,8 +3,6 @@ import TabItem from '@theme/TabItem';
# Azure AI Studio
## Sample Usage
**Ensure the following:**
1. The API Base passed ends in the `/v1/` prefix
example:
@ -14,8 +12,11 @@ import TabItem from '@theme/TabItem';
2. The `model` passed is listed in [supported models](#supported-models). You **DO NOT** Need to pass your deployment name to litellm. Example `model=azure/Mistral-large-nmefg`
## Usage
<Tabs>
<TabItem value="sdk" label="SDK">
**Quick Start**
```python
import litellm
response = litellm.completion(
@ -26,6 +27,9 @@ response = litellm.completion(
)
```
</TabItem>
<TabItem value="proxy" label="PROXY">
## Sample Usage - LiteLLM Proxy
1. Add models to your config.yaml
@ -99,6 +103,107 @@ response = litellm.completion(
</Tabs>
</TabItem>
</Tabs>
## Function Calling
<Tabs>
<TabItem value="sdk" label="SDK">
```python
from litellm import completion
# set env
os.environ["AZURE_MISTRAL_API_KEY"] = "your-api-key"
os.environ["AZURE_MISTRAL_API_BASE"] = "your-api-base"
tools = [
{
"type": "function",
"function": {
"name": "get_current_weather",
"description": "Get the current weather in a given location",
"parameters": {
"type": "object",
"properties": {
"location": {
"type": "string",
"description": "The city and state, e.g. San Francisco, CA",
},
"unit": {"type": "string", "enum": ["celsius", "fahrenheit"]},
},
"required": ["location"],
},
},
}
]
messages = [{"role": "user", "content": "What's the weather like in Boston today?"}]
response = completion(
model="azure/mistral-large-latest",
api_base=os.getenv("AZURE_MISTRAL_API_BASE")
api_key=os.getenv("AZURE_MISTRAL_API_KEY")
messages=messages,
tools=tools,
tool_choice="auto",
)
# Add any assertions, here to check response args
print(response)
assert isinstance(response.choices[0].message.tool_calls[0].function.name, str)
assert isinstance(
response.choices[0].message.tool_calls[0].function.arguments, str
)
```
</TabItem>
<TabItem value="proxy" label="PROXY">
```bash
curl http://0.0.0.0:4000/v1/chat/completions \
-H "Content-Type: application/json" \
-H "Authorization: Bearer $YOUR_API_KEY" \
-d '{
"model": "mistral",
"messages": [
{
"role": "user",
"content": "What'\''s the weather like in Boston today?"
}
],
"tools": [
{
"type": "function",
"function": {
"name": "get_current_weather",
"description": "Get the current weather in a given location",
"parameters": {
"type": "object",
"properties": {
"location": {
"type": "string",
"description": "The city and state, e.g. San Francisco, CA"
},
"unit": {
"type": "string",
"enum": ["celsius", "fahrenheit"]
}
},
"required": ["location"]
}
}
}
],
"tool_choice": "auto"
}'
```
</TabItem>
</Tabs>
## Supported Models
| Model Name | Function Call |

View file

@ -535,7 +535,8 @@ print(response)
| Model Name | Function Call |
|----------------------|---------------------------------------------|
| Titan Embeddings - G1 | `embedding(model="bedrock/amazon.titan-embed-text-v1", input=input)` |
| Titan Embeddings V2 | `embedding(model="bedrock/amazon.titan-embed-text-v2:0", input=input)` |
| Titan Embeddings - V1 | `embedding(model="bedrock/amazon.titan-embed-text-v1", input=input)` |
| Cohere Embeddings - English | `embedding(model="bedrock/cohere.embed-english-v3", input=input)` |
| Cohere Embeddings - Multilingual | `embedding(model="bedrock/cohere.embed-multilingual-v3", input=input)` |

View file

@ -0,0 +1,177 @@
# Clarifai
Anthropic, OpenAI, Mistral, Llama and Gemini LLMs are Supported on Clarifai.
## Pre-Requisites
`pip install clarifai`
`pip install litellm`
## Required Environment Variables
To obtain your Clarifai Personal access token follow this [link](https://docs.clarifai.com/clarifai-basics/authentication/personal-access-tokens/). Optionally the PAT can also be passed in `completion` function.
```python
os.environ["CALRIFAI_API_KEY"] = "YOUR_CLARIFAI_PAT" # CLARIFAI_PAT
```
## Usage
```python
import os
from litellm import completion
os.environ["CLARIFAI_API_KEY"] = ""
response = completion(
model="clarifai/mistralai.completion.mistral-large",
messages=[{ "content": "Tell me a joke about physics?","role": "user"}]
)
```
**Output**
```json
{
"id": "chatcmpl-572701ee-9ab2-411c-ac75-46c1ba18e781",
"choices": [
{
"finish_reason": "stop",
"index": 1,
"message": {
"content": "Sure, here's a physics joke for you:\n\nWhy can't you trust an atom?\n\nBecause they make up everything!",
"role": "assistant"
}
}
],
"created": 1714410197,
"model": "https://api.clarifai.com/v2/users/mistralai/apps/completion/models/mistral-large/outputs",
"object": "chat.completion",
"system_fingerprint": null,
"usage": {
"prompt_tokens": 14,
"completion_tokens": 24,
"total_tokens": 38
}
}
```
## Clarifai models
liteLLM supports non-streaming requests to all models on [Clarifai community](https://clarifai.com/explore/models?filterData=%5B%7B%22field%22%3A%22use_cases%22%2C%22value%22%3A%5B%22llm%22%5D%7D%5D&page=1&perPage=24)
Example Usage - Note: liteLLM supports all models deployed on Clarifai
## Llama LLMs
| Model Name | Function Call |
---------------------------|---------------------------------|
| clarifai/meta.Llama-2.llama2-7b-chat | `completion('clarifai/meta.Llama-2.llama2-7b-chat', messages)`
| clarifai/meta.Llama-2.llama2-13b-chat | `completion('clarifai/meta.Llama-2.llama2-13b-chat', messages)`
| clarifai/meta.Llama-2.llama2-70b-chat | `completion('clarifai/meta.Llama-2.llama2-70b-chat', messages)` |
| clarifai/meta.Llama-2.codeLlama-70b-Python | `completion('clarifai/meta.Llama-2.codeLlama-70b-Python', messages)`|
| clarifai/meta.Llama-2.codeLlama-70b-Instruct | `completion('clarifai/meta.Llama-2.codeLlama-70b-Instruct', messages)` |
## Mistal LLMs
| Model Name | Function Call |
|---------------------------------------------|------------------------------------------------------------------------|
| clarifai/mistralai.completion.mixtral-8x22B | `completion('clarifai/mistralai.completion.mixtral-8x22B', messages)` |
| clarifai/mistralai.completion.mistral-large | `completion('clarifai/mistralai.completion.mistral-large', messages)` |
| clarifai/mistralai.completion.mistral-medium | `completion('clarifai/mistralai.completion.mistral-medium', messages)` |
| clarifai/mistralai.completion.mistral-small | `completion('clarifai/mistralai.completion.mistral-small', messages)` |
| clarifai/mistralai.completion.mixtral-8x7B-Instruct-v0_1 | `completion('clarifai/mistralai.completion.mixtral-8x7B-Instruct-v0_1', messages)`
| clarifai/mistralai.completion.mistral-7B-OpenOrca | `completion('clarifai/mistralai.completion.mistral-7B-OpenOrca', messages)` |
| clarifai/mistralai.completion.openHermes-2-mistral-7B | `completion('clarifai/mistralai.completion.openHermes-2-mistral-7B', messages)` |
## Jurassic LLMs
| Model Name | Function Call |
|-----------------------------------------------|---------------------------------------------------------------------|
| clarifai/ai21.complete.Jurassic2-Grande | `completion('clarifai/ai21.complete.Jurassic2-Grande', messages)` |
| clarifai/ai21.complete.Jurassic2-Grande-Instruct | `completion('clarifai/ai21.complete.Jurassic2-Grande-Instruct', messages)` |
| clarifai/ai21.complete.Jurassic2-Jumbo-Instruct | `completion('clarifai/ai21.complete.Jurassic2-Jumbo-Instruct', messages)` |
| clarifai/ai21.complete.Jurassic2-Jumbo | `completion('clarifai/ai21.complete.Jurassic2-Jumbo', messages)` |
| clarifai/ai21.complete.Jurassic2-Large | `completion('clarifai/ai21.complete.Jurassic2-Large', messages)` |
## Wizard LLMs
| Model Name | Function Call |
|-----------------------------------------------|---------------------------------------------------------------------|
| clarifai/wizardlm.generate.wizardCoder-Python-34B | `completion('clarifai/wizardlm.generate.wizardCoder-Python-34B', messages)` |
| clarifai/wizardlm.generate.wizardLM-70B | `completion('clarifai/wizardlm.generate.wizardLM-70B', messages)` |
| clarifai/wizardlm.generate.wizardLM-13B | `completion('clarifai/wizardlm.generate.wizardLM-13B', messages)` |
| clarifai/wizardlm.generate.wizardCoder-15B | `completion('clarifai/wizardlm.generate.wizardCoder-15B', messages)` |
## Anthropic models
| Model Name | Function Call |
|-----------------------------------------------|---------------------------------------------------------------------|
| clarifai/anthropic.completion.claude-v1 | `completion('clarifai/anthropic.completion.claude-v1', messages)` |
| clarifai/anthropic.completion.claude-instant-1_2 | `completion('clarifai/anthropic.completion.claude-instant-1_2', messages)` |
| clarifai/anthropic.completion.claude-instant | `completion('clarifai/anthropic.completion.claude-instant', messages)` |
| clarifai/anthropic.completion.claude-v2 | `completion('clarifai/anthropic.completion.claude-v2', messages)` |
| clarifai/anthropic.completion.claude-2_1 | `completion('clarifai/anthropic.completion.claude-2_1', messages)` |
| clarifai/anthropic.completion.claude-3-opus | `completion('clarifai/anthropic.completion.claude-3-opus', messages)` |
| clarifai/anthropic.completion.claude-3-sonnet | `completion('clarifai/anthropic.completion.claude-3-sonnet', messages)` |
## OpenAI GPT LLMs
| Model Name | Function Call |
|-----------------------------------------------|---------------------------------------------------------------------|
| clarifai/openai.chat-completion.GPT-4 | `completion('clarifai/openai.chat-completion.GPT-4', messages)` |
| clarifai/openai.chat-completion.GPT-3_5-turbo | `completion('clarifai/openai.chat-completion.GPT-3_5-turbo', messages)` |
| clarifai/openai.chat-completion.gpt-4-turbo | `completion('clarifai/openai.chat-completion.gpt-4-turbo', messages)` |
| clarifai/openai.completion.gpt-3_5-turbo-instruct | `completion('clarifai/openai.completion.gpt-3_5-turbo-instruct', messages)` |
## GCP LLMs
| Model Name | Function Call |
|-----------------------------------------------|---------------------------------------------------------------------|
| clarifai/gcp.generate.gemini-1_5-pro | `completion('clarifai/gcp.generate.gemini-1_5-pro', messages)` |
| clarifai/gcp.generate.imagen-2 | `completion('clarifai/gcp.generate.imagen-2', messages)` |
| clarifai/gcp.generate.code-gecko | `completion('clarifai/gcp.generate.code-gecko', messages)` |
| clarifai/gcp.generate.code-bison | `completion('clarifai/gcp.generate.code-bison', messages)` |
| clarifai/gcp.generate.text-bison | `completion('clarifai/gcp.generate.text-bison', messages)` |
| clarifai/gcp.generate.gemma-2b-it | `completion('clarifai/gcp.generate.gemma-2b-it', messages)` |
| clarifai/gcp.generate.gemma-7b-it | `completion('clarifai/gcp.generate.gemma-7b-it', messages)` |
| clarifai/gcp.generate.gemini-pro | `completion('clarifai/gcp.generate.gemini-pro', messages)` |
| clarifai/gcp.generate.gemma-1_1-7b-it | `completion('clarifai/gcp.generate.gemma-1_1-7b-it', messages)` |
## Cohere LLMs
| Model Name | Function Call |
|-----------------------------------------------|---------------------------------------------------------------------|
| clarifai/cohere.generate.cohere-generate-command | `completion('clarifai/cohere.generate.cohere-generate-command', messages)` |
clarifai/cohere.generate.command-r-plus' | `completion('clarifai/clarifai/cohere.generate.command-r-plus', messages)`|
## Databricks LLMs
| Model Name | Function Call |
|---------------------------------------------------|---------------------------------------------------------------------|
| clarifai/databricks.drbx.dbrx-instruct | `completion('clarifai/databricks.drbx.dbrx-instruct', messages)` |
| clarifai/databricks.Dolly-v2.dolly-v2-12b | `completion('clarifai/databricks.Dolly-v2.dolly-v2-12b', messages)`|
## Microsoft LLMs
| Model Name | Function Call |
|---------------------------------------------------|---------------------------------------------------------------------|
| clarifai/microsoft.text-generation.phi-2 | `completion('clarifai/microsoft.text-generation.phi-2', messages)` |
| clarifai/microsoft.text-generation.phi-1_5 | `completion('clarifai/microsoft.text-generation.phi-1_5', messages)`|
## Salesforce models
| Model Name | Function Call |
|-----------------------------------------------------------|-------------------------------------------------------------------------------|
| clarifai/salesforce.blip.general-english-image-caption-blip-2 | `completion('clarifai/salesforce.blip.general-english-image-caption-blip-2', messages)` |
| clarifai/salesforce.xgen.xgen-7b-8k-instruct | `completion('clarifai/salesforce.xgen.xgen-7b-8k-instruct', messages)` |
## Other Top performing LLMs
| Model Name | Function Call |
|---------------------------------------------------|---------------------------------------------------------------------|
| clarifai/deci.decilm.deciLM-7B-instruct | `completion('clarifai/deci.decilm.deciLM-7B-instruct', messages)` |
| clarifai/upstage.solar.solar-10_7b-instruct | `completion('clarifai/upstage.solar.solar-10_7b-instruct', messages)` |
| clarifai/openchat.openchat.openchat-3_5-1210 | `completion('clarifai/openchat.openchat.openchat-3_5-1210', messages)` |
| clarifai/togethercomputer.stripedHyena.stripedHyena-Nous-7B | `completion('clarifai/togethercomputer.stripedHyena.stripedHyena-Nous-7B', messages)` |
| clarifai/fblgit.una-cybertron.una-cybertron-7b-v2 | `completion('clarifai/fblgit.una-cybertron.una-cybertron-7b-v2', messages)` |
| clarifai/tiiuae.falcon.falcon-40b-instruct | `completion('clarifai/tiiuae.falcon.falcon-40b-instruct', messages)` |
| clarifai/togethercomputer.RedPajama.RedPajama-INCITE-7B-Chat | `completion('clarifai/togethercomputer.RedPajama.RedPajama-INCITE-7B-Chat', messages)` |
| clarifai/bigcode.code.StarCoder | `completion('clarifai/bigcode.code.StarCoder', messages)` |
| clarifai/mosaicml.mpt.mpt-7b-instruct | `completion('clarifai/mosaicml.mpt.mpt-7b-instruct', messages)` |

View file

@ -0,0 +1,54 @@
# Deepseek
https://deepseek.com/
**We support ALL Deepseek models, just set `deepseek/` as a prefix when sending completion requests**
## API Key
```python
# env variable
os.environ['DEEPSEEK_API_KEY']
```
## Sample Usage
```python
from litellm import completion
import os
os.environ['DEEPSEEK_API_KEY'] = ""
response = completion(
model="deepseek/deepseek-chat",
messages=[
{"role": "user", "content": "hello from litellm"}
],
)
print(response)
```
## Sample Usage - Streaming
```python
from litellm import completion
import os
os.environ['DEEPSEEK_API_KEY'] = ""
response = completion(
model="deepseek/deepseek-chat",
messages=[
{"role": "user", "content": "hello from litellm"}
],
stream=True
)
for chunk in response:
print(chunk)
```
## Supported Models - ALL Deepseek Models Supported!
We support ALL Deepseek models, just set `deepseek/` as a prefix when sending completion requests
| Model Name | Function Call |
|--------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| deepseek-chat | `completion(model="deepseek/deepseek-chat", messages)` |
| deepseek-coder | `completion(model="deepseek/deepseek-chat", messages)` |

View file

@ -23,7 +23,7 @@ In certain use-cases you may need to make calls to the models and pass [safety s
```python
response = completion(
model="gemini/gemini-pro",
messages=[{"role": "user", "content": "write code for saying hi from LiteLLM"}]
messages=[{"role": "user", "content": "write code for saying hi from LiteLLM"}],
safety_settings=[
{
"category": "HARM_CATEGORY_HARASSMENT",

View file

@ -48,6 +48,109 @@ We support ALL Groq models, just set `groq/` as a prefix when sending completion
| Model Name | Function Call |
|--------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| llama3-8b-8192 | `completion(model="groq/llama3-8b-8192", messages)` |
| llama3-70b-8192 | `completion(model="groq/llama3-70b-8192", messages)` |
| llama2-70b-4096 | `completion(model="groq/llama2-70b-4096", messages)` |
| mixtral-8x7b-32768 | `completion(model="groq/mixtral-8x7b-32768", messages)` |
| gemma-7b-it | `completion(model="groq/gemma-7b-it", messages)` |
## Groq - Tool / Function Calling Example
```python
# Example dummy function hard coded to return the current weather
import json
def get_current_weather(location, unit="fahrenheit"):
"""Get the current weather in a given location"""
if "tokyo" in location.lower():
return json.dumps({"location": "Tokyo", "temperature": "10", "unit": "celsius"})
elif "san francisco" in location.lower():
return json.dumps(
{"location": "San Francisco", "temperature": "72", "unit": "fahrenheit"}
)
elif "paris" in location.lower():
return json.dumps({"location": "Paris", "temperature": "22", "unit": "celsius"})
else:
return json.dumps({"location": location, "temperature": "unknown"})
# Step 1: send the conversation and available functions to the model
messages = [
{
"role": "system",
"content": "You are a function calling LLM that uses the data extracted from get_current_weather to answer questions about the weather in San Francisco.",
},
{
"role": "user",
"content": "What's the weather like in San Francisco?",
},
]
tools = [
{
"type": "function",
"function": {
"name": "get_current_weather",
"description": "Get the current weather in a given location",
"parameters": {
"type": "object",
"properties": {
"location": {
"type": "string",
"description": "The city and state, e.g. San Francisco, CA",
},
"unit": {
"type": "string",
"enum": ["celsius", "fahrenheit"],
},
},
"required": ["location"],
},
},
}
]
response = litellm.completion(
model="groq/llama2-70b-4096",
messages=messages,
tools=tools,
tool_choice="auto", # auto is default, but we'll be explicit
)
print("Response\n", response)
response_message = response.choices[0].message
tool_calls = response_message.tool_calls
# Step 2: check if the model wanted to call a function
if tool_calls:
# Step 3: call the function
# Note: the JSON response may not always be valid; be sure to handle errors
available_functions = {
"get_current_weather": get_current_weather,
}
messages.append(
response_message
) # extend conversation with assistant's reply
print("Response message\n", response_message)
# Step 4: send the info for each function call and function response to the model
for tool_call in tool_calls:
function_name = tool_call.function.name
function_to_call = available_functions[function_name]
function_args = json.loads(tool_call.function.arguments)
function_response = function_to_call(
location=function_args.get("location"),
unit=function_args.get("unit"),
)
messages.append(
{
"tool_call_id": tool_call.id,
"role": "tool",
"name": function_name,
"content": function_response,
}
) # extend conversation with function response
print(f"messages: {messages}")
second_response = litellm.completion(
model="groq/llama2-70b-4096", messages=messages
) # get a new response from the model where it can see the function response
print("second response\n", second_response)
```

View file

@ -21,6 +21,11 @@ This is done by adding the "huggingface/" prefix to `model`, example `completion
<Tabs>
<TabItem value="tgi" label="Text-generation-interface (TGI)">
By default, LiteLLM will assume a huggingface call follows the TGI format.
<Tabs>
<TabItem value="sdk" label="SDK">
```python
import os
from litellm import completion
@ -40,9 +45,58 @@ response = completion(
print(response)
```
</TabItem>
<TabItem value="proxy" label="PROXY">
1. Add models to your config.yaml
```yaml
model_list:
- model_name: wizard-coder
litellm_params:
model: huggingface/WizardLM/WizardCoder-Python-34B-V1.0
api_key: os.environ/HUGGINGFACE_API_KEY
api_base: "https://my-endpoint.endpoints.huggingface.cloud"
```
2. Start the proxy
```bash
$ litellm --config /path/to/config.yaml --debug
```
3. Test it!
```shell
curl --location 'http://0.0.0.0:4000/chat/completions' \
--header 'Authorization: Bearer sk-1234' \
--header 'Content-Type: application/json' \
--data '{
"model": "wizard-coder",
"messages": [
{
"role": "user",
"content": "I like you!"
}
],
}'
```
</TabItem>
</Tabs>
</TabItem>
<TabItem value="conv" label="Conversational-task (BlenderBot, etc.)">
Append `conversational` to the model name
e.g. `huggingface/conversational/<model-name>`
<Tabs>
<TabItem value="sdk" label="SDK">
```python
import os
from litellm import completion
@ -54,7 +108,7 @@ messages = [{ "content": "There's a llama in my garden 😱 What should I do?","
# e.g. Call 'facebook/blenderbot-400M-distill' hosted on HF Inference endpoints
response = completion(
model="huggingface/facebook/blenderbot-400M-distill",
model="huggingface/conversational/facebook/blenderbot-400M-distill",
messages=messages,
api_base="https://my-endpoint.huggingface.cloud"
)
@ -62,7 +116,123 @@ response = completion(
print(response)
```
</TabItem>
<TabItem value="none" label="Non TGI/Conversational-task LLMs">
<TabItem value="proxy" label="PROXY">
1. Add models to your config.yaml
```yaml
model_list:
- model_name: blenderbot
litellm_params:
model: huggingface/conversational/facebook/blenderbot-400M-distill
api_key: os.environ/HUGGINGFACE_API_KEY
api_base: "https://my-endpoint.endpoints.huggingface.cloud"
```
2. Start the proxy
```bash
$ litellm --config /path/to/config.yaml --debug
```
3. Test it!
```shell
curl --location 'http://0.0.0.0:4000/chat/completions' \
--header 'Authorization: Bearer sk-1234' \
--header 'Content-Type: application/json' \
--data '{
"model": "blenderbot",
"messages": [
{
"role": "user",
"content": "I like you!"
}
],
}'
```
</TabItem>
</Tabs>
</TabItem>
<TabItem value="classification" label="Text Classification">
Append `text-classification` to the model name
e.g. `huggingface/text-classification/<model-name>`
<Tabs>
<TabItem value="sdk" label="SDK">
```python
import os
from litellm import completion
# [OPTIONAL] set env var
os.environ["HUGGINGFACE_API_KEY"] = "huggingface_api_key"
messages = [{ "content": "I like you, I love you!","role": "user"}]
# e.g. Call 'shahrukhx01/question-vs-statement-classifier' hosted on HF Inference endpoints
response = completion(
model="huggingface/text-classification/shahrukhx01/question-vs-statement-classifier",
messages=messages,
api_base="https://my-endpoint.endpoints.huggingface.cloud",
)
print(response)
```
</TabItem>
<TabItem value="proxy" label="PROXY">
1. Add models to your config.yaml
```yaml
model_list:
- model_name: bert-classifier
litellm_params:
model: huggingface/text-classification/shahrukhx01/question-vs-statement-classifier
api_key: os.environ/HUGGINGFACE_API_KEY
api_base: "https://my-endpoint.endpoints.huggingface.cloud"
```
2. Start the proxy
```bash
$ litellm --config /path/to/config.yaml --debug
```
3. Test it!
```shell
curl --location 'http://0.0.0.0:4000/chat/completions' \
--header 'Authorization: Bearer sk-1234' \
--header 'Content-Type: application/json' \
--data '{
"model": "bert-classifier",
"messages": [
{
"role": "user",
"content": "I like you!"
}
],
}'
```
</TabItem>
</Tabs>
</TabItem>
<TabItem value="none" label="Text Generation (NOT TGI)">
Append `text-generation` to the model name
e.g. `huggingface/text-generation/<model-name>`
```python
import os
@ -75,7 +245,7 @@ messages = [{ "content": "There's a llama in my garden 😱 What should I do?","
# e.g. Call 'roneneldan/TinyStories-3M' hosted on HF Inference endpoints
response = completion(
model="huggingface/roneneldan/TinyStories-3M",
model="huggingface/text-generation/roneneldan/TinyStories-3M",
messages=messages,
api_base="https://p69xlsj6rpno5drq.us-east-1.aws.endpoints.huggingface.cloud",
)

View file

@ -44,13 +44,58 @@ for chunk in response:
## Supported Models
All models listed here https://docs.mistral.ai/platform/endpoints are supported. We actively maintain the list of models, pricing, token window, etc. [here](https://github.com/BerriAI/litellm/blob/c1b25538277206b9f00de5254d80d6a83bb19a29/model_prices_and_context_window.json).
| Model Name | Function Call |
|--------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| mistral-tiny | `completion(model="mistral/mistral-tiny", messages)` |
| mistral-small | `completion(model="mistral/mistral-small", messages)` |
| mistral-medium | `completion(model="mistral/mistral-medium", messages)` |
| mistral-large-latest | `completion(model="mistral/mistral-large-latest", messages)` |
| Model Name | Function Call |
|----------------|--------------------------------------------------------------|
| Mistral Small | `completion(model="mistral/mistral-small-latest", messages)` |
| Mistral Medium | `completion(model="mistral/mistral-medium-latest", messages)`|
| Mistral Large | `completion(model="mistral/mistral-large-latest", messages)` |
| Mistral 7B | `completion(model="mistral/open-mistral-7b", messages)` |
| Mixtral 8x7B | `completion(model="mistral/open-mixtral-8x7b", messages)` |
| Mixtral 8x22B | `completion(model="mistral/open-mixtral-8x22b", messages)` |
## Function Calling
```python
from litellm import completion
# set env
os.environ["MISTRAL_API_KEY"] = "your-api-key"
tools = [
{
"type": "function",
"function": {
"name": "get_current_weather",
"description": "Get the current weather in a given location",
"parameters": {
"type": "object",
"properties": {
"location": {
"type": "string",
"description": "The city and state, e.g. San Francisco, CA",
},
"unit": {"type": "string", "enum": ["celsius", "fahrenheit"]},
},
"required": ["location"],
},
},
}
]
messages = [{"role": "user", "content": "What's the weather like in Boston today?"}]
response = completion(
model="mistral/mistral-large-latest",
messages=messages,
tools=tools,
tool_choice="auto",
)
# Add any assertions, here to check response args
print(response)
assert isinstance(response.choices[0].message.tool_calls[0].function.name, str)
assert isinstance(
response.choices[0].message.tool_calls[0].function.arguments, str
)
```
## Sample Usage - Embedding
```python
@ -71,6 +116,6 @@ All models listed here https://docs.mistral.ai/platform/endpoints are supported
| Model Name | Function Call |
|--------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| mistral-embed | `embedding(model="mistral/mistral-embed", input)` |
| Mistral Embeddings | `embedding(model="mistral/mistral-embed", input)` |

View file

@ -102,12 +102,18 @@ Ollama supported models: https://github.com/ollama/ollama
| Model Name | Function Call |
|----------------------|-----------------------------------------------------------------------------------
| Mistral | `completion(model='ollama/mistral', messages, api_base="http://localhost:11434", stream=True)` |
| Mistral-7B-Instruct-v0.1 | `completion(model='ollama/mistral-7B-Instruct-v0.1', messages, api_base="http://localhost:11434", stream=False)` |
| Mistral-7B-Instruct-v0.2 | `completion(model='ollama/mistral-7B-Instruct-v0.2', messages, api_base="http://localhost:11434", stream=False)` |
| Mixtral-8x7B-Instruct-v0.1 | `completion(model='ollama/mistral-8x7B-Instruct-v0.1', messages, api_base="http://localhost:11434", stream=False)` |
| Mixtral-8x22B-Instruct-v0.1 | `completion(model='ollama/mixtral-8x22B-Instruct-v0.1', messages, api_base="http://localhost:11434", stream=False)` |
| Llama2 7B | `completion(model='ollama/llama2', messages, api_base="http://localhost:11434", stream=True)` |
| Llama2 13B | `completion(model='ollama/llama2:13b', messages, api_base="http://localhost:11434", stream=True)` |
| Llama2 70B | `completion(model='ollama/llama2:70b', messages, api_base="http://localhost:11434", stream=True)` |
| Llama2 Uncensored | `completion(model='ollama/llama2-uncensored', messages, api_base="http://localhost:11434", stream=True)` |
| Code Llama | `completion(model='ollama/codellama', messages, api_base="http://localhost:11434", stream=True)` |
| Llama2 Uncensored | `completion(model='ollama/llama2-uncensored', messages, api_base="http://localhost:11434", stream=True)` |
|Meta LLaMa3 8B | `completion(model='ollama/llama3', messages, api_base="http://localhost:11434", stream=False)` |
| Meta LLaMa3 70B | `completion(model='ollama/llama3:70b', messages, api_base="http://localhost:11434", stream=False)` |
| Orca Mini | `completion(model='ollama/orca-mini', messages, api_base="http://localhost:11434", stream=True)` |
| Vicuna | `completion(model='ollama/vicuna', messages, api_base="http://localhost:11434", stream=True)` |
| Nous-Hermes | `completion(model='ollama/nous-hermes', messages, api_base="http://localhost:11434", stream=True)` |

View file

@ -2,7 +2,7 @@ import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
# OpenAI
LiteLLM supports OpenAI Chat + Text completion and embedding calls.
LiteLLM supports OpenAI Chat + Embedding calls.
### Required API Keys
@ -20,7 +20,7 @@ os.environ["OPENAI_API_KEY"] = "your-api-key"
# openai call
response = completion(
model = "gpt-3.5-turbo",
model = "gpt-4o",
messages=[{ "content": "Hello, how are you?","role": "user"}]
)
```
@ -163,6 +163,10 @@ os.environ["OPENAI_API_BASE"] = "openaiai-api-base" # OPTIONAL
| Model Name | Function Call |
|-----------------------|-----------------------------------------------------------------|
| gpt-4o | `response = completion(model="gpt-4o", messages=messages)` |
| gpt-4o-2024-05-13 | `response = completion(model="gpt-4o-2024-05-13", messages=messages)` |
| gpt-4-turbo | `response = completion(model="gpt-4-turbo", messages=messages)` |
| gpt-4-turbo-preview | `response = completion(model="gpt-4-0125-preview", messages=messages)` |
| gpt-4-0125-preview | `response = completion(model="gpt-4-0125-preview", messages=messages)` |
| gpt-4-1106-preview | `response = completion(model="gpt-4-1106-preview", messages=messages)` |
| gpt-3.5-turbo-1106 | `response = completion(model="gpt-3.5-turbo-1106", messages=messages)` |
@ -184,6 +188,8 @@ These also support the `OPENAI_API_BASE` environment variable, which can be used
## OpenAI Vision Models
| Model Name | Function Call |
|-----------------------|-----------------------------------------------------------------|
| gpt-4o | `response = completion(model="gpt-4o", messages=messages)` |
| gpt-4-turbo | `response = completion(model="gpt-4-turbo", messages=messages)` |
| gpt-4-vision-preview | `response = completion(model="gpt-4-vision-preview", messages=messages)` |
#### Usage
@ -217,19 +223,6 @@ response = completion(
```
## OpenAI Text Completion Models / Instruct Models
| Model Name | Function Call |
|---------------------|----------------------------------------------------|
| gpt-3.5-turbo-instruct | `response = completion(model="gpt-3.5-turbo-instruct", messages=messages)` |
| gpt-3.5-turbo-instruct-0914 | `response = completion(model="gpt-3.5-turbo-instruct-0914", messages=messages)` |
| text-davinci-003 | `response = completion(model="text-davinci-003", messages=messages)` |
| ada-001 | `response = completion(model="ada-001", messages=messages)` |
| curie-001 | `response = completion(model="curie-001", messages=messages)` |
| babbage-001 | `response = completion(model="babbage-001", messages=messages)` |
| babbage-002 | `response = completion(model="babbage-002", messages=messages)` |
| davinci-002 | `response = completion(model="davinci-002", messages=messages)` |
## Advanced
### Parallel Function calling

View file

@ -5,7 +5,9 @@ import TabItem from '@theme/TabItem';
To call models hosted behind an openai proxy, make 2 changes:
1. Put `openai/` in front of your model name, so litellm knows you're trying to call an openai-compatible endpoint.
1. For `/chat/completions`: Put `openai/` in front of your model name, so litellm knows you're trying to call an openai `/chat/completions` endpoint.
2. For `/completions`: Put `text-completion-openai/` in front of your model name, so litellm knows you're trying to call an openai `/completions` endpoint.
2. **Do NOT** add anything additional to the base url e.g. `/v1/embedding`. LiteLLM uses the openai-client to make these calls, and that automatically adds the relevant endpoints.

View file

@ -0,0 +1,247 @@
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
# 🆕 Predibase
LiteLLM supports all models on Predibase
## Usage
<Tabs>
<TabItem value="sdk" label="SDK">
### API KEYS
```python
import os
os.environ["PREDIBASE_API_KEY"] = ""
```
### Example Call
```python
from litellm import completion
import os
## set ENV variables
os.environ["PREDIBASE_API_KEY"] = "predibase key"
os.environ["PREDIBASE_TENANT_ID"] = "predibase tenant id"
# predibase llama-3 call
response = completion(
model="predibase/llama-3-8b-instruct",
messages = [{ "content": "Hello, how are you?","role": "user"}]
)
```
</TabItem>
<TabItem value="proxy" label="PROXY">
1. Add models to your config.yaml
```yaml
model_list:
- model_name: llama-3
litellm_params:
model: predibase/llama-3-8b-instruct
api_key: os.environ/PREDIBASE_API_KEY
tenant_id: os.environ/PREDIBASE_TENANT_ID
```
2. Start the proxy
```bash
$ litellm --config /path/to/config.yaml --debug
```
3. Send Request to LiteLLM Proxy Server
<Tabs>
<TabItem value="openai" label="OpenAI Python v1.0.0+">
```python
import openai
client = openai.OpenAI(
api_key="sk-1234", # pass litellm proxy key, if you're using virtual keys
base_url="http://0.0.0.0:4000" # litellm-proxy-base url
)
response = client.chat.completions.create(
model="llama-3",
messages = [
{
"role": "system",
"content": "Be a good human!"
},
{
"role": "user",
"content": "What do you know about earth?"
}
]
)
print(response)
```
</TabItem>
<TabItem value="curl" label="curl">
```shell
curl --location 'http://0.0.0.0:4000/chat/completions' \
--header 'Authorization: Bearer sk-1234' \
--header 'Content-Type: application/json' \
--data '{
"model": "llama-3",
"messages": [
{
"role": "system",
"content": "Be a good human!"
},
{
"role": "user",
"content": "What do you know about earth?"
}
],
}'
```
</TabItem>
</Tabs>
</TabItem>
</Tabs>
## Advanced Usage - Prompt Formatting
LiteLLM has prompt template mappings for all `meta-llama` llama3 instruct models. [**See Code**](https://github.com/BerriAI/litellm/blob/4f46b4c3975cd0f72b8c5acb2cb429d23580c18a/litellm/llms/prompt_templates/factory.py#L1360)
To apply a custom prompt template:
<Tabs>
<TabItem value="sdk" label="SDK">
```python
import litellm
import os
os.environ["PREDIBASE_API_KEY"] = ""
# Create your own custom prompt template
litellm.register_prompt_template(
model="togethercomputer/LLaMA-2-7B-32K",
initial_prompt_value="You are a good assistant" # [OPTIONAL]
roles={
"system": {
"pre_message": "[INST] <<SYS>>\n", # [OPTIONAL]
"post_message": "\n<</SYS>>\n [/INST]\n" # [OPTIONAL]
},
"user": {
"pre_message": "[INST] ", # [OPTIONAL]
"post_message": " [/INST]" # [OPTIONAL]
},
"assistant": {
"pre_message": "\n" # [OPTIONAL]
"post_message": "\n" # [OPTIONAL]
}
}
final_prompt_value="Now answer as best you can:" # [OPTIONAL]
)
def predibase_custom_model():
model = "predibase/togethercomputer/LLaMA-2-7B-32K"
response = completion(model=model, messages=messages)
print(response['choices'][0]['message']['content'])
return response
predibase_custom_model()
```
</TabItem>
<TabItem value="proxy" label="PROXY">
```yaml
# Model-specific parameters
model_list:
- model_name: mistral-7b # model alias
litellm_params: # actual params for litellm.completion()
model: "predibase/mistralai/Mistral-7B-Instruct-v0.1"
api_key: os.environ/PREDIBASE_API_KEY
initial_prompt_value: "\n"
roles: {"system":{"pre_message":"<|im_start|>system\n", "post_message":"<|im_end|>"}, "assistant":{"pre_message":"<|im_start|>assistant\n","post_message":"<|im_end|>"}, "user":{"pre_message":"<|im_start|>user\n","post_message":"<|im_end|>"}}
final_prompt_value: "\n"
bos_token: "<s>"
eos_token: "</s>"
max_tokens: 4096
```
</TabItem>
</Tabs>
## Passing additional params - max_tokens, temperature
See all litellm.completion supported params [here](https://docs.litellm.ai/docs/completion/input)
```python
# !pip install litellm
from litellm import completion
import os
## set ENV variables
os.environ["PREDIBASE_API_KEY"] = "predibase key"
# predibae llama-3 call
response = completion(
model="predibase/llama3-8b-instruct",
messages = [{ "content": "Hello, how are you?","role": "user"}],
max_tokens=20,
temperature=0.5
)
```
**proxy**
```yaml
model_list:
- model_name: llama-3
litellm_params:
model: predibase/llama-3-8b-instruct
api_key: os.environ/PREDIBASE_API_KEY
max_tokens: 20
temperature: 0.5
```
## Passings Predibase specific params - adapter_id, adapter_source,
Send params [not supported by `litellm.completion()`](https://docs.litellm.ai/docs/completion/input) but supported by Predibase by passing them to `litellm.completion`
Example `adapter_id`, `adapter_source` are Predibase specific param - [See List](https://github.com/BerriAI/litellm/blob/8a35354dd6dbf4c2fcefcd6e877b980fcbd68c58/litellm/llms/predibase.py#L54)
```python
# !pip install litellm
from litellm import completion
import os
## set ENV variables
os.environ["PREDIBASE_API_KEY"] = "predibase key"
# predibase llama3 call
response = completion(
model="predibase/llama-3-8b-instruct",
messages = [{ "content": "Hello, how are you?","role": "user"}],
adapter_id="my_repo/3",
adapter_soruce="pbase",
)
```
**proxy**
```yaml
model_list:
- model_name: llama-3
litellm_params:
model: predibase/llama-3-8b-instruct
api_key: os.environ/PREDIBASE_API_KEY
adapter_id: my_repo/3
adapter_source: pbase
```

View file

@ -1,7 +1,16 @@
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
# Replicate
LiteLLM supports all models on Replicate
## Usage
<Tabs>
<TabItem value="sdk" label="SDK">
### API KEYS
```python
import os
@ -16,14 +25,175 @@ import os
## set ENV variables
os.environ["REPLICATE_API_KEY"] = "replicate key"
# replicate llama-2 call
# replicate llama-3 call
response = completion(
model="replicate/llama-2-70b-chat:2796ee9483c3fd7aa2e171d38f4ca12251a30609463dcfd4cd76703f22e96cdf",
model="replicate/meta/meta-llama-3-8b-instruct",
messages = [{ "content": "Hello, how are you?","role": "user"}]
)
```
### Example - Calling Replicate Deployments
</TabItem>
<TabItem value="proxy" label="PROXY">
1. Add models to your config.yaml
```yaml
model_list:
- model_name: llama-3
litellm_params:
model: replicate/meta/meta-llama-3-8b-instruct
api_key: os.environ/REPLICATE_API_KEY
```
2. Start the proxy
```bash
$ litellm --config /path/to/config.yaml --debug
```
3. Send Request to LiteLLM Proxy Server
<Tabs>
<TabItem value="openai" label="OpenAI Python v1.0.0+">
```python
import openai
client = openai.OpenAI(
api_key="sk-1234", # pass litellm proxy key, if you're using virtual keys
base_url="http://0.0.0.0:4000" # litellm-proxy-base url
)
response = client.chat.completions.create(
model="llama-3",
messages = [
{
"role": "system",
"content": "Be a good human!"
},
{
"role": "user",
"content": "What do you know about earth?"
}
]
)
print(response)
```
</TabItem>
<TabItem value="curl" label="curl">
```shell
curl --location 'http://0.0.0.0:4000/chat/completions' \
--header 'Authorization: Bearer sk-1234' \
--header 'Content-Type: application/json' \
--data '{
"model": "llama-3",
"messages": [
{
"role": "system",
"content": "Be a good human!"
},
{
"role": "user",
"content": "What do you know about earth?"
}
],
}'
```
</TabItem>
</Tabs>
### Expected Replicate Call
This is the call litellm will make to replicate, from the above example:
```bash
POST Request Sent from LiteLLM:
curl -X POST \
https://api.replicate.com/v1/models/meta/meta-llama-3-8b-instruct \
-H 'Authorization: Token your-api-key' -H 'Content-Type: application/json' \
-d '{'version': 'meta/meta-llama-3-8b-instruct', 'input': {'prompt': '<|start_header_id|>system<|end_header_id|>\n\nBe a good human!<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nWhat do you know about earth?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n'}}'
```
</TabItem>
</Tabs>
## Advanced Usage - Prompt Formatting
LiteLLM has prompt template mappings for all `meta-llama` llama3 instruct models. [**See Code**](https://github.com/BerriAI/litellm/blob/4f46b4c3975cd0f72b8c5acb2cb429d23580c18a/litellm/llms/prompt_templates/factory.py#L1360)
To apply a custom prompt template:
<Tabs>
<TabItem value="sdk" label="SDK">
```python
import litellm
import os
os.environ["REPLICATE_API_KEY"] = ""
# Create your own custom prompt template
litellm.register_prompt_template(
model="togethercomputer/LLaMA-2-7B-32K",
initial_prompt_value="You are a good assistant" # [OPTIONAL]
roles={
"system": {
"pre_message": "[INST] <<SYS>>\n", # [OPTIONAL]
"post_message": "\n<</SYS>>\n [/INST]\n" # [OPTIONAL]
},
"user": {
"pre_message": "[INST] ", # [OPTIONAL]
"post_message": " [/INST]" # [OPTIONAL]
},
"assistant": {
"pre_message": "\n" # [OPTIONAL]
"post_message": "\n" # [OPTIONAL]
}
}
final_prompt_value="Now answer as best you can:" # [OPTIONAL]
)
def test_replicate_custom_model():
model = "replicate/togethercomputer/LLaMA-2-7B-32K"
response = completion(model=model, messages=messages)
print(response['choices'][0]['message']['content'])
return response
test_replicate_custom_model()
```
</TabItem>
<TabItem value="proxy" label="PROXY">
```yaml
# Model-specific parameters
model_list:
- model_name: mistral-7b # model alias
litellm_params: # actual params for litellm.completion()
model: "replicate/mistralai/Mistral-7B-Instruct-v0.1"
api_key: os.environ/REPLICATE_API_KEY
initial_prompt_value: "\n"
roles: {"system":{"pre_message":"<|im_start|>system\n", "post_message":"<|im_end|>"}, "assistant":{"pre_message":"<|im_start|>assistant\n","post_message":"<|im_end|>"}, "user":{"pre_message":"<|im_start|>user\n","post_message":"<|im_end|>"}}
final_prompt_value: "\n"
bos_token: "<s>"
eos_token: "</s>"
max_tokens: 4096
```
</TabItem>
</Tabs>
## Advanced Usage - Calling Replicate Deployments
Calling a [deployed replicate LLM](https://replicate.com/deployments)
Add the `replicate/deployments/` prefix to your model, so litellm will call the `deployments` endpoint. This will call `ishaan-jaff/ishaan-mistral` deployment on replicate
@ -40,7 +210,7 @@ Replicate responses can take 3-5 mins due to replicate cold boots, if you're try
:::
### Replicate Models
## Replicate Models
liteLLM supports all replicate LLMs
For replicate models ensure to add a `replicate/` prefix to the `model` arg. liteLLM detects it using this arg.
@ -49,15 +219,15 @@ Below are examples on how to call replicate LLMs using liteLLM
Model Name | Function Call | Required OS Variables |
-----------------------------|----------------------------------------------------------------|--------------------------------------|
replicate/llama-2-70b-chat | `completion(model='replicate/llama-2-70b-chat:2796ee9483c3fd7aa2e171d38f4ca12251a30609463dcfd4cd76703f22e96cdf', messages, supports_system_prompt=True)` | `os.environ['REPLICATE_API_KEY']` |
a16z-infra/llama-2-13b-chat| `completion(model='replicate/a16z-infra/llama-2-13b-chat:2a7f981751ec7fdf87b5b91ad4db53683a98082e9ff7bfd12c8cd5ea85980a52', messages, supports_system_prompt=True)`| `os.environ['REPLICATE_API_KEY']` |
replicate/llama-2-70b-chat | `completion(model='replicate/llama-2-70b-chat:2796ee9483c3fd7aa2e171d38f4ca12251a30609463dcfd4cd76703f22e96cdf', messages)` | `os.environ['REPLICATE_API_KEY']` |
a16z-infra/llama-2-13b-chat| `completion(model='replicate/a16z-infra/llama-2-13b-chat:2a7f981751ec7fdf87b5b91ad4db53683a98082e9ff7bfd12c8cd5ea85980a52', messages)`| `os.environ['REPLICATE_API_KEY']` |
replicate/vicuna-13b | `completion(model='replicate/vicuna-13b:6282abe6a492de4145d7bb601023762212f9ddbbe78278bd6771c8b3b2f2a13b', messages)` | `os.environ['REPLICATE_API_KEY']` |
daanelson/flan-t5-large | `completion(model='replicate/daanelson/flan-t5-large:ce962b3f6792a57074a601d3979db5839697add2e4e02696b3ced4c022d4767f', messages)` | `os.environ['REPLICATE_API_KEY']` |
custom-llm | `completion(model='replicate/custom-llm-version-id', messages)` | `os.environ['REPLICATE_API_KEY']` |
replicate deployment | `completion(model='replicate/deployments/ishaan-jaff/ishaan-mistral', messages)` | `os.environ['REPLICATE_API_KEY']` |
### Passing additional params - max_tokens, temperature
## Passing additional params - max_tokens, temperature
See all litellm.completion supported params [here](https://docs.litellm.ai/docs/completion/input)
```python
@ -73,11 +243,22 @@ response = completion(
messages = [{ "content": "Hello, how are you?","role": "user"}],
max_tokens=20,
temperature=0.5
)
```
### Passings Replicate specific params
**proxy**
```yaml
model_list:
- model_name: llama-3
litellm_params:
model: replicate/meta/meta-llama-3-8b-instruct
api_key: os.environ/REPLICATE_API_KEY
max_tokens: 20
temperature: 0.5
```
## Passings Replicate specific params
Send params [not supported by `litellm.completion()`](https://docs.litellm.ai/docs/completion/input) but supported by Replicate by passing them to `litellm.completion`
Example `seed`, `min_tokens` are Replicate specific param
@ -98,3 +279,15 @@ response = completion(
top_k=20,
)
```
**proxy**
```yaml
model_list:
- model_name: llama-3
litellm_params:
model: replicate/meta/meta-llama-3-8b-instruct
api_key: os.environ/REPLICATE_API_KEY
min_tokens: 2
top_k: 20
```

View file

@ -0,0 +1,163 @@
# OpenAI (Text Completion)
LiteLLM supports OpenAI text completion models
### Required API Keys
```python
import os
os.environ["OPENAI_API_KEY"] = "your-api-key"
```
### Usage
```python
import os
from litellm import completion
os.environ["OPENAI_API_KEY"] = "your-api-key"
# openai call
response = completion(
model = "gpt-3.5-turbo-instruct",
messages=[{ "content": "Hello, how are you?","role": "user"}]
)
```
### Usage - LiteLLM Proxy Server
Here's how to call OpenAI models with the LiteLLM Proxy Server
### 1. Save key in your environment
```bash
export OPENAI_API_KEY=""
```
### 2. Start the proxy
<Tabs>
<TabItem value="config" label="config.yaml">
```yaml
model_list:
- model_name: gpt-3.5-turbo
litellm_params:
model: openai/gpt-3.5-turbo # The `openai/` prefix will call openai.chat.completions.create
api_key: os.environ/OPENAI_API_KEY
- model_name: gpt-3.5-turbo-instruct
litellm_params:
model: text-completion-openai/gpt-3.5-turbo-instruct # The `text-completion-openai/` prefix will call openai.completions.create
api_key: os.environ/OPENAI_API_KEY
```
</TabItem>
<TabItem value="config-*" label="config.yaml - proxy all OpenAI models">
Use this to add all openai models with one API Key. **WARNING: This will not do any load balancing**
This means requests to `gpt-4`, `gpt-3.5-turbo` , `gpt-4-turbo-preview` will all go through this route
```yaml
model_list:
- model_name: "*" # all requests where model not in your config go to this deployment
litellm_params:
model: openai/* # set `openai/` to use the openai route
api_key: os.environ/OPENAI_API_KEY
```
</TabItem>
<TabItem value="cli" label="CLI">
```bash
$ litellm --model gpt-3.5-turbo-instruct
# Server running on http://0.0.0.0:4000
```
</TabItem>
</Tabs>
### 3. Test it
<Tabs>
<TabItem value="Curl" label="Curl Request">
```shell
curl --location 'http://0.0.0.0:4000/chat/completions' \
--header 'Content-Type: application/json' \
--data ' {
"model": "gpt-3.5-turbo-instruct",
"messages": [
{
"role": "user",
"content": "what llm are you"
}
]
}
'
```
</TabItem>
<TabItem value="openai" label="OpenAI v1.0.0+">
```python
import openai
client = openai.OpenAI(
api_key="anything",
base_url="http://0.0.0.0:4000"
)
# request sent to model set on litellm proxy, `litellm --model`
response = client.chat.completions.create(model="gpt-3.5-turbo-instruct", messages = [
{
"role": "user",
"content": "this is a test request, write a short poem"
}
])
print(response)
```
</TabItem>
<TabItem value="langchain" label="Langchain">
```python
from langchain.chat_models import ChatOpenAI
from langchain.prompts.chat import (
ChatPromptTemplate,
HumanMessagePromptTemplate,
SystemMessagePromptTemplate,
)
from langchain.schema import HumanMessage, SystemMessage
chat = ChatOpenAI(
openai_api_base="http://0.0.0.0:4000", # set openai_api_base to the LiteLLM Proxy
model = "gpt-3.5-turbo-instruct",
temperature=0.1
)
messages = [
SystemMessage(
content="You are a helpful assistant that im using to make a test request to."
),
HumanMessage(
content="test from litellm. tell me why it's amazing in 1 sentence"
),
]
response = chat(messages)
print(response)
```
</TabItem>
</Tabs>
## OpenAI Text Completion Models / Instruct Models
| Model Name | Function Call |
|---------------------|----------------------------------------------------|
| gpt-3.5-turbo-instruct | `response = completion(model="gpt-3.5-turbo-instruct", messages=messages)` |
| gpt-3.5-turbo-instruct-0914 | `response = completion(model="gpt-3.5-turbo-instruct-0914", messages=messages)` |
| text-davinci-003 | `response = completion(model="text-davinci-003", messages=messages)` |
| ada-001 | `response = completion(model="ada-001", messages=messages)` |
| curie-001 | `response = completion(model="curie-001", messages=messages)` |
| babbage-001 | `response = completion(model="babbage-001", messages=messages)` |
| babbage-002 | `response = completion(model="babbage-002", messages=messages)` |
| davinci-002 | `response = completion(model="davinci-002", messages=messages)` |

View file

@ -0,0 +1,95 @@
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
# Triton Inference Server
LiteLLM supports Embedding Models on Triton Inference Servers
## Usage
<Tabs>
<TabItem value="sdk" label="SDK">
### Example Call
Use the `triton/` prefix to route to triton server
```python
from litellm import embedding
import os
response = await litellm.aembedding(
model="triton/<your-triton-model>",
api_base="https://your-triton-api-base/triton/embeddings", # /embeddings endpoint you want litellm to call on your server
input=["good morning from litellm"],
)
```
</TabItem>
<TabItem value="proxy" label="PROXY">
1. Add models to your config.yaml
```yaml
model_list:
- model_name: my-triton-model
litellm_params:
model: triton/<your-triton-model>"
api_base: https://your-triton-api-base/triton/embeddings
```
2. Start the proxy
```bash
$ litellm --config /path/to/config.yaml --detailed_debug
```
3. Send Request to LiteLLM Proxy Server
<Tabs>
<TabItem value="openai" label="OpenAI Python v1.0.0+">
```python
import openai
from openai import OpenAI
# set base_url to your proxy server
# set api_key to send to proxy server
client = OpenAI(api_key="<proxy-api-key>", base_url="http://0.0.0.0:4000")
response = client.embeddings.create(
input=["hello from litellm"],
model="my-triton-model"
)
print(response)
```
</TabItem>
<TabItem value="curl" label="curl">
`--header` is optional, only required if you're using litellm proxy with Virtual Keys
```shell
curl --location 'http://0.0.0.0:4000/embeddings' \
--header 'Content-Type: application/json' \
--header 'Authorization: Bearer sk-1234' \
--data ' {
"model": "my-triton-model",
"input": ["write a litellm poem"]
}'
```
</TabItem>
</Tabs>
</TabItem>
</Tabs>

View file

@ -253,6 +253,7 @@ litellm.vertex_location = "us-central1 # Your Location
## Anthropic
| Model Name | Function Call |
|------------------|--------------------------------------|
| claude-3-opus@20240229 | `completion('vertex_ai/claude-3-opus@20240229', messages)` |
| claude-3-sonnet@20240229 | `completion('vertex_ai/claude-3-sonnet@20240229', messages)` |
| claude-3-haiku@20240307 | `completion('vertex_ai/claude-3-haiku@20240307', messages)` |
@ -363,6 +364,8 @@ response = completion(
| Model Name | Function Call |
|------------------|--------------------------------------|
| gemini-1.5-pro | `completion('gemini-1.5-pro', messages)`, `completion('vertex_ai/gemini-pro', messages)` |
| gemini-1.5-flash-preview-0514 | `completion('gemini-1.5-flash-preview-0514', messages)`, `completion('vertex_ai/gemini-pro', messages)` |
| gemini-1.5-pro-preview-0514 | `completion('gemini-1.5-pro-preview-0514', messages)`, `completion('vertex_ai/gemini-1.5-pro-preview-0514', messages)` |
@ -476,6 +479,36 @@ print(response)
| code-gecko@latest| `completion('code-gecko@latest', messages)` |
## Embedding Models
#### Usage - Embedding
```python
import litellm
from litellm import embedding
litellm.vertex_project = "hardy-device-38811" # Your Project ID
litellm.vertex_location = "us-central1" # proj location
response = embedding(
model="vertex_ai/textembedding-gecko",
input=["good morning from litellm"],
)
print(response)
```
#### Supported Embedding Models
All models listed [here](https://github.com/BerriAI/litellm/blob/57f37f743886a0249f630a6792d49dffc2c5d9b7/model_prices_and_context_window.json#L835) are supported
| Model Name | Function Call |
|--------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| textembedding-gecko | `embedding(model="vertex_ai/textembedding-gecko", input)` |
| textembedding-gecko-multilingual | `embedding(model="vertex_ai/textembedding-gecko-multilingual", input)` |
| textembedding-gecko-multilingual@001 | `embedding(model="vertex_ai/textembedding-gecko-multilingual@001", input)` |
| textembedding-gecko@001 | `embedding(model="vertex_ai/textembedding-gecko@001", input)` |
| textembedding-gecko@003 | `embedding(model="vertex_ai/textembedding-gecko@003", input)` |
| text-embedding-preview-0409 | `embedding(model="vertex_ai/text-embedding-preview-0409", input)` |
| text-multilingual-embedding-preview-0409 | `embedding(model="vertex_ai/text-multilingual-embedding-preview-0409", input)` |
## Extra
### Using `GOOGLE_APPLICATION_CREDENTIALS`
@ -519,6 +552,12 @@ def load_vertex_ai_credentials():
### Using GCP Service Account
:::info
Trying to deploy LiteLLM on Google Cloud Run? Tutorial [here](https://docs.litellm.ai/docs/proxy/deploy#deploy-on-google-cloud-run)
:::
1. Figure out the Service Account bound to the Google Cloud Run service
<Image img={require('../../img/gcp_acc_1.png')} />

View file

@ -4,6 +4,13 @@ LiteLLM supports all models on VLLM.
🚀[Code Tutorial](https://github.com/BerriAI/litellm/blob/main/cookbook/VLLM_Model_Testing.ipynb)
:::info
To call a HOSTED VLLM Endpoint use [these docs](./openai_compatible.md)
:::
### Quick Start
```
pip install litellm vllm

View file

@ -0,0 +1,284 @@
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
# IBM watsonx.ai
LiteLLM supports all IBM [watsonx.ai](https://watsonx.ai/) foundational models and embeddings.
## Environment Variables
```python
os.environ["WATSONX_URL"] = "" # (required) Base URL of your WatsonX instance
# (required) either one of the following:
os.environ["WATSONX_APIKEY"] = "" # IBM cloud API key
os.environ["WATSONX_TOKEN"] = "" # IAM auth token
# optional - can also be passed as params to completion() or embedding()
os.environ["WATSONX_PROJECT_ID"] = "" # Project ID of your WatsonX instance
os.environ["WATSONX_DEPLOYMENT_SPACE_ID"] = "" # ID of your deployment space to use deployed models
```
See [here](https://cloud.ibm.com/apidocs/watsonx-ai#api-authentication) for more information on how to get an access token to authenticate to watsonx.ai.
## Usage
<a target="_blank" href="https://colab.research.google.com/github/BerriAI/litellm/blob/main/cookbook/liteLLM_IBM_Watsonx.ipynb">
<img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/>
</a>
```python
import os
from litellm import completion
os.environ["WATSONX_URL"] = ""
os.environ["WATSONX_APIKEY"] = ""
response = completion(
model="watsonx/ibm/granite-13b-chat-v2",
messages=[{ "content": "what is your favorite colour?","role": "user"}],
project_id="<my-project-id>" # or pass with os.environ["WATSONX_PROJECT_ID"]
)
response = completion(
model="watsonx/meta-llama/llama-3-8b-instruct",
messages=[{ "content": "what is your favorite colour?","role": "user"}],
project_id="<my-project-id>"
)
```
## Usage - Streaming
```python
import os
from litellm import completion
os.environ["WATSONX_URL"] = ""
os.environ["WATSONX_APIKEY"] = ""
os.environ["WATSONX_PROJECT_ID"] = ""
response = completion(
model="watsonx/ibm/granite-13b-chat-v2",
messages=[{ "content": "what is your favorite colour?","role": "user"}],
stream=True
)
for chunk in response:
print(chunk)
```
#### Example Streaming Output Chunk
```json
{
"choices": [
{
"finish_reason": null,
"index": 0,
"delta": {
"content": "I don't have a favorite color, but I do like the color blue. What's your favorite color?"
}
}
],
"created": null,
"model": "watsonx/ibm/granite-13b-chat-v2",
"usage": {
"prompt_tokens": null,
"completion_tokens": null,
"total_tokens": null
}
}
```
## Usage - Models in deployment spaces
Models that have been deployed to a deployment space (e.g.: tuned models) can be called using the `deployment/<deployment_id>` format (where `<deployment_id>` is the ID of the deployed model in your deployment space).
The ID of your deployment space must also be set in the environment variable `WATSONX_DEPLOYMENT_SPACE_ID` or passed to the function as `space_id=<deployment_space_id>`.
```python
import litellm
response = litellm.completion(
model="watsonx/deployment/<deployment_id>",
messages=[{"content": "Hello, how are you?", "role": "user"}],
space_id="<deployment_space_id>"
)
```
## Usage - Embeddings
LiteLLM also supports making requests to IBM watsonx.ai embedding models. The credential needed for this is the same as for completion.
```python
from litellm import embedding
response = embedding(
model="watsonx/ibm/slate-30m-english-rtrvr",
input=["What is the capital of France?"],
project_id="<my-project-id>"
)
print(response)
# EmbeddingResponse(model='ibm/slate-30m-english-rtrvr', data=[{'object': 'embedding', 'index': 0, 'embedding': [-0.037463713, -0.02141933, -0.02851813, 0.015519324, ..., -0.0021367231, -0.01704561, -0.001425816, 0.0035238306]}], object='list', usage=Usage(prompt_tokens=8, total_tokens=8))
```
## OpenAI Proxy Usage
Here's how to call IBM watsonx.ai with the LiteLLM Proxy Server
### 1. Save keys in your environment
```bash
export WATSONX_URL=""
export WATSONX_APIKEY=""
export WATSONX_PROJECT_ID=""
```
### 2. Start the proxy
<Tabs>
<TabItem value="cli" label="CLI">
```bash
$ litellm --model watsonx/meta-llama/llama-3-8b-instruct
# Server running on http://0.0.0.0:4000
```
</TabItem>
<TabItem value="config" label="config.yaml">
```yaml
model_list:
- model_name: llama-3-8b
litellm_params:
# all params accepted by litellm.completion()
model: watsonx/meta-llama/llama-3-8b-instruct
api_key: "os.environ/WATSONX_API_KEY" # does os.getenv("WATSONX_API_KEY")
```
</TabItem>
</Tabs>
### 3. Test it
<Tabs>
<TabItem value="Curl" label="Curl Request">
```shell
curl --location 'http://0.0.0.0:4000/chat/completions' \
--header 'Content-Type: application/json' \
--data ' {
"model": "llama-3-8b",
"messages": [
{
"role": "user",
"content": "what is your favorite colour?"
}
]
}
'
```
</TabItem>
<TabItem value="openai" label="OpenAI v1.0.0+">
```python
import openai
client = openai.OpenAI(
api_key="anything",
base_url="http://0.0.0.0:4000"
)
# request sent to model set on litellm proxy, `litellm --model`
response = client.chat.completions.create(model="llama-3-8b", messages=[
{
"role": "user",
"content": "what is your favorite colour?"
}
])
print(response)
```
</TabItem>
<TabItem value="langchain" label="Langchain">
```python
from langchain.chat_models import ChatOpenAI
from langchain.prompts.chat import (
ChatPromptTemplate,
HumanMessagePromptTemplate,
SystemMessagePromptTemplate,
)
from langchain.schema import HumanMessage, SystemMessage
chat = ChatOpenAI(
openai_api_base="http://0.0.0.0:4000", # set openai_api_base to the LiteLLM Proxy
model = "llama-3-8b",
temperature=0.1
)
messages = [
SystemMessage(
content="You are a helpful assistant that im using to make a test request to."
),
HumanMessage(
content="test from litellm. tell me why it's amazing in 1 sentence"
),
]
response = chat(messages)
print(response)
```
</TabItem>
</Tabs>
## Authentication
### Passing credentials as parameters
You can also pass the credentials as parameters to the completion and embedding functions.
```python
import os
from litellm import completion
response = completion(
model="watsonx/ibm/granite-13b-chat-v2",
messages=[{ "content": "What is your favorite color?","role": "user"}],
url="",
api_key="",
project_id=""
)
```
## Supported IBM watsonx.ai Models
Here are some examples of models available in IBM watsonx.ai that you can use with LiteLLM:
| Mode Name | Command |
| ---------- | --------- |
| Flan T5 XXL | `completion(model=watsonx/google/flan-t5-xxl, messages=messages)` |
| Flan Ul2 | `completion(model=watsonx/google/flan-ul2, messages=messages)` |
| Mt0 XXL | `completion(model=watsonx/bigscience/mt0-xxl, messages=messages)` |
| Gpt Neox | `completion(model=watsonx/eleutherai/gpt-neox-20b, messages=messages)` |
| Mpt 7B Instruct2 | `completion(model=watsonx/ibm/mpt-7b-instruct2, messages=messages)` |
| Starcoder | `completion(model=watsonx/bigcode/starcoder, messages=messages)` |
| Llama 2 70B Chat | `completion(model=watsonx/meta-llama/llama-2-70b-chat, messages=messages)` |
| Llama 2 13B Chat | `completion(model=watsonx/meta-llama/llama-2-13b-chat, messages=messages)` |
| Granite 13B Instruct | `completion(model=watsonx/ibm/granite-13b-instruct-v1, messages=messages)` |
| Granite 13B Chat | `completion(model=watsonx/ibm/granite-13b-chat-v1, messages=messages)` |
| Flan T5 XL | `completion(model=watsonx/google/flan-t5-xl, messages=messages)` |
| Granite 13B Chat V2 | `completion(model=watsonx/ibm/granite-13b-chat-v2, messages=messages)` |
| Granite 13B Instruct V2 | `completion(model=watsonx/ibm/granite-13b-instruct-v2, messages=messages)` |
| Elyza Japanese Llama 2 7B Instruct | `completion(model=watsonx/elyza/elyza-japanese-llama-2-7b-instruct, messages=messages)` |
| Mixtral 8X7B Instruct V01 Q | `completion(model=watsonx/ibm-mistralai/mixtral-8x7b-instruct-v01-q, messages=messages)` |
For a list of all available models in watsonx.ai, see [here](https://dataplatform.cloud.ibm.com/docs/content/wsj/analyze-data/fm-models.html?context=wx&locale=en&audience=wdp).
## Supported IBM watsonx.ai Embedding Models
| Model Name | Function Call |
|----------------------|---------------------------------------------|
| Slate 30m | `embedding(model="watsonx/ibm/slate-30m-english-rtrvr", input=input)` |
| Slate 125m | `embedding(model="watsonx/ibm/slate-125m-english-rtrvr", input=input)` |
For a list of all available embedding models in watsonx.ai, see [here](https://dataplatform.cloud.ibm.com/docs/content/wsj/analyze-data/fm-models-embed.html?context=wx).

View file

@ -1,13 +1,18 @@
# Slack Alerting
# 🚨 Alerting
Get alerts for:
- hanging LLM api calls
- failed LLM api calls
- slow LLM api calls
- budget Tracking per key/user:
- When a User/Key crosses their Budget
- When a User/Key is 15% away from crossing their Budget
- failed db read/writes
- Hanging LLM api calls
- Slow LLM api calls
- Failed LLM api calls
- Budget Tracking per key/user
- Spend Reports - Weekly & Monthly spend per Team, Tag
- Failed db read/writes
- Daily Reports:
- **LLM** Top 5 slowest deployments
- **LLM** Top 5 deployments with most failed requests
- **Spend** Weekly & Monthly spend per Team, Tag
## Quick Start
@ -17,10 +22,12 @@ Set up a slack alert channel to receive alerts from proxy.
Get a slack webhook url from https://api.slack.com/messaging/webhooks
You can also use Discord Webhooks, see [here](#using-discord-webhooks)
### Step 2: Update config.yaml
Let's save a bad key to our proxy
- Set `SLACK_WEBHOOK_URL` in your proxy env to enable Slack alerts.
- Just for testing purposes, let's save a bad key to our proxy.
```yaml
model_list:
@ -33,16 +40,88 @@ general_settings:
alerting: ["slack"]
alerting_threshold: 300 # sends alerts if requests hang for 5min+ and responses take 5min+
environment_variables:
SLACK_WEBHOOK_URL: "https://hooks.slack.com/services/<>/<>/<>"
SLACK_DAILY_REPORT_FREQUENCY: "86400" # 24 hours; Optional: defaults to 12 hours
```
Set `SLACK_WEBHOOK_URL` in your proxy env
```shell
SLACK_WEBHOOK_URL: "https://hooks.slack.com/services/<>/<>/<>"
```
### Step 3: Start proxy
```bash
$ litellm --config /path/to/config.yaml
```
## Testing Alerting is Setup Correctly
Make a GET request to `/health/services`, expect to see a test slack alert in your provided webhook slack channel
```shell
curl -X GET 'http://localhost:4000/health/services?service=slack' \
-H 'Authorization: Bearer sk-1234'
```
## Advanced
### Opting into specific alert types
Set `alert_types` if you want to Opt into only specific alert types
```shell
general_settings:
alerting: ["slack"]
alert_types: ["spend_reports"]
```
All Possible Alert Types
```python
alert_types:
Optional[
List[
Literal[
"llm_exceptions",
"llm_too_slow",
"llm_requests_hanging",
"budget_alerts",
"db_exceptions",
"daily_reports",
"spend_reports",
"cooldown_deployment",
"new_model_added",
]
]
```
### Using Discord Webhooks
Discord provides a slack compatible webhook url that you can use for alerting
##### Quick Start
1. Get a webhook url for your discord channel
2. Append `/slack` to your discord webhook - it should look like
```
"https://discord.com/api/webhooks/1240030362193760286/cTLWt5ATn1gKmcy_982rl5xmYHsrM1IWJdmCL1AyOmU9JdQXazrp8L1_PYgUtgxj8x4f/slack"
```
3. Add it to your litellm config
```yaml
model_list:
model_name: "azure-model"
litellm_params:
model: "azure/gpt-35-turbo"
api_key: "my-bad-key" # 👈 bad key
general_settings:
alerting: ["slack"]
alerting_threshold: 300 # sends alerts if requests hang for 5min+ and responses take 5min+
environment_variables:
SLACK_WEBHOOK_URL: "https://discord.com/api/webhooks/1240030362193760286/cTLWt5ATn1gKmcy_982rl5xmYHsrM1IWJdmCL1AyOmU9JdQXazrp8L1_PYgUtgxj8x4f/slack"
```
That's it ! You're ready to go !

View file

@ -0,0 +1,319 @@
import Image from '@theme/IdealImage';
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
# 💵 Billing
Bill internal teams, external customers for their usage
**🚨 Requirements**
- [Setup Lago](https://docs.getlago.com/guide/self-hosted/docker#run-the-app), for usage-based billing. We recommend following [their Stripe tutorial](https://docs.getlago.com/templates/per-transaction/stripe#step-1-create-billable-metrics-for-transaction)
Steps:
- Connect the proxy to Lago
- Set the id you want to bill for (customers, internal users, teams)
- Start!
## Quick Start
Bill internal teams for their usage
### 1. Connect proxy to Lago
Set 'lago' as a callback on your proxy config.yaml
```yaml
model_name:
- model_name: fake-openai-endpoint
litellm_params:
model: openai/fake
api_key: fake-key
api_base: https://exampleopenaiendpoint-production.up.railway.app/
litellm_settings:
callbacks: ["lago"] # 👈 KEY CHANGE
general_settings:
master_key: sk-1234
```
Add your Lago keys to the environment
```bash
export LAGO_API_BASE="http://localhost:3000" # self-host - https://docs.getlago.com/guide/self-hosted/docker#run-the-app
export LAGO_API_KEY="3e29d607-de54-49aa-a019-ecf585729070" # Get key - https://docs.getlago.com/guide/self-hosted/docker#find-your-api-key
export LAGO_API_EVENT_CODE="openai_tokens" # name of lago billing code
export LAGO_API_CHARGE_BY="team_id" # 👈 Charges 'team_id' attached to proxy key
```
Start proxy
```bash
litellm --config /path/to/config.yaml
```
### 2. Create Key for Internal Team
```bash
curl 'http://0.0.0.0:4000/key/generate' \
--header 'Authorization: Bearer sk-1234' \
--header 'Content-Type: application/json' \
--data-raw '{"team_id": "my-unique-id"}' # 👈 Internal Team's ID
```
Response Object:
```bash
{
"key": "sk-tXL0wt5-lOOVK9sfY2UacA",
}
```
### 3. Start billing!
<Tabs>
<TabItem value="curl" label="Curl">
```bash
curl --location 'http://0.0.0.0:4000/chat/completions' \
--header 'Content-Type: application/json' \
--header 'Authorization: Bearer sk-tXL0wt5-lOOVK9sfY2UacA' \ # 👈 Team's Key
--data ' {
"model": "fake-openai-endpoint",
"messages": [
{
"role": "user",
"content": "what llm are you"
}
],
}
'
```
</TabItem>
<TabItem value="openai_python" label="OpenAI Python SDK">
```python
import openai
client = openai.OpenAI(
api_key="sk-tXL0wt5-lOOVK9sfY2UacA", # 👈 Team's Key
base_url="http://0.0.0.0:4000"
)
# request sent to model set on litellm proxy, `litellm --model`
response = client.chat.completions.create(model="gpt-3.5-turbo", messages = [
{
"role": "user",
"content": "this is a test request, write a short poem"
}
])
print(response)
```
</TabItem>
<TabItem value="langchain" label="Langchain">
```python
from langchain.chat_models import ChatOpenAI
from langchain.prompts.chat import (
ChatPromptTemplate,
HumanMessagePromptTemplate,
SystemMessagePromptTemplate,
)
from langchain.schema import HumanMessage, SystemMessage
import os
os.environ["OPENAI_API_KEY"] = "sk-tXL0wt5-lOOVK9sfY2UacA" # 👈 Team's Key
chat = ChatOpenAI(
openai_api_base="http://0.0.0.0:4000",
model = "gpt-3.5-turbo",
temperature=0.1,
)
messages = [
SystemMessage(
content="You are a helpful assistant that im using to make a test request to."
),
HumanMessage(
content="test from litellm. tell me why it's amazing in 1 sentence"
),
]
response = chat(messages)
print(response)
```
</TabItem>
</Tabs>
**See Results on Lago**
<Image img={require('../../img/lago_2.png')} style={{ width: '500px', height: 'auto' }} />
## Advanced - Lago Logging object
This is what LiteLLM will log to Lagos
```
{
"event": {
"transaction_id": "<generated_unique_id>",
"external_customer_id": <selected_id>, # either 'end_user_id', 'user_id', or 'team_id'. Default 'end_user_id'.
"code": os.getenv("LAGO_API_EVENT_CODE"),
"properties": {
"input_tokens": <number>,
"output_tokens": <number>,
"model": <string>,
"response_cost": <number>, # 👈 LITELLM CALCULATED RESPONSE COST - https://github.com/BerriAI/litellm/blob/d43f75150a65f91f60dc2c0c9462ce3ffc713c1f/litellm/utils.py#L1473
}
}
}
```
## Advanced - Bill Customers, Internal Users
For:
- Customers (id passed via 'user' param in /chat/completion call) = 'end_user_id'
- Internal Users (id set when [creating keys](https://docs.litellm.ai/docs/proxy/virtual_keys#advanced---spend-tracking)) = 'user_id'
- Teams (id set when [creating keys](https://docs.litellm.ai/docs/proxy/virtual_keys#advanced---spend-tracking)) = 'team_id'
<Tabs>
<TabItem value="customers" label="Customer Billing">
1. Set 'LAGO_API_CHARGE_BY' to 'end_user_id'
```bash
export LAGO_API_CHARGE_BY="end_user_id"
```
2. Test it!
<Tabs>
<TabItem value="curl" label="Curl">
```shell
curl --location 'http://0.0.0.0:4000/chat/completions' \
--header 'Content-Type: application/json' \
--data ' {
"model": "gpt-3.5-turbo",
"messages": [
{
"role": "user",
"content": "what llm are you"
}
],
"user": "my_customer_id" # 👈 whatever your customer id is
}
'
```
</TabItem>
<TabItem value="openai_sdk" label="OpenAI Python SDK">
```python
import openai
client = openai.OpenAI(
api_key="anything",
base_url="http://0.0.0.0:4000"
)
# request sent to model set on litellm proxy, `litellm --model`
response = client.chat.completions.create(model="gpt-3.5-turbo", messages = [
{
"role": "user",
"content": "this is a test request, write a short poem"
}
], user="my_customer_id") # 👈 whatever your customer id is
print(response)
```
</TabItem>
<TabItem value="langchain" label="Langchain">
```python
from langchain.chat_models import ChatOpenAI
from langchain.prompts.chat import (
ChatPromptTemplate,
HumanMessagePromptTemplate,
SystemMessagePromptTemplate,
)
from langchain.schema import HumanMessage, SystemMessage
import os
os.environ["OPENAI_API_KEY"] = "anything"
chat = ChatOpenAI(
openai_api_base="http://0.0.0.0:4000",
model = "gpt-3.5-turbo",
temperature=0.1,
extra_body={
"user": "my_customer_id" # 👈 whatever your customer id is
}
)
messages = [
SystemMessage(
content="You are a helpful assistant that im using to make a test request to."
),
HumanMessage(
content="test from litellm. tell me why it's amazing in 1 sentence"
),
]
response = chat(messages)
print(response)
```
</TabItem>
</Tabs>
</TabItem>
<TabItem value="users" label="Internal User Billing">
1. Set 'LAGO_API_CHARGE_BY' to 'user_id'
```bash
export LAGO_API_CHARGE_BY="user_id"
```
2. Create a key for that user
```bash
curl 'http://0.0.0.0:4000/key/generate' \
--header 'Authorization: Bearer <your-master-key>' \
--header 'Content-Type: application/json' \
--data-raw '{"user_id": "my-unique-id"}' # 👈 Internal User's id
```
Response Object:
```bash
{
"key": "sk-tXL0wt5-lOOVK9sfY2UacA",
}
```
3. Make API Calls with that Key
```python
import openai
client = openai.OpenAI(
api_key="sk-tXL0wt5-lOOVK9sfY2UacA", # 👈 Generated key
base_url="http://0.0.0.0:4000"
)
# request sent to model set on litellm proxy, `litellm --model`
response = client.chat.completions.create(model="gpt-3.5-turbo", messages = [
{
"role": "user",
"content": "this is a test request, write a short poem"
}
])
print(response)
```
</TabItem>
</Tabs>

View file

@ -61,6 +61,22 @@ litellm_settings:
ttl: 600 # will be cached on redis for 600s
```
## SSL
just set `REDIS_SSL="True"` in your .env, and LiteLLM will pick this up.
```env
REDIS_SSL="True"
```
For quick testing, you can also use REDIS_URL, eg.:
```
REDIS_URL="rediss://.."
```
but we **don't** recommend using REDIS_URL in prod. We've noticed a performance difference between using it vs. redis_host, port, etc.
#### Step 2: Add Redis Credentials to .env
Set either `REDIS_URL` or the `REDIS_HOST` in your os environment, to enable caching.

View file

@ -62,9 +62,11 @@ model_list:
litellm_settings: # module level litellm settings - https://github.com/BerriAI/litellm/blob/main/litellm/__init__.py
drop_params: True
success_callback: ["langfuse"] # OPTIONAL - if you want to start sending LLM Logs to Langfuse. Make sure to set `LANGFUSE_PUBLIC_KEY` and `LANGFUSE_SECRET_KEY` in your env
general_settings:
master_key: sk-1234 # [OPTIONAL] Only use this if you to require all calls to contain this key (Authorization: Bearer sk-1234)
alerting: ["slack"] # [OPTIONAL] If you want Slack Alerts for Hanging LLM requests, Slow llm responses, Budget Alerts. Make sure to set `SLACK_WEBHOOK_URL` in your env
```
:::info
@ -600,6 +602,7 @@ general_settings:
"general_settings": {
"completion_model": "string",
"disable_spend_logs": "boolean", # turn off writing each transaction to the db
"disable_master_key_return": "boolean", # turn off returning master key on UI (checked on '/user/info' endpoint)
"disable_reset_budget": "boolean", # turn off reset budget scheduled task
"enable_jwt_auth": "boolean", # allow proxy admin to auth in via jwt tokens with 'litellm_proxy_admin' in claims
"enforce_user_param": "boolean", # requires all openai endpoint requests to have a 'user' param

View file

@ -1,8 +1,161 @@
# Cost Tracking - Azure
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
# 💸 Spend Tracking
Track spend for keys, users, and teams across 100+ LLMs.
## Getting Spend Reports - To Charge Other Teams, API Keys
Use the `/global/spend/report` endpoint to get daily spend per team, with a breakdown of spend per API Key, Model
### Example Request
```shell
curl -X GET 'http://localhost:4000/global/spend/report?start_date=2024-04-01&end_date=2024-06-30' \
-H 'Authorization: Bearer sk-1234'
```
### Example Response
<Tabs>
<TabItem value="response" label="Expected Response">
```shell
[
{
"group_by_day": "2024-04-30T00:00:00+00:00",
"teams": [
{
"team_name": "Prod Team",
"total_spend": 0.0015265,
"metadata": [ # see the spend by unique(key + model)
{
"model": "gpt-4",
"spend": 0.00123,
"total_tokens": 28,
"api_key": "88dc28.." # the hashed api key
},
{
"model": "gpt-4",
"spend": 0.00123,
"total_tokens": 28,
"api_key": "a73dc2.." # the hashed api key
},
{
"model": "chatgpt-v-2",
"spend": 0.000214,
"total_tokens": 122,
"api_key": "898c28.." # the hashed api key
},
{
"model": "gpt-3.5-turbo",
"spend": 0.0000825,
"total_tokens": 85,
"api_key": "84dc28.." # the hashed api key
}
]
}
]
}
]
```
</TabItem>
<TabItem value="py-script" label="Script to Parse Response (Python)">
```python
import requests
url = 'http://localhost:4000/global/spend/report'
params = {
'start_date': '2023-04-01',
'end_date': '2024-06-30'
}
headers = {
'Authorization': 'Bearer sk-1234'
}
# Make the GET request
response = requests.get(url, headers=headers, params=params)
spend_report = response.json()
for row in spend_report:
date = row["group_by_day"]
teams = row["teams"]
for team in teams:
team_name = team["team_name"]
total_spend = team["total_spend"]
metadata = team["metadata"]
print(f"Date: {date}")
print(f"Team: {team_name}")
print(f"Total Spend: {total_spend}")
print("Metadata: ", metadata)
print()
```
Output from script
```shell
# Date: 2024-05-11T00:00:00+00:00
# Team: local_test_team
# Total Spend: 0.003675099999999999
# Metadata: [{'model': 'gpt-3.5-turbo', 'spend': 0.003675099999999999, 'api_key': 'b94d5e0bc3a71a573917fe1335dc0c14728c7016337451af9714924ff3a729db', 'total_tokens': 3105}]
# Date: 2024-05-13T00:00:00+00:00
# Team: Unassigned Team
# Total Spend: 3.4e-05
# Metadata: [{'model': 'gpt-3.5-turbo', 'spend': 3.4e-05, 'api_key': '9569d13c9777dba68096dea49b0b03e0aaf4d2b65d4030eda9e8a2733c3cd6e0', 'total_tokens': 50}]
# Date: 2024-05-13T00:00:00+00:00
# Team: central
# Total Spend: 0.000684
# Metadata: [{'model': 'gpt-3.5-turbo', 'spend': 0.000684, 'api_key': '0323facdf3af551594017b9ef162434a9b9a8ca1bbd9ccbd9d6ce173b1015605', 'total_tokens': 498}]
# Date: 2024-05-13T00:00:00+00:00
# Team: local_test_team
# Total Spend: 0.0005715000000000001
# Metadata: [{'model': 'gpt-3.5-turbo', 'spend': 0.0005715000000000001, 'api_key': 'b94d5e0bc3a71a573917fe1335dc0c14728c7016337451af9714924ff3a729db', 'total_tokens': 423}]
```
</TabItem>
</Tabs>
## Reset Team, API Key Spend - MASTER KEY ONLY
Use `/global/spend/reset` if you want to:
- Reset the Spend for all API Keys, Teams. The `spend` for ALL Teams and Keys in `LiteLLM_TeamTable` and `LiteLLM_VerificationToken` will be set to `spend=0`
- LiteLLM will maintain all the logs in `LiteLLMSpendLogs` for Auditing Purposes
### Request
Only the `LITELLM_MASTER_KEY` you set can access this route
```shell
curl -X POST \
'http://localhost:4000/global/spend/reset' \
-H 'Authorization: Bearer sk-1234' \
-H 'Content-Type: application/json'
```
### Expected Responses
```shell
{"message":"Spend for all API Keys and Teams reset successfully","status":"success"}
```
## Spend Tracking for Azure
Set base model for cost tracking azure image-gen call
## Image Generation
### Image Generation
```yaml
model_list:
@ -17,7 +170,7 @@ model_list:
mode: image_generation
```
## Chat Completions / Embeddings
### Chat Completions / Embeddings
**Problem**: Azure returns `gpt-4` in the response when `azure/gpt-4-1106-preview` is used. This leads to inaccurate cost tracking

View file

@ -0,0 +1,83 @@
# Region-based Routing
Route specific customers to eu-only models.
By specifying 'allowed_model_region' for a customer, LiteLLM will filter-out any models in a model group which is not in the allowed region (i.e. 'eu').
[**See Code**](https://github.com/BerriAI/litellm/blob/5eb12e30cc5faa73799ebc7e48fc86ebf449c879/litellm/router.py#L2938)
### 1. Create customer with region-specification
Use the litellm 'end-user' object for this.
End-users can be tracked / id'ed by passing the 'user' param to litellm in an openai chat completion/embedding call.
```bash
curl -X POST --location 'http://0.0.0.0:4000/end_user/new' \
--header 'Authorization: Bearer sk-1234' \
--header 'Content-Type: application/json' \
--data '{
"user_id" : "ishaan-jaff-45",
"allowed_model_region": "eu", # 👈 SPECIFY ALLOWED REGION='eu'
}'
```
### 2. Add eu models to model-group
Add eu models to a model group. For azure models, litellm can automatically infer the region (no need to set it).
```yaml
model_list:
- model_name: gpt-3.5-turbo
litellm_params:
model: azure/gpt-35-turbo-eu # 👈 EU azure model
api_base: https://my-endpoint-europe-berri-992.openai.azure.com/
api_key: os.environ/AZURE_EUROPE_API_KEY
- model_name: gpt-3.5-turbo
litellm_params:
model: azure/chatgpt-v-2
api_base: https://openai-gpt-4-test-v-1.openai.azure.com/
api_version: "2023-05-15"
api_key: os.environ/AZURE_API_KEY
router_settings:
enable_pre_call_checks: true # 👈 IMPORTANT
```
Start the proxy
```yaml
litellm --config /path/to/config.yaml
```
### 3. Test it!
Make a simple chat completions call to the proxy. In the response headers, you should see the returned api base.
```bash
curl -X POST --location 'http://localhost:4000/chat/completions' \
--header 'Content-Type: application/json' \
--header 'Authorization: Bearer sk-1234' \
--data '{
"model": "gpt-3.5-turbo",
"messages": [
{
"role": "user",
"content": "what is the meaning of the universe? 1234"
}],
"user": "ishaan-jaff-45" # 👈 USER ID
}
'
```
Expected API Base in response headers
```
x-litellm-api-base: "https://my-endpoint-europe-berri-992.openai.azure.com/"
```
### FAQ
**What happens if there are no available models for that region?**
Since the router filters out models not in the specified region, it will return back as an error to the user, if no models in that region are available.

View file

@ -11,40 +11,37 @@ You can find the Dockerfile to build litellm proxy [here](https://github.com/Ber
<TabItem value="basic" label="Basic">
**Step 1. Create a file called `litellm_config.yaml`**
### Step 1. CREATE config.yaml
Example `litellm_config.yaml` (the `os.environ/` prefix means litellm will read `AZURE_API_BASE` from the env)
```yaml
model_list:
- model_name: azure-gpt-3.5
litellm_params:
model: azure/<your-azure-model-deployment>
api_base: os.environ/AZURE_API_BASE
api_key: os.environ/AZURE_API_KEY
api_version: "2023-07-01-preview"
```
Example `litellm_config.yaml`
**Step 2. Run litellm docker image**
```yaml
model_list:
- model_name: azure-gpt-3.5
litellm_params:
model: azure/<your-azure-model-deployment>
api_base: os.environ/AZURE_API_BASE # runs os.getenv("AZURE_API_BASE")
api_key: os.environ/AZURE_API_KEY # runs os.getenv("AZURE_API_KEY")
api_version: "2023-07-01-preview"
```
See the latest available ghcr docker image here:
https://github.com/berriai/litellm/pkgs/container/litellm
Your litellm config.yaml should be called `litellm_config.yaml` in the directory you run this command.
The `-v` command will mount that file
Pass `AZURE_API_KEY` and `AZURE_API_BASE` since we set them in step 1
### Step 2. RUN Docker Image
```shell
docker run \
-v $(pwd)/litellm_config.yaml:/app/config.yaml \
-e AZURE_API_KEY=d6*********** \
-e AZURE_API_BASE=https://openai-***********/ \
-p 4000:4000 \
ghcr.io/berriai/litellm:main-latest \
--config /app/config.yaml --detailed_debug
```
```shell
docker run \
-v $(pwd)/litellm_config.yaml:/app/config.yaml \
-e AZURE_API_KEY=d6*********** \
-e AZURE_API_BASE=https://openai-***********/ \
-p 4000:4000 \
ghcr.io/berriai/litellm:main-latest \
--config /app/config.yaml --detailed_debug
```
**Step 3. Send a Test Request**
Get Latest Image 👉 [here](https://github.com/berriai/litellm/pkgs/container/litellm)
### Step 3. TEST Request
Pass `model=azure-gpt-3.5` this was set on step 1
@ -231,13 +228,16 @@ Your OpenAI proxy server is now running on `http://127.0.0.1:4000`.
| Docs | When to Use |
| --- | --- |
| [Quick Start](#quick-start) | call 100+ LLMs + Load Balancing |
| [Deploy with Database](#deploy-with-database) | + use Virtual Keys + Track Spend |
| [Deploy with Database](#deploy-with-database) | + use Virtual Keys + Track Spend (Note: When deploying with a database providing a `DATABASE_URL` and `LITELLM_MASTER_KEY` are required in your env ) |
| [LiteLLM container + Redis](#litellm-container--redis) | + load balance across multiple litellm containers |
| [LiteLLM Database container + PostgresDB + Redis](#litellm-database-container--postgresdb--redis) | + use Virtual Keys + Track Spend + load balance across multiple litellm containers |
## Deploy with Database
### Docker, Kubernetes, Helm Chart
Requirements:
- Need a postgres database (e.g. [Supabase](https://supabase.com/), [Neon](https://neon.tech/), etc) Set `DATABASE_URL=postgresql://<user>:<password>@<host>:<port>/<dbname>` in your env
- Set a `LITELLM_MASTER_KEY`, this is your Proxy Admin key - you can use this to create other keys (🚨 must start with `sk-`)
<Tabs>
@ -252,6 +252,8 @@ docker pull ghcr.io/berriai/litellm-database:main-latest
```shell
docker run \
-v $(pwd)/litellm_config.yaml:/app/config.yaml \
-e LITELLM_MASTER_KEY=sk-1234 \
-e DATABASE_URL=postgresql://<user>:<password>@<host>:<port>/<dbname> \
-e AZURE_API_KEY=d6*********** \
-e AZURE_API_BASE=https://openai-***********/ \
-p 4000:4000 \
@ -267,26 +269,63 @@ Your OpenAI proxy server is now running on `http://0.0.0.0:4000`.
#### Step 1. Create deployment.yaml
```yaml
apiVersion: apps/v1
kind: Deployment
metadata:
name: litellm-deployment
spec:
replicas: 1
selector:
matchLabels:
app: litellm
template:
metadata:
labels:
app: litellm
spec:
containers:
- name: litellm-container
image: ghcr.io/berriai/litellm-database:main-latest
env:
- name: DATABASE_URL
value: postgresql://<user>:<password>@<host>:<port>/<dbname>
apiVersion: apps/v1
kind: Deployment
metadata:
name: litellm-deployment
spec:
replicas: 3
selector:
matchLabels:
app: litellm
template:
metadata:
labels:
app: litellm
spec:
containers:
- name: litellm-container
image: ghcr.io/berriai/litellm:main-latest
imagePullPolicy: Always
env:
- name: AZURE_API_KEY
value: "d6******"
- name: AZURE_API_BASE
value: "https://ope******"
- name: LITELLM_MASTER_KEY
value: "sk-1234"
- name: DATABASE_URL
value: "po**********"
args:
- "--config"
- "/app/proxy_config.yaml" # Update the path to mount the config file
volumeMounts: # Define volume mount for proxy_config.yaml
- name: config-volume
mountPath: /app
readOnly: true
livenessProbe:
httpGet:
path: /health/liveliness
port: 4000
initialDelaySeconds: 120
periodSeconds: 15
successThreshold: 1
failureThreshold: 3
timeoutSeconds: 10
readinessProbe:
httpGet:
path: /health/readiness
port: 4000
initialDelaySeconds: 120
periodSeconds: 15
successThreshold: 1
failureThreshold: 3
timeoutSeconds: 10
volumes: # Define volume to mount proxy_config.yaml
- name: config-volume
configMap:
name: litellm-config
```
```bash

View file

@ -3,19 +3,21 @@ import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
# 🔎 Logging - Custom Callbacks, DataDog, Langfuse, s3 Bucket, Sentry, OpenTelemetry, Athina
# 🔎 Logging - Custom Callbacks, DataDog, Langfuse, s3 Bucket, Sentry, OpenTelemetry, Athina, Azure Content-Safety
Log Proxy Input, Output, Exceptions using Custom Callbacks, Langfuse, OpenTelemetry, LangFuse, DynamoDB, s3 Bucket
- [Async Custom Callbacks](#custom-callback-class-async)
- [Async Custom Callback APIs](#custom-callback-apis-async)
- [Logging to DataDog](#logging-proxy-inputoutput---datadog)
- [Logging to Langfuse](#logging-proxy-inputoutput---langfuse)
- [Logging to OpenMeter](#logging-proxy-inputoutput---langfuse)
- [Logging to s3 Buckets](#logging-proxy-inputoutput---s3-buckets)
- [Logging to DataDog](#logging-proxy-inputoutput---datadog)
- [Logging to DynamoDB](#logging-proxy-inputoutput---dynamodb)
- [Logging to Sentry](#logging-proxy-inputoutput---sentry)
- [Logging to Traceloop (OpenTelemetry)](#logging-proxy-inputoutput-traceloop-opentelemetry)
- [Logging to Athina](#logging-proxy-inputoutput-athina)
- [(BETA) Moderation with Azure Content-Safety](#moderation-with-azure-content-safety)
## Custom Callback Class [Async]
Use this when you want to run custom callbacks in `python`
@ -401,7 +403,7 @@ litellm_settings:
Start the LiteLLM Proxy and make a test request to verify the logs reached your callback API
## Logging Proxy Input/Output - Langfuse
We will use the `--config` to set `litellm.success_callback = ["langfuse"]` this will log all successfull LLM calls to langfuse
We will use the `--config` to set `litellm.success_callback = ["langfuse"]` this will log all successfull LLM calls to langfuse. Make sure to set `LANGFUSE_PUBLIC_KEY` and `LANGFUSE_SECRET_KEY` in your environment
**Step 1** Install langfuse
@ -419,7 +421,13 @@ litellm_settings:
success_callback: ["langfuse"]
```
**Step 3**: Start the proxy, make a test request
**Step 3**: Set required env variables for logging to langfuse
```shell
export LANGFUSE_PUBLIC_KEY="pk_kk"
export LANGFUSE_SECRET_KEY="sk_ss
```
**Step 4**: Start the proxy, make a test request
Start proxy
```shell
@ -539,6 +547,105 @@ print(response)
</Tabs>
### Team based Logging to Langfuse
**Example:**
This config would send langfuse logs to 2 different langfuse projects, based on the team id
```yaml
litellm_settings:
default_team_settings:
- team_id: my-secret-project
success_callback: ["langfuse"]
langfuse_public_key: os.environ/LANGFUSE_PUB_KEY_1 # Project 1
langfuse_secret: os.environ/LANGFUSE_PRIVATE_KEY_1 # Project 1
- team_id: ishaans-secret-project
success_callback: ["langfuse"]
langfuse_public_key: os.environ/LANGFUSE_PUB_KEY_2 # Project 2
langfuse_secret: os.environ/LANGFUSE_SECRET_2 # Project 2
```
Now, when you [generate keys](./virtual_keys.md) for this team-id
```bash
curl -X POST 'http://0.0.0.0:4000/key/generate' \
-H 'Authorization: Bearer sk-1234' \
-H 'Content-Type: application/json' \
-d '{"team_id": "ishaans-secret-project"}'
```
All requests made with these keys will log data to their team-specific logging.
### Redacting Messages, Response Content from Langfuse Logging
Set `litellm.turn_off_message_logging=True` This will prevent the messages and responses from being logged to langfuse, but request metadata will still be logged.
```yaml
model_list:
- model_name: gpt-3.5-turbo
litellm_params:
model: gpt-3.5-turbo
litellm_settings:
success_callback: ["langfuse"]
turn_off_message_logging: True
```
## Logging Proxy Cost + Usage - OpenMeter
Bill customers according to their LLM API usage with [OpenMeter](../observability/openmeter.md)
**Required Env Variables**
```bash
# from https://openmeter.cloud
export OPENMETER_API_ENDPOINT="" # defaults to https://openmeter.cloud
export OPENMETER_API_KEY=""
```
### Quick Start
1. Add to Config.yaml
```yaml
model_list:
- litellm_params:
api_base: https://openai-function-calling-workers.tasslexyz.workers.dev/
api_key: my-fake-key
model: openai/my-fake-model
model_name: fake-openai-endpoint
litellm_settings:
success_callback: ["openmeter"] # 👈 KEY CHANGE
```
2. Start Proxy
```
litellm --config /path/to/config.yaml
```
3. Test it!
```bash
curl --location 'http://0.0.0.0:4000/chat/completions' \
--header 'Content-Type: application/json' \
--data ' {
"model": "fake-openai-endpoint",
"messages": [
{
"role": "user",
"content": "what llm are you"
}
],
}
'
```
<Image img={require('../../img/openmeter_img_2.png')} />
## Logging Proxy Input/Output - DataDog
We will use the `--config` to set `litellm.success_callback = ["datadog"]` this will log all successfull LLM calls to DataDog
@ -808,39 +915,72 @@ Test Request
litellm --test
```
## Logging Proxy Input/Output Traceloop (OpenTelemetry)
## Logging Proxy Input/Output in OpenTelemetry format using Traceloop's OpenLLMetry
Traceloop allows you to log LLM Input/Output in the OpenTelemetry format
[OpenLLMetry](https://github.com/traceloop/openllmetry) _(built and maintained by Traceloop)_ is a set of extensions
built on top of [OpenTelemetry](https://opentelemetry.io/) that gives you complete observability over your LLM
application. Because it uses OpenTelemetry under the
hood, [it can be connected to various observability solutions](https://www.traceloop.com/docs/openllmetry/integrations/introduction)
like:
We will use the `--config` to set `litellm.success_callback = ["traceloop"]` this will log all successfull LLM calls to traceloop
* [Traceloop](https://www.traceloop.com/docs/openllmetry/integrations/traceloop)
* [Axiom](https://www.traceloop.com/docs/openllmetry/integrations/axiom)
* [Azure Application Insights](https://www.traceloop.com/docs/openllmetry/integrations/azure)
* [Datadog](https://www.traceloop.com/docs/openllmetry/integrations/datadog)
* [Dynatrace](https://www.traceloop.com/docs/openllmetry/integrations/dynatrace)
* [Grafana Tempo](https://www.traceloop.com/docs/openllmetry/integrations/grafana)
* [Honeycomb](https://www.traceloop.com/docs/openllmetry/integrations/honeycomb)
* [HyperDX](https://www.traceloop.com/docs/openllmetry/integrations/hyperdx)
* [Instana](https://www.traceloop.com/docs/openllmetry/integrations/instana)
* [New Relic](https://www.traceloop.com/docs/openllmetry/integrations/newrelic)
* [OpenTelemetry Collector](https://www.traceloop.com/docs/openllmetry/integrations/otel-collector)
* [Service Now Cloud Observability](https://www.traceloop.com/docs/openllmetry/integrations/service-now)
* [Sentry](https://www.traceloop.com/docs/openllmetry/integrations/sentry)
* [SigNoz](https://www.traceloop.com/docs/openllmetry/integrations/signoz)
* [Splunk](https://www.traceloop.com/docs/openllmetry/integrations/splunk)
**Step 1** Install traceloop-sdk and set Traceloop API key
We will use the `--config` to set `litellm.success_callback = ["traceloop"]` to achieve this, steps are listed below.
**Step 1:** Install the SDK
```shell
pip install traceloop-sdk -U
pip install traceloop-sdk
```
Traceloop outputs standard OpenTelemetry data that can be connected to your observability stack. Send standard OpenTelemetry from LiteLLM Proxy to [Traceloop](https://www.traceloop.com/docs/openllmetry/integrations/traceloop), [Dynatrace](https://www.traceloop.com/docs/openllmetry/integrations/dynatrace), [Datadog](https://www.traceloop.com/docs/openllmetry/integrations/datadog)
, [New Relic](https://www.traceloop.com/docs/openllmetry/integrations/newrelic), [Honeycomb](https://www.traceloop.com/docs/openllmetry/integrations/honeycomb), [Grafana Tempo](https://www.traceloop.com/docs/openllmetry/integrations/grafana), [Splunk](https://www.traceloop.com/docs/openllmetry/integrations/splunk), [OpenTelemetry Collector](https://www.traceloop.com/docs/openllmetry/integrations/otel-collector)
**Step 2:** Configure Environment Variable for trace exporting
You will need to configure where to export your traces. Environment variables will control this, example: For Traceloop
you should use `TRACELOOP_API_KEY`, whereas for Datadog you use `TRACELOOP_BASE_URL`. For more
visit [the Integrations Catalog](https://www.traceloop.com/docs/openllmetry/integrations/introduction).
If you are using Datadog as the observability solutions then you can set `TRACELOOP_BASE_URL` as:
```shell
TRACELOOP_BASE_URL=http://<datadog-agent-hostname>:4318
```
**Step 3**: Create a `config.yaml` file and set `litellm_settings`: `success_callback`
**Step 2**: Create a `config.yaml` file and set `litellm_settings`: `success_callback`
```yaml
model_list:
- model_name: gpt-3.5-turbo
- model_name: gpt-3.5-turbo
litellm_params:
model: gpt-3.5-turbo
api_key: my-fake-key # replace api_key with actual key
litellm_settings:
success_callback: ["traceloop"]
success_callback: [ "traceloop" ]
```
**Step 3**: Start the proxy, make a test request
**Step 4**: Start the proxy, make a test request
Start proxy
```shell
litellm --config config.yaml --debug
```
Test Request
```
curl --location 'http://0.0.0.0:4000/chat/completions' \
--header 'Content-Type: application/json' \
@ -898,3 +1038,86 @@ curl --location 'http://0.0.0.0:4000/chat/completions' \
]
}'
```
## (BETA) Moderation with Azure Content Safety
[Azure Content-Safety](https://azure.microsoft.com/en-us/products/ai-services/ai-content-safety) is a Microsoft Azure service that provides content moderation APIs to detect potential offensive, harmful, or risky content in text.
We will use the `--config` to set `litellm.success_callback = ["azure_content_safety"]` this will moderate all LLM calls using Azure Content Safety.
**Step 0** Deploy Azure Content Safety
Deploy an Azure Content-Safety instance from the Azure Portal and get the `endpoint` and `key`.
**Step 1** Set Athina API key
```shell
AZURE_CONTENT_SAFETY_KEY = "<your-azure-content-safety-key>"
```
**Step 2**: Create a `config.yaml` file and set `litellm_settings`: `success_callback`
```yaml
model_list:
- model_name: gpt-3.5-turbo
litellm_params:
model: gpt-3.5-turbo
litellm_settings:
callbacks: ["azure_content_safety"]
azure_content_safety_params:
endpoint: "<your-azure-content-safety-endpoint>"
key: "os.environ/AZURE_CONTENT_SAFETY_KEY"
```
**Step 3**: Start the proxy, make a test request
Start proxy
```shell
litellm --config config.yaml --debug
```
Test Request
```
curl --location 'http://0.0.0.0:4000/chat/completions' \
--header 'Content-Type: application/json' \
--data ' {
"model": "gpt-3.5-turbo",
"messages": [
{
"role": "user",
"content": "Hi, how are you?"
}
]
}'
```
An HTTP 400 error will be returned if the content is detected with a value greater than the threshold set in the `config.yaml`.
The details of the response will describe :
- The `source` : input text or llm generated text
- The `category` : the category of the content that triggered the moderation
- The `severity` : the severity from 0 to 10
**Step 4**: Customizing Azure Content Safety Thresholds
You can customize the thresholds for each category by setting the `thresholds` in the `config.yaml`
```yaml
model_list:
- model_name: gpt-3.5-turbo
litellm_params:
model: gpt-3.5-turbo
litellm_settings:
callbacks: ["azure_content_safety"]
azure_content_safety_params:
endpoint: "<your-azure-content-safety-endpoint>"
key: "os.environ/AZURE_CONTENT_SAFETY_KEY"
thresholds:
Hate: 6
SelfHarm: 8
Sexual: 6
Violence: 4
```
:::info
`thresholds` are not required by default, but you can tune the values to your needs.
Default values is `4` for all categories
:::

View file

@ -3,7 +3,75 @@ import TabItem from '@theme/TabItem';
# ⚡ Best Practices for Production
Expected Performance in Production
## 1. Use this config.yaml
Use this config.yaml in production (with your own LLMs)
```yaml
model_list:
- model_name: fake-openai-endpoint
litellm_params:
model: openai/fake
api_key: fake-key
api_base: https://exampleopenaiendpoint-production.up.railway.app/
general_settings:
master_key: sk-1234 # enter your own master key, ensure it starts with 'sk-'
alerting: ["slack"] # Setup slack alerting - get alerts on LLM exceptions, Budget Alerts, Slow LLM Responses
proxy_batch_write_at: 60 # Batch write spend updates every 60s
litellm_settings:
set_verbose: False # Switch off Debug Logging, ensure your logs do not have any debugging on
```
Set slack webhook url in your env
```shell
export SLACK_WEBHOOK_URL="https://hooks.slack.com/services/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"
```
:::info
Need Help or want dedicated support ? Talk to a founder [here]: (https://calendly.com/d/4mp-gd3-k5k/litellm-1-1-onboarding-chat)
:::
## 2. On Kubernetes - Use 1 Uvicorn worker [Suggested CMD]
Use this Docker `CMD`. This will start the proxy with 1 Uvicorn Async Worker
(Ensure that you're not setting `run_gunicorn` or `num_workers` in the CMD).
```shell
CMD ["--port", "4000", "--config", "./proxy_server_config.yaml"]
```
## 3. Use Redis 'port','host', 'password'. NOT 'redis_url'
If you decide to use Redis, DO NOT use 'redis_url'. We recommend usig redis port, host, and password params.
`redis_url`is 80 RPS slower
This is still something we're investigating. Keep track of it [here](https://github.com/BerriAI/litellm/issues/3188)
Recommended to do this for prod:
```yaml
router_settings:
routing_strategy: usage-based-routing-v2
# redis_url: "os.environ/REDIS_URL"
redis_host: os.environ/REDIS_HOST
redis_port: os.environ/REDIS_PORT
redis_password: os.environ/REDIS_PASSWORD
```
## 4. Disable 'load_dotenv'
Set `export LITELLM_MODE="PRODUCTION"`
This disables the load_dotenv() functionality, which will automatically load your environment credentials from the local `.env`.
## Extras
### Expected Performance in Production
1 LiteLLM Uvicorn Worker on Kubernetes
@ -16,13 +84,7 @@ Expected Performance in Production
| `/chat/completions` Requests/hour | `126K` |
## 1. Switch of Debug Logging
Remove `set_verbose: True` from your config.yaml
```yaml
litellm_settings:
set_verbose: True
```
### Verifying Debugging logs are off
You should only see the following level of details in logs on the proxy server
```shell
@ -31,135 +93,8 @@ You should only see the following level of details in logs on the proxy server
# INFO: 192.168.2.205:29734 - "POST /chat/completions HTTP/1.1" 200 OK
```
## 2. On Kubernetes - Use 1 Uvicorn worker [Suggested CMD]
Use this Docker `CMD`. This will start the proxy with 1 Uvicorn Async Worker
(Ensure that you're not setting `run_gunicorn` or `num_workers` in the CMD).
```shell
CMD ["--port", "4000", "--config", "./proxy_server_config.yaml"]
```
## 2. Batch write spend updates every 60s
The default proxy batch write is 10s. This is to make it easy to see spend when debugging locally.
In production, we recommend using a longer interval period of 60s. This reduces the number of connections used to make DB writes.
```yaml
general_settings:
master_key: sk-1234
proxy_batch_write_at: 5 # 👈 Frequency of batch writing logs to server (in seconds)
```
## 3. Move spend logs to separate server
Writing each spend log to the db can slow down your proxy. In testing we saw a 70% improvement in median response time, by moving writing spend logs to a separate server.
👉 [LiteLLM Spend Logs Server](https://github.com/BerriAI/litellm/tree/main/litellm-js/spend-logs)
**Spend Logs**
This is a log of the key, tokens, model, and latency for each call on the proxy.
[**Full Payload**](https://github.com/BerriAI/litellm/blob/8c9623a6bc4ad9da0a2dac64249a60ed8da719e8/litellm/proxy/utils.py#L1769)
**1. Start the spend logs server**
```bash
docker run -p 3000:3000 \
-e DATABASE_URL="postgres://.." \
ghcr.io/berriai/litellm-spend_logs:main-latest
# RUNNING on http://0.0.0.0:3000
```
**2. Connect to proxy**
Example litellm_config.yaml
```yaml
model_list:
- model_name: fake-openai-endpoint
litellm_params:
model: openai/my-fake-model
api_key: my-fake-key
api_base: https://exampleopenaiendpoint-production.up.railway.app/
general_settings:
master_key: sk-1234
proxy_batch_write_at: 5 # 👈 Frequency of batch writing logs to server (in seconds)
```
Add `SPEND_LOGS_URL` as an environment variable when starting the proxy
```bash
docker run \
-v $(pwd)/litellm_config.yaml:/app/config.yaml \
-e DATABASE_URL="postgresql://.." \
-e SPEND_LOGS_URL="http://host.docker.internal:3000" \ # 👈 KEY CHANGE
-p 4000:4000 \
ghcr.io/berriai/litellm:main-latest \
--config /app/config.yaml --detailed_debug
# Running on http://0.0.0.0:4000
```
**3. Test Proxy!**
```bash
curl --location 'http://0.0.0.0:4000/v1/chat/completions' \
--header 'Content-Type: application/json' \
--header 'Authorization: Bearer sk-1234' \
--data '{
"model": "fake-openai-endpoint",
"messages": [
{"role": "system", "content": "Be helpful"},
{"role": "user", "content": "What do you know?"}
]
}'
```
In your LiteLLM Spend Logs Server, you should see
**Expected Response**
```
Received and stored 1 logs. Total logs in memory: 1
...
Flushed 1 log to the DB.
```
### Machine Specification
A t2.micro should be sufficient to handle 1k logs / minute on this server.
This consumes at max 120MB, and <0.1 vCPU.
## 4. Switch off resetting budgets
Add this to your config.yaml. (Only spend per Key, User and Team will be tracked - spend per API Call will not be written to the LiteLLM Database)
```yaml
general_settings:
disable_spend_logs: true
disable_reset_budget: true
```
## 5. Switch of `litellm.telemetry`
Switch of all telemetry tracking done by litellm
```yaml
litellm_settings:
telemetry: False
```
## Machine Specifications to Deploy LiteLLM
### Machine Specifications to Deploy LiteLLM
| Service | Spec | CPUs | Memory | Architecture | Version|
| --- | --- | --- | --- | --- | --- |
@ -167,7 +102,7 @@ litellm_settings:
| Redis Cache | - | - | - | - | 7.0+ Redis Engine|
## Reference Kubernetes Deployment YAML
### Reference Kubernetes Deployment YAML
Reference Kubernetes `deployment.yaml` that was load tested by us

View file

@ -14,6 +14,7 @@ model_list:
model: gpt-3.5-turbo
litellm_settings:
success_callback: ["prometheus"]
failure_callback: ["prometheus"]
```
Start the proxy
@ -48,6 +49,26 @@ http://localhost:4000/metrics
| Metric Name | Description |
|----------------------|--------------------------------------|
| `litellm_requests_metric` | Number of requests made, per `"user", "key", "model"` |
| `litellm_spend_metric` | Total Spend, per `"user", "key", "model"` |
| `litellm_total_tokens` | input + output tokens per `"user", "key", "model"` |
| `litellm_requests_metric` | Number of requests made, per `"user", "key", "model", "team", "end-user"` |
| `litellm_spend_metric` | Total Spend, per `"user", "key", "model", "team", "end-user"` |
| `litellm_total_tokens` | input + output tokens per `"user", "key", "model", "team", "end-user"` |
| `litellm_llm_api_failed_requests_metric` | Number of failed LLM API requests per `"user", "key", "model", "team", "end-user"` |
## Monitor System Health
To monitor the health of litellm adjacent services (redis / postgres), do:
```yaml
model_list:
- model_name: gpt-3.5-turbo
litellm_params:
model: gpt-3.5-turbo
litellm_settings:
service_callback: ["prometheus_system"]
```
| Metric Name | Description |
|----------------------|--------------------------------------|
| `litellm_redis_latency` | histogram latency for redis calls |
| `litellm_redis_fails` | Number of failed redis calls |
| `litellm_self_latency` | Histogram latency for successful litellm api call |

View file

@ -348,6 +348,29 @@ query_result = embeddings.embed_query(text)
print(f"TITAN EMBEDDINGS")
print(query_result[:5])
```
</TabItem>
<TabItem value="litellm" label="LiteLLM SDK">
This is **not recommended**. There is duplicate logic as the proxy also uses the sdk, which might lead to unexpected errors.
```python
from litellm import completion
response = completion(
model="openai/gpt-3.5-turbo",
messages = [
{
"role": "user",
"content": "this is a test request, write a short poem"
}
],
api_key="anything",
base_url="http://0.0.0.0:4000"
)
print(response)
```
</TabItem>
</Tabs>

View file

@ -136,7 +136,22 @@ curl --location 'http://0.0.0.0:4000/chat/completions' \
'
```
## Advanced - Context Window Fallbacks
### Test it!
```bash
curl --location 'http://0.0.0.0:4000/chat/completions' \
--header 'Content-Type: application/json' \
--data-raw '{
"model": "zephyr-beta", # 👈 MODEL NAME to fallback from
"messages": [
{"role": "user", "content": "what color is red"}
],
"mock_testing_fallbacks": true
}'
```
## Advanced - Context Window Fallbacks (Pre-Call Checks + Fallbacks)
**Before call is made** check if a call is within model context window with **`enable_pre_call_checks: true`**.
@ -217,16 +232,16 @@ model_list:
- model_name: gpt-3.5-turbo-small
litellm_params:
model: azure/chatgpt-v-2
api_base: os.environ/AZURE_API_BASE
api_key: os.environ/AZURE_API_KEY
api_version: "2023-07-01-preview"
model_info:
base_model: azure/gpt-4-1106-preview # 2. 👈 (azure-only) SET BASE MODEL
api_base: os.environ/AZURE_API_BASE
api_key: os.environ/AZURE_API_KEY
api_version: "2023-07-01-preview"
model_info:
base_model: azure/gpt-4-1106-preview # 2. 👈 (azure-only) SET BASE MODEL
- model_name: gpt-3.5-turbo-large
litellm_params:
model: gpt-3.5-turbo-1106
api_key: os.environ/OPENAI_API_KEY
model: gpt-3.5-turbo-1106
api_key: os.environ/OPENAI_API_KEY
- model_name: claude-opus
litellm_params:
@ -272,6 +287,69 @@ print(response)
</Tabs>
## Advanced - EU-Region Filtering (Pre-Call Checks)
**Before call is made** check if a call is within model context window with **`enable_pre_call_checks: true`**.
Set 'region_name' of deployment.
**Note:** LiteLLM can automatically infer region_name for Vertex AI, Bedrock, and IBM WatsonxAI based on your litellm params. For Azure, set `litellm.enable_preview = True`.
**1. Set Config**
```yaml
router_settings:
enable_pre_call_checks: true # 1. Enable pre-call checks
model_list:
- model_name: gpt-3.5-turbo
litellm_params:
model: azure/chatgpt-v-2
api_base: os.environ/AZURE_API_BASE
api_key: os.environ/AZURE_API_KEY
api_version: "2023-07-01-preview"
region_name: "eu" # 👈 SET EU-REGION
- model_name: gpt-3.5-turbo
litellm_params:
model: gpt-3.5-turbo-1106
api_key: os.environ/OPENAI_API_KEY
- model_name: gemini-pro
litellm_params:
model: vertex_ai/gemini-pro-1.5
vertex_project: adroit-crow-1234
vertex_location: us-east1 # 👈 AUTOMATICALLY INFERS 'region_name'
```
**2. Start proxy**
```bash
litellm --config /path/to/config.yaml
# RUNNING on http://0.0.0.0:4000
```
**3. Test it!**
```python
import openai
client = openai.OpenAI(
api_key="anything",
base_url="http://0.0.0.0:4000"
)
# request sent to model set on litellm proxy, `litellm --model`
response = client.chat.completions.with_raw_response.create(
model="gpt-3.5-turbo",
messages = [{"role": "user", "content": "Who was Alexander?"}]
)
print(response)
print(f"response.headers.get('x-litellm-model-api-base')")
```
## Advanced - Custom Timeouts, Stream Timeouts - Per Model
For each model you can set `timeout` & `stream_timeout` under `litellm_params`
```yaml

View file

@ -17,6 +17,7 @@ This is a new feature, and subject to changes based on feedback.
### Step 1. Setup Proxy
- `JWT_PUBLIC_KEY_URL`: This is the public keys endpoint of your OpenID provider. Typically it's `{openid-provider-base-url}/.well-known/openid-configuration/jwks`. For Keycloak it's `{keycloak_base_url}/realms/{your-realm}/protocol/openid-connect/certs`.
- `JWT_AUDIENCE`: This is the audience used for decoding the JWT. If not set, the decode step will not verify the audience.
```bash
export JWT_PUBLIC_KEY_URL="" # "https://demo.duendesoftware.com/.well-known/openid-configuration/jwks"
@ -109,7 +110,7 @@ general_settings:
admin_jwt_scope: "litellm-proxy-admin"
```
## Advanced - Spend Tracking (User / Team / Org)
## Advanced - Spend Tracking (End-Users / Internal Users / Team / Org)
Set the field in the jwt token, which corresponds to a litellm user / team / org.
@ -122,6 +123,7 @@ general_settings:
team_id_jwt_field: "client_id" # 👈 CAN BE ANY FIELD
user_id_jwt_field: "sub" # 👈 CAN BE ANY FIELD
org_id_jwt_field: "org_id" # 👈 CAN BE ANY FIELD
end_user_id_jwt_field: "customer_id" # 👈 CAN BE ANY FIELD
```
Expected JWT:
@ -130,7 +132,7 @@ Expected JWT:
{
"client_id": "my-unique-team",
"sub": "my-unique-user",
"org_id": "my-unique-org"
"org_id": "my-unique-org",
}
```

View file

@ -121,6 +121,9 @@ from langchain.prompts.chat import (
SystemMessagePromptTemplate,
)
from langchain.schema import HumanMessage, SystemMessage
import os
os.environ["OPENAI_API_KEY"] = "anything"
chat = ChatOpenAI(
openai_api_base="http://0.0.0.0:4000",
@ -362,6 +365,188 @@ curl --location 'http://0.0.0.0:4000/moderations' \
## Advanced
### (BETA) Batch Completions - pass multiple models
Use this when you want to send 1 request to N Models
#### Expected Request Format
Pass model as a string of comma separated value of models. Example `"model"="llama3,gpt-3.5-turbo"`
This same request will be sent to the following model groups on the [litellm proxy config.yaml](https://docs.litellm.ai/docs/proxy/configs)
- `model_name="llama3"`
- `model_name="gpt-3.5-turbo"`
<Tabs>
<TabItem value="openai-py" label="OpenAI Python SDK">
```python
import openai
client = openai.OpenAI(api_key="sk-1234", base_url="http://0.0.0.0:4000")
response = client.chat.completions.create(
model="gpt-3.5-turbo,llama3",
messages=[
{"role": "user", "content": "this is a test request, write a short poem"}
],
)
print(response)
```
#### Expected Response Format
Get a list of responses when `model` is passed as a list
```python
[
ChatCompletion(
id='chatcmpl-9NoYhS2G0fswot0b6QpoQgmRQMaIf',
choices=[
Choice(
finish_reason='stop',
index=0,
logprobs=None,
message=ChatCompletionMessage(
content='In the depths of my soul, a spark ignites\nA light that shines so pure and bright\nIt dances and leaps, refusing to die\nA flame of hope that reaches the sky\n\nIt warms my heart and fills me with bliss\nA reminder that in darkness, there is light to kiss\nSo I hold onto this fire, this guiding light\nAnd let it lead me through the darkest night.',
role='assistant',
function_call=None,
tool_calls=None
)
)
],
created=1715462919,
model='gpt-3.5-turbo-0125',
object='chat.completion',
system_fingerprint=None,
usage=CompletionUsage(
completion_tokens=83,
prompt_tokens=17,
total_tokens=100
)
),
ChatCompletion(
id='chatcmpl-4ac3e982-da4e-486d-bddb-ed1d5cb9c03c',
choices=[
Choice(
finish_reason='stop',
index=0,
logprobs=None,
message=ChatCompletionMessage(
content="A test request, and I'm delighted!\nHere's a short poem, just for you:\n\nMoonbeams dance upon the sea,\nA path of light, for you to see.\nThe stars up high, a twinkling show,\nA night of wonder, for all to know.\n\nThe world is quiet, save the night,\nA peaceful hush, a gentle light.\nThe world is full, of beauty rare,\nA treasure trove, beyond compare.\n\nI hope you enjoyed this little test,\nA poem born, of whimsy and jest.\nLet me know, if there's anything else!",
role='assistant',
function_call=None,
tool_calls=None
)
)
],
created=1715462919,
model='groq/llama3-8b-8192',
object='chat.completion',
system_fingerprint='fp_a2c8d063cb',
usage=CompletionUsage(
completion_tokens=120,
prompt_tokens=20,
total_tokens=140
)
)
]
```
</TabItem>
<TabItem value="curl" label="Curl">
```shell
curl --location 'http://localhost:4000/chat/completions' \
--header 'Authorization: Bearer sk-1234' \
--header 'Content-Type: application/json' \
--data '{
"model": "llama3,gpt-3.5-turbo",
"max_tokens": 10,
"user": "litellm2",
"messages": [
{
"role": "user",
"content": "is litellm getting better"
}
]
}'
```
#### Expected Response Format
Get a list of responses when `model` is passed as a list
```json
[
{
"id": "chatcmpl-3dbd5dd8-7c82-4ca3-bf1f-7c26f497cf2b",
"choices": [
{
"finish_reason": "length",
"index": 0,
"message": {
"content": "The Elder Scrolls IV: Oblivion!\n\nReleased",
"role": "assistant"
}
}
],
"created": 1715459876,
"model": "groq/llama3-8b-8192",
"object": "chat.completion",
"system_fingerprint": "fp_179b0f92c9",
"usage": {
"completion_tokens": 10,
"prompt_tokens": 12,
"total_tokens": 22
}
},
{
"id": "chatcmpl-9NnldUfFLmVquFHSX4yAtjCw8PGei",
"choices": [
{
"finish_reason": "length",
"index": 0,
"message": {
"content": "TES4 could refer to The Elder Scrolls IV:",
"role": "assistant"
}
}
],
"created": 1715459877,
"model": "gpt-3.5-turbo-0125",
"object": "chat.completion",
"system_fingerprint": null,
"usage": {
"completion_tokens": 10,
"prompt_tokens": 9,
"total_tokens": 19
}
}
]
```
</TabItem>
</Tabs>
### Pass User LLM API Keys, Fallbacks
Allow your end-users to pass their model list, api base, OpenAI API key (any LiteLLM supported provider) to make requests

View file

@ -12,8 +12,8 @@ Requirements:
You can set budgets at 3 levels:
- For the proxy
- For a user
- For a 'user' passed to `/chat/completions`, `/embeddings` etc
- For an internal user
- For an end-user
- For a key
- For a key (model specific budgets)
@ -58,7 +58,7 @@ curl --location 'http://0.0.0.0:4000/chat/completions' \
}'
```
</TabItem>
<TabItem value="per-user" label="For User">
<TabItem value="per-user" label="For Internal User">
Apply a budget across multiple keys.
@ -165,12 +165,12 @@ curl --location 'http://localhost:4000/team/new' \
}
```
</TabItem>
<TabItem value="per-user-chat" label="For 'user' passed to /chat/completions">
<TabItem value="per-user-chat" label="For End User">
Use this to budget `user` passed to `/chat/completions`, **without needing to create a key for every user**
**Step 1. Modify config.yaml**
Define `litellm.max_user_budget`
Define `litellm.max_end_user_budget`
```yaml
general_settings:
master_key: sk-1234
@ -328,7 +328,7 @@ You can set:
- max parallel requests
<Tabs>
<TabItem value="per-user" label="Per User">
<TabItem value="per-user" label="Per Internal User">
Use `/user/new`, to persist rate limits across multiple keys.
@ -408,7 +408,7 @@ curl --location 'http://localhost:4000/user/new' \
```
## Create new keys for existing user
## Create new keys for existing internal user
Just include user_id in the `/key/generate` request.

View file

@ -95,8 +95,8 @@ print(response)
- `router.image_generation()` - completion calls in OpenAI `/v1/images/generations` endpoint format
- `router.aimage_generation()` - async image generation calls
### Advanced - Routing Strategies
#### Routing Strategies - Weighted Pick, Rate Limit Aware, Least Busy, Latency Based
## Advanced - Routing Strategies
#### Routing Strategies - Weighted Pick, Rate Limit Aware, Least Busy, Latency Based, Cost Based
Router provides 4 strategies for routing your calls across multiple deployments:
@ -278,8 +278,38 @@ router_settings:
routing_strategy_args: {"ttl": 10}
```
### Set Lowest Latency Buffer
Set a buffer within which deployments are candidates for making calls to.
E.g.
if you have 5 deployments
```
https://litellm-prod-1.openai.azure.com/: 0.07s
https://litellm-prod-2.openai.azure.com/: 0.1s
https://litellm-prod-3.openai.azure.com/: 0.1s
https://litellm-prod-4.openai.azure.com/: 0.1s
https://litellm-prod-5.openai.azure.com/: 4.66s
```
to prevent initially overloading `prod-1`, with all requests - we can set a buffer of 50%, to consider deployments `prod-2, prod-3, prod-4`.
**In Router**
```python
router = Router(..., routing_strategy_args={"lowest_latency_buffer": 0.5})
```
**In Proxy**
```yaml
router_settings:
routing_strategy_args: {"lowest_latency_buffer": 0.5}
```
</TabItem>
<TabItem value="simple-shuffle" label="(Default) Weighted Pick">
<TabItem value="simple-shuffle" label="(Default) Weighted Pick (Async)">
**Default** Picks a deployment based on the provided **Requests per minute (rpm) or Tokens per minute (tpm)**
@ -437,12 +467,136 @@ async def router_acompletion():
asyncio.run(router_acompletion())
```
</TabItem>
<TabItem value="lowest-cost" label="Lowest Cost Routing (Async)">
Picks a deployment based on the lowest cost
How this works:
- Get all healthy deployments
- Select all deployments that are under their provided `rpm/tpm` limits
- For each deployment check if `litellm_param["model"]` exists in [`litellm_model_cost_map`](https://github.com/BerriAI/litellm/blob/main/model_prices_and_context_window.json)
- if deployment does not exist in `litellm_model_cost_map` -> use deployment_cost= `$1`
- Select deployment with lowest cost
```python
from litellm import Router
import asyncio
model_list = [
{
"model_name": "gpt-3.5-turbo",
"litellm_params": {"model": "gpt-4"},
"model_info": {"id": "openai-gpt-4"},
},
{
"model_name": "gpt-3.5-turbo",
"litellm_params": {"model": "groq/llama3-8b-8192"},
"model_info": {"id": "groq-llama"},
},
]
# init router
router = Router(model_list=model_list, routing_strategy="cost-based-routing")
async def router_acompletion():
response = await router.acompletion(
model="gpt-3.5-turbo",
messages=[{"role": "user", "content": "Hey, how's it going?"}]
)
print(response)
print(response._hidden_params["model_id"]) # expect groq-llama, since groq/llama has lowest cost
return response
asyncio.run(router_acompletion())
```
#### Using Custom Input/Output pricing
Set `litellm_params["input_cost_per_token"]` and `litellm_params["output_cost_per_token"]` for using custom pricing when routing
```python
model_list = [
{
"model_name": "gpt-3.5-turbo",
"litellm_params": {
"model": "azure/chatgpt-v-2",
"input_cost_per_token": 0.00003,
"output_cost_per_token": 0.00003,
},
"model_info": {"id": "chatgpt-v-experimental"},
},
{
"model_name": "gpt-3.5-turbo",
"litellm_params": {
"model": "azure/chatgpt-v-1",
"input_cost_per_token": 0.000000001,
"output_cost_per_token": 0.00000001,
},
"model_info": {"id": "chatgpt-v-1"},
},
{
"model_name": "gpt-3.5-turbo",
"litellm_params": {
"model": "azure/chatgpt-v-5",
"input_cost_per_token": 10,
"output_cost_per_token": 12,
},
"model_info": {"id": "chatgpt-v-5"},
},
]
# init router
router = Router(model_list=model_list, routing_strategy="cost-based-routing")
async def router_acompletion():
response = await router.acompletion(
model="gpt-3.5-turbo",
messages=[{"role": "user", "content": "Hey, how's it going?"}]
)
print(response)
print(response._hidden_params["model_id"]) # expect chatgpt-v-1, since chatgpt-v-1 has lowest cost
return response
asyncio.run(router_acompletion())
```
</TabItem>
</Tabs>
## Basic Reliability
### Max Parallel Requests (ASYNC)
Used in semaphore for async requests on router. Limit the max concurrent calls made to a deployment. Useful in high-traffic scenarios.
If tpm/rpm is set, and no max parallel request limit given, we use the RPM or calculated RPM (tpm/1000/6) as the max parallel request limit.
```python
from litellm import Router
model_list = [{
"model_name": "gpt-4",
"litellm_params": {
"model": "azure/gpt-4",
...
"max_parallel_requests": 10 # 👈 SET PER DEPLOYMENT
}
}]
### OR ###
router = Router(model_list=model_list, default_max_parallel_requests=20) # 👈 SET DEFAULT MAX PARALLEL REQUESTS
# deployment max parallel requests > default max parallel requests
```
[**See Code**](https://github.com/BerriAI/litellm/blob/a978f2d8813c04dad34802cb95e0a0e35a3324bc/litellm/utils.py#L5605)
### Timeouts
The timeout set in router is for the entire length of the call, and is passed down to the completion() call level as well.
@ -499,7 +653,9 @@ from litellm import Router
model_list = [{...}]
router = Router(model_list=model_list,
allowed_fails=1) # cooldown model if it fails > 1 call in a minute.
allowed_fails=1, # cooldown model if it fails > 1 call in a minute.
cooldown_time=100 # cooldown the deployment for 100 seconds if it num_fails > allowed_fails
)
user_message = "Hello, whats the weather in San Francisco??"
messages = [{"content": user_message, "role": "user"}]
@ -557,6 +713,57 @@ response = router.completion(model="gpt-3.5-turbo", messages=messages)
print(f"response: {response}")
```
#### Retries based on Error Type
Use `RetryPolicy` if you want to set a `num_retries` based on the Exception receieved
Example:
- 4 retries for `ContentPolicyViolationError`
- 0 retries for `RateLimitErrors`
Example Usage
```python
from litellm.router import RetryPolicy
retry_policy = RetryPolicy(
ContentPolicyViolationErrorRetries=3, # run 3 retries for ContentPolicyViolationErrors
AuthenticationErrorRetries=0, # run 0 retries for AuthenticationErrorRetries
BadRequestErrorRetries=1,
TimeoutErrorRetries=2,
RateLimitErrorRetries=3,
)
router = litellm.Router(
model_list=[
{
"model_name": "gpt-3.5-turbo", # openai model name
"litellm_params": { # params for litellm completion/embedding call
"model": "azure/chatgpt-v-2",
"api_key": os.getenv("AZURE_API_KEY"),
"api_version": os.getenv("AZURE_API_VERSION"),
"api_base": os.getenv("AZURE_API_BASE"),
},
},
{
"model_name": "bad-model", # openai model name
"litellm_params": { # params for litellm completion/embedding call
"model": "azure/chatgpt-v-2",
"api_key": "bad-key",
"api_version": os.getenv("AZURE_API_VERSION"),
"api_base": os.getenv("AZURE_API_BASE"),
},
},
],
retry_policy=retry_policy,
)
response = await router.acompletion(
model=model,
messages=messages,
)
```
### Fallbacks
If a call fails after num_retries, fall back to another model group.
@ -565,6 +772,8 @@ If the error is a context window exceeded error, fall back to a larger model gro
Fallbacks are done in-order - ["gpt-3.5-turbo, "gpt-4", "gpt-4-32k"], will do 'gpt-3.5-turbo' first, then 'gpt-4', etc.
You can also set 'default_fallbacks', in case a specific model group is misconfigured / bad.
```python
from litellm import Router
@ -625,6 +834,7 @@ model_list = [
router = Router(model_list=model_list,
fallbacks=[{"azure/gpt-3.5-turbo": ["gpt-3.5-turbo"]}],
default_fallbacks=["gpt-3.5-turbo-16k"],
context_window_fallbacks=[{"azure/gpt-3.5-turbo-context-fallback": ["gpt-3.5-turbo-16k"]}, {"gpt-3.5-turbo": ["gpt-3.5-turbo-16k"]}],
set_verbose=True)
@ -674,13 +884,11 @@ router = Router(model_list: Optional[list] = None,
cache_responses=True)
```
## Pre-Call Checks (Context Window)
## Pre-Call Checks (Context Window, EU-Regions)
Enable pre-call checks to filter out:
1. deployments with context window limit < messages for a call.
2. deployments that have exceeded rate limits when making concurrent calls. (eg. `asyncio.gather(*[
router.acompletion(model="gpt-3.5-turbo", messages=m) for m in list_of_messages
])`)
2. deployments outside of eu-region
<Tabs>
<TabItem value="sdk" label="SDK">
@ -695,10 +903,14 @@ router = Router(model_list=model_list, enable_pre_call_checks=True) # 👈 Set t
**2. Set Model List**
For azure deployments, set the base model. Pick the base model from [this list](https://github.com/BerriAI/litellm/blob/main/model_prices_and_context_window.json), all the azure models start with `azure/`.
For context window checks on azure deployments, set the base model. Pick the base model from [this list](https://github.com/BerriAI/litellm/blob/main/model_prices_and_context_window.json), all the azure models start with `azure/`.
<Tabs>
<TabItem value="same-group" label="Same Group">
For 'eu-region' filtering, Set 'region_name' of deployment.
**Note:** We automatically infer region_name for Vertex AI, Bedrock, and IBM WatsonxAI based on your litellm params. For Azure, set `litellm.enable_preview = True`.
[**See Code**](https://github.com/BerriAI/litellm/blob/d33e49411d6503cb634f9652873160cd534dec96/litellm/router.py#L2958)
```python
model_list = [
@ -709,10 +921,9 @@ model_list = [
"api_key": os.getenv("AZURE_API_KEY"),
"api_version": os.getenv("AZURE_API_VERSION"),
"api_base": os.getenv("AZURE_API_BASE"),
},
"model_info": {
"region_name": "eu" # 👈 SET 'EU' REGION NAME
"base_model": "azure/gpt-35-turbo", # 👈 (Azure-only) SET BASE MODEL
}
},
},
{
"model_name": "gpt-3.5-turbo", # model group name
@ -721,54 +932,26 @@ model_list = [
"api_key": os.getenv("OPENAI_API_KEY"),
},
},
{
"model_name": "gemini-pro",
"litellm_params: {
"model": "vertex_ai/gemini-pro-1.5",
"vertex_project": "adroit-crow-1234",
"vertex_location": "us-east1" # 👈 AUTOMATICALLY INFERS 'region_name'
}
}
]
router = Router(model_list=model_list, enable_pre_call_checks=True)
```
</TabItem>
<TabItem value="different-group" label="Context Window Fallbacks (Different Groups)">
```python
model_list = [
{
"model_name": "gpt-3.5-turbo-small", # model group name
"litellm_params": { # params for litellm completion/embedding call
"model": "azure/chatgpt-v-2",
"api_key": os.getenv("AZURE_API_KEY"),
"api_version": os.getenv("AZURE_API_VERSION"),
"api_base": os.getenv("AZURE_API_BASE"),
},
"model_info": {
"base_model": "azure/gpt-35-turbo", # 👈 (Azure-only) SET BASE MODEL
}
},
{
"model_name": "gpt-3.5-turbo-large", # model group name
"litellm_params": { # params for litellm completion/embedding call
"model": "gpt-3.5-turbo-1106",
"api_key": os.getenv("OPENAI_API_KEY"),
},
},
{
"model_name": "claude-opus",
"litellm_params": { call
"model": "claude-3-opus-20240229",
"api_key": os.getenv("ANTHROPIC_API_KEY"),
},
},
]
router = Router(model_list=model_list, enable_pre_call_checks=True, context_window_fallbacks=[{"gpt-3.5-turbo-small": ["gpt-3.5-turbo-large", "claude-opus"]}])
```
</TabItem>
</Tabs>
**3. Test it!**
<Tabs>
<TabItem value="context-window-check" label="Context Window Check">
```python
"""
- Give a gpt-3.5-turbo model group with different context windows (4k vs. 16k)
@ -778,7 +961,6 @@ router = Router(model_list=model_list, enable_pre_call_checks=True, context_wind
from litellm import Router
import os
try:
model_list = [
{
"model_name": "gpt-3.5-turbo", # model group name
@ -787,6 +969,7 @@ model_list = [
"api_key": os.getenv("AZURE_API_KEY"),
"api_version": os.getenv("AZURE_API_VERSION"),
"api_base": os.getenv("AZURE_API_BASE"),
"base_model": "azure/gpt-35-turbo",
},
"model_info": {
"base_model": "azure/gpt-35-turbo",
@ -816,6 +999,59 @@ response = router.completion(
print(f"response: {response}")
```
</TabItem>
<TabItem value="eu-region-check" label="EU Region Check">
```python
"""
- Give 2 gpt-3.5-turbo deployments, in eu + non-eu regions
- Make a call
- Assert it picks the eu-region model
"""
from litellm import Router
import os
model_list = [
{
"model_name": "gpt-3.5-turbo", # model group name
"litellm_params": { # params for litellm completion/embedding call
"model": "azure/chatgpt-v-2",
"api_key": os.getenv("AZURE_API_KEY"),
"api_version": os.getenv("AZURE_API_VERSION"),
"api_base": os.getenv("AZURE_API_BASE"),
"region_name": "eu"
},
"model_info": {
"id": "1"
}
},
{
"model_name": "gpt-3.5-turbo", # model group name
"litellm_params": { # params for litellm completion/embedding call
"model": "gpt-3.5-turbo-1106",
"api_key": os.getenv("OPENAI_API_KEY"),
},
"model_info": {
"id": "2"
}
},
]
router = Router(model_list=model_list, enable_pre_call_checks=True)
response = router.completion(
model="gpt-3.5-turbo",
messages=[{"role": "user", "content": "Who was Alexander?"}],
)
print(f"response: {response}")
print(f"response id: {response._hidden_params['model_id']}")
```
</TabItem>
</Tabs>
</TabItem>
<TabItem value="proxy" label="Proxy">
:::info
@ -881,6 +1117,46 @@ async def test_acompletion_caching_on_router_caching_groups():
asyncio.run(test_acompletion_caching_on_router_caching_groups())
```
## Alerting 🚨
Send alerts to slack / your webhook url for the following events
- LLM API Exceptions
- Slow LLM Responses
Get a slack webhook url from https://api.slack.com/messaging/webhooks
#### Usage
Initialize an `AlertingConfig` and pass it to `litellm.Router`. The following code will trigger an alert because `api_key=bad-key` which is invalid
```python
from litellm.router import AlertingConfig
import litellm
import os
router = litellm.Router(
model_list=[
{
"model_name": "gpt-3.5-turbo",
"litellm_params": {
"model": "gpt-3.5-turbo",
"api_key": "bad_key",
},
}
],
alerting_config= AlertingConfig(
alerting_threshold=10, # threshold for slow / hanging llm responses (in seconds). Defaults to 300 seconds
webhook_url= os.getenv("SLACK_WEBHOOK_URL") # webhook you want to send alerts to
),
)
try:
await router.acompletion(
model="gpt-3.5-turbo",
messages=[{"role": "user", "content": "Hey, how's it going?"}],
)
except:
pass
```
## Track cost for Azure Deployments
**Problem**: Azure returns `gpt-4` in the response when `azure/gpt-4-1106-preview` is used. This leads to inaccurate cost tracking
@ -1038,10 +1314,11 @@ def __init__(
num_retries: int = 0,
timeout: Optional[float] = None,
default_litellm_params={}, # default params for Router.chat.completion.create
fallbacks: List = [],
fallbacks: Optional[List] = None,
default_fallbacks: Optional[List] = None
allowed_fails: Optional[int] = None, # Number of times a deployment can failbefore being added to cooldown
cooldown_time: float = 1, # (seconds) time to cooldown a deployment after failure
context_window_fallbacks: List = [],
context_window_fallbacks: Optional[List] = None,
model_group_alias: Optional[dict] = {},
retry_after: int = 0, # (min) time to wait before retrying a failed request
routing_strategy: Literal[
@ -1049,6 +1326,7 @@ def __init__(
"least-busy",
"usage-based-routing",
"latency-based-routing",
"cost-based-routing",
] = "simple-shuffle",
## DEBUGGING ##

View file

@ -5,6 +5,9 @@ LiteLLM allows you to specify the following:
* API Base
* API Version
* API Type
* Project
* Location
* Token
Useful Helper functions:
* [`check_valid_key()`](#check_valid_key)
@ -43,6 +46,24 @@ os.environ['AZURE_API_TYPE'] = "azure" # [OPTIONAL]
os.environ['OPENAI_API_BASE'] = "https://openai-gpt-4-test2-v-12.openai.azure.com/"
```
### Setting Project, Location, Token
For cloud providers:
- Azure
- Bedrock
- GCP
- Watson AI
you might need to set additional parameters. LiteLLM provides a common set of params, that we map across all providers.
| | LiteLLM param | Watson | Vertex AI | Azure | Bedrock |
|------|--------------|--------------|--------------|--------------|--------------|
| Project | project | watsonx_project | vertex_project | n/a | n/a |
| Region | region_name | watsonx_region_name | vertex_location | n/a | aws_region_name |
| Token | token | watsonx_token or token | n/a | azure_ad_token | n/a |
If you want, you can call them by their provider-specific params as well.
## litellm variables
### litellm.api_key

View file

@ -105,6 +105,12 @@ const config = {
label: 'Enterprise',
to: "docs/enterprise"
},
{
sidebarId: 'tutorialSidebar',
position: 'left',
label: '🚀 Hosted',
to: "docs/hosted"
},
{
href: 'https://github.com/BerriAI/litellm',
label: 'GitHub',

Binary file not shown.

After

Width:  |  Height:  |  Size: 344 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 219 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 398 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 496 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 348 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 460 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 1.5 MiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 533 KiB

View file

@ -39,13 +39,21 @@ const sidebars = {
"proxy/demo",
"proxy/configs",
"proxy/reliability",
"proxy/cost_tracking",
"proxy/users",
"proxy/billing",
"proxy/user_keys",
"proxy/enterprise",
"proxy/virtual_keys",
"proxy/alerting",
{
type: "category",
label: "Logging",
items: ["proxy/logging", "proxy/streaming_logging"],
},
"proxy/team_based_routing",
"proxy/customer_routing",
"proxy/ui",
"proxy/cost_tracking",
"proxy/token_auth",
{
type: "category",
@ -58,12 +66,7 @@ const sidebars = {
"proxy/pii_masking",
"proxy/prompt_injection",
"proxy/caching",
{
type: "category",
label: "Logging, Alerting",
items: ["proxy/logging", "proxy/alerting", "proxy/streaming_logging"],
},
"proxy/grafana_metrics",
"proxy/prometheus",
"proxy/call_hooks",
"proxy/rules",
"proxy/cli",
@ -86,6 +89,7 @@ const sidebars = {
"completion/stream",
"completion/message_trimming",
"completion/function_call",
"completion/vision",
"completion/model_alias",
"completion/batching",
"completion/mock_requests",
@ -115,6 +119,7 @@ const sidebars = {
},
items: [
"providers/openai",
"providers/text_completion_openai",
"providers/openai_compatible",
"providers/azure",
"providers/azure_ai",
@ -128,9 +133,13 @@ const sidebars = {
"providers/cohere",
"providers/anyscale",
"providers/huggingface",
"providers/watsonx",
"providers/predibase",
"providers/triton-inference-server",
"providers/ollama",
"providers/perplexity",
"providers/groq",
"providers/deepseek",
"providers/fireworks_ai",
"providers/vllm",
"providers/xinference",
@ -146,6 +155,7 @@ const sidebars = {
"providers/openrouter",
"providers/custom_openai_proxy",
"providers/petals",
],
},
"proxy/custom_pricing",
@ -166,19 +176,22 @@ const sidebars = {
"observability/custom_callback",
"observability/langfuse_integration",
"observability/sentry",
"observability/lago",
"observability/openmeter",
"observability/promptlayer_integration",
"observability/wandb_integration",
"observability/langsmith_integration",
"observability/slack_integration",
"observability/traceloop_integration",
"observability/lunary_integration",
"observability/athina_integration",
"observability/lunary_integration",
"observability/greenscale_integration",
"observability/helicone_integration",
"observability/supabase_integration",
`observability/telemetry`,
],
},
"caching/redis_cache",
"caching/all_caches",
{
type: "category",
label: "Tutorials",

View file

@ -16,7 +16,7 @@ However, we also expose 3 public helper functions to calculate token usage acros
```python
from litellm import token_counter
messages = [{"user": "role", "content": "Hey, how's it going"}]
messages = [{"role": "user", "content": "Hey, how's it going"}]
print(token_counter(model="gpt-3.5-turbo", messages=messages))
```

View file

@ -10,7 +10,6 @@ from litellm.caching import DualCache
from typing import Literal, Union
dotenv.load_dotenv() # Loading env variables using dotenv
import traceback
@ -19,8 +18,6 @@ import traceback
import dotenv, os
import requests
dotenv.load_dotenv() # Loading env variables using dotenv
import traceback
import datetime, subprocess, sys
import litellm, uuid

View file

@ -1,6 +1,7 @@
# Enterprise Proxy Util Endpoints
from litellm._logging import verbose_logger
import collections
from datetime import datetime
async def get_spend_by_tags(start_date=None, end_date=None, prisma_client=None):
@ -18,26 +19,33 @@ async def get_spend_by_tags(start_date=None, end_date=None, prisma_client=None):
return response
async def ui_get_spend_by_tags(start_date=None, end_date=None, prisma_client=None):
response = await prisma_client.db.query_raw(
"""
async def ui_get_spend_by_tags(start_date: str, end_date: str, prisma_client):
sql_query = """
SELECT
jsonb_array_elements_text(request_tags) AS individual_request_tag,
DATE(s."startTime") AS spend_date,
COUNT(*) AS log_count,
SUM(spend) AS total_spend
FROM "LiteLLM_SpendLogs" s
WHERE s."startTime" >= current_date - interval '30 days'
WHERE
DATE(s."startTime") >= $1::date
AND DATE(s."startTime") <= $2::date
GROUP BY individual_request_tag, spend_date
ORDER BY spend_date;
"""
ORDER BY spend_date
LIMIT 100;
"""
response = await prisma_client.db.query_raw(
sql_query,
start_date,
end_date,
)
# print("tags - spend")
# print(response)
# Bar Chart 1 - Spend per tag - Top 10 tags by spend
total_spend_per_tag = collections.defaultdict(float)
total_requests_per_tag = collections.defaultdict(int)
total_spend_per_tag: collections.defaultdict = collections.defaultdict(float)
total_requests_per_tag: collections.defaultdict = collections.defaultdict(int)
for row in response:
tag_name = row["individual_request_tag"]
tag_spend = row["total_spend"]
@ -49,15 +57,18 @@ async def ui_get_spend_by_tags(start_date=None, end_date=None, prisma_client=Non
# convert to ui format
ui_tags = []
for tag in sorted_tags:
current_spend = tag[1]
if current_spend is not None and isinstance(current_spend, float):
current_spend = round(current_spend, 4)
ui_tags.append(
{
"name": tag[0],
"value": tag[1],
"spend": current_spend,
"log_count": total_requests_per_tag[tag[0]],
}
)
return {"top_10_tags": ui_tags}
return {"spend_per_tag": ui_tags}
async def view_spend_logs_from_clickhouse(
@ -291,7 +302,7 @@ def _create_clickhouse_aggregate_tables(client=None, table_names=[]):
def _forecast_daily_cost(data: list):
import requests
import requests # type: ignore
from datetime import datetime, timedelta
if len(data) == 0:

108
index.yaml Normal file
View file

@ -0,0 +1,108 @@
apiVersion: v1
entries:
litellm-helm:
- apiVersion: v2
appVersion: v1.35.38
created: "2024-05-06T10:22:24.384392-07:00"
dependencies:
- condition: db.deployStandalone
name: postgresql
repository: oci://registry-1.docker.io/bitnamicharts
version: '>=13.3.0'
- condition: redis.enabled
name: redis
repository: oci://registry-1.docker.io/bitnamicharts
version: '>=18.0.0'
description: Call all LLM APIs using the OpenAI format
digest: 60f0cfe9e7c1087437cb35f6fb7c43c3ab2be557b6d3aec8295381eb0dfa760f
name: litellm-helm
type: application
urls:
- litellm-helm-0.2.0.tgz
version: 0.2.0
postgresql:
- annotations:
category: Database
images: |
- name: os-shell
image: docker.io/bitnami/os-shell:12-debian-12-r16
- name: postgres-exporter
image: docker.io/bitnami/postgres-exporter:0.15.0-debian-12-r14
- name: postgresql
image: docker.io/bitnami/postgresql:16.2.0-debian-12-r6
licenses: Apache-2.0
apiVersion: v2
appVersion: 16.2.0
created: "2024-05-06T10:22:24.387717-07:00"
dependencies:
- name: common
repository: oci://registry-1.docker.io/bitnamicharts
tags:
- bitnami-common
version: 2.x.x
description: PostgreSQL (Postgres) is an open source object-relational database
known for reliability and data integrity. ACID-compliant, it supports foreign
keys, joins, views, triggers and stored procedures.
digest: 3c8125526b06833df32e2f626db34aeaedb29d38f03d15349db6604027d4a167
home: https://bitnami.com
icon: https://bitnami.com/assets/stacks/postgresql/img/postgresql-stack-220x234.png
keywords:
- postgresql
- postgres
- database
- sql
- replication
- cluster
maintainers:
- name: VMware, Inc.
url: https://github.com/bitnami/charts
name: postgresql
sources:
- https://github.com/bitnami/charts/tree/main/bitnami/postgresql
urls:
- charts/postgresql-14.3.1.tgz
version: 14.3.1
redis:
- annotations:
category: Database
images: |
- name: kubectl
image: docker.io/bitnami/kubectl:1.29.2-debian-12-r3
- name: os-shell
image: docker.io/bitnami/os-shell:12-debian-12-r16
- name: redis
image: docker.io/bitnami/redis:7.2.4-debian-12-r9
- name: redis-exporter
image: docker.io/bitnami/redis-exporter:1.58.0-debian-12-r4
- name: redis-sentinel
image: docker.io/bitnami/redis-sentinel:7.2.4-debian-12-r7
licenses: Apache-2.0
apiVersion: v2
appVersion: 7.2.4
created: "2024-05-06T10:22:24.391903-07:00"
dependencies:
- name: common
repository: oci://registry-1.docker.io/bitnamicharts
tags:
- bitnami-common
version: 2.x.x
description: Redis(R) is an open source, advanced key-value store. It is often
referred to as a data structure server since keys can contain strings, hashes,
lists, sets and sorted sets.
digest: b2fa1835f673a18002ca864c54fadac3c33789b26f6c5e58e2851b0b14a8f984
home: https://bitnami.com
icon: https://bitnami.com/assets/stacks/redis/img/redis-stack-220x234.png
keywords:
- redis
- keyvalue
- database
maintainers:
- name: VMware, Inc.
url: https://github.com/bitnami/charts
name: redis
sources:
- https://github.com/bitnami/charts/tree/main/bitnami/redis
urls:
- charts/redis-18.19.1.tgz
version: 18.19.1
generated: "2024-05-06T10:22:24.375026-07:00"

BIN
litellm-helm-0.2.0.tgz Normal file

Binary file not shown.

View file

@ -5,8 +5,8 @@
"packages": {
"": {
"dependencies": {
"@hono/node-server": "^1.9.0",
"hono": "^4.1.5"
"@hono/node-server": "^1.10.1",
"hono": "^4.2.7"
},
"devDependencies": {
"@types/node": "^20.11.17",
@ -382,9 +382,9 @@
}
},
"node_modules/@hono/node-server": {
"version": "1.9.0",
"resolved": "https://registry.npmjs.org/@hono/node-server/-/node-server-1.9.0.tgz",
"integrity": "sha512-oJjk7WXBlENeHhWiMqSyxPIZ3Kmf5ZYxqdlcSIXyN8Rn50bNJsPl99G4POBS03Jxh56FdfRJ0SEnC8mAVIiavQ==",
"version": "1.10.1",
"resolved": "https://registry.npmjs.org/@hono/node-server/-/node-server-1.10.1.tgz",
"integrity": "sha512-5BKW25JH5PQKPDkTcIgv3yNUPtOAbnnjFFgWvIxxAY/B/ZNeYjjWoAeDmqhIiCgOAJ3Tauuw+0G+VainhuZRYQ==",
"engines": {
"node": ">=18.14.1"
}
@ -463,9 +463,9 @@
}
},
"node_modules/hono": {
"version": "4.1.5",
"resolved": "https://registry.npmjs.org/hono/-/hono-4.1.5.tgz",
"integrity": "sha512-3ChJiIoeCxvkt6vnkxJagplrt1YZg3NyNob7ssVeK2PUqEINp4q1F94HzFnvY9QE8asVmbW5kkTDlyWylfg2vg==",
"version": "4.2.7",
"resolved": "https://registry.npmjs.org/hono/-/hono-4.2.7.tgz",
"integrity": "sha512-k1xHi86tJnRIVvqhFMBDGFKJ8r5O+bEsT4P59ZK59r0F300Xd910/r237inVfuT/VmE86RQQffX4OYNda6dLXw==",
"engines": {
"node": ">=16.0.0"
}

View file

@ -3,8 +3,8 @@
"dev": "tsx watch src/index.ts"
},
"dependencies": {
"@hono/node-server": "^1.9.0",
"hono": "^4.1.5"
"@hono/node-server": "^1.10.1",
"hono": "^4.2.7"
},
"devDependencies": {
"@types/node": "^20.11.17",

View file

@ -1,8 +1,12 @@
### Hide pydantic namespace conflict warnings globally ###
import warnings
warnings.filterwarnings("ignore", message=".*conflict with protected namespace.*")
### INIT VARIABLES ###
import threading, requests, os
from typing import Callable, List, Optional, Dict, Union, Any, Literal
from litellm.caching import Cache
from litellm._logging import set_verbose, _turn_on_debug, verbose_logger
from litellm._logging import set_verbose, _turn_on_debug, verbose_logger, json_logs
from litellm.proxy._types import (
KeyManagementSystem,
KeyManagementSettings,
@ -11,15 +15,32 @@ from litellm.proxy._types import (
import httpx
import dotenv
dotenv.load_dotenv()
litellm_mode = os.getenv("LITELLM_MODE", "DEV") # "PRODUCTION", "DEV"
if litellm_mode == "DEV":
dotenv.load_dotenv()
#############################################
if set_verbose == True:
_turn_on_debug()
#############################################
### Callbacks /Logging / Success / Failure Handlers ###
input_callback: List[Union[str, Callable]] = []
success_callback: List[Union[str, Callable]] = []
failure_callback: List[Union[str, Callable]] = []
callbacks: List[Callable] = []
service_callback: List[Union[str, Callable]] = []
_custom_logger_compatible_callbacks_literal = Literal["lago", "openmeter"]
callbacks: List[Union[Callable, _custom_logger_compatible_callbacks_literal]] = []
_langfuse_default_tags: Optional[
List[
Literal[
"user_api_key_alias",
"user_api_key_user_id",
"user_api_key_user_email",
"user_api_key_team_alias",
"semantic-similarity",
"proxy_base_url",
]
]
] = None
_async_input_callback: List[Callable] = (
[]
) # internal variable - async custom callbacks are routed here.
@ -31,6 +52,9 @@ _async_failure_callback: List[Callable] = (
) # internal variable - async custom callbacks are routed here.
pre_call_rules: List[Callable] = []
post_call_rules: List[Callable] = []
turn_off_message_logging: Optional[bool] = False
## end of callbacks #############
email: Optional[str] = (
None # Not used anymore, will be removed in next MAJOR release - https://github.com/BerriAI/litellm/discussions/648
)
@ -42,24 +66,34 @@ max_tokens = 256 # OpenAI Defaults
drop_params = False
modify_params = False
retry = True
### AUTH ###
api_key: Optional[str] = None
openai_key: Optional[str] = None
azure_key: Optional[str] = None
anthropic_key: Optional[str] = None
replicate_key: Optional[str] = None
cohere_key: Optional[str] = None
clarifai_key: Optional[str] = None
maritalk_key: Optional[str] = None
ai21_key: Optional[str] = None
ollama_key: Optional[str] = None
openrouter_key: Optional[str] = None
predibase_key: Optional[str] = None
huggingface_key: Optional[str] = None
vertex_project: Optional[str] = None
vertex_location: Optional[str] = None
predibase_tenant_id: Optional[str] = None
togetherai_api_key: Optional[str] = None
cloudflare_api_key: Optional[str] = None
baseten_key: Optional[str] = None
aleph_alpha_key: Optional[str] = None
nlp_cloud_key: Optional[str] = None
common_cloud_provider_auth_params: dict = {
"params": ["project", "region_name", "token"],
"providers": ["vertex_ai", "bedrock", "watsonx", "azure"],
}
use_client: bool = False
ssl_verify: bool = True
disable_streaming_logging: bool = False
### GUARDRAILS ###
llamaguard_model_name: Optional[str] = None
@ -70,6 +104,9 @@ blocked_user_list: Optional[Union[str, List]] = None
banned_keywords_list: Optional[Union[str, List]] = None
llm_guard_mode: Literal["all", "key-specific", "request-specific"] = "all"
##################
### PREVIEW FEATURES ###
enable_preview_features: bool = False
##################
logging: bool = True
caching: bool = (
False # Not used anymore, will be removed in next MAJOR release - https://github.com/BerriAI/litellm/discussions/648
@ -184,6 +221,7 @@ max_end_user_budget: Optional[float] = None
#### RELIABILITY ####
request_timeout: Optional[float] = 6000
num_retries: Optional[int] = None # per model endpoint
default_fallbacks: Optional[List] = None
fallbacks: Optional[List] = None
context_window_fallbacks: Optional[List] = None
allowed_fails: int = 0
@ -281,6 +319,7 @@ aleph_alpha_models: List = []
bedrock_models: List = []
deepinfra_models: List = []
perplexity_models: List = []
watsonx_models: List = []
for key, value in model_cost.items():
if value.get("litellm_provider") == "openai":
open_ai_chat_completion_models.append(key)
@ -325,6 +364,8 @@ for key, value in model_cost.items():
deepinfra_models.append(key)
elif value.get("litellm_provider") == "perplexity":
perplexity_models.append(key)
elif value.get("litellm_provider") == "watsonx":
watsonx_models.append(key)
# known openai compatible endpoints - we'll eventually move this list to the model_prices_and_context_window.json dictionary
openai_compatible_endpoints: List = [
@ -333,6 +374,7 @@ openai_compatible_endpoints: List = [
"api.deepinfra.com/v1/openai",
"api.mistral.ai/v1",
"api.groq.com/openai/v1",
"api.deepseek.com/v1",
"api.together.xyz/v1",
]
@ -341,6 +383,7 @@ openai_compatible_providers: List = [
"anyscale",
"mistral",
"groq",
"deepseek",
"deepinfra",
"perplexity",
"xinference",
@ -365,6 +408,73 @@ replicate_models: List = [
"replit/replit-code-v1-3b:b84f4c074b807211cd75e3e8b1589b6399052125b4c27106e43d47189e8415ad",
]
clarifai_models: List = [
"clarifai/meta.Llama-3.Llama-3-8B-Instruct",
"clarifai/gcp.generate.gemma-1_1-7b-it",
"clarifai/mistralai.completion.mixtral-8x22B",
"clarifai/cohere.generate.command-r-plus",
"clarifai/databricks.drbx.dbrx-instruct",
"clarifai/mistralai.completion.mistral-large",
"clarifai/mistralai.completion.mistral-medium",
"clarifai/mistralai.completion.mistral-small",
"clarifai/mistralai.completion.mixtral-8x7B-Instruct-v0_1",
"clarifai/gcp.generate.gemma-2b-it",
"clarifai/gcp.generate.gemma-7b-it",
"clarifai/deci.decilm.deciLM-7B-instruct",
"clarifai/mistralai.completion.mistral-7B-Instruct",
"clarifai/gcp.generate.gemini-pro",
"clarifai/anthropic.completion.claude-v1",
"clarifai/anthropic.completion.claude-instant-1_2",
"clarifai/anthropic.completion.claude-instant",
"clarifai/anthropic.completion.claude-v2",
"clarifai/anthropic.completion.claude-2_1",
"clarifai/meta.Llama-2.codeLlama-70b-Python",
"clarifai/meta.Llama-2.codeLlama-70b-Instruct",
"clarifai/openai.completion.gpt-3_5-turbo-instruct",
"clarifai/meta.Llama-2.llama2-7b-chat",
"clarifai/meta.Llama-2.llama2-13b-chat",
"clarifai/meta.Llama-2.llama2-70b-chat",
"clarifai/openai.chat-completion.gpt-4-turbo",
"clarifai/microsoft.text-generation.phi-2",
"clarifai/meta.Llama-2.llama2-7b-chat-vllm",
"clarifai/upstage.solar.solar-10_7b-instruct",
"clarifai/openchat.openchat.openchat-3_5-1210",
"clarifai/togethercomputer.stripedHyena.stripedHyena-Nous-7B",
"clarifai/gcp.generate.text-bison",
"clarifai/meta.Llama-2.llamaGuard-7b",
"clarifai/fblgit.una-cybertron.una-cybertron-7b-v2",
"clarifai/openai.chat-completion.GPT-4",
"clarifai/openai.chat-completion.GPT-3_5-turbo",
"clarifai/ai21.complete.Jurassic2-Grande",
"clarifai/ai21.complete.Jurassic2-Grande-Instruct",
"clarifai/ai21.complete.Jurassic2-Jumbo-Instruct",
"clarifai/ai21.complete.Jurassic2-Jumbo",
"clarifai/ai21.complete.Jurassic2-Large",
"clarifai/cohere.generate.cohere-generate-command",
"clarifai/wizardlm.generate.wizardCoder-Python-34B",
"clarifai/wizardlm.generate.wizardLM-70B",
"clarifai/tiiuae.falcon.falcon-40b-instruct",
"clarifai/togethercomputer.RedPajama.RedPajama-INCITE-7B-Chat",
"clarifai/gcp.generate.code-gecko",
"clarifai/gcp.generate.code-bison",
"clarifai/mistralai.completion.mistral-7B-OpenOrca",
"clarifai/mistralai.completion.openHermes-2-mistral-7B",
"clarifai/wizardlm.generate.wizardLM-13B",
"clarifai/huggingface-research.zephyr.zephyr-7B-alpha",
"clarifai/wizardlm.generate.wizardCoder-15B",
"clarifai/microsoft.text-generation.phi-1_5",
"clarifai/databricks.Dolly-v2.dolly-v2-12b",
"clarifai/bigcode.code.StarCoder",
"clarifai/salesforce.xgen.xgen-7b-8k-instruct",
"clarifai/mosaicml.mpt.mpt-7b-instruct",
"clarifai/anthropic.completion.claude-3-opus",
"clarifai/anthropic.completion.claude-3-sonnet",
"clarifai/gcp.generate.gemini-1_5-pro",
"clarifai/gcp.generate.imagen-2",
"clarifai/salesforce.blip.general-english-image-caption-blip-2",
]
huggingface_models: List = [
"meta-llama/Llama-2-7b-hf",
"meta-llama/Llama-2-7b-chat-hf",
@ -461,6 +571,7 @@ model_list = (
+ perplexity_models
+ maritalk_models
+ vertex_language_models
+ watsonx_models
)
provider_list: List = [
@ -469,6 +580,7 @@ provider_list: List = [
"text-completion-openai",
"cohere",
"cohere_chat",
"clarifai",
"anthropic",
"replicate",
"huggingface",
@ -494,11 +606,15 @@ provider_list: List = [
"anyscale",
"mistral",
"groq",
"deepseek",
"maritalk",
"voyage",
"cloudflare",
"xinference",
"fireworks_ai",
"watsonx",
"triton",
"predibase",
"custom", # custom apis
]
@ -512,7 +628,11 @@ models_by_provider: dict = {
"together_ai": together_ai_models,
"baseten": baseten_models,
"openrouter": openrouter_models,
"vertex_ai": vertex_chat_models + vertex_text_models,
"vertex_ai": vertex_chat_models
+ vertex_text_models
+ vertex_anthropic_models
+ vertex_vision_models
+ vertex_language_models,
"ai21": ai21_models,
"bedrock": bedrock_models,
"petals": petals_models,
@ -520,6 +640,7 @@ models_by_provider: dict = {
"deepinfra": deepinfra_models,
"perplexity": perplexity_models,
"maritalk": maritalk_models,
"watsonx": watsonx_models,
}
# mapping for those models which have larger equivalents
@ -570,7 +691,6 @@ all_embedding_models = (
####### IMAGE GENERATION MODELS ###################
openai_image_generation_models = ["dall-e-2", "dall-e-3"]
from .timeout import timeout
from .utils import (
client,
@ -578,10 +698,13 @@ from .utils import (
get_optional_params,
modify_integration,
token_counter,
create_pretrained_tokenizer,
create_tokenizer,
cost_per_token,
completion_cost,
supports_function_calling,
supports_parallel_function_calling,
supports_vision,
get_litellm_params,
Logging,
acreate,
@ -600,12 +723,15 @@ from .utils import (
get_secret,
get_supported_openai_params,
get_api_base,
get_first_chars_messages,
)
from .llms.huggingface_restapi import HuggingfaceConfig
from .llms.anthropic import AnthropicConfig
from .llms.predibase import PredibaseConfig
from .llms.anthropic_text import AnthropicTextConfig
from .llms.replicate import ReplicateConfig
from .llms.cohere import CohereConfig
from .llms.clarifai import ClarifaiConfig
from .llms.ai21 import AI21Config
from .llms.together_ai import TogetherAIConfig
from .llms.cloudflare import CloudflareConfig
@ -620,6 +746,7 @@ from .llms.sagemaker import SagemakerConfig
from .llms.ollama import OllamaConfig
from .llms.ollama_chat import OllamaChatConfig
from .llms.maritalk import MaritTalkConfig
from .llms.bedrock_httpx import AmazonCohereChatConfig
from .llms.bedrock import (
AmazonTitanConfig,
AmazonAI21Config,
@ -629,9 +756,11 @@ from .llms.bedrock import (
AmazonLlamaConfig,
AmazonStabilityConfig,
AmazonMistralConfig,
AmazonBedrockGlobalConfig,
)
from .llms.openai import OpenAIConfig, OpenAITextCompletionConfig
from .llms.openai import OpenAIConfig, OpenAITextCompletionConfig, MistralConfig
from .llms.azure import AzureOpenAIConfig, AzureOpenAIError
from .llms.watsonx import IBMWatsonXAIConfig
from .main import * # type: ignore
from .integrations import *
from .exceptions import (
@ -654,3 +783,4 @@ from .exceptions import (
from .budget_manager import BudgetManager
from .proxy.proxy_cli import run_server
from .router import Router
from .assistants.main import *

View file

@ -1,7 +1,7 @@
import logging
set_verbose = False
json_logs = False
# Create a handler for the logger (you may need to adapt this based on your needs)
handler = logging.StreamHandler()
handler.setLevel(logging.DEBUG)

View file

@ -10,8 +10,8 @@
# s/o [@Frank Colson](https://www.linkedin.com/in/frank-colson-422b9b183/) for this redis implementation
import os
import inspect
import redis, litellm
import redis.asyncio as async_redis
import redis, litellm # type: ignore
import redis.asyncio as async_redis # type: ignore
from typing import List, Optional
@ -32,6 +32,25 @@ def _get_redis_kwargs():
return available_args
def _get_redis_url_kwargs(client=None):
if client is None:
client = redis.Redis.from_url
arg_spec = inspect.getfullargspec(redis.Redis.from_url)
# Only allow primitive arguments
exclude_args = {
"self",
"connection_pool",
"retry",
}
include_args = ["url"]
available_args = [x for x in arg_spec.args if x not in exclude_args] + include_args
return available_args
def _get_redis_env_kwarg_mapping():
PREFIX = "REDIS_"
@ -91,27 +110,39 @@ def _get_redis_client_logic(**env_overrides):
redis_kwargs.pop("password", None)
elif "host" not in redis_kwargs or redis_kwargs["host"] is None:
raise ValueError("Either 'host' or 'url' must be specified for redis.")
litellm.print_verbose(f"redis_kwargs: {redis_kwargs}")
# litellm.print_verbose(f"redis_kwargs: {redis_kwargs}")
return redis_kwargs
def get_redis_client(**env_overrides):
redis_kwargs = _get_redis_client_logic(**env_overrides)
if "url" in redis_kwargs and redis_kwargs["url"] is not None:
redis_kwargs.pop(
"connection_pool", None
) # redis.from_url doesn't support setting your own connection pool
return redis.Redis.from_url(**redis_kwargs)
args = _get_redis_url_kwargs()
url_kwargs = {}
for arg in redis_kwargs:
if arg in args:
url_kwargs[arg] = redis_kwargs[arg]
return redis.Redis.from_url(**url_kwargs)
return redis.Redis(**redis_kwargs)
def get_redis_async_client(**env_overrides):
redis_kwargs = _get_redis_client_logic(**env_overrides)
if "url" in redis_kwargs and redis_kwargs["url"] is not None:
redis_kwargs.pop(
"connection_pool", None
) # redis.from_url doesn't support setting your own connection pool
return async_redis.Redis.from_url(**redis_kwargs)
args = _get_redis_url_kwargs(client=async_redis.Redis.from_url)
url_kwargs = {}
for arg in redis_kwargs:
if arg in args:
url_kwargs[arg] = redis_kwargs[arg]
else:
litellm.print_verbose(
"REDIS: ignoring argument: {}. Not an allowed async_redis.Redis.from_url arg.".format(
arg
)
)
return async_redis.Redis.from_url(**url_kwargs)
return async_redis.Redis(
socket_timeout=5,
**redis_kwargs,
@ -124,4 +155,9 @@ def get_redis_connection_pool(**env_overrides):
return async_redis.BlockingConnectionPool.from_url(
timeout=5, url=redis_kwargs["url"]
)
connection_class = async_redis.Connection
if "ssl" in redis_kwargs and redis_kwargs["ssl"] is not None:
connection_class = async_redis.SSLConnection
redis_kwargs.pop("ssl", None)
redis_kwargs["connection_class"] = connection_class
return async_redis.BlockingConnectionPool(timeout=5, **redis_kwargs)

130
litellm/_service_logger.py Normal file
View file

@ -0,0 +1,130 @@
import litellm, traceback
from litellm.proxy._types import UserAPIKeyAuth
from .types.services import ServiceTypes, ServiceLoggerPayload
from .integrations.prometheus_services import PrometheusServicesLogger
from .integrations.custom_logger import CustomLogger
from datetime import timedelta
from typing import Union
class ServiceLogging(CustomLogger):
"""
Separate class used for monitoring health of litellm-adjacent services (redis/postgres).
"""
def __init__(self, mock_testing: bool = False) -> None:
self.mock_testing = mock_testing
self.mock_testing_sync_success_hook = 0
self.mock_testing_async_success_hook = 0
self.mock_testing_sync_failure_hook = 0
self.mock_testing_async_failure_hook = 0
if "prometheus_system" in litellm.service_callback:
self.prometheusServicesLogger = PrometheusServicesLogger()
def service_success_hook(
self, service: ServiceTypes, duration: float, call_type: str
):
"""
[TODO] Not implemented for sync calls yet. V0 is focused on async monitoring (used by proxy).
"""
if self.mock_testing:
self.mock_testing_sync_success_hook += 1
def service_failure_hook(
self, service: ServiceTypes, duration: float, error: Exception, call_type: str
):
"""
[TODO] Not implemented for sync calls yet. V0 is focused on async monitoring (used by proxy).
"""
if self.mock_testing:
self.mock_testing_sync_failure_hook += 1
async def async_service_success_hook(
self, service: ServiceTypes, duration: float, call_type: str
):
"""
- For counting if the redis, postgres call is successful
"""
if self.mock_testing:
self.mock_testing_async_success_hook += 1
payload = ServiceLoggerPayload(
is_error=False,
error=None,
service=service,
duration=duration,
call_type=call_type,
)
for callback in litellm.service_callback:
if callback == "prometheus_system":
await self.prometheusServicesLogger.async_service_success_hook(
payload=payload
)
async def async_service_failure_hook(
self,
service: ServiceTypes,
duration: float,
error: Union[str, Exception],
call_type: str,
):
"""
- For counting if the redis, postgres call is unsuccessful
"""
if self.mock_testing:
self.mock_testing_async_failure_hook += 1
error_message = ""
if isinstance(error, Exception):
error_message = str(error)
elif isinstance(error, str):
error_message = error
payload = ServiceLoggerPayload(
is_error=True,
error=error_message,
service=service,
duration=duration,
call_type=call_type,
)
for callback in litellm.service_callback:
if callback == "prometheus_system":
if self.prometheusServicesLogger is None:
self.prometheusServicesLogger = self.prometheusServicesLogger()
await self.prometheusServicesLogger.async_service_failure_hook(
payload=payload
)
async def async_post_call_failure_hook(
self, original_exception: Exception, user_api_key_dict: UserAPIKeyAuth
):
"""
Hook to track failed litellm-service calls
"""
return await super().async_post_call_failure_hook(
original_exception, user_api_key_dict
)
async def async_log_success_event(self, kwargs, response_obj, start_time, end_time):
"""
Hook to track latency for litellm proxy llm api calls
"""
try:
_duration = end_time - start_time
if isinstance(_duration, timedelta):
_duration = _duration.total_seconds()
elif isinstance(_duration, float):
pass
else:
raise Exception(
"Duration={} is not a float or timedelta object. type={}".format(
_duration, type(_duration)
)
) # invalid _duration value
await self.async_service_success_hook(
service=ServiceTypes.LITELLM,
duration=_duration,
call_type=kwargs["call_type"],
)
except Exception as e:
raise e

495
litellm/assistants/main.py Normal file
View file

@ -0,0 +1,495 @@
# What is this?
## Main file for assistants API logic
from typing import Iterable
import os
import litellm
from openai import OpenAI
from litellm import client
from litellm.utils import supports_httpx_timeout
from ..llms.openai import OpenAIAssistantsAPI
from ..types.llms.openai import *
from ..types.router import *
####### ENVIRONMENT VARIABLES ###################
openai_assistants_api = OpenAIAssistantsAPI()
### ASSISTANTS ###
def get_assistants(
custom_llm_provider: Literal["openai"],
client: Optional[OpenAI] = None,
**kwargs,
) -> SyncCursorPage[Assistant]:
optional_params = GenericLiteLLMParams(**kwargs)
### TIMEOUT LOGIC ###
timeout = optional_params.timeout or kwargs.get("request_timeout", 600) or 600
# set timeout for 10 minutes by default
if (
timeout is not None
and isinstance(timeout, httpx.Timeout)
and supports_httpx_timeout(custom_llm_provider) == False
):
read_timeout = timeout.read or 600
timeout = read_timeout # default 10 min timeout
elif timeout is not None and not isinstance(timeout, httpx.Timeout):
timeout = float(timeout) # type: ignore
elif timeout is None:
timeout = 600.0
response: Optional[SyncCursorPage[Assistant]] = None
if custom_llm_provider == "openai":
api_base = (
optional_params.api_base # for deepinfra/perplexity/anyscale/groq we check in get_llm_provider and pass in the api base from there
or litellm.api_base
or os.getenv("OPENAI_API_BASE")
or "https://api.openai.com/v1"
)
organization = (
optional_params.organization
or litellm.organization
or os.getenv("OPENAI_ORGANIZATION", None)
or None # default - https://github.com/openai/openai-python/blob/284c1799070c723c6a553337134148a7ab088dd8/openai/util.py#L105
)
# set API KEY
api_key = (
optional_params.api_key
or litellm.api_key # for deepinfra/perplexity/anyscale we check in get_llm_provider and pass in the api key from there
or litellm.openai_key
or os.getenv("OPENAI_API_KEY")
)
response = openai_assistants_api.get_assistants(
api_base=api_base,
api_key=api_key,
timeout=timeout,
max_retries=optional_params.max_retries,
organization=organization,
client=client,
)
else:
raise litellm.exceptions.BadRequestError(
message="LiteLLM doesn't support {} for 'get_assistants'. Only 'openai' is supported.".format(
custom_llm_provider
),
model="n/a",
llm_provider=custom_llm_provider,
response=httpx.Response(
status_code=400,
content="Unsupported provider",
request=httpx.Request(method="create_thread", url="https://github.com/BerriAI/litellm"), # type: ignore
),
)
return response
### THREADS ###
def create_thread(
custom_llm_provider: Literal["openai"],
messages: Optional[Iterable[OpenAICreateThreadParamsMessage]] = None,
metadata: Optional[dict] = None,
tool_resources: Optional[OpenAICreateThreadParamsToolResources] = None,
client: Optional[OpenAI] = None,
**kwargs,
) -> Thread:
"""
- get the llm provider
- if openai - route it there
- pass through relevant params
```
from litellm import create_thread
create_thread(
custom_llm_provider="openai",
### OPTIONAL ###
messages = {
"role": "user",
"content": "Hello, what is AI?"
},
{
"role": "user",
"content": "How does AI work? Explain it in simple terms."
}]
)
```
"""
optional_params = GenericLiteLLMParams(**kwargs)
### TIMEOUT LOGIC ###
timeout = optional_params.timeout or kwargs.get("request_timeout", 600) or 600
# set timeout for 10 minutes by default
if (
timeout is not None
and isinstance(timeout, httpx.Timeout)
and supports_httpx_timeout(custom_llm_provider) == False
):
read_timeout = timeout.read or 600
timeout = read_timeout # default 10 min timeout
elif timeout is not None and not isinstance(timeout, httpx.Timeout):
timeout = float(timeout) # type: ignore
elif timeout is None:
timeout = 600.0
response: Optional[Thread] = None
if custom_llm_provider == "openai":
api_base = (
optional_params.api_base # for deepinfra/perplexity/anyscale/groq we check in get_llm_provider and pass in the api base from there
or litellm.api_base
or os.getenv("OPENAI_API_BASE")
or "https://api.openai.com/v1"
)
organization = (
optional_params.organization
or litellm.organization
or os.getenv("OPENAI_ORGANIZATION", None)
or None # default - https://github.com/openai/openai-python/blob/284c1799070c723c6a553337134148a7ab088dd8/openai/util.py#L105
)
# set API KEY
api_key = (
optional_params.api_key
or litellm.api_key # for deepinfra/perplexity/anyscale we check in get_llm_provider and pass in the api key from there
or litellm.openai_key
or os.getenv("OPENAI_API_KEY")
)
response = openai_assistants_api.create_thread(
messages=messages,
metadata=metadata,
api_base=api_base,
api_key=api_key,
timeout=timeout,
max_retries=optional_params.max_retries,
organization=organization,
client=client,
)
else:
raise litellm.exceptions.BadRequestError(
message="LiteLLM doesn't support {} for 'create_thread'. Only 'openai' is supported.".format(
custom_llm_provider
),
model="n/a",
llm_provider=custom_llm_provider,
response=httpx.Response(
status_code=400,
content="Unsupported provider",
request=httpx.Request(method="create_thread", url="https://github.com/BerriAI/litellm"), # type: ignore
),
)
return response
def get_thread(
custom_llm_provider: Literal["openai"],
thread_id: str,
client: Optional[OpenAI] = None,
**kwargs,
) -> Thread:
"""Get the thread object, given a thread_id"""
optional_params = GenericLiteLLMParams(**kwargs)
### TIMEOUT LOGIC ###
timeout = optional_params.timeout or kwargs.get("request_timeout", 600) or 600
# set timeout for 10 minutes by default
if (
timeout is not None
and isinstance(timeout, httpx.Timeout)
and supports_httpx_timeout(custom_llm_provider) == False
):
read_timeout = timeout.read or 600
timeout = read_timeout # default 10 min timeout
elif timeout is not None and not isinstance(timeout, httpx.Timeout):
timeout = float(timeout) # type: ignore
elif timeout is None:
timeout = 600.0
response: Optional[Thread] = None
if custom_llm_provider == "openai":
api_base = (
optional_params.api_base # for deepinfra/perplexity/anyscale/groq we check in get_llm_provider and pass in the api base from there
or litellm.api_base
or os.getenv("OPENAI_API_BASE")
or "https://api.openai.com/v1"
)
organization = (
optional_params.organization
or litellm.organization
or os.getenv("OPENAI_ORGANIZATION", None)
or None # default - https://github.com/openai/openai-python/blob/284c1799070c723c6a553337134148a7ab088dd8/openai/util.py#L105
)
# set API KEY
api_key = (
optional_params.api_key
or litellm.api_key # for deepinfra/perplexity/anyscale we check in get_llm_provider and pass in the api key from there
or litellm.openai_key
or os.getenv("OPENAI_API_KEY")
)
response = openai_assistants_api.get_thread(
thread_id=thread_id,
api_base=api_base,
api_key=api_key,
timeout=timeout,
max_retries=optional_params.max_retries,
organization=organization,
client=client,
)
else:
raise litellm.exceptions.BadRequestError(
message="LiteLLM doesn't support {} for 'get_thread'. Only 'openai' is supported.".format(
custom_llm_provider
),
model="n/a",
llm_provider=custom_llm_provider,
response=httpx.Response(
status_code=400,
content="Unsupported provider",
request=httpx.Request(method="create_thread", url="https://github.com/BerriAI/litellm"), # type: ignore
),
)
return response
### MESSAGES ###
def add_message(
custom_llm_provider: Literal["openai"],
thread_id: str,
role: Literal["user", "assistant"],
content: str,
attachments: Optional[List[Attachment]] = None,
metadata: Optional[dict] = None,
client: Optional[OpenAI] = None,
**kwargs,
) -> OpenAIMessage:
### COMMON OBJECTS ###
message_data = MessageData(
role=role, content=content, attachments=attachments, metadata=metadata
)
optional_params = GenericLiteLLMParams(**kwargs)
### TIMEOUT LOGIC ###
timeout = optional_params.timeout or kwargs.get("request_timeout", 600) or 600
# set timeout for 10 minutes by default
if (
timeout is not None
and isinstance(timeout, httpx.Timeout)
and supports_httpx_timeout(custom_llm_provider) == False
):
read_timeout = timeout.read or 600
timeout = read_timeout # default 10 min timeout
elif timeout is not None and not isinstance(timeout, httpx.Timeout):
timeout = float(timeout) # type: ignore
elif timeout is None:
timeout = 600.0
response: Optional[OpenAIMessage] = None
if custom_llm_provider == "openai":
api_base = (
optional_params.api_base # for deepinfra/perplexity/anyscale/groq we check in get_llm_provider and pass in the api base from there
or litellm.api_base
or os.getenv("OPENAI_API_BASE")
or "https://api.openai.com/v1"
)
organization = (
optional_params.organization
or litellm.organization
or os.getenv("OPENAI_ORGANIZATION", None)
or None # default - https://github.com/openai/openai-python/blob/284c1799070c723c6a553337134148a7ab088dd8/openai/util.py#L105
)
# set API KEY
api_key = (
optional_params.api_key
or litellm.api_key # for deepinfra/perplexity/anyscale we check in get_llm_provider and pass in the api key from there
or litellm.openai_key
or os.getenv("OPENAI_API_KEY")
)
response = openai_assistants_api.add_message(
thread_id=thread_id,
message_data=message_data,
api_base=api_base,
api_key=api_key,
timeout=timeout,
max_retries=optional_params.max_retries,
organization=organization,
client=client,
)
else:
raise litellm.exceptions.BadRequestError(
message="LiteLLM doesn't support {} for 'create_thread'. Only 'openai' is supported.".format(
custom_llm_provider
),
model="n/a",
llm_provider=custom_llm_provider,
response=httpx.Response(
status_code=400,
content="Unsupported provider",
request=httpx.Request(method="create_thread", url="https://github.com/BerriAI/litellm"), # type: ignore
),
)
return response
def get_messages(
custom_llm_provider: Literal["openai"],
thread_id: str,
client: Optional[OpenAI] = None,
**kwargs,
) -> SyncCursorPage[OpenAIMessage]:
optional_params = GenericLiteLLMParams(**kwargs)
### TIMEOUT LOGIC ###
timeout = optional_params.timeout or kwargs.get("request_timeout", 600) or 600
# set timeout for 10 minutes by default
if (
timeout is not None
and isinstance(timeout, httpx.Timeout)
and supports_httpx_timeout(custom_llm_provider) == False
):
read_timeout = timeout.read or 600
timeout = read_timeout # default 10 min timeout
elif timeout is not None and not isinstance(timeout, httpx.Timeout):
timeout = float(timeout) # type: ignore
elif timeout is None:
timeout = 600.0
response: Optional[SyncCursorPage[OpenAIMessage]] = None
if custom_llm_provider == "openai":
api_base = (
optional_params.api_base # for deepinfra/perplexity/anyscale/groq we check in get_llm_provider and pass in the api base from there
or litellm.api_base
or os.getenv("OPENAI_API_BASE")
or "https://api.openai.com/v1"
)
organization = (
optional_params.organization
or litellm.organization
or os.getenv("OPENAI_ORGANIZATION", None)
or None # default - https://github.com/openai/openai-python/blob/284c1799070c723c6a553337134148a7ab088dd8/openai/util.py#L105
)
# set API KEY
api_key = (
optional_params.api_key
or litellm.api_key # for deepinfra/perplexity/anyscale we check in get_llm_provider and pass in the api key from there
or litellm.openai_key
or os.getenv("OPENAI_API_KEY")
)
response = openai_assistants_api.get_messages(
thread_id=thread_id,
api_base=api_base,
api_key=api_key,
timeout=timeout,
max_retries=optional_params.max_retries,
organization=organization,
client=client,
)
else:
raise litellm.exceptions.BadRequestError(
message="LiteLLM doesn't support {} for 'get_messages'. Only 'openai' is supported.".format(
custom_llm_provider
),
model="n/a",
llm_provider=custom_llm_provider,
response=httpx.Response(
status_code=400,
content="Unsupported provider",
request=httpx.Request(method="create_thread", url="https://github.com/BerriAI/litellm"), # type: ignore
),
)
return response
### RUNS ###
def run_thread(
custom_llm_provider: Literal["openai"],
thread_id: str,
assistant_id: str,
additional_instructions: Optional[str] = None,
instructions: Optional[str] = None,
metadata: Optional[dict] = None,
model: Optional[str] = None,
stream: Optional[bool] = None,
tools: Optional[Iterable[AssistantToolParam]] = None,
client: Optional[OpenAI] = None,
**kwargs,
) -> Run:
"""Run a given thread + assistant."""
optional_params = GenericLiteLLMParams(**kwargs)
### TIMEOUT LOGIC ###
timeout = optional_params.timeout or kwargs.get("request_timeout", 600) or 600
# set timeout for 10 minutes by default
if (
timeout is not None
and isinstance(timeout, httpx.Timeout)
and supports_httpx_timeout(custom_llm_provider) == False
):
read_timeout = timeout.read or 600
timeout = read_timeout # default 10 min timeout
elif timeout is not None and not isinstance(timeout, httpx.Timeout):
timeout = float(timeout) # type: ignore
elif timeout is None:
timeout = 600.0
response: Optional[Run] = None
if custom_llm_provider == "openai":
api_base = (
optional_params.api_base # for deepinfra/perplexity/anyscale/groq we check in get_llm_provider and pass in the api base from there
or litellm.api_base
or os.getenv("OPENAI_API_BASE")
or "https://api.openai.com/v1"
)
organization = (
optional_params.organization
or litellm.organization
or os.getenv("OPENAI_ORGANIZATION", None)
or None # default - https://github.com/openai/openai-python/blob/284c1799070c723c6a553337134148a7ab088dd8/openai/util.py#L105
)
# set API KEY
api_key = (
optional_params.api_key
or litellm.api_key # for deepinfra/perplexity/anyscale we check in get_llm_provider and pass in the api key from there
or litellm.openai_key
or os.getenv("OPENAI_API_KEY")
)
response = openai_assistants_api.run_thread(
thread_id=thread_id,
assistant_id=assistant_id,
additional_instructions=additional_instructions,
instructions=instructions,
metadata=metadata,
model=model,
stream=stream,
tools=tools,
api_base=api_base,
api_key=api_key,
timeout=timeout,
max_retries=optional_params.max_retries,
organization=organization,
client=client,
)
else:
raise litellm.exceptions.BadRequestError(
message="LiteLLM doesn't support {} for 'run_thread'. Only 'openai' is supported.".format(
custom_llm_provider
),
model="n/a",
llm_provider=custom_llm_provider,
response=httpx.Response(
status_code=400,
content="Unsupported provider",
request=httpx.Request(method="create_thread", url="https://github.com/BerriAI/litellm"), # type: ignore
),
)
return response

View file

@ -10,7 +10,7 @@
import os, json, time
import litellm
from litellm.utils import ModelResponse
import requests, threading
import requests, threading # type: ignore
from typing import Optional, Union, Literal

View file

@ -13,6 +13,7 @@ import json, traceback, ast, hashlib
from typing import Optional, Literal, List, Union, Any, BinaryIO
from openai._models import BaseModel as OpenAIObject
from litellm._logging import verbose_logger
from litellm.types.services import ServiceLoggerPayload, ServiceTypes
import traceback
@ -88,6 +89,13 @@ class InMemoryCache(BaseCache):
return_val.append(val)
return return_val
def increment_cache(self, key, value: int, **kwargs) -> int:
# get the value
init_value = self.get_cache(key=key) or 0
value = init_value + value
self.set_cache(key, value, **kwargs)
return value
async def async_get_cache(self, key, **kwargs):
return self.get_cache(key=key, **kwargs)
@ -98,11 +106,12 @@ class InMemoryCache(BaseCache):
return_val.append(val)
return return_val
async def async_increment(self, key, value: int, **kwargs):
async def async_increment(self, key, value: float, **kwargs) -> float:
# get the value
init_value = await self.async_get_cache(key=key) or 0
value = init_value + value
await self.async_set_cache(key, value, **kwargs)
return value
def flush_cache(self):
self.cache_dict.clear()
@ -129,6 +138,7 @@ class RedisCache(BaseCache):
**kwargs,
):
from ._redis import get_redis_client, get_redis_connection_pool
from litellm._service_logger import ServiceLogging
import redis
redis_kwargs = {}
@ -139,18 +149,19 @@ class RedisCache(BaseCache):
if password is not None:
redis_kwargs["password"] = password
### HEALTH MONITORING OBJECT ###
if kwargs.get("service_logger_obj", None) is not None and isinstance(
kwargs["service_logger_obj"], ServiceLogging
):
self.service_logger_obj = kwargs.pop("service_logger_obj")
else:
self.service_logger_obj = ServiceLogging()
redis_kwargs.update(kwargs)
self.redis_client = get_redis_client(**redis_kwargs)
self.redis_kwargs = redis_kwargs
self.async_redis_conn_pool = get_redis_connection_pool(**redis_kwargs)
if "url" in redis_kwargs and redis_kwargs["url"] is not None:
parsed_kwargs = redis.connection.parse_url(redis_kwargs["url"])
redis_kwargs.update(parsed_kwargs)
self.redis_kwargs.update(parsed_kwargs)
# pop url
self.redis_kwargs.pop("url")
# redis namespaces
self.namespace = namespace
# for high traffic, we store the redis results in memory and then batch write to redis
@ -162,6 +173,23 @@ class RedisCache(BaseCache):
except Exception as e:
pass
### ASYNC HEALTH PING ###
try:
# asyncio.get_running_loop().create_task(self.ping())
result = asyncio.get_running_loop().create_task(self.ping())
except Exception as e:
verbose_logger.error(
"Error connecting to Async Redis client", extra={"error": str(e)}
)
### SYNC HEALTH PING ###
try:
self.redis_client.ping()
except Exception as e:
verbose_logger.error(
"Error connecting to Sync Redis client", extra={"error": str(e)}
)
def init_async_client(self):
from ._redis import get_redis_async_client
@ -192,18 +220,101 @@ class RedisCache(BaseCache):
f"LiteLLM Caching: set() - Got exception from REDIS : {str(e)}"
)
def increment_cache(self, key, value: int, **kwargs) -> int:
_redis_client = self.redis_client
start_time = time.time()
try:
result = _redis_client.incr(name=key, amount=value)
## LOGGING ##
end_time = time.time()
_duration = end_time - start_time
asyncio.create_task(
self.service_logger_obj.service_success_hook(
service=ServiceTypes.REDIS,
duration=_duration,
call_type="increment_cache",
)
)
return result
except Exception as e:
## LOGGING ##
end_time = time.time()
_duration = end_time - start_time
asyncio.create_task(
self.service_logger_obj.async_service_failure_hook(
service=ServiceTypes.REDIS,
duration=_duration,
error=e,
call_type="increment_cache",
)
)
verbose_logger.error(
"LiteLLM Redis Caching: increment_cache() - Got exception from REDIS %s, Writing value=%s",
str(e),
value,
)
traceback.print_exc()
raise e
async def async_scan_iter(self, pattern: str, count: int = 100) -> list:
keys = []
_redis_client = self.init_async_client()
async with _redis_client as redis_client:
async for key in redis_client.scan_iter(match=pattern + "*", count=count):
keys.append(key)
if len(keys) >= count:
break
return keys
start_time = time.time()
try:
keys = []
_redis_client = self.init_async_client()
async with _redis_client as redis_client:
async for key in redis_client.scan_iter(
match=pattern + "*", count=count
):
keys.append(key)
if len(keys) >= count:
break
## LOGGING ##
end_time = time.time()
_duration = end_time - start_time
asyncio.create_task(
self.service_logger_obj.async_service_success_hook(
service=ServiceTypes.REDIS,
duration=_duration,
call_type="async_scan_iter",
)
) # DO NOT SLOW DOWN CALL B/C OF THIS
return keys
except Exception as e:
# NON blocking - notify users Redis is throwing an exception
## LOGGING ##
end_time = time.time()
_duration = end_time - start_time
asyncio.create_task(
self.service_logger_obj.async_service_failure_hook(
service=ServiceTypes.REDIS,
duration=_duration,
error=e,
call_type="async_scan_iter",
)
)
raise e
async def async_set_cache(self, key, value, **kwargs):
_redis_client = self.init_async_client()
start_time = time.time()
try:
_redis_client = self.init_async_client()
except Exception as e:
end_time = time.time()
_duration = end_time - start_time
asyncio.create_task(
self.service_logger_obj.async_service_failure_hook(
service=ServiceTypes.REDIS, duration=_duration, error=e
)
)
# NON blocking - notify users Redis is throwing an exception
verbose_logger.error(
"LiteLLM Redis Caching: async set() - Got exception from REDIS %s, Writing value=%s",
str(e),
value,
)
traceback.print_exc()
key = self.check_and_fix_namespace(key=key)
async with _redis_client as redis_client:
ttl = kwargs.get("ttl", None)
@ -215,7 +326,26 @@ class RedisCache(BaseCache):
print_verbose(
f"Successfully Set ASYNC Redis Cache: key: {key}\nValue {value}\nttl={ttl}"
)
end_time = time.time()
_duration = end_time - start_time
asyncio.create_task(
self.service_logger_obj.async_service_success_hook(
service=ServiceTypes.REDIS,
duration=_duration,
call_type="async_set_cache",
)
)
except Exception as e:
end_time = time.time()
_duration = end_time - start_time
asyncio.create_task(
self.service_logger_obj.async_service_failure_hook(
service=ServiceTypes.REDIS,
duration=_duration,
error=e,
call_type="async_set_cache",
)
)
# NON blocking - notify users Redis is throwing an exception
verbose_logger.error(
"LiteLLM Redis Caching: async set() - Got exception from REDIS %s, Writing value=%s",
@ -229,6 +359,11 @@ class RedisCache(BaseCache):
Use Redis Pipelines for bulk write operations
"""
_redis_client = self.init_async_client()
start_time = time.time()
print_verbose(
f"Set Async Redis Cache: key list: {cache_list}\nttl={ttl}, redis_version={self.redis_version}"
)
try:
async with _redis_client as redis_client:
async with redis_client.pipeline(transaction=True) as pipe:
@ -238,18 +373,41 @@ class RedisCache(BaseCache):
print_verbose(
f"Set ASYNC Redis Cache PIPELINE: key: {cache_key}\nValue {cache_value}\nttl={ttl}"
)
json_cache_value = json.dumps(cache_value)
# Set the value with a TTL if it's provided.
if ttl is not None:
pipe.setex(cache_key, ttl, json.dumps(cache_value))
pipe.setex(cache_key, ttl, json_cache_value)
else:
pipe.set(cache_key, json.dumps(cache_value))
pipe.set(cache_key, json_cache_value)
# Execute the pipeline and return the results.
results = await pipe.execute()
print_verbose(f"pipeline results: {results}")
# Optionally, you could process 'results' to make sure that all set operations were successful.
## LOGGING ##
end_time = time.time()
_duration = end_time - start_time
asyncio.create_task(
self.service_logger_obj.async_service_success_hook(
service=ServiceTypes.REDIS,
duration=_duration,
call_type="async_set_cache_pipeline",
)
)
return results
except Exception as e:
## LOGGING ##
end_time = time.time()
_duration = end_time - start_time
asyncio.create_task(
self.service_logger_obj.async_service_failure_hook(
service=ServiceTypes.REDIS,
duration=_duration,
error=e,
call_type="async_set_cache_pipeline",
)
)
verbose_logger.error(
"LiteLLM Redis Caching: async set_cache_pipeline() - Got exception from REDIS %s, Writing value=%s",
str(e),
@ -264,20 +422,44 @@ class RedisCache(BaseCache):
key = self.check_and_fix_namespace(key=key)
self.redis_batch_writing_buffer.append((key, value))
if len(self.redis_batch_writing_buffer) >= self.redis_flush_size:
await self.flush_cache_buffer()
await self.flush_cache_buffer() # logging done in here
async def async_increment(self, key, value: int, **kwargs):
async def async_increment(self, key, value: float, **kwargs) -> float:
_redis_client = self.init_async_client()
start_time = time.time()
try:
async with _redis_client as redis_client:
await redis_client.incr(name=key, amount=value)
result = await redis_client.incrbyfloat(name=key, amount=value)
## LOGGING ##
end_time = time.time()
_duration = end_time - start_time
asyncio.create_task(
self.service_logger_obj.async_service_success_hook(
service=ServiceTypes.REDIS,
duration=_duration,
call_type="async_increment",
)
)
return result
except Exception as e:
## LOGGING ##
end_time = time.time()
_duration = end_time - start_time
asyncio.create_task(
self.service_logger_obj.async_service_failure_hook(
service=ServiceTypes.REDIS,
duration=_duration,
error=e,
call_type="async_increment",
)
)
verbose_logger.error(
"LiteLLM Redis Caching: async async_increment() - Got exception from REDIS %s, Writing value=%s",
str(e),
value,
)
traceback.print_exc()
raise e
async def flush_cache_buffer(self):
print_verbose(
@ -345,6 +527,7 @@ class RedisCache(BaseCache):
async def async_get_cache(self, key, **kwargs):
_redis_client = self.init_async_client()
key = self.check_and_fix_namespace(key=key)
start_time = time.time()
async with _redis_client as redis_client:
try:
print_verbose(f"Get Async Redis Cache: key: {key}")
@ -353,8 +536,29 @@ class RedisCache(BaseCache):
f"Got Async Redis Cache: key: {key}, cached_response {cached_response}"
)
response = self._get_cache_logic(cached_response=cached_response)
## LOGGING ##
end_time = time.time()
_duration = end_time - start_time
asyncio.create_task(
self.service_logger_obj.async_service_success_hook(
service=ServiceTypes.REDIS,
duration=_duration,
call_type="async_get_cache",
)
)
return response
except Exception as e:
## LOGGING ##
end_time = time.time()
_duration = end_time - start_time
asyncio.create_task(
self.service_logger_obj.async_service_failure_hook(
service=ServiceTypes.REDIS,
duration=_duration,
error=e,
call_type="async_get_cache",
)
)
# NON blocking - notify users Redis is throwing an exception
print_verbose(
f"LiteLLM Caching: async get() - Got exception from REDIS: {str(e)}"
@ -366,6 +570,7 @@ class RedisCache(BaseCache):
"""
_redis_client = await self.init_async_client()
key_value_dict = {}
start_time = time.time()
try:
async with _redis_client as redis_client:
_keys = []
@ -374,29 +579,110 @@ class RedisCache(BaseCache):
_keys.append(cache_key)
results = await redis_client.mget(keys=_keys)
## LOGGING ##
end_time = time.time()
_duration = end_time - start_time
asyncio.create_task(
self.service_logger_obj.async_service_success_hook(
service=ServiceTypes.REDIS,
duration=_duration,
call_type="async_batch_get_cache",
)
)
# Associate the results back with their keys.
# 'results' is a list of values corresponding to the order of keys in 'key_list'.
key_value_dict = dict(zip(key_list, results))
decoded_results = {
k.decode("utf-8"): self._get_cache_logic(v)
for k, v in key_value_dict.items()
}
decoded_results = {}
for k, v in key_value_dict.items():
if isinstance(k, bytes):
k = k.decode("utf-8")
v = self._get_cache_logic(v)
decoded_results[k] = v
return decoded_results
except Exception as e:
## LOGGING ##
end_time = time.time()
_duration = end_time - start_time
asyncio.create_task(
self.service_logger_obj.async_service_failure_hook(
service=ServiceTypes.REDIS,
duration=_duration,
error=e,
call_type="async_batch_get_cache",
)
)
print_verbose(f"Error occurred in pipeline read - {str(e)}")
return key_value_dict
async def ping(self):
def sync_ping(self) -> bool:
"""
Tests if the sync redis client is correctly setup.
"""
print_verbose(f"Pinging Sync Redis Cache")
start_time = time.time()
try:
response = self.redis_client.ping()
print_verbose(f"Redis Cache PING: {response}")
## LOGGING ##
end_time = time.time()
_duration = end_time - start_time
self.service_logger_obj.service_success_hook(
service=ServiceTypes.REDIS,
duration=_duration,
call_type="sync_ping",
)
return response
except Exception as e:
# NON blocking - notify users Redis is throwing an exception
## LOGGING ##
end_time = time.time()
_duration = end_time - start_time
self.service_logger_obj.service_failure_hook(
service=ServiceTypes.REDIS,
duration=_duration,
error=e,
call_type="sync_ping",
)
print_verbose(
f"LiteLLM Redis Cache PING: - Got exception from REDIS : {str(e)}"
)
traceback.print_exc()
raise e
async def ping(self) -> bool:
_redis_client = self.init_async_client()
start_time = time.time()
async with _redis_client as redis_client:
print_verbose(f"Pinging Async Redis Cache")
try:
response = await redis_client.ping()
print_verbose(f"Redis Cache PING: {response}")
## LOGGING ##
end_time = time.time()
_duration = end_time - start_time
asyncio.create_task(
self.service_logger_obj.async_service_success_hook(
service=ServiceTypes.REDIS,
duration=_duration,
call_type="async_ping",
)
)
return response
except Exception as e:
# NON blocking - notify users Redis is throwing an exception
## LOGGING ##
end_time = time.time()
_duration = end_time - start_time
asyncio.create_task(
self.service_logger_obj.async_service_failure_hook(
service=ServiceTypes.REDIS,
duration=_duration,
error=e,
call_type="async_ping",
)
)
print_verbose(
f"LiteLLM Redis Cache PING: - Got exception from REDIS : {str(e)}"
)
@ -525,9 +811,7 @@ class RedisSemanticCache(BaseCache):
# get the prompt
messages = kwargs["messages"]
prompt = ""
for message in messages:
prompt += message["content"]
prompt = "".join(message["content"] for message in messages)
# create an embedding for prompt
embedding_response = litellm.embedding(
@ -562,9 +846,7 @@ class RedisSemanticCache(BaseCache):
# get the messages
messages = kwargs["messages"]
prompt = ""
for message in messages:
prompt += message["content"]
prompt = "".join(message["content"] for message in messages)
# convert to embedding
embedding_response = litellm.embedding(
@ -624,9 +906,7 @@ class RedisSemanticCache(BaseCache):
# get the prompt
messages = kwargs["messages"]
prompt = ""
for message in messages:
prompt += message["content"]
prompt = "".join(message["content"] for message in messages)
# create an embedding for prompt
router_model_names = (
[m["model_name"] for m in llm_model_list]
@ -679,9 +959,7 @@ class RedisSemanticCache(BaseCache):
# get the messages
messages = kwargs["messages"]
prompt = ""
for message in messages:
prompt += message["content"]
prompt = "".join(message["content"] for message in messages)
router_model_names = (
[m["model_name"] for m in llm_model_list]
@ -927,6 +1205,30 @@ class DualCache(BaseCache):
except Exception as e:
print_verbose(e)
def increment_cache(
self, key, value: int, local_only: bool = False, **kwargs
) -> int:
"""
Key - the key in cache
Value - int - the value you want to increment by
Returns - int - the incremented value
"""
try:
result: int = value
if self.in_memory_cache is not None:
result = self.in_memory_cache.increment_cache(key, value, **kwargs)
if self.redis_cache is not None and local_only == False:
result = self.redis_cache.increment_cache(key, value, **kwargs)
return result
except Exception as e:
print_verbose(f"LiteLLM Cache: Excepton async add_cache: {str(e)}")
traceback.print_exc()
raise e
def get_cache(self, key, local_only: bool = False, **kwargs):
# Try to fetch from in-memory cache first
try:
@ -979,7 +1281,7 @@ class DualCache(BaseCache):
self.in_memory_cache.set_cache(key, redis_result[key], **kwargs)
for key, value in redis_result.items():
result[sublist_keys.index(key)] = value
result[keys.index(key)] = value
print_verbose(f"async batch get cache: cache result: {result}")
return result
@ -1029,10 +1331,8 @@ class DualCache(BaseCache):
keys, **kwargs
)
print_verbose(f"in_memory_result: {in_memory_result}")
if in_memory_result is not None:
result = in_memory_result
if None in result and self.redis_cache is not None and local_only == False:
"""
- for the none values in the result
@ -1048,22 +1348,23 @@ class DualCache(BaseCache):
if redis_result is not None:
# Update in-memory cache with the value from Redis
for key in redis_result:
await self.in_memory_cache.async_set_cache(
key, redis_result[key], **kwargs
)
for key, value in redis_result.items():
if value is not None:
await self.in_memory_cache.async_set_cache(
key, redis_result[key], **kwargs
)
for key, value in redis_result.items():
index = keys.index(key)
result[index] = value
sublist_dict = dict(zip(sublist_keys, redis_result))
for key, value in sublist_dict.items():
result[sublist_keys.index(key)] = value
print_verbose(f"async batch get cache: cache result: {result}")
return result
except Exception as e:
traceback.print_exc()
async def async_set_cache(self, key, value, local_only: bool = False, **kwargs):
print_verbose(
f"async set cache: cache key: {key}; local_only: {local_only}; value: {value}"
)
try:
if self.in_memory_cache is not None:
await self.in_memory_cache.async_set_cache(key, value, **kwargs)
@ -1074,24 +1375,55 @@ class DualCache(BaseCache):
print_verbose(f"LiteLLM Cache: Excepton async add_cache: {str(e)}")
traceback.print_exc()
async def async_increment_cache(
self, key, value: int, local_only: bool = False, **kwargs
async def async_batch_set_cache(
self, cache_list: list, local_only: bool = False, **kwargs
):
"""
Key - the key in cache
Value - int - the value you want to increment by
Batch write values to the cache
"""
print_verbose(
f"async batch set cache: cache keys: {cache_list}; local_only: {local_only}"
)
try:
if self.in_memory_cache is not None:
await self.in_memory_cache.async_increment(key, value, **kwargs)
await self.in_memory_cache.async_set_cache_pipeline(
cache_list=cache_list, **kwargs
)
if self.redis_cache is not None and local_only == False:
await self.redis_cache.async_increment(key, value, **kwargs)
await self.redis_cache.async_set_cache_pipeline(
cache_list=cache_list, ttl=kwargs.get("ttl", None)
)
except Exception as e:
print_verbose(f"LiteLLM Cache: Excepton async add_cache: {str(e)}")
traceback.print_exc()
async def async_increment_cache(
self, key, value: float, local_only: bool = False, **kwargs
) -> float:
"""
Key - the key in cache
Value - float - the value you want to increment by
Returns - float - the incremented value
"""
try:
result: float = value
if self.in_memory_cache is not None:
result = await self.in_memory_cache.async_increment(
key, value, **kwargs
)
if self.redis_cache is not None and local_only == False:
result = await self.redis_cache.async_increment(key, value, **kwargs)
return result
except Exception as e:
print_verbose(f"LiteLLM Cache: Excepton async add_cache: {str(e)}")
traceback.print_exc()
raise e
def flush_cache(self):
if self.in_memory_cache is not None:
self.in_memory_cache.flush_cache()
@ -1109,7 +1441,7 @@ class DualCache(BaseCache):
class Cache:
def __init__(
self,
type: Optional[Literal["local", "redis", "redis-semantic", "s3"]] = "local",
type: Optional[Literal["local", "redis", "redis-semantic", "s3", "disk"]] = "local",
host: Optional[str] = None,
port: Optional[str] = None,
password: Optional[str] = None,
@ -1152,13 +1484,14 @@ class Cache:
redis_semantic_cache_use_async=False,
redis_semantic_cache_embedding_model="text-embedding-ada-002",
redis_flush_size=None,
disk_cache_dir=None,
**kwargs,
):
"""
Initializes the cache based on the given type.
Args:
type (str, optional): The type of cache to initialize. Can be "local", "redis", "redis-semantic", or "s3". Defaults to "local".
type (str, optional): The type of cache to initialize. Can be "local", "redis", "redis-semantic", "s3" or "disk". Defaults to "local".
host (str, optional): The host address for the Redis cache. Required if type is "redis".
port (int, optional): The port number for the Redis cache. Required if type is "redis".
password (str, optional): The password for the Redis cache. Required if type is "redis".
@ -1204,6 +1537,8 @@ class Cache:
s3_path=s3_path,
**kwargs,
)
elif type == "disk":
self.cache = DiskCache(disk_cache_dir=disk_cache_dir)
if "cache" not in litellm.input_callback:
litellm.input_callback.append("cache")
if "cache" not in litellm.success_callback:
@ -1575,8 +1910,86 @@ class Cache:
await self.cache.disconnect()
class DiskCache(BaseCache):
def __init__(self, disk_cache_dir: Optional[str] = None):
import diskcache as dc
# if users don't provider one, use the default litellm cache
if disk_cache_dir is None:
self.disk_cache = dc.Cache(".litellm_cache")
else:
self.disk_cache = dc.Cache(disk_cache_dir)
def set_cache(self, key, value, **kwargs):
print_verbose("DiskCache: set_cache")
if "ttl" in kwargs:
self.disk_cache.set(key, value, expire=kwargs["ttl"])
else:
self.disk_cache.set(key, value)
async def async_set_cache(self, key, value, **kwargs):
self.set_cache(key=key, value=value, **kwargs)
async def async_set_cache_pipeline(self, cache_list, ttl=None):
for cache_key, cache_value in cache_list:
if ttl is not None:
self.set_cache(key=cache_key, value=cache_value, ttl=ttl)
else:
self.set_cache(key=cache_key, value=cache_value)
def get_cache(self, key, **kwargs):
original_cached_response = self.disk_cache.get(key)
if original_cached_response:
try:
cached_response = json.loads(original_cached_response)
except:
cached_response = original_cached_response
return cached_response
return None
def batch_get_cache(self, keys: list, **kwargs):
return_val = []
for k in keys:
val = self.get_cache(key=k, **kwargs)
return_val.append(val)
return return_val
def increment_cache(self, key, value: int, **kwargs) -> int:
# get the value
init_value = self.get_cache(key=key) or 0
value = init_value + value
self.set_cache(key, value, **kwargs)
return value
async def async_get_cache(self, key, **kwargs):
return self.get_cache(key=key, **kwargs)
async def async_batch_get_cache(self, keys: list, **kwargs):
return_val = []
for k in keys:
val = self.get_cache(key=k, **kwargs)
return_val.append(val)
return return_val
async def async_increment(self, key, value: int, **kwargs) -> int:
# get the value
init_value = await self.async_get_cache(key=key) or 0
value = init_value + value
await self.async_set_cache(key, value, **kwargs)
return value
def flush_cache(self):
self.disk_cache.clear()
async def disconnect(self):
pass
def delete_cache(self, key):
self.disk_cache.pop(key)
def enable_cache(
type: Optional[Literal["local", "redis", "s3"]] = "local",
type: Optional[Literal["local", "redis", "s3", "disk"]] = "local",
host: Optional[str] = None,
port: Optional[str] = None,
password: Optional[str] = None,
@ -1605,7 +2018,7 @@ def enable_cache(
Enable cache with the specified configuration.
Args:
type (Optional[Literal["local", "redis"]]): The type of cache to enable. Defaults to "local".
type (Optional[Literal["local", "redis", "s3", "disk"]]): The type of cache to enable. Defaults to "local".
host (Optional[str]): The host address of the cache server. Defaults to None.
port (Optional[str]): The port number of the cache server. Defaults to None.
password (Optional[str]): The password for the cache server. Defaults to None.
@ -1641,7 +2054,7 @@ def enable_cache(
def update_cache(
type: Optional[Literal["local", "redis"]] = "local",
type: Optional[Literal["local", "redis", "s3", "disk"]] = "local",
host: Optional[str] = None,
port: Optional[str] = None,
password: Optional[str] = None,
@ -1670,7 +2083,7 @@ def update_cache(
Update the cache for LiteLLM.
Args:
type (Optional[Literal["local", "redis"]]): The type of cache. Defaults to "local".
type (Optional[Literal["local", "redis", "s3", "disk"]]): The type of cache. Defaults to "local".
host (Optional[str]): The host of the cache. Defaults to None.
port (Optional[str]): The port of the cache. Defaults to None.
password (Optional[str]): The password for the cache. Defaults to None.

View file

@ -9,55 +9,64 @@
## LiteLLM versions of the OpenAI Exception Types
from openai import (
AuthenticationError,
BadRequestError,
NotFoundError,
RateLimitError,
APIStatusError,
OpenAIError,
APIError,
APITimeoutError,
APIConnectionError,
APIResponseValidationError,
UnprocessableEntityError,
PermissionDeniedError,
)
import openai
import httpx
from typing import Optional
class AuthenticationError(AuthenticationError): # type: ignore
def __init__(self, message, llm_provider, model, response: httpx.Response):
class AuthenticationError(openai.AuthenticationError): # type: ignore
def __init__(
self,
message,
llm_provider,
model,
response: httpx.Response,
litellm_debug_info: Optional[str] = None,
):
self.status_code = 401
self.message = message
self.llm_provider = llm_provider
self.model = model
self.litellm_debug_info = litellm_debug_info
super().__init__(
self.message, response=response, body=None
) # Call the base class constructor with the parameters it needs
# raise when invalid models passed, example gpt-8
class NotFoundError(NotFoundError): # type: ignore
def __init__(self, message, model, llm_provider, response: httpx.Response):
class NotFoundError(openai.NotFoundError): # type: ignore
def __init__(
self,
message,
model,
llm_provider,
response: httpx.Response,
litellm_debug_info: Optional[str] = None,
):
self.status_code = 404
self.message = message
self.model = model
self.llm_provider = llm_provider
self.litellm_debug_info = litellm_debug_info
super().__init__(
self.message, response=response, body=None
) # Call the base class constructor with the parameters it needs
class BadRequestError(BadRequestError): # type: ignore
class BadRequestError(openai.BadRequestError): # type: ignore
def __init__(
self, message, model, llm_provider, response: Optional[httpx.Response] = None
self,
message,
model,
llm_provider,
response: Optional[httpx.Response] = None,
litellm_debug_info: Optional[str] = None,
):
self.status_code = 400
self.message = message
self.model = model
self.llm_provider = llm_provider
self.litellm_debug_info = litellm_debug_info
response = response or httpx.Response(
status_code=self.status_code,
request=httpx.Request(
@ -69,46 +78,77 @@ class BadRequestError(BadRequestError): # type: ignore
) # Call the base class constructor with the parameters it needs
class UnprocessableEntityError(UnprocessableEntityError): # type: ignore
def __init__(self, message, model, llm_provider, response: httpx.Response):
class UnprocessableEntityError(openai.UnprocessableEntityError): # type: ignore
def __init__(
self,
message,
model,
llm_provider,
response: httpx.Response,
litellm_debug_info: Optional[str] = None,
):
self.status_code = 422
self.message = message
self.model = model
self.llm_provider = llm_provider
self.litellm_debug_info = litellm_debug_info
super().__init__(
self.message, response=response, body=None
) # Call the base class constructor with the parameters it needs
class Timeout(APITimeoutError): # type: ignore
def __init__(self, message, model, llm_provider):
self.status_code = 408
self.message = message
self.model = model
self.llm_provider = llm_provider
class Timeout(openai.APITimeoutError): # type: ignore
def __init__(
self, message, model, llm_provider, litellm_debug_info: Optional[str] = None
):
request = httpx.Request(method="POST", url="https://api.openai.com/v1")
super().__init__(
request=request
) # Call the base class constructor with the parameters it needs
self.status_code = 408
self.message = message
self.model = model
self.llm_provider = llm_provider
self.litellm_debug_info = litellm_debug_info
# custom function to convert to str
def __str__(self):
return str(self.message)
class PermissionDeniedError(PermissionDeniedError): # type:ignore
def __init__(self, message, llm_provider, model, response: httpx.Response):
class PermissionDeniedError(openai.PermissionDeniedError): # type:ignore
def __init__(
self,
message,
llm_provider,
model,
response: httpx.Response,
litellm_debug_info: Optional[str] = None,
):
self.status_code = 403
self.message = message
self.llm_provider = llm_provider
self.model = model
self.litellm_debug_info = litellm_debug_info
super().__init__(
self.message, response=response, body=None
) # Call the base class constructor with the parameters it needs
class RateLimitError(RateLimitError): # type: ignore
def __init__(self, message, llm_provider, model, response: httpx.Response):
class RateLimitError(openai.RateLimitError): # type: ignore
def __init__(
self,
message,
llm_provider,
model,
response: httpx.Response,
litellm_debug_info: Optional[str] = None,
):
self.status_code = 429
self.message = message
self.llm_provider = llm_provider
self.modle = model
self.litellm_debug_info = litellm_debug_info
super().__init__(
self.message, response=response, body=None
) # Call the base class constructor with the parameters it needs
@ -116,11 +156,19 @@ class RateLimitError(RateLimitError): # type: ignore
# sub class of rate limit error - meant to give more granularity for error handling context window exceeded errors
class ContextWindowExceededError(BadRequestError): # type: ignore
def __init__(self, message, model, llm_provider, response: httpx.Response):
def __init__(
self,
message,
model,
llm_provider,
response: httpx.Response,
litellm_debug_info: Optional[str] = None,
):
self.status_code = 400
self.message = message
self.model = model
self.llm_provider = llm_provider
self.litellm_debug_info = litellm_debug_info
super().__init__(
message=self.message,
model=self.model, # type: ignore
@ -131,11 +179,19 @@ class ContextWindowExceededError(BadRequestError): # type: ignore
class ContentPolicyViolationError(BadRequestError): # type: ignore
# Error code: 400 - {'error': {'code': 'content_policy_violation', 'message': 'Your request was rejected as a result of our safety system. Image descriptions generated from your prompt may contain text that is not allowed by our safety system. If you believe this was done in error, your request may succeed if retried, or by adjusting your prompt.', 'param': None, 'type': 'invalid_request_error'}}
def __init__(self, message, model, llm_provider, response: httpx.Response):
def __init__(
self,
message,
model,
llm_provider,
response: httpx.Response,
litellm_debug_info: Optional[str] = None,
):
self.status_code = 400
self.message = message
self.model = model
self.llm_provider = llm_provider
self.litellm_debug_info = litellm_debug_info
super().__init__(
message=self.message,
model=self.model, # type: ignore
@ -144,51 +200,77 @@ class ContentPolicyViolationError(BadRequestError): # type: ignore
) # Call the base class constructor with the parameters it needs
class ServiceUnavailableError(APIStatusError): # type: ignore
def __init__(self, message, llm_provider, model, response: httpx.Response):
class ServiceUnavailableError(openai.APIStatusError): # type: ignore
def __init__(
self,
message,
llm_provider,
model,
response: httpx.Response,
litellm_debug_info: Optional[str] = None,
):
self.status_code = 503
self.message = message
self.llm_provider = llm_provider
self.model = model
self.litellm_debug_info = litellm_debug_info
super().__init__(
self.message, response=response, body=None
) # Call the base class constructor with the parameters it needs
# raise this when the API returns an invalid response object - https://github.com/openai/openai-python/blob/1be14ee34a0f8e42d3f9aa5451aa4cb161f1781f/openai/api_requestor.py#L401
class APIError(APIError): # type: ignore
class APIError(openai.APIError): # type: ignore
def __init__(
self, status_code, message, llm_provider, model, request: httpx.Request
self,
status_code,
message,
llm_provider,
model,
request: httpx.Request,
litellm_debug_info: Optional[str] = None,
):
self.status_code = status_code
self.message = message
self.llm_provider = llm_provider
self.model = model
self.litellm_debug_info = litellm_debug_info
super().__init__(self.message, request=request, body=None) # type: ignore
# raised if an invalid request (not get, delete, put, post) is made
class APIConnectionError(APIConnectionError): # type: ignore
def __init__(self, message, llm_provider, model, request: httpx.Request):
class APIConnectionError(openai.APIConnectionError): # type: ignore
def __init__(
self,
message,
llm_provider,
model,
request: httpx.Request,
litellm_debug_info: Optional[str] = None,
):
self.message = message
self.llm_provider = llm_provider
self.model = model
self.status_code = 500
self.litellm_debug_info = litellm_debug_info
super().__init__(message=self.message, request=request)
# raised if an invalid request (not get, delete, put, post) is made
class APIResponseValidationError(APIResponseValidationError): # type: ignore
def __init__(self, message, llm_provider, model):
class APIResponseValidationError(openai.APIResponseValidationError): # type: ignore
def __init__(
self, message, llm_provider, model, litellm_debug_info: Optional[str] = None
):
self.message = message
self.llm_provider = llm_provider
self.model = model
request = httpx.Request(method="POST", url="https://api.openai.com/v1")
response = httpx.Response(status_code=500, request=request)
self.litellm_debug_info = litellm_debug_info
super().__init__(response=response, body=None, message=message)
class OpenAIError(OpenAIError): # type: ignore
class OpenAIError(openai.OpenAIError): # type: ignore
def __init__(self, original_exception):
self.status_code = original_exception.http_status
super().__init__(
@ -210,7 +292,7 @@ class BudgetExceededError(Exception):
## DEPRECATED ##
class InvalidRequestError(BadRequestError): # type: ignore
class InvalidRequestError(openai.BadRequestError): # type: ignore
def __init__(self, message, model, llm_provider):
self.status_code = 400
self.message = message

View file

@ -1,9 +1,6 @@
#### What this does ####
# On success + failure, log events to aispend.io
import dotenv, os
import requests
dotenv.load_dotenv() # Loading env variables using dotenv
import traceback
import datetime

Some files were not shown because too many files have changed in this diff Show more