Merge branch 'main' into support_anthropic_function_result

This commit is contained in:
Krish Dholakia 2024-03-16 09:58:08 -07:00 committed by GitHub
commit 0368a335e6
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
42 changed files with 815 additions and 216 deletions

.dockerignore Normal file (5 additions)
View file

@ -0,0 +1,5 @@
/docs
/cookbook
/.circleci
/.github
/tests

View file

@ -10,6 +10,7 @@ on:
env: env:
REGISTRY: ghcr.io REGISTRY: ghcr.io
IMAGE_NAME: ${{ github.repository }} IMAGE_NAME: ${{ github.repository }}
CHART_NAME: litellm
# There is a single job in this workflow. It's configured to run on the latest available version of Ubuntu. # There is a single job in this workflow. It's configured to run on the latest available version of Ubuntu.
jobs: jobs:
@ -103,6 +104,11 @@ jobs:
uses: docker/metadata-action@9ec57ed1fcdbf14dcef7dfbe97b2010124a938b7 uses: docker/metadata-action@9ec57ed1fcdbf14dcef7dfbe97b2010124a938b7
with: with:
images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}-database images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}-database
# Configure multi platform Docker builds
- name: Set up QEMU
uses: docker/setup-qemu-action@e0e4588fad221d38ee467c0bffd91115366dc0c5
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@edfb0fe6204400c56fbfd3feba3fe9ad1adfa345
- name: Build and push Database Docker image - name: Build and push Database Docker image
uses: docker/build-push-action@f2a1d5e99d037542a71f64918e516c093c6f3fc4 uses: docker/build-push-action@f2a1d5e99d037542a71f64918e516c093c6f3fc4
@ -112,6 +118,60 @@ jobs:
push: true push: true
tags: ${{ steps.meta-database.outputs.tags }}-${{ github.event.inputs.tag || 'latest' }}, ${{ steps.meta-database.outputs.tags }}-latest tags: ${{ steps.meta-database.outputs.tags }}-${{ github.event.inputs.tag || 'latest' }}, ${{ steps.meta-database.outputs.tags }}-latest
labels: ${{ steps.meta-database.outputs.labels }} labels: ${{ steps.meta-database.outputs.labels }}
platforms: local,linux/amd64,linux/arm64,linux/arm64/v8
build-and-push-helm-chart:
runs-on: ubuntu-latest
steps:
- name: Checkout repository
uses: actions/checkout@v4
- name: Log in to the Container registry
uses: docker/login-action@65b78e6e13532edd9afa3aa52ac7964289d1a9c1
with:
registry: ${{ env.REGISTRY }}
username: ${{ github.actor }}
password: ${{ secrets.GITHUB_TOKEN }}
- name: lowercase github.repository_owner
run: |
echo "REPO_OWNER=`echo ${{github.repository_owner}} | tr '[:upper:]' '[:lower:]'`" >>${GITHUB_ENV}
- name: Get LiteLLM Latest Tag
id: current_app_tag
uses: WyriHaximus/github-action-get-previous-tag@v1.3.0
- name: Get last published chart version
id: current_version
shell: bash
run: |
CHART_LIST=$(helm show chart oci://${{ env.REGISTRY }}/${{ env.REPO_OWNER }}/${{ env.CHART_NAME }} 2>/dev/null || true)
if [ -z "${CHART_LIST}" ]; then
echo "current-version=0.1.0" | tee -a $GITHUB_OUTPUT
else
printf '%s' "${CHART_LIST}" | grep '^version:' | awk 'BEGIN{FS=":"}{print "current-version="$2}' | tr -d " " | tee -a $GITHUB_OUTPUT
fi
env:
HELM_EXPERIMENTAL_OCI: '1'
# Automatically update the helm chart version one "patch" level
- name: Bump release version
id: bump_version
uses: christian-draeger/increment-semantic-version@1.1.0
with:
current-version: ${{ steps.current_version.outputs.current-version || '0.1.0' }}
version-fragment: 'bug'
- uses: ./.github/actions/helm-oci-chart-releaser
with:
name: ${{ env.CHART_NAME }}
repository: ${{ env.REPO_OWNER }}
tag: ${{ github.event.inputs.chartVersion || steps.bump_version.outputs.next-version || '0.1.0' }}
app_version: ${{ steps.current_app_tag.outputs.tag || 'latest' }}
path: deploy/charts/${{ env.CHART_NAME }}
registry: ${{ env.REGISTRY }}
registry_username: ${{ github.actor }}
registry_password: ${{ secrets.GITHUB_TOKEN }}
update_dependencies: true
release: release:
name: "New LiteLLM Release" name: "New LiteLLM Release"
needs: [docker-hub-deploy, build-and-push-image, build-and-push-image-database] needs: [docker-hub-deploy, build-and-push-image, build-and-push-image-database]
@ -171,13 +231,13 @@ jobs:
RELEASE_NOTES: ${{ steps.release-notes.outputs.result }} RELEASE_NOTES: ${{ steps.release-notes.outputs.result }}
run: | run: |
curl -H "Content-Type: application/json" -X POST -d '{ curl -H "Content-Type: application/json" -X POST -d '{
"content": "||@everyone||", "content": "New LiteLLM release ${{ env.RELEASE_TAG }}",
"username": "Release Changelog", "username": "Release Changelog",
"avatar_url": "https://cdn.discordapp.com/avatars/487431320314576937/bd64361e4ba6313d561d54e78c9e7171.png", "avatar_url": "https://cdn.discordapp.com/avatars/487431320314576937/bd64361e4ba6313d561d54e78c9e7171.png",
"embeds": [ "embeds": [
{ {
"title": "Changelog for ${RELEASE_TAG}", "title": "Changelog for LiteLLM ${{ env.RELEASE_TAG }}",
"description": "${RELEASE_NOTES}", "description": "${{ env.RELEASE_NOTES }}",
"color": 2105893 "color": 2105893
} }
] ]

View file

@ -0,0 +1,91 @@
import csv
import os
from github import Github
def interpret_results(csv_file):
with open(csv_file, newline="") as csvfile:
csvreader = csv.DictReader(csvfile)
rows = list(csvreader)
"""
in this csv reader
- Create 1 new column "Status"
- if a row has a median response time < 300 and an average response time < 300, Status = "Passed ✅"
- if a row has a median response time >= 300 or an average response time >= 300, Status = "Failed ❌"
- Order the table in this order Name, Status, Median Response Time, Average Response Time, Requests/s,Failures/s, Min Response Time, Max Response Time, all other columns
"""
# Add a new column "Status"
for row in rows:
median_response_time = float(
row["Median Response Time"].strip().rstrip("ms")
)
average_response_time = float(
row["Average Response Time"].strip().rstrip("s")
)
request_count = int(row["Request Count"])
failure_count = int(row["Failure Count"])
failure_percent = round((failure_count / request_count) * 100, 2)
# Determine status based on conditions
if (
median_response_time < 300
and average_response_time < 300
and failure_percent < 5
):
row["Status"] = "Passed ✅"
else:
row["Status"] = "Failed ❌"
# Construct Markdown table header
markdown_table = "| Name | Status | Median Response Time (ms) | Average Response Time (ms) | Requests/s | Failures/s | Request Count | Failure Count | Min Response Time (ms) | Max Response Time (ms) |"
markdown_table += (
"\n| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |"
)
# Construct Markdown table rows
for row in rows:
markdown_table += f"\n| {row['Name']} | {row['Status']} | {row['Median Response Time']} | {row['Average Response Time']} | {row['Requests/s']} | {row['Failures/s']} | {row['Request Count']} | {row['Failure Count']} | {row['Min Response Time']} | {row['Max Response Time']} |"
print("markdown table: ", markdown_table)
return markdown_table
if __name__ == "__main__":
csv_file = "load_test_stats.csv" # Change this to the path of your CSV file
markdown_table = interpret_results(csv_file)
# Update release body with interpreted results
github_token = os.getenv("GITHUB_TOKEN")
g = Github(github_token)
repo = g.get_repo(
"BerriAI/litellm"
) # Replace with your repository's username and name
latest_release = repo.get_latest_release()
print("got latest release: ", latest_release)
print("latest release body: ", latest_release.body)
print("markdown table: ", markdown_table)
# check if "Load Test LiteLLM Proxy Results" exists
existing_release_body = latest_release.body
if "Load Test LiteLLM Proxy Results" in latest_release.body:
# find the "Load Test LiteLLM Proxy Results" section and delete it
start_index = latest_release.body.find("Load Test LiteLLM Proxy Results")
existing_release_body = latest_release.body[:start_index]
new_release_body = (
existing_release_body
+ "\n\n"
+ "## Load Test LiteLLM Proxy Results"
+ "\n\n"
+ markdown_table
)
print("new release body: ", new_release_body)
try:
latest_release.update_release(
name=latest_release.tag_name,
message=new_release_body,
)
except Exception as e:
print(e)

View file

@ -1,6 +1,11 @@
name: Test Locust Load Test name: Test Locust Load Test
on: [push] on:
workflow_run:
workflows: ["Build, Publish LiteLLM Docker Image. New Release"]
types:
- completed
workflow_dispatch:
jobs: jobs:
build: build:
@ -8,15 +13,32 @@ jobs:
steps: steps:
- name: Checkout - name: Checkout
uses: actions/checkout@v1 uses: actions/checkout@v1
- name: Setup Python
uses: actions/setup-python@v2
with:
python-version: '3.x'
- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install PyGithub
- name: Run Load Test - name: Run Load Test
id: locust_run id: locust_run
uses: BerriAI/locust-github-action@master uses: BerriAI/locust-github-action@master
with: with:
LOCUSTFILE: ".github/workflows/locustfile.py" LOCUSTFILE: ".github/workflows/locustfile.py"
URL: "https://litellm-api.up.railway.app/" URL: "https://litellm-database-docker-build-production.up.railway.app/"
USERS: "100" USERS: "100"
RATE: "10" RATE: "10"
RUNTIME: "60s" RUNTIME: "300s"
- name: Process Load Test Stats
run: |
echo "Current working directory: $PWD"
ls
python ".github/workflows/interpret_load_test.py"
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
working-directory: ${{ github.workspace }}
- name: Upload CSV as Asset to Latest Release - name: Upload CSV as Asset to Latest Release
uses: xresloader/upload-to-github-release@v1 uses: xresloader/upload-to-github-release@v1
env: env:

View file

@ -1,4 +1,6 @@
from locust import HttpUser, task, between from locust import HttpUser, task, between, events
import json
import time
class MyUser(HttpUser): class MyUser(HttpUser):
@ -8,7 +10,7 @@ class MyUser(HttpUser):
def chat_completion(self): def chat_completion(self):
headers = { headers = {
"Content-Type": "application/json", "Content-Type": "application/json",
"Authorization": f"Bearer sk-1234", "Authorization": f"Bearer sk-gUvTeN9g0sgHBMf9HeCaqA",
# Include any additional headers you may need for authentication, etc. # Include any additional headers you may need for authentication, etc.
} }
@ -26,3 +28,15 @@ class MyUser(HttpUser):
response = self.client.post("chat/completions", json=payload, headers=headers) response = self.client.post("chat/completions", json=payload, headers=headers)
# Print or log the response if needed # Print or log the response if needed
@task(10)
def health_readiness(self):
start_time = time.time()
response = self.client.get("health/readiness")
response_time = time.time() - start_time
@task(10)
def health_liveliness(self):
start_time = time.time()
response = self.client.get("health/liveliness")
response_time = time.time() - start_time

View file

@ -66,4 +66,4 @@ ENTRYPOINT ["litellm"]
# Append "--detailed_debug" to the end of CMD to view detailed debug logs # Append "--detailed_debug" to the end of CMD to view detailed debug logs
# CMD ["--port", "4000", "--config", "./proxy_server_config.yaml", "--run_gunicorn", "--detailed_debug"] # CMD ["--port", "4000", "--config", "./proxy_server_config.yaml", "--run_gunicorn", "--detailed_debug"]
CMD ["--port", "4000", "--config", "./proxy_server_config.yaml", "--run_gunicorn", "--num_workers", "1"] CMD ["--port", "4000", "--config", "./proxy_server_config.yaml", "--run_gunicorn", "--num_workers", "4"]

View file

@ -2,7 +2,7 @@
## Prerequisites ## Prerequisites
- Kubernetes 1.23+ - Kubernetes 1.21+
- Helm 3.8.0+ - Helm 3.8.0+
If `db.deployStandalone` is used: If `db.deployStandalone` is used:
@ -33,6 +33,7 @@ If `db.useStackgresOperator` is used (not yet implemented):
| `proxy_config.*` | See [values.yaml](./values.yaml) for default settings. See [example_config_yaml](../../../litellm/proxy/example_config_yaml/) for configuration examples. | N/A | | `proxy_config.*` | See [values.yaml](./values.yaml) for default settings. See [example_config_yaml](../../../litellm/proxy/example_config_yaml/) for configuration examples. | N/A |
#### Example `environmentSecrets` Secret #### Example `environmentSecrets` Secret
``` ```
apiVersion: v1 apiVersion: v1
kind: Secret kind: Secret

View file

@ -6,7 +6,6 @@ replicaCount: 1
image: image:
# Use "ghcr.io/berriai/litellm-database" for optimized image with database # Use "ghcr.io/berriai/litellm-database" for optimized image with database
# Alternatively, use "ghcr.io/berriai/litellm" for the default image
repository: ghcr.io/berriai/litellm-database repository: ghcr.io/berriai/litellm-database
pullPolicy: IfNotPresent pullPolicy: IfNotPresent
# Overrides the image tag whose default is the chart appVersion. # Overrides the image tag whose default is the chart appVersion.
@ -85,10 +84,13 @@ proxy_config:
litellm_params: litellm_params:
model: gpt-3.5-turbo model: gpt-3.5-turbo
api_key: eXaMpLeOnLy api_key: eXaMpLeOnLy
- model_name: fake-openai-endpoint
litellm_params:
model: openai/fake
api_key: fake-key
api_base: https://exampleopenaiendpoint-production.up.railway.app/
general_settings: general_settings:
master_key: os.environ/PROXY_MASTER_KEY master_key: os.environ/PROXY_MASTER_KEY
# litellm_settings:
# cache: true
resources: {} resources: {}
# We usually recommend not to specify default resources and to leave this as a conscious # We usually recommend not to specify default resources and to leave this as a conscious

View file

@ -77,9 +77,32 @@ litellm --config /path/to/config.yaml
### Test ### Test
<Tabs>
<TabItem value="curl" label="Curl">
```bash ```bash
curl --location 'http://0.0.0.0:4000/v1/audio/transcriptions' \ curl --location 'http://0.0.0.0:8000/v1/audio/transcriptions' \
--header 'Authorization: Bearer sk-1234' \ --header 'Authorization: Bearer sk-1234' \
--form 'file=@"/Users/krrishdholakia/Downloads/gettysburg.wav"' \ --form 'file=@"/Users/krrishdholakia/Downloads/gettysburg.wav"' \
--form 'model="whisper"' --form 'model="whisper"'
``` ```
</TabItem>
<TabItem value="openai" label="OpenAI">
```python
from openai import OpenAI
client = OpenAI(
api_key="sk-1234",
base_url="http://0.0.0.0:8000"
)
audio_file = open("speech.mp3", "rb")
transcript = client.audio.transcriptions.create(
model="whisper",
file=audio_file
)
```
</TabItem>
</Tabs>

View file

@ -133,3 +133,6 @@ chat(messages)
``` ```
</TabItem> </TabItem>
</Tabs> </Tabs>
## Use LangChain ChatLiteLLM + Langfuse
Check out this section [here](../observability/langfuse_integration#use-langchain-chatlitellm--langfuse) for more details on how to integrate Langfuse with ChatLiteLLM.

View file

@ -2,6 +2,54 @@ import Image from '@theme/IdealImage';
# 🔥 Load Test LiteLLM # 🔥 Load Test LiteLLM
## How to run a Locust load test on LiteLLM Proxy
1. Add `fake-openai-endpoint` to your proxy config.yaml and start your LiteLLM proxy
LiteLLM provides a free hosted `fake-openai-endpoint` you can load test against
```yaml
model_list:
- model_name: fake-openai-endpoint
litellm_params:
model: openai/fake
api_key: fake-key
api_base: https://exampleopenaiendpoint-production.up.railway.app/
```
2. `pip install locust`
3. Create a file called `locustfile.py` on your local machine. Copy the contents from the litellm load test located [here](https://github.com/BerriAI/litellm/blob/main/.github/workflows/locustfile.py) (a minimal sketch is also shown at the end of this walkthrough)
4. Start locust
Run `locust` in the same directory as your `locustfile.py` from step 3
```shell
locust
```
Output on terminal
```
[2024-03-15 07:19:58,893] Starting web interface at http://0.0.0.0:8089
[2024-03-15 07:19:58,898] Starting Locust 2.24.0
```
5. Run the load test in Locust
Head to the Locust UI at http://0.0.0.0:8089
Set Users=100, Ramp Up Users=10, and Host to the base URL of your LiteLLM Proxy
<Image img={require('../img/locust_load_test.png')} />
6. Expected Results
Expect to see the following response times for `/health/readiness`
Median → `/health/readiness` is `150ms`
Average → `/health/readiness` is `219ms`
<Image img={require('../img/litellm_load_test.png')} />
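For reference, a minimal `locustfile.py` could look like the hedged sketch below (the model name, endpoint path, and API key are placeholders; the repo's full locustfile also adds weighted health-check tasks):

```python
# Minimal sketch of a locustfile for the LiteLLM proxy load test.
# Assumptions: the proxy is set as the Locust "Host", it exposes /chat/completions,
# and "fake-openai-endpoint" plus the bearer key below are placeholders.
from locust import HttpUser, task, between


class ProxyUser(HttpUser):
    wait_time = between(1, 5)  # seconds between tasks per simulated user

    @task
    def chat_completion(self):
        headers = {
            "Content-Type": "application/json",
            "Authorization": "Bearer sk-1234",  # placeholder proxy key
        }
        payload = {
            "model": "fake-openai-endpoint",
            "messages": [{"role": "user", "content": "hello from the load test"}],
        }
        # Locust records response time and failures for this request automatically
        self.client.post("chat/completions", json=payload, headers=headers)
```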
## Load Test LiteLLM Proxy - 1500+ req/s ## Load Test LiteLLM Proxy - 1500+ req/s
## 1500+ concurrent requests/s ## 1500+ concurrent requests/s

View file

@ -132,6 +132,41 @@ print(response)
``` ```
### Use LangChain ChatLiteLLM + Langfuse
Pass `trace_user_id` and `session_id` in `model_kwargs`
```python
import os
from langchain.chat_models import ChatLiteLLM
from langchain.schema import HumanMessage
import litellm
# from https://cloud.langfuse.com/
os.environ["LANGFUSE_PUBLIC_KEY"] = ""
os.environ["LANGFUSE_SECRET_KEY"] = ""
os.environ['OPENAI_API_KEY']=""
# set langfuse as a callback, litellm will send the data to langfuse
litellm.success_callback = ["langfuse"]
chat = ChatLiteLLM(
model="gpt-3.5-turbo"
model_kwargs={
"metadata": {
"trace_user_id": "user-id2", # set langfuse Trace User ID
"session_id": "session-1" , # set langfuse Session ID
"tags": ["tag1", "tag2"]
}
}
)
messages = [
HumanMessage(
content="what model are you"
)
]
chat(messages)
```
## Troubleshooting & Errors ## Troubleshooting & Errors
### Data not getting logged to Langfuse ? ### Data not getting logged to Langfuse ?

View file

@ -49,7 +49,7 @@ for chunk in response:
| command-light | `completion('command-light', messages)` | | command-light | `completion('command-light', messages)` |
| command-medium | `completion('command-medium', messages)` | | command-medium | `completion('command-medium', messages)` |
| command-medium-beta | `completion('command-medium-beta', messages)` | | command-medium-beta | `completion('command-medium-beta', messages)` |
| command-xlarge-beta | `completion('command-xlarge-beta', messages)` | | command-xlarge-nightly | `completion('command-xlarge-nightly', messages)` |
| command-nightly | `completion('command-nightly', messages)` | | command-nightly | `completion('command-nightly', messages)` |

View file

@ -0,0 +1,53 @@
# Fireworks AI
https://fireworks.ai/
**We support ALL Fireworks AI models; just set `fireworks_ai/` as a prefix when sending completion requests.**
## API Key
```python
# env variable
os.environ['FIREWORKS_AI_API_KEY']
```
## Sample Usage
```python
from litellm import completion
import os
os.environ['FIREWORKS_AI_API_KEY'] = ""
response = completion(
model="fireworks_ai/mixtral-8x7b-instruct",
messages=[
{"role": "user", "content": "hello from litellm"}
],
)
print(response)
```
## Sample Usage - Streaming
```python
from litellm import completion
import os
os.environ['FIREWORKS_AI_API_KEY'] = ""
response = completion(
model="fireworks_ai/mixtral-8x7b-instruct",
messages=[
{"role": "user", "content": "hello from litellm"}
],
stream=True
)
for chunk in response:
print(chunk)
```
## Supported Models - ALL Fireworks AI Models Supported!
We support ALL Fireworks AI models; just set `fireworks_ai/` as a prefix when sending completion requests.
| Model Name | Function Call |
|--------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| mixtral-8x7b-instruct | `completion(model="fireworks_ai/mixtral-8x7b-instruct", messages)` |
| firefunction-v1 | `completion(model="fireworks_ai/firefunction-v1", messages)` |
| llama-v2-70b-chat | `completion(model="fireworks_ai/llama-v2-70b-chat", messages)` |

View file

@ -50,3 +50,4 @@ We support ALL Groq models, just set `groq/` as a prefix when sending completion
|--------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------| |--------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| llama2-70b-4096 | `completion(model="groq/llama2-70b-4096", messages)` | | llama2-70b-4096 | `completion(model="groq/llama2-70b-4096", messages)` |
| mixtral-8x7b-32768 | `completion(model="groq/mixtral-8x7b-32768", messages)` | | mixtral-8x7b-32768 | `completion(model="groq/mixtral-8x7b-32768", messages)` |
| gemma-7b-it | `completion(model="groq/gemma-7b-it", messages)` |

View file

@ -225,6 +225,32 @@ litellm_settings:
supported_call_types: ["acompletion", "completion", "embedding", "aembedding"] # defaults to all litellm call types supported_call_types: ["acompletion", "completion", "embedding", "aembedding"] # defaults to all litellm call types
``` ```
### Turn on `batch_redis_requests`
**What it does**
When a request is made:
- Check if a key starting with `litellm:<hashed_api_key>:<call_type>:` exists in-memory; if not, get the last 100 cached requests for this key and store them in-memory
- New requests are stored with this `litellm:..` prefix as the namespace
**Why?**
Reduces the number of Redis GET requests. This improved latency by 46% in prod load tests.
**Usage**
```yaml
litellm_settings:
cache: true
cache_params:
type: redis
... # remaining redis args (host, port, etc.)
callbacks: ["batch_redis_requests"] # 👈 KEY CHANGE!
```
[**SEE CODE**](https://github.com/BerriAI/litellm/blob/main/litellm/proxy/hooks/batch_redis_get.py)
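Conceptually, the batching described above works roughly like the hedged sketch below (the function, client, and in-memory store are illustrative only; the real hook lives in `litellm/proxy/hooks/batch_redis_get.py`, added later in this diff):

```python
# Illustrative sketch only: warm an in-memory dict from Redis once per key,
# instead of issuing a Redis GET for every request.
# `redis_client` and the key format are assumptions based on the docs above.
import redis

in_memory_cache: dict = {}


def warm_cache(redis_client: redis.Redis, hashed_api_key: str, call_type: str) -> None:
    namespace = f"litellm:{hashed_api_key}:{call_type}:"
    if any(k.startswith(namespace) for k in in_memory_cache):
        return  # already warmed for this key/call_type
    keys = [k.decode() for k in redis_client.scan_iter(match=namespace + "*", count=100)]
    if keys:
        values = redis_client.mget(keys)  # one round trip for up to 100 entries
        in_memory_cache.update(dict(zip(keys, values)))
```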
### Turn on / off caching per request. ### Turn on / off caching per request.
The proxy support 3 cache-controls: The proxy support 3 cache-controls:

View file

@ -150,17 +150,20 @@ To avoid issues with predictability, difficulties in rollback, and inconsistent
## Deploy with Database ## Deploy with Database
### Docker, Kubernetes, Helm Chart
<Tabs>
<TabItem value="docker-deploy" label="Dockerfile">
We maintain a [seperate Dockerfile](https://github.com/BerriAI/litellm/pkgs/container/litellm-database) for reducing build time when running LiteLLM proxy with a connected Postgres Database We maintain a [seperate Dockerfile](https://github.com/BerriAI/litellm/pkgs/container/litellm-database) for reducing build time when running LiteLLM proxy with a connected Postgres Database
<Tabs> ```shell
<TabItem value="docker-deploy" label="Dockerfile">
```
docker pull docker pull ghcr.io/berriai/litellm-database:main-latest docker pull docker pull ghcr.io/berriai/litellm-database:main-latest
``` ```
``` ```shell
docker run --name litellm-proxy \ docker run --name litellm-proxy \
-e DATABASE_URL=postgresql://<user>:<password>@<host>:<port>/<dbname> \ -e DATABASE_URL=postgresql://<user>:<password>@<host>:<port>/<dbname> \
-p 4000:4000 \ -p 4000:4000 \
@ -233,6 +236,8 @@ Your OpenAI proxy server is now running on `http://0.0.0.0:4000`.
</TabItem> </TabItem>
<TabItem value="helm-deploy" label="Helm"> <TabItem value="helm-deploy" label="Helm">
Use this to deploy litellm using a helm chart. Link to [the LiteLLM Helm Chart](https://github.com/BerriAI/litellm/tree/main/deploy/charts/litellm)
#### Step 1. Clone the repository #### Step 1. Clone the repository
```bash ```bash
@ -241,9 +246,11 @@ git clone https://github.com/BerriAI/litellm.git
#### Step 2. Deploy with Helm #### Step 2. Deploy with Helm
Run the following command in the root of your `litellm` repo. This will set the litellm proxy master key as `sk-1234`
```bash ```bash
helm install \ helm install \
--set masterkey=SuPeRsEcReT \ --set masterkey=sk-1234 \
mydeploy \ mydeploy \
deploy/charts/litellm deploy/charts/litellm
``` ```
@ -259,6 +266,9 @@ kubectl \
Your OpenAI proxy server is now running on `http://127.0.0.1:4000`. Your OpenAI proxy server is now running on `http://127.0.0.1:4000`.
If you need to set your litellm proxy config.yaml, you can find this in [values.yaml](https://github.com/BerriAI/litellm/blob/main/deploy/charts/litellm/values.yaml)
</TabItem> </TabItem>
</Tabs> </Tabs>

Binary file not shown. After: 125 KiB

Binary file not shown. After: 204 KiB

View file

@ -138,6 +138,7 @@ const sidebars = {
"providers/ollama", "providers/ollama",
"providers/perplexity", "providers/perplexity",
"providers/groq", "providers/groq",
"providers/fireworks_ai",
"providers/vllm", "providers/vllm",
"providers/xinference", "providers/xinference",
"providers/cloudflare_workers", "providers/cloudflare_workers",

enterprise/__init__.py Normal file (1 addition)
View file

@ -0,0 +1 @@
from . import *

View file

@ -36,6 +36,7 @@ token: Optional[str] = (
telemetry = True telemetry = True
max_tokens = 256 # OpenAI Defaults max_tokens = 256 # OpenAI Defaults
drop_params = False drop_params = False
modify_params = False
retry = True retry = True
api_key: Optional[str] = None api_key: Optional[str] = None
openai_key: Optional[str] = None openai_key: Optional[str] = None
@ -327,6 +328,7 @@ openai_compatible_providers: List = [
"perplexity", "perplexity",
"xinference", "xinference",
"together_ai", "together_ai",
"fireworks_ai",
] ]
@ -478,6 +480,7 @@ provider_list: List = [
"voyage", "voyage",
"cloudflare", "cloudflare",
"xinference", "xinference",
"fireworks_ai",
"custom", # custom apis "custom", # custom apis
] ]

View file

@ -129,6 +129,16 @@ class RedisCache(BaseCache):
f"LiteLLM Caching: set() - Got exception from REDIS : {str(e)}" f"LiteLLM Caching: set() - Got exception from REDIS : {str(e)}"
) )
async def async_scan_iter(self, pattern: str, count: int = 100) -> list:
keys = []
_redis_client = self.init_async_client()
async with _redis_client as redis_client:
async for key in redis_client.scan_iter(match=pattern + "*", count=count):
keys.append(key)
if len(keys) >= count:
break
return keys
async def async_set_cache(self, key, value, **kwargs): async def async_set_cache(self, key, value, **kwargs):
_redis_client = self.init_async_client() _redis_client = self.init_async_client()
async with _redis_client as redis_client: async with _redis_client as redis_client:
@ -140,6 +150,9 @@ class RedisCache(BaseCache):
await redis_client.set( await redis_client.set(
name=key, value=json.dumps(value), ex=ttl, get=True name=key, value=json.dumps(value), ex=ttl, get=True
) )
print_verbose(
f"Successfully Set ASYNC Redis Cache: key: {key}\nValue {value}\nttl={ttl}"
)
except Exception as e: except Exception as e:
# NON blocking - notify users Redis is throwing an exception # NON blocking - notify users Redis is throwing an exception
print_verbose( print_verbose(
@ -172,8 +185,6 @@ class RedisCache(BaseCache):
return results return results
except Exception as e: except Exception as e:
print_verbose(f"Error occurred in pipeline write - {str(e)}") print_verbose(f"Error occurred in pipeline write - {str(e)}")
# NON blocking - notify users Redis is throwing an exception
logging.debug("LiteLLM Caching: set() - Got exception from REDIS : ", e)
def _get_cache_logic(self, cached_response: Any): def _get_cache_logic(self, cached_response: Any):
""" """
@ -208,7 +219,7 @@ class RedisCache(BaseCache):
_redis_client = self.init_async_client() _redis_client = self.init_async_client()
async with _redis_client as redis_client: async with _redis_client as redis_client:
try: try:
print_verbose(f"Get Redis Cache: key: {key}") print_verbose(f"Get Async Redis Cache: key: {key}")
cached_response = await redis_client.get(key) cached_response = await redis_client.get(key)
print_verbose( print_verbose(
f"Got Async Redis Cache: key: {key}, cached_response {cached_response}" f"Got Async Redis Cache: key: {key}, cached_response {cached_response}"
@ -217,8 +228,39 @@ class RedisCache(BaseCache):
return response return response
except Exception as e: except Exception as e:
# NON blocking - notify users Redis is throwing an exception # NON blocking - notify users Redis is throwing an exception
traceback.print_exc() print_verbose(
logging.debug("LiteLLM Caching: get() - Got exception from REDIS: ", e) f"LiteLLM Caching: async get() - Got exception from REDIS: {str(e)}"
)
async def async_get_cache_pipeline(self, key_list) -> dict:
"""
Use Redis for bulk read operations
"""
_redis_client = await self.init_async_client()
key_value_dict = {}
try:
async with _redis_client as redis_client:
async with redis_client.pipeline(transaction=True) as pipe:
# Queue the get operations in the pipeline for all keys.
for cache_key in key_list:
pipe.get(cache_key) # Queue GET command in pipeline
# Execute the pipeline and await the results.
results = await pipe.execute()
# Associate the results back with their keys.
# 'results' is a list of values corresponding to the order of keys in 'key_list'.
key_value_dict = dict(zip(key_list, results))
decoded_results = {
k.decode("utf-8"): self._get_cache_logic(v)
for k, v in key_value_dict.items()
}
return decoded_results
except Exception as e:
print_verbose(f"Error occurred in pipeline read - {str(e)}")
return key_value_dict
def flush_cache(self): def flush_cache(self):
self.redis_client.flushall() self.redis_client.flushall()
@ -1001,6 +1043,10 @@ class Cache:
if self.namespace is not None: if self.namespace is not None:
hash_hex = f"{self.namespace}:{hash_hex}" hash_hex = f"{self.namespace}:{hash_hex}"
print_verbose(f"Hashed Key with Namespace: {hash_hex}") print_verbose(f"Hashed Key with Namespace: {hash_hex}")
elif kwargs.get("metadata", {}).get("redis_namespace", None) is not None:
_namespace = kwargs.get("metadata", {}).get("redis_namespace", None)
hash_hex = f"{_namespace}:{hash_hex}"
print_verbose(f"Hashed Key with Namespace: {hash_hex}")
return hash_hex return hash_hex
def generate_streaming_content(self, content): def generate_streaming_content(self, content):
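A hedged sketch of how the new `async_scan_iter` and `async_get_cache_pipeline` helpers above might be used together (host, port, and the key prefix are assumptions):

```python
# Sketch only: scan for keys under a namespace, then bulk-read them in one
# Redis pipeline. Connection details and the key prefix are placeholders.
import asyncio
from litellm.caching import RedisCache


async def main():
    cache = RedisCache(host="localhost", port=6379, password=None)
    keys = await cache.async_scan_iter(pattern="litellm:hashed-key:acompletion:", count=100)
    if keys:
        key_value_dict = await cache.async_get_cache_pipeline(key_list=keys)
        print(key_value_dict)


asyncio.run(main())
```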

View file

@ -82,12 +82,22 @@ class AmazonAnthropicClaude3Config:
Supported Params for the Amazon / Anthropic Claude 3 models: Supported Params for the Amazon / Anthropic Claude 3 models:
- `max_tokens` (integer) max tokens, - `max_tokens` Required (integer) max tokens,
- `anthropic_version` (string) version of anthropic for bedrock - e.g. "bedrock-2023-05-31" - `anthropic_version` Required (string) version of anthropic for bedrock - e.g. "bedrock-2023-05-31"
- `system` Optional (string) the system prompt, conversion from openai format to this is handled in factory.py
- `temperature` Optional (float) The amount of randomness injected into the response
- `top_p` Optional (float) Use nucleus sampling.
- `top_k` Optional (int) Only sample from the top K options for each subsequent token
- `stop_sequences` Optional (List[str]) Custom text sequences that cause the model to stop generating
""" """
max_tokens: Optional[int] = litellm.max_tokens max_tokens: Optional[int] = litellm.max_tokens
anthropic_version: Optional[str] = "bedrock-2023-05-31" anthropic_version: Optional[str] = "bedrock-2023-05-31"
system: Optional[str] = None
temperature: Optional[float] = None
top_p: Optional[float] = None
top_k: Optional[int] = None
stop_sequences: Optional[List[str]] = None
def __init__( def __init__(
self, self,
@ -128,6 +138,12 @@ class AmazonAnthropicClaude3Config:
optional_params["tools"] = value optional_params["tools"] = value
if param == "stream": if param == "stream":
optional_params["stream"] = value optional_params["stream"] = value
if param == "stop":
optional_params["stop_sequences"] = value
if param == "temperature":
optional_params["temperature"] = value
if param == "top_p":
optional_params["top_p"] = value
return optional_params return optional_params
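To illustrate the new parameter mapping above, a hedged usage sketch (the Bedrock model ID is an assumption and AWS credentials are expected in the environment):

```python
# Sketch: OpenAI-style params passed to litellm are expected to map onto the
# Bedrock Anthropic Claude 3 body per the config above (stop -> stop_sequences,
# temperature/top_p pass through). Model ID and credentials are assumptions.
from litellm import completion

response = completion(
    model="bedrock/anthropic.claude-3-sonnet-20240229-v1:0",
    messages=[{"role": "user", "content": "Say hello"}],
    max_tokens=100,
    temperature=0.2,
    top_p=0.9,
    stop=["Human:"],  # forwarded as stop_sequences
)
print(response)
```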

View file

@ -300,8 +300,7 @@ def embedding(
for text in input: for text in input:
input_tokens += len(encoding.encode(text)) input_tokens += len(encoding.encode(text))
model_response["usage"] = { model_response["usage"] = Usage(
"prompt_tokens": input_tokens, prompt_tokens=input_tokens, completion_tokens=0, total_tokens=input_tokens
"total_tokens": input_tokens, )
}
return model_response return model_response

View file

@ -705,6 +705,7 @@ def anthropic_messages_pt(messages: list):
"text" "text"
].rstrip() # no trailing whitespace for final assistant message ].rstrip() # no trailing whitespace for final assistant message
return new_messages return new_messages

View file

@ -12,6 +12,7 @@ from typing import Any, Literal, Union, BinaryIO
from functools import partial from functools import partial
import dotenv, traceback, random, asyncio, time, contextvars import dotenv, traceback, random, asyncio, time, contextvars
from copy import deepcopy from copy import deepcopy
import httpx import httpx
import litellm import litellm
from ._logging import verbose_logger from ._logging import verbose_logger
@ -891,6 +892,7 @@ def completion(
or custom_llm_provider == "mistral" or custom_llm_provider == "mistral"
or custom_llm_provider == "openai" or custom_llm_provider == "openai"
or custom_llm_provider == "together_ai" or custom_llm_provider == "together_ai"
or custom_llm_provider in litellm.openai_compatible_providers
or "ft:gpt-3.5-turbo" in model # finetune gpt-3.5-turbo or "ft:gpt-3.5-turbo" in model # finetune gpt-3.5-turbo
): # allow user to make an openai call with a custom base ): # allow user to make an openai call with a custom base
# note: if a user sets a custom base - we should ensure this works # note: if a user sets a custom base - we should ensure this works
@ -2393,6 +2395,7 @@ async def aembedding(*args, **kwargs):
or custom_llm_provider == "deepinfra" or custom_llm_provider == "deepinfra"
or custom_llm_provider == "perplexity" or custom_llm_provider == "perplexity"
or custom_llm_provider == "groq" or custom_llm_provider == "groq"
or custom_llm_provider == "fireworks_ai"
or custom_llm_provider == "ollama" or custom_llm_provider == "ollama"
or custom_llm_provider == "vertex_ai" or custom_llm_provider == "vertex_ai"
): # currently implemented aiohttp calls for just azure and openai, soon all. ): # currently implemented aiohttp calls for just azure and openai, soon all.
@ -2892,6 +2895,7 @@ async def atext_completion(*args, **kwargs):
or custom_llm_provider == "deepinfra" or custom_llm_provider == "deepinfra"
or custom_llm_provider == "perplexity" or custom_llm_provider == "perplexity"
or custom_llm_provider == "groq" or custom_llm_provider == "groq"
or custom_llm_provider == "fireworks_ai"
or custom_llm_provider == "text-completion-openai" or custom_llm_provider == "text-completion-openai"
or custom_llm_provider == "huggingface" or custom_llm_provider == "huggingface"
or custom_llm_provider == "ollama" or custom_llm_provider == "ollama"

View file

@ -631,6 +631,13 @@
"litellm_provider": "groq", "litellm_provider": "groq",
"mode": "chat" "mode": "chat"
}, },
"groq/gemma-7b-it": {
"max_tokens": 8192,
"input_cost_per_token": 0.00000010,
"output_cost_per_token": 0.00000010,
"litellm_provider": "groq",
"mode": "chat"
},
"claude-instant-1.2": { "claude-instant-1.2": {
"max_tokens": 100000, "max_tokens": 100000,
"max_output_tokens": 8191, "max_output_tokens": 8191,

View file

@ -9,6 +9,12 @@ model_list:
model: gpt-3.5-turbo-1106 model: gpt-3.5-turbo-1106
api_key: os.environ/OPENAI_API_KEY api_key: os.environ/OPENAI_API_KEY
litellm_settings:
cache: true
cache_params:
type: redis
callbacks: ["batch_redis_requests"]
general_settings: general_settings:
master_key: sk-1234 master_key: sk-1234
database_url: "postgresql://krrishdholakia:9yQkKWiB8vVs@ep-icy-union-a5j4dwls.us-east-2.aws.neon.tech/neondb?sslmode=require" # database_url: "postgresql://krrishdholakia:9yQkKWiB8vVs@ep-icy-union-a5j4dwls.us-east-2.aws.neon.tech/neondb?sslmode=require"

View file

@ -0,0 +1,124 @@
# What this does
## Gets a key's Redis cache and stores it in memory for 1 minute.
## This reduces the number of Redis GET requests made during high traffic by the proxy.
### [BETA] This is in beta and might change.
from typing import Optional, Literal
import litellm
from litellm.caching import DualCache, RedisCache, InMemoryCache
from litellm.proxy._types import UserAPIKeyAuth
from litellm.integrations.custom_logger import CustomLogger
from litellm._logging import verbose_proxy_logger
from fastapi import HTTPException
import json, traceback
class _PROXY_BatchRedisRequests(CustomLogger):
# Class variables or attributes
in_memory_cache: Optional[InMemoryCache] = None
def __init__(self):
litellm.cache.async_get_cache = (
self.async_get_cache
) # map the litellm 'get_cache' function to our custom function
def print_verbose(
self, print_statement, debug_level: Literal["INFO", "DEBUG"] = "DEBUG"
):
if debug_level == "DEBUG":
verbose_proxy_logger.debug(print_statement)
elif debug_level == "INFO":
verbose_proxy_logger.debug(print_statement)
if litellm.set_verbose is True:
print(print_statement) # noqa
async def async_pre_call_hook(
self,
user_api_key_dict: UserAPIKeyAuth,
cache: DualCache,
data: dict,
call_type: str,
):
try:
"""
Get the user key
Check if a key starting with `litellm:<api_key>:<call_type>:` exists in-memory
If no, then get relevant cache from redis
"""
api_key = user_api_key_dict.api_key
cache_key_name = f"litellm:{api_key}:{call_type}"
self.in_memory_cache = cache.in_memory_cache
key_value_dict = {}
in_memory_cache_exists = False
for key in cache.in_memory_cache.cache_dict.keys():
if isinstance(key, str) and key.startswith(cache_key_name):
in_memory_cache_exists = True
if in_memory_cache_exists == False and litellm.cache is not None:
"""
- Check if `litellm.Cache` is redis
- Get the relevant values
"""
if litellm.cache.type is not None and isinstance(
litellm.cache.cache, RedisCache
):
# Initialize an empty list to store the keys
keys = []
self.print_verbose(f"cache_key_name: {cache_key_name}")
# Use the SCAN iterator to fetch keys matching the pattern
keys = await litellm.cache.cache.async_scan_iter(
pattern=cache_key_name, count=100
)
# If you need the truly "last" based on time or another criteria,
# ensure your key naming or storage strategy allows this determination
# Here you would sort or filter the keys as needed based on your strategy
self.print_verbose(f"redis keys: {keys}")
if len(keys) > 0:
key_value_dict = (
await litellm.cache.cache.async_get_cache_pipeline(
key_list=keys
)
)
## Add to cache
if len(key_value_dict.items()) > 0:
await cache.in_memory_cache.async_set_cache_pipeline(
cache_list=list(key_value_dict.items()), ttl=60
)
## Set cache namespace if it's a miss
data["metadata"]["redis_namespace"] = cache_key_name
except HTTPException as e:
raise e
except Exception as e:
traceback.print_exc()
async def async_get_cache(self, *args, **kwargs):
"""
- Check if the cache key is in-memory
- Else return None
"""
try: # never block execution
if "cache_key" in kwargs:
cache_key = kwargs["cache_key"]
else:
cache_key = litellm.cache.get_cache_key(
*args, **kwargs
) # returns "<cache_key_name>:<hash>" - we pass redis_namespace in async_pre_call_hook. Done to avoid rewriting the async_set_cache logic
if cache_key is not None and self.in_memory_cache is not None:
cache_control_args = kwargs.get("cache", {})
max_age = cache_control_args.get(
"s-max-age", cache_control_args.get("s-maxage", float("inf"))
)
cached_result = self.in_memory_cache.get_cache(
cache_key, *args, **kwargs
)
return litellm.cache._get_cache_logic(
cached_result=cached_result, max_age=max_age
)
except Exception as e:
return None

View file

@ -6,7 +6,7 @@ import time
class MyUser(HttpUser): class MyUser(HttpUser):
wait_time = between(1, 5) wait_time = between(1, 5)
@task @task(3)
def chat_completion(self): def chat_completion(self):
headers = { headers = {
"Content-Type": "application/json", "Content-Type": "application/json",
@ -31,62 +31,8 @@ class MyUser(HttpUser):
@task(10) @task(10)
def health_readiness(self): def health_readiness(self):
start_time = time.time()
response = self.client.get("health/readiness") response = self.client.get("health/readiness")
response_time = time.time() - start_time
if response_time > 1:
events.request_failure.fire(
request_type="GET",
name="health/readiness",
response_time=response_time,
exception=None,
response=response,
)
@task(10) @task(10)
def health_liveliness(self): def health_liveliness(self):
start_time = time.time()
response = self.client.get("health/liveliness") response = self.client.get("health/liveliness")
response_time = time.time() - start_time
if response_time > 1:
events.request_failure.fire(
request_type="GET",
name="health/liveliness",
response_time=response_time,
exception=None,
response=response,
)
# @task
# def key_generate(self):
# headers = {
# "Authorization": "Bearer sk-1234",
# "Content-Type": "application/json",
# }
# payload = {
# "models": ["gpt-3.5-turbo", "gpt-4", "claude-2"],
# "duration": "20m",
# "metadata": {"user": "ishaan@berri.ai"},
# "team_id": "core-infra",
# "max_budget": 10,
# "soft_budget": 5,
# }
# response = self.client.post("key/generate", json=payload, headers=headers)
# if response.status_code == 200:
# key_response = response.json()
# models = key_response.get("models", [])
# if models:
# # Use the first model from the key generation response to make a chat completions request
# model_to_use = models[0]
# chat_payload = {
# "model": model_to_use,
# "messages": [
# {"role": "system", "content": "You are a chat bot."},
# {"role": "user", "content": "Hello, how are you?"},
# ],
# }
# chat_response = self.client.post("chat/completions", json=chat_payload, headers=headers)
# # Print or log the chat response if needed

View file

@ -8,7 +8,6 @@ import hashlib, uuid
import warnings import warnings
import importlib import importlib
import warnings import warnings
import backoff
def showwarning(message, category, filename, lineno, file=None, line=None): def showwarning(message, category, filename, lineno, file=None, line=None):
@ -35,7 +34,6 @@ try:
import orjson import orjson
import logging import logging
from apscheduler.schedulers.asyncio import AsyncIOScheduler from apscheduler.schedulers.asyncio import AsyncIOScheduler
from argon2 import PasswordHasher
except ImportError as e: except ImportError as e:
raise ImportError(f"Missing dependency {e}. Run `pip install 'litellm[proxy]'`") raise ImportError(f"Missing dependency {e}. Run `pip install 'litellm[proxy]'`")
@ -145,9 +143,12 @@ from typing import Union
try: try:
# when using litellm cli # when using litellm cli
import litellm.proxy.enterprise as enterprise import litellm.proxy.enterprise as enterprise
except: except Exception as e:
# when using litellm docker image # when using litellm docker image
try:
import enterprise # type: ignore import enterprise # type: ignore
except Exception as e:
pass
ui_link = f"/ui/" ui_link = f"/ui/"
ui_message = ( ui_message = (
@ -252,7 +253,6 @@ user_headers = None
user_config_file_path = f"config_{int(time.time())}.yaml" user_config_file_path = f"config_{int(time.time())}.yaml"
local_logging = True # writes logs to a local api_log.json file for debugging local_logging = True # writes logs to a local api_log.json file for debugging
experimental = False experimental = False
ph = PasswordHasher()
#### GLOBAL VARIABLES #### #### GLOBAL VARIABLES ####
llm_router: Optional[litellm.Router] = None llm_router: Optional[litellm.Router] = None
llm_model_list: Optional[list] = None llm_model_list: Optional[list] = None
@ -382,7 +382,7 @@ async def user_api_key_auth(
return valid_token return valid_token
try: try:
is_master_key_valid = ph.verify(litellm_master_key_hash, api_key) is_master_key_valid = secrets.compare_digest(api_key, master_key)
except Exception as e: except Exception as e:
is_master_key_valid = False is_master_key_valid = False
@ -887,6 +887,9 @@ async def user_api_key_auth(
raise Exception( raise Exception(
f"This key is made for LiteLLM UI, Tried to access route: {route}. Not allowed" f"This key is made for LiteLLM UI, Tried to access route: {route}. Not allowed"
) )
if valid_token is None:
# No token was found when looking up in the DB
raise Exception("Invalid token passed")
if valid_token_dict is not None: if valid_token_dict is not None:
return UserAPIKeyAuth(api_key=api_key, **valid_token_dict) return UserAPIKeyAuth(api_key=api_key, **valid_token_dict)
else: else:
@ -1420,6 +1423,8 @@ async def update_cache(
try: try:
for _id in user_ids: for _id in user_ids:
# Fetch the existing cost for the given user # Fetch the existing cost for the given user
if _id is None:
continue
existing_spend_obj = await user_api_key_cache.async_get_cache(key=_id) existing_spend_obj = await user_api_key_cache.async_get_cache(key=_id)
if existing_spend_obj is None: if existing_spend_obj is None:
# if user does not exist in LiteLLM_UserTable, create a new user # if user does not exist in LiteLLM_UserTable, create a new user
@ -1791,6 +1796,16 @@ class ProxyConfig:
_ENTERPRISE_PromptInjectionDetection() _ENTERPRISE_PromptInjectionDetection()
) )
imported_list.append(prompt_injection_detection_obj) imported_list.append(prompt_injection_detection_obj)
elif (
isinstance(callback, str)
and callback == "batch_redis_requests"
):
from litellm.proxy.hooks.batch_redis_get import (
_PROXY_BatchRedisRequests,
)
batch_redis_obj = _PROXY_BatchRedisRequests()
imported_list.append(batch_redis_obj)
else: else:
imported_list.append( imported_list.append(
get_instance_fn( get_instance_fn(
@ -1913,7 +1928,7 @@ class ProxyConfig:
master_key = litellm.get_secret(master_key) master_key = litellm.get_secret(master_key)
if master_key is not None and isinstance(master_key, str): if master_key is not None and isinstance(master_key, str):
litellm_master_key_hash = ph.hash(master_key) litellm_master_key_hash = master_key
### CUSTOM API KEY AUTH ### ### CUSTOM API KEY AUTH ###
## pass filepath ## pass filepath
custom_auth = general_settings.get("custom_auth", None) custom_auth = general_settings.get("custom_auth", None)

View file

@ -474,11 +474,10 @@ def test_redis_cache_completion_stream():
# test_redis_cache_completion_stream() # test_redis_cache_completion_stream()
def test_redis_cache_acompletion_stream(): @pytest.mark.asyncio
import asyncio async def test_redis_cache_acompletion_stream():
try: try:
litellm.set_verbose = False litellm.set_verbose = True
random_word = generate_random_word() random_word = generate_random_word()
messages = [ messages = [
{ {
@ -496,8 +495,6 @@ def test_redis_cache_acompletion_stream():
response_1_content = "" response_1_content = ""
response_2_content = "" response_2_content = ""
async def call1():
nonlocal response_1_content
response1 = await litellm.acompletion( response1 = await litellm.acompletion(
model="gpt-3.5-turbo", model="gpt-3.5-turbo",
messages=messages, messages=messages,
@ -509,12 +506,9 @@ def test_redis_cache_acompletion_stream():
response_1_content += chunk.choices[0].delta.content or "" response_1_content += chunk.choices[0].delta.content or ""
print(response_1_content) print(response_1_content)
asyncio.run(call1())
time.sleep(0.5) time.sleep(0.5)
print("\n\n Response 1 content: ", response_1_content, "\n\n") print("\n\n Response 1 content: ", response_1_content, "\n\n")
async def call2():
nonlocal response_2_content
response2 = await litellm.acompletion( response2 = await litellm.acompletion(
model="gpt-3.5-turbo", model="gpt-3.5-turbo",
messages=messages, messages=messages,
@ -526,7 +520,6 @@ def test_redis_cache_acompletion_stream():
response_2_content += chunk.choices[0].delta.content or "" response_2_content += chunk.choices[0].delta.content or ""
print(response_2_content) print(response_2_content)
asyncio.run(call2())
print("\nresponse 1", response_1_content) print("\nresponse 1", response_1_content)
print("\nresponse 2", response_2_content) print("\nresponse 2", response_2_content)
assert ( assert (
@ -536,14 +529,15 @@ def test_redis_cache_acompletion_stream():
litellm.success_callback = [] litellm.success_callback = []
litellm._async_success_callback = [] litellm._async_success_callback = []
except Exception as e: except Exception as e:
print(e) print(f"{str(e)}\n\n{traceback.format_exc()}")
raise e raise e
# test_redis_cache_acompletion_stream() # test_redis_cache_acompletion_stream()
def test_redis_cache_acompletion_stream_bedrock(): @pytest.mark.asyncio
async def test_redis_cache_acompletion_stream_bedrock():
import asyncio import asyncio
try: try:
@ -565,8 +559,6 @@ def test_redis_cache_acompletion_stream_bedrock():
response_1_content = "" response_1_content = ""
response_2_content = "" response_2_content = ""
async def call1():
nonlocal response_1_content
response1 = await litellm.acompletion( response1 = await litellm.acompletion(
model="bedrock/anthropic.claude-v2", model="bedrock/anthropic.claude-v2",
messages=messages, messages=messages,
@ -579,12 +571,9 @@ def test_redis_cache_acompletion_stream_bedrock():
response_1_content += chunk.choices[0].delta.content or "" response_1_content += chunk.choices[0].delta.content or ""
print(response_1_content) print(response_1_content)
asyncio.run(call1())
time.sleep(0.5) time.sleep(0.5)
print("\n\n Response 1 content: ", response_1_content, "\n\n") print("\n\n Response 1 content: ", response_1_content, "\n\n")
async def call2():
nonlocal response_2_content
response2 = await litellm.acompletion( response2 = await litellm.acompletion(
model="bedrock/anthropic.claude-v2", model="bedrock/anthropic.claude-v2",
messages=messages, messages=messages,
@ -597,7 +586,6 @@ def test_redis_cache_acompletion_stream_bedrock():
response_2_content += chunk.choices[0].delta.content or "" response_2_content += chunk.choices[0].delta.content or ""
print(response_2_content) print(response_2_content)
asyncio.run(call2())
print("\nresponse 1", response_1_content) print("\nresponse 1", response_1_content)
print("\nresponse 2", response_2_content) print("\nresponse 2", response_2_content)
assert ( assert (
@ -612,8 +600,8 @@ def test_redis_cache_acompletion_stream_bedrock():
raise e raise e
@pytest.mark.skip(reason="AWS Suspended Account") @pytest.mark.asyncio
def test_s3_cache_acompletion_stream_azure(): async def test_s3_cache_acompletion_stream_azure():
import asyncio import asyncio
try: try:
@ -637,8 +625,6 @@ def test_s3_cache_acompletion_stream_azure():
response_1_created = "" response_1_created = ""
response_2_created = "" response_2_created = ""
async def call1():
nonlocal response_1_content, response_1_created
response1 = await litellm.acompletion( response1 = await litellm.acompletion(
model="azure/chatgpt-v-2", model="azure/chatgpt-v-2",
messages=messages, messages=messages,
@ -652,12 +638,9 @@ def test_s3_cache_acompletion_stream_azure():
response_1_content += chunk.choices[0].delta.content or "" response_1_content += chunk.choices[0].delta.content or ""
print(response_1_content) print(response_1_content)
asyncio.run(call1())
time.sleep(0.5) time.sleep(0.5)
print("\n\n Response 1 content: ", response_1_content, "\n\n") print("\n\n Response 1 content: ", response_1_content, "\n\n")
async def call2():
nonlocal response_2_content, response_2_created
response2 = await litellm.acompletion( response2 = await litellm.acompletion(
model="azure/chatgpt-v-2", model="azure/chatgpt-v-2",
messages=messages, messages=messages,
@ -671,7 +654,6 @@ def test_s3_cache_acompletion_stream_azure():
response_2_created = chunk.created response_2_created = chunk.created
print(response_2_content) print(response_2_content)
asyncio.run(call2())
print("\nresponse 1", response_1_content) print("\nresponse 1", response_1_content)
print("\nresponse 2", response_2_content) print("\nresponse 2", response_2_content)

View file

@ -358,7 +358,7 @@ def test_completion_mistral_azure():
} }
], ],
) )
# Add any assertions here to check the response # Add any assertions here to check, the response
print(response) print(response)
except Exception as e: except Exception as e:
@ -575,6 +575,25 @@ def test_completion_azure_gpt4_vision():
# test_completion_azure_gpt4_vision() # test_completion_azure_gpt4_vision()
def test_completion_fireworks_ai():
try:
litellm.set_verbose = True
messages = [
{"role": "system", "content": "You're a good bot"},
{
"role": "user",
"content": "Hey",
},
]
response = completion(
model="fireworks_ai/accounts/fireworks/models/mixtral-8x7b-instruct",
messages=messages,
)
print(response)
except Exception as e:
pytest.fail(f"Error occurred: {e}")
@pytest.mark.skip(reason="this test is flaky") @pytest.mark.skip(reason="this test is flaky")
def test_completion_perplexity_api(): def test_completion_perplexity_api():
try: try:

View file

@ -97,27 +97,23 @@ class TmpFunction:
) )
def test_async_chat_openai_stream(): @pytest.mark.asyncio
async def test_async_chat_openai_stream():
try: try:
tmp_function = TmpFunction() tmp_function = TmpFunction()
litellm.set_verbose = True litellm.set_verbose = True
litellm.success_callback = [tmp_function.async_test_logging_fn] litellm.success_callback = [tmp_function.async_test_logging_fn]
complete_streaming_response = "" complete_streaming_response = ""
async def call_gpt():
nonlocal complete_streaming_response
response = await litellm.acompletion( response = await litellm.acompletion(
model="gpt-3.5-turbo", model="gpt-3.5-turbo",
messages=[{"role": "user", "content": "Hi 👋 - i'm openai"}], messages=[{"role": "user", "content": "Hi 👋 - i'm openai"}],
stream=True, stream=True,
) )
async for chunk in response: async for chunk in response:
complete_streaming_response += ( complete_streaming_response += chunk["choices"][0]["delta"]["content"] or ""
chunk["choices"][0]["delta"]["content"] or ""
)
print(complete_streaming_response) print(complete_streaming_response)
asyncio.run(call_gpt())
complete_streaming_response = complete_streaming_response.strip("'") complete_streaming_response = complete_streaming_response.strip("'")
response1 = tmp_function.complete_streaming_response_in_callback["choices"][0][ response1 = tmp_function.complete_streaming_response_in_callback["choices"][0][
"message" "message"
@ -130,7 +126,7 @@ def test_async_chat_openai_stream():
assert tmp_function.async_success == True assert tmp_function.async_success == True
except Exception as e: except Exception as e:
print(e) print(e)
pytest.fail(f"An error occurred - {str(e)}") pytest.fail(f"An error occurred - {str(e)}\n\n{traceback.format_exc()}")
# test_async_chat_openai_stream() # test_async_chat_openai_stream()

View file

@ -117,6 +117,8 @@ def test_openai_azure_embedding_simple():
print("Calculated request cost=", request_cost) print("Calculated request cost=", request_cost)
assert isinstance(response.usage, litellm.Usage)
except Exception as e: except Exception as e:
pytest.fail(f"Error occurred: {e}") pytest.fail(f"Error occurred: {e}")
@ -204,6 +206,8 @@ def test_cohere_embedding():
input=["good morning from litellm", "this is another item"], input=["good morning from litellm", "this is another item"],
) )
print(f"response:", response) print(f"response:", response)
assert isinstance(response.usage, litellm.Usage)
except Exception as e: except Exception as e:
pytest.fail(f"Error occurred: {e}") pytest.fail(f"Error occurred: {e}")
@ -269,6 +273,8 @@ def test_bedrock_embedding_titan():
assert end_time - start_time < 0.1 assert end_time - start_time < 0.1
litellm.disable_cache() litellm.disable_cache()
assert isinstance(response.usage, litellm.Usage)
except Exception as e: except Exception as e:
pytest.fail(f"Error occurred: {e}") pytest.fail(f"Error occurred: {e}")
@ -295,6 +301,8 @@ def test_bedrock_embedding_cohere():
isinstance(x, float) for x in response["data"][0]["embedding"] isinstance(x, float) for x in response["data"][0]["embedding"]
), "Expected response to be a list of floats" ), "Expected response to be a list of floats"
# print(f"response:", response) # print(f"response:", response)
assert isinstance(response.usage, litellm.Usage)
except Exception as e: except Exception as e:
pytest.fail(f"Error occurred: {e}") pytest.fail(f"Error occurred: {e}")
@ -331,6 +339,8 @@ def test_hf_embedding():
input=["good morning from litellm", "this is another item"], input=["good morning from litellm", "this is another item"],
) )
print(f"response:", response) print(f"response:", response)
assert isinstance(response.usage, litellm.Usage)
except Exception as e: except Exception as e:
# Note: Huggingface inference API is unstable and fails with "model loading errors all the time" # Note: Huggingface inference API is unstable and fails with "model loading errors all the time"
pass pass
@ -386,6 +396,8 @@ def test_aembedding_azure():
response._hidden_params["custom_llm_provider"], response._hidden_params["custom_llm_provider"],
) )
assert response._hidden_params["custom_llm_provider"] == "azure" assert response._hidden_params["custom_llm_provider"] == "azure"
assert isinstance(response.usage, litellm.Usage)
except Exception as e: except Exception as e:
pytest.fail(f"Error occurred: {e}") pytest.fail(f"Error occurred: {e}")
@ -440,6 +452,7 @@ def test_mistral_embeddings():
input=["good morning from litellm"], input=["good morning from litellm"],
) )
print(f"response: {response}") print(f"response: {response}")
assert isinstance(response.usage, litellm.Usage)
except Exception as e: except Exception as e:
pytest.fail(f"Error occurred: {e}") pytest.fail(f"Error occurred: {e}")

View file

@ -158,7 +158,7 @@ def test_call_with_invalid_key(prisma_client):
async def test(): async def test():
await litellm.proxy.proxy_server.prisma_client.connect() await litellm.proxy.proxy_server.prisma_client.connect()
generated_key = "bad-key" generated_key = "sk-126666"
bearer_token = "Bearer " + generated_key bearer_token = "Bearer " + generated_key
request = Request(scope={"type": "http"}, receive=None) request = Request(scope={"type": "http"}, receive=None)
@ -173,7 +173,7 @@ def test_call_with_invalid_key(prisma_client):
except Exception as e: except Exception as e:
print("Got Exception", e) print("Got Exception", e)
print(e.message) print(e.message)
assert "Authentication Error" in e.message assert "Authentication Error, Invalid token passed" in e.message
pass pass

View file

@ -72,7 +72,7 @@ from .integrations.litedebugger import LiteDebugger
from .proxy._types import KeyManagementSystem from .proxy._types import KeyManagementSystem
from openai import OpenAIError as OriginalError from openai import OpenAIError as OriginalError
from openai._models import BaseModel as OpenAIObject from openai._models import BaseModel as OpenAIObject
from .caching import S3Cache, RedisSemanticCache from .caching import S3Cache, RedisSemanticCache, RedisCache
from .exceptions import ( from .exceptions import (
AuthenticationError, AuthenticationError,
BadRequestError, BadRequestError,
@ -1795,6 +1795,11 @@ class Logging:
) )
result = kwargs["async_complete_streaming_response"] result = kwargs["async_complete_streaming_response"]
# only add to cache once we have a complete streaming response # only add to cache once we have a complete streaming response
if litellm.cache is not None and not isinstance(
litellm.cache.cache, S3Cache
):
await litellm.cache.async_add_cache(result, **kwargs)
else:
litellm.cache.add_cache(result, **kwargs) litellm.cache.add_cache(result, **kwargs)
if isinstance(callback, CustomLogger): # custom logger class if isinstance(callback, CustomLogger): # custom logger class
print_verbose( print_verbose(
@ -2589,7 +2594,7 @@ def client(original_function):
if ( if (
kwargs.get("max_tokens", None) is not None kwargs.get("max_tokens", None) is not None
and model is not None and model is not None
and litellm.drop_params and litellm.modify_params
== True # user is okay with params being modified == True # user is okay with params being modified
and ( and (
call_type == CallTypes.acompletion.value call_type == CallTypes.acompletion.value
@ -2806,7 +2811,9 @@ def client(original_function):
): ):
if len(cached_result) == 1 and cached_result[0] is None: if len(cached_result) == 1 and cached_result[0] is None:
cached_result = None cached_result = None
elif isinstance(litellm.cache.cache, RedisSemanticCache): elif isinstance(
litellm.cache.cache, RedisSemanticCache
) or isinstance(litellm.cache.cache, RedisCache):
preset_cache_key = litellm.cache.get_cache_key(*args, **kwargs) preset_cache_key = litellm.cache.get_cache_key(*args, **kwargs)
kwargs["preset_cache_key"] = ( kwargs["preset_cache_key"] = (
preset_cache_key # for streaming calls, we need to pass the preset_cache_key preset_cache_key # for streaming calls, we need to pass the preset_cache_key
@ -5375,6 +5382,17 @@ def get_llm_provider(
# groq is openai compatible, we just need to set this to custom_openai and have the api_base be https://api.groq.com/openai/v1 # groq is openai compatible, we just need to set this to custom_openai and have the api_base be https://api.groq.com/openai/v1
api_base = "https://api.groq.com/openai/v1" api_base = "https://api.groq.com/openai/v1"
dynamic_api_key = get_secret("GROQ_API_KEY") dynamic_api_key = get_secret("GROQ_API_KEY")
elif custom_llm_provider == "fireworks_ai":
# fireworks is openai compatible, we just need to set this to custom_openai and have the api_base be https://api.fireworks.ai/inference/v1
if not model.startswith("accounts/fireworks/models"):
model = f"accounts/fireworks/models/{model}"
api_base = "https://api.fireworks.ai/inference/v1"
dynamic_api_key = (
get_secret("FIREWORKS_API_KEY")
or get_secret("FIREWORKS_AI_API_KEY")
or get_secret("FIREWORKSAI_API_KEY")
or get_secret("FIREWORKS_AI_TOKEN")
)
elif custom_llm_provider == "mistral": elif custom_llm_provider == "mistral":
# mistral is openai compatible, we just need to set this to custom_openai and have the api_base be https://api.mistral.ai # mistral is openai compatible, we just need to set this to custom_openai and have the api_base be https://api.mistral.ai
api_base = ( api_base = (
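A hedged sketch of what the provider resolution added above is expected to do for a bare Fireworks model name (the 4-tuple return shape is an assumption about `get_llm_provider`):

```python
# Sketch only: checks that a bare Fireworks model name gets the
# "accounts/fireworks/models/" prefix and the Fireworks api_base.
# The return order below is an assumption, not a documented contract.
import litellm

model, provider, dynamic_api_key, api_base = litellm.get_llm_provider(
    model="mixtral-8x7b-instruct", custom_llm_provider="fireworks_ai"
)
print(model)     # expected: accounts/fireworks/models/mixtral-8x7b-instruct
print(api_base)  # expected: https://api.fireworks.ai/inference/v1
```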

View file

@ -631,6 +631,13 @@
"litellm_provider": "groq", "litellm_provider": "groq",
"mode": "chat" "mode": "chat"
}, },
"groq/gemma-7b-it": {
"max_tokens": 8192,
"input_cost_per_token": 0.00000010,
"output_cost_per_token": 0.00000010,
"litellm_provider": "groq",
"mode": "chat"
},
"claude-instant-1.2": { "claude-instant-1.2": {
"max_tokens": 100000, "max_tokens": 100000,
"max_output_tokens": 8191, "max_output_tokens": 8191,

View file

@ -45,13 +45,15 @@ litellm_settings:
budget_duration: 30d budget_duration: 30d
num_retries: 5 num_retries: 5
request_timeout: 600 request_timeout: 600
cache: true
callbacks: ["batch_redis_requests"]
general_settings: general_settings:
master_key: sk-1234 # [OPTIONAL] Only use this if you to require all calls to contain this key (Authorization: Bearer sk-1234) master_key: sk-1234 # [OPTIONAL] Only use this if you to require all calls to contain this key (Authorization: Bearer sk-1234)
proxy_budget_rescheduler_min_time: 60 proxy_budget_rescheduler_min_time: 60
proxy_budget_rescheduler_max_time: 64 proxy_budget_rescheduler_max_time: 64
# database_url: "postgresql://<user>:<password>@<host>:<port>/<dbname>" # [OPTIONAL] use for token-based auth to proxy # database_url: "postgresql://<user>:<password>@<host>:<port>/<dbname>" # [OPTIONAL] use for token-based auth to proxy
environment_variables: # environment_variables:
# settings for using redis caching # settings for using redis caching
# REDIS_HOST: redis-16337.c322.us-east-1-2.ec2.cloud.redislabs.com # REDIS_HOST: redis-16337.c322.us-east-1-2.ec2.cloud.redislabs.com
# REDIS_PORT: "16337" # REDIS_PORT: "16337"

View file

@ -1,6 +1,6 @@
[tool.poetry] [tool.poetry]
name = "litellm" name = "litellm"
version = "1.31.12" version = "1.31.16"
description = "Library to easily interface with LLM API providers" description = "Library to easily interface with LLM API providers"
authors = ["BerriAI"] authors = ["BerriAI"]
license = "MIT" license = "MIT"
@ -76,7 +76,7 @@ requires = ["poetry-core", "wheel"]
build-backend = "poetry.core.masonry.api" build-backend = "poetry.core.masonry.api"
[tool.commitizen] [tool.commitizen]
version = "1.31.12" version = "1.31.16"
version_files = [ version_files = [
"pyproject.toml:^version" "pyproject.toml:^version"
] ]

View file

@ -34,5 +34,4 @@ jinja2==3.1.3 # for prompt templates
certifi>=2023.7.22 # [TODO] clean up certifi>=2023.7.22 # [TODO] clean up
aiohttp==3.9.0 # for network calls aiohttp==3.9.0 # for network calls
aioboto3==12.3.0 # for async sagemaker calls aioboto3==12.3.0 # for async sagemaker calls
argon2-cffi==23.1.0 # for checking secrets
#### ####