Forked from phoenix/litellm-mirror

Commit 0368a335e6: Merge branch 'main' into support_anthropic_function_result
42 changed files with 815 additions and 216 deletions
.dockerignore (new file, +5)

@@ -0,0 +1,5 @@
+/docs
+/cookbook
+/.circleci
+/.github
+/tests
.github/workflows/ghcr_deploy.yml (vendored, 66 changes)

@@ -10,6 +10,7 @@ on:
 env:
   REGISTRY: ghcr.io
   IMAGE_NAME: ${{ github.repository }}
+  CHART_NAME: litellm

 # There is a single job in this workflow. It's configured to run on the latest available version of Ubuntu.
 jobs:

@@ -103,6 +104,11 @@ jobs:
         uses: docker/metadata-action@9ec57ed1fcdbf14dcef7dfbe97b2010124a938b7
         with:
           images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}-database
+      # Configure multi platform Docker builds
+      - name: Set up QEMU
+        uses: docker/setup-qemu-action@e0e4588fad221d38ee467c0bffd91115366dc0c5
+      - name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@edfb0fe6204400c56fbfd3feba3fe9ad1adfa345

       - name: Build and push Database Docker image
         uses: docker/build-push-action@f2a1d5e99d037542a71f64918e516c093c6f3fc4

@@ -112,6 +118,60 @@ jobs:
           push: true
           tags: ${{ steps.meta-database.outputs.tags }}-${{ github.event.inputs.tag || 'latest' }}, ${{ steps.meta-database.outputs.tags }}-latest
           labels: ${{ steps.meta-database.outputs.labels }}
+          platforms: local,linux/amd64,linux/arm64,linux/arm64/v8
+
+  build-and-push-helm-chart:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+
+      - name: Log in to the Container registry
+        uses: docker/login-action@65b78e6e13532edd9afa3aa52ac7964289d1a9c1
+        with:
+          registry: ${{ env.REGISTRY }}
+          username: ${{ github.actor }}
+          password: ${{ secrets.GITHUB_TOKEN }}
+
+      - name: lowercase github.repository_owner
+        run: |
+          echo "REPO_OWNER=`echo ${{github.repository_owner}} | tr '[:upper:]' '[:lower:]'`" >>${GITHUB_ENV}
+      - name: Get LiteLLM Latest Tag
+        id: current_app_tag
+        uses: WyriHaximus/github-action-get-previous-tag@v1.3.0
+
+      - name: Get last published chart version
+        id: current_version
+        shell: bash
+        run: |
+          CHART_LIST=$(helm show chart oci://${{ env.REGISTRY }}/${{ env.REPO_OWNER }}/${{ env.CHART_NAME }} 2>/dev/null || true)
+          if [ -z "${CHART_LIST}" ]; then
+            echo "current-version=0.1.0" | tee -a $GITHUB_OUTPUT
+          else
+            printf '%s' "${CHART_LIST}" | grep '^version:' | awk 'BEGIN{FS=":"}{print "current-version="$2}' | tr -d " " | tee -a $GITHUB_OUTPUT
+          fi
+        env:
+          HELM_EXPERIMENTAL_OCI: '1'
+
+      # Automatically update the helm chart version one "patch" level
+      - name: Bump release version
+        id: bump_version
+        uses: christian-draeger/increment-semantic-version@1.1.0
+        with:
+          current-version: ${{ steps.current_version.outputs.current-version || '0.1.0' }}
+          version-fragment: 'bug'
+
+      - uses: ./.github/actions/helm-oci-chart-releaser
+        with:
+          name: ${{ env.CHART_NAME }}
+          repository: ${{ env.REPO_OWNER }}
+          tag: ${{ github.event.inputs.chartVersion || steps.bump_version.outputs.next-version || '0.1.0' }}
+          app_version: ${{ steps.current_app_tag.outputs.tag || 'latest' }}
+          path: deploy/charts/${{ env.CHART_NAME }}
+          registry: ${{ env.REGISTRY }}
+          registry_username: ${{ github.actor }}
+          registry_password: ${{ secrets.GITHUB_TOKEN }}
+          update_dependencies: true
+
   release:
     name: "New LiteLLM Release"
     needs: [docker-hub-deploy, build-and-push-image, build-and-push-image-database]

@@ -171,13 +231,13 @@ jobs:
           RELEASE_NOTES: ${{ steps.release-notes.outputs.result }}
         run: |
           curl -H "Content-Type: application/json" -X POST -d '{
-          "content": "||@everyone||",
+          "content": "New LiteLLM release ${{ env.RELEASE_TAG }}",
           "username": "Release Changelog",
           "avatar_url": "https://cdn.discordapp.com/avatars/487431320314576937/bd64361e4ba6313d561d54e78c9e7171.png",
           "embeds": [
             {
-              "title": "Changelog for ${RELEASE_TAG}",
+              "title": "Changelog for LiteLLM ${{ env.RELEASE_TAG }}",
-              "description": "${RELEASE_NOTES}",
+              "description": "${{ env.RELEASE_NOTES }}",
               "color": 2105893
             }
           ]
.github/workflows/interpret_load_test.py (new file, +91, vendored)

@@ -0,0 +1,91 @@
import csv
import os
from github import Github


def interpret_results(csv_file):
    with open(csv_file, newline="") as csvfile:
        csvreader = csv.DictReader(csvfile)
        rows = list(csvreader)
        """
        in this csv reader
        - Create 1 new column "Status"
        - if a row has a median response time < 300 and an average response time < 300, Status = "Passed ✅"
        - if a row has a median response time >= 300 or an average response time >= 300, Status = "Failed ❌"
        - Order the table in this order Name, Status, Median Response Time, Average Response Time, Requests/s, Failures/s, Min Response Time, Max Response Time, all other columns
        """

        # Add a new column "Status"
        for row in rows:
            median_response_time = float(
                row["Median Response Time"].strip().rstrip("ms")
            )
            average_response_time = float(
                row["Average Response Time"].strip().rstrip("s")
            )

            request_count = int(row["Request Count"])
            failure_count = int(row["Failure Count"])

            failure_percent = round((failure_count / request_count) * 100, 2)

            # Determine status based on conditions
            if (
                median_response_time < 300
                and average_response_time < 300
                and failure_percent < 5
            ):
                row["Status"] = "Passed ✅"
            else:
                row["Status"] = "Failed ❌"

        # Construct Markdown table header
        markdown_table = "| Name | Status | Median Response Time (ms) | Average Response Time (ms) | Requests/s | Failures/s | Request Count | Failure Count | Min Response Time (ms) | Max Response Time (ms) |"
        markdown_table += (
            "\n| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |"
        )

        # Construct Markdown table rows
        for row in rows:
            markdown_table += f"\n| {row['Name']} | {row['Status']} | {row['Median Response Time']} | {row['Average Response Time']} | {row['Requests/s']} | {row['Failures/s']} | {row['Request Count']} | {row['Failure Count']} | {row['Min Response Time']} | {row['Max Response Time']} |"
        print("markdown table: ", markdown_table)
        return markdown_table


if __name__ == "__main__":
    csv_file = "load_test_stats.csv"  # Change this to the path of your CSV file
    markdown_table = interpret_results(csv_file)

    # Update release body with interpreted results
    github_token = os.getenv("GITHUB_TOKEN")
    g = Github(github_token)
    repo = g.get_repo(
        "BerriAI/litellm"
    )  # Replace with your repository's username and name
    latest_release = repo.get_latest_release()
    print("got latest release: ", latest_release)
    print("latest release body: ", latest_release.body)
    print("markdown table: ", markdown_table)

    # check if "Load Test LiteLLM Proxy Results" exists
    existing_release_body = latest_release.body
    if "Load Test LiteLLM Proxy Results" in latest_release.body:
        # find the "Load Test LiteLLM Proxy Results" section and delete it
        start_index = latest_release.body.find("Load Test LiteLLM Proxy Results")
        existing_release_body = latest_release.body[:start_index]

    new_release_body = (
        existing_release_body
        + "\n\n"
        + "## Load Test LiteLLM Proxy Results"
        + "\n\n"
        + markdown_table
    )
    print("new release body: ", new_release_body)
    try:
        latest_release.update_release(
            name=latest_release.tag_name,
            message=new_release_body,
        )
    except Exception as e:
        print(e)
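A quick way to sanity-check this script locally is to feed it a small, hand-written stats file. The sketch below is an assumption-laden illustration, not part of the commit: it assumes PyGithub is installed (for the module-level import) and that the workflows directory is on the import path; the column names mirror the ones the script reads above, and the file name matches its default.

```python
# Sketch: build a tiny stats CSV and run interpret_results() on it locally.
# Assumes interpret_load_test.py is importable and PyGithub is installed.
import csv

rows = [
    {
        "Name": "/chat/completions", "Median Response Time": "120",
        "Average Response Time": "140", "Requests/s": "85", "Failures/s": "0",
        "Request Count": "5000", "Failure Count": "10",
        "Min Response Time": "40", "Max Response Time": "900",
    }
]

with open("load_test_stats.csv", "w", newline="") as f:
    writer = csv.DictWriter(f, fieldnames=list(rows[0].keys()))
    writer.writeheader()
    writer.writerows(rows)

# interpret_results() only needs the CSV; the GitHub release update in
# __main__ requires a real GITHUB_TOKEN and is not exercised here.
from interpret_load_test import interpret_results

print(interpret_results("load_test_stats.csv"))
```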
.github/workflows/load_test.yml (vendored, 28 changes)

@@ -1,6 +1,11 @@
 name: Test Locust Load Test

-on: [push]
+on:
+  workflow_run:
+    workflows: ["Build, Publish LiteLLM Docker Image. New Release"]
+    types:
+      - completed
+  workflow_dispatch:

 jobs:
   build:

@@ -8,15 +13,32 @@ jobs:
     steps:
       - name: Checkout
         uses: actions/checkout@v1
+      - name: Setup Python
+        uses: actions/setup-python@v2
+        with:
+          python-version: '3.x'
+
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install PyGithub
       - name: Run Load Test
         id: locust_run
         uses: BerriAI/locust-github-action@master
         with:
           LOCUSTFILE: ".github/workflows/locustfile.py"
-          URL: "https://litellm-api.up.railway.app/"
+          URL: "https://litellm-database-docker-build-production.up.railway.app/"
           USERS: "100"
           RATE: "10"
-          RUNTIME: "60s"
+          RUNTIME: "300s"
+      - name: Process Load Test Stats
+        run: |
+          echo "Current working directory: $PWD"
+          ls
+          python ".github/workflows/interpret_load_test.py"
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+        working-directory: ${{ github.workspace }}
       - name: Upload CSV as Asset to Latest Release
         uses: xresloader/upload-to-github-release@v1
         env:
.github/workflows/locustfile.py (vendored, 18 changes)

@@ -1,4 +1,6 @@
-from locust import HttpUser, task, between
+from locust import HttpUser, task, between, events
+import json
+import time


 class MyUser(HttpUser):

@@ -8,7 +10,7 @@ class MyUser(HttpUser):
     def chat_completion(self):
         headers = {
             "Content-Type": "application/json",
-            "Authorization": f"Bearer sk-1234",
+            "Authorization": f"Bearer sk-gUvTeN9g0sgHBMf9HeCaqA",
             # Include any additional headers you may need for authentication, etc.
         }

@@ -26,3 +28,15 @@ class MyUser(HttpUser):
         response = self.client.post("chat/completions", json=payload, headers=headers)

         # Print or log the response if needed
+
+    @task(10)
+    def health_readiness(self):
+        start_time = time.time()
+        response = self.client.get("health/readiness")
+        response_time = time.time() - start_time
+
+    @task(10)
+    def health_liveliness(self):
+        start_time = time.time()
+        response = self.client.get("health/liveliness")
+        response_time = time.time() - start_time
@@ -66,4 +66,4 @@ ENTRYPOINT ["litellm"]

 # Append "--detailed_debug" to the end of CMD to view detailed debug logs
 # CMD ["--port", "4000", "--config", "./proxy_server_config.yaml", "--run_gunicorn", "--detailed_debug"]
-CMD ["--port", "4000", "--config", "./proxy_server_config.yaml", "--run_gunicorn", "--num_workers", "1"]
+CMD ["--port", "4000", "--config", "./proxy_server_config.yaml", "--run_gunicorn", "--num_workers", "4"]
@@ -2,7 +2,7 @@

 ## Prerequisites

-- Kubernetes 1.23+
+- Kubernetes 1.21+
 - Helm 3.8.0+

 If `db.deployStandalone` is used:

@@ -33,6 +33,7 @@ If `db.useStackgresOperator` is used (not yet implemented):
 | `proxy_config.*` | See [values.yaml](./values.yaml) for default settings. See [example_config_yaml](../../../litellm/proxy/example_config_yaml/) for configuration examples. | N/A |

 #### Example `environmentSecrets` Secret
+
 ```
 apiVersion: v1
 kind: Secret
@@ -6,7 +6,6 @@ replicaCount: 1

 image:
   # Use "ghcr.io/berriai/litellm-database" for optimized image with database
-  # Alternatively, use "ghcr.io/berriai/litellm" for the default image
   repository: ghcr.io/berriai/litellm-database
   pullPolicy: IfNotPresent
   # Overrides the image tag whose default is the chart appVersion.

@@ -85,10 +84,13 @@ proxy_config:
       litellm_params:
         model: gpt-3.5-turbo
         api_key: eXaMpLeOnLy
+    - model_name: fake-openai-endpoint
+      litellm_params:
+        model: openai/fake
+        api_key: fake-key
+        api_base: https://exampleopenaiendpoint-production.up.railway.app/
   general_settings:
     master_key: os.environ/PROXY_MASTER_KEY
-  # litellm_settings:
-  #   cache: true

 resources: {}
   # We usually recommend not to specify default resources and to leave this as a conscious
@@ -77,9 +77,32 @@ litellm --config /path/to/config.yaml

 ### Test

+<Tabs>
+<TabItem value="curl" label="Curl">
+
 ```bash
-curl --location 'http://0.0.0.0:4000/v1/audio/transcriptions' \
+curl --location 'http://0.0.0.0:8000/v1/audio/transcriptions' \
 --header 'Authorization: Bearer sk-1234' \
 --form 'file=@"/Users/krrishdholakia/Downloads/gettysburg.wav"' \
 --form 'model="whisper"'
 ```

+</TabItem>
+<TabItem value="openai" label="OpenAI">
+
+```python
+from openai import OpenAI
+client = openai.OpenAI(
+    api_key="sk-1234",
+    base_url="http://0.0.0.0:8000"
+)
+
+audio_file = open("speech.mp3", "rb")
+transcript = client.audio.transcriptions.create(
+    model="whisper",
+    file=audio_file
+)
+```
+</TabItem>
+</Tabs>
@@ -133,3 +133,6 @@ chat(messages)
 ```
 </TabItem>
 </Tabs>
+
+## Use LangChain ChatLiteLLM + Langfuse
+Checkout this section [here](../observability/langfuse_integration#use-langchain-chatlitellm--langfuse) for more details on how to integrate Langfuse with ChatLiteLLM.
@@ -2,6 +2,54 @@ import Image from '@theme/IdealImage';

 # 🔥 Load Test LiteLLM

+## How to run a locust load test on LiteLLM Proxy
+
+1. Add `fake-openai-endpoint` to your proxy config.yaml and start your litellm proxy
+litellm provides a free hosted `fake-openai-endpoint` you can load test against
+
+```yaml
+model_list:
+  - model_name: fake-openai-endpoint
+    litellm_params:
+      model: openai/fake
+      api_key: fake-key
+      api_base: https://exampleopenaiendpoint-production.up.railway.app/
+```
+
+2. `pip install locust`
+
+3. Create a file called `locustfile.py` on your local machine. Copy the contents from the litellm load test located [here](https://github.com/BerriAI/litellm/blob/main/.github/workflows/locustfile.py)
+
+4. Start locust
+  Run `locust` in the same directory as your `locustfile.py` from step 2
+
+```shell
+locust
+```
+
+Output on terminal
+```
+[2024-03-15 07:19:58,893] Starting web interface at http://0.0.0.0:8089
+[2024-03-15 07:19:58,898] Starting Locust 2.24.0
+```
+
+5. Run Load test on locust
+
+Head to the locust UI on http://0.0.0.0:8089
+
+Set Users=100, Ramp Up Users=10, Host=Base URL of your LiteLLM Proxy
+
+<Image img={require('../img/locust_load_test.png')} />
+
+6. Expected Results
+
+Expect to see the following response times for `/health/readiness`
+Median → /health/readiness is `150ms`
+
+Avg → /health/readiness is `219ms`
+
+<Image img={require('../img/litellm_load_test.png')} />
+
 ## Load Test LiteLLM Proxy - 1500+ req/s

 ## 1500+ concurrent requests/s
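For step 3 of the doc added above, a minimal locustfile along these lines would be enough to exercise the `fake-openai-endpoint` route from the config; this is a sketch, not the repo's locustfile, and the `sk-1234` master key and relative URL are assumptions carried over from the surrounding examples.

```python
# Minimal locustfile sketch for the load test described above.
# Assumes the proxy from step 1 is running and its master key is "sk-1234".
from locust import HttpUser, task, between


class ProxyUser(HttpUser):
    wait_time = between(1, 3)

    @task
    def chat_completion(self):
        headers = {
            "Content-Type": "application/json",
            "Authorization": "Bearer sk-1234",
        }
        payload = {
            "model": "fake-openai-endpoint",
            "messages": [{"role": "user", "content": "hello"}],
        }
        # Relative path: set the proxy's base URL as Host in the locust UI.
        self.client.post("/chat/completions", json=payload, headers=headers)
```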
@@ -132,6 +132,41 @@ print(response)

 ```

+### Use LangChain ChatLiteLLM + Langfuse
+Pass `trace_user_id`, `session_id` in model_kwargs
+```python
+import os
+from langchain.chat_models import ChatLiteLLM
+from langchain.schema import HumanMessage
+import litellm
+
+# from https://cloud.langfuse.com/
+os.environ["LANGFUSE_PUBLIC_KEY"] = ""
+os.environ["LANGFUSE_SECRET_KEY"] = ""
+
+os.environ['OPENAI_API_KEY']=""
+
+# set langfuse as a callback, litellm will send the data to langfuse
+litellm.success_callback = ["langfuse"]
+
+chat = ChatLiteLLM(
+  model="gpt-3.5-turbo"
+  model_kwargs={
+      "metadata": {
+        "trace_user_id": "user-id2",  # set langfuse Trace User ID
+        "session_id": "session-1",    # set langfuse Session ID
+        "tags": ["tag1", "tag2"]
+      }
+  }
+)
+messages = [
+    HumanMessage(
+        content="what model are you"
+    )
+]
+chat(messages)
+```
+
 ## Troubleshooting & Errors
 ### Data not getting logged to Langfuse ?
@@ -49,7 +49,7 @@ for chunk in response:
 | command-light | `completion('command-light', messages)` |
 | command-medium | `completion('command-medium', messages)` |
 | command-medium-beta | `completion('command-medium-beta', messages)` |
-| command-xlarge-beta | `completion('command-xlarge-beta', messages)` |
+| command-xlarge-nightly | `completion('command-xlarge-nightly', messages)` |
 | command-nightly | `completion('command-nightly', messages)` |
docs/my-website/docs/providers/fireworks_ai.md (new file, +53)

@@ -0,0 +1,53 @@
# Fireworks AI
https://fireworks.ai/

**We support ALL Fireworks AI models, just set `fireworks_ai/` as a prefix when sending completion requests**

## API Key
```python
# env variable
os.environ['FIREWORKS_AI_API_KEY']
```

## Sample Usage
```python
from litellm import completion
import os

os.environ['FIREWORKS_AI_API_KEY'] = ""
response = completion(
    model="fireworks_ai/mixtral-8x7b-instruct",
    messages=[
        {"role": "user", "content": "hello from litellm"}
    ],
)
print(response)
```

## Sample Usage - Streaming
```python
from litellm import completion
import os

os.environ['FIREWORKS_AI_API_KEY'] = ""
response = completion(
    model="fireworks_ai/mixtral-8x7b-instruct",
    messages=[
        {"role": "user", "content": "hello from litellm"}
    ],
    stream=True
)

for chunk in response:
    print(chunk)
```

## Supported Models - ALL Fireworks AI Models Supported!
We support ALL Fireworks AI models, just set `fireworks_ai/` as a prefix when sending completion requests

| Model Name | Function Call |
|--------------------------|------------------------------------------------------------------|
| mixtral-8x7b-instruct | `completion(model="fireworks_ai/mixtral-8x7b-instruct", messages)` |
| firefunction-v1 | `completion(model="fireworks_ai/firefunction-v1", messages)` |
| llama-v2-70b-chat | `completion(model="fireworks_ai/llama-v2-70b-chat", messages)` |
@@ -50,3 +50,4 @@ We support ALL Groq models, just set `groq/` as a prefix when sending completion
 |--------------------------|----------------------------------------------------------|
 | llama2-70b-4096 | `completion(model="groq/llama2-70b-4096", messages)` |
 | mixtral-8x7b-32768 | `completion(model="groq/mixtral-8x7b-32768", messages)` |
+| gemma-7b-it | `completion(model="groq/gemma-7b-it", messages)` |
@@ -225,6 +225,32 @@ litellm_settings:
   supported_call_types: ["acompletion", "completion", "embedding", "aembedding"] # defaults to all litellm call types
 ```

+
+### Turn on `batch_redis_requests`
+
+**What it does?**
+When a request is made:
+
+- Check if a key starting with `litellm:<hashed_api_key>:<call_type>:` exists in-memory, if no - get the last 100 cached requests for this key and store it
+
+- New requests are stored with this `litellm:..` as the namespace
+
+**Why?**
+Reduce number of redis GET requests. This improved latency by 46% in prod load tests.
+
+**Usage**
+
+```yaml
+litellm_settings:
+  cache: true
+  cache_params:
+    type: redis
+    ... # remaining redis args (host, port, etc.)
+  callbacks: ["batch_redis_requests"] # 👈 KEY CHANGE!
+```
+
+[**SEE CODE**](https://github.com/BerriAI/litellm/blob/main/litellm/proxy/hooks/batch_redis_get.py)
+
 ### Turn on / off caching per request.

 The proxy support 3 cache-controls:
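The namespace described in the doc above groups every cached response for a given key and call type under one Redis prefix, which is what lets the hook fetch them with a single SCAN plus pipelined GET instead of one GET per request. The sketch below only illustrates the shape of that key layout; the hashing shown is an assumption, not the proxy's exact scheme.

```python
# Illustrative sketch of the cache-key namespace behind batch_redis_requests.
# The exact hashing/namespacing is internal to the proxy; this shows the idea only.
import hashlib


def namespace_for(api_key: str, call_type: str) -> str:
    hashed_api_key = hashlib.sha256(api_key.encode()).hexdigest()
    return f"litellm:{hashed_api_key}:{call_type}"


ns = namespace_for("sk-1234", "acompletion")

# In-memory lookup is a prefix check; the Redis lookup is SCAN ns + "*" (up to 100 keys).
cached_keys = [f"{ns}:request-hash-{i}" for i in range(3)]
print(all(k.startswith(ns) for k in cached_keys))  # True
```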
@@ -150,17 +150,20 @@ To avoid issues with predictability, difficulties in rollback, and inconsistent


 ## Deploy with Database
+### Docker, Kubernetes, Helm Chart
+
+<Tabs>
+
+<TabItem value="docker-deploy" label="Dockerfile">

 We maintain a [seperate Dockerfile](https://github.com/BerriAI/litellm/pkgs/container/litellm-database) for reducing build time when running LiteLLM proxy with a connected Postgres Database

-<Tabs>
-<TabItem value="docker-deploy" label="Dockerfile">
-
-```
+```shell
 docker pull docker pull ghcr.io/berriai/litellm-database:main-latest
 ```

-```
+```shell
 docker run --name litellm-proxy \
 -e DATABASE_URL=postgresql://<user>:<password>@<host>:<port>/<dbname> \
 -p 4000:4000 \

@@ -233,6 +236,8 @@ Your OpenAI proxy server is now running on `http://0.0.0.0:4000`.
 </TabItem>
 <TabItem value="helm-deploy" label="Helm">

+Use this to deploy litellm using a helm chart. Link to [the LiteLLM Helm Chart](https://github.com/BerriAI/litellm/tree/main/deploy/charts/litellm)
+
 #### Step 1. Clone the repository

 ```bash

@@ -241,9 +246,11 @@ git clone https://github.com/BerriAI/litellm.git

 #### Step 2. Deploy with Helm

+Run the following command in the root of your `litellm` repo. This will set the litellm proxy master key as `sk-1234`
+
 ```bash
 helm install \
---set masterkey=SuPeRsEcReT \
+--set masterkey=sk-1234 \
 mydeploy \
 deploy/charts/litellm
 ```

@@ -259,6 +266,9 @@ kubectl \

 Your OpenAI proxy server is now running on `http://127.0.0.1:4000`.

+
+If you need to set your litellm proxy config.yaml, you can find this in [values.yaml](https://github.com/BerriAI/litellm/blob/main/deploy/charts/litellm/values.yaml)
+
 </TabItem>
 </Tabs>
docs/my-website/img/litellm_load_test.png (new binary file, 125 KiB; not shown)
docs/my-website/img/locust_load_test.png (new binary file, 204 KiB; not shown)
@@ -138,6 +138,7 @@ const sidebars = {
         "providers/ollama",
         "providers/perplexity",
         "providers/groq",
+        "providers/fireworks_ai",
         "providers/vllm",
         "providers/xinference",
         "providers/cloudflare_workers",
enterprise/__init__.py (new file, +1)

@@ -0,0 +1 @@
+from . import *
@@ -36,6 +36,7 @@ token: Optional[str] = (
 telemetry = True
 max_tokens = 256  # OpenAI Defaults
 drop_params = False
+modify_params = False
 retry = True
 api_key: Optional[str] = None
 openai_key: Optional[str] = None

@@ -327,6 +328,7 @@ openai_compatible_providers: List = [
     "perplexity",
     "xinference",
     "together_ai",
+    "fireworks_ai",
 ]

@@ -478,6 +480,7 @@ provider_list: List = [
     "voyage",
     "cloudflare",
     "xinference",
+    "fireworks_ai",
     "custom",  # custom apis
 ]
@@ -129,6 +129,16 @@ class RedisCache(BaseCache):
                 f"LiteLLM Caching: set() - Got exception from REDIS : {str(e)}"
             )

+    async def async_scan_iter(self, pattern: str, count: int = 100) -> list:
+        keys = []
+        _redis_client = self.init_async_client()
+        async with _redis_client as redis_client:
+            async for key in redis_client.scan_iter(match=pattern + "*", count=count):
+                keys.append(key)
+                if len(keys) >= count:
+                    break
+        return keys
+
     async def async_set_cache(self, key, value, **kwargs):
         _redis_client = self.init_async_client()
         async with _redis_client as redis_client:

@@ -140,6 +150,9 @@ class RedisCache(BaseCache):
                 await redis_client.set(
                     name=key, value=json.dumps(value), ex=ttl, get=True
                 )
+                print_verbose(
+                    f"Successfully Set ASYNC Redis Cache: key: {key}\nValue {value}\nttl={ttl}"
+                )
             except Exception as e:
                 # NON blocking - notify users Redis is throwing an exception
                 print_verbose(

@@ -172,8 +185,6 @@ class RedisCache(BaseCache):
             return results
         except Exception as e:
             print_verbose(f"Error occurred in pipeline write - {str(e)}")
-            # NON blocking - notify users Redis is throwing an exception
-            logging.debug("LiteLLM Caching: set() - Got exception from REDIS : ", e)

     def _get_cache_logic(self, cached_response: Any):
         """

@@ -208,7 +219,7 @@ class RedisCache(BaseCache):
         _redis_client = self.init_async_client()
         async with _redis_client as redis_client:
             try:
-                print_verbose(f"Get Redis Cache: key: {key}")
+                print_verbose(f"Get Async Redis Cache: key: {key}")
                 cached_response = await redis_client.get(key)
                 print_verbose(
                     f"Got Async Redis Cache: key: {key}, cached_response {cached_response}"

@@ -217,8 +228,39 @@ class RedisCache(BaseCache):
                 return response
             except Exception as e:
                 # NON blocking - notify users Redis is throwing an exception
-                traceback.print_exc()
-                logging.debug("LiteLLM Caching: get() - Got exception from REDIS: ", e)
+                print_verbose(
+                    f"LiteLLM Caching: async get() - Got exception from REDIS: {str(e)}"
+                )
+
+    async def async_get_cache_pipeline(self, key_list) -> dict:
+        """
+        Use Redis for bulk read operations
+        """
+        _redis_client = await self.init_async_client()
+        key_value_dict = {}
+        try:
+            async with _redis_client as redis_client:
+                async with redis_client.pipeline(transaction=True) as pipe:
+                    # Queue the get operations in the pipeline for all keys.
+                    for cache_key in key_list:
+                        pipe.get(cache_key)  # Queue GET command in pipeline
+
+                    # Execute the pipeline and await the results.
+                    results = await pipe.execute()
+
+            # Associate the results back with their keys.
+            # 'results' is a list of values corresponding to the order of keys in 'key_list'.
+            key_value_dict = dict(zip(key_list, results))
+
+            decoded_results = {
+                k.decode("utf-8"): self._get_cache_logic(v)
+                for k, v in key_value_dict.items()
+            }
+
+            return decoded_results
+        except Exception as e:
+            print_verbose(f"Error occurred in pipeline read - {str(e)}")
+            return key_value_dict
+
     def flush_cache(self):
         self.redis_client.flushall()

@@ -1001,6 +1043,10 @@ class Cache:
         if self.namespace is not None:
             hash_hex = f"{self.namespace}:{hash_hex}"
             print_verbose(f"Hashed Key with Namespace: {hash_hex}")
+        elif kwargs.get("metadata", {}).get("redis_namespace", None) is not None:
+            _namespace = kwargs.get("metadata", {}).get("redis_namespace", None)
+            hash_hex = f"{_namespace}:{hash_hex}"
+            print_verbose(f"Hashed Key with Namespace: {hash_hex}")
         return hash_hex

     def generate_streaming_content(self, content):
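A rough usage sketch for the two new `RedisCache` helpers follows; it is not from the commit. It assumes a reachable Redis instance, and the constructor arguments and the key pattern are placeholders.

```python
# Sketch: batch-read cached entries with the new RedisCache helpers.
# Assumes a reachable Redis; host/port/password and the pattern are placeholders.
import asyncio
from litellm.caching import RedisCache


async def main():
    cache = RedisCache(host="localhost", port=6379, password=None)

    # Collect up to 100 keys under a namespace, then read them in one pipeline.
    keys = await cache.async_scan_iter(
        pattern="litellm:my-hashed-key:acompletion", count=100
    )
    if keys:
        values = await cache.async_get_cache_pipeline(key_list=keys)
        print(f"fetched {len(values)} cached entries")


asyncio.run(main())
```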
@@ -82,12 +82,22 @@ class AmazonAnthropicClaude3Config:

     Supported Params for the Amazon / Anthropic Claude 3 models:

-    - `max_tokens` (integer) max tokens,
-    - `anthropic_version` (string) version of anthropic for bedrock - e.g. "bedrock-2023-05-31"
+    - `max_tokens` Required (integer) max tokens,
+    - `anthropic_version` Required (string) version of anthropic for bedrock - e.g. "bedrock-2023-05-31"
+    - `system` Optional (string) the system prompt, conversion from openai format to this is handled in factory.py
+    - `temperature` Optional (float) The amount of randomness injected into the response
+    - `top_p` Optional (float) Use nucleus sampling.
+    - `top_k` Optional (int) Only sample from the top K options for each subsequent token
+    - `stop_sequences` Optional (List[str]) Custom text sequences that cause the model to stop generating
     """

     max_tokens: Optional[int] = litellm.max_tokens
     anthropic_version: Optional[str] = "bedrock-2023-05-31"
+    system: Optional[str] = None
+    temperature: Optional[float] = None
+    top_p: Optional[float] = None
+    top_k: Optional[int] = None
+    stop_sequences: Optional[List[str]] = None

     def __init__(
         self,

@@ -128,6 +138,12 @@ class AmazonAnthropicClaude3Config:
                 optional_params["tools"] = value
             if param == "stream":
                 optional_params["stream"] = value
+            if param == "stop":
+                optional_params["stop_sequences"] = value
+            if param == "temperature":
+                optional_params["temperature"] = value
+            if param == "top_p":
+                optional_params["top_p"] = value
         return optional_params
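With the mapping added above, OpenAI-style `temperature`, `top_p`, and `stop` are translated onto the Bedrock Claude 3 payload. A hedged example of passing them through `litellm.completion` is shown below; it assumes AWS credentials are configured, and the model ID is one example Bedrock identifier that may differ per account or region.

```python
# Sketch: the newly mapped params flowing through to Bedrock Claude 3.
# Assumes AWS credentials are configured; model ID and values are examples.
import litellm

response = litellm.completion(
    model="bedrock/anthropic.claude-3-sonnet-20240229-v1:0",
    messages=[{"role": "user", "content": "Say hi in one word."}],
    max_tokens=100,
    temperature=0.2,      # mapped to temperature
    top_p=0.9,            # mapped to top_p
    stop=["\n\nHuman:"],  # mapped to stop_sequences
)
print(response.choices[0].message.content)
```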
@@ -300,8 +300,7 @@ def embedding(
     for text in input:
         input_tokens += len(encoding.encode(text))

-    model_response["usage"] = {
-        "prompt_tokens": input_tokens,
-        "total_tokens": input_tokens,
-    }
+    model_response["usage"] = Usage(
+        prompt_tokens=input_tokens, completion_tokens=0, total_tokens=input_tokens
+    )
     return model_response
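Returning a typed `Usage` object instead of a bare dict keeps the embedding response's usage field consistent with chat responses, so callers get attribute access. A small sketch of that shape (the field values are made up):

```python
# Sketch: the typed usage object callers now see on embedding responses.
# completion_tokens is reported as 0 for embeddings, per the change above.
from litellm.utils import Usage

usage = Usage(prompt_tokens=42, completion_tokens=0, total_tokens=42)

# Attribute access, same as on chat completion responses:
print(usage.prompt_tokens, usage.total_tokens)
```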
@@ -705,6 +705,7 @@ def anthropic_messages_pt(messages: list):
                 "text"
             ].rstrip()  # no trailing whitespace for final assistant message

+
     return new_messages
@@ -12,6 +12,7 @@ from typing import Any, Literal, Union, BinaryIO
 from functools import partial
 import dotenv, traceback, random, asyncio, time, contextvars
 from copy import deepcopy
+
 import httpx
 import litellm
 from ._logging import verbose_logger

@@ -891,6 +892,7 @@ def completion(
             or custom_llm_provider == "mistral"
             or custom_llm_provider == "openai"
             or custom_llm_provider == "together_ai"
+            or custom_llm_provider in litellm.openai_compatible_providers
             or "ft:gpt-3.5-turbo" in model  # finetune gpt-3.5-turbo
         ):  # allow user to make an openai call with a custom base
             # note: if a user sets a custom base - we should ensure this works

@@ -2393,6 +2395,7 @@ async def aembedding(*args, **kwargs):
             or custom_llm_provider == "deepinfra"
             or custom_llm_provider == "perplexity"
             or custom_llm_provider == "groq"
+            or custom_llm_provider == "fireworks_ai"
             or custom_llm_provider == "ollama"
             or custom_llm_provider == "vertex_ai"
         ):  # currently implemented aiohttp calls for just azure and openai, soon all.

@@ -2892,6 +2895,7 @@ async def atext_completion(*args, **kwargs):
             or custom_llm_provider == "deepinfra"
             or custom_llm_provider == "perplexity"
             or custom_llm_provider == "groq"
+            or custom_llm_provider == "fireworks_ai"
             or custom_llm_provider == "text-completion-openai"
             or custom_llm_provider == "huggingface"
             or custom_llm_provider == "ollama"
@@ -631,6 +631,13 @@
         "litellm_provider": "groq",
         "mode": "chat"
     },
+    "groq/gemma-7b-it": {
+        "max_tokens": 8192,
+        "input_cost_per_token": 0.00000010,
+        "output_cost_per_token": 0.00000010,
+        "litellm_provider": "groq",
+        "mode": "chat"
+    },
     "claude-instant-1.2": {
         "max_tokens": 100000,
         "max_output_tokens": 8191,
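With the per-token prices registered above, cost tracking for `groq/gemma-7b-it` reduces to tokens times rate. A quick arithmetic sketch (the token counts are made up):

```python
# Sketch: cost arithmetic from the pricing entry above, with example token counts.
input_cost_per_token = 0.00000010
output_cost_per_token = 0.00000010

prompt_tokens, completion_tokens = 1_000, 500
cost = prompt_tokens * input_cost_per_token + completion_tokens * output_cost_per_token
print(f"${cost:.6f}")  # $0.000150
```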
@@ -9,6 +9,12 @@ model_list:
       model: gpt-3.5-turbo-1106
       api_key: os.environ/OPENAI_API_KEY

+litellm_settings:
+  cache: true
+  cache_params:
+    type: redis
+  callbacks: ["batch_redis_requests"]
+
 general_settings:
   master_key: sk-1234
-  database_url: "postgresql://krrishdholakia:9yQkKWiB8vVs@ep-icy-union-a5j4dwls.us-east-2.aws.neon.tech/neondb?sslmode=require"
+  # database_url: "postgresql://krrishdholakia:9yQkKWiB8vVs@ep-icy-union-a5j4dwls.us-east-2.aws.neon.tech/neondb?sslmode=require"
litellm/proxy/hooks/batch_redis_get.py (new file, +124)

@@ -0,0 +1,124 @@
# What this does?
## Gets a key's redis cache, and store it in memory for 1 minute.
## This reduces the number of REDIS GET requests made during high-traffic by the proxy.
### [BETA] this is in Beta. And might change.

from typing import Optional, Literal
import litellm
from litellm.caching import DualCache, RedisCache, InMemoryCache
from litellm.proxy._types import UserAPIKeyAuth
from litellm.integrations.custom_logger import CustomLogger
from litellm._logging import verbose_proxy_logger
from fastapi import HTTPException
import json, traceback


class _PROXY_BatchRedisRequests(CustomLogger):
    # Class variables or attributes
    in_memory_cache: Optional[InMemoryCache] = None

    def __init__(self):
        litellm.cache.async_get_cache = (
            self.async_get_cache
        )  # map the litellm 'get_cache' function to our custom function

    def print_verbose(
        self, print_statement, debug_level: Literal["INFO", "DEBUG"] = "DEBUG"
    ):
        if debug_level == "DEBUG":
            verbose_proxy_logger.debug(print_statement)
        elif debug_level == "INFO":
            verbose_proxy_logger.debug(print_statement)
        if litellm.set_verbose is True:
            print(print_statement)  # noqa

    async def async_pre_call_hook(
        self,
        user_api_key_dict: UserAPIKeyAuth,
        cache: DualCache,
        data: dict,
        call_type: str,
    ):
        try:
            """
            Get the user key

            Check if a key starting with `litellm:<api_key>:<call_type:` exists in-memory

            If no, then get relevant cache from redis
            """
            api_key = user_api_key_dict.api_key

            cache_key_name = f"litellm:{api_key}:{call_type}"
            self.in_memory_cache = cache.in_memory_cache

            key_value_dict = {}
            in_memory_cache_exists = False
            for key in cache.in_memory_cache.cache_dict.keys():
                if isinstance(key, str) and key.startswith(cache_key_name):
                    in_memory_cache_exists = True

            if in_memory_cache_exists == False and litellm.cache is not None:
                """
                - Check if `litellm.Cache` is redis
                - Get the relevant values
                """
                if litellm.cache.type is not None and isinstance(
                    litellm.cache.cache, RedisCache
                ):
                    # Initialize an empty list to store the keys
                    keys = []
                    self.print_verbose(f"cache_key_name: {cache_key_name}")
                    # Use the SCAN iterator to fetch keys matching the pattern
                    keys = await litellm.cache.cache.async_scan_iter(
                        pattern=cache_key_name, count=100
                    )
                    # If you need the truly "last" based on time or another criteria,
                    # ensure your key naming or storage strategy allows this determination
                    # Here you would sort or filter the keys as needed based on your strategy
                    self.print_verbose(f"redis keys: {keys}")
                    if len(keys) > 0:
                        key_value_dict = (
                            await litellm.cache.cache.async_get_cache_pipeline(
                                key_list=keys
                            )
                        )

                        ## Add to cache
                        if len(key_value_dict.items()) > 0:
                            await cache.in_memory_cache.async_set_cache_pipeline(
                                cache_list=list(key_value_dict.items()), ttl=60
                            )
                        ## Set cache namespace if it's a miss
                        data["metadata"]["redis_namespace"] = cache_key_name
        except HTTPException as e:
            raise e
        except Exception as e:
            traceback.print_exc()

    async def async_get_cache(self, *args, **kwargs):
        """
        - Check if the cache key is in-memory

        - Else return None
        """
        try:  # never block execution
            if "cache_key" in kwargs:
                cache_key = kwargs["cache_key"]
            else:
                cache_key = litellm.cache.get_cache_key(
                    *args, **kwargs
                )  # returns "<cache_key_name>:<hash>" - we pass redis_namespace in async_pre_call_hook. Done to avoid rewriting the async_set_cache logic
            if cache_key is not None and self.in_memory_cache is not None:
                cache_control_args = kwargs.get("cache", {})
                max_age = cache_control_args.get(
                    "s-max-age", cache_control_args.get("s-maxage", float("inf"))
                )
                cached_result = self.in_memory_cache.get_cache(
                    cache_key, *args, **kwargs
                )
                return litellm.cache._get_cache_logic(
                    cached_result=cached_result, max_age=max_age
                )
        except Exception as e:
            return None
@@ -6,7 +6,7 @@ import time
 class MyUser(HttpUser):
     wait_time = between(1, 5)

-    @task
+    @task(3)
    def chat_completion(self):
        headers = {
            "Content-Type": "application/json",

@@ -31,62 +31,8 @@ class MyUser(HttpUser):

     @task(10)
     def health_readiness(self):
-        start_time = time.time()
         response = self.client.get("health/readiness")
-        response_time = time.time() - start_time
-        if response_time > 1:
-            events.request_failure.fire(
-                request_type="GET",
-                name="health/readiness",
-                response_time=response_time,
-                exception=None,
-                response=response,
-            )

     @task(10)
     def health_liveliness(self):
-        start_time = time.time()
         response = self.client.get("health/liveliness")
-        response_time = time.time() - start_time
-        if response_time > 1:
-            events.request_failure.fire(
-                request_type="GET",
-                name="health/liveliness",
-                response_time=response_time,
-                exception=None,
-                response=response,
-            )
-
-    # @task
-    # def key_generate(self):
-    #     headers = {
-    #         "Authorization": "Bearer sk-1234",
-    #         "Content-Type": "application/json",
-    #     }
-
-    #     payload = {
-    #         "models": ["gpt-3.5-turbo", "gpt-4", "claude-2"],
-    #         "duration": "20m",
-    #         "metadata": {"user": "ishaan@berri.ai"},
-    #         "team_id": "core-infra",
-    #         "max_budget": 10,
-    #         "soft_budget": 5,
-    #     }
-
-    #     response = self.client.post("key/generate", json=payload, headers=headers)
-
-    #     if response.status_code == 200:
-    #         key_response = response.json()
-    #         models = key_response.get("models", [])
-    #         if models:
-    #             # Use the first model from the key generation response to make a chat completions request
-    #             model_to_use = models[0]
-    #             chat_payload = {
-    #                 "model": model_to_use,
-    #                 "messages": [
-    #                     {"role": "system", "content": "You are a chat bot."},
-    #                     {"role": "user", "content": "Hello, how are you?"},
-    #                 ],
-    #             }
-    #             chat_response = self.client.post("chat/completions", json=chat_payload, headers=headers)
-    #             # # Print or log the chat response if needed
@ -8,7 +8,6 @@ import hashlib, uuid
|
||||||
import warnings
|
import warnings
|
||||||
import importlib
|
import importlib
|
||||||
import warnings
|
import warnings
|
||||||
import backoff
|
|
||||||
|
|
||||||
|
|
||||||
def showwarning(message, category, filename, lineno, file=None, line=None):
|
def showwarning(message, category, filename, lineno, file=None, line=None):
|
||||||
|
@ -35,7 +34,6 @@ try:
|
||||||
import orjson
|
import orjson
|
||||||
import logging
|
import logging
|
||||||
from apscheduler.schedulers.asyncio import AsyncIOScheduler
|
from apscheduler.schedulers.asyncio import AsyncIOScheduler
|
||||||
from argon2 import PasswordHasher
|
|
||||||
except ImportError as e:
|
except ImportError as e:
|
||||||
raise ImportError(f"Missing dependency {e}. Run `pip install 'litellm[proxy]'`")
|
raise ImportError(f"Missing dependency {e}. Run `pip install 'litellm[proxy]'`")
|
||||||
|
|
||||||
|
@ -145,9 +143,12 @@ from typing import Union
|
||||||
try:
|
try:
|
||||||
# when using litellm cli
|
# when using litellm cli
|
||||||
import litellm.proxy.enterprise as enterprise
|
import litellm.proxy.enterprise as enterprise
|
||||||
except:
|
except Exception as e:
|
||||||
# when using litellm docker image
|
# when using litellm docker image
|
||||||
|
try:
|
||||||
import enterprise # type: ignore
|
import enterprise # type: ignore
|
||||||
|
except Exception as e:
|
||||||
|
pass
|
||||||
|
|
||||||
ui_link = f"/ui/"
|
ui_link = f"/ui/"
|
||||||
ui_message = (
|
ui_message = (
|
||||||
|
@ -252,7 +253,6 @@ user_headers = None
|
||||||
user_config_file_path = f"config_{int(time.time())}.yaml"
|
user_config_file_path = f"config_{int(time.time())}.yaml"
|
||||||
local_logging = True # writes logs to a local api_log.json file for debugging
|
local_logging = True # writes logs to a local api_log.json file for debugging
|
||||||
experimental = False
|
experimental = False
|
||||||
ph = PasswordHasher()
|
|
||||||
#### GLOBAL VARIABLES ####
|
#### GLOBAL VARIABLES ####
|
||||||
llm_router: Optional[litellm.Router] = None
|
llm_router: Optional[litellm.Router] = None
|
||||||
llm_model_list: Optional[list] = None
|
llm_model_list: Optional[list] = None
|
||||||
|
@ -382,7 +382,7 @@ async def user_api_key_auth(
|
||||||
return valid_token
|
return valid_token
|
||||||
|
|
||||||
try:
|
try:
|
||||||
is_master_key_valid = ph.verify(litellm_master_key_hash, api_key)
|
is_master_key_valid = secrets.compare_digest(api_key, master_key)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
is_master_key_valid = False
|
is_master_key_valid = False
|
||||||
|
|
||||||
|
@ -887,6 +887,9 @@ async def user_api_key_auth(
|
||||||
raise Exception(
|
raise Exception(
|
||||||
f"This key is made for LiteLLM UI, Tried to access route: {route}. Not allowed"
|
f"This key is made for LiteLLM UI, Tried to access route: {route}. Not allowed"
|
||||||
)
|
)
|
||||||
|
if valid_token is None:
|
||||||
|
# No token was found when looking up in the DB
|
||||||
|
raise Exception("Invalid token passed")
|
||||||
if valid_token_dict is not None:
|
if valid_token_dict is not None:
|
||||||
return UserAPIKeyAuth(api_key=api_key, **valid_token_dict)
|
return UserAPIKeyAuth(api_key=api_key, **valid_token_dict)
|
||||||
else:
|
else:
|
||||||
|
@ -1420,6 +1423,8 @@ async def update_cache(
|
||||||
try:
|
try:
|
||||||
for _id in user_ids:
|
for _id in user_ids:
|
||||||
# Fetch the existing cost for the given user
|
# Fetch the existing cost for the given user
|
||||||
|
if _id is None:
|
||||||
|
continue
|
||||||
existing_spend_obj = await user_api_key_cache.async_get_cache(key=_id)
|
existing_spend_obj = await user_api_key_cache.async_get_cache(key=_id)
|
||||||
if existing_spend_obj is None:
|
if existing_spend_obj is None:
|
||||||
# if user does not exist in LiteLLM_UserTable, create a new user
|
# if user does not exist in LiteLLM_UserTable, create a new user
|
||||||
|
@@ -1791,6 +1796,16 @@ class ProxyConfig:
                            _ENTERPRISE_PromptInjectionDetection()
                        )
                        imported_list.append(prompt_injection_detection_obj)
+                   elif (
+                       isinstance(callback, str)
+                       and callback == "batch_redis_requests"
+                   ):
+                       from litellm.proxy.hooks.batch_redis_get import (
+                           _PROXY_BatchRedisRequests,
+                       )
+
+                       batch_redis_obj = _PROXY_BatchRedisRequests()
+                       imported_list.append(batch_redis_obj)
                    else:
                        imported_list.append(
                            get_instance_fn(
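The new `elif` branch above maps the config string "batch_redis_requests" onto a concrete proxy hook instance before it is appended to `imported_list`. For illustration only, a hedged sketch of that dispatch shape; `resolve_callback` is a hypothetical helper, not a litellm API:

def resolve_callback(callback):
    # a known string resolves to a concrete hook; anything else would fall
    # through to the generic get_instance_fn() loading path in the real code
    if isinstance(callback, str) and callback == "batch_redis_requests":
        from litellm.proxy.hooks.batch_redis_get import _PROXY_BatchRedisRequests

        return _PROXY_BatchRedisRequests()
    return callback
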
@@ -1913,7 +1928,7 @@ class ProxyConfig:
                master_key = litellm.get_secret(master_key)

            if master_key is not None and isinstance(master_key, str):
-               litellm_master_key_hash = ph.hash(master_key)
+               litellm_master_key_hash = master_key
            ### CUSTOM API KEY AUTH ###
            ## pass filepath
            custom_auth = general_settings.get("custom_auth", None)
@@ -474,11 +474,10 @@ def test_redis_cache_completion_stream():
 # test_redis_cache_completion_stream()


-def test_redis_cache_acompletion_stream():
-    import asyncio
-
+@pytest.mark.asyncio
+async def test_redis_cache_acompletion_stream():
     try:
-        litellm.set_verbose = False
+        litellm.set_verbose = True
         random_word = generate_random_word()
         messages = [
             {
|
||||||
response_1_content = ""
|
response_1_content = ""
|
||||||
response_2_content = ""
|
response_2_content = ""
|
||||||
|
|
||||||
async def call1():
|
|
||||||
nonlocal response_1_content
|
|
||||||
response1 = await litellm.acompletion(
|
response1 = await litellm.acompletion(
|
||||||
model="gpt-3.5-turbo",
|
model="gpt-3.5-turbo",
|
||||||
messages=messages,
|
messages=messages,
|
||||||
|
@@ -509,12 +506,9 @@ def test_redis_cache_acompletion_stream():
             response_1_content += chunk.choices[0].delta.content or ""
         print(response_1_content)

-        asyncio.run(call1())
         time.sleep(0.5)
         print("\n\n Response 1 content: ", response_1_content, "\n\n")

-        async def call2():
-            nonlocal response_2_content
         response2 = await litellm.acompletion(
             model="gpt-3.5-turbo",
             messages=messages,
@@ -526,7 +520,6 @@ def test_redis_cache_acompletion_stream():
             response_2_content += chunk.choices[0].delta.content or ""
         print(response_2_content)

-        asyncio.run(call2())
         print("\nresponse 1", response_1_content)
         print("\nresponse 2", response_2_content)
         assert (
@@ -536,14 +529,15 @@ def test_redis_cache_acompletion_stream():
         litellm.success_callback = []
         litellm._async_success_callback = []
     except Exception as e:
-        print(e)
+        print(f"{str(e)}\n\n{traceback.format_exc()}")
         raise e


 # test_redis_cache_acompletion_stream()


-def test_redis_cache_acompletion_stream_bedrock():
+@pytest.mark.asyncio
+async def test_redis_cache_acompletion_stream_bedrock():
     import asyncio

     try:
|
||||||
response_1_content = ""
|
response_1_content = ""
|
||||||
response_2_content = ""
|
response_2_content = ""
|
||||||
|
|
||||||
async def call1():
|
|
||||||
nonlocal response_1_content
|
|
||||||
response1 = await litellm.acompletion(
|
response1 = await litellm.acompletion(
|
||||||
model="bedrock/anthropic.claude-v2",
|
model="bedrock/anthropic.claude-v2",
|
||||||
messages=messages,
|
messages=messages,
|
||||||
|
@@ -579,12 +571,9 @@ def test_redis_cache_acompletion_stream_bedrock():
             response_1_content += chunk.choices[0].delta.content or ""
         print(response_1_content)

-        asyncio.run(call1())
         time.sleep(0.5)
         print("\n\n Response 1 content: ", response_1_content, "\n\n")

-        async def call2():
-            nonlocal response_2_content
         response2 = await litellm.acompletion(
             model="bedrock/anthropic.claude-v2",
             messages=messages,

@@ -597,7 +586,6 @@ def test_redis_cache_acompletion_stream_bedrock():
             response_2_content += chunk.choices[0].delta.content or ""
         print(response_2_content)

-        asyncio.run(call2())
         print("\nresponse 1", response_1_content)
         print("\nresponse 2", response_2_content)
         assert (
@@ -612,8 +600,8 @@ def test_redis_cache_acompletion_stream_bedrock():
         raise e


-@pytest.mark.skip(reason="AWS Suspended Account")
-def test_s3_cache_acompletion_stream_azure():
+@pytest.mark.asyncio
+async def test_s3_cache_acompletion_stream_azure():
     import asyncio

     try:

@@ -637,8 +625,6 @@ def test_s3_cache_acompletion_stream_azure():
         response_1_created = ""
         response_2_created = ""

-        async def call1():
-            nonlocal response_1_content, response_1_created
         response1 = await litellm.acompletion(
             model="azure/chatgpt-v-2",
             messages=messages,

@@ -652,12 +638,9 @@ def test_s3_cache_acompletion_stream_azure():
             response_1_content += chunk.choices[0].delta.content or ""
         print(response_1_content)

-        asyncio.run(call1())
         time.sleep(0.5)
         print("\n\n Response 1 content: ", response_1_content, "\n\n")

-        async def call2():
-            nonlocal response_2_content, response_2_created
         response2 = await litellm.acompletion(
             model="azure/chatgpt-v-2",
             messages=messages,

@@ -671,7 +654,6 @@ def test_s3_cache_acompletion_stream_azure():
             response_2_created = chunk.created
         print(response_2_content)

-        asyncio.run(call2())
         print("\nresponse 1", response_1_content)
         print("\nresponse 2", response_2_content)

@@ -358,7 +358,7 @@ def test_completion_mistral_azure():
                 }
             ],
         )
-        # Add any assertions here to check the response
+        # Add any assertions here to check, the response
         print(response)

     except Exception as e:

@@ -575,6 +575,25 @@ def test_completion_azure_gpt4_vision():
 # test_completion_azure_gpt4_vision()


+def test_completion_fireworks_ai():
+    try:
+        litellm.set_verbose = True
+        messages = [
+            {"role": "system", "content": "You're a good bot"},
+            {
+                "role": "user",
+                "content": "Hey",
+            },
+        ]
+        response = completion(
+            model="fireworks_ai/accounts/fireworks/models/mixtral-8x7b-instruct",
+            messages=messages,
+        )
+        print(response)
+    except Exception as e:
+        pytest.fail(f"Error occurred: {e}")
+
+
 @pytest.mark.skip(reason="this test is flaky")
 def test_completion_perplexity_api():
     try:
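The new test exercises the `fireworks_ai/` provider prefix that this diff wires up in `get_llm_provider` further down. For illustration only, a minimal call sketch; it assumes a valid key is exported in one of the environment variables the provider lookup checks (for example `FIREWORKS_AI_API_KEY`):

import litellm

# assumes FIREWORKS_AI_API_KEY (or FIREWORKS_API_KEY) is set in the environment
response = litellm.completion(
    model="fireworks_ai/accounts/fireworks/models/mixtral-8x7b-instruct",
    messages=[{"role": "user", "content": "Hey"}],
)
print(response.choices[0].message.content)
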
@@ -97,27 +97,23 @@ class TmpFunction:
     )


-def test_async_chat_openai_stream():
+@pytest.mark.asyncio
+async def test_async_chat_openai_stream():
     try:
         tmp_function = TmpFunction()
         litellm.set_verbose = True
         litellm.success_callback = [tmp_function.async_test_logging_fn]
         complete_streaming_response = ""

-        async def call_gpt():
-            nonlocal complete_streaming_response
         response = await litellm.acompletion(
             model="gpt-3.5-turbo",
             messages=[{"role": "user", "content": "Hi 👋 - i'm openai"}],
             stream=True,
         )
         async for chunk in response:
-            complete_streaming_response += (
-                chunk["choices"][0]["delta"]["content"] or ""
-            )
+            complete_streaming_response += chunk["choices"][0]["delta"]["content"] or ""
         print(complete_streaming_response)

-        asyncio.run(call_gpt())
         complete_streaming_response = complete_streaming_response.strip("'")
         response1 = tmp_function.complete_streaming_response_in_callback["choices"][0][
             "message"
@@ -130,7 +126,7 @@ def test_async_chat_openai_stream():
         assert tmp_function.async_success == True
     except Exception as e:
         print(e)
-        pytest.fail(f"An error occurred - {str(e)}")
+        pytest.fail(f"An error occurred - {str(e)}\n\n{traceback.format_exc()}")


 # test_async_chat_openai_stream()

@@ -117,6 +117,8 @@ def test_openai_azure_embedding_simple():

         print("Calculated request cost=", request_cost)
+
+        assert isinstance(response.usage, litellm.Usage)

     except Exception as e:
         pytest.fail(f"Error occurred: {e}")

@@ -204,6 +206,8 @@ def test_cohere_embedding():
             input=["good morning from litellm", "this is another item"],
         )
         print(f"response:", response)
+
+        assert isinstance(response.usage, litellm.Usage)
     except Exception as e:
         pytest.fail(f"Error occurred: {e}")

@@ -269,6 +273,8 @@ def test_bedrock_embedding_titan():

         assert end_time - start_time < 0.1
         litellm.disable_cache()
+
+        assert isinstance(response.usage, litellm.Usage)
     except Exception as e:
         pytest.fail(f"Error occurred: {e}")

@@ -295,6 +301,8 @@ def test_bedrock_embedding_cohere():
             isinstance(x, float) for x in response["data"][0]["embedding"]
         ), "Expected response to be a list of floats"
         # print(f"response:", response)
+
+        assert isinstance(response.usage, litellm.Usage)
     except Exception as e:
         pytest.fail(f"Error occurred: {e}")

@@ -331,6 +339,8 @@ def test_hf_embedding():
             input=["good morning from litellm", "this is another item"],
         )
         print(f"response:", response)
+
+        assert isinstance(response.usage, litellm.Usage)
     except Exception as e:
         # Note: Huggingface inference API is unstable and fails with "model loading errors all the time"
         pass

@@ -386,6 +396,8 @@ def test_aembedding_azure():
             response._hidden_params["custom_llm_provider"],
         )
         assert response._hidden_params["custom_llm_provider"] == "azure"
+
+        assert isinstance(response.usage, litellm.Usage)
     except Exception as e:
         pytest.fail(f"Error occurred: {e}")

@@ -440,6 +452,7 @@ def test_mistral_embeddings():
             input=["good morning from litellm"],
         )
         print(f"response: {response}")
+        assert isinstance(response.usage, litellm.Usage)
     except Exception as e:
         pytest.fail(f"Error occurred: {e}")

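The embedding-test hunks above all add the same check: the response must carry a typed `litellm.Usage` object rather than a plain dict. For illustration only, a hedged sketch of what such a check can look at, assuming the OpenAI-style token-count fields that `litellm.Usage` mirrors; `check_usage` is a hypothetical helper:

import litellm


def check_usage(response) -> None:
    # usage should be a litellm.Usage instance with OpenAI-style token counts
    assert isinstance(response.usage, litellm.Usage)
    assert response.usage.prompt_tokens >= 0
    assert response.usage.total_tokens >= response.usage.prompt_tokens
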
@@ -158,7 +158,7 @@ def test_call_with_invalid_key(prisma_client):

     async def test():
         await litellm.proxy.proxy_server.prisma_client.connect()
-        generated_key = "bad-key"
+        generated_key = "sk-126666"
         bearer_token = "Bearer " + generated_key

         request = Request(scope={"type": "http"}, receive=None)
@@ -173,7 +173,7 @@ def test_call_with_invalid_key(prisma_client):
        except Exception as e:
            print("Got Exception", e)
            print(e.message)
-           assert "Authentication Error" in e.message
+           assert "Authentication Error, Invalid token passed" in e.message
            pass

@@ -72,7 +72,7 @@ from .integrations.litedebugger import LiteDebugger
 from .proxy._types import KeyManagementSystem
 from openai import OpenAIError as OriginalError
 from openai._models import BaseModel as OpenAIObject
-from .caching import S3Cache, RedisSemanticCache
+from .caching import S3Cache, RedisSemanticCache, RedisCache
 from .exceptions import (
     AuthenticationError,
     BadRequestError,
@@ -1795,6 +1795,11 @@ class Logging:
                        )
                        result = kwargs["async_complete_streaming_response"]
                        # only add to cache once we have a complete streaming response
+                       if litellm.cache is not None and not isinstance(
+                           litellm.cache.cache, S3Cache
+                       ):
+                           await litellm.cache.async_add_cache(result, **kwargs)
+                       else:
                            litellm.cache.add_cache(result, **kwargs)
                if isinstance(callback, CustomLogger):  # custom logger class
                    print_verbose(
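The hunk above routes a completed streaming response through `async_add_cache`, except when the configured backend is `S3Cache`, which stays on the synchronous `add_cache` path. For illustration only, that dispatch in isolation; `write_to_cache` and `cache_client` are placeholders, not litellm internals:

from litellm.caching import S3Cache


async def write_to_cache(cache_client, result, **kwargs):
    # prefer the async write unless the backend is S3, mirroring the diff above
    if cache_client is None:
        return
    if not isinstance(cache_client.cache, S3Cache):
        await cache_client.async_add_cache(result, **kwargs)
    else:
        cache_client.add_cache(result, **kwargs)
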
@@ -2589,7 +2594,7 @@ def client(original_function):
            if (
                kwargs.get("max_tokens", None) is not None
                and model is not None
-               and litellm.drop_params
+               and litellm.modify_params
                == True  # user is okay with params being modified
                and (
                    call_type == CallTypes.acompletion.value
|
||||||
):
|
):
|
||||||
if len(cached_result) == 1 and cached_result[0] is None:
|
if len(cached_result) == 1 and cached_result[0] is None:
|
||||||
cached_result = None
|
cached_result = None
|
||||||
elif isinstance(litellm.cache.cache, RedisSemanticCache):
|
elif isinstance(
|
||||||
|
litellm.cache.cache, RedisSemanticCache
|
||||||
|
) or isinstance(litellm.cache.cache, RedisCache):
|
||||||
preset_cache_key = litellm.cache.get_cache_key(*args, **kwargs)
|
preset_cache_key = litellm.cache.get_cache_key(*args, **kwargs)
|
||||||
kwargs["preset_cache_key"] = (
|
kwargs["preset_cache_key"] = (
|
||||||
preset_cache_key # for streaming calls, we need to pass the preset_cache_key
|
preset_cache_key # for streaming calls, we need to pass the preset_cache_key
|
||||||
|
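With plain `RedisCache` now handled alongside `RedisSemanticCache`, the wrapper computes the cache key once and forwards it as `kwargs["preset_cache_key"]`, so a streamed call and its eventual cache write agree on the same key. For illustration only, that pattern in isolation; `start_call` is a placeholder name, not a litellm function:

def start_call(cache, *args, **kwargs):
    # compute the key up front so the streaming path can reuse it later
    kwargs["preset_cache_key"] = cache.get_cache_key(*args, **kwargs)
    return kwargs
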
@@ -5375,6 +5382,17 @@ def get_llm_provider(
            # groq is openai compatible, we just need to set this to custom_openai and have the api_base be https://api.groq.com/openai/v1
            api_base = "https://api.groq.com/openai/v1"
            dynamic_api_key = get_secret("GROQ_API_KEY")
+       elif custom_llm_provider == "fireworks_ai":
+           # fireworks is openai compatible, we just need to set this to custom_openai and have the api_base be https://api.groq.com/openai/v1
+           if not model.startswith("accounts/fireworks/models"):
+               model = f"accounts/fireworks/models/{model}"
+           api_base = "https://api.fireworks.ai/inference/v1"
+           dynamic_api_key = (
+               get_secret("FIREWORKS_API_KEY")
+               or get_secret("FIREWORKS_AI_API_KEY")
+               or get_secret("FIREWORKSAI_API_KEY")
+               or get_secret("FIREWORKS_AI_TOKEN")
+           )
        elif custom_llm_provider == "mistral":
            # mistral is openai compatible, we just need to set this to custom_openai and have the api_base be https://api.mistral.ai
            api_base = (
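The added branch normalizes short Fireworks model names onto the `accounts/fireworks/models/...` namespace and resolves the API key from several possible environment variables. For illustration only, the prefixing rule on its own; `normalize_fireworks_model` is a hypothetical helper:

def normalize_fireworks_model(model: str) -> str:
    # "mixtral-8x7b-instruct" -> "accounts/fireworks/models/mixtral-8x7b-instruct"
    if not model.startswith("accounts/fireworks/models"):
        model = f"accounts/fireworks/models/{model}"
    return model


assert normalize_fireworks_model("mixtral-8x7b-instruct") == (
    "accounts/fireworks/models/mixtral-8x7b-instruct"
)
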
@@ -631,6 +631,13 @@
        "litellm_provider": "groq",
        "mode": "chat"
    },
+   "groq/gemma-7b-it": {
+       "max_tokens": 8192,
+       "input_cost_per_token": 0.00000010,
+       "output_cost_per_token": 0.00000010,
+       "litellm_provider": "groq",
+       "mode": "chat"
+   },
    "claude-instant-1.2": {
        "max_tokens": 100000,
        "max_output_tokens": 8191,
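The new `groq/gemma-7b-it` entry prices both input and output at 0.00000010 USD per token (0.10 USD per million tokens) with an 8192-token window. For illustration only, the cost arithmetic implied by that entry:

INPUT_COST_PER_TOKEN = 0.00000010  # from the entry above
OUTPUT_COST_PER_TOKEN = 0.00000010


def request_cost(prompt_tokens: int, completion_tokens: int) -> float:
    return (
        prompt_tokens * INPUT_COST_PER_TOKEN
        + completion_tokens * OUTPUT_COST_PER_TOKEN
    )


# e.g. 1,000 prompt tokens + 500 completion tokens ~= 0.00015 USD
print(request_cost(1000, 500))
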
@@ -45,13 +45,15 @@ litellm_settings:
   budget_duration: 30d
   num_retries: 5
   request_timeout: 600
+  cache: true
+  callbacks: ["batch_redis_requests"]
 general_settings:
   master_key: sk-1234 # [OPTIONAL] Only use this if you to require all calls to contain this key (Authorization: Bearer sk-1234)
   proxy_budget_rescheduler_min_time: 60
   proxy_budget_rescheduler_max_time: 64
   # database_url: "postgresql://<user>:<password>@<host>:<port>/<dbname>" # [OPTIONAL] use for token-based auth to proxy

-environment_variables:
+# environment_variables:
   # settings for using redis caching
   # REDIS_HOST: redis-16337.c322.us-east-1-2.ec2.cloud.redislabs.com
   # REDIS_PORT: "16337"
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "litellm"
-version = "1.31.12"
+version = "1.31.16"
 description = "Library to easily interface with LLM API providers"
 authors = ["BerriAI"]
 license = "MIT"

@@ -76,7 +76,7 @@ requires = ["poetry-core", "wheel"]
 build-backend = "poetry.core.masonry.api"

 [tool.commitizen]
-version = "1.31.12"
+version = "1.31.16"
 version_files = [
     "pyproject.toml:^version"
 ]

@@ -34,5 +34,4 @@ jinja2==3.1.3 # for prompt templates
 certifi>=2023.7.22 # [TODO] clean up
 aiohttp==3.9.0 # for network calls
 aioboto3==12.3.0 # for async sagemaker calls
-argon2-cffi==23.1.0 # for checking secrets
 ####