diff --git a/.dockerignore b/.dockerignore
new file mode 100644
index 000000000..efff383d4
--- /dev/null
+++ b/.dockerignore
@@ -0,0 +1,5 @@
+/docs
+/cookbook
+/.circleci
+/.github
+/tests
\ No newline at end of file
diff --git a/.github/workflows/ghcr_deploy.yml b/.github/workflows/ghcr_deploy.yml
index d7cf4271c..f6e88bbd8 100644
--- a/.github/workflows/ghcr_deploy.yml
+++ b/.github/workflows/ghcr_deploy.yml
@@ -10,6 +10,7 @@ on:
env:
REGISTRY: ghcr.io
IMAGE_NAME: ${{ github.repository }}
+ CHART_NAME: litellm
# There is a single job in this workflow. It's configured to run on the latest available version of Ubuntu.
jobs:
@@ -103,6 +104,11 @@ jobs:
uses: docker/metadata-action@9ec57ed1fcdbf14dcef7dfbe97b2010124a938b7
with:
images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}-database
+ # Configure multi platform Docker builds
+ - name: Set up QEMU
+ uses: docker/setup-qemu-action@e0e4588fad221d38ee467c0bffd91115366dc0c5
+ - name: Set up Docker Buildx
+ uses: docker/setup-buildx-action@edfb0fe6204400c56fbfd3feba3fe9ad1adfa345
- name: Build and push Database Docker image
uses: docker/build-push-action@f2a1d5e99d037542a71f64918e516c093c6f3fc4
@@ -112,6 +118,60 @@ jobs:
push: true
tags: ${{ steps.meta-database.outputs.tags }}-${{ github.event.inputs.tag || 'latest' }}, ${{ steps.meta-database.outputs.tags }}-latest
labels: ${{ steps.meta-database.outputs.labels }}
+ platforms: local,linux/amd64,linux/arm64,linux/arm64/v8
+ build-and-push-helm-chart:
+ runs-on: ubuntu-latest
+ steps:
+ - name: Checkout repository
+ uses: actions/checkout@v4
+
+ - name: Log in to the Container registry
+ uses: docker/login-action@65b78e6e13532edd9afa3aa52ac7964289d1a9c1
+ with:
+ registry: ${{ env.REGISTRY }}
+ username: ${{ github.actor }}
+ password: ${{ secrets.GITHUB_TOKEN }}
+
+ - name: lowercase github.repository_owner
+ run: |
+ echo "REPO_OWNER=`echo ${{github.repository_owner}} | tr '[:upper:]' '[:lower:]'`" >>${GITHUB_ENV}
+ - name: Get LiteLLM Latest Tag
+ id: current_app_tag
+ uses: WyriHaximus/github-action-get-previous-tag@v1.3.0
+
+ - name: Get last published chart version
+ id: current_version
+ shell: bash
+ run: |
+ CHART_LIST=$(helm show chart oci://${{ env.REGISTRY }}/${{ env.REPO_OWNER }}/${{ env.CHART_NAME }} 2>/dev/null || true)
+ if [ -z "${CHART_LIST}" ]; then
+ echo "current-version=0.1.0" | tee -a $GITHUB_OUTPUT
+ else
+ printf '%s' "${CHART_LIST}" | grep '^version:' | awk 'BEGIN{FS=":"}{print "current-version="$2}' | tr -d " " | tee -a $GITHUB_OUTPUT
+ fi
+ env:
+ HELM_EXPERIMENTAL_OCI: '1'
+
+ # Automatically update the helm chart version one "patch" level
+ - name: Bump release version
+ id: bump_version
+ uses: christian-draeger/increment-semantic-version@1.1.0
+ with:
+ current-version: ${{ steps.current_version.outputs.current-version || '0.1.0' }}
+ version-fragment: 'bug'
+
+ - uses: ./.github/actions/helm-oci-chart-releaser
+ with:
+ name: ${{ env.CHART_NAME }}
+ repository: ${{ env.REPO_OWNER }}
+ tag: ${{ github.event.inputs.chartVersion || steps.bump_version.outputs.next-version || '0.1.0' }}
+ app_version: ${{ steps.current_app_tag.outputs.tag || 'latest' }}
+ path: deploy/charts/${{ env.CHART_NAME }}
+ registry: ${{ env.REGISTRY }}
+ registry_username: ${{ github.actor }}
+ registry_password: ${{ secrets.GITHUB_TOKEN }}
+ update_dependencies: true
+
release:
name: "New LiteLLM Release"
needs: [docker-hub-deploy, build-and-push-image, build-and-push-image-database]
@@ -171,13 +231,13 @@ jobs:
RELEASE_NOTES: ${{ steps.release-notes.outputs.result }}
run: |
curl -H "Content-Type: application/json" -X POST -d '{
- "content": "||@everyone||",
+ "content": "New LiteLLM release ${{ env.RELEASE_TAG }}",
"username": "Release Changelog",
"avatar_url": "https://cdn.discordapp.com/avatars/487431320314576937/bd64361e4ba6313d561d54e78c9e7171.png",
"embeds": [
{
- "title": "Changelog for ${RELEASE_TAG}",
- "description": "${RELEASE_NOTES}",
+ "title": "Changelog for LiteLLM ${{ env.RELEASE_TAG }}",
+ "description": "${{ env.RELEASE_NOTES }}",
"color": 2105893
}
]
diff --git a/.github/workflows/interpret_load_test.py b/.github/workflows/interpret_load_test.py
new file mode 100644
index 000000000..b52d4d2b3
--- /dev/null
+++ b/.github/workflows/interpret_load_test.py
@@ -0,0 +1,91 @@
+import csv
+import os
+from github import Github
+
+
+def interpret_results(csv_file):
+ with open(csv_file, newline="") as csvfile:
+ csvreader = csv.DictReader(csvfile)
+ rows = list(csvreader)
+ """
+ in this csv reader
+ - Create 1 new column "Status"
+ - if a row has a median response time < 300 and an average response time < 300, Status = "Passed ✅"
+ - if a row has a median response time >= 300 or an average response time >= 300, Status = "Failed ❌"
+ - Order the table in this order Name, Status, Median Response Time, Average Response Time, Requests/s,Failures/s, Min Response Time, Max Response Time, all other columns
+ """
+
+ # Add a new column "Status"
+ for row in rows:
+ median_response_time = float(
+ row["Median Response Time"].strip().rstrip("ms")
+ )
+ average_response_time = float(
+ row["Average Response Time"].strip().rstrip("s")
+ )
+
+ request_count = int(row["Request Count"])
+ failure_count = int(row["Failure Count"])
+
+ failure_percent = round((failure_count / request_count) * 100, 2)
+
+ # Determine status based on conditions
+ if (
+ median_response_time < 300
+ and average_response_time < 300
+ and failure_percent < 5
+ ):
+ row["Status"] = "Passed ✅"
+ else:
+ row["Status"] = "Failed ❌"
+
+ # Construct Markdown table header
+ markdown_table = "| Name | Status | Median Response Time (ms) | Average Response Time (ms) | Requests/s | Failures/s | Request Count | Failure Count | Min Response Time (ms) | Max Response Time (ms) |"
+ markdown_table += (
+ "\n| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |"
+ )
+
+ # Construct Markdown table rows
+ for row in rows:
+ markdown_table += f"\n| {row['Name']} | {row['Status']} | {row['Median Response Time']} | {row['Average Response Time']} | {row['Requests/s']} | {row['Failures/s']} | {row['Request Count']} | {row['Failure Count']} | {row['Min Response Time']} | {row['Max Response Time']} |"
+ print("markdown table: ", markdown_table)
+ return markdown_table
+
+
+if __name__ == "__main__":
+ csv_file = "load_test_stats.csv" # Change this to the path of your CSV file
+ markdown_table = interpret_results(csv_file)
+
+ # Update release body with interpreted results
+ github_token = os.getenv("GITHUB_TOKEN")
+ g = Github(github_token)
+ repo = g.get_repo(
+ "BerriAI/litellm"
+ ) # Replace with your repository's username and name
+ latest_release = repo.get_latest_release()
+ print("got latest release: ", latest_release)
+ print("latest release body: ", latest_release.body)
+ print("markdown table: ", markdown_table)
+
+ # check if "Load Test LiteLLM Proxy Results" exists
+ existing_release_body = latest_release.body
+ if "Load Test LiteLLM Proxy Results" in latest_release.body:
+ # find the "Load Test LiteLLM Proxy Results" section and delete it
+ start_index = latest_release.body.find("Load Test LiteLLM Proxy Results")
+ existing_release_body = latest_release.body[:start_index]
+
+ new_release_body = (
+ existing_release_body
+ + "\n\n"
+ + "## Load Test LiteLLM Proxy Results"
+ + "\n\n"
+ + markdown_table
+ )
+ print("new release body: ", new_release_body)
+ try:
+ latest_release.update_release(
+ name=latest_release.tag_name,
+ message=new_release_body,
+ )
+ except Exception as e:
+ print(e)
diff --git a/.github/workflows/load_test.yml b/.github/workflows/load_test.yml
index ed0c34fbd..ddf613fa6 100644
--- a/.github/workflows/load_test.yml
+++ b/.github/workflows/load_test.yml
@@ -1,6 +1,11 @@
name: Test Locust Load Test
-on: [push]
+on:
+ workflow_run:
+ workflows: ["Build, Publish LiteLLM Docker Image. New Release"]
+ types:
+ - completed
+ workflow_dispatch:
jobs:
build:
@@ -8,15 +13,32 @@ jobs:
steps:
- name: Checkout
uses: actions/checkout@v1
+ - name: Setup Python
+ uses: actions/setup-python@v2
+ with:
+ python-version: '3.x'
+
+ - name: Install dependencies
+ run: |
+ python -m pip install --upgrade pip
+ pip install PyGithub
- name: Run Load Test
id: locust_run
uses: BerriAI/locust-github-action@master
with:
LOCUSTFILE: ".github/workflows/locustfile.py"
- URL: "https://litellm-api.up.railway.app/"
+ URL: "https://litellm-database-docker-build-production.up.railway.app/"
USERS: "100"
RATE: "10"
- RUNTIME: "60s"
+ RUNTIME: "300s"
+ - name: Process Load Test Stats
+ run: |
+ echo "Current working directory: $PWD"
+ ls
+ python ".github/workflows/interpret_load_test.py"
+ env:
+ GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+ working-directory: ${{ github.workspace }}
- name: Upload CSV as Asset to Latest Release
uses: xresloader/upload-to-github-release@v1
env:
@@ -25,4 +47,4 @@ jobs:
file: "load_test_stats.csv;load_test.html"
update_latest_release: true
tag_name: "load-test"
- overwrite: true
+ overwrite: true
\ No newline at end of file
diff --git a/.github/workflows/locustfile.py b/.github/workflows/locustfile.py
index 5efdca84d..9e5b62ff0 100644
--- a/.github/workflows/locustfile.py
+++ b/.github/workflows/locustfile.py
@@ -1,4 +1,6 @@
-from locust import HttpUser, task, between
+from locust import HttpUser, task, between, events
+import json
+import time
class MyUser(HttpUser):
@@ -8,7 +10,7 @@ class MyUser(HttpUser):
def chat_completion(self):
headers = {
"Content-Type": "application/json",
- "Authorization": f"Bearer sk-1234",
+ "Authorization": f"Bearer sk-gUvTeN9g0sgHBMf9HeCaqA",
# Include any additional headers you may need for authentication, etc.
}
@@ -26,3 +28,15 @@ class MyUser(HttpUser):
response = self.client.post("chat/completions", json=payload, headers=headers)
# Print or log the response if needed
+
+ @task(10)
+ def health_readiness(self):
+ start_time = time.time()
+ response = self.client.get("health/readiness")
+ response_time = time.time() - start_time
+
+ @task(10)
+ def health_liveliness(self):
+ start_time = time.time()
+ response = self.client.get("health/liveliness")
+ response_time = time.time() - start_time
diff --git a/Dockerfile b/Dockerfile
index dd3012109..4408afb3d 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -66,4 +66,4 @@ ENTRYPOINT ["litellm"]
# Append "--detailed_debug" to the end of CMD to view detailed debug logs
# CMD ["--port", "4000", "--config", "./proxy_server_config.yaml", "--run_gunicorn", "--detailed_debug"]
-CMD ["--port", "4000", "--config", "./proxy_server_config.yaml", "--run_gunicorn", "--num_workers", "1"]
\ No newline at end of file
+CMD ["--port", "4000", "--config", "./proxy_server_config.yaml", "--run_gunicorn", "--num_workers", "4"]
diff --git a/deploy/charts/litellm/README.md b/deploy/charts/litellm/README.md
index 817781ed0..e005280b8 100644
--- a/deploy/charts/litellm/README.md
+++ b/deploy/charts/litellm/README.md
@@ -2,7 +2,7 @@
## Prerequisites
-- Kubernetes 1.23+
+- Kubernetes 1.21+
- Helm 3.8.0+
If `db.deployStandalone` is used:
@@ -33,6 +33,7 @@ If `db.useStackgresOperator` is used (not yet implemented):
| `proxy_config.*` | See [values.yaml](./values.yaml) for default settings. See [example_config_yaml](../../../litellm/proxy/example_config_yaml/) for configuration examples. | N/A |
#### Example `environmentSecrets` Secret
+
```
apiVersion: v1
kind: Secret
diff --git a/deploy/charts/litellm/values.yaml b/deploy/charts/litellm/values.yaml
index cc53fc59c..891c44f2a 100644
--- a/deploy/charts/litellm/values.yaml
+++ b/deploy/charts/litellm/values.yaml
@@ -6,7 +6,6 @@ replicaCount: 1
image:
# Use "ghcr.io/berriai/litellm-database" for optimized image with database
- # Alternatively, use "ghcr.io/berriai/litellm" for the default image
repository: ghcr.io/berriai/litellm-database
pullPolicy: IfNotPresent
# Overrides the image tag whose default is the chart appVersion.
@@ -85,10 +84,13 @@ proxy_config:
litellm_params:
model: gpt-3.5-turbo
api_key: eXaMpLeOnLy
+ - model_name: fake-openai-endpoint
+ litellm_params:
+ model: openai/fake
+ api_key: fake-key
+ api_base: https://exampleopenaiendpoint-production.up.railway.app/
general_settings:
master_key: os.environ/PROXY_MASTER_KEY
-# litellm_settings:
-# cache: true
resources: {}
# We usually recommend not to specify default resources and to leave this as a conscious
diff --git a/docs/my-website/docs/audio_transcription.md b/docs/my-website/docs/audio_transcription.md
index 09fa1a1b9..25eca6caa 100644
--- a/docs/my-website/docs/audio_transcription.md
+++ b/docs/my-website/docs/audio_transcription.md
@@ -77,9 +77,32 @@ litellm --config /path/to/config.yaml
### Test
+
+
+
```bash
-curl --location 'http://0.0.0.0:4000/v1/audio/transcriptions' \
+curl --location 'http://0.0.0.0:8000/v1/audio/transcriptions' \
--header 'Authorization: Bearer sk-1234' \
--form 'file=@"/Users/krrishdholakia/Downloads/gettysburg.wav"' \
--form 'model="whisper"'
```
+
+
+
+
+```python
+from openai import OpenAI
+client = OpenAI(
+ api_key="sk-1234",
+ base_url="http://0.0.0.0:8000"
+)
+
+
+audio_file = open("speech.mp3", "rb")
+transcript = client.audio.transcriptions.create(
+ model="whisper",
+ file=audio_file
+)
+```
+
+
\ No newline at end of file
diff --git a/docs/my-website/docs/langchain/langchain.md b/docs/my-website/docs/langchain/langchain.md
index fa5a0a96b..cc12767b8 100644
--- a/docs/my-website/docs/langchain/langchain.md
+++ b/docs/my-website/docs/langchain/langchain.md
@@ -133,3 +133,6 @@ chat(messages)
```
+
+## Use LangChain ChatLiteLLM + Langfuse
+Check out this section [here](../observability/langfuse_integration#use-langchain-chatlitellm--langfuse) for more details on how to integrate Langfuse with ChatLiteLLM.
diff --git a/docs/my-website/docs/load_test.md b/docs/my-website/docs/load_test.md
index f85ff9122..5eb6a0610 100644
--- a/docs/my-website/docs/load_test.md
+++ b/docs/my-website/docs/load_test.md
@@ -2,6 +2,54 @@ import Image from '@theme/IdealImage';
# 🔥 Load Test LiteLLM
+## How to run a locust load test on LiteLLM Proxy
+
+1. Add `fake-openai-endpoint` to your proxy config.yaml and start your litellm proxy.
+litellm provides a free hosted `fake-openai-endpoint` you can load test against:
+
+```yaml
+model_list:
+ - model_name: fake-openai-endpoint
+ litellm_params:
+ model: openai/fake
+ api_key: fake-key
+ api_base: https://exampleopenaiendpoint-production.up.railway.app/
+```
+
+2. `pip install locust`
+
+3. Create a file called `locustfile.py` on your local machine. Copy the contents from the litellm load test located [here](https://github.com/BerriAI/litellm/blob/main/.github/workflows/locustfile.py)
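+
+   If you just want a starting point, below is a minimal sketch of such a `locustfile.py` (here `sk-1234` stands in for a key your proxy accepts, and `fake-openai-endpoint` is the model added in step 1; adjust both for your setup):
+
+   ```python
+   from locust import HttpUser, task, between
+
+
+   class MyUser(HttpUser):
+       wait_time = between(1, 5)
+
+       @task
+       def chat_completion(self):
+           headers = {
+               "Content-Type": "application/json",
+               "Authorization": "Bearer sk-1234",  # a key accepted by your proxy
+           }
+           payload = {
+               "model": "fake-openai-endpoint",  # model_name from step 1
+               "messages": [{"role": "user", "content": "hello from the load test"}],
+           }
+           # Locust resolves this relative path against the Host set in the UI
+           self.client.post("chat/completions", json=payload, headers=headers)
+
+       @task(10)
+       def health_readiness(self):
+           self.client.get("health/readiness")
+   ```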
+
+4. Start locust
+   Run `locust` in the same directory as your `locustfile.py` from step 3
+
+ ```shell
+ locust
+ ```
+
+ Output on terminal
+ ```
+ [2024-03-15 07:19:58,893] Starting web interface at http://0.0.0.0:8089
+ [2024-03-15 07:19:58,898] Starting Locust 2.24.0
+ ```
+
+5. Run Load test on locust
+
+ Head to the locust UI on http://0.0.0.0:8089
+
+ Set Users=100, Ramp Up Users=10, Host=Base URL of your LiteLLM Proxy
+
+
+
+6. Expected Results
+
+ Expect to see the following response times for `/health/readiness`
+ Median → /health/readiness is `150ms`
+
+ Avg → /health/readiness is `219ms`
+
+
+
## Load Test LiteLLM Proxy - 1500+ req/s
## 1500+ concurrent requests/s
diff --git a/docs/my-website/docs/observability/langfuse_integration.md b/docs/my-website/docs/observability/langfuse_integration.md
index 294f3fb38..50b016d09 100644
--- a/docs/my-website/docs/observability/langfuse_integration.md
+++ b/docs/my-website/docs/observability/langfuse_integration.md
@@ -132,6 +132,41 @@ print(response)
```
+### Use LangChain ChatLiteLLM + Langfuse
+Pass `trace_user_id`, `session_id` in model_kwargs
+```python
+import os
+from langchain.chat_models import ChatLiteLLM
+from langchain.schema import HumanMessage
+import litellm
+
+# from https://cloud.langfuse.com/
+os.environ["LANGFUSE_PUBLIC_KEY"] = ""
+os.environ["LANGFUSE_SECRET_KEY"] = ""
+
+os.environ['OPENAI_API_KEY']=""
+
+# set langfuse as a callback, litellm will send the data to langfuse
+litellm.success_callback = ["langfuse"]
+
+chat = ChatLiteLLM(
+  model="gpt-3.5-turbo",
+  model_kwargs={
+    "metadata": {
+      "trace_user_id": "user-id2", # set langfuse Trace User ID
+      "session_id": "session-1", # set langfuse Session ID
+      "tags": ["tag1", "tag2"]
+    }
+  }
+)
+messages = [
+ HumanMessage(
+ content="what model are you"
+ )
+]
+chat(messages)
+```
+
## Troubleshooting & Errors
### Data not getting logged to Langfuse ?
@@ -142,4 +177,4 @@ print(response)
- [Schedule Demo 👋](https://calendly.com/d/4mp-gd3-k5k/berriai-1-1-onboarding-litellm-hosted-version)
- [Community Discord 💭](https://discord.gg/wuPM9dRgDw)
- Our numbers 📞 +1 (770) 8783-106 / +1 (412) 618-6238
-- Our emails ✉️ ishaan@berri.ai / krrish@berri.ai
\ No newline at end of file
+- Our emails ✉️ ishaan@berri.ai / krrish@berri.ai
diff --git a/docs/my-website/docs/providers/cohere.md b/docs/my-website/docs/providers/cohere.md
index c6efb3b40..71763e30d 100644
--- a/docs/my-website/docs/providers/cohere.md
+++ b/docs/my-website/docs/providers/cohere.md
@@ -49,7 +49,7 @@ for chunk in response:
| command-light | `completion('command-light', messages)` |
| command-medium | `completion('command-medium', messages)` |
| command-medium-beta | `completion('command-medium-beta', messages)` |
-| command-xlarge-beta | `completion('command-xlarge-beta', messages)` |
+| command-xlarge-nightly | `completion('command-xlarge-nightly', messages)` |
| command-nightly | `completion('command-nightly', messages)` |
diff --git a/docs/my-website/docs/providers/fireworks_ai.md b/docs/my-website/docs/providers/fireworks_ai.md
new file mode 100644
index 000000000..ba50bd1f2
--- /dev/null
+++ b/docs/my-website/docs/providers/fireworks_ai.md
@@ -0,0 +1,53 @@
+# Fireworks AI
+https://fireworks.ai/
+
+**We support ALL Fireworks AI models; just set `fireworks_ai/` as a prefix when sending completion requests.**
+
+## API Key
+```python
+# env variable
+os.environ['FIREWORKS_AI_API_KEY']
+```
+
+## Sample Usage
+```python
+from litellm import completion
+import os
+
+os.environ['FIREWORKS_AI_API_KEY'] = ""
+response = completion(
+ model="fireworks_ai/mixtral-8x7b-instruct",
+ messages=[
+ {"role": "user", "content": "hello from litellm"}
+ ],
+)
+print(response)
+```
+
+## Sample Usage - Streaming
+```python
+from litellm import completion
+import os
+
+os.environ['FIREWORKS_AI_API_KEY'] = ""
+response = completion(
+ model="fireworks_ai/mixtral-8x7b-instruct",
+ messages=[
+ {"role": "user", "content": "hello from litellm"}
+ ],
+ stream=True
+)
+
+for chunk in response:
+ print(chunk)
+```
+
+
+## Supported Models - ALL Fireworks AI Models Supported!
+We support ALL Fireworks AI models; just set `fireworks_ai/` as a prefix when sending completion requests.
+
+| Model Name | Function Call |
+|--------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------|
+| mixtral-8x7b-instruct | `completion(model="fireworks_ai/mixtral-8x7b-instruct", messages)` |
+| firefunction-v1 | `completion(model="fireworks_ai/firefunction-v1", messages)` |
+| llama-v2-70b-chat | `completion(model="fireworks_ai/llama-v2-70b-chat", messages)` |
\ No newline at end of file
diff --git a/docs/my-website/docs/providers/groq.md b/docs/my-website/docs/providers/groq.md
index e09cf9f8a..d8a4fded4 100644
--- a/docs/my-website/docs/providers/groq.md
+++ b/docs/my-website/docs/providers/groq.md
@@ -49,4 +49,5 @@ We support ALL Groq models, just set `groq/` as a prefix when sending completion
| Model Name | Function Call |
|--------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| llama2-70b-4096 | `completion(model="groq/llama2-70b-4096", messages)` |
-| mixtral-8x7b-32768 | `completion(model="groq/mixtral-8x7b-32768", messages)` |
+| mixtral-8x7b-32768 | `completion(model="groq/mixtral-8x7b-32768", messages)` |
+| gemma-7b-it | `completion(model="groq/gemma-7b-it", messages)` |
\ No newline at end of file
diff --git a/docs/my-website/docs/proxy/caching.md b/docs/my-website/docs/proxy/caching.md
index 2be1d8de1..1521f63b0 100644
--- a/docs/my-website/docs/proxy/caching.md
+++ b/docs/my-website/docs/proxy/caching.md
@@ -225,6 +225,32 @@ litellm_settings:
supported_call_types: ["acompletion", "completion", "embedding", "aembedding"] # defaults to all litellm call types
```
+
+### Turn on `batch_redis_requests`
+
+**What does it do?**
+When a request is made:
+
+- Check if a key starting with `litellm:::` exists in-memory; if not, get the last 100 cached requests for this key from Redis and store them in-memory
+
+- New requests are stored in the cache under this `litellm:..` namespace
+
+**Why?**
+Reduces the number of Redis GET requests. This improved latency by 46% in production load tests.
+
+**Usage**
+
+```yaml
+litellm_settings:
+ cache: true
+ cache_params:
+ type: redis
+ ... # remaining redis args (host, port, etc.)
+ callbacks: ["batch_redis_requests"] # 👈 KEY CHANGE!
+```
+
+[**SEE CODE**](https://github.com/BerriAI/litellm/blob/main/litellm/proxy/hooks/batch_redis_get.py)
+
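+To sanity-check the hook, you can send the same request twice through the proxy with the OpenAI SDK and confirm the second response comes back from the cache. A minimal sketch, assuming the proxy is running on `http://0.0.0.0:4000` with master key `sk-1234` and a `gpt-3.5-turbo` model configured:
+
+```python
+import openai
+
+client = openai.OpenAI(api_key="sk-1234", base_url="http://0.0.0.0:4000")
+
+messages = [{"role": "user", "content": "what is litellm?"}]
+
+# first call populates redis; an identical second call should be a cache hit
+resp_1 = client.chat.completions.create(model="gpt-3.5-turbo", messages=messages)
+resp_2 = client.chat.completions.create(model="gpt-3.5-turbo", messages=messages)
+
+print(resp_1.choices[0].message.content == resp_2.choices[0].message.content)
+```
+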
### Turn on / off caching per request.
The proxy support 3 cache-controls:
diff --git a/docs/my-website/docs/proxy/deploy.md b/docs/my-website/docs/proxy/deploy.md
index 175806d27..3f105e387 100644
--- a/docs/my-website/docs/proxy/deploy.md
+++ b/docs/my-website/docs/proxy/deploy.md
@@ -150,17 +150,20 @@ To avoid issues with predictability, difficulties in rollback, and inconsistent
## Deploy with Database
+### Docker, Kubernetes, Helm Chart
+
+
+
+
+
We maintain a [separate Dockerfile](https://github.com/BerriAI/litellm/pkgs/container/litellm-database) for reducing build time when running LiteLLM proxy with a connected Postgres Database
-
-
-
-```
+```shell
docker pull ghcr.io/berriai/litellm-database:main-latest
```
-```
+```shell
docker run --name litellm-proxy \
-e DATABASE_URL=postgresql://:@:/ \
-p 4000:4000 \
@@ -233,6 +236,8 @@ Your OpenAI proxy server is now running on `http://0.0.0.0:4000`.
+Use this to deploy litellm using a helm chart. Link to [the LiteLLM Helm Chart](https://github.com/BerriAI/litellm/tree/main/deploy/charts/litellm)
+
#### Step 1. Clone the repository
```bash
@@ -241,9 +246,11 @@ git clone https://github.com/BerriAI/litellm.git
#### Step 2. Deploy with Helm
+Run the following command in the root of your `litellm` repo. This will set the litellm proxy master key as `sk-1234`
+
```bash
helm install \
- --set masterkey=SuPeRsEcReT \
+ --set masterkey=sk-1234 \
mydeploy \
deploy/charts/litellm
```
@@ -259,6 +266,9 @@ kubectl \
Your OpenAI proxy server is now running on `http://127.0.0.1:4000`.
+
+If you need to set your litellm proxy config.yaml, you can do so in [values.yaml](https://github.com/BerriAI/litellm/blob/main/deploy/charts/litellm/values.yaml)
+
diff --git a/docs/my-website/img/litellm_load_test.png b/docs/my-website/img/litellm_load_test.png
new file mode 100644
index 000000000..2dd8299d2
Binary files /dev/null and b/docs/my-website/img/litellm_load_test.png differ
diff --git a/docs/my-website/img/locust_load_test.png b/docs/my-website/img/locust_load_test.png
new file mode 100644
index 000000000..37de623a1
Binary files /dev/null and b/docs/my-website/img/locust_load_test.png differ
diff --git a/docs/my-website/sidebars.js b/docs/my-website/sidebars.js
index ae56f9d7c..21f66a778 100644
--- a/docs/my-website/sidebars.js
+++ b/docs/my-website/sidebars.js
@@ -138,6 +138,7 @@ const sidebars = {
"providers/ollama",
"providers/perplexity",
"providers/groq",
+ "providers/fireworks_ai",
"providers/vllm",
"providers/xinference",
"providers/cloudflare_workers",
diff --git a/enterprise/__init__.py b/enterprise/__init__.py
new file mode 100644
index 000000000..b6e690fd5
--- /dev/null
+++ b/enterprise/__init__.py
@@ -0,0 +1 @@
+from . import *
diff --git a/litellm/__init__.py b/litellm/__init__.py
index a821bde30..7eae39097 100644
--- a/litellm/__init__.py
+++ b/litellm/__init__.py
@@ -36,6 +36,7 @@ token: Optional[str] = (
telemetry = True
max_tokens = 256 # OpenAI Defaults
drop_params = False
+modify_params = False
retry = True
api_key: Optional[str] = None
openai_key: Optional[str] = None
@@ -327,6 +328,7 @@ openai_compatible_providers: List = [
"perplexity",
"xinference",
"together_ai",
+ "fireworks_ai",
]
@@ -478,6 +480,7 @@ provider_list: List = [
"voyage",
"cloudflare",
"xinference",
+ "fireworks_ai",
"custom", # custom apis
]
diff --git a/litellm/caching.py b/litellm/caching.py
index 9df95f199..ed856f86f 100644
--- a/litellm/caching.py
+++ b/litellm/caching.py
@@ -129,6 +129,16 @@ class RedisCache(BaseCache):
f"LiteLLM Caching: set() - Got exception from REDIS : {str(e)}"
)
+ async def async_scan_iter(self, pattern: str, count: int = 100) -> list:
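+        # Collect up to `count` keys matching `pattern*` with a non-blocking SCAN;
+        # callers can then bulk-fetch these keys via `async_get_cache_pipeline`.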
+ keys = []
+ _redis_client = self.init_async_client()
+ async with _redis_client as redis_client:
+ async for key in redis_client.scan_iter(match=pattern + "*", count=count):
+ keys.append(key)
+ if len(keys) >= count:
+ break
+ return keys
+
async def async_set_cache(self, key, value, **kwargs):
_redis_client = self.init_async_client()
async with _redis_client as redis_client:
@@ -140,6 +150,9 @@ class RedisCache(BaseCache):
await redis_client.set(
name=key, value=json.dumps(value), ex=ttl, get=True
)
+ print_verbose(
+ f"Successfully Set ASYNC Redis Cache: key: {key}\nValue {value}\nttl={ttl}"
+ )
except Exception as e:
# NON blocking - notify users Redis is throwing an exception
print_verbose(
@@ -172,8 +185,6 @@ class RedisCache(BaseCache):
return results
except Exception as e:
print_verbose(f"Error occurred in pipeline write - {str(e)}")
- # NON blocking - notify users Redis is throwing an exception
- logging.debug("LiteLLM Caching: set() - Got exception from REDIS : ", e)
def _get_cache_logic(self, cached_response: Any):
"""
@@ -208,7 +219,7 @@ class RedisCache(BaseCache):
_redis_client = self.init_async_client()
async with _redis_client as redis_client:
try:
- print_verbose(f"Get Redis Cache: key: {key}")
+ print_verbose(f"Get Async Redis Cache: key: {key}")
cached_response = await redis_client.get(key)
print_verbose(
f"Got Async Redis Cache: key: {key}, cached_response {cached_response}"
@@ -217,8 +228,39 @@ class RedisCache(BaseCache):
return response
except Exception as e:
# NON blocking - notify users Redis is throwing an exception
- traceback.print_exc()
- logging.debug("LiteLLM Caching: get() - Got exception from REDIS: ", e)
+ print_verbose(
+ f"LiteLLM Caching: async get() - Got exception from REDIS: {str(e)}"
+ )
+
+ async def async_get_cache_pipeline(self, key_list) -> dict:
+ """
+ Use Redis for bulk read operations
+ """
+ _redis_client = await self.init_async_client()
+ key_value_dict = {}
+ try:
+ async with _redis_client as redis_client:
+ async with redis_client.pipeline(transaction=True) as pipe:
+ # Queue the get operations in the pipeline for all keys.
+ for cache_key in key_list:
+ pipe.get(cache_key) # Queue GET command in pipeline
+
+ # Execute the pipeline and await the results.
+ results = await pipe.execute()
+
+ # Associate the results back with their keys.
+ # 'results' is a list of values corresponding to the order of keys in 'key_list'.
+ key_value_dict = dict(zip(key_list, results))
+
+ decoded_results = {
+ k.decode("utf-8"): self._get_cache_logic(v)
+ for k, v in key_value_dict.items()
+ }
+
+ return decoded_results
+ except Exception as e:
+ print_verbose(f"Error occurred in pipeline read - {str(e)}")
+ return key_value_dict
def flush_cache(self):
self.redis_client.flushall()
@@ -1001,6 +1043,10 @@ class Cache:
if self.namespace is not None:
hash_hex = f"{self.namespace}:{hash_hex}"
print_verbose(f"Hashed Key with Namespace: {hash_hex}")
+ elif kwargs.get("metadata", {}).get("redis_namespace", None) is not None:
+ _namespace = kwargs.get("metadata", {}).get("redis_namespace", None)
+ hash_hex = f"{_namespace}:{hash_hex}"
+ print_verbose(f"Hashed Key with Namespace: {hash_hex}")
return hash_hex
def generate_streaming_content(self, content):
diff --git a/litellm/llms/bedrock.py b/litellm/llms/bedrock.py
index 4aa27b3c9..31e4905cb 100644
--- a/litellm/llms/bedrock.py
+++ b/litellm/llms/bedrock.py
@@ -82,12 +82,22 @@ class AmazonAnthropicClaude3Config:
Supported Params for the Amazon / Anthropic Claude 3 models:
- - `max_tokens` (integer) max tokens,
- - `anthropic_version` (string) version of anthropic for bedrock - e.g. "bedrock-2023-05-31"
+ - `max_tokens` Required (integer) max tokens,
+ - `anthropic_version` Required (string) version of anthropic for bedrock - e.g. "bedrock-2023-05-31"
+ - `system` Optional (string) the system prompt, conversion from openai format to this is handled in factory.py
+ - `temperature` Optional (float) The amount of randomness injected into the response
+ - `top_p` Optional (float) Use nucleus sampling.
+ - `top_k` Optional (int) Only sample from the top K options for each subsequent token
+ - `stop_sequences` Optional (List[str]) Custom text sequences that cause the model to stop generating
"""
max_tokens: Optional[int] = litellm.max_tokens
anthropic_version: Optional[str] = "bedrock-2023-05-31"
+ system: Optional[str] = None
+ temperature: Optional[float] = None
+ top_p: Optional[float] = None
+ top_k: Optional[int] = None
+ stop_sequences: Optional[List[str]] = None
def __init__(
self,
@@ -128,6 +138,12 @@ class AmazonAnthropicClaude3Config:
optional_params["tools"] = value
if param == "stream":
optional_params["stream"] = value
+ if param == "stop":
+ optional_params["stop_sequences"] = value
+ if param == "temperature":
+ optional_params["temperature"] = value
+ if param == "top_p":
+ optional_params["top_p"] = value
return optional_params
diff --git a/litellm/llms/cohere.py b/litellm/llms/cohere.py
index 960dc66d3..a09e249af 100644
--- a/litellm/llms/cohere.py
+++ b/litellm/llms/cohere.py
@@ -300,8 +300,7 @@ def embedding(
for text in input:
input_tokens += len(encoding.encode(text))
- model_response["usage"] = {
- "prompt_tokens": input_tokens,
- "total_tokens": input_tokens,
- }
+ model_response["usage"] = Usage(
+ prompt_tokens=input_tokens, completion_tokens=0, total_tokens=input_tokens
+ )
return model_response
diff --git a/litellm/llms/prompt_templates/factory.py b/litellm/llms/prompt_templates/factory.py
index 4ebb48dba..0347e5fd6 100644
--- a/litellm/llms/prompt_templates/factory.py
+++ b/litellm/llms/prompt_templates/factory.py
@@ -705,6 +705,7 @@ def anthropic_messages_pt(messages: list):
"text"
].rstrip() # no trailing whitespace for final assistant message
+
return new_messages
diff --git a/litellm/main.py b/litellm/main.py
index 8ccde52e6..3a9fed77e 100644
--- a/litellm/main.py
+++ b/litellm/main.py
@@ -12,6 +12,7 @@ from typing import Any, Literal, Union, BinaryIO
from functools import partial
import dotenv, traceback, random, asyncio, time, contextvars
from copy import deepcopy
+
import httpx
import litellm
from ._logging import verbose_logger
@@ -891,6 +892,7 @@ def completion(
or custom_llm_provider == "mistral"
or custom_llm_provider == "openai"
or custom_llm_provider == "together_ai"
+ or custom_llm_provider in litellm.openai_compatible_providers
or "ft:gpt-3.5-turbo" in model # finetune gpt-3.5-turbo
): # allow user to make an openai call with a custom base
# note: if a user sets a custom base - we should ensure this works
@@ -2393,6 +2395,7 @@ async def aembedding(*args, **kwargs):
or custom_llm_provider == "deepinfra"
or custom_llm_provider == "perplexity"
or custom_llm_provider == "groq"
+ or custom_llm_provider == "fireworks_ai"
or custom_llm_provider == "ollama"
or custom_llm_provider == "vertex_ai"
): # currently implemented aiohttp calls for just azure and openai, soon all.
@@ -2892,6 +2895,7 @@ async def atext_completion(*args, **kwargs):
or custom_llm_provider == "deepinfra"
or custom_llm_provider == "perplexity"
or custom_llm_provider == "groq"
+ or custom_llm_provider == "fireworks_ai"
or custom_llm_provider == "text-completion-openai"
or custom_llm_provider == "huggingface"
or custom_llm_provider == "ollama"
diff --git a/litellm/model_prices_and_context_window_backup.json b/litellm/model_prices_and_context_window_backup.json
index ddacbf05c..0a90c91ca 100644
--- a/litellm/model_prices_and_context_window_backup.json
+++ b/litellm/model_prices_and_context_window_backup.json
@@ -631,6 +631,13 @@
"litellm_provider": "groq",
"mode": "chat"
},
+ "groq/gemma-7b-it": {
+ "max_tokens": 8192,
+ "input_cost_per_token": 0.00000010,
+ "output_cost_per_token": 0.00000010,
+ "litellm_provider": "groq",
+ "mode": "chat"
+ },
"claude-instant-1.2": {
"max_tokens": 100000,
"max_output_tokens": 8191,
diff --git a/litellm/proxy/_new_secret_config.yaml b/litellm/proxy/_new_secret_config.yaml
index aab9b3d5c..1c41d79fc 100644
--- a/litellm/proxy/_new_secret_config.yaml
+++ b/litellm/proxy/_new_secret_config.yaml
@@ -9,6 +9,12 @@ model_list:
model: gpt-3.5-turbo-1106
api_key: os.environ/OPENAI_API_KEY
+litellm_settings:
+ cache: true
+ cache_params:
+ type: redis
+ callbacks: ["batch_redis_requests"]
+
general_settings:
master_key: sk-1234
- database_url: "postgresql://krrishdholakia:9yQkKWiB8vVs@ep-icy-union-a5j4dwls.us-east-2.aws.neon.tech/neondb?sslmode=require"
\ No newline at end of file
+ # database_url: "postgresql://krrishdholakia:9yQkKWiB8vVs@ep-icy-union-a5j4dwls.us-east-2.aws.neon.tech/neondb?sslmode=require"
\ No newline at end of file
diff --git a/litellm/proxy/hooks/batch_redis_get.py b/litellm/proxy/hooks/batch_redis_get.py
new file mode 100644
index 000000000..71588c9d4
--- /dev/null
+++ b/litellm/proxy/hooks/batch_redis_get.py
@@ -0,0 +1,124 @@
+# What does this do?
+## Gets a key's redis cache and stores it in-memory for 1 minute.
+## This reduces the number of Redis GET requests the proxy makes during high traffic.
+### [BETA] this is in Beta and might change.
+
+from typing import Optional, Literal
+import litellm
+from litellm.caching import DualCache, RedisCache, InMemoryCache
+from litellm.proxy._types import UserAPIKeyAuth
+from litellm.integrations.custom_logger import CustomLogger
+from litellm._logging import verbose_proxy_logger
+from fastapi import HTTPException
+import json, traceback
+
+
+class _PROXY_BatchRedisRequests(CustomLogger):
+ # Class variables or attributes
+ in_memory_cache: Optional[InMemoryCache] = None
+
+ def __init__(self):
+ litellm.cache.async_get_cache = (
+ self.async_get_cache
+ ) # map the litellm 'get_cache' function to our custom function
+
+ def print_verbose(
+ self, print_statement, debug_level: Literal["INFO", "DEBUG"] = "DEBUG"
+ ):
+ if debug_level == "DEBUG":
+ verbose_proxy_logger.debug(print_statement)
+ elif debug_level == "INFO":
+ verbose_proxy_logger.debug(print_statement)
+ if litellm.set_verbose is True:
+ print(print_statement) # noqa
+
+ async def async_pre_call_hook(
+ self,
+ user_api_key_dict: UserAPIKeyAuth,
+ cache: DualCache,
+ data: dict,
+ call_type: str,
+ ):
+ try:
+ """
+ Get the user key
+
+ Check if a key starting with `litellm:: 0:
+ key_value_dict = (
+ await litellm.cache.cache.async_get_cache_pipeline(
+ key_list=keys
+ )
+ )
+
+ ## Add to cache
+ if len(key_value_dict.items()) > 0:
+ await cache.in_memory_cache.async_set_cache_pipeline(
+ cache_list=list(key_value_dict.items()), ttl=60
+ )
+ ## Set cache namespace if it's a miss
+ data["metadata"]["redis_namespace"] = cache_key_name
+ except HTTPException as e:
+ raise e
+ except Exception as e:
+ traceback.print_exc()
+
+ async def async_get_cache(self, *args, **kwargs):
+ """
+ - Check if the cache key is in-memory
+
+ - Else return None
+ """
+ try: # never block execution
+ if "cache_key" in kwargs:
+ cache_key = kwargs["cache_key"]
+ else:
+ cache_key = litellm.cache.get_cache_key(
+ *args, **kwargs
+ ) # returns ":" - we pass redis_namespace in async_pre_call_hook. Done to avoid rewriting the async_set_cache logic
+ if cache_key is not None and self.in_memory_cache is not None:
+ cache_control_args = kwargs.get("cache", {})
+ max_age = cache_control_args.get(
+ "s-max-age", cache_control_args.get("s-maxage", float("inf"))
+ )
+ cached_result = self.in_memory_cache.get_cache(
+ cache_key, *args, **kwargs
+ )
+ return litellm.cache._get_cache_logic(
+ cached_result=cached_result, max_age=max_age
+ )
+ except Exception as e:
+ return None
diff --git a/litellm/proxy/proxy_load_test/locustfile.py b/litellm/proxy/proxy_load_test/locustfile.py
index 220bd3553..f439f7274 100644
--- a/litellm/proxy/proxy_load_test/locustfile.py
+++ b/litellm/proxy/proxy_load_test/locustfile.py
@@ -6,7 +6,7 @@ import time
class MyUser(HttpUser):
wait_time = between(1, 5)
- @task
+ @task(3)
def chat_completion(self):
headers = {
"Content-Type": "application/json",
@@ -31,62 +31,8 @@ class MyUser(HttpUser):
@task(10)
def health_readiness(self):
- start_time = time.time()
response = self.client.get("health/readiness")
- response_time = time.time() - start_time
- if response_time > 1:
- events.request_failure.fire(
- request_type="GET",
- name="health/readiness",
- response_time=response_time,
- exception=None,
- response=response,
- )
@task(10)
def health_liveliness(self):
- start_time = time.time()
response = self.client.get("health/liveliness")
- response_time = time.time() - start_time
- if response_time > 1:
- events.request_failure.fire(
- request_type="GET",
- name="health/liveliness",
- response_time=response_time,
- exception=None,
- response=response,
- )
-
- # @task
- # def key_generate(self):
- # headers = {
- # "Authorization": "Bearer sk-1234",
- # "Content-Type": "application/json",
- # }
-
- # payload = {
- # "models": ["gpt-3.5-turbo", "gpt-4", "claude-2"],
- # "duration": "20m",
- # "metadata": {"user": "ishaan@berri.ai"},
- # "team_id": "core-infra",
- # "max_budget": 10,
- # "soft_budget": 5,
- # }
-
- # response = self.client.post("key/generate", json=payload, headers=headers)
-
- # if response.status_code == 200:
- # key_response = response.json()
- # models = key_response.get("models", [])
- # if models:
- # # Use the first model from the key generation response to make a chat completions request
- # model_to_use = models[0]
- # chat_payload = {
- # "model": model_to_use,
- # "messages": [
- # {"role": "system", "content": "You are a chat bot."},
- # {"role": "user", "content": "Hello, how are you?"},
- # ],
- # }
- # chat_response = self.client.post("chat/completions", json=chat_payload, headers=headers)
- # # Print or log the chat response if needed
diff --git a/litellm/proxy/proxy_server.py b/litellm/proxy/proxy_server.py
index c50b0e895..4c9cd876e 100644
--- a/litellm/proxy/proxy_server.py
+++ b/litellm/proxy/proxy_server.py
@@ -8,7 +8,6 @@ import hashlib, uuid
import warnings
import importlib
import warnings
-import backoff
def showwarning(message, category, filename, lineno, file=None, line=None):
@@ -35,7 +34,6 @@ try:
import orjson
import logging
from apscheduler.schedulers.asyncio import AsyncIOScheduler
- from argon2 import PasswordHasher
except ImportError as e:
raise ImportError(f"Missing dependency {e}. Run `pip install 'litellm[proxy]'`")
@@ -145,9 +143,12 @@ from typing import Union
try:
# when using litellm cli
import litellm.proxy.enterprise as enterprise
-except:
+except Exception as e:
# when using litellm docker image
- import enterprise # type: ignore
+ try:
+ import enterprise # type: ignore
+ except Exception as e:
+ pass
ui_link = f"/ui/"
ui_message = (
@@ -252,7 +253,6 @@ user_headers = None
user_config_file_path = f"config_{int(time.time())}.yaml"
local_logging = True # writes logs to a local api_log.json file for debugging
experimental = False
-ph = PasswordHasher()
#### GLOBAL VARIABLES ####
llm_router: Optional[litellm.Router] = None
llm_model_list: Optional[list] = None
@@ -382,7 +382,7 @@ async def user_api_key_auth(
return valid_token
try:
- is_master_key_valid = ph.verify(litellm_master_key_hash, api_key)
+ is_master_key_valid = secrets.compare_digest(api_key, master_key)
except Exception as e:
is_master_key_valid = False
@@ -887,6 +887,9 @@ async def user_api_key_auth(
raise Exception(
f"This key is made for LiteLLM UI, Tried to access route: {route}. Not allowed"
)
+ if valid_token is None:
+ # No token was found when looking up in the DB
+ raise Exception("Invalid token passed")
if valid_token_dict is not None:
return UserAPIKeyAuth(api_key=api_key, **valid_token_dict)
else:
@@ -1420,6 +1423,8 @@ async def update_cache(
try:
for _id in user_ids:
# Fetch the existing cost for the given user
+ if _id is None:
+ continue
existing_spend_obj = await user_api_key_cache.async_get_cache(key=_id)
if existing_spend_obj is None:
# if user does not exist in LiteLLM_UserTable, create a new user
@@ -1791,6 +1796,16 @@ class ProxyConfig:
_ENTERPRISE_PromptInjectionDetection()
)
imported_list.append(prompt_injection_detection_obj)
+ elif (
+ isinstance(callback, str)
+ and callback == "batch_redis_requests"
+ ):
+ from litellm.proxy.hooks.batch_redis_get import (
+ _PROXY_BatchRedisRequests,
+ )
+
+ batch_redis_obj = _PROXY_BatchRedisRequests()
+ imported_list.append(batch_redis_obj)
else:
imported_list.append(
get_instance_fn(
@@ -1913,7 +1928,7 @@ class ProxyConfig:
master_key = litellm.get_secret(master_key)
if master_key is not None and isinstance(master_key, str):
- litellm_master_key_hash = ph.hash(master_key)
+ litellm_master_key_hash = master_key
### CUSTOM API KEY AUTH ###
## pass filepath
custom_auth = general_settings.get("custom_auth", None)
diff --git a/litellm/tests/test_caching.py b/litellm/tests/test_caching.py
index 07d39b086..aa0681c61 100644
--- a/litellm/tests/test_caching.py
+++ b/litellm/tests/test_caching.py
@@ -474,11 +474,10 @@ def test_redis_cache_completion_stream():
# test_redis_cache_completion_stream()
-def test_redis_cache_acompletion_stream():
- import asyncio
-
+@pytest.mark.asyncio
+async def test_redis_cache_acompletion_stream():
try:
- litellm.set_verbose = False
+ litellm.set_verbose = True
random_word = generate_random_word()
messages = [
{
@@ -496,37 +495,31 @@ def test_redis_cache_acompletion_stream():
response_1_content = ""
response_2_content = ""
- async def call1():
- nonlocal response_1_content
- response1 = await litellm.acompletion(
- model="gpt-3.5-turbo",
- messages=messages,
- max_tokens=40,
- temperature=1,
- stream=True,
- )
- async for chunk in response1:
- response_1_content += chunk.choices[0].delta.content or ""
- print(response_1_content)
+ response1 = await litellm.acompletion(
+ model="gpt-3.5-turbo",
+ messages=messages,
+ max_tokens=40,
+ temperature=1,
+ stream=True,
+ )
+ async for chunk in response1:
+ response_1_content += chunk.choices[0].delta.content or ""
+ print(response_1_content)
- asyncio.run(call1())
time.sleep(0.5)
print("\n\n Response 1 content: ", response_1_content, "\n\n")
- async def call2():
- nonlocal response_2_content
- response2 = await litellm.acompletion(
- model="gpt-3.5-turbo",
- messages=messages,
- max_tokens=40,
- temperature=1,
- stream=True,
- )
- async for chunk in response2:
- response_2_content += chunk.choices[0].delta.content or ""
- print(response_2_content)
+ response2 = await litellm.acompletion(
+ model="gpt-3.5-turbo",
+ messages=messages,
+ max_tokens=40,
+ temperature=1,
+ stream=True,
+ )
+ async for chunk in response2:
+ response_2_content += chunk.choices[0].delta.content or ""
+ print(response_2_content)
- asyncio.run(call2())
print("\nresponse 1", response_1_content)
print("\nresponse 2", response_2_content)
assert (
@@ -536,14 +529,15 @@ def test_redis_cache_acompletion_stream():
litellm.success_callback = []
litellm._async_success_callback = []
except Exception as e:
- print(e)
+ print(f"{str(e)}\n\n{traceback.format_exc()}")
raise e
# test_redis_cache_acompletion_stream()
-def test_redis_cache_acompletion_stream_bedrock():
+@pytest.mark.asyncio
+async def test_redis_cache_acompletion_stream_bedrock():
import asyncio
try:
@@ -565,39 +559,33 @@ def test_redis_cache_acompletion_stream_bedrock():
response_1_content = ""
response_2_content = ""
- async def call1():
- nonlocal response_1_content
- response1 = await litellm.acompletion(
- model="bedrock/anthropic.claude-v2",
- messages=messages,
- max_tokens=40,
- temperature=1,
- stream=True,
- )
- async for chunk in response1:
- print(chunk)
- response_1_content += chunk.choices[0].delta.content or ""
- print(response_1_content)
+ response1 = await litellm.acompletion(
+ model="bedrock/anthropic.claude-v2",
+ messages=messages,
+ max_tokens=40,
+ temperature=1,
+ stream=True,
+ )
+ async for chunk in response1:
+ print(chunk)
+ response_1_content += chunk.choices[0].delta.content or ""
+ print(response_1_content)
- asyncio.run(call1())
time.sleep(0.5)
print("\n\n Response 1 content: ", response_1_content, "\n\n")
- async def call2():
- nonlocal response_2_content
- response2 = await litellm.acompletion(
- model="bedrock/anthropic.claude-v2",
- messages=messages,
- max_tokens=40,
- temperature=1,
- stream=True,
- )
- async for chunk in response2:
- print(chunk)
- response_2_content += chunk.choices[0].delta.content or ""
- print(response_2_content)
+ response2 = await litellm.acompletion(
+ model="bedrock/anthropic.claude-v2",
+ messages=messages,
+ max_tokens=40,
+ temperature=1,
+ stream=True,
+ )
+ async for chunk in response2:
+ print(chunk)
+ response_2_content += chunk.choices[0].delta.content or ""
+ print(response_2_content)
- asyncio.run(call2())
print("\nresponse 1", response_1_content)
print("\nresponse 2", response_2_content)
assert (
@@ -612,8 +600,8 @@ def test_redis_cache_acompletion_stream_bedrock():
raise e
-@pytest.mark.skip(reason="AWS Suspended Account")
-def test_s3_cache_acompletion_stream_azure():
+@pytest.mark.asyncio
+async def test_s3_cache_acompletion_stream_azure():
import asyncio
try:
@@ -637,41 +625,35 @@ def test_s3_cache_acompletion_stream_azure():
response_1_created = ""
response_2_created = ""
- async def call1():
- nonlocal response_1_content, response_1_created
- response1 = await litellm.acompletion(
- model="azure/chatgpt-v-2",
- messages=messages,
- max_tokens=40,
- temperature=1,
- stream=True,
- )
- async for chunk in response1:
- print(chunk)
- response_1_created = chunk.created
- response_1_content += chunk.choices[0].delta.content or ""
- print(response_1_content)
+ response1 = await litellm.acompletion(
+ model="azure/chatgpt-v-2",
+ messages=messages,
+ max_tokens=40,
+ temperature=1,
+ stream=True,
+ )
+ async for chunk in response1:
+ print(chunk)
+ response_1_created = chunk.created
+ response_1_content += chunk.choices[0].delta.content or ""
+ print(response_1_content)
- asyncio.run(call1())
time.sleep(0.5)
print("\n\n Response 1 content: ", response_1_content, "\n\n")
- async def call2():
- nonlocal response_2_content, response_2_created
- response2 = await litellm.acompletion(
- model="azure/chatgpt-v-2",
- messages=messages,
- max_tokens=40,
- temperature=1,
- stream=True,
- )
- async for chunk in response2:
- print(chunk)
- response_2_content += chunk.choices[0].delta.content or ""
- response_2_created = chunk.created
- print(response_2_content)
+ response2 = await litellm.acompletion(
+ model="azure/chatgpt-v-2",
+ messages=messages,
+ max_tokens=40,
+ temperature=1,
+ stream=True,
+ )
+ async for chunk in response2:
+ print(chunk)
+ response_2_content += chunk.choices[0].delta.content or ""
+ response_2_created = chunk.created
+ print(response_2_content)
- asyncio.run(call2())
print("\nresponse 1", response_1_content)
print("\nresponse 2", response_2_content)
diff --git a/litellm/tests/test_completion.py b/litellm/tests/test_completion.py
index 85f1139fa..44e2f7af6 100644
--- a/litellm/tests/test_completion.py
+++ b/litellm/tests/test_completion.py
@@ -358,7 +358,7 @@ def test_completion_mistral_azure():
}
],
)
- # Add any assertions here to check the response
+ # Add any assertions here to check, the response
print(response)
except Exception as e:
@@ -575,6 +575,25 @@ def test_completion_azure_gpt4_vision():
# test_completion_azure_gpt4_vision()
+def test_completion_fireworks_ai():
+ try:
+ litellm.set_verbose = True
+ messages = [
+ {"role": "system", "content": "You're a good bot"},
+ {
+ "role": "user",
+ "content": "Hey",
+ },
+ ]
+ response = completion(
+ model="fireworks_ai/accounts/fireworks/models/mixtral-8x7b-instruct",
+ messages=messages,
+ )
+ print(response)
+ except Exception as e:
+ pytest.fail(f"Error occurred: {e}")
+
+
@pytest.mark.skip(reason="this test is flaky")
def test_completion_perplexity_api():
try:
diff --git a/litellm/tests/test_custom_logger.py b/litellm/tests/test_custom_logger.py
index 0a8f7b941..b2e2b7d22 100644
--- a/litellm/tests/test_custom_logger.py
+++ b/litellm/tests/test_custom_logger.py
@@ -97,27 +97,23 @@ class TmpFunction:
)
-def test_async_chat_openai_stream():
+@pytest.mark.asyncio
+async def test_async_chat_openai_stream():
try:
tmp_function = TmpFunction()
litellm.set_verbose = True
litellm.success_callback = [tmp_function.async_test_logging_fn]
complete_streaming_response = ""
- async def call_gpt():
- nonlocal complete_streaming_response
- response = await litellm.acompletion(
- model="gpt-3.5-turbo",
- messages=[{"role": "user", "content": "Hi 👋 - i'm openai"}],
- stream=True,
- )
- async for chunk in response:
- complete_streaming_response += (
- chunk["choices"][0]["delta"]["content"] or ""
- )
- print(complete_streaming_response)
+ response = await litellm.acompletion(
+ model="gpt-3.5-turbo",
+ messages=[{"role": "user", "content": "Hi 👋 - i'm openai"}],
+ stream=True,
+ )
+ async for chunk in response:
+ complete_streaming_response += chunk["choices"][0]["delta"]["content"] or ""
+ print(complete_streaming_response)
- asyncio.run(call_gpt())
complete_streaming_response = complete_streaming_response.strip("'")
response1 = tmp_function.complete_streaming_response_in_callback["choices"][0][
"message"
@@ -130,7 +126,7 @@ def test_async_chat_openai_stream():
assert tmp_function.async_success == True
except Exception as e:
print(e)
- pytest.fail(f"An error occurred - {str(e)}")
+ pytest.fail(f"An error occurred - {str(e)}\n\n{traceback.format_exc()}")
# test_async_chat_openai_stream()
diff --git a/litellm/tests/test_embedding.py b/litellm/tests/test_embedding.py
index c32a55353..7eecca60b 100644
--- a/litellm/tests/test_embedding.py
+++ b/litellm/tests/test_embedding.py
@@ -117,6 +117,8 @@ def test_openai_azure_embedding_simple():
print("Calculated request cost=", request_cost)
+ assert isinstance(response.usage, litellm.Usage)
+
except Exception as e:
pytest.fail(f"Error occurred: {e}")
@@ -204,6 +206,8 @@ def test_cohere_embedding():
input=["good morning from litellm", "this is another item"],
)
print(f"response:", response)
+
+ assert isinstance(response.usage, litellm.Usage)
except Exception as e:
pytest.fail(f"Error occurred: {e}")
@@ -269,6 +273,8 @@ def test_bedrock_embedding_titan():
assert end_time - start_time < 0.1
litellm.disable_cache()
+
+ assert isinstance(response.usage, litellm.Usage)
except Exception as e:
pytest.fail(f"Error occurred: {e}")
@@ -295,6 +301,8 @@ def test_bedrock_embedding_cohere():
isinstance(x, float) for x in response["data"][0]["embedding"]
), "Expected response to be a list of floats"
# print(f"response:", response)
+
+ assert isinstance(response.usage, litellm.Usage)
except Exception as e:
pytest.fail(f"Error occurred: {e}")
@@ -331,6 +339,8 @@ def test_hf_embedding():
input=["good morning from litellm", "this is another item"],
)
print(f"response:", response)
+
+ assert isinstance(response.usage, litellm.Usage)
except Exception as e:
# Note: Huggingface inference API is unstable and fails with "model loading errors all the time"
pass
@@ -386,6 +396,8 @@ def test_aembedding_azure():
response._hidden_params["custom_llm_provider"],
)
assert response._hidden_params["custom_llm_provider"] == "azure"
+
+ assert isinstance(response.usage, litellm.Usage)
except Exception as e:
pytest.fail(f"Error occurred: {e}")
@@ -440,6 +452,7 @@ def test_mistral_embeddings():
input=["good morning from litellm"],
)
print(f"response: {response}")
+ assert isinstance(response.usage, litellm.Usage)
except Exception as e:
pytest.fail(f"Error occurred: {e}")
diff --git a/litellm/tests/test_key_generate_prisma.py b/litellm/tests/test_key_generate_prisma.py
index 151781beb..103b344f5 100644
--- a/litellm/tests/test_key_generate_prisma.py
+++ b/litellm/tests/test_key_generate_prisma.py
@@ -158,7 +158,7 @@ def test_call_with_invalid_key(prisma_client):
async def test():
await litellm.proxy.proxy_server.prisma_client.connect()
- generated_key = "bad-key"
+ generated_key = "sk-126666"
bearer_token = "Bearer " + generated_key
request = Request(scope={"type": "http"}, receive=None)
@@ -173,7 +173,7 @@ def test_call_with_invalid_key(prisma_client):
except Exception as e:
print("Got Exception", e)
print(e.message)
- assert "Authentication Error" in e.message
+ assert "Authentication Error, Invalid token passed" in e.message
pass
diff --git a/litellm/utils.py b/litellm/utils.py
index 3fb961c05..95b18421f 100644
--- a/litellm/utils.py
+++ b/litellm/utils.py
@@ -72,7 +72,7 @@ from .integrations.litedebugger import LiteDebugger
from .proxy._types import KeyManagementSystem
from openai import OpenAIError as OriginalError
from openai._models import BaseModel as OpenAIObject
-from .caching import S3Cache, RedisSemanticCache
+from .caching import S3Cache, RedisSemanticCache, RedisCache
from .exceptions import (
AuthenticationError,
BadRequestError,
@@ -1795,7 +1795,12 @@ class Logging:
)
result = kwargs["async_complete_streaming_response"]
# only add to cache once we have a complete streaming response
- litellm.cache.add_cache(result, **kwargs)
+ if litellm.cache is not None and not isinstance(
+ litellm.cache.cache, S3Cache
+ ):
+ await litellm.cache.async_add_cache(result, **kwargs)
+ else:
+ litellm.cache.add_cache(result, **kwargs)
if isinstance(callback, CustomLogger): # custom logger class
print_verbose(
f"Running Async success callback: {callback}; self.stream: {self.stream}; async_complete_streaming_response: {self.model_call_details.get('async_complete_streaming_response', None)} result={result}"
@@ -2589,7 +2594,7 @@ def client(original_function):
if (
kwargs.get("max_tokens", None) is not None
and model is not None
- and litellm.drop_params
+ and litellm.modify_params
== True # user is okay with params being modified
and (
call_type == CallTypes.acompletion.value
@@ -2806,7 +2811,9 @@ def client(original_function):
):
if len(cached_result) == 1 and cached_result[0] is None:
cached_result = None
- elif isinstance(litellm.cache.cache, RedisSemanticCache):
+ elif isinstance(
+ litellm.cache.cache, RedisSemanticCache
+ ) or isinstance(litellm.cache.cache, RedisCache):
preset_cache_key = litellm.cache.get_cache_key(*args, **kwargs)
kwargs["preset_cache_key"] = (
preset_cache_key # for streaming calls, we need to pass the preset_cache_key
@@ -5375,6 +5382,17 @@ def get_llm_provider(
# groq is openai compatible, we just need to set this to custom_openai and have the api_base be https://api.groq.com/openai/v1
api_base = "https://api.groq.com/openai/v1"
dynamic_api_key = get_secret("GROQ_API_KEY")
+ elif custom_llm_provider == "fireworks_ai":
+        # fireworks ai is openai compatible, we just need to set this to custom_openai and have the api_base be https://api.fireworks.ai/inference/v1
+ if not model.startswith("accounts/fireworks/models"):
+ model = f"accounts/fireworks/models/{model}"
+ api_base = "https://api.fireworks.ai/inference/v1"
+ dynamic_api_key = (
+ get_secret("FIREWORKS_API_KEY")
+ or get_secret("FIREWORKS_AI_API_KEY")
+ or get_secret("FIREWORKSAI_API_KEY")
+ or get_secret("FIREWORKS_AI_TOKEN")
+ )
elif custom_llm_provider == "mistral":
# mistral is openai compatible, we just need to set this to custom_openai and have the api_base be https://api.mistral.ai
api_base = (
diff --git a/model_prices_and_context_window.json b/model_prices_and_context_window.json
index ddacbf05c..0a90c91ca 100644
--- a/model_prices_and_context_window.json
+++ b/model_prices_and_context_window.json
@@ -631,6 +631,13 @@
"litellm_provider": "groq",
"mode": "chat"
},
+ "groq/gemma-7b-it": {
+ "max_tokens": 8192,
+ "input_cost_per_token": 0.00000010,
+ "output_cost_per_token": 0.00000010,
+ "litellm_provider": "groq",
+ "mode": "chat"
+ },
"claude-instant-1.2": {
"max_tokens": 100000,
"max_output_tokens": 8191,
diff --git a/proxy_server_config.yaml b/proxy_server_config.yaml
index 32f12bd79..69da70153 100644
--- a/proxy_server_config.yaml
+++ b/proxy_server_config.yaml
@@ -45,13 +45,15 @@ litellm_settings:
budget_duration: 30d
num_retries: 5
request_timeout: 600
+ cache: true
+ callbacks: ["batch_redis_requests"]
general_settings:
master_key: sk-1234 # [OPTIONAL] Only use this if you to require all calls to contain this key (Authorization: Bearer sk-1234)
proxy_budget_rescheduler_min_time: 60
proxy_budget_rescheduler_max_time: 64
# database_url: "postgresql://:@:/" # [OPTIONAL] use for token-based auth to proxy
-environment_variables:
+# environment_variables:
# settings for using redis caching
# REDIS_HOST: redis-16337.c322.us-east-1-2.ec2.cloud.redislabs.com
# REDIS_PORT: "16337"
diff --git a/pyproject.toml b/pyproject.toml
index acbbeb1dd..d07e87500 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
[tool.poetry]
name = "litellm"
-version = "1.31.12"
+version = "1.31.16"
description = "Library to easily interface with LLM API providers"
authors = ["BerriAI"]
license = "MIT"
@@ -76,7 +76,7 @@ requires = ["poetry-core", "wheel"]
build-backend = "poetry.core.masonry.api"
[tool.commitizen]
-version = "1.31.12"
+version = "1.31.16"
version_files = [
"pyproject.toml:^version"
]
diff --git a/requirements.txt b/requirements.txt
index adfec7bc6..eaff0fb71 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -34,5 +34,4 @@ jinja2==3.1.3 # for prompt templates
certifi>=2023.7.22 # [TODO] clean up
aiohttp==3.9.0 # for network calls
aioboto3==12.3.0 # for async sagemaker calls
-argon2-cffi==23.1.0 # for checking secrets
####
\ No newline at end of file