Merge branch 'main' into support_anthropic_function_result

Krish Dholakia 2024-03-16 09:58:08 -07:00 committed by GitHub
commit 0368a335e6
42 changed files with 815 additions and 216 deletions

.dockerignore Normal file

@ -0,0 +1,5 @@
/docs
/cookbook
/.circleci
/.github
/tests


@ -10,6 +10,7 @@ on:
env:
REGISTRY: ghcr.io
IMAGE_NAME: ${{ github.repository }}
CHART_NAME: litellm
# There is a single job in this workflow. It's configured to run on the latest available version of Ubuntu.
jobs:
@ -103,6 +104,11 @@ jobs:
uses: docker/metadata-action@9ec57ed1fcdbf14dcef7dfbe97b2010124a938b7
with:
images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}-database
# Configure multi platform Docker builds
- name: Set up QEMU
uses: docker/setup-qemu-action@e0e4588fad221d38ee467c0bffd91115366dc0c5
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@edfb0fe6204400c56fbfd3feba3fe9ad1adfa345
- name: Build and push Database Docker image
uses: docker/build-push-action@f2a1d5e99d037542a71f64918e516c093c6f3fc4
@ -112,6 +118,60 @@ jobs:
push: true
tags: ${{ steps.meta-database.outputs.tags }}-${{ github.event.inputs.tag || 'latest' }}, ${{ steps.meta-database.outputs.tags }}-latest
labels: ${{ steps.meta-database.outputs.labels }}
platforms: local,linux/amd64,linux/arm64,linux/arm64/v8
build-and-push-helm-chart:
runs-on: ubuntu-latest
steps:
- name: Checkout repository
uses: actions/checkout@v4
- name: Log in to the Container registry
uses: docker/login-action@65b78e6e13532edd9afa3aa52ac7964289d1a9c1
with:
registry: ${{ env.REGISTRY }}
username: ${{ github.actor }}
password: ${{ secrets.GITHUB_TOKEN }}
- name: lowercase github.repository_owner
run: |
echo "REPO_OWNER=`echo ${{github.repository_owner}} | tr '[:upper:]' '[:lower:]'`" >>${GITHUB_ENV}
- name: Get LiteLLM Latest Tag
id: current_app_tag
uses: WyriHaximus/github-action-get-previous-tag@v1.3.0
- name: Get last published chart version
id: current_version
shell: bash
run: |
CHART_LIST=$(helm show chart oci://${{ env.REGISTRY }}/${{ env.REPO_OWNER }}/${{ env.CHART_NAME }} 2>/dev/null || true)
if [ -z "${CHART_LIST}" ]; then
echo "current-version=0.1.0" | tee -a $GITHUB_OUTPUT
else
printf '%s' "${CHART_LIST}" | grep '^version:' | awk 'BEGIN{FS=":"}{print "current-version="$2}' | tr -d " " | tee -a $GITHUB_OUTPUT
fi
env:
HELM_EXPERIMENTAL_OCI: '1'
# Automatically update the helm chart version one "patch" level
- name: Bump release version
id: bump_version
uses: christian-draeger/increment-semantic-version@1.1.0
with:
current-version: ${{ steps.current_version.outputs.current-version || '0.1.0' }}
version-fragment: 'bug'
- uses: ./.github/actions/helm-oci-chart-releaser
with:
name: ${{ env.CHART_NAME }}
repository: ${{ env.REPO_OWNER }}
tag: ${{ github.event.inputs.chartVersion || steps.bump_version.outputs.next-version || '0.1.0' }}
app_version: ${{ steps.current_app_tag.outputs.tag || 'latest' }}
path: deploy/charts/${{ env.CHART_NAME }}
registry: ${{ env.REGISTRY }}
registry_username: ${{ github.actor }}
registry_password: ${{ secrets.GITHUB_TOKEN }}
update_dependencies: true
release:
name: "New LiteLLM Release"
needs: [docker-hub-deploy, build-and-push-image, build-and-push-image-database]
@ -171,13 +231,13 @@ jobs:
RELEASE_NOTES: ${{ steps.release-notes.outputs.result }}
run: |
curl -H "Content-Type: application/json" -X POST -d '{
"content": "||@everyone||",
"content": "New LiteLLM release ${{ env.RELEASE_TAG }}",
"username": "Release Changelog",
"avatar_url": "https://cdn.discordapp.com/avatars/487431320314576937/bd64361e4ba6313d561d54e78c9e7171.png",
"embeds": [
{
"title": "Changelog for ${RELEASE_TAG}",
"description": "${RELEASE_NOTES}",
"title": "Changelog for LiteLLM ${{ env.RELEASE_TAG }}",
"description": "${{ env.RELEASE_NOTES }}",
"color": 2105893
}
]


@ -0,0 +1,91 @@
import csv
import os
from github import Github
def interpret_results(csv_file):
with open(csv_file, newline="") as csvfile:
csvreader = csv.DictReader(csvfile)
rows = list(csvreader)
"""
in this csv reader
- Create 1 new column "Status"
- if a row has a median response time < 300 and an average response time < 300, Status = "Passed ✅"
- if a row has a median response time >= 300 or an average response time >= 300, Status = "Failed ❌"
- Order the table in this order Name, Status, Median Response Time, Average Response Time, Requests/s,Failures/s, Min Response Time, Max Response Time, all other columns
"""
# Add a new column "Status"
for row in rows:
median_response_time = float(
row["Median Response Time"].strip().rstrip("ms")
)
average_response_time = float(
row["Average Response Time"].strip().rstrip("s")
)
request_count = int(row["Request Count"])
failure_count = int(row["Failure Count"])
failure_percent = round((failure_count / request_count) * 100, 2)
# Determine status based on conditions
if (
median_response_time < 300
and average_response_time < 300
and failure_percent < 5
):
row["Status"] = "Passed ✅"
else:
row["Status"] = "Failed ❌"
# Construct Markdown table header
markdown_table = "| Name | Status | Median Response Time (ms) | Average Response Time (ms) | Requests/s | Failures/s | Request Count | Failure Count | Min Response Time (ms) | Max Response Time (ms) |"
markdown_table += (
"\n| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |"
)
# Construct Markdown table rows
for row in rows:
markdown_table += f"\n| {row['Name']} | {row['Status']} | {row['Median Response Time']} | {row['Average Response Time']} | {row['Requests/s']} | {row['Failures/s']} | {row['Request Count']} | {row['Failure Count']} | {row['Min Response Time']} | {row['Max Response Time']} |"
print("markdown table: ", markdown_table)
return markdown_table
if __name__ == "__main__":
csv_file = "load_test_stats.csv" # Change this to the path of your CSV file
markdown_table = interpret_results(csv_file)
# Update release body with interpreted results
github_token = os.getenv("GITHUB_TOKEN")
g = Github(github_token)
repo = g.get_repo(
"BerriAI/litellm"
) # Replace with your repository's username and name
latest_release = repo.get_latest_release()
print("got latest release: ", latest_release)
print("latest release body: ", latest_release.body)
print("markdown table: ", markdown_table)
# check if "Load Test LiteLLM Proxy Results" exists
existing_release_body = latest_release.body
if "Load Test LiteLLM Proxy Results" in latest_release.body:
# find the "Load Test LiteLLM Proxy Results" section and delete it
start_index = latest_release.body.find("Load Test LiteLLM Proxy Results")
existing_release_body = latest_release.body[:start_index]
new_release_body = (
existing_release_body
+ "\n\n"
+ "## Load Test LiteLLM Proxy Results"
+ "\n\n"
+ markdown_table
)
print("new release body: ", new_release_body)
try:
latest_release.update_release(
name=latest_release.tag_name,
message=new_release_body,
)
except Exception as e:
print(e)


@ -1,6 +1,11 @@
name: Test Locust Load Test
on: [push]
on:
workflow_run:
workflows: ["Build, Publish LiteLLM Docker Image. New Release"]
types:
- completed
workflow_dispatch:
jobs:
build:
@ -8,15 +13,32 @@ jobs:
steps:
- name: Checkout
uses: actions/checkout@v1
- name: Setup Python
uses: actions/setup-python@v2
with:
python-version: '3.x'
- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install PyGithub
- name: Run Load Test
id: locust_run
uses: BerriAI/locust-github-action@master
with:
LOCUSTFILE: ".github/workflows/locustfile.py"
URL: "https://litellm-api.up.railway.app/"
URL: "https://litellm-database-docker-build-production.up.railway.app/"
USERS: "100"
RATE: "10"
RUNTIME: "60s"
RUNTIME: "300s"
- name: Process Load Test Stats
run: |
echo "Current working directory: $PWD"
ls
python ".github/workflows/interpret_load_test.py"
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
working-directory: ${{ github.workspace }}
- name: Upload CSV as Asset to Latest Release
uses: xresloader/upload-to-github-release@v1
env:
@ -25,4 +47,4 @@ jobs:
file: "load_test_stats.csv;load_test.html"
update_latest_release: true
tag_name: "load-test"
overwrite: true
overwrite: true


@ -1,4 +1,6 @@
from locust import HttpUser, task, between
from locust import HttpUser, task, between, events
import json
import time
class MyUser(HttpUser):
@ -8,7 +10,7 @@ class MyUser(HttpUser):
def chat_completion(self):
headers = {
"Content-Type": "application/json",
"Authorization": f"Bearer sk-1234",
"Authorization": f"Bearer sk-gUvTeN9g0sgHBMf9HeCaqA",
# Include any additional headers you may need for authentication, etc.
}
@ -26,3 +28,15 @@ class MyUser(HttpUser):
response = self.client.post("chat/completions", json=payload, headers=headers)
# Print or log the response if needed
@task(10)
def health_readiness(self):
start_time = time.time()
response = self.client.get("health/readiness")
response_time = time.time() - start_time
@task(10)
def health_liveliness(self):
start_time = time.time()
response = self.client.get("health/liveliness")
response_time = time.time() - start_time


@ -66,4 +66,4 @@ ENTRYPOINT ["litellm"]
# Append "--detailed_debug" to the end of CMD to view detailed debug logs
# CMD ["--port", "4000", "--config", "./proxy_server_config.yaml", "--run_gunicorn", "--detailed_debug"]
CMD ["--port", "4000", "--config", "./proxy_server_config.yaml", "--run_gunicorn", "--num_workers", "1"]
CMD ["--port", "4000", "--config", "./proxy_server_config.yaml", "--run_gunicorn", "--num_workers", "4"]


@ -2,7 +2,7 @@
## Prerequisites
- Kubernetes 1.23+
- Kubernetes 1.21+
- Helm 3.8.0+
If `db.deployStandalone` is used:
@ -33,6 +33,7 @@ If `db.useStackgresOperator` is used (not yet implemented):
| `proxy_config.*` | See [values.yaml](./values.yaml) for default settings. See [example_config_yaml](../../../litellm/proxy/example_config_yaml/) for configuration examples. | N/A |
#### Example `environmentSecrets` Secret
```
apiVersion: v1
kind: Secret


@ -6,7 +6,6 @@ replicaCount: 1
image:
# Use "ghcr.io/berriai/litellm-database" for optimized image with database
# Alternatively, use "ghcr.io/berriai/litellm" for the default image
repository: ghcr.io/berriai/litellm-database
pullPolicy: IfNotPresent
# Overrides the image tag whose default is the chart appVersion.
@ -85,10 +84,13 @@ proxy_config:
litellm_params:
model: gpt-3.5-turbo
api_key: eXaMpLeOnLy
- model_name: fake-openai-endpoint
litellm_params:
model: openai/fake
api_key: fake-key
api_base: https://exampleopenaiendpoint-production.up.railway.app/
general_settings:
master_key: os.environ/PROXY_MASTER_KEY
# litellm_settings:
# cache: true
resources: {}
# We usually recommend not to specify default resources and to leave this as a conscious


@ -77,9 +77,32 @@ litellm --config /path/to/config.yaml
### Test
<Tabs>
<TabItem value="curl" label="Curl">
```bash
curl --location 'http://0.0.0.0:4000/v1/audio/transcriptions' \
curl --location 'http://0.0.0.0:8000/v1/audio/transcriptions' \
--header 'Authorization: Bearer sk-1234' \
--form 'file=@"/Users/krrishdholakia/Downloads/gettysburg.wav"' \
--form 'model="whisper"'
```
</TabItem>
<TabItem value="openai" label="OpenAI">
```python
from openai import OpenAI
client = OpenAI(
api_key="sk-1234",
base_url="http://0.0.0.0:8000"
)
audio_file = open("speech.mp3", "rb")
transcript = client.audio.transcriptions.create(
model="whisper",
file=audio_file
)
```
</TabItem>
</Tabs>


@ -133,3 +133,6 @@ chat(messages)
```
</TabItem>
</Tabs>
## Use LangChain ChatLiteLLM + Langfuse
Check out this section [here](../observability/langfuse_integration#use-langchain-chatlitellm--langfuse) for more details on how to integrate Langfuse with ChatLiteLLM.


@ -2,6 +2,54 @@ import Image from '@theme/IdealImage';
# 🔥 Load Test LiteLLM
## How to run a locust load test on LiteLLM Proxy
1. Add `fake-openai-endpoint` to your proxy config.yaml and start your litellm proxy
LiteLLM provides a free hosted `fake-openai-endpoint` you can load test against.
```yaml
model_list:
- model_name: fake-openai-endpoint
litellm_params:
model: openai/fake
api_key: fake-key
api_base: https://exampleopenaiendpoint-production.up.railway.app/
```
2. `pip install locust`
3. Create a file called `locustfile.py` on your local machine. Copy the contents from the litellm load test located [here](https://github.com/BerriAI/litellm/blob/main/.github/workflows/locustfile.py)
4. Start locust
Run `locust` in the same directory as the `locustfile.py` you created in step 3
```shell
locust
```
Output on terminal
```
[2024-03-15 07:19:58,893] Starting web interface at http://0.0.0.0:8089
[2024-03-15 07:19:58,898] Starting Locust 2.24.0
```
5. Run Load test on locust
Head to the locust UI on http://0.0.0.0:8089
Set Users=100, Ramp Up Users=10, Host=Base URL of your LiteLLM Proxy
<Image img={require('../img/locust_load_test.png')} />
6. Expected Results
Expect to see the following response times for `/health/readiness`
Median → /health/readiness is `150ms`
Avg → /health/readiness is `219ms`
<Image img={require('../img/litellm_load_test.png')} />
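For a quick spot check of `/health/readiness` latency outside of locust, a short script like the sketch below works; the base URL is an assumption and should point at your own proxy.
```python
# Rough latency spot check for the proxy readiness endpoint (a sketch, not part
# of the locust test itself). Adjust BASE_URL to wherever your LiteLLM proxy runs.
import time
import requests

BASE_URL = "http://0.0.0.0:4000"  # assumed local proxy address

latencies_ms = []
for _ in range(20):
    start = time.time()
    requests.get(f"{BASE_URL}/health/readiness", timeout=5)
    latencies_ms.append((time.time() - start) * 1000)

latencies_ms.sort()
median = latencies_ms[len(latencies_ms) // 2]
average = sum(latencies_ms) / len(latencies_ms)
print(f"median: {median:.0f} ms, average: {average:.0f} ms")
```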
## Load Test LiteLLM Proxy - 1500+ req/s
## 1500+ concurrent requests/s


@ -132,6 +132,41 @@ print(response)
```
### Use LangChain ChatLiteLLM + Langfuse
Pass `trace_user_id`, `session_id` in model_kwargs
```python
import os
from langchain.chat_models import ChatLiteLLM
from langchain.schema import HumanMessage
import litellm
# from https://cloud.langfuse.com/
os.environ["LANGFUSE_PUBLIC_KEY"] = ""
os.environ["LANGFUSE_SECRET_KEY"] = ""
os.environ['OPENAI_API_KEY']=""
# set langfuse as a callback, litellm will send the data to langfuse
litellm.success_callback = ["langfuse"]
chat = ChatLiteLLM(
model="gpt-3.5-turbo"
model_kwargs={
"metadata": {
"trace_user_id": "user-id2", # set langfuse Trace User ID
"session_id": "session-1" , # set langfuse Session ID
"tags": ["tag1", "tag2"]
}
}
)
messages = [
HumanMessage(
content="what model are you"
)
]
chat(messages)
```
## Troubleshooting & Errors
### Data not getting logged to Langfuse?
@ -142,4 +177,4 @@ print(response)
- [Schedule Demo 👋](https://calendly.com/d/4mp-gd3-k5k/berriai-1-1-onboarding-litellm-hosted-version)
- [Community Discord 💭](https://discord.gg/wuPM9dRgDw)
- Our numbers 📞 +1 (770) 8783-106 / +1 (412) 618-6238
- Our emails ✉️ ishaan@berri.ai / krrish@berri.ai
- Our emails ✉️ ishaan@berri.ai / krrish@berri.ai


@ -49,7 +49,7 @@ for chunk in response:
| command-light | `completion('command-light', messages)` |
| command-medium | `completion('command-medium', messages)` |
| command-medium-beta | `completion('command-medium-beta', messages)` |
| command-xlarge-beta | `completion('command-xlarge-beta', messages)` |
| command-xlarge-nightly | `completion('command-xlarge-nightly', messages)` |
| command-nightly | `completion('command-nightly', messages)` |


@ -0,0 +1,53 @@
# Fireworks AI
https://fireworks.ai/
**We support ALL Fireworks AI models; just set `fireworks_ai/` as a prefix when sending completion requests.**
## API Key
```python
# env variable
os.environ['FIREWORKS_AI_API_KEY']
```
## Sample Usage
```python
from litellm import completion
import os
os.environ['FIREWORKS_AI_API_KEY'] = ""
response = completion(
model="fireworks_ai/mixtral-8x7b-instruct",
messages=[
{"role": "user", "content": "hello from litellm"}
],
)
print(response)
```
## Sample Usage - Streaming
```python
from litellm import completion
import os
os.environ['FIREWORKS_AI_API_KEY'] = ""
response = completion(
model="fireworks_ai/mixtral-8x7b-instruct",
messages=[
{"role": "user", "content": "hello from litellm"}
],
stream=True
)
for chunk in response:
print(chunk)
```
## Supported Models - ALL Fireworks AI Models Supported!
We support ALL Fireworks AI models; just set `fireworks_ai/` as a prefix when sending completion requests.
| Model Name | Function Call |
|--------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| mixtral-8x7b-instruct | `completion(model="fireworks_ai/mixtral-8x7b-instruct", messages)` |
| firefunction-v1 | `completion(model="fireworks_ai/firefunction-v1", messages)` |
| llama-v2-70b-chat | `completion(model="fireworks_ai/llama-v2-70b-chat", messages)` |


@ -49,4 +49,5 @@ We support ALL Groq models, just set `groq/` as a prefix when sending completion
| Model Name | Function Call |
|--------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| llama2-70b-4096 | `completion(model="groq/llama2-70b-4096", messages)` |
| mixtral-8x7b-32768 | `completion(model="groq/mixtral-8x7b-32768", messages)` |
| mixtral-8x7b-32768 | `completion(model="groq/mixtral-8x7b-32768", messages)` |
| gemma-7b-it | `completion(model="groq/gemma-7b-it", messages)` |


@ -225,6 +225,32 @@ litellm_settings:
supported_call_types: ["acompletion", "completion", "embedding", "aembedding"] # defaults to all litellm call types
```
### Turn on `batch_redis_requests`
**What does it do?**
When a request is made:
- Check whether a key starting with `litellm:<hashed_api_key>:<call_type>:` exists in memory; if not, fetch the last 100 cached requests for this key from Redis and store them in memory
- New requests are cached under this `litellm:..` namespace
**Why?**
Reduces the number of Redis GET requests. This improved latency by 46% in production load tests.
**Usage**
```yaml
litellm_settings:
cache: true
cache_params:
type: redis
... # remaining redis args (host, port, etc.)
callbacks: ["batch_redis_requests"] # 👈 KEY CHANGE!
```
[**SEE CODE**](https://github.com/BerriAI/litellm/blob/main/litellm/proxy/hooks/batch_redis_get.py)
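To make the key layout concrete, here is a hedged sketch of how a request's cache key picks up the namespace; the namespace string and Redis connection details are illustrative, and in practice the proxy hook injects `metadata["redis_namespace"]` for you.
```python
# Illustrative only: shows the namespaced key shape batch_redis_requests relies on.
# The proxy hook normally sets metadata["redis_namespace"]; you don't set it by hand.
import litellm

cache = litellm.Cache(type="redis", host="localhost", port=6379)  # assumed local Redis

cache_key = cache.get_cache_key(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "hi"}],
    metadata={"redis_namespace": "litellm:<hashed_api_key>:acompletion"},  # hypothetical namespace
)
print(cache_key)  # -> "litellm:<hashed_api_key>:acompletion:<sha-256 request hash>"
```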
### Turn on / off caching per request.
The proxy supports 3 cache-controls:


@ -150,17 +150,20 @@ To avoid issues with predictability, difficulties in rollback, and inconsistent
## Deploy with Database
### Docker, Kubernetes, Helm Chart
<Tabs>
<TabItem value="docker-deploy" label="Dockerfile">
We maintain a [separate Dockerfile](https://github.com/BerriAI/litellm/pkgs/container/litellm-database) to reduce build time when running the LiteLLM proxy with a connected Postgres database
<Tabs>
<TabItem value="docker-deploy" label="Dockerfile">
```
```shell
docker pull ghcr.io/berriai/litellm-database:main-latest
```
```
```shell
docker run --name litellm-proxy \
-e DATABASE_URL=postgresql://<user>:<password>@<host>:<port>/<dbname> \
-p 4000:4000 \
@ -233,6 +236,8 @@ Your OpenAI proxy server is now running on `http://0.0.0.0:4000`.
</TabItem>
<TabItem value="helm-deploy" label="Helm">
Use this to deploy litellm using a helm chart. Link to [the LiteLLM Helm Chart](https://github.com/BerriAI/litellm/tree/main/deploy/charts/litellm)
#### Step 1. Clone the repository
```bash
@ -241,9 +246,11 @@ git clone https://github.com/BerriAI/litellm.git
#### Step 2. Deploy with Helm
Run the following command in the root of your `litellm` repo. This will set the litellm proxy master key as `sk-1234`
```bash
helm install \
--set masterkey=SuPeRsEcReT \
--set masterkey=sk-1234 \
mydeploy \
deploy/charts/litellm
```
@ -259,6 +266,9 @@ kubectl \
Your OpenAI proxy server is now running on `http://127.0.0.1:4000`.
If you need to set your litellm proxy config.yaml, you can find this in [values.yaml](https://github.com/BerriAI/litellm/blob/main/deploy/charts/litellm/values.yaml)
</TabItem>
</Tabs>

Two binary image files added (125 KiB and 204 KiB); these appear to be the load-test screenshots referenced in the docs above.


@ -138,6 +138,7 @@ const sidebars = {
"providers/ollama",
"providers/perplexity",
"providers/groq",
"providers/fireworks_ai",
"providers/vllm",
"providers/xinference",
"providers/cloudflare_workers",

enterprise/__init__.py Normal file

@ -0,0 +1 @@
from . import *


@ -36,6 +36,7 @@ token: Optional[str] = (
telemetry = True
max_tokens = 256 # OpenAI Defaults
drop_params = False
modify_params = False
retry = True
api_key: Optional[str] = None
openai_key: Optional[str] = None
@ -327,6 +328,7 @@ openai_compatible_providers: List = [
"perplexity",
"xinference",
"together_ai",
"fireworks_ai",
]
@ -478,6 +480,7 @@ provider_list: List = [
"voyage",
"cloudflare",
"xinference",
"fireworks_ai",
"custom", # custom apis
]
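These `__init__.py` changes add a module-level `modify_params` flag (used later in this diff by `utils.py` to gate parameter adjustment) and register `fireworks_ai` as an OpenAI-compatible provider. A minimal sketch of opting in to the new flag:
```python
# Sketch: enable the new module-level flag added above. When True, LiteLLM may
# adjust request params (e.g. the max_tokens handling in utils.py) on the
# caller's behalf; it defaults to False.
import litellm

litellm.modify_params = True
```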


@ -129,6 +129,16 @@ class RedisCache(BaseCache):
f"LiteLLM Caching: set() - Got exception from REDIS : {str(e)}"
)
async def async_scan_iter(self, pattern: str, count: int = 100) -> list:
keys = []
_redis_client = self.init_async_client()
async with _redis_client as redis_client:
async for key in redis_client.scan_iter(match=pattern + "*", count=count):
keys.append(key)
if len(keys) >= count:
break
return keys
async def async_set_cache(self, key, value, **kwargs):
_redis_client = self.init_async_client()
async with _redis_client as redis_client:
@ -140,6 +150,9 @@ class RedisCache(BaseCache):
await redis_client.set(
name=key, value=json.dumps(value), ex=ttl, get=True
)
print_verbose(
f"Successfully Set ASYNC Redis Cache: key: {key}\nValue {value}\nttl={ttl}"
)
except Exception as e:
# NON blocking - notify users Redis is throwing an exception
print_verbose(
@ -172,8 +185,6 @@ class RedisCache(BaseCache):
return results
except Exception as e:
print_verbose(f"Error occurred in pipeline write - {str(e)}")
# NON blocking - notify users Redis is throwing an exception
logging.debug("LiteLLM Caching: set() - Got exception from REDIS : ", e)
def _get_cache_logic(self, cached_response: Any):
"""
@ -208,7 +219,7 @@ class RedisCache(BaseCache):
_redis_client = self.init_async_client()
async with _redis_client as redis_client:
try:
print_verbose(f"Get Redis Cache: key: {key}")
print_verbose(f"Get Async Redis Cache: key: {key}")
cached_response = await redis_client.get(key)
print_verbose(
f"Got Async Redis Cache: key: {key}, cached_response {cached_response}"
@ -217,8 +228,39 @@ class RedisCache(BaseCache):
return response
except Exception as e:
# NON blocking - notify users Redis is throwing an exception
traceback.print_exc()
logging.debug("LiteLLM Caching: get() - Got exception from REDIS: ", e)
print_verbose(
f"LiteLLM Caching: async get() - Got exception from REDIS: {str(e)}"
)
async def async_get_cache_pipeline(self, key_list) -> dict:
"""
Use Redis for bulk read operations
"""
_redis_client = await self.init_async_client()
key_value_dict = {}
try:
async with _redis_client as redis_client:
async with redis_client.pipeline(transaction=True) as pipe:
# Queue the get operations in the pipeline for all keys.
for cache_key in key_list:
pipe.get(cache_key) # Queue GET command in pipeline
# Execute the pipeline and await the results.
results = await pipe.execute()
# Associate the results back with their keys.
# 'results' is a list of values corresponding to the order of keys in 'key_list'.
key_value_dict = dict(zip(key_list, results))
decoded_results = {
k.decode("utf-8"): self._get_cache_logic(v)
for k, v in key_value_dict.items()
}
return decoded_results
except Exception as e:
print_verbose(f"Error occurred in pipeline read - {str(e)}")
return key_value_dict
def flush_cache(self):
self.redis_client.flushall()
@ -1001,6 +1043,10 @@ class Cache:
if self.namespace is not None:
hash_hex = f"{self.namespace}:{hash_hex}"
print_verbose(f"Hashed Key with Namespace: {hash_hex}")
elif kwargs.get("metadata", {}).get("redis_namespace", None) is not None:
_namespace = kwargs.get("metadata", {}).get("redis_namespace", None)
hash_hex = f"{_namespace}:{hash_hex}"
print_verbose(f"Hashed Key with Namespace: {hash_hex}")
return hash_hex
def generate_streaming_content(self, content):
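The new `async_scan_iter` and `async_get_cache_pipeline` helpers above are what the batch-Redis hook later in this diff builds on. A rough usage sketch, assuming a local Redis and an illustrative namespace:
```python
# Rough sketch: scan for keys under a namespace, then bulk-read them in one
# pipelined round trip using the new RedisCache helpers.
import asyncio
from litellm.caching import RedisCache

async def warm_namespace(namespace: str) -> dict:
    cache = RedisCache(host="localhost", port=6379)  # assumed local Redis
    keys = await cache.async_scan_iter(pattern=namespace, count=100)
    if not keys:
        return {}
    return await cache.async_get_cache_pipeline(key_list=keys)

values = asyncio.run(warm_namespace("litellm:<hashed_api_key>:acompletion:"))
print(f"warmed {len(values)} cached entries")
```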


@ -82,12 +82,22 @@ class AmazonAnthropicClaude3Config:
Supported Params for the Amazon / Anthropic Claude 3 models:
- `max_tokens` (integer) max tokens,
- `anthropic_version` (string) version of anthropic for bedrock - e.g. "bedrock-2023-05-31"
- `max_tokens` Required (integer) max tokens,
- `anthropic_version` Required (string) version of anthropic for bedrock - e.g. "bedrock-2023-05-31"
- `system` Optional (string) the system prompt, conversion from openai format to this is handled in factory.py
- `temperature` Optional (float) The amount of randomness injected into the response
- `top_p` Optional (float) Use nucleus sampling.
- `top_k` Optional (int) Only sample from the top K options for each subsequent token
- `stop_sequences` Optional (List[str]) Custom text sequences that cause the model to stop generating
"""
max_tokens: Optional[int] = litellm.max_tokens
anthropic_version: Optional[str] = "bedrock-2023-05-31"
system: Optional[str] = None
temperature: Optional[float] = None
top_p: Optional[float] = None
top_k: Optional[int] = None
stop_sequences: Optional[List[str]] = None
def __init__(
self,
@ -128,6 +138,12 @@ class AmazonAnthropicClaude3Config:
optional_params["tools"] = value
if param == "stream":
optional_params["stream"] = value
if param == "stop":
optional_params["stop_sequences"] = value
if param == "temperature":
optional_params["temperature"] = value
if param == "top_p":
optional_params["top_p"] = value
return optional_params
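For context, a hedged example of exercising this mapping through `litellm.completion` with a Bedrock Claude 3 model; the model ID and parameter values are illustrative, and AWS credentials for Bedrock are assumed to be configured.
```python
# Illustrative call: the OpenAI-style params below are mapped by the config
# above (stop -> stop_sequences; temperature/top_p passed through).
import litellm

response = litellm.completion(
    model="bedrock/anthropic.claude-3-sonnet-20240229-v1:0",  # assumed Bedrock model id
    messages=[{"role": "user", "content": "Write a haiku about caching."}],
    max_tokens=256,
    temperature=0.2,
    top_p=0.9,
    stop=["Human:"],
)
print(response.choices[0].message.content)
```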


@ -300,8 +300,7 @@ def embedding(
for text in input:
input_tokens += len(encoding.encode(text))
model_response["usage"] = {
"prompt_tokens": input_tokens,
"total_tokens": input_tokens,
}
model_response["usage"] = Usage(
prompt_tokens=input_tokens, completion_tokens=0, total_tokens=input_tokens
)
return model_response


@ -705,6 +705,7 @@ def anthropic_messages_pt(messages: list):
"text"
].rstrip() # no trailing whitespace for final assistant message
return new_messages


@ -12,6 +12,7 @@ from typing import Any, Literal, Union, BinaryIO
from functools import partial
import dotenv, traceback, random, asyncio, time, contextvars
from copy import deepcopy
import httpx
import litellm
from ._logging import verbose_logger
@ -891,6 +892,7 @@ def completion(
or custom_llm_provider == "mistral"
or custom_llm_provider == "openai"
or custom_llm_provider == "together_ai"
or custom_llm_provider in litellm.openai_compatible_providers
or "ft:gpt-3.5-turbo" in model # finetune gpt-3.5-turbo
): # allow user to make an openai call with a custom base
# note: if a user sets a custom base - we should ensure this works
@ -2393,6 +2395,7 @@ async def aembedding(*args, **kwargs):
or custom_llm_provider == "deepinfra"
or custom_llm_provider == "perplexity"
or custom_llm_provider == "groq"
or custom_llm_provider == "fireworks_ai"
or custom_llm_provider == "ollama"
or custom_llm_provider == "vertex_ai"
): # currently implemented aiohttp calls for just azure and openai, soon all.
@ -2892,6 +2895,7 @@ async def atext_completion(*args, **kwargs):
or custom_llm_provider == "deepinfra"
or custom_llm_provider == "perplexity"
or custom_llm_provider == "groq"
or custom_llm_provider == "fireworks_ai"
or custom_llm_provider == "text-completion-openai"
or custom_llm_provider == "huggingface"
or custom_llm_provider == "ollama"


@ -631,6 +631,13 @@
"litellm_provider": "groq",
"mode": "chat"
},
"groq/gemma-7b-it": {
"max_tokens": 8192,
"input_cost_per_token": 0.00000010,
"output_cost_per_token": 0.00000010,
"litellm_provider": "groq",
"mode": "chat"
},
"claude-instant-1.2": {
"max_tokens": 100000,
"max_output_tokens": 8191,


@ -9,6 +9,12 @@ model_list:
model: gpt-3.5-turbo-1106
api_key: os.environ/OPENAI_API_KEY
litellm_settings:
cache: true
cache_params:
type: redis
callbacks: ["batch_redis_requests"]
general_settings:
master_key: sk-1234
database_url: "postgresql://krrishdholakia:9yQkKWiB8vVs@ep-icy-union-a5j4dwls.us-east-2.aws.neon.tech/neondb?sslmode=require"
# database_url: "postgresql://krrishdholakia:9yQkKWiB8vVs@ep-icy-union-a5j4dwls.us-east-2.aws.neon.tech/neondb?sslmode=require"


@ -0,0 +1,124 @@
# What does this do?
## Gets a key's Redis cache and stores it in memory for 1 minute.
## This reduces the number of Redis GET requests the proxy makes during high traffic.
### [BETA] This is in beta and might change.
from typing import Optional, Literal
import litellm
from litellm.caching import DualCache, RedisCache, InMemoryCache
from litellm.proxy._types import UserAPIKeyAuth
from litellm.integrations.custom_logger import CustomLogger
from litellm._logging import verbose_proxy_logger
from fastapi import HTTPException
import json, traceback
class _PROXY_BatchRedisRequests(CustomLogger):
# Class variables or attributes
in_memory_cache: Optional[InMemoryCache] = None
def __init__(self):
litellm.cache.async_get_cache = (
self.async_get_cache
) # map the litellm 'get_cache' function to our custom function
def print_verbose(
self, print_statement, debug_level: Literal["INFO", "DEBUG"] = "DEBUG"
):
if debug_level == "DEBUG":
verbose_proxy_logger.debug(print_statement)
elif debug_level == "INFO":
verbose_proxy_logger.debug(print_statement)
if litellm.set_verbose is True:
print(print_statement) # noqa
async def async_pre_call_hook(
self,
user_api_key_dict: UserAPIKeyAuth,
cache: DualCache,
data: dict,
call_type: str,
):
try:
"""
Get the user key
Check if a key starting with `litellm:<api_key>:<call_type>:` exists in-memory
If no, then get relevant cache from redis
"""
api_key = user_api_key_dict.api_key
cache_key_name = f"litellm:{api_key}:{call_type}"
self.in_memory_cache = cache.in_memory_cache
key_value_dict = {}
in_memory_cache_exists = False
for key in cache.in_memory_cache.cache_dict.keys():
if isinstance(key, str) and key.startswith(cache_key_name):
in_memory_cache_exists = True
if in_memory_cache_exists == False and litellm.cache is not None:
"""
- Check if `litellm.Cache` is redis
- Get the relevant values
"""
if litellm.cache.type is not None and isinstance(
litellm.cache.cache, RedisCache
):
# Initialize an empty list to store the keys
keys = []
self.print_verbose(f"cache_key_name: {cache_key_name}")
# Use the SCAN iterator to fetch keys matching the pattern
keys = await litellm.cache.cache.async_scan_iter(
pattern=cache_key_name, count=100
)
# If you need the truly "last" based on time or another criteria,
# ensure your key naming or storage strategy allows this determination
# Here you would sort or filter the keys as needed based on your strategy
self.print_verbose(f"redis keys: {keys}")
if len(keys) > 0:
key_value_dict = (
await litellm.cache.cache.async_get_cache_pipeline(
key_list=keys
)
)
## Add to cache
if len(key_value_dict.items()) > 0:
await cache.in_memory_cache.async_set_cache_pipeline(
cache_list=list(key_value_dict.items()), ttl=60
)
## Set cache namespace if it's a miss
data["metadata"]["redis_namespace"] = cache_key_name
except HTTPException as e:
raise e
except Exception as e:
traceback.print_exc()
async def async_get_cache(self, *args, **kwargs):
"""
- Check if the cache key is in-memory
- Else return None
"""
try: # never block execution
if "cache_key" in kwargs:
cache_key = kwargs["cache_key"]
else:
cache_key = litellm.cache.get_cache_key(
*args, **kwargs
) # returns "<cache_key_name>:<hash>" - we pass redis_namespace in async_pre_call_hook. Done to avoid rewriting the async_set_cache logic
if cache_key is not None and self.in_memory_cache is not None:
cache_control_args = kwargs.get("cache", {})
max_age = cache_control_args.get(
"s-max-age", cache_control_args.get("s-maxage", float("inf"))
)
cached_result = self.in_memory_cache.get_cache(
cache_key, *args, **kwargs
)
return litellm.cache._get_cache_logic(
cached_result=cached_result, max_age=max_age
)
except Exception as e:
return None


@ -6,7 +6,7 @@ import time
class MyUser(HttpUser):
wait_time = between(1, 5)
@task
@task(3)
def chat_completion(self):
headers = {
"Content-Type": "application/json",
@ -31,62 +31,8 @@ class MyUser(HttpUser):
@task(10)
def health_readiness(self):
start_time = time.time()
response = self.client.get("health/readiness")
response_time = time.time() - start_time
if response_time > 1:
events.request_failure.fire(
request_type="GET",
name="health/readiness",
response_time=response_time,
exception=None,
response=response,
)
@task(10)
def health_liveliness(self):
start_time = time.time()
response = self.client.get("health/liveliness")
response_time = time.time() - start_time
if response_time > 1:
events.request_failure.fire(
request_type="GET",
name="health/liveliness",
response_time=response_time,
exception=None,
response=response,
)
# @task
# def key_generate(self):
# headers = {
# "Authorization": "Bearer sk-1234",
# "Content-Type": "application/json",
# }
# payload = {
# "models": ["gpt-3.5-turbo", "gpt-4", "claude-2"],
# "duration": "20m",
# "metadata": {"user": "ishaan@berri.ai"},
# "team_id": "core-infra",
# "max_budget": 10,
# "soft_budget": 5,
# }
# response = self.client.post("key/generate", json=payload, headers=headers)
# if response.status_code == 200:
# key_response = response.json()
# models = key_response.get("models", [])
# if models:
# # Use the first model from the key generation response to make a chat completions request
# model_to_use = models[0]
# chat_payload = {
# "model": model_to_use,
# "messages": [
# {"role": "system", "content": "You are a chat bot."},
# {"role": "user", "content": "Hello, how are you?"},
# ],
# }
# chat_response = self.client.post("chat/completions", json=chat_payload, headers=headers)
# # Print or log the chat response if needed


@ -8,7 +8,6 @@ import hashlib, uuid
import warnings
import importlib
import warnings
import backoff
def showwarning(message, category, filename, lineno, file=None, line=None):
@ -35,7 +34,6 @@ try:
import orjson
import logging
from apscheduler.schedulers.asyncio import AsyncIOScheduler
from argon2 import PasswordHasher
except ImportError as e:
raise ImportError(f"Missing dependency {e}. Run `pip install 'litellm[proxy]'`")
@ -145,9 +143,12 @@ from typing import Union
try:
# when using litellm cli
import litellm.proxy.enterprise as enterprise
except:
except Exception as e:
# when using litellm docker image
import enterprise # type: ignore
try:
import enterprise # type: ignore
except Exception as e:
pass
ui_link = f"/ui/"
ui_message = (
@ -252,7 +253,6 @@ user_headers = None
user_config_file_path = f"config_{int(time.time())}.yaml"
local_logging = True # writes logs to a local api_log.json file for debugging
experimental = False
ph = PasswordHasher()
#### GLOBAL VARIABLES ####
llm_router: Optional[litellm.Router] = None
llm_model_list: Optional[list] = None
@ -382,7 +382,7 @@ async def user_api_key_auth(
return valid_token
try:
is_master_key_valid = ph.verify(litellm_master_key_hash, api_key)
is_master_key_valid = secrets.compare_digest(api_key, master_key)
except Exception as e:
is_master_key_valid = False
@ -887,6 +887,9 @@ async def user_api_key_auth(
raise Exception(
f"This key is made for LiteLLM UI, Tried to access route: {route}. Not allowed"
)
if valid_token is None:
# No token was found when looking up in the DB
raise Exception("Invalid token passed")
if valid_token_dict is not None:
return UserAPIKeyAuth(api_key=api_key, **valid_token_dict)
else:
@ -1420,6 +1423,8 @@ async def update_cache(
try:
for _id in user_ids:
# Fetch the existing cost for the given user
if _id is None:
continue
existing_spend_obj = await user_api_key_cache.async_get_cache(key=_id)
if existing_spend_obj is None:
# if user does not exist in LiteLLM_UserTable, create a new user
@ -1791,6 +1796,16 @@ class ProxyConfig:
_ENTERPRISE_PromptInjectionDetection()
)
imported_list.append(prompt_injection_detection_obj)
elif (
isinstance(callback, str)
and callback == "batch_redis_requests"
):
from litellm.proxy.hooks.batch_redis_get import (
_PROXY_BatchRedisRequests,
)
batch_redis_obj = _PROXY_BatchRedisRequests()
imported_list.append(batch_redis_obj)
else:
imported_list.append(
get_instance_fn(
@ -1913,7 +1928,7 @@ class ProxyConfig:
master_key = litellm.get_secret(master_key)
if master_key is not None and isinstance(master_key, str):
litellm_master_key_hash = ph.hash(master_key)
litellm_master_key_hash = master_key
### CUSTOM API KEY AUTH ###
## pass filepath
custom_auth = general_settings.get("custom_auth", None)


@ -474,11 +474,10 @@ def test_redis_cache_completion_stream():
# test_redis_cache_completion_stream()
def test_redis_cache_acompletion_stream():
import asyncio
@pytest.mark.asyncio
async def test_redis_cache_acompletion_stream():
try:
litellm.set_verbose = False
litellm.set_verbose = True
random_word = generate_random_word()
messages = [
{
@ -496,37 +495,31 @@ def test_redis_cache_acompletion_stream():
response_1_content = ""
response_2_content = ""
async def call1():
nonlocal response_1_content
response1 = await litellm.acompletion(
model="gpt-3.5-turbo",
messages=messages,
max_tokens=40,
temperature=1,
stream=True,
)
async for chunk in response1:
response_1_content += chunk.choices[0].delta.content or ""
print(response_1_content)
response1 = await litellm.acompletion(
model="gpt-3.5-turbo",
messages=messages,
max_tokens=40,
temperature=1,
stream=True,
)
async for chunk in response1:
response_1_content += chunk.choices[0].delta.content or ""
print(response_1_content)
asyncio.run(call1())
time.sleep(0.5)
print("\n\n Response 1 content: ", response_1_content, "\n\n")
async def call2():
nonlocal response_2_content
response2 = await litellm.acompletion(
model="gpt-3.5-turbo",
messages=messages,
max_tokens=40,
temperature=1,
stream=True,
)
async for chunk in response2:
response_2_content += chunk.choices[0].delta.content or ""
print(response_2_content)
response2 = await litellm.acompletion(
model="gpt-3.5-turbo",
messages=messages,
max_tokens=40,
temperature=1,
stream=True,
)
async for chunk in response2:
response_2_content += chunk.choices[0].delta.content or ""
print(response_2_content)
asyncio.run(call2())
print("\nresponse 1", response_1_content)
print("\nresponse 2", response_2_content)
assert (
@ -536,14 +529,15 @@ def test_redis_cache_acompletion_stream():
litellm.success_callback = []
litellm._async_success_callback = []
except Exception as e:
print(e)
print(f"{str(e)}\n\n{traceback.format_exc()}")
raise e
# test_redis_cache_acompletion_stream()
def test_redis_cache_acompletion_stream_bedrock():
@pytest.mark.asyncio
async def test_redis_cache_acompletion_stream_bedrock():
import asyncio
try:
@ -565,39 +559,33 @@ def test_redis_cache_acompletion_stream_bedrock():
response_1_content = ""
response_2_content = ""
async def call1():
nonlocal response_1_content
response1 = await litellm.acompletion(
model="bedrock/anthropic.claude-v2",
messages=messages,
max_tokens=40,
temperature=1,
stream=True,
)
async for chunk in response1:
print(chunk)
response_1_content += chunk.choices[0].delta.content or ""
print(response_1_content)
response1 = await litellm.acompletion(
model="bedrock/anthropic.claude-v2",
messages=messages,
max_tokens=40,
temperature=1,
stream=True,
)
async for chunk in response1:
print(chunk)
response_1_content += chunk.choices[0].delta.content or ""
print(response_1_content)
asyncio.run(call1())
time.sleep(0.5)
print("\n\n Response 1 content: ", response_1_content, "\n\n")
async def call2():
nonlocal response_2_content
response2 = await litellm.acompletion(
model="bedrock/anthropic.claude-v2",
messages=messages,
max_tokens=40,
temperature=1,
stream=True,
)
async for chunk in response2:
print(chunk)
response_2_content += chunk.choices[0].delta.content or ""
print(response_2_content)
response2 = await litellm.acompletion(
model="bedrock/anthropic.claude-v2",
messages=messages,
max_tokens=40,
temperature=1,
stream=True,
)
async for chunk in response2:
print(chunk)
response_2_content += chunk.choices[0].delta.content or ""
print(response_2_content)
asyncio.run(call2())
print("\nresponse 1", response_1_content)
print("\nresponse 2", response_2_content)
assert (
@ -612,8 +600,8 @@ def test_redis_cache_acompletion_stream_bedrock():
raise e
@pytest.mark.skip(reason="AWS Suspended Account")
def test_s3_cache_acompletion_stream_azure():
@pytest.mark.asyncio
async def test_s3_cache_acompletion_stream_azure():
import asyncio
try:
@ -637,41 +625,35 @@ def test_s3_cache_acompletion_stream_azure():
response_1_created = ""
response_2_created = ""
async def call1():
nonlocal response_1_content, response_1_created
response1 = await litellm.acompletion(
model="azure/chatgpt-v-2",
messages=messages,
max_tokens=40,
temperature=1,
stream=True,
)
async for chunk in response1:
print(chunk)
response_1_created = chunk.created
response_1_content += chunk.choices[0].delta.content or ""
print(response_1_content)
response1 = await litellm.acompletion(
model="azure/chatgpt-v-2",
messages=messages,
max_tokens=40,
temperature=1,
stream=True,
)
async for chunk in response1:
print(chunk)
response_1_created = chunk.created
response_1_content += chunk.choices[0].delta.content or ""
print(response_1_content)
asyncio.run(call1())
time.sleep(0.5)
print("\n\n Response 1 content: ", response_1_content, "\n\n")
async def call2():
nonlocal response_2_content, response_2_created
response2 = await litellm.acompletion(
model="azure/chatgpt-v-2",
messages=messages,
max_tokens=40,
temperature=1,
stream=True,
)
async for chunk in response2:
print(chunk)
response_2_content += chunk.choices[0].delta.content or ""
response_2_created = chunk.created
print(response_2_content)
response2 = await litellm.acompletion(
model="azure/chatgpt-v-2",
messages=messages,
max_tokens=40,
temperature=1,
stream=True,
)
async for chunk in response2:
print(chunk)
response_2_content += chunk.choices[0].delta.content or ""
response_2_created = chunk.created
print(response_2_content)
asyncio.run(call2())
print("\nresponse 1", response_1_content)
print("\nresponse 2", response_2_content)


@ -358,7 +358,7 @@ def test_completion_mistral_azure():
}
],
)
# Add any assertions here to check the response
# Add any assertions here to check, the response
print(response)
except Exception as e:
@ -575,6 +575,25 @@ def test_completion_azure_gpt4_vision():
# test_completion_azure_gpt4_vision()
def test_completion_fireworks_ai():
try:
litellm.set_verbose = True
messages = [
{"role": "system", "content": "You're a good bot"},
{
"role": "user",
"content": "Hey",
},
]
response = completion(
model="fireworks_ai/accounts/fireworks/models/mixtral-8x7b-instruct",
messages=messages,
)
print(response)
except Exception as e:
pytest.fail(f"Error occurred: {e}")
@pytest.mark.skip(reason="this test is flaky")
def test_completion_perplexity_api():
try:


@ -97,27 +97,23 @@ class TmpFunction:
)
def test_async_chat_openai_stream():
@pytest.mark.asyncio
async def test_async_chat_openai_stream():
try:
tmp_function = TmpFunction()
litellm.set_verbose = True
litellm.success_callback = [tmp_function.async_test_logging_fn]
complete_streaming_response = ""
async def call_gpt():
nonlocal complete_streaming_response
response = await litellm.acompletion(
model="gpt-3.5-turbo",
messages=[{"role": "user", "content": "Hi 👋 - i'm openai"}],
stream=True,
)
async for chunk in response:
complete_streaming_response += (
chunk["choices"][0]["delta"]["content"] or ""
)
print(complete_streaming_response)
response = await litellm.acompletion(
model="gpt-3.5-turbo",
messages=[{"role": "user", "content": "Hi 👋 - i'm openai"}],
stream=True,
)
async for chunk in response:
complete_streaming_response += chunk["choices"][0]["delta"]["content"] or ""
print(complete_streaming_response)
asyncio.run(call_gpt())
complete_streaming_response = complete_streaming_response.strip("'")
response1 = tmp_function.complete_streaming_response_in_callback["choices"][0][
"message"
@ -130,7 +126,7 @@ def test_async_chat_openai_stream():
assert tmp_function.async_success == True
except Exception as e:
print(e)
pytest.fail(f"An error occurred - {str(e)}")
pytest.fail(f"An error occurred - {str(e)}\n\n{traceback.format_exc()}")
# test_async_chat_openai_stream()


@ -117,6 +117,8 @@ def test_openai_azure_embedding_simple():
print("Calculated request cost=", request_cost)
assert isinstance(response.usage, litellm.Usage)
except Exception as e:
pytest.fail(f"Error occurred: {e}")
@ -204,6 +206,8 @@ def test_cohere_embedding():
input=["good morning from litellm", "this is another item"],
)
print(f"response:", response)
assert isinstance(response.usage, litellm.Usage)
except Exception as e:
pytest.fail(f"Error occurred: {e}")
@ -269,6 +273,8 @@ def test_bedrock_embedding_titan():
assert end_time - start_time < 0.1
litellm.disable_cache()
assert isinstance(response.usage, litellm.Usage)
except Exception as e:
pytest.fail(f"Error occurred: {e}")
@ -295,6 +301,8 @@ def test_bedrock_embedding_cohere():
isinstance(x, float) for x in response["data"][0]["embedding"]
), "Expected response to be a list of floats"
# print(f"response:", response)
assert isinstance(response.usage, litellm.Usage)
except Exception as e:
pytest.fail(f"Error occurred: {e}")
@ -331,6 +339,8 @@ def test_hf_embedding():
input=["good morning from litellm", "this is another item"],
)
print(f"response:", response)
assert isinstance(response.usage, litellm.Usage)
except Exception as e:
# Note: Huggingface inference API is unstable and fails with "model loading errors all the time"
pass
@ -386,6 +396,8 @@ def test_aembedding_azure():
response._hidden_params["custom_llm_provider"],
)
assert response._hidden_params["custom_llm_provider"] == "azure"
assert isinstance(response.usage, litellm.Usage)
except Exception as e:
pytest.fail(f"Error occurred: {e}")
@ -440,6 +452,7 @@ def test_mistral_embeddings():
input=["good morning from litellm"],
)
print(f"response: {response}")
assert isinstance(response.usage, litellm.Usage)
except Exception as e:
pytest.fail(f"Error occurred: {e}")


@ -158,7 +158,7 @@ def test_call_with_invalid_key(prisma_client):
async def test():
await litellm.proxy.proxy_server.prisma_client.connect()
generated_key = "bad-key"
generated_key = "sk-126666"
bearer_token = "Bearer " + generated_key
request = Request(scope={"type": "http"}, receive=None)
@ -173,7 +173,7 @@ def test_call_with_invalid_key(prisma_client):
except Exception as e:
print("Got Exception", e)
print(e.message)
assert "Authentication Error" in e.message
assert "Authentication Error, Invalid token passed" in e.message
pass


@ -72,7 +72,7 @@ from .integrations.litedebugger import LiteDebugger
from .proxy._types import KeyManagementSystem
from openai import OpenAIError as OriginalError
from openai._models import BaseModel as OpenAIObject
from .caching import S3Cache, RedisSemanticCache
from .caching import S3Cache, RedisSemanticCache, RedisCache
from .exceptions import (
AuthenticationError,
BadRequestError,
@ -1795,7 +1795,12 @@ class Logging:
)
result = kwargs["async_complete_streaming_response"]
# only add to cache once we have a complete streaming response
litellm.cache.add_cache(result, **kwargs)
if litellm.cache is not None and not isinstance(
litellm.cache.cache, S3Cache
):
await litellm.cache.async_add_cache(result, **kwargs)
else:
litellm.cache.add_cache(result, **kwargs)
if isinstance(callback, CustomLogger): # custom logger class
print_verbose(
f"Running Async success callback: {callback}; self.stream: {self.stream}; async_complete_streaming_response: {self.model_call_details.get('async_complete_streaming_response', None)} result={result}"
@ -2589,7 +2594,7 @@ def client(original_function):
if (
kwargs.get("max_tokens", None) is not None
and model is not None
and litellm.drop_params
and litellm.modify_params
== True # user is okay with params being modified
and (
call_type == CallTypes.acompletion.value
@ -2806,7 +2811,9 @@ def client(original_function):
):
if len(cached_result) == 1 and cached_result[0] is None:
cached_result = None
elif isinstance(litellm.cache.cache, RedisSemanticCache):
elif isinstance(
litellm.cache.cache, RedisSemanticCache
) or isinstance(litellm.cache.cache, RedisCache):
preset_cache_key = litellm.cache.get_cache_key(*args, **kwargs)
kwargs["preset_cache_key"] = (
preset_cache_key # for streaming calls, we need to pass the preset_cache_key
@ -5375,6 +5382,17 @@ def get_llm_provider(
# groq is openai compatible, we just need to set this to custom_openai and have the api_base be https://api.groq.com/openai/v1
api_base = "https://api.groq.com/openai/v1"
dynamic_api_key = get_secret("GROQ_API_KEY")
elif custom_llm_provider == "fireworks_ai":
# fireworks ai is openai compatible, we just need to set this to custom_openai and have the api_base be https://api.fireworks.ai/inference/v1
if not model.startswith("accounts/fireworks/models"):
model = f"accounts/fireworks/models/{model}"
api_base = "https://api.fireworks.ai/inference/v1"
dynamic_api_key = (
get_secret("FIREWORKS_API_KEY")
or get_secret("FIREWORKS_AI_API_KEY")
or get_secret("FIREWORKSAI_API_KEY")
or get_secret("FIREWORKS_AI_TOKEN")
)
elif custom_llm_provider == "mistral":
# mistral is openai compatible, we just need to set this to custom_openai and have the api_base be https://api.mistral.ai
api_base = (


@ -631,6 +631,13 @@
"litellm_provider": "groq",
"mode": "chat"
},
"groq/gemma-7b-it": {
"max_tokens": 8192,
"input_cost_per_token": 0.00000010,
"output_cost_per_token": 0.00000010,
"litellm_provider": "groq",
"mode": "chat"
},
"claude-instant-1.2": {
"max_tokens": 100000,
"max_output_tokens": 8191,


@ -45,13 +45,15 @@ litellm_settings:
budget_duration: 30d
num_retries: 5
request_timeout: 600
cache: true
callbacks: ["batch_redis_requests"]
general_settings:
master_key: sk-1234 # [OPTIONAL] Only use this if you to require all calls to contain this key (Authorization: Bearer sk-1234)
proxy_budget_rescheduler_min_time: 60
proxy_budget_rescheduler_max_time: 64
# database_url: "postgresql://<user>:<password>@<host>:<port>/<dbname>" # [OPTIONAL] use for token-based auth to proxy
environment_variables:
# environment_variables:
# settings for using redis caching
# REDIS_HOST: redis-16337.c322.us-east-1-2.ec2.cloud.redislabs.com
# REDIS_PORT: "16337"


@ -1,6 +1,6 @@
[tool.poetry]
name = "litellm"
version = "1.31.12"
version = "1.31.16"
description = "Library to easily interface with LLM API providers"
authors = ["BerriAI"]
license = "MIT"
@ -76,7 +76,7 @@ requires = ["poetry-core", "wheel"]
build-backend = "poetry.core.masonry.api"
[tool.commitizen]
version = "1.31.12"
version = "1.31.16"
version_files = [
"pyproject.toml:^version"
]


@ -34,5 +34,4 @@ jinja2==3.1.3 # for prompt templates
certifi>=2023.7.22 # [TODO] clean up
aiohttp==3.9.0 # for network calls
aioboto3==12.3.0 # for async sagemaker calls
argon2-cffi==23.1.0 # for checking secrets
####