Mirror of https://github.com/BerriAI/litellm.git (synced 2025-04-26 11:14:04 +00:00)

Merge branch 'BerriAI:main' into main
Commit fe9b511e45: 135 changed files with 17,797 additions and 1,333 deletions
|
@ -42,6 +42,7 @@ jobs:
|
|||
pip install "anyio==3.7.1"
|
||||
pip install "aiodynamo==23.10.1"
|
||||
pip install "asyncio==3.4.3"
|
||||
pip install "apscheduler==3.10.4"
|
||||
pip install "PyGithub==1.59.1"
|
||||
- save_cache:
|
||||
paths:
|
||||
|
@ -97,6 +98,43 @@ jobs:
|
|||
command: |
|
||||
sudo apt-get update
|
||||
sudo apt-get install -y docker-ce docker-ce-cli containerd.io
|
||||
- run:
|
||||
name: Install Python 3.9
|
||||
command: |
|
||||
curl https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh --output miniconda.sh
|
||||
bash miniconda.sh -b -p $HOME/miniconda
|
||||
export PATH="$HOME/miniconda/bin:$PATH"
|
||||
conda init bash
|
||||
source ~/.bashrc
|
||||
conda create -n myenv python=3.9 -y
|
||||
conda activate myenv
|
||||
python --version
|
||||
- run:
|
||||
name: Install Dependencies
|
||||
command: |
|
||||
pip install "pytest==7.3.1"
|
||||
pip install "pytest-asyncio==0.21.1"
|
||||
pip install aiohttp
|
||||
pip install openai
|
||||
python -m pip install --upgrade pip
|
||||
python -m pip install -r .circleci/requirements.txt
|
||||
pip install "pytest==7.3.1"
|
||||
pip install "pytest-asyncio==0.21.1"
|
||||
pip install mypy
|
||||
pip install "google-generativeai>=0.3.2"
|
||||
pip install "google-cloud-aiplatform>=1.38.0"
|
||||
pip install "boto3>=1.28.57"
|
||||
pip install langchain
|
||||
pip install "langfuse>=2.0.0"
|
||||
pip install numpydoc
|
||||
pip install prisma
|
||||
pip install "httpx==0.24.1"
|
||||
pip install "gunicorn==21.2.0"
|
||||
pip install "anyio==3.7.1"
|
||||
pip install "aiodynamo==23.10.1"
|
||||
pip install "asyncio==3.4.3"
|
||||
pip install "PyGithub==1.59.1"
|
||||
# Run pytest and generate JUnit XML report
|
||||
- run:
|
||||
name: Build Docker image
|
||||
command: docker build -t my-app:latest -f Dockerfile.database .
|
||||
|
@ -106,15 +144,20 @@ jobs:
|
|||
docker run -d \
|
||||
-p 4000:4000 \
|
||||
-e DATABASE_URL=$PROXY_DOCKER_DB_URL \
|
||||
-e AZURE_API_KEY=$AZURE_FRANCE_API_KEY \
|
||||
-e AZURE_API_KEY=$AZURE_API_KEY \
|
||||
-e AZURE_FRANCE_API_KEY=$AZURE_FRANCE_API_KEY \
|
||||
-e AZURE_EUROPE_API_KEY=$AZURE_EUROPE_API_KEY \
|
||||
-e AWS_ACCESS_KEY_ID=$AWS_ACCESS_KEY_ID \
|
||||
-e AWS_SECRET_ACCESS_KEY=$AWS_SECRET_ACCESS_KEY \
|
||||
-e AWS_REGION_NAME=$AWS_REGION_NAME \
|
||||
--name my-app \
|
||||
-v $(pwd)/proxy_server_config.yaml:/app/config.yaml \
|
||||
my-app:latest \
|
||||
--config /app/config.yaml \
|
||||
--port 4000 \
|
||||
--num_workers 8
|
||||
--num_workers 8 \
|
||||
--run_gunicorn \
|
||||
--debug
|
||||
- run:
|
||||
name: Install curl and dockerize
|
||||
command: |
|
||||
|
@ -125,63 +168,22 @@ jobs:
|
|||
sudo rm dockerize-linux-amd64-v0.6.1.tar.gz
|
||||
- run:
|
||||
name: Start outputting logs
|
||||
command: |
|
||||
while true; do
|
||||
docker logs my-app
|
||||
sleep 10
|
||||
done
|
||||
command: docker logs -f my-app
|
||||
background: true
|
||||
- run:
|
||||
name: Wait for app to be ready
|
||||
command: dockerize -wait http://localhost:4000 -timeout 1m
|
||||
- run:
|
||||
name: Test the application
|
||||
name: Run tests
|
||||
command: |
|
||||
mkdir -p /tmp/responses
|
||||
for i in {1..10}; do
|
||||
status_file="/tmp/responses/status_${i}.txt"
|
||||
response_file="/tmp/responses/response_${i}.json"
|
||||
pwd
|
||||
ls
|
||||
python -m pytest -vv tests/ -x --junitxml=test-results/junit.xml --durations=5
|
||||
no_output_timeout: 120m
|
||||
|
||||
(curl --location --request POST 'http://0.0.0.0:4000/key/generate' \
|
||||
--header 'Authorization: Bearer sk-1234' \
|
||||
--header 'Content-Type: application/json' \
|
||||
--data '{"models": ["azure-models"], "aliases": {"mistral-7b": "gpt-3.5-turbo"}, "duration": null}' \
|
||||
--silent --output "${response_file}" --write-out '%{http_code}' > "${status_file}") &
|
||||
|
||||
# Capture PIDs of background processes
|
||||
pids[${i}]=$!
|
||||
done
|
||||
|
||||
# Wait for all background processes to finish
|
||||
for pid in ${pids[*]}; do
|
||||
wait $pid
|
||||
done
|
||||
|
||||
# Check all responses and status codes
|
||||
fail=false
|
||||
for i in {1..10}; do
|
||||
status=$(cat "/tmp/responses/status_${i}.txt")
|
||||
|
||||
# Here, we need to set the correct response file path for each iteration
|
||||
response_file="/tmp/responses/response_${i}.json" # This was missing in the provided script
|
||||
|
||||
response=$(cat "${response_file}")
|
||||
echo "Response ${i} (Status code: ${status}):"
|
||||
echo "${response}" # Use echo here to print the contents
|
||||
echo # Additional newline for readability
|
||||
|
||||
if [ "$status" -ne 200 ]; then
|
||||
echo "A request did not return a 200 status code: $status"
|
||||
fail=true
|
||||
fi
|
||||
done
|
||||
|
||||
# If any request did not return status code 200, fail the job
|
||||
if [ "$fail" = true ]; then
|
||||
exit 1
|
||||
fi
|
||||
|
||||
echo "All requests returned a 200 status code."
|
||||
# Store test results
|
||||
- store_test_results:
|
||||
path: test-results
|
||||
|
||||
publish_to_pypi:
|
||||
docker:
|
||||
|
|
.github/workflows/ghcr_deploy.yml (vendored): 22 changes
|
@ -41,6 +41,7 @@ jobs:
|
|||
push: true
|
||||
file: Dockerfile.database
|
||||
tags: litellm/litellm-database:${{ github.event.inputs.tag || 'latest' }}
|
||||
|
||||
build-and-push-image:
|
||||
runs-on: ubuntu-latest
|
||||
# Sets the permissions granted to the `GITHUB_TOKEN` for the actions in this job.
|
||||
|
@ -74,7 +75,9 @@ jobs:
|
|||
push: true
|
||||
tags: ${{ steps.meta.outputs.tags }}-${{ github.event.inputs.tag || 'latest' }}, ${{ steps.meta.outputs.tags }}-latest # if a tag is provided, use that, otherwise use the release tag, and if neither is available, use 'latest'
|
||||
labels: ${{ steps.meta.outputs.labels }}
|
||||
build-and-push-image-alpine:
|
||||
platform: local, linux/amd64,linux/arm64,linux/arm64/v8
|
||||
|
||||
build-and-push-image-ui:
|
||||
runs-on: ubuntu-latest
|
||||
permissions:
|
||||
contents: read
|
||||
|
@ -90,20 +93,21 @@ jobs:
|
|||
username: ${{ github.actor }}
|
||||
password: ${{ secrets.GITHUB_TOKEN }}
|
||||
|
||||
- name: Extract metadata (tags, labels) for Alpine Dockerfile
|
||||
id: meta-alpine
|
||||
- name: Extract metadata (tags, labels) for UI Dockerfile
|
||||
id: meta-ui
|
||||
uses: docker/metadata-action@9ec57ed1fcdbf14dcef7dfbe97b2010124a938b7
|
||||
with:
|
||||
images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}-alpine
|
||||
images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}-ui
|
||||
|
||||
- name: Build and push Alpine Docker image
|
||||
- name: Build and push UI Docker image
|
||||
uses: docker/build-push-action@f2a1d5e99d037542a71f64918e516c093c6f3fc4
|
||||
with:
|
||||
context: .
|
||||
file: Dockerfile.alpine
|
||||
context: ui/
|
||||
file: ui/Dockerfile
|
||||
push: true
|
||||
tags: ${{ steps.meta-alpine.outputs.tags }}-${{ github.event.inputs.tag || 'latest' }}, ${{ steps.meta-alpine.outputs.tags }}-latest
|
||||
labels: ${{ steps.meta-alpine.outputs.labels }}
|
||||
tags: ${{ steps.meta-ui.outputs.tags }}-${{ github.event.inputs.tag || 'latest' }}, ${{ steps.meta-ui.outputs.tags }}-latest
|
||||
labels: ${{ steps.meta-ui.outputs.labels }}
|
||||
platform: local, linux/amd64,linux/arm64,linux/arm64/v8
|
||||
build-and-push-image-database:
|
||||
runs-on: ubuntu-latest
|
||||
permissions:
|
||||
|
|
.gitignore (vendored): 5 changes
|
@ -35,3 +35,8 @@ hosted_config.yaml
|
|||
litellm/proxy/tests/node_modules
|
||||
litellm/proxy/tests/package.json
|
||||
litellm/proxy/tests/package-lock.json
|
||||
ui/litellm-dashboard/.next
|
||||
ui/litellm-dashboard/node_modules
|
||||
ui/litellm-dashboard/next-env.d.ts
|
||||
ui/litellm-dashboard/package.json
|
||||
ui/litellm-dashboard/package-lock.json
|
|
@ -52,4 +52,4 @@ RUN chmod +x entrypoint.sh
|
|||
EXPOSE 4000/tcp
|
||||
|
||||
ENTRYPOINT ["litellm"]
|
||||
CMD ["--port", "4000"]
|
||||
CMD ["--port", "4000", "--config", "./proxy_server_config.yaml", "--detailed_debug", "--run_gunicorn"]
|
|
@ -56,4 +56,4 @@ EXPOSE 4000/tcp
|
|||
# # Set your entrypoint and command
|
||||
|
||||
ENTRYPOINT ["litellm"]
|
||||
CMD ["--port", "4000"]
|
||||
CMD ["--port", "4000", "--run_gunicorn"]
|
||||
|
|
cookbook/misc/openai_timeouts.py (new file): 34 lines
|
@ -0,0 +1,34 @@
|
|||
import os
|
||||
from openai import OpenAI
|
||||
from dotenv import load_dotenv
|
||||
import httpx
|
||||
import concurrent.futures
|
||||
|
||||
load_dotenv()
|
||||
|
||||
client = OpenAI(
|
||||
# This is the default and can be omitted
|
||||
api_key=os.environ.get("OPENAI_API_KEY"),
|
||||
)
|
||||
|
||||
|
||||
def create_chat_completion():
|
||||
return client.chat.completions.create(
|
||||
messages=[
|
||||
{
|
||||
"role": "user",
|
||||
"content": "Say this is a test. Respond in 20 lines",
|
||||
}
|
||||
],
|
||||
model="gpt-3.5-turbo",
|
||||
)
|
||||
|
||||
|
||||
with concurrent.futures.ThreadPoolExecutor() as executor:
|
||||
# Set a very short timeout to force a TimeoutError
|
||||
future = executor.submit(create_chat_completion)
|
||||
try:
|
||||
chat_completion = future.result(timeout=0.00001)
|
||||
print(chat_completion)
|
||||
except concurrent.futures.TimeoutError:
|
||||
print("Operation timed out.")
|
cookbook/misc/sagmaker_streaming.py (new file): 61 lines
|
@ -0,0 +1,61 @@
|
|||
# Notes - on how to do sagemaker streaming using boto3
|
||||
import json
|
||||
import boto3
|
||||
|
||||
import sys, os
|
||||
import traceback
|
||||
from dotenv import load_dotenv
|
||||
|
||||
load_dotenv()
|
||||
import os, io
|
||||
|
||||
sys.path.insert(
|
||||
0, os.path.abspath("../..")
|
||||
) # Adds the parent directory to the system path
|
||||
import pytest
|
||||
import litellm
|
||||
|
||||
import io
|
||||
import json
|
||||
|
||||
|
||||
class TokenIterator:
|
||||
def __init__(self, stream):
|
||||
self.byte_iterator = iter(stream)
|
||||
self.buffer = io.BytesIO()
|
||||
self.read_pos = 0
|
||||
|
||||
def __iter__(self):
|
||||
return self
|
||||
|
||||
def __next__(self):
|
||||
while True:
|
||||
self.buffer.seek(self.read_pos)
|
||||
line = self.buffer.readline()
|
||||
if line and line[-1] == ord("\n"):
|
||||
self.read_pos += len(line) + 1
|
||||
full_line = line[:-1].decode("utf-8")
|
||||
line_data = json.loads(full_line.lstrip("data:").rstrip("/n"))
|
||||
return line_data["token"]["text"]
|
||||
chunk = next(self.byte_iterator)
|
||||
self.buffer.seek(0, io.SEEK_END)
|
||||
self.buffer.write(chunk["PayloadPart"]["Bytes"])
|
||||
|
||||
|
||||
payload = {
|
||||
"inputs": "How do I build a website?",
|
||||
"parameters": {"max_new_tokens": 256},
|
||||
"stream": True,
|
||||
}
|
||||
|
||||
import boto3
|
||||
|
||||
client = boto3.client("sagemaker-runtime", region_name="us-west-2")
|
||||
response = client.invoke_endpoint_with_response_stream(
|
||||
EndpointName="berri-benchmarking-Llama-2-70b-chat-hf-4",
|
||||
Body=json.dumps(payload),
|
||||
ContentType="application/json",
|
||||
)
|
||||
|
||||
# for token in TokenIterator(response["Body"]):
|
||||
# print(token)
|
|
@ -1,12 +0,0 @@
|
|||
version: "3.9"
|
||||
services:
|
||||
litellm:
|
||||
image: ghcr.io/berriai/litellm:main
|
||||
ports:
|
||||
- "8000:8000" # Map the container port to the host, change the host port if necessary
|
||||
volumes:
|
||||
- ./litellm-config.yaml:/app/config.yaml # Mount the local configuration file
|
||||
# You can change the port or number of workers as per your requirements or pass any new supported CLI argument. Make sure the port passed here matches the container port defined above in the `ports` value
|
||||
command: [ "--config", "/app/config.yaml", "--port", "8000", "--num_workers", "8" ]
|
||||
|
||||
# ...rest of your docker-compose config if any
|
docker-compose.yml (new file): 15 lines
|
@ -0,0 +1,15 @@
|
|||
version: "3.9"
|
||||
services:
|
||||
litellm:
|
||||
image: ghcr.io/berriai/litellm:main-latest
|
||||
volumes:
|
||||
- ./proxy_server_config.yaml:/app/proxy_server_config.yaml # mount your litellm config.yaml
|
||||
ports:
|
||||
- "4000:4000"
|
||||
environment:
|
||||
- AZURE_API_KEY=sk-123
|
||||
litellm-ui:
|
||||
image: ghcr.io/berriai/litellm-ui:main-latest
|
||||
|
||||
|
||||
|
|
@ -204,6 +204,7 @@ def __init__(
|
|||
s3_bucket_name: Optional[str] = None,
|
||||
s3_region_name: Optional[str] = None,
|
||||
s3_api_version: Optional[str] = None,
|
||||
s3_path: Optional[str] = None, # if you wish to save to a specific path
|
||||
s3_use_ssl: Optional[bool] = True,
|
||||
s3_verify: Optional[Union[bool, str]] = None,
|
||||
s3_endpoint_url: Optional[str] = None,
|
||||
|
|
|
@ -13,8 +13,8 @@ response = embedding(model='text-embedding-ada-002', input=["good morning from l
|
|||
|
||||
- `model`: *string* - ID of the model to use. `model='text-embedding-ada-002'`
|
||||
|
||||
- `input`: *array* - Input text to embed, encoded as a string or array of tokens. To embed multiple inputs in a single request, pass an array of strings or array of token arrays. The input must not exceed the max input tokens for the model (8192 tokens for text-embedding-ada-002), cannot be an empty string, and any array must be 2048 dimensions or less.
|
||||
```
|
||||
- `input`: *string or array* - Input text to embed, encoded as a string or array of tokens. To embed multiple inputs in a single request, pass an array of strings or array of token arrays. The input must not exceed the max input tokens for the model (8192 tokens for text-embedding-ada-002), cannot be an empty string, and any array must be 2048 dimensions or less.
|
||||
```python
|
||||
input=["good morning from litellm"]
|
||||
```
|
||||
|
||||
|
@ -22,7 +22,11 @@ input=["good morning from litellm"]
|
|||
|
||||
- `user`: *string (optional)* A unique identifier representing your end-user,
|
||||
|
||||
- `timeout`: *integer* - The maximum time, in seconds, to wait for the API to respond. Defaults to 600 seconds (10 minutes).
|
||||
- `dimensions`: *integer (Optional)* The number of dimensions the resulting output embeddings should have. Only supported in OpenAI/Azure text-embedding-3 and later models.
|
||||
|
||||
- `encoding_format`: *string (Optional)* The format to return the embeddings in. Can be either `"float"` or `"base64"`. Defaults to `encoding_format="float"`
|
||||
|
||||
- `timeout`: *integer (Optional)* - The maximum time, in seconds, to wait for the API to respond. Defaults to 600 seconds (10 minutes).
|
||||
|
||||
- `api_base`: *string (optional)* - The api endpoint you want to call the model with
|
||||
|
||||
|
@ -66,11 +70,18 @@ input=["good morning from litellm"]
|
|||
from litellm import embedding
|
||||
import os
|
||||
os.environ['OPENAI_API_KEY'] = ""
|
||||
response = embedding('text-embedding-ada-002', input=["good morning from litellm"])
|
||||
response = embedding(
|
||||
model="text-embedding-3-small",
|
||||
input=["good morning from litellm", "this is another item"],
|
||||
metadata={"anything": "good day"},
|
||||
dimensions=5 # Only supported in text-embedding-3 and later models.
|
||||
)
|
||||
```
|
||||
|
||||
| Model Name | Function Call | Required OS Variables |
|
||||
|----------------------|---------------------------------------------|--------------------------------------|
|
||||
| text-embedding-3-small | `embedding('text-embedding-3-small', input)` | `os.environ['OPENAI_API_KEY']` |
|
||||
| text-embedding-3-large | `embedding('text-embedding-3-large', input)` | `os.environ['OPENAI_API_KEY']` |
|
||||
| text-embedding-ada-002 | `embedding('text-embedding-ada-002', input)` | `os.environ['OPENAI_API_KEY']` |
|
||||
|
||||
## Azure OpenAI Embedding Models
|
||||
|
|
|
@ -28,6 +28,8 @@ import litellm
|
|||
import os
|
||||
|
||||
os.environ["LANGSMITH_API_KEY"] = ""
|
||||
os.environ["LANGSMITH_PROJECT"] = "" # defaults to litellm-completion
|
||||
os.environ["LANGSMITH_DEFAULT_RUN_NAME"] = "" # defaults to LLMRun
|
||||
# LLM API Keys
|
||||
os.environ['OPENAI_API_KEY']=""
|
||||
|
||||
|
|
|
@ -6,7 +6,7 @@
|
|||
# Gemini-Pro
|
||||
## Sample Usage
|
||||
```python
|
||||
import litellm
|
||||
from litellm import completion
|
||||
import os
|
||||
|
||||
os.environ['GEMINI_API_KEY'] = ""
|
||||
|
@ -24,7 +24,7 @@ LiteLLM Supports the following image types passed in `url`
|
|||
## Sample Usage
|
||||
```python
|
||||
import os
|
||||
import litellm
|
||||
import litellm
|
||||
from dotenv import load_dotenv
|
||||
|
||||
# Load the environment variables from .env file
|
||||
|
|
|
@ -34,6 +34,7 @@ os.environ["OPENAI_API_BASE"] = "openaiai-api-base" # OPTIONAL
|
|||
|
||||
| Model Name | Function Call |
|
||||
|-----------------------|-----------------------------------------------------------------|
|
||||
| gpt-4-0125-preview | `response = completion(model="gpt-4-0125-preview", messages=messages)` |
|
||||
| gpt-4-1106-preview | `response = completion(model="gpt-4-1106-preview", messages=messages)` |
|
||||
| gpt-3.5-turbo-1106 | `response = completion(model="gpt-3.5-turbo-1106", messages=messages)` |
|
||||
| gpt-3.5-turbo | `response = completion(model="gpt-3.5-turbo", messages=messages)` |
|
||||
|
|
|
@ -5,7 +5,7 @@
|
|||
|
||||
## Sample Usage
|
||||
```python
|
||||
import litellm
|
||||
from litellm import completion
|
||||
import os
|
||||
|
||||
os.environ['PALM_API_KEY'] = ""
|
||||
|
@ -17,7 +17,7 @@ response = completion(
|
|||
|
||||
## Sample Usage - Streaming
|
||||
```python
|
||||
import litellm
|
||||
from litellm import completion
|
||||
import os
|
||||
|
||||
os.environ['PALM_API_KEY'] = ""
|
||||
|
|
|
@ -17,7 +17,28 @@ import litellm
|
|||
litellm.vertex_project = "hardy-device-38811" # Your Project ID
|
||||
litellm.vertex_location = "us-central1" # proj location
|
||||
|
||||
response = completion(model="gemini-pro", messages=[{"role": "user", "content": "write code for saying hi from LiteLLM"}])
|
||||
response = litellm.completion(model="gemini-pro", messages=[{"role": "user", "content": "write code for saying hi from LiteLLM"}])
|
||||
```
|
||||
|
||||
## OpenAI Proxy Usage
|
||||
|
||||
1. Modify the config.yaml
|
||||
|
||||
```yaml
|
||||
litellm_settings:
|
||||
vertex_project: "hardy-device-38811" # Your Project ID
|
||||
vertex_location: "us-central1" # proj location
|
||||
|
||||
model_list:
|
||||
- model_name: team1-gemini-pro
|
||||
litellm_params:
|
||||
model: gemini-pro
|
||||
```
|
||||
|
||||
2. Start the proxy
|
||||
|
||||
```bash
|
||||
$ litellm --config /path/to/config.yaml
|
||||
```
|
||||
|
||||
## Set Vertex Project & Vertex Location
|
||||
|
|
|
@ -11,7 +11,7 @@ pip install litellm vllm
|
|||
```python
|
||||
import litellm
|
||||
|
||||
response = completion(
|
||||
response = litellm.completion(
|
||||
model="vllm/facebook/opt-125m", # add a vllm prefix so litellm knows the custom_llm_provider==vllm
|
||||
messages=messages,
|
||||
temperature=0.2,
|
||||
|
@ -29,7 +29,7 @@ In order to use litellm to call a hosted vllm server add the following to your c
|
|||
```python
|
||||
import litellm
|
||||
|
||||
response = completion(
|
||||
response = litellm.completion(
|
||||
model="openai/facebook/opt-125m", # pass the vllm model name
|
||||
messages=messages,
|
||||
api_base="https://hosted-vllm-api.co",
|
||||
|
|
|
@ -1,6 +1,13 @@
|
|||
# Slack Alerting
|
||||
|
||||
Get alerts for failed db read/writes, hanging api calls, failed api calls.
|
||||
Get alerts for:
|
||||
- hanging LLM api calls
|
||||
- failed LLM api calls
|
||||
- slow LLM api calls
|
||||
- budget Tracking per key/user:
|
||||
- When a User/Key crosses their Budget
|
||||
- When a User/Key is 15% away from crossing their Budget
|
||||
- failed db read/writes
|
||||
|
||||
## Quick Start
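A sketch of what this can look like in `config.yaml`; the `alerting` and `alerting_threshold` fields appear in the all-settings reference elsewhere in this changeset, while the `SLACK_WEBHOOK_URL` variable name is an assumption for illustration:

```yaml
general_settings:
  alerting: ["slack"]       # send alerts to Slack
  alerting_threshold: 300   # seconds before a hanging request triggers an alert
```

```bash
# assumed env var holding the Slack incoming-webhook URL
export SLACK_WEBHOOK_URL="https://hooks.slack.com/services/..."
litellm --config /path/to/config.yaml
```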
|
||||
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
# Modify Incoming Data
|
||||
# Modify / Reject Incoming Requests
|
||||
|
||||
Modify data just before making litellm completion calls call on proxy
|
||||
|
||||
|
|
|
@ -483,3 +483,55 @@ general_settings:
|
|||
max_parallel_requests: 100 # max parallel requests for a user = 100
|
||||
```
|
||||
|
||||
## All settings
|
||||
|
||||
```python
|
||||
{
|
||||
"environment_variables": {},
|
||||
"model_list": [
|
||||
{
|
||||
"model_name": "string",
|
||||
"litellm_params": {},
|
||||
"model_info": {
|
||||
"id": "string",
|
||||
"mode": "embedding",
|
||||
"input_cost_per_token": 0,
|
||||
"output_cost_per_token": 0,
|
||||
"max_tokens": 2048,
|
||||
"base_model": "gpt-4-1106-preview",
|
||||
"additionalProp1": {}
|
||||
}
|
||||
}
|
||||
],
|
||||
"litellm_settings": {}, # ALL (https://github.com/BerriAI/litellm/blob/main/litellm/__init__.py)
|
||||
"general_settings": {
|
||||
"completion_model": "string",
|
||||
"key_management_system": "google_kms", # either google_kms or azure_kms
|
||||
"master_key": "string",
|
||||
"database_url": "string",
|
||||
"database_type": "dynamo_db",
|
||||
"database_args": {
|
||||
"billing_mode": "PROVISIONED_THROUGHPUT",
|
||||
"read_capacity_units": 0,
|
||||
"write_capacity_units": 0,
|
||||
"ssl_verify": true,
|
||||
"region_name": "string",
|
||||
"user_table_name": "LiteLLM_UserTable",
|
||||
"key_table_name": "LiteLLM_VerificationToken",
|
||||
"config_table_name": "LiteLLM_Config",
|
||||
"spend_table_name": "LiteLLM_SpendLogs"
|
||||
},
|
||||
"otel": true,
|
||||
"custom_auth": "string",
|
||||
"max_parallel_requests": 0,
|
||||
"infer_model_from_keys": true,
|
||||
"background_health_checks": true,
|
||||
"health_check_interval": 300,
|
||||
"alerting": [
|
||||
"string"
|
||||
],
|
||||
"alerting_threshold": 0
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
|
|
docs/my-website/docs/proxy/custom_pricing.md (new file): 115 lines
|
@ -0,0 +1,115 @@
|
|||
import Image from '@theme/IdealImage';
|
||||
|
||||
# Custom Pricing - Sagemaker, etc.
|
||||
|
||||
Use this to register custom pricing for models.
|
||||
|
||||
There are two ways to track cost:
|
||||
- cost per token
|
||||
- cost per second
|
||||
|
||||
By default, the response cost is accessible in the logging object via `kwargs["response_cost"]` on success (sync + async). [**Learn More**](../observability/custom_callback.md)
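As an illustration of reading that value, a minimal sketch using the same `CustomLogger` pattern shown in the router Custom Callbacks section later in this changeset:

```python
import litellm
from litellm.integrations.custom_logger import CustomLogger

class CostLogger(CustomLogger):
    def log_success_event(self, kwargs, response_obj, start_time, end_time):
        # litellm attaches the computed cost (USD) to kwargs on success
        print("response_cost=", kwargs.get("response_cost"))

litellm.callbacks = [CostLogger()]
```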
|
||||
|
||||
## Quick Start
|
||||
|
||||
Register custom pricing for sagemaker completion model.
|
||||
|
||||
For cost per second pricing, you **just** need to register `input_cost_per_second`.
|
||||
|
||||
```python
|
||||
# !pip install boto3
|
||||
from litellm import completion, completion_cost
|
||||
|
||||
os.environ["AWS_ACCESS_KEY_ID"] = ""
|
||||
os.environ["AWS_SECRET_ACCESS_KEY"] = ""
|
||||
os.environ["AWS_REGION_NAME"] = ""
|
||||
|
||||
|
||||
def test_completion_sagemaker():
|
||||
try:
|
||||
print("testing sagemaker")
|
||||
response = completion(
|
||||
model="sagemaker/berri-benchmarking-Llama-2-70b-chat-hf-4",
|
||||
messages=[{"role": "user", "content": "Hey, how's it going?"}],
|
||||
input_cost_per_second=0.000420,
|
||||
)
|
||||
# Add any assertions here to check the response
|
||||
print(response)
|
||||
cost = completion_cost(completion_response=response)
|
||||
print(cost)
|
||||
except Exception as e:
|
||||
raise Exception(f"Error occurred: {e}")
|
||||
|
||||
```
|
||||
|
||||
### Usage with OpenAI Proxy Server
|
||||
|
||||
**Step 1: Add pricing to config.yaml**
|
||||
```yaml
|
||||
model_list:
|
||||
- model_name: sagemaker-completion-model
|
||||
litellm_params:
|
||||
model: sagemaker/berri-benchmarking-Llama-2-70b-chat-hf-4
|
||||
input_cost_per_second: 0.000420
|
||||
- model_name: sagemaker-embedding-model
|
||||
litellm_params:
|
||||
model: sagemaker/berri-benchmarking-gpt-j-6b-fp16
|
||||
input_cost_per_second: 0.000420
|
||||
```
|
||||
|
||||
**Step 2: Start proxy**
|
||||
|
||||
```bash
|
||||
litellm /path/to/config.yaml
|
||||
```
|
||||
|
||||
**Step 3: View Spend Logs**
|
||||
|
||||
<Image img={require('../../img/spend_logs_table.png')} />
|
||||
|
||||
## Cost Per Token (e.g. Azure)
|
||||
|
||||
|
||||
```python
|
||||
# !pip install boto3
|
||||
from litellm import completion, completion_cost
|
||||
|
||||
## set ENV variables
|
||||
os.environ["AZURE_API_KEY"] = ""
|
||||
os.environ["AZURE_API_BASE"] = ""
|
||||
os.environ["AZURE_API_VERSION"] = ""
|
||||
|
||||
|
||||
def test_completion_azure_model():
|
||||
try:
|
||||
print("testing azure custom pricing")
|
||||
# azure call
|
||||
response = completion(
|
||||
model = "azure/<your_deployment_name>",
|
||||
messages = [{ "content": "Hello, how are you?","role": "user"}],
|
||||
input_cost_per_token=0.005,
|
||||
output_cost_per_token=1,
|
||||
)
|
||||
# Add any assertions here to check the response
|
||||
print(response)
|
||||
cost = completion_cost(completion_response=response)
|
||||
print(cost)
|
||||
except Exception as e:
|
||||
raise Exception(f"Error occurred: {e}")
|
||||
|
||||
test_completion_azure_model()
|
||||
```
|
||||
|
||||
### Usage with OpenAI Proxy Server
|
||||
|
||||
```yaml
|
||||
model_list:
|
||||
- model_name: azure-model
|
||||
litellm_params:
|
||||
model: azure/<your_deployment_name>
|
||||
api_key: os.environ/AZURE_API_KEY
|
||||
api_base: os.environ/AZURE_API_BASE
|
||||
api_version: os.environ/AZURE_API_VERSION
|
||||
input_cost_per_token: 0.000421 # 👈 ONLY to track cost per token
|
||||
output_cost_per_token: 0.000520 # 👈 ONLY to track cost per token
|
||||
```
|
docs/my-website/docs/proxy/debugging.md (new file): 34 lines
|
@ -0,0 +1,34 @@
|
|||
# Debugging
|
||||
|
||||
Two levels of debugging are supported.
|
||||
|
||||
- debug (prints info logs)
|
||||
- detailed debug (prints debug logs)
|
||||
|
||||
## `debug`
|
||||
|
||||
**via cli**
|
||||
|
||||
```bash
|
||||
$ litellm --debug
|
||||
```
|
||||
|
||||
**via env**
|
||||
|
||||
```python
|
||||
os.environ["LITELLM_LOG"] = "INFO"
|
||||
```
|
||||
|
||||
## `detailed debug`
|
||||
|
||||
**via cli**
|
||||
|
||||
```bash
|
||||
$ litellm --detailed_debug
|
||||
```
|
||||
|
||||
**via env**
|
||||
|
||||
```python
|
||||
os.environ["LITELLM_LOG"] = "DEBUG"
|
||||
```
|
|
@ -5,8 +5,10 @@ Use this to health check all LLMs defined in your config.yaml
|
|||
|
||||
The proxy exposes:
|
||||
* a /health endpoint which returns the health of the LLM APIs
|
||||
* a /test endpoint which makes a ping to the litellm server
|
||||
* a /health/readiness endpoint for returning if the proxy is ready to accept requests
|
||||
* a /health/liveliness endpoint for returning if the proxy is alive
|
||||
|
||||
## `/health`
|
||||
#### Request
|
||||
Make a GET Request to `/health` on the proxy
|
||||
```shell
|
||||
|
@ -39,7 +41,7 @@ litellm --health
|
|||
}
|
||||
```
|
||||
|
||||
## Background Health Checks
|
||||
### Background Health Checks
|
||||
|
||||
You can enable model health checks being run in the background, to prevent each model from being queried too frequently via `/health`.
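A minimal config sketch, using the `background_health_checks` and `health_check_interval` settings from the all-settings reference earlier in this changeset:

```yaml
general_settings:
  background_health_checks: true # query each model in the background instead of on every /health call
  health_check_interval: 300     # seconds between background checks
```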
|
||||
|
||||
|
@ -61,7 +63,7 @@ $ litellm /path/to/config.yaml
|
|||
curl --location 'http://0.0.0.0:8000/health'
|
||||
```
|
||||
|
||||
## Embedding Models
|
||||
### Embedding Models
|
||||
|
||||
We need some way to know if the model is an embedding model when running checks. If you specify `mode: embedding` in your config, the health check for that model is made as an embedding request.
|
||||
|
||||
|
@ -77,7 +79,7 @@ model_list:
|
|||
mode: embedding # 👈 ADD THIS
|
||||
```
|
||||
|
||||
## Text Completion Models
|
||||
### Text Completion Models
|
||||
|
||||
We need some way to know if the model is a text completion model when running checks. If you specify `mode: completion` in your config, the health check for that model is made as a text completion request.
|
||||
|
||||
|
@ -92,3 +94,54 @@ model_list:
|
|||
model_info:
|
||||
mode: completion # 👈 ADD THIS
|
||||
```
|
||||
|
||||
## `/health/readiness`
|
||||
|
||||
Unprotected endpoint for checking if proxy is ready to accept requests
|
||||
|
||||
Example Request:
|
||||
|
||||
```bash
|
||||
curl --location 'http://0.0.0.0:8000/health/readiness'
|
||||
```
|
||||
|
||||
Example Response:
|
||||
|
||||
*If proxy connected to a database*
|
||||
|
||||
```json
|
||||
{
|
||||
"status": "healthy",
|
||||
"db": "connected",
|
||||
"litellm_version":"1.19.2",
|
||||
}
|
||||
```
|
||||
|
||||
*If proxy not connected to a database*
|
||||
|
||||
```json
|
||||
{
|
||||
"status": "healthy",
|
||||
"db": "Not connected",
|
||||
"litellm_version":"1.19.2",
|
||||
}
|
||||
```
|
||||
|
||||
## `/health/liveliness`
|
||||
|
||||
Unprotected endpoint for checking if proxy is alive
|
||||
|
||||
|
||||
Example Request:
|
||||
|
||||
```
|
||||
curl -X 'GET' \
|
||||
'http://0.0.0.0:8000/health/liveliness' \
|
||||
-H 'accept: application/json'
|
||||
```
|
||||
|
||||
Example Response:
|
||||
|
||||
```json
|
||||
"I'm alive!"
|
||||
```
|
|
@ -1,5 +1,4 @@
|
|||
|
||||
# Load Balancing - Multiple Instances of 1 model
|
||||
# Multiple Instances of 1 model
|
||||
Load balance multiple instances of the same model
|
||||
|
||||
The proxy will handle routing requests (using LiteLLM's Router). **Set `rpm` in the config if you want to maximize throughput**
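For illustration, a config sketch with two deployments of the same `model_name`, each with an `rpm` value (model and endpoint names are placeholders):

```yaml
model_list:
  - model_name: gpt-3.5-turbo
    litellm_params:
      model: azure/gpt-35-turbo-eu              # placeholder deployment
      api_base: https://my-endpoint-eu.openai.azure.com/
      api_key: os.environ/AZURE_EUROPE_API_KEY
      rpm: 6                                    # requests per minute for this instance
  - model_name: gpt-3.5-turbo
    litellm_params:
      model: azure/gpt-35-turbo-ca              # placeholder deployment
      api_base: https://my-endpoint-ca.openai.azure.com/
      api_key: os.environ/AZURE_CANADA_API_KEY
      rpm: 6
```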
|
||||
|
|
|
@ -1,8 +1,6 @@
|
|||
import Image from '@theme/IdealImage';
|
||||
|
||||
# [BETA] Self-serve UI
|
||||
|
||||
Allow your users to create their own keys through a UI
|
||||
# [BETA] Admin UI
|
||||
|
||||
:::info
|
||||
|
||||
|
@ -10,26 +8,17 @@ This is in beta, so things may change. If you have feedback, [let us know](https
|
|||
|
||||
:::
|
||||
|
||||
Allow your users to create and view their own keys through a UI
|
||||
|
||||
<Image img={require('../../img/admin_ui_2.png')} />
|
||||
|
||||
|
||||
|
||||
## Quick Start
|
||||
|
||||
Requirements:
|
||||
## 1. Changes to your config.yaml
|
||||
|
||||
- Need an SMTP server connection to send emails (e.g. [Resend](https://resend.com/docs/send-with-smtp))
|
||||
|
||||
[**See code**](https://github.com/BerriAI/litellm/blob/61cd800b9ffbb02c286481d2056b65c7fb5447bf/litellm/proxy/proxy_server.py#L1782)
|
||||
|
||||
### Step 1. Save SMTP server credentials
|
||||
|
||||
```env
|
||||
export SMTP_HOST="my-smtp-host"
|
||||
export SMTP_USERNAME="my-smtp-password"
|
||||
export SMTP_PASSWORD="my-smtp-password"
|
||||
export SMTP_SENDER_EMAIL="krrish@berri.ai"
|
||||
```
|
||||
|
||||
### Step 2. Enable user auth
|
||||
|
||||
In your config.yaml,
|
||||
Set `allow_user_auth: true` on your config
|
||||
|
||||
```yaml
|
||||
general_settings:
|
||||
|
@ -37,13 +26,36 @@ general_settings:
|
|||
allow_user_auth: true
|
||||
```
|
||||
|
||||
This will enable:
|
||||
* Users to create keys via `/key/generate` (by default, only admin can create keys)
|
||||
* The `/user/auth` endpoint to send user's emails with their login credentials (key + user id)
|
||||
## 2. Setup Google SSO - Use this to Authenticate Team Members to the UI
|
||||
- Create an Oauth 2.0 Client
|
||||
<Image img={require('../../img/google_oauth2.png')} />
|
||||
|
||||
### Step 3. Connect to UI
|
||||
- Navigate to Google `Credentials`
|
||||
- Create a new Oauth client ID
|
||||
- Set the `GOOGLE_CLIENT_ID` and `GOOGLE_CLIENT_SECRET` in your Proxy .env
|
||||
- Set Redirect URL on your Oauth 2.0 Client
|
||||
- Click on your Oauth 2.0 client on https://console.cloud.google.com/
|
||||
- Set a redirect url = `<your proxy base url>/google-callback`
|
||||
```
|
||||
https://litellm-production-7002.up.railway.app/google-callback
|
||||
```
|
||||
<Image img={require('../../img/google_redirect.png')} />
|
||||
## 3. Required env variables on your Proxy
|
||||
|
||||
You can use our hosted UI (https://dashboard.litellm.ai/) or [self-host your own](https://github.com/BerriAI/litellm/tree/main/ui).
|
||||
```shell
|
||||
PROXY_BASE_URL="<your deployed proxy endpoint>" # e.g. PROXY_BASE_URL=https://litellm-production-7002.up.railway.app/
|
||||
|
||||
# for Google SSO Login
|
||||
GOOGLE_CLIENT_ID=
|
||||
GOOGLE_CLIENT_SECRET=
|
||||
```
|
||||
|
||||
## 4. Use UI
|
||||
|
||||
👉 Get Started here: https://litellm-dashboard.vercel.app/
|
||||
|
||||
|
||||
<!-- You can use our hosted UI (https://dashboard.litellm.ai/) or [self-host your own](https://github.com/BerriAI/litellm/tree/main/ui).
|
||||
|
||||
If you self-host, you need to save the UI url in your proxy environment as `LITELLM_HOSTED_UI`.
|
||||
|
||||
|
@ -62,4 +74,13 @@ Connect your proxy to your UI, by entering:
|
|||
|
||||
### Create Keys
|
||||
|
||||
<Image img={require('../../img/user_create_key_screen.png')} />
|
||||
<Image img={require('../../img/user_create_key_screen.png')} />
|
||||
|
||||
### Spend Per Key
|
||||
|
||||
<Image img={require('../../img/spend_per_api_key.png')} />
|
||||
|
||||
### Spend Per User
|
||||
|
||||
<Image img={require('../../img/spend_per_user.png')} /> -->
|
||||
|
||||
|
|
|
@ -1,4 +1,7 @@
|
|||
# 💰 Budgets, Rate Limits per user
|
||||
import Tabs from '@theme/Tabs';
|
||||
import TabItem from '@theme/TabItem';
|
||||
|
||||
# 💰 Budgets, Rate Limits
|
||||
|
||||
Requirements:
|
||||
|
||||
|
@ -6,17 +9,74 @@ Requirements:
|
|||
|
||||
|
||||
## Set Budgets
|
||||
LiteLLM exposes a `/user/new` endpoint to create budgets for users, that persist across multiple keys.
|
||||
|
||||
This is documented in the swagger (live on your server root endpoint - e.g. `http://0.0.0.0:8000/`). Here's an example request.
|
||||
You can set budgets at 3 levels:
|
||||
- For the proxy
|
||||
- For a user
|
||||
- For a key
|
||||
|
||||
|
||||
<Tabs>
|
||||
<TabItem value="proxy" label="For Proxy">
|
||||
|
||||
Apply a budget across all calls on the proxy
|
||||
|
||||
**Step 1. Modify config.yaml**
|
||||
|
||||
```yaml
|
||||
general_settings:
|
||||
master_key: sk-1234
|
||||
|
||||
litellm_settings:
|
||||
# other litellm settings
|
||||
max_budget: 0 # (float) sets max budget as $0 USD
|
||||
budget_duration: 30d # (str) frequency of reset - You can set duration as seconds ("30s"), minutes ("30m"), hours ("30h"), days ("30d").
|
||||
```
|
||||
|
||||
**Step 2. Start proxy**
|
||||
|
||||
```bash
|
||||
litellm /path/to/config.yaml
|
||||
```
|
||||
|
||||
**Step 3. Send test call**
|
||||
|
||||
```bash
|
||||
curl --location 'http://0.0.0.0:8000/chat/completions' \
|
||||
--header 'Authorization: Bearer sk-1234' \
|
||||
--header 'Content-Type: application/json' \
|
||||
--data '{
|
||||
"model": "gpt-3.5-turbo",
|
||||
"messages": [
|
||||
{
|
||||
"role": "user",
|
||||
"content": "what llm are you"
|
||||
}
|
||||
],
|
||||
}'
|
||||
```
|
||||
</TabItem>
|
||||
<TabItem value="per-user" label="For User">
|
||||
|
||||
Apply a budget across multiple keys.
|
||||
|
||||
LiteLLM exposes a `/user/new` endpoint to create budgets for this.
|
||||
|
||||
You can:
|
||||
- Add budgets to users [**Jump**](#add-budgets-to-users)
|
||||
- Add budget durations, to reset spend [**Jump**](#add-budget-duration-to-users)
|
||||
|
||||
By default the `max_budget` is set to `null` and is not checked for keys
|
||||
|
||||
### **Add budgets to users**
|
||||
```shell
|
||||
curl --location 'http://localhost:8000/user/new' \
|
||||
--header 'Authorization: Bearer <your-master-key>' \
|
||||
--header 'Content-Type: application/json' \
|
||||
--data-raw '{"models": ["azure-models"], "max_budget": 0, "user_id": "krrish3@berri.ai"}'
|
||||
```
|
||||
The request is a normal `/key/generate` request body + a `max_budget` field.
|
||||
|
||||
[**See Swagger**](https://litellm-api.up.railway.app/#/user%20management/new_user_user_new_post)
|
||||
|
||||
**Sample Response**
|
||||
|
||||
|
@ -29,18 +89,163 @@ The request is a normal `/key/generate` request body + a `max_budget` field.
|
|||
}
|
||||
```
|
||||
|
||||
### **Add budget duration to users**
|
||||
|
||||
`budget_duration`: Budget is reset at the end of specified duration. If not set, budget is never reset. You can set duration as seconds ("30s"), minutes ("30m"), hours ("30h"), days ("30d").
|
||||
|
||||
```
|
||||
curl 'http://0.0.0.0:8000/user/new' \
|
||||
--header 'Authorization: Bearer <your-master-key>' \
|
||||
--header 'Content-Type: application/json' \
|
||||
--data-raw '{
|
||||
"team_id": "core-infra", # [OPTIONAL]
|
||||
"max_budget": 10,
|
||||
"budget_duration": 10s,
|
||||
}'
|
||||
```
|
||||
|
||||
### Create new keys for existing user
|
||||
|
||||
Now you can just call `/key/generate` with that user_id (i.e. krrish3@berri.ai) and:
|
||||
- **Budget Check**: krrish3@berri.ai's budget (i.e. $10) will be checked for this key
|
||||
- **Spend Tracking**: spend for this key will update krrish3@berri.ai's spend as well
|
||||
|
||||
```bash
|
||||
curl --location 'http://0.0.0.0:8000/key/generate' \
|
||||
--header 'Authorization: Bearer <your-master-key>' \
|
||||
--header 'Content-Type: application/json' \
|
||||
--data '{"models": ["azure-models"], "user_id": "krrish3@berri.ai"}'
|
||||
```
|
||||
|
||||
</TabItem>
|
||||
<TabItem value="per-key" label="For Key">
|
||||
|
||||
Apply a budget on a key.
|
||||
|
||||
You can:
|
||||
- Add budgets to keys [**Jump**](#add-budgets-to-keys)
|
||||
- Add budget durations, to reset spend [**Jump**](#add-budget-duration-to-keys)
|
||||
|
||||
**Expected Behaviour**
|
||||
- Costs Per key get auto-populated in `LiteLLM_VerificationToken` Table
|
||||
- After the key crosses its `max_budget`, requests fail
|
||||
- If duration set, spend is reset at the end of the duration
|
||||
|
||||
By default the `max_budget` is set to `null` and is not checked for keys
|
||||
|
||||
### **Add budgets to keys**
|
||||
|
||||
```bash
|
||||
curl 'http://0.0.0.0:8000/key/generate' \
|
||||
--header 'Authorization: Bearer <your-master-key>' \
|
||||
--header 'Content-Type: application/json' \
|
||||
--data-raw '{
|
||||
"team_id": "core-infra", # [OPTIONAL]
|
||||
"max_budget": 10,
|
||||
}'
|
||||
```
|
||||
|
||||
Example Request to `/chat/completions` when key has crossed budget
|
||||
|
||||
```shell
|
||||
curl --location 'http://0.0.0.0:8000/chat/completions' \
|
||||
--header 'Content-Type: application/json' \
|
||||
--header 'Authorization: Bearer <generated-key>' \
|
||||
--data ' {
|
||||
"model": "azure-gpt-3.5",
|
||||
"user": "e09b4da8-ed80-4b05-ac93-e16d9eb56fca",
|
||||
"messages": [
|
||||
{
|
||||
"role": "user",
|
||||
"content": "respond in 50 lines"
|
||||
}
|
||||
],
|
||||
}'
|
||||
```
|
||||
|
||||
|
||||
Expected Response from `/chat/completions` when key has crossed budget
|
||||
```shell
|
||||
{
|
||||
"detail":"Authentication Error, ExceededTokenBudget: Current spend for token: 7.2e-05; Max Budget for Token: 2e-07"
|
||||
}
|
||||
```
|
||||
|
||||
### **Add budget duration to keys**
|
||||
|
||||
`budget_duration`: Budget is reset at the end of specified duration. If not set, budget is never reset. You can set duration as seconds ("30s"), minutes ("30m"), hours ("30h"), days ("30d").
|
||||
|
||||
```
|
||||
curl 'http://0.0.0.0:8000/key/generate' \
|
||||
--header 'Authorization: Bearer <your-master-key>' \
|
||||
--header 'Content-Type: application/json' \
|
||||
--data-raw '{
|
||||
"team_id": "core-infra", # [OPTIONAL]
|
||||
"max_budget": 10,
|
||||
"budget_duration": 10s,
|
||||
}'
|
||||
```
|
||||
|
||||
</TabItem>
|
||||
</Tabs>
|
||||
|
||||
## Set Rate Limits
|
||||
|
||||
Set max parallel requests a user can make, when you create user keys - `/key/generate`.
|
||||
You can set:
|
||||
- max parallel requests
|
||||
- tpm limits
|
||||
- rpm limits
|
||||
|
||||
<Tabs>
|
||||
<TabItem value="per-user" label="Per User">
|
||||
|
||||
Use `/user/new`, to persist rate limits across multiple keys.
|
||||
|
||||
|
||||
```shell
|
||||
curl --location 'http://0.0.0.0:8000/user/new' \
|
||||
--header 'Authorization: Bearer sk-1234' \
|
||||
--header 'Content-Type: application/json' \
|
||||
--data '{"user_id": "krrish@berri.ai", "max_parallel_requests": 10, "tpm_limit": 20, "rpm_limit": 4}'
|
||||
```
|
||||
|
||||
[**See Swagger**](https://litellm-api.up.railway.app/#/user%20management/new_user_user_new_post)
|
||||
|
||||
**Expected Response**
|
||||
|
||||
```json
|
||||
{
|
||||
"key": "sk-sA7VDkyhlQ7m8Gt77Mbt3Q",
|
||||
"expires": "2024-01-19T01:21:12.816168",
|
||||
"user_id": "krrish@berri.ai",
|
||||
}
|
||||
```
|
||||
|
||||
</TabItem>
|
||||
<TabItem value="per-key" label="Per Key">
|
||||
|
||||
Use `/key/generate`, if you want them for just that key.
|
||||
|
||||
```shell
|
||||
curl --location 'http://0.0.0.0:8000/key/generate' \
|
||||
--header 'Authorization: Bearer sk-1234' \
|
||||
--header 'Content-Type: application/json' \
|
||||
--data '{"duration": "20m", "max_parallel_requests": 1}' # 👈 max parallel requests = 1
|
||||
--data '{"max_parallel_requests": 10, "tpm_limit": 20, "rpm_limit": 4}'
|
||||
```
|
||||
|
||||
**Expected Response**
|
||||
|
||||
```json
|
||||
{
|
||||
"key": "sk-ulGNRXWtv7M0lFnnsQk0wQ",
|
||||
"expires": "2024-01-18T20:48:44.297973",
|
||||
"user_id": "78c2c8fc-c233-43b9-b0c3-eb931da27b84" // 👈 auto-generated
|
||||
}
|
||||
```
|
||||
|
||||
</TabItem>
|
||||
</Tabs>
|
||||
|
||||
## Grant Access to new model
|
||||
|
||||
Use model access groups to give users access to select models, and add new ones to it over time (e.g. mistral, llama-2, etc.).
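A sketch of how this can be wired up; the `access_groups` field under `model_info` and the `beta-models` group name are assumptions for illustration:

```yaml
model_list:
  - model_name: text-embedding-ada-002
    litellm_params:
      model: azure/azure-embedding-model
    model_info:
      access_groups: ["beta-models"] # add this deployment to the access group
```

```shell
# keys created with the group name get access to every model in that group
curl 'http://0.0.0.0:8000/key/generate' \
--header 'Authorization: Bearer <your-master-key>' \
--header 'Content-Type: application/json' \
--data-raw '{"models": ["beta-models"]}'
```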
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
# Key Management
|
||||
# Virtual Keys
|
||||
Track Spend, Set budgets and create virtual keys for the proxy
|
||||
|
||||
Grant others temporary access to your proxy, with keys that expire after a set duration.
|
||||
|
@ -12,7 +12,7 @@ Grant other's temporary access to your proxy, with keys that expire after a set
|
|||
|
||||
:::
|
||||
|
||||
## Quick Start
|
||||
## Setup
|
||||
|
||||
Requirements:
|
||||
|
||||
|
@ -58,36 +58,53 @@ litellm --config /path/to/config.yaml
|
|||
curl 'http://0.0.0.0:8000/key/generate' \
|
||||
--header 'Authorization: Bearer <your-master-key>' \
|
||||
--header 'Content-Type: application/json' \
|
||||
--data-raw '{"models": ["gpt-3.5-turbo", "gpt-4", "claude-2"], "duration": "20m","metadata": {"user": "ishaan@berri.ai", "team": "core-infra"}}'
|
||||
--data-raw '{"models": ["gpt-3.5-turbo", "gpt-4", "claude-2"], "duration": "20m","metadata": {"user": "ishaan@berri.ai"}}'
|
||||
```
|
||||
|
||||
- `models`: *list or null (optional)* - Specify the models a token has access to. If null, then the token has access to all models on the server.
|
||||
|
||||
- `duration`: *str or null (optional)* Specify the length of time the token is valid for. If null, default is set to 1 hour. You can set duration as seconds ("30s"), minutes ("30m"), hours ("30h"), days ("30d").
|
||||
## /key/generate
|
||||
|
||||
- `metadata`: *dict or null (optional)* Pass metadata for the created token. If null defaults to {}
|
||||
### Request
|
||||
```shell
|
||||
curl 'http://0.0.0.0:8000/key/generate' \
|
||||
--header 'Authorization: Bearer <your-master-key>' \
|
||||
--header 'Content-Type: application/json' \
|
||||
--data-raw '{
|
||||
"models": ["gpt-3.5-turbo", "gpt-4", "claude-2"],
|
||||
"duration": "20m",
|
||||
"metadata": {"user": "ishaan@berri.ai"},
|
||||
"team_id": "core-infra",
|
||||
"max_budget": 10,
|
||||
}'
|
||||
```
|
||||
|
||||
Expected response:
|
||||
|
||||
Request Params:
|
||||
|
||||
- `duration`: *Optional[str]* - Specify the length of time the token is valid for. You can set duration as seconds ("30s"), minutes ("30m"), hours ("30h"), days ("30d").
|
||||
- `key_alias`: *Optional[str]* - User defined key alias
|
||||
- `team_id`: *Optional[str]* - The team id of the user
|
||||
- `models`: *Optional[list]* - Model_name's a user is allowed to call. (if empty, key is allowed to call all models)
|
||||
- `aliases`: *Optional[dict]* - Any alias mappings, on top of anything in the config.yaml model list. - https://docs.litellm.ai/docs/proxy/virtual_keys#managing-auth---upgradedowngrade-models
|
||||
- `config`: *Optional[dict]* - any key-specific configs, overrides config in config.yaml
|
||||
- `spend`: *Optional[int]* - Amount spent by key. Default is 0. Will be updated by proxy whenever key is used. https://docs.litellm.ai/docs/proxy/virtual_keys#managing-auth---tracking-spend
|
||||
- `max_budget`: *Optional[float]* - Specify max budget for a given key.
|
||||
- `max_parallel_requests`: *Optional[int]* - Rate limit a user based on the number of parallel requests. Raises 429 error, if user's parallel requests > x.
|
||||
- `metadata`: *Optional[dict]* - Metadata for key, store information for key. Example metadata = {"team": "core-infra", "app": "app2", "email": "ishaan@berri.ai" }
|
||||
|
||||
|
||||
### Response
|
||||
|
||||
```python
|
||||
{
|
||||
"key": "sk-kdEXbIqZRwEeEiHwdg7sFA", # Bearer token
|
||||
"expires": "2023-11-19T01:38:25.838000+00:00" # datetime object
|
||||
"key_name": "sk-...7sFA" # abbreviated key string, ONLY stored in db if `allow_user_auth: true` set - [see](./ui.md)
|
||||
...
|
||||
}
|
||||
```
|
||||
|
||||
## Keys that don't expire
|
||||
|
||||
Just set duration to None.
|
||||
|
||||
```bash
|
||||
curl --location 'http://0.0.0.0:8000/key/generate' \
|
||||
--header 'Authorization: Bearer <your-master-key>' \
|
||||
--header 'Content-Type: application/json' \
|
||||
--data '{"models": ["azure-models"], "aliases": {"mistral-7b": "gpt-3.5-turbo"}, "duration": null}'
|
||||
```
|
||||
|
||||
## Upgrade/Downgrade Models
|
||||
### Upgrade/Downgrade Models
|
||||
|
||||
If a user is expected to use a given model (i.e. gpt3-5), and you want to:
|
||||
|
||||
|
@ -137,7 +154,7 @@ curl -X POST "https://0.0.0.0:8000/key/generate" \
|
|||
- **How is routing between different keys/api bases done?** litellm handles this by shuffling between different models in the model list with the same model_name. [**See Code**](https://github.com/BerriAI/litellm/blob/main/litellm/router.py)
|
||||
|
||||
|
||||
## Grant Access to new model
|
||||
### Grant Access to new model
|
||||
|
||||
Use model access groups to give users access to select models, and add new ones to it over time (e.g. mistral, llama-2, etc.)
|
||||
|
||||
|
@ -165,6 +182,188 @@ curl --location 'http://localhost:8000/key/generate' \
|
|||
"max_budget": 0,}'
|
||||
```
|
||||
|
||||
|
||||
## /key/info
|
||||
|
||||
### Request
|
||||
```shell
|
||||
curl -X GET "http://0.0.0.0:8000/key/info?key=sk-02Wr4IAlN3NvPXvL5JVvDA" \
|
||||
-H "Authorization: Bearer sk-1234"
|
||||
```
|
||||
|
||||
Request Params:
|
||||
- key: str - The key you want the info for
|
||||
|
||||
### Response
|
||||
|
||||
`token` is the hashed key (The DB stores the hashed key for security)
|
||||
```json
|
||||
{
|
||||
"key": "sk-02Wr4IAlN3NvPXvL5JVvDA",
|
||||
"info": {
|
||||
"token": "80321a12d03412c527f2bd9db5fabd746abead2e1d50b435a534432fbaca9ef5",
|
||||
"spend": 0.0,
|
||||
"expires": "2024-01-18T23:52:09.125000+00:00",
|
||||
"models": ["azure-gpt-3.5", "azure-embedding-model"],
|
||||
"aliases": {},
|
||||
"config": {},
|
||||
"user_id": "ishaan2@berri.ai",
|
||||
"team_id": "None",
|
||||
"max_parallel_requests": null,
|
||||
"metadata": {}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
```
|
||||
|
||||
## /key/update
|
||||
|
||||
### Request
|
||||
```shell
|
||||
curl 'http://0.0.0.0:8000/key/update' \
|
||||
--header 'Authorization: Bearer <your-master-key>' \
|
||||
--header 'Content-Type: application/json' \
|
||||
--data-raw '{
|
||||
"key": "sk-kdEXbIqZRwEeEiHwdg7sFA",
|
||||
"models": ["gpt-3.5-turbo", "gpt-4", "claude-2"],
|
||||
"metadata": {"user": "ishaan@berri.ai"},
|
||||
"team_id": "core-infra"
|
||||
}'
|
||||
```
|
||||
|
||||
Request Params:
|
||||
- key: str - The key that needs to be updated.
|
||||
|
||||
- models: list or null (optional) - Specify the models a token has access to. If null, then the token has access to all models on the server.
|
||||
|
||||
- metadata: dict or null (optional) - Pass metadata for the updated token. If null, defaults to an empty dictionary.
|
||||
|
||||
- team_id: str or null (optional) - Specify the team_id for the associated key.
|
||||
|
||||
### Response
|
||||
|
||||
```json
|
||||
{
|
||||
"key": "sk-kdEXbIqZRwEeEiHwdg7sFA",
|
||||
"models": ["gpt-3.5-turbo", "gpt-4", "claude-2"],
|
||||
"metadata": {
|
||||
"user": "ishaan@berri.ai"
|
||||
}
|
||||
}
|
||||
|
||||
```
|
||||
|
||||
|
||||
## /key/delete
|
||||
|
||||
### Request
|
||||
```shell
|
||||
curl 'http://0.0.0.0:8000/key/delete' \
|
||||
--header 'Authorization: Bearer <your-master-key>' \
|
||||
--header 'Content-Type: application/json' \
|
||||
--data-raw '{
|
||||
"keys": ["sk-kdEXbIqZRwEeEiHwdg7sFA"]
|
||||
}'
|
||||
```
|
||||
|
||||
Request Params:
|
||||
- keys: List[str] - List of keys to delete
|
||||
|
||||
### Response
|
||||
|
||||
```json
|
||||
{
|
||||
"deleted_keys": ["sk-kdEXbIqZRwEeEiHwdg7sFA"]
|
||||
}
|
||||
```
|
||||
|
||||
## Default /key/generate params
|
||||
Use this, if you need to control the default `max_budget` or any `key/generate` param per key.
|
||||
|
||||
When a `/key/generate` request does not specify `max_budget`, it will use the `max_budget` specified in `default_key_generate_params`
|
||||
|
||||
Set `litellm_settings:default_key_generate_params`:
|
||||
```yaml
|
||||
litellm_settings:
|
||||
default_key_generate_params:
|
||||
max_budget: 1.5000
|
||||
models: ["azure-gpt-3.5"]
|
||||
duration: # blank means `null`
|
||||
metadata: {"setting":"default"}
|
||||
team_id: "core-infra"
|
||||
```
|
||||
## Set Budgets - Per Key
|
||||
|
||||
Set `max_budget` in (USD $) param in the `key/generate` request. By default the `max_budget` is set to `null` and is not checked for keys
|
||||
|
||||
```shell
|
||||
curl 'http://0.0.0.0:8000/key/generate' \
|
||||
--header 'Authorization: Bearer <your-master-key>' \
|
||||
--header 'Content-Type: application/json' \
|
||||
--data-raw '{
|
||||
"metadata": {"user": "ishaan@berri.ai"},
|
||||
"team_id": "core-infra",
|
||||
"max_budget": 10,
|
||||
}'
|
||||
```
|
||||
|
||||
#### Expected Behaviour
|
||||
- Costs Per key get auto-populated in `LiteLLM_VerificationToken` Table
|
||||
- After the key crosses its `max_budget`, requests fail
|
||||
|
||||
Example Request to `/chat/completions` when key has crossed budget
|
||||
|
||||
```shell
|
||||
curl --location 'http://0.0.0.0:8000/chat/completions' \
|
||||
--header 'Content-Type: application/json' \
|
||||
--header 'Authorization: Bearer sk-ULl_IKCVFy2EZRzQB16RUA' \
|
||||
--data ' {
|
||||
"model": "azure-gpt-3.5",
|
||||
"user": "e09b4da8-ed80-4b05-ac93-e16d9eb56fca",
|
||||
"messages": [
|
||||
{
|
||||
"role": "user",
|
||||
"content": "respond in 50 lines"
|
||||
}
|
||||
],
|
||||
}'
|
||||
```
|
||||
|
||||
|
||||
Expected Response from `/chat/completions` when key has crossed budget
|
||||
```shell
|
||||
{
|
||||
"detail":"Authentication Error, ExceededTokenBudget: Current spend for token: 7.2e-05; Max Budget for Token: 2e-07"
|
||||
}
|
||||
```
|
||||
|
||||
|
||||
## Set Budgets - Per User
|
||||
|
||||
LiteLLM exposes a `/user/new` endpoint to create budgets for users, that persist across multiple keys.
|
||||
|
||||
This is documented in the swagger (live on your server root endpoint - e.g. `http://0.0.0.0:8000/`). Here's an example request.
|
||||
|
||||
```shell
|
||||
curl --location 'http://localhost:8000/user/new' \
|
||||
--header 'Authorization: Bearer <your-master-key>' \
|
||||
--header 'Content-Type: application/json' \
|
||||
--data-raw '{"models": ["azure-models"], "max_budget": 0, "user_id": "krrish3@berri.ai"}'
|
||||
```
|
||||
The request is a normal `/key/generate` request body + a `max_budget` field.
|
||||
|
||||
**Sample Response**
|
||||
|
||||
```shell
|
||||
{
|
||||
"key": "sk-YF2OxDbrgd1y2KgwxmEA2w",
|
||||
"expires": "2023-12-22T09:53:13.861000Z",
|
||||
"user_id": "krrish3@berri.ai",
|
||||
"max_budget": 0.0
|
||||
}
|
||||
```
|
||||
|
||||
## Tracking Spend
|
||||
|
||||
You can get spend for a key by using the `/key/info` endpoint.
|
||||
|
@ -200,32 +399,6 @@ This is automatically updated (in USD) when calls are made to /completions, /cha
|
|||
```
|
||||
|
||||
|
||||
|
||||
## Set Budgets
|
||||
|
||||
LiteLLM exposes a `/user/new` endpoint to create budgets for users, that persist across multiple keys.
|
||||
|
||||
This is documented in the swagger (live on your server root endpoint - e.g. `http://0.0.0.0:8000/`). Here's an example request.
|
||||
|
||||
```shell
|
||||
curl --location 'http://localhost:8000/user/new' \
|
||||
--header 'Authorization: Bearer <your-master-key>' \
|
||||
--header 'Content-Type: application/json' \
|
||||
--data-raw '{"models": ["azure-models"], "max_budget": 0, "user_id": "krrish3@berri.ai"}'
|
||||
```
|
||||
The request is a normal `/key/generate` request body + a `max_budget` field.
|
||||
|
||||
**Sample Response**
|
||||
|
||||
```shell
|
||||
{
|
||||
"key": "sk-YF2OxDbrgd1y2KgwxmEA2w",
|
||||
"expires": "2023-12-22T09:53:13.861000Z",
|
||||
"user_id": "krrish3@berri.ai",
|
||||
"max_budget": 0.0
|
||||
}
|
||||
```
|
||||
|
||||
## Custom Auth
|
||||
|
||||
You can now override the default api key auth.
|
||||
|
@ -275,6 +448,97 @@ general_settings:
|
|||
$ litellm --config /path/to/config.yaml
|
||||
```
|
||||
|
||||
## Custom /key/generate
|
||||
|
||||
If you need to add custom logic before generating a Proxy API Key (Example Validating `team_id`)
|
||||
|
||||
### 1. Write a custom `custom_generate_key_fn`
|
||||
|
||||
|
||||
The input to the custom_generate_key_fn function is a single parameter: `data` [(Type: GenerateKeyRequest)](https://github.com/BerriAI/litellm/blob/main/litellm/proxy/_types.py#L125)
|
||||
|
||||
The output of your `custom_generate_key_fn` should be a dictionary with the following structure
|
||||
```python
|
||||
{
|
||||
"decision": False,
|
||||
"message": "This violates LiteLLM Proxy Rules. No team id provided.",
|
||||
}
|
||||
|
||||
```
|
||||
|
||||
- decision (Type: bool): A boolean value indicating whether the key generation is allowed (True) or not (False).
|
||||
|
||||
- message (Type: str, Optional): An optional message providing additional information about the decision. This field is included when the decision is False.
|
||||
|
||||
|
||||
```python
|
||||
async def custom_generate_key_fn(data: GenerateKeyRequest)-> dict:
|
||||
"""
|
||||
Asynchronous function for generating a key based on the input data.
|
||||
|
||||
Args:
|
||||
data (GenerateKeyRequest): The input data for key generation.
|
||||
|
||||
Returns:
|
||||
dict: A dictionary containing the decision and an optional message.
|
||||
{
|
||||
"decision": False,
|
||||
"message": "This violates LiteLLM Proxy Rules. No team id provided.",
|
||||
}
|
||||
"""
|
||||
|
||||
# decide if a key should be generated or not
|
||||
print("using custom auth function!")
|
||||
data_json = data.json() # type: ignore
|
||||
|
||||
# Unpacking variables
|
||||
team_id = data_json.get("team_id")
|
||||
duration = data_json.get("duration")
|
||||
models = data_json.get("models")
|
||||
aliases = data_json.get("aliases")
|
||||
config = data_json.get("config")
|
||||
spend = data_json.get("spend")
|
||||
user_id = data_json.get("user_id")
|
||||
max_parallel_requests = data_json.get("max_parallel_requests")
|
||||
metadata = data_json.get("metadata")
|
||||
tpm_limit = data_json.get("tpm_limit")
|
||||
rpm_limit = data_json.get("rpm_limit")
|
||||
|
||||
if team_id is not None and team_id == "litellm-core-infra@gmail.com":
|
||||
# only team_id="litellm-core-infra@gmail.com" can make keys
|
||||
return {
|
||||
"decision": True,
|
||||
}
|
||||
else:
|
||||
print("Failed custom auth")
|
||||
return {
|
||||
"decision": False,
|
||||
"message": "This violates LiteLLM Proxy Rules. No team id provided.",
|
||||
}
|
||||
```
|
||||
|
||||
|
||||
### 2. Pass the filepath (relative to the config.yaml)
|
||||
|
||||
Pass the filepath to the config.yaml
|
||||
|
||||
e.g. if they're both in the same dir - `./config.yaml` and `./custom_auth.py`, this is what it looks like:
|
||||
```yaml
|
||||
model_list:
|
||||
- model_name: "openai-model"
|
||||
litellm_params:
|
||||
model: "gpt-3.5-turbo"
|
||||
|
||||
litellm_settings:
|
||||
drop_params: True
|
||||
set_verbose: True
|
||||
|
||||
general_settings:
|
||||
custom_key_generate: custom_auth.custom_generate_key_fn
|
||||
```
|
||||
|
||||
|
||||
|
||||
|
||||
## [BETA] Dynamo DB
|
||||
|
||||
|
|
|
@ -302,6 +302,7 @@ asyncio.run(router_acompletion())
|
|||
|
||||
The timeout set in router is for the entire length of the call, and is passed down to the completion() call level as well.
|
||||
|
||||
**Global Timeouts**
|
||||
```python
|
||||
from litellm import Router
|
||||
|
||||
|
@ -313,6 +314,36 @@ router = Router(model_list=model_list,
|
|||
print(response)
|
||||
```
|
||||
|
||||
**Timeouts per model**

```python
from litellm import Router
import asyncio
import os  # needed for os.getenv below

model_list = [{
    "model_name": "gpt-3.5-turbo",
    "litellm_params": {
        "model": "azure/chatgpt-v-2",
        "api_key": os.getenv("AZURE_API_KEY"),
        "api_version": os.getenv("AZURE_API_VERSION"),
        "api_base": os.getenv("AZURE_API_BASE"),
        "timeout": 300,  # sets a 5 minute timeout
        "stream_timeout": 30  # sets a 30s timeout for streaming calls
    }
}]

# init router
router = Router(model_list=model_list, routing_strategy="least-busy")

async def router_acompletion():
    response = await router.acompletion(
        model="gpt-3.5-turbo",
        messages=[{"role": "user", "content": "Hey, how's it going?"}]
    )
    print(response)
    return response

asyncio.run(router_acompletion())
```

### Cooldowns

Set the limit for how many calls a model is allowed to fail in a minute before it is cooled down for a minute.
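As a rough sketch (parameter names taken from the `Router.__init__` signature shown further below; the values here are illustrative assumptions), this is configured directly on the `Router`:

```python
from litellm import Router

# reuses the model_list defined in the snippet above
router = Router(
    model_list=model_list,
    allowed_fails=1,    # failures tolerated before a deployment is cooled down (illustrative)
    cooldown_time=60,   # seconds the failing deployment is kept out of rotation (illustrative)
)
```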
@@ -574,6 +605,49 @@ response = router.completion(model="gpt-3.5-turbo", messages=messages)

print(f"response: {response}")
```

## Custom Callbacks - Track API Key, API Endpoint, Model Used

If you need to track the api_key, api endpoint, model, and custom_llm_provider used for each completion call, you can set up a [custom callback](https://docs.litellm.ai/docs/observability/custom_callback).

### Usage

```python
import litellm
from litellm import Router
from litellm.integrations.custom_logger import CustomLogger

class MyCustomHandler(CustomLogger):
    def log_success_event(self, kwargs, response_obj, start_time, end_time):
        print("On Success")
        print("kwargs=", kwargs)
        litellm_params = kwargs.get("litellm_params")
        api_key = litellm_params.get("api_key")
        api_base = litellm_params.get("api_base")
        custom_llm_provider = litellm_params.get("custom_llm_provider")
        response_cost = kwargs.get("response_cost")

        # print the values
        print("api_key=", api_key)
        print("api_base=", api_base)
        print("custom_llm_provider=", custom_llm_provider)
        print("response_cost=", response_cost)

    def log_failure_event(self, kwargs, response_obj, start_time, end_time):
        print("On Failure")
        print("kwargs=")

customHandler = MyCustomHandler()

litellm.callbacks = [customHandler]

# Init Router
router = Router(model_list=model_list, routing_strategy="simple-shuffle")

# router completion call
response = router.completion(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "Hi who are you"}]
)
```

## Deploy Router

@@ -602,17 +676,63 @@ def __init__(
    num_retries: int = 0,
    timeout: Optional[float] = None,
    default_litellm_params={},  # default params for Router.chat.completion.create
    set_verbose: bool = False,
    fallbacks: List = [],
    allowed_fails: Optional[int] = None,
    allowed_fails: Optional[int] = None,  # Number of times a deployment can fail before being added to cooldown
    cooldown_time: float = 1,  # (seconds) time to cooldown a deployment after failure
    context_window_fallbacks: List = [],
    model_group_alias: Optional[dict] = {},
    retry_after: int = 0,  # min time to wait before retrying a failed request
    retry_after: int = 0,  # (min) time to wait before retrying a failed request
    routing_strategy: Literal[
        "simple-shuffle",
        "least-busy",
        "usage-based-routing",
        "latency-based-routing",
    ] = "simple-shuffle",

    ## DEBUGGING ##
    set_verbose: bool = False,  # set this to True for seeing logs
    debug_level: Literal["DEBUG", "INFO"] = "INFO",  # set this to "DEBUG" for detailed debugging
):
```

## Debugging Router

### Basic Debugging

Set `Router(set_verbose=True)`

```python
from litellm import Router

router = Router(
    model_list=model_list,
    set_verbose=True
)
```

### Detailed Debugging

Set `Router(set_verbose=True, debug_level="DEBUG")`

```python
from litellm import Router

router = Router(
    model_list=model_list,
    set_verbose=True,
    debug_level="DEBUG"  # defaults to INFO
)
```

### Very Detailed Debugging

Set `litellm.set_verbose=True` and `Router(set_verbose=True, debug_level="DEBUG")`

```python
from litellm import Router
import litellm

litellm.set_verbose = True

router = Router(
    model_list=model_list,
    set_verbose=True,
    debug_level="DEBUG"  # defaults to INFO
)
```

BIN  docs/my-website/img/admin_ui_2.png  (new binary file, not shown; 159 KiB)
BIN  docs/my-website/img/google_oauth2.png  (new binary file, not shown; 351 KiB)
BIN  docs/my-website/img/google_redirect.png  (new binary file, not shown; 297 KiB)
BIN  docs/my-website/img/spend_logs_table.png  (new binary file, not shown; 189 KiB)
BIN  docs/my-website/img/spend_per_api_key.png  (new binary file, not shown; 468 KiB)
BIN  docs/my-website/img/spend_per_user.png  (new binary file, not shown; 249 KiB)

@@ -104,24 +104,49 @@ const sidebars = {
      items: [
        "proxy/quick_start",
        "proxy/configs",
        {
          type: 'link',
          label: '📖 All Endpoints',
          href: 'https://litellm-api.up.railway.app/',
        },
        "proxy/user_keys",
        "proxy/load_balancing",
        "proxy/virtual_keys",
        "proxy/users",
        "proxy/ui",
        "proxy/model_management",
        "proxy/reliability",
        "proxy/caching",
        "proxy/logging",
        "proxy/health",
        "proxy/call_hooks",
        "proxy/rules",
        "proxy/alerting",
        "proxy/streaming_logging",
        "proxy/debugging",
        {
          "type": "category",
          "label": "🔥 Load Balancing",
          "items": [
            "proxy/load_balancing",
            "proxy/reliability",
          ]
        },
        {
          "type": "category",
          "label": "Logging, Alerting, Caching",
          "items": [
            "proxy/logging",
            "proxy/alerting",
            "proxy/streaming_logging",
            "proxy/caching",
          ]
        },
        {
          "type": "category",
          "label": "Admin Controls",
          "items": [
            "proxy/call_hooks",
            "proxy/rules",
          ]
        },
        "proxy/deploy",
        "proxy/cli",
      ]
    },
    "proxy/custom_pricing",
    "routing",
    "rules",
    "set_keys",

@ -2,10 +2,14 @@
|
|||
import threading, requests
|
||||
from typing import Callable, List, Optional, Dict, Union, Any
|
||||
from litellm.caching import Cache
|
||||
from litellm._logging import set_verbose
|
||||
from litellm._logging import set_verbose, _turn_on_debug
|
||||
from litellm.proxy._types import KeyManagementSystem
|
||||
import httpx
|
||||
|
||||
#############################################
|
||||
if set_verbose == True:
|
||||
_turn_on_debug()
|
||||
#############################################
|
||||
input_callback: List[Union[str, Callable]] = []
|
||||
success_callback: List[Union[str, Callable]] = []
|
||||
failure_callback: List[Union[str, Callable]] = []
|
||||
|
@ -58,6 +62,9 @@ cache: Optional[
|
|||
model_alias_map: Dict[str, str] = {}
|
||||
model_group_alias_map: Dict[str, str] = {}
|
||||
max_budget: float = 0.0 # set the max budget across all providers
|
||||
budget_duration: Optional[
|
||||
str
|
||||
] = None # proxy only - resets budget after fixed duration. You can set duration as seconds ("30s"), minutes ("30m"), hours ("30h"), days ("30d").
|
||||
_openai_completion_params = [
|
||||
"functions",
|
||||
"function_call",
|
||||
|
@ -136,6 +143,7 @@ model_cost_map_url: str = "https://raw.githubusercontent.com/BerriAI/litellm/mai
|
|||
suppress_debug_info = False
|
||||
dynamodb_table_name: Optional[str] = None
|
||||
s3_callback_params: Optional[Dict] = None
|
||||
default_key_generate_params: Optional[Dict] = None
|
||||
#### RELIABILITY ####
|
||||
request_timeout: Optional[float] = 6000
|
||||
num_retries: Optional[int] = None # per model endpoint
|
||||
|
|
|
@ -7,20 +7,14 @@ handler = logging.StreamHandler()
|
|||
handler.setLevel(logging.DEBUG)
|
||||
|
||||
# Create a formatter and set it for the handler
|
||||
formatter = logging.Formatter(
|
||||
"\033[92m%(asctime)s - %(name)s:%(levelname)s\033[0m: %(message)s",
|
||||
datefmt="%H:%M:%S",
|
||||
)
|
||||
|
||||
formatter = logging.Formatter("\033[92m%(name)s - %(levelname)s\033[0m: %(message)s")
|
||||
|
||||
handler.setFormatter(formatter)
|
||||
|
||||
|
||||
def print_verbose(print_statement):
|
||||
try:
|
||||
if set_verbose:
|
||||
print(print_statement) # noqa
|
||||
except:
|
||||
pass
|
||||
|
||||
|
||||
verbose_proxy_logger = logging.getLogger("LiteLLM Proxy")
|
||||
verbose_router_logger = logging.getLogger("LiteLLM Router")
|
||||
verbose_logger = logging.getLogger("LiteLLM")
|
||||
|
@ -28,3 +22,18 @@ verbose_logger = logging.getLogger("LiteLLM")
|
|||
# Add the handler to the logger
|
||||
verbose_router_logger.addHandler(handler)
|
||||
verbose_proxy_logger.addHandler(handler)
|
||||
verbose_logger.addHandler(handler)
|
||||
|
||||
|
||||
def _turn_on_debug():
|
||||
verbose_logger.setLevel(level=logging.DEBUG) # set package log to debug
|
||||
verbose_router_logger.setLevel(level=logging.DEBUG) # set router logs to debug
|
||||
verbose_proxy_logger.setLevel(level=logging.DEBUG) # set proxy logs to debug
|
||||
|
||||
|
||||
def print_verbose(print_statement):
|
||||
try:
|
||||
if set_verbose:
|
||||
print(print_statement) # noqa
|
||||
except:
|
||||
pass
|
||||
|
|
|
@ -1,3 +1,12 @@
|
|||
# +-----------------------------------------------+
|
||||
# | |
|
||||
# | NOT PROXY BUDGET MANAGER |
|
||||
# | proxy budget manager is in proxy_server.py |
|
||||
# | |
|
||||
# +-----------------------------------------------+
|
||||
#
|
||||
# Thank you users! We ❤️ you! - Krrish & Ishaan
|
||||
|
||||
import os, json, time
|
||||
import litellm
|
||||
from litellm.utils import ModelResponse
|
||||
|
@ -11,10 +20,12 @@ class BudgetManager:
|
|||
project_name: str,
|
||||
client_type: str = "local",
|
||||
api_base: Optional[str] = None,
|
||||
headers: Optional[dict] = None,
|
||||
):
|
||||
self.client_type = client_type
|
||||
self.project_name = project_name
|
||||
self.api_base = api_base or "https://api.litellm.ai"
|
||||
self.headers = headers or {"Content-Type": "application/json"}
|
||||
## load the data or init the initial dictionaries
|
||||
self.load_data()
|
||||
|
||||
|
@ -43,7 +54,7 @@ class BudgetManager:
|
|||
url = self.api_base + "/get_budget"
|
||||
headers = {"Content-Type": "application/json"}
|
||||
data = {"project_name": self.project_name}
|
||||
response = requests.post(url, headers=headers, json=data)
|
||||
response = requests.post(url, headers=self.headers, json=data)
|
||||
response = response.json()
|
||||
if response["status"] == "error":
|
||||
self.user_dict = (
|
||||
|
@ -201,6 +212,6 @@ class BudgetManager:
|
|||
url = self.api_base + "/set_budget"
|
||||
headers = {"Content-Type": "application/json"}
|
||||
data = {"project_name": self.project_name, "user_dict": self.user_dict}
|
||||
response = requests.post(url, headers=headers, json=data)
|
||||
response = requests.post(url, headers=self.headers, json=data)
|
||||
response = response.json()
|
||||
return response
|
||||
|
|
|
@ -12,10 +12,12 @@ import time, logging
|
|||
import json, traceback, ast, hashlib
|
||||
from typing import Optional, Literal, List, Union, Any
|
||||
from openai._models import BaseModel as OpenAIObject
|
||||
from litellm._logging import verbose_logger
|
||||
|
||||
|
||||
def print_verbose(print_statement):
|
||||
try:
|
||||
verbose_logger.debug(print_statement)
|
||||
if litellm.set_verbose:
|
||||
print(print_statement) # noqa
|
||||
except:
|
||||
|
@ -129,11 +131,13 @@ class S3Cache(BaseCache):
|
|||
s3_aws_secret_access_key=None,
|
||||
s3_aws_session_token=None,
|
||||
s3_config=None,
|
||||
s3_path=None,
|
||||
**kwargs,
|
||||
):
|
||||
import boto3
|
||||
|
||||
self.bucket_name = s3_bucket_name
|
||||
self.key_prefix = s3_path.rstrip("/") + "/" if s3_path else ""
|
||||
# Create an S3 client with custom endpoint URL
|
||||
self.s3_client = boto3.client(
|
||||
"s3",
|
||||
|
@ -155,6 +159,8 @@ class S3Cache(BaseCache):
|
|||
ttl = kwargs.get("ttl", None)
|
||||
# Convert value to JSON before storing in S3
|
||||
serialized_value = json.dumps(value)
|
||||
key = self.key_prefix + key
|
||||
|
||||
if ttl is not None:
|
||||
cache_control = f"immutable, max-age={ttl}, s-maxage={ttl}"
|
||||
import datetime
|
||||
|
@ -171,7 +177,7 @@ class S3Cache(BaseCache):
|
|||
CacheControl=cache_control,
|
||||
ContentType="application/json",
|
||||
ContentLanguage="en",
|
||||
ContentDisposition=f"inline; filename=\"{key}.json\""
|
||||
ContentDisposition=f'inline; filename="{key}.json"',
|
||||
)
|
||||
else:
|
||||
cache_control = "immutable, max-age=31536000, s-maxage=31536000"
|
||||
|
@ -183,7 +189,7 @@ class S3Cache(BaseCache):
|
|||
CacheControl=cache_control,
|
||||
ContentType="application/json",
|
||||
ContentLanguage="en",
|
||||
ContentDisposition=f"inline; filename=\"{key}.json\""
|
||||
ContentDisposition=f'inline; filename="{key}.json"',
|
||||
)
|
||||
except Exception as e:
|
||||
# NON blocking - notify users S3 is throwing an exception
|
||||
|
@ -193,6 +199,8 @@ class S3Cache(BaseCache):
|
|||
import boto3, botocore
|
||||
|
||||
try:
|
||||
key = self.key_prefix + key
|
||||
|
||||
print_verbose(f"Get S3 Cache: key: {key}")
|
||||
# Download the data from S3
|
||||
cached_response = self.s3_client.get_object(
|
||||
|
|
|
@ -8,6 +8,7 @@ from datetime import datetime
|
|||
dotenv.load_dotenv() # Loading env variables using dotenv
|
||||
import traceback
|
||||
from packaging.version import Version
|
||||
from litellm._logging import verbose_logger
|
||||
|
||||
|
||||
class LangFuseLogger:
|
||||
|
@ -93,6 +94,7 @@ class LangFuseLogger:
|
|||
print_verbose(
|
||||
f"Langfuse Layer Logging - final response object: {response_obj}"
|
||||
)
|
||||
verbose_logger.info(f"Langfuse Layer Logging - logging success")
|
||||
except:
|
||||
traceback.print_exc()
|
||||
print_verbose(f"Langfuse Layer Error - {traceback.format_exc()}")
|
||||
|
@ -181,6 +183,8 @@ class LangFuseLogger:
|
|||
if supports_tags:
|
||||
for key, value in metadata.items():
|
||||
tags.append(f"{key}:{value}")
|
||||
if "cache_hit" in kwargs:
|
||||
tags.append(f"cache_hit:{kwargs['cache_hit']}")
|
||||
trace_params.update({"tags": tags})
|
||||
|
||||
trace = self.Langfuse.trace(**trace_params)
|
||||
|
|
|
@ -13,19 +13,22 @@ class LangsmithLogger:
|
|||
# Class variables or attributes
|
||||
def __init__(self):
|
||||
self.langsmith_api_key = os.getenv("LANGSMITH_API_KEY")
|
||||
self.langsmith_project = os.getenv("LANGSMITH_PROJECT", "litellm-completion")
|
||||
self.langsmith_default_run_name = os.getenv(
|
||||
"LANGSMITH_DEFAULT_RUN_NAME", "LLMRun"
|
||||
)
|
||||
|
||||
def log_event(self, kwargs, response_obj, start_time, end_time, print_verbose):
|
||||
# Method definition
|
||||
# inspired by Langsmith http api here: https://github.com/langchain-ai/langsmith-cookbook/blob/main/tracing-examples/rest/rest.ipynb
|
||||
metadata = {}
|
||||
if "litellm_params" in kwargs:
|
||||
metadata = kwargs["litellm_params"].get("metadata", {})
|
||||
metadata = kwargs.get('litellm_params', {}).get("metadata", {}) or {} # if metadata is None
|
||||
|
||||
# set project name and run_name for langsmith logging
|
||||
# users can pass project_name and run name to litellm.completion()
|
||||
# Example: litellm.completion(model, messages, metadata={"project_name": "my-litellm-project", "run_name": "my-langsmith-run"})
|
||||
# if not set litellm will use default project_name = litellm-completion, run_name = LLMRun
|
||||
project_name = metadata.get("project_name", "litellm-completion")
|
||||
run_name = metadata.get("run_name", "LLMRun")
|
||||
# if not set litellm will fallback to the environment variable LANGSMITH_PROJECT, then to the default project_name = litellm-completion, run_name = LLMRun
|
||||
project_name = metadata.get("project_name", self.langsmith_project)
|
||||
run_name = metadata.get("run_name", self.langsmith_default_run_name)
|
||||
print_verbose(
|
||||
f"Langsmith Logging - project_name: {project_name}, run_name {run_name}"
|
||||
)
|
||||
|
|
|
@ -8,7 +8,7 @@ dotenv.load_dotenv() # Loading env variables using dotenv
|
|||
import traceback
|
||||
import datetime, subprocess, sys
|
||||
import litellm, uuid
|
||||
from litellm._logging import print_verbose
|
||||
from litellm._logging import print_verbose, verbose_logger
|
||||
|
||||
|
||||
class S3Logger:
|
||||
|
@ -16,6 +16,7 @@ class S3Logger:
|
|||
def __init__(
|
||||
self,
|
||||
s3_bucket_name=None,
|
||||
s3_path=None,
|
||||
s3_region_name=None,
|
||||
s3_api_version=None,
|
||||
s3_use_ssl=True,
|
||||
|
@ -30,7 +31,9 @@ class S3Logger:
|
|||
import boto3
|
||||
|
||||
try:
|
||||
print_verbose("in init s3 logger")
|
||||
verbose_logger.debug(
|
||||
f"in init s3 logger - s3_callback_params {litellm.s3_callback_params}"
|
||||
)
|
||||
|
||||
if litellm.s3_callback_params is not None:
|
||||
# read in .env variables - example os.environ/AWS_BUCKET_NAME
|
||||
|
@ -41,7 +44,7 @@ class S3Logger:
|
|||
s3_bucket_name = litellm.s3_callback_params.get("s3_bucket_name")
|
||||
s3_region_name = litellm.s3_callback_params.get("s3_region_name")
|
||||
s3_api_version = litellm.s3_callback_params.get("s3_api_version")
|
||||
s3_use_ssl = litellm.s3_callback_params.get("s3_use_ssl")
|
||||
s3_use_ssl = litellm.s3_callback_params.get("s3_use_ssl", True)
|
||||
s3_verify = litellm.s3_callback_params.get("s3_verify")
|
||||
s3_endpoint_url = litellm.s3_callback_params.get("s3_endpoint_url")
|
||||
s3_aws_access_key_id = litellm.s3_callback_params.get(
|
||||
|
@ -57,6 +60,8 @@ class S3Logger:
|
|||
# done reading litellm.s3_callback_params
|
||||
|
||||
self.bucket_name = s3_bucket_name
|
||||
self.s3_path = s3_path
|
||||
verbose_logger.debug(f"s3 logger using endpoint url {s3_endpoint_url}")
|
||||
# Create an S3 client with custom endpoint URL
|
||||
self.s3_client = boto3.client(
|
||||
"s3",
|
||||
|
@ -82,7 +87,9 @@ class S3Logger:
|
|||
|
||||
def log_event(self, kwargs, response_obj, start_time, end_time, print_verbose):
|
||||
try:
|
||||
print_verbose(f"s3 Logging - Enters logging function for model {kwargs}")
|
||||
verbose_logger.debug(
|
||||
f"s3 Logging - Enters logging function for model {kwargs}"
|
||||
)
|
||||
|
||||
# construct payload to send to s3
|
||||
# follows the same params as langfuse.py
|
||||
|
@ -122,8 +129,12 @@ class S3Logger:
|
|||
pass
|
||||
|
||||
s3_object_key = (
|
||||
payload["id"] + "-time=" + str(start_time)
|
||||
(self.s3_path.rstrip("/") + "/" if self.s3_path else "")
|
||||
+ payload["id"]
|
||||
+ "-time="
|
||||
+ str(start_time)
|
||||
) # we need the s3 key to include the time, so we log cache hits too
|
||||
s3_object_key += ".json"
|
||||
|
||||
import json
|
||||
|
||||
|
@ -146,5 +157,5 @@ class S3Logger:
|
|||
return response
|
||||
except Exception as e:
|
||||
traceback.print_exc()
|
||||
print_verbose(f"s3 Layer Error - {str(e)}\n{traceback.format_exc()}")
|
||||
verbose_logger.debug(f"s3 Layer Error - {str(e)}\n{traceback.format_exc()}")
|
||||
pass
|
||||
|
|
|
@ -78,7 +78,7 @@ class AnthropicConfig:
|
|||
|
||||
|
||||
# makes headers for API call
|
||||
def validate_environment(api_key):
|
||||
def validate_environment(api_key, user_headers):
|
||||
if api_key is None:
|
||||
raise ValueError(
|
||||
"Missing Anthropic API Key - A call is being made to anthropic but no key is set either in the environment variables or via params"
|
||||
|
@ -89,6 +89,8 @@ def validate_environment(api_key):
|
|||
"content-type": "application/json",
|
||||
"x-api-key": api_key,
|
||||
}
|
||||
if user_headers is not None and isinstance(user_headers, dict):
|
||||
headers = {**headers, **user_headers}
|
||||
return headers
|
||||
|
||||
|
||||
|
@ -105,8 +107,9 @@ def completion(
|
|||
optional_params=None,
|
||||
litellm_params=None,
|
||||
logger_fn=None,
|
||||
headers={},
|
||||
):
|
||||
headers = validate_environment(api_key)
|
||||
headers = validate_environment(api_key, headers)
|
||||
if model in custom_prompt_dict:
|
||||
# check if the model has a registered custom prompt
|
||||
model_prompt_details = custom_prompt_dict[model]
|
||||
|
@ -139,7 +142,11 @@ def completion(
|
|||
logging_obj.pre_call(
|
||||
input=prompt,
|
||||
api_key=api_key,
|
||||
additional_args={"complete_input_dict": data, "api_base": api_base},
|
||||
additional_args={
|
||||
"complete_input_dict": data,
|
||||
"api_base": api_base,
|
||||
"headers": headers,
|
||||
},
|
||||
)
|
||||
|
||||
## COMPLETION CALL
|
||||
|
|
|
@ -629,12 +629,23 @@ class AzureChatCompletion(BaseLLM):
|
|||
client_session = litellm.aclient_session or httpx.AsyncClient(
|
||||
transport=AsyncCustomHTTPTransport(),
|
||||
)
|
||||
openai_aclient = AsyncAzureOpenAI(
|
||||
azure_client = AsyncAzureOpenAI(
|
||||
http_client=client_session, **azure_client_params
|
||||
)
|
||||
else:
|
||||
openai_aclient = client
|
||||
response = await openai_aclient.images.generate(**data, timeout=timeout)
|
||||
azure_client = client
|
||||
## LOGGING
|
||||
logging_obj.pre_call(
|
||||
input=data["prompt"],
|
||||
api_key=azure_client.api_key,
|
||||
additional_args={
|
||||
"headers": {"api_key": azure_client.api_key},
|
||||
"api_base": azure_client._base_url._uri_reference,
|
||||
"acompletion": True,
|
||||
"complete_input_dict": data,
|
||||
},
|
||||
)
|
||||
response = await azure_client.images.generate(**data, timeout=timeout)
|
||||
stringified_response = response.model_dump()
|
||||
## LOGGING
|
||||
logging_obj.post_call(
|
||||
|
@ -719,7 +730,7 @@ class AzureChatCompletion(BaseLLM):
|
|||
input=prompt,
|
||||
api_key=azure_client.api_key,
|
||||
additional_args={
|
||||
"headers": {"Authorization": f"Bearer {azure_client.api_key}"},
|
||||
"headers": {"api_key": azure_client.api_key},
|
||||
"api_base": azure_client._base_url._uri_reference,
|
||||
"acompletion": False,
|
||||
"complete_input_dict": data,
|
||||
|
|
|
@ -659,9 +659,16 @@ def completion(
|
|||
)
|
||||
|
||||
## CALCULATING USAGE - baseten charges on time, not tokens - have some mapping of cost here.
|
||||
prompt_tokens = len(encoding.encode(prompt))
|
||||
completion_tokens = len(
|
||||
encoding.encode(model_response["choices"][0]["message"].get("content", ""))
|
||||
prompt_tokens = response_metadata.get(
|
||||
"x-amzn-bedrock-input-token-count", len(encoding.encode(prompt))
|
||||
)
|
||||
completion_tokens = response_metadata.get(
|
||||
"x-amzn-bedrock-output-token-count",
|
||||
len(
|
||||
encoding.encode(
|
||||
model_response["choices"][0]["message"].get("content", "")
|
||||
)
|
||||
),
|
||||
)
|
||||
|
||||
model_response["created"] = int(time.time())
|
||||
|
@ -672,6 +679,8 @@ def completion(
|
|||
total_tokens=prompt_tokens + completion_tokens,
|
||||
)
|
||||
model_response.usage = usage
|
||||
model_response._hidden_params["region_name"] = client.meta.region_name
|
||||
print_verbose(f"model_response._hidden_params: {model_response._hidden_params}")
|
||||
return model_response
|
||||
except BedrockError as e:
|
||||
exception_mapping_worked = True
|
||||
|
|
|
@ -43,7 +43,7 @@ class AsyncCustomHTTPTransport(httpx.AsyncHTTPTransport):
|
|||
request=request,
|
||||
)
|
||||
|
||||
time.sleep(int(response.headers.get("retry-after")) or 10)
|
||||
await asyncio.sleep(int(response.headers.get("retry-after") or 10))
|
||||
response = await super().handle_async_request(request)
|
||||
await response.aread()
|
||||
|
||||
|
@ -95,7 +95,6 @@ class CustomHTTPTransport(httpx.HTTPTransport):
|
|||
request.method = "GET"
|
||||
response = super().handle_request(request)
|
||||
response.read()
|
||||
|
||||
timeout_secs: int = 120
|
||||
start_time = time.time()
|
||||
while response.json()["status"] not in ["succeeded", "failed"]:
|
||||
|
@ -112,11 +111,9 @@ class CustomHTTPTransport(httpx.HTTPTransport):
|
|||
content=json.dumps(timeout).encode("utf-8"),
|
||||
request=request,
|
||||
)
|
||||
|
||||
time.sleep(int(response.headers.get("retry-after")) or 10)
|
||||
time.sleep(int(response.headers.get("retry-after", None) or 10))
|
||||
response = super().handle_request(request)
|
||||
response.read()
|
||||
|
||||
if response.json()["status"] == "failed":
|
||||
error_data = response.json()
|
||||
return httpx.Response(
|
||||
|
|
|
@ -120,9 +120,7 @@ def completion(
|
|||
|
||||
## Load Config
|
||||
inference_params = copy.deepcopy(optional_params)
|
||||
inference_params.pop(
|
||||
"stream", None
|
||||
) # palm does not support streaming, so we handle this by fake streaming in main.py
|
||||
stream = inference_params.pop("stream", None)
|
||||
config = litellm.GeminiConfig.get_config()
|
||||
for k, v in config.items():
|
||||
if (
|
||||
|
@ -139,10 +137,18 @@ def completion(
|
|||
## COMPLETION CALL
|
||||
try:
|
||||
_model = genai.GenerativeModel(f"models/{model}")
|
||||
response = _model.generate_content(
|
||||
contents=prompt,
|
||||
generation_config=genai.types.GenerationConfig(**inference_params),
|
||||
)
|
||||
if stream != True:
|
||||
response = _model.generate_content(
|
||||
contents=prompt,
|
||||
generation_config=genai.types.GenerationConfig(**inference_params),
|
||||
)
|
||||
else:
|
||||
response = _model.generate_content(
|
||||
contents=prompt,
|
||||
generation_config=genai.types.GenerationConfig(**inference_params),
|
||||
stream=True,
|
||||
)
|
||||
return response
|
||||
except Exception as e:
|
||||
raise GeminiError(
|
||||
message=str(e),
|
||||
|
@ -177,16 +183,20 @@ def completion(
|
|||
|
||||
try:
|
||||
completion_response = model_response["choices"][0]["message"].get("content")
|
||||
if completion_response is None:
|
||||
if completion_response is None:
|
||||
raise Exception
|
||||
except:
|
||||
original_response = f"response: {response}"
|
||||
if hasattr(response, "candidates"):
|
||||
if hasattr(response, "candidates"):
|
||||
original_response = f"response: {response.candidates}"
|
||||
if "SAFETY" in original_response:
|
||||
original_response += "\nThe candidate content was flagged for safety reasons."
|
||||
if "SAFETY" in original_response:
|
||||
original_response += (
|
||||
"\nThe candidate content was flagged for safety reasons."
|
||||
)
|
||||
elif "RECITATION" in original_response:
|
||||
original_response += "\nThe candidate content was flagged for recitation reasons."
|
||||
original_response += (
|
||||
"\nThe candidate content was flagged for recitation reasons."
|
||||
)
|
||||
raise GeminiError(
|
||||
status_code=400,
|
||||
message=f"No response received. Original response - {original_response}",
|
||||
|
|
|
@ -220,8 +220,10 @@ def get_ollama_response(
|
|||
model_response["choices"][0]["message"] = response_json["message"]
|
||||
model_response["created"] = int(time.time())
|
||||
model_response["model"] = "ollama/" + model
|
||||
prompt_tokens = response_json["prompt_eval_count"] # type: ignore
|
||||
completion_tokens = response_json["eval_count"]
|
||||
prompt_tokens = response_json.get("prompt_eval_count", litellm.token_counter(messages=messages)) # type: ignore
|
||||
completion_tokens = response_json.get(
|
||||
"eval_count", litellm.token_counter(text=response_json["message"])
|
||||
)
|
||||
model_response["usage"] = litellm.Usage(
|
||||
prompt_tokens=prompt_tokens,
|
||||
completion_tokens=completion_tokens,
|
||||
|
@ -320,8 +322,10 @@ async def ollama_acompletion(url, data, model_response, encoding, logging_obj):
|
|||
model_response["choices"][0]["message"] = response_json["message"]
|
||||
model_response["created"] = int(time.time())
|
||||
model_response["model"] = "ollama/" + data["model"]
|
||||
prompt_tokens = response_json["prompt_eval_count"] # type: ignore
|
||||
completion_tokens = response_json["eval_count"]
|
||||
prompt_tokens = response_json.get("prompt_eval_count", litellm.token_counter(messages=data["messages"])) # type: ignore
|
||||
completion_tokens = response_json.get(
|
||||
"eval_count", litellm.token_counter(text=response_json["message"])
|
||||
)
|
||||
model_response["usage"] = litellm.Usage(
|
||||
prompt_tokens=prompt_tokens,
|
||||
completion_tokens=completion_tokens,
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
from typing import Optional, Union, Any
|
||||
import types, time, json
|
||||
import types, time, json, traceback
|
||||
import httpx
|
||||
from .base import BaseLLM
|
||||
from litellm.utils import (
|
||||
|
@ -349,7 +349,7 @@ class OpenAIChatCompletion(BaseLLM):
|
|||
if hasattr(e, "status_code"):
|
||||
raise OpenAIError(status_code=e.status_code, message=str(e))
|
||||
else:
|
||||
raise OpenAIError(status_code=500, message=str(e))
|
||||
raise OpenAIError(status_code=500, message=traceback.format_exc())
|
||||
|
||||
async def acompletion(
|
||||
self,
|
||||
|
@ -706,19 +706,34 @@ class OpenAIChatCompletion(BaseLLM):
|
|||
|
||||
## COMPLETION CALL
|
||||
response = openai_client.images.generate(**data, timeout=timeout) # type: ignore
|
||||
response = response.model_dump() # type: ignore
|
||||
## LOGGING
|
||||
logging_obj.post_call(
|
||||
input=input,
|
||||
input=prompt,
|
||||
api_key=api_key,
|
||||
additional_args={"complete_input_dict": data},
|
||||
original_response=response,
|
||||
)
|
||||
# return response
|
||||
return convert_to_model_response_object(response_object=response.model_dump(), model_response_object=model_response, response_type="image_generation") # type: ignore
|
||||
return convert_to_model_response_object(response_object=response, model_response_object=model_response, response_type="image_generation") # type: ignore
|
||||
except OpenAIError as e:
|
||||
exception_mapping_worked = True
|
||||
## LOGGING
|
||||
logging_obj.post_call(
|
||||
input=prompt,
|
||||
api_key=api_key,
|
||||
additional_args={"complete_input_dict": data},
|
||||
original_response=str(e),
|
||||
)
|
||||
raise e
|
||||
except Exception as e:
|
||||
## LOGGING
|
||||
logging_obj.post_call(
|
||||
input=prompt,
|
||||
api_key=api_key,
|
||||
additional_args={"complete_input_dict": data},
|
||||
original_response=str(e),
|
||||
)
|
||||
if hasattr(e, "status_code"):
|
||||
raise OpenAIError(status_code=e.status_code, message=str(e))
|
||||
else:
|
||||
|
|
|
@ -99,12 +99,16 @@ def ollama_pt(
|
|||
|
||||
|
||||
def mistral_instruct_pt(messages):
|
||||
# Following the Mistral example's https://huggingface.co/docs/transformers/main/chat_templating
|
||||
prompt = custom_prompt(
|
||||
initial_prompt_value="<s>",
|
||||
role_dict={
|
||||
"system": {"pre_message": "[INST]", "post_message": "[/INST]"},
|
||||
"user": {"pre_message": "[INST]", "post_message": "[/INST]"},
|
||||
"assistant": {"pre_message": "[INST]", "post_message": "[/INST]"},
|
||||
"system": {
|
||||
"pre_message": "[INST] \n",
|
||||
"post_message": " [/INST]\n",
|
||||
},
|
||||
"user": {"pre_message": "[INST] ", "post_message": " [/INST]\n"},
|
||||
"assistant": {"pre_message": " ", "post_message": " "},
|
||||
},
|
||||
final_prompt_value="</s>",
|
||||
messages=messages,
|
||||
|
@ -372,6 +376,7 @@ def anthropic_pt(
|
|||
You can "put words in Claude's mouth" by ending with an assistant message.
|
||||
See: https://docs.anthropic.com/claude/docs/put-words-in-claudes-mouth
|
||||
"""
|
||||
|
||||
class AnthropicConstants(Enum):
|
||||
HUMAN_PROMPT = "\n\nHuman: "
|
||||
AI_PROMPT = "\n\nAssistant: "
|
||||
|
@ -394,32 +399,35 @@ def anthropic_pt(
|
|||
prompt += f"{AnthropicConstants.AI_PROMPT.value}"
|
||||
return prompt
|
||||
|
||||
|
||||
|
||||
def _load_image_from_url(image_url):
|
||||
try:
|
||||
from PIL import Image
|
||||
except:
|
||||
raise Exception("gemini image conversion failed please run `pip install Pillow`")
|
||||
raise Exception(
|
||||
"gemini image conversion failed please run `pip install Pillow`"
|
||||
)
|
||||
from io import BytesIO
|
||||
|
||||
try:
|
||||
# Send a GET request to the image URL
|
||||
response = requests.get(image_url)
|
||||
response.raise_for_status() # Raise an exception for HTTP errors
|
||||
|
||||
# Check the response's content type to ensure it is an image
|
||||
content_type = response.headers.get('content-type')
|
||||
if not content_type or 'image' not in content_type:
|
||||
raise ValueError(f"URL does not point to a valid image (content-type: {content_type})")
|
||||
content_type = response.headers.get("content-type")
|
||||
if not content_type or "image" not in content_type:
|
||||
raise ValueError(
|
||||
f"URL does not point to a valid image (content-type: {content_type})"
|
||||
)
|
||||
|
||||
# Load the image from the response content
|
||||
return Image.open(BytesIO(response.content))
|
||||
|
||||
|
||||
except requests.RequestException as e:
|
||||
print(f"Request failed: {e}")
|
||||
except UnidentifiedImageError:
|
||||
print("Cannot identify image file (it may not be a supported image format or might be corrupted).")
|
||||
except ValueError as e:
|
||||
print(e)
|
||||
raise Exception(f"Request failed: {e}")
|
||||
except Exception as e:
|
||||
raise e
|
||||
|
||||
|
||||
def _gemini_vision_convert_messages(messages: list):
|
||||
|
@ -437,10 +445,11 @@ def _gemini_vision_convert_messages(messages: list):
|
|||
try:
|
||||
from PIL import Image
|
||||
except:
|
||||
raise Exception("gemini image conversion failed please run `pip install Pillow`")
|
||||
raise Exception(
|
||||
"gemini image conversion failed please run `pip install Pillow`"
|
||||
)
|
||||
|
||||
try:
|
||||
|
||||
# given messages for gpt-4 vision, convert them for gemini
|
||||
# https://github.com/GoogleCloudPlatform/generative-ai/blob/main/gemini/getting-started/intro_gemini_python.ipynb
|
||||
prompt = ""
|
||||
|
@ -589,7 +598,7 @@ def prompt_factory(
|
|||
if custom_llm_provider == "ollama":
|
||||
return ollama_pt(model=model, messages=messages)
|
||||
elif custom_llm_provider == "anthropic":
|
||||
if any(_ in model for _ in ["claude-2.1","claude-v2:1"]):
|
||||
if any(_ in model for _ in ["claude-2.1", "claude-v2:1"]):
|
||||
return claude_2_1_pt(messages=messages)
|
||||
else:
|
||||
return anthropic_pt(messages=messages)
|
||||
|
|
|
@ -25,6 +25,46 @@ class SagemakerError(Exception):
|
|||
) # Call the base class constructor with the parameters it needs
|
||||
|
||||
|
||||
import io
|
||||
import json
|
||||
|
||||
|
||||
class TokenIterator:
|
||||
def __init__(self, stream):
|
||||
self.byte_iterator = iter(stream)
|
||||
self.buffer = io.BytesIO()
|
||||
self.read_pos = 0
|
||||
self.end_of_data = False
|
||||
|
||||
def __iter__(self):
|
||||
return self
|
||||
|
||||
def __next__(self):
|
||||
try:
|
||||
while True:
|
||||
self.buffer.seek(self.read_pos)
|
||||
line = self.buffer.readline()
|
||||
if line and line[-1] == ord("\n"):
|
||||
response_obj = {"text": "", "is_finished": False}
|
||||
self.read_pos += len(line) + 1
|
||||
full_line = line[:-1].decode("utf-8")
|
||||
line_data = json.loads(full_line.lstrip("data:").rstrip("/n"))
|
||||
if line_data.get("generated_text", None) is not None:
|
||||
self.end_of_data = True
|
||||
response_obj["is_finished"] = True
|
||||
response_obj["text"] = line_data["token"]["text"]
|
||||
return response_obj
|
||||
chunk = next(self.byte_iterator)
|
||||
self.buffer.seek(0, io.SEEK_END)
|
||||
self.buffer.write(chunk["PayloadPart"]["Bytes"])
|
||||
except StopIteration as e:
|
||||
if self.end_of_data == True:
|
||||
raise e # Re-raise StopIteration
|
||||
else:
|
||||
self.end_of_data = True
|
||||
return "data: [DONE]"
|
||||
|
||||
|
||||
class SagemakerConfig:
|
||||
"""
|
||||
Reference: https://d-uuwbxj1u4cnu.studio.us-west-2.sagemaker.aws/jupyter/default/lab/workspaces/auto-q/tree/DemoNotebooks/meta-textgeneration-llama-2-7b-SDK_1.ipynb
|
||||
|
@ -121,7 +161,6 @@ def completion(
|
|||
|
||||
# pop streaming if it's in the optional params as 'stream' raises an error with sagemaker
|
||||
inference_params = deepcopy(optional_params)
|
||||
inference_params.pop("stream", None)
|
||||
|
||||
## Load Config
|
||||
config = litellm.SagemakerConfig.get_config()
|
||||
|
@ -152,6 +191,28 @@ def completion(
|
|||
hf_model_name or model
|
||||
) # pass in hf model name for pulling it's prompt template - (e.g. `hf_model_name="meta-llama/Llama-2-7b-chat-hf` applies the llama2 chat template to the prompt)
|
||||
prompt = prompt_factory(model=hf_model_name, messages=messages)
|
||||
stream = inference_params.pop("stream", None)
|
||||
if stream == True:
|
||||
data = json.dumps(
|
||||
{"inputs": prompt, "parameters": inference_params, "stream": True}
|
||||
).encode("utf-8")
|
||||
## LOGGING
|
||||
request_str = f"""
|
||||
response = client.invoke_endpoint_with_response_stream(
|
||||
EndpointName={model},
|
||||
ContentType="application/json",
|
||||
Body={data},
|
||||
CustomAttributes="accept_eula=true",
|
||||
)
|
||||
""" # type: ignore
|
||||
response = client.invoke_endpoint_with_response_stream(
|
||||
EndpointName=model,
|
||||
ContentType="application/json",
|
||||
Body=data,
|
||||
CustomAttributes="accept_eula=true",
|
||||
)
|
||||
|
||||
return response["Body"]
|
||||
|
||||
data = json.dumps({"inputs": prompt, "parameters": inference_params}).encode(
|
||||
"utf-8"
|
||||
|
|
0
litellm/llms/tokenizers/__init__.py
Normal file
0
litellm/llms/tokenizers/__init__.py
Normal file
166
litellm/main.py
166
litellm/main.py
|
@ -10,12 +10,11 @@
|
|||
import os, openai, sys, json, inspect, uuid, datetime, threading
|
||||
from typing import Any, Literal, Union
|
||||
from functools import partial
|
||||
|
||||
import dotenv, traceback, random, asyncio, time, contextvars
|
||||
from copy import deepcopy
|
||||
import httpx
|
||||
import litellm
|
||||
|
||||
from ._logging import verbose_logger
|
||||
from litellm import ( # type: ignore
|
||||
client,
|
||||
exception_type,
|
||||
|
@ -83,6 +82,7 @@ from litellm.utils import (
|
|||
TextCompletionResponse,
|
||||
TextChoices,
|
||||
EmbeddingResponse,
|
||||
ImageResponse,
|
||||
read_config_args,
|
||||
Choices,
|
||||
Message,
|
||||
|
@ -273,14 +273,10 @@ async def acompletion(
|
|||
else:
|
||||
# Call the synchronous function using run_in_executor
|
||||
response = await loop.run_in_executor(None, func_with_context) # type: ignore
|
||||
# if kwargs.get("stream", False): # return an async generator
|
||||
# return _async_streaming(
|
||||
# response=response,
|
||||
# model=model,
|
||||
# custom_llm_provider=custom_llm_provider,
|
||||
# args=args,
|
||||
# )
|
||||
# else:
|
||||
if isinstance(response, CustomStreamWrapper):
|
||||
response.set_logging_event_loop(
|
||||
loop=loop
|
||||
) # sets the logging event loop if the user does sync streaming (e.g. on proxy for sagemaker calls)
|
||||
return response
|
||||
except Exception as e:
|
||||
custom_llm_provider = custom_llm_provider or "openai"
|
||||
|
@ -343,6 +339,18 @@ def mock_completion(
|
|||
model_response["choices"][0]["message"]["content"] = mock_response
|
||||
model_response["created"] = int(time.time())
|
||||
model_response["model"] = model
|
||||
|
||||
model_response.usage = Usage(
|
||||
prompt_tokens=10, completion_tokens=20, total_tokens=30
|
||||
)
|
||||
|
||||
try:
|
||||
_, custom_llm_provider, _, _ = litellm.utils.get_llm_provider(model=model)
|
||||
model_response._hidden_params["custom_llm_provider"] = custom_llm_provider
|
||||
except:
|
||||
# dont let setting a hidden param block a mock_respose
|
||||
pass
|
||||
|
||||
return model_response
|
||||
|
||||
except:
|
||||
|
@ -445,6 +453,8 @@ def completion(
|
|||
### CUSTOM MODEL COST ###
|
||||
input_cost_per_token = kwargs.get("input_cost_per_token", None)
|
||||
output_cost_per_token = kwargs.get("output_cost_per_token", None)
|
||||
input_cost_per_second = kwargs.get("input_cost_per_second", None)
|
||||
output_cost_per_second = kwargs.get("output_cost_per_second", None)
|
||||
### CUSTOM PROMPT TEMPLATE ###
|
||||
initial_prompt_value = kwargs.get("initial_prompt_value", None)
|
||||
roles = kwargs.get("roles", None)
|
||||
|
@ -522,6 +532,8 @@ def completion(
|
|||
"tpm",
|
||||
"input_cost_per_token",
|
||||
"output_cost_per_token",
|
||||
"input_cost_per_second",
|
||||
"output_cost_per_second",
|
||||
"hf_model_name",
|
||||
"model_info",
|
||||
"proxy_server_request",
|
||||
|
@ -534,10 +546,6 @@ def completion(
|
|||
non_default_params = {
|
||||
k: v for k, v in kwargs.items() if k not in default_params
|
||||
} # model-specific params - pass them straight to the model/provider
|
||||
if mock_response:
|
||||
return mock_completion(
|
||||
model, messages, stream=stream, mock_response=mock_response
|
||||
)
|
||||
if timeout is None:
|
||||
timeout = (
|
||||
kwargs.get("request_timeout", None) or 600
|
||||
|
@ -577,6 +585,10 @@ def completion(
|
|||
)
|
||||
if model_response is not None and hasattr(model_response, "_hidden_params"):
|
||||
model_response._hidden_params["custom_llm_provider"] = custom_llm_provider
|
||||
model_response._hidden_params["region_name"] = kwargs.get(
|
||||
"aws_region_name", None
|
||||
) # support region-based pricing for bedrock
|
||||
|
||||
### REGISTER CUSTOM MODEL PRICING -- IF GIVEN ###
|
||||
if input_cost_per_token is not None and output_cost_per_token is not None:
|
||||
litellm.register_model(
|
||||
|
@ -588,6 +600,19 @@ def completion(
|
|||
}
|
||||
}
|
||||
)
|
||||
if (
|
||||
input_cost_per_second is not None
|
||||
): # time based pricing just needs cost in place
|
||||
output_cost_per_second = output_cost_per_second or 0.0
|
||||
litellm.register_model(
|
||||
{
|
||||
model: {
|
||||
"input_cost_per_second": input_cost_per_second,
|
||||
"output_cost_per_second": output_cost_per_second,
|
||||
"litellm_provider": custom_llm_provider,
|
||||
}
|
||||
}
|
||||
)
|
||||
### BUILD CUSTOM PROMPT TEMPLATE -- IF GIVEN ###
|
||||
custom_prompt_dict = {} # type: ignore
|
||||
if (
|
||||
|
@ -674,6 +699,10 @@ def completion(
|
|||
optional_params=optional_params,
|
||||
litellm_params=litellm_params,
|
||||
)
|
||||
if mock_response:
|
||||
return mock_completion(
|
||||
model, messages, stream=stream, mock_response=mock_response
|
||||
)
|
||||
if custom_llm_provider == "azure":
|
||||
# azure configs
|
||||
api_type = get_secret("AZURE_API_TYPE") or "azure"
|
||||
|
@ -692,9 +721,9 @@ def completion(
|
|||
or get_secret("AZURE_API_KEY")
|
||||
)
|
||||
|
||||
azure_ad_token = optional_params.pop("azure_ad_token", None) or get_secret(
|
||||
"AZURE_AD_TOKEN"
|
||||
)
|
||||
azure_ad_token = optional_params.get("extra_body", {}).pop(
|
||||
"azure_ad_token", None
|
||||
) or get_secret("AZURE_AD_TOKEN")
|
||||
|
||||
headers = headers or litellm.headers
|
||||
|
||||
|
@ -967,6 +996,7 @@ def completion(
|
|||
encoding=encoding, # for calculating input/output tokens
|
||||
api_key=api_key,
|
||||
logging_obj=logging,
|
||||
headers=headers,
|
||||
)
|
||||
if "stream" in optional_params and optional_params["stream"] == True:
|
||||
# don't try to access stream object,
|
||||
|
@ -1376,11 +1406,29 @@ def completion(
|
|||
acompletion=acompletion,
|
||||
custom_prompt_dict=custom_prompt_dict,
|
||||
)
|
||||
if (
|
||||
"stream" in optional_params
|
||||
and optional_params["stream"] == True
|
||||
and acompletion == False
|
||||
):
|
||||
response = CustomStreamWrapper(
|
||||
iter(model_response),
|
||||
model,
|
||||
custom_llm_provider="gemini",
|
||||
logging_obj=logging,
|
||||
)
|
||||
return response
|
||||
response = model_response
|
||||
elif custom_llm_provider == "vertex_ai":
|
||||
vertex_ai_project = litellm.vertex_project or get_secret("VERTEXAI_PROJECT")
|
||||
vertex_ai_location = litellm.vertex_location or get_secret(
|
||||
"VERTEXAI_LOCATION"
|
||||
vertex_ai_project = (
|
||||
optional_params.pop("vertex_ai_project", None)
|
||||
or litellm.vertex_project
|
||||
or get_secret("VERTEXAI_PROJECT")
|
||||
)
|
||||
vertex_ai_location = (
|
||||
optional_params.pop("vertex_ai_location", None)
|
||||
or litellm.vertex_location
|
||||
or get_secret("VERTEXAI_LOCATION")
|
||||
)
|
||||
|
||||
model_response = vertex_ai.completion(
|
||||
|
@ -1471,19 +1519,22 @@ def completion(
|
|||
if (
|
||||
"stream" in optional_params and optional_params["stream"] == True
|
||||
): ## [BETA]
|
||||
# sagemaker does not support streaming as of now so we're faking streaming:
|
||||
# https://discuss.huggingface.co/t/streaming-output-text-when-deploying-on-sagemaker/39611
|
||||
# "SageMaker is currently not supporting streaming responses."
|
||||
|
||||
# fake streaming for sagemaker
|
||||
print_verbose(f"ENTERS SAGEMAKER CUSTOMSTREAMWRAPPER")
|
||||
resp_string = model_response["choices"][0]["message"]["content"]
|
||||
from .llms.sagemaker import TokenIterator
|
||||
|
||||
tokenIterator = TokenIterator(model_response)
|
||||
response = CustomStreamWrapper(
|
||||
resp_string,
|
||||
model,
|
||||
completion_stream=tokenIterator,
|
||||
model=model,
|
||||
custom_llm_provider="sagemaker",
|
||||
logging_obj=logging,
|
||||
)
|
||||
## LOGGING
|
||||
logging.post_call(
|
||||
input=messages,
|
||||
api_key=None,
|
||||
original_response=response,
|
||||
)
|
||||
return response
|
||||
|
||||
## RESPONSE OBJECT
|
||||
|
@ -2176,6 +2227,7 @@ def embedding(
|
|||
model,
|
||||
input=[],
|
||||
# Optional params
|
||||
dimensions: Optional[int] = None,
|
||||
timeout=600, # default to 10 minutes
|
||||
# set api_base, api_version, api_key
|
||||
api_base: Optional[str] = None,
|
||||
|
@ -2196,6 +2248,7 @@ def embedding(
|
|||
Parameters:
|
||||
- model: The embedding model to use.
|
||||
- input: The input for which embeddings are to be generated.
|
||||
- dimensions: The number of dimensions the resulting output embeddings should have. Only supported in text-embedding-3 and later models.
|
||||
- timeout: The timeout value for the API call, default 10 mins
|
||||
- litellm_call_id: The call ID for litellm logging.
|
||||
- litellm_logging_obj: The litellm logging object.
|
||||
|
@ -2222,8 +2275,14 @@ def embedding(
|
|||
encoding_format = kwargs.get("encoding_format", None)
|
||||
proxy_server_request = kwargs.get("proxy_server_request", None)
|
||||
aembedding = kwargs.get("aembedding", None)
|
||||
### CUSTOM MODEL COST ###
|
||||
input_cost_per_token = kwargs.get("input_cost_per_token", None)
|
||||
output_cost_per_token = kwargs.get("output_cost_per_token", None)
|
||||
input_cost_per_second = kwargs.get("input_cost_per_second", None)
|
||||
output_cost_per_second = kwargs.get("output_cost_per_second", None)
|
||||
openai_params = [
|
||||
"user",
|
||||
"dimensions",
|
||||
"request_timeout",
|
||||
"api_base",
|
||||
"api_version",
|
||||
|
@ -2270,6 +2329,8 @@ def embedding(
|
|||
"tpm",
|
||||
"input_cost_per_token",
|
||||
"output_cost_per_token",
|
||||
"input_cost_per_second",
|
||||
"output_cost_per_second",
|
||||
"hf_model_name",
|
||||
"proxy_server_request",
|
||||
"model_info",
|
||||
|
@ -2290,11 +2351,35 @@ def embedding(
|
|||
api_key=api_key,
|
||||
)
|
||||
optional_params = get_optional_params_embeddings(
|
||||
model=model,
|
||||
user=user,
|
||||
dimensions=dimensions,
|
||||
encoding_format=encoding_format,
|
||||
custom_llm_provider=custom_llm_provider,
|
||||
**non_default_params,
|
||||
)
|
||||
### REGISTER CUSTOM MODEL PRICING -- IF GIVEN ###
|
||||
if input_cost_per_token is not None and output_cost_per_token is not None:
|
||||
litellm.register_model(
|
||||
{
|
||||
model: {
|
||||
"input_cost_per_token": input_cost_per_token,
|
||||
"output_cost_per_token": output_cost_per_token,
|
||||
"litellm_provider": custom_llm_provider,
|
||||
}
|
||||
}
|
||||
)
|
||||
if input_cost_per_second is not None: # time based pricing just needs cost in place
|
||||
output_cost_per_second = output_cost_per_second or 0.0
|
||||
litellm.register_model(
|
||||
{
|
||||
model: {
|
||||
"input_cost_per_second": input_cost_per_second,
|
||||
"output_cost_per_second": output_cost_per_second,
|
||||
"litellm_provider": custom_llm_provider,
|
||||
}
|
||||
}
|
||||
)
|
||||
try:
|
||||
response = None
|
||||
logging = litellm_logging_obj
|
||||
|
@ -2916,6 +3001,7 @@ def image_generation(
|
|||
else:
|
||||
model = "dall-e-2"
|
||||
custom_llm_provider = "openai" # default to dall-e-2 on openai
|
||||
model_response._hidden_params["model"] = model
|
||||
openai_params = [
|
||||
"user",
|
||||
"request_timeout",
|
||||
|
@ -2989,7 +3075,7 @@ def image_generation(
|
|||
custom_llm_provider=custom_llm_provider,
|
||||
**non_default_params,
|
||||
)
|
||||
logging = litellm_logging_obj
|
||||
logging: Logging = litellm_logging_obj
|
||||
logging.update_environment_variables(
|
||||
model=model,
|
||||
user=user,
|
||||
|
@ -3089,6 +3175,9 @@ async def ahealth_check(
|
|||
if model is None:
|
||||
raise Exception("model not set")
|
||||
|
||||
if model in litellm.model_cost and mode is None:
|
||||
mode = litellm.model_cost[model]["mode"]
|
||||
|
||||
model, custom_llm_provider, _, _ = get_llm_provider(model=model)
|
||||
mode = mode or "chat" # default to chat completion calls
|
||||
|
||||
|
@ -3263,8 +3352,20 @@ def stream_chunk_builder_text_completion(chunks: list, messages: Optional[List]
|
|||
return response
|
||||
|
||||
|
||||
def stream_chunk_builder(chunks: list, messages: Optional[list] = None):
|
||||
def stream_chunk_builder(
|
||||
chunks: list, messages: Optional[list] = None, start_time=None, end_time=None
|
||||
):
|
||||
model_response = litellm.ModelResponse()
|
||||
### SORT CHUNKS BASED ON CREATED ORDER ##
|
||||
print_verbose("Goes into checking if chunk has hiddden created at param")
|
||||
if chunks[0]._hidden_params.get("created_at", None):
|
||||
print_verbose("Chunks have a created at hidden param")
|
||||
# Sort chunks based on created_at in ascending order
|
||||
chunks = sorted(
|
||||
chunks, key=lambda x: x._hidden_params.get("created_at", float("inf"))
|
||||
)
|
||||
print_verbose("Chunks sorted")
|
||||
|
||||
# set hidden params from chunk to model_response
|
||||
if model_response is not None and hasattr(model_response, "_hidden_params"):
|
||||
model_response._hidden_params = chunks[0].get("_hidden_params", {})
|
||||
|
@ -3438,5 +3539,8 @@ def stream_chunk_builder(chunks: list, messages: Optional[list] = None):
|
|||
response["usage"]["prompt_tokens"] + response["usage"]["completion_tokens"]
|
||||
)
|
||||
return convert_to_model_response_object(
|
||||
response_object=response, model_response_object=model_response
|
||||
response_object=response,
|
||||
model_response_object=model_response,
|
||||
start_time=start_time,
|
||||
end_time=end_time,
|
||||
)
|
||||
|
|
|
@ -1,8 +1,8 @@
|
|||
from pydantic import BaseModel, Extra, Field, root_validator
|
||||
from pydantic import BaseModel, Extra, Field, root_validator, Json
|
||||
import enum
|
||||
from typing import Optional, List, Union, Dict, Literal
|
||||
from typing import Optional, List, Union, Dict, Literal, Any
|
||||
from datetime import datetime
|
||||
import uuid, json
|
||||
import uuid, json, sys, os
|
||||
|
||||
|
||||
class LiteLLMBase(BaseModel):
|
||||
|
@ -13,7 +13,7 @@ class LiteLLMBase(BaseModel):
|
|||
def json(self, **kwargs):
|
||||
try:
|
||||
return self.model_dump() # noqa
|
||||
except:
|
||||
except Exception as e:
|
||||
# if using pydantic v1
|
||||
return self.dict()
|
||||
|
||||
|
@ -122,27 +122,59 @@ class ModelParams(LiteLLMBase):
|
|||
return values
|
||||
|
||||
|
||||
class GenerateKeyRequest(LiteLLMBase):
|
||||
duration: Optional[str] = "1h"
|
||||
class GenerateRequestBase(LiteLLMBase):
|
||||
"""
|
||||
Overlapping schema between key and user generate/update requests
|
||||
"""
|
||||
|
||||
models: Optional[list] = []
|
||||
spend: Optional[float] = 0
|
||||
max_budget: Optional[float] = None
|
||||
user_id: Optional[str] = None
|
||||
team_id: Optional[str] = None
|
||||
max_parallel_requests: Optional[int] = None
|
||||
metadata: Optional[dict] = {}
|
||||
tpm_limit: Optional[int] = None
|
||||
rpm_limit: Optional[int] = None
|
||||
budget_duration: Optional[str] = None
|
||||
|
||||
|
||||
class GenerateKeyRequest(GenerateRequestBase):
|
||||
key_alias: Optional[str] = None
|
||||
duration: Optional[str] = None
|
||||
aliases: Optional[dict] = {}
|
||||
config: Optional[dict] = {}
|
||||
spend: Optional[float] = 0
|
||||
user_id: Optional[str] = None
|
||||
max_parallel_requests: Optional[int] = None
metadata: Optional[dict] = {}

class UpdateKeyRequest(LiteLLMBase):
class GenerateKeyResponse(GenerateKeyRequest):
key: str
key_name: Optional[str] = None
expires: Optional[datetime]
user_id: str

@root_validator(pre=True)
def set_model_info(cls, values):
if values.get("token") is not None:
values.update({"key": values.get("token")})
dict_fields = ["metadata", "aliases", "config"]
for field in dict_fields:
value = values.get(field)
if value is not None and isinstance(value, str):
try:
values[field] = json.loads(value)
except json.JSONDecodeError:
raise ValueError(f"Field {field} should be a valid dictionary")

return values


class UpdateKeyRequest(GenerateKeyRequest):
# Note: the defaults of all Params here MUST BE NONE
# else they will get overwritten
key: str
duration: Optional[str] = None
models: Optional[list] = None
aliases: Optional[dict] = None
config: Optional[dict] = None
spend: Optional[float] = None
user_id: Optional[str] = None
max_parallel_requests: Optional[int] = None
metadata: Optional[dict] = {}
metadata: Optional[dict] = None


class UserAPIKeyAuth(LiteLLMBase):  # the expected response object for user api key auth

@@ -155,20 +187,17 @@ class UserAPIKeyAuth(LiteLLMBase): # the expected response object for user api
aliases: dict = {}
config: dict = {}
spend: Optional[float] = 0
max_budget: Optional[float] = None
user_id: Optional[str] = None
max_parallel_requests: Optional[int] = None
duration: str = "1h"
metadata: dict = {}


class GenerateKeyResponse(LiteLLMBase):
key: str
expires: Optional[datetime]
user_id: str
tpm_limit: Optional[int] = None
rpm_limit: Optional[int] = None


class DeleteKeyRequest(LiteLLMBase):
keys: List[str]
keys: List


class NewUserRequest(GenerateKeyRequest):

@@ -179,6 +208,14 @@ class NewUserResponse(GenerateKeyResponse):
max_budget: Optional[float] = None


class UpdateUserRequest(GenerateRequestBase):
# Note: the defaults of all Params here MUST BE NONE
# else they will get overwritten
user_id: str
spend: Optional[float] = None
metadata: Optional[dict] = None


class KeyManagementSystem(enum.Enum):
GOOGLE_KMS = "google_kms"
AZURE_KEY_VAULT = "azure_key_vault"

@@ -194,6 +231,7 @@ class DynamoDBArgs(LiteLLMBase):
user_table_name: str = "LiteLLM_UserTable"
key_table_name: str = "LiteLLM_VerificationToken"
config_table_name: str = "LiteLLM_Config"
spend_table_name: str = "LiteLLM_SpendLogs"


class ConfigGeneralSettings(LiteLLMBase):

@@ -283,7 +321,10 @@ class ConfigYAML(LiteLLMBase):

class LiteLLM_VerificationToken(LiteLLMBase):
token: str
key_name: Optional[str] = None
key_alias: Optional[str] = None
spend: float = 0.0
max_budget: Optional[float] = None
expires: Union[str, None]
models: List[str]
aliases: Dict[str, str] = {}

@@ -291,6 +332,10 @@ class LiteLLM_VerificationToken(LiteLLMBase):
user_id: Union[str, None]
max_parallel_requests: Union[int, None]
metadata: Dict[str, str] = {}
tpm_limit: Optional[int] = None
rpm_limit: Optional[int] = None
budget_duration: Optional[str] = None
budget_reset_at: Optional[datetime] = None


class LiteLLM_Config(LiteLLMBase):

@@ -310,5 +355,22 @@ class LiteLLM_UserTable(LiteLLMBase):
if values.get("spend") is None:
values.update({"spend": 0.0})
if values.get("models") is None:
values.update({"models", []})
values.update({"models": []})
return values


class LiteLLM_SpendLogs(LiteLLMBase):
request_id: str
api_key: str
model: Optional[str] = ""
call_type: str
spend: Optional[float] = 0.0
total_tokens: Optional[int] = 0
prompt_tokens: Optional[int] = 0
completion_tokens: Optional[int] = 0
startTime: Union[str, datetime, None]
endTime: Union[str, datetime, None]
user: Optional[str] = ""
metadata: Optional[Json] = {}
cache_hit: Optional[str] = "False"
cache_key: Optional[str] = None

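The `set_model_info` validator above quietly accepts stringified JSON for the `metadata`, `aliases`, and `config` fields. A minimal sketch of that coercion in isolation (the model here is a stand-in for illustration, not the proxy's actual class):

from pydantic import BaseModel, root_validator
from typing import Optional
import json

class KeyFields(BaseModel):
    metadata: Optional[dict] = None

    @root_validator(pre=True)
    def parse_json_strings(cls, values):
        v = values.get("metadata")
        if isinstance(v, str):
            try:
                values["metadata"] = json.loads(v)  # accept '{"team": "core"}' as well as a dict
            except json.JSONDecodeError:
                raise ValueError("Field metadata should be a valid dictionary")
        return values

print(KeyFields(metadata='{"team": "core"}').metadata)  # {'team': 'core'}
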
@@ -98,7 +98,7 @@ def list_models():
st.error(f"An error occurred while requesting models: {e}")
else:
st.warning(
"Please configure the Proxy Endpoint and Proxy Key on the Proxy Setup page."
f"Please configure the Proxy Endpoint and Proxy Key on the Proxy Setup page. Currently set Proxy Endpoint: {st.session_state.get('api_url', None)} and Proxy Key: {st.session_state.get('proxy_key', None)}"
)

@@ -151,7 +151,7 @@ def create_key():
raise e
else:
st.warning(
"Please configure the Proxy Endpoint and Proxy Key on the Proxy Setup page."
f"Please configure the Proxy Endpoint and Proxy Key on the Proxy Setup page. Currently set Proxy Endpoint: {st.session_state.get('api_url', None)} and Proxy Key: {st.session_state.get('proxy_key', None)}"
)

@@ -5,6 +5,7 @@ from litellm.proxy._types import (
LiteLLM_Config,
LiteLLM_UserTable,
)
from litellm.proxy.utils import hash_token
from litellm import get_secret
from typing import Any, List, Literal, Optional, Union
import json

@@ -131,10 +132,27 @@ class DynamoDBWrapper(CustomDB):
raise Exception(
f"Failed to create table - {self.database_arguments.config_table_name}.\nPlease create a new table called {self.database_arguments.config_table_name}\nAND set `hash_key` as 'param_name'"
)

## Spend
try:
verbose_proxy_logger.debug("DynamoDB Wrapper - Creating Spend Table")
error_occurred = False
table = client.table(self.database_arguments.spend_table_name)
if not await table.exists():
await table.create(
self.throughput_type,
KeySchema(hash_key=KeySpec("request_id", KeyType.string)),
)
except Exception as e:
error_occurred = True
if error_occurred == True:
raise Exception(
f"Failed to create table - {self.database_arguments.spend_table_name}.\nPlease create a new table called {self.database_arguments.spend_table_name}\nAND set `hash_key` as 'request_id'"
)
verbose_proxy_logger.debug("DynamoDB Wrapper - Done connecting()")

async def insert_data(
self, value: Any, table_name: Literal["user", "key", "config"]
self, value: Any, table_name: Literal["user", "key", "config", "spend"]
):
from aiodynamo.client import Client
from aiodynamo.credentials import Credentials, StaticCredentials

@@ -166,8 +184,13 @@ class DynamoDBWrapper(CustomDB):
table = client.table(self.database_arguments.key_table_name)
elif table_name == "config":
table = client.table(self.database_arguments.config_table_name)
elif table_name == "spend":
table = client.table(self.database_arguments.spend_table_name)

value = value.copy()
for k, v in value.items():
if k == "token" and value[k].startswith("sk-"):
value[k] = hash_token(token=v)
if isinstance(v, datetime):
value[k] = v.isoformat()

@@ -224,6 +247,10 @@ class DynamoDBWrapper(CustomDB):
and isinstance(v, str)
):
new_response[k] = json.loads(v)
elif (k == "tpm_limit" or k == "rpm_limit") and isinstance(
v, float
):
new_response[k] = int(v)
else:
new_response[k] = v
new_response = LiteLLM_VerificationToken(**new_response)

@@ -281,10 +308,13 @@ class DynamoDBWrapper(CustomDB):
# Initialize an empty UpdateExpression

actions: List = []
value = value.copy()
for k, v in value.items():
# Convert datetime object to ISO8601 string
if isinstance(v, datetime):
v = v.isoformat()
if k == "token" and value[k].startswith("sk-"):
value[k] = hash_token(token=v)

# Accumulate updates
actions.append((F(k), Value(value=v)))

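Before any write, `insert_data` hashes raw `sk-` tokens and converts datetimes to ISO-8601 strings, since DynamoDB stores neither Python datetimes nor, by policy here, plaintext keys. The same rules in isolation (a sketch; `hash_token` is the helper imported above):

from datetime import datetime
from litellm.proxy.utils import hash_token

row = {"token": "sk-abc123", "expires": datetime(2024, 1, 31)}
row = row.copy()
for k, v in row.items():
    if k == "token" and v.startswith("sk-"):
        row[k] = hash_token(token=v)   # store only the SHA-256 digest
    if isinstance(v, datetime):
        row[k] = v.isoformat()         # -> "2024-01-31T00:00:00"
print(row)
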
@@ -1,4 +1,4 @@
from litellm.proxy._types import UserAPIKeyAuth
from litellm.proxy._types import UserAPIKeyAuth, GenerateKeyRequest
from fastapi import Request
from dotenv import load_dotenv
import os

@@ -14,3 +14,40 @@ async def user_api_key_auth(request: Request, api_key: str) -> UserAPIKeyAuth:
raise Exception
except:
raise Exception


async def generate_key_fn(data: GenerateKeyRequest):
"""
Asynchronously decides if a key should be generated or not based on the provided data.

Args:
data (GenerateKeyRequest): The data to be used for decision making.

Returns:
bool: True if a key should be generated, False otherwise.
"""
# decide if a key should be generated or not
data_json = data.json()  # type: ignore

# Unpacking variables
team_id = data_json.get("team_id")
duration = data_json.get("duration")
models = data_json.get("models")
aliases = data_json.get("aliases")
config = data_json.get("config")
spend = data_json.get("spend")
user_id = data_json.get("user_id")
max_parallel_requests = data_json.get("max_parallel_requests")
metadata = data_json.get("metadata")
tpm_limit = data_json.get("tpm_limit")
rpm_limit = data_json.get("rpm_limit")

if team_id is not None and len(team_id) > 0:
return {
"decision": True,
}
else:
return {
"decision": False,
"message": "This violates LiteLLM Proxy Rules. No team id provided.",
}

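This hook is meant to be referenced from the proxy config so key generation can be vetoed per request. A hedged sketch of exercising it directly (it assumes `GenerateKeyRequest` accepts a `team_id` field, since the hook reads one via `.get`):

import asyncio
from litellm.proxy._types import GenerateKeyRequest

result = asyncio.run(generate_key_fn(GenerateKeyRequest(team_id="team-123")))
print(result)  # {"decision": True}

result = asyncio.run(generate_key_fn(GenerateKeyRequest()))
print(result["message"])  # explains the rejection: no team id provided
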
@@ -1,9 +1,12 @@
from typing import Optional
import litellm
import litellm, traceback, sys
from litellm.caching import DualCache
from litellm.proxy._types import UserAPIKeyAuth
from litellm.integrations.custom_logger import CustomLogger
from fastapi import HTTPException
from litellm._logging import verbose_proxy_logger
from litellm import ModelResponse
from datetime import datetime


class MaxParallelRequestsHandler(CustomLogger):

@@ -14,8 +17,7 @@ class MaxParallelRequestsHandler(CustomLogger):
pass

def print_verbose(self, print_statement):
if litellm.set_verbose is True:
print(print_statement)  # noqa
verbose_proxy_logger.debug(print_statement)

async def async_pre_call_hook(
self,

@@ -26,25 +28,56 @@ class MaxParallelRequestsHandler(CustomLogger):
):
self.print_verbose(f"Inside Max Parallel Request Pre-Call Hook")
api_key = user_api_key_dict.api_key
max_parallel_requests = user_api_key_dict.max_parallel_requests
max_parallel_requests = user_api_key_dict.max_parallel_requests or sys.maxsize
tpm_limit = user_api_key_dict.tpm_limit or sys.maxsize
rpm_limit = user_api_key_dict.rpm_limit or sys.maxsize

if api_key is None:
return

if max_parallel_requests is None:
if (
max_parallel_requests == sys.maxsize
and tpm_limit == sys.maxsize
and rpm_limit == sys.maxsize
):
return

self.user_api_key_cache = cache  # save the api key cache for updating the value
# ------------
# Setup values
# ------------

current_date = datetime.now().strftime("%Y-%m-%d")
current_hour = datetime.now().strftime("%H")
current_minute = datetime.now().strftime("%M")
precise_minute = f"{current_date}-{current_hour}-{current_minute}"

request_count_api_key = f"{api_key}::{precise_minute}::request_count"

# CHECK IF REQUEST ALLOWED
request_count_api_key = f"{api_key}_request_count"
current = cache.get_cache(key=request_count_api_key)
current = cache.get_cache(
key=request_count_api_key
)  # {"current_requests": 1, "current_tpm": 1, "current_rpm": 10}
self.print_verbose(f"current: {current}")
if current is None:
cache.set_cache(request_count_api_key, 1)
elif int(current) < max_parallel_requests:
new_val = {
"current_requests": 1,
"current_tpm": 0,
"current_rpm": 0,
}
cache.set_cache(request_count_api_key, new_val)
elif (
int(current["current_requests"]) < max_parallel_requests
and current["current_tpm"] < tpm_limit
and current["current_rpm"] < rpm_limit
):
# Increase count for this token
cache.set_cache(request_count_api_key, int(current) + 1)
new_val = {
"current_requests": current["current_requests"] + 1,
"current_tpm": current["current_tpm"],
"current_rpm": current["current_rpm"],
}
cache.set_cache(request_count_api_key, new_val)
else:
raise HTTPException(
status_code=429, detail="Max parallel request limit reached."

@@ -52,7 +85,7 @@ class MaxParallelRequestsHandler(CustomLogger):

async def async_log_success_event(self, kwargs, response_obj, start_time, end_time):
try:
self.print_verbose(f"INSIDE ASYNC SUCCESS LOGGING")
self.print_verbose(f"INSIDE parallel request limiter ASYNC SUCCESS LOGGING")
user_api_key = kwargs["litellm_params"]["metadata"]["user_api_key"]
if user_api_key is None:
return

@@ -60,29 +93,50 @@ class MaxParallelRequestsHandler(CustomLogger):
if self.user_api_key_cache is None:
return

request_count_api_key = f"{user_api_key}_request_count"
# check if it has collected an entire stream response
self.print_verbose(
f"'complete_streaming_response' is in kwargs: {'complete_streaming_response' in kwargs}"
)
if "complete_streaming_response" in kwargs or kwargs["stream"] != True:
# Decrease count for this token
current = (
self.user_api_key_cache.get_cache(key=request_count_api_key) or 1
)
new_val = current - 1
self.print_verbose(f"updated_value in success call: {new_val}")
self.user_api_key_cache.set_cache(request_count_api_key, new_val)
# ------------
# Setup values
# ------------

current_date = datetime.now().strftime("%Y-%m-%d")
current_hour = datetime.now().strftime("%H")
current_minute = datetime.now().strftime("%M")
precise_minute = f"{current_date}-{current_hour}-{current_minute}"

total_tokens = 0

if isinstance(response_obj, ModelResponse):
total_tokens = response_obj.usage.total_tokens

request_count_api_key = f"{user_api_key}::{precise_minute}::request_count"

current = self.user_api_key_cache.get_cache(key=request_count_api_key) or {
"current_requests": 1,
"current_tpm": total_tokens,
"current_rpm": 1,
}

# ------------
# Update usage
# ------------

new_val = {
"current_requests": current["current_requests"] - 1,
"current_tpm": current["current_tpm"] + total_tokens,
"current_rpm": current["current_rpm"] + 1,
}

self.print_verbose(f"updated_value in success call: {new_val}")
self.user_api_key_cache.set_cache(
request_count_api_key, new_val, ttl=60
)  # store in cache for 1 min.
except Exception as e:
self.print_verbose(e)  # noqa

async def async_log_failure_call(
self, user_api_key_dict: UserAPIKeyAuth, original_exception: Exception
):
async def async_log_failure_event(self, kwargs, response_obj, start_time, end_time):
try:
self.print_verbose(f"Inside Max Parallel Request Failure Hook")
api_key = user_api_key_dict.api_key
if api_key is None:
user_api_key = kwargs["litellm_params"]["metadata"]["user_api_key"]
if user_api_key is None:
return

if self.user_api_key_cache is None:

@@ -90,19 +144,46 @@ class MaxParallelRequestsHandler(CustomLogger):

## decrement call count if call failed
if (
hasattr(original_exception, "status_code")
and original_exception.status_code == 429
and "Max parallel request limit reached" in str(original_exception)
hasattr(kwargs["exception"], "status_code")
and kwargs["exception"].status_code == 429
and "Max parallel request limit reached" in str(kwargs["exception"])
):
pass  # ignore failed calls due to max limit being reached
else:
request_count_api_key = f"{api_key}_request_count"
# Decrease count for this token
current = (
self.user_api_key_cache.get_cache(key=request_count_api_key) or 1
# ------------
# Setup values
# ------------

current_date = datetime.now().strftime("%Y-%m-%d")
current_hour = datetime.now().strftime("%H")
current_minute = datetime.now().strftime("%M")
precise_minute = f"{current_date}-{current_hour}-{current_minute}"

request_count_api_key = (
f"{user_api_key}::{precise_minute}::request_count"
)
new_val = current - 1

# ------------
# Update usage
# ------------

current = self.user_api_key_cache.get_cache(
key=request_count_api_key
) or {
"current_requests": 1,
"current_tpm": 0,
"current_rpm": 0,
}

new_val = {
"current_requests": current["current_requests"] - 1,
"current_tpm": current["current_tpm"],
"current_rpm": current["current_rpm"],
}

self.print_verbose(f"updated_value in failure call: {new_val}")
self.user_api_key_cache.set_cache(request_count_api_key, new_val)
self.user_api_key_cache.set_cache(
request_count_api_key, new_val, ttl=60
)  # save in cache for up to 1 min.
except Exception as e:
self.print_verbose(f"An exception occurred - {str(e)}")  # noqa
print(f"An exception occurred - {str(e)}")  # noqa

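The limiter keys every counter to the calling key plus the current minute (`{api_key}::{YYYY-MM-DD-HH-MM}::request_count`) and writes it with a 60-second TTL, so stale minute-buckets expire on their own. A toy version of that bucketing, with a plain dict standing in for litellm's DualCache:

from datetime import datetime

cache = {}  # stand-in for the DualCache; the real one also takes ttl=60
api_key, tpm_limit, rpm_limit, max_parallel = "sk-hashed", 1000, 60, 5

precise_minute = datetime.now().strftime("%Y-%m-%d-%H-%M")
bucket_key = f"{api_key}::{precise_minute}::request_count"

current = cache.get(bucket_key) or {"current_requests": 0, "current_tpm": 0, "current_rpm": 0}
if (
    current["current_requests"] < max_parallel
    and current["current_tpm"] < tpm_limit
    and current["current_rpm"] < rpm_limit
):
    current["current_requests"] += 1  # success/failure hooks later decrement this
    cache[bucket_key] = current
else:
    raise RuntimeError("429: limit reached for this minute")
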
@@ -157,6 +157,12 @@ def is_port_in_use(port):
type=int,
help="Number of requests to hit async endpoint with",
)
@click.option(
"--run_gunicorn",
default=False,
is_flag=True,
help="Starts proxy via gunicorn, instead of uvicorn (better for managing multiple workers)",
)
@click.option("--local", is_flag=True, default=False, help="for local debugging")
def run_server(
host,

@@ -186,21 +192,32 @@ def run_server(
use_queue,
health,
version,
run_gunicorn,
):
global feature_telemetry
args = locals()
if local:
from proxy_server import app, save_worker_config, usage_telemetry
from proxy_server import app, save_worker_config, usage_telemetry, ProxyConfig
else:
try:
from .proxy_server import app, save_worker_config, usage_telemetry
from .proxy_server import (
app,
save_worker_config,
usage_telemetry,
ProxyConfig,
)
except ImportError as e:
if "litellm[proxy]" in str(e):
# user is missing a proxy dependency, ask them to pip install litellm[proxy]
raise e
else:
# this is just a local/relative import error, user git cloned litellm
from proxy_server import app, save_worker_config, usage_telemetry
from proxy_server import (
app,
save_worker_config,
usage_telemetry,
ProxyConfig,
)
feature_telemetry = usage_telemetry
if version == True:
pkg_version = importlib.metadata.version("litellm")

@@ -373,16 +390,16 @@ def run_server(
read from there and save it to os.env['DATABASE_URL']
"""
try:
import yaml
import yaml, asyncio
except:
raise ImportError(
"yaml needs to be imported. Run - `pip install 'litellm[proxy]'`"
)

if os.path.exists(config):
with open(config, "r") as config_file:
config = yaml.safe_load(config_file)
general_settings = config.get("general_settings", {})
proxy_config = ProxyConfig()
_, _, general_settings = asyncio.run(
proxy_config.load_config(router=None, config_file_path=config)
)
database_url = general_settings.get("database_url", None)
if database_url and database_url.startswith("os.environ/"):
original_dir = os.getcwd()

@@ -418,6 +435,7 @@ def run_server(
break  # Exit the loop if the subprocess succeeds
except subprocess.CalledProcessError as e:
print(f"Error: {e}")
time.sleep(random.randrange(start=1, stop=5))
finally:
os.chdir(original_dir)
else:

@@ -428,9 +446,9 @@ def run_server(
port = random.randint(1024, 49152)
from litellm.proxy.proxy_server import app

if os.name == "nt":
if run_gunicorn == False:
uvicorn.run(app, host=host, port=port)  # run uvicorn
else:
elif run_gunicorn == True:
import gunicorn.app.base

# Gunicorn Application Class

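The `--run_gunicorn` path relies on gunicorn's documented custom-application pattern: subclass `BaseApplication`, feed it an options dict, and return the ASGI/WSGI app from `load()`. A minimal sketch of that pattern (the class name, worker count, and bind address are illustrative, not the CLI's exact wiring):

import gunicorn.app.base

class StandaloneApplication(gunicorn.app.base.BaseApplication):
    def __init__(self, app, options=None):
        self.options = options or {}
        self.application = app
        super().__init__()

    def load_config(self):
        # copy recognized options (workers, bind, ...) into gunicorn's config
        for key, value in self.options.items():
            if key in self.cfg.settings and value is not None:
                self.cfg.set(key.lower(), value)

    def load(self):
        return self.application

# usage sketch (app = the FastAPI app imported above):
# StandaloneApplication(app, {"workers": 8, "bind": "0.0.0.0:4000"}).run()
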
@@ -11,6 +11,12 @@ model_list:
output_cost_per_token: 0.00003
max_tokens: 4096
base_model: gpt-3.5-turbo
- model_name: gpt-4
litellm_params:
model: azure/chatgpt-v-2
api_base: https://openai-gpt-4-test-v-1.openai.azure.com/
api_version: "2023-05-15"
api_key: os.environ/AZURE_API_KEY # The `os.environ/` prefix tells litellm to read this from the env. See https://docs.litellm.ai/docs/simple_proxy#load-api-keys-from-vault
- model_name: gpt-vision
litellm_params:
model: azure/gpt-4-vision

@@ -25,6 +31,9 @@ model_list:
- model_name: BEDROCK_GROUP
litellm_params:
model: bedrock/cohere.command-text-v14
- model_name: tg-ai
litellm_params:
model: together_ai/mistralai/Mistral-7B-Instruct-v0.1
- model_name: sagemaker
litellm_params:
model: sagemaker/berri-benchmarking-Llama-2-70b-chat-hf-4

@@ -57,12 +66,22 @@ model_list:
mode: embedding
litellm_settings:
fallbacks: [{"openai-gpt-3.5": ["azure-gpt-3.5"]}]
success_callback: ['langfuse']
max_budget: 10 # global budget for proxy
budget_duration: 30d # global budget duration, will reset after 30d
default_key_generate_params:
max_budget: 1.5000
models: ["azure-gpt-3.5"]
duration: None
# cache: True
# setting callback class
# callbacks: custom_callbacks.proxy_handler_instance # sets litellm.callbacks = [proxy_handler_instance]

# general_settings:
# master_key: sk-1234
general_settings:
allow_user_auth: True
master_key: sk-1234
alerting: ["slack"]
alerting_threshold: 10 # sends alerts if requests hang for 10 seconds
# database_type: "dynamo_db"
# database_args: { # 👈 all args - https://github.com/BerriAI/litellm/blob/befbcbb7ac8f59835ce47415c128decf37aac328/litellm/proxy/_types.py#L190
# "billing_mode": "PAY_PER_REQUEST",

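With `default_key_generate_params` set, an empty `/key/generate` call should come back pre-filled with those values. A hedged sketch against a locally running proxy (host and port are assumptions; the master key is the one in the config above):

import requests

resp = requests.post(
    "http://0.0.0.0:4000/key/generate",
    headers={"Authorization": "Bearer sk-1234"},
    json={},  # no overrides -> default_key_generate_params should apply
)
print(resp.json())  # expect max_budget=1.5 and models=["azure-gpt-3.5"] on the new key
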
File diff suppressed because it is too large

@@ -7,28 +7,62 @@ generator client {
provider = "prisma-client-py"
}

// Track spend, rate limit, budget Users
model LiteLLM_UserTable {
user_id String @unique
team_id String?
max_budget Float?
spend Float @default(0.0)
user_email String?
models String[] @default([])
models String[]
max_parallel_requests Int?
tpm_limit BigInt?
rpm_limit BigInt?
budget_duration String?
budget_reset_at DateTime?
}

// required for token gen
// Generate Tokens for Proxy
model LiteLLM_VerificationToken {
token String @unique
key_name String?
key_alias String?
spend Float @default(0.0)
expires DateTime?
models String[] @default([])
models String[]
aliases Json @default("{}")
config Json @default("{}")
user_id String?
team_id String?
max_parallel_requests Int?
metadata Json @default("{}")
tpm_limit BigInt?
rpm_limit BigInt?
max_budget Float?
budget_duration String?
budget_reset_at DateTime?
}

// store proxy config.yaml
model LiteLLM_Config {
param_name String @id
param_value Json?
}

// View spend, model, api_key per request
model LiteLLM_SpendLogs {
request_id String @unique
call_type String
api_key String @default("")
spend Float @default(0.0)
total_tokens Int @default(0)
prompt_tokens Int @default(0)
completion_tokens Int @default(0)
startTime DateTime // Assuming start_time is a DateTime field
endTime DateTime // Assuming end_time is a DateTime field
model String @default("")
user String @default("")
metadata Json @default("{}")
cache_hit String @default("")
cache_key String @default("")
}

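A hedged sketch of reading the new spend table through prisma-client-py, the generator this schema declares (the function name and limit are illustrative):

from prisma import Prisma

async def top_spend(limit: int = 10):
    db = Prisma()
    await db.connect()
    rows = await db.litellm_spendlogs.find_many(
        order={"spend": "desc"}, take=limit  # highest-spend requests first
    )
    await db.disconnect()
    return rows
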
@@ -11,12 +11,10 @@ async def litellm_completion():
# Your existing code for litellm_completion goes here
try:
response = await litellm_client.chat.completions.create(
model="Azure OpenAI GPT-4 Canada-East (External)",
stream=True,
model="azure-gpt-3.5",
messages=[{"role": "user", "content": f"This is a test: {uuid.uuid4()}"}],
)
async for chunk in response:
print(chunk)
print(response)
return response

except Exception as e:

@@ -27,9 +25,9 @@ async def litellm_completion():


async def main():
for i in range(1000000):
for i in range(150):
start = time.time()
n = 1000  # Number of concurrent tasks
n = 150  # Number of concurrent tasks
tasks = [litellm_completion() for _ in range(n)]

chat_completions = await asyncio.gather(*tasks)

@@ -4,22 +4,28 @@ const openai = require('openai');
process.env.DEBUG=false;
async function runOpenAI() {
const client = new openai.OpenAI({
apiKey: 'your_api_key_here',
apiKey: 'sk-JkKeNi6WpWDngBsghJ6B9g',
baseURL: 'http://0.0.0.0:8000'
});

try {
const response = await client.chat.completions.create({
model: 'azure-gpt-3.5',
model: 'sagemaker',
stream: true,
max_tokens: 1000,
messages: [
{
role: 'user',
content: 'this is a test request, write a short poem'.repeat(2000),
content: 'write a 20 pg essay about YC ',
},
],
});

console.log(response);
for await (const chunk of response) {
console.log(chunk);
console.log(chunk.choices[0].delta.content);
}
} catch (error) {
console.log("got this exception from server");
console.error(error);

@@ -1,7 +1,12 @@
from typing import Optional, List, Any, Literal, Union
import os, subprocess, hashlib, importlib, asyncio, copy, json, aiohttp, httpx
import litellm, backoff
from litellm.proxy._types import UserAPIKeyAuth, DynamoDBArgs
from litellm.proxy._types import (
UserAPIKeyAuth,
DynamoDBArgs,
LiteLLM_VerificationToken,
LiteLLM_SpendLogs,
)
from litellm.caching import DualCache
from litellm.proxy.hooks.parallel_request_limiter import MaxParallelRequestsHandler
from litellm.proxy.hooks.max_budget_limiter import MaxBudgetLimiter

@@ -9,10 +14,10 @@ from litellm.integrations.custom_logger import CustomLogger
from litellm.proxy.db.base_client import CustomDB
from litellm._logging import verbose_proxy_logger
from fastapi import HTTPException, status
import smtplib
import smtplib, re
from email.mime.text import MIMEText
from email.mime.multipart import MIMEMultipart
from datetime import datetime
from datetime import datetime, timedelta


def print_verbose(print_statement):

@@ -92,7 +97,7 @@ class ProxyLogging:
3. /image/generation
"""
### ALERTING ###
asyncio.create_task(self.response_taking_too_long())
asyncio.create_task(self.response_taking_too_long(request_data=data))

try:
for callback in litellm.callbacks:

@@ -132,27 +137,113 @@ class ProxyLogging:
start_time: Optional[float] = None,
end_time: Optional[float] = None,
type: Literal["hanging_request", "slow_response"] = "hanging_request",
request_data: Optional[dict] = None,
):
if request_data is not None:
model = request_data.get("model", "")
messages = request_data.get("messages", "")
# try casting messages to str and keep the first 10,000 characters, else mark as None
try:
messages = str(messages)
messages = messages[:10000]
except:
messages = None

request_info = f"\nRequest Model: {model}\nMessages: {messages}"
else:
request_info = ""

if type == "hanging_request":
# Simulate a long-running operation that could take more than 5 minutes
await asyncio.sleep(
self.alerting_threshold
)  # Set it to 5 minutes - i'd imagine this might be different for streaming, non-streaming, non-completion (embedding + img) requests

await self.alerting_handler(
message=f"Requests are hanging - {self.alerting_threshold}s+ request time",
level="Medium",
)
if (
request_data is not None
and request_data.get("litellm_status", "") != "success"
):
# only alert hanging responses if they have not been marked as success
alerting_message = (
f"Requests are hanging - {self.alerting_threshold}s+ request time"
)
await self.alerting_handler(
message=alerting_message + request_info,
level="Medium",
)

elif (
type == "slow_response" and start_time is not None and end_time is not None
):
slow_message = f"Responses are slow - {round(end_time-start_time,2)}s response time > Alerting threshold: {self.alerting_threshold}s"
if end_time - start_time > self.alerting_threshold:
await self.alerting_handler(
message=f"Responses are slow - {round(end_time-start_time,2)}s response time",
message=slow_message + request_info,
level="Low",
)

async def budget_alerts(
self,
type: Literal["token_budget", "user_budget", "user_and_proxy_budget"],
user_max_budget: float,
user_current_spend: float,
user_info=None,
):
if self.alerting is None:
# do nothing if alerting is not switched on
return

if type == "user_and_proxy_budget":
user_info = dict(user_info)
user_id = user_info["user_id"]
max_budget = user_info["max_budget"]
spend = user_info["spend"]
user_email = user_info["user_email"]
user_info = f"""\nUser ID: {user_id}\nMax Budget: ${max_budget}\nSpend: ${spend}\nUser Email: {user_email}"""
elif type == "token_budget":
token_info = dict(user_info)
token = token_info["token"]
spend = token_info["spend"]
max_budget = token_info["max_budget"]
user_id = token_info["user_id"]
user_info = f"""\nToken: {token}\nSpend: ${spend}\nMax Budget: ${max_budget}\nUser ID: {user_id}"""
else:
user_info = str(user_info)
# percent of max_budget left to spend
percent_left = (user_max_budget - user_current_spend) / user_max_budget
verbose_proxy_logger.debug(
f"Budget Alerts: Percent left: {percent_left} for {user_info}"
)

# check if crossed budget
if user_current_spend >= user_max_budget:
verbose_proxy_logger.debug(f"Budget Crossed for {user_info}")
message = "Budget Crossed for" + user_info
await self.alerting_handler(
message=message,
level="High",
)
return

# check if 5% of max budget is left
if percent_left <= 0.05:
message = "5% budget left for" + user_info
await self.alerting_handler(
message=message,
level="Medium",
)
return

# check if 15% of max budget is left
if percent_left <= 0.15:
message = "15% budget left for" + user_info
await self.alerting_handler(
message=message,
level="Low",
)
return

return

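The escalation is driven by a single ratio. A quick worked example of the thresholds above:

user_max_budget, user_current_spend = 10.0, 9.6
percent_left = (user_max_budget - user_current_spend) / user_max_budget
print(percent_left)  # 0.04 -> <= 0.05, so the "5% budget left" Medium alert fires
# at spend 8.6, percent_left would be 0.14 -> the 15% Low alert fires instead
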
async def alerting_handler(
self, message: str, level: Literal["Low", "Medium", "High"]
):

@@ -163,12 +254,20 @@ class ProxyLogging:
- Requests are hanging
- Calls are failing
- DB Read/Writes are failing
- Proxy Close to max budget
- Key Close to max budget

Parameters:
level: str - Low|Medium|High - if calls might fail (Medium) or are failing (High); 'Low' is informational (e.g. slow responses, 15% of budget left)
message: str - what is the alert about
"""
formatted_message = f"Level: {level}\n\nMessage: {message}"
from datetime import datetime

# Get the current timestamp
current_time = datetime.now().strftime("%H:%M:%S")
formatted_message = (
f"Level: {level}\nTimestamp: {current_time}\n\nMessage: {message}"
)
if self.alerting is None:
return

@@ -179,7 +278,9 @@ class ProxyLogging:
raise Exception("Missing SLACK_WEBHOOK_URL from environment")
payload = {"text": formatted_message}
headers = {"Content-type": "application/json"}
async with aiohttp.ClientSession() as session:
async with aiohttp.ClientSession(
connector=aiohttp.TCPConnector(ssl=False)
) as session:
async with session.post(
slack_webhook_url, json=payload, headers=headers
) as response:

@@ -316,7 +417,7 @@ class PrismaClient:
self,
key: str,
value: Any,
table_name: Literal["users", "keys", "config"],
table_name: Literal["users", "keys", "config", "spend"],
):
"""
Generic implementation of get data

@@ -334,6 +435,10 @@ class PrismaClient:
response = await self.db.litellm_config.find_first(  # type: ignore
where={key: value}  # type: ignore
)
elif table_name == "spend":
response = await self.db.litellm_spendlogs.find_first(  # type: ignore
where={key: value}  # type: ignore
)
return response
except Exception as e:
asyncio.create_task(

@@ -352,8 +457,12 @@ class PrismaClient:
self,
token: Optional[str] = None,
user_id: Optional[str] = None,
table_name: Optional[Literal["user", "key", "config"]] = None,
user_id_list: Optional[list] = None,
key_val: Optional[dict] = None,
table_name: Optional[Literal["user", "key", "config", "spend"]] = None,
query_type: Literal["find_unique", "find_all"] = "find_unique",
expires: Optional[datetime] = None,
reset_at: Optional[datetime] = None,
):
try:
print_verbose("PrismaClient: get_data")

@@ -365,20 +474,51 @@ class PrismaClient:
hashed_token = token
if token.startswith("sk-"):
hashed_token = self.hash_token(token=token)
print_verbose("PrismaClient: find_unique")
verbose_proxy_logger.debug(
f"PrismaClient: find_unique for token: {hashed_token}"
)
if query_type == "find_unique":
response = await self.db.litellm_verificationtoken.find_unique(
where={"token": hashed_token}
)
if response is not None:
# for prisma we need to cast the expires time to str
if response.expires is not None and isinstance(
response.expires, datetime
):
response.expires = response.expires.isoformat()
elif query_type == "find_all" and user_id is not None:
response = await self.db.litellm_verificationtoken.find_many(
where={"user_id": user_id}
)
if response is not None and len(response) > 0:
for r in response:
if isinstance(r.expires, datetime):
r.expires = r.expires.isoformat()
elif (
query_type == "find_all"
and expires is not None
and reset_at is not None
):
response = await self.db.litellm_verificationtoken.find_many(
where={  # type:ignore
"OR": [
{"expires": None},
{"expires": {"gt": expires}},
],
"budget_reset_at": {"lt": reset_at},
}
)
if response is not None and len(response) > 0:
for r in response:
if isinstance(r.expires, datetime):
r.expires = r.expires.isoformat()
elif query_type == "find_all":
response = await self.db.litellm_verificationtoken.find_many(
order={"spend": "desc"},
)
print_verbose(f"PrismaClient: response={response}")
if response is not None:
# for prisma we need to cast the expires time to str
if isinstance(response.expires, datetime):
response.expires = response.expires.isoformat()
return response
else:
# Token does not exist.

@@ -386,13 +526,61 @@ class PrismaClient:
status_code=status.HTTP_401_UNAUTHORIZED,
detail="Authentication Error: invalid user key - token does not exist",
)
elif user_id is not None:
response = await self.db.litellm_usertable.find_unique(  # type: ignore
where={
"user_id": user_id,
}
)
elif user_id is not None or (
table_name is not None and table_name == "user"
):
if query_type == "find_unique":
response = await self.db.litellm_usertable.find_unique(  # type: ignore
where={
"user_id": user_id,  # type: ignore
}
)
elif query_type == "find_all" and reset_at is not None:
response = await self.db.litellm_usertable.find_many(
where={  # type:ignore
"budget_reset_at": {"lt": reset_at},
}
)
elif query_type == "find_all" and user_id_list is not None:
user_id_values = str(tuple(user_id_list))
sql_query = f"""
SELECT *
FROM "LiteLLM_UserTable"
WHERE "user_id" IN {user_id_values}
"""

# Execute the raw query against the user table
response = await self.db.query_raw(sql_query)
elif query_type == "find_all":
response = await self.db.litellm_usertable.find_many(  # type: ignore
order={"spend": "desc"},
)
return response
elif table_name == "spend":
verbose_proxy_logger.debug(
f"PrismaClient: get_data: table_name == 'spend'"
)
if key_val is not None:
if query_type == "find_unique":
response = await self.db.litellm_spendlogs.find_unique(  # type: ignore
where={  # type: ignore
key_val["key"]: key_val["value"],  # type: ignore
}
)
elif query_type == "find_all":
response = await self.db.litellm_spendlogs.find_many(  # type: ignore
where={
key_val["key"]: key_val["value"],  # type: ignore
}
)
return response
else:
response = await self.db.litellm_spendlogs.find_many(  # type: ignore
order={"startTime": "desc"},
)
return response

except Exception as e:
print_verbose(f"LiteLLM Prisma Client Exception: {e}")
import traceback

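One footnote on the raw query above: `str(tuple(user_id_list))` leans on Python's tuple repr, which emits a trailing comma for single-element lists and so produces an IN-list Postgres will reject:

print(str(tuple(["u1", "u2"])))  # ('u1', 'u2')  -> valid SQL IN-list
print(str(tuple(["u1"])))        # ('u1',)       -> trailing comma, invalid SQL
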
@@ -412,7 +600,7 @@ class PrismaClient:
on_backoff=on_backoff,  # specifying the function to call on backoff
)
async def insert_data(
self, data: dict, table_name: Literal["user", "key", "config"]
self, data: dict, table_name: Literal["user", "key", "config", "spend"]
):
"""
Add a key to the database. If it already exists, do nothing.

@@ -435,6 +623,7 @@ class PrismaClient:
"update": {},  # don't do anything if it already exists
},
)
verbose_proxy_logger.info(f"Data Inserted into Keys Table")
return new_verification_token
elif table_name == "user":
db_data = self.jsonify_object(data=data)

@@ -445,6 +634,7 @@ class PrismaClient:
"update": {},  # don't do anything if it already exists
},
)
verbose_proxy_logger.info(f"Data Inserted into User Table")
return new_user_row
elif table_name == "config":
"""

@@ -468,8 +658,20 @@ class PrismaClient:
)

tasks.append(updated_table_row)

await asyncio.gather(*tasks)
verbose_proxy_logger.info(f"Data Inserted into Config Table")
elif table_name == "spend":
db_data = self.jsonify_object(data=data)
new_spend_row = await self.db.litellm_spendlogs.upsert(
where={"request_id": data["request_id"]},
data={
"create": {**db_data},  # type: ignore
"update": {},  # don't do anything if it already exists
},
)
verbose_proxy_logger.info(f"Data Inserted into Spend Table")
return new_spend_row

except Exception as e:
print_verbose(f"LiteLLM Prisma Client Exception: {e}")
asyncio.create_task(

@@ -489,7 +691,11 @@ class PrismaClient:
self,
token: Optional[str] = None,
data: dict = {},
data_list: Optional[List] = None,
user_id: Optional[str] = None,
query_type: Literal["update", "update_many"] = "update",
table_name: Optional[Literal["user", "key", "config", "spend"]] = None,
update_key_values: Optional[dict] = None,
):
"""
Update existing data

@@ -506,17 +712,95 @@ class PrismaClient:
where={"token": token},  # type: ignore
data={**db_data},  # type: ignore
)
print_verbose("\033[91m" + f"DB write succeeded {response}" + "\033[0m")
verbose_proxy_logger.debug(
"\033[91m"
+ f"DB Token Table update succeeded {response}"
+ "\033[0m"
)
return {"token": token, "data": db_data}
elif user_id is not None:
elif (
user_id is not None
or (table_name is not None and table_name == "user")
and query_type == "update"
):
"""
If data['spend'] + data['user'], update the user table with spend info as well
"""
update_user_row = await self.db.litellm_usertable.update(
if user_id is None:
user_id = db_data["user_id"]
if update_key_values is None:
update_key_values = db_data
update_user_row = await self.db.litellm_usertable.upsert(
where={"user_id": user_id},  # type: ignore
data={**db_data},  # type: ignore
data={
"create": {**db_data},  # type: ignore
"update": {
**update_key_values  # type: ignore
},  # just update user-specified values, if it already exists
},
)
verbose_proxy_logger.info(
"\033[91m"
+ f"DB User Table - update succeeded {update_user_row}"
+ "\033[0m"
)
return {"user_id": user_id, "data": db_data}
elif (
table_name is not None
and table_name == "key"
and query_type == "update_many"
and data_list is not None
and isinstance(data_list, list)
):
"""
Batch write update queries
"""
batcher = self.db.batch_()
for idx, t in enumerate(data_list):
# check if plain text or hash
if t.token.startswith("sk-"):  # type: ignore
t.token = self.hash_token(token=t.token)  # type: ignore
try:
data_json = self.jsonify_object(data=t.model_dump())
except:
data_json = self.jsonify_object(data=t.dict())
batcher.litellm_verificationtoken.update(
where={"token": t.token},  # type: ignore
data={**data_json},  # type: ignore
)
await batcher.commit()
print_verbose(
"\033[91m" + f"DB Token Table update succeeded" + "\033[0m"
)
elif (
table_name is not None
and table_name == "user"
and query_type == "update_many"
and data_list is not None
and isinstance(data_list, list)
):
"""
Batch write update queries
"""
batcher = self.db.batch_()
for idx, user in enumerate(data_list):
try:
data_json = self.jsonify_object(data=user.model_dump())
except:
data_json = self.jsonify_object(data=user.dict())
batcher.litellm_usertable.upsert(
where={"user_id": user.user_id},  # type: ignore
data={
"create": {**data_json},  # type: ignore
"update": {
**data_json  # type: ignore
},  # just update user-specified values, if it already exists
},
)
await batcher.commit()
verbose_proxy_logger.info(
"\033[91m" + f"DB User Table Batch update succeeded" + "\033[0m"
)
except Exception as e:
asyncio.create_task(
self.proxy_logging_obj.failure_handler(original_exception=e)

@@ -537,7 +821,13 @@ class PrismaClient:
Allow user to delete a key(s)
"""
try:
hashed_tokens = [self.hash_token(token=token) for token in tokens]
hashed_tokens = []
for token in tokens:
if isinstance(token, str) and token.startswith("sk-"):
hashed_token = self.hash_token(token=token)
else:
hashed_token = token
hashed_tokens.append(hashed_token)
await self.db.litellm_verificationtoken.delete_many(
where={"token": {"in": hashed_tokens}}
)

@@ -745,7 +1035,8 @@ async def send_email(sender_name, sender_email, receiver_email, subject, html):
print_verbose(f"SMTP Connection Init")
# Establish a secure connection with the SMTP server
with smtplib.SMTP(smtp_host, smtp_port) as server:
server.starttls()
if os.getenv("SMTP_TLS", 'True') != "False":
server.starttls()

# Login to your email account
server.login(smtp_username, smtp_password)

@@ -754,4 +1045,164 @@ async def send_email(sender_name, sender_email, receiver_email, subject, html):
server.send_message(email_message)

except Exception as e:
print_verbose("An error occurred while sending the email:", str(e))
print_verbose("An error occurred while sending the email:" + str(e))


def hash_token(token: str):
import hashlib

# Hash the string using SHA-256
hashed_token = hashlib.sha256(token.encode()).hexdigest()

return hashed_token


def get_logging_payload(kwargs, response_obj, start_time, end_time):
from litellm.proxy._types import LiteLLM_SpendLogs
from pydantic import Json
import uuid

verbose_proxy_logger.debug(
f"SpendTable: get_logging_payload - kwargs: {kwargs}\n\n"
)

if kwargs == None:
kwargs = {}
# standardize this function to be used across s3, dynamoDB, and langfuse logging
litellm_params = kwargs.get("litellm_params", {})
metadata = (
litellm_params.get("metadata", {}) or {}
)  # if litellm_params['metadata'] == None
call_type = kwargs.get("call_type", "litellm.completion")
cache_hit = kwargs.get("cache_hit", False)
usage = response_obj["usage"]
if type(usage) == litellm.Usage:
usage = dict(usage)
id = response_obj.get("id", str(uuid.uuid4()))
api_key = metadata.get("user_api_key", "")
if api_key is not None and isinstance(api_key, str) and api_key.startswith("sk-"):
# hash the api_key
api_key = hash_token(api_key)
if "headers" in metadata and "authorization" in metadata["headers"]:
metadata["headers"].pop(
"authorization"
)  # do not store the original `sk-..` api key in the db
if litellm.cache is not None:
cache_key = litellm.cache.get_cache_key(**kwargs)
else:
cache_key = "Cache OFF"
if cache_hit == True:
import time

id = f"{id}_cache_hit{time.time()}"  # SpendLogs does not allow duplicate request_id

payload = {
"request_id": id,
"call_type": call_type,
"api_key": api_key,
"cache_hit": cache_hit,
"startTime": start_time,
"endTime": end_time,
"model": kwargs.get("model", ""),
"user": kwargs.get("user", ""),
"metadata": metadata,
"cache_key": cache_key,
"total_tokens": usage.get("total_tokens", 0),
"prompt_tokens": usage.get("prompt_tokens", 0),
"completion_tokens": usage.get("completion_tokens", 0),
}

json_fields = [
field
for field, field_type in LiteLLM_SpendLogs.__annotations__.items()
if field_type == Json or field_type == Optional[Json]
]
str_fields = [
field
for field, field_type in LiteLLM_SpendLogs.__annotations__.items()
if field_type == str or field_type == Optional[str]
]
datetime_fields = [
field
for field, field_type in LiteLLM_SpendLogs.__annotations__.items()
if field_type == datetime
]

for param in json_fields:
if param in payload and type(payload[param]) != Json:
if type(payload[param]) == litellm.ModelResponse:
payload[param] = payload[param].model_dump_json()
elif type(payload[param]) == litellm.EmbeddingResponse:
payload[param] = payload[param].model_dump_json()
else:
payload[param] = json.dumps(payload[param])

for param in str_fields:
if param in payload and type(payload[param]) != str:
payload[param] = str(payload[param])

return payload

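`hash_token` is the one-way transform applied everywhere a raw `sk-` key would otherwise touch storage; the proxy keeps only the digest. A quick check of its behavior:

hashed = hash_token("sk-example")
print(len(hashed))                          # 64 -> hex-encoded SHA-256 digest
print(hashed == hash_token("sk-example"))   # True - deterministic, so lookups still work
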
def _duration_in_seconds(duration: str):
match = re.match(r"(\d+)([smhd]?)", duration)
if not match:
raise ValueError("Invalid duration format")

value, unit = match.groups()
value = int(value)

if unit == "s":
return value
elif unit == "m":
return value * 60
elif unit == "h":
return value * 3600
elif unit == "d":
return value * 86400
else:
raise ValueError("Unsupported duration unit")

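Spot checks for the parser above (the same `30d` format the config's `budget_duration` uses):

assert _duration_in_seconds("45s") == 45
assert _duration_in_seconds("30m") == 1800
assert _duration_in_seconds("2h") == 7200
assert _duration_in_seconds("30d") == 2592000  # 30 * 86400
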
async def reset_budget(prisma_client: PrismaClient):
"""
Gets all the non-expired keys for a db, which need spend to be reset

Resets their spend

Updates db
"""
if prisma_client is not None:
### RESET KEY BUDGET ###
now = datetime.utcnow()
keys_to_reset = await prisma_client.get_data(
table_name="key", query_type="find_all", expires=now, reset_at=now
)

if keys_to_reset is not None and len(keys_to_reset) > 0:
for key in keys_to_reset:
key.spend = 0.0
duration_s = _duration_in_seconds(duration=key.budget_duration)
key.budget_reset_at = now + timedelta(seconds=duration_s)

await prisma_client.update_data(
query_type="update_many", data_list=keys_to_reset, table_name="key"
)

### RESET USER BUDGET ###
now = datetime.utcnow()
users_to_reset = await prisma_client.get_data(
table_name="user", query_type="find_all", reset_at=now
)

verbose_proxy_logger.debug(f"users_to_reset from get_data: {users_to_reset}")

if users_to_reset is not None and len(users_to_reset) > 0:
for user in users_to_reset:
user.spend = 0.0
duration_s = _duration_in_seconds(duration=user.budget_duration)
user.budget_reset_at = now + timedelta(seconds=duration_s)

await prisma_client.update_data(
query_type="update_many", data_list=users_to_reset, table_name="user"
)

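`reset_budget` is a periodic job; this commit also adds `apscheduler` to the test dependencies. A hedged sketch of scheduling it (the interval and wiring inside proxy_server may differ; this assumes a running asyncio event loop and an initialized `prisma_client`):

from apscheduler.schedulers.asyncio import AsyncIOScheduler

scheduler = AsyncIOScheduler()
scheduler.add_job(reset_budget, "interval", seconds=10, args=[prisma_client])
scheduler.start()  # runs reset_budget every 10s on the current event loop
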
@@ -94,11 +94,15 @@ class Router:
timeout: Optional[float] = None,
default_litellm_params={},  # default params for Router.chat.completion.create
set_verbose: bool = False,
debug_level: Literal["DEBUG", "INFO"] = "INFO",
fallbacks: List = [],
allowed_fails: Optional[int] = None,
context_window_fallbacks: List = [],
model_group_alias: Optional[dict] = {},
retry_after: int = 0,  # min time to wait before retrying a failed request
allowed_fails: Optional[
int
] = None,  # Number of times a deployment can fail before being added to cooldown
cooldown_time: float = 1,  # (seconds) time to cooldown a deployment after failure
routing_strategy: Literal[
"simple-shuffle",
"least-busy",

@@ -107,7 +111,42 @@ class Router:
] = "simple-shuffle",
routing_strategy_args: dict = {},  # just for latency-based routing
) -> None:
"""
Initialize the Router class with the given parameters for caching, reliability, and routing strategy.

Args:
model_list (Optional[list]): List of models to be used. Defaults to None.
redis_url (Optional[str]): URL of the Redis server. Defaults to None.
redis_host (Optional[str]): Hostname of the Redis server. Defaults to None.
redis_port (Optional[int]): Port of the Redis server. Defaults to None.
redis_password (Optional[str]): Password of the Redis server. Defaults to None.
cache_responses (Optional[bool]): Flag to enable caching of responses. Defaults to False.
cache_kwargs (dict): Additional kwargs to pass to RedisCache. Defaults to {}.
caching_groups (Optional[List[tuple]]): List of model groups for caching across model groups. Defaults to None.
client_ttl (int): Time-to-live for cached clients in seconds. Defaults to 3600.
num_retries (int): Number of retries for failed requests. Defaults to 0.
timeout (Optional[float]): Timeout for requests. Defaults to None.
default_litellm_params (dict): Default parameters for Router.chat.completion.create. Defaults to {}.
set_verbose (bool): Flag to set verbose mode. Defaults to False.
debug_level (Literal["DEBUG", "INFO"]): Debug level for logging. Defaults to "INFO".
fallbacks (List): List of fallback options. Defaults to [].
context_window_fallbacks (List): List of context window fallback options. Defaults to [].
model_group_alias (Optional[dict]): Alias for model groups. Defaults to {}.
retry_after (int): Minimum time to wait before retrying a failed request. Defaults to 0.
allowed_fails (Optional[int]): Number of allowed fails before adding to cooldown. Defaults to None.
cooldown_time (float): Time to cooldown a deployment after failure in seconds. Defaults to 1.
routing_strategy (Literal["simple-shuffle", "least-busy", "usage-based-routing", "latency-based-routing"]): Routing strategy. Defaults to "simple-shuffle".
routing_strategy_args (dict): Additional args for latency-based routing. Defaults to {}.

Returns:
Router: An instance of the litellm.Router class.
"""
self.set_verbose = set_verbose
if self.set_verbose:
if debug_level == "INFO":
verbose_router_logger.setLevel(logging.INFO)
elif debug_level == "DEBUG":
verbose_router_logger.setLevel(logging.DEBUG)
self.deployment_names: List = (
[]
)  # names of models under litellm_params. ex. azure/chatgpt-v-2

@@ -157,6 +196,7 @@ class Router:
self.deployment_latency_map[m["litellm_params"]["model"]] = 0

self.allowed_fails = allowed_fails or litellm.allowed_fails
self.cooldown_time = cooldown_time or 1
self.failed_calls = (
InMemoryCache()
)  # cache to track failed call per deployment, if num failed calls within 1 minute > allowed fails, then add it to cooldown

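A minimal Router setup exercising the new `cooldown_time` parameter (the model entry and key are placeholders):

import litellm

router = litellm.Router(
    model_list=[{
        "model_name": "gpt-3.5-turbo",
        "litellm_params": {"model": "azure/chatgpt-v-2", "api_key": "..."},
    }],
    allowed_fails=3,    # failures per minute before a deployment is benched
    cooldown_time=30,   # seconds a failing deployment sits out (was hardcoded to 1)
)
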
@@ -259,6 +299,7 @@ class Router:
raise e

def _completion(self, model: str, messages: List[Dict[str, str]], **kwargs):
model_name = None
try:
# pick the one that is available (lowest TPM/RPM)
deployment = self.get_available_deployment(

@@ -271,6 +312,7 @@ class Router:
)
data = deployment["litellm_params"].copy()
kwargs["model_info"] = deployment.get("model_info", {})
model_name = data["model"]
for k, v in self.default_litellm_params.items():
if (
k not in kwargs

@@ -292,7 +334,7 @@ class Router:
else:
model_client = potential_model_client

return litellm.completion(
response = litellm.completion(
**{
**data,
"messages": messages,

@@ -301,7 +343,14 @@ class Router:
**kwargs,
}
)
verbose_router_logger.info(
f"litellm.completion(model={model_name})\033[32m 200 OK\033[0m"
)
return response
except Exception as e:
verbose_router_logger.info(
f"litellm.completion(model={model_name})\033[31m Exception {str(e)}\033[0m"
)
raise e

async def acompletion(self, model: str, messages: List[Dict[str, str]], **kwargs):

@@ -830,6 +879,9 @@ class Router:
"""
try:
kwargs["model"] = mg
kwargs.setdefault("metadata", {}).update(
{"model_group": mg}
)  # update model_group used, if fallbacks are done
response = await self.async_function_with_retries(
*args, **kwargs
)

@@ -858,8 +910,10 @@ class Router:
f"Falling back to model_group = {mg}"
)
kwargs["model"] = mg
kwargs["metadata"]["model_group"] = mg
response = await self.async_function_with_retries(
kwargs.setdefault("metadata", {}).update(
{"model_group": mg}
)  # update model_group used, if fallbacks are done
response = await self.async_function_with_fallbacks(
*args, **kwargs
)
return response

@@ -1024,6 +1078,9 @@ class Router:
## LOGGING
kwargs = self.log_retry(kwargs=kwargs, e=original_exception)
kwargs["model"] = mg
kwargs.setdefault("metadata", {}).update(
{"model_group": mg}
)  # update model_group used, if fallbacks are done
response = self.function_with_fallbacks(*args, **kwargs)
return response
except Exception as e:

@@ -1047,6 +1104,9 @@ class Router:
## LOGGING
kwargs = self.log_retry(kwargs=kwargs, e=original_exception)
kwargs["model"] = mg
kwargs.setdefault("metadata", {}).update(
{"model_group": mg}
)  # update model_group used, if fallbacks are done
response = self.function_with_fallbacks(*args, **kwargs)
return response
except Exception as e:

@@ -1232,6 +1292,7 @@ class Router:
verbose_router_logger.debug(
f"Attempting to add {deployment} to cooldown list. updated_fails: {updated_fails}; self.allowed_fails: {self.allowed_fails}"
)
cooldown_time = self.cooldown_time or 1
if updated_fails > self.allowed_fails:
# get the current cooldown list for that minute
cooldown_key = f"{current_minute}:cooldown_models"  # group cooldown models by minute to reduce number of redis calls

@ -1245,13 +1306,19 @@ class Router:
|
|||
else:
|
||||
cached_value = cached_value + [deployment]
|
||||
# save updated value
|
||||
self.cache.set_cache(value=cached_value, key=cooldown_key, ttl=1)
|
||||
self.cache.set_cache(
|
||||
value=cached_value, key=cooldown_key, ttl=cooldown_time
|
||||
)
|
||||
except:
|
||||
cached_value = [deployment]
|
||||
# save updated value
|
||||
self.cache.set_cache(value=cached_value, key=cooldown_key, ttl=1)
|
||||
self.cache.set_cache(
|
||||
value=cached_value, key=cooldown_key, ttl=cooldown_time
|
||||
)
|
||||
else:
|
||||
self.failed_calls.set_cache(key=deployment, value=updated_fails, ttl=1)
|
||||
self.failed_calls.set_cache(
|
||||
key=deployment, value=updated_fails, ttl=cooldown_time
|
||||
)
|
||||
|
||||
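A minimal sketch of the cooldown pattern these hunks move to, assuming a simple in-memory TTL cache: failures are counted per deployment and, once they exceed `allowed_fails`, the deployment is parked in a per-minute cooldown list whose cache TTL is the configurable `cooldown_time` rather than a hardcoded 1 second. `TTLCache` and `record_failure` are illustrative names, not litellm APIs.

```python
import time

class TTLCache:
    """Tiny stand-in for an in-memory cache with per-key TTLs."""
    def __init__(self):
        self._store = {}  # key -> (value, expires_at)

    def set_cache(self, key, value, ttl):
        self._store[key] = (value, time.time() + ttl)

    def get_cache(self, key):
        value, expires_at = self._store.get(key, (None, 0))
        return value if time.time() < expires_at else None

def record_failure(cache, failed_calls, deployment, allowed_fails=3, cooldown_time=60):
    minute = time.strftime("%H-%M")
    updated_fails = (failed_calls.get_cache(deployment) or 0) + 1
    if updated_fails > allowed_fails:
        # park the deployment in this minute's cooldown list for `cooldown_time` seconds
        key = f"{minute}:cooldown_models"
        cooled = cache.get_cache(key) or []
        cache.set_cache(key, cooled + [deployment], ttl=cooldown_time)
    else:
        # otherwise just bump the failure counter, expiring it on the same window
        failed_calls.set_cache(deployment, updated_fails, ttl=cooldown_time)
```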
def _get_cooldown_deployments(self):
"""
@ -1828,6 +1895,9 @@ class Router:
selected_index = random.choices(range(len(rpms)), weights=weights)[0]
verbose_router_logger.debug(f"\n selected index, {selected_index}")
deployment = healthy_deployments[selected_index]
verbose_router_logger.info(
f"get_available_deployment for model: {model}, Selected deployment: {deployment or deployment[0]} for model: {model}"
)
return deployment or deployment[0]
############## Check if we can do a RPM/TPM based weighted pick #################
tpm = healthy_deployments[0].get("litellm_params").get("tpm", None)
@ -1842,6 +1912,9 @@ class Router:
selected_index = random.choices(range(len(tpms)), weights=weights)[0]
verbose_router_logger.debug(f"\n selected index, {selected_index}")
deployment = healthy_deployments[selected_index]
verbose_router_logger.info(
f"get_available_deployment for model: {model}, Selected deployment: {deployment or deployment[0]} for model: {model}"
)
return deployment or deployment[0]
############## No RPM/TPM passed, we do a random pick #################
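A small sketch of the RPM/TPM weighted pick these hunks add logging around: each healthy deployment is drawn with probability proportional to its configured limit, falling back to a uniform pick when no limits are set. `weighted_pick` is an illustrative helper, not a litellm function.

```python
import random

def weighted_pick(healthy_deployments: list, key: str = "rpm"):
    # weight each deployment by its configured rpm (or tpm) limit
    limits = [d.get("litellm_params", {}).get(key) or 0 for d in healthy_deployments]
    if sum(limits) == 0:
        return random.choice(healthy_deployments)  # no limits set -> uniform pick
    index = random.choices(range(len(healthy_deployments)), weights=limits)[0]
    return healthy_deployments[index]
```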
@ -1866,8 +1939,13 @@ class Router:
)
if deployment is None:
verbose_router_logger.info(
f"get_available_deployment for model: {model}, No deployment available"
)
raise ValueError("No models available.")
verbose_router_logger.info(
f"get_available_deployment for model: {model}, Selected deployment: {deployment} for model: {model}"
)
return deployment
def flush_cache(self):
@ -10,6 +10,7 @@ import traceback
from litellm import token_counter
from litellm.caching import DualCache
from litellm.integrations.custom_logger import CustomLogger
from litellm._logging import verbose_router_logger
class LowestTPMLoggingHandler(CustomLogger):
@ -130,6 +131,9 @@ class LowestTPMLoggingHandler(CustomLogger):
Returns a deployment with the lowest TPM/RPM usage.
"""
# get list of potential deployments
verbose_router_logger.debug(
f"get_available_deployments - Usage Based. model_group: {model_group}, healthy_deployments: {healthy_deployments}"
)
current_minute = datetime.now().strftime("%H-%M")
tpm_key = f"{model_group}:tpm:{current_minute}"
rpm_key = f"{model_group}:rpm:{current_minute}"
@ -137,14 +141,31 @@ class LowestTPMLoggingHandler(CustomLogger):
tpm_dict = self.router_cache.get_cache(key=tpm_key)
rpm_dict = self.router_cache.get_cache(key=rpm_key)
verbose_router_logger.debug(
f"tpm_key={tpm_key}, tpm_dict: {tpm_dict}, rpm_dict: {rpm_dict}"
)
try:
input_tokens = token_counter(messages=messages, text=input)
except:
input_tokens = 0
# -----------------------
# Find lowest used model
# ----------------------
lowest_tpm = float("inf")
deployment = None
if tpm_dict is None: # base case
item = random.choice(healthy_deployments)
return item
if tpm_dict is None: # base case - none of the deployments have been used
# Return the 1st deployment where deployment["tpm"] >= input_tokens
for deployment in healthy_deployments:
_deployment_tpm = (
deployment.get("tpm", None)
or deployment.get("litellm_params", {}).get("tpm", None)
or deployment.get("model_info", {}).get("tpm", None)
or float("inf")
)
if _deployment_tpm >= input_tokens:
return deployment
return None
all_deployments = tpm_dict
for d in healthy_deployments:
@ -152,11 +173,6 @@ class LowestTPMLoggingHandler(CustomLogger):
if d["model_info"]["id"] not in all_deployments:
all_deployments[d["model_info"]["id"]] = 0
try:
input_tokens = token_counter(messages=messages, text=input)
except:
input_tokens = 0
for item, item_tpm in all_deployments.items():
## get the item from model list
_deployment = None
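A rough sketch of the lowest-usage selection these hunks build toward: given the per-deployment token usage recorded for the current minute, choose the deployment with the least usage whose TPM limit can still absorb the incoming request. `pick_lowest_tpm` is illustrative, not the actual handler method.

```python
def pick_lowest_tpm(usage: dict, healthy_deployments: list, input_tokens: int):
    lowest_tpm, chosen = float("inf"), None
    for d in healthy_deployments:
        d_id = d["model_info"]["id"]
        used = usage.get(d_id, 0)  # tokens already used this minute
        limit = d.get("litellm_params", {}).get("tpm") or float("inf")
        if used + input_tokens <= limit and used < lowest_tpm:
            lowest_tpm, chosen = used, d
    return chosen  # None if every deployment is over its limit

# e.g. pick_lowest_tpm({"a": 1200, "b": 300}, deployments, input_tokens=50)
```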
@ -1,57 +0,0 @@
[deleted file: verbose httpx/httpcore debug log — request traces to api.anthropic.com, api.openai.com, and litellm-logging.onrender.com]
@ -99,36 +99,68 @@ def pre_langfuse_setup():
return
@pytest.mark.skip(reason="beta test - checking langfuse output")
def test_langfuse_logging_async():
# this tests time added to make langfuse logging calls, vs just acompletion calls
try:
pre_langfuse_setup()
litellm.set_verbose = True
# Make 5 calls with an empty success_callback
litellm.success_callback = []
start_time_empty_callback = asyncio.run(make_async_calls())
print("done with no callback test")
print("starting langfuse test")
# Make 5 calls with success_callback set to "langfuse"
litellm.success_callback = ["langfuse"]
start_time_langfuse = asyncio.run(make_async_calls())
print("done with langfuse test")
async def _test_langfuse():
response = await litellm.acompletion(
model="azure/chatgpt-v-2",
messages=[{"role": "user", "content": "This is a test"}],
max_tokens=100,
temperature=0.7,
timeout=5,
user="test_user",
)
await asyncio.sleep(1)
return response
# Compare the time for both scenarios
print(f"Time taken with success_callback='langfuse': {start_time_langfuse}")
print(f"Time taken with empty success_callback: {start_time_empty_callback}")
response = asyncio.run(_test_langfuse())
print(f"response: {response}")
# assert the diff is not more than 1 second - this was 5 seconds before the fix
assert abs(start_time_langfuse - start_time_empty_callback) < 1
# # check langfuse.log to see if there was a failed response
search_logs("langfuse.log")
except litellm.Timeout as e:
pass
except Exception as e:
pytest.fail(f"An exception occurred - {e}")
async def make_async_calls():
tasks = []
for _ in range(5):
task = asyncio.create_task(
litellm.acompletion(
model="azure/chatgpt-v-2",
messages=[{"role": "user", "content": "This is a test"}],
max_tokens=5,
temperature=0.7,
timeout=5,
user="langfuse_latency_test_user",
mock_response="It's simple to use and easy to get started",
)
)
tasks.append(task)
# Measure the start time before running the tasks
start_time = asyncio.get_event_loop().time()
# Wait for all tasks to complete
responses = await asyncio.gather(*tasks)
# Print the responses when tasks return
for idx, response in enumerate(responses):
print(f"Response from Task {idx + 1}: {response}")
# Calculate the total time taken
total_time = asyncio.get_event_loop().time() - start_time
return total_time
# def test_langfuse_logging_async_text_completion():
# try:
# pre_langfuse_setup()
|
@ -115,4 +115,103 @@ def test_s3_logging():
|
|||
print("Passed! Testing async s3 logging")
|
||||
|
||||
|
||||
test_s3_logging()
|
||||
# test_s3_logging()
|
||||
|
||||
|
||||
def test_s3_logging_r2():
|
||||
# all s3 requests need to be in one test function
|
||||
# since we are modifying stdout, and pytests runs tests in parallel
|
||||
# on circle ci - we only test litellm.acompletion()
|
||||
try:
|
||||
# redirect stdout to log_file
|
||||
# litellm.cache = litellm.Cache(
|
||||
# type="s3", s3_bucket_name="litellm-r2-bucket", s3_region_name="us-west-2"
|
||||
# )
|
||||
litellm.set_verbose = True
|
||||
from litellm._logging import verbose_logger
|
||||
import logging
|
||||
|
||||
verbose_logger.setLevel(level=logging.DEBUG)
|
||||
|
||||
litellm.success_callback = ["s3"]
|
||||
litellm.s3_callback_params = {
|
||||
"s3_bucket_name": "litellm-r2-bucket",
|
||||
"s3_aws_secret_access_key": "os.environ/R2_S3_ACCESS_KEY",
|
||||
"s3_aws_access_key_id": "os.environ/R2_S3_ACCESS_ID",
|
||||
"s3_endpoint_url": "os.environ/R2_S3_URL",
|
||||
"s3_region_name": "os.environ/R2_S3_REGION_NAME",
|
||||
}
|
||||
print("Testing async s3 logging")
|
||||
|
||||
expected_keys = []
|
||||
|
||||
import time
|
||||
|
||||
curr_time = str(time.time())
|
||||
|
||||
async def _test():
|
||||
return await litellm.acompletion(
|
||||
model="gpt-3.5-turbo",
|
||||
messages=[{"role": "user", "content": f"This is a test {curr_time}"}],
|
||||
max_tokens=10,
|
||||
temperature=0.7,
|
||||
user="ishaan-2",
|
||||
)
|
||||
|
||||
response = asyncio.run(_test())
|
||||
print(f"response: {response}")
|
||||
expected_keys.append(response.id)
|
||||
|
||||
import boto3
|
||||
|
||||
s3 = boto3.client(
|
||||
"s3",
|
||||
endpoint_url=os.getenv("R2_S3_URL"),
|
||||
region_name=os.getenv("R2_S3_REGION_NAME"),
|
||||
aws_access_key_id=os.getenv("R2_S3_ACCESS_ID"),
|
||||
aws_secret_access_key=os.getenv("R2_S3_ACCESS_KEY"),
|
||||
)
|
||||
|
||||
bucket_name = "litellm-r2-bucket"
|
||||
# List objects in the bucket
|
||||
response = s3.list_objects(Bucket=bucket_name)
|
||||
|
||||
# # Sort the objects based on the LastModified timestamp
|
||||
# objects = sorted(
|
||||
# response["Contents"], key=lambda x: x["LastModified"], reverse=True
|
||||
# )
|
||||
# # Get the keys of the most recent objects
|
||||
# most_recent_keys = [obj["Key"] for obj in objects]
|
||||
# print(most_recent_keys)
|
||||
# # for each key, get the part before "-" as the key. Do it safely
|
||||
# cleaned_keys = []
|
||||
# for key in most_recent_keys:
|
||||
# split_key = key.split("-time=")
|
||||
# cleaned_keys.append(split_key[0])
|
||||
# print("\n most recent keys", most_recent_keys)
|
||||
# print("\n cleaned keys", cleaned_keys)
|
||||
# print("\n Expected keys: ", expected_keys)
|
||||
# matches = 0
|
||||
# for key in expected_keys:
|
||||
# assert key in cleaned_keys
|
||||
|
||||
# if key in cleaned_keys:
|
||||
# matches += 1
|
||||
# # remove the match key
|
||||
# cleaned_keys.remove(key)
|
||||
# # this asserts we log, the first request + the 2nd cached request
|
||||
# print("we had two matches ! passed ", matches)
|
||||
# assert matches == 1
|
||||
# try:
|
||||
# # cleanup s3 bucket in test
|
||||
# for key in most_recent_keys:
|
||||
# s3.delete_object(Bucket=bucket_name, Key=key)
|
||||
# except:
|
||||
# # don't let cleanup fail a test
|
||||
# pass
|
||||
except Exception as e:
|
||||
pytest.fail(f"An exception occurred - {e}")
|
||||
finally:
|
||||
# post, close log file and verify
|
||||
# Reset stdout to the original value
|
||||
print("Passed! Testing async s3 logging")
|
|
@ -95,7 +95,8 @@ def test_vertex_ai():
+ litellm.vertex_code_text_models
)
litellm.set_verbose = False
litellm.vertex_project = "reliablekeys"
vertex_ai_project = "reliablekeys"
# litellm.vertex_project = "reliablekeys"
test_models = random.sample(test_models, 1)
# test_models += litellm.vertex_language_models # always test gemini-pro
@ -117,6 +118,7 @@ def test_vertex_ai():
model=model,
messages=[{"role": "user", "content": "hi"}],
temperature=0.7,
vertex_ai_project=vertex_ai_project,
)
print("\nModel Response", response)
print(response)
@ -302,10 +304,7 @@ def test_gemini_pro_vision():
assert prompt_tokens == 263 # the gemini api returns 263 to us
except Exception as e:
import traceback
traceback.print_exc()
raise e
pytest.fail(f"An exception occurred - {str(e)}")
# test_gemini_pro_vision()
@ -70,18 +70,16 @@ def test_completion_with_empty_model():
def test_completion_invalid_param_cohere():
try:
response = completion(model="command-nightly", messages=messages, top_p=1)
print(f"response: {response}")
litellm.set_verbose = True
response = completion(model="command-nightly", messages=messages, seed=12)
pytest.fail(f"This should have failed cohere does not support `seed` parameter")
except Exception as e:
if "Unsupported parameters passed: top_p" in str(e):
if " cohere does not support parameters: {'seed': 12}" in str(e):
pass
else:
pytest.fail(f"An error occurred {e}")
# test_completion_invalid_param_cohere()
def test_completion_function_call_cohere():
try:
response = completion(
@ -127,9 +127,10 @@ def test_caching_with_models_v2():
]
litellm.cache = Cache()
print("test2 for caching")
litellm.set_verbose = True
response1 = completion(model="gpt-3.5-turbo", messages=messages, caching=True)
response2 = completion(model="gpt-3.5-turbo", messages=messages, caching=True)
response3 = completion(model="command-nightly", messages=messages, caching=True)
response3 = completion(model="azure/chatgpt-v-2", messages=messages, caching=True)
print(f"response1: {response1}")
print(f"response2: {response2}")
print(f"response3: {response3}")
@ -286,7 +287,7 @@ def test_redis_cache_completion():
response3 = completion(
model="gpt-3.5-turbo", messages=messages, caching=True, temperature=0.5
)
response4 = completion(model="command-nightly", messages=messages, caching=True)
response4 = completion(model="azure/chatgpt-v-2", messages=messages, caching=True)
print("\nresponse 1", response1)
print("\nresponse 2", response2)
@ -401,7 +402,7 @@ def test_redis_cache_completion_stream():
"""
test_redis_cache_completion_stream()
# test_redis_cache_completion_stream()
def test_redis_cache_acompletion_stream():
@ -723,8 +724,8 @@ def test_cache_override():
print(f"Embedding 2 response time: {end_time - start_time} seconds")
assert (
end_time - start_time > 0.1
) # ensure 2nd response comes in over 0.1s. This should not be cached.
end_time - start_time > 0.05
) # ensure 2nd response comes in over 0.05s. This should not be cached.
# test_cache_override()
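A usage sketch of the caching pattern these tests exercise, assuming an OpenAI key is configured: two identical calls with `caching=True` should be served from the cache, while changing the model or a sampling parameter misses it. Comparing the response `id`s is one simple way to observe the hit, on the assumption that the cached response is returned verbatim.

```python
import litellm
from litellm import completion
from litellm.caching import Cache

litellm.cache = Cache()  # in-memory cache by default
msgs = [{"role": "user", "content": "what is 2 + 2?"}]

first = completion(model="gpt-3.5-turbo", messages=msgs, caching=True)
second = completion(model="gpt-3.5-turbo", messages=msgs, caching=True)
# ids should match when the second call is served from cache
print(first.id, second.id)
```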
|
@ -191,6 +191,21 @@ def test_completion_gpt4_turbo():
|
|||
# test_completion_gpt4_turbo()
|
||||
|
||||
|
||||
def test_completion_gpt4_turbo_0125():
|
||||
try:
|
||||
response = completion(
|
||||
model="gpt-4-0125-preview",
|
||||
messages=messages,
|
||||
max_tokens=10,
|
||||
)
|
||||
print(response)
|
||||
except openai.RateLimitError:
|
||||
print("got a rate liimt error")
|
||||
pass
|
||||
except Exception as e:
|
||||
pytest.fail(f"Error occurred: {e}")
|
||||
|
||||
|
||||
@pytest.mark.skip(reason="this test is flaky")
|
||||
def test_completion_gpt4_vision():
|
||||
try:
|
||||
|
@ -224,7 +239,7 @@ def test_completion_gpt4_vision():
|
|||
|
||||
|
||||
def test_completion_azure_gpt4_vision():
|
||||
# azure gpt-4 vision takes 5s to respond
|
||||
# azure/gpt-4, vision takes 5seconds to respond
|
||||
try:
|
||||
litellm.set_verbose = True
|
||||
response = completion(
|
||||
|
@ -268,7 +283,7 @@ def test_completion_azure_gpt4_vision():
|
|||
pytest.fail(f"Error occurred: {e}")
|
||||
|
||||
|
||||
test_completion_azure_gpt4_vision()
|
||||
# test_completion_azure_gpt4_vision()
|
||||
|
||||
|
||||
@pytest.mark.skip(reason="this test is flaky")
|
||||
|
@ -500,22 +515,22 @@ def hf_test_completion_tgi():
|
|||
# hf_test_error_logs()
|
||||
|
||||
|
||||
def test_completion_cohere(): # commenting for now as the cohere endpoint is being flaky
|
||||
try:
|
||||
litellm.CohereConfig(max_tokens=1000, stop_sequences=["a"])
|
||||
response = completion(
|
||||
model="command-nightly", messages=messages, logger_fn=logger_fn
|
||||
)
|
||||
# Add any assertions here to check the response
|
||||
print(response)
|
||||
response_str = response["choices"][0]["message"]["content"]
|
||||
response_str_2 = response.choices[0].message.content
|
||||
if type(response_str) != str:
|
||||
pytest.fail(f"Error occurred: {e}")
|
||||
if type(response_str_2) != str:
|
||||
pytest.fail(f"Error occurred: {e}")
|
||||
except Exception as e:
|
||||
pytest.fail(f"Error occurred: {e}")
|
||||
# def test_completion_cohere(): # commenting out,for now as the cohere endpoint is being flaky
|
||||
# try:
|
||||
# litellm.CohereConfig(max_tokens=10, stop_sequences=["a"])
|
||||
# response = completion(
|
||||
# model="command-nightly", messages=messages, logger_fn=logger_fn
|
||||
# )
|
||||
# # Add any assertions here to check the response
|
||||
# print(response)
|
||||
# response_str = response["choices"][0]["message"]["content"]
|
||||
# response_str_2 = response.choices[0].message.content
|
||||
# if type(response_str) != str:
|
||||
# pytest.fail(f"Error occurred: {e}")
|
||||
# if type(response_str_2) != str:
|
||||
# pytest.fail(f"Error occurred: {e}")
|
||||
# except Exception as e:
|
||||
# pytest.fail(f"Error occurred: {e}")
|
||||
|
||||
|
||||
# test_completion_cohere()
|
||||
|
@ -854,7 +869,7 @@ def test_completion_anyscale_with_functions():
|
|||
|
||||
|
||||
def test_completion_azure_key_completion_arg():
|
||||
# this tests if we can pass api_key to completion, when it's not in the env
|
||||
# this tests if we can pass api_key to completion, when it's not in the env.
|
||||
# DO NOT REMOVE THIS TEST. No MATTER WHAT Happens!
|
||||
# If you want to remove it, speak to Ishaan!
|
||||
# Ishaan will be very disappointed if this test is removed -> this is a standard way to pass api_key + the router + proxy use this
|
||||
|
@ -990,9 +1005,9 @@ def test_azure_openai_ad_token():
|
|||
print("azure ad token respoonse\n")
|
||||
print(response)
|
||||
litellm.input_callback = []
|
||||
except:
|
||||
except Exception as e:
|
||||
litellm.input_callback = []
|
||||
pass
|
||||
pytest.fail(f"An exception occurs - {str(e)}")
|
||||
|
||||
|
||||
# test_azure_openai_ad_token()
|
||||
|
@ -1269,6 +1284,8 @@ def test_completion_together_ai():
|
|||
"Cost for completion call together-computer/llama-2-70b: ",
|
||||
f"${float(cost):.10f}",
|
||||
)
|
||||
except litellm.Timeout as e:
|
||||
pass
|
||||
except Exception as e:
|
||||
pytest.fail(f"Error occurred: {e}")
|
||||
|
||||
|
@ -1370,16 +1387,22 @@ def test_customprompt_together_ai():
|
|||
|
||||
def test_completion_sagemaker():
|
||||
try:
|
||||
print("testing sagemaker")
|
||||
litellm.set_verbose = True
|
||||
print("testing sagemaker")
|
||||
response = completion(
|
||||
model="sagemaker/berri-benchmarking-Llama-2-70b-chat-hf-4",
|
||||
messages=messages,
|
||||
temperature=0.2,
|
||||
max_tokens=80,
|
||||
input_cost_per_second=0.000420,
|
||||
)
|
||||
# Add any assertions here to check the response
|
||||
print(response)
|
||||
cost = completion_cost(completion_response=response)
|
||||
print("calculated cost", cost)
|
||||
assert (
|
||||
cost > 0.0 and cost < 1.0
|
||||
) # should never be > $1 for a single completion call
|
||||
except Exception as e:
|
||||
pytest.fail(f"Error occurred: {e}")
|
||||
|
||||
|
@ -1387,6 +1410,36 @@ def test_completion_sagemaker():
|
|||
# test_completion_sagemaker()
|
||||
|
||||
|
||||
def test_completion_sagemaker_stream():
|
||||
try:
|
||||
litellm.set_verbose = False
|
||||
print("testing sagemaker")
|
||||
response = completion(
|
||||
model="sagemaker/berri-benchmarking-Llama-2-70b-chat-hf-4",
|
||||
messages=messages,
|
||||
temperature=0.2,
|
||||
max_tokens=80,
|
||||
stream=True,
|
||||
)
|
||||
|
||||
complete_streaming_response = ""
|
||||
first_chunk_id, chunk_id = None, None
|
||||
for i, chunk in enumerate(response):
|
||||
print(chunk)
|
||||
chunk_id = chunk.id
|
||||
print(chunk_id)
|
||||
if i == 0:
|
||||
first_chunk_id = chunk_id
|
||||
else:
|
||||
assert chunk_id == first_chunk_id
|
||||
complete_streaming_response += chunk.choices[0].delta.content or ""
|
||||
# Add any assertions here to check the response
|
||||
# print(response)
|
||||
assert len(complete_streaming_response) > 0
|
||||
except Exception as e:
|
||||
pytest.fail(f"Error occurred: {e}")
|
||||
|
||||
|
||||
def test_completion_chat_sagemaker():
|
||||
try:
|
||||
messages = [{"role": "user", "content": "Hey, how's it going?"}]
|
||||
|
|
|
@ -124,7 +124,7 @@ def test_cost_azure_gpt_35():
)
test_cost_azure_gpt_35()
# test_cost_azure_gpt_35()
def test_cost_azure_embedding():
@ -158,3 +158,78 @@ def test_cost_azure_embedding():
# test_cost_azure_embedding()
def test_cost_openai_image_gen():
cost = litellm.completion_cost(
model="dall-e-2", size="1024-x-1024", quality="standard", n=1
)
assert cost == 0.019922944
def test_cost_bedrock_pricing():
"""
- get pricing specific to region for a model
"""
from litellm import ModelResponse, Choices, Message
from litellm.utils import Usage
litellm.set_verbose = True
input_tokens = litellm.token_counter(
model="bedrock/anthropic.claude-instant-v1",
messages=[{"role": "user", "content": "Hey, how's it going?"}],
)
print(f"input_tokens: {input_tokens}")
output_tokens = litellm.token_counter(
model="bedrock/anthropic.claude-instant-v1",
text="It's all going well",
count_response_tokens=True,
)
print(f"output_tokens: {output_tokens}")
resp = ModelResponse(
id="chatcmpl-e41836bb-bb8b-4df2-8e70-8f3e160155ac",
choices=[
Choices(
finish_reason=None,
index=0,
message=Message(
content="It's all going well",
role="assistant",
),
)
],
created=1700775391,
model="anthropic.claude-instant-v1",
object="chat.completion",
system_fingerprint=None,
usage=Usage(
prompt_tokens=input_tokens,
completion_tokens=output_tokens,
total_tokens=input_tokens + output_tokens,
),
)
resp._hidden_params = {
"custom_llm_provider": "bedrock",
"region_name": "ap-northeast-1",
}
cost = litellm.completion_cost(
model="anthropic.claude-instant-v1",
completion_response=resp,
messages=[{"role": "user", "content": "Hey, how's it going?"}],
)
predicted_cost = input_tokens * 0.00000223 + 0.00000755 * output_tokens
assert cost == predicted_cost
def test_cost_bedrock_pricing_actual_calls():
litellm.set_verbose = True
model = "anthropic.claude-instant-v1"
messages = [{"role": "user", "content": "Hey, how's it going?"}]
response = litellm.completion(model=model, messages=messages)
assert response._hidden_params["region_name"] is not None
cost = litellm.completion_cost(
completion_response=response,
messages=[{"role": "user", "content": "Hey, how's it going?"}],
)
assert cost > 0
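The cost assertion above reduces to simple per-token arithmetic: total cost is prompt tokens times the input price plus completion tokens times the output price. A worked example with the same ap-northeast-1 claude-instant-v1 rates used in the test (these prices are specific to that test, not general constants):

```python
def token_cost(prompt_tokens: int, completion_tokens: int,
               input_price: float = 0.00000223,
               output_price: float = 0.00000755) -> float:
    # cost = prompt_tokens * input_price + completion_tokens * output_price
    return prompt_tokens * input_price + completion_tokens * output_price

# e.g. 14 prompt tokens + 9 completion tokens:
# 14 * 0.00000223 + 9 * 0.00000755 = 0.00003122 + 0.00006795 = 0.00009917 USD
print(token_cost(14, 9))
```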
@ -13,4 +13,4 @@ async def user_api_key_auth(request: Request, api_key: str) -> UserAPIKeyAuth:
return UserAPIKeyAuth(api_key=api_key)
raise Exception
except:
raise Exception
raise Exception("Failed custom auth")
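For context, a hedged sketch of a custom auth hook with the signature shown in the hunk above: validate the incoming key, return `UserAPIKeyAuth` on success, and raise with a clear message otherwise. The accepted key value below is illustrative only; only the shape of the function is taken from the diff.

```python
from fastapi import Request
from litellm.proxy._types import UserAPIKeyAuth


async def user_api_key_auth(request: Request, api_key: str) -> UserAPIKeyAuth:
    try:
        if api_key == "sk-my-expected-key":  # replace with a real key lookup
            return UserAPIKeyAuth(api_key=api_key)
        raise Exception
    except Exception:
        # surface a descriptive error instead of a bare Exception
        raise Exception("Failed custom auth")
```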
|
@ -53,9 +53,9 @@ model_list:
|
|||
api_key: os.environ/AZURE_API_KEY
|
||||
api_version: 2023-07-01-preview
|
||||
model: azure/azure-embedding-model
|
||||
model_name: azure-embedding-model
|
||||
model_info:
|
||||
mode: "embedding"
|
||||
mode: embedding
|
||||
model_name: azure-embedding-model
|
||||
- litellm_params:
|
||||
model: gpt-3.5-turbo
|
||||
model_info:
|
||||
|
@ -80,43 +80,49 @@ model_list:
|
|||
description: this is a test openai model
|
||||
id: 9b1ef341-322c-410a-8992-903987fef439
|
||||
model_name: test_openai_models
|
||||
- model_name: amazon-embeddings
|
||||
litellm_params:
|
||||
model: "bedrock/amazon.titan-embed-text-v1"
|
||||
- litellm_params:
|
||||
model: bedrock/amazon.titan-embed-text-v1
|
||||
model_info:
|
||||
mode: embedding
|
||||
- model_name: "GPT-J 6B - Sagemaker Text Embedding (Internal)"
|
||||
litellm_params:
|
||||
model: "sagemaker/berri-benchmarking-gpt-j-6b-fp16"
|
||||
model_name: amazon-embeddings
|
||||
- litellm_params:
|
||||
model: sagemaker/berri-benchmarking-gpt-j-6b-fp16
|
||||
model_info:
|
||||
mode: embedding
|
||||
- model_name: dall-e-3
|
||||
litellm_params:
|
||||
model_name: GPT-J 6B - Sagemaker Text Embedding (Internal)
|
||||
- litellm_params:
|
||||
model: dall-e-3
|
||||
model_info:
|
||||
mode: image_generation
|
||||
- model_name: dall-e-3
|
||||
litellm_params:
|
||||
model: "azure/dall-e-3-test"
|
||||
api_version: "2023-12-01-preview"
|
||||
api_base: "os.environ/AZURE_SWEDEN_API_BASE"
|
||||
api_key: "os.environ/AZURE_SWEDEN_API_KEY"
|
||||
model_name: dall-e-3
|
||||
- litellm_params:
|
||||
api_base: os.environ/AZURE_SWEDEN_API_BASE
|
||||
api_key: os.environ/AZURE_SWEDEN_API_KEY
|
||||
api_version: 2023-12-01-preview
|
||||
model: azure/dall-e-3-test
|
||||
model_info:
|
||||
mode: image_generation
|
||||
- model_name: dall-e-2
|
||||
litellm_params:
|
||||
model: "azure/"
|
||||
api_version: "2023-06-01-preview"
|
||||
api_base: "os.environ/AZURE_API_BASE"
|
||||
api_key: "os.environ/AZURE_API_KEY"
|
||||
model_name: dall-e-3
|
||||
- litellm_params:
|
||||
api_base: os.environ/AZURE_API_BASE
|
||||
api_key: os.environ/AZURE_API_KEY
|
||||
api_version: 2023-06-01-preview
|
||||
model: azure/
|
||||
model_info:
|
||||
mode: image_generation
|
||||
- model_name: text-embedding-ada-002
|
||||
litellm_params:
|
||||
model_name: dall-e-2
|
||||
- litellm_params:
|
||||
api_base: os.environ/AZURE_API_BASE
|
||||
api_key: os.environ/AZURE_API_KEY
|
||||
api_version: 2023-07-01-preview
|
||||
model: azure/azure-embedding-model
|
||||
api_base: "os.environ/AZURE_API_BASE"
|
||||
api_key: "os.environ/AZURE_API_KEY"
|
||||
api_version: "2023-07-01-preview"
|
||||
model_info:
|
||||
base_model: text-embedding-ada-002
|
||||
mode: embedding
|
||||
base_model: text-embedding-ada-002
|
||||
model_name: text-embedding-ada-002
|
||||
- litellm_params:
|
||||
model: gpt-3.5-turbo
|
||||
model_info:
|
||||
description: this is a test openai model
|
||||
id: 34cb2419-7c63-44ae-a189-53f1d1ce5953
|
||||
model_name: test_openai_models
|
||||
|
|
|
@ -74,6 +74,7 @@ class CompletionCustomHandler(
|
|||
|
||||
def log_post_api_call(self, kwargs, response_obj, start_time, end_time):
|
||||
try:
|
||||
print(f"kwargs: {kwargs}")
|
||||
self.states.append("post_api_call")
|
||||
## START TIME
|
||||
assert isinstance(start_time, datetime)
|
||||
|
@ -149,7 +150,14 @@ class CompletionCustomHandler(
|
|||
## END TIME
|
||||
assert isinstance(end_time, datetime)
|
||||
## RESPONSE OBJECT
|
||||
assert isinstance(response_obj, litellm.ModelResponse)
|
||||
assert isinstance(
|
||||
response_obj,
|
||||
(
|
||||
litellm.ModelResponse,
|
||||
litellm.EmbeddingResponse,
|
||||
litellm.ImageResponse,
|
||||
),
|
||||
)
|
||||
## KWARGS
|
||||
assert isinstance(kwargs["model"], str)
|
||||
assert isinstance(kwargs["messages"], list) and isinstance(
|
||||
|
@ -170,12 +178,14 @@ class CompletionCustomHandler(
|
|||
)
|
||||
assert isinstance(kwargs["additional_args"], (dict, type(None)))
|
||||
assert isinstance(kwargs["log_event_type"], str)
|
||||
assert isinstance(kwargs["response_cost"], (float, type(None)))
|
||||
except:
|
||||
print(f"Assertion Error: {traceback.format_exc()}")
|
||||
self.errors.append(traceback.format_exc())
|
||||
|
||||
def log_failure_event(self, kwargs, response_obj, start_time, end_time):
|
||||
try:
|
||||
print(f"kwargs: {kwargs}")
|
||||
self.states.append("sync_failure")
|
||||
## START TIME
|
||||
assert isinstance(start_time, datetime)
|
||||
|
@ -262,6 +272,7 @@ class CompletionCustomHandler(
|
|||
assert isinstance(kwargs["additional_args"], (dict, type(None)))
|
||||
assert isinstance(kwargs["log_event_type"], str)
|
||||
assert kwargs["cache_hit"] is None or isinstance(kwargs["cache_hit"], bool)
|
||||
assert isinstance(kwargs["response_cost"], (float, type(None)))
|
||||
except:
|
||||
print(f"Assertion Error: {traceback.format_exc()}")
|
||||
self.errors.append(traceback.format_exc())
|
||||
|
@ -545,8 +556,50 @@ async def test_async_chat_bedrock_stream():
|
|||
|
||||
# asyncio.run(test_async_chat_bedrock_stream())
|
||||
|
||||
# Text Completion
|
||||
|
||||
|
||||
## Test Sagemaker + Async
|
||||
@pytest.mark.asyncio
|
||||
async def test_async_chat_sagemaker_stream():
|
||||
try:
|
||||
customHandler = CompletionCustomHandler()
|
||||
litellm.callbacks = [customHandler]
|
||||
response = await litellm.acompletion(
|
||||
model="sagemaker/berri-benchmarking-Llama-2-70b-chat-hf-4",
|
||||
messages=[{"role": "user", "content": "Hi 👋 - i'm async sagemaker"}],
|
||||
)
|
||||
# test streaming
|
||||
response = await litellm.acompletion(
|
||||
model="sagemaker/berri-benchmarking-Llama-2-70b-chat-hf-4",
|
||||
messages=[{"role": "user", "content": "Hi 👋 - i'm async sagemaker"}],
|
||||
stream=True,
|
||||
)
|
||||
print(f"response: {response}")
|
||||
async for chunk in response:
|
||||
print(f"chunk: {chunk}")
|
||||
continue
|
||||
## test failure callback
|
||||
try:
|
||||
response = await litellm.acompletion(
|
||||
model="sagemaker/berri-benchmarking-Llama-2-70b-chat-hf-4",
|
||||
messages=[{"role": "user", "content": "Hi 👋 - i'm async sagemaker"}],
|
||||
aws_region_name="my-bad-key",
|
||||
stream=True,
|
||||
)
|
||||
async for chunk in response:
|
||||
continue
|
||||
except:
|
||||
pass
|
||||
time.sleep(1)
|
||||
print(f"customHandler.errors: {customHandler.errors}")
|
||||
assert len(customHandler.errors) == 0
|
||||
litellm.callbacks = []
|
||||
except Exception as e:
|
||||
pytest.fail(f"An exception occurred: {str(e)}")
|
||||
|
||||
|
||||
# Text Completion
|
||||
|
||||
|
||||
## Test OpenAI text completion + Async
|
||||
@pytest.mark.asyncio
|
||||
async def test_async_text_completion_openai_stream():
|
||||
|
@ -585,6 +638,7 @@ async def test_async_text_completion_openai_stream():
|
|||
except Exception as e:
|
||||
pytest.fail(f"An exception occurred: {str(e)}")
|
||||
|
||||
|
||||
# EMBEDDING
|
||||
## Test OpenAI + Async
|
||||
@pytest.mark.asyncio
|
||||
|
@ -762,6 +816,54 @@ async def test_async_embedding_azure_caching():
|
|||
assert len(customHandler_caching.states) == 4 # pre, post, success, success
|
||||
|
||||
|
||||
# asyncio.run(
|
||||
# test_async_embedding_azure_caching()
|
||||
# )
|
||||
# Image Generation
|
||||
|
||||
|
||||
## Test OpenAI + Sync
|
||||
def test_image_generation_openai():
|
||||
try:
|
||||
customHandler_success = CompletionCustomHandler()
|
||||
customHandler_failure = CompletionCustomHandler()
|
||||
# litellm.callbacks = [customHandler_success]
|
||||
|
||||
# litellm.set_verbose = True
|
||||
|
||||
# response = litellm.image_generation(
|
||||
# prompt="A cute baby sea otter", model="dall-e-3"
|
||||
# )
|
||||
|
||||
# print(f"response: {response}")
|
||||
# assert len(response.data) > 0
|
||||
|
||||
# print(f"customHandler_success.errors: {customHandler_success.errors}")
|
||||
# print(f"customHandler_success.states: {customHandler_success.states}")
|
||||
# assert len(customHandler_success.errors) == 0
|
||||
# assert len(customHandler_success.states) == 3 # pre, post, success
|
||||
# test failure callback
|
||||
litellm.callbacks = [customHandler_failure]
|
||||
try:
|
||||
response = litellm.image_generation(
|
||||
prompt="A cute baby sea otter",
|
||||
model="dall-e-2",
|
||||
api_key="my-bad-api-key",
|
||||
)
|
||||
except:
|
||||
pass
|
||||
print(f"customHandler_failure.errors: {customHandler_failure.errors}")
|
||||
print(f"customHandler_failure.states: {customHandler_failure.states}")
|
||||
assert len(customHandler_failure.errors) == 0
|
||||
assert len(customHandler_failure.states) == 3 # pre, post, failure
|
||||
except litellm.RateLimitError as e:
|
||||
pass
|
||||
except litellm.ContentPolicyViolationError:
|
||||
pass # OpenAI randomly raises these errors - skip when they occur
|
||||
except Exception as e:
|
||||
pytest.fail(f"An exception occurred - {str(e)}")
|
||||
|
||||
|
||||
test_image_generation_openai()
|
||||
## Test OpenAI + Async
|
||||
|
||||
## Test Azure + Sync
|
||||
|
||||
## Test Azure + Async
|
||||
|
|
|
@ -1,56 +1,58 @@
|
|||
### What this tests ####
|
||||
import sys, os, time, inspect, asyncio, traceback
|
||||
import pytest
|
||||
sys.path.insert(0, os.path.abspath('../..'))
|
||||
|
||||
sys.path.insert(0, os.path.abspath("../.."))
|
||||
|
||||
from litellm import completion, embedding
|
||||
import litellm
|
||||
from litellm.integrations.custom_logger import CustomLogger
|
||||
|
||||
|
||||
class MyCustomHandler(CustomLogger):
|
||||
complete_streaming_response_in_callback = ""
|
||||
|
||||
def __init__(self):
|
||||
self.success: bool = False # type: ignore
|
||||
self.failure: bool = False # type: ignore
|
||||
self.async_success: bool = False # type: ignore
|
||||
self.success: bool = False # type: ignore
|
||||
self.failure: bool = False # type: ignore
|
||||
self.async_success: bool = False # type: ignore
|
||||
self.async_success_embedding: bool = False # type: ignore
|
||||
self.async_failure: bool = False # type: ignore
|
||||
self.async_failure: bool = False # type: ignore
|
||||
self.async_failure_embedding: bool = False # type: ignore
|
||||
|
||||
self.async_completion_kwargs = None # type: ignore
|
||||
self.async_embedding_kwargs = None # type: ignore
|
||||
self.async_embedding_response = None # type: ignore
|
||||
self.async_completion_kwargs = None # type: ignore
|
||||
self.async_embedding_kwargs = None # type: ignore
|
||||
self.async_embedding_response = None # type: ignore
|
||||
|
||||
self.async_completion_kwargs_fail = None # type: ignore
|
||||
self.async_embedding_kwargs_fail = None # type: ignore
|
||||
self.async_completion_kwargs_fail = None # type: ignore
|
||||
self.async_embedding_kwargs_fail = None # type: ignore
|
||||
|
||||
self.stream_collected_response = None # type: ignore
|
||||
self.sync_stream_collected_response = None # type: ignore
|
||||
self.user = None # type: ignore
|
||||
self.stream_collected_response = None # type: ignore
|
||||
self.sync_stream_collected_response = None # type: ignore
|
||||
self.user = None # type: ignore
|
||||
self.data_sent_to_api: dict = {}
|
||||
|
||||
def log_pre_api_call(self, model, messages, kwargs):
|
||||
def log_pre_api_call(self, model, messages, kwargs):
|
||||
print(f"Pre-API Call")
|
||||
self.data_sent_to_api = kwargs["additional_args"].get("complete_input_dict", {})
|
||||
|
||||
def log_post_api_call(self, kwargs, response_obj, start_time, end_time):
|
||||
|
||||
def log_post_api_call(self, kwargs, response_obj, start_time, end_time):
|
||||
print(f"Post-API Call")
|
||||
|
||||
|
||||
def log_stream_event(self, kwargs, response_obj, start_time, end_time):
|
||||
print(f"On Stream")
|
||||
|
||||
def log_success_event(self, kwargs, response_obj, start_time, end_time):
|
||||
|
||||
def log_success_event(self, kwargs, response_obj, start_time, end_time):
|
||||
print(f"On Success")
|
||||
self.success = True
|
||||
if kwargs.get("stream") == True:
|
||||
self.sync_stream_collected_response = response_obj
|
||||
|
||||
|
||||
def log_failure_event(self, kwargs, response_obj, start_time, end_time):
|
||||
def log_failure_event(self, kwargs, response_obj, start_time, end_time):
|
||||
print(f"On Failure")
|
||||
self.failure = True
|
||||
|
||||
async def async_log_success_event(self, kwargs, response_obj, start_time, end_time):
|
||||
async def async_log_success_event(self, kwargs, response_obj, start_time, end_time):
|
||||
print(f"On Async success")
|
||||
print(f"received kwargs user: {kwargs['user']}")
|
||||
self.async_success = True
|
||||
|
@ -62,24 +64,30 @@ class MyCustomHandler(CustomLogger):
|
|||
self.stream_collected_response = response_obj
|
||||
self.async_completion_kwargs = kwargs
|
||||
self.user = kwargs.get("user", None)
|
||||
|
||||
async def async_log_failure_event(self, kwargs, response_obj, start_time, end_time):
|
||||
|
||||
async def async_log_failure_event(self, kwargs, response_obj, start_time, end_time):
|
||||
print(f"On Async Failure")
|
||||
self.async_failure = True
|
||||
if kwargs.get("model") == "text-embedding-ada-002":
|
||||
self.async_failure_embedding = True
|
||||
self.async_embedding_kwargs_fail = kwargs
|
||||
|
||||
|
||||
self.async_completion_kwargs_fail = kwargs
|
||||
|
||||
|
||||
class TmpFunction:
|
||||
complete_streaming_response_in_callback = ""
|
||||
async_success: bool = False
|
||||
|
||||
async def async_test_logging_fn(self, kwargs, completion_obj, start_time, end_time):
|
||||
print(f"ON ASYNC LOGGING")
|
||||
self.async_success = True
|
||||
print(f'kwargs.get("complete_streaming_response"): {kwargs.get("complete_streaming_response")}')
|
||||
self.complete_streaming_response_in_callback = kwargs.get("complete_streaming_response")
|
||||
print(
|
||||
f'kwargs.get("complete_streaming_response"): {kwargs.get("complete_streaming_response")}'
|
||||
)
|
||||
self.complete_streaming_response_in_callback = kwargs.get(
|
||||
"complete_streaming_response"
|
||||
)
|
||||
|
||||
|
||||
def test_async_chat_openai_stream():
|
||||
|
@ -88,29 +96,39 @@ def test_async_chat_openai_stream():
|
|||
# litellm.set_verbose = True
|
||||
litellm.success_callback = [tmp_function.async_test_logging_fn]
|
||||
complete_streaming_response = ""
|
||||
|
||||
async def call_gpt():
|
||||
nonlocal complete_streaming_response
|
||||
response = await litellm.acompletion(model="gpt-3.5-turbo",
|
||||
messages=[{
|
||||
"role": "user",
|
||||
"content": "Hi 👋 - i'm openai"
|
||||
}],
|
||||
stream=True)
|
||||
async for chunk in response:
|
||||
complete_streaming_response += chunk["choices"][0]["delta"]["content"] or ""
|
||||
response = await litellm.acompletion(
|
||||
model="gpt-3.5-turbo",
|
||||
messages=[{"role": "user", "content": "Hi 👋 - i'm openai"}],
|
||||
stream=True,
|
||||
)
|
||||
async for chunk in response:
|
||||
complete_streaming_response += (
|
||||
chunk["choices"][0]["delta"]["content"] or ""
|
||||
)
|
||||
print(complete_streaming_response)
|
||||
|
||||
asyncio.run(call_gpt())
|
||||
complete_streaming_response = complete_streaming_response.strip("'")
|
||||
response1 = tmp_function.complete_streaming_response_in_callback["choices"][0]["message"]["content"]
|
||||
response1 = tmp_function.complete_streaming_response_in_callback["choices"][0][
|
||||
"message"
|
||||
]["content"]
|
||||
response2 = complete_streaming_response
|
||||
# assert [ord(c) for c in response1] == [ord(c) for c in response2]
|
||||
print(f"response1: {response1}")
|
||||
print(f"response2: {response2}")
|
||||
assert response1 == response2
|
||||
assert tmp_function.async_success == True
|
||||
except Exception as e:
|
||||
print(e)
|
||||
pytest.fail(f"An error occurred - {str(e)}")
|
||||
|
||||
|
||||
# test_async_chat_openai_stream()
|
||||
|
||||
|
||||
def test_completion_azure_stream_moderation_failure():
|
||||
try:
|
||||
customHandler = MyCustomHandler()
|
||||
|
@ -122,11 +140,11 @@ def test_completion_azure_stream_moderation_failure():
|
|||
"content": "how do i kill someone",
|
||||
},
|
||||
]
|
||||
try:
|
||||
try:
|
||||
response = completion(
|
||||
model="azure/chatgpt-v-2", messages=messages, stream=True
|
||||
)
|
||||
for chunk in response:
|
||||
for chunk in response:
|
||||
print(f"chunk: {chunk}")
|
||||
continue
|
||||
except Exception as e:
|
||||
|
@ -139,7 +157,7 @@ def test_completion_azure_stream_moderation_failure():
|
|||
|
||||
def test_async_custom_handler_stream():
|
||||
try:
|
||||
# [PROD Test] - Do not DELETE
|
||||
# [PROD Test] - Do not DELETE
|
||||
# checks if the model response available in the async + stream callbacks is equal to the received response
|
||||
customHandler2 = MyCustomHandler()
|
||||
litellm.callbacks = [customHandler2]
|
||||
|
@ -152,59 +170,64 @@ def test_async_custom_handler_stream():
|
|||
},
|
||||
]
|
||||
complete_streaming_response = ""
|
||||
|
||||
async def test_1():
|
||||
nonlocal complete_streaming_response
|
||||
response = await litellm.acompletion(
|
||||
model="azure/chatgpt-v-2",
|
||||
messages=messages,
|
||||
stream=True
|
||||
model="azure/chatgpt-v-2", messages=messages, stream=True
|
||||
)
|
||||
async for chunk in response:
|
||||
complete_streaming_response += chunk["choices"][0]["delta"]["content"] or ""
|
||||
async for chunk in response:
|
||||
complete_streaming_response += (
|
||||
chunk["choices"][0]["delta"]["content"] or ""
|
||||
)
|
||||
print(complete_streaming_response)
|
||||
|
||||
|
||||
asyncio.run(test_1())
|
||||
|
||||
response_in_success_handler = customHandler2.stream_collected_response
|
||||
response_in_success_handler = response_in_success_handler["choices"][0]["message"]["content"]
|
||||
response_in_success_handler = response_in_success_handler["choices"][0][
|
||||
"message"
|
||||
]["content"]
|
||||
print("\n\n")
|
||||
print("response_in_success_handler: ", response_in_success_handler)
|
||||
print("complete_streaming_response: ", complete_streaming_response)
|
||||
assert response_in_success_handler == complete_streaming_response
|
||||
except Exception as e:
|
||||
pytest.fail(f"Error occurred: {e}")
|
||||
|
||||
|
||||
# test_async_custom_handler_stream()
|
||||
|
||||
|
||||
def test_azure_completion_stream():
|
||||
# [PROD Test] - Do not DELETE
|
||||
# [PROD Test] - Do not DELETE
|
||||
# test if completion() + sync custom logger get the same complete stream response
|
||||
try:
|
||||
# checks if the model response available in the async + stream callbacks is equal to the received response
|
||||
customHandler2 = MyCustomHandler()
|
||||
litellm.callbacks = [customHandler2]
|
||||
litellm.set_verbose = False
|
||||
litellm.set_verbose = True
|
||||
messages = [
|
||||
{"role": "system", "content": "You are a helpful assistant."},
|
||||
{
|
||||
"role": "user",
|
||||
"content": "write 1 sentence about litellm being amazing",
|
||||
"content": f"write 1 sentence about litellm being amazing {time.time()}",
|
||||
},
|
||||
]
|
||||
complete_streaming_response = ""
|
||||
|
||||
response = litellm.completion(
|
||||
model="azure/chatgpt-v-2",
|
||||
messages=messages,
|
||||
stream=True
|
||||
model="azure/chatgpt-v-2", messages=messages, stream=True
|
||||
)
|
||||
for chunk in response:
|
||||
for chunk in response:
|
||||
complete_streaming_response += chunk["choices"][0]["delta"]["content"] or ""
|
||||
print(complete_streaming_response)
|
||||
|
||||
time.sleep(0.5) # wait 1/2 second before checking callbacks
|
||||
|
||||
time.sleep(0.5) # wait 1/2 second before checking callbacks
|
||||
response_in_success_handler = customHandler2.sync_stream_collected_response
|
||||
response_in_success_handler = response_in_success_handler["choices"][0]["message"]["content"]
|
||||
response_in_success_handler = response_in_success_handler["choices"][0][
|
||||
"message"
|
||||
]["content"]
|
||||
print("\n\n")
|
||||
print("response_in_success_handler: ", response_in_success_handler)
|
||||
print("complete_streaming_response: ", complete_streaming_response)
|
||||
|
@ -212,24 +235,32 @@ def test_azure_completion_stream():
|
|||
except Exception as e:
|
||||
pytest.fail(f"Error occurred: {e}")
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_async_custom_handler_completion():
|
||||
try:
|
||||
async def test_async_custom_handler_completion():
|
||||
try:
|
||||
customHandler_success = MyCustomHandler()
|
||||
customHandler_failure = MyCustomHandler()
|
||||
# success
|
||||
assert customHandler_success.async_success == False
|
||||
litellm.callbacks = [customHandler_success]
|
||||
response = await litellm.acompletion(
|
||||
model="gpt-3.5-turbo",
|
||||
messages=[{
|
||||
model="gpt-3.5-turbo",
|
||||
messages=[
|
||||
{
|
||||
"role": "user",
|
||||
"content": "hello from litellm test",
|
||||
}]
|
||||
)
|
||||
}
|
||||
],
|
||||
)
|
||||
await asyncio.sleep(1)
|
||||
assert customHandler_success.async_success == True, "async success is not set to True even after success"
|
||||
assert customHandler_success.async_completion_kwargs.get("model") == "gpt-3.5-turbo"
|
||||
assert (
|
||||
customHandler_success.async_success == True
|
||||
), "async success is not set to True even after success"
|
||||
assert (
|
||||
customHandler_success.async_completion_kwargs.get("model")
|
||||
== "gpt-3.5-turbo"
|
||||
)
|
||||
# failure
|
||||
litellm.callbacks = [customHandler_failure]
|
||||
messages = [
|
||||
|
@ -240,80 +271,119 @@ async def test_async_custom_handler_completion():
|
|||
},
|
||||
]
|
||||
|
||||
assert customHandler_failure.async_failure == False
|
||||
try:
|
||||
assert customHandler_failure.async_failure == False
|
||||
try:
|
||||
response = await litellm.acompletion(
|
||||
model="gpt-3.5-turbo",
|
||||
messages=messages,
|
||||
api_key="my-bad-key",
|
||||
)
|
||||
model="gpt-3.5-turbo",
|
||||
messages=messages,
|
||||
api_key="my-bad-key",
|
||||
)
|
||||
except:
|
||||
pass
|
||||
assert customHandler_failure.async_failure == True, "async failure is not set to True even after failure"
|
||||
assert customHandler_failure.async_completion_kwargs_fail.get("model") == "gpt-3.5-turbo"
|
||||
assert len(str(customHandler_failure.async_completion_kwargs_fail.get("exception"))) > 10 # expect APIError("OpenAIException - Error code: 401 - {'error': {'message': 'Incorrect API key provided: test. You can find your API key at https://platform.openai.com/account/api-keys.', 'type': 'invalid_request_error', 'param': None, 'code': 'invalid_api_key'}}"), 'traceback_exception': 'Traceback (most recent call last):\n File "/Users/ishaanjaffer/Github/litellm/litellm/llms/openai.py", line 269, in acompletion\n response = await openai_aclient.chat.completions.create(**data)\n File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/openai/resources/chat/completions.py", line 119
|
||||
assert (
|
||||
customHandler_failure.async_failure == True
|
||||
), "async failure is not set to True even after failure"
|
||||
assert (
|
||||
customHandler_failure.async_completion_kwargs_fail.get("model")
|
||||
== "gpt-3.5-turbo"
|
||||
)
|
||||
assert (
|
||||
len(
|
||||
str(customHandler_failure.async_completion_kwargs_fail.get("exception"))
|
||||
)
|
||||
> 10
|
||||
) # expect APIError("OpenAIException - Error code: 401 - {'error': {'message': 'Incorrect API key provided: test. You can find your API key at https://platform.openai.com/account/api-keys.', 'type': 'invalid_request_error', 'param': None, 'code': 'invalid_api_key'}}"), 'traceback_exception': 'Traceback (most recent call last):\n File "/Users/ishaanjaffer/Github/litellm/litellm/llms/openai.py", line 269, in acompletion\n response = await openai_aclient.chat.completions.create(**data)\n File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/openai/resources/chat/completions.py", line 119
|
||||
litellm.callbacks = []
|
||||
print("Passed setting async failure")
|
||||
except Exception as e:
|
||||
pytest.fail(f"An exception occurred - {str(e)}")
|
||||
|
||||
|
||||
# asyncio.run(test_async_custom_handler_completion())
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_async_custom_handler_embedding():
|
||||
try:
|
||||
async def test_async_custom_handler_embedding():
|
||||
try:
|
||||
customHandler_embedding = MyCustomHandler()
|
||||
litellm.callbacks = [customHandler_embedding]
|
||||
# success
|
||||
assert customHandler_embedding.async_success_embedding == False
|
||||
response = await litellm.aembedding(
|
||||
model="text-embedding-ada-002",
|
||||
input = ["hello world"],
|
||||
)
|
||||
model="text-embedding-ada-002",
|
||||
input=["hello world"],
|
||||
)
|
||||
await asyncio.sleep(1)
|
||||
assert customHandler_embedding.async_success_embedding == True, "async_success_embedding is not set to True even after success"
|
||||
assert customHandler_embedding.async_embedding_kwargs.get("model") == "text-embedding-ada-002"
|
||||
assert customHandler_embedding.async_embedding_response["usage"]["prompt_tokens"] ==2
|
||||
assert (
|
||||
customHandler_embedding.async_success_embedding == True
|
||||
), "async_success_embedding is not set to True even after success"
|
||||
assert (
|
||||
customHandler_embedding.async_embedding_kwargs.get("model")
|
||||
== "text-embedding-ada-002"
|
||||
)
|
||||
assert (
|
||||
customHandler_embedding.async_embedding_response["usage"]["prompt_tokens"]
|
||||
== 2
|
||||
)
|
||||
print("Passed setting async success: Embedding")
|
||||
# failure
|
||||
# failure
|
||||
assert customHandler_embedding.async_failure_embedding == False
|
||||
try:
|
||||
try:
|
||||
response = await litellm.aembedding(
|
||||
model="text-embedding-ada-002",
|
||||
input = ["hello world"],
|
||||
api_key="my-bad-key",
|
||||
)
|
||||
except:
|
||||
model="text-embedding-ada-002",
|
||||
input=["hello world"],
|
||||
api_key="my-bad-key",
|
||||
)
|
||||
except:
|
||||
pass
|
||||
assert customHandler_embedding.async_failure_embedding == True, "async failure embedding is not set to True even after failure"
|
||||
assert customHandler_embedding.async_embedding_kwargs_fail.get("model") == "text-embedding-ada-002"
|
||||
assert len(str(customHandler_embedding.async_embedding_kwargs_fail.get("exception"))) > 10 # expect APIError("OpenAIException - Error code: 401 - {'error': {'message': 'Incorrect API key provided: test. You can find your API key at https://platform.openai.com/account/api-keys.', 'type': 'invalid_request_error', 'param': None, 'code': 'invalid_api_key'}}"), 'traceback_exception': 'Traceback (most recent call last):\n File "/Users/ishaanjaffer/Github/litellm/litellm/llms/openai.py", line 269, in acompletion\n response = await openai_aclient.chat.completions.create(**data)\n File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/openai/resources/chat/completions.py", line 119
|
||||
assert (
|
||||
customHandler_embedding.async_failure_embedding == True
|
||||
), "async failure embedding is not set to True even after failure"
|
||||
assert (
|
||||
customHandler_embedding.async_embedding_kwargs_fail.get("model")
|
||||
== "text-embedding-ada-002"
|
||||
)
|
||||
assert (
|
||||
len(
|
||||
str(
|
||||
customHandler_embedding.async_embedding_kwargs_fail.get("exception")
|
||||
)
|
||||
)
|
||||
> 10
|
||||
) # expect APIError("OpenAIException - Error code: 401 - {'error': {'message': 'Incorrect API key provided: test. You can find your API key at https://platform.openai.com/account/api-keys.', 'type': 'invalid_request_error', 'param': None, 'code': 'invalid_api_key'}}"), 'traceback_exception': 'Traceback (most recent call last):\n File "/Users/ishaanjaffer/Github/litellm/litellm/llms/openai.py", line 269, in acompletion\n response = await openai_aclient.chat.completions.create(**data)\n File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/openai/resources/chat/completions.py", line 119
|
||||
except Exception as e:
|
||||
pytest.fail(f"An exception occurred - {str(e)}")
|
||||
|
||||
|
||||
# asyncio.run(test_async_custom_handler_embedding())
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_async_custom_handler_embedding_optional_param():
|
||||
async def test_async_custom_handler_embedding_optional_param():
|
||||
"""
|
||||
Tests if the openai optional params for embedding - user + encoding_format,
|
||||
Tests if the openai optional params for embedding - user + encoding_format,
|
||||
are logged
|
||||
"""
|
||||
customHandler_optional_params = MyCustomHandler()
|
||||
litellm.callbacks = [customHandler_optional_params]
|
||||
response = await litellm.aembedding(
|
||||
model="azure/azure-embedding-model",
|
||||
input = ["hello world"],
|
||||
user = "John"
|
||||
)
|
||||
await asyncio.sleep(1) # success callback is async
|
||||
model="azure/azure-embedding-model", input=["hello world"], user="John"
|
||||
)
|
||||
await asyncio.sleep(1) # success callback is async
|
||||
assert customHandler_optional_params.user == "John"
|
||||
assert customHandler_optional_params.user == customHandler_optional_params.data_sent_to_api["user"]
|
||||
assert (
|
||||
customHandler_optional_params.user
|
||||
== customHandler_optional_params.data_sent_to_api["user"]
|
||||
)
|
||||
|
||||
|
||||
# asyncio.run(test_async_custom_handler_embedding_optional_param())
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_async_custom_handler_embedding_optional_param_bedrock():
|
||||
async def test_async_custom_handler_embedding_optional_param_bedrock():
|
||||
"""
|
||||
Tests if the openai optional params for embedding - user + encoding_format,
|
||||
Tests if the openai optional params for embedding - user + encoding_format,
|
||||
are logged
|
||||
|
||||
but makes sure these are not sent to the non-openai/azure endpoint (raises errors).
|
||||
|
@ -323,42 +393,68 @@ async def test_async_custom_handler_embedding_optional_param_bedrock():
|
|||
customHandler_optional_params = MyCustomHandler()
|
||||
litellm.callbacks = [customHandler_optional_params]
|
||||
response = await litellm.aembedding(
|
||||
model="bedrock/amazon.titan-embed-text-v1",
|
||||
input = ["hello world"],
|
||||
user = "John"
|
||||
)
|
||||
await asyncio.sleep(1) # success callback is async
|
||||
model="bedrock/amazon.titan-embed-text-v1", input=["hello world"], user="John"
|
||||
)
|
||||
await asyncio.sleep(1) # success callback is async
|
||||
assert customHandler_optional_params.user == "John"
|
||||
assert "user" not in customHandler_optional_params.data_sent_to_api
|
||||
|
||||
|
||||
def test_redis_cache_completion_stream():
|
||||
from litellm import Cache
|
||||
# Important Test - This tests if we can add to streaming cache, when custom callbacks are set
|
||||
|
||||
# Important Test - This tests if we can add to streaming cache, when custom callbacks are set
|
||||
import random
|
||||
|
||||
try:
|
||||
print("\nrunning test_redis_cache_completion_stream")
|
||||
litellm.set_verbose = True
|
||||
random_number = random.randint(1, 100000) # add a random number to ensure it's always adding / reading from cache
|
||||
messages = [{"role": "user", "content": f"write a one sentence poem about: {random_number}"}]
|
||||
litellm.cache = Cache(type="redis", host=os.environ['REDIS_HOST'], port=os.environ['REDIS_PORT'], password=os.environ['REDIS_PASSWORD'])
|
||||
random_number = random.randint(
|
||||
1, 100000
|
||||
) # add a random number to ensure it's always adding / reading from cache
|
||||
messages = [
|
||||
{
|
||||
"role": "user",
|
||||
"content": f"write a one sentence poem about: {random_number}",
|
||||
}
|
||||
]
|
||||
litellm.cache = Cache(
|
||||
type="redis",
|
||||
host=os.environ["REDIS_HOST"],
|
||||
port=os.environ["REDIS_PORT"],
|
||||
password=os.environ["REDIS_PASSWORD"],
|
||||
)
|
||||
print("test for caching, streaming + completion")
|
||||
response1 = completion(model="gpt-3.5-turbo", messages=messages, max_tokens=40, temperature=0.2, stream=True)
|
||||
response1 = completion(
|
||||
model="gpt-3.5-turbo",
|
||||
messages=messages,
|
||||
max_tokens=40,
|
||||
temperature=0.2,
|
||||
stream=True,
|
||||
)
|
||||
response_1_content = ""
|
||||
for chunk in response1:
|
||||
print(chunk)
|
||||
response_1_content += chunk.choices[0].delta.content or ""
|
||||
print(response_1_content)
|
||||
|
||||
time.sleep(0.1) # sleep for 0.1 seconds to allow the cache set to occur
|
||||
response2 = completion(model="gpt-3.5-turbo", messages=messages, max_tokens=40, temperature=0.2, stream=True)
|
||||
time.sleep(0.1) # sleep for 0.1 seconds to allow the cache set to occur
|
||||
response2 = completion(
|
||||
model="gpt-3.5-turbo",
|
||||
messages=messages,
|
||||
max_tokens=40,
|
||||
temperature=0.2,
|
||||
stream=True,
|
||||
)
|
||||
response_2_content = ""
|
||||
for chunk in response2:
|
||||
print(chunk)
|
||||
response_2_content += chunk.choices[0].delta.content or ""
|
||||
print("\nresponse 1", response_1_content)
|
||||
print("\nresponse 2", response_2_content)
|
||||
assert response_1_content == response_2_content, f"Response 1 != Response 2. Same params, Response 1{response_1_content} != Response 2{response_2_content}"
|
||||
assert (
|
||||
response_1_content == response_2_content
|
||||
), f"Response 1 != Response 2. Same params, Response 1{response_1_content} != Response 2{response_2_content}"
|
||||
litellm.success_callback = []
|
||||
litellm._async_success_callback = []
|
||||
litellm.cache = None
|
||||
|
@ -366,4 +462,6 @@ def test_redis_cache_completion_stream():
|
|||
print(e)
|
||||
litellm.success_callback = []
|
||||
raise e
|
||||
# test_redis_cache_completion_stream()
|
||||
|
||||
|
||||
# test_redis_cache_completion_stream()
|
||||
|
|
|
@ -33,6 +33,7 @@ def pre_request():
|
|||
import re
|
||||
|
||||
|
||||
@pytest.mark.skip
|
||||
def verify_log_file(log_file_path):
|
||||
with open(log_file_path, "r") as log_file:
|
||||
log_content = log_file.read()
|
||||
|
@ -123,7 +124,7 @@ def test_dynamo_logging():
|
|||
sys.stdout = original_stdout
|
||||
# Close the file
|
||||
log_file.close()
|
||||
verify_log_file(file_name)
|
||||
# verify_log_file(file_name)
|
||||
print("Passed! Testing async dynamoDB logging")
|
||||
|
||||
|
||||
|
|
|
@ -10,7 +10,7 @@ sys.path.insert(
|
|||
0, os.path.abspath("../..")
|
||||
) # Adds the parent directory to the system path
|
||||
import litellm
|
||||
from litellm import embedding, completion
|
||||
from litellm import embedding, completion, completion_cost
|
||||
|
||||
litellm.set_verbose = False
|
||||
|
||||
|
@ -57,6 +57,48 @@ def test_openai_embedding():
|
|||
# test_openai_embedding()
|
||||
|
||||
|
||||
def test_openai_embedding_3():
|
||||
try:
|
||||
litellm.set_verbose = True
|
||||
response = embedding(
|
||||
model="text-embedding-3-small",
|
||||
input=["good morning from litellm", "this is another item"],
|
||||
metadata={"anything": "good day"},
|
||||
dimensions=5,
|
||||
)
|
||||
print(f"response:", response)
|
||||
litellm_response = dict(response)
|
||||
litellm_response_keys = set(litellm_response.keys())
|
||||
litellm_response_keys.discard("_response_ms")
|
||||
|
||||
print(litellm_response_keys)
|
||||
print("LiteLLM Response\n")
|
||||
# print(litellm_response)
|
||||
|
||||
# same request with OpenAI 1.0+
|
||||
import openai
|
||||
|
||||
client = openai.OpenAI(api_key=os.environ["OPENAI_API_KEY"])
|
||||
response = client.embeddings.create(
|
||||
model="text-embedding-3-small",
|
||||
input=["good morning from litellm", "this is another item"],
|
||||
dimensions=5,
|
||||
)
|
||||
|
||||
response = dict(response)
|
||||
openai_response_keys = set(response.keys())
|
||||
print(openai_response_keys)
|
||||
assert (
|
||||
litellm_response_keys == openai_response_keys
|
||||
) # ENSURE the Keys in litellm response is exactly what the openai package returns
|
||||
assert (
|
||||
len(litellm_response["data"]) == 2
|
||||
) # expect two embedding responses from litellm_response since input had two
|
||||
print(openai_response_keys)
|
||||
except Exception as e:
|
||||
pytest.fail(f"Error occurred: {e}")
|
||||
|
||||
|
||||
def test_openai_azure_embedding_simple():
|
||||
try:
|
||||
litellm.set_verbose = True
|
||||
|
@ -186,7 +228,7 @@ def test_cohere_embedding3():
|
|||
pytest.fail(f"Error occurred: {e}")
|
||||
|
||||
|
||||
test_cohere_embedding3()
|
||||
# test_cohere_embedding3()
|
||||
|
||||
|
||||
def test_bedrock_embedding_titan():
|
||||
|
@ -341,8 +383,30 @@ def test_sagemaker_embeddings():
|
|||
response = litellm.embedding(
|
||||
model="sagemaker/berri-benchmarking-gpt-j-6b-fp16",
|
||||
input=["good morning from litellm", "this is another item"],
|
||||
input_cost_per_second=0.000420,
|
||||
)
|
||||
print(f"response: {response}")
|
||||
cost = completion_cost(completion_response=response)
|
||||
assert (
|
||||
cost > 0.0 and cost < 1.0
|
||||
) # should never be > $1 for a single embedding call
|
||||
except Exception as e:
|
||||
pytest.fail(f"Error occurred: {e}")
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_sagemaker_aembeddings():
|
||||
try:
|
||||
response = await litellm.aembedding(
|
||||
model="sagemaker/berri-benchmarking-gpt-j-6b-fp16",
|
||||
input=["good morning from litellm", "this is another item"],
|
||||
input_cost_per_second=0.000420,
|
||||
)
|
||||
print(f"response: {response}")
|
||||
cost = completion_cost(completion_response=response)
|
||||
assert (
|
||||
cost > 0.0 and cost < 1.0
|
||||
) # should never be > $1 for a single embedding call
|
||||
except Exception as e:
|
||||
pytest.fail(f"Error occurred: {e}")
|
||||
|
||||
|
|
|
@ -2,7 +2,7 @@ from openai import AuthenticationError, BadRequestError, RateLimitError, OpenAIE
|
|||
import os
|
||||
import sys
|
||||
import traceback
|
||||
import subprocess
|
||||
import subprocess, asyncio
|
||||
|
||||
sys.path.insert(
|
||||
0, os.path.abspath("../..")
|
||||
|
@ -378,6 +378,74 @@ def test_content_policy_exceptionimage_generation_openai():
|
|||
# test_content_policy_exceptionimage_generation_openai()
|
||||
|
||||
|
||||
def test_content_policy_violation_error_streaming():
|
||||
"""
|
||||
Production Test.
|
||||
"""
|
||||
litellm.set_verbose = False
|
||||
print("test_async_completion with stream")
|
||||
|
||||
async def test_get_response():
|
||||
try:
|
||||
response = await litellm.acompletion(
|
||||
model="azure/chatgpt-v-2",
|
||||
messages=[{"role": "user", "content": "say 1"}],
|
||||
temperature=0,
|
||||
top_p=1,
|
||||
stream=True,
|
||||
max_tokens=512,
|
||||
presence_penalty=0,
|
||||
frequency_penalty=0,
|
||||
)
|
||||
print(f"response: {response}")
|
||||
|
||||
num_finish_reason = 0
|
||||
async for chunk in response:
|
||||
print(chunk)
|
||||
if chunk["choices"][0].get("finish_reason") is not None:
|
||||
num_finish_reason += 1
|
||||
print("finish_reason", chunk["choices"][0].get("finish_reason"))
|
||||
|
||||
assert (
|
||||
num_finish_reason == 1
|
||||
), f"expected only one finish reason. Got {num_finish_reason}"
|
||||
except Exception as e:
|
||||
pytest.fail(f"GOT exception for gpt-3.5 instruct In streaming{e}")
|
||||
|
||||
asyncio.run(test_get_response())
|
||||
|
||||
async def test_get_error():
|
||||
try:
|
||||
response = await litellm.acompletion(
|
||||
model="azure/chatgpt-v-2",
|
||||
messages=[
|
||||
{"role": "user", "content": "where do i buy lethal drugs from"}
|
||||
],
|
||||
temperature=0,
|
||||
top_p=1,
|
||||
stream=True,
|
||||
max_tokens=512,
|
||||
presence_penalty=0,
|
||||
frequency_penalty=0,
|
||||
)
|
||||
print(f"response: {response}")
|
||||
|
||||
num_finish_reason = 0
|
||||
async for chunk in response:
|
||||
print(chunk)
|
||||
if chunk["choices"][0].get("finish_reason") is not None:
|
||||
num_finish_reason += 1
|
||||
print("finish_reason", chunk["choices"][0].get("finish_reason"))
|
||||
|
||||
pytest.fail(f"Expected to return 400 error In streaming{e}")
|
||||
except Exception as e:
|
||||
pass
|
||||
|
||||
asyncio.run(test_get_error())
|
||||
|
||||
|
||||
# test_async_acompletion()
|
||||
|
||||
# # test_invalid_request_error(model="command-nightly")
|
||||
# # Test 3: Rate Limit Errors
|
||||
# def test_model_call(model):
|
||||
|
|
|
@ -25,10 +25,15 @@ sys.path.insert(
|
|||
) # Adds the parent directory to the system path
|
||||
import pytest, logging, asyncio
|
||||
import litellm, asyncio
|
||||
from litellm.proxy.proxy_server import new_user, user_api_key_auth, user_update
|
||||
from litellm.proxy.proxy_server import (
|
||||
new_user,
|
||||
user_api_key_auth,
|
||||
user_update,
|
||||
generate_key_fn,
|
||||
)
|
||||
|
||||
from litellm.proxy._types import NewUserRequest, DynamoDBArgs
|
||||
from litellm.proxy.utils import DBClient
|
||||
from litellm.proxy._types import NewUserRequest, DynamoDBArgs, GenerateKeyRequest
|
||||
from litellm.proxy.utils import DBClient, hash_token
|
||||
from starlette.datastructures import URL
|
||||
|
||||
|
||||
|
@ -104,13 +109,17 @@ def test_call_with_invalid_key(custom_db_client):
|
|||
asyncio.run(test())
|
||||
except Exception as e:
|
||||
print("Got Exception", e)
|
||||
print(e.detail)
|
||||
assert "Authentication Error" in e.detail
|
||||
print(e.message)
|
||||
assert "Authentication Error" in e.message
|
||||
pass
|
||||
|
||||
|
||||
def test_call_with_invalid_model(custom_db_client):
|
||||
# 3. Make a call to a key with an invalid model - expect to fail
|
||||
from litellm._logging import verbose_proxy_logger
|
||||
import logging
|
||||
|
||||
verbose_proxy_logger.setLevel(logging.DEBUG)
|
||||
setattr(litellm.proxy.proxy_server, "custom_db_client", custom_db_client)
|
||||
setattr(litellm.proxy.proxy_server, "master_key", "sk-1234")
|
||||
try:
|
||||
|
@ -138,7 +147,7 @@ def test_call_with_invalid_model(custom_db_client):
|
|||
asyncio.run(test())
|
||||
except Exception as e:
|
||||
assert (
|
||||
e.detail
|
||||
e.message
|
||||
== "Authentication Error, API Key not allowed to access model. This token can only access models=['mistral']. Tried to access gemini-pro-vision"
|
||||
)
|
||||
pass
|
||||
|
@ -175,10 +184,16 @@ def test_call_with_valid_model(custom_db_client):
|
|||
pytest.fail(f"An exception occurred - {str(e)}")
|
||||
|
||||
|
||||
def test_call_with_key_over_budget(custom_db_client):
|
||||
def test_call_with_user_over_budget(custom_db_client):
|
||||
# 5. Make a call with a key over budget, expect to fail
|
||||
setattr(litellm.proxy.proxy_server, "custom_db_client", custom_db_client)
|
||||
setattr(litellm.proxy.proxy_server, "master_key", "sk-1234")
|
||||
from litellm._logging import verbose_proxy_logger, verbose_logger
|
||||
import logging
|
||||
|
||||
litellm.set_verbose = True
|
||||
verbose_logger.setLevel(logging.DEBUG)
|
||||
verbose_proxy_logger.setLevel(logging.DEBUG)
|
||||
try:
|
||||
|
||||
async def test():
|
||||
|
@ -221,10 +236,11 @@ def test_call_with_key_over_budget(custom_db_client):
|
|||
"stream": False,
|
||||
"litellm_params": {
|
||||
"metadata": {
|
||||
"user_api_key": generated_key,
|
||||
"user_api_key": hash_token(generated_key),
|
||||
"user_api_key_user_id": user_id,
|
||||
}
|
||||
},
|
||||
"response_cost": 0.00002,
|
||||
},
|
||||
completion_response=resp,
|
||||
)
|
||||
|
@ -236,12 +252,12 @@ def test_call_with_key_over_budget(custom_db_client):
|
|||
|
||||
asyncio.run(test())
|
||||
except Exception as e:
|
||||
error_detail = e.detail
|
||||
error_detail = e.message
|
||||
assert "Authentication Error, ExceededBudget:" in error_detail
|
||||
print(vars(e))
|
||||
|
||||
|
||||
def test_call_with_key_over_budget_stream(custom_db_client):
|
||||
def test_call_with_user_over_budget_stream(custom_db_client):
|
||||
# 6. Make a call with a key over budget, expect to fail
|
||||
setattr(litellm.proxy.proxy_server, "custom_db_client", custom_db_client)
|
||||
setattr(litellm.proxy.proxy_server, "master_key", "sk-1234")
|
||||
|
@ -293,10 +309,11 @@ def test_call_with_key_over_budget_stream(custom_db_client):
|
|||
"complete_streaming_response": resp,
|
||||
"litellm_params": {
|
||||
"metadata": {
|
||||
"user_api_key": generated_key,
|
||||
"user_api_key": hash_token(generated_key),
|
||||
"user_api_key_user_id": user_id,
|
||||
}
|
||||
},
|
||||
"response_cost": 0.00002,
|
||||
},
|
||||
completion_response=ModelResponse(),
|
||||
)
|
||||
|
@ -308,6 +325,179 @@ def test_call_with_key_over_budget_stream(custom_db_client):
|
|||
|
||||
asyncio.run(test())
|
||||
except Exception as e:
|
||||
error_detail = e.detail
|
||||
error_detail = e.message
|
||||
assert "Authentication Error, ExceededBudget:" in error_detail
|
||||
print(vars(e))
|
||||
|
||||
|
||||
def test_call_with_user_key_budget(custom_db_client):
|
||||
# 7. Make a call with a key over budget, expect to fail
|
||||
setattr(litellm.proxy.proxy_server, "custom_db_client", custom_db_client)
|
||||
setattr(litellm.proxy.proxy_server, "master_key", "sk-1234")
|
||||
from litellm._logging import verbose_proxy_logger
|
||||
import logging
|
||||
|
||||
verbose_proxy_logger.setLevel(logging.DEBUG)
|
||||
try:
|
||||
|
||||
async def test():
|
||||
request = GenerateKeyRequest(max_budget=0.00001)
|
||||
key = await generate_key_fn(request)
|
||||
print(key)
|
||||
|
||||
generated_key = key.key
|
||||
user_id = key.user_id
|
||||
bearer_token = "Bearer " + generated_key
|
||||
|
||||
request = Request(scope={"type": "http"})
|
||||
request._url = URL(url="/chat/completions")
|
||||
|
||||
# use generated key to auth in
|
||||
result = await user_api_key_auth(request=request, api_key=bearer_token)
|
||||
print("result from user auth with new key", result)
|
||||
|
||||
# update spend using track_cost callback, make 2nd request, it should fail
|
||||
from litellm.proxy.proxy_server import track_cost_callback
|
||||
from litellm import ModelResponse, Choices, Message, Usage
|
||||
|
||||
resp = ModelResponse(
|
||||
id="chatcmpl-e41836bb-bb8b-4df2-8e70-8f3e160155ac",
|
||||
choices=[
|
||||
Choices(
|
||||
finish_reason=None,
|
||||
index=0,
|
||||
message=Message(
|
||||
content=" Sure! Here is a short poem about the sky:\n\nA canvas of blue, a",
|
||||
role="assistant",
|
||||
),
|
||||
)
|
||||
],
|
||||
model="gpt-35-turbo", # azure always has model written like this
|
||||
usage=Usage(prompt_tokens=210, completion_tokens=200, total_tokens=410),
|
||||
)
|
||||
await track_cost_callback(
|
||||
kwargs={
|
||||
"stream": False,
|
||||
"litellm_params": {
|
||||
"metadata": {
|
||||
"user_api_key": hash_token(generated_key),
|
||||
"user_api_key_user_id": user_id,
|
||||
}
|
||||
},
|
||||
"response_cost": 0.00002,
|
||||
},
|
||||
completion_response=resp,
|
||||
)
|
||||
|
||||
# use generated key to auth in
|
||||
result = await user_api_key_auth(request=request, api_key=bearer_token)
|
||||
print("result from user auth with new key", result)
|
||||
pytest.fail(f"This should have failed!. They key crossed it's budget")
|
||||
|
||||
asyncio.run(test())
|
||||
except Exception as e:
|
||||
error_detail = e.message
|
||||
assert "Authentication Error, ExceededTokenBudget:" in error_detail
|
||||
print(vars(e))
|
||||
|
||||
|
||||
def test_call_with_key_over_budget_stream(custom_db_client):
|
||||
# 8. Make a call with a key over budget, expect to fail
|
||||
setattr(litellm.proxy.proxy_server, "custom_db_client", custom_db_client)
|
||||
setattr(litellm.proxy.proxy_server, "master_key", "sk-1234")
|
||||
from litellm._logging import verbose_proxy_logger
|
||||
import logging
|
||||
|
||||
litellm.set_verbose = True
|
||||
verbose_proxy_logger.setLevel(logging.DEBUG)
|
||||
try:
|
||||
|
||||
async def test():
|
||||
request = GenerateKeyRequest(max_budget=0.00001)
|
||||
key = await generate_key_fn(request)
|
||||
print(key)
|
||||
|
||||
generated_key = key.key
|
||||
user_id = key.user_id
|
||||
bearer_token = "Bearer " + generated_key
|
||||
|
||||
request = Request(scope={"type": "http"})
|
||||
request._url = URL(url="/chat/completions")
|
||||
|
||||
# use generated key to auth in
|
||||
result = await user_api_key_auth(request=request, api_key=bearer_token)
|
||||
print("result from user auth with new key", result)
|
||||
|
||||
# update spend using track_cost callback, make 2nd request, it should fail
|
||||
from litellm.proxy.proxy_server import track_cost_callback
|
||||
from litellm import ModelResponse, Choices, Message, Usage
|
||||
|
||||
resp = ModelResponse(
|
||||
id="chatcmpl-e41836bb-bb8b-4df2-8e70-8f3e160155ac",
|
||||
choices=[
|
||||
Choices(
|
||||
finish_reason=None,
|
||||
index=0,
|
||||
message=Message(
|
||||
content=" Sure! Here is a short poem about the sky:\n\nA canvas of blue, a",
|
||||
role="assistant",
|
||||
),
|
||||
)
|
||||
],
|
||||
model="gpt-35-turbo", # azure always has model written like this
|
||||
usage=Usage(prompt_tokens=210, completion_tokens=200, total_tokens=410),
|
||||
)
|
||||
await track_cost_callback(
|
||||
kwargs={
|
||||
"stream": True,
|
||||
"complete_streaming_response": resp,
|
||||
"litellm_params": {
|
||||
"metadata": {
|
||||
"user_api_key": hash_token(generated_key),
|
||||
"user_api_key_user_id": user_id,
|
||||
}
|
||||
},
|
||||
"response_cost": 0.00002,
|
||||
},
|
||||
completion_response=ModelResponse(),
|
||||
)
|
||||
|
||||
# use generated key to auth in
|
||||
result = await user_api_key_auth(request=request, api_key=bearer_token)
|
||||
print("result from user auth with new key", result)
|
||||
pytest.fail(f"This should have failed!. They key crossed it's budget")
|
||||
|
||||
asyncio.run(test())
|
||||
except Exception as e:
|
||||
error_detail = e.message
|
||||
assert "Authentication Error, ExceededTokenBudget:" in error_detail
|
||||
print(vars(e))
|
||||
|
||||
|
||||
def test_dynamo_db_migration(custom_db_client):
|
||||
# Tests the temporary patch we have in place
|
||||
setattr(litellm.proxy.proxy_server, "custom_db_client", custom_db_client)
|
||||
setattr(litellm.proxy.proxy_server, "master_key", "sk-1234")
|
||||
setattr(litellm.proxy.proxy_server, "user_custom_auth", None)
|
||||
try:
|
||||
|
||||
async def test():
|
||||
bearer_token = (
|
||||
"Bearer " + "sk-elJDL2pOEjcAuC7zD4psAg"
|
||||
) # this works with ishaan's db, it's a never expiring key
|
||||
|
||||
request = Request(scope={"type": "http"})
|
||||
request._url = URL(url="/chat/completions")
|
||||
|
||||
async def return_body():
|
||||
return b'{"model": "azure-models"}'
|
||||
|
||||
request.body = return_body
|
||||
|
||||
# use generated key to auth in
|
||||
result = await user_api_key_auth(request=request, api_key=bearer_token)
|
||||
print("result from user auth with new key", result)
|
||||
|
||||
asyncio.run(test())
|
||||
except Exception as e:
|
||||
pytest.fail(f"An exception occurred - {str(e)}")
|
||||
|
|
|
@ -3,11 +3,17 @@
|
|||
# 2. Make a call with invalid key, expect it to fail
|
||||
# 3. Make a call to a key with invalid model - expect to fail
|
||||
# 4. Make a call to a key with valid model - expect to pass
|
||||
# 5. Make a call with key over budget, expect to fail
|
||||
# 6. Make a streaming chat/completions call with key over budget, expect to fail
|
||||
# 5. Make a call with user over budget, expect to fail
|
||||
# 6. Make a streaming chat/completions call with user over budget, expect to fail
|
||||
# 7. Make a call with a key that never expires, expect to pass
|
||||
# 8. Make a call with an expired key, expect to fail
|
||||
# 9. Delete a Key
|
||||
# 10. Generate a key, call key/info. Assert info returned is the same as generated key info
|
||||
# 11. Generate a Key, call key/info, call key/update, call key/info
|
||||
# 12. Make a call with key over budget, expect to fail
|
||||
# 14. Make a streaming chat/completions call with key over budget, expect to fail
|
||||
# 15. Generate key, when `allow_user_auth`=False - check if `/key/info` returns key_name=null
|
||||
# 16. Generate key, when `allow_user_auth`=True - check if `/key/info` returns key_name=sk...<last-4-digits>
|
||||
|
||||
|
||||
# function to call to generate key - async def new_user(data: NewUserRequest):
|
||||
|
@ -17,9 +23,10 @@ import sys, os
|
|||
import traceback
|
||||
from dotenv import load_dotenv
|
||||
from fastapi import Request
|
||||
from datetime import datetime
|
||||
|
||||
load_dotenv()
|
||||
import os, io
|
||||
import os, io, time
|
||||
|
||||
# this file is to test litellm/proxy
|
||||
|
||||
|
@ -30,16 +37,30 @@ import pytest, logging, asyncio
|
|||
import litellm, asyncio
|
||||
from litellm.proxy.proxy_server import (
|
||||
new_user,
|
||||
generate_key_fn,
|
||||
user_api_key_auth,
|
||||
user_update,
|
||||
delete_key_fn,
|
||||
info_key_fn,
|
||||
update_key_fn,
|
||||
generate_key_fn,
|
||||
spend_user_fn,
|
||||
spend_key_fn,
|
||||
view_spend_logs,
|
||||
)
|
||||
from litellm.proxy.utils import PrismaClient, ProxyLogging
|
||||
from litellm.proxy.utils import PrismaClient, ProxyLogging, hash_token
|
||||
from litellm._logging import verbose_proxy_logger
|
||||
|
||||
verbose_proxy_logger.setLevel(level=logging.DEBUG)
|
||||
|
||||
from litellm.proxy._types import NewUserRequest, DynamoDBArgs, DeleteKeyRequest
|
||||
from litellm.proxy._types import (
|
||||
NewUserRequest,
|
||||
GenerateKeyRequest,
|
||||
DynamoDBArgs,
|
||||
DeleteKeyRequest,
|
||||
UpdateKeyRequest,
|
||||
GenerateKeyRequest,
|
||||
)
|
||||
from litellm.proxy.utils import DBClient
|
||||
from starlette.datastructures import URL
|
||||
from litellm.caching import DualCache
|
||||
|
@ -64,6 +85,10 @@ def prisma_client():
|
|||
|
||||
# Reset litellm.proxy.proxy_server.prisma_client to None
|
||||
litellm.proxy.proxy_server.custom_db_client = None
|
||||
litellm.proxy.proxy_server.litellm_proxy_budget_name = (
|
||||
f"litellm-proxy-budget-{time.time()}"
|
||||
)
|
||||
litellm.proxy.proxy_server.user_custom_key_generate = None
|
||||
|
||||
return prisma_client
|
||||
|
||||
|
@ -120,8 +145,8 @@ def test_call_with_invalid_key(prisma_client):
|
|||
asyncio.run(test())
|
||||
except Exception as e:
|
||||
print("Got Exception", e)
|
||||
print(e.detail)
|
||||
assert "Authentication Error" in e.detail
|
||||
print(e.message)
|
||||
assert "Authentication Error" in e.message
|
||||
pass
|
||||
|
||||
|
||||
|
@ -155,7 +180,7 @@ def test_call_with_invalid_model(prisma_client):
|
|||
asyncio.run(test())
|
||||
except Exception as e:
|
||||
assert (
|
||||
e.detail
|
||||
e.message
|
||||
== "Authentication Error, API Key not allowed to access model. This token can only access models=['mistral']. Tried to access gemini-pro-vision"
|
||||
)
|
||||
pass
|
||||
|
@ -193,7 +218,7 @@ def test_call_with_valid_model(prisma_client):
|
|||
pytest.fail(f"An exception occurred - {str(e)}")
|
||||
|
||||
|
||||
def test_call_with_key_over_budget(prisma_client):
|
||||
def test_call_with_user_over_budget(prisma_client):
|
||||
# 5. Make a call with a key over budget, expect to fail
|
||||
setattr(litellm.proxy.proxy_server, "prisma_client", prisma_client)
|
||||
setattr(litellm.proxy.proxy_server, "master_key", "sk-1234")
|
||||
|
@ -244,8 +269,11 @@ def test_call_with_key_over_budget(prisma_client):
|
|||
"user_api_key_user_id": user_id,
|
||||
}
|
||||
},
|
||||
"response_cost": 0.00002,
|
||||
},
|
||||
completion_response=resp,
|
||||
start_time=datetime.now(),
|
||||
end_time=datetime.now(),
|
||||
)
|
||||
|
||||
# use generated key to auth in
|
||||
|
@ -255,12 +283,96 @@ def test_call_with_key_over_budget(prisma_client):
|
|||
|
||||
asyncio.run(test())
|
||||
except Exception as e:
|
||||
error_detail = e.detail
|
||||
error_detail = e.message
|
||||
assert "Authentication Error, ExceededBudget:" in error_detail
|
||||
print(vars(e))
|
||||
|
||||
|
||||
def test_call_with_key_over_budget_stream(prisma_client):
|
||||
def test_call_with_proxy_over_budget(prisma_client):
|
||||
# 5.1 Make a call with a proxy over budget, expect to fail
|
||||
setattr(litellm.proxy.proxy_server, "prisma_client", prisma_client)
|
||||
setattr(litellm.proxy.proxy_server, "master_key", "sk-1234")
|
||||
litellm_proxy_budget_name = f"litellm-proxy-budget-{time.time()}"
|
||||
setattr(
|
||||
litellm.proxy.proxy_server,
|
||||
"litellm_proxy_budget_name",
|
||||
litellm_proxy_budget_name,
|
||||
)
|
||||
try:
|
||||
|
||||
async def test():
|
||||
await litellm.proxy.proxy_server.prisma_client.connect()
|
||||
## CREATE PROXY + USER BUDGET ##
|
||||
request = NewUserRequest(
|
||||
max_budget=0.00001, user_id=litellm_proxy_budget_name
|
||||
)
|
||||
await new_user(request)
|
||||
request = NewUserRequest()
|
||||
key = await new_user(request)
|
||||
print(key)
|
||||
|
||||
generated_key = key.key
|
||||
user_id = key.user_id
|
||||
bearer_token = "Bearer " + generated_key
|
||||
|
||||
request = Request(scope={"type": "http"})
|
||||
request._url = URL(url="/chat/completions")
|
||||
|
||||
# use generated key to auth in
|
||||
result = await user_api_key_auth(request=request, api_key=bearer_token)
|
||||
print("result from user auth with new key", result)
|
||||
|
||||
# update spend using track_cost callback, make 2nd request, it should fail
|
||||
from litellm.proxy.proxy_server import track_cost_callback
|
||||
from litellm import ModelResponse, Choices, Message, Usage
|
||||
|
||||
resp = ModelResponse(
|
||||
id="chatcmpl-e41836bb-bb8b-4df2-8e70-8f3e160155ac",
|
||||
choices=[
|
||||
Choices(
|
||||
finish_reason=None,
|
||||
index=0,
|
||||
message=Message(
|
||||
content=" Sure! Here is a short poem about the sky:\n\nA canvas of blue, a",
|
||||
role="assistant",
|
||||
),
|
||||
)
|
||||
],
|
||||
model="gpt-35-turbo", # azure always has model written like this
|
||||
usage=Usage(prompt_tokens=210, completion_tokens=200, total_tokens=410),
|
||||
)
|
||||
await track_cost_callback(
|
||||
kwargs={
|
||||
"stream": False,
|
||||
"litellm_params": {
|
||||
"metadata": {
|
||||
"user_api_key": generated_key,
|
||||
"user_api_key_user_id": user_id,
|
||||
}
|
||||
},
|
||||
"response_cost": 0.00002,
|
||||
},
|
||||
completion_response=resp,
|
||||
start_time=datetime.now(),
|
||||
end_time=datetime.now(),
|
||||
)
|
||||
|
||||
# use generated key to auth in
|
||||
result = await user_api_key_auth(request=request, api_key=bearer_token)
|
||||
print("result from user auth with new key", result)
|
||||
pytest.fail(f"This should have failed!. They key crossed it's budget")
|
||||
|
||||
asyncio.run(test())
|
||||
except Exception as e:
|
||||
if hasattr(e, "message"):
|
||||
error_detail = e.message
|
||||
else:
|
||||
error_detail = traceback.format_exc()
|
||||
assert "Authentication Error, ExceededBudget:" in error_detail
|
||||
print(vars(e))
|
||||
|
||||
|
||||
def test_call_with_user_over_budget_stream(prisma_client):
|
||||
# 6. Make a call with a key over budget, expect to fail
|
||||
setattr(litellm.proxy.proxy_server, "prisma_client", prisma_client)
|
||||
setattr(litellm.proxy.proxy_server, "master_key", "sk-1234")
|
||||
|
@ -317,8 +429,11 @@ def test_call_with_key_over_budget_stream(prisma_client):
|
|||
"user_api_key_user_id": user_id,
|
||||
}
|
||||
},
|
||||
"response_cost": 0.00002,
|
||||
},
|
||||
completion_response=ModelResponse(),
|
||||
start_time=datetime.now(),
|
||||
end_time=datetime.now(),
|
||||
)
|
||||
|
||||
# use generated key to auth in
|
||||
|
@ -328,7 +443,94 @@ def test_call_with_key_over_budget_stream(prisma_client):
|
|||
|
||||
asyncio.run(test())
|
||||
except Exception as e:
|
||||
error_detail = e.detail
|
||||
error_detail = e.message
|
||||
assert "Authentication Error, ExceededBudget:" in error_detail
|
||||
print(vars(e))
|
||||
|
||||
|
||||
def test_call_with_proxy_over_budget_stream(prisma_client):
|
||||
# 6.1 Make a call with a global proxy over budget, expect to fail
|
||||
setattr(litellm.proxy.proxy_server, "prisma_client", prisma_client)
|
||||
setattr(litellm.proxy.proxy_server, "master_key", "sk-1234")
|
||||
litellm_proxy_budget_name = f"litellm-proxy-budget-{time.time()}"
|
||||
setattr(
|
||||
litellm.proxy.proxy_server,
|
||||
"litellm_proxy_budget_name",
|
||||
litellm_proxy_budget_name,
|
||||
)
|
||||
from litellm._logging import verbose_proxy_logger
|
||||
import logging
|
||||
|
||||
litellm.set_verbose = True
|
||||
verbose_proxy_logger.setLevel(logging.DEBUG)
|
||||
try:
|
||||
|
||||
async def test():
|
||||
await litellm.proxy.proxy_server.prisma_client.connect()
|
||||
## CREATE PROXY + USER BUDGET ##
|
||||
request = NewUserRequest(
|
||||
max_budget=0.00001, user_id=litellm_proxy_budget_name
|
||||
)
|
||||
await new_user(request)
|
||||
request = NewUserRequest()
|
||||
key = await new_user(request)
|
||||
print(key)
|
||||
|
||||
generated_key = key.key
|
||||
user_id = key.user_id
|
||||
bearer_token = "Bearer " + generated_key
|
||||
|
||||
request = Request(scope={"type": "http"})
|
||||
request._url = URL(url="/chat/completions")
|
||||
|
||||
# use generated key to auth in
|
||||
result = await user_api_key_auth(request=request, api_key=bearer_token)
|
||||
print("result from user auth with new key", result)
|
||||
|
||||
# update spend using track_cost callback, make 2nd request, it should fail
|
||||
from litellm.proxy.proxy_server import track_cost_callback
|
||||
from litellm import ModelResponse, Choices, Message, Usage
|
||||
|
||||
resp = ModelResponse(
|
||||
id="chatcmpl-e41836bb-bb8b-4df2-8e70-8f3e160155ac",
|
||||
choices=[
|
||||
Choices(
|
||||
finish_reason=None,
|
||||
index=0,
|
||||
message=Message(
|
||||
content=" Sure! Here is a short poem about the sky:\n\nA canvas of blue, a",
|
||||
role="assistant",
|
||||
),
|
||||
)
|
||||
],
|
||||
model="gpt-35-turbo", # azure always has model written like this
|
||||
usage=Usage(prompt_tokens=210, completion_tokens=200, total_tokens=410),
|
||||
)
|
||||
await track_cost_callback(
|
||||
kwargs={
|
||||
"stream": True,
|
||||
"complete_streaming_response": resp,
|
||||
"litellm_params": {
|
||||
"metadata": {
|
||||
"user_api_key": generated_key,
|
||||
"user_api_key_user_id": user_id,
|
||||
}
|
||||
},
|
||||
"response_cost": 0.00002,
|
||||
},
|
||||
completion_response=ModelResponse(),
|
||||
start_time=datetime.now(),
|
||||
end_time=datetime.now(),
|
||||
)
|
||||
|
||||
# use generated key to auth in
|
||||
result = await user_api_key_auth(request=request, api_key=bearer_token)
|
||||
print("result from user auth with new key", result)
|
||||
pytest.fail(f"This should have failed!. They key crossed it's budget")
|
||||
|
||||
asyncio.run(test())
|
||||
except Exception as e:
|
||||
error_detail = e.message
|
||||
assert "Authentication Error, ExceededBudget:" in error_detail
|
||||
print(vars(e))
|
||||
|
||||
|
@ -392,8 +594,8 @@ def test_generate_and_call_with_expired_key(prisma_client):
|
|||
asyncio.run(test())
|
||||
except Exception as e:
|
||||
print("Got Exception", e)
|
||||
print(e.detail)
|
||||
assert "Authentication Error" in e.detail
|
||||
print(e.message)
|
||||
assert "Authentication Error" in e.message
|
||||
pass
|
||||
|
||||
|
||||
|
@ -415,15 +617,10 @@ def test_delete_key(prisma_client):
|
|||
generated_key = key.key
|
||||
bearer_token = "Bearer " + generated_key
|
||||
|
||||
request = Request(scope={"type": "http"})
|
||||
request._url = URL(url="/chat/completions")
|
||||
|
||||
delete_key_request = DeleteKeyRequest(keys=[generated_key])
|
||||
|
||||
# delete the key
|
||||
result_delete_key = await delete_key_fn(
|
||||
request=request, data=delete_key_request
|
||||
)
|
||||
result_delete_key = await delete_key_fn(data=delete_key_request)
|
||||
print("result from delete key", result_delete_key)
|
||||
assert result_delete_key == {"deleted_keys": [generated_key]}
|
||||
|
||||
|
@ -450,15 +647,10 @@ def test_delete_key_auth(prisma_client):
|
|||
generated_key = key.key
|
||||
bearer_token = "Bearer " + generated_key
|
||||
|
||||
request = Request(scope={"type": "http"})
|
||||
request._url = URL(url="/chat/completions")
|
||||
|
||||
delete_key_request = DeleteKeyRequest(keys=[generated_key])
|
||||
|
||||
# delete the key
|
||||
result_delete_key = await delete_key_fn(
|
||||
request=request, data=delete_key_request
|
||||
)
|
||||
result_delete_key = await delete_key_fn(data=delete_key_request)
|
||||
|
||||
print("result from delete key", result_delete_key)
|
||||
assert result_delete_key == {"deleted_keys": [generated_key]}
|
||||
|
@ -474,6 +666,549 @@ def test_delete_key_auth(prisma_client):
|
|||
asyncio.run(test())
|
||||
except Exception as e:
|
||||
print("Got Exception", e)
|
||||
print(e.detail)
|
||||
assert "Authentication Error" in e.detail
|
||||
print(e.message)
|
||||
assert "Authentication Error" in e.message
|
||||
pass
|
||||
|
||||
|
||||
def test_generate_and_call_key_info(prisma_client):
|
||||
# 10. Generate a Key, call key/info
|
||||
|
||||
print("prisma client=", prisma_client)
|
||||
|
||||
setattr(litellm.proxy.proxy_server, "prisma_client", prisma_client)
|
||||
setattr(litellm.proxy.proxy_server, "master_key", "sk-1234")
|
||||
try:
|
||||
|
||||
async def test():
|
||||
await litellm.proxy.proxy_server.prisma_client.connect()
|
||||
request = NewUserRequest(
|
||||
metadata={"team": "litellm-team3", "project": "litellm-project3"}
|
||||
)
|
||||
key = await new_user(request)
|
||||
print(key)
|
||||
|
||||
generated_key = key.key
|
||||
|
||||
# use generated key to auth in
|
||||
result = await info_key_fn(key=generated_key)
|
||||
print("result from info_key_fn", result)
|
||||
assert result["key"] == generated_key
|
||||
print("\n info for key=", result["info"])
|
||||
assert result["info"]["max_parallel_requests"] == None
|
||||
assert result["info"]["metadata"] == {
|
||||
"team": "litellm-team3",
|
||||
"project": "litellm-project3",
|
||||
}
|
||||
|
||||
# cleanup - delete key
|
||||
delete_key_request = DeleteKeyRequest(keys=[generated_key])
|
||||
|
||||
# delete the key
|
||||
await delete_key_fn(data=delete_key_request)
|
||||
|
||||
asyncio.run(test())
|
||||
except Exception as e:
|
||||
pytest.fail(f"An exception occurred - {str(e)}")
|
||||
|
||||
|
||||
def test_generate_and_update_key(prisma_client):
|
||||
# 11. Generate a Key, call key/info, call key/update, call key/info
|
||||
# Check if data gets updated
|
||||
# Check if untouched data does not get updated
|
||||
|
||||
print("prisma client=", prisma_client)
|
||||
|
||||
setattr(litellm.proxy.proxy_server, "prisma_client", prisma_client)
|
||||
setattr(litellm.proxy.proxy_server, "master_key", "sk-1234")
|
||||
try:
|
||||
|
||||
async def test():
|
||||
await litellm.proxy.proxy_server.prisma_client.connect()
|
||||
request = NewUserRequest(
|
||||
metadata={"team": "litellm-team3", "project": "litellm-project3"},
|
||||
team_id="litellm-core-infra@gmail.com",
|
||||
)
|
||||
key = await new_user(request)
|
||||
print(key)
|
||||
|
||||
generated_key = key.key
|
||||
|
||||
# use generated key to auth in
|
||||
result = await info_key_fn(key=generated_key)
|
||||
print("result from info_key_fn", result)
|
||||
assert result["key"] == generated_key
|
||||
print("\n info for key=", result["info"])
|
||||
assert result["info"]["max_parallel_requests"] == None
|
||||
assert result["info"]["metadata"] == {
|
||||
"team": "litellm-team3",
|
||||
"project": "litellm-project3",
|
||||
}
|
||||
assert result["info"]["team_id"] == "litellm-core-infra@gmail.com"
|
||||
|
||||
request = Request(scope={"type": "http"})
|
||||
request._url = URL(url="/update/key")
|
||||
|
||||
# update the key
|
||||
await update_key_fn(
|
||||
request=request,
|
||||
data=UpdateKeyRequest(
|
||||
key=generated_key,
|
||||
models=["ada", "babbage", "curie", "davinci"],
|
||||
),
|
||||
)
|
||||
|
||||
# get info on key after update
|
||||
result = await info_key_fn(key=generated_key)
|
||||
print("result from info_key_fn", result)
|
||||
assert result["key"] == generated_key
|
||||
print("\n info for key=", result["info"])
|
||||
assert result["info"]["max_parallel_requests"] == None
|
||||
assert result["info"]["metadata"] == {
|
||||
"team": "litellm-team3",
|
||||
"project": "litellm-project3",
|
||||
}
|
||||
assert result["info"]["models"] == ["ada", "babbage", "curie", "davinci"]
|
||||
|
||||
# cleanup - delete key
|
||||
delete_key_request = DeleteKeyRequest(keys=[generated_key])
|
||||
|
||||
# delete the key
|
||||
await delete_key_fn(data=delete_key_request)
|
||||
|
||||
asyncio.run(test())
|
||||
except Exception as e:
|
||||
print("Got Exception", e)
|
||||
print(e.message)
|
||||
pytest.fail(f"An exception occurred - {str(e)}")
|
||||
|
||||
|
||||
def test_key_generate_with_custom_auth(prisma_client):
|
||||
# custom - generate key function
|
||||
async def custom_generate_key_fn(data: GenerateKeyRequest) -> dict:
|
||||
"""
|
||||
Asynchronous function for generating a key based on the input data.
|
||||
|
||||
Args:
|
||||
data (GenerateKeyRequest): The input data for key generation.
|
||||
|
||||
Returns:
|
||||
dict: A dictionary containing the decision and an optional message.
|
||||
{
|
||||
"decision": False,
|
||||
"message": "This violates LiteLLM Proxy Rules. No team id provided.",
|
||||
}
|
||||
"""
|
||||
|
||||
# decide if a key should be generated or not
|
||||
print("using custom auth function!")
|
||||
data_json = data.json() # type: ignore
|
||||
|
||||
# Unpacking variables
|
||||
team_id = data_json.get("team_id")
|
||||
duration = data_json.get("duration")
|
||||
models = data_json.get("models")
|
||||
aliases = data_json.get("aliases")
|
||||
config = data_json.get("config")
|
||||
spend = data_json.get("spend")
|
||||
user_id = data_json.get("user_id")
|
||||
max_parallel_requests = data_json.get("max_parallel_requests")
|
||||
metadata = data_json.get("metadata")
|
||||
tpm_limit = data_json.get("tpm_limit")
|
||||
rpm_limit = data_json.get("rpm_limit")
|
||||
|
||||
if team_id is not None and team_id == "litellm-core-infra@gmail.com":
|
||||
# only team_id="litellm-core-infra@gmail.com" can make keys
|
||||
return {
|
||||
"decision": True,
|
||||
}
|
||||
else:
|
||||
print("Failed custom auth")
|
||||
return {
|
||||
"decision": False,
|
||||
"message": "This violates LiteLLM Proxy Rules. No team id provided.",
|
||||
}
|
||||
|
||||
setattr(litellm.proxy.proxy_server, "prisma_client", prisma_client)
|
||||
setattr(litellm.proxy.proxy_server, "master_key", "sk-1234")
|
||||
setattr(
|
||||
litellm.proxy.proxy_server, "user_custom_key_generate", custom_generate_key_fn
|
||||
)
|
||||
try:
|
||||
|
||||
async def test():
|
||||
try:
|
||||
await litellm.proxy.proxy_server.prisma_client.connect()
|
||||
request = GenerateKeyRequest()
|
||||
key = await generate_key_fn(request)
|
||||
pytest.fail(f"Expected an exception. Got {key}")
|
||||
except Exception as e:
|
||||
# this should fail
|
||||
print("Got Exception", e)
|
||||
print(e.message)
|
||||
print("First request failed!. This is expected")
|
||||
assert (
|
||||
"This violates LiteLLM Proxy Rules. No team id provided."
|
||||
in e.message
|
||||
)
|
||||
|
||||
request_2 = GenerateKeyRequest(
|
||||
team_id="litellm-core-infra@gmail.com",
|
||||
)
|
||||
|
||||
key = await generate_key_fn(request_2)
|
||||
print(key)
|
||||
generated_key = key.key
|
||||
|
||||
asyncio.run(test())
|
||||
except Exception as e:
|
||||
print("Got Exception", e)
|
||||
print(e.message)
|
||||
pytest.fail(f"An exception occurred - {str(e)}")
|
||||
|
||||
|
||||
def test_call_with_key_over_budget(prisma_client):
|
||||
# 12. Make a call with a key over budget, expect to fail
|
||||
setattr(litellm.proxy.proxy_server, "prisma_client", prisma_client)
|
||||
setattr(litellm.proxy.proxy_server, "master_key", "sk-1234")
|
||||
try:
|
||||
|
||||
async def test():
|
||||
await litellm.proxy.proxy_server.prisma_client.connect()
|
||||
request = GenerateKeyRequest(max_budget=0.00001)
|
||||
key = await generate_key_fn(request)
|
||||
print(key)
|
||||
|
||||
generated_key = key.key
|
||||
user_id = key.user_id
|
||||
bearer_token = "Bearer " + generated_key
|
||||
|
||||
request = Request(scope={"type": "http"})
|
||||
request._url = URL(url="/chat/completions")
|
||||
|
||||
# use generated key to auth in
|
||||
result = await user_api_key_auth(request=request, api_key=bearer_token)
|
||||
print("result from user auth with new key", result)
|
||||
|
||||
# update spend using track_cost callback, make 2nd request, it should fail
|
||||
from litellm.proxy.proxy_server import track_cost_callback
|
||||
from litellm import ModelResponse, Choices, Message, Usage
|
||||
from litellm.caching import Cache
|
||||
|
||||
litellm.cache = Cache()
|
||||
import time
|
||||
|
||||
request_id = f"chatcmpl-e41836bb-bb8b-4df2-8e70-8f3e160155ac{time.time()}"
|
||||
|
||||
resp = ModelResponse(
|
||||
id=request_id,
|
||||
choices=[
|
||||
Choices(
|
||||
finish_reason=None,
|
||||
index=0,
|
||||
message=Message(
|
||||
content=" Sure! Here is a short poem about the sky:\n\nA canvas of blue, a",
|
||||
role="assistant",
|
||||
),
|
||||
)
|
||||
],
|
||||
model="gpt-35-turbo", # azure always has model written like this
|
||||
usage=Usage(prompt_tokens=210, completion_tokens=200, total_tokens=410),
|
||||
)
|
||||
await track_cost_callback(
|
||||
kwargs={
|
||||
"model": "chatgpt-v-2",
|
||||
"stream": False,
|
||||
"litellm_params": {
|
||||
"metadata": {
|
||||
"user_api_key": hash_token(generated_key),
|
||||
"user_api_key_user_id": user_id,
|
||||
}
|
||||
},
|
||||
"response_cost": 0.00002,
|
||||
},
|
||||
completion_response=resp,
|
||||
start_time=datetime.now(),
|
||||
end_time=datetime.now(),
|
||||
)
|
||||
|
||||
# test spend_log was written and we can read it
|
||||
spend_logs = await view_spend_logs(request_id=request_id)
|
||||
|
||||
print("read spend logs", spend_logs)
|
||||
assert len(spend_logs) == 1
|
||||
|
||||
spend_log = spend_logs[0]
|
||||
|
||||
assert spend_log.request_id == request_id
|
||||
assert spend_log.spend == float("2e-05")
|
||||
assert spend_log.model == "chatgpt-v-2"
|
||||
assert (
|
||||
spend_log.cache_key
|
||||
== "a61ae14fe4a8b8014a61e6ae01a100c8bc6770ac37c293242afed954bc69207d"
|
||||
)
|
||||
|
||||
# use generated key to auth in
|
||||
result = await user_api_key_auth(request=request, api_key=bearer_token)
|
||||
print("result from user auth with new key", result)
|
||||
pytest.fail(f"This should have failed!. They key crossed it's budget")
|
||||
|
||||
asyncio.run(test())
|
||||
except Exception as e:
|
||||
error_detail = e.message
|
||||
assert "Authentication Error, ExceededTokenBudget:" in error_detail
|
||||
print(vars(e))
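# The metadata in this test stores hash_token(generated_key) rather than the raw
# key, and the cache_key asserted above is a 64-character hex digest. A minimal
# sketch of the presumed behavior of hash_token (assumption: a sha256 hex digest
# of the key, mirroring litellm.proxy.utils.hash_token; not copied from the repo):
import hashlib


def _hash_token_sketch(token: str) -> str:
    # deterministic sha256 hex digest of the raw key
    return hashlib.sha256(token.encode()).hexdigest()


# e.g. _hash_token_sketch("sk-...") yields a 64-char hex string, so raw keys
# never need to appear in spend logs or request metadata.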
|
||||
|
||||
|
||||
@pytest.mark.asyncio()
|
||||
async def test_call_with_key_never_over_budget(prisma_client):
|
||||
# Make a call with a key with budget=None, it should never fail
|
||||
setattr(litellm.proxy.proxy_server, "prisma_client", prisma_client)
|
||||
setattr(litellm.proxy.proxy_server, "master_key", "sk-1234")
|
||||
try:
|
||||
await litellm.proxy.proxy_server.prisma_client.connect()
|
||||
request = GenerateKeyRequest(max_budget=None)
|
||||
key = await generate_key_fn(request)
|
||||
print(key)
|
||||
|
||||
generated_key = key.key
|
||||
user_id = key.user_id
|
||||
bearer_token = "Bearer " + generated_key
|
||||
|
||||
request = Request(scope={"type": "http"})
|
||||
request._url = URL(url="/chat/completions")
|
||||
|
||||
# use generated key to auth in
|
||||
result = await user_api_key_auth(request=request, api_key=bearer_token)
|
||||
print("result from user auth with new key", result)
|
||||
|
||||
# update spend using track_cost callback, make 2nd request, it should fail
|
||||
from litellm.proxy.proxy_server import track_cost_callback
|
||||
from litellm import ModelResponse, Choices, Message, Usage
|
||||
import time
|
||||
|
||||
request_id = f"chatcmpl-e41836bb-bb8b-4df2-8e70-8f3e160155ac{time.time()}"
|
||||
|
||||
resp = ModelResponse(
|
||||
id=request_id,
|
||||
choices=[
|
||||
Choices(
|
||||
finish_reason=None,
|
||||
index=0,
|
||||
message=Message(
|
||||
content=" Sure! Here is a short poem about the sky:\n\nA canvas of blue, a",
|
||||
role="assistant",
|
||||
),
|
||||
)
|
||||
],
|
||||
model="gpt-35-turbo", # azure always has model written like this
|
||||
usage=Usage(
|
||||
prompt_tokens=210000, completion_tokens=200000, total_tokens=410000
|
||||
),
|
||||
)
|
||||
await track_cost_callback(
|
||||
kwargs={
|
||||
"model": "chatgpt-v-2",
|
||||
"stream": False,
|
||||
"litellm_params": {
|
||||
"metadata": {
|
||||
"user_api_key": hash_token(generated_key),
|
||||
"user_api_key_user_id": user_id,
|
||||
}
|
||||
},
|
||||
"response_cost": 200000,
|
||||
},
|
||||
completion_response=resp,
|
||||
start_time=datetime.now(),
|
||||
end_time=datetime.now(),
|
||||
)
|
||||
|
||||
# use generated key to auth in
|
||||
result = await user_api_key_auth(request=request, api_key=bearer_token)
|
||||
print("result from user auth with new key", result)
|
||||
except Exception as e:
|
||||
pytest.fail(f"This should have not failed!. They key uses max_budget=None. {e}")
|
||||
|
||||
|
||||
@pytest.mark.asyncio()
|
||||
async def test_call_with_key_over_budget_stream(prisma_client):
|
||||
# 14. Make a call with a key over budget, expect to fail
|
||||
setattr(litellm.proxy.proxy_server, "prisma_client", prisma_client)
|
||||
setattr(litellm.proxy.proxy_server, "master_key", "sk-1234")
|
||||
from litellm._logging import verbose_proxy_logger
|
||||
import logging
|
||||
|
||||
litellm.set_verbose = True
|
||||
verbose_proxy_logger.setLevel(logging.DEBUG)
|
||||
try:
|
||||
await litellm.proxy.proxy_server.prisma_client.connect()
|
||||
request = GenerateKeyRequest(max_budget=0.00001)
|
||||
key = await generate_key_fn(request)
|
||||
print(key)
|
||||
|
||||
generated_key = key.key
|
||||
user_id = key.user_id
|
||||
bearer_token = "Bearer " + generated_key
|
||||
|
||||
request = Request(scope={"type": "http"})
|
||||
request._url = URL(url="/chat/completions")
|
||||
|
||||
# use generated key to auth in
|
||||
result = await user_api_key_auth(request=request, api_key=bearer_token)
|
||||
print("result from user auth with new key", result)
|
||||
|
||||
# update spend using track_cost callback, make 2nd request, it should fail
|
||||
from litellm.proxy.proxy_server import track_cost_callback
|
||||
from litellm import ModelResponse, Choices, Message, Usage
|
||||
import time
|
||||
|
||||
request_id = f"chatcmpl-e41836bb-bb8b-4df2-8e70-8f3e160155ac{time.time()}"
|
||||
resp = ModelResponse(
|
||||
id=request_id,
|
||||
choices=[
|
||||
Choices(
|
||||
finish_reason=None,
|
||||
index=0,
|
||||
message=Message(
|
||||
content=" Sure! Here is a short poem about the sky:\n\nA canvas of blue, a",
|
||||
role="assistant",
|
||||
),
|
||||
)
|
||||
],
|
||||
model="gpt-35-turbo", # azure always has model written like this
|
||||
usage=Usage(prompt_tokens=210, completion_tokens=200, total_tokens=410),
|
||||
)
|
||||
await track_cost_callback(
|
||||
kwargs={
|
||||
"call_type": "acompletion",
|
||||
"model": "sagemaker-chatgpt-v-2",
|
||||
"stream": True,
|
||||
"complete_streaming_response": resp,
|
||||
"litellm_params": {
|
||||
"metadata": {
|
||||
"user_api_key": hash_token(generated_key),
|
||||
"user_api_key_user_id": user_id,
|
||||
}
|
||||
},
|
||||
"response_cost": 0.00005,
|
||||
},
|
||||
completion_response=resp,
|
||||
start_time=datetime.now(),
|
||||
end_time=datetime.now(),
|
||||
)
|
||||
|
||||
# use generated key to auth in
|
||||
result = await user_api_key_auth(request=request, api_key=bearer_token)
|
||||
print("result from user auth with new key", result)
|
||||
pytest.fail(f"This should have failed!. They key crossed it's budget")
|
||||
|
||||
except Exception as e:
|
||||
print("Got Exception", e)
|
||||
error_detail = e.message
|
||||
assert "Authentication Error, ExceededTokenBudget:" in error_detail
|
||||
print(vars(e))
|
||||
|
||||
|
||||
@pytest.mark.asyncio()
|
||||
async def test_view_spend_per_user(prisma_client):
|
||||
setattr(litellm.proxy.proxy_server, "prisma_client", prisma_client)
|
||||
setattr(litellm.proxy.proxy_server, "master_key", "sk-1234")
|
||||
await litellm.proxy.proxy_server.prisma_client.connect()
|
||||
try:
|
||||
user_by_spend = await spend_user_fn(user_id=None)
|
||||
assert type(user_by_spend) == list
|
||||
assert len(user_by_spend) > 0
|
||||
first_user = user_by_spend[0]
|
||||
|
||||
print("\nfirst_user=", first_user)
|
||||
assert first_user.spend > 0
|
||||
except Exception as e:
|
||||
print("Got Exception", e)
|
||||
pytest.fail(f"Got exception {e}")
|
||||
|
||||
|
||||
@pytest.mark.asyncio()
|
||||
async def test_view_spend_per_key(prisma_client):
|
||||
setattr(litellm.proxy.proxy_server, "prisma_client", prisma_client)
|
||||
setattr(litellm.proxy.proxy_server, "master_key", "sk-1234")
|
||||
await litellm.proxy.proxy_server.prisma_client.connect()
|
||||
try:
|
||||
key_by_spend = await spend_key_fn()
|
||||
assert type(key_by_spend) == list
|
||||
assert len(key_by_spend) > 0
|
||||
first_key = key_by_spend[0]
|
||||
|
||||
print("\nfirst_key=", first_key)
|
||||
assert first_key.spend > 0
|
||||
except Exception as e:
|
||||
print("Got Exception", e)
|
||||
pytest.fail(f"Got exception {e}")
|
||||
|
||||
|
||||
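As a rough complement to these two tests, a sketch of querying the same spend views over HTTP; the /spend/users and /spend/keys paths are assumptions inferred from the spend_user_fn / spend_key_fn handler names and should be checked against the proxy docs.

# Illustrative sketch only: reading spend-per-user and spend-per-key from a running proxy.
import requests

BASE_URL = "http://localhost:4000"  # placeholder
ADMIN_HEADERS = {"Authorization": "Bearer sk-1234"}  # placeholder master key

spend_by_user = requests.get(f"{BASE_URL}/spend/users", headers=ADMIN_HEADERS).json()
spend_by_key = requests.get(f"{BASE_URL}/spend/keys", headers=ADMIN_HEADERS).json()
print(spend_by_user[:1], spend_by_key[:1])
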
@pytest.mark.asyncio()
|
||||
async def test_key_name_null(prisma_client):
|
||||
"""
|
||||
- create key
|
||||
- get key info
|
||||
- assert key_name is null
|
||||
"""
|
||||
setattr(litellm.proxy.proxy_server, "prisma_client", prisma_client)
|
||||
setattr(litellm.proxy.proxy_server, "master_key", "sk-1234")
|
||||
await litellm.proxy.proxy_server.prisma_client.connect()
|
||||
try:
|
||||
request = GenerateKeyRequest()
|
||||
key = await generate_key_fn(request)
|
||||
generated_key = key.key
|
||||
result = await info_key_fn(key=generated_key)
|
||||
print("result from info_key_fn", result)
|
||||
assert result["info"]["key_name"] is None
|
||||
except Exception as e:
|
||||
print("Got Exception", e)
|
||||
pytest.fail(f"Got exception {e}")
|
||||
|
||||
|
||||
@pytest.mark.asyncio()
|
||||
async def test_key_name_set(prisma_client):
|
||||
"""
|
||||
- create key
|
||||
- get key info
|
||||
- assert key_name is not null
|
||||
"""
|
||||
setattr(litellm.proxy.proxy_server, "prisma_client", prisma_client)
|
||||
setattr(litellm.proxy.proxy_server, "master_key", "sk-1234")
|
||||
setattr(litellm.proxy.proxy_server, "general_settings", {"allow_user_auth": True})
|
||||
await litellm.proxy.proxy_server.prisma_client.connect()
|
||||
try:
|
||||
request = GenerateKeyRequest()
|
||||
key = await generate_key_fn(request)
|
||||
generated_key = key.key
|
||||
result = await info_key_fn(key=generated_key)
|
||||
print("result from info_key_fn", result)
|
||||
assert isinstance(result["info"]["key_name"], str)
|
||||
except Exception as e:
|
||||
print("Got Exception", e)
|
||||
pytest.fail(f"Got exception {e}")
|
||||
|
||||
|
||||
@pytest.mark.asyncio()
|
||||
async def test_default_key_params(prisma_client):
|
||||
"""
|
||||
- create key
|
||||
- get key info
|
||||
- assert the default key params (max_budget) are applied
|
||||
"""
|
||||
setattr(litellm.proxy.proxy_server, "prisma_client", prisma_client)
|
||||
setattr(litellm.proxy.proxy_server, "master_key", "sk-1234")
|
||||
setattr(litellm.proxy.proxy_server, "general_settings", {"allow_user_auth": True})
|
||||
litellm.default_key_generate_params = {"max_budget": 0.000122}
|
||||
await litellm.proxy.proxy_server.prisma_client.connect()
|
||||
try:
|
||||
request = GenerateKeyRequest()
|
||||
key = await generate_key_fn(request)
|
||||
generated_key = key.key
|
||||
result = await info_key_fn(key=generated_key)
|
||||
print("result from info_key_fn", result)
|
||||
assert result["info"]["max_budget"] == 0.000122
|
||||
except Exception as e:
|
||||
print("Got Exception", e)
|
||||
pytest.fail(f"Got exception {e}")
|
||||
|
|
litellm/tests/test_parallel_request_limiter.py (new file, 685 lines)
@ -0,0 +1,685 @@
# What this tests?
## Unit Tests for the max parallel request limiter for the proxy

import sys, os, asyncio, time, random
from datetime import datetime
import traceback
from dotenv import load_dotenv

load_dotenv()
import os

sys.path.insert(
    0, os.path.abspath("../..")
)  # Adds the parent directory to the system path
import pytest
import litellm
from litellm import Router
from litellm.proxy.utils import ProxyLogging
from litellm.proxy._types import UserAPIKeyAuth
from litellm.caching import DualCache
from litellm.proxy.hooks.parallel_request_limiter import MaxParallelRequestsHandler
from datetime import datetime

## On Request received
## On Request success
## On Request failure

@pytest.mark.asyncio
async def test_pre_call_hook():
    """
    Test if cache updated on call being received
    """
    _api_key = "sk-12345"
    user_api_key_dict = UserAPIKeyAuth(api_key=_api_key, max_parallel_requests=1)
    local_cache = DualCache()
    parallel_request_handler = MaxParallelRequestsHandler()

    await parallel_request_handler.async_pre_call_hook(
        user_api_key_dict=user_api_key_dict, cache=local_cache, data={}, call_type=""
    )

    current_date = datetime.now().strftime("%Y-%m-%d")
    current_hour = datetime.now().strftime("%H")
    current_minute = datetime.now().strftime("%M")
    precise_minute = f"{current_date}-{current_hour}-{current_minute}"
    request_count_api_key = f"{_api_key}::{precise_minute}::request_count"

    print(
        parallel_request_handler.user_api_key_cache.get_cache(key=request_count_api_key)
    )
    assert (
        parallel_request_handler.user_api_key_cache.get_cache(
            key=request_count_api_key
        )["current_requests"]
        == 1
    )

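Since the same cache-key convention is asserted throughout this file, a small standalone sketch of it may help; the helper name below is hypothetical, and the format is taken from the assertions above.

# Minimal sketch, not part of the diff: the per-minute request-count cache key these tests assert on.
from datetime import datetime

def build_request_count_key(api_key: str) -> str:
    # format: "<api_key>::<YYYY-MM-DD>-<HH>-<MM>::request_count"
    now = datetime.now()
    precise_minute = f"{now.strftime('%Y-%m-%d')}-{now.strftime('%H')}-{now.strftime('%M')}"
    return f"{api_key}::{precise_minute}::request_count"

# e.g. build_request_count_key("sk-12345") -> "sk-12345::2024-01-31-14-05::request_count"
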
@pytest.mark.asyncio
|
||||
async def test_pre_call_hook_rpm_limits():
|
||||
"""
|
||||
Test if error raised on hitting rpm limits
|
||||
"""
|
||||
_api_key = "sk-12345"
|
||||
user_api_key_dict = UserAPIKeyAuth(
|
||||
api_key=_api_key, max_parallel_requests=1, tpm_limit=9, rpm_limit=1
|
||||
)
|
||||
local_cache = DualCache()
|
||||
parallel_request_handler = MaxParallelRequestsHandler()
|
||||
|
||||
await parallel_request_handler.async_pre_call_hook(
|
||||
user_api_key_dict=user_api_key_dict, cache=local_cache, data={}, call_type=""
|
||||
)
|
||||
|
||||
kwargs = {"litellm_params": {"metadata": {"user_api_key": _api_key}}}
|
||||
|
||||
await parallel_request_handler.async_log_success_event(
|
||||
kwargs=kwargs,
|
||||
response_obj="",
|
||||
start_time="",
|
||||
end_time="",
|
||||
)
|
||||
|
||||
## Expected cache val: {"current_requests": 0, "current_tpm": 0, "current_rpm": 1}
|
||||
|
||||
try:
|
||||
await parallel_request_handler.async_pre_call_hook(
|
||||
user_api_key_dict=user_api_key_dict,
|
||||
cache=local_cache,
|
||||
data={},
|
||||
call_type="",
|
||||
)
|
||||
|
||||
pytest.fail(f"Expected call to fail")
|
||||
except Exception as e:
|
||||
assert e.status_code == 429
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_pre_call_hook_tpm_limits():
|
||||
"""
|
||||
Test if error raised on hitting tpm limits
|
||||
"""
|
||||
_api_key = "sk-12345"
|
||||
user_api_key_dict = UserAPIKeyAuth(
|
||||
api_key=_api_key, max_parallel_requests=1, tpm_limit=9, rpm_limit=10
|
||||
)
|
||||
local_cache = DualCache()
|
||||
parallel_request_handler = MaxParallelRequestsHandler()
|
||||
|
||||
await parallel_request_handler.async_pre_call_hook(
|
||||
user_api_key_dict=user_api_key_dict, cache=local_cache, data={}, call_type=""
|
||||
)
|
||||
|
||||
kwargs = {"litellm_params": {"metadata": {"user_api_key": _api_key}}}
|
||||
|
||||
await parallel_request_handler.async_log_success_event(
|
||||
kwargs=kwargs,
|
||||
response_obj=litellm.ModelResponse(usage=litellm.Usage(total_tokens=10)),
|
||||
start_time="",
|
||||
end_time="",
|
||||
)
|
||||
|
||||
## Expected cache val: {"current_requests": 0, "current_tpm": 0, "current_rpm": 1}
|
||||
|
||||
try:
|
||||
await parallel_request_handler.async_pre_call_hook(
|
||||
user_api_key_dict=user_api_key_dict,
|
||||
cache=local_cache,
|
||||
data={},
|
||||
call_type="",
|
||||
)
|
||||
|
||||
pytest.fail(f"Expected call to fail")
|
||||
except Exception as e:
|
||||
assert e.status_code == 429
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_success_call_hook():
|
||||
"""
|
||||
Test if on success, cache correctly decremented
|
||||
"""
|
||||
_api_key = "sk-12345"
|
||||
user_api_key_dict = UserAPIKeyAuth(api_key=_api_key, max_parallel_requests=1)
|
||||
local_cache = DualCache()
|
||||
parallel_request_handler = MaxParallelRequestsHandler()
|
||||
|
||||
await parallel_request_handler.async_pre_call_hook(
|
||||
user_api_key_dict=user_api_key_dict, cache=local_cache, data={}, call_type=""
|
||||
)
|
||||
|
||||
current_date = datetime.now().strftime("%Y-%m-%d")
|
||||
current_hour = datetime.now().strftime("%H")
|
||||
current_minute = datetime.now().strftime("%M")
|
||||
precise_minute = f"{current_date}-{current_hour}-{current_minute}"
|
||||
request_count_api_key = f"{_api_key}::{precise_minute}::request_count"
|
||||
|
||||
assert (
|
||||
parallel_request_handler.user_api_key_cache.get_cache(
|
||||
key=request_count_api_key
|
||||
)["current_requests"]
|
||||
== 1
|
||||
)
|
||||
|
||||
kwargs = {"litellm_params": {"metadata": {"user_api_key": _api_key}}}
|
||||
|
||||
await parallel_request_handler.async_log_success_event(
|
||||
kwargs=kwargs, response_obj="", start_time="", end_time=""
|
||||
)
|
||||
|
||||
assert (
|
||||
parallel_request_handler.user_api_key_cache.get_cache(
|
||||
key=request_count_api_key
|
||||
)["current_requests"]
|
||||
== 0
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_failure_call_hook():
|
||||
"""
|
||||
Test if on failure, cache correctly decremented
|
||||
"""
|
||||
_api_key = "sk-12345"
|
||||
user_api_key_dict = UserAPIKeyAuth(api_key=_api_key, max_parallel_requests=1)
|
||||
local_cache = DualCache()
|
||||
parallel_request_handler = MaxParallelRequestsHandler()
|
||||
|
||||
await parallel_request_handler.async_pre_call_hook(
|
||||
user_api_key_dict=user_api_key_dict, cache=local_cache, data={}, call_type=""
|
||||
)
|
||||
|
||||
current_date = datetime.now().strftime("%Y-%m-%d")
|
||||
current_hour = datetime.now().strftime("%H")
|
||||
current_minute = datetime.now().strftime("%M")
|
||||
precise_minute = f"{current_date}-{current_hour}-{current_minute}"
|
||||
request_count_api_key = f"{_api_key}::{precise_minute}::request_count"
|
||||
|
||||
assert (
|
||||
parallel_request_handler.user_api_key_cache.get_cache(
|
||||
key=request_count_api_key
|
||||
)["current_requests"]
|
||||
== 1
|
||||
)
|
||||
|
||||
kwargs = {
|
||||
"litellm_params": {"metadata": {"user_api_key": _api_key}},
|
||||
"exception": Exception(),
|
||||
}
|
||||
|
||||
await parallel_request_handler.async_log_failure_event(
|
||||
kwargs=kwargs, response_obj="", start_time="", end_time=""
|
||||
)
|
||||
|
||||
assert (
|
||||
parallel_request_handler.user_api_key_cache.get_cache(
|
||||
key=request_count_api_key
|
||||
)["current_requests"]
|
||||
== 0
|
||||
)
|
||||
|
||||
|
||||
"""
|
||||
Test with Router
|
||||
- normal call
|
||||
- streaming call
|
||||
- bad call
|
||||
"""
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_normal_router_call():
|
||||
model_list = [
|
||||
{
|
||||
"model_name": "azure-model",
|
||||
"litellm_params": {
|
||||
"model": "azure/gpt-turbo",
|
||||
"api_key": "os.environ/AZURE_FRANCE_API_KEY",
|
||||
"api_base": "https://openai-france-1234.openai.azure.com",
|
||||
"rpm": 1440,
|
||||
},
|
||||
"model_info": {"id": 1},
|
||||
},
|
||||
{
|
||||
"model_name": "azure-model",
|
||||
"litellm_params": {
|
||||
"model": "azure/gpt-35-turbo",
|
||||
"api_key": "os.environ/AZURE_EUROPE_API_KEY",
|
||||
"api_base": "https://my-endpoint-europe-berri-992.openai.azure.com",
|
||||
"rpm": 6,
|
||||
},
|
||||
"model_info": {"id": 2},
|
||||
},
|
||||
]
|
||||
router = Router(
|
||||
model_list=model_list,
|
||||
set_verbose=False,
|
||||
num_retries=3,
|
||||
) # type: ignore
|
||||
|
||||
_api_key = "sk-12345"
|
||||
user_api_key_dict = UserAPIKeyAuth(api_key=_api_key, max_parallel_requests=1)
|
||||
local_cache = DualCache()
|
||||
pl = ProxyLogging(user_api_key_cache=local_cache)
|
||||
pl._init_litellm_callbacks()
|
||||
print(f"litellm callbacks: {litellm.callbacks}")
|
||||
parallel_request_handler = pl.max_parallel_request_limiter
|
||||
|
||||
await parallel_request_handler.async_pre_call_hook(
|
||||
user_api_key_dict=user_api_key_dict, cache=local_cache, data={}, call_type=""
|
||||
)
|
||||
|
||||
current_date = datetime.now().strftime("%Y-%m-%d")
|
||||
current_hour = datetime.now().strftime("%H")
|
||||
current_minute = datetime.now().strftime("%M")
|
||||
precise_minute = f"{current_date}-{current_hour}-{current_minute}"
|
||||
request_count_api_key = f"{_api_key}::{precise_minute}::request_count"
|
||||
|
||||
assert (
|
||||
parallel_request_handler.user_api_key_cache.get_cache(
|
||||
key=request_count_api_key
|
||||
)["current_requests"]
|
||||
== 1
|
||||
)
|
||||
|
||||
# normal call
|
||||
response = await router.acompletion(
|
||||
model="azure-model",
|
||||
messages=[{"role": "user", "content": "Hey, how's it going?"}],
|
||||
metadata={"user_api_key": _api_key},
|
||||
)
|
||||
await asyncio.sleep(1) # success is done in a separate thread
|
||||
print(f"response: {response}")
|
||||
|
||||
assert (
|
||||
parallel_request_handler.user_api_key_cache.get_cache(
|
||||
key=request_count_api_key
|
||||
)["current_requests"]
|
||||
== 0
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_normal_router_tpm_limit():
|
||||
model_list = [
|
||||
{
|
||||
"model_name": "azure-model",
|
||||
"litellm_params": {
|
||||
"model": "azure/gpt-turbo",
|
||||
"api_key": "os.environ/AZURE_FRANCE_API_KEY",
|
||||
"api_base": "https://openai-france-1234.openai.azure.com",
|
||||
"rpm": 1440,
|
||||
},
|
||||
"model_info": {"id": 1},
|
||||
},
|
||||
{
|
||||
"model_name": "azure-model",
|
||||
"litellm_params": {
|
||||
"model": "azure/gpt-35-turbo",
|
||||
"api_key": "os.environ/AZURE_EUROPE_API_KEY",
|
||||
"api_base": "https://my-endpoint-europe-berri-992.openai.azure.com",
|
||||
"rpm": 6,
|
||||
},
|
||||
"model_info": {"id": 2},
|
||||
},
|
||||
]
|
||||
router = Router(
|
||||
model_list=model_list,
|
||||
set_verbose=False,
|
||||
num_retries=3,
|
||||
) # type: ignore
|
||||
|
||||
_api_key = "sk-12345"
|
||||
user_api_key_dict = UserAPIKeyAuth(
|
||||
api_key=_api_key, max_parallel_requests=10, tpm_limit=10
|
||||
)
|
||||
local_cache = DualCache()
|
||||
pl = ProxyLogging(user_api_key_cache=local_cache)
|
||||
pl._init_litellm_callbacks()
|
||||
print(f"litellm callbacks: {litellm.callbacks}")
|
||||
parallel_request_handler = pl.max_parallel_request_limiter
|
||||
|
||||
await parallel_request_handler.async_pre_call_hook(
|
||||
user_api_key_dict=user_api_key_dict, cache=local_cache, data={}, call_type=""
|
||||
)
|
||||
|
||||
current_date = datetime.now().strftime("%Y-%m-%d")
|
||||
current_hour = datetime.now().strftime("%H")
|
||||
current_minute = datetime.now().strftime("%M")
|
||||
precise_minute = f"{current_date}-{current_hour}-{current_minute}"
|
||||
request_count_api_key = f"{_api_key}::{precise_minute}::request_count"
|
||||
|
||||
assert (
|
||||
parallel_request_handler.user_api_key_cache.get_cache(
|
||||
key=request_count_api_key
|
||||
)["current_requests"]
|
||||
== 1
|
||||
)
|
||||
|
||||
# normal call
|
||||
response = await router.acompletion(
|
||||
model="azure-model",
|
||||
messages=[{"role": "user", "content": "Write me a paragraph on the moon"}],
|
||||
metadata={"user_api_key": _api_key},
|
||||
)
|
||||
await asyncio.sleep(1) # success is done in a separate thread
|
||||
print(f"response: {response}")
|
||||
|
||||
try:
|
||||
await parallel_request_handler.async_pre_call_hook(
|
||||
user_api_key_dict=user_api_key_dict,
|
||||
cache=local_cache,
|
||||
data={},
|
||||
call_type="",
|
||||
)
|
||||
|
||||
pytest.fail(f"Expected call to fail")
|
||||
except Exception as e:
|
||||
assert e.status_code == 429
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_streaming_router_call():
|
||||
model_list = [
|
||||
{
|
||||
"model_name": "azure-model",
|
||||
"litellm_params": {
|
||||
"model": "azure/gpt-turbo",
|
||||
"api_key": "os.environ/AZURE_FRANCE_API_KEY",
|
||||
"api_base": "https://openai-france-1234.openai.azure.com",
|
||||
"rpm": 1440,
|
||||
},
|
||||
"model_info": {"id": 1},
|
||||
},
|
||||
{
|
||||
"model_name": "azure-model",
|
||||
"litellm_params": {
|
||||
"model": "azure/gpt-35-turbo",
|
||||
"api_key": "os.environ/AZURE_EUROPE_API_KEY",
|
||||
"api_base": "https://my-endpoint-europe-berri-992.openai.azure.com",
|
||||
"rpm": 6,
|
||||
},
|
||||
"model_info": {"id": 2},
|
||||
},
|
||||
]
|
||||
router = Router(
|
||||
model_list=model_list,
|
||||
set_verbose=False,
|
||||
num_retries=3,
|
||||
) # type: ignore
|
||||
|
||||
_api_key = "sk-12345"
|
||||
user_api_key_dict = UserAPIKeyAuth(api_key=_api_key, max_parallel_requests=1)
|
||||
local_cache = DualCache()
|
||||
pl = ProxyLogging(user_api_key_cache=local_cache)
|
||||
pl._init_litellm_callbacks()
|
||||
print(f"litellm callbacks: {litellm.callbacks}")
|
||||
parallel_request_handler = pl.max_parallel_request_limiter
|
||||
|
||||
await parallel_request_handler.async_pre_call_hook(
|
||||
user_api_key_dict=user_api_key_dict, cache=local_cache, data={}, call_type=""
|
||||
)
|
||||
|
||||
current_date = datetime.now().strftime("%Y-%m-%d")
|
||||
current_hour = datetime.now().strftime("%H")
|
||||
current_minute = datetime.now().strftime("%M")
|
||||
precise_minute = f"{current_date}-{current_hour}-{current_minute}"
|
||||
request_count_api_key = f"{_api_key}::{precise_minute}::request_count"
|
||||
|
||||
assert (
|
||||
parallel_request_handler.user_api_key_cache.get_cache(
|
||||
key=request_count_api_key
|
||||
)["current_requests"]
|
||||
== 1
|
||||
)
|
||||
|
||||
# streaming call
|
||||
response = await router.acompletion(
|
||||
model="azure-model",
|
||||
messages=[{"role": "user", "content": "Hey, how's it going?"}],
|
||||
stream=True,
|
||||
metadata={"user_api_key": _api_key},
|
||||
)
|
||||
async for chunk in response:
|
||||
continue
|
||||
await asyncio.sleep(1) # success is done in a separate thread
|
||||
assert (
|
||||
parallel_request_handler.user_api_key_cache.get_cache(
|
||||
key=request_count_api_key
|
||||
)["current_requests"]
|
||||
== 0
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_streaming_router_tpm_limit():
|
||||
model_list = [
|
||||
{
|
||||
"model_name": "azure-model",
|
||||
"litellm_params": {
|
||||
"model": "azure/gpt-turbo",
|
||||
"api_key": "os.environ/AZURE_FRANCE_API_KEY",
|
||||
"api_base": "https://openai-france-1234.openai.azure.com",
|
||||
"rpm": 1440,
|
||||
},
|
||||
"model_info": {"id": 1},
|
||||
},
|
||||
{
|
||||
"model_name": "azure-model",
|
||||
"litellm_params": {
|
||||
"model": "azure/gpt-35-turbo",
|
||||
"api_key": "os.environ/AZURE_EUROPE_API_KEY",
|
||||
"api_base": "https://my-endpoint-europe-berri-992.openai.azure.com",
|
||||
"rpm": 6,
|
||||
},
|
||||
"model_info": {"id": 2},
|
||||
},
|
||||
]
|
||||
router = Router(
|
||||
model_list=model_list,
|
||||
set_verbose=False,
|
||||
num_retries=3,
|
||||
) # type: ignore
|
||||
|
||||
_api_key = "sk-12345"
|
||||
user_api_key_dict = UserAPIKeyAuth(
|
||||
api_key=_api_key, max_parallel_requests=10, tpm_limit=10
|
||||
)
|
||||
local_cache = DualCache()
|
||||
pl = ProxyLogging(user_api_key_cache=local_cache)
|
||||
pl._init_litellm_callbacks()
|
||||
print(f"litellm callbacks: {litellm.callbacks}")
|
||||
parallel_request_handler = pl.max_parallel_request_limiter
|
||||
|
||||
await parallel_request_handler.async_pre_call_hook(
|
||||
user_api_key_dict=user_api_key_dict, cache=local_cache, data={}, call_type=""
|
||||
)
|
||||
|
||||
current_date = datetime.now().strftime("%Y-%m-%d")
|
||||
current_hour = datetime.now().strftime("%H")
|
||||
current_minute = datetime.now().strftime("%M")
|
||||
precise_minute = f"{current_date}-{current_hour}-{current_minute}"
|
||||
request_count_api_key = f"{_api_key}::{precise_minute}::request_count"
|
||||
|
||||
assert (
|
||||
parallel_request_handler.user_api_key_cache.get_cache(
|
||||
key=request_count_api_key
|
||||
)["current_requests"]
|
||||
== 1
|
||||
)
|
||||
|
||||
# normal call
|
||||
response = await router.acompletion(
|
||||
model="azure-model",
|
||||
messages=[{"role": "user", "content": "Write me a paragraph on the moon"}],
|
||||
stream=True,
|
||||
metadata={"user_api_key": _api_key},
|
||||
)
|
||||
async for chunk in response:
|
||||
continue
|
||||
await asyncio.sleep(1) # success is done in a separate thread
|
||||
|
||||
try:
|
||||
await parallel_request_handler.async_pre_call_hook(
|
||||
user_api_key_dict=user_api_key_dict,
|
||||
cache=local_cache,
|
||||
data={},
|
||||
call_type="",
|
||||
)
|
||||
|
||||
pytest.fail(f"Expected call to fail")
|
||||
except Exception as e:
|
||||
assert e.status_code == 429
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_bad_router_call():
|
||||
model_list = [
|
||||
{
|
||||
"model_name": "azure-model",
|
||||
"litellm_params": {
|
||||
"model": "azure/gpt-turbo",
|
||||
"api_key": "os.environ/AZURE_FRANCE_API_KEY",
|
||||
"api_base": "https://openai-france-1234.openai.azure.com",
|
||||
"rpm": 1440,
|
||||
},
|
||||
"model_info": {"id": 1},
|
||||
},
|
||||
{
|
||||
"model_name": "azure-model",
|
||||
"litellm_params": {
|
||||
"model": "azure/gpt-35-turbo",
|
||||
"api_key": "os.environ/AZURE_EUROPE_API_KEY",
|
||||
"api_base": "https://my-endpoint-europe-berri-992.openai.azure.com",
|
||||
"rpm": 6,
|
||||
},
|
||||
"model_info": {"id": 2},
|
||||
},
|
||||
]
|
||||
router = Router(
|
||||
model_list=model_list,
|
||||
set_verbose=False,
|
||||
num_retries=3,
|
||||
) # type: ignore
|
||||
|
||||
_api_key = "sk-12345"
|
||||
user_api_key_dict = UserAPIKeyAuth(api_key=_api_key, max_parallel_requests=1)
|
||||
local_cache = DualCache()
|
||||
pl = ProxyLogging(user_api_key_cache=local_cache)
|
||||
pl._init_litellm_callbacks()
|
||||
print(f"litellm callbacks: {litellm.callbacks}")
|
||||
parallel_request_handler = pl.max_parallel_request_limiter
|
||||
|
||||
await parallel_request_handler.async_pre_call_hook(
|
||||
user_api_key_dict=user_api_key_dict, cache=local_cache, data={}, call_type=""
|
||||
)
|
||||
|
||||
current_date = datetime.now().strftime("%Y-%m-%d")
|
||||
current_hour = datetime.now().strftime("%H")
|
||||
current_minute = datetime.now().strftime("%M")
|
||||
precise_minute = f"{current_date}-{current_hour}-{current_minute}"
|
||||
request_count_api_key = f"{_api_key}::{precise_minute}::request_count"
|
||||
|
||||
assert (
|
||||
parallel_request_handler.user_api_key_cache.get_cache(
|
||||
key=request_count_api_key
|
||||
)["current_requests"]
|
||||
== 1
|
||||
)
|
||||
|
||||
# bad streaming call
|
||||
try:
|
||||
response = await router.acompletion(
|
||||
model="azure-model",
|
||||
messages=[{"role": "user2", "content": "Hey, how's it going?"}],
|
||||
stream=True,
|
||||
metadata={"user_api_key": _api_key},
|
||||
)
|
||||
except:
|
||||
pass
|
||||
assert (
|
||||
parallel_request_handler.user_api_key_cache.get_cache(
|
||||
key=request_count_api_key
|
||||
)["current_requests"]
|
||||
== 0
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_bad_router_tpm_limit():
|
||||
model_list = [
|
||||
{
|
||||
"model_name": "azure-model",
|
||||
"litellm_params": {
|
||||
"model": "azure/gpt-turbo",
|
||||
"api_key": "os.environ/AZURE_FRANCE_API_KEY",
|
||||
"api_base": "https://openai-france-1234.openai.azure.com",
|
||||
"rpm": 1440,
|
||||
},
|
||||
"model_info": {"id": 1},
|
||||
},
|
||||
{
|
||||
"model_name": "azure-model",
|
||||
"litellm_params": {
|
||||
"model": "azure/gpt-35-turbo",
|
||||
"api_key": "os.environ/AZURE_EUROPE_API_KEY",
|
||||
"api_base": "https://my-endpoint-europe-berri-992.openai.azure.com",
|
||||
"rpm": 6,
|
||||
},
|
||||
"model_info": {"id": 2},
|
||||
},
|
||||
]
|
||||
router = Router(
|
||||
model_list=model_list,
|
||||
set_verbose=False,
|
||||
num_retries=3,
|
||||
) # type: ignore
|
||||
|
||||
_api_key = "sk-12345"
|
||||
user_api_key_dict = UserAPIKeyAuth(
|
||||
api_key=_api_key, max_parallel_requests=10, tpm_limit=10
|
||||
)
|
||||
local_cache = DualCache()
|
||||
pl = ProxyLogging(user_api_key_cache=local_cache)
|
||||
pl._init_litellm_callbacks()
|
||||
print(f"litellm callbacks: {litellm.callbacks}")
|
||||
parallel_request_handler = pl.max_parallel_request_limiter
|
||||
|
||||
await parallel_request_handler.async_pre_call_hook(
|
||||
user_api_key_dict=user_api_key_dict, cache=local_cache, data={}, call_type=""
|
||||
)
|
||||
|
||||
current_date = datetime.now().strftime("%Y-%m-%d")
|
||||
current_hour = datetime.now().strftime("%H")
|
||||
current_minute = datetime.now().strftime("%M")
|
||||
precise_minute = f"{current_date}-{current_hour}-{current_minute}"
|
||||
request_count_api_key = f"{_api_key}::{precise_minute}::request_count"
|
||||
|
||||
assert (
|
||||
parallel_request_handler.user_api_key_cache.get_cache(
|
||||
key=request_count_api_key
|
||||
)["current_requests"]
|
||||
== 1
|
||||
)
|
||||
|
||||
# bad call
|
||||
try:
|
||||
response = await router.acompletion(
|
||||
model="azure-model",
|
||||
messages=[{"role": "user2", "content": "Write me a paragraph on the moon"}],
|
||||
stream=True,
|
||||
metadata={"user_api_key": _api_key},
|
||||
)
|
||||
except:
|
||||
pass
|
||||
await asyncio.sleep(1) # success is done in a separate thread
|
||||
|
||||
assert (
|
||||
parallel_request_handler.user_api_key_cache.get_cache(
|
||||
key=request_count_api_key
|
||||
)["current_tpm"]
|
||||
== 0
|
||||
)
|
|
@ -58,9 +58,10 @@ def test_custom_auth(client):
|
|||
|
||||
headers = {"Authorization": f"Bearer {token}"}
|
||||
response = client.post("/chat/completions", json=test_data, headers=headers)
|
||||
print(f"response: {response.text}")
|
||||
assert response.status_code == 401
|
||||
result = response.json()
|
||||
print(f"Received response: {result}")
|
||||
pytest.fail("LiteLLM Proxy test failed. This request should have been rejected")
|
||||
except Exception as e:
|
||||
pytest.fail("LiteLLM Proxy test failed. Exception", e)
|
||||
print(vars(e))
|
||||
print("got an exception")
|
||||
assert e.code == 401
|
||||
assert e.message == "Authentication Error, Failed custom auth"
|
||||
pass
|
||||
|
|
|
@ -32,7 +32,7 @@ from litellm.proxy.proxy_server import (
|
|||
) # Replace with the actual module where your FastAPI router is defined
|
||||
|
||||
# Your bearer token
|
||||
token = ""
|
||||
token = "sk-1234"
|
||||
|
||||
headers = {"Authorization": f"Bearer {token}"}
|
||||
|
||||
|
|
|
@ -31,7 +31,7 @@ from litellm.proxy.proxy_server import (
|
|||
) # Replace with the actual module where your FastAPI router is defined
|
||||
|
||||
# Your bearer token
|
||||
token = ""
|
||||
token = "sk-1234"
|
||||
|
||||
headers = {"Authorization": f"Bearer {token}"}
|
||||
|
||||
|
|
|
@ -33,7 +33,7 @@ from litellm.proxy.proxy_server import (
|
|||
) # Replace with the actual module where your FastAPI router is defined
|
||||
|
||||
# Your bearer token
|
||||
token = ""
|
||||
token = "sk-1234"
|
||||
|
||||
headers = {"Authorization": f"Bearer {token}"}
|
||||
|
||||
|
|
|
@ -942,3 +942,52 @@ def test_reading_openai_keys_os_environ():
|
|||
|
||||
|
||||
# test_reading_openai_keys_os_environ()
|
||||
|
||||
|
||||
def test_router_anthropic_key_dynamic():
|
||||
anthropic_api_key = os.environ.pop("ANTHROPIC_API_KEY")
|
||||
model_list = [
|
||||
{
|
||||
"model_name": "anthropic-claude",
|
||||
"litellm_params": {
|
||||
"model": "claude-instant-1",
|
||||
"api_key": anthropic_api_key,
|
||||
},
|
||||
}
|
||||
]
|
||||
|
||||
router = Router(model_list=model_list)
|
||||
messages = [{"role": "user", "content": "Hey, how's it going?"}]
|
||||
router.completion(model="anthropic-claude", messages=messages)
|
||||
os.environ["ANTHROPIC_API_KEY"] = anthropic_api_key
|
||||
|
||||
|
||||
def test_router_timeout():
|
||||
litellm.set_verbose = True
|
||||
from litellm._logging import verbose_logger
|
||||
import logging
|
||||
|
||||
verbose_logger.setLevel(logging.DEBUG)
|
||||
model_list = [
|
||||
{
|
||||
"model_name": "gpt-3.5-turbo",
|
||||
"litellm_params": {
|
||||
"model": "gpt-3.5-turbo",
|
||||
"api_key": "os.environ/OPENAI_API_KEY",
|
||||
},
|
||||
}
|
||||
]
|
||||
router = Router(model_list=model_list)
|
||||
messages = [{"role": "user", "content": "Hey, how's it going?"}]
|
||||
start_time = time.time()
|
||||
try:
|
||||
res = router.completion(
|
||||
model="gpt-3.5-turbo", messages=messages, timeout=0.0001
|
||||
)
|
||||
print(res)
|
||||
pytest.fail("this should have timed out")
|
||||
except litellm.exceptions.Timeout as e:
|
||||
print("got timeout exception")
|
||||
print(e)
|
||||
print(vars(e))
|
||||
pass
|
||||
|
|
|
@ -10,15 +10,16 @@ import litellm, asyncio, logging
|
|||
from litellm import Router
|
||||
|
||||
# this tests debug logs from litellm router and litellm proxy server
|
||||
from litellm._logging import verbose_router_logger
|
||||
|
||||
verbose_router_logger.setLevel(level=logging.INFO)
|
||||
from litellm._logging import verbose_router_logger, verbose_logger, verbose_proxy_logger
|
||||
|
||||
|
||||
# this tests debug logs from litellm router and litellm proxy server
|
||||
def test_async_fallbacks(caplog):
|
||||
# THIS IS A PROD TEST - DO NOT DELETE THIS. Used for testing if litellm proxy verbose logs are human readable
|
||||
litellm.set_verbose = False
|
||||
verbose_router_logger.setLevel(level=logging.INFO)
|
||||
verbose_logger.setLevel(logging.CRITICAL + 1)
|
||||
verbose_proxy_logger.setLevel(logging.CRITICAL + 1)
|
||||
model_list = [
|
||||
{
|
||||
"model_name": "azure/gpt-3.5-turbo",
|
||||
|
@ -69,7 +70,10 @@ def test_async_fallbacks(caplog):
|
|||
# on circle ci the captured logs get some async task exception logs - filter them out
|
||||
"Task exception was never retrieved"
|
||||
captured_logs = [
|
||||
log for log in captured_logs if "Task exception was never retrieved" not in log
|
||||
log
|
||||
for log in captured_logs
|
||||
if "Task exception was never retrieved" not in log
|
||||
and "get_available_deployment" not in log
|
||||
]
|
||||
|
||||
print("\n Captured caplog records - ", captured_logs)
|
||||
|
|
|
@ -698,3 +698,207 @@ async def test_async_fallbacks_max_retries_per_request():
|
|||
pytest.fail(f"An exception occurred: {e}")
|
||||
finally:
|
||||
router.reset()
|
||||
|
||||
|
||||
def test_usage_based_routing_fallbacks():
|
||||
try:
|
||||
# [Prod Test]
|
||||
# It tests Usage Based Routing with fallbacks
|
||||
# The Request should fail azure/gpt-4-fast. Then fallback -> "azure/gpt-4-basic" -> "openai-gpt-4"
|
||||
# It should work with "openai-gpt-4"
|
||||
import os
|
||||
import litellm
|
||||
from litellm import Router
|
||||
from dotenv import load_dotenv
|
||||
|
||||
load_dotenv()
|
||||
|
||||
# Constants for TPM and RPM allocation
|
||||
AZURE_FAST_TPM = 3
|
||||
AZURE_BASIC_TPM = 4
|
||||
OPENAI_TPM = 400
|
||||
ANTHROPIC_TPM = 100000
|
||||
|
||||
def get_azure_params(deployment_name: str):
|
||||
params = {
|
||||
"model": f"azure/{deployment_name}",
|
||||
"api_key": os.environ["AZURE_API_KEY"],
|
||||
"api_version": os.environ["AZURE_API_VERSION"],
|
||||
"api_base": os.environ["AZURE_API_BASE"],
|
||||
}
|
||||
return params
|
||||
|
||||
def get_openai_params(model: str):
|
||||
params = {
|
||||
"model": model,
|
||||
"api_key": os.environ["OPENAI_API_KEY"],
|
||||
}
|
||||
return params
|
||||
|
||||
def get_anthropic_params(model: str):
|
||||
params = {
|
||||
"model": model,
|
||||
"api_key": os.environ["ANTHROPIC_API_KEY"],
|
||||
}
|
||||
return params
|
||||
|
||||
model_list = [
|
||||
{
|
||||
"model_name": "azure/gpt-4-fast",
|
||||
"litellm_params": get_azure_params("chatgpt-v-2"),
|
||||
"tpm": AZURE_FAST_TPM,
|
||||
},
|
||||
{
|
||||
"model_name": "azure/gpt-4-basic",
|
||||
"litellm_params": get_azure_params("chatgpt-v-2"),
|
||||
"tpm": AZURE_BASIC_TPM,
|
||||
},
|
||||
{
|
||||
"model_name": "openai-gpt-4",
|
||||
"litellm_params": get_openai_params("gpt-3.5-turbo"),
|
||||
"tpm": OPENAI_TPM,
|
||||
},
|
||||
{
|
||||
"model_name": "anthropic-claude-instant-1.2",
|
||||
"litellm_params": get_anthropic_params("claude-instant-1.2"),
|
||||
"tpm": ANTHROPIC_TPM,
|
||||
},
|
||||
]
|
||||
# litellm.set_verbose=True
|
||||
fallbacks_list = [
|
||||
{"azure/gpt-4-fast": ["azure/gpt-4-basic"]},
|
||||
{"azure/gpt-4-basic": ["openai-gpt-4"]},
|
||||
{"openai-gpt-4": ["anthropic-claude-instant-1.2"]},
|
||||
]
|
||||
|
||||
router = Router(
|
||||
model_list=model_list,
|
||||
fallbacks=fallbacks_list,
|
||||
set_verbose=True,
|
||||
debug_level="DEBUG",
|
||||
routing_strategy="usage-based-routing",
|
||||
redis_host=os.environ["REDIS_HOST"],
|
||||
redis_port=os.environ["REDIS_PORT"],
|
||||
)
|
||||
|
||||
messages = [
|
||||
{"content": "Tell me a joke.", "role": "user"},
|
||||
]
|
||||
response = router.completion(
|
||||
model="azure/gpt-4-fast",
|
||||
messages=messages,
|
||||
timeout=5,
|
||||
mock_response="very nice to meet you",
|
||||
)
|
||||
print("response: ", response)
|
||||
print("response._hidden_params: ", response._hidden_params)
|
||||
# in this test, we expect azure/gpt-4 fast to fail, then azure-gpt-4 basic to fail and then openai-gpt-4 to pass
|
||||
# the token count of this message is > AZURE_FAST_TPM, > AZURE_BASIC_TPM
|
||||
assert response._hidden_params["custom_llm_provider"] == "openai"
|
||||
|
||||
# now make 20 mock requests to OpenAI - expect it to fall back to anthropic-claude-instant-1.2
|
||||
for i in range(20):
|
||||
response = router.completion(
|
||||
model="azure/gpt-4-fast",
|
||||
messages=messages,
|
||||
timeout=5,
|
||||
mock_response="very nice to meet you",
|
||||
)
|
||||
print("response: ", response)
|
||||
print("response._hidden_params: ", response._hidden_params)
|
||||
if i == 19:
|
||||
# by the 19th call we should have hit TPM LIMIT for OpenAI, it should fallback to anthropic-claude-instant-1.2
|
||||
assert response._hidden_params["custom_llm_provider"] == "anthropic"
|
||||
|
||||
except Exception as e:
|
||||
pytest.fail(f"An exception occurred {e}")
|
||||
|
||||
|
||||
def test_custom_cooldown_times():
|
||||
try:
|
||||
# set, custom_cooldown. Failed model in cooldown_models, after custom_cooldown, the failed model is no longer in cooldown_models
|
||||
|
||||
model_list = [
|
||||
{ # list of model deployments
|
||||
"model_name": "gpt-3.5-turbo", # openai model name
|
||||
"litellm_params": { # params for litellm completion/embedding call
|
||||
"model": "azure/chatgpt-v-2",
|
||||
"api_key": "bad-key",
|
||||
"api_version": os.getenv("AZURE_API_VERSION"),
|
||||
"api_base": os.getenv("AZURE_API_BASE"),
|
||||
},
|
||||
"tpm": 24000000,
|
||||
},
|
||||
{ # list of model deployments
|
||||
"model_name": "gpt-3.5-turbo", # openai model name
|
||||
"litellm_params": { # params for litellm completion/embedding call
|
||||
"model": "azure/chatgpt-v-2",
|
||||
"api_key": os.getenv("AZURE_API_KEY"),
|
||||
"api_version": os.getenv("AZURE_API_VERSION"),
|
||||
"api_base": os.getenv("AZURE_API_BASE"),
|
||||
},
|
||||
"tpm": 1,
|
||||
},
|
||||
]
|
||||
|
||||
litellm.set_verbose = False
|
||||
|
||||
router = Router(
|
||||
model_list=model_list,
|
||||
set_verbose=True,
|
||||
debug_level="INFO",
|
||||
cooldown_time=0.1,
|
||||
redis_host=os.getenv("REDIS_HOST"),
|
||||
redis_password=os.getenv("REDIS_PASSWORD"),
|
||||
redis_port=int(os.getenv("REDIS_PORT")),
|
||||
)
|
||||
|
||||
# make a request - expect it to fail
|
||||
try:
|
||||
response = router.completion(
|
||||
model="gpt-3.5-turbo",
|
||||
messages=[
|
||||
{
|
||||
"content": "Tell me a joke.",
|
||||
"role": "user",
|
||||
}
|
||||
],
|
||||
)
|
||||
except:
|
||||
pass
|
||||
|
||||
# expect 1 model to be in cooldown models
|
||||
cooldown_deployments = router._get_cooldown_deployments()
|
||||
print("cooldown_deployments after failed call: ", cooldown_deployments)
|
||||
assert (
|
||||
len(cooldown_deployments) == 1
|
||||
), "Expected 1 model to be in cooldown models"
|
||||
|
||||
selected_cooldown_model = cooldown_deployments[0]
|
||||
|
||||
# wait for 1/2 of cooldown time
|
||||
time.sleep(router.cooldown_time / 2)
|
||||
|
||||
# expect cooldown model to still be in cooldown models
|
||||
cooldown_deployments = router._get_cooldown_deployments()
|
||||
print(
|
||||
"cooldown_deployments after waiting 1/2 of cooldown: ", cooldown_deployments
|
||||
)
|
||||
assert (
|
||||
len(cooldown_deployments) == 1
|
||||
), "Expected 1 model to be in cooldown models"
|
||||
|
||||
# wait for 1/2 of cooldown time again, now we've waited for full cooldown
|
||||
time.sleep(router.cooldown_time / 2)
|
||||
|
||||
# expect cooldown model to be removed from cooldown models
|
||||
cooldown_deployments = router._get_cooldown_deployments()
|
||||
print(
|
||||
"cooldown_deployments after waiting cooldown time: ", cooldown_deployments
|
||||
)
|
||||
assert (
|
||||
len(cooldown_deployments) == 0
|
||||
), "Expected 0 models to be in cooldown models"
|
||||
|
||||
except Exception as e:
|
||||
print(e)
|
||||
|
|
|
@ -375,3 +375,76 @@ def test_model_group_aliases():
|
|||
|
||||
|
||||
# test_model_group_aliases()
|
||||
|
||||
|
||||
def test_usage_based_routing():
|
||||
"""
|
||||
in this test, we have a model group with two models in it, model-a and model-b.
|
||||
Then at some point, we exceed the TPM limit (set in the litellm_params)
|
||||
for model-a only; but for model-b we are still under the limit
|
||||
"""
|
||||
try:
|
||||
|
||||
def get_azure_params(deployment_name: str):
|
||||
params = {
|
||||
"model": f"azure/{deployment_name}",
|
||||
"api_key": os.environ["AZURE_API_KEY"],
|
||||
"api_version": os.environ["AZURE_API_VERSION"],
|
||||
"api_base": os.environ["AZURE_API_BASE"],
|
||||
}
|
||||
return params
|
||||
|
||||
model_list = [
|
||||
{
|
||||
"model_name": "azure/gpt-4",
|
||||
"litellm_params": get_azure_params("chatgpt-low-tpm"),
|
||||
"tpm": 100,
|
||||
},
|
||||
{
|
||||
"model_name": "azure/gpt-4",
|
||||
"litellm_params": get_azure_params("chatgpt-high-tpm"),
|
||||
"tpm": 1000,
|
||||
},
|
||||
]
|
||||
|
||||
router = Router(
|
||||
model_list=model_list,
|
||||
set_verbose=True,
|
||||
debug_level="DEBUG",
|
||||
routing_strategy="usage-based-routing",
|
||||
redis_host=os.environ["REDIS_HOST"],
|
||||
redis_port=os.environ["REDIS_PORT"],
|
||||
)
|
||||
|
||||
messages = [
|
||||
{"content": "Tell me a joke.", "role": "user"},
|
||||
]
|
||||
|
||||
selection_counts = defaultdict(int)
|
||||
for _ in range(25):
|
||||
response = router.completion(
|
||||
model="azure/gpt-4",
|
||||
messages=messages,
|
||||
timeout=5,
|
||||
mock_response="good morning",
|
||||
)
|
||||
|
||||
# print(response)
|
||||
|
||||
selection_counts[response["model"]] += 1
|
||||
|
||||
print(selection_counts)
|
||||
|
||||
total_requests = sum(selection_counts.values())
|
||||
|
||||
# Assert that 'chatgpt-low-tpm' has more than 2 requests
|
||||
assert (
|
||||
selection_counts["chatgpt-low-tpm"] > 2
|
||||
), f"Assertion failed: 'chatgpt-low-tpm' does not have more than 2 request in the weighted load balancer. Selection counts {selection_counts}"
|
||||
|
||||
# Assert that 'chatgpt-high-tpm' has about 80% of the total requests
|
||||
assert (
|
||||
selection_counts["chatgpt-high-tpm"] / total_requests > 0.8
|
||||
), f"Assertion failed: 'chatgpt-high-tpm' does not have about 80% of the total requests in the weighted load balancer. Selection counts {selection_counts}"
|
||||
except Exception as e:
|
||||
pytest.fail(f"Error occurred: {e}")
|
||||
|
|
|
@ -274,7 +274,7 @@ def test_completion_azure_stream():
|
|||
pytest.fail(f"Error occurred: {e}")
|
||||
|
||||
|
||||
test_completion_azure_stream()
|
||||
# test_completion_azure_stream()
|
||||
|
||||
|
||||
def test_completion_azure_function_calling_stream():
|
||||
|
@ -398,6 +398,36 @@ def test_completion_palm_stream():
|
|||
# test_completion_palm_stream()
|
||||
|
||||
|
||||
def test_completion_gemini_stream():
|
||||
try:
|
||||
litellm.set_verbose = False
|
||||
print("Streaming gemini response")
|
||||
messages = [
|
||||
{"role": "system", "content": "You are a helpful assistant."},
|
||||
{
|
||||
"role": "user",
|
||||
"content": "how does a court case get to the Supreme Court?",
|
||||
},
|
||||
]
|
||||
print("testing gemini streaming")
|
||||
response = completion(model="gemini/gemini-pro", messages=messages, stream=True)
|
||||
print(f"type of response at the top: {response}")
|
||||
complete_response = ""
|
||||
# Add any assertions here to check the response
|
||||
for idx, chunk in enumerate(response):
|
||||
print(chunk)
|
||||
# print(chunk.choices[0].delta)
|
||||
chunk, finished = streaming_format_tests(idx, chunk)
|
||||
if finished:
|
||||
break
|
||||
complete_response += chunk
|
||||
if complete_response.strip() == "":
|
||||
raise Exception("Empty response received")
|
||||
print(f"completion_response: {complete_response}")
|
||||
except Exception as e:
|
||||
pytest.fail(f"Error occurred: {e}")
|
||||
|
||||
|
||||
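The streaming tests above all follow the same consumption pattern; a condensed, illustrative sketch of it (the model name is a placeholder, and this is not part of the diff):

# Minimal sketch: accumulate streamed deltas until finish_reason is set.
import litellm

def collect_stream(model: str, prompt: str) -> str:
    response = litellm.completion(
        model=model,  # placeholder, e.g. "gemini/gemini-pro"
        messages=[{"role": "user", "content": prompt}],
        stream=True,
    )
    text = ""
    for chunk in response:
        delta = chunk.choices[0].delta
        if getattr(delta, "content", None):
            text += delta.content
        if chunk.choices[0].finish_reason is not None:
            break
    return text
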
def test_completion_mistral_api_stream():
|
||||
try:
|
||||
litellm.set_verbose = True
|
||||
|
@ -703,8 +733,15 @@ def test_completion_bedrock_claude_stream():
|
|||
complete_response = ""
|
||||
has_finish_reason = False
|
||||
# Add any assertions here to check the response
|
||||
first_chunk_id = None
|
||||
for idx, chunk in enumerate(response):
|
||||
# print
|
||||
if idx == 0:
|
||||
first_chunk_id = chunk.id
|
||||
else:
|
||||
assert (
|
||||
chunk.id == first_chunk_id
|
||||
), f"chunk ids do not match: {chunk.id} != first chunk id{first_chunk_id}"
|
||||
chunk, finished = streaming_format_tests(idx, chunk)
|
||||
has_finish_reason = finished
|
||||
complete_response += chunk
|
||||
|
@ -769,9 +806,30 @@ def test_sagemaker_weird_response():
|
|||
When the stream ends, flush any remaining holding chunks.
|
||||
"""
|
||||
try:
|
||||
chunk = """<s>[INST] Hey, how's it going? [/INST]
|
||||
from litellm.llms.sagemaker import TokenIterator
|
||||
import json
|
||||
import json
|
||||
from litellm.llms.sagemaker import TokenIterator
|
||||
|
||||
I'm doing well, thanks for asking! How about you? Is there anything you'd like to chat about or ask? I'm here to help with any questions you might have."""
|
||||
chunk = """<s>[INST] Hey, how's it going? [/INST],
|
||||
I'm doing well, thanks for asking! How about you? Is there anything you'd like to chat about or ask? I'm here to help with any questions you might have."""
|
||||
|
||||
data = "\n".join(
|
||||
map(
|
||||
lambda x: f"data: {json.dumps({'token': {'text': x.strip()}})}",
|
||||
chunk.strip().split(","),
|
||||
)
|
||||
)
|
||||
stream = bytes(data, encoding="utf8")
|
||||
|
||||
# Modify the array to be a dictionary with "PayloadPart" and "Bytes" keys.
|
||||
stream_iterator = iter([{"PayloadPart": {"Bytes": stream}}])
|
||||
|
||||
token_iter = TokenIterator(stream_iterator)
|
||||
|
||||
# for token in token_iter:
|
||||
# print(token)
|
||||
litellm.set_verbose = True
|
||||
|
||||
logging_obj = litellm.Logging(
|
||||
model="berri-benchmarking-Llama-2-70b-chat-hf-4",
|
||||
|
@ -783,14 +841,19 @@ def test_sagemaker_weird_response():
|
|||
start_time=time.time(),
|
||||
)
|
||||
response = litellm.CustomStreamWrapper(
|
||||
completion_stream=chunk,
|
||||
completion_stream=token_iter,
|
||||
model="berri-benchmarking-Llama-2-70b-chat-hf-4",
|
||||
custom_llm_provider="sagemaker",
|
||||
logging_obj=logging_obj,
|
||||
)
|
||||
complete_response = ""
|
||||
for chunk in response:
|
||||
complete_response += chunk["choices"][0]["delta"]["content"]
|
||||
for idx, chunk in enumerate(response):
|
||||
# print
|
||||
chunk, finished = streaming_format_tests(idx, chunk)
|
||||
has_finish_reason = finished
|
||||
complete_response += chunk
|
||||
if finished:
|
||||
break
|
||||
assert len(complete_response) > 0
|
||||
except Exception as e:
|
||||
pytest.fail(f"An exception occurred - {str(e)}")
|
||||
|
@ -813,41 +876,53 @@ async def test_sagemaker_streaming_async():
|
|||
)
|
||||
|
||||
# Add any assertions here to check the response
|
||||
print(response)
|
||||
complete_response = ""
|
||||
has_finish_reason = False
|
||||
# Add any assertions here to check the response
|
||||
idx = 0
|
||||
async for chunk in response:
|
||||
complete_response += chunk.choices[0].delta.content or ""
|
||||
print(f"complete_response: {complete_response}")
|
||||
assert len(complete_response) > 0
|
||||
# print
|
||||
chunk, finished = streaming_format_tests(idx, chunk)
|
||||
has_finish_reason = finished
|
||||
complete_response += chunk
|
||||
if finished:
|
||||
break
|
||||
idx += 1
|
||||
if has_finish_reason is False:
|
||||
raise Exception("finish reason not set for last chunk")
|
||||
if complete_response.strip() == "":
|
||||
raise Exception("Empty response received")
|
||||
print(f"completion_response: {complete_response}")
|
||||
except Exception as e:
|
||||
pytest.fail(f"An exception occurred - {str(e)}")
|
||||
|
||||
|
||||
# def test_completion_sagemaker_stream():
|
||||
# try:
|
||||
# response = completion(
|
||||
# model="sagemaker/jumpstart-dft-meta-textgeneration-llama-2-7b",
|
||||
# messages=messages,
|
||||
# temperature=0.2,
|
||||
# max_tokens=80,
|
||||
# stream=True,
|
||||
# )
|
||||
# complete_response = ""
|
||||
# has_finish_reason = False
|
||||
# # Add any assertions here to check the response
|
||||
# for idx, chunk in enumerate(response):
|
||||
# chunk, finished = streaming_format_tests(idx, chunk)
|
||||
# has_finish_reason = finished
|
||||
# if finished:
|
||||
# break
|
||||
# complete_response += chunk
|
||||
# if has_finish_reason is False:
|
||||
# raise Exception("finish reason not set for last chunk")
|
||||
# if complete_response.strip() == "":
|
||||
# raise Exception("Empty response received")
|
||||
# except InvalidRequestError as e:
|
||||
# pass
|
||||
# except Exception as e:
|
||||
# pytest.fail(f"Error occurred: {e}")
|
||||
def test_completion_sagemaker_stream():
|
||||
try:
|
||||
response = completion(
|
||||
model="sagemaker/berri-benchmarking-Llama-2-70b-chat-hf-4",
|
||||
messages=messages,
|
||||
temperature=0.2,
|
||||
max_tokens=80,
|
||||
stream=True,
|
||||
)
|
||||
complete_response = ""
|
||||
has_finish_reason = False
|
||||
# Add any assertions here to check the response
|
||||
for idx, chunk in enumerate(response):
|
||||
chunk, finished = streaming_format_tests(idx, chunk)
|
||||
has_finish_reason = finished
|
||||
if finished:
|
||||
break
|
||||
complete_response += chunk
|
||||
if has_finish_reason is False:
|
||||
raise Exception("finish reason not set for last chunk")
|
||||
if complete_response.strip() == "":
|
||||
raise Exception("Empty response received")
|
||||
except Exception as e:
|
||||
pytest.fail(f"Error occurred: {e}")
|
||||
|
||||
|
||||
# test_completion_sagemaker_stream()
|
||||
|
||||
|
|
|
@ -39,6 +39,8 @@ def test_timeout():
|
|||
|
||||
def test_hanging_request_azure():
|
||||
litellm.set_verbose = True
|
||||
import asyncio
|
||||
|
||||
try:
|
||||
router = litellm.Router(
|
||||
model_list=[
|
||||
|
@ -58,13 +60,20 @@ def test_hanging_request_azure():
|
|||
)
|
||||
|
||||
encoded = litellm.utils.encode(model="gpt-3.5-turbo", text="blue")[0]
|
||||
response = router.completion(
|
||||
model="azure-gpt",
|
||||
messages=[{"role": "user", "content": f"what color is red {uuid.uuid4()}"}],
|
||||
logit_bias={encoded: 100},
|
||||
timeout=0.01,
|
||||
)
|
||||
print(response)
|
||||
|
||||
async def _test():
|
||||
response = await router.acompletion(
|
||||
model="azure-gpt",
|
||||
messages=[
|
||||
{"role": "user", "content": f"what color is red {uuid.uuid4()}"}
|
||||
],
|
||||
logit_bias={encoded: 100},
|
||||
timeout=0.01,
|
||||
)
|
||||
print(response)
|
||||
return response
|
||||
|
||||
response = asyncio.run(_test())
|
||||
|
||||
if response.choices[0].message.content is not None:
|
||||
pytest.fail("Got a response, expected a timeout")
|
||||
|
|
litellm/utils.py (563 changed lines)
|
@ -10,6 +10,7 @@
|
|||
import sys, re, binascii, struct
|
||||
import litellm
|
||||
import dotenv, json, traceback, threading, base64, ast
|
||||
|
||||
import subprocess, os
|
||||
import litellm, openai
|
||||
import itertools
|
||||
|
@ -36,6 +37,7 @@ os.environ[
|
|||
] = filename # use local copy of tiktoken b/c of - https://github.com/BerriAI/litellm/issues/1071
|
||||
encoding = tiktoken.get_encoding("cl100k_base")
|
||||
import importlib.metadata
|
||||
from ._logging import verbose_logger
|
||||
from .integrations.traceloop import TraceloopLogger
|
||||
from .integrations.helicone import HeliconeLogger
|
||||
from .integrations.aispend import AISpendLogger
|
||||
|
@ -712,6 +714,7 @@ class ImageResponse(OpenAIObject):
|
|||
############################################################
|
||||
def print_verbose(print_statement):
|
||||
try:
|
||||
verbose_logger.debug(print_statement)
|
||||
if litellm.set_verbose:
|
||||
print(print_statement) # noqa
|
||||
except:
|
||||
|
@ -764,6 +767,7 @@ class Logging:
|
|||
self.litellm_call_id = litellm_call_id
|
||||
self.function_id = function_id
|
||||
self.streaming_chunks = [] # for generating complete stream response
|
||||
self.sync_streaming_chunks = [] # for generating complete stream response
|
||||
self.model_call_details = {}
|
||||
|
||||
def update_environment_variables(
|
||||
|
@ -773,7 +777,7 @@ class Logging:
|
|||
self.model = model
|
||||
self.user = user
|
||||
self.litellm_params = litellm_params
|
||||
self.logger_fn = litellm_params["logger_fn"]
|
||||
self.logger_fn = litellm_params.get("logger_fn", None)
|
||||
print_verbose(f"self.optional_params: {self.optional_params}")
|
||||
self.model_call_details = {
|
||||
"model": self.model,
|
||||
|
@ -827,7 +831,7 @@ class Logging:
|
|||
[f"-H '{k}: {v}'" for k, v in masked_headers.items()]
|
||||
)
|
||||
|
||||
print_verbose(f"PRE-API-CALL ADDITIONAL ARGS: {additional_args}")
|
||||
verbose_logger.debug(f"PRE-API-CALL ADDITIONAL ARGS: {additional_args}")
|
||||
|
||||
curl_command = "\n\nPOST Request Sent from LiteLLM:\n"
|
||||
curl_command += "curl -X POST \\\n"
|
||||
|
@ -842,7 +846,7 @@ class Logging:
|
|||
curl_command += additional_args.get("request_str", None)
|
||||
elif api_base == "":
|
||||
curl_command = self.model_call_details
|
||||
print_verbose(f"\033[92m{curl_command}\033[0m\n")
|
||||
verbose_logger.info(f"\033[92m{curl_command}\033[0m\n")
|
||||
if self.logger_fn and callable(self.logger_fn):
|
||||
try:
|
||||
self.logger_fn(
|
||||
|
@ -993,13 +997,10 @@ class Logging:
|
|||
self.model_call_details["log_event_type"] = "post_api_call"
|
||||
|
||||
# User Logging -> if you pass in a custom logging function
|
||||
print_verbose(
|
||||
verbose_logger.debug(
|
||||
f"RAW RESPONSE:\n{self.model_call_details.get('original_response', self.model_call_details)}\n\n"
|
||||
)
|
||||
print_verbose(
|
||||
f"Logging Details Post-API Call: logger_fn - {self.logger_fn} | callable(logger_fn) - {callable(self.logger_fn)}"
|
||||
)
|
||||
print_verbose(
|
||||
verbose_logger.debug(
|
||||
f"Logging Details Post-API Call: LiteLLM Params: {self.model_call_details}"
|
||||
)
|
||||
if self.logger_fn and callable(self.logger_fn):
|
||||
|
@ -1065,8 +1066,38 @@ class Logging:
|
|||
self.model_call_details["log_event_type"] = "successful_api_call"
|
||||
self.model_call_details["end_time"] = end_time
|
||||
self.model_call_details["cache_hit"] = cache_hit
|
||||
## if model in model cost map - log the response cost
|
||||
## else set cost to None
|
||||
verbose_logger.debug(f"Model={self.model}; result={result}")
|
||||
if (
|
||||
result is not None
|
||||
and (
|
||||
isinstance(result, ModelResponse)
|
||||
or isinstance(result, EmbeddingResponse)
|
||||
)
|
||||
and self.stream != True
|
||||
): # handle streaming separately
|
||||
try:
|
||||
self.model_call_details["response_cost"] = litellm.completion_cost(
|
||||
completion_response=result,
|
||||
)
|
||||
verbose_logger.debug(
|
||||
f"Model={self.model}; cost={self.model_call_details['response_cost']}"
|
||||
)
|
||||
except litellm.NotFoundError as e:
|
||||
verbose_logger.debug(
|
||||
f"Model={self.model} not found in completion cost map."
|
||||
)
|
||||
self.model_call_details["response_cost"] = None
|
||||
else: # streaming chunks + image gen.
|
||||
self.model_call_details["response_cost"] = None
|
||||
|
||||
if litellm.max_budget and self.stream:
|
||||
if (
|
||||
litellm.max_budget
|
||||
and self.stream
|
||||
and result is not None
|
||||
and "content" in result
|
||||
):
|
||||
time_diff = (end_time - start_time).total_seconds()
|
||||
float_diff = float(time_diff)
|
||||
litellm._current_cost += litellm.completion_cost(
|
||||
|
@ -1078,50 +1109,61 @@ class Logging:
|
|||
|
||||
return start_time, end_time, result
|
||||
except Exception as e:
|
||||
print_verbose(f"[Non-Blocking] LiteLLM.Success_Call Error: {str(e)}")
|
||||
raise Exception(f"[Non-Blocking] LiteLLM.Success_Call Error: {str(e)}")
|
||||
|
||||
def success_handler(
|
||||
self, result=None, start_time=None, end_time=None, cache_hit=None, **kwargs
|
||||
):
|
||||
print_verbose(f"Logging Details LiteLLM-Success Call")
|
||||
verbose_logger.debug(f"Logging Details LiteLLM-Success Call")
|
||||
start_time, end_time, result = self._success_handler_helper_fn(
|
||||
start_time=start_time,
|
||||
end_time=end_time,
|
||||
result=result,
|
||||
cache_hit=cache_hit,
|
||||
)
|
||||
# print(f"original response in success handler: {self.model_call_details['original_response']}")
|
||||
try:
|
||||
print_verbose(f"success callbacks: {litellm.success_callback}")
|
||||
verbose_logger.debug(f"success callbacks: {litellm.success_callback}")
|
||||
## BUILD COMPLETE STREAMED RESPONSE
|
||||
complete_streaming_response = None
|
||||
if (
|
||||
self.stream
|
||||
and self.model_call_details.get("litellm_params", {}).get(
|
||||
"acompletion", False
|
||||
)
|
||||
== False
|
||||
): # only call stream chunk builder if it's not acompletion()
|
||||
if self.stream:
|
||||
if (
|
||||
result.choices[0].finish_reason is not None
|
||||
): # if it's the last chunk
|
||||
self.streaming_chunks.append(result)
|
||||
# print_verbose(f"final set of received chunks: {self.streaming_chunks}")
|
||||
self.sync_streaming_chunks.append(result)
|
||||
# print_verbose(f"final set of received chunks: {self.sync_streaming_chunks}")
|
||||
try:
|
||||
complete_streaming_response = litellm.stream_chunk_builder(
|
||||
self.streaming_chunks,
|
||||
self.sync_streaming_chunks,
|
||||
messages=self.model_call_details.get("messages", None),
|
||||
start_time=start_time,
|
||||
end_time=end_time,
|
||||
)
|
||||
except:
|
||||
complete_streaming_response = None
|
||||
else:
|
||||
self.streaming_chunks.append(result)
|
||||
self.sync_streaming_chunks.append(result)
|
||||
|
||||
if complete_streaming_response:
|
||||
if complete_streaming_response is not None:
|
||||
verbose_logger.debug(
|
||||
f"Logging Details LiteLLM-Success Call streaming complete"
|
||||
)
|
||||
self.model_call_details[
|
||||
"complete_streaming_response"
|
||||
] = complete_streaming_response
|
||||
try:
|
||||
self.model_call_details["response_cost"] = litellm.completion_cost(
|
||||
completion_response=complete_streaming_response,
|
||||
)
|
||||
verbose_logger.debug(
|
||||
f"Model={self.model}; cost={self.model_call_details['response_cost']}"
|
||||
)
|
||||
except litellm.NotFoundError as e:
|
||||
verbose_logger.debug(
|
||||
f"Model={self.model} not found in completion cost map."
|
||||
)
|
||||
self.model_call_details["response_cost"] = None
|
||||
|
||||
start_time, end_time, result = self._success_handler_helper_fn(
|
||||
start_time=start_time,
|
||||
end_time=end_time,
|
||||
result=result,
|
||||
cache_hit=cache_hit,
|
||||
)
|
||||
for callback in litellm.success_callback:
|
||||
try:
|
||||
if callback == "lite_debugger":
|
||||
|
@ -1242,7 +1284,7 @@ class Logging:
|
|||
)
|
||||
if callback == "langfuse":
|
||||
global langFuseLogger
|
||||
print_verbose("reaches langfuse for logging!")
|
||||
verbose_logger.debug("reaches langfuse for logging!")
|
||||
kwargs = {}
|
||||
for k, v in self.model_call_details.items():
|
||||
if (
|
||||
|
@ -1251,7 +1293,10 @@ class Logging:
|
|||
kwargs[k] = v
|
||||
# this only logs streaming once, complete_streaming_response exists i.e when stream ends
|
||||
if self.stream:
|
||||
if "complete_streaming_response" not in kwargs:
|
||||
verbose_logger.debug(
|
||||
f"is complete_streaming_response in kwargs: {kwargs.get('complete_streaming_response', None)}"
|
||||
)
|
||||
if complete_streaming_response is None:
|
||||
break
|
||||
else:
|
||||
print_verbose("reaches langfuse for streaming logging!")
|
||||
|
@ -1306,7 +1351,9 @@ class Logging:
|
|||
)
|
||||
== False
|
||||
): # custom logger class
|
||||
print_verbose(f"success callbacks: Running Custom Logger Class")
|
||||
verbose_logger.info(
|
||||
f"success callbacks: Running SYNC Custom Logger Class"
|
||||
)
|
||||
if self.stream and complete_streaming_response is None:
|
||||
callback.log_stream_event(
|
||||
kwargs=self.model_call_details,
|
||||
|
@ -1328,7 +1375,17 @@ class Logging:
|
|||
start_time=start_time,
|
||||
end_time=end_time,
|
||||
)
|
||||
if callable(callback): # custom logger functions
|
||||
elif (
|
||||
callable(callback) == True
|
||||
and self.model_call_details.get("litellm_params", {}).get(
|
||||
"acompletion", False
|
||||
)
|
||||
== False
|
||||
and self.model_call_details.get("litellm_params", {}).get(
|
||||
"aembedding", False
|
||||
)
|
||||
== False
|
||||
): # custom logger functions
|
||||
print_verbose(
|
||||
f"success callbacks: Running Custom Callback Function"
|
||||
)
|
||||
|
@@ -1362,33 +1419,52 @@ class Logging:
"""
Implementing async callbacks, to handle asyncio event loop issues when custom integrations need to use async functions.
"""
print_verbose(f"Async success callbacks: {litellm._async_success_callback}")
verbose_logger.debug(
f"Async success callbacks: {litellm._async_success_callback}"
)
start_time, end_time, result = self._success_handler_helper_fn(
start_time=start_time, end_time=end_time, result=result, cache_hit=cache_hit
)
## BUILD COMPLETE STREAMED RESPONSE
complete_streaming_response = None
if self.stream:
if result.choices[0].finish_reason is not None: # if it's the last chunk
self.streaming_chunks.append(result)
# print_verbose(f"final set of received chunks: {self.streaming_chunks}")
# verbose_logger.debug(f"final set of received chunks: {self.streaming_chunks}")
try:
complete_streaming_response = litellm.stream_chunk_builder(
self.streaming_chunks,
messages=self.model_call_details.get("messages", None),
start_time=start_time,
end_time=end_time,
)
except Exception as e:
print_verbose(
verbose_logger.debug(
f"Error occurred building stream chunk: {traceback.format_exc()}"
)
complete_streaming_response = None
else:
self.streaming_chunks.append(result)
if complete_streaming_response:
print_verbose("Async success callbacks: Got a complete streaming response")
if complete_streaming_response is not None:
verbose_logger.debug(
"Async success callbacks: Got a complete streaming response"
)
self.model_call_details[
"complete_streaming_response"
] = complete_streaming_response
start_time, end_time, result = self._success_handler_helper_fn(
start_time=start_time, end_time=end_time, result=result, cache_hit=cache_hit
)
try:
self.model_call_details["response_cost"] = litellm.completion_cost(
completion_response=complete_streaming_response,
)
verbose_logger.debug(
f"Model={self.model}; cost={self.model_call_details['response_cost']}"
)
except litellm.NotFoundError as e:
verbose_logger.debug(
f"Model={self.model} not found in completion cost map."
)
self.model_call_details["response_cost"] = None

for callback in litellm._async_success_callback:
try:
if callback == "cache" and litellm.cache is not None:
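As a hedged illustration of the async path above (assuming litellm routes coroutine callbacks into _async_success_callback, as the wrapper code later in this diff does for acompletion), a coroutine callback can be registered the same way:

# Hedged sketch: an async success callback. With litellm.acompletion(), coroutine
# callbacks are awaited by async_success_handler rather than run in a thread,
# avoiding the event-loop issues described in the docstring above.
import asyncio
import litellm

async def async_track_cost(kwargs, completion_response, start_time, end_time):
    # For streamed calls, the assembled response appears under this key once the
    # final chunk has been processed (see stream_chunk_builder above).
    full_response = kwargs.get("complete_streaming_response")
    print("cost:", kwargs.get("response_cost"), "streamed:", full_response is not None)

litellm.success_callback = [async_track_cost]

async def main():
    await litellm.acompletion(
        model="gpt-3.5-turbo",
        messages=[{"role": "user", "content": "hi"}],
    )

asyncio.run(main())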
@ -1435,15 +1511,27 @@ class Logging:
|
|||
end_time=end_time,
|
||||
)
|
||||
if callable(callback): # custom logger functions
|
||||
print_verbose(f"Async success callbacks: async_log_event")
|
||||
await customLogger.async_log_event(
|
||||
kwargs=self.model_call_details,
|
||||
response_obj=result,
|
||||
start_time=start_time,
|
||||
end_time=end_time,
|
||||
print_verbose=print_verbose,
|
||||
callback_func=callback,
|
||||
)
|
||||
if self.stream:
|
||||
if "complete_streaming_response" in self.model_call_details:
|
||||
await customLogger.async_log_event(
|
||||
kwargs=self.model_call_details,
|
||||
response_obj=self.model_call_details[
|
||||
"complete_streaming_response"
|
||||
],
|
||||
start_time=start_time,
|
||||
end_time=end_time,
|
||||
print_verbose=print_verbose,
|
||||
callback_func=callback,
|
||||
)
|
||||
else:
|
||||
await customLogger.async_log_event(
|
||||
kwargs=self.model_call_details,
|
||||
response_obj=result,
|
||||
start_time=start_time,
|
||||
end_time=end_time,
|
||||
print_verbose=print_verbose,
|
||||
callback_func=callback,
|
||||
)
|
||||
if callback == "dynamodb":
|
||||
global dynamoLogger
|
||||
if dynamoLogger is None:
|
||||
|
@ -1864,12 +1952,6 @@ def client(original_function):
|
|||
# we only support async s3 logging for acompletion/aembedding since that's used on proxy
|
||||
litellm._async_success_callback.append(callback)
|
||||
removed_async_items.append(index)
|
||||
elif callback == "langfuse" and inspect.iscoroutinefunction(
|
||||
original_function
|
||||
):
|
||||
# use async success callback for langfuse if this is litellm.acompletion(). Streaming logging does not work otherwise
|
||||
litellm._async_success_callback.append(callback)
|
||||
removed_async_items.append(index)
|
||||
|
||||
# Pop the async items from success_callback in reverse order to avoid index issues
|
||||
for index in reversed(removed_async_items):
|
||||
|
@ -1947,6 +2029,16 @@ def client(original_function):
|
|||
call_type=call_type,
|
||||
start_time=start_time,
|
||||
)
|
||||
## check if metadata is passed in
|
||||
litellm_params = {}
|
||||
if "metadata" in kwargs:
|
||||
litellm_params["metadata"] = kwargs["metadata"]
|
||||
logging_obj.update_environment_variables(
|
||||
model=model,
|
||||
user="",
|
||||
optional_params={},
|
||||
litellm_params=litellm_params,
|
||||
)
|
||||
return logging_obj
|
||||
except Exception as e:
|
||||
import logging
|
||||
|
@ -2098,7 +2190,6 @@ def client(original_function):
|
|||
result = original_function(*args, **kwargs)
|
||||
end_time = datetime.datetime.now()
|
||||
if "stream" in kwargs and kwargs["stream"] == True:
|
||||
# TODO: Add to cache for streaming
|
||||
if (
|
||||
"complete_response" in kwargs
|
||||
and kwargs["complete_response"] == True
|
||||
|
@ -2130,7 +2221,7 @@ def client(original_function):
|
|||
litellm.cache.add_cache(result, *args, **kwargs)
|
||||
|
||||
# LOG SUCCESS - handle streaming success logging in the _next_ object, remove `handle_success` once it's deprecated
|
||||
print_verbose(f"Wrapper: Completed Call, calling success_handler")
|
||||
verbose_logger.info(f"Wrapper: Completed Call, calling success_handler")
|
||||
threading.Thread(
|
||||
target=logging_obj.success_handler, args=(result, start_time, end_time)
|
||||
).start()
|
||||
|
@ -2363,12 +2454,15 @@ def client(original_function):
|
|||
threading.Thread(
|
||||
target=logging_obj.success_handler, args=(result, start_time, end_time)
|
||||
).start()
|
||||
|
||||
# RETURN RESULT
|
||||
if hasattr(result, "_hidden_params"):
|
||||
result._hidden_params["model_id"] = kwargs.get("model_info", {}).get(
|
||||
"id", None
|
||||
)
|
||||
if isinstance(result, ModelResponse):
|
||||
if isinstance(result, ModelResponse) or isinstance(
|
||||
result, EmbeddingResponse
|
||||
):
|
||||
result._response_ms = (
|
||||
end_time - start_time
|
||||
).total_seconds() * 1000 # return response latency in ms like openai
|
||||
|
@@ -2486,24 +2580,20 @@ def get_replicate_completion_pricing(completion_response=None, total_time=0.0):


def _select_tokenizer(model: str):
# cohere
import pkg_resources
from importlib import resources

if model in litellm.cohere_models:
# cohere
tokenizer = Tokenizer.from_pretrained("Cohere/command-nightly")
return {"type": "huggingface_tokenizer", "tokenizer": tokenizer}
# anthropic
elif model in litellm.anthropic_models:
# Read the JSON file
filename = pkg_resources.resource_filename(
__name__, "llms/tokenizers/anthropic_tokenizer.json"
)
with open(filename, "r") as f:
with resources.open_text(
"litellm.llms.tokenizers", "anthropic_tokenizer.json"
) as f:
json_data = json.load(f)
# Decode the JSON data from utf-8
json_data_decoded = json.dumps(json_data, ensure_ascii=False)
# Convert to str
json_str = str(json_data_decoded)
# Convert to str (if necessary)
json_str = json.dumps(json_data)
# load tokenizer
tokenizer = Tokenizer.from_str(json_str)
return {"type": "huggingface_tokenizer", "tokenizer": tokenizer}
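The change above swaps pkg_resources file paths for importlib.resources when loading the bundled Anthropic tokenizer JSON. A standalone sketch of the same pattern (package and file names mirror the diff; any bundled text resource works the same way):

# Hedged sketch of the importlib.resources pattern used above. resources.open_text()
# reads a text file shipped inside an installed package without pkg_resources or
# hard-coded filesystem paths.
from importlib import resources
import json
from tokenizers import Tokenizer

with resources.open_text("litellm.llms.tokenizers", "anthropic_tokenizer.json") as f:
    json_data = json.load(f)

# Tokenizer.from_str expects the serialized tokenizer definition as a JSON string.
tokenizer = Tokenizer.from_str(json.dumps(json_data))
print(len(tokenizer.encode("hello world").ids))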
@@ -2793,15 +2883,25 @@ def token_counter(
print_verbose(
f"Token Counter - using generic token counter, for model={model}"
)
enc = tokenizer_json["tokenizer"].encode(text)
num_tokens = len(enc)
num_tokens = openai_token_counter(
text=text, # type: ignore
model="gpt-3.5-turbo",
messages=messages,
is_tool_call=is_tool_call,
count_response_tokens=count_response_tokens,
)
else:
num_tokens = len(encoding.encode(text)) # type: ignore
return num_tokens


def cost_per_token(
model="", prompt_tokens=0, completion_tokens=0, custom_llm_provider=None
model="",
prompt_tokens=0,
completion_tokens=0,
response_time_ms=None,
custom_llm_provider=None,
region_name=None,
):
"""
Calculates the cost per token for a given model, prompt tokens, and completion tokens.
@@ -2818,30 +2918,74 @@ def cost_per_token(
prompt_tokens_cost_usd_dollar = 0
completion_tokens_cost_usd_dollar = 0
model_cost_ref = litellm.model_cost
model_with_provider = model
if custom_llm_provider is not None:
model_with_provider = custom_llm_provider + "/" + model
else:
model_with_provider = model
if region_name is not None:
model_with_provider_and_region = (
f"{custom_llm_provider}/{region_name}/{model}"
)
if (
model_with_provider_and_region in model_cost_ref
): # use region based pricing, if it's available
model_with_provider = model_with_provider_and_region
# see this https://learn.microsoft.com/en-us/azure/ai-services/openai/concepts/models
print_verbose(f"Looking up model={model} in model_cost_map")

if model in model_cost_ref:
prompt_tokens_cost_usd_dollar = (
model_cost_ref[model]["input_cost_per_token"] * prompt_tokens
if model_with_provider in model_cost_ref:
print_verbose(
f"Success: model={model_with_provider} in model_cost_map - {model_cost_ref[model_with_provider]}"
)
completion_tokens_cost_usd_dollar = (
model_cost_ref[model]["output_cost_per_token"] * completion_tokens
print_verbose(
f"applying cost={model_cost_ref[model_with_provider].get('input_cost_per_token', None)} for prompt_tokens={prompt_tokens}"
)
return prompt_tokens_cost_usd_dollar, completion_tokens_cost_usd_dollar
elif model_with_provider in model_cost_ref:
print_verbose(f"Looking up model={model_with_provider} in model_cost_map")
prompt_tokens_cost_usd_dollar = (
model_cost_ref[model_with_provider]["input_cost_per_token"] * prompt_tokens
)
print_verbose(
f"calculated prompt_tokens_cost_usd_dollar: {prompt_tokens_cost_usd_dollar}"
)
print_verbose(
f"applying cost={model_cost_ref[model_with_provider].get('output_cost_per_token', None)} for completion_tokens={completion_tokens}"
)
completion_tokens_cost_usd_dollar = (
model_cost_ref[model_with_provider]["output_cost_per_token"]
* completion_tokens
)
print_verbose(
f"calculated completion_tokens_cost_usd_dollar: {completion_tokens_cost_usd_dollar}"
)
return prompt_tokens_cost_usd_dollar, completion_tokens_cost_usd_dollar
if model in model_cost_ref:
print_verbose(f"Success: model={model} in model_cost_map")
print_verbose(
f"prompt_tokens={prompt_tokens}; completion_tokens={completion_tokens}"
)
if (
model_cost_ref[model].get("input_cost_per_token", None) is not None
and model_cost_ref[model].get("output_cost_per_token", None) is not None
):
## COST PER TOKEN ##
prompt_tokens_cost_usd_dollar = (
model_cost_ref[model]["input_cost_per_token"] * prompt_tokens
)
completion_tokens_cost_usd_dollar = (
model_cost_ref[model]["output_cost_per_token"] * completion_tokens
)
elif (
model_cost_ref[model].get("input_cost_per_second", None) is not None
and response_time_ms is not None
):
print_verbose(
f"For model={model} - input_cost_per_second: {model_cost_ref[model].get('input_cost_per_second')}; response time: {response_time_ms}"
)
## COST PER SECOND ##
prompt_tokens_cost_usd_dollar = (
model_cost_ref[model]["input_cost_per_second"] * response_time_ms / 1000
)
completion_tokens_cost_usd_dollar = 0.0
print_verbose(
f"Returned custom cost for model={model} - prompt_tokens_cost_usd_dollar: {prompt_tokens_cost_usd_dollar}, completion_tokens_cost_usd_dollar: {completion_tokens_cost_usd_dollar}"
)
return prompt_tokens_cost_usd_dollar, completion_tokens_cost_usd_dollar
elif "ft:gpt-3.5-turbo" in model:
print_verbose(f"Cost Tracking: {model} is an OpenAI FinteTuned LLM")
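A simplified, self-contained sketch of the lookup order the new cost_per_token logic implements: region-qualified key first, then provider-qualified key, then the bare model name, with per-second pricing used when only input_cost_per_second is defined. The toy cost map below is illustrative, not the real litellm.model_cost:

# Hedged sketch of the pricing lookup order implemented above.
def lookup_cost(model, prompt_tokens, completion_tokens, response_time_ms=None,
                custom_llm_provider=None, region_name=None, cost_map=None):
    cost_map = cost_map or {}
    key = f"{custom_llm_provider}/{model}" if custom_llm_provider else model
    if custom_llm_provider and region_name:
        region_key = f"{custom_llm_provider}/{region_name}/{model}"
        if region_key in cost_map:  # region-based pricing wins when present
            key = region_key
    entry = cost_map.get(key) or cost_map.get(model)
    if entry is None:
        raise KeyError(f"{model} not found in cost map")
    if entry.get("input_cost_per_token") is not None:
        return (entry["input_cost_per_token"] * prompt_tokens,
                entry.get("output_cost_per_token", 0.0) * completion_tokens)
    # time-based pricing (e.g. sagemaker): charge on wall-clock seconds
    return (entry["input_cost_per_second"] * (response_time_ms or 0) / 1000, 0.0)

toy_map = {"bedrock/eu-west-1/anthropic.claude-v2": {"input_cost_per_token": 8e-6,
                                                     "output_cost_per_token": 2.4e-5}}
print(lookup_cost("anthropic.claude-v2", 1000, 200,
                  custom_llm_provider="bedrock", region_name="eu-west-1",
                  cost_map=toy_map))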
@ -2855,17 +2999,23 @@ def cost_per_token(
|
|||
)
|
||||
return prompt_tokens_cost_usd_dollar, completion_tokens_cost_usd_dollar
|
||||
elif model in litellm.azure_llms:
|
||||
print_verbose(f"Cost Tracking: {model} is an Azure LLM")
|
||||
verbose_logger.debug(f"Cost Tracking: {model} is an Azure LLM")
|
||||
model = litellm.azure_llms[model]
|
||||
verbose_logger.debug(
|
||||
f"applying cost={model_cost_ref[model]['input_cost_per_token']} for prompt_tokens={prompt_tokens}"
|
||||
)
|
||||
prompt_tokens_cost_usd_dollar = (
|
||||
model_cost_ref[model]["input_cost_per_token"] * prompt_tokens
|
||||
)
|
||||
verbose_logger.debug(
|
||||
f"applying cost={model_cost_ref[model]['output_cost_per_token']} for completion_tokens={completion_tokens}"
|
||||
)
|
||||
completion_tokens_cost_usd_dollar = (
|
||||
model_cost_ref[model]["output_cost_per_token"] * completion_tokens
|
||||
)
|
||||
return prompt_tokens_cost_usd_dollar, completion_tokens_cost_usd_dollar
|
||||
elif model in litellm.azure_embedding_models:
|
||||
print_verbose(f"Cost Tracking: {model} is an Azure Embedding Model")
|
||||
verbose_logger.debug(f"Cost Tracking: {model} is an Azure Embedding Model")
|
||||
model = litellm.azure_embedding_models[model]
|
||||
prompt_tokens_cost_usd_dollar = (
|
||||
model_cost_ref[model]["input_cost_per_token"] * prompt_tokens
|
||||
|
@@ -2895,7 +3045,14 @@ def completion_cost(
prompt="",
messages: List = [],
completion="",
total_time=0.0, # used for replicate
total_time=0.0, # used for replicate, sagemaker
### REGION ###
custom_llm_provider=None,
region_name=None, # used for bedrock pricing
### IMAGE GEN ###
size=None,
quality=None,
n=None, # number of images
):
"""
Calculate the cost of a given completion call fot GPT-3.5-turbo, llama2, any litellm supported llm.
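A hedged usage sketch for the extended completion_cost() signature above; passing a finished response object is the common case, and the new region, size, quality and n parameters only matter for bedrock and image-generation pricing:

# Hedged sketch: the two common ways completion_cost() is called.
import litellm

# 1) From a finished response object (model, usage and provider are read from it).
response = litellm.completion(model="gpt-3.5-turbo",
                              messages=[{"role": "user", "content": "hi"}])
print(litellm.completion_cost(completion_response=response))

# 2) From raw text, when no response object is available.
print(litellm.completion_cost(model="gpt-3.5-turbo",
                              prompt="hi", completion="hello there"))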
@ -2933,15 +3090,20 @@ def completion_cost(
|
|||
completion_tokens = completion_response.get("usage", {}).get(
|
||||
"completion_tokens", 0
|
||||
)
|
||||
total_time = completion_response.get("_response_ms", 0)
|
||||
verbose_logger.debug(
|
||||
f"completion_response response ms: {completion_response.get('_response_ms')} "
|
||||
)
|
||||
model = (
|
||||
model or completion_response["model"]
|
||||
) # check if user passed an override for model, if it's none check completion_response['model']
|
||||
if completion_response is not None and hasattr(
|
||||
completion_response, "_hidden_params"
|
||||
):
|
||||
if hasattr(completion_response, "_hidden_params"):
|
||||
custom_llm_provider = completion_response._hidden_params.get(
|
||||
"custom_llm_provider", ""
|
||||
)
|
||||
region_name = completion_response._hidden_params.get(
|
||||
"region_name", region_name
|
||||
)
|
||||
else:
|
||||
if len(messages) > 0:
|
||||
prompt_tokens = token_counter(model=model, messages=messages)
|
||||
|
@ -2953,6 +3115,37 @@ def completion_cost(
|
|||
f"Model is None and does not exist in passed completion_response. Passed completion_response={completion_response}, model={model}"
|
||||
)
|
||||
|
||||
if size is not None and n is not None:
|
||||
### IMAGE GENERATION COST CALCULATION ###
|
||||
image_gen_model_name = f"{size}/{model}"
|
||||
image_gen_model_name_with_quality = image_gen_model_name
|
||||
if quality is not None:
|
||||
image_gen_model_name_with_quality = f"{quality}/{image_gen_model_name}"
|
||||
size = size.split("-x-")
|
||||
height = int(size[0])
|
||||
width = int(size[1])
|
||||
verbose_logger.debug(f"image_gen_model_name: {image_gen_model_name}")
|
||||
verbose_logger.debug(
|
||||
f"image_gen_model_name_with_quality: {image_gen_model_name_with_quality}"
|
||||
)
|
||||
if image_gen_model_name in litellm.model_cost:
|
||||
return (
|
||||
litellm.model_cost[image_gen_model_name]["input_cost_per_pixel"]
|
||||
* height
|
||||
* width
|
||||
* n
|
||||
)
|
||||
elif image_gen_model_name_with_quality in litellm.model_cost:
|
||||
return (
|
||||
litellm.model_cost[image_gen_model_name_with_quality][
|
||||
"input_cost_per_pixel"
|
||||
]
|
||||
* height
|
||||
* width
|
||||
* n
|
||||
)
|
||||
else:
|
||||
raise Exception(f"Model={model} not found in completion cost model map")
|
||||
# Calculate cost based on prompt_tokens, completion_tokens
|
||||
if "togethercomputer" in model or "together_ai" in model:
|
||||
# together ai prices based on size of llm
|
||||
|
@ -2970,8 +3163,14 @@ def completion_cost(
|
|||
prompt_tokens=prompt_tokens,
|
||||
completion_tokens=completion_tokens,
|
||||
custom_llm_provider=custom_llm_provider,
|
||||
response_time_ms=total_time,
|
||||
region_name=region_name,
|
||||
)
|
||||
return prompt_tokens_cost_usd_dollar + completion_tokens_cost_usd_dollar
|
||||
_final_cost = prompt_tokens_cost_usd_dollar + completion_tokens_cost_usd_dollar
|
||||
print_verbose(
|
||||
f"final cost: {_final_cost}; prompt_tokens_cost_usd_dollar: {prompt_tokens_cost_usd_dollar}; completion_tokens_cost_usd_dollar: {completion_tokens_cost_usd_dollar}"
|
||||
)
|
||||
return _final_cost
|
||||
except Exception as e:
|
||||
raise e
|
||||
|
||||
|
@@ -3000,9 +3199,8 @@ def register_model(model_cost: Union[str, dict]):

for key, value in loaded_model_cost.items():
## override / add new keys to the existing model cost dictionary
if key in litellm.model_cost:
for k, v in loaded_model_cost[key].items():
litellm.model_cost[key][k] = v
litellm.model_cost.setdefault(key, {}).update(value)
verbose_logger.debug(f"{key} added to model cost map")
# add new model names to provider lists
if value.get("litellm_provider") == "openai":
if key not in litellm.open_ai_chat_completion_models:
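A hedged usage sketch for register_model(), matching the setdefault-plus-update behaviour introduced above; the model name and prices are illustrative:

# Hedged sketch: registering or overriding entries in the cost map.
# Existing keys are updated in place; unknown keys are added.
import litellm

litellm.register_model({
    "my-org/custom-llm": {
        "max_tokens": 8192,
        "input_cost_per_token": 0.000001,
        "output_cost_per_token": 0.000002,
        "litellm_provider": "openai",  # also appends the key to the provider's model list
        "mode": "chat",
    }
})
print(litellm.model_cost["my-org/custom-llm"]["input_cost_per_token"])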
@ -3138,8 +3336,10 @@ def get_optional_params_image_gen(
|
|||
|
||||
def get_optional_params_embeddings(
|
||||
# 2 optional params
|
||||
model=None,
|
||||
user=None,
|
||||
encoding_format=None,
|
||||
dimensions=None,
|
||||
custom_llm_provider="",
|
||||
**kwargs,
|
||||
):
|
||||
|
@ -3150,7 +3350,7 @@ def get_optional_params_embeddings(
|
|||
for k, v in special_params.items():
|
||||
passed_params[k] = v
|
||||
|
||||
default_params = {"user": None, "encoding_format": None}
|
||||
default_params = {"user": None, "encoding_format": None, "dimensions": None}
|
||||
|
||||
non_default_params = {
|
||||
k: v
|
||||
|
@ -3158,6 +3358,19 @@ def get_optional_params_embeddings(
|
|||
if (k in default_params and v != default_params[k])
|
||||
}
|
||||
## raise exception if non-default value passed for non-openai/azure embedding calls
|
||||
if custom_llm_provider == "openai":
|
||||
# 'dimensions` is only supported in `text-embedding-3` and later models
|
||||
|
||||
if (
|
||||
model is not None
|
||||
and "text-embedding-3" not in model
|
||||
and "dimensions" in non_default_params.keys()
|
||||
):
|
||||
raise UnsupportedParamsError(
|
||||
status_code=500,
|
||||
message=f"Setting dimensions is not supported for OpenAI `text-embedding-3` and later models. To drop it from the call, set `litellm.drop_params = True`.",
|
||||
)
|
||||
|
||||
if (
|
||||
custom_llm_provider != "openai"
|
||||
and custom_llm_provider != "azure"
|
||||
|
@ -3212,6 +3425,10 @@ def get_optional_params(
|
|||
custom_llm_provider != "bedrock" and custom_llm_provider != "sagemaker"
|
||||
): # allow dynamically setting boto3 init logic
|
||||
continue
|
||||
elif (
|
||||
k.startswith("vertex_") and custom_llm_provider != "vertex_ai"
|
||||
): # allow dynamically setting vertex ai init logic
|
||||
continue
|
||||
passed_params[k] = v
|
||||
default_params = {
|
||||
"functions": None,
|
||||
|
@ -3295,16 +3512,20 @@ def get_optional_params(
|
|||
)
|
||||
|
||||
def _check_valid_arg(supported_params):
|
||||
print_verbose(
|
||||
verbose_logger.debug(
|
||||
f"\nLiteLLM completion() model= {model}; provider = {custom_llm_provider}"
|
||||
)
|
||||
print_verbose(f"\nLiteLLM: Params passed to completion() {passed_params}")
|
||||
print_verbose(
|
||||
verbose_logger.debug(
|
||||
f"\nLiteLLM: Params passed to completion() {passed_params}"
|
||||
)
|
||||
verbose_logger.debug(
|
||||
f"\nLiteLLM: Non-Default params passed to completion() {non_default_params}"
|
||||
)
|
||||
unsupported_params = {}
|
||||
for k in non_default_params.keys():
|
||||
if k not in supported_params:
|
||||
if k == "user":
|
||||
continue
|
||||
if k == "n" and n == 1: # langchain sends n=1 as a default value
|
||||
continue # skip this param
|
||||
if (
|
||||
|
@ -5143,6 +5364,8 @@ def convert_to_model_response_object(
|
|||
"completion", "embedding", "image_generation"
|
||||
] = "completion",
|
||||
stream=False,
|
||||
start_time=None,
|
||||
end_time=None,
|
||||
):
|
||||
try:
|
||||
if response_type == "completion" and (
|
||||
|
@ -5196,6 +5419,12 @@ def convert_to_model_response_object(
|
|||
|
||||
if "model" in response_object:
|
||||
model_response_object.model = response_object["model"]
|
||||
|
||||
if start_time is not None and end_time is not None:
|
||||
model_response_object._response_ms = ( # type: ignore
|
||||
end_time - start_time
|
||||
).total_seconds() * 1000
|
||||
|
||||
return model_response_object
|
||||
elif response_type == "embedding" and (
|
||||
model_response_object is None
|
||||
|
@ -5220,6 +5449,11 @@ def convert_to_model_response_object(
|
|||
model_response_object.usage.prompt_tokens = response_object["usage"].get("prompt_tokens", 0) # type: ignore
|
||||
model_response_object.usage.total_tokens = response_object["usage"].get("total_tokens", 0) # type: ignore
|
||||
|
||||
if start_time is not None and end_time is not None:
|
||||
model_response_object._response_ms = ( # type: ignore
|
||||
end_time - start_time
|
||||
).total_seconds() * 1000 # return response latency in ms like openai
|
||||
|
||||
return model_response_object
|
||||
elif response_type == "image_generation" and (
|
||||
model_response_object is None
|
||||
|
@ -6959,6 +7193,8 @@ class CustomStreamWrapper:
|
|||
self._hidden_params = {
|
||||
"model_id": (_model_info.get("id", None))
|
||||
} # returned as x-litellm-model-id response header in proxy
|
||||
self.response_id = None
|
||||
self.logging_loop = None
|
||||
|
||||
def __iter__(self):
|
||||
return self
|
||||
|
@ -7280,6 +7516,13 @@ class CustomStreamWrapper:
|
|||
if str_line.choices[0].finish_reason:
|
||||
is_finished = True
|
||||
finish_reason = str_line.choices[0].finish_reason
|
||||
if finish_reason == "content_filter":
|
||||
error_message = json.dumps(
|
||||
str_line.choices[0].content_filter_result
|
||||
)
|
||||
raise litellm.AzureOpenAIError(
|
||||
status_code=400, message=error_message
|
||||
)
|
||||
|
||||
# checking for logprobs
|
||||
if (
|
||||
|
@ -7290,16 +7533,6 @@ class CustomStreamWrapper:
|
|||
else:
|
||||
logprobs = None
|
||||
|
||||
if (
|
||||
hasattr(str_line.choices[0], "content_filter_result")
|
||||
and str_line.choices[0].content_filter_result is not None
|
||||
):
|
||||
error_message = json.dumps(
|
||||
str_line.choices[0].content_filter_result
|
||||
)
|
||||
raise litellm.AzureOpenAIError(
|
||||
status_code=400, message=error_message
|
||||
)
|
||||
return {
|
||||
"text": text,
|
||||
"is_finished": is_finished,
|
||||
|
@ -7532,9 +7765,35 @@ class CustomStreamWrapper:
|
|||
}
|
||||
return ""
|
||||
|
||||
def handle_sagemaker_stream(self, chunk):
|
||||
if "data: [DONE]" in chunk:
|
||||
text = ""
|
||||
is_finished = True
|
||||
finish_reason = "stop"
|
||||
return {
|
||||
"text": text,
|
||||
"is_finished": is_finished,
|
||||
"finish_reason": finish_reason,
|
||||
}
|
||||
elif isinstance(chunk, dict):
|
||||
if chunk["is_finished"] == True:
|
||||
finish_reason = "stop"
|
||||
else:
|
||||
finish_reason = ""
|
||||
return {
|
||||
"text": chunk["text"],
|
||||
"is_finished": chunk["is_finished"],
|
||||
"finish_reason": finish_reason,
|
||||
}
|
||||
|
||||
def chunk_creator(self, chunk):
|
||||
model_response = ModelResponse(stream=True, model=self.model)
|
||||
if self.response_id is not None:
|
||||
model_response.id = self.response_id
|
||||
else:
|
||||
self.response_id = model_response.id
|
||||
model_response._hidden_params["custom_llm_provider"] = self.custom_llm_provider
|
||||
model_response._hidden_params["created_at"] = time.time()
|
||||
model_response.choices = [StreamingChoices()]
|
||||
model_response.choices[0].finish_reason = None
|
||||
response_obj = {}
|
||||
|
@ -7616,7 +7875,9 @@ class CustomStreamWrapper:
|
|||
raise Exception("An unknown error occurred with the stream")
|
||||
model_response.choices[0].finish_reason = "stop"
|
||||
self.sent_last_chunk = True
|
||||
elif self.custom_llm_provider and self.custom_llm_provider == "vertex_ai":
|
||||
elif self.custom_llm_provider == "gemini":
|
||||
completion_obj["content"] = chunk.text
|
||||
elif self.custom_llm_provider and (self.custom_llm_provider == "vertex_ai"):
|
||||
try:
|
||||
# print(chunk)
|
||||
if hasattr(chunk, "text"):
|
||||
|
@ -7651,19 +7912,14 @@ class CustomStreamWrapper:
|
|||
]
|
||||
self.sent_last_chunk = True
|
||||
elif self.custom_llm_provider == "sagemaker":
|
||||
print_verbose(f"ENTERS SAGEMAKER STREAMING")
|
||||
if len(self.completion_stream) == 0:
|
||||
if self.sent_last_chunk:
|
||||
raise StopIteration
|
||||
else:
|
||||
model_response.choices[0].finish_reason = "stop"
|
||||
self.sent_last_chunk = True
|
||||
new_chunk = self.completion_stream
|
||||
print_verbose(f"sagemaker chunk: {new_chunk}")
|
||||
completion_obj["content"] = new_chunk
|
||||
self.completion_stream = self.completion_stream[
|
||||
len(self.completion_stream) :
|
||||
]
|
||||
verbose_logger.debug(f"ENTERS SAGEMAKER STREAMING for chunk {chunk}")
|
||||
response_obj = self.handle_sagemaker_stream(chunk)
|
||||
completion_obj["content"] = response_obj["text"]
|
||||
if response_obj["is_finished"]:
|
||||
model_response.choices[0].finish_reason = response_obj[
|
||||
"finish_reason"
|
||||
]
|
||||
self.sent_last_chunk = True
|
||||
elif self.custom_llm_provider == "petals":
|
||||
if len(self.completion_stream) == 0:
|
||||
if self.sent_last_chunk:
|
||||
|
@ -7782,7 +8038,7 @@ class CustomStreamWrapper:
|
|||
completion_obj["role"] = "assistant"
|
||||
self.sent_first_chunk = True
|
||||
model_response.choices[0].delta = Delta(**completion_obj)
|
||||
print_verbose(f"model_response: {model_response}")
|
||||
print_verbose(f"returning model_response: {model_response}")
|
||||
return model_response
|
||||
else:
|
||||
return
|
||||
|
@ -7839,6 +8095,27 @@ class CustomStreamWrapper:
|
|||
original_exception=e,
|
||||
)
|
||||
|
||||
def set_logging_event_loop(self, loop):
|
||||
self.logging_loop = loop
|
||||
|
||||
async def your_async_function(self):
|
||||
# Your asynchronous code here
|
||||
return "Your asynchronous code is running"
|
||||
|
||||
def run_success_logging_in_thread(self, processed_chunk):
|
||||
# Create an event loop for the new thread
|
||||
## ASYNC LOGGING
|
||||
if self.logging_loop is not None:
|
||||
future = asyncio.run_coroutine_threadsafe(
|
||||
self.logging_obj.async_success_handler(processed_chunk),
|
||||
loop=self.logging_loop,
|
||||
)
|
||||
result = future.result()
|
||||
else:
|
||||
asyncio.run(self.logging_obj.async_success_handler(processed_chunk))
|
||||
## SYNC LOGGING
|
||||
self.logging_obj.success_handler(processed_chunk)
|
||||
|
||||
## needs to handle the empty string case (even starting chunk can be an empty string)
|
||||
def __next__(self):
|
||||
try:
|
||||
|
@ -7857,8 +8134,9 @@ class CustomStreamWrapper:
|
|||
continue
|
||||
## LOGGING
|
||||
threading.Thread(
|
||||
target=self.logging_obj.success_handler, args=(response,)
|
||||
target=self.run_success_logging_in_thread, args=(response,)
|
||||
).start() # log response
|
||||
|
||||
# RETURN RESULT
|
||||
return response
|
||||
except StopIteration:
|
||||
|
@ -7914,13 +8192,34 @@ class CustomStreamWrapper:
|
|||
raise StopAsyncIteration
|
||||
else: # temporary patch for non-aiohttp async calls
|
||||
# example - boto3 bedrock llms
|
||||
processed_chunk = next(self)
|
||||
asyncio.create_task(
|
||||
self.logging_obj.async_success_handler(
|
||||
processed_chunk,
|
||||
)
|
||||
)
|
||||
return processed_chunk
|
||||
while True:
|
||||
if isinstance(self.completion_stream, str) or isinstance(
|
||||
self.completion_stream, bytes
|
||||
):
|
||||
chunk = self.completion_stream
|
||||
else:
|
||||
chunk = next(self.completion_stream)
|
||||
if chunk is not None and chunk != b"":
|
||||
print_verbose(f"PROCESSED CHUNK PRE CHUNK CREATOR: {chunk}")
|
||||
processed_chunk = self.chunk_creator(chunk=chunk)
|
||||
print_verbose(
|
||||
f"PROCESSED CHUNK POST CHUNK CREATOR: {processed_chunk}"
|
||||
)
|
||||
if processed_chunk is None:
|
||||
continue
|
||||
## LOGGING
|
||||
threading.Thread(
|
||||
target=self.logging_obj.success_handler,
|
||||
args=(processed_chunk,),
|
||||
).start() # log processed_chunk
|
||||
asyncio.create_task(
|
||||
self.logging_obj.async_success_handler(
|
||||
processed_chunk,
|
||||
)
|
||||
)
|
||||
|
||||
# RETURN RESULT
|
||||
return processed_chunk
|
||||
except StopAsyncIteration:
|
||||
raise
|
||||
except StopIteration:
|
||||
|
|
|
@@ -62,6 +62,15 @@
"litellm_provider": "openai",
"mode": "chat"
},
"gpt-4-0125-preview": {
"max_tokens": 128000,
"max_input_tokens": 128000,
"max_output_tokens": 4096,
"input_cost_per_token": 0.00001,
"output_cost_per_token": 0.00003,
"litellm_provider": "openai",
"mode": "chat"
},
"gpt-4-vision-preview": {
"max_tokens": 128000,
"max_input_tokens": 128000,
@@ -143,6 +152,20 @@
"litellm_provider": "openai",
"mode": "chat"
},
"text-embedding-3-large": {
"max_tokens": 8191,
"input_cost_per_token": 0.00000013,
"output_cost_per_token": 0.000000,
"litellm_provider": "openai",
"mode": "embedding"
},
"text-embedding-3-small": {
"max_tokens": 8191,
"input_cost_per_token": 0.00000002,
"output_cost_per_token": 0.000000,
"litellm_provider": "openai",
"mode": "embedding"
},
"text-embedding-ada-002": {
"max_tokens": 8191,
"input_cost_per_token": 0.0000001,
@@ -906,6 +929,14 @@
"litellm_provider": "bedrock",
"mode": "chat"
},
"amazon.titan-embed-text-v1": {
"max_tokens": 8192,
"output_vector_size": 1536,
"input_cost_per_token": 0.0000001,
"output_cost_per_token": 0.0,
"litellm_provider": "bedrock",
"mode": "embedding"
},
"anthropic.claude-v1": {
"max_tokens": 100000,
"max_output_tokens": 8191,
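A hedged sketch of how the new embedding entries above are consumed at runtime (keys match the JSON added in this hunk):

# Hedged sketch: cost of a 1,000-token text-embedding-3-small request, priced from
# the map entries added above (embeddings have no output-token cost).
import litellm

entry = litellm.model_cost["text-embedding-3-small"]
print(1000 * entry["input_cost_per_token"])  # 0.00002 USD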
438 poetry.lock generated
|
@ -1,4 +1,4 @@
|
|||
# This file is automatically @generated by Poetry 1.6.1 and should not be changed by hand.
|
||||
# This file is automatically @generated by Poetry 1.7.1 and should not be changed by hand.
|
||||
|
||||
[[package]]
|
||||
name = "aiohttp"
|
||||
|
@ -169,6 +169,34 @@ doc = ["Sphinx", "packaging", "sphinx-autodoc-typehints (>=1.2.0)", "sphinx-rtd-
|
|||
test = ["anyio[trio]", "coverage[toml] (>=4.5)", "hypothesis (>=4.0)", "mock (>=4)", "psutil (>=5.9)", "pytest (>=7.0)", "pytest-mock (>=3.6.1)", "trustme", "uvloop (>=0.17)"]
|
||||
trio = ["trio (<0.22)"]
|
||||
|
||||
[[package]]
|
||||
name = "apscheduler"
|
||||
version = "3.10.4"
|
||||
description = "In-process task scheduler with Cron-like capabilities"
|
||||
optional = true
|
||||
python-versions = ">=3.6"
|
||||
files = [
|
||||
{file = "APScheduler-3.10.4-py3-none-any.whl", hash = "sha256:fb91e8a768632a4756a585f79ec834e0e27aad5860bac7eaa523d9ccefd87661"},
|
||||
{file = "APScheduler-3.10.4.tar.gz", hash = "sha256:e6df071b27d9be898e486bc7940a7be50b4af2e9da7c08f0744a96d4bd4cef4a"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
pytz = "*"
|
||||
six = ">=1.4.0"
|
||||
tzlocal = ">=2.0,<3.dev0 || >=4.dev0"
|
||||
|
||||
[package.extras]
|
||||
doc = ["sphinx", "sphinx-rtd-theme"]
|
||||
gevent = ["gevent"]
|
||||
mongodb = ["pymongo (>=3.0)"]
|
||||
redis = ["redis (>=3.0)"]
|
||||
rethinkdb = ["rethinkdb (>=2.4.0)"]
|
||||
sqlalchemy = ["sqlalchemy (>=1.4)"]
|
||||
testing = ["pytest", "pytest-asyncio", "pytest-cov", "pytest-tornado5"]
|
||||
tornado = ["tornado (>=4.3)"]
|
||||
twisted = ["twisted"]
|
||||
zookeeper = ["kazoo"]
|
||||
|
||||
[[package]]
|
||||
name = "async-timeout"
|
||||
version = "4.0.3"
|
||||
|
@ -655,20 +683,20 @@ smmap = ">=3.0.1,<6"
|
|||
|
||||
[[package]]
|
||||
name = "gitpython"
|
||||
version = "3.1.40"
|
||||
version = "3.1.41"
|
||||
description = "GitPython is a Python library used to interact with Git repositories"
|
||||
optional = true
|
||||
python-versions = ">=3.7"
|
||||
files = [
|
||||
{file = "GitPython-3.1.40-py3-none-any.whl", hash = "sha256:cf14627d5a8049ffbf49915732e5eddbe8134c3bdb9d476e6182b676fc573f8a"},
|
||||
{file = "GitPython-3.1.40.tar.gz", hash = "sha256:22b126e9ffb671fdd0c129796343a02bf67bf2994b35449ffc9321aa755e18a4"},
|
||||
{file = "GitPython-3.1.41-py3-none-any.whl", hash = "sha256:c36b6634d069b3f719610175020a9aed919421c87552185b085e04fbbdb10b7c"},
|
||||
{file = "GitPython-3.1.41.tar.gz", hash = "sha256:ed66e624884f76df22c8e16066d567aaa5a37d5b5fa19db2c6df6f7156db9048"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
gitdb = ">=4.0.1,<5"
|
||||
|
||||
[package.extras]
|
||||
test = ["black", "coverage[toml]", "ddt (>=1.1.1,!=1.4.3)", "mock", "mypy", "pre-commit", "pytest", "pytest-cov", "pytest-instafail", "pytest-subtests", "pytest-sugar"]
|
||||
test = ["black", "coverage[toml]", "ddt (>=1.1.1,!=1.4.3)", "mock", "mypy", "pre-commit", "pytest (>=7.3.1)", "pytest-cov", "pytest-instafail", "pytest-mock", "pytest-sugar", "sumtypes"]
|
||||
|
||||
[[package]]
|
||||
name = "gunicorn"
|
||||
|
@ -748,13 +776,13 @@ socks = ["socksio (==1.*)"]
|
|||
|
||||
[[package]]
|
||||
name = "huggingface-hub"
|
||||
version = "0.20.1"
|
||||
version = "0.20.2"
|
||||
description = "Client library to download and publish models, datasets and other repos on the huggingface.co hub"
|
||||
optional = false
|
||||
python-versions = ">=3.8.0"
|
||||
files = [
|
||||
{file = "huggingface_hub-0.20.1-py3-none-any.whl", hash = "sha256:ecfdea395a8bc68cd160106c5bd857f7e010768d95f9e1862a779010cc304831"},
|
||||
{file = "huggingface_hub-0.20.1.tar.gz", hash = "sha256:8c88c4c3c8853e22f2dfb4d84c3d493f4e1af52fb3856a90e1eeddcf191ddbb1"},
|
||||
{file = "huggingface_hub-0.20.2-py3-none-any.whl", hash = "sha256:53752eda2239d30a470c307a61cf9adcf136bc77b0a734338c7d04941af560d8"},
|
||||
{file = "huggingface_hub-0.20.2.tar.gz", hash = "sha256:215c5fceff631030c7a3d19ba7b588921c908b3f21eef31d160ebc245b200ff6"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
|
@ -791,13 +819,13 @@ files = [
|
|||
|
||||
[[package]]
|
||||
name = "importlib-metadata"
|
||||
version = "6.11.0"
|
||||
version = "7.0.1"
|
||||
description = "Read metadata from Python packages"
|
||||
optional = false
|
||||
python-versions = ">=3.8"
|
||||
files = [
|
||||
{file = "importlib_metadata-6.11.0-py3-none-any.whl", hash = "sha256:f0afba6205ad8f8947c7d338b5342d5db2afbfd82f9cbef7879a9539cc12eb9b"},
|
||||
{file = "importlib_metadata-6.11.0.tar.gz", hash = "sha256:1231cf92d825c9e03cfc4da076a16de6422c863558229ea0b22b675657463443"},
|
||||
{file = "importlib_metadata-7.0.1-py3-none-any.whl", hash = "sha256:4805911c3a4ec7c3966410053e9ec6a1fecd629117df5adee56dfc9432a1081e"},
|
||||
{file = "importlib_metadata-7.0.1.tar.gz", hash = "sha256:f238736bb06590ae52ac1fab06a3a9ef1d8dce2b7a35b5ab329371d6c8f5d2cc"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
|
@ -839,13 +867,13 @@ files = [
|
|||
|
||||
[[package]]
|
||||
name = "jinja2"
|
||||
version = "3.1.2"
|
||||
version = "3.1.3"
|
||||
description = "A very fast and expressive template engine."
|
||||
optional = false
|
||||
python-versions = ">=3.7"
|
||||
files = [
|
||||
{file = "Jinja2-3.1.2-py3-none-any.whl", hash = "sha256:6088930bfe239f0e6710546ab9c19c9ef35e29792895fed6e6e31a023a182a61"},
|
||||
{file = "Jinja2-3.1.2.tar.gz", hash = "sha256:31351a702a408a9e7595a8fc6150fc3f43bb6bf7e319770cbc0db9df9437e852"},
|
||||
{file = "Jinja2-3.1.3-py3-none-any.whl", hash = "sha256:7d6d50dd97d52cbc355597bd845fabfbac3f551e1f99619e39a35ce8c370b5fa"},
|
||||
{file = "Jinja2-3.1.3.tar.gz", hash = "sha256:ac8bd6544d4bb2c9792bf3a159e80bba8fda7f07e81bc3aed565432d5925ba90"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
|
@ -856,13 +884,13 @@ i18n = ["Babel (>=2.7)"]
|
|||
|
||||
[[package]]
|
||||
name = "jsonschema"
|
||||
version = "4.20.0"
|
||||
version = "4.21.0"
|
||||
description = "An implementation of JSON Schema validation for Python"
|
||||
optional = true
|
||||
python-versions = ">=3.8"
|
||||
files = [
|
||||
{file = "jsonschema-4.20.0-py3-none-any.whl", hash = "sha256:ed6231f0429ecf966f5bc8dfef245998220549cbbcf140f913b7464c52c3b6b3"},
|
||||
{file = "jsonschema-4.20.0.tar.gz", hash = "sha256:4f614fd46d8d61258610998997743ec5492a648b33cf478c1ddc23ed4598a5fa"},
|
||||
{file = "jsonschema-4.21.0-py3-none-any.whl", hash = "sha256:70a09719d375c0a2874571b363c8a24be7df8071b80c9aa76bc4551e7297c63c"},
|
||||
{file = "jsonschema-4.21.0.tar.gz", hash = "sha256:3ba18e27f7491ea4a1b22edce00fb820eec968d397feb3f9cb61d5894bb38167"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
|
@ -1130,13 +1158,13 @@ files = [
|
|||
|
||||
[[package]]
|
||||
name = "openai"
|
||||
version = "1.6.1"
|
||||
version = "1.10.0"
|
||||
description = "The official Python library for the openai API"
|
||||
optional = false
|
||||
python-versions = ">=3.7.1"
|
||||
files = [
|
||||
{file = "openai-1.6.1-py3-none-any.whl", hash = "sha256:bc9f774838d67ac29fb24cdeb2d58faf57de8b311085dcd1348f7aa02a96c7ee"},
|
||||
{file = "openai-1.6.1.tar.gz", hash = "sha256:d553ca9dbf9486b08e75b09e8671e4f638462aaadccfced632bf490fc3d75fa2"},
|
||||
{file = "openai-1.10.0-py3-none-any.whl", hash = "sha256:aa69e97d0223ace9835fbf9c997abe9ee95318f684fd2de6d02c870700c71ebc"},
|
||||
{file = "openai-1.10.0.tar.gz", hash = "sha256:208886cb501b930dc63f48d51db9c15e5380380f80516d07332adad67c9f1053"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
|
@ -1301,70 +1329,88 @@ files = [
|
|||
|
||||
[[package]]
|
||||
name = "pillow"
|
||||
version = "10.1.0"
|
||||
version = "10.2.0"
|
||||
description = "Python Imaging Library (Fork)"
|
||||
optional = true
|
||||
python-versions = ">=3.8"
|
||||
files = [
|
||||
{file = "Pillow-10.1.0-cp310-cp310-macosx_10_10_x86_64.whl", hash = "sha256:1ab05f3db77e98f93964697c8efc49c7954b08dd61cff526b7f2531a22410106"},
|
||||
{file = "Pillow-10.1.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:6932a7652464746fcb484f7fc3618e6503d2066d853f68a4bd97193a3996e273"},
|
||||
{file = "Pillow-10.1.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a5f63b5a68daedc54c7c3464508d8c12075e56dcfbd42f8c1bf40169061ae666"},
|
||||
{file = "Pillow-10.1.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c0949b55eb607898e28eaccb525ab104b2d86542a85c74baf3a6dc24002edec2"},
|
||||
{file = "Pillow-10.1.0-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:ae88931f93214777c7a3aa0a8f92a683f83ecde27f65a45f95f22d289a69e593"},
|
||||
{file = "Pillow-10.1.0-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:b0eb01ca85b2361b09480784a7931fc648ed8b7836f01fb9241141b968feb1db"},
|
||||
{file = "Pillow-10.1.0-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:d27b5997bdd2eb9fb199982bb7eb6164db0426904020dc38c10203187ae2ff2f"},
|
||||
{file = "Pillow-10.1.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:7df5608bc38bd37ef585ae9c38c9cd46d7c81498f086915b0f97255ea60c2818"},
|
||||
{file = "Pillow-10.1.0-cp310-cp310-win_amd64.whl", hash = "sha256:41f67248d92a5e0a2076d3517d8d4b1e41a97e2df10eb8f93106c89107f38b57"},
|
||||
{file = "Pillow-10.1.0-cp311-cp311-macosx_10_10_x86_64.whl", hash = "sha256:1fb29c07478e6c06a46b867e43b0bcdb241b44cc52be9bc25ce5944eed4648e7"},
|
||||
{file = "Pillow-10.1.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:2cdc65a46e74514ce742c2013cd4a2d12e8553e3a2563c64879f7c7e4d28bce7"},
|
||||
{file = "Pillow-10.1.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:50d08cd0a2ecd2a8657bd3d82c71efd5a58edb04d9308185d66c3a5a5bed9610"},
|
||||
{file = "Pillow-10.1.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:062a1610e3bc258bff2328ec43f34244fcec972ee0717200cb1425214fe5b839"},
|
||||
{file = "Pillow-10.1.0-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:61f1a9d247317fa08a308daaa8ee7b3f760ab1809ca2da14ecc88ae4257d6172"},
|
||||
{file = "Pillow-10.1.0-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:a646e48de237d860c36e0db37ecaecaa3619e6f3e9d5319e527ccbc8151df061"},
|
||||
{file = "Pillow-10.1.0-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:47e5bf85b80abc03be7455c95b6d6e4896a62f6541c1f2ce77a7d2bb832af262"},
|
||||
{file = "Pillow-10.1.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:a92386125e9ee90381c3369f57a2a50fa9e6aa8b1cf1d9c4b200d41a7dd8e992"},
|
||||
{file = "Pillow-10.1.0-cp311-cp311-win_amd64.whl", hash = "sha256:0f7c276c05a9767e877a0b4c5050c8bee6a6d960d7f0c11ebda6b99746068c2a"},
|
||||
{file = "Pillow-10.1.0-cp312-cp312-macosx_10_10_x86_64.whl", hash = "sha256:a89b8312d51715b510a4fe9fc13686283f376cfd5abca8cd1c65e4c76e21081b"},
|
||||
{file = "Pillow-10.1.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:00f438bb841382b15d7deb9a05cc946ee0f2c352653c7aa659e75e592f6fa17d"},
|
||||
{file = "Pillow-10.1.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3d929a19f5469b3f4df33a3df2983db070ebb2088a1e145e18facbc28cae5b27"},
|
||||
{file = "Pillow-10.1.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9a92109192b360634a4489c0c756364c0c3a2992906752165ecb50544c251312"},
|
||||
{file = "Pillow-10.1.0-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:0248f86b3ea061e67817c47ecbe82c23f9dd5d5226200eb9090b3873d3ca32de"},
|
||||
{file = "Pillow-10.1.0-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:9882a7451c680c12f232a422730f986a1fcd808da0fd428f08b671237237d651"},
|
||||
{file = "Pillow-10.1.0-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:1c3ac5423c8c1da5928aa12c6e258921956757d976405e9467c5f39d1d577a4b"},
|
||||
{file = "Pillow-10.1.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:806abdd8249ba3953c33742506fe414880bad78ac25cc9a9b1c6ae97bedd573f"},
|
||||
{file = "Pillow-10.1.0-cp312-cp312-win_amd64.whl", hash = "sha256:eaed6977fa73408b7b8a24e8b14e59e1668cfc0f4c40193ea7ced8e210adf996"},
|
||||
{file = "Pillow-10.1.0-cp38-cp38-macosx_10_10_x86_64.whl", hash = "sha256:fe1e26e1ffc38be097f0ba1d0d07fcade2bcfd1d023cda5b29935ae8052bd793"},
|
||||
{file = "Pillow-10.1.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:7a7e3daa202beb61821c06d2517428e8e7c1aab08943e92ec9e5755c2fc9ba5e"},
|
||||
{file = "Pillow-10.1.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:24fadc71218ad2b8ffe437b54876c9382b4a29e030a05a9879f615091f42ffc2"},
|
||||
{file = "Pillow-10.1.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fa1d323703cfdac2036af05191b969b910d8f115cf53093125e4058f62012c9a"},
|
||||
{file = "Pillow-10.1.0-cp38-cp38-manylinux_2_28_aarch64.whl", hash = "sha256:912e3812a1dbbc834da2b32299b124b5ddcb664ed354916fd1ed6f193f0e2d01"},
|
||||
{file = "Pillow-10.1.0-cp38-cp38-manylinux_2_28_x86_64.whl", hash = "sha256:7dbaa3c7de82ef37e7708521be41db5565004258ca76945ad74a8e998c30af8d"},
|
||||
{file = "Pillow-10.1.0-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:9d7bc666bd8c5a4225e7ac71f2f9d12466ec555e89092728ea0f5c0c2422ea80"},
|
||||
{file = "Pillow-10.1.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:baada14941c83079bf84c037e2d8b7506ce201e92e3d2fa0d1303507a8538212"},
|
||||
{file = "Pillow-10.1.0-cp38-cp38-win_amd64.whl", hash = "sha256:2ef6721c97894a7aa77723740a09547197533146fba8355e86d6d9a4a1056b14"},
|
||||
{file = "Pillow-10.1.0-cp39-cp39-macosx_10_10_x86_64.whl", hash = "sha256:0a026c188be3b443916179f5d04548092e253beb0c3e2ee0a4e2cdad72f66099"},
|
||||
{file = "Pillow-10.1.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:04f6f6149f266a100374ca3cc368b67fb27c4af9f1cc8cb6306d849dcdf12616"},
|
||||
{file = "Pillow-10.1.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:bb40c011447712d2e19cc261c82655f75f32cb724788df315ed992a4d65696bb"},
|
||||
{file = "Pillow-10.1.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1a8413794b4ad9719346cd9306118450b7b00d9a15846451549314a58ac42219"},
|
||||
{file = "Pillow-10.1.0-cp39-cp39-manylinux_2_28_aarch64.whl", hash = "sha256:c9aeea7b63edb7884b031a35305629a7593272b54f429a9869a4f63a1bf04c34"},
|
||||
{file = "Pillow-10.1.0-cp39-cp39-manylinux_2_28_x86_64.whl", hash = "sha256:b4005fee46ed9be0b8fb42be0c20e79411533d1fd58edabebc0dd24626882cfd"},
|
||||
{file = "Pillow-10.1.0-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:4d0152565c6aa6ebbfb1e5d8624140a440f2b99bf7afaafbdbf6430426497f28"},
|
||||
{file = "Pillow-10.1.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:d921bc90b1defa55c9917ca6b6b71430e4286fc9e44c55ead78ca1a9f9eba5f2"},
|
||||
{file = "Pillow-10.1.0-cp39-cp39-win_amd64.whl", hash = "sha256:cfe96560c6ce2f4c07d6647af2d0f3c54cc33289894ebd88cfbb3bcd5391e256"},
|
||||
{file = "Pillow-10.1.0-pp310-pypy310_pp73-macosx_10_10_x86_64.whl", hash = "sha256:937bdc5a7f5343d1c97dc98149a0be7eb9704e937fe3dc7140e229ae4fc572a7"},
|
||||
{file = "Pillow-10.1.0-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b1c25762197144e211efb5f4e8ad656f36c8d214d390585d1d21281f46d556ba"},
|
||||
{file = "Pillow-10.1.0-pp310-pypy310_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:afc8eef765d948543a4775f00b7b8c079b3321d6b675dde0d02afa2ee23000b4"},
|
||||
{file = "Pillow-10.1.0-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:883f216eac8712b83a63f41b76ddfb7b2afab1b74abbb413c5df6680f071a6b9"},
|
||||
{file = "Pillow-10.1.0-pp39-pypy39_pp73-macosx_10_10_x86_64.whl", hash = "sha256:b920e4d028f6442bea9a75b7491c063f0b9a3972520731ed26c83e254302eb1e"},
|
||||
{file = "Pillow-10.1.0-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1c41d960babf951e01a49c9746f92c5a7e0d939d1652d7ba30f6b3090f27e412"},
|
||||
{file = "Pillow-10.1.0-pp39-pypy39_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:1fafabe50a6977ac70dfe829b2d5735fd54e190ab55259ec8aea4aaea412fa0b"},
|
||||
{file = "Pillow-10.1.0-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:3b834f4b16173e5b92ab6566f0473bfb09f939ba14b23b8da1f54fa63e4b623f"},
|
||||
{file = "Pillow-10.1.0.tar.gz", hash = "sha256:e6bf8de6c36ed96c86ea3b6e1d5273c53f46ef518a062464cd7ef5dd2cf92e38"},
|
||||
{file = "pillow-10.2.0-cp310-cp310-macosx_10_10_x86_64.whl", hash = "sha256:7823bdd049099efa16e4246bdf15e5a13dbb18a51b68fa06d6c1d4d8b99a796e"},
|
||||
{file = "pillow-10.2.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:83b2021f2ade7d1ed556bc50a399127d7fb245e725aa0113ebd05cfe88aaf588"},
|
||||
{file = "pillow-10.2.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6fad5ff2f13d69b7e74ce5b4ecd12cc0ec530fcee76356cac6742785ff71c452"},
|
||||
{file = "pillow-10.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:da2b52b37dad6d9ec64e653637a096905b258d2fc2b984c41ae7d08b938a67e4"},
|
||||
{file = "pillow-10.2.0-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:47c0995fc4e7f79b5cfcab1fc437ff2890b770440f7696a3ba065ee0fd496563"},
|
||||
{file = "pillow-10.2.0-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:322bdf3c9b556e9ffb18f93462e5f749d3444ce081290352c6070d014c93feb2"},
|
||||
{file = "pillow-10.2.0-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:51f1a1bffc50e2e9492e87d8e09a17c5eea8409cda8d3f277eb6edc82813c17c"},
|
||||
{file = "pillow-10.2.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:69ffdd6120a4737710a9eee73e1d2e37db89b620f702754b8f6e62594471dee0"},
|
||||
{file = "pillow-10.2.0-cp310-cp310-win32.whl", hash = "sha256:c6dafac9e0f2b3c78df97e79af707cdc5ef8e88208d686a4847bab8266870023"},
|
||||
{file = "pillow-10.2.0-cp310-cp310-win_amd64.whl", hash = "sha256:aebb6044806f2e16ecc07b2a2637ee1ef67a11840a66752751714a0d924adf72"},
|
||||
{file = "pillow-10.2.0-cp310-cp310-win_arm64.whl", hash = "sha256:7049e301399273a0136ff39b84c3678e314f2158f50f517bc50285fb5ec847ad"},
|
||||
{file = "pillow-10.2.0-cp311-cp311-macosx_10_10_x86_64.whl", hash = "sha256:35bb52c37f256f662abdfa49d2dfa6ce5d93281d323a9af377a120e89a9eafb5"},
|
||||
{file = "pillow-10.2.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:9c23f307202661071d94b5e384e1e1dc7dfb972a28a2310e4ee16103e66ddb67"},
|
||||
{file = "pillow-10.2.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:773efe0603db30c281521a7c0214cad7836c03b8ccff897beae9b47c0b657d61"},
|
||||
{file = "pillow-10.2.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:11fa2e5984b949b0dd6d7a94d967743d87c577ff0b83392f17cb3990d0d2fd6e"},
|
||||
{file = "pillow-10.2.0-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:716d30ed977be8b37d3ef185fecb9e5a1d62d110dfbdcd1e2a122ab46fddb03f"},
|
||||
{file = "pillow-10.2.0-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:a086c2af425c5f62a65e12fbf385f7c9fcb8f107d0849dba5839461a129cf311"},
|
||||
{file = "pillow-10.2.0-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:c8de2789052ed501dd829e9cae8d3dcce7acb4777ea4a479c14521c942d395b1"},
|
||||
{file = "pillow-10.2.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:609448742444d9290fd687940ac0b57fb35e6fd92bdb65386e08e99af60bf757"},
|
||||
{file = "pillow-10.2.0-cp311-cp311-win32.whl", hash = "sha256:823ef7a27cf86df6597fa0671066c1b596f69eba53efa3d1e1cb8b30f3533068"},
|
||||
{file = "pillow-10.2.0-cp311-cp311-win_amd64.whl", hash = "sha256:1da3b2703afd040cf65ec97efea81cfba59cdbed9c11d8efc5ab09df9509fc56"},
|
||||
{file = "pillow-10.2.0-cp311-cp311-win_arm64.whl", hash = "sha256:edca80cbfb2b68d7b56930b84a0e45ae1694aeba0541f798e908a49d66b837f1"},
|
||||
{file = "pillow-10.2.0-cp312-cp312-macosx_10_10_x86_64.whl", hash = "sha256:1b5e1b74d1bd1b78bc3477528919414874748dd363e6272efd5abf7654e68bef"},
|
||||
{file = "pillow-10.2.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:0eae2073305f451d8ecacb5474997c08569fb4eb4ac231ffa4ad7d342fdc25ac"},
|
||||
{file = "pillow-10.2.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b7c2286c23cd350b80d2fc9d424fc797575fb16f854b831d16fd47ceec078f2c"},
|
||||
{file = "pillow-10.2.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1e23412b5c41e58cec602f1135c57dfcf15482013ce6e5f093a86db69646a5aa"},
|
||||
{file = "pillow-10.2.0-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:52a50aa3fb3acb9cf7213573ef55d31d6eca37f5709c69e6858fe3bc04a5c2a2"},
|
||||
{file = "pillow-10.2.0-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:127cee571038f252a552760076407f9cff79761c3d436a12af6000cd182a9d04"},
|
||||
{file = "pillow-10.2.0-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:8d12251f02d69d8310b046e82572ed486685c38f02176bd08baf216746eb947f"},
|
||||
{file = "pillow-10.2.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:54f1852cd531aa981bc0965b7d609f5f6cc8ce8c41b1139f6ed6b3c54ab82bfb"},
|
||||
{file = "pillow-10.2.0-cp312-cp312-win32.whl", hash = "sha256:257d8788df5ca62c980314053197f4d46eefedf4e6175bc9412f14412ec4ea2f"},
|
||||
{file = "pillow-10.2.0-cp312-cp312-win_amd64.whl", hash = "sha256:154e939c5f0053a383de4fd3d3da48d9427a7e985f58af8e94d0b3c9fcfcf4f9"},
|
||||
{file = "pillow-10.2.0-cp312-cp312-win_arm64.whl", hash = "sha256:f379abd2f1e3dddb2b61bc67977a6b5a0a3f7485538bcc6f39ec76163891ee48"},
|
||||
{file = "pillow-10.2.0-cp38-cp38-macosx_10_10_x86_64.whl", hash = "sha256:8373c6c251f7ef8bda6675dd6d2b3a0fcc31edf1201266b5cf608b62a37407f9"},
|
||||
{file = "pillow-10.2.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:870ea1ada0899fd0b79643990809323b389d4d1d46c192f97342eeb6ee0b8483"},
|
||||
{file = "pillow-10.2.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b4b6b1e20608493548b1f32bce8cca185bf0480983890403d3b8753e44077129"},
|
||||
{file = "pillow-10.2.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3031709084b6e7852d00479fd1d310b07d0ba82765f973b543c8af5061cf990e"},
|
||||
{file = "pillow-10.2.0-cp38-cp38-manylinux_2_28_aarch64.whl", hash = "sha256:3ff074fc97dd4e80543a3e91f69d58889baf2002b6be64347ea8cf5533188213"},
|
||||
{file = "pillow-10.2.0-cp38-cp38-manylinux_2_28_x86_64.whl", hash = "sha256:cb4c38abeef13c61d6916f264d4845fab99d7b711be96c326b84df9e3e0ff62d"},
|
||||
{file = "pillow-10.2.0-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:b1b3020d90c2d8e1dae29cf3ce54f8094f7938460fb5ce8bc5c01450b01fbaf6"},
|
||||
{file = "pillow-10.2.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:170aeb00224ab3dc54230c797f8404507240dd868cf52066f66a41b33169bdbe"},
|
||||
{file = "pillow-10.2.0-cp38-cp38-win32.whl", hash = "sha256:c4225f5220f46b2fde568c74fca27ae9771536c2e29d7c04f4fb62c83275ac4e"},
|
||||
{file = "pillow-10.2.0-cp38-cp38-win_amd64.whl", hash = "sha256:0689b5a8c5288bc0504d9fcee48f61a6a586b9b98514d7d29b840143d6734f39"},
|
||||
{file = "pillow-10.2.0-cp39-cp39-macosx_10_10_x86_64.whl", hash = "sha256:b792a349405fbc0163190fde0dc7b3fef3c9268292586cf5645598b48e63dc67"},
|
||||
{file = "pillow-10.2.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:c570f24be1e468e3f0ce7ef56a89a60f0e05b30a3669a459e419c6eac2c35364"},
|
||||
{file = "pillow-10.2.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d8ecd059fdaf60c1963c58ceb8997b32e9dc1b911f5da5307aab614f1ce5c2fb"},
|
||||
{file = "pillow-10.2.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c365fd1703040de1ec284b176d6af5abe21b427cb3a5ff68e0759e1e313a5e7e"},
|
||||
{file = "pillow-10.2.0-cp39-cp39-manylinux_2_28_aarch64.whl", hash = "sha256:70c61d4c475835a19b3a5aa42492409878bbca7438554a1f89d20d58a7c75c01"},
|
||||
{file = "pillow-10.2.0-cp39-cp39-manylinux_2_28_x86_64.whl", hash = "sha256:b6f491cdf80ae540738859d9766783e3b3c8e5bd37f5dfa0b76abdecc5081f13"},
|
||||
{file = "pillow-10.2.0-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:9d189550615b4948f45252d7f005e53c2040cea1af5b60d6f79491a6e147eef7"},
|
||||
{file = "pillow-10.2.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:49d9ba1ed0ef3e061088cd1e7538a0759aab559e2e0a80a36f9fd9d8c0c21591"},
|
||||
{file = "pillow-10.2.0-cp39-cp39-win32.whl", hash = "sha256:babf5acfede515f176833ed6028754cbcd0d206f7f614ea3447d67c33be12516"},
|
||||
{file = "pillow-10.2.0-cp39-cp39-win_amd64.whl", hash = "sha256:0304004f8067386b477d20a518b50f3fa658a28d44e4116970abfcd94fac34a8"},
|
||||
{file = "pillow-10.2.0-cp39-cp39-win_arm64.whl", hash = "sha256:0fb3e7fc88a14eacd303e90481ad983fd5b69c761e9e6ef94c983f91025da869"},
|
||||
{file = "pillow-10.2.0-pp310-pypy310_pp73-macosx_10_10_x86_64.whl", hash = "sha256:322209c642aabdd6207517e9739c704dc9f9db943015535783239022002f054a"},
|
||||
{file = "pillow-10.2.0-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3eedd52442c0a5ff4f887fab0c1c0bb164d8635b32c894bc1faf4c618dd89df2"},
|
||||
{file = "pillow-10.2.0-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:cb28c753fd5eb3dd859b4ee95de66cc62af91bcff5db5f2571d32a520baf1f04"},
|
||||
{file = "pillow-10.2.0-pp310-pypy310_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:33870dc4653c5017bf4c8873e5488d8f8d5f8935e2f1fb9a2208c47cdd66efd2"},
|
||||
{file = "pillow-10.2.0-pp310-pypy310_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:3c31822339516fb3c82d03f30e22b1d038da87ef27b6a78c9549888f8ceda39a"},
|
||||
{file = "pillow-10.2.0-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:a2b56ba36e05f973d450582fb015594aaa78834fefe8dfb8fcd79b93e64ba4c6"},
|
||||
{file = "pillow-10.2.0-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:d8e6aeb9201e655354b3ad049cb77d19813ad4ece0df1249d3c793de3774f8c7"},
|
||||
{file = "pillow-10.2.0-pp39-pypy39_pp73-macosx_10_10_x86_64.whl", hash = "sha256:2247178effb34a77c11c0e8ac355c7a741ceca0a732b27bf11e747bbc950722f"},
|
||||
{file = "pillow-10.2.0-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:15587643b9e5eb26c48e49a7b33659790d28f190fc514a322d55da2fb5c2950e"},
|
||||
{file = "pillow-10.2.0-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:753cd8f2086b2b80180d9b3010dd4ed147efc167c90d3bf593fe2af21265e5a5"},
|
||||
{file = "pillow-10.2.0-pp39-pypy39_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:7c8f97e8e7a9009bcacbe3766a36175056c12f9a44e6e6f2d5caad06dcfbf03b"},
|
||||
{file = "pillow-10.2.0-pp39-pypy39_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:d1b35bcd6c5543b9cb547dee3150c93008f8dd0f1fef78fc0cd2b141c5baf58a"},
|
||||
{file = "pillow-10.2.0-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:fe4c15f6c9285dc54ce6553a3ce908ed37c8f3825b5a51a15c91442bb955b868"},
|
||||
{file = "pillow-10.2.0.tar.gz", hash = "sha256:e87f0b2c78157e12d7686b27d63c070fd65d994e8ddae6f328e0dcf4a0cd007e"},
|
||||
]
|
||||
|
||||
[package.extras]
|
||||
docs = ["furo", "olefile", "sphinx (>=2.4)", "sphinx-copybutton", "sphinx-inline-tabs", "sphinx-removed-in", "sphinxext-opengraph"]
|
||||
fpx = ["olefile"]
|
||||
mic = ["olefile"]
|
||||
tests = ["check-manifest", "coverage", "defusedxml", "markdown2", "olefile", "packaging", "pyroma", "pytest", "pytest-cov", "pytest-timeout"]
|
||||
typing = ["typing-extensions"]
|
||||
xmp = ["defusedxml"]
|
||||
|
||||
[[package]]
|
||||
name = "pkgutil-resolve-name"
|
||||
|
@ -1409,22 +1455,22 @@ testing = ["pytest", "pytest-benchmark"]
[[package]]
name = "protobuf"
version = "4.25.1"
version = "4.25.2"
description = ""
optional = true
python-versions = ">=3.8"
files = [
{file = "protobuf-4.25.1-cp310-abi3-win32.whl", hash = "sha256:193f50a6ab78a970c9b4f148e7c750cfde64f59815e86f686c22e26b4fe01ce7"},
{file = "protobuf-4.25.1-cp310-abi3-win_amd64.whl", hash = "sha256:3497c1af9f2526962f09329fd61a36566305e6c72da2590ae0d7d1322818843b"},
{file = "protobuf-4.25.1-cp37-abi3-macosx_10_9_universal2.whl", hash = "sha256:0bf384e75b92c42830c0a679b0cd4d6e2b36ae0cf3dbb1e1dfdda48a244f4bcd"},
{file = "protobuf-4.25.1-cp37-abi3-manylinux2014_aarch64.whl", hash = "sha256:0f881b589ff449bf0b931a711926e9ddaad3b35089cc039ce1af50b21a4ae8cb"},
{file = "protobuf-4.25.1-cp37-abi3-manylinux2014_x86_64.whl", hash = "sha256:ca37bf6a6d0046272c152eea90d2e4ef34593aaa32e8873fc14c16440f22d4b7"},
{file = "protobuf-4.25.1-cp38-cp38-win32.whl", hash = "sha256:abc0525ae2689a8000837729eef7883b9391cd6aa7950249dcf5a4ede230d5dd"},
{file = "protobuf-4.25.1-cp38-cp38-win_amd64.whl", hash = "sha256:1484f9e692091450e7edf418c939e15bfc8fc68856e36ce399aed6889dae8bb0"},
{file = "protobuf-4.25.1-cp39-cp39-win32.whl", hash = "sha256:8bdbeaddaac52d15c6dce38c71b03038ef7772b977847eb6d374fc86636fa510"},
{file = "protobuf-4.25.1-cp39-cp39-win_amd64.whl", hash = "sha256:becc576b7e6b553d22cbdf418686ee4daa443d7217999125c045ad56322dda10"},
{file = "protobuf-4.25.1-py3-none-any.whl", hash = "sha256:a19731d5e83ae4737bb2a089605e636077ac001d18781b3cf489b9546c7c80d6"},
{file = "protobuf-4.25.1.tar.gz", hash = "sha256:57d65074b4f5baa4ab5da1605c02be90ac20c8b40fb137d6a8df9f416b0d0ce2"},
{file = "protobuf-4.25.2-cp310-abi3-win32.whl", hash = "sha256:b50c949608682b12efb0b2717f53256f03636af5f60ac0c1d900df6213910fd6"},
{file = "protobuf-4.25.2-cp310-abi3-win_amd64.whl", hash = "sha256:8f62574857ee1de9f770baf04dde4165e30b15ad97ba03ceac65f760ff018ac9"},
{file = "protobuf-4.25.2-cp37-abi3-macosx_10_9_universal2.whl", hash = "sha256:2db9f8fa64fbdcdc93767d3cf81e0f2aef176284071507e3ede160811502fd3d"},
{file = "protobuf-4.25.2-cp37-abi3-manylinux2014_aarch64.whl", hash = "sha256:10894a2885b7175d3984f2be8d9850712c57d5e7587a2410720af8be56cdaf62"},
{file = "protobuf-4.25.2-cp37-abi3-manylinux2014_x86_64.whl", hash = "sha256:fc381d1dd0516343f1440019cedf08a7405f791cd49eef4ae1ea06520bc1c020"},
{file = "protobuf-4.25.2-cp38-cp38-win32.whl", hash = "sha256:33a1aeef4b1927431d1be780e87b641e322b88d654203a9e9d93f218ee359e61"},
{file = "protobuf-4.25.2-cp38-cp38-win_amd64.whl", hash = "sha256:47f3de503fe7c1245f6f03bea7e8d3ec11c6c4a2ea9ef910e3221c8a15516d62"},
{file = "protobuf-4.25.2-cp39-cp39-win32.whl", hash = "sha256:5e5c933b4c30a988b52e0b7c02641760a5ba046edc5e43d3b94a74c9fc57c1b3"},
{file = "protobuf-4.25.2-cp39-cp39-win_amd64.whl", hash = "sha256:d66a769b8d687df9024f2985d5137a337f957a0916cf5464d1513eee96a63ff0"},
{file = "protobuf-4.25.2-py3-none-any.whl", hash = "sha256:a8b7a98d4ce823303145bf3c1a8bdb0f2f4642a414b196f04ad9853ed0c8f830"},
{file = "protobuf-4.25.2.tar.gz", hash = "sha256:fe599e175cb347efc8ee524bcd4b902d11f7262c0e569ececcb89995c15f0a5e"},
]

[[package]]
@ -1807,13 +1853,13 @@ ocsp = ["cryptography (>=36.0.1)", "pyopenssl (==20.0.1)", "requests (>=2.26.0)"
[[package]]
name = "referencing"
version = "0.32.0"
version = "0.32.1"
description = "JSON Referencing + Python"
optional = true
python-versions = ">=3.8"
files = [
{file = "referencing-0.32.0-py3-none-any.whl", hash = "sha256:bdcd3efb936f82ff86f993093f6da7435c7de69a3b3a5a06678a6050184bee99"},
{file = "referencing-0.32.0.tar.gz", hash = "sha256:689e64fe121843dcfd57b71933318ef1f91188ffb45367332700a86ac8fd6161"},
{file = "referencing-0.32.1-py3-none-any.whl", hash = "sha256:7e4dc12271d8e15612bfe35792f5ea1c40970dadf8624602e33db2758f7ee554"},
{file = "referencing-0.32.1.tar.gz", hash = "sha256:3c57da0513e9563eb7e203ebe9bb3a1b509b042016433bd1e45a2853466c3dd3"},
]

[package.dependencies]
@ -1964,110 +2010,110 @@ jupyter = ["ipywidgets (>=7.5.1,<9)"]
[[package]]
name = "rpds-py"
version = "0.16.2"
version = "0.17.1"
description = "Python bindings to Rust's persistent data structures (rpds)"
optional = true
python-versions = ">=3.8"
files = [
{file = "rpds_py-0.16.2-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:509b617ac787cd1149600e731db9274ebbef094503ca25158e6f23edaba1ca8f"},
{file = "rpds_py-0.16.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:413b9c17388bbd0d87a329d8e30c1a4c6e44e2bb25457f43725a8e6fe4161e9e"},
{file = "rpds_py-0.16.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2946b120718eba9af2b4dd103affc1164a87b9e9ebff8c3e4c05d7b7a7e274e2"},
{file = "rpds_py-0.16.2-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:35ae5ece284cf36464eb160880018cf6088a9ac5ddc72292a6092b6ef3f4da53"},
{file = "rpds_py-0.16.2-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:3dc6a7620ba7639a3db6213da61312cb4aa9ac0ca6e00dc1cbbdc21c2aa6eb57"},
{file = "rpds_py-0.16.2-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:8cb6fe8ecdfffa0e711a75c931fb39f4ba382b4b3ccedeca43f18693864fe850"},
{file = "rpds_py-0.16.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6dace7b26a13353e24613417ce2239491b40a6ad44e5776a18eaff7733488b44"},
{file = "rpds_py-0.16.2-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:1bdbc5fcb04a7309074de6b67fa9bc4b418ab3fc435fec1f2779a0eced688d04"},
{file = "rpds_py-0.16.2-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:f42e25c016927e2a6b1ce748112c3ab134261fc2ddc867e92d02006103e1b1b7"},
{file = "rpds_py-0.16.2-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:eab36eae3f3e8e24b05748ec9acc66286662f5d25c52ad70cadab544e034536b"},
{file = "rpds_py-0.16.2-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:0474df4ade9a3b4af96c3d36eb81856cb9462e4c6657d4caecfd840d2a13f3c9"},
{file = "rpds_py-0.16.2-cp310-none-win32.whl", hash = "sha256:84c5a4d1f9dd7e2d2c44097fb09fffe728629bad31eb56caf97719e55575aa82"},
{file = "rpds_py-0.16.2-cp310-none-win_amd64.whl", hash = "sha256:2bd82db36cd70b3628c0c57d81d2438e8dd4b7b32a6a9f25f24ab0e657cb6c4e"},
{file = "rpds_py-0.16.2-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:adc0c3d6fc6ae35fee3e4917628983f6ce630d513cbaad575b4517d47e81b4bb"},
{file = "rpds_py-0.16.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:ec23fcad480e77ede06cf4127a25fc440f7489922e17fc058f426b5256ee0edb"},
{file = "rpds_py-0.16.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:07aab64e2808c3ebac2a44f67e9dc0543812b715126dfd6fe4264df527556cb6"},
{file = "rpds_py-0.16.2-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:a4ebb8b20bd09c5ce7884c8f0388801100f5e75e7f733b1b6613c713371feefc"},
{file = "rpds_py-0.16.2-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:a3d7e2ea25d3517c6d7e5a1cc3702cffa6bd18d9ef8d08d9af6717fc1c700eed"},
{file = "rpds_py-0.16.2-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:f28ac0e8e7242d140f99402a903a2c596ab71550272ae9247ad78f9a932b5698"},
{file = "rpds_py-0.16.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:19f00f57fdd38db4bb5ad09f9ead1b535332dbf624200e9029a45f1f35527ebb"},
{file = "rpds_py-0.16.2-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:3da5a4c56953bdbf6d04447c3410309616c54433146ccdb4a277b9cb499bc10e"},
{file = "rpds_py-0.16.2-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:ec2e1cf025b2c0f48ec17ff3e642661da7ee332d326f2e6619366ce8e221f018"},
{file = "rpds_py-0.16.2-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:e0441fb4fdd39a230477b2ca9be90868af64425bfe7b122b57e61e45737a653b"},
{file = "rpds_py-0.16.2-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:9f0350ef2fba5f34eb0c9000ea328e51b9572b403d2f7f3b19f24085f6f598e8"},
{file = "rpds_py-0.16.2-cp311-none-win32.whl", hash = "sha256:5a80e2f83391ad0808b4646732af2a7b67550b98f0cae056cb3b40622a83dbb3"},
{file = "rpds_py-0.16.2-cp311-none-win_amd64.whl", hash = "sha256:e04e56b4ca7a770593633556e8e9e46579d66ec2ada846b401252a2bdcf70a6d"},
{file = "rpds_py-0.16.2-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:5e6caa3809e50690bd92fa490f5c38caa86082c8c3315aa438bce43786d5e90d"},
{file = "rpds_py-0.16.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:2e53b9b25cac9065328901713a7e9e3b12e4f57ef4280b370fbbf6fef2052eef"},
{file = "rpds_py-0.16.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:af27423662f32d7501a00c5e7342f7dbd1e4a718aea7a239781357d15d437133"},
{file = "rpds_py-0.16.2-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:43d4dd5fb16eb3825742bad8339d454054261ab59fed2fbac84e1d84d5aae7ba"},
{file = "rpds_py-0.16.2-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:e061de3b745fe611e23cd7318aec2c8b0e4153939c25c9202a5811ca911fd733"},
{file = "rpds_py-0.16.2-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:3b811d182ad17ea294f2ec63c0621e7be92a1141e1012383461872cead87468f"},
{file = "rpds_py-0.16.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5552f328eaef1a75ff129d4d0c437bf44e43f9436d3996e8eab623ea0f5fcf73"},
{file = "rpds_py-0.16.2-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:dcbe1f8dd179e4d69b70b1f1d9bb6fd1e7e1bdc9c9aad345cdeb332e29d40748"},
{file = "rpds_py-0.16.2-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:8aad80645a011abae487d356e0ceb359f4938dfb6f7bcc410027ed7ae4f7bb8b"},
{file = "rpds_py-0.16.2-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:b6f5549d6ed1da9bfe3631ca9483ae906f21410be2445b73443fa9f017601c6f"},
{file = "rpds_py-0.16.2-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:d452817e0d9c749c431a1121d56a777bd7099b720b3d1c820f1725cb40928f58"},
{file = "rpds_py-0.16.2-cp312-none-win32.whl", hash = "sha256:888a97002e986eca10d8546e3c8b97da1d47ad8b69726dcfeb3e56348ebb28a3"},
{file = "rpds_py-0.16.2-cp312-none-win_amd64.whl", hash = "sha256:d8dda2a806dfa4a9b795950c4f5cc56d6d6159f7d68080aedaff3bdc9b5032f5"},
{file = "rpds_py-0.16.2-cp38-cp38-macosx_10_12_x86_64.whl", hash = "sha256:071980663c273bf3d388fe5c794c547e6f35ba3335477072c713a3176bf14a60"},
|
||||
{file = "rpds_py-0.16.2-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:726ac36e8a3bb8daef2fd482534cabc5e17334052447008405daca7ca04a3108"},
|
||||
{file = "rpds_py-0.16.2-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e9e557db6a177470316c82f023e5d571811c9a4422b5ea084c85da9aa3c035fc"},
|
||||
{file = "rpds_py-0.16.2-cp38-cp38-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:90123853fc8b1747f80b0d354be3d122b4365a93e50fc3aacc9fb4c2488845d6"},
|
||||
{file = "rpds_py-0.16.2-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:a61f659665a39a4d17d699ab3593d7116d66e1e2e3f03ef3fb8f484e91908808"},
|
||||
{file = "rpds_py-0.16.2-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:cc97f0640e91d7776530f06e6836c546c1c752a52de158720c4224c9e8053cad"},
|
||||
{file = "rpds_py-0.16.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:44a54e99a2b9693a37ebf245937fd6e9228b4cbd64b9cc961e1f3391ec6c7391"},
|
||||
{file = "rpds_py-0.16.2-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:bd4b677d929cf1f6bac07ad76e0f2d5de367e6373351c01a9c0a39f6b21b4a8b"},
|
||||
{file = "rpds_py-0.16.2-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:5ef00873303d678aaf8b0627e111fd434925ca01c657dbb2641410f1cdaef261"},
|
||||
{file = "rpds_py-0.16.2-cp38-cp38-musllinux_1_2_i686.whl", hash = "sha256:349cb40897fd529ca15317c22c0eab67f5ac5178b5bd2c6adc86172045210acc"},
|
||||
{file = "rpds_py-0.16.2-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:2ddef620e70eaffebed5932ce754d539c0930f676aae6212f8e16cd9743dd365"},
|
||||
{file = "rpds_py-0.16.2-cp38-none-win32.whl", hash = "sha256:882ce6e25e585949c3d9f9abd29202367175e0aab3aba0c58c9abbb37d4982ff"},
|
||||
{file = "rpds_py-0.16.2-cp38-none-win_amd64.whl", hash = "sha256:f4bd4578e44f26997e9e56c96dedc5f1af43cc9d16c4daa29c771a00b2a26851"},
|
||||
{file = "rpds_py-0.16.2-cp39-cp39-macosx_10_12_x86_64.whl", hash = "sha256:69ac7ea9897ec201ce68b48582f3eb34a3f9924488a5432a93f177bf76a82a7e"},
|
||||
{file = "rpds_py-0.16.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:a9880b4656efe36ccad41edc66789e191e5ee19a1ea8811e0aed6f69851a82f4"},
|
||||
{file = "rpds_py-0.16.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ee94cb58c0ba2c62ee108c2b7c9131b2c66a29e82746e8fa3aa1a1effbd3dcf1"},
|
||||
{file = "rpds_py-0.16.2-cp39-cp39-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:24f7a2eb3866a9e91f4599851e0c8d39878a470044875c49bd528d2b9b88361c"},
|
||||
{file = "rpds_py-0.16.2-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:ca57468da2d9a660bcf8961637c85f2fbb2aa64d9bc3f9484e30c3f9f67b1dd7"},
|
||||
{file = "rpds_py-0.16.2-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:ccd4e400309e1f34a5095bf9249d371f0fd60f8a3a5c4a791cad7b99ce1fd38d"},
|
||||
{file = "rpds_py-0.16.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:80443fe2f7b3ea3934c5d75fb0e04a5dbb4a8e943e5ff2de0dec059202b70a8b"},
|
||||
{file = "rpds_py-0.16.2-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:4d6a9f052e72d493efd92a77f861e45bab2f6be63e37fa8ecf0c6fd1a58fedb0"},
|
||||
{file = "rpds_py-0.16.2-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:35953f4f2b3216421af86fd236b7c0c65935936a94ea83ddbd4904ba60757773"},
|
||||
{file = "rpds_py-0.16.2-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:981d135c7cdaf6cd8eadae1c950de43b976de8f09d8e800feed307140d3d6d00"},
|
||||
{file = "rpds_py-0.16.2-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:d0dd7ed2f16df2e129496e7fbe59a34bc2d7fc8db443a606644d069eb69cbd45"},
|
||||
{file = "rpds_py-0.16.2-cp39-none-win32.whl", hash = "sha256:703d95c75a72e902544fda08e965885525e297578317989fd15a6ce58414b41d"},
|
||||
{file = "rpds_py-0.16.2-cp39-none-win_amd64.whl", hash = "sha256:e93ec1b300acf89730cf27975ef574396bc04edecc358e9bd116fb387a123239"},
|
||||
{file = "rpds_py-0.16.2-pp310-pypy310_pp73-macosx_10_12_x86_64.whl", hash = "sha256:44627b6ca7308680a70766454db5249105fa6344853af6762eaad4158a2feebe"},
|
||||
{file = "rpds_py-0.16.2-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:3f91df8e6dbb7360e176d1affd5fb0246d2b88d16aa5ebc7db94fd66b68b61da"},
|
||||
{file = "rpds_py-0.16.2-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6d904c5693e08bad240f16d79305edba78276be87061c872a4a15e2c301fa2c0"},
|
||||
{file = "rpds_py-0.16.2-pp310-pypy310_pp73-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:290a81cfbe4673285cdf140ec5cd1658ffbf63ab359f2b352ebe172e7cfa5bf0"},
|
||||
{file = "rpds_py-0.16.2-pp310-pypy310_pp73-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:b634c5ec0103c5cbebc24ebac4872b045cccb9456fc59efdcf6fe39775365bd2"},
|
||||
{file = "rpds_py-0.16.2-pp310-pypy310_pp73-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a297a4d08cc67c7466c873c78039d87840fb50d05473db0ec1b7b03d179bf322"},
|
||||
{file = "rpds_py-0.16.2-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b2e75e17bd0bb66ee34a707da677e47c14ee51ccef78ed6a263a4cc965a072a1"},
|
||||
{file = "rpds_py-0.16.2-pp310-pypy310_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:f1b9d9260e06ea017feb7172976ab261e011c1dc2f8883c7c274f6b2aabfe01a"},
|
||||
{file = "rpds_py-0.16.2-pp310-pypy310_pp73-musllinux_1_2_aarch64.whl", hash = "sha256:162d7cd9cd311c1b0ff1c55a024b8f38bd8aad1876b648821da08adc40e95734"},
|
||||
{file = "rpds_py-0.16.2-pp310-pypy310_pp73-musllinux_1_2_i686.whl", hash = "sha256:9b32f742ce5b57201305f19c2ef7a184b52f6f9ba6871cc042c2a61f0d6b49b8"},
|
||||
{file = "rpds_py-0.16.2-pp310-pypy310_pp73-musllinux_1_2_x86_64.whl", hash = "sha256:ac08472f41ea77cd6a5dae36ae7d4ed3951d6602833af87532b556c1b4601d63"},
|
||||
{file = "rpds_py-0.16.2-pp38-pypy38_pp73-macosx_10_12_x86_64.whl", hash = "sha256:495a14b72bbe217f2695dcd9b5ab14d4f8066a00f5d209ed94f0aca307f85f6e"},
|
||||
{file = "rpds_py-0.16.2-pp38-pypy38_pp73-macosx_11_0_arm64.whl", hash = "sha256:8d6b6937ae9eac6d6c0ca3c42774d89fa311f55adff3970fb364b34abde6ed3d"},
|
||||
{file = "rpds_py-0.16.2-pp38-pypy38_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6a61226465bda9283686db8f17d02569a98e4b13c637be5a26d44aa1f1e361c2"},
|
||||
{file = "rpds_py-0.16.2-pp38-pypy38_pp73-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:5cf6af100ffb5c195beec11ffaa8cf8523057f123afa2944e6571d54da84cdc9"},
|
||||
{file = "rpds_py-0.16.2-pp38-pypy38_pp73-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:6df15846ee3fb2e6397fe25d7ca6624af9f89587f3f259d177b556fed6bebe2c"},
|
||||
{file = "rpds_py-0.16.2-pp38-pypy38_pp73-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:1be2f033df1b8be8c3167ba3c29d5dca425592ee31e35eac52050623afba5772"},
|
||||
{file = "rpds_py-0.16.2-pp38-pypy38_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:96f957d6ab25a78b9e7fc9749d754b98eac825a112b4e666525ce89afcbd9ed5"},
|
||||
{file = "rpds_py-0.16.2-pp38-pypy38_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:088396c7c70e59872f67462fcac3ecbded5233385797021976a09ebd55961dfe"},
|
||||
{file = "rpds_py-0.16.2-pp38-pypy38_pp73-musllinux_1_2_aarch64.whl", hash = "sha256:4c46ad6356e1561f2a54f08367d1d2e70a0a1bb2db2282d2c1972c1d38eafc3b"},
|
||||
{file = "rpds_py-0.16.2-pp38-pypy38_pp73-musllinux_1_2_i686.whl", hash = "sha256:47713dc4fce213f5c74ca8a1f6a59b622fc1b90868deb8e8e4d993e421b4b39d"},
|
||||
{file = "rpds_py-0.16.2-pp38-pypy38_pp73-musllinux_1_2_x86_64.whl", hash = "sha256:f811771019f063bbd0aa7bb72c8a934bc13ebacb4672d712fc1639cfd314cccc"},
|
||||
{file = "rpds_py-0.16.2-pp39-pypy39_pp73-macosx_10_12_x86_64.whl", hash = "sha256:f19afcfc0dd0dca35694df441e9b0f95bc231b512f51bded3c3d8ca32153ec19"},
|
||||
{file = "rpds_py-0.16.2-pp39-pypy39_pp73-macosx_11_0_arm64.whl", hash = "sha256:a4b682c5775d6a3d21e314c10124599976809455ee67020e8e72df1769b87bc3"},
|
||||
{file = "rpds_py-0.16.2-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c647ca87fc0ebe808a41de912e9a1bfef9acb85257e5d63691364ac16b81c1f0"},
|
||||
{file = "rpds_py-0.16.2-pp39-pypy39_pp73-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:302bd4983bbd47063e452c38be66153760112f6d3635c7eeefc094299fa400a9"},
|
||||
{file = "rpds_py-0.16.2-pp39-pypy39_pp73-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:bf721ede3eb7b829e4a9b8142bd55db0bdc82902720548a703f7e601ee13bdc3"},
|
||||
{file = "rpds_py-0.16.2-pp39-pypy39_pp73-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:358dafc89ce3894c7f486c615ba914609f38277ef67f566abc4c854d23b997fa"},
|
||||
{file = "rpds_py-0.16.2-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:cad0f59ee3dc35526039f4bc23642d52d5f6616b5f687d846bfc6d0d6d486db0"},
|
||||
{file = "rpds_py-0.16.2-pp39-pypy39_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:cffa76b385dfe1e38527662a302b19ffb0e7f5cf7dd5e89186d2c94a22dd9d0c"},
|
||||
{file = "rpds_py-0.16.2-pp39-pypy39_pp73-musllinux_1_2_aarch64.whl", hash = "sha256:83640a5d7cd3bff694747d50436b8b541b5b9b9782b0c8c1688931d6ee1a1f2d"},
|
||||
{file = "rpds_py-0.16.2-pp39-pypy39_pp73-musllinux_1_2_i686.whl", hash = "sha256:ed99b4f7179d2111702020fd7d156e88acd533f5a7d3971353e568b6051d5c97"},
|
||||
{file = "rpds_py-0.16.2-pp39-pypy39_pp73-musllinux_1_2_x86_64.whl", hash = "sha256:4022b9dc620e14f30201a8a73898a873c8e910cb642bcd2f3411123bc527f6ac"},
|
||||
{file = "rpds_py-0.16.2.tar.gz", hash = "sha256:781ef8bfc091b19960fc0142a23aedadafa826bc32b433fdfe6fd7f964d7ef44"},
|
||||
{file = "rpds_py-0.17.1-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:4128980a14ed805e1b91a7ed551250282a8ddf8201a4e9f8f5b7e6225f54170d"},
|
||||
{file = "rpds_py-0.17.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:ff1dcb8e8bc2261a088821b2595ef031c91d499a0c1b031c152d43fe0a6ecec8"},
|
||||
{file = "rpds_py-0.17.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d65e6b4f1443048eb7e833c2accb4fa7ee67cc7d54f31b4f0555b474758bee55"},
|
||||
{file = "rpds_py-0.17.1-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:a71169d505af63bb4d20d23a8fbd4c6ce272e7bce6cc31f617152aa784436f29"},
|
||||
{file = "rpds_py-0.17.1-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:436474f17733c7dca0fbf096d36ae65277e8645039df12a0fa52445ca494729d"},
|
||||
{file = "rpds_py-0.17.1-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:10162fe3f5f47c37ebf6d8ff5a2368508fe22007e3077bf25b9c7d803454d921"},
|
||||
{file = "rpds_py-0.17.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:720215373a280f78a1814becb1312d4e4d1077b1202a56d2b0815e95ccb99ce9"},
|
||||
{file = "rpds_py-0.17.1-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:70fcc6c2906cfa5c6a552ba7ae2ce64b6c32f437d8f3f8eea49925b278a61453"},
|
||||
{file = "rpds_py-0.17.1-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:91e5a8200e65aaac342a791272c564dffcf1281abd635d304d6c4e6b495f29dc"},
|
||||
{file = "rpds_py-0.17.1-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:99f567dae93e10be2daaa896e07513dd4bf9c2ecf0576e0533ac36ba3b1d5394"},
|
||||
{file = "rpds_py-0.17.1-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:24e4900a6643f87058a27320f81336d527ccfe503984528edde4bb660c8c8d59"},
|
||||
{file = "rpds_py-0.17.1-cp310-none-win32.whl", hash = "sha256:0bfb09bf41fe7c51413f563373e5f537eaa653d7adc4830399d4e9bdc199959d"},
|
||||
{file = "rpds_py-0.17.1-cp310-none-win_amd64.whl", hash = "sha256:20de7b7179e2031a04042e85dc463a93a82bc177eeba5ddd13ff746325558aa6"},
|
||||
{file = "rpds_py-0.17.1-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:65dcf105c1943cba45d19207ef51b8bc46d232a381e94dd38719d52d3980015b"},
|
||||
{file = "rpds_py-0.17.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:01f58a7306b64e0a4fe042047dd2b7d411ee82e54240284bab63e325762c1147"},
|
||||
{file = "rpds_py-0.17.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:071bc28c589b86bc6351a339114fb7a029f5cddbaca34103aa573eba7b482382"},
|
||||
{file = "rpds_py-0.17.1-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:ae35e8e6801c5ab071b992cb2da958eee76340e6926ec693b5ff7d6381441745"},
|
||||
{file = "rpds_py-0.17.1-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:149c5cd24f729e3567b56e1795f74577aa3126c14c11e457bec1b1c90d212e38"},
|
||||
{file = "rpds_py-0.17.1-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:e796051f2070f47230c745d0a77a91088fbee2cc0502e9b796b9c6471983718c"},
|
||||
{file = "rpds_py-0.17.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:60e820ee1004327609b28db8307acc27f5f2e9a0b185b2064c5f23e815f248f8"},
|
||||
{file = "rpds_py-0.17.1-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:1957a2ab607f9added64478a6982742eb29f109d89d065fa44e01691a20fc20a"},
|
||||
{file = "rpds_py-0.17.1-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:8587fd64c2a91c33cdc39d0cebdaf30e79491cc029a37fcd458ba863f8815383"},
|
||||
{file = "rpds_py-0.17.1-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:4dc889a9d8a34758d0fcc9ac86adb97bab3fb7f0c4d29794357eb147536483fd"},
|
||||
{file = "rpds_py-0.17.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:2953937f83820376b5979318840f3ee47477d94c17b940fe31d9458d79ae7eea"},
|
||||
{file = "rpds_py-0.17.1-cp311-none-win32.whl", hash = "sha256:1bfcad3109c1e5ba3cbe2f421614e70439f72897515a96c462ea657261b96518"},
|
||||
{file = "rpds_py-0.17.1-cp311-none-win_amd64.whl", hash = "sha256:99da0a4686ada4ed0f778120a0ea8d066de1a0a92ab0d13ae68492a437db78bf"},
|
||||
{file = "rpds_py-0.17.1-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:1dc29db3900cb1bb40353772417800f29c3d078dbc8024fd64655a04ee3c4bdf"},
|
||||
{file = "rpds_py-0.17.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:82ada4a8ed9e82e443fcef87e22a3eed3654dd3adf6e3b3a0deb70f03e86142a"},
|
||||
{file = "rpds_py-0.17.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1d36b2b59e8cc6e576f8f7b671e32f2ff43153f0ad6d0201250a7c07f25d570e"},
|
||||
{file = "rpds_py-0.17.1-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:3677fcca7fb728c86a78660c7fb1b07b69b281964673f486ae72860e13f512ad"},
|
||||
{file = "rpds_py-0.17.1-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:516fb8c77805159e97a689e2f1c80655c7658f5af601c34ffdb916605598cda2"},
|
||||
{file = "rpds_py-0.17.1-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:df3b6f45ba4515632c5064e35ca7f31d51d13d1479673185ba8f9fefbbed58b9"},
|
||||
{file = "rpds_py-0.17.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a967dd6afda7715d911c25a6ba1517975acd8d1092b2f326718725461a3d33f9"},
|
||||
{file = "rpds_py-0.17.1-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:dbbb95e6fc91ea3102505d111b327004d1c4ce98d56a4a02e82cd451f9f57140"},
|
||||
{file = "rpds_py-0.17.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:02866e060219514940342a1f84303a1ef7a1dad0ac311792fbbe19b521b489d2"},
|
||||
{file = "rpds_py-0.17.1-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:2528ff96d09f12e638695f3a2e0c609c7b84c6df7c5ae9bfeb9252b6fa686253"},
|
||||
{file = "rpds_py-0.17.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:bd345a13ce06e94c753dab52f8e71e5252aec1e4f8022d24d56decd31e1b9b23"},
|
||||
{file = "rpds_py-0.17.1-cp312-none-win32.whl", hash = "sha256:2a792b2e1d3038daa83fa474d559acfd6dc1e3650ee93b2662ddc17dbff20ad1"},
|
||||
{file = "rpds_py-0.17.1-cp312-none-win_amd64.whl", hash = "sha256:292f7344a3301802e7c25c53792fae7d1593cb0e50964e7bcdcc5cf533d634e3"},
|
||||
{file = "rpds_py-0.17.1-cp38-cp38-macosx_10_12_x86_64.whl", hash = "sha256:8ffe53e1d8ef2520ebcf0c9fec15bb721da59e8ef283b6ff3079613b1e30513d"},
|
||||
{file = "rpds_py-0.17.1-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:4341bd7579611cf50e7b20bb8c2e23512a3dc79de987a1f411cb458ab670eb90"},
|
||||
{file = "rpds_py-0.17.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2f4eb548daf4836e3b2c662033bfbfc551db58d30fd8fe660314f86bf8510b93"},
|
||||
{file = "rpds_py-0.17.1-cp38-cp38-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:b686f25377f9c006acbac63f61614416a6317133ab7fafe5de5f7dc8a06d42eb"},
|
||||
{file = "rpds_py-0.17.1-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:4e21b76075c01d65d0f0f34302b5a7457d95721d5e0667aea65e5bb3ab415c25"},
|
||||
{file = "rpds_py-0.17.1-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:b86b21b348f7e5485fae740d845c65a880f5d1eda1e063bc59bef92d1f7d0c55"},
|
||||
{file = "rpds_py-0.17.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f175e95a197f6a4059b50757a3dca33b32b61691bdbd22c29e8a8d21d3914cae"},
|
||||
{file = "rpds_py-0.17.1-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:1701fc54460ae2e5efc1dd6350eafd7a760f516df8dbe51d4a1c79d69472fbd4"},
|
||||
{file = "rpds_py-0.17.1-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:9051e3d2af8f55b42061603e29e744724cb5f65b128a491446cc029b3e2ea896"},
|
||||
{file = "rpds_py-0.17.1-cp38-cp38-musllinux_1_2_i686.whl", hash = "sha256:7450dbd659fed6dd41d1a7d47ed767e893ba402af8ae664c157c255ec6067fde"},
|
||||
{file = "rpds_py-0.17.1-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:5a024fa96d541fd7edaa0e9d904601c6445e95a729a2900c5aec6555fe921ed6"},
|
||||
{file = "rpds_py-0.17.1-cp38-none-win32.whl", hash = "sha256:da1ead63368c04a9bded7904757dfcae01eba0e0f9bc41d3d7f57ebf1c04015a"},
|
||||
{file = "rpds_py-0.17.1-cp38-none-win_amd64.whl", hash = "sha256:841320e1841bb53fada91c9725e766bb25009cfd4144e92298db296fb6c894fb"},
|
||||
{file = "rpds_py-0.17.1-cp39-cp39-macosx_10_12_x86_64.whl", hash = "sha256:f6c43b6f97209e370124baf2bf40bb1e8edc25311a158867eb1c3a5d449ebc7a"},
|
||||
{file = "rpds_py-0.17.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:5e7d63ec01fe7c76c2dbb7e972fece45acbb8836e72682bde138e7e039906e2c"},
|
||||
{file = "rpds_py-0.17.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:81038ff87a4e04c22e1d81f947c6ac46f122e0c80460b9006e6517c4d842a6ec"},
|
||||
{file = "rpds_py-0.17.1-cp39-cp39-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:810685321f4a304b2b55577c915bece4c4a06dfe38f6e62d9cc1d6ca8ee86b99"},
|
||||
{file = "rpds_py-0.17.1-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:25f071737dae674ca8937a73d0f43f5a52e92c2d178330b4c0bb6ab05586ffa6"},
|
||||
{file = "rpds_py-0.17.1-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:aa5bfb13f1e89151ade0eb812f7b0d7a4d643406caaad65ce1cbabe0a66d695f"},
|
||||
{file = "rpds_py-0.17.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:dfe07308b311a8293a0d5ef4e61411c5c20f682db6b5e73de6c7c8824272c256"},
|
||||
{file = "rpds_py-0.17.1-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:a000133a90eea274a6f28adc3084643263b1e7c1a5a66eb0a0a7a36aa757ed74"},
|
||||
{file = "rpds_py-0.17.1-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:5d0e8a6434a3fbf77d11448c9c25b2f25244226cfbec1a5159947cac5b8c5fa4"},
|
||||
{file = "rpds_py-0.17.1-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:efa767c220d94aa4ac3a6dd3aeb986e9f229eaf5bce92d8b1b3018d06bed3772"},
|
||||
{file = "rpds_py-0.17.1-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:dbc56680ecf585a384fbd93cd42bc82668b77cb525343170a2d86dafaed2a84b"},
|
||||
{file = "rpds_py-0.17.1-cp39-none-win32.whl", hash = "sha256:270987bc22e7e5a962b1094953ae901395e8c1e1e83ad016c5cfcfff75a15a3f"},
|
||||
{file = "rpds_py-0.17.1-cp39-none-win_amd64.whl", hash = "sha256:2a7b2f2f56a16a6d62e55354dd329d929560442bd92e87397b7a9586a32e3e76"},
|
||||
{file = "rpds_py-0.17.1-pp310-pypy310_pp73-macosx_10_12_x86_64.whl", hash = "sha256:a3264e3e858de4fc601741498215835ff324ff2482fd4e4af61b46512dd7fc83"},
|
||||
{file = "rpds_py-0.17.1-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:f2f3b28b40fddcb6c1f1f6c88c6f3769cd933fa493ceb79da45968a21dccc920"},
|
||||
{file = "rpds_py-0.17.1-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9584f8f52010295a4a417221861df9bea4c72d9632562b6e59b3c7b87a1522b7"},
|
||||
{file = "rpds_py-0.17.1-pp310-pypy310_pp73-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:c64602e8be701c6cfe42064b71c84ce62ce66ddc6422c15463fd8127db3d8066"},
|
||||
{file = "rpds_py-0.17.1-pp310-pypy310_pp73-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:060f412230d5f19fc8c8b75f315931b408d8ebf56aec33ef4168d1b9e54200b1"},
|
||||
{file = "rpds_py-0.17.1-pp310-pypy310_pp73-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:b9412abdf0ba70faa6e2ee6c0cc62a8defb772e78860cef419865917d86c7342"},
|
||||
{file = "rpds_py-0.17.1-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9737bdaa0ad33d34c0efc718741abaafce62fadae72c8b251df9b0c823c63b22"},
|
||||
{file = "rpds_py-0.17.1-pp310-pypy310_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:9f0e4dc0f17dcea4ab9d13ac5c666b6b5337042b4d8f27e01b70fae41dd65c57"},
|
||||
{file = "rpds_py-0.17.1-pp310-pypy310_pp73-musllinux_1_2_aarch64.whl", hash = "sha256:1db228102ab9d1ff4c64148c96320d0be7044fa28bd865a9ce628ce98da5973d"},
|
||||
{file = "rpds_py-0.17.1-pp310-pypy310_pp73-musllinux_1_2_i686.whl", hash = "sha256:d8bbd8e56f3ba25a7d0cf980fc42b34028848a53a0e36c9918550e0280b9d0b6"},
|
||||
{file = "rpds_py-0.17.1-pp310-pypy310_pp73-musllinux_1_2_x86_64.whl", hash = "sha256:be22ae34d68544df293152b7e50895ba70d2a833ad9566932d750d3625918b82"},
|
||||
{file = "rpds_py-0.17.1-pp38-pypy38_pp73-macosx_10_12_x86_64.whl", hash = "sha256:bf046179d011e6114daf12a534d874958b039342b347348a78b7cdf0dd9d6041"},
|
||||
{file = "rpds_py-0.17.1-pp38-pypy38_pp73-macosx_11_0_arm64.whl", hash = "sha256:1a746a6d49665058a5896000e8d9d2f1a6acba8a03b389c1e4c06e11e0b7f40d"},
|
||||
{file = "rpds_py-0.17.1-pp38-pypy38_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f0b8bf5b8db49d8fd40f54772a1dcf262e8be0ad2ab0206b5a2ec109c176c0a4"},
|
||||
{file = "rpds_py-0.17.1-pp38-pypy38_pp73-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:f7f4cb1f173385e8a39c29510dd11a78bf44e360fb75610594973f5ea141028b"},
|
||||
{file = "rpds_py-0.17.1-pp38-pypy38_pp73-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:7fbd70cb8b54fe745301921b0816c08b6d917593429dfc437fd024b5ba713c58"},
|
||||
{file = "rpds_py-0.17.1-pp38-pypy38_pp73-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:9bdf1303df671179eaf2cb41e8515a07fc78d9d00f111eadbe3e14262f59c3d0"},
|
||||
{file = "rpds_py-0.17.1-pp38-pypy38_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fad059a4bd14c45776600d223ec194e77db6c20255578bb5bcdd7c18fd169361"},
|
||||
{file = "rpds_py-0.17.1-pp38-pypy38_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:3664d126d3388a887db44c2e293f87d500c4184ec43d5d14d2d2babdb4c64cad"},
|
||||
{file = "rpds_py-0.17.1-pp38-pypy38_pp73-musllinux_1_2_aarch64.whl", hash = "sha256:698ea95a60c8b16b58be9d854c9f993c639f5c214cf9ba782eca53a8789d6b19"},
|
||||
{file = "rpds_py-0.17.1-pp38-pypy38_pp73-musllinux_1_2_i686.whl", hash = "sha256:c3d2010656999b63e628a3c694f23020322b4178c450dc478558a2b6ef3cb9bb"},
|
||||
{file = "rpds_py-0.17.1-pp38-pypy38_pp73-musllinux_1_2_x86_64.whl", hash = "sha256:938eab7323a736533f015e6069a7d53ef2dcc841e4e533b782c2bfb9fb12d84b"},
|
||||
{file = "rpds_py-0.17.1-pp39-pypy39_pp73-macosx_10_12_x86_64.whl", hash = "sha256:1e626b365293a2142a62b9a614e1f8e331b28f3ca57b9f05ebbf4cf2a0f0bdc5"},
|
||||
{file = "rpds_py-0.17.1-pp39-pypy39_pp73-macosx_11_0_arm64.whl", hash = "sha256:380e0df2e9d5d5d339803cfc6d183a5442ad7ab3c63c2a0982e8c824566c5ccc"},
|
||||
{file = "rpds_py-0.17.1-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b760a56e080a826c2e5af09002c1a037382ed21d03134eb6294812dda268c811"},
|
||||
{file = "rpds_py-0.17.1-pp39-pypy39_pp73-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:5576ee2f3a309d2bb403ec292d5958ce03953b0e57a11d224c1f134feaf8c40f"},
|
||||
{file = "rpds_py-0.17.1-pp39-pypy39_pp73-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:1f3c3461ebb4c4f1bbc70b15d20b565759f97a5aaf13af811fcefc892e9197ba"},
|
||||
{file = "rpds_py-0.17.1-pp39-pypy39_pp73-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:637b802f3f069a64436d432117a7e58fab414b4e27a7e81049817ae94de45d8d"},
|
||||
{file = "rpds_py-0.17.1-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ffee088ea9b593cc6160518ba9bd319b5475e5f3e578e4552d63818773c6f56a"},
|
||||
{file = "rpds_py-0.17.1-pp39-pypy39_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:3ac732390d529d8469b831949c78085b034bff67f584559340008d0f6041a049"},
|
||||
{file = "rpds_py-0.17.1-pp39-pypy39_pp73-musllinux_1_2_aarch64.whl", hash = "sha256:93432e747fb07fa567ad9cc7aaadd6e29710e515aabf939dfbed8046041346c6"},
|
||||
{file = "rpds_py-0.17.1-pp39-pypy39_pp73-musllinux_1_2_i686.whl", hash = "sha256:7b7d9ca34542099b4e185b3c2a2b2eda2e318a7dbde0b0d83357a6d4421b5296"},
|
||||
{file = "rpds_py-0.17.1-pp39-pypy39_pp73-musllinux_1_2_x86_64.whl", hash = "sha256:0387ce69ba06e43df54e43968090f3626e231e4bc9150e4c3246947567695f68"},
|
||||
{file = "rpds_py-0.17.1.tar.gz", hash = "sha256:0210b2668f24c078307260bf88bdac9d6f1093635df5123789bfee4d8d7fc8e7"},
|
||||
]
|
||||
|
||||
[[package]]
|
||||
|
@ -2138,13 +2184,13 @@ full = ["httpx (>=0.22.0)", "itsdangerous", "jinja2", "python-multipart", "pyyam
[[package]]
name = "streamlit"
version = "1.29.0"
version = "1.30.0"
description = "A faster way to build and share data apps"
optional = true
python-versions = ">=3.8, !=3.9.7"
files = [
{file = "streamlit-1.29.0-py2.py3-none-any.whl", hash = "sha256:753510edb5bb831af0e3bdacd353c879ad5b4f0211e7efa0ec378809464868b4"},
{file = "streamlit-1.29.0.tar.gz", hash = "sha256:b6dfff9c5e132e5518c92150efcd452980db492a45fafeac3d4688d2334efa07"},
{file = "streamlit-1.30.0-py2.py3-none-any.whl", hash = "sha256:536494a4edfe9b66ed70c437176cfd6c7e36b1d99d0587b0be64245fa89c241b"},
{file = "streamlit-1.30.0.tar.gz", hash = "sha256:90333915d9df8ce3b06de31b8a5bbab51e8cf0982dc6c32da9d6b1f2b4a9fa78"},
]

[package.dependencies]
@ -2153,7 +2199,7 @@ blinker = ">=1.0.0,<2"
cachetools = ">=4.0,<6"
click = ">=7.0,<9"
gitpython = ">=3.0.7,<3.1.19 || >3.1.19,<4"
importlib-metadata = ">=1.4,<7"
importlib-metadata = ">=1.4,<8"
numpy = ">=1.19.3,<2"
packaging = ">=16.8,<24"
pandas = ">=1.3.0,<3"
@ -2684,9 +2730,9 @@ testing = ["big-O", "jaraco.functools", "jaraco.itertools", "more-itertools", "p
[extras]
extra-proxy = ["streamlit"]
proxy = ["backoff", "fastapi", "gunicorn", "orjson", "pyyaml", "rq", "uvicorn"]
proxy = ["apscheduler", "backoff", "fastapi", "gunicorn", "orjson", "pyyaml", "rq", "uvicorn"]

[metadata]
lock-version = "2.0"
python-versions = ">=3.8.1,<3.9.7 || >3.9.7"
content-hash = "f4d60cb3f552af0d2a4e4ef5c6f55696fd6e546b75ff7b4ec362c3549a63c92a"
content-hash = "19f79f119f1760d3406b446fa3664b82c0d0859b3912dcb7ba7c8edf1d786096"
@ -1,10 +1,20 @@
model_list:
  - model_name: gpt-3.5-turbo
    litellm_params:
      model: azure/chatgpt-v-2
      api_base: https://openai-gpt-4-test-v-1.openai.azure.com/
      api_version: "2023-05-15"
      api_key: os.environ/AZURE_API_KEY # The `os.environ/` prefix tells litellm to read this from the env. See https://docs.litellm.ai/docs/simple_proxy#load-api-keys-from-vault
  - model_name: gpt-4
    litellm_params:
      model: azure/chatgpt-v-2
      api_base: https://openai-gpt-4-test-v-1.openai.azure.com/
      api_version: "2023-05-15"
      api_key: os.environ/AZURE_API_KEY # The `os.environ/` prefix tells litellm to read this from the env. See https://docs.litellm.ai/docs/simple_proxy#load-api-keys-from-vault
  - model_name: sagemaker-completion-model
    litellm_params:
      model: sagemaker/berri-benchmarking-Llama-2-70b-chat-hf-4
      input_cost_per_second: 0.000420
  - model_name: gpt-4
    litellm_params:
      model: azure/gpt-turbo
@ -17,11 +27,26 @@ model_list:
      api_key: os.environ/AZURE_EUROPE_API_KEY
      api_base: https://my-endpoint-europe-berri-992.openai.azure.com
      rpm: 10
  - model_name: text-embedding-ada-002
    litellm_params:
      model: azure/azure-embedding-model
      api_key: os.environ/AZURE_API_KEY
      api_base: https://openai-gpt-4-test-v-1.openai.azure.com/
      api_version: "2023-05-15"
    model_info:
      mode: embedding
      base_model: text-embedding-ada-002
  - model_name: dall-e-2
    litellm_params:
      model: azure/
      api_version: 2023-06-01-preview
      api_base: https://openai-gpt-4-test-v-1.openai.azure.com/
      api_key: os.environ/AZURE_API_KEY

litellm_settings:
  drop_params: True
  set_verbose: True

  max_budget: 100
  budget_duration: 30d
general_settings:
  master_key: sk-1234 # [OPTIONAL] Only use this if you to require all calls to contain this key (Authorization: Bearer sk-1234)
  # database_url: "postgresql://<user>:<password>@<host>:<port>/<dbname>" # [OPTIONAL] use for token-based auth to proxy
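For reference, a minimal sketch of how a client could exercise a proxy started with a config like the one above. The address and port are assumptions (adjust them to wherever the proxy is actually listening); the bearer token reuses the `master_key` from `general_settings`, and the model name must match a `model_name` defined in `model_list`:

import openai

client = openai.OpenAI(
    api_key="sk-1234",               # master_key from general_settings above
    base_url="http://0.0.0.0:4000",  # assumed proxy address -- not specified in the config itself
)

response = client.chat.completions.create(
    model="gpt-3.5-turbo",  # one of the model_name entries in model_list
    messages=[{"role": "user", "content": "Hello from the proxy config sketch"}],
)
print(response.choices[0].message.content)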
@ -1,9 +1,9 @@
[tool.poetry]
name = "litellm"
version = "1.18.0"
version = "1.20.5"
description = "Library to easily interface with LLM API providers"
authors = ["BerriAI"]
license = "MIT License"
license = "MIT"
readme = "README.md"

[tool.poetry.dependencies]
@ -22,9 +22,10 @@ uvicorn = {version = "^0.22.0", optional = true}
gunicorn = {version = "^21.2.0", optional = true}
fastapi = {version = "^0.104.1", optional = true}
backoff = {version = "*", optional = true}
pyyaml = {version = "^6.0", optional = true}
pyyaml = {version = "^6.0.1", optional = true}
rq = {version = "*", optional = true}
orjson = {version = "^3.9.7", optional = true}
apscheduler = {version = "^3.10.4", optional = true}
streamlit = {version = "^1.29.0", optional = true}

[tool.poetry.extras]
@ -36,6 +37,7 @@ proxy = [
    "pyyaml",
    "rq",
    "orjson",
    "apscheduler"
]

extra_proxy = [
@ -61,7 +63,7 @@ requires = ["poetry-core", "wheel"]
build-backend = "poetry.core.masonry.api"

[tool.commitizen]
version = "1.18.0"
version = "1.20.5"
version_files = [
    "pyproject.toml:^version"
]
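The `apscheduler` dependency added to the `proxy` extra above appears alongside the new `budget_duration` setting in the proxy config. As a rough, illustrative sketch (not code from this commit), this is the general pattern such a scheduler dependency enables for recurring background work; the function name and interval here are hypothetical:

from apscheduler.schedulers.background import BackgroundScheduler

def reset_budget():
    # hypothetical periodic task, e.g. resetting spend counters
    print("resetting budget counters")

scheduler = BackgroundScheduler()
scheduler.add_job(reset_budget, "interval", days=30)  # mirrors a 30d budget_duration
scheduler.start()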
Some files were not shown because too many files have changed in this diff.