Merge branch 'main' into main

2024-03-30 13:21:53 +09:00 · 2024-03-30 13:21:53 +09:00 · 1b84dfac91
commit 1b84dfac91
parent 29e8c144fb 97616ee267
301 changed files with 62646 additions and 3691 deletions
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@ -28,8 +28,9 @@ jobs:
            pip install "pytest==7.3.1"
            pip install "pytest-asyncio==0.21.1"
            pip install mypy
-            pip install "google-generativeai>=0.3.2"
-            pip install "google-cloud-aiplatform>=1.38.0"
+            pip install "google-generativeai==0.3.2"
+            pip install "google-cloud-aiplatform==1.43.0"
+            pip install pyarrow
            pip install "boto3>=1.28.57"
            pip install "aioboto3>=12.3.0"
            pip install langchain
@ -48,6 +49,7 @@ jobs:
            pip install argon2-cffi
            pip install "pytest-mock==3.12.0"
            pip install python-multipart
+            pip install google-cloud-aiplatform
      - save_cache:
          paths:
            - ./venv
@ -152,10 +154,11 @@ jobs:
            pip install "pytest-mock==3.12.0"
            pip install "pytest-asyncio==0.21.1"
            pip install mypy
-            pip install "google-generativeai>=0.3.2"
-            pip install "google-cloud-aiplatform>=1.38.0"
-            pip install "boto3>=1.28.57"
-            pip install "aioboto3>=12.3.0"
+            pip install "google-generativeai==0.3.2"
+            pip install "google-cloud-aiplatform==1.43.0"
+            pip install pyarrow
+            pip install "boto3==1.34.34"
+            pip install "aioboto3==12.3.0"
            pip install langchain
            pip install "langfuse>=2.0.0"
            pip install numpydoc
--- a/.circleci/requirements.txt
+++ b/.circleci/requirements.txt
@ -7,8 +7,7 @@ baseten
 cohere
 redis
 anthropic
-boto3
 orjson
 pydantic
-google-cloud-aiplatform
+google-cloud-aiplatform==1.43.0
 redisvl==0.0.7 # semantic caching
--- a/.github/workflows/ghcr_deploy.yml
+++ b/.github/workflows/ghcr_deploy.yml
@ -43,6 +43,13 @@ jobs:
          push: true
          file: Dockerfile.database
          tags: litellm/litellm-database:${{ github.event.inputs.tag || 'latest' }}
+      -
+        name: Build and push litellm-spend-logs image
+        uses: docker/build-push-action@v5
+        with:
+          push: true
+          file: ./litellm-js/spend-logs/Dockerfile
+          tags: litellm/litellm-spend_logs:${{ github.event.inputs.tag || 'latest' }}
      
  build-and-push-image:
    runs-on: ubuntu-latest
@ -120,6 +127,44 @@ jobs:
          tags: ${{ steps.meta-database.outputs.tags }}-${{ github.event.inputs.tag || 'latest' }}, ${{ steps.meta-database.outputs.tags }}-latest 
          labels: ${{ steps.meta-database.outputs.labels }} 
          platforms: local,linux/amd64,linux/arm64,linux/arm64/v8
+  
+  build-and-push-image-spend-logs:
+    runs-on: ubuntu-latest
+    permissions:
+      contents: read
+      packages: write
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+
+      - name: Log in to the Container registry
+        uses: docker/login-action@65b78e6e13532edd9afa3aa52ac7964289d1a9c1
+        with:
+          registry: ${{ env.REGISTRY }}
+          username: ${{ github.actor }}
+          password: ${{ secrets.GITHUB_TOKEN }}
+
+      - name: Extract metadata (tags, labels) for spend-logs Dockerfile
+        id: meta-spend-logs
+        uses: docker/metadata-action@9ec57ed1fcdbf14dcef7dfbe97b2010124a938b7
+        with:
+          images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}-spend_logs
+      # Configure multi platform Docker builds
+      - name: Set up QEMU
+        uses: docker/setup-qemu-action@e0e4588fad221d38ee467c0bffd91115366dc0c5
+      - name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@edfb0fe6204400c56fbfd3feba3fe9ad1adfa345
+
+      - name: Build and push Database Docker image
+        uses: docker/build-push-action@f2a1d5e99d037542a71f64918e516c093c6f3fc4
+        with:
+          context: .
+          file: ./litellm-js/spend-logs/Dockerfile
+          push: true
+          tags: ${{ steps.meta-spend-logs.outputs.tags }}-${{ github.event.inputs.tag || 'latest' }}, ${{ steps.meta-spend-logs.outputs.tags }}-latest 
+          labels: ${{ steps.meta-spend-logs.outputs.labels }} 
+          platforms: local,linux/amd64,linux/arm64,linux/arm64/v8
+
  build-and-push-helm-chart:
    runs-on: ubuntu-latest
    steps:
--- a/8
+++ b/8
@ -1,8 +1,8 @@
 # Base image for building
-ARG LITELLM_BUILD_IMAGE=python:3.9
+ARG LITELLM_BUILD_IMAGE=python:3.11.8-slim

 # Runtime image
-ARG LITELLM_RUNTIME_IMAGE=python:3.9-slim
+ARG LITELLM_RUNTIME_IMAGE=python:3.11.8-slim
 # Builder stage
 FROM $LITELLM_BUILD_IMAGE as builder

@ -70,5 +70,5 @@ EXPOSE 4000/tcp
 ENTRYPOINT ["litellm"]

 # Append "--detailed_debug" to the end of CMD to view detailed debug logs 
-# CMD ["--port", "4000", "--config", "./proxy_server_config.yaml", "--run_gunicorn", "--detailed_debug"]
-CMD ["--port", "4000", "--config", "./proxy_server_config.yaml", "--run_gunicorn", "--num_workers", "4"]
+# CMD ["--port", "4000", "--config", "./proxy_server_config.yaml"]
+CMD ["--port", "4000", "--config", "./proxy_server_config.yaml"]
--- a/Dockerfile.database
+++ b/Dockerfile.database
@ -1,8 +1,8 @@
 # Base image for building
-ARG LITELLM_BUILD_IMAGE=python:3.9
+ARG LITELLM_BUILD_IMAGE=python:3.11.8-slim

 # Runtime image
-ARG LITELLM_RUNTIME_IMAGE=python:3.9-slim
+ARG LITELLM_RUNTIME_IMAGE=python:3.11.8-slim
 # Builder stage
 FROM $LITELLM_BUILD_IMAGE as builder

@ -72,5 +72,5 @@ EXPOSE 4000/tcp
 ENTRYPOINT ["litellm"]

 # Append "--detailed_debug" to the end of CMD to view detailed debug logs 
-# CMD ["--port", "4000","--run_gunicorn", "--detailed_debug"]
-CMD ["--port", "4000", "--run_gunicorn"]
+# CMD ["--port", "4000", "--detailed_debug"]
+CMD ["--port", "4000"]
--- a/README.md
+++ b/README.md
@ -31,11 +31,11 @@ LiteLLM manages:
 - Retry/fallback logic across multiple deployments (e.g. Azure/OpenAI) - [Router](https://docs.litellm.ai/docs/routing)
 - Set Budgets & Rate limits per project, api key, model [OpenAI Proxy Server](https://docs.litellm.ai/docs/simple_proxy)

-**Stable Release**: v`1.30.2` 👈 Recommended stable version of proxy. 
-
 [**Jump to OpenAI Proxy Docs**](https://github.com/BerriAI/litellm?tab=readme-ov-file#openai-proxy---docs) <br>
 [**Jump to Supported LLM Providers**](https://github.com/BerriAI/litellm?tab=readme-ov-file#supported-provider-docs)

+🚨 **Stable Release:** v1.34.1 
+
 Support for more providers. Missing a provider or LLM Platform, raise a [feature request](https://github.com/BerriAI/litellm/issues/new?assignees=&labels=enhancement&projects=&template=feature_request.yml&title=%5BFeature%5D%3A+).

 # Usage ([**Docs**](https://docs.litellm.ai/docs/))
--- a/deploy/kubernetes/kub.yaml
+++ b/deploy/kubernetes/kub.yaml
@ -0,0 +1,55 @@
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: litellm-deployment
+spec:
+  replicas: 3
+  selector:
+    matchLabels:
+      app: litellm
+  template:
+    metadata:
+      labels:
+        app: litellm
+    spec:
+      containers:
+        - name: litellm-container
+          image: ghcr.io/berriai/litellm:main-latest
+          env:
+            - name: AZURE_API_KEY
+              value: "d6f****"
+            - name: AZURE_API_BASE
+              value: "https://openai
+            - name: LITELLM_MASTER_KEY
+              value: "sk-1234"
+            - name: DATABASE_URL
+              value: "postgresql://ishaan:*********""
+          args:
+            - "--config"
+            - "/app/proxy_config.yaml"  # Update the path to mount the config file
+          volumeMounts:                 # Define volume mount for proxy_config.yaml
+            - name: config-volume
+              mountPath: /app
+              readOnly: true
+          livenessProbe:
+            httpGet:
+              path: /health/liveliness
+              port: 4000
+            initialDelaySeconds: 120
+            periodSeconds: 15
+            successThreshold: 1
+            failureThreshold: 3
+            timeoutSeconds: 10
+          readinessProbe:
+            httpGet:
+              path: /health/readiness
+              port: 4000
+            initialDelaySeconds: 120
+            periodSeconds: 15
+            successThreshold: 1
+            failureThreshold: 3
+            timeoutSeconds: 10
+      volumes:  # Define volume to mount proxy_config.yaml
+        - name: config-volume
+          configMap:
+            name: litellm-config  
--- a/deploy/kubernetes/service.yaml
+++ b/deploy/kubernetes/service.yaml
@ -0,0 +1,12 @@
+apiVersion: v1
+kind: Service
+metadata:
+  name: litellm-service
+spec:
+  selector:
+    app: litellm
+  ports:
+    - protocol: TCP
+      port: 4000
+      targetPort: 4000
+  type: LoadBalancer
--- a/docs/my-website/docs/debugging/hosted_debugging.md
+++ b/docs/my-website/docs/debugging/hosted_debugging.md
@ -76,7 +76,6 @@ Click on your personal dashboard link. Here's how you can find it 👇

 Oh! Looks like our request was made successfully. Let's click on it and see exactly what got sent to the LLM provider. 

-<Image img={require('../../img/dashboard_log_row.png')} alt="Dashboard Log Row" />    



--- a/docs/my-website/docs/observability/athina_integration.md
+++ b/docs/my-website/docs/observability/athina_integration.md
@ -41,6 +41,35 @@ response = completion(
 ) 
 ```

+## Additional information in metadata
+You can send some additional information to Athina by using the `metadata` field in completion. This can be useful for sending metadata about the request, such as the customer_id, prompt_slug, or any other information you want to track.
+
+```python
+#openai call with additional metadata
+response = completion(
+  model="gpt-3.5-turbo",
+  messages=[
+    {"role": "user", "content": "Hi 👋 - i'm openai"}
+  ],
+  metadata={
+    "environment": "staging",
+    "prompt_slug": "my_prompt_slug/v1"
+  }
+)
+```
+
+Following are the allowed fields in metadata, their types, and their descriptions:
+
+* `environment: Optional[str]` - Environment your app is running in (ex: production, staging, etc). This is useful for segmenting inference calls by environment.
+* `prompt_slug: Optional[str]` - Identifier for the prompt used for inference. This is useful for segmenting inference calls by prompt.
+* `customer_id: Optional[str]` - This is your customer ID. This is useful for segmenting inference calls by customer.
+* `customer_user_id: Optional[str]` - This is the end user ID. This is useful for segmenting inference calls by the end user.
+* `session_id: Optional[str]` - is the session or conversation ID. This is used for grouping different inferences into a conversation or chain. [Read more].(https://docs.athina.ai/logging/grouping_inferences)
+* `external_reference_id: Optional[str]` - This is useful if you want to associate your own internal identifier with the inference logged to Athina.
+* `context: Optional[Union[dict, str]]` - This is the context used as information for the prompt. For RAG applications, this is the "retrieved" data. You may log context as a string or as an object (dictionary).
+* `expected_response: Optional[str]` - This is the reference response to compare against for evaluation purposes. This is useful for segmenting inference calls by expected response.
+* `user_query: Optional[str]` - This is the user's query. For conversational applications, this is the user's last message.
+
 ## Support & Talk with Athina Team

 - [Schedule Demo 👋](https://cal.com/shiv-athina/30min)
--- a/docs/my-website/docs/providers/anthropic.md
+++ b/docs/my-website/docs/providers/anthropic.md
@ -60,11 +60,30 @@ export ANTHROPIC_API_KEY="your-api-key"

 ### 2. Start the proxy 

+<Tabs>
+<TabItem value="cli" label="cli">
+
 ```bash
 $ litellm --model claude-3-opus-20240229

 # Server running on http://0.0.0.0:4000
 ```
+</TabItem>
+<TabItem value="config" label="config.yaml">
+
+```yaml
+model_list:
+  - model_name: claude-3 ### RECEIVED MODEL NAME ###
+    litellm_params: # all params accepted by litellm.completion() - https://docs.litellm.ai/docs/completion/input
+      model: claude-3-opus-20240229 ### MODEL NAME sent to `litellm.completion()` ###
+      api_key: "os.environ/ANTHROPIC_API_KEY" # does os.getenv("AZURE_API_KEY_EU")
+```
+
+```bash
+litellm --config /path/to/config.yaml
+```
+</TabItem>
+</Tabs>

 ### 3. Test it

@ -76,7 +95,7 @@ $ litellm --model claude-3-opus-20240229
 curl --location 'http://0.0.0.0:4000/chat/completions' \
 --header 'Content-Type: application/json' \
 --data ' {
-      "model": "gpt-3.5-turbo",
+      "model": "claude-3",
      "messages": [
        {
          "role": "user",
@ -97,7 +116,7 @@ client = openai.OpenAI(
 )

 # request sent to model set on litellm proxy, `litellm --model`
-response = client.chat.completions.create(model="gpt-3.5-turbo", messages = [
+response = client.chat.completions.create(model="claude-3", messages = [
    {
        "role": "user",
        "content": "this is a test request, write a short poem"
@ -121,7 +140,7 @@ from langchain.schema import HumanMessage, SystemMessage

 chat = ChatOpenAI(
    openai_api_base="http://0.0.0.0:4000", # set openai_api_base to the LiteLLM Proxy
-    model = "gpt-3.5-turbo",
+    model = "claude-3",
    temperature=0.1
 )

@ -238,7 +257,7 @@ resp = litellm.completion(
 print(f"\nResponse: {resp}")
 ```

-### Usage - "Assistant Pre-fill"
+## Usage - "Assistant Pre-fill"

 You can "put words in Claude's mouth" by including an `assistant` role message as the last item in the `messages` array.

@ -271,8 +290,8 @@ Human: How do you say 'Hello' in German? Return your answer as a JSON object, li
 Assistant: {
 ```

-### Usage - "System" messages
-If you're using Anthropic's Claude 2.1 with Bedrock, `system` role messages are properly formatted for you.
+## Usage - "System" messages
+If you're using Anthropic's Claude 2.1, `system` role messages are properly formatted for you.

 ```python
 import os
--- a/docs/my-website/docs/providers/aws_sagemaker.md
+++ b/docs/my-website/docs/providers/aws_sagemaker.md
@ -20,7 +20,28 @@ os.environ["AWS_SECRET_ACCESS_KEY"] = ""
 os.environ["AWS_REGION_NAME"] = ""

 response = completion(
-            model="sagemaker/jumpstart-dft-meta-textgeneration-llama-2-7b", 
+            model="sagemaker/<your-endpoint-name>", 
+            messages=[{ "content": "Hello, how are you?","role": "user"}],
+            temperature=0.2,
+            max_tokens=80
+        )
+```
+
+### Passing Inference Component Name
+
+If you have multiple models on an endpoint, you'll need to specify the individual model names, do this via `model_id`.  
+
+```python
+import os 
+from litellm import completion
+
+os.environ["AWS_ACCESS_KEY_ID"] = ""
+os.environ["AWS_SECRET_ACCESS_KEY"] = ""
+os.environ["AWS_REGION_NAME"] = ""
+
+response = completion(
+            model="sagemaker/<your-endpoint-name>", 
+            model_id="<your-model-name",
            messages=[{ "content": "Hello, how are you?","role": "user"}],
            temperature=0.2,
            max_tokens=80
--- a/docs/my-website/docs/providers/gemini.md
+++ b/docs/my-website/docs/providers/gemini.md
@ -2,6 +2,7 @@

 ## Pre-requisites
 * `pip install -q google-generativeai`
+* Get API Key - https://aistudio.google.com/

 # Gemini-Pro
 ## Sample Usage
@ -97,6 +98,6 @@ print(content)
 | Model Name       | Function Call                        | Required OS Variables    |
 |------------------|--------------------------------------|-------------------------|
 | gemini-pro       | `completion('gemini/gemini-pro', messages)` | `os.environ['GEMINI_API_KEY']` |
-| gemini-1.5-pro       | `completion('gemini/gemini-pro', messages)` | `os.environ['GEMINI_API_KEY']` |
+| gemini-1.5-pro       | `completion('gemini/gemini-1.5-pro', messages)` | `os.environ['GEMINI_API_KEY']` |
 | gemini-pro-vision       | `completion('gemini/gemini-pro-vision', messages)` | `os.environ['GEMINI_API_KEY']` |
-| gemini-1.5-pro-vision       | `completion('gemini/gemini-pro-vision', messages)` | `os.environ['GEMINI_API_KEY']` |
+| gemini-1.5-pro-vision       | `completion('gemini/gemini-1.5-pro-vision', messages)` | `os.environ['GEMINI_API_KEY']` |
--- a/docs/my-website/docs/proxy/budget_alerts.md
+++ b/docs/my-website/docs/proxy/budget_alerts.md
@ -1,61 +0,0 @@
-import Image from '@theme/IdealImage';
-
-# 🚨 Budget Alerting
-
-**Alerts when a project will exceed it’s planned limit**
-
-<Image img={require('../../img/budget_alerts.png')} />
-
-## Quick Start
-
-### 1. Setup Slack Alerting on your Proxy Config.yaml 
-
-**Add Slack Webhook to your env**
-Get a slack webhook url from https://api.slack.com/messaging/webhooks
-
-
-Set `SLACK_WEBHOOK_URL` in your proxy env
-
-```shell
-export SLACK_WEBHOOK_URL="https://hooks.slack.com/services/<>/<>/<>"
-```
-
-**Update proxy config.yaml with slack alerting**  
-
-Add `general_settings:alerting`
-```yaml
-model_list: 
-    model_name: "azure-model"
-    litellm_params:
-        model: "azure/gpt-35-turbo"
-
-general_settings: 
-    alerting: ["slack"]
-```
-
-
-
-Start proxy
-```bash
-$ litellm --config /path/to/config.yaml
-```
-
-
-### 2. Create API Key on Proxy Admin UI
-The Admin UI is found on `your-litellm-proxy-endpoint/ui`, example `http://localhost:4000/ui/` 
-
- Set a key name 
- Set a Soft Budget on when to get alerted 
-
-<Image img={require('../../img/create_key.png')} />
-
-
-### 3. Test Slack Alerting on Admin UI
-After creating a key on the Admin UI, click on "Test Slack Alert" to send a test alert to your Slack channel
-<Image img={require('../../img/test_alert.png')} />
-
-### 4. Check Slack 
-
-When the test alert works, you should expect to see this on your alerts slack channel 
-
-<Image img={require('../../img/budget_alerts.png')} />
--- a/docs/my-website/docs/proxy/caching.md
+++ b/docs/my-website/docs/proxy/caching.md
@ -32,8 +32,9 @@ litellm_settings:
  cache: True          # set cache responses to True, litellm defaults to using a redis cache
 ```

-#### [OPTIONAL] Step 1.5: Add redis namespaces 
+#### [OPTIONAL] Step 1.5: Add redis namespaces, default ttl 

+## Namespace
 If you want to create some folder for your keys, you can set a namespace, like this:

 ```yaml
@ -50,6 +51,16 @@ and keys will be stored like:
 litellm_caching:<hash>
 ```

+## TTL
+
+```yaml
+litellm_settings:
+  cache: true 
+  cache_params:        # set cache params for redis
+    type: redis
+    ttl: 600 # will be cached on redis for 600s
+```
+
 #### Step 2: Add Redis Credentials to .env
 Set either `REDIS_URL` or the `REDIS_HOST` in your os environment, to enable caching.

--- a/docs/my-website/docs/proxy/call_hooks.md
+++ b/docs/my-website/docs/proxy/call_hooks.md
@ -1,7 +1,10 @@
+import Image from '@theme/IdealImage';
+
 # Modify / Reject Incoming Requests

 - Modify data before making llm api calls on proxy
 - Reject data before making llm api calls / before returning the response 
+- Enforce 'user' param for all openai endpoint calls

 See a complete example with our [parallel request rate limiter](https://github.com/BerriAI/litellm/blob/main/litellm/proxy/hooks/parallel_request_limiter.py)

@ -95,7 +98,7 @@ We might need to update the function schema in the future, to support multiple e

 :::

-See a complete example with our [Llama Guard content moderation hook](https://github.com/BerriAI/litellm/blob/main/enterprise/hooks/llama_guard.py)
+See a complete example with our [Llama Guard content moderation hook](https://github.com/BerriAI/litellm/blob/main/enterprise/enterprise_hooks/llm_guard.py)

 ```python
 from litellm.integrations.custom_logger import CustomLogger
@ -172,4 +175,19 @@ curl --location 'http://0.0.0.0:4000/chat/completions' \
        }
    ],
    }'
-```
+```
+
+## Advanced - Enforce 'user' param 
+
+Set `enforce_user_param` to true, to require all calls to the openai endpoints to have the 'user' param. 
+
+[**See Code**](https://github.com/BerriAI/litellm/blob/4777921a31c4c70e4d87b927cb233b6a09cd8b51/litellm/proxy/auth/auth_checks.py#L72)
+
+```yaml
+general_settings:
+  enforce_user_param: True
+```
+
+**Result**
+
+<Image img={require('../../img/end_user_enforcement.png')}/>
--- a/docs/my-website/docs/proxy/configs.md
+++ b/docs/my-website/docs/proxy/configs.md
@ -62,7 +62,6 @@ model_list:

 litellm_settings: # module level litellm settings - https://github.com/BerriAI/litellm/blob/main/litellm/__init__.py
  drop_params: True
-  set_verbose: True

 general_settings: 
  master_key: sk-1234 # [OPTIONAL] Only use this if you to require all calls to contain this key (Authorization: Bearer sk-1234)
@ -558,6 +557,16 @@ curl --location 'http://0.0.0.0:4000/chat/completions' \
 }'
 ```

+## Disable Swagger UI 
+
+To disable the Swagger docs from the base url, set 
+
+```env
+NO_DOCS="True"
+```
+
+in your environment, and restart the proxy. 
+

 ## Configure DB Pool Limits + Connection Timeouts 

@ -592,7 +601,9 @@ general_settings:
    "completion_model": "string",
    "disable_spend_logs": "boolean", # turn off writing each transaction to the db
    "disable_reset_budget": "boolean", # turn off reset budget scheduled task
-    "enable_jwt_auth": "boolean", # allow proxy admin to auth in via jwt tokens with 'litellm_proxy_admin' in claims 
+    "enable_jwt_auth": "boolean", # allow proxy admin to auth in via jwt tokens with 'litellm_proxy_admin' in claims
+    "enforce_user_param": "boolean", # requires all openai endpoint requests to have a 'user' param
+    "allowed_routes": "list", # list of allowed proxy API routes - a user can access. (currently JWT-Auth only)
    "key_management_system": "google_kms", # either google_kms or azure_kms
    "master_key": "string",
    "database_url": "string",
--- a/docs/my-website/docs/proxy/deploy.md
+++ b/docs/my-website/docs/proxy/deploy.md
@ -103,7 +103,10 @@ RUN chmod +x entrypoint.sh
 EXPOSE 4000/tcp

 # Override the CMD instruction with your desired command and arguments
-CMD ["--port", "4000", "--config", "config.yaml", "--detailed_debug", "--run_gunicorn"]
+# WARNING: FOR PROD DO NOT USE `--detailed_debug` it slows down response times, instead use the following CMD
+# CMD ["--port", "4000", "--config", "config.yaml"]
+
+CMD ["--port", "4000", "--config", "config.yaml", "--detailed_debug"]
 ```

 </TabItem>
@ -232,7 +235,6 @@ Your OpenAI proxy server is now running on `http://127.0.0.1:4000`.
 | [LiteLLM container + Redis](#litellm-container--redis) | + load balance across multiple litellm containers |
 | [LiteLLM Database container + PostgresDB + Redis](#litellm-database-container--postgresdb--redis) | + use Virtual Keys + Track Spend + load balance across multiple litellm containers |

-
 ## Deploy with Database
 ### Docker, Kubernetes, Helm Chart

@ -474,25 +476,6 @@ docker run --name litellm-proxy \
 ghcr.io/berriai/litellm-database:main-latest --config your_config.yaml
 ```

-## Best Practices for Deploying to Production
-### 1. Switch of debug logs in production 
-don't use [`--detailed-debug`, `--debug`](https://docs.litellm.ai/docs/proxy/debugging#detailed-debug) or `litellm.set_verbose=True`. We found using debug logs can add 5-10% latency per LLM API call
-
-### 2. Use `run_gunicorn` and `num_workers`
-
-Example setting `--run_gunicorn` and `--num_workers`
-```shell
-docker run ghcr.io/berriai/litellm-database:main-latest --run_gunicorn --num_workers 4
-```
-
-Why `Gunicorn`?
- Gunicorn takes care of running multiple instances of your web application
- Gunicorn is ideal for running litellm proxy on cluster of machines with Kubernetes
-
-Why `num_workers`? 
-Setting `num_workers` to the number of CPUs available ensures optimal utilization of system resources by matching the number of worker processes to the available CPU cores.
-
-
 ## Advanced Deployment Settings

 ### Customization of the server root path
@ -525,6 +508,57 @@ Provide an ssl certificate when starting litellm proxy server
 ## Platform-specific Guide

 <Tabs>
+<TabItem value="AWS EKS" label="AWS EKS - Kubernetes">
+
+### Kubernetes - Deploy on EKS
+
+Step1. Create an EKS Cluster with the following spec
+
+```shell
+eksctl create cluster --name=litellm-cluster --region=us-west-2 --node-type=t2.small
+```
+
+Step 2. Mount litellm proxy config on kub cluster 
+
+This will mount your local file called `proxy_config.yaml` on kubernetes cluster
+
+```shell
+kubectl create configmap litellm-config --from-file=proxy_config.yaml
+```
+
+Step 3. Apply `kub.yaml` and `service.yaml`
+Clone the following `kub.yaml` and `service.yaml` files and apply locally
+
+- Use this `kub.yaml` file - [litellm kub.yaml](https://github.com/BerriAI/litellm/blob/main/deploy/kubernetes/kub.yaml)
+
+- Use this `service.yaml` file - [litellm service.yaml](https://github.com/BerriAI/litellm/blob/main/deploy/kubernetes/service.yaml)
+
+Apply `kub.yaml`
+```
+kubectl apply -f kub.yaml
+```
+
+Apply `service.yaml` - creates an AWS load balancer to expose the proxy
+```
+kubectl apply -f service.yaml
+
+# service/litellm-service created
+```
+
+Step 4. Get Proxy Base URL
+
+```shell
+kubectl get services
+
+# litellm-service   LoadBalancer   10.100.6.31   a472dc7c273fd47fd******.us-west-2.elb.amazonaws.com   4000:30374/TCP   63m
+```
+
+Proxy Base URL =  `a472dc7c273fd47fd******.us-west-2.elb.amazonaws.com:4000`
+
+That's it, now you can start using LiteLLM Proxy
+
+</TabItem>
+

 <TabItem value="aws-stack" label="AWS Cloud Formation Stack">

--- a/docs/my-website/docs/proxy/enterprise.md
+++ b/docs/my-website/docs/proxy/enterprise.md
@ -12,9 +12,9 @@ Features here are behind a commercial license in our `/enterprise` folder. [**Se
 :::

 Features: 
+- ✅ Content Moderation with LLM Guard
 - ✅ Content Moderation with LlamaGuard 
 - ✅ Content Moderation with Google Text Moderations 
- ✅ Content Moderation with LLM Guard
 - ✅ Reject calls from Blocked User list 
 - ✅ Reject calls (incoming / outgoing) with Banned Keywords (e.g. competitors)
 - ✅ Don't log/store specific requests (eg confidential LLM requests)
@ -23,6 +23,71 @@ Features:


 ## Content Moderation
+### Content Moderation with LLM Guard
+
+Set the LLM Guard API Base in your environment 
+
+```env
+LLM_GUARD_API_BASE = "http://0.0.0.0:8192" # deployed llm guard api
+```
+
+Add `llmguard_moderations` as a callback 
+
+```yaml
+litellm_settings:
+    callbacks: ["llmguard_moderations"]
+```
+
+Now you can easily test it
+
+- Make a regular /chat/completion call 
+
+- Check your proxy logs for any statement with `LLM Guard:`
+
+Expected results: 
+
+```
+LLM Guard: Received response - {"sanitized_prompt": "hello world", "is_valid": true, "scanners": { "Regex": 0.0 }}
+```
+#### Turn on/off per key
+
+**1. Update config**
+```yaml
+litellm_settings:
+    callbacks: ["llmguard_moderations"]
+    llm_guard_mode: "key-specific"
+```
+
+**2. Create new key**
+
+```bash
+curl --location 'http://localhost:4000/key/generate' \
+--header 'Authorization: Bearer sk-1234' \
+--header 'Content-Type: application/json' \
+--data '{
+    "models": ["fake-openai-endpoint"],
+    "permissions": {
+        "enable_llm_guard_check": true # 👈 KEY CHANGE
+    }
+}'
+
+# Returns {..'key': 'my-new-key'}
+```
+
+**2. Test it!**
+
+```bash
+curl --location 'http://0.0.0.0:4000/v1/chat/completions' \
+--header 'Content-Type: application/json' \
+--header 'Authorization: Bearer my-new-key' \ # 👈 TEST KEY
+--data '{"model": "fake-openai-endpoint", "messages": [
+        {"role": "system", "content": "Be helpful"},
+        {"role": "user", "content": "What do you know?"}
+    ]
+    }'
+```
+
+
 ### Content Moderation with LlamaGuard 

 Currently works with Sagemaker's LlamaGuard endpoint. 
@ -55,32 +120,7 @@ callbacks: ["llamaguard_moderations"]
  llamaguard_unsafe_content_categories: /path/to/llamaguard_prompt.txt
 ```

-### Content Moderation with LLM Guard

-Set the LLM Guard API Base in your environment 
-
-```env
-LLM_GUARD_API_BASE = "http://0.0.0.0:4000"
-```
-
-Add `llmguard_moderations` as a callback 
-
-```yaml
-litellm_settings:
-    callbacks: ["llmguard_moderations"]
-```
-
-Now you can easily test it
-
- Make a regular /chat/completion call 
-
- Check your proxy logs for any statement with `LLM Guard:`
-
-Expected results: 
-
-```
-LLM Guard: Received response - {"sanitized_prompt": "hello world", "is_valid": true, "scanners": { "Regex": 0.0 }}
-```

 ### Content Moderation with Google Text Moderation 

--- a/docs/my-website/docs/proxy/grafana_metrics.md
+++ b/docs/my-website/docs/proxy/grafana_metrics.md
@ -0,0 +1,53 @@
+# Grafana, Prometheus metrics [BETA]
+
+LiteLLM Exposes a `/metrics` endpoint for Prometheus to Poll
+
+## Quick Start
+
+If you're using the LiteLLM CLI with `litellm --config proxy_config.yaml` then you need to `pip install prometheus_client==0.20.0`. **This is already pre-installed on the litellm Docker image**
+
+Add this to your proxy config.yaml 
+```yaml
+model_list:
+ - model_name: gpt-3.5-turbo
+    litellm_params:
+      model: gpt-3.5-turbo
+litellm_settings:
+  success_callback: ["prometheus"]
+```
+
+Start the proxy
+```shell
+litellm --config config.yaml --debug
+```
+
+Test Request
+```shell
+curl --location 'http://0.0.0.0:4000/chat/completions' \
+    --header 'Content-Type: application/json' \
+    --data '{
+    "model": "gpt-3.5-turbo",
+    "messages": [
+        {
+        "role": "user",
+        "content": "what llm are you"
+        }
+    ]
+}'
+```
+
+View Metrics on `/metrics`, Visit `http://localhost:4000/metrics` 
+```shell
+http://localhost:4000/metrics
+
+# <proxy_base_url>/metrics
+```
+
+## Metrics Tracked 
+
+
+| Metric Name          | Description                          |
+|----------------------|--------------------------------------|
+| `litellm_requests_metric`             | Number of requests made, per `"user", "key", "model"`          |
+| `litellm_spend_metric`                | Total Spend, per `"user", "key", "model"`                 |
+| `litellm_total_tokens`         | input + output tokens per `"user", "key", "model"`     |
--- a/docs/my-website/docs/proxy/prod.md
+++ b/docs/my-website/docs/proxy/prod.md
@ -0,0 +1,249 @@
+import Tabs from '@theme/Tabs';
+import TabItem from '@theme/TabItem';
+
+# ⚡ Best Practices for Production
+
+Expected Performance in Production
+
+1 LiteLLM Uvicorn Worker on Kubernetes
+
+| Description | Value |
+|--------------|-------|
+| Avg latency | `50ms` |
+| Median latency | `51ms` |
+| `/chat/completions` Requests/second | `35` |
+| `/chat/completions` Requests/minute | `2100` |
+| `/chat/completions` Requests/hour | `126K` |
+
+
+## 1. Switch of Debug Logging
+
+Remove `set_verbose: True` from your config.yaml
+```yaml
+litellm_settings:
+  set_verbose: True
+```
+
+You should only see the following level of details in logs on the proxy server
+```shell
+# INFO:     192.168.2.205:11774 - "POST /chat/completions HTTP/1.1" 200 OK
+# INFO:     192.168.2.205:34717 - "POST /chat/completions HTTP/1.1" 200 OK
+# INFO:     192.168.2.205:29734 - "POST /chat/completions HTTP/1.1" 200 OK
+```
+
+## 2. On Kubernetes - Use 1 Uvicorn worker [Suggested CMD]
+
+Use this Docker `CMD`. This will start the proxy with 1 Uvicorn Async Worker
+
+(Ensure that you're not setting `run_gunicorn` or `num_workers` in the CMD). 
+```shell
+CMD ["--port", "4000", "--config", "./proxy_server_config.yaml"]
+```
+
+## 2. Batch write spend updates every 60s
+
+The default proxy batch write is 10s. This is to make it easy to see spend when debugging locally. 
+
+In production, we recommend using a longer interval period of 60s. This reduces the number of connections used to make DB writes. 
+
+```yaml
+general_settings:
+  master_key: sk-1234
+  proxy_batch_write_at: 5 # 👈 Frequency of batch writing logs to server (in seconds)
+```
+
+
+## 3. Move spend logs to separate server
+
+Writing each spend log to the db can slow down your proxy. In testing we saw a 70% improvement in median response time, by moving writing spend logs to a separate server. 
+
+👉 [LiteLLM Spend Logs Server](https://github.com/BerriAI/litellm/tree/main/litellm-js/spend-logs)
+
+
+**Spend Logs**  
+This is a log of the key, tokens, model, and latency for each call on the proxy. 
+
+[**Full Payload**](https://github.com/BerriAI/litellm/blob/8c9623a6bc4ad9da0a2dac64249a60ed8da719e8/litellm/proxy/utils.py#L1769)
+
+
+**1. Start the spend logs server**
+
+```bash
+docker run -p 3000:3000 \
+  -e DATABASE_URL="postgres://.." \
+  ghcr.io/berriai/litellm-spend_logs:main-latest
+
+# RUNNING on http://0.0.0.0:3000
+```
+
+**2. Connect to proxy**
+
+
+Example litellm_config.yaml
+
+```yaml
+model_list:
+- model_name: fake-openai-endpoint
+  litellm_params:
+    model: openai/my-fake-model
+    api_key: my-fake-key
+    api_base: https://exampleopenaiendpoint-production.up.railway.app/
+
+general_settings:
+  master_key: sk-1234
+  proxy_batch_write_at: 5 # 👈 Frequency of batch writing logs to server (in seconds)
+```
+
+Add `SPEND_LOGS_URL` as an environment variable when starting the proxy 
+
+```bash
+docker run \
+    -v $(pwd)/litellm_config.yaml:/app/config.yaml \
+    -e DATABASE_URL="postgresql://.." \
+    -e SPEND_LOGS_URL="http://host.docker.internal:3000" \ # 👈 KEY CHANGE
+    -p 4000:4000 \
+    ghcr.io/berriai/litellm:main-latest \
+    --config /app/config.yaml --detailed_debug
+
+# Running on http://0.0.0.0:4000
+```
+
+**3. Test Proxy!**
+
+
+```bash
+curl --location 'http://0.0.0.0:4000/v1/chat/completions' \
+--header 'Content-Type: application/json' \
+--header 'Authorization: Bearer sk-1234' \
+--data '{
+    "model": "fake-openai-endpoint", 
+    "messages": [
+        {"role": "system", "content": "Be helpful"},
+        {"role": "user", "content": "What do you know?"}
+    ]
+}'
+```
+
+In your LiteLLM Spend Logs Server, you should see
+
+**Expected Response**
+
+```
+Received and stored 1 logs. Total logs in memory: 1
+...
+Flushed 1 log to the DB.
+```
+
+
+### Machine Specification
+
+A t2.micro should be sufficient to handle 1k logs / minute on this server. 
+
+This consumes at max 120MB, and <0.1 vCPU. 
+
+## 4. Switch off resetting budgets
+
+Add this to your config.yaml. (Only spend per Key, User and Team will be tracked - spend per API Call will not be written to the LiteLLM Database)
+```yaml
+general_settings:
+  disable_spend_logs: true
+  disable_reset_budget: true
+```
+
+## 5. Switch of `litellm.telemetry`
+
+Switch of all telemetry tracking done by litellm
+
+```yaml
+litellm_settings:
+  telemetry: False
+```
+
+## Machine Specifications to Deploy LiteLLM
+
+| Service | Spec | CPUs | Memory | Architecture | Version|
+| --- | --- | --- | --- | --- | --- | 
+| Server | `t2.small`. | `1vCPUs` | `8GB` | `x86` |
+| Redis Cache | - | - | - | - | 7.0+ Redis Engine|
+
+
+## Reference Kubernetes Deployment YAML
+
+Reference Kubernetes `deployment.yaml` that was load tested by us
+
+```yaml
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: litellm-deployment
+spec:
+  replicas: 3
+  selector:
+    matchLabels:
+      app: litellm
+  template:
+    metadata:
+      labels:
+        app: litellm
+    spec:
+      containers:
+        - name: litellm-container
+          image: ghcr.io/berriai/litellm:main-latest
+          imagePullPolicy: Always
+          env:
+            - name: AZURE_API_KEY
+              value: "d6******"
+            - name: AZURE_API_BASE
+              value: "https://ope******"
+            - name: LITELLM_MASTER_KEY
+              value: "sk-1234"
+            - name: DATABASE_URL
+              value: "po**********"
+          args:
+            - "--config"
+            - "/app/proxy_config.yaml"  # Update the path to mount the config file
+          volumeMounts:                 # Define volume mount for proxy_config.yaml
+            - name: config-volume
+              mountPath: /app
+              readOnly: true
+          livenessProbe:
+            httpGet:
+              path: /health/liveliness
+              port: 4000
+            initialDelaySeconds: 120
+            periodSeconds: 15
+            successThreshold: 1
+            failureThreshold: 3
+            timeoutSeconds: 10
+          readinessProbe:
+            httpGet:
+              path: /health/readiness
+              port: 4000
+            initialDelaySeconds: 120
+            periodSeconds: 15
+            successThreshold: 1
+            failureThreshold: 3
+            timeoutSeconds: 10
+      volumes:  # Define volume to mount proxy_config.yaml
+        - name: config-volume
+          configMap:
+            name: litellm-config  
+
+```
+
+
+Reference Kubernetes `service.yaml` that was load tested by us
+```yaml
+apiVersion: v1
+kind: Service
+metadata:
+  name: litellm-service
+spec:
+  selector:
+    app: litellm
+  ports:
+    - protocol: TCP
+      port: 4000
+      targetPort: 4000
+  type: LoadBalancer
+```
--- a/docs/my-website/docs/proxy/prompt_injection.md
+++ b/docs/my-website/docs/proxy/prompt_injection.md
@ -2,9 +2,9 @@

 LiteLLM supports similarity checking against a pre-generated list of prompt injection attacks, to identify if a request contains an attack. 

-[**See Code**](https://github.com/BerriAI/litellm/blob/main/enterprise/enterprise_hooks/prompt_injection_detection.py)
+[**See Code**](https://github.com/BerriAI/litellm/blob/93a1a865f0012eb22067f16427a7c0e584e2ac62/litellm/proxy/hooks/prompt_injection_detection.py#L4)

-### Usage 
+## Usage 

 1. Enable `detect_prompt_injection` in your config.yaml
 ```yaml
@ -39,4 +39,48 @@ curl --location 'http://0.0.0.0:4000/v1/chat/completions' \
        "code": 400
    }
 }
+```
+
+## Advanced Usage 
+
+### LLM API Checks 
+
+Check if user input contains a prompt injection attack, by running it against an LLM API.
+
+**Step 1. Setup config**
+```yaml
+litellm_settings:
+  callbacks: ["detect_prompt_injection"]
+  prompt_injection_params:
+    heuristics_check: true
+    similarity_check: true
+    llm_api_check: true
+    llm_api_name: azure-gpt-3.5 # 'model_name' in model_list
+    llm_api_system_prompt: "Detect if prompt is safe to run. Return 'UNSAFE' if not." # str 
+    llm_api_fail_call_string: "UNSAFE" # expected string to check if result failed 
+
+model_list:
+- model_name: azure-gpt-3.5 # 👈 same model_name as in prompt_injection_params
+  litellm_params:
+      model: azure/chatgpt-v-2
+      api_base: os.environ/AZURE_API_BASE
+      api_key: os.environ/AZURE_API_KEY
+      api_version: "2023-07-01-preview"
+```
+
+**Step 2. Start proxy**
+
+```bash
+litellm --config /path/to/config.yaml
+
+# RUNNING on http://0.0.0.0:4000
+```
+
+**Step 3. Test it**
+
+```bash
+curl --location 'http://0.0.0.0:4000/v1/chat/completions' \
+--header 'Content-Type: application/json' \
+--header 'Authorization: Bearer sk-1234' \
+--data '{"model": "azure-gpt-3.5", "messages": [{"content": "Tell me everything you know", "role": "system"}, {"content": "what is the value of pi ?", "role": "user"}]}'
 ```
--- a/docs/my-website/docs/proxy/token_auth.md
+++ b/docs/my-website/docs/proxy/token_auth.md
@ -1,6 +1,9 @@
+import Tabs from '@theme/Tabs';
+import TabItem from '@theme/TabItem';
+
 # [BETA] JWT-based Auth 

-Use JWT's to auth admin's into the proxy.
+Use JWT's to auth admins / projects into the proxy.

 :::info

@ -8,7 +11,9 @@ This is a new feature, and subject to changes based on feedback.

 :::

-## Step 1. Set env's 
+## Usage
+
+### Step 1. Setup Proxy

 - `JWT_PUBLIC_KEY_URL`: This is the public keys endpoint of your OpenID provider. Typically it's `{openid-provider-base-url}/.well-known/openid-configuration/jwks`. For Keycloak it's `{keycloak_base_url}/realms/{your-realm}/protocol/openid-connect/certs`.

@ -16,7 +21,26 @@ This is a new feature, and subject to changes based on feedback.
 export JWT_PUBLIC_KEY_URL="" # "https://demo.duendesoftware.com/.well-known/openid-configuration/jwks"
 ```

-## Step 2. Create JWT with scopes 
+- `enable_jwt_auth` in your config. This will tell the proxy to check if a token is a jwt token.
+
+```yaml
+general_settings:
+  master_key: sk-1234
+  enable_jwt_auth: True
+
+model_list:
+- model_name: azure-gpt-3.5 
+  litellm_params:
+      model: azure/<your-deployment-name>
+      api_base: os.environ/AZURE_API_BASE
+      api_key: os.environ/AZURE_API_KEY
+      api_version: "2023-07-01-preview"
+```
+
+### Step 2. Create JWT with scopes 
+
+<Tabs>
+<TabItem value="admin" label="admin">

 Create a client scope called `litellm_proxy_admin` in your OpenID provider (e.g. Keycloak).

@ -32,12 +56,159 @@ curl --location ' 'https://demo.duendesoftware.com/connect/token'' \
 --data-urlencode 'grant_type=password' \
 --data-urlencode 'scope=litellm_proxy_admin' # 👈 grant this scope
 ```
+</TabItem>
+<TabItem value="project" label="project">

-## Step 3. Create a proxy key with JWT 
+Create a JWT for your project on your OpenID provider (e.g. Keycloak).
+
+```bash
+curl --location ' 'https://demo.duendesoftware.com/connect/token'' \
+--header 'Content-Type: application/x-www-form-urlencoded' \
+--data-urlencode 'client_id={CLIENT_ID}' \ # 👈 project id
+--data-urlencode 'client_secret={CLIENT_SECRET}' \
+--data-urlencode 'grant_type=client_credential' \
+```
+
+</TabItem>
+</Tabs>
+
+### Step 3. Test your JWT 
+
+<Tabs>
+<TabItem value="key" label="/key/generate">

 ```bash
 curl --location '{proxy_base_url}/key/generate' \
 --header 'Authorization: Bearer eyJhbGciOiJSUzI1NiI...' \
 --header 'Content-Type: application/json' \
 --data '{}'
-```
+```
+</TabItem>
+<TabItem value="llm_call" label="/chat/completions">
+
+```bash
+curl --location 'http://0.0.0.0:4000/v1/chat/completions' \
+--header 'Content-Type: application/json' \
+--header 'Authorization: Bearer eyJhbGciOiJSUzI1...' \
+--data '{"model": "azure-gpt-3.5", "messages": [ { "role": "user", "content": "What's the weather like in Boston today?" } ]}'
+```
+
+</TabItem>
+</Tabs>
+
+## Advanced - Set Accepted JWT Scope Names 
+
+Change the string in JWT 'scopes', that litellm evaluates to see if a user has admin access.
+
+```yaml
+general_settings:
+  master_key: sk-1234
+  enable_jwt_auth: True
+  litellm_jwtauth:
+    admin_jwt_scope: "litellm-proxy-admin"
+```
+### JWT Scopes
+
+Here's what scopes on JWT-Auth tokens look like
+
+**Can be a list**
+```
+scope: ["litellm-proxy-admin",...]
+```
+
+**Can be a space-separated string**
+```
+scope: "litellm-proxy-admin ..."
+```
+
+## Advanced - Allowed Routes 
+
+Configure which routes a JWT can access via the config.
+
+By default: 
+
+- Admins: can access only management routes (`/team/*`, `/key/*`, `/user/*`)
+- Teams: can access only openai routes (`/chat/completions`, etc.)+ info routes (`/*/info`)
+
+[**See Code**](https://github.com/BerriAI/litellm/blob/b204f0c01c703317d812a1553363ab0cb989d5b6/litellm/proxy/_types.py#L95)
+
+**Admin Routes**
+```yaml
+general_settings:
+  master_key: sk-1234
+  enable_jwt_auth: True
+  litellm_jwtauth:
+    admin_jwt_scope: "litellm-proxy-admin"
+    admin_allowed_routes: ["/v1/embeddings"]
+```
+
+**Team Routes**
+```yaml
+general_settings:
+  master_key: sk-1234
+  enable_jwt_auth: True
+  litellm_jwtauth:
+    ...
+    team_jwt_scope: "litellm-team" # 👈 Set JWT Scope string
+    team_allowed_routes: ["/v1/chat/completions"] # 👈 Set accepted routes
+```
+
+## Advanced - Caching Public Keys 
+
+Control how long public keys are cached for (in seconds).
+
+```yaml
+general_settings:
+  master_key: sk-1234
+  enable_jwt_auth: True
+  litellm_jwtauth:
+    admin_jwt_scope: "litellm-proxy-admin"
+    admin_allowed_routes: ["/v1/embeddings"]
+    public_key_ttl: 600 # 👈 KEY CHANGE
+```
+
+## Advanced - Custom JWT Field 
+
+Set a custom field in which the team_id exists. By default, the 'client_id' field is checked. 
+
+```yaml
+general_settings:
+  master_key: sk-1234
+  enable_jwt_auth: True
+  litellm_jwtauth:
+    team_id_jwt_field: "client_id" # 👈 KEY CHANGE
+```
+
+## All Params
+
+[**See Code**](https://github.com/BerriAI/litellm/blob/b204f0c01c703317d812a1553363ab0cb989d5b6/litellm/proxy/_types.py#L95)
+
+
+
+
+## Advanced - Block Teams 
+
+To block all requests for a certain team id, use `/team/block`
+
+**Block Team**
+
+```bash
+curl --location 'http://0.0.0.0:4000/team/block' \
+--header 'Authorization: Bearer <admin-token>' \
+--header 'Content-Type: application/json' \
+--data '{
+    "team_id": "litellm-test-client-id-new" # 👈 set team id
+}'
+```
+
+**Unblock Team**
+
+```bash
+curl --location 'http://0.0.0.0:4000/team/unblock' \
+--header 'Authorization: Bearer <admin-token>' \
+--header 'Content-Type: application/json' \
+--data '{
+    "team_id": "litellm-test-client-id-new" # 👈 set team id
+}'
+```
+
--- a/docs/my-website/docs/proxy/ui.md
+++ b/docs/my-website/docs/proxy/ui.md
@ -47,8 +47,9 @@ Your Proxy Swagger is available on the root of the Proxy: e.g.: `http://localhos
 Set the following in your .env on the Proxy

 ```shell
-UI_USERNAME=ishaan-litellm
-UI_PASSWORD=langchain
+LITELLM_MASTER_KEY="sk-1234" # this is your master key for using the proxy server
+UI_USERNAME=ishaan-litellm   # username to sign in on UI
+UI_PASSWORD=langchain        # password to sign in on UI
 ```

 On accessing the LiteLLM UI, you will be prompted to enter your username, password
--- a/docs/my-website/docs/proxy/virtual_keys.md
+++ b/docs/my-website/docs/proxy/virtual_keys.md
@ -1,14 +1,14 @@
-# 🔑 Virtual Keys, Users
-Track Spend, Set budgets and create virtual keys for the proxy
-
-Grant other's temporary access to your proxy, with keys that expire after a set duration.
+import Tabs from '@theme/Tabs';
+import TabItem from '@theme/TabItem';

+# 🔑 Virtual Keys
+Track Spend, and control model access via virtual keys for the proxy

 :::info

 - 🔑 [UI to Generate, Edit, Delete Keys (with SSO)](https://docs.litellm.ai/docs/proxy/ui)
 - [Deploy LiteLLM Proxy with Key Management](https://docs.litellm.ai/docs/proxy/deploy#deploy-with-database)
- Dockerfile.database for LiteLLM Proxy + Key Management [here](https://github.com/BerriAI/litellm/blob/main/Dockerfile.database)
+- [Dockerfile.database for LiteLLM Proxy + Key Management](https://github.com/BerriAI/litellm/blob/main/Dockerfile.database)


 :::
@ -30,7 +30,7 @@ export DATABASE_URL=postgresql://<user>:<password>@<host>:<port>/<dbname>
 ```


-You can then generate temporary keys by hitting the `/key/generate` endpoint.
+You can then generate keys by hitting the `/key/generate` endpoint.

 [**See code**](https://github.com/BerriAI/litellm/blob/7a669a36d2689c7f7890bc9c93e04ff3c2641299/litellm/proxy/proxy_server.py#L672)

@ -46,8 +46,8 @@ model_list:
        model: ollama/llama2

 general_settings: 
-  master_key: sk-1234 # [OPTIONAL] if set all calls to proxy will require either this key or a valid generated token
-  database_url: "postgresql://<user>:<password>@<host>:<port>/<dbname>"
+  master_key: sk-1234 
+  database_url: "postgresql://<user>:<password>@<host>:<port>/<dbname>" # 👈 KEY CHANGE
 ```

 **Step 2: Start litellm**
@ -56,62 +56,220 @@ general_settings:
 litellm --config /path/to/config.yaml
 ```

-**Step 3: Generate temporary keys**
+**Step 3: Generate keys**

 ```shell 
 curl 'http://0.0.0.0:4000/key/generate' \
 --header 'Authorization: Bearer <your-master-key>' \
 --header 'Content-Type: application/json' \
--data-raw '{"models": ["gpt-3.5-turbo", "gpt-4", "claude-2"], "duration": "20m","metadata": {"user": "ishaan@berri.ai"}}'
+--data-raw '{"models": ["gpt-3.5-turbo", "gpt-4"], "metadata": {"user": "ishaan@berri.ai"}}'
 ```

+## Advanced - Spend Tracking 

-## /key/generate
+Get spend per:
+- key - via `/key/info` [Swagger](https://litellm-api.up.railway.app/#/key%20management/info_key_fn_key_info_get)
+- user - via `/user/info` [Swagger](https://litellm-api.up.railway.app/#/user%20management/user_info_user_info_get)
+- team - via `/team/info` [Swagger](https://litellm-api.up.railway.app/#/team%20management/team_info_team_info_get)  
+- ⏳ end-users - via `/end_user/info` - [Comment on this issue for end-user cost tracking](https://github.com/BerriAI/litellm/issues/2633)

-### Request
-```shell
-curl 'http://0.0.0.0:4000/key/generate' \
--header 'Authorization: Bearer <your-master-key>' \
--header 'Content-Type: application/json' \
--data-raw '{
-  "models": ["gpt-3.5-turbo", "gpt-4", "claude-2"],
-  "duration": "20m",
-  "metadata": {"user": "ishaan@berri.ai"},
-  "team_id": "core-infra",
-  "max_budget": 10,
-  "soft_budget": 5,
-}'
+**How is it calculated?**
+
+The cost per model is stored [here](https://github.com/BerriAI/litellm/blob/main/model_prices_and_context_window.json) and calculated by the [`completion_cost`](https://github.com/BerriAI/litellm/blob/db7974f9f216ee50b53c53120d1e3fc064173b60/litellm/utils.py#L3771) function.
+
+**How is it tracking?**
+
+Spend is automatically tracked for the key in the "LiteLLM_VerificationTokenTable". If the key has an attached 'user_id' or 'team_id', the spend for that user is tracked in the "LiteLLM_UserTable", and team in the "LiteLLM_TeamTable".
+
+<Tabs>
+<TabItem value="key-info" label="Key Spend">
+
+You can get spend for a key by using the `/key/info` endpoint. 
+
+```bash
+curl 'http://0.0.0.0:4000/key/info?key=<user-key>' \
+     -X GET \
+     -H 'Authorization: Bearer <your-master-key>'
 ```

+This is automatically updated (in USD) when calls are made to /completions, /chat/completions, /embeddings using litellm's completion_cost() function. [**See Code**](https://github.com/BerriAI/litellm/blob/1a6ea20a0bb66491968907c2bfaabb7fe45fc064/litellm/utils.py#L1654). 

-Request Params:
-
- `duration`: *Optional[str]* - Specify the length of time the token is valid for. You can set duration as seconds ("30s"), minutes ("30m"), hours ("30h"), days ("30d").
- `key_alias`: *Optional[str]* - User defined key alias
- `team_id`: *Optional[str]* - The team id of the user
- `models`: *Optional[list]* - Model_name's a user is allowed to call. (if empty, key is allowed to call all models)
- `aliases`: *Optional[dict]* - Any alias mappings, on top of anything in the config.yaml model list. - https://docs.litellm.ai/docs/proxy/virtual_keys#managing-auth---upgradedowngrade-models
- `config`: *Optional[dict]* - any key-specific configs, overrides config in config.yaml
- `spend`: *Optional[int]* - Amount spent by key. Default is 0. Will be updated by proxy whenever key is used. https://docs.litellm.ai/docs/proxy/virtual_keys#managing-auth---tracking-spend
- `max_budget`: *Optional[float]* - Specify max budget for a given key.
- `soft_budget`: *Optional[float]* - Specify soft limit budget for a given key. Get Alerts when key hits its soft budget
- `model_max_budget`: *Optional[dict[str, float]]* - Specify max budget for each model, `model_max_budget={"gpt4": 0.5, "gpt-5": 0.01}`
- `max_parallel_requests`: *Optional[int]* - Rate limit a user based on the number of parallel requests. Raises 429 error, if user's parallel requests > x.
- `metadata`: *Optional[dict]* - Metadata for key, store information for key. Example metadata = {"team": "core-infra", "app": "app2", "email": "ishaan@berri.ai" }
-
-
-### Response
+**Sample response**

 ```python
 {
-    "key": "sk-kdEXbIqZRwEeEiHwdg7sFA", # Bearer token
-    "expires": "2023-11-19T01:38:25.834000+00:00" # datetime object
-    "key_name": "sk-...7sFA" # abbreviated key string, ONLY stored in db if `allow_user_auth: true` set - [see](./ui.md)
-    ...
+    "key": "sk-tXL0wt5-lOOVK9sfY2UacA",
+    "info": {
+        "token": "sk-tXL0wt5-lOOVK9sfY2UacA",
+        "spend": 0.0001065, # 👈 SPEND
+        "expires": "2023-11-24T23:19:11.131000Z",
+        "models": [
+            "gpt-3.5-turbo",
+            "gpt-4",
+            "claude-2"
+        ],
+        "aliases": {
+            "mistral-7b": "gpt-3.5-turbo"
+        },
+        "config": {}
+    }
 }
 ```

-### Upgrade/Downgrade Models 
+</TabItem>
+<TabItem value="user-info" label="User Spend">
+
+**1. Create a user**
+
+```bash
+curl --location 'http://localhost:4000/user/new' \
+--header 'Authorization: Bearer <your-master-key>' \
+--header 'Content-Type: application/json' \
+--data-raw '{user_email: "krrish@berri.ai"}' 
+```
+
+**Expected Response**
+
+```bash
+{
+    ...
+    "expires": "2023-12-22T09:53:13.861000Z",
+    "user_id": "my-unique-id", # 👈 unique id
+    "max_budget": 0.0
+}
+```
+
+**2. Create a key for that user**
+
+```bash
+curl 'http://0.0.0.0:4000/key/generate' \
+--header 'Authorization: Bearer <your-master-key>' \
+--header 'Content-Type: application/json' \
+--data-raw '{"models": ["gpt-3.5-turbo", "gpt-4"], "user_id": "my-unique-id"}'
+```
+
+Returns a key - `sk-...`.
+
+**3. See spend for user**
+
+```bash
+curl 'http://0.0.0.0:4000/user/info?user_id=my-unique-id' \
+     -X GET \
+     -H 'Authorization: Bearer <your-master-key>'
+```
+
+Expected Response
+
+```bash
+{
+  ...
+  "spend": 0 # 👈 SPEND
+}
+```
+
+</TabItem>
+<TabItem value="team-info" label="Team Spend">
+
+Use teams, if you want keys to be owned by multiple people (e.g. for a production app).
+
+**1. Create a team**
+
+```bash
+curl --location 'http://localhost:4000/team/new' \
+--header 'Authorization: Bearer <your-master-key>' \
+--header 'Content-Type: application/json' \
+--data-raw '{"team_alias": "my-awesome-team"}' 
+```
+
+**Expected Response**
+
+```bash
+{
+    ...
+    "expires": "2023-12-22T09:53:13.861000Z",
+    "team_id": "my-unique-id", # 👈 unique id
+    "max_budget": 0.0
+}
+```
+
+**2. Create a key for that team**
+
+```bash
+curl 'http://0.0.0.0:4000/key/generate' \
+--header 'Authorization: Bearer <your-master-key>' \
+--header 'Content-Type: application/json' \
+--data-raw '{"models": ["gpt-3.5-turbo", "gpt-4"], "team_id": "my-unique-id"}'
+```
+
+Returns a key - `sk-...`.
+
+**3. See spend for team**
+
+```bash
+curl 'http://0.0.0.0:4000/team/info?team_id=my-unique-id' \
+     -X GET \
+     -H 'Authorization: Bearer <your-master-key>'
+```
+
+Expected Response
+
+```bash
+{
+  ...
+  "spend": 0 # 👈 SPEND
+}
+```
+
+</TabItem>
+</Tabs>
+
+## Advanced - Model Access
+
+### Restrict models by `team_id`
+`litellm-dev` can only access `azure-gpt-3.5`
+
+**1. Create a team via `/team/new`**
+```shell
+curl --location 'http://localhost:4000/team/new' \
+--header 'Authorization: Bearer <your-master-key>' \
+--header 'Content-Type: application/json' \
+--data-raw '{
+  "team_alias": "litellm-dev",
+  "models": ["azure-gpt-3.5"]
+}' 
+
+# returns {...,"team_id": "my-unique-id"}
+```
+
+**2. Create a key for team**
+```shell
+curl --location 'http://localhost:4000/key/generate' \
+--header 'Authorization: Bearer sk-1234' \
+--header 'Content-Type: application/json' \
+--data-raw '{"team_id": "my-unique-id"}'
+```
+
+**3. Test it**
+```shell
+curl --location 'http://0.0.0.0:4000/chat/completions' \
+    --header 'Content-Type: application/json' \
+    --header 'Authorization: Bearer sk-qo992IjKOC2CHKZGRoJIGA' \
+    --data '{
+        "model": "BEDROCK_GROUP",
+        "messages": [
+            {
+                "role": "user",
+                "content": "hi"
+            }
+        ]
+    }'
+```
+
+```shell
+{"error":{"message":"Invalid model for team litellm-dev: BEDROCK_GROUP.  Valid models for team are: ['azure-gpt-3.5']\n\n\nTraceback (most recent call last):\n  File \"/Users/ishaanjaffer/Github/litellm/litellm/proxy/proxy_server.py\", line 2298, in chat_completion\n    _is_valid_team_configs(\n  File \"/Users/ishaanjaffer/Github/litellm/litellm/proxy/utils.py\", line 1296, in _is_valid_team_configs\n    raise Exception(\nException: Invalid model for team litellm-dev: BEDROCK_GROUP.  Valid models for team are: ['azure-gpt-3.5']\n\n","type":"None","param":"None","code":500}}%            
+```         
+
+### Model Aliases

 If a user is expected to use a given model (i.e. gpt3-5), and you want to:

@ -189,421 +347,9 @@ curl --location 'http://localhost:4000/key/generate' \
 			"max_budget": 0,}'
 ```

+## Advanced - Custom Auth 

-## /key/info
-
-### Request
-```shell
-curl -X GET "http://0.0.0.0:4000/key/info?key=sk-02Wr4IAlN3NvPXvL5JVvDA" \
-H "Authorization: Bearer sk-1234"
-```
-
-Request Params:
- key: str - The key you want the info for
-
-### Response
-
-`token` is the hashed key (The DB stores the hashed key for security)
-```json
-{
-  "key": "sk-02Wr4IAlN3NvPXvL5JVvDA",
-  "info": {
-    "token": "80321a12d03412c527f2bd9db5fabd746abead2e1d50b435a534432fbaca9ef5",
-    "spend": 0.0,
-    "expires": "2024-01-18T23:52:09.125000+00:00",
-    "models": ["azure-gpt-3.5", "azure-embedding-model"],
-    "aliases": {},
-    "config": {},
-    "user_id": "ishaan2@berri.ai",
-    "team_id": "None",
-    "max_parallel_requests": null,
-    "metadata": {}
-  }
-}
-
-
-```
-
-## /key/update
-
-### Request
-```shell
-curl 'http://0.0.0.0:4000/key/update' \
--header 'Authorization: Bearer <your-master-key>' \
--header 'Content-Type: application/json' \
--data-raw '{
-  "key": "sk-kdEXbIqZRwEeEiHwdg7sFA",
-  "models": ["gpt-3.5-turbo", "gpt-4", "claude-2"],
-  "metadata": {"user": "ishaan@berri.ai"},
-  "team_id": "core-infra"
-}'
-```
-
-Request Params:
- key: str - The key that needs to be updated.
-
- models: list or null (optional) - Specify the models a token has access to. If null, then the token has access to all models on the server.
-
- metadata: dict or null (optional) - Pass metadata for the updated token. If null, defaults to an empty dictionary.
-
- team_id: str or null (optional) - Specify the team_id for the associated key.
-
-### Response
-
-```json
-{
-  "key": "sk-kdEXbIqZRwEeEiHwdg7sFA",
-  "models": ["gpt-3.5-turbo", "gpt-4", "claude-2"],
-  "metadata": {
-    "user": "ishaan@berri.ai"
-  }
-}
-
-```
-
-
-## /key/delete
-
-### Request
-```shell
-curl 'http://0.0.0.0:4000/key/delete' \
--header 'Authorization: Bearer <your-master-key>' \
--header 'Content-Type: application/json' \
--data-raw '{
-  "keys": ["sk-kdEXbIqZRwEeEiHwdg7sFA"]
-}'
-```
-
-Request Params:
- keys: List[str] - List of keys to delete
-
-### Response
-
-```json
-{
-  "deleted_keys": ["sk-kdEXbIqZRwEeEiHwdg7sFA"]
-}
-```
-
-## /user/new
-
-### Request
-
-All [key/generate params supported](#keygenerate) for creating a user
-```shell
-curl 'http://0.0.0.0:4000/user/new' \
--header 'Authorization: Bearer sk-1234' \
--header 'Content-Type: application/json' \
--data-raw '{
-  "user_id": "ishaan1",
-  "user_email": "ishaan@litellm.ai",
-  "user_role": "admin",
-  "team_id": "cto-team",
-  "max_budget": 20,
-  "budget_duration": "1h"
-
-}'
-```
-
-Request Params:
-
- user_id: str (optional - defaults to uuid)  - The unique identifier for the user.
- user_email: str (optional - defaults to "")  - The email address associated with the user.
- user_role: str (optional - defaults to "app_user") - The role assigned to the user. Can be "admin", "app_owner", "app_user"
-
-**Possible `user_role` values**
-```
-"admin" - Maintaining the proxy and owning the overall budget
-"app_owner" - employees maintaining the apps, each owner may own more than one app
-"app_user" - users who know nothing about the proxy. These users get created when you pass `user` to /chat/completions
-```
- team_id: str (optional - defaults to "") - The identifier for the team to which the user belongs.
- max_budget: float (optional - defaults to `null`) - The maximum budget allocated for the user. No budget checks done if `max_budget==null`
- budget_duration: str (optional - defaults to `null`) - The duration for which the budget is valid, e.g., "1h", "1d"
-
-### Response
-A key will be generated for the new user created
-
-```shell
-{
-  "models": [],
-  "spend": 0.0,
-  "max_budget": null,
-  "user_id": "ishaan1",
-  "team_id": null,
-  "max_parallel_requests": null,
-  "metadata": {},
-  "tpm_limit": null,
-  "rpm_limit": null,
-  "budget_duration": null,
-  "allowed_cache_controls": [],
-  "key_alias": null,
-  "duration": null,
-  "aliases": {},
-  "config": {},
-  "key": "sk-JflB33ucTqc2NYvNAgiBCA",
-  "key_name": null,
-  "expires": null
-}
-```
-
-
-## /user/info
-
-### Request
-
-#### View all Users
-If you're trying to view all users, we recommend using pagination with the following args
- `view_all=true`
- `page=0` Optional(int) min = 0, default=0
- `page_size=25` Optional(int) min = 1, default = 25
-```shell
-curl -X GET "http://0.0.0.0:4000/user/info?view_all=true&page=0&page_size=25" -H "Authorization: Bearer sk-1234"
-```
-
-#### View specific user_id
-```shell
-curl -X GET "http://0.0.0.0:4000/user/info?user_id=228da235-eef0-4c30-bf53-5d6ac0d278c2" -H "Authorization: Bearer sk-1234"
-```
-
-### Response
-View user spend, budget, models, keys and teams 
-
-```json
-{
-  "user_id": "228da235-eef0-4c30-bf53-5d6ac0d278c2",
-  "user_info": {
-    "user_id": "228da235-eef0-4c30-bf53-5d6ac0d278c2",
-    "team_id": null,
-    "teams": [],
-    "user_role": "app_user",
-    "max_budget": null,
-    "spend": 200000.0,
-    "user_email": null,
-    "models": [],
-    "max_parallel_requests": null,
-    "tpm_limit": null,
-    "rpm_limit": null,
-    "budget_duration": null,
-    "budget_reset_at": null,
-    "allowed_cache_controls": [],
-    "model_spend": {
-      "chatgpt-v-2": 200000
-    },
-    "model_max_budget": {}
-  },
-  "keys": [
-    {
-      "token": "16c337f9df00a0e6472627e39a2ed02e67bc9a8a760c983c4e9b8cad7954f3c0",
-      "key_name": null,
-      "key_alias": null,
-      "spend": 200000.0,
-      "expires": null,
-      "models": [],
-      "aliases": {},
-      "config": {},
-      "user_id": "228da235-eef0-4c30-bf53-5d6ac0d278c2",
-      "team_id": null,
-      "permissions": {},
-      "max_parallel_requests": null,
-      "metadata": {},
-      "tpm_limit": null,
-      "rpm_limit": null,
-      "max_budget": null,
-      "budget_duration": null,
-      "budget_reset_at": null,
-      "allowed_cache_controls": [],
-      "model_spend": {
-        "chatgpt-v-2": 200000
-      },
-      "model_max_budget": {}
-    }
-  ],
-  "teams": []
-}
-
-```
-
-## Advanced 
-### Upperbound /key/generate params
-Use this, if you need to control the upperbound that users can use for `max_budget`, `budget_duration` or any `key/generate` param per key. 
-
-Set `litellm_settings:upperbound_key_generate_params`:
-```yaml
-litellm_settings:
-  upperbound_key_generate_params:
-    max_budget: 100 # upperbound of $100, for all /key/generate requests
-    duration: "30d" # upperbound of 30 days for all /key/generate requests
-```
-
-** Expected Behavior **
-
- Send a `/key/generate` request with `max_budget=200`
- Key will be created with `max_budget=100` since 100 is the upper bound
-
-### Default /key/generate params
-Use this, if you need to control the default `max_budget` or any `key/generate` param per key. 
-
-When a `/key/generate` request does not specify `max_budget`, it will use the `max_budget` specified in `default_key_generate_params`
-
-Set `litellm_settings:default_key_generate_params`:
-```yaml
-litellm_settings:
-  default_key_generate_params:
-    max_budget: 1.5000
-    models: ["azure-gpt-3.5"]
-    duration:     # blank means `null`
-    metadata: {"setting":"default"}
-    team_id: "core-infra"
-```
-
-### Restrict models by `team_id`
-`litellm-dev` can only access `azure-gpt-3.5`
-
-```yaml
-litellm_settings:
-  default_team_settings:
-    - team_id: litellm-dev
-      models: ["azure-gpt-3.5"]
-```
-
-#### Create key with team_id="litellm-dev"
-```shell
-curl --location 'http://localhost:4000/key/generate' \
--header 'Authorization: Bearer sk-1234' \
--header 'Content-Type: application/json' \
--data-raw '{"team_id": "litellm-dev"}'
-```
-
-#### Use Key to call invalid model - Fails 
-```shell
-curl --location 'http://0.0.0.0:4000/chat/completions' \
-    --header 'Content-Type: application/json' \
-    --header 'Authorization: Bearer sk-qo992IjKOC2CHKZGRoJIGA' \
-    --data '{
-        "model": "BEDROCK_GROUP",
-        "messages": [
-            {
-                "role": "user",
-                "content": "hi"
-            }
-        ]
-    }'
-```
-
-```shell
-{"error":{"message":"Invalid model for team litellm-dev: BEDROCK_GROUP.  Valid models for team are: ['azure-gpt-3.5']\n\n\nTraceback (most recent call last):\n  File \"/Users/ishaanjaffer/Github/litellm/litellm/proxy/proxy_server.py\", line 2298, in chat_completion\n    _is_valid_team_configs(\n  File \"/Users/ishaanjaffer/Github/litellm/litellm/proxy/utils.py\", line 1296, in _is_valid_team_configs\n    raise Exception(\nException: Invalid model for team litellm-dev: BEDROCK_GROUP.  Valid models for team are: ['azure-gpt-3.5']\n\n","type":"None","param":"None","code":500}}%            
-```         
-
-### Set Budgets - Per Key
-
-Set `max_budget` in (USD $) param in the `key/generate` request. By default the `max_budget` is set to `null` and is not checked for keys
-
-```shell
-curl 'http://0.0.0.0:4000/key/generate' \
--header 'Authorization: Bearer <your-master-key>' \
--header 'Content-Type: application/json' \
--data-raw '{
-  "metadata": {"user": "ishaan@berri.ai"},
-  "team_id": "core-infra",
-  "max_budget": 10,
-}'
-```
-
-#### Expected Behaviour
- Costs Per key get auto-populated in `LiteLLM_VerificationToken` Table
- After the key crosses it's `max_budget`, requests fail
-
-Example Request to `/chat/completions` when key has crossed budget
-
-```shell
-curl --location 'http://0.0.0.0:4000/chat/completions' \
-  --header 'Content-Type: application/json' \
-  --header 'Authorization: Bearer sk-ULl_IKCVFy2EZRzQB16RUA' \
-  --data ' {
-  "model": "azure-gpt-3.5",
-  "user": "e09b4da8-ed80-4b05-ac93-e16d9eb56fca",
-  "messages": [
-      {
-      "role": "user",
-      "content": "respond in 50 lines"
-      }
-  ],
-}'
-```
-
-
-Expected Response from `/chat/completions` when key has crossed budget
-```shell
-{
-  "detail":"Authentication Error, ExceededTokenBudget: Current spend for token: 7.2e-05; Max Budget for Token: 2e-07"
-}   
-```
-
-
-### Set Budgets - Per User
-
-LiteLLM exposes a `/user/new` endpoint to create budgets for users, that persist across multiple keys. 
-
-This is documented in the swagger (live on your server root endpoint - e.g. `http://0.0.0.0:4000/`). Here's an example request. 
-
-```shell 
-curl --location 'http://localhost:4000/user/new' \
--header 'Authorization: Bearer <your-master-key>' \
--header 'Content-Type: application/json' \
--data-raw '{"models": ["azure-models"], "max_budget": 0, "user_id": "krrish3@berri.ai"}' 
-```
-The request is a normal `/key/generate` request body + a `max_budget` field. 
-
-**Sample Response**
-
-```shell
-{
-    "key": "sk-YF2OxDbrgd1y2KgwxmEA2w",
-    "expires": "2023-12-22T09:53:13.861000Z",
-    "user_id": "krrish3@berri.ai",
-    "max_budget": 0.0
-}
-```
-
-### Tracking Spend 
-
-You can get spend for a key by using the `/key/info` endpoint. 
-
-```bash
-curl 'http://0.0.0.0:4000/key/info?key=<user-key>' \
-     -X GET \
-     -H 'Authorization: Bearer <your-master-key>'
-```
-
-This is automatically updated (in USD) when calls are made to /completions, /chat/completions, /embeddings using litellm's completion_cost() function. [**See Code**](https://github.com/BerriAI/litellm/blob/1a6ea20a0bb66491968907c2bfaabb7fe45fc064/litellm/utils.py#L1654). 
-
-**Sample response**
-
-```python
-{
-    "key": "sk-tXL0wt5-lOOVK9sfY2UacA",
-    "info": {
-        "token": "sk-tXL0wt5-lOOVK9sfY2UacA",
-        "spend": 0.0001065,
-        "expires": "2023-11-24T23:19:11.131000Z",
-        "models": [
-            "gpt-3.5-turbo",
-            "gpt-4",
-            "claude-2"
-        ],
-        "aliases": {
-            "mistral-7b": "gpt-3.5-turbo"
-        },
-        "config": {}
-    }
-}
-```
-
-
-### Custom Auth 
-
-You can now override the default api key auth. 
+You can now override the default api key auth.

 Here's how: 

@ -737,4 +483,56 @@ litellm_settings:

 general_settings:
  custom_key_generate: custom_auth.custom_generate_key_fn
-```
+```
+
+
+## Upperbound /key/generate params
+Use this, if you need to set default upperbounds for `max_budget`, `budget_duration` or any `key/generate` param per key. 
+
+Set `litellm_settings:upperbound_key_generate_params`:
+```yaml
+litellm_settings:
+  upperbound_key_generate_params:
+    max_budget: 100 # upperbound of $100, for all /key/generate requests
+    duration: "30d" # upperbound of 30 days for all /key/generate requests
+```
+
+** Expected Behavior **
+
+- Send a `/key/generate` request with `max_budget=200`
+- Key will be created with `max_budget=100` since 100 is the upper bound
+
+## Default /key/generate params
+Use this, if you need to control the default `max_budget` or any `key/generate` param per key. 
+
+When a `/key/generate` request does not specify `max_budget`, it will use the `max_budget` specified in `default_key_generate_params`
+
+Set `litellm_settings:default_key_generate_params`:
+```yaml
+litellm_settings:
+  default_key_generate_params:
+    max_budget: 1.5000
+    models: ["azure-gpt-3.5"]
+    duration:     # blank means `null`
+    metadata: {"setting":"default"}
+    team_id: "core-infra"
+```
+
+## Endpoints
+
+### Keys 
+
+#### [**👉 API REFERENCE DOCS**](https://litellm-api.up.railway.app/#/key%20management/)
+
+### Users
+
+#### [**👉 API REFERENCE DOCS**](https://litellm-api.up.railway.app/#/user%20management/)
+
+
+### Teams
+
+#### [**👉 API REFERENCE DOCS**](https://litellm-api.up.railway.app/#/team%20management)
+
+
+
+
--- a/docs/my-website/docs/routing.md
+++ b/docs/my-website/docs/routing.md
@ -442,6 +442,8 @@ If a call fails after num_retries, fall back to another model group.

 If the error is a context window exceeded error, fall back to a larger model group (if given). 

+Fallbacks are done in-order - ["gpt-3.5-turbo, "gpt-4", "gpt-4-32k"], will do 'gpt-3.5-turbo' first, then 'gpt-4', etc.
+
 ```python
 from litellm import Router

@ -551,6 +553,156 @@ router = Router(model_list: Optional[list] = None,
 				 cache_responses=True)
 ```

+## Pre-Call Checks (Context Window)
+
+Enable pre-call checks to filter out deployments with context window limit < messages for a call.
+
+<Tabs>
+<TabItem value="sdk" label="SDK">
+
+**1. Enable pre-call checks**
+```python 
+from litellm import Router 
+# ...
+router = Router(model_list=model_list, enable_pre_call_checks=True) # 👈 Set to True
+```
+
+**2. (Azure-only) Set base model**
+
+For azure deployments, set the base model. Pick the base model from [this list](https://github.com/BerriAI/litellm/blob/main/model_prices_and_context_window.json), all the azure models start with `azure/`. 
+
+```python
+model_list = [
+            {
+                "model_name": "gpt-3.5-turbo", # model group name
+                "litellm_params": {  # params for litellm completion/embedding call
+                    "model": "azure/chatgpt-v-2",
+                    "api_key": os.getenv("AZURE_API_KEY"),
+                    "api_version": os.getenv("AZURE_API_VERSION"),
+                    "api_base": os.getenv("AZURE_API_BASE"),
+                },
+				"model_info": {
+					"base_model": "azure/gpt-35-turbo", # 👈 SET BASE MODEL
+				}
+            },
+            {
+                "model_name": "gpt-3.5-turbo", # model group name
+                "litellm_params": {  # params for litellm completion/embedding call
+                    "model": "gpt-3.5-turbo-1106",
+                    "api_key": os.getenv("OPENAI_API_KEY"),
+                },
+            },
+        ]
+```
+
+**3. Test it!**
+
+```python
+"""
+- Give a gpt-3.5-turbo model group with different context windows (4k vs. 16k)
+- Send a 5k prompt
+- Assert it works
+"""
+from litellm import Router
+import os
+
+try:
+model_list = [
+	{
+		"model_name": "gpt-3.5-turbo",  # model group name
+		"litellm_params": {  # params for litellm completion/embedding call
+			"model": "azure/chatgpt-v-2",
+			"api_key": os.getenv("AZURE_API_KEY"),
+			"api_version": os.getenv("AZURE_API_VERSION"),
+			"api_base": os.getenv("AZURE_API_BASE"),
+		},
+		"model_info": {
+			"base_model": "azure/gpt-35-turbo", 
+		}
+	},
+	{
+		"model_name": "gpt-3.5-turbo",  # model group name
+		"litellm_params": {  # params for litellm completion/embedding call
+			"model": "gpt-3.5-turbo-1106",
+			"api_key": os.getenv("OPENAI_API_KEY"),
+		},
+	},
+]
+
+router = Router(model_list=model_list, enable_pre_call_checks=True) 
+
+text = "What is the meaning of 42?" * 5000
+
+response = router.completion(
+	model="gpt-3.5-turbo",
+	messages=[
+		{"role": "system", "content": text},
+		{"role": "user", "content": "Who was Alexander?"},
+	],
+)
+
+print(f"response: {response}")
+```
+</TabItem>
+<TabItem value="proxy" label="Proxy">
+
+**1. Setup config**
+
+For azure deployments, set the base model. Pick the base model from [this list](https://github.com/BerriAI/litellm/blob/main/model_prices_and_context_window.json), all the azure models start with azure/.
+
+```yaml
+router_settings:
+	enable_pre_call_checks: true # 1. Enable pre-call checks
+
+model_list:
+	- model_name: gpt-3.5-turbo
+	  litellm_params:
+		model: azure/chatgpt-v-2
+		api_base: os.environ/AZURE_API_BASE
+		api_key: os.environ/AZURE_API_KEY
+		api_version: "2023-07-01-preview"
+	  model_info:
+		base_model: azure/gpt-4-1106-preview # 2. 👈 (azure-only) SET BASE MODEL
+	
+	- model_name: gpt-3.5-turbo
+	  litellm_params:
+		model: gpt-3.5-turbo-1106
+		api_key: os.environ/OPENAI_API_KEY
+```
+
+**2. Start proxy**
+
+```bash
+litellm --config /path/to/config.yaml
+
+# RUNNING on http://0.0.0.0:4000
+```
+
+**3. Test it!**
+
+```python
+import openai
+client = openai.OpenAI(
+    api_key="anything",
+    base_url="http://0.0.0.0:4000"
+)
+
+text = "What is the meaning of 42?" * 5000
+
+# request sent to model set on litellm proxy, `litellm --model`
+response = client.chat.completions.create(
+    model="gpt-3.5-turbo",
+    messages = [
+        {"role": "system", "content": text},
+		{"role": "user", "content": "Who was Alexander?"},
+    ],
+)
+
+print(response)
+```
+</TabItem>
+</Tabs>
+
 ## Caching across model groups

 If you want to cache across 2 different model groups (e.g. azure deployments, and openai), use caching groups. 
--- a/docs/my-website/docs/tutorials/ab_test_llms.md
+++ b/docs/my-website/docs/tutorials/ab_test_llms.md
@ -95,5 +95,4 @@ completion_with_split_tests(
 )
 ```

-### A/B Testing Dashboard after running code - https://admin.litellm.ai/
-<Image img={require('../../img/ab_test_logs.png')} />
+
--- a/docs/my-website/docs/tutorials/instructor.md
+++ b/docs/my-website/docs/tutorials/instructor.md
@ -0,0 +1,95 @@
+# Instructor - Function Calling
+
+Use LiteLLM Router with [jxnl's instructor library](https://github.com/jxnl/instructor) for function calling in prod. 
+
+## Usage
+
+```python
+import litellm
+from litellm import Router
+import instructor
+from pydantic import BaseModel
+
+litellm.set_verbose = True # 👈 print DEBUG LOGS
+
+client = instructor.patch(
+    Router(
+        model_list=[
+            {
+                "model_name": "gpt-3.5-turbo",  openai model name
+                "litellm_params": {  # params for litellm completion/embedding call - e.g.: https://github.com/BerriAI/litellm/blob/62a591f90c99120e1a51a8445f5c3752586868ea/litellm/router.py#L111
+                    "model": "azure/chatgpt-v-2",
+                    "api_key": os.getenv("AZURE_API_KEY"),
+                    "api_version": os.getenv("AZURE_API_VERSION"),
+                    "api_base": os.getenv("AZURE_API_BASE"),
+                },
+            }
+        ]
+    )
+)
+
+
+class UserDetail(BaseModel):
+    name: str
+    age: int
+
+
+user = client.chat.completions.create(
+    model="gpt-3.5-turbo",
+    response_model=UserDetail,
+    messages=[
+        {"role": "user", "content": "Extract Jason is 25 years old"},
+    ],
+)
+
+assert isinstance(user, UserDetail)
+assert user.name == "Jason"
+assert user.age == 25
+
+print(f"user: {user}")
+```
+
+## Async Calls
+
+```python
+import litellm
+from litellm import Router
+import instructor, asyncio
+from pydantic import BaseModel
+
+aclient = instructor.apatch(
+    Router(
+        model_list=[
+            {
+                "model_name": "gpt-3.5-turbo",
+                "litellm_params": {
+                    "model": "azure/chatgpt-v-2",
+                    "api_key": os.getenv("AZURE_API_KEY"),
+                    "api_version": os.getenv("AZURE_API_VERSION"),
+                    "api_base": os.getenv("AZURE_API_BASE"),
+                },
+            }
+        ],
+        default_litellm_params={"acompletion": True}, # 👈 IMPORTANT - tells litellm to route to async completion function.
+    )
+)
+
+
+class UserExtract(BaseModel):
+    name: str
+    age: int
+
+
+async def main():
+    model = await aclient.chat.completions.create(
+        model="gpt-3.5-turbo",
+        response_model=UserExtract,
+        messages=[
+            {"role": "user", "content": "Extract jason is 25 years old"},
+        ],
+    )
+    print(f"model: {model}")
+
+
+asyncio.run(main())
+```
--- a/docs/my-website/img/ab_test_code.png
+++ b/docs/my-website/img/ab_test_code.png
--- a/docs/my-website/img/ab_test_logs.png
+++ b/docs/my-website/img/ab_test_logs.png
--- a/docs/my-website/img/add_model.png
+++ b/docs/my-website/img/add_model.png
--- a/docs/my-website/img/admin_dashboard.png
+++ b/docs/my-website/img/admin_dashboard.png
--- a/docs/my-website/img/budget_alerts.png
+++ b/docs/my-website/img/budget_alerts.png
--- a/docs/my-website/img/create_key.png
+++ b/docs/my-website/img/create_key.png
--- a/docs/my-website/img/dashboard.png
+++ b/docs/my-website/img/dashboard.png
--- a/docs/my-website/img/dashboard_log_row.png
+++ b/docs/my-website/img/dashboard_log_row.png
--- a/docs/my-website/img/end_user_enforcement.png
+++ b/docs/my-website/img/end_user_enforcement.png
--- a/docs/my-website/img/lite_logs.png
+++ b/docs/my-website/img/lite_logs.png
--- a/docs/my-website/img/lite_logs2.png
+++ b/docs/my-website/img/lite_logs2.png
--- a/docs/my-website/img/spend_per_api_key.png
+++ b/docs/my-website/img/spend_per_api_key.png
--- a/docs/my-website/img/term_output.png
+++ b/docs/my-website/img/term_output.png
--- a/docs/my-website/img/test_alert.png
+++ b/docs/my-website/img/test_alert.png
--- a/docs/my-website/img/user_auth_screen.png
+++ b/docs/my-website/img/user_auth_screen.png
--- a/docs/my-website/img/user_create_key_screen.png
+++ b/docs/my-website/img/user_create_key_screen.png
--- a/docs/my-website/package-lock.json
+++ b/docs/my-website/package-lock.json
@ -5561,12 +5561,12 @@
      }
    },
    "node_modules/body-parser": {
-      "version": "1.20.1",
-      "resolved": "https://registry.npmjs.org/body-parser/-/body-parser-1.20.1.tgz",
-      "integrity": "sha512-jWi7abTbYwajOytWCQc37VulmWiRae5RyTpaCyDcS5/lMdtwSz5lOpDE67srw/HYe35f1z3fDQw+3txg7gNtWw==",
+      "version": "1.20.2",
+      "resolved": "https://registry.npmjs.org/body-parser/-/body-parser-1.20.2.tgz",
+      "integrity": "sha512-ml9pReCu3M61kGlqoTm2umSXTlRTuGTx0bfYj+uIUKKYycG5NtSbeetV3faSU6R7ajOPw0g/J1PvK4qNy7s5bA==",
      "dependencies": {
        "bytes": "3.1.2",
-        "content-type": "~1.0.4",
+        "content-type": "~1.0.5",
        "debug": "2.6.9",
        "depd": "2.0.0",
        "destroy": "1.2.0",
@ -5574,7 +5574,7 @@
        "iconv-lite": "0.4.24",
        "on-finished": "2.4.1",
        "qs": "6.11.0",
-        "raw-body": "2.5.1",
+        "raw-body": "2.5.2",
        "type-is": "~1.6.18",
        "unpipe": "1.0.0"
      },
@ -6707,9 +6707,9 @@
      "integrity": "sha512-ASFBup0Mz1uyiIjANan1jzLQami9z1PoYSZCiiYW2FczPbenXc45FZdBZLzOT+r6+iciuEModtmCti+hjaAk0A=="
    },
    "node_modules/cookie": {
-      "version": "0.5.0",
-      "resolved": "https://registry.npmjs.org/cookie/-/cookie-0.5.0.tgz",
-      "integrity": "sha512-YZ3GUyn/o8gfKJlnlX7g7xq4gyO6OSuhGPKaaGssGB2qgDUS0gPgtTvoyZLTt9Ab6dC4hfc9dV5arkvc/OCmrw==",
+      "version": "0.6.0",
+      "resolved": "https://registry.npmjs.org/cookie/-/cookie-0.6.0.tgz",
+      "integrity": "sha512-U71cyTamuh1CRNCfpGY6to28lxvNwPG4Guz/EVjgf3Jmzv0vlDp1atT9eS5dDjMYHucpHbWns6Lwf3BKz6svdw==",
      "engines": {
        "node": ">= 0.6"
      }
@ -10411,16 +10411,16 @@
      }
    },
    "node_modules/express": {
-      "version": "4.18.2",
-      "resolved": "https://registry.npmjs.org/express/-/express-4.18.2.tgz",
-      "integrity": "sha512-5/PsL6iGPdfQ/lKM1UuielYgv3BUoJfz1aUwU9vHZ+J7gyvwdQXFEBIEIaxeGf0GIcreATNyBExtalisDbuMqQ==",
+      "version": "4.19.2",
+      "resolved": "https://registry.npmjs.org/express/-/express-4.19.2.tgz",
+      "integrity": "sha512-5T6nhjsT+EOMzuck8JjBHARTHfMht0POzlA60WV2pMD3gyXw2LZnZ+ueGdNxG+0calOJcWKbpFcuzLZ91YWq9Q==",
      "dependencies": {
        "accepts": "~1.3.8",
        "array-flatten": "1.1.1",
-        "body-parser": "1.20.1",
+        "body-parser": "1.20.2",
        "content-disposition": "0.5.4",
        "content-type": "~1.0.4",
-        "cookie": "0.5.0",
+        "cookie": "0.6.0",
        "cookie-signature": "1.0.6",
        "debug": "2.6.9",
        "depd": "2.0.0",
@ -17016,9 +17016,9 @@
      }
    },
    "node_modules/raw-body": {
-      "version": "2.5.1",
-      "resolved": "https://registry.npmjs.org/raw-body/-/raw-body-2.5.1.tgz",
-      "integrity": "sha512-qqJBtEyVgS0ZmPGdCFPWJ3FreoqvG4MVQln/kCgF7Olq95IbOp0/BWyMwbdtn4VTvkM8Y7khCQ2Xgk/tcrCXig==",
+      "version": "2.5.2",
+      "resolved": "https://registry.npmjs.org/raw-body/-/raw-body-2.5.2.tgz",
+      "integrity": "sha512-8zGqypfENjCIqGhgXToC8aB2r7YrBX+AQAfIPs/Mlk+BtPTztOvTS01NRW/3Eh60J+a48lt8qsCzirQ6loCVfA==",
      "dependencies": {
        "bytes": "3.1.2",
        "http-errors": "2.0.0",
@ -21554,9 +21554,9 @@
      }
    },
    "node_modules/webpack-dev-middleware": {
-      "version": "5.3.3",
-      "resolved": "https://registry.npmjs.org/webpack-dev-middleware/-/webpack-dev-middleware-5.3.3.tgz",
-      "integrity": "sha512-hj5CYrY0bZLB+eTO+x/j67Pkrquiy7kWepMHmUMoPsmcUaeEnQJqFzHJOyxgWlq746/wUuA64p9ta34Kyb01pA==",
+      "version": "5.3.4",
+      "resolved": "https://registry.npmjs.org/webpack-dev-middleware/-/webpack-dev-middleware-5.3.4.tgz",
+      "integrity": "sha512-BVdTqhhs+0IfoeAf7EoH5WE+exCmqGerHfDM0IL096Px60Tq2Mn9MAbnaGUe6HiMa41KMCYF19gyzZmBcq/o4Q==",
      "dependencies": {
        "colorette": "^2.0.10",
        "memfs": "^3.4.3",
--- a/docs/my-website/sidebars.js
+++ b/docs/my-website/sidebars.js
@ -28,8 +28,9 @@ const sidebars = {
        slug: "/simple_proxy",
      },
      items: [
-        "proxy/quick_start", 
-        "proxy/deploy",
+        "proxy/quick_start",
+        "proxy/deploy", 
+        "proxy/prod", 
        "proxy/configs",
        {
          type: "link",
@ -42,7 +43,6 @@ const sidebars = {
        "proxy/users",
        "proxy/team_based_routing",
        "proxy/ui",
-        "proxy/budget_alerts",
        "proxy/cost_tracking",
        "proxy/token_auth",
        {
@ -61,6 +61,7 @@ const sidebars = {
          label: "Logging, Alerting",
          items: ["proxy/logging", "proxy/alerting", "proxy/streaming_logging"],
        },
+        "proxy/grafana_metrics",
        "proxy/call_hooks",
        "proxy/rules",
        "proxy/cli", 
@ -180,8 +181,9 @@ const sidebars = {
      type: "category",
      label: "Tutorials",
      items: [
-        "tutorials/azure_openai",
-        "tutorials/oobabooga",
+        'tutorials/azure_openai',
+        'tutorials/instructor',
+        'tutorials/oobabooga',
        "tutorials/gradio_integration",
        "tutorials/huggingface_codellama",
        "tutorials/huggingface_tutorial",
--- a/docs/my-website/yarn.lock
+++ b/docs/my-website/yarn.lock
@ -3138,13 +3138,13 @@ bluebird@~3.4.1:
  resolved "https://registry.npmjs.org/bluebird/-/bluebird-3.4.7.tgz"
  integrity sha512-iD3898SR7sWVRHbiQv+sHUtHnMvC1o3nW5rAcqnq3uOn07DSAppZYUkIGslDz6gXC7HfunPe7YVBgoEJASPcHA==

-body-parser@1.20.1:
-  version "1.20.1"
-  resolved "https://registry.npmjs.org/body-parser/-/body-parser-1.20.1.tgz"
-  integrity sha512-jWi7abTbYwajOytWCQc37VulmWiRae5RyTpaCyDcS5/lMdtwSz5lOpDE67srw/HYe35f1z3fDQw+3txg7gNtWw==
+body-parser@1.20.2:
+  version "1.20.2"
+  resolved "https://registry.yarnpkg.com/body-parser/-/body-parser-1.20.2.tgz#6feb0e21c4724d06de7ff38da36dad4f57a747fd"
+  integrity sha512-ml9pReCu3M61kGlqoTm2umSXTlRTuGTx0bfYj+uIUKKYycG5NtSbeetV3faSU6R7ajOPw0g/J1PvK4qNy7s5bA==
  dependencies:
    bytes "3.1.2"
-    content-type "~1.0.4"
+    content-type "~1.0.5"
    debug "2.6.9"
    depd "2.0.0"
    destroy "1.2.0"
@ -3152,7 +3152,7 @@ body-parser@1.20.1:
    iconv-lite "0.4.24"
    on-finished "2.4.1"
    qs "6.11.0"
-    raw-body "2.5.1"
+    raw-body "2.5.2"
    type-is "~1.6.18"
    unpipe "1.0.0"

@ -3921,7 +3921,7 @@ content-disposition@0.5.4:
  dependencies:
    safe-buffer "5.2.1"

-content-type@~1.0.4:
+content-type@~1.0.4, content-type@~1.0.5:
  version "1.0.5"
  resolved "https://registry.npmjs.org/content-type/-/content-type-1.0.5.tgz"
  integrity sha512-nTjqfcBFEipKdXCv4YDQWCfmcLZKm81ldF0pAopTvyrFGVbcR6P/VAAd5G7N+0tTr8QqiU0tFadD6FK4NtJwOA==
@ -3941,10 +3941,10 @@ cookie-signature@1.0.6:
  resolved "https://registry.npmjs.org/cookie-signature/-/cookie-signature-1.0.6.tgz"
  integrity sha512-QADzlaHc8icV8I7vbaJXJwod9HWYp8uCqf1xa4OfNu1T7JVxQIrUgOWtHdNDtPiywmFbiS12VjotIXLrKM3orQ==

-cookie@0.5.0:
-  version "0.5.0"
-  resolved "https://registry.npmjs.org/cookie/-/cookie-0.5.0.tgz"
-  integrity sha512-YZ3GUyn/o8gfKJlnlX7g7xq4gyO6OSuhGPKaaGssGB2qgDUS0gPgtTvoyZLTt9Ab6dC4hfc9dV5arkvc/OCmrw==
+cookie@0.6.0:
+  version "0.6.0"
+  resolved "https://registry.yarnpkg.com/cookie/-/cookie-0.6.0.tgz#2798b04b071b0ecbff0dbb62a505a8efa4e19051"
+  integrity sha512-U71cyTamuh1CRNCfpGY6to28lxvNwPG4Guz/EVjgf3Jmzv0vlDp1atT9eS5dDjMYHucpHbWns6Lwf3BKz6svdw==

 copy-descriptor@^0.1.0:
  version "0.1.1"
@ -5325,16 +5325,16 @@ expand-template@^2.0.3:
  integrity sha512-XYfuKMvj4O35f/pOXLObndIRvyQ+/+6AhODh+OKWj9S9498pHHn/IMszH+gt0fBCRWMNfk1ZSp5x3AifmnI2vg==

 express@^4.17.1, express@^4.17.3:
-  version "4.18.2"
-  resolved "https://registry.npmjs.org/express/-/express-4.18.2.tgz"
-  integrity sha512-5/PsL6iGPdfQ/lKM1UuielYgv3BUoJfz1aUwU9vHZ+J7gyvwdQXFEBIEIaxeGf0GIcreATNyBExtalisDbuMqQ==
+  version "4.19.2"
+  resolved "https://registry.yarnpkg.com/express/-/express-4.19.2.tgz#e25437827a3aa7f2a827bc8171bbbb664a356465"
+  integrity sha512-5T6nhjsT+EOMzuck8JjBHARTHfMht0POzlA60WV2pMD3gyXw2LZnZ+ueGdNxG+0calOJcWKbpFcuzLZ91YWq9Q==
  dependencies:
    accepts "~1.3.8"
    array-flatten "1.1.1"
-    body-parser "1.20.1"
+    body-parser "1.20.2"
    content-disposition "0.5.4"
    content-type "~1.0.4"
-    cookie "0.5.0"
+    cookie "0.6.0"
    cookie-signature "1.0.6"
    debug "2.6.9"
    depd "2.0.0"
@ -9924,10 +9924,10 @@ range-parser@^1.2.1, range-parser@~1.2.1:
  resolved "https://registry.npmjs.org/range-parser/-/range-parser-1.2.1.tgz"
  integrity sha512-Hrgsx+orqoygnmhFbKaHE6c296J+HTAQXoxEF6gNupROmmGJRoyzfG3ccAveqCBrwr/2yxQ5BVd/GTl5agOwSg==

-raw-body@2.5.1:
-  version "2.5.1"
-  resolved "https://registry.npmjs.org/raw-body/-/raw-body-2.5.1.tgz"
-  integrity sha512-qqJBtEyVgS0ZmPGdCFPWJ3FreoqvG4MVQln/kCgF7Olq95IbOp0/BWyMwbdtn4VTvkM8Y7khCQ2Xgk/tcrCXig==
+raw-body@2.5.2:
+  version "2.5.2"
+  resolved "https://registry.yarnpkg.com/raw-body/-/raw-body-2.5.2.tgz#99febd83b90e08975087e8f1f9419a149366b68a"
+  integrity sha512-8zGqypfENjCIqGhgXToC8aB2r7YrBX+AQAfIPs/Mlk+BtPTztOvTS01NRW/3Eh60J+a48lt8qsCzirQ6loCVfA==
  dependencies:
    bytes "3.1.2"
    http-errors "2.0.0"
@ -12406,9 +12406,9 @@ webpack-bundle-analyzer@^4.5.0:
    ws "^7.3.1"

 webpack-dev-middleware@^5.3.1:
-  version "5.3.3"
-  resolved "https://registry.npmjs.org/webpack-dev-middleware/-/webpack-dev-middleware-5.3.3.tgz"
-  integrity sha512-hj5CYrY0bZLB+eTO+x/j67Pkrquiy7kWepMHmUMoPsmcUaeEnQJqFzHJOyxgWlq746/wUuA64p9ta34Kyb01pA==
+  version "5.3.4"
+  resolved "https://registry.yarnpkg.com/webpack-dev-middleware/-/webpack-dev-middleware-5.3.4.tgz#eb7b39281cbce10e104eb2b8bf2b63fce49a3517"
+  integrity sha512-BVdTqhhs+0IfoeAf7EoH5WE+exCmqGerHfDM0IL096Px60Tq2Mn9MAbnaGUe6HiMa41KMCYF19gyzZmBcq/o4Q==
  dependencies:
    colorette "^2.0.10"
    memfs "^3.4.3"
--- a/enterprise/enterprise_hooks/google_text_moderation.py
+++ b/enterprise/enterprise_hooks/google_text_moderation.py
@ -96,6 +96,8 @@ class _ENTERPRISE_GoogleTextModeration(CustomLogger):
    async def async_moderation_hook(
        self,
        data: dict,
+        user_api_key_dict: UserAPIKeyAuth,
+        call_type: Literal["completion", "embeddings", "image_generation"],
    ):
        """
        - Calls Google's Text Moderation API
--- a/enterprise/enterprise_hooks/llama_guard.py
+++ b/enterprise/enterprise_hooks/llama_guard.py
@ -99,6 +99,8 @@ class _ENTERPRISE_LlamaGuard(CustomLogger):
    async def async_moderation_hook(
        self,
        data: dict,
+        user_api_key_dict: UserAPIKeyAuth,
+        call_type: Literal["completion", "embeddings", "image_generation"],
    ):
        """
        - Calls the Llama Guard Endpoint
--- a/enterprise/enterprise_hooks/llm_guard.py
+++ b/enterprise/enterprise_hooks/llm_guard.py
@ -22,6 +22,7 @@ from litellm.utils import (
 )
 from datetime import datetime
 import aiohttp, asyncio
+from litellm.utils import get_formatted_prompt

 litellm.set_verbose = True

@ -29,9 +30,12 @@ litellm.set_verbose = True
 class _ENTERPRISE_LLMGuard(CustomLogger):
    # Class variables or attributes
    def __init__(
-        self, mock_testing: bool = False, mock_redacted_text: Optional[dict] = None
+        self,
+        mock_testing: bool = False,
+        mock_redacted_text: Optional[dict] = None,
    ):
        self.mock_redacted_text = mock_redacted_text
+        self.llm_guard_mode = litellm.llm_guard_mode
        if mock_testing == True:  # for testing purposes only
            return
        self.llm_guard_api_base = litellm.get_secret("LLM_GUARD_API_BASE", None)
@ -59,7 +63,7 @@ class _ENTERPRISE_LLMGuard(CustomLogger):
                else:
                    # Make the first request to /analyze
                    analyze_url = f"{self.llm_guard_api_base}analyze/prompt"
-                    verbose_proxy_logger.debug(f"Making request to: {analyze_url}")
+                    verbose_proxy_logger.debug("Making request to: %s", analyze_url)
                    analyze_payload = {"prompt": text}
                    redacted_text = None
                    async with session.post(
@ -72,7 +76,7 @@ class _ENTERPRISE_LLMGuard(CustomLogger):
                if redacted_text is not None:
                    if (
                        redacted_text.get("is_valid", None) is not None
-                        and redacted_text["is_valid"] == "True"
+                        and redacted_text["is_valid"] != True
                    ):
                        raise HTTPException(
                            status_code=400,
@ -91,9 +95,26 @@ class _ENTERPRISE_LLMGuard(CustomLogger):
            traceback.print_exc()
            raise e

+    def should_proceed(self, user_api_key_dict: UserAPIKeyAuth) -> bool:
+        if self.llm_guard_mode == "key-specific":
+            # check if llm guard enabled for specific keys only
+            self.print_verbose(
+                f"user_api_key_dict.permissions: {user_api_key_dict.permissions}"
+            )
+            if (
+                user_api_key_dict.permissions.get("enable_llm_guard_check", False)
+                == True
+            ):
+                return True
+        elif self.llm_guard_mode == "all":
+            return True
+        return False
+
    async def async_moderation_hook(
        self,
        data: dict,
+        user_api_key_dict: UserAPIKeyAuth,
+        call_type: Literal["completion", "embeddings", "image_generation"],
    ):
        """
        - Calls the LLM Guard Endpoint
@ -101,7 +122,32 @@ class _ENTERPRISE_LLMGuard(CustomLogger):
        - Use the sanitized prompt returned
            - LLM Guard can handle things like PII Masking, etc.
        """
-        return data
+        self.print_verbose(
+            f"Inside LLM Guard Pre-Call Hook - llm_guard_mode={self.llm_guard_mode}"
+        )
+
+        _proceed = self.should_proceed(user_api_key_dict=user_api_key_dict)
+        if _proceed == False:
+            return
+
+        self.print_verbose("Makes LLM Guard Check")
+        try:
+            assert call_type in [
+                "completion",
+                "embeddings",
+                "image_generation",
+                "moderation",
+                "audio_transcription",
+            ]
+        except Exception as e:
+            self.print_verbose(
+                f"Call Type - {call_type}, not in accepted list - ['completion','embeddings','image_generation','moderation','audio_transcription']"
+            )
+            return data
+
+        formatted_prompt = get_formatted_prompt(data=data, call_type=call_type)  # type: ignore
+        self.print_verbose(f"LLM Guard, formatted_prompt: {formatted_prompt}")
+        return await self.moderation_check(text=formatted_prompt)

    async def async_post_call_streaming_hook(
        self, user_api_key_dict: UserAPIKeyAuth, response: str
--- a/litellm-js/proxy/README.md
+++ b/litellm-js/proxy/README.md
@ -0,0 +1,8 @@
+```
+npm install
+npm run dev
+```
+
+```
+npm run deploy
+```
--- a/litellm-js/proxy/package.json
+++ b/litellm-js/proxy/package.json
@ -0,0 +1,14 @@
+{
+  "scripts": {
+    "dev": "wrangler dev src/index.ts",
+    "deploy": "wrangler deploy --minify src/index.ts"
+  },
+  "dependencies": {
+    "hono": "^4.1.4",
+    "openai": "^4.29.2"
+  },
+  "devDependencies": {
+    "@cloudflare/workers-types": "^4.20240208.0",
+    "wrangler": "^3.32.0"
+  }
+}
--- a/litellm-js/proxy/src/index.ts
+++ b/litellm-js/proxy/src/index.ts
@ -0,0 +1,59 @@
+import { Hono } from 'hono'
+import { Context } from 'hono';
+import { bearerAuth } from 'hono/bearer-auth'
+import OpenAI from "openai";
+
+const openai = new OpenAI({
+  apiKey: "sk-1234",
+  baseURL: "https://openai-endpoint.ishaanjaffer0324.workers.dev"
+});
+
+async function call_proxy() {
+  const completion = await openai.chat.completions.create({
+    messages: [{ role: "system", content: "You are a helpful assistant." }],
+    model: "gpt-3.5-turbo",
+  });
+
+  return completion
+}
+
+const app = new Hono()
+
+// Middleware for API Key Authentication
+const apiKeyAuth = async (c: Context, next: Function) => {
+  const apiKey = c.req.header('Authorization');
+  if (!apiKey || apiKey !== 'Bearer sk-1234') {
+    return c.text('Unauthorized', 401);
+  }
+  await next();
+};
+
+
+app.use('/*', apiKeyAuth)
+
+
+app.get('/', (c) => {
+  return c.text('Hello Hono!')
+})
+
+
+
+
+// Handler for chat completions
+const chatCompletionHandler = async (c: Context) => {
+  // Assuming your logic for handling chat completion goes here
+  // For demonstration, just returning a simple JSON response
+  const response = await call_proxy()
+  return c.json(response);
+};
+
+// Register the above handler for different POST routes with the apiKeyAuth middleware
+app.post('/v1/chat/completions', chatCompletionHandler);
+app.post('/chat/completions', chatCompletionHandler);
+
+// Example showing how you might handle dynamic segments within the URL
+// Here, using ':model*' to capture the rest of the path as a parameter 'model'
+app.post('/openai/deployments/:model*/chat/completions', chatCompletionHandler);
+
+
+export default app
--- a/litellm-js/proxy/tsconfig.json
+++ b/litellm-js/proxy/tsconfig.json
@ -0,0 +1,16 @@
+{
+  "compilerOptions": {
+    "target": "ESNext",
+    "module": "ESNext",
+    "moduleResolution": "Bundler",
+    "strict": true,
+    "lib": [
+      "ESNext"
+    ],
+    "types": [
+      "@cloudflare/workers-types"
+    ],
+    "jsx": "react-jsx",
+    "jsxImportSource": "hono/jsx"
+  },
+}
--- a/litellm-js/proxy/wrangler.toml
+++ b/litellm-js/proxy/wrangler.toml
@ -0,0 +1,18 @@
+name = "my-app"
+compatibility_date = "2023-12-01"
+
+# [vars]
+# MY_VAR = "my-variable"
+
+# [[kv_namespaces]]
+# binding = "MY_KV_NAMESPACE"
+# id = "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
+
+# [[r2_buckets]]
+# binding = "MY_BUCKET"
+# bucket_name = "my-bucket"
+
+# [[d1_databases]]
+# binding = "DB"
+# database_name = "my-database"
+# database_id = ""
--- a/litellm-js/spend-logs/Dockerfile
+++ b/litellm-js/spend-logs/Dockerfile
@ -0,0 +1,26 @@
+# Use the specific Node.js v20.11.0 image
+FROM node:20.11.0
+
+# Set the working directory inside the container
+WORKDIR /app
+
+# Copy package.json and package-lock.json to the working directory
+COPY ./litellm-js/spend-logs/package*.json ./
+
+# Install dependencies
+RUN npm install
+
+# Install Prisma globally
+RUN npm install -g prisma
+
+# Copy the rest of the application code
+COPY ./litellm-js/spend-logs .
+
+# Generate Prisma client
+RUN npx prisma generate
+
+# Expose the port that the Node.js server will run on
+EXPOSE 3000
+
+# Command to run the Node.js app with npm run dev
+CMD ["npm", "run", "dev"]
--- a/litellm-js/spend-logs/README.md
+++ b/litellm-js/spend-logs/README.md
@ -0,0 +1,8 @@
+```
+npm install
+npm run dev
+```
+
+```
+open http://localhost:3000
+```
--- a/litellm-js/spend-logs/package-lock.json
+++ b/litellm-js/spend-logs/package-lock.json
@ -0,0 +1,508 @@
+{
+  "name": "spend-logs",
+  "lockfileVersion": 3,
+  "requires": true,
+  "packages": {
+    "": {
+      "dependencies": {
+        "@hono/node-server": "^1.9.0",
+        "hono": "^4.1.5"
+      },
+      "devDependencies": {
+        "@types/node": "^20.11.17",
+        "tsx": "^4.7.1"
+      }
+    },
+    "node_modules/@esbuild/aix-ppc64": {
+      "version": "0.19.12",
+      "resolved": "https://registry.npmjs.org/@esbuild/aix-ppc64/-/aix-ppc64-0.19.12.tgz",
+      "integrity": "sha512-bmoCYyWdEL3wDQIVbcyzRyeKLgk2WtWLTWz1ZIAZF/EGbNOwSA6ew3PftJ1PqMiOOGu0OyFMzG53L0zqIpPeNA==",
+      "cpu": [
+        "ppc64"
+      ],
+      "dev": true,
+      "optional": true,
+      "os": [
+        "aix"
+      ],
+      "engines": {
+        "node": ">=12"
+      }
+    },
+    "node_modules/@esbuild/android-arm": {
+      "version": "0.19.12",
+      "resolved": "https://registry.npmjs.org/@esbuild/android-arm/-/android-arm-0.19.12.tgz",
+      "integrity": "sha512-qg/Lj1mu3CdQlDEEiWrlC4eaPZ1KztwGJ9B6J+/6G+/4ewxJg7gqj8eVYWvao1bXrqGiW2rsBZFSX3q2lcW05w==",
+      "cpu": [
+        "arm"
+      ],
+      "dev": true,
+      "optional": true,
+      "os": [
+        "android"
+      ],
+      "engines": {
+        "node": ">=12"
+      }
+    },
+    "node_modules/@esbuild/android-arm64": {
+      "version": "0.19.12",
+      "resolved": "https://registry.npmjs.org/@esbuild/android-arm64/-/android-arm64-0.19.12.tgz",
+      "integrity": "sha512-P0UVNGIienjZv3f5zq0DP3Nt2IE/3plFzuaS96vihvD0Hd6H/q4WXUGpCxD/E8YrSXfNyRPbpTq+T8ZQioSuPA==",
+      "cpu": [
+        "arm64"
+      ],
+      "dev": true,
+      "optional": true,
+      "os": [
+        "android"
+      ],
+      "engines": {
+        "node": ">=12"
+      }
+    },
+    "node_modules/@esbuild/android-x64": {
+      "version": "0.19.12",
+      "resolved": "https://registry.npmjs.org/@esbuild/android-x64/-/android-x64-0.19.12.tgz",
+      "integrity": "sha512-3k7ZoUW6Q6YqhdhIaq/WZ7HwBpnFBlW905Fa4s4qWJyiNOgT1dOqDiVAQFwBH7gBRZr17gLrlFCRzF6jFh7Kew==",
+      "cpu": [
+        "x64"
+      ],
+      "dev": true,
+      "optional": true,
+      "os": [
+        "android"
+      ],
+      "engines": {
+        "node": ">=12"
+      }
+    },
+    "node_modules/@esbuild/darwin-arm64": {
+      "version": "0.19.12",
+      "resolved": "https://registry.npmjs.org/@esbuild/darwin-arm64/-/darwin-arm64-0.19.12.tgz",
+      "integrity": "sha512-B6IeSgZgtEzGC42jsI+YYu9Z3HKRxp8ZT3cqhvliEHovq8HSX2YX8lNocDn79gCKJXOSaEot9MVYky7AKjCs8g==",
+      "cpu": [
+        "arm64"
+      ],
+      "dev": true,
+      "optional": true,
+      "os": [
+        "darwin"
+      ],
+      "engines": {
+        "node": ">=12"
+      }
+    },
+    "node_modules/@esbuild/darwin-x64": {
+      "version": "0.19.12",
+      "resolved": "https://registry.npmjs.org/@esbuild/darwin-x64/-/darwin-x64-0.19.12.tgz",
+      "integrity": "sha512-hKoVkKzFiToTgn+41qGhsUJXFlIjxI/jSYeZf3ugemDYZldIXIxhvwN6erJGlX4t5h417iFuheZ7l+YVn05N3A==",
+      "cpu": [
+        "x64"
+      ],
+      "dev": true,
+      "optional": true,
+      "os": [
+        "darwin"
+      ],
+      "engines": {
+        "node": ">=12"
+      }
+    },
+    "node_modules/@esbuild/freebsd-arm64": {
+      "version": "0.19.12",
+      "resolved": "https://registry.npmjs.org/@esbuild/freebsd-arm64/-/freebsd-arm64-0.19.12.tgz",
+      "integrity": "sha512-4aRvFIXmwAcDBw9AueDQ2YnGmz5L6obe5kmPT8Vd+/+x/JMVKCgdcRwH6APrbpNXsPz+K653Qg8HB/oXvXVukA==",
+      "cpu": [
+        "arm64"
+      ],
+      "dev": true,
+      "optional": true,
+      "os": [
+        "freebsd"
+      ],
+      "engines": {
+        "node": ">=12"
+      }
+    },
+    "node_modules/@esbuild/freebsd-x64": {
+      "version": "0.19.12",
+      "resolved": "https://registry.npmjs.org/@esbuild/freebsd-x64/-/freebsd-x64-0.19.12.tgz",
+      "integrity": "sha512-EYoXZ4d8xtBoVN7CEwWY2IN4ho76xjYXqSXMNccFSx2lgqOG/1TBPW0yPx1bJZk94qu3tX0fycJeeQsKovA8gg==",
+      "cpu": [
+        "x64"
+      ],
+      "dev": true,
+      "optional": true,
+      "os": [
+        "freebsd"
+      ],
+      "engines": {
+        "node": ">=12"
+      }
+    },
+    "node_modules/@esbuild/linux-arm": {
+      "version": "0.19.12",
+      "resolved": "https://registry.npmjs.org/@esbuild/linux-arm/-/linux-arm-0.19.12.tgz",
+      "integrity": "sha512-J5jPms//KhSNv+LO1S1TX1UWp1ucM6N6XuL6ITdKWElCu8wXP72l9MM0zDTzzeikVyqFE6U8YAV9/tFyj0ti+w==",
+      "cpu": [
+        "arm"
+      ],
+      "dev": true,
+      "optional": true,
+      "os": [
+        "linux"
+      ],
+      "engines": {
+        "node": ">=12"
+      }
+    },
+    "node_modules/@esbuild/linux-arm64": {
+      "version": "0.19.12",
+      "resolved": "https://registry.npmjs.org/@esbuild/linux-arm64/-/linux-arm64-0.19.12.tgz",
+      "integrity": "sha512-EoTjyYyLuVPfdPLsGVVVC8a0p1BFFvtpQDB/YLEhaXyf/5bczaGeN15QkR+O4S5LeJ92Tqotve7i1jn35qwvdA==",
+      "cpu": [
+        "arm64"
+      ],
+      "dev": true,
+      "optional": true,
+      "os": [
+        "linux"
+      ],
+      "engines": {
+        "node": ">=12"
+      }
+    },
+    "node_modules/@esbuild/linux-ia32": {
+      "version": "0.19.12",
+      "resolved": "https://registry.npmjs.org/@esbuild/linux-ia32/-/linux-ia32-0.19.12.tgz",
+      "integrity": "sha512-Thsa42rrP1+UIGaWz47uydHSBOgTUnwBwNq59khgIwktK6x60Hivfbux9iNR0eHCHzOLjLMLfUMLCypBkZXMHA==",
+      "cpu": [
+        "ia32"
+      ],
+      "dev": true,
+      "optional": true,
+      "os": [
+        "linux"
+      ],
+      "engines": {
+        "node": ">=12"
+      }
+    },
+    "node_modules/@esbuild/linux-loong64": {
+      "version": "0.19.12",
+      "resolved": "https://registry.npmjs.org/@esbuild/linux-loong64/-/linux-loong64-0.19.12.tgz",
+      "integrity": "sha512-LiXdXA0s3IqRRjm6rV6XaWATScKAXjI4R4LoDlvO7+yQqFdlr1Bax62sRwkVvRIrwXxvtYEHHI4dm50jAXkuAA==",
+      "cpu": [
+        "loong64"
+      ],
+      "dev": true,
+      "optional": true,
+      "os": [
+        "linux"
+      ],
+      "engines": {
+        "node": ">=12"
+      }
+    },
+    "node_modules/@esbuild/linux-mips64el": {
+      "version": "0.19.12",
+      "resolved": "https://registry.npmjs.org/@esbuild/linux-mips64el/-/linux-mips64el-0.19.12.tgz",
+      "integrity": "sha512-fEnAuj5VGTanfJ07ff0gOA6IPsvrVHLVb6Lyd1g2/ed67oU1eFzL0r9WL7ZzscD+/N6i3dWumGE1Un4f7Amf+w==",
+      "cpu": [
+        "mips64el"
+      ],
+      "dev": true,
+      "optional": true,
+      "os": [
+        "linux"
+      ],
+      "engines": {
+        "node": ">=12"
+      }
+    },
+    "node_modules/@esbuild/linux-ppc64": {
+      "version": "0.19.12",
+      "resolved": "https://registry.npmjs.org/@esbuild/linux-ppc64/-/linux-ppc64-0.19.12.tgz",
+      "integrity": "sha512-nYJA2/QPimDQOh1rKWedNOe3Gfc8PabU7HT3iXWtNUbRzXS9+vgB0Fjaqr//XNbd82mCxHzik2qotuI89cfixg==",
+      "cpu": [
+        "ppc64"
+      ],
+      "dev": true,
+      "optional": true,
+      "os": [
+        "linux"
+      ],
+      "engines": {
+        "node": ">=12"
+      }
+    },
+    "node_modules/@esbuild/linux-riscv64": {
+      "version": "0.19.12",
+      "resolved": "https://registry.npmjs.org/@esbuild/linux-riscv64/-/linux-riscv64-0.19.12.tgz",
+      "integrity": "sha512-2MueBrlPQCw5dVJJpQdUYgeqIzDQgw3QtiAHUC4RBz9FXPrskyyU3VI1hw7C0BSKB9OduwSJ79FTCqtGMWqJHg==",
+      "cpu": [
+        "riscv64"
+      ],
+      "dev": true,
+      "optional": true,
+      "os": [
+        "linux"
+      ],
+      "engines": {
+        "node": ">=12"
+      }
+    },
+    "node_modules/@esbuild/linux-s390x": {
+      "version": "0.19.12",
+      "resolved": "https://registry.npmjs.org/@esbuild/linux-s390x/-/linux-s390x-0.19.12.tgz",
+      "integrity": "sha512-+Pil1Nv3Umes4m3AZKqA2anfhJiVmNCYkPchwFJNEJN5QxmTs1uzyy4TvmDrCRNT2ApwSari7ZIgrPeUx4UZDg==",
+      "cpu": [
+        "s390x"
+      ],
+      "dev": true,
+      "optional": true,
+      "os": [
+        "linux"
+      ],
+      "engines": {
+        "node": ">=12"
+      }
+    },
+    "node_modules/@esbuild/linux-x64": {
+      "version": "0.19.12",
+      "resolved": "https://registry.npmjs.org/@esbuild/linux-x64/-/linux-x64-0.19.12.tgz",
+      "integrity": "sha512-B71g1QpxfwBvNrfyJdVDexenDIt1CiDN1TIXLbhOw0KhJzE78KIFGX6OJ9MrtC0oOqMWf+0xop4qEU8JrJTwCg==",
+      "cpu": [
+        "x64"
+      ],
+      "dev": true,
+      "optional": true,
+      "os": [
+        "linux"
+      ],
+      "engines": {
+        "node": ">=12"
+      }
+    },
+    "node_modules/@esbuild/netbsd-x64": {
+      "version": "0.19.12",
+      "resolved": "https://registry.npmjs.org/@esbuild/netbsd-x64/-/netbsd-x64-0.19.12.tgz",
+      "integrity": "sha512-3ltjQ7n1owJgFbuC61Oj++XhtzmymoCihNFgT84UAmJnxJfm4sYCiSLTXZtE00VWYpPMYc+ZQmB6xbSdVh0JWA==",
+      "cpu": [
+        "x64"
+      ],
+      "dev": true,
+      "optional": true,
+      "os": [
+        "netbsd"
+      ],
+      "engines": {
+        "node": ">=12"
+      }
+    },
+    "node_modules/@esbuild/openbsd-x64": {
+      "version": "0.19.12",
+      "resolved": "https://registry.npmjs.org/@esbuild/openbsd-x64/-/openbsd-x64-0.19.12.tgz",
+      "integrity": "sha512-RbrfTB9SWsr0kWmb9srfF+L933uMDdu9BIzdA7os2t0TXhCRjrQyCeOt6wVxr79CKD4c+p+YhCj31HBkYcXebw==",
+      "cpu": [
+        "x64"
+      ],
+      "dev": true,
+      "optional": true,
+      "os": [
+        "openbsd"
+      ],
+      "engines": {
+        "node": ">=12"
+      }
+    },
+    "node_modules/@esbuild/sunos-x64": {
+      "version": "0.19.12",
+      "resolved": "https://registry.npmjs.org/@esbuild/sunos-x64/-/sunos-x64-0.19.12.tgz",
+      "integrity": "sha512-HKjJwRrW8uWtCQnQOz9qcU3mUZhTUQvi56Q8DPTLLB+DawoiQdjsYq+j+D3s9I8VFtDr+F9CjgXKKC4ss89IeA==",
+      "cpu": [
+        "x64"
+      ],
+      "dev": true,
+      "optional": true,
+      "os": [
+        "sunos"
+      ],
+      "engines": {
+        "node": ">=12"
+      }
+    },
+    "node_modules/@esbuild/win32-arm64": {
+      "version": "0.19.12",
+      "resolved": "https://registry.npmjs.org/@esbuild/win32-arm64/-/win32-arm64-0.19.12.tgz",
+      "integrity": "sha512-URgtR1dJnmGvX864pn1B2YUYNzjmXkuJOIqG2HdU62MVS4EHpU2946OZoTMnRUHklGtJdJZ33QfzdjGACXhn1A==",
+      "cpu": [
+        "arm64"
+      ],
+      "dev": true,
+      "optional": true,
+      "os": [
+        "win32"
+      ],
+      "engines": {
+        "node": ">=12"
+      }
+    },
+    "node_modules/@esbuild/win32-ia32": {
+      "version": "0.19.12",
+      "resolved": "https://registry.npmjs.org/@esbuild/win32-ia32/-/win32-ia32-0.19.12.tgz",
+      "integrity": "sha512-+ZOE6pUkMOJfmxmBZElNOx72NKpIa/HFOMGzu8fqzQJ5kgf6aTGrcJaFsNiVMH4JKpMipyK+7k0n2UXN7a8YKQ==",
+      "cpu": [
+        "ia32"
+      ],
+      "dev": true,
+      "optional": true,
+      "os": [
+        "win32"
+      ],
+      "engines": {
+        "node": ">=12"
+      }
+    },
+    "node_modules/@esbuild/win32-x64": {
+      "version": "0.19.12",
+      "resolved": "https://registry.npmjs.org/@esbuild/win32-x64/-/win32-x64-0.19.12.tgz",
+      "integrity": "sha512-T1QyPSDCyMXaO3pzBkF96E8xMkiRYbUEZADd29SyPGabqxMViNoii+NcK7eWJAEoU6RZyEm5lVSIjTmcdoB9HA==",
+      "cpu": [
+        "x64"
+      ],
+      "dev": true,
+      "optional": true,
+      "os": [
+        "win32"
+      ],
+      "engines": {
+        "node": ">=12"
+      }
+    },
+    "node_modules/@hono/node-server": {
+      "version": "1.9.0",
+      "resolved": "https://registry.npmjs.org/@hono/node-server/-/node-server-1.9.0.tgz",
+      "integrity": "sha512-oJjk7WXBlENeHhWiMqSyxPIZ3Kmf5ZYxqdlcSIXyN8Rn50bNJsPl99G4POBS03Jxh56FdfRJ0SEnC8mAVIiavQ==",
+      "engines": {
+        "node": ">=18.14.1"
+      }
+    },
+    "node_modules/@types/node": {
+      "version": "20.11.30",
+      "resolved": "https://registry.npmjs.org/@types/node/-/node-20.11.30.tgz",
+      "integrity": "sha512-dHM6ZxwlmuZaRmUPfv1p+KrdD1Dci04FbdEm/9wEMouFqxYoFl5aMkt0VMAUtYRQDyYvD41WJLukhq/ha3YuTw==",
+      "dev": true,
+      "dependencies": {
+        "undici-types": "~5.26.4"
+      }
+    },
+    "node_modules/esbuild": {
+      "version": "0.19.12",
+      "resolved": "https://registry.npmjs.org/esbuild/-/esbuild-0.19.12.tgz",
+      "integrity": "sha512-aARqgq8roFBj054KvQr5f1sFu0D65G+miZRCuJyJ0G13Zwx7vRar5Zhn2tkQNzIXcBrNVsv/8stehpj+GAjgbg==",
+      "dev": true,
+      "hasInstallScript": true,
+      "bin": {
+        "esbuild": "bin/esbuild"
+      },
+      "engines": {
+        "node": ">=12"
+      },
+      "optionalDependencies": {
+        "@esbuild/aix-ppc64": "0.19.12",
+        "@esbuild/android-arm": "0.19.12",
+        "@esbuild/android-arm64": "0.19.12",
+        "@esbuild/android-x64": "0.19.12",
+        "@esbuild/darwin-arm64": "0.19.12",
+        "@esbuild/darwin-x64": "0.19.12",
+        "@esbuild/freebsd-arm64": "0.19.12",
+        "@esbuild/freebsd-x64": "0.19.12",
+        "@esbuild/linux-arm": "0.19.12",
+        "@esbuild/linux-arm64": "0.19.12",
+        "@esbuild/linux-ia32": "0.19.12",
+        "@esbuild/linux-loong64": "0.19.12",
+        "@esbuild/linux-mips64el": "0.19.12",
+        "@esbuild/linux-ppc64": "0.19.12",
+        "@esbuild/linux-riscv64": "0.19.12",
+        "@esbuild/linux-s390x": "0.19.12",
+        "@esbuild/linux-x64": "0.19.12",
+        "@esbuild/netbsd-x64": "0.19.12",
+        "@esbuild/openbsd-x64": "0.19.12",
+        "@esbuild/sunos-x64": "0.19.12",
+        "@esbuild/win32-arm64": "0.19.12",
+        "@esbuild/win32-ia32": "0.19.12",
+        "@esbuild/win32-x64": "0.19.12"
+      }
+    },
+    "node_modules/fsevents": {
+      "version": "2.3.3",
+      "resolved": "https://registry.npmjs.org/fsevents/-/fsevents-2.3.3.tgz",
+      "integrity": "sha512-5xoDfX+fL7faATnagmWPpbFtwh/R77WmMMqqHGS65C3vvB0YHrgF+B1YmZ3441tMj5n63k0212XNoJwzlhffQw==",
+      "dev": true,
+      "hasInstallScript": true,
+      "optional": true,
+      "os": [
+        "darwin"
+      ],
+      "engines": {
+        "node": "^8.16.0 || ^10.6.0 || >=11.0.0"
+      }
+    },
+    "node_modules/get-tsconfig": {
+      "version": "4.7.3",
+      "resolved": "https://registry.npmjs.org/get-tsconfig/-/get-tsconfig-4.7.3.tgz",
+      "integrity": "sha512-ZvkrzoUA0PQZM6fy6+/Hce561s+faD1rsNwhnO5FelNjyy7EMGJ3Rz1AQ8GYDWjhRs/7dBLOEJvhK8MiEJOAFg==",
+      "dev": true,
+      "dependencies": {
+        "resolve-pkg-maps": "^1.0.0"
+      },
+      "funding": {
+        "url": "https://github.com/privatenumber/get-tsconfig?sponsor=1"
+      }
+    },
+    "node_modules/hono": {
+      "version": "4.1.5",
+      "resolved": "https://registry.npmjs.org/hono/-/hono-4.1.5.tgz",
+      "integrity": "sha512-3ChJiIoeCxvkt6vnkxJagplrt1YZg3NyNob7ssVeK2PUqEINp4q1F94HzFnvY9QE8asVmbW5kkTDlyWylfg2vg==",
+      "engines": {
+        "node": ">=16.0.0"
+      }
+    },
+    "node_modules/resolve-pkg-maps": {
+      "version": "1.0.0",
+      "resolved": "https://registry.npmjs.org/resolve-pkg-maps/-/resolve-pkg-maps-1.0.0.tgz",
+      "integrity": "sha512-seS2Tj26TBVOC2NIc2rOe2y2ZO7efxITtLZcGSOnHHNOQ7CkiUBfw0Iw2ck6xkIhPwLhKNLS8BO+hEpngQlqzw==",
+      "dev": true,
+      "funding": {
+        "url": "https://github.com/privatenumber/resolve-pkg-maps?sponsor=1"
+      }
+    },
+    "node_modules/tsx": {
+      "version": "4.7.1",
+      "resolved": "https://registry.npmjs.org/tsx/-/tsx-4.7.1.tgz",
+      "integrity": "sha512-8d6VuibXHtlN5E3zFkgY8u4DX7Y3Z27zvvPKVmLon/D4AjuKzarkUBTLDBgj9iTQ0hg5xM7c/mYiRVM+HETf0g==",
+      "dev": true,
+      "dependencies": {
+        "esbuild": "~0.19.10",
+        "get-tsconfig": "^4.7.2"
+      },
+      "bin": {
+        "tsx": "dist/cli.mjs"
+      },
+      "engines": {
+        "node": ">=18.0.0"
+      },
+      "optionalDependencies": {
+        "fsevents": "~2.3.3"
+      }
+    },
+    "node_modules/undici-types": {
+      "version": "5.26.5",
+      "resolved": "https://registry.npmjs.org/undici-types/-/undici-types-5.26.5.tgz",
+      "integrity": "sha512-JlCMO+ehdEIKqlFxk6IfVoAUVmgz7cU7zD/h9XZ0qzeosSHmUJVOzSQvvYSYWXkFXC+IfLKSIffhv0sVZup6pA==",
+      "dev": true
+    }
+  }
+}
--- a/litellm-js/spend-logs/package.json
+++ b/litellm-js/spend-logs/package.json
@ -0,0 +1,13 @@
+{
+  "scripts": {
+    "dev": "tsx watch src/index.ts"
+  },
+  "dependencies": {
+    "@hono/node-server": "^1.9.0",
+    "hono": "^4.1.5"
+  },
+  "devDependencies": {
+    "@types/node": "^20.11.17",
+    "tsx": "^4.7.1"
+  }
+}
--- a/litellm-js/spend-logs/schema.prisma
+++ b/litellm-js/spend-logs/schema.prisma
@ -0,0 +1,29 @@
+generator client {
+  provider        = "prisma-client-js"
+}
+
+datasource client {
+  provider = "postgresql"
+  url      = env("DATABASE_URL")
+}
+
+model LiteLLM_SpendLogs {
+  request_id        String   @id
+  call_type         String
+  api_key           String   @default("")
+  spend             Float    @default(0.0)
+  total_tokens      Int      @default(0)
+  prompt_tokens     Int      @default(0)
+  completion_tokens Int      @default(0)
+  startTime         DateTime
+  endTime           DateTime
+  model             String   @default("")
+  api_base          String   @default("")
+  user              String   @default("")
+  metadata          Json     @default("{}")
+  cache_hit         String   @default("")
+  cache_key         String   @default("")
+  request_tags      Json     @default("[]")
+  team_id           String?
+  end_user          String?
+}
--- a/litellm-js/spend-logs/src/_types.ts
+++ b/litellm-js/spend-logs/src/_types.ts
@ -0,0 +1,32 @@
+export type LiteLLM_IncrementSpend = {
+    key_transactions: Array<LiteLLM_IncrementObject>, // [{"key": spend},..]
+    user_transactions: Array<LiteLLM_IncrementObject>, 
+    team_transactions: Array<LiteLLM_IncrementObject>,
+    spend_logs_transactions: Array<LiteLLM_SpendLogs>
+}
+
+export type LiteLLM_IncrementObject = {
+    key: string,
+    spend: number
+}
+
+export type LiteLLM_SpendLogs = {
+    request_id: string; // @id means it's a unique identifier
+    call_type: string;
+    api_key: string; // @default("") means it defaults to an empty string if not provided
+    spend: number; // Float in Prisma corresponds to number in TypeScript
+    total_tokens: number; // Int in Prisma corresponds to number in TypeScript
+    prompt_tokens: number;
+    completion_tokens: number;
+    startTime: Date; // DateTime in Prisma corresponds to Date in TypeScript
+    endTime: Date;
+    model: string; // @default("") means it defaults to an empty string if not provided
+    api_base: string;
+    user: string;
+    metadata: any; // Json type in Prisma is represented by any in TypeScript; could also use a more specific type if the structure of JSON is known
+    cache_hit: string;
+    cache_key: string;
+    request_tags: any; // Similarly, this could be an array or a more specific type depending on the expected structure
+    team_id?: string | null; // ? indicates it's optional and can be undefined, but could also be null if not provided
+    end_user?: string | null;
+};
--- a/litellm-js/spend-logs/src/index.ts
+++ b/litellm-js/spend-logs/src/index.ts
@ -0,0 +1,84 @@
+import { serve } from '@hono/node-server'
+import { Hono } from 'hono'
+import { PrismaClient } from '@prisma/client'
+import {LiteLLM_SpendLogs, LiteLLM_IncrementSpend, LiteLLM_IncrementObject} from './_types'
+
+const app = new Hono()
+const prisma = new PrismaClient()
+// In-memory storage for logs
+let spend_logs: LiteLLM_SpendLogs[] = [];
+const key_logs: LiteLLM_IncrementObject[] = [];
+const user_logs: LiteLLM_IncrementObject[] = [];
+const transaction_logs: LiteLLM_IncrementObject[] = [];
+
+
+app.get('/', (c) => {
+  return c.text('Hello Hono!')
+})
+
+const MIN_LOGS = 1; // Minimum number of logs needed to initiate a flush
+const FLUSH_INTERVAL = 5000; // Time in ms to wait before trying to flush again
+const BATCH_SIZE = 100; // Preferred size of each batch to write to the database
+const MAX_LOGS_PER_INTERVAL = 1000; // Maximum number of logs to flush in a single interval
+
+const flushLogsToDb = async () => {
+  if (spend_logs.length >= MIN_LOGS) {
+    // Limit the logs to process in this interval to MAX_LOGS_PER_INTERVAL or less
+    const logsToProcess = spend_logs.slice(0, MAX_LOGS_PER_INTERVAL);
+  
+    for (let i = 0; i < logsToProcess.length; i += BATCH_SIZE) {
+      // Create subarray for current batch, ensuring it doesn't exceed the BATCH_SIZE
+      const batch = logsToProcess.slice(i, i + BATCH_SIZE);
+
+      // Convert datetime strings to Date objects
+      const batchWithDates = batch.map(entry => ({
+        ...entry,
+        startTime: new Date(entry.startTime),
+        endTime: new Date(entry.endTime),
+        // Repeat for any other DateTime fields you may have
+      }));
+
+      await prisma.liteLLM_SpendLogs.createMany({
+        data: batchWithDates,
+      });
+
+      console.log(`Flushed ${batch.length} logs to the DB.`);
+    }
+
+    // Remove the processed logs from spend_logs
+    spend_logs = spend_logs.slice(logsToProcess.length);
+    
+    console.log(`${logsToProcess.length} logs processed. Remaining in queue: ${spend_logs.length}`);
+  } else {
+    // This will ensure it doesn't falsely claim "No logs to flush." when it's merely below the MIN_LOGS threshold.
+    if(spend_logs.length > 0) {
+      console.log(`Accumulating logs. Currently at ${spend_logs.length}, waiting for at least ${MIN_LOGS}.`);
+    } else {
+      console.log("No logs to flush.");
+    }
+  }
+};
+
+// Setup interval for attempting to flush the logs
+setInterval(flushLogsToDb, FLUSH_INTERVAL);
+
+// Route to receive log messages
+app.post('/spend/update', async (c) => {
+  const incomingLogs = await c.req.json<LiteLLM_SpendLogs[]>();
+  
+  spend_logs.push(...incomingLogs);
+
+  console.log(`Received and stored ${incomingLogs.length} logs. Total logs in memory: ${spend_logs.length}`);
+  
+  return c.json({ message: `Successfully stored ${incomingLogs.length} logs` });
+});
+
+
+
+const port = 3000
+console.log(`Server is running on port ${port}`)
+
+serve({
+  fetch: app.fetch,
+  port
+})
--- a/litellm-js/spend-logs/tsconfig.json
+++ b/litellm-js/spend-logs/tsconfig.json
@ -0,0 +1,13 @@
+{
+  "compilerOptions": {
+    "target": "ESNext",
+    "module": "ESNext",
+    "moduleResolution": "Bundler",
+    "strict": true,
+    "types": [
+      "node"
+    ],
+    "jsx": "react-jsx",
+    "jsxImportSource": "hono/jsx",
+  }
+}
--- a/litellm/init.py
+++ b/litellm/init.py
@ -1,6 +1,6 @@
 ### INIT VARIABLES ###
 import threading, requests, os
-from typing import Callable, List, Optional, Dict, Union, Any
+from typing import Callable, List, Optional, Dict, Union, Any, Literal
 from litellm.caching import Cache
 from litellm._logging import set_verbose, _turn_on_debug, verbose_logger
 from litellm.proxy._types import KeyManagementSystem, KeyManagementSettings
@ -56,6 +56,7 @@ baseten_key: Optional[str] = None
 aleph_alpha_key: Optional[str] = None
 nlp_cloud_key: Optional[str] = None
 use_client: bool = False
+disable_streaming_logging: bool = False
 ### GUARDRAILS ###
 llamaguard_model_name: Optional[str] = None
 presidio_ad_hoc_recognizers: Optional[str] = None
@ -63,6 +64,7 @@ google_moderation_confidence_threshold: Optional[float] = None
 llamaguard_unsafe_content_categories: Optional[str] = None
 blocked_user_list: Optional[Union[str, List]] = None
 banned_keywords_list: Optional[Union[str, List]] = None
+llm_guard_mode: Literal["all", "key-specific"] = "all"
 ##################
 logging: bool = True
 caching: bool = (
@ -172,6 +174,7 @@ upperbound_key_generate_params: Optional[Dict] = None
 default_user_params: Optional[Dict] = None
 default_team_settings: Optional[List] = None
 max_user_budget: Optional[float] = None
+max_end_user_budget: Optional[float] = None
 #### RELIABILITY ####
 request_timeout: Optional[float] = 6000
 num_retries: Optional[int] = None  # per model endpoint
--- a/litellm/caching.py
+++ b/litellm/caching.py
@ -38,6 +38,9 @@ class BaseCache:
    async def async_get_cache(self, key, **kwargs):
        raise NotImplementedError

+    async def batch_cache_write(self, result, *args, **kwargs):
+        raise NotImplementedError
+
    async def disconnect(self):
        raise NotImplementedError

@ -96,7 +99,9 @@ class InMemoryCache(BaseCache):
 class RedisCache(BaseCache):
    # if users don't provider one, use the default litellm cache

-    def __init__(self, host=None, port=None, password=None, **kwargs):
+    def __init__(
+        self, host=None, port=None, password=None, redis_flush_size=100, **kwargs
+    ):
        from ._redis import get_redis_client, get_redis_connection_pool

        redis_kwargs = {}
@ -111,6 +116,10 @@ class RedisCache(BaseCache):
        self.redis_client = get_redis_client(**redis_kwargs)
        self.redis_kwargs = redis_kwargs
        self.async_redis_conn_pool = get_redis_connection_pool(**redis_kwargs)
+
+        # for high traffic, we store the redis results in memory and then batch write to redis
+        self.redis_batch_writing_buffer = []
+        self.redis_flush_size = redis_flush_size
        self.redis_version = "Unknown"
        try:
            self.redis_version = self.redis_client.info()["redis_version"]
@ -161,8 +170,10 @@ class RedisCache(BaseCache):
                )
            except Exception as e:
                # NON blocking - notify users Redis is throwing an exception
-                print_verbose(
-                    f"LiteLLM Redis Caching: async set() - Got exception from REDIS : {str(e)}"
+                verbose_logger.error(
+                    "LiteLLM Redis Caching: async set() - Got exception from REDIS %s, Writing value=%s",
+                    str(e),
+                    value,
                )
                traceback.print_exc()

@ -191,7 +202,27 @@ class RedisCache(BaseCache):
            # Optionally, you could process 'results' to make sure that all set operations were successful.
            return results
        except Exception as e:
-            print_verbose(f"Error occurred in pipeline write - {str(e)}")
+            verbose_logger.error(
+                "LiteLLM Redis Caching: async set_cache_pipeline() - Got exception from REDIS %s, Writing value=%s",
+                str(e),
+                cache_value,
+            )
+            traceback.print_exc()
+
+    async def batch_cache_write(self, key, value, **kwargs):
+        print_verbose(
+            f"in batch cache writing for redis buffer size={len(self.redis_batch_writing_buffer)}",
+        )
+        self.redis_batch_writing_buffer.append((key, value))
+        if len(self.redis_batch_writing_buffer) >= self.redis_flush_size:
+            await self.flush_cache_buffer()
+
+    async def flush_cache_buffer(self):
+        print_verbose(
+            f"flushing to redis....reached size of buffer {len(self.redis_batch_writing_buffer)}"
+        )
+        await self.async_set_cache_pipeline(self.redis_batch_writing_buffer)
+        self.redis_batch_writing_buffer = []

    def _get_cache_logic(self, cached_response: Any):
        """
@ -287,6 +318,9 @@ class RedisCache(BaseCache):
    def flush_cache(self):
        self.redis_client.flushall()

+    def flushall(self):
+        self.redis_client.flushall()
+
    async def disconnect(self):
        await self.async_redis_conn_pool.disconnect(inuse_connections=True)

@ -874,6 +908,7 @@ class Cache:
        port: Optional[str] = None,
        password: Optional[str] = None,
        namespace: Optional[str] = None,
+        ttl: Optional[float] = None,
        similarity_threshold: Optional[float] = None,
        supported_call_types: Optional[
            List[
@ -908,6 +943,7 @@ class Cache:
        s3_path: Optional[str] = None,
        redis_semantic_cache_use_async=False,
        redis_semantic_cache_embedding_model="text-embedding-ada-002",
+        redis_flush_size=None,
        **kwargs,
    ):
        """
@ -930,7 +966,9 @@ class Cache:
            None. Cache is set as a litellm param
        """
        if type == "redis":
-            self.cache: BaseCache = RedisCache(host, port, password, **kwargs)
+            self.cache: BaseCache = RedisCache(
+                host, port, password, redis_flush_size, **kwargs
+            )
        elif type == "redis-semantic":
            self.cache = RedisSemanticCache(
                host,
@ -967,6 +1005,8 @@ class Cache:
        self.supported_call_types = supported_call_types  # default to ["completion", "acompletion", "embedding", "aembedding"]
        self.type = type
        self.namespace = namespace
+        self.redis_flush_size = redis_flush_size
+        self.ttl = ttl

    def get_cache_key(self, *args, **kwargs):
        """
@ -1206,6 +1246,9 @@ class Cache:
                if isinstance(result, OpenAIObject):
                    result = result.model_dump_json()

+                ## DEFAULT TTL ##
+                if self.ttl is not None:
+                    kwargs["ttl"] = self.ttl
                ## Get Cache-Controls ##
                if kwargs.get("cache", None) is not None and isinstance(
                    kwargs.get("cache"), dict
@ -1213,6 +1256,7 @@ class Cache:
                    for k, v in kwargs.get("cache").items():
                        if k == "ttl":
                            kwargs["ttl"] = v
+
                cached_data = {"timestamp": time.time(), "response": result}
                return cache_key, cached_data, kwargs
            else:
@ -1246,10 +1290,14 @@ class Cache:
        Async implementation of add_cache
        """
        try:
-            cache_key, cached_data, kwargs = self._add_cache_logic(
-                result=result, *args, **kwargs
-            )
-            await self.cache.async_set_cache(cache_key, cached_data, **kwargs)
+            if self.type == "redis" and self.redis_flush_size is not None:
+                # high traffic - fill in results in memory and then flush
+                await self.batch_cache_write(result, *args, **kwargs)
+            else:
+                cache_key, cached_data, kwargs = self._add_cache_logic(
+                    result=result, *args, **kwargs
+                )
+                await self.cache.async_set_cache(cache_key, cached_data, **kwargs)
        except Exception as e:
            print_verbose(f"LiteLLM Cache: Excepton add_cache: {str(e)}")
            traceback.print_exc()
@ -1287,6 +1335,12 @@ class Cache:
            print_verbose(f"LiteLLM Cache: Excepton add_cache: {str(e)}")
            traceback.print_exc()

+    async def batch_cache_write(self, result, *args, **kwargs):
+        cache_key, cached_data, kwargs = self._add_cache_logic(
+            result=result, *args, **kwargs
+        )
+        await self.cache.batch_cache_write(cache_key, cached_data, **kwargs)
+
    async def ping(self):
        if hasattr(self.cache, "ping"):
            return await self.cache.ping()
--- a/litellm/integrations/athina.py
+++ b/litellm/integrations/athina.py
@ -10,7 +10,7 @@ class AthinaLogger:
            "Content-Type": "application/json"
        }
        self.athina_logging_url = "https://log.athina.ai/api/v1/log/inference"
-        self.additional_keys = ["environment", "prompt_slug", "customer_id", "customer_user_id", "session_id", "external_reference_id", "context", "expected_response"]
+        self.additional_keys = ["environment", "prompt_slug", "customer_id", "customer_user_id", "session_id", "external_reference_id", "context", "expected_response", "user_query"]

    def log_event(self, kwargs, response_obj, start_time, end_time, print_verbose):
        import requests
@ -32,8 +32,6 @@ class AthinaLogger:

            if "messages" in kwargs:
                data["prompt"] = kwargs.get("messages", None)
-                if kwargs.get("messages") and len(kwargs.get("messages")) > 0:
-                    data["user_query"] = kwargs.get("messages")[0].get("content", None)

            # Directly add tools or functions if present
            optional_params = kwargs.get("optional_params", {})
--- a/litellm/integrations/custom_logger.py
+++ b/litellm/integrations/custom_logger.py
@ -72,7 +72,12 @@ class CustomLogger:  # https://docs.litellm.ai/docs/observability/custom_callbac
    ):
        pass

-    async def async_moderation_hook(self, data: dict):
+    async def async_moderation_hook(
+        self,
+        data: dict,
+        user_api_key_dict: UserAPIKeyAuth,
+        call_type: Literal["completion", "embeddings", "image_generation"],
+    ):
        pass

    async def async_post_call_streaming_hook(
--- a/litellm/integrations/langfuse.py
+++ b/litellm/integrations/langfuse.py
@ -246,13 +246,13 @@ class LangFuseLogger:
                metadata_tags = metadata.get("tags", [])
                tags = metadata_tags

-            generation_name = metadata.get("generation_name", None)
-            if generation_name is None:
-                # just log `litellm-{call_type}` as the generation name
-                generation_name = f"litellm-{kwargs.get('call_type', 'completion')}"
+            trace_name = metadata.get("trace_name", None)
+            if trace_name is None:
+                # just log `litellm-{call_type}` as the trace name
+                trace_name = f"litellm-{kwargs.get('call_type', 'completion')}"

            trace_params = {
-                "name": generation_name,
+                "name": trace_name,
                "input": input,
                "user_id": metadata.get("trace_user_id", user_id),
                "id": metadata.get("trace_id", None),
@ -311,6 +311,11 @@ class LangFuseLogger:
                    "completion_tokens": response_obj["usage"]["completion_tokens"],
                    "total_cost": cost if supports_costs else None,
                }
+            generation_name = metadata.get("generation_name", None)
+            if generation_name is None:
+                # just log `litellm-{call_type}` as the generation name
+                generation_name = f"litellm-{kwargs.get('call_type', 'completion')}"
+
            generation_params = {
                "name": generation_name,
                "id": metadata.get("generation_id", generation_id),
--- a/litellm/llms/anthropic.py
+++ b/litellm/llms/anthropic.py
@ -131,18 +131,24 @@ def completion(
        )
    else:
        # Separate system prompt from rest of message
-        system_prompt_idx: Optional[int] = None
+        system_prompt_indices = []
+        system_prompt = ""
        for idx, message in enumerate(messages):
            if message["role"] == "system":
-                optional_params["system"] = message["content"]
-                system_prompt_idx = idx
-                break
-        if system_prompt_idx is not None:
-            messages.pop(system_prompt_idx)
+                system_prompt += message["content"]
+                system_prompt_indices.append(idx)
+        if len(system_prompt_indices) > 0:
+            for idx in reversed(system_prompt_indices):
+                messages.pop(idx)
+        if len(system_prompt) > 0:
+            optional_params["system"] = system_prompt
        # Format rest of message according to anthropic guidelines
-        messages = prompt_factory(
-            model=model, messages=messages, custom_llm_provider="anthropic"
-        )
+        try:
+            messages = prompt_factory(
+                model=model, messages=messages, custom_llm_provider="anthropic"
+            )
+        except Exception as e:
+            raise AnthropicError(status_code=400, message=str(e))

    ## Load Config
    config = litellm.AnthropicConfig.get_config()
@ -295,7 +301,7 @@ def completion(
                )
                streaming_choice.delta = delta_obj
                streaming_model_response.choices = [streaming_choice]
-                completion_stream = model_response_iterator(
+                completion_stream = ModelResponseIterator(
                    model_response=streaming_model_response
                )
                print_verbose(
@ -324,8 +330,30 @@ def completion(
        return model_response


-def model_response_iterator(model_response):
-    yield model_response
+class ModelResponseIterator:
+    def __init__(self, model_response):
+        self.model_response = model_response
+        self.is_done = False
+
+    # Sync iterator
+    def __iter__(self):
+        return self
+
+    def __next__(self):
+        if self.is_done:
+            raise StopIteration
+        self.is_done = True
+        return self.model_response
+
+    # Async iterator
+    def __aiter__(self):
+        return self
+
+    async def __anext__(self):
+        if self.is_done:
+            raise StopAsyncIteration
+        self.is_done = True
+        return self.model_response


 def embedding():
--- a/litellm/llms/bedrock.py
+++ b/litellm/llms/bedrock.py
@ -11,6 +11,7 @@ from .prompt_templates.factory import (
    construct_tool_use_system_prompt,
    extract_between_tags,
    parse_xml_params,
+    contains_tag,
 )
 import httpx

@ -78,11 +79,13 @@ class AmazonTitanConfig:

 class AmazonAnthropicClaude3Config:
    """
-    Reference: https://us-west-2.console.aws.amazon.com/bedrock/home?region=us-west-2#/providers?model=claude
+    Reference:
+        https://us-west-2.console.aws.amazon.com/bedrock/home?region=us-west-2#/providers?model=claude
+        https://docs.anthropic.com/claude/docs/models-overview#model-comparison

    Supported Params for the Amazon / Anthropic Claude 3 models:

-    - `max_tokens` Required (integer) max tokens,
+    - `max_tokens` Required (integer) max tokens. Default is 4096
    - `anthropic_version` Required (string) version of anthropic for bedrock - e.g. "bedrock-2023-05-31"
    - `system` Optional (string) the system prompt, conversion from openai format to this is handled in factory.py
    - `temperature` Optional (float) The amount of randomness injected into the response
@ -91,7 +94,7 @@ class AmazonAnthropicClaude3Config:
    - `stop_sequences` Optional (List[str]) Custom text sequences that cause the model to stop generating
    """

-    max_tokens: Optional[int] = litellm.max_tokens
+    max_tokens: Optional[int] = 4096  # Opus, Sonnet, and Haiku default
    anthropic_version: Optional[str] = "bedrock-2023-05-31"
    system: Optional[str] = None
    temperature: Optional[float] = None
@ -128,7 +131,15 @@ class AmazonAnthropicClaude3Config:
        }

    def get_supported_openai_params(self):
-        return ["max_tokens", "tools", "tool_choice", "stream"]
+        return [
+            "max_tokens",
+            "tools",
+            "tool_choice",
+            "stream",
+            "stop",
+            "temperature",
+            "top_p",
+        ]

    def map_openai_params(self, non_default_params: dict, optional_params: dict):
        for param, value in non_default_params.items():
@ -679,6 +690,7 @@ def completion(
    timeout=None,
 ):
    exception_mapping_worked = False
+    _is_function_call = False
    try:
        # pop aws_secret_access_key, aws_access_key_id, aws_region_name from kwargs, since completion calls fail with them
        aws_secret_access_key = optional_params.pop("aws_secret_access_key", None)
@ -727,8 +739,10 @@ def completion(
                        system_messages.append(message["content"])
                        system_prompt_idx.append(idx)
                if len(system_prompt_idx) > 0:
-                    inference_params["system"] = '\n'.join(system_messages)
-                    messages = [i for j, i in enumerate(messages) if j not in system_prompt_idx]
+                    inference_params["system"] = "\n".join(system_messages)
+                    messages = [
+                        i for j, i in enumerate(messages) if j not in system_prompt_idx
+                    ]
                # Format rest of message according to anthropic guidelines
                messages = prompt_factory(
                    model=model, messages=messages, custom_llm_provider="anthropic"
@ -742,6 +756,7 @@ def completion(
                        inference_params[k] = v
                ## Handle Tool Calling
                if "tools" in inference_params:
+                    _is_function_call = True
                    tool_calling_system_prompt = construct_tool_use_system_prompt(
                        tools=inference_params["tools"]
                    )
@ -823,7 +838,7 @@ def completion(
        ## COMPLETION CALL
        accept = "application/json"
        contentType = "application/json"
-        if stream == True:
+        if stream == True and _is_function_call == False:
            if provider == "ai21":
                ## LOGGING
                request_str = f"""
@ -918,7 +933,9 @@ def completion(
        elif provider == "anthropic":
            if model.startswith("anthropic.claude-3"):
                outputText = response_body.get("content")[0].get("text", None)
-                if "<invoke>" in outputText:  # OUTPUT PARSE FUNCTION CALL
+                if outputText is not None and contains_tag(
+                    "invoke", outputText
+                ):  # OUTPUT PARSE FUNCTION CALL
                    function_name = extract_between_tags("tool_name", outputText)[0]
                    function_arguments_str = extract_between_tags("invoke", outputText)[
                        0
@ -941,6 +958,56 @@ def completion(
                        content=None,
                    )
                    model_response.choices[0].message = _message  # type: ignore
+                if _is_function_call == True and stream is not None and stream == True:
+                    print_verbose(
+                        f"INSIDE BEDROCK STREAMING TOOL CALLING CONDITION BLOCK"
+                    )
+                    # return an iterator
+                    streaming_model_response = ModelResponse(stream=True)
+                    streaming_model_response.choices[0].finish_reason = (
+                        model_response.choices[0].finish_reason
+                    )
+                    # streaming_model_response.choices = [litellm.utils.StreamingChoices()]
+                    streaming_choice = litellm.utils.StreamingChoices()
+                    streaming_choice.index = model_response.choices[0].index
+                    _tool_calls = []
+                    print_verbose(
+                        f"type of model_response.choices[0]: {type(model_response.choices[0])}"
+                    )
+                    print_verbose(f"type of streaming_choice: {type(streaming_choice)}")
+                    if isinstance(model_response.choices[0], litellm.Choices):
+                        if getattr(
+                            model_response.choices[0].message, "tool_calls", None
+                        ) is not None and isinstance(
+                            model_response.choices[0].message.tool_calls, list
+                        ):
+                            for tool_call in model_response.choices[
+                                0
+                            ].message.tool_calls:
+                                _tool_call = {**tool_call.dict(), "index": 0}
+                                _tool_calls.append(_tool_call)
+                        delta_obj = litellm.utils.Delta(
+                            content=getattr(
+                                model_response.choices[0].message, "content", None
+                            ),
+                            role=model_response.choices[0].message.role,
+                            tool_calls=_tool_calls,
+                        )
+                        streaming_choice.delta = delta_obj
+                        streaming_model_response.choices = [streaming_choice]
+                        completion_stream = model_response_iterator(
+                            model_response=streaming_model_response
+                        )
+                        print_verbose(
+                            f"Returns anthropic CustomStreamWrapper with 'cached_response' streaming object"
+                        )
+                        return litellm.CustomStreamWrapper(
+                            completion_stream=completion_stream,
+                            model=model,
+                            custom_llm_provider="cached_response",
+                            logging_obj=logging_obj,
+                        )
+
                model_response["finish_reason"] = response_body["stop_reason"]
                _usage = litellm.Usage(
                    prompt_tokens=response_body["usage"]["input_tokens"],
@ -1029,6 +1096,10 @@ def completion(
            raise BedrockError(status_code=500, message=traceback.format_exc())


+async def model_response_iterator(model_response):
+    yield model_response
+
+
 def _embedding_func_single(
    model: str,
    input: str,
--- a/litellm/llms/custom_httpx/httpx_handler.py
+++ b/litellm/llms/custom_httpx/httpx_handler.py
@ -0,0 +1,38 @@
+from typing import Optional
+import httpx
+
+
+class HTTPHandler:
+    def __init__(self, concurrent_limit=1000):
+        # Create a client with a connection pool
+        self.client = httpx.AsyncClient(
+            limits=httpx.Limits(
+                max_connections=concurrent_limit,
+                max_keepalive_connections=concurrent_limit,
+            )
+        )
+
+    async def close(self):
+        # Close the client when you're done with it
+        await self.client.aclose()
+
+    async def get(
+        self, url: str, params: Optional[dict] = None, headers: Optional[dict] = None
+    ):
+        response = await self.client.get(url, params=params, headers=headers)
+        return response
+
+    async def post(
+        self,
+        url: str,
+        data: Optional[dict] = None,
+        params: Optional[dict] = None,
+        headers: Optional[dict] = None,
+    ):
+        try:
+            response = await self.client.post(
+                url, data=data, params=params, headers=headers
+            )
+            return response
+        except Exception as e:
+            raise e
--- a/litellm/llms/gemini.py
+++ b/litellm/llms/gemini.py
@ -118,7 +118,7 @@ def completion(
    logger_fn=None,
 ):
    try:
-        import google.generativeai as genai
+        import google.generativeai as genai  # type: ignore
    except:
        raise Exception(
            "Importing google.generativeai failed, please run 'pip install -q google-generativeai"
@ -308,7 +308,7 @@ async def async_completion(
    messages,
    encoding,
 ):
-    import google.generativeai as genai
+    import google.generativeai as genai  # type: ignore

    response = await _model.generate_content_async(
        contents=prompt,
--- a/litellm/llms/ollama.py
+++ b/litellm/llms/ollama.py
@ -68,9 +68,9 @@ class OllamaConfig:
    repeat_last_n: Optional[int] = None
    repeat_penalty: Optional[float] = None
    temperature: Optional[float] = None
-    stop: Optional[
-        list
-    ] = None  # stop is a list based on this - https://github.com/jmorganca/ollama/pull/442
+    stop: Optional[list] = (
+        None  # stop is a list based on this - https://github.com/jmorganca/ollama/pull/442
+    )
    tfs_z: Optional[float] = None
    num_predict: Optional[int] = None
    top_k: Optional[int] = None
@ -344,9 +344,9 @@ async def ollama_acompletion(url, data, model_response, encoding, logging_obj):


 async def ollama_aembeddings(
-    api_base="http://localhost:11434",
-    model="llama2",
-    prompt="Why is the sky blue?",
+    api_base: str,
+    model: str,
+    prompts: list,
    optional_params=None,
    logging_obj=None,
    model_response=None,
@ -365,51 +365,56 @@ async def ollama_aembeddings(
        ):  # completion(top_k=3) > cohere_config(top_k=3) <- allows for dynamic variables to be passed in
            optional_params[k] = v

-    data = {
-        "model": model,
-        "prompt": prompt,
-    }
-    ## LOGGING
-    logging_obj.pre_call(
-        input=None,
-        api_key=None,
-        additional_args={"api_base": url, "complete_input_dict": data, "headers": {}},
-    )
+    total_input_tokens = 0
+    output_data = []
    timeout = aiohttp.ClientTimeout(total=litellm.request_timeout)  # 10 minutes
    async with aiohttp.ClientSession(timeout=timeout) as session:
-        response = await session.post(url, json=data)
-
-        if response.status != 200:
-            text = await response.text()
-            raise OllamaError(status_code=response.status, message=text)
-
-        ## LOGGING
-        logging_obj.post_call(
-            input=prompt,
-            api_key="",
-            original_response=response.text,
-            additional_args={
-                "headers": None,
-                "api_base": api_base,
-            },
-        )
-
-        response_json = await response.json()
-        embeddings = response_json["embedding"]
-        ## RESPONSE OBJECT
-        output_data = []
-        for idx, embedding in enumerate(embeddings):
-            output_data.append(
-                {"object": "embedding", "index": idx, "embedding": embedding}
+        for idx, prompt in enumerate(prompts):
+            data = {
+                "model": model,
+                "prompt": prompt,
+            }
+            ## LOGGING
+            logging_obj.pre_call(
+                input=None,
+                api_key=None,
+                additional_args={
+                    "api_base": url,
+                    "complete_input_dict": data,
+                    "headers": {},
+                },
            )
-        model_response["object"] = "list"
-        model_response["data"] = output_data
-        model_response["model"] = model

-        input_tokens = len(encoding.encode(prompt))
+            response = await session.post(url, json=data)
+            if response.status != 200:
+                text = await response.text()
+                raise OllamaError(status_code=response.status, message=text)

-        model_response["usage"] = {
-            "prompt_tokens": input_tokens,
-            "total_tokens": input_tokens,
-        }
-        return model_response
+            ## LOGGING
+            logging_obj.post_call(
+                input=prompt,
+                api_key="",
+                original_response=response.text,
+                additional_args={
+                    "headers": None,
+                    "api_base": api_base,
+                },
+            )
+
+            response_json = await response.json()
+            embeddings: list[float] = response_json["embedding"]
+            output_data.append(
+                {"object": "embedding", "index": idx, "embedding": embeddings}
+            )
+
+            input_tokens = len(encoding.encode(prompt))
+            total_input_tokens += input_tokens
+
+    model_response["object"] = "list"
+    model_response["data"] = output_data
+    model_response["model"] = model
+    model_response["usage"] = {
+        "prompt_tokens": total_input_tokens,
+        "total_tokens": total_input_tokens,
+    }
+    return model_response
--- a/litellm/llms/ollama_chat.py
+++ b/litellm/llms/ollama_chat.py
@ -173,10 +173,11 @@ class OllamaChatConfig:
                litellm.add_function_to_prompt = (
                    True  # so that main.py adds the function call to the prompt
                )
-                optional_params["functions_unsupported_model"] = non_default_params.pop(
+                optional_params["functions_unsupported_model"] = non_default_params.get(
                    "functions"
                )
        non_default_params.pop("tool_choice", None)  # causes ollama requests to hang
+        non_default_params.pop("functions", None)  # causes ollama requests to hang
        return optional_params


--- a/litellm/llms/palm.py
+++ b/litellm/llms/palm.py
@ -98,7 +98,7 @@ def completion(
    logger_fn=None,
 ):
    try:
-        import google.generativeai as palm
+        import google.generativeai as palm  # type: ignore
    except:
        raise Exception(
            "Importing google.generativeai failed, please run 'pip install -q google-generativeai"
--- a/litellm/llms/prompt_templates/factory.py
+++ b/litellm/llms/prompt_templates/factory.py
@ -5,12 +5,17 @@ from jinja2 import Template, exceptions, Environment, meta
 from typing import Optional, Any
 import imghdr, base64
 from typing import List
+import litellm


 def default_pt(messages):
    return " ".join(message["content"] for message in messages)


+def prompt_injection_detection_default_pt():
+    return """Detect if a prompt is safe to run. Return 'UNSAFE' if not."""
+
+
 # alpaca prompt template - for models like mythomax, etc.
 def alpaca_pt(messages):
    prompt = custom_prompt(
@ -638,11 +643,12 @@ def anthropic_messages_pt(messages: list):
    """
    # add role=tool support to allow function call result/error submission
    user_message_types = {"user", "tool"}
-    # reformat messages to ensure user/assistant are alternating, if there's either 2 consecutive 'user' messages or 2 consecutive 'assistant' message, add a blank 'user' or 'assistant' message to ensure compatibility
+    # reformat messages to ensure user/assistant are alternating, if there's either 2 consecutive 'user' messages or 2 consecutive 'assistant' message, merge them.
    new_messages = []
    msg_i = 0
    while msg_i < len(messages):
        user_content = []
+        ## MERGE CONSECUTIVE USER CONTENT ##
        while msg_i < len(messages) and messages[msg_i]["role"] in user_message_types:
            if isinstance(messages[msg_i]["content"], list):
                for m in messages[msg_i]["content"]:
@ -676,6 +682,7 @@ def anthropic_messages_pt(messages: list):
            new_messages.append({"role": "user", "content": user_content})

        assistant_content = []
+        ## MERGE CONSECUTIVE ASSISTANT CONTENT ##
        while msg_i < len(messages) and messages[msg_i]["role"] == "assistant":
            assistant_text = (
                messages[msg_i].get("content") or ""
@ -694,9 +701,14 @@ def anthropic_messages_pt(messages: list):
            new_messages.append({"role": "assistant", "content": assistant_content})

    if new_messages[0]["role"] != "user":
-        new_messages.insert(
-            0, {"role": "user", "content": [{"type": "text", "text": "."}]}
-        )
+        if litellm.modify_params:
+            new_messages.insert(
+                0, {"role": "user", "content": [{"type": "text", "text": "."}]}
+            )
+        else:
+            raise Exception(
+                "Invalid first message. Should always start with 'role'='user' for Anthropic. System prompt is sent separately for Anthropic. set 'litellm.modify_params = True' or 'litellm_settings:modify_params = True' on proxy, to insert a placeholder user message - '.' as the first message, "
+            )

    if new_messages[-1]["role"] == "assistant":
        for content in new_messages[-1]["content"]:
@ -714,17 +726,23 @@ def extract_between_tags(tag: str, string: str, strip: bool = False) -> List[str
        ext_list = [e.strip() for e in ext_list]
    return ext_list

+
 def contains_tag(tag: str, string: str) -> bool:
    return bool(re.search(f"<{tag}>(.+?)</{tag}>", string, re.DOTALL))

+
 def parse_xml_params(xml_content):
    root = ET.fromstring(xml_content)
    params = {}
    for child in root.findall(".//parameters/*"):
-        params[child.tag] = child.text
+        try:
+            # Attempt to decode the element's text as JSON
+            params[child.tag] = json.loads(child.text)
+        except json.JSONDecodeError:
+            # If JSON decoding fails, use the original text
+            params[child.tag] = child.text
    return params

-
 ###


@ -917,7 +935,7 @@ def gemini_text_image_pt(messages: list):
    }
    """
    try:
-        import google.generativeai as genai
+        import google.generativeai as genai  # type: ignore
    except:
        raise Exception(
            "Importing google.generativeai failed, please run 'pip install -q google-generativeai"
@ -958,9 +976,7 @@ def azure_text_pt(messages: list):

 # Function call template
 def function_call_prompt(messages: list, functions: list):
-    function_prompt = (
-        """Produce JSON OUTPUT ONLY! Adhere to this format {"name": "function_name", "arguments":{"argument_name": "argument_value"}} The following functions are available to you:"""
-    )
+    function_prompt = """Produce JSON OUTPUT ONLY! Adhere to this format {"name": "function_name", "arguments":{"argument_name": "argument_value"}} The following functions are available to you:"""
    for function in functions:
        function_prompt += f"""\n{function}\n"""

--- a/litellm/llms/sagemaker.py
+++ b/litellm/llms/sagemaker.py
@ -166,6 +166,7 @@ def completion(
    aws_secret_access_key = optional_params.pop("aws_secret_access_key", None)
    aws_access_key_id = optional_params.pop("aws_access_key_id", None)
    aws_region_name = optional_params.pop("aws_region_name", None)
+    model_id = optional_params.pop("model_id", None)

    if aws_access_key_id != None:
        # uses auth params passed to completion
@ -245,15 +246,28 @@ def completion(
                model=model,
                logging_obj=logging_obj,
                data=data,
+                model_id=model_id,
+                aws_secret_access_key=aws_secret_access_key,
+                aws_access_key_id=aws_access_key_id,
+                aws_region_name=aws_region_name,
            )
            return response
-        response = client.invoke_endpoint_with_response_stream(
-            EndpointName=model,
-            ContentType="application/json",
-            Body=data,
-            CustomAttributes="accept_eula=true",
-        )

+        if model_id is not None:
+            response = client.invoke_endpoint_with_response_stream(
+                EndpointName=model,
+                InferenceComponentName=model_id,
+                ContentType="application/json",
+                Body=data,
+                CustomAttributes="accept_eula=true",
+            )
+        else:
+            response = client.invoke_endpoint_with_response_stream(
+                EndpointName=model,
+                ContentType="application/json",
+                Body=data,
+                CustomAttributes="accept_eula=true",
+            )
        return response["Body"]
    elif acompletion == True:
        _data = {"inputs": prompt, "parameters": inference_params}
@ -264,36 +278,68 @@ def completion(
            model=model,
            logging_obj=logging_obj,
            data=_data,
+            model_id=model_id,
+            aws_secret_access_key=aws_secret_access_key,
+            aws_access_key_id=aws_access_key_id,
+            aws_region_name=aws_region_name,
        )
    data = json.dumps({"inputs": prompt, "parameters": inference_params}).encode(
        "utf-8"
    )
-    ## LOGGING
-    request_str = f"""
-    response = client.invoke_endpoint(
-        EndpointName={model},
-        ContentType="application/json",
-        Body={data},
-        CustomAttributes="accept_eula=true",
-    )
-    """  # type: ignore
-    logging_obj.pre_call(
-        input=prompt,
-        api_key="",
-        additional_args={
-            "complete_input_dict": data,
-            "request_str": request_str,
-            "hf_model_name": hf_model_name,
-        },
-    )
    ## COMPLETION CALL
    try:
-        response = client.invoke_endpoint(
-            EndpointName=model,
-            ContentType="application/json",
-            Body=data,
-            CustomAttributes="accept_eula=true",
-        )
+        if model_id is not None:
+            ## LOGGING
+            request_str = f"""
+            response = client.invoke_endpoint(
+                EndpointName={model},
+                InferenceComponentName={model_id},
+                ContentType="application/json",
+                Body={data},
+                CustomAttributes="accept_eula=true",
+            )
+            """  # type: ignore
+            logging_obj.pre_call(
+                input=prompt,
+                api_key="",
+                additional_args={
+                    "complete_input_dict": data,
+                    "request_str": request_str,
+                    "hf_model_name": hf_model_name,
+                },
+            )
+            response = client.invoke_endpoint(
+                EndpointName=model,
+                InferenceComponentName=model_id,
+                ContentType="application/json",
+                Body=data,
+                CustomAttributes="accept_eula=true",
+            )
+        else:
+            ## LOGGING
+            request_str = f"""
+            response = client.invoke_endpoint(
+                EndpointName={model},
+                ContentType="application/json",
+                Body={data},
+                CustomAttributes="accept_eula=true",
+            )
+            """  # type: ignore
+            logging_obj.pre_call(
+                input=prompt,
+                api_key="",
+                additional_args={
+                    "complete_input_dict": data,
+                    "request_str": request_str,
+                    "hf_model_name": hf_model_name,
+                },
+            )
+            response = client.invoke_endpoint(
+                EndpointName=model,
+                ContentType="application/json",
+                Body=data,
+                CustomAttributes="accept_eula=true",
+            )
    except Exception as e:
        status_code = (
            getattr(e, "response", {})
@ -303,6 +349,8 @@ def completion(
        error_message = (
            getattr(e, "response", {}).get("Error", {}).get("Message", str(e))
        )
+        if "Inference Component Name header is required" in error_message:
+            error_message += "\n pass in via `litellm.completion(..., model_id={InferenceComponentName})`"
        raise SagemakerError(status_code=status_code, message=error_message)

    response = response["Body"].read().decode("utf8")
@ -357,8 +405,12 @@ async def async_streaming(
    encoding,
    model_response: ModelResponse,
    model: str,
+    model_id: Optional[str],
    logging_obj: Any,
    data,
+    aws_secret_access_key: Optional[str],
+    aws_access_key_id: Optional[str],
+    aws_region_name: Optional[str],
 ):
    """
    Use aioboto3
@ -367,11 +419,6 @@ async def async_streaming(

    session = aioboto3.Session()

-    # pop aws_secret_access_key, aws_access_key_id, aws_region_name from kwargs, since completion calls fail with them
-    aws_secret_access_key = optional_params.pop("aws_secret_access_key", None)
-    aws_access_key_id = optional_params.pop("aws_access_key_id", None)
-    aws_region_name = optional_params.pop("aws_region_name", None)
-
    if aws_access_key_id != None:
        # uses auth params passed to completion
        # aws_access_key_id is not None, assume user is trying to auth using litellm.completion
@ -398,12 +445,21 @@ async def async_streaming(

    async with _client as client:
        try:
-            response = await client.invoke_endpoint_with_response_stream(
-                EndpointName=model,
-                ContentType="application/json",
-                Body=data,
-                CustomAttributes="accept_eula=true",
-            )
+            if model_id is not None:
+                response = await client.invoke_endpoint_with_response_stream(
+                    EndpointName=model,
+                    InferenceComponentName=model_id,
+                    ContentType="application/json",
+                    Body=data,
+                    CustomAttributes="accept_eula=true",
+                )
+            else:
+                response = await client.invoke_endpoint_with_response_stream(
+                    EndpointName=model,
+                    ContentType="application/json",
+                    Body=data,
+                    CustomAttributes="accept_eula=true",
+                )
        except Exception as e:
            raise SagemakerError(status_code=500, message=f"{str(e)}")
        response = response["Body"]
@ -418,6 +474,10 @@ async def async_completion(
    model: str,
    logging_obj: Any,
    data: dict,
+    model_id: Optional[str],
+    aws_secret_access_key: Optional[str],
+    aws_access_key_id: Optional[str],
+    aws_region_name: Optional[str],
 ):
    """
    Use aioboto3
@ -426,11 +486,6 @@ async def async_completion(

    session = aioboto3.Session()

-    # pop aws_secret_access_key, aws_access_key_id, aws_region_name from kwargs, since completion calls fail with them
-    aws_secret_access_key = optional_params.pop("aws_secret_access_key", None)
-    aws_access_key_id = optional_params.pop("aws_access_key_id", None)
-    aws_region_name = optional_params.pop("aws_region_name", None)
-
    if aws_access_key_id != None:
        # uses auth params passed to completion
        # aws_access_key_id is not None, assume user is trying to auth using litellm.completion
@ -456,33 +511,63 @@ async def async_completion(
        )

    async with _client as client:
-        ## LOGGING
-        request_str = f"""
-        response = client.invoke_endpoint(
-            EndpointName={model},
-            ContentType="application/json",
-            Body={data},
-            CustomAttributes="accept_eula=true",
-        )
-        """  # type: ignore
-        logging_obj.pre_call(
-            input=data["inputs"],
-            api_key="",
-            additional_args={
-                "complete_input_dict": data,
-                "request_str": request_str,
-            },
-        )
        encoded_data = json.dumps(data).encode("utf-8")
        try:
-            response = await client.invoke_endpoint(
-                EndpointName=model,
-                ContentType="application/json",
-                Body=encoded_data,
-                CustomAttributes="accept_eula=true",
-            )
+            if model_id is not None:
+                ## LOGGING
+                request_str = f"""
+                response = client.invoke_endpoint(
+                    EndpointName={model},
+                    InferenceComponentName={model_id},
+                    ContentType="application/json",
+                    Body={data},
+                    CustomAttributes="accept_eula=true",
+                )
+                """  # type: ignore
+                logging_obj.pre_call(
+                    input=data["inputs"],
+                    api_key="",
+                    additional_args={
+                        "complete_input_dict": data,
+                        "request_str": request_str,
+                    },
+                )
+                response = await client.invoke_endpoint(
+                    EndpointName=model,
+                    InferenceComponentName=model_id,
+                    ContentType="application/json",
+                    Body=encoded_data,
+                    CustomAttributes="accept_eula=true",
+                )
+            else:
+                ## LOGGING
+                request_str = f"""
+                response = client.invoke_endpoint(
+                    EndpointName={model},
+                    ContentType="application/json",
+                    Body={data},
+                    CustomAttributes="accept_eula=true",
+                )
+                """  # type: ignore
+                logging_obj.pre_call(
+                    input=data["inputs"],
+                    api_key="",
+                    additional_args={
+                        "complete_input_dict": data,
+                        "request_str": request_str,
+                    },
+                )
+                response = await client.invoke_endpoint(
+                    EndpointName=model,
+                    ContentType="application/json",
+                    Body=encoded_data,
+                    CustomAttributes="accept_eula=true",
+                )
        except Exception as e:
-            raise SagemakerError(status_code=500, message=f"{str(e)}")
+            error_message = f"{str(e)}"
+            if "Inference Component Name header is required" in error_message:
+                error_message += "\n pass in via `litellm.completion(..., model_id={InferenceComponentName})`"
+            raise SagemakerError(status_code=500, message=error_message)
        response = await response["Body"].read()
        response = response.decode("utf8")
        ## LOGGING
--- a/litellm/llms/vertex_ai.py
+++ b/litellm/llms/vertex_ai.py
@ -289,11 +289,11 @@ def completion(
            Part,
            GenerationConfig,
        )
-        from google.cloud import aiplatform
+        from google.cloud import aiplatform  # type: ignore
        from google.protobuf import json_format  # type: ignore
        from google.protobuf.struct_pb2 import Value  # type: ignore
-        from google.cloud.aiplatform_v1beta1.types import content as gapic_content_types
-        import google.auth
+        from google.cloud.aiplatform_v1beta1.types import content as gapic_content_types  # type: ignore
+        import google.auth  # type: ignore

        ## Load credentials with the correct quota project ref: https://github.com/googleapis/python-aiplatform/issues/2557#issuecomment-1709284744
        print_verbose(
@ -783,7 +783,7 @@ async def async_completion(
            """
            Vertex AI Model Garden
            """
-            from google.cloud import aiplatform
+            from google.cloud import aiplatform  # type: ignore

            ## LOGGING
            logging_obj.pre_call(
@ -969,7 +969,7 @@ async def async_streaming(
        )
        response = llm_model.predict_streaming_async(prompt, **optional_params)
    elif mode == "custom":
-        from google.cloud import aiplatform
+        from google.cloud import aiplatform  # type: ignore

        stream = optional_params.pop("stream", None)

@ -1059,7 +1059,7 @@ def embedding(
        )

    from vertexai.language_models import TextEmbeddingModel
-    import google.auth
+    import google.auth  # type: ignore

    ## Load credentials with the correct quota project ref: https://github.com/googleapis/python-aiplatform/issues/2557#issuecomment-1709284744
    try:
--- a/litellm/main.py
+++ b/litellm/main.py
@ -115,24 +115,54 @@ class LiteLLM:
        default_headers: Optional[Mapping[str, str]] = None,
    ):
        self.params = locals()
-        self.chat = Chat(self.params)
+        self.chat = Chat(self.params, router_obj=None)


 class Chat:
-    def __init__(self, params):
+    def __init__(self, params, router_obj: Optional[Any]):
        self.params = params
-        self.completions = Completions(self.params)
+        if self.params.get("acompletion", False) == True:
+            self.params.pop("acompletion")
+            self.completions: Union[AsyncCompletions, Completions] = AsyncCompletions(
+                self.params, router_obj=router_obj
+            )
+        else:
+            self.completions = Completions(self.params, router_obj=router_obj)


 class Completions:
-    def __init__(self, params):
+    def __init__(self, params, router_obj: Optional[Any]):
        self.params = params
+        self.router_obj = router_obj

    def create(self, messages, model=None, **kwargs):
        for k, v in kwargs.items():
            self.params[k] = v
        model = model or self.params.get("model")
-        response = completion(model=model, messages=messages, **self.params)
+        if self.router_obj is not None:
+            response = self.router_obj.completion(
+                model=model, messages=messages, **self.params
+            )
+        else:
+            response = completion(model=model, messages=messages, **self.params)
+        return response
+
+
+class AsyncCompletions:
+    def __init__(self, params, router_obj: Optional[Any]):
+        self.params = params
+        self.router_obj = router_obj
+
+    async def create(self, messages, model=None, **kwargs):
+        for k, v in kwargs.items():
+            self.params[k] = v
+        model = model or self.params.get("model")
+        if self.router_obj is not None:
+            response = await self.router_obj.acompletion(
+                model=model, messages=messages, **self.params
+            )
+        else:
+            response = await acompletion(model=model, messages=messages, **self.params)
        return response


@ -571,6 +601,7 @@ def completion(
        "ttl",
        "cache",
        "no-log",
+        "base_model",
    ]
    default_params = openai_params + litellm_params
    non_default_params = {
@ -639,7 +670,7 @@ def completion(
        elif (
            input_cost_per_second is not None
        ):  # time based pricing just needs cost in place
-            output_cost_per_second = output_cost_per_second or 0.0
+            output_cost_per_second = output_cost_per_second
            litellm.register_model(
                {
                    f"{custom_llm_provider}/{model}": {
@ -1752,7 +1783,11 @@ def completion(
                timeout=timeout,
            )

-            if "stream" in optional_params and optional_params["stream"] == True:
+            if (
+                "stream" in optional_params
+                and optional_params["stream"] == True
+                and not isinstance(response, CustomStreamWrapper)
+            ):
                # don't try to access stream object,
                if "ai21" in model:
                    response = CustomStreamWrapper(
@ -2754,28 +2789,25 @@ def embedding(
                model_response=EmbeddingResponse(),
            )
        elif custom_llm_provider == "ollama":
-            ollama_input = None
-            if isinstance(input, list) and len(input) > 1:
-                raise litellm.BadRequestError(
-                    message=f"Ollama Embeddings don't support batch embeddings",
-                    model=model,  # type: ignore
-                    llm_provider="ollama",  # type: ignore
-                )
-            if isinstance(input, list) and len(input) == 1:
-                ollama_input = "".join(input[0])
-            elif isinstance(input, str):
-                ollama_input = input
-            else:
+            api_base = (
+                litellm.api_base
+                or api_base
+                or get_secret("OLLAMA_API_BASE")
+                or "http://localhost:11434"
+            )
+            if isinstance(input, str):
+                input = [input]
+            if not all(isinstance(item, str) for item in input):
                raise litellm.BadRequestError(
                    message=f"Invalid input for ollama embeddings. input={input}",
                    model=model,  # type: ignore
                    llm_provider="ollama",  # type: ignore
                )
-
-            if aembedding == True:
+            if aembedding:
                response = ollama.ollama_aembeddings(
+                    api_base=api_base,
                    model=model,
-                    prompt=ollama_input,
+                    prompts=input,
                    encoding=encoding,
                    logging_obj=logging,
                    optional_params=optional_params,
--- a/litellm/model_prices_and_context_window_backup.json
+++ b/litellm/model_prices_and_context_window_backup.json
--- a/litellm/proxy/_experimental/out/404.html
+++ b/litellm/proxy/_experimental/out/404.html
--- a/litellm/proxy/_experimental/out/_next/static/DptMjzo5xd96cx0b56k4u/_buildManifest.js
+++ b/litellm/proxy/_experimental/out/_next/static/DptMjzo5xd96cx0b56k4u/_buildManifest.js
--- a/litellm/proxy/_experimental/out/_next/static/DptMjzo5xd96cx0b56k4u/_ssgManifest.js
+++ b/litellm/proxy/_experimental/out/_next/static/DptMjzo5xd96cx0b56k4u/_ssgManifest.js
--- a/litellm/proxy/_experimental/out/_next/static/chunks/69-589b47e7a69d316f.js
+++ b/litellm/proxy/_experimental/out/_next/static/chunks/69-589b47e7a69d316f.js
--- a/litellm/proxy/_experimental/out/_next/static/chunks/730-1411b729a1c79695.js
+++ b/litellm/proxy/_experimental/out/_next/static/chunks/730-1411b729a1c79695.js
--- a/litellm/proxy/_experimental/out/_next/static/chunks/798-4baed68da0c5497d.js
+++ b/litellm/proxy/_experimental/out/_next/static/chunks/798-4baed68da0c5497d.js
--- a/litellm/proxy/_experimental/out/_next/static/chunks/app/_not-found-f5ff8fa2e73dffb1.js
+++ b/litellm/proxy/_experimental/out/_next/static/chunks/app/_not-found-f5ff8fa2e73dffb1.js
--- a/litellm/proxy/_experimental/out/_next/static/chunks/app/layout-bdf72a417c5e4f1f.js
+++ b/litellm/proxy/_experimental/out/_next/static/chunks/app/layout-bdf72a417c5e4f1f.js
--- a/litellm/proxy/_experimental/out/_next/static/chunks/app/page-a5a04da2a9356785.js
+++ b/litellm/proxy/_experimental/out/_next/static/chunks/app/page-a5a04da2a9356785.js
--- a/litellm/proxy/_experimental/out/_next/static/chunks/app/page-df9015da04018cc1.js
+++ b/litellm/proxy/_experimental/out/_next/static/chunks/app/page-df9015da04018cc1.js
--- a/litellm/proxy/_experimental/out/_next/static/chunks/fd9d1056-a507ee9e75a3be72.js
+++ b/litellm/proxy/_experimental/out/_next/static/chunks/fd9d1056-a507ee9e75a3be72.js
--- a/litellm/proxy/_experimental/out/_next/static/chunks/webpack-d1ad37b1875df240.js
+++ b/litellm/proxy/_experimental/out/_next/static/chunks/webpack-d1ad37b1875df240.js
@ -1 +1 @@
-!function(){"use strict";var e,t,n,r,o,u,i,c,f,a={},l={};function d(e){var t=l[e];if(void 0!==t)return t.exports;var n=l[e]={id:e,loaded:!1,exports:{}},r=!0;try{a[e](n,n.exports,d),r=!1}finally{r&&delete l[e]}return n.loaded=!0,n.exports}d.m=a,e=[],d.O=function(t,n,r,o){if(n){o=o||0;for(var u=e.length;u>0&&e[u-1][2]>o;u--)e[u]=e[u-1];e[u]=[n,r,o];return}for(var i=1/0,u=0;u<e.length;u++){for(var n=e[u][0],r=e[u][1],o=e[u][2],c=!0,f=0;f<n.length;f++)i>=o&&Object.keys(d.O).every(function(e){return d.O[e](n[f])})?n.splice(f--,1):(c=!1,o<i&&(i=o));if(c){e.splice(u--,1);var a=r();void 0!==a&&(t=a)}}return t},d.n=function(e){var t=e&&e.__esModule?function(){return e.default}:function(){return e};return d.d(t,{a:t}),t},n=Object.getPrototypeOf?function(e){return Object.getPrototypeOf(e)}:function(e){return e.__proto__},d.t=function(e,r){if(1&r&&(e=this(e)),8&r||"object"==typeof e&&e&&(4&r&&e.__esModule||16&r&&"function"==typeof e.then))return e;var o=Object.create(null);d.r(o);var u={};t=t||[null,n({}),n([]),n(n)];for(var i=2&r&&e;"object"==typeof i&&!~t.indexOf(i);i=n(i))Object.getOwnPropertyNames(i).forEach(function(t){u[t]=function(){return e[t]}});return u.default=function(){return e},d.d(o,u),o},d.d=function(e,t){for(var n in t)d.o(t,n)&&!d.o(e,n)&&Object.defineProperty(e,n,{enumerable:!0,get:t[n]})},d.f={},d.e=function(e){return Promise.all(Object.keys(d.f).reduce(function(t,n){return d.f[n](e,t),t},[]))},d.u=function(e){},d.miniCssF=function(e){return"static/css/68a21c6e6697f7ca.css"},d.g=function(){if("object"==typeof globalThis)return globalThis;try{return this||Function("return this")()}catch(e){if("object"==typeof window)return window}}(),d.o=function(e,t){return Object.prototype.hasOwnProperty.call(e,t)},r={},o="_N_E:",d.l=function(e,t,n,u){if(r[e]){r[e].push(t);return}if(void 0!==n)for(var i,c,f=document.getElementsByTagName("script"),a=0;a<f.length;a++){var l=f[a];if(l.getAttribute("src")==e||l.getAttribute("data-webpack")==o+n){i=l;break}}i||(c=!0,(i=document.createElement("script")).charset="utf-8",i.timeout=120,d.nc&&i.setAttribute("nonce",d.nc),i.setAttribute("data-webpack",o+n),i.src=d.tu(e)),r[e]=[t];var s=function(t,n){i.onerror=i.onload=null,clearTimeout(p);var o=r[e];if(delete r[e],i.parentNode&&i.parentNode.removeChild(i),o&&o.forEach(function(e){return e(n)}),t)return t(n)},p=setTimeout(s.bind(null,void 0,{type:"timeout",target:i}),12e4);i.onerror=s.bind(null,i.onerror),i.onload=s.bind(null,i.onload),c&&document.head.appendChild(i)},d.r=function(e){"undefined"!=typeof Symbol&&Symbol.toStringTag&&Object.defineProperty(e,Symbol.toStringTag,{value:"Module"}),Object.defineProperty(e,"__esModule",{value:!0})},d.nmd=function(e){return e.paths=[],e.children||(e.children=[]),e},d.tt=function(){return void 0===u&&(u={createScriptURL:function(e){return e}},"undefined"!=typeof trustedTypes&&trustedTypes.createPolicy&&(u=trustedTypes.createPolicy("nextjs#bundler",u))),u},d.tu=function(e){return d.tt().createScriptURL(e)},d.p="/ui/_next/",i={272:0},d.f.j=function(e,t){var n=d.o(i,e)?i[e]:void 0;if(0!==n){if(n)t.push(n[2]);else if(272!=e){var r=new Promise(function(t,r){n=i[e]=[t,r]});t.push(n[2]=r);var o=d.p+d.u(e),u=Error();d.l(o,function(t){if(d.o(i,e)&&(0!==(n=i[e])&&(i[e]=void 0),n)){var r=t&&("load"===t.type?"missing":t.type),o=t&&t.target&&t.target.src;u.message="Loading chunk "+e+" failed.\n("+r+": "+o+")",u.name="ChunkLoadError",u.type=r,u.request=o,n[1](u)}},"chunk-"+e,e)}else i[e]=0}},d.O.j=function(e){return 0===i[e]},c=function(e,t){var n,r,o=t[0],u=t[1],c=t[2],f=0;if(o.some(function(e){return 0!==i[e]})){for(n in u)d.o(u,n)&&(d.m[n]=u[n]);if(c)var a=c(d)}for(e&&e(t);f<o.length;f++)r=o[f],d.o(i,r)&&i[r]&&i[r][0](),i[r]=0;return d.O(a)},(f=self.webpackChunk_N_E=self.webpackChunk_N_E||[]).forEach(c.bind(null,0)),f.push=c.bind(null,f.push.bind(f))}();
+!function(){"use strict";var e,t,n,r,o,u,i,c,f,a={},l={};function d(e){var t=l[e];if(void 0!==t)return t.exports;var n=l[e]={id:e,loaded:!1,exports:{}},r=!0;try{a[e](n,n.exports,d),r=!1}finally{r&&delete l[e]}return n.loaded=!0,n.exports}d.m=a,e=[],d.O=function(t,n,r,o){if(n){o=o||0;for(var u=e.length;u>0&&e[u-1][2]>o;u--)e[u]=e[u-1];e[u]=[n,r,o];return}for(var i=1/0,u=0;u<e.length;u++){for(var n=e[u][0],r=e[u][1],o=e[u][2],c=!0,f=0;f<n.length;f++)i>=o&&Object.keys(d.O).every(function(e){return d.O[e](n[f])})?n.splice(f--,1):(c=!1,o<i&&(i=o));if(c){e.splice(u--,1);var a=r();void 0!==a&&(t=a)}}return t},d.n=function(e){var t=e&&e.__esModule?function(){return e.default}:function(){return e};return d.d(t,{a:t}),t},n=Object.getPrototypeOf?function(e){return Object.getPrototypeOf(e)}:function(e){return e.__proto__},d.t=function(e,r){if(1&r&&(e=this(e)),8&r||"object"==typeof e&&e&&(4&r&&e.__esModule||16&r&&"function"==typeof e.then))return e;var o=Object.create(null);d.r(o);var u={};t=t||[null,n({}),n([]),n(n)];for(var i=2&r&&e;"object"==typeof i&&!~t.indexOf(i);i=n(i))Object.getOwnPropertyNames(i).forEach(function(t){u[t]=function(){return e[t]}});return u.default=function(){return e},d.d(o,u),o},d.d=function(e,t){for(var n in t)d.o(t,n)&&!d.o(e,n)&&Object.defineProperty(e,n,{enumerable:!0,get:t[n]})},d.f={},d.e=function(e){return Promise.all(Object.keys(d.f).reduce(function(t,n){return d.f[n](e,t),t},[]))},d.u=function(e){},d.miniCssF=function(e){return"static/css/f8da5a6a5b29d249.css"},d.g=function(){if("object"==typeof globalThis)return globalThis;try{return this||Function("return this")()}catch(e){if("object"==typeof window)return window}}(),d.o=function(e,t){return Object.prototype.hasOwnProperty.call(e,t)},r={},o="_N_E:",d.l=function(e,t,n,u){if(r[e]){r[e].push(t);return}if(void 0!==n)for(var i,c,f=document.getElementsByTagName("script"),a=0;a<f.length;a++){var l=f[a];if(l.getAttribute("src")==e||l.getAttribute("data-webpack")==o+n){i=l;break}}i||(c=!0,(i=document.createElement("script")).charset="utf-8",i.timeout=120,d.nc&&i.setAttribute("nonce",d.nc),i.setAttribute("data-webpack",o+n),i.src=d.tu(e)),r[e]=[t];var s=function(t,n){i.onerror=i.onload=null,clearTimeout(p);var o=r[e];if(delete r[e],i.parentNode&&i.parentNode.removeChild(i),o&&o.forEach(function(e){return e(n)}),t)return t(n)},p=setTimeout(s.bind(null,void 0,{type:"timeout",target:i}),12e4);i.onerror=s.bind(null,i.onerror),i.onload=s.bind(null,i.onload),c&&document.head.appendChild(i)},d.r=function(e){"undefined"!=typeof Symbol&&Symbol.toStringTag&&Object.defineProperty(e,Symbol.toStringTag,{value:"Module"}),Object.defineProperty(e,"__esModule",{value:!0})},d.nmd=function(e){return e.paths=[],e.children||(e.children=[]),e},d.tt=function(){return void 0===u&&(u={createScriptURL:function(e){return e}},"undefined"!=typeof trustedTypes&&trustedTypes.createPolicy&&(u=trustedTypes.createPolicy("nextjs#bundler",u))),u},d.tu=function(e){return d.tt().createScriptURL(e)},d.p="/ui/_next/",i={272:0},d.f.j=function(e,t){var n=d.o(i,e)?i[e]:void 0;if(0!==n){if(n)t.push(n[2]);else if(272!=e){var r=new Promise(function(t,r){n=i[e]=[t,r]});t.push(n[2]=r);var o=d.p+d.u(e),u=Error();d.l(o,function(t){if(d.o(i,e)&&(0!==(n=i[e])&&(i[e]=void 0),n)){var r=t&&("load"===t.type?"missing":t.type),o=t&&t.target&&t.target.src;u.message="Loading chunk "+e+" failed.\n("+r+": "+o+")",u.name="ChunkLoadError",u.type=r,u.request=o,n[1](u)}},"chunk-"+e,e)}else i[e]=0}},d.O.j=function(e){return 0===i[e]},c=function(e,t){var n,r,o=t[0],u=t[1],c=t[2],f=0;if(o.some(function(e){return 0!==i[e]})){for(n in u)d.o(u,n)&&(d.m[n]=u[n]);if(c)var a=c(d)}for(e&&e(t);f<o.length;f++)r=o[f],d.o(i,r)&&i[r]&&i[r][0](),i[r]=0;return d.O(a)},(f=self.webpackChunk_N_E=self.webpackChunk_N_E||[]).forEach(c.bind(null,0)),f.push=c.bind(null,f.push.bind(f))}();
--- a/litellm/proxy/_experimental/out/_next/static/css/68a21c6e6697f7ca.css
+++ b/litellm/proxy/_experimental/out/_next/static/css/68a21c6e6697f7ca.css
--- a/litellm/proxy/_experimental/out/_next/static/css/f8da5a6a5b29d249.css
+++ b/litellm/proxy/_experimental/out/_next/static/css/f8da5a6a5b29d249.css
--- a/litellm/proxy/_experimental/out/index.html
+++ b/litellm/proxy/_experimental/out/index.html
@ -1 +1 @@
-<!DOCTYPE html><html id="__next_error__"><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width, initial-scale=1"/><link rel="preload" as="script" fetchPriority="low" href="/ui/_next/static/chunks/webpack-3b0d290a8fe6941d.js" crossorigin=""/><script src="/ui/_next/static/chunks/fd9d1056-a85b2c176012d8e5.js" async="" crossorigin=""></script><script src="/ui/_next/static/chunks/69-e1b183dda365ec86.js" async="" crossorigin=""></script><script src="/ui/_next/static/chunks/main-app-9b4fb13a7db53edf.js" async="" crossorigin=""></script><title>🚅 LiteLLM</title><meta name="description" content="LiteLLM Proxy Admin UI"/><link rel="icon" href="/ui/favicon.ico" type="image/x-icon" sizes="16x16"/><meta name="next-size-adjust"/><script src="/ui/_next/static/chunks/polyfills-c67a75d1b6f99dc8.js" crossorigin="" noModule=""></script></head><body><script src="/ui/_next/static/chunks/webpack-3b0d290a8fe6941d.js" crossorigin="" async=""></script><script>(self.__next_f=self.__next_f||[]).push([0]);self.__next_f.push([2,null])</script><script>self.__next_f.push([1,"1:HL[\"/ui/_next/static/media/c9a5bc6a7c948fb0-s.p.woff2\",\"font\",{\"crossOrigin\":\"\",\"type\":\"font/woff2\"}]\n2:HL[\"/ui/_next/static/css/68a21c6e6697f7ca.css\",\"style\",{\"crossOrigin\":\"\"}]\n0:\"$L3\"\n"])</script><script>self.__next_f.push([1,"4:I[47690,[],\"\"]\n6:I[77831,[],\"\"]\n7:I[19914,[\"730\",\"static/chunks/730-1411b729a1c79695.js\",\"931\",\"static/chunks/app/page-df9015da04018cc1.js\"],\"\"]\n8:I[5613,[],\"\"]\n9:I[31778,[],\"\"]\nb:I[48955,[],\"\"]\nc:[]\n"])</script><script>self.__next_f.push([1,"3:[[[\"$\",\"link\",\"0\",{\"rel\":\"stylesheet\",\"href\":\"/ui/_next/static/css/68a21c6e6697f7ca.css\",\"precedence\":\"next\",\"crossOrigin\":\"\"}]],[\"$\",\"$L4\",null,{\"buildId\":\"tXZFkeqtgh-goIRVbw_9q\",\"assetPrefix\":\"/ui\",\"initialCanonicalUrl\":\"/\",\"initialTree\":[\"\",{\"children\":[\"__PAGE__\",{}]},\"$undefined\",\"$undefined\",true],\"initialSeedData\":[\"\",{\"children\":[\"__PAGE__\",{},[\"$L5\",[\"$\",\"$L6\",null,{\"propsForComponent\":{\"params\":{}},\"Component\":\"$7\",\"isStaticGeneration\":true}],null]]},[null,[\"$\",\"html\",null,{\"lang\":\"en\",\"children\":[\"$\",\"body\",null,{\"className\":\"__className_c23dc8\",\"children\":[\"$\",\"$L8\",null,{\"parallelRouterKey\":\"children\",\"segmentPath\":[\"children\"],\"loading\":\"$undefined\",\"loadingStyles\":\"$undefined\",\"loadingScripts\":\"$undefined\",\"hasLoading\":false,\"error\":\"$undefined\",\"errorStyles\":\"$undefined\",\"errorScripts\":\"$undefined\",\"template\":[\"$\",\"$L9\",null,{}],\"templateStyles\":\"$undefined\",\"templateScripts\":\"$undefined\",\"notFound\":[[\"$\",\"title\",null,{\"children\":\"404: This page could not be found.\"}],[\"$\",\"div\",null,{\"style\":{\"fontFamily\":\"system-ui,\\\"Segoe UI\\\",Roboto,Helvetica,Arial,sans-serif,\\\"Apple Color Emoji\\\",\\\"Segoe UI Emoji\\\"\",\"height\":\"100vh\",\"textAlign\":\"center\",\"display\":\"flex\",\"flexDirection\":\"column\",\"alignItems\":\"center\",\"justifyContent\":\"center\"},\"children\":[\"$\",\"div\",null,{\"children\":[[\"$\",\"style\",null,{\"dangerouslySetInnerHTML\":{\"__html\":\"body{color:#000;background:#fff;margin:0}.next-error-h1{border-right:1px solid rgba(0,0,0,.3)}@media (prefers-color-scheme:dark){body{color:#fff;background:#000}.next-error-h1{border-right:1px solid rgba(255,255,255,.3)}}\"}}],[\"$\",\"h1\",null,{\"className\":\"next-error-h1\",\"style\":{\"display\":\"inline-block\",\"margin\":\"0 20px 0 0\",\"padding\":\"0 23px 0 0\",\"fontSize\":24,\"fontWeight\":500,\"verticalAlign\":\"top\",\"lineHeight\":\"49px\"},\"children\":\"404\"}],[\"$\",\"div\",null,{\"style\":{\"display\":\"inline-block\"},\"children\":[\"$\",\"h2\",null,{\"style\":{\"fontSize\":14,\"fontWeight\":400,\"lineHeight\":\"49px\",\"margin\":0},\"children\":\"This page could not be found.\"}]}]]}]}]],\"notFoundStyles\":[],\"styles\":null}]}]}],null]],\"initialHead\":[false,\"$La\"],\"globalErrorComponent\":\"$b\",\"missingSlots\":\"$Wc\"}]]\n"])</script><script>self.__next_f.push([1,"a:[[\"$\",\"meta\",\"0\",{\"name\":\"viewport\",\"content\":\"width=device-width, initial-scale=1\"}],[\"$\",\"meta\",\"1\",{\"charSet\":\"utf-8\"}],[\"$\",\"title\",\"2\",{\"children\":\"🚅 LiteLLM\"}],[\"$\",\"meta\",\"3\",{\"name\":\"description\",\"content\":\"LiteLLM Proxy Admin UI\"}],[\"$\",\"link\",\"4\",{\"rel\":\"icon\",\"href\":\"/ui/favicon.ico\",\"type\":\"image/x-icon\",\"sizes\":\"16x16\"}],[\"$\",\"meta\",\"5\",{\"name\":\"next-size-adjust\"}]]\n5:null\n"])</script><script>self.__next_f.push([1,""])</script></body></html>
+<!DOCTYPE html><html id="__next_error__"><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width, initial-scale=1"/><link rel="preload" as="script" fetchPriority="low" href="/ui/_next/static/chunks/webpack-d1ad37b1875df240.js" crossorigin=""/><script src="/ui/_next/static/chunks/fd9d1056-a507ee9e75a3be72.js" async="" crossorigin=""></script><script src="/ui/_next/static/chunks/69-589b47e7a69d316f.js" async="" crossorigin=""></script><script src="/ui/_next/static/chunks/main-app-9b4fb13a7db53edf.js" async="" crossorigin=""></script><title>🚅 LiteLLM</title><meta name="description" content="LiteLLM Proxy Admin UI"/><link rel="icon" href="/ui/favicon.ico" type="image/x-icon" sizes="16x16"/><meta name="next-size-adjust"/><script src="/ui/_next/static/chunks/polyfills-c67a75d1b6f99dc8.js" crossorigin="" noModule=""></script></head><body><script src="/ui/_next/static/chunks/webpack-d1ad37b1875df240.js" crossorigin="" async=""></script><script>(self.__next_f=self.__next_f||[]).push([0]);self.__next_f.push([2,null])</script><script>self.__next_f.push([1,"1:HL[\"/ui/_next/static/media/c9a5bc6a7c948fb0-s.p.woff2\",\"font\",{\"crossOrigin\":\"\",\"type\":\"font/woff2\"}]\n2:HL[\"/ui/_next/static/css/f8da5a6a5b29d249.css\",\"style\",{\"crossOrigin\":\"\"}]\n0:\"$L3\"\n"])</script><script>self.__next_f.push([1,"4:I[47690,[],\"\"]\n6:I[77831,[],\"\"]\n7:I[90177,[\"798\",\"static/chunks/798-4baed68da0c5497d.js\",\"931\",\"static/chunks/app/page-a5a04da2a9356785.js\"],\"\"]\n8:I[5613,[],\"\"]\n9:I[31778,[],\"\"]\nb:I[48955,[],\"\"]\nc:[]\n"])</script><script>self.__next_f.push([1,"3:[[[\"$\",\"link\",\"0\",{\"rel\":\"stylesheet\",\"href\":\"/ui/_next/static/css/f8da5a6a5b29d249.css\",\"precedence\":\"next\",\"crossOrigin\":\"\"}]],[\"$\",\"$L4\",null,{\"buildId\":\"DptMjzo5xd96cx0b56k4u\",\"assetPrefix\":\"/ui\",\"initialCanonicalUrl\":\"/\",\"initialTree\":[\"\",{\"children\":[\"__PAGE__\",{}]},\"$undefined\",\"$undefined\",true],\"initialSeedData\":[\"\",{\"children\":[\"__PAGE__\",{},[\"$L5\",[\"$\",\"$L6\",null,{\"propsForComponent\":{\"params\":{}},\"Component\":\"$7\",\"isStaticGeneration\":true}],null]]},[null,[\"$\",\"html\",null,{\"lang\":\"en\",\"children\":[\"$\",\"body\",null,{\"className\":\"__className_c23dc8\",\"children\":[\"$\",\"$L8\",null,{\"parallelRouterKey\":\"children\",\"segmentPath\":[\"children\"],\"loading\":\"$undefined\",\"loadingStyles\":\"$undefined\",\"loadingScripts\":\"$undefined\",\"hasLoading\":false,\"error\":\"$undefined\",\"errorStyles\":\"$undefined\",\"errorScripts\":\"$undefined\",\"template\":[\"$\",\"$L9\",null,{}],\"templateStyles\":\"$undefined\",\"templateScripts\":\"$undefined\",\"notFound\":[[\"$\",\"title\",null,{\"children\":\"404: This page could not be found.\"}],[\"$\",\"div\",null,{\"style\":{\"fontFamily\":\"system-ui,\\\"Segoe UI\\\",Roboto,Helvetica,Arial,sans-serif,\\\"Apple Color Emoji\\\",\\\"Segoe UI Emoji\\\"\",\"height\":\"100vh\",\"textAlign\":\"center\",\"display\":\"flex\",\"flexDirection\":\"column\",\"alignItems\":\"center\",\"justifyContent\":\"center\"},\"children\":[\"$\",\"div\",null,{\"children\":[[\"$\",\"style\",null,{\"dangerouslySetInnerHTML\":{\"__html\":\"body{color:#000;background:#fff;margin:0}.next-error-h1{border-right:1px solid rgba(0,0,0,.3)}@media (prefers-color-scheme:dark){body{color:#fff;background:#000}.next-error-h1{border-right:1px solid rgba(255,255,255,.3)}}\"}}],[\"$\",\"h1\",null,{\"className\":\"next-error-h1\",\"style\":{\"display\":\"inline-block\",\"margin\":\"0 20px 0 0\",\"padding\":\"0 23px 0 0\",\"fontSize\":24,\"fontWeight\":500,\"verticalAlign\":\"top\",\"lineHeight\":\"49px\"},\"children\":\"404\"}],[\"$\",\"div\",null,{\"style\":{\"display\":\"inline-block\"},\"children\":[\"$\",\"h2\",null,{\"style\":{\"fontSize\":14,\"fontWeight\":400,\"lineHeight\":\"49px\",\"margin\":0},\"children\":\"This page could not be found.\"}]}]]}]}]],\"notFoundStyles\":[],\"styles\":null}]}]}],null]],\"initialHead\":[false,\"$La\"],\"globalErrorComponent\":\"$b\",\"missingSlots\":\"$Wc\"}]]\n"])</script><script>self.__next_f.push([1,"a:[[\"$\",\"meta\",\"0\",{\"name\":\"viewport\",\"content\":\"width=device-width, initial-scale=1\"}],[\"$\",\"meta\",\"1\",{\"charSet\":\"utf-8\"}],[\"$\",\"title\",\"2\",{\"children\":\"🚅 LiteLLM\"}],[\"$\",\"meta\",\"3\",{\"name\":\"description\",\"content\":\"LiteLLM Proxy Admin UI\"}],[\"$\",\"link\",\"4\",{\"rel\":\"icon\",\"href\":\"/ui/favicon.ico\",\"type\":\"image/x-icon\",\"sizes\":\"16x16\"}],[\"$\",\"meta\",\"5\",{\"name\":\"next-size-adjust\"}]]\n5:null\n"])</script><script>self.__next_f.push([1,""])</script></body></html>
--- a/litellm/proxy/_experimental/out/index.txt
+++ b/litellm/proxy/_experimental/out/index.txt
@ -1,7 +1,7 @@
 2:I[77831,[],""]
-3:I[19914,["730","static/chunks/730-1411b729a1c79695.js","931","static/chunks/app/page-df9015da04018cc1.js"],""]
+3:I[90177,["798","static/chunks/798-4baed68da0c5497d.js","931","static/chunks/app/page-a5a04da2a9356785.js"],""]
 4:I[5613,[],""]
 5:I[31778,[],""]
-0:["tXZFkeqtgh-goIRVbw_9q",[[["",{"children":["__PAGE__",{}]},"$undefined","$undefined",true],["",{"children":["__PAGE__",{},["$L1",["$","$L2",null,{"propsForComponent":{"params":{}},"Component":"$3","isStaticGeneration":true}],null]]},[null,["$","html",null,{"lang":"en","children":["$","body",null,{"className":"__className_c23dc8","children":["$","$L4",null,{"parallelRouterKey":"children","segmentPath":["children"],"loading":"$undefined","loadingStyles":"$undefined","loadingScripts":"$undefined","hasLoading":false,"error":"$undefined","errorStyles":"$undefined","errorScripts":"$undefined","template":["$","$L5",null,{}],"templateStyles":"$undefined","templateScripts":"$undefined","notFound":[["$","title",null,{"children":"404: This page could not be found."}],["$","div",null,{"style":{"fontFamily":"system-ui,\"Segoe UI\",Roboto,Helvetica,Arial,sans-serif,\"Apple Color Emoji\",\"Segoe UI Emoji\"","height":"100vh","textAlign":"center","display":"flex","flexDirection":"column","alignItems":"center","justifyContent":"center"},"children":["$","div",null,{"children":[["$","style",null,{"dangerouslySetInnerHTML":{"__html":"body{color:#000;background:#fff;margin:0}.next-error-h1{border-right:1px solid rgba(0,0,0,.3)}@media (prefers-color-scheme:dark){body{color:#fff;background:#000}.next-error-h1{border-right:1px solid rgba(255,255,255,.3)}}"}}],["$","h1",null,{"className":"next-error-h1","style":{"display":"inline-block","margin":"0 20px 0 0","padding":"0 23px 0 0","fontSize":24,"fontWeight":500,"verticalAlign":"top","lineHeight":"49px"},"children":"404"}],["$","div",null,{"style":{"display":"inline-block"},"children":["$","h2",null,{"style":{"fontSize":14,"fontWeight":400,"lineHeight":"49px","margin":0},"children":"This page could not be found."}]}]]}]}]],"notFoundStyles":[],"styles":null}]}]}],null]],[[["$","link","0",{"rel":"stylesheet","href":"/ui/_next/static/css/68a21c6e6697f7ca.css","precedence":"next","crossOrigin":""}]],"$L6"]]]]
+0:["DptMjzo5xd96cx0b56k4u",[[["",{"children":["__PAGE__",{}]},"$undefined","$undefined",true],["",{"children":["__PAGE__",{},["$L1",["$","$L2",null,{"propsForComponent":{"params":{}},"Component":"$3","isStaticGeneration":true}],null]]},[null,["$","html",null,{"lang":"en","children":["$","body",null,{"className":"__className_c23dc8","children":["$","$L4",null,{"parallelRouterKey":"children","segmentPath":["children"],"loading":"$undefined","loadingStyles":"$undefined","loadingScripts":"$undefined","hasLoading":false,"error":"$undefined","errorStyles":"$undefined","errorScripts":"$undefined","template":["$","$L5",null,{}],"templateStyles":"$undefined","templateScripts":"$undefined","notFound":[["$","title",null,{"children":"404: This page could not be found."}],["$","div",null,{"style":{"fontFamily":"system-ui,\"Segoe UI\",Roboto,Helvetica,Arial,sans-serif,\"Apple Color Emoji\",\"Segoe UI Emoji\"","height":"100vh","textAlign":"center","display":"flex","flexDirection":"column","alignItems":"center","justifyContent":"center"},"children":["$","div",null,{"children":[["$","style",null,{"dangerouslySetInnerHTML":{"__html":"body{color:#000;background:#fff;margin:0}.next-error-h1{border-right:1px solid rgba(0,0,0,.3)}@media (prefers-color-scheme:dark){body{color:#fff;background:#000}.next-error-h1{border-right:1px solid rgba(255,255,255,.3)}}"}}],["$","h1",null,{"className":"next-error-h1","style":{"display":"inline-block","margin":"0 20px 0 0","padding":"0 23px 0 0","fontSize":24,"fontWeight":500,"verticalAlign":"top","lineHeight":"49px"},"children":"404"}],["$","div",null,{"style":{"display":"inline-block"},"children":["$","h2",null,{"style":{"fontSize":14,"fontWeight":400,"lineHeight":"49px","margin":0},"children":"This page could not be found."}]}]]}]}]],"notFoundStyles":[],"styles":null}]}]}],null]],[[["$","link","0",{"rel":"stylesheet","href":"/ui/_next/static/css/f8da5a6a5b29d249.css","precedence":"next","crossOrigin":""}]],"$L6"]]]]
 6:[["$","meta","0",{"name":"viewport","content":"width=device-width, initial-scale=1"}],["$","meta","1",{"charSet":"utf-8"}],["$","title","2",{"children":"🚅 LiteLLM"}],["$","meta","3",{"name":"description","content":"LiteLLM Proxy Admin UI"}],["$","link","4",{"rel":"icon","href":"/ui/favicon.ico","type":"image/x-icon","sizes":"16x16"}],["$","meta","5",{"name":"next-size-adjust"}]]
 1:null
--- a/litellm/proxy/_new_secret_config.yaml
+++ b/litellm/proxy/_new_secret_config.yaml
@ -1,21 +1,20 @@
 model_list:
- model_name: fake_openai
+- model_name: fake-openai-endpoint
  litellm_params:
    model: openai/my-fake-model
    api_key: my-fake-key
-    api_base: http://0.0.0.0:8080
- model_name: gpt-3.5-turbo
-  litellm_params:
-    model: gpt-3.5-turbo-1106
-    api_key: os.environ/OPENAI_API_KEY
+    api_base: https://exampleopenaiendpoint-production.up.railway.app/

 litellm_settings:
-  cache: true
-  cache_params:
-    type: redis
-  callbacks: ["batch_redis_requests"]
-  # success_callbacks: ["langfuse"]
+  max_budget: 600020
+  budget_duration: 30d

 general_settings:
  master_key: sk-1234
-  database_url: "postgresql://neondb_owner:hz8tyUlJ5ivV@ep-cool-sunset-a5ywubeh.us-east-2.aws.neon.tech/neondb?sslmode=require"
+  proxy_batch_write_at: 60 # 👈 Frequency of batch writing logs to server (in seconds)
+  enable_jwt_auth: True
+  alerting: ["slack"]
+  litellm_jwtauth:
+    admin_jwt_scope: "litellm_proxy_admin"
+    team_jwt_scope: "litellm_team" 
+    public_key_ttl: 600
--- a/litellm/proxy/_types.py
+++ b/litellm/proxy/_types.py
@ -1,4 +1,5 @@
-from pydantic import BaseModel, Extra, Field, root_validator, Json
+from pydantic import BaseModel, Extra, Field, root_validator, Json, validator
+from dataclasses import fields
 import enum
 from typing import Optional, List, Union, Dict, Literal, Any
 from datetime import datetime
@ -14,11 +15,6 @@ def hash_token(token: str):
    return hashed_token


-class LiteLLMProxyRoles(enum.Enum):
-    PROXY_ADMIN = "litellm_proxy_admin"
-    USER = "litellm_user"
-
-
 class LiteLLMBase(BaseModel):
    """
    Implements default functions, all pydantic objects should have.
@ -42,6 +38,135 @@ class LiteLLMBase(BaseModel):
        protected_namespaces = ()


+class LiteLLMRoutes(enum.Enum):
+    openai_routes: List = [  # chat completions
+        "/openai/deployments/{model}/chat/completions",
+        "/chat/completions",
+        "/v1/chat/completions",
+        # completions
+        "/openai/deployments/{model}/completions",
+        "/completions",
+        "/v1/completions",
+        # embeddings
+        "/openai/deployments/{model}/embeddings",
+        "/embeddings",
+        "/v1/embeddings",
+        # image generation
+        "/images/generations",
+        "/v1/images/generations",
+        # audio transcription
+        "/audio/transcriptions",
+        "/v1/audio/transcriptions",
+        # moderations
+        "/moderations",
+        "/v1/moderations",
+        # models
+        "/models",
+        "/v1/models",
+    ]
+
+    info_routes: List = ["/key/info", "/team/info", "/user/info", "/model/info"]
+
+    management_routes: List = [  # key
+        "/key/generate",
+        "/key/update",
+        "/key/delete",
+        "/key/info",
+        # user
+        "/user/new",
+        "/user/update",
+        "/user/delete",
+        "/user/info",
+        # team
+        "/team/new",
+        "/team/update",
+        "/team/delete",
+        "/team/info",
+        "/team/block",
+        "/team/unblock",
+        # model
+        "/model/new",
+        "/model/update",
+        "/model/delete",
+        "/model/info",
+    ]
+
+
+class LiteLLM_JWTAuth(LiteLLMBase):
+    """
+    A class to define the roles and permissions for a LiteLLM Proxy w/ JWT Auth.
+
+    Attributes:
+    - admin_jwt_scope: The JWT scope required for proxy admin roles.
+    - admin_allowed_routes: list of allowed routes for proxy admin roles.
+    - team_jwt_scope: The JWT scope required for proxy team roles.
+    - team_id_jwt_field: The field in the JWT token that stores the team ID. Default - `client_id`.
+    - team_allowed_routes: list of allowed routes for proxy team roles.
+    - end_user_id_jwt_field: Default - `sub`. The field in the JWT token that stores the end-user ID. Turn this off by setting to `None`. Enables end-user cost tracking.
+    - public_key_ttl: Default - 600s. TTL for caching public JWT keys.
+
+    See `auth_checks.py` for the specific routes
+    """
+
+    admin_jwt_scope: str = "litellm_proxy_admin"
+    admin_allowed_routes: List[
+        Literal["openai_routes", "info_routes", "management_routes"]
+    ] = ["management_routes"]
+    team_jwt_scope: str = "litellm_team"
+    team_id_jwt_field: str = "client_id"
+    team_allowed_routes: List[
+        Literal["openai_routes", "info_routes", "management_routes"]
+    ] = ["openai_routes", "info_routes"]
+    end_user_id_jwt_field: Optional[str] = "sub"
+    public_key_ttl: float = 600
+
+    def __init__(self, **kwargs: Any) -> None:
+        # get the attribute names for this Pydantic model
+        allowed_keys = self.__annotations__.keys()
+
+        invalid_keys = set(kwargs.keys()) - allowed_keys
+
+        if invalid_keys:
+            raise ValueError(
+                f"Invalid arguments provided: {', '.join(invalid_keys)}. Allowed arguments are: {', '.join(allowed_keys)}."
+            )
+
+        super().__init__(**kwargs)
+
+
+class LiteLLMPromptInjectionParams(LiteLLMBase):
+    heuristics_check: bool = False
+    vector_db_check: bool = False
+    llm_api_check: bool = False
+    llm_api_name: Optional[str] = None
+    llm_api_system_prompt: Optional[str] = None
+    llm_api_fail_call_string: Optional[str] = None
+
+    @root_validator(pre=True)
+    def check_llm_api_params(cls, values):
+        llm_api_check = values.get("llm_api_check")
+        if llm_api_check is True:
+            if "llm_api_name" not in values or not values["llm_api_name"]:
+                raise ValueError(
+                    "If llm_api_check is set to True, llm_api_name must be provided"
+                )
+            if (
+                "llm_api_system_prompt" not in values
+                or not values["llm_api_system_prompt"]
+            ):
+                raise ValueError(
+                    "If llm_api_check is set to True, llm_api_system_prompt must be provided"
+                )
+            if (
+                "llm_api_fail_call_string" not in values
+                or not values["llm_api_fail_call_string"]
+            ):
+                raise ValueError(
+                    "If llm_api_check is set to True, llm_api_fail_call_string must be provided"
+                )
+        return values
+
+
 ######### Request Class Definition ######
 class ProxyChatCompletionRequest(LiteLLMBase):
    model: str
@ -180,7 +305,7 @@ class GenerateKeyResponse(GenerateKeyRequest):
    key: str
    key_name: Optional[str] = None
    expires: Optional[datetime]
-    user_id: str
+    user_id: Optional[str] = None

    @root_validator(pre=True)
    def set_model_info(cls, values):
@ -274,6 +399,7 @@ class TeamBase(LiteLLMBase):
    rpm_limit: Optional[int] = None
    max_budget: Optional[float] = None
    models: list = []
+    blocked: bool = False


 class NewTeamRequest(TeamBase):
@ -301,19 +427,18 @@ class TeamMemberDeleteRequest(LiteLLMBase):
        return values


-class UpdateTeamRequest(LiteLLMBase):
+class UpdateTeamRequest(TeamBase):
    team_id: str  # required
-    team_alias: Optional[str] = None
-    admins: Optional[list] = None
-    members: Optional[list] = None
-    members_with_roles: Optional[List[Member]] = None
-    metadata: Optional[dict] = None


 class DeleteTeamRequest(LiteLLMBase):
    team_ids: List[str]  # required


+class BlockTeamRequest(LiteLLMBase):
+    team_id: str  # required
+
+
 class LiteLLM_TeamTable(TeamBase):
    spend: Optional[float] = None
    max_parallel_requests: Optional[int] = None
@ -498,6 +623,9 @@ class ConfigGeneralSettings(LiteLLMBase):
    ui_access_mode: Optional[Literal["admin_only", "all"]] = Field(
        "all", description="Control access to the Proxy UI"
    )
+    allowed_routes: Optional[List] = Field(
+        None, description="Proxy API Endpoints you want users to be able to access"
+    )


 class ConfigYAML(LiteLLMBase):
@ -565,6 +693,8 @@ class LiteLLM_VerificationTokenView(LiteLLM_VerificationToken):
    team_tpm_limit: Optional[int] = None
    team_rpm_limit: Optional[int] = None
    team_max_budget: Optional[float] = None
+    team_models: List = []
+    team_blocked: bool = False
    soft_budget: Optional[float] = None
    team_model_aliases: Optional[Dict] = None

--- a/litellm/proxy/auth/auth_checks.py
+++ b/litellm/proxy/auth/auth_checks.py
@ -8,45 +8,160 @@ Run checks for:
 2. If user is in budget 
 3. If end_user ('user' passed to /chat/completions, /embeddings endpoint) is in budget 
 """
-from litellm.proxy._types import LiteLLM_UserTable, LiteLLM_EndUserTable
-from typing import Optional
+from litellm.proxy._types import (
+    LiteLLM_UserTable,
+    LiteLLM_EndUserTable,
+    LiteLLM_JWTAuth,
+    LiteLLM_TeamTable,
+    LiteLLMRoutes,
+)
+from typing import Optional, Literal, Union
 from litellm.proxy.utils import PrismaClient
 from litellm.caching import DualCache
+import litellm
+
+all_routes = LiteLLMRoutes.openai_routes.value + LiteLLMRoutes.management_routes.value


 def common_checks(
    request_body: dict,
-    user_object: LiteLLM_UserTable,
+    team_object: LiteLLM_TeamTable,
    end_user_object: Optional[LiteLLM_EndUserTable],
+    global_proxy_spend: Optional[float],
+    general_settings: dict,
+    route: str,
 ) -> bool:
+    """
+    Common checks across jwt + key-based auth.
+
+    1. If team is blocked
+    2. If team can call model
+    3. If team is in budget
+    4. If end_user ('user' passed to /chat/completions, /embeddings endpoint) is in budget
+    5. [OPTIONAL] If 'enforce_end_user' enabled - did developer pass in 'user' param for openai endpoints
+    6. [OPTIONAL] If 'litellm.max_budget' is set (>0), is proxy under budget
+    """
    _model = request_body.get("model", None)
-    # 1. If user can call model
+    if team_object.blocked == True:
+        raise Exception(
+            f"Team={team_object.team_id} is blocked. Update via `/team/unblock` if your admin."
+        )
+    # 2. If user can call model
    if (
        _model is not None
-        and len(user_object.models) > 0
-        and _model not in user_object.models
+        and len(team_object.models) > 0
+        and _model not in team_object.models
    ):
        raise Exception(
-            f"User={user_object.user_id} not allowed to call model={_model}. Allowed user models = {user_object.models}"
+            f"Team={team_object.team_id} not allowed to call model={_model}. Allowed team models = {team_object.models}"
        )
-    # 2. If user is in budget
+    # 3. If team is in budget
    if (
-        user_object.max_budget is not None
-        and user_object.spend > user_object.max_budget
+        team_object.max_budget is not None
+        and team_object.spend is not None
+        and team_object.spend > team_object.max_budget
    ):
        raise Exception(
-            f"User={user_object.user_id} over budget. Spend={user_object.spend}, Budget={user_object.max_budget}"
+            f"Team={team_object.team_id} over budget. Spend={team_object.spend}, Budget={team_object.max_budget}"
        )
-    # 3. If end_user ('user' passed to /chat/completions, /embeddings endpoint) is in budget
+    # 4. If end_user ('user' passed to /chat/completions, /embeddings endpoint) is in budget
    if end_user_object is not None and end_user_object.litellm_budget_table is not None:
        end_user_budget = end_user_object.litellm_budget_table.max_budget
        if end_user_budget is not None and end_user_object.spend > end_user_budget:
            raise Exception(
-                f"End User={end_user_object.user_id} over budget. Spend={end_user_object.spend}, Budget={end_user_budget}"
+                f"ExceededBudget: End User={end_user_object.user_id} over budget. Spend={end_user_object.spend}, Budget={end_user_budget}"
+            )
+    # 5. [OPTIONAL] If 'enforce_user_param' enabled - did developer pass in 'user' param for openai endpoints
+    if (
+        general_settings.get("enforce_user_param", None) is not None
+        and general_settings["enforce_user_param"] == True
+    ):
+        if route in LiteLLMRoutes.openai_routes.value and "user" not in request_body:
+            raise Exception(
+                f"'user' param not passed in. 'enforce_user_param'={general_settings['enforce_user_param']}"
+            )
+    # 6. [OPTIONAL] If 'litellm.max_budget' is set (>0), is proxy under budget
+    if litellm.max_budget > 0 and global_proxy_spend is not None:
+        if global_proxy_spend > litellm.max_budget:
+            raise Exception(
+                f"ExceededBudget: LiteLLM Proxy has exceeded its budget. Current spend: {global_proxy_spend}; Max Budget: {litellm.max_budget}"
            )
    return True


+def _allowed_routes_check(user_route: str, allowed_routes: list) -> bool:
+    for allowed_route in allowed_routes:
+        if (
+            allowed_route == LiteLLMRoutes.openai_routes.name
+            and user_route in LiteLLMRoutes.openai_routes.value
+        ):
+            return True
+        elif (
+            allowed_route == LiteLLMRoutes.info_routes.name
+            and user_route in LiteLLMRoutes.info_routes.value
+        ):
+            return True
+        elif (
+            allowed_route == LiteLLMRoutes.management_routes.name
+            and user_route in LiteLLMRoutes.management_routes.value
+        ):
+            return True
+        elif allowed_route == user_route:
+            return True
+    return False
+
+
+def allowed_routes_check(
+    user_role: Literal["proxy_admin", "team"],
+    user_route: str,
+    litellm_proxy_roles: LiteLLM_JWTAuth,
+) -> bool:
+    """
+    Check if user -> not admin - allowed to access these routes
+    """
+
+    if user_role == "proxy_admin":
+        if litellm_proxy_roles.admin_allowed_routes is None:
+            is_allowed = _allowed_routes_check(
+                user_route=user_route, allowed_routes=["management_routes"]
+            )
+            return is_allowed
+        elif litellm_proxy_roles.admin_allowed_routes is not None:
+            is_allowed = _allowed_routes_check(
+                user_route=user_route,
+                allowed_routes=litellm_proxy_roles.admin_allowed_routes,
+            )
+            return is_allowed
+
+    elif user_role == "team":
+        if litellm_proxy_roles.team_allowed_routes is None:
+            """
+            By default allow a team to call openai + info routes
+            """
+            is_allowed = _allowed_routes_check(
+                user_route=user_route, allowed_routes=["openai_routes", "info_routes"]
+            )
+            return is_allowed
+        elif litellm_proxy_roles.team_allowed_routes is not None:
+            is_allowed = _allowed_routes_check(
+                user_route=user_route,
+                allowed_routes=litellm_proxy_roles.team_allowed_routes,
+            )
+            return is_allowed
+    return False
+
+
+def get_actual_routes(allowed_routes: list) -> list:
+    actual_routes: list = []
+    for route_name in allowed_routes:
+        try:
+            route_value = LiteLLMRoutes[route_name].value
+            actual_routes = actual_routes + route_value
+        except KeyError:
+            actual_routes.append(route_name)
+    return actual_routes
+
+
 async def get_end_user_object(
    end_user_id: Optional[str],
    prisma_client: Optional[PrismaClient],
@ -82,3 +197,75 @@ async def get_end_user_object(
        return LiteLLM_EndUserTable(**response.dict())
    except Exception as e:  # if end-user not in db
        return None
+
+
+async def get_user_object(self, user_id: str) -> LiteLLM_UserTable:
+    """
+    - Check if user id in proxy User Table
+    - if valid, return LiteLLM_UserTable object with defined limits
+    - if not, then raise an error
+    """
+    if self.prisma_client is None:
+        raise Exception(
+            "No DB Connected. See - https://docs.litellm.ai/docs/proxy/virtual_keys"
+        )
+
+    # check if in cache
+    cached_user_obj = self.user_api_key_cache.async_get_cache(key=user_id)
+    if cached_user_obj is not None:
+        if isinstance(cached_user_obj, dict):
+            return LiteLLM_UserTable(**cached_user_obj)
+        elif isinstance(cached_user_obj, LiteLLM_UserTable):
+            return cached_user_obj
+    # else, check db
+    try:
+        response = await self.prisma_client.db.litellm_usertable.find_unique(
+            where={"user_id": user_id}
+        )
+
+        if response is None:
+            raise Exception
+
+        return LiteLLM_UserTable(**response.dict())
+    except Exception as e:
+        raise Exception(
+            f"User doesn't exist in db. User={user_id}. Create user via `/user/new` call."
+        )
+
+
+async def get_team_object(
+    team_id: str,
+    prisma_client: Optional[PrismaClient],
+    user_api_key_cache: DualCache,
+) -> LiteLLM_TeamTable:
+    """
+    - Check if team id in proxy Team Table
+    - if valid, return LiteLLM_TeamTable object with defined limits
+    - if not, then raise an error
+    """
+    if prisma_client is None:
+        raise Exception(
+            "No DB Connected. See - https://docs.litellm.ai/docs/proxy/virtual_keys"
+        )
+
+    # check if in cache
+    cached_team_obj = user_api_key_cache.async_get_cache(key=team_id)
+    if cached_team_obj is not None:
+        if isinstance(cached_team_obj, dict):
+            return LiteLLM_TeamTable(**cached_team_obj)
+        elif isinstance(cached_team_obj, LiteLLM_TeamTable):
+            return cached_team_obj
+    # else, check db
+    try:
+        response = await prisma_client.db.litellm_teamtable.find_unique(
+            where={"team_id": team_id}
+        )
+
+        if response is None:
+            raise Exception
+
+        return LiteLLM_TeamTable(**response.dict())
+    except Exception as e:
+        raise Exception(
+            f"Team doesn't exist in db. Team={team_id}. Create team via `/team/new` call."
+        )
--- a/litellm/proxy/auth/handle_jwt.py
+++ b/litellm/proxy/auth/handle_jwt.py
@ -6,50 +6,17 @@ Currently only supports admin.
 JWT token must have 'litellm_proxy_admin' in scope. 
 """

-import httpx
 import jwt
-from jwt.algorithms import RSAAlgorithm
 import json
 import os
 from litellm.caching import DualCache
-from litellm.proxy._types import LiteLLMProxyRoles, LiteLLM_UserTable
+from litellm._logging import verbose_proxy_logger
+from litellm.proxy._types import LiteLLM_JWTAuth, LiteLLM_UserTable
 from litellm.proxy.utils import PrismaClient
+from litellm.llms.custom_httpx.httpx_handler import HTTPHandler
 from typing import Optional


-class HTTPHandler:
-    def __init__(self, concurrent_limit=1000):
-        # Create a client with a connection pool
-        self.client = httpx.AsyncClient(
-            limits=httpx.Limits(
-                max_connections=concurrent_limit,
-                max_keepalive_connections=concurrent_limit,
-            )
-        )
-
-    async def close(self):
-        # Close the client when you're done with it
-        await self.client.aclose()
-
-    async def get(
-        self, url: str, params: Optional[dict] = None, headers: Optional[dict] = None
-    ):
-        response = await self.client.get(url, params=params, headers=headers)
-        return response
-
-    async def post(
-        self,
-        url: str,
-        data: Optional[dict] = None,
-        params: Optional[dict] = None,
-        headers: Optional[dict] = None,
-    ):
-        response = await self.client.post(
-            url, data=data, params=params, headers=headers
-        )
-        return response
-
-
 class JWTHandler:
    """
    - treat the sub id passed in as the user id
@ -67,113 +34,139 @@ class JWTHandler:
        self.http_handler = HTTPHandler()

    def update_environment(
-        self, prisma_client: Optional[PrismaClient], user_api_key_cache: DualCache
+        self,
+        prisma_client: Optional[PrismaClient],
+        user_api_key_cache: DualCache,
+        litellm_jwtauth: LiteLLM_JWTAuth,
    ) -> None:
        self.prisma_client = prisma_client
        self.user_api_key_cache = user_api_key_cache
+        self.litellm_jwtauth = litellm_jwtauth

    def is_jwt(self, token: str):
        parts = token.split(".")
        return len(parts) == 3

    def is_admin(self, scopes: list) -> bool:
-        if LiteLLMProxyRoles.PROXY_ADMIN.value in scopes:
+        if self.litellm_jwtauth.admin_jwt_scope in scopes:
            return True
        return False

-    def get_user_id(self, token: dict, default_value: str) -> str:
+    def is_team(self, scopes: list) -> bool:
+        if self.litellm_jwtauth.team_jwt_scope in scopes:
+            return True
+        return False
+
+    def get_end_user_id(self, token: dict, default_value: Optional[str]) -> str:
        try:
-            user_id = token["sub"]
+            if self.litellm_jwtauth.end_user_id_jwt_field is not None:
+                user_id = token[self.litellm_jwtauth.end_user_id_jwt_field]
+            else:
+                user_id = None
        except KeyError:
            user_id = default_value
        return user_id

    def get_team_id(self, token: dict, default_value: Optional[str]) -> Optional[str]:
        try:
-            team_id = token["azp"]
+            team_id = token[self.litellm_jwtauth.team_id_jwt_field]
        except KeyError:
            team_id = default_value
        return team_id

-    async def get_user_object(self, user_id: str) -> LiteLLM_UserTable:
-        """
-        - Check if user id in proxy User Table
-        - if valid, return LiteLLM_UserTable object with defined limits
-        - if not, then raise an error
-        """
-        if self.prisma_client is None:
-            raise Exception(
-                "No DB Connected. See - https://docs.litellm.ai/docs/proxy/virtual_keys"
-            )
-
-        # check if in cache
-        cached_user_obj = self.user_api_key_cache.async_get_cache(key=user_id)
-        if cached_user_obj is not None:
-            if isinstance(cached_user_obj, dict):
-                return LiteLLM_UserTable(**cached_user_obj)
-            elif isinstance(cached_user_obj, LiteLLM_UserTable):
-                return cached_user_obj
-        # else, check db
-        try:
-            response = await self.prisma_client.db.litellm_usertable.find_unique(
-                where={"user_id": user_id}
-            )
-
-            if response is None:
-                raise Exception
-
-            return LiteLLM_UserTable(**response.dict())
-        except Exception as e:
-            raise Exception(
-                f"User doesn't exist in db. User={user_id}. Create user via `/user/new` call."
-            )
-
    def get_scopes(self, token: dict) -> list:
        try:
-            # Assuming the scopes are stored in 'scope' claim and are space-separated
-            scopes = token["scope"].split()
+            if isinstance(token["scope"], str):
+                # Assuming the scopes are stored in 'scope' claim and are space-separated
+                scopes = token["scope"].split()
+            elif isinstance(token["scope"], list):
+                scopes = token["scope"]
+            else:
+                raise Exception(
+                    f"Unmapped scope type - {type(token['scope'])}. Supported types - list, str."
+                )
        except KeyError:
            scopes = []
        return scopes

-    async def auth_jwt(self, token: str) -> dict:
+    async def get_public_key(self, kid: Optional[str]) -> dict:
        keys_url = os.getenv("JWT_PUBLIC_KEY_URL")

        if keys_url is None:
            raise Exception("Missing JWT Public Key URL from environment.")

-        response = await self.http_handler.get(keys_url)
+        cached_keys = await self.user_api_key_cache.async_get_cache(
+            "litellm_jwt_auth_keys"
+        )
+        if cached_keys is None:
+            response = await self.http_handler.get(keys_url)

-        keys = response.json()["keys"]
+            keys = response.json()["keys"]
+
+            await self.user_api_key_cache.async_set_cache(
+                key="litellm_jwt_auth_keys",
+                value=keys,
+                ttl=self.litellm_jwtauth.public_key_ttl,  # cache for 10 mins
+            )
+        else:
+            keys = cached_keys
+
+        public_key: Optional[dict] = None
+
+        if len(keys) == 1:
+            if kid is None or key["kid"] == kid:
+                public_key = keys[0]
+        elif len(keys) > 1:
+            for key in keys:
+                if kid is not None and key["kid"] == kid:
+                    public_key = key
+
+        if public_key is None:
+            raise Exception(
+                f"No matching public key found. kid={kid}, keys_url={keys_url}, cached_keys={cached_keys}"
+            )
+
+        return public_key
+
+    async def auth_jwt(self, token: str) -> dict:
+        from jwt.algorithms import RSAAlgorithm

        header = jwt.get_unverified_header(token)
-        kid = header["kid"]

-        for key in keys:
-            if key["kid"] == kid:
-                jwk = {
-                    "kty": key["kty"],
-                    "kid": key["kid"],
-                    "n": key["n"],
-                    "e": key["e"],
-                }
-                public_key = RSAAlgorithm.from_jwk(json.dumps(jwk))
+        verbose_proxy_logger.debug("header: %s", header)

-                try:
-                    # decode the token using the public key
-                    payload = jwt.decode(
-                        token,
-                        public_key,  # type: ignore
-                        algorithms=["RS256"],
-                        audience="account",
-                    )
-                    return payload
+        kid = header.get("kid", None)

-                except jwt.ExpiredSignatureError:
-                    # the token is expired, do something to refresh it
-                    raise Exception("Token Expired")
-                except Exception as e:
-                    raise Exception(f"Validation fails: {str(e)}")
+        public_key = await self.get_public_key(kid=kid)
+
+        if public_key is not None and isinstance(public_key, dict):
+            jwk = {}
+            if "kty" in public_key:
+                jwk["kty"] = public_key["kty"]
+            if "kid" in public_key:
+                jwk["kid"] = public_key["kid"]
+            if "n" in public_key:
+                jwk["n"] = public_key["n"]
+            if "e" in public_key:
+                jwk["e"] = public_key["e"]
+
+            public_key_rsa = RSAAlgorithm.from_jwk(json.dumps(jwk))
+
+            try:
+                # decode the token using the public key
+                payload = jwt.decode(
+                    token,
+                    public_key_rsa,  # type: ignore
+                    algorithms=["RS256"],
+                    options={"verify_aud": False},
+                )
+                return payload
+
+            except jwt.ExpiredSignatureError:
+                # the token is expired, do something to refresh it
+                raise Exception("Token Expired")
+            except Exception as e:
+                raise Exception(f"Validation fails: {str(e)}")

        raise Exception("Invalid JWT Submitted")

--- a/Show more
+++ b/Show more