Merge branch 'BerriAI:main' into main

sajda 2025-04-02 19:56:53 +05:30 committed by GitHub
commit 75f41a2d64
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
134 changed files with 3935 additions and 1451 deletions

View file

@ -1450,7 +1450,7 @@ jobs:
command: |
pwd
ls
python -m pytest -s -vv tests/*.py -x --junitxml=test-results/junit.xml --durations=5 --ignore=tests/otel_tests --ignore=tests/pass_through_tests --ignore=tests/proxy_admin_ui_tests --ignore=tests/load_tests --ignore=tests/llm_translation --ignore=tests/llm_responses_api_testing --ignore=tests/mcp_tests --ignore=tests/image_gen_tests --ignore=tests/pass_through_unit_tests
python -m pytest -s -vv tests/*.py -x --junitxml=test-results/junit.xml --durations=5 --ignore=tests/otel_tests --ignore=tests/spend_tracking_tests --ignore=tests/pass_through_tests --ignore=tests/proxy_admin_ui_tests --ignore=tests/load_tests --ignore=tests/llm_translation --ignore=tests/llm_responses_api_testing --ignore=tests/mcp_tests --ignore=tests/image_gen_tests --ignore=tests/pass_through_unit_tests
no_output_timeout: 120m
# Store test results
@ -1743,6 +1743,96 @@ jobs:
# Store test results
- store_test_results:
path: test-results
proxy_spend_accuracy_tests:
machine:
image: ubuntu-2204:2023.10.1
resource_class: xlarge
working_directory: ~/project
steps:
- checkout
- setup_google_dns
- run:
name: Install Docker CLI (In case it's not already installed)
command: |
sudo apt-get update
sudo apt-get install -y docker-ce docker-ce-cli containerd.io
- run:
name: Install Python 3.9
command: |
curl https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh --output miniconda.sh
bash miniconda.sh -b -p $HOME/miniconda
export PATH="$HOME/miniconda/bin:$PATH"
conda init bash
source ~/.bashrc
conda create -n myenv python=3.9 -y
conda activate myenv
python --version
- run:
name: Install Dependencies
command: |
pip install "pytest==7.3.1"
pip install "pytest-asyncio==0.21.1"
pip install aiohttp
python -m pip install --upgrade pip
python -m pip install -r requirements.txt
- run:
name: Build Docker image
command: docker build -t my-app:latest -f ./docker/Dockerfile.database .
- run:
name: Run Docker container
# intentionally give bad redis credentials here
# the OTEL test - should get this as a trace
command: |
docker run -d \
-p 4000:4000 \
-e DATABASE_URL=$PROXY_DATABASE_URL \
-e REDIS_HOST=$REDIS_HOST \
-e REDIS_PASSWORD=$REDIS_PASSWORD \
-e REDIS_PORT=$REDIS_PORT \
-e LITELLM_MASTER_KEY="sk-1234" \
-e OPENAI_API_KEY=$OPENAI_API_KEY \
-e LITELLM_LICENSE=$LITELLM_LICENSE \
-e AWS_ACCESS_KEY_ID=$AWS_ACCESS_KEY_ID \
-e AWS_SECRET_ACCESS_KEY=$AWS_SECRET_ACCESS_KEY \
-e USE_DDTRACE=True \
-e DD_API_KEY=$DD_API_KEY \
-e DD_SITE=$DD_SITE \
-e AWS_REGION_NAME=$AWS_REGION_NAME \
--name my-app \
-v $(pwd)/litellm/proxy/example_config_yaml/spend_tracking_config.yaml:/app/config.yaml \
my-app:latest \
--config /app/config.yaml \
--port 4000 \
--detailed_debug \
- run:
name: Install curl and dockerize
command: |
sudo apt-get update
sudo apt-get install -y curl
sudo wget https://github.com/jwilder/dockerize/releases/download/v0.6.1/dockerize-linux-amd64-v0.6.1.tar.gz
sudo tar -C /usr/local/bin -xzvf dockerize-linux-amd64-v0.6.1.tar.gz
sudo rm dockerize-linux-amd64-v0.6.1.tar.gz
- run:
name: Start outputting logs
command: docker logs -f my-app
background: true
- run:
name: Wait for app to be ready
command: dockerize -wait http://localhost:4000 -timeout 5m
- run:
name: Run tests
command: |
pwd
ls
python -m pytest -vv tests/spend_tracking_tests -x --junitxml=test-results/junit.xml --durations=5
no_output_timeout: 120m
# Clean up first container
- run:
name: Stop and remove first container
command: |
docker stop my-app
docker rm my-app
proxy_multi_instance_tests:
machine:
@ -2553,6 +2643,12 @@ workflows:
only:
- main
- /litellm_.*/
- proxy_spend_accuracy_tests:
filters:
branches:
only:
- main
- /litellm_.*/
- proxy_multi_instance_tests:
filters:
branches:
@ -2714,6 +2810,7 @@ workflows:
- installing_litellm_on_python
- installing_litellm_on_python_3_13
- proxy_logging_guardrails_model_info_tests
- proxy_spend_accuracy_tests
- proxy_multi_instance_tests
- proxy_store_model_in_db_tests
- proxy_build_from_pip_tests

View file

@ -24,10 +24,10 @@ jobs:
run: |
poetry install --with dev
- name: Run Black formatting check
- name: Run Black formatting
run: |
cd litellm
poetry run black . --check
poetry run black .
cd ..
- name: Run Ruff linting

View file

@ -1,2 +1,11 @@
python3 -m build
twine upload --verbose dist/litellm-1.18.13.dev4.tar.gz -u __token__ -
Note: You might need to create a MANIFEST.ini file at the repo root in case the build process fails.
Place this in MANIFEST.ini:
recursive-exclude venv *
recursive-exclude myenv *
recursive-exclude py313_env *
recursive-exclude **/.venv *

View file

@ -3,9 +3,10 @@ import TabItem from '@theme/TabItem';
# /v1/messages [BETA]
LiteLLM provides a BETA endpoint in the spec of Anthropic's `/v1/messages` endpoint.
Use LiteLLM to call all your LLM APIs in the Anthropic `v1/messages` format.
This currently just supports the Anthropic API.
## Overview
| Feature | Supported | Notes |
|-------|-------|-------|
@ -21,9 +22,61 @@ Planned improvement:
- Bedrock Anthropic support
## Usage
---
### LiteLLM Python SDK
#### Non-streaming example
```python showLineNumbers title="Example using LiteLLM Python SDK"
import litellm
response = await litellm.anthropic.messages.acreate(
messages=[{"role": "user", "content": "Hello, can you tell me a short joke?"}],
api_key=api_key,
model="anthropic/claude-3-haiku-20240307",
max_tokens=100,
)
```
Example response:
```json
{
"content": [
{
"text": "Hi! this is a very short joke",
"type": "text"
}
],
"id": "msg_013Zva2CMHLNnXjNJJKqJ2EF",
"model": "claude-3-7-sonnet-20250219",
"role": "assistant",
"stop_reason": "end_turn",
"stop_sequence": null,
"type": "message",
"usage": {
"input_tokens": 2095,
"output_tokens": 503,
"cache_creation_input_tokens": 2095,
"cache_read_input_tokens": 0
}
}
```
#### Streaming example
```python showLineNumbers title="Example using LiteLLM Python SDK"
import litellm
response = await litellm.anthropic.messages.acreate(
messages=[{"role": "user", "content": "Hello, can you tell me a short joke?"}],
api_key=api_key,
model="anthropic/claude-3-haiku-20240307",
max_tokens=100,
stream=True,
)
async for chunk in response:
print(chunk)
```
### LiteLLM Proxy Server
<Tabs>
<TabItem label="PROXY" value="proxy">
1. Setup config.yaml
@ -42,7 +95,28 @@ litellm --config /path/to/config.yaml
3. Test it!
```bash
<Tabs>
<TabItem label="Anthropic Python SDK" value="python">
```python showLineNumbers title="Example using LiteLLM Proxy Server"
import anthropic
# point anthropic sdk to litellm proxy
client = anthropic.Anthropic(
base_url="http://0.0.0.0:4000",
api_key="sk-1234",
)
response = client.messages.create(
messages=[{"role": "user", "content": "Hello, can you tell me a short joke?"}],
model="anthropic/claude-3-haiku-20240307",
max_tokens=100,
)
```
</TabItem>
<TabItem label="curl" value="curl">
```bash showLineNumbers title="Example using LiteLLM Proxy Server"
curl -L -X POST 'http://0.0.0.0:4000/v1/messages' \
-H 'content-type: application/json' \
-H 'x-api-key: $LITELLM_API_KEY' \
@ -52,41 +126,176 @@ curl -L -X POST 'http://0.0.0.0:4000/v1/messages' \
"messages": [
{
"role": "user",
"content": [
{
"type": "text",
"text": "List 5 important events in the XIX century"
}
]
"content": "Hello, can you tell me a short joke?"
}
],
"max_tokens": 4096
"max_tokens": 100
}'
```
</TabItem>
<TabItem value="sdk" label="SDK">
</Tabs>
```python
from litellm.llms.anthropic.experimental_pass_through.messages.handler import anthropic_messages
import asyncio
import os
# set env
os.environ["ANTHROPIC_API_KEY"] = "my-api-key"
## Request Format
---
messages = [{"role": "user", "content": "Hello, can you tell me a short joke?"}]
Request body will be in the Anthropic messages API format. **litellm follows the Anthropic messages specification for this endpoint.**
# Call the handler
async def call():
response = await anthropic_messages(
messages=messages,
api_key=api_key,
model="claude-3-haiku-20240307",
max_tokens=100,
)
#### Example request body
asyncio.run(call())
```json
{
"model": "claude-3-7-sonnet-20250219",
"max_tokens": 1024,
"messages": [
{
"role": "user",
"content": "Hello, world"
}
]
}
```
</TabItem>
</Tabs>
#### Required Fields
- **model** (string):
The model identifier (e.g., `"claude-3-7-sonnet-20250219"`).
- **max_tokens** (integer):
The maximum number of tokens to generate before stopping.
_Note: The model may stop before reaching this limit; value must be greater than 1._
- **messages** (array of objects):
An ordered list of conversational turns.
Each message object must include:
- **role** (enum: `"user"` or `"assistant"`):
Specifies the speaker of the message.
- **content** (string or array of content blocks):
The text or content blocks (e.g., an array containing objects with a `type` such as `"text"`) that form the message.
_Example equivalence:_
```json
{"role": "user", "content": "Hello, Claude"}
```
is equivalent to:
```json
{"role": "user", "content": [{"type": "text", "text": "Hello, Claude"}]}
```
#### Optional Fields
- **metadata** (object):
Contains additional metadata about the request (e.g., `user_id` as an opaque identifier).
- **stop_sequences** (array of strings):
Custom sequences that, when encountered in the generated text, cause the model to stop.
- **stream** (boolean):
Indicates whether to stream the response using server-sent events.
- **system** (string or array):
A system prompt providing context or specific instructions to the model.
- **temperature** (number):
Controls randomness in the model's responses. Valid range: `0 < temperature < 1`.
- **thinking** (object):
Configuration for enabling extended thinking. If enabled, it includes:
- **budget_tokens** (integer):
Minimum of 1024 tokens (and less than `max_tokens`).
- **type** (enum):
E.g., `"enabled"`.
- **tool_choice** (object):
Instructs how the model should utilize any provided tools.
- **tools** (array of objects):
Definitions for tools available to the model. Each tool includes:
- **name** (string):
The tool's name.
- **description** (string):
A detailed description of the tool.
- **input_schema** (object):
A JSON schema describing the expected input format for the tool.
- **top_k** (integer):
Limits sampling to the top K options.
- **top_p** (number):
Enables nucleus sampling with a cumulative probability cutoff. Valid range: `0 < top_p < 1`.
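For illustration, several of the optional fields above can be combined in a single request. The sketch below reuses the LiteLLM Python SDK call shown earlier; the model name, API key, and parameter values are placeholders, not recommendations.
```python showLineNumbers title="Sketch: combining optional fields (values are illustrative)"
import litellm

response = await litellm.anthropic.messages.acreate(
    model="anthropic/claude-3-haiku-20240307",
    max_tokens=200,
    messages=[{"role": "user", "content": "Summarize nucleus sampling in one sentence."}],
    system="You are a concise technical assistant.",  # system prompt
    temperature=0.2,                                   # 0 < temperature < 1
    top_p=0.9,                                         # nucleus sampling cutoff
    stop_sequences=["\n\nHuman:"],                     # custom stop sequences
    metadata={"user_id": "user_123"},                  # opaque user identifier
    api_key="sk-...",                                  # placeholder key
)
```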
## Response Format
---
Responses will be in the Anthropic messages API format.
#### Example Response
```json
{
"content": [
{
"text": "Hi! My name is Claude.",
"type": "text"
}
],
"id": "msg_013Zva2CMHLNnXjNJJKqJ2EF",
"model": "claude-3-7-sonnet-20250219",
"role": "assistant",
"stop_reason": "end_turn",
"stop_sequence": null,
"type": "message",
"usage": {
"input_tokens": 2095,
"output_tokens": 503,
"cache_creation_input_tokens": 2095,
"cache_read_input_tokens": 0
}
}
```
#### Response fields
- **content** (array of objects):
Contains the generated content blocks from the model. Each block includes:
- **type** (string):
Indicates the type of content (e.g., `"text"`, `"tool_use"`, `"thinking"`, or `"redacted_thinking"`).
- **text** (string):
The generated text from the model.
_Note: Maximum length is 5,000,000 characters._
- **citations** (array of objects or `null`):
Optional field providing citation details. Each citation includes:
- **cited_text** (string):
The excerpt being cited.
- **document_index** (integer):
An index referencing the cited document.
- **document_title** (string or `null`):
The title of the cited document.
- **start_char_index** (integer):
The starting character index for the citation.
- **end_char_index** (integer):
The ending character index for the citation.
- **type** (string):
Typically `"char_location"`.
- **id** (string):
A unique identifier for the response message.
_Note: The format and length of IDs may change over time._
- **model** (string):
Specifies the model that generated the response.
- **role** (string):
Indicates the role of the generated message. For responses, this is always `"assistant"`.
- **stop_reason** (string):
Explains why the model stopped generating text. Possible values include:
- `"end_turn"`: The model reached a natural stopping point.
- `"max_tokens"`: The generation stopped because the maximum token limit was reached.
- `"stop_sequence"`: A custom stop sequence was encountered.
- `"tool_use"`: The model invoked one or more tools.
- **stop_sequence** (string or `null`):
Contains the specific stop sequence that caused the generation to halt, if applicable; otherwise, it is `null`.
- **type** (string):
Denotes the type of response object, which is always `"message"`.
- **usage** (object):
Provides details on token usage for billing and rate limiting. This includes:
- **input_tokens** (integer):
Total number of input tokens processed.
- **output_tokens** (integer):
Total number of output tokens generated.
- **cache_creation_input_tokens** (integer or `null`):
Number of tokens used to create a cache entry.
- **cache_read_input_tokens** (integer or `null`):
Number of tokens read from the cache.
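As a minimal sketch of how these fields fit together, the snippet below walks the example response above as a plain Python dict; the field names come from the sample response, and the dict literal is illustrative rather than the output of an API call.
```python showLineNumbers title="Sketch: reading response fields (example data from above)"
# Example response in the Anthropic messages format, taken from the sample above.
response = {
    "content": [{"text": "Hi! My name is Claude.", "type": "text"}],
    "id": "msg_013Zva2CMHLNnXjNJJKqJ2EF",
    "model": "claude-3-7-sonnet-20250219",
    "role": "assistant",
    "stop_reason": "end_turn",
    "stop_sequence": None,
    "type": "message",
    "usage": {"input_tokens": 2095, "output_tokens": 503},
}

# Concatenate the generated text blocks and total the token usage.
text = "".join(b["text"] for b in response["content"] if b["type"] == "text")
total_tokens = response["usage"]["input_tokens"] + response["usage"]["output_tokens"]
print(text, response["stop_reason"], total_tokens)
```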

View file

@ -3,7 +3,7 @@ import TabItem from '@theme/TabItem';
# Caching - In-Memory, Redis, s3, Redis Semantic Cache, Disk
[**See Code**](https://github.com/BerriAI/litellm/blob/main/litellm.caching.caching.py)
[**See Code**](https://github.com/BerriAI/litellm/blob/main/litellm/caching/caching.py)
:::info

View file

@ -1,3 +1,5 @@
import Image from '@theme/IdealImage';
# Enterprise
For companies that need SSO, user management and professional support for LiteLLM Proxy
@ -7,6 +9,8 @@ Get free 7-day trial key [here](https://www.litellm.ai/#trial)
Includes all enterprise features.
<Image img={require('../img/enterprise_vs_oss.png')} />
[**Procurement available via AWS / Azure Marketplace**](./data_security.md#legalcompliance-faqs)

View file

@ -1035,8 +1035,10 @@ response = completion(
"content": [
{"type": "text", "text": "You are a very professional document summarization specialist. Please summarize the given document."},
{
"type": "image_url",
"image_url": f"data:application/pdf;base64,{encoded_file}", # 👈 PDF
"type": "file",
"file": {
"file_data": f"data:application/pdf;base64,{encoded_file}", # 👈 PDF
}
},
],
}
@ -1081,8 +1083,10 @@ curl http://0.0.0.0:4000/v1/chat/completions \
"text": "You are a very professional document summarization specialist. Please summarize the given document"
},
{
"type": "image_url",
"image_url": "data:application/pdf;base64,{encoded_file}" # 👈 PDF
"type": "file",
"file": {
"file_data": f"data:application/pdf;base64,{encoded_file}", # 👈 PDF
}
}
}
]

View file

@ -1168,14 +1168,22 @@ os.environ["AWS_REGION_NAME"] = ""
# pdf url
image_url = "https://www.w3.org/WAI/ER/tests/xhtml/testfiles/resources/pdf/dummy.pdf"
# Download the file
response = requests.get(url)
file_data = response.content
encoded_file = base64.b64encode(file_data).decode("utf-8")
# model
model = "bedrock/anthropic.claude-3-5-sonnet-20240620-v1:0"
image_content = [
{"type": "text", "text": "What's this file about?"},
{
"type": "image_url",
"image_url": image_url, # OR {"url": image_url}
"type": "file",
"file": {
"file_data": f"data:application/pdf;base64,{encoded_file}", # 👈 PDF
}
},
]
@ -1221,8 +1229,10 @@ curl -X POST 'http://0.0.0.0:4000/chat/completions' \
"messages": [
{"role": "user", "content": {"type": "text", "text": "What's this file about?"}},
{
"type": "image_url",
"image_url": "https://www.w3.org/WAI/ER/tests/xhtml/testfiles/resources/pdf/dummy.pdf",
"type": "file",
"file": {
"file_data": f"data:application/pdf;base64,{encoded_file}", # 👈 PDF
}
}
]
}'

View file

@ -365,7 +365,7 @@ curl -X POST 'http://0.0.0.0:4000/chat/completions' \
</Tabs>
## Specifying Safety Settings
In certain use-cases you may need to make calls to the models and pass [safety settigns](https://ai.google.dev/docs/safety_setting_gemini) different from the defaults. To do so, simple pass the `safety_settings` argument to `completion` or `acompletion`. For example:
In certain use-cases you may need to make calls to the models and pass [safety settings](https://ai.google.dev/docs/safety_setting_gemini) different from the defaults. To do so, simply pass the `safety_settings` argument to `completion` or `acompletion`. For example:
```python
response = completion(

View file

@ -82,7 +82,7 @@ from litellm import completion
os.environ["XAI_API_KEY"] = "your-api-key"
response = completion(
model="xai/grok-2-latest",
model="xai/grok-2-vision-latest",
messages=[
{
"role": "user",

View file

@ -23,6 +23,12 @@ In the newly created guard's page, you can find a reference to the prompt policy
You can decide which detections will be enabled, and set the threshold for each detection.
:::info
When using LiteLLM with virtual keys, key-specific policies can be set directly in Aim's guards page by specifying the virtual key alias when creating the guard.
Only the aliases of your virtual keys (and not the actual key secrets) will be sent to Aim.
:::
### 3. Add Aim Guardrail on your LiteLLM config.yaml
Define your guardrails under the `guardrails` section

View file

@ -17,6 +17,14 @@ model_list:
api_key: os.environ/OPENAI_API_KEY
guardrails:
- guardrail_name: general-guard
litellm_params:
guardrail: aim
mode: [pre_call, post_call]
api_key: os.environ/AIM_API_KEY
api_base: os.environ/AIM_API_BASE
default_on: true # Optional
- guardrail_name: "aporia-pre-guard"
litellm_params:
guardrail: aporia # supported values: "aporia", "lakera"
@ -45,6 +53,7 @@ guardrails:
- `pre_call` Run **before** LLM call, on **input**
- `post_call` Run **after** LLM call, on **input & output**
- `during_call` Run **during** LLM call, on **input**. Same as `pre_call` but runs in parallel with the LLM call. Response not returned until guardrail check completes
- A list of the above values to run multiple modes, e.g. `mode: [pre_call, post_call]`
## 2. Start LiteLLM Gateway
@ -569,4 +578,4 @@ guardrails: Union[
class DynamicGuardrailParams:
extra_body: Dict[str, Any] # Additional parameters for the guardrail
```
```

Binary file not shown.


View file

@ -137,15 +137,17 @@ const sidebars = {
label: "[Beta] Guardrails",
items: [
"proxy/guardrails/quick_start",
"proxy/guardrails/aim_security",
"proxy/guardrails/aporia_api",
"proxy/guardrails/bedrock",
"proxy/guardrails/guardrails_ai",
"proxy/guardrails/lakera_ai",
"proxy/guardrails/pii_masking_v2",
"proxy/guardrails/secret_detection",
"proxy/guardrails/custom_guardrail",
"prompt_injection"
...[
"proxy/guardrails/aim_security",
"proxy/guardrails/aporia_api",
"proxy/guardrails/bedrock",
"proxy/guardrails/guardrails_ai",
"proxy/guardrails/lakera_ai",
"proxy/guardrails/pii_masking_v2",
"proxy/guardrails/secret_detection",
"proxy/guardrails/custom_guardrail",
"proxy/guardrails/prompt_injection",
].sort(),
],
},
{

Binary file not shown.

View file

@ -0,0 +1,4 @@
-- AlterTable
ALTER TABLE "LiteLLM_DailyUserSpend" ADD COLUMN "failed_requests" INTEGER NOT NULL DEFAULT 0,
ADD COLUMN "successful_requests" INTEGER NOT NULL DEFAULT 0;

litellm-proxy-extras/poetry.lock generated Normal file (7 lines)
View file

@ -0,0 +1,7 @@
# This file is automatically @generated by Poetry 1.8.3 and should not be changed by hand.
package = []
[metadata]
lock-version = "2.0"
python-versions = ">=3.8.1,<4.0, !=3.9.7"
content-hash = "2cf39473e67ff0615f0a61c9d2ac9f02b38cc08cbb1bdb893d89bee002646623"

View file

@ -1,6 +1,6 @@
[tool.poetry]
name = "litellm-proxy-extras"
version = "0.1.1"
version = "0.1.2"
description = "Additional files for the LiteLLM Proxy. Reduces the size of the main litellm package."
authors = ["BerriAI"]
readme = "README.md"
@ -22,7 +22,7 @@ requires = ["poetry-core"]
build-backend = "poetry.core.masonry.api"
[tool.commitizen]
version = "0.1.1"
version = "0.1.2"
version_files = [
"pyproject.toml:version",
"../requirements.txt:litellm-proxy-extras==",

View file

@ -1038,6 +1038,7 @@ from .cost_calculator import response_cost_calculator, cost_per_token
### ADAPTERS ###
from .types.adapter import AdapterItem
import litellm.anthropic_interface as anthropic
adapters: List[AdapterItem] = []

View file

@ -214,7 +214,7 @@ def _init_redis_sentinel(redis_kwargs) -> redis.Redis:
# Set up the Sentinel client
sentinel = redis.Sentinel(
sentinel_nodes,
sentinel_nodes,
socket_timeout=0.1,
password=sentinel_password,
)

View file

@ -3,4 +3,4 @@ import importlib_metadata
try:
version = importlib_metadata.version("litellm")
except Exception:
pass
version = "unknown"

View file

@ -0,0 +1,6 @@
"""
Anthropic module for LiteLLM
"""
from .messages import acreate, create
__all__ = ["acreate", "create"]

View file

@ -0,0 +1,117 @@
"""
Interface for Anthropic's messages API
Use this to call LLMs in Anthropic /messages Request/Response format
This is an __init__.py file to allow the following interface
- litellm.messages.acreate
- litellm.messages.create
"""
from typing import AsyncIterator, Dict, Iterator, List, Optional, Union
from litellm.llms.anthropic.experimental_pass_through.messages.handler import (
anthropic_messages as _async_anthropic_messages,
)
from litellm.types.llms.anthropic_messages.anthropic_response import (
AnthropicMessagesResponse,
)
async def acreate(
max_tokens: int,
messages: List[Dict],
model: str,
metadata: Optional[Dict] = None,
stop_sequences: Optional[List[str]] = None,
stream: Optional[bool] = False,
system: Optional[str] = None,
temperature: Optional[float] = 1.0,
thinking: Optional[Dict] = None,
tool_choice: Optional[Dict] = None,
tools: Optional[List[Dict]] = None,
top_k: Optional[int] = None,
top_p: Optional[float] = None,
**kwargs
) -> Union[AnthropicMessagesResponse, AsyncIterator]:
"""
Async wrapper for Anthropic's messages API
Args:
max_tokens (int): Maximum tokens to generate (required)
messages (List[Dict]): List of message objects with role and content (required)
model (str): Model name to use (required)
metadata (Dict, optional): Request metadata
stop_sequences (List[str], optional): Custom stop sequences
stream (bool, optional): Whether to stream the response
system (str, optional): System prompt
temperature (float, optional): Sampling temperature (0.0 to 1.0)
thinking (Dict, optional): Extended thinking configuration
tool_choice (Dict, optional): Tool choice configuration
tools (List[Dict], optional): List of tool definitions
top_k (int, optional): Top K sampling parameter
top_p (float, optional): Nucleus sampling parameter
**kwargs: Additional arguments
Returns:
Dict: Response from the API
"""
return await _async_anthropic_messages(
max_tokens=max_tokens,
messages=messages,
model=model,
metadata=metadata,
stop_sequences=stop_sequences,
stream=stream,
system=system,
temperature=temperature,
thinking=thinking,
tool_choice=tool_choice,
tools=tools,
top_k=top_k,
top_p=top_p,
**kwargs,
)
async def create(
max_tokens: int,
messages: List[Dict],
model: str,
metadata: Optional[Dict] = None,
stop_sequences: Optional[List[str]] = None,
stream: Optional[bool] = False,
system: Optional[str] = None,
temperature: Optional[float] = 1.0,
thinking: Optional[Dict] = None,
tool_choice: Optional[Dict] = None,
tools: Optional[List[Dict]] = None,
top_k: Optional[int] = None,
top_p: Optional[float] = None,
**kwargs
) -> Union[AnthropicMessagesResponse, Iterator]:
"""
Async wrapper for Anthropic's messages API
Args:
max_tokens (int): Maximum tokens to generate (required)
messages (List[Dict]): List of message objects with role and content (required)
model (str): Model name to use (required)
metadata (Dict, optional): Request metadata
stop_sequences (List[str], optional): Custom stop sequences
stream (bool, optional): Whether to stream the response
system (str, optional): System prompt
temperature (float, optional): Sampling temperature (0.0 to 1.0)
thinking (Dict, optional): Extended thinking configuration
tool_choice (Dict, optional): Tool choice configuration
tools (List[Dict], optional): List of tool definitions
top_k (int, optional): Top K sampling parameter
top_p (float, optional): Nucleus sampling parameter
**kwargs: Additional arguments
Returns:
Dict: Response from the API
"""
raise NotImplementedError("This function is not implemented")

View file

@ -0,0 +1,116 @@
## Use LLM API endpoints in Anthropic Interface
Note: This module is called `anthropic_interface` because `anthropic` is a known Python package name, and reusing it was failing mypy type checking.
## Usage
---
### LiteLLM Python SDK
#### Non-streaming example
```python showLineNumbers title="Example using LiteLLM Python SDK"
import litellm
response = await litellm.anthropic.messages.acreate(
messages=[{"role": "user", "content": "Hello, can you tell me a short joke?"}],
api_key=api_key,
model="anthropic/claude-3-haiku-20240307",
max_tokens=100,
)
```
Example response:
```json
{
"content": [
{
"text": "Hi! this is a very short joke",
"type": "text"
}
],
"id": "msg_013Zva2CMHLNnXjNJJKqJ2EF",
"model": "claude-3-7-sonnet-20250219",
"role": "assistant",
"stop_reason": "end_turn",
"stop_sequence": null,
"type": "message",
"usage": {
"input_tokens": 2095,
"output_tokens": 503,
"cache_creation_input_tokens": 2095,
"cache_read_input_tokens": 0
}
}
```
#### Streaming example
```python showLineNumbers title="Example using LiteLLM Python SDK"
import litellm
response = await litellm.anthropic.messages.acreate(
messages=[{"role": "user", "content": "Hello, can you tell me a short joke?"}],
api_key=api_key,
model="anthropic/claude-3-haiku-20240307",
max_tokens=100,
stream=True,
)
async for chunk in response:
print(chunk)
```
### LiteLLM Proxy Server
1. Setup config.yaml
```yaml
model_list:
- model_name: anthropic-claude
litellm_params:
model: claude-3-7-sonnet-latest
```
2. Start proxy
```bash
litellm --config /path/to/config.yaml
```
3. Test it!
<Tabs>
<TabItem label="Anthropic Python SDK" value="python">
```python showLineNumbers title="Example using LiteLLM Proxy Server"
import anthropic
# point anthropic sdk to litellm proxy
client = anthropic.Anthropic(
base_url="http://0.0.0.0:4000",
api_key="sk-1234",
)
response = client.messages.create(
messages=[{"role": "user", "content": "Hello, can you tell me a short joke?"}],
model="anthropic/claude-3-haiku-20240307",
max_tokens=100,
)
```
</TabItem>
<TabItem label="curl" value="curl">
```bash showLineNumbers title="Example using LiteLLM Proxy Server"
curl -L -X POST 'http://0.0.0.0:4000/v1/messages' \
-H 'content-type: application/json' \
-H 'x-api-key: $LITELLM_API_KEY' \
-H 'anthropic-version: 2023-06-01' \
-d '{
"model": "anthropic-claude",
"messages": [
{
"role": "user",
"content": "Hello, can you tell me a short joke?"
}
],
"max_tokens": 100
}'
```

View file

@ -19,6 +19,7 @@ DEFAULT_IMAGE_HEIGHT = 300
MAX_SIZE_PER_ITEM_IN_MEMORY_CACHE_IN_KB = 1024 # 1MB = 1024KB
SINGLE_DEPLOYMENT_TRAFFIC_FAILURE_THRESHOLD = 1000 # Minimum number of requests to consider "reasonable traffic". Used for single-deployment cooldown logic.
REDIS_UPDATE_BUFFER_KEY = "litellm_spend_update_buffer"
REDIS_DAILY_SPEND_UPDATE_BUFFER_KEY = "litellm_daily_spend_update_buffer"
MAX_REDIS_BUFFER_DEQUEUE_COUNT = 100
#### RELIABILITY ####
REPEATED_STREAMING_CHUNK_LIMIT = 100 # catch if model starts looping the same chunk while streaming. Uses high default to prevent false positives.

View file

@ -550,6 +550,7 @@ def completion_cost( # noqa: PLR0915
custom_pricing: Optional[bool] = None,
base_model: Optional[str] = None,
standard_built_in_tools_params: Optional[StandardBuiltInToolsParams] = None,
litellm_model_name: Optional[str] = None,
) -> float:
"""
Calculate the cost of a given completion call for GPT-3.5-turbo, llama2, any litellm supported llm.
@ -602,7 +603,7 @@ def completion_cost( # noqa: PLR0915
completion_response=completion_response
)
rerank_billed_units: Optional[RerankBilledUnits] = None
model = _select_model_name_for_cost_calc(
selected_model = _select_model_name_for_cost_calc(
model=model,
completion_response=completion_response,
custom_llm_provider=custom_llm_provider,
@ -610,232 +611,268 @@ def completion_cost( # noqa: PLR0915
base_model=base_model,
)
verbose_logger.info(f"selected model name for cost calculation: {model}")
potential_model_names = [selected_model]
if model is not None:
potential_model_names.append(model)
if completion_response is not None and (
isinstance(completion_response, BaseModel)
or isinstance(completion_response, dict)
): # tts returns a custom class
if isinstance(completion_response, dict):
usage_obj: Optional[Union[dict, Usage]] = completion_response.get(
"usage", {}
)
else:
usage_obj = getattr(completion_response, "usage", {})
if isinstance(usage_obj, BaseModel) and not _is_known_usage_objects(
usage_obj=usage_obj
):
setattr(
completion_response,
"usage",
litellm.Usage(**usage_obj.model_dump()),
)
if usage_obj is None:
_usage = {}
elif isinstance(usage_obj, BaseModel):
_usage = usage_obj.model_dump()
else:
_usage = usage_obj
if ResponseAPILoggingUtils._is_response_api_usage(_usage):
_usage = (
ResponseAPILoggingUtils._transform_response_api_usage_to_chat_usage(
_usage
).model_dump()
)
# get input/output tokens from completion_response
prompt_tokens = _usage.get("prompt_tokens", 0)
completion_tokens = _usage.get("completion_tokens", 0)
cache_creation_input_tokens = _usage.get("cache_creation_input_tokens", 0)
cache_read_input_tokens = _usage.get("cache_read_input_tokens", 0)
if (
"prompt_tokens_details" in _usage
and _usage["prompt_tokens_details"] != {}
and _usage["prompt_tokens_details"]
):
prompt_tokens_details = _usage.get("prompt_tokens_details", {})
cache_read_input_tokens = prompt_tokens_details.get("cached_tokens", 0)
total_time = getattr(completion_response, "_response_ms", 0)
hidden_params = getattr(completion_response, "_hidden_params", None)
if hidden_params is not None:
custom_llm_provider = hidden_params.get(
"custom_llm_provider", custom_llm_provider or None
)
region_name = hidden_params.get("region_name", region_name)
size = hidden_params.get("optional_params", {}).get(
"size", "1024-x-1024"
) # openai default
quality = hidden_params.get("optional_params", {}).get(
"quality", "standard"
) # openai default
n = hidden_params.get("optional_params", {}).get(
"n", 1
) # openai default
else:
if model is None:
raise ValueError(
f"Model is None and does not exist in passed completion_response. Passed completion_response={completion_response}, model={model}"
)
if len(messages) > 0:
prompt_tokens = token_counter(model=model, messages=messages)
elif len(prompt) > 0:
prompt_tokens = token_counter(model=model, text=prompt)
completion_tokens = token_counter(model=model, text=completion)
if model is None:
raise ValueError(
f"Model is None and does not exist in passed completion_response. Passed completion_response={completion_response}, model={model}"
)
if custom_llm_provider is None:
for idx, model in enumerate(potential_model_names):
try:
model, custom_llm_provider, _, _ = litellm.get_llm_provider(
model=model
) # strip the llm provider from the model name -> for image gen cost calculation
verbose_logger.info(
f"selected model name for cost calculation: {model}"
)
if completion_response is not None and (
isinstance(completion_response, BaseModel)
or isinstance(completion_response, dict)
): # tts returns a custom class
if isinstance(completion_response, dict):
usage_obj: Optional[
Union[dict, Usage]
] = completion_response.get("usage", {})
else:
usage_obj = getattr(completion_response, "usage", {})
if isinstance(usage_obj, BaseModel) and not _is_known_usage_objects(
usage_obj=usage_obj
):
setattr(
completion_response,
"usage",
litellm.Usage(**usage_obj.model_dump()),
)
if usage_obj is None:
_usage = {}
elif isinstance(usage_obj, BaseModel):
_usage = usage_obj.model_dump()
else:
_usage = usage_obj
if ResponseAPILoggingUtils._is_response_api_usage(_usage):
_usage = ResponseAPILoggingUtils._transform_response_api_usage_to_chat_usage(
_usage
).model_dump()
# get input/output tokens from completion_response
prompt_tokens = _usage.get("prompt_tokens", 0)
completion_tokens = _usage.get("completion_tokens", 0)
cache_creation_input_tokens = _usage.get(
"cache_creation_input_tokens", 0
)
cache_read_input_tokens = _usage.get("cache_read_input_tokens", 0)
if (
"prompt_tokens_details" in _usage
and _usage["prompt_tokens_details"] != {}
and _usage["prompt_tokens_details"]
):
prompt_tokens_details = _usage.get("prompt_tokens_details", {})
cache_read_input_tokens = prompt_tokens_details.get(
"cached_tokens", 0
)
total_time = getattr(completion_response, "_response_ms", 0)
hidden_params = getattr(completion_response, "_hidden_params", None)
if hidden_params is not None:
custom_llm_provider = hidden_params.get(
"custom_llm_provider", custom_llm_provider or None
)
region_name = hidden_params.get("region_name", region_name)
size = hidden_params.get("optional_params", {}).get(
"size", "1024-x-1024"
) # openai default
quality = hidden_params.get("optional_params", {}).get(
"quality", "standard"
) # openai default
n = hidden_params.get("optional_params", {}).get(
"n", 1
) # openai default
else:
if model is None:
raise ValueError(
f"Model is None and does not exist in passed completion_response. Passed completion_response={completion_response}, model={model}"
)
if len(messages) > 0:
prompt_tokens = token_counter(model=model, messages=messages)
elif len(prompt) > 0:
prompt_tokens = token_counter(model=model, text=prompt)
completion_tokens = token_counter(model=model, text=completion)
if model is None:
raise ValueError(
f"Model is None and does not exist in passed completion_response. Passed completion_response={completion_response}, model={model}"
)
if custom_llm_provider is None:
try:
model, custom_llm_provider, _, _ = litellm.get_llm_provider(
model=model
) # strip the llm provider from the model name -> for image gen cost calculation
except Exception as e:
verbose_logger.debug(
"litellm.cost_calculator.py::completion_cost() - Error inferring custom_llm_provider - {}".format(
str(e)
)
)
if (
call_type == CallTypes.image_generation.value
or call_type == CallTypes.aimage_generation.value
or call_type
== PassthroughCallTypes.passthrough_image_generation.value
):
### IMAGE GENERATION COST CALCULATION ###
if custom_llm_provider == "vertex_ai":
if isinstance(completion_response, ImageResponse):
return vertex_ai_image_cost_calculator(
model=model,
image_response=completion_response,
)
elif custom_llm_provider == "bedrock":
if isinstance(completion_response, ImageResponse):
return bedrock_image_cost_calculator(
model=model,
size=size,
image_response=completion_response,
optional_params=optional_params,
)
raise TypeError(
"completion_response must be of type ImageResponse for bedrock image cost calculation"
)
else:
return default_image_cost_calculator(
model=model,
quality=quality,
custom_llm_provider=custom_llm_provider,
n=n,
size=size,
optional_params=optional_params,
)
elif (
call_type == CallTypes.speech.value
or call_type == CallTypes.aspeech.value
):
prompt_characters = litellm.utils._count_characters(text=prompt)
elif (
call_type == CallTypes.atranscription.value
or call_type == CallTypes.transcription.value
):
audio_transcription_file_duration = getattr(
completion_response, "duration", 0.0
)
elif (
call_type == CallTypes.rerank.value
or call_type == CallTypes.arerank.value
):
if completion_response is not None and isinstance(
completion_response, RerankResponse
):
meta_obj = completion_response.meta
if meta_obj is not None:
billed_units = meta_obj.get("billed_units", {}) or {}
else:
billed_units = {}
rerank_billed_units = RerankBilledUnits(
search_units=billed_units.get("search_units"),
total_tokens=billed_units.get("total_tokens"),
)
search_units = (
billed_units.get("search_units") or 1
) # cohere charges per request by default.
completion_tokens = search_units
# Calculate cost based on prompt_tokens, completion_tokens
if (
"togethercomputer" in model
or "together_ai" in model
or custom_llm_provider == "together_ai"
):
# together ai prices based on size of llm
# get_model_params_and_category takes a model name and returns the category of LLM size it is in model_prices_and_context_window.json
model = get_model_params_and_category(
model, call_type=CallTypes(call_type)
)
# replicate llms are calculated based on time for request running
# see https://replicate.com/pricing
elif (
model in litellm.replicate_models or "replicate" in model
) and model not in litellm.model_cost:
# for unmapped replicate model, default to replicate's time tracking logic
return get_replicate_completion_pricing(completion_response, total_time) # type: ignore
if model is None:
raise ValueError(
f"Model is None and does not exist in passed completion_response. Passed completion_response={completion_response}, model={model}"
)
if (
custom_llm_provider is not None
and custom_llm_provider == "vertex_ai"
):
# Calculate the prompt characters + response characters
if len(messages) > 0:
prompt_string = litellm.utils.get_formatted_prompt(
data={"messages": messages}, call_type="completion"
)
prompt_characters = litellm.utils._count_characters(
text=prompt_string
)
if completion_response is not None and isinstance(
completion_response, ModelResponse
):
completion_string = litellm.utils.get_response_string(
response_obj=completion_response
)
completion_characters = litellm.utils._count_characters(
text=completion_string
)
(
prompt_tokens_cost_usd_dollar,
completion_tokens_cost_usd_dollar,
) = cost_per_token(
model=model,
prompt_tokens=prompt_tokens,
completion_tokens=completion_tokens,
custom_llm_provider=custom_llm_provider,
response_time_ms=total_time,
region_name=region_name,
custom_cost_per_second=custom_cost_per_second,
custom_cost_per_token=custom_cost_per_token,
prompt_characters=prompt_characters,
completion_characters=completion_characters,
cache_creation_input_tokens=cache_creation_input_tokens,
cache_read_input_tokens=cache_read_input_tokens,
usage_object=cost_per_token_usage_object,
call_type=cast(CallTypesLiteral, call_type),
audio_transcription_file_duration=audio_transcription_file_duration,
rerank_billed_units=rerank_billed_units,
)
_final_cost = (
prompt_tokens_cost_usd_dollar + completion_tokens_cost_usd_dollar
)
_final_cost += (
StandardBuiltInToolCostTracking.get_cost_for_built_in_tools(
model=model,
response_object=completion_response,
standard_built_in_tools_params=standard_built_in_tools_params,
custom_llm_provider=custom_llm_provider,
)
)
return _final_cost
except Exception as e:
verbose_logger.debug(
"litellm.cost_calculator.py::completion_cost() - Error inferring custom_llm_provider - {}".format(
str(e)
"litellm.cost_calculator.py::completion_cost() - Error calculating cost for model={} - {}".format(
model, str(e)
)
)
if (
call_type == CallTypes.image_generation.value
or call_type == CallTypes.aimage_generation.value
or call_type == PassthroughCallTypes.passthrough_image_generation.value
):
### IMAGE GENERATION COST CALCULATION ###
if custom_llm_provider == "vertex_ai":
if isinstance(completion_response, ImageResponse):
return vertex_ai_image_cost_calculator(
model=model,
image_response=completion_response,
)
elif custom_llm_provider == "bedrock":
if isinstance(completion_response, ImageResponse):
return bedrock_image_cost_calculator(
model=model,
size=size,
image_response=completion_response,
optional_params=optional_params,
)
raise TypeError(
"completion_response must be of type ImageResponse for bedrock image cost calculation"
)
else:
return default_image_cost_calculator(
model=model,
quality=quality,
custom_llm_provider=custom_llm_provider,
n=n,
size=size,
optional_params=optional_params,
)
elif (
call_type == CallTypes.speech.value or call_type == CallTypes.aspeech.value
):
prompt_characters = litellm.utils._count_characters(text=prompt)
elif (
call_type == CallTypes.atranscription.value
or call_type == CallTypes.transcription.value
):
audio_transcription_file_duration = getattr(
completion_response, "duration", 0.0
if idx == len(potential_model_names) - 1:
raise e
raise Exception(
"Unable to calculat cost for received potential model names - {}".format(
potential_model_names
)
elif (
call_type == CallTypes.rerank.value or call_type == CallTypes.arerank.value
):
if completion_response is not None and isinstance(
completion_response, RerankResponse
):
meta_obj = completion_response.meta
if meta_obj is not None:
billed_units = meta_obj.get("billed_units", {}) or {}
else:
billed_units = {}
rerank_billed_units = RerankBilledUnits(
search_units=billed_units.get("search_units"),
total_tokens=billed_units.get("total_tokens"),
)
search_units = (
billed_units.get("search_units") or 1
) # cohere charges per request by default.
completion_tokens = search_units
# Calculate cost based on prompt_tokens, completion_tokens
if (
"togethercomputer" in model
or "together_ai" in model
or custom_llm_provider == "together_ai"
):
# together ai prices based on size of llm
# get_model_params_and_category takes a model name and returns the category of LLM size it is in model_prices_and_context_window.json
model = get_model_params_and_category(model, call_type=CallTypes(call_type))
# replicate llms are calculated based on time for request running
# see https://replicate.com/pricing
elif (
model in litellm.replicate_models or "replicate" in model
) and model not in litellm.model_cost:
# for unmapped replicate model, default to replicate's time tracking logic
return get_replicate_completion_pricing(completion_response, total_time) # type: ignore
if model is None:
raise ValueError(
f"Model is None and does not exist in passed completion_response. Passed completion_response={completion_response}, model={model}"
)
if custom_llm_provider is not None and custom_llm_provider == "vertex_ai":
# Calculate the prompt characters + response characters
if len(messages) > 0:
prompt_string = litellm.utils.get_formatted_prompt(
data={"messages": messages}, call_type="completion"
)
prompt_characters = litellm.utils._count_characters(text=prompt_string)
if completion_response is not None and isinstance(
completion_response, ModelResponse
):
completion_string = litellm.utils.get_response_string(
response_obj=completion_response
)
completion_characters = litellm.utils._count_characters(
text=completion_string
)
(
prompt_tokens_cost_usd_dollar,
completion_tokens_cost_usd_dollar,
) = cost_per_token(
model=model,
prompt_tokens=prompt_tokens,
completion_tokens=completion_tokens,
custom_llm_provider=custom_llm_provider,
response_time_ms=total_time,
region_name=region_name,
custom_cost_per_second=custom_cost_per_second,
custom_cost_per_token=custom_cost_per_token,
prompt_characters=prompt_characters,
completion_characters=completion_characters,
cache_creation_input_tokens=cache_creation_input_tokens,
cache_read_input_tokens=cache_read_input_tokens,
usage_object=cost_per_token_usage_object,
call_type=call_type,
audio_transcription_file_duration=audio_transcription_file_duration,
rerank_billed_units=rerank_billed_units,
)
_final_cost = prompt_tokens_cost_usd_dollar + completion_tokens_cost_usd_dollar
_final_cost += StandardBuiltInToolCostTracking.get_cost_for_built_in_tools(
model=model,
response_object=completion_response,
standard_built_in_tools_params=standard_built_in_tools_params,
custom_llm_provider=custom_llm_provider,
)
return _final_cost
except Exception as e:
raise e
@ -897,6 +934,7 @@ def response_cost_calculator(
custom_pricing: Optional[bool] = None,
prompt: str = "",
standard_built_in_tools_params: Optional[StandardBuiltInToolsParams] = None,
litellm_model_name: Optional[str] = None,
) -> float:
"""
Returns

View file

@ -290,6 +290,7 @@ class Logging(LiteLLMLoggingBaseClass):
"input": _input,
"litellm_params": litellm_params,
"applied_guardrails": applied_guardrails,
"model": model,
}
def process_dynamic_callbacks(self):
@ -892,6 +893,7 @@ class Logging(LiteLLMLoggingBaseClass):
ResponseCompletedEvent,
],
cache_hit: Optional[bool] = None,
litellm_model_name: Optional[str] = None,
) -> Optional[float]:
"""
Calculate response cost using result + logging object variables.
@ -917,7 +919,7 @@ class Logging(LiteLLMLoggingBaseClass):
try:
response_cost_calculator_kwargs = {
"response_object": result,
"model": self.model,
"model": litellm_model_name or self.model,
"cache_hit": cache_hit,
"custom_llm_provider": self.model_call_details.get(
"custom_llm_provider", None
@ -1009,6 +1011,10 @@ class Logging(LiteLLMLoggingBaseClass):
return False
return True
def _update_completion_start_time(self, completion_start_time: datetime.datetime):
self.completion_start_time = completion_start_time
self.model_call_details["completion_start_time"] = self.completion_start_time
def _success_handler_helper_fn(
self,
result=None,

View file

@ -22,6 +22,7 @@ from litellm.types.llms.openai import (
AllMessageValues,
ChatCompletionAssistantMessage,
ChatCompletionAssistantToolCall,
ChatCompletionFileObject,
ChatCompletionFunctionMessage,
ChatCompletionImageObject,
ChatCompletionTextObject,
@ -1455,6 +1456,25 @@ def anthropic_messages_pt( # noqa: PLR0915
user_content.append(_content_element)
elif m.get("type", "") == "document":
user_content.append(cast(AnthropicMessagesDocumentParam, m))
elif m.get("type", "") == "file":
file_message = cast(ChatCompletionFileObject, m)
file_data = file_message["file"].get("file_data")
if file_data:
image_chunk = convert_to_anthropic_image_obj(
openai_image_url=file_data,
format=file_message["file"].get("format"),
)
anthropic_document_param = (
AnthropicMessagesDocumentParam(
type="document",
source=AnthropicContentParamSource(
type="base64",
media_type=image_chunk["media_type"],
data=image_chunk["data"],
),
)
)
user_content.append(anthropic_document_param)
elif isinstance(user_message_types_block["content"], str):
_anthropic_content_text_element: AnthropicMessagesTextParam = {
"type": "text",
@ -2885,6 +2905,11 @@ class BedrockConverseMessagesProcessor:
image_url=image_url, format=format
)
_parts.append(_part) # type: ignore
elif element["type"] == "file":
_part = await BedrockConverseMessagesProcessor._async_process_file_message(
message=cast(ChatCompletionFileObject, element)
)
_parts.append(_part)
_cache_point_block = (
litellm.AmazonConverseConfig()._get_cache_point_block(
message_block=cast(
@ -3054,6 +3079,45 @@ class BedrockConverseMessagesProcessor:
reasoning_content_blocks.append(bedrock_content_block)
return reasoning_content_blocks
@staticmethod
def _process_file_message(message: ChatCompletionFileObject) -> BedrockContentBlock:
file_message = message["file"]
file_data = file_message.get("file_data")
file_id = file_message.get("file_id")
if file_data is None and file_id is None:
raise litellm.BadRequestError(
message="file_data and file_id cannot both be None. Got={}".format(
message
),
model="",
llm_provider="bedrock",
)
format = file_message.get("format")
return BedrockImageProcessor.process_image_sync(
image_url=cast(str, file_id or file_data), format=format
)
@staticmethod
async def _async_process_file_message(
message: ChatCompletionFileObject,
) -> BedrockContentBlock:
file_message = message["file"]
file_data = file_message.get("file_data")
file_id = file_message.get("file_id")
format = file_message.get("format")
if file_data is None and file_id is None:
raise litellm.BadRequestError(
message="file_data and file_id cannot both be None. Got={}".format(
message
),
model="",
llm_provider="bedrock",
)
return await BedrockImageProcessor.process_image_async(
image_url=cast(str, file_id or file_data), format=format
)
def _bedrock_converse_messages_pt( # noqa: PLR0915
messages: List,
@ -3126,6 +3190,13 @@ def _bedrock_converse_messages_pt( # noqa: PLR0915
format=format,
)
_parts.append(_part) # type: ignore
elif element["type"] == "file":
_part = (
BedrockConverseMessagesProcessor._process_file_message(
message=cast(ChatCompletionFileObject, element)
)
)
_parts.append(_part)
_cache_point_block = (
litellm.AmazonConverseConfig()._get_cache_point_block(
message_block=cast(

View file

@ -1,5 +1,6 @@
import asyncio
import collections.abc
import datetime
import json
import threading
import time
@ -1567,6 +1568,10 @@ class CustomStreamWrapper:
if response is None:
continue
if self.logging_obj.completion_start_time is None:
self.logging_obj._update_completion_start_time(
completion_start_time=datetime.datetime.now()
)
## LOGGING
executor.submit(
self.run_success_logging_and_cache_storage,
@ -1721,6 +1726,11 @@ class CustomStreamWrapper:
if processed_chunk is None:
continue
if self.logging_obj.completion_start_time is None:
self.logging_obj._update_completion_start_time(
completion_start_time=datetime.datetime.now()
)
choice = processed_chunk.choices[0]
if isinstance(choice, StreamingChoices):
self.response_uptil_now += choice.delta.get("content", "") or ""

View file

@ -18,8 +18,10 @@ from litellm.types.llms.anthropic import (
AnthropicMessagesTool,
AnthropicMessagesToolChoice,
AnthropicSystemMessageContent,
AnthropicThinkingParam,
)
from litellm.types.llms.openai import (
REASONING_EFFORT,
AllMessageValues,
ChatCompletionCachedContent,
ChatCompletionSystemMessage,
@ -94,6 +96,7 @@ class AnthropicConfig(BaseConfig):
"parallel_tool_calls",
"response_format",
"user",
"reasoning_effort",
]
if "claude-3-7-sonnet" in model:
@ -141,15 +144,9 @@ class AnthropicConfig(BaseConfig):
if user_anthropic_beta_headers is not None:
betas.update(user_anthropic_beta_headers)
# Handle beta headers for Vertex AI
# We allow prompt caching beta header for Vertex, but exclude other beta headers that might cause issues
# Don't send any beta headers to Vertex, Vertex has failed requests when they are sent
if is_vertex_request is True:
vertex_safe_betas = set()
# Allow prompt caching beta header for Vertex
if "prompt-caching-2024-07-31" in betas:
vertex_safe_betas.add("prompt-caching-2024-07-31")
if len(vertex_safe_betas) > 0:
headers["anthropic-beta"] = ",".join(vertex_safe_betas)
pass
elif len(betas) > 0:
headers["anthropic-beta"] = ",".join(betas)
@ -297,6 +294,21 @@ class AnthropicConfig(BaseConfig):
new_stop = new_v
return new_stop
@staticmethod
def _map_reasoning_effort(
reasoning_effort: Optional[Union[REASONING_EFFORT, str]]
) -> Optional[AnthropicThinkingParam]:
if reasoning_effort is None:
return None
elif reasoning_effort == "low":
return AnthropicThinkingParam(type="enabled", budget_tokens=1024)
elif reasoning_effort == "medium":
return AnthropicThinkingParam(type="enabled", budget_tokens=2048)
elif reasoning_effort == "high":
return AnthropicThinkingParam(type="enabled", budget_tokens=4096)
else:
raise ValueError(f"Unmapped reasoning effort: {reasoning_effort}")
def map_openai_params(
self,
non_default_params: dict,
@ -308,10 +320,6 @@ class AnthropicConfig(BaseConfig):
non_default_params=non_default_params
)
## handle thinking tokens
self.update_optional_params_with_thinking_tokens(
non_default_params=non_default_params, optional_params=optional_params
)
for param, value in non_default_params.items():
if param == "max_tokens":
optional_params["max_tokens"] = value
@ -376,7 +384,15 @@ class AnthropicConfig(BaseConfig):
optional_params["metadata"] = {"user_id": value}
if param == "thinking":
optional_params["thinking"] = value
elif param == "reasoning_effort" and isinstance(value, str):
optional_params["thinking"] = AnthropicConfig._map_reasoning_effort(
value
)
## handle thinking tokens
self.update_optional_params_with_thinking_tokens(
non_default_params=non_default_params, optional_params=optional_params
)
return optional_params
def _create_json_tool_call_for_response_format(

View file

@ -6,7 +6,7 @@
"""
import json
from typing import Any, AsyncIterator, Dict, Optional, Union, cast
from typing import AsyncIterator, Dict, List, Optional, Union, cast
import httpx
@ -19,6 +19,9 @@ from litellm.llms.custom_httpx.http_handler import (
AsyncHTTPHandler,
get_async_httpx_client,
)
from litellm.types.llms.anthropic_messages.anthropic_response import (
AnthropicMessagesResponse,
)
from litellm.types.router import GenericLiteLLMParams
from litellm.types.utils import ProviderSpecificHeader
from litellm.utils import ProviderConfigManager, client
@ -60,14 +63,25 @@ class AnthropicMessagesHandler:
@client
async def anthropic_messages(
api_key: str,
max_tokens: int,
messages: List[Dict],
model: str,
stream: bool = False,
metadata: Optional[Dict] = None,
stop_sequences: Optional[List[str]] = None,
stream: Optional[bool] = False,
system: Optional[str] = None,
temperature: Optional[float] = None,
thinking: Optional[Dict] = None,
tool_choice: Optional[Dict] = None,
tools: Optional[List[Dict]] = None,
top_k: Optional[int] = None,
top_p: Optional[float] = None,
api_key: Optional[str] = None,
api_base: Optional[str] = None,
client: Optional[AsyncHTTPHandler] = None,
custom_llm_provider: Optional[str] = None,
**kwargs,
) -> Union[Dict[str, Any], AsyncIterator]:
) -> Union[AnthropicMessagesResponse, AsyncIterator]:
"""
Makes Anthropic `/v1/messages` API calls in the Anthropic API spec
"""
@ -129,10 +143,8 @@ async def anthropic_messages(
},
custom_llm_provider=_custom_llm_provider,
)
litellm_logging_obj.model_call_details.update(kwargs)
# Prepare request body
request_body = kwargs.copy()
request_body = locals().copy()
request_body = {
k: v
for k, v in request_body.items()
@ -140,10 +152,12 @@ async def anthropic_messages(
in anthropic_messages_provider_config.get_supported_anthropic_messages_params(
model=model
)
and v is not None
}
request_body["stream"] = stream
request_body["model"] = model
litellm_logging_obj.stream = stream
litellm_logging_obj.model_call_details.update(request_body)
# Make the request
request_url = anthropic_messages_provider_config.get_complete_url(
@ -164,7 +178,7 @@ async def anthropic_messages(
url=request_url,
headers=headers,
data=json.dumps(request_body),
stream=stream,
stream=stream or False,
)
response.raise_for_status()

View file

@ -104,7 +104,10 @@ class BaseConfig(ABC):
return type_to_response_format_param(response_format=response_format)
def is_thinking_enabled(self, non_default_params: dict) -> bool:
return non_default_params.get("thinking", {}).get("type", None) == "enabled"
return (
non_default_params.get("thinking", {}).get("type") == "enabled"
or non_default_params.get("reasoning_effort") is not None
)
def update_optional_params_with_thinking_tokens(
self, non_default_params: dict, optional_params: dict
@ -116,9 +119,9 @@ class BaseConfig(ABC):
if 'thinking' is enabled and 'max_tokens' is not specified, set 'max_tokens' to the thinking token budget + DEFAULT_MAX_TOKENS
"""
is_thinking_enabled = self.is_thinking_enabled(non_default_params)
is_thinking_enabled = self.is_thinking_enabled(optional_params)
if is_thinking_enabled and "max_tokens" not in non_default_params:
thinking_token_budget = cast(dict, non_default_params["thinking"]).get(
thinking_token_budget = cast(dict, optional_params["thinking"]).get(
"budget_tokens", None
)
if thinking_token_budget is not None:

View file

@ -17,6 +17,7 @@ from litellm.litellm_core_utils.prompt_templates.factory import (
_bedrock_converse_messages_pt,
_bedrock_tools_pt,
)
from litellm.llms.anthropic.chat.transformation import AnthropicConfig
from litellm.llms.base_llm.chat.transformation import BaseConfig, BaseLLMException
from litellm.types.llms.bedrock import *
from litellm.types.llms.openai import (
@ -128,6 +129,7 @@ class AmazonConverseConfig(BaseConfig):
"claude-3-7" in model
): # [TODO]: move to a 'supports_reasoning_content' param from model cost map
supported_params.append("thinking")
supported_params.append("reasoning_effort")
return supported_params
def map_tool_choice_values(
@ -218,9 +220,7 @@ class AmazonConverseConfig(BaseConfig):
messages: Optional[List[AllMessageValues]] = None,
) -> dict:
is_thinking_enabled = self.is_thinking_enabled(non_default_params)
self.update_optional_params_with_thinking_tokens(
non_default_params=non_default_params, optional_params=optional_params
)
for param, value in non_default_params.items():
if param == "response_format" and isinstance(value, dict):
ignore_response_format_types = ["text"]
@ -297,6 +297,14 @@ class AmazonConverseConfig(BaseConfig):
optional_params["tool_choice"] = _tool_choice_value
if param == "thinking":
optional_params["thinking"] = value
elif param == "reasoning_effort" and isinstance(value, str):
optional_params["thinking"] = AnthropicConfig._map_reasoning_effort(
value
)
self.update_optional_params_with_thinking_tokens(
non_default_params=non_default_params, optional_params=optional_params
)
return optional_params

View file

@ -12,6 +12,7 @@ import httpx
from litellm.llms.base_llm.base_model_iterator import BaseModelResponseIterator
from litellm.llms.base_llm.chat.transformation import BaseLLMException
from litellm.types.llms.openrouter import OpenRouterErrorMessage
from litellm.types.utils import ModelResponse, ModelResponseStream
from ...openai.chat.gpt_transformation import OpenAIGPTConfig
@ -71,6 +72,24 @@ class OpenrouterConfig(OpenAIGPTConfig):
class OpenRouterChatCompletionStreamingHandler(BaseModelResponseIterator):
def chunk_parser(self, chunk: dict) -> ModelResponseStream:
try:
## HANDLE ERROR IN CHUNK ##
if "error" in chunk:
error_chunk = chunk["error"]
error_message = OpenRouterErrorMessage(
message="Message: {}, Metadata: {}, User ID: {}".format(
error_chunk["message"],
error_chunk.get("metadata", {}),
error_chunk.get("user_id", ""),
),
code=error_chunk["code"],
metadata=error_chunk.get("metadata", {}),
)
raise OpenRouterException(
message=error_message["message"],
status_code=error_message["code"],
headers=error_message["metadata"].get("headers", {}),
)
new_choices = []
for choice in chunk["choices"]:
choice["delta"]["reasoning_content"] = choice["delta"].get("reasoning")

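A hedged, self-contained sketch of the new error-chunk path in the streaming handler above. OpenRouterStreamError stands in for OpenRouterException, and the sample chunk is illustrative.

class OpenRouterStreamError(Exception):  # local stand-in for OpenRouterException
    def __init__(self, message: str, status_code: int, headers: dict):
        super().__init__(message)
        self.status_code = status_code
        self.headers = headers


def parse_chunk(chunk: dict) -> dict:
    # Mirror the chunk_parser change: an 'error' object in a streamed chunk is
    # raised as an exception instead of being silently dropped.
    if "error" in chunk:
        error_chunk = chunk["error"]
        message = "Message: {}, Metadata: {}, User ID: {}".format(
            error_chunk["message"],
            error_chunk.get("metadata", {}),
            error_chunk.get("user_id", ""),
        )
        raise OpenRouterStreamError(
            message=message,
            status_code=error_chunk["code"],
            headers=error_chunk.get("metadata", {}).get("headers", {}),
        )
    return chunk  # normal chunks flow through to the usual delta handling


sample_error_chunk = {"error": {"message": "rate limited", "code": 429, "metadata": {"headers": {}}}}
try:
    parse_chunk(sample_error_chunk)
except OpenRouterStreamError as e:
    print(e.status_code, e)  # 429 Message: rate limited, ...
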
View file

@ -127,21 +127,25 @@ class AWSEventStreamDecoder:
async for chunk in iterator:
event_stream_buffer.add_data(chunk)
for event in event_stream_buffer:
message = self._parse_message_from_event(event)
if message:
verbose_logger.debug("sagemaker parsed chunk bytes %s", message)
# remove data: prefix and "\n\n" at the end
message = (
litellm.CustomStreamWrapper._strip_sse_data_from_chunk(message)
or ""
)
message = message.replace("\n\n", "")
try:
message = self._parse_message_from_event(event)
if message:
verbose_logger.debug(
"sagemaker parsed chunk bytes %s", message
)
# remove data: prefix and "\n\n" at the end
message = (
litellm.CustomStreamWrapper._strip_sse_data_from_chunk(
message
)
or ""
)
message = message.replace("\n\n", "")
# Accumulate JSON data
accumulated_json += message
# Accumulate JSON data
accumulated_json += message
# Try to parse the accumulated JSON
try:
# Try to parse the accumulated JSON
_data = json.loads(accumulated_json)
if self.is_messages_api:
yield self._chunk_parser_messages_api(chunk_data=_data)
@ -149,9 +153,19 @@ class AWSEventStreamDecoder:
yield self._chunk_parser(chunk_data=_data)
# Reset accumulated_json after successful parsing
accumulated_json = ""
except json.JSONDecodeError:
# If it's not valid JSON yet, continue to the next event
continue
except json.JSONDecodeError:
# If it's not valid JSON yet, continue to the next event
continue
except UnicodeDecodeError as e:
verbose_logger.warning(
f"UnicodeDecodeError: {e}. Attempting to combine with next event."
)
continue
except Exception as e:
verbose_logger.error(
f"Error parsing message: {e}. Attempting to combine with next event."
)
continue
# Handle any remaining data after the iterator is exhausted
if accumulated_json:
@ -167,6 +181,8 @@ class AWSEventStreamDecoder:
f"Warning: Unparseable JSON data remained: {accumulated_json}"
)
yield None
except Exception as e:
verbose_logger.error(f"Final error parsing accumulated JSON: {e}")
def _parse_message_from_event(self, event) -> Optional[str]:
response_dict = event.to_response_dict()

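A hedged sketch of the accumulate-then-parse behaviour introduced above, using plain strings in place of AWS event-stream events: fragments are concatenated until they form valid JSON, and undecodable or partial events roll over to the next iteration instead of ending the stream.

import json
from typing import Iterable, Iterator


def iter_parsed_chunks(fragments: Iterable[str]) -> Iterator[dict]:
    accumulated_json = ""
    for fragment in fragments:
        try:
            # strip the SSE-style prefix and trailing blank line, as above
            message = fragment.removeprefix("data: ").replace("\n\n", "")
            accumulated_json += message
            try:
                data = json.loads(accumulated_json)
            except json.JSONDecodeError:
                continue  # not a complete JSON object yet; wait for the next event
            accumulated_json = ""
            yield data
        except Exception:
            continue  # mirror the new behaviour: log-and-continue rather than raise


fragments = ['data: {"token": {"te', 'xt": "hello"}}\n\n', 'data: {"token": {"text": " world"}}\n\n']
print(list(iter_parsed_chunks(fragments)))
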
View file

@ -4453,6 +4453,42 @@
"source": "https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models#foundation_models",
"supports_tool_choice": true
},
"gemini-2.5-pro-exp-03-25": {
"max_tokens": 65536,
"max_input_tokens": 1048576,
"max_output_tokens": 65536,
"max_images_per_prompt": 3000,
"max_videos_per_prompt": 10,
"max_video_length": 1,
"max_audio_length_hours": 8.4,
"max_audio_per_prompt": 1,
"max_pdf_size_mb": 30,
"input_cost_per_image": 0,
"input_cost_per_video_per_second": 0,
"input_cost_per_audio_per_second": 0,
"input_cost_per_token": 0,
"input_cost_per_character": 0,
"input_cost_per_token_above_128k_tokens": 0,
"input_cost_per_character_above_128k_tokens": 0,
"input_cost_per_image_above_128k_tokens": 0,
"input_cost_per_video_per_second_above_128k_tokens": 0,
"input_cost_per_audio_per_second_above_128k_tokens": 0,
"output_cost_per_token": 0,
"output_cost_per_character": 0,
"output_cost_per_token_above_128k_tokens": 0,
"output_cost_per_character_above_128k_tokens": 0,
"litellm_provider": "vertex_ai-language-models",
"mode": "chat",
"supports_system_messages": true,
"supports_function_calling": true,
"supports_vision": true,
"supports_audio_input": true,
"supports_video_input": true,
"supports_pdf_input": true,
"supports_response_schema": true,
"supports_tool_choice": true,
"source": "https://cloud.google.com/vertex-ai/generative-ai/pricing"
},
"gemini-2.0-pro-exp-02-05": {
"max_tokens": 8192,
"max_input_tokens": 2097152,
@ -10189,6 +10225,22 @@
"litellm_provider": "voyage",
"mode": "rerank"
},
"databricks/databricks-claude-3-7-sonnet": {
"max_tokens": 200000,
"max_input_tokens": 200000,
"max_output_tokens": 128000,
"input_cost_per_token": 0.0000025,
"input_dbu_cost_per_token": 0.00003571,
"output_cost_per_token": 0.00017857,
"output_db_cost_per_token": 0.000214286,
"litellm_provider": "databricks",
"mode": "chat",
"source": "https://www.databricks.com/product/pricing/foundation-model-serving",
"metadata": {"notes": "Input/output cost per token is dbu cost * $0.070, based on databricks Claude 3.7 conversion. Number provided for reference, '*_dbu_cost_per_token' used in actual calculation."},
"supports_assistant_prefill": true,
"supports_function_calling": true,
"supports_tool_choice": true
},
"databricks/databricks-meta-llama-3-1-405b-instruct": {
"max_tokens": 128000,
"max_input_tokens": 128000,
@ -10217,7 +10269,7 @@
"metadata": {"notes": "Input/output cost per token is dbu cost * $0.070, based on databricks Llama 3.1 70B conversion. Number provided for reference, '*_dbu_cost_per_token' used in actual calculation."},
"supports_tool_choice": true
},
"databricks/meta-llama-3.3-70b-instruct": {
"databricks/databricks-meta-llama-3-3-70b-instruct": {
"max_tokens": 128000,
"max_input_tokens": 128000,
"max_output_tokens": 128000,

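For reference, a hedged sketch of looking one of the new entries up at runtime, assuming the public litellm.model_cost mapping (which is loaded from this JSON file).

import litellm

entry = litellm.model_cost.get("gemini-2.5-pro-exp-03-25", {})
print(entry.get("litellm_provider"))      # expected: "vertex_ai-language-models"
print(entry.get("max_input_tokens"))      # expected: 1048576
print(entry.get("supports_tool_choice"))  # expected: True
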
View file

@ -3,7 +3,7 @@ MCP Client Manager
This class is responsible for managing MCP SSE clients.
This is a Proxy
"""
import asyncio

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View file

@ -1 +1 @@
<!DOCTYPE html><html id="__next_error__"><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width, initial-scale=1"/><link rel="preload" as="script" fetchPriority="low" href="/ui/_next/static/chunks/webpack-75a5453f51d60261.js"/><script src="/ui/_next/static/chunks/fd9d1056-524b80e1a6b8bb06.js" async=""></script><script src="/ui/_next/static/chunks/117-883150efc583d711.js" async=""></script><script src="/ui/_next/static/chunks/main-app-4f7318ae681a6d94.js" async=""></script><title>LiteLLM Dashboard</title><meta name="description" content="LiteLLM Proxy Admin UI"/><link rel="icon" href="/ui/favicon.ico" type="image/x-icon" sizes="16x16"/><meta name="next-size-adjust"/><script src="/ui/_next/static/chunks/polyfills-42372ed130431b0a.js" noModule=""></script></head><body><script src="/ui/_next/static/chunks/webpack-75a5453f51d60261.js" async=""></script><script>(self.__next_f=self.__next_f||[]).push([0]);self.__next_f.push([2,null])</script><script>self.__next_f.push([1,"1:HL[\"/ui/_next/static/media/a34f9d1faa5f3315-s.p.woff2\",\"font\",{\"crossOrigin\":\"\",\"type\":\"font/woff2\"}]\n2:HL[\"/ui/_next/static/css/86f6cc749f6b8493.css\",\"style\"]\n3:HL[\"/ui/_next/static/css/169f9187db1ec37e.css\",\"style\"]\n"])</script><script>self.__next_f.push([1,"4:I[12846,[],\"\"]\n6:I[19107,[],\"ClientPageRoot\"]\n7:I[20314,[\"665\",\"static/chunks/3014691f-0b72c78cfebbd712.js\",\"990\",\"static/chunks/13b76428-ebdf3012af0e4489.js\",\"42\",\"static/chunks/42-1cbed529ecb084e0.js\",\"261\",\"static/chunks/261-57d48f76eec1e568.js\",\"899\",\"static/chunks/899-9af4feaf6f21839c.js\",\"394\",\"static/chunks/394-48a36e9c9b2cb488.js\",\"250\",\"static/chunks/250-601568e45a5ffece.js\",\"699\",\"static/chunks/699-2a1c30f260f44c15.js\",\"931\",\"static/chunks/app/page-e21d4be3d6c3c16e.js\"],\"default\",1]\n8:I[4707,[],\"\"]\n9:I[36423,[],\"\"]\nb:I[61060,[],\"\"]\nc:[]\n"])</script><script>self.__next_f.push([1,"0:[\"$\",\"$L4\",null,{\"buildId\":\"soi--ciJeUE6G2Fk4NMBG\",\"assetPrefix\":\"/ui\",\"urlParts\":[\"\",\"\"],\"initialTree\":[\"\",{\"children\":[\"__PAGE__\",{}]},\"$undefined\",\"$undefined\",true],\"initialSeedData\":[\"\",{\"children\":[\"__PAGE__\",{},[[\"$L5\",[\"$\",\"$L6\",null,{\"props\":{\"params\":{},\"searchParams\":{}},\"Component\":\"$7\"}],null],null],null]},[[[[\"$\",\"link\",\"0\",{\"rel\":\"stylesheet\",\"href\":\"/ui/_next/static/css/86f6cc749f6b8493.css\",\"precedence\":\"next\",\"crossOrigin\":\"$undefined\"}],[\"$\",\"link\",\"1\",{\"rel\":\"stylesheet\",\"href\":\"/ui/_next/static/css/169f9187db1ec37e.css\",\"precedence\":\"next\",\"crossOrigin\":\"$undefined\"}]],[\"$\",\"html\",null,{\"lang\":\"en\",\"children\":[\"$\",\"body\",null,{\"className\":\"__className_cf7686\",\"children\":[\"$\",\"$L8\",null,{\"parallelRouterKey\":\"children\",\"segmentPath\":[\"children\"],\"error\":\"$undefined\",\"errorStyles\":\"$undefined\",\"errorScripts\":\"$undefined\",\"template\":[\"$\",\"$L9\",null,{}],\"templateStyles\":\"$undefined\",\"templateScripts\":\"$undefined\",\"notFound\":[[\"$\",\"title\",null,{\"children\":\"404: This page could not be found.\"}],[\"$\",\"div\",null,{\"style\":{\"fontFamily\":\"system-ui,\\\"Segoe UI\\\",Roboto,Helvetica,Arial,sans-serif,\\\"Apple Color Emoji\\\",\\\"Segoe UI 
Emoji\\\"\",\"height\":\"100vh\",\"textAlign\":\"center\",\"display\":\"flex\",\"flexDirection\":\"column\",\"alignItems\":\"center\",\"justifyContent\":\"center\"},\"children\":[\"$\",\"div\",null,{\"children\":[[\"$\",\"style\",null,{\"dangerouslySetInnerHTML\":{\"__html\":\"body{color:#000;background:#fff;margin:0}.next-error-h1{border-right:1px solid rgba(0,0,0,.3)}@media (prefers-color-scheme:dark){body{color:#fff;background:#000}.next-error-h1{border-right:1px solid rgba(255,255,255,.3)}}\"}}],[\"$\",\"h1\",null,{\"className\":\"next-error-h1\",\"style\":{\"display\":\"inline-block\",\"margin\":\"0 20px 0 0\",\"padding\":\"0 23px 0 0\",\"fontSize\":24,\"fontWeight\":500,\"verticalAlign\":\"top\",\"lineHeight\":\"49px\"},\"children\":\"404\"}],[\"$\",\"div\",null,{\"style\":{\"display\":\"inline-block\"},\"children\":[\"$\",\"h2\",null,{\"style\":{\"fontSize\":14,\"fontWeight\":400,\"lineHeight\":\"49px\",\"margin\":0},\"children\":\"This page could not be found.\"}]}]]}]}]],\"notFoundStyles\":[]}]}]}]],null],null],\"couldBeIntercepted\":false,\"initialHead\":[null,\"$La\"],\"globalErrorComponent\":\"$b\",\"missingSlots\":\"$Wc\"}]\n"])</script><script>self.__next_f.push([1,"a:[[\"$\",\"meta\",\"0\",{\"name\":\"viewport\",\"content\":\"width=device-width, initial-scale=1\"}],[\"$\",\"meta\",\"1\",{\"charSet\":\"utf-8\"}],[\"$\",\"title\",\"2\",{\"children\":\"LiteLLM Dashboard\"}],[\"$\",\"meta\",\"3\",{\"name\":\"description\",\"content\":\"LiteLLM Proxy Admin UI\"}],[\"$\",\"link\",\"4\",{\"rel\":\"icon\",\"href\":\"/ui/favicon.ico\",\"type\":\"image/x-icon\",\"sizes\":\"16x16\"}],[\"$\",\"meta\",\"5\",{\"name\":\"next-size-adjust\"}]]\n5:null\n"])</script></body></html>
<!DOCTYPE html><html id="__next_error__"><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width, initial-scale=1"/><link rel="preload" as="script" fetchPriority="low" href="/ui/_next/static/chunks/webpack-75a5453f51d60261.js"/><script src="/ui/_next/static/chunks/fd9d1056-524b80e1a6b8bb06.js" async=""></script><script src="/ui/_next/static/chunks/117-883150efc583d711.js" async=""></script><script src="/ui/_next/static/chunks/main-app-4f7318ae681a6d94.js" async=""></script><title>LiteLLM Dashboard</title><meta name="description" content="LiteLLM Proxy Admin UI"/><link rel="icon" href="/ui/favicon.ico" type="image/x-icon" sizes="16x16"/><meta name="next-size-adjust"/><script src="/ui/_next/static/chunks/polyfills-42372ed130431b0a.js" noModule=""></script></head><body><script src="/ui/_next/static/chunks/webpack-75a5453f51d60261.js" async=""></script><script>(self.__next_f=self.__next_f||[]).push([0]);self.__next_f.push([2,null])</script><script>self.__next_f.push([1,"1:HL[\"/ui/_next/static/media/a34f9d1faa5f3315-s.p.woff2\",\"font\",{\"crossOrigin\":\"\",\"type\":\"font/woff2\"}]\n2:HL[\"/ui/_next/static/css/86f6cc749f6b8493.css\",\"style\"]\n3:HL[\"/ui/_next/static/css/1f6915676624c422.css\",\"style\"]\n"])</script><script>self.__next_f.push([1,"4:I[12846,[],\"\"]\n6:I[19107,[],\"ClientPageRoot\"]\n7:I[38411,[\"665\",\"static/chunks/3014691f-0b72c78cfebbd712.js\",\"990\",\"static/chunks/13b76428-ebdf3012af0e4489.js\",\"42\",\"static/chunks/42-1cbed529ecb084e0.js\",\"261\",\"static/chunks/261-57d48f76eec1e568.js\",\"899\",\"static/chunks/899-9af4feaf6f21839c.js\",\"274\",\"static/chunks/274-bddaf0cf6c91e72f.js\",\"250\",\"static/chunks/250-dfc03a6fb4f0d254.js\",\"699\",\"static/chunks/699-87224ecba28f1f48.js\",\"931\",\"static/chunks/app/page-0f46d4a8b9bdf1c0.js\"],\"default\",1]\n8:I[4707,[],\"\"]\n9:I[36423,[],\"\"]\nb:I[61060,[],\"\"]\nc:[]\n"])</script><script>self.__next_f.push([1,"0:[\"$\",\"$L4\",null,{\"buildId\":\"Yb50LG5p7c9QpG54GIoFV\",\"assetPrefix\":\"/ui\",\"urlParts\":[\"\",\"\"],\"initialTree\":[\"\",{\"children\":[\"__PAGE__\",{}]},\"$undefined\",\"$undefined\",true],\"initialSeedData\":[\"\",{\"children\":[\"__PAGE__\",{},[[\"$L5\",[\"$\",\"$L6\",null,{\"props\":{\"params\":{},\"searchParams\":{}},\"Component\":\"$7\"}],null],null],null]},[[[[\"$\",\"link\",\"0\",{\"rel\":\"stylesheet\",\"href\":\"/ui/_next/static/css/86f6cc749f6b8493.css\",\"precedence\":\"next\",\"crossOrigin\":\"$undefined\"}],[\"$\",\"link\",\"1\",{\"rel\":\"stylesheet\",\"href\":\"/ui/_next/static/css/1f6915676624c422.css\",\"precedence\":\"next\",\"crossOrigin\":\"$undefined\"}]],[\"$\",\"html\",null,{\"lang\":\"en\",\"children\":[\"$\",\"body\",null,{\"className\":\"__className_cf7686\",\"children\":[\"$\",\"$L8\",null,{\"parallelRouterKey\":\"children\",\"segmentPath\":[\"children\"],\"error\":\"$undefined\",\"errorStyles\":\"$undefined\",\"errorScripts\":\"$undefined\",\"template\":[\"$\",\"$L9\",null,{}],\"templateStyles\":\"$undefined\",\"templateScripts\":\"$undefined\",\"notFound\":[[\"$\",\"title\",null,{\"children\":\"404: This page could not be found.\"}],[\"$\",\"div\",null,{\"style\":{\"fontFamily\":\"system-ui,\\\"Segoe UI\\\",Roboto,Helvetica,Arial,sans-serif,\\\"Apple Color Emoji\\\",\\\"Segoe UI 
Emoji\\\"\",\"height\":\"100vh\",\"textAlign\":\"center\",\"display\":\"flex\",\"flexDirection\":\"column\",\"alignItems\":\"center\",\"justifyContent\":\"center\"},\"children\":[\"$\",\"div\",null,{\"children\":[[\"$\",\"style\",null,{\"dangerouslySetInnerHTML\":{\"__html\":\"body{color:#000;background:#fff;margin:0}.next-error-h1{border-right:1px solid rgba(0,0,0,.3)}@media (prefers-color-scheme:dark){body{color:#fff;background:#000}.next-error-h1{border-right:1px solid rgba(255,255,255,.3)}}\"}}],[\"$\",\"h1\",null,{\"className\":\"next-error-h1\",\"style\":{\"display\":\"inline-block\",\"margin\":\"0 20px 0 0\",\"padding\":\"0 23px 0 0\",\"fontSize\":24,\"fontWeight\":500,\"verticalAlign\":\"top\",\"lineHeight\":\"49px\"},\"children\":\"404\"}],[\"$\",\"div\",null,{\"style\":{\"display\":\"inline-block\"},\"children\":[\"$\",\"h2\",null,{\"style\":{\"fontSize\":14,\"fontWeight\":400,\"lineHeight\":\"49px\",\"margin\":0},\"children\":\"This page could not be found.\"}]}]]}]}]],\"notFoundStyles\":[]}]}]}]],null],null],\"couldBeIntercepted\":false,\"initialHead\":[null,\"$La\"],\"globalErrorComponent\":\"$b\",\"missingSlots\":\"$Wc\"}]\n"])</script><script>self.__next_f.push([1,"a:[[\"$\",\"meta\",\"0\",{\"name\":\"viewport\",\"content\":\"width=device-width, initial-scale=1\"}],[\"$\",\"meta\",\"1\",{\"charSet\":\"utf-8\"}],[\"$\",\"title\",\"2\",{\"children\":\"LiteLLM Dashboard\"}],[\"$\",\"meta\",\"3\",{\"name\":\"description\",\"content\":\"LiteLLM Proxy Admin UI\"}],[\"$\",\"link\",\"4\",{\"rel\":\"icon\",\"href\":\"/ui/favicon.ico\",\"type\":\"image/x-icon\",\"sizes\":\"16x16\"}],[\"$\",\"meta\",\"5\",{\"name\":\"next-size-adjust\"}]]\n5:null\n"])</script></body></html>

View file

@ -1,7 +1,7 @@
2:I[19107,[],"ClientPageRoot"]
3:I[20314,["665","static/chunks/3014691f-0b72c78cfebbd712.js","990","static/chunks/13b76428-ebdf3012af0e4489.js","42","static/chunks/42-1cbed529ecb084e0.js","261","static/chunks/261-57d48f76eec1e568.js","899","static/chunks/899-9af4feaf6f21839c.js","394","static/chunks/394-48a36e9c9b2cb488.js","250","static/chunks/250-601568e45a5ffece.js","699","static/chunks/699-2a1c30f260f44c15.js","931","static/chunks/app/page-e21d4be3d6c3c16e.js"],"default",1]
3:I[38411,["665","static/chunks/3014691f-0b72c78cfebbd712.js","990","static/chunks/13b76428-ebdf3012af0e4489.js","42","static/chunks/42-1cbed529ecb084e0.js","261","static/chunks/261-57d48f76eec1e568.js","899","static/chunks/899-9af4feaf6f21839c.js","274","static/chunks/274-bddaf0cf6c91e72f.js","250","static/chunks/250-dfc03a6fb4f0d254.js","699","static/chunks/699-87224ecba28f1f48.js","931","static/chunks/app/page-0f46d4a8b9bdf1c0.js"],"default",1]
4:I[4707,[],""]
5:I[36423,[],""]
0:["soi--ciJeUE6G2Fk4NMBG",[[["",{"children":["__PAGE__",{}]},"$undefined","$undefined",true],["",{"children":["__PAGE__",{},[["$L1",["$","$L2",null,{"props":{"params":{},"searchParams":{}},"Component":"$3"}],null],null],null]},[[[["$","link","0",{"rel":"stylesheet","href":"/ui/_next/static/css/86f6cc749f6b8493.css","precedence":"next","crossOrigin":"$undefined"}],["$","link","1",{"rel":"stylesheet","href":"/ui/_next/static/css/169f9187db1ec37e.css","precedence":"next","crossOrigin":"$undefined"}]],["$","html",null,{"lang":"en","children":["$","body",null,{"className":"__className_cf7686","children":["$","$L4",null,{"parallelRouterKey":"children","segmentPath":["children"],"error":"$undefined","errorStyles":"$undefined","errorScripts":"$undefined","template":["$","$L5",null,{}],"templateStyles":"$undefined","templateScripts":"$undefined","notFound":[["$","title",null,{"children":"404: This page could not be found."}],["$","div",null,{"style":{"fontFamily":"system-ui,\"Segoe UI\",Roboto,Helvetica,Arial,sans-serif,\"Apple Color Emoji\",\"Segoe UI Emoji\"","height":"100vh","textAlign":"center","display":"flex","flexDirection":"column","alignItems":"center","justifyContent":"center"},"children":["$","div",null,{"children":[["$","style",null,{"dangerouslySetInnerHTML":{"__html":"body{color:#000;background:#fff;margin:0}.next-error-h1{border-right:1px solid rgba(0,0,0,.3)}@media (prefers-color-scheme:dark){body{color:#fff;background:#000}.next-error-h1{border-right:1px solid rgba(255,255,255,.3)}}"}}],["$","h1",null,{"className":"next-error-h1","style":{"display":"inline-block","margin":"0 20px 0 0","padding":"0 23px 0 0","fontSize":24,"fontWeight":500,"verticalAlign":"top","lineHeight":"49px"},"children":"404"}],["$","div",null,{"style":{"display":"inline-block"},"children":["$","h2",null,{"style":{"fontSize":14,"fontWeight":400,"lineHeight":"49px","margin":0},"children":"This page could not be found."}]}]]}]}]],"notFoundStyles":[]}]}]}]],null],null],["$L6",null]]]]
0:["Yb50LG5p7c9QpG54GIoFV",[[["",{"children":["__PAGE__",{}]},"$undefined","$undefined",true],["",{"children":["__PAGE__",{},[["$L1",["$","$L2",null,{"props":{"params":{},"searchParams":{}},"Component":"$3"}],null],null],null]},[[[["$","link","0",{"rel":"stylesheet","href":"/ui/_next/static/css/86f6cc749f6b8493.css","precedence":"next","crossOrigin":"$undefined"}],["$","link","1",{"rel":"stylesheet","href":"/ui/_next/static/css/1f6915676624c422.css","precedence":"next","crossOrigin":"$undefined"}]],["$","html",null,{"lang":"en","children":["$","body",null,{"className":"__className_cf7686","children":["$","$L4",null,{"parallelRouterKey":"children","segmentPath":["children"],"error":"$undefined","errorStyles":"$undefined","errorScripts":"$undefined","template":["$","$L5",null,{}],"templateStyles":"$undefined","templateScripts":"$undefined","notFound":[["$","title",null,{"children":"404: This page could not be found."}],["$","div",null,{"style":{"fontFamily":"system-ui,\"Segoe UI\",Roboto,Helvetica,Arial,sans-serif,\"Apple Color Emoji\",\"Segoe UI Emoji\"","height":"100vh","textAlign":"center","display":"flex","flexDirection":"column","alignItems":"center","justifyContent":"center"},"children":["$","div",null,{"children":[["$","style",null,{"dangerouslySetInnerHTML":{"__html":"body{color:#000;background:#fff;margin:0}.next-error-h1{border-right:1px solid rgba(0,0,0,.3)}@media (prefers-color-scheme:dark){body{color:#fff;background:#000}.next-error-h1{border-right:1px solid rgba(255,255,255,.3)}}"}}],["$","h1",null,{"className":"next-error-h1","style":{"display":"inline-block","margin":"0 20px 0 0","padding":"0 23px 0 0","fontSize":24,"fontWeight":500,"verticalAlign":"top","lineHeight":"49px"},"children":"404"}],["$","div",null,{"style":{"display":"inline-block"},"children":["$","h2",null,{"style":{"fontSize":14,"fontWeight":400,"lineHeight":"49px","margin":0},"children":"This page could not be found."}]}]]}]}]],"notFoundStyles":[]}]}]}]],null],null],["$L6",null]]]]
6:[["$","meta","0",{"name":"viewport","content":"width=device-width, initial-scale=1"}],["$","meta","1",{"charSet":"utf-8"}],["$","title","2",{"children":"LiteLLM Dashboard"}],["$","meta","3",{"name":"description","content":"LiteLLM Proxy Admin UI"}],["$","link","4",{"rel":"icon","href":"/ui/favicon.ico","type":"image/x-icon","sizes":"16x16"}],["$","meta","5",{"name":"next-size-adjust"}]]
1:null

View file

@ -1,7 +1,7 @@
2:I[19107,[],"ClientPageRoot"]
3:I[52829,["42","static/chunks/42-1cbed529ecb084e0.js","261","static/chunks/261-57d48f76eec1e568.js","250","static/chunks/250-601568e45a5ffece.js","699","static/chunks/699-2a1c30f260f44c15.js","418","static/chunks/app/model_hub/page-cde2fb783e81a6c1.js"],"default",1]
3:I[52829,["42","static/chunks/42-1cbed529ecb084e0.js","261","static/chunks/261-57d48f76eec1e568.js","250","static/chunks/250-dfc03a6fb4f0d254.js","699","static/chunks/699-87224ecba28f1f48.js","418","static/chunks/app/model_hub/page-cde2fb783e81a6c1.js"],"default",1]
4:I[4707,[],""]
5:I[36423,[],""]
0:["soi--ciJeUE6G2Fk4NMBG",[[["",{"children":["model_hub",{"children":["__PAGE__",{}]}]},"$undefined","$undefined",true],["",{"children":["model_hub",{"children":["__PAGE__",{},[["$L1",["$","$L2",null,{"props":{"params":{},"searchParams":{}},"Component":"$3"}],null],null],null]},[null,["$","$L4",null,{"parallelRouterKey":"children","segmentPath":["children","model_hub","children"],"error":"$undefined","errorStyles":"$undefined","errorScripts":"$undefined","template":["$","$L5",null,{}],"templateStyles":"$undefined","templateScripts":"$undefined","notFound":"$undefined","notFoundStyles":"$undefined"}]],null]},[[[["$","link","0",{"rel":"stylesheet","href":"/ui/_next/static/css/86f6cc749f6b8493.css","precedence":"next","crossOrigin":"$undefined"}],["$","link","1",{"rel":"stylesheet","href":"/ui/_next/static/css/169f9187db1ec37e.css","precedence":"next","crossOrigin":"$undefined"}]],["$","html",null,{"lang":"en","children":["$","body",null,{"className":"__className_cf7686","children":["$","$L4",null,{"parallelRouterKey":"children","segmentPath":["children"],"error":"$undefined","errorStyles":"$undefined","errorScripts":"$undefined","template":["$","$L5",null,{}],"templateStyles":"$undefined","templateScripts":"$undefined","notFound":[["$","title",null,{"children":"404: This page could not be found."}],["$","div",null,{"style":{"fontFamily":"system-ui,\"Segoe UI\",Roboto,Helvetica,Arial,sans-serif,\"Apple Color Emoji\",\"Segoe UI Emoji\"","height":"100vh","textAlign":"center","display":"flex","flexDirection":"column","alignItems":"center","justifyContent":"center"},"children":["$","div",null,{"children":[["$","style",null,{"dangerouslySetInnerHTML":{"__html":"body{color:#000;background:#fff;margin:0}.next-error-h1{border-right:1px solid rgba(0,0,0,.3)}@media (prefers-color-scheme:dark){body{color:#fff;background:#000}.next-error-h1{border-right:1px solid rgba(255,255,255,.3)}}"}}],["$","h1",null,{"className":"next-error-h1","style":{"display":"inline-block","margin":"0 20px 0 0","padding":"0 23px 0 0","fontSize":24,"fontWeight":500,"verticalAlign":"top","lineHeight":"49px"},"children":"404"}],["$","div",null,{"style":{"display":"inline-block"},"children":["$","h2",null,{"style":{"fontSize":14,"fontWeight":400,"lineHeight":"49px","margin":0},"children":"This page could not be found."}]}]]}]}]],"notFoundStyles":[]}]}]}]],null],null],["$L6",null]]]]
0:["Yb50LG5p7c9QpG54GIoFV",[[["",{"children":["model_hub",{"children":["__PAGE__",{}]}]},"$undefined","$undefined",true],["",{"children":["model_hub",{"children":["__PAGE__",{},[["$L1",["$","$L2",null,{"props":{"params":{},"searchParams":{}},"Component":"$3"}],null],null],null]},[null,["$","$L4",null,{"parallelRouterKey":"children","segmentPath":["children","model_hub","children"],"error":"$undefined","errorStyles":"$undefined","errorScripts":"$undefined","template":["$","$L5",null,{}],"templateStyles":"$undefined","templateScripts":"$undefined","notFound":"$undefined","notFoundStyles":"$undefined"}]],null]},[[[["$","link","0",{"rel":"stylesheet","href":"/ui/_next/static/css/86f6cc749f6b8493.css","precedence":"next","crossOrigin":"$undefined"}],["$","link","1",{"rel":"stylesheet","href":"/ui/_next/static/css/1f6915676624c422.css","precedence":"next","crossOrigin":"$undefined"}]],["$","html",null,{"lang":"en","children":["$","body",null,{"className":"__className_cf7686","children":["$","$L4",null,{"parallelRouterKey":"children","segmentPath":["children"],"error":"$undefined","errorStyles":"$undefined","errorScripts":"$undefined","template":["$","$L5",null,{}],"templateStyles":"$undefined","templateScripts":"$undefined","notFound":[["$","title",null,{"children":"404: This page could not be found."}],["$","div",null,{"style":{"fontFamily":"system-ui,\"Segoe UI\",Roboto,Helvetica,Arial,sans-serif,\"Apple Color Emoji\",\"Segoe UI Emoji\"","height":"100vh","textAlign":"center","display":"flex","flexDirection":"column","alignItems":"center","justifyContent":"center"},"children":["$","div",null,{"children":[["$","style",null,{"dangerouslySetInnerHTML":{"__html":"body{color:#000;background:#fff;margin:0}.next-error-h1{border-right:1px solid rgba(0,0,0,.3)}@media (prefers-color-scheme:dark){body{color:#fff;background:#000}.next-error-h1{border-right:1px solid rgba(255,255,255,.3)}}"}}],["$","h1",null,{"className":"next-error-h1","style":{"display":"inline-block","margin":"0 20px 0 0","padding":"0 23px 0 0","fontSize":24,"fontWeight":500,"verticalAlign":"top","lineHeight":"49px"},"children":"404"}],["$","div",null,{"style":{"display":"inline-block"},"children":["$","h2",null,{"style":{"fontSize":14,"fontWeight":400,"lineHeight":"49px","margin":0},"children":"This page could not be found."}]}]]}]}]],"notFoundStyles":[]}]}]}]],null],null],["$L6",null]]]]
6:[["$","meta","0",{"name":"viewport","content":"width=device-width, initial-scale=1"}],["$","meta","1",{"charSet":"utf-8"}],["$","title","2",{"children":"LiteLLM Dashboard"}],["$","meta","3",{"name":"description","content":"LiteLLM Proxy Admin UI"}],["$","link","4",{"rel":"icon","href":"/ui/favicon.ico","type":"image/x-icon","sizes":"16x16"}],["$","meta","5",{"name":"next-size-adjust"}]]
1:null

View file

@ -1,7 +1,7 @@
2:I[19107,[],"ClientPageRoot"]
3:I[12011,["665","static/chunks/3014691f-0b72c78cfebbd712.js","42","static/chunks/42-1cbed529ecb084e0.js","899","static/chunks/899-9af4feaf6f21839c.js","250","static/chunks/250-601568e45a5ffece.js","461","static/chunks/app/onboarding/page-5110f2c6a3c9a2f4.js"],"default",1]
3:I[12011,["665","static/chunks/3014691f-0b72c78cfebbd712.js","42","static/chunks/42-1cbed529ecb084e0.js","899","static/chunks/899-9af4feaf6f21839c.js","250","static/chunks/250-dfc03a6fb4f0d254.js","461","static/chunks/app/onboarding/page-2bf7a26db5342dbf.js"],"default",1]
4:I[4707,[],""]
5:I[36423,[],""]
0:["soi--ciJeUE6G2Fk4NMBG",[[["",{"children":["onboarding",{"children":["__PAGE__",{}]}]},"$undefined","$undefined",true],["",{"children":["onboarding",{"children":["__PAGE__",{},[["$L1",["$","$L2",null,{"props":{"params":{},"searchParams":{}},"Component":"$3"}],null],null],null]},[null,["$","$L4",null,{"parallelRouterKey":"children","segmentPath":["children","onboarding","children"],"error":"$undefined","errorStyles":"$undefined","errorScripts":"$undefined","template":["$","$L5",null,{}],"templateStyles":"$undefined","templateScripts":"$undefined","notFound":"$undefined","notFoundStyles":"$undefined"}]],null]},[[[["$","link","0",{"rel":"stylesheet","href":"/ui/_next/static/css/86f6cc749f6b8493.css","precedence":"next","crossOrigin":"$undefined"}],["$","link","1",{"rel":"stylesheet","href":"/ui/_next/static/css/169f9187db1ec37e.css","precedence":"next","crossOrigin":"$undefined"}]],["$","html",null,{"lang":"en","children":["$","body",null,{"className":"__className_cf7686","children":["$","$L4",null,{"parallelRouterKey":"children","segmentPath":["children"],"error":"$undefined","errorStyles":"$undefined","errorScripts":"$undefined","template":["$","$L5",null,{}],"templateStyles":"$undefined","templateScripts":"$undefined","notFound":[["$","title",null,{"children":"404: This page could not be found."}],["$","div",null,{"style":{"fontFamily":"system-ui,\"Segoe UI\",Roboto,Helvetica,Arial,sans-serif,\"Apple Color Emoji\",\"Segoe UI Emoji\"","height":"100vh","textAlign":"center","display":"flex","flexDirection":"column","alignItems":"center","justifyContent":"center"},"children":["$","div",null,{"children":[["$","style",null,{"dangerouslySetInnerHTML":{"__html":"body{color:#000;background:#fff;margin:0}.next-error-h1{border-right:1px solid rgba(0,0,0,.3)}@media (prefers-color-scheme:dark){body{color:#fff;background:#000}.next-error-h1{border-right:1px solid rgba(255,255,255,.3)}}"}}],["$","h1",null,{"className":"next-error-h1","style":{"display":"inline-block","margin":"0 20px 0 0","padding":"0 23px 0 0","fontSize":24,"fontWeight":500,"verticalAlign":"top","lineHeight":"49px"},"children":"404"}],["$","div",null,{"style":{"display":"inline-block"},"children":["$","h2",null,{"style":{"fontSize":14,"fontWeight":400,"lineHeight":"49px","margin":0},"children":"This page could not be found."}]}]]}]}]],"notFoundStyles":[]}]}]}]],null],null],["$L6",null]]]]
0:["Yb50LG5p7c9QpG54GIoFV",[[["",{"children":["onboarding",{"children":["__PAGE__",{}]}]},"$undefined","$undefined",true],["",{"children":["onboarding",{"children":["__PAGE__",{},[["$L1",["$","$L2",null,{"props":{"params":{},"searchParams":{}},"Component":"$3"}],null],null],null]},[null,["$","$L4",null,{"parallelRouterKey":"children","segmentPath":["children","onboarding","children"],"error":"$undefined","errorStyles":"$undefined","errorScripts":"$undefined","template":["$","$L5",null,{}],"templateStyles":"$undefined","templateScripts":"$undefined","notFound":"$undefined","notFoundStyles":"$undefined"}]],null]},[[[["$","link","0",{"rel":"stylesheet","href":"/ui/_next/static/css/86f6cc749f6b8493.css","precedence":"next","crossOrigin":"$undefined"}],["$","link","1",{"rel":"stylesheet","href":"/ui/_next/static/css/1f6915676624c422.css","precedence":"next","crossOrigin":"$undefined"}]],["$","html",null,{"lang":"en","children":["$","body",null,{"className":"__className_cf7686","children":["$","$L4",null,{"parallelRouterKey":"children","segmentPath":["children"],"error":"$undefined","errorStyles":"$undefined","errorScripts":"$undefined","template":["$","$L5",null,{}],"templateStyles":"$undefined","templateScripts":"$undefined","notFound":[["$","title",null,{"children":"404: This page could not be found."}],["$","div",null,{"style":{"fontFamily":"system-ui,\"Segoe UI\",Roboto,Helvetica,Arial,sans-serif,\"Apple Color Emoji\",\"Segoe UI Emoji\"","height":"100vh","textAlign":"center","display":"flex","flexDirection":"column","alignItems":"center","justifyContent":"center"},"children":["$","div",null,{"children":[["$","style",null,{"dangerouslySetInnerHTML":{"__html":"body{color:#000;background:#fff;margin:0}.next-error-h1{border-right:1px solid rgba(0,0,0,.3)}@media (prefers-color-scheme:dark){body{color:#fff;background:#000}.next-error-h1{border-right:1px solid rgba(255,255,255,.3)}}"}}],["$","h1",null,{"className":"next-error-h1","style":{"display":"inline-block","margin":"0 20px 0 0","padding":"0 23px 0 0","fontSize":24,"fontWeight":500,"verticalAlign":"top","lineHeight":"49px"},"children":"404"}],["$","div",null,{"style":{"display":"inline-block"},"children":["$","h2",null,{"style":{"fontSize":14,"fontWeight":400,"lineHeight":"49px","margin":0},"children":"This page could not be found."}]}]]}]}]],"notFoundStyles":[]}]}]}]],null],null],["$L6",null]]]]
6:[["$","meta","0",{"name":"viewport","content":"width=device-width, initial-scale=1"}],["$","meta","1",{"charSet":"utf-8"}],["$","title","2",{"children":"LiteLLM Dashboard"}],["$","meta","3",{"name":"description","content":"LiteLLM Proxy Admin UI"}],["$","link","4",{"rel":"icon","href":"/ui/favicon.ico","type":"image/x-icon","sizes":"16x16"}],["$","meta","5",{"name":"next-size-adjust"}]]
1:null

View file

@ -1,33 +1,39 @@
model_list:
- model_name: "gpt-4o"
litellm_params:
model: azure/chatgpt-v-2
api_key: os.environ/AZURE_API_KEY
api_base: http://0.0.0.0:8090
rpm: 3
- model_name: "gpt-4o-mini-openai"
litellm_params:
model: gpt-4o-mini
api_key: os.environ/OPENAI_API_KEY
- model_name: "openai/*"
litellm_params:
model: openai/*
api_key: os.environ/OPENAI_API_KEY
- model_name: "bedrock-nova"
litellm_params:
model: us.amazon.nova-pro-v1:0
- model_name: "gemini-2.0-flash"
litellm_params:
model: gemini/gemini-2.0-flash
api_key: os.environ/GEMINI_API_KEY
- model_name: "gpt-4o"
litellm_params:
model: azure/chatgpt-v-2
api_key: os.environ/AZURE_API_KEY
api_base: http://0.0.0.0:8090
rpm: 3
- model_name: "gpt-4o-mini-openai"
litellm_params:
model: gpt-4o-mini
api_key: os.environ/OPENAI_API_KEY
- model_name: "openai/*"
litellm_params:
model: openai/*
api_key: os.environ/OPENAI_API_KEY
- model_name: "bedrock-nova"
litellm_params:
model: us.amazon.nova-pro-v1:0
- model_name: "gemini-2.0-flash"
litellm_params:
model: gemini/gemini-2.0-flash
api_key: os.environ/GEMINI_API_KEY
- model_name: openrouter_model
litellm_params:
model: openrouter/openrouter_model
api_key: os.environ/OPENROUTER_API_KEY
api_base: http://0.0.0.0:8090
litellm_settings:
num_retries: 0
callbacks: ["prometheus"]
# json_logs: true
# router_settings:
# routing_strategy: usage-based-routing-v2 # 👈 KEY CHANGE
# redis_host: os.environ/REDIS_HOST
# redis_password: os.environ/REDIS_PASSWORD
# redis_port: os.environ/REDIS_PORT
router_settings:
routing_strategy: usage-based-routing-v2 # 👈 KEY CHANGE
redis_host: os.environ/REDIS_HOST
redis_password: os.environ/REDIS_PASSWORD
redis_port: os.environ/REDIS_PORT

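A hedged sketch of the same routing setup expressed through the Python Router SDK, with placeholder Redis credentials and a model_list trimmed to one entry; the parameter names follow the documented Router constructor.

import os
from litellm import Router

router = Router(
    model_list=[
        {
            "model_name": "gpt-4o-mini-openai",
            "litellm_params": {
                "model": "gpt-4o-mini",
                "api_key": os.environ.get("OPENAI_API_KEY"),
            },
        }
    ],
    routing_strategy="usage-based-routing-v2",  # matches the router_settings above
    redis_host=os.environ.get("REDIS_HOST"),
    redis_port=int(os.environ.get("REDIS_PORT", "6379")),
    redis_password=os.environ.get("REDIS_PASSWORD"),
    num_retries=0,
)
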
View file

@ -432,6 +432,7 @@ class LiteLLMRoutes(enum.Enum):
"/model/new",
"/model/update",
"/model/delete",
"/user/daily/activity",
] # routes that manage their own allowed/disallowed logic
## Org Admin Routes ##
@ -2736,6 +2737,8 @@ class DailyUserSpendTransaction(TypedDict):
completion_tokens: int
spend: float
api_requests: int
successful_requests: int
failed_requests: int
class DBSpendUpdateTransactions(TypedDict):
@ -2749,3 +2752,9 @@ class DBSpendUpdateTransactions(TypedDict):
team_list_transactions: Optional[Dict[str, float]]
team_member_list_transactions: Optional[Dict[str, float]]
org_list_transactions: Optional[Dict[str, float]]
class SpendUpdateQueueItem(TypedDict, total=False):
entity_type: Litellm_EntityType
entity_id: str
response_cost: Optional[float]

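A hedged, self-contained sketch of the new record shapes. The TypedDicts below are local mirrors trimmed for illustration; the canonical definitions (including model_group and custom_llm_provider) live in litellm/proxy/_types.py, and entity_type is really a Litellm_EntityType enum.

from typing import Optional, TypedDict


class SpendUpdateQueueItem(TypedDict, total=False):
    entity_type: str  # a Litellm_EntityType value in LiteLLM, e.g. "key"
    entity_id: str
    response_cost: Optional[float]


class DailyUserSpendTransaction(TypedDict):
    user_id: str
    date: str
    api_key: str
    model: str
    prompt_tokens: int
    completion_tokens: int
    spend: float
    api_requests: int
    successful_requests: int  # new field in this change
    failed_requests: int      # new field in this change


update = SpendUpdateQueueItem(
    entity_type="key", entity_id="hashed-token-abc", response_cost=0.0021
)
print(update)
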
View file

@ -1,53 +0,0 @@
"""
Checks for LiteLLM service account keys
"""
from litellm.proxy._types import ProxyErrorTypes, ProxyException, UserAPIKeyAuth
def check_if_token_is_service_account(valid_token: UserAPIKeyAuth) -> bool:
"""
Checks if the token is a service account
Returns:
bool: True if token is a service account
"""
if valid_token.metadata:
if "service_account_id" in valid_token.metadata:
return True
return False
async def service_account_checks(
valid_token: UserAPIKeyAuth, request_data: dict
) -> bool:
"""
If a virtual key is a service account, checks it's a valid service account
A token is a service account if it has a service_account_id in its metadata
Service Account Specific Checks:
- Check if required_params is set
"""
if check_if_token_is_service_account(valid_token) is not True:
return True
from litellm.proxy.proxy_server import general_settings
if "service_account_settings" in general_settings:
service_account_settings = general_settings["service_account_settings"]
if "enforced_params" in service_account_settings:
_enforced_params = service_account_settings["enforced_params"]
for param in _enforced_params:
if param not in request_data:
raise ProxyException(
type=ProxyErrorTypes.bad_request_error.value,
code=400,
param=param,
message=f"BadRequest please pass param={param} in request body. This is a required param for service account",
)
return True

View file

@ -49,7 +49,6 @@ from litellm.proxy.auth.auth_utils import (
from litellm.proxy.auth.handle_jwt import JWTAuthManager, JWTHandler
from litellm.proxy.auth.oauth2_check import check_oauth2_token
from litellm.proxy.auth.oauth2_proxy_hook import handle_oauth2_proxy_request
from litellm.proxy.auth.service_account_checks import service_account_checks
from litellm.proxy.common_utils.http_parsing_utils import _read_request_body
from litellm.proxy.utils import PrismaClient, ProxyLogging
from litellm.types.services import ServiceTypes
@ -905,12 +904,6 @@ async def _user_api_key_auth_builder( # noqa: PLR0915
else:
_team_obj = None
# Check 7: Check if key is a service account key
await service_account_checks(
valid_token=valid_token,
request_data=request_data,
)
user_api_key_cache.set_cache(
key=valid_token.team_id, value=_team_obj
) # save team table in cache - used for tpm/rpm limiting - tpm_rpm_limiter.py

View file

@ -123,6 +123,7 @@ class ProxyBaseLLMRequestProcessing:
"""
Common request processing logic for both chat completions and responses API endpoints
"""
verbose_proxy_logger.debug(
"Request received by LiteLLM:\n{}".format(json.dumps(self.data, indent=4)),
)

View file

@ -81,8 +81,13 @@ async def _read_request_body(request: Optional[Request]) -> Dict:
def _safe_get_request_parsed_body(request: Optional[Request]) -> Optional[dict]:
if request is None:
return None
if hasattr(request, "scope") and "parsed_body" in request.scope:
return request.scope["parsed_body"]
if (
hasattr(request, "scope")
and "parsed_body" in request.scope
and isinstance(request.scope["parsed_body"], tuple)
):
accepted_keys, parsed_body = request.scope["parsed_body"]
return {key: parsed_body[key] for key in accepted_keys}
return None
@ -93,7 +98,7 @@ def _safe_set_request_parsed_body(
try:
if request is None:
return
request.scope["parsed_body"] = parsed_body
request.scope["parsed_body"] = (tuple(parsed_body.keys()), parsed_body)
except Exception as e:
verbose_proxy_logger.debug(
"Unexpected error setting request parsed body - {}".format(e)

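A hedged sketch of the new caching round-trip, with a plain dict standing in for starlette's request.scope: the parsed body is stored as (accepted keys, dict), and reads rebuild the body from only those keys.

from typing import Optional

scope: dict = {}  # stands in for request.scope


def set_parsed_body(parsed_body: dict) -> None:
    scope["parsed_body"] = (tuple(parsed_body.keys()), parsed_body)


def get_parsed_body() -> Optional[dict]:
    if "parsed_body" in scope and isinstance(scope["parsed_body"], tuple):
        accepted_keys, parsed_body = scope["parsed_body"]
        return {key: parsed_body[key] for key in accepted_keys}
    return None


set_parsed_body({"model": "gpt-4o", "messages": [{"role": "user", "content": "hi"}]})
print(get_parsed_body())  # only the keys captured at set-time are returned
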
View file

@ -10,7 +10,7 @@ import os
import time
import traceback
from datetime import datetime, timedelta
from typing import TYPE_CHECKING, Any, Optional, Union
from typing import TYPE_CHECKING, Any, Dict, Optional, Union
import litellm
from litellm._logging import verbose_proxy_logger
@ -18,13 +18,19 @@ from litellm.caching import DualCache, RedisCache
from litellm.constants import DB_SPEND_UPDATE_JOB_NAME
from litellm.proxy._types import (
DB_CONNECTION_ERROR_TYPES,
DailyUserSpendTransaction,
DBSpendUpdateTransactions,
Litellm_EntityType,
LiteLLM_UserTable,
SpendLogsPayload,
SpendUpdateQueueItem,
)
from litellm.proxy.db.pod_lock_manager import PodLockManager
from litellm.proxy.db.redis_update_buffer import RedisUpdateBuffer
from litellm.proxy.db.db_transaction_queue.daily_spend_update_queue import (
DailySpendUpdateQueue,
)
from litellm.proxy.db.db_transaction_queue.pod_lock_manager import PodLockManager
from litellm.proxy.db.db_transaction_queue.redis_update_buffer import RedisUpdateBuffer
from litellm.proxy.db.db_transaction_queue.spend_update_queue import SpendUpdateQueue
if TYPE_CHECKING:
from litellm.proxy.utils import PrismaClient, ProxyLogging
@ -48,10 +54,12 @@ class DBSpendUpdateWriter:
self.redis_cache = redis_cache
self.redis_update_buffer = RedisUpdateBuffer(redis_cache=self.redis_cache)
self.pod_lock_manager = PodLockManager(cronjob_id=DB_SPEND_UPDATE_JOB_NAME)
self.spend_update_queue = SpendUpdateQueue()
self.daily_spend_update_queue = DailySpendUpdateQueue()
@staticmethod
async def update_database(
# LiteLLM management object fields
self,
token: Optional[str],
user_id: Optional[str],
end_user_id: Optional[str],
@ -84,7 +92,7 @@ class DBSpendUpdateWriter:
hashed_token = token
asyncio.create_task(
DBSpendUpdateWriter._update_user_db(
self._update_user_db(
response_cost=response_cost,
user_id=user_id,
prisma_client=prisma_client,
@ -94,14 +102,14 @@ class DBSpendUpdateWriter:
)
)
asyncio.create_task(
DBSpendUpdateWriter._update_key_db(
self._update_key_db(
response_cost=response_cost,
hashed_token=hashed_token,
prisma_client=prisma_client,
)
)
asyncio.create_task(
DBSpendUpdateWriter._update_team_db(
self._update_team_db(
response_cost=response_cost,
team_id=team_id,
user_id=user_id,
@ -109,14 +117,14 @@ class DBSpendUpdateWriter:
)
)
asyncio.create_task(
DBSpendUpdateWriter._update_org_db(
self._update_org_db(
response_cost=response_cost,
org_id=org_id,
prisma_client=prisma_client,
)
)
if disable_spend_logs is False:
await DBSpendUpdateWriter._insert_spend_log_to_db(
await self._insert_spend_log_to_db(
kwargs=kwargs,
completion_response=completion_response,
start_time=start_time,
@ -135,56 +143,8 @@ class DBSpendUpdateWriter:
f"Error updating Prisma database: {traceback.format_exc()}"
)
@staticmethod
async def _update_transaction_list(
response_cost: Optional[float],
entity_id: Optional[str],
transaction_list: dict,
entity_type: Litellm_EntityType,
debug_msg: Optional[str] = None,
prisma_client: Optional[PrismaClient] = None,
) -> bool:
"""
Common helper method to update a transaction list for an entity
Args:
response_cost: The cost to add
entity_id: The ID of the entity to update
transaction_list: The transaction list dictionary to update
entity_type: The type of entity (from EntityType enum)
debug_msg: Optional custom debug message
Returns:
bool: True if update happened, False otherwise
"""
try:
if debug_msg:
verbose_proxy_logger.debug(debug_msg)
else:
verbose_proxy_logger.debug(
f"adding spend to {entity_type.value} db. Response cost: {response_cost}. {entity_type.value}_id: {entity_id}."
)
if prisma_client is None:
return False
if entity_id is None:
verbose_proxy_logger.debug(
f"track_cost_callback: {entity_type.value}_id is None. Not tracking spend for {entity_type.value}"
)
return False
transaction_list[entity_id] = response_cost + transaction_list.get(
entity_id, 0
)
return True
except Exception as e:
verbose_proxy_logger.info(
f"Update {entity_type.value.capitalize()} DB failed to execute - {str(e)}\n{traceback.format_exc()}"
)
raise e
@staticmethod
async def _update_key_db(
self,
response_cost: Optional[float],
hashed_token: Optional[str],
prisma_client: Optional[PrismaClient],
@ -193,13 +153,12 @@ class DBSpendUpdateWriter:
if hashed_token is None or prisma_client is None:
return
await DBSpendUpdateWriter._update_transaction_list(
response_cost=response_cost,
entity_id=hashed_token,
transaction_list=prisma_client.key_list_transactions,
entity_type=Litellm_EntityType.KEY,
debug_msg=f"adding spend to key db. Response cost: {response_cost}. Token: {hashed_token}.",
prisma_client=prisma_client,
await self.spend_update_queue.add_update(
update=SpendUpdateQueueItem(
entity_type=Litellm_EntityType.KEY,
entity_id=hashed_token,
response_cost=response_cost,
)
)
except Exception as e:
verbose_proxy_logger.exception(
@ -207,8 +166,8 @@ class DBSpendUpdateWriter:
)
raise e
@staticmethod
async def _update_user_db(
self,
response_cost: Optional[float],
user_id: Optional[str],
prisma_client: Optional[PrismaClient],
@ -234,21 +193,21 @@ class DBSpendUpdateWriter:
for _id in user_ids:
if _id is not None:
await DBSpendUpdateWriter._update_transaction_list(
response_cost=response_cost,
entity_id=_id,
transaction_list=prisma_client.user_list_transactions,
entity_type=Litellm_EntityType.USER,
prisma_client=prisma_client,
await self.spend_update_queue.add_update(
update=SpendUpdateQueueItem(
entity_type=Litellm_EntityType.USER,
entity_id=_id,
response_cost=response_cost,
)
)
if end_user_id is not None:
await DBSpendUpdateWriter._update_transaction_list(
response_cost=response_cost,
entity_id=end_user_id,
transaction_list=prisma_client.end_user_list_transactions,
entity_type=Litellm_EntityType.END_USER,
prisma_client=prisma_client,
await self.spend_update_queue.add_update(
update=SpendUpdateQueueItem(
entity_type=Litellm_EntityType.END_USER,
entity_id=end_user_id,
response_cost=response_cost,
)
)
except Exception as e:
verbose_proxy_logger.info(
@ -256,8 +215,8 @@ class DBSpendUpdateWriter:
+ f"Update User DB call failed to execute {str(e)}\n{traceback.format_exc()}"
)
@staticmethod
async def _update_team_db(
self,
response_cost: Optional[float],
team_id: Optional[str],
user_id: Optional[str],
@ -270,12 +229,12 @@ class DBSpendUpdateWriter:
)
return
await DBSpendUpdateWriter._update_transaction_list(
response_cost=response_cost,
entity_id=team_id,
transaction_list=prisma_client.team_list_transactions,
entity_type=Litellm_EntityType.TEAM,
prisma_client=prisma_client,
await self.spend_update_queue.add_update(
update=SpendUpdateQueueItem(
entity_type=Litellm_EntityType.TEAM,
entity_id=team_id,
response_cost=response_cost,
)
)
try:
@ -283,12 +242,12 @@ class DBSpendUpdateWriter:
if user_id is not None:
# key is "team_id::<value>::user_id::<value>"
team_member_key = f"team_id::{team_id}::user_id::{user_id}"
await DBSpendUpdateWriter._update_transaction_list(
response_cost=response_cost,
entity_id=team_member_key,
transaction_list=prisma_client.team_member_list_transactions,
entity_type=Litellm_EntityType.TEAM_MEMBER,
prisma_client=prisma_client,
await self.spend_update_queue.add_update(
update=SpendUpdateQueueItem(
entity_type=Litellm_EntityType.TEAM_MEMBER,
entity_id=team_member_key,
response_cost=response_cost,
)
)
except Exception:
pass
@ -298,8 +257,8 @@ class DBSpendUpdateWriter:
)
raise e
@staticmethod
async def _update_org_db(
self,
response_cost: Optional[float],
org_id: Optional[str],
prisma_client: Optional[PrismaClient],
@ -311,12 +270,12 @@ class DBSpendUpdateWriter:
)
return
await DBSpendUpdateWriter._update_transaction_list(
response_cost=response_cost,
entity_id=org_id,
transaction_list=prisma_client.org_list_transactions,
entity_type=Litellm_EntityType.ORGANIZATION,
prisma_client=prisma_client,
await self.spend_update_queue.add_update(
update=SpendUpdateQueueItem(
entity_type=Litellm_EntityType.ORGANIZATION,
entity_id=org_id,
response_cost=response_cost,
)
)
except Exception as e:
verbose_proxy_logger.info(
@ -324,8 +283,8 @@ class DBSpendUpdateWriter:
)
raise e
@staticmethod
async def _insert_spend_log_to_db(
self,
kwargs: Optional[dict],
completion_response: Optional[Union[litellm.ModelResponse, Any, Exception]],
start_time: Optional[datetime],
@ -346,7 +305,7 @@ class DBSpendUpdateWriter:
end_time=end_time,
)
payload["spend"] = response_cost or 0.0
DBSpendUpdateWriter._set_spend_logs_payload(
await self._set_spend_logs_payload(
payload=payload,
spend_logs_url=os.getenv("SPEND_LOGS_URL"),
prisma_client=prisma_client,
@ -357,8 +316,8 @@ class DBSpendUpdateWriter:
)
raise e
@staticmethod
def _set_spend_logs_payload(
async def _set_spend_logs_payload(
self,
payload: Union[dict, SpendLogsPayload],
prisma_client: PrismaClient,
spend_logs_url: Optional[str] = None,
@ -377,8 +336,9 @@ class DBSpendUpdateWriter:
elif prisma_client is not None:
prisma_client.spend_log_transactions.append(payload)
prisma_client.add_spend_log_transaction_to_daily_user_transaction(
payload.copy()
await self.add_spend_log_transaction_to_daily_user_transaction(
payload=payload.copy(),
prisma_client=prisma_client,
)
return prisma_client
@ -435,7 +395,8 @@ class DBSpendUpdateWriter:
- Only 1 pod will commit to db at a time (based on if it can acquire the lock over writing to DB)
"""
await self.redis_update_buffer.store_in_memory_spend_updates_in_redis(
prisma_client=prisma_client,
spend_update_queue=self.spend_update_queue,
daily_spend_update_queue=self.daily_spend_update_queue,
)
# Only commit from redis to db if this pod is the leader
@ -447,12 +408,23 @@ class DBSpendUpdateWriter:
await self.redis_update_buffer.get_all_update_transactions_from_redis_buffer()
)
if db_spend_update_transactions is not None:
await DBSpendUpdateWriter._commit_spend_updates_to_db(
await self._commit_spend_updates_to_db(
prisma_client=prisma_client,
n_retry_times=n_retry_times,
proxy_logging_obj=proxy_logging_obj,
db_spend_update_transactions=db_spend_update_transactions,
)
daily_spend_update_transactions = (
await self.redis_update_buffer.get_all_daily_spend_update_transactions_from_redis_buffer()
)
if daily_spend_update_transactions is not None:
await DBSpendUpdateWriter.update_daily_user_spend(
n_retry_times=n_retry_times,
prisma_client=prisma_client,
proxy_logging_obj=proxy_logging_obj,
daily_spend_transactions=daily_spend_update_transactions,
)
except Exception as e:
verbose_proxy_logger.error(f"Error committing spend updates: {e}")
finally:
@ -471,23 +443,34 @@ class DBSpendUpdateWriter:
Note: This flow causes Deadlocks in production (1K RPS+). Use self._commit_spend_updates_to_db_with_redis() instead if you expect 1K+ RPS.
"""
db_spend_update_transactions = DBSpendUpdateTransactions(
user_list_transactions=prisma_client.user_list_transactions,
end_user_list_transactions=prisma_client.end_user_list_transactions,
key_list_transactions=prisma_client.key_list_transactions,
team_list_transactions=prisma_client.team_list_transactions,
team_member_list_transactions=prisma_client.team_member_list_transactions,
org_list_transactions=prisma_client.org_list_transactions,
# Aggregate all in memory spend updates (key, user, end_user, team, team_member, org) and commit to db
################## Spend Update Transactions ##################
db_spend_update_transactions = (
await self.spend_update_queue.flush_and_get_aggregated_db_spend_update_transactions()
)
await DBSpendUpdateWriter._commit_spend_updates_to_db(
await self._commit_spend_updates_to_db(
prisma_client=prisma_client,
n_retry_times=n_retry_times,
proxy_logging_obj=proxy_logging_obj,
db_spend_update_transactions=db_spend_update_transactions,
)
@staticmethod
################## Daily Spend Update Transactions ##################
# Aggregate all in memory daily spend transactions and commit to db
daily_spend_update_transactions = (
await self.daily_spend_update_queue.flush_and_get_aggregated_daily_spend_update_transactions()
)
await DBSpendUpdateWriter.update_daily_user_spend(
n_retry_times=n_retry_times,
prisma_client=prisma_client,
proxy_logging_obj=proxy_logging_obj,
daily_spend_transactions=daily_spend_update_transactions,
)
async def _commit_spend_updates_to_db( # noqa: PLR0915
self,
prisma_client: PrismaClient,
n_retry_times: int,
proxy_logging_obj: ProxyLogging,
@ -526,9 +509,6 @@ class DBSpendUpdateWriter:
where={"user_id": user_id},
data={"spend": {"increment": response_cost}},
)
prisma_client.user_list_transactions = (
{}
) # Clear the remaining transactions after processing all batches in the loop.
break
except DB_CONNECTION_ERROR_TYPES as e:
if (
@ -561,6 +541,7 @@ class DBSpendUpdateWriter:
n_retry_times=n_retry_times,
prisma_client=prisma_client,
proxy_logging_obj=proxy_logging_obj,
end_user_list_transactions=end_user_list_transactions,
)
### UPDATE KEY TABLE ###
key_list_transactions = db_spend_update_transactions["key_list_transactions"]
@ -583,9 +564,6 @@ class DBSpendUpdateWriter:
where={"token": token},
data={"spend": {"increment": response_cost}},
)
prisma_client.key_list_transactions = (
{}
) # Clear the remaining transactions after processing all batches in the loop.
break
except DB_CONNECTION_ERROR_TYPES as e:
if (
@ -632,9 +610,6 @@ class DBSpendUpdateWriter:
where={"team_id": team_id},
data={"spend": {"increment": response_cost}},
)
prisma_client.team_list_transactions = (
{}
) # Clear the remaining transactions after processing all batches in the loop.
break
except DB_CONNECTION_ERROR_TYPES as e:
if (
@ -684,9 +659,6 @@ class DBSpendUpdateWriter:
where={"team_id": team_id, "user_id": user_id},
data={"spend": {"increment": response_cost}},
)
prisma_client.team_member_list_transactions = (
{}
) # Clear the remaining transactions after processing all batches in the loop.
break
except DB_CONNECTION_ERROR_TYPES as e:
if (
@ -725,9 +697,6 @@ class DBSpendUpdateWriter:
where={"organization_id": org_id},
data={"spend": {"increment": response_cost}},
)
prisma_client.org_list_transactions = (
{}
) # Clear the remaining transactions after processing all batches in the loop.
break
except DB_CONNECTION_ERROR_TYPES as e:
if (
@ -744,3 +713,192 @@ class DBSpendUpdateWriter:
_raise_failed_update_spend_exception(
e=e, start_time=start_time, proxy_logging_obj=proxy_logging_obj
)
@staticmethod
async def update_daily_user_spend(
n_retry_times: int,
prisma_client: PrismaClient,
proxy_logging_obj: ProxyLogging,
daily_spend_transactions: Dict[str, DailyUserSpendTransaction],
):
"""
Batch job to update LiteLLM_DailyUserSpend table using in-memory daily_spend_transactions
"""
from litellm.proxy.utils import _raise_failed_update_spend_exception
### UPDATE DAILY USER SPEND ###
verbose_proxy_logger.debug(
"Daily User Spend transactions: {}".format(len(daily_spend_transactions))
)
BATCH_SIZE = (
100 # Number of aggregated records to update in each database operation
)
start_time = time.time()
try:
for i in range(n_retry_times + 1):
try:
# Get transactions to process
transactions_to_process = dict(
list(daily_spend_transactions.items())[:BATCH_SIZE]
)
if len(transactions_to_process) == 0:
verbose_proxy_logger.debug(
"No new transactions to process for daily spend update"
)
break
# Update DailyUserSpend table in batches
async with prisma_client.db.batch_() as batcher:
for _, transaction in transactions_to_process.items():
user_id = transaction.get("user_id")
if not user_id: # Skip if no user_id
continue
batcher.litellm_dailyuserspend.upsert(
where={
"user_id_date_api_key_model_custom_llm_provider": {
"user_id": user_id,
"date": transaction["date"],
"api_key": transaction["api_key"],
"model": transaction["model"],
"custom_llm_provider": transaction.get(
"custom_llm_provider"
),
}
},
data={
"create": {
"user_id": user_id,
"date": transaction["date"],
"api_key": transaction["api_key"],
"model": transaction["model"],
"model_group": transaction.get("model_group"),
"custom_llm_provider": transaction.get(
"custom_llm_provider"
),
"prompt_tokens": transaction["prompt_tokens"],
"completion_tokens": transaction[
"completion_tokens"
],
"spend": transaction["spend"],
"api_requests": transaction["api_requests"],
"successful_requests": transaction[
"successful_requests"
],
"failed_requests": transaction[
"failed_requests"
],
},
"update": {
"prompt_tokens": {
"increment": transaction["prompt_tokens"]
},
"completion_tokens": {
"increment": transaction[
"completion_tokens"
]
},
"spend": {"increment": transaction["spend"]},
"api_requests": {
"increment": transaction["api_requests"]
},
"successful_requests": {
"increment": transaction[
"successful_requests"
]
},
"failed_requests": {
"increment": transaction["failed_requests"]
},
},
},
)
verbose_proxy_logger.info(
f"Processed {len(transactions_to_process)} daily spend transactions in {time.time() - start_time:.2f}s"
)
# Remove processed transactions
for key in transactions_to_process.keys():
daily_spend_transactions.pop(key, None)
verbose_proxy_logger.debug(
f"Processed {len(transactions_to_process)} daily spend transactions in {time.time() - start_time:.2f}s"
)
break
except DB_CONNECTION_ERROR_TYPES as e:
if i >= n_retry_times:
_raise_failed_update_spend_exception(
e=e,
start_time=start_time,
proxy_logging_obj=proxy_logging_obj,
)
await asyncio.sleep(2**i) # Exponential backoff
except Exception as e:
# Remove processed transactions even if there was an error
if "transactions_to_process" in locals():
for key in transactions_to_process.keys(): # type: ignore
daily_spend_transactions.pop(key, None)
_raise_failed_update_spend_exception(
e=e, start_time=start_time, proxy_logging_obj=proxy_logging_obj
)
async def add_spend_log_transaction_to_daily_user_transaction(
self,
payload: Union[dict, SpendLogsPayload],
prisma_client: PrismaClient,
):
"""
Add a spend log transaction to the `daily_spend_update_queue`
Key = @@unique([user_id, date, api_key, model, custom_llm_provider])
If key exists, update the transaction with the new spend and usage
"""
expected_keys = ["user", "startTime", "api_key", "model", "custom_llm_provider"]
if not all(key in payload for key in expected_keys):
verbose_proxy_logger.debug(
f"Missing expected keys: {expected_keys}, in payload, skipping from daily_user_spend_transactions"
)
return
request_status = prisma_client.get_request_status(payload)
verbose_proxy_logger.info(f"Logged request status: {request_status}")
if isinstance(payload["startTime"], datetime):
start_time = payload["startTime"].isoformat()
date = start_time.split("T")[0]
elif isinstance(payload["startTime"], str):
date = payload["startTime"].split("T")[0]
else:
verbose_proxy_logger.debug(
f"Invalid start time: {payload['startTime']}, skipping from daily_user_spend_transactions"
)
return
try:
daily_transaction_key = f"{payload['user']}_{date}_{payload['api_key']}_{payload['model']}_{payload['custom_llm_provider']}"
daily_transaction = DailyUserSpendTransaction(
user_id=payload["user"],
date=date,
api_key=payload["api_key"],
model=payload["model"],
model_group=payload["model_group"],
custom_llm_provider=payload["custom_llm_provider"],
prompt_tokens=payload["prompt_tokens"],
completion_tokens=payload["completion_tokens"],
spend=payload["spend"],
api_requests=1,
successful_requests=1 if request_status == "success" else 0,
failed_requests=1 if request_status != "success" else 0,
)
await self.daily_spend_update_queue.add_update(
update={daily_transaction_key: daily_transaction}
)
except Exception as e:
raise e
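As a rough illustration of the method above (a sketch with example values, assuming litellm is installed and these types are importable exactly as shown in this diff), a daily transaction keyed by user, date, api_key, model, and provider can be enqueued like this; the key format mirrors the @@unique constraint on LiteLLM_DailyUserSpend:
import asyncio

from litellm.proxy._types import DailyUserSpendTransaction
from litellm.proxy.db.db_transaction_queue.daily_spend_update_queue import (
    DailySpendUpdateQueue,
)

async def main() -> None:
    queue = DailySpendUpdateQueue()
    # Example values only; the key mirrors user_id, date, api_key, model, provider
    key = "user-1_2025-04-02_sk-hash_gpt-4o_openai"
    txn = DailyUserSpendTransaction(
        user_id="user-1",
        date="2025-04-02",
        api_key="sk-hash",
        model="gpt-4o",
        model_group="gpt-4o",
        custom_llm_provider="openai",
        prompt_tokens=100,
        completion_tokens=50,
        spend=0.002,
        api_requests=1,
        successful_requests=1,
        failed_requests=0,
    )
    await queue.add_update(update={key: txn})

asyncio.run(main())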

View file

@ -0,0 +1,25 @@
"""
Base class for an in-memory buffer for database transactions
"""
import asyncio
from litellm._logging import verbose_proxy_logger
class BaseUpdateQueue:
"""Base class for in memory buffer for database transactions"""
def __init__(self):
self.update_queue = asyncio.Queue()
async def add_update(self, update):
"""Enqueue an update."""
verbose_proxy_logger.debug("Adding update to queue: %s", update)
await self.update_queue.put(update)
async def flush_all_updates_from_in_memory_queue(self):
"""Get all updates from the queue."""
updates = []
while not self.update_queue.empty():
updates.append(await self.update_queue.get())
return updates
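A minimal usage sketch for the buffer (assuming litellm is installed; the payloads here are arbitrary examples):
import asyncio

from litellm.proxy.db.db_transaction_queue.base_update_queue import BaseUpdateQueue

async def main() -> None:
    queue = BaseUpdateQueue()
    await queue.add_update({"entity_id": "user-1", "response_cost": 0.01})
    await queue.add_update({"entity_id": "user-2", "response_cost": 0.02})
    # Drains everything currently buffered in memory, in FIFO order
    updates = await queue.flush_all_updates_from_in_memory_queue()
    print(updates)

asyncio.run(main())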

View file

@ -0,0 +1,95 @@
import asyncio
from typing import Dict, List
from litellm._logging import verbose_proxy_logger
from litellm.proxy._types import DailyUserSpendTransaction
from litellm.proxy.db.db_transaction_queue.base_update_queue import BaseUpdateQueue
class DailySpendUpdateQueue(BaseUpdateQueue):
"""
In-memory buffer for daily spend updates that should be committed to the database
To add a new daily spend update transaction, use the following format:
daily_spend_update_queue.add_update({
"user1_date_api_key_model_custom_llm_provider": {
"spend": 10,
"prompt_tokens": 100,
"completion_tokens": 100,
}
})
The queue contains a list of daily spend update transactions, e.g.:
queue = [
{
"user1_date_api_key_model_custom_llm_provider": {
"spend": 10,
"prompt_tokens": 100,
"completion_tokens": 100,
"api_requests": 100,
"successful_requests": 100,
"failed_requests": 100,
}
},
{
"user2_date_api_key_model_custom_llm_provider": {
"spend": 10,
"prompt_tokens": 100,
"completion_tokens": 100,
"api_requests": 100,
"successful_requests": 100,
"failed_requests": 100,
}
}
]
"""
def __init__(self):
super().__init__()
self.update_queue: asyncio.Queue[
Dict[str, DailyUserSpendTransaction]
] = asyncio.Queue()
async def flush_and_get_aggregated_daily_spend_update_transactions(
self,
) -> Dict[str, DailyUserSpendTransaction]:
"""Get all updates from the queue and return all updates aggregated by daily_transaction_key."""
updates = await self.flush_all_updates_from_in_memory_queue()
aggregated_daily_spend_update_transactions = (
DailySpendUpdateQueue.get_aggregated_daily_spend_update_transactions(
updates
)
)
verbose_proxy_logger.debug(
"Aggregated daily spend update transactions: %s",
aggregated_daily_spend_update_transactions,
)
return aggregated_daily_spend_update_transactions
@staticmethod
def get_aggregated_daily_spend_update_transactions(
updates: List[Dict[str, DailyUserSpendTransaction]]
) -> Dict[str, DailyUserSpendTransaction]:
"""Aggregate updates by daily_transaction_key."""
aggregated_daily_spend_update_transactions: Dict[
str, DailyUserSpendTransaction
] = {}
for _update in updates:
for _key, payload in _update.items():
if _key in aggregated_daily_spend_update_transactions:
daily_transaction = aggregated_daily_spend_update_transactions[_key]
daily_transaction["spend"] += payload["spend"]
daily_transaction["prompt_tokens"] += payload["prompt_tokens"]
daily_transaction["completion_tokens"] += payload[
"completion_tokens"
]
daily_transaction["api_requests"] += payload["api_requests"]
daily_transaction["successful_requests"] += payload[
"successful_requests"
]
daily_transaction["failed_requests"] += payload["failed_requests"]
else:
aggregated_daily_spend_update_transactions[_key] = payload
return aggregated_daily_spend_update_transactions
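A sketch of the aggregation behavior (example values only; the static method can be called without instantiating the queue): two updates that share a transaction key collapse into one record with summed counters.
from litellm.proxy.db.db_transaction_queue.daily_spend_update_queue import (
    DailySpendUpdateQueue,
)

key = "user1_2025-04-02_apikey_gpt-4o_openai"
update_a = {
    key: {
        "spend": 10.0,
        "prompt_tokens": 100,
        "completion_tokens": 100,
        "api_requests": 1,
        "successful_requests": 1,
        "failed_requests": 0,
    }
}
update_b = {
    key: {
        "spend": 2.5,
        "prompt_tokens": 40,
        "completion_tokens": 10,
        "api_requests": 1,
        "successful_requests": 0,
        "failed_requests": 1,
    }
}

aggregated = DailySpendUpdateQueue.get_aggregated_daily_spend_update_transactions(
    [update_a, update_b]
)
print(aggregated[key]["spend"])  # 12.5
print(aggregated[key]["failed_requests"])  # 1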

View file

@ -9,9 +9,17 @@ from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union
from litellm._logging import verbose_proxy_logger
from litellm.caching import RedisCache
from litellm.constants import MAX_REDIS_BUFFER_DEQUEUE_COUNT, REDIS_UPDATE_BUFFER_KEY
from litellm.constants import (
MAX_REDIS_BUFFER_DEQUEUE_COUNT,
REDIS_DAILY_SPEND_UPDATE_BUFFER_KEY,
REDIS_UPDATE_BUFFER_KEY,
)
from litellm.litellm_core_utils.safe_json_dumps import safe_dumps
from litellm.proxy._types import DBSpendUpdateTransactions
from litellm.proxy._types import DailyUserSpendTransaction, DBSpendUpdateTransactions
from litellm.proxy.db.db_transaction_queue.daily_spend_update_queue import (
DailySpendUpdateQueue,
)
from litellm.proxy.db.db_transaction_queue.spend_update_queue import SpendUpdateQueue
from litellm.secret_managers.main import str_to_bool
if TYPE_CHECKING:
@ -54,37 +62,70 @@ class RedisUpdateBuffer:
async def store_in_memory_spend_updates_in_redis(
self,
prisma_client: PrismaClient,
spend_update_queue: SpendUpdateQueue,
daily_spend_update_queue: DailySpendUpdateQueue,
):
"""
Stores the in-memory spend updates to Redis
Each transaction is a dict stored as following:
- key is the entity id
- value is the spend amount
Stores the following in-memory data structures in Redis:
- SpendUpdateQueue - Key, User, Team, TeamMember, Org, EndUser spend updates
- DailySpendUpdateQueue - aggregated view of daily spend updates
```
Redis List:
key_list_transactions:
[
"0929880201": 1.2,
"0929880202": 0.01,
"0929880203": 0.001,
]
```
For SpendUpdateQueue:
Each transaction is a dict stored as follows:
- key is the entity id
- value is the spend amount
```
Redis List:
key_list_transactions:
[
"0929880201": 1.2,
"0929880202": 0.01,
"0929880203": 0.001,
]
```
For DailySpendUpdateQueue:
Each transaction is a Dict[str, DailyUserSpendTransaction] stored as follows:
- key is the daily_transaction_key
- value is the DailyUserSpendTransaction
```
Redis List:
daily_spend_update_transactions:
[
{
"user_keyhash_1_model_1": {
"spend": 1.2,
"prompt_tokens": 1000,
"completion_tokens": 1000,
"api_requests": 1000,
"successful_requests": 1000,
},
}
]
```
"""
if self.redis_cache is None:
verbose_proxy_logger.debug(
"redis_cache is None, skipping store_in_memory_spend_updates_in_redis"
)
return
db_spend_update_transactions: DBSpendUpdateTransactions = DBSpendUpdateTransactions(
user_list_transactions=prisma_client.user_list_transactions,
end_user_list_transactions=prisma_client.end_user_list_transactions,
key_list_transactions=prisma_client.key_list_transactions,
team_list_transactions=prisma_client.team_list_transactions,
team_member_list_transactions=prisma_client.team_member_list_transactions,
org_list_transactions=prisma_client.org_list_transactions,
db_spend_update_transactions = (
await spend_update_queue.flush_and_get_aggregated_db_spend_update_transactions()
)
verbose_proxy_logger.debug(
"ALL DB SPEND UPDATE TRANSACTIONS: %s", db_spend_update_transactions
)
daily_spend_update_transactions = (
await daily_spend_update_queue.flush_and_get_aggregated_daily_spend_update_transactions()
)
verbose_proxy_logger.debug(
"ALL DAILY SPEND UPDATE TRANSACTIONS: %s", daily_spend_update_transactions
)
# only store in redis if there are any updates to commit
@ -100,8 +141,13 @@ class RedisUpdateBuffer:
values=list_of_transactions,
)
# clear the in-memory spend updates
RedisUpdateBuffer._clear_all_in_memory_spend_updates(prisma_client)
list_of_daily_spend_update_transactions = [
safe_dumps(daily_spend_update_transactions)
]
await self.redis_cache.async_rpush(
key=REDIS_DAILY_SPEND_UPDATE_BUFFER_KEY,
values=list_of_daily_spend_update_transactions,
)
@staticmethod
def _number_of_transactions_to_store_in_redis(
@ -116,20 +162,6 @@ class RedisUpdateBuffer:
num_transactions += len(v)
return num_transactions
@staticmethod
def _clear_all_in_memory_spend_updates(
prisma_client: PrismaClient,
):
"""
Clears all in-memory spend updates
"""
prisma_client.user_list_transactions = {}
prisma_client.end_user_list_transactions = {}
prisma_client.key_list_transactions = {}
prisma_client.team_list_transactions = {}
prisma_client.team_member_list_transactions = {}
prisma_client.org_list_transactions = {}
@staticmethod
def _remove_prefix_from_keys(data: Dict[str, Any], prefix: str) -> Dict[str, Any]:
"""
@ -197,6 +229,27 @@ class RedisUpdateBuffer:
return combined_transaction
async def get_all_daily_spend_update_transactions_from_redis_buffer(
self,
) -> Optional[Dict[str, DailyUserSpendTransaction]]:
"""
Gets all the daily spend update transactions from Redis
"""
if self.redis_cache is None:
return None
list_of_transactions = await self.redis_cache.async_lpop(
key=REDIS_DAILY_SPEND_UPDATE_BUFFER_KEY,
count=MAX_REDIS_BUFFER_DEQUEUE_COUNT,
)
if list_of_transactions is None:
return None
list_of_daily_spend_update_transactions = [
json.loads(transaction) for transaction in list_of_transactions
]
return DailySpendUpdateQueue.get_aggregated_daily_spend_update_transactions(
list_of_daily_spend_update_transactions
)
@staticmethod
def _parse_list_of_transactions(
list_of_transactions: Union[Any, List[Any]],

View file

@ -0,0 +1,113 @@
import asyncio
from typing import List
from litellm._logging import verbose_proxy_logger
from litellm.proxy._types import (
DBSpendUpdateTransactions,
Litellm_EntityType,
SpendUpdateQueueItem,
)
from litellm.proxy.db.db_transaction_queue.base_update_queue import BaseUpdateQueue
class SpendUpdateQueue(BaseUpdateQueue):
"""
In-memory buffer for spend updates that should be committed to the database
"""
def __init__(self):
super().__init__()
self.update_queue: asyncio.Queue[SpendUpdateQueueItem] = asyncio.Queue()
async def flush_and_get_aggregated_db_spend_update_transactions(
self,
) -> DBSpendUpdateTransactions:
"""Flush all updates from the queue and return all updates aggregated by entity type."""
updates = await self.flush_all_updates_from_in_memory_queue()
verbose_proxy_logger.debug("Aggregating updates by entity type: %s", updates)
return self.get_aggregated_db_spend_update_transactions(updates)
def get_aggregated_db_spend_update_transactions(
self, updates: List[SpendUpdateQueueItem]
) -> DBSpendUpdateTransactions:
"""Aggregate updates by entity type."""
# Initialize all transaction lists as empty dicts
db_spend_update_transactions = DBSpendUpdateTransactions(
user_list_transactions={},
end_user_list_transactions={},
key_list_transactions={},
team_list_transactions={},
team_member_list_transactions={},
org_list_transactions={},
)
# Map entity types to their corresponding transaction dictionary keys
entity_type_to_dict_key = {
Litellm_EntityType.USER: "user_list_transactions",
Litellm_EntityType.END_USER: "end_user_list_transactions",
Litellm_EntityType.KEY: "key_list_transactions",
Litellm_EntityType.TEAM: "team_list_transactions",
Litellm_EntityType.TEAM_MEMBER: "team_member_list_transactions",
Litellm_EntityType.ORGANIZATION: "org_list_transactions",
}
for update in updates:
entity_type = update.get("entity_type")
entity_id = update.get("entity_id") or ""
response_cost = update.get("response_cost") or 0
if entity_type is None:
verbose_proxy_logger.debug(
"Skipping update spend for update: %s, because entity_type is None",
update,
)
continue
dict_key = entity_type_to_dict_key.get(entity_type)
if dict_key is None:
verbose_proxy_logger.debug(
"Skipping update spend for update: %s, because entity_type is not in entity_type_to_dict_key",
update,
)
continue # Skip unknown entity types
# Type-safe access using if/elif statements
if dict_key == "user_list_transactions":
transactions_dict = db_spend_update_transactions[
"user_list_transactions"
]
elif dict_key == "end_user_list_transactions":
transactions_dict = db_spend_update_transactions[
"end_user_list_transactions"
]
elif dict_key == "key_list_transactions":
transactions_dict = db_spend_update_transactions[
"key_list_transactions"
]
elif dict_key == "team_list_transactions":
transactions_dict = db_spend_update_transactions[
"team_list_transactions"
]
elif dict_key == "team_member_list_transactions":
transactions_dict = db_spend_update_transactions[
"team_member_list_transactions"
]
elif dict_key == "org_list_transactions":
transactions_dict = db_spend_update_transactions[
"org_list_transactions"
]
else:
continue
if transactions_dict is None:
transactions_dict = {}
# type: ignore below is safe: dict_key is guaranteed to be one of the DBSpendUpdateTransactions keys ("user_list_transactions", "end_user_list_transactions", "key_list_transactions", "team_list_transactions", "team_member_list_transactions", "org_list_transactions")
db_spend_update_transactions[dict_key] = transactions_dict # type: ignore
if entity_id not in transactions_dict:
transactions_dict[entity_id] = 0
transactions_dict[entity_id] += response_cost or 0
return db_spend_update_transactions
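A usage sketch (example ids and costs; assumes litellm is installed): repeated updates for the same entity are summed into the per-entity transaction dict.
import asyncio

from litellm.proxy._types import Litellm_EntityType, SpendUpdateQueueItem
from litellm.proxy.db.db_transaction_queue.spend_update_queue import SpendUpdateQueue

async def main() -> None:
    queue = SpendUpdateQueue()
    await queue.add_update(
        SpendUpdateQueueItem(
            entity_type=Litellm_EntityType.USER,
            entity_id="user-1",
            response_cost=0.25,
        )
    )
    await queue.add_update(
        SpendUpdateQueueItem(
            entity_type=Litellm_EntityType.USER,
            entity_id="user-1",
            response_cost=0.5,
        )
    )
    txns = await queue.flush_and_get_aggregated_db_spend_update_transactions()
    print(txns["user_list_transactions"])  # {'user-1': 0.75}

asyncio.run(main())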

View file

@ -0,0 +1,15 @@
model_list:
- model_name: fake-openai-endpoint
litellm_params:
model: openai/fake
api_key: fake-key
api_base: https://exampleopenaiendpoint-production.up.railway.app/
general_settings:
use_redis_transaction_buffer: true
litellm_settings:
cache: True
cache_params:
type: redis
supported_call_types: []

View file

@ -14,6 +14,7 @@ from pydantic import BaseModel
from websockets.asyncio.client import ClientConnection, connect
from litellm import DualCache
from litellm._version import version as litellm_version
from litellm._logging import verbose_proxy_logger
from litellm.integrations.custom_guardrail import CustomGuardrail
from litellm.llms.custom_httpx.http_handler import (
@ -75,7 +76,9 @@ class AimGuardrail(CustomGuardrail):
) -> Union[Exception, str, dict, None]:
verbose_proxy_logger.debug("Inside AIM Pre-Call Hook")
await self.call_aim_guardrail(data, hook="pre_call")
await self.call_aim_guardrail(
data, hook="pre_call", key_alias=user_api_key_dict.key_alias
)
return data
async def async_moderation_hook(
@ -93,15 +96,18 @@ class AimGuardrail(CustomGuardrail):
) -> Union[Exception, str, dict, None]:
verbose_proxy_logger.debug("Inside AIM Moderation Hook")
await self.call_aim_guardrail(data, hook="moderation")
await self.call_aim_guardrail(
data, hook="moderation", key_alias=user_api_key_dict.key_alias
)
return data
async def call_aim_guardrail(self, data: dict, hook: str) -> None:
async def call_aim_guardrail(
self, data: dict, hook: str, key_alias: Optional[str]
) -> None:
user_email = data.get("metadata", {}).get("headers", {}).get("x-aim-user-email")
headers = {
"Authorization": f"Bearer {self.api_key}",
"x-aim-litellm-hook": hook,
} | ({"x-aim-user-email": user_email} if user_email else {})
headers = self._build_aim_headers(
hook=hook, key_alias=key_alias, user_email=user_email
)
response = await self.async_handler.post(
f"{self.api_base}/detect/openai",
headers=headers,
@ -120,18 +126,16 @@ class AimGuardrail(CustomGuardrail):
raise HTTPException(status_code=400, detail=res["detection_message"])
async def call_aim_guardrail_on_output(
self, request_data: dict, output: str, hook: str
self, request_data: dict, output: str, hook: str, key_alias: Optional[str]
) -> Optional[str]:
user_email = (
request_data.get("metadata", {}).get("headers", {}).get("x-aim-user-email")
)
headers = {
"Authorization": f"Bearer {self.api_key}",
"x-aim-litellm-hook": hook,
} | ({"x-aim-user-email": user_email} if user_email else {})
response = await self.async_handler.post(
f"{self.api_base}/detect/output",
headers=headers,
headers=self._build_aim_headers(
hook=hook, key_alias=key_alias, user_email=user_email
),
json={"output": output, "messages": request_data.get("messages", [])},
)
response.raise_for_status()
@ -147,6 +151,32 @@ class AimGuardrail(CustomGuardrail):
return res["detection_message"]
return None
def _build_aim_headers(
self, *, hook: str, key_alias: Optional[str], user_email: Optional[str]
):
"""
A helper function to build the HTTP headers required by Aim guardrails.
"""
return (
{
"Authorization": f"Bearer {self.api_key}",
# Used by Aim to apply only the guardrails that should be applied in a specific request phase.
"x-aim-litellm-hook": hook,
# Used by Aim to track LiteLLM version and provide backward compatibility.
"x-aim-litellm-version": litellm_version,
}
# Used by Aim to track guardrail violations by user.
| ({"x-aim-user-email": user_email} if user_email else {})
| (
{
# Used by Aim to apply only the guardrails that are associated with the key alias.
"x-aim-litellm-key-alias": key_alias,
}
if key_alias
else {}
)
)
async def async_post_call_success_hook(
self,
data: dict,
@ -160,7 +190,7 @@ class AimGuardrail(CustomGuardrail):
):
content = response.choices[0].message.content or ""
detection = await self.call_aim_guardrail_on_output(
data, content, hook="output"
data, content, hook="output", key_alias=user_api_key_dict.key_alias
)
if detection:
raise HTTPException(status_code=400, detail=detection)
@ -174,11 +204,13 @@ class AimGuardrail(CustomGuardrail):
user_email = (
request_data.get("metadata", {}).get("headers", {}).get("x-aim-user-email")
)
headers = {
"Authorization": f"Bearer {self.api_key}",
} | ({"x-aim-user-email": user_email} if user_email else {})
async with connect(
f"{self.ws_api_base}/detect/output/ws", additional_headers=headers
f"{self.ws_api_base}/detect/output/ws",
additional_headers=self._build_aim_headers(
hook="output",
key_alias=user_api_key_dict.key_alias,
user_email=user_email,
),
) as websocket:
sender = asyncio.create_task(
self.forward_the_stream_to_aim(websocket, response)

View file

@ -13,7 +13,6 @@ from litellm.litellm_core_utils.core_helpers import (
from litellm.litellm_core_utils.litellm_logging import StandardLoggingPayloadSetup
from litellm.proxy._types import UserAPIKeyAuth
from litellm.proxy.auth.auth_checks import log_db_metrics
from litellm.proxy.db.db_spend_update_writer import DBSpendUpdateWriter
from litellm.proxy.utils import ProxyUpdateSpend
from litellm.types.utils import (
StandardLoggingPayload,
@ -37,6 +36,8 @@ class _ProxyDBLogger(CustomLogger):
if _ProxyDBLogger._should_track_errors_in_db() is False:
return
from litellm.proxy.proxy_server import proxy_logging_obj
_metadata = dict(
StandardLoggingUserAPIKeyMetadata(
user_api_key_hash=user_api_key_dict.api_key,
@ -66,7 +67,7 @@ class _ProxyDBLogger(CustomLogger):
request_data.get("proxy_server_request") or {}
)
request_data["litellm_params"]["metadata"] = existing_metadata
await DBSpendUpdateWriter.update_database(
await proxy_logging_obj.db_spend_update_writer.update_database(
token=user_api_key_dict.api_key,
response_cost=0.0,
user_id=user_api_key_dict.user_id,
@ -136,7 +137,7 @@ class _ProxyDBLogger(CustomLogger):
end_user_id=end_user_id,
):
## UPDATE DATABASE
await DBSpendUpdateWriter.update_database(
await proxy_logging_obj.db_spend_update_writer.update_database(
token=user_api_key,
response_cost=response_cost,
user_id=user_id,

View file

@ -747,7 +747,10 @@ def _get_enforced_params(
enforced_params: Optional[list] = None
if general_settings is not None:
enforced_params = general_settings.get("enforced_params")
if "service_account_settings" in general_settings:
if (
"service_account_settings" in general_settings
and check_if_token_is_service_account(user_api_key_dict) is True
):
service_account_settings = general_settings["service_account_settings"]
if "enforced_params" in service_account_settings:
if enforced_params is None:
@ -760,6 +763,20 @@ def _get_enforced_params(
return enforced_params
def check_if_token_is_service_account(valid_token: UserAPIKeyAuth) -> bool:
"""
Checks if the token is a service account
Returns:
bool: True if token is a service account
"""
if valid_token.metadata:
if "service_account_id" in valid_token.metadata:
return True
return False
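A small sketch of how this check behaves (assuming UserAPIKeyAuth accepts api_key and metadata as constructor fields, and that this snippet runs in the same module as the function above):
service_key = UserAPIKeyAuth(
    api_key="sk-hash", metadata={"service_account_id": "svc-123"}
)
normal_key = UserAPIKeyAuth(api_key="sk-hash", metadata={})

print(check_if_token_is_service_account(service_key))  # True
print(check_if_token_is_service_account(normal_key))  # False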
def _enforced_params_check(
request_body: dict,
general_settings: Optional[dict],

View file

@ -1259,19 +1259,43 @@ class SpendMetrics(BaseModel):
prompt_tokens: int = Field(default=0)
completion_tokens: int = Field(default=0)
total_tokens: int = Field(default=0)
successful_requests: int = Field(default=0)
failed_requests: int = Field(default=0)
api_requests: int = Field(default=0)
class MetricBase(BaseModel):
metrics: SpendMetrics
class MetricWithMetadata(MetricBase):
metadata: Dict[str, Any] = Field(default_factory=dict)
class KeyMetadata(BaseModel):
"""Metadata for a key"""
key_alias: Optional[str] = None
class KeyMetricWithMetadata(MetricBase):
"""Base class for metrics with additional metadata"""
metadata: KeyMetadata = Field(default_factory=KeyMetadata)
class BreakdownMetrics(BaseModel):
"""Breakdown of spend by different dimensions"""
models: Dict[str, SpendMetrics] = Field(default_factory=dict) # model -> metrics
providers: Dict[str, SpendMetrics] = Field(
models: Dict[str, MetricWithMetadata] = Field(
default_factory=dict
) # provider -> metrics
api_keys: Dict[str, SpendMetrics] = Field(
) # model -> {metrics, metadata}
providers: Dict[str, MetricWithMetadata] = Field(
default_factory=dict
) # api_key -> metrics
) # provider -> {metrics, metadata}
api_keys: Dict[str, KeyMetricWithMetadata] = Field(
default_factory=dict
) # api_key -> {metrics, metadata}
class DailySpendData(BaseModel):
@ -1284,7 +1308,10 @@ class DailySpendMetadata(BaseModel):
total_spend: float = Field(default=0.0)
total_prompt_tokens: int = Field(default=0)
total_completion_tokens: int = Field(default=0)
total_tokens: int = Field(default=0)
total_api_requests: int = Field(default=0)
total_successful_requests: int = Field(default=0)
total_failed_requests: int = Field(default=0)
page: int = Field(default=1)
total_pages: int = Field(default=1)
has_more: bool = Field(default=False)
@ -1307,6 +1334,8 @@ class LiteLLM_DailyUserSpend(BaseModel):
completion_tokens: int = 0
spend: float = 0.0
api_requests: int = 0
successful_requests: int = 0
failed_requests: int = 0
class GroupedData(TypedDict):
@ -1322,34 +1351,57 @@ def update_metrics(
group_metrics.completion_tokens += record.completion_tokens
group_metrics.total_tokens += record.prompt_tokens + record.completion_tokens
group_metrics.api_requests += record.api_requests
group_metrics.successful_requests += record.successful_requests
group_metrics.failed_requests += record.failed_requests
return group_metrics
def update_breakdown_metrics(
breakdown: BreakdownMetrics, record: LiteLLM_DailyUserSpend
breakdown: BreakdownMetrics,
record: LiteLLM_DailyUserSpend,
model_metadata: Dict[str, Dict[str, Any]],
provider_metadata: Dict[str, Dict[str, Any]],
api_key_metadata: Dict[str, Dict[str, Any]],
) -> BreakdownMetrics:
"""Updates breakdown metrics for a single record using the existing update_metrics function"""
# Update model breakdown
if record.model not in breakdown.models:
breakdown.models[record.model] = SpendMetrics()
breakdown.models[record.model] = update_metrics(
breakdown.models[record.model], record
breakdown.models[record.model] = MetricWithMetadata(
metrics=SpendMetrics(),
metadata=model_metadata.get(
record.model, {}
), # Add any model-specific metadata here
)
breakdown.models[record.model].metrics = update_metrics(
breakdown.models[record.model].metrics, record
)
# Update provider breakdown
provider = record.custom_llm_provider or "unknown"
if provider not in breakdown.providers:
breakdown.providers[provider] = SpendMetrics()
breakdown.providers[provider] = update_metrics(
breakdown.providers[provider], record
breakdown.providers[provider] = MetricWithMetadata(
metrics=SpendMetrics(),
metadata=provider_metadata.get(
provider, {}
), # Add any provider-specific metadata here
)
breakdown.providers[provider].metrics = update_metrics(
breakdown.providers[provider].metrics, record
)
# Update api key breakdown
if record.api_key not in breakdown.api_keys:
breakdown.api_keys[record.api_key] = SpendMetrics()
breakdown.api_keys[record.api_key] = update_metrics(
breakdown.api_keys[record.api_key], record
breakdown.api_keys[record.api_key] = KeyMetricWithMetadata(
metrics=SpendMetrics(),
metadata=KeyMetadata(
key_alias=api_key_metadata.get(record.api_key, {}).get(
"key_alias", None
)
), # Add any api_key-specific metadata here
)
breakdown.api_keys[record.api_key].metrics = update_metrics(
breakdown.api_keys[record.api_key].metrics, record
)
return breakdown
@ -1428,6 +1480,14 @@ async def get_user_daily_activity(
if api_key:
where_conditions["api_key"] = api_key
if (
user_api_key_dict.user_role != LitellmUserRoles.PROXY_ADMIN
and user_api_key_dict.user_role != LitellmUserRoles.PROXY_ADMIN_VIEW_ONLY
):
where_conditions[
"user_id"
] = user_api_key_dict.user_id # only allow access to own data
# Get total count for pagination
total_count = await prisma_client.db.litellm_dailyuserspend.count(
where=where_conditions
@ -1443,6 +1503,28 @@ async def get_user_daily_activity(
take=page_size,
)
daily_spend_data_pydantic_list = [
LiteLLM_DailyUserSpend(**record.model_dump()) for record in daily_spend_data
]
# Get all unique API keys from the spend data
api_keys = set()
for record in daily_spend_data_pydantic_list:
if record.api_key:
api_keys.add(record.api_key)
# Fetch key aliases in bulk
api_key_metadata: Dict[str, Dict[str, Any]] = {}
model_metadata: Dict[str, Dict[str, Any]] = {}
provider_metadata: Dict[str, Dict[str, Any]] = {}
if api_keys:
key_records = await prisma_client.db.litellm_verificationtoken.find_many(
where={"token": {"in": list(api_keys)}}
)
api_key_metadata.update(
{k.token: {"key_alias": k.key_alias} for k in key_records}
)
# Process results
results = []
total_metrics = SpendMetrics()
@ -1450,7 +1532,7 @@ async def get_user_daily_activity(
# Group data by date and other dimensions
grouped_data: Dict[str, Dict[str, Any]] = {}
for record in daily_spend_data:
for record in daily_spend_data_pydantic_list:
date_str = record.date
if date_str not in grouped_data:
grouped_data[date_str] = {
@ -1464,7 +1546,11 @@ async def get_user_daily_activity(
)
# Update breakdowns
grouped_data[date_str]["breakdown"] = update_breakdown_metrics(
grouped_data[date_str]["breakdown"], record
grouped_data[date_str]["breakdown"],
record,
model_metadata,
provider_metadata,
api_key_metadata,
)
# Update total metrics
@ -1474,7 +1560,9 @@ async def get_user_daily_activity(
total_metrics.total_tokens += (
record.prompt_tokens + record.completion_tokens
)
total_metrics.api_requests += 1
total_metrics.api_requests += record.api_requests
total_metrics.successful_requests += record.successful_requests
total_metrics.failed_requests += record.failed_requests
# Convert grouped data to response format
for date_str, data in grouped_data.items():
@ -1495,7 +1583,10 @@ async def get_user_daily_activity(
total_spend=total_metrics.spend,
total_prompt_tokens=total_metrics.prompt_tokens,
total_completion_tokens=total_metrics.completion_tokens,
total_tokens=total_metrics.total_tokens,
total_api_requests=total_metrics.api_requests,
total_successful_requests=total_metrics.successful_requests,
total_failed_requests=total_metrics.failed_requests,
page=page,
total_pages=-(-total_count // page_size), # Ceiling division
has_more=(page * page_size) < total_count,

View file

@ -394,7 +394,7 @@ class ModelManagementAuthChecks:
@staticmethod
async def can_user_make_model_call(
model_params: Union[Deployment, updateDeployment],
model_params: Deployment,
user_api_key_dict: UserAPIKeyAuth,
prisma_client: PrismaClient,
premium_user: bool,
@ -723,8 +723,38 @@ async def update_model(
},
)
_model_id = None
_model_info = getattr(model_params, "model_info", None)
if _model_info is None:
raise Exception("model_info not provided")
_model_id = _model_info.id
if _model_id is None:
raise Exception("model_info.id not provided")
_existing_litellm_params = (
await prisma_client.db.litellm_proxymodeltable.find_unique(
where={"model_id": _model_id}
)
)
if _existing_litellm_params is None:
if (
llm_router is not None
and llm_router.get_deployment(model_id=_model_id) is not None
):
raise HTTPException(
status_code=400,
detail={
"error": "Can't edit model. Model in config. Store model in db via `/model/new`. to edit."
},
)
else:
raise Exception("model not found")
deployment = Deployment(**_existing_litellm_params.model_dump())
await ModelManagementAuthChecks.can_user_make_model_call(
model_params=model_params,
model_params=deployment,
user_api_key_dict=user_api_key_dict,
prisma_client=prisma_client,
premium_user=premium_user,
@ -732,31 +762,6 @@ async def update_model(
# update DB
if store_model_in_db is True:
_model_id = None
_model_info = getattr(model_params, "model_info", None)
if _model_info is None:
raise Exception("model_info not provided")
_model_id = _model_info.id
if _model_id is None:
raise Exception("model_info.id not provided")
_existing_litellm_params = (
await prisma_client.db.litellm_proxymodeltable.find_unique(
where={"model_id": _model_id}
)
)
if _existing_litellm_params is None:
if (
llm_router is not None
and llm_router.get_deployment(model_id=_model_id) is not None
):
raise HTTPException(
status_code=400,
detail={
"error": "Can't edit model. Model in config. Store model in db via `/model/new`. to edit."
},
)
raise Exception("model not found")
_existing_litellm_params_dict = dict(
_existing_litellm_params.litellm_params
)

View file

@ -1,15 +1,6 @@
model_list:
- model_name: gpt-4o
- model_name: fake-openai-endpoint
litellm_params:
model: openai/gpt-4o
api_key: sk-xxxxxxx
mcp_servers:
{
"zapier_mcp": {
"url": "https://actions.zapier.com/mcp/sk-akxxxxx/sse"
},
"fetch": {
"url": "http://localhost:8000/sse"
}
}
model: openai/fake
api_key: fake-key
api_base: https://exampleopenaiendpoint-production.up.railway.app/

View file

@ -3308,15 +3308,6 @@ async def model_list(
tags=["chat/completions"],
responses={200: {"description": "Successful response"}, **ERROR_RESPONSES},
) # azure compatible endpoint
@backoff.on_exception(
backoff.expo,
Exception, # base exception to catch for the backoff
max_tries=global_max_parallel_request_retries, # maximum number of retries
max_time=global_max_parallel_request_retry_timeout, # maximum total time to retry for
on_backoff=on_backoff, # specifying the function to call on backoff
giveup=giveup,
logger=verbose_proxy_logger,
)
async def chat_completion( # noqa: PLR0915
request: Request,
fastapi_response: Response,

View file

@ -327,6 +327,8 @@ model LiteLLM_DailyUserSpend {
completion_tokens Int @default(0)
spend Float @default(0.0)
api_requests Int @default(0)
successful_requests Int @default(0)
failed_requests Int @default(0)
created_at DateTime @default(now())
updated_at DateTime @updatedAt
@ -352,4 +354,3 @@ enum JobStatus {
INACTIVE
}

View file

@ -10,14 +10,24 @@ import traceback
from datetime import datetime, timedelta
from email.mime.multipart import MIMEMultipart
from email.mime.text import MIMEText
from typing import TYPE_CHECKING, Any, Dict, List, Literal, Optional, Union, overload
from typing import (
TYPE_CHECKING,
Any,
Dict,
List,
Literal,
Optional,
Union,
cast,
overload,
)
from litellm.proxy._types import (
DB_CONNECTION_ERROR_TYPES,
CommonProxyErrors,
DailyUserSpendTransaction,
ProxyErrorTypes,
ProxyException,
SpendLogsMetadata,
SpendLogsPayload,
)
from litellm.types.guardrails import GuardrailEventHooks
@ -1100,14 +1110,7 @@ def jsonify_object(data: dict) -> dict:
class PrismaClient:
user_list_transactions: dict = {}
end_user_list_transactions: dict = {}
key_list_transactions: dict = {}
team_list_transactions: dict = {}
team_member_list_transactions: dict = {} # key is ["team_id" + "user_id"]
org_list_transactions: dict = {}
spend_log_transactions: List = []
daily_user_spend_transactions: Dict[str, DailyUserSpendTransaction] = {}
def __init__(
self,
@ -1145,62 +1148,40 @@ class PrismaClient:
) # Client to connect to Prisma db
verbose_proxy_logger.debug("Success - Created Prisma Client")
def add_spend_log_transaction_to_daily_user_transaction(
def get_request_status(
self, payload: Union[dict, SpendLogsPayload]
):
) -> Literal["success", "failure"]:
"""
Add a spend log transaction to the daily user transaction list
Determine if a request was successful or failed based on payload metadata.
Key = @@unique([user_id, date, api_key, model, custom_llm_provider]) )
Args:
payload (Union[dict, SpendLogsPayload]): Request payload containing metadata
If key exists, update the transaction with the new spend and usage
Returns:
Literal["success", "failure"]: Request status
"""
expected_keys = ["user", "startTime", "api_key", "model", "custom_llm_provider"]
if not all(key in payload for key in expected_keys):
verbose_proxy_logger.debug(
f"Missing expected keys: {expected_keys}, in payload, skipping from daily_user_spend_transactions"
)
return
if isinstance(payload["startTime"], datetime):
start_time = payload["startTime"].isoformat()
date = start_time.split("T")[0]
elif isinstance(payload["startTime"], str):
date = payload["startTime"].split("T")[0]
else:
verbose_proxy_logger.debug(
f"Invalid start time: {payload['startTime']}, skipping from daily_user_spend_transactions"
)
return
try:
daily_transaction_key = f"{payload['user']}_{date}_{payload['api_key']}_{payload['model']}_{payload['custom_llm_provider']}"
if daily_transaction_key in self.daily_user_spend_transactions:
daily_transaction = self.daily_user_spend_transactions[
daily_transaction_key
]
daily_transaction["spend"] += payload["spend"]
daily_transaction["prompt_tokens"] += payload["prompt_tokens"]
daily_transaction["completion_tokens"] += payload["completion_tokens"]
daily_transaction["api_requests"] += 1
else:
daily_transaction = DailyUserSpendTransaction(
user_id=payload["user"],
date=date,
api_key=payload["api_key"],
model=payload["model"],
model_group=payload["model_group"],
custom_llm_provider=payload["custom_llm_provider"],
prompt_tokens=payload["prompt_tokens"],
completion_tokens=payload["completion_tokens"],
spend=payload["spend"],
api_requests=1,
# Get metadata and convert to dict if it's a JSON string
payload_metadata: Union[Dict, SpendLogsMetadata, str] = payload.get(
"metadata", {}
)
if isinstance(payload_metadata, str):
payload_metadata_json: Union[Dict, SpendLogsMetadata] = cast(
Dict, json.loads(payload_metadata)
)
else:
payload_metadata_json = payload_metadata
self.daily_user_spend_transactions[
daily_transaction_key
] = daily_transaction
except Exception as e:
raise e
# Check status in metadata dict
return (
"failure"
if payload_metadata_json.get("status") == "failure"
else "success"
)
except (json.JSONDecodeError, AttributeError):
# Default to success if metadata parsing fails
return "success"
def hash_token(self, token: str):
# Hash the string using SHA-256
@ -2422,7 +2403,10 @@ def _hash_token_if_needed(token: str) -> str:
class ProxyUpdateSpend:
@staticmethod
async def update_end_user_spend(
n_retry_times: int, prisma_client: PrismaClient, proxy_logging_obj: ProxyLogging
n_retry_times: int,
prisma_client: PrismaClient,
proxy_logging_obj: ProxyLogging,
end_user_list_transactions: Dict[str, float],
):
for i in range(n_retry_times + 1):
start_time = time.time()
@ -2434,7 +2418,7 @@ class ProxyUpdateSpend:
for (
end_user_id,
response_cost,
) in prisma_client.end_user_list_transactions.items():
) in end_user_list_transactions.items():
if litellm.max_end_user_budget is not None:
pass
batcher.litellm_endusertable.upsert(
@ -2461,10 +2445,6 @@ class ProxyUpdateSpend:
_raise_failed_update_spend_exception(
e=e, start_time=start_time, proxy_logging_obj=proxy_logging_obj
)
finally:
prisma_client.end_user_list_transactions = (
{}
) # reset the end user list transactions - prevent bad data from causing issues
@staticmethod
async def update_spend_logs(
@ -2538,120 +2518,6 @@ class ProxyUpdateSpend:
e=e, start_time=start_time, proxy_logging_obj=proxy_logging_obj
)
@staticmethod
async def update_daily_user_spend(
n_retry_times: int,
prisma_client: PrismaClient,
proxy_logging_obj: ProxyLogging,
):
"""
Batch job to update LiteLLM_DailyUserSpend table using in-memory daily_spend_transactions
"""
BATCH_SIZE = (
100 # Number of aggregated records to update in each database operation
)
start_time = time.time()
try:
for i in range(n_retry_times + 1):
try:
# Get transactions to process
transactions_to_process = dict(
list(prisma_client.daily_user_spend_transactions.items())[
:BATCH_SIZE
]
)
if len(transactions_to_process) == 0:
verbose_proxy_logger.debug(
"No new transactions to process for daily spend update"
)
break
# Update DailyUserSpend table in batches
async with prisma_client.db.batch_() as batcher:
for _, transaction in transactions_to_process.items():
user_id = transaction.get("user_id")
if not user_id: # Skip if no user_id
continue
batcher.litellm_dailyuserspend.upsert(
where={
"user_id_date_api_key_model_custom_llm_provider": {
"user_id": user_id,
"date": transaction["date"],
"api_key": transaction["api_key"],
"model": transaction["model"],
"custom_llm_provider": transaction.get(
"custom_llm_provider"
),
}
},
data={
"create": {
"user_id": user_id,
"date": transaction["date"],
"api_key": transaction["api_key"],
"model": transaction["model"],
"model_group": transaction.get("model_group"),
"custom_llm_provider": transaction.get(
"custom_llm_provider"
),
"prompt_tokens": transaction["prompt_tokens"],
"completion_tokens": transaction[
"completion_tokens"
],
"spend": transaction["spend"],
"api_requests": transaction["api_requests"],
},
"update": {
"prompt_tokens": {
"increment": transaction["prompt_tokens"]
},
"completion_tokens": {
"increment": transaction[
"completion_tokens"
]
},
"spend": {"increment": transaction["spend"]},
"api_requests": {
"increment": transaction["api_requests"]
},
},
},
)
verbose_proxy_logger.info(
f"Processed {len(transactions_to_process)} daily spend transactions in {time.time() - start_time:.2f}s"
)
# Remove processed transactions
for key in transactions_to_process.keys():
prisma_client.daily_user_spend_transactions.pop(key, None)
verbose_proxy_logger.debug(
f"Processed {len(transactions_to_process)} daily spend transactions in {time.time() - start_time:.2f}s"
)
break
except DB_CONNECTION_ERROR_TYPES as e:
if i >= n_retry_times:
_raise_failed_update_spend_exception(
e=e,
start_time=start_time,
proxy_logging_obj=proxy_logging_obj,
)
await asyncio.sleep(2**i) # Exponential backoff
except Exception as e:
# Remove processed transactions even if there was an error
if "transactions_to_process" in locals():
for key in transactions_to_process.keys(): # type: ignore
prisma_client.daily_user_spend_transactions.pop(key, None)
_raise_failed_update_spend_exception(
e=e, start_time=start_time, proxy_logging_obj=proxy_logging_obj
)
@staticmethod
def disable_spend_updates() -> bool:
"""
@ -2701,20 +2567,6 @@ async def update_spend( # noqa: PLR0915
db_writer_client=db_writer_client,
)
### UPDATE DAILY USER SPEND ###
verbose_proxy_logger.debug(
"Daily User Spend transactions: {}".format(
len(prisma_client.daily_user_spend_transactions)
)
)
if len(prisma_client.daily_user_spend_transactions) > 0:
await ProxyUpdateSpend.update_daily_user_spend(
n_retry_times=n_retry_times,
prisma_client=prisma_client,
proxy_logging_obj=proxy_logging_obj,
)
def _raise_failed_update_spend_exception(
e: Exception, start_time: float, proxy_logging_obj: ProxyLogging

View file

@ -0,0 +1,83 @@
from typing import Any, Dict, List, Literal, Optional, TypedDict, Union
from typing_extensions import TypeAlias
class AnthropicResponseTextBlock(TypedDict, total=False):
"""
Anthropic Response Text Block: https://docs.anthropic.com/en/api/messages
"""
citations: Optional[List[Dict[str, Any]]]
text: str
type: Literal["text"]
class AnthropicResponseToolUseBlock(TypedDict, total=False):
"""
Anthropic Response Tool Use Block: https://docs.anthropic.com/en/api/messages
"""
id: Optional[str]
input: Optional[str]
name: Optional[str]
type: Literal["tool_use"]
class AnthropicResponseThinkingBlock(TypedDict, total=False):
"""
Anthropic Response Thinking Block: https://docs.anthropic.com/en/api/messages
"""
signature: Optional[str]
thinking: Optional[str]
type: Literal["thinking"]
class AnthropicResponseRedactedThinkingBlock(TypedDict, total=False):
"""
Anthropic Response Redacted Thinking Block: https://docs.anthropic.com/en/api/messages
"""
data: Optional[str]
type: Literal["redacted_thinking"]
AnthropicResponseContentBlock: TypeAlias = Union[
AnthropicResponseTextBlock,
AnthropicResponseToolUseBlock,
AnthropicResponseThinkingBlock,
AnthropicResponseRedactedThinkingBlock,
]
class AnthropicUsage(TypedDict, total=False):
"""
Input and output tokens used in the request
"""
input_tokens: int
output_tokens: int
"""
Cache Tokens Used
"""
cache_creation_input_tokens: int
cache_read_input_tokens: int
class AnthropicMessagesResponse(TypedDict, total=False):
"""
Anthropic Messages API Response: https://docs.anthropic.com/en/api/messages
"""
content: Optional[List[AnthropicResponseContentBlock]]
id: str
model: Optional[str] # This represents the Model type from Anthropic
role: Optional[Literal["assistant"]]
stop_reason: Optional[
Literal["end_turn", "max_tokens", "stop_sequence", "tool_use"]
]
stop_sequence: Optional[str]
type: Optional[Literal["message"]]
usage: Optional[AnthropicUsage]
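A hedged example of the response shape these TypedDicts describe (hand-constructed placeholder values, placed in the same module so the names above are in scope; all fields are total=False, so partial dicts are allowed):
example_response: AnthropicMessagesResponse = {
    "id": "msg_123",
    "type": "message",
    "role": "assistant",
    "model": "claude-3-7-sonnet",
    "stop_reason": "end_turn",
    "stop_sequence": None,
    "content": [
        {"type": "text", "text": "Hello!", "citations": None},
        {"type": "thinking", "thinking": "internal reasoning", "signature": None},
    ],
    "usage": {"input_tokens": 12, "output_tokens": 5},
}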

View file

@ -1113,3 +1113,6 @@ ResponsesAPIStreamingResponse = Annotated[
],
Discriminator("type"),
]
REASONING_EFFORT = Literal["low", "medium", "high"]

View file

@ -0,0 +1,9 @@
import json
from enum import Enum
from typing import Any, Dict, List, Literal, Optional, Tuple, TypedDict, Union
class OpenRouterErrorMessage(TypedDict):
message: str
code: int
metadata: Dict

View file

@ -5901,9 +5901,10 @@ class ModelResponseIterator:
class ModelResponseListIterator:
def __init__(self, model_responses):
def __init__(self, model_responses, delay: Optional[float] = None):
self.model_responses = model_responses
self.index = 0
self.delay = delay
# Sync iterator
def __iter__(self):
@ -5914,6 +5915,8 @@ class ModelResponseListIterator:
raise StopIteration
model_response = self.model_responses[self.index]
self.index += 1
if self.delay:
time.sleep(self.delay)
return model_response
# Async iterator
@ -5925,6 +5928,8 @@ class ModelResponseListIterator:
raise StopAsyncIteration
model_response = self.model_responses[self.index]
self.index += 1
if self.delay:
await asyncio.sleep(self.delay)
return model_response

View file

@ -4453,6 +4453,42 @@
"source": "https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models#foundation_models",
"supports_tool_choice": true
},
"gemini-2.5-pro-exp-03-25": {
"max_tokens": 65536,
"max_input_tokens": 1048576,
"max_output_tokens": 65536,
"max_images_per_prompt": 3000,
"max_videos_per_prompt": 10,
"max_video_length": 1,
"max_audio_length_hours": 8.4,
"max_audio_per_prompt": 1,
"max_pdf_size_mb": 30,
"input_cost_per_image": 0,
"input_cost_per_video_per_second": 0,
"input_cost_per_audio_per_second": 0,
"input_cost_per_token": 0,
"input_cost_per_character": 0,
"input_cost_per_token_above_128k_tokens": 0,
"input_cost_per_character_above_128k_tokens": 0,
"input_cost_per_image_above_128k_tokens": 0,
"input_cost_per_video_per_second_above_128k_tokens": 0,
"input_cost_per_audio_per_second_above_128k_tokens": 0,
"output_cost_per_token": 0,
"output_cost_per_character": 0,
"output_cost_per_token_above_128k_tokens": 0,
"output_cost_per_character_above_128k_tokens": 0,
"litellm_provider": "vertex_ai-language-models",
"mode": "chat",
"supports_system_messages": true,
"supports_function_calling": true,
"supports_vision": true,
"supports_audio_input": true,
"supports_video_input": true,
"supports_pdf_input": true,
"supports_response_schema": true,
"supports_tool_choice": true,
"source": "https://cloud.google.com/vertex-ai/generative-ai/pricing"
},
"gemini-2.0-pro-exp-02-05": {
"max_tokens": 8192,
"max_input_tokens": 2097152,
@ -10189,6 +10225,22 @@
"litellm_provider": "voyage",
"mode": "rerank"
},
"databricks/databricks-claude-3-7-sonnet": {
"max_tokens": 200000,
"max_input_tokens": 200000,
"max_output_tokens": 128000,
"input_cost_per_token": 0.0000025,
"input_dbu_cost_per_token": 0.00003571,
"output_cost_per_token": 0.00017857,
"output_db_cost_per_token": 0.000214286,
"litellm_provider": "databricks",
"mode": "chat",
"source": "https://www.databricks.com/product/pricing/foundation-model-serving",
"metadata": {"notes": "Input/output cost per token is dbu cost * $0.070, based on databricks Claude 3.7 conversion. Number provided for reference, '*_dbu_cost_per_token' used in actual calculation."},
"supports_assistant_prefill": true,
"supports_function_calling": true,
"supports_tool_choice": true
},
"databricks/databricks-meta-llama-3-1-405b-instruct": {
"max_tokens": 128000,
"max_input_tokens": 128000,
@ -10217,7 +10269,7 @@
"metadata": {"notes": "Input/output cost per token is dbu cost * $0.070, based on databricks Llama 3.1 70B conversion. Number provided for reference, '*_dbu_cost_per_token' used in actual calculation."},
"supports_tool_choice": true
},
"databricks/meta-llama-3.3-70b-instruct": {
"databricks/databricks-meta-llama-3-3-70b-instruct": {
"max_tokens": 128000,
"max_input_tokens": 128000,
"max_output_tokens": 128000,

View file

@ -2,6 +2,7 @@
warn_return_any = False
ignore_missing_imports = True
mypy_path = litellm/stubs
namespace_packages = True
[mypy-google.*]
ignore_missing_imports = True

poetry.lock generated
View file

@ -1151,69 +1151,6 @@ files = [
[package.extras]
protobuf = ["grpcio-tools (>=1.70.0)"]
[[package]]
name = "grpcio"
version = "1.71.0"
description = "HTTP/2-based RPC framework"
optional = true
python-versions = ">=3.9"
files = [
{file = "grpcio-1.71.0-cp310-cp310-linux_armv7l.whl", hash = "sha256:c200cb6f2393468142eb50ab19613229dcc7829b5ccee8b658a36005f6669fdd"},
{file = "grpcio-1.71.0-cp310-cp310-macosx_12_0_universal2.whl", hash = "sha256:b2266862c5ad664a380fbbcdbdb8289d71464c42a8c29053820ee78ba0119e5d"},
{file = "grpcio-1.71.0-cp310-cp310-manylinux_2_17_aarch64.whl", hash = "sha256:0ab8b2864396663a5b0b0d6d79495657ae85fa37dcb6498a2669d067c65c11ea"},
{file = "grpcio-1.71.0-cp310-cp310-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c30f393f9d5ff00a71bb56de4aa75b8fe91b161aeb61d39528db6b768d7eac69"},
{file = "grpcio-1.71.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f250ff44843d9a0615e350c77f890082102a0318d66a99540f54769c8766ab73"},
{file = "grpcio-1.71.0-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:e6d8de076528f7c43a2f576bc311799f89d795aa6c9b637377cc2b1616473804"},
{file = "grpcio-1.71.0-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:9b91879d6da1605811ebc60d21ab6a7e4bae6c35f6b63a061d61eb818c8168f6"},
{file = "grpcio-1.71.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:f71574afdf944e6652203cd1badcda195b2a27d9c83e6d88dc1ce3cfb73b31a5"},
{file = "grpcio-1.71.0-cp310-cp310-win32.whl", hash = "sha256:8997d6785e93308f277884ee6899ba63baafa0dfb4729748200fcc537858a509"},
{file = "grpcio-1.71.0-cp310-cp310-win_amd64.whl", hash = "sha256:7d6ac9481d9d0d129224f6d5934d5832c4b1cddb96b59e7eba8416868909786a"},
{file = "grpcio-1.71.0-cp311-cp311-linux_armv7l.whl", hash = "sha256:d6aa986318c36508dc1d5001a3ff169a15b99b9f96ef5e98e13522c506b37eef"},
{file = "grpcio-1.71.0-cp311-cp311-macosx_10_14_universal2.whl", hash = "sha256:d2c170247315f2d7e5798a22358e982ad6eeb68fa20cf7a820bb74c11f0736e7"},
{file = "grpcio-1.71.0-cp311-cp311-manylinux_2_17_aarch64.whl", hash = "sha256:e6f83a583ed0a5b08c5bc7a3fe860bb3c2eac1f03f1f63e0bc2091325605d2b7"},
{file = "grpcio-1.71.0-cp311-cp311-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:4be74ddeeb92cc87190e0e376dbc8fc7736dbb6d3d454f2fa1f5be1dee26b9d7"},
{file = "grpcio-1.71.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4dd0dfbe4d5eb1fcfec9490ca13f82b089a309dc3678e2edabc144051270a66e"},
{file = "grpcio-1.71.0-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:a2242d6950dc892afdf9e951ed7ff89473aaf744b7d5727ad56bdaace363722b"},
{file = "grpcio-1.71.0-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:0fa05ee31a20456b13ae49ad2e5d585265f71dd19fbd9ef983c28f926d45d0a7"},
{file = "grpcio-1.71.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:3d081e859fb1ebe176de33fc3adb26c7d46b8812f906042705346b314bde32c3"},
{file = "grpcio-1.71.0-cp311-cp311-win32.whl", hash = "sha256:d6de81c9c00c8a23047136b11794b3584cdc1460ed7cbc10eada50614baa1444"},
{file = "grpcio-1.71.0-cp311-cp311-win_amd64.whl", hash = "sha256:24e867651fc67717b6f896d5f0cac0ec863a8b5fb7d6441c2ab428f52c651c6b"},
{file = "grpcio-1.71.0-cp312-cp312-linux_armv7l.whl", hash = "sha256:0ff35c8d807c1c7531d3002be03221ff9ae15712b53ab46e2a0b4bb271f38537"},
{file = "grpcio-1.71.0-cp312-cp312-macosx_10_14_universal2.whl", hash = "sha256:b78a99cd1ece4be92ab7c07765a0b038194ded2e0a26fd654591ee136088d8d7"},
{file = "grpcio-1.71.0-cp312-cp312-manylinux_2_17_aarch64.whl", hash = "sha256:dc1a1231ed23caac1de9f943d031f1bc38d0f69d2a3b243ea0d664fc1fbd7fec"},
{file = "grpcio-1.71.0-cp312-cp312-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e6beeea5566092c5e3c4896c6d1d307fb46b1d4bdf3e70c8340b190a69198594"},
{file = "grpcio-1.71.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d5170929109450a2c031cfe87d6716f2fae39695ad5335d9106ae88cc32dc84c"},
{file = "grpcio-1.71.0-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:5b08d03ace7aca7b2fadd4baf291139b4a5f058805a8327bfe9aece7253b6d67"},
{file = "grpcio-1.71.0-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:f903017db76bf9cc2b2d8bdd37bf04b505bbccad6be8a81e1542206875d0e9db"},
{file = "grpcio-1.71.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:469f42a0b410883185eab4689060a20488a1a0a00f8bbb3cbc1061197b4c5a79"},
{file = "grpcio-1.71.0-cp312-cp312-win32.whl", hash = "sha256:ad9f30838550695b5eb302add33f21f7301b882937460dd24f24b3cc5a95067a"},
{file = "grpcio-1.71.0-cp312-cp312-win_amd64.whl", hash = "sha256:652350609332de6dac4ece254e5d7e1ff834e203d6afb769601f286886f6f3a8"},
{file = "grpcio-1.71.0-cp313-cp313-linux_armv7l.whl", hash = "sha256:cebc1b34ba40a312ab480ccdb396ff3c529377a2fce72c45a741f7215bfe8379"},
{file = "grpcio-1.71.0-cp313-cp313-macosx_10_14_universal2.whl", hash = "sha256:85da336e3649a3d2171e82f696b5cad2c6231fdd5bad52616476235681bee5b3"},
{file = "grpcio-1.71.0-cp313-cp313-manylinux_2_17_aarch64.whl", hash = "sha256:f9a412f55bb6e8f3bb000e020dbc1e709627dcb3a56f6431fa7076b4c1aab0db"},
{file = "grpcio-1.71.0-cp313-cp313-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:47be9584729534660416f6d2a3108aaeac1122f6b5bdbf9fd823e11fe6fbaa29"},
{file = "grpcio-1.71.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7c9c80ac6091c916db81131d50926a93ab162a7e97e4428ffc186b6e80d6dda4"},
{file = "grpcio-1.71.0-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:789d5e2a3a15419374b7b45cd680b1e83bbc1e52b9086e49308e2c0b5bbae6e3"},
{file = "grpcio-1.71.0-cp313-cp313-musllinux_1_1_i686.whl", hash = "sha256:1be857615e26a86d7363e8a163fade914595c81fec962b3d514a4b1e8760467b"},
{file = "grpcio-1.71.0-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:a76d39b5fafd79ed604c4be0a869ec3581a172a707e2a8d7a4858cb05a5a7637"},
{file = "grpcio-1.71.0-cp313-cp313-win32.whl", hash = "sha256:74258dce215cb1995083daa17b379a1a5a87d275387b7ffe137f1d5131e2cfbb"},
{file = "grpcio-1.71.0-cp313-cp313-win_amd64.whl", hash = "sha256:22c3bc8d488c039a199f7a003a38cb7635db6656fa96437a8accde8322ce2366"},
{file = "grpcio-1.71.0-cp39-cp39-linux_armv7l.whl", hash = "sha256:c6a0a28450c16809f94e0b5bfe52cabff63e7e4b97b44123ebf77f448534d07d"},
{file = "grpcio-1.71.0-cp39-cp39-macosx_10_14_universal2.whl", hash = "sha256:a371e6b6a5379d3692cc4ea1cb92754d2a47bdddeee755d3203d1f84ae08e03e"},
{file = "grpcio-1.71.0-cp39-cp39-manylinux_2_17_aarch64.whl", hash = "sha256:39983a9245d37394fd59de71e88c4b295eb510a3555e0a847d9965088cdbd033"},
{file = "grpcio-1.71.0-cp39-cp39-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:9182e0063112e55e74ee7584769ec5a0b4f18252c35787f48738627e23a62b97"},
{file = "grpcio-1.71.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:693bc706c031aeb848849b9d1c6b63ae6bcc64057984bb91a542332b75aa4c3d"},
{file = "grpcio-1.71.0-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:20e8f653abd5ec606be69540f57289274c9ca503ed38388481e98fa396ed0b41"},
{file = "grpcio-1.71.0-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:8700a2a57771cc43ea295296330daaddc0d93c088f0a35cc969292b6db959bf3"},
{file = "grpcio-1.71.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:d35a95f05a8a2cbe8e02be137740138b3b2ea5f80bd004444e4f9a1ffc511e32"},
{file = "grpcio-1.71.0-cp39-cp39-win32.whl", hash = "sha256:f9c30c464cb2ddfbc2ddf9400287701270fdc0f14be5f08a1e3939f1e749b455"},
{file = "grpcio-1.71.0-cp39-cp39-win_amd64.whl", hash = "sha256:63e41b91032f298b3e973b3fa4093cbbc620c875e2da7b93e249d4728b54559a"},
{file = "grpcio-1.71.0.tar.gz", hash = "sha256:2b85f7820475ad3edec209d3d89a7909ada16caab05d3f2e08a7e8ae3200a55c"},
]
[package.extras]
protobuf = ["grpcio-tools (>=1.71.0)"]
[[package]]
name = "grpcio-status"
version = "1.70.0"
@ -1230,22 +1167,6 @@ googleapis-common-protos = ">=1.5.5"
grpcio = ">=1.70.0"
protobuf = ">=5.26.1,<6.0dev"
[[package]]
name = "grpcio-status"
version = "1.71.0"
description = "Status proto mapping for gRPC"
optional = true
python-versions = ">=3.9"
files = [
{file = "grpcio_status-1.71.0-py3-none-any.whl", hash = "sha256:843934ef8c09e3e858952887467f8256aac3910c55f077a359a65b2b3cde3e68"},
{file = "grpcio_status-1.71.0.tar.gz", hash = "sha256:11405fed67b68f406b3f3c7c5ae5104a79d2d309666d10d61b152e91d28fb968"},
]
[package.dependencies]
googleapis-common-protos = ">=1.5.5"
grpcio = ">=1.71.0"
protobuf = ">=5.26.1,<6.0dev"
[[package]]
name = "gunicorn"
version = "23.0.0"
@ -1678,13 +1599,13 @@ referencing = ">=0.31.0"
[[package]]
name = "litellm-proxy-extras"
version = "0.1.1"
version = "0.1.2"
description = "Additional files for the LiteLLM Proxy. Reduces the size of the main litellm package."
optional = true
python-versions = "!=2.7.*,!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*,!=3.7.*,>=3.8"
files = [
{file = "litellm_proxy_extras-0.1.1-py3-none-any.whl", hash = "sha256:2b3c4c5474bacbde2424c1cd13b21f85c65e9c4346f6159badd49a210eedef5c"},
{file = "litellm_proxy_extras-0.1.1.tar.gz", hash = "sha256:a1eb911ad2e3742238863d314a8bd6d02dd0cc213ba040b2c0593f132fbf3117"},
{file = "litellm_proxy_extras-0.1.2-py3-none-any.whl", hash = "sha256:2caa7bdba5a533cd1781b55e3f7c581138d2a5b68a7e6d737327669dd21d5e08"},
{file = "litellm_proxy_extras-0.1.2.tar.gz", hash = "sha256:218e97980ab5a34eed7dcd1564a910c9a790168d672cdec3c464eba9b7cb1518"},
]
[[package]]
@ -4135,4 +4056,4 @@ proxy = ["PyJWT", "apscheduler", "backoff", "boto3", "cryptography", "fastapi",
[metadata]
lock-version = "2.0"
python-versions = ">=3.8.1,<4.0, !=3.9.7"
content-hash = "16cbf20784776377805f5e33c6bc97dce76303132aa3d81c7e6fe743f0ee3fc1"
content-hash = "524b2f8276ba057f8dc8a79dd460c1a243ef4aece7c08a8bf344e029e07b8841"

View file

@ -1,6 +1,6 @@
[tool.poetry]
name = "litellm"
version = "1.65.1"
version = "1.65.2"
description = "Library to easily interface with LLM API providers"
authors = ["BerriAI"]
license = "MIT"
@ -55,7 +55,7 @@ websockets = {version = "^13.1.0", optional = true}
boto3 = {version = "1.34.34", optional = true}
redisvl = {version = "^0.4.1", optional = true, markers = "python_version >= '3.9' and python_version < '3.14'"}
mcp = {version = "1.5.0", optional = true, python = ">=3.10"}
litellm-proxy-extras = {version = "0.1.1", optional = true}
litellm-proxy-extras = {version = "0.1.2", optional = true}
[tool.poetry.extras]
proxy = [
@ -117,7 +117,7 @@ requires = ["poetry-core", "wheel"]
build-backend = "poetry.core.masonry.api"
[tool.commitizen]
version = "1.65.1"
version = "1.65.2"
version_files = [
"pyproject.toml:^version"
]

View file

@ -38,7 +38,7 @@ sentry_sdk==2.21.0 # for sentry error handling
detect-secrets==1.5.0 # Enterprise - secret detection / masking in LLM requests
cryptography==43.0.1
tzdata==2025.1 # IANA time zone database
litellm-proxy-extras==0.1.1 # for proxy extras - e.g. prisma migrations
litellm-proxy-extras==0.1.2 # for proxy extras - e.g. prisma migrations
### LITELLM PACKAGE DEPENDENCIES
python-dotenv==1.0.0 # for env

View file

@ -327,6 +327,8 @@ model LiteLLM_DailyUserSpend {
completion_tokens Int @default(0)
spend Float @default(0.0)
api_requests Int @default(0)
successful_requests Int @default(0)
failed_requests Int @default(0)
created_at DateTime @default(now())
updated_at DateTime @updatedAt
@ -351,3 +353,4 @@ enum JobStatus {
ACTIVE
INACTIVE
}
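The two new columns mean each daily-spend row now tracks success and failure counts alongside token and cost totals. A minimal sketch of folding one request outcome into such a row (hypothetical helper, not the proxy's actual update path):

from dataclasses import dataclass

@dataclass
class DailyUserSpendRow:
    prompt_tokens: int = 0
    completion_tokens: int = 0
    spend: float = 0.0
    api_requests: int = 0
    successful_requests: int = 0
    failed_requests: int = 0

def record_request(row: DailyUserSpendRow, cost: float, prompt_tokens: int,
                   completion_tokens: int, succeeded: bool) -> None:
    # Every request bumps the aggregate counters; success/failure is tracked separately.
    row.spend += cost
    row.prompt_tokens += prompt_tokens
    row.completion_tokens += completion_tokens
    row.api_requests += 1
    if succeeded:
        row.successful_requests += 1
    else:
        row.failed_requests += 1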

View file

@ -1,6 +1,7 @@
import json
import os
import sys
import time
from unittest.mock import MagicMock, Mock, patch
import pytest
@ -19,6 +20,7 @@ from litellm.types.utils import (
Delta,
ModelResponseStream,
PromptTokensDetailsWrapper,
StandardLoggingPayload,
StreamingChoices,
Usage,
)
@ -36,6 +38,22 @@ def initialized_custom_stream_wrapper() -> CustomStreamWrapper:
return streaming_handler
@pytest.fixture
def logging_obj() -> Logging:
import time
logging_obj = Logging(
model="my-random-model",
messages=[{"role": "user", "content": "Hey"}],
stream=True,
call_type="completion",
start_time=time.time(),
litellm_call_id="12345",
function_id="1245",
)
return logging_obj
bedrock_chunks = [
ModelResponseStream(
id="chatcmpl-d249def8-a78b-464c-87b5-3a6f43565292",
@ -577,3 +595,36 @@ def test_streaming_handler_with_stop_chunk(
**args, model_response=ModelResponseStream()
)
assert returned_chunk is None
@pytest.mark.asyncio
async def test_streaming_completion_start_time(logging_obj: Logging):
"""Test that the start time is set correctly"""
from litellm.integrations.custom_logger import CustomLogger
class MockCallback(CustomLogger):
pass
mock_callback = MockCallback()
litellm.success_callback = [mock_callback, "langfuse"]
completion_stream = ModelResponseListIterator(
model_responses=bedrock_chunks, delay=0.1
)
response = CustomStreamWrapper(
completion_stream=completion_stream,
model="bedrock/claude-3-5-sonnet-20240620-v1:0",
logging_obj=logging_obj,
)
async for chunk in response:
print(chunk)
await asyncio.sleep(2)
assert logging_obj.model_call_details["completion_start_time"] is not None
assert (
logging_obj.model_call_details["completion_start_time"]
< logging_obj.model_call_details["end_time"]
)

View file

@ -0,0 +1,81 @@
import json
import os
import sys
from unittest.mock import AsyncMock, MagicMock, patch
import httpx
import pytest
sys.path.insert(
0, os.path.abspath("../../../../..")
) # Adds the parent directory to the system path
from litellm.llms.openrouter.chat.transformation import (
OpenRouterChatCompletionStreamingHandler,
OpenRouterException,
)
class TestOpenRouterChatCompletionStreamingHandler:
def test_chunk_parser_successful(self):
handler = OpenRouterChatCompletionStreamingHandler(
streaming_response=None, sync_stream=True
)
# Test input chunk
chunk = {
"id": "test_id",
"created": 1234567890,
"model": "test_model",
"choices": [
{"delta": {"content": "test content", "reasoning": "test reasoning"}}
],
}
# Parse chunk
result = handler.chunk_parser(chunk)
# Verify response
assert result.id == "test_id"
assert result.object == "chat.completion.chunk"
assert result.created == 1234567890
assert result.model == "test_model"
assert len(result.choices) == 1
assert result.choices[0]["delta"]["reasoning_content"] == "test reasoning"
def test_chunk_parser_error_response(self):
handler = OpenRouterChatCompletionStreamingHandler(
streaming_response=None, sync_stream=True
)
# Test error chunk
error_chunk = {
"error": {
"message": "test error",
"code": 400,
"metadata": {"key": "value"},
"user_id": "test_user",
}
}
# Verify error handling
with pytest.raises(OpenRouterException) as exc_info:
handler.chunk_parser(error_chunk)
assert "Message: test error" in str(exc_info.value)
assert exc_info.value.status_code == 400
def test_chunk_parser_key_error(self):
handler = OpenRouterChatCompletionStreamingHandler(
streaming_response=None, sync_stream=True
)
# Test invalid chunk missing required fields
invalid_chunk = {"incomplete": "data"}
# Verify KeyError handling
with pytest.raises(OpenRouterException) as exc_info:
handler.chunk_parser(invalid_chunk)
assert "KeyError" in str(exc_info.value)
assert exc_info.value.status_code == 400
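A minimal sketch of the parsing contract these three cases pin down, using simplified names; this illustrates the expected behaviour rather than the actual OpenRouterChatCompletionStreamingHandler code:

class SketchOpenRouterError(Exception):
    def __init__(self, message: str, status_code: int):
        super().__init__(message)
        self.status_code = status_code

def sketch_chunk_parser(chunk: dict) -> dict:
    # Error payloads are surfaced as exceptions carrying the upstream status code.
    if "error" in chunk:
        raise SketchOpenRouterError(
            message=f"Message: {chunk['error']['message']}",
            status_code=chunk["error"].get("code", 400),
        )
    try:
        # OpenRouter's `reasoning` delta field is exposed as `reasoning_content`.
        choices = [
            {"delta": {**c["delta"], "reasoning_content": c["delta"].get("reasoning")}}
            for c in chunk["choices"]
        ]
        return {
            "id": chunk["id"],
            "object": "chat.completion.chunk",
            "created": chunk["created"],
            "model": chunk["model"],
            "choices": choices,
        }
    except KeyError as e:
        raise SketchOpenRouterError(message=f"KeyError: {e}", status_code=400)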

View file

@ -0,0 +1,97 @@
import json
import os
import sys
from unittest.mock import AsyncMock, MagicMock, patch
import httpx
import pytest
sys.path.insert(0, os.path.abspath("../../../../.."))
from litellm.llms.sagemaker.common_utils import AWSEventStreamDecoder
@pytest.mark.asyncio
async def test_aiter_bytes_unicode_decode_error():
"""
Test that AWSEventStreamDecoder.aiter_bytes() does not raise an error when encountering invalid UTF-8 bytes. (UnicodeDecodeError)
Ensures stream processing continues despite the error.
Relevant issue: https://github.com/BerriAI/litellm/issues/9165
"""
# Create an instance of AWSEventStreamDecoder
decoder = AWSEventStreamDecoder(model="test-model")
# Create a mock event that will trigger a UnicodeDecodeError
mock_event = MagicMock()
mock_event.to_response_dict.return_value = {
"status_code": 200,
"headers": {},
"body": b"\xff\xfe", # Invalid UTF-8 bytes
}
# Create a mock EventStreamBuffer that yields our mock event
mock_buffer = MagicMock()
mock_buffer.__iter__.return_value = [mock_event]
# Mock the EventStreamBuffer class
with patch("botocore.eventstream.EventStreamBuffer", return_value=mock_buffer):
# Create an async generator that yields some test bytes
async def mock_iterator():
yield b""
# Process the stream
chunks = []
async for chunk in decoder.aiter_bytes(mock_iterator()):
if chunk is not None:
print("chunk=", chunk)
chunks.append(chunk)
# Verify that processing continued despite the error
# The chunks list should be empty since we only sent invalid data
assert len(chunks) == 0
@pytest.mark.asyncio
async def test_aiter_bytes_valid_chunk_followed_by_unicode_error():
"""
Test that valid chunks are processed correctly even when followed by Unicode decode errors.
This ensures errors don't corrupt or prevent processing of valid data that came before.
Relevant issue: https://github.com/BerriAI/litellm/issues/9165
"""
decoder = AWSEventStreamDecoder(model="test-model")
# Create two mock events - first valid, then invalid
mock_valid_event = MagicMock()
mock_valid_event.to_response_dict.return_value = {
"status_code": 200,
"headers": {},
"body": json.dumps({"token": {"text": "hello"}}).encode(), # Valid data first
}
mock_invalid_event = MagicMock()
mock_invalid_event.to_response_dict.return_value = {
"status_code": 200,
"headers": {},
"body": b"\xff\xfe", # Invalid UTF-8 bytes second
}
# Create a mock EventStreamBuffer that yields valid event first, then invalid
mock_buffer = MagicMock()
mock_buffer.__iter__.return_value = [mock_valid_event, mock_invalid_event]
with patch("botocore.eventstream.EventStreamBuffer", return_value=mock_buffer):
async def mock_iterator():
yield b"test_bytes"
chunks = []
async for chunk in decoder.aiter_bytes(mock_iterator()):
if chunk is not None:
chunks.append(chunk)
# Verify we got our valid chunk despite the subsequent error
assert len(chunks) == 1
assert chunks[0]["text"] == "hello" # Verify the content of the valid chunk

View file

@ -1,137 +0,0 @@
import os
import sys
from unittest.mock import MagicMock, patch
import pytest
sys.path.insert(
0, os.path.abspath("../../../..")
) # Adds the parent directory to the system path
from litellm.llms.anthropic.chat.transformation import AnthropicConfig
def test_anthropic_prompt_caching_headers_for_vertex():
"""
Test that the prompt caching beta header is correctly set for Vertex AI requests
with Anthropic models when cache control is present in the messages.
"""
# Create an instance of AnthropicConfig
config = AnthropicConfig()
# Test case 1: Vertex request with prompt caching
# Create a message with cache control
messages = [
{
"role": "system",
"content": "You are a helpful assistant.",
"cache_control": {"type": "ephemeral"}
},
{
"role": "user",
"content": "Tell me about the solar system."
}
]
# Check if cache control is detected
is_cache_control_set = config.is_cache_control_set(messages=messages)
assert is_cache_control_set is True, "Cache control should be detected in messages"
# Generate headers for a Vertex AI request with prompt caching
headers = config.get_anthropic_headers(
api_key="test-api-key",
prompt_caching_set=is_cache_control_set,
is_vertex_request=True
)
# Verify that the anthropic-beta header is set with prompt-caching-2024-07-31
assert "anthropic-beta" in headers, "anthropic-beta header should be present"
assert "prompt-caching-2024-07-31" in headers["anthropic-beta"], "prompt-caching-2024-07-31 should be in the beta header"
# Test case 2: Vertex request without prompt caching
messages_without_cache = [
{
"role": "system",
"content": "You are a helpful assistant."
},
{
"role": "user",
"content": "Tell me about the solar system."
}
]
# Check if cache control is detected
is_cache_control_set = config.is_cache_control_set(messages=messages_without_cache)
assert is_cache_control_set is False, "Cache control should not be detected in messages"
# Generate headers for a Vertex AI request without prompt caching
headers = config.get_anthropic_headers(
api_key="test-api-key",
prompt_caching_set=is_cache_control_set,
is_vertex_request=True
)
# Verify that the anthropic-beta header is not set
assert "anthropic-beta" not in headers, "anthropic-beta header should not be present"
def test_anthropic_prompt_caching_with_content_blocks():
"""
Test that prompt caching is correctly detected when cache control is in content blocks.
"""
config = AnthropicConfig()
# Message with cache control in content blocks
messages = [
{
"role": "system",
"content": [
{
"type": "text",
"text": "You are a helpful assistant.",
"cache_control": {"type": "ephemeral"}
}
]
},
{
"role": "user",
"content": "Tell me about the solar system."
}
]
# Check if cache control is detected
is_cache_control_set = config.is_cache_control_set(messages=messages)
assert is_cache_control_set is True, "Cache control should be detected in content blocks"
# Generate headers for a Vertex AI request with prompt caching
headers = config.get_anthropic_headers(
api_key="test-api-key",
prompt_caching_set=is_cache_control_set,
is_vertex_request=True
)
# Verify that the anthropic-beta header is set with prompt-caching-2024-07-31
assert "anthropic-beta" in headers, "anthropic-beta header should be present"
assert "prompt-caching-2024-07-31" in headers["anthropic-beta"], "prompt-caching-2024-07-31 should be in the beta header"
def test_anthropic_vertex_other_beta_headers():
"""
Test that other beta headers are not included for Vertex AI requests.
"""
config = AnthropicConfig()
# Generate headers with multiple beta features
headers = config.get_anthropic_headers(
api_key="test-api-key",
prompt_caching_set=True,
computer_tool_used=True, # This should be excluded for Vertex
pdf_used=True, # This should be excluded for Vertex
is_vertex_request=True
)
# Verify that only prompt-caching is included in the beta header
assert "anthropic-beta" in headers, "anthropic-beta header should be present"
assert headers["anthropic-beta"] == "prompt-caching-2024-07-31", "Only prompt-caching should be in the beta header"
assert "computer-use-2024-10-22" not in headers["anthropic-beta"], "computer-use beta should not be included"
assert "pdfs-2024-09-25" not in headers["anthropic-beta"], "pdfs beta should not be included"

View file

@ -39,7 +39,7 @@ async def test_request_body_caching():
result1 = await _read_request_body(mock_request)
assert result1 == test_data
assert "parsed_body" in mock_request.scope
assert mock_request.scope["parsed_body"] == test_data
assert mock_request.scope["parsed_body"] == (("key",), {"key": "value"})
# Verify the body was read once
mock_request.body.assert_called_once()
@ -49,7 +49,7 @@ async def test_request_body_caching():
# Second call should use the cached body
result2 = await _read_request_body(mock_request)
assert result2 == test_data
assert result2 == {"key": "value"}
# Verify the body was not read again
mock_request.body.assert_not_called()
@ -75,7 +75,10 @@ async def test_form_data_parsing():
# Verify the form data was correctly parsed
assert result == test_data
assert "parsed_body" in mock_request.scope
assert mock_request.scope["parsed_body"] == test_data
assert mock_request.scope["parsed_body"] == (
("name", "message"),
{"name": "test_user", "message": "hello world"},
)
# Verify form() was called
mock_request.form.assert_called_once()
@ -101,7 +104,46 @@ async def test_empty_request_body():
# Verify an empty dict is returned
assert result == {}
assert "parsed_body" in mock_request.scope
assert mock_request.scope["parsed_body"] == {}
assert mock_request.scope["parsed_body"] == ((), {})
# Verify the body was read
mock_request.body.assert_called_once()
@pytest.mark.asyncio
async def test_circular_reference_handling():
"""
Test that cached request body isn't modified when the returned result is modified.
Demonstrates the mutable dictionary reference issue.
"""
# Create a mock request with initial data
mock_request = MagicMock()
initial_body = {
"model": "gpt-4",
"messages": [{"role": "user", "content": "Hello"}],
}
mock_request.body = AsyncMock(return_value=orjson.dumps(initial_body))
mock_request.headers = {"content-type": "application/json"}
mock_request.scope = {}
# First parse
result = await _read_request_body(mock_request)
# Verify initial parse
assert result["model"] == "gpt-4"
assert result["messages"] == [{"role": "user", "content": "Hello"}]
# Modify the result by adding proxy_server_request
result["proxy_server_request"] = {
"url": "http://0.0.0.0:4000/v1/chat/completions",
"method": "POST",
"headers": {"content-type": "application/json"},
"body": result, # Creates circular reference
}
# Second parse using the same request - should return the original body, not the mutated copy
result2 = await _read_request_body(mock_request)
assert (
"proxy_server_request" not in result2
) # Passes only if the cached body was not polluted by the mutation above
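The new test pins down a subtle contract: callers may mutate the dict they receive without polluting the per-request cache. One way to honour that, sketched here under the assumption of a copy-on-return cache (not necessarily how _read_request_body is implemented):

import copy

_CACHE_KEY = "parsed_body"

def get_parsed_body(scope: dict, raw_body: dict) -> dict:
    # Cache a snapshot once, then hand out a fresh copy on every read, so a caller
    # attaching e.g. proxy_server_request cannot change what later readers see.
    if _CACHE_KEY not in scope:
        scope[_CACHE_KEY] = copy.deepcopy(raw_body)
    return copy.deepcopy(scope[_CACHE_KEY])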

View file

@ -0,0 +1,264 @@
import asyncio
import json
import os
import sys
import pytest
from fastapi.testclient import TestClient
from litellm.proxy._types import (
DailyUserSpendTransaction,
Litellm_EntityType,
SpendUpdateQueueItem,
)
from litellm.proxy.db.db_transaction_queue.daily_spend_update_queue import (
DailySpendUpdateQueue,
)
from litellm.proxy.db.db_transaction_queue.spend_update_queue import SpendUpdateQueue
sys.path.insert(
0, os.path.abspath("../../..")
) # Adds the parent directory to the system path
@pytest.fixture
def daily_spend_update_queue():
return DailySpendUpdateQueue()
@pytest.mark.asyncio
async def test_empty_queue_flush(daily_spend_update_queue):
"""Test flushing an empty queue returns an empty list"""
result = await daily_spend_update_queue.flush_all_updates_from_in_memory_queue()
assert result == []
@pytest.mark.asyncio
async def test_add_single_update(daily_spend_update_queue):
"""Test adding a single update to the queue"""
test_key = "user1_2023-01-01_key123_gpt-4_openai"
test_transaction = {
"spend": 10.0,
"prompt_tokens": 100,
"completion_tokens": 50,
"api_requests": 1,
"successful_requests": 1,
"failed_requests": 0,
}
# Add update to queue
await daily_spend_update_queue.add_update({test_key: test_transaction})
# Flush and check
updates = await daily_spend_update_queue.flush_all_updates_from_in_memory_queue()
assert len(updates) == 1
assert test_key in updates[0]
assert updates[0][test_key] == test_transaction
@pytest.mark.asyncio
async def test_add_multiple_updates(daily_spend_update_queue):
"""Test adding multiple updates to the queue"""
test_key1 = "user1_2023-01-01_key123_gpt-4_openai"
test_transaction1 = {
"spend": 10.0,
"prompt_tokens": 100,
"completion_tokens": 50,
"api_requests": 1,
"successful_requests": 1,
"failed_requests": 0,
}
test_key2 = "user2_2023-01-01_key456_gpt-3.5-turbo_openai"
test_transaction2 = {
"spend": 5.0,
"prompt_tokens": 200,
"completion_tokens": 30,
"api_requests": 1,
"successful_requests": 1,
"failed_requests": 0,
}
# Add updates to queue
await daily_spend_update_queue.add_update({test_key1: test_transaction1})
await daily_spend_update_queue.add_update({test_key2: test_transaction2})
# Flush and check
updates = await daily_spend_update_queue.flush_all_updates_from_in_memory_queue()
assert len(updates) == 2
# Find each transaction in the list of updates
found_transaction1 = False
found_transaction2 = False
for update in updates:
if test_key1 in update:
assert update[test_key1] == test_transaction1
found_transaction1 = True
if test_key2 in update:
assert update[test_key2] == test_transaction2
found_transaction2 = True
assert found_transaction1
assert found_transaction2
@pytest.mark.asyncio
async def test_aggregated_daily_spend_update_empty(daily_spend_update_queue):
"""Test aggregating updates from an empty queue"""
result = (
await daily_spend_update_queue.flush_and_get_aggregated_daily_spend_update_transactions()
)
assert result == {}
@pytest.mark.asyncio
async def test_get_aggregated_daily_spend_update_transactions_single_key():
"""Test static method for aggregating a single key"""
test_key = "user1_2023-01-01_key123_gpt-4_openai"
test_transaction = {
"spend": 10.0,
"prompt_tokens": 100,
"completion_tokens": 50,
"api_requests": 1,
"successful_requests": 1,
"failed_requests": 0,
}
updates = [{test_key: test_transaction}]
# Test aggregation
result = DailySpendUpdateQueue.get_aggregated_daily_spend_update_transactions(
updates
)
assert len(result) == 1
assert test_key in result
assert result[test_key] == test_transaction
@pytest.mark.asyncio
async def test_get_aggregated_daily_spend_update_transactions_multiple_keys():
"""Test static method for aggregating multiple different keys"""
test_key1 = "user1_2023-01-01_key123_gpt-4_openai"
test_transaction1 = {
"spend": 10.0,
"prompt_tokens": 100,
"completion_tokens": 50,
"api_requests": 1,
"successful_requests": 1,
"failed_requests": 0,
}
test_key2 = "user2_2023-01-01_key456_gpt-3.5-turbo_openai"
test_transaction2 = {
"spend": 5.0,
"prompt_tokens": 200,
"completion_tokens": 30,
"api_requests": 1,
"successful_requests": 1,
"failed_requests": 0,
}
updates = [{test_key1: test_transaction1}, {test_key2: test_transaction2}]
# Test aggregation
result = DailySpendUpdateQueue.get_aggregated_daily_spend_update_transactions(
updates
)
assert len(result) == 2
assert test_key1 in result
assert test_key2 in result
assert result[test_key1] == test_transaction1
assert result[test_key2] == test_transaction2
@pytest.mark.asyncio
async def test_get_aggregated_daily_spend_update_transactions_same_key():
"""Test static method for aggregating updates with the same key"""
test_key = "user1_2023-01-01_key123_gpt-4_openai"
test_transaction1 = {
"spend": 10.0,
"prompt_tokens": 100,
"completion_tokens": 50,
"api_requests": 1,
"successful_requests": 1,
"failed_requests": 0,
}
test_transaction2 = {
"spend": 5.0,
"prompt_tokens": 200,
"completion_tokens": 30,
"api_requests": 1,
"successful_requests": 1,
"failed_requests": 0,
}
expected_transaction = {
"spend": 15.0, # 10 + 5
"prompt_tokens": 300, # 100 + 200
"completion_tokens": 80, # 50 + 30
"api_requests": 2, # 1 + 1
"successful_requests": 2, # 1 + 1
"failed_requests": 0, # 0 + 0
}
updates = [{test_key: test_transaction1}, {test_key: test_transaction2}]
# Test aggregation
result = DailySpendUpdateQueue.get_aggregated_daily_spend_update_transactions(
updates
)
assert len(result) == 1
assert test_key in result
assert result[test_key] == expected_transaction
@pytest.mark.asyncio
async def test_flush_and_get_aggregated_daily_spend_update_transactions(
daily_spend_update_queue,
):
"""Test the full workflow of adding, flushing, and aggregating updates"""
test_key = "user1_2023-01-01_key123_gpt-4_openai"
test_transaction1 = {
"spend": 10.0,
"prompt_tokens": 100,
"completion_tokens": 50,
"api_requests": 1,
"successful_requests": 1,
"failed_requests": 0,
}
test_transaction2 = {
"spend": 5.0,
"prompt_tokens": 200,
"completion_tokens": 30,
"api_requests": 1,
"successful_requests": 1,
"failed_requests": 0,
}
expected_transaction = {
"spend": 15.0, # 10 + 5
"prompt_tokens": 300, # 100 + 200
"completion_tokens": 80, # 50 + 30
"api_requests": 2, # 1 + 1
"successful_requests": 2, # 1 + 1
"failed_requests": 0, # 0 + 0
}
# Add updates to queue
await daily_spend_update_queue.add_update({test_key: test_transaction1})
await daily_spend_update_queue.add_update({test_key: test_transaction2})
# Test full workflow
result = (
await daily_spend_update_queue.flush_and_get_aggregated_daily_spend_update_transactions()
)
assert len(result) == 1
assert test_key in result
assert result[test_key] == expected_transaction
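The aggregation the last two tests expect is a plain field-wise sum per key. A compact sketch of that merge (illustrative only; the real logic is the static method on DailySpendUpdateQueue):

from typing import Dict, List

def aggregate_daily_updates(updates: List[Dict[str, dict]]) -> Dict[str, dict]:
    # Transactions sharing a key (user/date/key/model/provider) are summed field by field.
    merged: Dict[str, dict] = {}
    for update in updates:
        for key, txn in update.items():
            if key not in merged:
                merged[key] = dict(txn)
            else:
                for field, value in txn.items():
                    merged[key][field] = merged[key].get(field, 0) + value
    return merged

Run over the two same-key transactions above, it yields spend 15.0, 300 prompt tokens, 80 completion tokens, and 2 requests, all successful, matching expected_transaction.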

View file

@ -12,7 +12,7 @@ sys.path.insert(
) # Adds the parent directory to the system path
from litellm.constants import DEFAULT_CRON_JOB_LOCK_TTL_SECONDS
from litellm.proxy.db.pod_lock_manager import PodLockManager
from litellm.proxy.db.db_transaction_queue.pod_lock_manager import PodLockManager
# Mock Prisma client class

View file

@ -0,0 +1,152 @@
import asyncio
import json
import os
import sys
import pytest
from fastapi.testclient import TestClient
from litellm.proxy._types import Litellm_EntityType, SpendUpdateQueueItem
from litellm.proxy.db.db_transaction_queue.spend_update_queue import SpendUpdateQueue
sys.path.insert(
0, os.path.abspath("../../..")
) # Adds the parent directory to the system path
@pytest.fixture
def spend_queue():
return SpendUpdateQueue()
@pytest.mark.asyncio
async def test_add_update(spend_queue):
# Test adding a single update
update: SpendUpdateQueueItem = {
"entity_type": Litellm_EntityType.USER,
"entity_id": "user123",
"response_cost": 0.5,
}
await spend_queue.add_update(update)
# Verify update was added by checking queue size
assert spend_queue.update_queue.qsize() == 1
@pytest.mark.asyncio
async def test_missing_response_cost(spend_queue):
# Test with missing response_cost - should default to 0
update: SpendUpdateQueueItem = {
"entity_type": Litellm_EntityType.USER,
"entity_id": "user123",
}
await spend_queue.add_update(update)
aggregated = (
await spend_queue.flush_and_get_aggregated_db_spend_update_transactions()
)
# Should have created entry with 0 cost
assert aggregated["user_list_transactions"]["user123"] == 0
@pytest.mark.asyncio
async def test_missing_entity_id(spend_queue):
# Test with missing entity_id - should default to empty string
update: SpendUpdateQueueItem = {
"entity_type": Litellm_EntityType.USER,
"response_cost": 1.0,
}
await spend_queue.add_update(update)
aggregated = (
await spend_queue.flush_and_get_aggregated_db_spend_update_transactions()
)
# Should use empty string as key
assert aggregated["user_list_transactions"][""] == 1.0
@pytest.mark.asyncio
async def test_none_values(spend_queue):
# Test with None values
update: SpendUpdateQueueItem = {
"entity_type": Litellm_EntityType.USER,
"entity_id": None, # type: ignore
"response_cost": None,
}
await spend_queue.add_update(update)
aggregated = (
await spend_queue.flush_and_get_aggregated_db_spend_update_transactions()
)
# Should handle None values gracefully
assert aggregated["user_list_transactions"][""] == 0
@pytest.mark.asyncio
async def test_multiple_updates_with_missing_fields(spend_queue):
# Test multiple updates with various missing fields
updates: list[SpendUpdateQueueItem] = [
{
"entity_type": Litellm_EntityType.USER,
"entity_id": "user123",
"response_cost": 0.5,
},
{
"entity_type": Litellm_EntityType.USER,
"entity_id": "user123", # missing response_cost
},
{
"entity_type": Litellm_EntityType.USER, # missing entity_id
"response_cost": 1.5,
},
]
for update in updates:
await spend_queue.add_update(update)
aggregated = (
await spend_queue.flush_and_get_aggregated_db_spend_update_transactions()
)
# Verify aggregation
assert (
aggregated["user_list_transactions"]["user123"] == 0.5
) # only the first update with valid cost
assert (
aggregated["user_list_transactions"][""] == 1.5
) # update with missing entity_id
@pytest.mark.asyncio
async def test_unknown_entity_type(spend_queue):
# Test with unknown entity type
update: SpendUpdateQueueItem = {
"entity_type": "UNKNOWN_TYPE", # type: ignore
"entity_id": "123",
"response_cost": 0.5,
}
await spend_queue.add_update(update)
aggregated = (
await spend_queue.flush_and_get_aggregated_db_spend_update_transactions()
)
# Should ignore unknown entity type
assert all(len(transactions) == 0 for transactions in aggregated.values())
@pytest.mark.asyncio
async def test_missing_entity_type(spend_queue):
# Test with missing entity type
update: SpendUpdateQueueItem = {"entity_id": "123", "response_cost": 0.5}
await spend_queue.add_update(update)
aggregated = (
await spend_queue.flush_and_get_aggregated_db_spend_update_transactions()
)
# Should ignore updates without entity type
assert all(len(transactions) == 0 for transactions in aggregated.values())
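Taken together, these cases define the defaulting rules: a missing response_cost counts as 0, a missing entity_id falls back to the empty string, and updates with an unknown or missing entity type are dropped. A sketch of that aggregation step under those assumptions (string entity types, users only; not the queue's actual code):

from typing import Dict, List

def aggregate_user_spend(updates: List[dict]) -> Dict[str, float]:
    # Only user-type updates are folded in; unknown or missing entity types are ignored.
    totals: Dict[str, float] = {}
    for update in updates:
        if update.get("entity_type") != "user":
            continue
        entity_id = update.get("entity_id") or ""
        cost = update.get("response_cost") or 0.0
        totals[entity_id] = totals.get(entity_id, 0.0) + cost
    return totals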

View file

@ -55,3 +55,30 @@ async def test_ui_view_users_with_null_email(mocker, caplog):
assert response == [
LiteLLM_UserTableFiltered(user_id="test-user-null-email", user_email=None)
]
def test_user_daily_activity_types():
"""
Assert all fields in SpendMetrics are reported in DailySpendMetadata as "total_"
"""
from litellm.proxy.management_endpoints.internal_user_endpoints import (
DailySpendMetadata,
SpendMetrics,
)
# Create a SpendMetrics instance
spend_metrics = SpendMetrics()
# Create a DailySpendMetadata instance
daily_spend_metadata = DailySpendMetadata()
# Assert all fields in SpendMetrics are reported in DailySpendMetadata as "total_"
for field in spend_metrics.__dict__:
if field.startswith("total_"):
assert hasattr(
daily_spend_metadata, field
), f"Field {field} is not reported in DailySpendMetadata"
else:
assert not hasattr(
daily_spend_metadata, field
), f"Field {field} is reported in DailySpendMetadata"

View file

@ -0,0 +1,105 @@
import json
import os
import sys
from unittest.mock import MagicMock, patch
import pytest
from litellm.proxy._types import UserAPIKeyAuth
from litellm.proxy.litellm_pre_call_utils import (
_get_enforced_params,
check_if_token_is_service_account,
)
sys.path.insert(
0, os.path.abspath("../../..")
) # Adds the parent directory to the system path
def test_check_if_token_is_service_account():
"""
Test that only keys with `service_account_id` in metadata are considered service accounts
"""
# Test case 1: Service account token
service_account_token = UserAPIKeyAuth(
api_key="test-key", metadata={"service_account_id": "test-service-account"}
)
assert check_if_token_is_service_account(service_account_token) == True
# Test case 2: Regular user token
regular_token = UserAPIKeyAuth(api_key="test-key", metadata={})
assert check_if_token_is_service_account(regular_token) == False
# Test case 3: Token with other metadata
other_metadata_token = UserAPIKeyAuth(
api_key="test-key", metadata={"user_id": "test-user"}
)
assert check_if_token_is_service_account(other_metadata_token) == False
def test_get_enforced_params_for_service_account_settings():
"""
Test that service account enforced params are only added to service account keys
"""
service_account_token = UserAPIKeyAuth(
api_key="test-key", metadata={"service_account_id": "test-service-account"}
)
general_settings_with_service_account_settings = {
"service_account_settings": {"enforced_params": ["metadata.service"]},
}
result = _get_enforced_params(
general_settings=general_settings_with_service_account_settings,
user_api_key_dict=service_account_token,
)
assert result == ["metadata.service"]
regular_token = UserAPIKeyAuth(
api_key="test-key", metadata={"enforced_params": ["user"]}
)
result = _get_enforced_params(
general_settings=general_settings_with_service_account_settings,
user_api_key_dict=regular_token,
)
assert result == ["user"]
@pytest.mark.parametrize(
"general_settings, user_api_key_dict, expected_enforced_params",
[
(
{"enforced_params": ["param1", "param2"]},
UserAPIKeyAuth(
api_key="test_api_key", user_id="test_user_id", org_id="test_org_id"
),
["param1", "param2"],
),
(
{"service_account_settings": {"enforced_params": ["param1", "param2"]}},
UserAPIKeyAuth(
api_key="test_api_key",
user_id="test_user_id",
org_id="test_org_id",
metadata={"service_account_id": "test_service_account_id"},
),
["param1", "param2"],
),
(
{"service_account_settings": {"enforced_params": ["param1", "param2"]}},
UserAPIKeyAuth(
api_key="test_api_key",
metadata={
"enforced_params": ["param3", "param4"],
"service_account_id": "test_service_account_id",
},
),
["param1", "param2", "param3", "param4"],
),
],
)
def test_get_enforced_params(
general_settings, user_api_key_dict, expected_enforced_params
):
from litellm.proxy.litellm_pre_call_utils import _get_enforced_params
enforced_params = _get_enforced_params(general_settings, user_api_key_dict)
assert enforced_params == expected_enforced_params
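The parametrised cases encode a precedence: globally enforced params apply to every key, service-account enforced params apply only to keys whose metadata carries service_account_id, and key-level metadata params are appended on top. A sketch of that merge under those assumptions (not the proxy's actual helper):

from typing import List, Optional

def sketch_enforced_params(general_settings: dict, key_metadata: Optional[dict]) -> List[str]:
    metadata = key_metadata or {}
    enforced: List[str] = []
    # Globally enforced params always apply.
    enforced += general_settings.get("enforced_params", [])
    # Service-account enforced params apply only to service-account keys.
    if "service_account_id" in metadata:
        enforced += general_settings.get("service_account_settings", {}).get("enforced_params", [])
    # Key-level enforced params are appended last.
    enforced += metadata.get("enforced_params", [])
    return enforced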

View file

@ -198,6 +198,42 @@ class BaseLLMChatTest(ABC):
messages=image_messages,
)
assert response is not None
def test_file_data_unit_test(self, pdf_messages):
from litellm.utils import supports_pdf_input, return_raw_request
from litellm.types.utils import CallTypes
from litellm.litellm_core_utils.prompt_templates.factory import convert_to_anthropic_image_obj
media_chunk = convert_to_anthropic_image_obj(
openai_image_url=pdf_messages,
format=None,
)
file_content = [
{"type": "text", "text": "What's this file about?"},
{
"type": "file",
"file": {
"file_data": pdf_messages,
}
},
]
image_messages = [{"role": "user", "content": file_content}]
base_completion_call_args = self.get_base_completion_call_args()
if not supports_pdf_input(base_completion_call_args["model"], None):
pytest.skip("Model does not support image input")
raw_request = return_raw_request(
endpoint=CallTypes.completion,
kwargs={**base_completion_call_args, "messages": image_messages},
)
print("RAW REQUEST", raw_request)
assert media_chunk["data"] in json.dumps(raw_request)
def test_message_with_name(self):
try:

View file

@ -268,7 +268,7 @@ async def test_vision_with_custom_model():
{
"type": "image_url",
"image_url": {
"url": "data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAGQAAABkBAMAAACCzIhnAAAAG1BMVEURAAD///+ln5/h39/Dv79qX18uHx+If39MPz9oMSdmAAAACXBIWXMAAA7EAAAOxAGVKw4bAAABB0lEQVRYhe2SzWrEIBCAh2A0jxEs4j6GLDS9hqWmV5Flt0cJS+lRwv742DXpEjY1kOZW6HwHFZnPmVEBEARBEARB/jd0KYA/bcUYbPrRLh6amXHJ/K+ypMoyUaGthILzw0l+xI0jsO7ZcmCcm4ILd+QuVYgpHOmDmz6jBeJImdcUCmeBqQpuqRIbVmQsLCrAalrGpfoEqEogqbLTWuXCPCo+Ki1XGqgQ+jVVuhB8bOaHkvmYuzm/b0KYLWwoK58oFqi6XfxQ4Uz7d6WeKpna6ytUs5e8betMcqAv5YPC5EZB2Lm9FIn0/VP6R58+/GEY1X1egVoZ/3bt/EqF6malgSAIgiDIH+QL41409QMY0LMAAAAASUVORK5CYII="
"url": "data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAGQAAABkBAMAAACCzIhnAAAAG1BMVEURAAD///+ln5/h39/Dv79qX18uHx+If39MPz9oMSdmAAAACXBIWXMAAA7EAAAOxAGVKw4bAAABDElEQVRYhe2SzWqEMBRGPyQTfQxJsc5jBKGzFmlslyFIZxsCQ7sUaWd87EanpdpIrbtC71mE/NyTm9wEIAiCIAiC+N/otQBxU2Sf/aeh4enqptHXri+/yxIq63jlKCw6cXssnr3ObdzdGYFYCJ2IzHKXLygHXCB98Gm4DE+ZZemu5EisQSyZTmyg+AuzQbkezCuIy7EI0k9Ig3FtruwydY+qniqtV5yQyo8qpUIl2fc90KVzJWohWf2qu75vlw52rdfjVDHg8vLWwixW7PChqLkSyUadwfSS0uQZhEvRuIkS53uJvrK8cGWYaPwpGt8efvw+vlo8TPMzcmP8w7lrNypc1RsNgiAIgiD+Iu/RyDYhCaWrgQAAAABJRU5ErkJggg=="
},
},
],

View file

@ -1379,3 +1379,20 @@ def test_azure_modalities_param():
)
assert optional_params["modalities"] == ["text", "audio"]
assert optional_params["audio"] == {"type": "audio_input", "input": "test.wav"}
@pytest.mark.parametrize(
"model, provider",
[
("claude-3-7-sonnet-20240620-v1:0", "anthropic"),
("anthropic.claude-3-7-sonnet-20250219-v1:0", "bedrock"),
("invoke/anthropic.claude-3-7-sonnet-20240620-v1:0", "bedrock"),
("claude-3-7-sonnet@20250219", "vertex_ai"),
],
)
def test_anthropic_unified_reasoning_content(model, provider):
optional_params = get_optional_params(
model=model,
custom_llm_provider=provider,
reasoning_effort="high",
)
assert optional_params["thinking"] == {"type": "enabled", "budget_tokens": 4096}

Some files were not shown because too many files have changed in this diff.