diff --git a/litellm/caching/in_memory_cache.py b/litellm/caching/in_memory_cache.py
index a28fd6e415..5e09fe845f 100644
--- a/litellm/caching/in_memory_cache.py
+++ b/litellm/caching/in_memory_cache.py
@@ -9,9 +9,13 @@ Has 4 methods:
 """
 
 import json
+import sys
 import time
 from typing import Any, List, Optional
 
+from pydantic import BaseModel
+
+from ..constants import MAX_SIZE_PER_ITEM_IN_MEMORY_CACHE_IN_KB
 from .base_cache import BaseCache
 
 
@@ -22,6 +26,7 @@ class InMemoryCache(BaseCache):
         default_ttl: Optional[
             int
         ] = 600,  # default ttl is 10 minutes. At maximum litellm rate limiting logic requires objects to be in memory for 1 minute
+        max_size_per_item: Optional[int] = 1024,  # 1MB = 1024KB
     ):
         """
         max_size_in_memory [int]: Maximum number of items in cache. done to prevent memory leaks. Use 200 items as a default
@@ -30,7 +35,9 @@ class InMemoryCache(BaseCache):
             max_size_in_memory or 200
         )  # set an upper bound of 200 items in-memory
         self.default_ttl = default_ttl or 600
-        self.max_size_per_item = 1024  # 1MB = 1024KB
+        self.max_size_per_item = (
+            max_size_per_item or MAX_SIZE_PER_ITEM_IN_MEMORY_CACHE_IN_KB
+        )  # 1MB = 1024KB
 
         # in-memory cache
         self.cache_dict: dict = {}
@@ -42,26 +49,37 @@
         Returns True if value size is acceptable, False otherwise
         """
         try:
-            # Handle special types
-            if hasattr(value, "model_dump"):  # Pydantic v2
+            # Fast path for common primitive types that are typically small
+            if (
+                isinstance(value, (bool, int, float, str))
+                and len(str(value)) < self.max_size_per_item * 512
+            ):  # Conservative estimate
+                return True
+
+            # Direct size check for bytes objects
+            if isinstance(value, bytes):
+                return sys.getsizeof(value) / 1024 <= self.max_size_per_item
+
+            # Handle special types without full conversion when possible
+            if hasattr(value, "__sizeof__"):  # Use __sizeof__ if available
+                size = value.__sizeof__() / 1024
+                return size <= self.max_size_per_item
+
+            # Fallback for complex types
+            if isinstance(value, BaseModel) and hasattr(
+                value, "model_dump"
+            ):  # Pydantic v2
                 value = value.model_dump()
-            elif hasattr(value, "dict"):  # Pydantic v1
-                value = value.dict()
             elif hasattr(value, "isoformat"):  # datetime objects
-                value = value.isoformat()
+                return True  # datetime strings are always small
 
-            # Convert value to JSON string to get a consistent size measurement
+            # Only convert to JSON if absolutely necessary
             if not isinstance(value, (str, bytes)):
-                value = json.dumps(
-                    value, default=str
-                )  # default=str handles any remaining datetime objects
+                value = json.dumps(value, default=str)
 
-            # Get size in KB (1KB = 1024 bytes)
-            value_size = len(str(value).encode("utf-8")) / 1024
+            return sys.getsizeof(value) / 1024 <= self.max_size_per_item
 
-            return value_size <= self.max_size_per_item
         except Exception:
-            # If we can't measure the size, assume it's too large
             return False
 
     def evict_cache(self):
diff --git a/litellm/constants.py b/litellm/constants.py
index eb59858d43..da66f897c9 100644
--- a/litellm/constants.py
+++ b/litellm/constants.py
@@ -14,6 +14,7 @@ DEFAULT_REPLICATE_POLLING_DELAY_SECONDS = 1
 DEFAULT_IMAGE_TOKEN_COUNT = 250
 DEFAULT_IMAGE_WIDTH = 300
 DEFAULT_IMAGE_HEIGHT = 300
+MAX_SIZE_PER_ITEM_IN_MEMORY_CACHE_IN_KB = 1024  # 1MB = 1024KB
 SINGLE_DEPLOYMENT_TRAFFIC_FAILURE_THRESHOLD = 1000  # Minimum number of requests to consider "reasonable traffic". Used for single-deployment cooldown logic.
 #### RELIABILITY ####
 REPEATED_STREAMING_CHUNK_LIMIT = 100  # catch if model starts looping the same chunk while streaming. Uses high default to prevent false positives.
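For reviewers, a minimal usage sketch of the new knob, not part of the diff itself; the 100KB cap and sample string sizes are illustrative. `InMemoryCache`, `max_size_per_item` (measured in KB), and `check_value_size` are the names from the patch above:

```python
from litellm.caching.in_memory_cache import InMemoryCache

# Illustrative: cap each cached item at 100KB instead of the 1024KB default.
cache = InMemoryCache(max_size_per_item=100)

# Short primitives are accepted via the fast path
# (len("hello") is well under the 100 * 512 character budget).
assert cache.check_value_size("hello") is True

# A ~5MB string falls through the fast path and is rejected by the
# sys.getsizeof comparison against the 100KB cap.
assert cache.check_value_size("a" * (5 * 1024 * 1024)) is False
```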
diff --git a/tests/litellm/caching/test_in_memory_cache.py b/tests/litellm/caching/test_in_memory_cache.py
index de24bdf11a..d69899fec1 100644
--- a/tests/litellm/caching/test_in_memory_cache.py
+++ b/tests/litellm/caching/test_in_memory_cache.py
@@ -32,3 +32,14 @@ def test_in_memory_openai_obj_cache():
 
     assert cached_obj is not None
     assert cached_obj == openai_obj
+
+
+def test_in_memory_cache_max_size_per_item():
+    """
+    Test that the cache will not store items larger than the max size per item
+    """
+    in_memory_cache = InMemoryCache(max_size_per_item=100)
+
+    result = in_memory_cache.check_value_size("a" * 100000000)
+
+    assert result is False
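A possible companion test for the accept path, sketched under the same assumptions (with a 100KB cap, the fast path accepts strings under 100 * 512 = 51,200 characters, so a 1,000-character value passes). This is illustrative and not part of the PR:

```python
from litellm.caching.in_memory_cache import InMemoryCache  # already imported at the top of the test module


def test_in_memory_cache_small_item_accepted():
    """
    Sketch (not in this PR): values comfortably under max_size_per_item
    should pass the size check via the primitive fast path.
    """
    in_memory_cache = InMemoryCache(max_size_per_item=100)

    result = in_memory_cache.check_value_size("a" * 1000)

    assert result is True
```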