diff --git a/litellm/__init__.py b/litellm/__init__.py index 9997b9a8ac..42a96abf13 100644 --- a/litellm/__init__.py +++ b/litellm/__init__.py @@ -56,6 +56,9 @@ from litellm.constants import ( bedrock_embedding_models, known_tokenizer_config, BEDROCK_INVOKE_PROVIDERS_LITERAL, + DEFAULT_MAX_TOKENS, + DEFAULT_SOFT_BUDGET, + DEFAULT_ALLOWED_FAILS, ) from litellm.types.guardrails import GuardrailItem from litellm.proxy._types import ( @@ -155,7 +158,7 @@ token: Optional[ str ] = None # Not used anymore, will be removed in next MAJOR release - https://github.com/BerriAI/litellm/discussions/648 telemetry = True -max_tokens = 256 # OpenAI Defaults +max_tokens: int = DEFAULT_MAX_TOKENS # OpenAI Defaults drop_params = bool(os.getenv("LITELLM_DROP_PARAMS", False)) modify_params = False retry = True @@ -244,7 +247,7 @@ budget_duration: Optional[ str ] = None # proxy only - resets budget after fixed duration. You can set duration as seconds ("30s"), minutes ("30m"), hours ("30h"), days ("30d"). default_soft_budget: float = ( - 50.0 # by default all litellm proxy keys have a soft budget of 50.0 + DEFAULT_SOFT_BUDGET # by default all litellm proxy keys have a soft budget of 50.0 ) forward_traceparent_to_llm_provider: bool = False diff --git a/litellm/_redis.py b/litellm/_redis.py index b2624d4280..14813c436e 100644 --- a/litellm/_redis.py +++ b/litellm/_redis.py @@ -18,6 +18,7 @@ import redis # type: ignore import redis.asyncio as async_redis # type: ignore from litellm import get_secret, get_secret_str +from litellm.constants import REDIS_CONNECTION_POOL_TIMEOUT, REDIS_SOCKET_TIMEOUT from ._logging import verbose_logger @@ -215,7 +216,7 @@ def _init_redis_sentinel(redis_kwargs) -> redis.Redis: # Set up the Sentinel client sentinel = redis.Sentinel( sentinel_nodes, - socket_timeout=0.1, + socket_timeout=REDIS_SOCKET_TIMEOUT, password=sentinel_password, ) @@ -239,7 +240,7 @@ def _init_async_redis_sentinel(redis_kwargs) -> async_redis.Redis: # Set up the Sentinel client sentinel = async_redis.Sentinel( sentinel_nodes, - socket_timeout=0.1, + socket_timeout=REDIS_SOCKET_TIMEOUT, password=sentinel_password, ) @@ -319,7 +320,7 @@ def get_redis_connection_pool(**env_overrides): verbose_logger.debug("get_redis_connection_pool: redis_kwargs", redis_kwargs) if "url" in redis_kwargs and redis_kwargs["url"] is not None: return async_redis.BlockingConnectionPool.from_url( - timeout=5, url=redis_kwargs["url"] + timeout=REDIS_CONNECTION_POOL_TIMEOUT, url=redis_kwargs["url"] ) connection_class = async_redis.Connection if "ssl" in redis_kwargs: @@ -327,4 +328,6 @@ def get_redis_connection_pool(**env_overrides): redis_kwargs.pop("ssl", None) redis_kwargs["connection_class"] = connection_class redis_kwargs.pop("startup_nodes", None) - return async_redis.BlockingConnectionPool(timeout=5, **redis_kwargs) + return async_redis.BlockingConnectionPool( + timeout=REDIS_CONNECTION_POOL_TIMEOUT, **redis_kwargs + ) diff --git a/litellm/budget_manager.py b/litellm/budget_manager.py index e664c4f44f..b25967579e 100644 --- a/litellm/budget_manager.py +++ b/litellm/budget_manager.py @@ -14,6 +14,12 @@ import time from typing import Literal, Optional import litellm +from litellm.constants import ( + DAYS_IN_A_MONTH, + DAYS_IN_A_WEEK, + DAYS_IN_A_YEAR, + HOURS_IN_A_DAY, +) from litellm.utils import ModelResponse @@ -81,11 +87,11 @@ class BudgetManager: if duration == "daily": duration_in_days = 1 elif duration == "weekly": - duration_in_days = 7 + duration_in_days = DAYS_IN_A_WEEK elif duration == "monthly": - duration_in_days = 28 + 
duration_in_days = DAYS_IN_A_MONTH elif duration == "yearly": - duration_in_days = 365 + duration_in_days = DAYS_IN_A_YEAR else: raise ValueError( """duration needs to be one of ["daily", "weekly", "monthly", "yearly"]""" @@ -182,7 +188,9 @@ class BudgetManager: current_time = time.time() # Convert duration from days to seconds - duration_in_seconds = self.user_dict[user]["duration"] * 24 * 60 * 60 + duration_in_seconds = ( + self.user_dict[user]["duration"] * HOURS_IN_A_DAY * 60 * 60 + ) # Check if duration has elapsed if current_time - last_updated_at >= duration_in_seconds: diff --git a/litellm/caching/caching.py b/litellm/caching/caching.py index affb8e3855..6a7c93e3fe 100644 --- a/litellm/caching/caching.py +++ b/litellm/caching/caching.py @@ -19,6 +19,7 @@ from pydantic import BaseModel import litellm from litellm._logging import verbose_logger +from litellm.constants import CACHED_STREAMING_CHUNK_DELAY from litellm.litellm_core_utils.model_param_helper import ModelParamHelper from litellm.types.caching import * from litellm.types.utils import all_litellm_params @@ -406,7 +407,7 @@ class Cache: } ] } - time.sleep(0.02) + time.sleep(CACHED_STREAMING_CHUNK_DELAY) def _get_cache_logic( self, diff --git a/litellm/caching/in_memory_cache.py b/litellm/caching/in_memory_cache.py index 5e09fe845f..e3d757d08d 100644 --- a/litellm/caching/in_memory_cache.py +++ b/litellm/caching/in_memory_cache.py @@ -15,7 +15,8 @@ from typing import Any, List, Optional from pydantic import BaseModel -from ..constants import MAX_SIZE_PER_ITEM_IN_MEMORY_CACHE_IN_KB +from litellm.constants import MAX_SIZE_PER_ITEM_IN_MEMORY_CACHE_IN_KB + from .base_cache import BaseCache @@ -52,7 +53,8 @@ class InMemoryCache(BaseCache): # Fast path for common primitive types that are typically small if ( isinstance(value, (bool, int, float, str)) - and len(str(value)) < self.max_size_per_item * 512 + and len(str(value)) + < self.max_size_per_item * MAX_SIZE_PER_ITEM_IN_MEMORY_CACHE_IN_KB ): # Conservative estimate return True diff --git a/litellm/caching/qdrant_semantic_cache.py b/litellm/caching/qdrant_semantic_cache.py index bdfd3770ae..32d4d8b0fd 100644 --- a/litellm/caching/qdrant_semantic_cache.py +++ b/litellm/caching/qdrant_semantic_cache.py @@ -11,10 +11,12 @@ Has 4 methods: import ast import asyncio import json -from typing import Any +from typing import Any, cast import litellm from litellm._logging import print_verbose +from litellm.constants import QDRANT_SCALAR_QUANTILE, QDRANT_VECTOR_SIZE +from litellm.types.utils import EmbeddingResponse from .base_cache import BaseCache @@ -118,7 +120,11 @@ class QdrantSemanticCache(BaseCache): } elif quantization_config == "scalar": quantization_params = { - "scalar": {"type": "int8", "quantile": 0.99, "always_ram": False} + "scalar": { + "type": "int8", + "quantile": QDRANT_SCALAR_QUANTILE, + "always_ram": False, + } } elif quantization_config == "product": quantization_params = { @@ -132,7 +138,7 @@ class QdrantSemanticCache(BaseCache): new_collection_status = self.sync_client.put( url=f"{self.qdrant_api_base}/collections/{self.collection_name}", json={ - "vectors": {"size": 1536, "distance": "Cosine"}, + "vectors": {"size": QDRANT_VECTOR_SIZE, "distance": "Cosine"}, "quantization_config": quantization_params, }, headers=self.headers, @@ -171,10 +177,13 @@ class QdrantSemanticCache(BaseCache): prompt += message["content"] # create an embedding for prompt - embedding_response = litellm.embedding( - model=self.embedding_model, - input=prompt, - cache={"no-store": True, 
"no-cache": True}, + embedding_response = cast( + EmbeddingResponse, + litellm.embedding( + model=self.embedding_model, + input=prompt, + cache={"no-store": True, "no-cache": True}, + ), ) # get the embedding @@ -212,10 +221,13 @@ class QdrantSemanticCache(BaseCache): prompt += message["content"] # convert to embedding - embedding_response = litellm.embedding( - model=self.embedding_model, - input=prompt, - cache={"no-store": True, "no-cache": True}, + embedding_response = cast( + EmbeddingResponse, + litellm.embedding( + model=self.embedding_model, + input=prompt, + cache={"no-store": True, "no-cache": True}, + ), ) # get the embedding diff --git a/litellm/constants.py b/litellm/constants.py index cace674f2f..a2fd373a61 100644 --- a/litellm/constants.py +++ b/litellm/constants.py @@ -9,6 +9,7 @@ DEFAULT_FAILURE_THRESHOLD_PERCENT = ( 0.5 # default cooldown a deployment if 50% of requests fail in a given minute ) DEFAULT_MAX_TOKENS = 4096 +DEFAULT_ALLOWED_FAILS = 3 DEFAULT_REDIS_SYNC_INTERVAL = 1 DEFAULT_COOLDOWN_TIME_SECONDS = 5 DEFAULT_REPLICATE_POLLING_RETRIES = 5 @@ -16,16 +17,71 @@ DEFAULT_REPLICATE_POLLING_DELAY_SECONDS = 1 DEFAULT_IMAGE_TOKEN_COUNT = 250 DEFAULT_IMAGE_WIDTH = 300 DEFAULT_IMAGE_HEIGHT = 300 +DEFAULT_MAX_TOKENS = 256 # used when providers need a default MAX_SIZE_PER_ITEM_IN_MEMORY_CACHE_IN_KB = 1024 # 1MB = 1024KB SINGLE_DEPLOYMENT_TRAFFIC_FAILURE_THRESHOLD = 1000 # Minimum number of requests to consider "reasonable traffic". Used for single-deployment cooldown logic. REDIS_UPDATE_BUFFER_KEY = "litellm_spend_update_buffer" REDIS_DAILY_SPEND_UPDATE_BUFFER_KEY = "litellm_daily_spend_update_buffer" MAX_REDIS_BUFFER_DEQUEUE_COUNT = 100 +MINIMUM_PROMPT_CACHE_TOKEN_COUNT = ( + 1024 # minimum number of tokens to cache a prompt by Anthropic +) +DEFAULT_TRIM_RATIO = 0.75 # default ratio of tokens to trim from the end of a prompt +HOURS_IN_A_DAY = 24 +DAYS_IN_A_WEEK = 7 +DAYS_IN_A_MONTH = 28 +DAYS_IN_A_YEAR = 365 +REPLICATE_MODEL_NAME_WITH_ID_LENGTH = 64 +#### TOKEN COUNTING #### +FUNCTION_DEFINITION_TOKEN_COUNT = 9 +SYSTEM_MESSAGE_TOKEN_COUNT = 4 +TOOL_CHOICE_OBJECT_TOKEN_COUNT = 4 +DEFAULT_MOCK_RESPONSE_PROMPT_TOKEN_COUNT = 10 +DEFAULT_MOCK_RESPONSE_COMPLETION_TOKEN_COUNT = 20 +MAX_SHORT_SIDE_FOR_IMAGE_HIGH_RES = 768 +MAX_LONG_SIDE_FOR_IMAGE_HIGH_RES = 2000 +MAX_TILE_WIDTH = 512 +MAX_TILE_HEIGHT = 512 +OPENAI_FILE_SEARCH_COST_PER_1K_CALLS = 2.5 / 1000 +MIN_NON_ZERO_TEMPERATURE = 0.0001 #### RELIABILITY #### REPEATED_STREAMING_CHUNK_LIMIT = 100 # catch if model starts looping the same chunk while streaming. Uses high default to prevent false positives. +DEFAULT_MAX_LRU_CACHE_SIZE = 16 +INITIAL_RETRY_DELAY = 0.5 +MAX_RETRY_DELAY = 8.0 +JITTER = 0.75 +DEFAULT_IN_MEMORY_TTL = 5 # default time to live for the in-memory cache +DEFAULT_POLLING_INTERVAL = 0.03 # default polling interval for the scheduler +AZURE_OPERATION_POLLING_TIMEOUT = 120 +REDIS_SOCKET_TIMEOUT = 0.1 +REDIS_CONNECTION_POOL_TIMEOUT = 5 +NON_LLM_CONNECTION_TIMEOUT = 15 # timeout for adjacent services (e.g. 
jwt auth) +MAX_EXCEPTION_MESSAGE_LENGTH = 2000 +BEDROCK_MAX_POLICY_SIZE = 75 +REPLICATE_POLLING_DELAY_SECONDS = 0.5 +DEFAULT_ANTHROPIC_CHAT_MAX_TOKENS = 4096 +TOGETHER_AI_4_B = 4 +TOGETHER_AI_8_B = 8 +TOGETHER_AI_21_B = 21 +TOGETHER_AI_41_B = 41 +TOGETHER_AI_80_B = 80 +TOGETHER_AI_110_B = 110 +TOGETHER_AI_EMBEDDING_150_M = 150 +TOGETHER_AI_EMBEDDING_350_M = 350 +QDRANT_SCALAR_QUANTILE = 0.99 +QDRANT_VECTOR_SIZE = 1536 +CACHED_STREAMING_CHUNK_DELAY = 0.02 +MAX_SIZE_PER_ITEM_IN_MEMORY_CACHE_IN_KB = 512 +DEFAULT_MAX_TOKENS_FOR_TRITON = 2000 #### Networking settings #### request_timeout: float = 6000 # time in seconds STREAM_SSE_DONE_STRING: str = "[DONE]" +### SPEND TRACKING ### +DEFAULT_REPLICATE_GPU_PRICE_PER_SECOND = 0.001400 # price per second for a100 80GB +FIREWORKS_AI_56_B_MOE = 56 +FIREWORKS_AI_176_B_MOE = 176 +FIREWORKS_AI_16_B = 16 +FIREWORKS_AI_80_B = 80 LITELLM_CHAT_PROVIDERS = [ "openai", @@ -426,6 +482,9 @@ MCP_TOOL_NAME_PREFIX = "mcp_tool" MAX_SPENDLOG_ROWS_TO_QUERY = ( 1_000_000 # if spendLogs has more than 1M rows, do not query the DB ) +DEFAULT_SOFT_BUDGET = ( + 50.0 # by default all litellm proxy keys have a soft budget of 50.0 +) # makes it clear this is a rate limit error for a litellm virtual key RATE_LIMIT_ERROR_MESSAGE_FOR_VIRTUAL_KEY = "LiteLLM Virtual Key user_api_key_hash" @@ -451,3 +510,14 @@ LITELLM_PROXY_ADMIN_NAME = "default_user_id" ########################### DB CRON JOB NAMES ########################### DB_SPEND_UPDATE_JOB_NAME = "db_spend_update_job" DEFAULT_CRON_JOB_LOCK_TTL_SECONDS = 60 # 1 minute +PROXY_BUDGET_RESCHEDULER_MIN_TIME = 597 +PROXY_BUDGET_RESCHEDULER_MAX_TIME = 605 +PROXY_BATCH_WRITE_AT = 10 # in seconds +DEFAULT_HEALTH_CHECK_INTERVAL = 300 # 5 minutes +PROMETHEUS_FALLBACK_STATS_SEND_TIME_HOURS = 9 +DEFAULT_MODEL_CREATED_AT_TIME = 1677610602 # returns on `/models` endpoint +DEFAULT_SLACK_ALERTING_THRESHOLD = 300 +MAX_TEAM_LIST_LIMIT = 20 +DEFAULT_PROMPT_INJECTION_SIMILARITY_THRESHOLD = 0.7 +LENGTH_OF_LITELLM_GENERATED_KEY = 16 +SECRET_MANAGER_REFRESH_INTERVAL = 86400 diff --git a/litellm/cost_calculator.py b/litellm/cost_calculator.py index de12698658..98c73a4ce7 100644 --- a/litellm/cost_calculator.py +++ b/litellm/cost_calculator.py @@ -9,6 +9,10 @@ from pydantic import BaseModel import litellm import litellm._logging from litellm import verbose_logger +from litellm.constants import ( + DEFAULT_MAX_LRU_CACHE_SIZE, + DEFAULT_REPLICATE_GPU_PRICE_PER_SECOND, +) from litellm.litellm_core_utils.llm_cost_calc.tool_call_cost_tracking import ( StandardBuiltInToolCostTracking, ) @@ -355,9 +359,7 @@ def cost_per_token( # noqa: PLR0915 def get_replicate_completion_pricing(completion_response: dict, total_time=0.0): # see https://replicate.com/pricing # for all litellm currently supported LLMs, almost all requests go to a100_80gb - a100_80gb_price_per_second_public = ( - 0.001400 # assume all calls sent to A100 80GB for now - ) + a100_80gb_price_per_second_public = DEFAULT_REPLICATE_GPU_PRICE_PER_SECOND # assume all calls sent to A100 80GB for now if total_time == 0.0: # total time is in ms start_time = completion_response.get("created", time.time()) end_time = getattr(completion_response, "ended", time.time()) @@ -450,7 +452,7 @@ def _select_model_name_for_cost_calc( return return_model -@lru_cache(maxsize=16) +@lru_cache(maxsize=DEFAULT_MAX_LRU_CACHE_SIZE) def _model_contains_known_llm_provider(model: str) -> bool: """ Check if the model contains a known llm provider diff --git a/litellm/integrations/SlackAlerting/slack_alerting.py 
b/litellm/integrations/SlackAlerting/slack_alerting.py index 50f0538cfd..9fde042ae7 100644 --- a/litellm/integrations/SlackAlerting/slack_alerting.py +++ b/litellm/integrations/SlackAlerting/slack_alerting.py @@ -16,6 +16,7 @@ import litellm.litellm_core_utils.litellm_logging import litellm.types from litellm._logging import verbose_logger, verbose_proxy_logger from litellm.caching.caching import DualCache +from litellm.constants import HOURS_IN_A_DAY from litellm.integrations.custom_batch_logger import CustomBatchLogger from litellm.litellm_core_utils.duration_parser import duration_in_seconds from litellm.litellm_core_utils.exception_mapping_utils import ( @@ -649,10 +650,10 @@ class SlackAlerting(CustomBatchLogger): event_message += ( f"Budget Crossed\n Total Budget:`{user_info.max_budget}`" ) - elif percent_left <= 0.05: + elif percent_left <= SLACK_ALERTING_THRESHOLD_5_PERCENT: event = "threshold_crossed" event_message += "5% Threshold Crossed " - elif percent_left <= 0.15: + elif percent_left <= SLACK_ALERTING_THRESHOLD_15_PERCENT: event = "threshold_crossed" event_message += "15% Threshold Crossed" elif user_info.soft_budget is not None: @@ -1718,7 +1719,7 @@ Model Info: await self.internal_usage_cache.async_set_cache( key=_event_cache_key, value="SENT", - ttl=(30 * 24 * 60 * 60), # 1 month + ttl=(30 * HOURS_IN_A_DAY * 60 * 60), # 1 month ) except Exception as e: diff --git a/litellm/integrations/datadog/datadog.py b/litellm/integrations/datadog/datadog.py index e9b6b6b164..fb6fee6dc6 100644 --- a/litellm/integrations/datadog/datadog.py +++ b/litellm/integrations/datadog/datadog.py @@ -41,7 +41,7 @@ from litellm.types.utils import StandardLoggingPayload from ..additional_logging_utils import AdditionalLoggingUtils # max number of logs DD API can accept -DD_MAX_BATCH_SIZE = 1000 + # specify what ServiceTypes are logged as success events to DD. 
(We don't want to spam DD traces with large number of service types) DD_LOGGED_SUCCESS_SERVICE_TYPES = [ diff --git a/litellm/integrations/gcs_bucket/gcs_bucket.py b/litellm/integrations/gcs_bucket/gcs_bucket.py index 187ab779c0..fc98b0948f 100644 --- a/litellm/integrations/gcs_bucket/gcs_bucket.py +++ b/litellm/integrations/gcs_bucket/gcs_bucket.py @@ -20,10 +20,6 @@ else: VertexBase = Any -GCS_DEFAULT_BATCH_SIZE = 2048 -GCS_DEFAULT_FLUSH_INTERVAL_SECONDS = 20 - - class GCSBucketLogger(GCSBucketBase, AdditionalLoggingUtils): def __init__(self, bucket_name: Optional[str] = None) -> None: from litellm.proxy.proxy_server import premium_user diff --git a/litellm/litellm_core_utils/get_llm_provider_logic.py b/litellm/litellm_core_utils/get_llm_provider_logic.py index 037351d0e6..13103c85a0 100644 --- a/litellm/litellm_core_utils/get_llm_provider_logic.py +++ b/litellm/litellm_core_utils/get_llm_provider_logic.py @@ -3,6 +3,7 @@ from typing import Optional, Tuple import httpx import litellm +from litellm.constants import REPLICATE_MODEL_NAME_WITH_ID_LENGTH from litellm.secret_managers.main import get_secret, get_secret_str from ..types.router import LiteLLM_Params @@ -256,10 +257,13 @@ def get_llm_provider( # noqa: PLR0915 elif model in litellm.cohere_chat_models: custom_llm_provider = "cohere_chat" ## replicate - elif model in litellm.replicate_models or (":" in model and len(model) > 64): + elif model in litellm.replicate_models or ( + ":" in model and len(model) > REPLICATE_MODEL_NAME_WITH_ID_LENGTH + ): model_parts = model.split(":") if ( - len(model_parts) > 1 and len(model_parts[1]) == 64 + len(model_parts) > 1 + and len(model_parts[1]) == REPLICATE_MODEL_NAME_WITH_ID_LENGTH ): ## checks if model name has a 64 digit code - e.g. "meta/llama-2-70b-chat:02e509c789964a7ea8736978a43525956ef40397be9033abf9fd2badfe68c9e3" custom_llm_provider = "replicate" elif model in litellm.replicate_models: diff --git a/litellm/litellm_core_utils/litellm_logging.py b/litellm/litellm_core_utils/litellm_logging.py index 84825535c9..255cce7336 100644 --- a/litellm/litellm_core_utils/litellm_logging.py +++ b/litellm/litellm_core_utils/litellm_logging.py @@ -28,6 +28,10 @@ from litellm._logging import _is_debugging_on, verbose_logger from litellm.batches.batch_utils import _handle_completed_batch from litellm.caching.caching import DualCache, InMemoryCache from litellm.caching.caching_handler import LLMCachingHandler +from litellm.constants import ( + DEFAULT_MOCK_RESPONSE_COMPLETION_TOKEN_COUNT, + DEFAULT_MOCK_RESPONSE_PROMPT_TOKEN_COUNT, +) from litellm.cost_calculator import _select_model_name_for_cost_calc from litellm.integrations.arize.arize import ArizeLogger from litellm.integrations.custom_guardrail import CustomGuardrail @@ -3745,9 +3749,12 @@ def create_dummy_standard_logging_payload() -> StandardLoggingPayload: response_cost=response_cost, response_cost_failure_debug_info=None, status=str("success"), - total_tokens=int(30), - prompt_tokens=int(20), - completion_tokens=int(10), + total_tokens=int( + DEFAULT_MOCK_RESPONSE_PROMPT_TOKEN_COUNT + + DEFAULT_MOCK_RESPONSE_COMPLETION_TOKEN_COUNT + ), + prompt_tokens=int(DEFAULT_MOCK_RESPONSE_PROMPT_TOKEN_COUNT), + completion_tokens=int(DEFAULT_MOCK_RESPONSE_COMPLETION_TOKEN_COUNT), startTime=start_time, endTime=end_time, completionStartTime=completion_start_time, diff --git a/litellm/litellm_core_utils/llm_cost_calc/tool_call_cost_tracking.py b/litellm/litellm_core_utils/llm_cost_calc/tool_call_cost_tracking.py index 74d15e9a01..34c370ffca 100644 --- 
a/litellm/litellm_core_utils/llm_cost_calc/tool_call_cost_tracking.py +++ b/litellm/litellm_core_utils/llm_cost_calc/tool_call_cost_tracking.py @@ -5,6 +5,7 @@ Helper utilities for tracking the cost of built-in tools. from typing import Any, Dict, List, Optional import litellm +from litellm.constants import OPENAI_FILE_SEARCH_COST_PER_1K_CALLS from litellm.types.llms.openai import FileSearchTool, WebSearchOptions from litellm.types.utils import ( ModelInfo, @@ -132,7 +133,7 @@ class StandardBuiltInToolCostTracking: """ if file_search is None: return 0.0 - return 2.5 / 1000 + return OPENAI_FILE_SEARCH_COST_PER_1K_CALLS @staticmethod def chat_completion_response_includes_annotations( diff --git a/litellm/litellm_core_utils/token_counter.py b/litellm/litellm_core_utils/token_counter.py index e6bc65ccff..afd5ab5ff4 100644 --- a/litellm/litellm_core_utils/token_counter.py +++ b/litellm/litellm_core_utils/token_counter.py @@ -11,6 +11,10 @@ from litellm.constants import ( DEFAULT_IMAGE_HEIGHT, DEFAULT_IMAGE_TOKEN_COUNT, DEFAULT_IMAGE_WIDTH, + MAX_LONG_SIDE_FOR_IMAGE_HIGH_RES, + MAX_SHORT_SIDE_FOR_IMAGE_HIGH_RES, + MAX_TILE_HEIGHT, + MAX_TILE_WIDTH, ) from litellm.llms.custom_httpx.http_handler import _get_httpx_client @@ -97,11 +101,14 @@ def resize_image_high_res( height: int, ) -> Tuple[int, int]: # Maximum dimensions for high res mode - max_short_side = 768 - max_long_side = 2000 + max_short_side = MAX_SHORT_SIDE_FOR_IMAGE_HIGH_RES + max_long_side = MAX_LONG_SIDE_FOR_IMAGE_HIGH_RES # Return early if no resizing is needed - if width <= 768 and height <= 768: + if ( + width <= MAX_SHORT_SIDE_FOR_IMAGE_HIGH_RES + and height <= MAX_SHORT_SIDE_FOR_IMAGE_HIGH_RES + ): return width, height # Determine the longer and shorter sides @@ -132,7 +139,10 @@ def resize_image_high_res( # Test the function with the given example def calculate_tiles_needed( - resized_width, resized_height, tile_width=512, tile_height=512 + resized_width, + resized_height, + tile_width=MAX_TILE_WIDTH, + tile_height=MAX_TILE_HEIGHT, ): tiles_across = (resized_width + tile_width - 1) // tile_width tiles_down = (resized_height + tile_height - 1) // tile_height diff --git a/litellm/llms/anthropic/chat/transformation.py b/litellm/llms/anthropic/chat/transformation.py index 09096c89e7..64702b4f26 100644 --- a/litellm/llms/anthropic/chat/transformation.py +++ b/litellm/llms/anthropic/chat/transformation.py @@ -5,7 +5,10 @@ from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union, cast import httpx import litellm -from litellm.constants import RESPONSE_FORMAT_TOOL_NAME +from litellm.constants import ( + DEFAULT_ANTHROPIC_CHAT_MAX_TOKENS, + RESPONSE_FORMAT_TOOL_NAME, +) from litellm.litellm_core_utils.core_helpers import map_finish_reason from litellm.litellm_core_utils.prompt_templates.factory import anthropic_messages_pt from litellm.llms.base_llm.base_utils import type_to_response_format_param @@ -53,7 +56,7 @@ class AnthropicConfig(BaseConfig): max_tokens: Optional[ int - ] = 4096 # anthropic requires a default value (Opus, Sonnet, and Haiku have the same default) + ] = DEFAULT_ANTHROPIC_CHAT_MAX_TOKENS # anthropic requires a default value (Opus, Sonnet, and Haiku have the same default) stop_sequences: Optional[list] = None temperature: Optional[int] = None top_p: Optional[int] = None @@ -65,7 +68,7 @@ class AnthropicConfig(BaseConfig): self, max_tokens: Optional[ int - ] = 4096, # You can pass in a value yourself or use the default value 4096 + ] = DEFAULT_ANTHROPIC_CHAT_MAX_TOKENS, # You can pass in a value 
yourself or use the default value 4096 stop_sequences: Optional[list] = None, temperature: Optional[int] = None, top_p: Optional[int] = None, diff --git a/litellm/llms/anthropic/completion/transformation.py b/litellm/llms/anthropic/completion/transformation.py index 5cbc0b5fd8..e4e04df4d6 100644 --- a/litellm/llms/anthropic/completion/transformation.py +++ b/litellm/llms/anthropic/completion/transformation.py @@ -11,6 +11,7 @@ from typing import AsyncIterator, Dict, Iterator, List, Optional, Union import httpx import litellm +from litellm.constants import DEFAULT_MAX_TOKENS from litellm.litellm_core_utils.prompt_templates.factory import ( custom_prompt, prompt_factory, @@ -65,7 +66,9 @@ class AnthropicTextConfig(BaseConfig): def __init__( self, - max_tokens_to_sample: Optional[int] = 256, # anthropic requires a default + max_tokens_to_sample: Optional[ + int + ] = DEFAULT_MAX_TOKENS, # anthropic requires a default stop_sequences: Optional[list] = None, temperature: Optional[int] = None, top_p: Optional[int] = None, diff --git a/litellm/llms/azure/azure.py b/litellm/llms/azure/azure.py index aed813fdab..bb60680ebc 100644 --- a/litellm/llms/azure/azure.py +++ b/litellm/llms/azure/azure.py @@ -7,7 +7,7 @@ import httpx # type: ignore from openai import APITimeoutError, AsyncAzureOpenAI, AzureOpenAI import litellm -from litellm.constants import DEFAULT_MAX_RETRIES +from litellm.constants import AZURE_OPERATION_POLLING_TIMEOUT, DEFAULT_MAX_RETRIES from litellm.litellm_core_utils.litellm_logging import Logging as LiteLLMLoggingObj from litellm.litellm_core_utils.logging_utils import track_llm_api_timing from litellm.llms.custom_httpx.http_handler import ( @@ -857,7 +857,7 @@ class AzureChatCompletion(BaseAzureLLM, BaseLLM): await response.aread() - timeout_secs: int = 120 + timeout_secs: int = AZURE_OPERATION_POLLING_TIMEOUT start_time = time.time() if "status" not in response.json(): raise Exception( @@ -955,7 +955,7 @@ class AzureChatCompletion(BaseAzureLLM, BaseLLM): response.read() - timeout_secs: int = 120 + timeout_secs: int = AZURE_OPERATION_POLLING_TIMEOUT start_time = time.time() if "status" not in response.json(): raise Exception( diff --git a/litellm/llms/azure/chat/gpt_transformation.py b/litellm/llms/azure/chat/gpt_transformation.py index ee85517e66..e30d68f97d 100644 --- a/litellm/llms/azure/chat/gpt_transformation.py +++ b/litellm/llms/azure/chat/gpt_transformation.py @@ -7,6 +7,10 @@ from litellm.litellm_core_utils.prompt_templates.factory import ( convert_to_azure_openai_messages, ) from litellm.llms.base_llm.chat.transformation import BaseLLMException +from litellm.types.llms.azure import ( + API_VERSION_MONTH_SUPPORTED_RESPONSE_FORMAT, + API_VERSION_YEAR_SUPPORTED_RESPONSE_FORMAT, +) from litellm.types.utils import ModelResponse from litellm.utils import supports_response_schema @@ -123,7 +127,10 @@ class AzureOpenAIConfig(BaseConfig): - check if api_version is supported for response_format """ - is_supported = int(api_version_year) <= 2024 and int(api_version_month) >= 8 + is_supported = ( + int(api_version_year) <= API_VERSION_YEAR_SUPPORTED_RESPONSE_FORMAT + and int(api_version_month) >= API_VERSION_MONTH_SUPPORTED_RESPONSE_FORMAT + ) return is_supported diff --git a/litellm/llms/bedrock/base_aws_llm.py b/litellm/llms/bedrock/base_aws_llm.py index 5482d80687..133ef6a952 100644 --- a/litellm/llms/bedrock/base_aws_llm.py +++ b/litellm/llms/bedrock/base_aws_llm.py @@ -9,7 +9,7 @@ from pydantic import BaseModel from litellm._logging import verbose_logger from 
litellm.caching.caching import DualCache -from litellm.constants import BEDROCK_INVOKE_PROVIDERS_LITERAL +from litellm.constants import BEDROCK_INVOKE_PROVIDERS_LITERAL, BEDROCK_MAX_POLICY_SIZE from litellm.litellm_core_utils.dd_tracing import tracer from litellm.secret_managers.main import get_secret @@ -381,7 +381,7 @@ class BaseAWSLLM: "region_name": aws_region_name, } - if sts_response["PackedPolicySize"] > 75: + if sts_response["PackedPolicySize"] > BEDROCK_MAX_POLICY_SIZE: verbose_logger.warning( f"The policy size is greater than 75% of the allowed size, PackedPolicySize: {sts_response['PackedPolicySize']}" ) diff --git a/litellm/llms/deepinfra/chat/transformation.py b/litellm/llms/deepinfra/chat/transformation.py index 429759fad1..0d446d39b9 100644 --- a/litellm/llms/deepinfra/chat/transformation.py +++ b/litellm/llms/deepinfra/chat/transformation.py @@ -1,6 +1,7 @@ from typing import Optional, Tuple, Union import litellm +from litellm.constants import MIN_NON_ZERO_TEMPERATURE from litellm.llms.openai.chat.gpt_transformation import OpenAIGPTConfig from litellm.secret_managers.main import get_secret_str @@ -84,7 +85,7 @@ class DeepInfraConfig(OpenAIGPTConfig): and value == 0 and model == "mistralai/Mistral-7B-Instruct-v0.1" ): # this model does no support temperature == 0 - value = 0.0001 # close to 0 + value = MIN_NON_ZERO_TEMPERATURE # close to 0 if param == "tool_choice": if ( value != "auto" and value != "none" diff --git a/litellm/llms/fireworks_ai/cost_calculator.py b/litellm/llms/fireworks_ai/cost_calculator.py index f53aba4a47..31414625ab 100644 --- a/litellm/llms/fireworks_ai/cost_calculator.py +++ b/litellm/llms/fireworks_ai/cost_calculator.py @@ -4,6 +4,12 @@ For calculating cost of fireworks ai serverless inference models. from typing import Tuple +from litellm.constants import ( + FIREWORKS_AI_16_B, + FIREWORKS_AI_56_B_MOE, + FIREWORKS_AI_80_B, + FIREWORKS_AI_176_B_MOE, +) from litellm.types.utils import Usage from litellm.utils import get_model_info @@ -25,9 +31,9 @@ def get_base_model_for_pricing(model_name: str) -> str: moe_match = re.search(r"(\d+)x(\d+)b", model_name) if moe_match: total_billion = int(moe_match.group(1)) * int(moe_match.group(2)) - if total_billion <= 56: + if total_billion <= FIREWORKS_AI_56_B_MOE: return "fireworks-ai-moe-up-to-56b" - elif total_billion <= 176: + elif total_billion <= FIREWORKS_AI_176_B_MOE: return "fireworks-ai-56b-to-176b" # Check for standard models in the form b @@ -37,9 +43,9 @@ def get_base_model_for_pricing(model_name: str) -> str: params_billion = float(params_match) # Determine the category based on the number of parameters - if params_billion <= 16.0: + if params_billion <= FIREWORKS_AI_16_B: return "fireworks-ai-up-to-16b" - elif params_billion <= 80.0: + elif params_billion <= FIREWORKS_AI_80_B: return "fireworks-ai-16b-80b" # If no matches, return the original model_name diff --git a/litellm/llms/predibase/chat/transformation.py b/litellm/llms/predibase/chat/transformation.py index f1a2163d24..8ef0eea173 100644 --- a/litellm/llms/predibase/chat/transformation.py +++ b/litellm/llms/predibase/chat/transformation.py @@ -2,6 +2,7 @@ from typing import TYPE_CHECKING, Any, List, Literal, Optional, Union from httpx import Headers, Response +from litellm.constants import DEFAULT_MAX_TOKENS from litellm.llms.base_llm.chat.transformation import BaseConfig, BaseLLMException from litellm.types.llms.openai import AllMessageValues from litellm.types.utils import ModelResponse @@ -27,7 +28,7 @@ class PredibaseConfig(BaseConfig): 
decoder_input_details: Optional[bool] = None details: bool = True # enables returning logprobs + best of max_new_tokens: int = ( - 256 # openai default - requests hang if max_new_tokens not given + DEFAULT_MAX_TOKENS # openai default - requests hang if max_new_tokens not given ) repetition_penalty: Optional[float] = None return_full_text: Optional[ diff --git a/litellm/llms/replicate/chat/handler.py b/litellm/llms/replicate/chat/handler.py index 7991c61ee3..d954416381 100644 --- a/litellm/llms/replicate/chat/handler.py +++ b/litellm/llms/replicate/chat/handler.py @@ -4,6 +4,7 @@ import time from typing import Callable, List, Union import litellm +from litellm.constants import REPLICATE_POLLING_DELAY_SECONDS from litellm.llms.custom_httpx.http_handler import ( AsyncHTTPHandler, HTTPHandler, @@ -28,7 +29,9 @@ def handle_prediction_response_streaming( status = "" while True and (status not in ["succeeded", "failed", "canceled"]): - time.sleep(0.5) # prevent being rate limited by replicate + time.sleep( + REPLICATE_POLLING_DELAY_SECONDS + ) # prevent being rate limited by replicate print_verbose(f"replicate: polling endpoint: {prediction_url}") response = http_client.get(prediction_url, headers=headers) if response.status_code == 200: @@ -77,7 +80,9 @@ async def async_handle_prediction_response_streaming( status = "" while True and (status not in ["succeeded", "failed", "canceled"]): - await asyncio.sleep(0.5) # prevent being rate limited by replicate + await asyncio.sleep( + REPLICATE_POLLING_DELAY_SECONDS + ) # prevent being rate limited by replicate print_verbose(f"replicate: polling endpoint: {prediction_url}") response = await http_client.get(prediction_url, headers=headers) if response.status_code == 200: diff --git a/litellm/llms/replicate/chat/transformation.py b/litellm/llms/replicate/chat/transformation.py index d49350dea7..604e6eefe6 100644 --- a/litellm/llms/replicate/chat/transformation.py +++ b/litellm/llms/replicate/chat/transformation.py @@ -3,6 +3,7 @@ from typing import TYPE_CHECKING, Any, List, Optional, Union import httpx import litellm +from litellm.constants import REPLICATE_MODEL_NAME_WITH_ID_LENGTH from litellm.litellm_core_utils.prompt_templates.common_utils import ( convert_content_list_to_str, ) @@ -221,10 +222,11 @@ class ReplicateConfig(BaseConfig): version_id = self.model_to_version_id(model) request_data: dict = {"input": input_data} - if ":" in version_id and len(version_id) > 64: + if ":" in version_id and len(version_id) > REPLICATE_MODEL_NAME_WITH_ID_LENGTH: model_parts = version_id.split(":") if ( - len(model_parts) > 1 and len(model_parts[1]) == 64 + len(model_parts) > 1 + and len(model_parts[1]) == REPLICATE_MODEL_NAME_WITH_ID_LENGTH ): ## checks if model name has a 64 digit code - e.g. 
"meta/llama-2-70b-chat:02e509c789964a7ea8736978a43525956ef40397be9033abf9fd2badfe68c9e3" request_data["version"] = model_parts[1] diff --git a/litellm/llms/together_ai/cost_calculator.py b/litellm/llms/together_ai/cost_calculator.py index d3b0db8b89..a1be097bc8 100644 --- a/litellm/llms/together_ai/cost_calculator.py +++ b/litellm/llms/together_ai/cost_calculator.py @@ -4,6 +4,16 @@ Handles calculating cost for together ai models import re +from litellm.constants import ( + TOGETHER_AI_4_B, + TOGETHER_AI_8_B, + TOGETHER_AI_21_B, + TOGETHER_AI_41_B, + TOGETHER_AI_80_B, + TOGETHER_AI_110_B, + TOGETHER_AI_EMBEDDING_150_M, + TOGETHER_AI_EMBEDDING_350_M, +) from litellm.types.utils import CallTypes @@ -31,17 +41,17 @@ def get_model_params_and_category(model_name, call_type: CallTypes) -> str: else: return model_name # Determine the category based on the number of parameters - if params_billion <= 4.0: + if params_billion <= TOGETHER_AI_4_B: category = "together-ai-up-to-4b" - elif params_billion <= 8.0: + elif params_billion <= TOGETHER_AI_8_B: category = "together-ai-4.1b-8b" - elif params_billion <= 21.0: + elif params_billion <= TOGETHER_AI_21_B: category = "together-ai-8.1b-21b" - elif params_billion <= 41.0: + elif params_billion <= TOGETHER_AI_41_B: category = "together-ai-21.1b-41b" - elif params_billion <= 80.0: + elif params_billion <= TOGETHER_AI_80_B: category = "together-ai-41.1b-80b" - elif params_billion <= 110.0: + elif params_billion <= TOGETHER_AI_110_B: category = "together-ai-81.1b-110b" if category is not None: return category @@ -69,9 +79,9 @@ def get_model_params_and_category_embeddings(model_name) -> str: else: return model_name # Determine the category based on the number of parameters - if params_million <= 150: + if params_million <= TOGETHER_AI_EMBEDDING_150_M: category = "together-ai-embedding-up-to-150m" - elif params_million <= 350: + elif params_million <= TOGETHER_AI_EMBEDDING_350_M: category = "together-ai-embedding-151m-to-350m" if category is not None: return category diff --git a/litellm/llms/triton/completion/transformation.py b/litellm/llms/triton/completion/transformation.py index db0add6f35..49126917f2 100644 --- a/litellm/llms/triton/completion/transformation.py +++ b/litellm/llms/triton/completion/transformation.py @@ -7,6 +7,7 @@ from typing import Any, AsyncIterator, Dict, Iterator, List, Literal, Optional, from httpx import Headers, Response +from litellm.constants import DEFAULT_MAX_TOKENS_FOR_TRITON from litellm.litellm_core_utils.prompt_templates.factory import prompt_factory from litellm.llms.base_llm.base_model_iterator import BaseModelResponseIterator from litellm.llms.base_llm.chat.transformation import ( @@ -196,7 +197,9 @@ class TritonGenerateConfig(TritonConfig): data_for_triton: Dict[str, Any] = { "text_input": prompt_factory(model=model, messages=messages), "parameters": { - "max_tokens": int(optional_params.get("max_tokens", 2000)), + "max_tokens": int( + optional_params.get("max_tokens", DEFAULT_MAX_TOKENS_FOR_TRITON) + ), "bad_words": [""], "stop_words": [""], }, diff --git a/litellm/main.py b/litellm/main.py index 56b0aa3671..5d058c0c44 100644 --- a/litellm/main.py +++ b/litellm/main.py @@ -51,6 +51,10 @@ from litellm import ( # type: ignore get_litellm_params, get_optional_params, ) +from litellm.constants import ( + DEFAULT_MOCK_RESPONSE_COMPLETION_TOKEN_COUNT, + DEFAULT_MOCK_RESPONSE_PROMPT_TOKEN_COUNT, +) from litellm.exceptions import LiteLLMUnknownProvider from litellm.integrations.custom_logger import CustomLogger from 
litellm.litellm_core_utils.audio_utils.utils import get_audio_file_for_health_check @@ -740,7 +744,12 @@ def mock_completion( setattr( model_response, "usage", - Usage(prompt_tokens=10, completion_tokens=20, total_tokens=30), + Usage( + prompt_tokens=DEFAULT_MOCK_RESPONSE_PROMPT_TOKEN_COUNT, + completion_tokens=DEFAULT_MOCK_RESPONSE_COMPLETION_TOKEN_COUNT, + total_tokens=DEFAULT_MOCK_RESPONSE_PROMPT_TOKEN_COUNT + + DEFAULT_MOCK_RESPONSE_COMPLETION_TOKEN_COUNT, + ), ) try: @@ -3067,7 +3076,7 @@ def completion( # type: ignore # noqa: PLR0915 "max_tokens": max_tokens, "temperature": temperature, "top_p": top_p, - "top_k": kwargs.get("top_k", 40), + "top_k": kwargs.get("top_k"), }, }, ) diff --git a/litellm/proxy/auth/auth_checks.py b/litellm/proxy/auth/auth_checks.py index ddd1008bd0..1e0c8a4609 100644 --- a/litellm/proxy/auth/auth_checks.py +++ b/litellm/proxy/auth/auth_checks.py @@ -20,6 +20,7 @@ import litellm from litellm._logging import verbose_proxy_logger from litellm.caching.caching import DualCache from litellm.caching.dual_cache import LimitedSizeOrderedDict +from litellm.constants import DEFAULT_IN_MEMORY_TTL from litellm.litellm_core_utils.get_llm_provider_logic import get_llm_provider from litellm.proxy._types import ( RBAC_ROLES, @@ -55,7 +56,7 @@ else: last_db_access_time = LimitedSizeOrderedDict(max_size=100) -db_cache_expiry = 5 # refresh every 5s +db_cache_expiry = DEFAULT_IN_MEMORY_TTL # refresh every 5s all_routes = LiteLLMRoutes.openai_routes.value + LiteLLMRoutes.management_routes.value diff --git a/litellm/proxy/auth/litellm_license.py b/litellm/proxy/auth/litellm_license.py index d962aad2c0..936f372181 100644 --- a/litellm/proxy/auth/litellm_license.py +++ b/litellm/proxy/auth/litellm_license.py @@ -9,6 +9,7 @@ from typing import Optional import httpx from litellm._logging import verbose_proxy_logger +from litellm.constants import NON_LLM_CONNECTION_TIMEOUT from litellm.llms.custom_httpx.http_handler import HTTPHandler @@ -23,7 +24,7 @@ class LicenseCheck: def __init__(self) -> None: self.license_str = os.getenv("LITELLM_LICENSE", None) verbose_proxy_logger.debug("License Str value - {}".format(self.license_str)) - self.http_handler = HTTPHandler(timeout=15) + self.http_handler = HTTPHandler(timeout=NON_LLM_CONNECTION_TIMEOUT) self.public_key = None self.read_public_key() diff --git a/litellm/proxy/hooks/prompt_injection_detection.py b/litellm/proxy/hooks/prompt_injection_detection.py index b8fa8466a3..ee5d192555 100644 --- a/litellm/proxy/hooks/prompt_injection_detection.py +++ b/litellm/proxy/hooks/prompt_injection_detection.py @@ -15,6 +15,7 @@ from fastapi import HTTPException import litellm from litellm._logging import verbose_proxy_logger from litellm.caching.caching import DualCache +from litellm.constants import DEFAULT_PROMPT_INJECTION_SIMILARITY_THRESHOLD from litellm.integrations.custom_logger import CustomLogger from litellm.litellm_core_utils.prompt_templates.factory import ( prompt_injection_detection_default_pt, @@ -110,7 +111,9 @@ class _OPTIONAL_PromptInjectionDetection(CustomLogger): return combinations def check_user_input_similarity( - self, user_input: str, similarity_threshold: float = 0.7 + self, + user_input: str, + similarity_threshold: float = DEFAULT_PROMPT_INJECTION_SIMILARITY_THRESHOLD, ) -> bool: user_input_lower = user_input.lower() keywords = self.generate_injection_keywords() diff --git a/litellm/proxy/management_endpoints/key_management_endpoints.py b/litellm/proxy/management_endpoints/key_management_endpoints.py index 
b0bf1fb619..f78ac8744c 100644 --- a/litellm/proxy/management_endpoints/key_management_endpoints.py +++ b/litellm/proxy/management_endpoints/key_management_endpoints.py @@ -24,7 +24,7 @@ from fastapi import APIRouter, Depends, Header, HTTPException, Query, Request, s import litellm from litellm._logging import verbose_proxy_logger from litellm.caching import DualCache -from litellm.constants import UI_SESSION_TOKEN_TEAM_ID +from litellm.constants import LENGTH_OF_LITELLM_GENERATED_KEY, UI_SESSION_TOKEN_TEAM_ID from litellm.litellm_core_utils.duration_parser import duration_in_seconds from litellm.proxy._types import * from litellm.proxy.auth.auth_checks import ( @@ -1164,7 +1164,7 @@ async def generate_key_helper_fn( # noqa: PLR0915 if key is not None: token = key else: - token = f"sk-{secrets.token_urlsafe(16)}" + token = f"sk-{secrets.token_urlsafe(LENGTH_OF_LITELLM_GENERATED_KEY)}" if duration is None: # allow tokens that never expire expires = None @@ -1745,7 +1745,7 @@ async def regenerate_key_fn( verbose_proxy_logger.debug("key_in_db: %s", _key_in_db) - new_token = f"sk-{secrets.token_urlsafe(16)}" + new_token = f"sk-{secrets.token_urlsafe(LENGTH_OF_LITELLM_GENERATED_KEY)}" new_token_hash = hash_token(new_token) new_token_key_name = f"sk-...{new_token[-4:]}" diff --git a/litellm/proxy/pass_through_endpoints/llm_provider_handlers/assembly_passthrough_logging_handler.py b/litellm/proxy/pass_through_endpoints/llm_provider_handlers/assembly_passthrough_logging_handler.py index 7cf3013db0..cba558248d 100644 --- a/litellm/proxy/pass_through_endpoints/llm_provider_handlers/assembly_passthrough_logging_handler.py +++ b/litellm/proxy/pass_through_endpoints/llm_provider_handlers/assembly_passthrough_logging_handler.py @@ -15,6 +15,10 @@ from litellm.litellm_core_utils.litellm_logging import ( ) from litellm.litellm_core_utils.thread_pool_executor import executor from litellm.proxy.pass_through_endpoints.types import PassthroughStandardLoggingPayload +from litellm.types.passthrough_endpoints.assembly_ai import ( + ASSEMBLY_AI_MAX_POLLING_ATTEMPTS, + ASSEMBLY_AI_POLLING_INTERVAL, +) class AssemblyAITranscriptResponse(TypedDict, total=False): @@ -34,13 +38,13 @@ class AssemblyAIPassthroughLoggingHandler: The base URL for the AssemblyAI API """ - self.polling_interval: float = 10 + self.polling_interval: float = ASSEMBLY_AI_POLLING_INTERVAL """ The polling interval for the AssemblyAI API. litellm needs to poll the GET /transcript/{transcript_id} endpoint to get the status of the transcript. """ - self.max_polling_attempts = 180 + self.max_polling_attempts = ASSEMBLY_AI_MAX_POLLING_ATTEMPTS """ The maximum number of polling attempts for the AssemblyAI API. 
""" diff --git a/litellm/proxy/proxy_server.py b/litellm/proxy/proxy_server.py index d265f3bbca..100b0bf6db 100644 --- a/litellm/proxy/proxy_server.py +++ b/litellm/proxy/proxy_server.py @@ -25,7 +25,10 @@ from typing import ( get_type_hints, ) -from litellm.constants import DEFAULT_MAX_RECURSE_DEPTH +from litellm.constants import ( + DEFAULT_MAX_RECURSE_DEPTH, + DEFAULT_SLACK_ALERTING_THRESHOLD, +) from litellm.types.utils import ( ModelResponse, ModelResponseStream, @@ -118,7 +121,16 @@ import litellm from litellm import Router from litellm._logging import verbose_proxy_logger, verbose_router_logger from litellm.caching.caching import DualCache, RedisCache -from litellm.constants import LITELLM_PROXY_ADMIN_NAME +from litellm.constants import ( + DAYS_IN_A_MONTH, + DEFAULT_HEALTH_CHECK_INTERVAL, + DEFAULT_MODEL_CREATED_AT_TIME, + LITELLM_PROXY_ADMIN_NAME, + PROMETHEUS_FALLBACK_STATS_SEND_TIME_HOURS, + PROXY_BATCH_WRITE_AT, + PROXY_BUDGET_RESCHEDULER_MAX_TIME, + PROXY_BUDGET_RESCHEDULER_MIN_TIME, +) from litellm.exceptions import RejectedRequestError from litellm.integrations.SlackAlerting.slack_alerting import SlackAlerting from litellm.litellm_core_utils.core_helpers import ( @@ -287,7 +299,7 @@ from litellm.router import ( LiteLLM_Params, ModelGroupInfo, ) -from litellm.scheduler import DefaultPriorities, FlowItem, Scheduler +from litellm.scheduler import FlowItem, Scheduler from litellm.secret_managers.aws_secret_manager import load_aws_kms from litellm.secret_managers.google_kms import load_google_kms from litellm.secret_managers.main import ( @@ -307,6 +319,7 @@ from litellm.types.llms.openai import HttpxBinaryResponseContent from litellm.types.router import DeploymentTypedDict from litellm.types.router import ModelInfo as RouterModelInfo from litellm.types.router import RouterGeneralSettings, updateDeployment +from litellm.types.scheduler import DefaultPriorities from litellm.types.utils import CredentialItem, CustomHuggingfaceTokenizer from litellm.types.utils import ModelInfo as ModelMapInfo from litellm.types.utils import RawRequestTypedDict, StandardLoggingPayload @@ -779,9 +792,9 @@ queue: List = [] litellm_proxy_budget_name = "litellm-proxy-budget" litellm_proxy_admin_name = LITELLM_PROXY_ADMIN_NAME ui_access_mode: Literal["admin", "all"] = "all" -proxy_budget_rescheduler_min_time = 597 -proxy_budget_rescheduler_max_time = 605 -proxy_batch_write_at = 10 # in seconds +proxy_budget_rescheduler_min_time = PROXY_BUDGET_RESCHEDULER_MIN_TIME +proxy_budget_rescheduler_max_time = PROXY_BUDGET_RESCHEDULER_MAX_TIME +proxy_batch_write_at = PROXY_BATCH_WRITE_AT litellm_master_key_hash = None disable_spend_logs = False jwt_handler = JWTHandler() @@ -1846,7 +1859,9 @@ class ProxyConfig: use_background_health_checks = general_settings.get( "background_health_checks", False ) - health_check_interval = general_settings.get("health_check_interval", 300) + health_check_interval = general_settings.get( + "health_check_interval", DEFAULT_HEALTH_CHECK_INTERVAL + ) health_check_details = general_settings.get("health_check_details", True) ### RBAC ### @@ -3145,7 +3160,7 @@ class ProxyStartupEvent: scheduler.add_job( proxy_logging_obj.slack_alerting_instance.send_fallback_stats_from_prometheus, "cron", - hour=9, + hour=PROMETHEUS_FALLBACK_STATS_SEND_TIME_HOURS, minute=0, timezone=ZoneInfo("America/Los_Angeles"), # Pacific Time ) @@ -3278,7 +3293,7 @@ async def model_list( { "id": model, "object": "model", - "created": 1677610602, + "created": DEFAULT_MODEL_CREATED_AT_TIME, "owned_by": "openai", } for 
model in all_models @@ -5592,7 +5607,7 @@ async def model_metrics( param="None", code=status.HTTP_500_INTERNAL_SERVER_ERROR, ) - startTime = startTime or datetime.now() - timedelta(days=30) + startTime = startTime or datetime.now() - timedelta(days=DAYS_IN_A_MONTH) endTime = endTime or datetime.now() if api_key is None or api_key == "undefined": @@ -5713,11 +5728,12 @@ async def model_metrics_slow_responses( if customer is None or customer == "undefined": customer = "null" - startTime = startTime or datetime.now() - timedelta(days=30) + startTime = startTime or datetime.now() - timedelta(days=DAYS_IN_A_MONTH) endTime = endTime or datetime.now() alerting_threshold = ( - proxy_logging_obj.slack_alerting_instance.alerting_threshold or 300 + proxy_logging_obj.slack_alerting_instance.alerting_threshold + or DEFAULT_SLACK_ALERTING_THRESHOLD ) alerting_threshold = int(alerting_threshold) @@ -5797,7 +5813,7 @@ async def model_metrics_exceptions( code=status.HTTP_500_INTERNAL_SERVER_ERROR, ) - startTime = startTime or datetime.now() - timedelta(days=30) + startTime = startTime or datetime.now() - timedelta(days=DAYS_IN_A_MONTH) endTime = endTime or datetime.now() if api_key is None or api_key == "undefined": diff --git a/litellm/proxy/utils.py b/litellm/proxy/utils.py index eb733e7370..7831d42d81 100644 --- a/litellm/proxy/utils.py +++ b/litellm/proxy/utils.py @@ -22,6 +22,7 @@ from typing import ( overload, ) +from litellm.constants import MAX_TEAM_LIST_LIMIT from litellm.proxy._types import ( DB_CONNECTION_ERROR_TYPES, CommonProxyErrors, @@ -1596,7 +1597,9 @@ class PrismaClient: where={"team_id": {"in": team_id_list}} ) elif query_type == "find_all" and team_id_list is None: - response = await self.db.litellm_teamtable.find_many(take=20) + response = await self.db.litellm_teamtable.find_many( + take=MAX_TEAM_LIST_LIMIT + ) return response elif table_name == "user_notification": if query_type == "find_unique": diff --git a/litellm/router.py b/litellm/router.py index 78ad2afe1a..b0a04abcaa 100644 --- a/litellm/router.py +++ b/litellm/router.py @@ -50,6 +50,7 @@ from litellm.caching.caching import ( RedisCache, RedisClusterCache, ) +from litellm.constants import DEFAULT_MAX_LRU_CACHE_SIZE from litellm.integrations.custom_logger import CustomLogger from litellm.litellm_core_utils.asyncify import run_async_function from litellm.litellm_core_utils.core_helpers import _get_parent_otel_span_from_kwargs @@ -5073,7 +5074,7 @@ class Router: rpm_usage += t return tpm_usage, rpm_usage - @lru_cache(maxsize=64) + @lru_cache(maxsize=DEFAULT_MAX_LRU_CACHE_SIZE) def _cached_get_model_group_info( self, model_group: str ) -> Optional[ModelGroupInfo]: diff --git a/litellm/router_utils/handle_error.py b/litellm/router_utils/handle_error.py index c331da70ac..ba12e1cbed 100644 --- a/litellm/router_utils/handle_error.py +++ b/litellm/router_utils/handle_error.py @@ -1,6 +1,7 @@ from typing import TYPE_CHECKING, Any, Optional, Union from litellm._logging import verbose_router_logger +from litellm.constants import MAX_EXCEPTION_MESSAGE_LENGTH from litellm.router_utils.cooldown_handlers import ( _async_get_cooldown_deployments_with_debug_info, ) @@ -54,7 +55,7 @@ async def send_llm_exception_alert( exception_str = str(original_exception) if litellm_debug_info is not None: exception_str += litellm_debug_info - exception_str += f"\n\n{error_traceback_str[:2000]}" + exception_str += f"\n\n{error_traceback_str[:MAX_EXCEPTION_MESSAGE_LENGTH]}" await litellm_router_instance.slack_alerting_logger.send_alert( message=f"LLM API call 
failed: `{exception_str}`", diff --git a/litellm/scheduler.py b/litellm/scheduler.py index 23346e982a..3225ba0451 100644 --- a/litellm/scheduler.py +++ b/litellm/scheduler.py @@ -6,17 +6,14 @@ from pydantic import BaseModel from litellm import print_verbose from litellm.caching.caching import DualCache, RedisCache +from litellm.constants import DEFAULT_IN_MEMORY_TTL, DEFAULT_POLLING_INTERVAL class SchedulerCacheKeys(enum.Enum): queue = "scheduler:queue" - default_in_memory_ttl = 5 # cache queue in-memory for 5s when redis cache available - - -class DefaultPriorities(enum.Enum): - High = 0 - Medium = 128 - Low = 255 + default_in_memory_ttl = ( + DEFAULT_IN_MEMORY_TTL # cache queue in-memory for 5s when redis cache available + ) class FlowItem(BaseModel): @@ -44,7 +41,9 @@ class Scheduler: self.cache = DualCache( redis_cache=redis_cache, default_in_memory_ttl=default_in_memory_ttl ) - self.polling_interval = polling_interval or 0.03 # default to 3ms + self.polling_interval = ( + polling_interval or DEFAULT_POLLING_INTERVAL + ) # default to 3ms async def add_request(self, request: FlowItem): # We use the priority directly, as lower values indicate higher priority diff --git a/litellm/secret_managers/google_secret_manager.py b/litellm/secret_managers/google_secret_manager.py index f21963c38a..2fd35ced6e 100644 --- a/litellm/secret_managers/google_secret_manager.py +++ b/litellm/secret_managers/google_secret_manager.py @@ -5,6 +5,7 @@ from typing import Optional import litellm from litellm._logging import verbose_logger from litellm.caching.caching import InMemoryCache +from litellm.constants import SECRET_MANAGER_REFRESH_INTERVAL from litellm.integrations.gcs_bucket.gcs_bucket_base import GCSBucketBase from litellm.llms.custom_httpx.http_handler import _get_httpx_client from litellm.proxy._types import CommonProxyErrors, KeyManagementSystem @@ -13,7 +14,7 @@ from litellm.proxy._types import CommonProxyErrors, KeyManagementSystem class GoogleSecretManager(GCSBucketBase): def __init__( self, - refresh_interval: Optional[int] = 86400, + refresh_interval: Optional[int] = SECRET_MANAGER_REFRESH_INTERVAL, always_read_secret_manager: Optional[bool] = False, ) -> None: """ diff --git a/litellm/secret_managers/hashicorp_secret_manager.py b/litellm/secret_managers/hashicorp_secret_manager.py index e0b4a08ce8..e5911ffa9b 100644 --- a/litellm/secret_managers/hashicorp_secret_manager.py +++ b/litellm/secret_managers/hashicorp_secret_manager.py @@ -6,6 +6,7 @@ import httpx import litellm from litellm._logging import verbose_logger from litellm.caching import InMemoryCache +from litellm.constants import SECRET_MANAGER_REFRESH_INTERVAL from litellm.llms.custom_httpx.http_handler import ( _get_httpx_client, get_async_httpx_client, @@ -39,8 +40,14 @@ class HashicorpSecretManager(BaseSecretManager): litellm.secret_manager_client = self litellm._key_management_system = KeyManagementSystem.HASHICORP_VAULT - _refresh_interval = os.environ.get("HCP_VAULT_REFRESH_INTERVAL", 86400) - _refresh_interval = int(_refresh_interval) if _refresh_interval else 86400 + _refresh_interval = os.environ.get( + "HCP_VAULT_REFRESH_INTERVAL", SECRET_MANAGER_REFRESH_INTERVAL + ) + _refresh_interval = ( + int(_refresh_interval) + if _refresh_interval + else SECRET_MANAGER_REFRESH_INTERVAL + ) self.cache = InMemoryCache( default_ttl=_refresh_interval ) # store in memory for 1 day diff --git a/litellm/types/integrations/datadog.py b/litellm/types/integrations/datadog.py index 79d4eded47..7ea25561f9 100644 --- 
a/litellm/types/integrations/datadog.py +++ b/litellm/types/integrations/datadog.py @@ -1,6 +1,8 @@ from enum import Enum from typing import Optional, TypedDict +DD_MAX_BATCH_SIZE = 1000 + class DataDogStatus(str, Enum): INFO = "info" diff --git a/litellm/types/integrations/gcs_bucket.py b/litellm/types/integrations/gcs_bucket.py index a4fd8a6a11..9f5065ced2 100644 --- a/litellm/types/integrations/gcs_bucket.py +++ b/litellm/types/integrations/gcs_bucket.py @@ -8,6 +8,10 @@ else: VertexBase = Any +GCS_DEFAULT_BATCH_SIZE = 2048 +GCS_DEFAULT_FLUSH_INTERVAL_SECONDS = 20 + + class GCSLoggingConfig(TypedDict): """ Internal LiteLLM Config for GCS Bucket logging diff --git a/litellm/types/integrations/slack_alerting.py b/litellm/types/integrations/slack_alerting.py index 9019b098d9..052fd05ea8 100644 --- a/litellm/types/integrations/slack_alerting.py +++ b/litellm/types/integrations/slack_alerting.py @@ -7,6 +7,9 @@ from pydantic import BaseModel, Field from litellm.types.utils import LiteLLMPydanticObjectBase +SLACK_ALERTING_THRESHOLD_5_PERCENT = 0.05 +SLACK_ALERTING_THRESHOLD_15_PERCENT = 0.15 + class BaseOutageModel(TypedDict): alerts: List[int] diff --git a/litellm/types/llms/azure.py b/litellm/types/llms/azure.py new file mode 100644 index 0000000000..36c4258abd --- /dev/null +++ b/litellm/types/llms/azure.py @@ -0,0 +1,2 @@ +API_VERSION_YEAR_SUPPORTED_RESPONSE_FORMAT = 2024 +API_VERSION_MONTH_SUPPORTED_RESPONSE_FORMAT = 8 diff --git a/litellm/types/llms/triton.py b/litellm/types/llms/triton.py new file mode 100644 index 0000000000..8b13789179 --- /dev/null +++ b/litellm/types/llms/triton.py @@ -0,0 +1 @@ + diff --git a/litellm/types/passthrough_endpoints/assembly_ai.py b/litellm/types/passthrough_endpoints/assembly_ai.py new file mode 100644 index 0000000000..91b7273a48 --- /dev/null +++ b/litellm/types/passthrough_endpoints/assembly_ai.py @@ -0,0 +1,2 @@ +ASSEMBLY_AI_POLLING_INTERVAL = 10 +ASSEMBLY_AI_MAX_POLLING_ATTEMPTS = 180 diff --git a/litellm/types/scheduler.py b/litellm/types/scheduler.py new file mode 100644 index 0000000000..1b2073f257 --- /dev/null +++ b/litellm/types/scheduler.py @@ -0,0 +1,7 @@ +from enum import Enum + + +class DefaultPriorities(Enum): + High = 0 + Medium = 128 + Low = 255 diff --git a/litellm/utils.py b/litellm/utils.py index 4283cf2df1..cdee0abcd7 100644 --- a/litellm/utils.py +++ b/litellm/utils.py @@ -62,6 +62,16 @@ import litellm.llms.gemini from litellm.caching._internal_lru_cache import lru_cache_wrapper from litellm.caching.caching import DualCache from litellm.caching.caching_handler import CachingHandlerResponse, LLMCachingHandler +from litellm.constants import ( + DEFAULT_MAX_LRU_CACHE_SIZE, + DEFAULT_TRIM_RATIO, + FUNCTION_DEFINITION_TOKEN_COUNT, + INITIAL_RETRY_DELAY, + JITTER, + MAX_RETRY_DELAY, + MINIMUM_PROMPT_CACHE_TOKEN_COUNT, + TOOL_CHOICE_OBJECT_TOKEN_COUNT, +) from litellm.integrations.custom_guardrail import CustomGuardrail from litellm.integrations.custom_logger import CustomLogger from litellm.litellm_core_utils.core_helpers import ( @@ -1520,7 +1530,7 @@ def _select_tokenizer( return _select_tokenizer_helper(model=model) -@lru_cache(maxsize=128) +@lru_cache(maxsize=DEFAULT_MAX_LRU_CACHE_SIZE) def _select_tokenizer_helper(model: str) -> SelectTokenizerResponse: if litellm.disable_hf_tokenizer_download is True: return _return_openai_tokenizer(model) @@ -5336,15 +5346,15 @@ def _calculate_retry_after( if retry_after is not None and 0 < retry_after <= 60: return retry_after - initial_retry_delay = 0.5 - max_retry_delay = 8.0 + 
initial_retry_delay = INITIAL_RETRY_DELAY + max_retry_delay = MAX_RETRY_DELAY nb_retries = max_retries - remaining_retries # Apply exponential backoff, but not more than the max. sleep_seconds = min(initial_retry_delay * pow(2.0, nb_retries), max_retry_delay) # Apply some jitter, plus-or-minus half a second. - jitter = 1 - 0.25 * random.random() + jitter = JITTER * random.random() timeout = sleep_seconds * jitter return timeout if timeout >= min_timeout else min_timeout @@ -5670,7 +5680,7 @@ def shorten_message_to_fit_limit(message, tokens_needed, model: Optional[str]): def trim_messages( messages, model: Optional[str] = None, - trim_ratio: float = 0.75, + trim_ratio: float = DEFAULT_TRIM_RATIO, return_response_tokens: bool = False, max_tokens=None, ): @@ -6543,7 +6553,7 @@ def is_prompt_caching_valid_prompt( model=model, use_default_image_token_count=True, ) - return token_count >= 1024 + return token_count >= MINIMUM_PROMPT_CACHE_TOKEN_COUNT except Exception as e: verbose_logger.error(f"Error in is_prompt_caching_valid_prompt: {e}") return False diff --git a/mypy.ini b/mypy.ini index 3ce8c5fcc0..bb0e9ec871 100644 --- a/mypy.ini +++ b/mypy.ini @@ -3,6 +3,7 @@ warn_return_any = False ignore_missing_imports = True mypy_path = litellm/stubs namespace_packages = True +disable_error_code = valid-type [mypy-google.*] ignore_missing_imports = True diff --git a/tests/code_coverage_tests/ban_constant_numbers.py b/tests/code_coverage_tests/ban_constant_numbers.py new file mode 100644 index 0000000000..c23b338086 --- /dev/null +++ b/tests/code_coverage_tests/ban_constant_numbers.py @@ -0,0 +1,152 @@ +import sys +import ast +import os + +# Extremely restrictive set of allowed numbers +ALLOWED_NUMBERS = { + 0, + 1, + -1, + 2, + 10, + 100, + 1000, + 4, + 3, + 500, + 6, + 60, + 3600, + 0.75, + 7, + 1024, + 1011, + 600, + 12, + 1000000000.0, + 0.1, + 50, + 128, + 6000, + 30, + 1000000, + 5, + 15, + 25, + 10000, + 60000, + 8, + 2048, + 16000000000, + 16, + 16383, + 14, + 24, + 128000, + 0.01, + 20, +} + +# Add all standard HTTP status codes +HTTP_STATUS_CODES = { + 200, # OK + 201, # Created + 202, # Accepted + 204, # No Content + 300, # Multiple Choices + 301, # Moved Permanently + 302, # Found + 303, # See Other + 304, # Not Modified + 307, # Temporary Redirect + 308, # Permanent Redirect + 400, # Bad Request + 401, # Unauthorized + 402, # Payment Required + 403, # Forbidden + 404, # Not Found + 406, # Not Acceptable + 408, # Request Timeout + 409, # Conflict + 413, # Payload Too Large + 422, # Unprocessable Entity + 424, # Failed Dependency + 429, # Too Many Requests + 498, # Invalid Token + 499, # Client Closed Request + 500, # Internal Server Error + 501, # Not Implemented + 502, # Bad Gateway + 503, # Service Unavailable + 504, # Gateway Timeout + 520, # Web server is returning an unknown error + 522, # Connection timed out + 524, # A timeout occurred + 529, # Site is overloaded +} + +# Combine the sets +ALLOWED_NUMBERS = ALLOWED_NUMBERS.union(HTTP_STATUS_CODES) + + +class HardcodedNumberFinder(ast.NodeVisitor): + def __init__(self): + self.hardcoded_numbers = [] + + def visit_Constant(self, node): + # For Python 3.8+ + if isinstance(node.value, (int, float)) and node.value not in ALLOWED_NUMBERS: + self.hardcoded_numbers.append((node.lineno, node.value)) + self.generic_visit(node) + + def visit_Num(self, node): + # For older Python versions + if node.n not in ALLOWED_NUMBERS: + self.hardcoded_numbers.append((node.lineno, node.n)) + self.generic_visit(node) + + +def check_file(filename): + try: + 
with open(filename, "r") as f: + content = f.read() + + tree = ast.parse(content) + finder = HardcodedNumberFinder() + finder.visit(tree) + + if finder.hardcoded_numbers: + print(f"ERROR in {filename}: Hardcoded numbers detected:") + for line, value in finder.hardcoded_numbers: + print(f" Line {line}: {value}") + return 1 + return 0 + except SyntaxError: + print(f"Syntax error in {filename}") + return 0 + + +def main(): + exit_code = 0 + folder = "../../litellm" + ignore_files = [ + "constants.py", + "proxy_cli.py", + "token_counter.py", + "mock_functions.py", + "duration_parser.py", + "utils.py", + ] + ignore_folder = "types" + for root, dirs, files in os.walk(folder): + for filename in files: + if filename.endswith(".py") and filename not in ignore_files: + full_path = os.path.join(root, filename) + if ignore_folder in full_path: + continue + exit_code |= check_file(full_path) + sys.exit(exit_code) + + +if __name__ == "__main__": + main() diff --git a/tests/code_coverage_tests/log.txt b/tests/code_coverage_tests/log.txt new file mode 100644 index 0000000000..e69de29bb2
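Note: the new tests/code_coverage_tests/ban_constant_numbers.py above walks every litellm Python file with an ast.NodeVisitor and flags numeric literals that are not in an allow-list. A minimal standalone sketch of the same AST-based idea, assuming an illustrative allow-list and sample source string (the explicit bool guard is an extra refinement for clarity, not part of the PR's checker):

import ast

# Illustrative allow-list; the PR's ALLOWED_NUMBERS set is much larger.
ALLOWED = {0, 1, -1, 2}


class MagicNumberFinder(ast.NodeVisitor):
    def __init__(self):
        self.hits = []

    def visit_Constant(self, node):
        # Flag int/float literals outside the allow-list.
        # bool is a subclass of int, so True/False are excluded explicitly here.
        if (
            isinstance(node.value, (int, float))
            and not isinstance(node.value, bool)
            and node.value not in ALLOWED
        ):
            self.hits.append((node.lineno, node.value))
        self.generic_visit(node)


source = "timeout = 5\nretries = 1\n"
finder = MagicNumberFinder()
finder.visit(ast.parse(source))
print(finder.hits)  # [(1, 5)] -> the hardcoded 5 should become a named constant

Running this against a file instead of a string only requires reading the file and calling ast.parse on its contents, which is what check_file() above does before OR-ing each per-file result into the process exit code.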