Mirror of https://github.com/BerriAI/litellm.git (synced 2025-04-24 18:24:20 +00:00)
Squashed commit of the following: (#9709)
commit b12a9892b7
Author: Krrish Dholakia <krrishdholakia@gmail.com>
Date:   Wed Apr 2 08:09:56 2025 -0700

    fix(utils.py): don't modify openai_token_counter

commit 294de31803
Author: Krrish Dholakia <krrishdholakia@gmail.com>
Date:   Mon Mar 24 21:22:40 2025 -0700

    fix: fix linting error

commit cb6e9fbe40
Author: Krrish Dholakia <krrishdholakia@gmail.com>
Date:   Mon Mar 24 19:52:45 2025 -0700

    refactor: complete migration

commit bfc159172d
Author: Krrish Dholakia <krrishdholakia@gmail.com>
Date:   Mon Mar 24 19:09:59 2025 -0700

    refactor: refactor more constants

commit 43ffb6a558
Author: Krrish Dholakia <krrishdholakia@gmail.com>
Date:   Mon Mar 24 18:45:24 2025 -0700

    fix: test

commit 04dbe4310c
Author: Krrish Dholakia <krrishdholakia@gmail.com>
Date:   Mon Mar 24 18:28:58 2025 -0700

    refactor: refactor: move more constants into constants.py

commit 3c26284aff
Author: Krrish Dholakia <krrishdholakia@gmail.com>
Date:   Mon Mar 24 18:14:46 2025 -0700

    refactor: migrate hardcoded constants out of __init__.py

commit c11e0de69d
Author: Krrish Dholakia <krrishdholakia@gmail.com>
Date:   Mon Mar 24 18:11:21 2025 -0700

    build: initial test banning hardcoded numbers in repo
parent 5a722ef18f
commit 8ee32291e0
51 changed files with 509 additions and 118 deletions
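Taken together, the series is a mechanical refactor: numeric literals scattered across the codebase are collected as named constants in litellm/constants.py and imported at each call site, and the first commit adds an initial check intended to discourage new hardcoded numbers. Below is a minimal, self-contained sketch of the pattern; the constant names mirror values added in this diff, but the module layout and the helper function are illustrative, not code from the commit. The diff that follows applies the same pattern file by file.

    # Sketch only: in the repo these names live in litellm/constants.py and are
    # imported where the magic numbers used to be inlined.
    DAYS_IN_A_WEEK = 7
    DAYS_IN_A_MONTH = 28
    DAYS_IN_A_YEAR = 365
    HOURS_IN_A_DAY = 24


    def budget_duration_in_seconds(duration: str) -> int:
        # Before the refactor, call sites spelled out 7 / 28 / 365 and 24 directly.
        duration_in_days = {
            "daily": 1,
            "weekly": DAYS_IN_A_WEEK,
            "monthly": DAYS_IN_A_MONTH,
            "yearly": DAYS_IN_A_YEAR,
        }[duration]
        return duration_in_days * HOURS_IN_A_DAY * 60 * 60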
@@ -56,6 +56,9 @@ from litellm.constants import (
     bedrock_embedding_models,
     known_tokenizer_config,
     BEDROCK_INVOKE_PROVIDERS_LITERAL,
+    DEFAULT_MAX_TOKENS,
+    DEFAULT_SOFT_BUDGET,
+    DEFAULT_ALLOWED_FAILS,
 )
 from litellm.types.guardrails import GuardrailItem
 from litellm.proxy._types import (
@@ -155,7 +158,7 @@ token: Optional[
     str
 ] = None  # Not used anymore, will be removed in next MAJOR release - https://github.com/BerriAI/litellm/discussions/648
 telemetry = True
-max_tokens = 256  # OpenAI Defaults
+max_tokens: int = DEFAULT_MAX_TOKENS  # OpenAI Defaults
 drop_params = bool(os.getenv("LITELLM_DROP_PARAMS", False))
 modify_params = False
 retry = True
@@ -244,7 +247,7 @@ budget_duration: Optional[
     str
 ] = None  # proxy only - resets budget after fixed duration. You can set duration as seconds ("30s"), minutes ("30m"), hours ("30h"), days ("30d").
 default_soft_budget: float = (
-    50.0  # by default all litellm proxy keys have a soft budget of 50.0
+    DEFAULT_SOFT_BUDGET  # by default all litellm proxy keys have a soft budget of 50.0
 )
 forward_traceparent_to_llm_provider: bool = False
@@ -18,6 +18,7 @@ import redis  # type: ignore
 import redis.asyncio as async_redis  # type: ignore
 
 from litellm import get_secret, get_secret_str
+from litellm.constants import REDIS_CONNECTION_POOL_TIMEOUT, REDIS_SOCKET_TIMEOUT
 
 from ._logging import verbose_logger
 
@@ -215,7 +216,7 @@ def _init_redis_sentinel(redis_kwargs) -> redis.Redis:
     # Set up the Sentinel client
     sentinel = redis.Sentinel(
         sentinel_nodes,
-        socket_timeout=0.1,
+        socket_timeout=REDIS_SOCKET_TIMEOUT,
         password=sentinel_password,
     )
 
@@ -239,7 +240,7 @@ def _init_async_redis_sentinel(redis_kwargs) -> async_redis.Redis:
     # Set up the Sentinel client
     sentinel = async_redis.Sentinel(
         sentinel_nodes,
-        socket_timeout=0.1,
+        socket_timeout=REDIS_SOCKET_TIMEOUT,
         password=sentinel_password,
     )
 
@@ -319,7 +320,7 @@ def get_redis_connection_pool(**env_overrides):
     verbose_logger.debug("get_redis_connection_pool: redis_kwargs", redis_kwargs)
     if "url" in redis_kwargs and redis_kwargs["url"] is not None:
         return async_redis.BlockingConnectionPool.from_url(
-            timeout=5, url=redis_kwargs["url"]
+            timeout=REDIS_CONNECTION_POOL_TIMEOUT, url=redis_kwargs["url"]
         )
     connection_class = async_redis.Connection
     if "ssl" in redis_kwargs:
@@ -327,4 +328,6 @@ def get_redis_connection_pool(**env_overrides):
     redis_kwargs.pop("ssl", None)
     redis_kwargs["connection_class"] = connection_class
     redis_kwargs.pop("startup_nodes", None)
-    return async_redis.BlockingConnectionPool(timeout=5, **redis_kwargs)
+    return async_redis.BlockingConnectionPool(
+        timeout=REDIS_CONNECTION_POOL_TIMEOUT, **redis_kwargs
+    )
@@ -14,6 +14,12 @@ import time
 from typing import Literal, Optional
 
 import litellm
+from litellm.constants import (
+    DAYS_IN_A_MONTH,
+    DAYS_IN_A_WEEK,
+    DAYS_IN_A_YEAR,
+    HOURS_IN_A_DAY,
+)
 from litellm.utils import ModelResponse
 
 
@@ -81,11 +87,11 @@ class BudgetManager:
         if duration == "daily":
             duration_in_days = 1
         elif duration == "weekly":
-            duration_in_days = 7
+            duration_in_days = DAYS_IN_A_WEEK
         elif duration == "monthly":
-            duration_in_days = 28
+            duration_in_days = DAYS_IN_A_MONTH
         elif duration == "yearly":
-            duration_in_days = 365
+            duration_in_days = DAYS_IN_A_YEAR
         else:
             raise ValueError(
                 """duration needs to be one of ["daily", "weekly", "monthly", "yearly"]"""
@@ -182,7 +188,9 @@ class BudgetManager:
         current_time = time.time()
 
         # Convert duration from days to seconds
-        duration_in_seconds = self.user_dict[user]["duration"] * 24 * 60 * 60
+        duration_in_seconds = (
+            self.user_dict[user]["duration"] * HOURS_IN_A_DAY * 60 * 60
+        )
 
         # Check if duration has elapsed
         if current_time - last_updated_at >= duration_in_seconds:
@@ -19,6 +19,7 @@ from pydantic import BaseModel
 
 import litellm
 from litellm._logging import verbose_logger
+from litellm.constants import CACHED_STREAMING_CHUNK_DELAY
 from litellm.litellm_core_utils.model_param_helper import ModelParamHelper
 from litellm.types.caching import *
 from litellm.types.utils import all_litellm_params
@@ -406,7 +407,7 @@ class Cache:
                     }
                 ]
             }
-            time.sleep(0.02)
+            time.sleep(CACHED_STREAMING_CHUNK_DELAY)
 
     def _get_cache_logic(
         self,
@@ -15,7 +15,8 @@ from typing import Any, List, Optional
 
 from pydantic import BaseModel
 
-from ..constants import MAX_SIZE_PER_ITEM_IN_MEMORY_CACHE_IN_KB
+from litellm.constants import MAX_SIZE_PER_ITEM_IN_MEMORY_CACHE_IN_KB
+
 from .base_cache import BaseCache
 
 
@@ -52,7 +53,8 @@ class InMemoryCache(BaseCache):
         # Fast path for common primitive types that are typically small
         if (
             isinstance(value, (bool, int, float, str))
-            and len(str(value)) < self.max_size_per_item * 512
+            and len(str(value))
+            < self.max_size_per_item * MAX_SIZE_PER_ITEM_IN_MEMORY_CACHE_IN_KB
         ):  # Conservative estimate
             return True
 
@@ -11,10 +11,12 @@ Has 4 methods:
 import ast
 import asyncio
 import json
-from typing import Any
+from typing import Any, cast
 
 import litellm
 from litellm._logging import print_verbose
+from litellm.constants import QDRANT_SCALAR_QUANTILE, QDRANT_VECTOR_SIZE
+from litellm.types.utils import EmbeddingResponse
 
 from .base_cache import BaseCache
 
@@ -118,7 +120,11 @@ class QdrantSemanticCache(BaseCache):
             }
         elif quantization_config == "scalar":
             quantization_params = {
-                "scalar": {"type": "int8", "quantile": 0.99, "always_ram": False}
+                "scalar": {
+                    "type": "int8",
+                    "quantile": QDRANT_SCALAR_QUANTILE,
+                    "always_ram": False,
+                }
             }
         elif quantization_config == "product":
             quantization_params = {
@@ -132,7 +138,7 @@ class QdrantSemanticCache(BaseCache):
         new_collection_status = self.sync_client.put(
             url=f"{self.qdrant_api_base}/collections/{self.collection_name}",
             json={
-                "vectors": {"size": 1536, "distance": "Cosine"},
+                "vectors": {"size": QDRANT_VECTOR_SIZE, "distance": "Cosine"},
                 "quantization_config": quantization_params,
             },
             headers=self.headers,
@@ -171,10 +177,13 @@ class QdrantSemanticCache(BaseCache):
             prompt += message["content"]
 
         # create an embedding for prompt
-        embedding_response = litellm.embedding(
-            model=self.embedding_model,
-            input=prompt,
-            cache={"no-store": True, "no-cache": True},
+        embedding_response = cast(
+            EmbeddingResponse,
+            litellm.embedding(
+                model=self.embedding_model,
+                input=prompt,
+                cache={"no-store": True, "no-cache": True},
+            ),
         )
 
         # get the embedding
@@ -212,10 +221,13 @@ class QdrantSemanticCache(BaseCache):
             prompt += message["content"]
 
         # convert to embedding
-        embedding_response = litellm.embedding(
-            model=self.embedding_model,
-            input=prompt,
-            cache={"no-store": True, "no-cache": True},
+        embedding_response = cast(
+            EmbeddingResponse,
+            litellm.embedding(
+                model=self.embedding_model,
+                input=prompt,
+                cache={"no-store": True, "no-cache": True},
+            ),
         )
 
         # get the embedding
@@ -9,6 +9,7 @@ DEFAULT_FAILURE_THRESHOLD_PERCENT = (
     0.5  # default cooldown a deployment if 50% of requests fail in a given minute
 )
 DEFAULT_MAX_TOKENS = 4096
+DEFAULT_ALLOWED_FAILS = 3
 DEFAULT_REDIS_SYNC_INTERVAL = 1
 DEFAULT_COOLDOWN_TIME_SECONDS = 5
 DEFAULT_REPLICATE_POLLING_RETRIES = 5
@@ -16,16 +17,71 @@ DEFAULT_REPLICATE_POLLING_DELAY_SECONDS = 1
 DEFAULT_IMAGE_TOKEN_COUNT = 250
 DEFAULT_IMAGE_WIDTH = 300
 DEFAULT_IMAGE_HEIGHT = 300
+DEFAULT_MAX_TOKENS = 256  # used when providers need a default
 MAX_SIZE_PER_ITEM_IN_MEMORY_CACHE_IN_KB = 1024  # 1MB = 1024KB
 SINGLE_DEPLOYMENT_TRAFFIC_FAILURE_THRESHOLD = 1000  # Minimum number of requests to consider "reasonable traffic". Used for single-deployment cooldown logic.
 REDIS_UPDATE_BUFFER_KEY = "litellm_spend_update_buffer"
 REDIS_DAILY_SPEND_UPDATE_BUFFER_KEY = "litellm_daily_spend_update_buffer"
 MAX_REDIS_BUFFER_DEQUEUE_COUNT = 100
+MINIMUM_PROMPT_CACHE_TOKEN_COUNT = (
+    1024  # minimum number of tokens to cache a prompt by Anthropic
+)
+DEFAULT_TRIM_RATIO = 0.75  # default ratio of tokens to trim from the end of a prompt
+HOURS_IN_A_DAY = 24
+DAYS_IN_A_WEEK = 7
+DAYS_IN_A_MONTH = 28
+DAYS_IN_A_YEAR = 365
+REPLICATE_MODEL_NAME_WITH_ID_LENGTH = 64
+#### TOKEN COUNTING ####
+FUNCTION_DEFINITION_TOKEN_COUNT = 9
+SYSTEM_MESSAGE_TOKEN_COUNT = 4
+TOOL_CHOICE_OBJECT_TOKEN_COUNT = 4
+DEFAULT_MOCK_RESPONSE_PROMPT_TOKEN_COUNT = 10
+DEFAULT_MOCK_RESPONSE_COMPLETION_TOKEN_COUNT = 20
+MAX_SHORT_SIDE_FOR_IMAGE_HIGH_RES = 768
+MAX_LONG_SIDE_FOR_IMAGE_HIGH_RES = 2000
+MAX_TILE_WIDTH = 512
+MAX_TILE_HEIGHT = 512
+OPENAI_FILE_SEARCH_COST_PER_1K_CALLS = 2.5 / 1000
+MIN_NON_ZERO_TEMPERATURE = 0.0001
 #### RELIABILITY ####
 REPEATED_STREAMING_CHUNK_LIMIT = 100  # catch if model starts looping the same chunk while streaming. Uses high default to prevent false positives.
+DEFAULT_MAX_LRU_CACHE_SIZE = 16
+INITIAL_RETRY_DELAY = 0.5
+MAX_RETRY_DELAY = 8.0
+JITTER = 0.75
+DEFAULT_IN_MEMORY_TTL = 5  # default time to live for the in-memory cache
+DEFAULT_POLLING_INTERVAL = 0.03  # default polling interval for the scheduler
+AZURE_OPERATION_POLLING_TIMEOUT = 120
+REDIS_SOCKET_TIMEOUT = 0.1
+REDIS_CONNECTION_POOL_TIMEOUT = 5
+NON_LLM_CONNECTION_TIMEOUT = 15  # timeout for adjacent services (e.g. jwt auth)
+MAX_EXCEPTION_MESSAGE_LENGTH = 2000
+BEDROCK_MAX_POLICY_SIZE = 75
+REPLICATE_POLLING_DELAY_SECONDS = 0.5
+DEFAULT_ANTHROPIC_CHAT_MAX_TOKENS = 4096
+TOGETHER_AI_4_B = 4
+TOGETHER_AI_8_B = 8
+TOGETHER_AI_21_B = 21
+TOGETHER_AI_41_B = 41
+TOGETHER_AI_80_B = 80
+TOGETHER_AI_110_B = 110
+TOGETHER_AI_EMBEDDING_150_M = 150
+TOGETHER_AI_EMBEDDING_350_M = 350
+QDRANT_SCALAR_QUANTILE = 0.99
+QDRANT_VECTOR_SIZE = 1536
+CACHED_STREAMING_CHUNK_DELAY = 0.02
+MAX_SIZE_PER_ITEM_IN_MEMORY_CACHE_IN_KB = 512
+DEFAULT_MAX_TOKENS_FOR_TRITON = 2000
 #### Networking settings ####
 request_timeout: float = 6000  # time in seconds
 STREAM_SSE_DONE_STRING: str = "[DONE]"
+### SPEND TRACKING ###
+DEFAULT_REPLICATE_GPU_PRICE_PER_SECOND = 0.001400  # price per second for a100 80GB
+FIREWORKS_AI_56_B_MOE = 56
+FIREWORKS_AI_176_B_MOE = 176
+FIREWORKS_AI_16_B = 16
+FIREWORKS_AI_80_B = 80
 
 LITELLM_CHAT_PROVIDERS = [
     "openai",
@@ -426,6 +482,9 @@ MCP_TOOL_NAME_PREFIX = "mcp_tool"
 MAX_SPENDLOG_ROWS_TO_QUERY = (
     1_000_000  # if spendLogs has more than 1M rows, do not query the DB
 )
+DEFAULT_SOFT_BUDGET = (
+    50.0  # by default all litellm proxy keys have a soft budget of 50.0
+)
 # makes it clear this is a rate limit error for a litellm virtual key
 RATE_LIMIT_ERROR_MESSAGE_FOR_VIRTUAL_KEY = "LiteLLM Virtual Key user_api_key_hash"
 
@@ -451,3 +510,14 @@ LITELLM_PROXY_ADMIN_NAME = "default_user_id"
 ########################### DB CRON JOB NAMES ###########################
 DB_SPEND_UPDATE_JOB_NAME = "db_spend_update_job"
 DEFAULT_CRON_JOB_LOCK_TTL_SECONDS = 60  # 1 minute
+PROXY_BUDGET_RESCHEDULER_MIN_TIME = 597
+PROXY_BUDGET_RESCHEDULER_MAX_TIME = 605
+PROXY_BATCH_WRITE_AT = 10  # in seconds
+DEFAULT_HEALTH_CHECK_INTERVAL = 300  # 5 minutes
+PROMETHEUS_FALLBACK_STATS_SEND_TIME_HOURS = 9
+DEFAULT_MODEL_CREATED_AT_TIME = 1677610602  # returns on `/models` endpoint
+DEFAULT_SLACK_ALERTING_THRESHOLD = 300
+MAX_TEAM_LIST_LIMIT = 20
+DEFAULT_PROMPT_INJECTION_SIMILARITY_THRESHOLD = 0.7
+LENGTH_OF_LITELLM_GENERATED_KEY = 16
+SECRET_MANAGER_REFRESH_INTERVAL = 86400
@@ -9,6 +9,10 @@ from pydantic import BaseModel
 import litellm
 import litellm._logging
 from litellm import verbose_logger
+from litellm.constants import (
+    DEFAULT_MAX_LRU_CACHE_SIZE,
+    DEFAULT_REPLICATE_GPU_PRICE_PER_SECOND,
+)
 from litellm.litellm_core_utils.llm_cost_calc.tool_call_cost_tracking import (
     StandardBuiltInToolCostTracking,
 )
@@ -355,9 +359,7 @@ def cost_per_token(  # noqa: PLR0915
 def get_replicate_completion_pricing(completion_response: dict, total_time=0.0):
     # see https://replicate.com/pricing
     # for all litellm currently supported LLMs, almost all requests go to a100_80gb
-    a100_80gb_price_per_second_public = (
-        0.001400  # assume all calls sent to A100 80GB for now
-    )
+    a100_80gb_price_per_second_public = DEFAULT_REPLICATE_GPU_PRICE_PER_SECOND  # assume all calls sent to A100 80GB for now
     if total_time == 0.0:  # total time is in ms
         start_time = completion_response.get("created", time.time())
         end_time = getattr(completion_response, "ended", time.time())
@@ -450,7 +452,7 @@ def _select_model_name_for_cost_calc(
     return return_model
 
 
-@lru_cache(maxsize=16)
+@lru_cache(maxsize=DEFAULT_MAX_LRU_CACHE_SIZE)
 def _model_contains_known_llm_provider(model: str) -> bool:
     """
     Check if the model contains a known llm provider
@@ -16,6 +16,7 @@ import litellm.litellm_core_utils.litellm_logging
 import litellm.types
 from litellm._logging import verbose_logger, verbose_proxy_logger
 from litellm.caching.caching import DualCache
+from litellm.constants import HOURS_IN_A_DAY
 from litellm.integrations.custom_batch_logger import CustomBatchLogger
 from litellm.litellm_core_utils.duration_parser import duration_in_seconds
 from litellm.litellm_core_utils.exception_mapping_utils import (
@@ -649,10 +650,10 @@ class SlackAlerting(CustomBatchLogger):
             event_message += (
                 f"Budget Crossed\n Total Budget:`{user_info.max_budget}`"
             )
-        elif percent_left <= 0.05:
+        elif percent_left <= SLACK_ALERTING_THRESHOLD_5_PERCENT:
             event = "threshold_crossed"
             event_message += "5% Threshold Crossed "
-        elif percent_left <= 0.15:
+        elif percent_left <= SLACK_ALERTING_THRESHOLD_15_PERCENT:
             event = "threshold_crossed"
             event_message += "15% Threshold Crossed"
         elif user_info.soft_budget is not None:
@@ -1718,7 +1719,7 @@ Model Info:
             await self.internal_usage_cache.async_set_cache(
                 key=_event_cache_key,
                 value="SENT",
-                ttl=(30 * 24 * 60 * 60),  # 1 month
+                ttl=(30 * HOURS_IN_A_DAY * 60 * 60),  # 1 month
             )
 
         except Exception as e:
@@ -41,7 +41,7 @@ from litellm.types.utils import StandardLoggingPayload
 from ..additional_logging_utils import AdditionalLoggingUtils
 
 # max number of logs DD API can accept
-DD_MAX_BATCH_SIZE = 1000
 
 # specify what ServiceTypes are logged as success events to DD. (We don't want to spam DD traces with large number of service types)
 DD_LOGGED_SUCCESS_SERVICE_TYPES = [
@@ -20,10 +20,6 @@ else:
     VertexBase = Any
 
 
-GCS_DEFAULT_BATCH_SIZE = 2048
-GCS_DEFAULT_FLUSH_INTERVAL_SECONDS = 20
-
-
 class GCSBucketLogger(GCSBucketBase, AdditionalLoggingUtils):
     def __init__(self, bucket_name: Optional[str] = None) -> None:
         from litellm.proxy.proxy_server import premium_user
@@ -3,6 +3,7 @@ from typing import Optional, Tuple
 import httpx
 
 import litellm
+from litellm.constants import REPLICATE_MODEL_NAME_WITH_ID_LENGTH
 from litellm.secret_managers.main import get_secret, get_secret_str
 
 from ..types.router import LiteLLM_Params
@@ -256,10 +257,13 @@ def get_llm_provider(  # noqa: PLR0915
     elif model in litellm.cohere_chat_models:
         custom_llm_provider = "cohere_chat"
     ## replicate
-    elif model in litellm.replicate_models or (":" in model and len(model) > 64):
+    elif model in litellm.replicate_models or (
+        ":" in model and len(model) > REPLICATE_MODEL_NAME_WITH_ID_LENGTH
+    ):
         model_parts = model.split(":")
         if (
-            len(model_parts) > 1 and len(model_parts[1]) == 64
+            len(model_parts) > 1
+            and len(model_parts[1]) == REPLICATE_MODEL_NAME_WITH_ID_LENGTH
        ):  ## checks if model name has a 64 digit code - e.g. "meta/llama-2-70b-chat:02e509c789964a7ea8736978a43525956ef40397be9033abf9fd2badfe68c9e3"
             custom_llm_provider = "replicate"
     elif model in litellm.replicate_models:
@@ -28,6 +28,10 @@ from litellm._logging import _is_debugging_on, verbose_logger
 from litellm.batches.batch_utils import _handle_completed_batch
 from litellm.caching.caching import DualCache, InMemoryCache
 from litellm.caching.caching_handler import LLMCachingHandler
+from litellm.constants import (
+    DEFAULT_MOCK_RESPONSE_COMPLETION_TOKEN_COUNT,
+    DEFAULT_MOCK_RESPONSE_PROMPT_TOKEN_COUNT,
+)
 from litellm.cost_calculator import _select_model_name_for_cost_calc
 from litellm.integrations.arize.arize import ArizeLogger
 from litellm.integrations.custom_guardrail import CustomGuardrail
@@ -3745,9 +3749,12 @@ def create_dummy_standard_logging_payload() -> StandardLoggingPayload:
         response_cost=response_cost,
         response_cost_failure_debug_info=None,
         status=str("success"),
-        total_tokens=int(30),
-        prompt_tokens=int(20),
-        completion_tokens=int(10),
+        total_tokens=int(
+            DEFAULT_MOCK_RESPONSE_PROMPT_TOKEN_COUNT
+            + DEFAULT_MOCK_RESPONSE_COMPLETION_TOKEN_COUNT
+        ),
+        prompt_tokens=int(DEFAULT_MOCK_RESPONSE_PROMPT_TOKEN_COUNT),
+        completion_tokens=int(DEFAULT_MOCK_RESPONSE_COMPLETION_TOKEN_COUNT),
         startTime=start_time,
         endTime=end_time,
         completionStartTime=completion_start_time,
@@ -5,6 +5,7 @@ Helper utilities for tracking the cost of built-in tools.
 from typing import Any, Dict, List, Optional
 
 import litellm
+from litellm.constants import OPENAI_FILE_SEARCH_COST_PER_1K_CALLS
 from litellm.types.llms.openai import FileSearchTool, WebSearchOptions
 from litellm.types.utils import (
     ModelInfo,
@@ -132,7 +133,7 @@ class StandardBuiltInToolCostTracking:
         """
         if file_search is None:
             return 0.0
-        return 2.5 / 1000
+        return OPENAI_FILE_SEARCH_COST_PER_1K_CALLS
 
     @staticmethod
     def chat_completion_response_includes_annotations(
@@ -11,6 +11,10 @@ from litellm.constants import (
     DEFAULT_IMAGE_HEIGHT,
     DEFAULT_IMAGE_TOKEN_COUNT,
     DEFAULT_IMAGE_WIDTH,
+    MAX_LONG_SIDE_FOR_IMAGE_HIGH_RES,
+    MAX_SHORT_SIDE_FOR_IMAGE_HIGH_RES,
+    MAX_TILE_HEIGHT,
+    MAX_TILE_WIDTH,
 )
 from litellm.llms.custom_httpx.http_handler import _get_httpx_client
 
@@ -97,11 +101,14 @@ def resize_image_high_res(
     height: int,
 ) -> Tuple[int, int]:
     # Maximum dimensions for high res mode
-    max_short_side = 768
-    max_long_side = 2000
+    max_short_side = MAX_SHORT_SIDE_FOR_IMAGE_HIGH_RES
+    max_long_side = MAX_LONG_SIDE_FOR_IMAGE_HIGH_RES
 
     # Return early if no resizing is needed
-    if width <= 768 and height <= 768:
+    if (
+        width <= MAX_SHORT_SIDE_FOR_IMAGE_HIGH_RES
+        and height <= MAX_SHORT_SIDE_FOR_IMAGE_HIGH_RES
+    ):
         return width, height
 
     # Determine the longer and shorter sides
@@ -132,7 +139,10 @@ def resize_image_high_res(
 
 # Test the function with the given example
 def calculate_tiles_needed(
-    resized_width, resized_height, tile_width=512, tile_height=512
+    resized_width,
+    resized_height,
+    tile_width=MAX_TILE_WIDTH,
+    tile_height=MAX_TILE_HEIGHT,
 ):
     tiles_across = (resized_width + tile_width - 1) // tile_width
     tiles_down = (resized_height + tile_height - 1) // tile_height
@@ -5,7 +5,10 @@ from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union, cast
 import httpx
 
 import litellm
-from litellm.constants import RESPONSE_FORMAT_TOOL_NAME
+from litellm.constants import (
+    DEFAULT_ANTHROPIC_CHAT_MAX_TOKENS,
+    RESPONSE_FORMAT_TOOL_NAME,
+)
 from litellm.litellm_core_utils.core_helpers import map_finish_reason
 from litellm.litellm_core_utils.prompt_templates.factory import anthropic_messages_pt
 from litellm.llms.base_llm.base_utils import type_to_response_format_param
@@ -53,7 +56,7 @@ class AnthropicConfig(BaseConfig):
 
     max_tokens: Optional[
         int
-    ] = 4096  # anthropic requires a default value (Opus, Sonnet, and Haiku have the same default)
+    ] = DEFAULT_ANTHROPIC_CHAT_MAX_TOKENS  # anthropic requires a default value (Opus, Sonnet, and Haiku have the same default)
     stop_sequences: Optional[list] = None
     temperature: Optional[int] = None
     top_p: Optional[int] = None
@@ -65,7 +68,7 @@ class AnthropicConfig(BaseConfig):
         self,
         max_tokens: Optional[
             int
-        ] = 4096,  # You can pass in a value yourself or use the default value 4096
+        ] = DEFAULT_ANTHROPIC_CHAT_MAX_TOKENS,  # You can pass in a value yourself or use the default value 4096
         stop_sequences: Optional[list] = None,
         temperature: Optional[int] = None,
         top_p: Optional[int] = None,
@@ -11,6 +11,7 @@ from typing import AsyncIterator, Dict, Iterator, List, Optional, Union
 import httpx
 
 import litellm
+from litellm.constants import DEFAULT_MAX_TOKENS
 from litellm.litellm_core_utils.prompt_templates.factory import (
     custom_prompt,
     prompt_factory,
@@ -65,7 +66,9 @@ class AnthropicTextConfig(BaseConfig):
 
     def __init__(
         self,
-        max_tokens_to_sample: Optional[int] = 256,  # anthropic requires a default
+        max_tokens_to_sample: Optional[
+            int
+        ] = DEFAULT_MAX_TOKENS,  # anthropic requires a default
         stop_sequences: Optional[list] = None,
         temperature: Optional[int] = None,
         top_p: Optional[int] = None,
@@ -7,7 +7,7 @@ import httpx  # type: ignore
 from openai import APITimeoutError, AsyncAzureOpenAI, AzureOpenAI
 
 import litellm
-from litellm.constants import DEFAULT_MAX_RETRIES
+from litellm.constants import AZURE_OPERATION_POLLING_TIMEOUT, DEFAULT_MAX_RETRIES
 from litellm.litellm_core_utils.litellm_logging import Logging as LiteLLMLoggingObj
 from litellm.litellm_core_utils.logging_utils import track_llm_api_timing
 from litellm.llms.custom_httpx.http_handler import (
@@ -857,7 +857,7 @@ class AzureChatCompletion(BaseAzureLLM, BaseLLM):
 
         await response.aread()
 
-        timeout_secs: int = 120
+        timeout_secs: int = AZURE_OPERATION_POLLING_TIMEOUT
         start_time = time.time()
         if "status" not in response.json():
             raise Exception(
@@ -955,7 +955,7 @@ class AzureChatCompletion(BaseAzureLLM, BaseLLM):
 
         response.read()
 
-        timeout_secs: int = 120
+        timeout_secs: int = AZURE_OPERATION_POLLING_TIMEOUT
         start_time = time.time()
         if "status" not in response.json():
             raise Exception(
@@ -7,6 +7,10 @@ from litellm.litellm_core_utils.prompt_templates.factory import (
     convert_to_azure_openai_messages,
 )
 from litellm.llms.base_llm.chat.transformation import BaseLLMException
+from litellm.types.llms.azure import (
+    API_VERSION_MONTH_SUPPORTED_RESPONSE_FORMAT,
+    API_VERSION_YEAR_SUPPORTED_RESPONSE_FORMAT,
+)
 from litellm.types.utils import ModelResponse
 from litellm.utils import supports_response_schema
 
@@ -123,7 +127,10 @@ class AzureOpenAIConfig(BaseConfig):
         - check if api_version is supported for response_format
         """
 
-        is_supported = int(api_version_year) <= 2024 and int(api_version_month) >= 8
+        is_supported = (
+            int(api_version_year) <= API_VERSION_YEAR_SUPPORTED_RESPONSE_FORMAT
+            and int(api_version_month) >= API_VERSION_MONTH_SUPPORTED_RESPONSE_FORMAT
+        )
 
         return is_supported
 
@@ -9,7 +9,7 @@ from pydantic import BaseModel
 
 from litellm._logging import verbose_logger
 from litellm.caching.caching import DualCache
-from litellm.constants import BEDROCK_INVOKE_PROVIDERS_LITERAL
+from litellm.constants import BEDROCK_INVOKE_PROVIDERS_LITERAL, BEDROCK_MAX_POLICY_SIZE
 from litellm.litellm_core_utils.dd_tracing import tracer
 from litellm.secret_managers.main import get_secret
 
@@ -381,7 +381,7 @@ class BaseAWSLLM:
                 "region_name": aws_region_name,
             }
 
-            if sts_response["PackedPolicySize"] > 75:
+            if sts_response["PackedPolicySize"] > BEDROCK_MAX_POLICY_SIZE:
                 verbose_logger.warning(
                     f"The policy size is greater than 75% of the allowed size, PackedPolicySize: {sts_response['PackedPolicySize']}"
                 )
@@ -1,6 +1,7 @@
 from typing import Optional, Tuple, Union
 
 import litellm
+from litellm.constants import MIN_NON_ZERO_TEMPERATURE
 from litellm.llms.openai.chat.gpt_transformation import OpenAIGPTConfig
 from litellm.secret_managers.main import get_secret_str
 
@@ -84,7 +85,7 @@ class DeepInfraConfig(OpenAIGPTConfig):
                 and value == 0
                 and model == "mistralai/Mistral-7B-Instruct-v0.1"
             ):  # this model does no support temperature == 0
-                value = 0.0001  # close to 0
+                value = MIN_NON_ZERO_TEMPERATURE  # close to 0
             if param == "tool_choice":
                 if (
                     value != "auto" and value != "none"
@@ -4,6 +4,12 @@ For calculating cost of fireworks ai serverless inference models.
 
 from typing import Tuple
 
+from litellm.constants import (
+    FIREWORKS_AI_16_B,
+    FIREWORKS_AI_56_B_MOE,
+    FIREWORKS_AI_80_B,
+    FIREWORKS_AI_176_B_MOE,
+)
 from litellm.types.utils import Usage
 from litellm.utils import get_model_info
 
@@ -25,9 +31,9 @@ def get_base_model_for_pricing(model_name: str) -> str:
     moe_match = re.search(r"(\d+)x(\d+)b", model_name)
     if moe_match:
         total_billion = int(moe_match.group(1)) * int(moe_match.group(2))
-        if total_billion <= 56:
+        if total_billion <= FIREWORKS_AI_56_B_MOE:
             return "fireworks-ai-moe-up-to-56b"
-        elif total_billion <= 176:
+        elif total_billion <= FIREWORKS_AI_176_B_MOE:
             return "fireworks-ai-56b-to-176b"
 
     # Check for standard models in the form <number>b
@@ -37,9 +43,9 @@ def get_base_model_for_pricing(model_name: str) -> str:
         params_billion = float(params_match)
 
         # Determine the category based on the number of parameters
-        if params_billion <= 16.0:
+        if params_billion <= FIREWORKS_AI_16_B:
             return "fireworks-ai-up-to-16b"
-        elif params_billion <= 80.0:
+        elif params_billion <= FIREWORKS_AI_80_B:
             return "fireworks-ai-16b-80b"
 
     # If no matches, return the original model_name
@@ -2,6 +2,7 @@ from typing import TYPE_CHECKING, Any, List, Literal, Optional, Union
 
 from httpx import Headers, Response
 
+from litellm.constants import DEFAULT_MAX_TOKENS
 from litellm.llms.base_llm.chat.transformation import BaseConfig, BaseLLMException
 from litellm.types.llms.openai import AllMessageValues
 from litellm.types.utils import ModelResponse
@@ -27,7 +28,7 @@ class PredibaseConfig(BaseConfig):
     decoder_input_details: Optional[bool] = None
     details: bool = True  # enables returning logprobs + best of
     max_new_tokens: int = (
-        256  # openai default - requests hang if max_new_tokens not given
+        DEFAULT_MAX_TOKENS  # openai default - requests hang if max_new_tokens not given
     )
     repetition_penalty: Optional[float] = None
     return_full_text: Optional[
@@ -4,6 +4,7 @@ import time
 from typing import Callable, List, Union
 
 import litellm
+from litellm.constants import REPLICATE_POLLING_DELAY_SECONDS
 from litellm.llms.custom_httpx.http_handler import (
     AsyncHTTPHandler,
     HTTPHandler,
@@ -28,7 +29,9 @@ def handle_prediction_response_streaming(
 
     status = ""
     while True and (status not in ["succeeded", "failed", "canceled"]):
-        time.sleep(0.5)  # prevent being rate limited by replicate
+        time.sleep(
+            REPLICATE_POLLING_DELAY_SECONDS
+        )  # prevent being rate limited by replicate
         print_verbose(f"replicate: polling endpoint: {prediction_url}")
         response = http_client.get(prediction_url, headers=headers)
         if response.status_code == 200:
@@ -77,7 +80,9 @@ async def async_handle_prediction_response_streaming(
 
     status = ""
     while True and (status not in ["succeeded", "failed", "canceled"]):
-        await asyncio.sleep(0.5)  # prevent being rate limited by replicate
+        await asyncio.sleep(
+            REPLICATE_POLLING_DELAY_SECONDS
+        )  # prevent being rate limited by replicate
         print_verbose(f"replicate: polling endpoint: {prediction_url}")
         response = await http_client.get(prediction_url, headers=headers)
         if response.status_code == 200:
@@ -3,6 +3,7 @@ from typing import TYPE_CHECKING, Any, List, Optional, Union
 import httpx
 
 import litellm
+from litellm.constants import REPLICATE_MODEL_NAME_WITH_ID_LENGTH
 from litellm.litellm_core_utils.prompt_templates.common_utils import (
     convert_content_list_to_str,
 )
@@ -221,10 +222,11 @@ class ReplicateConfig(BaseConfig):
 
         version_id = self.model_to_version_id(model)
         request_data: dict = {"input": input_data}
-        if ":" in version_id and len(version_id) > 64:
+        if ":" in version_id and len(version_id) > REPLICATE_MODEL_NAME_WITH_ID_LENGTH:
             model_parts = version_id.split(":")
             if (
-                len(model_parts) > 1 and len(model_parts[1]) == 64
+                len(model_parts) > 1
+                and len(model_parts[1]) == REPLICATE_MODEL_NAME_WITH_ID_LENGTH
             ):  ## checks if model name has a 64 digit code - e.g. "meta/llama-2-70b-chat:02e509c789964a7ea8736978a43525956ef40397be9033abf9fd2badfe68c9e3"
                 request_data["version"] = model_parts[1]
 
@@ -4,6 +4,16 @@ Handles calculating cost for together ai models
 
 import re
 
+from litellm.constants import (
+    TOGETHER_AI_4_B,
+    TOGETHER_AI_8_B,
+    TOGETHER_AI_21_B,
+    TOGETHER_AI_41_B,
+    TOGETHER_AI_80_B,
+    TOGETHER_AI_110_B,
+    TOGETHER_AI_EMBEDDING_150_M,
+    TOGETHER_AI_EMBEDDING_350_M,
+)
 from litellm.types.utils import CallTypes
 
 
@@ -31,17 +41,17 @@ def get_model_params_and_category(model_name, call_type: CallTypes) -> str:
     else:
         return model_name
     # Determine the category based on the number of parameters
-    if params_billion <= 4.0:
+    if params_billion <= TOGETHER_AI_4_B:
         category = "together-ai-up-to-4b"
-    elif params_billion <= 8.0:
+    elif params_billion <= TOGETHER_AI_8_B:
         category = "together-ai-4.1b-8b"
-    elif params_billion <= 21.0:
+    elif params_billion <= TOGETHER_AI_21_B:
         category = "together-ai-8.1b-21b"
-    elif params_billion <= 41.0:
+    elif params_billion <= TOGETHER_AI_41_B:
         category = "together-ai-21.1b-41b"
-    elif params_billion <= 80.0:
+    elif params_billion <= TOGETHER_AI_80_B:
         category = "together-ai-41.1b-80b"
-    elif params_billion <= 110.0:
+    elif params_billion <= TOGETHER_AI_110_B:
         category = "together-ai-81.1b-110b"
     if category is not None:
         return category
@@ -69,9 +79,9 @@ def get_model_params_and_category_embeddings(model_name) -> str:
     else:
         return model_name
     # Determine the category based on the number of parameters
-    if params_million <= 150:
+    if params_million <= TOGETHER_AI_EMBEDDING_150_M:
         category = "together-ai-embedding-up-to-150m"
-    elif params_million <= 350:
+    elif params_million <= TOGETHER_AI_EMBEDDING_350_M:
         category = "together-ai-embedding-151m-to-350m"
     if category is not None:
         return category
@@ -7,6 +7,7 @@ from typing import Any, AsyncIterator, Dict, Iterator, List, Literal, Optional,
 
 from httpx import Headers, Response
 
+from litellm.constants import DEFAULT_MAX_TOKENS_FOR_TRITON
 from litellm.litellm_core_utils.prompt_templates.factory import prompt_factory
 from litellm.llms.base_llm.base_model_iterator import BaseModelResponseIterator
 from litellm.llms.base_llm.chat.transformation import (
@@ -196,7 +197,9 @@ class TritonGenerateConfig(TritonConfig):
         data_for_triton: Dict[str, Any] = {
             "text_input": prompt_factory(model=model, messages=messages),
             "parameters": {
-                "max_tokens": int(optional_params.get("max_tokens", 2000)),
+                "max_tokens": int(
+                    optional_params.get("max_tokens", DEFAULT_MAX_TOKENS_FOR_TRITON)
+                ),
                 "bad_words": [""],
                 "stop_words": [""],
             },
@@ -51,6 +51,10 @@ from litellm import (  # type: ignore
     get_litellm_params,
     get_optional_params,
 )
+from litellm.constants import (
+    DEFAULT_MOCK_RESPONSE_COMPLETION_TOKEN_COUNT,
+    DEFAULT_MOCK_RESPONSE_PROMPT_TOKEN_COUNT,
+)
 from litellm.exceptions import LiteLLMUnknownProvider
 from litellm.integrations.custom_logger import CustomLogger
 from litellm.litellm_core_utils.audio_utils.utils import get_audio_file_for_health_check
@@ -740,7 +744,12 @@ def mock_completion(
         setattr(
             model_response,
             "usage",
-            Usage(prompt_tokens=10, completion_tokens=20, total_tokens=30),
+            Usage(
+                prompt_tokens=DEFAULT_MOCK_RESPONSE_PROMPT_TOKEN_COUNT,
+                completion_tokens=DEFAULT_MOCK_RESPONSE_COMPLETION_TOKEN_COUNT,
+                total_tokens=DEFAULT_MOCK_RESPONSE_PROMPT_TOKEN_COUNT
+                + DEFAULT_MOCK_RESPONSE_COMPLETION_TOKEN_COUNT,
+            ),
         )
 
     try:
@@ -3067,7 +3076,7 @@ def completion(  # type: ignore # noqa: PLR0915
                     "max_tokens": max_tokens,
                     "temperature": temperature,
                     "top_p": top_p,
-                    "top_k": kwargs.get("top_k", 40),
+                    "top_k": kwargs.get("top_k"),
                 },
             },
         )
@@ -20,6 +20,7 @@ import litellm
 from litellm._logging import verbose_proxy_logger
 from litellm.caching.caching import DualCache
 from litellm.caching.dual_cache import LimitedSizeOrderedDict
+from litellm.constants import DEFAULT_IN_MEMORY_TTL
 from litellm.litellm_core_utils.get_llm_provider_logic import get_llm_provider
 from litellm.proxy._types import (
     RBAC_ROLES,
@@ -55,7 +56,7 @@ else:
 
 
 last_db_access_time = LimitedSizeOrderedDict(max_size=100)
-db_cache_expiry = 5  # refresh every 5s
+db_cache_expiry = DEFAULT_IN_MEMORY_TTL  # refresh every 5s
 
 all_routes = LiteLLMRoutes.openai_routes.value + LiteLLMRoutes.management_routes.value
 
@@ -9,6 +9,7 @@ from typing import Optional
 import httpx
 
 from litellm._logging import verbose_proxy_logger
+from litellm.constants import NON_LLM_CONNECTION_TIMEOUT
 from litellm.llms.custom_httpx.http_handler import HTTPHandler
 
 
@@ -23,7 +24,7 @@ class LicenseCheck:
     def __init__(self) -> None:
         self.license_str = os.getenv("LITELLM_LICENSE", None)
         verbose_proxy_logger.debug("License Str value - {}".format(self.license_str))
-        self.http_handler = HTTPHandler(timeout=15)
+        self.http_handler = HTTPHandler(timeout=NON_LLM_CONNECTION_TIMEOUT)
         self.public_key = None
         self.read_public_key()
 
@@ -15,6 +15,7 @@ from fastapi import HTTPException
import litellm
from litellm._logging import verbose_proxy_logger
from litellm.caching.caching import DualCache
+ from litellm.constants import DEFAULT_PROMPT_INJECTION_SIMILARITY_THRESHOLD
from litellm.integrations.custom_logger import CustomLogger
from litellm.litellm_core_utils.prompt_templates.factory import (
prompt_injection_detection_default_pt,

@@ -110,7 +111,9 @@ class _OPTIONAL_PromptInjectionDetection(CustomLogger):
return combinations

def check_user_input_similarity(
- self, user_input: str, similarity_threshold: float = 0.7
+ self,
+ user_input: str,
+ similarity_threshold: float = DEFAULT_PROMPT_INJECTION_SIMILARITY_THRESHOLD,
) -> bool:
user_input_lower = user_input.lower()
keywords = self.generate_injection_keywords()
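Note: a threshold-based similarity check of this kind can be sketched with nothing but the standard library. The snippet below is an illustration only, not the body of check_user_input_similarity, and the 0.7 default is simply the literal this hunk replaces.

# similarity_sketch.py -- illustrative threshold check; not litellm's implementation.
from difflib import SequenceMatcher

DEFAULT_PROMPT_INJECTION_SIMILARITY_THRESHOLD = 0.7  # assumed: the old hardcoded default


def looks_like_injection(
    user_input: str,
    keywords: list,
    threshold: float = DEFAULT_PROMPT_INJECTION_SIMILARITY_THRESHOLD,
) -> bool:
    # Flag the input if it is "close enough" to any known injection phrase.
    user_input_lower = user_input.lower()
    return any(
        SequenceMatcher(None, user_input_lower, keyword.lower()).ratio() >= threshold
        for keyword in keywords
    )


if __name__ == "__main__":
    print(looks_like_injection("ignore all previous instructions", ["ignore previous instructions"]))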
@@ -24,7 +24,7 @@ from fastapi import APIRouter, Depends, Header, HTTPException, Query, Request, s
import litellm
from litellm._logging import verbose_proxy_logger
from litellm.caching import DualCache
- from litellm.constants import UI_SESSION_TOKEN_TEAM_ID
+ from litellm.constants import LENGTH_OF_LITELLM_GENERATED_KEY, UI_SESSION_TOKEN_TEAM_ID
from litellm.litellm_core_utils.duration_parser import duration_in_seconds
from litellm.proxy._types import *
from litellm.proxy.auth.auth_checks import (

@@ -1164,7 +1164,7 @@ async def generate_key_helper_fn(  # noqa: PLR0915
if key is not None:
token = key
else:
- token = f"sk-{secrets.token_urlsafe(16)}"
+ token = f"sk-{secrets.token_urlsafe(LENGTH_OF_LITELLM_GENERATED_KEY)}"

if duration is None:  # allow tokens that never expire
expires = None

@@ -1745,7 +1745,7 @@ async def regenerate_key_fn(

verbose_proxy_logger.debug("key_in_db: %s", _key_in_db)

- new_token = f"sk-{secrets.token_urlsafe(16)}"
+ new_token = f"sk-{secrets.token_urlsafe(LENGTH_OF_LITELLM_GENERATED_KEY)}"
new_token_hash = hash_token(new_token)
new_token_key_name = f"sk-...{new_token[-4:]}"
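Note: both key paths now share LENGTH_OF_LITELLM_GENERATED_KEY in place of the literal 16. Keep in mind that secrets.token_urlsafe(n) takes a number of random bytes, not a character count. A small, self-contained illustration; the constant's value is assumed to be the 16 it replaces.

# key_length_demo.py -- shows what the token_urlsafe argument controls; the constant
# value is assumed (it replaces the literal 16), the real one lives in constants.py.
import secrets

LENGTH_OF_LITELLM_GENERATED_KEY = 16  # random bytes, not output characters

token = f"sk-{secrets.token_urlsafe(LENGTH_OF_LITELLM_GENERATED_KEY)}"
# 16 random bytes encode to ~22 URL-safe base64 characters, so the key is ~25 chars long.
print(token, len(token))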
@@ -15,6 +15,10 @@ from litellm.litellm_core_utils.litellm_logging import (
)
from litellm.litellm_core_utils.thread_pool_executor import executor
from litellm.proxy.pass_through_endpoints.types import PassthroughStandardLoggingPayload
+ from litellm.types.passthrough_endpoints.assembly_ai import (
+ ASSEMBLY_AI_MAX_POLLING_ATTEMPTS,
+ ASSEMBLY_AI_POLLING_INTERVAL,
+ )


class AssemblyAITranscriptResponse(TypedDict, total=False):

@@ -34,13 +38,13 @@ class AssemblyAIPassthroughLoggingHandler:
The base URL for the AssemblyAI API
"""

- self.polling_interval: float = 10
+ self.polling_interval: float = ASSEMBLY_AI_POLLING_INTERVAL
"""
The polling interval for the AssemblyAI API.
litellm needs to poll the GET /transcript/{transcript_id} endpoint to get the status of the transcript.
"""

- self.max_polling_attempts = 180
+ self.max_polling_attempts = ASSEMBLY_AI_MAX_POLLING_ATTEMPTS
"""
The maximum number of polling attempts for the AssemblyAI API.
"""
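Note: taken together, the two constants bound how long a transcript is polled: 180 attempts at a 10-second interval is a 30-minute ceiling. A stripped-down sketch of such a loop follows; fetch_status is a hypothetical stand-in for the real GET /transcript/{transcript_id} call, not a litellm function.

# polling_sketch.py -- illustrative bounded polling loop; fetch_status() is hypothetical.
import time

ASSEMBLY_AI_POLLING_INTERVAL = 10        # seconds, per the new constants file
ASSEMBLY_AI_MAX_POLLING_ATTEMPTS = 180   # 180 * 10 s = 30 minutes worst case


def wait_for_transcript(fetch_status) -> bool:
    for _ in range(ASSEMBLY_AI_MAX_POLLING_ATTEMPTS):
        if fetch_status() == "completed":
            return True
        time.sleep(ASSEMBLY_AI_POLLING_INTERVAL)
    return False  # gave up after the maximum number of attempts


if __name__ == "__main__":
    print(wait_for_transcript(lambda: "completed"))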
@@ -25,7 +25,10 @@ from typing import (
get_type_hints,
)

- from litellm.constants import DEFAULT_MAX_RECURSE_DEPTH
+ from litellm.constants import (
+ DEFAULT_MAX_RECURSE_DEPTH,
+ DEFAULT_SLACK_ALERTING_THRESHOLD,
+ )
from litellm.types.utils import (
ModelResponse,
ModelResponseStream,

@@ -118,7 +121,16 @@ import litellm
from litellm import Router
from litellm._logging import verbose_proxy_logger, verbose_router_logger
from litellm.caching.caching import DualCache, RedisCache
- from litellm.constants import LITELLM_PROXY_ADMIN_NAME
+ from litellm.constants import (
+ DAYS_IN_A_MONTH,
+ DEFAULT_HEALTH_CHECK_INTERVAL,
+ DEFAULT_MODEL_CREATED_AT_TIME,
+ LITELLM_PROXY_ADMIN_NAME,
+ PROMETHEUS_FALLBACK_STATS_SEND_TIME_HOURS,
+ PROXY_BATCH_WRITE_AT,
+ PROXY_BUDGET_RESCHEDULER_MAX_TIME,
+ PROXY_BUDGET_RESCHEDULER_MIN_TIME,
+ )
from litellm.exceptions import RejectedRequestError
from litellm.integrations.SlackAlerting.slack_alerting import SlackAlerting
from litellm.litellm_core_utils.core_helpers import (

@@ -287,7 +299,7 @@ from litellm.router import (
LiteLLM_Params,
ModelGroupInfo,
)
- from litellm.scheduler import DefaultPriorities, FlowItem, Scheduler
+ from litellm.scheduler import FlowItem, Scheduler
from litellm.secret_managers.aws_secret_manager import load_aws_kms
from litellm.secret_managers.google_kms import load_google_kms
from litellm.secret_managers.main import (

@@ -307,6 +319,7 @@ from litellm.types.llms.openai import HttpxBinaryResponseContent
from litellm.types.router import DeploymentTypedDict
from litellm.types.router import ModelInfo as RouterModelInfo
from litellm.types.router import RouterGeneralSettings, updateDeployment
+ from litellm.types.scheduler import DefaultPriorities
from litellm.types.utils import CredentialItem, CustomHuggingfaceTokenizer
from litellm.types.utils import ModelInfo as ModelMapInfo
from litellm.types.utils import RawRequestTypedDict, StandardLoggingPayload
@@ -779,9 +792,9 @@ queue: List = []
litellm_proxy_budget_name = "litellm-proxy-budget"
litellm_proxy_admin_name = LITELLM_PROXY_ADMIN_NAME
ui_access_mode: Literal["admin", "all"] = "all"
- proxy_budget_rescheduler_min_time = 597
+ proxy_budget_rescheduler_min_time = PROXY_BUDGET_RESCHEDULER_MIN_TIME
- proxy_budget_rescheduler_max_time = 605
+ proxy_budget_rescheduler_max_time = PROXY_BUDGET_RESCHEDULER_MAX_TIME
- proxy_batch_write_at = 10  # in seconds
+ proxy_batch_write_at = PROXY_BATCH_WRITE_AT
litellm_master_key_hash = None
disable_spend_logs = False
jwt_handler = JWTHandler()
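Note: the min/max pair that used to be 597 and 605 defines a narrow window from which the budget-reset job picks its interval; randomizing inside a small window is a common way to keep multiple proxy replicas from firing at the same instant. A sketch under the assumption that the constants keep those values.

# rescheduler_window_sketch.py -- illustrative; assumes the constants keep the 597/605
# second values of the literals they replace.
import random

PROXY_BUDGET_RESCHEDULER_MIN_TIME = 597
PROXY_BUDGET_RESCHEDULER_MAX_TIME = 605


def next_budget_reset_interval() -> int:
    # A random interval inside an ~8 s window spreads the reset job across replicas.
    return random.randint(
        PROXY_BUDGET_RESCHEDULER_MIN_TIME, PROXY_BUDGET_RESCHEDULER_MAX_TIME
    )


if __name__ == "__main__":
    print(next_budget_reset_interval())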
@@ -1846,7 +1859,9 @@ class ProxyConfig:
use_background_health_checks = general_settings.get(
"background_health_checks", False
)
- health_check_interval = general_settings.get("health_check_interval", 300)
+ health_check_interval = general_settings.get(
+ "health_check_interval", DEFAULT_HEALTH_CHECK_INTERVAL
+ )
health_check_details = general_settings.get("health_check_details", True)

### RBAC ###

@@ -3145,7 +3160,7 @@ class ProxyStartupEvent:
scheduler.add_job(
proxy_logging_obj.slack_alerting_instance.send_fallback_stats_from_prometheus,
"cron",
- hour=9,
+ hour=PROMETHEUS_FALLBACK_STATS_SEND_TIME_HOURS,
minute=0,
timezone=ZoneInfo("America/Los_Angeles"),  # Pacific Time
)

@@ -3278,7 +3293,7 @@ async def model_list(
{
"id": model,
"object": "model",
- "created": 1677610602,
+ "created": DEFAULT_MODEL_CREATED_AT_TIME,
"owned_by": "openai",
}
for model in all_models
@@ -5592,7 +5607,7 @@ async def model_metrics(
param="None",
code=status.HTTP_500_INTERNAL_SERVER_ERROR,
)
- startTime = startTime or datetime.now() - timedelta(days=30)
+ startTime = startTime or datetime.now() - timedelta(days=DAYS_IN_A_MONTH)
endTime = endTime or datetime.now()

if api_key is None or api_key == "undefined":

@@ -5713,11 +5728,12 @@ async def model_metrics_slow_responses(
if customer is None or customer == "undefined":
customer = "null"

- startTime = startTime or datetime.now() - timedelta(days=30)
+ startTime = startTime or datetime.now() - timedelta(days=DAYS_IN_A_MONTH)
endTime = endTime or datetime.now()

alerting_threshold = (
- proxy_logging_obj.slack_alerting_instance.alerting_threshold or 300
+ proxy_logging_obj.slack_alerting_instance.alerting_threshold
+ or DEFAULT_SLACK_ALERTING_THRESHOLD
)
alerting_threshold = int(alerting_threshold)

@@ -5797,7 +5813,7 @@ async def model_metrics_exceptions(
code=status.HTTP_500_INTERNAL_SERVER_ERROR,
)

- startTime = startTime or datetime.now() - timedelta(days=30)
+ startTime = startTime or datetime.now() - timedelta(days=DAYS_IN_A_MONTH)
endTime = endTime or datetime.now()

if api_key is None or api_key == "undefined":
@@ -22,6 +22,7 @@ from typing import (
overload,
)

+ from litellm.constants import MAX_TEAM_LIST_LIMIT
from litellm.proxy._types import (
DB_CONNECTION_ERROR_TYPES,
CommonProxyErrors,

@@ -1596,7 +1597,9 @@ class PrismaClient:
where={"team_id": {"in": team_id_list}}
)
elif query_type == "find_all" and team_id_list is None:
- response = await self.db.litellm_teamtable.find_many(take=20)
+ response = await self.db.litellm_teamtable.find_many(
+ take=MAX_TEAM_LIST_LIMIT
+ )
return response
elif table_name == "user_notification":
if query_type == "find_unique":
@@ -50,6 +50,7 @@ from litellm.caching.caching import (
RedisCache,
RedisClusterCache,
)
+ from litellm.constants import DEFAULT_MAX_LRU_CACHE_SIZE
from litellm.integrations.custom_logger import CustomLogger
from litellm.litellm_core_utils.asyncify import run_async_function
from litellm.litellm_core_utils.core_helpers import _get_parent_otel_span_from_kwargs

@@ -5073,7 +5074,7 @@ class Router:
rpm_usage += t
return tpm_usage, rpm_usage

- @lru_cache(maxsize=64)
+ @lru_cache(maxsize=DEFAULT_MAX_LRU_CACHE_SIZE)
def _cached_get_model_group_info(
self, model_group: str
) -> Optional[ModelGroupInfo]:
@@ -1,6 +1,7 @@
from typing import TYPE_CHECKING, Any, Optional, Union

from litellm._logging import verbose_router_logger
+ from litellm.constants import MAX_EXCEPTION_MESSAGE_LENGTH
from litellm.router_utils.cooldown_handlers import (
_async_get_cooldown_deployments_with_debug_info,
)

@@ -54,7 +55,7 @@ async def send_llm_exception_alert(
exception_str = str(original_exception)
if litellm_debug_info is not None:
exception_str += litellm_debug_info
- exception_str += f"\n\n{error_traceback_str[:2000]}"
+ exception_str += f"\n\n{error_traceback_str[:MAX_EXCEPTION_MESSAGE_LENGTH]}"

await litellm_router_instance.slack_alerting_logger.send_alert(
message=f"LLM API call failed: `{exception_str}`",
@@ -6,17 +6,14 @@ from pydantic import BaseModel

from litellm import print_verbose
from litellm.caching.caching import DualCache, RedisCache
+ from litellm.constants import DEFAULT_IN_MEMORY_TTL, DEFAULT_POLLING_INTERVAL


class SchedulerCacheKeys(enum.Enum):
queue = "scheduler:queue"
- default_in_memory_ttl = 5  # cache queue in-memory for 5s when redis cache available
+ default_in_memory_ttl = (
+ DEFAULT_IN_MEMORY_TTL  # cache queue in-memory for 5s when redis cache available
+ )
- class DefaultPriorities(enum.Enum):
- High = 0
- Medium = 128
- Low = 255


class FlowItem(BaseModel):
@@ -44,7 +41,9 @@ class Scheduler:
self.cache = DualCache(
redis_cache=redis_cache, default_in_memory_ttl=default_in_memory_ttl
)
- self.polling_interval = polling_interval or 0.03  # default to 3ms
+ self.polling_interval = (
+ polling_interval or DEFAULT_POLLING_INTERVAL
+ )  # default to 3ms

async def add_request(self, request: FlowItem):
# We use the priority directly, as lower values indicate higher priority
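Note: DEFAULT_POLLING_INTERVAL stands in for the 0.03 s (3 ms) literal that drives the scheduler's poll loop. A reduced, self-contained version of a poll-until-ready loop on that interval; illustrative only, not the Scheduler's actual logic.

# scheduler_poll_sketch.py -- illustrative; assumes DEFAULT_POLLING_INTERVAL keeps the
# 0.03 s value noted in the comment above.
import asyncio

DEFAULT_POLLING_INTERVAL = 0.03  # seconds


async def poll_until(predicate, timeout: float = 1.0) -> bool:
    waited = 0.0
    while waited < timeout:
        if predicate():
            return True
        await asyncio.sleep(DEFAULT_POLLING_INTERVAL)
        waited += DEFAULT_POLLING_INTERVAL
    return False


if __name__ == "__main__":
    print(asyncio.run(poll_until(lambda: True)))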
@@ -5,6 +5,7 @@ from typing import Optional
import litellm
from litellm._logging import verbose_logger
from litellm.caching.caching import InMemoryCache
+ from litellm.constants import SECRET_MANAGER_REFRESH_INTERVAL
from litellm.integrations.gcs_bucket.gcs_bucket_base import GCSBucketBase
from litellm.llms.custom_httpx.http_handler import _get_httpx_client
from litellm.proxy._types import CommonProxyErrors, KeyManagementSystem

@@ -13,7 +14,7 @@ from litellm.proxy._types import CommonProxyErrors, KeyManagementSystem
class GoogleSecretManager(GCSBucketBase):
def __init__(
self,
- refresh_interval: Optional[int] = 86400,
+ refresh_interval: Optional[int] = SECRET_MANAGER_REFRESH_INTERVAL,
always_read_secret_manager: Optional[bool] = False,
) -> None:
"""
@@ -6,6 +6,7 @@ import httpx
import litellm
from litellm._logging import verbose_logger
from litellm.caching import InMemoryCache
+ from litellm.constants import SECRET_MANAGER_REFRESH_INTERVAL
from litellm.llms.custom_httpx.http_handler import (
_get_httpx_client,
get_async_httpx_client,

@@ -39,8 +40,14 @@ class HashicorpSecretManager(BaseSecretManager):

litellm.secret_manager_client = self
litellm._key_management_system = KeyManagementSystem.HASHICORP_VAULT
- _refresh_interval = os.environ.get("HCP_VAULT_REFRESH_INTERVAL", 86400)
- _refresh_interval = int(_refresh_interval) if _refresh_interval else 86400
+ _refresh_interval = os.environ.get(
+ "HCP_VAULT_REFRESH_INTERVAL", SECRET_MANAGER_REFRESH_INTERVAL
+ )
+ _refresh_interval = (
+ int(_refresh_interval)
+ if _refresh_interval
+ else SECRET_MANAGER_REFRESH_INTERVAL
+ )
self.cache = InMemoryCache(
default_ttl=_refresh_interval
)  # store in memory for 1 day
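Note: behaviour is unchanged here. The HCP_VAULT_REFRESH_INTERVAL environment variable still wins and the constant is only the fallback, but the one-day literal now appears once. The same pattern in isolation; the 86400 value is an assumption carried over from the removed literal.

# refresh_interval_sketch.py -- illustrative env-override-with-constant-fallback pattern;
# assumes SECRET_MANAGER_REFRESH_INTERVAL = 86400 (one day), the literal it replaces.
import os

SECRET_MANAGER_REFRESH_INTERVAL = 86400


def resolve_refresh_interval() -> int:
    raw = os.environ.get("HCP_VAULT_REFRESH_INTERVAL", SECRET_MANAGER_REFRESH_INTERVAL)
    # The env var arrives as a string; the unset case is already an int.
    return int(raw) if raw else SECRET_MANAGER_REFRESH_INTERVAL


if __name__ == "__main__":
    print(resolve_refresh_interval())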
@@ -1,6 +1,8 @@
from enum import Enum
from typing import Optional, TypedDict

+ DD_MAX_BATCH_SIZE = 1000
+

class DataDogStatus(str, Enum):
INFO = "info"

@@ -8,6 +8,10 @@ else:
VertexBase = Any


+ GCS_DEFAULT_BATCH_SIZE = 2048
+ GCS_DEFAULT_FLUSH_INTERVAL_SECONDS = 20
+
+
class GCSLoggingConfig(TypedDict):
"""
Internal LiteLLM Config for GCS Bucket logging

@@ -7,6 +7,9 @@ from pydantic import BaseModel, Field

from litellm.types.utils import LiteLLMPydanticObjectBase

+ SLACK_ALERTING_THRESHOLD_5_PERCENT = 0.05
+ SLACK_ALERTING_THRESHOLD_15_PERCENT = 0.15
+

class BaseOutageModel(TypedDict):
alerts: List[int]
litellm/types/llms/azure.py (new file, +2)
@@ -0,0 +1,2 @@
+ API_VERSION_YEAR_SUPPORTED_RESPONSE_FORMAT = 2024
+ API_VERSION_MONTH_SUPPORTED_RESPONSE_FORMAT = 8
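Note: judging by their names, these two values mark the first Azure OpenAI api-version (2024-08) that supports response_format; the consumer of the constants is not part of this diff, so the sketch below is an assumed usage only.

# azure_api_version_sketch.py -- assumed usage; the real consumer of these constants
# is not shown in this commit.
API_VERSION_YEAR_SUPPORTED_RESPONSE_FORMAT = 2024
API_VERSION_MONTH_SUPPORTED_RESPONSE_FORMAT = 8


def supports_response_format(api_version: str) -> bool:
    # Azure api-version strings look like "2024-08-01-preview".
    year, month = (int(part) for part in api_version.split("-")[:2])
    return (year, month) >= (
        API_VERSION_YEAR_SUPPORTED_RESPONSE_FORMAT,
        API_VERSION_MONTH_SUPPORTED_RESPONSE_FORMAT,
    )


if __name__ == "__main__":
    print(supports_response_format("2024-08-01-preview"))  # True
    print(supports_response_format("2023-07-01-preview"))  # False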
litellm/types/llms/triton.py (new file, +1)
@@ -0,0 +1 @@
litellm/types/passthrough_endpoints/assembly_ai.py (new file, +2)
@@ -0,0 +1,2 @@
+ ASSEMBLY_AI_POLLING_INTERVAL = 10
+ ASSEMBLY_AI_MAX_POLLING_ATTEMPTS = 180
litellm/types/scheduler.py (new file, +7)
@@ -0,0 +1,7 @@
+ from enum import Enum
+
+
+ class DefaultPriorities(Enum):
+     High = 0
+     Medium = 128
+     Low = 255
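Note: the enum previously defined in litellm/scheduler.py now lives with the other typed definitions, and lower values still mean higher priority. A tiny usage sketch; the enum is redeclared here so the snippet runs without litellm installed.

# priority_usage_sketch.py -- illustrative; lower values are scheduled first.
from enum import Enum


class DefaultPriorities(Enum):
    High = 0
    Medium = 128
    Low = 255


def dequeue_first(a: DefaultPriorities, b: DefaultPriorities) -> DefaultPriorities:
    # Pick the item with the numerically smaller priority value.
    return a if a.value <= b.value else b


if __name__ == "__main__":
    print(dequeue_first(DefaultPriorities.Low, DefaultPriorities.High))  # DefaultPriorities.High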
@@ -62,6 +62,16 @@ import litellm.llms.gemini
from litellm.caching._internal_lru_cache import lru_cache_wrapper
from litellm.caching.caching import DualCache
from litellm.caching.caching_handler import CachingHandlerResponse, LLMCachingHandler
+ from litellm.constants import (
+ DEFAULT_MAX_LRU_CACHE_SIZE,
+ DEFAULT_TRIM_RATIO,
+ FUNCTION_DEFINITION_TOKEN_COUNT,
+ INITIAL_RETRY_DELAY,
+ JITTER,
+ MAX_RETRY_DELAY,
+ MINIMUM_PROMPT_CACHE_TOKEN_COUNT,
+ TOOL_CHOICE_OBJECT_TOKEN_COUNT,
+ )
from litellm.integrations.custom_guardrail import CustomGuardrail
from litellm.integrations.custom_logger import CustomLogger
from litellm.litellm_core_utils.core_helpers import (
@@ -1520,7 +1530,7 @@ def _select_tokenizer(
return _select_tokenizer_helper(model=model)


- @lru_cache(maxsize=128)
+ @lru_cache(maxsize=DEFAULT_MAX_LRU_CACHE_SIZE)
def _select_tokenizer_helper(model: str) -> SelectTokenizerResponse:
if litellm.disable_hf_tokenizer_download is True:
return _return_openai_tokenizer(model)
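Note: this decorator and Router._cached_get_model_group_info earlier in the diff previously used different literals (128 and 64); after the refactor both read DEFAULT_MAX_LRU_CACHE_SIZE, so they now share one cache size. A minimal illustration of the pattern; the value here is assumed for the demo.

# lru_cache_constant_sketch.py -- illustrative; the real DEFAULT_MAX_LRU_CACHE_SIZE
# is defined in litellm/constants.py.
from functools import lru_cache

DEFAULT_MAX_LRU_CACHE_SIZE = 128  # assumed demo value


@lru_cache(maxsize=DEFAULT_MAX_LRU_CACHE_SIZE)
def expensive_lookup(key: str) -> str:
    # Repeated calls with the same key return the cached result.
    return key.upper()


if __name__ == "__main__":
    print(expensive_lookup("gpt-4"), expensive_lookup.cache_info())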
@@ -5336,15 +5346,15 @@ def _calculate_retry_after(
if retry_after is not None and 0 < retry_after <= 60:
return retry_after

- initial_retry_delay = 0.5
+ initial_retry_delay = INITIAL_RETRY_DELAY
- max_retry_delay = 8.0
+ max_retry_delay = MAX_RETRY_DELAY
nb_retries = max_retries - remaining_retries

# Apply exponential backoff, but not more than the max.
sleep_seconds = min(initial_retry_delay * pow(2.0, nb_retries), max_retry_delay)

# Apply some jitter, plus-or-minus half a second.
- jitter = 1 - 0.25 * random.random()
+ jitter = JITTER * random.random()
timeout = sleep_seconds * jitter
return timeout if timeout >= min_timeout else min_timeout
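Note: with the constants substituted this is the usual capped exponential backoff with multiplicative jitter. The self-contained version below mirrors the new arithmetic; the constant values are assumptions based on the literals being replaced (JITTER is assumed to be 0.75), so treat it as a sketch rather than litellm's authoritative numbers.

# retry_after_sketch.py -- mirrors the arithmetic shown above; constant values assumed.
import random

INITIAL_RETRY_DELAY = 0.5  # seconds (was the 0.5 literal)
MAX_RETRY_DELAY = 8.0      # seconds (was the 8.0 literal)
JITTER = 0.75              # assumed value for the new multiplicative jitter


def calculate_retry_after(remaining_retries: int, max_retries: int, min_timeout: float = 0.0) -> float:
    nb_retries = max_retries - remaining_retries
    # Exponential backoff, capped at MAX_RETRY_DELAY.
    sleep_seconds = min(INITIAL_RETRY_DELAY * pow(2.0, nb_retries), MAX_RETRY_DELAY)
    # Multiplicative jitter so concurrent clients do not retry in lockstep.
    timeout = sleep_seconds * (JITTER * random.random())
    return timeout if timeout >= min_timeout else min_timeout


if __name__ == "__main__":
    for remaining in (3, 2, 1, 0):
        print(round(calculate_retry_after(remaining, max_retries=3), 3))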
@@ -5670,7 +5680,7 @@ def shorten_message_to_fit_limit(message, tokens_needed, model: Optional[str]):
def trim_messages(
messages,
model: Optional[str] = None,
- trim_ratio: float = 0.75,
+ trim_ratio: float = DEFAULT_TRIM_RATIO,
return_response_tokens: bool = False,
max_tokens=None,
):
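Note: only the default value of trim_ratio moves into a constant here; the public helper itself is unchanged. A usage sketch, assuming litellm is installed and exports trim_messages at the package root; the model name is just an example.

# trim_messages_usage.py -- usage sketch; the default trim_ratio now comes from
# DEFAULT_TRIM_RATIO instead of the 0.75 literal.
from litellm import trim_messages

messages = [
    {"role": "system", "content": "You are a terse assistant."},
    {"role": "user", "content": "Summarise this very long document. " * 500},
]

# With no explicit trim_ratio, the conversation is trimmed to the default share
# of the model's context window.
trimmed = trim_messages(messages, model="gpt-3.5-turbo")
print(len(str(messages)), "->", len(str(trimmed)))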
@@ -6543,7 +6553,7 @@ def is_prompt_caching_valid_prompt(
model=model,
use_default_image_token_count=True,
)
- return token_count >= 1024
+ return token_count >= MINIMUM_PROMPT_CACHE_TOKEN_COUNT
except Exception as e:
verbose_logger.error(f"Error in is_prompt_caching_valid_prompt: {e}")
return False
mypy.ini (+1)
@@ -3,6 +3,7 @@ warn_return_any = False
ignore_missing_imports = True
mypy_path = litellm/stubs
namespace_packages = True
+ disable_error_code = valid-type

[mypy-google.*]
ignore_missing_imports = True
tests/code_coverage_tests/ban_constant_numbers.py (new file, +152)
@@ -0,0 +1,152 @@
import sys
import ast
import os

# Extremely restrictive set of allowed numbers
ALLOWED_NUMBERS = {
    0,
    1,
    -1,
    2,
    10,
    100,
    1000,
    4,
    3,
    500,
    6,
    60,
    3600,
    0.75,
    7,
    1024,
    1011,
    600,
    12,
    1000000000.0,
    0.1,
    50,
    128,
    6000,
    30,
    1000000,
    5,
    15,
    25,
    10000,
    60000,
    8,
    2048,
    16000000000,
    16,
    16383,
    14,
    24,
    128000,
    0.01,
    20,
}

# Add all standard HTTP status codes
HTTP_STATUS_CODES = {
    200,  # OK
    201,  # Created
    202,  # Accepted
    204,  # No Content
    300,  # Multiple Choices
    301,  # Moved Permanently
    302,  # Found
    303,  # See Other
    304,  # Not Modified
    307,  # Temporary Redirect
    308,  # Permanent Redirect
    400,  # Bad Request
    401,  # Unauthorized
    402,  # Payment Required
    403,  # Forbidden
    404,  # Not Found
    406,  # Not Acceptable
    408,  # Request Timeout
    409,  # Conflict
    413,  # Payload Too Large
    422,  # Unprocessable Entity
    424,  # Failed Dependency
    429,  # Too Many Requests
    498,  # Invalid Token
    499,  # Client Closed Request
    500,  # Internal Server Error
    501,  # Not Implemented
    502,  # Bad Gateway
    503,  # Service Unavailable
    504,  # Gateway Timeout
    520,  # Web server is returning an unknown error
    522,  # Connection timed out
    524,  # A timeout occurred
    529,  # Site is overloaded
}

# Combine the sets
ALLOWED_NUMBERS = ALLOWED_NUMBERS.union(HTTP_STATUS_CODES)


class HardcodedNumberFinder(ast.NodeVisitor):
    def __init__(self):
        self.hardcoded_numbers = []

    def visit_Constant(self, node):
        # For Python 3.8+
        if isinstance(node.value, (int, float)) and node.value not in ALLOWED_NUMBERS:
            self.hardcoded_numbers.append((node.lineno, node.value))
        self.generic_visit(node)

    def visit_Num(self, node):
        # For older Python versions
        if node.n not in ALLOWED_NUMBERS:
            self.hardcoded_numbers.append((node.lineno, node.n))
        self.generic_visit(node)


def check_file(filename):
    try:
        with open(filename, "r") as f:
            content = f.read()

        tree = ast.parse(content)
        finder = HardcodedNumberFinder()
        finder.visit(tree)

        if finder.hardcoded_numbers:
            print(f"ERROR in {filename}: Hardcoded numbers detected:")
            for line, value in finder.hardcoded_numbers:
                print(f"  Line {line}: {value}")
            return 1
        return 0
    except SyntaxError:
        print(f"Syntax error in {filename}")
        return 0


def main():
    exit_code = 0
    folder = "../../litellm"
    ignore_files = [
        "constants.py",
        "proxy_cli.py",
        "token_counter.py",
        "mock_functions.py",
        "duration_parser.py",
        "utils.py",
    ]
    ignore_folder = "types"
    for root, dirs, files in os.walk(folder):
        for filename in files:
            if filename.endswith(".py") and filename not in ignore_files:
                full_path = os.path.join(root, filename)
                if ignore_folder in full_path:
                    continue
                exit_code |= check_file(full_path)
    sys.exit(exit_code)


if __name__ == "__main__":
    main()
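Note: the new CI hook walks the litellm package and fails when any .py file outside its ignore lists contains a numeric literal that is not allow-listed, which is what forces the constants migration above. A quick way to see the same AST approach in action on a string instead of the repository; the trimmed allow-list below is for the demo only, not the script's real set.

# ban_check_demo.py -- illustrative use of the same ast.Constant walk as the new test.
import ast

ALLOWED_NUMBERS = {0, 1, -1, 2, 10, 100}  # trimmed allow-list for the demo

snippet = "timeout = 600\nretries = 2\n"

flagged = [
    (node.lineno, node.value)
    for node in ast.walk(ast.parse(snippet))
    if isinstance(node, ast.Constant)
    and isinstance(node.value, (int, float))
    and node.value not in ALLOWED_NUMBERS
]
print(flagged)  # [(1, 600)] -- 600 is not in the demo allow-list, 2 is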
tests/code_coverage_tests/log.txt (new empty file)