refactor: complete migration

commit cb6e9fbe40
parent bfc159172d
Author: Krrish Dholakia
Date:   2025-03-24 19:52:45 -07:00
32 changed files with 203 additions and 210 deletions

View file

@@ -19,6 +19,7 @@ from pydantic import BaseModel
import litellm
from litellm._logging import verbose_logger
+from litellm.constants import CACHED_STREAMING_CHUNK_DELAY
from litellm.litellm_core_utils.model_param_helper import ModelParamHelper
from litellm.types.caching import *
from litellm.types.utils import all_litellm_params
@@ -406,7 +407,7 @@ class Cache:
                    }
                ]
            }
-            time.sleep(0.02)
+            time.sleep(CACHED_STREAMING_CHUNK_DELAY)
    def _get_cache_logic(
        self,
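
(Illustration, not part of the commit.) The hunk above swaps the literal 0.02 for CACHED_STREAMING_CHUNK_DELAY inside the path that replays a cached response as a stream. A minimal sketch of that pattern, assuming a generator-style replay; everything except the constant's name and value is made up:

import time

CACHED_STREAMING_CHUNK_DELAY = 0.02  # value added to litellm/constants.py in this commit

def replay_cached_stream(cached_chunks):
    """Yield previously cached chunks with a small delay between them,
    so a cache hit still behaves like a live streaming response."""
    for chunk in cached_chunks:  # cached_chunks is a hypothetical list of chunk dicts
        yield chunk
        time.sleep(CACHED_STREAMING_CHUNK_DELAY)

# usage: for chunk in replay_cached_stream(cached_chunks): process(chunk)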

View file

@@ -15,7 +15,8 @@ from typing import Any, List, Optional
from pydantic import BaseModel
-from ..constants import MAX_SIZE_PER_ITEM_IN_MEMORY_CACHE_IN_KB
+from litellm.constants import MAX_SIZE_PER_ITEM_IN_MEMORY_CACHE_IN_KB
from .base_cache import BaseCache
@@ -52,7 +53,8 @@ class InMemoryCache(BaseCache):
        # Fast path for common primitive types that are typically small
        if (
            isinstance(value, (bool, int, float, str))
-            and len(str(value)) < self.max_size_per_item * 512
+            and len(str(value))
+            < self.max_size_per_item * MAX_SIZE_PER_ITEM_IN_MEMORY_CACHE_IN_KB
        ):  # Conservative estimate
            return True

View file

@@ -11,10 +11,12 @@ Has 4 methods:
import ast
import asyncio
import json
-from typing import Any
+from typing import Any, cast
import litellm
from litellm._logging import print_verbose
+from litellm.constants import QDRANT_SCALAR_QUANTILE, QDRANT_VECTOR_SIZE
+from litellm.types.utils import EmbeddingResponse
from .base_cache import BaseCache
@@ -118,7 +120,11 @@ class QdrantSemanticCache(BaseCache):
            }
        elif quantization_config == "scalar":
            quantization_params = {
-                "scalar": {"type": "int8", "quantile": 0.99, "always_ram": False}
+                "scalar": {
+                    "type": "int8",
+                    "quantile": QDRANT_SCALAR_QUANTILE,
+                    "always_ram": False,
+                }
            }
        elif quantization_config == "product":
            quantization_params = {
@@ -132,7 +138,7 @@ class QdrantSemanticCache(BaseCache):
        new_collection_status = self.sync_client.put(
            url=f"{self.qdrant_api_base}/collections/{self.collection_name}",
            json={
-                "vectors": {"size": 1536, "distance": "Cosine"},
+                "vectors": {"size": QDRANT_VECTOR_SIZE, "distance": "Cosine"},
                "quantization_config": quantization_params,
            },
            headers=self.headers,
@@ -171,10 +177,13 @@ class QdrantSemanticCache(BaseCache):
                prompt += message["content"]
        # create an embedding for prompt
-        embedding_response = litellm.embedding(
-            model=self.embedding_model,
-            input=prompt,
-            cache={"no-store": True, "no-cache": True},
+        embedding_response = cast(
+            EmbeddingResponse,
+            litellm.embedding(
+                model=self.embedding_model,
+                input=prompt,
+                cache={"no-store": True, "no-cache": True},
+            ),
        )
        # get the embedding
@@ -212,10 +221,13 @@ class QdrantSemanticCache(BaseCache):
                prompt += message["content"]
        # convert to embedding
-        embedding_response = litellm.embedding(
-            model=self.embedding_model,
-            input=prompt,
-            cache={"no-store": True, "no-cache": True},
+        embedding_response = cast(
+            EmbeddingResponse,
+            litellm.embedding(
+                model=self.embedding_model,
+                input=prompt,
+                cache={"no-store": True, "no-cache": True},
+            ),
        )
        # get the embedding
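
(Illustration, not part of the commit.) With the two new constants, the collection-creation payload the Qdrant semantic cache PUTs looks roughly like the sketch below; the field names and values come from the hunk above, while the helper function itself is hypothetical:

QDRANT_SCALAR_QUANTILE = 0.99
QDRANT_VECTOR_SIZE = 1536

def build_qdrant_collection_body(quantization_config: str = "scalar") -> dict:
    # request body for PUT /collections/{name}, mirroring the hunk above
    body = {"vectors": {"size": QDRANT_VECTOR_SIZE, "distance": "Cosine"}}
    if quantization_config == "scalar":
        body["quantization_config"] = {
            "scalar": {
                "type": "int8",
                "quantile": QDRANT_SCALAR_QUANTILE,
                "always_ram": False,
            }
        }
    return body

print(build_qdrant_collection_body())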

View file

@@ -26,12 +26,19 @@ HOURS_IN_A_DAY = 24
DAYS_IN_A_WEEK = 7
DAYS_IN_A_MONTH = 28
DAYS_IN_A_YEAR = 365
+REPLICATE_MODEL_NAME_WITH_ID_LENGTH = 64
#### TOKEN COUNTING ####
FUNCTION_DEFINITION_TOKEN_COUNT = 9
SYSTEM_MESSAGE_TOKEN_COUNT = 4
TOOL_CHOICE_OBJECT_TOKEN_COUNT = 4
DEFAULT_MOCK_RESPONSE_PROMPT_TOKEN_COUNT = 10
DEFAULT_MOCK_RESPONSE_COMPLETION_TOKEN_COUNT = 20
+MAX_SHORT_SIDE_FOR_IMAGE_HIGH_RES = 768
+MAX_LONG_SIDE_FOR_IMAGE_HIGH_RES = 2000
+MAX_TILE_WIDTH = 512
+MAX_TILE_HEIGHT = 512
+OPENAI_FILE_SEARCH_COST_PER_1K_CALLS = 2.5 / 1000
+MIN_NON_ZERO_TEMPERATURE = 0.0001
#### RELIABILITY ####
REPEATED_STREAMING_CHUNK_LIMIT = 100  # catch if model starts looping the same chunk while streaming. Uses high default to prevent false positives.
DEFAULT_MAX_LRU_CACHE_SIZE = 16
@@ -40,15 +47,36 @@ MAX_RETRY_DELAY = 8.0
JITTER = 0.75
DEFAULT_IN_MEMORY_TTL = 5  # default time to live for the in-memory cache
DEFAULT_POLLING_INTERVAL = 0.03  # default polling interval for the scheduler
+AZURE_OPERATION_POLLING_TIMEOUT = 120
REDIS_SOCKET_TIMEOUT = 0.1
REDIS_CONNECTION_POOL_TIMEOUT = 5
NON_LLM_CONNECTION_TIMEOUT = 15  # timeout for adjacent services (e.g. jwt auth)
+MAX_EXCEPTION_MESSAGE_LENGTH = 2000
+BEDROCK_MAX_POLICY_SIZE = 75
+REPLICATE_POLLING_DELAY_SECONDS = 0.5
+DEFAULT_ANTHROPIC_CHAT_MAX_TOKENS = 4096
+TOGETHER_AI_4_B = 4
+TOGETHER_AI_8_B = 8
+TOGETHER_AI_21_B = 21
+TOGETHER_AI_41_B = 41
+TOGETHER_AI_80_B = 80
+TOGETHER_AI_110_B = 110
+TOGETHER_AI_EMBEDDING_150_M = 150
+TOGETHER_AI_EMBEDDING_350_M = 350
+QDRANT_SCALAR_QUANTILE = 0.99
+QDRANT_VECTOR_SIZE = 1536
+CACHED_STREAMING_CHUNK_DELAY = 0.02
+MAX_SIZE_PER_ITEM_IN_MEMORY_CACHE_IN_KB = 512
+DEFAULT_MAX_TOKENS_FOR_TRITON = 2000
#### Networking settings ####
request_timeout: float = 6000  # time in seconds
STREAM_SSE_DONE_STRING: str = "[DONE]"
### SPEND TRACKING ###
DEFAULT_REPLICATE_GPU_PRICE_PER_SECOND = 0.001400  # price per second for a100 80GB
+FIREWORKS_AI_56_B_MOE = 56
+FIREWORKS_AI_176_B_MOE = 176
+FIREWORKS_AI_16_B = 16
+FIREWORKS_AI_80_B = 80
LITELLM_CHAT_PROVIDERS = [
    "openai",

View file

@@ -16,6 +16,7 @@ import litellm.litellm_core_utils.litellm_logging
import litellm.types
from litellm._logging import verbose_logger, verbose_proxy_logger
from litellm.caching.caching import DualCache
+from litellm.constants import HOURS_IN_A_DAY
from litellm.integrations.custom_batch_logger import CustomBatchLogger
from litellm.litellm_core_utils.duration_parser import duration_in_seconds
from litellm.litellm_core_utils.exception_mapping_utils import (
@@ -646,10 +647,10 @@ class SlackAlerting(CustomBatchLogger):
                event_message += (
                    f"Budget Crossed\n Total Budget:`{user_info.max_budget}`"
                )
-            elif percent_left <= 0.05:
+            elif percent_left <= SLACK_ALERTING_THRESHOLD_5_PERCENT:
                event = "threshold_crossed"
                event_message += "5% Threshold Crossed "
-            elif percent_left <= 0.15:
+            elif percent_left <= SLACK_ALERTING_THRESHOLD_15_PERCENT:
                event = "threshold_crossed"
                event_message += "15% Threshold Crossed"
            elif user_info.soft_budget is not None:
@@ -1715,7 +1716,7 @@ Model Info:
                await self.internal_usage_cache.async_set_cache(
                    key=_event_cache_key,
                    value="SENT",
-                    ttl=(30 * 24 * 60 * 60),  # 1 month
+                    ttl=(30 * HOURS_IN_A_DAY * 60 * 60),  # 1 month
                )
        except Exception as e:
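
(Sanity check, not part of the commit.) The substitutions above are value-preserving; the arithmetic below uses only numbers taken from the diff:

HOURS_IN_A_DAY = 24
SLACK_ALERTING_THRESHOLD_5_PERCENT = 0.05
SLACK_ALERTING_THRESHOLD_15_PERCENT = 0.15

# ttl=(30 * HOURS_IN_A_DAY * 60 * 60) is unchanged from 30 * 24 * 60 * 60
assert 30 * HOURS_IN_A_DAY * 60 * 60 == 2_592_000  # ~1 month in seconds

# the threshold comparisons behave exactly as before
percent_left = 0.04  # illustrative value
assert percent_left <= SLACK_ALERTING_THRESHOLD_5_PERCENT
assert percent_left <= SLACK_ALERTING_THRESHOLD_15_PERCENT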

View file

@@ -41,7 +41,7 @@ from litellm.types.utils import StandardLoggingPayload
from ..additional_logging_utils import AdditionalLoggingUtils
# max number of logs DD API can accept
-DD_MAX_BATCH_SIZE = 1000
# specify what ServiceTypes are logged as success events to DD. (We don't want to spam DD traces with large number of service types)
DD_LOGGED_SUCCESS_SERVICE_TYPES = [

View file

@@ -20,10 +20,6 @@ else:
    VertexBase = Any
-GCS_DEFAULT_BATCH_SIZE = 2048
-GCS_DEFAULT_FLUSH_INTERVAL_SECONDS = 20
class GCSBucketLogger(GCSBucketBase, AdditionalLoggingUtils):
    def __init__(self, bucket_name: Optional[str] = None) -> None:
        from litellm.proxy.proxy_server import premium_user

View file

@@ -3,6 +3,7 @@ from typing import Optional, Tuple
import httpx
import litellm
+from litellm.constants import REPLICATE_MODEL_NAME_WITH_ID_LENGTH
from litellm.secret_managers.main import get_secret, get_secret_str
from ..types.router import LiteLLM_Params
@@ -256,10 +257,13 @@ def get_llm_provider(  # noqa: PLR0915
        elif model in litellm.cohere_chat_models:
            custom_llm_provider = "cohere_chat"
        ## replicate
-        elif model in litellm.replicate_models or (":" in model and len(model) > 64):
+        elif model in litellm.replicate_models or (
+            ":" in model and len(model) > REPLICATE_MODEL_NAME_WITH_ID_LENGTH
+        ):
            model_parts = model.split(":")
            if (
-                len(model_parts) > 1 and len(model_parts[1]) == 64
+                len(model_parts) > 1
+                and len(model_parts[1]) == REPLICATE_MODEL_NAME_WITH_ID_LENGTH
            ):  ## checks if model name has a 64 digit code - e.g. "meta/llama-2-70b-chat:02e509c789964a7ea8736978a43525956ef40397be9033abf9fd2badfe68c9e3"
                custom_llm_provider = "replicate"
        elif model in litellm.replicate_models:
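
(Illustration, not part of the commit.) How the named length check classifies a Replicate-style model string; the example string is the one already quoted in the hunk's own comment:

REPLICATE_MODEL_NAME_WITH_ID_LENGTH = 64

model = (
    "meta/llama-2-70b-chat:"
    "02e509c789964a7ea8736978a43525956ef40397be9033abf9fd2badfe68c9e3"
)

looks_like_replicate = ":" in model and len(model) > REPLICATE_MODEL_NAME_WITH_ID_LENGTH
version_id = model.split(":")[1]
assert looks_like_replicate
assert len(version_id) == REPLICATE_MODEL_NAME_WITH_ID_LENGTH  # 64-char version hash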

View file

@@ -28,6 +28,10 @@ from litellm._logging import _is_debugging_on, verbose_logger
from litellm.batches.batch_utils import _handle_completed_batch
from litellm.caching.caching import DualCache, InMemoryCache
from litellm.caching.caching_handler import LLMCachingHandler
+from litellm.constants import (
+    DEFAULT_MOCK_RESPONSE_COMPLETION_TOKEN_COUNT,
+    DEFAULT_MOCK_RESPONSE_PROMPT_TOKEN_COUNT,
+)
from litellm.cost_calculator import _select_model_name_for_cost_calc
from litellm.integrations.arize.arize import ArizeLogger
from litellm.integrations.custom_guardrail import CustomGuardrail
@@ -3743,9 +3747,12 @@ def create_dummy_standard_logging_payload() -> StandardLoggingPayload:
        response_cost=response_cost,
        response_cost_failure_debug_info=None,
        status=str("success"),
-        total_tokens=int(30),
-        prompt_tokens=int(20),
-        completion_tokens=int(10),
+        total_tokens=int(
+            DEFAULT_MOCK_RESPONSE_PROMPT_TOKEN_COUNT
+            + DEFAULT_MOCK_RESPONSE_COMPLETION_TOKEN_COUNT
+        ),
+        prompt_tokens=int(DEFAULT_MOCK_RESPONSE_PROMPT_TOKEN_COUNT),
+        completion_tokens=int(DEFAULT_MOCK_RESPONSE_COMPLETION_TOKEN_COUNT),
        startTime=start_time,
        endTime=end_time,
        completionStartTime=completion_start_time,
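
(Sanity check, not part of the commit.) The dummy payload's total stays at 30 tokens, although the prompt/completion split flips from 20/10 to 10/20 to match the constants:

DEFAULT_MOCK_RESPONSE_PROMPT_TOKEN_COUNT = 10
DEFAULT_MOCK_RESPONSE_COMPLETION_TOKEN_COUNT = 20

total_tokens = (
    DEFAULT_MOCK_RESPONSE_PROMPT_TOKEN_COUNT
    + DEFAULT_MOCK_RESPONSE_COMPLETION_TOKEN_COUNT
)
assert total_tokens == 30  # matches the old hardcoded total_tokens=int(30)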

View file

@@ -5,6 +5,7 @@ Helper utilities for tracking the cost of built-in tools.
from typing import Any, Dict, List, Optional
import litellm
+from litellm.constants import OPENAI_FILE_SEARCH_COST_PER_1K_CALLS
from litellm.types.llms.openai import FileSearchTool, WebSearchOptions
from litellm.types.utils import (
    ModelInfo,
@@ -132,7 +133,7 @@ class StandardBuiltInToolCostTracking:
        """
        if file_search is None:
            return 0.0
-        return 2.5 / 1000
+        return OPENAI_FILE_SEARCH_COST_PER_1K_CALLS
    @staticmethod
    def chat_completion_response_includes_annotations(
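
(Sanity check, not part of the commit.) OPENAI_FILE_SEARCH_COST_PER_1K_CALLS keeps the old value of 2.5 / 1000, i.e. $0.0025 per file_search call; the call count below is an invented example:

OPENAI_FILE_SEARCH_COST_PER_1K_CALLS = 2.5 / 1000  # $2.50 per 1,000 calls

assert OPENAI_FILE_SEARCH_COST_PER_1K_CALLS == 0.0025  # cost of a single call

n_calls = 400  # hypothetical number of file_search tool calls
cost = n_calls * OPENAI_FILE_SEARCH_COST_PER_1K_CALLS
assert round(cost, 6) == 1.0  # $1.00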

View file

@@ -11,6 +11,10 @@ from litellm.constants import (
    DEFAULT_IMAGE_HEIGHT,
    DEFAULT_IMAGE_TOKEN_COUNT,
    DEFAULT_IMAGE_WIDTH,
+    MAX_LONG_SIDE_FOR_IMAGE_HIGH_RES,
+    MAX_SHORT_SIDE_FOR_IMAGE_HIGH_RES,
+    MAX_TILE_HEIGHT,
+    MAX_TILE_WIDTH,
)
from litellm.llms.custom_httpx.http_handler import _get_httpx_client
@@ -97,11 +101,14 @@ def resize_image_high_res(
    height: int,
) -> Tuple[int, int]:
    # Maximum dimensions for high res mode
-    max_short_side = 768
-    max_long_side = 2000
+    max_short_side = MAX_SHORT_SIDE_FOR_IMAGE_HIGH_RES
+    max_long_side = MAX_LONG_SIDE_FOR_IMAGE_HIGH_RES
    # Return early if no resizing is needed
-    if width <= 768 and height <= 768:
+    if (
+        width <= MAX_SHORT_SIDE_FOR_IMAGE_HIGH_RES
+        and height <= MAX_SHORT_SIDE_FOR_IMAGE_HIGH_RES
+    ):
        return width, height
    # Determine the longer and shorter sides
@@ -132,7 +139,10 @@ def resize_image_high_res(
# Test the function with the given example
def calculate_tiles_needed(
-    resized_width, resized_height, tile_width=512, tile_height=512
+    resized_width,
+    resized_height,
+    tile_width=MAX_TILE_WIDTH,
+    tile_height=MAX_TILE_HEIGHT,
):
    tiles_across = (resized_width + tile_width - 1) // tile_width
    tiles_down = (resized_height + tile_height - 1) // tile_height
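
(Illustration, not part of the commit.) calculate_tiles_needed is a ceiling division on each axis, so swapping the 512 literals for MAX_TILE_WIDTH/MAX_TILE_HEIGHT changes nothing numerically. A standalone worked example; only the two division lines mirror the hunk, the return value and test numbers are assumptions:

MAX_TILE_WIDTH = 512
MAX_TILE_HEIGHT = 512

def calculate_tiles_needed(
    resized_width,
    resized_height,
    tile_width=MAX_TILE_WIDTH,
    tile_height=MAX_TILE_HEIGHT,
):
    # ceiling division: how many tiles are needed to cover each axis
    tiles_across = (resized_width + tile_width - 1) // tile_width
    tiles_down = (resized_height + tile_height - 1) // tile_height
    return tiles_across * tiles_down  # total tile count (assumed return value)

assert calculate_tiles_needed(1024, 768) == 2 * 2  # 768 px needs two 512 px rows
assert calculate_tiles_needed(512, 512) == 1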

View file

@@ -5,7 +5,10 @@ from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union, cast
import httpx
import litellm
-from litellm.constants import RESPONSE_FORMAT_TOOL_NAME
+from litellm.constants import (
+    DEFAULT_ANTHROPIC_CHAT_MAX_TOKENS,
+    RESPONSE_FORMAT_TOOL_NAME,
+)
from litellm.litellm_core_utils.core_helpers import map_finish_reason
from litellm.litellm_core_utils.prompt_templates.factory import anthropic_messages_pt
from litellm.llms.base_llm.base_utils import type_to_response_format_param
@@ -50,7 +53,7 @@ class AnthropicConfig(BaseConfig):
    """
    max_tokens: Optional[int] = (
-        4096  # anthropic requires a default value (Opus, Sonnet, and Haiku have the same default)
+        DEFAULT_ANTHROPIC_CHAT_MAX_TOKENS  # anthropic requires a default value (Opus, Sonnet, and Haiku have the same default)
    )
    stop_sequences: Optional[list] = None
    temperature: Optional[int] = None
@@ -63,7 +66,7 @@ class AnthropicConfig(BaseConfig):
        self,
        max_tokens: Optional[
            int
-        ] = 4096,  # You can pass in a value yourself or use the default value 4096
+        ] = DEFAULT_ANTHROPIC_CHAT_MAX_TOKENS,  # You can pass in a value yourself or use the default value 4096
        stop_sequences: Optional[list] = None,
        temperature: Optional[int] = None,
        top_p: Optional[int] = None,

View file

@@ -11,6 +11,7 @@ from typing import AsyncIterator, Dict, Iterator, List, Optional, Union
import httpx
import litellm
+from litellm.constants import DEFAULT_MAX_TOKENS
from litellm.litellm_core_utils.prompt_templates.factory import (
    custom_prompt,
    prompt_factory,
@@ -65,7 +66,9 @@ class AnthropicTextConfig(BaseConfig):
    def __init__(
        self,
-        max_tokens_to_sample: Optional[int] = 256,  # anthropic requires a default
+        max_tokens_to_sample: Optional[
+            int
+        ] = DEFAULT_MAX_TOKENS,  # anthropic requires a default
        stop_sequences: Optional[list] = None,
        temperature: Optional[int] = None,
        top_p: Optional[int] = None,

View file

@@ -7,7 +7,7 @@ import httpx  # type: ignore
from openai import APITimeoutError, AsyncAzureOpenAI, AzureOpenAI
import litellm
-from litellm.constants import DEFAULT_MAX_RETRIES
+from litellm.constants import AZURE_OPERATION_POLLING_TIMEOUT, DEFAULT_MAX_RETRIES
from litellm.litellm_core_utils.litellm_logging import Logging as LiteLLMLoggingObj
from litellm.litellm_core_utils.logging_utils import track_llm_api_timing
from litellm.llms.custom_httpx.http_handler import (
@@ -859,7 +859,7 @@ class AzureChatCompletion(BaseAzureLLM, BaseLLM):
        await response.aread()
-        timeout_secs: int = 120
+        timeout_secs: int = AZURE_OPERATION_POLLING_TIMEOUT
        start_time = time.time()
        if "status" not in response.json():
            raise Exception(
@@ -959,7 +959,7 @@ class AzureChatCompletion(BaseAzureLLM, BaseLLM):
        response.read()
-        timeout_secs: int = 120
+        timeout_secs: int = AZURE_OPERATION_POLLING_TIMEOUT
        start_time = time.time()
        if "status" not in response.json():
            raise Exception(
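
(Illustration, not part of the commit.) timeout_secs drives a poll-until-done loop; below is a minimal sketch of that pattern using the new 120-second constant. The check_status callable, the poll interval, and the status strings are assumptions, not the file's actual code:

import time

AZURE_OPERATION_POLLING_TIMEOUT = 120  # seconds, from litellm/constants.py

def wait_for_operation(check_status, poll_interval: float = 1.0):
    """Poll until the operation reports a terminal status or the deadline passes."""
    timeout_secs: int = AZURE_OPERATION_POLLING_TIMEOUT
    start_time = time.time()
    while time.time() - start_time < timeout_secs:
        status = check_status()  # hypothetical callable returning a status string
        if status in ("succeeded", "failed"):
            return status
        time.sleep(poll_interval)
    raise TimeoutError(f"operation still pending after {timeout_secs}s")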

View file

@@ -7,6 +7,10 @@ from litellm.litellm_core_utils.prompt_templates.factory import (
    convert_to_azure_openai_messages,
)
from litellm.llms.base_llm.chat.transformation import BaseLLMException
+from litellm.types.llms.azure import (
+    API_VERSION_MONTH_SUPPORTED_RESPONSE_FORMAT,
+    API_VERSION_YEAR_SUPPORTED_RESPONSE_FORMAT,
+)
from litellm.types.utils import ModelResponse
from litellm.utils import supports_response_schema
@@ -123,7 +127,10 @@ class AzureOpenAIConfig(BaseConfig):
        - check if api_version is supported for response_format
        """
-        is_supported = int(api_version_year) <= 2024 and int(api_version_month) >= 8
+        is_supported = (
+            int(api_version_year) <= API_VERSION_YEAR_SUPPORTED_RESPONSE_FORMAT
+            and int(api_version_month) >= API_VERSION_MONTH_SUPPORTED_RESPONSE_FORMAT
+        )
        return is_supported
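
(Illustration, not part of the commit.) A worked example of the api_version gate above; the comparison mirrors the hunk, while the helper name and the version-string parsing are assumptions:

API_VERSION_YEAR_SUPPORTED_RESPONSE_FORMAT = 2024
API_VERSION_MONTH_SUPPORTED_RESPONSE_FORMAT = 8

def supports_response_format(api_version: str) -> bool:
    # e.g. "2024-08-01-preview" -> year 2024, month 8 (parsing is illustrative)
    year, month = api_version.split("-")[:2]
    return (
        int(year) <= API_VERSION_YEAR_SUPPORTED_RESPONSE_FORMAT
        and int(month) >= API_VERSION_MONTH_SUPPORTED_RESPONSE_FORMAT
    )

assert supports_response_format("2024-08-01-preview") is True
assert supports_response_format("2024-06-01") is False  # month 6 < 8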

View file

@@ -9,7 +9,7 @@ from pydantic import BaseModel
from litellm._logging import verbose_logger
from litellm.caching.caching import DualCache
-from litellm.constants import BEDROCK_INVOKE_PROVIDERS_LITERAL
+from litellm.constants import BEDROCK_INVOKE_PROVIDERS_LITERAL, BEDROCK_MAX_POLICY_SIZE
from litellm.litellm_core_utils.dd_tracing import tracer
from litellm.secret_managers.main import get_secret
@@ -381,7 +381,7 @@ class BaseAWSLLM:
            "region_name": aws_region_name,
        }
-        if sts_response["PackedPolicySize"] > 75:
+        if sts_response["PackedPolicySize"] > BEDROCK_MAX_POLICY_SIZE:
            verbose_logger.warning(
                f"The policy size is greater than 75% of the allowed size, PackedPolicySize: {sts_response['PackedPolicySize']}"
            )

View file

@@ -1274,13 +1274,6 @@
    def converse_chunk_parser(self, chunk_data: dict) -> ModelResponseStream:
        try:
            verbose_logger.debug("\n\nRaw Chunk: {}\n\n".format(chunk_data))
-            chunk_data["usage"] = {
-                "inputTokens": 3,
-                "outputTokens": 392,
-                "totalTokens": 2191,
-                "cacheReadInputTokens": 1796,
-                "cacheWriteInputTokens": 0,
-            }
            text = ""
            tool_use: Optional[ChatCompletionToolCallChunk] = None
            finish_reason = ""

View file

@@ -1,6 +1,7 @@
from typing import Optional, Tuple, Union
import litellm
+from litellm.constants import MIN_NON_ZERO_TEMPERATURE
from litellm.llms.openai.chat.gpt_transformation import OpenAIGPTConfig
from litellm.secret_managers.main import get_secret_str
@@ -84,7 +85,7 @@ class DeepInfraConfig(OpenAIGPTConfig):
                and value == 0
                and model == "mistralai/Mistral-7B-Instruct-v0.1"
            ):  # this model does no support temperature == 0
-                value = 0.0001  # close to 0
+                value = MIN_NON_ZERO_TEMPERATURE  # close to 0
            if param == "tool_choice":
                if (
                    value != "auto" and value != "none"

View file

@@ -4,6 +4,12 @@ For calculating cost of fireworks ai serverless inference models.
from typing import Tuple
+from litellm.constants import (
+    FIREWORKS_AI_16_B,
+    FIREWORKS_AI_56_B_MOE,
+    FIREWORKS_AI_80_B,
+    FIREWORKS_AI_176_B_MOE,
+)
from litellm.types.utils import Usage
from litellm.utils import get_model_info
@@ -25,9 +31,9 @@ def get_base_model_for_pricing(model_name: str) -> str:
    moe_match = re.search(r"(\d+)x(\d+)b", model_name)
    if moe_match:
        total_billion = int(moe_match.group(1)) * int(moe_match.group(2))
-        if total_billion <= 56:
+        if total_billion <= FIREWORKS_AI_56_B_MOE:
            return "fireworks-ai-moe-up-to-56b"
-        elif total_billion <= 176:
+        elif total_billion <= FIREWORKS_AI_176_B_MOE:
            return "fireworks-ai-56b-to-176b"
    # Check for standard models in the form <number>b
@@ -37,9 +43,9 @@ def get_base_model_for_pricing(model_name: str) -> str:
        params_billion = float(params_match)
        # Determine the category based on the number of parameters
-        if params_billion <= 16.0:
+        if params_billion <= FIREWORKS_AI_16_B:
            return "fireworks-ai-up-to-16b"
-        elif params_billion <= 80.0:
+        elif params_billion <= FIREWORKS_AI_80_B:
            return "fireworks-ai-16b-80b"
    # If no matches, return the original model_name
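
(Illustration, not part of the commit.) How the MoE and dense buckets fall out with the named constants. The comparisons and the MoE regex mirror the hunk; the dense-size regex, the fallthrough handling, and the model names are assumptions:

import re

FIREWORKS_AI_56_B_MOE = 56
FIREWORKS_AI_176_B_MOE = 176
FIREWORKS_AI_16_B = 16
FIREWORKS_AI_80_B = 80

def bucket(model_name: str) -> str:
    # mixture-of-experts models named <experts>x<size>b
    moe_match = re.search(r"(\d+)x(\d+)b", model_name)
    if moe_match:
        total_billion = int(moe_match.group(1)) * int(moe_match.group(2))
        if total_billion <= FIREWORKS_AI_56_B_MOE:
            return "fireworks-ai-moe-up-to-56b"
        elif total_billion <= FIREWORKS_AI_176_B_MOE:
            return "fireworks-ai-56b-to-176b"
    # dense models named <size>b (regex here is an assumption)
    params_match = re.search(r"(\d+)b", model_name)
    if params_match:
        params_billion = float(params_match.group(1))
        if params_billion <= FIREWORKS_AI_16_B:
            return "fireworks-ai-up-to-16b"
        elif params_billion <= FIREWORKS_AI_80_B:
            return "fireworks-ai-16b-80b"
    return model_name

assert bucket("mixtral-8x7b-instruct") == "fireworks-ai-moe-up-to-56b"  # 8*7 = 56
assert bucket("mixtral-8x22b-instruct") == "fireworks-ai-56b-to-176b"   # 8*22 = 176
assert bucket("llama-v3p1-70b-instruct") == "fireworks-ai-16b-80b"      # 70 <= 80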

View file

@@ -2,6 +2,7 @@ from typing import TYPE_CHECKING, Any, List, Literal, Optional, Union
from httpx import Headers, Response
+from litellm.constants import DEFAULT_MAX_TOKENS
from litellm.llms.base_llm.chat.transformation import BaseConfig, BaseLLMException
from litellm.types.llms.openai import AllMessageValues
from litellm.types.utils import ModelResponse
@@ -27,7 +28,7 @@ class PredibaseConfig(BaseConfig):
    decoder_input_details: Optional[bool] = None
    details: bool = True  # enables returning logprobs + best of
    max_new_tokens: int = (
-        256  # openai default - requests hang if max_new_tokens not given
+        DEFAULT_MAX_TOKENS  # openai default - requests hang if max_new_tokens not given
    )
    repetition_penalty: Optional[float] = None
    return_full_text: Optional[bool] = (

View file

@@ -4,6 +4,7 @@ import time
from typing import Callable, List, Union
import litellm
+from litellm.constants import REPLICATE_POLLING_DELAY_SECONDS
from litellm.llms.custom_httpx.http_handler import (
    AsyncHTTPHandler,
    HTTPHandler,
@@ -28,7 +29,9 @@ def handle_prediction_response_streaming(
    status = ""
    while True and (status not in ["succeeded", "failed", "canceled"]):
-        time.sleep(0.5)  # prevent being rate limited by replicate
+        time.sleep(
+            REPLICATE_POLLING_DELAY_SECONDS
+        )  # prevent being rate limited by replicate
        print_verbose(f"replicate: polling endpoint: {prediction_url}")
        response = http_client.get(prediction_url, headers=headers)
        if response.status_code == 200:
@@ -77,7 +80,9 @@ async def async_handle_prediction_response_streaming(
    status = ""
    while True and (status not in ["succeeded", "failed", "canceled"]):
-        await asyncio.sleep(0.5)  # prevent being rate limited by replicate
+        await asyncio.sleep(
+            REPLICATE_POLLING_DELAY_SECONDS
+        )  # prevent being rate limited by replicate
        print_verbose(f"replicate: polling endpoint: {prediction_url}")
        response = await http_client.get(prediction_url, headers=headers)
        if response.status_code == 200:

View file

@@ -3,6 +3,7 @@ from typing import TYPE_CHECKING, Any, List, Optional, Union
import httpx
import litellm
+from litellm.constants import REPLICATE_MODEL_NAME_WITH_ID_LENGTH
from litellm.litellm_core_utils.prompt_templates.common_utils import (
    convert_content_list_to_str,
)
@@ -220,10 +221,11 @@ class ReplicateConfig(BaseConfig):
        version_id = self.model_to_version_id(model)
        request_data: dict = {"input": input_data}
-        if ":" in version_id and len(version_id) > 64:
+        if ":" in version_id and len(version_id) > REPLICATE_MODEL_NAME_WITH_ID_LENGTH:
            model_parts = version_id.split(":")
            if (
-                len(model_parts) > 1 and len(model_parts[1]) == 64
+                len(model_parts) > 1
+                and len(model_parts[1]) == REPLICATE_MODEL_NAME_WITH_ID_LENGTH
            ):  ## checks if model name has a 64 digit code - e.g. "meta/llama-2-70b-chat:02e509c789964a7ea8736978a43525956ef40397be9033abf9fd2badfe68c9e3"
                request_data["version"] = model_parts[1]

View file

@@ -4,6 +4,16 @@ Handles calculating cost for together ai models
import re
+from litellm.constants import (
+    TOGETHER_AI_4_B,
+    TOGETHER_AI_8_B,
+    TOGETHER_AI_21_B,
+    TOGETHER_AI_41_B,
+    TOGETHER_AI_80_B,
+    TOGETHER_AI_110_B,
+    TOGETHER_AI_EMBEDDING_150_M,
+    TOGETHER_AI_EMBEDDING_350_M,
+)
from litellm.types.utils import CallTypes
@@ -31,17 +41,17 @@ def get_model_params_and_category(model_name, call_type: CallTypes) -> str:
        else:
            return model_name
    # Determine the category based on the number of parameters
-    if params_billion <= 4.0:
+    if params_billion <= TOGETHER_AI_4_B:
        category = "together-ai-up-to-4b"
-    elif params_billion <= 8.0:
+    elif params_billion <= TOGETHER_AI_8_B:
        category = "together-ai-4.1b-8b"
-    elif params_billion <= 21.0:
+    elif params_billion <= TOGETHER_AI_21_B:
        category = "together-ai-8.1b-21b"
-    elif params_billion <= 41.0:
+    elif params_billion <= TOGETHER_AI_41_B:
        category = "together-ai-21.1b-41b"
-    elif params_billion <= 80.0:
+    elif params_billion <= TOGETHER_AI_80_B:
        category = "together-ai-41.1b-80b"
-    elif params_billion <= 110.0:
+    elif params_billion <= TOGETHER_AI_110_B:
        category = "together-ai-81.1b-110b"
    if category is not None:
        return category
@@ -69,9 +79,9 @@ def get_model_params_and_category_embeddings(model_name) -> str:
        else:
            return model_name
    # Determine the category based on the number of parameters
-    if params_million <= 150:
+    if params_million <= TOGETHER_AI_EMBEDDING_150_M:
        category = "together-ai-embedding-up-to-150m"
-    elif params_million <= 350:
+    elif params_million <= TOGETHER_AI_EMBEDDING_350_M:
        category = "together-ai-embedding-151m-to-350m"
    if category is not None:
        return category
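
(Illustration, not part of the commit.) Same idea for Together AI: only the thresholds get names. Assuming the parameter count has already been parsed from the model name upstream (that part is not shown in the hunk), the chat bucketing reads:

TOGETHER_AI_4_B = 4
TOGETHER_AI_8_B = 8
TOGETHER_AI_21_B = 21
TOGETHER_AI_41_B = 41
TOGETHER_AI_80_B = 80
TOGETHER_AI_110_B = 110

def together_chat_bucket(params_billion: float) -> str:
    # thresholds mirror the hunk; the fallback value is an assumption
    if params_billion <= TOGETHER_AI_4_B:
        return "together-ai-up-to-4b"
    elif params_billion <= TOGETHER_AI_8_B:
        return "together-ai-4.1b-8b"
    elif params_billion <= TOGETHER_AI_21_B:
        return "together-ai-8.1b-21b"
    elif params_billion <= TOGETHER_AI_41_B:
        return "together-ai-21.1b-41b"
    elif params_billion <= TOGETHER_AI_80_B:
        return "together-ai-41.1b-80b"
    elif params_billion <= TOGETHER_AI_110_B:
        return "together-ai-81.1b-110b"
    return "unknown"

assert together_chat_bucket(70) == "together-ai-41.1b-80b"
assert together_chat_bucket(7) == "together-ai-4.1b-8b"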

View file

@@ -7,6 +7,7 @@ from typing import Any, AsyncIterator, Dict, Iterator, List, Literal, Optional,
from httpx import Headers, Response
+from litellm.constants import DEFAULT_MAX_TOKENS_FOR_TRITON
from litellm.litellm_core_utils.prompt_templates.factory import prompt_factory
from litellm.llms.base_llm.base_model_iterator import BaseModelResponseIterator
from litellm.llms.base_llm.chat.transformation import (
@@ -195,7 +196,9 @@ class TritonGenerateConfig(TritonConfig):
        data_for_triton: Dict[str, Any] = {
            "text_input": prompt_factory(model=model, messages=messages),
            "parameters": {
-                "max_tokens": int(optional_params.get("max_tokens", 2000)),
+                "max_tokens": int(
+                    optional_params.get("max_tokens", DEFAULT_MAX_TOKENS_FOR_TRITON)
+                ),
                "bad_words": [""],
                "stop_words": [""],
            },

View file

@@ -1,6 +1,7 @@
from typing import TYPE_CHECKING, Any, Optional
from litellm._logging import verbose_router_logger
+from litellm.constants import MAX_EXCEPTION_MESSAGE_LENGTH
from litellm.router_utils.cooldown_handlers import (
    _async_get_cooldown_deployments_with_debug_info,
)
@@ -54,7 +55,7 @@ async def send_llm_exception_alert(
    exception_str = str(original_exception)
    if litellm_debug_info is not None:
        exception_str += litellm_debug_info
-    exception_str += f"\n\n{error_traceback_str[:2000]}"
+    exception_str += f"\n\n{error_traceback_str[:MAX_EXCEPTION_MESSAGE_LENGTH]}"
    await litellm_router_instance.slack_alerting_logger.send_alert(
        message=f"LLM API call failed: `{exception_str}`",

View file

@@ -1,6 +1,8 @@
from enum import Enum
from typing import Optional, TypedDict
+DD_MAX_BATCH_SIZE = 1000
class DataDogStatus(str, Enum):
    INFO = "info"

View file

@@ -8,6 +8,10 @@ else:
    VertexBase = Any
+GCS_DEFAULT_BATCH_SIZE = 2048
+GCS_DEFAULT_FLUSH_INTERVAL_SECONDS = 20
class GCSLoggingConfig(TypedDict):
    """
    Internal LiteLLM Config for GCS Bucket logging

View file

@@ -7,6 +7,9 @@ from pydantic import BaseModel, Field
from litellm.types.utils import LiteLLMPydanticObjectBase
+SLACK_ALERTING_THRESHOLD_5_PERCENT = 0.05
+SLACK_ALERTING_THRESHOLD_15_PERCENT = 0.15
class BaseOutageModel(TypedDict):
    alerts: List[int]

View file

@@ -0,0 +1,2 @@
+API_VERSION_YEAR_SUPPORTED_RESPONSE_FORMAT = 2024
+API_VERSION_MONTH_SUPPORTED_RESPONSE_FORMAT = 8

View file

@@ -0,0 +1 @@

View file

@@ -35,6 +35,16 @@ ALLOWED_NUMBERS = {
    25,
    10000,
    60000,
+    8,
+    2048,
+    16000000000,
+    16,
+    16383,
+    14,
+    24,
+    128000,
+    0.01,
+    20,
}
# Add all standard HTTP status codes
@@ -55,16 +65,23 @@ HTTP_STATUS_CODES = {
    402,  # Payment Required
    403,  # Forbidden
    404,  # Not Found
+    406,  # Not Acceptable
    408,  # Request Timeout
    409,  # Conflict
+    413,  # Payload Too Large
    422,  # Unprocessable Entity
+    424,  # Failed Dependency
    429,  # Too Many Requests
+    498,  # Invalid Token
    499,  # Client Closed Request
    500,  # Internal Server Error
    501,  # Not Implemented
    502,  # Bad Gateway
    503,  # Service Unavailable
    504,  # Gateway Timeout
+    520,  # Web server is returning an unknown error
+    522,  # Connection timed out
+    524,  # A timeout occurred
    529,  # Site is overloaded
}
@@ -112,7 +129,13 @@ def check_file(filename):
def main():
    exit_code = 0
    folder = "../../litellm"
-    ignore_files = ["constants.py", "proxy_cli.py"]
+    ignore_files = [
+        "constants.py",
+        "proxy_cli.py",
+        "token_counter.py",
+        "mock_functions.py",
+        "duration_parser.py",
+    ]
    ignore_folder = "types"
    for root, dirs, files in os.walk(folder):
        for filename in files:
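
(Illustration, not part of the commit.) The deleted report in the next file is the output of this checker. A minimal sketch of how an AST-based guard like this can work; the ALLOWED_NUMBERS idea mirrors the hunk, but the function below is made up and far simpler than the repo's script:

import ast

ALLOWED_NUMBERS = {0, 1, 2, 10, 100, 404, 500}  # the real allow-list is much larger

def find_hardcoded_numbers(source: str):
    """Return (lineno, value) for numeric literals not in the allow-list."""
    hits = []
    for node in ast.walk(ast.parse(source)):
        if (
            isinstance(node, ast.Constant)
            and isinstance(node.value, (int, float))
            and not isinstance(node.value, bool)  # skip True/False
            and node.value not in ALLOWED_NUMBERS
        ):
            hits.append((node.lineno, node.value))
    return hits

print(find_hardcoded_numbers("timeout_secs = 120\nretries = 2\n"))
# -> [(1, 120)]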

View file

@@ -1,139 +0,0 @@
ERROR in ../../litellm/integrations/weights_biases.py: Hardcoded numbers detected:
Line 10: 8
ERROR in ../../litellm/integrations/gcs_bucket/gcs_bucket.py: Hardcoded numbers detected:
Line 23: 2048
Line 24: 20
ERROR in ../../litellm/integrations/datadog/datadog.py: Hardcoded numbers detected:
Line 165: 413
ERROR in ../../litellm/integrations/SlackAlerting/slack_alerting.py: Hardcoded numbers detected:
Line 649: 0.05
Line 652: 0.15
Line 1718: 24
ERROR in ../../litellm/integrations/opik/utils.py: Hardcoded numbers detected:
Line 14: 16000000000
Line 16: 16
Line 29: 16383
Line 33: 14
ERROR in ../../litellm/litellm_core_utils/token_counter.py: Hardcoded numbers detected:
Line 100: 768
Line 101: 2000
Line 104: 768
Line 104: 768
Line 135: 512
Line 135: 512
Line 148: 8
Line 157: 8
Line 160: 8
Line 192: 16
Line 192: 24
Line 202: 192
Line 202: 207
Line 202: 196
Line 205: 255
Line 215: 16
Line 216: 24
Line 216: 27
Line 217: 27
Line 220: 16
Line 221: 26
Line 221: 28
Line 221: 16383
Line 222: 28
Line 222: 16383
Line 225: 16
Line 226: 21
Line 227: 16383
Line 228: 14
Line 228: 16383
Line 238: 85
ERROR in ../../litellm/litellm_core_utils/litellm_logging.py: Hardcoded numbers detected:
Line 3681: 16
Line 3747: 20
ERROR in ../../litellm/litellm_core_utils/mock_functions.py: Hardcoded numbers detected:
Line 14: 1536
ERROR in ../../litellm/litellm_core_utils/duration_parser.py: Hardcoded numbers detected:
Line 30: 31
Line 59: 86400
Line 61: 604800
ERROR in ../../litellm/litellm_core_utils/get_llm_provider_logic.py: Hardcoded numbers detected:
Line 259: 64
Line 262: 64
ERROR in ../../litellm/litellm_core_utils/exception_mapping_utils.py: Hardcoded numbers detected:
Line 527: 413
Line 617: 413
Line 688: 14
Line 772: 424
Line 1058: 424
Line 1386: 498
Line 1612: 406
Line 1613: 413
Line 1635: 522
Line 1636: 524
Line 1669: 520
Line 1780: 524
ERROR in ../../litellm/litellm_core_utils/llm_cost_calc/tool_call_cost_tracking.py: Hardcoded numbers detected:
Line 135: 2.5
ERROR in ../../litellm/litellm_core_utils/llm_cost_calc/utils.py: Hardcoded numbers detected:
Line 13: 128000
ERROR in ../../litellm/router_utils/handle_error.py: Hardcoded numbers detected:
Line 57: 2000
ERROR in ../../litellm/llms/azure/azure.py: Hardcoded numbers detected:
Line 862: 120
Line 962: 120
ERROR in ../../litellm/llms/azure/common_utils.py: Hardcoded numbers detected:
Line 353: 8
ERROR in ../../litellm/llms/azure/chat/gpt_transformation.py: Hardcoded numbers detected:
Line 126: 2024
Line 126: 8
ERROR in ../../litellm/llms/predibase/chat/transformation.py: Hardcoded numbers detected:
Line 30: 256
Line 96: 0.01
ERROR in ../../litellm/llms/deepinfra/chat/transformation.py: Hardcoded numbers detected:
Line 87: 0.0001
ERROR in ../../litellm/llms/triton/completion/transformation.py: Hardcoded numbers detected:
Line 198: 2000
Line 274: 20
ERROR in ../../litellm/llms/bedrock/base_aws_llm.py: Hardcoded numbers detected:
Line 384: 75
ERROR in ../../litellm/llms/bedrock/chat/invoke_handler.py: Hardcoded numbers detected:
Line 1279: 392
Line 1280: 2191
Line 1281: 1796
ERROR in ../../litellm/llms/fireworks_ai/cost_calculator.py: Hardcoded numbers detected:
Line 28: 56
Line 30: 176
Line 40: 16.0
Line 42: 80.0
ERROR in ../../litellm/llms/replicate/chat/transformation.py: Hardcoded numbers detected:
Line 223: 64
Line 226: 64
ERROR in ../../litellm/llms/replicate/chat/handler.py: Hardcoded numbers detected:
Line 31: 0.5
Line 80: 0.5
ERROR in ../../litellm/llms/anthropic/chat/transformation.py: Hardcoded numbers detected:
Line 53: 4096
Line 66: 4096
ERROR in ../../litellm/llms/anthropic/completion/transformation.py: Hardcoded numbers detected:
Line 68: 256
ERROR in ../../litellm/llms/huggingface/chat/transformation.py: Hardcoded numbers detected:
Line 117: 0.01
ERROR in ../../litellm/llms/together_ai/cost_calculator.py: Hardcoded numbers detected:
Line 36: 8.0
Line 38: 21.0
Line 40: 41.0
Line 42: 80.0
Line 44: 110.0
Line 72: 150
Line 74: 350
ERROR in ../../litellm/llms/openai/openai.py: Hardcoded numbers detected:
Line 2018: 20
Line 2083: 20
ERROR in ../../litellm/llms/sagemaker/completion/transformation.py: Hardcoded numbers detected:
Line 84: 0.01
ERROR in ../../litellm/caching/qdrant_semantic_cache.py: Hardcoded numbers detected:
Line 121: 0.99
Line 135: 1536
ERROR in ../../litellm/caching/caching.py: Hardcoded numbers detected:
Line 409: 0.02
ERROR in ../../litellm/caching/in_memory_cache.py: Hardcoded numbers detected:
Line 55: 512