refactor: move more constants into constants.py

Krrish Dholakia 2025-03-24 18:28:58 -07:00
parent 3c26284aff
commit 04dbe4310c
7 changed files with 99 additions and 1115 deletions

View file

@@ -18,9 +18,22 @@ DEFAULT_IMAGE_HEIGHT = 300
DEFAULT_MAX_TOKENS = 256 # used when providers need a default
MAX_SIZE_PER_ITEM_IN_MEMORY_CACHE_IN_KB = 1024 # 1MB = 1024KB
SINGLE_DEPLOYMENT_TRAFFIC_FAILURE_THRESHOLD = 1000 # Minimum number of requests to consider "reasonable traffic". Used for single-deployment cooldown logic.
MINIMUM_PROMPT_CACHE_TOKEN_COUNT = (
1024 # minimum number of tokens to cache a prompt by Anthropic
)
DEFAULT_TRIM_RATIO = 0.75 # default ratio of tokens to trim from the end of a prompt
#### TOKEN COUNTING ####
FUNCTION_DEFINITION_TOKEN_COUNT = 9
SYSTEM_MESSAGE_TOKEN_COUNT = 4
TOOL_CHOICE_OBJECT_TOKEN_COUNT = 4
#### RELIABILITY ####
REPEATED_STREAMING_CHUNK_LIMIT = 100 # catch if model starts looping the same chunk while streaming. Uses high default to prevent false positives.
DEFAULT_MAX_LRU_CACHE_SIZE = 16
INITIAL_RETRY_DELAY = 0.5
MAX_RETRY_DELAY = 8.0
JITTER = 0.75
DEFAULT_IN_MEMORY_TTL = 5 # default time to live for the in-memory cache
DEFAULT_POLLING_INTERVAL = 0.03 # default polling interval for the scheduler
#### Networking settings ####
request_timeout: float = 6000 # time in seconds
STREAM_SSE_DONE_STRING: str = "[DONE]"
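The reliability constants above are easiest to read next to the kind of guard they configure. Below is a minimal sketch, not LiteLLM's actual streaming handler, of how a repeated-chunk ceiling like REPEATED_STREAMING_CHUNK_LIMIT can catch a model that loops the same chunk; only the constant import comes from this commit, the rest is illustrative.

```python
from litellm.constants import REPEATED_STREAMING_CHUNK_LIMIT


def guard_repeated_chunks(chunks):
    """Illustrative only: yield chunks, raising if one repeats too many times in a row."""
    last_chunk = None
    repeat_count = 0
    for chunk in chunks:
        if chunk and chunk == last_chunk:
            repeat_count += 1
            if repeat_count >= REPEATED_STREAMING_CHUNK_LIMIT:
                raise RuntimeError(
                    f"chunk repeated {repeat_count} times; model appears to be looping"
                )
        else:
            last_chunk = chunk
            repeat_count = 1
        yield chunk
```

The high default (100) matches the comment above: the limit exists to catch runaway loops, not to trip on legitimately repetitive output.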

View file

@@ -281,7 +281,7 @@ from litellm.router import (
LiteLLM_Params,
ModelGroupInfo,
)
from litellm.scheduler import DefaultPriorities, FlowItem, Scheduler
from litellm.scheduler import FlowItem, Scheduler
from litellm.secret_managers.aws_secret_manager import load_aws_kms
from litellm.secret_managers.google_kms import load_google_kms
from litellm.secret_managers.main import (
@@ -301,6 +301,7 @@ from litellm.types.llms.openai import HttpxBinaryResponseContent
from litellm.types.router import DeploymentTypedDict
from litellm.types.router import ModelInfo as RouterModelInfo
from litellm.types.router import RouterGeneralSettings, updateDeployment
from litellm.types.scheduler import DefaultPriorities
from litellm.types.utils import CredentialItem, CustomHuggingfaceTokenizer
from litellm.types.utils import ModelInfo as ModelMapInfo
from litellm.types.utils import RawRequestTypedDict, StandardLoggingPayload

View file

@@ -6,17 +6,14 @@ from pydantic import BaseModel
from litellm import print_verbose
from litellm.caching.caching import DualCache, RedisCache
from litellm.constants import DEFAULT_IN_MEMORY_TTL, DEFAULT_POLLING_INTERVAL
class SchedulerCacheKeys(enum.Enum):
queue = "scheduler:queue"
default_in_memory_ttl = 5 # cache queue in-memory for 5s when redis cache available
class DefaultPriorities(enum.Enum):
High = 0
Medium = 128
Low = 255
default_in_memory_ttl = (
DEFAULT_IN_MEMORY_TTL # cache queue in-memory for 5s when redis cache available
)
class FlowItem(BaseModel):
@@ -44,7 +41,9 @@ class Scheduler:
self.cache = DualCache(
redis_cache=redis_cache, default_in_memory_ttl=default_in_memory_ttl
)
self.polling_interval = polling_interval or 0.03 # default to 3ms
self.polling_interval = (
polling_interval or DEFAULT_POLLING_INTERVAL
) # default to 3ms
async def add_request(self, request: FlowItem):
# We use the priority directly, as lower values indicate higher priority
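For context on the two constants wired in here: the scheduler caches its queue in memory for DEFAULT_IN_MEMORY_TTL seconds when Redis is available, and sleeps DEFAULT_POLLING_INTERVAL seconds between polls. The sketch below is a simplified stand-in for that polling loop, not the Scheduler class itself; the heap-based queue and the `work` callback are assumptions for illustration.

```python
import asyncio
import heapq

from litellm.constants import DEFAULT_POLLING_INTERVAL

# (priority, request_id) pairs; lower priority values are served first,
# matching the "lower values indicate higher priority" convention above.
queue: list[tuple[int, str]] = []


async def poll_queue(work) -> None:
    """Illustrative polling loop: pop the highest-priority item, else sleep and retry."""
    while True:
        if queue:
            priority, request_id = heapq.heappop(queue)
            await work(priority, request_id)
        else:
            await asyncio.sleep(DEFAULT_POLLING_INTERVAL)
```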

View file

@@ -0,0 +1,7 @@
from enum import Enum
class DefaultPriorities(Enum):
High = 0
Medium = 128
Low = 255
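DefaultPriorities now lives in litellm.types.scheduler, which is what the proxy import change above points at. A quick usage example, assuming a caller only needs a numeric priority for a queued request (the request names are illustrative):

```python
from litellm.types.scheduler import DefaultPriorities

# Lower numbers win: High (0) is served before Medium (128) and Low (255).
requests = [("req-a", DefaultPriorities.Low.value), ("req-b", DefaultPriorities.High.value)]
requests.sort(key=lambda item: item[1])
print(requests[0])  # ('req-b', 0)
```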

View file

@@ -60,6 +60,16 @@ import litellm.litellm_core_utils.json_validation_rule
from litellm.caching._internal_lru_cache import lru_cache_wrapper
from litellm.caching.caching import DualCache
from litellm.caching.caching_handler import CachingHandlerResponse, LLMCachingHandler
from litellm.constants import (
DEFAULT_MAX_LRU_CACHE_SIZE,
DEFAULT_TRIM_RATIO,
FUNCTION_DEFINITION_TOKEN_COUNT,
INITIAL_RETRY_DELAY,
JITTER,
MAX_RETRY_DELAY,
MINIMUM_PROMPT_CACHE_TOKEN_COUNT,
TOOL_CHOICE_OBJECT_TOKEN_COUNT,
)
from litellm.integrations.custom_guardrail import CustomGuardrail
from litellm.integrations.custom_logger import CustomLogger
from litellm.litellm_core_utils.core_helpers import (
@@ -1519,7 +1529,7 @@ def _select_tokenizer(
return _select_tokenizer_helper(model=model)
@lru_cache(maxsize=128)
@lru_cache(maxsize=DEFAULT_MAX_LRU_CACHE_SIZE)
def _select_tokenizer_helper(model: str) -> SelectTokenizerResponse:
if litellm.disable_hf_tokenizer_download is True:
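The tokenizer-selection helper is memoized, and its cache is now capped at DEFAULT_MAX_LRU_CACHE_SIZE entries instead of 128. A standalone sketch of what a bounded functools.lru_cache does; the expensive_lookup function is illustrative, not LiteLLM code:

```python
from functools import lru_cache

from litellm.constants import DEFAULT_MAX_LRU_CACHE_SIZE


@lru_cache(maxsize=DEFAULT_MAX_LRU_CACHE_SIZE)
def expensive_lookup(model: str) -> str:
    # Stand-in for tokenizer selection; results for the most recent
    # DEFAULT_MAX_LRU_CACHE_SIZE distinct models are kept, older ones are evicted.
    return f"tokenizer-for-{model}"


for name in [f"model-{i}" for i in range(20)]:
    expensive_lookup(name)
print(expensive_lookup.cache_info())  # hits=0, misses=20, maxsize=16, currsize=16
```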
@@ -1664,7 +1674,7 @@
if tools:
num_tokens += len(encoding.encode(_format_function_definitions(tools)))
num_tokens += 9 # Additional tokens for function definition of tools
num_tokens += FUNCTION_DEFINITION_TOKEN_COUNT # Additional tokens for function definition of tools
# If there's a system message and tools are present, subtract four tokens
if tools and includes_system_message:
num_tokens -= 4
@@ -1674,7 +1684,7 @@
if tool_choice == "none":
num_tokens += 1
elif isinstance(tool_choice, dict):
num_tokens += 7
num_tokens += TOOL_CHOICE_OBJECT_TOKEN_COUNT
num_tokens += len(encoding.encode(tool_choice["function"]["name"]))
return num_tokens
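The magic numbers in openai_token_counter are now named constants. Below is a hedged sketch of just the fixed-overhead arithmetic, using only the constants this commit introduces; the helper itself is illustrative, not the real counter, which also encodes the messages and tool definitions.

```python
from litellm.constants import (
    FUNCTION_DEFINITION_TOKEN_COUNT,
    TOOL_CHOICE_OBJECT_TOKEN_COUNT,
)


def tool_overhead_tokens(has_tools: bool, tool_choice_is_object: bool) -> int:
    """Illustrative: fixed token overhead added on top of the encoded message tokens."""
    overhead = 0
    if has_tools:
        overhead += FUNCTION_DEFINITION_TOKEN_COUNT  # overhead for the tools block
    if tool_choice_is_object:
        overhead += TOOL_CHOICE_OBJECT_TOKEN_COUNT  # overhead for a {"function": ...} tool_choice
    return overhead


print(tool_overhead_tokens(has_tools=True, tool_choice_is_object=True))
```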
@@ -5311,15 +5321,15 @@ def _calculate_retry_after(
if retry_after is not None and 0 < retry_after <= 60:
return retry_after
initial_retry_delay = 0.5
max_retry_delay = 8.0
initial_retry_delay = INITIAL_RETRY_DELAY
max_retry_delay = MAX_RETRY_DELAY
nb_retries = max_retries - remaining_retries
# Apply exponential backoff, but not more than the max.
sleep_seconds = min(initial_retry_delay * pow(2.0, nb_retries), max_retry_delay)
# Apply some jitter, plus-or-minus half a second.
jitter = 1 - 0.25 * random.random()
jitter = JITTER * random.random()
timeout = sleep_seconds * jitter
return timeout if timeout >= min_timeout else min_timeout
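With the literals replaced, the backoff arithmetic above reads as: the delay doubles from INITIAL_RETRY_DELAY up to a cap of MAX_RETRY_DELAY, then a random jitter factor scales it. A small worked example using the same formula; the print loop is illustrative.

```python
import random

from litellm.constants import INITIAL_RETRY_DELAY, JITTER, MAX_RETRY_DELAY

for nb_retries in range(5):
    # Same formula as _calculate_retry_after: exponential backoff capped at the max.
    sleep_seconds = min(INITIAL_RETRY_DELAY * pow(2.0, nb_retries), MAX_RETRY_DELAY)
    jitter = JITTER * random.random()
    print(nb_retries, round(sleep_seconds, 2), round(sleep_seconds * jitter, 2))
# sleep_seconds grows 0.5, 1.0, 2.0, 4.0, 8.0 before the jitter factor is applied
```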
@@ -5645,7 +5655,7 @@ def shorten_message_to_fit_limit(message, tokens_needed, model: Optional[str]):
def trim_messages(
messages,
model: Optional[str] = None,
trim_ratio: float = 0.75,
trim_ratio: float = DEFAULT_TRIM_RATIO,
return_response_tokens: bool = False,
max_tokens=None,
):
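trim_messages keeps its default behavior, trimming toward 75% of the model's context window, but the ratio now comes from DEFAULT_TRIM_RATIO in constants.py. A hedged usage example based on the signature shown above; the model name and messages are illustrative:

```python
from litellm.utils import trim_messages

messages = [
    {"role": "system", "content": "You are a concise assistant."},
    {"role": "user", "content": "Summarize this long document... " * 500},
]

# With no explicit max_tokens, the messages are trimmed so the prompt fits
# roughly DEFAULT_TRIM_RATIO (75%) of the model's max tokens.
trimmed = trim_messages(messages, model="gpt-3.5-turbo")
```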
@@ -6477,7 +6487,7 @@
model=model,
use_default_image_token_count=True,
)
return token_count >= 1024
return token_count >= MINIMUM_PROMPT_CACHE_TOKEN_COUNT
except Exception as e:
verbose_logger.error(f"Error in is_prompt_caching_valid_prompt: {e}")
return False
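is_prompt_caching_valid_prompt now compares against MINIMUM_PROMPT_CACHE_TOKEN_COUNT (1024, the minimum Anthropic will cache) instead of a bare literal. A hedged sketch of the same check using litellm.token_counter; the model name and messages are illustrative:

```python
import litellm
from litellm.constants import MINIMUM_PROMPT_CACHE_TOKEN_COUNT

messages = [{"role": "user", "content": "some long prompt " * 400}]
token_count = litellm.token_counter(
    model="claude-3-5-sonnet-20240620", messages=messages
)

# Mirrors the check above: only prompts at or above the minimum are worth caching.
print(token_count >= MINIMUM_PROMPT_CACHE_TOKEN_COUNT)
```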

View file

@@ -3,7 +3,32 @@ import ast
import os
# Extremely restrictive set of allowed numbers
ALLOWED_NUMBERS = {0, 1, -1, 2, 10, 100, 1000}
ALLOWED_NUMBERS = {
0,
1,
-1,
2,
10,
100,
1000,
1,
4,
3,
500,
408,
422,
401,
404,
429,
6,
409,
60,
403,
400,
3600,
0.75,
503,
}
class HardcodedNumberFinder(ast.NodeVisitor):
@@ -47,10 +72,13 @@ def main():
exit_code = 0
folder = "../../litellm"
ignore_file = "constants.py"
ignore_folder = "types"
for root, dirs, files in os.walk(folder):
for filename in files:
if filename.endswith(".py") and filename != ignore_file:
full_path = os.path.join(root, filename)
if ignore_folder in full_path:
continue
exit_code |= check_file(full_path)
sys.exit(exit_code)
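The body of HardcodedNumberFinder is not shown in this hunk; conceptually, the script parses each Python file and flags numeric literals that are not in ALLOWED_NUMBERS, skipping constants.py and the types folder. A minimal sketch of that idea, simplified and not the actual class:

```python
import ast

ALLOWED_NUMBERS = {0, 1, -1, 2, 10, 100, 1000}  # trimmed allow-list for the example


class NumberFinder(ast.NodeVisitor):
    def __init__(self) -> None:
        self.found: list[tuple[int, float]] = []

    def visit_Constant(self, node: ast.Constant) -> None:
        # Flag ints/floats outside the allow-list (bools are ints, so skip them).
        if isinstance(node.value, (int, float)) and not isinstance(node.value, bool):
            if node.value not in ALLOWED_NUMBERS:
                self.found.append((node.lineno, node.value))
        self.generic_visit(node)


finder = NumberFinder()
finder.visit(ast.parse("retry_delay = 0.5\nlimit = 100\n"))
print(finder.found)  # [(1, 0.5)] -> 0.5 would need to move into constants.py
```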

File diff suppressed because it is too large.