refactor: move more constants into constants.py

Krrish Dholakia 2025-03-24 18:28:58 -07:00
parent 3c26284aff
commit 04dbe4310c
7 changed files with 99 additions and 1115 deletions


@@ -18,9 +18,22 @@ DEFAULT_IMAGE_HEIGHT = 300
 DEFAULT_MAX_TOKENS = 256  # used when providers need a default
 MAX_SIZE_PER_ITEM_IN_MEMORY_CACHE_IN_KB = 1024  # 1MB = 1024KB
 SINGLE_DEPLOYMENT_TRAFFIC_FAILURE_THRESHOLD = 1000  # Minimum number of requests to consider "reasonable traffic". Used for single-deployment cooldown logic.
+MINIMUM_PROMPT_CACHE_TOKEN_COUNT = (
+    1024  # minimum number of tokens to cache a prompt by Anthropic
+)
+DEFAULT_TRIM_RATIO = 0.75  # default ratio of tokens to trim from the end of a prompt
+#### TOKEN COUNTING ####
+FUNCTION_DEFINITION_TOKEN_COUNT = 9
+SYSTEM_MESSAGE_TOKEN_COUNT = 4
+TOOL_CHOICE_OBJECT_TOKEN_COUNT = 4
 #### RELIABILITY ####
 REPEATED_STREAMING_CHUNK_LIMIT = 100  # catch if model starts looping the same chunk while streaming. Uses high default to prevent false positives.
 DEFAULT_MAX_LRU_CACHE_SIZE = 16
+INITIAL_RETRY_DELAY = 0.5
+MAX_RETRY_DELAY = 8.0
+JITTER = 0.75
+DEFAULT_IN_MEMORY_TTL = 5  # default time to live for the in-memory cache
+DEFAULT_POLLING_INTERVAL = 0.03  # default polling interval for the scheduler
 #### Networking settings ####
 request_timeout: float = 6000  # time in seconds
 STREAM_SSE_DONE_STRING: str = "[DONE]"
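
The constants added here replace magic numbers that were previously inlined in the modules changed below. As a minimal sketch (assuming litellm is installed), downstream code can read them the same way the updated modules do; the helper function here is hypothetical and not part of the commit:

# Hypothetical usage sketch, not part of the commit.
from litellm.constants import DEFAULT_TRIM_RATIO, MINIMUM_PROMPT_CACHE_TOKEN_COUNT

def fits_anthropic_prompt_cache(token_count: int) -> bool:
    # Anthropic only caches prompts at or above this token count.
    return token_count >= MINIMUM_PROMPT_CACHE_TOKEN_COUNT

print(fits_anthropic_prompt_cache(2048))  # True
print(int(4096 * DEFAULT_TRIM_RATIO))     # 3072 -> tokens kept when trimming to 75%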


@@ -281,7 +281,7 @@ from litellm.router import (
     LiteLLM_Params,
     ModelGroupInfo,
 )
-from litellm.scheduler import DefaultPriorities, FlowItem, Scheduler
+from litellm.scheduler import FlowItem, Scheduler
 from litellm.secret_managers.aws_secret_manager import load_aws_kms
 from litellm.secret_managers.google_kms import load_google_kms
 from litellm.secret_managers.main import (
@@ -301,6 +301,7 @@ from litellm.types.llms.openai import HttpxBinaryResponseContent
 from litellm.types.router import DeploymentTypedDict
 from litellm.types.router import ModelInfo as RouterModelInfo
 from litellm.types.router import RouterGeneralSettings, updateDeployment
+from litellm.types.scheduler import DefaultPriorities
 from litellm.types.utils import CredentialItem, CustomHuggingfaceTokenizer
 from litellm.types.utils import ModelInfo as ModelMapInfo
 from litellm.types.utils import RawRequestTypedDict, StandardLoggingPayload


@@ -6,17 +6,14 @@ from pydantic import BaseModel
 from litellm import print_verbose
 from litellm.caching.caching import DualCache, RedisCache
+from litellm.constants import DEFAULT_IN_MEMORY_TTL, DEFAULT_POLLING_INTERVAL


 class SchedulerCacheKeys(enum.Enum):
     queue = "scheduler:queue"
-    default_in_memory_ttl = 5  # cache queue in-memory for 5s when redis cache available
+    default_in_memory_ttl = (
+        DEFAULT_IN_MEMORY_TTL  # cache queue in-memory for 5s when redis cache available
+    )
-
-
-class DefaultPriorities(enum.Enum):
-    High = 0
-    Medium = 128
-    Low = 255


 class FlowItem(BaseModel):
@@ -44,7 +41,9 @@ class Scheduler:
         self.cache = DualCache(
             redis_cache=redis_cache, default_in_memory_ttl=default_in_memory_ttl
         )
-        self.polling_interval = polling_interval or 0.03  # default to 3ms
+        self.polling_interval = (
+            polling_interval or DEFAULT_POLLING_INTERVAL
+        )  # default to 3ms

     async def add_request(self, request: FlowItem):
         # We use the priority directly, as lower values indicate higher priority
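
For context, a hypothetical sketch of queueing a request against this scheduler. It assumes FlowItem exposes priority, request_id, and model_name fields and that Scheduler() can be constructed with its defaults (in which case polling_interval falls back to DEFAULT_POLLING_INTERVAL as shown above); neither assumption is verified by this diff.

# Hypothetical sketch; field names and the no-arg constructor are assumptions.
import asyncio

from litellm.scheduler import FlowItem, Scheduler

async def enqueue() -> None:
    scheduler = Scheduler()  # no redis_cache -> in-memory DualCache, default polling interval
    item = FlowItem(priority=0, request_id="req-1", model_name="gpt-4o")  # 0 = highest priority
    await scheduler.add_request(item)

asyncio.run(enqueue())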


@@ -0,0 +1,7 @@
+from enum import Enum
+
+
+class DefaultPriorities(Enum):
+    High = 0
+    Medium = 128
+    Low = 255
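
DefaultPriorities now lives in litellm.types.scheduler (matching the proxy import change above) instead of litellm.scheduler. A small sketch of the semantics, not part of the commit: lower values are served first, per the scheduler comment above.

# Not part of the commit: lower enum value = higher scheduling priority.
from litellm.types.scheduler import DefaultPriorities

queued = [("nightly-batch", DefaultPriorities.Low), ("interactive-chat", DefaultPriorities.High)]
queued.sort(key=lambda pair: pair[1].value)
print([name for name, _ in queued])  # ['interactive-chat', 'nightly-batch']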


@@ -60,6 +60,16 @@ import litellm.litellm_core_utils.json_validation_rule
 from litellm.caching._internal_lru_cache import lru_cache_wrapper
 from litellm.caching.caching import DualCache
 from litellm.caching.caching_handler import CachingHandlerResponse, LLMCachingHandler
+from litellm.constants import (
+    DEFAULT_MAX_LRU_CACHE_SIZE,
+    DEFAULT_TRIM_RATIO,
+    FUNCTION_DEFINITION_TOKEN_COUNT,
+    INITIAL_RETRY_DELAY,
+    JITTER,
+    MAX_RETRY_DELAY,
+    MINIMUM_PROMPT_CACHE_TOKEN_COUNT,
+    TOOL_CHOICE_OBJECT_TOKEN_COUNT,
+)
 from litellm.integrations.custom_guardrail import CustomGuardrail
 from litellm.integrations.custom_logger import CustomLogger
 from litellm.litellm_core_utils.core_helpers import (
@@ -1519,7 +1529,7 @@ def _select_tokenizer(
     return _select_tokenizer_helper(model=model)


-@lru_cache(maxsize=128)
+@lru_cache(maxsize=DEFAULT_MAX_LRU_CACHE_SIZE)
 def _select_tokenizer_helper(model: str) -> SelectTokenizerResponse:
     if litellm.disable_hf_tokenizer_download is True:
@@ -1664,7 +1674,7 @@ def openai_token_counter(  # noqa: PLR0915
     if tools:
         num_tokens += len(encoding.encode(_format_function_definitions(tools)))
-        num_tokens += 9  # Additional tokens for function definition of tools
+        num_tokens += FUNCTION_DEFINITION_TOKEN_COUNT  # Additional tokens for function definition of tools
     # If there's a system message and tools are present, subtract four tokens
     if tools and includes_system_message:
         num_tokens -= 4
@@ -1674,7 +1684,7 @@
     if tool_choice == "none":
         num_tokens += 1
     elif isinstance(tool_choice, dict):
-        num_tokens += 7
+        num_tokens += TOOL_CHOICE_OBJECT_TOKEN_COUNT
         num_tokens += len(encoding.encode(tool_choice["function"]["name"]))

     return num_tokens
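
Taken together, the two hunks above charge a flat FUNCTION_DEFINITION_TOKEN_COUNT for declaring tools and a flat TOOL_CHOICE_OBJECT_TOKEN_COUNT for a dict-style tool_choice (note the inline value being replaced was 7, while the new constant is defined as 4 in constants.py above). A back-of-the-envelope sketch of the overhead arithmetic, with the encoded lengths made up for illustration:

# Illustrative arithmetic only; the encoded token lengths are made-up numbers.
FUNCTION_DEFINITION_TOKEN_COUNT = 9  # flat cost for passing tool definitions
TOOL_CHOICE_OBJECT_TOKEN_COUNT = 4   # flat cost for a dict tool_choice

encoded_tool_definitions = 120       # tokens in the serialized tool schemas (example)
encoded_tool_name = 5                # tokens in tool_choice["function"]["name"] (example)
includes_system_message = True

overhead = encoded_tool_definitions + FUNCTION_DEFINITION_TOKEN_COUNT
if includes_system_message:
    overhead -= 4                    # system-message adjustment from the first hunk
overhead += TOOL_CHOICE_OBJECT_TOKEN_COUNT + encoded_tool_name
print(overhead)  # 134
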
@@ -5311,15 +5321,15 @@ def _calculate_retry_after(
     if retry_after is not None and 0 < retry_after <= 60:
         return retry_after

-    initial_retry_delay = 0.5
-    max_retry_delay = 8.0
+    initial_retry_delay = INITIAL_RETRY_DELAY
+    max_retry_delay = MAX_RETRY_DELAY
     nb_retries = max_retries - remaining_retries

     # Apply exponential backoff, but not more than the max.
     sleep_seconds = min(initial_retry_delay * pow(2.0, nb_retries), max_retry_delay)

     # Apply some jitter, plus-or-minus half a second.
-    jitter = 1 - 0.25 * random.random()
+    jitter = JITTER * random.random()
     timeout = sleep_seconds * jitter

     return timeout if timeout >= min_timeout else min_timeout
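
With the new constants, the pre-jitter backoff is min(INITIAL_RETRY_DELAY * 2**n, MAX_RETRY_DELAY), i.e. 0.5s, 1s, 2s, 4s, then capped at 8s. Worth noting: the jitter rewrite is not a pure constant swap. The old expression 1 - 0.25 * random.random() scaled the sleep by a factor in (0.75, 1.0], whereas JITTER * random.random() yields a factor in [0, 0.75). A standalone sketch of the schedule, with the constant values copied from constants.py above:

# Standalone sketch of the pre-jitter backoff schedule; values mirror constants.py.
INITIAL_RETRY_DELAY = 0.5
MAX_RETRY_DELAY = 8.0

for nb_retries in range(6):
    sleep_seconds = min(INITIAL_RETRY_DELAY * pow(2.0, nb_retries), MAX_RETRY_DELAY)
    print(nb_retries, sleep_seconds)
# 0 0.5 / 1 1.0 / 2 2.0 / 3 4.0 / 4 8.0 / 5 8.0 (capped at MAX_RETRY_DELAY)
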
@@ -5645,7 +5655,7 @@ def shorten_message_to_fit_limit(message, tokens_needed, model: Optional[str]):
 def trim_messages(
     messages,
     model: Optional[str] = None,
-    trim_ratio: float = 0.75,
+    trim_ratio: float = DEFAULT_TRIM_RATIO,
     return_response_tokens: bool = False,
     max_tokens=None,
 ):
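
trim_messages is part of litellm's public utils, and the only change here is that the default trim_ratio now comes from DEFAULT_TRIM_RATIO (still 0.75). A minimal usage sketch, assuming litellm is installed and that the default call returns the trimmed message list; the message contents are placeholders:

# Minimal sketch; messages are placeholders. With the defaults, the helper
# trims the conversation to roughly 75% of the model's max token budget.
from litellm.utils import trim_messages

messages = [
    {"role": "system", "content": "You are a terse assistant."},
    {"role": "user", "content": "Summarize the following very long document ..."},
]
trimmed = trim_messages(messages, model="gpt-3.5-turbo")
print(len(trimmed))
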
@@ -6477,7 +6487,7 @@ def is_prompt_caching_valid_prompt(
             model=model,
             use_default_image_token_count=True,
         )
-        return token_count >= 1024
+        return token_count >= MINIMUM_PROMPT_CACHE_TOKEN_COUNT
     except Exception as e:
         verbose_logger.error(f"Error in is_prompt_caching_valid_prompt: {e}")
         return False


@@ -3,7 +3,32 @@ import ast
 import os

 # Extremely restrictive set of allowed numbers
-ALLOWED_NUMBERS = {0, 1, -1, 2, 10, 100, 1000}
+ALLOWED_NUMBERS = {
+    0,
+    1,
+    -1,
+    2,
+    10,
+    100,
+    1000,
+    1,
+    4,
+    3,
+    500,
+    408,
+    422,
+    401,
+    404,
+    429,
+    6,
+    409,
+    60,
+    403,
+    400,
+    3600,
+    0.75,
+    503,
+}


 class HardcodedNumberFinder(ast.NodeVisitor):
@@ -47,10 +72,13 @@ def main():
     exit_code = 0
     folder = "../../litellm"
     ignore_file = "constants.py"
+    ignore_folder = "types"
     for root, dirs, files in os.walk(folder):
         for filename in files:
             if filename.endswith(".py") and filename != ignore_file:
                 full_path = os.path.join(root, filename)
+                if ignore_folder in full_path:
+                    continue
                 exit_code |= check_file(full_path)
     sys.exit(exit_code)
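
This script walks the litellm package and fails when it finds numeric literals outside ALLOWED_NUMBERS; with this commit it also skips the types folder. HardcodedNumberFinder's body is not shown in this diff, so the following is only a hypothetical sketch of how an ast.NodeVisitor can flag disallowed literals, not the repository's implementation:

# Hypothetical sketch, not the repository's HardcodedNumberFinder.
import ast

ALLOWED_NUMBERS = {0, 1, -1, 2, 10, 100, 1000}

class NumberFinder(ast.NodeVisitor):
    def __init__(self) -> None:
        self.violations: list[tuple[int, object]] = []

    def visit_Constant(self, node: ast.Constant) -> None:
        value = node.value
        if isinstance(value, (int, float)) and not isinstance(value, bool):
            if value not in ALLOWED_NUMBERS:
                self.violations.append((node.lineno, value))
        self.generic_visit(node)

finder = NumberFinder()
finder.visit(ast.parse("retry_delay = 0.5\nmax_retries = 2\n"))
print(finder.violations)  # [(1, 0.5)] -> 0.5 is a hard-coded number, 2 is allowed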

File diff suppressed because it is too large.