refactor: move more constants into constants.py

Krrish Dholakia 2025-03-24 18:28:58 -07:00
parent 3c26284aff
commit 04dbe4310c
7 changed files with 99 additions and 1115 deletions

View file

@@ -18,9 +18,22 @@ DEFAULT_IMAGE_HEIGHT = 300
DEFAULT_MAX_TOKENS = 256 # used when providers need a default
MAX_SIZE_PER_ITEM_IN_MEMORY_CACHE_IN_KB = 1024 # 1MB = 1024KB
SINGLE_DEPLOYMENT_TRAFFIC_FAILURE_THRESHOLD = 1000 # Minimum number of requests to consider "reasonable traffic". Used for single-deployment cooldown logic.
MINIMUM_PROMPT_CACHE_TOKEN_COUNT = (
1024 # minimum number of tokens to cache a prompt by Anthropic
)
DEFAULT_TRIM_RATIO = 0.75 # default ratio of tokens to trim from the end of a prompt
#### TOKEN COUNTING ####
FUNCTION_DEFINITION_TOKEN_COUNT = 9
SYSTEM_MESSAGE_TOKEN_COUNT = 4
TOOL_CHOICE_OBJECT_TOKEN_COUNT = 4
#### RELIABILITY ####
REPEATED_STREAMING_CHUNK_LIMIT = 100 # catch if model starts looping the same chunk while streaming. Uses high default to prevent false positives.
DEFAULT_MAX_LRU_CACHE_SIZE = 16
INITIAL_RETRY_DELAY = 0.5
MAX_RETRY_DELAY = 8.0
JITTER = 0.75
DEFAULT_IN_MEMORY_TTL = 5 # default time to live for the in-memory cache
DEFAULT_POLLING_INTERVAL = 0.03 # default polling interval for the scheduler
#### Networking settings ####
request_timeout: float = 6000 # time in seconds
STREAM_SSE_DONE_STRING: str = "[DONE]"
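The reliability constants above are easiest to read next to the kind of guard they configure. Below is a minimal sketch, not LiteLLM's actual streaming handler, of how a repeated-chunk ceiling like REPEATED_STREAMING_CHUNK_LIMIT can catch a model that loops the same chunk; only the constant import comes from this commit, the rest is illustrative.

```python
from litellm.constants import REPEATED_STREAMING_CHUNK_LIMIT


def guard_repeated_chunks(chunks):
    """Illustrative only: yield chunks, raising if one repeats too many times in a row."""
    last_chunk = None
    repeat_count = 0
    for chunk in chunks:
        if chunk and chunk == last_chunk:
            repeat_count += 1
            if repeat_count >= REPEATED_STREAMING_CHUNK_LIMIT:
                raise RuntimeError(
                    f"chunk repeated {repeat_count} times; model appears to be looping"
                )
        else:
            last_chunk = chunk
            repeat_count = 1
        yield chunk
```

The high default (100) matches the comment above: the limit exists to catch runaway loops, not to trip on legitimately repetitive output.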

View file

@@ -281,7 +281,7 @@ from litellm.router import (
LiteLLM_Params,
ModelGroupInfo,
)
from litellm.scheduler import DefaultPriorities, FlowItem, Scheduler
from litellm.scheduler import FlowItem, Scheduler
from litellm.secret_managers.aws_secret_manager import load_aws_kms
from litellm.secret_managers.google_kms import load_google_kms
from litellm.secret_managers.main import (
@@ -301,6 +301,7 @@ from litellm.types.llms.openai import HttpxBinaryResponseContent
from litellm.types.router import DeploymentTypedDict
from litellm.types.router import ModelInfo as RouterModelInfo
from litellm.types.router import RouterGeneralSettings, updateDeployment
from litellm.types.scheduler import DefaultPriorities
from litellm.types.utils import CredentialItem, CustomHuggingfaceTokenizer
from litellm.types.utils import ModelInfo as ModelMapInfo
from litellm.types.utils import RawRequestTypedDict, StandardLoggingPayload

View file

@@ -6,17 +6,14 @@ from pydantic import BaseModel
from litellm import print_verbose
from litellm.caching.caching import DualCache, RedisCache
from litellm.constants import DEFAULT_IN_MEMORY_TTL, DEFAULT_POLLING_INTERVAL
class SchedulerCacheKeys(enum.Enum):
queue = "scheduler:queue"
default_in_memory_ttl = 5 # cache queue in-memory for 5s when redis cache available
class DefaultPriorities(enum.Enum):
High = 0
Medium = 128
Low = 255
default_in_memory_ttl = (
DEFAULT_IN_MEMORY_TTL # cache queue in-memory for 5s when redis cache available
)
class FlowItem(BaseModel):
@@ -44,7 +41,9 @@ class Scheduler:
self.cache = DualCache(
redis_cache=redis_cache, default_in_memory_ttl=default_in_memory_ttl
)
self.polling_interval = polling_interval or 0.03 # default to 3ms
self.polling_interval = (
polling_interval or DEFAULT_POLLING_INTERVAL
) # default to 3ms
async def add_request(self, request: FlowItem):
# We use the priority directly, as lower values indicate higher priority
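For context on the two constants wired in here: the scheduler caches its queue in memory for DEFAULT_IN_MEMORY_TTL seconds when Redis is available, and sleeps DEFAULT_POLLING_INTERVAL seconds between polls. The sketch below is a simplified stand-in for that polling loop, not the Scheduler class itself; the heap-based queue and the `work` callback are assumptions for illustration.

```python
import asyncio
import heapq

from litellm.constants import DEFAULT_POLLING_INTERVAL

# (priority, request_id) pairs; lower priority values are served first,
# matching the "lower values indicate higher priority" convention above.
queue: list[tuple[int, str]] = []


async def poll_queue(work) -> None:
    """Illustrative polling loop: pop the highest-priority item, else sleep and retry."""
    while True:
        if queue:
            priority, request_id = heapq.heappop(queue)
            await work(priority, request_id)
        else:
            await asyncio.sleep(DEFAULT_POLLING_INTERVAL)
```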

View file

@@ -0,0 +1,7 @@
from enum import Enum
class DefaultPriorities(Enum):
High = 0
Medium = 128
Low = 255
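DefaultPriorities now lives in litellm.types.scheduler, which is what the proxy import change above points at. A quick usage example, assuming a caller only needs a numeric priority for a queued request (the request names are illustrative):

```python
from litellm.types.scheduler import DefaultPriorities

# Lower numbers win: High (0) is served before Medium (128) and Low (255).
requests = [("req-a", DefaultPriorities.Low.value), ("req-b", DefaultPriorities.High.value)]
requests.sort(key=lambda item: item[1])
print(requests[0])  # ('req-b', 0)
```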

View file

@@ -60,6 +60,16 @@ import litellm.litellm_core_utils.json_validation_rule
from litellm.caching._internal_lru_cache import lru_cache_wrapper
from litellm.caching.caching import DualCache
from litellm.caching.caching_handler import CachingHandlerResponse, LLMCachingHandler
from litellm.constants import (
DEFAULT_MAX_LRU_CACHE_SIZE,
DEFAULT_TRIM_RATIO,
FUNCTION_DEFINITION_TOKEN_COUNT,
INITIAL_RETRY_DELAY,
JITTER,
MAX_RETRY_DELAY,
MINIMUM_PROMPT_CACHE_TOKEN_COUNT,
TOOL_CHOICE_OBJECT_TOKEN_COUNT,
)
from litellm.integrations.custom_guardrail import CustomGuardrail
from litellm.integrations.custom_logger import CustomLogger
from litellm.litellm_core_utils.core_helpers import (
@@ -1519,7 +1529,7 @@ def _select_tokenizer(
return _select_tokenizer_helper(model=model)
@lru_cache(maxsize=128)
@lru_cache(maxsize=DEFAULT_MAX_LRU_CACHE_SIZE)
def _select_tokenizer_helper(model: str) -> SelectTokenizerResponse:
if litellm.disable_hf_tokenizer_download is True:
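The tokenizer-selection helper is memoized, and its cache is now capped at DEFAULT_MAX_LRU_CACHE_SIZE entries instead of 128. A standalone sketch of what a bounded functools.lru_cache does; the expensive_lookup function is illustrative, not LiteLLM code:

```python
from functools import lru_cache

from litellm.constants import DEFAULT_MAX_LRU_CACHE_SIZE


@lru_cache(maxsize=DEFAULT_MAX_LRU_CACHE_SIZE)
def expensive_lookup(model: str) -> str:
    # Stand-in for tokenizer selection; results for the most recent
    # DEFAULT_MAX_LRU_CACHE_SIZE distinct models are kept, older ones are evicted.
    return f"tokenizer-for-{model}"


for name in [f"model-{i}" for i in range(20)]:
    expensive_lookup(name)
print(expensive_lookup.cache_info())  # hits=0, misses=20, maxsize=16, currsize=16
```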
@@ -1664,7 +1674,7 @@
if tools:
num_tokens += len(encoding.encode(_format_function_definitions(tools)))
num_tokens += 9 # Additional tokens for function definition of tools
num_tokens += FUNCTION_DEFINITION_TOKEN_COUNT # Additional tokens for function definition of tools
# If there's a system message and tools are present, subtract four tokens
if tools and includes_system_message:
num_tokens -= 4
@@ -1674,7 +1684,7 @@
if tool_choice == "none":
num_tokens += 1
elif isinstance(tool_choice, dict):
num_tokens += 7
num_tokens += TOOL_CHOICE_OBJECT_TOKEN_COUNT
num_tokens += len(encoding.encode(tool_choice["function"]["name"]))
return num_tokens
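The magic numbers in openai_token_counter are now named constants. Below is a hedged sketch of just the fixed-overhead arithmetic, using only the constants this commit introduces; the helper itself is illustrative, not the real counter, which also encodes the messages and tool definitions.

```python
from litellm.constants import (
    FUNCTION_DEFINITION_TOKEN_COUNT,
    TOOL_CHOICE_OBJECT_TOKEN_COUNT,
)


def tool_overhead_tokens(has_tools: bool, tool_choice_is_object: bool) -> int:
    """Illustrative: fixed token overhead added on top of the encoded message tokens."""
    overhead = 0
    if has_tools:
        overhead += FUNCTION_DEFINITION_TOKEN_COUNT  # overhead for the tools block
    if tool_choice_is_object:
        overhead += TOOL_CHOICE_OBJECT_TOKEN_COUNT  # overhead for a {"function": ...} tool_choice
    return overhead


print(tool_overhead_tokens(has_tools=True, tool_choice_is_object=True))
```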
@@ -5311,15 +5321,15 @@ def _calculate_retry_after(
if retry_after is not None and 0 < retry_after <= 60:
return retry_after
initial_retry_delay = 0.5
max_retry_delay = 8.0
initial_retry_delay = INITIAL_RETRY_DELAY
max_retry_delay = MAX_RETRY_DELAY
nb_retries = max_retries - remaining_retries
# Apply exponential backoff, but not more than the max.
sleep_seconds = min(initial_retry_delay * pow(2.0, nb_retries), max_retry_delay)
# Apply some jitter, plus-or-minus half a second.
jitter = 1 - 0.25 * random.random()
jitter = JITTER * random.random()
timeout = sleep_seconds * jitter
return timeout if timeout >= min_timeout else min_timeout
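With the literals replaced, the backoff arithmetic above reads as: the delay doubles from INITIAL_RETRY_DELAY up to a cap of MAX_RETRY_DELAY, then a random jitter factor scales it. A small worked example using the same formula; the print loop is illustrative.

```python
import random

from litellm.constants import INITIAL_RETRY_DELAY, JITTER, MAX_RETRY_DELAY

for nb_retries in range(5):
    # Same formula as _calculate_retry_after: exponential backoff capped at the max.
    sleep_seconds = min(INITIAL_RETRY_DELAY * pow(2.0, nb_retries), MAX_RETRY_DELAY)
    jitter = JITTER * random.random()
    print(nb_retries, round(sleep_seconds, 2), round(sleep_seconds * jitter, 2))
# sleep_seconds grows 0.5, 1.0, 2.0, 4.0, 8.0 before the jitter factor is applied
```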
@@ -5645,7 +5655,7 @@ def shorten_message_to_fit_limit(message, tokens_needed, model: Optional[str]):
def trim_messages(
messages,
model: Optional[str] = None,
trim_ratio: float = 0.75,
trim_ratio: float = DEFAULT_TRIM_RATIO,
return_response_tokens: bool = False,
max_tokens=None,
):
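trim_messages keeps its default behavior, trimming toward 75% of the model's context window, but the ratio now comes from DEFAULT_TRIM_RATIO in constants.py. A hedged usage example based on the signature shown above; the model name and messages are illustrative:

```python
from litellm.utils import trim_messages

messages = [
    {"role": "system", "content": "You are a concise assistant."},
    {"role": "user", "content": "Summarize this long document... " * 500},
]

# With no explicit max_tokens, the messages are trimmed so the prompt fits
# roughly DEFAULT_TRIM_RATIO (75%) of the model's max tokens.
trimmed = trim_messages(messages, model="gpt-3.5-turbo")
```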
@@ -6477,7 +6487,7 @@
model=model,
use_default_image_token_count=True,
)
return token_count >= 1024
return token_count >= MINIMUM_PROMPT_CACHE_TOKEN_COUNT
except Exception as e:
verbose_logger.error(f"Error in is_prompt_caching_valid_prompt: {e}")
return False
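is_prompt_caching_valid_prompt now compares against MINIMUM_PROMPT_CACHE_TOKEN_COUNT (1024, the minimum Anthropic will cache) instead of a bare literal. A hedged sketch of the same check using litellm.token_counter; the model name and messages are illustrative:

```python
import litellm
from litellm.constants import MINIMUM_PROMPT_CACHE_TOKEN_COUNT

messages = [{"role": "user", "content": "some long prompt " * 400}]
token_count = litellm.token_counter(
    model="claude-3-5-sonnet-20240620", messages=messages
)

# Mirrors the check above: only prompts at or above the minimum are worth caching.
print(token_count >= MINIMUM_PROMPT_CACHE_TOKEN_COUNT)
```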

View file

@@ -3,7 +3,32 @@ import ast
import os
# Extremely restrictive set of allowed numbers
ALLOWED_NUMBERS = {0, 1, -1, 2, 10, 100, 1000}
ALLOWED_NUMBERS = {
0,
1,
-1,
2,
10,
100,
1000,
1,
4,
3,
500,
408,
422,
401,
404,
429,
6,
409,
60,
403,
400,
3600,
0.75,
503,
}
class HardcodedNumberFinder(ast.NodeVisitor):
@@ -47,10 +72,13 @@ def main():
exit_code = 0
folder = "../../litellm"
ignore_file = "constants.py"
ignore_folder = "types"
for root, dirs, files in os.walk(folder):
for filename in files:
if filename.endswith(".py") and filename != ignore_file:
full_path = os.path.join(root, filename)
if ignore_folder in full_path:
continue
exit_code |= check_file(full_path)
sys.exit(exit_code)
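The body of HardcodedNumberFinder is not shown in this hunk; conceptually, the script parses each Python file and flags numeric literals that are not in ALLOWED_NUMBERS, skipping constants.py and the types folder. A minimal sketch of that idea, simplified and not the actual class:

```python
import ast

ALLOWED_NUMBERS = {0, 1, -1, 2, 10, 100, 1000}  # trimmed allow-list for the example


class NumberFinder(ast.NodeVisitor):
    def __init__(self) -> None:
        self.found: list[tuple[int, float]] = []

    def visit_Constant(self, node: ast.Constant) -> None:
        # Flag ints/floats outside the allow-list (bools are ints, so skip them).
        if isinstance(node.value, (int, float)) and not isinstance(node.value, bool):
            if node.value not in ALLOWED_NUMBERS:
                self.found.append((node.lineno, node.value))
        self.generic_visit(node)


finder = NumberFinder()
finder.visit(ast.parse("retry_delay = 0.5\nlimit = 100\n"))
print(finder.found)  # [(1, 0.5)] -> 0.5 would need to move into constants.py
```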

File diff suppressed because it is too large.