Mirror of https://github.com/BerriAI/litellm.git (synced 2025-04-25 10:44:24 +00:00)
refactor: refactor: move more constants into constants.py
Commit 04dbe4310c (parent 3c26284aff)
7 changed files with 99 additions and 1115 deletions
@@ -18,9 +18,22 @@ DEFAULT_IMAGE_HEIGHT = 300
 DEFAULT_MAX_TOKENS = 256  # used when providers need a default
 MAX_SIZE_PER_ITEM_IN_MEMORY_CACHE_IN_KB = 1024  # 1MB = 1024KB
 SINGLE_DEPLOYMENT_TRAFFIC_FAILURE_THRESHOLD = 1000  # Minimum number of requests to consider "reasonable traffic". Used for single-deployment cooldown logic.
+MINIMUM_PROMPT_CACHE_TOKEN_COUNT = (
+    1024  # minimum number of tokens to cache a prompt by Anthropic
+)
+DEFAULT_TRIM_RATIO = 0.75  # default ratio of tokens to trim from the end of a prompt
+#### TOKEN COUNTING ####
+FUNCTION_DEFINITION_TOKEN_COUNT = 9
+SYSTEM_MESSAGE_TOKEN_COUNT = 4
+TOOL_CHOICE_OBJECT_TOKEN_COUNT = 4
 #### RELIABILITY ####
 REPEATED_STREAMING_CHUNK_LIMIT = 100  # catch if model starts looping the same chunk while streaming. Uses high default to prevent false positives.
 DEFAULT_MAX_LRU_CACHE_SIZE = 16
+INITIAL_RETRY_DELAY = 0.5
+MAX_RETRY_DELAY = 8.0
+JITTER = 0.75
+DEFAULT_IN_MEMORY_TTL = 5  # default time to live for the in-memory cache
+DEFAULT_POLLING_INTERVAL = 0.03  # default polling interval for the scheduler
 #### Networking settings ####
 request_timeout: float = 6000  # time in seconds
 STREAM_SSE_DONE_STRING: str = "[DONE]"
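For orientation, a minimal sketch (not part of the diff) of how downstream modules are expected to consume these names from litellm.constants instead of repeating the literals:

from litellm.constants import (
    DEFAULT_MAX_TOKENS,
    INITIAL_RETRY_DELAY,
    JITTER,
)

# Values as defined in the hunk above.
print(DEFAULT_MAX_TOKENS, INITIAL_RETRY_DELAY, JITTER)  # 256 0.5 0.75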
@@ -281,7 +281,7 @@ from litellm.router import (
     LiteLLM_Params,
     ModelGroupInfo,
 )
-from litellm.scheduler import DefaultPriorities, FlowItem, Scheduler
+from litellm.scheduler import FlowItem, Scheduler
 from litellm.secret_managers.aws_secret_manager import load_aws_kms
 from litellm.secret_managers.google_kms import load_google_kms
 from litellm.secret_managers.main import (
@@ -301,6 +301,7 @@ from litellm.types.llms.openai import HttpxBinaryResponseContent
 from litellm.types.router import DeploymentTypedDict
 from litellm.types.router import ModelInfo as RouterModelInfo
 from litellm.types.router import RouterGeneralSettings, updateDeployment
+from litellm.types.scheduler import DefaultPriorities
 from litellm.types.utils import CredentialItem, CustomHuggingfaceTokenizer
 from litellm.types.utils import ModelInfo as ModelMapInfo
 from litellm.types.utils import RawRequestTypedDict, StandardLoggingPayload
@@ -6,17 +6,14 @@ from pydantic import BaseModel

 from litellm import print_verbose
 from litellm.caching.caching import DualCache, RedisCache
+from litellm.constants import DEFAULT_IN_MEMORY_TTL, DEFAULT_POLLING_INTERVAL


 class SchedulerCacheKeys(enum.Enum):
     queue = "scheduler:queue"
-    default_in_memory_ttl = 5  # cache queue in-memory for 5s when redis cache available
-
-
-class DefaultPriorities(enum.Enum):
-    High = 0
-    Medium = 128
-    Low = 255
+    default_in_memory_ttl = (
+        DEFAULT_IN_MEMORY_TTL  # cache queue in-memory for 5s when redis cache available
+    )


 class FlowItem(BaseModel):
@@ -44,7 +41,9 @@ class Scheduler:
         self.cache = DualCache(
             redis_cache=redis_cache, default_in_memory_ttl=default_in_memory_ttl
         )
-        self.polling_interval = polling_interval or 0.03  # default to 3ms
+        self.polling_interval = (
+            polling_interval or DEFAULT_POLLING_INTERVAL
+        )  # default to 3ms

     async def add_request(self, request: FlowItem):
         # We use the priority directly, as lower values indicate higher priority
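A usage sketch of the fallback above, assuming the Scheduler constructor takes optional polling_interval and redis_cache keyword arguments, as the hunk suggests:

from litellm.scheduler import Scheduler

scheduler = Scheduler()                    # no value passed -> DEFAULT_POLLING_INTERVAL (0.03s)
custom = Scheduler(polling_interval=0.01)  # an explicit interval still takes precedence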
litellm/types/scheduler.py (new file, +7 lines)
@@ -0,0 +1,7 @@
+from enum import Enum
+
+
+class DefaultPriorities(Enum):
+    High = 0
+    Medium = 128
+    Low = 255
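A small sketch of how the relocated enum is read; per the add_request comment in the scheduler hunk above, lower values mean higher priority:

from litellm.types.scheduler import DefaultPriorities

priority = DefaultPriorities.High.value  # 0 -> scheduled before Medium (128) and Low (255)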
@@ -60,6 +60,16 @@ import litellm.litellm_core_utils.json_validation_rule
 from litellm.caching._internal_lru_cache import lru_cache_wrapper
 from litellm.caching.caching import DualCache
 from litellm.caching.caching_handler import CachingHandlerResponse, LLMCachingHandler
+from litellm.constants import (
+    DEFAULT_MAX_LRU_CACHE_SIZE,
+    DEFAULT_TRIM_RATIO,
+    FUNCTION_DEFINITION_TOKEN_COUNT,
+    INITIAL_RETRY_DELAY,
+    JITTER,
+    MAX_RETRY_DELAY,
+    MINIMUM_PROMPT_CACHE_TOKEN_COUNT,
+    TOOL_CHOICE_OBJECT_TOKEN_COUNT,
+)
 from litellm.integrations.custom_guardrail import CustomGuardrail
 from litellm.integrations.custom_logger import CustomLogger
 from litellm.litellm_core_utils.core_helpers import (
@@ -1519,7 +1529,7 @@ def _select_tokenizer(
     return _select_tokenizer_helper(model=model)


-@lru_cache(maxsize=128)
+@lru_cache(maxsize=DEFAULT_MAX_LRU_CACHE_SIZE)
 def _select_tokenizer_helper(model: str) -> SelectTokenizerResponse:

     if litellm.disable_hf_tokenizer_download is True:
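A self-contained sketch of the memoization pattern used here; resolve_tokenizer is an illustrative stand-in, not litellm's helper:

from functools import lru_cache

DEFAULT_MAX_LRU_CACHE_SIZE = 16  # value from litellm/constants.py in this diff

@lru_cache(maxsize=DEFAULT_MAX_LRU_CACHE_SIZE)
def resolve_tokenizer(model: str) -> str:
    # Stand-in for the real lookup; repeated calls with the same model string
    # are answered from the LRU cache instead of re-resolving the tokenizer.
    return f"tokenizer-for-{model}"

resolve_tokenizer("gpt-4o")  # computed once
resolve_tokenizer("gpt-4o")  # served from the cache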
@@ -1664,7 +1674,7 @@ def openai_token_counter(  # noqa: PLR0915

     if tools:
         num_tokens += len(encoding.encode(_format_function_definitions(tools)))
-        num_tokens += 9  # Additional tokens for function definition of tools
+        num_tokens += FUNCTION_DEFINITION_TOKEN_COUNT  # Additional tokens for function definition of tools
     # If there's a system message and tools are present, subtract four tokens
     if tools and includes_system_message:
         num_tokens -= 4
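A worked example of the bookkeeping above, using FUNCTION_DEFINITION_TOKEN_COUNT = 9 from this diff; the 50-token schema size is illustrative, not from the diff:

encoded_tool_schema_tokens = 50          # illustrative encoding length
num_tokens = encoded_tool_schema_tokens
num_tokens += 9                          # FUNCTION_DEFINITION_TOKEN_COUNT
includes_system_message = True
if includes_system_message:
    num_tokens -= 4                      # system-message overlap correction
assert num_tokens == 55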
@@ -1674,7 +1684,7 @@ def openai_token_counter(  # noqa: PLR0915
         if tool_choice == "none":
             num_tokens += 1
         elif isinstance(tool_choice, dict):
-            num_tokens += 7
+            num_tokens += TOOL_CHOICE_OBJECT_TOKEN_COUNT
             num_tokens += len(encoding.encode(tool_choice["function"]["name"]))

     return num_tokens
@@ -5311,15 +5321,15 @@ def _calculate_retry_after(
     if retry_after is not None and 0 < retry_after <= 60:
         return retry_after

-    initial_retry_delay = 0.5
-    max_retry_delay = 8.0
+    initial_retry_delay = INITIAL_RETRY_DELAY
+    max_retry_delay = MAX_RETRY_DELAY
     nb_retries = max_retries - remaining_retries

     # Apply exponential backoff, but not more than the max.
     sleep_seconds = min(initial_retry_delay * pow(2.0, nb_retries), max_retry_delay)

     # Apply some jitter, plus-or-minus half a second.
-    jitter = 1 - 0.25 * random.random()
+    jitter = JITTER * random.random()
     timeout = sleep_seconds * jitter
     return timeout if timeout >= min_timeout else min_timeout
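A standalone sketch of the retry-delay computation above, with the constant values introduced in this diff (INITIAL_RETRY_DELAY = 0.5, MAX_RETRY_DELAY = 8.0, JITTER = 0.75); retry_delay is an illustrative name:

import random

def retry_delay(nb_retries: int, min_timeout: float = 0.0) -> float:
    # Exponential backoff capped at the maximum: 0.5, 1, 2, 4, 8, 8, ...
    sleep_seconds = min(0.5 * pow(2.0, nb_retries), 8.0)
    # Jitter scales the delay by a random factor in [0, 0.75).
    timeout = sleep_seconds * (0.75 * random.random())
    return timeout if timeout >= min_timeout else min_timeout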
@@ -5645,7 +5655,7 @@ def shorten_message_to_fit_limit(message, tokens_needed, model: Optional[str]):
 def trim_messages(
     messages,
     model: Optional[str] = None,
-    trim_ratio: float = 0.75,
+    trim_ratio: float = DEFAULT_TRIM_RATIO,
     return_response_tokens: bool = False,
     max_tokens=None,
 ):
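A usage sketch for the signature above: with no explicit trim_ratio, messages are trimmed against DEFAULT_TRIM_RATIO (0.75) of the model's token budget. The import path assumes the public litellm.utils entry point; the message content is illustrative:

from litellm.utils import trim_messages

messages = [{"role": "user", "content": "a very long prompt ..."}]
trimmed = trim_messages(messages, model="gpt-3.5-turbo")  # trim_ratio defaults to DEFAULT_TRIM_RATIO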
@@ -6477,7 +6487,7 @@ def is_prompt_caching_valid_prompt(
             model=model,
             use_default_image_token_count=True,
         )
-        return token_count >= 1024
+        return token_count >= MINIMUM_PROMPT_CACHE_TOKEN_COUNT
     except Exception as e:
         verbose_logger.error(f"Error in is_prompt_caching_valid_prompt: {e}")
         return False
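An illustration of the threshold above: a prompt only qualifies for caching once its token count reaches MINIMUM_PROMPT_CACHE_TOKEN_COUNT (1024); the counts below are illustrative:

MINIMUM_PROMPT_CACHE_TOKEN_COUNT = 1024  # value from litellm/constants.py in this diff

print(900 >= MINIMUM_PROMPT_CACHE_TOKEN_COUNT)   # False -> prompt caching not applied
print(1500 >= MINIMUM_PROMPT_CACHE_TOKEN_COUNT)  # True  -> prompt is cacheable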
@@ -3,7 +3,32 @@ import ast
 import os

 # Extremely restrictive set of allowed numbers
-ALLOWED_NUMBERS = {0, 1, -1, 2, 10, 100, 1000}
+ALLOWED_NUMBERS = {
+    0,
+    1,
+    -1,
+    2,
+    10,
+    100,
+    1000,
+    1,
+    4,
+    3,
+    500,
+    408,
+    422,
+    401,
+    404,
+    429,
+    6,
+    409,
+    60,
+    403,
+    400,
+    3600,
+    0.75,
+    503,
+}


 class HardcodedNumberFinder(ast.NodeVisitor):
@@ -47,10 +72,13 @@ def main():
     exit_code = 0
     folder = "../../litellm"
     ignore_file = "constants.py"
+    ignore_folder = "types"
     for root, dirs, files in os.walk(folder):
         for filename in files:
             if filename.endswith(".py") and filename != ignore_file:
                 full_path = os.path.join(root, filename)
+                if ignore_folder in full_path:
+                    continue
                 exit_code |= check_file(full_path)
     sys.exit(exit_code)
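A compact, self-contained sketch of the checking pattern this script relies on (the real HardcodedNumberFinder may differ in details): walk the AST and report numeric literals that are not in the allowed set.

import ast

ALLOWED = {0, 1, -1, 2, 10, 100, 1000}

class NumberFinder(ast.NodeVisitor):
    def __init__(self) -> None:
        self.violations = []

    def visit_Constant(self, node: ast.Constant) -> None:
        # bool is a subclass of int, so exclude it explicitly.
        if isinstance(node.value, (int, float)) and not isinstance(node.value, bool):
            if node.value not in ALLOWED:
                self.violations.append(node.value)
        self.generic_visit(node)

finder = NumberFinder()
finder.visit(ast.parse("timeout = 6000\nretries = 2\n"))
print(finder.violations)  # [6000]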
File diff suppressed because it is too large