Squashed commit of the following: (#9709)

commit b12a9892b7
Author: Krrish Dholakia <krrishdholakia@gmail.com>
Date:   Wed Apr 2 08:09:56 2025 -0700

    fix(utils.py): don't modify openai_token_counter

commit 294de31803
Author: Krrish Dholakia <krrishdholakia@gmail.com>
Date:   Mon Mar 24 21:22:40 2025 -0700

    fix: fix linting error

commit cb6e9fbe40
Author: Krrish Dholakia <krrishdholakia@gmail.com>
Date:   Mon Mar 24 19:52:45 2025 -0700

    refactor: complete migration

commit bfc159172d
Author: Krrish Dholakia <krrishdholakia@gmail.com>
Date:   Mon Mar 24 19:09:59 2025 -0700

    refactor: refactor more constants

commit 43ffb6a558
Author: Krrish Dholakia <krrishdholakia@gmail.com>
Date:   Mon Mar 24 18:45:24 2025 -0700

    fix: test

commit 04dbe4310c
Author: Krrish Dholakia <krrishdholakia@gmail.com>
Date:   Mon Mar 24 18:28:58 2025 -0700

    refactor: refactor: move more constants into constants.py

commit 3c26284aff
Author: Krrish Dholakia <krrishdholakia@gmail.com>
Date:   Mon Mar 24 18:14:46 2025 -0700

    refactor: migrate hardcoded constants out of __init__.py

commit c11e0de69d
Author: Krrish Dholakia <krrishdholakia@gmail.com>
Date:   Mon Mar 24 18:11:21 2025 -0700

    build: migrate all constants into constants.py

commit 7882bdc787
Author: Krrish Dholakia <krrishdholakia@gmail.com>
Date:   Mon Mar 24 18:07:37 2025 -0700

    build: initial test banning hardcoded numbers in repo
Krish Dholakia authored 2025-04-02 21:24:54 -07:00, committed by GitHub
parent 5a722ef18f
commit 8ee32291e0
51 changed files with 509 additions and 118 deletions

View file

@ -56,6 +56,9 @@ from litellm.constants import (
bedrock_embedding_models,
known_tokenizer_config,
BEDROCK_INVOKE_PROVIDERS_LITERAL,
DEFAULT_MAX_TOKENS,
DEFAULT_SOFT_BUDGET,
DEFAULT_ALLOWED_FAILS,
)
from litellm.types.guardrails import GuardrailItem
from litellm.proxy._types import (
@ -155,7 +158,7 @@ token: Optional[
str
] = None # Not used anymore, will be removed in next MAJOR release - https://github.com/BerriAI/litellm/discussions/648
telemetry = True
max_tokens = 256 # OpenAI Defaults
max_tokens: int = DEFAULT_MAX_TOKENS # OpenAI Defaults
drop_params = bool(os.getenv("LITELLM_DROP_PARAMS", False))
modify_params = False
retry = True
@ -244,7 +247,7 @@ budget_duration: Optional[
str
] = None # proxy only - resets budget after fixed duration. You can set duration as seconds ("30s"), minutes ("30m"), hours ("30h"), days ("30d").
default_soft_budget: float = (
50.0 # by default all litellm proxy keys have a soft budget of 50.0
DEFAULT_SOFT_BUDGET # by default all litellm proxy keys have a soft budget of 50.0
)
forward_traceparent_to_llm_provider: bool = False

View file

@ -18,6 +18,7 @@ import redis # type: ignore
import redis.asyncio as async_redis # type: ignore
from litellm import get_secret, get_secret_str
from litellm.constants import REDIS_CONNECTION_POOL_TIMEOUT, REDIS_SOCKET_TIMEOUT
from ._logging import verbose_logger
@ -215,7 +216,7 @@ def _init_redis_sentinel(redis_kwargs) -> redis.Redis:
# Set up the Sentinel client
sentinel = redis.Sentinel(
sentinel_nodes,
socket_timeout=0.1,
socket_timeout=REDIS_SOCKET_TIMEOUT,
password=sentinel_password,
)
@ -239,7 +240,7 @@ def _init_async_redis_sentinel(redis_kwargs) -> async_redis.Redis:
# Set up the Sentinel client
sentinel = async_redis.Sentinel(
sentinel_nodes,
socket_timeout=0.1,
socket_timeout=REDIS_SOCKET_TIMEOUT,
password=sentinel_password,
)
@ -319,7 +320,7 @@ def get_redis_connection_pool(**env_overrides):
verbose_logger.debug("get_redis_connection_pool: redis_kwargs", redis_kwargs)
if "url" in redis_kwargs and redis_kwargs["url"] is not None:
return async_redis.BlockingConnectionPool.from_url(
timeout=5, url=redis_kwargs["url"]
timeout=REDIS_CONNECTION_POOL_TIMEOUT, url=redis_kwargs["url"]
)
connection_class = async_redis.Connection
if "ssl" in redis_kwargs:
@ -327,4 +328,6 @@ def get_redis_connection_pool(**env_overrides):
redis_kwargs.pop("ssl", None)
redis_kwargs["connection_class"] = connection_class
redis_kwargs.pop("startup_nodes", None)
return async_redis.BlockingConnectionPool(timeout=5, **redis_kwargs)
return async_redis.BlockingConnectionPool(
timeout=REDIS_CONNECTION_POOL_TIMEOUT, **redis_kwargs
)

View file

@ -14,6 +14,12 @@ import time
from typing import Literal, Optional
import litellm
from litellm.constants import (
DAYS_IN_A_MONTH,
DAYS_IN_A_WEEK,
DAYS_IN_A_YEAR,
HOURS_IN_A_DAY,
)
from litellm.utils import ModelResponse
@ -81,11 +87,11 @@ class BudgetManager:
if duration == "daily":
duration_in_days = 1
elif duration == "weekly":
duration_in_days = 7
duration_in_days = DAYS_IN_A_WEEK
elif duration == "monthly":
duration_in_days = 28
duration_in_days = DAYS_IN_A_MONTH
elif duration == "yearly":
duration_in_days = 365
duration_in_days = DAYS_IN_A_YEAR
else:
raise ValueError(
"""duration needs to be one of ["daily", "weekly", "monthly", "yearly"]"""
@ -182,7 +188,9 @@ class BudgetManager:
current_time = time.time()
# Convert duration from days to seconds
duration_in_seconds = self.user_dict[user]["duration"] * 24 * 60 * 60
duration_in_seconds = (
self.user_dict[user]["duration"] * HOURS_IN_A_DAY * 60 * 60
)
# Check if duration has elapsed
if current_time - last_updated_at >= duration_in_seconds:
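
For a quick sanity check on the arithmetic above, here is a standalone sketch using the constant values this PR adds to litellm/constants.py; the helper function itself is illustrative and not part of the diff.

HOURS_IN_A_DAY = 24     # values added to litellm/constants.py in this PR
DAYS_IN_A_WEEK = 7
DAYS_IN_A_MONTH = 28
DAYS_IN_A_YEAR = 365

def budget_duration_in_seconds(duration: str) -> int:
    # same mapping as the BudgetManager branches above, then days -> seconds
    days = {
        "daily": 1,
        "weekly": DAYS_IN_A_WEEK,
        "monthly": DAYS_IN_A_MONTH,
        "yearly": DAYS_IN_A_YEAR,
    }
    if duration not in days:
        raise ValueError(
            """duration needs to be one of ["daily", "weekly", "monthly", "yearly"]"""
        )
    return days[duration] * HOURS_IN_A_DAY * 60 * 60

assert budget_duration_in_seconds("weekly") == 604_800      # 7 * 24 * 60 * 60
assert budget_duration_in_seconds("monthly") == 2_419_200   # 28 * 24 * 60 * 60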

View file

@ -19,6 +19,7 @@ from pydantic import BaseModel
import litellm
from litellm._logging import verbose_logger
from litellm.constants import CACHED_STREAMING_CHUNK_DELAY
from litellm.litellm_core_utils.model_param_helper import ModelParamHelper
from litellm.types.caching import *
from litellm.types.utils import all_litellm_params
@ -406,7 +407,7 @@ class Cache:
}
]
}
time.sleep(0.02)
time.sleep(CACHED_STREAMING_CHUNK_DELAY)
def _get_cache_logic(
self,

View file

@ -15,7 +15,8 @@ from typing import Any, List, Optional
from pydantic import BaseModel
from ..constants import MAX_SIZE_PER_ITEM_IN_MEMORY_CACHE_IN_KB
from litellm.constants import MAX_SIZE_PER_ITEM_IN_MEMORY_CACHE_IN_KB
from .base_cache import BaseCache
@ -52,7 +53,8 @@ class InMemoryCache(BaseCache):
# Fast path for common primitive types that are typically small
if (
isinstance(value, (bool, int, float, str))
and len(str(value)) < self.max_size_per_item * 512
and len(str(value))
< self.max_size_per_item * MAX_SIZE_PER_ITEM_IN_MEMORY_CACHE_IN_KB
): # Conservative estimate
return True

View file

@ -11,10 +11,12 @@ Has 4 methods:
import ast
import asyncio
import json
from typing import Any
from typing import Any, cast
import litellm
from litellm._logging import print_verbose
from litellm.constants import QDRANT_SCALAR_QUANTILE, QDRANT_VECTOR_SIZE
from litellm.types.utils import EmbeddingResponse
from .base_cache import BaseCache
@ -118,7 +120,11 @@ class QdrantSemanticCache(BaseCache):
}
elif quantization_config == "scalar":
quantization_params = {
"scalar": {"type": "int8", "quantile": 0.99, "always_ram": False}
"scalar": {
"type": "int8",
"quantile": QDRANT_SCALAR_QUANTILE,
"always_ram": False,
}
}
elif quantization_config == "product":
quantization_params = {
@ -132,7 +138,7 @@ class QdrantSemanticCache(BaseCache):
new_collection_status = self.sync_client.put(
url=f"{self.qdrant_api_base}/collections/{self.collection_name}",
json={
"vectors": {"size": 1536, "distance": "Cosine"},
"vectors": {"size": QDRANT_VECTOR_SIZE, "distance": "Cosine"},
"quantization_config": quantization_params,
},
headers=self.headers,
@ -171,10 +177,13 @@ class QdrantSemanticCache(BaseCache):
prompt += message["content"]
# create an embedding for prompt
embedding_response = litellm.embedding(
model=self.embedding_model,
input=prompt,
cache={"no-store": True, "no-cache": True},
embedding_response = cast(
EmbeddingResponse,
litellm.embedding(
model=self.embedding_model,
input=prompt,
cache={"no-store": True, "no-cache": True},
),
)
# get the embedding
@ -212,10 +221,13 @@ class QdrantSemanticCache(BaseCache):
prompt += message["content"]
# convert to embedding
embedding_response = litellm.embedding(
model=self.embedding_model,
input=prompt,
cache={"no-store": True, "no-cache": True},
embedding_response = cast(
EmbeddingResponse,
litellm.embedding(
model=self.embedding_model,
input=prompt,
cache={"no-store": True, "no-cache": True},
),
)
# get the embedding

View file

@ -9,6 +9,7 @@ DEFAULT_FAILURE_THRESHOLD_PERCENT = (
0.5 # default cooldown a deployment if 50% of requests fail in a given minute
)
DEFAULT_MAX_TOKENS = 4096
DEFAULT_ALLOWED_FAILS = 3
DEFAULT_REDIS_SYNC_INTERVAL = 1
DEFAULT_COOLDOWN_TIME_SECONDS = 5
DEFAULT_REPLICATE_POLLING_RETRIES = 5
@ -16,16 +17,71 @@ DEFAULT_REPLICATE_POLLING_DELAY_SECONDS = 1
DEFAULT_IMAGE_TOKEN_COUNT = 250
DEFAULT_IMAGE_WIDTH = 300
DEFAULT_IMAGE_HEIGHT = 300
DEFAULT_MAX_TOKENS = 256 # used when providers need a default
MAX_SIZE_PER_ITEM_IN_MEMORY_CACHE_IN_KB = 1024 # 1MB = 1024KB
SINGLE_DEPLOYMENT_TRAFFIC_FAILURE_THRESHOLD = 1000 # Minimum number of requests to consider "reasonable traffic". Used for single-deployment cooldown logic.
REDIS_UPDATE_BUFFER_KEY = "litellm_spend_update_buffer"
REDIS_DAILY_SPEND_UPDATE_BUFFER_KEY = "litellm_daily_spend_update_buffer"
MAX_REDIS_BUFFER_DEQUEUE_COUNT = 100
MINIMUM_PROMPT_CACHE_TOKEN_COUNT = (
1024 # minimum number of tokens to cache a prompt by Anthropic
)
DEFAULT_TRIM_RATIO = 0.75 # default ratio of tokens to trim from the end of a prompt
HOURS_IN_A_DAY = 24
DAYS_IN_A_WEEK = 7
DAYS_IN_A_MONTH = 28
DAYS_IN_A_YEAR = 365
REPLICATE_MODEL_NAME_WITH_ID_LENGTH = 64
#### TOKEN COUNTING ####
FUNCTION_DEFINITION_TOKEN_COUNT = 9
SYSTEM_MESSAGE_TOKEN_COUNT = 4
TOOL_CHOICE_OBJECT_TOKEN_COUNT = 4
DEFAULT_MOCK_RESPONSE_PROMPT_TOKEN_COUNT = 10
DEFAULT_MOCK_RESPONSE_COMPLETION_TOKEN_COUNT = 20
MAX_SHORT_SIDE_FOR_IMAGE_HIGH_RES = 768
MAX_LONG_SIDE_FOR_IMAGE_HIGH_RES = 2000
MAX_TILE_WIDTH = 512
MAX_TILE_HEIGHT = 512
OPENAI_FILE_SEARCH_COST_PER_1K_CALLS = 2.5 / 1000
MIN_NON_ZERO_TEMPERATURE = 0.0001
#### RELIABILITY ####
REPEATED_STREAMING_CHUNK_LIMIT = 100 # catch if model starts looping the same chunk while streaming. Uses high default to prevent false positives.
DEFAULT_MAX_LRU_CACHE_SIZE = 16
INITIAL_RETRY_DELAY = 0.5
MAX_RETRY_DELAY = 8.0
JITTER = 0.75
DEFAULT_IN_MEMORY_TTL = 5 # default time to live for the in-memory cache
DEFAULT_POLLING_INTERVAL = 0.03 # default polling interval for the scheduler
AZURE_OPERATION_POLLING_TIMEOUT = 120
REDIS_SOCKET_TIMEOUT = 0.1
REDIS_CONNECTION_POOL_TIMEOUT = 5
NON_LLM_CONNECTION_TIMEOUT = 15 # timeout for adjacent services (e.g. jwt auth)
MAX_EXCEPTION_MESSAGE_LENGTH = 2000
BEDROCK_MAX_POLICY_SIZE = 75
REPLICATE_POLLING_DELAY_SECONDS = 0.5
DEFAULT_ANTHROPIC_CHAT_MAX_TOKENS = 4096
TOGETHER_AI_4_B = 4
TOGETHER_AI_8_B = 8
TOGETHER_AI_21_B = 21
TOGETHER_AI_41_B = 41
TOGETHER_AI_80_B = 80
TOGETHER_AI_110_B = 110
TOGETHER_AI_EMBEDDING_150_M = 150
TOGETHER_AI_EMBEDDING_350_M = 350
QDRANT_SCALAR_QUANTILE = 0.99
QDRANT_VECTOR_SIZE = 1536
CACHED_STREAMING_CHUNK_DELAY = 0.02
MAX_SIZE_PER_ITEM_IN_MEMORY_CACHE_IN_KB = 512
DEFAULT_MAX_TOKENS_FOR_TRITON = 2000
#### Networking settings ####
request_timeout: float = 6000 # time in seconds
STREAM_SSE_DONE_STRING: str = "[DONE]"
### SPEND TRACKING ###
DEFAULT_REPLICATE_GPU_PRICE_PER_SECOND = 0.001400 # price per second for a100 80GB
FIREWORKS_AI_56_B_MOE = 56
FIREWORKS_AI_176_B_MOE = 176
FIREWORKS_AI_16_B = 16
FIREWORKS_AI_80_B = 80
LITELLM_CHAT_PROVIDERS = [
"openai",
@ -426,6 +482,9 @@ MCP_TOOL_NAME_PREFIX = "mcp_tool"
MAX_SPENDLOG_ROWS_TO_QUERY = (
1_000_000 # if spendLogs has more than 1M rows, do not query the DB
)
DEFAULT_SOFT_BUDGET = (
50.0 # by default all litellm proxy keys have a soft budget of 50.0
)
# makes it clear this is a rate limit error for a litellm virtual key
RATE_LIMIT_ERROR_MESSAGE_FOR_VIRTUAL_KEY = "LiteLLM Virtual Key user_api_key_hash"
@ -451,3 +510,14 @@ LITELLM_PROXY_ADMIN_NAME = "default_user_id"
########################### DB CRON JOB NAMES ###########################
DB_SPEND_UPDATE_JOB_NAME = "db_spend_update_job"
DEFAULT_CRON_JOB_LOCK_TTL_SECONDS = 60 # 1 minute
PROXY_BUDGET_RESCHEDULER_MIN_TIME = 597
PROXY_BUDGET_RESCHEDULER_MAX_TIME = 605
PROXY_BATCH_WRITE_AT = 10 # in seconds
DEFAULT_HEALTH_CHECK_INTERVAL = 300 # 5 minutes
PROMETHEUS_FALLBACK_STATS_SEND_TIME_HOURS = 9
DEFAULT_MODEL_CREATED_AT_TIME = 1677610602 # returns on `/models` endpoint
DEFAULT_SLACK_ALERTING_THRESHOLD = 300
MAX_TEAM_LIST_LIMIT = 20
DEFAULT_PROMPT_INJECTION_SIMILARITY_THRESHOLD = 0.7
LENGTH_OF_LITELLM_GENERATED_KEY = 16
SECRET_MANAGER_REFRESH_INTERVAL = 86400
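
To illustrate the consumption pattern this migration enforces, here is a minimal, hypothetical consumer (the function name is made up; only the constant names and values come from the diff):

from litellm.constants import DEFAULT_SOFT_BUDGET, REDIS_CONNECTION_POOL_TIMEOUT

def describe_defaults() -> dict:
    # call sites now reference the named constant instead of repeating the literal
    return {
        "soft_budget": DEFAULT_SOFT_BUDGET,                    # 50.0
        "redis_pool_timeout": REDIS_CONNECTION_POOL_TIMEOUT,   # 5 (seconds)
    }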

View file

@ -9,6 +9,10 @@ from pydantic import BaseModel
import litellm
import litellm._logging
from litellm import verbose_logger
from litellm.constants import (
DEFAULT_MAX_LRU_CACHE_SIZE,
DEFAULT_REPLICATE_GPU_PRICE_PER_SECOND,
)
from litellm.litellm_core_utils.llm_cost_calc.tool_call_cost_tracking import (
StandardBuiltInToolCostTracking,
)
@ -355,9 +359,7 @@ def cost_per_token( # noqa: PLR0915
def get_replicate_completion_pricing(completion_response: dict, total_time=0.0):
# see https://replicate.com/pricing
# for all litellm currently supported LLMs, almost all requests go to a100_80gb
a100_80gb_price_per_second_public = (
0.001400 # assume all calls sent to A100 80GB for now
)
a100_80gb_price_per_second_public = DEFAULT_REPLICATE_GPU_PRICE_PER_SECOND # assume all calls sent to A100 80GB for now
if total_time == 0.0: # total time is in ms
start_time = completion_response.get("created", time.time())
end_time = getattr(completion_response, "ended", time.time())
@ -450,7 +452,7 @@ def _select_model_name_for_cost_calc(
return return_model
@lru_cache(maxsize=16)
@lru_cache(maxsize=DEFAULT_MAX_LRU_CACHE_SIZE)
def _model_contains_known_llm_provider(model: str) -> bool:
"""
Check if the model contains a known llm provider

View file

@ -16,6 +16,7 @@ import litellm.litellm_core_utils.litellm_logging
import litellm.types
from litellm._logging import verbose_logger, verbose_proxy_logger
from litellm.caching.caching import DualCache
from litellm.constants import HOURS_IN_A_DAY
from litellm.integrations.custom_batch_logger import CustomBatchLogger
from litellm.litellm_core_utils.duration_parser import duration_in_seconds
from litellm.litellm_core_utils.exception_mapping_utils import (
@ -649,10 +650,10 @@ class SlackAlerting(CustomBatchLogger):
event_message += (
f"Budget Crossed\n Total Budget:`{user_info.max_budget}`"
)
elif percent_left <= 0.05:
elif percent_left <= SLACK_ALERTING_THRESHOLD_5_PERCENT:
event = "threshold_crossed"
event_message += "5% Threshold Crossed "
elif percent_left <= 0.15:
elif percent_left <= SLACK_ALERTING_THRESHOLD_15_PERCENT:
event = "threshold_crossed"
event_message += "15% Threshold Crossed"
elif user_info.soft_budget is not None:
@ -1718,7 +1719,7 @@ Model Info:
await self.internal_usage_cache.async_set_cache(
key=_event_cache_key,
value="SENT",
ttl=(30 * 24 * 60 * 60), # 1 month
ttl=(30 * HOURS_IN_A_DAY * 60 * 60), # 1 month
)
except Exception as e:

View file

@ -41,7 +41,7 @@ from litellm.types.utils import StandardLoggingPayload
from ..additional_logging_utils import AdditionalLoggingUtils
# max number of logs DD API can accept
DD_MAX_BATCH_SIZE = 1000
# specify what ServiceTypes are logged as success events to DD. (We don't want to spam DD traces with large number of service types)
DD_LOGGED_SUCCESS_SERVICE_TYPES = [

View file

@ -20,10 +20,6 @@ else:
VertexBase = Any
GCS_DEFAULT_BATCH_SIZE = 2048
GCS_DEFAULT_FLUSH_INTERVAL_SECONDS = 20
class GCSBucketLogger(GCSBucketBase, AdditionalLoggingUtils):
def __init__(self, bucket_name: Optional[str] = None) -> None:
from litellm.proxy.proxy_server import premium_user

View file

@ -3,6 +3,7 @@ from typing import Optional, Tuple
import httpx
import litellm
from litellm.constants import REPLICATE_MODEL_NAME_WITH_ID_LENGTH
from litellm.secret_managers.main import get_secret, get_secret_str
from ..types.router import LiteLLM_Params
@ -256,10 +257,13 @@ def get_llm_provider( # noqa: PLR0915
elif model in litellm.cohere_chat_models:
custom_llm_provider = "cohere_chat"
## replicate
elif model in litellm.replicate_models or (":" in model and len(model) > 64):
elif model in litellm.replicate_models or (
":" in model and len(model) > REPLICATE_MODEL_NAME_WITH_ID_LENGTH
):
model_parts = model.split(":")
if (
len(model_parts) > 1 and len(model_parts[1]) == 64
len(model_parts) > 1
and len(model_parts[1]) == REPLICATE_MODEL_NAME_WITH_ID_LENGTH
): ## checks if model name has a 64 digit code - e.g. "meta/llama-2-70b-chat:02e509c789964a7ea8736978a43525956ef40397be9033abf9fd2badfe68c9e3"
custom_llm_provider = "replicate"
elif model in litellm.replicate_models:
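
The 64-character threshold matches the sha-style version suffix in Replicate model strings; a quick illustration using the example model string from the comment in the hunk above:

REPLICATE_MODEL_NAME_WITH_ID_LENGTH = 64  # value added to litellm/constants.py in this PR

model = "meta/llama-2-70b-chat:02e509c789964a7ea8736978a43525956ef40397be9033abf9fd2badfe68c9e3"
_, version = model.split(":", 1)
assert len(version) == REPLICATE_MODEL_NAME_WITH_ID_LENGTH  # this is what the refactored check tests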

View file

@ -28,6 +28,10 @@ from litellm._logging import _is_debugging_on, verbose_logger
from litellm.batches.batch_utils import _handle_completed_batch
from litellm.caching.caching import DualCache, InMemoryCache
from litellm.caching.caching_handler import LLMCachingHandler
from litellm.constants import (
DEFAULT_MOCK_RESPONSE_COMPLETION_TOKEN_COUNT,
DEFAULT_MOCK_RESPONSE_PROMPT_TOKEN_COUNT,
)
from litellm.cost_calculator import _select_model_name_for_cost_calc
from litellm.integrations.arize.arize import ArizeLogger
from litellm.integrations.custom_guardrail import CustomGuardrail
@ -3745,9 +3749,12 @@ def create_dummy_standard_logging_payload() -> StandardLoggingPayload:
response_cost=response_cost,
response_cost_failure_debug_info=None,
status=str("success"),
total_tokens=int(30),
prompt_tokens=int(20),
completion_tokens=int(10),
total_tokens=int(
DEFAULT_MOCK_RESPONSE_PROMPT_TOKEN_COUNT
+ DEFAULT_MOCK_RESPONSE_COMPLETION_TOKEN_COUNT
),
prompt_tokens=int(DEFAULT_MOCK_RESPONSE_PROMPT_TOKEN_COUNT),
completion_tokens=int(DEFAULT_MOCK_RESPONSE_COMPLETION_TOKEN_COUNT),
startTime=start_time,
endTime=end_time,
completionStartTime=completion_start_time,

View file

@ -5,6 +5,7 @@ Helper utilities for tracking the cost of built-in tools.
from typing import Any, Dict, List, Optional
import litellm
from litellm.constants import OPENAI_FILE_SEARCH_COST_PER_1K_CALLS
from litellm.types.llms.openai import FileSearchTool, WebSearchOptions
from litellm.types.utils import (
ModelInfo,
@ -132,7 +133,7 @@ class StandardBuiltInToolCostTracking:
"""
if file_search is None:
return 0.0
return 2.5 / 1000
return OPENAI_FILE_SEARCH_COST_PER_1K_CALLS
@staticmethod
def chat_completion_response_includes_annotations(

View file

@ -11,6 +11,10 @@ from litellm.constants import (
DEFAULT_IMAGE_HEIGHT,
DEFAULT_IMAGE_TOKEN_COUNT,
DEFAULT_IMAGE_WIDTH,
MAX_LONG_SIDE_FOR_IMAGE_HIGH_RES,
MAX_SHORT_SIDE_FOR_IMAGE_HIGH_RES,
MAX_TILE_HEIGHT,
MAX_TILE_WIDTH,
)
from litellm.llms.custom_httpx.http_handler import _get_httpx_client
@ -97,11 +101,14 @@ def resize_image_high_res(
height: int,
) -> Tuple[int, int]:
# Maximum dimensions for high res mode
max_short_side = 768
max_long_side = 2000
max_short_side = MAX_SHORT_SIDE_FOR_IMAGE_HIGH_RES
max_long_side = MAX_LONG_SIDE_FOR_IMAGE_HIGH_RES
# Return early if no resizing is needed
if width <= 768 and height <= 768:
if (
width <= MAX_SHORT_SIDE_FOR_IMAGE_HIGH_RES
and height <= MAX_SHORT_SIDE_FOR_IMAGE_HIGH_RES
):
return width, height
# Determine the longer and shorter sides
@ -132,7 +139,10 @@ def resize_image_high_res(
# Test the function with the given example
def calculate_tiles_needed(
resized_width, resized_height, tile_width=512, tile_height=512
resized_width,
resized_height,
tile_width=MAX_TILE_WIDTH,
tile_height=MAX_TILE_HEIGHT,
):
tiles_across = (resized_width + tile_width - 1) // tile_width
tiles_down = (resized_height + tile_height - 1) // tile_height
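
As a sanity check on the ceiling-division logic above, a standalone sketch (not part of the diff) with the tile constants this PR adds:

MAX_TILE_WIDTH = 512    # values added to litellm/constants.py in this PR
MAX_TILE_HEIGHT = 512

def tiles_needed(resized_width: int, resized_height: int) -> int:
    # integer ceiling division, same as calculate_tiles_needed above
    tiles_across = (resized_width + MAX_TILE_WIDTH - 1) // MAX_TILE_WIDTH
    tiles_down = (resized_height + MAX_TILE_HEIGHT - 1) // MAX_TILE_HEIGHT
    return tiles_across * tiles_down

assert tiles_needed(1000, 1500) == 6   # 2 tiles across * 3 tiles down
assert tiles_needed(512, 512) == 1     # fits exactly in one tile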

View file

@ -5,7 +5,10 @@ from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union, cast
import httpx
import litellm
from litellm.constants import RESPONSE_FORMAT_TOOL_NAME
from litellm.constants import (
DEFAULT_ANTHROPIC_CHAT_MAX_TOKENS,
RESPONSE_FORMAT_TOOL_NAME,
)
from litellm.litellm_core_utils.core_helpers import map_finish_reason
from litellm.litellm_core_utils.prompt_templates.factory import anthropic_messages_pt
from litellm.llms.base_llm.base_utils import type_to_response_format_param
@ -53,7 +56,7 @@ class AnthropicConfig(BaseConfig):
max_tokens: Optional[
int
] = 4096 # anthropic requires a default value (Opus, Sonnet, and Haiku have the same default)
] = DEFAULT_ANTHROPIC_CHAT_MAX_TOKENS # anthropic requires a default value (Opus, Sonnet, and Haiku have the same default)
stop_sequences: Optional[list] = None
temperature: Optional[int] = None
top_p: Optional[int] = None
@ -65,7 +68,7 @@ class AnthropicConfig(BaseConfig):
self,
max_tokens: Optional[
int
] = 4096, # You can pass in a value yourself or use the default value 4096
] = DEFAULT_ANTHROPIC_CHAT_MAX_TOKENS, # You can pass in a value yourself or use the default value 4096
stop_sequences: Optional[list] = None,
temperature: Optional[int] = None,
top_p: Optional[int] = None,

View file

@ -11,6 +11,7 @@ from typing import AsyncIterator, Dict, Iterator, List, Optional, Union
import httpx
import litellm
from litellm.constants import DEFAULT_MAX_TOKENS
from litellm.litellm_core_utils.prompt_templates.factory import (
custom_prompt,
prompt_factory,
@ -65,7 +66,9 @@ class AnthropicTextConfig(BaseConfig):
def __init__(
self,
max_tokens_to_sample: Optional[int] = 256, # anthropic requires a default
max_tokens_to_sample: Optional[
int
] = DEFAULT_MAX_TOKENS, # anthropic requires a default
stop_sequences: Optional[list] = None,
temperature: Optional[int] = None,
top_p: Optional[int] = None,

View file

@ -7,7 +7,7 @@ import httpx # type: ignore
from openai import APITimeoutError, AsyncAzureOpenAI, AzureOpenAI
import litellm
from litellm.constants import DEFAULT_MAX_RETRIES
from litellm.constants import AZURE_OPERATION_POLLING_TIMEOUT, DEFAULT_MAX_RETRIES
from litellm.litellm_core_utils.litellm_logging import Logging as LiteLLMLoggingObj
from litellm.litellm_core_utils.logging_utils import track_llm_api_timing
from litellm.llms.custom_httpx.http_handler import (
@ -857,7 +857,7 @@ class AzureChatCompletion(BaseAzureLLM, BaseLLM):
await response.aread()
timeout_secs: int = 120
timeout_secs: int = AZURE_OPERATION_POLLING_TIMEOUT
start_time = time.time()
if "status" not in response.json():
raise Exception(
@ -955,7 +955,7 @@ class AzureChatCompletion(BaseAzureLLM, BaseLLM):
response.read()
timeout_secs: int = 120
timeout_secs: int = AZURE_OPERATION_POLLING_TIMEOUT
start_time = time.time()
if "status" not in response.json():
raise Exception(

View file

@ -7,6 +7,10 @@ from litellm.litellm_core_utils.prompt_templates.factory import (
convert_to_azure_openai_messages,
)
from litellm.llms.base_llm.chat.transformation import BaseLLMException
from litellm.types.llms.azure import (
API_VERSION_MONTH_SUPPORTED_RESPONSE_FORMAT,
API_VERSION_YEAR_SUPPORTED_RESPONSE_FORMAT,
)
from litellm.types.utils import ModelResponse
from litellm.utils import supports_response_schema
@ -123,7 +127,10 @@ class AzureOpenAIConfig(BaseConfig):
- check if api_version is supported for response_format
"""
is_supported = int(api_version_year) <= 2024 and int(api_version_month) >= 8
is_supported = (
int(api_version_year) <= API_VERSION_YEAR_SUPPORTED_RESPONSE_FORMAT
and int(api_version_month) >= API_VERSION_MONTH_SUPPORTED_RESPONSE_FORMAT
)
return is_supported
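
A worked example of the version gate above. The split of the api_version string into year and month is an assumption made here for illustration; the method in the diff already receives the year and month as separate values.

API_VERSION_YEAR_SUPPORTED_RESPONSE_FORMAT = 2024   # from the new litellm.types.llms.azure module added in this PR
API_VERSION_MONTH_SUPPORTED_RESPONSE_FORMAT = 8

def supports_response_format(api_version: str) -> bool:
    year, month = api_version.split("-")[:2]
    return (
        int(year) <= API_VERSION_YEAR_SUPPORTED_RESPONSE_FORMAT
        and int(month) >= API_VERSION_MONTH_SUPPORTED_RESPONSE_FORMAT
    )

assert supports_response_format("2024-08-01-preview") is True   # 2024 <= 2024 and 8 >= 8
assert supports_response_format("2024-06-01") is False          # month 6 < 8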

View file

@ -9,7 +9,7 @@ from pydantic import BaseModel
from litellm._logging import verbose_logger
from litellm.caching.caching import DualCache
from litellm.constants import BEDROCK_INVOKE_PROVIDERS_LITERAL
from litellm.constants import BEDROCK_INVOKE_PROVIDERS_LITERAL, BEDROCK_MAX_POLICY_SIZE
from litellm.litellm_core_utils.dd_tracing import tracer
from litellm.secret_managers.main import get_secret
@ -381,7 +381,7 @@ class BaseAWSLLM:
"region_name": aws_region_name,
}
if sts_response["PackedPolicySize"] > 75:
if sts_response["PackedPolicySize"] > BEDROCK_MAX_POLICY_SIZE:
verbose_logger.warning(
f"The policy size is greater than 75% of the allowed size, PackedPolicySize: {sts_response['PackedPolicySize']}"
)

View file

@ -1,6 +1,7 @@
from typing import Optional, Tuple, Union
import litellm
from litellm.constants import MIN_NON_ZERO_TEMPERATURE
from litellm.llms.openai.chat.gpt_transformation import OpenAIGPTConfig
from litellm.secret_managers.main import get_secret_str
@ -84,7 +85,7 @@ class DeepInfraConfig(OpenAIGPTConfig):
and value == 0
and model == "mistralai/Mistral-7B-Instruct-v0.1"
): # this model does no support temperature == 0
value = 0.0001 # close to 0
value = MIN_NON_ZERO_TEMPERATURE # close to 0
if param == "tool_choice":
if (
value != "auto" and value != "none"

View file

@ -4,6 +4,12 @@ For calculating cost of fireworks ai serverless inference models.
from typing import Tuple
from litellm.constants import (
FIREWORKS_AI_16_B,
FIREWORKS_AI_56_B_MOE,
FIREWORKS_AI_80_B,
FIREWORKS_AI_176_B_MOE,
)
from litellm.types.utils import Usage
from litellm.utils import get_model_info
@ -25,9 +31,9 @@ def get_base_model_for_pricing(model_name: str) -> str:
moe_match = re.search(r"(\d+)x(\d+)b", model_name)
if moe_match:
total_billion = int(moe_match.group(1)) * int(moe_match.group(2))
if total_billion <= 56:
if total_billion <= FIREWORKS_AI_56_B_MOE:
return "fireworks-ai-moe-up-to-56b"
elif total_billion <= 176:
elif total_billion <= FIREWORKS_AI_176_B_MOE:
return "fireworks-ai-56b-to-176b"
# Check for standard models in the form <number>b
@ -37,9 +43,9 @@ def get_base_model_for_pricing(model_name: str) -> str:
params_billion = float(params_match)
# Determine the category based on the number of parameters
if params_billion <= 16.0:
if params_billion <= FIREWORKS_AI_16_B:
return "fireworks-ai-up-to-16b"
elif params_billion <= 80.0:
elif params_billion <= FIREWORKS_AI_80_B:
return "fireworks-ai-16b-80b"
# If no matches, return the original model_name
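
For the MoE branch above, a short standalone sketch of how a model name maps to a pricing bucket (the regex and thresholds are taken from the diff; the wrapper function is illustrative):

import re
from typing import Optional

FIREWORKS_AI_56_B_MOE = 56     # thresholds added to litellm/constants.py in this PR
FIREWORKS_AI_176_B_MOE = 176

def moe_pricing_bucket(model_name: str) -> Optional[str]:
    moe_match = re.search(r"(\d+)x(\d+)b", model_name)
    if not moe_match:
        return None
    # "8x7b" -> 8 * 7 = 56 total billion parameters
    total_billion = int(moe_match.group(1)) * int(moe_match.group(2))
    if total_billion <= FIREWORKS_AI_56_B_MOE:
        return "fireworks-ai-moe-up-to-56b"
    elif total_billion <= FIREWORKS_AI_176_B_MOE:
        return "fireworks-ai-56b-to-176b"
    return None

assert moe_pricing_bucket("accounts/fireworks/models/mixtral-8x7b-instruct") == "fireworks-ai-moe-up-to-56b"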

View file

@ -2,6 +2,7 @@ from typing import TYPE_CHECKING, Any, List, Literal, Optional, Union
from httpx import Headers, Response
from litellm.constants import DEFAULT_MAX_TOKENS
from litellm.llms.base_llm.chat.transformation import BaseConfig, BaseLLMException
from litellm.types.llms.openai import AllMessageValues
from litellm.types.utils import ModelResponse
@ -27,7 +28,7 @@ class PredibaseConfig(BaseConfig):
decoder_input_details: Optional[bool] = None
details: bool = True # enables returning logprobs + best of
max_new_tokens: int = (
256 # openai default - requests hang if max_new_tokens not given
DEFAULT_MAX_TOKENS # openai default - requests hang if max_new_tokens not given
)
repetition_penalty: Optional[float] = None
return_full_text: Optional[

View file

@ -4,6 +4,7 @@ import time
from typing import Callable, List, Union
import litellm
from litellm.constants import REPLICATE_POLLING_DELAY_SECONDS
from litellm.llms.custom_httpx.http_handler import (
AsyncHTTPHandler,
HTTPHandler,
@ -28,7 +29,9 @@ def handle_prediction_response_streaming(
status = ""
while True and (status not in ["succeeded", "failed", "canceled"]):
time.sleep(0.5) # prevent being rate limited by replicate
time.sleep(
REPLICATE_POLLING_DELAY_SECONDS
) # prevent being rate limited by replicate
print_verbose(f"replicate: polling endpoint: {prediction_url}")
response = http_client.get(prediction_url, headers=headers)
if response.status_code == 200:
@ -77,7 +80,9 @@ async def async_handle_prediction_response_streaming(
status = ""
while True and (status not in ["succeeded", "failed", "canceled"]):
await asyncio.sleep(0.5) # prevent being rate limited by replicate
await asyncio.sleep(
REPLICATE_POLLING_DELAY_SECONDS
) # prevent being rate limited by replicate
print_verbose(f"replicate: polling endpoint: {prediction_url}")
response = await http_client.get(prediction_url, headers=headers)
if response.status_code == 200:

View file

@ -3,6 +3,7 @@ from typing import TYPE_CHECKING, Any, List, Optional, Union
import httpx
import litellm
from litellm.constants import REPLICATE_MODEL_NAME_WITH_ID_LENGTH
from litellm.litellm_core_utils.prompt_templates.common_utils import (
convert_content_list_to_str,
)
@ -221,10 +222,11 @@ class ReplicateConfig(BaseConfig):
version_id = self.model_to_version_id(model)
request_data: dict = {"input": input_data}
if ":" in version_id and len(version_id) > 64:
if ":" in version_id and len(version_id) > REPLICATE_MODEL_NAME_WITH_ID_LENGTH:
model_parts = version_id.split(":")
if (
len(model_parts) > 1 and len(model_parts[1]) == 64
len(model_parts) > 1
and len(model_parts[1]) == REPLICATE_MODEL_NAME_WITH_ID_LENGTH
): ## checks if model name has a 64 digit code - e.g. "meta/llama-2-70b-chat:02e509c789964a7ea8736978a43525956ef40397be9033abf9fd2badfe68c9e3"
request_data["version"] = model_parts[1]

View file

@ -4,6 +4,16 @@ Handles calculating cost for together ai models
import re
from litellm.constants import (
TOGETHER_AI_4_B,
TOGETHER_AI_8_B,
TOGETHER_AI_21_B,
TOGETHER_AI_41_B,
TOGETHER_AI_80_B,
TOGETHER_AI_110_B,
TOGETHER_AI_EMBEDDING_150_M,
TOGETHER_AI_EMBEDDING_350_M,
)
from litellm.types.utils import CallTypes
@ -31,17 +41,17 @@ def get_model_params_and_category(model_name, call_type: CallTypes) -> str:
else:
return model_name
# Determine the category based on the number of parameters
if params_billion <= 4.0:
if params_billion <= TOGETHER_AI_4_B:
category = "together-ai-up-to-4b"
elif params_billion <= 8.0:
elif params_billion <= TOGETHER_AI_8_B:
category = "together-ai-4.1b-8b"
elif params_billion <= 21.0:
elif params_billion <= TOGETHER_AI_21_B:
category = "together-ai-8.1b-21b"
elif params_billion <= 41.0:
elif params_billion <= TOGETHER_AI_41_B:
category = "together-ai-21.1b-41b"
elif params_billion <= 80.0:
elif params_billion <= TOGETHER_AI_80_B:
category = "together-ai-41.1b-80b"
elif params_billion <= 110.0:
elif params_billion <= TOGETHER_AI_110_B:
category = "together-ai-81.1b-110b"
if category is not None:
return category
@ -69,9 +79,9 @@ def get_model_params_and_category_embeddings(model_name) -> str:
else:
return model_name
# Determine the category based on the number of parameters
if params_million <= 150:
if params_million <= TOGETHER_AI_EMBEDDING_150_M:
category = "together-ai-embedding-up-to-150m"
elif params_million <= 350:
elif params_million <= TOGETHER_AI_EMBEDDING_350_M:
category = "together-ai-embedding-151m-to-350m"
if category is not None:
return category

View file

@ -7,6 +7,7 @@ from typing import Any, AsyncIterator, Dict, Iterator, List, Literal, Optional,
from httpx import Headers, Response
from litellm.constants import DEFAULT_MAX_TOKENS_FOR_TRITON
from litellm.litellm_core_utils.prompt_templates.factory import prompt_factory
from litellm.llms.base_llm.base_model_iterator import BaseModelResponseIterator
from litellm.llms.base_llm.chat.transformation import (
@ -196,7 +197,9 @@ class TritonGenerateConfig(TritonConfig):
data_for_triton: Dict[str, Any] = {
"text_input": prompt_factory(model=model, messages=messages),
"parameters": {
"max_tokens": int(optional_params.get("max_tokens", 2000)),
"max_tokens": int(
optional_params.get("max_tokens", DEFAULT_MAX_TOKENS_FOR_TRITON)
),
"bad_words": [""],
"stop_words": [""],
},

View file

@ -51,6 +51,10 @@ from litellm import ( # type: ignore
get_litellm_params,
get_optional_params,
)
from litellm.constants import (
DEFAULT_MOCK_RESPONSE_COMPLETION_TOKEN_COUNT,
DEFAULT_MOCK_RESPONSE_PROMPT_TOKEN_COUNT,
)
from litellm.exceptions import LiteLLMUnknownProvider
from litellm.integrations.custom_logger import CustomLogger
from litellm.litellm_core_utils.audio_utils.utils import get_audio_file_for_health_check
@ -740,7 +744,12 @@ def mock_completion(
setattr(
model_response,
"usage",
Usage(prompt_tokens=10, completion_tokens=20, total_tokens=30),
Usage(
prompt_tokens=DEFAULT_MOCK_RESPONSE_PROMPT_TOKEN_COUNT,
completion_tokens=DEFAULT_MOCK_RESPONSE_COMPLETION_TOKEN_COUNT,
total_tokens=DEFAULT_MOCK_RESPONSE_PROMPT_TOKEN_COUNT
+ DEFAULT_MOCK_RESPONSE_COMPLETION_TOKEN_COUNT,
),
)
try:
@ -3067,7 +3076,7 @@ def completion( # type: ignore # noqa: PLR0915
"max_tokens": max_tokens,
"temperature": temperature,
"top_p": top_p,
"top_k": kwargs.get("top_k", 40),
"top_k": kwargs.get("top_k"),
},
},
)

View file

@ -20,6 +20,7 @@ import litellm
from litellm._logging import verbose_proxy_logger
from litellm.caching.caching import DualCache
from litellm.caching.dual_cache import LimitedSizeOrderedDict
from litellm.constants import DEFAULT_IN_MEMORY_TTL
from litellm.litellm_core_utils.get_llm_provider_logic import get_llm_provider
from litellm.proxy._types import (
RBAC_ROLES,
@ -55,7 +56,7 @@ else:
last_db_access_time = LimitedSizeOrderedDict(max_size=100)
db_cache_expiry = 5 # refresh every 5s
db_cache_expiry = DEFAULT_IN_MEMORY_TTL # refresh every 5s
all_routes = LiteLLMRoutes.openai_routes.value + LiteLLMRoutes.management_routes.value

View file

@ -9,6 +9,7 @@ from typing import Optional
import httpx
from litellm._logging import verbose_proxy_logger
from litellm.constants import NON_LLM_CONNECTION_TIMEOUT
from litellm.llms.custom_httpx.http_handler import HTTPHandler
@ -23,7 +24,7 @@ class LicenseCheck:
def __init__(self) -> None:
self.license_str = os.getenv("LITELLM_LICENSE", None)
verbose_proxy_logger.debug("License Str value - {}".format(self.license_str))
self.http_handler = HTTPHandler(timeout=15)
self.http_handler = HTTPHandler(timeout=NON_LLM_CONNECTION_TIMEOUT)
self.public_key = None
self.read_public_key()

View file

@ -15,6 +15,7 @@ from fastapi import HTTPException
import litellm
from litellm._logging import verbose_proxy_logger
from litellm.caching.caching import DualCache
from litellm.constants import DEFAULT_PROMPT_INJECTION_SIMILARITY_THRESHOLD
from litellm.integrations.custom_logger import CustomLogger
from litellm.litellm_core_utils.prompt_templates.factory import (
prompt_injection_detection_default_pt,
@ -110,7 +111,9 @@ class _OPTIONAL_PromptInjectionDetection(CustomLogger):
return combinations
def check_user_input_similarity(
self, user_input: str, similarity_threshold: float = 0.7
self,
user_input: str,
similarity_threshold: float = DEFAULT_PROMPT_INJECTION_SIMILARITY_THRESHOLD,
) -> bool:
user_input_lower = user_input.lower()
keywords = self.generate_injection_keywords()

View file

@ -24,7 +24,7 @@ from fastapi import APIRouter, Depends, Header, HTTPException, Query, Request, s
import litellm
from litellm._logging import verbose_proxy_logger
from litellm.caching import DualCache
from litellm.constants import UI_SESSION_TOKEN_TEAM_ID
from litellm.constants import LENGTH_OF_LITELLM_GENERATED_KEY, UI_SESSION_TOKEN_TEAM_ID
from litellm.litellm_core_utils.duration_parser import duration_in_seconds
from litellm.proxy._types import *
from litellm.proxy.auth.auth_checks import (
@ -1164,7 +1164,7 @@ async def generate_key_helper_fn( # noqa: PLR0915
if key is not None:
token = key
else:
token = f"sk-{secrets.token_urlsafe(16)}"
token = f"sk-{secrets.token_urlsafe(LENGTH_OF_LITELLM_GENERATED_KEY)}"
if duration is None: # allow tokens that never expire
expires = None
@ -1745,7 +1745,7 @@ async def regenerate_key_fn(
verbose_proxy_logger.debug("key_in_db: %s", _key_in_db)
new_token = f"sk-{secrets.token_urlsafe(16)}"
new_token = f"sk-{secrets.token_urlsafe(LENGTH_OF_LITELLM_GENERATED_KEY)}"
new_token_hash = hash_token(new_token)
new_token_key_name = f"sk-...{new_token[-4:]}"
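
For reference, the key length constant feeds secrets.token_urlsafe, which draws that many random bytes and base64url-encodes them, so generated keys keep their familiar shape (a small illustrative snippet, not part of the diff):

import secrets

LENGTH_OF_LITELLM_GENERATED_KEY = 16   # value added to litellm/constants.py in this PR

token = f"sk-{secrets.token_urlsafe(LENGTH_OF_LITELLM_GENERATED_KEY)}"
# 16 random bytes -> 22 URL-safe characters, plus the "sk-" prefix
assert len(token) == 25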

View file

@ -15,6 +15,10 @@ from litellm.litellm_core_utils.litellm_logging import (
)
from litellm.litellm_core_utils.thread_pool_executor import executor
from litellm.proxy.pass_through_endpoints.types import PassthroughStandardLoggingPayload
from litellm.types.passthrough_endpoints.assembly_ai import (
ASSEMBLY_AI_MAX_POLLING_ATTEMPTS,
ASSEMBLY_AI_POLLING_INTERVAL,
)
class AssemblyAITranscriptResponse(TypedDict, total=False):
@ -34,13 +38,13 @@ class AssemblyAIPassthroughLoggingHandler:
The base URL for the AssemblyAI API
"""
self.polling_interval: float = 10
self.polling_interval: float = ASSEMBLY_AI_POLLING_INTERVAL
"""
The polling interval for the AssemblyAI API.
litellm needs to poll the GET /transcript/{transcript_id} endpoint to get the status of the transcript.
"""
self.max_polling_attempts = 180
self.max_polling_attempts = ASSEMBLY_AI_MAX_POLLING_ATTEMPTS
"""
The maximum number of polling attempts for the AssemblyAI API.
"""

View file

@ -25,7 +25,10 @@ from typing import (
get_type_hints,
)
from litellm.constants import DEFAULT_MAX_RECURSE_DEPTH
from litellm.constants import (
DEFAULT_MAX_RECURSE_DEPTH,
DEFAULT_SLACK_ALERTING_THRESHOLD,
)
from litellm.types.utils import (
ModelResponse,
ModelResponseStream,
@ -118,7 +121,16 @@ import litellm
from litellm import Router
from litellm._logging import verbose_proxy_logger, verbose_router_logger
from litellm.caching.caching import DualCache, RedisCache
from litellm.constants import LITELLM_PROXY_ADMIN_NAME
from litellm.constants import (
DAYS_IN_A_MONTH,
DEFAULT_HEALTH_CHECK_INTERVAL,
DEFAULT_MODEL_CREATED_AT_TIME,
LITELLM_PROXY_ADMIN_NAME,
PROMETHEUS_FALLBACK_STATS_SEND_TIME_HOURS,
PROXY_BATCH_WRITE_AT,
PROXY_BUDGET_RESCHEDULER_MAX_TIME,
PROXY_BUDGET_RESCHEDULER_MIN_TIME,
)
from litellm.exceptions import RejectedRequestError
from litellm.integrations.SlackAlerting.slack_alerting import SlackAlerting
from litellm.litellm_core_utils.core_helpers import (
@ -287,7 +299,7 @@ from litellm.router import (
LiteLLM_Params,
ModelGroupInfo,
)
from litellm.scheduler import DefaultPriorities, FlowItem, Scheduler
from litellm.scheduler import FlowItem, Scheduler
from litellm.secret_managers.aws_secret_manager import load_aws_kms
from litellm.secret_managers.google_kms import load_google_kms
from litellm.secret_managers.main import (
@ -307,6 +319,7 @@ from litellm.types.llms.openai import HttpxBinaryResponseContent
from litellm.types.router import DeploymentTypedDict
from litellm.types.router import ModelInfo as RouterModelInfo
from litellm.types.router import RouterGeneralSettings, updateDeployment
from litellm.types.scheduler import DefaultPriorities
from litellm.types.utils import CredentialItem, CustomHuggingfaceTokenizer
from litellm.types.utils import ModelInfo as ModelMapInfo
from litellm.types.utils import RawRequestTypedDict, StandardLoggingPayload
@ -779,9 +792,9 @@ queue: List = []
litellm_proxy_budget_name = "litellm-proxy-budget"
litellm_proxy_admin_name = LITELLM_PROXY_ADMIN_NAME
ui_access_mode: Literal["admin", "all"] = "all"
proxy_budget_rescheduler_min_time = 597
proxy_budget_rescheduler_max_time = 605
proxy_batch_write_at = 10 # in seconds
proxy_budget_rescheduler_min_time = PROXY_BUDGET_RESCHEDULER_MIN_TIME
proxy_budget_rescheduler_max_time = PROXY_BUDGET_RESCHEDULER_MAX_TIME
proxy_batch_write_at = PROXY_BATCH_WRITE_AT
litellm_master_key_hash = None
disable_spend_logs = False
jwt_handler = JWTHandler()
@ -1846,7 +1859,9 @@ class ProxyConfig:
use_background_health_checks = general_settings.get(
"background_health_checks", False
)
health_check_interval = general_settings.get("health_check_interval", 300)
health_check_interval = general_settings.get(
"health_check_interval", DEFAULT_HEALTH_CHECK_INTERVAL
)
health_check_details = general_settings.get("health_check_details", True)
### RBAC ###
@ -3145,7 +3160,7 @@ class ProxyStartupEvent:
scheduler.add_job(
proxy_logging_obj.slack_alerting_instance.send_fallback_stats_from_prometheus,
"cron",
hour=9,
hour=PROMETHEUS_FALLBACK_STATS_SEND_TIME_HOURS,
minute=0,
timezone=ZoneInfo("America/Los_Angeles"), # Pacific Time
)
@ -3278,7 +3293,7 @@ async def model_list(
{
"id": model,
"object": "model",
"created": 1677610602,
"created": DEFAULT_MODEL_CREATED_AT_TIME,
"owned_by": "openai",
}
for model in all_models
@ -5592,7 +5607,7 @@ async def model_metrics(
param="None",
code=status.HTTP_500_INTERNAL_SERVER_ERROR,
)
startTime = startTime or datetime.now() - timedelta(days=30)
startTime = startTime or datetime.now() - timedelta(days=DAYS_IN_A_MONTH)
endTime = endTime or datetime.now()
if api_key is None or api_key == "undefined":
@ -5713,11 +5728,12 @@ async def model_metrics_slow_responses(
if customer is None or customer == "undefined":
customer = "null"
startTime = startTime or datetime.now() - timedelta(days=30)
startTime = startTime or datetime.now() - timedelta(days=DAYS_IN_A_MONTH)
endTime = endTime or datetime.now()
alerting_threshold = (
proxy_logging_obj.slack_alerting_instance.alerting_threshold or 300
proxy_logging_obj.slack_alerting_instance.alerting_threshold
or DEFAULT_SLACK_ALERTING_THRESHOLD
)
alerting_threshold = int(alerting_threshold)
@ -5797,7 +5813,7 @@ async def model_metrics_exceptions(
code=status.HTTP_500_INTERNAL_SERVER_ERROR,
)
startTime = startTime or datetime.now() - timedelta(days=30)
startTime = startTime or datetime.now() - timedelta(days=DAYS_IN_A_MONTH)
endTime = endTime or datetime.now()
if api_key is None or api_key == "undefined":

View file

@ -22,6 +22,7 @@ from typing import (
overload,
)
from litellm.constants import MAX_TEAM_LIST_LIMIT
from litellm.proxy._types import (
DB_CONNECTION_ERROR_TYPES,
CommonProxyErrors,
@ -1596,7 +1597,9 @@ class PrismaClient:
where={"team_id": {"in": team_id_list}}
)
elif query_type == "find_all" and team_id_list is None:
response = await self.db.litellm_teamtable.find_many(take=20)
response = await self.db.litellm_teamtable.find_many(
take=MAX_TEAM_LIST_LIMIT
)
return response
elif table_name == "user_notification":
if query_type == "find_unique":

View file

@ -50,6 +50,7 @@ from litellm.caching.caching import (
RedisCache,
RedisClusterCache,
)
from litellm.constants import DEFAULT_MAX_LRU_CACHE_SIZE
from litellm.integrations.custom_logger import CustomLogger
from litellm.litellm_core_utils.asyncify import run_async_function
from litellm.litellm_core_utils.core_helpers import _get_parent_otel_span_from_kwargs
@ -5073,7 +5074,7 @@ class Router:
rpm_usage += t
return tpm_usage, rpm_usage
@lru_cache(maxsize=64)
@lru_cache(maxsize=DEFAULT_MAX_LRU_CACHE_SIZE)
def _cached_get_model_group_info(
self, model_group: str
) -> Optional[ModelGroupInfo]:

View file

@ -1,6 +1,7 @@
from typing import TYPE_CHECKING, Any, Optional, Union
from litellm._logging import verbose_router_logger
from litellm.constants import MAX_EXCEPTION_MESSAGE_LENGTH
from litellm.router_utils.cooldown_handlers import (
_async_get_cooldown_deployments_with_debug_info,
)
@ -54,7 +55,7 @@ async def send_llm_exception_alert(
exception_str = str(original_exception)
if litellm_debug_info is not None:
exception_str += litellm_debug_info
exception_str += f"\n\n{error_traceback_str[:2000]}"
exception_str += f"\n\n{error_traceback_str[:MAX_EXCEPTION_MESSAGE_LENGTH]}"
await litellm_router_instance.slack_alerting_logger.send_alert(
message=f"LLM API call failed: `{exception_str}`",

View file

@ -6,17 +6,14 @@ from pydantic import BaseModel
from litellm import print_verbose
from litellm.caching.caching import DualCache, RedisCache
from litellm.constants import DEFAULT_IN_MEMORY_TTL, DEFAULT_POLLING_INTERVAL
class SchedulerCacheKeys(enum.Enum):
queue = "scheduler:queue"
default_in_memory_ttl = 5 # cache queue in-memory for 5s when redis cache available
class DefaultPriorities(enum.Enum):
High = 0
Medium = 128
Low = 255
default_in_memory_ttl = (
DEFAULT_IN_MEMORY_TTL # cache queue in-memory for 5s when redis cache available
)
class FlowItem(BaseModel):
@ -44,7 +41,9 @@ class Scheduler:
self.cache = DualCache(
redis_cache=redis_cache, default_in_memory_ttl=default_in_memory_ttl
)
self.polling_interval = polling_interval or 0.03 # default to 3ms
self.polling_interval = (
polling_interval or DEFAULT_POLLING_INTERVAL
) # default to 3ms
async def add_request(self, request: FlowItem):
# We use the priority directly, as lower values indicate higher priority

View file

@ -5,6 +5,7 @@ from typing import Optional
import litellm
from litellm._logging import verbose_logger
from litellm.caching.caching import InMemoryCache
from litellm.constants import SECRET_MANAGER_REFRESH_INTERVAL
from litellm.integrations.gcs_bucket.gcs_bucket_base import GCSBucketBase
from litellm.llms.custom_httpx.http_handler import _get_httpx_client
from litellm.proxy._types import CommonProxyErrors, KeyManagementSystem
@ -13,7 +14,7 @@ from litellm.proxy._types import CommonProxyErrors, KeyManagementSystem
class GoogleSecretManager(GCSBucketBase):
def __init__(
self,
refresh_interval: Optional[int] = 86400,
refresh_interval: Optional[int] = SECRET_MANAGER_REFRESH_INTERVAL,
always_read_secret_manager: Optional[bool] = False,
) -> None:
"""

View file

@ -6,6 +6,7 @@ import httpx
import litellm
from litellm._logging import verbose_logger
from litellm.caching import InMemoryCache
from litellm.constants import SECRET_MANAGER_REFRESH_INTERVAL
from litellm.llms.custom_httpx.http_handler import (
_get_httpx_client,
get_async_httpx_client,
@ -39,8 +40,14 @@ class HashicorpSecretManager(BaseSecretManager):
litellm.secret_manager_client = self
litellm._key_management_system = KeyManagementSystem.HASHICORP_VAULT
_refresh_interval = os.environ.get("HCP_VAULT_REFRESH_INTERVAL", 86400)
_refresh_interval = int(_refresh_interval) if _refresh_interval else 86400
_refresh_interval = os.environ.get(
"HCP_VAULT_REFRESH_INTERVAL", SECRET_MANAGER_REFRESH_INTERVAL
)
_refresh_interval = (
int(_refresh_interval)
if _refresh_interval
else SECRET_MANAGER_REFRESH_INTERVAL
)
self.cache = InMemoryCache(
default_ttl=_refresh_interval
) # store in memory for 1 day

View file

@ -1,6 +1,8 @@
from enum import Enum
from typing import Optional, TypedDict
DD_MAX_BATCH_SIZE = 1000
class DataDogStatus(str, Enum):
INFO = "info"

View file

@ -8,6 +8,10 @@ else:
VertexBase = Any
GCS_DEFAULT_BATCH_SIZE = 2048
GCS_DEFAULT_FLUSH_INTERVAL_SECONDS = 20
class GCSLoggingConfig(TypedDict):
"""
Internal LiteLLM Config for GCS Bucket logging

View file

@ -7,6 +7,9 @@ from pydantic import BaseModel, Field
from litellm.types.utils import LiteLLMPydanticObjectBase
SLACK_ALERTING_THRESHOLD_5_PERCENT = 0.05
SLACK_ALERTING_THRESHOLD_15_PERCENT = 0.15
class BaseOutageModel(TypedDict):
alerts: List[int]

View file

@ -0,0 +1,2 @@
API_VERSION_YEAR_SUPPORTED_RESPONSE_FORMAT = 2024
API_VERSION_MONTH_SUPPORTED_RESPONSE_FORMAT = 8

View file

@ -0,0 +1 @@

View file

@ -0,0 +1,2 @@
ASSEMBLY_AI_POLLING_INTERVAL = 10
ASSEMBLY_AI_MAX_POLLING_ATTEMPTS = 180

View file

@ -0,0 +1,7 @@
from enum import Enum
class DefaultPriorities(Enum):
High = 0
Medium = 128
Low = 255

View file

@ -62,6 +62,16 @@ import litellm.llms.gemini
from litellm.caching._internal_lru_cache import lru_cache_wrapper
from litellm.caching.caching import DualCache
from litellm.caching.caching_handler import CachingHandlerResponse, LLMCachingHandler
from litellm.constants import (
DEFAULT_MAX_LRU_CACHE_SIZE,
DEFAULT_TRIM_RATIO,
FUNCTION_DEFINITION_TOKEN_COUNT,
INITIAL_RETRY_DELAY,
JITTER,
MAX_RETRY_DELAY,
MINIMUM_PROMPT_CACHE_TOKEN_COUNT,
TOOL_CHOICE_OBJECT_TOKEN_COUNT,
)
from litellm.integrations.custom_guardrail import CustomGuardrail
from litellm.integrations.custom_logger import CustomLogger
from litellm.litellm_core_utils.core_helpers import (
@ -1520,7 +1530,7 @@ def _select_tokenizer(
return _select_tokenizer_helper(model=model)
@lru_cache(maxsize=128)
@lru_cache(maxsize=DEFAULT_MAX_LRU_CACHE_SIZE)
def _select_tokenizer_helper(model: str) -> SelectTokenizerResponse:
if litellm.disable_hf_tokenizer_download is True:
return _return_openai_tokenizer(model)
@ -5336,15 +5346,15 @@ def _calculate_retry_after(
if retry_after is not None and 0 < retry_after <= 60:
return retry_after
initial_retry_delay = 0.5
max_retry_delay = 8.0
initial_retry_delay = INITIAL_RETRY_DELAY
max_retry_delay = MAX_RETRY_DELAY
nb_retries = max_retries - remaining_retries
# Apply exponential backoff, but not more than the max.
sleep_seconds = min(initial_retry_delay * pow(2.0, nb_retries), max_retry_delay)
# Apply some jitter, plus-or-minus half a second.
jitter = 1 - 0.25 * random.random()
jitter = JITTER * random.random()
timeout = sleep_seconds * jitter
return timeout if timeout >= min_timeout else min_timeout
@ -5670,7 +5680,7 @@ def shorten_message_to_fit_limit(message, tokens_needed, model: Optional[str]):
def trim_messages(
messages,
model: Optional[str] = None,
trim_ratio: float = 0.75,
trim_ratio: float = DEFAULT_TRIM_RATIO,
return_response_tokens: bool = False,
max_tokens=None,
):
@ -6543,7 +6553,7 @@ def is_prompt_caching_valid_prompt(
model=model,
use_default_image_token_count=True,
)
return token_count >= 1024
return token_count >= MINIMUM_PROMPT_CACHE_TOKEN_COUNT
except Exception as e:
verbose_logger.error(f"Error in is_prompt_caching_valid_prompt: {e}")
return False
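
A standalone sketch of the backoff arithmetic after this change, using the constants added in this PR (the real _calculate_retry_after additionally honors a provided retry_after value and a minimum timeout, omitted here):

import random

INITIAL_RETRY_DELAY = 0.5   # values added to litellm/constants.py in this PR
MAX_RETRY_DELAY = 8.0
JITTER = 0.75

def backoff_delay(nb_retries: int) -> float:
    # exponential backoff capped at MAX_RETRY_DELAY, then scaled by a random jitter factor
    sleep_seconds = min(INITIAL_RETRY_DELAY * pow(2.0, nb_retries), MAX_RETRY_DELAY)
    return sleep_seconds * (JITTER * random.random())

# capped base delays before jitter: 0.5, 1.0, 2.0, 4.0, 8.0, 8.0, ...
print([min(INITIAL_RETRY_DELAY * 2.0**n, MAX_RETRY_DELAY) for n in range(6)])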

View file

@ -3,6 +3,7 @@ warn_return_any = False
ignore_missing_imports = True
mypy_path = litellm/stubs
namespace_packages = True
disable_error_code = valid-type
[mypy-google.*]
ignore_missing_imports = True

View file

@ -0,0 +1,152 @@
import sys
import ast
import os
# Extremely restrictive set of allowed numbers
ALLOWED_NUMBERS = {
0,
1,
-1,
2,
10,
100,
1000,
4,
3,
500,
6,
60,
3600,
0.75,
7,
1024,
1011,
600,
12,
1000000000.0,
0.1,
50,
128,
6000,
30,
1000000,
5,
15,
25,
10000,
60000,
8,
2048,
16000000000,
16,
16383,
14,
24,
128000,
0.01,
20,
}
# Add all standard HTTP status codes
HTTP_STATUS_CODES = {
200, # OK
201, # Created
202, # Accepted
204, # No Content
300, # Multiple Choices
301, # Moved Permanently
302, # Found
303, # See Other
304, # Not Modified
307, # Temporary Redirect
308, # Permanent Redirect
400, # Bad Request
401, # Unauthorized
402, # Payment Required
403, # Forbidden
404, # Not Found
406, # Not Acceptable
408, # Request Timeout
409, # Conflict
413, # Payload Too Large
422, # Unprocessable Entity
424, # Failed Dependency
429, # Too Many Requests
498, # Invalid Token
499, # Client Closed Request
500, # Internal Server Error
501, # Not Implemented
502, # Bad Gateway
503, # Service Unavailable
504, # Gateway Timeout
520, # Web server is returning an unknown error
522, # Connection timed out
524, # A timeout occurred
529, # Site is overloaded
}
# Combine the sets
ALLOWED_NUMBERS = ALLOWED_NUMBERS.union(HTTP_STATUS_CODES)
class HardcodedNumberFinder(ast.NodeVisitor):
def __init__(self):
self.hardcoded_numbers = []
def visit_Constant(self, node):
# For Python 3.8+
if isinstance(node.value, (int, float)) and node.value not in ALLOWED_NUMBERS:
self.hardcoded_numbers.append((node.lineno, node.value))
self.generic_visit(node)
def visit_Num(self, node):
# For older Python versions
if node.n not in ALLOWED_NUMBERS:
self.hardcoded_numbers.append((node.lineno, node.n))
self.generic_visit(node)
def check_file(filename):
try:
with open(filename, "r") as f:
content = f.read()
tree = ast.parse(content)
finder = HardcodedNumberFinder()
finder.visit(tree)
if finder.hardcoded_numbers:
print(f"ERROR in {filename}: Hardcoded numbers detected:")
for line, value in finder.hardcoded_numbers:
print(f" Line {line}: {value}")
return 1
return 0
except SyntaxError:
print(f"Syntax error in {filename}")
return 0
def main():
exit_code = 0
folder = "../../litellm"
ignore_files = [
"constants.py",
"proxy_cli.py",
"token_counter.py",
"mock_functions.py",
"duration_parser.py",
"utils.py",
]
ignore_folder = "types"
for root, dirs, files in os.walk(folder):
for filename in files:
if filename.endswith(".py") and filename not in ignore_files:
full_path = os.path.join(root, filename)
if ignore_folder in full_path:
continue
exit_code |= check_file(full_path)
sys.exit(exit_code)
if __name__ == "__main__":
main()
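
A small demonstration of the visitor in action; the import path is hypothetical since the script's filename is not shown in this diff:

import ast
# hypothetical module name for the new script shown above
from check_for_hardcoded_numbers import HardcodedNumberFinder

snippet = "timeout = 120\nretries = 3\n"
finder = HardcodedNumberFinder()
finder.visit(ast.parse(snippet))
print(finder.hardcoded_numbers)   # [(1, 120)] -- 120 is not allowed, 3 is in ALLOWED_NUMBERS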

View file