Mirror of https://github.com/BerriAI/litellm.git (synced 2025-04-24 18:24:20 +00:00)
Squashed commit of the following: (#9709)
commit b12a9892b7
Author: Krrish Dholakia <krrishdholakia@gmail.com>
Date:   Wed Apr 2 08:09:56 2025 -0700

    fix(utils.py): don't modify openai_token_counter

commit 294de31803
Author: Krrish Dholakia <krrishdholakia@gmail.com>
Date:   Mon Mar 24 21:22:40 2025 -0700

    fix: fix linting error

commit cb6e9fbe40
Author: Krrish Dholakia <krrishdholakia@gmail.com>
Date:   Mon Mar 24 19:52:45 2025 -0700

    refactor: complete migration

commit bfc159172d
Author: Krrish Dholakia <krrishdholakia@gmail.com>
Date:   Mon Mar 24 19:09:59 2025 -0700

    refactor: refactor more constants

commit 43ffb6a558
Author: Krrish Dholakia <krrishdholakia@gmail.com>
Date:   Mon Mar 24 18:45:24 2025 -0700

    fix: test

commit 04dbe4310c
Author: Krrish Dholakia <krrishdholakia@gmail.com>
Date:   Mon Mar 24 18:28:58 2025 -0700

    refactor: refactor: move more constants into constants.py

commit 3c26284aff
Author: Krrish Dholakia <krrishdholakia@gmail.com>
Date:   Mon Mar 24 18:14:46 2025 -0700

    refactor: migrate hardcoded constants out of __init__.py

commit c11e0de69d
Author: Krrish Dholakia <krrishdholakia@gmail.com>
Date:   Mon Mar 24 18:11:21 2025 -0700

    build: initial test banning hardcoded numbers in repo
parent 5a722ef18f
commit 8ee32291e0
51 changed files with 509 additions and 118 deletions
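Taken together, the series is a mechanical refactor: numeric literals scattered across the codebase are collected as named constants in litellm/constants.py and imported at each call site, and the first commit adds an initial check intended to discourage new hardcoded numbers. Below is a minimal, self-contained sketch of the pattern; the constant names mirror values added in this diff, but the module layout and the helper function are illustrative, not code from the commit. The diff that follows applies the same pattern file by file.

    # Sketch only: in the repo these names live in litellm/constants.py and are
    # imported where the magic numbers used to be inlined.
    DAYS_IN_A_WEEK = 7
    DAYS_IN_A_MONTH = 28
    DAYS_IN_A_YEAR = 365
    HOURS_IN_A_DAY = 24


    def budget_duration_in_seconds(duration: str) -> int:
        # Before the refactor, call sites spelled out 7 / 28 / 365 and 24 directly.
        duration_in_days = {
            "daily": 1,
            "weekly": DAYS_IN_A_WEEK,
            "monthly": DAYS_IN_A_MONTH,
            "yearly": DAYS_IN_A_YEAR,
        }[duration]
        return duration_in_days * HOURS_IN_A_DAY * 60 * 60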
@@ -56,6 +56,9 @@ from litellm.constants import (
     bedrock_embedding_models,
     known_tokenizer_config,
     BEDROCK_INVOKE_PROVIDERS_LITERAL,
+    DEFAULT_MAX_TOKENS,
+    DEFAULT_SOFT_BUDGET,
+    DEFAULT_ALLOWED_FAILS,
 )
 from litellm.types.guardrails import GuardrailItem
 from litellm.proxy._types import (
@@ -155,7 +158,7 @@ token: Optional[
     str
 ] = None  # Not used anymore, will be removed in next MAJOR release - https://github.com/BerriAI/litellm/discussions/648
 telemetry = True
-max_tokens = 256  # OpenAI Defaults
+max_tokens: int = DEFAULT_MAX_TOKENS  # OpenAI Defaults
 drop_params = bool(os.getenv("LITELLM_DROP_PARAMS", False))
 modify_params = False
 retry = True
@@ -244,7 +247,7 @@ budget_duration: Optional[
     str
 ] = None  # proxy only - resets budget after fixed duration. You can set duration as seconds ("30s"), minutes ("30m"), hours ("30h"), days ("30d").
 default_soft_budget: float = (
-    50.0  # by default all litellm proxy keys have a soft budget of 50.0
+    DEFAULT_SOFT_BUDGET  # by default all litellm proxy keys have a soft budget of 50.0
 )
 forward_traceparent_to_llm_provider: bool = False
@@ -18,6 +18,7 @@ import redis  # type: ignore
 import redis.asyncio as async_redis  # type: ignore
 
 from litellm import get_secret, get_secret_str
+from litellm.constants import REDIS_CONNECTION_POOL_TIMEOUT, REDIS_SOCKET_TIMEOUT
 
 from ._logging import verbose_logger
 
@@ -215,7 +216,7 @@ def _init_redis_sentinel(redis_kwargs) -> redis.Redis:
     # Set up the Sentinel client
     sentinel = redis.Sentinel(
         sentinel_nodes,
-        socket_timeout=0.1,
+        socket_timeout=REDIS_SOCKET_TIMEOUT,
         password=sentinel_password,
     )
 
@@ -239,7 +240,7 @@ def _init_async_redis_sentinel(redis_kwargs) -> async_redis.Redis:
     # Set up the Sentinel client
     sentinel = async_redis.Sentinel(
         sentinel_nodes,
-        socket_timeout=0.1,
+        socket_timeout=REDIS_SOCKET_TIMEOUT,
         password=sentinel_password,
     )
 
@@ -319,7 +320,7 @@ def get_redis_connection_pool(**env_overrides):
     verbose_logger.debug("get_redis_connection_pool: redis_kwargs", redis_kwargs)
     if "url" in redis_kwargs and redis_kwargs["url"] is not None:
         return async_redis.BlockingConnectionPool.from_url(
-            timeout=5, url=redis_kwargs["url"]
+            timeout=REDIS_CONNECTION_POOL_TIMEOUT, url=redis_kwargs["url"]
         )
     connection_class = async_redis.Connection
     if "ssl" in redis_kwargs:
@@ -327,4 +328,6 @@ def get_redis_connection_pool(**env_overrides):
     redis_kwargs.pop("ssl", None)
     redis_kwargs["connection_class"] = connection_class
     redis_kwargs.pop("startup_nodes", None)
-    return async_redis.BlockingConnectionPool(timeout=5, **redis_kwargs)
+    return async_redis.BlockingConnectionPool(
+        timeout=REDIS_CONNECTION_POOL_TIMEOUT, **redis_kwargs
+    )
@@ -14,6 +14,12 @@ import time
 from typing import Literal, Optional
 
 import litellm
+from litellm.constants import (
+    DAYS_IN_A_MONTH,
+    DAYS_IN_A_WEEK,
+    DAYS_IN_A_YEAR,
+    HOURS_IN_A_DAY,
+)
 from litellm.utils import ModelResponse
 
 
@@ -81,11 +87,11 @@ class BudgetManager:
         if duration == "daily":
             duration_in_days = 1
         elif duration == "weekly":
-            duration_in_days = 7
+            duration_in_days = DAYS_IN_A_WEEK
         elif duration == "monthly":
-            duration_in_days = 28
+            duration_in_days = DAYS_IN_A_MONTH
         elif duration == "yearly":
-            duration_in_days = 365
+            duration_in_days = DAYS_IN_A_YEAR
         else:
             raise ValueError(
                 """duration needs to be one of ["daily", "weekly", "monthly", "yearly"]"""
@@ -182,7 +188,9 @@ class BudgetManager:
         current_time = time.time()
 
         # Convert duration from days to seconds
-        duration_in_seconds = self.user_dict[user]["duration"] * 24 * 60 * 60
+        duration_in_seconds = (
+            self.user_dict[user]["duration"] * HOURS_IN_A_DAY * 60 * 60
+        )
 
         # Check if duration has elapsed
         if current_time - last_updated_at >= duration_in_seconds:
@@ -19,6 +19,7 @@ from pydantic import BaseModel
 
 import litellm
 from litellm._logging import verbose_logger
+from litellm.constants import CACHED_STREAMING_CHUNK_DELAY
 from litellm.litellm_core_utils.model_param_helper import ModelParamHelper
 from litellm.types.caching import *
 from litellm.types.utils import all_litellm_params
@@ -406,7 +407,7 @@ class Cache:
                     }
                 ]
             }
-            time.sleep(0.02)
+            time.sleep(CACHED_STREAMING_CHUNK_DELAY)
 
     def _get_cache_logic(
         self,
@@ -15,7 +15,8 @@ from typing import Any, List, Optional
 
 from pydantic import BaseModel
 
-from ..constants import MAX_SIZE_PER_ITEM_IN_MEMORY_CACHE_IN_KB
+from litellm.constants import MAX_SIZE_PER_ITEM_IN_MEMORY_CACHE_IN_KB
+
 from .base_cache import BaseCache
 
 
@@ -52,7 +53,8 @@ class InMemoryCache(BaseCache):
         # Fast path for common primitive types that are typically small
         if (
             isinstance(value, (bool, int, float, str))
-            and len(str(value)) < self.max_size_per_item * 512
+            and len(str(value))
+            < self.max_size_per_item * MAX_SIZE_PER_ITEM_IN_MEMORY_CACHE_IN_KB
         ):  # Conservative estimate
             return True
 
@@ -11,10 +11,12 @@ Has 4 methods:
 import ast
 import asyncio
 import json
-from typing import Any
+from typing import Any, cast
 
 import litellm
 from litellm._logging import print_verbose
+from litellm.constants import QDRANT_SCALAR_QUANTILE, QDRANT_VECTOR_SIZE
+from litellm.types.utils import EmbeddingResponse
 
 from .base_cache import BaseCache
 
@@ -118,7 +120,11 @@ class QdrantSemanticCache(BaseCache):
             }
         elif quantization_config == "scalar":
             quantization_params = {
-                "scalar": {"type": "int8", "quantile": 0.99, "always_ram": False}
+                "scalar": {
+                    "type": "int8",
+                    "quantile": QDRANT_SCALAR_QUANTILE,
+                    "always_ram": False,
+                }
             }
         elif quantization_config == "product":
             quantization_params = {
@@ -132,7 +138,7 @@ class QdrantSemanticCache(BaseCache):
         new_collection_status = self.sync_client.put(
             url=f"{self.qdrant_api_base}/collections/{self.collection_name}",
             json={
-                "vectors": {"size": 1536, "distance": "Cosine"},
+                "vectors": {"size": QDRANT_VECTOR_SIZE, "distance": "Cosine"},
                 "quantization_config": quantization_params,
             },
             headers=self.headers,
@@ -171,10 +177,13 @@ class QdrantSemanticCache(BaseCache):
             prompt += message["content"]
 
         # create an embedding for prompt
-        embedding_response = litellm.embedding(
-            model=self.embedding_model,
-            input=prompt,
-            cache={"no-store": True, "no-cache": True},
+        embedding_response = cast(
+            EmbeddingResponse,
+            litellm.embedding(
+                model=self.embedding_model,
+                input=prompt,
+                cache={"no-store": True, "no-cache": True},
+            ),
         )
 
         # get the embedding
@@ -212,10 +221,13 @@ class QdrantSemanticCache(BaseCache):
             prompt += message["content"]
 
         # convert to embedding
-        embedding_response = litellm.embedding(
-            model=self.embedding_model,
-            input=prompt,
-            cache={"no-store": True, "no-cache": True},
+        embedding_response = cast(
+            EmbeddingResponse,
+            litellm.embedding(
+                model=self.embedding_model,
+                input=prompt,
+                cache={"no-store": True, "no-cache": True},
+            ),
         )
 
         # get the embedding
@@ -9,6 +9,7 @@ DEFAULT_FAILURE_THRESHOLD_PERCENT = (
     0.5  # default cooldown a deployment if 50% of requests fail in a given minute
 )
 DEFAULT_MAX_TOKENS = 4096
+DEFAULT_ALLOWED_FAILS = 3
 DEFAULT_REDIS_SYNC_INTERVAL = 1
 DEFAULT_COOLDOWN_TIME_SECONDS = 5
 DEFAULT_REPLICATE_POLLING_RETRIES = 5
@@ -16,16 +17,71 @@ DEFAULT_REPLICATE_POLLING_DELAY_SECONDS = 1
 DEFAULT_IMAGE_TOKEN_COUNT = 250
 DEFAULT_IMAGE_WIDTH = 300
 DEFAULT_IMAGE_HEIGHT = 300
+DEFAULT_MAX_TOKENS = 256  # used when providers need a default
 MAX_SIZE_PER_ITEM_IN_MEMORY_CACHE_IN_KB = 1024  # 1MB = 1024KB
 SINGLE_DEPLOYMENT_TRAFFIC_FAILURE_THRESHOLD = 1000  # Minimum number of requests to consider "reasonable traffic". Used for single-deployment cooldown logic.
 REDIS_UPDATE_BUFFER_KEY = "litellm_spend_update_buffer"
 REDIS_DAILY_SPEND_UPDATE_BUFFER_KEY = "litellm_daily_spend_update_buffer"
 MAX_REDIS_BUFFER_DEQUEUE_COUNT = 100
+MINIMUM_PROMPT_CACHE_TOKEN_COUNT = (
+    1024  # minimum number of tokens to cache a prompt by Anthropic
+)
+DEFAULT_TRIM_RATIO = 0.75  # default ratio of tokens to trim from the end of a prompt
+HOURS_IN_A_DAY = 24
+DAYS_IN_A_WEEK = 7
+DAYS_IN_A_MONTH = 28
+DAYS_IN_A_YEAR = 365
+REPLICATE_MODEL_NAME_WITH_ID_LENGTH = 64
+#### TOKEN COUNTING ####
+FUNCTION_DEFINITION_TOKEN_COUNT = 9
+SYSTEM_MESSAGE_TOKEN_COUNT = 4
+TOOL_CHOICE_OBJECT_TOKEN_COUNT = 4
+DEFAULT_MOCK_RESPONSE_PROMPT_TOKEN_COUNT = 10
+DEFAULT_MOCK_RESPONSE_COMPLETION_TOKEN_COUNT = 20
+MAX_SHORT_SIDE_FOR_IMAGE_HIGH_RES = 768
+MAX_LONG_SIDE_FOR_IMAGE_HIGH_RES = 2000
+MAX_TILE_WIDTH = 512
+MAX_TILE_HEIGHT = 512
+OPENAI_FILE_SEARCH_COST_PER_1K_CALLS = 2.5 / 1000
+MIN_NON_ZERO_TEMPERATURE = 0.0001
 #### RELIABILITY ####
 REPEATED_STREAMING_CHUNK_LIMIT = 100  # catch if model starts looping the same chunk while streaming. Uses high default to prevent false positives.
+DEFAULT_MAX_LRU_CACHE_SIZE = 16
+INITIAL_RETRY_DELAY = 0.5
+MAX_RETRY_DELAY = 8.0
+JITTER = 0.75
+DEFAULT_IN_MEMORY_TTL = 5  # default time to live for the in-memory cache
+DEFAULT_POLLING_INTERVAL = 0.03  # default polling interval for the scheduler
+AZURE_OPERATION_POLLING_TIMEOUT = 120
+REDIS_SOCKET_TIMEOUT = 0.1
+REDIS_CONNECTION_POOL_TIMEOUT = 5
+NON_LLM_CONNECTION_TIMEOUT = 15  # timeout for adjacent services (e.g. jwt auth)
+MAX_EXCEPTION_MESSAGE_LENGTH = 2000
+BEDROCK_MAX_POLICY_SIZE = 75
+REPLICATE_POLLING_DELAY_SECONDS = 0.5
+DEFAULT_ANTHROPIC_CHAT_MAX_TOKENS = 4096
+TOGETHER_AI_4_B = 4
+TOGETHER_AI_8_B = 8
+TOGETHER_AI_21_B = 21
+TOGETHER_AI_41_B = 41
+TOGETHER_AI_80_B = 80
+TOGETHER_AI_110_B = 110
+TOGETHER_AI_EMBEDDING_150_M = 150
+TOGETHER_AI_EMBEDDING_350_M = 350
+QDRANT_SCALAR_QUANTILE = 0.99
+QDRANT_VECTOR_SIZE = 1536
+CACHED_STREAMING_CHUNK_DELAY = 0.02
+MAX_SIZE_PER_ITEM_IN_MEMORY_CACHE_IN_KB = 512
+DEFAULT_MAX_TOKENS_FOR_TRITON = 2000
 #### Networking settings ####
 request_timeout: float = 6000  # time in seconds
 STREAM_SSE_DONE_STRING: str = "[DONE]"
+### SPEND TRACKING ###
+DEFAULT_REPLICATE_GPU_PRICE_PER_SECOND = 0.001400  # price per second for a100 80GB
+FIREWORKS_AI_56_B_MOE = 56
+FIREWORKS_AI_176_B_MOE = 176
+FIREWORKS_AI_16_B = 16
+FIREWORKS_AI_80_B = 80
 
 LITELLM_CHAT_PROVIDERS = [
     "openai",
@@ -426,6 +482,9 @@ MCP_TOOL_NAME_PREFIX = "mcp_tool"
 MAX_SPENDLOG_ROWS_TO_QUERY = (
     1_000_000  # if spendLogs has more than 1M rows, do not query the DB
 )
+DEFAULT_SOFT_BUDGET = (
+    50.0  # by default all litellm proxy keys have a soft budget of 50.0
+)
 # makes it clear this is a rate limit error for a litellm virtual key
 RATE_LIMIT_ERROR_MESSAGE_FOR_VIRTUAL_KEY = "LiteLLM Virtual Key user_api_key_hash"
 
@@ -451,3 +510,14 @@ LITELLM_PROXY_ADMIN_NAME = "default_user_id"
 ########################### DB CRON JOB NAMES ###########################
 DB_SPEND_UPDATE_JOB_NAME = "db_spend_update_job"
 DEFAULT_CRON_JOB_LOCK_TTL_SECONDS = 60  # 1 minute
+PROXY_BUDGET_RESCHEDULER_MIN_TIME = 597
+PROXY_BUDGET_RESCHEDULER_MAX_TIME = 605
+PROXY_BATCH_WRITE_AT = 10  # in seconds
+DEFAULT_HEALTH_CHECK_INTERVAL = 300  # 5 minutes
+PROMETHEUS_FALLBACK_STATS_SEND_TIME_HOURS = 9
+DEFAULT_MODEL_CREATED_AT_TIME = 1677610602  # returns on `/models` endpoint
+DEFAULT_SLACK_ALERTING_THRESHOLD = 300
+MAX_TEAM_LIST_LIMIT = 20
+DEFAULT_PROMPT_INJECTION_SIMILARITY_THRESHOLD = 0.7
+LENGTH_OF_LITELLM_GENERATED_KEY = 16
+SECRET_MANAGER_REFRESH_INTERVAL = 86400
@@ -9,6 +9,10 @@ from pydantic import BaseModel
 import litellm
 import litellm._logging
 from litellm import verbose_logger
+from litellm.constants import (
+    DEFAULT_MAX_LRU_CACHE_SIZE,
+    DEFAULT_REPLICATE_GPU_PRICE_PER_SECOND,
+)
 from litellm.litellm_core_utils.llm_cost_calc.tool_call_cost_tracking import (
     StandardBuiltInToolCostTracking,
 )
@@ -355,9 +359,7 @@ def cost_per_token(  # noqa: PLR0915
 def get_replicate_completion_pricing(completion_response: dict, total_time=0.0):
     # see https://replicate.com/pricing
     # for all litellm currently supported LLMs, almost all requests go to a100_80gb
-    a100_80gb_price_per_second_public = (
-        0.001400  # assume all calls sent to A100 80GB for now
-    )
+    a100_80gb_price_per_second_public = DEFAULT_REPLICATE_GPU_PRICE_PER_SECOND  # assume all calls sent to A100 80GB for now
     if total_time == 0.0:  # total time is in ms
         start_time = completion_response.get("created", time.time())
         end_time = getattr(completion_response, "ended", time.time())
@@ -450,7 +452,7 @@ def _select_model_name_for_cost_calc(
     return return_model
 
 
-@lru_cache(maxsize=16)
+@lru_cache(maxsize=DEFAULT_MAX_LRU_CACHE_SIZE)
 def _model_contains_known_llm_provider(model: str) -> bool:
     """
     Check if the model contains a known llm provider
@@ -16,6 +16,7 @@ import litellm.litellm_core_utils.litellm_logging
 import litellm.types
 from litellm._logging import verbose_logger, verbose_proxy_logger
 from litellm.caching.caching import DualCache
+from litellm.constants import HOURS_IN_A_DAY
 from litellm.integrations.custom_batch_logger import CustomBatchLogger
 from litellm.litellm_core_utils.duration_parser import duration_in_seconds
 from litellm.litellm_core_utils.exception_mapping_utils import (
@@ -649,10 +650,10 @@ class SlackAlerting(CustomBatchLogger):
             event_message += (
                 f"Budget Crossed\n Total Budget:`{user_info.max_budget}`"
             )
-        elif percent_left <= 0.05:
+        elif percent_left <= SLACK_ALERTING_THRESHOLD_5_PERCENT:
             event = "threshold_crossed"
             event_message += "5% Threshold Crossed "
-        elif percent_left <= 0.15:
+        elif percent_left <= SLACK_ALERTING_THRESHOLD_15_PERCENT:
             event = "threshold_crossed"
             event_message += "15% Threshold Crossed"
         elif user_info.soft_budget is not None:
@@ -1718,7 +1719,7 @@ Model Info:
             await self.internal_usage_cache.async_set_cache(
                 key=_event_cache_key,
                 value="SENT",
-                ttl=(30 * 24 * 60 * 60),  # 1 month
+                ttl=(30 * HOURS_IN_A_DAY * 60 * 60),  # 1 month
             )
 
         except Exception as e:
@@ -41,7 +41,7 @@ from litellm.types.utils import StandardLoggingPayload
 from ..additional_logging_utils import AdditionalLoggingUtils
 
 # max number of logs DD API can accept
-DD_MAX_BATCH_SIZE = 1000
 
 # specify what ServiceTypes are logged as success events to DD. (We don't want to spam DD traces with large number of service types)
 DD_LOGGED_SUCCESS_SERVICE_TYPES = [
@@ -20,10 +20,6 @@ else:
     VertexBase = Any
 
 
-GCS_DEFAULT_BATCH_SIZE = 2048
-GCS_DEFAULT_FLUSH_INTERVAL_SECONDS = 20
-
-
 class GCSBucketLogger(GCSBucketBase, AdditionalLoggingUtils):
     def __init__(self, bucket_name: Optional[str] = None) -> None:
         from litellm.proxy.proxy_server import premium_user
@@ -3,6 +3,7 @@ from typing import Optional, Tuple
 import httpx
 
 import litellm
+from litellm.constants import REPLICATE_MODEL_NAME_WITH_ID_LENGTH
 from litellm.secret_managers.main import get_secret, get_secret_str
 
 from ..types.router import LiteLLM_Params
@@ -256,10 +257,13 @@ def get_llm_provider(  # noqa: PLR0915
     elif model in litellm.cohere_chat_models:
         custom_llm_provider = "cohere_chat"
     ## replicate
-    elif model in litellm.replicate_models or (":" in model and len(model) > 64):
+    elif model in litellm.replicate_models or (
+        ":" in model and len(model) > REPLICATE_MODEL_NAME_WITH_ID_LENGTH
+    ):
         model_parts = model.split(":")
         if (
-            len(model_parts) > 1 and len(model_parts[1]) == 64
+            len(model_parts) > 1
+            and len(model_parts[1]) == REPLICATE_MODEL_NAME_WITH_ID_LENGTH
        ):  ## checks if model name has a 64 digit code - e.g. "meta/llama-2-70b-chat:02e509c789964a7ea8736978a43525956ef40397be9033abf9fd2badfe68c9e3"
             custom_llm_provider = "replicate"
     elif model in litellm.replicate_models:
@@ -28,6 +28,10 @@ from litellm._logging import _is_debugging_on, verbose_logger
 from litellm.batches.batch_utils import _handle_completed_batch
 from litellm.caching.caching import DualCache, InMemoryCache
 from litellm.caching.caching_handler import LLMCachingHandler
+from litellm.constants import (
+    DEFAULT_MOCK_RESPONSE_COMPLETION_TOKEN_COUNT,
+    DEFAULT_MOCK_RESPONSE_PROMPT_TOKEN_COUNT,
+)
 from litellm.cost_calculator import _select_model_name_for_cost_calc
 from litellm.integrations.arize.arize import ArizeLogger
 from litellm.integrations.custom_guardrail import CustomGuardrail
@@ -3745,9 +3749,12 @@ def create_dummy_standard_logging_payload() -> StandardLoggingPayload:
         response_cost=response_cost,
         response_cost_failure_debug_info=None,
         status=str("success"),
-        total_tokens=int(30),
-        prompt_tokens=int(20),
-        completion_tokens=int(10),
+        total_tokens=int(
+            DEFAULT_MOCK_RESPONSE_PROMPT_TOKEN_COUNT
+            + DEFAULT_MOCK_RESPONSE_COMPLETION_TOKEN_COUNT
+        ),
+        prompt_tokens=int(DEFAULT_MOCK_RESPONSE_PROMPT_TOKEN_COUNT),
+        completion_tokens=int(DEFAULT_MOCK_RESPONSE_COMPLETION_TOKEN_COUNT),
         startTime=start_time,
         endTime=end_time,
         completionStartTime=completion_start_time,
@@ -5,6 +5,7 @@ Helper utilities for tracking the cost of built-in tools.
 from typing import Any, Dict, List, Optional
 
 import litellm
+from litellm.constants import OPENAI_FILE_SEARCH_COST_PER_1K_CALLS
 from litellm.types.llms.openai import FileSearchTool, WebSearchOptions
 from litellm.types.utils import (
     ModelInfo,
@@ -132,7 +133,7 @@ class StandardBuiltInToolCostTracking:
         """
         if file_search is None:
             return 0.0
-        return 2.5 / 1000
+        return OPENAI_FILE_SEARCH_COST_PER_1K_CALLS
 
     @staticmethod
     def chat_completion_response_includes_annotations(
@@ -11,6 +11,10 @@ from litellm.constants import (
     DEFAULT_IMAGE_HEIGHT,
     DEFAULT_IMAGE_TOKEN_COUNT,
     DEFAULT_IMAGE_WIDTH,
+    MAX_LONG_SIDE_FOR_IMAGE_HIGH_RES,
+    MAX_SHORT_SIDE_FOR_IMAGE_HIGH_RES,
+    MAX_TILE_HEIGHT,
+    MAX_TILE_WIDTH,
 )
 from litellm.llms.custom_httpx.http_handler import _get_httpx_client
 
@@ -97,11 +101,14 @@ def resize_image_high_res(
     height: int,
 ) -> Tuple[int, int]:
     # Maximum dimensions for high res mode
-    max_short_side = 768
-    max_long_side = 2000
+    max_short_side = MAX_SHORT_SIDE_FOR_IMAGE_HIGH_RES
+    max_long_side = MAX_LONG_SIDE_FOR_IMAGE_HIGH_RES
 
     # Return early if no resizing is needed
-    if width <= 768 and height <= 768:
+    if (
+        width <= MAX_SHORT_SIDE_FOR_IMAGE_HIGH_RES
+        and height <= MAX_SHORT_SIDE_FOR_IMAGE_HIGH_RES
+    ):
         return width, height
 
     # Determine the longer and shorter sides
@@ -132,7 +139,10 @@ def resize_image_high_res(
 
 # Test the function with the given example
 def calculate_tiles_needed(
-    resized_width, resized_height, tile_width=512, tile_height=512
+    resized_width,
+    resized_height,
+    tile_width=MAX_TILE_WIDTH,
+    tile_height=MAX_TILE_HEIGHT,
 ):
     tiles_across = (resized_width + tile_width - 1) // tile_width
     tiles_down = (resized_height + tile_height - 1) // tile_height
@@ -5,7 +5,10 @@ from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union, cast
 import httpx
 
 import litellm
-from litellm.constants import RESPONSE_FORMAT_TOOL_NAME
+from litellm.constants import (
+    DEFAULT_ANTHROPIC_CHAT_MAX_TOKENS,
+    RESPONSE_FORMAT_TOOL_NAME,
+)
 from litellm.litellm_core_utils.core_helpers import map_finish_reason
 from litellm.litellm_core_utils.prompt_templates.factory import anthropic_messages_pt
 from litellm.llms.base_llm.base_utils import type_to_response_format_param
@@ -53,7 +56,7 @@ class AnthropicConfig(BaseConfig):
 
     max_tokens: Optional[
         int
-    ] = 4096  # anthropic requires a default value (Opus, Sonnet, and Haiku have the same default)
+    ] = DEFAULT_ANTHROPIC_CHAT_MAX_TOKENS  # anthropic requires a default value (Opus, Sonnet, and Haiku have the same default)
     stop_sequences: Optional[list] = None
     temperature: Optional[int] = None
     top_p: Optional[int] = None
@@ -65,7 +68,7 @@ class AnthropicConfig(BaseConfig):
         self,
         max_tokens: Optional[
             int
-        ] = 4096,  # You can pass in a value yourself or use the default value 4096
+        ] = DEFAULT_ANTHROPIC_CHAT_MAX_TOKENS,  # You can pass in a value yourself or use the default value 4096
         stop_sequences: Optional[list] = None,
         temperature: Optional[int] = None,
         top_p: Optional[int] = None,
@@ -11,6 +11,7 @@ from typing import AsyncIterator, Dict, Iterator, List, Optional, Union
 import httpx
 
 import litellm
+from litellm.constants import DEFAULT_MAX_TOKENS
 from litellm.litellm_core_utils.prompt_templates.factory import (
     custom_prompt,
     prompt_factory,
@@ -65,7 +66,9 @@ class AnthropicTextConfig(BaseConfig):
 
     def __init__(
         self,
-        max_tokens_to_sample: Optional[int] = 256,  # anthropic requires a default
+        max_tokens_to_sample: Optional[
+            int
+        ] = DEFAULT_MAX_TOKENS,  # anthropic requires a default
         stop_sequences: Optional[list] = None,
         temperature: Optional[int] = None,
         top_p: Optional[int] = None,
@@ -7,7 +7,7 @@ import httpx  # type: ignore
 from openai import APITimeoutError, AsyncAzureOpenAI, AzureOpenAI
 
 import litellm
-from litellm.constants import DEFAULT_MAX_RETRIES
+from litellm.constants import AZURE_OPERATION_POLLING_TIMEOUT, DEFAULT_MAX_RETRIES
 from litellm.litellm_core_utils.litellm_logging import Logging as LiteLLMLoggingObj
 from litellm.litellm_core_utils.logging_utils import track_llm_api_timing
 from litellm.llms.custom_httpx.http_handler import (
@@ -857,7 +857,7 @@ class AzureChatCompletion(BaseAzureLLM, BaseLLM):
 
         await response.aread()
 
-        timeout_secs: int = 120
+        timeout_secs: int = AZURE_OPERATION_POLLING_TIMEOUT
         start_time = time.time()
         if "status" not in response.json():
             raise Exception(
@@ -955,7 +955,7 @@ class AzureChatCompletion(BaseAzureLLM, BaseLLM):
 
         response.read()
 
-        timeout_secs: int = 120
+        timeout_secs: int = AZURE_OPERATION_POLLING_TIMEOUT
         start_time = time.time()
         if "status" not in response.json():
             raise Exception(
@@ -7,6 +7,10 @@ from litellm.litellm_core_utils.prompt_templates.factory import (
     convert_to_azure_openai_messages,
 )
 from litellm.llms.base_llm.chat.transformation import BaseLLMException
+from litellm.types.llms.azure import (
+    API_VERSION_MONTH_SUPPORTED_RESPONSE_FORMAT,
+    API_VERSION_YEAR_SUPPORTED_RESPONSE_FORMAT,
+)
 from litellm.types.utils import ModelResponse
 from litellm.utils import supports_response_schema
 
@@ -123,7 +127,10 @@ class AzureOpenAIConfig(BaseConfig):
         - check if api_version is supported for response_format
         """
 
-        is_supported = int(api_version_year) <= 2024 and int(api_version_month) >= 8
+        is_supported = (
+            int(api_version_year) <= API_VERSION_YEAR_SUPPORTED_RESPONSE_FORMAT
+            and int(api_version_month) >= API_VERSION_MONTH_SUPPORTED_RESPONSE_FORMAT
+        )
 
         return is_supported
 
@@ -9,7 +9,7 @@ from pydantic import BaseModel
 
 from litellm._logging import verbose_logger
 from litellm.caching.caching import DualCache
-from litellm.constants import BEDROCK_INVOKE_PROVIDERS_LITERAL
+from litellm.constants import BEDROCK_INVOKE_PROVIDERS_LITERAL, BEDROCK_MAX_POLICY_SIZE
 from litellm.litellm_core_utils.dd_tracing import tracer
 from litellm.secret_managers.main import get_secret
 
@@ -381,7 +381,7 @@ class BaseAWSLLM:
                 "region_name": aws_region_name,
             }
 
-            if sts_response["PackedPolicySize"] > 75:
+            if sts_response["PackedPolicySize"] > BEDROCK_MAX_POLICY_SIZE:
                 verbose_logger.warning(
                     f"The policy size is greater than 75% of the allowed size, PackedPolicySize: {sts_response['PackedPolicySize']}"
                 )
@@ -1,6 +1,7 @@
 from typing import Optional, Tuple, Union
 
 import litellm
+from litellm.constants import MIN_NON_ZERO_TEMPERATURE
 from litellm.llms.openai.chat.gpt_transformation import OpenAIGPTConfig
 from litellm.secret_managers.main import get_secret_str
 
@@ -84,7 +85,7 @@ class DeepInfraConfig(OpenAIGPTConfig):
                 and value == 0
                 and model == "mistralai/Mistral-7B-Instruct-v0.1"
             ):  # this model does no support temperature == 0
-                value = 0.0001  # close to 0
+                value = MIN_NON_ZERO_TEMPERATURE  # close to 0
             if param == "tool_choice":
                 if (
                     value != "auto" and value != "none"
@@ -4,6 +4,12 @@ For calculating cost of fireworks ai serverless inference models.
 
 from typing import Tuple
 
+from litellm.constants import (
+    FIREWORKS_AI_16_B,
+    FIREWORKS_AI_56_B_MOE,
+    FIREWORKS_AI_80_B,
+    FIREWORKS_AI_176_B_MOE,
+)
 from litellm.types.utils import Usage
 from litellm.utils import get_model_info
 
@@ -25,9 +31,9 @@ def get_base_model_for_pricing(model_name: str) -> str:
     moe_match = re.search(r"(\d+)x(\d+)b", model_name)
     if moe_match:
         total_billion = int(moe_match.group(1)) * int(moe_match.group(2))
-        if total_billion <= 56:
+        if total_billion <= FIREWORKS_AI_56_B_MOE:
             return "fireworks-ai-moe-up-to-56b"
-        elif total_billion <= 176:
+        elif total_billion <= FIREWORKS_AI_176_B_MOE:
             return "fireworks-ai-56b-to-176b"
 
     # Check for standard models in the form <number>b
@@ -37,9 +43,9 @@ def get_base_model_for_pricing(model_name: str) -> str:
         params_billion = float(params_match)
 
         # Determine the category based on the number of parameters
-        if params_billion <= 16.0:
+        if params_billion <= FIREWORKS_AI_16_B:
             return "fireworks-ai-up-to-16b"
-        elif params_billion <= 80.0:
+        elif params_billion <= FIREWORKS_AI_80_B:
             return "fireworks-ai-16b-80b"
 
     # If no matches, return the original model_name
@@ -2,6 +2,7 @@ from typing import TYPE_CHECKING, Any, List, Literal, Optional, Union
 
 from httpx import Headers, Response
 
+from litellm.constants import DEFAULT_MAX_TOKENS
 from litellm.llms.base_llm.chat.transformation import BaseConfig, BaseLLMException
 from litellm.types.llms.openai import AllMessageValues
 from litellm.types.utils import ModelResponse
@@ -27,7 +28,7 @@ class PredibaseConfig(BaseConfig):
     decoder_input_details: Optional[bool] = None
     details: bool = True  # enables returning logprobs + best of
     max_new_tokens: int = (
-        256  # openai default - requests hang if max_new_tokens not given
+        DEFAULT_MAX_TOKENS  # openai default - requests hang if max_new_tokens not given
     )
     repetition_penalty: Optional[float] = None
     return_full_text: Optional[
@@ -4,6 +4,7 @@ import time
 from typing import Callable, List, Union
 
 import litellm
+from litellm.constants import REPLICATE_POLLING_DELAY_SECONDS
 from litellm.llms.custom_httpx.http_handler import (
     AsyncHTTPHandler,
     HTTPHandler,
@@ -28,7 +29,9 @@ def handle_prediction_response_streaming(
 
     status = ""
     while True and (status not in ["succeeded", "failed", "canceled"]):
-        time.sleep(0.5)  # prevent being rate limited by replicate
+        time.sleep(
+            REPLICATE_POLLING_DELAY_SECONDS
+        )  # prevent being rate limited by replicate
         print_verbose(f"replicate: polling endpoint: {prediction_url}")
         response = http_client.get(prediction_url, headers=headers)
         if response.status_code == 200:
@@ -77,7 +80,9 @@ async def async_handle_prediction_response_streaming(
 
     status = ""
     while True and (status not in ["succeeded", "failed", "canceled"]):
-        await asyncio.sleep(0.5)  # prevent being rate limited by replicate
+        await asyncio.sleep(
+            REPLICATE_POLLING_DELAY_SECONDS
+        )  # prevent being rate limited by replicate
         print_verbose(f"replicate: polling endpoint: {prediction_url}")
         response = await http_client.get(prediction_url, headers=headers)
         if response.status_code == 200:
@@ -3,6 +3,7 @@ from typing import TYPE_CHECKING, Any, List, Optional, Union
 import httpx
 
 import litellm
+from litellm.constants import REPLICATE_MODEL_NAME_WITH_ID_LENGTH
 from litellm.litellm_core_utils.prompt_templates.common_utils import (
     convert_content_list_to_str,
 )
@@ -221,10 +222,11 @@ class ReplicateConfig(BaseConfig):
 
         version_id = self.model_to_version_id(model)
         request_data: dict = {"input": input_data}
-        if ":" in version_id and len(version_id) > 64:
+        if ":" in version_id and len(version_id) > REPLICATE_MODEL_NAME_WITH_ID_LENGTH:
             model_parts = version_id.split(":")
             if (
-                len(model_parts) > 1 and len(model_parts[1]) == 64
+                len(model_parts) > 1
+                and len(model_parts[1]) == REPLICATE_MODEL_NAME_WITH_ID_LENGTH
             ):  ## checks if model name has a 64 digit code - e.g. "meta/llama-2-70b-chat:02e509c789964a7ea8736978a43525956ef40397be9033abf9fd2badfe68c9e3"
                 request_data["version"] = model_parts[1]
 
@@ -4,6 +4,16 @@ Handles calculating cost for together ai models
 
 import re
 
+from litellm.constants import (
+    TOGETHER_AI_4_B,
+    TOGETHER_AI_8_B,
+    TOGETHER_AI_21_B,
+    TOGETHER_AI_41_B,
+    TOGETHER_AI_80_B,
+    TOGETHER_AI_110_B,
+    TOGETHER_AI_EMBEDDING_150_M,
+    TOGETHER_AI_EMBEDDING_350_M,
+)
 from litellm.types.utils import CallTypes
 
 
@@ -31,17 +41,17 @@ def get_model_params_and_category(model_name, call_type: CallTypes) -> str:
     else:
         return model_name
     # Determine the category based on the number of parameters
-    if params_billion <= 4.0:
+    if params_billion <= TOGETHER_AI_4_B:
         category = "together-ai-up-to-4b"
-    elif params_billion <= 8.0:
+    elif params_billion <= TOGETHER_AI_8_B:
         category = "together-ai-4.1b-8b"
-    elif params_billion <= 21.0:
+    elif params_billion <= TOGETHER_AI_21_B:
         category = "together-ai-8.1b-21b"
-    elif params_billion <= 41.0:
+    elif params_billion <= TOGETHER_AI_41_B:
         category = "together-ai-21.1b-41b"
-    elif params_billion <= 80.0:
+    elif params_billion <= TOGETHER_AI_80_B:
         category = "together-ai-41.1b-80b"
-    elif params_billion <= 110.0:
+    elif params_billion <= TOGETHER_AI_110_B:
         category = "together-ai-81.1b-110b"
     if category is not None:
         return category
@@ -69,9 +79,9 @@ def get_model_params_and_category_embeddings(model_name) -> str:
     else:
         return model_name
     # Determine the category based on the number of parameters
-    if params_million <= 150:
+    if params_million <= TOGETHER_AI_EMBEDDING_150_M:
         category = "together-ai-embedding-up-to-150m"
-    elif params_million <= 350:
+    elif params_million <= TOGETHER_AI_EMBEDDING_350_M:
         category = "together-ai-embedding-151m-to-350m"
     if category is not None:
         return category
@@ -7,6 +7,7 @@ from typing import Any, AsyncIterator, Dict, Iterator, List, Literal, Optional,
 
 from httpx import Headers, Response
 
+from litellm.constants import DEFAULT_MAX_TOKENS_FOR_TRITON
 from litellm.litellm_core_utils.prompt_templates.factory import prompt_factory
 from litellm.llms.base_llm.base_model_iterator import BaseModelResponseIterator
 from litellm.llms.base_llm.chat.transformation import (
@@ -196,7 +197,9 @@ class TritonGenerateConfig(TritonConfig):
         data_for_triton: Dict[str, Any] = {
             "text_input": prompt_factory(model=model, messages=messages),
             "parameters": {
-                "max_tokens": int(optional_params.get("max_tokens", 2000)),
+                "max_tokens": int(
+                    optional_params.get("max_tokens", DEFAULT_MAX_TOKENS_FOR_TRITON)
+                ),
                 "bad_words": [""],
                 "stop_words": [""],
             },
@@ -51,6 +51,10 @@ from litellm import (  # type: ignore
     get_litellm_params,
     get_optional_params,
 )
+from litellm.constants import (
+    DEFAULT_MOCK_RESPONSE_COMPLETION_TOKEN_COUNT,
+    DEFAULT_MOCK_RESPONSE_PROMPT_TOKEN_COUNT,
+)
 from litellm.exceptions import LiteLLMUnknownProvider
 from litellm.integrations.custom_logger import CustomLogger
 from litellm.litellm_core_utils.audio_utils.utils import get_audio_file_for_health_check
@@ -740,7 +744,12 @@ def mock_completion(
         setattr(
             model_response,
             "usage",
-            Usage(prompt_tokens=10, completion_tokens=20, total_tokens=30),
+            Usage(
+                prompt_tokens=DEFAULT_MOCK_RESPONSE_PROMPT_TOKEN_COUNT,
+                completion_tokens=DEFAULT_MOCK_RESPONSE_COMPLETION_TOKEN_COUNT,
+                total_tokens=DEFAULT_MOCK_RESPONSE_PROMPT_TOKEN_COUNT
+                + DEFAULT_MOCK_RESPONSE_COMPLETION_TOKEN_COUNT,
+            ),
         )
 
     try:
@@ -3067,7 +3076,7 @@ def completion(  # type: ignore # noqa: PLR0915
                     "max_tokens": max_tokens,
                     "temperature": temperature,
                     "top_p": top_p,
-                    "top_k": kwargs.get("top_k", 40),
+                    "top_k": kwargs.get("top_k"),
                 },
             },
         )
@@ -20,6 +20,7 @@ import litellm
 from litellm._logging import verbose_proxy_logger
 from litellm.caching.caching import DualCache
 from litellm.caching.dual_cache import LimitedSizeOrderedDict
+from litellm.constants import DEFAULT_IN_MEMORY_TTL
 from litellm.litellm_core_utils.get_llm_provider_logic import get_llm_provider
 from litellm.proxy._types import (
     RBAC_ROLES,
@@ -55,7 +56,7 @@ else:
 
 
 last_db_access_time = LimitedSizeOrderedDict(max_size=100)
-db_cache_expiry = 5  # refresh every 5s
+db_cache_expiry = DEFAULT_IN_MEMORY_TTL  # refresh every 5s
 
 all_routes = LiteLLMRoutes.openai_routes.value + LiteLLMRoutes.management_routes.value
 
@@ -9,6 +9,7 @@ from typing import Optional
 import httpx
 
 from litellm._logging import verbose_proxy_logger
+from litellm.constants import NON_LLM_CONNECTION_TIMEOUT
 from litellm.llms.custom_httpx.http_handler import HTTPHandler
 
 
@@ -23,7 +24,7 @@ class LicenseCheck:
     def __init__(self) -> None:
         self.license_str = os.getenv("LITELLM_LICENSE", None)
         verbose_proxy_logger.debug("License Str value - {}".format(self.license_str))
-        self.http_handler = HTTPHandler(timeout=15)
+        self.http_handler = HTTPHandler(timeout=NON_LLM_CONNECTION_TIMEOUT)
         self.public_key = None
         self.read_public_key()
 
@@ -15,6 +15,7 @@ from fastapi import HTTPException
import litellm
from litellm._logging import verbose_proxy_logger
from litellm.caching.caching import DualCache
+ from litellm.constants import DEFAULT_PROMPT_INJECTION_SIMILARITY_THRESHOLD
from litellm.integrations.custom_logger import CustomLogger
from litellm.litellm_core_utils.prompt_templates.factory import (
prompt_injection_detection_default_pt,

@@ -110,7 +111,9 @@ class _OPTIONAL_PromptInjectionDetection(CustomLogger):
return combinations

def check_user_input_similarity(
- self, user_input: str, similarity_threshold: float = 0.7
+ self,
+ user_input: str,
+ similarity_threshold: float = DEFAULT_PROMPT_INJECTION_SIMILARITY_THRESHOLD,
) -> bool:
user_input_lower = user_input.lower()
keywords = self.generate_injection_keywords()
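Note: a threshold-based similarity check of this kind can be sketched with nothing but the standard library. The snippet below is an illustration only, not the body of check_user_input_similarity, and the 0.7 default is simply the literal this hunk replaces.

# similarity_sketch.py -- illustrative threshold check; not litellm's implementation.
from difflib import SequenceMatcher

DEFAULT_PROMPT_INJECTION_SIMILARITY_THRESHOLD = 0.7  # assumed: the old hardcoded default


def looks_like_injection(
    user_input: str,
    keywords: list,
    threshold: float = DEFAULT_PROMPT_INJECTION_SIMILARITY_THRESHOLD,
) -> bool:
    # Flag the input if it is "close enough" to any known injection phrase.
    user_input_lower = user_input.lower()
    return any(
        SequenceMatcher(None, user_input_lower, keyword.lower()).ratio() >= threshold
        for keyword in keywords
    )


if __name__ == "__main__":
    print(looks_like_injection("ignore all previous instructions", ["ignore previous instructions"]))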
@@ -24,7 +24,7 @@ from fastapi import APIRouter, Depends, Header, HTTPException, Query, Request, s
import litellm
from litellm._logging import verbose_proxy_logger
from litellm.caching import DualCache
- from litellm.constants import UI_SESSION_TOKEN_TEAM_ID
+ from litellm.constants import LENGTH_OF_LITELLM_GENERATED_KEY, UI_SESSION_TOKEN_TEAM_ID
from litellm.litellm_core_utils.duration_parser import duration_in_seconds
from litellm.proxy._types import *
from litellm.proxy.auth.auth_checks import (

@@ -1164,7 +1164,7 @@ async def generate_key_helper_fn(  # noqa: PLR0915
if key is not None:
token = key
else:
- token = f"sk-{secrets.token_urlsafe(16)}"
+ token = f"sk-{secrets.token_urlsafe(LENGTH_OF_LITELLM_GENERATED_KEY)}"

if duration is None:  # allow tokens that never expire
expires = None

@@ -1745,7 +1745,7 @@ async def regenerate_key_fn(

verbose_proxy_logger.debug("key_in_db: %s", _key_in_db)

- new_token = f"sk-{secrets.token_urlsafe(16)}"
+ new_token = f"sk-{secrets.token_urlsafe(LENGTH_OF_LITELLM_GENERATED_KEY)}"
new_token_hash = hash_token(new_token)
new_token_key_name = f"sk-...{new_token[-4:]}"
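Note: both key paths now share LENGTH_OF_LITELLM_GENERATED_KEY in place of the literal 16. Keep in mind that secrets.token_urlsafe(n) takes a number of random bytes, not a character count. A small, self-contained illustration; the constant's value is assumed to be the 16 it replaces.

# key_length_demo.py -- shows what the token_urlsafe argument controls; the constant
# value is assumed (it replaces the literal 16), the real one lives in constants.py.
import secrets

LENGTH_OF_LITELLM_GENERATED_KEY = 16  # random bytes, not output characters

token = f"sk-{secrets.token_urlsafe(LENGTH_OF_LITELLM_GENERATED_KEY)}"
# 16 random bytes encode to ~22 URL-safe base64 characters, so the key is ~25 chars long.
print(token, len(token))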
@@ -15,6 +15,10 @@ from litellm.litellm_core_utils.litellm_logging import (
)
from litellm.litellm_core_utils.thread_pool_executor import executor
from litellm.proxy.pass_through_endpoints.types import PassthroughStandardLoggingPayload
+ from litellm.types.passthrough_endpoints.assembly_ai import (
+ ASSEMBLY_AI_MAX_POLLING_ATTEMPTS,
+ ASSEMBLY_AI_POLLING_INTERVAL,
+ )


class AssemblyAITranscriptResponse(TypedDict, total=False):

@@ -34,13 +38,13 @@ class AssemblyAIPassthroughLoggingHandler:
The base URL for the AssemblyAI API
"""

- self.polling_interval: float = 10
+ self.polling_interval: float = ASSEMBLY_AI_POLLING_INTERVAL
"""
The polling interval for the AssemblyAI API.
litellm needs to poll the GET /transcript/{transcript_id} endpoint to get the status of the transcript.
"""

- self.max_polling_attempts = 180
+ self.max_polling_attempts = ASSEMBLY_AI_MAX_POLLING_ATTEMPTS
"""
The maximum number of polling attempts for the AssemblyAI API.
"""
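Note: taken together, the two constants bound how long a transcript is polled: 180 attempts at a 10-second interval is a 30-minute ceiling. A stripped-down sketch of such a loop follows; fetch_status is a hypothetical stand-in for the real GET /transcript/{transcript_id} call, not a litellm function.

# polling_sketch.py -- illustrative bounded polling loop; fetch_status() is hypothetical.
import time

ASSEMBLY_AI_POLLING_INTERVAL = 10        # seconds, per the new constants file
ASSEMBLY_AI_MAX_POLLING_ATTEMPTS = 180   # 180 * 10 s = 30 minutes worst case


def wait_for_transcript(fetch_status) -> bool:
    for _ in range(ASSEMBLY_AI_MAX_POLLING_ATTEMPTS):
        if fetch_status() == "completed":
            return True
        time.sleep(ASSEMBLY_AI_POLLING_INTERVAL)
    return False  # gave up after the maximum number of attempts


if __name__ == "__main__":
    print(wait_for_transcript(lambda: "completed"))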
@@ -25,7 +25,10 @@ from typing import (
get_type_hints,
)

- from litellm.constants import DEFAULT_MAX_RECURSE_DEPTH
+ from litellm.constants import (
+ DEFAULT_MAX_RECURSE_DEPTH,
+ DEFAULT_SLACK_ALERTING_THRESHOLD,
+ )
from litellm.types.utils import (
ModelResponse,
ModelResponseStream,

@@ -118,7 +121,16 @@ import litellm
from litellm import Router
from litellm._logging import verbose_proxy_logger, verbose_router_logger
from litellm.caching.caching import DualCache, RedisCache
- from litellm.constants import LITELLM_PROXY_ADMIN_NAME
+ from litellm.constants import (
+ DAYS_IN_A_MONTH,
+ DEFAULT_HEALTH_CHECK_INTERVAL,
+ DEFAULT_MODEL_CREATED_AT_TIME,
+ LITELLM_PROXY_ADMIN_NAME,
+ PROMETHEUS_FALLBACK_STATS_SEND_TIME_HOURS,
+ PROXY_BATCH_WRITE_AT,
+ PROXY_BUDGET_RESCHEDULER_MAX_TIME,
+ PROXY_BUDGET_RESCHEDULER_MIN_TIME,
+ )
from litellm.exceptions import RejectedRequestError
from litellm.integrations.SlackAlerting.slack_alerting import SlackAlerting
from litellm.litellm_core_utils.core_helpers import (

@@ -287,7 +299,7 @@ from litellm.router import (
LiteLLM_Params,
ModelGroupInfo,
)
- from litellm.scheduler import DefaultPriorities, FlowItem, Scheduler
+ from litellm.scheduler import FlowItem, Scheduler
from litellm.secret_managers.aws_secret_manager import load_aws_kms
from litellm.secret_managers.google_kms import load_google_kms
from litellm.secret_managers.main import (

@@ -307,6 +319,7 @@ from litellm.types.llms.openai import HttpxBinaryResponseContent
from litellm.types.router import DeploymentTypedDict
from litellm.types.router import ModelInfo as RouterModelInfo
from litellm.types.router import RouterGeneralSettings, updateDeployment
+ from litellm.types.scheduler import DefaultPriorities
from litellm.types.utils import CredentialItem, CustomHuggingfaceTokenizer
from litellm.types.utils import ModelInfo as ModelMapInfo
from litellm.types.utils import RawRequestTypedDict, StandardLoggingPayload
@@ -779,9 +792,9 @@ queue: List = []
litellm_proxy_budget_name = "litellm-proxy-budget"
litellm_proxy_admin_name = LITELLM_PROXY_ADMIN_NAME
ui_access_mode: Literal["admin", "all"] = "all"
- proxy_budget_rescheduler_min_time = 597
+ proxy_budget_rescheduler_min_time = PROXY_BUDGET_RESCHEDULER_MIN_TIME
- proxy_budget_rescheduler_max_time = 605
+ proxy_budget_rescheduler_max_time = PROXY_BUDGET_RESCHEDULER_MAX_TIME
- proxy_batch_write_at = 10  # in seconds
+ proxy_batch_write_at = PROXY_BATCH_WRITE_AT
litellm_master_key_hash = None
disable_spend_logs = False
jwt_handler = JWTHandler()
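Note: the min/max pair that used to be 597 and 605 defines a narrow window from which the budget-reset job picks its interval; randomizing inside a small window is a common way to keep multiple proxy replicas from firing at the same instant. A sketch under the assumption that the constants keep those values.

# rescheduler_window_sketch.py -- illustrative; assumes the constants keep the 597/605
# second values of the literals they replace.
import random

PROXY_BUDGET_RESCHEDULER_MIN_TIME = 597
PROXY_BUDGET_RESCHEDULER_MAX_TIME = 605


def next_budget_reset_interval() -> int:
    # A random interval inside an ~8 s window spreads the reset job across replicas.
    return random.randint(
        PROXY_BUDGET_RESCHEDULER_MIN_TIME, PROXY_BUDGET_RESCHEDULER_MAX_TIME
    )


if __name__ == "__main__":
    print(next_budget_reset_interval())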
@@ -1846,7 +1859,9 @@ class ProxyConfig:
use_background_health_checks = general_settings.get(
"background_health_checks", False
)
- health_check_interval = general_settings.get("health_check_interval", 300)
+ health_check_interval = general_settings.get(
+ "health_check_interval", DEFAULT_HEALTH_CHECK_INTERVAL
+ )
health_check_details = general_settings.get("health_check_details", True)

### RBAC ###

@@ -3145,7 +3160,7 @@ class ProxyStartupEvent:
scheduler.add_job(
proxy_logging_obj.slack_alerting_instance.send_fallback_stats_from_prometheus,
"cron",
- hour=9,
+ hour=PROMETHEUS_FALLBACK_STATS_SEND_TIME_HOURS,
minute=0,
timezone=ZoneInfo("America/Los_Angeles"),  # Pacific Time
)

@@ -3278,7 +3293,7 @@ async def model_list(
{
"id": model,
"object": "model",
- "created": 1677610602,
+ "created": DEFAULT_MODEL_CREATED_AT_TIME,
"owned_by": "openai",
}
for model in all_models
@@ -5592,7 +5607,7 @@ async def model_metrics(
param="None",
code=status.HTTP_500_INTERNAL_SERVER_ERROR,
)
- startTime = startTime or datetime.now() - timedelta(days=30)
+ startTime = startTime or datetime.now() - timedelta(days=DAYS_IN_A_MONTH)
endTime = endTime or datetime.now()

if api_key is None or api_key == "undefined":

@@ -5713,11 +5728,12 @@ async def model_metrics_slow_responses(
if customer is None or customer == "undefined":
customer = "null"

- startTime = startTime or datetime.now() - timedelta(days=30)
+ startTime = startTime or datetime.now() - timedelta(days=DAYS_IN_A_MONTH)
endTime = endTime or datetime.now()

alerting_threshold = (
- proxy_logging_obj.slack_alerting_instance.alerting_threshold or 300
+ proxy_logging_obj.slack_alerting_instance.alerting_threshold
+ or DEFAULT_SLACK_ALERTING_THRESHOLD
)
alerting_threshold = int(alerting_threshold)

@@ -5797,7 +5813,7 @@ async def model_metrics_exceptions(
code=status.HTTP_500_INTERNAL_SERVER_ERROR,
)

- startTime = startTime or datetime.now() - timedelta(days=30)
+ startTime = startTime or datetime.now() - timedelta(days=DAYS_IN_A_MONTH)
endTime = endTime or datetime.now()

if api_key is None or api_key == "undefined":
@@ -22,6 +22,7 @@ from typing import (
overload,
)

+ from litellm.constants import MAX_TEAM_LIST_LIMIT
from litellm.proxy._types import (
DB_CONNECTION_ERROR_TYPES,
CommonProxyErrors,

@@ -1596,7 +1597,9 @@ class PrismaClient:
where={"team_id": {"in": team_id_list}}
)
elif query_type == "find_all" and team_id_list is None:
- response = await self.db.litellm_teamtable.find_many(take=20)
+ response = await self.db.litellm_teamtable.find_many(
+ take=MAX_TEAM_LIST_LIMIT
+ )
return response
elif table_name == "user_notification":
if query_type == "find_unique":
@@ -50,6 +50,7 @@ from litellm.caching.caching import (
RedisCache,
RedisClusterCache,
)
+ from litellm.constants import DEFAULT_MAX_LRU_CACHE_SIZE
from litellm.integrations.custom_logger import CustomLogger
from litellm.litellm_core_utils.asyncify import run_async_function
from litellm.litellm_core_utils.core_helpers import _get_parent_otel_span_from_kwargs

@@ -5073,7 +5074,7 @@ class Router:
rpm_usage += t
return tpm_usage, rpm_usage

- @lru_cache(maxsize=64)
+ @lru_cache(maxsize=DEFAULT_MAX_LRU_CACHE_SIZE)
def _cached_get_model_group_info(
self, model_group: str
) -> Optional[ModelGroupInfo]:
@@ -1,6 +1,7 @@
from typing import TYPE_CHECKING, Any, Optional, Union

from litellm._logging import verbose_router_logger
+ from litellm.constants import MAX_EXCEPTION_MESSAGE_LENGTH
from litellm.router_utils.cooldown_handlers import (
_async_get_cooldown_deployments_with_debug_info,
)

@@ -54,7 +55,7 @@ async def send_llm_exception_alert(
exception_str = str(original_exception)
if litellm_debug_info is not None:
exception_str += litellm_debug_info
- exception_str += f"\n\n{error_traceback_str[:2000]}"
+ exception_str += f"\n\n{error_traceback_str[:MAX_EXCEPTION_MESSAGE_LENGTH]}"

await litellm_router_instance.slack_alerting_logger.send_alert(
message=f"LLM API call failed: `{exception_str}`",
@@ -6,17 +6,14 @@ from pydantic import BaseModel

from litellm import print_verbose
from litellm.caching.caching import DualCache, RedisCache
+ from litellm.constants import DEFAULT_IN_MEMORY_TTL, DEFAULT_POLLING_INTERVAL


class SchedulerCacheKeys(enum.Enum):
queue = "scheduler:queue"
- default_in_memory_ttl = 5  # cache queue in-memory for 5s when redis cache available
+ default_in_memory_ttl = (
+ DEFAULT_IN_MEMORY_TTL  # cache queue in-memory for 5s when redis cache available
+ )
- class DefaultPriorities(enum.Enum):
- High = 0
- Medium = 128
- Low = 255


class FlowItem(BaseModel):
@@ -44,7 +41,9 @@ class Scheduler:
self.cache = DualCache(
redis_cache=redis_cache, default_in_memory_ttl=default_in_memory_ttl
)
- self.polling_interval = polling_interval or 0.03  # default to 3ms
+ self.polling_interval = (
+ polling_interval or DEFAULT_POLLING_INTERVAL
+ )  # default to 3ms

async def add_request(self, request: FlowItem):
# We use the priority directly, as lower values indicate higher priority
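Note: DEFAULT_POLLING_INTERVAL stands in for the 0.03 s (3 ms) literal that drives the scheduler's poll loop. A reduced, self-contained version of a poll-until-ready loop on that interval; illustrative only, not the Scheduler's actual logic.

# scheduler_poll_sketch.py -- illustrative; assumes DEFAULT_POLLING_INTERVAL keeps the
# 0.03 s value noted in the comment above.
import asyncio

DEFAULT_POLLING_INTERVAL = 0.03  # seconds


async def poll_until(predicate, timeout: float = 1.0) -> bool:
    waited = 0.0
    while waited < timeout:
        if predicate():
            return True
        await asyncio.sleep(DEFAULT_POLLING_INTERVAL)
        waited += DEFAULT_POLLING_INTERVAL
    return False


if __name__ == "__main__":
    print(asyncio.run(poll_until(lambda: True)))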
@@ -5,6 +5,7 @@ from typing import Optional
import litellm
from litellm._logging import verbose_logger
from litellm.caching.caching import InMemoryCache
+ from litellm.constants import SECRET_MANAGER_REFRESH_INTERVAL
from litellm.integrations.gcs_bucket.gcs_bucket_base import GCSBucketBase
from litellm.llms.custom_httpx.http_handler import _get_httpx_client
from litellm.proxy._types import CommonProxyErrors, KeyManagementSystem

@@ -13,7 +14,7 @@ from litellm.proxy._types import CommonProxyErrors, KeyManagementSystem
class GoogleSecretManager(GCSBucketBase):
def __init__(
self,
- refresh_interval: Optional[int] = 86400,
+ refresh_interval: Optional[int] = SECRET_MANAGER_REFRESH_INTERVAL,
always_read_secret_manager: Optional[bool] = False,
) -> None:
"""
@@ -6,6 +6,7 @@ import httpx
import litellm
from litellm._logging import verbose_logger
from litellm.caching import InMemoryCache
+ from litellm.constants import SECRET_MANAGER_REFRESH_INTERVAL
from litellm.llms.custom_httpx.http_handler import (
_get_httpx_client,
get_async_httpx_client,

@@ -39,8 +40,14 @@ class HashicorpSecretManager(BaseSecretManager):

litellm.secret_manager_client = self
litellm._key_management_system = KeyManagementSystem.HASHICORP_VAULT
- _refresh_interval = os.environ.get("HCP_VAULT_REFRESH_INTERVAL", 86400)
- _refresh_interval = int(_refresh_interval) if _refresh_interval else 86400
+ _refresh_interval = os.environ.get(
+ "HCP_VAULT_REFRESH_INTERVAL", SECRET_MANAGER_REFRESH_INTERVAL
+ )
+ _refresh_interval = (
+ int(_refresh_interval)
+ if _refresh_interval
+ else SECRET_MANAGER_REFRESH_INTERVAL
+ )
self.cache = InMemoryCache(
default_ttl=_refresh_interval
)  # store in memory for 1 day
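Note: behaviour is unchanged here. The HCP_VAULT_REFRESH_INTERVAL environment variable still wins and the constant is only the fallback, but the one-day literal now appears once. The same pattern in isolation; the 86400 value is an assumption carried over from the removed literal.

# refresh_interval_sketch.py -- illustrative env-override-with-constant-fallback pattern;
# assumes SECRET_MANAGER_REFRESH_INTERVAL = 86400 (one day), the literal it replaces.
import os

SECRET_MANAGER_REFRESH_INTERVAL = 86400


def resolve_refresh_interval() -> int:
    raw = os.environ.get("HCP_VAULT_REFRESH_INTERVAL", SECRET_MANAGER_REFRESH_INTERVAL)
    # The env var arrives as a string; the unset case is already an int.
    return int(raw) if raw else SECRET_MANAGER_REFRESH_INTERVAL


if __name__ == "__main__":
    print(resolve_refresh_interval())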
@@ -1,6 +1,8 @@
from enum import Enum
from typing import Optional, TypedDict

+ DD_MAX_BATCH_SIZE = 1000
+

class DataDogStatus(str, Enum):
INFO = "info"

@@ -8,6 +8,10 @@ else:
VertexBase = Any


+ GCS_DEFAULT_BATCH_SIZE = 2048
+ GCS_DEFAULT_FLUSH_INTERVAL_SECONDS = 20
+
+
class GCSLoggingConfig(TypedDict):
"""
Internal LiteLLM Config for GCS Bucket logging

@@ -7,6 +7,9 @@ from pydantic import BaseModel, Field

from litellm.types.utils import LiteLLMPydanticObjectBase

+ SLACK_ALERTING_THRESHOLD_5_PERCENT = 0.05
+ SLACK_ALERTING_THRESHOLD_15_PERCENT = 0.15
+

class BaseOutageModel(TypedDict):
alerts: List[int]
litellm/types/llms/azure.py (new file, +2)
@@ -0,0 +1,2 @@
+ API_VERSION_YEAR_SUPPORTED_RESPONSE_FORMAT = 2024
+ API_VERSION_MONTH_SUPPORTED_RESPONSE_FORMAT = 8
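Note: judging by their names, these two values mark the first Azure OpenAI api-version (2024-08) that supports response_format; the consumer of the constants is not part of this diff, so the sketch below is an assumed usage only.

# azure_api_version_sketch.py -- assumed usage; the real consumer of these constants
# is not shown in this commit.
API_VERSION_YEAR_SUPPORTED_RESPONSE_FORMAT = 2024
API_VERSION_MONTH_SUPPORTED_RESPONSE_FORMAT = 8


def supports_response_format(api_version: str) -> bool:
    # Azure api-version strings look like "2024-08-01-preview".
    year, month = (int(part) for part in api_version.split("-")[:2])
    return (year, month) >= (
        API_VERSION_YEAR_SUPPORTED_RESPONSE_FORMAT,
        API_VERSION_MONTH_SUPPORTED_RESPONSE_FORMAT,
    )


if __name__ == "__main__":
    print(supports_response_format("2024-08-01-preview"))  # True
    print(supports_response_format("2023-07-01-preview"))  # False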
litellm/types/llms/triton.py (new file, +1)
@@ -0,0 +1 @@
litellm/types/passthrough_endpoints/assembly_ai.py (new file, +2)
@@ -0,0 +1,2 @@
+ ASSEMBLY_AI_POLLING_INTERVAL = 10
+ ASSEMBLY_AI_MAX_POLLING_ATTEMPTS = 180
litellm/types/scheduler.py (new file, +7)
@@ -0,0 +1,7 @@
+ from enum import Enum
+
+
+ class DefaultPriorities(Enum):
+     High = 0
+     Medium = 128
+     Low = 255
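Note: the enum previously defined in litellm/scheduler.py now lives with the other typed definitions, and lower values still mean higher priority. A tiny usage sketch; the enum is redeclared here so the snippet runs without litellm installed.

# priority_usage_sketch.py -- illustrative; lower values are scheduled first.
from enum import Enum


class DefaultPriorities(Enum):
    High = 0
    Medium = 128
    Low = 255


def dequeue_first(a: DefaultPriorities, b: DefaultPriorities) -> DefaultPriorities:
    # Pick the item with the numerically smaller priority value.
    return a if a.value <= b.value else b


if __name__ == "__main__":
    print(dequeue_first(DefaultPriorities.Low, DefaultPriorities.High))  # DefaultPriorities.High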
@@ -62,6 +62,16 @@ import litellm.llms.gemini
from litellm.caching._internal_lru_cache import lru_cache_wrapper
from litellm.caching.caching import DualCache
from litellm.caching.caching_handler import CachingHandlerResponse, LLMCachingHandler
+ from litellm.constants import (
+ DEFAULT_MAX_LRU_CACHE_SIZE,
+ DEFAULT_TRIM_RATIO,
+ FUNCTION_DEFINITION_TOKEN_COUNT,
+ INITIAL_RETRY_DELAY,
+ JITTER,
+ MAX_RETRY_DELAY,
+ MINIMUM_PROMPT_CACHE_TOKEN_COUNT,
+ TOOL_CHOICE_OBJECT_TOKEN_COUNT,
+ )
from litellm.integrations.custom_guardrail import CustomGuardrail
from litellm.integrations.custom_logger import CustomLogger
from litellm.litellm_core_utils.core_helpers import (
@@ -1520,7 +1530,7 @@ def _select_tokenizer(
return _select_tokenizer_helper(model=model)


- @lru_cache(maxsize=128)
+ @lru_cache(maxsize=DEFAULT_MAX_LRU_CACHE_SIZE)
def _select_tokenizer_helper(model: str) -> SelectTokenizerResponse:
if litellm.disable_hf_tokenizer_download is True:
return _return_openai_tokenizer(model)
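Note: this decorator and Router._cached_get_model_group_info earlier in the diff previously used different literals (128 and 64); after the refactor both read DEFAULT_MAX_LRU_CACHE_SIZE, so they now share one cache size. A minimal illustration of the pattern; the value here is assumed for the demo.

# lru_cache_constant_sketch.py -- illustrative; the real DEFAULT_MAX_LRU_CACHE_SIZE
# is defined in litellm/constants.py.
from functools import lru_cache

DEFAULT_MAX_LRU_CACHE_SIZE = 128  # assumed demo value


@lru_cache(maxsize=DEFAULT_MAX_LRU_CACHE_SIZE)
def expensive_lookup(key: str) -> str:
    # Repeated calls with the same key return the cached result.
    return key.upper()


if __name__ == "__main__":
    print(expensive_lookup("gpt-4"), expensive_lookup.cache_info())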
@@ -5336,15 +5346,15 @@ def _calculate_retry_after(
if retry_after is not None and 0 < retry_after <= 60:
return retry_after

- initial_retry_delay = 0.5
+ initial_retry_delay = INITIAL_RETRY_DELAY
- max_retry_delay = 8.0
+ max_retry_delay = MAX_RETRY_DELAY
nb_retries = max_retries - remaining_retries

# Apply exponential backoff, but not more than the max.
sleep_seconds = min(initial_retry_delay * pow(2.0, nb_retries), max_retry_delay)

# Apply some jitter, plus-or-minus half a second.
- jitter = 1 - 0.25 * random.random()
+ jitter = JITTER * random.random()
timeout = sleep_seconds * jitter
return timeout if timeout >= min_timeout else min_timeout
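Note: with the constants substituted this is the usual capped exponential backoff with multiplicative jitter. The self-contained version below mirrors the new arithmetic; the constant values are assumptions based on the literals being replaced (JITTER is assumed to be 0.75), so treat it as a sketch rather than litellm's authoritative numbers.

# retry_after_sketch.py -- mirrors the arithmetic shown above; constant values assumed.
import random

INITIAL_RETRY_DELAY = 0.5  # seconds (was the 0.5 literal)
MAX_RETRY_DELAY = 8.0      # seconds (was the 8.0 literal)
JITTER = 0.75              # assumed value for the new multiplicative jitter


def calculate_retry_after(remaining_retries: int, max_retries: int, min_timeout: float = 0.0) -> float:
    nb_retries = max_retries - remaining_retries
    # Exponential backoff, capped at MAX_RETRY_DELAY.
    sleep_seconds = min(INITIAL_RETRY_DELAY * pow(2.0, nb_retries), MAX_RETRY_DELAY)
    # Multiplicative jitter so concurrent clients do not retry in lockstep.
    timeout = sleep_seconds * (JITTER * random.random())
    return timeout if timeout >= min_timeout else min_timeout


if __name__ == "__main__":
    for remaining in (3, 2, 1, 0):
        print(round(calculate_retry_after(remaining, max_retries=3), 3))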
@@ -5670,7 +5680,7 @@ def shorten_message_to_fit_limit(message, tokens_needed, model: Optional[str]):
def trim_messages(
messages,
model: Optional[str] = None,
- trim_ratio: float = 0.75,
+ trim_ratio: float = DEFAULT_TRIM_RATIO,
return_response_tokens: bool = False,
max_tokens=None,
):
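Note: only the default value of trim_ratio moves into a constant here; the public helper itself is unchanged. A usage sketch, assuming litellm is installed and exports trim_messages at the package root; the model name is just an example.

# trim_messages_usage.py -- usage sketch; the default trim_ratio now comes from
# DEFAULT_TRIM_RATIO instead of the 0.75 literal.
from litellm import trim_messages

messages = [
    {"role": "system", "content": "You are a terse assistant."},
    {"role": "user", "content": "Summarise this very long document. " * 500},
]

# With no explicit trim_ratio, the conversation is trimmed to the default share
# of the model's context window.
trimmed = trim_messages(messages, model="gpt-3.5-turbo")
print(len(str(messages)), "->", len(str(trimmed)))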
@@ -6543,7 +6553,7 @@ def is_prompt_caching_valid_prompt(
model=model,
use_default_image_token_count=True,
)
- return token_count >= 1024
+ return token_count >= MINIMUM_PROMPT_CACHE_TOKEN_COUNT
except Exception as e:
verbose_logger.error(f"Error in is_prompt_caching_valid_prompt: {e}")
return False
mypy.ini (+1)
@@ -3,6 +3,7 @@ warn_return_any = False
ignore_missing_imports = True
mypy_path = litellm/stubs
namespace_packages = True
+ disable_error_code = valid-type

[mypy-google.*]
ignore_missing_imports = True
tests/code_coverage_tests/ban_constant_numbers.py (new file, +152)
@@ -0,0 +1,152 @@
import sys
import ast
import os

# Extremely restrictive set of allowed numbers
ALLOWED_NUMBERS = {
    0,
    1,
    -1,
    2,
    10,
    100,
    1000,
    4,
    3,
    500,
    6,
    60,
    3600,
    0.75,
    7,
    1024,
    1011,
    600,
    12,
    1000000000.0,
    0.1,
    50,
    128,
    6000,
    30,
    1000000,
    5,
    15,
    25,
    10000,
    60000,
    8,
    2048,
    16000000000,
    16,
    16383,
    14,
    24,
    128000,
    0.01,
    20,
}

# Add all standard HTTP status codes
HTTP_STATUS_CODES = {
    200,  # OK
    201,  # Created
    202,  # Accepted
    204,  # No Content
    300,  # Multiple Choices
    301,  # Moved Permanently
    302,  # Found
    303,  # See Other
    304,  # Not Modified
    307,  # Temporary Redirect
    308,  # Permanent Redirect
    400,  # Bad Request
    401,  # Unauthorized
    402,  # Payment Required
    403,  # Forbidden
    404,  # Not Found
    406,  # Not Acceptable
    408,  # Request Timeout
    409,  # Conflict
    413,  # Payload Too Large
    422,  # Unprocessable Entity
    424,  # Failed Dependency
    429,  # Too Many Requests
    498,  # Invalid Token
    499,  # Client Closed Request
    500,  # Internal Server Error
    501,  # Not Implemented
    502,  # Bad Gateway
    503,  # Service Unavailable
    504,  # Gateway Timeout
    520,  # Web server is returning an unknown error
    522,  # Connection timed out
    524,  # A timeout occurred
    529,  # Site is overloaded
}

# Combine the sets
ALLOWED_NUMBERS = ALLOWED_NUMBERS.union(HTTP_STATUS_CODES)


class HardcodedNumberFinder(ast.NodeVisitor):
    def __init__(self):
        self.hardcoded_numbers = []

    def visit_Constant(self, node):
        # For Python 3.8+
        if isinstance(node.value, (int, float)) and node.value not in ALLOWED_NUMBERS:
            self.hardcoded_numbers.append((node.lineno, node.value))
        self.generic_visit(node)

    def visit_Num(self, node):
        # For older Python versions
        if node.n not in ALLOWED_NUMBERS:
            self.hardcoded_numbers.append((node.lineno, node.n))
        self.generic_visit(node)


def check_file(filename):
    try:
        with open(filename, "r") as f:
            content = f.read()

        tree = ast.parse(content)
        finder = HardcodedNumberFinder()
        finder.visit(tree)

        if finder.hardcoded_numbers:
            print(f"ERROR in {filename}: Hardcoded numbers detected:")
            for line, value in finder.hardcoded_numbers:
                print(f"  Line {line}: {value}")
            return 1
        return 0
    except SyntaxError:
        print(f"Syntax error in {filename}")
        return 0


def main():
    exit_code = 0
    folder = "../../litellm"
    ignore_files = [
        "constants.py",
        "proxy_cli.py",
        "token_counter.py",
        "mock_functions.py",
        "duration_parser.py",
        "utils.py",
    ]
    ignore_folder = "types"
    for root, dirs, files in os.walk(folder):
        for filename in files:
            if filename.endswith(".py") and filename not in ignore_files:
                full_path = os.path.join(root, filename)
                if ignore_folder in full_path:
                    continue
                exit_code |= check_file(full_path)
    sys.exit(exit_code)


if __name__ == "__main__":
    main()
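Note: the new CI hook walks the litellm package and fails when any .py file outside its ignore lists contains a numeric literal that is not allow-listed, which is what forces the constants migration above. A quick way to see the same AST approach in action on a string instead of the repository; the trimmed allow-list below is for the demo only, not the script's real set.

# ban_check_demo.py -- illustrative use of the same ast.Constant walk as the new test.
import ast

ALLOWED_NUMBERS = {0, 1, -1, 2, 10, 100}  # trimmed allow-list for the demo

snippet = "timeout = 600\nretries = 2\n"

flagged = [
    (node.lineno, node.value)
    for node in ast.walk(ast.parse(snippet))
    if isinstance(node, ast.Constant)
    and isinstance(node.value, (int, float))
    and node.value not in ALLOWED_NUMBERS
]
print(flagged)  # [(1, 600)] -- 600 is not in the demo allow-list, 2 is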
tests/code_coverage_tests/log.txt (new empty file)