mirror of
https://github.com/BerriAI/litellm.git
synced 2025-04-26 11:14:04 +00:00
(bug fix) SpendLogs update DB catch all possible DB errors for retrying (#7082)
* catch DB_CONNECTION_ERROR_TYPES
* fix DB retry mechanism for SpendLog updates
* use DB_CONNECTION_ERROR_TYPES in auth checks
* fix exp back off for writing SpendLogs
* use _raise_failed_update_spend_exception to ensure errors print as NON blocking
* test_update_spend_logs_multiple_batches_with_failure
This commit is contained in:
parent
6ec920d0b4
commit
b78eb6654d
4 changed files with 377 additions and 139 deletions
|
@ -31,7 +31,11 @@ from litellm.litellm_core_utils.duration_parser import (
|
|||
duration_in_seconds,
|
||||
get_last_day_of_month,
|
||||
)
|
||||
from litellm.proxy._types import ProxyErrorTypes, ProxyException
|
||||
from litellm.proxy._types import (
|
||||
DB_CONNECTION_ERROR_TYPES,
|
||||
ProxyErrorTypes,
|
||||
ProxyException,
|
||||
)
|
||||
|
||||
try:
|
||||
import backoff
|
||||
|
@ -2591,30 +2595,17 @@ async def update_spend( # noqa: PLR0915
|
|||
{}
|
||||
) # Clear the remaining transactions after processing all batches in the loop.
|
||||
break
|
||||
except httpx.ReadTimeout:
|
||||
except DB_CONNECTION_ERROR_TYPES as e:
|
||||
if i >= n_retry_times: # If we've reached the maximum number of retries
|
||||
raise # Re-raise the last exception
|
||||
_raise_failed_update_spend_exception(
|
||||
e=e, start_time=start_time, proxy_logging_obj=proxy_logging_obj
|
||||
)
|
||||
# Optionally, sleep for a bit before retrying
|
||||
await asyncio.sleep(2**i) # Exponential backoff
|
||||
except Exception as e:
|
||||
import traceback
|
||||
|
||||
error_msg = (
|
||||
f"LiteLLM Prisma Client Exception - update user spend: {str(e)}"
|
||||
_raise_failed_update_spend_exception(
|
||||
e=e, start_time=start_time, proxy_logging_obj=proxy_logging_obj
|
||||
)
|
||||
print_verbose(error_msg)
|
||||
error_traceback = error_msg + "\n" + traceback.format_exc()
|
||||
end_time = time.time()
|
||||
_duration = end_time - start_time
|
||||
asyncio.create_task(
|
||||
proxy_logging_obj.failure_handler(
|
||||
original_exception=e,
|
||||
duration=_duration,
|
||||
call_type="update_spend",
|
||||
traceback_str=error_traceback,
|
||||
)
|
||||
)
|
||||
raise e
|
||||
|
||||
### UPDATE END-USER TABLE ###
|
||||
verbose_proxy_logger.debug(
|
||||
|
@ -2652,30 +2643,17 @@ async def update_spend( # noqa: PLR0915
|
|||
{}
|
||||
) # Clear the remaining transactions after processing all batches in the loop.
|
||||
break
|
||||
except httpx.ReadTimeout:
|
||||
except DB_CONNECTION_ERROR_TYPES as e:
|
||||
if i >= n_retry_times: # If we've reached the maximum number of retries
|
||||
raise # Re-raise the last exception
|
||||
_raise_failed_update_spend_exception(
|
||||
e=e, start_time=start_time, proxy_logging_obj=proxy_logging_obj
|
||||
)
|
||||
# Optionally, sleep for a bit before retrying
|
||||
await asyncio.sleep(2**i) # Exponential backoff
|
||||
except Exception as e:
|
||||
import traceback
|
||||
|
||||
error_msg = (
|
||||
f"LiteLLM Prisma Client Exception - update end-user spend: {str(e)}"
|
||||
_raise_failed_update_spend_exception(
|
||||
e=e, start_time=start_time, proxy_logging_obj=proxy_logging_obj
|
||||
)
|
||||
print_verbose(error_msg)
|
||||
error_traceback = error_msg + "\n" + traceback.format_exc()
|
||||
end_time = time.time()
|
||||
_duration = end_time - start_time
|
||||
asyncio.create_task(
|
||||
proxy_logging_obj.failure_handler(
|
||||
original_exception=e,
|
||||
duration=_duration,
|
||||
call_type="update_spend",
|
||||
traceback_str=error_traceback,
|
||||
)
|
||||
)
|
||||
raise e
|
||||
|
||||
### UPDATE KEY TABLE ###
|
||||
verbose_proxy_logger.debug(
|
||||
|
@ -2703,30 +2681,17 @@ async def update_spend( # noqa: PLR0915
|
|||
{}
|
||||
) # Clear the remaining transactions after processing all batches in the loop.
|
||||
break
|
||||
except httpx.ReadTimeout:
|
||||
except DB_CONNECTION_ERROR_TYPES as e:
|
||||
if i >= n_retry_times: # If we've reached the maximum number of retries
|
||||
raise # Re-raise the last exception
|
||||
_raise_failed_update_spend_exception(
|
||||
e=e, start_time=start_time, proxy_logging_obj=proxy_logging_obj
|
||||
)
|
||||
# Optionally, sleep for a bit before retrying
|
||||
await asyncio.sleep(2**i) # Exponential backoff
|
||||
except Exception as e:
|
||||
import traceback
|
||||
|
||||
error_msg = (
|
||||
f"LiteLLM Prisma Client Exception - update key spend: {str(e)}"
|
||||
_raise_failed_update_spend_exception(
|
||||
e=e, start_time=start_time, proxy_logging_obj=proxy_logging_obj
|
||||
)
|
||||
print_verbose(error_msg)
|
||||
error_traceback = error_msg + "\n" + traceback.format_exc()
|
||||
end_time = time.time()
|
||||
_duration = end_time - start_time
|
||||
asyncio.create_task(
|
||||
proxy_logging_obj.failure_handler(
|
||||
original_exception=e,
|
||||
duration=_duration,
|
||||
call_type="update_spend",
|
||||
traceback_str=error_traceback,
|
||||
)
|
||||
)
|
||||
raise e
|
||||
|
||||
### UPDATE TEAM TABLE ###
|
||||
verbose_proxy_logger.debug(
|
||||
|
@ -2759,30 +2724,17 @@ async def update_spend( # noqa: PLR0915
|
|||
{}
|
||||
) # Clear the remaining transactions after processing all batches in the loop.
|
||||
break
|
||||
except httpx.ReadTimeout:
|
||||
except DB_CONNECTION_ERROR_TYPES as e:
|
||||
if i >= n_retry_times: # If we've reached the maximum number of retries
|
||||
raise # Re-raise the last exception
|
||||
_raise_failed_update_spend_exception(
|
||||
e=e, start_time=start_time, proxy_logging_obj=proxy_logging_obj
|
||||
)
|
||||
# Optionally, sleep for a bit before retrying
|
||||
await asyncio.sleep(2**i) # Exponential backoff
|
||||
except Exception as e:
|
||||
import traceback
|
||||
|
||||
error_msg = (
|
||||
f"LiteLLM Prisma Client Exception - update team spend: {str(e)}"
|
||||
_raise_failed_update_spend_exception(
|
||||
e=e, start_time=start_time, proxy_logging_obj=proxy_logging_obj
|
||||
)
|
||||
print_verbose(error_msg)
|
||||
error_traceback = error_msg + "\n" + traceback.format_exc()
|
||||
end_time = time.time()
|
||||
_duration = end_time - start_time
|
||||
asyncio.create_task(
|
||||
proxy_logging_obj.failure_handler(
|
||||
original_exception=e,
|
||||
duration=_duration,
|
||||
call_type="update_spend",
|
||||
traceback_str=error_traceback,
|
||||
)
|
||||
)
|
||||
raise e
|
||||
|
||||
### UPDATE TEAM Membership TABLE with spend ###
|
||||
if len(prisma_client.team_member_list_transactons.keys()) > 0:
|
||||
|
@ -2809,30 +2761,17 @@ async def update_spend( # noqa: PLR0915
|
|||
{}
|
||||
) # Clear the remaining transactions after processing all batches in the loop.
|
||||
break
|
||||
except httpx.ReadTimeout:
|
||||
except DB_CONNECTION_ERROR_TYPES as e:
|
||||
if i >= n_retry_times: # If we've reached the maximum number of retries
|
||||
raise # Re-raise the last exception
|
||||
_raise_failed_update_spend_exception(
|
||||
e=e, start_time=start_time, proxy_logging_obj=proxy_logging_obj
|
||||
)
|
||||
# Optionally, sleep for a bit before retrying
|
||||
await asyncio.sleep(2**i) # Exponential backoff
|
||||
except Exception as e:
|
||||
import traceback
|
||||
|
||||
error_msg = (
|
||||
f"LiteLLM Prisma Client Exception - update team spend: {str(e)}"
|
||||
_raise_failed_update_spend_exception(
|
||||
e=e, start_time=start_time, proxy_logging_obj=proxy_logging_obj
|
||||
)
|
||||
print_verbose(error_msg)
|
||||
error_traceback = error_msg + "\n" + traceback.format_exc()
|
||||
end_time = time.time()
|
||||
_duration = end_time - start_time
|
||||
asyncio.create_task(
|
||||
proxy_logging_obj.failure_handler(
|
||||
original_exception=e,
|
||||
duration=_duration,
|
||||
call_type="update_spend",
|
||||
traceback_str=error_traceback,
|
||||
)
|
||||
)
|
||||
raise e
|
||||
|
||||
### UPDATE ORG TABLE ###
|
||||
if len(prisma_client.org_list_transactons.keys()) > 0:
|
||||
|
@ -2855,30 +2794,17 @@ async def update_spend( # noqa: PLR0915
|
|||
{}
|
||||
) # Clear the remaining transactions after processing all batches in the loop.
|
||||
break
|
||||
except httpx.ReadTimeout:
|
||||
except DB_CONNECTION_ERROR_TYPES as e:
|
||||
if i >= n_retry_times: # If we've reached the maximum number of retries
|
||||
raise # Re-raise the last exception
|
||||
_raise_failed_update_spend_exception(
|
||||
e=e, start_time=start_time, proxy_logging_obj=proxy_logging_obj
|
||||
)
|
||||
# Optionally, sleep for a bit before retrying
|
||||
await asyncio.sleep(2**i) # Exponential backoff
|
||||
except Exception as e:
|
||||
import traceback
|
||||
|
||||
error_msg = (
|
||||
f"LiteLLM Prisma Client Exception - update org spend: {str(e)}"
|
||||
_raise_failed_update_spend_exception(
|
||||
e=e, start_time=start_time, proxy_logging_obj=proxy_logging_obj
|
||||
)
|
||||
print_verbose(error_msg)
|
||||
error_traceback = error_msg + "\n" + traceback.format_exc()
|
||||
end_time = time.time()
|
||||
_duration = end_time - start_time
|
||||
asyncio.create_task(
|
||||
proxy_logging_obj.failure_handler(
|
||||
original_exception=e,
|
||||
duration=_duration,
|
||||
call_type="update_spend",
|
||||
traceback_str=error_traceback,
|
||||
)
|
||||
)
|
||||
raise e
|
||||
|
||||
### UPDATE SPEND LOGS ###
|
||||
verbose_proxy_logger.debug(
|
||||
|
@ -2889,7 +2815,7 @@ async def update_spend( # noqa: PLR0915
|
|||
MAX_LOGS_PER_INTERVAL = 1000 # Maximum number of logs to flush in a single interval
|
||||
|
||||
if len(prisma_client.spend_log_transactions) > 0:
|
||||
for _ in range(n_retry_times + 1):
|
||||
for i in range(n_retry_times + 1):
|
||||
start_time = time.time()
|
||||
try:
|
||||
base_url = os.getenv("SPEND_LOGS_URL", None)
|
||||
|
@ -2913,9 +2839,9 @@ async def update_spend( # noqa: PLR0915
|
|||
logs_to_process = prisma_client.spend_log_transactions[
|
||||
:MAX_LOGS_PER_INTERVAL
|
||||
]
|
||||
for i in range(0, len(logs_to_process), BATCH_SIZE):
|
||||
for j in range(0, len(logs_to_process), BATCH_SIZE):
|
||||
# Create sublist for current batch, ensuring it doesn't exceed the BATCH_SIZE
|
||||
batch = logs_to_process[i : i + BATCH_SIZE]
|
||||
batch = logs_to_process[j : j + BATCH_SIZE]
|
||||
|
||||
# Convert datetime strings to Date objects
|
||||
batch_with_dates = [
|
||||
|
@ -2943,32 +2869,50 @@ async def update_spend( # noqa: PLR0915
|
|||
f"{len(logs_to_process)} logs processed. Remaining in queue: {len(prisma_client.spend_log_transactions)}"
|
||||
)
|
||||
break
|
||||
except httpx.ReadTimeout:
|
||||
except DB_CONNECTION_ERROR_TYPES as e:
|
||||
if i is None:
|
||||
i = 0
|
||||
if i >= n_retry_times: # If we've reached the maximum number of retries
|
||||
raise # Re-raise the last exception
|
||||
if (
|
||||
i >= n_retry_times
|
||||
): # If we've reached the maximum number of retries raise the exception
|
||||
_raise_failed_update_spend_exception(
|
||||
e=e, start_time=start_time, proxy_logging_obj=proxy_logging_obj
|
||||
)
|
||||
|
||||
# Optionally, sleep for a bit before retrying
|
||||
await asyncio.sleep(2**i) # type: ignore
|
||||
except Exception as e:
|
||||
import traceback
|
||||
_raise_failed_update_spend_exception(
|
||||
e=e, start_time=start_time, proxy_logging_obj=proxy_logging_obj
|
||||
)
|
||||
|
||||
error_msg = (
|
||||
f"LiteLLM Prisma Client Exception - update spend logs: {str(e)}"
|
||||
)
|
||||
print_verbose(error_msg)
|
||||
error_traceback = error_msg + "\n" + traceback.format_exc()
|
||||
end_time = time.time()
|
||||
_duration = end_time - start_time
|
||||
asyncio.create_task(
|
||||
proxy_logging_obj.failure_handler(
|
||||
original_exception=e,
|
||||
duration=_duration,
|
||||
call_type="update_spend",
|
||||
traceback_str=error_traceback,
|
||||
)
|
||||
)
|
||||
raise e
|
||||
|
||||
def _raise_failed_update_spend_exception(
|
||||
e: Exception, start_time: float, proxy_logging_obj: ProxyLogging
|
||||
):
|
||||
"""
|
||||
Raise an exception for failed update spend logs
|
||||
|
||||
- Calls proxy_logging_obj.failure_handler to log the error
|
||||
- Ensures error messages says "Non-Blocking"
|
||||
"""
|
||||
import traceback
|
||||
|
||||
error_msg = (
|
||||
f"[Non-Blocking]LiteLLM Prisma Client Exception - update spend logs: {str(e)}"
|
||||
)
|
||||
error_traceback = error_msg + "\n" + traceback.format_exc()
|
||||
end_time = time.time()
|
||||
_duration = end_time - start_time
|
||||
asyncio.create_task(
|
||||
proxy_logging_obj.failure_handler(
|
||||
original_exception=e,
|
||||
duration=_duration,
|
||||
call_type="update_spend",
|
||||
traceback_str=error_traceback,
|
||||
)
|
||||
)
|
||||
raise e
|
||||
|
||||
|
||||
def _is_projected_spend_over_limit(
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue