forked from phoenix/litellm-mirror

Merge pull request #1356 from BerriAI/litellm_improve_proxy_logs

[Feat] Improve Proxy Logging

Commit a70626d6e9: 9 changed files with 323 additions and 174 deletions
@@ -82,6 +82,15 @@ Cli arguments, --host, --port, --num_workers
 litellm --debug
 ```

+#### --detailed_debug
+- **Default:** `False`
+- **Type:** `bool` (Flag)
+- Enable detailed debugging mode for the input.
+- **Usage:**
+```shell
+litellm --detailed_debug
+```
+
 #### --temperature
 - **Default:** `None`
 - **Type:** `float`
@@ -947,6 +947,13 @@ Run the proxy with `--debug` to easily view debug logs
 litellm --model gpt-3.5-turbo --debug
 ```

+### Detailed Debug Logs
+
+Run the proxy with `--detailed_debug` to view detailed debug logs
+```shell
+litellm --model gpt-3.5-turbo --detailed_debug
+```
+
 When making requests you should see the POST request sent by LiteLLM to the LLM on the Terminal output
 ```shell
 POST Request Sent from LiteLLM:
@@ -1281,6 +1288,14 @@ LiteLLM proxy adds **0.00325 seconds** latency as compared to using the Raw Open
 ```shell
 litellm --debug
 ```
+#### --detailed_debug
+- **Default:** `False`
+- **Type:** `bool` (Flag)
+- Enable detailed debugging mode for the input.
+- **Usage:**
+```shell
+litellm --detailed_debug
+```

 #### --temperature
 - **Default:** `None`
@@ -1,5 +1,17 @@
+import logging
+
 set_verbose = False

+# Create a handler for the logger (you may need to adapt this based on your needs)
+handler = logging.StreamHandler()
+handler.setLevel(logging.DEBUG)
+
+# Create a formatter and set it for the handler
+
+formatter = logging.Formatter("\033[92m%(name)s - %(levelname)s\033[0m: %(message)s")
+
+handler.setFormatter(formatter)
+

 def print_verbose(print_statement):
     try:

@@ -7,3 +19,11 @@ def print_verbose(print_statement):
         print(print_statement)  # noqa
     except:
         pass
+
+
+verbose_proxy_logger = logging.getLogger("LiteLLM Proxy")
+verbose_router_logger = logging.getLogger("LiteLLM Router")
+verbose_logger = logging.getLogger("LiteLLM")
+
+# Add the handler to the logger
+verbose_router_logger.addHandler(handler)
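The loggers defined above (`verbose_proxy_logger`, `verbose_router_logger`, `verbose_logger`) are plain `logging.Logger` objects, and only the router logger gets the coloured stream handler attached in this file. A minimal sketch of wiring them up when using litellm as a library rather than through the proxy CLI; attaching the shared `handler` to the proxy logger is an assumption on top of what the PR itself does:

```python
import logging

from litellm._logging import handler, verbose_proxy_logger, verbose_router_logger

# Roughly what the proxy's `--debug` flag selects for the router logger;
# use logging.DEBUG instead for the `--detailed_debug` equivalent.
verbose_router_logger.setLevel(logging.INFO)

# Assumption: the PR only attaches the stream handler to the router logger,
# so add it to the proxy logger manually if you also want its output.
verbose_proxy_logger.addHandler(handler)
verbose_proxy_logger.setLevel(logging.DEBUG)
```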
@@ -88,6 +88,13 @@ def is_port_in_use(port):
 @click.option(
     "--debug", default=False, is_flag=True, type=bool, help="To debug the input"
 )
+@click.option(
+    "--detailed_debug",
+    default=False,
+    is_flag=True,
+    type=bool,
+    help="To view detailed debug logs",
+)
 @click.option(
     "--use_queue",
     default=False,

@@ -139,12 +146,6 @@ def is_port_in_use(port):
     type=bool,
     help="Print LiteLLM version",
 )
-@click.option(
-    "--logs",
-    flag_value=False,
-    type=int,
-    help='Gets the "n" most recent logs. By default gets most recent log.',
-)
 @click.option(
     "--health",
     flag_value=True,

@@ -179,6 +180,7 @@ def run_server(
     headers,
     save,
     debug,
+    detailed_debug,
     temperature,
     max_tokens,
     request_timeout,

@@ -187,7 +189,6 @@ def run_server(
     config,
     max_budget,
     telemetry,
-    logs,
     test,
     local,
     num_workers,

@@ -212,32 +213,6 @@ def run_server(
         # this is just a local/relative import error, user git cloned litellm
         from proxy_server import app, save_worker_config, usage_telemetry
     feature_telemetry = usage_telemetry
-    if logs is not None:
-        if logs == 0:  # default to 1
-            logs = 1
-        try:
-            with open("api_log.json") as f:
-                data = json.load(f)
-
-                # convert keys to datetime objects
-                log_times = {
-                    datetime.strptime(k, "%Y%m%d%H%M%S%f"): v for k, v in data.items()
-                }
-
-                # sort by timestamp
-                sorted_times = sorted(
-                    log_times.items(), key=operator.itemgetter(0), reverse=True
-                )
-
-                # get n recent logs
-                recent_logs = {
-                    k.strftime("%Y%m%d%H%M%S%f"): v for k, v in sorted_times[:logs]
-                }
-
-                print(json.dumps(recent_logs, indent=4))  # noqa
-        except:
-            raise Exception("LiteLLM: No logs saved!")
-        return
     if version == True:
         pkg_version = importlib.metadata.version("litellm")
         click.echo(f"\nLiteLLM: Current Version = {pkg_version}\n")

@@ -377,6 +352,7 @@ def run_server(
         api_base=api_base,
         api_version=api_version,
         debug=debug,
+        detailed_debug=detailed_debug,
         temperature=temperature,
         max_tokens=max_tokens,
         request_timeout=request_timeout,
@@ -1,5 +1,5 @@
 model_list:
-  - model_name: Azure OpenAI GPT-4 Canada-East (External)
+  - model_name: azure-gpt-3.5
     litellm_params:
       model: azure/chatgpt-v-2
      api_base: os.environ/AZURE_API_BASE

@@ -49,6 +49,7 @@ model_list:
      mode: completion

 litellm_settings:
+  fallbacks: [{"openai-gpt-3.5": ["azure-gpt-3.5"]}]
   # cache: True
   # setting callback class
   # callbacks: custom_callbacks.proxy_handler_instance # sets litellm.callbacks = [proxy_handler_instance]
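The new `fallbacks` entry in `litellm_settings` pairs with the fallback handling added to the `Router` later in this diff: when a call to the `openai-gpt-3.5` group fails, the router retries against `azure-gpt-3.5`, and with this PR it logs the switch (or the absence of a matching fallback group) at INFO level. A minimal sketch of the same pair configured on `litellm.Router` directly; the model parameters and key handling here are illustrative placeholders, not values from the PR:

```python
import os

from litellm import Router

# Hedged sketch: the same fallback pair as the YAML above, configured in code.
router = Router(
    model_list=[
        {
            "model_name": "openai-gpt-3.5",
            "litellm_params": {
                "model": "gpt-3.5-turbo",
                "api_key": os.environ.get("OPENAI_API_KEY"),
            },
        },
        {
            "model_name": "azure-gpt-3.5",
            "litellm_params": {
                "model": "azure/chatgpt-v-2",
                "api_base": os.environ.get("AZURE_API_BASE"),
                "api_key": os.environ.get("AZURE_API_KEY"),
            },
        },
    ],
    fallbacks=[{"openai-gpt-3.5": ["azure-gpt-3.5"]}],  # same shape as the YAML setting
)

# Requests name the primary group; on failure the router retries the fallback group.
response = router.completion(
    model="openai-gpt-3.5",
    messages=[{"role": "user", "content": "hello"}],
)
```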
@@ -19,6 +19,7 @@ try:
     import backoff
     import yaml
     import orjson
+    import logging
 except ImportError as e:
     raise ImportError(f"Missing dependency {e}. Run `pip install 'litellm[proxy]'`")

@@ -77,6 +78,7 @@ import pydantic
 from litellm.proxy._types import *
 from litellm.caching import DualCache
 from litellm.proxy.health_check import perform_health_check
+from litellm._logging import verbose_router_logger, verbose_proxy_logger

 litellm.suppress_debug_info = True
 from fastapi import (

@@ -153,16 +155,7 @@ proxy_logging_obj = ProxyLogging(user_api_key_cache=user_api_key_cache)
 async_result = None
 celery_app_conn = None
 celery_fn = None  # Redis Queue for handling requests
-
-#### HELPER FUNCTIONS ####
-def print_verbose(print_statement):
-    try:
-        global user_debug
-        if user_debug:
-            print(print_statement)  # noqa
-    except:
-        pass
+### logger ###


 def usage_telemetry(

@@ -245,24 +238,26 @@ async def user_api_key_auth(

         ## check for cache hit (In-Memory Cache)
         valid_token = user_api_key_cache.get_cache(key=api_key)
-        print_verbose(f"valid_token from cache: {valid_token}")
+        verbose_proxy_logger.debug(f"valid_token from cache: {valid_token}")
         if valid_token is None:
             ## check db
-            print_verbose(f"api key: {api_key}")
+            verbose_proxy_logger.debug(f"api key: {api_key}")
             valid_token = await prisma_client.get_data(
                 token=api_key, expires=datetime.utcnow().replace(tzinfo=timezone.utc)
             )
-            print_verbose(f"valid token from prisma: {valid_token}")
+            verbose_proxy_logger.debug(f"valid token from prisma: {valid_token}")
             user_api_key_cache.set_cache(key=api_key, value=valid_token, ttl=60)
         elif valid_token is not None:
-            print_verbose(f"API Key Cache Hit!")
+            verbose_proxy_logger.debug(f"API Key Cache Hit!")
         if valid_token:
             litellm.model_alias_map = valid_token.aliases
             config = valid_token.config
             if config != {}:
                 model_list = config.get("model_list", [])
                 llm_model_list = model_list
-                print_verbose(f"\n new llm router model list {llm_model_list}")
+                verbose_proxy_logger.debug(
+                    f"\n new llm router model list {llm_model_list}"
+                )
             if (
                 len(valid_token.models) == 0
             ):  # assume an empty model list means all models are allowed to be called

@@ -296,7 +291,7 @@ async def user_api_key_auth(
         else:
             raise Exception(f"Invalid token")
     except Exception as e:
-        # print_verbose(f"An exception occurred - {traceback.format_exc()}")
+        # verbose_proxy_logger.debug(f"An exception occurred - {traceback.format_exc()}")
         traceback.print_exc()
         if isinstance(e, HTTPException):
             raise e

@@ -316,7 +311,7 @@ def prisma_setup(database_url: Optional[str]):
             database_url=database_url, proxy_logging_obj=proxy_logging_obj
         )
     except Exception as e:
-        print_verbose(
+        verbose_proxy_logger.debug(
             f"Error when initializing prisma, Ensure you run pip install prisma {str(e)}"
         )

@@ -358,7 +353,7 @@ def load_from_azure_key_vault(use_azure_key_vault: bool = False):
                 f"Missing KVUri or client_id or client_secret or tenant_id from environment"
             )
     except Exception as e:
-        print_verbose(
+        verbose_proxy_logger.debug(
             "Error when loading keys from Azure Key Vault. Ensure you run `pip install azure-identity azure-keyvault-secrets`"
         )

@@ -367,7 +362,7 @@ def cost_tracking():
     global prisma_client
     if prisma_client is not None:
         if isinstance(litellm.success_callback, list):
-            print_verbose("setting litellm success callback to track cost")
+            verbose_proxy_logger.debug("setting litellm success callback to track cost")
             if (track_cost_callback) not in litellm.success_callback:  # type: ignore
                 litellm.success_callback.append(track_cost_callback)  # type: ignore

@@ -381,7 +376,7 @@ async def track_cost_callback(
     global prisma_client
     try:
         # check if it has collected an entire stream response
-        print_verbose(
+        verbose_proxy_logger.debug(
             f"kwargs stream: {kwargs.get('stream', None)} + complete streaming response: {kwargs.get('complete_streaming_response', None)}"
         )
         if "complete_streaming_response" in kwargs:

@@ -390,7 +385,7 @@ async def track_cost_callback(
             response_cost = litellm.completion_cost(
                 completion_response=completion_response
             )
-            print_verbose(f"streaming response_cost {response_cost}")
+            verbose_proxy_logger.debug(f"streaming response_cost {response_cost}")
             user_api_key = kwargs["litellm_params"]["metadata"].get(
                 "user_api_key", None
             )

@@ -416,12 +411,14 @@ async def track_cost_callback(
                     token=user_api_key, response_cost=response_cost, user_id=user_id
                 )
     except Exception as e:
-        print_verbose(f"error in tracking cost callback - {str(e)}")
+        verbose_proxy_logger.debug(f"error in tracking cost callback - {str(e)}")


 async def update_prisma_database(token, response_cost, user_id=None):
     try:
-        print_verbose(f"Enters prisma db call, token: {token}; user_id: {user_id}")
+        verbose_proxy_logger.debug(
+            f"Enters prisma db call, token: {token}; user_id: {user_id}"
+        )

         ### UPDATE USER SPEND ###
         async def _update_user_db():

@@ -436,7 +433,7 @@ async def update_prisma_database(token, response_cost, user_id=None):
             # Calculate the new cost by adding the existing cost and response_cost
             new_spend = existing_spend + response_cost

-            print_verbose(f"new cost: {new_spend}")
+            verbose_proxy_logger.debug(f"new cost: {new_spend}")
             # Update the cost column for the given user id
             await prisma_client.update_data(user_id=user_id, data={"spend": new_spend})

@@ -444,7 +441,7 @@ async def update_prisma_database(token, response_cost, user_id=None):
         async def _update_key_db():
             # Fetch the existing cost for the given token
             existing_spend_obj = await prisma_client.get_data(token=token)
-            print_verbose(f"existing spend: {existing_spend_obj}")
+            verbose_proxy_logger.debug(f"existing spend: {existing_spend_obj}")
             if existing_spend_obj is None:
                 existing_spend = 0
             else:

@@ -452,7 +449,7 @@ async def update_prisma_database(token, response_cost, user_id=None):
             # Calculate the new cost by adding the existing cost and response_cost
             new_spend = existing_spend + response_cost

-            print_verbose(f"new cost: {new_spend}")
+            verbose_proxy_logger.debug(f"new cost: {new_spend}")
             # Update the cost column for the given token
             await prisma_client.update_data(token=token, data={"spend": new_spend})

@@ -461,7 +458,9 @@ async def update_prisma_database(token, response_cost, user_id=None):
         tasks.append(_update_key_db())
         await asyncio.gather(*tasks)
     except Exception as e:
-        print_verbose(f"Error updating Prisma database: {traceback.format_exc()}")
+        verbose_proxy_logger.debug(
+            f"Error updating Prisma database: {traceback.format_exc()}"
+        )
         pass


@@ -472,7 +471,7 @@ def run_ollama_serve():
         with open(os.devnull, "w") as devnull:
             process = subprocess.Popen(command, stdout=devnull, stderr=devnull)
     except Exception as e:
-        print_verbose(
+        verbose_proxy_logger.debug(
             f"""
             LiteLLM Warning: proxy started with `ollama` model\n`ollama serve` failed with Exception{e}. \nEnsure you run `ollama serve`
         """

@@ -612,7 +611,7 @@ class ProxyConfig:
         printed_yaml = copy.deepcopy(config)
         printed_yaml.pop("environment_variables", None)

-        print_verbose(
+        verbose_proxy_logger.debug(
             f"Loaded config YAML (api_key and environment_variables are not shown):\n{json.dumps(printed_yaml, indent=2)}"
         )

@@ -643,7 +642,7 @@ class ProxyConfig:

                 cache_type = cache_params.get("type", "redis")

-                print_verbose(f"passed cache type={cache_type}")
+                verbose_proxy_logger.debug(f"passed cache type={cache_type}")

                 if cache_type == "redis":
                     cache_host = litellm.get_secret("REDIS_HOST", None)

@@ -687,14 +686,16 @@ class ProxyConfig:
                     litellm.callbacks = [
                         get_instance_fn(value=value, config_file_path=config_file_path)
                     ]
-                    print_verbose(
+                    verbose_proxy_logger.debug(
                         f"{blue_color_code} Initialized Callbacks - {litellm.callbacks} {reset_color_code}"
                     )
                 elif key == "post_call_rules":
                     litellm.post_call_rules = [
                         get_instance_fn(value=value, config_file_path=config_file_path)
                     ]
-                    print_verbose(f"litellm.post_call_rules: {litellm.post_call_rules}")
+                    verbose_proxy_logger.debug(
+                        f"litellm.post_call_rules: {litellm.post_call_rules}"
+                    )
                 elif key == "success_callback":
                     litellm.success_callback = []

@@ -708,7 +709,7 @@ class ProxyConfig:
                         # these are litellm callbacks - "langfuse", "sentry", "wandb"
                         else:
                             litellm.success_callback.append(callback)
-                    print_verbose(
+                    verbose_proxy_logger.debug(
                         f"{blue_color_code} Initialized Success Callbacks - {litellm.success_callback} {reset_color_code}"
                     )
                 elif key == "failure_callback":

@@ -724,7 +725,7 @@ class ProxyConfig:
                         # these are litellm callbacks - "langfuse", "sentry", "wandb"
                         else:
                             litellm.failure_callback.append(callback)
-                    print_verbose(
+                    verbose_proxy_logger.debug(
                         f"{blue_color_code} Initialized Success Callbacks - {litellm.failure_callback} {reset_color_code}"
                     )
                 elif key == "cache_params":

@@ -764,9 +765,9 @@ class ProxyConfig:
         ### CONNECT TO DATABASE ###
         database_url = general_settings.get("database_url", None)
         if database_url and database_url.startswith("os.environ/"):
-            print_verbose(f"GOING INTO LITELLM.GET_SECRET!")
+            verbose_proxy_logger.debug(f"GOING INTO LITELLM.GET_SECRET!")
             database_url = litellm.get_secret(database_url)
-            print_verbose(f"RETRIEVED DB URL: {database_url}")
+            verbose_proxy_logger.debug(f"RETRIEVED DB URL: {database_url}")
             prisma_setup(database_url=database_url)
         ## COST TRACKING ##
         cost_tracking()

@@ -902,7 +903,7 @@ async def generate_key_helper_fn(
             "max_budget": max_budget,
             "user_email": user_email,
         }
-        print_verbose("PrismaClient: Before Insert Data")
+        verbose_proxy_logger.debug("PrismaClient: Before Insert Data")
         new_verification_token = await prisma_client.insert_data(
             data=verification_token_data
         )

@@ -943,6 +944,7 @@ async def initialize(
     api_base=None,
     api_version=None,
     debug=False,
+    detailed_debug=False,
     temperature=None,
     max_tokens=None,
     request_timeout=600,

@@ -955,12 +957,22 @@ async def initialize(
     use_queue=False,
     config=None,
 ):
-    global user_model, user_api_base, user_debug, user_max_tokens, user_request_timeout, user_temperature, user_telemetry, user_headers, experimental, llm_model_list, llm_router, general_settings, master_key, user_custom_auth, prisma_client
+    global user_model, user_api_base, user_debug, user_detailed_debug, user_user_max_tokens, user_request_timeout, user_temperature, user_telemetry, user_headers, experimental, llm_model_list, llm_router, general_settings, master_key, user_custom_auth, prisma_client
     generate_feedback_box()
     user_model = model
     user_debug = debug
     if debug == True:  # this needs to be first, so users can see Router init debugg
+        from litellm._logging import verbose_router_logger, verbose_proxy_logger
+        import logging
+
+        verbose_router_logger.setLevel(level=logging.INFO)  # set router logs to info
+    if detailed_debug == True:
+        from litellm._logging import verbose_router_logger, verbose_proxy_logger
+        import logging
+
+        verbose_router_logger.setLevel(level=logging.DEBUG)  # set router logs to info
         litellm.set_verbose = True

     dynamic_config = {"general": {}, user_model: {}}
     if config:
         (
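The `initialize()` change above is where the two CLI flags turn into logger levels: `--debug` raises the router logger to INFO, while `--detailed_debug` drops it to DEBUG and also sets `litellm.set_verbose`. Condensed into a standalone sketch (illustration only, not code from the PR):

```python
import logging

import litellm
from litellm._logging import verbose_router_logger


def apply_debug_flags(debug: bool = False, detailed_debug: bool = False) -> None:
    # Condensed illustration of the mapping done inside initialize() above.
    if debug:
        # `--debug`: high-level router activity
        verbose_router_logger.setLevel(logging.INFO)
    if detailed_debug:
        # `--detailed_debug`: full debug output, plus litellm's own verbose mode
        verbose_router_logger.setLevel(logging.DEBUG)
        litellm.set_verbose = True
```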
@@ -1031,9 +1043,9 @@ async def initialize(

 # for streaming
 def data_generator(response):
-    print_verbose("inside generator")
+    verbose_proxy_logger.debug("inside generator")
     for chunk in response:
-        print_verbose(f"returned chunk: {chunk}")
+        verbose_proxy_logger.debug(f"returned chunk: {chunk}")
         try:
             yield f"data: {json.dumps(chunk.dict())}\n\n"
         except:

@@ -1041,11 +1053,11 @@ def data_generator(response):


 async def async_data_generator(response, user_api_key_dict):
-    print_verbose("inside generator")
+    verbose_proxy_logger.debug("inside generator")
     try:
         start_time = time.time()
         async for chunk in response:
-            print_verbose(f"returned chunk: {chunk}")
+            verbose_proxy_logger.debug(f"returned chunk: {chunk}")
             try:
                 yield f"data: {json.dumps(chunk.dict())}\n\n"
             except Exception as e:

@@ -1110,7 +1122,7 @@ async def startup_event():

     ### LOAD CONFIG ###
     worker_config = litellm.get_secret("WORKER_CONFIG")
-    print_verbose(f"worker_config: {worker_config}")
+    verbose_proxy_logger.debug(f"worker_config: {worker_config}")
     # check if it's a valid file path
     if os.path.isfile(worker_config):
         await initialize(**worker_config)

@@ -1125,7 +1137,7 @@ async def startup_event():
             _run_background_health_check()
         )  # start the background health check coroutine.

-    print_verbose(f"prisma client - {prisma_client}")
+    verbose_proxy_logger.debug(f"prisma client - {prisma_client}")
     if prisma_client:
         await prisma_client.connect()

@@ -1152,7 +1164,7 @@ def model_list():
         all_models = list(set(all_models + [m["model_name"] for m in llm_model_list]))
     if user_model is not None:
         all_models += [user_model]
-    print_verbose(f"all_models: {all_models}")
+    verbose_proxy_logger.debug(f"all_models: {all_models}")
     ### CHECK OLLAMA MODELS ###
     try:
         response = requests.get("http://0.0.0.0:11434/api/tags")

@@ -1271,7 +1283,7 @@ async def completion(
         else:
             model_id = ""

-        print_verbose(f"final response: {response}")
+        verbose_proxy_logger.debug(f"final response: {response}")
         if (
             "stream" in data and data["stream"] == True
         ):  # use generate_responses to stream responses

@@ -1296,8 +1308,8 @@ async def completion(
             fastapi_response.headers["x-litellm-model-id"] = model_id
         return response
     except Exception as e:
-        print_verbose(f"EXCEPTION RAISED IN PROXY MAIN.PY")
-        print_verbose(
+        verbose_proxy_logger.debug(f"EXCEPTION RAISED IN PROXY MAIN.PY")
+        verbose_proxy_logger.debug(
             f"\033[1;31mAn error occurred: {e}\n\n Debug this by setting `--debug`, e.g. `litellm --model gpt-3.5-turbo --debug`"
         )
         traceback.print_exc()

@@ -1352,13 +1364,13 @@ async def chat_completion(

         ## Cache Controls
         headers = request.headers
-        print_verbose(f"Request Headers: {headers}")
+        verbose_proxy_logger.debug(f"Request Headers: {headers}")
         cache_control_header = headers.get("Cache-Control", None)
         if cache_control_header:
             cache_dict = parse_cache_control(cache_control_header)
             data["ttl"] = cache_dict.get("s-maxage")

-        print_verbose(f"receiving data: {data}")
+        verbose_proxy_logger.debug(f"receiving data: {data}")
         data["model"] = (
             general_settings.get("completion_model", None)  # server default
             or user_model  # model name passed via cli args

@@ -1372,7 +1384,7 @@ async def chat_completion(
             data["user"] = user_api_key_dict.user_id

         if "metadata" in data:
-            print_verbose(f'received metadata: {data["metadata"]}')
+            verbose_proxy_logger.debug(f'received metadata: {data["metadata"]}')
             data["metadata"]["user_api_key"] = user_api_key_dict.api_key
             data["metadata"]["user_api_key_user_id"] = user_api_key_dict.user_id
             data["metadata"]["headers"] = dict(request.headers)

@@ -1463,7 +1475,7 @@ async def chat_completion(
         await proxy_logging_obj.post_call_failure_hook(
             user_api_key_dict=user_api_key_dict, original_exception=e
         )
-        print_verbose(
+        verbose_proxy_logger.debug(
             f"\033[1;31mAn error occurred: {e}\n\n Debug this by setting `--debug`, e.g. `litellm --model gpt-3.5-turbo --debug`"
         )
         router_model_names = (

@@ -1472,17 +1484,17 @@ async def chat_completion(
             else []
         )
         if llm_router is not None and data.get("model", "") in router_model_names:
-            print_verbose("Results from router")
-            print_verbose("\nRouter stats")
-            print_verbose("\nTotal Calls made")
+            verbose_proxy_logger.debug("Results from router")
+            verbose_proxy_logger.debug("\nRouter stats")
+            verbose_proxy_logger.debug("\nTotal Calls made")
             for key, value in llm_router.total_calls.items():
-                print_verbose(f"{key}: {value}")
-            print_verbose("\nSuccess Calls made")
+                verbose_proxy_logger.debug(f"{key}: {value}")
+            verbose_proxy_logger.debug("\nSuccess Calls made")
             for key, value in llm_router.success_calls.items():
-                print_verbose(f"{key}: {value}")
-            print_verbose("\nFail Calls made")
+                verbose_proxy_logger.debug(f"{key}: {value}")
+            verbose_proxy_logger.debug("\nFail Calls made")
             for key, value in llm_router.fail_calls.items():
-                print_verbose(f"{key}: {value}")
+                verbose_proxy_logger.debug(f"{key}: {value}")
             if user_debug:
                 traceback.print_exc()

@@ -1785,7 +1797,7 @@ async def generate_key_fn(
     - expires: (datetime) Datetime object for when key expires.
     - user_id: (str) Unique user id - used for tracking spend across multiple keys for same user id.
     """
-    print_verbose("entered /key/generate")
+    verbose_proxy_logger.debug("entered /key/generate")
     data_json = data.json()  # type: ignore
     response = await generate_key_helper_fn(**data_json)
     return GenerateKeyResponse(

@@ -2001,9 +2013,9 @@ async def add_new_model(model_params: ModelParams):
         # Load existing config
         config = await proxy_config.get_config()

-        print_verbose(f"User config path: {user_config_file_path}")
+        verbose_proxy_logger.debug(f"User config path: {user_config_file_path}")

-        print_verbose(f"Loaded config: {config}")
+        verbose_proxy_logger.debug(f"Loaded config: {config}")
         # Add the new model to the config
         model_info = model_params.model_info.json()
         model_info = {k: v for k, v in model_info.items() if v is not None}

@@ -2015,7 +2027,7 @@ async def add_new_model(model_params: ModelParams):
             }
         )

-        print_verbose(f"updated model list: {config['model_list']}")
+        verbose_proxy_logger.debug(f"updated model list: {config['model_list']}")

         # Save new config
         await proxy_config.save_config(new_config=config)

@@ -2065,7 +2077,7 @@ async def model_info_v1(request: Request):
         # don't return the api key
         model["litellm_params"].pop("api_key", None)

-    print_verbose(f"all_models: {all_models}")
+    verbose_proxy_logger.debug(f"all_models: {all_models}")
     return {"data": all_models}


@@ -2130,7 +2142,7 @@ async def _litellm_chat_completions_worker(data, user_api_key_dict):
                 user_api_key_dict=user_api_key_dict, data=data, call_type="completion"
             )

-            print_verbose(f"_litellm_chat_completions_worker started")
+            verbose_proxy_logger.debug(f"_litellm_chat_completions_worker started")
             ### ROUTE THE REQUEST ###
             router_model_names = (
                 [m["model_name"] for m in llm_model_list]

@@ -2156,17 +2168,17 @@ async def _litellm_chat_completions_worker(data, user_api_key_dict):
             else:  # router is not set
                 response = await litellm.acompletion(**data)

-            print_verbose(f"final response: {response}")
+            verbose_proxy_logger.debug(f"final response: {response}")
             return response
         except HTTPException as e:
-            print_verbose(
+            verbose_proxy_logger.debug(
                 f"EXCEPTION RAISED IN _litellm_chat_completions_worker - {e.status_code}; {e.detail}"
             )
             if (
                 e.status_code == 429
                 and "Max parallel request limit reached" in e.detail
             ):
-                print_verbose(f"Max parallel request limit reached!")
+                verbose_proxy_logger.debug(f"Max parallel request limit reached!")
                 timeout = litellm._calculate_retry_after(
                     remaining_retries=3, max_retries=3, min_timeout=1
                 )

@@ -2206,7 +2218,7 @@ async def async_queue_request(
             "body": copy.copy(data),  # use copy instead of deepcopy
         }

-        print_verbose(f"receiving data: {data}")
+        verbose_proxy_logger.debug(f"receiving data: {data}")
         data["model"] = (
             general_settings.get("completion_model", None)  # server default
             or user_model  # model name passed via cli args

@@ -2220,7 +2232,7 @@ async def async_queue_request(
             data["user"] = user_api_key_dict.user_id

         if "metadata" in data:
-            print_verbose(f'received metadata: {data["metadata"]}')
+            verbose_proxy_logger.debug(f'received metadata: {data["metadata"]}')
             data["metadata"]["user_api_key"] = user_api_key_dict.api_key
             data["metadata"]["headers"] = dict(request.headers)
             data["metadata"]["user_api_key_user_id"] = user_api_key_dict.user_id

@@ -2294,7 +2306,7 @@ async def update_config(config_info: ConfigYAML):
         config = await proxy_config.get_config()

         backup_config = copy.deepcopy(config)
-        print_verbose(f"Loaded config: {config}")
+        verbose_proxy_logger.debug(f"Loaded config: {config}")

         # update the general settings
         if config_info.general_settings is not None:

@@ -2478,7 +2490,7 @@ async def get_routes():
 async def shutdown_event():
     global prisma_client, master_key, user_custom_auth
     if prisma_client:
-        print_verbose("Disconnecting from Prisma")
+        verbose_proxy_logger.debug("Disconnecting from Prisma")
         await prisma_client.disconnect()

     ## RESET CUSTOM VARIABLES ##
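Because the proxy now reports through `verbose_proxy_logger` instead of bare `print` calls, its output can be redirected with standard `logging` handlers. A minimal sketch of sending the proxy debug stream to a file (the file name is a placeholder):

```python
import logging

from litellm._logging import verbose_proxy_logger

# Placeholder path; pick whatever log destination fits your deployment.
file_handler = logging.FileHandler("litellm_proxy_debug.log")
file_handler.setLevel(logging.DEBUG)
file_handler.setFormatter(
    logging.Formatter("%(asctime)s %(name)s - %(levelname)s: %(message)s")
)

verbose_proxy_logger.addHandler(file_handler)
verbose_proxy_logger.setLevel(logging.DEBUG)
```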
@@ -26,6 +26,8 @@ from litellm.llms.custom_httpx.azure_dall_e_2 import (
 )
 from litellm.utils import ModelResponse, CustomStreamWrapper
 import copy
+from litellm._logging import verbose_router_logger
+import logging


 class Router:

@@ -223,7 +225,7 @@ class Router:
                 litellm.failure_callback.append(self.deployment_callback_on_failure)
             else:
                 litellm.failure_callback = [self.deployment_callback_on_failure]
-        self.print_verbose(
+        verbose_router_logger.debug(
             f"Intialized router with Routing strategy: {self.routing_strategy}\n"
         )

@@ -316,7 +318,7 @@ class Router:
     async def _acompletion(self, model: str, messages: List[Dict[str, str]], **kwargs):
         model_name = None
         try:
-            self.print_verbose(
+            verbose_router_logger.debug(
                 f"Inside _acompletion()- model: {model}; kwargs: {kwargs}"
             )
             deployment = self.get_available_deployment(

@@ -353,18 +355,24 @@ class Router:
                 model_client = potential_model_client
             self.total_calls[model_name] += 1
             response = await litellm.acompletion(
                 **{
                     **data,
                     "messages": messages,
                     "caching": self.cache_responses,
                     "client": model_client,
                     "timeout": self.timeout,
                     **kwargs,
                 }
             )
             self.success_calls[model_name] += 1
+            verbose_router_logger.info(
+                f"litellm.acompletion(model={model_name})\033[32m 200 OK\033[0m"
+            )
             return response
         except Exception as e:
+            verbose_router_logger.info(
+                f"litellm.acompletion(model={model_name})\033[31m Exception {str(e)}\033[0m"
+            )
             if model_name is not None:
                 self.fail_calls[model_name] += 1
             raise e
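With the change above, every routed `acompletion` call now ends with one INFO line: a green `200 OK` on success or a red `Exception ...` on failure, tagged with the deployment group. A short sketch of making those lines visible when the `Router` is used as a library rather than through the proxy CLI:

```python
import logging

from litellm._logging import verbose_router_logger

# INFO is the level the proxy's `--debug` flag selects for the router logger.
verbose_router_logger.setLevel(logging.INFO)

# Each routed call then emits a line shaped roughly like (colour codes omitted):
#   LiteLLM Router - INFO: litellm.acompletion(model=<deployment group>) 200 OK
# or, when the call raises:
#   LiteLLM Router - INFO: litellm.acompletion(model=<deployment group>) Exception <error>
```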
@ -385,7 +393,7 @@ class Router:
|
||||||
|
|
||||||
def _image_generation(self, prompt: str, model: str, **kwargs):
|
def _image_generation(self, prompt: str, model: str, **kwargs):
|
||||||
try:
|
try:
|
||||||
self.print_verbose(
|
verbose_router_logger.debug(
|
||||||
f"Inside _image_generation()- model: {model}; kwargs: {kwargs}"
|
f"Inside _image_generation()- model: {model}; kwargs: {kwargs}"
|
||||||
)
|
)
|
||||||
deployment = self.get_available_deployment(
|
deployment = self.get_available_deployment(
|
||||||
|
@ -432,8 +440,14 @@ class Router:
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
self.success_calls[model_name] += 1
|
self.success_calls[model_name] += 1
|
||||||
|
verbose_router_logger.info(
|
||||||
|
f"litellm.image_generation(model={model_name})\033[32m 200 OK\033[0m"
|
||||||
|
)
|
||||||
return response
|
return response
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
|
verbose_router_logger.info(
|
||||||
|
f"litellm.image_generation(model={model_name})\033[31m Exception {str(e)}\033[0m"
|
||||||
|
)
|
||||||
if model_name is not None:
|
if model_name is not None:
|
||||||
self.fail_calls[model_name] += 1
|
self.fail_calls[model_name] += 1
|
||||||
raise e
|
raise e
|
||||||
|
@ -454,7 +468,7 @@ class Router:
|
||||||
|
|
||||||
async def _aimage_generation(self, prompt: str, model: str, **kwargs):
|
async def _aimage_generation(self, prompt: str, model: str, **kwargs):
|
||||||
try:
|
try:
|
||||||
self.print_verbose(
|
verbose_router_logger.debug(
|
||||||
f"Inside _image_generation()- model: {model}; kwargs: {kwargs}"
|
f"Inside _image_generation()- model: {model}; kwargs: {kwargs}"
|
||||||
)
|
)
|
||||||
deployment = self.get_available_deployment(
|
deployment = self.get_available_deployment(
|
||||||
|
@ -501,8 +515,14 @@ class Router:
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
self.success_calls[model_name] += 1
|
self.success_calls[model_name] += 1
|
||||||
|
verbose_router_logger.info(
|
||||||
|
f"litellm.aimage_generation(model={model_name})\033[32m 200 OK\033[0m"
|
||||||
|
)
|
||||||
return response
|
return response
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
|
verbose_router_logger.info(
|
||||||
|
f"litellm.aimage_generation(model={model_name})\033[31m Exception {str(e)}\033[0m"
|
||||||
|
)
|
||||||
if model_name is not None:
|
if model_name is not None:
|
||||||
self.fail_calls[model_name] += 1
|
self.fail_calls[model_name] += 1
|
||||||
raise e
|
raise e
|
||||||
|
@ -576,7 +596,7 @@ class Router:
|
||||||
|
|
||||||
async def _atext_completion(self, model: str, prompt: str, **kwargs):
|
async def _atext_completion(self, model: str, prompt: str, **kwargs):
|
||||||
try:
|
try:
|
||||||
self.print_verbose(
|
verbose_router_logger.debug(
|
||||||
f"Inside _atext_completion()- model: {model}; kwargs: {kwargs}"
|
f"Inside _atext_completion()- model: {model}; kwargs: {kwargs}"
|
||||||
)
|
)
|
||||||
deployment = self.get_available_deployment(
|
deployment = self.get_available_deployment(
|
||||||
|
@ -613,18 +633,24 @@ class Router:
|
||||||
model_client = potential_model_client
|
model_client = potential_model_client
|
||||||
self.total_calls[model_name] += 1
|
self.total_calls[model_name] += 1
|
||||||
response = await litellm.atext_completion(
|
response = await litellm.atext_completion(
|
||||||
**{
|
**{
|
||||||
**data,
|
**data,
|
||||||
"prompt": prompt,
|
"prompt": prompt,
|
||||||
"caching": self.cache_responses,
|
"caching": self.cache_responses,
|
||||||
"client": model_client,
|
"client": model_client,
|
||||||
"timeout": self.timeout,
|
"timeout": self.timeout,
|
||||||
**kwargs,
|
**kwargs,
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
self.success_calls[model_name] += 1
|
self.success_calls[model_name] += 1
|
||||||
|
verbose_router_logger.info(
|
||||||
|
f"litellm.atext_completion(model={model_name})\033[32m 200 OK\033[0m"
|
||||||
|
)
|
||||||
return response
|
return response
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
|
verbose_router_logger.info(
|
||||||
|
f"litellm.atext_completion(model={model_name})\033[31m Exception {str(e)}\033[0m"
|
||||||
|
)
|
||||||
if model_name is not None:
|
if model_name is not None:
|
||||||
self.fail_calls[model_name] += 1
|
self.fail_calls[model_name] += 1
|
||||||
raise e
|
raise e
|
||||||
|
@ -696,7 +722,7 @@ class Router:
|
||||||
|
|
||||||
async def _aembedding(self, input: Union[str, List], model: str, **kwargs):
|
async def _aembedding(self, input: Union[str, List], model: str, **kwargs):
|
||||||
try:
|
try:
|
||||||
self.print_verbose(
|
verbose_router_logger.debug(
|
||||||
f"Inside _aembedding()- model: {model}; kwargs: {kwargs}"
|
f"Inside _aembedding()- model: {model}; kwargs: {kwargs}"
|
||||||
)
|
)
|
||||||
deployment = self.get_available_deployment(
|
deployment = self.get_available_deployment(
|
||||||
|
@ -743,8 +769,14 @@ class Router:
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
self.success_calls[model_name] += 1
|
self.success_calls[model_name] += 1
|
||||||
|
verbose_router_logger.info(
|
||||||
|
f"litellm.aembedding(model={model_name})\033[32m 200 OK\033[0m"
|
||||||
|
)
|
||||||
return response
|
return response
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
|
verbose_router_logger.info(
|
||||||
|
f"litellm.aembedding(model={model_name})\033[31m Exception {str(e)}\033[0m"
|
||||||
|
)
|
||||||
if model_name is not None:
|
if model_name is not None:
|
||||||
self.fail_calls[model_name] += 1
|
self.fail_calls[model_name] += 1
|
||||||
raise e
|
raise e
|
||||||
|
@ -761,19 +793,18 @@ class Router:
|
||||||
)
|
)
|
||||||
try:
|
try:
|
||||||
response = await self.async_function_with_retries(*args, **kwargs)
|
response = await self.async_function_with_retries(*args, **kwargs)
|
||||||
self.print_verbose(f"Async Response: {response}")
|
verbose_router_logger.debug(f"Async Response: {response}")
|
||||||
return response
|
return response
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
self.print_verbose(
|
verbose_router_logger.debug(f"Traceback{traceback.format_exc()}")
|
||||||
f"An exception occurs: {e}\n\n Traceback{traceback.format_exc()}"
|
|
||||||
)
|
|
||||||
original_exception = e
|
original_exception = e
|
||||||
|
fallback_model_group = None
|
||||||
try:
|
try:
|
||||||
if (
|
if (
|
||||||
hasattr(e, "status_code") and e.status_code == 400
|
hasattr(e, "status_code") and e.status_code == 400
|
||||||
): # don't retry a malformed request
|
): # don't retry a malformed request
|
||||||
raise e
|
raise e
|
||||||
self.print_verbose(f"Trying to fallback b/w models")
|
verbose_router_logger.debug(f"Trying to fallback b/w models")
|
||||||
if (
|
if (
|
||||||
isinstance(e, litellm.ContextWindowExceededError)
|
isinstance(e, litellm.ContextWindowExceededError)
|
||||||
and context_window_fallbacks is not None
|
and context_window_fallbacks is not None
|
||||||
|
@ -802,11 +833,16 @@ class Router:
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
pass
|
pass
|
||||||
elif fallbacks is not None:
|
elif fallbacks is not None:
|
||||||
self.print_verbose(f"inside model fallbacks: {fallbacks}")
|
verbose_router_logger.debug(f"inside model fallbacks: {fallbacks}")
|
||||||
for item in fallbacks:
|
for item in fallbacks:
|
||||||
if list(item.keys())[0] == model_group:
|
if list(item.keys())[0] == model_group:
|
||||||
fallback_model_group = item[model_group]
|
fallback_model_group = item[model_group]
|
||||||
break
|
break
|
||||||
|
if fallback_model_group is None:
|
||||||
|
verbose_router_logger.info(
|
||||||
|
f"No fallback model group found for original model_group={model_group}. Fallbacks={fallbacks}"
|
||||||
|
)
|
||||||
|
raise original_exception
|
||||||
for mg in fallback_model_group:
|
for mg in fallback_model_group:
|
||||||
"""
|
"""
|
||||||
Iterate through the model groups and try calling that deployment
|
Iterate through the model groups and try calling that deployment
|
||||||
|
@ -814,6 +850,9 @@ class Router:
|
||||||
try:
|
try:
|
||||||
## LOGGING
|
## LOGGING
|
||||||
kwargs = self.log_retry(kwargs=kwargs, e=original_exception)
|
kwargs = self.log_retry(kwargs=kwargs, e=original_exception)
|
||||||
|
verbose_router_logger.info(
|
||||||
|
f"Falling back to model_group = {mg}"
|
||||||
|
)
|
||||||
kwargs["model"] = mg
|
kwargs["model"] = mg
|
||||||
kwargs["metadata"]["model_group"] = mg
|
kwargs["metadata"]["model_group"] = mg
|
||||||
response = await self.async_function_with_retries(
|
response = await self.async_function_with_retries(
|
||||||
|
@ -823,12 +862,12 @@ class Router:
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
raise e
|
raise e
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
self.print_verbose(f"An exception occurred - {str(e)}")
|
verbose_router_logger.debug(f"An exception occurred - {str(e)}")
|
||||||
traceback.print_exc()
|
traceback.print_exc()
|
||||||
raise original_exception
|
raise original_exception
|
||||||
|
|
||||||
async def async_function_with_retries(self, *args, **kwargs):
|
async def async_function_with_retries(self, *args, **kwargs):
|
||||||
self.print_verbose(
|
verbose_router_logger.debug(
|
||||||
f"Inside async function with retries: args - {args}; kwargs - {kwargs}"
|
f"Inside async function with retries: args - {args}; kwargs - {kwargs}"
|
||||||
)
|
)
|
||||||
original_function = kwargs.pop("original_function")
|
original_function = kwargs.pop("original_function")
|
||||||
|
@ -836,7 +875,7 @@ class Router:
|
||||||
context_window_fallbacks = kwargs.pop(
|
context_window_fallbacks = kwargs.pop(
|
||||||
"context_window_fallbacks", self.context_window_fallbacks
|
"context_window_fallbacks", self.context_window_fallbacks
|
||||||
)
|
)
|
||||||
self.print_verbose(
|
verbose_router_logger.debug(
|
||||||
f"async function w/ retries: original_function - {original_function}"
|
f"async function w/ retries: original_function - {original_function}"
|
||||||
)
|
)
|
||||||
num_retries = kwargs.pop("num_retries")
|
num_retries = kwargs.pop("num_retries")
|
||||||
|
@ -891,7 +930,7 @@ class Router:
|
||||||
kwargs = self.log_retry(kwargs=kwargs, e=original_exception)
|
kwargs = self.log_retry(kwargs=kwargs, e=original_exception)
|
||||||
|
|
||||||
for current_attempt in range(num_retries):
|
for current_attempt in range(num_retries):
|
||||||
self.print_verbose(
|
verbose_router_logger.debug(
|
||||||
f"retrying request. Current attempt - {current_attempt}; num retries: {num_retries}"
|
f"retrying request. Current attempt - {current_attempt}; num retries: {num_retries}"
|
||||||
)
|
)
|
||||||
try:
|
try:
|
||||||
|
@ -952,9 +991,9 @@ class Router:
|
||||||
return response
|
return response
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
original_exception = e
|
original_exception = e
|
||||||
self.print_verbose(f"An exception occurs {original_exception}")
|
verbose_router_logger.debug(f"An exception occurs {original_exception}")
|
||||||
try:
|
try:
|
||||||
self.print_verbose(
|
verbose_router_logger.debug(
|
||||||
f"Trying to fallback b/w models. Initial model group: {model_group}"
|
f"Trying to fallback b/w models. Initial model group: {model_group}"
|
||||||
)
|
)
|
||||||
if (
|
if (
|
||||||
|
@ -986,7 +1025,7 @@ class Router:
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
pass
|
pass
|
||||||
elif fallbacks is not None:
|
elif fallbacks is not None:
|
||||||
self.print_verbose(f"inside model fallbacks: {fallbacks}")
|
verbose_router_logger.debug(f"inside model fallbacks: {fallbacks}")
|
||||||
fallback_model_group = None
|
fallback_model_group = None
|
||||||
for item in fallbacks:
|
for item in fallbacks:
|
||||||
if list(item.keys())[0] == model_group:
|
if list(item.keys())[0] == model_group:
|
||||||
|
@@ -1016,7 +1055,7 @@ class Router:
         """
         Try calling the model 3 times. Shuffle between available deployments.
         """
-        self.print_verbose(
+        verbose_router_logger.debug(
             f"Inside function with retries: args - {args}; kwargs - {kwargs}"
         )
         original_function = kwargs.pop("original_function")
@@ -1031,7 +1070,9 @@ class Router:
             return response
         except Exception as e:
             original_exception = e
-            self.print_verbose(f"num retries in function with retries: {num_retries}")
+            verbose_router_logger.debug(
+                f"num retries in function with retries: {num_retries}"
+            )
             ### CHECK IF RATE LIMIT / CONTEXT WINDOW ERROR
             if (
                 isinstance(original_exception, litellm.ContextWindowExceededError)
@@ -1046,7 +1087,7 @@ class Router:
             kwargs = self.log_retry(kwargs=kwargs, e=original_exception)
             ### RETRY
             for current_attempt in range(num_retries):
-                self.print_verbose(
+                verbose_router_logger.debug(
                     f"retrying request. Current attempt - {current_attempt}; retries left: {num_retries}"
                 )
                 try:
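The retry hunks above all follow the same loop shape: pop `num_retries`, re-invoke the original function, and re-raise the last exception once the attempts are exhausted. A simplified standalone sketch of that shape; the helper name and the bare `Exception` filter are assumptions, and the real code additionally special-cases rate-limit and context-window errors:

```python
# Generic shape of the retry loop above (illustrative simplification).
def call_with_retries(fn, num_retries: int):
    last_exc = None
    for current_attempt in range(num_retries):
        try:
            return fn()
        except Exception as e:
            last_exc = e
    # assumes num_retries >= 1; otherwise there is nothing to re-raise
    raise last_exc
```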
@@ -1129,10 +1170,10 @@ class Router:
                 deployment_exceptions = self.model_exception_map.get(deployment, [])
                 deployment_exceptions.append(exception_str)
                 self.model_exception_map[deployment] = deployment_exceptions
-                self.print_verbose("\nEXCEPTION FOR DEPLOYMENTS\n")
-                self.print_verbose(self.model_exception_map)
+                verbose_router_logger.debug("\nEXCEPTION FOR DEPLOYMENTS\n")
+                verbose_router_logger.debug(self.model_exception_map)
                 for model in self.model_exception_map:
-                    self.print_verbose(
+                    verbose_router_logger.debug(
                         f"Model {model} had {len(self.model_exception_map[model])} exception"
                     )
                 if custom_llm_provider:
@@ -1184,7 +1225,7 @@ class Router:
         # cooldown deployment
         current_fails = self.failed_calls.get_cache(key=deployment) or 0
         updated_fails = current_fails + 1
-        self.print_verbose(
+        verbose_router_logger.debug(
            f"Attempting to add {deployment} to cooldown list. updated_fails: {updated_fails}; self.allowed_fails: {self.allowed_fails}"
         )
         if updated_fails > self.allowed_fails:
@@ -1192,7 +1233,7 @@ class Router:
             cooldown_key = f"{current_minute}:cooldown_models"  # group cooldown models by minute to reduce number of redis calls
             cached_value = self.cache.get_cache(key=cooldown_key)

-            self.print_verbose(f"adding {deployment} to cooldown models")
+            verbose_router_logger.debug(f"adding {deployment} to cooldown models")
             # update value
             try:
                 if deployment in cached_value:
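The cooldown key above is bucketed by the current minute, so every deployment cooled down within the same minute shares a single cache entry and one Redis round-trip. A toy sketch of that bucketing with a plain dict standing in for the router's cache; the `%H-%M` timestamp format and the helper names are assumptions, not taken from this diff:

```python
from datetime import datetime

# Toy illustration of minute-bucketed cooldown keys; the real router stores
# these via self.cache (in-memory or Redis).
cache = {}

def add_to_cooldown(deployment_id):
    current_minute = datetime.now().strftime("%H-%M")  # format is an assumption
    cooldown_key = f"{current_minute}:cooldown_models"  # one key per minute
    cache.setdefault(cooldown_key, [])
    if deployment_id not in cache[cooldown_key]:
        cache[cooldown_key].append(deployment_id)

def get_cooldown_deployments():
    current_minute = datetime.now().strftime("%H-%M")
    return cache.get(f"{current_minute}:cooldown_models", [])

add_to_cooldown("16700539-b3cd-42f4-b426-6a12a1bb706a")
print(get_cooldown_deployments())
```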
@@ -1221,7 +1262,7 @@ class Router:
         # ----------------------
         cooldown_models = self.cache.get_cache(key=cooldown_key) or []

-        self.print_verbose(f"retrieve cooldown models: {cooldown_models}")
+        verbose_router_logger.debug(f"retrieve cooldown models: {cooldown_models}")
         return cooldown_models

     def set_client(self, model: dict):
@@ -1371,7 +1412,7 @@ class Router:
                     local_only=True,
                 )  # cache for 1 hr
             else:
-                self.print_verbose(
+                verbose_router_logger.debug(
                     f"Initializing Azure OpenAI Client for {model_name}, Api Base: {str(api_base)}, Api Key:{api_key}"
                 )

@@ -1463,7 +1504,7 @@ class Router:
                 )  # cache for 1 hr

             else:
-                self.print_verbose(
+                verbose_router_logger.debug(
                     f"Initializing OpenAI Client for {model_name}, Api Base:{str(api_base)}, Api Key:{api_key}"
                 )
                 cache_key = f"{model_id}_async_client"
@@ -1569,7 +1610,7 @@ class Router:

             self.set_client(model=model)

-        self.print_verbose(f"\nInitialized Model List {self.model_list}")
+        verbose_router_logger.debug(f"\nInitialized Model List {self.model_list}")
         self.model_names = [m["model_name"] for m in model_list]

     def get_model_names(self):
@@ -1631,13 +1672,6 @@ class Router:
             client = self.cache.get_cache(key=cache_key)
             return client

-    def print_verbose(self, print_statement):
-        try:
-            if self.set_verbose or litellm.set_verbose:
-                print(f"LiteLLM.Router: {print_statement}")  # noqa
-        except:
-            pass
-
     def get_available_deployment(
         self,
         model: str,
@@ -1665,7 +1699,7 @@ class Router:

         # check if aliases set on litellm model alias map
         if model in self.model_group_alias:
-            self.print_verbose(
+            verbose_router_logger.debug(
                 f"Using a model alias. Got Request for {model}, sending requests to {self.model_group_alias.get(model)}"
             )
             model = self.model_group_alias[model]
@@ -1679,13 +1713,15 @@ class Router:
             m for m in self.model_list if m["litellm_params"]["model"] == model
         ]

-        self.print_verbose(f"initial list of deployments: {healthy_deployments}")
+        verbose_router_logger.debug(
+            f"initial list of deployments: {healthy_deployments}"
+        )

         # filter out the deployments currently cooling down
         deployments_to_remove = []
         # cooldown_deployments is a list of model_id's cooling down, cooldown_deployments = ["16700539-b3cd-42f4-b426-6a12a1bb706a", "16700539-b3cd-42f4-b426-7899"]
         cooldown_deployments = self._get_cooldown_deployments()
-        self.print_verbose(f"cooldown deployments: {cooldown_deployments}")
+        verbose_router_logger.debug(f"cooldown deployments: {cooldown_deployments}")
         # Find deployments in model_list whose model_id is cooling down
         for deployment in healthy_deployments:
             deployment_id = deployment["model_info"]["id"]
@@ -1695,7 +1731,7 @@ class Router:
         for deployment in deployments_to_remove:
             healthy_deployments.remove(deployment)

-        self.print_verbose(
+        verbose_router_logger.debug(
             f"healthy deployments: length {len(healthy_deployments)} {healthy_deployments}"
         )
         if len(healthy_deployments) == 0:
@@ -1715,13 +1751,13 @@ class Router:
             if rpm is not None:
                 # use weight-random pick if rpms provided
                 rpms = [m["litellm_params"].get("rpm", 0) for m in healthy_deployments]
-                self.print_verbose(f"\nrpms {rpms}")
+                verbose_router_logger.debug(f"\nrpms {rpms}")
                 total_rpm = sum(rpms)
                 weights = [rpm / total_rpm for rpm in rpms]
-                self.print_verbose(f"\n weights {weights}")
+                verbose_router_logger.debug(f"\n weights {weights}")
                 # Perform weighted random pick
                 selected_index = random.choices(range(len(rpms)), weights=weights)[0]
-                self.print_verbose(f"\n selected index, {selected_index}")
+                verbose_router_logger.debug(f"\n selected index, {selected_index}")
                 deployment = healthy_deployments[selected_index]
                 return deployment or deployment[0]
             ############## Check if we can do a RPM/TPM based weighted pick #################
@@ -1729,13 +1765,13 @@ class Router:
             if tpm is not None:
                 # use weight-random pick if rpms provided
                 tpms = [m["litellm_params"].get("tpm", 0) for m in healthy_deployments]
-                self.print_verbose(f"\ntpms {tpms}")
+                verbose_router_logger.debug(f"\ntpms {tpms}")
                 total_tpm = sum(tpms)
                 weights = [tpm / total_tpm for tpm in tpms]
-                self.print_verbose(f"\n weights {weights}")
+                verbose_router_logger.debug(f"\n weights {weights}")
                 # Perform weighted random pick
                 selected_index = random.choices(range(len(tpms)), weights=weights)[0]
-                self.print_verbose(f"\n selected index, {selected_index}")
+                verbose_router_logger.debug(f"\n selected index, {selected_index}")
                 deployment = healthy_deployments[selected_index]
                 return deployment or deployment[0]

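Both weighted-pick branches above reduce to the same `random.choices` call over the per-deployment limits. A self-contained sketch of that selection step, with placeholder deployments:

```python
import random

# Standalone sketch of the rpm-weighted pick used above; the deployments are placeholders.
healthy_deployments = [
    {"model_name": "gpt-3.5-turbo", "litellm_params": {"model": "gpt-3.5-turbo", "rpm": 1800}},
    {"model_name": "gpt-3.5-turbo", "litellm_params": {"model": "azure/chatgpt-v-2", "rpm": 9000}},
]

rpms = [m["litellm_params"].get("rpm", 0) for m in healthy_deployments]
weights = [rpm / sum(rpms) for rpm in rpms]

# random.choices returns a list; [0] unpacks the single draw.
selected_index = random.choices(range(len(rpms)), weights=weights)[0]
print(healthy_deployments[selected_index])
```

Deployments with larger rpm values are drawn proportionally more often; with the placeholder values above, the 9000-rpm deployment is picked roughly five times as often as the 1800-rpm one.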
@@ -1310,12 +1310,13 @@ def test_completion_together_ai_mixtral():


 def test_completion_together_ai_yi_chat():
+    litellm.set_verbose = True
     model_name = "together_ai/zero-one-ai/Yi-34B-Chat"
     try:
         messages = [
             {"role": "user", "content": "What llm are you?"},
         ]
-        response = completion(model=model_name, messages=messages)
+        response = completion(model=model_name, messages=messages, max_tokens=5)
         # Add any assertions here to check the response
         print(response)
         cost = completion_cost(completion_response=response)
79  litellm/tests/test_router_debug_logs.py  Normal file
@@ -0,0 +1,79 @@
+import sys, os, time
+import traceback, asyncio
+import pytest
+
+sys.path.insert(
+    0, os.path.abspath("../..")
+)  # Adds the parent directory to the system path
+
+import litellm, asyncio, logging
+from litellm import Router
+
+# this tests debug logs from litellm router and litellm proxy server
+from litellm._logging import verbose_router_logger
+
+verbose_router_logger.setLevel(level=logging.INFO)
+
+
+# this tests debug logs from litellm router and litellm proxy server
+def test_async_fallbacks(caplog):
+    # THIS IS A PROD TEST - DO NOT DELETE THIS. Used for testing if litellm proxy verbose logs are human readable
+    litellm.set_verbose = False
+    model_list = [
+        {
+            "model_name": "azure/gpt-3.5-turbo",
+            "litellm_params": {
+                "model": "azure/chatgpt-v-2",
+                "api_key": os.getenv("AZURE_API_KEY"),
+                "api_version": os.getenv("AZURE_API_VERSION"),
+                "api_base": os.getenv("AZURE_API_BASE"),
+            },
+            "tpm": 240000,
+            "rpm": 1800,
+        },
+        {
+            "model_name": "gpt-3.5-turbo",
+            "litellm_params": {
+                "model": "gpt-3.5-turbo",
+                "api_key": "bad-key",
+            },
+            "tpm": 1000000,
+            "rpm": 9000,
+        },
+    ]
+
+    router = Router(
+        model_list=model_list,
+        fallbacks=[{"gpt-3.5-turbo": ["azure/gpt-3.5-turbo"]}],
+    )
+
+    user_message = "Hello, how are you?"
+    messages = [{"content": user_message, "role": "user"}]
+
+    async def _make_request():
+        try:
+            response = await router.acompletion(
+                model="gpt-3.5-turbo", messages=messages, max_tokens=1
+            )
+            router.reset()
+        except litellm.Timeout as e:
+            pass
+        except Exception as e:
+            pytest.fail(f"An exception occurred: {e}")
+        finally:
+            router.reset()
+
+    asyncio.run(_make_request())
+    captured_logs = [rec.message for rec in caplog.records]
+    print("\n Captured caplog records - ", captured_logs)
+
+    # Define the expected log messages
+    # - error request, falling back notice, success notice
+    expected_logs = [
+        "litellm.acompletion(model=gpt-3.5-turbo)\x1b[31m Exception OpenAIException - Error code: 401 - {'error': {'message': 'Incorrect API key provided: bad-key. You can find your API key at https://platform.openai.com/account/api-keys.', 'type': 'invalid_request_error', 'param': None, 'code': 'invalid_api_key'}}\x1b[0m",
+        "Falling back to model_group = azure/gpt-3.5-turbo",
+        "litellm.acompletion(model=azure/chatgpt-v-2)\x1b[32m 200 OK\x1b[0m",
+    ]
+
+    # Assert that the captured logs match the expected log messages
+    assert captured_logs == expected_logs
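The assertion above works because pytest's `caplog` fixture captures the records emitted through `verbose_router_logger` at INFO level. Outside pytest, the same messages could be collected with an ordinary `logging.Handler`; a rough sketch, where the collector class is illustrative and not part of this PR:

```python
import logging
from litellm._logging import verbose_router_logger

class ListHandler(logging.Handler):
    """Collects emitted log messages into a list (illustrative helper)."""

    def __init__(self):
        super().__init__()
        self.messages = []

    def emit(self, record):
        self.messages.append(record.getMessage())

collector = ListHandler()
verbose_router_logger.addHandler(collector)
verbose_router_logger.setLevel(logging.INFO)

# ... make router calls here ...
print(collector.messages)  # same strings the test compares against expected_logs
```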