(feat) print debug info per deployment
parent 1f15c79252
commit 6f0faca85b
3 changed files with 51 additions and 59 deletions
The proxy config (YAML), trimmed down for the load-balancing debug test:

@@ -5,63 +5,12 @@ model_list:
       api_base: os.environ/AZURE_API_BASE
       api_key: os.environ/AZURE_API_KEY
       api_version: "2023-07-01-preview"
-    model_info:
-      mode: chat
-      max_tokens: 4096
-      base_model: azure/gpt-4-1106-preview
-      access_groups: ["public"]
-  - model_name: openai-gpt-3.5
+  - model_name: azure-gpt-3.5
     litellm_params:
       model: gpt-3.5-turbo
       api_key: os.environ/OPENAI_API_KEY
     model_info:
       access_groups: ["public"]
-  - model_name: anthropic-claude-v2.1
-    litellm_params:
-      model: bedrock/anthropic.claude-v2:1
-      timeout: 300 # sets a 5 minute timeout
-    model_info:
-      access_groups: ["private"]
-  - model_name: anthropic-claude-v2
-    litellm_params:
-      model: bedrock/anthropic.claude-v2
-  - model_name: bedrock-cohere
-    litellm_params:
-      model: bedrock/cohere.command-text-v14
-      timeout: 0.0001
-  - model_name: gpt-4
-    litellm_params:
-      model: azure/chatgpt-v-2
-      api_base: https://openai-gpt-4-test-v-1.openai.azure.com/
-      api_version: "2023-05-15"
-      api_key: os.environ/AZURE_API_KEY # The `os.environ/` prefix tells litellm to read this from the env. See https://docs.litellm.ai/docs/simple_proxy#load-api-keys-from-vault
-    model_info:
-      base_model: azure/gpt-4
-  - model_name: text-moderation-stable
-    litellm_params:
-      model: text-moderation-stable
-      api_key: os.environ/OPENAI_API_KEY
-litellm_settings:
-  fallbacks: [{"openai-gpt-3.5": ["azure-gpt-3.5"]}]
-  success_callback: ['langfuse']
-  # setting callback class
-  callbacks: custom_callbacks.proxy_handler_instance # sets litellm.callbacks = [proxy_handler_instance]
-
-general_settings:
-  master_key: sk-1234
-  alerting: ["slack"]
-  alerting_threshold: 10 # sends alerts if requests hang for 2 seconds
-  # database_type: "dynamo_db"
-  # database_args: { # 👈 all args - https://github.com/BerriAI/litellm/blob/befbcbb7ac8f59835ce47415c128decf37aac328/litellm/proxy/_types.py#L190
-  #   "billing_mode": "PAY_PER_REQUEST",
-  #   "region_name": "us-west-2",
-  #   "ssl_verify": False
-  # }
-
-
-
-
-
-environment_variables:
-  # otel: True # OpenTelemetry Logger
-  # master_key: sk-1234 # [OPTIONAL] Only use this if you to require all calls to contain this key (Authorization: Bearer sk-1234)
+router_settings:
+  set_verbose: True
+  debug_level: "DEBUG"
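With both deployments registered under the same model_name, the router load-balances between them, which is exactly what the new debug output is meant to inspect. The same setup can be built directly in Python; a minimal sketch, assuming the Router constructor accepts the set_verbose and debug_level options that router_settings maps onto, and with the first deployment's litellm_params (which sit above this hunk) filled in illustratively:

import os

from litellm import Router

model_list = [
    {
        # first deployment; its params are outside the hunk, so these
        # values are illustrative
        "model_name": "azure-gpt-3.5",
        "litellm_params": {
            "model": "azure/chatgpt-v-2",
            "api_base": os.environ["AZURE_API_BASE"],
            "api_key": os.environ["AZURE_API_KEY"],
            "api_version": "2023-07-01-preview",
        },
    },
    {
        # second deployment shares the model_name, so the router
        # balances requests across both
        "model_name": "azure-gpt-3.5",
        "litellm_params": {
            "model": "gpt-3.5-turbo",
            "api_key": os.environ["OPENAI_API_KEY"],
        },
    },
]

router = Router(model_list=model_list, set_verbose=True, debug_level="DEBUG")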
The load-test script (Python): the client is pointed at the local proxy with the test master key, and the per-batch concurrency is dropped from 2000 to 20:

@@ -4,9 +4,7 @@ import uuid
 import traceback


-litellm_client = AsyncOpenAI(
-    base_url="http://0.0.0.0:4000", api_key="sk-iNwH_oOtAQ6syi_2gkEOpQ"
-)
+litellm_client = AsyncOpenAI(base_url="http://0.0.0.0:4000", api_key="sk-1234")


 async def litellm_completion():

@@ -29,7 +27,7 @@ async def litellm_completion():
 async def main():
     for i in range(150):
         start = time.time()
-        n = 2000  # Number of concurrent tasks
+        n = 20  # Number of concurrent tasks
         tasks = [litellm_completion() for _ in range(n)]

         chat_completions = await asyncio.gather(*tasks)
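At 150 iterations of 20 concurrent calls, the test now sends roughly 3,000 requests in total instead of 300,000. A self-contained sketch of the script, assuming a litellm proxy is running locally on port 4000 with master key sk-1234 and serving the "azure-gpt-3.5" group from the config above:

import asyncio
import time

from openai import AsyncOpenAI

litellm_client = AsyncOpenAI(base_url="http://0.0.0.0:4000", api_key="sk-1234")


async def litellm_completion():
    # "azure-gpt-3.5" is the model group name defined in the proxy config
    return await litellm_client.chat.completions.create(
        model="azure-gpt-3.5",
        messages=[{"role": "user", "content": "hi"}],
    )


async def main():
    for _ in range(150):
        start = time.time()
        n = 20  # number of concurrent tasks per batch
        tasks = [litellm_completion() for _ in range(n)]
        chat_completions = await asyncio.gather(*tasks)
        print(f"batch of {len(chat_completions)} took {time.time() - start:.2f}s")


if __name__ == "__main__":
    asyncio.run(main())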
litellm/router.py, hunk 1: Router.__init__ gains an in-memory deployment_stats dict, with a docstring sketching the intended shape:

@@ -240,6 +240,21 @@ class Router:
                 {"caching_groups": caching_groups}
             )

+        self.deployment_stats: dict = {}  # used for debugging load balancing
+        """
+        deployment_stats = {
+            "122999-2828282-277":
+                {
+                    "model": "gpt-3",
+                    "api_base": "http://localhost:8000",
+                    "num_requests": 20,
+                    "avg_latency": 0.001,
+                    "num_failures": 0,
+                    "num_successes": 20
+                }
+        }
+        """
+
         ### ROUTING SETUP ###
         if routing_strategy == "least-busy":
             self.leastbusy_logger = LeastBusyLoggingHandler(
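Note that the docstring sketches fields like avg_latency, num_failures, and num_successes, but the helper added below only records api_base, model, and num_requests at this point; entries are keyed by the deployment id from model_info, so deployments without an id are not counted. Once traffic has flowed, the counters can be read straight off the router instance (a sketch, reusing the router from the earlier example):

from pprint import pprint

# keys are deployment ids from model_info; values hold the recorded fields
pprint(router.deployment_stats)
# e.g. {"some-deployment-id": {"api_base": "", "model": "gpt-3.5-turbo",
#                              "num_requests": 12}}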
litellm/router.py, hunk 2: after a deployment is selected, print the metrics when verbose DEBUG logging is on:

@@ -390,6 +405,10 @@ class Router:
                 messages=messages,
                 specific_deployment=kwargs.pop("specific_deployment", None),
             )
+            if self.set_verbose == True and self.debug_level == "DEBUG":
+                # debug how often this deployment is picked
+                self._print_deployment_metrics(deployment=deployment)
+
             kwargs.setdefault("metadata", {}).update(
                 {
                     "deployment": deployment["litellm_params"]["model"],
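The print is gated on both flags, so it stays silent in normal operation. The flags are plain instance attributes, so they can also be flipped on an existing router at runtime; a sketch, again assuming the router from the first example:

import asyncio


async def demo():
    router.set_verbose = True
    router.debug_level = "DEBUG"
    # each routed call now logs the per-deployment pick counts
    return await router.acompletion(
        model="azure-gpt-3.5",
        messages=[{"role": "user", "content": "hello"}],
    )


asyncio.run(demo())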
litellm/router.py, hunk 3: the new _print_deployment_metrics helper counts requests per deployment id and logs the accumulated table:

@@ -2124,6 +2143,32 @@ class Router:
             )
         return deployment

+    def _print_deployment_metrics(self, deployment):
+        litellm_params = deployment["litellm_params"]
+        api_base = litellm_params.get("api_base", "")
+        model = litellm_params.get("model", "")
+
+        model_id = deployment.get("model_info", {}).get("id", None)
+
+        # update self.deployment_stats
+        if model_id is not None:
+            if model_id in self.deployment_stats:
+                # only update num_requests
+                self.deployment_stats[model_id]["num_requests"] += 1
+            else:
+                self.deployment_stats[model_id] = {
+                    "api_base": api_base,
+                    "model": model,
+                    "num_requests": 1,
+                }
+        from pprint import pformat
+
+        # pretty-print the accumulated stats
+        formatted_stats = pformat(self.deployment_stats)
+
+        # log the table through the router's verbose logger
+        verbose_router_logger.info("self.deployment_stats: \n%s", formatted_stats)
+
     def flush_cache(self):
         litellm.cache = None
         self.cache.flush_cache()
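Because the stats live in instance memory, they reset whenever the router is recreated, and the full table is re-logged on every request. Putting the pieces together, a batch of requests through the two-deployment group shows how traffic was spread; a sketch, assuming the router from the first example with real credentials:

import asyncio


async def spread_check(n: int = 20):
    tasks = [
        router.acompletion(
            model="azure-gpt-3.5",
            messages=[{"role": "user", "content": "ping"}],
        )
        for _ in range(n)
    ]
    await asyncio.gather(*tasks)
    # num_requests across the deployment ids should sum to n
    for model_id, stats in router.deployment_stats.items():
        print(model_id, stats["model"], stats["num_requests"])


asyncio.run(spread_check())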