From b6172638609c5eb70c75e9331b5318a2d88e4623 Mon Sep 17 00:00:00 2001
From: ishaan-jaff
Date: Mon, 11 Mar 2024 16:22:04 -0700
Subject: [PATCH 1/2] (fix) improve mem util

---
 litellm/router.py               |  14 ---
 litellm/tests/test_mem_usage.py | 149 ++++++++++++++++++++++++++++++++
 2 files changed, 149 insertions(+), 14 deletions(-)
 create mode 100644 litellm/tests/test_mem_usage.py

diff --git a/litellm/router.py b/litellm/router.py
index e4b14dd097..995a23f543 100644
--- a/litellm/router.py
+++ b/litellm/router.py
@@ -210,9 +210,6 @@ class Router:
         self.context_window_fallbacks = (
             context_window_fallbacks or litellm.context_window_fallbacks
         )
-        self.model_exception_map: dict = (
-            {}
-        )  # dict to store model: list exceptions. self.exceptions = {"gpt-3.5": ["API KEY Error", "Rate Limit Error", "good morning error"]}
         self.total_calls: defaultdict = defaultdict(
             int
         )  # dict to store total calls made to each model
@@ -1487,17 +1484,6 @@ class Router:
                 self._set_cooldown_deployments(
                     deployment_id
                 )  # setting deployment_id in cooldown deployments
-            if metadata:
-                deployment = metadata.get("deployment", None)
-                deployment_exceptions = self.model_exception_map.get(deployment, [])
-                deployment_exceptions.append(exception_str)
-                self.model_exception_map[deployment] = deployment_exceptions
-                verbose_router_logger.debug("\nEXCEPTION FOR DEPLOYMENTS\n")
-                verbose_router_logger.debug(self.model_exception_map)
-                for model in self.model_exception_map:
-                    verbose_router_logger.debug(
-                        f"Model {model} had {len(self.model_exception_map[model])} exception"
-                    )
             if custom_llm_provider:
                 model_name = f"{custom_llm_provider}/{model_name}"
 
diff --git a/litellm/tests/test_mem_usage.py b/litellm/tests/test_mem_usage.py
new file mode 100644
index 0000000000..31e15c6d6b
--- /dev/null
+++ b/litellm/tests/test_mem_usage.py
@@ -0,0 +1,149 @@
+#### What this tests ####
+
+from memory_profiler import profile, memory_usage
+import sys, os, time
+import traceback, asyncio
+import pytest
+
+sys.path.insert(
+    0, os.path.abspath("../..")
+)  # Adds the parent directory to the system path
+import litellm
+from litellm import Router
+from concurrent.futures import ThreadPoolExecutor
+from collections import defaultdict
+from dotenv import load_dotenv
+import uuid
+import tracemalloc
+import objgraph
+
+objgraph.growth(shortnames=True)
+objgraph.show_most_common_types(limit=10)
+
+from mem_top import mem_top
+
+load_dotenv()
+
+
+model_list = [
+    {
+        "model_name": "gpt-3.5-turbo",  # openai model name
+        "litellm_params": {  # params for litellm completion/embedding call
+            "model": "azure/chatgpt-v-2",
+            "api_key": os.getenv("AZURE_API_KEY"),
+            "api_version": os.getenv("AZURE_API_VERSION"),
+            "api_base": os.getenv("AZURE_API_BASE"),
+        },
+        "tpm": 240000,
+        "rpm": 1800,
+    },
+    {
+        "model_name": "bad-model",  # openai model name
+        "litellm_params": {  # params for litellm completion/embedding call
+            "model": "azure/chatgpt-v-2",
+            "api_key": "bad-key",
+            "api_version": os.getenv("AZURE_API_VERSION"),
+            "api_base": os.getenv("AZURE_API_BASE"),
+        },
+        "tpm": 240000,
+        "rpm": 1800,
+    },
+    {
+        "model_name": "text-embedding-ada-002",
+        "litellm_params": {
+            "model": "azure/azure-embedding-model",
+            "api_key": os.environ["AZURE_API_KEY"],
+            "api_base": os.environ["AZURE_API_BASE"],
+        },
+        "tpm": 100000,
+        "rpm": 10000,
+    },
+]
+litellm.set_verbose = True
+litellm.cache = litellm.Cache(
+    type="s3", s3_bucket_name="litellm-my-test-bucket-2", s3_region_name="us-east-1"
+)
+router = Router(
+    model_list=model_list,
+    fallbacks=[
+        {"bad-model": ["gpt-3.5-turbo"]},
+    ],
+)  # type: ignore
+
+
+async def router_acompletion():
+    # embedding call
+    question = f"This is a test: {uuid.uuid4()}" * 1
+
+    response = await router.acompletion(
+        model="bad-model", messages=[{"role": "user", "content": question}]
+    )
+    print("completion-resp", response)
+    return response
+
+
+async def main():
+    for i in range(1):
+        start = time.time()
+        n = 15  # Number of concurrent tasks
+        tasks = [router_acompletion() for _ in range(n)]
+
+        chat_completions = await asyncio.gather(*tasks)
+
+        successful_completions = [c for c in chat_completions if c is not None]
+
+        # Write errors to error_log.txt
+        with open("error_log.txt", "a") as error_log:
+            for completion in chat_completions:
+                if isinstance(completion, str):
+                    error_log.write(completion + "\n")
+
+        print(n, time.time() - start, len(successful_completions))
+        print()
+        print(vars(router))
+
+
+if __name__ == "__main__":
+    # Blank out contents of error_log.txt
+    open("error_log.txt", "w").close()
+
+    import tracemalloc
+
+    tracemalloc.start(25)
+
+    # ... run your application ...
+
+    asyncio.run(main())
+    print(mem_top())
+
+    snapshot = tracemalloc.take_snapshot()
+    # top_stats = snapshot.statistics('lineno')
+
+    # print("[ Top 10 ]")
+    # for stat in top_stats[:50]:
+    #     print(stat)
+
+    top_stats = snapshot.statistics("traceback")
+
+    # pick the biggest memory block
+    stat = top_stats[0]
+    print("%s memory blocks: %.1f KiB" % (stat.count, stat.size / 1024))
+    for line in stat.traceback.format():
+        print(line)
+    print()
+    stat = top_stats[1]
+    print("%s memory blocks: %.1f KiB" % (stat.count, stat.size / 1024))
+    for line in stat.traceback.format():
+        print(line)
+
+    print()
+    stat = top_stats[2]
+    print("%s memory blocks: %.1f KiB" % (stat.count, stat.size / 1024))
+    for line in stat.traceback.format():
+        print(line)
+    print()
+
+    stat = top_stats[3]
+    print("%s memory blocks: %.1f KiB" % (stat.count, stat.size / 1024))
+    for line in stat.traceback.format():
+        print(line)

From e9b511338a7498eceb467307f2921b625326bfea Mon Sep 17 00:00:00 2001
From: ishaan-jaff
Date: Mon, 11 Mar 2024 16:38:31 -0700
Subject: [PATCH 2/2] (fix) test_mem_usage

---
 litellm/tests/test_mem_usage.py | 242 ++++++++++++++++----------------
 1 file changed, 121 insertions(+), 121 deletions(-)

diff --git a/litellm/tests/test_mem_usage.py b/litellm/tests/test_mem_usage.py
index 31e15c6d6b..95bf3993f7 100644
--- a/litellm/tests/test_mem_usage.py
+++ b/litellm/tests/test_mem_usage.py
@@ -1,149 +1,149 @@
-#### What this tests ####
+# #### What this tests ####
 
-from memory_profiler import profile, memory_usage
-import sys, os, time
-import traceback, asyncio
-import pytest
+# from memory_profiler import profile, memory_usage
+# import sys, os, time
+# import traceback, asyncio
+# import pytest
 
-sys.path.insert(
-    0, os.path.abspath("../..")
-)  # Adds the parent directory to the system path
-import litellm
-from litellm import Router
-from concurrent.futures import ThreadPoolExecutor
-from collections import defaultdict
-from dotenv import load_dotenv
-import uuid
-import tracemalloc
-import objgraph
+# sys.path.insert(
+#     0, os.path.abspath("../..")
+# )  # Adds the parent directory to the system path
+# import litellm
+# from litellm import Router
+# from concurrent.futures import ThreadPoolExecutor
+# from collections import defaultdict
+# from dotenv import load_dotenv
+# import uuid
+# import tracemalloc
+# import objgraph
 
-objgraph.growth(shortnames=True)
-objgraph.show_most_common_types(limit=10)
+# objgraph.growth(shortnames=True)
+# objgraph.show_most_common_types(limit=10)
 
-from mem_top import mem_top
+# from mem_top import mem_top
 
-load_dotenv()
+# load_dotenv()
 
 
-model_list = [
-    {
-        "model_name": "gpt-3.5-turbo",  # openai model name
-        "litellm_params": {  # params for litellm completion/embedding call
-            "model": "azure/chatgpt-v-2",
-            "api_key": os.getenv("AZURE_API_KEY"),
-            "api_version": os.getenv("AZURE_API_VERSION"),
-            "api_base": os.getenv("AZURE_API_BASE"),
-        },
-        "tpm": 240000,
-        "rpm": 1800,
-    },
-    {
-        "model_name": "bad-model",  # openai model name
-        "litellm_params": {  # params for litellm completion/embedding call
-            "model": "azure/chatgpt-v-2",
-            "api_key": "bad-key",
-            "api_version": os.getenv("AZURE_API_VERSION"),
-            "api_base": os.getenv("AZURE_API_BASE"),
-        },
-        "tpm": 240000,
-        "rpm": 1800,
-    },
-    {
-        "model_name": "text-embedding-ada-002",
-        "litellm_params": {
-            "model": "azure/azure-embedding-model",
-            "api_key": os.environ["AZURE_API_KEY"],
-            "api_base": os.environ["AZURE_API_BASE"],
-        },
-        "tpm": 100000,
-        "rpm": 10000,
-    },
-]
-litellm.set_verbose = True
-litellm.cache = litellm.Cache(
-    type="s3", s3_bucket_name="litellm-my-test-bucket-2", s3_region_name="us-east-1"
-)
-router = Router(
-    model_list=model_list,
-    fallbacks=[
-        {"bad-model": ["gpt-3.5-turbo"]},
-    ],
-)  # type: ignore
+# model_list = [
+#     {
+#         "model_name": "gpt-3.5-turbo",  # openai model name
+#         "litellm_params": {  # params for litellm completion/embedding call
+#             "model": "azure/chatgpt-v-2",
+#             "api_key": os.getenv("AZURE_API_KEY"),
+#             "api_version": os.getenv("AZURE_API_VERSION"),
+#             "api_base": os.getenv("AZURE_API_BASE"),
+#         },
+#         "tpm": 240000,
+#         "rpm": 1800,
+#     },
+#     {
+#         "model_name": "bad-model",  # openai model name
+#         "litellm_params": {  # params for litellm completion/embedding call
+#             "model": "azure/chatgpt-v-2",
+#             "api_key": "bad-key",
+#             "api_version": os.getenv("AZURE_API_VERSION"),
+#             "api_base": os.getenv("AZURE_API_BASE"),
+#         },
+#         "tpm": 240000,
+#         "rpm": 1800,
+#     },
+#     {
+#         "model_name": "text-embedding-ada-002",
+#         "litellm_params": {
+#             "model": "azure/azure-embedding-model",
+#             "api_key": os.environ["AZURE_API_KEY"],
+#             "api_base": os.environ["AZURE_API_BASE"],
+#         },
+#         "tpm": 100000,
+#         "rpm": 10000,
+#     },
+# ]
+# litellm.set_verbose = True
+# litellm.cache = litellm.Cache(
+#     type="s3", s3_bucket_name="litellm-my-test-bucket-2", s3_region_name="us-east-1"
+# )
+# router = Router(
+#     model_list=model_list,
+#     fallbacks=[
+#         {"bad-model": ["gpt-3.5-turbo"]},
+#     ],
+# )  # type: ignore
 
 
-async def router_acompletion():
-    # embedding call
-    question = f"This is a test: {uuid.uuid4()}" * 1
+# async def router_acompletion():
+#     # embedding call
+#     question = f"This is a test: {uuid.uuid4()}" * 1
 
-    response = await router.acompletion(
-        model="bad-model", messages=[{"role": "user", "content": question}]
-    )
-    print("completion-resp", response)
-    return response
+#     response = await router.acompletion(
+#         model="bad-model", messages=[{"role": "user", "content": question}]
+#     )
+#     print("completion-resp", response)
+#     return response
 
 
-async def main():
-    for i in range(1):
-        start = time.time()
-        n = 15  # Number of concurrent tasks
-        tasks = [router_acompletion() for _ in range(n)]
+# async def main():
+#     for i in range(1):
+#         start = time.time()
+#         n = 15  # Number of concurrent tasks
+#         tasks = [router_acompletion() for _ in range(n)]
 
-        chat_completions = await asyncio.gather(*tasks)
+#         chat_completions = await asyncio.gather(*tasks)
 
-        successful_completions = [c for c in chat_completions if c is not None]
+#         successful_completions = [c for c in chat_completions if c is not None]
 
-        # Write errors to error_log.txt
-        with open("error_log.txt", "a") as error_log:
-            for completion in chat_completions:
-                if isinstance(completion, str):
-                    error_log.write(completion + "\n")
+#         # Write errors to error_log.txt
+#         with open("error_log.txt", "a") as error_log:
+#             for completion in chat_completions:
+#                 if isinstance(completion, str):
+#                     error_log.write(completion + "\n")
 
-        print(n, time.time() - start, len(successful_completions))
-        print()
-        print(vars(router))
+#         print(n, time.time() - start, len(successful_completions))
+#         print()
+#         print(vars(router))
 
 
-if __name__ == "__main__":
-    # Blank out contents of error_log.txt
-    open("error_log.txt", "w").close()
+# if __name__ == "__main__":
+#     # Blank out contents of error_log.txt
+#     open("error_log.txt", "w").close()
 
-    import tracemalloc
+#     import tracemalloc
 
-    tracemalloc.start(25)
+#     tracemalloc.start(25)
 
-    # ... run your application ...
+#     # ... run your application ...
 
-    asyncio.run(main())
-    print(mem_top())
+#     asyncio.run(main())
+#     print(mem_top())
 
-    snapshot = tracemalloc.take_snapshot()
-    # top_stats = snapshot.statistics('lineno')
+#     snapshot = tracemalloc.take_snapshot()
+#     # top_stats = snapshot.statistics('lineno')
 
-    # print("[ Top 10 ]")
-    # for stat in top_stats[:50]:
-    #     print(stat)
+#     # print("[ Top 10 ]")
+#     # for stat in top_stats[:50]:
+#     #     print(stat)
 
-    top_stats = snapshot.statistics("traceback")
+#     top_stats = snapshot.statistics("traceback")
 
-    # pick the biggest memory block
-    stat = top_stats[0]
-    print("%s memory blocks: %.1f KiB" % (stat.count, stat.size / 1024))
-    for line in stat.traceback.format():
-        print(line)
-    print()
-    stat = top_stats[1]
-    print("%s memory blocks: %.1f KiB" % (stat.count, stat.size / 1024))
-    for line in stat.traceback.format():
-        print(line)
+#     # pick the biggest memory block
+#     stat = top_stats[0]
+#     print("%s memory blocks: %.1f KiB" % (stat.count, stat.size / 1024))
+#     for line in stat.traceback.format():
+#         print(line)
+#     print()
+#     stat = top_stats[1]
+#     print("%s memory blocks: %.1f KiB" % (stat.count, stat.size / 1024))
+#     for line in stat.traceback.format():
+#         print(line)
 
-    print()
-    stat = top_stats[2]
-    print("%s memory blocks: %.1f KiB" % (stat.count, stat.size / 1024))
-    for line in stat.traceback.format():
-        print(line)
-    print()
+#     print()
+#     stat = top_stats[2]
+#     print("%s memory blocks: %.1f KiB" % (stat.count, stat.size / 1024))
+#     for line in stat.traceback.format():
+#         print(line)
+#     print()
 
-    stat = top_stats[3]
-    print("%s memory blocks: %.1f KiB" % (stat.count, stat.size / 1024))
-    for line in stat.traceback.format():
-        print(line)
+#     stat = top_stats[3]
+#     print("%s memory blocks: %.1f KiB" % (stat.count, stat.size / 1024))
+#     for line in stat.traceback.format():
+#         print(line)