Merge pull request #2461 from BerriAI/litellm_improve_mem_use
LiteLLM - improve memory utilization - don't use inMemCache on Router

Commit fa655d62fb
2 changed files with 149 additions and 14 deletions
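The hunks below drop Router.model_exception_map, a per-deployment list that appended every exception string for the lifetime of the process. A minimal sketch of that growth pattern, using hypothetical names rather than the Router's actual internals:

# Sketch of the unbounded-growth pattern removed below (hypothetical names, not LiteLLM code).
from collections import defaultdict

exception_map: dict = defaultdict(list)

def record_failure(deployment: str, err: Exception) -> None:
    # every failure retains another string; nothing is ever evicted,
    # so a long-running router with flaky deployments grows without bound
    exception_map[deployment].append(str(err))

for i in range(100_000):
    record_failure("azure/chatgpt-v-2", RuntimeError(f"rate limit hit on request {i}"))

print(sum(len(v) for v in exception_map.values()), "retained exception strings")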
litellm/router.py

@@ -210,9 +210,6 @@ class Router:
         self.context_window_fallbacks = (
             context_window_fallbacks or litellm.context_window_fallbacks
         )
-        self.model_exception_map: dict = (
-            {}
-        )  # dict to store model: list exceptions. self.exceptions = {"gpt-3.5": ["API KEY Error", "Rate Limit Error", "good morning error"]}
         self.total_calls: defaultdict = defaultdict(
             int
         )  # dict to store total calls made to each model
@@ -1524,17 +1521,6 @@ class Router:
                 self._set_cooldown_deployments(
                     deployment_id
                 )  # setting deployment_id in cooldown deployments
-            if metadata:
-                deployment = metadata.get("deployment", None)
-                deployment_exceptions = self.model_exception_map.get(deployment, [])
-                deployment_exceptions.append(exception_str)
-                self.model_exception_map[deployment] = deployment_exceptions
-                verbose_router_logger.debug("\nEXCEPTION FOR DEPLOYMENTS\n")
-                verbose_router_logger.debug(self.model_exception_map)
-                for model in self.model_exception_map:
-                    verbose_router_logger.debug(
-                        f"Model {model} had {len(self.model_exception_map[model])} exception"
-                    )
             if custom_llm_provider:
                 model_name = f"{custom_llm_provider}/{model_name}"
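If per-deployment exception history were still wanted, a bounded structure would cap the retained entries. This is a hypothetical alternative, not what this PR does (the PR removes the map entirely):

# Hypothetical bounded variant: keep only the most recent N exception strings per deployment.
from collections import defaultdict, deque

MAX_EXCEPTIONS_PER_DEPLOYMENT = 50  # illustrative cap, not a LiteLLM setting

bounded_exception_map = defaultdict(lambda: deque(maxlen=MAX_EXCEPTIONS_PER_DEPLOYMENT))

def record_failure(deployment: str, exception_str: str) -> None:
    # deque(maxlen=...) evicts the oldest entry once the cap is hit,
    # so memory stays constant no matter how many calls fail
    bounded_exception_map[deployment].append(exception_str)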
litellm/tests/test_mem_usage.py (new file, 149 lines)
@@ -0,0 +1,149 @@
# #### What this tests ####

# from memory_profiler import profile, memory_usage
# import sys, os, time
# import traceback, asyncio
# import pytest

# sys.path.insert(
#     0, os.path.abspath("../..")
# )  # Adds the parent directory to the system path
# import litellm
# from litellm import Router
# from concurrent.futures import ThreadPoolExecutor
# from collections import defaultdict
# from dotenv import load_dotenv
# import uuid
# import tracemalloc
# import objgraph

# objgraph.growth(shortnames=True)
# objgraph.show_most_common_types(limit=10)

# from mem_top import mem_top

# load_dotenv()


# model_list = [
#     {
#         "model_name": "gpt-3.5-turbo",  # openai model name
#         "litellm_params": {  # params for litellm completion/embedding call
#             "model": "azure/chatgpt-v-2",
#             "api_key": os.getenv("AZURE_API_KEY"),
#             "api_version": os.getenv("AZURE_API_VERSION"),
#             "api_base": os.getenv("AZURE_API_BASE"),
#         },
#         "tpm": 240000,
#         "rpm": 1800,
#     },
#     {
#         "model_name": "bad-model",  # openai model name
#         "litellm_params": {  # params for litellm completion/embedding call
#             "model": "azure/chatgpt-v-2",
#             "api_key": "bad-key",
#             "api_version": os.getenv("AZURE_API_VERSION"),
#             "api_base": os.getenv("AZURE_API_BASE"),
#         },
#         "tpm": 240000,
#         "rpm": 1800,
#     },
#     {
#         "model_name": "text-embedding-ada-002",
#         "litellm_params": {
#             "model": "azure/azure-embedding-model",
#             "api_key": os.environ["AZURE_API_KEY"],
#             "api_base": os.environ["AZURE_API_BASE"],
#         },
#         "tpm": 100000,
#         "rpm": 10000,
#     },
# ]
# litellm.set_verbose = True
# litellm.cache = litellm.Cache(
#     type="s3", s3_bucket_name="litellm-my-test-bucket-2", s3_region_name="us-east-1"
# )
# router = Router(
#     model_list=model_list,
#     fallbacks=[
#         {"bad-model": ["gpt-3.5-turbo"]},
#     ],
# )  # type: ignore


# async def router_acompletion():
#     # embedding call
#     question = f"This is a test: {uuid.uuid4()}" * 1

#     response = await router.acompletion(
#         model="bad-model", messages=[{"role": "user", "content": question}]
#     )
#     print("completion-resp", response)
#     return response


# async def main():
#     for i in range(1):
#         start = time.time()
#         n = 15  # Number of concurrent tasks
#         tasks = [router_acompletion() for _ in range(n)]

#         chat_completions = await asyncio.gather(*tasks)

#         successful_completions = [c for c in chat_completions if c is not None]

#         # Write errors to error_log.txt
#         with open("error_log.txt", "a") as error_log:
#             for completion in chat_completions:
#                 if isinstance(completion, str):
#                     error_log.write(completion + "\n")

#         print(n, time.time() - start, len(successful_completions))
#         print()
#         print(vars(router))


# if __name__ == "__main__":
#     # Blank out contents of error_log.txt
#     open("error_log.txt", "w").close()

#     import tracemalloc

#     tracemalloc.start(25)

#     # ... run your application ...

#     asyncio.run(main())
#     print(mem_top())

#     snapshot = tracemalloc.take_snapshot()
#     # top_stats = snapshot.statistics('lineno')

#     # print("[ Top 10 ]")
#     # for stat in top_stats[:50]:
#     #     print(stat)

#     top_stats = snapshot.statistics("traceback")

#     # pick the biggest memory block
#     stat = top_stats[0]
#     print("%s memory blocks: %.1f KiB" % (stat.count, stat.size / 1024))
#     for line in stat.traceback.format():
#         print(line)
#     print()
#     stat = top_stats[1]
#     print("%s memory blocks: %.1f KiB" % (stat.count, stat.size / 1024))
#     for line in stat.traceback.format():
#         print(line)

#     print()
#     stat = top_stats[2]
#     print("%s memory blocks: %.1f KiB" % (stat.count, stat.size / 1024))
#     for line in stat.traceback.format():
#         print(line)
#     print()

#     stat = top_stats[3]
#     print("%s memory blocks: %.1f KiB" % (stat.count, stat.size / 1024))
#     for line in stat.traceback.format():
#         print(line)