diff --git a/litellm/__init__.py b/litellm/__init__.py
index 60b8cf81a0..a3756251d1 100644
--- a/litellm/__init__.py
+++ b/litellm/__init__.py
@@ -308,7 +308,7 @@ _key_management_settings: KeyManagementSettings = KeyManagementSettings()
 #### PII MASKING ####
 output_parse_pii: bool = False
 #############################################
-from litellm.litellm_core_utils.get_model_cost_map import get_model_cost_map
+from litellm.litellm_core_utils.get_model_cost_map import get_model_cost_map, get_locally_cached_model_cost_map
 model_cost = get_model_cost_map(url=model_cost_map_url)
 custom_prompt_dict: Dict[str, dict] = {}
diff --git a/litellm/litellm_core_utils/get_model_cost_map.py b/litellm/litellm_core_utils/get_model_cost_map.py
index b8bdaee19c..0e14457b2a 100644
--- a/litellm/litellm_core_utils/get_model_cost_map.py
+++ b/litellm/litellm_core_utils/get_model_cost_map.py
@@ -8,24 +8,29 @@
 export LITELLM_LOCAL_MODEL_COST_MAP=True
 ```
 """
+from functools import cache
 import os
 
 import httpx
 
 
+@cache
+def get_locally_cached_model_cost_map():
+    import importlib.resources
+    import json
+
+    with importlib.resources.open_text(
+        "litellm", "model_prices_and_context_window_backup.json"
+    ) as f:
+        content = json.load(f)
+    return content
+
+
 def get_model_cost_map(url: str):
     if (
         os.getenv("LITELLM_LOCAL_MODEL_COST_MAP", False)
         or os.getenv("LITELLM_LOCAL_MODEL_COST_MAP", False) == "True"
     ):
-        import importlib.resources
-        import json
-
-        with importlib.resources.open_text(
-            "litellm", "model_prices_and_context_window_backup.json"
-        ) as f:
-            content = json.load(f)
-        return content
+        return get_locally_cached_model_cost_map()
 
     try:
         response = httpx.get(
diff --git a/model_prices_and_context_window.json b/model_prices_and_context_window.json
index 96076fa3b8..961b55f49b 100644
--- a/model_prices_and_context_window.json
+++ b/model_prices_and_context_window.json
@@ -6,7 +6,7 @@
         "input_cost_per_token": 0.0000,
         "output_cost_per_token": 0.000,
         "litellm_provider": "one of https://docs.litellm.ai/docs/providers",
-        "mode": "one of chat, embedding, completion, image_generation, audio_transcription, audio_speech",
+        "mode": "one of: chat, embedding, completion, image_generation, audio_transcription, audio_speech, moderation, moderations, rerank",
         "supports_function_calling": true,
         "supports_parallel_function_calling": true,
         "supports_vision": true,
diff --git a/tests/code_coverage_tests/bedrock_pricing.py b/tests/code_coverage_tests/bedrock_pricing.py
index b2c9e78b06..9984cb8b0e 100644
--- a/tests/code_coverage_tests/bedrock_pricing.py
+++ b/tests/code_coverage_tests/bedrock_pricing.py
@@ -191,8 +191,7 @@ def _check_if_model_name_in_pricing(
     input_cost_per_1k_tokens: str,
     output_cost_per_1k_tokens: str,
 ):
-    os.environ["LITELLM_LOCAL_MODEL_COST_MAP"] = "True"
-    litellm.model_cost = litellm.get_model_cost_map(url="")
+    litellm.model_cost = litellm.get_locally_cached_model_cost_map()
 
     for model, value in litellm.model_cost.items():
         if model.startswith(bedrock_model_name):
diff --git a/tests/litellm_utils_tests/test_utils.py b/tests/litellm_utils_tests/test_utils.py
index 2b1e78a681..fd8ad01c8b 100644
--- a/tests/litellm_utils_tests/test_utils.py
+++ b/tests/litellm_utils_tests/test_utils.py
@@ -907,8 +907,7 @@ def test_supports_response_schema(model, expected_bool):
     Should be true for gemini-1.5-pro on google ai studio / vertex ai AND predibase models
     Should be false otherwise
     """
-    os.environ["LITELLM_LOCAL_MODEL_COST_MAP"] = "True"
-    litellm.model_cost = litellm.get_model_cost_map(url="")
+    litellm.model_cost = litellm.get_locally_cached_model_cost_map()
 
     from litellm.utils import supports_response_schema
 
@@ -1066,8 +1065,7 @@ def test_async_http_handler_force_ipv4(mock_async_client):
     "model, expected_bool", [("gpt-3.5-turbo", False), ("gpt-4o-audio-preview", True)]
 )
 def test_supports_audio_input(model, expected_bool):
-    os.environ["LITELLM_LOCAL_MODEL_COST_MAP"] = "True"
-    litellm.model_cost = litellm.get_model_cost_map(url="")
+    litellm.model_cost = litellm.get_locally_cached_model_cost_map()
 
     from litellm.utils import supports_audio_input, supports_audio_output
 
@@ -1165,8 +1163,7 @@ def test_models_by_provider():
     """
     Make sure all providers from model map are in the valid providers list
     """
-    os.environ["LITELLM_LOCAL_MODEL_COST_MAP"] = "True"
-    litellm.model_cost = litellm.get_model_cost_map(url="")
+    litellm.model_cost = litellm.get_locally_cached_model_cost_map()
 
     from litellm import models_by_provider
 
@@ -1484,8 +1481,7 @@ def test_get_valid_models_default(monkeypatch):
 
 
 def test_supports_vision_gemini():
-    os.environ["LITELLM_LOCAL_MODEL_COST_MAP"] = "True"
-    litellm.model_cost = litellm.get_model_cost_map(url="")
+    litellm.model_cost = litellm.get_locally_cached_model_cost_map()
     from litellm.utils import supports_vision
 
     assert supports_vision("gemini-1.5-pro") is True
diff --git a/tests/llm_translation/base_embedding_unit_tests.py b/tests/llm_translation/base_embedding_unit_tests.py
index 30a9dcc0da..1fcc825481 100644
--- a/tests/llm_translation/base_embedding_unit_tests.py
+++ b/tests/llm_translation/base_embedding_unit_tests.py
@@ -84,8 +84,7 @@ class BaseLLMEmbeddingTest(ABC):
         litellm.set_verbose = True
         from litellm.utils import supports_embedding_image_input
 
-        os.environ["LITELLM_LOCAL_MODEL_COST_MAP"] = "True"
-        litellm.model_cost = litellm.get_model_cost_map(url="")
+        litellm.model_cost = litellm.get_locally_cached_model_cost_map()
 
         base_embedding_call_args = self.get_base_embedding_call_args()
         if not supports_embedding_image_input(base_embedding_call_args["model"], None):
diff --git a/tests/llm_translation/base_llm_unit_tests.py b/tests/llm_translation/base_llm_unit_tests.py
index f91ef0eae9..eb18cbce90 100644
--- a/tests/llm_translation/base_llm_unit_tests.py
+++ b/tests/llm_translation/base_llm_unit_tests.py
@@ -342,8 +342,7 @@ class BaseLLMChatTest(ABC):
         from pydantic import BaseModel
         from litellm.utils import supports_response_schema
 
-        os.environ["LITELLM_LOCAL_MODEL_COST_MAP"] = "True"
-        litellm.model_cost = litellm.get_model_cost_map(url="")
+        litellm.model_cost = litellm.get_locally_cached_model_cost_map()
 
         class TestModel(BaseModel):
             first_response: str
 
@@ -382,16 +381,14 @@ class BaseLLMChatTest(ABC):
         from pydantic import BaseModel
         from litellm.utils import supports_response_schema
 
-        os.environ["LITELLM_LOCAL_MODEL_COST_MAP"] = "True"
-        litellm.model_cost = litellm.get_model_cost_map(url="")
+        litellm.model_cost = litellm.get_locally_cached_model_cost_map()
 
     @pytest.mark.flaky(retries=6, delay=1)
     def test_json_response_nested_pydantic_obj(self):
         from pydantic import BaseModel
         from litellm.utils import supports_response_schema
 
-        os.environ["LITELLM_LOCAL_MODEL_COST_MAP"] = "True"
-        litellm.model_cost = litellm.get_model_cost_map(url="")
+        litellm.model_cost = litellm.get_locally_cached_model_cost_map()
 
         class CalendarEvent(BaseModel):
             name: str
 
@@ -438,8 +435,7 @@ class BaseLLMChatTest(ABC):
         from litellm.utils import supports_response_schema
         from litellm.llms.base_llm.base_utils import type_to_response_format_param
 
-        os.environ["LITELLM_LOCAL_MODEL_COST_MAP"] = "True"
-        litellm.model_cost = litellm.get_model_cost_map(url="")
+        litellm.model_cost = litellm.get_locally_cached_model_cost_map()
 
         class CalendarEvent(BaseModel):
             name: str
 
@@ -560,8 +556,7 @@ class BaseLLMChatTest(ABC):
         litellm.set_verbose = True
         from litellm.utils import supports_vision
 
-        os.environ["LITELLM_LOCAL_MODEL_COST_MAP"] = "True"
-        litellm.model_cost = litellm.get_model_cost_map(url="")
+        litellm.model_cost = litellm.get_locally_cached_model_cost_map()
 
         base_completion_call_args = self.get_base_completion_call_args()
         if not supports_vision(base_completion_call_args["model"], None):
@@ -615,8 +610,7 @@ class BaseLLMChatTest(ABC):
         litellm.set_verbose = True
         from litellm.utils import supports_vision
 
-        os.environ["LITELLM_LOCAL_MODEL_COST_MAP"] = "True"
-        litellm.model_cost = litellm.get_model_cost_map(url="")
+        litellm.model_cost = litellm.get_locally_cached_model_cost_map()
 
         image_url = "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"
 
@@ -656,8 +650,7 @@ class BaseLLMChatTest(ABC):
         litellm.set_verbose = True
         from litellm.utils import supports_prompt_caching
 
-        os.environ["LITELLM_LOCAL_MODEL_COST_MAP"] = "True"
-        litellm.model_cost = litellm.get_model_cost_map(url="")
+        litellm.model_cost = litellm.get_locally_cached_model_cost_map()
 
         base_completion_call_args = self.get_base_completion_call_args()
         if not supports_prompt_caching(base_completion_call_args["model"], None):
@@ -773,8 +766,7 @@ class BaseLLMChatTest(ABC):
         litellm._turn_on_debug()
         from litellm.utils import supports_function_calling
 
-        os.environ["LITELLM_LOCAL_MODEL_COST_MAP"] = "True"
-        litellm.model_cost = litellm.get_model_cost_map(url="")
+        litellm.model_cost = litellm.get_locally_cached_model_cost_map()
 
         base_completion_call_args = self.get_base_completion_call_args()
         if not supports_function_calling(base_completion_call_args["model"], None):
@@ -872,8 +864,7 @@ class BaseLLMChatTest(ABC):
     async def test_completion_cost(self):
         from litellm import completion_cost
 
-        os.environ["LITELLM_LOCAL_MODEL_COST_MAP"] = "True"
-        litellm.model_cost = litellm.get_model_cost_map(url="")
+        litellm.model_cost = litellm.get_locally_cached_model_cost_map()
 
         litellm.set_verbose = True
         response = await self.async_completion_function(
diff --git a/tests/llm_translation/base_rerank_unit_tests.py b/tests/llm_translation/base_rerank_unit_tests.py
index cff4a02753..b3f56f7c64 100644
--- a/tests/llm_translation/base_rerank_unit_tests.py
+++ b/tests/llm_translation/base_rerank_unit_tests.py
@@ -87,8 +87,7 @@ class BaseLLMRerankTest(ABC):
     @pytest.mark.parametrize("sync_mode", [True, False])
     async def test_basic_rerank(self, sync_mode):
         litellm._turn_on_debug()
-        os.environ["LITELLM_LOCAL_MODEL_COST_MAP"] = "True"
-        litellm.model_cost = litellm.get_model_cost_map(url="")
+        litellm.model_cost = litellm.get_locally_cached_model_cost_map()
         rerank_call_args = self.get_base_rerank_call_args()
         custom_llm_provider = self.get_custom_llm_provider()
         if sync_mode is True:
diff --git a/tests/llm_translation/test_anthropic_completion.py b/tests/llm_translation/test_anthropic_completion.py
index 37253a37e6..04158b4ab4 100644
--- a/tests/llm_translation/test_anthropic_completion.py
+++ b/tests/llm_translation/test_anthropic_completion.py
@@ -693,8 +693,7 @@ class TestAnthropicCompletion(BaseLLMChatTest):
         from pydantic import BaseModel
         from litellm.utils import supports_response_schema
 
-        os.environ["LITELLM_LOCAL_MODEL_COST_MAP"] = "True"
-        litellm.model_cost = litellm.get_model_cost_map(url="")
+        litellm.model_cost = litellm.get_locally_cached_model_cost_map()
 
         class RFormat(BaseModel):
             question: str
diff --git a/tests/llm_translation/test_bedrock_completion.py b/tests/llm_translation/test_bedrock_completion.py
index 2fb0ffb9e5..99e4e7ed1a 100644
--- a/tests/llm_translation/test_bedrock_completion.py
+++ b/tests/llm_translation/test_bedrock_completion.py
@@ -1975,8 +1975,7 @@ def test_bedrock_converse_route():
 
 def test_bedrock_mapped_converse_models():
     litellm.set_verbose = True
-    os.environ["LITELLM_LOCAL_MODEL_COST_MAP"] = "True"
-    litellm.model_cost = litellm.get_model_cost_map(url="")
+    litellm.model_cost = litellm.get_locally_cached_model_cost_map()
     litellm.add_known_models()
     litellm.completion(
         model="bedrock/us.amazon.nova-pro-v1:0",
@@ -2108,8 +2107,7 @@ def test_bedrock_supports_tool_call(model, expected_supports_tool_call):
 
 class TestBedrockConverseChatCrossRegion(BaseLLMChatTest):
     def get_base_completion_call_args(self) -> dict:
-        os.environ["LITELLM_LOCAL_MODEL_COST_MAP"] = "True"
-        litellm.model_cost = litellm.get_model_cost_map(url="")
+        litellm.model_cost = litellm.get_locally_cached_model_cost_map()
         litellm.add_known_models()
         return {
             "model": "bedrock/us.anthropic.claude-3-5-sonnet-20241022-v2:0",
@@ -2137,8 +2135,7 @@ class TestBedrockConverseChatCrossRegion(BaseLLMChatTest):
         """
         Test if region models info is correctly used for cost calculation. Using the base model info for cost calculation.
         """
-        os.environ["LITELLM_LOCAL_MODEL_COST_MAP"] = "True"
-        litellm.model_cost = litellm.get_model_cost_map(url="")
+        litellm.model_cost = litellm.get_locally_cached_model_cost_map()
         bedrock_model = "us.anthropic.claude-3-5-sonnet-20241022-v2:0"
         litellm.model_cost.pop(bedrock_model, None)
         model = f"bedrock/{bedrock_model}"
@@ -2155,8 +2152,7 @@ class TestBedrockConverseChatCrossRegion(BaseLLMChatTest):
 
 class TestBedrockConverseChatNormal(BaseLLMChatTest):
     def get_base_completion_call_args(self) -> dict:
-        os.environ["LITELLM_LOCAL_MODEL_COST_MAP"] = "True"
-        litellm.model_cost = litellm.get_model_cost_map(url="")
+        litellm.model_cost = litellm.get_locally_cached_model_cost_map()
         litellm.add_known_models()
         return {
             "model": "bedrock/anthropic.claude-3-5-sonnet-20240620-v1:0",
@@ -2325,8 +2321,7 @@ def test_bedrock_nova_topk(top_k_param):
 
 def test_bedrock_cross_region_inference(monkeypatch):
     from litellm.llms.custom_httpx.http_handler import HTTPHandler
 
-    monkeypatch.setenv("LITELLM_LOCAL_MODEL_COST_MAP", "True")
-    litellm.model_cost = litellm.get_model_cost_map(url="")
+    litellm.model_cost = litellm.get_locally_cached_model_cost_map()
     litellm.add_known_models()
     litellm.set_verbose = True
 
diff --git a/tests/llm_translation/test_openai_o1.py b/tests/llm_translation/test_openai_o1.py
index 4208f1ae38..bcd7648f2a 100644
--- a/tests/llm_translation/test_openai_o1.py
+++ b/tests/llm_translation/test_openai_o1.py
@@ -29,8 +29,7 @@ async def test_o1_handle_system_role(model):
     from openai import AsyncOpenAI
     from litellm.utils import supports_system_messages
 
-    os.environ["LITELLM_LOCAL_MODEL_COST_MAP"] = "True"
-    litellm.model_cost = litellm.get_model_cost_map(url="")
+    litellm.model_cost = litellm.get_locally_cached_model_cost_map()
 
     litellm.set_verbose = True
 
@@ -83,8 +82,7 @@ async def test_o1_handle_tool_calling_optional_params(
     from litellm.utils import ProviderConfigManager
     from litellm.types.utils import LlmProviders
 
-    os.environ["LITELLM_LOCAL_MODEL_COST_MAP"] = "True"
-    litellm.model_cost = litellm.get_model_cost_map(url="")
+    litellm.model_cost = litellm.get_locally_cached_model_cost_map()
 
     config = ProviderConfigManager.get_provider_chat_config(
         model=model, provider=LlmProviders.OPENAI
     )
@@ -190,8 +188,7 @@ class TestOpenAIO3(BaseOSeriesModelsTest, BaseLLMChatTest):
 
 def test_o1_supports_vision():
     """Test that o1 supports vision"""
-    os.environ["LITELLM_LOCAL_MODEL_COST_MAP"] = "True"
-    litellm.model_cost = litellm.get_model_cost_map(url="")
+    litellm.model_cost = litellm.get_locally_cached_model_cost_map()
     for k, v in litellm.model_cost.items():
         if k.startswith("o1") and v.get("litellm_provider") == "openai":
             assert v.get("supports_vision") is True, f"{k} does not support vision"
diff --git a/tests/llm_translation/test_rerank.py b/tests/llm_translation/test_rerank.py
index d2cb2b6fea..ef5df795ab 100644
--- a/tests/llm_translation/test_rerank.py
+++ b/tests/llm_translation/test_rerank.py
@@ -274,8 +274,7 @@ class TestLogger(CustomLogger):
 
 @pytest.mark.asyncio()
 async def test_rerank_custom_callbacks():
-    os.environ["LITELLM_LOCAL_MODEL_COST_MAP"] = "True"
-    litellm.model_cost = litellm.get_model_cost_map(url="")
+    litellm.model_cost = litellm.get_locally_cached_model_cost_map()
     custom_logger = TestLogger()
     litellm.callbacks = [custom_logger]
diff --git a/tests/llm_translation/test_together_ai.py b/tests/llm_translation/test_together_ai.py
index b83a700002..f275500817 100644
--- a/tests/llm_translation/test_together_ai.py
+++ b/tests/llm_translation/test_together_ai.py
@@ -42,8 +42,7 @@ class TestTogetherAI(BaseLLMChatTest):
     def test_get_supported_response_format_together_ai(
         self, model: str, expected_bool: bool
     ) -> None:
-        os.environ["LITELLM_LOCAL_MODEL_COST_MAP"] = "True"
-        litellm.model_cost = litellm.get_model_cost_map(url="")
+        litellm.model_cost = litellm.get_locally_cached_model_cost_map()
         optional_params = litellm.get_supported_openai_params(
             model, custom_llm_provider="together_ai"
         )
diff --git a/tests/local_testing/test_amazing_vertex_completion.py b/tests/local_testing/test_amazing_vertex_completion.py
index 02e0c9b2f1..d59df956be 100644
--- a/tests/local_testing/test_amazing_vertex_completion.py
+++ b/tests/local_testing/test_amazing_vertex_completion.py
@@ -1433,8 +1433,7 @@ async def test_gemini_pro_json_schema_args_sent_httpx(
     enforce_validation,
 ):
     load_vertex_ai_credentials()
-    os.environ["LITELLM_LOCAL_MODEL_COST_MAP"] = "True"
-    litellm.model_cost = litellm.get_model_cost_map(url="")
+    litellm.model_cost = litellm.get_locally_cached_model_cost_map()
     litellm.set_verbose = True
 
     messages = [{"role": "user", "content": "List 5 cookie recipes"}]
@@ -1554,8 +1553,7 @@ async def test_gemini_pro_json_schema_args_sent_httpx_openai_schema(
     from pydantic import BaseModel
 
     load_vertex_ai_credentials()
-    os.environ["LITELLM_LOCAL_MODEL_COST_MAP"] = "True"
-    litellm.model_cost = litellm.get_model_cost_map(url="")
+    litellm.model_cost = litellm.get_locally_cached_model_cost_map()
 
     litellm.set_verbose = True
diff --git a/tests/local_testing/test_completion_cost.py b/tests/local_testing/test_completion_cost.py
index 200f2c012e..77d49961bd 100644
--- a/tests/local_testing/test_completion_cost.py
+++ b/tests/local_testing/test_completion_cost.py
@@ -634,8 +634,7 @@ def test_gemini_completion_cost(above_128k, provider):
     """
    Check if cost correctly calculated for gemini models based on context window
     """
-    os.environ["LITELLM_LOCAL_MODEL_COST_MAP"] = "True"
-    litellm.model_cost = litellm.get_model_cost_map(url="")
+    litellm.model_cost = litellm.get_locally_cached_model_cost_map()
     if provider == "gemini":
         model_name = "gemini-1.5-flash-latest"
     else:
@@ -690,8 +689,7 @@ def _count_characters(text):
 
 
 def test_vertex_ai_completion_cost():
-    os.environ["LITELLM_LOCAL_MODEL_COST_MAP"] = "True"
-    litellm.model_cost = litellm.get_model_cost_map(url="")
+    litellm.model_cost = litellm.get_locally_cached_model_cost_map()
 
     text = "The quick brown fox jumps over the lazy dog."
     characters = _count_characters(text=text)
@@ -726,8 +724,7 @@ def test_vertex_ai_medlm_completion_cost():
             model=model, messages=messages, custom_llm_provider="vertex_ai"
         )
 
-    os.environ["LITELLM_LOCAL_MODEL_COST_MAP"] = "True"
-    litellm.model_cost = litellm.get_model_cost_map(url="")
+    litellm.model_cost = litellm.get_locally_cached_model_cost_map()
 
     model = "vertex_ai/medlm-medium"
     messages = [{"role": "user", "content": "Test MedLM completion cost."}]
@@ -746,8 +743,7 @@ def test_vertex_ai_claude_completion_cost():
     from litellm import Choices, Message, ModelResponse
     from litellm.utils import Usage
 
-    os.environ["LITELLM_LOCAL_MODEL_COST_MAP"] = "True"
-    litellm.model_cost = litellm.get_model_cost_map(url="")
+    litellm.model_cost = litellm.get_locally_cached_model_cost_map()
 
     litellm.set_verbose = True
     input_tokens = litellm.token_counter(
@@ -796,8 +792,7 @@ def test_vertex_ai_embedding_completion_cost(caplog):
     """
     Relevant issue - https://github.com/BerriAI/litellm/issues/4630
     """
-    os.environ["LITELLM_LOCAL_MODEL_COST_MAP"] = "True"
-    litellm.model_cost = litellm.get_model_cost_map(url="")
+    litellm.model_cost = litellm.get_locally_cached_model_cost_map()
 
     text = "The quick brown fox jumps over the lazy dog."
     input_tokens = litellm.token_counter(
@@ -839,8 +834,7 @@ def test_vertex_ai_embedding_completion_cost(caplog):
 #     from test_amazing_vertex_completion import load_vertex_ai_credentials
 
 #     load_vertex_ai_credentials()
-#     os.environ["LITELLM_LOCAL_MODEL_COST_MAP"] = "True"
-#     litellm.model_cost = litellm.get_model_cost_map(url="")
+#     litellm.model_cost = litellm.get_locally_cached_model_cost_map()
 
 #     text = "The quick brown fox jumps over the lazy dog."
 #     input_tokens = litellm.token_counter(
@@ -867,8 +861,7 @@ def test_vertex_ai_embedding_completion_cost(caplog):
 
 def test_completion_azure_ai():
     try:
-        os.environ["LITELLM_LOCAL_MODEL_COST_MAP"] = "True"
-        litellm.model_cost = litellm.get_model_cost_map(url="")
+        litellm.model_cost = litellm.get_locally_cached_model_cost_map()
 
         litellm.set_verbose = True
         response = litellm.completion(
@@ -974,8 +967,7 @@ def test_vertex_ai_mistral_predict_cost(usage):
 
 @pytest.mark.parametrize("model", ["openai/tts-1", "azure/tts-1"])
 def test_completion_cost_tts(model):
-    os.environ["LITELLM_LOCAL_MODEL_COST_MAP"] = "True"
-    litellm.model_cost = litellm.get_model_cost_map(url="")
+    litellm.model_cost = litellm.get_locally_cached_model_cost_map()
 
     cost = completion_cost(
         model=model,
@@ -1171,8 +1163,7 @@ def test_completion_cost_azure_common_deployment_name():
     ],
 )
 def test_completion_cost_prompt_caching(model, custom_llm_provider):
-    os.environ["LITELLM_LOCAL_MODEL_COST_MAP"] = "True"
-    litellm.model_cost = litellm.get_model_cost_map(url="")
+    litellm.model_cost = litellm.get_locally_cached_model_cost_map()
 
     from litellm.utils import Choices, Message, ModelResponse, Usage
 
@@ -1273,8 +1264,7 @@ def test_completion_cost_prompt_caching(model, custom_llm_provider):
     ],
 )
 def test_completion_cost_databricks(model):
-    os.environ["LITELLM_LOCAL_MODEL_COST_MAP"] = "True"
-    litellm.model_cost = litellm.get_model_cost_map(url="")
+    litellm.model_cost = litellm.get_locally_cached_model_cost_map()
     model, messages = model, [{"role": "user", "content": "What is 2+2?"}]
 
     resp = litellm.completion(model=model, messages=messages)  # works fine
@@ -1291,8 +1281,7 @@ def test_completion_cost_databricks(model):
     ],
 )
 def test_completion_cost_databricks_embedding(model):
-    os.environ["LITELLM_LOCAL_MODEL_COST_MAP"] = "True"
-    litellm.model_cost = litellm.get_model_cost_map(url="")
+    litellm.model_cost = litellm.get_locally_cached_model_cost_map()
     resp = litellm.embedding(model=model, input=["hey, how's it going?"])  # works fine
 
     print(resp)
@@ -1319,8 +1308,7 @@ def test_get_model_params_fireworks_ai(model, base_model):
     ["fireworks_ai/llama-v3p1-405b-instruct", "fireworks_ai/mixtral-8x7b-instruct"],
 )
 def test_completion_cost_fireworks_ai(model):
-    os.environ["LITELLM_LOCAL_MODEL_COST_MAP"] = "True"
-    litellm.model_cost = litellm.get_model_cost_map(url="")
+    litellm.model_cost = litellm.get_locally_cached_model_cost_map()
     messages = [{"role": "user", "content": "Hey, how's it going?"}]
 
     resp = litellm.completion(model=model, messages=messages)  # works fine
@@ -1337,8 +1325,7 @@ def test_cost_azure_openai_prompt_caching():
     )
     from litellm import get_model_info
 
-    os.environ["LITELLM_LOCAL_MODEL_COST_MAP"] = "True"
-    litellm.model_cost = litellm.get_model_cost_map(url="")
+    litellm.model_cost = litellm.get_locally_cached_model_cost_map()
 
     model = "azure/o1-mini"
 
@@ -1427,8 +1414,7 @@ def test_cost_azure_openai_prompt_caching():
 
 
 def test_completion_cost_vertex_llama3():
-    os.environ["LITELLM_LOCAL_MODEL_COST_MAP"] = "True"
-    litellm.model_cost = litellm.get_model_cost_map(url="")
+    litellm.model_cost = litellm.get_locally_cached_model_cost_map()
 
     from litellm.utils import Choices, Message, ModelResponse, Usage
 
@@ -1468,8 +1454,7 @@ def test_cost_openai_prompt_caching():
     from litellm.utils import Choices, Message, ModelResponse, Usage
     from litellm import get_model_info
 
-    os.environ["LITELLM_LOCAL_MODEL_COST_MAP"] = "True"
-    litellm.model_cost = litellm.get_model_cost_map(url="")
+    litellm.model_cost = litellm.get_locally_cached_model_cost_map()
 
     model = "gpt-4o-mini-2024-07-18"
 
@@ -1559,8 +1544,7 @@ def test_cost_openai_prompt_caching():
 def test_completion_cost_azure_ai_rerank(model):
     from litellm import RerankResponse, rerank
 
-    os.environ["LITELLM_LOCAL_MODEL_COST_MAP"] = "True"
-    litellm.model_cost = litellm.get_model_cost_map(url="")
+    litellm.model_cost = litellm.get_locally_cached_model_cost_map()
 
     response = RerankResponse(
         id="b01dbf2e-63c8-4981-9e69-32241da559ed",
@@ -1591,8 +1575,7 @@ def test_completion_cost_azure_ai_rerank(model):
 def test_together_ai_embedding_completion_cost():
     from litellm.utils import Choices, EmbeddingResponse, Message, ModelResponse, Usage
 
-    os.environ["LITELLM_LOCAL_MODEL_COST_MAP"] = "True"
-    litellm.model_cost = litellm.get_model_cost_map(url="")
+    litellm.model_cost = litellm.get_locally_cached_model_cost_map()
     response = EmbeddingResponse(
         model="togethercomputer/m2-bert-80M-8k-retrieval",
         data=[
@@ -2449,8 +2432,7 @@ def test_completion_cost_params_gemini_3():
 
     from litellm.llms.vertex_ai.cost_calculator import cost_per_character
 
-    os.environ["LITELLM_LOCAL_MODEL_COST_MAP"] = "True"
-    litellm.model_cost = litellm.get_model_cost_map(url="")
+    litellm.model_cost = litellm.get_locally_cached_model_cost_map()
 
     response = ModelResponse(
         id="chatcmpl-61043504-4439-48be-9996-e29bdee24dc3",
@@ -2519,8 +2501,7 @@ def test_completion_cost_params_gemini_3():
 # @pytest.mark.flaky(retries=3, delay=1)
 @pytest.mark.parametrize("stream", [False])  # True,
 async def test_test_completion_cost_gpt4o_audio_output_from_model(stream):
-    os.environ["LITELLM_LOCAL_MODEL_COST_MAP"] = "True"
-    litellm.model_cost = litellm.get_model_cost_map(url="")
+    litellm.model_cost = litellm.get_locally_cached_model_cost_map()
     from litellm.types.utils import (
         Choices,
         Message,
@@ -2617,8 +2598,7 @@ def test_completion_cost_model_response_cost(response_model, custom_llm_provider):
     """
     from litellm import ModelResponse
 
-    os.environ["LITELLM_LOCAL_MODEL_COST_MAP"] = "True"
-    litellm.model_cost = litellm.get_model_cost_map(url="")
+    litellm.model_cost = litellm.get_locally_cached_model_cost_map()
 
     litellm.set_verbose = True
     response = {
@@ -2718,8 +2698,7 @@ def test_select_model_name_for_cost_calc():
 
 def test_moderations():
     from litellm import moderation
 
-    os.environ["LITELLM_LOCAL_MODEL_COST_MAP"] = "True"
-    litellm.model_cost = litellm.get_model_cost_map(url="")
+    litellm.model_cost = litellm.get_locally_cached_model_cost_map()
     litellm.add_known_models()
 
     assert "omni-moderation-latest" in litellm.model_cost
@@ -2772,8 +2751,7 @@ def test_bedrock_cost_calc_with_region():
 
     from litellm import ModelResponse
 
-    os.environ["LITELLM_LOCAL_MODEL_COST_MAP"] = "True"
-    litellm.model_cost = litellm.get_model_cost_map(url="")
+    litellm.model_cost = litellm.get_locally_cached_model_cost_map()
 
     litellm.add_known_models()
 
@@ -2972,9 +2950,7 @@ async def test_cost_calculator_with_custom_pricing_router(model_item, custom_pri
 def test_json_valid_model_cost_map():
     import json
 
-    os.environ["LITELLM_LOCAL_MODEL_COST_MAP"] = "True"
-
-    model_cost = litellm.get_model_cost_map(url="")
+    model_cost = litellm.get_locally_cached_model_cost_map()
 
     try:
         # Attempt to serialize and deserialize the JSON
diff --git a/tests/local_testing/test_embedding.py b/tests/local_testing/test_embedding.py
index c85a830e5f..c369dd73eb 100644
--- a/tests/local_testing/test_embedding.py
+++ b/tests/local_testing/test_embedding.py
@@ -115,8 +115,7 @@ def test_openai_embedding_3():
 @pytest.mark.asyncio
 async def test_openai_azure_embedding_simple(model, api_base, api_key, sync_mode):
     try:
-        os.environ["LITELLM_LOCAL_MODEL_COST_MAP"] = "True"
-        litellm.model_cost = litellm.get_model_cost_map(url="")
+        litellm.model_cost = litellm.get_locally_cached_model_cost_map()
         # litellm.set_verbose = True
         if sync_mode:
             response = embedding(
@@ -198,8 +197,7 @@ def _azure_ai_image_mock_response(*args, **kwargs):
 @pytest.mark.asyncio
 async def test_azure_ai_embedding_image(model, api_base, api_key, sync_mode):
     try:
-        os.environ["LITELLM_LOCAL_MODEL_COST_MAP"] = "True"
-        litellm.model_cost = litellm.get_model_cost_map(url="")
+        litellm.model_cost = litellm.get_locally_cached_model_cost_map()
         input = base64_image
         if sync_mode:
             client = HTTPHandler()
diff --git a/tests/local_testing/test_get_model_info.py b/tests/local_testing/test_get_model_info.py
index c879332c7b..c40ac41be2 100644
--- a/tests/local_testing/test_get_model_info.py
+++ b/tests/local_testing/test_get_model_info.py
@@ -58,16 +58,14 @@ def test_get_model_info_shows_correct_supports_vision():
 
 
 def test_get_model_info_shows_assistant_prefill():
-    os.environ["LITELLM_LOCAL_MODEL_COST_MAP"] = "True"
-    litellm.model_cost = litellm.get_model_cost_map(url="")
+    litellm.model_cost = litellm.get_locally_cached_model_cost_map()
     info = litellm.get_model_info("deepseek/deepseek-chat")
     print("info", info)
     assert info.get("supports_assistant_prefill") is True
 
 
 def test_get_model_info_shows_supports_prompt_caching():
-    os.environ["LITELLM_LOCAL_MODEL_COST_MAP"] = "True"
-    litellm.model_cost = litellm.get_model_cost_map(url="")
+    litellm.model_cost = litellm.get_locally_cached_model_cost_map()
     info = litellm.get_model_info("deepseek/deepseek-chat")
     print("info", info)
     assert info.get("supports_prompt_caching") is True
@@ -116,8 +114,7 @@ def test_get_model_info_gemini():
     """
     Tests if ALL gemini models have 'tpm' and 'rpm' in the model info
     """
-    os.environ["LITELLM_LOCAL_MODEL_COST_MAP"] = "True"
-    litellm.model_cost = litellm.get_model_cost_map(url="")
+    litellm.model_cost = litellm.get_locally_cached_model_cost_map()
     model_map = litellm.model_cost
 
     for model, info in model_map.items():
@@ -127,8 +124,7 @@
 
 
 def test_get_model_info_bedrock_region():
-    os.environ["LITELLM_LOCAL_MODEL_COST_MAP"] = "True"
-    litellm.model_cost = litellm.get_model_cost_map(url="")
+    litellm.model_cost = litellm.get_locally_cached_model_cost_map()
     args = {
         "model": "us.anthropic.claude-3-5-sonnet-20241022-v2:0",
         "custom_llm_provider": "bedrock",
@@ -212,8 +208,7 @@ def test_model_info_bedrock_converse(monkeypatch):
 
     This ensures they are automatically routed to the converse endpoint.
     """
-    monkeypatch.setenv("LITELLM_LOCAL_MODEL_COST_MAP", "True")
-    litellm.model_cost = litellm.get_model_cost_map(url="")
+    litellm.model_cost = litellm.get_locally_cached_model_cost_map()
     try:
         # Load whitelist models from file
         with open("whitelisted_bedrock_models.txt", "r") as file:
@@ -231,8 +226,7 @@ def test_model_info_bedrock_converse_enforcement(monkeypatch):
     """
     Test the enforcement of the whitelist by adding a fake model and ensuring the test fails.
""" - monkeypatch.setenv("LITELLM_LOCAL_MODEL_COST_MAP", "True") - litellm.model_cost = litellm.get_model_cost_map(url="") + litellm.model_cost = litellm.get_locally_cached_model_cost_map() # Add a fake unwhitelisted model litellm.model_cost["fake.bedrock-chat-model"] = { @@ -323,8 +317,7 @@ def test_get_model_info_bedrock_models(): """ from litellm.llms.bedrock.common_utils import BedrockModelInfo - os.environ["LITELLM_LOCAL_MODEL_COST_MAP"] = "True" - litellm.model_cost = litellm.get_model_cost_map(url="") + litellm.model_cost = litellm.get_locally_cached_model_cost_map() for k, v in litellm.model_cost.items(): if v["litellm_provider"] == "bedrock": diff --git a/tests/local_testing/test_router_utils.py b/tests/local_testing/test_router_utils.py index 7de9707579..d0afc440d9 100644 --- a/tests/local_testing/test_router_utils.py +++ b/tests/local_testing/test_router_utils.py @@ -178,8 +178,7 @@ async def test_update_kwargs_before_fallbacks(call_type): def test_router_get_model_info_wildcard_routes(): - os.environ["LITELLM_LOCAL_MODEL_COST_MAP"] = "True" - litellm.model_cost = litellm.get_model_cost_map(url="") + litellm.model_cost = litellm.get_locally_cached_model_cost_map() router = Router( model_list=[ { @@ -200,8 +199,7 @@ def test_router_get_model_info_wildcard_routes(): @pytest.mark.asyncio async def test_router_get_model_group_usage_wildcard_routes(): - os.environ["LITELLM_LOCAL_MODEL_COST_MAP"] = "True" - litellm.model_cost = litellm.get_model_cost_map(url="") + litellm.model_cost = litellm.get_locally_cached_model_cost_map() router = Router( model_list=[ { @@ -297,8 +295,7 @@ async def test_call_router_callbacks_on_failure(): @pytest.mark.asyncio async def test_router_model_group_headers(): - os.environ["LITELLM_LOCAL_MODEL_COST_MAP"] = "True" - litellm.model_cost = litellm.get_model_cost_map(url="") + litellm.model_cost = litellm.get_locally_cached_model_cost_map() from litellm.types.utils import OPENAI_RESPONSE_HEADERS router = Router( @@ -330,8 +327,7 @@ async def test_router_model_group_headers(): @pytest.mark.asyncio async def test_get_remaining_model_group_usage(): - os.environ["LITELLM_LOCAL_MODEL_COST_MAP"] = "True" - litellm.model_cost = litellm.get_model_cost_map(url="") + litellm.model_cost = litellm.get_locally_cached_model_cost_map() from litellm.types.utils import OPENAI_RESPONSE_HEADERS router = Router( diff --git a/tests/test_model_prices_and_context_window_schema.py b/tests/test_model_prices_and_context_window_schema.py new file mode 100644 index 0000000000..80d35f84b4 --- /dev/null +++ b/tests/test_model_prices_and_context_window_schema.py @@ -0,0 +1,111 @@ +import litellm +from jsonschema import validate + +def test_model_prices_and_context_window_json_is_valid(): + ''' + Validates the `model_prices_and_context_window.json` file. + + If this test fails after you update the json, you need to update the schema or correct the change you made. 
+    '''
+
+    INTENDED_SCHEMA = {
+        "type": "object",
+        "additionalProperties": {
+            "type": "object",
+            "properties": {
+                "cache_creation_input_audio_token_cost": {"type": "number"},
+                "cache_creation_input_token_cost": {"type": "number"},
+                "cache_read_input_token_cost": {"type": "number"},
+                "deprecation_date": {"type": "string"},
+                "input_cost_per_audio_per_second": {"type": "number"},
+                "input_cost_per_audio_per_second_above_128k_tokens": {"type": "number"},
+                "input_cost_per_audio_token": {"type": "number"},
+                "input_cost_per_character": {"type": "number"},
+                "input_cost_per_character_above_128k_tokens": {"type": "number"},
+                "input_cost_per_image": {"type": "number"},
+                "input_cost_per_image_above_128k_tokens": {"type": "number"},
+                "input_cost_per_pixel": {"type": "number"},
+                "input_cost_per_query": {"type": "number"},
+                "input_cost_per_request": {"type": "number"},
+                "input_cost_per_second": {"type": "number"},
+                "input_cost_per_token": {"type": "number"},
+                "input_cost_per_token_above_128k_tokens": {"type": "number"},
+                "input_cost_per_token_batch_requests": {"type": "number"},
+                "input_cost_per_token_batches": {"type": "number"},
+                "input_cost_per_token_cache_hit": {"type": "number"},
+                "input_cost_per_video_per_second": {"type": "number"},
+                "input_cost_per_video_per_second_above_128k_tokens": {"type": "number"},
+                "input_dbu_cost_per_token": {"type": "number"},
+                "litellm_provider": {"type": "string"},
+                "max_audio_length_hours": {"type": "number"},
+                "max_audio_per_prompt": {"type": "number"},
+                "max_document_chunks_per_query": {"type": "number"},
+                "max_images_per_prompt": {"type": "number"},
+                "max_input_tokens": {"type": "number"},
+                "max_output_tokens": {"type": "number"},
+                "max_pdf_size_mb": {"type": "number"},
+                "max_query_tokens": {"type": "number"},
+                "max_tokens": {"type": "number"},
+                "max_tokens_per_document_chunk": {"type": "number"},
+                "max_video_length": {"type": "number"},
+                "max_videos_per_prompt": {"type": "number"},
+                "metadata": {"type": "object"},
+                "mode": {
+                    "type": "string",
+                    "enum": [
+                        "audio_speech",
+                        "audio_transcription",
+                        "chat",
+                        "completion",
+                        "embedding",
+                        "image_generation",
+                        "moderation",
+                        "moderations",
+                        "rerank"
+                    ],
+                },
+                "output_cost_per_audio_token": {"type": "number"},
+                "output_cost_per_character": {"type": "number"},
+                "output_cost_per_character_above_128k_tokens": {"type": "number"},
+                "output_cost_per_image": {"type": "number"},
+                "output_cost_per_pixel": {"type": "number"},
+                "output_cost_per_second": {"type": "number"},
+                "output_cost_per_token": {"type": "number"},
+                "output_cost_per_token_above_128k_tokens": {"type": "number"},
+                "output_cost_per_token_batches": {"type": "number"},
+                "output_db_cost_per_token": {"type": "number"},
+                "output_dbu_cost_per_token": {"type": "number"},
+                "output_vector_size": {"type": "number"},
+                "rpd": {"type": "number"},
+                "rpm": {"type": "number"},
+                "source": {"type": "string"},
+                "supports_assistant_prefill": {"type": "boolean"},
+                "supports_audio_input": {"type": "boolean"},
+                "supports_audio_output": {"type": "boolean"},
+                "supports_embedding_image_input": {"type": "boolean"},
+                "supports_function_calling": {"type": "boolean"},
+                "supports_image_input": {"type": "boolean"},
+                "supports_parallel_function_calling": {"type": "boolean"},
+                "supports_pdf_input": {"type": "boolean"},
+                "supports_prompt_caching": {"type": "boolean"},
+                "supports_response_schema": {"type": "boolean"},
+                "supports_system_messages": {"type": "boolean"},
+                "supports_tool_choice": {"type": "boolean"},
"supports_video_input": {"type": "boolean"}, + "supports_vision": {"type": "boolean"}, + "tool_use_system_prompt_tokens": {"type": "number"}, + "tpm": {"type": "number"}, + }, + "additionalProperties": False, + }, + } + + actual_json = litellm.get_locally_cached_model_cost_map() + assert isinstance(actual_json, dict) + temporarily_removed = actual_json.pop('sample_spec', None) # remove the sample, whose schema is inconsistent with the real data + + validate(actual_json, INTENDED_SCHEMA) + + if temporarily_removed is not None: + # put back the sample spec that we removed + actual_json.update({'sample_spec': temporarily_removed})