Revert "changes"

This reverts commit fa88bc9632.
Utkash Dubey 2025-03-10 14:46:21 -07:00
parent 1f17daf52c
commit c7a04140e9
19 changed files with 141 additions and 191 deletions
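In effect, the revert drops the cached `get_locally_cached_model_cost_map()` helper and restores the environment-variable-driven loading that the hunks below switch back to. A minimal sketch of the restored pattern, as it appears throughout the test changes (file paths and surrounding code are not shown in this view):

import os

import litellm

# Force the locally bundled cost map instead of fetching the hosted JSON over HTTP.
os.environ["LITELLM_LOCAL_MODEL_COST_MAP"] = "True"
litellm.model_cost = litellm.get_model_cost_map(url="")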

View file

@@ -306,7 +306,7 @@ _key_management_settings: KeyManagementSettings = KeyManagementSettings()
 #### PII MASKING ####
 output_parse_pii: bool = False
 #############################################
-from litellm.litellm_core_utils.get_model_cost_map import get_model_cost_map, get_locally_cached_model_cost_map
+from litellm.litellm_core_utils.get_model_cost_map import get_model_cost_map
 model_cost = get_model_cost_map(url=model_cost_map_url)
 custom_prompt_dict: Dict[str, dict] = {}

View file

@@ -8,13 +8,16 @@ export LITELLM_LOCAL_MODEL_COST_MAP=True
 ```
 """
-from functools import cache
 import os
 import httpx
-@cache
-def get_locally_cached_model_cost_map():
+def get_model_cost_map(url: str):
+    if (
+        os.getenv("LITELLM_LOCAL_MODEL_COST_MAP", False)
+        or os.getenv("LITELLM_LOCAL_MODEL_COST_MAP", False) == "True"
+    ):
 import importlib.resources
 import json
@@ -24,14 +27,6 @@ def get_locally_cached_model_cost_map():
 content = json.load(f)
 return content
-def get_model_cost_map(url: str):
-    if (
-        os.getenv("LITELLM_LOCAL_MODEL_COST_MAP", False)
-        or os.getenv("LITELLM_LOCAL_MODEL_COST_MAP", False) == "True"
-    ):
-        return get_locally_cached_model_cost_map()
 try:
 response = httpx.get(
 url, timeout=5
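For orientation, a condensed sketch of the control flow the reverted get_model_cost_map ends up with, based on the two hunks above; the packaged backup filename and the final response handling are assumptions, since they fall outside the visible context:

import importlib.resources
import json
import os

import httpx


def get_model_cost_map_sketch(url: str) -> dict:
    # When the env var is set, read the JSON bundled with the package instead of the network.
    if os.getenv("LITELLM_LOCAL_MODEL_COST_MAP", False) or (
        os.getenv("LITELLM_LOCAL_MODEL_COST_MAP", False) == "True"
    ):
        # Filename assumed; the hunk only shows the importlib.resources/json imports and json.load(f).
        with importlib.resources.open_text(
            "litellm", "model_prices_and_context_window_backup.json"
        ) as f:
            return json.load(f)
    # Otherwise fetch the hosted map with the 5-second timeout shown above.
    response = httpx.get(url, timeout=5)
    response.raise_for_status()  # assumed error handling
    return response.json()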

View file

@@ -6,7 +6,7 @@
 "input_cost_per_token": 0.0000,
 "output_cost_per_token": 0.000,
 "litellm_provider": "one of https://docs.litellm.ai/docs/providers",
-"mode": "one of: chat, embedding, completion, image_generation, audio_transcription, audio_speech, image_generation, moderation, moderations, rerank",
+"mode": "one of chat, embedding, completion, image_generation, audio_transcription, audio_speech",
 "supports_function_calling": true,
 "supports_parallel_function_calling": true,
 "supports_vision": true,

View file

@@ -191,7 +191,8 @@ def _check_if_model_name_in_pricing(
 input_cost_per_1k_tokens: str,
 output_cost_per_1k_tokens: str,
 ):
-litellm.model_cost = litellm.get_locally_cached_model_cost_map()
+os.environ["LITELLM_LOCAL_MODEL_COST_MAP"] = "True"
+litellm.model_cost = litellm.get_model_cost_map(url="")
 for model, value in litellm.model_cost.items():
 if model.startswith(bedrock_model_name):

View file

@@ -907,7 +907,8 @@ def test_supports_response_schema(model, expected_bool):
 Should be true for gemini-1.5-pro on google ai studio / vertex ai AND predibase models
 Should be false otherwise
 """
-litellm.model_cost = litellm.get_locally_cached_model_cost_map()
+os.environ["LITELLM_LOCAL_MODEL_COST_MAP"] = "True"
+litellm.model_cost = litellm.get_model_cost_map(url="")
 from litellm.utils import supports_response_schema
@@ -1065,7 +1066,8 @@ def test_async_http_handler_force_ipv4(mock_async_client):
 "model, expected_bool", [("gpt-3.5-turbo", False), ("gpt-4o-audio-preview", True)]
 )
 def test_supports_audio_input(model, expected_bool):
-litellm.model_cost = litellm.get_locally_cached_model_cost_map()
+os.environ["LITELLM_LOCAL_MODEL_COST_MAP"] = "True"
+litellm.model_cost = litellm.get_model_cost_map(url="")
 from litellm.utils import supports_audio_input, supports_audio_output
@@ -1163,7 +1165,8 @@ def test_models_by_provider():
 """
 Make sure all providers from model map are in the valid providers list
 """
-litellm.model_cost = litellm.get_locally_cached_model_cost_map()
+os.environ["LITELLM_LOCAL_MODEL_COST_MAP"] = "True"
+litellm.model_cost = litellm.get_model_cost_map(url="")
 from litellm import models_by_provider
@@ -1481,7 +1484,8 @@ def test_get_valid_models_default(monkeypatch):
 def test_supports_vision_gemini():
-litellm.model_cost = litellm.get_locally_cached_model_cost_map()
+os.environ["LITELLM_LOCAL_MODEL_COST_MAP"] = "True"
+litellm.model_cost = litellm.get_model_cost_map(url="")
 from litellm.utils import supports_vision
 assert supports_vision("gemini-1.5-pro") is True

View file

@@ -84,7 +84,8 @@ class BaseLLMEmbeddingTest(ABC):
 litellm.set_verbose = True
 from litellm.utils import supports_embedding_image_input
-litellm.model_cost = litellm.get_locally_cached_model_cost_map()
+os.environ["LITELLM_LOCAL_MODEL_COST_MAP"] = "True"
+litellm.model_cost = litellm.get_model_cost_map(url="")
 base_embedding_call_args = self.get_base_embedding_call_args()
 if not supports_embedding_image_input(base_embedding_call_args["model"], None):

View file

@@ -342,7 +342,8 @@ class BaseLLMChatTest(ABC):
 from pydantic import BaseModel
 from litellm.utils import supports_response_schema
-litellm.model_cost = litellm.get_locally_cached_model_cost_map()
+os.environ["LITELLM_LOCAL_MODEL_COST_MAP"] = "True"
+litellm.model_cost = litellm.get_model_cost_map(url="")
 class TestModel(BaseModel):
 first_response: str
@@ -381,14 +382,16 @@ class BaseLLMChatTest(ABC):
 from pydantic import BaseModel
 from litellm.utils import supports_response_schema
-litellm.model_cost = litellm.get_locally_cached_model_cost_map()
+os.environ["LITELLM_LOCAL_MODEL_COST_MAP"] = "True"
+litellm.model_cost = litellm.get_model_cost_map(url="")
 @pytest.mark.flaky(retries=6, delay=1)
 def test_json_response_nested_pydantic_obj(self):
 from pydantic import BaseModel
 from litellm.utils import supports_response_schema
-litellm.model_cost = litellm.get_locally_cached_model_cost_map()
+os.environ["LITELLM_LOCAL_MODEL_COST_MAP"] = "True"
+litellm.model_cost = litellm.get_model_cost_map(url="")
 class CalendarEvent(BaseModel):
 name: str
@@ -435,7 +438,8 @@ class BaseLLMChatTest(ABC):
 from litellm.utils import supports_response_schema
 from litellm.llms.base_llm.base_utils import type_to_response_format_param
-litellm.model_cost = litellm.get_locally_cached_model_cost_map()
+os.environ["LITELLM_LOCAL_MODEL_COST_MAP"] = "True"
+litellm.model_cost = litellm.get_model_cost_map(url="")
 class CalendarEvent(BaseModel):
 name: str
@@ -556,7 +560,8 @@ class BaseLLMChatTest(ABC):
 litellm.set_verbose = True
 from litellm.utils import supports_vision
-litellm.model_cost = litellm.get_locally_cached_model_cost_map()
+os.environ["LITELLM_LOCAL_MODEL_COST_MAP"] = "True"
+litellm.model_cost = litellm.get_model_cost_map(url="")
 base_completion_call_args = self.get_base_completion_call_args()
 if not supports_vision(base_completion_call_args["model"], None):
@@ -610,7 +615,8 @@ class BaseLLMChatTest(ABC):
 litellm.set_verbose = True
 from litellm.utils import supports_vision
-litellm.model_cost = litellm.get_locally_cached_model_cost_map()
+os.environ["LITELLM_LOCAL_MODEL_COST_MAP"] = "True"
+litellm.model_cost = litellm.get_model_cost_map(url="")
 image_url = "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"
@@ -650,7 +656,8 @@ class BaseLLMChatTest(ABC):
 litellm.set_verbose = True
 from litellm.utils import supports_prompt_caching
-litellm.model_cost = litellm.get_locally_cached_model_cost_map()
+os.environ["LITELLM_LOCAL_MODEL_COST_MAP"] = "True"
+litellm.model_cost = litellm.get_model_cost_map(url="")
 base_completion_call_args = self.get_base_completion_call_args()
 if not supports_prompt_caching(base_completion_call_args["model"], None):
@@ -766,7 +773,8 @@ class BaseLLMChatTest(ABC):
 litellm._turn_on_debug()
 from litellm.utils import supports_function_calling
-litellm.model_cost = litellm.get_locally_cached_model_cost_map()
+os.environ["LITELLM_LOCAL_MODEL_COST_MAP"] = "True"
+litellm.model_cost = litellm.get_model_cost_map(url="")
 base_completion_call_args = self.get_base_completion_call_args()
 if not supports_function_calling(base_completion_call_args["model"], None):
@@ -864,7 +872,8 @@ class BaseLLMChatTest(ABC):
 async def test_completion_cost(self):
 from litellm import completion_cost
-litellm.model_cost = litellm.get_locally_cached_model_cost_map()
+os.environ["LITELLM_LOCAL_MODEL_COST_MAP"] = "True"
+litellm.model_cost = litellm.get_model_cost_map(url="")
 litellm.set_verbose = True
 response = await self.async_completion_function(

View file

@@ -87,7 +87,8 @@ class BaseLLMRerankTest(ABC):
 @pytest.mark.parametrize("sync_mode", [True, False])
 async def test_basic_rerank(self, sync_mode):
 litellm._turn_on_debug()
-litellm.model_cost = litellm.get_locally_cached_model_cost_map()
+os.environ["LITELLM_LOCAL_MODEL_COST_MAP"] = "True"
+litellm.model_cost = litellm.get_model_cost_map(url="")
 rerank_call_args = self.get_base_rerank_call_args()
 custom_llm_provider = self.get_custom_llm_provider()
 if sync_mode is True:

View file

@@ -491,7 +491,8 @@ class TestAnthropicCompletion(BaseLLMChatTest):
 from pydantic import BaseModel
 from litellm.utils import supports_response_schema
-litellm.model_cost = litellm.get_locally_cached_model_cost_map()
+os.environ["LITELLM_LOCAL_MODEL_COST_MAP"] = "True"
+litellm.model_cost = litellm.get_model_cost_map(url="")
 class RFormat(BaseModel):
 question: str

View file

@@ -1975,7 +1975,8 @@ def test_bedrock_converse_route():
 def test_bedrock_mapped_converse_models():
 litellm.set_verbose = True
-litellm.model_cost = litellm.get_locally_cached_model_cost_map()
+os.environ["LITELLM_LOCAL_MODEL_COST_MAP"] = "True"
+litellm.model_cost = litellm.get_model_cost_map(url="")
 litellm.add_known_models()
 litellm.completion(
 model="bedrock/us.amazon.nova-pro-v1:0",
@@ -2107,7 +2108,8 @@ def test_bedrock_supports_tool_call(model, expected_supports_tool_call):
 class TestBedrockConverseChatCrossRegion(BaseLLMChatTest):
 def get_base_completion_call_args(self) -> dict:
-litellm.model_cost = litellm.get_locally_cached_model_cost_map()
+os.environ["LITELLM_LOCAL_MODEL_COST_MAP"] = "True"
+litellm.model_cost = litellm.get_model_cost_map(url="")
 litellm.add_known_models()
 return {
 "model": "bedrock/us.anthropic.claude-3-5-sonnet-20241022-v2:0",
@@ -2135,7 +2137,8 @@ class TestBedrockConverseChatCrossRegion(BaseLLMChatTest):
 """
 Test if region models info is correctly used for cost calculation. Using the base model info for cost calculation.
 """
-litellm.model_cost = litellm.get_locally_cached_model_cost_map()
+os.environ["LITELLM_LOCAL_MODEL_COST_MAP"] = "True"
+litellm.model_cost = litellm.get_model_cost_map(url="")
 bedrock_model = "us.anthropic.claude-3-5-sonnet-20241022-v2:0"
 litellm.model_cost.pop(bedrock_model, None)
 model = f"bedrock/{bedrock_model}"
@@ -2152,7 +2155,8 @@ class TestBedrockConverseChatCrossRegion(BaseLLMChatTest):
 class TestBedrockConverseChatNormal(BaseLLMChatTest):
 def get_base_completion_call_args(self) -> dict:
-litellm.model_cost = litellm.get_locally_cached_model_cost_map()
+os.environ["LITELLM_LOCAL_MODEL_COST_MAP"] = "True"
+litellm.model_cost = litellm.get_model_cost_map(url="")
 litellm.add_known_models()
 return {
 "model": "bedrock/anthropic.claude-3-5-sonnet-20240620-v1:0",
@@ -2321,7 +2325,8 @@ def test_bedrock_nova_topk(top_k_param):
 def test_bedrock_cross_region_inference(monkeypatch):
 from litellm.llms.custom_httpx.http_handler import HTTPHandler
-litellm.model_cost = litellm.get_locally_cached_model_cost_map()
+monkeypatch.setenv("LITELLM_LOCAL_MODEL_COST_MAP", "True")
+litellm.model_cost = litellm.get_model_cost_map(url="")
 litellm.add_known_models()
 litellm.set_verbose = True

View file

@@ -29,7 +29,8 @@ async def test_o1_handle_system_role(model):
 from openai import AsyncOpenAI
 from litellm.utils import supports_system_messages
-litellm.model_cost = litellm.get_locally_cached_model_cost_map()
+os.environ["LITELLM_LOCAL_MODEL_COST_MAP"] = "True"
+litellm.model_cost = litellm.get_model_cost_map(url="")
 litellm.set_verbose = True
@@ -82,7 +83,8 @@ async def test_o1_handle_tool_calling_optional_params(
 from litellm.utils import ProviderConfigManager
 from litellm.types.utils import LlmProviders
-litellm.model_cost = litellm.get_locally_cached_model_cost_map()
+os.environ["LITELLM_LOCAL_MODEL_COST_MAP"] = "True"
+litellm.model_cost = litellm.get_model_cost_map(url="")
 config = ProviderConfigManager.get_provider_chat_config(
 model=model, provider=LlmProviders.OPENAI
@@ -188,7 +190,8 @@ class TestOpenAIO3(BaseOSeriesModelsTest, BaseLLMChatTest):
 def test_o1_supports_vision():
 """Test that o1 supports vision"""
-litellm.model_cost = litellm.get_locally_cached_model_cost_map()
+os.environ["LITELLM_LOCAL_MODEL_COST_MAP"] = "True"
+litellm.model_cost = litellm.get_model_cost_map(url="")
 for k, v in litellm.model_cost.items():
 if k.startswith("o1") and v.get("litellm_provider") == "openai":
 assert v.get("supports_vision") is True, f"{k} does not support vision"

View file

@@ -274,7 +274,8 @@ class TestLogger(CustomLogger):
 @pytest.mark.asyncio()
 async def test_rerank_custom_callbacks():
-litellm.model_cost = litellm.get_locally_cached_model_cost_map()
+os.environ["LITELLM_LOCAL_MODEL_COST_MAP"] = "True"
+litellm.model_cost = litellm.get_model_cost_map(url="")
 custom_logger = TestLogger()
 litellm.callbacks = [custom_logger]

View file

@@ -42,7 +42,8 @@ class TestTogetherAI(BaseLLMChatTest):
 def test_get_supported_response_format_together_ai(
 self, model: str, expected_bool: bool
 ) -> None:
-litellm.model_cost = litellm.get_locally_cached_model_cost_map()
+os.environ["LITELLM_LOCAL_MODEL_COST_MAP"] = "True"
+litellm.model_cost = litellm.get_model_cost_map(url="")
 optional_params = litellm.get_supported_openai_params(
 model, custom_llm_provider="together_ai"
 )

View file

@@ -1433,7 +1433,8 @@ async def test_gemini_pro_json_schema_args_sent_httpx(
 enforce_validation,
 ):
 load_vertex_ai_credentials()
-litellm.model_cost = litellm.get_locally_cached_model_cost_map()
+os.environ["LITELLM_LOCAL_MODEL_COST_MAP"] = "True"
+litellm.model_cost = litellm.get_model_cost_map(url="")
 litellm.set_verbose = True
 messages = [{"role": "user", "content": "List 5 cookie recipes"}]
@@ -1553,7 +1554,8 @@ async def test_gemini_pro_json_schema_args_sent_httpx_openai_schema(
 from pydantic import BaseModel
 load_vertex_ai_credentials()
-litellm.model_cost = litellm.get_locally_cached_model_cost_map()
+os.environ["LITELLM_LOCAL_MODEL_COST_MAP"] = "True"
+litellm.model_cost = litellm.get_model_cost_map(url="")
 litellm.set_verbose = True

View file

@@ -634,7 +634,8 @@ def test_gemini_completion_cost(above_128k, provider):
 """
 Check if cost correctly calculated for gemini models based on context window
 """
-litellm.model_cost = litellm.get_locally_cached_model_cost_map()
+os.environ["LITELLM_LOCAL_MODEL_COST_MAP"] = "True"
+litellm.model_cost = litellm.get_model_cost_map(url="")
 if provider == "gemini":
 model_name = "gemini-1.5-flash-latest"
 else:
@@ -689,7 +690,8 @@ def _count_characters(text):
 def test_vertex_ai_completion_cost():
-litellm.model_cost = litellm.get_locally_cached_model_cost_map()
+os.environ["LITELLM_LOCAL_MODEL_COST_MAP"] = "True"
+litellm.model_cost = litellm.get_model_cost_map(url="")
 text = "The quick brown fox jumps over the lazy dog."
 characters = _count_characters(text=text)
@@ -724,7 +726,8 @@ def test_vertex_ai_medlm_completion_cost():
 model=model, messages=messages, custom_llm_provider="vertex_ai"
 )
-litellm.model_cost = litellm.get_locally_cached_model_cost_map()
+os.environ["LITELLM_LOCAL_MODEL_COST_MAP"] = "True"
+litellm.model_cost = litellm.get_model_cost_map(url="")
 model = "vertex_ai/medlm-medium"
 messages = [{"role": "user", "content": "Test MedLM completion cost."}]
@@ -743,7 +746,8 @@ def test_vertex_ai_claude_completion_cost():
 from litellm import Choices, Message, ModelResponse
 from litellm.utils import Usage
-litellm.model_cost = litellm.get_locally_cached_model_cost_map()
+os.environ["LITELLM_LOCAL_MODEL_COST_MAP"] = "True"
+litellm.model_cost = litellm.get_model_cost_map(url="")
 litellm.set_verbose = True
 input_tokens = litellm.token_counter(
@@ -792,7 +796,8 @@ def test_vertex_ai_embedding_completion_cost(caplog):
 """
 Relevant issue - https://github.com/BerriAI/litellm/issues/4630
 """
-litellm.model_cost = litellm.get_locally_cached_model_cost_map()
+os.environ["LITELLM_LOCAL_MODEL_COST_MAP"] = "True"
+litellm.model_cost = litellm.get_model_cost_map(url="")
 text = "The quick brown fox jumps over the lazy dog."
 input_tokens = litellm.token_counter(
@@ -834,7 +839,8 @@ def test_vertex_ai_embedding_completion_cost(caplog):
 # from test_amazing_vertex_completion import load_vertex_ai_credentials
 # load_vertex_ai_credentials()
-# litellm.model_cost = litellm.get_locally_cached_model_cost_map()
+# os.environ["LITELLM_LOCAL_MODEL_COST_MAP"] = "True"
+# litellm.model_cost = litellm.get_model_cost_map(url="")
 # text = "The quick brown fox jumps over the lazy dog."
 # input_tokens = litellm.token_counter(
@@ -861,7 +867,8 @@ def test_vertex_ai_embedding_completion_cost(caplog):
 def test_completion_azure_ai():
 try:
-litellm.model_cost = litellm.get_locally_cached_model_cost_map()
+os.environ["LITELLM_LOCAL_MODEL_COST_MAP"] = "True"
+litellm.model_cost = litellm.get_model_cost_map(url="")
 litellm.set_verbose = True
 response = litellm.completion(
@@ -967,7 +974,8 @@ def test_vertex_ai_mistral_predict_cost(usage):
 @pytest.mark.parametrize("model", ["openai/tts-1", "azure/tts-1"])
 def test_completion_cost_tts(model):
-litellm.model_cost = litellm.get_locally_cached_model_cost_map()
+os.environ["LITELLM_LOCAL_MODEL_COST_MAP"] = "True"
+litellm.model_cost = litellm.get_model_cost_map(url="")
 cost = completion_cost(
 model=model,
@@ -1163,7 +1171,8 @@ def test_completion_cost_azure_common_deployment_name():
 ],
 )
 def test_completion_cost_prompt_caching(model, custom_llm_provider):
-litellm.model_cost = litellm.get_locally_cached_model_cost_map()
+os.environ["LITELLM_LOCAL_MODEL_COST_MAP"] = "True"
+litellm.model_cost = litellm.get_model_cost_map(url="")
 from litellm.utils import Choices, Message, ModelResponse, Usage
@@ -1264,7 +1273,8 @@ def test_completion_cost_prompt_caching(model, custom_llm_provider):
 ],
 )
 def test_completion_cost_databricks(model):
-litellm.model_cost = litellm.get_locally_cached_model_cost_map()
+os.environ["LITELLM_LOCAL_MODEL_COST_MAP"] = "True"
+litellm.model_cost = litellm.get_model_cost_map(url="")
 model, messages = model, [{"role": "user", "content": "What is 2+2?"}]
 resp = litellm.completion(model=model, messages=messages) # works fine
@@ -1281,7 +1291,8 @@ def test_completion_cost_databricks(model):
 ],
 )
 def test_completion_cost_databricks_embedding(model):
-litellm.model_cost = litellm.get_locally_cached_model_cost_map()
+os.environ["LITELLM_LOCAL_MODEL_COST_MAP"] = "True"
+litellm.model_cost = litellm.get_model_cost_map(url="")
 resp = litellm.embedding(model=model, input=["hey, how's it going?"]) # works fine
 print(resp)
@@ -1308,7 +1319,8 @@ def test_get_model_params_fireworks_ai(model, base_model):
 ["fireworks_ai/llama-v3p1-405b-instruct", "fireworks_ai/mixtral-8x7b-instruct"],
 )
 def test_completion_cost_fireworks_ai(model):
-litellm.model_cost = litellm.get_locally_cached_model_cost_map()
+os.environ["LITELLM_LOCAL_MODEL_COST_MAP"] = "True"
+litellm.model_cost = litellm.get_model_cost_map(url="")
 messages = [{"role": "user", "content": "Hey, how's it going?"}]
 resp = litellm.completion(model=model, messages=messages) # works fine
@@ -1325,7 +1337,8 @@ def test_cost_azure_openai_prompt_caching():
 )
 from litellm import get_model_info
-litellm.model_cost = litellm.get_locally_cached_model_cost_map()
+os.environ["LITELLM_LOCAL_MODEL_COST_MAP"] = "True"
+litellm.model_cost = litellm.get_model_cost_map(url="")
 model = "azure/o1-mini"
@@ -1416,7 +1429,8 @@ def test_cost_azure_openai_prompt_caching():
 def test_completion_cost_vertex_llama3():
-litellm.model_cost = litellm.get_locally_cached_model_cost_map()
+os.environ["LITELLM_LOCAL_MODEL_COST_MAP"] = "True"
+litellm.model_cost = litellm.get_model_cost_map(url="")
 from litellm.utils import Choices, Message, ModelResponse, Usage
@@ -1456,7 +1470,8 @@ def test_cost_openai_prompt_caching():
 from litellm.utils import Choices, Message, ModelResponse, Usage
 from litellm import get_model_info
-litellm.model_cost = litellm.get_locally_cached_model_cost_map()
+os.environ["LITELLM_LOCAL_MODEL_COST_MAP"] = "True"
+litellm.model_cost = litellm.get_model_cost_map(url="")
 model = "gpt-4o-mini-2024-07-18"
@@ -1546,7 +1561,8 @@ def test_cost_openai_prompt_caching():
 def test_completion_cost_azure_ai_rerank(model):
 from litellm import RerankResponse, rerank
-litellm.model_cost = litellm.get_locally_cached_model_cost_map()
+os.environ["LITELLM_LOCAL_MODEL_COST_MAP"] = "True"
+litellm.model_cost = litellm.get_model_cost_map(url="")
 response = RerankResponse(
 id="b01dbf2e-63c8-4981-9e69-32241da559ed",
@@ -1577,7 +1593,8 @@ def test_completion_cost_azure_ai_rerank(model):
 def test_together_ai_embedding_completion_cost():
 from litellm.utils import Choices, EmbeddingResponse, Message, ModelResponse, Usage
-litellm.model_cost = litellm.get_locally_cached_model_cost_map()
+os.environ["LITELLM_LOCAL_MODEL_COST_MAP"] = "True"
+litellm.model_cost = litellm.get_model_cost_map(url="")
 response = EmbeddingResponse(
 model="togethercomputer/m2-bert-80M-8k-retrieval",
 data=[
@@ -2434,7 +2451,8 @@ def test_completion_cost_params_gemini_3():
 from litellm.llms.vertex_ai.cost_calculator import cost_per_character
-litellm.model_cost = litellm.get_locally_cached_model_cost_map()
+os.environ["LITELLM_LOCAL_MODEL_COST_MAP"] = "True"
+litellm.model_cost = litellm.get_model_cost_map(url="")
 response = ModelResponse(
 id="chatcmpl-61043504-4439-48be-9996-e29bdee24dc3",
@@ -2503,7 +2521,8 @@ def test_completion_cost_params_gemini_3():
 # @pytest.mark.flaky(retries=3, delay=1)
 @pytest.mark.parametrize("stream", [False]) # True,
 async def test_test_completion_cost_gpt4o_audio_output_from_model(stream):
-litellm.model_cost = litellm.get_locally_cached_model_cost_map()
+os.environ["LITELLM_LOCAL_MODEL_COST_MAP"] = "True"
+litellm.model_cost = litellm.get_model_cost_map(url="")
 from litellm.types.utils import (
 Choices,
 Message,
@@ -2600,7 +2619,8 @@ def test_completion_cost_model_response_cost(response_model, custom_llm_provider
 """
 from litellm import ModelResponse
-litellm.model_cost = litellm.get_locally_cached_model_cost_map()
+os.environ["LITELLM_LOCAL_MODEL_COST_MAP"] = "True"
+litellm.model_cost = litellm.get_model_cost_map(url="")
 litellm.set_verbose = True
 response = {
@@ -2700,7 +2720,8 @@ def test_select_model_name_for_cost_calc():
 def test_moderations():
 from litellm import moderation
-litellm.model_cost = litellm.get_locally_cached_model_cost_map()
+os.environ["LITELLM_LOCAL_MODEL_COST_MAP"] = "True"
+litellm.model_cost = litellm.get_model_cost_map(url="")
 litellm.add_known_models()
 assert "omni-moderation-latest" in litellm.model_cost
@@ -2753,7 +2774,8 @@ def test_bedrock_cost_calc_with_region():
 from litellm import ModelResponse
-litellm.model_cost = litellm.get_locally_cached_model_cost_map()
+os.environ["LITELLM_LOCAL_MODEL_COST_MAP"] = "True"
+litellm.model_cost = litellm.get_model_cost_map(url="")
 litellm.add_known_models()
@@ -2952,7 +2974,9 @@ async def test_cost_calculator_with_custom_pricing_router(model_item, custom_pri
 def test_json_valid_model_cost_map():
 import json
-model_cost = litellm.get_locally_cached_model_cost_map()
+os.environ["LITELLM_LOCAL_MODEL_COST_MAP"] = "True"
+model_cost = litellm.get_model_cost_map(url="")
 try:
 # Attempt to serialize and deserialize the JSON

View file

@@ -115,7 +115,8 @@ def test_openai_embedding_3():
 @pytest.mark.asyncio
 async def test_openai_azure_embedding_simple(model, api_base, api_key, sync_mode):
 try:
-litellm.model_cost = litellm.get_locally_cached_model_cost_map()
+os.environ["LITELLM_LOCAL_MODEL_COST_MAP"] = "True"
+litellm.model_cost = litellm.get_model_cost_map(url="")
 # litellm.set_verbose = True
 if sync_mode:
 response = embedding(
@@ -197,7 +198,8 @@ def _azure_ai_image_mock_response(*args, **kwargs):
 @pytest.mark.asyncio
 async def test_azure_ai_embedding_image(model, api_base, api_key, sync_mode):
 try:
-litellm.model_cost = litellm.get_locally_cached_model_cost_map()
+os.environ["LITELLM_LOCAL_MODEL_COST_MAP"] = "True"
+litellm.model_cost = litellm.get_model_cost_map(url="")
 input = base64_image
 if sync_mode:
 client = HTTPHandler()

View file

@@ -58,14 +58,16 @@ def test_get_model_info_shows_correct_supports_vision():
 def test_get_model_info_shows_assistant_prefill():
-litellm.model_cost = litellm.get_locally_cached_model_cost_map()
+os.environ["LITELLM_LOCAL_MODEL_COST_MAP"] = "True"
+litellm.model_cost = litellm.get_model_cost_map(url="")
 info = litellm.get_model_info("deepseek/deepseek-chat")
 print("info", info)
 assert info.get("supports_assistant_prefill") is True
 def test_get_model_info_shows_supports_prompt_caching():
-litellm.model_cost = litellm.get_locally_cached_model_cost_map()
+os.environ["LITELLM_LOCAL_MODEL_COST_MAP"] = "True"
+litellm.model_cost = litellm.get_model_cost_map(url="")
 info = litellm.get_model_info("deepseek/deepseek-chat")
 print("info", info)
 assert info.get("supports_prompt_caching") is True
@@ -114,7 +116,8 @@ def test_get_model_info_gemini():
 """
 Tests if ALL gemini models have 'tpm' and 'rpm' in the model info
 """
-litellm.model_cost = litellm.get_locally_cached_model_cost_map()
+os.environ["LITELLM_LOCAL_MODEL_COST_MAP"] = "True"
+litellm.model_cost = litellm.get_model_cost_map(url="")
 model_map = litellm.model_cost
 for model, info in model_map.items():
@@ -124,7 +127,8 @@ def test_get_model_info_gemini():
 def test_get_model_info_bedrock_region():
-litellm.model_cost = litellm.get_locally_cached_model_cost_map()
+os.environ["LITELLM_LOCAL_MODEL_COST_MAP"] = "True"
+litellm.model_cost = litellm.get_model_cost_map(url="")
 args = {
 "model": "us.anthropic.claude-3-5-sonnet-20241022-v2:0",
 "custom_llm_provider": "bedrock",
@@ -208,7 +212,8 @@ def test_model_info_bedrock_converse(monkeypatch):
 This ensures they are automatically routed to the converse endpoint.
 """
-litellm.model_cost = litellm.get_locally_cached_model_cost_map()
+monkeypatch.setenv("LITELLM_LOCAL_MODEL_COST_MAP", "True")
+litellm.model_cost = litellm.get_model_cost_map(url="")
 try:
 # Load whitelist models from file
 with open("whitelisted_bedrock_models.txt", "r") as file:
@@ -226,7 +231,8 @@ def test_model_info_bedrock_converse_enforcement(monkeypatch):
 """
 Test the enforcement of the whitelist by adding a fake model and ensuring the test fails.
 """
-litellm.model_cost = litellm.get_locally_cached_model_cost_map()
+monkeypatch.setenv("LITELLM_LOCAL_MODEL_COST_MAP", "True")
+litellm.model_cost = litellm.get_model_cost_map(url="")
 # Add a fake unwhitelisted model
 litellm.model_cost["fake.bedrock-chat-model"] = {
@@ -317,7 +323,8 @@ def test_get_model_info_bedrock_models():
 """
 from litellm.llms.bedrock.common_utils import BedrockModelInfo
-litellm.model_cost = litellm.get_locally_cached_model_cost_map()
+os.environ["LITELLM_LOCAL_MODEL_COST_MAP"] = "True"
+litellm.model_cost = litellm.get_model_cost_map(url="")
 for k, v in litellm.model_cost.items():
 if v["litellm_provider"] == "bedrock":

View file

@@ -178,7 +178,8 @@ async def test_update_kwargs_before_fallbacks(call_type):
 def test_router_get_model_info_wildcard_routes():
-litellm.model_cost = litellm.get_locally_cached_model_cost_map()
+os.environ["LITELLM_LOCAL_MODEL_COST_MAP"] = "True"
+litellm.model_cost = litellm.get_model_cost_map(url="")
 router = Router(
 model_list=[
 {
@@ -199,7 +200,8 @@ def test_router_get_model_info_wildcard_routes():
 @pytest.mark.asyncio
 async def test_router_get_model_group_usage_wildcard_routes():
-litellm.model_cost = litellm.get_locally_cached_model_cost_map()
+os.environ["LITELLM_LOCAL_MODEL_COST_MAP"] = "True"
+litellm.model_cost = litellm.get_model_cost_map(url="")
 router = Router(
 model_list=[
 {
@@ -295,7 +297,8 @@ async def test_call_router_callbacks_on_failure():
 @pytest.mark.asyncio
 async def test_router_model_group_headers():
-litellm.model_cost = litellm.get_locally_cached_model_cost_map()
+os.environ["LITELLM_LOCAL_MODEL_COST_MAP"] = "True"
+litellm.model_cost = litellm.get_model_cost_map(url="")
 from litellm.types.utils import OPENAI_RESPONSE_HEADERS
 router = Router(
@@ -327,7 +330,8 @@ async def test_router_model_group_headers():
 @pytest.mark.asyncio
 async def test_get_remaining_model_group_usage():
-litellm.model_cost = litellm.get_locally_cached_model_cost_map()
+os.environ["LITELLM_LOCAL_MODEL_COST_MAP"] = "True"
+litellm.model_cost = litellm.get_model_cost_map(url="")
 from litellm.types.utils import OPENAI_RESPONSE_HEADERS
 router = Router(

View file

@@ -1,111 +0,0 @@ (entire file removed by this revert)
import litellm
from jsonschema import validate
def test_model_prices_and_context_window_json_is_valid():
'''
Validates the `model_prices_and_context_window.json` file.
If this test fails after you update the json, you need to update the schema or correct the change you made.
'''
INTENDED_SCHEMA = {
"type": "object",
"additionalProperties": {
"type": "object",
"properties": {
"cache_creation_input_audio_token_cost": {"type": "number"},
"cache_creation_input_token_cost": {"type": "number"},
"cache_read_input_token_cost": {"type": "number"},
"deprecation_date": {"type": "string"},
"input_cost_per_audio_per_second": {"type": "number"},
"input_cost_per_audio_per_second_above_128k_tokens": {"type": "number"},
"input_cost_per_audio_token": {"type": "number"},
"input_cost_per_character": {"type": "number"},
"input_cost_per_character_above_128k_tokens": {"type": "number"},
"input_cost_per_image": {"type": "number"},
"input_cost_per_image_above_128k_tokens": {"type": "number"},
"input_cost_per_pixel": {"type": "number"},
"input_cost_per_query": {"type": "number"},
"input_cost_per_request": {"type": "number"},
"input_cost_per_second": {"type": "number"},
"input_cost_per_token": {"type": "number"},
"input_cost_per_token_above_128k_tokens": {"type": "number"},
"input_cost_per_token_batch_requests": {"type": "number"},
"input_cost_per_token_batches": {"type": "number"},
"input_cost_per_token_cache_hit": {"type": "number"},
"input_cost_per_video_per_second": {"type": "number"},
"input_cost_per_video_per_second_above_128k_tokens": {"type": "number"},
"input_dbu_cost_per_token": {"type": "number"},
"litellm_provider": {"type": "string"},
"max_audio_length_hours": {"type": "number"},
"max_audio_per_prompt": {"type": "number"},
"max_document_chunks_per_query": {"type": "number"},
"max_images_per_prompt": {"type": "number"},
"max_input_tokens": {"type": "number"},
"max_output_tokens": {"type": "number"},
"max_pdf_size_mb": {"type": "number"},
"max_query_tokens": {"type": "number"},
"max_tokens": {"type": "number"},
"max_tokens_per_document_chunk": {"type": "number"},
"max_video_length": {"type": "number"},
"max_videos_per_prompt": {"type": "number"},
"metadata": {"type": "object"},
"mode": {
"type": "string",
"enum": [
"audio_speech",
"audio_transcription",
"chat",
"completion",
"embedding",
"image_generation",
"moderation",
"moderations",
"rerank"
],
},
"output_cost_per_audio_token": {"type": "number"},
"output_cost_per_character": {"type": "number"},
"output_cost_per_character_above_128k_tokens": {"type": "number"},
"output_cost_per_image": {"type": "number"},
"output_cost_per_pixel": {"type": "number"},
"output_cost_per_second": {"type": "number"},
"output_cost_per_token": {"type": "number"},
"output_cost_per_token_above_128k_tokens": {"type": "number"},
"output_cost_per_token_batches": {"type": "number"},
"output_db_cost_per_token": {"type": "number"},
"output_dbu_cost_per_token": {"type": "number"},
"output_vector_size": {"type": "number"},
"rpd": {"type": "number"},
"rpm": {"type": "number"},
"source": {"type": "string"},
"supports_assistant_prefill": {"type": "boolean"},
"supports_audio_input": {"type": "boolean"},
"supports_audio_output": {"type": "boolean"},
"supports_embedding_image_input": {"type": "boolean"},
"supports_function_calling": {"type": "boolean"},
"supports_image_input": {"type": "boolean"},
"supports_parallel_function_calling": {"type": "boolean"},
"supports_pdf_input": {"type": "boolean"},
"supports_prompt_caching": {"type": "boolean"},
"supports_response_schema": {"type": "boolean"},
"supports_system_messages": {"type": "boolean"},
"supports_tool_choice": {"type": "boolean"},
"supports_video_input": {"type": "boolean"},
"supports_vision": {"type": "boolean"},
"tool_use_system_prompt_tokens": {"type": "number"},
"tpm": {"type": "number"},
},
"additionalProperties": False,
},
}
actual_json = litellm.get_locally_cached_model_cost_map()
assert isinstance(actual_json, dict)
temporarily_removed = actual_json.pop('sample_spec', None) # remove the sample, whose schema is inconsistent with the real data
validate(actual_json, INTENDED_SCHEMA)
if temporarily_removed is not None:
# put back the sample spec that we removed
actual_json.update({'sample_spec': temporarily_removed})
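If the schema check is still wanted after this revert, the same validation can be driven through the restored loader; a hypothetical sketch reusing the INTENDED_SCHEMA dict defined in the removed test above:

import os

import litellm
from jsonschema import validate

# Hypothetical replacement for the removed loading call: force the local map, then validate.
os.environ["LITELLM_LOCAL_MODEL_COST_MAP"] = "True"
actual_json = litellm.get_model_cost_map(url="")

sample_spec = actual_json.pop("sample_spec", None)  # sample entry's schema differs from real entries
validate(actual_json, INTENDED_SCHEMA)  # INTENDED_SCHEMA as defined in the removed test
if sample_spec is not None:
    actual_json["sample_spec"] = sample_spec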